/*
 * linux/fs/namespace.c
 *
 * (C) Copyright Al Viro 2000, 2001
 *	Released under GPL v2.
 *
 * Based on code from fs/super.c, copyright Linus Torvalds and others.
 * Heavily rewritten.
 */

#include <linux/syscalls.h>
#include <linux/export.h>
#include <linux/capability.h>
#include <linux/mnt_namespace.h>
#include <linux/user_namespace.h>
#include <linux/namei.h>
#include <linux/security.h>
#include <linux/idr.h>
#include <linux/acct.h>		/* acct_auto_close_mnt */
#include <linux/init.h>		/* init_rootfs */
#include <linux/fs_struct.h>	/* get_fs_root et al. */
#include <linux/fsnotify.h>	/* fsnotify_vfsmount_delete */
#include <linux/uaccess.h>
#include <linux/proc_ns.h>
#include <linux/magic.h>
#include <linux/bootmem.h>
#include "pnode.h"
#include "internal.h"

static unsigned int m_hash_mask __read_mostly;
static unsigned int m_hash_shift __read_mostly;
static unsigned int mp_hash_mask __read_mostly;
static unsigned int mp_hash_shift __read_mostly;

static __initdata unsigned long mhash_entries;
static int __init set_mhash_entries(char *str)
{
	if (!str)
		return 0;
	mhash_entries = simple_strtoul(str, &str, 0);
	return 1;
}
__setup("mhash_entries=", set_mhash_entries);

static __initdata unsigned long mphash_entries;
static int __init set_mphash_entries(char *str)
{
	if (!str)
		return 0;
	mphash_entries = simple_strtoul(str, &str, 0);
	return 1;
}
__setup("mphash_entries=", set_mphash_entries);

static u64 event;
static DEFINE_IDA(mnt_id_ida);
static DEFINE_IDA(mnt_group_ida);
static DEFINE_SPINLOCK(mnt_id_lock);
static int mnt_id_start = 0;
static int mnt_group_start = 1;

static struct hlist_head *mount_hashtable __read_mostly;
static struct hlist_head *mountpoint_hashtable __read_mostly;
static struct kmem_cache *mnt_cache __read_mostly;
static DECLARE_RWSEM(namespace_sem);

/* /sys/fs */
struct kobject *fs_kobj;
EXPORT_SYMBOL_GPL(fs_kobj);

/*
 * vfsmount lock may be taken for read to prevent changes to the
 * vfsmount hash, ie. during mountpoint lookups or walking back
 * up the tree.
 *
 * It should be taken for write in all cases where the vfsmount
 * tree or hash is modified or when a vfsmount structure is modified.
 */
__cacheline_aligned_in_smp DEFINE_SEQLOCK(mount_lock);

static inline struct hlist_head *m_hash(struct vfsmount *mnt, struct dentry *dentry)
{
	unsigned long tmp = ((unsigned long)mnt / L1_CACHE_BYTES);
	tmp += ((unsigned long)dentry / L1_CACHE_BYTES);
	tmp = tmp + (tmp >> m_hash_shift);
	return &mount_hashtable[tmp & m_hash_mask];
}

static inline struct hlist_head *mp_hash(struct dentry *dentry)
{
	unsigned long tmp = ((unsigned long)dentry / L1_CACHE_BYTES);
	tmp = tmp + (tmp >> mp_hash_shift);
	return &mountpoint_hashtable[tmp & mp_hash_mask];
}

/*
 * allocation is serialized by namespace_sem, but we need the spinlock to
 * serialize with freeing.
 */
static int mnt_alloc_id(struct mount *mnt)
{
	int res;

retry:
	ida_pre_get(&mnt_id_ida, GFP_KERNEL);
	spin_lock(&mnt_id_lock);
	res = ida_get_new_above(&mnt_id_ida, mnt_id_start, &mnt->mnt_id);
	if (!res)
		mnt_id_start = mnt->mnt_id + 1;
	spin_unlock(&mnt_id_lock);
	if (res == -EAGAIN)
		goto retry;

	return res;
}

static void mnt_free_id(struct mount *mnt)
{
	int id = mnt->mnt_id;
	spin_lock(&mnt_id_lock);
	ida_remove(&mnt_id_ida, id);
	if (mnt_id_start > id)
		mnt_id_start = id;
	spin_unlock(&mnt_id_lock);
}

/*
 * Allocate a new peer group ID
 *
 * mnt_group_ida is protected by namespace_sem
 */
static int mnt_alloc_group_id(struct mount *mnt)
{
	int res;

	if (!ida_pre_get(&mnt_group_ida, GFP_KERNEL))
		return -ENOMEM;

	res = ida_get_new_above(&mnt_group_ida,
				mnt_group_start,
				&mnt->mnt_group_id);
	if (!res)
		mnt_group_start = mnt->mnt_group_id + 1;

	return res;
}

/*
 * Release a peer group ID
 */
void mnt_release_group_id(struct mount *mnt)
{
	int id = mnt->mnt_group_id;
	ida_remove(&mnt_group_ida, id);
	if (mnt_group_start > id)
		mnt_group_start = id;
	mnt->mnt_group_id = 0;
}

/*
 * vfsmount lock must be held for read
 */
static inline void mnt_add_count(struct mount *mnt, int n)
{
#ifdef CONFIG_SMP
	this_cpu_add(mnt->mnt_pcp->mnt_count, n);
#else
	preempt_disable();
	mnt->mnt_count += n;
	preempt_enable();
#endif
}

/*
 * vfsmount lock must be held for write
 */
unsigned int mnt_get_count(struct mount *mnt)
{
#ifdef CONFIG_SMP
	unsigned int count = 0;
	int cpu;

	for_each_possible_cpu(cpu) {
		count += per_cpu_ptr(mnt->mnt_pcp, cpu)->mnt_count;
	}

	return count;
#else
	return mnt->mnt_count;
#endif
}

static struct mount *alloc_vfsmnt(const char *name)
{
	struct mount *mnt = kmem_cache_zalloc(mnt_cache, GFP_KERNEL);
	if (mnt) {
		int err;

		err = mnt_alloc_id(mnt);
		if (err)
			goto out_free_cache;

		if (name) {
			mnt->mnt_devname = kstrdup(name, GFP_KERNEL);
			if (!mnt->mnt_devname)
				goto out_free_id;
		}

#ifdef CONFIG_SMP
		mnt->mnt_pcp = alloc_percpu(struct mnt_pcp);
		if (!mnt->mnt_pcp)
			goto out_free_devname;

		this_cpu_add(mnt->mnt_pcp->mnt_count, 1);
#else
		mnt->mnt_count = 1;
		mnt->mnt_writers = 0;
#endif

		INIT_HLIST_NODE(&mnt->mnt_hash);
		INIT_LIST_HEAD(&mnt->mnt_child);
		INIT_LIST_HEAD(&mnt->mnt_mounts);
		INIT_LIST_HEAD(&mnt->mnt_list);
		INIT_LIST_HEAD(&mnt->mnt_expire);
		INIT_LIST_HEAD(&mnt->mnt_share);
		INIT_LIST_HEAD(&mnt->mnt_slave_list);
		INIT_LIST_HEAD(&mnt->mnt_slave);
#ifdef CONFIG_FSNOTIFY
		INIT_HLIST_HEAD(&mnt->mnt_fsnotify_marks);
#endif
	}
	return mnt;

#ifdef CONFIG_SMP
out_free_devname:
	kfree(mnt->mnt_devname);
#endif
out_free_id:
	mnt_free_id(mnt);
out_free_cache:
	kmem_cache_free(mnt_cache, mnt);
	return NULL;
}
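/*
 * Illustrative note on the counters above (an editorial sketch, not kernel
 * documentation): callers such as mntget()/mntput() further down in this
 * file only ever touch the local CPU's counter through mnt_add_count(), e.g.
 *
 *	mnt_add_count(mnt, 1);		taken on CPU 0
 *	mnt_add_count(mnt, -1);		dropped later on CPU 3
 *
 * so an individual per-CPU mnt_count may well go negative.  Only
 * mnt_get_count(), called with the vfsmount lock held for write, sums the
 * contributions of every possible CPU, and only that sum is meaningful.
 */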
/*
 * Most r/o checks on a fs are for operations that take
 * discrete amounts of time, like a write() or unlink().
 * We must keep track of when those operations start
 * (for permission checks) and when they end, so that
 * we can determine when writes are able to occur to
 * a filesystem.
 */
/*
 * __mnt_is_readonly: check whether a mount is read-only
 * @mnt: the mount to check for its write status
 *
 * This shouldn't be used directly outside of the VFS.
 * It does not guarantee that the filesystem will stay
 * r/w, just that it is right *now*. This cannot and
 * should not be used in place of IS_RDONLY(inode).
 * mnt_want/drop_write() will _keep_ the filesystem
 * r/w.
 */
int __mnt_is_readonly(struct vfsmount *mnt)
{
	if (mnt->mnt_flags & MNT_READONLY)
		return 1;
	if (mnt->mnt_sb->s_flags & MS_RDONLY)
		return 1;
	return 0;
}
EXPORT_SYMBOL_GPL(__mnt_is_readonly);

static inline void mnt_inc_writers(struct mount *mnt)
{
#ifdef CONFIG_SMP
	this_cpu_inc(mnt->mnt_pcp->mnt_writers);
#else
	mnt->mnt_writers++;
#endif
}

static inline void mnt_dec_writers(struct mount *mnt)
{
#ifdef CONFIG_SMP
	this_cpu_dec(mnt->mnt_pcp->mnt_writers);
#else
	mnt->mnt_writers--;
#endif
}

static unsigned int mnt_get_writers(struct mount *mnt)
{
#ifdef CONFIG_SMP
	unsigned int count = 0;
	int cpu;

	for_each_possible_cpu(cpu) {
		count += per_cpu_ptr(mnt->mnt_pcp, cpu)->mnt_writers;
	}

	return count;
#else
	return mnt->mnt_writers;
#endif
}

static int mnt_is_readonly(struct vfsmount *mnt)
{
	if (mnt->mnt_sb->s_readonly_remount)
		return 1;
	/* Order wrt setting s_flags/s_readonly_remount in do_remount() */
	smp_rmb();
	return __mnt_is_readonly(mnt);
}

/*
 * Most r/o & frozen checks on a fs are for operations that take discrete
 * amounts of time, like a write() or unlink(). We must keep track of when
 * those operations start (for permission checks) and when they end, so that we
 * can determine when writes are able to occur to a filesystem.
 */
/**
 * __mnt_want_write - get write access to a mount without freeze protection
 * @m: the mount on which to take a write
 *
 * This tells the low-level filesystem that a write is about to be performed to
 * it, and makes sure that writes are allowed (mount is read-write) before
 * returning success. This operation does not protect against the filesystem
 * being frozen. When the write operation is finished, __mnt_drop_write() must
 * be called. This is effectively a refcount.
 */
int __mnt_want_write(struct vfsmount *m)
{
	struct mount *mnt = real_mount(m);
	int ret = 0;

	preempt_disable();
	mnt_inc_writers(mnt);
	/*
	 * The store to mnt_inc_writers must be visible before we pass
	 * MNT_WRITE_HOLD loop below, so that the slowpath can see our
	 * incremented count after it has set MNT_WRITE_HOLD.
	 */
	smp_mb();
	while (ACCESS_ONCE(mnt->mnt.mnt_flags) & MNT_WRITE_HOLD)
		cpu_relax();
	/*
	 * After the slowpath clears MNT_WRITE_HOLD, mnt_is_readonly will
	 * be set to match its requirements. So we must not load that until
	 * MNT_WRITE_HOLD is cleared.
	 */
	smp_rmb();
	if (mnt_is_readonly(m)) {
		mnt_dec_writers(mnt);
		ret = -EROFS;
	}
	preempt_enable();

	return ret;
}
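/*
 * Editorial sketch of the ordering contract between __mnt_want_write() above
 * and mnt_make_readonly() later in this file:
 *
 *	writer (fast path):
 *		mnt_inc_writers(mnt);
 *		smp_mb();
 *		while (MNT_WRITE_HOLD set) cpu_relax();
 *		smp_rmb();
 *		if (mnt_is_readonly(m)) back out with -EROFS;
 *
 *	r/o remount (slow path):
 *		set MNT_WRITE_HOLD;  smp_mb();
 *		if (mnt_get_writers(mnt)) fail with -EBUSY;
 *		else set MNT_READONLY;
 *		smp_wmb();  clear MNT_WRITE_HOLD;
 *
 * Either the slow path sees the incremented writer count, or the writer sees
 * MNT_READONLY once MNT_WRITE_HOLD is dropped; neither side can miss both.
 */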
/**
 * mnt_want_write - get write access to a mount
 * @m: the mount on which to take a write
 *
 * This tells the low-level filesystem that a write is about to be performed to
 * it, and makes sure that writes are allowed (mount is read-write, filesystem
 * is not frozen) before returning success. When the write operation is
 * finished, mnt_drop_write() must be called. This is effectively a refcount.
 */
int mnt_want_write(struct vfsmount *m)
{
	int ret;

	sb_start_write(m->mnt_sb);
	ret = __mnt_want_write(m);
	if (ret)
		sb_end_write(m->mnt_sb);
	return ret;
}
EXPORT_SYMBOL_GPL(mnt_want_write);

/**
 * mnt_clone_write - get write access to a mount
 * @mnt: the mount on which to take a write
 *
 * This is effectively like mnt_want_write, except
 * it must only be used to take an extra write reference
 * on a mountpoint that we already know has a write reference
 * on it. This allows some optimisation.
 *
 * After finished, mnt_drop_write must be called as usual to
 * drop the reference.
 */
int mnt_clone_write(struct vfsmount *mnt)
{
	/* superblock may be r/o */
	if (__mnt_is_readonly(mnt))
		return -EROFS;
	preempt_disable();
	mnt_inc_writers(real_mount(mnt));
	preempt_enable();
	return 0;
}
EXPORT_SYMBOL_GPL(mnt_clone_write);

/**
 * __mnt_want_write_file - get write access to a file's mount
 * @file: the file whose mount to take a write on
 *
 * This is like __mnt_want_write, but it takes a file and can
 * do some optimisations if the file is open for write already
 */
int __mnt_want_write_file(struct file *file)
{
	if (!(file->f_mode & FMODE_WRITER))
		return __mnt_want_write(file->f_path.mnt);
	else
		return mnt_clone_write(file->f_path.mnt);
}

/**
 * mnt_want_write_file - get write access to a file's mount
 * @file: the file whose mount to take a write on
 *
 * This is like mnt_want_write, but it takes a file and can
 * do some optimisations if the file is open for write already
 */
int mnt_want_write_file(struct file *file)
{
	int ret;

	sb_start_write(file->f_path.mnt->mnt_sb);
	ret = __mnt_want_write_file(file);
	if (ret)
		sb_end_write(file->f_path.mnt->mnt_sb);
	return ret;
}
EXPORT_SYMBOL_GPL(mnt_want_write_file);

/**
 * __mnt_drop_write - give up write access to a mount
 * @mnt: the mount on which to give up write access
 *
 * Tells the low-level filesystem that we are done
 * performing writes to it.  Must be matched with
 * __mnt_want_write() call above.
 */
void __mnt_drop_write(struct vfsmount *mnt)
{
	preempt_disable();
	mnt_dec_writers(real_mount(mnt));
	preempt_enable();
}

/**
 * mnt_drop_write - give up write access to a mount
 * @mnt: the mount on which to give up write access
 *
 * Tells the low-level filesystem that we are done performing writes to it and
 * also allows filesystem to be frozen again.  Must be matched with
 * mnt_want_write() call above.
 */
void mnt_drop_write(struct vfsmount *mnt)
{
	__mnt_drop_write(mnt);
	sb_end_write(mnt->mnt_sb);
}
EXPORT_SYMBOL_GPL(mnt_drop_write);

void __mnt_drop_write_file(struct file *file)
{
	__mnt_drop_write(file->f_path.mnt);
}

void mnt_drop_write_file(struct file *file)
{
	mnt_drop_write(file->f_path.mnt);
}
EXPORT_SYMBOL(mnt_drop_write_file);
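/*
 * Example usage (illustrative only; the call site is hypothetical): writers
 * bracket a modification with the pair above, e.g.
 *
 *	err = mnt_want_write(path->mnt);
 *	if (err)
 *		return err;
 *	... modify something reachable through that mount ...
 *	mnt_drop_write(path->mnt);
 *
 * mnt_want_write_file()/mnt_drop_write_file() are the same pattern for
 * callers that start from a struct file, and the __-prefixed variants skip
 * the sb_start_write()/sb_end_write() freeze protection.
 */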
static int mnt_make_readonly(struct mount *mnt)
{
	int ret = 0;

	lock_mount_hash();
	mnt->mnt.mnt_flags |= MNT_WRITE_HOLD;
	/*
	 * After storing MNT_WRITE_HOLD, we'll read the counters. This store
	 * should be visible before we do.
	 */
	smp_mb();

	/*
	 * With writers on hold, if this value is zero, then there are
	 * definitely no active writers (although held writers may subsequently
	 * increment the count, they'll have to wait, and decrement it after
	 * seeing MNT_READONLY).
	 *
	 * It is OK to have counter incremented on one CPU and decremented on
	 * another: the sum will add up correctly. The danger would be when we
	 * sum up each counter, if we read a counter before it is incremented,
	 * but then read another CPU's count which it has been subsequently
	 * decremented from -- we would see more decrements than we should.
	 * MNT_WRITE_HOLD protects against this scenario, because
	 * mnt_want_write first increments count, then smp_mb, then spins on
	 * MNT_WRITE_HOLD, so it can't be decremented by another CPU while
	 * we're counting up here.
	 */
	if (mnt_get_writers(mnt) > 0)
		ret = -EBUSY;
	else
		mnt->mnt.mnt_flags |= MNT_READONLY;
	/*
	 * MNT_READONLY must become visible before ~MNT_WRITE_HOLD, so writers
	 * that become unheld will see MNT_READONLY.
	 */
	smp_wmb();
	mnt->mnt.mnt_flags &= ~MNT_WRITE_HOLD;
	unlock_mount_hash();
	return ret;
}

static void __mnt_unmake_readonly(struct mount *mnt)
{
	lock_mount_hash();
	mnt->mnt.mnt_flags &= ~MNT_READONLY;
	unlock_mount_hash();
}

int sb_prepare_remount_readonly(struct super_block *sb)
{
	struct mount *mnt;
	int err = 0;

	/* Racy optimization.  Recheck the counter under MNT_WRITE_HOLD */
	if (atomic_long_read(&sb->s_remove_count))
		return -EBUSY;

	lock_mount_hash();
	list_for_each_entry(mnt, &sb->s_mounts, mnt_instance) {
		if (!(mnt->mnt.mnt_flags & MNT_READONLY)) {
			mnt->mnt.mnt_flags |= MNT_WRITE_HOLD;
			smp_mb();
			if (mnt_get_writers(mnt) > 0) {
				err = -EBUSY;
				break;
			}
		}
	}
	if (!err && atomic_long_read(&sb->s_remove_count))
		err = -EBUSY;

	if (!err) {
		sb->s_readonly_remount = 1;
		smp_wmb();
	}
	list_for_each_entry(mnt, &sb->s_mounts, mnt_instance) {
		if (mnt->mnt.mnt_flags & MNT_WRITE_HOLD)
			mnt->mnt.mnt_flags &= ~MNT_WRITE_HOLD;
	}
	unlock_mount_hash();

	return err;
}

static void free_vfsmnt(struct mount *mnt)
{
	kfree(mnt->mnt_devname);
#ifdef CONFIG_SMP
	free_percpu(mnt->mnt_pcp);
#endif
	kmem_cache_free(mnt_cache, mnt);
}

static void delayed_free_vfsmnt(struct rcu_head *head)
{
	free_vfsmnt(container_of(head, struct mount, mnt_rcu));
}

/* call under rcu_read_lock */
bool legitimize_mnt(struct vfsmount *bastard, unsigned seq)
{
	struct mount *mnt;
	if (read_seqretry(&mount_lock, seq))
		return false;
	if (bastard == NULL)
		return true;
	mnt = real_mount(bastard);
	mnt_add_count(mnt, 1);
	if (likely(!read_seqretry(&mount_lock, seq)))
		return true;
	if (bastard->mnt_flags & MNT_SYNC_UMOUNT) {
		mnt_add_count(mnt, -1);
		return false;
	}
	rcu_read_unlock();
	mntput(bastard);
	rcu_read_lock();
	return false;
}

/*
 * find the first mount at @dentry on vfsmount @mnt.
 * call under rcu_read_lock()
 */
struct mount *__lookup_mnt(struct vfsmount *mnt, struct dentry *dentry)
{
	struct hlist_head *head = m_hash(mnt, dentry);
	struct mount *p;

	hlist_for_each_entry_rcu(p, head, mnt_hash)
		if (&p->mnt_parent->mnt == mnt && p->mnt_mountpoint == dentry)
			return p;
	return NULL;
}

/*
 * find the last mount at @dentry on vfsmount @mnt.
 * mount_lock must be held.
 */
struct mount *__lookup_mnt_last(struct vfsmount *mnt, struct dentry *dentry)
{
	struct mount *p, *res;
	res = p = __lookup_mnt(mnt, dentry);
	if (!p)
		goto out;
	hlist_for_each_entry_continue(p, mnt_hash) {
		if (&p->mnt_parent->mnt != mnt || p->mnt_mountpoint != dentry)
			break;
		res = p;
	}
out:
	return res;
}

/*
 * lookup_mnt - Return the first child mount mounted at path
 *
 * "First" means first mounted chronologically.  If you create the
 * following mounts:
 *
 * mount /dev/sda1 /mnt
 * mount /dev/sda2 /mnt
 * mount /dev/sda3 /mnt
 *
 * Then lookup_mnt() on the base /mnt dentry in the root mount will
 * return successively the root dentry and vfsmount of /dev/sda1, then
 * /dev/sda2, then /dev/sda3, then NULL.
 *
 * lookup_mnt takes a reference to the found vfsmount.
 */
struct vfsmount *lookup_mnt(struct path *path)
{
	struct mount *child_mnt;
	struct vfsmount *m;
	unsigned seq;

	rcu_read_lock();
	do {
		seq = read_seqbegin(&mount_lock);
		child_mnt = __lookup_mnt(path->mnt, path->dentry);
		m = child_mnt ? &child_mnt->mnt : NULL;
	} while (!legitimize_mnt(m, seq));
	rcu_read_unlock();
	return m;
}

static struct mountpoint *new_mountpoint(struct dentry *dentry)
{
	struct hlist_head *chain = mp_hash(dentry);
	struct mountpoint *mp;
	int ret;

	hlist_for_each_entry(mp, chain, m_hash) {
		if (mp->m_dentry == dentry) {
			/* might be worth a WARN_ON() */
			if (d_unlinked(dentry))
				return ERR_PTR(-ENOENT);
			mp->m_count++;
			return mp;
		}
	}

	mp = kmalloc(sizeof(struct mountpoint), GFP_KERNEL);
	if (!mp)
		return ERR_PTR(-ENOMEM);

	ret = d_set_mounted(dentry);
	if (ret) {
		kfree(mp);
		return ERR_PTR(ret);
	}

	mp->m_dentry = dentry;
	mp->m_count = 1;
	hlist_add_head(&mp->m_hash, chain);
	return mp;
}

static void put_mountpoint(struct mountpoint *mp)
{
	if (!--mp->m_count) {
		struct dentry *dentry = mp->m_dentry;
		spin_lock(&dentry->d_lock);
		dentry->d_flags &= ~DCACHE_MOUNTED;
		spin_unlock(&dentry->d_lock);
		hlist_del(&mp->m_hash);
		kfree(mp);
	}
}

static inline int check_mnt(struct mount *mnt)
{
	return mnt->mnt_ns == current->nsproxy->mnt_ns;
}

/*
 * vfsmount lock must be held for write
 */
static void touch_mnt_namespace(struct mnt_namespace *ns)
{
	if (ns) {
		ns->event = ++event;
		wake_up_interruptible(&ns->poll);
	}
}

/*
 * vfsmount lock must be held for write
 */
static void __touch_mnt_namespace(struct mnt_namespace *ns)
{
	if (ns && ns->event != event) {
		ns->event = event;
		wake_up_interruptible(&ns->poll);
	}
}
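/*
 * Editorial aside with an example: the wake_up_interruptible(&ns->poll)
 * calls above are what let userspace watch a namespace's mount table for
 * changes.  A process that keeps /proc/self/mounts open (mounts_fd below is
 * that descriptor, a name made up for this sketch) can do
 *
 *	struct pollfd pfd = { .fd = mounts_fd, .events = POLLPRI };
 *	poll(&pfd, 1, -1);
 *
 * and is woken once an event here bumps ns->event.  The exact revents bits
 * reported are a property of the proc/seq_file code, not of this file, so
 * treat the snippet as a sketch rather than a contract.
 */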
/*
 * vfsmount lock must be held for write
 */
static void detach_mnt(struct mount *mnt, struct path *old_path)
{
	old_path->dentry = mnt->mnt_mountpoint;
	old_path->mnt = &mnt->mnt_parent->mnt;
	mnt->mnt_parent = mnt;
	mnt->mnt_mountpoint = mnt->mnt.mnt_root;
	list_del_init(&mnt->mnt_child);
	hlist_del_init_rcu(&mnt->mnt_hash);
	put_mountpoint(mnt->mnt_mp);
	mnt->mnt_mp = NULL;
}

/*
 * vfsmount lock must be held for write
 */
void mnt_set_mountpoint(struct mount *mnt,
			struct mountpoint *mp,
			struct mount *child_mnt)
{
	mp->m_count++;
	mnt_add_count(mnt, 1);	/* essentially, that's mntget */
	child_mnt->mnt_mountpoint = dget(mp->m_dentry);
	child_mnt->mnt_parent = mnt;
	child_mnt->mnt_mp = mp;
}

/*
 * vfsmount lock must be held for write
 */
static void attach_mnt(struct mount *mnt,
			struct mount *parent,
			struct mountpoint *mp)
{
	mnt_set_mountpoint(parent, mp, mnt);
	hlist_add_head_rcu(&mnt->mnt_hash, m_hash(&parent->mnt, mp->m_dentry));
	list_add_tail(&mnt->mnt_child, &parent->mnt_mounts);
}

/*
 * vfsmount lock must be held for write
 */
static void commit_tree(struct mount *mnt, struct mount *shadows)
{
	struct mount *parent = mnt->mnt_parent;
	struct mount *m;
	LIST_HEAD(head);
	struct mnt_namespace *n = parent->mnt_ns;

	BUG_ON(parent == mnt);

	list_add_tail(&head, &mnt->mnt_list);
	list_for_each_entry(m, &head, mnt_list)
		m->mnt_ns = n;

	list_splice(&head, n->list.prev);

	if (shadows)
		hlist_add_behind_rcu(&mnt->mnt_hash, &shadows->mnt_hash);
	else
		hlist_add_head_rcu(&mnt->mnt_hash,
				m_hash(&parent->mnt, mnt->mnt_mountpoint));
	list_add_tail(&mnt->mnt_child, &parent->mnt_mounts);
	touch_mnt_namespace(n);
}

static struct mount *next_mnt(struct mount *p, struct mount *root)
{
	struct list_head *next = p->mnt_mounts.next;
	if (next == &p->mnt_mounts) {
		while (1) {
			if (p == root)
				return NULL;
			next = p->mnt_child.next;
			if (next != &p->mnt_parent->mnt_mounts)
				break;
			p = p->mnt_parent;
		}
	}
	return list_entry(next, struct mount, mnt_child);
}

static struct mount *skip_mnt_tree(struct mount *p)
{
	struct list_head *prev = p->mnt_mounts.prev;
	while (prev != &p->mnt_mounts) {
		p = list_entry(prev, struct mount, mnt_child);
		prev = p->mnt_mounts.prev;
	}
	return p;
}

struct vfsmount *
vfs_kern_mount(struct file_system_type *type, int flags, const char *name, void *data)
{
	struct mount *mnt;
	struct dentry *root;

	if (!type)
		return ERR_PTR(-ENODEV);

	mnt = alloc_vfsmnt(name);
	if (!mnt)
		return ERR_PTR(-ENOMEM);

	if (flags & MS_KERNMOUNT)
		mnt->mnt.mnt_flags = MNT_INTERNAL;

	root = mount_fs(type, flags, name, data);
	if (IS_ERR(root)) {
		mnt_free_id(mnt);
		free_vfsmnt(mnt);
		return ERR_CAST(root);
	}

	mnt->mnt.mnt_root = root;
	mnt->mnt.mnt_sb = root->d_sb;
	mnt->mnt_mountpoint = mnt->mnt.mnt_root;
	mnt->mnt_parent = mnt;
	lock_mount_hash();
	list_add_tail(&mnt->mnt_instance, &root->d_sb->s_mounts);
	unlock_mount_hash();
	return &mnt->mnt;
}
EXPORT_SYMBOL_GPL(vfs_kern_mount);
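/*
 * Illustrative sketch (an assumption about callers, not a call site in this
 * file): in-kernel users that want a private instance of a filesystem go
 * through vfs_kern_mount(), roughly the way kern_mount_data() does:
 *
 *	struct vfsmount *mnt;
 *
 *	mnt = vfs_kern_mount(type, MS_KERNMOUNT, type->name, data);
 *	...
 *	mntput(mnt);		when the internal user is done with it
 *
 * MS_KERNMOUNT is what makes the check above mark the mount MNT_INTERNAL.
 */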
static struct mount *clone_mnt(struct mount *old, struct dentry *root,
					int flag)
{
	struct super_block *sb = old->mnt.mnt_sb;
	struct mount *mnt;
	int err;

	mnt = alloc_vfsmnt(old->mnt_devname);
	if (!mnt)
		return ERR_PTR(-ENOMEM);

	if (flag & (CL_SLAVE | CL_PRIVATE | CL_SHARED_TO_SLAVE))
		mnt->mnt_group_id = 0; /* not a peer of original */
	else
		mnt->mnt_group_id = old->mnt_group_id;

	if ((flag & CL_MAKE_SHARED) && !mnt->mnt_group_id) {
		err = mnt_alloc_group_id(mnt);
		if (err)
			goto out_free;
	}

	mnt->mnt.mnt_flags = old->mnt.mnt_flags & ~(MNT_WRITE_HOLD|MNT_MARKED);
	/* Don't allow unprivileged users to change mount flags */
	if (flag & CL_UNPRIVILEGED) {
		mnt->mnt.mnt_flags |= MNT_LOCK_ATIME;

		if (mnt->mnt.mnt_flags & MNT_READONLY)
			mnt->mnt.mnt_flags |= MNT_LOCK_READONLY;

		if (mnt->mnt.mnt_flags & MNT_NODEV)
			mnt->mnt.mnt_flags |= MNT_LOCK_NODEV;

		if (mnt->mnt.mnt_flags & MNT_NOSUID)
			mnt->mnt.mnt_flags |= MNT_LOCK_NOSUID;

		if (mnt->mnt.mnt_flags & MNT_NOEXEC)
			mnt->mnt.mnt_flags |= MNT_LOCK_NOEXEC;
	}

	/* Don't allow unprivileged users to reveal what is under a mount */
	if ((flag & CL_UNPRIVILEGED) && list_empty(&old->mnt_expire))
		mnt->mnt.mnt_flags |= MNT_LOCKED;

	atomic_inc(&sb->s_active);
	mnt->mnt.mnt_sb = sb;
	mnt->mnt.mnt_root = dget(root);
	mnt->mnt_mountpoint = mnt->mnt.mnt_root;
	mnt->mnt_parent = mnt;
	lock_mount_hash();
	list_add_tail(&mnt->mnt_instance, &sb->s_mounts);
	unlock_mount_hash();

	if ((flag & CL_SLAVE) ||
	    ((flag & CL_SHARED_TO_SLAVE) && IS_MNT_SHARED(old))) {
		list_add(&mnt->mnt_slave, &old->mnt_slave_list);
		mnt->mnt_master = old;
		CLEAR_MNT_SHARED(mnt);
	} else if (!(flag & CL_PRIVATE)) {
		if ((flag & CL_MAKE_SHARED) || IS_MNT_SHARED(old))
			list_add(&mnt->mnt_share, &old->mnt_share);
		if (IS_MNT_SLAVE(old))
			list_add(&mnt->mnt_slave, &old->mnt_slave);
		mnt->mnt_master = old->mnt_master;
	}
	if (flag & CL_MAKE_SHARED)
		set_mnt_shared(mnt);

	/* stick the duplicate mount on the same expiry list
	 * as the original if that was on one */
	if (flag & CL_EXPIRE) {
		if (!list_empty(&old->mnt_expire))
			list_add(&mnt->mnt_expire, &old->mnt_expire);
	}

	return mnt;

out_free:
	mnt_free_id(mnt);
	free_vfsmnt(mnt);
	return ERR_PTR(err);
}

static void mntput_no_expire(struct mount *mnt)
{
put_again:
	rcu_read_lock();
	mnt_add_count(mnt, -1);
	if (likely(mnt->mnt_ns)) { /* shouldn't be the last one */
		rcu_read_unlock();
		return;
	}
	lock_mount_hash();
	if (mnt_get_count(mnt)) {
		rcu_read_unlock();
		unlock_mount_hash();
		return;
	}
	if (unlikely(mnt->mnt_pinned)) {
		mnt_add_count(mnt, mnt->mnt_pinned + 1);
		mnt->mnt_pinned = 0;
		rcu_read_unlock();
		unlock_mount_hash();
		acct_auto_close_mnt(&mnt->mnt);
		goto put_again;
	}
	if (unlikely(mnt->mnt.mnt_flags & MNT_DOOMED)) {
		rcu_read_unlock();
		unlock_mount_hash();
		return;
	}
	mnt->mnt.mnt_flags |= MNT_DOOMED;
	rcu_read_unlock();

	list_del(&mnt->mnt_instance);
	unlock_mount_hash();

	/*
	 * This probably indicates that somebody messed
	 * up a mnt_want/drop_write() pair.  If this
	 * happens, the filesystem was probably unable
	 * to make r/w->r/o transitions.
	 */
	/*
	 * The locking used to deal with mnt_count decrement provides barriers,
	 * so mnt_get_writers() below is safe.
	 */
	WARN_ON(mnt_get_writers(mnt));
	fsnotify_vfsmount_delete(&mnt->mnt);
	dput(mnt->mnt.mnt_root);
	deactivate_super(mnt->mnt.mnt_sb);
	mnt_free_id(mnt);
	call_rcu(&mnt->mnt_rcu, delayed_free_vfsmnt);
}

void mntput(struct vfsmount *mnt)
{
	if (mnt) {
		struct mount *m = real_mount(mnt);
		/* avoid cacheline pingpong, hope gcc doesn't get "smart" */
		if (unlikely(m->mnt_expiry_mark))
			m->mnt_expiry_mark = 0;
		mntput_no_expire(m);
	}
}
EXPORT_SYMBOL(mntput);

struct vfsmount *mntget(struct vfsmount *mnt)
{
	if (mnt)
		mnt_add_count(real_mount(mnt), 1);
	return mnt;
}
EXPORT_SYMBOL(mntget);

void mnt_pin(struct vfsmount *mnt)
{
	lock_mount_hash();
	real_mount(mnt)->mnt_pinned++;
	unlock_mount_hash();
}
EXPORT_SYMBOL(mnt_pin);

void mnt_unpin(struct vfsmount *m)
{
	struct mount *mnt = real_mount(m);
	lock_mount_hash();
	if (mnt->mnt_pinned) {
		mnt_add_count(mnt, 1);
		mnt->mnt_pinned--;
	}
	unlock_mount_hash();
}
EXPORT_SYMBOL(mnt_unpin);

static inline void mangle(struct seq_file *m, const char *s)
{
	seq_escape(m, s, " \t\n\\");
}

/*
 * Simple .show_options callback for filesystems which don't want to
 * implement more complex mount option showing.
 *
 * See also save_mount_options().
 */
int generic_show_options(struct seq_file *m, struct dentry *root)
{
	const char *options;

	rcu_read_lock();
	options = rcu_dereference(root->d_sb->s_options);

	if (options != NULL && options[0]) {
		seq_putc(m, ',');
		mangle(m, options);
	}
	rcu_read_unlock();

	return 0;
}
EXPORT_SYMBOL(generic_show_options);

/*
 * If filesystem uses generic_show_options(), this function should be
 * called from the fill_super() callback.
 *
 * The .remount_fs callback usually needs to be handled in a special
 * way, to make sure, that previous options are not overwritten if the
 * remount fails.
 *
 * Also note, that if the filesystem's .remount_fs function doesn't
 * reset all options to their default value, but changes only newly
 * given options, then the displayed options will not reflect reality
 * any more.
 */
void save_mount_options(struct super_block *sb, char *options)
{
	BUG_ON(sb->s_options);
	rcu_assign_pointer(sb->s_options, kstrdup(options, GFP_KERNEL));
}
EXPORT_SYMBOL(save_mount_options);

void replace_mount_options(struct super_block *sb, char *options)
{
	char *old = sb->s_options;
	rcu_assign_pointer(sb->s_options, options);
	if (old) {
		synchronize_rcu();
		kfree(old);
	}
}
EXPORT_SYMBOL(replace_mount_options);
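/*
 * Example wiring (illustrative; "foofs" is a made-up filesystem): a
 * filesystem that is content with the generic option reporting above sets
 *
 *	static const struct super_operations foofs_sops = {
 *		...
 *		.show_options	= generic_show_options,
 *	};
 *
 * and calls save_mount_options(sb, data) from its fill_super() callback, as
 * the comment above describes; the saved string is then what shows up after
 * the standard flags in /proc/mounts.
 */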
#ifdef CONFIG_PROC_FS
/* iterator; we want it to have access to namespace_sem, thus here... */
static void *m_start(struct seq_file *m, loff_t *pos)
{
	struct proc_mounts *p = proc_mounts(m);

	down_read(&namespace_sem);
	if (p->cached_event == p->ns->event) {
		void *v = p->cached_mount;
		if (*pos == p->cached_index)
			return v;
		if (*pos == p->cached_index + 1) {
			v = seq_list_next(v, &p->ns->list, &p->cached_index);
			return p->cached_mount = v;
		}
	}

	p->cached_event = p->ns->event;
	p->cached_mount = seq_list_start(&p->ns->list, *pos);
	p->cached_index = *pos;
	return p->cached_mount;
}

static void *m_next(struct seq_file *m, void *v, loff_t *pos)
{
	struct proc_mounts *p = proc_mounts(m);

	p->cached_mount = seq_list_next(v, &p->ns->list, pos);
	p->cached_index = *pos;
	return p->cached_mount;
}

static void m_stop(struct seq_file *m, void *v)
{
	up_read(&namespace_sem);
}

static int m_show(struct seq_file *m, void *v)
{
	struct proc_mounts *p = proc_mounts(m);
	struct mount *r = list_entry(v, struct mount, mnt_list);
	return p->show(m, &r->mnt);
}

const struct seq_operations mounts_op = {
	.start	= m_start,
	.next	= m_next,
	.stop	= m_stop,
	.show	= m_show,
};
#endif  /* CONFIG_PROC_FS */

/**
 * may_umount_tree - check if a mount tree is busy
 * @mnt: root of mount tree
 *
 * This is called to check if a tree of mounts has any
 * open files, pwds, chroots or sub mounts that are
 * busy.
 */
int may_umount_tree(struct vfsmount *m)
{
	struct mount *mnt = real_mount(m);
	int actual_refs = 0;
	int minimum_refs = 0;
	struct mount *p;
	BUG_ON(!m);

	/* write lock needed for mnt_get_count */
	lock_mount_hash();
	for (p = mnt; p; p = next_mnt(p, mnt)) {
		actual_refs += mnt_get_count(p);
		minimum_refs += 2;
	}
	unlock_mount_hash();

	if (actual_refs > minimum_refs)
		return 0;

	return 1;
}

EXPORT_SYMBOL(may_umount_tree);

/**
 * may_umount - check if a mount point is busy
 * @mnt: root of mount
 *
 * This is called to check if a mount point has any
 * open files, pwds, chroots or sub mounts. If the
 * mount has sub mounts this will return busy
 * regardless of whether the sub mounts are busy.
 *
 * Doesn't take quota and stuff into account. IOW, in some cases it will
 * give false negatives. The main reason why it's here is that we need
 * a non-destructive way to look for easily umountable filesystems.
 */
int may_umount(struct vfsmount *mnt)
{
	int ret = 1;
	down_read(&namespace_sem);
	lock_mount_hash();
	if (propagate_mount_busy(real_mount(mnt), 2))
		ret = 0;
	unlock_mount_hash();
	up_read(&namespace_sem);
	return ret;
}

EXPORT_SYMBOL(may_umount);

static HLIST_HEAD(unmounted);	/* protected by namespace_sem */

static void namespace_unlock(void)
{
	struct mount *mnt;
	struct hlist_head head = unmounted;

	if (likely(hlist_empty(&head))) {
		up_write(&namespace_sem);
		return;
	}

	head.first->pprev = &head.first;
	INIT_HLIST_HEAD(&unmounted);

	up_write(&namespace_sem);

	synchronize_rcu();

	while (!hlist_empty(&head)) {
		mnt = hlist_entry(head.first, struct mount, mnt_hash);
		hlist_del_init(&mnt->mnt_hash);
		if (mnt->mnt_ex_mountpoint.mnt)
			path_put(&mnt->mnt_ex_mountpoint);
		mntput(&mnt->mnt);
	}
}

static inline void namespace_lock(void)
{
	down_write(&namespace_sem);
}

/*
 * mount_lock must be held
 * namespace_sem must be held for write
 * how = 0 => just this tree, don't propagate
 * how = 1 => propagate; we know that nobody else has reference to any victims
 * how = 2 => lazy umount
 */
void umount_tree(struct mount *mnt, int how)
{
	HLIST_HEAD(tmp_list);
	struct mount *p;
	struct mount *last = NULL;

	for (p = mnt; p; p = next_mnt(p, mnt)) {
		hlist_del_init_rcu(&p->mnt_hash);
		hlist_add_head(&p->mnt_hash, &tmp_list);
	}

	if (how)
		propagate_umount(&tmp_list);

	hlist_for_each_entry(p, &tmp_list, mnt_hash) {
		list_del_init(&p->mnt_expire);
		list_del_init(&p->mnt_list);
		__touch_mnt_namespace(p->mnt_ns);
		p->mnt_ns = NULL;
		if (how < 2)
			p->mnt.mnt_flags |= MNT_SYNC_UMOUNT;
		list_del_init(&p->mnt_child);
		if (mnt_has_parent(p)) {
			put_mountpoint(p->mnt_mp);
			/* move the reference to mountpoint into ->mnt_ex_mountpoint */
			p->mnt_ex_mountpoint.dentry = p->mnt_mountpoint;
			p->mnt_ex_mountpoint.mnt = &p->mnt_parent->mnt;
			p->mnt_mountpoint = p->mnt.mnt_root;
			p->mnt_parent = p;
			p->mnt_mp = NULL;
		}
		change_mnt_propagation(p, MS_PRIVATE);
		last = p;
	}
	if (last) {
		last->mnt_hash.next = unmounted.first;
		unmounted.first = tmp_list.first;
		unmounted.first->pprev = &unmounted.first;
	}
}

static void shrink_submounts(struct mount *mnt);
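/*
 * Editorial summary of the 'how' argument of umount_tree() above, as used by
 * the callers in this file:
 *
 *	umount_tree(mnt, 0);	dissolve this tree only, no propagation
 *				(failed graft_tree(), drop_collected_mounts())
 *	umount_tree(mnt, 1);	regular umount, propagated to peers and slaves
 *	umount_tree(mnt, 2);	lazy umount (MNT_DETACH); MNT_SYNC_UMOUNT is
 *				deliberately not set in this case
 *
 * See do_umount() below for where 1 vs. 2 is chosen.
 */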
static int do_umount(struct mount *mnt, int flags)
{
	struct super_block *sb = mnt->mnt.mnt_sb;
	int retval;

	retval = security_sb_umount(&mnt->mnt, flags);
	if (retval)
		return retval;

	/*
	 * Allow userspace to request a mountpoint be expired rather than
	 * unmounting unconditionally. Unmount only happens if:
	 *  (1) the mark is already set (the mark is cleared by mntput())
	 *  (2) the usage count == 1 [parent vfsmount] + 1 [sys_umount]
	 */
	if (flags & MNT_EXPIRE) {
		if (&mnt->mnt == current->fs->root.mnt ||
		    flags & (MNT_FORCE | MNT_DETACH))
			return -EINVAL;

		/*
		 * probably don't strictly need the lock here if we examined
		 * all race cases, but it's a slowpath.
		 */
		lock_mount_hash();
		if (mnt_get_count(mnt) != 2) {
			unlock_mount_hash();
			return -EBUSY;
		}
		unlock_mount_hash();

		if (!xchg(&mnt->mnt_expiry_mark, 1))
			return -EAGAIN;
	}

	/*
	 * If we may have to abort operations to get out of this
	 * mount, and they will themselves hold resources we must
	 * allow the fs to do things. In the Unix tradition of
	 * 'Gee that's tricky, let's do it in userspace' the umount_begin
	 * might fail to complete on the first run through as other tasks
	 * must return, and the like. That's for the mount program to worry
	 * about for the moment.
	 */

	if (flags & MNT_FORCE && sb->s_op->umount_begin) {
		sb->s_op->umount_begin(sb);
	}

	/*
	 * No sense to grab the lock for this test, but test itself looks
	 * somewhat bogus. Suggestions for better replacement?
	 * Ho-hum... In principle, we might treat that as umount + switch
	 * to rootfs. GC would eventually take care of the old vfsmount.
	 * Actually it makes sense, especially if rootfs would contain a
	 * /reboot - static binary that would close all descriptors and
	 * call reboot(2). Then init(8) could umount root and exec /reboot.
	 */
	if (&mnt->mnt == current->fs->root.mnt && !(flags & MNT_DETACH)) {
		/*
		 * Special case for "unmounting" root ...
		 * we just try to remount it readonly.
		 */
		down_write(&sb->s_umount);
		if (!(sb->s_flags & MS_RDONLY))
			retval = do_remount_sb(sb, MS_RDONLY, NULL, 0);
		up_write(&sb->s_umount);
		return retval;
	}

	namespace_lock();
	lock_mount_hash();
	event++;

	if (flags & MNT_DETACH) {
		if (!list_empty(&mnt->mnt_list))
			umount_tree(mnt, 2);
		retval = 0;
	} else {
		shrink_submounts(mnt);
		retval = -EBUSY;
		if (!propagate_mount_busy(mnt, 2)) {
			if (!list_empty(&mnt->mnt_list))
				umount_tree(mnt, 1);
			retval = 0;
		}
	}
	unlock_mount_hash();
	namespace_unlock();
	return retval;
}

/*
 * Is the caller allowed to modify his namespace?
 */
static inline bool may_mount(void)
{
	return ns_capable(current->nsproxy->mnt_ns->user_ns, CAP_SYS_ADMIN);
}

/*
 * Now umount can handle mount points as well as block devices.
 * This is important for filesystems which use unnamed block devices.
 *
 * We now support a flag for forced unmount like the other 'big iron'
 * unixes. Our API is identical to OSF/1 to avoid making a mess of AMD
 */

SYSCALL_DEFINE2(umount, char __user *, name, int, flags)
{
	struct path path;
	struct mount *mnt;
	int retval;
	int lookup_flags = 0;

	if (flags & ~(MNT_FORCE | MNT_DETACH | MNT_EXPIRE | UMOUNT_NOFOLLOW))
		return -EINVAL;

	if (!may_mount())
		return -EPERM;

	if (!(flags & UMOUNT_NOFOLLOW))
		lookup_flags |= LOOKUP_FOLLOW;

	retval = user_path_mountpoint_at(AT_FDCWD, name, lookup_flags, &path);
	if (retval)
		goto out;
	mnt = real_mount(path.mnt);
	retval = -EINVAL;
	if (path.dentry != path.mnt->mnt_root)
		goto dput_and_out;
	if (!check_mnt(mnt))
		goto dput_and_out;
	if (mnt->mnt.mnt_flags & MNT_LOCKED)
		goto dput_and_out;

	retval = do_umount(mnt, flags);
dput_and_out:
	/* we mustn't call path_put() as that would clear mnt_expiry_mark */
	dput(path.dentry);
	mntput_no_expire(mnt);
out:
	return retval;
}

#ifdef __ARCH_WANT_SYS_OLDUMOUNT

/*
 *	The 2.0 compatible umount. No flags.
 */
SYSCALL_DEFINE1(oldumount, char __user *, name)
{
	return sys_umount(name, 0);
}

#endif
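/*
 * Userspace view (illustrative only) of the flag handling in the umount
 * syscall above; the C library exposes it as umount2(2):
 *
 *	umount2("/mnt/data", 0);		ordinary unmount
 *	umount2("/mnt/data", MNT_DETACH);	lazy: detach now, clean up later
 *	umount2("/mnt/data", UMOUNT_NOFOLLOW);	don't follow a trailing symlink
 *
 * MNT_FORCE and MNT_EXPIRE map onto the corresponding branches of
 * do_umount() earlier in this file.
 */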
static bool is_mnt_ns_file(struct dentry *dentry)
{
	/* Is this a proxy for a mount namespace? */
	struct inode *inode = dentry->d_inode;
	struct proc_ns *ei;

	if (!proc_ns_inode(inode))
		return false;

	ei = get_proc_ns(inode);
	if (ei->ns_ops != &mntns_operations)
		return false;

	return true;
}

static bool mnt_ns_loop(struct dentry *dentry)
{
	/* Could bind mounting the mount namespace inode cause a
	 * mount namespace loop?
	 */
	struct mnt_namespace *mnt_ns;
	if (!is_mnt_ns_file(dentry))
		return false;

	mnt_ns = get_proc_ns(dentry->d_inode)->ns;
	return current->nsproxy->mnt_ns->seq >= mnt_ns->seq;
}

struct mount *copy_tree(struct mount *mnt, struct dentry *dentry,
					int flag)
{
	struct mount *res, *p, *q, *r, *parent;

	if (!(flag & CL_COPY_UNBINDABLE) && IS_MNT_UNBINDABLE(mnt))
		return ERR_PTR(-EINVAL);

	if (!(flag & CL_COPY_MNT_NS_FILE) && is_mnt_ns_file(dentry))
		return ERR_PTR(-EINVAL);

	res = q = clone_mnt(mnt, dentry, flag);
	if (IS_ERR(q))
		return q;

	q->mnt.mnt_flags &= ~MNT_LOCKED;
	q->mnt_mountpoint = mnt->mnt_mountpoint;

	p = mnt;
	list_for_each_entry(r, &mnt->mnt_mounts, mnt_child) {
		struct mount *s;
		if (!is_subdir(r->mnt_mountpoint, dentry))
			continue;

		for (s = r; s; s = next_mnt(s, r)) {
			if (!(flag & CL_COPY_UNBINDABLE) &&
			    IS_MNT_UNBINDABLE(s)) {
				s = skip_mnt_tree(s);
				continue;
			}
			if (!(flag & CL_COPY_MNT_NS_FILE) &&
			    is_mnt_ns_file(s->mnt.mnt_root)) {
				s = skip_mnt_tree(s);
				continue;
			}
			while (p != s->mnt_parent) {
				p = p->mnt_parent;
				q = q->mnt_parent;
			}
			p = s;
			parent = q;
			q = clone_mnt(p, p->mnt.mnt_root, flag);
			if (IS_ERR(q))
				goto out;
			lock_mount_hash();
			list_add_tail(&q->mnt_list, &res->mnt_list);
			attach_mnt(q, parent, p->mnt_mp);
			unlock_mount_hash();
		}
	}
	return res;
out:
	if (res) {
		lock_mount_hash();
		umount_tree(res, 0);
		unlock_mount_hash();
	}
	return q;
}

/* Caller should check returned pointer for errors */

struct vfsmount *collect_mounts(struct path *path)
{
	struct mount *tree;
	namespace_lock();
	tree = copy_tree(real_mount(path->mnt), path->dentry,
			 CL_COPY_ALL | CL_PRIVATE);
	namespace_unlock();
	if (IS_ERR(tree))
		return ERR_CAST(tree);
	return &tree->mnt;
}

void drop_collected_mounts(struct vfsmount *mnt)
{
	namespace_lock();
	lock_mount_hash();
	umount_tree(real_mount(mnt), 0);
	unlock_mount_hash();
	namespace_unlock();
}

int iterate_mounts(int (*f)(struct vfsmount *, void *), void *arg,
		   struct vfsmount *root)
{
	struct mount *mnt;
	int res = f(root, arg);
	if (res)
		return res;
	list_for_each_entry(mnt, &real_mount(root)->mnt_list, mnt_list) {
		res = f(&mnt->mnt, arg);
		if (res)
			return res;
	}
	return 0;
}

static void cleanup_group_ids(struct mount *mnt, struct mount *end)
{
	struct mount *p;

	for (p = mnt; p != end; p = next_mnt(p, mnt)) {
		if (p->mnt_group_id && !IS_MNT_SHARED(p))
			mnt_release_group_id(p);
	}
}

static int invent_group_ids(struct mount *mnt, bool recurse)
{
	struct mount *p;

	for (p = mnt; p; p = recurse ? next_mnt(p, mnt) : NULL) {
		if (!p->mnt_group_id && !IS_MNT_SHARED(p)) {
			int err = mnt_alloc_group_id(p);
			if (err) {
				cleanup_group_ids(mnt, p);
				return err;
			}
		}
	}

	return 0;
}

/*
 * @source_mnt : mount tree to be attached
 * @nd         : place the mount tree @source_mnt is attached
 * @parent_nd  : if non-null, detach the source_mnt from its parent and
 *		 store the parent mount and mountpoint dentry.
 *		 (done when source_mnt is moved)
 *
 * NOTE: the table below explains the semantics when a source mount
 * of a given type is attached to a destination mount of a given type.
 * ---------------------------------------------------------------------------
 * |         BIND MOUNT OPERATION                                            |
 * |**************************************************************************
 * | source-->| shared        |       private  |       slave    | unbindable |
 * | dest     |               |                |                |            |
 * |   |      |               |                |                |            |
 * |   v      |               |                |                |            |
 * |**************************************************************************
 * |  shared  | shared (++)   |     shared (+) |     shared(+++)|  invalid   |
 * |          |               |                |                |            |
 * |non-shared| shared (+)    |      private   |      slave (*) |  invalid   |
 * ***************************************************************************
 * A bind operation clones the source mount and mounts the clone on the
 * destination mount.
 *
 * (++)  the cloned mount is propagated to all the mounts in the propagation
 *       tree of the destination mount and the cloned mount is added to
 *       the peer group of the source mount.
 * (+)   the cloned mount is created under the destination mount and is marked
 *       as shared. The cloned mount is added to the peer group of the source
 *       mount.
 * (+++) the mount is propagated to all the mounts in the propagation tree
 *       of the destination mount and the cloned mount is made slave
 *       of the same master as that of the source mount. The cloned mount
 *       is marked as 'shared and slave'.
 * (*)   the cloned mount is made a slave of the same master as that of the
 *       source mount.
 *
 * ---------------------------------------------------------------------------
 * |         MOVE MOUNT OPERATION                                            |
 * |**************************************************************************
 * | source-->| shared        |       private  |       slave    | unbindable |
 * | dest     |               |                |                |            |
 * |   |      |               |                |                |            |
 * |   v      |               |                |                |            |
 * |**************************************************************************
 * |  shared  | shared (+)    |     shared (+) |     shared(+++)|  invalid   |
 * |          |               |                |                |            |
 * |non-shared| shared (+*)   |      private   |      slave (*) | unbindable |
 * ***************************************************************************
 *
 * (+)   the mount is moved to the destination. And is then propagated to
 *       all the mounts in the propagation tree of the destination mount.
 * (+*)  the mount is moved to the destination.
 * (+++) the mount is moved to the destination and is then propagated to
 *       all the mounts belonging to the destination mount's propagation tree.
 *       the mount is marked as 'shared and slave'.
 * (*)   the mount continues to be a slave at the new location.
 *
 * if the source mount is a tree, the operations explained above are
 * applied to each mount in the tree.
 * Must be called without spinlocks held, since this function can sleep
 * in allocations.
 */
static int attach_recursive_mnt(struct mount *source_mnt,
			struct mount *dest_mnt,
			struct mountpoint *dest_mp,
			struct path *parent_path)
{
	HLIST_HEAD(tree_list);
	struct mount *child, *p;
	struct hlist_node *n;
	int err;

	if (IS_MNT_SHARED(dest_mnt)) {
		err = invent_group_ids(source_mnt, true);
		if (err)
			goto out;
		err = propagate_mnt(dest_mnt, dest_mp, source_mnt, &tree_list);
		lock_mount_hash();
		if (err)
			goto out_cleanup_ids;
		for (p = source_mnt; p; p = next_mnt(p, source_mnt))
			set_mnt_shared(p);
	} else {
		lock_mount_hash();
	}
	if (parent_path) {
		detach_mnt(source_mnt, parent_path);
		attach_mnt(source_mnt, dest_mnt, dest_mp);
		touch_mnt_namespace(source_mnt->mnt_ns);
	} else {
		mnt_set_mountpoint(dest_mnt, dest_mp, source_mnt);
		commit_tree(source_mnt, NULL);
	}

	hlist_for_each_entry_safe(child, n, &tree_list, mnt_hash) {
		struct mount *q;
		hlist_del_init(&child->mnt_hash);
		q = __lookup_mnt_last(&child->mnt_parent->mnt,
				      child->mnt_mountpoint);
		commit_tree(child, q);
	}
	unlock_mount_hash();

	return 0;

out_cleanup_ids:
	while (!hlist_empty(&tree_list)) {
		child = hlist_entry(tree_list.first, struct mount, mnt_hash);
		umount_tree(child, 0);
	}
	unlock_mount_hash();
	cleanup_group_ids(source_mnt, NULL);
out:
	return err;
}

static struct mountpoint *lock_mount(struct path *path)
{
	struct vfsmount *mnt;
	struct dentry *dentry = path->dentry;
retry:
	mutex_lock(&dentry->d_inode->i_mutex);
	if (unlikely(cant_mount(dentry))) {
		mutex_unlock(&dentry->d_inode->i_mutex);
		return ERR_PTR(-ENOENT);
	}
	namespace_lock();
	mnt = lookup_mnt(path);
	if (likely(!mnt)) {
		struct mountpoint *mp = new_mountpoint(dentry);
		if (IS_ERR(mp)) {
			namespace_unlock();
			mutex_unlock(&dentry->d_inode->i_mutex);
			return mp;
		}
		return mp;
	}
	namespace_unlock();
	mutex_unlock(&path->dentry->d_inode->i_mutex);
	path_put(path);
	path->mnt = mnt;
	dentry = path->dentry = dget(mnt->mnt_root);
	goto retry;
}

static void unlock_mount(struct mountpoint *where)
{
	struct dentry *dentry = where->m_dentry;
	put_mountpoint(where);
	namespace_unlock();
	mutex_unlock(&dentry->d_inode->i_mutex);
}

static int graft_tree(struct mount *mnt, struct mount *p, struct mountpoint *mp)
{
	if (mnt->mnt.mnt_sb->s_flags & MS_NOUSER)
		return -EINVAL;

	if (S_ISDIR(mp->m_dentry->d_inode->i_mode) !=
	      S_ISDIR(mnt->mnt.mnt_root->d_inode->i_mode))
		return -ENOTDIR;

	return attach_recursive_mnt(mnt, p, mp, NULL);
}

/*
 * Sanity check the flags to change_mnt_propagation.
 */

static int flags_to_propagation_type(int flags)
{
	int type = flags & ~(MS_REC | MS_SILENT);

	/* Fail if any non-propagation flags are set */
	if (type & ~(MS_SHARED | MS_PRIVATE | MS_SLAVE | MS_UNBINDABLE))
		return 0;
	/* Only one propagation flag should be set */
	if (!is_power_of_2(type))
		return 0;
	return type;
}
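/*
 * Example mapping (editorial): the propagation requests accepted by
 * flags_to_propagation_type() above are what mount(8) generates for the
 * familiar commands, e.g.
 *
 *	mount --make-shared     /mnt	->  mount(2) with MS_SHARED
 *	mount --make-rshared    /mnt	->  MS_SHARED | MS_REC
 *	mount --make-private    /mnt	->  MS_PRIVATE
 *	mount --make-slave      /mnt	->  MS_SLAVE
 *	mount --make-unbindable /mnt	->  MS_UNBINDABLE
 *
 * all of which end up in do_change_type() below.
 */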
/*
 * recursively change the type of the mountpoint.
 */
static int do_change_type(struct path *path, int flag)
{
	struct mount *m;
	struct mount *mnt = real_mount(path->mnt);
	int recurse = flag & MS_REC;
	int type;
	int err = 0;

	if (path->dentry != path->mnt->mnt_root)
		return -EINVAL;

	type = flags_to_propagation_type(flag);
	if (!type)
		return -EINVAL;

	namespace_lock();
	if (type == MS_SHARED) {
		err = invent_group_ids(mnt, recurse);
		if (err)
			goto out_unlock;
	}

	lock_mount_hash();
	for (m = mnt; m; m = (recurse ? next_mnt(m, mnt) : NULL))
		change_mnt_propagation(m, type);
	unlock_mount_hash();

out_unlock:
	namespace_unlock();
	return err;
}

static bool has_locked_children(struct mount *mnt, struct dentry *dentry)
{
	struct mount *child;
	list_for_each_entry(child, &mnt->mnt_mounts, mnt_child) {
		if (!is_subdir(child->mnt_mountpoint, dentry))
			continue;

		if (child->mnt.mnt_flags & MNT_LOCKED)
			return true;
	}
	return false;
}

/*
 * do loopback mount.
 */
static int do_loopback(struct path *path, const char *old_name,
				int recurse)
{
	struct path old_path;
	struct mount *mnt = NULL, *old, *parent;
	struct mountpoint *mp;
	int err;
	if (!old_name || !*old_name)
		return -EINVAL;
	err = kern_path(old_name, LOOKUP_FOLLOW|LOOKUP_AUTOMOUNT, &old_path);
	if (err)
		return err;

	err = -EINVAL;
	if (mnt_ns_loop(old_path.dentry))
		goto out;

	mp = lock_mount(path);
	err = PTR_ERR(mp);
	if (IS_ERR(mp))
		goto out;

	old = real_mount(old_path.mnt);
	parent = real_mount(path->mnt);

	err = -EINVAL;
	if (IS_MNT_UNBINDABLE(old))
		goto out2;

	if (!check_mnt(parent) || !check_mnt(old))
		goto out2;

	if (!recurse && has_locked_children(old, old_path.dentry))
		goto out2;

	if (recurse)
		mnt = copy_tree(old, old_path.dentry, CL_COPY_MNT_NS_FILE);
	else
		mnt = clone_mnt(old, old_path.dentry, 0);

	if (IS_ERR(mnt)) {
		err = PTR_ERR(mnt);
		goto out2;
	}

	mnt->mnt.mnt_flags &= ~MNT_LOCKED;

	err = graft_tree(mnt, parent, mp);
	if (err) {
		lock_mount_hash();
		umount_tree(mnt, 0);
		unlock_mount_hash();
	}
out2:
	unlock_mount(mp);
out:
	path_put(&old_path);
	return err;
}

static int change_mount_flags(struct vfsmount *mnt, int ms_flags)
{
	int error = 0;
	int readonly_request = 0;

	if (ms_flags & MS_RDONLY)
		readonly_request = 1;
	if (readonly_request == __mnt_is_readonly(mnt))
		return 0;

	if (readonly_request)
		error = mnt_make_readonly(real_mount(mnt));
	else
		__mnt_unmake_readonly(real_mount(mnt));
	return error;
}
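/*
 * Example (illustrative): the classic two-step read-only bind mount that
 * userspace performs maps onto do_loopback() and then change_mount_flags()
 * above:
 *
 *	mount("/src", "/dst", NULL, MS_BIND, NULL);
 *	mount(NULL, "/dst", NULL, MS_REMOUNT | MS_BIND | MS_RDONLY, NULL);
 *
 * The second call takes the MS_BIND branch of do_remount() below, which only
 * toggles per-mount flags instead of remounting the superblock.
 */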
/*
 * change filesystem flags. dir should be a physical root of filesystem.
 * If you've mounted a non-root directory somewhere and want to do remount
 * on it - tough luck.
 */
static int do_remount(struct path *path, int flags, int mnt_flags,
		      void *data)
{
	int err;
	struct super_block *sb = path->mnt->mnt_sb;
	struct mount *mnt = real_mount(path->mnt);

	if (!check_mnt(mnt))
		return -EINVAL;

	if (path->dentry != path->mnt->mnt_root)
		return -EINVAL;

	/* Don't allow changing of locked mnt flags.
	 *
	 * No locks need to be held here while testing the various
	 * MNT_LOCK flags because those flags can never be cleared
	 * once they are set.
	 */
	if ((mnt->mnt.mnt_flags & MNT_LOCK_READONLY) &&
	    !(mnt_flags & MNT_READONLY)) {
		return -EPERM;
	}
	if ((mnt->mnt.mnt_flags & MNT_LOCK_NODEV) &&
	    !(mnt_flags & MNT_NODEV)) {
		return -EPERM;
	}
	if ((mnt->mnt.mnt_flags & MNT_LOCK_NOSUID) &&
	    !(mnt_flags & MNT_NOSUID)) {
		return -EPERM;
	}
	if ((mnt->mnt.mnt_flags & MNT_LOCK_NOEXEC) &&
	    !(mnt_flags & MNT_NOEXEC)) {
		return -EPERM;
	}
	if ((mnt->mnt.mnt_flags & MNT_LOCK_ATIME) &&
	    ((mnt->mnt.mnt_flags & MNT_ATIME_MASK) != (mnt_flags & MNT_ATIME_MASK))) {
		return -EPERM;
	}

	err = security_sb_remount(sb, data);
	if (err)
		return err;

	down_write(&sb->s_umount);
	if (flags & MS_BIND)
		err = change_mount_flags(path->mnt, flags);
	else if (!capable(CAP_SYS_ADMIN))
		err = -EPERM;
	else
		err = do_remount_sb(sb, flags, data, 0);
	if (!err) {
		lock_mount_hash();
		mnt_flags |= mnt->mnt.mnt_flags & ~MNT_USER_SETTABLE_MASK;
		mnt->mnt.mnt_flags = mnt_flags;
		touch_mnt_namespace(mnt->mnt_ns);
		unlock_mount_hash();
	}
	up_write(&sb->s_umount);
	return err;
}

static inline int tree_contains_unbindable(struct mount *mnt)
{
	struct mount *p;
	for (p = mnt; p; p = next_mnt(p, mnt)) {
		if (IS_MNT_UNBINDABLE(p))
			return 1;
	}
	return 0;
}

static int do_move_mount(struct path *path, const char *old_name)
{
	struct path old_path, parent_path;
	struct mount *p;
	struct mount *old;
	struct mountpoint *mp;
	int err;
	if (!old_name || !*old_name)
		return -EINVAL;
	err = kern_path(old_name, LOOKUP_FOLLOW, &old_path);
	if (err)
		return err;

	mp = lock_mount(path);
	err = PTR_ERR(mp);
	if (IS_ERR(mp))
		goto out;

	old = real_mount(old_path.mnt);
	p = real_mount(path->mnt);

	err = -EINVAL;
	if (!check_mnt(p) || !check_mnt(old))
		goto out1;

	if (old->mnt.mnt_flags & MNT_LOCKED)
		goto out1;

	err = -EINVAL;
	if (old_path.dentry != old_path.mnt->mnt_root)
		goto out1;

	if (!mnt_has_parent(old))
		goto out1;

	if (S_ISDIR(path->dentry->d_inode->i_mode) !=
	      S_ISDIR(old_path.dentry->d_inode->i_mode))
		goto out1;
	/*
	 * Don't move a mount residing in a shared parent.
	 */
	if (IS_MNT_SHARED(old->mnt_parent))
		goto out1;
	/*
	 * Don't move a mount tree containing unbindable mounts to a destination
	 * mount which is shared.
	 */
	if (IS_MNT_SHARED(p) && tree_contains_unbindable(old))
		goto out1;
	err = -ELOOP;
	for (; mnt_has_parent(p); p = p->mnt_parent)
		if (p == old)
			goto out1;

	err = attach_recursive_mnt(old, real_mount(path->mnt), mp, &parent_path);
	if (err)
		goto out1;

	/* if the mount is moved, it should no longer be expired
	 * automatically */
	list_del_init(&old->mnt_expire);
out1:
	unlock_mount(mp);
out:
	if (!err)
		path_put(&parent_path);
	path_put(&old_path);
	return err;
}

static struct vfsmount *fs_set_subtype(struct vfsmount *mnt, const char *fstype)
{
	int err;
	const char *subtype = strchr(fstype, '.');
	if (subtype) {
		subtype++;
		err = -EINVAL;
		if (!subtype[0])
			goto err;
	} else
		subtype = "";

	mnt->mnt_sb->s_subtype = kstrdup(subtype, GFP_KERNEL);
	err = -ENOMEM;
	if (!mnt->mnt_sb->s_subtype)
		goto err;
	return mnt;

 err:
	mntput(mnt);
	return ERR_PTR(err);
}
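/*
 * Example (editorial; FUSE is used purely as an illustration): a request like
 *
 *	mount -t fuse.sshfs user@host:/ /mnt/remote
 *
 * reaches do_new_mount() below with fstype "fuse.sshfs".  get_fs_type()
 * resolves that to the registered "fuse" filesystem, and fs_set_subtype()
 * above stores "sshfs" in sb->s_subtype so the full "fuse.sshfs" name can be
 * reported back to userspace.
 */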
2159 */ 2160 if (!(type->fs_flags & FS_USERNS_DEV_MOUNT)) { 2161 flags |= MS_NODEV; 2162 mnt_flags |= MNT_NODEV | MNT_LOCK_NODEV; 2163 } 2164 } 2165 2166 mnt = vfs_kern_mount(type, flags, name, data); 2167 if (!IS_ERR(mnt) && (type->fs_flags & FS_HAS_SUBTYPE) && 2168 !mnt->mnt_sb->s_subtype) 2169 mnt = fs_set_subtype(mnt, fstype); 2170 2171 put_filesystem(type); 2172 if (IS_ERR(mnt)) 2173 return PTR_ERR(mnt); 2174 2175 err = do_add_mount(real_mount(mnt), path, mnt_flags); 2176 if (err) 2177 mntput(mnt); 2178 return err; 2179 } 2180 2181 int finish_automount(struct vfsmount *m, struct path *path) 2182 { 2183 struct mount *mnt = real_mount(m); 2184 int err; 2185 /* The new mount record should have at least 2 refs to prevent it being 2186 * expired before we get a chance to add it 2187 */ 2188 BUG_ON(mnt_get_count(mnt) < 2); 2189 2190 if (m->mnt_sb == path->mnt->mnt_sb && 2191 m->mnt_root == path->dentry) { 2192 err = -ELOOP; 2193 goto fail; 2194 } 2195 2196 err = do_add_mount(mnt, path, path->mnt->mnt_flags | MNT_SHRINKABLE); 2197 if (!err) 2198 return 0; 2199 fail: 2200 /* remove m from any expiration list it may be on */ 2201 if (!list_empty(&mnt->mnt_expire)) { 2202 namespace_lock(); 2203 list_del_init(&mnt->mnt_expire); 2204 namespace_unlock(); 2205 } 2206 mntput(m); 2207 mntput(m); 2208 return err; 2209 } 2210 2211 /** 2212 * mnt_set_expiry - Put a mount on an expiration list 2213 * @mnt: The mount to list. 2214 * @expiry_list: The list to add the mount to. 2215 */ 2216 void mnt_set_expiry(struct vfsmount *mnt, struct list_head *expiry_list) 2217 { 2218 namespace_lock(); 2219 2220 list_add_tail(&real_mount(mnt)->mnt_expire, expiry_list); 2221 2222 namespace_unlock(); 2223 } 2224 EXPORT_SYMBOL(mnt_set_expiry); 2225 2226 /* 2227 * process a list of expirable mountpoints with the intent of discarding any 2228 * mountpoints that aren't in use and haven't been touched since last we came 2229 * here 2230 */ 2231 void mark_mounts_for_expiry(struct list_head *mounts) 2232 { 2233 struct mount *mnt, *next; 2234 LIST_HEAD(graveyard); 2235 2236 if (list_empty(mounts)) 2237 return; 2238 2239 namespace_lock(); 2240 lock_mount_hash(); 2241 2242 /* extract from the expiration list every vfsmount that matches the 2243 * following criteria: 2244 * - only referenced by its parent vfsmount 2245 * - still marked for expiry (marked on the last call here; marks are 2246 * cleared by mntput()) 2247 */ 2248 list_for_each_entry_safe(mnt, next, mounts, mnt_expire) { 2249 if (!xchg(&mnt->mnt_expiry_mark, 1) || 2250 propagate_mount_busy(mnt, 1)) 2251 continue; 2252 list_move(&mnt->mnt_expire, &graveyard); 2253 } 2254 while (!list_empty(&graveyard)) { 2255 mnt = list_first_entry(&graveyard, struct mount, mnt_expire); 2256 touch_mnt_namespace(mnt->mnt_ns); 2257 umount_tree(mnt, 1); 2258 } 2259 unlock_mount_hash(); 2260 namespace_unlock(); 2261 } 2262 2263 EXPORT_SYMBOL_GPL(mark_mounts_for_expiry); 2264 2265 /* 2266 * Ripoff of 'select_parent()' 2267 * 2268 * search the list of submounts for a given mountpoint, and move any 2269 * shrinkable submounts to the 'graveyard' list. 
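The FS_USERNS_MOUNT / FS_USERNS_DEV_MOUNT gating in do_new_mount() above is what an unprivileged user hits when mounting inside an unprivileged user namespace: a filesystem that allows such mounts but does not set FS_USERNS_DEV_MOUNT gets nodev forced on and locked. A rough sketch, assuming the kernel is configured to allow unprivileged user namespaces and that the filesystem (tmpfs here) sets FS_USERNS_MOUNT; error handling is minimal.

#define _GNU_SOURCE
#include <sched.h>
#include <stdio.h>
#include <string.h>
#include <errno.h>
#include <sys/mount.h>

int main(void)
{
	/* New user + mount namespace; we gain CAP_SYS_ADMIN in the new userns. */
	if (unshare(CLONE_NEWUSER | CLONE_NEWNS) == -1) {
		perror("unshare");
		return 1;
	}
	/*
	 * If the filesystem lacks FS_USERNS_DEV_MOUNT, do_new_mount() adds
	 * MNT_NODEV | MNT_LOCK_NODEV behind our back, so the new mount shows
	 * up with nodev even though we did not ask for it.
	 */
	if (mount("tmpfs", "/tmp", "tmpfs", 0, "size=1m") == -1) {
		fprintf(stderr, "mount tmpfs: %s\n", strerror(errno));
		return 1;
	}
	printf("mounted; /proc/self/mountinfo should show nodev on /tmp\n");
	return 0;
}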
2270 */ 2271 static int select_submounts(struct mount *parent, struct list_head *graveyard) 2272 { 2273 struct mount *this_parent = parent; 2274 struct list_head *next; 2275 int found = 0; 2276 2277 repeat: 2278 next = this_parent->mnt_mounts.next; 2279 resume: 2280 while (next != &this_parent->mnt_mounts) { 2281 struct list_head *tmp = next; 2282 struct mount *mnt = list_entry(tmp, struct mount, mnt_child); 2283 2284 next = tmp->next; 2285 if (!(mnt->mnt.mnt_flags & MNT_SHRINKABLE)) 2286 continue; 2287 /* 2288 * Descend a level if the d_mounts list is non-empty. 2289 */ 2290 if (!list_empty(&mnt->mnt_mounts)) { 2291 this_parent = mnt; 2292 goto repeat; 2293 } 2294 2295 if (!propagate_mount_busy(mnt, 1)) { 2296 list_move_tail(&mnt->mnt_expire, graveyard); 2297 found++; 2298 } 2299 } 2300 /* 2301 * All done at this level ... ascend and resume the search 2302 */ 2303 if (this_parent != parent) { 2304 next = this_parent->mnt_child.next; 2305 this_parent = this_parent->mnt_parent; 2306 goto resume; 2307 } 2308 return found; 2309 } 2310 2311 /* 2312 * process a list of expirable mountpoints with the intent of discarding any 2313 * submounts of a specific parent mountpoint 2314 * 2315 * mount_lock must be held for write 2316 */ 2317 static void shrink_submounts(struct mount *mnt) 2318 { 2319 LIST_HEAD(graveyard); 2320 struct mount *m; 2321 2322 /* extract submounts of 'mountpoint' from the expiration list */ 2323 while (select_submounts(mnt, &graveyard)) { 2324 while (!list_empty(&graveyard)) { 2325 m = list_first_entry(&graveyard, struct mount, 2326 mnt_expire); 2327 touch_mnt_namespace(m->mnt_ns); 2328 umount_tree(m, 1); 2329 } 2330 } 2331 } 2332 2333 /* 2334 * Some copy_from_user() implementations do not return the exact number of 2335 * bytes remaining to copy on a fault. But copy_mount_options() requires that. 2336 * Note that this function differs from copy_from_user() in that it will oops 2337 * on bad values of `to', rather than returning a short copy. 2338 */ 2339 static long exact_copy_from_user(void *to, const void __user * from, 2340 unsigned long n) 2341 { 2342 char *t = to; 2343 const char __user *f = from; 2344 char c; 2345 2346 if (!access_ok(VERIFY_READ, from, n)) 2347 return n; 2348 2349 while (n) { 2350 if (__get_user(c, f)) { 2351 memset(t, 0, n); 2352 break; 2353 } 2354 *t++ = c; 2355 f++; 2356 n--; 2357 } 2358 return n; 2359 } 2360 2361 int copy_mount_options(const void __user * data, unsigned long *where) 2362 { 2363 int i; 2364 unsigned long page; 2365 unsigned long size; 2366 2367 *where = 0; 2368 if (!data) 2369 return 0; 2370 2371 if (!(page = __get_free_page(GFP_KERNEL))) 2372 return -ENOMEM; 2373 2374 /* We only care that *some* data at the address the user 2375 * gave us is valid. Just in case, we'll zero 2376 * the remainder of the page. 2377 */ 2378 /* copy_from_user cannot cross TASK_SIZE ! 
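select_submounts() above walks the mount tree without recursion, descending through mnt_mounts and climbing back through mnt_parent, and skips interior nodes on each pass (a busy parent is left for a later shrink_submounts() iteration). A toy userspace analog of the same traversal pattern over an invented tree, for illustration only:

#include <stdio.h>

struct node {
	const char *name;
	struct node *parent;
	struct node *first_child;
	struct node *next_sibling;
};

/* Visit every leaf, descending and ascending with pointers only. */
static void walk(struct node *root)
{
	struct node *this_parent = root;
	struct node *next = root->first_child;

	while (next) {
		struct node *n = next;

		if (n->first_child) {		/* descend a level */
			this_parent = n;
			next = n->first_child;
			continue;
		}
		printf("leaf: %s\n", n->name);
		next = n->next_sibling;

		while (!next && this_parent != root) {	/* ascend and resume */
			next = this_parent->next_sibling;
			this_parent = this_parent->parent;
		}
	}
}

int main(void)
{
	struct node c = { "C", NULL, NULL, NULL };
	struct node b = { "B", NULL, &c, NULL };
	struct node a = { "A", NULL, NULL, &b };
	struct node root = { "root", NULL, &a, NULL };

	a.parent = &root;
	b.parent = &root;
	c.parent = &b;
	walk(&root);	/* prints the leaves A and C */
	return 0;
}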
*/ 2379 size = TASK_SIZE - (unsigned long)data; 2380 if (size > PAGE_SIZE) 2381 size = PAGE_SIZE; 2382 2383 i = size - exact_copy_from_user((void *)page, data, size); 2384 if (!i) { 2385 free_page(page); 2386 return -EFAULT; 2387 } 2388 if (i != PAGE_SIZE) 2389 memset((char *)page + i, 0, PAGE_SIZE - i); 2390 *where = page; 2391 return 0; 2392 } 2393 2394 int copy_mount_string(const void __user *data, char **where) 2395 { 2396 char *tmp; 2397 2398 if (!data) { 2399 *where = NULL; 2400 return 0; 2401 } 2402 2403 tmp = strndup_user(data, PAGE_SIZE); 2404 if (IS_ERR(tmp)) 2405 return PTR_ERR(tmp); 2406 2407 *where = tmp; 2408 return 0; 2409 } 2410 2411 /* 2412 * Flags is a 32-bit value that allows up to 31 non-fs dependent flags to 2413 * be given to the mount() call (ie: read-only, no-dev, no-suid etc). 2414 * 2415 * data is a (void *) that can point to any structure up to 2416 * PAGE_SIZE-1 bytes, which can contain arbitrary fs-dependent 2417 * information (or be NULL). 2418 * 2419 * Pre-0.97 versions of mount() didn't have a flags word. 2420 * When the flags word was introduced its top half was required 2421 * to have the magic value 0xC0ED, and this remained so until 2.4.0-test9. 2422 * Therefore, if this magic number is present, it carries no information 2423 * and must be discarded. 2424 */ 2425 long do_mount(const char *dev_name, const char *dir_name, 2426 const char *type_page, unsigned long flags, void *data_page) 2427 { 2428 struct path path; 2429 int retval = 0; 2430 int mnt_flags = 0; 2431 2432 /* Discard magic */ 2433 if ((flags & MS_MGC_MSK) == MS_MGC_VAL) 2434 flags &= ~MS_MGC_MSK; 2435 2436 /* Basic sanity checks */ 2437 2438 if (!dir_name || !*dir_name || !memchr(dir_name, 0, PAGE_SIZE)) 2439 return -EINVAL; 2440 2441 if (data_page) 2442 ((char *)data_page)[PAGE_SIZE - 1] = 0; 2443 2444 /* ... 
and get the mountpoint */ 2445 retval = kern_path(dir_name, LOOKUP_FOLLOW, &path); 2446 if (retval) 2447 return retval; 2448 2449 retval = security_sb_mount(dev_name, &path, 2450 type_page, flags, data_page); 2451 if (!retval && !may_mount()) 2452 retval = -EPERM; 2453 if (retval) 2454 goto dput_out; 2455 2456 /* Default to relatime unless overridden */ 2457 if (!(flags & MS_NOATIME)) 2458 mnt_flags |= MNT_RELATIME; 2459 2460 /* Separate the per-mountpoint flags */ 2461 if (flags & MS_NOSUID) 2462 mnt_flags |= MNT_NOSUID; 2463 if (flags & MS_NODEV) 2464 mnt_flags |= MNT_NODEV; 2465 if (flags & MS_NOEXEC) 2466 mnt_flags |= MNT_NOEXEC; 2467 if (flags & MS_NOATIME) 2468 mnt_flags |= MNT_NOATIME; 2469 if (flags & MS_NODIRATIME) 2470 mnt_flags |= MNT_NODIRATIME; 2471 if (flags & MS_STRICTATIME) 2472 mnt_flags &= ~(MNT_RELATIME | MNT_NOATIME); 2473 if (flags & MS_RDONLY) 2474 mnt_flags |= MNT_READONLY; 2475 2476 /* The default atime for remount is preservation */ 2477 if ((flags & MS_REMOUNT) && 2478 ((flags & (MS_NOATIME | MS_NODIRATIME | MS_RELATIME | 2479 MS_STRICTATIME)) == 0)) { 2480 mnt_flags &= ~MNT_ATIME_MASK; 2481 mnt_flags |= path.mnt->mnt_flags & MNT_ATIME_MASK; 2482 } 2483 2484 flags &= ~(MS_NOSUID | MS_NOEXEC | MS_NODEV | MS_ACTIVE | MS_BORN | 2485 MS_NOATIME | MS_NODIRATIME | MS_RELATIME| MS_KERNMOUNT | 2486 MS_STRICTATIME); 2487 2488 if (flags & MS_REMOUNT) 2489 retval = do_remount(&path, flags & ~MS_REMOUNT, mnt_flags, 2490 data_page); 2491 else if (flags & MS_BIND) 2492 retval = do_loopback(&path, dev_name, flags & MS_REC); 2493 else if (flags & (MS_SHARED | MS_PRIVATE | MS_SLAVE | MS_UNBINDABLE)) 2494 retval = do_change_type(&path, flags); 2495 else if (flags & MS_MOVE) 2496 retval = do_move_mount(&path, dev_name); 2497 else 2498 retval = do_new_mount(&path, type_page, flags, mnt_flags, 2499 dev_name, data_page); 2500 dput_out: 2501 path_put(&path); 2502 return retval; 2503 } 2504 2505 static void free_mnt_ns(struct mnt_namespace *ns) 2506 { 2507 proc_free_inum(ns->proc_inum); 2508 put_user_ns(ns->user_ns); 2509 kfree(ns); 2510 } 2511 2512 /* 2513 * Assign a sequence number so we can detect when we attempt to bind 2514 * mount a reference to an older mount namespace into the current 2515 * mount namespace, preventing reference counting loops. A 64-bit 2516 * number incrementing at 10GHz will take 12,427 years to wrap which 2517 * is effectively never, so we can ignore the possibility.
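The tail of do_mount() above dispatches on the flag word, so each style of mount(2) call lands in a different helper; the final data argument is what copy_mount_options() copies into a single page. A compressed sketch of the mapping, with placeholder paths and no real error recovery:

#include <stdio.h>
#include <string.h>
#include <errno.h>
#include <sys/mount.h>

static void report(const char *what, int rc)
{
	if (rc == -1)
		fprintf(stderr, "%s: %s\n", what, strerror(errno));
}

int main(void)
{
	/* MS_REMOUNT                 -> do_remount()      */
	report("remount", mount(NULL, "/mnt/a", NULL, MS_REMOUNT | MS_RDONLY, NULL));
	/* MS_BIND                    -> do_loopback()     */
	report("bind", mount("/src", "/mnt/b", NULL, MS_BIND, NULL));
	/* MS_SHARED/PRIVATE/...      -> do_change_type()  */
	report("make shared", mount(NULL, "/mnt/b", NULL, MS_SHARED, NULL));
	/* MS_MOVE                    -> do_move_mount()   */
	report("move", mount("/mnt/b", "/mnt/c", NULL, MS_MOVE, NULL));
	/* everything else            -> do_new_mount()    */
	report("new mount", mount("tmpfs", "/mnt/d", "tmpfs", 0, "size=1m"));
	return 0;
}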
2518 */ 2519 static atomic64_t mnt_ns_seq = ATOMIC64_INIT(1); 2520 2521 static struct mnt_namespace *alloc_mnt_ns(struct user_namespace *user_ns) 2522 { 2523 struct mnt_namespace *new_ns; 2524 int ret; 2525 2526 new_ns = kmalloc(sizeof(struct mnt_namespace), GFP_KERNEL); 2527 if (!new_ns) 2528 return ERR_PTR(-ENOMEM); 2529 ret = proc_alloc_inum(&new_ns->proc_inum); 2530 if (ret) { 2531 kfree(new_ns); 2532 return ERR_PTR(ret); 2533 } 2534 new_ns->seq = atomic64_add_return(1, &mnt_ns_seq); 2535 atomic_set(&new_ns->count, 1); 2536 new_ns->root = NULL; 2537 INIT_LIST_HEAD(&new_ns->list); 2538 init_waitqueue_head(&new_ns->poll); 2539 new_ns->event = 0; 2540 new_ns->user_ns = get_user_ns(user_ns); 2541 return new_ns; 2542 } 2543 2544 struct mnt_namespace *copy_mnt_ns(unsigned long flags, struct mnt_namespace *ns, 2545 struct user_namespace *user_ns, struct fs_struct *new_fs) 2546 { 2547 struct mnt_namespace *new_ns; 2548 struct vfsmount *rootmnt = NULL, *pwdmnt = NULL; 2549 struct mount *p, *q; 2550 struct mount *old; 2551 struct mount *new; 2552 int copy_flags; 2553 2554 BUG_ON(!ns); 2555 2556 if (likely(!(flags & CLONE_NEWNS))) { 2557 get_mnt_ns(ns); 2558 return ns; 2559 } 2560 2561 old = ns->root; 2562 2563 new_ns = alloc_mnt_ns(user_ns); 2564 if (IS_ERR(new_ns)) 2565 return new_ns; 2566 2567 namespace_lock(); 2568 /* First pass: copy the tree topology */ 2569 copy_flags = CL_COPY_UNBINDABLE | CL_EXPIRE; 2570 if (user_ns != ns->user_ns) 2571 copy_flags |= CL_SHARED_TO_SLAVE | CL_UNPRIVILEGED; 2572 new = copy_tree(old, old->mnt.mnt_root, copy_flags); 2573 if (IS_ERR(new)) { 2574 namespace_unlock(); 2575 free_mnt_ns(new_ns); 2576 return ERR_CAST(new); 2577 } 2578 new_ns->root = new; 2579 list_add_tail(&new_ns->list, &new->mnt_list); 2580 2581 /* 2582 * Second pass: switch the tsk->fs->* elements and mark new vfsmounts 2583 * as belonging to new namespace. We have already acquired a private 2584 * fs_struct, so tsk->fs->lock is not needed. 
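The proc_inum allocated in alloc_mnt_ns() above is what appears as the inode number of /proc/<pid>/ns/mnt, so the namespace copy done by copy_mnt_ns() for unshare(CLONE_NEWNS) is easy to observe from userspace. A small sketch; it needs CAP_SYS_ADMIN (or a prior unshare of the user namespace).

#define _GNU_SOURCE
#include <sched.h>
#include <stdio.h>
#include <sys/stat.h>

/* Inode of our mount-namespace handle; 0 on error. */
static unsigned long mnt_ns_inode(void)
{
	struct stat st;

	if (stat("/proc/self/ns/mnt", &st) == -1)
		return 0;
	return (unsigned long)st.st_ino;
}

int main(void)
{
	printf("mount namespace before: %lu\n", mnt_ns_inode());

	/* unshare(CLONE_NEWNS) ends up in copy_mnt_ns() -> alloc_mnt_ns(). */
	if (unshare(CLONE_NEWNS) == -1) {
		perror("unshare(CLONE_NEWNS)");
		return 1;
	}
	printf("mount namespace after:  %lu\n", mnt_ns_inode());
	return 0;
}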
2585 */ 2586 p = old; 2587 q = new; 2588 while (p) { 2589 q->mnt_ns = new_ns; 2590 if (new_fs) { 2591 if (&p->mnt == new_fs->root.mnt) { 2592 new_fs->root.mnt = mntget(&q->mnt); 2593 rootmnt = &p->mnt; 2594 } 2595 if (&p->mnt == new_fs->pwd.mnt) { 2596 new_fs->pwd.mnt = mntget(&q->mnt); 2597 pwdmnt = &p->mnt; 2598 } 2599 } 2600 p = next_mnt(p, old); 2601 q = next_mnt(q, new); 2602 if (!q) 2603 break; 2604 while (p->mnt.mnt_root != q->mnt.mnt_root) 2605 p = next_mnt(p, old); 2606 } 2607 namespace_unlock(); 2608 2609 if (rootmnt) 2610 mntput(rootmnt); 2611 if (pwdmnt) 2612 mntput(pwdmnt); 2613 2614 return new_ns; 2615 } 2616 2617 /** 2618 * create_mnt_ns - creates a private namespace and adds a root filesystem 2619 * @mnt: pointer to the new root filesystem mountpoint 2620 */ 2621 static struct mnt_namespace *create_mnt_ns(struct vfsmount *m) 2622 { 2623 struct mnt_namespace *new_ns = alloc_mnt_ns(&init_user_ns); 2624 if (!IS_ERR(new_ns)) { 2625 struct mount *mnt = real_mount(m); 2626 mnt->mnt_ns = new_ns; 2627 new_ns->root = mnt; 2628 list_add(&mnt->mnt_list, &new_ns->list); 2629 } else { 2630 mntput(m); 2631 } 2632 return new_ns; 2633 } 2634 2635 struct dentry *mount_subtree(struct vfsmount *mnt, const char *name) 2636 { 2637 struct mnt_namespace *ns; 2638 struct super_block *s; 2639 struct path path; 2640 int err; 2641 2642 ns = create_mnt_ns(mnt); 2643 if (IS_ERR(ns)) 2644 return ERR_CAST(ns); 2645 2646 err = vfs_path_lookup(mnt->mnt_root, mnt, 2647 name, LOOKUP_FOLLOW|LOOKUP_AUTOMOUNT, &path); 2648 2649 put_mnt_ns(ns); 2650 2651 if (err) 2652 return ERR_PTR(err); 2653 2654 /* trade a vfsmount reference for active sb one */ 2655 s = path.mnt->mnt_sb; 2656 atomic_inc(&s->s_active); 2657 mntput(path.mnt); 2658 /* lock the sucker */ 2659 down_write(&s->s_umount); 2660 /* ... 
and return the root of (sub)tree on it */ 2661 return path.dentry; 2662 } 2663 EXPORT_SYMBOL(mount_subtree); 2664 2665 SYSCALL_DEFINE5(mount, char __user *, dev_name, char __user *, dir_name, 2666 char __user *, type, unsigned long, flags, void __user *, data) 2667 { 2668 int ret; 2669 char *kernel_type; 2670 struct filename *kernel_dir; 2671 char *kernel_dev; 2672 unsigned long data_page; 2673 2674 ret = copy_mount_string(type, &kernel_type); 2675 if (ret < 0) 2676 goto out_type; 2677 2678 kernel_dir = getname(dir_name); 2679 if (IS_ERR(kernel_dir)) { 2680 ret = PTR_ERR(kernel_dir); 2681 goto out_dir; 2682 } 2683 2684 ret = copy_mount_string(dev_name, &kernel_dev); 2685 if (ret < 0) 2686 goto out_dev; 2687 2688 ret = copy_mount_options(data, &data_page); 2689 if (ret < 0) 2690 goto out_data; 2691 2692 ret = do_mount(kernel_dev, kernel_dir->name, kernel_type, flags, 2693 (void *) data_page); 2694 2695 free_page(data_page); 2696 out_data: 2697 kfree(kernel_dev); 2698 out_dev: 2699 putname(kernel_dir); 2700 out_dir: 2701 kfree(kernel_type); 2702 out_type: 2703 return ret; 2704 } 2705 2706 /* 2707 * Return true if path is reachable from root 2708 * 2709 * namespace_sem or mount_lock is held 2710 */ 2711 bool is_path_reachable(struct mount *mnt, struct dentry *dentry, 2712 const struct path *root) 2713 { 2714 while (&mnt->mnt != root->mnt && mnt_has_parent(mnt)) { 2715 dentry = mnt->mnt_mountpoint; 2716 mnt = mnt->mnt_parent; 2717 } 2718 return &mnt->mnt == root->mnt && is_subdir(dentry, root->dentry); 2719 } 2720 2721 int path_is_under(struct path *path1, struct path *path2) 2722 { 2723 int res; 2724 read_seqlock_excl(&mount_lock); 2725 res = is_path_reachable(real_mount(path1->mnt), path1->dentry, path2); 2726 read_sequnlock_excl(&mount_lock); 2727 return res; 2728 } 2729 EXPORT_SYMBOL(path_is_under); 2730 2731 /* 2732 * pivot_root Semantics: 2733 * Moves the root file system of the current process to the directory put_old, 2734 * makes new_root as the new root file system of the current process, and sets 2735 * root/cwd of all processes which had them on the current root to new_root. 2736 * 2737 * Restrictions: 2738 * The new_root and put_old must be directories, and must not be on the 2739 * same file system as the current process root. The put_old must be 2740 * underneath new_root, i.e. adding a non-zero number of /.. to the string 2741 * pointed to by put_old must yield the same directory as new_root. No other 2742 * file system may be mounted on put_old. After all, new_root is a mountpoint. 2743 * 2744 * Also, the current root cannot be on the 'rootfs' (initial ramfs) filesystem. 2745 * See Documentation/filesystems/ramfs-rootfs-initramfs.txt for alternatives 2746 * in this situation. 2747 * 2748 * Notes: 2749 * - we don't move root/cwd if they are not at the root (reason: if something 2750 * cared enough to change them, it's probably wrong to force them elsewhere) 2751 * - it's okay to pick a root that isn't the root of a file system, e.g. 2752 * /nfs/my_root where /nfs is the mount point. It must be a mountpoint, 2753 * though, so you may need to say mount --bind /nfs/my_root /nfs/my_root 2754 * first. 
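A condensed userspace sketch of the sequence the comment above describes, roughly what a container runtime does before dropping into a new root. /newroot and /newroot/oldroot are placeholder directories that must already exist, the helper name pivot_root_syscall is invented (glibc provides no wrapper), and error handling is minimal.

#define _GNU_SOURCE
#include <sched.h>
#include <stdio.h>
#include <unistd.h>
#include <sys/mount.h>
#include <sys/syscall.h>

static int pivot_root_syscall(const char *new_root, const char *put_old)
{
	return syscall(SYS_pivot_root, new_root, put_old);
}

int main(void)
{
	if (unshare(CLONE_NEWNS) == -1) {
		perror("unshare");
		return 1;
	}
	/* Keep the changes below from propagating to the parent namespace. */
	if (mount(NULL, "/", NULL, MS_REC | MS_PRIVATE, NULL) == -1) {
		perror("make / private");
		return 1;
	}
	/* new_root must be a mount point: bind it over itself if needed. */
	if (mount("/newroot", "/newroot", NULL, MS_BIND | MS_REC, NULL) == -1) {
		perror("bind /newroot");
		return 1;
	}
	if (pivot_root_syscall("/newroot", "/newroot/oldroot") == -1) {
		perror("pivot_root");
		return 1;
	}
	if (chdir("/") == -1) {
		perror("chdir");
		return 1;
	}
	/* The old root is now under /oldroot; detach it once it is unused. */
	if (umount2("/oldroot", MNT_DETACH) == -1) {
		perror("umount2");
		return 1;
	}
	return 0;
}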
2755 */ 2756 SYSCALL_DEFINE2(pivot_root, const char __user *, new_root, 2757 const char __user *, put_old) 2758 { 2759 struct path new, old, parent_path, root_parent, root; 2760 struct mount *new_mnt, *root_mnt, *old_mnt; 2761 struct mountpoint *old_mp, *root_mp; 2762 int error; 2763 2764 if (!may_mount()) 2765 return -EPERM; 2766 2767 error = user_path_dir(new_root, &new); 2768 if (error) 2769 goto out0; 2770 2771 error = user_path_dir(put_old, &old); 2772 if (error) 2773 goto out1; 2774 2775 error = security_sb_pivotroot(&old, &new); 2776 if (error) 2777 goto out2; 2778 2779 get_fs_root(current->fs, &root); 2780 old_mp = lock_mount(&old); 2781 error = PTR_ERR(old_mp); 2782 if (IS_ERR(old_mp)) 2783 goto out3; 2784 2785 error = -EINVAL; 2786 new_mnt = real_mount(new.mnt); 2787 root_mnt = real_mount(root.mnt); 2788 old_mnt = real_mount(old.mnt); 2789 if (IS_MNT_SHARED(old_mnt) || 2790 IS_MNT_SHARED(new_mnt->mnt_parent) || 2791 IS_MNT_SHARED(root_mnt->mnt_parent)) 2792 goto out4; 2793 if (!check_mnt(root_mnt) || !check_mnt(new_mnt)) 2794 goto out4; 2795 if (new_mnt->mnt.mnt_flags & MNT_LOCKED) 2796 goto out4; 2797 error = -ENOENT; 2798 if (d_unlinked(new.dentry)) 2799 goto out4; 2800 error = -EBUSY; 2801 if (new_mnt == root_mnt || old_mnt == root_mnt) 2802 goto out4; /* loop, on the same file system */ 2803 error = -EINVAL; 2804 if (root.mnt->mnt_root != root.dentry) 2805 goto out4; /* not a mountpoint */ 2806 if (!mnt_has_parent(root_mnt)) 2807 goto out4; /* not attached */ 2808 root_mp = root_mnt->mnt_mp; 2809 if (new.mnt->mnt_root != new.dentry) 2810 goto out4; /* not a mountpoint */ 2811 if (!mnt_has_parent(new_mnt)) 2812 goto out4; /* not attached */ 2813 /* make sure we can reach put_old from new_root */ 2814 if (!is_path_reachable(old_mnt, old.dentry, &new)) 2815 goto out4; 2816 root_mp->m_count++; /* pin it so it won't go away */ 2817 lock_mount_hash(); 2818 detach_mnt(new_mnt, &parent_path); 2819 detach_mnt(root_mnt, &root_parent); 2820 if (root_mnt->mnt.mnt_flags & MNT_LOCKED) { 2821 new_mnt->mnt.mnt_flags |= MNT_LOCKED; 2822 root_mnt->mnt.mnt_flags &= ~MNT_LOCKED; 2823 } 2824 /* mount old root on put_old */ 2825 attach_mnt(root_mnt, old_mnt, old_mp); 2826 /* mount new_root on / */ 2827 attach_mnt(new_mnt, real_mount(root_parent.mnt), root_mp); 2828 touch_mnt_namespace(current->nsproxy->mnt_ns); 2829 unlock_mount_hash(); 2830 chroot_fs_refs(&root, &new); 2831 put_mountpoint(root_mp); 2832 error = 0; 2833 out4: 2834 unlock_mount(old_mp); 2835 if (!error) { 2836 path_put(&root_parent); 2837 path_put(&parent_path); 2838 } 2839 out3: 2840 path_put(&root); 2841 out2: 2842 path_put(&old); 2843 out1: 2844 path_put(&new); 2845 out0: 2846 return error; 2847 } 2848 2849 static void __init init_mount_tree(void) 2850 { 2851 struct vfsmount *mnt; 2852 struct mnt_namespace *ns; 2853 struct path root; 2854 struct file_system_type *type; 2855 2856 type = get_fs_type("rootfs"); 2857 if (!type) 2858 panic("Can't find rootfs type"); 2859 mnt = vfs_kern_mount(type, 0, "rootfs", NULL); 2860 put_filesystem(type); 2861 if (IS_ERR(mnt)) 2862 panic("Can't create rootfs"); 2863 2864 ns = create_mnt_ns(mnt); 2865 if (IS_ERR(ns)) 2866 panic("Can't allocate initial namespace"); 2867 2868 init_task.nsproxy->mnt_ns = ns; 2869 get_mnt_ns(ns); 2870 2871 root.mnt = mnt; 2872 root.dentry = mnt->mnt_root; 2873 2874 set_fs_pwd(current->fs, &root); 2875 set_fs_root(current->fs, &root); 2876 } 2877 2878 void __init mnt_init(void) 2879 { 2880 unsigned u; 2881 int err; 2882 2883 mnt_cache = kmem_cache_create("mnt_cache", 
sizeof(struct mount), 2884 0, SLAB_HWCACHE_ALIGN | SLAB_PANIC, NULL); 2885 2886 mount_hashtable = alloc_large_system_hash("Mount-cache", 2887 sizeof(struct hlist_head), 2888 mhash_entries, 19, 2889 0, 2890 &m_hash_shift, &m_hash_mask, 0, 0); 2891 mountpoint_hashtable = alloc_large_system_hash("Mountpoint-cache", 2892 sizeof(struct hlist_head), 2893 mphash_entries, 19, 2894 0, 2895 &mp_hash_shift, &mp_hash_mask, 0, 0); 2896 2897 if (!mount_hashtable || !mountpoint_hashtable) 2898 panic("Failed to allocate mount hash table\n"); 2899 2900 for (u = 0; u <= m_hash_mask; u++) 2901 INIT_HLIST_HEAD(&mount_hashtable[u]); 2902 for (u = 0; u <= mp_hash_mask; u++) 2903 INIT_HLIST_HEAD(&mountpoint_hashtable[u]); 2904 2905 kernfs_init(); 2906 2907 err = sysfs_init(); 2908 if (err) 2909 printk(KERN_WARNING "%s: sysfs_init error: %d\n", 2910 __func__, err); 2911 fs_kobj = kobject_create_and_add("fs", NULL); 2912 if (!fs_kobj) 2913 printk(KERN_WARNING "%s: kobj create error\n", __func__); 2914 init_rootfs(); 2915 init_mount_tree(); 2916 } 2917 2918 void put_mnt_ns(struct mnt_namespace *ns) 2919 { 2920 if (!atomic_dec_and_test(&ns->count)) 2921 return; 2922 drop_collected_mounts(&ns->root->mnt); 2923 free_mnt_ns(ns); 2924 } 2925 2926 struct vfsmount *kern_mount_data(struct file_system_type *type, void *data) 2927 { 2928 struct vfsmount *mnt; 2929 mnt = vfs_kern_mount(type, MS_KERNMOUNT, type->name, data); 2930 if (!IS_ERR(mnt)) { 2931 /* 2932 * it is a longterm mount, don't release mnt until 2933 * we unmount before file sys is unregistered 2934 */ 2935 real_mount(mnt)->mnt_ns = MNT_NS_INTERNAL; 2936 } 2937 return mnt; 2938 } 2939 EXPORT_SYMBOL_GPL(kern_mount_data); 2940 2941 void kern_unmount(struct vfsmount *mnt) 2942 { 2943 /* release long term mount so mount point can be released */ 2944 if (!IS_ERR_OR_NULL(mnt)) { 2945 real_mount(mnt)->mnt_ns = NULL; 2946 synchronize_rcu(); /* yecchhh... */ 2947 mntput(mnt); 2948 } 2949 } 2950 EXPORT_SYMBOL(kern_unmount); 2951 2952 bool our_mnt(struct vfsmount *mnt) 2953 { 2954 return check_mnt(real_mount(mnt)); 2955 } 2956 2957 bool current_chrooted(void) 2958 { 2959 /* Does the current process have a non-standard root */ 2960 struct path ns_root; 2961 struct path fs_root; 2962 bool chrooted; 2963 2964 /* Find the namespace root */ 2965 ns_root.mnt = &current->nsproxy->mnt_ns->root->mnt; 2966 ns_root.dentry = ns_root.mnt->mnt_root; 2967 path_get(&ns_root); 2968 while (d_mountpoint(ns_root.dentry) && follow_down_one(&ns_root)) 2969 ; 2970 2971 get_fs_root(current->fs, &fs_root); 2972 2973 chrooted = !path_equal(&fs_root, &ns_root); 2974 2975 path_put(&fs_root); 2976 path_put(&ns_root); 2977 2978 return chrooted; 2979 } 2980 2981 bool fs_fully_visible(struct file_system_type *type) 2982 { 2983 struct mnt_namespace *ns = current->nsproxy->mnt_ns; 2984 struct mount *mnt; 2985 bool visible = false; 2986 2987 if (unlikely(!ns)) 2988 return false; 2989 2990 down_read(&namespace_sem); 2991 list_for_each_entry(mnt, &ns->list, mnt_list) { 2992 struct mount *child; 2993 if (mnt->mnt.mnt_sb->s_type != type) 2994 continue; 2995 2996 /* This mount is not fully visible if there are any child mounts 2997 * that cover anything except for empty directories.
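current_chrooted() above compares the task's root with the root of its mount namespace after following any overmounts. From userspace the closest cheap approximation is comparing our root with init's root; this is only an approximation of the in-kernel walk, and it assumes /proc is mounted and that we are allowed to look at pid 1.

#include <stdio.h>
#include <stdbool.h>
#include <sys/stat.h>

int main(void)
{
	struct stat ours, inits;
	bool chrooted;

	if (stat("/", &ours) == -1 || stat("/proc/1/root", &inits) == -1) {
		perror("stat");
		return 1;
	}
	/* Same device and inode means our "/" is the system root. */
	chrooted = ours.st_dev != inits.st_dev || ours.st_ino != inits.st_ino;
	printf("%schrooted\n", chrooted ? "" : "not ");
	return 0;
}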
2998 */ 2999 list_for_each_entry(child, &mnt->mnt_mounts, mnt_child) { 3000 struct inode *inode = child->mnt_mountpoint->d_inode; 3001 if (!S_ISDIR(inode->i_mode)) 3002 goto next; 3003 if (inode->i_nlink > 2) 3004 goto next; 3005 } 3006 visible = true; 3007 goto found; 3008 next: ; 3009 } 3010 found: 3011 up_read(&namespace_sem); 3012 return visible; 3013 } 3014 3015 static void *mntns_get(struct task_struct *task) 3016 { 3017 struct mnt_namespace *ns = NULL; 3018 struct nsproxy *nsproxy; 3019 3020 task_lock(task); 3021 nsproxy = task->nsproxy; 3022 if (nsproxy) { 3023 ns = nsproxy->mnt_ns; 3024 get_mnt_ns(ns); 3025 } 3026 task_unlock(task); 3027 3028 return ns; 3029 } 3030 3031 static void mntns_put(void *ns) 3032 { 3033 put_mnt_ns(ns); 3034 } 3035 3036 static int mntns_install(struct nsproxy *nsproxy, void *ns) 3037 { 3038 struct fs_struct *fs = current->fs; 3039 struct mnt_namespace *mnt_ns = ns; 3040 struct path root; 3041 3042 if (!ns_capable(mnt_ns->user_ns, CAP_SYS_ADMIN) || 3043 !ns_capable(current_user_ns(), CAP_SYS_CHROOT) || 3044 !ns_capable(current_user_ns(), CAP_SYS_ADMIN)) 3045 return -EPERM; 3046 3047 if (fs->users != 1) 3048 return -EINVAL; 3049 3050 get_mnt_ns(mnt_ns); 3051 put_mnt_ns(nsproxy->mnt_ns); 3052 nsproxy->mnt_ns = mnt_ns; 3053 3054 /* Find the root */ 3055 root.mnt = &mnt_ns->root->mnt; 3056 root.dentry = mnt_ns->root->mnt.mnt_root; 3057 path_get(&root); 3058 while(d_mountpoint(root.dentry) && follow_down_one(&root)) 3059 ; 3060 3061 /* Update the pwd and root */ 3062 set_fs_pwd(fs, &root); 3063 set_fs_root(fs, &root); 3064 3065 path_put(&root); 3066 return 0; 3067 } 3068 3069 static unsigned int mntns_inum(void *ns) 3070 { 3071 struct mnt_namespace *mnt_ns = ns; 3072 return mnt_ns->proc_inum; 3073 } 3074 3075 const struct proc_ns_operations mntns_operations = { 3076 .name = "mnt", 3077 .type = CLONE_NEWNS, 3078 .get = mntns_get, 3079 .put = mntns_put, 3080 .install = mntns_install, 3081 .inum = mntns_inum, 3082 }; 3083