/*
 *  linux/fs/namespace.c
 *
 * (C) Copyright Al Viro 2000, 2001
 *	Released under GPL v2.
 *
 * Based on code from fs/super.c, copyright Linus Torvalds and others.
 * Heavily rewritten.
 */

#include <linux/syscalls.h>
#include <linux/slab.h>
#include <linux/sched.h>
#include <linux/spinlock.h>
#include <linux/percpu.h>
#include <linux/init.h>
#include <linux/kernel.h>
#include <linux/acct.h>
#include <linux/capability.h>
#include <linux/cpumask.h>
#include <linux/module.h>
#include <linux/sysfs.h>
#include <linux/seq_file.h>
#include <linux/mnt_namespace.h>
#include <linux/namei.h>
#include <linux/nsproxy.h>
#include <linux/security.h>
#include <linux/mount.h>
#include <linux/ramfs.h>
#include <linux/log2.h>
#include <linux/idr.h>
#include <linux/fs_struct.h>
#include <linux/fsnotify.h>
#include <asm/uaccess.h>
#include <asm/unistd.h>
#include "pnode.h"
#include "internal.h"

#define HASH_SHIFT ilog2(PAGE_SIZE / sizeof(struct list_head))
#define HASH_SIZE (1UL << HASH_SHIFT)

static int event;
static DEFINE_IDA(mnt_id_ida);
static DEFINE_IDA(mnt_group_ida);
static DEFINE_SPINLOCK(mnt_id_lock);
static int mnt_id_start = 0;
static int mnt_group_start = 1;

static struct list_head *mount_hashtable __read_mostly;
static struct kmem_cache *mnt_cache __read_mostly;
static struct rw_semaphore namespace_sem;

/* /sys/fs */
struct kobject *fs_kobj;
EXPORT_SYMBOL_GPL(fs_kobj);

/*
 * vfsmount lock may be taken for read to prevent changes to the
 * vfsmount hash, ie. during mountpoint lookups or walking back
 * up the tree.
 *
 * It should be taken for write in all cases where the vfsmount
 * tree or hash is modified or when a vfsmount structure is modified.
 */
DEFINE_BRLOCK(vfsmount_lock);

static inline unsigned long hash(struct vfsmount *mnt, struct dentry *dentry)
{
	unsigned long tmp = ((unsigned long)mnt / L1_CACHE_BYTES);
	tmp += ((unsigned long)dentry / L1_CACHE_BYTES);
	tmp = tmp + (tmp >> HASH_SHIFT);
	return tmp & (HASH_SIZE - 1);
}

#define MNT_WRITER_UNDERFLOW_LIMIT -(1<<16)
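/*
 * Illustration of the vfsmount_lock discipline described above (drawn from
 * the helpers later in this file, not a new rule): __lookup_mnt() and
 * lookup_mnt() walk the mount hash under br_read_lock(vfsmount_lock),
 * while attach_mnt(), detach_mnt() and commit_tree() note that the lock
 * "must be held for write" because they modify the hash and the tree.
 */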
/*
 * allocation is serialized by namespace_sem, but we need the spinlock to
 * serialize with freeing.
 */
static int mnt_alloc_id(struct vfsmount *mnt)
{
	int res;

retry:
	ida_pre_get(&mnt_id_ida, GFP_KERNEL);
	spin_lock(&mnt_id_lock);
	res = ida_get_new_above(&mnt_id_ida, mnt_id_start, &mnt->mnt_id);
	if (!res)
		mnt_id_start = mnt->mnt_id + 1;
	spin_unlock(&mnt_id_lock);
	if (res == -EAGAIN)
		goto retry;

	return res;
}

static void mnt_free_id(struct vfsmount *mnt)
{
	int id = mnt->mnt_id;
	spin_lock(&mnt_id_lock);
	ida_remove(&mnt_id_ida, id);
	if (mnt_id_start > id)
		mnt_id_start = id;
	spin_unlock(&mnt_id_lock);
}

/*
 * Allocate a new peer group ID
 *
 * mnt_group_ida is protected by namespace_sem
 */
static int mnt_alloc_group_id(struct vfsmount *mnt)
{
	int res;

	if (!ida_pre_get(&mnt_group_ida, GFP_KERNEL))
		return -ENOMEM;

	res = ida_get_new_above(&mnt_group_ida,
				mnt_group_start,
				&mnt->mnt_group_id);
	if (!res)
		mnt_group_start = mnt->mnt_group_id + 1;

	return res;
}

/*
 * Release a peer group ID
 */
void mnt_release_group_id(struct vfsmount *mnt)
{
	int id = mnt->mnt_group_id;
	ida_remove(&mnt_group_ida, id);
	if (mnt_group_start > id)
		mnt_group_start = id;
	mnt->mnt_group_id = 0;
}

/*
 * vfsmount lock must be held for read
 */
static inline void mnt_add_count(struct vfsmount *mnt, int n)
{
#ifdef CONFIG_SMP
	this_cpu_add(mnt->mnt_pcp->mnt_count, n);
#else
	preempt_disable();
	mnt->mnt_count += n;
	preempt_enable();
#endif
}

static inline void mnt_set_count(struct vfsmount *mnt, int n)
{
#ifdef CONFIG_SMP
	this_cpu_write(mnt->mnt_pcp->mnt_count, n);
#else
	mnt->mnt_count = n;
#endif
}

/*
 * vfsmount lock must be held for read
 */
static inline void mnt_inc_count(struct vfsmount *mnt)
{
	mnt_add_count(mnt, 1);
}

/*
 * vfsmount lock must be held for read
 */
static inline void mnt_dec_count(struct vfsmount *mnt)
{
	mnt_add_count(mnt, -1);
}

/*
 * vfsmount lock must be held for write
 */
unsigned int mnt_get_count(struct vfsmount *mnt)
{
#ifdef CONFIG_SMP
	unsigned int count = atomic_read(&mnt->mnt_longrefs);
	int cpu;

	for_each_possible_cpu(cpu) {
		count += per_cpu_ptr(mnt->mnt_pcp, cpu)->mnt_count;
	}

	return count;
#else
	return mnt->mnt_count;
#endif
}

struct vfsmount *alloc_vfsmnt(const char *name)
{
	struct vfsmount *mnt = kmem_cache_zalloc(mnt_cache, GFP_KERNEL);
	if (mnt) {
		int err;

		err = mnt_alloc_id(mnt);
		if (err)
			goto out_free_cache;

		if (name) {
			mnt->mnt_devname = kstrdup(name, GFP_KERNEL);
			if (!mnt->mnt_devname)
				goto out_free_id;
		}

#ifdef CONFIG_SMP
		mnt->mnt_pcp = alloc_percpu(struct mnt_pcp);
		if (!mnt->mnt_pcp)
			goto out_free_devname;

		atomic_set(&mnt->mnt_longrefs, 1);
#else
		mnt->mnt_count = 1;
		mnt->mnt_writers = 0;
#endif

		INIT_LIST_HEAD(&mnt->mnt_hash);
		INIT_LIST_HEAD(&mnt->mnt_child);
		INIT_LIST_HEAD(&mnt->mnt_mounts);
		INIT_LIST_HEAD(&mnt->mnt_list);
		INIT_LIST_HEAD(&mnt->mnt_expire);
		INIT_LIST_HEAD(&mnt->mnt_share);
		INIT_LIST_HEAD(&mnt->mnt_slave_list);
		INIT_LIST_HEAD(&mnt->mnt_slave);
#ifdef CONFIG_FSNOTIFY
		INIT_HLIST_HEAD(&mnt->mnt_fsnotify_marks);
#endif
	}
	return mnt;

#ifdef CONFIG_SMP
out_free_devname:
	kfree(mnt->mnt_devname);
#endif
out_free_id:
	mnt_free_id(mnt);
out_free_cache:
	kmem_cache_free(mnt_cache, mnt);
	return NULL;
}

/*
 * Most r/o checks on a fs are for operations that take
 * discrete amounts of time, like a write() or unlink().
 * We must keep track of when those operations start
 * (for permission checks) and when they end, so that
 * we can determine when writes are able to occur to
 * a filesystem.
 */
/*
 * __mnt_is_readonly: check whether a mount is read-only
 * @mnt: the mount to check for its write status
 *
 * This shouldn't be used directly outside of the VFS.
 * It does not guarantee that the filesystem will stay
 * r/w, just that it is right *now*. This can not and
 * should not be used in place of IS_RDONLY(inode).
 * mnt_want/drop_write() will _keep_ the filesystem
 * r/w.
 */
int __mnt_is_readonly(struct vfsmount *mnt)
{
	if (mnt->mnt_flags & MNT_READONLY)
		return 1;
	if (mnt->mnt_sb->s_flags & MS_RDONLY)
		return 1;
	return 0;
}
EXPORT_SYMBOL_GPL(__mnt_is_readonly);

static inline void mnt_inc_writers(struct vfsmount *mnt)
{
#ifdef CONFIG_SMP
	this_cpu_inc(mnt->mnt_pcp->mnt_writers);
#else
	mnt->mnt_writers++;
#endif
}

static inline void mnt_dec_writers(struct vfsmount *mnt)
{
#ifdef CONFIG_SMP
	this_cpu_dec(mnt->mnt_pcp->mnt_writers);
#else
	mnt->mnt_writers--;
#endif
}

static unsigned int mnt_get_writers(struct vfsmount *mnt)
{
#ifdef CONFIG_SMP
	unsigned int count = 0;
	int cpu;

	for_each_possible_cpu(cpu) {
		count += per_cpu_ptr(mnt->mnt_pcp, cpu)->mnt_writers;
	}

	return count;
#else
	return mnt->mnt_writers;
#endif
}

/*
 * Most r/o checks on a fs are for operations that take
 * discrete amounts of time, like a write() or unlink().
 * We must keep track of when those operations start
 * (for permission checks) and when they end, so that
 * we can determine when writes are able to occur to
 * a filesystem.
 */
/**
 * mnt_want_write - get write access to a mount
 * @mnt: the mount on which to take a write
 *
 * This tells the low-level filesystem that a write is
 * about to be performed to it, and makes sure that
 * writes are allowed before returning success. When
 * the write operation is finished, mnt_drop_write()
 * must be called. This is effectively a refcount.
 */
int mnt_want_write(struct vfsmount *mnt)
{
	int ret = 0;

	preempt_disable();
	mnt_inc_writers(mnt);
	/*
	 * The store to mnt_inc_writers must be visible before we pass
	 * MNT_WRITE_HOLD loop below, so that the slowpath can see our
	 * incremented count after it has set MNT_WRITE_HOLD.
	 */
	smp_mb();
	while (mnt->mnt_flags & MNT_WRITE_HOLD)
		cpu_relax();
	/*
	 * After the slowpath clears MNT_WRITE_HOLD, mnt_is_readonly will
	 * be set to match its requirements. So we must not load that until
	 * MNT_WRITE_HOLD is cleared.
	 */
	smp_rmb();
	if (__mnt_is_readonly(mnt)) {
		mnt_dec_writers(mnt);
		ret = -EROFS;
		goto out;
	}
out:
	preempt_enable();
	return ret;
}
EXPORT_SYMBOL_GPL(mnt_want_write);

/**
 * mnt_clone_write - get write access to a mount
 * @mnt: the mount on which to take a write
 *
 * This is effectively like mnt_want_write, except
 * it must only be used to take an extra write reference
 * on a mountpoint that we already know has a write reference
 * on it. This allows some optimisation.
 *
 * After finished, mnt_drop_write must be called as usual to
 * drop the reference.
 */
int mnt_clone_write(struct vfsmount *mnt)
{
	/* superblock may be r/o */
	if (__mnt_is_readonly(mnt))
		return -EROFS;
	preempt_disable();
	mnt_inc_writers(mnt);
	preempt_enable();
	return 0;
}
EXPORT_SYMBOL_GPL(mnt_clone_write);

/**
 * mnt_want_write_file - get write access to a file's mount
 * @file: the file whose mount to take a write on
 *
 * This is like mnt_want_write, but it takes a file and can
 * do some optimisations if the file is open for write already
 */
int mnt_want_write_file(struct file *file)
{
	struct inode *inode = file->f_dentry->d_inode;
	if (!(file->f_mode & FMODE_WRITE) || special_file(inode->i_mode))
		return mnt_want_write(file->f_path.mnt);
	else
		return mnt_clone_write(file->f_path.mnt);
}
EXPORT_SYMBOL_GPL(mnt_want_write_file);

/**
 * mnt_drop_write - give up write access to a mount
 * @mnt: the mount on which to give up write access
 *
 * Tells the low-level filesystem that we are done
 * performing writes to it.  Must be matched with
 * mnt_want_write() call above.
 */
void mnt_drop_write(struct vfsmount *mnt)
{
	preempt_disable();
	mnt_dec_writers(mnt);
	preempt_enable();
}
EXPORT_SYMBOL_GPL(mnt_drop_write);
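/*
 * Sketch of the intended calling pattern (illustrative only, not a caller
 * in this file): a write path brackets the operation with
 * mnt_want_write()/mnt_drop_write(), e.g.
 *
 *	err = mnt_want_write(path->mnt);
 *	if (err)
 *		return err;
 *	err = some_write_operation(path);	(hypothetical helper)
 *	mnt_drop_write(path->mnt);
 *	return err;
 */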
static int mnt_make_readonly(struct vfsmount *mnt)
{
	int ret = 0;

	br_write_lock(vfsmount_lock);
	mnt->mnt_flags |= MNT_WRITE_HOLD;
	/*
	 * After storing MNT_WRITE_HOLD, we'll read the counters. This store
	 * should be visible before we do.
	 */
	smp_mb();

	/*
	 * With writers on hold, if this value is zero, then there are
	 * definitely no active writers (although held writers may subsequently
	 * increment the count, they'll have to wait, and decrement it after
	 * seeing MNT_READONLY).
	 *
	 * It is OK to have counter incremented on one CPU and decremented on
	 * another: the sum will add up correctly. The danger would be when we
	 * sum up each counter, if we read a counter before it is incremented,
	 * but then read another CPU's count which it has been subsequently
	 * decremented from -- we would see more decrements than we should.
	 * MNT_WRITE_HOLD protects against this scenario, because
	 * mnt_want_write first increments count, then smp_mb, then spins on
	 * MNT_WRITE_HOLD, so it can't be decremented by another CPU while
	 * we're counting up here.
	 */
	if (mnt_get_writers(mnt) > 0)
		ret = -EBUSY;
	else
		mnt->mnt_flags |= MNT_READONLY;
	/*
	 * MNT_READONLY must become visible before ~MNT_WRITE_HOLD, so writers
	 * that become unheld will see MNT_READONLY.
	 */
	smp_wmb();
	mnt->mnt_flags &= ~MNT_WRITE_HOLD;
	br_write_unlock(vfsmount_lock);
	return ret;
}

static void __mnt_unmake_readonly(struct vfsmount *mnt)
{
	br_write_lock(vfsmount_lock);
	mnt->mnt_flags &= ~MNT_READONLY;
	br_write_unlock(vfsmount_lock);
}

void simple_set_mnt(struct vfsmount *mnt, struct super_block *sb)
{
	mnt->mnt_sb = sb;
	mnt->mnt_root = dget(sb->s_root);
}

EXPORT_SYMBOL(simple_set_mnt);

void free_vfsmnt(struct vfsmount *mnt)
{
	kfree(mnt->mnt_devname);
	mnt_free_id(mnt);
#ifdef CONFIG_SMP
	free_percpu(mnt->mnt_pcp);
#endif
	kmem_cache_free(mnt_cache, mnt);
}

/*
 * find the first or last mount at @dentry on vfsmount @mnt depending on
 * @dir. If @dir is set return the first mount else return the last mount.
 * vfsmount_lock must be held for read or write.
 */
struct vfsmount *__lookup_mnt(struct vfsmount *mnt, struct dentry *dentry,
			      int dir)
{
	struct list_head *head = mount_hashtable + hash(mnt, dentry);
	struct list_head *tmp = head;
	struct vfsmount *p, *found = NULL;

	for (;;) {
		tmp = dir ? tmp->next : tmp->prev;
		p = NULL;
		if (tmp == head)
			break;
		p = list_entry(tmp, struct vfsmount, mnt_hash);
		if (p->mnt_parent == mnt && p->mnt_mountpoint == dentry) {
			found = p;
			break;
		}
	}
	return found;
}

/*
 * lookup_mnt increments the ref count before returning
 * the vfsmount struct.
 */
struct vfsmount *lookup_mnt(struct path *path)
{
	struct vfsmount *child_mnt;

	br_read_lock(vfsmount_lock);
	if ((child_mnt = __lookup_mnt(path->mnt, path->dentry, 1)))
		mntget(child_mnt);
	br_read_unlock(vfsmount_lock);
	return child_mnt;
}

static inline int check_mnt(struct vfsmount *mnt)
{
	return mnt->mnt_ns == current->nsproxy->mnt_ns;
}

/*
 * vfsmount lock must be held for write
 */
static void touch_mnt_namespace(struct mnt_namespace *ns)
{
	if (ns) {
		ns->event = ++event;
		wake_up_interruptible(&ns->poll);
	}
}

/*
 * vfsmount lock must be held for write
 */
static void __touch_mnt_namespace(struct mnt_namespace *ns)
{
	if (ns && ns->event != event) {
		ns->event = event;
		wake_up_interruptible(&ns->poll);
	}
}

/*
 * Clear dentry's mounted state if it has no remaining mounts.
 * vfsmount_lock must be held for write.
 */
static void dentry_reset_mounted(struct vfsmount *mnt, struct dentry *dentry)
{
	unsigned u;

	for (u = 0; u < HASH_SIZE; u++) {
		struct vfsmount *p;

		list_for_each_entry(p, &mount_hashtable[u], mnt_hash) {
			if (p->mnt_mountpoint == dentry)
				return;
		}
	}
	spin_lock(&dentry->d_lock);
	dentry->d_flags &= ~DCACHE_MOUNTED;
	spin_unlock(&dentry->d_lock);
}

/*
 * vfsmount lock must be held for write
 */
static void detach_mnt(struct vfsmount *mnt, struct path *old_path)
{
	old_path->dentry = mnt->mnt_mountpoint;
	old_path->mnt = mnt->mnt_parent;
	mnt->mnt_parent = mnt;
	mnt->mnt_mountpoint = mnt->mnt_root;
	list_del_init(&mnt->mnt_child);
	list_del_init(&mnt->mnt_hash);
	dentry_reset_mounted(old_path->mnt, old_path->dentry);
}

/*
 * vfsmount lock must be held for write
 */
void mnt_set_mountpoint(struct vfsmount *mnt, struct dentry *dentry,
			struct vfsmount *child_mnt)
{
	child_mnt->mnt_parent = mntget(mnt);
	child_mnt->mnt_mountpoint = dget(dentry);
	spin_lock(&dentry->d_lock);
	dentry->d_flags |= DCACHE_MOUNTED;
	spin_unlock(&dentry->d_lock);
}

/*
 * vfsmount lock must be held for write
 */
static void attach_mnt(struct vfsmount *mnt, struct path *path)
{
	mnt_set_mountpoint(path->mnt, path->dentry, mnt);
	list_add_tail(&mnt->mnt_hash, mount_hashtable +
			hash(path->mnt, path->dentry));
	list_add_tail(&mnt->mnt_child, &path->mnt->mnt_mounts);
}

/*
 * vfsmount lock must be held for write
 */
static void commit_tree(struct vfsmount *mnt)
{
	struct vfsmount *parent = mnt->mnt_parent;
	struct vfsmount *m;
	LIST_HEAD(head);
	struct mnt_namespace *n = parent->mnt_ns;

	BUG_ON(parent == mnt);

	list_add_tail(&head, &mnt->mnt_list);
	list_for_each_entry(m, &head, mnt_list)
		m->mnt_ns = n;
	list_splice(&head, n->list.prev);

	list_add_tail(&mnt->mnt_hash, mount_hashtable +
			hash(parent, mnt->mnt_mountpoint));
	list_add_tail(&mnt->mnt_child, &parent->mnt_mounts);
	touch_mnt_namespace(n);
}

static struct vfsmount *next_mnt(struct vfsmount *p, struct vfsmount *root)
{
	struct list_head *next = p->mnt_mounts.next;
	if (next == &p->mnt_mounts) {
		while (1) {
			if (p == root)
				return NULL;
			next = p->mnt_child.next;
			if (next != &p->mnt_parent->mnt_mounts)
				break;
			p = p->mnt_parent;
		}
	}
	return list_entry(next, struct vfsmount, mnt_child);
}

static struct vfsmount *skip_mnt_tree(struct vfsmount *p)
{
	struct list_head *prev = p->mnt_mounts.prev;
	while (prev != &p->mnt_mounts) {
		p = list_entry(prev, struct vfsmount, mnt_child);
		prev = p->mnt_mounts.prev;
	}
	return p;
}

static struct vfsmount *clone_mnt(struct vfsmount *old, struct dentry *root,
					int flag)
{
	struct super_block *sb = old->mnt_sb;
	struct vfsmount *mnt = alloc_vfsmnt(old->mnt_devname);

	if (mnt) {
		if (flag & (CL_SLAVE | CL_PRIVATE))
			mnt->mnt_group_id = 0; /* not a peer of original */
		else
			mnt->mnt_group_id = old->mnt_group_id;

		if ((flag & CL_MAKE_SHARED) && !mnt->mnt_group_id) {
			int err = mnt_alloc_group_id(mnt);
			if (err)
				goto out_free;
		}

		mnt->mnt_flags = old->mnt_flags & ~MNT_WRITE_HOLD;
		atomic_inc(&sb->s_active);
		mnt->mnt_sb = sb;
		mnt->mnt_root = dget(root);
		mnt->mnt_mountpoint = mnt->mnt_root;
		mnt->mnt_parent = mnt;

		if (flag & CL_SLAVE) {
			list_add(&mnt->mnt_slave, &old->mnt_slave_list);
			mnt->mnt_master = old;
			CLEAR_MNT_SHARED(mnt);
		} else if (!(flag & CL_PRIVATE)) {
			if ((flag & CL_MAKE_SHARED) || IS_MNT_SHARED(old))
				list_add(&mnt->mnt_share, &old->mnt_share);
			if (IS_MNT_SLAVE(old))
				list_add(&mnt->mnt_slave, &old->mnt_slave);
			mnt->mnt_master = old->mnt_master;
		}
		if (flag & CL_MAKE_SHARED)
			set_mnt_shared(mnt);

		/* stick the duplicate mount on the same expiry list
		 * as the original if that was on one */
		if (flag & CL_EXPIRE) {
			if (!list_empty(&old->mnt_expire))
				list_add(&mnt->mnt_expire, &old->mnt_expire);
		}
	}
	return mnt;

out_free:
	free_vfsmnt(mnt);
	return NULL;
}

static inline void mntfree(struct vfsmount *mnt)
{
	struct super_block *sb = mnt->mnt_sb;

	/*
	 * This probably indicates that somebody messed
	 * up a mnt_want/drop_write() pair. If this
	 * happens, the filesystem was probably unable
	 * to make r/w->r/o transitions.
	 */
	/*
	 * The locking used to deal with mnt_count decrement provides barriers,
	 * so mnt_get_writers() below is safe.
	 */
	WARN_ON(mnt_get_writers(mnt));
	fsnotify_vfsmount_delete(mnt);
	dput(mnt->mnt_root);
	free_vfsmnt(mnt);
	deactivate_super(sb);
}

#ifdef CONFIG_SMP
static inline void __mntput(struct vfsmount *mnt, int longrefs)
{
	if (!longrefs) {
put_again:
		br_read_lock(vfsmount_lock);
		if (likely(atomic_read(&mnt->mnt_longrefs))) {
			mnt_dec_count(mnt);
			br_read_unlock(vfsmount_lock);
			return;
		}
		br_read_unlock(vfsmount_lock);
	} else {
		BUG_ON(!atomic_read(&mnt->mnt_longrefs));
		if (atomic_add_unless(&mnt->mnt_longrefs, -1, 1))
			return;
	}

	br_write_lock(vfsmount_lock);
	if (!longrefs)
		mnt_dec_count(mnt);
	else
		atomic_dec(&mnt->mnt_longrefs);
	if (mnt_get_count(mnt)) {
		br_write_unlock(vfsmount_lock);
		return;
	}
	if (unlikely(mnt->mnt_pinned)) {
		mnt_add_count(mnt, mnt->mnt_pinned + 1);
		mnt->mnt_pinned = 0;
		br_write_unlock(vfsmount_lock);
		acct_auto_close_mnt(mnt);
		goto put_again;
	}
	br_write_unlock(vfsmount_lock);
	mntfree(mnt);
}
#else
static inline void __mntput(struct vfsmount *mnt, int longrefs)
{
put_again:
	mnt_dec_count(mnt);
	if (likely(mnt_get_count(mnt)))
		return;
	br_write_lock(vfsmount_lock);
	if (unlikely(mnt->mnt_pinned)) {
		mnt_add_count(mnt, mnt->mnt_pinned + 1);
		mnt->mnt_pinned = 0;
		br_write_unlock(vfsmount_lock);
		acct_auto_close_mnt(mnt);
		goto put_again;
	}
	br_write_unlock(vfsmount_lock);
	mntfree(mnt);
}
#endif

static void mntput_no_expire(struct vfsmount *mnt)
{
	__mntput(mnt, 0);
}

void mntput(struct vfsmount *mnt)
{
	if (mnt) {
		/* avoid cacheline pingpong, hope gcc doesn't get "smart" */
		if (unlikely(mnt->mnt_expiry_mark))
			mnt->mnt_expiry_mark = 0;
		__mntput(mnt, 0);
	}
}
EXPORT_SYMBOL(mntput);

struct vfsmount *mntget(struct vfsmount *mnt)
{
	if (mnt)
		mnt_inc_count(mnt);
	return mnt;
}
EXPORT_SYMBOL(mntget);

void mntput_long(struct vfsmount *mnt)
{
#ifdef CONFIG_SMP
	if (mnt) {
		/* avoid cacheline pingpong, hope gcc doesn't get "smart" */
		if (unlikely(mnt->mnt_expiry_mark))
			mnt->mnt_expiry_mark = 0;
		__mntput(mnt, 1);
	}
#else
	mntput(mnt);
#endif
}
EXPORT_SYMBOL(mntput_long);

struct vfsmount *mntget_long(struct vfsmount *mnt)
{
#ifdef CONFIG_SMP
	if (mnt)
		atomic_inc(&mnt->mnt_longrefs);
	return mnt;
#else
	return mntget(mnt);
#endif
}
EXPORT_SYMBOL(mntget_long);

void mnt_pin(struct vfsmount *mnt)
{
	br_write_lock(vfsmount_lock);
	mnt->mnt_pinned++;
	br_write_unlock(vfsmount_lock);
}
EXPORT_SYMBOL(mnt_pin);

void mnt_unpin(struct vfsmount *mnt)
{
	br_write_lock(vfsmount_lock);
	if (mnt->mnt_pinned) {
		mnt_inc_count(mnt);
		mnt->mnt_pinned--;
	}
	br_write_unlock(vfsmount_lock);
}
EXPORT_SYMBOL(mnt_unpin);

static inline void mangle(struct seq_file *m, const char *s)
{
	seq_escape(m, s, " \t\n\\");
}

/*
 * Simple .show_options callback for filesystems which don't want to
 * implement more complex mount option showing.
 *
 * See also save_mount_options().
 */
int generic_show_options(struct seq_file *m, struct vfsmount *mnt)
{
	const char *options;

	rcu_read_lock();
	options = rcu_dereference(mnt->mnt_sb->s_options);

	if (options != NULL && options[0]) {
		seq_putc(m, ',');
		mangle(m, options);
	}
	rcu_read_unlock();

	return 0;
}
EXPORT_SYMBOL(generic_show_options);

/*
 * If the filesystem uses generic_show_options(), this function should be
 * called from the fill_super() callback.
 *
 * The .remount_fs callback usually needs to be handled in a special
 * way, to make sure that previous options are not overwritten if the
 * remount fails.
 *
 * Also note that if the filesystem's .remount_fs function doesn't
 * reset all options to their default value, but changes only newly
 * given options, then the displayed options will not reflect reality
 * any more.
 */
void save_mount_options(struct super_block *sb, char *options)
{
	BUG_ON(sb->s_options);
	rcu_assign_pointer(sb->s_options, kstrdup(options, GFP_KERNEL));
}
EXPORT_SYMBOL(save_mount_options);

void replace_mount_options(struct super_block *sb, char *options)
{
	char *old = sb->s_options;
	rcu_assign_pointer(sb->s_options, options);
	if (old) {
		synchronize_rcu();
		kfree(old);
	}
}
EXPORT_SYMBOL(replace_mount_options);

#ifdef CONFIG_PROC_FS
/* iterator */
static void *m_start(struct seq_file *m, loff_t *pos)
{
	struct proc_mounts *p = m->private;

	down_read(&namespace_sem);
	return seq_list_start(&p->ns->list, *pos);
}

static void *m_next(struct seq_file *m, void *v, loff_t *pos)
{
	struct proc_mounts *p = m->private;

	return seq_list_next(v, &p->ns->list, pos);
}

static void m_stop(struct seq_file *m, void *v)
{
	up_read(&namespace_sem);
}

int mnt_had_events(struct proc_mounts *p)
{
	struct mnt_namespace *ns = p->ns;
	int res = 0;

	br_read_lock(vfsmount_lock);
	if (p->event != ns->event) {
		p->event = ns->event;
		res = 1;
	}
	br_read_unlock(vfsmount_lock);

	return res;
}

struct proc_fs_info {
	int flag;
	const char *str;
};

static int show_sb_opts(struct seq_file *m, struct super_block *sb)
{
	static const struct proc_fs_info fs_info[] = {
		{ MS_SYNCHRONOUS, ",sync" },
		{ MS_DIRSYNC, ",dirsync" },
		{ MS_MANDLOCK, ",mand" },
		{ 0, NULL }
	};
	const struct proc_fs_info *fs_infop;

	for (fs_infop = fs_info; fs_infop->flag; fs_infop++) {
		if (sb->s_flags & fs_infop->flag)
			seq_puts(m, fs_infop->str);
	}

	return security_sb_show_options(m, sb);
}

static void show_mnt_opts(struct seq_file *m, struct vfsmount *mnt)
{
	static const struct proc_fs_info mnt_info[] = {
		{ MNT_NOSUID, ",nosuid" },
		{ MNT_NODEV, ",nodev" },
		{ MNT_NOEXEC, ",noexec" },
		{ MNT_NOATIME, ",noatime" },
		{ MNT_NODIRATIME, ",nodiratime" },
		{ MNT_RELATIME, ",relatime" },
		{ 0, NULL }
	};
	const struct proc_fs_info *fs_infop;

	for (fs_infop = mnt_info; fs_infop->flag; fs_infop++) {
		if (mnt->mnt_flags & fs_infop->flag)
			seq_puts(m, fs_infop->str);
	}
}

static void show_type(struct seq_file *m, struct super_block *sb)
{
	mangle(m, sb->s_type->name);
	if (sb->s_subtype && sb->s_subtype[0]) {
		seq_putc(m, '.');
		mangle(m, sb->s_subtype);
	}
}

static int show_vfsmnt(struct seq_file *m, void *v)
{
	struct vfsmount *mnt = list_entry(v, struct vfsmount, mnt_list);
	int err = 0;
	struct path mnt_path = { .dentry = mnt->mnt_root, .mnt = mnt };

	mangle(m, mnt->mnt_devname ? mnt->mnt_devname : "none");
	seq_putc(m, ' ');
	seq_path(m, &mnt_path, " \t\n\\");
	seq_putc(m, ' ');
	show_type(m, mnt->mnt_sb);
" ro" : " rw"); 1023 err = show_sb_opts(m, mnt->mnt_sb); 1024 if (err) 1025 goto out; 1026 show_mnt_opts(m, mnt); 1027 if (mnt->mnt_sb->s_op->show_options) 1028 err = mnt->mnt_sb->s_op->show_options(m, mnt); 1029 seq_puts(m, " 0 0\n"); 1030 out: 1031 return err; 1032 } 1033 1034 const struct seq_operations mounts_op = { 1035 .start = m_start, 1036 .next = m_next, 1037 .stop = m_stop, 1038 .show = show_vfsmnt 1039 }; 1040 1041 static int show_mountinfo(struct seq_file *m, void *v) 1042 { 1043 struct proc_mounts *p = m->private; 1044 struct vfsmount *mnt = list_entry(v, struct vfsmount, mnt_list); 1045 struct super_block *sb = mnt->mnt_sb; 1046 struct path mnt_path = { .dentry = mnt->mnt_root, .mnt = mnt }; 1047 struct path root = p->root; 1048 int err = 0; 1049 1050 seq_printf(m, "%i %i %u:%u ", mnt->mnt_id, mnt->mnt_parent->mnt_id, 1051 MAJOR(sb->s_dev), MINOR(sb->s_dev)); 1052 seq_dentry(m, mnt->mnt_root, " \t\n\\"); 1053 seq_putc(m, ' '); 1054 seq_path_root(m, &mnt_path, &root, " \t\n\\"); 1055 if (root.mnt != p->root.mnt || root.dentry != p->root.dentry) { 1056 /* 1057 * Mountpoint is outside root, discard that one. Ugly, 1058 * but less so than trying to do that in iterator in a 1059 * race-free way (due to renames). 1060 */ 1061 return SEQ_SKIP; 1062 } 1063 seq_puts(m, mnt->mnt_flags & MNT_READONLY ? " ro" : " rw"); 1064 show_mnt_opts(m, mnt); 1065 1066 /* Tagged fields ("foo:X" or "bar") */ 1067 if (IS_MNT_SHARED(mnt)) 1068 seq_printf(m, " shared:%i", mnt->mnt_group_id); 1069 if (IS_MNT_SLAVE(mnt)) { 1070 int master = mnt->mnt_master->mnt_group_id; 1071 int dom = get_dominating_id(mnt, &p->root); 1072 seq_printf(m, " master:%i", master); 1073 if (dom && dom != master) 1074 seq_printf(m, " propagate_from:%i", dom); 1075 } 1076 if (IS_MNT_UNBINDABLE(mnt)) 1077 seq_puts(m, " unbindable"); 1078 1079 /* Filesystem specific data */ 1080 seq_puts(m, " - "); 1081 show_type(m, sb); 1082 seq_putc(m, ' '); 1083 mangle(m, mnt->mnt_devname ? mnt->mnt_devname : "none"); 1084 seq_puts(m, sb->s_flags & MS_RDONLY ? 
" ro" : " rw"); 1085 err = show_sb_opts(m, sb); 1086 if (err) 1087 goto out; 1088 if (sb->s_op->show_options) 1089 err = sb->s_op->show_options(m, mnt); 1090 seq_putc(m, '\n'); 1091 out: 1092 return err; 1093 } 1094 1095 const struct seq_operations mountinfo_op = { 1096 .start = m_start, 1097 .next = m_next, 1098 .stop = m_stop, 1099 .show = show_mountinfo, 1100 }; 1101 1102 static int show_vfsstat(struct seq_file *m, void *v) 1103 { 1104 struct vfsmount *mnt = list_entry(v, struct vfsmount, mnt_list); 1105 struct path mnt_path = { .dentry = mnt->mnt_root, .mnt = mnt }; 1106 int err = 0; 1107 1108 /* device */ 1109 if (mnt->mnt_devname) { 1110 seq_puts(m, "device "); 1111 mangle(m, mnt->mnt_devname); 1112 } else 1113 seq_puts(m, "no device"); 1114 1115 /* mount point */ 1116 seq_puts(m, " mounted on "); 1117 seq_path(m, &mnt_path, " \t\n\\"); 1118 seq_putc(m, ' '); 1119 1120 /* file system type */ 1121 seq_puts(m, "with fstype "); 1122 show_type(m, mnt->mnt_sb); 1123 1124 /* optional statistics */ 1125 if (mnt->mnt_sb->s_op->show_stats) { 1126 seq_putc(m, ' '); 1127 err = mnt->mnt_sb->s_op->show_stats(m, mnt); 1128 } 1129 1130 seq_putc(m, '\n'); 1131 return err; 1132 } 1133 1134 const struct seq_operations mountstats_op = { 1135 .start = m_start, 1136 .next = m_next, 1137 .stop = m_stop, 1138 .show = show_vfsstat, 1139 }; 1140 #endif /* CONFIG_PROC_FS */ 1141 1142 /** 1143 * may_umount_tree - check if a mount tree is busy 1144 * @mnt: root of mount tree 1145 * 1146 * This is called to check if a tree of mounts has any 1147 * open files, pwds, chroots or sub mounts that are 1148 * busy. 1149 */ 1150 int may_umount_tree(struct vfsmount *mnt) 1151 { 1152 int actual_refs = 0; 1153 int minimum_refs = 0; 1154 struct vfsmount *p; 1155 1156 /* write lock needed for mnt_get_count */ 1157 br_write_lock(vfsmount_lock); 1158 for (p = mnt; p; p = next_mnt(p, mnt)) { 1159 actual_refs += mnt_get_count(p); 1160 minimum_refs += 2; 1161 } 1162 br_write_unlock(vfsmount_lock); 1163 1164 if (actual_refs > minimum_refs) 1165 return 0; 1166 1167 return 1; 1168 } 1169 1170 EXPORT_SYMBOL(may_umount_tree); 1171 1172 /** 1173 * may_umount - check if a mount point is busy 1174 * @mnt: root of mount 1175 * 1176 * This is called to check if a mount point has any 1177 * open files, pwds, chroots or sub mounts. If the 1178 * mount has sub mounts this will return busy 1179 * regardless of whether the sub mounts are busy. 1180 * 1181 * Doesn't take quota and stuff into account. IOW, in some cases it will 1182 * give false negatives. The main reason why it's here is that we need 1183 * a non-destructive way to look for easily umountable filesystems. 
int may_umount(struct vfsmount *mnt)
{
	int ret = 1;
	down_read(&namespace_sem);
	br_write_lock(vfsmount_lock);
	if (propagate_mount_busy(mnt, 2))
		ret = 0;
	br_write_unlock(vfsmount_lock);
	up_read(&namespace_sem);
	return ret;
}

EXPORT_SYMBOL(may_umount);

void release_mounts(struct list_head *head)
{
	struct vfsmount *mnt;
	while (!list_empty(head)) {
		mnt = list_first_entry(head, struct vfsmount, mnt_hash);
		list_del_init(&mnt->mnt_hash);
		if (mnt->mnt_parent != mnt) {
			struct dentry *dentry;
			struct vfsmount *m;

			br_write_lock(vfsmount_lock);
			dentry = mnt->mnt_mountpoint;
			m = mnt->mnt_parent;
			mnt->mnt_mountpoint = mnt->mnt_root;
			mnt->mnt_parent = mnt;
			m->mnt_ghosts--;
			br_write_unlock(vfsmount_lock);
			dput(dentry);
			mntput(m);
		}
		mntput_long(mnt);
	}
}

/*
 * vfsmount lock must be held for write
 * namespace_sem must be held for write
 */
void umount_tree(struct vfsmount *mnt, int propagate, struct list_head *kill)
{
	struct vfsmount *p;

	for (p = mnt; p; p = next_mnt(p, mnt))
		list_move(&p->mnt_hash, kill);

	if (propagate)
		propagate_umount(kill);

	list_for_each_entry(p, kill, mnt_hash) {
		list_del_init(&p->mnt_expire);
		list_del_init(&p->mnt_list);
		__touch_mnt_namespace(p->mnt_ns);
		p->mnt_ns = NULL;
		list_del_init(&p->mnt_child);
		if (p->mnt_parent != p) {
			p->mnt_parent->mnt_ghosts++;
			dentry_reset_mounted(p->mnt_parent, p->mnt_mountpoint);
		}
		change_mnt_propagation(p, MS_PRIVATE);
	}
}

static void shrink_submounts(struct vfsmount *mnt, struct list_head *umounts);

static int do_umount(struct vfsmount *mnt, int flags)
{
	struct super_block *sb = mnt->mnt_sb;
	int retval;
	LIST_HEAD(umount_list);

	retval = security_sb_umount(mnt, flags);
	if (retval)
		return retval;

	/*
	 * Allow userspace to request a mountpoint be expired rather than
	 * unmounting unconditionally. Unmount only happens if:
	 *  (1) the mark is already set (the mark is cleared by mntput())
	 *  (2) the usage count == 1 [parent vfsmount] + 1 [sys_umount]
	 */
	if (flags & MNT_EXPIRE) {
		if (mnt == current->fs->root.mnt ||
		    flags & (MNT_FORCE | MNT_DETACH))
			return -EINVAL;

		/*
		 * probably don't strictly need the lock here if we examined
		 * all race cases, but it's a slowpath.
		 */
		br_write_lock(vfsmount_lock);
		if (mnt_get_count(mnt) != 2) {
			br_write_unlock(vfsmount_lock);
			return -EBUSY;
		}
		br_write_unlock(vfsmount_lock);

		if (!xchg(&mnt->mnt_expiry_mark, 1))
			return -EAGAIN;
	}

	/*
	 * If we may have to abort operations to get out of this
	 * mount, and they will themselves hold resources we must
	 * allow the fs to do things. In the Unix tradition of
	 * 'Gee, that's tricky, let's do it in userspace' the umount_begin
	 * might fail to complete on the first run through as other tasks
	 * must return, and the like. That's for the mount program to worry
	 * about for the moment.
	 */

	if (flags & MNT_FORCE && sb->s_op->umount_begin) {
		sb->s_op->umount_begin(sb);
	}

	/*
	 * No sense to grab the lock for this test, but test itself looks
	 * somewhat bogus. Suggestions for better replacement?
	 * Ho-hum... In principle, we might treat that as umount + switch
	 * to rootfs. GC would eventually take care of the old vfsmount.
	 * Actually it makes sense, especially if rootfs would contain a
	 * /reboot - static binary that would close all descriptors and
	 * call reboot(2). Then init(8) could umount root and exec /reboot.
	 */
	if (mnt == current->fs->root.mnt && !(flags & MNT_DETACH)) {
		/*
		 * Special case for "unmounting" root ...
		 * we just try to remount it readonly.
		 */
		down_write(&sb->s_umount);
		if (!(sb->s_flags & MS_RDONLY))
			retval = do_remount_sb(sb, MS_RDONLY, NULL, 0);
		up_write(&sb->s_umount);
		return retval;
	}

	down_write(&namespace_sem);
	br_write_lock(vfsmount_lock);
	event++;

	if (!(flags & MNT_DETACH))
		shrink_submounts(mnt, &umount_list);

	retval = -EBUSY;
	if (flags & MNT_DETACH || !propagate_mount_busy(mnt, 2)) {
		if (!list_empty(&mnt->mnt_list))
			umount_tree(mnt, 1, &umount_list);
		retval = 0;
	}
	br_write_unlock(vfsmount_lock);
	up_write(&namespace_sem);
	release_mounts(&umount_list);
	return retval;
}

/*
 * Now umount can handle mount points as well as block devices.
 * This is important for filesystems which use unnamed block devices.
 *
 * We now support a flag for forced unmount like the other 'big iron'
 * unixes. Our API is identical to OSF/1 to avoid making a mess of AMD
 */

SYSCALL_DEFINE2(umount, char __user *, name, int, flags)
{
	struct path path;
	int retval;
	int lookup_flags = 0;

	if (flags & ~(MNT_FORCE | MNT_DETACH | MNT_EXPIRE | UMOUNT_NOFOLLOW))
		return -EINVAL;

	if (!(flags & UMOUNT_NOFOLLOW))
		lookup_flags |= LOOKUP_FOLLOW;

	retval = user_path_at(AT_FDCWD, name, lookup_flags, &path);
	if (retval)
		goto out;
	retval = -EINVAL;
	if (path.dentry != path.mnt->mnt_root)
		goto dput_and_out;
	if (!check_mnt(path.mnt))
		goto dput_and_out;

	retval = -EPERM;
	if (!capable(CAP_SYS_ADMIN))
		goto dput_and_out;

	retval = do_umount(path.mnt, flags);
dput_and_out:
	/* we mustn't call path_put() as that would clear mnt_expiry_mark */
	dput(path.dentry);
	mntput_no_expire(path.mnt);
out:
	return retval;
}
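/*
 * From userspace this syscall is reached via umount(2)/umount2(2); for
 * example umount2("/mnt/cdrom", MNT_DETACH) requests a lazy unmount,
 * while a plain umount("/mnt/cdrom") passes flags == 0 (illustrative only).
 */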
#ifdef __ARCH_WANT_SYS_OLDUMOUNT

/*
 *	The 2.0 compatible umount. No flags.
 */
SYSCALL_DEFINE1(oldumount, char __user *, name)
{
	return sys_umount(name, 0);
}

#endif

static int mount_is_safe(struct path *path)
{
	if (capable(CAP_SYS_ADMIN))
		return 0;
	return -EPERM;
#ifdef notyet
	if (S_ISLNK(path->dentry->d_inode->i_mode))
		return -EPERM;
	if (path->dentry->d_inode->i_mode & S_ISVTX) {
		if (current_uid() != path->dentry->d_inode->i_uid)
			return -EPERM;
	}
	if (inode_permission(path->dentry->d_inode, MAY_WRITE))
		return -EPERM;
	return 0;
#endif
}

struct vfsmount *copy_tree(struct vfsmount *mnt, struct dentry *dentry,
					int flag)
{
	struct vfsmount *res, *p, *q, *r, *s;
	struct path path;

	if (!(flag & CL_COPY_ALL) && IS_MNT_UNBINDABLE(mnt))
		return NULL;

	res = q = clone_mnt(mnt, dentry, flag);
	if (!q)
		goto Enomem;
	q->mnt_mountpoint = mnt->mnt_mountpoint;

	p = mnt;
	list_for_each_entry(r, &mnt->mnt_mounts, mnt_child) {
		if (!is_subdir(r->mnt_mountpoint, dentry))
			continue;

		for (s = r; s; s = next_mnt(s, r)) {
			if (!(flag & CL_COPY_ALL) && IS_MNT_UNBINDABLE(s)) {
				s = skip_mnt_tree(s);
				continue;
			}
			while (p != s->mnt_parent) {
				p = p->mnt_parent;
				q = q->mnt_parent;
			}
			p = s;
			path.mnt = q;
			path.dentry = p->mnt_mountpoint;
			q = clone_mnt(p, p->mnt_root, flag);
			if (!q)
				goto Enomem;
			br_write_lock(vfsmount_lock);
			list_add_tail(&q->mnt_list, &res->mnt_list);
			attach_mnt(q, &path);
			br_write_unlock(vfsmount_lock);
		}
	}
	return res;
Enomem:
	if (res) {
		LIST_HEAD(umount_list);
		br_write_lock(vfsmount_lock);
		umount_tree(res, 0, &umount_list);
		br_write_unlock(vfsmount_lock);
		release_mounts(&umount_list);
	}
	return NULL;
}

struct vfsmount *collect_mounts(struct path *path)
{
	struct vfsmount *tree;
	down_write(&namespace_sem);
	tree = copy_tree(path->mnt, path->dentry, CL_COPY_ALL | CL_PRIVATE);
	up_write(&namespace_sem);
	return tree;
}

void drop_collected_mounts(struct vfsmount *mnt)
{
	LIST_HEAD(umount_list);
	down_write(&namespace_sem);
	br_write_lock(vfsmount_lock);
	umount_tree(mnt, 0, &umount_list);
	br_write_unlock(vfsmount_lock);
	up_write(&namespace_sem);
	release_mounts(&umount_list);
}

int iterate_mounts(int (*f)(struct vfsmount *, void *), void *arg,
		   struct vfsmount *root)
{
	struct vfsmount *mnt;
	int res = f(root, arg);
	if (res)
		return res;
	list_for_each_entry(mnt, &root->mnt_list, mnt_list) {
		res = f(mnt, arg);
		if (res)
			return res;
	}
	return 0;
}

static void cleanup_group_ids(struct vfsmount *mnt, struct vfsmount *end)
{
	struct vfsmount *p;

	for (p = mnt; p != end; p = next_mnt(p, mnt)) {
		if (p->mnt_group_id && !IS_MNT_SHARED(p))
			mnt_release_group_id(p);
	}
}

static int invent_group_ids(struct vfsmount *mnt, bool recurse)
{
	struct vfsmount *p;
	for (p = mnt; p; p = recurse ? next_mnt(p, mnt) : NULL) {
		if (!p->mnt_group_id && !IS_MNT_SHARED(p)) {
			int err = mnt_alloc_group_id(p);
			if (err) {
				cleanup_group_ids(mnt, p);
				return err;
			}
		}
	}

	return 0;
}

/*
 *  @source_mnt : mount tree to be attached
 *  @nd         : place the mount tree @source_mnt is attached
 *  @parent_nd  : if non-null, detach the source_mnt from its parent and
 *  		   store the parent mount and mountpoint dentry.
 *  		   (done when source_mnt is moved)
 *
 *  NOTE: the table below explains the semantics when a source mount
 *  of a given type is attached to a destination mount of a given type.
 * ---------------------------------------------------------------------------
 * |         BIND MOUNT OPERATION                                            |
 * |**************************************************************************
 * | source-->| shared        |       private  |       slave    | unbindable |
 * | dest     |               |                |                |            |
 * |   |      |               |                |                |            |
 * |   v      |               |                |                |            |
 * |**************************************************************************
 * |  shared  | shared (++)   |     shared (+) |     shared(+++)|  invalid   |
 * |          |               |                |                |            |
 * |non-shared| shared (+)    |      private   |      slave (*) |  invalid   |
 * ***************************************************************************
 * A bind operation clones the source mount and mounts the clone on the
 * destination mount.
 *
 * (++)  the cloned mount is propagated to all the mounts in the propagation
 * 	 tree of the destination mount and the cloned mount is added to
 * 	 the peer group of the source mount.
 * (+)   the cloned mount is created under the destination mount and is marked
 *       as shared. The cloned mount is added to the peer group of the source
 *       mount.
 * (+++) the mount is propagated to all the mounts in the propagation tree
 *       of the destination mount and the cloned mount is made slave
 *       of the same master as that of the source mount. The cloned mount
 *       is marked as 'shared and slave'.
 * (*)   the cloned mount is made a slave of the same master as that of the
 * 	 source mount.
 *
 * ---------------------------------------------------------------------------
 * |         		MOVE MOUNT OPERATION                                 |
 * |**************************************************************************
 * | source-->| shared        |       private  |       slave    | unbindable |
 * | dest     |               |                |                |            |
 * |   |      |               |                |                |            |
 * |   v      |               |                |                |            |
 * |**************************************************************************
 * |  shared  | shared (+)    |     shared (+) |    shared(+++) |  invalid   |
 * |          |               |                |                |            |
 * |non-shared| shared (+*)   |      private   |    slave (*)   | unbindable |
 * ***************************************************************************
 *
 * (+)   the mount is moved to the destination. And is then propagated to
 * 	 all the mounts in the propagation tree of the destination mount.
 * (+*)  the mount is moved to the destination.
 * (+++) the mount is moved to the destination and is then propagated to
 * 	 all the mounts belonging to the destination mount's propagation tree.
 * 	 the mount is marked as 'shared and slave'.
 * (*)	 the mount continues to be a slave at the new location.
 *
 * if the source mount is a tree, the operations explained above are
 * applied to each mount in the tree.
 * Must be called without spinlocks held, since this function can sleep
 * in allocations.
 */
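/*
 * Concrete reading of the BIND MOUNT table above (illustrative): binding a
 * shared source mount onto a private (non-shared) destination is the
 * "shared (+)" case -- the clone is created under the destination, marked
 * shared, and added to the peer group of the source mount.
 */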
static int attach_recursive_mnt(struct vfsmount *source_mnt,
			struct path *path, struct path *parent_path)
{
	LIST_HEAD(tree_list);
	struct vfsmount *dest_mnt = path->mnt;
	struct dentry *dest_dentry = path->dentry;
	struct vfsmount *child, *p;
	int err;

	if (IS_MNT_SHARED(dest_mnt)) {
		err = invent_group_ids(source_mnt, true);
		if (err)
			goto out;
	}
	err = propagate_mnt(dest_mnt, dest_dentry, source_mnt, &tree_list);
	if (err)
		goto out_cleanup_ids;

	br_write_lock(vfsmount_lock);

	if (IS_MNT_SHARED(dest_mnt)) {
		for (p = source_mnt; p; p = next_mnt(p, source_mnt))
			set_mnt_shared(p);
	}
	if (parent_path) {
		detach_mnt(source_mnt, parent_path);
		attach_mnt(source_mnt, path);
		touch_mnt_namespace(parent_path->mnt->mnt_ns);
	} else {
		mnt_set_mountpoint(dest_mnt, dest_dentry, source_mnt);
		commit_tree(source_mnt);
	}

	list_for_each_entry_safe(child, p, &tree_list, mnt_hash) {
		list_del_init(&child->mnt_hash);
		commit_tree(child);
	}
	br_write_unlock(vfsmount_lock);

	return 0;

out_cleanup_ids:
	if (IS_MNT_SHARED(dest_mnt))
		cleanup_group_ids(source_mnt, NULL);
out:
	return err;
}

static int graft_tree(struct vfsmount *mnt, struct path *path)
{
	int err;
	if (mnt->mnt_sb->s_flags & MS_NOUSER)
		return -EINVAL;

	if (S_ISDIR(path->dentry->d_inode->i_mode) !=
	      S_ISDIR(mnt->mnt_root->d_inode->i_mode))
		return -ENOTDIR;

	err = -ENOENT;
	mutex_lock(&path->dentry->d_inode->i_mutex);
	if (cant_mount(path->dentry))
		goto out_unlock;

	if (!d_unlinked(path->dentry))
		err = attach_recursive_mnt(mnt, path, NULL);
out_unlock:
	mutex_unlock(&path->dentry->d_inode->i_mutex);
	return err;
}

/*
 * Sanity check the flags to change_mnt_propagation.
 */

static int flags_to_propagation_type(int flags)
{
	int type = flags & ~MS_REC;

	/* Fail if any non-propagation flags are set */
	if (type & ~(MS_SHARED | MS_PRIVATE | MS_SLAVE | MS_UNBINDABLE))
		return 0;
	/* Only one propagation flag should be set */
	if (!is_power_of_2(type))
		return 0;
	return type;
}

/*
 * recursively change the type of the mountpoint.
 */
static int do_change_type(struct path *path, int flag)
{
	struct vfsmount *m, *mnt = path->mnt;
	int recurse = flag & MS_REC;
	int type;
	int err = 0;

	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;

	if (path->dentry != path->mnt->mnt_root)
		return -EINVAL;

	type = flags_to_propagation_type(flag);
	if (!type)
		return -EINVAL;

	down_write(&namespace_sem);
	if (type == MS_SHARED) {
		err = invent_group_ids(mnt, recurse);
		if (err)
			goto out_unlock;
	}

	br_write_lock(vfsmount_lock);
	for (m = mnt; m; m = (recurse ? next_mnt(m, mnt) : NULL))
		change_mnt_propagation(m, type);
	br_write_unlock(vfsmount_lock);

out_unlock:
	up_write(&namespace_sem);
	return err;
}

/*
 * do loopback mount.
 */
static int do_loopback(struct path *path, char *old_name,
				int recurse)
{
	struct path old_path;
	struct vfsmount *mnt = NULL;
	int err = mount_is_safe(path);
	if (err)
		return err;
	if (!old_name || !*old_name)
		return -EINVAL;
	err = kern_path(old_name, LOOKUP_FOLLOW, &old_path);
	if (err)
		return err;

	down_write(&namespace_sem);
	err = -EINVAL;
	if (IS_MNT_UNBINDABLE(old_path.mnt))
		goto out;

	if (!check_mnt(path->mnt) || !check_mnt(old_path.mnt))
		goto out;

	err = -ENOMEM;
	if (recurse)
		mnt = copy_tree(old_path.mnt, old_path.dentry, 0);
	else
		mnt = clone_mnt(old_path.mnt, old_path.dentry, 0);

	if (!mnt)
		goto out;

	err = graft_tree(mnt, path);
	if (err) {
		LIST_HEAD(umount_list);

		br_write_lock(vfsmount_lock);
		umount_tree(mnt, 0, &umount_list);
		br_write_unlock(vfsmount_lock);
		release_mounts(&umount_list);
	}

out:
	up_write(&namespace_sem);
	path_put(&old_path);
	return err;
}

static int change_mount_flags(struct vfsmount *mnt, int ms_flags)
{
	int error = 0;
	int readonly_request = 0;

	if (ms_flags & MS_RDONLY)
		readonly_request = 1;
	if (readonly_request == __mnt_is_readonly(mnt))
		return 0;

	if (readonly_request)
		error = mnt_make_readonly(mnt);
	else
		__mnt_unmake_readonly(mnt);
	return error;
}

/*
 * change filesystem flags. dir should be a physical root of filesystem.
 * If you've mounted a non-root directory somewhere and want to do remount
 * on it - tough luck.
 */
static int do_remount(struct path *path, int flags, int mnt_flags,
		      void *data)
{
	int err;
	struct super_block *sb = path->mnt->mnt_sb;

	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;

	if (!check_mnt(path->mnt))
		return -EINVAL;

	if (path->dentry != path->mnt->mnt_root)
		return -EINVAL;

	down_write(&sb->s_umount);
	if (flags & MS_BIND)
		err = change_mount_flags(path->mnt, flags);
	else
		err = do_remount_sb(sb, flags, data, 0);
	if (!err) {
		br_write_lock(vfsmount_lock);
		mnt_flags |= path->mnt->mnt_flags & MNT_PROPAGATION_MASK;
		path->mnt->mnt_flags = mnt_flags;
		br_write_unlock(vfsmount_lock);
	}
	up_write(&sb->s_umount);
	if (!err) {
		br_write_lock(vfsmount_lock);
		touch_mnt_namespace(path->mnt->mnt_ns);
		br_write_unlock(vfsmount_lock);
	}
	return err;
}

static inline int tree_contains_unbindable(struct vfsmount *mnt)
{
	struct vfsmount *p;
	for (p = mnt; p; p = next_mnt(p, mnt)) {
		if (IS_MNT_UNBINDABLE(p))
			return 1;
	}
	return 0;
}

static int do_move_mount(struct path *path, char *old_name)
{
	struct path old_path, parent_path;
	struct vfsmount *p;
	int err = 0;
	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;
	if (!old_name || !*old_name)
		return -EINVAL;
	err = kern_path(old_name, LOOKUP_FOLLOW, &old_path);
	if (err)
		return err;

	down_write(&namespace_sem);
	while (d_mountpoint(path->dentry) &&
	       follow_down(path))
		;
	err = -EINVAL;
	if (!check_mnt(path->mnt) || !check_mnt(old_path.mnt))
		goto out;

	err = -ENOENT;
	mutex_lock(&path->dentry->d_inode->i_mutex);
	if (cant_mount(path->dentry))
		goto out1;
	if (d_unlinked(path->dentry))
		goto out1;

	err = -EINVAL;
	if (old_path.dentry != old_path.mnt->mnt_root)
		goto out1;

	if (old_path.mnt == old_path.mnt->mnt_parent)
		goto out1;

	if (S_ISDIR(path->dentry->d_inode->i_mode) !=
	      S_ISDIR(old_path.dentry->d_inode->i_mode))
		goto out1;
	/*
	 * Don't move a mount residing in a shared parent.
	 */
	if (old_path.mnt->mnt_parent &&
	    IS_MNT_SHARED(old_path.mnt->mnt_parent))
		goto out1;
	/*
	 * Don't move a mount tree containing unbindable mounts to a destination
	 * mount which is shared.
	 */
	if (IS_MNT_SHARED(path->mnt) &&
	    tree_contains_unbindable(old_path.mnt))
		goto out1;
	err = -ELOOP;
	for (p = path->mnt; p->mnt_parent != p; p = p->mnt_parent)
		if (p == old_path.mnt)
			goto out1;

	err = attach_recursive_mnt(old_path.mnt, path, &parent_path);
	if (err)
		goto out1;

	/* if the mount is moved, it should no longer be expired
	 * automatically */
	list_del_init(&old_path.mnt->mnt_expire);
out1:
	mutex_unlock(&path->dentry->d_inode->i_mutex);
out:
	up_write(&namespace_sem);
	if (!err)
		path_put(&parent_path);
	path_put(&old_path);
	return err;
}

/*
 * create a new mount for userspace and request it to be added into the
 * namespace's tree
 */
static int do_new_mount(struct path *path, char *type, int flags,
			int mnt_flags, char *name, void *data)
{
	struct vfsmount *mnt;

	if (!type)
		return -EINVAL;

	/* we need capabilities... */
	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;

	mnt = do_kern_mount(type, flags, name, data);
	if (IS_ERR(mnt))
		return PTR_ERR(mnt);

	return do_add_mount(mnt, path, mnt_flags, NULL);
}

/*
 * add a mount into a namespace's mount tree
 * - provide the option of adding the new mount to an expiration list
 */
int do_add_mount(struct vfsmount *newmnt, struct path *path,
		 int mnt_flags, struct list_head *fslist)
{
	int err;

	mnt_flags &= ~(MNT_SHARED | MNT_WRITE_HOLD | MNT_INTERNAL);

	down_write(&namespace_sem);
	/* Something was mounted here while we slept */
	while (d_mountpoint(path->dentry) &&
	       follow_down(path))
		;
	err = -EINVAL;
	if (!(mnt_flags & MNT_SHRINKABLE) && !check_mnt(path->mnt))
		goto unlock;

	/* Refuse the same filesystem on the same mount point */
	err = -EBUSY;
	if (path->mnt->mnt_sb == newmnt->mnt_sb &&
	    path->mnt->mnt_root == path->dentry)
		goto unlock;

	err = -EINVAL;
	if (S_ISLNK(newmnt->mnt_root->d_inode->i_mode))
		goto unlock;

	newmnt->mnt_flags = mnt_flags;
	if ((err = graft_tree(newmnt, path)))
		goto unlock;

	if (fslist) /* add to the specified expiration list */
		list_add_tail(&newmnt->mnt_expire, fslist);

	up_write(&namespace_sem);
	return 0;

unlock:
	up_write(&namespace_sem);
	mntput_long(newmnt);
	return err;
}

EXPORT_SYMBOL_GPL(do_add_mount);
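/*
 * Typical use of the expiration list (sketch based on automount-style
 * callers, not code in this file): a filesystem passes its own list and
 * MNT_SHRINKABLE to do_add_mount(), then periodically calls
 * mark_mounts_for_expiry() on that list so unused automounted trees go
 * away.  "my_expiry_list" below is a hypothetical list_head:
 *
 *	err = do_add_mount(newmnt, &path, MNT_SHRINKABLE, &my_expiry_list);
 *	...
 *	mark_mounts_for_expiry(&my_expiry_list);   (from a periodic worker)
 */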
/*
 * process a list of expirable mountpoints with the intent of discarding any
 * mountpoints that aren't in use and haven't been touched since last we came
 * here
 */
void mark_mounts_for_expiry(struct list_head *mounts)
{
	struct vfsmount *mnt, *next;
	LIST_HEAD(graveyard);
	LIST_HEAD(umounts);

	if (list_empty(mounts))
		return;

	down_write(&namespace_sem);
	br_write_lock(vfsmount_lock);

	/* extract from the expiration list every vfsmount that matches the
	 * following criteria:
	 * - only referenced by its parent vfsmount
	 * - still marked for expiry (marked on the last call here; marks are
	 *   cleared by mntput())
	 */
	list_for_each_entry_safe(mnt, next, mounts, mnt_expire) {
		if (!xchg(&mnt->mnt_expiry_mark, 1) ||
			propagate_mount_busy(mnt, 1))
			continue;
		list_move(&mnt->mnt_expire, &graveyard);
	}
	while (!list_empty(&graveyard)) {
		mnt = list_first_entry(&graveyard, struct vfsmount, mnt_expire);
		touch_mnt_namespace(mnt->mnt_ns);
		umount_tree(mnt, 1, &umounts);
	}
	br_write_unlock(vfsmount_lock);
	up_write(&namespace_sem);

	release_mounts(&umounts);
}

EXPORT_SYMBOL_GPL(mark_mounts_for_expiry);

/*
 * Ripoff of 'select_parent()'
 *
 * search the list of submounts for a given mountpoint, and move any
 * shrinkable submounts to the 'graveyard' list.
 */
static int select_submounts(struct vfsmount *parent, struct list_head *graveyard)
{
	struct vfsmount *this_parent = parent;
	struct list_head *next;
	int found = 0;

repeat:
	next = this_parent->mnt_mounts.next;
resume:
	while (next != &this_parent->mnt_mounts) {
		struct list_head *tmp = next;
		struct vfsmount *mnt = list_entry(tmp, struct vfsmount, mnt_child);

		next = tmp->next;
		if (!(mnt->mnt_flags & MNT_SHRINKABLE))
			continue;
		/*
		 * Descend a level if the d_mounts list is non-empty.
		 */
		if (!list_empty(&mnt->mnt_mounts)) {
			this_parent = mnt;
			goto repeat;
		}

		if (!propagate_mount_busy(mnt, 1)) {
			list_move_tail(&mnt->mnt_expire, graveyard);
			found++;
		}
	}
	/*
	 * All done at this level ... ascend and resume the search
	 */
	if (this_parent != parent) {
		next = this_parent->mnt_child.next;
		this_parent = this_parent->mnt_parent;
		goto resume;
	}
	return found;
}

/*
 * process a list of expirable mountpoints with the intent of discarding any
 * submounts of a specific parent mountpoint
 *
 * vfsmount_lock must be held for write
 */
static void shrink_submounts(struct vfsmount *mnt, struct list_head *umounts)
{
	LIST_HEAD(graveyard);
	struct vfsmount *m;

	/* extract submounts of 'mountpoint' from the expiration list */
	while (select_submounts(mnt, &graveyard)) {
		while (!list_empty(&graveyard)) {
			m = list_first_entry(&graveyard, struct vfsmount,
						mnt_expire);
			touch_mnt_namespace(m->mnt_ns);
			umount_tree(m, 1, umounts);
		}
	}
}

/*
 * Some copy_from_user() implementations do not return the exact number of
 * bytes remaining to copy on a fault.  But copy_mount_options() requires that.
 * Note that this function differs from copy_from_user() in that it will oops
 * on bad values of `to', rather than returning a short copy.
 */

/*
 * Some copy_from_user() implementations do not return the exact number of
 * bytes remaining to copy on a fault.  But copy_mount_options() requires that.
 * Note that this function differs from copy_from_user() in that it will oops
 * on bad values of `to', rather than returning a short copy.
 */
static long exact_copy_from_user(void *to, const void __user * from,
				 unsigned long n)
{
	char *t = to;
	const char __user *f = from;
	char c;

	if (!access_ok(VERIFY_READ, from, n))
		return n;

	while (n) {
		if (__get_user(c, f)) {
			memset(t, 0, n);
			break;
		}
		*t++ = c;
		f++;
		n--;
	}
	return n;
}

int copy_mount_options(const void __user * data, unsigned long *where)
{
	int i;
	unsigned long page;
	unsigned long size;

	*where = 0;
	if (!data)
		return 0;

	if (!(page = __get_free_page(GFP_KERNEL)))
		return -ENOMEM;

	/* We only care that *some* data at the address the user
	 * gave us is valid.  Just in case, we'll zero
	 * the remainder of the page.
	 */
	/* copy_from_user cannot cross TASK_SIZE ! */
	size = TASK_SIZE - (unsigned long)data;
	if (size > PAGE_SIZE)
		size = PAGE_SIZE;

	i = size - exact_copy_from_user((void *)page, data, size);
	if (!i) {
		free_page(page);
		return -EFAULT;
	}
	if (i != PAGE_SIZE)
		memset((char *)page + i, 0, PAGE_SIZE - i);
	*where = page;
	return 0;
}

int copy_mount_string(const void __user *data, char **where)
{
	char *tmp;

	if (!data) {
		*where = NULL;
		return 0;
	}

	tmp = strndup_user(data, PAGE_SIZE);
	if (IS_ERR(tmp))
		return PTR_ERR(tmp);

	*where = tmp;
	return 0;
}

/*
 * Flags is a 32-bit value that allows up to 31 non-fs dependent flags to
 * be given to the mount() call (ie: read-only, no-dev, no-suid etc).
 *
 * data is a (void *) that can point to any structure up to
 * PAGE_SIZE-1 bytes, which can contain arbitrary fs-dependent
 * information (or be NULL).
 *
 * Pre-0.97 versions of mount() didn't have a flags word.
 * When the flags word was introduced its top half was required
 * to have the magic value 0xC0ED, and this remained so until 2.4.0-test9.
 * Therefore, if this magic number is present, it carries no information
 * and must be discarded.
 */
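/*
 * Illustrative sketch only (the call below is an example, not taken from
 * this file): userspace reaches do_mount() through sys_mount(), e.g.
 *
 *	mount("/dev/sda1", "/mnt", "ext4",
 *	      MS_RDONLY | MS_NOSUID, "errors=remount-ro");
 *
 * where "errors=remount-ro" arrives as the opaque data page described
 * above, and MS_RDONLY/MS_NOSUID are translated into per-mountpoint
 * MNT_* flags below.  The device, directory and fs type are made up.
 */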
long do_mount(char *dev_name, char *dir_name, char *type_page,
		  unsigned long flags, void *data_page)
{
	struct path path;
	int retval = 0;
	int mnt_flags = 0;

	/* Discard magic */
	if ((flags & MS_MGC_MSK) == MS_MGC_VAL)
		flags &= ~MS_MGC_MSK;

	/* Basic sanity checks */

	if (!dir_name || !*dir_name || !memchr(dir_name, 0, PAGE_SIZE))
		return -EINVAL;

	if (data_page)
		((char *)data_page)[PAGE_SIZE - 1] = 0;

	/* ... and get the mountpoint */
	retval = kern_path(dir_name, LOOKUP_FOLLOW, &path);
	if (retval)
		return retval;

	retval = security_sb_mount(dev_name, &path,
				   type_page, flags, data_page);
	if (retval)
		goto dput_out;

	/* Default to relatime unless overridden */
	if (!(flags & MS_NOATIME))
		mnt_flags |= MNT_RELATIME;

	/* Separate the per-mountpoint flags */
	if (flags & MS_NOSUID)
		mnt_flags |= MNT_NOSUID;
	if (flags & MS_NODEV)
		mnt_flags |= MNT_NODEV;
	if (flags & MS_NOEXEC)
		mnt_flags |= MNT_NOEXEC;
	if (flags & MS_NOATIME)
		mnt_flags |= MNT_NOATIME;
	if (flags & MS_NODIRATIME)
		mnt_flags |= MNT_NODIRATIME;
	if (flags & MS_STRICTATIME)
		mnt_flags &= ~(MNT_RELATIME | MNT_NOATIME);
	if (flags & MS_RDONLY)
		mnt_flags |= MNT_READONLY;

	flags &= ~(MS_NOSUID | MS_NOEXEC | MS_NODEV | MS_ACTIVE | MS_BORN |
		   MS_NOATIME | MS_NODIRATIME | MS_RELATIME | MS_KERNMOUNT |
		   MS_STRICTATIME);

	if (flags & MS_REMOUNT)
		retval = do_remount(&path, flags & ~MS_REMOUNT, mnt_flags,
				    data_page);
	else if (flags & MS_BIND)
		retval = do_loopback(&path, dev_name, flags & MS_REC);
	else if (flags & (MS_SHARED | MS_PRIVATE | MS_SLAVE | MS_UNBINDABLE))
		retval = do_change_type(&path, flags);
	else if (flags & MS_MOVE)
		retval = do_move_mount(&path, dev_name);
	else
		retval = do_new_mount(&path, type_page, flags, mnt_flags,
				      dev_name, data_page);
dput_out:
	path_put(&path);
	return retval;
}

static struct mnt_namespace *alloc_mnt_ns(void)
{
	struct mnt_namespace *new_ns;

	new_ns = kmalloc(sizeof(struct mnt_namespace), GFP_KERNEL);
	if (!new_ns)
		return ERR_PTR(-ENOMEM);
	atomic_set(&new_ns->count, 1);
	new_ns->root = NULL;
	INIT_LIST_HEAD(&new_ns->list);
	init_waitqueue_head(&new_ns->poll);
	new_ns->event = 0;
	return new_ns;
}

/*
 * Allocate a new namespace structure and populate it with contents
 * copied from the namespace of the passed in task structure.
 */
static struct mnt_namespace *dup_mnt_ns(struct mnt_namespace *mnt_ns,
		struct fs_struct *fs)
{
	struct mnt_namespace *new_ns;
	struct vfsmount *rootmnt = NULL, *pwdmnt = NULL;
	struct vfsmount *p, *q;

	new_ns = alloc_mnt_ns();
	if (IS_ERR(new_ns))
		return new_ns;

	down_write(&namespace_sem);
	/* First pass: copy the tree topology */
	new_ns->root = copy_tree(mnt_ns->root, mnt_ns->root->mnt_root,
					CL_COPY_ALL | CL_EXPIRE);
	if (!new_ns->root) {
		up_write(&namespace_sem);
		kfree(new_ns);
		return ERR_PTR(-ENOMEM);
	}
	br_write_lock(vfsmount_lock);
	list_add_tail(&new_ns->list, &new_ns->root->mnt_list);
	br_write_unlock(vfsmount_lock);

	/*
	 * Second pass: switch the tsk->fs->* elements and mark new vfsmounts
	 * as belonging to new namespace.  We have already acquired a private
	 * fs_struct, so tsk->fs->lock is not needed.
	 */
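	/*
	 * p walks the old tree while q walks the copy; since copy_tree()
	 * reproduced the topology exactly, stepping both with next_mnt()
	 * keeps q pointing at the clone of p, which is what lets the
	 * caller's root and pwd below be redirected to the matching new
	 * mounts.
	 */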
2297 */ 2298 p = mnt_ns->root; 2299 q = new_ns->root; 2300 while (p) { 2301 q->mnt_ns = new_ns; 2302 if (fs) { 2303 if (p == fs->root.mnt) { 2304 rootmnt = p; 2305 fs->root.mnt = mntget_long(q); 2306 } 2307 if (p == fs->pwd.mnt) { 2308 pwdmnt = p; 2309 fs->pwd.mnt = mntget_long(q); 2310 } 2311 } 2312 p = next_mnt(p, mnt_ns->root); 2313 q = next_mnt(q, new_ns->root); 2314 } 2315 up_write(&namespace_sem); 2316 2317 if (rootmnt) 2318 mntput_long(rootmnt); 2319 if (pwdmnt) 2320 mntput_long(pwdmnt); 2321 2322 return new_ns; 2323 } 2324 2325 struct mnt_namespace *copy_mnt_ns(unsigned long flags, struct mnt_namespace *ns, 2326 struct fs_struct *new_fs) 2327 { 2328 struct mnt_namespace *new_ns; 2329 2330 BUG_ON(!ns); 2331 get_mnt_ns(ns); 2332 2333 if (!(flags & CLONE_NEWNS)) 2334 return ns; 2335 2336 new_ns = dup_mnt_ns(ns, new_fs); 2337 2338 put_mnt_ns(ns); 2339 return new_ns; 2340 } 2341 2342 /** 2343 * create_mnt_ns - creates a private namespace and adds a root filesystem 2344 * @mnt: pointer to the new root filesystem mountpoint 2345 */ 2346 struct mnt_namespace *create_mnt_ns(struct vfsmount *mnt) 2347 { 2348 struct mnt_namespace *new_ns; 2349 2350 new_ns = alloc_mnt_ns(); 2351 if (!IS_ERR(new_ns)) { 2352 mnt->mnt_ns = new_ns; 2353 new_ns->root = mnt; 2354 list_add(&new_ns->list, &new_ns->root->mnt_list); 2355 } 2356 return new_ns; 2357 } 2358 EXPORT_SYMBOL(create_mnt_ns); 2359 2360 SYSCALL_DEFINE5(mount, char __user *, dev_name, char __user *, dir_name, 2361 char __user *, type, unsigned long, flags, void __user *, data) 2362 { 2363 int ret; 2364 char *kernel_type; 2365 char *kernel_dir; 2366 char *kernel_dev; 2367 unsigned long data_page; 2368 2369 ret = copy_mount_string(type, &kernel_type); 2370 if (ret < 0) 2371 goto out_type; 2372 2373 kernel_dir = getname(dir_name); 2374 if (IS_ERR(kernel_dir)) { 2375 ret = PTR_ERR(kernel_dir); 2376 goto out_dir; 2377 } 2378 2379 ret = copy_mount_string(dev_name, &kernel_dev); 2380 if (ret < 0) 2381 goto out_dev; 2382 2383 ret = copy_mount_options(data, &data_page); 2384 if (ret < 0) 2385 goto out_data; 2386 2387 ret = do_mount(kernel_dev, kernel_dir, kernel_type, flags, 2388 (void *) data_page); 2389 2390 free_page(data_page); 2391 out_data: 2392 kfree(kernel_dev); 2393 out_dev: 2394 putname(kernel_dir); 2395 out_dir: 2396 kfree(kernel_type); 2397 out_type: 2398 return ret; 2399 } 2400 2401 /* 2402 * pivot_root Semantics: 2403 * Moves the root file system of the current process to the directory put_old, 2404 * makes new_root as the new root file system of the current process, and sets 2405 * root/cwd of all processes which had them on the current root to new_root. 2406 * 2407 * Restrictions: 2408 * The new_root and put_old must be directories, and must not be on the 2409 * same file system as the current process root. The put_old must be 2410 * underneath new_root, i.e. adding a non-zero number of /.. to the string 2411 * pointed to by put_old must yield the same directory as new_root. No other 2412 * file system may be mounted on put_old. After all, new_root is a mountpoint. 2413 * 2414 * Also, the current root cannot be on the 'rootfs' (initial ramfs) filesystem. 2415 * See Documentation/filesystems/ramfs-rootfs-initramfs.txt for alternatives 2416 * in this situation. 2417 * 2418 * Notes: 2419 * - we don't move root/cwd if they are not at the root (reason: if something 2420 * cared enough to change them, it's probably wrong to force them elsewhere) 2421 * - it's okay to pick a root that isn't the root of a file system, e.g. 
/*
 * pivot_root Semantics:
 * Moves the root file system of the current process to the directory put_old,
 * makes new_root the new root file system of the current process, and sets
 * root/cwd of all processes which had them on the current root to new_root.
 *
 * Restrictions:
 * The new_root and put_old must be directories, and must not be on the
 * same file system as the current process root. The put_old must be
 * underneath new_root, i.e. adding a non-zero number of /.. to the string
 * pointed to by put_old must yield the same directory as new_root. No other
 * file system may be mounted on put_old. After all, new_root is a mountpoint.
 *
 * Also, the current root cannot be on the 'rootfs' (initial ramfs) filesystem.
 * See Documentation/filesystems/ramfs-rootfs-initramfs.txt for alternatives
 * in this situation.
 *
 * Notes:
 *  - we don't move root/cwd if they are not at the root (reason: if something
 *    cared enough to change them, it's probably wrong to force them elsewhere)
 *  - it's okay to pick a root that isn't the root of a file system, e.g.
 *    /nfs/my_root where /nfs is the mount point. It must be a mountpoint,
 *    though, so you may need to say mount --bind /nfs/my_root /nfs/my_root
 *    first.
 */
SYSCALL_DEFINE2(pivot_root, const char __user *, new_root,
		const char __user *, put_old)
{
	struct vfsmount *tmp;
	struct path new, old, parent_path, root_parent, root;
	int error;

	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;

	error = user_path_dir(new_root, &new);
	if (error)
		goto out0;
	error = -EINVAL;
	if (!check_mnt(new.mnt))
		goto out1;

	error = user_path_dir(put_old, &old);
	if (error)
		goto out1;

	error = security_sb_pivotroot(&old, &new);
	if (error) {
		path_put(&old);
		goto out1;
	}

	get_fs_root(current->fs, &root);
	down_write(&namespace_sem);
	mutex_lock(&old.dentry->d_inode->i_mutex);
	error = -EINVAL;
	if (IS_MNT_SHARED(old.mnt) ||
		IS_MNT_SHARED(new.mnt->mnt_parent) ||
		IS_MNT_SHARED(root.mnt->mnt_parent))
		goto out2;
	if (!check_mnt(root.mnt))
		goto out2;
	error = -ENOENT;
	if (cant_mount(old.dentry))
		goto out2;
	if (d_unlinked(new.dentry))
		goto out2;
	if (d_unlinked(old.dentry))
		goto out2;
	error = -EBUSY;
	if (new.mnt == root.mnt ||
		old.mnt == root.mnt)
		goto out2; /* loop, on the same file system  */
	error = -EINVAL;
	if (root.mnt->mnt_root != root.dentry)
		goto out2; /* not a mountpoint */
	if (root.mnt->mnt_parent == root.mnt)
		goto out2; /* not attached */
	if (new.mnt->mnt_root != new.dentry)
		goto out2; /* not a mountpoint */
	if (new.mnt->mnt_parent == new.mnt)
		goto out2; /* not attached */
	/* make sure we can reach put_old from new_root */
	tmp = old.mnt;
	br_write_lock(vfsmount_lock);
	if (tmp != new.mnt) {
		for (;;) {
			if (tmp->mnt_parent == tmp)
				goto out3; /* already mounted on put_old */
			if (tmp->mnt_parent == new.mnt)
				break;
			tmp = tmp->mnt_parent;
		}
		if (!is_subdir(tmp->mnt_mountpoint, new.dentry))
			goto out3;
	} else if (!is_subdir(old.dentry, new.dentry))
		goto out3;
	detach_mnt(new.mnt, &parent_path);
	detach_mnt(root.mnt, &root_parent);
	/* mount old root on put_old */
	attach_mnt(root.mnt, &old);
	/* mount new_root on / */
	attach_mnt(new.mnt, &root_parent);
	touch_mnt_namespace(current->nsproxy->mnt_ns);
	br_write_unlock(vfsmount_lock);
	chroot_fs_refs(&root, &new);

	error = 0;
	path_put(&root_parent);
	path_put(&parent_path);
out2:
	mutex_unlock(&old.dentry->d_inode->i_mutex);
	up_write(&namespace_sem);
	path_put(&root);
	path_put(&old);
out1:
	path_put(&new);
out0:
	return error;
out3:
	br_write_unlock(vfsmount_lock);
	goto out2;
}

static void __init init_mount_tree(void)
{
	struct vfsmount *mnt;
	struct mnt_namespace *ns;
	struct path root;

	mnt = do_kern_mount("rootfs", 0, "rootfs", NULL);
	if (IS_ERR(mnt))
		panic("Can't create rootfs");

	ns = create_mnt_ns(mnt);
	if (IS_ERR(ns))
		panic("Can't allocate initial namespace");

	init_task.nsproxy->mnt_ns = ns;
	get_mnt_ns(ns);

	root.mnt = ns->root;
	root.dentry = ns->root->mnt_root;

	set_fs_pwd(current->fs, &root);
	set_fs_root(current->fs, &root);
}
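
/*
 * Set up the mount machinery once during early VFS bring-up: the vfsmount
 * slab cache, the single-page mount hash table, the /sys/fs kobject, and
 * finally rootfs plus the initial mount tree.
 */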
void __init mnt_init(void)
{
	unsigned u;
	int err;

	init_rwsem(&namespace_sem);

	mnt_cache = kmem_cache_create("mnt_cache", sizeof(struct vfsmount),
			0, SLAB_HWCACHE_ALIGN | SLAB_PANIC, NULL);

	mount_hashtable = (struct list_head *)__get_free_page(GFP_ATOMIC);

	if (!mount_hashtable)
		panic("Failed to allocate mount hash table\n");

	printk("Mount-cache hash table entries: %lu\n", HASH_SIZE);

	for (u = 0; u < HASH_SIZE; u++)
		INIT_LIST_HEAD(&mount_hashtable[u]);

	br_lock_init(vfsmount_lock);

	err = sysfs_init();
	if (err)
		printk(KERN_WARNING "%s: sysfs_init error: %d\n",
			__func__, err);
	fs_kobj = kobject_create_and_add("fs", NULL);
	if (!fs_kobj)
		printk(KERN_WARNING "%s: kobj create error\n", __func__);
	init_rootfs();
	init_mount_tree();
}

void put_mnt_ns(struct mnt_namespace *ns)
{
	LIST_HEAD(umount_list);

	if (!atomic_dec_and_test(&ns->count))
		return;
	down_write(&namespace_sem);
	br_write_lock(vfsmount_lock);
	umount_tree(ns->root, 0, &umount_list);
	br_write_unlock(vfsmount_lock);
	up_write(&namespace_sem);
	release_mounts(&umount_list);
	kfree(ns);
}
EXPORT_SYMBOL(put_mnt_ns);