1 /* 2 * linux/fs/namespace.c 3 * 4 * (C) Copyright Al Viro 2000, 2001 5 * Released under GPL v2. 6 * 7 * Based on code from fs/super.c, copyright Linus Torvalds and others. 8 * Heavily rewritten. 9 */ 10 11 #include <linux/syscalls.h> 12 #include <linux/slab.h> 13 #include <linux/sched.h> 14 #include <linux/smp_lock.h> 15 #include <linux/init.h> 16 #include <linux/quotaops.h> 17 #include <linux/acct.h> 18 #include <linux/capability.h> 19 #include <linux/module.h> 20 #include <linux/seq_file.h> 21 #include <linux/namespace.h> 22 #include <linux/namei.h> 23 #include <linux/security.h> 24 #include <linux/mount.h> 25 #include <asm/uaccess.h> 26 #include <asm/unistd.h> 27 #include "pnode.h" 28 29 extern int __init init_rootfs(void); 30 31 #ifdef CONFIG_SYSFS 32 extern int __init sysfs_init(void); 33 #else 34 static inline int sysfs_init(void) 35 { 36 return 0; 37 } 38 #endif 39 40 /* spinlock for vfsmount related operations, inplace of dcache_lock */ 41 __cacheline_aligned_in_smp DEFINE_SPINLOCK(vfsmount_lock); 42 43 static int event; 44 45 static struct list_head *mount_hashtable __read_mostly; 46 static int hash_mask __read_mostly, hash_bits __read_mostly; 47 static kmem_cache_t *mnt_cache __read_mostly; 48 static struct rw_semaphore namespace_sem; 49 50 /* /sys/fs */ 51 decl_subsys(fs, NULL, NULL); 52 EXPORT_SYMBOL_GPL(fs_subsys); 53 54 static inline unsigned long hash(struct vfsmount *mnt, struct dentry *dentry) 55 { 56 unsigned long tmp = ((unsigned long)mnt / L1_CACHE_BYTES); 57 tmp += ((unsigned long)dentry / L1_CACHE_BYTES); 58 tmp = tmp + (tmp >> hash_bits); 59 return tmp & hash_mask; 60 } 61 62 struct vfsmount *alloc_vfsmnt(const char *name) 63 { 64 struct vfsmount *mnt = kmem_cache_alloc(mnt_cache, GFP_KERNEL); 65 if (mnt) { 66 memset(mnt, 0, sizeof(struct vfsmount)); 67 atomic_set(&mnt->mnt_count, 1); 68 INIT_LIST_HEAD(&mnt->mnt_hash); 69 INIT_LIST_HEAD(&mnt->mnt_child); 70 INIT_LIST_HEAD(&mnt->mnt_mounts); 71 INIT_LIST_HEAD(&mnt->mnt_list); 72 INIT_LIST_HEAD(&mnt->mnt_expire); 73 INIT_LIST_HEAD(&mnt->mnt_share); 74 INIT_LIST_HEAD(&mnt->mnt_slave_list); 75 INIT_LIST_HEAD(&mnt->mnt_slave); 76 if (name) { 77 int size = strlen(name) + 1; 78 char *newname = kmalloc(size, GFP_KERNEL); 79 if (newname) { 80 memcpy(newname, name, size); 81 mnt->mnt_devname = newname; 82 } 83 } 84 } 85 return mnt; 86 } 87 88 int simple_set_mnt(struct vfsmount *mnt, struct super_block *sb) 89 { 90 mnt->mnt_sb = sb; 91 mnt->mnt_root = dget(sb->s_root); 92 return 0; 93 } 94 95 EXPORT_SYMBOL(simple_set_mnt); 96 97 void free_vfsmnt(struct vfsmount *mnt) 98 { 99 kfree(mnt->mnt_devname); 100 kmem_cache_free(mnt_cache, mnt); 101 } 102 103 /* 104 * find the first or last mount at @dentry on vfsmount @mnt depending on 105 * @dir. If @dir is set return the first mount else return the last mount. 106 */ 107 struct vfsmount *__lookup_mnt(struct vfsmount *mnt, struct dentry *dentry, 108 int dir) 109 { 110 struct list_head *head = mount_hashtable + hash(mnt, dentry); 111 struct list_head *tmp = head; 112 struct vfsmount *p, *found = NULL; 113 114 for (;;) { 115 tmp = dir ? tmp->next : tmp->prev; 116 p = NULL; 117 if (tmp == head) 118 break; 119 p = list_entry(tmp, struct vfsmount, mnt_hash); 120 if (p->mnt_parent == mnt && p->mnt_mountpoint == dentry) { 121 found = p; 122 break; 123 } 124 } 125 return found; 126 } 127 128 /* 129 * lookup_mnt increments the ref count before returning 130 * the vfsmount struct. 131 */ 132 struct vfsmount *lookup_mnt(struct vfsmount *mnt, struct dentry *dentry) 133 { 134 struct vfsmount *child_mnt; 135 spin_lock(&vfsmount_lock); 136 if ((child_mnt = __lookup_mnt(mnt, dentry, 1))) 137 mntget(child_mnt); 138 spin_unlock(&vfsmount_lock); 139 return child_mnt; 140 } 141 142 static inline int check_mnt(struct vfsmount *mnt) 143 { 144 return mnt->mnt_namespace == current->namespace; 145 } 146 147 static void touch_namespace(struct namespace *ns) 148 { 149 if (ns) { 150 ns->event = ++event; 151 wake_up_interruptible(&ns->poll); 152 } 153 } 154 155 static void __touch_namespace(struct namespace *ns) 156 { 157 if (ns && ns->event != event) { 158 ns->event = event; 159 wake_up_interruptible(&ns->poll); 160 } 161 } 162 163 static void detach_mnt(struct vfsmount *mnt, struct nameidata *old_nd) 164 { 165 old_nd->dentry = mnt->mnt_mountpoint; 166 old_nd->mnt = mnt->mnt_parent; 167 mnt->mnt_parent = mnt; 168 mnt->mnt_mountpoint = mnt->mnt_root; 169 list_del_init(&mnt->mnt_child); 170 list_del_init(&mnt->mnt_hash); 171 old_nd->dentry->d_mounted--; 172 } 173 174 void mnt_set_mountpoint(struct vfsmount *mnt, struct dentry *dentry, 175 struct vfsmount *child_mnt) 176 { 177 child_mnt->mnt_parent = mntget(mnt); 178 child_mnt->mnt_mountpoint = dget(dentry); 179 dentry->d_mounted++; 180 } 181 182 static void attach_mnt(struct vfsmount *mnt, struct nameidata *nd) 183 { 184 mnt_set_mountpoint(nd->mnt, nd->dentry, mnt); 185 list_add_tail(&mnt->mnt_hash, mount_hashtable + 186 hash(nd->mnt, nd->dentry)); 187 list_add_tail(&mnt->mnt_child, &nd->mnt->mnt_mounts); 188 } 189 190 /* 191 * the caller must hold vfsmount_lock 192 */ 193 static void commit_tree(struct vfsmount *mnt) 194 { 195 struct vfsmount *parent = mnt->mnt_parent; 196 struct vfsmount *m; 197 LIST_HEAD(head); 198 struct namespace *n = parent->mnt_namespace; 199 200 BUG_ON(parent == mnt); 201 202 list_add_tail(&head, &mnt->mnt_list); 203 list_for_each_entry(m, &head, mnt_list) 204 m->mnt_namespace = n; 205 list_splice(&head, n->list.prev); 206 207 list_add_tail(&mnt->mnt_hash, mount_hashtable + 208 hash(parent, mnt->mnt_mountpoint)); 209 list_add_tail(&mnt->mnt_child, &parent->mnt_mounts); 210 touch_namespace(n); 211 } 212 213 static struct vfsmount *next_mnt(struct vfsmount *p, struct vfsmount *root) 214 { 215 struct list_head *next = p->mnt_mounts.next; 216 if (next == &p->mnt_mounts) { 217 while (1) { 218 if (p == root) 219 return NULL; 220 next = p->mnt_child.next; 221 if (next != &p->mnt_parent->mnt_mounts) 222 break; 223 p = p->mnt_parent; 224 } 225 } 226 return list_entry(next, struct vfsmount, mnt_child); 227 } 228 229 static struct vfsmount *skip_mnt_tree(struct vfsmount *p) 230 { 231 struct list_head *prev = p->mnt_mounts.prev; 232 while (prev != &p->mnt_mounts) { 233 p = list_entry(prev, struct vfsmount, mnt_child); 234 prev = p->mnt_mounts.prev; 235 } 236 return p; 237 } 238 239 static struct vfsmount *clone_mnt(struct vfsmount *old, struct dentry *root, 240 int flag) 241 { 242 struct super_block *sb = old->mnt_sb; 243 struct vfsmount *mnt = alloc_vfsmnt(old->mnt_devname); 244 245 if (mnt) { 246 mnt->mnt_flags = old->mnt_flags; 247 atomic_inc(&sb->s_active); 248 mnt->mnt_sb = sb; 249 mnt->mnt_root = dget(root); 250 mnt->mnt_mountpoint = mnt->mnt_root; 251 mnt->mnt_parent = mnt; 252 253 if (flag & CL_SLAVE) { 254 list_add(&mnt->mnt_slave, &old->mnt_slave_list); 255 mnt->mnt_master = old; 256 CLEAR_MNT_SHARED(mnt); 257 } else { 258 if ((flag & CL_PROPAGATION) || IS_MNT_SHARED(old)) 259 list_add(&mnt->mnt_share, &old->mnt_share); 260 if (IS_MNT_SLAVE(old)) 261 list_add(&mnt->mnt_slave, &old->mnt_slave); 262 mnt->mnt_master = old->mnt_master; 263 } 264 if (flag & CL_MAKE_SHARED) 265 set_mnt_shared(mnt); 266 267 /* stick the duplicate mount on the same expiry list 268 * as the original if that was on one */ 269 if (flag & CL_EXPIRE) { 270 spin_lock(&vfsmount_lock); 271 if (!list_empty(&old->mnt_expire)) 272 list_add(&mnt->mnt_expire, &old->mnt_expire); 273 spin_unlock(&vfsmount_lock); 274 } 275 } 276 return mnt; 277 } 278 279 static inline void __mntput(struct vfsmount *mnt) 280 { 281 struct super_block *sb = mnt->mnt_sb; 282 dput(mnt->mnt_root); 283 free_vfsmnt(mnt); 284 deactivate_super(sb); 285 } 286 287 void mntput_no_expire(struct vfsmount *mnt) 288 { 289 repeat: 290 if (atomic_dec_and_lock(&mnt->mnt_count, &vfsmount_lock)) { 291 if (likely(!mnt->mnt_pinned)) { 292 spin_unlock(&vfsmount_lock); 293 __mntput(mnt); 294 return; 295 } 296 atomic_add(mnt->mnt_pinned + 1, &mnt->mnt_count); 297 mnt->mnt_pinned = 0; 298 spin_unlock(&vfsmount_lock); 299 acct_auto_close_mnt(mnt); 300 security_sb_umount_close(mnt); 301 goto repeat; 302 } 303 } 304 305 EXPORT_SYMBOL(mntput_no_expire); 306 307 void mnt_pin(struct vfsmount *mnt) 308 { 309 spin_lock(&vfsmount_lock); 310 mnt->mnt_pinned++; 311 spin_unlock(&vfsmount_lock); 312 } 313 314 EXPORT_SYMBOL(mnt_pin); 315 316 void mnt_unpin(struct vfsmount *mnt) 317 { 318 spin_lock(&vfsmount_lock); 319 if (mnt->mnt_pinned) { 320 atomic_inc(&mnt->mnt_count); 321 mnt->mnt_pinned--; 322 } 323 spin_unlock(&vfsmount_lock); 324 } 325 326 EXPORT_SYMBOL(mnt_unpin); 327 328 /* iterator */ 329 static void *m_start(struct seq_file *m, loff_t *pos) 330 { 331 struct namespace *n = m->private; 332 struct list_head *p; 333 loff_t l = *pos; 334 335 down_read(&namespace_sem); 336 list_for_each(p, &n->list) 337 if (!l--) 338 return list_entry(p, struct vfsmount, mnt_list); 339 return NULL; 340 } 341 342 static void *m_next(struct seq_file *m, void *v, loff_t *pos) 343 { 344 struct namespace *n = m->private; 345 struct list_head *p = ((struct vfsmount *)v)->mnt_list.next; 346 (*pos)++; 347 return p == &n->list ? NULL : list_entry(p, struct vfsmount, mnt_list); 348 } 349 350 static void m_stop(struct seq_file *m, void *v) 351 { 352 up_read(&namespace_sem); 353 } 354 355 static inline void mangle(struct seq_file *m, const char *s) 356 { 357 seq_escape(m, s, " \t\n\\"); 358 } 359 360 static int show_vfsmnt(struct seq_file *m, void *v) 361 { 362 struct vfsmount *mnt = v; 363 int err = 0; 364 static struct proc_fs_info { 365 int flag; 366 char *str; 367 } fs_info[] = { 368 { MS_SYNCHRONOUS, ",sync" }, 369 { MS_DIRSYNC, ",dirsync" }, 370 { MS_MANDLOCK, ",mand" }, 371 { 0, NULL } 372 }; 373 static struct proc_fs_info mnt_info[] = { 374 { MNT_NOSUID, ",nosuid" }, 375 { MNT_NODEV, ",nodev" }, 376 { MNT_NOEXEC, ",noexec" }, 377 { MNT_NOATIME, ",noatime" }, 378 { MNT_NODIRATIME, ",nodiratime" }, 379 { 0, NULL } 380 }; 381 struct proc_fs_info *fs_infop; 382 383 mangle(m, mnt->mnt_devname ? mnt->mnt_devname : "none"); 384 seq_putc(m, ' '); 385 seq_path(m, mnt, mnt->mnt_root, " \t\n\\"); 386 seq_putc(m, ' '); 387 mangle(m, mnt->mnt_sb->s_type->name); 388 seq_puts(m, mnt->mnt_sb->s_flags & MS_RDONLY ? " ro" : " rw"); 389 for (fs_infop = fs_info; fs_infop->flag; fs_infop++) { 390 if (mnt->mnt_sb->s_flags & fs_infop->flag) 391 seq_puts(m, fs_infop->str); 392 } 393 for (fs_infop = mnt_info; fs_infop->flag; fs_infop++) { 394 if (mnt->mnt_flags & fs_infop->flag) 395 seq_puts(m, fs_infop->str); 396 } 397 if (mnt->mnt_sb->s_op->show_options) 398 err = mnt->mnt_sb->s_op->show_options(m, mnt); 399 seq_puts(m, " 0 0\n"); 400 return err; 401 } 402 403 struct seq_operations mounts_op = { 404 .start = m_start, 405 .next = m_next, 406 .stop = m_stop, 407 .show = show_vfsmnt 408 }; 409 410 static int show_vfsstat(struct seq_file *m, void *v) 411 { 412 struct vfsmount *mnt = v; 413 int err = 0; 414 415 /* device */ 416 if (mnt->mnt_devname) { 417 seq_puts(m, "device "); 418 mangle(m, mnt->mnt_devname); 419 } else 420 seq_puts(m, "no device"); 421 422 /* mount point */ 423 seq_puts(m, " mounted on "); 424 seq_path(m, mnt, mnt->mnt_root, " \t\n\\"); 425 seq_putc(m, ' '); 426 427 /* file system type */ 428 seq_puts(m, "with fstype "); 429 mangle(m, mnt->mnt_sb->s_type->name); 430 431 /* optional statistics */ 432 if (mnt->mnt_sb->s_op->show_stats) { 433 seq_putc(m, ' '); 434 err = mnt->mnt_sb->s_op->show_stats(m, mnt); 435 } 436 437 seq_putc(m, '\n'); 438 return err; 439 } 440 441 struct seq_operations mountstats_op = { 442 .start = m_start, 443 .next = m_next, 444 .stop = m_stop, 445 .show = show_vfsstat, 446 }; 447 448 /** 449 * may_umount_tree - check if a mount tree is busy 450 * @mnt: root of mount tree 451 * 452 * This is called to check if a tree of mounts has any 453 * open files, pwds, chroots or sub mounts that are 454 * busy. 455 */ 456 int may_umount_tree(struct vfsmount *mnt) 457 { 458 int actual_refs = 0; 459 int minimum_refs = 0; 460 struct vfsmount *p; 461 462 spin_lock(&vfsmount_lock); 463 for (p = mnt; p; p = next_mnt(p, mnt)) { 464 actual_refs += atomic_read(&p->mnt_count); 465 minimum_refs += 2; 466 } 467 spin_unlock(&vfsmount_lock); 468 469 if (actual_refs > minimum_refs) 470 return 0; 471 472 return 1; 473 } 474 475 EXPORT_SYMBOL(may_umount_tree); 476 477 /** 478 * may_umount - check if a mount point is busy 479 * @mnt: root of mount 480 * 481 * This is called to check if a mount point has any 482 * open files, pwds, chroots or sub mounts. If the 483 * mount has sub mounts this will return busy 484 * regardless of whether the sub mounts are busy. 485 * 486 * Doesn't take quota and stuff into account. IOW, in some cases it will 487 * give false negatives. The main reason why it's here is that we need 488 * a non-destructive way to look for easily umountable filesystems. 489 */ 490 int may_umount(struct vfsmount *mnt) 491 { 492 int ret = 1; 493 spin_lock(&vfsmount_lock); 494 if (propagate_mount_busy(mnt, 2)) 495 ret = 0; 496 spin_unlock(&vfsmount_lock); 497 return ret; 498 } 499 500 EXPORT_SYMBOL(may_umount); 501 502 void release_mounts(struct list_head *head) 503 { 504 struct vfsmount *mnt; 505 while (!list_empty(head)) { 506 mnt = list_entry(head->next, struct vfsmount, mnt_hash); 507 list_del_init(&mnt->mnt_hash); 508 if (mnt->mnt_parent != mnt) { 509 struct dentry *dentry; 510 struct vfsmount *m; 511 spin_lock(&vfsmount_lock); 512 dentry = mnt->mnt_mountpoint; 513 m = mnt->mnt_parent; 514 mnt->mnt_mountpoint = mnt->mnt_root; 515 mnt->mnt_parent = mnt; 516 spin_unlock(&vfsmount_lock); 517 dput(dentry); 518 mntput(m); 519 } 520 mntput(mnt); 521 } 522 } 523 524 void umount_tree(struct vfsmount *mnt, int propagate, struct list_head *kill) 525 { 526 struct vfsmount *p; 527 528 for (p = mnt; p; p = next_mnt(p, mnt)) 529 list_move(&p->mnt_hash, kill); 530 531 if (propagate) 532 propagate_umount(kill); 533 534 list_for_each_entry(p, kill, mnt_hash) { 535 list_del_init(&p->mnt_expire); 536 list_del_init(&p->mnt_list); 537 __touch_namespace(p->mnt_namespace); 538 p->mnt_namespace = NULL; 539 list_del_init(&p->mnt_child); 540 if (p->mnt_parent != p) 541 p->mnt_mountpoint->d_mounted--; 542 change_mnt_propagation(p, MS_PRIVATE); 543 } 544 } 545 546 static int do_umount(struct vfsmount *mnt, int flags) 547 { 548 struct super_block *sb = mnt->mnt_sb; 549 int retval; 550 LIST_HEAD(umount_list); 551 552 retval = security_sb_umount(mnt, flags); 553 if (retval) 554 return retval; 555 556 /* 557 * Allow userspace to request a mountpoint be expired rather than 558 * unmounting unconditionally. Unmount only happens if: 559 * (1) the mark is already set (the mark is cleared by mntput()) 560 * (2) the usage count == 1 [parent vfsmount] + 1 [sys_umount] 561 */ 562 if (flags & MNT_EXPIRE) { 563 if (mnt == current->fs->rootmnt || 564 flags & (MNT_FORCE | MNT_DETACH)) 565 return -EINVAL; 566 567 if (atomic_read(&mnt->mnt_count) != 2) 568 return -EBUSY; 569 570 if (!xchg(&mnt->mnt_expiry_mark, 1)) 571 return -EAGAIN; 572 } 573 574 /* 575 * If we may have to abort operations to get out of this 576 * mount, and they will themselves hold resources we must 577 * allow the fs to do things. In the Unix tradition of 578 * 'Gee thats tricky lets do it in userspace' the umount_begin 579 * might fail to complete on the first run through as other tasks 580 * must return, and the like. Thats for the mount program to worry 581 * about for the moment. 582 */ 583 584 lock_kernel(); 585 if (sb->s_op->umount_begin) 586 sb->s_op->umount_begin(mnt, flags); 587 unlock_kernel(); 588 589 /* 590 * No sense to grab the lock for this test, but test itself looks 591 * somewhat bogus. Suggestions for better replacement? 592 * Ho-hum... In principle, we might treat that as umount + switch 593 * to rootfs. GC would eventually take care of the old vfsmount. 594 * Actually it makes sense, especially if rootfs would contain a 595 * /reboot - static binary that would close all descriptors and 596 * call reboot(9). Then init(8) could umount root and exec /reboot. 597 */ 598 if (mnt == current->fs->rootmnt && !(flags & MNT_DETACH)) { 599 /* 600 * Special case for "unmounting" root ... 601 * we just try to remount it readonly. 602 */ 603 down_write(&sb->s_umount); 604 if (!(sb->s_flags & MS_RDONLY)) { 605 lock_kernel(); 606 DQUOT_OFF(sb); 607 retval = do_remount_sb(sb, MS_RDONLY, NULL, 0); 608 unlock_kernel(); 609 } 610 up_write(&sb->s_umount); 611 return retval; 612 } 613 614 down_write(&namespace_sem); 615 spin_lock(&vfsmount_lock); 616 event++; 617 618 retval = -EBUSY; 619 if (flags & MNT_DETACH || !propagate_mount_busy(mnt, 2)) { 620 if (!list_empty(&mnt->mnt_list)) 621 umount_tree(mnt, 1, &umount_list); 622 retval = 0; 623 } 624 spin_unlock(&vfsmount_lock); 625 if (retval) 626 security_sb_umount_busy(mnt); 627 up_write(&namespace_sem); 628 release_mounts(&umount_list); 629 return retval; 630 } 631 632 /* 633 * Now umount can handle mount points as well as block devices. 634 * This is important for filesystems which use unnamed block devices. 635 * 636 * We now support a flag for forced unmount like the other 'big iron' 637 * unixes. Our API is identical to OSF/1 to avoid making a mess of AMD 638 */ 639 640 asmlinkage long sys_umount(char __user * name, int flags) 641 { 642 struct nameidata nd; 643 int retval; 644 645 retval = __user_walk(name, LOOKUP_FOLLOW, &nd); 646 if (retval) 647 goto out; 648 retval = -EINVAL; 649 if (nd.dentry != nd.mnt->mnt_root) 650 goto dput_and_out; 651 if (!check_mnt(nd.mnt)) 652 goto dput_and_out; 653 654 retval = -EPERM; 655 if (!capable(CAP_SYS_ADMIN)) 656 goto dput_and_out; 657 658 retval = do_umount(nd.mnt, flags); 659 dput_and_out: 660 path_release_on_umount(&nd); 661 out: 662 return retval; 663 } 664 665 #ifdef __ARCH_WANT_SYS_OLDUMOUNT 666 667 /* 668 * The 2.0 compatible umount. No flags. 669 */ 670 asmlinkage long sys_oldumount(char __user * name) 671 { 672 return sys_umount(name, 0); 673 } 674 675 #endif 676 677 static int mount_is_safe(struct nameidata *nd) 678 { 679 if (capable(CAP_SYS_ADMIN)) 680 return 0; 681 return -EPERM; 682 #ifdef notyet 683 if (S_ISLNK(nd->dentry->d_inode->i_mode)) 684 return -EPERM; 685 if (nd->dentry->d_inode->i_mode & S_ISVTX) { 686 if (current->uid != nd->dentry->d_inode->i_uid) 687 return -EPERM; 688 } 689 if (vfs_permission(nd, MAY_WRITE)) 690 return -EPERM; 691 return 0; 692 #endif 693 } 694 695 static int lives_below_in_same_fs(struct dentry *d, struct dentry *dentry) 696 { 697 while (1) { 698 if (d == dentry) 699 return 1; 700 if (d == NULL || d == d->d_parent) 701 return 0; 702 d = d->d_parent; 703 } 704 } 705 706 struct vfsmount *copy_tree(struct vfsmount *mnt, struct dentry *dentry, 707 int flag) 708 { 709 struct vfsmount *res, *p, *q, *r, *s; 710 struct nameidata nd; 711 712 if (!(flag & CL_COPY_ALL) && IS_MNT_UNBINDABLE(mnt)) 713 return NULL; 714 715 res = q = clone_mnt(mnt, dentry, flag); 716 if (!q) 717 goto Enomem; 718 q->mnt_mountpoint = mnt->mnt_mountpoint; 719 720 p = mnt; 721 list_for_each_entry(r, &mnt->mnt_mounts, mnt_child) { 722 if (!lives_below_in_same_fs(r->mnt_mountpoint, dentry)) 723 continue; 724 725 for (s = r; s; s = next_mnt(s, r)) { 726 if (!(flag & CL_COPY_ALL) && IS_MNT_UNBINDABLE(s)) { 727 s = skip_mnt_tree(s); 728 continue; 729 } 730 while (p != s->mnt_parent) { 731 p = p->mnt_parent; 732 q = q->mnt_parent; 733 } 734 p = s; 735 nd.mnt = q; 736 nd.dentry = p->mnt_mountpoint; 737 q = clone_mnt(p, p->mnt_root, flag); 738 if (!q) 739 goto Enomem; 740 spin_lock(&vfsmount_lock); 741 list_add_tail(&q->mnt_list, &res->mnt_list); 742 attach_mnt(q, &nd); 743 spin_unlock(&vfsmount_lock); 744 } 745 } 746 return res; 747 Enomem: 748 if (res) { 749 LIST_HEAD(umount_list); 750 spin_lock(&vfsmount_lock); 751 umount_tree(res, 0, &umount_list); 752 spin_unlock(&vfsmount_lock); 753 release_mounts(&umount_list); 754 } 755 return NULL; 756 } 757 758 /* 759 * @source_mnt : mount tree to be attached 760 * @nd : place the mount tree @source_mnt is attached 761 * @parent_nd : if non-null, detach the source_mnt from its parent and 762 * store the parent mount and mountpoint dentry. 763 * (done when source_mnt is moved) 764 * 765 * NOTE: in the table below explains the semantics when a source mount 766 * of a given type is attached to a destination mount of a given type. 767 * --------------------------------------------------------------------------- 768 * | BIND MOUNT OPERATION | 769 * |************************************************************************** 770 * | source-->| shared | private | slave | unbindable | 771 * | dest | | | | | 772 * | | | | | | | 773 * | v | | | | | 774 * |************************************************************************** 775 * | shared | shared (++) | shared (+) | shared(+++)| invalid | 776 * | | | | | | 777 * |non-shared| shared (+) | private | slave (*) | invalid | 778 * *************************************************************************** 779 * A bind operation clones the source mount and mounts the clone on the 780 * destination mount. 781 * 782 * (++) the cloned mount is propagated to all the mounts in the propagation 783 * tree of the destination mount and the cloned mount is added to 784 * the peer group of the source mount. 785 * (+) the cloned mount is created under the destination mount and is marked 786 * as shared. The cloned mount is added to the peer group of the source 787 * mount. 788 * (+++) the mount is propagated to all the mounts in the propagation tree 789 * of the destination mount and the cloned mount is made slave 790 * of the same master as that of the source mount. The cloned mount 791 * is marked as 'shared and slave'. 792 * (*) the cloned mount is made a slave of the same master as that of the 793 * source mount. 794 * 795 * --------------------------------------------------------------------------- 796 * | MOVE MOUNT OPERATION | 797 * |************************************************************************** 798 * | source-->| shared | private | slave | unbindable | 799 * | dest | | | | | 800 * | | | | | | | 801 * | v | | | | | 802 * |************************************************************************** 803 * | shared | shared (+) | shared (+) | shared(+++) | invalid | 804 * | | | | | | 805 * |non-shared| shared (+*) | private | slave (*) | unbindable | 806 * *************************************************************************** 807 * 808 * (+) the mount is moved to the destination. And is then propagated to 809 * all the mounts in the propagation tree of the destination mount. 810 * (+*) the mount is moved to the destination. 811 * (+++) the mount is moved to the destination and is then propagated to 812 * all the mounts belonging to the destination mount's propagation tree. 813 * the mount is marked as 'shared and slave'. 814 * (*) the mount continues to be a slave at the new location. 815 * 816 * if the source mount is a tree, the operations explained above is 817 * applied to each mount in the tree. 818 * Must be called without spinlocks held, since this function can sleep 819 * in allocations. 820 */ 821 static int attach_recursive_mnt(struct vfsmount *source_mnt, 822 struct nameidata *nd, struct nameidata *parent_nd) 823 { 824 LIST_HEAD(tree_list); 825 struct vfsmount *dest_mnt = nd->mnt; 826 struct dentry *dest_dentry = nd->dentry; 827 struct vfsmount *child, *p; 828 829 if (propagate_mnt(dest_mnt, dest_dentry, source_mnt, &tree_list)) 830 return -EINVAL; 831 832 if (IS_MNT_SHARED(dest_mnt)) { 833 for (p = source_mnt; p; p = next_mnt(p, source_mnt)) 834 set_mnt_shared(p); 835 } 836 837 spin_lock(&vfsmount_lock); 838 if (parent_nd) { 839 detach_mnt(source_mnt, parent_nd); 840 attach_mnt(source_mnt, nd); 841 touch_namespace(current->namespace); 842 } else { 843 mnt_set_mountpoint(dest_mnt, dest_dentry, source_mnt); 844 commit_tree(source_mnt); 845 } 846 847 list_for_each_entry_safe(child, p, &tree_list, mnt_hash) { 848 list_del_init(&child->mnt_hash); 849 commit_tree(child); 850 } 851 spin_unlock(&vfsmount_lock); 852 return 0; 853 } 854 855 static int graft_tree(struct vfsmount *mnt, struct nameidata *nd) 856 { 857 int err; 858 if (mnt->mnt_sb->s_flags & MS_NOUSER) 859 return -EINVAL; 860 861 if (S_ISDIR(nd->dentry->d_inode->i_mode) != 862 S_ISDIR(mnt->mnt_root->d_inode->i_mode)) 863 return -ENOTDIR; 864 865 err = -ENOENT; 866 mutex_lock(&nd->dentry->d_inode->i_mutex); 867 if (IS_DEADDIR(nd->dentry->d_inode)) 868 goto out_unlock; 869 870 err = security_sb_check_sb(mnt, nd); 871 if (err) 872 goto out_unlock; 873 874 err = -ENOENT; 875 if (IS_ROOT(nd->dentry) || !d_unhashed(nd->dentry)) 876 err = attach_recursive_mnt(mnt, nd, NULL); 877 out_unlock: 878 mutex_unlock(&nd->dentry->d_inode->i_mutex); 879 if (!err) 880 security_sb_post_addmount(mnt, nd); 881 return err; 882 } 883 884 /* 885 * recursively change the type of the mountpoint. 886 */ 887 static int do_change_type(struct nameidata *nd, int flag) 888 { 889 struct vfsmount *m, *mnt = nd->mnt; 890 int recurse = flag & MS_REC; 891 int type = flag & ~MS_REC; 892 893 if (nd->dentry != nd->mnt->mnt_root) 894 return -EINVAL; 895 896 down_write(&namespace_sem); 897 spin_lock(&vfsmount_lock); 898 for (m = mnt; m; m = (recurse ? next_mnt(m, mnt) : NULL)) 899 change_mnt_propagation(m, type); 900 spin_unlock(&vfsmount_lock); 901 up_write(&namespace_sem); 902 return 0; 903 } 904 905 /* 906 * do loopback mount. 907 */ 908 static int do_loopback(struct nameidata *nd, char *old_name, int recurse) 909 { 910 struct nameidata old_nd; 911 struct vfsmount *mnt = NULL; 912 int err = mount_is_safe(nd); 913 if (err) 914 return err; 915 if (!old_name || !*old_name) 916 return -EINVAL; 917 err = path_lookup(old_name, LOOKUP_FOLLOW, &old_nd); 918 if (err) 919 return err; 920 921 down_write(&namespace_sem); 922 err = -EINVAL; 923 if (IS_MNT_UNBINDABLE(old_nd.mnt)) 924 goto out; 925 926 if (!check_mnt(nd->mnt) || !check_mnt(old_nd.mnt)) 927 goto out; 928 929 err = -ENOMEM; 930 if (recurse) 931 mnt = copy_tree(old_nd.mnt, old_nd.dentry, 0); 932 else 933 mnt = clone_mnt(old_nd.mnt, old_nd.dentry, 0); 934 935 if (!mnt) 936 goto out; 937 938 err = graft_tree(mnt, nd); 939 if (err) { 940 LIST_HEAD(umount_list); 941 spin_lock(&vfsmount_lock); 942 umount_tree(mnt, 0, &umount_list); 943 spin_unlock(&vfsmount_lock); 944 release_mounts(&umount_list); 945 } 946 947 out: 948 up_write(&namespace_sem); 949 path_release(&old_nd); 950 return err; 951 } 952 953 /* 954 * change filesystem flags. dir should be a physical root of filesystem. 955 * If you've mounted a non-root directory somewhere and want to do remount 956 * on it - tough luck. 957 */ 958 static int do_remount(struct nameidata *nd, int flags, int mnt_flags, 959 void *data) 960 { 961 int err; 962 struct super_block *sb = nd->mnt->mnt_sb; 963 964 if (!capable(CAP_SYS_ADMIN)) 965 return -EPERM; 966 967 if (!check_mnt(nd->mnt)) 968 return -EINVAL; 969 970 if (nd->dentry != nd->mnt->mnt_root) 971 return -EINVAL; 972 973 down_write(&sb->s_umount); 974 err = do_remount_sb(sb, flags, data, 0); 975 if (!err) 976 nd->mnt->mnt_flags = mnt_flags; 977 up_write(&sb->s_umount); 978 if (!err) 979 security_sb_post_remount(nd->mnt, flags, data); 980 return err; 981 } 982 983 static inline int tree_contains_unbindable(struct vfsmount *mnt) 984 { 985 struct vfsmount *p; 986 for (p = mnt; p; p = next_mnt(p, mnt)) { 987 if (IS_MNT_UNBINDABLE(p)) 988 return 1; 989 } 990 return 0; 991 } 992 993 static int do_move_mount(struct nameidata *nd, char *old_name) 994 { 995 struct nameidata old_nd, parent_nd; 996 struct vfsmount *p; 997 int err = 0; 998 if (!capable(CAP_SYS_ADMIN)) 999 return -EPERM; 1000 if (!old_name || !*old_name) 1001 return -EINVAL; 1002 err = path_lookup(old_name, LOOKUP_FOLLOW, &old_nd); 1003 if (err) 1004 return err; 1005 1006 down_write(&namespace_sem); 1007 while (d_mountpoint(nd->dentry) && follow_down(&nd->mnt, &nd->dentry)) 1008 ; 1009 err = -EINVAL; 1010 if (!check_mnt(nd->mnt) || !check_mnt(old_nd.mnt)) 1011 goto out; 1012 1013 err = -ENOENT; 1014 mutex_lock(&nd->dentry->d_inode->i_mutex); 1015 if (IS_DEADDIR(nd->dentry->d_inode)) 1016 goto out1; 1017 1018 if (!IS_ROOT(nd->dentry) && d_unhashed(nd->dentry)) 1019 goto out1; 1020 1021 err = -EINVAL; 1022 if (old_nd.dentry != old_nd.mnt->mnt_root) 1023 goto out1; 1024 1025 if (old_nd.mnt == old_nd.mnt->mnt_parent) 1026 goto out1; 1027 1028 if (S_ISDIR(nd->dentry->d_inode->i_mode) != 1029 S_ISDIR(old_nd.dentry->d_inode->i_mode)) 1030 goto out1; 1031 /* 1032 * Don't move a mount residing in a shared parent. 1033 */ 1034 if (old_nd.mnt->mnt_parent && IS_MNT_SHARED(old_nd.mnt->mnt_parent)) 1035 goto out1; 1036 /* 1037 * Don't move a mount tree containing unbindable mounts to a destination 1038 * mount which is shared. 1039 */ 1040 if (IS_MNT_SHARED(nd->mnt) && tree_contains_unbindable(old_nd.mnt)) 1041 goto out1; 1042 err = -ELOOP; 1043 for (p = nd->mnt; p->mnt_parent != p; p = p->mnt_parent) 1044 if (p == old_nd.mnt) 1045 goto out1; 1046 1047 if ((err = attach_recursive_mnt(old_nd.mnt, nd, &parent_nd))) 1048 goto out1; 1049 1050 spin_lock(&vfsmount_lock); 1051 /* if the mount is moved, it should no longer be expire 1052 * automatically */ 1053 list_del_init(&old_nd.mnt->mnt_expire); 1054 spin_unlock(&vfsmount_lock); 1055 out1: 1056 mutex_unlock(&nd->dentry->d_inode->i_mutex); 1057 out: 1058 up_write(&namespace_sem); 1059 if (!err) 1060 path_release(&parent_nd); 1061 path_release(&old_nd); 1062 return err; 1063 } 1064 1065 /* 1066 * create a new mount for userspace and request it to be added into the 1067 * namespace's tree 1068 */ 1069 static int do_new_mount(struct nameidata *nd, char *type, int flags, 1070 int mnt_flags, char *name, void *data) 1071 { 1072 struct vfsmount *mnt; 1073 1074 if (!type || !memchr(type, 0, PAGE_SIZE)) 1075 return -EINVAL; 1076 1077 /* we need capabilities... */ 1078 if (!capable(CAP_SYS_ADMIN)) 1079 return -EPERM; 1080 1081 mnt = do_kern_mount(type, flags, name, data); 1082 if (IS_ERR(mnt)) 1083 return PTR_ERR(mnt); 1084 1085 return do_add_mount(mnt, nd, mnt_flags, NULL); 1086 } 1087 1088 /* 1089 * add a mount into a namespace's mount tree 1090 * - provide the option of adding the new mount to an expiration list 1091 */ 1092 int do_add_mount(struct vfsmount *newmnt, struct nameidata *nd, 1093 int mnt_flags, struct list_head *fslist) 1094 { 1095 int err; 1096 1097 down_write(&namespace_sem); 1098 /* Something was mounted here while we slept */ 1099 while (d_mountpoint(nd->dentry) && follow_down(&nd->mnt, &nd->dentry)) 1100 ; 1101 err = -EINVAL; 1102 if (!check_mnt(nd->mnt)) 1103 goto unlock; 1104 1105 /* Refuse the same filesystem on the same mount point */ 1106 err = -EBUSY; 1107 if (nd->mnt->mnt_sb == newmnt->mnt_sb && 1108 nd->mnt->mnt_root == nd->dentry) 1109 goto unlock; 1110 1111 err = -EINVAL; 1112 if (S_ISLNK(newmnt->mnt_root->d_inode->i_mode)) 1113 goto unlock; 1114 1115 newmnt->mnt_flags = mnt_flags; 1116 if ((err = graft_tree(newmnt, nd))) 1117 goto unlock; 1118 1119 if (fslist) { 1120 /* add to the specified expiration list */ 1121 spin_lock(&vfsmount_lock); 1122 list_add_tail(&newmnt->mnt_expire, fslist); 1123 spin_unlock(&vfsmount_lock); 1124 } 1125 up_write(&namespace_sem); 1126 return 0; 1127 1128 unlock: 1129 up_write(&namespace_sem); 1130 mntput(newmnt); 1131 return err; 1132 } 1133 1134 EXPORT_SYMBOL_GPL(do_add_mount); 1135 1136 static void expire_mount(struct vfsmount *mnt, struct list_head *mounts, 1137 struct list_head *umounts) 1138 { 1139 spin_lock(&vfsmount_lock); 1140 1141 /* 1142 * Check if mount is still attached, if not, let whoever holds it deal 1143 * with the sucker 1144 */ 1145 if (mnt->mnt_parent == mnt) { 1146 spin_unlock(&vfsmount_lock); 1147 return; 1148 } 1149 1150 /* 1151 * Check that it is still dead: the count should now be 2 - as 1152 * contributed by the vfsmount parent and the mntget above 1153 */ 1154 if (!propagate_mount_busy(mnt, 2)) { 1155 /* delete from the namespace */ 1156 touch_namespace(mnt->mnt_namespace); 1157 list_del_init(&mnt->mnt_list); 1158 mnt->mnt_namespace = NULL; 1159 umount_tree(mnt, 1, umounts); 1160 spin_unlock(&vfsmount_lock); 1161 } else { 1162 /* 1163 * Someone brought it back to life whilst we didn't have any 1164 * locks held so return it to the expiration list 1165 */ 1166 list_add_tail(&mnt->mnt_expire, mounts); 1167 spin_unlock(&vfsmount_lock); 1168 } 1169 } 1170 1171 /* 1172 * go through the vfsmounts we've just consigned to the graveyard to 1173 * - check that they're still dead 1174 * - delete the vfsmount from the appropriate namespace under lock 1175 * - dispose of the corpse 1176 */ 1177 static void expire_mount_list(struct list_head *graveyard, struct list_head *mounts) 1178 { 1179 struct namespace *namespace; 1180 struct vfsmount *mnt; 1181 1182 while (!list_empty(graveyard)) { 1183 LIST_HEAD(umounts); 1184 mnt = list_entry(graveyard->next, struct vfsmount, mnt_expire); 1185 list_del_init(&mnt->mnt_expire); 1186 1187 /* don't do anything if the namespace is dead - all the 1188 * vfsmounts from it are going away anyway */ 1189 namespace = mnt->mnt_namespace; 1190 if (!namespace || !namespace->root) 1191 continue; 1192 get_namespace(namespace); 1193 1194 spin_unlock(&vfsmount_lock); 1195 down_write(&namespace_sem); 1196 expire_mount(mnt, mounts, &umounts); 1197 up_write(&namespace_sem); 1198 release_mounts(&umounts); 1199 mntput(mnt); 1200 put_namespace(namespace); 1201 spin_lock(&vfsmount_lock); 1202 } 1203 } 1204 1205 /* 1206 * process a list of expirable mountpoints with the intent of discarding any 1207 * mountpoints that aren't in use and haven't been touched since last we came 1208 * here 1209 */ 1210 void mark_mounts_for_expiry(struct list_head *mounts) 1211 { 1212 struct vfsmount *mnt, *next; 1213 LIST_HEAD(graveyard); 1214 1215 if (list_empty(mounts)) 1216 return; 1217 1218 spin_lock(&vfsmount_lock); 1219 1220 /* extract from the expiration list every vfsmount that matches the 1221 * following criteria: 1222 * - only referenced by its parent vfsmount 1223 * - still marked for expiry (marked on the last call here; marks are 1224 * cleared by mntput()) 1225 */ 1226 list_for_each_entry_safe(mnt, next, mounts, mnt_expire) { 1227 if (!xchg(&mnt->mnt_expiry_mark, 1) || 1228 atomic_read(&mnt->mnt_count) != 1) 1229 continue; 1230 1231 mntget(mnt); 1232 list_move(&mnt->mnt_expire, &graveyard); 1233 } 1234 1235 expire_mount_list(&graveyard, mounts); 1236 1237 spin_unlock(&vfsmount_lock); 1238 } 1239 1240 EXPORT_SYMBOL_GPL(mark_mounts_for_expiry); 1241 1242 /* 1243 * Ripoff of 'select_parent()' 1244 * 1245 * search the list of submounts for a given mountpoint, and move any 1246 * shrinkable submounts to the 'graveyard' list. 1247 */ 1248 static int select_submounts(struct vfsmount *parent, struct list_head *graveyard) 1249 { 1250 struct vfsmount *this_parent = parent; 1251 struct list_head *next; 1252 int found = 0; 1253 1254 repeat: 1255 next = this_parent->mnt_mounts.next; 1256 resume: 1257 while (next != &this_parent->mnt_mounts) { 1258 struct list_head *tmp = next; 1259 struct vfsmount *mnt = list_entry(tmp, struct vfsmount, mnt_child); 1260 1261 next = tmp->next; 1262 if (!(mnt->mnt_flags & MNT_SHRINKABLE)) 1263 continue; 1264 /* 1265 * Descend a level if the d_mounts list is non-empty. 1266 */ 1267 if (!list_empty(&mnt->mnt_mounts)) { 1268 this_parent = mnt; 1269 goto repeat; 1270 } 1271 1272 if (!propagate_mount_busy(mnt, 1)) { 1273 mntget(mnt); 1274 list_move_tail(&mnt->mnt_expire, graveyard); 1275 found++; 1276 } 1277 } 1278 /* 1279 * All done at this level ... ascend and resume the search 1280 */ 1281 if (this_parent != parent) { 1282 next = this_parent->mnt_child.next; 1283 this_parent = this_parent->mnt_parent; 1284 goto resume; 1285 } 1286 return found; 1287 } 1288 1289 /* 1290 * process a list of expirable mountpoints with the intent of discarding any 1291 * submounts of a specific parent mountpoint 1292 */ 1293 void shrink_submounts(struct vfsmount *mountpoint, struct list_head *mounts) 1294 { 1295 LIST_HEAD(graveyard); 1296 int found; 1297 1298 spin_lock(&vfsmount_lock); 1299 1300 /* extract submounts of 'mountpoint' from the expiration list */ 1301 while ((found = select_submounts(mountpoint, &graveyard)) != 0) 1302 expire_mount_list(&graveyard, mounts); 1303 1304 spin_unlock(&vfsmount_lock); 1305 } 1306 1307 EXPORT_SYMBOL_GPL(shrink_submounts); 1308 1309 /* 1310 * Some copy_from_user() implementations do not return the exact number of 1311 * bytes remaining to copy on a fault. But copy_mount_options() requires that. 1312 * Note that this function differs from copy_from_user() in that it will oops 1313 * on bad values of `to', rather than returning a short copy. 1314 */ 1315 static long exact_copy_from_user(void *to, const void __user * from, 1316 unsigned long n) 1317 { 1318 char *t = to; 1319 const char __user *f = from; 1320 char c; 1321 1322 if (!access_ok(VERIFY_READ, from, n)) 1323 return n; 1324 1325 while (n) { 1326 if (__get_user(c, f)) { 1327 memset(t, 0, n); 1328 break; 1329 } 1330 *t++ = c; 1331 f++; 1332 n--; 1333 } 1334 return n; 1335 } 1336 1337 int copy_mount_options(const void __user * data, unsigned long *where) 1338 { 1339 int i; 1340 unsigned long page; 1341 unsigned long size; 1342 1343 *where = 0; 1344 if (!data) 1345 return 0; 1346 1347 if (!(page = __get_free_page(GFP_KERNEL))) 1348 return -ENOMEM; 1349 1350 /* We only care that *some* data at the address the user 1351 * gave us is valid. Just in case, we'll zero 1352 * the remainder of the page. 1353 */ 1354 /* copy_from_user cannot cross TASK_SIZE ! */ 1355 size = TASK_SIZE - (unsigned long)data; 1356 if (size > PAGE_SIZE) 1357 size = PAGE_SIZE; 1358 1359 i = size - exact_copy_from_user((void *)page, data, size); 1360 if (!i) { 1361 free_page(page); 1362 return -EFAULT; 1363 } 1364 if (i != PAGE_SIZE) 1365 memset((char *)page + i, 0, PAGE_SIZE - i); 1366 *where = page; 1367 return 0; 1368 } 1369 1370 /* 1371 * Flags is a 32-bit value that allows up to 31 non-fs dependent flags to 1372 * be given to the mount() call (ie: read-only, no-dev, no-suid etc). 1373 * 1374 * data is a (void *) that can point to any structure up to 1375 * PAGE_SIZE-1 bytes, which can contain arbitrary fs-dependent 1376 * information (or be NULL). 1377 * 1378 * Pre-0.97 versions of mount() didn't have a flags word. 1379 * When the flags word was introduced its top half was required 1380 * to have the magic value 0xC0ED, and this remained so until 2.4.0-test9. 1381 * Therefore, if this magic number is present, it carries no information 1382 * and must be discarded. 1383 */ 1384 long do_mount(char *dev_name, char *dir_name, char *type_page, 1385 unsigned long flags, void *data_page) 1386 { 1387 struct nameidata nd; 1388 int retval = 0; 1389 int mnt_flags = 0; 1390 1391 /* Discard magic */ 1392 if ((flags & MS_MGC_MSK) == MS_MGC_VAL) 1393 flags &= ~MS_MGC_MSK; 1394 1395 /* Basic sanity checks */ 1396 1397 if (!dir_name || !*dir_name || !memchr(dir_name, 0, PAGE_SIZE)) 1398 return -EINVAL; 1399 if (dev_name && !memchr(dev_name, 0, PAGE_SIZE)) 1400 return -EINVAL; 1401 1402 if (data_page) 1403 ((char *)data_page)[PAGE_SIZE - 1] = 0; 1404 1405 /* Separate the per-mountpoint flags */ 1406 if (flags & MS_NOSUID) 1407 mnt_flags |= MNT_NOSUID; 1408 if (flags & MS_NODEV) 1409 mnt_flags |= MNT_NODEV; 1410 if (flags & MS_NOEXEC) 1411 mnt_flags |= MNT_NOEXEC; 1412 if (flags & MS_NOATIME) 1413 mnt_flags |= MNT_NOATIME; 1414 if (flags & MS_NODIRATIME) 1415 mnt_flags |= MNT_NODIRATIME; 1416 1417 flags &= ~(MS_NOSUID | MS_NOEXEC | MS_NODEV | MS_ACTIVE | 1418 MS_NOATIME | MS_NODIRATIME); 1419 1420 /* ... and get the mountpoint */ 1421 retval = path_lookup(dir_name, LOOKUP_FOLLOW, &nd); 1422 if (retval) 1423 return retval; 1424 1425 retval = security_sb_mount(dev_name, &nd, type_page, flags, data_page); 1426 if (retval) 1427 goto dput_out; 1428 1429 if (flags & MS_REMOUNT) 1430 retval = do_remount(&nd, flags & ~MS_REMOUNT, mnt_flags, 1431 data_page); 1432 else if (flags & MS_BIND) 1433 retval = do_loopback(&nd, dev_name, flags & MS_REC); 1434 else if (flags & (MS_SHARED | MS_PRIVATE | MS_SLAVE | MS_UNBINDABLE)) 1435 retval = do_change_type(&nd, flags); 1436 else if (flags & MS_MOVE) 1437 retval = do_move_mount(&nd, dev_name); 1438 else 1439 retval = do_new_mount(&nd, type_page, flags, mnt_flags, 1440 dev_name, data_page); 1441 dput_out: 1442 path_release(&nd); 1443 return retval; 1444 } 1445 1446 /* 1447 * Allocate a new namespace structure and populate it with contents 1448 * copied from the namespace of the passed in task structure. 1449 */ 1450 struct namespace *dup_namespace(struct task_struct *tsk, struct fs_struct *fs) 1451 { 1452 struct namespace *namespace = tsk->namespace; 1453 struct namespace *new_ns; 1454 struct vfsmount *rootmnt = NULL, *pwdmnt = NULL, *altrootmnt = NULL; 1455 struct vfsmount *p, *q; 1456 1457 new_ns = kmalloc(sizeof(struct namespace), GFP_KERNEL); 1458 if (!new_ns) 1459 return NULL; 1460 1461 atomic_set(&new_ns->count, 1); 1462 INIT_LIST_HEAD(&new_ns->list); 1463 init_waitqueue_head(&new_ns->poll); 1464 new_ns->event = 0; 1465 1466 down_write(&namespace_sem); 1467 /* First pass: copy the tree topology */ 1468 new_ns->root = copy_tree(namespace->root, namespace->root->mnt_root, 1469 CL_COPY_ALL | CL_EXPIRE); 1470 if (!new_ns->root) { 1471 up_write(&namespace_sem); 1472 kfree(new_ns); 1473 return NULL; 1474 } 1475 spin_lock(&vfsmount_lock); 1476 list_add_tail(&new_ns->list, &new_ns->root->mnt_list); 1477 spin_unlock(&vfsmount_lock); 1478 1479 /* 1480 * Second pass: switch the tsk->fs->* elements and mark new vfsmounts 1481 * as belonging to new namespace. We have already acquired a private 1482 * fs_struct, so tsk->fs->lock is not needed. 1483 */ 1484 p = namespace->root; 1485 q = new_ns->root; 1486 while (p) { 1487 q->mnt_namespace = new_ns; 1488 if (fs) { 1489 if (p == fs->rootmnt) { 1490 rootmnt = p; 1491 fs->rootmnt = mntget(q); 1492 } 1493 if (p == fs->pwdmnt) { 1494 pwdmnt = p; 1495 fs->pwdmnt = mntget(q); 1496 } 1497 if (p == fs->altrootmnt) { 1498 altrootmnt = p; 1499 fs->altrootmnt = mntget(q); 1500 } 1501 } 1502 p = next_mnt(p, namespace->root); 1503 q = next_mnt(q, new_ns->root); 1504 } 1505 up_write(&namespace_sem); 1506 1507 if (rootmnt) 1508 mntput(rootmnt); 1509 if (pwdmnt) 1510 mntput(pwdmnt); 1511 if (altrootmnt) 1512 mntput(altrootmnt); 1513 1514 return new_ns; 1515 } 1516 1517 int copy_namespace(int flags, struct task_struct *tsk) 1518 { 1519 struct namespace *namespace = tsk->namespace; 1520 struct namespace *new_ns; 1521 int err = 0; 1522 1523 if (!namespace) 1524 return 0; 1525 1526 get_namespace(namespace); 1527 1528 if (!(flags & CLONE_NEWNS)) 1529 return 0; 1530 1531 if (!capable(CAP_SYS_ADMIN)) { 1532 err = -EPERM; 1533 goto out; 1534 } 1535 1536 new_ns = dup_namespace(tsk, tsk->fs); 1537 if (!new_ns) { 1538 err = -ENOMEM; 1539 goto out; 1540 } 1541 1542 tsk->namespace = new_ns; 1543 1544 out: 1545 put_namespace(namespace); 1546 return err; 1547 } 1548 1549 asmlinkage long sys_mount(char __user * dev_name, char __user * dir_name, 1550 char __user * type, unsigned long flags, 1551 void __user * data) 1552 { 1553 int retval; 1554 unsigned long data_page; 1555 unsigned long type_page; 1556 unsigned long dev_page; 1557 char *dir_page; 1558 1559 retval = copy_mount_options(type, &type_page); 1560 if (retval < 0) 1561 return retval; 1562 1563 dir_page = getname(dir_name); 1564 retval = PTR_ERR(dir_page); 1565 if (IS_ERR(dir_page)) 1566 goto out1; 1567 1568 retval = copy_mount_options(dev_name, &dev_page); 1569 if (retval < 0) 1570 goto out2; 1571 1572 retval = copy_mount_options(data, &data_page); 1573 if (retval < 0) 1574 goto out3; 1575 1576 lock_kernel(); 1577 retval = do_mount((char *)dev_page, dir_page, (char *)type_page, 1578 flags, (void *)data_page); 1579 unlock_kernel(); 1580 free_page(data_page); 1581 1582 out3: 1583 free_page(dev_page); 1584 out2: 1585 putname(dir_page); 1586 out1: 1587 free_page(type_page); 1588 return retval; 1589 } 1590 1591 /* 1592 * Replace the fs->{rootmnt,root} with {mnt,dentry}. Put the old values. 1593 * It can block. Requires the big lock held. 1594 */ 1595 void set_fs_root(struct fs_struct *fs, struct vfsmount *mnt, 1596 struct dentry *dentry) 1597 { 1598 struct dentry *old_root; 1599 struct vfsmount *old_rootmnt; 1600 write_lock(&fs->lock); 1601 old_root = fs->root; 1602 old_rootmnt = fs->rootmnt; 1603 fs->rootmnt = mntget(mnt); 1604 fs->root = dget(dentry); 1605 write_unlock(&fs->lock); 1606 if (old_root) { 1607 dput(old_root); 1608 mntput(old_rootmnt); 1609 } 1610 } 1611 1612 /* 1613 * Replace the fs->{pwdmnt,pwd} with {mnt,dentry}. Put the old values. 1614 * It can block. Requires the big lock held. 1615 */ 1616 void set_fs_pwd(struct fs_struct *fs, struct vfsmount *mnt, 1617 struct dentry *dentry) 1618 { 1619 struct dentry *old_pwd; 1620 struct vfsmount *old_pwdmnt; 1621 1622 write_lock(&fs->lock); 1623 old_pwd = fs->pwd; 1624 old_pwdmnt = fs->pwdmnt; 1625 fs->pwdmnt = mntget(mnt); 1626 fs->pwd = dget(dentry); 1627 write_unlock(&fs->lock); 1628 1629 if (old_pwd) { 1630 dput(old_pwd); 1631 mntput(old_pwdmnt); 1632 } 1633 } 1634 1635 static void chroot_fs_refs(struct nameidata *old_nd, struct nameidata *new_nd) 1636 { 1637 struct task_struct *g, *p; 1638 struct fs_struct *fs; 1639 1640 read_lock(&tasklist_lock); 1641 do_each_thread(g, p) { 1642 task_lock(p); 1643 fs = p->fs; 1644 if (fs) { 1645 atomic_inc(&fs->count); 1646 task_unlock(p); 1647 if (fs->root == old_nd->dentry 1648 && fs->rootmnt == old_nd->mnt) 1649 set_fs_root(fs, new_nd->mnt, new_nd->dentry); 1650 if (fs->pwd == old_nd->dentry 1651 && fs->pwdmnt == old_nd->mnt) 1652 set_fs_pwd(fs, new_nd->mnt, new_nd->dentry); 1653 put_fs_struct(fs); 1654 } else 1655 task_unlock(p); 1656 } while_each_thread(g, p); 1657 read_unlock(&tasklist_lock); 1658 } 1659 1660 /* 1661 * pivot_root Semantics: 1662 * Moves the root file system of the current process to the directory put_old, 1663 * makes new_root as the new root file system of the current process, and sets 1664 * root/cwd of all processes which had them on the current root to new_root. 1665 * 1666 * Restrictions: 1667 * The new_root and put_old must be directories, and must not be on the 1668 * same file system as the current process root. The put_old must be 1669 * underneath new_root, i.e. adding a non-zero number of /.. to the string 1670 * pointed to by put_old must yield the same directory as new_root. No other 1671 * file system may be mounted on put_old. After all, new_root is a mountpoint. 1672 * 1673 * Also, the current root cannot be on the 'rootfs' (initial ramfs) filesystem. 1674 * See Documentation/filesystems/ramfs-rootfs-initramfs.txt for alternatives 1675 * in this situation. 1676 * 1677 * Notes: 1678 * - we don't move root/cwd if they are not at the root (reason: if something 1679 * cared enough to change them, it's probably wrong to force them elsewhere) 1680 * - it's okay to pick a root that isn't the root of a file system, e.g. 1681 * /nfs/my_root where /nfs is the mount point. It must be a mountpoint, 1682 * though, so you may need to say mount --bind /nfs/my_root /nfs/my_root 1683 * first. 1684 */ 1685 asmlinkage long sys_pivot_root(const char __user * new_root, 1686 const char __user * put_old) 1687 { 1688 struct vfsmount *tmp; 1689 struct nameidata new_nd, old_nd, parent_nd, root_parent, user_nd; 1690 int error; 1691 1692 if (!capable(CAP_SYS_ADMIN)) 1693 return -EPERM; 1694 1695 lock_kernel(); 1696 1697 error = __user_walk(new_root, LOOKUP_FOLLOW | LOOKUP_DIRECTORY, 1698 &new_nd); 1699 if (error) 1700 goto out0; 1701 error = -EINVAL; 1702 if (!check_mnt(new_nd.mnt)) 1703 goto out1; 1704 1705 error = __user_walk(put_old, LOOKUP_FOLLOW | LOOKUP_DIRECTORY, &old_nd); 1706 if (error) 1707 goto out1; 1708 1709 error = security_sb_pivotroot(&old_nd, &new_nd); 1710 if (error) { 1711 path_release(&old_nd); 1712 goto out1; 1713 } 1714 1715 read_lock(¤t->fs->lock); 1716 user_nd.mnt = mntget(current->fs->rootmnt); 1717 user_nd.dentry = dget(current->fs->root); 1718 read_unlock(¤t->fs->lock); 1719 down_write(&namespace_sem); 1720 mutex_lock(&old_nd.dentry->d_inode->i_mutex); 1721 error = -EINVAL; 1722 if (IS_MNT_SHARED(old_nd.mnt) || 1723 IS_MNT_SHARED(new_nd.mnt->mnt_parent) || 1724 IS_MNT_SHARED(user_nd.mnt->mnt_parent)) 1725 goto out2; 1726 if (!check_mnt(user_nd.mnt)) 1727 goto out2; 1728 error = -ENOENT; 1729 if (IS_DEADDIR(new_nd.dentry->d_inode)) 1730 goto out2; 1731 if (d_unhashed(new_nd.dentry) && !IS_ROOT(new_nd.dentry)) 1732 goto out2; 1733 if (d_unhashed(old_nd.dentry) && !IS_ROOT(old_nd.dentry)) 1734 goto out2; 1735 error = -EBUSY; 1736 if (new_nd.mnt == user_nd.mnt || old_nd.mnt == user_nd.mnt) 1737 goto out2; /* loop, on the same file system */ 1738 error = -EINVAL; 1739 if (user_nd.mnt->mnt_root != user_nd.dentry) 1740 goto out2; /* not a mountpoint */ 1741 if (user_nd.mnt->mnt_parent == user_nd.mnt) 1742 goto out2; /* not attached */ 1743 if (new_nd.mnt->mnt_root != new_nd.dentry) 1744 goto out2; /* not a mountpoint */ 1745 if (new_nd.mnt->mnt_parent == new_nd.mnt) 1746 goto out2; /* not attached */ 1747 tmp = old_nd.mnt; /* make sure we can reach put_old from new_root */ 1748 spin_lock(&vfsmount_lock); 1749 if (tmp != new_nd.mnt) { 1750 for (;;) { 1751 if (tmp->mnt_parent == tmp) 1752 goto out3; /* already mounted on put_old */ 1753 if (tmp->mnt_parent == new_nd.mnt) 1754 break; 1755 tmp = tmp->mnt_parent; 1756 } 1757 if (!is_subdir(tmp->mnt_mountpoint, new_nd.dentry)) 1758 goto out3; 1759 } else if (!is_subdir(old_nd.dentry, new_nd.dentry)) 1760 goto out3; 1761 detach_mnt(new_nd.mnt, &parent_nd); 1762 detach_mnt(user_nd.mnt, &root_parent); 1763 attach_mnt(user_nd.mnt, &old_nd); /* mount old root on put_old */ 1764 attach_mnt(new_nd.mnt, &root_parent); /* mount new_root on / */ 1765 touch_namespace(current->namespace); 1766 spin_unlock(&vfsmount_lock); 1767 chroot_fs_refs(&user_nd, &new_nd); 1768 security_sb_post_pivotroot(&user_nd, &new_nd); 1769 error = 0; 1770 path_release(&root_parent); 1771 path_release(&parent_nd); 1772 out2: 1773 mutex_unlock(&old_nd.dentry->d_inode->i_mutex); 1774 up_write(&namespace_sem); 1775 path_release(&user_nd); 1776 path_release(&old_nd); 1777 out1: 1778 path_release(&new_nd); 1779 out0: 1780 unlock_kernel(); 1781 return error; 1782 out3: 1783 spin_unlock(&vfsmount_lock); 1784 goto out2; 1785 } 1786 1787 static void __init init_mount_tree(void) 1788 { 1789 struct vfsmount *mnt; 1790 struct namespace *namespace; 1791 struct task_struct *g, *p; 1792 1793 mnt = do_kern_mount("rootfs", 0, "rootfs", NULL); 1794 if (IS_ERR(mnt)) 1795 panic("Can't create rootfs"); 1796 namespace = kmalloc(sizeof(*namespace), GFP_KERNEL); 1797 if (!namespace) 1798 panic("Can't allocate initial namespace"); 1799 atomic_set(&namespace->count, 1); 1800 INIT_LIST_HEAD(&namespace->list); 1801 init_waitqueue_head(&namespace->poll); 1802 namespace->event = 0; 1803 list_add(&mnt->mnt_list, &namespace->list); 1804 namespace->root = mnt; 1805 mnt->mnt_namespace = namespace; 1806 1807 init_task.namespace = namespace; 1808 read_lock(&tasklist_lock); 1809 do_each_thread(g, p) { 1810 get_namespace(namespace); 1811 p->namespace = namespace; 1812 } while_each_thread(g, p); 1813 read_unlock(&tasklist_lock); 1814 1815 set_fs_pwd(current->fs, namespace->root, namespace->root->mnt_root); 1816 set_fs_root(current->fs, namespace->root, namespace->root->mnt_root); 1817 } 1818 1819 void __init mnt_init(unsigned long mempages) 1820 { 1821 struct list_head *d; 1822 unsigned int nr_hash; 1823 int i; 1824 1825 init_rwsem(&namespace_sem); 1826 1827 mnt_cache = kmem_cache_create("mnt_cache", sizeof(struct vfsmount), 1828 0, SLAB_HWCACHE_ALIGN | SLAB_PANIC, NULL, NULL); 1829 1830 mount_hashtable = (struct list_head *)__get_free_page(GFP_ATOMIC); 1831 1832 if (!mount_hashtable) 1833 panic("Failed to allocate mount hash table\n"); 1834 1835 /* 1836 * Find the power-of-two list-heads that can fit into the allocation.. 1837 * We don't guarantee that "sizeof(struct list_head)" is necessarily 1838 * a power-of-two. 1839 */ 1840 nr_hash = PAGE_SIZE / sizeof(struct list_head); 1841 hash_bits = 0; 1842 do { 1843 hash_bits++; 1844 } while ((nr_hash >> hash_bits) != 0); 1845 hash_bits--; 1846 1847 /* 1848 * Re-calculate the actual number of entries and the mask 1849 * from the number of bits we can fit. 1850 */ 1851 nr_hash = 1UL << hash_bits; 1852 hash_mask = nr_hash - 1; 1853 1854 printk("Mount-cache hash table entries: %d\n", nr_hash); 1855 1856 /* And initialize the newly allocated array */ 1857 d = mount_hashtable; 1858 i = nr_hash; 1859 do { 1860 INIT_LIST_HEAD(d); 1861 d++; 1862 i--; 1863 } while (i); 1864 sysfs_init(); 1865 subsystem_register(&fs_subsys); 1866 init_rootfs(); 1867 init_mount_tree(); 1868 } 1869 1870 void __put_namespace(struct namespace *namespace) 1871 { 1872 struct vfsmount *root = namespace->root; 1873 LIST_HEAD(umount_list); 1874 namespace->root = NULL; 1875 spin_unlock(&vfsmount_lock); 1876 down_write(&namespace_sem); 1877 spin_lock(&vfsmount_lock); 1878 umount_tree(root, 0, &umount_list); 1879 spin_unlock(&vfsmount_lock); 1880 up_write(&namespace_sem); 1881 release_mounts(&umount_list); 1882 kfree(namespace); 1883 } 1884