/*
 *  linux/fs/namespace.c
 *
 * (C) Copyright Al Viro 2000, 2001
 *	Released under GPL v2.
 *
 * Based on code from fs/super.c, copyright Linus Torvalds and others.
 * Heavily rewritten.
 */

#include <linux/syscalls.h>
#include <linux/slab.h>
#include <linux/sched.h>
#include <linux/smp_lock.h>
#include <linux/init.h>
#include <linux/kernel.h>
#include <linux/quotaops.h>
#include <linux/acct.h>
#include <linux/capability.h>
#include <linux/module.h>
#include <linux/sysfs.h>
#include <linux/seq_file.h>
#include <linux/mnt_namespace.h>
#include <linux/namei.h>
#include <linux/security.h>
#include <linux/mount.h>
#include <linux/ramfs.h>
#include <linux/log2.h>
#include <asm/uaccess.h>
#include <asm/unistd.h>
#include "pnode.h"
#include "internal.h"

#define HASH_SHIFT ilog2(PAGE_SIZE / sizeof(struct list_head))
#define HASH_SIZE (1UL << HASH_SHIFT)

/* spinlock for vfsmount related operations, in place of dcache_lock */
__cacheline_aligned_in_smp DEFINE_SPINLOCK(vfsmount_lock);

static int event;

static struct list_head *mount_hashtable __read_mostly;
static struct kmem_cache *mnt_cache __read_mostly;
static struct rw_semaphore namespace_sem;

/* /sys/fs */
struct kobject *fs_kobj;
EXPORT_SYMBOL_GPL(fs_kobj);

static inline unsigned long hash(struct vfsmount *mnt, struct dentry *dentry)
{
	unsigned long tmp = ((unsigned long)mnt / L1_CACHE_BYTES);
	tmp += ((unsigned long)dentry / L1_CACHE_BYTES);
	tmp = tmp + (tmp >> HASH_SHIFT);
	return tmp & (HASH_SIZE - 1);
}

struct vfsmount *alloc_vfsmnt(const char *name)
{
	struct vfsmount *mnt = kmem_cache_zalloc(mnt_cache, GFP_KERNEL);
	if (mnt) {
		atomic_set(&mnt->mnt_count, 1);
		INIT_LIST_HEAD(&mnt->mnt_hash);
		INIT_LIST_HEAD(&mnt->mnt_child);
		INIT_LIST_HEAD(&mnt->mnt_mounts);
		INIT_LIST_HEAD(&mnt->mnt_list);
		INIT_LIST_HEAD(&mnt->mnt_expire);
		INIT_LIST_HEAD(&mnt->mnt_share);
		INIT_LIST_HEAD(&mnt->mnt_slave_list);
		INIT_LIST_HEAD(&mnt->mnt_slave);
		if (name) {
			int size = strlen(name) + 1;
			char *newname = kmalloc(size, GFP_KERNEL);
			if (newname) {
				memcpy(newname, name, size);
				mnt->mnt_devname = newname;
			}
		}
	}
	return mnt;
}

int simple_set_mnt(struct vfsmount *mnt, struct super_block *sb)
{
	mnt->mnt_sb = sb;
	mnt->mnt_root = dget(sb->s_root);
	return 0;
}

EXPORT_SYMBOL(simple_set_mnt);

void free_vfsmnt(struct vfsmount *mnt)
{
	kfree(mnt->mnt_devname);
	kmem_cache_free(mnt_cache, mnt);
}
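/*
 * Illustrative sketch (not part of the original file): a filesystem's
 * get_sb callback normally reaches simple_set_mnt() through one of the
 * get_sb_*() helpers, which fill in mnt->mnt_sb and mnt->mnt_root on its
 * behalf. example_get_sb() and example_fill_super() are hypothetical names:
 *
 *	static int example_get_sb(struct file_system_type *fs_type, int flags,
 *				  const char *dev_name, void *data,
 *				  struct vfsmount *mnt)
 *	{
 *		return get_sb_nodev(fs_type, flags, data,
 *				    example_fill_super, mnt);
 *	}
 */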
/*
 * find the first or last mount at @dentry on vfsmount @mnt depending on
 * @dir. If @dir is set return the first mount else return the last mount.
 */
struct vfsmount *__lookup_mnt(struct vfsmount *mnt, struct dentry *dentry,
			      int dir)
{
	struct list_head *head = mount_hashtable + hash(mnt, dentry);
	struct list_head *tmp = head;
	struct vfsmount *p, *found = NULL;

	for (;;) {
		tmp = dir ? tmp->next : tmp->prev;
		p = NULL;
		if (tmp == head)
			break;
		p = list_entry(tmp, struct vfsmount, mnt_hash);
		if (p->mnt_parent == mnt && p->mnt_mountpoint == dentry) {
			found = p;
			break;
		}
	}
	return found;
}

/*
 * lookup_mnt increments the ref count before returning
 * the vfsmount struct.
 */
struct vfsmount *lookup_mnt(struct vfsmount *mnt, struct dentry *dentry)
{
	struct vfsmount *child_mnt;
	spin_lock(&vfsmount_lock);
	if ((child_mnt = __lookup_mnt(mnt, dentry, 1)))
		mntget(child_mnt);
	spin_unlock(&vfsmount_lock);
	return child_mnt;
}
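/*
 * Illustrative sketch (not part of the original file): because lookup_mnt()
 * returns with an elevated reference count, every successful lookup must be
 * balanced with an mntput():
 *
 *	struct vfsmount *child = lookup_mnt(mnt, dentry);
 *	if (child) {
 *		... use child ...
 *		mntput(child);
 *	}
 */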
static inline int check_mnt(struct vfsmount *mnt)
{
	return mnt->mnt_ns == current->nsproxy->mnt_ns;
}

static void touch_mnt_namespace(struct mnt_namespace *ns)
{
	if (ns) {
		ns->event = ++event;
		wake_up_interruptible(&ns->poll);
	}
}

static void __touch_mnt_namespace(struct mnt_namespace *ns)
{
	if (ns && ns->event != event) {
		ns->event = event;
		wake_up_interruptible(&ns->poll);
	}
}

static void detach_mnt(struct vfsmount *mnt, struct nameidata *old_nd)
{
	old_nd->dentry = mnt->mnt_mountpoint;
	old_nd->mnt = mnt->mnt_parent;
	mnt->mnt_parent = mnt;
	mnt->mnt_mountpoint = mnt->mnt_root;
	list_del_init(&mnt->mnt_child);
	list_del_init(&mnt->mnt_hash);
	old_nd->dentry->d_mounted--;
}

void mnt_set_mountpoint(struct vfsmount *mnt, struct dentry *dentry,
			struct vfsmount *child_mnt)
{
	child_mnt->mnt_parent = mntget(mnt);
	child_mnt->mnt_mountpoint = dget(dentry);
	dentry->d_mounted++;
}

static void attach_mnt(struct vfsmount *mnt, struct nameidata *nd)
{
	mnt_set_mountpoint(nd->mnt, nd->dentry, mnt);
	list_add_tail(&mnt->mnt_hash, mount_hashtable +
			hash(nd->mnt, nd->dentry));
	list_add_tail(&mnt->mnt_child, &nd->mnt->mnt_mounts);
}

/*
 * the caller must hold vfsmount_lock
 */
static void commit_tree(struct vfsmount *mnt)
{
	struct vfsmount *parent = mnt->mnt_parent;
	struct vfsmount *m;
	LIST_HEAD(head);
	struct mnt_namespace *n = parent->mnt_ns;

	BUG_ON(parent == mnt);

	list_add_tail(&head, &mnt->mnt_list);
	list_for_each_entry(m, &head, mnt_list)
		m->mnt_ns = n;
	list_splice(&head, n->list.prev);

	list_add_tail(&mnt->mnt_hash, mount_hashtable +
				hash(parent, mnt->mnt_mountpoint));
	list_add_tail(&mnt->mnt_child, &parent->mnt_mounts);
	touch_mnt_namespace(n);
}

static struct vfsmount *next_mnt(struct vfsmount *p, struct vfsmount *root)
{
	struct list_head *next = p->mnt_mounts.next;
	if (next == &p->mnt_mounts) {
		while (1) {
			if (p == root)
				return NULL;
			next = p->mnt_child.next;
			if (next != &p->mnt_parent->mnt_mounts)
				break;
			p = p->mnt_parent;
		}
	}
	return list_entry(next, struct vfsmount, mnt_child);
}

static struct vfsmount *skip_mnt_tree(struct vfsmount *p)
{
	struct list_head *prev = p->mnt_mounts.prev;
	while (prev != &p->mnt_mounts) {
		p = list_entry(prev, struct vfsmount, mnt_child);
		prev = p->mnt_mounts.prev;
	}
	return p;
}
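/*
 * Illustrative sketch (not part of the original file): next_mnt() is a
 * depth-first iterator over a mount tree, so a whole tree rooted at @mnt
 * is walked with the idiom used throughout this file:
 *
 *	struct vfsmount *p;
 *
 *	for (p = mnt; p; p = next_mnt(p, mnt))
 *		... visit p ...
 */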
static struct vfsmount *clone_mnt(struct vfsmount *old, struct dentry *root,
					int flag)
{
	struct super_block *sb = old->mnt_sb;
	struct vfsmount *mnt = alloc_vfsmnt(old->mnt_devname);

	if (mnt) {
		mnt->mnt_flags = old->mnt_flags;
		atomic_inc(&sb->s_active);
		mnt->mnt_sb = sb;
		mnt->mnt_root = dget(root);
		mnt->mnt_mountpoint = mnt->mnt_root;
		mnt->mnt_parent = mnt;

		if (flag & CL_SLAVE) {
			list_add(&mnt->mnt_slave, &old->mnt_slave_list);
			mnt->mnt_master = old;
			CLEAR_MNT_SHARED(mnt);
		} else if (!(flag & CL_PRIVATE)) {
			if ((flag & CL_PROPAGATION) || IS_MNT_SHARED(old))
				list_add(&mnt->mnt_share, &old->mnt_share);
			if (IS_MNT_SLAVE(old))
				list_add(&mnt->mnt_slave, &old->mnt_slave);
			mnt->mnt_master = old->mnt_master;
		}
		if (flag & CL_MAKE_SHARED)
			set_mnt_shared(mnt);

		/* stick the duplicate mount on the same expiry list
		 * as the original if that was on one */
		if (flag & CL_EXPIRE) {
			spin_lock(&vfsmount_lock);
			if (!list_empty(&old->mnt_expire))
				list_add(&mnt->mnt_expire, &old->mnt_expire);
			spin_unlock(&vfsmount_lock);
		}
	}
	return mnt;
}

static inline void __mntput(struct vfsmount *mnt)
{
	struct super_block *sb = mnt->mnt_sb;
	dput(mnt->mnt_root);
	free_vfsmnt(mnt);
	deactivate_super(sb);
}

void mntput_no_expire(struct vfsmount *mnt)
{
repeat:
	if (atomic_dec_and_lock(&mnt->mnt_count, &vfsmount_lock)) {
		if (likely(!mnt->mnt_pinned)) {
			spin_unlock(&vfsmount_lock);
			__mntput(mnt);
			return;
		}
		atomic_add(mnt->mnt_pinned + 1, &mnt->mnt_count);
		mnt->mnt_pinned = 0;
		spin_unlock(&vfsmount_lock);
		acct_auto_close_mnt(mnt);
		security_sb_umount_close(mnt);
		goto repeat;
	}
}

EXPORT_SYMBOL(mntput_no_expire);

void mnt_pin(struct vfsmount *mnt)
{
	spin_lock(&vfsmount_lock);
	mnt->mnt_pinned++;
	spin_unlock(&vfsmount_lock);
}

EXPORT_SYMBOL(mnt_pin);

void mnt_unpin(struct vfsmount *mnt)
{
	spin_lock(&vfsmount_lock);
	if (mnt->mnt_pinned) {
		atomic_inc(&mnt->mnt_count);
		mnt->mnt_pinned--;
	}
	spin_unlock(&vfsmount_lock);
}

EXPORT_SYMBOL(mnt_unpin);

static inline void mangle(struct seq_file *m, const char *s)
{
	seq_escape(m, s, " \t\n\\");
}

/*
 * Simple .show_options callback for filesystems which don't want to
 * implement more complex mount option showing.
 *
 * See also save_mount_options().
 */
int generic_show_options(struct seq_file *m, struct vfsmount *mnt)
{
	const char *options = mnt->mnt_sb->s_options;

	if (options != NULL && options[0]) {
		seq_putc(m, ',');
		mangle(m, options);
	}

	return 0;
}
EXPORT_SYMBOL(generic_show_options);
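/*
 * Illustrative sketch (not part of the original file): a filesystem opting
 * into this helper stores its option string at mount time and points its
 * super_operations at generic_show_options(). example_fill_super and
 * example_sops are hypothetical names:
 *
 *	static const struct super_operations example_sops = {
 *		.show_options	= generic_show_options,
 *	};
 *
 *	static int example_fill_super(struct super_block *sb, void *data,
 *				      int silent)
 *	{
 *		save_mount_options(sb, data);
 *		sb->s_op = &example_sops;
 *		...
 *		return 0;
 *	}
 */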
/*
 * If filesystem uses generic_show_options(), this function should be
 * called from the fill_super() callback.
 *
 * The .remount_fs callback usually needs to be handled in a special
 * way, to make sure that previous options are not overwritten if the
 * remount fails.
 *
 * Also note that if the filesystem's .remount_fs function doesn't
 * reset all options to their default value, but changes only newly
 * given options, then the displayed options will not reflect reality
 * any more.
 */
void save_mount_options(struct super_block *sb, char *options)
{
	kfree(sb->s_options);
	sb->s_options = kstrdup(options, GFP_KERNEL);
}
EXPORT_SYMBOL(save_mount_options);

/* iterator */
static void *m_start(struct seq_file *m, loff_t *pos)
{
	struct mnt_namespace *n = m->private;

	down_read(&namespace_sem);
	return seq_list_start(&n->list, *pos);
}

static void *m_next(struct seq_file *m, void *v, loff_t *pos)
{
	struct mnt_namespace *n = m->private;

	return seq_list_next(v, &n->list, pos);
}

static void m_stop(struct seq_file *m, void *v)
{
	up_read(&namespace_sem);
}

static int show_vfsmnt(struct seq_file *m, void *v)
{
	struct vfsmount *mnt = list_entry(v, struct vfsmount, mnt_list);
	int err = 0;
	static struct proc_fs_info {
		int flag;
		char *str;
	} fs_info[] = {
		{ MS_SYNCHRONOUS, ",sync" },
		{ MS_DIRSYNC, ",dirsync" },
		{ MS_MANDLOCK, ",mand" },
		{ 0, NULL }
	};
	static struct proc_fs_info mnt_info[] = {
		{ MNT_NOSUID, ",nosuid" },
		{ MNT_NODEV, ",nodev" },
		{ MNT_NOEXEC, ",noexec" },
		{ MNT_NOATIME, ",noatime" },
		{ MNT_NODIRATIME, ",nodiratime" },
		{ MNT_RELATIME, ",relatime" },
		{ 0, NULL }
	};
	struct proc_fs_info *fs_infop;

	mangle(m, mnt->mnt_devname ? mnt->mnt_devname : "none");
	seq_putc(m, ' ');
	seq_path(m, mnt, mnt->mnt_root, " \t\n\\");
	seq_putc(m, ' ');
	mangle(m, mnt->mnt_sb->s_type->name);
	if (mnt->mnt_sb->s_subtype && mnt->mnt_sb->s_subtype[0]) {
		seq_putc(m, '.');
		mangle(m, mnt->mnt_sb->s_subtype);
	}
	seq_puts(m, mnt->mnt_sb->s_flags & MS_RDONLY ? " ro" : " rw");
	for (fs_infop = fs_info; fs_infop->flag; fs_infop++) {
		if (mnt->mnt_sb->s_flags & fs_infop->flag)
			seq_puts(m, fs_infop->str);
	}
	for (fs_infop = mnt_info; fs_infop->flag; fs_infop++) {
		if (mnt->mnt_flags & fs_infop->flag)
			seq_puts(m, fs_infop->str);
	}
	if (mnt->mnt_sb->s_op->show_options)
		err = mnt->mnt_sb->s_op->show_options(m, mnt);
	seq_puts(m, " 0 0\n");
	return err;
}

struct seq_operations mounts_op = {
	.start	= m_start,
	.next	= m_next,
	.stop	= m_stop,
	.show	= show_vfsmnt
};

static int show_vfsstat(struct seq_file *m, void *v)
{
	struct vfsmount *mnt = list_entry(v, struct vfsmount, mnt_list);
	int err = 0;

	/* device */
	if (mnt->mnt_devname) {
		seq_puts(m, "device ");
		mangle(m, mnt->mnt_devname);
	} else
		seq_puts(m, "no device");

	/* mount point */
	seq_puts(m, " mounted on ");
	seq_path(m, mnt, mnt->mnt_root, " \t\n\\");
	seq_putc(m, ' ');

	/* file system type */
	seq_puts(m, "with fstype ");
	mangle(m, mnt->mnt_sb->s_type->name);

	/* optional statistics */
	if (mnt->mnt_sb->s_op->show_stats) {
		seq_putc(m, ' ');
		err = mnt->mnt_sb->s_op->show_stats(m, mnt);
	}

	seq_putc(m, '\n');
	return err;
}

struct seq_operations mountstats_op = {
	.start	= m_start,
	.next	= m_next,
	.stop	= m_stop,
	.show	= show_vfsstat,
};
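/*
 * Illustrative sketch (not part of the original file): mounts_op and
 * mountstats_op are consumed by the procfs code, which is expected to
 * stash the task's mnt_namespace in the seq_file before the iterator
 * runs; roughly (hypothetical opener, details simplified):
 *
 *	ret = seq_open(file, &mounts_op);
 *	if (!ret)
 *		((struct seq_file *)file->private_data)->private = ns;
 */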
/**
 * may_umount_tree - check if a mount tree is busy
 * @mnt: root of mount tree
 *
 * This is called to check if a tree of mounts has any
 * open files, pwds, chroots or sub mounts that are
 * busy.
 */
int may_umount_tree(struct vfsmount *mnt)
{
	int actual_refs = 0;
	int minimum_refs = 0;
	struct vfsmount *p;

	spin_lock(&vfsmount_lock);
	for (p = mnt; p; p = next_mnt(p, mnt)) {
		actual_refs += atomic_read(&p->mnt_count);
		minimum_refs += 2;
	}
	spin_unlock(&vfsmount_lock);

	if (actual_refs > minimum_refs)
		return 0;

	return 1;
}

EXPORT_SYMBOL(may_umount_tree);

/**
 * may_umount - check if a mount point is busy
 * @mnt: root of mount
 *
 * This is called to check if a mount point has any
 * open files, pwds, chroots or sub mounts. If the
 * mount has sub mounts this will return busy
 * regardless of whether the sub mounts are busy.
 *
 * Doesn't take quota and stuff into account. IOW, in some cases it will
 * give false negatives. The main reason why it's here is that we need
 * a non-destructive way to look for easily umountable filesystems.
 */
int may_umount(struct vfsmount *mnt)
{
	int ret = 1;
	spin_lock(&vfsmount_lock);
	if (propagate_mount_busy(mnt, 2))
		ret = 0;
	spin_unlock(&vfsmount_lock);
	return ret;
}

EXPORT_SYMBOL(may_umount);

void release_mounts(struct list_head *head)
{
	struct vfsmount *mnt;
	while (!list_empty(head)) {
		mnt = list_first_entry(head, struct vfsmount, mnt_hash);
		list_del_init(&mnt->mnt_hash);
		if (mnt->mnt_parent != mnt) {
			struct dentry *dentry;
			struct vfsmount *m;
			spin_lock(&vfsmount_lock);
			dentry = mnt->mnt_mountpoint;
			m = mnt->mnt_parent;
			mnt->mnt_mountpoint = mnt->mnt_root;
			mnt->mnt_parent = mnt;
			spin_unlock(&vfsmount_lock);
			dput(dentry);
			mntput(m);
		}
		mntput(mnt);
	}
}

void umount_tree(struct vfsmount *mnt, int propagate, struct list_head *kill)
{
	struct vfsmount *p;

	for (p = mnt; p; p = next_mnt(p, mnt))
		list_move(&p->mnt_hash, kill);

	if (propagate)
		propagate_umount(kill);

	list_for_each_entry(p, kill, mnt_hash) {
		list_del_init(&p->mnt_expire);
		list_del_init(&p->mnt_list);
		__touch_mnt_namespace(p->mnt_ns);
		p->mnt_ns = NULL;
		list_del_init(&p->mnt_child);
		if (p->mnt_parent != p)
			p->mnt_mountpoint->d_mounted--;
		change_mnt_propagation(p, MS_PRIVATE);
	}
}
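/*
 * Illustrative sketch (not part of the original file): umount_tree() only
 * detaches mounts onto a caller-supplied kill list under vfsmount_lock;
 * the actual reference drops happen later, unlocked, via release_mounts().
 * Every caller in this file therefore follows the same pattern:
 *
 *	LIST_HEAD(umount_list);
 *
 *	spin_lock(&vfsmount_lock);
 *	umount_tree(mnt, 0, &umount_list);
 *	spin_unlock(&vfsmount_lock);
 *	release_mounts(&umount_list);
 */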
static int do_umount(struct vfsmount *mnt, int flags)
{
	struct super_block *sb = mnt->mnt_sb;
	int retval;
	LIST_HEAD(umount_list);

	retval = security_sb_umount(mnt, flags);
	if (retval)
		return retval;

	/*
	 * Allow userspace to request a mountpoint be expired rather than
	 * unmounting unconditionally. Unmount only happens if:
	 *  (1) the mark is already set (the mark is cleared by mntput())
	 *  (2) the usage count == 1 [parent vfsmount] + 1 [sys_umount]
	 */
	if (flags & MNT_EXPIRE) {
		if (mnt == current->fs->rootmnt ||
		    flags & (MNT_FORCE | MNT_DETACH))
			return -EINVAL;

		if (atomic_read(&mnt->mnt_count) != 2)
			return -EBUSY;

		if (!xchg(&mnt->mnt_expiry_mark, 1))
			return -EAGAIN;
	}

	/*
	 * If we may have to abort operations to get out of this
	 * mount, and they will themselves hold resources we must
	 * allow the fs to do things. In the Unix tradition of
	 * 'Gee that's tricky, let's do it in userspace' the umount_begin
	 * might fail to complete on the first run through as other tasks
	 * must return, and the like. That's for the mount program to worry
	 * about for the moment.
	 */

	lock_kernel();
	if (sb->s_op->umount_begin)
		sb->s_op->umount_begin(mnt, flags);
	unlock_kernel();

	/*
	 * No sense to grab the lock for this test, but test itself looks
	 * somewhat bogus. Suggestions for better replacement?
	 * Ho-hum... In principle, we might treat that as umount + switch
	 * to rootfs. GC would eventually take care of the old vfsmount.
	 * Actually it makes sense, especially if rootfs would contain a
	 * /reboot - static binary that would close all descriptors and
	 * call reboot(2). Then init(8) could umount root and exec /reboot.
	 */
	if (mnt == current->fs->rootmnt && !(flags & MNT_DETACH)) {
		/*
		 * Special case for "unmounting" root ...
		 * we just try to remount it readonly.
		 */
		down_write(&sb->s_umount);
		if (!(sb->s_flags & MS_RDONLY)) {
			lock_kernel();
			DQUOT_OFF(sb);
			retval = do_remount_sb(sb, MS_RDONLY, NULL, 0);
			unlock_kernel();
		}
		up_write(&sb->s_umount);
		return retval;
	}

	down_write(&namespace_sem);
	spin_lock(&vfsmount_lock);
	event++;

	retval = -EBUSY;
	if (flags & MNT_DETACH || !propagate_mount_busy(mnt, 2)) {
		if (!list_empty(&mnt->mnt_list))
			umount_tree(mnt, 1, &umount_list);
		retval = 0;
	}
	spin_unlock(&vfsmount_lock);
	if (retval)
		security_sb_umount_busy(mnt);
	up_write(&namespace_sem);
	release_mounts(&umount_list);
	return retval;
}
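/*
 * Illustrative sketch (not part of the original file): from userspace the
 * MNT_EXPIRE protocol above means umount2() must be called twice; the
 * first call only marks the mount and fails with EAGAIN, and the second
 * call succeeds only if nothing touched the mount in between (path
 * hypothetical):
 *
 *	if (umount2("/mnt/auto", MNT_EXPIRE) == -1 && errno == EAGAIN) {
 *		... wait; any use of the mount clears the mark ...
 *		umount2("/mnt/auto", MNT_EXPIRE);   // 0 if still unused
 *	}
 */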
/*
 * Now umount can handle mount points as well as block devices.
 * This is important for filesystems which use unnamed block devices.
 *
 * We now support a flag for forced unmount like the other 'big iron'
 * unixes. Our API is identical to OSF/1 to avoid making a mess of AMD
 */

asmlinkage long sys_umount(char __user * name, int flags)
{
	struct nameidata nd;
	int retval;

	retval = __user_walk(name, LOOKUP_FOLLOW, &nd);
	if (retval)
		goto out;
	retval = -EINVAL;
	if (nd.dentry != nd.mnt->mnt_root)
		goto dput_and_out;
	if (!check_mnt(nd.mnt))
		goto dput_and_out;

	retval = -EPERM;
	if (!capable(CAP_SYS_ADMIN))
		goto dput_and_out;

	retval = do_umount(nd.mnt, flags);
dput_and_out:
	path_release_on_umount(&nd);
out:
	return retval;
}

#ifdef __ARCH_WANT_SYS_OLDUMOUNT

/*
 * The 2.0 compatible umount. No flags.
 */
asmlinkage long sys_oldumount(char __user * name)
{
	return sys_umount(name, 0);
}

#endif

static int mount_is_safe(struct nameidata *nd)
{
	if (capable(CAP_SYS_ADMIN))
		return 0;
	return -EPERM;
#ifdef notyet
	if (S_ISLNK(nd->dentry->d_inode->i_mode))
		return -EPERM;
	if (nd->dentry->d_inode->i_mode & S_ISVTX) {
		if (current->uid != nd->dentry->d_inode->i_uid)
			return -EPERM;
	}
	if (vfs_permission(nd, MAY_WRITE))
		return -EPERM;
	return 0;
#endif
}

static int lives_below_in_same_fs(struct dentry *d, struct dentry *dentry)
{
	while (1) {
		if (d == dentry)
			return 1;
		if (d == NULL || d == d->d_parent)
			return 0;
		d = d->d_parent;
	}
}

struct vfsmount *copy_tree(struct vfsmount *mnt, struct dentry *dentry,
					int flag)
{
	struct vfsmount *res, *p, *q, *r, *s;
	struct nameidata nd;

	if (!(flag & CL_COPY_ALL) && IS_MNT_UNBINDABLE(mnt))
		return NULL;

	res = q = clone_mnt(mnt, dentry, flag);
	if (!q)
		goto Enomem;
	q->mnt_mountpoint = mnt->mnt_mountpoint;

	p = mnt;
	list_for_each_entry(r, &mnt->mnt_mounts, mnt_child) {
		if (!lives_below_in_same_fs(r->mnt_mountpoint, dentry))
			continue;

		for (s = r; s; s = next_mnt(s, r)) {
			if (!(flag & CL_COPY_ALL) && IS_MNT_UNBINDABLE(s)) {
				s = skip_mnt_tree(s);
				continue;
			}
			while (p != s->mnt_parent) {
				p = p->mnt_parent;
				q = q->mnt_parent;
			}
			p = s;
			nd.mnt = q;
			nd.dentry = p->mnt_mountpoint;
			q = clone_mnt(p, p->mnt_root, flag);
			if (!q)
				goto Enomem;
			spin_lock(&vfsmount_lock);
			list_add_tail(&q->mnt_list, &res->mnt_list);
			attach_mnt(q, &nd);
			spin_unlock(&vfsmount_lock);
		}
	}
	return res;
Enomem:
	if (res) {
		LIST_HEAD(umount_list);
		spin_lock(&vfsmount_lock);
		umount_tree(res, 0, &umount_list);
		spin_unlock(&vfsmount_lock);
		release_mounts(&umount_list);
	}
	return NULL;
}

struct vfsmount *collect_mounts(struct vfsmount *mnt, struct dentry *dentry)
{
	struct vfsmount *tree;
	down_read(&namespace_sem);
	tree = copy_tree(mnt, dentry, CL_COPY_ALL | CL_PRIVATE);
	up_read(&namespace_sem);
	return tree;
}

void drop_collected_mounts(struct vfsmount *mnt)
{
	LIST_HEAD(umount_list);
	down_read(&namespace_sem);
	spin_lock(&vfsmount_lock);
	umount_tree(mnt, 0, &umount_list);
	spin_unlock(&vfsmount_lock);
	up_read(&namespace_sem);
	release_mounts(&umount_list);
}
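/*
 * Illustrative sketch (not part of the original file): collect_mounts() and
 * drop_collected_mounts() bracket a private, unattached snapshot of a mount
 * tree that a caller (the audit subsystem, for instance) can walk at leisure:
 *
 *	struct vfsmount *tree = collect_mounts(mnt, dentry);
 *	if (tree) {
 *		... inspect the snapshot ...
 *		drop_collected_mounts(tree);
 *	}
 */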
/*
 *  @source_mnt : mount tree to be attached
 *  @nd         : place the mount tree @source_mnt is attached
 *  @parent_nd  : if non-null, detach the source_mnt from its parent and
 *  		   store the parent mount and mountpoint dentry.
 *  		   (done when source_mnt is moved)
 *
 *  NOTE: the table below explains the semantics when a source mount
 *  of a given type is attached to a destination mount of a given type.
 * ---------------------------------------------------------------------------
 * |         BIND MOUNT OPERATION                                            |
 * |**************************************************************************
 * | source-->| shared        |       private  |       slave    | unbindable |
 * | dest     |               |                |                |            |
 * |   |      |               |                |                |            |
 * |   v      |               |                |                |            |
 * |**************************************************************************
 * |  shared  | shared (++)   |     shared (+) |     shared(+++)|  invalid   |
 * |          |               |                |                |            |
 * |non-shared| shared (+)    |      private   |      slave (*) |  invalid   |
 * ***************************************************************************
 * A bind operation clones the source mount and mounts the clone on the
 * destination mount.
 *
 * (++)  the cloned mount is propagated to all the mounts in the propagation
 * 	 tree of the destination mount and the cloned mount is added to
 * 	 the peer group of the source mount.
 * (+)   the cloned mount is created under the destination mount and is marked
 *       as shared. The cloned mount is added to the peer group of the source
 *       mount.
 * (+++) the mount is propagated to all the mounts in the propagation tree
 *       of the destination mount and the cloned mount is made slave
 *       of the same master as that of the source mount. The cloned mount
 *       is marked as 'shared and slave'.
 * (*)   the cloned mount is made a slave of the same master as that of the
 * 	 source mount.
 *
 * ---------------------------------------------------------------------------
 * |         		MOVE MOUNT OPERATION                                 |
 * |**************************************************************************
 * | source-->| shared        |       private  |       slave    | unbindable |
 * | dest     |               |                |                |            |
 * |   |      |               |                |                |            |
 * |   v      |               |                |                |            |
 * |**************************************************************************
 * |  shared  | shared (+)    |     shared (+) |    shared(+++) |  invalid   |
 * |          |               |                |                |            |
 * |non-shared| shared (+*)   |      private   |    slave (*)   | unbindable |
 * ***************************************************************************
 *
 * (+)   the mount is moved to the destination. And is then propagated to
 * 	 all the mounts in the propagation tree of the destination mount.
 * (+*)  the mount is moved to the destination.
 * (+++) the mount is moved to the destination and is then propagated to
 * 	 all the mounts belonging to the destination mount's propagation tree.
 * 	 the mount is marked as 'shared and slave'.
 * (*)	 the mount continues to be a slave at the new location.
 *
 * if the source mount is a tree, the operations explained above are
 * applied to each mount in the tree.
 * Must be called without spinlocks held, since this function can sleep
 * in allocations.
 */
static int attach_recursive_mnt(struct vfsmount *source_mnt,
			struct nameidata *nd, struct nameidata *parent_nd)
{
	LIST_HEAD(tree_list);
	struct vfsmount *dest_mnt = nd->mnt;
	struct dentry *dest_dentry = nd->dentry;
	struct vfsmount *child, *p;

	if (propagate_mnt(dest_mnt, dest_dentry, source_mnt, &tree_list))
		return -EINVAL;

	if (IS_MNT_SHARED(dest_mnt)) {
		for (p = source_mnt; p; p = next_mnt(p, source_mnt))
			set_mnt_shared(p);
	}

	spin_lock(&vfsmount_lock);
	if (parent_nd) {
		detach_mnt(source_mnt, parent_nd);
		attach_mnt(source_mnt, nd);
		touch_mnt_namespace(current->nsproxy->mnt_ns);
	} else {
		mnt_set_mountpoint(dest_mnt, dest_dentry, source_mnt);
		commit_tree(source_mnt);
	}

	list_for_each_entry_safe(child, p, &tree_list, mnt_hash) {
		list_del_init(&child->mnt_hash);
		commit_tree(child);
	}
	spin_unlock(&vfsmount_lock);
	return 0;
}
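/*
 * Illustrative sketch (not part of the original file): the "shared dest"
 * rows above are what make a bind visible across peer mounts. From
 * userspace (paths hypothetical; /mnt must itself be a mount point for
 * the propagation-type change to succeed):
 *
 *	mount(NULL, "/mnt", NULL, MS_SHARED, NULL);     // dest is shared
 *	mount("/src", "/mnt/dst", NULL, MS_BIND, NULL); // clone propagates
 *							// to all peers of /mnt
 */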
static int graft_tree(struct vfsmount *mnt, struct nameidata *nd)
{
	int err;
	if (mnt->mnt_sb->s_flags & MS_NOUSER)
		return -EINVAL;

	if (S_ISDIR(nd->dentry->d_inode->i_mode) !=
	      S_ISDIR(mnt->mnt_root->d_inode->i_mode))
		return -ENOTDIR;

	err = -ENOENT;
	mutex_lock(&nd->dentry->d_inode->i_mutex);
	if (IS_DEADDIR(nd->dentry->d_inode))
		goto out_unlock;

	err = security_sb_check_sb(mnt, nd);
	if (err)
		goto out_unlock;

	err = -ENOENT;
	if (IS_ROOT(nd->dentry) || !d_unhashed(nd->dentry))
		err = attach_recursive_mnt(mnt, nd, NULL);
out_unlock:
	mutex_unlock(&nd->dentry->d_inode->i_mutex);
	if (!err)
		security_sb_post_addmount(mnt, nd);
	return err;
}

/*
 * recursively change the type of the mountpoint.
 * noinline this do_mount helper to save do_mount stack space.
 */
static noinline int do_change_type(struct nameidata *nd, int flag)
{
	struct vfsmount *m, *mnt = nd->mnt;
	int recurse = flag & MS_REC;
	int type = flag & ~MS_REC;

	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;

	if (nd->dentry != nd->mnt->mnt_root)
		return -EINVAL;

	down_write(&namespace_sem);
	spin_lock(&vfsmount_lock);
	for (m = mnt; m; m = (recurse ? next_mnt(m, mnt) : NULL))
		change_mnt_propagation(m, type);
	spin_unlock(&vfsmount_lock);
	up_write(&namespace_sem);
	return 0;
}

/*
 * do loopback mount.
 * noinline this do_mount helper to save do_mount stack space.
 */
static noinline int do_loopback(struct nameidata *nd, char *old_name,
				int recurse)
{
	struct nameidata old_nd;
	struct vfsmount *mnt = NULL;
	int err = mount_is_safe(nd);
	if (err)
		return err;
	if (!old_name || !*old_name)
		return -EINVAL;
	err = path_lookup(old_name, LOOKUP_FOLLOW, &old_nd);
	if (err)
		return err;

	down_write(&namespace_sem);
	err = -EINVAL;
	if (IS_MNT_UNBINDABLE(old_nd.mnt))
		goto out;

	if (!check_mnt(nd->mnt) || !check_mnt(old_nd.mnt))
		goto out;

	err = -ENOMEM;
	if (recurse)
		mnt = copy_tree(old_nd.mnt, old_nd.dentry, 0);
	else
		mnt = clone_mnt(old_nd.mnt, old_nd.dentry, 0);

	if (!mnt)
		goto out;

	err = graft_tree(mnt, nd);
	if (err) {
		LIST_HEAD(umount_list);
		spin_lock(&vfsmount_lock);
		umount_tree(mnt, 0, &umount_list);
		spin_unlock(&vfsmount_lock);
		release_mounts(&umount_list);
	}

out:
	up_write(&namespace_sem);
	path_release(&old_nd);
	return err;
}
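/*
 * Illustrative sketch (not part of the original file): do_loopback() is what
 * services a userspace bind mount; dev_name carries the source path and
 * MS_REC selects the recursive copy_tree() branch (paths hypothetical):
 *
 *	mount("/src", "/dst", NULL, MS_BIND, NULL);	     // clone_mnt()
 *	mount("/src", "/dst", NULL, MS_BIND | MS_REC, NULL); // copy_tree()
 */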
/*
 * change filesystem flags. dir should be a physical root of filesystem.
 * If you've mounted a non-root directory somewhere and want to do remount
 * on it - tough luck.
 * noinline this do_mount helper to save do_mount stack space.
 */
static noinline int do_remount(struct nameidata *nd, int flags, int mnt_flags,
			       void *data)
{
	int err;
	struct super_block *sb = nd->mnt->mnt_sb;

	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;

	if (!check_mnt(nd->mnt))
		return -EINVAL;

	if (nd->dentry != nd->mnt->mnt_root)
		return -EINVAL;

	down_write(&sb->s_umount);
	err = do_remount_sb(sb, flags, data, 0);
	if (!err)
		nd->mnt->mnt_flags = mnt_flags;
	up_write(&sb->s_umount);
	if (!err)
		security_sb_post_remount(nd->mnt, flags, data);
	return err;
}

static inline int tree_contains_unbindable(struct vfsmount *mnt)
{
	struct vfsmount *p;
	for (p = mnt; p; p = next_mnt(p, mnt)) {
		if (IS_MNT_UNBINDABLE(p))
			return 1;
	}
	return 0;
}

/*
 * noinline this do_mount helper to save do_mount stack space.
 */
static noinline int do_move_mount(struct nameidata *nd, char *old_name)
{
	struct nameidata old_nd, parent_nd;
	struct vfsmount *p;
	int err = 0;
	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;
	if (!old_name || !*old_name)
		return -EINVAL;
	err = path_lookup(old_name, LOOKUP_FOLLOW, &old_nd);
	if (err)
		return err;

	down_write(&namespace_sem);
	while (d_mountpoint(nd->dentry) && follow_down(&nd->mnt, &nd->dentry))
		;
	err = -EINVAL;
	if (!check_mnt(nd->mnt) || !check_mnt(old_nd.mnt))
		goto out;

	err = -ENOENT;
	mutex_lock(&nd->dentry->d_inode->i_mutex);
	if (IS_DEADDIR(nd->dentry->d_inode))
		goto out1;

	if (!IS_ROOT(nd->dentry) && d_unhashed(nd->dentry))
		goto out1;

	err = -EINVAL;
	if (old_nd.dentry != old_nd.mnt->mnt_root)
		goto out1;

	if (old_nd.mnt == old_nd.mnt->mnt_parent)
		goto out1;

	if (S_ISDIR(nd->dentry->d_inode->i_mode) !=
	      S_ISDIR(old_nd.dentry->d_inode->i_mode))
		goto out1;
	/*
	 * Don't move a mount residing in a shared parent.
	 */
	if (old_nd.mnt->mnt_parent && IS_MNT_SHARED(old_nd.mnt->mnt_parent))
		goto out1;
	/*
	 * Don't move a mount tree containing unbindable mounts to a destination
	 * mount which is shared.
	 */
	if (IS_MNT_SHARED(nd->mnt) && tree_contains_unbindable(old_nd.mnt))
		goto out1;
	err = -ELOOP;
	for (p = nd->mnt; p->mnt_parent != p; p = p->mnt_parent)
		if (p == old_nd.mnt)
			goto out1;

	if ((err = attach_recursive_mnt(old_nd.mnt, nd, &parent_nd)))
		goto out1;

	spin_lock(&vfsmount_lock);
	/* if the mount is moved, it should no longer expire
	 * automatically */
	list_del_init(&old_nd.mnt->mnt_expire);
	spin_unlock(&vfsmount_lock);
out1:
	mutex_unlock(&nd->dentry->d_inode->i_mutex);
out:
	up_write(&namespace_sem);
	if (!err)
		path_release(&parent_nd);
	path_release(&old_nd);
	return err;
}
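/*
 * Illustrative sketch (not part of the original file): do_move_mount() backs
 * the MS_MOVE flavour of mount(2); the source must itself be a mount root,
 * and the -ELOOP walk above rejects moving a mount beneath its own subtree
 * (paths hypothetical):
 *
 *	mount("/mnt/old", "/mnt/new", NULL, MS_MOVE, NULL);
 */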
/*
 * create a new mount for userspace and request it to be added into the
 * namespace's tree
 * noinline this do_mount helper to save do_mount stack space.
 */
static noinline int do_new_mount(struct nameidata *nd, char *type, int flags,
			int mnt_flags, char *name, void *data)
{
	struct vfsmount *mnt;

	if (!type || !memchr(type, 0, PAGE_SIZE))
		return -EINVAL;

	/* we need capabilities... */
	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;

	mnt = do_kern_mount(type, flags, name, data);
	if (IS_ERR(mnt))
		return PTR_ERR(mnt);

	return do_add_mount(mnt, nd, mnt_flags, NULL);
}

/*
 * add a mount into a namespace's mount tree
 * - provide the option of adding the new mount to an expiration list
 */
int do_add_mount(struct vfsmount *newmnt, struct nameidata *nd,
		 int mnt_flags, struct list_head *fslist)
{
	int err;

	down_write(&namespace_sem);
	/* Something was mounted here while we slept */
	while (d_mountpoint(nd->dentry) && follow_down(&nd->mnt, &nd->dentry))
		;
	err = -EINVAL;
	if (!check_mnt(nd->mnt))
		goto unlock;

	/* Refuse the same filesystem on the same mount point */
	err = -EBUSY;
	if (nd->mnt->mnt_sb == newmnt->mnt_sb &&
	    nd->mnt->mnt_root == nd->dentry)
		goto unlock;

	err = -EINVAL;
	if (S_ISLNK(newmnt->mnt_root->d_inode->i_mode))
		goto unlock;

	newmnt->mnt_flags = mnt_flags;
	if ((err = graft_tree(newmnt, nd)))
		goto unlock;

	if (fslist) {
		/* add to the specified expiration list */
		spin_lock(&vfsmount_lock);
		list_add_tail(&newmnt->mnt_expire, fslist);
		spin_unlock(&vfsmount_lock);
	}
	up_write(&namespace_sem);
	return 0;

unlock:
	up_write(&namespace_sem);
	mntput(newmnt);
	return err;
}

EXPORT_SYMBOL_GPL(do_add_mount);

static void expire_mount(struct vfsmount *mnt, struct list_head *mounts,
				struct list_head *umounts)
{
	spin_lock(&vfsmount_lock);

	/*
	 * Check if mount is still attached, if not, let whoever holds it deal
	 * with the sucker
	 */
	if (mnt->mnt_parent == mnt) {
		spin_unlock(&vfsmount_lock);
		return;
	}

	/*
	 * Check that it is still dead: the count should now be 2 - as
	 * contributed by the vfsmount parent and the mntget above
	 */
	if (!propagate_mount_busy(mnt, 2)) {
		/* delete from the namespace */
		touch_mnt_namespace(mnt->mnt_ns);
		list_del_init(&mnt->mnt_list);
		mnt->mnt_ns = NULL;
		umount_tree(mnt, 1, umounts);
		spin_unlock(&vfsmount_lock);
	} else {
		/*
		 * Someone brought it back to life whilst we didn't have any
		 * locks held so return it to the expiration list
		 */
		list_add_tail(&mnt->mnt_expire, mounts);
		spin_unlock(&vfsmount_lock);
	}
}
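/*
 * Illustrative sketch (not part of the original file): the fslist argument
 * of do_add_mount() is how automounting filesystems tie their submounts to
 * the expiry machinery below; names here are hypothetical:
 *
 *	static LIST_HEAD(example_automount_list);
 *
 *	err = do_add_mount(mnt, nd, MNT_SHRINKABLE,
 *			   &example_automount_list);
 *	// ... and a periodic worker then calls:
 *	mark_mounts_for_expiry(&example_automount_list);
 */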
/*
 * go through the vfsmounts we've just consigned to the graveyard to
 * - check that they're still dead
 * - delete the vfsmount from the appropriate namespace under lock
 * - dispose of the corpse
 */
static void expire_mount_list(struct list_head *graveyard, struct list_head *mounts)
{
	struct mnt_namespace *ns;
	struct vfsmount *mnt;

	while (!list_empty(graveyard)) {
		LIST_HEAD(umounts);
		mnt = list_first_entry(graveyard, struct vfsmount, mnt_expire);
		list_del_init(&mnt->mnt_expire);

		/* don't do anything if the namespace is dead - all the
		 * vfsmounts from it are going away anyway */
		ns = mnt->mnt_ns;
		if (!ns || !ns->root)
			continue;
		get_mnt_ns(ns);

		spin_unlock(&vfsmount_lock);
		down_write(&namespace_sem);
		expire_mount(mnt, mounts, &umounts);
		up_write(&namespace_sem);
		release_mounts(&umounts);
		mntput(mnt);
		put_mnt_ns(ns);
		spin_lock(&vfsmount_lock);
	}
}

/*
 * process a list of expirable mountpoints with the intent of discarding any
 * mountpoints that aren't in use and haven't been touched since last we came
 * here
 */
void mark_mounts_for_expiry(struct list_head *mounts)
{
	struct vfsmount *mnt, *next;
	LIST_HEAD(graveyard);

	if (list_empty(mounts))
		return;

	spin_lock(&vfsmount_lock);

	/* extract from the expiration list every vfsmount that matches the
	 * following criteria:
	 * - only referenced by its parent vfsmount
	 * - still marked for expiry (marked on the last call here; marks are
	 *   cleared by mntput())
	 */
	list_for_each_entry_safe(mnt, next, mounts, mnt_expire) {
		if (!xchg(&mnt->mnt_expiry_mark, 1) ||
		    atomic_read(&mnt->mnt_count) != 1)
			continue;

		mntget(mnt);
		list_move(&mnt->mnt_expire, &graveyard);
	}

	expire_mount_list(&graveyard, mounts);

	spin_unlock(&vfsmount_lock);
}

EXPORT_SYMBOL_GPL(mark_mounts_for_expiry);

/*
 * Ripoff of 'select_parent()'
 *
 * search the list of submounts for a given mountpoint, and move any
 * shrinkable submounts to the 'graveyard' list.
 */
static int select_submounts(struct vfsmount *parent, struct list_head *graveyard)
{
	struct vfsmount *this_parent = parent;
	struct list_head *next;
	int found = 0;

repeat:
	next = this_parent->mnt_mounts.next;
resume:
	while (next != &this_parent->mnt_mounts) {
		struct list_head *tmp = next;
		struct vfsmount *mnt = list_entry(tmp, struct vfsmount, mnt_child);

		next = tmp->next;
		if (!(mnt->mnt_flags & MNT_SHRINKABLE))
			continue;
		/*
		 * Descend a level if the d_mounts list is non-empty.
		 */
		if (!list_empty(&mnt->mnt_mounts)) {
			this_parent = mnt;
			goto repeat;
		}

		if (!propagate_mount_busy(mnt, 1)) {
			mntget(mnt);
			list_move_tail(&mnt->mnt_expire, graveyard);
			found++;
		}
	}
	/*
	 * All done at this level ... ascend and resume the search
	 */
	if (this_parent != parent) {
		next = this_parent->mnt_child.next;
		this_parent = this_parent->mnt_parent;
		goto resume;
	}
	return found;
}
/*
 * process a list of expirable mountpoints with the intent of discarding any
 * submounts of a specific parent mountpoint
 */
void shrink_submounts(struct vfsmount *mountpoint, struct list_head *mounts)
{
	LIST_HEAD(graveyard);
	int found;

	spin_lock(&vfsmount_lock);

	/* extract submounts of 'mountpoint' from the expiration list */
	while ((found = select_submounts(mountpoint, &graveyard)) != 0)
		expire_mount_list(&graveyard, mounts);

	spin_unlock(&vfsmount_lock);
}

EXPORT_SYMBOL_GPL(shrink_submounts);

/*
 * Some copy_from_user() implementations do not return the exact number of
 * bytes remaining to copy on a fault.  But copy_mount_options() requires that.
 * Note that this function differs from copy_from_user() in that it will oops
 * on bad values of `to', rather than returning a short copy.
 */
static long exact_copy_from_user(void *to, const void __user * from,
				 unsigned long n)
{
	char *t = to;
	const char __user *f = from;
	char c;

	if (!access_ok(VERIFY_READ, from, n))
		return n;

	while (n) {
		if (__get_user(c, f)) {
			memset(t, 0, n);
			break;
		}
		*t++ = c;
		f++;
		n--;
	}
	return n;
}

int copy_mount_options(const void __user * data, unsigned long *where)
{
	int i;
	unsigned long page;
	unsigned long size;

	*where = 0;
	if (!data)
		return 0;

	if (!(page = __get_free_page(GFP_KERNEL)))
		return -ENOMEM;

	/* We only care that *some* data at the address the user
	 * gave us is valid.  Just in case, we'll zero
	 * the remainder of the page.
	 */
	/* copy_from_user cannot cross TASK_SIZE ! */
	size = TASK_SIZE - (unsigned long)data;
	if (size > PAGE_SIZE)
		size = PAGE_SIZE;

	i = size - exact_copy_from_user((void *)page, data, size);
	if (!i) {
		free_page(page);
		return -EFAULT;
	}
	if (i != PAGE_SIZE)
		memset((char *)page + i, 0, PAGE_SIZE - i);
	*where = page;
	return 0;
}
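/*
 * Illustrative sketch (not part of the original file): the TASK_SIZE clamp
 * in copy_mount_options() just shortens the copy when the user buffer ends
 * near the top of the address space. Assuming PAGE_SIZE == 4096:
 *
 *	data == TASK_SIZE - 100  =>  size = 100; i = 100 on success,
 *	and the remaining 3996 bytes of the page are zeroed.
 */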
/*
 * Flags is a 32-bit value that allows up to 31 non-fs dependent flags to
 * be given to the mount() call (ie: read-only, no-dev, no-suid etc).
 *
 * data is a (void *) that can point to any structure up to
 * PAGE_SIZE-1 bytes, which can contain arbitrary fs-dependent
 * information (or be NULL).
 *
 * Pre-0.97 versions of mount() didn't have a flags word.
 * When the flags word was introduced its top half was required
 * to have the magic value 0xC0ED, and this remained so until 2.4.0-test9.
 * Therefore, if this magic number is present, it carries no information
 * and must be discarded.
 */
long do_mount(char *dev_name, char *dir_name, char *type_page,
		  unsigned long flags, void *data_page)
{
	struct nameidata nd;
	int retval = 0;
	int mnt_flags = 0;

	/* Discard magic */
	if ((flags & MS_MGC_MSK) == MS_MGC_VAL)
		flags &= ~MS_MGC_MSK;

	/* Basic sanity checks */

	if (!dir_name || !*dir_name || !memchr(dir_name, 0, PAGE_SIZE))
		return -EINVAL;
	if (dev_name && !memchr(dev_name, 0, PAGE_SIZE))
		return -EINVAL;

	if (data_page)
		((char *)data_page)[PAGE_SIZE - 1] = 0;

	/* Separate the per-mountpoint flags */
	if (flags & MS_NOSUID)
		mnt_flags |= MNT_NOSUID;
	if (flags & MS_NODEV)
		mnt_flags |= MNT_NODEV;
	if (flags & MS_NOEXEC)
		mnt_flags |= MNT_NOEXEC;
	if (flags & MS_NOATIME)
		mnt_flags |= MNT_NOATIME;
	if (flags & MS_NODIRATIME)
		mnt_flags |= MNT_NODIRATIME;
	if (flags & MS_RELATIME)
		mnt_flags |= MNT_RELATIME;

	flags &= ~(MS_NOSUID | MS_NOEXEC | MS_NODEV | MS_ACTIVE |
		   MS_NOATIME | MS_NODIRATIME | MS_RELATIME | MS_KERNMOUNT);

	/* ... and get the mountpoint */
	retval = path_lookup(dir_name, LOOKUP_FOLLOW, &nd);
	if (retval)
		return retval;

	retval = security_sb_mount(dev_name, &nd, type_page, flags, data_page);
	if (retval)
		goto dput_out;

	if (flags & MS_REMOUNT)
		retval = do_remount(&nd, flags & ~MS_REMOUNT, mnt_flags,
				    data_page);
	else if (flags & MS_BIND)
		retval = do_loopback(&nd, dev_name, flags & MS_REC);
	else if (flags & (MS_SHARED | MS_PRIVATE | MS_SLAVE | MS_UNBINDABLE))
		retval = do_change_type(&nd, flags);
	else if (flags & MS_MOVE)
		retval = do_move_mount(&nd, dev_name);
	else
		retval = do_new_mount(&nd, type_page, flags, mnt_flags,
				      dev_name, data_page);
dput_out:
	path_release(&nd);
	return retval;
}

/*
 * Allocate a new namespace structure and populate it with contents
 * copied from the namespace of the passed in task structure.
 */
static struct mnt_namespace *dup_mnt_ns(struct mnt_namespace *mnt_ns,
		struct fs_struct *fs)
{
	struct mnt_namespace *new_ns;
	struct vfsmount *rootmnt = NULL, *pwdmnt = NULL, *altrootmnt = NULL;
	struct vfsmount *p, *q;

	new_ns = kmalloc(sizeof(struct mnt_namespace), GFP_KERNEL);
	if (!new_ns)
		return ERR_PTR(-ENOMEM);

	atomic_set(&new_ns->count, 1);
	INIT_LIST_HEAD(&new_ns->list);
	init_waitqueue_head(&new_ns->poll);
	new_ns->event = 0;

	down_write(&namespace_sem);
	/* First pass: copy the tree topology */
	new_ns->root = copy_tree(mnt_ns->root, mnt_ns->root->mnt_root,
					CL_COPY_ALL | CL_EXPIRE);
	if (!new_ns->root) {
		up_write(&namespace_sem);
		kfree(new_ns);
		return ERR_PTR(-ENOMEM);
	}
	spin_lock(&vfsmount_lock);
	list_add_tail(&new_ns->list, &new_ns->root->mnt_list);
	spin_unlock(&vfsmount_lock);

	/*
	 * Second pass: switch the tsk->fs->* elements and mark new vfsmounts
	 * as belonging to new namespace.  We have already acquired a private
	 * fs_struct, so tsk->fs->lock is not needed.
	 */
	p = mnt_ns->root;
	q = new_ns->root;
	while (p) {
		q->mnt_ns = new_ns;
		if (fs) {
			if (p == fs->rootmnt) {
				rootmnt = p;
				fs->rootmnt = mntget(q);
			}
			if (p == fs->pwdmnt) {
				pwdmnt = p;
				fs->pwdmnt = mntget(q);
			}
			if (p == fs->altrootmnt) {
				altrootmnt = p;
				fs->altrootmnt = mntget(q);
			}
		}
		p = next_mnt(p, mnt_ns->root);
		q = next_mnt(q, new_ns->root);
	}
	up_write(&namespace_sem);

	if (rootmnt)
		mntput(rootmnt);
	if (pwdmnt)
		mntput(pwdmnt);
	if (altrootmnt)
		mntput(altrootmnt);

	return new_ns;
}
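/*
 * Illustrative sketch (not part of the original file): copy_mnt_ns() below is
 * reached when a task asks for its own mount namespace, e.g. from userspace:
 *
 *	unshare(CLONE_NEWNS);				      // current task
 *	clone(child_fn, stack, CLONE_NEWNS | SIGCHLD, NULL);  // new child
 *
 * Either path ends up duplicating the tree via dup_mnt_ns() above.
 */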
struct mnt_namespace *copy_mnt_ns(unsigned long flags, struct mnt_namespace *ns,
		struct fs_struct *new_fs)
{
	struct mnt_namespace *new_ns;

	BUG_ON(!ns);
	get_mnt_ns(ns);

	if (!(flags & CLONE_NEWNS))
		return ns;

	new_ns = dup_mnt_ns(ns, new_fs);

	put_mnt_ns(ns);
	return new_ns;
}

asmlinkage long sys_mount(char __user * dev_name, char __user * dir_name,
			  char __user * type, unsigned long flags,
			  void __user * data)
{
	int retval;
	unsigned long data_page;
	unsigned long type_page;
	unsigned long dev_page;
	char *dir_page;

	retval = copy_mount_options(type, &type_page);
	if (retval < 0)
		return retval;

	dir_page = getname(dir_name);
	retval = PTR_ERR(dir_page);
	if (IS_ERR(dir_page))
		goto out1;

	retval = copy_mount_options(dev_name, &dev_page);
	if (retval < 0)
		goto out2;

	retval = copy_mount_options(data, &data_page);
	if (retval < 0)
		goto out3;

	lock_kernel();
	retval = do_mount((char *)dev_page, dir_page, (char *)type_page,
			  flags, (void *)data_page);
	unlock_kernel();
	free_page(data_page);

out3:
	free_page(dev_page);
out2:
	putname(dir_page);
out1:
	free_page(type_page);
	return retval;
}

/*
 * Replace the fs->{rootmnt,root} with {mnt,dentry}. Put the old values.
 * It can block. Requires the big lock held.
 */
void set_fs_root(struct fs_struct *fs, struct vfsmount *mnt,
		 struct dentry *dentry)
{
	struct dentry *old_root;
	struct vfsmount *old_rootmnt;
	write_lock(&fs->lock);
	old_root = fs->root;
	old_rootmnt = fs->rootmnt;
	fs->rootmnt = mntget(mnt);
	fs->root = dget(dentry);
	write_unlock(&fs->lock);
	if (old_root) {
		dput(old_root);
		mntput(old_rootmnt);
	}
}

/*
 * Replace the fs->{pwdmnt,pwd} with {mnt,dentry}. Put the old values.
 * It can block. Requires the big lock held.
 */
void set_fs_pwd(struct fs_struct *fs, struct vfsmount *mnt,
		struct dentry *dentry)
{
	struct dentry *old_pwd;
	struct vfsmount *old_pwdmnt;

	write_lock(&fs->lock);
	old_pwd = fs->pwd;
	old_pwdmnt = fs->pwdmnt;
	fs->pwdmnt = mntget(mnt);
	fs->pwd = dget(dentry);
	write_unlock(&fs->lock);

	if (old_pwd) {
		dput(old_pwd);
		mntput(old_pwdmnt);
	}
}
static void chroot_fs_refs(struct nameidata *old_nd, struct nameidata *new_nd)
{
	struct task_struct *g, *p;
	struct fs_struct *fs;

	read_lock(&tasklist_lock);
	do_each_thread(g, p) {
		task_lock(p);
		fs = p->fs;
		if (fs) {
			atomic_inc(&fs->count);
			task_unlock(p);
			if (fs->root == old_nd->dentry
			    && fs->rootmnt == old_nd->mnt)
				set_fs_root(fs, new_nd->mnt, new_nd->dentry);
			if (fs->pwd == old_nd->dentry
			    && fs->pwdmnt == old_nd->mnt)
				set_fs_pwd(fs, new_nd->mnt, new_nd->dentry);
			put_fs_struct(fs);
		} else
			task_unlock(p);
	} while_each_thread(g, p);
	read_unlock(&tasklist_lock);
}

/*
 * pivot_root Semantics:
 * Moves the root file system of the current process to the directory put_old,
 * makes new_root as the new root file system of the current process, and sets
 * root/cwd of all processes which had them on the current root to new_root.
 *
 * Restrictions:
 * The new_root and put_old must be directories, and must not be on the
 * same file system as the current process root. The put_old must be
 * underneath new_root, i.e. adding a non-zero number of /.. to the string
 * pointed to by put_old must yield the same directory as new_root. No other
 * file system may be mounted on put_old. After all, new_root is a mountpoint.
 *
 * Also, the current root cannot be on the 'rootfs' (initial ramfs) filesystem.
 * See Documentation/filesystems/ramfs-rootfs-initramfs.txt for alternatives
 * in this situation.
 *
 * Notes:
 *  - we don't move root/cwd if they are not at the root (reason: if something
 *    cared enough to change them, it's probably wrong to force them elsewhere)
 *  - it's okay to pick a root that isn't the root of a file system, e.g.
 *    /nfs/my_root where /nfs is the mount point. It must be a mountpoint,
 *    though, so you may need to say mount --bind /nfs/my_root /nfs/my_root
 *    first.
 */
asmlinkage long sys_pivot_root(const char __user * new_root,
			       const char __user * put_old)
{
	struct vfsmount *tmp;
	struct nameidata new_nd, old_nd, parent_nd, root_parent, user_nd;
	int error;

	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;

	lock_kernel();

	error = __user_walk(new_root, LOOKUP_FOLLOW | LOOKUP_DIRECTORY,
			    &new_nd);
	if (error)
		goto out0;
	error = -EINVAL;
	if (!check_mnt(new_nd.mnt))
		goto out1;

	error = __user_walk(put_old, LOOKUP_FOLLOW | LOOKUP_DIRECTORY, &old_nd);
	if (error)
		goto out1;

	error = security_sb_pivotroot(&old_nd, &new_nd);
	if (error) {
		path_release(&old_nd);
		goto out1;
	}

	read_lock(&current->fs->lock);
	user_nd.mnt = mntget(current->fs->rootmnt);
	user_nd.dentry = dget(current->fs->root);
	read_unlock(&current->fs->lock);
	down_write(&namespace_sem);
	mutex_lock(&old_nd.dentry->d_inode->i_mutex);
	error = -EINVAL;
	if (IS_MNT_SHARED(old_nd.mnt) ||
		IS_MNT_SHARED(new_nd.mnt->mnt_parent) ||
		IS_MNT_SHARED(user_nd.mnt->mnt_parent))
		goto out2;
	if (!check_mnt(user_nd.mnt))
		goto out2;
	error = -ENOENT;
	if (IS_DEADDIR(new_nd.dentry->d_inode))
		goto out2;
	if (d_unhashed(new_nd.dentry) && !IS_ROOT(new_nd.dentry))
		goto out2;
	if (d_unhashed(old_nd.dentry) && !IS_ROOT(old_nd.dentry))
		goto out2;
	error = -EBUSY;
	if (new_nd.mnt == user_nd.mnt || old_nd.mnt == user_nd.mnt)
		goto out2; /* loop, on the same file system  */
	error = -EINVAL;
	if (user_nd.mnt->mnt_root != user_nd.dentry)
		goto out2; /* not a mountpoint */
	if (user_nd.mnt->mnt_parent == user_nd.mnt)
		goto out2; /* not attached */
	if (new_nd.mnt->mnt_root != new_nd.dentry)
		goto out2; /* not a mountpoint */
	if (new_nd.mnt->mnt_parent == new_nd.mnt)
		goto out2; /* not attached */
	tmp = old_nd.mnt; /* make sure we can reach put_old from new_root */
	spin_lock(&vfsmount_lock);
	if (tmp != new_nd.mnt) {
		for (;;) {
			if (tmp->mnt_parent == tmp)
				goto out3; /* already mounted on put_old */
			if (tmp->mnt_parent == new_nd.mnt)
				break;
			tmp = tmp->mnt_parent;
		}
		if (!is_subdir(tmp->mnt_mountpoint, new_nd.dentry))
			goto out3;
	} else if (!is_subdir(old_nd.dentry, new_nd.dentry))
		goto out3;
	detach_mnt(new_nd.mnt, &parent_nd);
	detach_mnt(user_nd.mnt, &root_parent);
	attach_mnt(user_nd.mnt, &old_nd);     /* mount old root on put_old */
	attach_mnt(new_nd.mnt, &root_parent); /* mount new_root on / */
	touch_mnt_namespace(current->nsproxy->mnt_ns);
	spin_unlock(&vfsmount_lock);
	chroot_fs_refs(&user_nd, &new_nd);
	security_sb_post_pivotroot(&user_nd, &new_nd);
	error = 0;
	path_release(&root_parent);
	path_release(&parent_nd);
out2:
	mutex_unlock(&old_nd.dentry->d_inode->i_mutex);
	up_write(&namespace_sem);
	path_release(&user_nd);
	path_release(&old_nd);
out1:
	path_release(&new_nd);
out0:
	unlock_kernel();
	return error;
out3:
	spin_unlock(&vfsmount_lock);
	goto out2;
}
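/*
 * Illustrative sketch (not part of the original file): the usual userspace
 * sequence around this syscall, e.g. from an initramfs handover (paths
 * hypothetical; glibc has no wrapper, hence syscall(2)):
 *
 *	chdir("/new_root");
 *	syscall(SYS_pivot_root, ".", "put_old");
 *	chroot(".");
 *	chdir("/");
 */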
namespace"); 1849 atomic_set(&ns->count, 1); 1850 INIT_LIST_HEAD(&ns->list); 1851 init_waitqueue_head(&ns->poll); 1852 ns->event = 0; 1853 list_add(&mnt->mnt_list, &ns->list); 1854 ns->root = mnt; 1855 mnt->mnt_ns = ns; 1856 1857 init_task.nsproxy->mnt_ns = ns; 1858 get_mnt_ns(ns); 1859 1860 set_fs_pwd(current->fs, ns->root, ns->root->mnt_root); 1861 set_fs_root(current->fs, ns->root, ns->root->mnt_root); 1862 } 1863 1864 void __init mnt_init(void) 1865 { 1866 unsigned u; 1867 int err; 1868 1869 init_rwsem(&namespace_sem); 1870 1871 mnt_cache = kmem_cache_create("mnt_cache", sizeof(struct vfsmount), 1872 0, SLAB_HWCACHE_ALIGN | SLAB_PANIC, NULL); 1873 1874 mount_hashtable = (struct list_head *)__get_free_page(GFP_ATOMIC); 1875 1876 if (!mount_hashtable) 1877 panic("Failed to allocate mount hash table\n"); 1878 1879 printk("Mount-cache hash table entries: %lu\n", HASH_SIZE); 1880 1881 for (u = 0; u < HASH_SIZE; u++) 1882 INIT_LIST_HEAD(&mount_hashtable[u]); 1883 1884 err = sysfs_init(); 1885 if (err) 1886 printk(KERN_WARNING "%s: sysfs_init error: %d\n", 1887 __FUNCTION__, err); 1888 fs_kobj = kobject_create_and_add("fs", NULL); 1889 if (!fs_kobj) 1890 printk(KERN_WARNING "%s: kobj create error\n", __FUNCTION__); 1891 init_rootfs(); 1892 init_mount_tree(); 1893 } 1894 1895 void __put_mnt_ns(struct mnt_namespace *ns) 1896 { 1897 struct vfsmount *root = ns->root; 1898 LIST_HEAD(umount_list); 1899 ns->root = NULL; 1900 spin_unlock(&vfsmount_lock); 1901 down_write(&namespace_sem); 1902 spin_lock(&vfsmount_lock); 1903 umount_tree(root, 0, &umount_list); 1904 spin_unlock(&vfsmount_lock); 1905 up_write(&namespace_sem); 1906 release_mounts(&umount_list); 1907 kfree(ns); 1908 } 1909