1 /* 2 * linux/fs/namei.c 3 * 4 * Copyright (C) 1991, 1992 Linus Torvalds 5 */ 6 7 /* 8 * Some corrections by tytso. 9 */ 10 11 /* [Feb 1997 T. Schoebel-Theuer] Complete rewrite of the pathname 12 * lookup logic. 13 */ 14 /* [Feb-Apr 2000, AV] Rewrite to the new namespace architecture. 15 */ 16 17 #include <linux/init.h> 18 #include <linux/module.h> 19 #include <linux/slab.h> 20 #include <linux/fs.h> 21 #include <linux/namei.h> 22 #include <linux/pagemap.h> 23 #include <linux/fsnotify.h> 24 #include <linux/personality.h> 25 #include <linux/security.h> 26 #include <linux/ima.h> 27 #include <linux/syscalls.h> 28 #include <linux/mount.h> 29 #include <linux/audit.h> 30 #include <linux/capability.h> 31 #include <linux/file.h> 32 #include <linux/fcntl.h> 33 #include <linux/device_cgroup.h> 34 #include <linux/fs_struct.h> 35 #include <linux/posix_acl.h> 36 #include <asm/uaccess.h> 37 38 #include "internal.h" 39 #include "mount.h" 40 41 /* [Feb-1997 T. Schoebel-Theuer] 42 * Fundamental changes in the pathname lookup mechanisms (namei) 43 * were necessary because of omirr. The reason is that omirr needs 44 * to know the _real_ pathname, not the user-supplied one, in case 45 * of symlinks (and also when transname replacements occur). 46 * 47 * The new code replaces the old recursive symlink resolution with 48 * an iterative one (in case of non-nested symlink chains). It does 49 * this with calls to <fs>_follow_link(). 50 * As a side effect, dir_namei(), _namei() and follow_link() are now 51 * replaced with a single function lookup_dentry() that can handle all 52 * the special cases of the former code. 53 * 54 * With the new dcache, the pathname is stored at each inode, at least as 55 * long as the refcount of the inode is positive. As a side effect, the 56 * size of the dcache depends on the inode cache and thus is dynamic. 57 * 58 * [29-Apr-1998 C. Scott Ananian] Updated above description of symlink 59 * resolution to correspond with current state of the code. 
60 * 61 * Note that the symlink resolution is not *completely* iterative. 62 * There is still a significant amount of tail- and mid- recursion in 63 * the algorithm. Also, note that <fs>_readlink() is not used in 64 * lookup_dentry(): lookup_dentry() on the result of <fs>_readlink() 65 * may return different results than <fs>_follow_link(). Many virtual 66 * filesystems (including /proc) exhibit this behavior. 67 */ 68 69 /* [24-Feb-97 T. Schoebel-Theuer] Side effects caused by new implementation: 70 * New symlink semantics: when open() is called with flags O_CREAT | O_EXCL 71 * and the name already exists in form of a symlink, try to create the new 72 * name indicated by the symlink. The old code always complained that the 73 * name already exists, due to not following the symlink even if its target 74 * is nonexistent. The new semantics affects also mknod() and link() when 75 * the name is a symlink pointing to a non-existent name. 76 * 77 * I don't know which semantics is the right one, since I have no access 78 * to standards. But I found by trial that HP-UX 9.0 has the full "new" 79 * semantics implemented, while SunOS 4.1.1 and Solaris (SunOS 5.4) have the 80 * "old" one. Personally, I think the new semantics is much more logical. 81 * Note that "ln old new" where "new" is a symlink pointing to a non-existing 82 * file does succeed in both HP-UX and SunOs, but not in Solaris 83 * and in the old Linux semantics. 84 */ 85 86 /* [16-Dec-97 Kevin Buhr] For security reasons, we change some symlink 87 * semantics. See the comments in "open_namei" and "do_link" below. 88 * 89 * [10-Sep-98 Alan Modra] Another symlink change. 90 */ 91 92 /* [Feb-Apr 2000 AV] Complete rewrite. Rules for symlinks: 93 * inside the path - always follow. 94 * in the last component in creation/removal/renaming - never follow. 95 * if LOOKUP_FOLLOW passed - follow. 96 * if the pathname has trailing slashes - follow. 97 * otherwise - don't follow. 98 * (applied in that order). 
99 * 100 * [Jun 2000 AV] Inconsistent behaviour of open() in case if flags==O_CREAT 101 * restored for 2.4. This is the last surviving part of old 4.2BSD bug. 102 * During the 2.4 we need to fix the userland stuff depending on it - 103 * hopefully we will be able to get rid of that wart in 2.5. So far only 104 * XEmacs seems to be relying on it... 105 */ 106 /* 107 * [Sep 2001 AV] Single-semaphore locking scheme (kudos to David Holland) 108 * implemented. Let's see if raised priority of ->s_vfs_rename_mutex gives 109 * any extra contention... 110 */ 111 112 /* In order to reduce some races, while at the same time doing additional 113 * checking and hopefully speeding things up, we copy filenames to the 114 * kernel data space before using them.. 115 * 116 * POSIX.1 2.4: an empty pathname is invalid (ENOENT). 117 * PATH_MAX includes the nul terminator --RR. 118 */ 119 static int do_getname(const char __user *filename, char *page) 120 { 121 int retval; 122 unsigned long len = PATH_MAX; 123 124 if (!segment_eq(get_fs(), KERNEL_DS)) { 125 if ((unsigned long) filename >= TASK_SIZE) 126 return -EFAULT; 127 if (TASK_SIZE - (unsigned long) filename < PATH_MAX) 128 len = TASK_SIZE - (unsigned long) filename; 129 } 130 131 retval = strncpy_from_user(page, filename, len); 132 if (retval > 0) { 133 if (retval < len) 134 return 0; 135 return -ENAMETOOLONG; 136 } else if (!retval) 137 retval = -ENOENT; 138 return retval; 139 } 140 141 static char *getname_flags(const char __user *filename, int flags, int *empty) 142 { 143 char *tmp, *result; 144 145 result = ERR_PTR(-ENOMEM); 146 tmp = __getname(); 147 if (tmp) { 148 int retval = do_getname(filename, tmp); 149 150 result = tmp; 151 if (retval < 0) { 152 if (retval == -ENOENT && empty) 153 *empty = 1; 154 if (retval != -ENOENT || !(flags & LOOKUP_EMPTY)) { 155 __putname(tmp); 156 result = ERR_PTR(retval); 157 } 158 } 159 } 160 audit_getname(result); 161 return result; 162 } 163 164 char *getname(const char __user * filename) 165 
{ 166 return getname_flags(filename, 0, 0); 167 } 168 169 #ifdef CONFIG_AUDITSYSCALL 170 void putname(const char *name) 171 { 172 if (unlikely(!audit_dummy_context())) 173 audit_putname(name); 174 else 175 __putname(name); 176 } 177 EXPORT_SYMBOL(putname); 178 #endif 179 180 static int check_acl(struct inode *inode, int mask) 181 { 182 #ifdef CONFIG_FS_POSIX_ACL 183 struct posix_acl *acl; 184 185 if (mask & MAY_NOT_BLOCK) { 186 acl = get_cached_acl_rcu(inode, ACL_TYPE_ACCESS); 187 if (!acl) 188 return -EAGAIN; 189 /* no ->get_acl() calls in RCU mode... */ 190 if (acl == ACL_NOT_CACHED) 191 return -ECHILD; 192 return posix_acl_permission(inode, acl, mask & ~MAY_NOT_BLOCK); 193 } 194 195 acl = get_cached_acl(inode, ACL_TYPE_ACCESS); 196 197 /* 198 * A filesystem can force a ACL callback by just never filling the 199 * ACL cache. But normally you'd fill the cache either at inode 200 * instantiation time, or on the first ->get_acl call. 201 * 202 * If the filesystem doesn't have a get_acl() function at all, we'll 203 * just create the negative cache entry. 
204 */ 205 if (acl == ACL_NOT_CACHED) { 206 if (inode->i_op->get_acl) { 207 acl = inode->i_op->get_acl(inode, ACL_TYPE_ACCESS); 208 if (IS_ERR(acl)) 209 return PTR_ERR(acl); 210 } else { 211 set_cached_acl(inode, ACL_TYPE_ACCESS, NULL); 212 return -EAGAIN; 213 } 214 } 215 216 if (acl) { 217 int error = posix_acl_permission(inode, acl, mask); 218 posix_acl_release(acl); 219 return error; 220 } 221 #endif 222 223 return -EAGAIN; 224 } 225 226 /* 227 * This does the basic permission checking 228 */ 229 static int acl_permission_check(struct inode *inode, int mask) 230 { 231 unsigned int mode = inode->i_mode; 232 233 if (current_user_ns() != inode_userns(inode)) 234 goto other_perms; 235 236 if (likely(current_fsuid() == inode->i_uid)) 237 mode >>= 6; 238 else { 239 if (IS_POSIXACL(inode) && (mode & S_IRWXG)) { 240 int error = check_acl(inode, mask); 241 if (error != -EAGAIN) 242 return error; 243 } 244 245 if (in_group_p(inode->i_gid)) 246 mode >>= 3; 247 } 248 249 other_perms: 250 /* 251 * If the DACs are ok we don't need any capability check. 252 */ 253 if ((mask & ~mode & (MAY_READ | MAY_WRITE | MAY_EXEC)) == 0) 254 return 0; 255 return -EACCES; 256 } 257 258 /** 259 * generic_permission - check for access rights on a Posix-like filesystem 260 * @inode: inode to check access rights for 261 * @mask: right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC, ...) 262 * 263 * Used to check for read/write/execute permissions on a file. 264 * We use "fsuid" for this, letting us set arbitrary permissions 265 * for filesystem access without changing the "normal" uids which 266 * are used for other things. 267 * 268 * generic_permission is rcu-walk aware. It returns -ECHILD in case an rcu-walk 269 * request cannot be satisfied (eg. requires blocking or too much complexity). 270 * It would then be called again in ref-walk mode. 271 */ 272 int generic_permission(struct inode *inode, int mask) 273 { 274 int ret; 275 276 /* 277 * Do the basic permission checks. 
278 */ 279 ret = acl_permission_check(inode, mask); 280 if (ret != -EACCES) 281 return ret; 282 283 if (S_ISDIR(inode->i_mode)) { 284 /* DACs are overridable for directories */ 285 if (ns_capable(inode_userns(inode), CAP_DAC_OVERRIDE)) 286 return 0; 287 if (!(mask & MAY_WRITE)) 288 if (ns_capable(inode_userns(inode), CAP_DAC_READ_SEARCH)) 289 return 0; 290 return -EACCES; 291 } 292 /* 293 * Read/write DACs are always overridable. 294 * Executable DACs are overridable when there is 295 * at least one exec bit set. 296 */ 297 if (!(mask & MAY_EXEC) || (inode->i_mode & S_IXUGO)) 298 if (ns_capable(inode_userns(inode), CAP_DAC_OVERRIDE)) 299 return 0; 300 301 /* 302 * Searching includes executable on directories, else just read. 303 */ 304 mask &= MAY_READ | MAY_WRITE | MAY_EXEC; 305 if (mask == MAY_READ) 306 if (ns_capable(inode_userns(inode), CAP_DAC_READ_SEARCH)) 307 return 0; 308 309 return -EACCES; 310 } 311 312 /* 313 * We _really_ want to just do "generic_permission()" without 314 * even looking at the inode->i_op values. So we keep a cache 315 * flag in inode->i_opflags, that says "this has not special 316 * permission function, use the fast case". 317 */ 318 static inline int do_inode_permission(struct inode *inode, int mask) 319 { 320 if (unlikely(!(inode->i_opflags & IOP_FASTPERM))) { 321 if (likely(inode->i_op->permission)) 322 return inode->i_op->permission(inode, mask); 323 324 /* This gets set once for the inode lifetime */ 325 spin_lock(&inode->i_lock); 326 inode->i_opflags |= IOP_FASTPERM; 327 spin_unlock(&inode->i_lock); 328 } 329 return generic_permission(inode, mask); 330 } 331 332 /** 333 * inode_permission - check for access rights to a given inode 334 * @inode: inode to check permission on 335 * @mask: right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC, ...) 336 * 337 * Used to check for read/write/execute permissions on an inode. 
338 * We use "fsuid" for this, letting us set arbitrary permissions 339 * for filesystem access without changing the "normal" uids which 340 * are used for other things. 341 * 342 * When checking for MAY_APPEND, MAY_WRITE must also be set in @mask. 343 */ 344 int inode_permission(struct inode *inode, int mask) 345 { 346 int retval; 347 348 if (unlikely(mask & MAY_WRITE)) { 349 umode_t mode = inode->i_mode; 350 351 /* 352 * Nobody gets write access to a read-only fs. 353 */ 354 if (IS_RDONLY(inode) && 355 (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode))) 356 return -EROFS; 357 358 /* 359 * Nobody gets write access to an immutable file. 360 */ 361 if (IS_IMMUTABLE(inode)) 362 return -EACCES; 363 } 364 365 retval = do_inode_permission(inode, mask); 366 if (retval) 367 return retval; 368 369 retval = devcgroup_inode_permission(inode, mask); 370 if (retval) 371 return retval; 372 373 return security_inode_permission(inode, mask); 374 } 375 376 /** 377 * path_get - get a reference to a path 378 * @path: path to get the reference to 379 * 380 * Given a path increment the reference count to the dentry and the vfsmount. 381 */ 382 void path_get(struct path *path) 383 { 384 mntget(path->mnt); 385 dget(path->dentry); 386 } 387 EXPORT_SYMBOL(path_get); 388 389 /** 390 * path_put - put a reference to a path 391 * @path: path to put the reference to 392 * 393 * Given a path decrement the reference count to the dentry and the vfsmount. 394 */ 395 void path_put(struct path *path) 396 { 397 dput(path->dentry); 398 mntput(path->mnt); 399 } 400 EXPORT_SYMBOL(path_put); 401 402 /* 403 * Path walking has 2 modes, rcu-walk and ref-walk (see 404 * Documentation/filesystems/path-lookup.txt). In situations when we can't 405 * continue in RCU mode, we attempt to drop out of rcu-walk mode and grab 406 * normal reference counts on dentries and vfsmounts to transition to rcu-walk 407 * mode. 
Refcounts are grabbed at the last known good point before rcu-walk 408 * got stuck, so ref-walk may continue from there. If this is not successful 409 * (eg. a seqcount has changed), then failure is returned and it's up to caller 410 * to restart the path walk from the beginning in ref-walk mode. 411 */ 412 413 /** 414 * unlazy_walk - try to switch to ref-walk mode. 415 * @nd: nameidata pathwalk data 416 * @dentry: child of nd->path.dentry or NULL 417 * Returns: 0 on success, -ECHILD on failure 418 * 419 * unlazy_walk attempts to legitimize the current nd->path, nd->root and dentry 420 * for ref-walk mode. @dentry must be a path found by a do_lookup call on 421 * @nd or NULL. Must be called from rcu-walk context. 422 */ 423 static int unlazy_walk(struct nameidata *nd, struct dentry *dentry) 424 { 425 struct fs_struct *fs = current->fs; 426 struct dentry *parent = nd->path.dentry; 427 int want_root = 0; 428 429 BUG_ON(!(nd->flags & LOOKUP_RCU)); 430 if (nd->root.mnt && !(nd->flags & LOOKUP_ROOT)) { 431 want_root = 1; 432 spin_lock(&fs->lock); 433 if (nd->root.mnt != fs->root.mnt || 434 nd->root.dentry != fs->root.dentry) 435 goto err_root; 436 } 437 spin_lock(&parent->d_lock); 438 if (!dentry) { 439 if (!__d_rcu_to_refcount(parent, nd->seq)) 440 goto err_parent; 441 BUG_ON(nd->inode != parent->d_inode); 442 } else { 443 if (dentry->d_parent != parent) 444 goto err_parent; 445 spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED); 446 if (!__d_rcu_to_refcount(dentry, nd->seq)) 447 goto err_child; 448 /* 449 * If the sequence check on the child dentry passed, then 450 * the child has not been removed from its parent. This 451 * means the parent dentry must be valid and able to take 452 * a reference at this point. 
453 */ 454 BUG_ON(!IS_ROOT(dentry) && dentry->d_parent != parent); 455 BUG_ON(!parent->d_count); 456 parent->d_count++; 457 spin_unlock(&dentry->d_lock); 458 } 459 spin_unlock(&parent->d_lock); 460 if (want_root) { 461 path_get(&nd->root); 462 spin_unlock(&fs->lock); 463 } 464 mntget(nd->path.mnt); 465 466 rcu_read_unlock(); 467 br_read_unlock(vfsmount_lock); 468 nd->flags &= ~LOOKUP_RCU; 469 return 0; 470 471 err_child: 472 spin_unlock(&dentry->d_lock); 473 err_parent: 474 spin_unlock(&parent->d_lock); 475 err_root: 476 if (want_root) 477 spin_unlock(&fs->lock); 478 return -ECHILD; 479 } 480 481 /** 482 * release_open_intent - free up open intent resources 483 * @nd: pointer to nameidata 484 */ 485 void release_open_intent(struct nameidata *nd) 486 { 487 struct file *file = nd->intent.open.file; 488 489 if (file && !IS_ERR(file)) { 490 if (file->f_path.dentry == NULL) 491 put_filp(file); 492 else 493 fput(file); 494 } 495 } 496 497 static inline int d_revalidate(struct dentry *dentry, struct nameidata *nd) 498 { 499 return dentry->d_op->d_revalidate(dentry, nd); 500 } 501 502 /** 503 * complete_walk - successful completion of path walk 504 * @nd: pointer nameidata 505 * 506 * If we had been in RCU mode, drop out of it and legitimize nd->path. 507 * Revalidate the final result, unless we'd already done that during 508 * the path walk or the filesystem doesn't ask for it. Return 0 on 509 * success, -error on failure. In case of failure caller does not 510 * need to drop nd->path. 
511 */ 512 static int complete_walk(struct nameidata *nd) 513 { 514 struct dentry *dentry = nd->path.dentry; 515 int status; 516 517 if (nd->flags & LOOKUP_RCU) { 518 nd->flags &= ~LOOKUP_RCU; 519 if (!(nd->flags & LOOKUP_ROOT)) 520 nd->root.mnt = NULL; 521 spin_lock(&dentry->d_lock); 522 if (unlikely(!__d_rcu_to_refcount(dentry, nd->seq))) { 523 spin_unlock(&dentry->d_lock); 524 rcu_read_unlock(); 525 br_read_unlock(vfsmount_lock); 526 return -ECHILD; 527 } 528 BUG_ON(nd->inode != dentry->d_inode); 529 spin_unlock(&dentry->d_lock); 530 mntget(nd->path.mnt); 531 rcu_read_unlock(); 532 br_read_unlock(vfsmount_lock); 533 } 534 535 if (likely(!(nd->flags & LOOKUP_JUMPED))) 536 return 0; 537 538 if (likely(!(dentry->d_flags & DCACHE_OP_REVALIDATE))) 539 return 0; 540 541 if (likely(!(dentry->d_sb->s_type->fs_flags & FS_REVAL_DOT))) 542 return 0; 543 544 /* Note: we do not d_invalidate() */ 545 status = d_revalidate(dentry, nd); 546 if (status > 0) 547 return 0; 548 549 if (!status) 550 status = -ESTALE; 551 552 path_put(&nd->path); 553 return status; 554 } 555 556 static __always_inline void set_root(struct nameidata *nd) 557 { 558 if (!nd->root.mnt) 559 get_fs_root(current->fs, &nd->root); 560 } 561 562 static int link_path_walk(const char *, struct nameidata *); 563 564 static __always_inline void set_root_rcu(struct nameidata *nd) 565 { 566 if (!nd->root.mnt) { 567 struct fs_struct *fs = current->fs; 568 unsigned seq; 569 570 do { 571 seq = read_seqcount_begin(&fs->seq); 572 nd->root = fs->root; 573 nd->seq = __read_seqcount_begin(&nd->root.dentry->d_seq); 574 } while (read_seqcount_retry(&fs->seq, seq)); 575 } 576 } 577 578 static __always_inline int __vfs_follow_link(struct nameidata *nd, const char *link) 579 { 580 int ret; 581 582 if (IS_ERR(link)) 583 goto fail; 584 585 if (*link == '/') { 586 set_root(nd); 587 path_put(&nd->path); 588 nd->path = nd->root; 589 path_get(&nd->root); 590 nd->flags |= LOOKUP_JUMPED; 591 } 592 nd->inode = nd->path.dentry->d_inode; 
593 594 ret = link_path_walk(link, nd); 595 return ret; 596 fail: 597 path_put(&nd->path); 598 return PTR_ERR(link); 599 } 600 601 static void path_put_conditional(struct path *path, struct nameidata *nd) 602 { 603 dput(path->dentry); 604 if (path->mnt != nd->path.mnt) 605 mntput(path->mnt); 606 } 607 608 static inline void path_to_nameidata(const struct path *path, 609 struct nameidata *nd) 610 { 611 if (!(nd->flags & LOOKUP_RCU)) { 612 dput(nd->path.dentry); 613 if (nd->path.mnt != path->mnt) 614 mntput(nd->path.mnt); 615 } 616 nd->path.mnt = path->mnt; 617 nd->path.dentry = path->dentry; 618 } 619 620 static inline void put_link(struct nameidata *nd, struct path *link, void *cookie) 621 { 622 struct inode *inode = link->dentry->d_inode; 623 if (!IS_ERR(cookie) && inode->i_op->put_link) 624 inode->i_op->put_link(link->dentry, nd, cookie); 625 path_put(link); 626 } 627 628 static __always_inline int 629 follow_link(struct path *link, struct nameidata *nd, void **p) 630 { 631 int error; 632 struct dentry *dentry = link->dentry; 633 634 BUG_ON(nd->flags & LOOKUP_RCU); 635 636 if (link->mnt == nd->path.mnt) 637 mntget(link->mnt); 638 639 if (unlikely(current->total_link_count >= 40)) { 640 *p = ERR_PTR(-ELOOP); /* no ->put_link(), please */ 641 path_put(&nd->path); 642 return -ELOOP; 643 } 644 cond_resched(); 645 current->total_link_count++; 646 647 touch_atime(link->mnt, dentry); 648 nd_set_link(nd, NULL); 649 650 error = security_inode_follow_link(link->dentry, nd); 651 if (error) { 652 *p = ERR_PTR(error); /* no ->put_link(), please */ 653 path_put(&nd->path); 654 return error; 655 } 656 657 nd->last_type = LAST_BIND; 658 *p = dentry->d_inode->i_op->follow_link(dentry, nd); 659 error = PTR_ERR(*p); 660 if (!IS_ERR(*p)) { 661 char *s = nd_get_link(nd); 662 error = 0; 663 if (s) 664 error = __vfs_follow_link(nd, s); 665 else if (nd->last_type == LAST_BIND) { 666 nd->flags |= LOOKUP_JUMPED; 667 nd->inode = nd->path.dentry->d_inode; 668 if 
(nd->inode->i_op->follow_link) { 669 /* stepped on a _really_ weird one */ 670 path_put(&nd->path); 671 error = -ELOOP; 672 } 673 } 674 } 675 return error; 676 } 677 678 static int follow_up_rcu(struct path *path) 679 { 680 struct mount *mnt = real_mount(path->mnt); 681 struct mount *parent; 682 struct dentry *mountpoint; 683 684 parent = mnt->mnt_parent; 685 if (&parent->mnt == path->mnt) 686 return 0; 687 mountpoint = mnt->mnt_mountpoint; 688 path->dentry = mountpoint; 689 path->mnt = &parent->mnt; 690 return 1; 691 } 692 693 int follow_up(struct path *path) 694 { 695 struct mount *mnt = real_mount(path->mnt); 696 struct mount *parent; 697 struct dentry *mountpoint; 698 699 br_read_lock(vfsmount_lock); 700 parent = mnt->mnt_parent; 701 if (&parent->mnt == path->mnt) { 702 br_read_unlock(vfsmount_lock); 703 return 0; 704 } 705 mntget(&parent->mnt); 706 mountpoint = dget(mnt->mnt_mountpoint); 707 br_read_unlock(vfsmount_lock); 708 dput(path->dentry); 709 path->dentry = mountpoint; 710 mntput(path->mnt); 711 path->mnt = &parent->mnt; 712 return 1; 713 } 714 715 /* 716 * Perform an automount 717 * - return -EISDIR to tell follow_managed() to stop and return the path we 718 * were called with. 719 */ 720 static int follow_automount(struct path *path, unsigned flags, 721 bool *need_mntput) 722 { 723 struct vfsmount *mnt; 724 int err; 725 726 if (!path->dentry->d_op || !path->dentry->d_op->d_automount) 727 return -EREMOTE; 728 729 /* We don't want to mount if someone's just doing a stat - 730 * unless they're stat'ing a directory and appended a '/' to 731 * the name. 732 * 733 * We do, however, want to mount if someone wants to open or 734 * create a file of any type under the mountpoint, wants to 735 * traverse through the mountpoint or wants to open the 736 * mounted directory. Also, autofs may mark negative dentries 737 * as being automount points. These will need the attentions 738 * of the daemon to instantiate them before they can be used. 
739 */ 740 if (!(flags & (LOOKUP_PARENT | LOOKUP_DIRECTORY | 741 LOOKUP_OPEN | LOOKUP_CREATE | LOOKUP_AUTOMOUNT)) && 742 path->dentry->d_inode) 743 return -EISDIR; 744 745 current->total_link_count++; 746 if (current->total_link_count >= 40) 747 return -ELOOP; 748 749 mnt = path->dentry->d_op->d_automount(path); 750 if (IS_ERR(mnt)) { 751 /* 752 * The filesystem is allowed to return -EISDIR here to indicate 753 * it doesn't want to automount. For instance, autofs would do 754 * this so that its userspace daemon can mount on this dentry. 755 * 756 * However, we can only permit this if it's a terminal point in 757 * the path being looked up; if it wasn't then the remainder of 758 * the path is inaccessible and we should say so. 759 */ 760 if (PTR_ERR(mnt) == -EISDIR && (flags & LOOKUP_PARENT)) 761 return -EREMOTE; 762 return PTR_ERR(mnt); 763 } 764 765 if (!mnt) /* mount collision */ 766 return 0; 767 768 if (!*need_mntput) { 769 /* lock_mount() may release path->mnt on error */ 770 mntget(path->mnt); 771 *need_mntput = true; 772 } 773 err = finish_automount(mnt, path); 774 775 switch (err) { 776 case -EBUSY: 777 /* Someone else made a mount here whilst we were busy */ 778 return 0; 779 case 0: 780 path_put(path); 781 path->mnt = mnt; 782 path->dentry = dget(mnt->mnt_root); 783 return 0; 784 default: 785 return err; 786 } 787 788 } 789 790 /* 791 * Handle a dentry that is managed in some way. 792 * - Flagged for transit management (autofs) 793 * - Flagged as mountpoint 794 * - Flagged as automount point 795 * 796 * This may only be called in refwalk mode. 
797 * 798 * Serialization is taken care of in namespace.c 799 */ 800 static int follow_managed(struct path *path, unsigned flags) 801 { 802 struct vfsmount *mnt = path->mnt; /* held by caller, must be left alone */ 803 unsigned managed; 804 bool need_mntput = false; 805 int ret = 0; 806 807 /* Given that we're not holding a lock here, we retain the value in a 808 * local variable for each dentry as we look at it so that we don't see 809 * the components of that value change under us */ 810 while (managed = ACCESS_ONCE(path->dentry->d_flags), 811 managed &= DCACHE_MANAGED_DENTRY, 812 unlikely(managed != 0)) { 813 /* Allow the filesystem to manage the transit without i_mutex 814 * being held. */ 815 if (managed & DCACHE_MANAGE_TRANSIT) { 816 BUG_ON(!path->dentry->d_op); 817 BUG_ON(!path->dentry->d_op->d_manage); 818 ret = path->dentry->d_op->d_manage(path->dentry, false); 819 if (ret < 0) 820 break; 821 } 822 823 /* Transit to a mounted filesystem. */ 824 if (managed & DCACHE_MOUNTED) { 825 struct vfsmount *mounted = lookup_mnt(path); 826 if (mounted) { 827 dput(path->dentry); 828 if (need_mntput) 829 mntput(path->mnt); 830 path->mnt = mounted; 831 path->dentry = dget(mounted->mnt_root); 832 need_mntput = true; 833 continue; 834 } 835 836 /* Something is mounted on this dentry in another 837 * namespace and/or whatever was mounted there in this 838 * namespace got unmounted before we managed to get the 839 * vfsmount_lock */ 840 } 841 842 /* Handle an automount point */ 843 if (managed & DCACHE_NEED_AUTOMOUNT) { 844 ret = follow_automount(path, flags, &need_mntput); 845 if (ret < 0) 846 break; 847 continue; 848 } 849 850 /* We didn't change the current path point */ 851 break; 852 } 853 854 if (need_mntput && path->mnt == mnt) 855 mntput(path->mnt); 856 if (ret == -EISDIR) 857 ret = 0; 858 return ret < 0 ? 
ret : need_mntput; 859 } 860 861 int follow_down_one(struct path *path) 862 { 863 struct vfsmount *mounted; 864 865 mounted = lookup_mnt(path); 866 if (mounted) { 867 dput(path->dentry); 868 mntput(path->mnt); 869 path->mnt = mounted; 870 path->dentry = dget(mounted->mnt_root); 871 return 1; 872 } 873 return 0; 874 } 875 876 static inline bool managed_dentry_might_block(struct dentry *dentry) 877 { 878 return (dentry->d_flags & DCACHE_MANAGE_TRANSIT && 879 dentry->d_op->d_manage(dentry, true) < 0); 880 } 881 882 /* 883 * Try to skip to top of mountpoint pile in rcuwalk mode. Fail if 884 * we meet a managed dentry that would need blocking. 885 */ 886 static bool __follow_mount_rcu(struct nameidata *nd, struct path *path, 887 struct inode **inode) 888 { 889 for (;;) { 890 struct mount *mounted; 891 /* 892 * Don't forget we might have a non-mountpoint managed dentry 893 * that wants to block transit. 894 */ 895 if (unlikely(managed_dentry_might_block(path->dentry))) 896 return false; 897 898 if (!d_mountpoint(path->dentry)) 899 break; 900 901 mounted = __lookup_mnt(path->mnt, path->dentry, 1); 902 if (!mounted) 903 break; 904 path->mnt = &mounted->mnt; 905 path->dentry = mounted->mnt.mnt_root; 906 nd->flags |= LOOKUP_JUMPED; 907 nd->seq = read_seqcount_begin(&path->dentry->d_seq); 908 /* 909 * Update the inode too. We don't need to re-check the 910 * dentry sequence number here after this d_inode read, 911 * because a mount-point is always pinned. 
912 */ 913 *inode = path->dentry->d_inode; 914 } 915 return true; 916 } 917 918 static void follow_mount_rcu(struct nameidata *nd) 919 { 920 while (d_mountpoint(nd->path.dentry)) { 921 struct mount *mounted; 922 mounted = __lookup_mnt(nd->path.mnt, nd->path.dentry, 1); 923 if (!mounted) 924 break; 925 nd->path.mnt = &mounted->mnt; 926 nd->path.dentry = mounted->mnt.mnt_root; 927 nd->seq = read_seqcount_begin(&nd->path.dentry->d_seq); 928 } 929 } 930 931 static int follow_dotdot_rcu(struct nameidata *nd) 932 { 933 set_root_rcu(nd); 934 935 while (1) { 936 if (nd->path.dentry == nd->root.dentry && 937 nd->path.mnt == nd->root.mnt) { 938 break; 939 } 940 if (nd->path.dentry != nd->path.mnt->mnt_root) { 941 struct dentry *old = nd->path.dentry; 942 struct dentry *parent = old->d_parent; 943 unsigned seq; 944 945 seq = read_seqcount_begin(&parent->d_seq); 946 if (read_seqcount_retry(&old->d_seq, nd->seq)) 947 goto failed; 948 nd->path.dentry = parent; 949 nd->seq = seq; 950 break; 951 } 952 if (!follow_up_rcu(&nd->path)) 953 break; 954 nd->seq = read_seqcount_begin(&nd->path.dentry->d_seq); 955 } 956 follow_mount_rcu(nd); 957 nd->inode = nd->path.dentry->d_inode; 958 return 0; 959 960 failed: 961 nd->flags &= ~LOOKUP_RCU; 962 if (!(nd->flags & LOOKUP_ROOT)) 963 nd->root.mnt = NULL; 964 rcu_read_unlock(); 965 br_read_unlock(vfsmount_lock); 966 return -ECHILD; 967 } 968 969 /* 970 * Follow down to the covering mount currently visible to userspace. At each 971 * point, the filesystem owning that dentry may be queried as to whether the 972 * caller is permitted to proceed or not. 973 */ 974 int follow_down(struct path *path) 975 { 976 unsigned managed; 977 int ret; 978 979 while (managed = ACCESS_ONCE(path->dentry->d_flags), 980 unlikely(managed & DCACHE_MANAGED_DENTRY)) { 981 /* Allow the filesystem to manage the transit without i_mutex 982 * being held. 983 * 984 * We indicate to the filesystem if someone is trying to mount 985 * something here. 
This gives autofs the chance to deny anyone 986 * other than its daemon the right to mount on its 987 * superstructure. 988 * 989 * The filesystem may sleep at this point. 990 */ 991 if (managed & DCACHE_MANAGE_TRANSIT) { 992 BUG_ON(!path->dentry->d_op); 993 BUG_ON(!path->dentry->d_op->d_manage); 994 ret = path->dentry->d_op->d_manage( 995 path->dentry, false); 996 if (ret < 0) 997 return ret == -EISDIR ? 0 : ret; 998 } 999 1000 /* Transit to a mounted filesystem. */ 1001 if (managed & DCACHE_MOUNTED) { 1002 struct vfsmount *mounted = lookup_mnt(path); 1003 if (!mounted) 1004 break; 1005 dput(path->dentry); 1006 mntput(path->mnt); 1007 path->mnt = mounted; 1008 path->dentry = dget(mounted->mnt_root); 1009 continue; 1010 } 1011 1012 /* Don't handle automount points here */ 1013 break; 1014 } 1015 return 0; 1016 } 1017 1018 /* 1019 * Skip to top of mountpoint pile in refwalk mode for follow_dotdot() 1020 */ 1021 static void follow_mount(struct path *path) 1022 { 1023 while (d_mountpoint(path->dentry)) { 1024 struct vfsmount *mounted = lookup_mnt(path); 1025 if (!mounted) 1026 break; 1027 dput(path->dentry); 1028 mntput(path->mnt); 1029 path->mnt = mounted; 1030 path->dentry = dget(mounted->mnt_root); 1031 } 1032 } 1033 1034 static void follow_dotdot(struct nameidata *nd) 1035 { 1036 set_root(nd); 1037 1038 while(1) { 1039 struct dentry *old = nd->path.dentry; 1040 1041 if (nd->path.dentry == nd->root.dentry && 1042 nd->path.mnt == nd->root.mnt) { 1043 break; 1044 } 1045 if (nd->path.dentry != nd->path.mnt->mnt_root) { 1046 /* rare case of legitimate dget_parent()... */ 1047 nd->path.dentry = dget_parent(nd->path.dentry); 1048 dput(old); 1049 break; 1050 } 1051 if (!follow_up(&nd->path)) 1052 break; 1053 } 1054 follow_mount(&nd->path); 1055 nd->inode = nd->path.dentry->d_inode; 1056 } 1057 1058 /* 1059 * Allocate a dentry with name and parent, and perform a parent 1060 * directory ->lookup on it. Returns the new dentry, or ERR_PTR 1061 * on error. 
parent->d_inode->i_mutex must be held. d_lookup must 1062 * have verified that no child exists while under i_mutex. 1063 */ 1064 static struct dentry *d_alloc_and_lookup(struct dentry *parent, 1065 struct qstr *name, struct nameidata *nd) 1066 { 1067 struct inode *inode = parent->d_inode; 1068 struct dentry *dentry; 1069 struct dentry *old; 1070 1071 /* Don't create child dentry for a dead directory. */ 1072 if (unlikely(IS_DEADDIR(inode))) 1073 return ERR_PTR(-ENOENT); 1074 1075 dentry = d_alloc(parent, name); 1076 if (unlikely(!dentry)) 1077 return ERR_PTR(-ENOMEM); 1078 1079 old = inode->i_op->lookup(inode, dentry, nd); 1080 if (unlikely(old)) { 1081 dput(dentry); 1082 dentry = old; 1083 } 1084 return dentry; 1085 } 1086 1087 /* 1088 * We already have a dentry, but require a lookup to be performed on the parent 1089 * directory to fill in d_inode. Returns the new dentry, or ERR_PTR on error. 1090 * parent->d_inode->i_mutex must be held. d_lookup must have verified that no 1091 * child exists while under i_mutex. 1092 */ 1093 static struct dentry *d_inode_lookup(struct dentry *parent, struct dentry *dentry, 1094 struct nameidata *nd) 1095 { 1096 struct inode *inode = parent->d_inode; 1097 struct dentry *old; 1098 1099 /* Don't create child dentry for a dead directory. */ 1100 if (unlikely(IS_DEADDIR(inode))) 1101 return ERR_PTR(-ENOENT); 1102 1103 old = inode->i_op->lookup(inode, dentry, nd); 1104 if (unlikely(old)) { 1105 dput(dentry); 1106 dentry = old; 1107 } 1108 return dentry; 1109 } 1110 1111 /* 1112 * It's more convoluted than I'd like it to be, but... it's still fairly 1113 * small and for now I'd prefer to have fast path as straight as possible. 1114 * It _is_ time-critical. 
1115 */ 1116 static int do_lookup(struct nameidata *nd, struct qstr *name, 1117 struct path *path, struct inode **inode) 1118 { 1119 struct vfsmount *mnt = nd->path.mnt; 1120 struct dentry *dentry, *parent = nd->path.dentry; 1121 int need_reval = 1; 1122 int status = 1; 1123 int err; 1124 1125 /* 1126 * Rename seqlock is not required here because in the off chance 1127 * of a false negative due to a concurrent rename, we're going to 1128 * do the non-racy lookup, below. 1129 */ 1130 if (nd->flags & LOOKUP_RCU) { 1131 unsigned seq; 1132 *inode = nd->inode; 1133 dentry = __d_lookup_rcu(parent, name, &seq, inode); 1134 if (!dentry) 1135 goto unlazy; 1136 1137 /* Memory barrier in read_seqcount_begin of child is enough */ 1138 if (__read_seqcount_retry(&parent->d_seq, nd->seq)) 1139 return -ECHILD; 1140 nd->seq = seq; 1141 1142 if (unlikely(dentry->d_flags & DCACHE_OP_REVALIDATE)) { 1143 status = d_revalidate(dentry, nd); 1144 if (unlikely(status <= 0)) { 1145 if (status != -ECHILD) 1146 need_reval = 0; 1147 goto unlazy; 1148 } 1149 } 1150 if (unlikely(d_need_lookup(dentry))) 1151 goto unlazy; 1152 path->mnt = mnt; 1153 path->dentry = dentry; 1154 if (unlikely(!__follow_mount_rcu(nd, path, inode))) 1155 goto unlazy; 1156 if (unlikely(path->dentry->d_flags & DCACHE_NEED_AUTOMOUNT)) 1157 goto unlazy; 1158 return 0; 1159 unlazy: 1160 if (unlazy_walk(nd, dentry)) 1161 return -ECHILD; 1162 } else { 1163 dentry = __d_lookup(parent, name); 1164 } 1165 1166 if (dentry && unlikely(d_need_lookup(dentry))) { 1167 dput(dentry); 1168 dentry = NULL; 1169 } 1170 retry: 1171 if (unlikely(!dentry)) { 1172 struct inode *dir = parent->d_inode; 1173 BUG_ON(nd->inode != dir); 1174 1175 mutex_lock(&dir->i_mutex); 1176 dentry = d_lookup(parent, name); 1177 if (likely(!dentry)) { 1178 dentry = d_alloc_and_lookup(parent, name, nd); 1179 if (IS_ERR(dentry)) { 1180 mutex_unlock(&dir->i_mutex); 1181 return PTR_ERR(dentry); 1182 } 1183 /* known good */ 1184 need_reval = 0; 1185 status = 1; 1186 } 
else if (unlikely(d_need_lookup(dentry))) { 1187 dentry = d_inode_lookup(parent, dentry, nd); 1188 if (IS_ERR(dentry)) { 1189 mutex_unlock(&dir->i_mutex); 1190 return PTR_ERR(dentry); 1191 } 1192 /* known good */ 1193 need_reval = 0; 1194 status = 1; 1195 } 1196 mutex_unlock(&dir->i_mutex); 1197 } 1198 if (unlikely(dentry->d_flags & DCACHE_OP_REVALIDATE) && need_reval) 1199 status = d_revalidate(dentry, nd); 1200 if (unlikely(status <= 0)) { 1201 if (status < 0) { 1202 dput(dentry); 1203 return status; 1204 } 1205 if (!d_invalidate(dentry)) { 1206 dput(dentry); 1207 dentry = NULL; 1208 need_reval = 1; 1209 goto retry; 1210 } 1211 } 1212 1213 path->mnt = mnt; 1214 path->dentry = dentry; 1215 err = follow_managed(path, nd->flags); 1216 if (unlikely(err < 0)) { 1217 path_put_conditional(path, nd); 1218 return err; 1219 } 1220 if (err) 1221 nd->flags |= LOOKUP_JUMPED; 1222 *inode = path->dentry->d_inode; 1223 return 0; 1224 } 1225 1226 static inline int may_lookup(struct nameidata *nd) 1227 { 1228 if (nd->flags & LOOKUP_RCU) { 1229 int err = inode_permission(nd->inode, MAY_EXEC|MAY_NOT_BLOCK); 1230 if (err != -ECHILD) 1231 return err; 1232 if (unlazy_walk(nd, NULL)) 1233 return -ECHILD; 1234 } 1235 return inode_permission(nd->inode, MAY_EXEC); 1236 } 1237 1238 static inline int handle_dots(struct nameidata *nd, int type) 1239 { 1240 if (type == LAST_DOTDOT) { 1241 if (nd->flags & LOOKUP_RCU) { 1242 if (follow_dotdot_rcu(nd)) 1243 return -ECHILD; 1244 } else 1245 follow_dotdot(nd); 1246 } 1247 return 0; 1248 } 1249 1250 static void terminate_walk(struct nameidata *nd) 1251 { 1252 if (!(nd->flags & LOOKUP_RCU)) { 1253 path_put(&nd->path); 1254 } else { 1255 nd->flags &= ~LOOKUP_RCU; 1256 if (!(nd->flags & LOOKUP_ROOT)) 1257 nd->root.mnt = NULL; 1258 rcu_read_unlock(); 1259 br_read_unlock(vfsmount_lock); 1260 } 1261 } 1262 1263 /* 1264 * Do we need to follow links? 
We _really_ want to be able 1265 * to do this check without having to look at inode->i_op, 1266 * so we keep a cache of "no, this doesn't need follow_link" 1267 * for the common case. 1268 */ 1269 static inline int should_follow_link(struct inode *inode, int follow) 1270 { 1271 if (unlikely(!(inode->i_opflags & IOP_NOFOLLOW))) { 1272 if (likely(inode->i_op->follow_link)) 1273 return follow; 1274 1275 /* This gets set once for the inode lifetime */ 1276 spin_lock(&inode->i_lock); 1277 inode->i_opflags |= IOP_NOFOLLOW; 1278 spin_unlock(&inode->i_lock); 1279 } 1280 return 0; 1281 } 1282 1283 static inline int walk_component(struct nameidata *nd, struct path *path, 1284 struct qstr *name, int type, int follow) 1285 { 1286 struct inode *inode; 1287 int err; 1288 /* 1289 * "." and ".." are special - ".." especially so because it has 1290 * to be able to know about the current root directory and 1291 * parent relationships. 1292 */ 1293 if (unlikely(type != LAST_NORM)) 1294 return handle_dots(nd, type); 1295 err = do_lookup(nd, name, path, &inode); 1296 if (unlikely(err)) { 1297 terminate_walk(nd); 1298 return err; 1299 } 1300 if (!inode) { 1301 path_to_nameidata(path, nd); 1302 terminate_walk(nd); 1303 return -ENOENT; 1304 } 1305 if (should_follow_link(inode, follow)) { 1306 if (nd->flags & LOOKUP_RCU) { 1307 if (unlikely(unlazy_walk(nd, path->dentry))) { 1308 terminate_walk(nd); 1309 return -ECHILD; 1310 } 1311 } 1312 BUG_ON(inode != path->dentry->d_inode); 1313 return 1; 1314 } 1315 path_to_nameidata(path, nd); 1316 nd->inode = inode; 1317 return 0; 1318 } 1319 1320 /* 1321 * This limits recursive symlink follows to 8, while 1322 * limiting consecutive symlinks to 40. 1323 * 1324 * Without that kind of total limit, nasty chains of consecutive 1325 * symlinks can cause almost arbitrarily long lookups. 
 */
static inline int nested_symlink(struct path *path, struct nameidata *nd)
{
	int res;

	/* Too deep already: release both the pending link and nd->path. */
	if (unlikely(current->link_count >= MAX_NESTED_LINKS)) {
		path_put_conditional(path, nd);
		path_put(&nd->path);
		return -ELOOP;
	}
	BUG_ON(nd->depth >= MAX_NESTED_LINKS);

	nd->depth++;
	current->link_count++;

	/*
	 * Follow the link, then walk the component it left in nd->last.
	 * A positive result means that component is itself a link that
	 * needs following, so go round again.
	 */
	do {
		struct path link = *path;
		void *cookie;

		res = follow_link(&link, nd, &cookie);
		if (!res)
			res = walk_component(nd, path, &nd->last,
					     nd->last_type, LOOKUP_FOLLOW);
		put_link(nd, &link, cookie);
	} while (res > 0);

	current->link_count--;
	nd->depth--;
	return res;
}

/*
 * We really don't want to look at inode->i_op->lookup
 * when we don't have to. So we keep a cache bit in
 * the inode ->i_opflags field that says "yes, we can
 * do lookup on this inode".
 */
static inline int can_lookup(struct inode *inode)
{
	if (likely(inode->i_opflags & IOP_LOOKUP))
		return 1;
	if (likely(!inode->i_op->lookup))
		return 0;

	/* We do this once for the lifetime of the inode */
	spin_lock(&inode->i_lock);
	inode->i_opflags |= IOP_LOOKUP;
	spin_unlock(&inode->i_lock);
	return 1;
}

/*
 * Name resolution.
 * This is the basic name resolution function, turning a pathname into
 * the final dentry. We expect 'base' to be positive and a directory.
 *
 * Returns 0 and nd will have valid dentry and mnt on success.
 * Returns error and drops reference to input namei data on failure.
 *
 * The final component is not walked here: it is left in nd->last and
 * nd->last_type for the caller to deal with.
 */
static int link_path_walk(const char *name, struct nameidata *nd)
{
	struct path next;
	int err;

	/* Skip leading slashes; a name made only of slashes has no component. */
	while (*name=='/')
		name++;
	if (!*name)
		return 0;

	/* At this point we know we have a real path component. */
	for(;;) {
		unsigned long hash;
		struct qstr this;
		unsigned int c;
		int type;

		err = may_lookup(nd);
		if (err)
			break;

		this.name = name;
		c = *(const unsigned char *)name;

		/* Hash the component while scanning for its end ('/' or NUL). */
		hash = init_name_hash();
		do {
			name++;
			hash = partial_name_hash(c, hash);
			c = *(const unsigned char *)name;
		} while (c && (c != '/'));
		this.len = name - (const char *) this.name;
		this.hash = end_name_hash(hash);

		/* Classify the component: ".", ".." or an ordinary name. */
		type = LAST_NORM;
		if (this.name[0] == '.') switch (this.len) {
			case 2:
				if (this.name[1] == '.') {
					type = LAST_DOTDOT;
					nd->flags |= LOOKUP_JUMPED;
				}
				break;
			case 1:
				type = LAST_DOT;
		}
		if (likely(type == LAST_NORM)) {
			struct dentry *parent = nd->path.dentry;
			nd->flags &= ~LOOKUP_JUMPED;
			/* Let the parent's ->d_hash() rewrite the hash if it has one. */
			if (unlikely(parent->d_flags & DCACHE_OP_HASH)) {
				err = parent->d_op->d_hash(parent, nd->inode,
							   &this);
				if (err < 0)
					break;
			}
		}

		/* remove trailing slashes? */
		if (!c)
			goto last_component;
		while (*++name == '/');
		if (!*name)
			goto last_component;

		/*
		 * A negative return here is passed straight back without
		 * terminate_walk(): walk_component() has already terminated
		 * the walk on its own error paths.
		 */
		err = walk_component(nd, &next, &this, type, LOOKUP_FOLLOW);
		if (err < 0)
			return err;

		if (err) {
			err = nested_symlink(&next, nd);
			if (err)
				return err;
		}
		/* More components follow, so what we reached must support lookup. */
		if (can_lookup(nd->inode))
			continue;
		err = -ENOTDIR;
		break;
		/* here ends the main loop */

last_component:
		nd->last = this;
		nd->last_type = type;
		return 0;
	}
	terminate_walk(nd);
	return err;
}

/*
 * Prepare @nd for walking @name: reset its state and choose the starting
 * point -- the caller-supplied nd->root with LOOKUP_ROOT, the root set up
 * by set_root()/set_root_rcu() for an absolute name, the current working
 * directory for AT_FDCWD, otherwise the path of the file open on @dfd.
 *
 * With LOOKUP_RCU the vfsmount_lock and RCU read sections are entered and
 * nd->seq is sampled instead of taking references; without it, references
 * are taken on the starting path.
 *
 * Returns 0 on success or a negative errno.  In rcu-walk, *@fp is set to
 * the @dfd file when fget_raw_light() reported that an fput is needed.
 */
static int path_init(int dfd, const char *name, unsigned int flags,
		     struct nameidata *nd, struct file **fp)
{
	int retval = 0;
	int fput_needed;
	struct file *file;

	nd->last_type = LAST_ROOT; /* if there are only slashes... */
	nd->flags = flags | LOOKUP_JUMPED;
	nd->depth = 0;
	if (flags & LOOKUP_ROOT) {
		struct inode *inode = nd->root.dentry->d_inode;
		/* Only a non-empty name needs the root to be searchable. */
		if (*name) {
			if (!inode->i_op->lookup)
				return -ENOTDIR;
			retval = inode_permission(inode, MAY_EXEC);
			if (retval)
				return retval;
		}
		nd->path = nd->root;
		nd->inode = inode;
		if (flags & LOOKUP_RCU) {
			br_read_lock(vfsmount_lock);
			rcu_read_lock();
			nd->seq = __read_seqcount_begin(&nd->path.dentry->d_seq);
		} else {
			path_get(&nd->path);
		}
		return 0;
	}

	nd->root.mnt = NULL;

	if (*name=='/') {
		if (flags & LOOKUP_RCU) {
			br_read_lock(vfsmount_lock);
			rcu_read_lock();
			set_root_rcu(nd);
		} else {
			set_root(nd);
			path_get(&nd->root);
		}
		nd->path = nd->root;
	} else if (dfd == AT_FDCWD) {
		if (flags & LOOKUP_RCU) {
			struct fs_struct *fs = current->fs;
			unsigned seq;

			br_read_lock(vfsmount_lock);
			rcu_read_lock();

			/* Resample until fs->pwd was read without fs->seq changing. */
			do {
				seq = read_seqcount_begin(&fs->seq);
				nd->path = fs->pwd;
				nd->seq = __read_seqcount_begin(&nd->path.dentry->d_seq);
			} while (read_seqcount_retry(&fs->seq, seq));
		} else {
			get_fs_pwd(current->fs, &nd->path);
		}
	} else {
		struct dentry *dentry;

		file = fget_raw_light(dfd, &fput_needed);
		retval = -EBADF;
		if (!file)
			goto out_fail;

		dentry = file->f_path.dentry;

		/* An empty name is allowed to start from a non-directory. */
		if (*name) {
			retval = -ENOTDIR;
			if (!S_ISDIR(dentry->d_inode->i_mode))
				goto fput_fail;

			retval = inode_permission(dentry->d_inode, MAY_EXEC);
			if (retval)
				goto fput_fail;
		}

		nd->path = file->f_path;
		if (flags & LOOKUP_RCU) {
			/* No path reference is taken here; pass the file out via *fp. */
			if (fput_needed)
				*fp = file;
			nd->seq = __read_seqcount_begin(&nd->path.dentry->d_seq);
			br_read_lock(vfsmount_lock);
			rcu_read_lock();
		} else {
			path_get(&file->f_path);
			fput_light(file, fput_needed);
		}
	}

	nd->inode = nd->path.dentry->d_inode;
	return 0;

fput_fail:
	fput_light(file, fput_needed);
out_fail:
	return retval;
}

/*
 * Walk the final component left in nd->last.  If the byte after an ordinary
 * last component is not NUL (trailing slashes), LOOKUP_FOLLOW and
 * LOOKUP_DIRECTORY are forced on.  Return value is walk_component()'s.
 */
static inline int lookup_last(struct nameidata *nd, struct path *path)
{
	if (nd->last_type == LAST_NORM && nd->last.name[nd->last.len])
		nd->flags |= LOOKUP_FOLLOW | LOOKUP_DIRECTORY;

	nd->flags &= ~LOOKUP_PARENT;
	return walk_component(nd, path, &nd->last, nd->last_type,
					nd->flags & LOOKUP_FOLLOW);
}

/* Returns 0 and nd will be valid on success; returns error otherwise. */
static int path_lookupat(int dfd, const char *name,
				unsigned int flags, struct nameidata *nd)
{
	struct file *base = NULL;
	struct path path;
	int err;

	/*
	 * Path walking is largely split up into 2 different synchronisation
	 * schemes, rcu-walk and ref-walk (explained in
	 * Documentation/filesystems/path-lookup.txt). These share much of the
	 * path walk code, but some things particularly setup, cleanup, and
	 * following mounts are sufficiently divergent that functions are
	 * duplicated. Typically there is a function foo(), and its RCU
	 * analogue, foo_rcu().
	 *
	 * -ECHILD is the error number of choice (just to avoid clashes) that
	 * is returned if some aspect of an rcu-walk fails. Such an error must
	 * be handled by restarting a traditional ref-walk (which will always
	 * be able to complete).
	 */
	err = path_init(dfd, name, flags | LOOKUP_PARENT, nd, &base);

	if (unlikely(err))
		return err;

	current->total_link_count = 0;
	err = link_path_walk(name, nd);

	/* Unless only the parent was asked for, resolve the last component. */
	if (!err && !(flags & LOOKUP_PARENT)) {
		err = lookup_last(nd, &path);
		/* A positive result is a trailing symlink: follow and retry. */
		while (err > 0) {
			void *cookie;
			struct path link = path;
			nd->flags |= LOOKUP_PARENT;
			err = follow_link(&link, nd, &cookie);
			if (!err)
				err = lookup_last(nd, &path);
			put_link(nd, &link, cookie);
		}
	}

	if (!err)
		err = complete_walk(nd);

	if (!err && nd->flags & LOOKUP_DIRECTORY) {
		if (!nd->inode->i_op->lookup) {
			path_put(&nd->path);
			err = -ENOTDIR;
		}
	}

	if (base)
		fput(base);

	/* Drop the root we took ourselves; a LOOKUP_ROOT root is the caller's. */
	if (nd->root.mnt && !(nd->flags & LOOKUP_ROOT)) {
		path_put(&nd->root);
		nd->root.mnt = NULL;
	}
	return err;
}

/*
 * Look up @name, trying rcu-walk first.  -ECHILD restarts the lookup in
 * ref-walk; -ESTALE restarts it once more with LOOKUP_REVAL.  On success
 * the result is passed to audit_inode() unless the audit context is a
 * dummy one.
 */
static int do_path_lookup(int dfd, const char *name,
				unsigned int flags, struct nameidata *nd)
{
	int retval = path_lookupat(dfd, name, flags | LOOKUP_RCU, nd);
	if (unlikely(retval == -ECHILD))
		retval = path_lookupat(dfd, name, flags, nd);
	if (unlikely(retval == -ESTALE))
		retval = path_lookupat(dfd, name, flags | LOOKUP_REVAL, nd);

	if (likely(!retval)) {
		if (unlikely(!audit_dummy_context())) {
			if (nd->path.dentry && nd->inode)
				audit_inode(name, nd->path.dentry);
		}
	}
	return retval;
}

/*
 * Look up the parent of kernel-space path @name relative to the current
 * working directory (LOOKUP_PARENT); the last component stays in @nd.
 */
int kern_path_parent(const char *name, struct nameidata *nd)
{
	return do_path_lookup(AT_FDCWD, name, LOOKUP_PARENT, nd);
}

/*
 * Look up kernel-space path @name relative to the current working
 * directory.  On success (return 0) *@path holds the result; on error
 * *@path is left untouched.
 */
int kern_path(const char *name, unsigned int flags, struct path *path)
{
	struct nameidata nd;
	int res = do_path_lookup(AT_FDCWD, name, flags, &nd);
	if (!res)
		*path = nd.path;
	return res;
}

/**
 * vfs_path_lookup - lookup a file path relative to a dentry-vfsmount pair
 * @dentry:  pointer to dentry
of the base directory 1680 * @mnt: pointer to vfs mount of the base directory 1681 * @name: pointer to file name 1682 * @flags: lookup flags 1683 * @path: pointer to struct path to fill 1684 */ 1685 int vfs_path_lookup(struct dentry *dentry, struct vfsmount *mnt, 1686 const char *name, unsigned int flags, 1687 struct path *path) 1688 { 1689 struct nameidata nd; 1690 int err; 1691 nd.root.dentry = dentry; 1692 nd.root.mnt = mnt; 1693 BUG_ON(flags & LOOKUP_PARENT); 1694 /* the first argument of do_path_lookup() is ignored with LOOKUP_ROOT */ 1695 err = do_path_lookup(AT_FDCWD, name, flags | LOOKUP_ROOT, &nd); 1696 if (!err) 1697 *path = nd.path; 1698 return err; 1699 } 1700 1701 static struct dentry *__lookup_hash(struct qstr *name, 1702 struct dentry *base, struct nameidata *nd) 1703 { 1704 struct inode *inode = base->d_inode; 1705 struct dentry *dentry; 1706 int err; 1707 1708 err = inode_permission(inode, MAY_EXEC); 1709 if (err) 1710 return ERR_PTR(err); 1711 1712 /* 1713 * Don't bother with __d_lookup: callers are for creat as 1714 * well as unlink, so a lot of the time it would cost 1715 * a double lookup. 1716 */ 1717 dentry = d_lookup(base, name); 1718 1719 if (dentry && d_need_lookup(dentry)) { 1720 /* 1721 * __lookup_hash is called with the parent dir's i_mutex already 1722 * held, so we are good to go here. 1723 */ 1724 dentry = d_inode_lookup(base, dentry, nd); 1725 if (IS_ERR(dentry)) 1726 return dentry; 1727 } 1728 1729 if (dentry && (dentry->d_flags & DCACHE_OP_REVALIDATE)) { 1730 int status = d_revalidate(dentry, nd); 1731 if (unlikely(status <= 0)) { 1732 /* 1733 * The dentry failed validation. 1734 * If d_revalidate returned 0 attempt to invalidate 1735 * the dentry otherwise d_revalidate is asking us 1736 * to return a fail status. 
1737 */ 1738 if (status < 0) { 1739 dput(dentry); 1740 return ERR_PTR(status); 1741 } else if (!d_invalidate(dentry)) { 1742 dput(dentry); 1743 dentry = NULL; 1744 } 1745 } 1746 } 1747 1748 if (!dentry) 1749 dentry = d_alloc_and_lookup(base, name, nd); 1750 1751 return dentry; 1752 } 1753 1754 /* 1755 * Restricted form of lookup. Doesn't follow links, single-component only, 1756 * needs parent already locked. Doesn't follow mounts. 1757 * SMP-safe. 1758 */ 1759 static struct dentry *lookup_hash(struct nameidata *nd) 1760 { 1761 return __lookup_hash(&nd->last, nd->path.dentry, nd); 1762 } 1763 1764 /** 1765 * lookup_one_len - filesystem helper to lookup single pathname component 1766 * @name: pathname component to lookup 1767 * @base: base directory to lookup from 1768 * @len: maximum length @len should be interpreted to 1769 * 1770 * Note that this routine is purely a helper for filesystem usage and should 1771 * not be called by generic code. Also note that by using this function the 1772 * nameidata argument is passed to the filesystem methods and a filesystem 1773 * using this helper needs to be prepared for that. 1774 */ 1775 struct dentry *lookup_one_len(const char *name, struct dentry *base, int len) 1776 { 1777 struct qstr this; 1778 unsigned long hash; 1779 unsigned int c; 1780 1781 WARN_ON_ONCE(!mutex_is_locked(&base->d_inode->i_mutex)); 1782 1783 this.name = name; 1784 this.len = len; 1785 if (!len) 1786 return ERR_PTR(-EACCES); 1787 1788 hash = init_name_hash(); 1789 while (len--) { 1790 c = *(const unsigned char *)name++; 1791 if (c == '/' || c == '\0') 1792 return ERR_PTR(-EACCES); 1793 hash = partial_name_hash(c, hash); 1794 } 1795 this.hash = end_name_hash(hash); 1796 /* 1797 * See if the low-level filesystem might want 1798 * to use its own hash.. 
1799 */ 1800 if (base->d_flags & DCACHE_OP_HASH) { 1801 int err = base->d_op->d_hash(base, base->d_inode, &this); 1802 if (err < 0) 1803 return ERR_PTR(err); 1804 } 1805 1806 return __lookup_hash(&this, base, NULL); 1807 } 1808 1809 int user_path_at_empty(int dfd, const char __user *name, unsigned flags, 1810 struct path *path, int *empty) 1811 { 1812 struct nameidata nd; 1813 char *tmp = getname_flags(name, flags, empty); 1814 int err = PTR_ERR(tmp); 1815 if (!IS_ERR(tmp)) { 1816 1817 BUG_ON(flags & LOOKUP_PARENT); 1818 1819 err = do_path_lookup(dfd, tmp, flags, &nd); 1820 putname(tmp); 1821 if (!err) 1822 *path = nd.path; 1823 } 1824 return err; 1825 } 1826 1827 int user_path_at(int dfd, const char __user *name, unsigned flags, 1828 struct path *path) 1829 { 1830 return user_path_at_empty(dfd, name, flags, path, 0); 1831 } 1832 1833 static int user_path_parent(int dfd, const char __user *path, 1834 struct nameidata *nd, char **name) 1835 { 1836 char *s = getname(path); 1837 int error; 1838 1839 if (IS_ERR(s)) 1840 return PTR_ERR(s); 1841 1842 error = do_path_lookup(dfd, s, LOOKUP_PARENT, nd); 1843 if (error) 1844 putname(s); 1845 else 1846 *name = s; 1847 1848 return error; 1849 } 1850 1851 /* 1852 * It's inline, so penalty for filesystems that don't use sticky bit is 1853 * minimal. 1854 */ 1855 static inline int check_sticky(struct inode *dir, struct inode *inode) 1856 { 1857 uid_t fsuid = current_fsuid(); 1858 1859 if (!(dir->i_mode & S_ISVTX)) 1860 return 0; 1861 if (current_user_ns() != inode_userns(inode)) 1862 goto other_userns; 1863 if (inode->i_uid == fsuid) 1864 return 0; 1865 if (dir->i_uid == fsuid) 1866 return 0; 1867 1868 other_userns: 1869 return !ns_capable(inode_userns(inode), CAP_FOWNER); 1870 } 1871 1872 /* 1873 * Check whether we can remove a link victim from directory dir, check 1874 * whether the type of victim is right. 1875 * 1. We can't do it if dir is read-only (done in permission()) 1876 * 2. 
We should have write and exec permissions on dir 1877 * 3. We can't remove anything from append-only dir 1878 * 4. We can't do anything with immutable dir (done in permission()) 1879 * 5. If the sticky bit on dir is set we should either 1880 * a. be owner of dir, or 1881 * b. be owner of victim, or 1882 * c. have CAP_FOWNER capability 1883 * 6. If the victim is append-only or immutable we can't do antyhing with 1884 * links pointing to it. 1885 * 7. If we were asked to remove a directory and victim isn't one - ENOTDIR. 1886 * 8. If we were asked to remove a non-directory and victim isn't one - EISDIR. 1887 * 9. We can't remove a root or mountpoint. 1888 * 10. We don't allow removal of NFS sillyrenamed files; it's handled by 1889 * nfs_async_unlink(). 1890 */ 1891 static int may_delete(struct inode *dir,struct dentry *victim,int isdir) 1892 { 1893 int error; 1894 1895 if (!victim->d_inode) 1896 return -ENOENT; 1897 1898 BUG_ON(victim->d_parent->d_inode != dir); 1899 audit_inode_child(victim, dir); 1900 1901 error = inode_permission(dir, MAY_WRITE | MAY_EXEC); 1902 if (error) 1903 return error; 1904 if (IS_APPEND(dir)) 1905 return -EPERM; 1906 if (check_sticky(dir, victim->d_inode)||IS_APPEND(victim->d_inode)|| 1907 IS_IMMUTABLE(victim->d_inode) || IS_SWAPFILE(victim->d_inode)) 1908 return -EPERM; 1909 if (isdir) { 1910 if (!S_ISDIR(victim->d_inode->i_mode)) 1911 return -ENOTDIR; 1912 if (IS_ROOT(victim)) 1913 return -EBUSY; 1914 } else if (S_ISDIR(victim->d_inode->i_mode)) 1915 return -EISDIR; 1916 if (IS_DEADDIR(dir)) 1917 return -ENOENT; 1918 if (victim->d_flags & DCACHE_NFSFS_RENAMED) 1919 return -EBUSY; 1920 return 0; 1921 } 1922 1923 /* Check whether we can create an object with dentry child in directory 1924 * dir. 1925 * 1. We can't do it if child already exists (open has special treatment for 1926 * this case, but since we are inlined it's OK) 1927 * 2. We can't do it if dir is read-only (done in permission()) 1928 * 3. 
We should have write and exec permissions on dir 1929 * 4. We can't do it if dir is immutable (done in permission()) 1930 */ 1931 static inline int may_create(struct inode *dir, struct dentry *child) 1932 { 1933 if (child->d_inode) 1934 return -EEXIST; 1935 if (IS_DEADDIR(dir)) 1936 return -ENOENT; 1937 return inode_permission(dir, MAY_WRITE | MAY_EXEC); 1938 } 1939 1940 /* 1941 * p1 and p2 should be directories on the same fs. 1942 */ 1943 struct dentry *lock_rename(struct dentry *p1, struct dentry *p2) 1944 { 1945 struct dentry *p; 1946 1947 if (p1 == p2) { 1948 mutex_lock_nested(&p1->d_inode->i_mutex, I_MUTEX_PARENT); 1949 return NULL; 1950 } 1951 1952 mutex_lock(&p1->d_inode->i_sb->s_vfs_rename_mutex); 1953 1954 p = d_ancestor(p2, p1); 1955 if (p) { 1956 mutex_lock_nested(&p2->d_inode->i_mutex, I_MUTEX_PARENT); 1957 mutex_lock_nested(&p1->d_inode->i_mutex, I_MUTEX_CHILD); 1958 return p; 1959 } 1960 1961 p = d_ancestor(p1, p2); 1962 if (p) { 1963 mutex_lock_nested(&p1->d_inode->i_mutex, I_MUTEX_PARENT); 1964 mutex_lock_nested(&p2->d_inode->i_mutex, I_MUTEX_CHILD); 1965 return p; 1966 } 1967 1968 mutex_lock_nested(&p1->d_inode->i_mutex, I_MUTEX_PARENT); 1969 mutex_lock_nested(&p2->d_inode->i_mutex, I_MUTEX_CHILD); 1970 return NULL; 1971 } 1972 1973 void unlock_rename(struct dentry *p1, struct dentry *p2) 1974 { 1975 mutex_unlock(&p1->d_inode->i_mutex); 1976 if (p1 != p2) { 1977 mutex_unlock(&p2->d_inode->i_mutex); 1978 mutex_unlock(&p1->d_inode->i_sb->s_vfs_rename_mutex); 1979 } 1980 } 1981 1982 int vfs_create(struct inode *dir, struct dentry *dentry, umode_t mode, 1983 struct nameidata *nd) 1984 { 1985 int error = may_create(dir, dentry); 1986 1987 if (error) 1988 return error; 1989 1990 if (!dir->i_op->create) 1991 return -EACCES; /* shouldn't it be ENOSYS? 
*/ 1992 mode &= S_IALLUGO; 1993 mode |= S_IFREG; 1994 error = security_inode_create(dir, dentry, mode); 1995 if (error) 1996 return error; 1997 error = dir->i_op->create(dir, dentry, mode, nd); 1998 if (!error) 1999 fsnotify_create(dir, dentry); 2000 return error; 2001 } 2002 2003 static int may_open(struct path *path, int acc_mode, int flag) 2004 { 2005 struct dentry *dentry = path->dentry; 2006 struct inode *inode = dentry->d_inode; 2007 int error; 2008 2009 /* O_PATH? */ 2010 if (!acc_mode) 2011 return 0; 2012 2013 if (!inode) 2014 return -ENOENT; 2015 2016 switch (inode->i_mode & S_IFMT) { 2017 case S_IFLNK: 2018 return -ELOOP; 2019 case S_IFDIR: 2020 if (acc_mode & MAY_WRITE) 2021 return -EISDIR; 2022 break; 2023 case S_IFBLK: 2024 case S_IFCHR: 2025 if (path->mnt->mnt_flags & MNT_NODEV) 2026 return -EACCES; 2027 /*FALLTHRU*/ 2028 case S_IFIFO: 2029 case S_IFSOCK: 2030 flag &= ~O_TRUNC; 2031 break; 2032 } 2033 2034 error = inode_permission(inode, acc_mode); 2035 if (error) 2036 return error; 2037 2038 /* 2039 * An append-only file must be opened in append mode for writing. 2040 */ 2041 if (IS_APPEND(inode)) { 2042 if ((flag & O_ACCMODE) != O_RDONLY && !(flag & O_APPEND)) 2043 return -EPERM; 2044 if (flag & O_TRUNC) 2045 return -EPERM; 2046 } 2047 2048 /* O_NOATIME can only be set by the owner or superuser */ 2049 if (flag & O_NOATIME && !inode_owner_or_capable(inode)) 2050 return -EPERM; 2051 2052 return 0; 2053 } 2054 2055 static int handle_truncate(struct file *filp) 2056 { 2057 struct path *path = &filp->f_path; 2058 struct inode *inode = path->dentry->d_inode; 2059 int error = get_write_access(inode); 2060 if (error) 2061 return error; 2062 /* 2063 * Refuse to truncate files with mandatory locks held on them. 
2064 */ 2065 error = locks_verify_locked(inode); 2066 if (!error) 2067 error = security_path_truncate(path); 2068 if (!error) { 2069 error = do_truncate(path->dentry, 0, 2070 ATTR_MTIME|ATTR_CTIME|ATTR_OPEN, 2071 filp); 2072 } 2073 put_write_access(inode); 2074 return error; 2075 } 2076 2077 static inline int open_to_namei_flags(int flag) 2078 { 2079 if ((flag & O_ACCMODE) == 3) 2080 flag--; 2081 return flag; 2082 } 2083 2084 /* 2085 * Handle the last step of open() 2086 */ 2087 static struct file *do_last(struct nameidata *nd, struct path *path, 2088 const struct open_flags *op, const char *pathname) 2089 { 2090 struct dentry *dir = nd->path.dentry; 2091 struct dentry *dentry; 2092 int open_flag = op->open_flag; 2093 int will_truncate = open_flag & O_TRUNC; 2094 int want_write = 0; 2095 int acc_mode = op->acc_mode; 2096 struct file *filp; 2097 int error; 2098 2099 nd->flags &= ~LOOKUP_PARENT; 2100 nd->flags |= op->intent; 2101 2102 switch (nd->last_type) { 2103 case LAST_DOTDOT: 2104 case LAST_DOT: 2105 error = handle_dots(nd, nd->last_type); 2106 if (error) 2107 return ERR_PTR(error); 2108 /* fallthrough */ 2109 case LAST_ROOT: 2110 error = complete_walk(nd); 2111 if (error) 2112 return ERR_PTR(error); 2113 audit_inode(pathname, nd->path.dentry); 2114 if (open_flag & O_CREAT) { 2115 error = -EISDIR; 2116 goto exit; 2117 } 2118 goto ok; 2119 case LAST_BIND: 2120 error = complete_walk(nd); 2121 if (error) 2122 return ERR_PTR(error); 2123 audit_inode(pathname, dir); 2124 goto ok; 2125 } 2126 2127 if (!(open_flag & O_CREAT)) { 2128 int symlink_ok = 0; 2129 if (nd->last.name[nd->last.len]) 2130 nd->flags |= LOOKUP_FOLLOW | LOOKUP_DIRECTORY; 2131 if (open_flag & O_PATH && !(nd->flags & LOOKUP_FOLLOW)) 2132 symlink_ok = 1; 2133 /* we _can_ be in RCU mode here */ 2134 error = walk_component(nd, path, &nd->last, LAST_NORM, 2135 !symlink_ok); 2136 if (error < 0) 2137 return ERR_PTR(error); 2138 if (error) /* symlink */ 2139 return NULL; 2140 /* sayonara */ 2141 error = 
complete_walk(nd); 2142 if (error) 2143 return ERR_PTR(-ECHILD); 2144 2145 error = -ENOTDIR; 2146 if (nd->flags & LOOKUP_DIRECTORY) { 2147 if (!nd->inode->i_op->lookup) 2148 goto exit; 2149 } 2150 audit_inode(pathname, nd->path.dentry); 2151 goto ok; 2152 } 2153 2154 /* create side of things */ 2155 /* 2156 * This will *only* deal with leaving RCU mode - LOOKUP_JUMPED has been 2157 * cleared when we got to the last component we are about to look up 2158 */ 2159 error = complete_walk(nd); 2160 if (error) 2161 return ERR_PTR(error); 2162 2163 audit_inode(pathname, dir); 2164 error = -EISDIR; 2165 /* trailing slashes? */ 2166 if (nd->last.name[nd->last.len]) 2167 goto exit; 2168 2169 mutex_lock(&dir->d_inode->i_mutex); 2170 2171 dentry = lookup_hash(nd); 2172 error = PTR_ERR(dentry); 2173 if (IS_ERR(dentry)) { 2174 mutex_unlock(&dir->d_inode->i_mutex); 2175 goto exit; 2176 } 2177 2178 path->dentry = dentry; 2179 path->mnt = nd->path.mnt; 2180 2181 /* Negative dentry, just create the file */ 2182 if (!dentry->d_inode) { 2183 umode_t mode = op->mode; 2184 if (!IS_POSIXACL(dir->d_inode)) 2185 mode &= ~current_umask(); 2186 /* 2187 * This write is needed to ensure that a 2188 * rw->ro transition does not occur between 2189 * the time when the file is created and when 2190 * a permanent write count is taken through 2191 * the 'struct file' in nameidata_to_filp(). 
2192 */ 2193 error = mnt_want_write(nd->path.mnt); 2194 if (error) 2195 goto exit_mutex_unlock; 2196 want_write = 1; 2197 /* Don't check for write permission, don't truncate */ 2198 open_flag &= ~O_TRUNC; 2199 will_truncate = 0; 2200 acc_mode = MAY_OPEN; 2201 error = security_path_mknod(&nd->path, dentry, mode, 0); 2202 if (error) 2203 goto exit_mutex_unlock; 2204 error = vfs_create(dir->d_inode, dentry, mode, nd); 2205 if (error) 2206 goto exit_mutex_unlock; 2207 mutex_unlock(&dir->d_inode->i_mutex); 2208 dput(nd->path.dentry); 2209 nd->path.dentry = dentry; 2210 goto common; 2211 } 2212 2213 /* 2214 * It already exists. 2215 */ 2216 mutex_unlock(&dir->d_inode->i_mutex); 2217 audit_inode(pathname, path->dentry); 2218 2219 error = -EEXIST; 2220 if (open_flag & O_EXCL) 2221 goto exit_dput; 2222 2223 error = follow_managed(path, nd->flags); 2224 if (error < 0) 2225 goto exit_dput; 2226 2227 if (error) 2228 nd->flags |= LOOKUP_JUMPED; 2229 2230 error = -ENOENT; 2231 if (!path->dentry->d_inode) 2232 goto exit_dput; 2233 2234 if (path->dentry->d_inode->i_op->follow_link) 2235 return NULL; 2236 2237 path_to_nameidata(path, nd); 2238 nd->inode = path->dentry->d_inode; 2239 /* Why this, you ask? _Now_ we might have grown LOOKUP_JUMPED... 
*/ 2240 error = complete_walk(nd); 2241 if (error) 2242 goto exit; 2243 error = -EISDIR; 2244 if (S_ISDIR(nd->inode->i_mode)) 2245 goto exit; 2246 ok: 2247 if (!S_ISREG(nd->inode->i_mode)) 2248 will_truncate = 0; 2249 2250 if (will_truncate) { 2251 error = mnt_want_write(nd->path.mnt); 2252 if (error) 2253 goto exit; 2254 want_write = 1; 2255 } 2256 common: 2257 error = may_open(&nd->path, acc_mode, open_flag); 2258 if (error) 2259 goto exit; 2260 filp = nameidata_to_filp(nd); 2261 if (!IS_ERR(filp)) { 2262 error = ima_file_check(filp, op->acc_mode); 2263 if (error) { 2264 fput(filp); 2265 filp = ERR_PTR(error); 2266 } 2267 } 2268 if (!IS_ERR(filp)) { 2269 if (will_truncate) { 2270 error = handle_truncate(filp); 2271 if (error) { 2272 fput(filp); 2273 filp = ERR_PTR(error); 2274 } 2275 } 2276 } 2277 out: 2278 if (want_write) 2279 mnt_drop_write(nd->path.mnt); 2280 path_put(&nd->path); 2281 return filp; 2282 2283 exit_mutex_unlock: 2284 mutex_unlock(&dir->d_inode->i_mutex); 2285 exit_dput: 2286 path_put_conditional(path, nd); 2287 exit: 2288 filp = ERR_PTR(error); 2289 goto out; 2290 } 2291 2292 static struct file *path_openat(int dfd, const char *pathname, 2293 struct nameidata *nd, const struct open_flags *op, int flags) 2294 { 2295 struct file *base = NULL; 2296 struct file *filp; 2297 struct path path; 2298 int error; 2299 2300 filp = get_empty_filp(); 2301 if (!filp) 2302 return ERR_PTR(-ENFILE); 2303 2304 filp->f_flags = op->open_flag; 2305 nd->intent.open.file = filp; 2306 nd->intent.open.flags = open_to_namei_flags(op->open_flag); 2307 nd->intent.open.create_mode = op->mode; 2308 2309 error = path_init(dfd, pathname, flags | LOOKUP_PARENT, nd, &base); 2310 if (unlikely(error)) 2311 goto out_filp; 2312 2313 current->total_link_count = 0; 2314 error = link_path_walk(pathname, nd); 2315 if (unlikely(error)) 2316 goto out_filp; 2317 2318 filp = do_last(nd, &path, op, pathname); 2319 while (unlikely(!filp)) { /* trailing symlink */ 2320 struct path link = path; 
2321 void *cookie; 2322 if (!(nd->flags & LOOKUP_FOLLOW)) { 2323 path_put_conditional(&path, nd); 2324 path_put(&nd->path); 2325 filp = ERR_PTR(-ELOOP); 2326 break; 2327 } 2328 nd->flags |= LOOKUP_PARENT; 2329 nd->flags &= ~(LOOKUP_OPEN|LOOKUP_CREATE|LOOKUP_EXCL); 2330 error = follow_link(&link, nd, &cookie); 2331 if (unlikely(error)) 2332 filp = ERR_PTR(error); 2333 else 2334 filp = do_last(nd, &path, op, pathname); 2335 put_link(nd, &link, cookie); 2336 } 2337 out: 2338 if (nd->root.mnt && !(nd->flags & LOOKUP_ROOT)) 2339 path_put(&nd->root); 2340 if (base) 2341 fput(base); 2342 release_open_intent(nd); 2343 return filp; 2344 2345 out_filp: 2346 filp = ERR_PTR(error); 2347 goto out; 2348 } 2349 2350 struct file *do_filp_open(int dfd, const char *pathname, 2351 const struct open_flags *op, int flags) 2352 { 2353 struct nameidata nd; 2354 struct file *filp; 2355 2356 filp = path_openat(dfd, pathname, &nd, op, flags | LOOKUP_RCU); 2357 if (unlikely(filp == ERR_PTR(-ECHILD))) 2358 filp = path_openat(dfd, pathname, &nd, op, flags); 2359 if (unlikely(filp == ERR_PTR(-ESTALE))) 2360 filp = path_openat(dfd, pathname, &nd, op, flags | LOOKUP_REVAL); 2361 return filp; 2362 } 2363 2364 struct file *do_file_open_root(struct dentry *dentry, struct vfsmount *mnt, 2365 const char *name, const struct open_flags *op, int flags) 2366 { 2367 struct nameidata nd; 2368 struct file *file; 2369 2370 nd.root.mnt = mnt; 2371 nd.root.dentry = dentry; 2372 2373 flags |= LOOKUP_ROOT; 2374 2375 if (dentry->d_inode->i_op->follow_link && op->intent & LOOKUP_OPEN) 2376 return ERR_PTR(-ELOOP); 2377 2378 file = path_openat(-1, name, &nd, op, flags | LOOKUP_RCU); 2379 if (unlikely(file == ERR_PTR(-ECHILD))) 2380 file = path_openat(-1, name, &nd, op, flags); 2381 if (unlikely(file == ERR_PTR(-ESTALE))) 2382 file = path_openat(-1, name, &nd, op, flags | LOOKUP_REVAL); 2383 return file; 2384 } 2385 2386 struct dentry *kern_path_create(int dfd, const char *pathname, struct path *path, int is_dir) 2387 
{ 2388 struct dentry *dentry = ERR_PTR(-EEXIST); 2389 struct nameidata nd; 2390 int error = do_path_lookup(dfd, pathname, LOOKUP_PARENT, &nd); 2391 if (error) 2392 return ERR_PTR(error); 2393 2394 /* 2395 * Yucky last component or no last component at all? 2396 * (foo/., foo/.., /////) 2397 */ 2398 if (nd.last_type != LAST_NORM) 2399 goto out; 2400 nd.flags &= ~LOOKUP_PARENT; 2401 nd.flags |= LOOKUP_CREATE | LOOKUP_EXCL; 2402 nd.intent.open.flags = O_EXCL; 2403 2404 /* 2405 * Do the final lookup. 2406 */ 2407 mutex_lock_nested(&nd.path.dentry->d_inode->i_mutex, I_MUTEX_PARENT); 2408 dentry = lookup_hash(&nd); 2409 if (IS_ERR(dentry)) 2410 goto fail; 2411 2412 if (dentry->d_inode) 2413 goto eexist; 2414 /* 2415 * Special case - lookup gave negative, but... we had foo/bar/ 2416 * From the vfs_mknod() POV we just have a negative dentry - 2417 * all is fine. Let's be bastards - you had / on the end, you've 2418 * been asking for (non-existent) directory. -ENOENT for you. 2419 */ 2420 if (unlikely(!is_dir && nd.last.name[nd.last.len])) { 2421 dput(dentry); 2422 dentry = ERR_PTR(-ENOENT); 2423 goto fail; 2424 } 2425 *path = nd.path; 2426 return dentry; 2427 eexist: 2428 dput(dentry); 2429 dentry = ERR_PTR(-EEXIST); 2430 fail: 2431 mutex_unlock(&nd.path.dentry->d_inode->i_mutex); 2432 out: 2433 path_put(&nd.path); 2434 return dentry; 2435 } 2436 EXPORT_SYMBOL(kern_path_create); 2437 2438 struct dentry *user_path_create(int dfd, const char __user *pathname, struct path *path, int is_dir) 2439 { 2440 char *tmp = getname(pathname); 2441 struct dentry *res; 2442 if (IS_ERR(tmp)) 2443 return ERR_CAST(tmp); 2444 res = kern_path_create(dfd, tmp, path, is_dir); 2445 putname(tmp); 2446 return res; 2447 } 2448 EXPORT_SYMBOL(user_path_create); 2449 2450 int vfs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, dev_t dev) 2451 { 2452 int error = may_create(dir, dentry); 2453 2454 if (error) 2455 return error; 2456 2457 if ((S_ISCHR(mode) || S_ISBLK(mode)) && 2458 
!ns_capable(inode_userns(dir), CAP_MKNOD)) 2459 return -EPERM; 2460 2461 if (!dir->i_op->mknod) 2462 return -EPERM; 2463 2464 error = devcgroup_inode_mknod(mode, dev); 2465 if (error) 2466 return error; 2467 2468 error = security_inode_mknod(dir, dentry, mode, dev); 2469 if (error) 2470 return error; 2471 2472 error = dir->i_op->mknod(dir, dentry, mode, dev); 2473 if (!error) 2474 fsnotify_create(dir, dentry); 2475 return error; 2476 } 2477 2478 static int may_mknod(umode_t mode) 2479 { 2480 switch (mode & S_IFMT) { 2481 case S_IFREG: 2482 case S_IFCHR: 2483 case S_IFBLK: 2484 case S_IFIFO: 2485 case S_IFSOCK: 2486 case 0: /* zero mode translates to S_IFREG */ 2487 return 0; 2488 case S_IFDIR: 2489 return -EPERM; 2490 default: 2491 return -EINVAL; 2492 } 2493 } 2494 2495 SYSCALL_DEFINE4(mknodat, int, dfd, const char __user *, filename, umode_t, mode, 2496 unsigned, dev) 2497 { 2498 struct dentry *dentry; 2499 struct path path; 2500 int error; 2501 2502 if (S_ISDIR(mode)) 2503 return -EPERM; 2504 2505 dentry = user_path_create(dfd, filename, &path, 0); 2506 if (IS_ERR(dentry)) 2507 return PTR_ERR(dentry); 2508 2509 if (!IS_POSIXACL(path.dentry->d_inode)) 2510 mode &= ~current_umask(); 2511 error = may_mknod(mode); 2512 if (error) 2513 goto out_dput; 2514 error = mnt_want_write(path.mnt); 2515 if (error) 2516 goto out_dput; 2517 error = security_path_mknod(&path, dentry, mode, dev); 2518 if (error) 2519 goto out_drop_write; 2520 switch (mode & S_IFMT) { 2521 case 0: case S_IFREG: 2522 error = vfs_create(path.dentry->d_inode,dentry,mode,NULL); 2523 break; 2524 case S_IFCHR: case S_IFBLK: 2525 error = vfs_mknod(path.dentry->d_inode,dentry,mode, 2526 new_decode_dev(dev)); 2527 break; 2528 case S_IFIFO: case S_IFSOCK: 2529 error = vfs_mknod(path.dentry->d_inode,dentry,mode,0); 2530 break; 2531 } 2532 out_drop_write: 2533 mnt_drop_write(path.mnt); 2534 out_dput: 2535 dput(dentry); 2536 mutex_unlock(&path.dentry->d_inode->i_mutex); 2537 path_put(&path); 2538 2539 return 
error; 2540 } 2541 2542 SYSCALL_DEFINE3(mknod, const char __user *, filename, umode_t, mode, unsigned, dev) 2543 { 2544 return sys_mknodat(AT_FDCWD, filename, mode, dev); 2545 } 2546 2547 int vfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) 2548 { 2549 int error = may_create(dir, dentry); 2550 2551 if (error) 2552 return error; 2553 2554 if (!dir->i_op->mkdir) 2555 return -EPERM; 2556 2557 mode &= (S_IRWXUGO|S_ISVTX); 2558 error = security_inode_mkdir(dir, dentry, mode); 2559 if (error) 2560 return error; 2561 2562 error = dir->i_op->mkdir(dir, dentry, mode); 2563 if (!error) 2564 fsnotify_mkdir(dir, dentry); 2565 return error; 2566 } 2567 2568 SYSCALL_DEFINE3(mkdirat, int, dfd, const char __user *, pathname, umode_t, mode) 2569 { 2570 struct dentry *dentry; 2571 struct path path; 2572 int error; 2573 2574 dentry = user_path_create(dfd, pathname, &path, 1); 2575 if (IS_ERR(dentry)) 2576 return PTR_ERR(dentry); 2577 2578 if (!IS_POSIXACL(path.dentry->d_inode)) 2579 mode &= ~current_umask(); 2580 error = mnt_want_write(path.mnt); 2581 if (error) 2582 goto out_dput; 2583 error = security_path_mkdir(&path, dentry, mode); 2584 if (error) 2585 goto out_drop_write; 2586 error = vfs_mkdir(path.dentry->d_inode, dentry, mode); 2587 out_drop_write: 2588 mnt_drop_write(path.mnt); 2589 out_dput: 2590 dput(dentry); 2591 mutex_unlock(&path.dentry->d_inode->i_mutex); 2592 path_put(&path); 2593 return error; 2594 } 2595 2596 SYSCALL_DEFINE2(mkdir, const char __user *, pathname, umode_t, mode) 2597 { 2598 return sys_mkdirat(AT_FDCWD, pathname, mode); 2599 } 2600 2601 /* 2602 * The dentry_unhash() helper will try to drop the dentry early: we 2603 * should have a usage count of 2 if we're the only user of this 2604 * dentry, and if that is true (possibly after pruning the dcache), 2605 * then we drop the dentry now. 
2606 * 2607 * A low-level filesystem can, if it choses, legally 2608 * do a 2609 * 2610 * if (!d_unhashed(dentry)) 2611 * return -EBUSY; 2612 * 2613 * if it cannot handle the case of removing a directory 2614 * that is still in use by something else.. 2615 */ 2616 void dentry_unhash(struct dentry *dentry) 2617 { 2618 shrink_dcache_parent(dentry); 2619 spin_lock(&dentry->d_lock); 2620 if (dentry->d_count == 1) 2621 __d_drop(dentry); 2622 spin_unlock(&dentry->d_lock); 2623 } 2624 2625 int vfs_rmdir(struct inode *dir, struct dentry *dentry) 2626 { 2627 int error = may_delete(dir, dentry, 1); 2628 2629 if (error) 2630 return error; 2631 2632 if (!dir->i_op->rmdir) 2633 return -EPERM; 2634 2635 dget(dentry); 2636 mutex_lock(&dentry->d_inode->i_mutex); 2637 2638 error = -EBUSY; 2639 if (d_mountpoint(dentry)) 2640 goto out; 2641 2642 error = security_inode_rmdir(dir, dentry); 2643 if (error) 2644 goto out; 2645 2646 shrink_dcache_parent(dentry); 2647 error = dir->i_op->rmdir(dir, dentry); 2648 if (error) 2649 goto out; 2650 2651 dentry->d_inode->i_flags |= S_DEAD; 2652 dont_mount(dentry); 2653 2654 out: 2655 mutex_unlock(&dentry->d_inode->i_mutex); 2656 dput(dentry); 2657 if (!error) 2658 d_delete(dentry); 2659 return error; 2660 } 2661 2662 static long do_rmdir(int dfd, const char __user *pathname) 2663 { 2664 int error = 0; 2665 char * name; 2666 struct dentry *dentry; 2667 struct nameidata nd; 2668 2669 error = user_path_parent(dfd, pathname, &nd, &name); 2670 if (error) 2671 return error; 2672 2673 switch(nd.last_type) { 2674 case LAST_DOTDOT: 2675 error = -ENOTEMPTY; 2676 goto exit1; 2677 case LAST_DOT: 2678 error = -EINVAL; 2679 goto exit1; 2680 case LAST_ROOT: 2681 error = -EBUSY; 2682 goto exit1; 2683 } 2684 2685 nd.flags &= ~LOOKUP_PARENT; 2686 2687 mutex_lock_nested(&nd.path.dentry->d_inode->i_mutex, I_MUTEX_PARENT); 2688 dentry = lookup_hash(&nd); 2689 error = PTR_ERR(dentry); 2690 if (IS_ERR(dentry)) 2691 goto exit2; 2692 if (!dentry->d_inode) { 2693 error = 
-ENOENT; 2694 goto exit3; 2695 } 2696 error = mnt_want_write(nd.path.mnt); 2697 if (error) 2698 goto exit3; 2699 error = security_path_rmdir(&nd.path, dentry); 2700 if (error) 2701 goto exit4; 2702 error = vfs_rmdir(nd.path.dentry->d_inode, dentry); 2703 exit4: 2704 mnt_drop_write(nd.path.mnt); 2705 exit3: 2706 dput(dentry); 2707 exit2: 2708 mutex_unlock(&nd.path.dentry->d_inode->i_mutex); 2709 exit1: 2710 path_put(&nd.path); 2711 putname(name); 2712 return error; 2713 } 2714 2715 SYSCALL_DEFINE1(rmdir, const char __user *, pathname) 2716 { 2717 return do_rmdir(AT_FDCWD, pathname); 2718 } 2719 2720 int vfs_unlink(struct inode *dir, struct dentry *dentry) 2721 { 2722 int error = may_delete(dir, dentry, 0); 2723 2724 if (error) 2725 return error; 2726 2727 if (!dir->i_op->unlink) 2728 return -EPERM; 2729 2730 mutex_lock(&dentry->d_inode->i_mutex); 2731 if (d_mountpoint(dentry)) 2732 error = -EBUSY; 2733 else { 2734 error = security_inode_unlink(dir, dentry); 2735 if (!error) { 2736 error = dir->i_op->unlink(dir, dentry); 2737 if (!error) 2738 dont_mount(dentry); 2739 } 2740 } 2741 mutex_unlock(&dentry->d_inode->i_mutex); 2742 2743 /* We don't d_delete() NFS sillyrenamed files--they still exist. */ 2744 if (!error && !(dentry->d_flags & DCACHE_NFSFS_RENAMED)) { 2745 fsnotify_link_count(dentry->d_inode); 2746 d_delete(dentry); 2747 } 2748 2749 return error; 2750 } 2751 2752 /* 2753 * Make sure that the actual truncation of the file will occur outside its 2754 * directory's i_mutex. Truncate can take a long time if there is a lot of 2755 * writeout happening, and we don't want to prevent access to the directory 2756 * while waiting on the I/O. 
2757 */ 2758 static long do_unlinkat(int dfd, const char __user *pathname) 2759 { 2760 int error; 2761 char *name; 2762 struct dentry *dentry; 2763 struct nameidata nd; 2764 struct inode *inode = NULL; 2765 2766 error = user_path_parent(dfd, pathname, &nd, &name); 2767 if (error) 2768 return error; 2769 2770 error = -EISDIR; 2771 if (nd.last_type != LAST_NORM) 2772 goto exit1; 2773 2774 nd.flags &= ~LOOKUP_PARENT; 2775 2776 mutex_lock_nested(&nd.path.dentry->d_inode->i_mutex, I_MUTEX_PARENT); 2777 dentry = lookup_hash(&nd); 2778 error = PTR_ERR(dentry); 2779 if (!IS_ERR(dentry)) { 2780 /* Why not before? Because we want correct error value */ 2781 if (nd.last.name[nd.last.len]) 2782 goto slashes; 2783 inode = dentry->d_inode; 2784 if (!inode) 2785 goto slashes; 2786 ihold(inode); 2787 error = mnt_want_write(nd.path.mnt); 2788 if (error) 2789 goto exit2; 2790 error = security_path_unlink(&nd.path, dentry); 2791 if (error) 2792 goto exit3; 2793 error = vfs_unlink(nd.path.dentry->d_inode, dentry); 2794 exit3: 2795 mnt_drop_write(nd.path.mnt); 2796 exit2: 2797 dput(dentry); 2798 } 2799 mutex_unlock(&nd.path.dentry->d_inode->i_mutex); 2800 if (inode) 2801 iput(inode); /* truncate the inode here */ 2802 exit1: 2803 path_put(&nd.path); 2804 putname(name); 2805 return error; 2806 2807 slashes: 2808 error = !dentry->d_inode ? -ENOENT : 2809 S_ISDIR(dentry->d_inode->i_mode) ? 
-EISDIR : -ENOTDIR; 2810 goto exit2; 2811 } 2812 2813 SYSCALL_DEFINE3(unlinkat, int, dfd, const char __user *, pathname, int, flag) 2814 { 2815 if ((flag & ~AT_REMOVEDIR) != 0) 2816 return -EINVAL; 2817 2818 if (flag & AT_REMOVEDIR) 2819 return do_rmdir(dfd, pathname); 2820 2821 return do_unlinkat(dfd, pathname); 2822 } 2823 2824 SYSCALL_DEFINE1(unlink, const char __user *, pathname) 2825 { 2826 return do_unlinkat(AT_FDCWD, pathname); 2827 } 2828 2829 int vfs_symlink(struct inode *dir, struct dentry *dentry, const char *oldname) 2830 { 2831 int error = may_create(dir, dentry); 2832 2833 if (error) 2834 return error; 2835 2836 if (!dir->i_op->symlink) 2837 return -EPERM; 2838 2839 error = security_inode_symlink(dir, dentry, oldname); 2840 if (error) 2841 return error; 2842 2843 error = dir->i_op->symlink(dir, dentry, oldname); 2844 if (!error) 2845 fsnotify_create(dir, dentry); 2846 return error; 2847 } 2848 2849 SYSCALL_DEFINE3(symlinkat, const char __user *, oldname, 2850 int, newdfd, const char __user *, newname) 2851 { 2852 int error; 2853 char *from; 2854 struct dentry *dentry; 2855 struct path path; 2856 2857 from = getname(oldname); 2858 if (IS_ERR(from)) 2859 return PTR_ERR(from); 2860 2861 dentry = user_path_create(newdfd, newname, &path, 0); 2862 error = PTR_ERR(dentry); 2863 if (IS_ERR(dentry)) 2864 goto out_putname; 2865 2866 error = mnt_want_write(path.mnt); 2867 if (error) 2868 goto out_dput; 2869 error = security_path_symlink(&path, dentry, from); 2870 if (error) 2871 goto out_drop_write; 2872 error = vfs_symlink(path.dentry->d_inode, dentry, from); 2873 out_drop_write: 2874 mnt_drop_write(path.mnt); 2875 out_dput: 2876 dput(dentry); 2877 mutex_unlock(&path.dentry->d_inode->i_mutex); 2878 path_put(&path); 2879 out_putname: 2880 putname(from); 2881 return error; 2882 } 2883 2884 SYSCALL_DEFINE2(symlink, const char __user *, oldname, const char __user *, newname) 2885 { 2886 return sys_symlinkat(oldname, AT_FDCWD, newname); 2887 } 2888 2889 int 
vfs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *new_dentry) 2890 { 2891 struct inode *inode = old_dentry->d_inode; 2892 int error; 2893 2894 if (!inode) 2895 return -ENOENT; 2896 2897 error = may_create(dir, new_dentry); 2898 if (error) 2899 return error; 2900 2901 if (dir->i_sb != inode->i_sb) 2902 return -EXDEV; 2903 2904 /* 2905 * A link to an append-only or immutable file cannot be created. 2906 */ 2907 if (IS_APPEND(inode) || IS_IMMUTABLE(inode)) 2908 return -EPERM; 2909 if (!dir->i_op->link) 2910 return -EPERM; 2911 if (S_ISDIR(inode->i_mode)) 2912 return -EPERM; 2913 2914 error = security_inode_link(old_dentry, dir, new_dentry); 2915 if (error) 2916 return error; 2917 2918 mutex_lock(&inode->i_mutex); 2919 /* Make sure we don't allow creating hardlink to an unlinked file */ 2920 if (inode->i_nlink == 0) 2921 error = -ENOENT; 2922 else 2923 error = dir->i_op->link(old_dentry, dir, new_dentry); 2924 mutex_unlock(&inode->i_mutex); 2925 if (!error) 2926 fsnotify_link(dir, inode, new_dentry); 2927 return error; 2928 } 2929 2930 /* 2931 * Hardlinks are often used in delicate situations. We avoid 2932 * security-related surprises by not following symlinks on the 2933 * newname. --KAB 2934 * 2935 * We don't follow them on the oldname either to be compatible 2936 * with linux 2.0, and to avoid hard-linking to directories 2937 * and other special files. --ADM 2938 */ 2939 SYSCALL_DEFINE5(linkat, int, olddfd, const char __user *, oldname, 2940 int, newdfd, const char __user *, newname, int, flags) 2941 { 2942 struct dentry *new_dentry; 2943 struct path old_path, new_path; 2944 int how = 0; 2945 int error; 2946 2947 if ((flags & ~(AT_SYMLINK_FOLLOW | AT_EMPTY_PATH)) != 0) 2948 return -EINVAL; 2949 /* 2950 * To use null names we require CAP_DAC_READ_SEARCH 2951 * This ensures that not everyone will be able to create 2952 * handlink using the passed filedescriptor. 
2953 */ 2954 if (flags & AT_EMPTY_PATH) { 2955 if (!capable(CAP_DAC_READ_SEARCH)) 2956 return -ENOENT; 2957 how = LOOKUP_EMPTY; 2958 } 2959 2960 if (flags & AT_SYMLINK_FOLLOW) 2961 how |= LOOKUP_FOLLOW; 2962 2963 error = user_path_at(olddfd, oldname, how, &old_path); 2964 if (error) 2965 return error; 2966 2967 new_dentry = user_path_create(newdfd, newname, &new_path, 0); 2968 error = PTR_ERR(new_dentry); 2969 if (IS_ERR(new_dentry)) 2970 goto out; 2971 2972 error = -EXDEV; 2973 if (old_path.mnt != new_path.mnt) 2974 goto out_dput; 2975 error = mnt_want_write(new_path.mnt); 2976 if (error) 2977 goto out_dput; 2978 error = security_path_link(old_path.dentry, &new_path, new_dentry); 2979 if (error) 2980 goto out_drop_write; 2981 error = vfs_link(old_path.dentry, new_path.dentry->d_inode, new_dentry); 2982 out_drop_write: 2983 mnt_drop_write(new_path.mnt); 2984 out_dput: 2985 dput(new_dentry); 2986 mutex_unlock(&new_path.dentry->d_inode->i_mutex); 2987 path_put(&new_path); 2988 out: 2989 path_put(&old_path); 2990 2991 return error; 2992 } 2993 2994 SYSCALL_DEFINE2(link, const char __user *, oldname, const char __user *, newname) 2995 { 2996 return sys_linkat(AT_FDCWD, oldname, AT_FDCWD, newname, 0); 2997 } 2998 2999 /* 3000 * The worst of all namespace operations - renaming directory. "Perverted" 3001 * doesn't even start to describe it. Somebody in UCB had a heck of a trip... 3002 * Problems: 3003 * a) we can get into loop creation. Check is done in is_subdir(). 3004 * b) race potential - two innocent renames can create a loop together. 3005 * That's where 4.4 screws up. Current fix: serialization on 3006 * sb->s_vfs_rename_mutex. We might be more accurate, but that's another 3007 * story. 3008 * c) we have to lock _three_ objects - parents and victim (if it exists). 3009 * And that - after we got ->i_mutex on parents (until then we don't know 3010 * whether the target exists). Solution: try to be smart with locking 3011 * order for inodes. 
We rely on the fact that tree topology may change 3012 * only under ->s_vfs_rename_mutex _and_ that parent of the object we 3013 * move will be locked. Thus we can rank directories by the tree 3014 * (ancestors first) and rank all non-directories after them. 3015 * That works since everybody except rename does "lock parent, lookup, 3016 * lock child" and rename is under ->s_vfs_rename_mutex. 3017 * HOWEVER, it relies on the assumption that any object with ->lookup() 3018 * has no more than 1 dentry. If "hybrid" objects will ever appear, 3019 * we'd better make sure that there's no link(2) for them. 3020 * d) conversion from fhandle to dentry may come in the wrong moment - when 3021 * we are removing the target. Solution: we will have to grab ->i_mutex 3022 * in the fhandle_to_dentry code. [FIXME - current nfsfh.c relies on 3023 * ->i_mutex on parents, which works but leads to some truly excessive 3024 * locking]. 3025 */ 3026 static int vfs_rename_dir(struct inode *old_dir, struct dentry *old_dentry, 3027 struct inode *new_dir, struct dentry *new_dentry) 3028 { 3029 int error = 0; 3030 struct inode *target = new_dentry->d_inode; 3031 3032 /* 3033 * If we are going to change the parent - check write permissions, 3034 * we'll need to flip '..'. 
3035 */ 3036 if (new_dir != old_dir) { 3037 error = inode_permission(old_dentry->d_inode, MAY_WRITE); 3038 if (error) 3039 return error; 3040 } 3041 3042 error = security_inode_rename(old_dir, old_dentry, new_dir, new_dentry); 3043 if (error) 3044 return error; 3045 3046 dget(new_dentry); 3047 if (target) 3048 mutex_lock(&target->i_mutex); 3049 3050 error = -EBUSY; 3051 if (d_mountpoint(old_dentry) || d_mountpoint(new_dentry)) 3052 goto out; 3053 3054 if (target) 3055 shrink_dcache_parent(new_dentry); 3056 error = old_dir->i_op->rename(old_dir, old_dentry, new_dir, new_dentry); 3057 if (error) 3058 goto out; 3059 3060 if (target) { 3061 target->i_flags |= S_DEAD; 3062 dont_mount(new_dentry); 3063 } 3064 out: 3065 if (target) 3066 mutex_unlock(&target->i_mutex); 3067 dput(new_dentry); 3068 if (!error) 3069 if (!(old_dir->i_sb->s_type->fs_flags & FS_RENAME_DOES_D_MOVE)) 3070 d_move(old_dentry,new_dentry); 3071 return error; 3072 } 3073 3074 static int vfs_rename_other(struct inode *old_dir, struct dentry *old_dentry, 3075 struct inode *new_dir, struct dentry *new_dentry) 3076 { 3077 struct inode *target = new_dentry->d_inode; 3078 int error; 3079 3080 error = security_inode_rename(old_dir, old_dentry, new_dir, new_dentry); 3081 if (error) 3082 return error; 3083 3084 dget(new_dentry); 3085 if (target) 3086 mutex_lock(&target->i_mutex); 3087 3088 error = -EBUSY; 3089 if (d_mountpoint(old_dentry)||d_mountpoint(new_dentry)) 3090 goto out; 3091 3092 error = old_dir->i_op->rename(old_dir, old_dentry, new_dir, new_dentry); 3093 if (error) 3094 goto out; 3095 3096 if (target) 3097 dont_mount(new_dentry); 3098 if (!(old_dir->i_sb->s_type->fs_flags & FS_RENAME_DOES_D_MOVE)) 3099 d_move(old_dentry, new_dentry); 3100 out: 3101 if (target) 3102 mutex_unlock(&target->i_mutex); 3103 dput(new_dentry); 3104 return error; 3105 } 3106 3107 int vfs_rename(struct inode *old_dir, struct dentry *old_dentry, 3108 struct inode *new_dir, struct dentry *new_dentry) 3109 { 3110 int error; 3111 
int is_dir = S_ISDIR(old_dentry->d_inode->i_mode); 3112 const unsigned char *old_name; 3113 3114 if (old_dentry->d_inode == new_dentry->d_inode) 3115 return 0; 3116 3117 error = may_delete(old_dir, old_dentry, is_dir); 3118 if (error) 3119 return error; 3120 3121 if (!new_dentry->d_inode) 3122 error = may_create(new_dir, new_dentry); 3123 else 3124 error = may_delete(new_dir, new_dentry, is_dir); 3125 if (error) 3126 return error; 3127 3128 if (!old_dir->i_op->rename) 3129 return -EPERM; 3130 3131 old_name = fsnotify_oldname_init(old_dentry->d_name.name); 3132 3133 if (is_dir) 3134 error = vfs_rename_dir(old_dir,old_dentry,new_dir,new_dentry); 3135 else 3136 error = vfs_rename_other(old_dir,old_dentry,new_dir,new_dentry); 3137 if (!error) 3138 fsnotify_move(old_dir, new_dir, old_name, is_dir, 3139 new_dentry->d_inode, old_dentry); 3140 fsnotify_oldname_free(old_name); 3141 3142 return error; 3143 } 3144 3145 SYSCALL_DEFINE4(renameat, int, olddfd, const char __user *, oldname, 3146 int, newdfd, const char __user *, newname) 3147 { 3148 struct dentry *old_dir, *new_dir; 3149 struct dentry *old_dentry, *new_dentry; 3150 struct dentry *trap; 3151 struct nameidata oldnd, newnd; 3152 char *from; 3153 char *to; 3154 int error; 3155 3156 error = user_path_parent(olddfd, oldname, &oldnd, &from); 3157 if (error) 3158 goto exit; 3159 3160 error = user_path_parent(newdfd, newname, &newnd, &to); 3161 if (error) 3162 goto exit1; 3163 3164 error = -EXDEV; 3165 if (oldnd.path.mnt != newnd.path.mnt) 3166 goto exit2; 3167 3168 old_dir = oldnd.path.dentry; 3169 error = -EBUSY; 3170 if (oldnd.last_type != LAST_NORM) 3171 goto exit2; 3172 3173 new_dir = newnd.path.dentry; 3174 if (newnd.last_type != LAST_NORM) 3175 goto exit2; 3176 3177 oldnd.flags &= ~LOOKUP_PARENT; 3178 newnd.flags &= ~LOOKUP_PARENT; 3179 newnd.flags |= LOOKUP_RENAME_TARGET; 3180 3181 trap = lock_rename(new_dir, old_dir); 3182 3183 old_dentry = lookup_hash(&oldnd); 3184 error = PTR_ERR(old_dentry); 3185 if 
(IS_ERR(old_dentry)) 3186 goto exit3; 3187 /* source must exist */ 3188 error = -ENOENT; 3189 if (!old_dentry->d_inode) 3190 goto exit4; 3191 /* unless the source is a directory trailing slashes give -ENOTDIR */ 3192 if (!S_ISDIR(old_dentry->d_inode->i_mode)) { 3193 error = -ENOTDIR; 3194 if (oldnd.last.name[oldnd.last.len]) 3195 goto exit4; 3196 if (newnd.last.name[newnd.last.len]) 3197 goto exit4; 3198 } 3199 /* source should not be ancestor of target */ 3200 error = -EINVAL; 3201 if (old_dentry == trap) 3202 goto exit4; 3203 new_dentry = lookup_hash(&newnd); 3204 error = PTR_ERR(new_dentry); 3205 if (IS_ERR(new_dentry)) 3206 goto exit4; 3207 /* target should not be an ancestor of source */ 3208 error = -ENOTEMPTY; 3209 if (new_dentry == trap) 3210 goto exit5; 3211 3212 error = mnt_want_write(oldnd.path.mnt); 3213 if (error) 3214 goto exit5; 3215 error = security_path_rename(&oldnd.path, old_dentry, 3216 &newnd.path, new_dentry); 3217 if (error) 3218 goto exit6; 3219 error = vfs_rename(old_dir->d_inode, old_dentry, 3220 new_dir->d_inode, new_dentry); 3221 exit6: 3222 mnt_drop_write(oldnd.path.mnt); 3223 exit5: 3224 dput(new_dentry); 3225 exit4: 3226 dput(old_dentry); 3227 exit3: 3228 unlock_rename(new_dir, old_dir); 3229 exit2: 3230 path_put(&newnd.path); 3231 putname(to); 3232 exit1: 3233 path_put(&oldnd.path); 3234 putname(from); 3235 exit: 3236 return error; 3237 } 3238 3239 SYSCALL_DEFINE2(rename, const char __user *, oldname, const char __user *, newname) 3240 { 3241 return sys_renameat(AT_FDCWD, oldname, AT_FDCWD, newname); 3242 } 3243 3244 int vfs_readlink(struct dentry *dentry, char __user *buffer, int buflen, const char *link) 3245 { 3246 int len; 3247 3248 len = PTR_ERR(link); 3249 if (IS_ERR(link)) 3250 goto out; 3251 3252 len = strlen(link); 3253 if (len > (unsigned) buflen) 3254 len = buflen; 3255 if (copy_to_user(buffer, link, len)) 3256 len = -EFAULT; 3257 out: 3258 return len; 3259 } 3260 3261 /* 3262 * A helper for ->readlink(). 
This should be used *ONLY* for symlinks that 3263 * have ->follow_link() touching nd only in nd_set_link(). Using (or not 3264 * using) it for any given inode is up to filesystem. 3265 */ 3266 int generic_readlink(struct dentry *dentry, char __user *buffer, int buflen) 3267 { 3268 struct nameidata nd; 3269 void *cookie; 3270 int res; 3271 3272 nd.depth = 0; 3273 cookie = dentry->d_inode->i_op->follow_link(dentry, &nd); 3274 if (IS_ERR(cookie)) 3275 return PTR_ERR(cookie); 3276 3277 res = vfs_readlink(dentry, buffer, buflen, nd_get_link(&nd)); 3278 if (dentry->d_inode->i_op->put_link) 3279 dentry->d_inode->i_op->put_link(dentry, &nd, cookie); 3280 return res; 3281 } 3282 3283 int vfs_follow_link(struct nameidata *nd, const char *link) 3284 { 3285 return __vfs_follow_link(nd, link); 3286 } 3287 3288 /* get the link contents into pagecache */ 3289 static char *page_getlink(struct dentry * dentry, struct page **ppage) 3290 { 3291 char *kaddr; 3292 struct page *page; 3293 struct address_space *mapping = dentry->d_inode->i_mapping; 3294 page = read_mapping_page(mapping, 0, NULL); 3295 if (IS_ERR(page)) 3296 return (char*)page; 3297 *ppage = page; 3298 kaddr = kmap(page); 3299 nd_terminate_link(kaddr, dentry->d_inode->i_size, PAGE_SIZE - 1); 3300 return kaddr; 3301 } 3302 3303 int page_readlink(struct dentry *dentry, char __user *buffer, int buflen) 3304 { 3305 struct page *page = NULL; 3306 char *s = page_getlink(dentry, &page); 3307 int res = vfs_readlink(dentry,buffer,buflen,s); 3308 if (page) { 3309 kunmap(page); 3310 page_cache_release(page); 3311 } 3312 return res; 3313 } 3314 3315 void *page_follow_link_light(struct dentry *dentry, struct nameidata *nd) 3316 { 3317 struct page *page = NULL; 3318 nd_set_link(nd, page_getlink(dentry, &page)); 3319 return page; 3320 } 3321 3322 void page_put_link(struct dentry *dentry, struct nameidata *nd, void *cookie) 3323 { 3324 struct page *page = cookie; 3325 3326 if (page) { 3327 kunmap(page); 3328 page_cache_release(page); 
3329 } 3330 } 3331 3332 /* 3333 * The nofs argument instructs pagecache_write_begin to pass AOP_FLAG_NOFS 3334 */ 3335 int __page_symlink(struct inode *inode, const char *symname, int len, int nofs) 3336 { 3337 struct address_space *mapping = inode->i_mapping; 3338 struct page *page; 3339 void *fsdata; 3340 int err; 3341 char *kaddr; 3342 unsigned int flags = AOP_FLAG_UNINTERRUPTIBLE; 3343 if (nofs) 3344 flags |= AOP_FLAG_NOFS; 3345 3346 retry: 3347 err = pagecache_write_begin(NULL, mapping, 0, len-1, 3348 flags, &page, &fsdata); 3349 if (err) 3350 goto fail; 3351 3352 kaddr = kmap_atomic(page, KM_USER0); 3353 memcpy(kaddr, symname, len-1); 3354 kunmap_atomic(kaddr, KM_USER0); 3355 3356 err = pagecache_write_end(NULL, mapping, 0, len-1, len-1, 3357 page, fsdata); 3358 if (err < 0) 3359 goto fail; 3360 if (err < len-1) 3361 goto retry; 3362 3363 mark_inode_dirty(inode); 3364 return 0; 3365 fail: 3366 return err; 3367 } 3368 3369 int page_symlink(struct inode *inode, const char *symname, int len) 3370 { 3371 return __page_symlink(inode, symname, len, 3372 !(mapping_gfp_mask(inode->i_mapping) & __GFP_FS)); 3373 } 3374 3375 const struct inode_operations page_symlink_inode_operations = { 3376 .readlink = generic_readlink, 3377 .follow_link = page_follow_link_light, 3378 .put_link = page_put_link, 3379 }; 3380 3381 EXPORT_SYMBOL(user_path_at); 3382 EXPORT_SYMBOL(follow_down_one); 3383 EXPORT_SYMBOL(follow_down); 3384 EXPORT_SYMBOL(follow_up); 3385 EXPORT_SYMBOL(get_write_access); /* binfmt_aout */ 3386 EXPORT_SYMBOL(getname); 3387 EXPORT_SYMBOL(lock_rename); 3388 EXPORT_SYMBOL(lookup_one_len); 3389 EXPORT_SYMBOL(page_follow_link_light); 3390 EXPORT_SYMBOL(page_put_link); 3391 EXPORT_SYMBOL(page_readlink); 3392 EXPORT_SYMBOL(__page_symlink); 3393 EXPORT_SYMBOL(page_symlink); 3394 EXPORT_SYMBOL(page_symlink_inode_operations); 3395 EXPORT_SYMBOL(kern_path); 3396 EXPORT_SYMBOL(vfs_path_lookup); 3397 EXPORT_SYMBOL(inode_permission); 3398 EXPORT_SYMBOL(unlock_rename); 3399 
EXPORT_SYMBOL(vfs_create); 3400 EXPORT_SYMBOL(vfs_follow_link); 3401 EXPORT_SYMBOL(vfs_link); 3402 EXPORT_SYMBOL(vfs_mkdir); 3403 EXPORT_SYMBOL(vfs_mknod); 3404 EXPORT_SYMBOL(generic_permission); 3405 EXPORT_SYMBOL(vfs_readlink); 3406 EXPORT_SYMBOL(vfs_rename); 3407 EXPORT_SYMBOL(vfs_rmdir); 3408 EXPORT_SYMBOL(vfs_symlink); 3409 EXPORT_SYMBOL(vfs_unlink); 3410 EXPORT_SYMBOL(dentry_unhash); 3411 EXPORT_SYMBOL(generic_readlink); 3412