1 /* 2 * linux/fs/namei.c 3 * 4 * Copyright (C) 1991, 1992 Linus Torvalds 5 */ 6 7 /* 8 * Some corrections by tytso. 9 */ 10 11 /* [Feb 1997 T. Schoebel-Theuer] Complete rewrite of the pathname 12 * lookup logic. 13 */ 14 /* [Feb-Apr 2000, AV] Rewrite to the new namespace architecture. 15 */ 16 17 #include <linux/init.h> 18 #include <linux/export.h> 19 #include <linux/kernel.h> 20 #include <linux/slab.h> 21 #include <linux/fs.h> 22 #include <linux/namei.h> 23 #include <linux/pagemap.h> 24 #include <linux/fsnotify.h> 25 #include <linux/personality.h> 26 #include <linux/security.h> 27 #include <linux/ima.h> 28 #include <linux/syscalls.h> 29 #include <linux/mount.h> 30 #include <linux/audit.h> 31 #include <linux/capability.h> 32 #include <linux/file.h> 33 #include <linux/fcntl.h> 34 #include <linux/device_cgroup.h> 35 #include <linux/fs_struct.h> 36 #include <linux/posix_acl.h> 37 #include <asm/uaccess.h> 38 39 #include "internal.h" 40 #include "mount.h" 41 42 /* [Feb-1997 T. Schoebel-Theuer] 43 * Fundamental changes in the pathname lookup mechanisms (namei) 44 * were necessary because of omirr. The reason is that omirr needs 45 * to know the _real_ pathname, not the user-supplied one, in case 46 * of symlinks (and also when transname replacements occur). 47 * 48 * The new code replaces the old recursive symlink resolution with 49 * an iterative one (in case of non-nested symlink chains). It does 50 * this with calls to <fs>_follow_link(). 51 * As a side effect, dir_namei(), _namei() and follow_link() are now 52 * replaced with a single function lookup_dentry() that can handle all 53 * the special cases of the former code. 54 * 55 * With the new dcache, the pathname is stored at each inode, at least as 56 * long as the refcount of the inode is positive. As a side effect, the 57 * size of the dcache depends on the inode cache and thus is dynamic. 58 * 59 * [29-Apr-1998 C. Scott Ananian] Updated above description of symlink 60 * resolution to correspond with current state of the code. 61 * 62 * Note that the symlink resolution is not *completely* iterative. 63 * There is still a significant amount of tail- and mid- recursion in 64 * the algorithm. Also, note that <fs>_readlink() is not used in 65 * lookup_dentry(): lookup_dentry() on the result of <fs>_readlink() 66 * may return different results than <fs>_follow_link(). Many virtual 67 * filesystems (including /proc) exhibit this behavior. 68 */ 69 70 /* [24-Feb-97 T. Schoebel-Theuer] Side effects caused by new implementation: 71 * New symlink semantics: when open() is called with flags O_CREAT | O_EXCL 72 * and the name already exists in form of a symlink, try to create the new 73 * name indicated by the symlink. The old code always complained that the 74 * name already exists, due to not following the symlink even if its target 75 * is nonexistent. The new semantics affects also mknod() and link() when 76 * the name is a symlink pointing to a non-existent name. 77 * 78 * I don't know which semantics is the right one, since I have no access 79 * to standards. But I found by trial that HP-UX 9.0 has the full "new" 80 * semantics implemented, while SunOS 4.1.1 and Solaris (SunOS 5.4) have the 81 * "old" one. Personally, I think the new semantics is much more logical. 82 * Note that "ln old new" where "new" is a symlink pointing to a non-existing 83 * file does succeed in both HP-UX and SunOs, but not in Solaris 84 * and in the old Linux semantics. 85 */ 86 87 /* [16-Dec-97 Kevin Buhr] For security reasons, we change some symlink 88 * semantics. 
See the comments in "open_namei" and "do_link" below. 89 * 90 * [10-Sep-98 Alan Modra] Another symlink change. 91 */ 92 93 /* [Feb-Apr 2000 AV] Complete rewrite. Rules for symlinks: 94 * inside the path - always follow. 95 * in the last component in creation/removal/renaming - never follow. 96 * if LOOKUP_FOLLOW passed - follow. 97 * if the pathname has trailing slashes - follow. 98 * otherwise - don't follow. 99 * (applied in that order). 100 * 101 * [Jun 2000 AV] Inconsistent behaviour of open() in case if flags==O_CREAT 102 * restored for 2.4. This is the last surviving part of old 4.2BSD bug. 103 * During the 2.4 we need to fix the userland stuff depending on it - 104 * hopefully we will be able to get rid of that wart in 2.5. So far only 105 * XEmacs seems to be relying on it... 106 */ 107 /* 108 * [Sep 2001 AV] Single-semaphore locking scheme (kudos to David Holland) 109 * implemented. Let's see if raised priority of ->s_vfs_rename_mutex gives 110 * any extra contention... 111 */ 112 113 /* In order to reduce some races, while at the same time doing additional 114 * checking and hopefully speeding things up, we copy filenames to the 115 * kernel data space before using them.. 116 * 117 * POSIX.1 2.4: an empty pathname is invalid (ENOENT). 118 * PATH_MAX includes the nul terminator --RR. 119 */ 120 static char *getname_flags(const char __user *filename, int flags, int *empty) 121 { 122 char *result = __getname(), *err; 123 int len; 124 125 if (unlikely(!result)) 126 return ERR_PTR(-ENOMEM); 127 128 len = strncpy_from_user(result, filename, PATH_MAX); 129 err = ERR_PTR(len); 130 if (unlikely(len < 0)) 131 goto error; 132 133 /* The empty path is special. */ 134 if (unlikely(!len)) { 135 if (empty) 136 *empty = 1; 137 err = ERR_PTR(-ENOENT); 138 if (!(flags & LOOKUP_EMPTY)) 139 goto error; 140 } 141 142 err = ERR_PTR(-ENAMETOOLONG); 143 if (likely(len < PATH_MAX)) { 144 audit_getname(result); 145 return result; 146 } 147 148 error: 149 __putname(result); 150 return err; 151 } 152 153 char *getname(const char __user * filename) 154 { 155 return getname_flags(filename, 0, NULL); 156 } 157 158 #ifdef CONFIG_AUDITSYSCALL 159 void putname(const char *name) 160 { 161 if (unlikely(!audit_dummy_context())) 162 audit_putname(name); 163 else 164 __putname(name); 165 } 166 EXPORT_SYMBOL(putname); 167 #endif 168 169 static int check_acl(struct inode *inode, int mask) 170 { 171 #ifdef CONFIG_FS_POSIX_ACL 172 struct posix_acl *acl; 173 174 if (mask & MAY_NOT_BLOCK) { 175 acl = get_cached_acl_rcu(inode, ACL_TYPE_ACCESS); 176 if (!acl) 177 return -EAGAIN; 178 /* no ->get_acl() calls in RCU mode... */ 179 if (acl == ACL_NOT_CACHED) 180 return -ECHILD; 181 return posix_acl_permission(inode, acl, mask & ~MAY_NOT_BLOCK); 182 } 183 184 acl = get_cached_acl(inode, ACL_TYPE_ACCESS); 185 186 /* 187 * A filesystem can force a ACL callback by just never filling the 188 * ACL cache. But normally you'd fill the cache either at inode 189 * instantiation time, or on the first ->get_acl call. 190 * 191 * If the filesystem doesn't have a get_acl() function at all, we'll 192 * just create the negative cache entry. 
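 *
 * For example: a filesystem that never fills the cache gets its ->get_acl()
 * called on every permission check, whereas once the set_cached_acl(..., NULL)
 * call below has installed a negative entry, check_acl() returns -EAGAIN
 * straight away and acl_permission_check() simply falls back to the ordinary
 * group/other mode bits.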
 */
	if (acl == ACL_NOT_CACHED) {
		if (inode->i_op->get_acl) {
			acl = inode->i_op->get_acl(inode, ACL_TYPE_ACCESS);
			if (IS_ERR(acl))
				return PTR_ERR(acl);
		} else {
			set_cached_acl(inode, ACL_TYPE_ACCESS, NULL);
			return -EAGAIN;
		}
	}

	if (acl) {
		int error = posix_acl_permission(inode, acl, mask);
		posix_acl_release(acl);
		return error;
	}
#endif

	return -EAGAIN;
}

/*
 * This does the basic permission checking
 */
static int acl_permission_check(struct inode *inode, int mask)
{
	unsigned int mode = inode->i_mode;

	if (likely(uid_eq(current_fsuid(), inode->i_uid)))
		mode >>= 6;
	else {
		if (IS_POSIXACL(inode) && (mode & S_IRWXG)) {
			int error = check_acl(inode, mask);
			if (error != -EAGAIN)
				return error;
		}

		if (in_group_p(inode->i_gid))
			mode >>= 3;
	}

	/*
	 * If the DACs are ok we don't need any capability check.
	 */
	if ((mask & ~mode & (MAY_READ | MAY_WRITE | MAY_EXEC)) == 0)
		return 0;
	return -EACCES;
}

/**
 * generic_permission - check for access rights on a Posix-like filesystem
 * @inode: inode to check access rights for
 * @mask: right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC, ...)
 *
 * Used to check for read/write/execute permissions on a file.
 * We use "fsuid" for this, letting us set arbitrary permissions
 * for filesystem access without changing the "normal" uids which
 * are used for other things.
 *
 * generic_permission is rcu-walk aware. It returns -ECHILD in case an rcu-walk
 * request cannot be satisfied (eg. requires blocking or too much complexity).
 * It would then be called again in ref-walk mode.
 */
int generic_permission(struct inode *inode, int mask)
{
	int ret;

	/*
	 * Do the basic permission checks.
	 */
	ret = acl_permission_check(inode, mask);
	if (ret != -EACCES)
		return ret;

	if (S_ISDIR(inode->i_mode)) {
		/* DACs are overridable for directories */
		if (inode_capable(inode, CAP_DAC_OVERRIDE))
			return 0;
		if (!(mask & MAY_WRITE))
			if (inode_capable(inode, CAP_DAC_READ_SEARCH))
				return 0;
		return -EACCES;
	}
	/*
	 * Read/write DACs are always overridable.
	 * Executable DACs are overridable when there is
	 * at least one exec bit set.
	 */
	if (!(mask & MAY_EXEC) || (inode->i_mode & S_IXUGO))
		if (inode_capable(inode, CAP_DAC_OVERRIDE))
			return 0;

	/*
	 * Searching includes executable on directories, else just read.
	 */
	mask &= MAY_READ | MAY_WRITE | MAY_EXEC;
	if (mask == MAY_READ)
		if (inode_capable(inode, CAP_DAC_READ_SEARCH))
			return 0;

	return -EACCES;
}

/*
 * We _really_ want to just do "generic_permission()" without
 * even looking at the inode->i_op values. So we keep a cache
 * flag in inode->i_opflags, that says "this has no special
 * permission function, use the fast case".
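 *
 * For example, the first inode_permission() call on an inode whose
 * filesystem provides no ->permission() method takes i_lock once in
 * do_inode_permission() below to set IOP_FASTPERM; every later check on
 * that inode then goes straight to generic_permission().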
 */
static inline int do_inode_permission(struct inode *inode, int mask)
{
	if (unlikely(!(inode->i_opflags & IOP_FASTPERM))) {
		if (likely(inode->i_op->permission))
			return inode->i_op->permission(inode, mask);

		/* This gets set once for the inode lifetime */
		spin_lock(&inode->i_lock);
		inode->i_opflags |= IOP_FASTPERM;
		spin_unlock(&inode->i_lock);
	}
	return generic_permission(inode, mask);
}

/**
 * inode_permission - check for access rights to a given inode
 * @inode: inode to check permission on
 * @mask: right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC, ...)
 *
 * Used to check for read/write/execute permissions on an inode.
 * We use "fsuid" for this, letting us set arbitrary permissions
 * for filesystem access without changing the "normal" uids which
 * are used for other things.
 *
 * When checking for MAY_APPEND, MAY_WRITE must also be set in @mask.
 */
int inode_permission(struct inode *inode, int mask)
{
	int retval;

	if (unlikely(mask & MAY_WRITE)) {
		umode_t mode = inode->i_mode;

		/*
		 * Nobody gets write access to a read-only fs.
		 */
		if (IS_RDONLY(inode) &&
		    (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode)))
			return -EROFS;

		/*
		 * Nobody gets write access to an immutable file.
		 */
		if (IS_IMMUTABLE(inode))
			return -EACCES;
	}

	retval = do_inode_permission(inode, mask);
	if (retval)
		return retval;

	retval = devcgroup_inode_permission(inode, mask);
	if (retval)
		return retval;

	return security_inode_permission(inode, mask);
}

/**
 * path_get - get a reference to a path
 * @path: path to get the reference to
 *
 * Given a path increment the reference count to the dentry and the vfsmount.
 */
void path_get(struct path *path)
{
	mntget(path->mnt);
	dget(path->dentry);
}
EXPORT_SYMBOL(path_get);

/**
 * path_put - put a reference to a path
 * @path: path to put the reference to
 *
 * Given a path decrement the reference count to the dentry and the vfsmount.
 */
void path_put(struct path *path)
{
	dput(path->dentry);
	mntput(path->mnt);
}
EXPORT_SYMBOL(path_put);

/*
 * Path walking has 2 modes, rcu-walk and ref-walk (see
 * Documentation/filesystems/path-lookup.txt). In situations when we can't
 * continue in RCU mode, we attempt to drop out of rcu-walk mode and grab
 * normal reference counts on dentries and vfsmounts to transition to ref-walk
 * mode. Refcounts are grabbed at the last known good point before rcu-walk
 * got stuck, so ref-walk may continue from there. If this is not successful
 * (eg. a seqcount has changed), then failure is returned and it's up to caller
 * to restart the path walk from the beginning in ref-walk mode.
 */

/**
 * unlazy_walk - try to switch to ref-walk mode.
 * @nd: nameidata pathwalk data
 * @dentry: child of nd->path.dentry or NULL
 * Returns: 0 on success, -ECHILD on failure
 *
 * unlazy_walk attempts to legitimize the current nd->path, nd->root and dentry
 * for ref-walk mode. @dentry must be a path found by a do_lookup call on
 * @nd or NULL. Must be called from rcu-walk context.
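 *
 * A typical caller is lookup_fast() below, which jumps to its "unlazy"
 * label when, for example, d_revalidate() or __follow_mount_rcu() cannot
 * proceed under RCU; if unlazy_walk() succeeds the walk carries on holding
 * real references, and if it fails the lookup is restarted from scratch in
 * ref-walk mode.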
 */
static int unlazy_walk(struct nameidata *nd, struct dentry *dentry)
{
	struct fs_struct *fs = current->fs;
	struct dentry *parent = nd->path.dentry;
	int want_root = 0;

	BUG_ON(!(nd->flags & LOOKUP_RCU));
	if (nd->root.mnt && !(nd->flags & LOOKUP_ROOT)) {
		want_root = 1;
		spin_lock(&fs->lock);
		if (nd->root.mnt != fs->root.mnt ||
		    nd->root.dentry != fs->root.dentry)
			goto err_root;
	}
	spin_lock(&parent->d_lock);
	if (!dentry) {
		if (!__d_rcu_to_refcount(parent, nd->seq))
			goto err_parent;
		BUG_ON(nd->inode != parent->d_inode);
	} else {
		if (dentry->d_parent != parent)
			goto err_parent;
		spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED);
		if (!__d_rcu_to_refcount(dentry, nd->seq))
			goto err_child;
		/*
		 * If the sequence check on the child dentry passed, then
		 * the child has not been removed from its parent. This
		 * means the parent dentry must be valid and able to take
		 * a reference at this point.
		 */
		BUG_ON(!IS_ROOT(dentry) && dentry->d_parent != parent);
		BUG_ON(!parent->d_count);
		parent->d_count++;
		spin_unlock(&dentry->d_lock);
	}
	spin_unlock(&parent->d_lock);
	if (want_root) {
		path_get(&nd->root);
		spin_unlock(&fs->lock);
	}
	mntget(nd->path.mnt);

	rcu_read_unlock();
	br_read_unlock(&vfsmount_lock);
	nd->flags &= ~LOOKUP_RCU;
	return 0;

err_child:
	spin_unlock(&dentry->d_lock);
err_parent:
	spin_unlock(&parent->d_lock);
err_root:
	if (want_root)
		spin_unlock(&fs->lock);
	return -ECHILD;
}

/**
 * release_open_intent - free up open intent resources
 * @nd: pointer to nameidata
 */
void release_open_intent(struct nameidata *nd)
{
	struct file *file = nd->intent.open.file;

	if (file && !IS_ERR(file)) {
		if (file->f_path.dentry == NULL)
			put_filp(file);
		else
			fput(file);
	}
}

static inline int d_revalidate(struct dentry *dentry, struct nameidata *nd)
{
	return dentry->d_op->d_revalidate(dentry, nd);
}

/**
 * complete_walk - successful completion of path walk
 * @nd: pointer to nameidata
 *
 * If we had been in RCU mode, drop out of it and legitimize nd->path.
 * Revalidate the final result, unless we'd already done that during
 * the path walk or the filesystem doesn't ask for it. Return 0 on
 * success, -error on failure. In case of failure caller does not
 * need to drop nd->path.
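 *
 * path_lookupat() below calls this once the walk has otherwise succeeded;
 * an -ECHILD result simply makes do_path_lookup() repeat the whole lookup
 * in ref-walk mode.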
496 */ 497 static int complete_walk(struct nameidata *nd) 498 { 499 struct dentry *dentry = nd->path.dentry; 500 int status; 501 502 if (nd->flags & LOOKUP_RCU) { 503 nd->flags &= ~LOOKUP_RCU; 504 if (!(nd->flags & LOOKUP_ROOT)) 505 nd->root.mnt = NULL; 506 spin_lock(&dentry->d_lock); 507 if (unlikely(!__d_rcu_to_refcount(dentry, nd->seq))) { 508 spin_unlock(&dentry->d_lock); 509 rcu_read_unlock(); 510 br_read_unlock(&vfsmount_lock); 511 return -ECHILD; 512 } 513 BUG_ON(nd->inode != dentry->d_inode); 514 spin_unlock(&dentry->d_lock); 515 mntget(nd->path.mnt); 516 rcu_read_unlock(); 517 br_read_unlock(&vfsmount_lock); 518 } 519 520 if (likely(!(nd->flags & LOOKUP_JUMPED))) 521 return 0; 522 523 if (likely(!(dentry->d_flags & DCACHE_OP_REVALIDATE))) 524 return 0; 525 526 if (likely(!(dentry->d_sb->s_type->fs_flags & FS_REVAL_DOT))) 527 return 0; 528 529 /* Note: we do not d_invalidate() */ 530 status = d_revalidate(dentry, nd); 531 if (status > 0) 532 return 0; 533 534 if (!status) 535 status = -ESTALE; 536 537 path_put(&nd->path); 538 return status; 539 } 540 541 static __always_inline void set_root(struct nameidata *nd) 542 { 543 if (!nd->root.mnt) 544 get_fs_root(current->fs, &nd->root); 545 } 546 547 static int link_path_walk(const char *, struct nameidata *); 548 549 static __always_inline void set_root_rcu(struct nameidata *nd) 550 { 551 if (!nd->root.mnt) { 552 struct fs_struct *fs = current->fs; 553 unsigned seq; 554 555 do { 556 seq = read_seqcount_begin(&fs->seq); 557 nd->root = fs->root; 558 nd->seq = __read_seqcount_begin(&nd->root.dentry->d_seq); 559 } while (read_seqcount_retry(&fs->seq, seq)); 560 } 561 } 562 563 static __always_inline int __vfs_follow_link(struct nameidata *nd, const char *link) 564 { 565 int ret; 566 567 if (IS_ERR(link)) 568 goto fail; 569 570 if (*link == '/') { 571 set_root(nd); 572 path_put(&nd->path); 573 nd->path = nd->root; 574 path_get(&nd->root); 575 nd->flags |= LOOKUP_JUMPED; 576 } 577 nd->inode = nd->path.dentry->d_inode; 578 579 ret = link_path_walk(link, nd); 580 return ret; 581 fail: 582 path_put(&nd->path); 583 return PTR_ERR(link); 584 } 585 586 static void path_put_conditional(struct path *path, struct nameidata *nd) 587 { 588 dput(path->dentry); 589 if (path->mnt != nd->path.mnt) 590 mntput(path->mnt); 591 } 592 593 static inline void path_to_nameidata(const struct path *path, 594 struct nameidata *nd) 595 { 596 if (!(nd->flags & LOOKUP_RCU)) { 597 dput(nd->path.dentry); 598 if (nd->path.mnt != path->mnt) 599 mntput(nd->path.mnt); 600 } 601 nd->path.mnt = path->mnt; 602 nd->path.dentry = path->dentry; 603 } 604 605 static inline void put_link(struct nameidata *nd, struct path *link, void *cookie) 606 { 607 struct inode *inode = link->dentry->d_inode; 608 if (!IS_ERR(cookie) && inode->i_op->put_link) 609 inode->i_op->put_link(link->dentry, nd, cookie); 610 path_put(link); 611 } 612 613 static __always_inline int 614 follow_link(struct path *link, struct nameidata *nd, void **p) 615 { 616 int error; 617 struct dentry *dentry = link->dentry; 618 619 BUG_ON(nd->flags & LOOKUP_RCU); 620 621 if (link->mnt == nd->path.mnt) 622 mntget(link->mnt); 623 624 if (unlikely(current->total_link_count >= 40)) { 625 *p = ERR_PTR(-ELOOP); /* no ->put_link(), please */ 626 path_put(&nd->path); 627 return -ELOOP; 628 } 629 cond_resched(); 630 current->total_link_count++; 631 632 touch_atime(link); 633 nd_set_link(nd, NULL); 634 635 error = security_inode_follow_link(link->dentry, nd); 636 if (error) { 637 *p = ERR_PTR(error); /* no ->put_link(), please */ 638 
path_put(&nd->path); 639 return error; 640 } 641 642 nd->last_type = LAST_BIND; 643 *p = dentry->d_inode->i_op->follow_link(dentry, nd); 644 error = PTR_ERR(*p); 645 if (!IS_ERR(*p)) { 646 char *s = nd_get_link(nd); 647 error = 0; 648 if (s) 649 error = __vfs_follow_link(nd, s); 650 else if (nd->last_type == LAST_BIND) { 651 nd->flags |= LOOKUP_JUMPED; 652 nd->inode = nd->path.dentry->d_inode; 653 if (nd->inode->i_op->follow_link) { 654 /* stepped on a _really_ weird one */ 655 path_put(&nd->path); 656 error = -ELOOP; 657 } 658 } 659 } 660 return error; 661 } 662 663 static int follow_up_rcu(struct path *path) 664 { 665 struct mount *mnt = real_mount(path->mnt); 666 struct mount *parent; 667 struct dentry *mountpoint; 668 669 parent = mnt->mnt_parent; 670 if (&parent->mnt == path->mnt) 671 return 0; 672 mountpoint = mnt->mnt_mountpoint; 673 path->dentry = mountpoint; 674 path->mnt = &parent->mnt; 675 return 1; 676 } 677 678 int follow_up(struct path *path) 679 { 680 struct mount *mnt = real_mount(path->mnt); 681 struct mount *parent; 682 struct dentry *mountpoint; 683 684 br_read_lock(&vfsmount_lock); 685 parent = mnt->mnt_parent; 686 if (&parent->mnt == path->mnt) { 687 br_read_unlock(&vfsmount_lock); 688 return 0; 689 } 690 mntget(&parent->mnt); 691 mountpoint = dget(mnt->mnt_mountpoint); 692 br_read_unlock(&vfsmount_lock); 693 dput(path->dentry); 694 path->dentry = mountpoint; 695 mntput(path->mnt); 696 path->mnt = &parent->mnt; 697 return 1; 698 } 699 700 /* 701 * Perform an automount 702 * - return -EISDIR to tell follow_managed() to stop and return the path we 703 * were called with. 704 */ 705 static int follow_automount(struct path *path, unsigned flags, 706 bool *need_mntput) 707 { 708 struct vfsmount *mnt; 709 int err; 710 711 if (!path->dentry->d_op || !path->dentry->d_op->d_automount) 712 return -EREMOTE; 713 714 /* We don't want to mount if someone's just doing a stat - 715 * unless they're stat'ing a directory and appended a '/' to 716 * the name. 717 * 718 * We do, however, want to mount if someone wants to open or 719 * create a file of any type under the mountpoint, wants to 720 * traverse through the mountpoint or wants to open the 721 * mounted directory. Also, autofs may mark negative dentries 722 * as being automount points. These will need the attentions 723 * of the daemon to instantiate them before they can be used. 724 */ 725 if (!(flags & (LOOKUP_PARENT | LOOKUP_DIRECTORY | 726 LOOKUP_OPEN | LOOKUP_CREATE | LOOKUP_AUTOMOUNT)) && 727 path->dentry->d_inode) 728 return -EISDIR; 729 730 current->total_link_count++; 731 if (current->total_link_count >= 40) 732 return -ELOOP; 733 734 mnt = path->dentry->d_op->d_automount(path); 735 if (IS_ERR(mnt)) { 736 /* 737 * The filesystem is allowed to return -EISDIR here to indicate 738 * it doesn't want to automount. For instance, autofs would do 739 * this so that its userspace daemon can mount on this dentry. 740 * 741 * However, we can only permit this if it's a terminal point in 742 * the path being looked up; if it wasn't then the remainder of 743 * the path is inaccessible and we should say so. 
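 *
 * LOOKUP_PARENT in flags means this dentry is not that terminal point
 * (more of the pathname still has to be resolved past it), which is why
 * an -EISDIR answer from ->d_automount() is converted to -EREMOTE below
 * rather than silently accepted.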
744 */ 745 if (PTR_ERR(mnt) == -EISDIR && (flags & LOOKUP_PARENT)) 746 return -EREMOTE; 747 return PTR_ERR(mnt); 748 } 749 750 if (!mnt) /* mount collision */ 751 return 0; 752 753 if (!*need_mntput) { 754 /* lock_mount() may release path->mnt on error */ 755 mntget(path->mnt); 756 *need_mntput = true; 757 } 758 err = finish_automount(mnt, path); 759 760 switch (err) { 761 case -EBUSY: 762 /* Someone else made a mount here whilst we were busy */ 763 return 0; 764 case 0: 765 path_put(path); 766 path->mnt = mnt; 767 path->dentry = dget(mnt->mnt_root); 768 return 0; 769 default: 770 return err; 771 } 772 773 } 774 775 /* 776 * Handle a dentry that is managed in some way. 777 * - Flagged for transit management (autofs) 778 * - Flagged as mountpoint 779 * - Flagged as automount point 780 * 781 * This may only be called in refwalk mode. 782 * 783 * Serialization is taken care of in namespace.c 784 */ 785 static int follow_managed(struct path *path, unsigned flags) 786 { 787 struct vfsmount *mnt = path->mnt; /* held by caller, must be left alone */ 788 unsigned managed; 789 bool need_mntput = false; 790 int ret = 0; 791 792 /* Given that we're not holding a lock here, we retain the value in a 793 * local variable for each dentry as we look at it so that we don't see 794 * the components of that value change under us */ 795 while (managed = ACCESS_ONCE(path->dentry->d_flags), 796 managed &= DCACHE_MANAGED_DENTRY, 797 unlikely(managed != 0)) { 798 /* Allow the filesystem to manage the transit without i_mutex 799 * being held. */ 800 if (managed & DCACHE_MANAGE_TRANSIT) { 801 BUG_ON(!path->dentry->d_op); 802 BUG_ON(!path->dentry->d_op->d_manage); 803 ret = path->dentry->d_op->d_manage(path->dentry, false); 804 if (ret < 0) 805 break; 806 } 807 808 /* Transit to a mounted filesystem. */ 809 if (managed & DCACHE_MOUNTED) { 810 struct vfsmount *mounted = lookup_mnt(path); 811 if (mounted) { 812 dput(path->dentry); 813 if (need_mntput) 814 mntput(path->mnt); 815 path->mnt = mounted; 816 path->dentry = dget(mounted->mnt_root); 817 need_mntput = true; 818 continue; 819 } 820 821 /* Something is mounted on this dentry in another 822 * namespace and/or whatever was mounted there in this 823 * namespace got unmounted before we managed to get the 824 * vfsmount_lock */ 825 } 826 827 /* Handle an automount point */ 828 if (managed & DCACHE_NEED_AUTOMOUNT) { 829 ret = follow_automount(path, flags, &need_mntput); 830 if (ret < 0) 831 break; 832 continue; 833 } 834 835 /* We didn't change the current path point */ 836 break; 837 } 838 839 if (need_mntput && path->mnt == mnt) 840 mntput(path->mnt); 841 if (ret == -EISDIR) 842 ret = 0; 843 return ret < 0 ? ret : need_mntput; 844 } 845 846 int follow_down_one(struct path *path) 847 { 848 struct vfsmount *mounted; 849 850 mounted = lookup_mnt(path); 851 if (mounted) { 852 dput(path->dentry); 853 mntput(path->mnt); 854 path->mnt = mounted; 855 path->dentry = dget(mounted->mnt_root); 856 return 1; 857 } 858 return 0; 859 } 860 861 static inline bool managed_dentry_might_block(struct dentry *dentry) 862 { 863 return (dentry->d_flags & DCACHE_MANAGE_TRANSIT && 864 dentry->d_op->d_manage(dentry, true) < 0); 865 } 866 867 /* 868 * Try to skip to top of mountpoint pile in rcuwalk mode. Fail if 869 * we meet a managed dentry that would need blocking. 
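 *
 * "Would need blocking" is what managed_dentry_might_block() above tests:
 * d_manage(dentry, true) returning a negative value in rcu-walk. We bail
 * out here so the caller can drop back to ref-walk, where follow_managed()
 * is allowed to block in d_manage().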
870 */ 871 static bool __follow_mount_rcu(struct nameidata *nd, struct path *path, 872 struct inode **inode) 873 { 874 for (;;) { 875 struct mount *mounted; 876 /* 877 * Don't forget we might have a non-mountpoint managed dentry 878 * that wants to block transit. 879 */ 880 if (unlikely(managed_dentry_might_block(path->dentry))) 881 return false; 882 883 if (!d_mountpoint(path->dentry)) 884 break; 885 886 mounted = __lookup_mnt(path->mnt, path->dentry, 1); 887 if (!mounted) 888 break; 889 path->mnt = &mounted->mnt; 890 path->dentry = mounted->mnt.mnt_root; 891 nd->flags |= LOOKUP_JUMPED; 892 nd->seq = read_seqcount_begin(&path->dentry->d_seq); 893 /* 894 * Update the inode too. We don't need to re-check the 895 * dentry sequence number here after this d_inode read, 896 * because a mount-point is always pinned. 897 */ 898 *inode = path->dentry->d_inode; 899 } 900 return true; 901 } 902 903 static void follow_mount_rcu(struct nameidata *nd) 904 { 905 while (d_mountpoint(nd->path.dentry)) { 906 struct mount *mounted; 907 mounted = __lookup_mnt(nd->path.mnt, nd->path.dentry, 1); 908 if (!mounted) 909 break; 910 nd->path.mnt = &mounted->mnt; 911 nd->path.dentry = mounted->mnt.mnt_root; 912 nd->seq = read_seqcount_begin(&nd->path.dentry->d_seq); 913 } 914 } 915 916 static int follow_dotdot_rcu(struct nameidata *nd) 917 { 918 set_root_rcu(nd); 919 920 while (1) { 921 if (nd->path.dentry == nd->root.dentry && 922 nd->path.mnt == nd->root.mnt) { 923 break; 924 } 925 if (nd->path.dentry != nd->path.mnt->mnt_root) { 926 struct dentry *old = nd->path.dentry; 927 struct dentry *parent = old->d_parent; 928 unsigned seq; 929 930 seq = read_seqcount_begin(&parent->d_seq); 931 if (read_seqcount_retry(&old->d_seq, nd->seq)) 932 goto failed; 933 nd->path.dentry = parent; 934 nd->seq = seq; 935 break; 936 } 937 if (!follow_up_rcu(&nd->path)) 938 break; 939 nd->seq = read_seqcount_begin(&nd->path.dentry->d_seq); 940 } 941 follow_mount_rcu(nd); 942 nd->inode = nd->path.dentry->d_inode; 943 return 0; 944 945 failed: 946 nd->flags &= ~LOOKUP_RCU; 947 if (!(nd->flags & LOOKUP_ROOT)) 948 nd->root.mnt = NULL; 949 rcu_read_unlock(); 950 br_read_unlock(&vfsmount_lock); 951 return -ECHILD; 952 } 953 954 /* 955 * Follow down to the covering mount currently visible to userspace. At each 956 * point, the filesystem owning that dentry may be queried as to whether the 957 * caller is permitted to proceed or not. 958 */ 959 int follow_down(struct path *path) 960 { 961 unsigned managed; 962 int ret; 963 964 while (managed = ACCESS_ONCE(path->dentry->d_flags), 965 unlikely(managed & DCACHE_MANAGED_DENTRY)) { 966 /* Allow the filesystem to manage the transit without i_mutex 967 * being held. 968 * 969 * We indicate to the filesystem if someone is trying to mount 970 * something here. This gives autofs the chance to deny anyone 971 * other than its daemon the right to mount on its 972 * superstructure. 973 * 974 * The filesystem may sleep at this point. 975 */ 976 if (managed & DCACHE_MANAGE_TRANSIT) { 977 BUG_ON(!path->dentry->d_op); 978 BUG_ON(!path->dentry->d_op->d_manage); 979 ret = path->dentry->d_op->d_manage( 980 path->dentry, false); 981 if (ret < 0) 982 return ret == -EISDIR ? 0 : ret; 983 } 984 985 /* Transit to a mounted filesystem. 
*/ 986 if (managed & DCACHE_MOUNTED) { 987 struct vfsmount *mounted = lookup_mnt(path); 988 if (!mounted) 989 break; 990 dput(path->dentry); 991 mntput(path->mnt); 992 path->mnt = mounted; 993 path->dentry = dget(mounted->mnt_root); 994 continue; 995 } 996 997 /* Don't handle automount points here */ 998 break; 999 } 1000 return 0; 1001 } 1002 1003 /* 1004 * Skip to top of mountpoint pile in refwalk mode for follow_dotdot() 1005 */ 1006 static void follow_mount(struct path *path) 1007 { 1008 while (d_mountpoint(path->dentry)) { 1009 struct vfsmount *mounted = lookup_mnt(path); 1010 if (!mounted) 1011 break; 1012 dput(path->dentry); 1013 mntput(path->mnt); 1014 path->mnt = mounted; 1015 path->dentry = dget(mounted->mnt_root); 1016 } 1017 } 1018 1019 static void follow_dotdot(struct nameidata *nd) 1020 { 1021 set_root(nd); 1022 1023 while(1) { 1024 struct dentry *old = nd->path.dentry; 1025 1026 if (nd->path.dentry == nd->root.dentry && 1027 nd->path.mnt == nd->root.mnt) { 1028 break; 1029 } 1030 if (nd->path.dentry != nd->path.mnt->mnt_root) { 1031 /* rare case of legitimate dget_parent()... */ 1032 nd->path.dentry = dget_parent(nd->path.dentry); 1033 dput(old); 1034 break; 1035 } 1036 if (!follow_up(&nd->path)) 1037 break; 1038 } 1039 follow_mount(&nd->path); 1040 nd->inode = nd->path.dentry->d_inode; 1041 } 1042 1043 /* 1044 * This looks up the name in dcache, possibly revalidates the old dentry and 1045 * allocates a new one if not found or not valid. In the need_lookup argument 1046 * returns whether i_op->lookup is necessary. 1047 * 1048 * dir->d_inode->i_mutex must be held 1049 */ 1050 static struct dentry *lookup_dcache(struct qstr *name, struct dentry *dir, 1051 struct nameidata *nd, bool *need_lookup) 1052 { 1053 struct dentry *dentry; 1054 int error; 1055 1056 *need_lookup = false; 1057 dentry = d_lookup(dir, name); 1058 if (dentry) { 1059 if (d_need_lookup(dentry)) { 1060 *need_lookup = true; 1061 } else if (dentry->d_flags & DCACHE_OP_REVALIDATE) { 1062 error = d_revalidate(dentry, nd); 1063 if (unlikely(error <= 0)) { 1064 if (error < 0) { 1065 dput(dentry); 1066 return ERR_PTR(error); 1067 } else if (!d_invalidate(dentry)) { 1068 dput(dentry); 1069 dentry = NULL; 1070 } 1071 } 1072 } 1073 } 1074 1075 if (!dentry) { 1076 dentry = d_alloc(dir, name); 1077 if (unlikely(!dentry)) 1078 return ERR_PTR(-ENOMEM); 1079 1080 *need_lookup = true; 1081 } 1082 return dentry; 1083 } 1084 1085 /* 1086 * Call i_op->lookup on the dentry. The dentry must be negative but may be 1087 * hashed if it was pouplated with DCACHE_NEED_LOOKUP. 1088 * 1089 * dir->d_inode->i_mutex must be held 1090 */ 1091 static struct dentry *lookup_real(struct inode *dir, struct dentry *dentry, 1092 struct nameidata *nd) 1093 { 1094 struct dentry *old; 1095 1096 /* Don't create child dentry for a dead directory. */ 1097 if (unlikely(IS_DEADDIR(dir))) { 1098 dput(dentry); 1099 return ERR_PTR(-ENOENT); 1100 } 1101 1102 old = dir->i_op->lookup(dir, dentry, nd); 1103 if (unlikely(old)) { 1104 dput(dentry); 1105 dentry = old; 1106 } 1107 return dentry; 1108 } 1109 1110 static struct dentry *__lookup_hash(struct qstr *name, 1111 struct dentry *base, struct nameidata *nd) 1112 { 1113 bool need_lookup; 1114 struct dentry *dentry; 1115 1116 dentry = lookup_dcache(name, base, nd, &need_lookup); 1117 if (!need_lookup) 1118 return dentry; 1119 1120 return lookup_real(base->d_inode, dentry, nd); 1121 } 1122 1123 /* 1124 * It's more convoluted than I'd like it to be, but... 
it's still fairly 1125 * small and for now I'd prefer to have fast path as straight as possible. 1126 * It _is_ time-critical. 1127 */ 1128 static int lookup_fast(struct nameidata *nd, struct qstr *name, 1129 struct path *path, struct inode **inode) 1130 { 1131 struct vfsmount *mnt = nd->path.mnt; 1132 struct dentry *dentry, *parent = nd->path.dentry; 1133 int need_reval = 1; 1134 int status = 1; 1135 int err; 1136 1137 /* 1138 * Rename seqlock is not required here because in the off chance 1139 * of a false negative due to a concurrent rename, we're going to 1140 * do the non-racy lookup, below. 1141 */ 1142 if (nd->flags & LOOKUP_RCU) { 1143 unsigned seq; 1144 dentry = __d_lookup_rcu(parent, name, &seq, nd->inode); 1145 if (!dentry) 1146 goto unlazy; 1147 1148 /* 1149 * This sequence count validates that the inode matches 1150 * the dentry name information from lookup. 1151 */ 1152 *inode = dentry->d_inode; 1153 if (read_seqcount_retry(&dentry->d_seq, seq)) 1154 return -ECHILD; 1155 1156 /* 1157 * This sequence count validates that the parent had no 1158 * changes while we did the lookup of the dentry above. 1159 * 1160 * The memory barrier in read_seqcount_begin of child is 1161 * enough, we can use __read_seqcount_retry here. 1162 */ 1163 if (__read_seqcount_retry(&parent->d_seq, nd->seq)) 1164 return -ECHILD; 1165 nd->seq = seq; 1166 1167 if (unlikely(d_need_lookup(dentry))) 1168 goto unlazy; 1169 if (unlikely(dentry->d_flags & DCACHE_OP_REVALIDATE)) { 1170 status = d_revalidate(dentry, nd); 1171 if (unlikely(status <= 0)) { 1172 if (status != -ECHILD) 1173 need_reval = 0; 1174 goto unlazy; 1175 } 1176 } 1177 path->mnt = mnt; 1178 path->dentry = dentry; 1179 if (unlikely(!__follow_mount_rcu(nd, path, inode))) 1180 goto unlazy; 1181 if (unlikely(path->dentry->d_flags & DCACHE_NEED_AUTOMOUNT)) 1182 goto unlazy; 1183 return 0; 1184 unlazy: 1185 if (unlazy_walk(nd, dentry)) 1186 return -ECHILD; 1187 } else { 1188 dentry = __d_lookup(parent, name); 1189 } 1190 1191 if (unlikely(!dentry)) 1192 goto need_lookup; 1193 1194 if (unlikely(d_need_lookup(dentry))) { 1195 dput(dentry); 1196 goto need_lookup; 1197 } 1198 1199 if (unlikely(dentry->d_flags & DCACHE_OP_REVALIDATE) && need_reval) 1200 status = d_revalidate(dentry, nd); 1201 if (unlikely(status <= 0)) { 1202 if (status < 0) { 1203 dput(dentry); 1204 return status; 1205 } 1206 if (!d_invalidate(dentry)) { 1207 dput(dentry); 1208 goto need_lookup; 1209 } 1210 } 1211 1212 path->mnt = mnt; 1213 path->dentry = dentry; 1214 err = follow_managed(path, nd->flags); 1215 if (unlikely(err < 0)) { 1216 path_put_conditional(path, nd); 1217 return err; 1218 } 1219 if (err) 1220 nd->flags |= LOOKUP_JUMPED; 1221 *inode = path->dentry->d_inode; 1222 return 0; 1223 1224 need_lookup: 1225 return 1; 1226 } 1227 1228 /* Fast lookup failed, do it the slow way */ 1229 static int lookup_slow(struct nameidata *nd, struct qstr *name, 1230 struct path *path) 1231 { 1232 struct dentry *dentry, *parent; 1233 int err; 1234 1235 parent = nd->path.dentry; 1236 BUG_ON(nd->inode != parent->d_inode); 1237 1238 mutex_lock(&parent->d_inode->i_mutex); 1239 dentry = __lookup_hash(name, parent, nd); 1240 mutex_unlock(&parent->d_inode->i_mutex); 1241 if (IS_ERR(dentry)) 1242 return PTR_ERR(dentry); 1243 path->mnt = nd->path.mnt; 1244 path->dentry = dentry; 1245 err = follow_managed(path, nd->flags); 1246 if (unlikely(err < 0)) { 1247 path_put_conditional(path, nd); 1248 return err; 1249 } 1250 if (err) 1251 nd->flags |= LOOKUP_JUMPED; 1252 return 0; 1253 } 1254 1255 static 
inline int may_lookup(struct nameidata *nd) 1256 { 1257 if (nd->flags & LOOKUP_RCU) { 1258 int err = inode_permission(nd->inode, MAY_EXEC|MAY_NOT_BLOCK); 1259 if (err != -ECHILD) 1260 return err; 1261 if (unlazy_walk(nd, NULL)) 1262 return -ECHILD; 1263 } 1264 return inode_permission(nd->inode, MAY_EXEC); 1265 } 1266 1267 static inline int handle_dots(struct nameidata *nd, int type) 1268 { 1269 if (type == LAST_DOTDOT) { 1270 if (nd->flags & LOOKUP_RCU) { 1271 if (follow_dotdot_rcu(nd)) 1272 return -ECHILD; 1273 } else 1274 follow_dotdot(nd); 1275 } 1276 return 0; 1277 } 1278 1279 static void terminate_walk(struct nameidata *nd) 1280 { 1281 if (!(nd->flags & LOOKUP_RCU)) { 1282 path_put(&nd->path); 1283 } else { 1284 nd->flags &= ~LOOKUP_RCU; 1285 if (!(nd->flags & LOOKUP_ROOT)) 1286 nd->root.mnt = NULL; 1287 rcu_read_unlock(); 1288 br_read_unlock(&vfsmount_lock); 1289 } 1290 } 1291 1292 /* 1293 * Do we need to follow links? We _really_ want to be able 1294 * to do this check without having to look at inode->i_op, 1295 * so we keep a cache of "no, this doesn't need follow_link" 1296 * for the common case. 1297 */ 1298 static inline int should_follow_link(struct inode *inode, int follow) 1299 { 1300 if (unlikely(!(inode->i_opflags & IOP_NOFOLLOW))) { 1301 if (likely(inode->i_op->follow_link)) 1302 return follow; 1303 1304 /* This gets set once for the inode lifetime */ 1305 spin_lock(&inode->i_lock); 1306 inode->i_opflags |= IOP_NOFOLLOW; 1307 spin_unlock(&inode->i_lock); 1308 } 1309 return 0; 1310 } 1311 1312 static inline int walk_component(struct nameidata *nd, struct path *path, 1313 struct qstr *name, int type, int follow) 1314 { 1315 struct inode *inode; 1316 int err; 1317 /* 1318 * "." and ".." are special - ".." especially so because it has 1319 * to be able to know about the current root directory and 1320 * parent relationships. 1321 */ 1322 if (unlikely(type != LAST_NORM)) 1323 return handle_dots(nd, type); 1324 err = lookup_fast(nd, name, path, &inode); 1325 if (unlikely(err)) { 1326 if (err < 0) 1327 goto out_err; 1328 1329 err = lookup_slow(nd, name, path); 1330 if (err < 0) 1331 goto out_err; 1332 1333 inode = path->dentry->d_inode; 1334 } 1335 err = -ENOENT; 1336 if (!inode) 1337 goto out_path_put; 1338 1339 if (should_follow_link(inode, follow)) { 1340 if (nd->flags & LOOKUP_RCU) { 1341 if (unlikely(unlazy_walk(nd, path->dentry))) { 1342 err = -ECHILD; 1343 goto out_err; 1344 } 1345 } 1346 BUG_ON(inode != path->dentry->d_inode); 1347 return 1; 1348 } 1349 path_to_nameidata(path, nd); 1350 nd->inode = inode; 1351 return 0; 1352 1353 out_path_put: 1354 path_to_nameidata(path, nd); 1355 out_err: 1356 terminate_walk(nd); 1357 return err; 1358 } 1359 1360 /* 1361 * This limits recursive symlink follows to 8, while 1362 * limiting consecutive symlinks to 40. 1363 * 1364 * Without that kind of total limit, nasty chains of consecutive 1365 * symlinks can cause almost arbitrarily long lookups. 
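 *
 * Concretely: current->total_link_count counts every symlink followed
 * anywhere in one lookup and is capped at 40 in follow_link(), while
 * current->link_count and nd->depth track how deeply symlink resolution
 * is nested inside link_path_walk() and are capped at MAX_NESTED_LINKS
 * here.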
 */
static inline int nested_symlink(struct path *path, struct nameidata *nd)
{
	int res;

	if (unlikely(current->link_count >= MAX_NESTED_LINKS)) {
		path_put_conditional(path, nd);
		path_put(&nd->path);
		return -ELOOP;
	}
	BUG_ON(nd->depth >= MAX_NESTED_LINKS);

	nd->depth++;
	current->link_count++;

	do {
		struct path link = *path;
		void *cookie;

		res = follow_link(&link, nd, &cookie);
		if (!res)
			res = walk_component(nd, path, &nd->last,
					     nd->last_type, LOOKUP_FOLLOW);
		put_link(nd, &link, cookie);
	} while (res > 0);

	current->link_count--;
	nd->depth--;
	return res;
}

/*
 * We really don't want to look at inode->i_op->lookup
 * when we don't have to. So we keep a cache bit in
 * the inode ->i_opflags field that says "yes, we can
 * do lookup on this inode".
 */
static inline int can_lookup(struct inode *inode)
{
	if (likely(inode->i_opflags & IOP_LOOKUP))
		return 1;
	if (likely(!inode->i_op->lookup))
		return 0;

	/* We do this once for the lifetime of the inode */
	spin_lock(&inode->i_lock);
	inode->i_opflags |= IOP_LOOKUP;
	spin_unlock(&inode->i_lock);
	return 1;
}

/*
 * We can do the critical dentry name comparison and hashing
 * operations one word at a time, but we are limited to:
 *
 * - Architectures with fast unaligned word accesses. We could
 *   do a "get_unaligned()" if this helps and is sufficiently
 *   fast.
 *
 * - Little-endian machines (so that we can generate the mask
 *   of low bytes efficiently). Again, we *could* do a byte
 *   swapping load on big-endian architectures if that is not
 *   expensive enough to make the optimization worthless.
 *
 * - non-CONFIG_DEBUG_PAGEALLOC configurations (so that we
 *   do not trap on the (extremely unlikely) case of a page
 *   crossing operation).
 *
 * - Furthermore, we need an efficient 64-bit compile for the
 *   64-bit case in order to generate the "number of bytes in
 *   the final mask". Again, that could be replaced with an
 *   efficient population count instruction or similar.
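 *
 * As a concrete example of the little-endian mask trick: for a 3-byte
 * tail on a 64-bit machine, full_name_hash() below computes
 * mask = ~(~0ul << 3*8) == 0x0000000000ffffff, so only the three bytes
 * that really belong to the name contribute to the hash.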
1438 */ 1439 #ifdef CONFIG_DCACHE_WORD_ACCESS 1440 1441 #include <asm/word-at-a-time.h> 1442 1443 #ifdef CONFIG_64BIT 1444 1445 static inline unsigned int fold_hash(unsigned long hash) 1446 { 1447 hash += hash >> (8*sizeof(int)); 1448 return hash; 1449 } 1450 1451 #else /* 32-bit case */ 1452 1453 #define fold_hash(x) (x) 1454 1455 #endif 1456 1457 unsigned int full_name_hash(const unsigned char *name, unsigned int len) 1458 { 1459 unsigned long a, mask; 1460 unsigned long hash = 0; 1461 1462 for (;;) { 1463 a = load_unaligned_zeropad(name); 1464 if (len < sizeof(unsigned long)) 1465 break; 1466 hash += a; 1467 hash *= 9; 1468 name += sizeof(unsigned long); 1469 len -= sizeof(unsigned long); 1470 if (!len) 1471 goto done; 1472 } 1473 mask = ~(~0ul << len*8); 1474 hash += mask & a; 1475 done: 1476 return fold_hash(hash); 1477 } 1478 EXPORT_SYMBOL(full_name_hash); 1479 1480 /* 1481 * Calculate the length and hash of the path component, and 1482 * return the length of the component; 1483 */ 1484 static inline unsigned long hash_name(const char *name, unsigned int *hashp) 1485 { 1486 unsigned long a, b, adata, bdata, mask, hash, len; 1487 const struct word_at_a_time constants = WORD_AT_A_TIME_CONSTANTS; 1488 1489 hash = a = 0; 1490 len = -sizeof(unsigned long); 1491 do { 1492 hash = (hash + a) * 9; 1493 len += sizeof(unsigned long); 1494 a = load_unaligned_zeropad(name+len); 1495 b = a ^ REPEAT_BYTE('/'); 1496 } while (!(has_zero(a, &adata, &constants) | has_zero(b, &bdata, &constants))); 1497 1498 adata = prep_zero_mask(a, adata, &constants); 1499 bdata = prep_zero_mask(b, bdata, &constants); 1500 1501 mask = create_zero_mask(adata | bdata); 1502 1503 hash += a & zero_bytemask(mask); 1504 *hashp = fold_hash(hash); 1505 1506 return len + find_zero(mask); 1507 } 1508 1509 #else 1510 1511 unsigned int full_name_hash(const unsigned char *name, unsigned int len) 1512 { 1513 unsigned long hash = init_name_hash(); 1514 while (len--) 1515 hash = partial_name_hash(*name++, hash); 1516 return end_name_hash(hash); 1517 } 1518 EXPORT_SYMBOL(full_name_hash); 1519 1520 /* 1521 * We know there's a real path component here of at least 1522 * one character. 1523 */ 1524 static inline unsigned long hash_name(const char *name, unsigned int *hashp) 1525 { 1526 unsigned long hash = init_name_hash(); 1527 unsigned long len = 0, c; 1528 1529 c = (unsigned char)*name; 1530 do { 1531 len++; 1532 hash = partial_name_hash(c, hash); 1533 c = (unsigned char)name[len]; 1534 } while (c && c != '/'); 1535 *hashp = end_name_hash(hash); 1536 return len; 1537 } 1538 1539 #endif 1540 1541 /* 1542 * Name resolution. 1543 * This is the basic name resolution function, turning a pathname into 1544 * the final dentry. We expect 'base' to be positive and a directory. 1545 * 1546 * Returns 0 and nd will have valid dentry and mnt on success. 1547 * Returns error and drops reference to input namei data on failure. 1548 */ 1549 static int link_path_walk(const char *name, struct nameidata *nd) 1550 { 1551 struct path next; 1552 int err; 1553 1554 while (*name=='/') 1555 name++; 1556 if (!*name) 1557 return 0; 1558 1559 /* At this point we know we have a real path component. 
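 *
 * For a name like "a//b/./c" each iteration below picks off one component:
 * hash_name() measures up to the next '/' or NUL, the do/while loop skips
 * any run of duplicate slashes, and "." and ".." are tagged LAST_DOT and
 * LAST_DOTDOT so handle_dots() deals with them instead of a real lookup.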
*/ 1560 for(;;) { 1561 struct qstr this; 1562 long len; 1563 int type; 1564 1565 err = may_lookup(nd); 1566 if (err) 1567 break; 1568 1569 len = hash_name(name, &this.hash); 1570 this.name = name; 1571 this.len = len; 1572 1573 type = LAST_NORM; 1574 if (name[0] == '.') switch (len) { 1575 case 2: 1576 if (name[1] == '.') { 1577 type = LAST_DOTDOT; 1578 nd->flags |= LOOKUP_JUMPED; 1579 } 1580 break; 1581 case 1: 1582 type = LAST_DOT; 1583 } 1584 if (likely(type == LAST_NORM)) { 1585 struct dentry *parent = nd->path.dentry; 1586 nd->flags &= ~LOOKUP_JUMPED; 1587 if (unlikely(parent->d_flags & DCACHE_OP_HASH)) { 1588 err = parent->d_op->d_hash(parent, nd->inode, 1589 &this); 1590 if (err < 0) 1591 break; 1592 } 1593 } 1594 1595 if (!name[len]) 1596 goto last_component; 1597 /* 1598 * If it wasn't NUL, we know it was '/'. Skip that 1599 * slash, and continue until no more slashes. 1600 */ 1601 do { 1602 len++; 1603 } while (unlikely(name[len] == '/')); 1604 if (!name[len]) 1605 goto last_component; 1606 name += len; 1607 1608 err = walk_component(nd, &next, &this, type, LOOKUP_FOLLOW); 1609 if (err < 0) 1610 return err; 1611 1612 if (err) { 1613 err = nested_symlink(&next, nd); 1614 if (err) 1615 return err; 1616 } 1617 if (can_lookup(nd->inode)) 1618 continue; 1619 err = -ENOTDIR; 1620 break; 1621 /* here ends the main loop */ 1622 1623 last_component: 1624 nd->last = this; 1625 nd->last_type = type; 1626 return 0; 1627 } 1628 terminate_walk(nd); 1629 return err; 1630 } 1631 1632 static int path_init(int dfd, const char *name, unsigned int flags, 1633 struct nameidata *nd, struct file **fp) 1634 { 1635 int retval = 0; 1636 int fput_needed; 1637 struct file *file; 1638 1639 nd->last_type = LAST_ROOT; /* if there are only slashes... */ 1640 nd->flags = flags | LOOKUP_JUMPED; 1641 nd->depth = 0; 1642 if (flags & LOOKUP_ROOT) { 1643 struct inode *inode = nd->root.dentry->d_inode; 1644 if (*name) { 1645 if (!inode->i_op->lookup) 1646 return -ENOTDIR; 1647 retval = inode_permission(inode, MAY_EXEC); 1648 if (retval) 1649 return retval; 1650 } 1651 nd->path = nd->root; 1652 nd->inode = inode; 1653 if (flags & LOOKUP_RCU) { 1654 br_read_lock(&vfsmount_lock); 1655 rcu_read_lock(); 1656 nd->seq = __read_seqcount_begin(&nd->path.dentry->d_seq); 1657 } else { 1658 path_get(&nd->path); 1659 } 1660 return 0; 1661 } 1662 1663 nd->root.mnt = NULL; 1664 1665 if (*name=='/') { 1666 if (flags & LOOKUP_RCU) { 1667 br_read_lock(&vfsmount_lock); 1668 rcu_read_lock(); 1669 set_root_rcu(nd); 1670 } else { 1671 set_root(nd); 1672 path_get(&nd->root); 1673 } 1674 nd->path = nd->root; 1675 } else if (dfd == AT_FDCWD) { 1676 if (flags & LOOKUP_RCU) { 1677 struct fs_struct *fs = current->fs; 1678 unsigned seq; 1679 1680 br_read_lock(&vfsmount_lock); 1681 rcu_read_lock(); 1682 1683 do { 1684 seq = read_seqcount_begin(&fs->seq); 1685 nd->path = fs->pwd; 1686 nd->seq = __read_seqcount_begin(&nd->path.dentry->d_seq); 1687 } while (read_seqcount_retry(&fs->seq, seq)); 1688 } else { 1689 get_fs_pwd(current->fs, &nd->path); 1690 } 1691 } else { 1692 struct dentry *dentry; 1693 1694 file = fget_raw_light(dfd, &fput_needed); 1695 retval = -EBADF; 1696 if (!file) 1697 goto out_fail; 1698 1699 dentry = file->f_path.dentry; 1700 1701 if (*name) { 1702 retval = -ENOTDIR; 1703 if (!S_ISDIR(dentry->d_inode->i_mode)) 1704 goto fput_fail; 1705 1706 retval = inode_permission(dentry->d_inode, MAY_EXEC); 1707 if (retval) 1708 goto fput_fail; 1709 } 1710 1711 nd->path = file->f_path; 1712 if (flags & LOOKUP_RCU) { 1713 if (fput_needed) 1714 
*fp = file; 1715 nd->seq = __read_seqcount_begin(&nd->path.dentry->d_seq); 1716 br_read_lock(&vfsmount_lock); 1717 rcu_read_lock(); 1718 } else { 1719 path_get(&file->f_path); 1720 fput_light(file, fput_needed); 1721 } 1722 } 1723 1724 nd->inode = nd->path.dentry->d_inode; 1725 return 0; 1726 1727 fput_fail: 1728 fput_light(file, fput_needed); 1729 out_fail: 1730 return retval; 1731 } 1732 1733 static inline int lookup_last(struct nameidata *nd, struct path *path) 1734 { 1735 if (nd->last_type == LAST_NORM && nd->last.name[nd->last.len]) 1736 nd->flags |= LOOKUP_FOLLOW | LOOKUP_DIRECTORY; 1737 1738 nd->flags &= ~LOOKUP_PARENT; 1739 return walk_component(nd, path, &nd->last, nd->last_type, 1740 nd->flags & LOOKUP_FOLLOW); 1741 } 1742 1743 /* Returns 0 and nd will be valid on success; Retuns error, otherwise. */ 1744 static int path_lookupat(int dfd, const char *name, 1745 unsigned int flags, struct nameidata *nd) 1746 { 1747 struct file *base = NULL; 1748 struct path path; 1749 int err; 1750 1751 /* 1752 * Path walking is largely split up into 2 different synchronisation 1753 * schemes, rcu-walk and ref-walk (explained in 1754 * Documentation/filesystems/path-lookup.txt). These share much of the 1755 * path walk code, but some things particularly setup, cleanup, and 1756 * following mounts are sufficiently divergent that functions are 1757 * duplicated. Typically there is a function foo(), and its RCU 1758 * analogue, foo_rcu(). 1759 * 1760 * -ECHILD is the error number of choice (just to avoid clashes) that 1761 * is returned if some aspect of an rcu-walk fails. Such an error must 1762 * be handled by restarting a traditional ref-walk (which will always 1763 * be able to complete). 1764 */ 1765 err = path_init(dfd, name, flags | LOOKUP_PARENT, nd, &base); 1766 1767 if (unlikely(err)) 1768 return err; 1769 1770 current->total_link_count = 0; 1771 err = link_path_walk(name, nd); 1772 1773 if (!err && !(flags & LOOKUP_PARENT)) { 1774 err = lookup_last(nd, &path); 1775 while (err > 0) { 1776 void *cookie; 1777 struct path link = path; 1778 nd->flags |= LOOKUP_PARENT; 1779 err = follow_link(&link, nd, &cookie); 1780 if (!err) 1781 err = lookup_last(nd, &path); 1782 put_link(nd, &link, cookie); 1783 } 1784 } 1785 1786 if (!err) 1787 err = complete_walk(nd); 1788 1789 if (!err && nd->flags & LOOKUP_DIRECTORY) { 1790 if (!nd->inode->i_op->lookup) { 1791 path_put(&nd->path); 1792 err = -ENOTDIR; 1793 } 1794 } 1795 1796 if (base) 1797 fput(base); 1798 1799 if (nd->root.mnt && !(nd->flags & LOOKUP_ROOT)) { 1800 path_put(&nd->root); 1801 nd->root.mnt = NULL; 1802 } 1803 return err; 1804 } 1805 1806 static int do_path_lookup(int dfd, const char *name, 1807 unsigned int flags, struct nameidata *nd) 1808 { 1809 int retval = path_lookupat(dfd, name, flags | LOOKUP_RCU, nd); 1810 if (unlikely(retval == -ECHILD)) 1811 retval = path_lookupat(dfd, name, flags, nd); 1812 if (unlikely(retval == -ESTALE)) 1813 retval = path_lookupat(dfd, name, flags | LOOKUP_REVAL, nd); 1814 1815 if (likely(!retval)) { 1816 if (unlikely(!audit_dummy_context())) { 1817 if (nd->path.dentry && nd->inode) 1818 audit_inode(name, nd->path.dentry); 1819 } 1820 } 1821 return retval; 1822 } 1823 1824 int kern_path_parent(const char *name, struct nameidata *nd) 1825 { 1826 return do_path_lookup(AT_FDCWD, name, LOOKUP_PARENT, nd); 1827 } 1828 1829 int kern_path(const char *name, unsigned int flags, struct path *path) 1830 { 1831 struct nameidata nd; 1832 int res = do_path_lookup(AT_FDCWD, name, flags, &nd); 1833 if (!res) 1834 *path = 
nd.path; 1835 return res; 1836 } 1837 1838 /** 1839 * vfs_path_lookup - lookup a file path relative to a dentry-vfsmount pair 1840 * @dentry: pointer to dentry of the base directory 1841 * @mnt: pointer to vfs mount of the base directory 1842 * @name: pointer to file name 1843 * @flags: lookup flags 1844 * @path: pointer to struct path to fill 1845 */ 1846 int vfs_path_lookup(struct dentry *dentry, struct vfsmount *mnt, 1847 const char *name, unsigned int flags, 1848 struct path *path) 1849 { 1850 struct nameidata nd; 1851 int err; 1852 nd.root.dentry = dentry; 1853 nd.root.mnt = mnt; 1854 BUG_ON(flags & LOOKUP_PARENT); 1855 /* the first argument of do_path_lookup() is ignored with LOOKUP_ROOT */ 1856 err = do_path_lookup(AT_FDCWD, name, flags | LOOKUP_ROOT, &nd); 1857 if (!err) 1858 *path = nd.path; 1859 return err; 1860 } 1861 1862 /* 1863 * Restricted form of lookup. Doesn't follow links, single-component only, 1864 * needs parent already locked. Doesn't follow mounts. 1865 * SMP-safe. 1866 */ 1867 static struct dentry *lookup_hash(struct nameidata *nd) 1868 { 1869 return __lookup_hash(&nd->last, nd->path.dentry, nd); 1870 } 1871 1872 /** 1873 * lookup_one_len - filesystem helper to lookup single pathname component 1874 * @name: pathname component to lookup 1875 * @base: base directory to lookup from 1876 * @len: maximum length @len should be interpreted to 1877 * 1878 * Note that this routine is purely a helper for filesystem usage and should 1879 * not be called by generic code. Also note that by using this function the 1880 * nameidata argument is passed to the filesystem methods and a filesystem 1881 * using this helper needs to be prepared for that. 1882 */ 1883 struct dentry *lookup_one_len(const char *name, struct dentry *base, int len) 1884 { 1885 struct qstr this; 1886 unsigned int c; 1887 int err; 1888 1889 WARN_ON_ONCE(!mutex_is_locked(&base->d_inode->i_mutex)); 1890 1891 this.name = name; 1892 this.len = len; 1893 this.hash = full_name_hash(name, len); 1894 if (!len) 1895 return ERR_PTR(-EACCES); 1896 1897 while (len--) { 1898 c = *(const unsigned char *)name++; 1899 if (c == '/' || c == '\0') 1900 return ERR_PTR(-EACCES); 1901 } 1902 /* 1903 * See if the low-level filesystem might want 1904 * to use its own hash.. 
1905 */ 1906 if (base->d_flags & DCACHE_OP_HASH) { 1907 int err = base->d_op->d_hash(base, base->d_inode, &this); 1908 if (err < 0) 1909 return ERR_PTR(err); 1910 } 1911 1912 err = inode_permission(base->d_inode, MAY_EXEC); 1913 if (err) 1914 return ERR_PTR(err); 1915 1916 return __lookup_hash(&this, base, NULL); 1917 } 1918 1919 int user_path_at_empty(int dfd, const char __user *name, unsigned flags, 1920 struct path *path, int *empty) 1921 { 1922 struct nameidata nd; 1923 char *tmp = getname_flags(name, flags, empty); 1924 int err = PTR_ERR(tmp); 1925 if (!IS_ERR(tmp)) { 1926 1927 BUG_ON(flags & LOOKUP_PARENT); 1928 1929 err = do_path_lookup(dfd, tmp, flags, &nd); 1930 putname(tmp); 1931 if (!err) 1932 *path = nd.path; 1933 } 1934 return err; 1935 } 1936 1937 int user_path_at(int dfd, const char __user *name, unsigned flags, 1938 struct path *path) 1939 { 1940 return user_path_at_empty(dfd, name, flags, path, NULL); 1941 } 1942 1943 static int user_path_parent(int dfd, const char __user *path, 1944 struct nameidata *nd, char **name) 1945 { 1946 char *s = getname(path); 1947 int error; 1948 1949 if (IS_ERR(s)) 1950 return PTR_ERR(s); 1951 1952 error = do_path_lookup(dfd, s, LOOKUP_PARENT, nd); 1953 if (error) 1954 putname(s); 1955 else 1956 *name = s; 1957 1958 return error; 1959 } 1960 1961 /* 1962 * It's inline, so penalty for filesystems that don't use sticky bit is 1963 * minimal. 1964 */ 1965 static inline int check_sticky(struct inode *dir, struct inode *inode) 1966 { 1967 kuid_t fsuid = current_fsuid(); 1968 1969 if (!(dir->i_mode & S_ISVTX)) 1970 return 0; 1971 if (uid_eq(inode->i_uid, fsuid)) 1972 return 0; 1973 if (uid_eq(dir->i_uid, fsuid)) 1974 return 0; 1975 return !inode_capable(inode, CAP_FOWNER); 1976 } 1977 1978 /* 1979 * Check whether we can remove a link victim from directory dir, check 1980 * whether the type of victim is right. 1981 * 1. We can't do it if dir is read-only (done in permission()) 1982 * 2. We should have write and exec permissions on dir 1983 * 3. We can't remove anything from append-only dir 1984 * 4. We can't do anything with immutable dir (done in permission()) 1985 * 5. If the sticky bit on dir is set we should either 1986 * a. be owner of dir, or 1987 * b. be owner of victim, or 1988 * c. have CAP_FOWNER capability 1989 * 6. If the victim is append-only or immutable we can't do antyhing with 1990 * links pointing to it. 1991 * 7. If we were asked to remove a directory and victim isn't one - ENOTDIR. 1992 * 8. If we were asked to remove a non-directory and victim isn't one - EISDIR. 1993 * 9. We can't remove a root or mountpoint. 1994 * 10. We don't allow removal of NFS sillyrenamed files; it's handled by 1995 * nfs_async_unlink(). 
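 *
 * As an example of rule 5: unlinking a file you do not own from a
 * mode-01777 directory such as /tmp only succeeds if your fsuid owns the
 * directory or the file, or you pass the CAP_FOWNER check in
 * check_sticky() above.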
1996 */ 1997 static int may_delete(struct inode *dir,struct dentry *victim,int isdir) 1998 { 1999 int error; 2000 2001 if (!victim->d_inode) 2002 return -ENOENT; 2003 2004 BUG_ON(victim->d_parent->d_inode != dir); 2005 audit_inode_child(victim, dir); 2006 2007 error = inode_permission(dir, MAY_WRITE | MAY_EXEC); 2008 if (error) 2009 return error; 2010 if (IS_APPEND(dir)) 2011 return -EPERM; 2012 if (check_sticky(dir, victim->d_inode)||IS_APPEND(victim->d_inode)|| 2013 IS_IMMUTABLE(victim->d_inode) || IS_SWAPFILE(victim->d_inode)) 2014 return -EPERM; 2015 if (isdir) { 2016 if (!S_ISDIR(victim->d_inode->i_mode)) 2017 return -ENOTDIR; 2018 if (IS_ROOT(victim)) 2019 return -EBUSY; 2020 } else if (S_ISDIR(victim->d_inode->i_mode)) 2021 return -EISDIR; 2022 if (IS_DEADDIR(dir)) 2023 return -ENOENT; 2024 if (victim->d_flags & DCACHE_NFSFS_RENAMED) 2025 return -EBUSY; 2026 return 0; 2027 } 2028 2029 /* Check whether we can create an object with dentry child in directory 2030 * dir. 2031 * 1. We can't do it if child already exists (open has special treatment for 2032 * this case, but since we are inlined it's OK) 2033 * 2. We can't do it if dir is read-only (done in permission()) 2034 * 3. We should have write and exec permissions on dir 2035 * 4. We can't do it if dir is immutable (done in permission()) 2036 */ 2037 static inline int may_create(struct inode *dir, struct dentry *child) 2038 { 2039 if (child->d_inode) 2040 return -EEXIST; 2041 if (IS_DEADDIR(dir)) 2042 return -ENOENT; 2043 return inode_permission(dir, MAY_WRITE | MAY_EXEC); 2044 } 2045 2046 /* 2047 * p1 and p2 should be directories on the same fs. 2048 */ 2049 struct dentry *lock_rename(struct dentry *p1, struct dentry *p2) 2050 { 2051 struct dentry *p; 2052 2053 if (p1 == p2) { 2054 mutex_lock_nested(&p1->d_inode->i_mutex, I_MUTEX_PARENT); 2055 return NULL; 2056 } 2057 2058 mutex_lock(&p1->d_inode->i_sb->s_vfs_rename_mutex); 2059 2060 p = d_ancestor(p2, p1); 2061 if (p) { 2062 mutex_lock_nested(&p2->d_inode->i_mutex, I_MUTEX_PARENT); 2063 mutex_lock_nested(&p1->d_inode->i_mutex, I_MUTEX_CHILD); 2064 return p; 2065 } 2066 2067 p = d_ancestor(p1, p2); 2068 if (p) { 2069 mutex_lock_nested(&p1->d_inode->i_mutex, I_MUTEX_PARENT); 2070 mutex_lock_nested(&p2->d_inode->i_mutex, I_MUTEX_CHILD); 2071 return p; 2072 } 2073 2074 mutex_lock_nested(&p1->d_inode->i_mutex, I_MUTEX_PARENT); 2075 mutex_lock_nested(&p2->d_inode->i_mutex, I_MUTEX_CHILD); 2076 return NULL; 2077 } 2078 2079 void unlock_rename(struct dentry *p1, struct dentry *p2) 2080 { 2081 mutex_unlock(&p1->d_inode->i_mutex); 2082 if (p1 != p2) { 2083 mutex_unlock(&p2->d_inode->i_mutex); 2084 mutex_unlock(&p1->d_inode->i_sb->s_vfs_rename_mutex); 2085 } 2086 } 2087 2088 int vfs_create(struct inode *dir, struct dentry *dentry, umode_t mode, 2089 struct nameidata *nd) 2090 { 2091 int error = may_create(dir, dentry); 2092 2093 if (error) 2094 return error; 2095 2096 if (!dir->i_op->create) 2097 return -EACCES; /* shouldn't it be ENOSYS? */ 2098 mode &= S_IALLUGO; 2099 mode |= S_IFREG; 2100 error = security_inode_create(dir, dentry, mode); 2101 if (error) 2102 return error; 2103 error = dir->i_op->create(dir, dentry, mode, nd); 2104 if (!error) 2105 fsnotify_create(dir, dentry); 2106 return error; 2107 } 2108 2109 static int may_open(struct path *path, int acc_mode, int flag) 2110 { 2111 struct dentry *dentry = path->dentry; 2112 struct inode *inode = dentry->d_inode; 2113 int error; 2114 2115 /* O_PATH? 
	if (!acc_mode)
		return 0;

	if (!inode)
		return -ENOENT;

	switch (inode->i_mode & S_IFMT) {
	case S_IFLNK:
		return -ELOOP;
	case S_IFDIR:
		if (acc_mode & MAY_WRITE)
			return -EISDIR;
		break;
	case S_IFBLK:
	case S_IFCHR:
		if (path->mnt->mnt_flags & MNT_NODEV)
			return -EACCES;
		/*FALLTHRU*/
	case S_IFIFO:
	case S_IFSOCK:
		flag &= ~O_TRUNC;
		break;
	}

	error = inode_permission(inode, acc_mode);
	if (error)
		return error;

	/*
	 * An append-only file must be opened in append mode for writing.
	 */
	if (IS_APPEND(inode)) {
		if ((flag & O_ACCMODE) != O_RDONLY && !(flag & O_APPEND))
			return -EPERM;
		if (flag & O_TRUNC)
			return -EPERM;
	}

	/* O_NOATIME can only be set by the owner or superuser */
	if (flag & O_NOATIME && !inode_owner_or_capable(inode))
		return -EPERM;

	return 0;
}

static int handle_truncate(struct file *filp)
{
	struct path *path = &filp->f_path;
	struct inode *inode = path->dentry->d_inode;
	int error = get_write_access(inode);
	if (error)
		return error;
	/*
	 * Refuse to truncate files with mandatory locks held on them.
	 */
	error = locks_verify_locked(inode);
	if (!error)
		error = security_path_truncate(path);
	if (!error) {
		error = do_truncate(path->dentry, 0,
				    ATTR_MTIME|ATTR_CTIME|ATTR_OPEN,
				    filp);
	}
	put_write_access(inode);
	return error;
}

static inline int open_to_namei_flags(int flag)
{
	if ((flag & O_ACCMODE) == 3)
		flag--;
	return flag;
}

/*
 * Handle the last step of open()
 */
static struct file *do_last(struct nameidata *nd, struct path *path,
			    const struct open_flags *op, const char *pathname)
{
	struct dentry *dir = nd->path.dentry;
	struct dentry *dentry;
	int open_flag = op->open_flag;
	int will_truncate = open_flag & O_TRUNC;
	int want_write = 0;
	int acc_mode = op->acc_mode;
	struct file *filp;
	struct inode *inode;
	int symlink_ok = 0;
	struct path save_parent = { .dentry = NULL, .mnt = NULL };
	bool retried = false;
	int error;

	nd->flags &= ~LOOKUP_PARENT;
	nd->flags |= op->intent;

	switch (nd->last_type) {
	case LAST_DOTDOT:
	case LAST_DOT:
		error = handle_dots(nd, nd->last_type);
		if (error)
			return ERR_PTR(error);
		/* fallthrough */
	case LAST_ROOT:
		error = complete_walk(nd);
		if (error)
			return ERR_PTR(error);
		audit_inode(pathname, nd->path.dentry);
		if (open_flag & O_CREAT) {
			error = -EISDIR;
			goto exit;
		}
		goto ok;
	case LAST_BIND:
		error = complete_walk(nd);
		if (error)
			return ERR_PTR(error);
		audit_inode(pathname, dir);
		goto ok;
	}

	if (!(open_flag & O_CREAT)) {
		if (nd->last.name[nd->last.len])
			nd->flags |= LOOKUP_FOLLOW | LOOKUP_DIRECTORY;
		if (open_flag & O_PATH && !(nd->flags & LOOKUP_FOLLOW))
			symlink_ok = 1;
		/* we _can_ be in RCU mode here */
		error = lookup_fast(nd, &nd->last, path, &inode);
		if (unlikely(error)) {
			if (error < 0)
				goto exit;

			error = lookup_slow(nd, &nd->last, path);
			if (error < 0)
				goto exit;

			inode = path->dentry->d_inode;
		}
		goto finish_lookup;
	}

	/* create side of things */
	/*
	 * This will *only* deal with leaving RCU mode - LOOKUP_JUMPED has been
	 * cleared when we got to the last component we are about to look up
	 */
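	/*
	 * (That is: for the O_CREAT case we commit to ref-walk here; the
	 * work below - taking the parent's ->i_mutex, lookup_hash(),
	 * vfs_create() - cannot be done under rcu-walk.)
	 */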
	error = complete_walk(nd);
	if (error)
		return ERR_PTR(error);

	audit_inode(pathname, dir);
	error = -EISDIR;
	/* trailing slashes? */
	if (nd->last.name[nd->last.len])
		goto exit;

retry_lookup:
	mutex_lock(&dir->d_inode->i_mutex);

	dentry = lookup_hash(nd);
	error = PTR_ERR(dentry);
	if (IS_ERR(dentry)) {
		mutex_unlock(&dir->d_inode->i_mutex);
		goto exit;
	}

	path->dentry = dentry;
	path->mnt = nd->path.mnt;

	/* Negative dentry, just create the file */
	if (!dentry->d_inode) {
		umode_t mode = op->mode;
		if (!IS_POSIXACL(dir->d_inode))
			mode &= ~current_umask();
		/*
		 * This write is needed to ensure that a
		 * rw->ro transition does not occur between
		 * the time when the file is created and when
		 * a permanent write count is taken through
		 * the 'struct file' in nameidata_to_filp().
		 */
		error = mnt_want_write(nd->path.mnt);
		if (error)
			goto exit_mutex_unlock;
		want_write = 1;
		/* Don't check for write permission, don't truncate */
		open_flag &= ~O_TRUNC;
		will_truncate = 0;
		acc_mode = MAY_OPEN;
		error = security_path_mknod(&nd->path, dentry, mode, 0);
		if (error)
			goto exit_mutex_unlock;
		error = vfs_create(dir->d_inode, dentry, mode, nd);
		if (error)
			goto exit_mutex_unlock;
		mutex_unlock(&dir->d_inode->i_mutex);
		dput(nd->path.dentry);
		nd->path.dentry = dentry;
		goto common;
	}

	/*
	 * It already exists.
	 */
	mutex_unlock(&dir->d_inode->i_mutex);
	audit_inode(pathname, path->dentry);

	error = -EEXIST;
	if (open_flag & O_EXCL)
		goto exit_dput;

	error = follow_managed(path, nd->flags);
	if (error < 0)
		goto exit_dput;

	if (error)
		nd->flags |= LOOKUP_JUMPED;

	BUG_ON(nd->flags & LOOKUP_RCU);
	inode = path->dentry->d_inode;
finish_lookup:
	/* we _can_ be in RCU mode here */
	error = -ENOENT;
	if (!inode) {
		path_to_nameidata(path, nd);
		goto exit;
	}

	if (should_follow_link(inode, !symlink_ok)) {
		if (nd->flags & LOOKUP_RCU) {
			if (unlikely(unlazy_walk(nd, path->dentry))) {
				error = -ECHILD;
				goto exit;
			}
		}
		BUG_ON(inode != path->dentry->d_inode);
		return NULL;
	}

	if ((nd->flags & LOOKUP_RCU) || nd->path.mnt != path->mnt) {
		path_to_nameidata(path, nd);
	} else {
		save_parent.dentry = nd->path.dentry;
		save_parent.mnt = mntget(path->mnt);
		nd->path.dentry = path->dentry;
	}
	nd->inode = inode;
	/* Why this, you ask?  _Now_ we might have grown LOOKUP_JUMPED... */
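	/*
	 * (follow_managed() above sets LOOKUP_JUMPED when the final component
	 * turned out to be a mountpoint or otherwise managed dentry, so the
	 * walk has to be completed and re-validated before we open anything.)
	 */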
	error = complete_walk(nd);
	if (error) {
		path_put(&save_parent);
		return ERR_PTR(error);
	}
	error = -EISDIR;
	if ((open_flag & O_CREAT) && S_ISDIR(nd->inode->i_mode))
		goto exit;
	error = -ENOTDIR;
	if ((nd->flags & LOOKUP_DIRECTORY) && !nd->inode->i_op->lookup)
		goto exit;
	audit_inode(pathname, nd->path.dentry);
ok:
	if (!S_ISREG(nd->inode->i_mode))
		will_truncate = 0;

	if (will_truncate) {
		error = mnt_want_write(nd->path.mnt);
		if (error)
			goto exit;
		want_write = 1;
	}
common:
	error = may_open(&nd->path, acc_mode, open_flag);
	if (error)
		goto exit;
	filp = nameidata_to_filp(nd);
	if (filp == ERR_PTR(-EOPENSTALE) && save_parent.dentry && !retried) {
		BUG_ON(save_parent.dentry != dir);
		path_put(&nd->path);
		nd->path = save_parent;
		nd->inode = dir->d_inode;
		save_parent.mnt = NULL;
		save_parent.dentry = NULL;
		if (want_write) {
			mnt_drop_write(nd->path.mnt);
			want_write = 0;
		}
		retried = true;
		goto retry_lookup;
	}
	if (!IS_ERR(filp)) {
		error = ima_file_check(filp, op->acc_mode);
		if (error) {
			fput(filp);
			filp = ERR_PTR(error);
		}
	}
	if (!IS_ERR(filp)) {
		if (will_truncate) {
			error = handle_truncate(filp);
			if (error) {
				fput(filp);
				filp = ERR_PTR(error);
			}
		}
	}
out:
	if (want_write)
		mnt_drop_write(nd->path.mnt);
	path_put(&save_parent);
	terminate_walk(nd);
	return filp;

exit_mutex_unlock:
	mutex_unlock(&dir->d_inode->i_mutex);
exit_dput:
	path_put_conditional(path, nd);
exit:
	filp = ERR_PTR(error);
	goto out;
}

static struct file *path_openat(int dfd, const char *pathname,
		struct nameidata *nd, const struct open_flags *op, int flags)
{
	struct file *base = NULL;
	struct file *filp;
	struct path path;
	int error;

	filp = get_empty_filp();
	if (!filp)
		return ERR_PTR(-ENFILE);

	filp->f_flags = op->open_flag;
	nd->intent.open.file = filp;
	nd->intent.open.flags = open_to_namei_flags(op->open_flag);
	nd->intent.open.create_mode = op->mode;

	error = path_init(dfd, pathname, flags | LOOKUP_PARENT, nd, &base);
	if (unlikely(error))
		goto out_filp;

	current->total_link_count = 0;
	error = link_path_walk(pathname, nd);
	if (unlikely(error))
		goto out_filp;

	filp = do_last(nd, &path, op, pathname);
	while (unlikely(!filp)) { /* trailing symlink */
		struct path link = path;
		void *cookie;
		if (!(nd->flags & LOOKUP_FOLLOW)) {
			path_put_conditional(&path, nd);
			path_put(&nd->path);
			filp = ERR_PTR(-ELOOP);
			break;
		}
		nd->flags |= LOOKUP_PARENT;
		nd->flags &= ~(LOOKUP_OPEN|LOOKUP_CREATE|LOOKUP_EXCL);
		error = follow_link(&link, nd, &cookie);
		if (unlikely(error))
			filp = ERR_PTR(error);
		else
			filp = do_last(nd, &path, op, pathname);
		put_link(nd, &link, cookie);
	}
out:
	if (nd->root.mnt && !(nd->flags & LOOKUP_ROOT))
		path_put(&nd->root);
	if (base)
		fput(base);
	release_open_intent(nd);
	if (filp == ERR_PTR(-EOPENSTALE)) {
		if (flags & LOOKUP_RCU)
			filp = ERR_PTR(-ECHILD);
		else
			filp = ERR_PTR(-ESTALE);
	}
	return filp;

out_filp:
	filp = ERR_PTR(error);
	goto out;
}
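/*
 * do_filp_open() below is the normal entry point for open(2).  It retries
 * path_openat() in progressively heavier modes: first with LOOKUP_RCU
 * (lockless rcu-walk), then in ordinary ref-walk mode if that bailed out
 * with -ECHILD, and finally with LOOKUP_REVAL if the dcache turned out to
 * be stale (-ESTALE).
 */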
struct file *do_filp_open(int dfd, const char *pathname,
		const struct open_flags *op, int flags)
{
	struct nameidata nd;
	struct file *filp;

	filp = path_openat(dfd, pathname, &nd, op, flags | LOOKUP_RCU);
	if (unlikely(filp == ERR_PTR(-ECHILD)))
		filp = path_openat(dfd, pathname, &nd, op, flags);
	if (unlikely(filp == ERR_PTR(-ESTALE)))
		filp = path_openat(dfd, pathname, &nd, op, flags | LOOKUP_REVAL);
	return filp;
}

struct file *do_file_open_root(struct dentry *dentry, struct vfsmount *mnt,
		const char *name, const struct open_flags *op, int flags)
{
	struct nameidata nd;
	struct file *file;

	nd.root.mnt = mnt;
	nd.root.dentry = dentry;

	flags |= LOOKUP_ROOT;

	if (dentry->d_inode->i_op->follow_link && op->intent & LOOKUP_OPEN)
		return ERR_PTR(-ELOOP);

	file = path_openat(-1, name, &nd, op, flags | LOOKUP_RCU);
	if (unlikely(file == ERR_PTR(-ECHILD)))
		file = path_openat(-1, name, &nd, op, flags);
	if (unlikely(file == ERR_PTR(-ESTALE)))
		file = path_openat(-1, name, &nd, op, flags | LOOKUP_REVAL);
	return file;
}
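/*
 * kern_path_create() and user_path_create() below hand the caller a locked
 * parent plus a negative dentry for the last component.  A sketch of the
 * calling convention, mirroring what mknodat/mkdirat/symlinkat in this file
 * do ("foo" is just a made-up example path):
 *
 *	struct path path;
 *	struct dentry *dentry = kern_path_create(AT_FDCWD, "foo", &path, 0);
 *	if (!IS_ERR(dentry)) {
 *		... mnt_want_write(path.mnt), security hook, vfs_mknod() ...
 *		... mnt_drop_write(path.mnt) ...
 *		dput(dentry);
 *		mutex_unlock(&path.dentry->d_inode->i_mutex);
 *		path_put(&path);
 *	}
 */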
struct dentry *kern_path_create(int dfd, const char *pathname, struct path *path, int is_dir)
{
	struct dentry *dentry = ERR_PTR(-EEXIST);
	struct nameidata nd;
	int error = do_path_lookup(dfd, pathname, LOOKUP_PARENT, &nd);
	if (error)
		return ERR_PTR(error);

	/*
	 * Yucky last component or no last component at all?
	 * (foo/., foo/.., /////)
	 */
	if (nd.last_type != LAST_NORM)
		goto out;
	nd.flags &= ~LOOKUP_PARENT;
	nd.flags |= LOOKUP_CREATE | LOOKUP_EXCL;
	nd.intent.open.flags = O_EXCL;

	/*
	 * Do the final lookup.
	 */
	mutex_lock_nested(&nd.path.dentry->d_inode->i_mutex, I_MUTEX_PARENT);
	dentry = lookup_hash(&nd);
	if (IS_ERR(dentry))
		goto fail;

	if (dentry->d_inode)
		goto eexist;
	/*
	 * Special case - lookup gave negative, but... we had foo/bar/
	 * From the vfs_mknod() POV we just have a negative dentry -
	 * all is fine. Let's be bastards - you had / on the end, you've
	 * been asking for a (non-existent) directory. -ENOENT for you.
	 */
	if (unlikely(!is_dir && nd.last.name[nd.last.len])) {
		dput(dentry);
		dentry = ERR_PTR(-ENOENT);
		goto fail;
	}
	*path = nd.path;
	return dentry;
eexist:
	dput(dentry);
	dentry = ERR_PTR(-EEXIST);
fail:
	mutex_unlock(&nd.path.dentry->d_inode->i_mutex);
out:
	path_put(&nd.path);
	return dentry;
}
EXPORT_SYMBOL(kern_path_create);

struct dentry *user_path_create(int dfd, const char __user *pathname, struct path *path, int is_dir)
{
	char *tmp = getname(pathname);
	struct dentry *res;
	if (IS_ERR(tmp))
		return ERR_CAST(tmp);
	res = kern_path_create(dfd, tmp, path, is_dir);
	putname(tmp);
	return res;
}
EXPORT_SYMBOL(user_path_create);

int vfs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, dev_t dev)
{
	int error = may_create(dir, dentry);

	if (error)
		return error;

	if ((S_ISCHR(mode) || S_ISBLK(mode)) && !capable(CAP_MKNOD))
		return -EPERM;

	if (!dir->i_op->mknod)
		return -EPERM;

	error = devcgroup_inode_mknod(mode, dev);
	if (error)
		return error;

	error = security_inode_mknod(dir, dentry, mode, dev);
	if (error)
		return error;

	error = dir->i_op->mknod(dir, dentry, mode, dev);
	if (!error)
		fsnotify_create(dir, dentry);
	return error;
}

static int may_mknod(umode_t mode)
{
	switch (mode & S_IFMT) {
	case S_IFREG:
	case S_IFCHR:
	case S_IFBLK:
	case S_IFIFO:
	case S_IFSOCK:
	case 0: /* zero mode translates to S_IFREG */
		return 0;
	case S_IFDIR:
		return -EPERM;
	default:
		return -EINVAL;
	}
}

SYSCALL_DEFINE4(mknodat, int, dfd, const char __user *, filename, umode_t, mode,
		unsigned, dev)
{
	struct dentry *dentry;
	struct path path;
	int error;

	if (S_ISDIR(mode))
		return -EPERM;

	dentry = user_path_create(dfd, filename, &path, 0);
	if (IS_ERR(dentry))
		return PTR_ERR(dentry);

	if (!IS_POSIXACL(path.dentry->d_inode))
		mode &= ~current_umask();
	error = may_mknod(mode);
	if (error)
		goto out_dput;
	error = mnt_want_write(path.mnt);
	if (error)
		goto out_dput;
	error = security_path_mknod(&path, dentry, mode, dev);
	if (error)
		goto out_drop_write;
	switch (mode & S_IFMT) {
		case 0: case S_IFREG:
			error = vfs_create(path.dentry->d_inode, dentry, mode, NULL);
			break;
		case S_IFCHR: case S_IFBLK:
			error = vfs_mknod(path.dentry->d_inode, dentry, mode,
					new_decode_dev(dev));
			break;
		case S_IFIFO: case S_IFSOCK:
			error = vfs_mknod(path.dentry->d_inode, dentry, mode, 0);
			break;
	}
out_drop_write:
	mnt_drop_write(path.mnt);
out_dput:
	dput(dentry);
	mutex_unlock(&path.dentry->d_inode->i_mutex);
	path_put(&path);

	return error;
}

SYSCALL_DEFINE3(mknod, const char __user *, filename, umode_t, mode, unsigned, dev)
{
	return sys_mknodat(AT_FDCWD, filename, mode, dev);
}
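/*
 * For illustration (hypothetical user-space calls): mknod("fifo",
 * S_IFIFO | 0600, 0) takes the S_IFIFO branch in sys_mknodat() above and
 * reaches vfs_mknod() with dev == 0, while a bare mode of 0600 (no type
 * bits) is treated as S_IFREG by may_mknod() and ends up in vfs_create()
 * instead.
 */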
int vfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
{
	int error = may_create(dir, dentry);
	unsigned max_links = dir->i_sb->s_max_links;

	if (error)
		return error;

	if (!dir->i_op->mkdir)
		return -EPERM;

	mode &= (S_IRWXUGO|S_ISVTX);
	error = security_inode_mkdir(dir, dentry, mode);
	if (error)
		return error;

	if (max_links && dir->i_nlink >= max_links)
		return -EMLINK;

	error = dir->i_op->mkdir(dir, dentry, mode);
	if (!error)
		fsnotify_mkdir(dir, dentry);
	return error;
}

SYSCALL_DEFINE3(mkdirat, int, dfd, const char __user *, pathname, umode_t, mode)
{
	struct dentry *dentry;
	struct path path;
	int error;

	dentry = user_path_create(dfd, pathname, &path, 1);
	if (IS_ERR(dentry))
		return PTR_ERR(dentry);

	if (!IS_POSIXACL(path.dentry->d_inode))
		mode &= ~current_umask();
	error = mnt_want_write(path.mnt);
	if (error)
		goto out_dput;
	error = security_path_mkdir(&path, dentry, mode);
	if (error)
		goto out_drop_write;
	error = vfs_mkdir(path.dentry->d_inode, dentry, mode);
out_drop_write:
	mnt_drop_write(path.mnt);
out_dput:
	dput(dentry);
	mutex_unlock(&path.dentry->d_inode->i_mutex);
	path_put(&path);
	return error;
}

SYSCALL_DEFINE2(mkdir, const char __user *, pathname, umode_t, mode)
{
	return sys_mkdirat(AT_FDCWD, pathname, mode);
}

/*
 * The dentry_unhash() helper will try to drop the dentry early: we
 * should have a usage count of 1 if we're the only user of this
 * dentry, and if that is true (possibly after pruning the dcache),
 * then we drop the dentry now.
 *
 * A low-level filesystem can, if it chooses, legally
 * do a
 *
 *	if (!d_unhashed(dentry))
 *		return -EBUSY;
 *
 * if it cannot handle the case of removing a directory
 * that is still in use by something else.
 */
void dentry_unhash(struct dentry *dentry)
{
	shrink_dcache_parent(dentry);
	spin_lock(&dentry->d_lock);
	if (dentry->d_count == 1)
		__d_drop(dentry);
	spin_unlock(&dentry->d_lock);
}

int vfs_rmdir(struct inode *dir, struct dentry *dentry)
{
	int error = may_delete(dir, dentry, 1);

	if (error)
		return error;

	if (!dir->i_op->rmdir)
		return -EPERM;

	dget(dentry);
	mutex_lock(&dentry->d_inode->i_mutex);

	error = -EBUSY;
	if (d_mountpoint(dentry))
		goto out;

	error = security_inode_rmdir(dir, dentry);
	if (error)
		goto out;

	shrink_dcache_parent(dentry);
	error = dir->i_op->rmdir(dir, dentry);
	if (error)
		goto out;

	dentry->d_inode->i_flags |= S_DEAD;
	dont_mount(dentry);

out:
	mutex_unlock(&dentry->d_inode->i_mutex);
	dput(dentry);
	if (!error)
		d_delete(dentry);
	return error;
}
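/*
 * Note the locking convention vfs_rmdir() relies on: the caller holds the
 * parent's ->i_mutex (do_rmdir() below takes it with I_MUTEX_PARENT before
 * lookup_hash()); vfs_rmdir() itself only locks the victim, marks it S_DEAD
 * on success, and d_delete()s it after the lock is dropped.
 */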
static long do_rmdir(int dfd, const char __user *pathname)
{
	int error = 0;
	char *name;
	struct dentry *dentry;
	struct nameidata nd;

	error = user_path_parent(dfd, pathname, &nd, &name);
	if (error)
		return error;

	switch (nd.last_type) {
	case LAST_DOTDOT:
		error = -ENOTEMPTY;
		goto exit1;
	case LAST_DOT:
		error = -EINVAL;
		goto exit1;
	case LAST_ROOT:
		error = -EBUSY;
		goto exit1;
	}

	nd.flags &= ~LOOKUP_PARENT;

	mutex_lock_nested(&nd.path.dentry->d_inode->i_mutex, I_MUTEX_PARENT);
	dentry = lookup_hash(&nd);
	error = PTR_ERR(dentry);
	if (IS_ERR(dentry))
		goto exit2;
	if (!dentry->d_inode) {
		error = -ENOENT;
		goto exit3;
	}
	error = mnt_want_write(nd.path.mnt);
	if (error)
		goto exit3;
	error = security_path_rmdir(&nd.path, dentry);
	if (error)
		goto exit4;
	error = vfs_rmdir(nd.path.dentry->d_inode, dentry);
exit4:
	mnt_drop_write(nd.path.mnt);
exit3:
	dput(dentry);
exit2:
	mutex_unlock(&nd.path.dentry->d_inode->i_mutex);
exit1:
	path_put(&nd.path);
	putname(name);
	return error;
}

SYSCALL_DEFINE1(rmdir, const char __user *, pathname)
{
	return do_rmdir(AT_FDCWD, pathname);
}

int vfs_unlink(struct inode *dir, struct dentry *dentry)
{
	int error = may_delete(dir, dentry, 0);

	if (error)
		return error;

	if (!dir->i_op->unlink)
		return -EPERM;

	mutex_lock(&dentry->d_inode->i_mutex);
	if (d_mountpoint(dentry))
		error = -EBUSY;
	else {
		error = security_inode_unlink(dir, dentry);
		if (!error) {
			error = dir->i_op->unlink(dir, dentry);
			if (!error)
				dont_mount(dentry);
		}
	}
	mutex_unlock(&dentry->d_inode->i_mutex);

	/* We don't d_delete() NFS sillyrenamed files--they still exist. */
	if (!error && !(dentry->d_flags & DCACHE_NFSFS_RENAMED)) {
		fsnotify_link_count(dentry->d_inode);
		d_delete(dentry);
	}

	return error;
}

/*
 * Make sure that the actual truncation of the file will occur outside its
 * directory's i_mutex.  Truncate can take a long time if there is a lot of
 * writeout happening, and we don't want to prevent access to the directory
 * while waiting on the I/O.
 */
static long do_unlinkat(int dfd, const char __user *pathname)
{
	int error;
	char *name;
	struct dentry *dentry;
	struct nameidata nd;
	struct inode *inode = NULL;

	error = user_path_parent(dfd, pathname, &nd, &name);
	if (error)
		return error;

	error = -EISDIR;
	if (nd.last_type != LAST_NORM)
		goto exit1;

	nd.flags &= ~LOOKUP_PARENT;

	mutex_lock_nested(&nd.path.dentry->d_inode->i_mutex, I_MUTEX_PARENT);
	dentry = lookup_hash(&nd);
	error = PTR_ERR(dentry);
	if (!IS_ERR(dentry)) {
		/* Why not before? Because we want correct error value */
		if (nd.last.name[nd.last.len])
			goto slashes;
		inode = dentry->d_inode;
		if (!inode)
			goto slashes;
		ihold(inode);
		error = mnt_want_write(nd.path.mnt);
		if (error)
			goto exit2;
		error = security_path_unlink(&nd.path, dentry);
		if (error)
			goto exit3;
		error = vfs_unlink(nd.path.dentry->d_inode, dentry);
exit3:
		mnt_drop_write(nd.path.mnt);
exit2:
		dput(dentry);
	}
	mutex_unlock(&nd.path.dentry->d_inode->i_mutex);
	if (inode)
		iput(inode);	/* truncate the inode here */
exit1:
	path_put(&nd.path);
	putname(name);
	return error;

slashes:
	error = !dentry->d_inode ? -ENOENT :
		S_ISDIR(dentry->d_inode->i_mode) ? -EISDIR : -ENOTDIR;
	goto exit2;
}

SYSCALL_DEFINE3(unlinkat, int, dfd, const char __user *, pathname, int, flag)
{
	if ((flag & ~AT_REMOVEDIR) != 0)
		return -EINVAL;

	if (flag & AT_REMOVEDIR)
		return do_rmdir(dfd, pathname);

	return do_unlinkat(dfd, pathname);
}

SYSCALL_DEFINE1(unlink, const char __user *, pathname)
{
	return do_unlinkat(AT_FDCWD, pathname);
}

int vfs_symlink(struct inode *dir, struct dentry *dentry, const char *oldname)
{
	int error = may_create(dir, dentry);

	if (error)
		return error;

	if (!dir->i_op->symlink)
		return -EPERM;

	error = security_inode_symlink(dir, dentry, oldname);
	if (error)
		return error;

	error = dir->i_op->symlink(dir, dentry, oldname);
	if (!error)
		fsnotify_create(dir, dentry);
	return error;
}

SYSCALL_DEFINE3(symlinkat, const char __user *, oldname,
		int, newdfd, const char __user *, newname)
{
	int error;
	char *from;
	struct dentry *dentry;
	struct path path;

	from = getname(oldname);
	if (IS_ERR(from))
		return PTR_ERR(from);

	dentry = user_path_create(newdfd, newname, &path, 0);
	error = PTR_ERR(dentry);
	if (IS_ERR(dentry))
		goto out_putname;

	error = mnt_want_write(path.mnt);
	if (error)
		goto out_dput;
	error = security_path_symlink(&path, dentry, from);
	if (error)
		goto out_drop_write;
	error = vfs_symlink(path.dentry->d_inode, dentry, from);
out_drop_write:
	mnt_drop_write(path.mnt);
out_dput:
	dput(dentry);
	mutex_unlock(&path.dentry->d_inode->i_mutex);
	path_put(&path);
out_putname:
	putname(from);
	return error;
}

SYSCALL_DEFINE2(symlink, const char __user *, oldname, const char __user *, newname)
{
	return sys_symlinkat(oldname, AT_FDCWD, newname);
}

int vfs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *new_dentry)
{
	struct inode *inode = old_dentry->d_inode;
	unsigned max_links = dir->i_sb->s_max_links;
	int error;

	if (!inode)
		return -ENOENT;

	error = may_create(dir, new_dentry);
	if (error)
		return error;

	if (dir->i_sb != inode->i_sb)
		return -EXDEV;

	/*
	 * A link to an append-only or immutable file cannot be created.
	 */
	if (IS_APPEND(inode) || IS_IMMUTABLE(inode))
		return -EPERM;
	if (!dir->i_op->link)
		return -EPERM;
	if (S_ISDIR(inode->i_mode))
		return -EPERM;

	error = security_inode_link(old_dentry, dir, new_dentry);
	if (error)
		return error;

	mutex_lock(&inode->i_mutex);
	/* Make sure we don't allow creating hardlink to an unlinked file */
	if (inode->i_nlink == 0)
		error = -ENOENT;
	else if (max_links && inode->i_nlink >= max_links)
		error = -EMLINK;
	else
		error = dir->i_op->link(old_dentry, dir, new_dentry);
	mutex_unlock(&inode->i_mutex);
	if (!error)
		fsnotify_link(dir, inode, new_dentry);
	return error;
}

/*
 * Hardlinks are often used in delicate situations.  We avoid
 * security-related surprises by not following symlinks on the
 * newname.  --KAB
 *
 * We don't follow them on the oldname either to be compatible
 * with linux 2.0, and to avoid hard-linking to directories
 * and other special files.  --ADM
 */
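/*
 * So by default linkat(olddfd, "some-symlink", newdfd, "new", 0) makes "new"
 * a hard link to the symlink itself; only an explicit AT_SYMLINK_FOLLOW
 * (mapped to LOOKUP_FOLLOW below) links to the symlink's target.  (The path
 * names here are, of course, just an example.)
 */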
--ADM 3096 */ 3097 SYSCALL_DEFINE5(linkat, int, olddfd, const char __user *, oldname, 3098 int, newdfd, const char __user *, newname, int, flags) 3099 { 3100 struct dentry *new_dentry; 3101 struct path old_path, new_path; 3102 int how = 0; 3103 int error; 3104 3105 if ((flags & ~(AT_SYMLINK_FOLLOW | AT_EMPTY_PATH)) != 0) 3106 return -EINVAL; 3107 /* 3108 * To use null names we require CAP_DAC_READ_SEARCH 3109 * This ensures that not everyone will be able to create 3110 * handlink using the passed filedescriptor. 3111 */ 3112 if (flags & AT_EMPTY_PATH) { 3113 if (!capable(CAP_DAC_READ_SEARCH)) 3114 return -ENOENT; 3115 how = LOOKUP_EMPTY; 3116 } 3117 3118 if (flags & AT_SYMLINK_FOLLOW) 3119 how |= LOOKUP_FOLLOW; 3120 3121 error = user_path_at(olddfd, oldname, how, &old_path); 3122 if (error) 3123 return error; 3124 3125 new_dentry = user_path_create(newdfd, newname, &new_path, 0); 3126 error = PTR_ERR(new_dentry); 3127 if (IS_ERR(new_dentry)) 3128 goto out; 3129 3130 error = -EXDEV; 3131 if (old_path.mnt != new_path.mnt) 3132 goto out_dput; 3133 error = mnt_want_write(new_path.mnt); 3134 if (error) 3135 goto out_dput; 3136 error = security_path_link(old_path.dentry, &new_path, new_dentry); 3137 if (error) 3138 goto out_drop_write; 3139 error = vfs_link(old_path.dentry, new_path.dentry->d_inode, new_dentry); 3140 out_drop_write: 3141 mnt_drop_write(new_path.mnt); 3142 out_dput: 3143 dput(new_dentry); 3144 mutex_unlock(&new_path.dentry->d_inode->i_mutex); 3145 path_put(&new_path); 3146 out: 3147 path_put(&old_path); 3148 3149 return error; 3150 } 3151 3152 SYSCALL_DEFINE2(link, const char __user *, oldname, const char __user *, newname) 3153 { 3154 return sys_linkat(AT_FDCWD, oldname, AT_FDCWD, newname, 0); 3155 } 3156 3157 /* 3158 * The worst of all namespace operations - renaming directory. "Perverted" 3159 * doesn't even start to describe it. Somebody in UCB had a heck of a trip... 3160 * Problems: 3161 * a) we can get into loop creation. Check is done in is_subdir(). 3162 * b) race potential - two innocent renames can create a loop together. 3163 * That's where 4.4 screws up. Current fix: serialization on 3164 * sb->s_vfs_rename_mutex. We might be more accurate, but that's another 3165 * story. 3166 * c) we have to lock _three_ objects - parents and victim (if it exists). 3167 * And that - after we got ->i_mutex on parents (until then we don't know 3168 * whether the target exists). Solution: try to be smart with locking 3169 * order for inodes. We rely on the fact that tree topology may change 3170 * only under ->s_vfs_rename_mutex _and_ that parent of the object we 3171 * move will be locked. Thus we can rank directories by the tree 3172 * (ancestors first) and rank all non-directories after them. 3173 * That works since everybody except rename does "lock parent, lookup, 3174 * lock child" and rename is under ->s_vfs_rename_mutex. 3175 * HOWEVER, it relies on the assumption that any object with ->lookup() 3176 * has no more than 1 dentry. If "hybrid" objects will ever appear, 3177 * we'd better make sure that there's no link(2) for them. 3178 * d) conversion from fhandle to dentry may come in the wrong moment - when 3179 * we are removing the target. Solution: we will have to grab ->i_mutex 3180 * in the fhandle_to_dentry code. [FIXME - current nfsfh.c relies on 3181 * ->i_mutex on parents, which works but leads to some truly excessive 3182 * locking]. 
static int vfs_rename_dir(struct inode *old_dir, struct dentry *old_dentry,
			  struct inode *new_dir, struct dentry *new_dentry)
{
	int error = 0;
	struct inode *target = new_dentry->d_inode;
	unsigned max_links = new_dir->i_sb->s_max_links;

	/*
	 * If we are going to change the parent - check write permissions,
	 * we'll need to flip '..'.
	 */
	if (new_dir != old_dir) {
		error = inode_permission(old_dentry->d_inode, MAY_WRITE);
		if (error)
			return error;
	}

	error = security_inode_rename(old_dir, old_dentry, new_dir, new_dentry);
	if (error)
		return error;

	dget(new_dentry);
	if (target)
		mutex_lock(&target->i_mutex);

	error = -EBUSY;
	if (d_mountpoint(old_dentry) || d_mountpoint(new_dentry))
		goto out;

	error = -EMLINK;
	if (max_links && !target && new_dir != old_dir &&
	    new_dir->i_nlink >= max_links)
		goto out;

	if (target)
		shrink_dcache_parent(new_dentry);
	error = old_dir->i_op->rename(old_dir, old_dentry, new_dir, new_dentry);
	if (error)
		goto out;

	if (target) {
		target->i_flags |= S_DEAD;
		dont_mount(new_dentry);
	}
out:
	if (target)
		mutex_unlock(&target->i_mutex);
	dput(new_dentry);
	if (!error)
		if (!(old_dir->i_sb->s_type->fs_flags & FS_RENAME_DOES_D_MOVE))
			d_move(old_dentry, new_dentry);
	return error;
}

static int vfs_rename_other(struct inode *old_dir, struct dentry *old_dentry,
			    struct inode *new_dir, struct dentry *new_dentry)
{
	struct inode *target = new_dentry->d_inode;
	int error;

	error = security_inode_rename(old_dir, old_dentry, new_dir, new_dentry);
	if (error)
		return error;

	dget(new_dentry);
	if (target)
		mutex_lock(&target->i_mutex);

	error = -EBUSY;
	if (d_mountpoint(old_dentry) || d_mountpoint(new_dentry))
		goto out;

	error = old_dir->i_op->rename(old_dir, old_dentry, new_dir, new_dentry);
	if (error)
		goto out;

	if (target)
		dont_mount(new_dentry);
	if (!(old_dir->i_sb->s_type->fs_flags & FS_RENAME_DOES_D_MOVE))
		d_move(old_dentry, new_dentry);
out:
	if (target)
		mutex_unlock(&target->i_mutex);
	dput(new_dentry);
	return error;
}

int vfs_rename(struct inode *old_dir, struct dentry *old_dentry,
	       struct inode *new_dir, struct dentry *new_dentry)
{
	int error;
	int is_dir = S_ISDIR(old_dentry->d_inode->i_mode);
	const unsigned char *old_name;

	if (old_dentry->d_inode == new_dentry->d_inode)
		return 0;

	error = may_delete(old_dir, old_dentry, is_dir);
	if (error)
		return error;

	if (!new_dentry->d_inode)
		error = may_create(new_dir, new_dentry);
	else
		error = may_delete(new_dir, new_dentry, is_dir);
	if (error)
		return error;

	if (!old_dir->i_op->rename)
		return -EPERM;

	old_name = fsnotify_oldname_init(old_dentry->d_name.name);

	if (is_dir)
		error = vfs_rename_dir(old_dir, old_dentry, new_dir, new_dentry);
	else
		error = vfs_rename_other(old_dir, old_dentry, new_dir, new_dentry);
	if (!error)
		fsnotify_move(old_dir, new_dir, old_name, is_dir,
			      new_dentry->d_inode, old_dentry);
	fsnotify_oldname_free(old_name);

	return error;
}
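/*
 * (Both rename helpers above leave the dcache move to d_move() unless the
 * filesystem set FS_RENAME_DOES_D_MOVE, in which case its ->rename() is
 * expected to have moved the dentry itself.)
 */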
SYSCALL_DEFINE4(renameat, int, olddfd, const char __user *, oldname,
		int, newdfd, const char __user *, newname)
{
	struct dentry *old_dir, *new_dir;
	struct dentry *old_dentry, *new_dentry;
	struct dentry *trap;
	struct nameidata oldnd, newnd;
	char *from;
	char *to;
	int error;

	error = user_path_parent(olddfd, oldname, &oldnd, &from);
	if (error)
		goto exit;

	error = user_path_parent(newdfd, newname, &newnd, &to);
	if (error)
		goto exit1;

	error = -EXDEV;
	if (oldnd.path.mnt != newnd.path.mnt)
		goto exit2;

	old_dir = oldnd.path.dentry;
	error = -EBUSY;
	if (oldnd.last_type != LAST_NORM)
		goto exit2;

	new_dir = newnd.path.dentry;
	if (newnd.last_type != LAST_NORM)
		goto exit2;

	oldnd.flags &= ~LOOKUP_PARENT;
	newnd.flags &= ~LOOKUP_PARENT;
	newnd.flags |= LOOKUP_RENAME_TARGET;

	trap = lock_rename(new_dir, old_dir);

	old_dentry = lookup_hash(&oldnd);
	error = PTR_ERR(old_dentry);
	if (IS_ERR(old_dentry))
		goto exit3;
	/* source must exist */
	error = -ENOENT;
	if (!old_dentry->d_inode)
		goto exit4;
	/* unless the source is a directory trailing slashes give -ENOTDIR */
	if (!S_ISDIR(old_dentry->d_inode->i_mode)) {
		error = -ENOTDIR;
		if (oldnd.last.name[oldnd.last.len])
			goto exit4;
		if (newnd.last.name[newnd.last.len])
			goto exit4;
	}
	/* source should not be ancestor of target */
	error = -EINVAL;
	if (old_dentry == trap)
		goto exit4;
	new_dentry = lookup_hash(&newnd);
	error = PTR_ERR(new_dentry);
	if (IS_ERR(new_dentry))
		goto exit4;
	/* target should not be an ancestor of source */
	error = -ENOTEMPTY;
	if (new_dentry == trap)
		goto exit5;

	error = mnt_want_write(oldnd.path.mnt);
	if (error)
		goto exit5;
	error = security_path_rename(&oldnd.path, old_dentry,
				     &newnd.path, new_dentry);
	if (error)
		goto exit6;
	error = vfs_rename(old_dir->d_inode, old_dentry,
			   new_dir->d_inode, new_dentry);
exit6:
	mnt_drop_write(oldnd.path.mnt);
exit5:
	dput(new_dentry);
exit4:
	dput(old_dentry);
exit3:
	unlock_rename(new_dir, old_dir);
exit2:
	path_put(&newnd.path);
	putname(to);
exit1:
	path_put(&oldnd.path);
	putname(from);
exit:
	return error;
}

SYSCALL_DEFINE2(rename, const char __user *, oldname, const char __user *, newname)
{
	return sys_renameat(AT_FDCWD, oldname, AT_FDCWD, newname);
}

int vfs_readlink(struct dentry *dentry, char __user *buffer, int buflen, const char *link)
{
	int len;

	len = PTR_ERR(link);
	if (IS_ERR(link))
		goto out;

	len = strlen(link);
	if (len > (unsigned) buflen)
		len = buflen;
	if (copy_to_user(buffer, link, len))
		len = -EFAULT;
out:
	return len;
}

/*
 * A helper for ->readlink().  This should be used *ONLY* for symlinks that
 * have ->follow_link() touching nd only in nd_set_link().  Using (or not
 * using) it for any given inode is up to the filesystem.
 */
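/*
 * Typical wiring for a pagecache-backed symlink, mirroring
 * page_symlink_inode_operations at the end of this file (the struct name
 * here is only an example):
 *
 *	static const struct inode_operations foo_symlink_iops = {
 *		.readlink	= generic_readlink,
 *		.follow_link	= page_follow_link_light,
 *		.put_link	= page_put_link,
 *	};
 */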
int generic_readlink(struct dentry *dentry, char __user *buffer, int buflen)
{
	struct nameidata nd;
	void *cookie;
	int res;

	nd.depth = 0;
	cookie = dentry->d_inode->i_op->follow_link(dentry, &nd);
	if (IS_ERR(cookie))
		return PTR_ERR(cookie);

	res = vfs_readlink(dentry, buffer, buflen, nd_get_link(&nd));
	if (dentry->d_inode->i_op->put_link)
		dentry->d_inode->i_op->put_link(dentry, &nd, cookie);
	return res;
}

int vfs_follow_link(struct nameidata *nd, const char *link)
{
	return __vfs_follow_link(nd, link);
}

/* get the link contents into pagecache */
static char *page_getlink(struct dentry *dentry, struct page **ppage)
{
	char *kaddr;
	struct page *page;
	struct address_space *mapping = dentry->d_inode->i_mapping;
	page = read_mapping_page(mapping, 0, NULL);
	if (IS_ERR(page))
		return (char *)page;
	*ppage = page;
	kaddr = kmap(page);
	nd_terminate_link(kaddr, dentry->d_inode->i_size, PAGE_SIZE - 1);
	return kaddr;
}

int page_readlink(struct dentry *dentry, char __user *buffer, int buflen)
{
	struct page *page = NULL;
	char *s = page_getlink(dentry, &page);
	int res = vfs_readlink(dentry, buffer, buflen, s);
	if (page) {
		kunmap(page);
		page_cache_release(page);
	}
	return res;
}

void *page_follow_link_light(struct dentry *dentry, struct nameidata *nd)
{
	struct page *page = NULL;
	nd_set_link(nd, page_getlink(dentry, &page));
	return page;
}

void page_put_link(struct dentry *dentry, struct nameidata *nd, void *cookie)
{
	struct page *page = cookie;

	if (page) {
		kunmap(page);
		page_cache_release(page);
	}
}

/*
 * The nofs argument instructs pagecache_write_begin to pass AOP_FLAG_NOFS
 */
int __page_symlink(struct inode *inode, const char *symname, int len, int nofs)
{
	struct address_space *mapping = inode->i_mapping;
	struct page *page;
	void *fsdata;
	int err;
	char *kaddr;
	unsigned int flags = AOP_FLAG_UNINTERRUPTIBLE;
	if (nofs)
		flags |= AOP_FLAG_NOFS;

retry:
	err = pagecache_write_begin(NULL, mapping, 0, len-1,
				    flags, &page, &fsdata);
	if (err)
		goto fail;

	kaddr = kmap_atomic(page);
	memcpy(kaddr, symname, len-1);
	kunmap_atomic(kaddr);

	err = pagecache_write_end(NULL, mapping, 0, len-1, len-1,
				  page, fsdata);
	if (err < 0)
		goto fail;
	if (err < len-1)
		goto retry;

	mark_inode_dirty(inode);
	return 0;
fail:
	return err;
}

int page_symlink(struct inode *inode, const char *symname, int len)
{
	return __page_symlink(inode, symname, len,
			      !(mapping_gfp_mask(inode->i_mapping) & __GFP_FS));
}

const struct inode_operations page_symlink_inode_operations = {
	.readlink	= generic_readlink,
	.follow_link	= page_follow_link_light,
	.put_link	= page_put_link,
};

EXPORT_SYMBOL(user_path_at);
EXPORT_SYMBOL(follow_down_one);
EXPORT_SYMBOL(follow_down);
EXPORT_SYMBOL(follow_up);
EXPORT_SYMBOL(get_write_access); /* binfmt_aout */
EXPORT_SYMBOL(getname);
EXPORT_SYMBOL(lock_rename);
EXPORT_SYMBOL(lookup_one_len);
EXPORT_SYMBOL(page_follow_link_light);
EXPORT_SYMBOL(page_put_link);
EXPORT_SYMBOL(page_readlink);
EXPORT_SYMBOL(__page_symlink);
EXPORT_SYMBOL(page_symlink);
EXPORT_SYMBOL(page_symlink_inode_operations);
EXPORT_SYMBOL(kern_path);
EXPORT_SYMBOL(vfs_path_lookup);
EXPORT_SYMBOL(inode_permission);
EXPORT_SYMBOL(unlock_rename);
EXPORT_SYMBOL(vfs_create);
EXPORT_SYMBOL(vfs_follow_link);
EXPORT_SYMBOL(vfs_link);
EXPORT_SYMBOL(vfs_mkdir);
EXPORT_SYMBOL(vfs_mknod);
EXPORT_SYMBOL(generic_permission);
EXPORT_SYMBOL(vfs_readlink);
EXPORT_SYMBOL(vfs_rename);
EXPORT_SYMBOL(vfs_rmdir);
EXPORT_SYMBOL(vfs_symlink);
EXPORT_SYMBOL(vfs_unlink);
EXPORT_SYMBOL(dentry_unhash);
EXPORT_SYMBOL(generic_readlink);