/*
 *  linux/fs/namei.c
 *
 *  Copyright (C) 1991, 1992  Linus Torvalds
 */

/*
 * Some corrections by tytso.
 */

/* [Feb 1997 T. Schoebel-Theuer] Complete rewrite of the pathname
 * lookup logic.
 */
/* [Feb-Apr 2000, AV] Rewrite to the new namespace architecture.
 */

#include <linux/init.h>
#include <linux/export.h>
#include <linux/slab.h>
#include <linux/fs.h>
#include <linux/namei.h>
#include <linux/pagemap.h>
#include <linux/fsnotify.h>
#include <linux/personality.h>
#include <linux/security.h>
#include <linux/ima.h>
#include <linux/syscalls.h>
#include <linux/mount.h>
#include <linux/audit.h>
#include <linux/capability.h>
#include <linux/file.h>
#include <linux/fcntl.h>
#include <linux/device_cgroup.h>
#include <linux/fs_struct.h>
#include <linux/posix_acl.h>
#include <asm/uaccess.h>

#include "internal.h"
#include "mount.h"

/* [Feb-1997 T. Schoebel-Theuer]
 * Fundamental changes in the pathname lookup mechanisms (namei)
 * were necessary because of omirr.  The reason is that omirr needs
 * to know the _real_ pathname, not the user-supplied one, in case
 * of symlinks (and also when transname replacements occur).
 *
 * The new code replaces the old recursive symlink resolution with
 * an iterative one (in case of non-nested symlink chains).  It does
 * this with calls to <fs>_follow_link().
 * As a side effect, dir_namei(), _namei() and follow_link() are now
 * replaced with a single function lookup_dentry() that can handle all
 * the special cases of the former code.
 *
 * With the new dcache, the pathname is stored at each inode, at least as
 * long as the refcount of the inode is positive.  As a side effect, the
 * size of the dcache depends on the inode cache and thus is dynamic.
 *
 * [29-Apr-1998 C. Scott Ananian] Updated above description of symlink
 * resolution to correspond with current state of the code.
 *
 * Note that the symlink resolution is not *completely* iterative.
 * There is still a significant amount of tail- and mid- recursion in
 * the algorithm.  Also, note that <fs>_readlink() is not used in
 * lookup_dentry(): lookup_dentry() on the result of <fs>_readlink()
 * may return different results than <fs>_follow_link().  Many virtual
 * filesystems (including /proc) exhibit this behavior.
 */

/* [24-Feb-97 T. Schoebel-Theuer] Side effects caused by new implementation:
 * New symlink semantics: when open() is called with flags O_CREAT | O_EXCL
 * and the name already exists in form of a symlink, try to create the new
 * name indicated by the symlink. The old code always complained that the
 * name already exists, due to not following the symlink even if its target
 * is nonexistent.  The new semantics affects also mknod() and link() when
 * the name is a symlink pointing to a non-existent name.
 *
 * I don't know which semantics is the right one, since I have no access
 * to standards. But I found by trial that HP-UX 9.0 has the full "new"
 * semantics implemented, while SunOS 4.1.1 and Solaris (SunOS 5.4) have the
 * "old" one. Personally, I think the new semantics is much more logical.
 * Note that "ln old new" where "new" is a symlink pointing to a non-existing
 * file does succeed in both HP-UX and SunOs, but not in Solaris
 * and in the old Linux semantics.
 */

/* [16-Dec-97 Kevin Buhr] For security reasons, we change some symlink
 * semantics.  See the comments in "open_namei" and "do_link" below.
 *
 * [10-Sep-98 Alan Modra] Another symlink change.
 */

/* [Feb-Apr 2000 AV] Complete rewrite. Rules for symlinks:
 *	inside the path	- always follow.
 *	in the last component in creation/removal/renaming - never follow.
 *	if LOOKUP_FOLLOW passed - follow.
 *	if the pathname has trailing slashes - follow.
 *	otherwise - don't follow.
 * (applied in that order).
 *
 * [Jun 2000 AV] Inconsistent behaviour of open() in case if flags==O_CREAT
 * restored for 2.4. This is the last surviving part of old 4.2BSD bug.
 * During the 2.4 we need to fix the userland stuff depending on it -
 * hopefully we will be able to get rid of that wart in 2.5. So far only
 * XEmacs seems to be relying on it...
 */
/*
 * [Sep 2001 AV] Single-semaphore locking scheme (kudos to David Holland)
 * implemented.  Let's see if raised priority of ->s_vfs_rename_mutex gives
 * any extra contention...
 */

/* In order to reduce some races, while at the same time doing additional
 * checking and hopefully speeding things up, we copy filenames to the
 * kernel data space before using them..
 *
 * POSIX.1 2.4: an empty pathname is invalid (ENOENT).
 * PATH_MAX includes the nul terminator --RR.
 */
static int do_getname(const char __user *filename, char *page)
{
	int retval;
	unsigned long len = PATH_MAX;

	if (!segment_eq(get_fs(), KERNEL_DS)) {
		if ((unsigned long) filename >= TASK_SIZE)
			return -EFAULT;
		if (TASK_SIZE - (unsigned long) filename < PATH_MAX)
			len = TASK_SIZE - (unsigned long) filename;
	}

	retval = strncpy_from_user(page, filename, len);
	if (retval > 0) {
		if (retval < len)
			return 0;
		return -ENAMETOOLONG;
	} else if (!retval)
		retval = -ENOENT;
	return retval;
}

static char *getname_flags(const char __user *filename, int flags, int *empty)
{
	char *result = __getname();
	int retval;

	if (!result)
		return ERR_PTR(-ENOMEM);

	retval = do_getname(filename, result);
	if (retval < 0) {
		if (retval == -ENOENT && empty)
			*empty = 1;
		if (retval != -ENOENT || !(flags & LOOKUP_EMPTY)) {
			__putname(result);
			return ERR_PTR(retval);
		}
	}
	audit_getname(result);
	return result;
}

char *getname(const char __user * filename)
{
	return getname_flags(filename, 0, NULL);
}

#ifdef CONFIG_AUDITSYSCALL
void putname(const char *name)
{
	if (unlikely(!audit_dummy_context()))
		audit_putname(name);
	else
		__putname(name);
}
EXPORT_SYMBOL(putname);
#endif

static int check_acl(struct inode *inode, int mask)
{
#ifdef CONFIG_FS_POSIX_ACL
	struct posix_acl *acl;

	if (mask & MAY_NOT_BLOCK) {
		acl = get_cached_acl_rcu(inode, ACL_TYPE_ACCESS);
		if (!acl)
			return -EAGAIN;
		/* no ->get_acl() calls in RCU mode... */
		if (acl == ACL_NOT_CACHED)
			return -ECHILD;
		return posix_acl_permission(inode, acl, mask & ~MAY_NOT_BLOCK);
	}

	acl = get_cached_acl(inode, ACL_TYPE_ACCESS);

	/*
	 * A filesystem can force an ACL callback by just never filling the
	 * ACL cache. But normally you'd fill the cache either at inode
	 * instantiation time, or on the first ->get_acl call.
	 *
	 * If the filesystem doesn't have a get_acl() function at all, we'll
	 * just create the negative cache entry.
	 */
	if (acl == ACL_NOT_CACHED) {
		if (inode->i_op->get_acl) {
			acl = inode->i_op->get_acl(inode, ACL_TYPE_ACCESS);
			if (IS_ERR(acl))
				return PTR_ERR(acl);
		} else {
			set_cached_acl(inode, ACL_TYPE_ACCESS, NULL);
			return -EAGAIN;
		}
	}

	if (acl) {
		int error = posix_acl_permission(inode, acl, mask);
		posix_acl_release(acl);
		return error;
	}
#endif

	return -EAGAIN;
}

/*
 * This does the basic permission checking
 */
static int acl_permission_check(struct inode *inode, int mask)
{
	unsigned int mode = inode->i_mode;

	if (likely(uid_eq(current_fsuid(), inode->i_uid)))
		mode >>= 6;
	else {
		if (IS_POSIXACL(inode) && (mode & S_IRWXG)) {
			int error = check_acl(inode, mask);
			if (error != -EAGAIN)
				return error;
		}

		if (in_group_p(inode->i_gid))
			mode >>= 3;
	}

	/*
	 * If the DACs are ok we don't need any capability check.
	 */
	if ((mask & ~mode & (MAY_READ | MAY_WRITE | MAY_EXEC)) == 0)
		return 0;
	return -EACCES;
}

/**
 * generic_permission - check for access rights on a Posix-like filesystem
 * @inode:	inode to check access rights for
 * @mask:	right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC, ...)
 *
 * Used to check for read/write/execute permissions on a file.
 * We use "fsuid" for this, letting us set arbitrary permissions
 * for filesystem access without changing the "normal" uids which
 * are used for other things.
 *
 * generic_permission is rcu-walk aware. It returns -ECHILD in case an rcu-walk
 * request cannot be satisfied (eg. requires blocking or too much complexity).
 * It would then be called again in ref-walk mode.
 */
int generic_permission(struct inode *inode, int mask)
{
	int ret;

	/*
	 * Do the basic permission checks.
	 */
	ret = acl_permission_check(inode, mask);
	if (ret != -EACCES)
		return ret;

	if (S_ISDIR(inode->i_mode)) {
		/* DACs are overridable for directories */
		if (inode_capable(inode, CAP_DAC_OVERRIDE))
			return 0;
		if (!(mask & MAY_WRITE))
			if (inode_capable(inode, CAP_DAC_READ_SEARCH))
				return 0;
		return -EACCES;
	}
	/*
	 * Read/write DACs are always overridable.
	 * Executable DACs are overridable when there is
	 * at least one exec bit set.
	 */
	if (!(mask & MAY_EXEC) || (inode->i_mode & S_IXUGO))
		if (inode_capable(inode, CAP_DAC_OVERRIDE))
			return 0;

	/*
	 * Searching includes executable on directories, else just read.
	 */
	mask &= MAY_READ | MAY_WRITE | MAY_EXEC;
	if (mask == MAY_READ)
		if (inode_capable(inode, CAP_DAC_READ_SEARCH))
			return 0;

	return -EACCES;
}
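
/*
 * Worked example (illustrative): the mode shifting in acl_permission_check()
 * selects which rwx triplet is compared against the MAY_* bits in the mask
 * (MAY_READ, MAY_WRITE and MAY_EXEC line up with the low octal digit).
 * With i_mode 0644 and a caller whose fsuid matches i_uid:
 *
 *	mode >>= 6;			// low bits are now rw- (6)
 *	mask & ~mode & 7 == 0		// so MAY_WRITE (2) is allowed
 *
 * A mere group member gets mode >>= 3 instead (r-- here), so a MAY_WRITE
 * request fails the DAC check and falls through to the capability tests
 * in generic_permission().
 */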

/*
 * We _really_ want to just do "generic_permission()" without
 * even looking at the inode->i_op values. So we keep a cache
 * flag in inode->i_opflags, that says "this has no special
 * permission function, use the fast case".
 */
static inline int do_inode_permission(struct inode *inode, int mask)
{
	if (unlikely(!(inode->i_opflags & IOP_FASTPERM))) {
		if (likely(inode->i_op->permission))
			return inode->i_op->permission(inode, mask);

		/* This gets set once for the inode lifetime */
		spin_lock(&inode->i_lock);
		inode->i_opflags |= IOP_FASTPERM;
		spin_unlock(&inode->i_lock);
	}
	return generic_permission(inode, mask);
}

/**
 * inode_permission - check for access rights to a given inode
 * @inode:	inode to check permission on
 * @mask:	right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC, ...)
 *
 * Used to check for read/write/execute permissions on an inode.
 * We use "fsuid" for this, letting us set arbitrary permissions
 * for filesystem access without changing the "normal" uids which
 * are used for other things.
 *
 * When checking for MAY_APPEND, MAY_WRITE must also be set in @mask.
 */
int inode_permission(struct inode *inode, int mask)
{
	int retval;

	if (unlikely(mask & MAY_WRITE)) {
		umode_t mode = inode->i_mode;

		/*
		 * Nobody gets write access to a read-only fs.
		 */
		if (IS_RDONLY(inode) &&
		    (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode)))
			return -EROFS;

		/*
		 * Nobody gets write access to an immutable file.
		 */
		if (IS_IMMUTABLE(inode))
			return -EACCES;
	}

	retval = do_inode_permission(inode, mask);
	if (retval)
		return retval;

	retval = devcgroup_inode_permission(inode, mask);
	if (retval)
		return retval;

	return security_inode_permission(inode, mask);
}

/**
 * path_get - get a reference to a path
 * @path: path to get the reference to
 *
 * Given a path increment the reference count to the dentry and the vfsmount.
 */
void path_get(struct path *path)
{
	mntget(path->mnt);
	dget(path->dentry);
}
EXPORT_SYMBOL(path_get);

/**
 * path_put - put a reference to a path
 * @path: path to put the reference to
 *
 * Given a path decrement the reference count to the dentry and the vfsmount.
 */
void path_put(struct path *path)
{
	dput(path->dentry);
	mntput(path->mnt);
}
EXPORT_SYMBOL(path_put);

/*
 * Path walking has 2 modes, rcu-walk and ref-walk (see
 * Documentation/filesystems/path-lookup.txt).  In situations when we can't
 * continue in RCU mode, we attempt to drop out of rcu-walk mode and grab
 * normal reference counts on dentries and vfsmounts to transition to
 * ref-walk mode.  Refcounts are grabbed at the last known good point before
 * rcu-walk got stuck, so ref-walk may continue from there.  If this is not
 * successful (eg. a seqcount has changed), then failure is returned and it's
 * up to caller to restart the path walk from the beginning in ref-walk mode.
 */

/**
 * unlazy_walk - try to switch to ref-walk mode.
 * @nd: nameidata pathwalk data
 * @dentry: child of nd->path.dentry or NULL
 * Returns: 0 on success, -ECHILD on failure
 *
 * unlazy_walk attempts to legitimize the current nd->path, nd->root and dentry
 * for ref-walk mode.  @dentry must be a path found by a do_lookup call on
 * @nd or NULL.  Must be called from rcu-walk context.
 */
static int unlazy_walk(struct nameidata *nd, struct dentry *dentry)
{
	struct fs_struct *fs = current->fs;
	struct dentry *parent = nd->path.dentry;
	int want_root = 0;

	BUG_ON(!(nd->flags & LOOKUP_RCU));
	if (nd->root.mnt && !(nd->flags & LOOKUP_ROOT)) {
		want_root = 1;
		spin_lock(&fs->lock);
		if (nd->root.mnt != fs->root.mnt ||
		    nd->root.dentry != fs->root.dentry)
			goto err_root;
	}
	spin_lock(&parent->d_lock);
	if (!dentry) {
		if (!__d_rcu_to_refcount(parent, nd->seq))
			goto err_parent;
		BUG_ON(nd->inode != parent->d_inode);
	} else {
		if (dentry->d_parent != parent)
			goto err_parent;
		spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED);
		if (!__d_rcu_to_refcount(dentry, nd->seq))
			goto err_child;
		/*
		 * If the sequence check on the child dentry passed, then
		 * the child has not been removed from its parent. This
		 * means the parent dentry must be valid and able to take
		 * a reference at this point.
		 */
		BUG_ON(!IS_ROOT(dentry) && dentry->d_parent != parent);
		BUG_ON(!parent->d_count);
		parent->d_count++;
		spin_unlock(&dentry->d_lock);
	}
	spin_unlock(&parent->d_lock);
	if (want_root) {
		path_get(&nd->root);
		spin_unlock(&fs->lock);
	}
	mntget(nd->path.mnt);

	rcu_read_unlock();
	br_read_unlock(vfsmount_lock);
	nd->flags &= ~LOOKUP_RCU;
	return 0;

err_child:
	spin_unlock(&dentry->d_lock);
err_parent:
	spin_unlock(&parent->d_lock);
err_root:
	if (want_root)
		spin_unlock(&fs->lock);
	return -ECHILD;
}

/**
 * release_open_intent - free up open intent resources
 * @nd: pointer to nameidata
 */
void release_open_intent(struct nameidata *nd)
{
	struct file *file = nd->intent.open.file;

	if (file && !IS_ERR(file)) {
		if (file->f_path.dentry == NULL)
			put_filp(file);
		else
			fput(file);
	}
}

static inline int d_revalidate(struct dentry *dentry, struct nameidata *nd)
{
	return dentry->d_op->d_revalidate(dentry, nd);
}
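
/*
 * A minimal usage sketch (illustrative, mirroring the callers further down
 * in this file): code that holds no references while in rcu-walk mode calls
 * unlazy_walk() before doing anything that may block, and treats failure as
 * "retry the whole walk in ref-walk mode":
 *
 *	if (nd->flags & LOOKUP_RCU) {
 *		if (unlazy_walk(nd, dentry))
 *			return -ECHILD;		// caller restarts in ref-walk
 *	}
 */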

/**
 * complete_walk - successful completion of path walk
 * @nd: pointer to nameidata
 *
 * If we had been in RCU mode, drop out of it and legitimize nd->path.
 * Revalidate the final result, unless we'd already done that during
 * the path walk or the filesystem doesn't ask for it.  Return 0 on
 * success, -error on failure.  In case of failure caller does not
 * need to drop nd->path.
 */
static int complete_walk(struct nameidata *nd)
{
	struct dentry *dentry = nd->path.dentry;
	int status;

	if (nd->flags & LOOKUP_RCU) {
		nd->flags &= ~LOOKUP_RCU;
		if (!(nd->flags & LOOKUP_ROOT))
			nd->root.mnt = NULL;
		spin_lock(&dentry->d_lock);
		if (unlikely(!__d_rcu_to_refcount(dentry, nd->seq))) {
			spin_unlock(&dentry->d_lock);
			rcu_read_unlock();
			br_read_unlock(vfsmount_lock);
			return -ECHILD;
		}
		BUG_ON(nd->inode != dentry->d_inode);
		spin_unlock(&dentry->d_lock);
		mntget(nd->path.mnt);
		rcu_read_unlock();
		br_read_unlock(vfsmount_lock);
	}

	if (likely(!(nd->flags & LOOKUP_JUMPED)))
		return 0;

	if (likely(!(dentry->d_flags & DCACHE_OP_REVALIDATE)))
		return 0;

	if (likely(!(dentry->d_sb->s_type->fs_flags & FS_REVAL_DOT)))
		return 0;

	/* Note: we do not d_invalidate() */
	status = d_revalidate(dentry, nd);
	if (status > 0)
		return 0;

	if (!status)
		status = -ESTALE;

	path_put(&nd->path);
	return status;
}

static __always_inline void set_root(struct nameidata *nd)
{
	if (!nd->root.mnt)
		get_fs_root(current->fs, &nd->root);
}

static int link_path_walk(const char *, struct nameidata *);

static __always_inline void set_root_rcu(struct nameidata *nd)
{
	if (!nd->root.mnt) {
		struct fs_struct *fs = current->fs;
		unsigned seq;

		do {
			seq = read_seqcount_begin(&fs->seq);
			nd->root = fs->root;
			nd->seq = __read_seqcount_begin(&nd->root.dentry->d_seq);
		} while (read_seqcount_retry(&fs->seq, seq));
	}
}

static __always_inline int __vfs_follow_link(struct nameidata *nd, const char *link)
{
	int ret;

	if (IS_ERR(link))
		goto fail;

	if (*link == '/') {
		set_root(nd);
		path_put(&nd->path);
		nd->path = nd->root;
		path_get(&nd->root);
		nd->flags |= LOOKUP_JUMPED;
	}
	nd->inode = nd->path.dentry->d_inode;

	ret = link_path_walk(link, nd);
	return ret;
fail:
	path_put(&nd->path);
	return PTR_ERR(link);
}

static void path_put_conditional(struct path *path, struct nameidata *nd)
{
	dput(path->dentry);
	if (path->mnt != nd->path.mnt)
		mntput(path->mnt);
}

static inline void path_to_nameidata(const struct path *path,
					struct nameidata *nd)
{
	if (!(nd->flags & LOOKUP_RCU)) {
		dput(nd->path.dentry);
		if (nd->path.mnt != path->mnt)
			mntput(nd->path.mnt);
	}
	nd->path.mnt = path->mnt;
	nd->path.dentry = path->dentry;
}

static inline void put_link(struct nameidata *nd, struct path *link, void *cookie)
{
	struct inode *inode = link->dentry->d_inode;
	if (!IS_ERR(cookie) && inode->i_op->put_link)
		inode->i_op->put_link(link->dentry, nd, cookie);
	path_put(link);
}
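
/*
 * A minimal sketch of the symlink-following protocol (illustrative; the real
 * callers are nested_symlink() and path_lookupat() below): follow_link()
 * hands back an opaque cookie from ->follow_link(), which must be passed to
 * put_link() once the caller is done with the body of the link:
 *
 *	void *cookie;
 *	struct path link = *path;
 *
 *	err = follow_link(&link, nd, &cookie);
 *	if (!err)
 *		err = walk_component(nd, path, &nd->last, nd->last_type, ...);
 *	put_link(nd, &link, cookie);
 */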
static __always_inline int
follow_link(struct path *link, struct nameidata *nd, void **p)
{
	int error;
	struct dentry *dentry = link->dentry;

	BUG_ON(nd->flags & LOOKUP_RCU);

	if (link->mnt == nd->path.mnt)
		mntget(link->mnt);

	if (unlikely(current->total_link_count >= 40)) {
		*p = ERR_PTR(-ELOOP); /* no ->put_link(), please */
		path_put(&nd->path);
		return -ELOOP;
	}
	cond_resched();
	current->total_link_count++;

	touch_atime(link);
	nd_set_link(nd, NULL);

	error = security_inode_follow_link(link->dentry, nd);
	if (error) {
		*p = ERR_PTR(error); /* no ->put_link(), please */
		path_put(&nd->path);
		return error;
	}

	nd->last_type = LAST_BIND;
	*p = dentry->d_inode->i_op->follow_link(dentry, nd);
	error = PTR_ERR(*p);
	if (!IS_ERR(*p)) {
		char *s = nd_get_link(nd);
		error = 0;
		if (s)
			error = __vfs_follow_link(nd, s);
		else if (nd->last_type == LAST_BIND) {
			nd->flags |= LOOKUP_JUMPED;
			nd->inode = nd->path.dentry->d_inode;
			if (nd->inode->i_op->follow_link) {
				/* stepped on a _really_ weird one */
				path_put(&nd->path);
				error = -ELOOP;
			}
		}
	}
	return error;
}

static int follow_up_rcu(struct path *path)
{
	struct mount *mnt = real_mount(path->mnt);
	struct mount *parent;
	struct dentry *mountpoint;

	parent = mnt->mnt_parent;
	if (&parent->mnt == path->mnt)
		return 0;
	mountpoint = mnt->mnt_mountpoint;
	path->dentry = mountpoint;
	path->mnt = &parent->mnt;
	return 1;
}

int follow_up(struct path *path)
{
	struct mount *mnt = real_mount(path->mnt);
	struct mount *parent;
	struct dentry *mountpoint;

	br_read_lock(vfsmount_lock);
	parent = mnt->mnt_parent;
	if (&parent->mnt == path->mnt) {
		br_read_unlock(vfsmount_lock);
		return 0;
	}
	mntget(&parent->mnt);
	mountpoint = dget(mnt->mnt_mountpoint);
	br_read_unlock(vfsmount_lock);
	dput(path->dentry);
	path->dentry = mountpoint;
	mntput(path->mnt);
	path->mnt = &parent->mnt;
	return 1;
}

/*
 * Perform an automount
 * - return -EISDIR to tell follow_managed() to stop and return the path we
 *   were called with.
 */
static int follow_automount(struct path *path, unsigned flags,
			    bool *need_mntput)
{
	struct vfsmount *mnt;
	int err;

	if (!path->dentry->d_op || !path->dentry->d_op->d_automount)
		return -EREMOTE;

	/* We don't want to mount if someone's just doing a stat -
	 * unless they're stat'ing a directory and appended a '/' to
	 * the name.
	 *
	 * We do, however, want to mount if someone wants to open or
	 * create a file of any type under the mountpoint, wants to
	 * traverse through the mountpoint or wants to open the
	 * mounted directory.  Also, autofs may mark negative dentries
	 * as being automount points.  These will need the attentions
	 * of the daemon to instantiate them before they can be used.
	 */
	if (!(flags & (LOOKUP_PARENT | LOOKUP_DIRECTORY |
		       LOOKUP_OPEN | LOOKUP_CREATE | LOOKUP_AUTOMOUNT)) &&
	    path->dentry->d_inode)
		return -EISDIR;

	current->total_link_count++;
	if (current->total_link_count >= 40)
		return -ELOOP;

	mnt = path->dentry->d_op->d_automount(path);
	if (IS_ERR(mnt)) {
		/*
		 * The filesystem is allowed to return -EISDIR here to indicate
		 * it doesn't want to automount.  For instance, autofs would do
		 * this so that its userspace daemon can mount on this dentry.
		 *
		 * However, we can only permit this if it's a terminal point in
		 * the path being looked up; if it wasn't then the remainder of
		 * the path is inaccessible and we should say so.
		 */
		if (PTR_ERR(mnt) == -EISDIR && (flags & LOOKUP_PARENT))
			return -EREMOTE;
		return PTR_ERR(mnt);
	}

	if (!mnt) /* mount collision */
		return 0;

	if (!*need_mntput) {
		/* lock_mount() may release path->mnt on error */
		mntget(path->mnt);
		*need_mntput = true;
	}
	err = finish_automount(mnt, path);

	switch (err) {
	case -EBUSY:
		/* Someone else made a mount here whilst we were busy */
		return 0;
	case 0:
		path_put(path);
		path->mnt = mnt;
		path->dentry = dget(mnt->mnt_root);
		return 0;
	default:
		return err;
	}

}

/*
 * Handle a dentry that is managed in some way.
 * - Flagged for transit management (autofs)
 * - Flagged as mountpoint
 * - Flagged as automount point
 *
 * This may only be called in refwalk mode.
 *
 * Serialization is taken care of in namespace.c
 */
static int follow_managed(struct path *path, unsigned flags)
{
	struct vfsmount *mnt = path->mnt; /* held by caller, must be left alone */
	unsigned managed;
	bool need_mntput = false;
	int ret = 0;

	/* Given that we're not holding a lock here, we retain the value in a
	 * local variable for each dentry as we look at it so that we don't see
	 * the components of that value change under us */
	while (managed = ACCESS_ONCE(path->dentry->d_flags),
	       managed &= DCACHE_MANAGED_DENTRY,
	       unlikely(managed != 0)) {
		/* Allow the filesystem to manage the transit without i_mutex
		 * being held. */
		if (managed & DCACHE_MANAGE_TRANSIT) {
			BUG_ON(!path->dentry->d_op);
			BUG_ON(!path->dentry->d_op->d_manage);
			ret = path->dentry->d_op->d_manage(path->dentry, false);
			if (ret < 0)
				break;
		}

		/* Transit to a mounted filesystem. */
		if (managed & DCACHE_MOUNTED) {
			struct vfsmount *mounted = lookup_mnt(path);
			if (mounted) {
				dput(path->dentry);
				if (need_mntput)
					mntput(path->mnt);
				path->mnt = mounted;
				path->dentry = dget(mounted->mnt_root);
				need_mntput = true;
				continue;
			}

			/* Something is mounted on this dentry in another
			 * namespace and/or whatever was mounted there in this
			 * namespace got unmounted before we managed to get the
			 * vfsmount_lock */
		}

		/* Handle an automount point */
		if (managed & DCACHE_NEED_AUTOMOUNT) {
			ret = follow_automount(path, flags, &need_mntput);
			if (ret < 0)
				break;
			continue;
		}

		/* We didn't change the current path point */
		break;
	}

	if (need_mntput && path->mnt == mnt)
		mntput(path->mnt);
	if (ret == -EISDIR)
		ret = 0;
	return ret < 0 ? ret : need_mntput;
}

int follow_down_one(struct path *path)
{
	struct vfsmount *mounted;

	mounted = lookup_mnt(path);
	if (mounted) {
		dput(path->dentry);
		mntput(path->mnt);
		path->mnt = mounted;
		path->dentry = dget(mounted->mnt_root);
		return 1;
	}
	return 0;
}

static inline bool managed_dentry_might_block(struct dentry *dentry)
{
	return (dentry->d_flags & DCACHE_MANAGE_TRANSIT &&
		dentry->d_op->d_manage(dentry, true) < 0);
}
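
/*
 * A small usage sketch for follow_managed() (illustrative; do_lookup() and
 * do_last() below follow this pattern): a negative return is an error, zero
 * means the path was left where it was, and a positive return means we
 * transited onto something mounted here, so the caller flags the walk as
 * having jumped:
 *
 *	err = follow_managed(path, nd->flags);
 *	if (err < 0) {
 *		path_put_conditional(path, nd);
 *		return err;
 *	}
 *	if (err)
 *		nd->flags |= LOOKUP_JUMPED;
 */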

/*
 * Try to skip to top of mountpoint pile in rcuwalk mode.  Fail if
 * we meet a managed dentry that would need blocking.
 */
static bool __follow_mount_rcu(struct nameidata *nd, struct path *path,
			       struct inode **inode)
{
	for (;;) {
		struct mount *mounted;
		/*
		 * Don't forget we might have a non-mountpoint managed dentry
		 * that wants to block transit.
		 */
		if (unlikely(managed_dentry_might_block(path->dentry)))
			return false;

		if (!d_mountpoint(path->dentry))
			break;

		mounted = __lookup_mnt(path->mnt, path->dentry, 1);
		if (!mounted)
			break;
		path->mnt = &mounted->mnt;
		path->dentry = mounted->mnt.mnt_root;
		nd->flags |= LOOKUP_JUMPED;
		nd->seq = read_seqcount_begin(&path->dentry->d_seq);
		/*
		 * Update the inode too. We don't need to re-check the
		 * dentry sequence number here after this d_inode read,
		 * because a mount-point is always pinned.
		 */
		*inode = path->dentry->d_inode;
	}
	return true;
}

static void follow_mount_rcu(struct nameidata *nd)
{
	while (d_mountpoint(nd->path.dentry)) {
		struct mount *mounted;
		mounted = __lookup_mnt(nd->path.mnt, nd->path.dentry, 1);
		if (!mounted)
			break;
		nd->path.mnt = &mounted->mnt;
		nd->path.dentry = mounted->mnt.mnt_root;
		nd->seq = read_seqcount_begin(&nd->path.dentry->d_seq);
	}
}

static int follow_dotdot_rcu(struct nameidata *nd)
{
	set_root_rcu(nd);

	while (1) {
		if (nd->path.dentry == nd->root.dentry &&
		    nd->path.mnt == nd->root.mnt) {
			break;
		}
		if (nd->path.dentry != nd->path.mnt->mnt_root) {
			struct dentry *old = nd->path.dentry;
			struct dentry *parent = old->d_parent;
			unsigned seq;

			seq = read_seqcount_begin(&parent->d_seq);
			if (read_seqcount_retry(&old->d_seq, nd->seq))
				goto failed;
			nd->path.dentry = parent;
			nd->seq = seq;
			break;
		}
		if (!follow_up_rcu(&nd->path))
			break;
		nd->seq = read_seqcount_begin(&nd->path.dentry->d_seq);
	}
	follow_mount_rcu(nd);
	nd->inode = nd->path.dentry->d_inode;
	return 0;

failed:
	nd->flags &= ~LOOKUP_RCU;
	if (!(nd->flags & LOOKUP_ROOT))
		nd->root.mnt = NULL;
	rcu_read_unlock();
	br_read_unlock(vfsmount_lock);
	return -ECHILD;
}

/*
 * Follow down to the covering mount currently visible to userspace.  At each
 * point, the filesystem owning that dentry may be queried as to whether the
 * caller is permitted to proceed or not.
 */
int follow_down(struct path *path)
{
	unsigned managed;
	int ret;

	while (managed = ACCESS_ONCE(path->dentry->d_flags),
	       unlikely(managed & DCACHE_MANAGED_DENTRY)) {
		/* Allow the filesystem to manage the transit without i_mutex
		 * being held.
		 *
		 * We indicate to the filesystem if someone is trying to mount
		 * something here.  This gives autofs the chance to deny anyone
		 * other than its daemon the right to mount on its
		 * superstructure.
		 *
		 * The filesystem may sleep at this point.
		 */
		if (managed & DCACHE_MANAGE_TRANSIT) {
			BUG_ON(!path->dentry->d_op);
			BUG_ON(!path->dentry->d_op->d_manage);
			ret = path->dentry->d_op->d_manage(
				path->dentry, false);
			if (ret < 0)
				return ret == -EISDIR ? 0 : ret;
		}

		/* Transit to a mounted filesystem. */
		if (managed & DCACHE_MOUNTED) {
			struct vfsmount *mounted = lookup_mnt(path);
			if (!mounted)
				break;
			dput(path->dentry);
			mntput(path->mnt);
			path->mnt = mounted;
			path->dentry = dget(mounted->mnt_root);
			continue;
		}

		/* Don't handle automount points here */
		break;
	}
	return 0;
}

/*
 * Skip to top of mountpoint pile in refwalk mode for follow_dotdot()
 */
static void follow_mount(struct path *path)
{
	while (d_mountpoint(path->dentry)) {
		struct vfsmount *mounted = lookup_mnt(path);
		if (!mounted)
			break;
		dput(path->dentry);
		mntput(path->mnt);
		path->mnt = mounted;
		path->dentry = dget(mounted->mnt_root);
	}
}

static void follow_dotdot(struct nameidata *nd)
{
	set_root(nd);

	while(1) {
		struct dentry *old = nd->path.dentry;

		if (nd->path.dentry == nd->root.dentry &&
		    nd->path.mnt == nd->root.mnt) {
			break;
		}
		if (nd->path.dentry != nd->path.mnt->mnt_root) {
			/* rare case of legitimate dget_parent()... */
			nd->path.dentry = dget_parent(nd->path.dentry);
			dput(old);
			break;
		}
		if (!follow_up(&nd->path))
			break;
	}
	follow_mount(&nd->path);
	nd->inode = nd->path.dentry->d_inode;
}

/*
 * This looks up the name in dcache, possibly revalidates the old dentry and
 * allocates a new one if not found or not valid.  The need_lookup argument
 * is set to indicate whether i_op->lookup is necessary.
 *
 * dir->d_inode->i_mutex must be held
 */
static struct dentry *lookup_dcache(struct qstr *name, struct dentry *dir,
				    struct nameidata *nd, bool *need_lookup)
{
	struct dentry *dentry;
	int error;

	*need_lookup = false;
	dentry = d_lookup(dir, name);
	if (dentry) {
		if (d_need_lookup(dentry)) {
			*need_lookup = true;
		} else if (dentry->d_flags & DCACHE_OP_REVALIDATE) {
			error = d_revalidate(dentry, nd);
			if (unlikely(error <= 0)) {
				if (error < 0) {
					dput(dentry);
					return ERR_PTR(error);
				} else if (!d_invalidate(dentry)) {
					dput(dentry);
					dentry = NULL;
				}
			}
		}
	}

	if (!dentry) {
		dentry = d_alloc(dir, name);
		if (unlikely(!dentry))
			return ERR_PTR(-ENOMEM);

		*need_lookup = true;
	}
	return dentry;
}

/*
 * Call i_op->lookup on the dentry.  The dentry must be negative but may be
 * hashed if it was populated with DCACHE_NEED_LOOKUP.
 *
 * dir->d_inode->i_mutex must be held
 */
static struct dentry *lookup_real(struct inode *dir, struct dentry *dentry,
				  struct nameidata *nd)
{
	struct dentry *old;

	/* Don't create child dentry for a dead directory. */
	if (unlikely(IS_DEADDIR(dir))) {
		dput(dentry);
		return ERR_PTR(-ENOENT);
	}

	old = dir->i_op->lookup(dir, dentry, nd);
	if (unlikely(old)) {
		dput(dentry);
		dentry = old;
	}
	return dentry;
}

static struct dentry *__lookup_hash(struct qstr *name,
		struct dentry *base, struct nameidata *nd)
{
	bool need_lookup;
	struct dentry *dentry;

	dentry = lookup_dcache(name, base, nd, &need_lookup);
	if (!need_lookup)
		return dentry;

	return lookup_real(base->d_inode, dentry, nd);
}
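
/*
 * For reference (illustrative summary of the convention already used above):
 * ->d_revalidate() returns a positive value if the dentry is still good,
 * 0 if it should be invalidated and looked up afresh, and a negative errno
 * (commonly -ECHILD when called in rcu-walk mode) to propagate an error.
 * lookup_dcache(), do_lookup() and complete_walk() all rely on this.
 */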

/*
 * It's more convoluted than I'd like it to be, but... it's still fairly
 * small and for now I'd prefer to have fast path as straight as possible.
 * It _is_ time-critical.
 */
static int do_lookup(struct nameidata *nd, struct qstr *name,
			struct path *path, struct inode **inode)
{
	struct vfsmount *mnt = nd->path.mnt;
	struct dentry *dentry, *parent = nd->path.dentry;
	int need_reval = 1;
	int status = 1;
	int err;

	/*
	 * Rename seqlock is not required here because in the off chance
	 * of a false negative due to a concurrent rename, we're going to
	 * do the non-racy lookup, below.
	 */
	if (nd->flags & LOOKUP_RCU) {
		unsigned seq;
		*inode = nd->inode;
		dentry = __d_lookup_rcu(parent, name, &seq, inode);
		if (!dentry)
			goto unlazy;

		/* Memory barrier in read_seqcount_begin of child is enough */
		if (__read_seqcount_retry(&parent->d_seq, nd->seq))
			return -ECHILD;
		nd->seq = seq;

		if (unlikely(d_need_lookup(dentry)))
			goto unlazy;
		if (unlikely(dentry->d_flags & DCACHE_OP_REVALIDATE)) {
			status = d_revalidate(dentry, nd);
			if (unlikely(status <= 0)) {
				if (status != -ECHILD)
					need_reval = 0;
				goto unlazy;
			}
		}
		path->mnt = mnt;
		path->dentry = dentry;
		if (unlikely(!__follow_mount_rcu(nd, path, inode)))
			goto unlazy;
		if (unlikely(path->dentry->d_flags & DCACHE_NEED_AUTOMOUNT))
			goto unlazy;
		return 0;
unlazy:
		if (unlazy_walk(nd, dentry))
			return -ECHILD;
	} else {
		dentry = __d_lookup(parent, name);
	}

	if (unlikely(!dentry))
		goto need_lookup;

	if (unlikely(d_need_lookup(dentry))) {
		dput(dentry);
		goto need_lookup;
	}

	if (unlikely(dentry->d_flags & DCACHE_OP_REVALIDATE) && need_reval)
		status = d_revalidate(dentry, nd);
	if (unlikely(status <= 0)) {
		if (status < 0) {
			dput(dentry);
			return status;
		}
		if (!d_invalidate(dentry)) {
			dput(dentry);
			goto need_lookup;
		}
	}
done:
	path->mnt = mnt;
	path->dentry = dentry;
	err = follow_managed(path, nd->flags);
	if (unlikely(err < 0)) {
		path_put_conditional(path, nd);
		return err;
	}
	if (err)
		nd->flags |= LOOKUP_JUMPED;
	*inode = path->dentry->d_inode;
	return 0;

need_lookup:
	BUG_ON(nd->inode != parent->d_inode);

	mutex_lock(&parent->d_inode->i_mutex);
	dentry = __lookup_hash(name, parent, nd);
	mutex_unlock(&parent->d_inode->i_mutex);
	if (IS_ERR(dentry))
		return PTR_ERR(dentry);
	goto done;
}

static inline int may_lookup(struct nameidata *nd)
{
	if (nd->flags & LOOKUP_RCU) {
		int err = inode_permission(nd->inode, MAY_EXEC|MAY_NOT_BLOCK);
		if (err != -ECHILD)
			return err;
		if (unlazy_walk(nd, NULL))
			return -ECHILD;
	}
	return inode_permission(nd->inode, MAY_EXEC);
}

static inline int handle_dots(struct nameidata *nd, int type)
{
	if (type == LAST_DOTDOT) {
		if (nd->flags & LOOKUP_RCU) {
			if (follow_dotdot_rcu(nd))
				return -ECHILD;
		} else
			follow_dotdot(nd);
	}
	return 0;
}

static void terminate_walk(struct nameidata *nd)
{
	if (!(nd->flags & LOOKUP_RCU)) {
		path_put(&nd->path);
	} else {
		nd->flags &= ~LOOKUP_RCU;
		if (!(nd->flags & LOOKUP_ROOT))
			nd->root.mnt = NULL;
		rcu_read_unlock();
		br_read_unlock(vfsmount_lock);
	}
}

/*
 * Do we need to follow links? We _really_ want to be able
 * to do this check without having to look at inode->i_op,
 * so we keep a cache of "no, this doesn't need follow_link"
 * for the common case.
 */
static inline int should_follow_link(struct inode *inode, int follow)
{
	if (unlikely(!(inode->i_opflags & IOP_NOFOLLOW))) {
		if (likely(inode->i_op->follow_link))
			return follow;

		/* This gets set once for the inode lifetime */
		spin_lock(&inode->i_lock);
		inode->i_opflags |= IOP_NOFOLLOW;
		spin_unlock(&inode->i_lock);
	}
	return 0;
}

static inline int walk_component(struct nameidata *nd, struct path *path,
		struct qstr *name, int type, int follow)
{
	struct inode *inode;
	int err;
	/*
	 * "." and ".." are special - ".." especially so because it has
	 * to be able to know about the current root directory and
	 * parent relationships.
	 */
	if (unlikely(type != LAST_NORM))
		return handle_dots(nd, type);
	err = do_lookup(nd, name, path, &inode);
	if (unlikely(err)) {
		terminate_walk(nd);
		return err;
	}
	if (!inode) {
		path_to_nameidata(path, nd);
		terminate_walk(nd);
		return -ENOENT;
	}
	if (should_follow_link(inode, follow)) {
		if (nd->flags & LOOKUP_RCU) {
			if (unlikely(unlazy_walk(nd, path->dentry))) {
				terminate_walk(nd);
				return -ECHILD;
			}
		}
		BUG_ON(inode != path->dentry->d_inode);
		return 1;
	}
	path_to_nameidata(path, nd);
	nd->inode = inode;
	return 0;
}

/*
 * This limits recursive symlink follows to 8, while
 * limiting consecutive symlinks to 40.
 *
 * Without that kind of total limit, nasty chains of consecutive
 * symlinks can cause almost arbitrarily long lookups.
 */
static inline int nested_symlink(struct path *path, struct nameidata *nd)
{
	int res;

	if (unlikely(current->link_count >= MAX_NESTED_LINKS)) {
		path_put_conditional(path, nd);
		path_put(&nd->path);
		return -ELOOP;
	}
	BUG_ON(nd->depth >= MAX_NESTED_LINKS);

	nd->depth++;
	current->link_count++;

	do {
		struct path link = *path;
		void *cookie;

		res = follow_link(&link, nd, &cookie);
		if (!res)
			res = walk_component(nd, path, &nd->last,
					     nd->last_type, LOOKUP_FOLLOW);
		put_link(nd, &link, cookie);
	} while (res > 0);

	current->link_count--;
	nd->depth--;
	return res;
}

/*
 * We really don't want to look at inode->i_op->lookup
 * when we don't have to. So we keep a cache bit in
 * the inode ->i_opflags field that says "yes, we can
 * do lookup on this inode".
 */
static inline int can_lookup(struct inode *inode)
{
	if (likely(inode->i_opflags & IOP_LOOKUP))
		return 1;
	if (likely(!inode->i_op->lookup))
		return 0;

	/* We do this once for the lifetime of the inode */
	spin_lock(&inode->i_lock);
	inode->i_opflags |= IOP_LOOKUP;
	spin_unlock(&inode->i_lock);
	return 1;
}

/*
 * We can do the critical dentry name comparison and hashing
 * operations one word at a time, but we are limited to:
 *
 * - Architectures with fast unaligned word accesses. We could
 *   do a "get_unaligned()" if this helps and is sufficiently
 *   fast.
 *
 * - Little-endian machines (so that we can generate the mask
 *   of low bytes efficiently). Again, we *could* do a byte
 *   swapping load on big-endian architectures if that is not
 *   expensive enough to make the optimization worthless.
 *
 * - non-CONFIG_DEBUG_PAGEALLOC configurations (so that we
 *   do not trap on the (extremely unlikely) case of a page
 *   crossing operation).
 *
 * - Furthermore, we need an efficient 64-bit compile for the
 *   64-bit case in order to generate the "number of bytes in
 *   the final mask". Again, that could be replaced with an
 *   efficient population count instruction or similar.
 */
#ifdef CONFIG_DCACHE_WORD_ACCESS

#ifdef CONFIG_64BIT

/*
 * Jan Achrenius on G+: microoptimized version of
 * the simpler "(mask & ONEBYTES) * ONEBYTES >> 56"
 * that works for the bytemasks without having to
 * mask them first.
 */
static inline long count_masked_bytes(unsigned long mask)
{
	return mask*0x0001020304050608ul >> 56;
}

static inline unsigned int fold_hash(unsigned long hash)
{
	hash += hash >> (8*sizeof(int));
	return hash;
}

#else	/* 32-bit case */

/* Carl Chatfield / Jan Achrenius G+ version for 32-bit */
static inline long count_masked_bytes(long mask)
{
	/* (000000 0000ff 00ffff ffffff) -> ( 1 1 2 3 ) */
	long a = (0x0ff0001+mask) >> 23;
	/* Fix the 1 for 00 case */
	return a & mask;
}

#define fold_hash(x) (x)

#endif

unsigned int full_name_hash(const unsigned char *name, unsigned int len)
{
	unsigned long a, mask;
	unsigned long hash = 0;

	for (;;) {
		a = *(unsigned long *)name;
		if (len < sizeof(unsigned long))
			break;
		hash += a;
		hash *= 9;
		name += sizeof(unsigned long);
		len -= sizeof(unsigned long);
		if (!len)
			goto done;
	}
	mask = ~(~0ul << len*8);
	hash += mask & a;
done:
	return fold_hash(hash);
}
EXPORT_SYMBOL(full_name_hash);

#define REPEAT_BYTE(x)	((~0ul / 0xff) * (x))
#define ONEBYTES	REPEAT_BYTE(0x01)
#define SLASHBYTES	REPEAT_BYTE('/')
#define HIGHBITS	REPEAT_BYTE(0x80)

/* Return the high bit set in the first byte that is a zero */
static inline unsigned long has_zero(unsigned long a)
{
	return ((a - ONEBYTES) & ~a) & HIGHBITS;
}

/*
 * Calculate the length and hash of the path component, and
 * return the length of the component.
 */
static inline unsigned long hash_name(const char *name, unsigned int *hashp)
{
	unsigned long a, mask, hash, len;

	hash = a = 0;
	len = -sizeof(unsigned long);
	do {
		hash = (hash + a) * 9;
		len += sizeof(unsigned long);
		a = *(unsigned long *)(name+len);
		/* Do we have any NUL or '/' bytes in this word? */
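		/*
		 * Worked example (illustrative, assuming a 32-bit
		 * little-endian word and name == "foo/..."): a is
		 * 0x2f6f6f66, so a ^ SLASHBYTES is 0x00404049 and
		 * has_zero() of that is 0x80000000 - the high bit of
		 * the byte holding the '/'.  The (mask - 1) & ~mask
		 * step below turns that into 0x00ffffff, which keeps
		 * only the "foo" bytes for the hash, and
		 * count_masked_bytes() maps it to a length of 3.
		 */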
		mask = has_zero(a) | has_zero(a ^ SLASHBYTES);
	} while (!mask);

	/* The mask *below* the first high bit set */
	mask = (mask - 1) & ~mask;
	mask >>= 7;
	hash += a & mask;
	*hashp = fold_hash(hash);

	return len + count_masked_bytes(mask);
}

#else

unsigned int full_name_hash(const unsigned char *name, unsigned int len)
{
	unsigned long hash = init_name_hash();
	while (len--)
		hash = partial_name_hash(*name++, hash);
	return end_name_hash(hash);
}
EXPORT_SYMBOL(full_name_hash);

/*
 * We know there's a real path component here of at least
 * one character.
 */
static inline unsigned long hash_name(const char *name, unsigned int *hashp)
{
	unsigned long hash = init_name_hash();
	unsigned long len = 0, c;

	c = (unsigned char)*name;
	do {
		len++;
		hash = partial_name_hash(c, hash);
		c = (unsigned char)name[len];
	} while (c && c != '/');
	*hashp = end_name_hash(hash);
	return len;
}

#endif

/*
 * Name resolution.
 * This is the basic name resolution function, turning a pathname into
 * the final dentry. We expect 'base' to be positive and a directory.
 *
 * Returns 0 and nd will have valid dentry and mnt on success.
 * Returns error and drops reference to input namei data on failure.
 */
static int link_path_walk(const char *name, struct nameidata *nd)
{
	struct path next;
	int err;

	while (*name=='/')
		name++;
	if (!*name)
		return 0;

	/* At this point we know we have a real path component. */
	for(;;) {
		struct qstr this;
		long len;
		int type;

		err = may_lookup(nd);
		if (err)
			break;

		len = hash_name(name, &this.hash);
		this.name = name;
		this.len = len;

		type = LAST_NORM;
		if (name[0] == '.') switch (len) {
			case 2:
				if (name[1] == '.') {
					type = LAST_DOTDOT;
					nd->flags |= LOOKUP_JUMPED;
				}
				break;
			case 1:
				type = LAST_DOT;
		}
		if (likely(type == LAST_NORM)) {
			struct dentry *parent = nd->path.dentry;
			nd->flags &= ~LOOKUP_JUMPED;
			if (unlikely(parent->d_flags & DCACHE_OP_HASH)) {
				err = parent->d_op->d_hash(parent, nd->inode,
							   &this);
				if (err < 0)
					break;
			}
		}

		if (!name[len])
			goto last_component;
		/*
		 * If it wasn't NUL, we know it was '/'. Skip that
		 * slash, and continue until no more slashes.
		 */
		do {
			len++;
		} while (unlikely(name[len] == '/'));
		if (!name[len])
			goto last_component;
		name += len;

		err = walk_component(nd, &next, &this, type, LOOKUP_FOLLOW);
		if (err < 0)
			return err;

		if (err) {
			err = nested_symlink(&next, nd);
			if (err)
				return err;
		}
		if (can_lookup(nd->inode))
			continue;
		err = -ENOTDIR;
		break;
		/* here ends the main loop */

last_component:
		nd->last = this;
		nd->last_type = type;
		return 0;
	}
	terminate_walk(nd);
	return err;
}

static int path_init(int dfd, const char *name, unsigned int flags,
		     struct nameidata *nd, struct file **fp)
{
	int retval = 0;
	int fput_needed;
	struct file *file;

	nd->last_type = LAST_ROOT; /* if there are only slashes... */
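	/*
	 * Illustrative note: for a pathname that is nothing but slashes
	 * (e.g. "/" or "///"), link_path_walk() above strips them and
	 * returns 0 without ever setting nd->last, so nd->last_type keeps
	 * this LAST_ROOT value and the callers operate on the starting
	 * directory itself.
	 */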
	nd->flags = flags | LOOKUP_JUMPED;
	nd->depth = 0;
	if (flags & LOOKUP_ROOT) {
		struct inode *inode = nd->root.dentry->d_inode;
		if (*name) {
			if (!inode->i_op->lookup)
				return -ENOTDIR;
			retval = inode_permission(inode, MAY_EXEC);
			if (retval)
				return retval;
		}
		nd->path = nd->root;
		nd->inode = inode;
		if (flags & LOOKUP_RCU) {
			br_read_lock(vfsmount_lock);
			rcu_read_lock();
			nd->seq = __read_seqcount_begin(&nd->path.dentry->d_seq);
		} else {
			path_get(&nd->path);
		}
		return 0;
	}

	nd->root.mnt = NULL;

	if (*name=='/') {
		if (flags & LOOKUP_RCU) {
			br_read_lock(vfsmount_lock);
			rcu_read_lock();
			set_root_rcu(nd);
		} else {
			set_root(nd);
			path_get(&nd->root);
		}
		nd->path = nd->root;
	} else if (dfd == AT_FDCWD) {
		if (flags & LOOKUP_RCU) {
			struct fs_struct *fs = current->fs;
			unsigned seq;

			br_read_lock(vfsmount_lock);
			rcu_read_lock();

			do {
				seq = read_seqcount_begin(&fs->seq);
				nd->path = fs->pwd;
				nd->seq = __read_seqcount_begin(&nd->path.dentry->d_seq);
			} while (read_seqcount_retry(&fs->seq, seq));
		} else {
			get_fs_pwd(current->fs, &nd->path);
		}
	} else {
		struct dentry *dentry;

		file = fget_raw_light(dfd, &fput_needed);
		retval = -EBADF;
		if (!file)
			goto out_fail;

		dentry = file->f_path.dentry;

		if (*name) {
			retval = -ENOTDIR;
			if (!S_ISDIR(dentry->d_inode->i_mode))
				goto fput_fail;

			retval = inode_permission(dentry->d_inode, MAY_EXEC);
			if (retval)
				goto fput_fail;
		}

		nd->path = file->f_path;
		if (flags & LOOKUP_RCU) {
			if (fput_needed)
				*fp = file;
			nd->seq = __read_seqcount_begin(&nd->path.dentry->d_seq);
			br_read_lock(vfsmount_lock);
			rcu_read_lock();
		} else {
			path_get(&file->f_path);
			fput_light(file, fput_needed);
		}
	}

	nd->inode = nd->path.dentry->d_inode;
	return 0;

fput_fail:
	fput_light(file, fput_needed);
out_fail:
	return retval;
}

static inline int lookup_last(struct nameidata *nd, struct path *path)
{
	if (nd->last_type == LAST_NORM && nd->last.name[nd->last.len])
		nd->flags |= LOOKUP_FOLLOW | LOOKUP_DIRECTORY;

	nd->flags &= ~LOOKUP_PARENT;
	return walk_component(nd, path, &nd->last, nd->last_type,
					nd->flags & LOOKUP_FOLLOW);
}

/* Returns 0 and nd will be valid on success; Returns error, otherwise. */
static int path_lookupat(int dfd, const char *name,
				unsigned int flags, struct nameidata *nd)
{
	struct file *base = NULL;
	struct path path;
	int err;

	/*
	 * Path walking is largely split up into 2 different synchronisation
	 * schemes, rcu-walk and ref-walk (explained in
	 * Documentation/filesystems/path-lookup.txt).  These share much of the
	 * path walk code, but some things particularly setup, cleanup, and
	 * following mounts are sufficiently divergent that functions are
	 * duplicated.  Typically there is a function foo(), and its RCU
	 * analogue, foo_rcu().
	 *
	 * -ECHILD is the error number of choice (just to avoid clashes) that
	 * is returned if some aspect of an rcu-walk fails.  Such an error must
	 * be handled by restarting a traditional ref-walk (which will always
	 * be able to complete).
	 */
	err = path_init(dfd, name, flags | LOOKUP_PARENT, nd, &base);

	if (unlikely(err))
		return err;

	current->total_link_count = 0;
	err = link_path_walk(name, nd);

	if (!err && !(flags & LOOKUP_PARENT)) {
		err = lookup_last(nd, &path);
		while (err > 0) {
			void *cookie;
			struct path link = path;
			nd->flags |= LOOKUP_PARENT;
			err = follow_link(&link, nd, &cookie);
			if (!err)
				err = lookup_last(nd, &path);
			put_link(nd, &link, cookie);
		}
	}

	if (!err)
		err = complete_walk(nd);

	if (!err && nd->flags & LOOKUP_DIRECTORY) {
		if (!nd->inode->i_op->lookup) {
			path_put(&nd->path);
			err = -ENOTDIR;
		}
	}

	if (base)
		fput(base);

	if (nd->root.mnt && !(nd->flags & LOOKUP_ROOT)) {
		path_put(&nd->root);
		nd->root.mnt = NULL;
	}
	return err;
}

static int do_path_lookup(int dfd, const char *name,
				unsigned int flags, struct nameidata *nd)
{
	int retval = path_lookupat(dfd, name, flags | LOOKUP_RCU, nd);
	if (unlikely(retval == -ECHILD))
		retval = path_lookupat(dfd, name, flags, nd);
	if (unlikely(retval == -ESTALE))
		retval = path_lookupat(dfd, name, flags | LOOKUP_REVAL, nd);

	if (likely(!retval)) {
		if (unlikely(!audit_dummy_context())) {
			if (nd->path.dentry && nd->inode)
				audit_inode(name, nd->path.dentry);
		}
	}
	return retval;
}

int kern_path_parent(const char *name, struct nameidata *nd)
{
	return do_path_lookup(AT_FDCWD, name, LOOKUP_PARENT, nd);
}

int kern_path(const char *name, unsigned int flags, struct path *path)
{
	struct nameidata nd;
	int res = do_path_lookup(AT_FDCWD, name, flags, &nd);
	if (!res)
		*path = nd.path;
	return res;
}

/**
 * vfs_path_lookup - lookup a file path relative to a dentry-vfsmount pair
 * @dentry:  pointer to dentry of the base directory
 * @mnt: pointer to vfs mount of the base directory
 * @name: pointer to file name
 * @flags: lookup flags
 * @path: pointer to struct path to fill
 */
int vfs_path_lookup(struct dentry *dentry, struct vfsmount *mnt,
		    const char *name, unsigned int flags,
		    struct path *path)
{
	struct nameidata nd;
	int err;
	nd.root.dentry = dentry;
	nd.root.mnt = mnt;
	BUG_ON(flags & LOOKUP_PARENT);
	/* the first argument of do_path_lookup() is ignored with LOOKUP_ROOT */
	err = do_path_lookup(AT_FDCWD, name, flags | LOOKUP_ROOT, &nd);
	if (!err)
		*path = nd.path;
	return err;
}

/*
 * Restricted form of lookup. Doesn't follow links, single-component only,
 * needs parent already locked. Doesn't follow mounts.
 * SMP-safe.
 */
static struct dentry *lookup_hash(struct nameidata *nd)
{
	return __lookup_hash(&nd->last, nd->path.dentry, nd);
}
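
/*
 * A minimal in-kernel usage sketch for kern_path() above (illustrative):
 * on success it hands back a counted reference in *path, which the caller
 * must drop with path_put() when done:
 *
 *	struct path p;
 *	int err = kern_path("/etc", LOOKUP_FOLLOW | LOOKUP_DIRECTORY, &p);
 *	if (!err) {
 *		// ... use p.mnt / p.dentry ...
 *		path_put(&p);
 *	}
 */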

/**
 * lookup_one_len - filesystem helper to lookup single pathname component
 * @name:	pathname component to lookup
 * @base:	base directory to lookup from
 * @len:	maximum length @len should be interpreted to
 *
 * Note that this routine is purely a helper for filesystem usage and should
 * not be called by generic code.  Also note that by using this function the
 * nameidata argument is passed to the filesystem methods and a filesystem
 * using this helper needs to be prepared for that.
 */
struct dentry *lookup_one_len(const char *name, struct dentry *base, int len)
{
	struct qstr this;
	unsigned int c;
	int err;

	WARN_ON_ONCE(!mutex_is_locked(&base->d_inode->i_mutex));

	this.name = name;
	this.len = len;
	this.hash = full_name_hash(name, len);
	if (!len)
		return ERR_PTR(-EACCES);

	while (len--) {
		c = *(const unsigned char *)name++;
		if (c == '/' || c == '\0')
			return ERR_PTR(-EACCES);
	}
	/*
	 * See if the low-level filesystem might want
	 * to use its own hash..
	 */
	if (base->d_flags & DCACHE_OP_HASH) {
		int err = base->d_op->d_hash(base, base->d_inode, &this);
		if (err < 0)
			return ERR_PTR(err);
	}

	err = inode_permission(base->d_inode, MAY_EXEC);
	if (err)
		return ERR_PTR(err);

	return __lookup_hash(&this, base, NULL);
}

int user_path_at_empty(int dfd, const char __user *name, unsigned flags,
		 struct path *path, int *empty)
{
	struct nameidata nd;
	char *tmp = getname_flags(name, flags, empty);
	int err = PTR_ERR(tmp);
	if (!IS_ERR(tmp)) {

		BUG_ON(flags & LOOKUP_PARENT);

		err = do_path_lookup(dfd, tmp, flags, &nd);
		putname(tmp);
		if (!err)
			*path = nd.path;
	}
	return err;
}

int user_path_at(int dfd, const char __user *name, unsigned flags,
		 struct path *path)
{
	return user_path_at_empty(dfd, name, flags, path, NULL);
}

static int user_path_parent(int dfd, const char __user *path,
			struct nameidata *nd, char **name)
{
	char *s = getname(path);
	int error;

	if (IS_ERR(s))
		return PTR_ERR(s);

	error = do_path_lookup(dfd, s, LOOKUP_PARENT, nd);
	if (error)
		putname(s);
	else
		*name = s;

	return error;
}

/*
 * It's inline, so penalty for filesystems that don't use sticky bit is
 * minimal.
 */
static inline int check_sticky(struct inode *dir, struct inode *inode)
{
	kuid_t fsuid = current_fsuid();

	if (!(dir->i_mode & S_ISVTX))
		return 0;
	if (uid_eq(inode->i_uid, fsuid))
		return 0;
	if (uid_eq(dir->i_uid, fsuid))
		return 0;
	return !inode_capable(inode, CAP_FOWNER);
}

/*
 * Check whether we can remove a link victim from directory dir, check
 * whether the type of victim is right.
 *  1. We can't do it if dir is read-only (done in permission())
 *  2. We should have write and exec permissions on dir
 *  3. We can't remove anything from append-only dir
 *  4. We can't do anything with immutable dir (done in permission())
 *  5. If the sticky bit on dir is set we should either
 *	a. be owner of dir, or
 *	b. be owner of victim, or
 *	c. have CAP_FOWNER capability
 *  6. If the victim is append-only or immutable we can't do anything with
 *     links pointing to it.
 *  7. If we were asked to remove a directory and victim isn't one - ENOTDIR.
 *  8. If we were asked to remove a non-directory and victim isn't one - EISDIR.
 *  9. We can't remove a root or mountpoint.
 * 10. We don't allow removal of NFS sillyrenamed files; it's handled by
 *     nfs_async_unlink().
 */
static int may_delete(struct inode *dir, struct dentry *victim, int isdir)
{
	int error;

	if (!victim->d_inode)
		return -ENOENT;

	BUG_ON(victim->d_parent->d_inode != dir);
	audit_inode_child(victim, dir);

	error = inode_permission(dir, MAY_WRITE | MAY_EXEC);
	if (error)
		return error;
	if (IS_APPEND(dir))
		return -EPERM;
	if (check_sticky(dir, victim->d_inode)||IS_APPEND(victim->d_inode)||
	    IS_IMMUTABLE(victim->d_inode) || IS_SWAPFILE(victim->d_inode))
		return -EPERM;
	if (isdir) {
		if (!S_ISDIR(victim->d_inode->i_mode))
			return -ENOTDIR;
		if (IS_ROOT(victim))
			return -EBUSY;
	} else if (S_ISDIR(victim->d_inode->i_mode))
		return -EISDIR;
	if (IS_DEADDIR(dir))
		return -ENOENT;
	if (victim->d_flags & DCACHE_NFSFS_RENAMED)
		return -EBUSY;
	return 0;
}

/*
 * Check whether we can create an object with dentry child in directory
 * dir.
 *  1. We can't do it if child already exists (open has special treatment for
 *     this case, but since we are inlined it's OK)
 *  2. We can't do it if dir is read-only (done in permission())
 *  3. We should have write and exec permissions on dir
 *  4. We can't do it if dir is immutable (done in permission())
 */
static inline int may_create(struct inode *dir, struct dentry *child)
{
	if (child->d_inode)
		return -EEXIST;
	if (IS_DEADDIR(dir))
		return -ENOENT;
	return inode_permission(dir, MAY_WRITE | MAY_EXEC);
}

/*
 * p1 and p2 should be directories on the same fs.
 */
struct dentry *lock_rename(struct dentry *p1, struct dentry *p2)
{
	struct dentry *p;

	if (p1 == p2) {
		mutex_lock_nested(&p1->d_inode->i_mutex, I_MUTEX_PARENT);
		return NULL;
	}

	mutex_lock(&p1->d_inode->i_sb->s_vfs_rename_mutex);

	p = d_ancestor(p2, p1);
	if (p) {
		mutex_lock_nested(&p2->d_inode->i_mutex, I_MUTEX_PARENT);
		mutex_lock_nested(&p1->d_inode->i_mutex, I_MUTEX_CHILD);
		return p;
	}

	p = d_ancestor(p1, p2);
	if (p) {
		mutex_lock_nested(&p1->d_inode->i_mutex, I_MUTEX_PARENT);
		mutex_lock_nested(&p2->d_inode->i_mutex, I_MUTEX_CHILD);
		return p;
	}

	mutex_lock_nested(&p1->d_inode->i_mutex, I_MUTEX_PARENT);
	mutex_lock_nested(&p2->d_inode->i_mutex, I_MUTEX_CHILD);
	return NULL;
}

void unlock_rename(struct dentry *p1, struct dentry *p2)
{
	mutex_unlock(&p1->d_inode->i_mutex);
	if (p1 != p2) {
		mutex_unlock(&p2->d_inode->i_mutex);
		mutex_unlock(&p1->d_inode->i_sb->s_vfs_rename_mutex);
	}
}

int vfs_create(struct inode *dir, struct dentry *dentry, umode_t mode,
		struct nameidata *nd)
{
	int error = may_create(dir, dentry);

	if (error)
		return error;

	if (!dir->i_op->create)
		return -EACCES;	/* shouldn't it be ENOSYS? */
	mode &= S_IALLUGO;
	mode |= S_IFREG;
	error = security_inode_create(dir, dentry, mode);
	if (error)
		return error;
	error = dir->i_op->create(dir, dentry, mode, nd);
	if (!error)
		fsnotify_create(dir, dentry);
	return error;
}

static int may_open(struct path *path, int acc_mode, int flag)
{
	struct dentry *dentry = path->dentry;
	struct inode *inode = dentry->d_inode;
	int error;

	/* O_PATH? */
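	/*
	 * Illustrative note: acc_mode is 0 for an O_PATH open, so all of
	 * the checks below are skipped.  For example (assuming the usual
	 * open(2) semantics), an unprivileged open("/etc/shadow", O_PATH)
	 * succeeds even though a MAY_READ check on the inode would fail;
	 * only search permission on the parent directories was needed to
	 * get here.
	 */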
*/ 2108 if (!acc_mode) 2109 return 0; 2110 2111 if (!inode) 2112 return -ENOENT; 2113 2114 switch (inode->i_mode & S_IFMT) { 2115 case S_IFLNK: 2116 return -ELOOP; 2117 case S_IFDIR: 2118 if (acc_mode & MAY_WRITE) 2119 return -EISDIR; 2120 break; 2121 case S_IFBLK: 2122 case S_IFCHR: 2123 if (path->mnt->mnt_flags & MNT_NODEV) 2124 return -EACCES; 2125 /*FALLTHRU*/ 2126 case S_IFIFO: 2127 case S_IFSOCK: 2128 flag &= ~O_TRUNC; 2129 break; 2130 } 2131 2132 error = inode_permission(inode, acc_mode); 2133 if (error) 2134 return error; 2135 2136 /* 2137 * An append-only file must be opened in append mode for writing. 2138 */ 2139 if (IS_APPEND(inode)) { 2140 if ((flag & O_ACCMODE) != O_RDONLY && !(flag & O_APPEND)) 2141 return -EPERM; 2142 if (flag & O_TRUNC) 2143 return -EPERM; 2144 } 2145 2146 /* O_NOATIME can only be set by the owner or superuser */ 2147 if (flag & O_NOATIME && !inode_owner_or_capable(inode)) 2148 return -EPERM; 2149 2150 return 0; 2151 } 2152 2153 static int handle_truncate(struct file *filp) 2154 { 2155 struct path *path = &filp->f_path; 2156 struct inode *inode = path->dentry->d_inode; 2157 int error = get_write_access(inode); 2158 if (error) 2159 return error; 2160 /* 2161 * Refuse to truncate files with mandatory locks held on them. 2162 */ 2163 error = locks_verify_locked(inode); 2164 if (!error) 2165 error = security_path_truncate(path); 2166 if (!error) { 2167 error = do_truncate(path->dentry, 0, 2168 ATTR_MTIME|ATTR_CTIME|ATTR_OPEN, 2169 filp); 2170 } 2171 put_write_access(inode); 2172 return error; 2173 } 2174 2175 static inline int open_to_namei_flags(int flag) 2176 { 2177 if ((flag & O_ACCMODE) == 3) 2178 flag--; 2179 return flag; 2180 } 2181 2182 /* 2183 * Handle the last step of open() 2184 */ 2185 static struct file *do_last(struct nameidata *nd, struct path *path, 2186 const struct open_flags *op, const char *pathname) 2187 { 2188 struct dentry *dir = nd->path.dentry; 2189 struct dentry *dentry; 2190 int open_flag = op->open_flag; 2191 int will_truncate = open_flag & O_TRUNC; 2192 int want_write = 0; 2193 int acc_mode = op->acc_mode; 2194 struct file *filp; 2195 int error; 2196 2197 nd->flags &= ~LOOKUP_PARENT; 2198 nd->flags |= op->intent; 2199 2200 switch (nd->last_type) { 2201 case LAST_DOTDOT: 2202 case LAST_DOT: 2203 error = handle_dots(nd, nd->last_type); 2204 if (error) 2205 return ERR_PTR(error); 2206 /* fallthrough */ 2207 case LAST_ROOT: 2208 error = complete_walk(nd); 2209 if (error) 2210 return ERR_PTR(error); 2211 audit_inode(pathname, nd->path.dentry); 2212 if (open_flag & O_CREAT) { 2213 error = -EISDIR; 2214 goto exit; 2215 } 2216 goto ok; 2217 case LAST_BIND: 2218 error = complete_walk(nd); 2219 if (error) 2220 return ERR_PTR(error); 2221 audit_inode(pathname, dir); 2222 goto ok; 2223 } 2224 2225 if (!(open_flag & O_CREAT)) { 2226 int symlink_ok = 0; 2227 if (nd->last.name[nd->last.len]) 2228 nd->flags |= LOOKUP_FOLLOW | LOOKUP_DIRECTORY; 2229 if (open_flag & O_PATH && !(nd->flags & LOOKUP_FOLLOW)) 2230 symlink_ok = 1; 2231 /* we _can_ be in RCU mode here */ 2232 error = walk_component(nd, path, &nd->last, LAST_NORM, 2233 !symlink_ok); 2234 if (error < 0) 2235 return ERR_PTR(error); 2236 if (error) /* symlink */ 2237 return NULL; 2238 /* sayonara */ 2239 error = complete_walk(nd); 2240 if (error) 2241 return ERR_PTR(error); 2242 2243 error = -ENOTDIR; 2244 if (nd->flags & LOOKUP_DIRECTORY) { 2245 if (!nd->inode->i_op->lookup) 2246 goto exit; 2247 } 2248 audit_inode(pathname, nd->path.dentry); 2249 goto ok; 2250 } 2251 2252 /* create side of 
things */ 2253 /* 2254 * This will *only* deal with leaving RCU mode - LOOKUP_JUMPED has been 2255 * cleared when we got to the last component we are about to look up 2256 */ 2257 error = complete_walk(nd); 2258 if (error) 2259 return ERR_PTR(error); 2260 2261 audit_inode(pathname, dir); 2262 error = -EISDIR; 2263 /* trailing slashes? */ 2264 if (nd->last.name[nd->last.len]) 2265 goto exit; 2266 2267 mutex_lock(&dir->d_inode->i_mutex); 2268 2269 dentry = lookup_hash(nd); 2270 error = PTR_ERR(dentry); 2271 if (IS_ERR(dentry)) { 2272 mutex_unlock(&dir->d_inode->i_mutex); 2273 goto exit; 2274 } 2275 2276 path->dentry = dentry; 2277 path->mnt = nd->path.mnt; 2278 2279 /* Negative dentry, just create the file */ 2280 if (!dentry->d_inode) { 2281 umode_t mode = op->mode; 2282 if (!IS_POSIXACL(dir->d_inode)) 2283 mode &= ~current_umask(); 2284 /* 2285 * This write is needed to ensure that a 2286 * rw->ro transition does not occur between 2287 * the time when the file is created and when 2288 * a permanent write count is taken through 2289 * the 'struct file' in nameidata_to_filp(). 2290 */ 2291 error = mnt_want_write(nd->path.mnt); 2292 if (error) 2293 goto exit_mutex_unlock; 2294 want_write = 1; 2295 /* Don't check for write permission, don't truncate */ 2296 open_flag &= ~O_TRUNC; 2297 will_truncate = 0; 2298 acc_mode = MAY_OPEN; 2299 error = security_path_mknod(&nd->path, dentry, mode, 0); 2300 if (error) 2301 goto exit_mutex_unlock; 2302 error = vfs_create(dir->d_inode, dentry, mode, nd); 2303 if (error) 2304 goto exit_mutex_unlock; 2305 mutex_unlock(&dir->d_inode->i_mutex); 2306 dput(nd->path.dentry); 2307 nd->path.dentry = dentry; 2308 goto common; 2309 } 2310 2311 /* 2312 * It already exists. 2313 */ 2314 mutex_unlock(&dir->d_inode->i_mutex); 2315 audit_inode(pathname, path->dentry); 2316 2317 error = -EEXIST; 2318 if (open_flag & O_EXCL) 2319 goto exit_dput; 2320 2321 error = follow_managed(path, nd->flags); 2322 if (error < 0) 2323 goto exit_dput; 2324 2325 if (error) 2326 nd->flags |= LOOKUP_JUMPED; 2327 2328 error = -ENOENT; 2329 if (!path->dentry->d_inode) 2330 goto exit_dput; 2331 2332 if (path->dentry->d_inode->i_op->follow_link) 2333 return NULL; 2334 2335 path_to_nameidata(path, nd); 2336 nd->inode = path->dentry->d_inode; 2337 /* Why this, you ask? _Now_ we might have grown LOOKUP_JUMPED... 
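 *
 * (Added note: follow_managed() above returns a positive value when it
 *  crossed a mountpoint or other managed dentry, and in that case we set
 *  LOOKUP_JUMPED a few lines up; complete_walk() needs to see that flag
 *  so it can revalidate the dentry we ended up with if the filesystem
 *  asks for it.)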
*/ 2338 error = complete_walk(nd); 2339 if (error) 2340 return ERR_PTR(error); 2341 error = -EISDIR; 2342 if (S_ISDIR(nd->inode->i_mode)) 2343 goto exit; 2344 ok: 2345 if (!S_ISREG(nd->inode->i_mode)) 2346 will_truncate = 0; 2347 2348 if (will_truncate) { 2349 error = mnt_want_write(nd->path.mnt); 2350 if (error) 2351 goto exit; 2352 want_write = 1; 2353 } 2354 common: 2355 error = may_open(&nd->path, acc_mode, open_flag); 2356 if (error) 2357 goto exit; 2358 filp = nameidata_to_filp(nd); 2359 if (!IS_ERR(filp)) { 2360 error = ima_file_check(filp, op->acc_mode); 2361 if (error) { 2362 fput(filp); 2363 filp = ERR_PTR(error); 2364 } 2365 } 2366 if (!IS_ERR(filp)) { 2367 if (will_truncate) { 2368 error = handle_truncate(filp); 2369 if (error) { 2370 fput(filp); 2371 filp = ERR_PTR(error); 2372 } 2373 } 2374 } 2375 out: 2376 if (want_write) 2377 mnt_drop_write(nd->path.mnt); 2378 path_put(&nd->path); 2379 return filp; 2380 2381 exit_mutex_unlock: 2382 mutex_unlock(&dir->d_inode->i_mutex); 2383 exit_dput: 2384 path_put_conditional(path, nd); 2385 exit: 2386 filp = ERR_PTR(error); 2387 goto out; 2388 } 2389 2390 static struct file *path_openat(int dfd, const char *pathname, 2391 struct nameidata *nd, const struct open_flags *op, int flags) 2392 { 2393 struct file *base = NULL; 2394 struct file *filp; 2395 struct path path; 2396 int error; 2397 2398 filp = get_empty_filp(); 2399 if (!filp) 2400 return ERR_PTR(-ENFILE); 2401 2402 filp->f_flags = op->open_flag; 2403 nd->intent.open.file = filp; 2404 nd->intent.open.flags = open_to_namei_flags(op->open_flag); 2405 nd->intent.open.create_mode = op->mode; 2406 2407 error = path_init(dfd, pathname, flags | LOOKUP_PARENT, nd, &base); 2408 if (unlikely(error)) 2409 goto out_filp; 2410 2411 current->total_link_count = 0; 2412 error = link_path_walk(pathname, nd); 2413 if (unlikely(error)) 2414 goto out_filp; 2415 2416 filp = do_last(nd, &path, op, pathname); 2417 while (unlikely(!filp)) { /* trailing symlink */ 2418 struct path link = path; 2419 void *cookie; 2420 if (!(nd->flags & LOOKUP_FOLLOW)) { 2421 path_put_conditional(&path, nd); 2422 path_put(&nd->path); 2423 filp = ERR_PTR(-ELOOP); 2424 break; 2425 } 2426 nd->flags |= LOOKUP_PARENT; 2427 nd->flags &= ~(LOOKUP_OPEN|LOOKUP_CREATE|LOOKUP_EXCL); 2428 error = follow_link(&link, nd, &cookie); 2429 if (unlikely(error)) 2430 filp = ERR_PTR(error); 2431 else 2432 filp = do_last(nd, &path, op, pathname); 2433 put_link(nd, &link, cookie); 2434 } 2435 out: 2436 if (nd->root.mnt && !(nd->flags & LOOKUP_ROOT)) 2437 path_put(&nd->root); 2438 if (base) 2439 fput(base); 2440 release_open_intent(nd); 2441 return filp; 2442 2443 out_filp: 2444 filp = ERR_PTR(error); 2445 goto out; 2446 } 2447 2448 struct file *do_filp_open(int dfd, const char *pathname, 2449 const struct open_flags *op, int flags) 2450 { 2451 struct nameidata nd; 2452 struct file *filp; 2453 2454 filp = path_openat(dfd, pathname, &nd, op, flags | LOOKUP_RCU); 2455 if (unlikely(filp == ERR_PTR(-ECHILD))) 2456 filp = path_openat(dfd, pathname, &nd, op, flags); 2457 if (unlikely(filp == ERR_PTR(-ESTALE))) 2458 filp = path_openat(dfd, pathname, &nd, op, flags | LOOKUP_REVAL); 2459 return filp; 2460 } 2461 2462 struct file *do_file_open_root(struct dentry *dentry, struct vfsmount *mnt, 2463 const char *name, const struct open_flags *op, int flags) 2464 { 2465 struct nameidata nd; 2466 struct file *file; 2467 2468 nd.root.mnt = mnt; 2469 nd.root.dentry = dentry; 2470 2471 flags |= LOOKUP_ROOT; 2472 2473 if (dentry->d_inode->i_op->follow_link && 
op->intent & LOOKUP_OPEN) 2474 return ERR_PTR(-ELOOP); 2475 2476 file = path_openat(-1, name, &nd, op, flags | LOOKUP_RCU); 2477 if (unlikely(file == ERR_PTR(-ECHILD))) 2478 file = path_openat(-1, name, &nd, op, flags); 2479 if (unlikely(file == ERR_PTR(-ESTALE))) 2480 file = path_openat(-1, name, &nd, op, flags | LOOKUP_REVAL); 2481 return file; 2482 } 2483 2484 struct dentry *kern_path_create(int dfd, const char *pathname, struct path *path, int is_dir) 2485 { 2486 struct dentry *dentry = ERR_PTR(-EEXIST); 2487 struct nameidata nd; 2488 int error = do_path_lookup(dfd, pathname, LOOKUP_PARENT, &nd); 2489 if (error) 2490 return ERR_PTR(error); 2491 2492 /* 2493 * Yucky last component or no last component at all? 2494 * (foo/., foo/.., /////) 2495 */ 2496 if (nd.last_type != LAST_NORM) 2497 goto out; 2498 nd.flags &= ~LOOKUP_PARENT; 2499 nd.flags |= LOOKUP_CREATE | LOOKUP_EXCL; 2500 nd.intent.open.flags = O_EXCL; 2501 2502 /* 2503 * Do the final lookup. 2504 */ 2505 mutex_lock_nested(&nd.path.dentry->d_inode->i_mutex, I_MUTEX_PARENT); 2506 dentry = lookup_hash(&nd); 2507 if (IS_ERR(dentry)) 2508 goto fail; 2509 2510 if (dentry->d_inode) 2511 goto eexist; 2512 /* 2513 * Special case - lookup gave negative, but... we had foo/bar/ 2514 * From the vfs_mknod() POV we just have a negative dentry - 2515 * all is fine. Let's be bastards - you had / on the end, you've 2516 * been asking for (non-existent) directory. -ENOENT for you. 2517 */ 2518 if (unlikely(!is_dir && nd.last.name[nd.last.len])) { 2519 dput(dentry); 2520 dentry = ERR_PTR(-ENOENT); 2521 goto fail; 2522 } 2523 *path = nd.path; 2524 return dentry; 2525 eexist: 2526 dput(dentry); 2527 dentry = ERR_PTR(-EEXIST); 2528 fail: 2529 mutex_unlock(&nd.path.dentry->d_inode->i_mutex); 2530 out: 2531 path_put(&nd.path); 2532 return dentry; 2533 } 2534 EXPORT_SYMBOL(kern_path_create); 2535 2536 struct dentry *user_path_create(int dfd, const char __user *pathname, struct path *path, int is_dir) 2537 { 2538 char *tmp = getname(pathname); 2539 struct dentry *res; 2540 if (IS_ERR(tmp)) 2541 return ERR_CAST(tmp); 2542 res = kern_path_create(dfd, tmp, path, is_dir); 2543 putname(tmp); 2544 return res; 2545 } 2546 EXPORT_SYMBOL(user_path_create); 2547 2548 int vfs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, dev_t dev) 2549 { 2550 int error = may_create(dir, dentry); 2551 2552 if (error) 2553 return error; 2554 2555 if ((S_ISCHR(mode) || S_ISBLK(mode)) && !capable(CAP_MKNOD)) 2556 return -EPERM; 2557 2558 if (!dir->i_op->mknod) 2559 return -EPERM; 2560 2561 error = devcgroup_inode_mknod(mode, dev); 2562 if (error) 2563 return error; 2564 2565 error = security_inode_mknod(dir, dentry, mode, dev); 2566 if (error) 2567 return error; 2568 2569 error = dir->i_op->mknod(dir, dentry, mode, dev); 2570 if (!error) 2571 fsnotify_create(dir, dentry); 2572 return error; 2573 } 2574 2575 static int may_mknod(umode_t mode) 2576 { 2577 switch (mode & S_IFMT) { 2578 case S_IFREG: 2579 case S_IFCHR: 2580 case S_IFBLK: 2581 case S_IFIFO: 2582 case S_IFSOCK: 2583 case 0: /* zero mode translates to S_IFREG */ 2584 return 0; 2585 case S_IFDIR: 2586 return -EPERM; 2587 default: 2588 return -EINVAL; 2589 } 2590 } 2591 2592 SYSCALL_DEFINE4(mknodat, int, dfd, const char __user *, filename, umode_t, mode, 2593 unsigned, dev) 2594 { 2595 struct dentry *dentry; 2596 struct path path; 2597 int error; 2598 2599 if (S_ISDIR(mode)) 2600 return -EPERM; 2601 2602 dentry = user_path_create(dfd, filename, &path, 0); 2603 if (IS_ERR(dentry)) 2604 return PTR_ERR(dentry); 
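	/*
	 * (Added summary of the steps below.)  At this point
	 * user_path_create() has left us holding the parent directory's
	 * i_mutex with a negative dentry for the last component: apply the
	 * umask unless POSIX ACLs are in force, reject node types that
	 * mknod() may not create, get write access to the mount, run the
	 * security hook, and then dispatch to vfs_create()/vfs_mknod().
	 */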
2605 2606 if (!IS_POSIXACL(path.dentry->d_inode)) 2607 mode &= ~current_umask(); 2608 error = may_mknod(mode); 2609 if (error) 2610 goto out_dput; 2611 error = mnt_want_write(path.mnt); 2612 if (error) 2613 goto out_dput; 2614 error = security_path_mknod(&path, dentry, mode, dev); 2615 if (error) 2616 goto out_drop_write; 2617 switch (mode & S_IFMT) { 2618 case 0: case S_IFREG: 2619 error = vfs_create(path.dentry->d_inode,dentry,mode,NULL); 2620 break; 2621 case S_IFCHR: case S_IFBLK: 2622 error = vfs_mknod(path.dentry->d_inode,dentry,mode, 2623 new_decode_dev(dev)); 2624 break; 2625 case S_IFIFO: case S_IFSOCK: 2626 error = vfs_mknod(path.dentry->d_inode,dentry,mode,0); 2627 break; 2628 } 2629 out_drop_write: 2630 mnt_drop_write(path.mnt); 2631 out_dput: 2632 dput(dentry); 2633 mutex_unlock(&path.dentry->d_inode->i_mutex); 2634 path_put(&path); 2635 2636 return error; 2637 } 2638 2639 SYSCALL_DEFINE3(mknod, const char __user *, filename, umode_t, mode, unsigned, dev) 2640 { 2641 return sys_mknodat(AT_FDCWD, filename, mode, dev); 2642 } 2643 2644 int vfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) 2645 { 2646 int error = may_create(dir, dentry); 2647 unsigned max_links = dir->i_sb->s_max_links; 2648 2649 if (error) 2650 return error; 2651 2652 if (!dir->i_op->mkdir) 2653 return -EPERM; 2654 2655 mode &= (S_IRWXUGO|S_ISVTX); 2656 error = security_inode_mkdir(dir, dentry, mode); 2657 if (error) 2658 return error; 2659 2660 if (max_links && dir->i_nlink >= max_links) 2661 return -EMLINK; 2662 2663 error = dir->i_op->mkdir(dir, dentry, mode); 2664 if (!error) 2665 fsnotify_mkdir(dir, dentry); 2666 return error; 2667 } 2668 2669 SYSCALL_DEFINE3(mkdirat, int, dfd, const char __user *, pathname, umode_t, mode) 2670 { 2671 struct dentry *dentry; 2672 struct path path; 2673 int error; 2674 2675 dentry = user_path_create(dfd, pathname, &path, 1); 2676 if (IS_ERR(dentry)) 2677 return PTR_ERR(dentry); 2678 2679 if (!IS_POSIXACL(path.dentry->d_inode)) 2680 mode &= ~current_umask(); 2681 error = mnt_want_write(path.mnt); 2682 if (error) 2683 goto out_dput; 2684 error = security_path_mkdir(&path, dentry, mode); 2685 if (error) 2686 goto out_drop_write; 2687 error = vfs_mkdir(path.dentry->d_inode, dentry, mode); 2688 out_drop_write: 2689 mnt_drop_write(path.mnt); 2690 out_dput: 2691 dput(dentry); 2692 mutex_unlock(&path.dentry->d_inode->i_mutex); 2693 path_put(&path); 2694 return error; 2695 } 2696 2697 SYSCALL_DEFINE2(mkdir, const char __user *, pathname, umode_t, mode) 2698 { 2699 return sys_mkdirat(AT_FDCWD, pathname, mode); 2700 } 2701 2702 /* 2703 * The dentry_unhash() helper will try to drop the dentry early: we 2704 * should have a usage count of 1 if we're the only user of this 2705 * dentry, and if that is true (possibly after pruning the dcache), 2706 * then we drop the dentry now. 2707 * 2708 * A low-level filesystem can, if it chooses, legally 2709 * do a 2710 * 2711 * if (!d_unhashed(dentry)) 2712 * return -EBUSY; 2713 * 2714 * if it cannot handle the case of removing a directory 2715 * that is still in use by something else..
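 *
 * For illustration only (examplefs and examplefs_do_rmdir are made-up
 * names), such a filesystem's ->rmdir() could look like:
 *
 *	static int examplefs_rmdir(struct inode *dir, struct dentry *dentry)
 *	{
 *		dentry_unhash(dentry);
 *		if (!d_unhashed(dentry))
 *			return -EBUSY;
 *		return examplefs_do_rmdir(dir, dentry);
 *	}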
2716 */ 2717 void dentry_unhash(struct dentry *dentry) 2718 { 2719 shrink_dcache_parent(dentry); 2720 spin_lock(&dentry->d_lock); 2721 if (dentry->d_count == 1) 2722 __d_drop(dentry); 2723 spin_unlock(&dentry->d_lock); 2724 } 2725 2726 int vfs_rmdir(struct inode *dir, struct dentry *dentry) 2727 { 2728 int error = may_delete(dir, dentry, 1); 2729 2730 if (error) 2731 return error; 2732 2733 if (!dir->i_op->rmdir) 2734 return -EPERM; 2735 2736 dget(dentry); 2737 mutex_lock(&dentry->d_inode->i_mutex); 2738 2739 error = -EBUSY; 2740 if (d_mountpoint(dentry)) 2741 goto out; 2742 2743 error = security_inode_rmdir(dir, dentry); 2744 if (error) 2745 goto out; 2746 2747 shrink_dcache_parent(dentry); 2748 error = dir->i_op->rmdir(dir, dentry); 2749 if (error) 2750 goto out; 2751 2752 dentry->d_inode->i_flags |= S_DEAD; 2753 dont_mount(dentry); 2754 2755 out: 2756 mutex_unlock(&dentry->d_inode->i_mutex); 2757 dput(dentry); 2758 if (!error) 2759 d_delete(dentry); 2760 return error; 2761 } 2762 2763 static long do_rmdir(int dfd, const char __user *pathname) 2764 { 2765 int error = 0; 2766 char * name; 2767 struct dentry *dentry; 2768 struct nameidata nd; 2769 2770 error = user_path_parent(dfd, pathname, &nd, &name); 2771 if (error) 2772 return error; 2773 2774 switch(nd.last_type) { 2775 case LAST_DOTDOT: 2776 error = -ENOTEMPTY; 2777 goto exit1; 2778 case LAST_DOT: 2779 error = -EINVAL; 2780 goto exit1; 2781 case LAST_ROOT: 2782 error = -EBUSY; 2783 goto exit1; 2784 } 2785 2786 nd.flags &= ~LOOKUP_PARENT; 2787 2788 mutex_lock_nested(&nd.path.dentry->d_inode->i_mutex, I_MUTEX_PARENT); 2789 dentry = lookup_hash(&nd); 2790 error = PTR_ERR(dentry); 2791 if (IS_ERR(dentry)) 2792 goto exit2; 2793 if (!dentry->d_inode) { 2794 error = -ENOENT; 2795 goto exit3; 2796 } 2797 error = mnt_want_write(nd.path.mnt); 2798 if (error) 2799 goto exit3; 2800 error = security_path_rmdir(&nd.path, dentry); 2801 if (error) 2802 goto exit4; 2803 error = vfs_rmdir(nd.path.dentry->d_inode, dentry); 2804 exit4: 2805 mnt_drop_write(nd.path.mnt); 2806 exit3: 2807 dput(dentry); 2808 exit2: 2809 mutex_unlock(&nd.path.dentry->d_inode->i_mutex); 2810 exit1: 2811 path_put(&nd.path); 2812 putname(name); 2813 return error; 2814 } 2815 2816 SYSCALL_DEFINE1(rmdir, const char __user *, pathname) 2817 { 2818 return do_rmdir(AT_FDCWD, pathname); 2819 } 2820 2821 int vfs_unlink(struct inode *dir, struct dentry *dentry) 2822 { 2823 int error = may_delete(dir, dentry, 0); 2824 2825 if (error) 2826 return error; 2827 2828 if (!dir->i_op->unlink) 2829 return -EPERM; 2830 2831 mutex_lock(&dentry->d_inode->i_mutex); 2832 if (d_mountpoint(dentry)) 2833 error = -EBUSY; 2834 else { 2835 error = security_inode_unlink(dir, dentry); 2836 if (!error) { 2837 error = dir->i_op->unlink(dir, dentry); 2838 if (!error) 2839 dont_mount(dentry); 2840 } 2841 } 2842 mutex_unlock(&dentry->d_inode->i_mutex); 2843 2844 /* We don't d_delete() NFS sillyrenamed files--they still exist. */ 2845 if (!error && !(dentry->d_flags & DCACHE_NFSFS_RENAMED)) { 2846 fsnotify_link_count(dentry->d_inode); 2847 d_delete(dentry); 2848 } 2849 2850 return error; 2851 } 2852 2853 /* 2854 * Make sure that the actual truncation of the file will occur outside its 2855 * directory's i_mutex. Truncate can take a long time if there is a lot of 2856 * writeout happening, and we don't want to prevent access to the directory 2857 * while waiting on the I/O. 
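 *
 * (Added note: do_unlinkat() below arranges this by taking an extra
 *  reference with ihold() while the parent's i_mutex is held and only
 *  dropping it with iput() after the mutex has been released, so if the
 *  last link went away the heavy work happens in that final iput().)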
2858 */ 2859 static long do_unlinkat(int dfd, const char __user *pathname) 2860 { 2861 int error; 2862 char *name; 2863 struct dentry *dentry; 2864 struct nameidata nd; 2865 struct inode *inode = NULL; 2866 2867 error = user_path_parent(dfd, pathname, &nd, &name); 2868 if (error) 2869 return error; 2870 2871 error = -EISDIR; 2872 if (nd.last_type != LAST_NORM) 2873 goto exit1; 2874 2875 nd.flags &= ~LOOKUP_PARENT; 2876 2877 mutex_lock_nested(&nd.path.dentry->d_inode->i_mutex, I_MUTEX_PARENT); 2878 dentry = lookup_hash(&nd); 2879 error = PTR_ERR(dentry); 2880 if (!IS_ERR(dentry)) { 2881 /* Why not before? Because we want correct error value */ 2882 if (nd.last.name[nd.last.len]) 2883 goto slashes; 2884 inode = dentry->d_inode; 2885 if (!inode) 2886 goto slashes; 2887 ihold(inode); 2888 error = mnt_want_write(nd.path.mnt); 2889 if (error) 2890 goto exit2; 2891 error = security_path_unlink(&nd.path, dentry); 2892 if (error) 2893 goto exit3; 2894 error = vfs_unlink(nd.path.dentry->d_inode, dentry); 2895 exit3: 2896 mnt_drop_write(nd.path.mnt); 2897 exit2: 2898 dput(dentry); 2899 } 2900 mutex_unlock(&nd.path.dentry->d_inode->i_mutex); 2901 if (inode) 2902 iput(inode); /* truncate the inode here */ 2903 exit1: 2904 path_put(&nd.path); 2905 putname(name); 2906 return error; 2907 2908 slashes: 2909 error = !dentry->d_inode ? -ENOENT : 2910 S_ISDIR(dentry->d_inode->i_mode) ? -EISDIR : -ENOTDIR; 2911 goto exit2; 2912 } 2913 2914 SYSCALL_DEFINE3(unlinkat, int, dfd, const char __user *, pathname, int, flag) 2915 { 2916 if ((flag & ~AT_REMOVEDIR) != 0) 2917 return -EINVAL; 2918 2919 if (flag & AT_REMOVEDIR) 2920 return do_rmdir(dfd, pathname); 2921 2922 return do_unlinkat(dfd, pathname); 2923 } 2924 2925 SYSCALL_DEFINE1(unlink, const char __user *, pathname) 2926 { 2927 return do_unlinkat(AT_FDCWD, pathname); 2928 } 2929 2930 int vfs_symlink(struct inode *dir, struct dentry *dentry, const char *oldname) 2931 { 2932 int error = may_create(dir, dentry); 2933 2934 if (error) 2935 return error; 2936 2937 if (!dir->i_op->symlink) 2938 return -EPERM; 2939 2940 error = security_inode_symlink(dir, dentry, oldname); 2941 if (error) 2942 return error; 2943 2944 error = dir->i_op->symlink(dir, dentry, oldname); 2945 if (!error) 2946 fsnotify_create(dir, dentry); 2947 return error; 2948 } 2949 2950 SYSCALL_DEFINE3(symlinkat, const char __user *, oldname, 2951 int, newdfd, const char __user *, newname) 2952 { 2953 int error; 2954 char *from; 2955 struct dentry *dentry; 2956 struct path path; 2957 2958 from = getname(oldname); 2959 if (IS_ERR(from)) 2960 return PTR_ERR(from); 2961 2962 dentry = user_path_create(newdfd, newname, &path, 0); 2963 error = PTR_ERR(dentry); 2964 if (IS_ERR(dentry)) 2965 goto out_putname; 2966 2967 error = mnt_want_write(path.mnt); 2968 if (error) 2969 goto out_dput; 2970 error = security_path_symlink(&path, dentry, from); 2971 if (error) 2972 goto out_drop_write; 2973 error = vfs_symlink(path.dentry->d_inode, dentry, from); 2974 out_drop_write: 2975 mnt_drop_write(path.mnt); 2976 out_dput: 2977 dput(dentry); 2978 mutex_unlock(&path.dentry->d_inode->i_mutex); 2979 path_put(&path); 2980 out_putname: 2981 putname(from); 2982 return error; 2983 } 2984 2985 SYSCALL_DEFINE2(symlink, const char __user *, oldname, const char __user *, newname) 2986 { 2987 return sys_symlinkat(oldname, AT_FDCWD, newname); 2988 } 2989 2990 int vfs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *new_dentry) 2991 { 2992 struct inode *inode = old_dentry->d_inode; 2993 unsigned max_links = 
dir->i_sb->s_max_links; 2994 int error; 2995 2996 if (!inode) 2997 return -ENOENT; 2998 2999 error = may_create(dir, new_dentry); 3000 if (error) 3001 return error; 3002 3003 if (dir->i_sb != inode->i_sb) 3004 return -EXDEV; 3005 3006 /* 3007 * A link to an append-only or immutable file cannot be created. 3008 */ 3009 if (IS_APPEND(inode) || IS_IMMUTABLE(inode)) 3010 return -EPERM; 3011 if (!dir->i_op->link) 3012 return -EPERM; 3013 if (S_ISDIR(inode->i_mode)) 3014 return -EPERM; 3015 3016 error = security_inode_link(old_dentry, dir, new_dentry); 3017 if (error) 3018 return error; 3019 3020 mutex_lock(&inode->i_mutex); 3021 /* Make sure we don't allow creating hardlink to an unlinked file */ 3022 if (inode->i_nlink == 0) 3023 error = -ENOENT; 3024 else if (max_links && inode->i_nlink >= max_links) 3025 error = -EMLINK; 3026 else 3027 error = dir->i_op->link(old_dentry, dir, new_dentry); 3028 mutex_unlock(&inode->i_mutex); 3029 if (!error) 3030 fsnotify_link(dir, inode, new_dentry); 3031 return error; 3032 } 3033 3034 /* 3035 * Hardlinks are often used in delicate situations. We avoid 3036 * security-related surprises by not following symlinks on the 3037 * newname. --KAB 3038 * 3039 * We don't follow them on the oldname either to be compatible 3040 * with linux 2.0, and to avoid hard-linking to directories 3041 * and other special files. --ADM 3042 */ 3043 SYSCALL_DEFINE5(linkat, int, olddfd, const char __user *, oldname, 3044 int, newdfd, const char __user *, newname, int, flags) 3045 { 3046 struct dentry *new_dentry; 3047 struct path old_path, new_path; 3048 int how = 0; 3049 int error; 3050 3051 if ((flags & ~(AT_SYMLINK_FOLLOW | AT_EMPTY_PATH)) != 0) 3052 return -EINVAL; 3053 /* 3054 * To use null names we require CAP_DAC_READ_SEARCH. 3055 * This ensures that not everyone will be able to create 3056 * a hardlink using the passed file descriptor. 3057 */ 3058 if (flags & AT_EMPTY_PATH) { 3059 if (!capable(CAP_DAC_READ_SEARCH)) 3060 return -ENOENT; 3061 how = LOOKUP_EMPTY; 3062 } 3063 3064 if (flags & AT_SYMLINK_FOLLOW) 3065 how |= LOOKUP_FOLLOW; 3066 3067 error = user_path_at(olddfd, oldname, how, &old_path); 3068 if (error) 3069 return error; 3070 3071 new_dentry = user_path_create(newdfd, newname, &new_path, 0); 3072 error = PTR_ERR(new_dentry); 3073 if (IS_ERR(new_dentry)) 3074 goto out; 3075 3076 error = -EXDEV; 3077 if (old_path.mnt != new_path.mnt) 3078 goto out_dput; 3079 error = mnt_want_write(new_path.mnt); 3080 if (error) 3081 goto out_dput; 3082 error = security_path_link(old_path.dentry, &new_path, new_dentry); 3083 if (error) 3084 goto out_drop_write; 3085 error = vfs_link(old_path.dentry, new_path.dentry->d_inode, new_dentry); 3086 out_drop_write: 3087 mnt_drop_write(new_path.mnt); 3088 out_dput: 3089 dput(new_dentry); 3090 mutex_unlock(&new_path.dentry->d_inode->i_mutex); 3091 path_put(&new_path); 3092 out: 3093 path_put(&old_path); 3094 3095 return error; 3096 } 3097 3098 SYSCALL_DEFINE2(link, const char __user *, oldname, const char __user *, newname) 3099 { 3100 return sys_linkat(AT_FDCWD, oldname, AT_FDCWD, newname, 0); 3101 } 3102 3103 /* 3104 * The worst of all namespace operations - renaming directory. "Perverted" 3105 * doesn't even start to describe it. Somebody in UCB had a heck of a trip... 3106 * Problems: 3107 * a) we can get into loop creation. Check is done in is_subdir(). 3108 * b) race potential - two innocent renames can create a loop together. 3109 * That's where 4.4 screws up. Current fix: serialization on 3110 * sb->s_vfs_rename_mutex.
We might be more accurate, but that's another 3111 * story. 3112 * c) we have to lock _three_ objects - parents and victim (if it exists). 3113 * And that - after we got ->i_mutex on parents (until then we don't know 3114 * whether the target exists). Solution: try to be smart with locking 3115 * order for inodes. We rely on the fact that tree topology may change 3116 * only under ->s_vfs_rename_mutex _and_ that parent of the object we 3117 * move will be locked. Thus we can rank directories by the tree 3118 * (ancestors first) and rank all non-directories after them. 3119 * That works since everybody except rename does "lock parent, lookup, 3120 * lock child" and rename is under ->s_vfs_rename_mutex. 3121 * HOWEVER, it relies on the assumption that any object with ->lookup() 3122 * has no more than 1 dentry. If "hybrid" objects will ever appear, 3123 * we'd better make sure that there's no link(2) for them. 3124 * d) conversion from fhandle to dentry may come in the wrong moment - when 3125 * we are removing the target. Solution: we will have to grab ->i_mutex 3126 * in the fhandle_to_dentry code. [FIXME - current nfsfh.c relies on 3127 * ->i_mutex on parents, which works but leads to some truly excessive 3128 * locking]. 3129 */ 3130 static int vfs_rename_dir(struct inode *old_dir, struct dentry *old_dentry, 3131 struct inode *new_dir, struct dentry *new_dentry) 3132 { 3133 int error = 0; 3134 struct inode *target = new_dentry->d_inode; 3135 unsigned max_links = new_dir->i_sb->s_max_links; 3136 3137 /* 3138 * If we are going to change the parent - check write permissions, 3139 * we'll need to flip '..'. 3140 */ 3141 if (new_dir != old_dir) { 3142 error = inode_permission(old_dentry->d_inode, MAY_WRITE); 3143 if (error) 3144 return error; 3145 } 3146 3147 error = security_inode_rename(old_dir, old_dentry, new_dir, new_dentry); 3148 if (error) 3149 return error; 3150 3151 dget(new_dentry); 3152 if (target) 3153 mutex_lock(&target->i_mutex); 3154 3155 error = -EBUSY; 3156 if (d_mountpoint(old_dentry) || d_mountpoint(new_dentry)) 3157 goto out; 3158 3159 error = -EMLINK; 3160 if (max_links && !target && new_dir != old_dir && 3161 new_dir->i_nlink >= max_links) 3162 goto out; 3163 3164 if (target) 3165 shrink_dcache_parent(new_dentry); 3166 error = old_dir->i_op->rename(old_dir, old_dentry, new_dir, new_dentry); 3167 if (error) 3168 goto out; 3169 3170 if (target) { 3171 target->i_flags |= S_DEAD; 3172 dont_mount(new_dentry); 3173 } 3174 out: 3175 if (target) 3176 mutex_unlock(&target->i_mutex); 3177 dput(new_dentry); 3178 if (!error) 3179 if (!(old_dir->i_sb->s_type->fs_flags & FS_RENAME_DOES_D_MOVE)) 3180 d_move(old_dentry,new_dentry); 3181 return error; 3182 } 3183 3184 static int vfs_rename_other(struct inode *old_dir, struct dentry *old_dentry, 3185 struct inode *new_dir, struct dentry *new_dentry) 3186 { 3187 struct inode *target = new_dentry->d_inode; 3188 int error; 3189 3190 error = security_inode_rename(old_dir, old_dentry, new_dir, new_dentry); 3191 if (error) 3192 return error; 3193 3194 dget(new_dentry); 3195 if (target) 3196 mutex_lock(&target->i_mutex); 3197 3198 error = -EBUSY; 3199 if (d_mountpoint(old_dentry)||d_mountpoint(new_dentry)) 3200 goto out; 3201 3202 error = old_dir->i_op->rename(old_dir, old_dentry, new_dir, new_dentry); 3203 if (error) 3204 goto out; 3205 3206 if (target) 3207 dont_mount(new_dentry); 3208 if (!(old_dir->i_sb->s_type->fs_flags & FS_RENAME_DOES_D_MOVE)) 3209 d_move(old_dentry, new_dentry); 3210 out: 3211 if (target) 3212 
mutex_unlock(&target->i_mutex); 3213 dput(new_dentry); 3214 return error; 3215 } 3216 3217 int vfs_rename(struct inode *old_dir, struct dentry *old_dentry, 3218 struct inode *new_dir, struct dentry *new_dentry) 3219 { 3220 int error; 3221 int is_dir = S_ISDIR(old_dentry->d_inode->i_mode); 3222 const unsigned char *old_name; 3223 3224 if (old_dentry->d_inode == new_dentry->d_inode) 3225 return 0; 3226 3227 error = may_delete(old_dir, old_dentry, is_dir); 3228 if (error) 3229 return error; 3230 3231 if (!new_dentry->d_inode) 3232 error = may_create(new_dir, new_dentry); 3233 else 3234 error = may_delete(new_dir, new_dentry, is_dir); 3235 if (error) 3236 return error; 3237 3238 if (!old_dir->i_op->rename) 3239 return -EPERM; 3240 3241 old_name = fsnotify_oldname_init(old_dentry->d_name.name); 3242 3243 if (is_dir) 3244 error = vfs_rename_dir(old_dir,old_dentry,new_dir,new_dentry); 3245 else 3246 error = vfs_rename_other(old_dir,old_dentry,new_dir,new_dentry); 3247 if (!error) 3248 fsnotify_move(old_dir, new_dir, old_name, is_dir, 3249 new_dentry->d_inode, old_dentry); 3250 fsnotify_oldname_free(old_name); 3251 3252 return error; 3253 } 3254 3255 SYSCALL_DEFINE4(renameat, int, olddfd, const char __user *, oldname, 3256 int, newdfd, const char __user *, newname) 3257 { 3258 struct dentry *old_dir, *new_dir; 3259 struct dentry *old_dentry, *new_dentry; 3260 struct dentry *trap; 3261 struct nameidata oldnd, newnd; 3262 char *from; 3263 char *to; 3264 int error; 3265 3266 error = user_path_parent(olddfd, oldname, &oldnd, &from); 3267 if (error) 3268 goto exit; 3269 3270 error = user_path_parent(newdfd, newname, &newnd, &to); 3271 if (error) 3272 goto exit1; 3273 3274 error = -EXDEV; 3275 if (oldnd.path.mnt != newnd.path.mnt) 3276 goto exit2; 3277 3278 old_dir = oldnd.path.dentry; 3279 error = -EBUSY; 3280 if (oldnd.last_type != LAST_NORM) 3281 goto exit2; 3282 3283 new_dir = newnd.path.dentry; 3284 if (newnd.last_type != LAST_NORM) 3285 goto exit2; 3286 3287 oldnd.flags &= ~LOOKUP_PARENT; 3288 newnd.flags &= ~LOOKUP_PARENT; 3289 newnd.flags |= LOOKUP_RENAME_TARGET; 3290 3291 trap = lock_rename(new_dir, old_dir); 3292 3293 old_dentry = lookup_hash(&oldnd); 3294 error = PTR_ERR(old_dentry); 3295 if (IS_ERR(old_dentry)) 3296 goto exit3; 3297 /* source must exist */ 3298 error = -ENOENT; 3299 if (!old_dentry->d_inode) 3300 goto exit4; 3301 /* unless the source is a directory trailing slashes give -ENOTDIR */ 3302 if (!S_ISDIR(old_dentry->d_inode->i_mode)) { 3303 error = -ENOTDIR; 3304 if (oldnd.last.name[oldnd.last.len]) 3305 goto exit4; 3306 if (newnd.last.name[newnd.last.len]) 3307 goto exit4; 3308 } 3309 /* source should not be ancestor of target */ 3310 error = -EINVAL; 3311 if (old_dentry == trap) 3312 goto exit4; 3313 new_dentry = lookup_hash(&newnd); 3314 error = PTR_ERR(new_dentry); 3315 if (IS_ERR(new_dentry)) 3316 goto exit4; 3317 /* target should not be an ancestor of source */ 3318 error = -ENOTEMPTY; 3319 if (new_dentry == trap) 3320 goto exit5; 3321 3322 error = mnt_want_write(oldnd.path.mnt); 3323 if (error) 3324 goto exit5; 3325 error = security_path_rename(&oldnd.path, old_dentry, 3326 &newnd.path, new_dentry); 3327 if (error) 3328 goto exit6; 3329 error = vfs_rename(old_dir->d_inode, old_dentry, 3330 new_dir->d_inode, new_dentry); 3331 exit6: 3332 mnt_drop_write(oldnd.path.mnt); 3333 exit5: 3334 dput(new_dentry); 3335 exit4: 3336 dput(old_dentry); 3337 exit3: 3338 unlock_rename(new_dir, old_dir); 3339 exit2: 3340 path_put(&newnd.path); 3341 putname(to); 3342 exit1: 3343 
path_put(&oldnd.path); 3344 putname(from); 3345 exit: 3346 return error; 3347 } 3348 3349 SYSCALL_DEFINE2(rename, const char __user *, oldname, const char __user *, newname) 3350 { 3351 return sys_renameat(AT_FDCWD, oldname, AT_FDCWD, newname); 3352 } 3353 3354 int vfs_readlink(struct dentry *dentry, char __user *buffer, int buflen, const char *link) 3355 { 3356 int len; 3357 3358 len = PTR_ERR(link); 3359 if (IS_ERR(link)) 3360 goto out; 3361 3362 len = strlen(link); 3363 if (len > (unsigned) buflen) 3364 len = buflen; 3365 if (copy_to_user(buffer, link, len)) 3366 len = -EFAULT; 3367 out: 3368 return len; 3369 } 3370 3371 /* 3372 * A helper for ->readlink(). This should be used *ONLY* for symlinks that 3373 * have ->follow_link() touching nd only in nd_set_link(). Using (or not 3374 * using) it for any given inode is up to filesystem. 3375 */ 3376 int generic_readlink(struct dentry *dentry, char __user *buffer, int buflen) 3377 { 3378 struct nameidata nd; 3379 void *cookie; 3380 int res; 3381 3382 nd.depth = 0; 3383 cookie = dentry->d_inode->i_op->follow_link(dentry, &nd); 3384 if (IS_ERR(cookie)) 3385 return PTR_ERR(cookie); 3386 3387 res = vfs_readlink(dentry, buffer, buflen, nd_get_link(&nd)); 3388 if (dentry->d_inode->i_op->put_link) 3389 dentry->d_inode->i_op->put_link(dentry, &nd, cookie); 3390 return res; 3391 } 3392 3393 int vfs_follow_link(struct nameidata *nd, const char *link) 3394 { 3395 return __vfs_follow_link(nd, link); 3396 } 3397 3398 /* get the link contents into pagecache */ 3399 static char *page_getlink(struct dentry * dentry, struct page **ppage) 3400 { 3401 char *kaddr; 3402 struct page *page; 3403 struct address_space *mapping = dentry->d_inode->i_mapping; 3404 page = read_mapping_page(mapping, 0, NULL); 3405 if (IS_ERR(page)) 3406 return (char*)page; 3407 *ppage = page; 3408 kaddr = kmap(page); 3409 nd_terminate_link(kaddr, dentry->d_inode->i_size, PAGE_SIZE - 1); 3410 return kaddr; 3411 } 3412 3413 int page_readlink(struct dentry *dentry, char __user *buffer, int buflen) 3414 { 3415 struct page *page = NULL; 3416 char *s = page_getlink(dentry, &page); 3417 int res = vfs_readlink(dentry,buffer,buflen,s); 3418 if (page) { 3419 kunmap(page); 3420 page_cache_release(page); 3421 } 3422 return res; 3423 } 3424 3425 void *page_follow_link_light(struct dentry *dentry, struct nameidata *nd) 3426 { 3427 struct page *page = NULL; 3428 nd_set_link(nd, page_getlink(dentry, &page)); 3429 return page; 3430 } 3431 3432 void page_put_link(struct dentry *dentry, struct nameidata *nd, void *cookie) 3433 { 3434 struct page *page = cookie; 3435 3436 if (page) { 3437 kunmap(page); 3438 page_cache_release(page); 3439 } 3440 } 3441 3442 /* 3443 * The nofs argument instructs pagecache_write_begin to pass AOP_FLAG_NOFS 3444 */ 3445 int __page_symlink(struct inode *inode, const char *symname, int len, int nofs) 3446 { 3447 struct address_space *mapping = inode->i_mapping; 3448 struct page *page; 3449 void *fsdata; 3450 int err; 3451 char *kaddr; 3452 unsigned int flags = AOP_FLAG_UNINTERRUPTIBLE; 3453 if (nofs) 3454 flags |= AOP_FLAG_NOFS; 3455 3456 retry: 3457 err = pagecache_write_begin(NULL, mapping, 0, len-1, 3458 flags, &page, &fsdata); 3459 if (err) 3460 goto fail; 3461 3462 kaddr = kmap_atomic(page); 3463 memcpy(kaddr, symname, len-1); 3464 kunmap_atomic(kaddr); 3465 3466 err = pagecache_write_end(NULL, mapping, 0, len-1, len-1, 3467 page, fsdata); 3468 if (err < 0) 3469 goto fail; 3470 if (err < len-1) 3471 goto retry; 3472 3473 mark_inode_dirty(inode); 3474 return 0; 3475 fail: 
3476 return err; 3477 } 3478 3479 int page_symlink(struct inode *inode, const char *symname, int len) 3480 { 3481 return __page_symlink(inode, symname, len, 3482 !(mapping_gfp_mask(inode->i_mapping) & __GFP_FS)); 3483 } 3484 3485 const struct inode_operations page_symlink_inode_operations = { 3486 .readlink = generic_readlink, 3487 .follow_link = page_follow_link_light, 3488 .put_link = page_put_link, 3489 }; 3490 3491 EXPORT_SYMBOL(user_path_at); 3492 EXPORT_SYMBOL(follow_down_one); 3493 EXPORT_SYMBOL(follow_down); 3494 EXPORT_SYMBOL(follow_up); 3495 EXPORT_SYMBOL(get_write_access); /* binfmt_aout */ 3496 EXPORT_SYMBOL(getname); 3497 EXPORT_SYMBOL(lock_rename); 3498 EXPORT_SYMBOL(lookup_one_len); 3499 EXPORT_SYMBOL(page_follow_link_light); 3500 EXPORT_SYMBOL(page_put_link); 3501 EXPORT_SYMBOL(page_readlink); 3502 EXPORT_SYMBOL(__page_symlink); 3503 EXPORT_SYMBOL(page_symlink); 3504 EXPORT_SYMBOL(page_symlink_inode_operations); 3505 EXPORT_SYMBOL(kern_path); 3506 EXPORT_SYMBOL(vfs_path_lookup); 3507 EXPORT_SYMBOL(inode_permission); 3508 EXPORT_SYMBOL(unlock_rename); 3509 EXPORT_SYMBOL(vfs_create); 3510 EXPORT_SYMBOL(vfs_follow_link); 3511 EXPORT_SYMBOL(vfs_link); 3512 EXPORT_SYMBOL(vfs_mkdir); 3513 EXPORT_SYMBOL(vfs_mknod); 3514 EXPORT_SYMBOL(generic_permission); 3515 EXPORT_SYMBOL(vfs_readlink); 3516 EXPORT_SYMBOL(vfs_rename); 3517 EXPORT_SYMBOL(vfs_rmdir); 3518 EXPORT_SYMBOL(vfs_symlink); 3519 EXPORT_SYMBOL(vfs_unlink); 3520 EXPORT_SYMBOL(dentry_unhash); 3521 EXPORT_SYMBOL(generic_readlink); 3522
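
/*
 * Illustrative appendix, not part of namei.c proper: a minimal sketch of
 * how the helpers exported above are commonly combined by in-kernel
 * callers.  The names example_mknod_fifo() and example_rename_prepare()
 * are made up for this sketch, error handling is minimal, LSM hooks are
 * omitted, and the whole block is guarded by #if 0 so it is never built.
 */
#if 0
/* Create a FIFO at a kernel-supplied path, much as sys_mknodat() does. */
static int example_mknod_fifo(int dfd, const char *pathname, umode_t mode)
{
	struct path path;
	struct dentry *dentry;
	int error;

	/* Walks to the parent, takes its i_mutex, returns a negative dentry. */
	dentry = kern_path_create(dfd, pathname, &path, 0);
	if (IS_ERR(dentry))
		return PTR_ERR(dentry);

	error = mnt_want_write(path.mnt);
	if (!error) {
		error = vfs_mknod(path.dentry->d_inode, dentry,
				  (mode & S_IALLUGO) | S_IFIFO, 0);
		mnt_drop_write(path.mnt);
	}
	dput(dentry);
	mutex_unlock(&path.dentry->d_inode->i_mutex);
	path_put(&path);
	return error;
}

/*
 * Lock two parent directories in the order lock_rename() defines; the
 * returned "trap" is the dentry that neither source nor target may be
 * (see sys_renameat() above for the full pattern).
 */
static void example_rename_prepare(struct dentry *old_dir, struct dentry *new_dir)
{
	struct dentry *trap;

	trap = lock_rename(new_dir, old_dir);
	/*
	 * A real caller would now lookup_hash() the source and target and
	 * refuse the rename if either of them equals trap.
	 */
	(void)trap;
	unlock_rename(new_dir, old_dir);
}
#endif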