1 // SPDX-License-Identifier: GPL-2.0 2 /* 3 * linux/fs/namei.c 4 * 5 * Copyright (C) 1991, 1992 Linus Torvalds 6 */ 7 8 /* 9 * Some corrections by tytso. 10 */ 11 12 /* [Feb 1997 T. Schoebel-Theuer] Complete rewrite of the pathname 13 * lookup logic. 14 */ 15 /* [Feb-Apr 2000, AV] Rewrite to the new namespace architecture. 16 */ 17 18 #include <linux/init.h> 19 #include <linux/export.h> 20 #include <linux/slab.h> 21 #include <linux/wordpart.h> 22 #include <linux/fs.h> 23 #include <linux/filelock.h> 24 #include <linux/namei.h> 25 #include <linux/pagemap.h> 26 #include <linux/sched/mm.h> 27 #include <linux/fsnotify.h> 28 #include <linux/personality.h> 29 #include <linux/security.h> 30 #include <linux/syscalls.h> 31 #include <linux/mount.h> 32 #include <linux/audit.h> 33 #include <linux/capability.h> 34 #include <linux/file.h> 35 #include <linux/fcntl.h> 36 #include <linux/device_cgroup.h> 37 #include <linux/fs_struct.h> 38 #include <linux/posix_acl.h> 39 #include <linux/hash.h> 40 #include <linux/bitops.h> 41 #include <linux/init_task.h> 42 #include <linux/uaccess.h> 43 44 #include "internal.h" 45 #include "mount.h" 46 47 /* [Feb-1997 T. Schoebel-Theuer] 48 * Fundamental changes in the pathname lookup mechanisms (namei) 49 * were necessary because of omirr. The reason is that omirr needs 50 * to know the _real_ pathname, not the user-supplied one, in case 51 * of symlinks (and also when transname replacements occur). 52 * 53 * The new code replaces the old recursive symlink resolution with 54 * an iterative one (in case of non-nested symlink chains). It does 55 * this with calls to <fs>_follow_link(). 56 * As a side effect, dir_namei(), _namei() and follow_link() are now 57 * replaced with a single function lookup_dentry() that can handle all 58 * the special cases of the former code. 59 * 60 * With the new dcache, the pathname is stored at each inode, at least as 61 * long as the refcount of the inode is positive. 
As a side effect, the 62 * size of the dcache depends on the inode cache and thus is dynamic. 63 * 64 * [29-Apr-1998 C. Scott Ananian] Updated above description of symlink 65 * resolution to correspond with current state of the code. 66 * 67 * Note that the symlink resolution is not *completely* iterative. 68 * There is still a significant amount of tail- and mid- recursion in 69 * the algorithm. Also, note that <fs>_readlink() is not used in 70 * lookup_dentry(): lookup_dentry() on the result of <fs>_readlink() 71 * may return different results than <fs>_follow_link(). Many virtual 72 * filesystems (including /proc) exhibit this behavior. 73 */ 74 75 /* [24-Feb-97 T. Schoebel-Theuer] Side effects caused by new implementation: 76 * New symlink semantics: when open() is called with flags O_CREAT | O_EXCL 77 * and the name already exists in form of a symlink, try to create the new 78 * name indicated by the symlink. The old code always complained that the 79 * name already exists, due to not following the symlink even if its target 80 * is nonexistent. The new semantics affects also mknod() and link() when 81 * the name is a symlink pointing to a non-existent name. 82 * 83 * I don't know which semantics is the right one, since I have no access 84 * to standards. But I found by trial that HP-UX 9.0 has the full "new" 85 * semantics implemented, while SunOS 4.1.1 and Solaris (SunOS 5.4) have the 86 * "old" one. Personally, I think the new semantics is much more logical. 87 * Note that "ln old new" where "new" is a symlink pointing to a non-existing 88 * file does succeed in both HP-UX and SunOs, but not in Solaris 89 * and in the old Linux semantics. 90 */ 91 92 /* [16-Dec-97 Kevin Buhr] For security reasons, we change some symlink 93 * semantics. See the comments in "open_namei" and "do_link" below. 94 * 95 * [10-Sep-98 Alan Modra] Another symlink change. 96 */ 97 98 /* [Feb-Apr 2000 AV] Complete rewrite. Rules for symlinks: 99 * inside the path - always follow. 
100 * in the last component in creation/removal/renaming - never follow. 101 * if LOOKUP_FOLLOW passed - follow. 102 * if the pathname has trailing slashes - follow. 103 * otherwise - don't follow. 104 * (applied in that order). 105 * 106 * [Jun 2000 AV] Inconsistent behaviour of open() in case if flags==O_CREAT 107 * restored for 2.4. This is the last surviving part of old 4.2BSD bug. 108 * During the 2.4 we need to fix the userland stuff depending on it - 109 * hopefully we will be able to get rid of that wart in 2.5. So far only 110 * XEmacs seems to be relying on it... 111 */ 112 /* 113 * [Sep 2001 AV] Single-semaphore locking scheme (kudos to David Holland) 114 * implemented. Let's see if raised priority of ->s_vfs_rename_mutex gives 115 * any extra contention... 116 */ 117 118 /* In order to reduce some races, while at the same time doing additional 119 * checking and hopefully speeding things up, we copy filenames to the 120 * kernel data space before using them.. 121 * 122 * POSIX.1 2.4: an empty pathname is invalid (ENOENT). 123 * PATH_MAX includes the nul terminator --RR. 124 */ 125 126 #define EMBEDDED_NAME_MAX (PATH_MAX - offsetof(struct filename, iname)) 127 128 static inline void initname(struct filename *name, const char __user *uptr) 129 { 130 name->uptr = uptr; 131 name->aname = NULL; 132 atomic_set(&name->refcnt, 1); 133 } 134 135 struct filename * 136 getname_flags(const char __user *filename, int flags) 137 { 138 struct filename *result; 139 char *kname; 140 int len; 141 142 result = audit_reusename(filename); 143 if (result) 144 return result; 145 146 result = __getname(); 147 if (unlikely(!result)) 148 return ERR_PTR(-ENOMEM); 149 150 /* 151 * First, try to embed the struct filename inside the names_cache 152 * allocation 153 */ 154 kname = (char *)result->iname; 155 result->name = kname; 156 157 len = strncpy_from_user(kname, filename, EMBEDDED_NAME_MAX); 158 /* 159 * Handle both empty path and copy failure in one go. 
160 */ 161 if (unlikely(len <= 0)) { 162 if (unlikely(len < 0)) { 163 __putname(result); 164 return ERR_PTR(len); 165 } 166 167 /* The empty path is special. */ 168 if (!(flags & LOOKUP_EMPTY)) { 169 __putname(result); 170 return ERR_PTR(-ENOENT); 171 } 172 } 173 174 /* 175 * Uh-oh. We have a name that's approaching PATH_MAX. Allocate a 176 * separate struct filename so we can dedicate the entire 177 * names_cache allocation for the pathname, and re-do the copy from 178 * userland. 179 */ 180 if (unlikely(len == EMBEDDED_NAME_MAX)) { 181 const size_t size = offsetof(struct filename, iname[1]); 182 kname = (char *)result; 183 184 /* 185 * size is chosen that way we to guarantee that 186 * result->iname[0] is within the same object and that 187 * kname can't be equal to result->iname, no matter what. 188 */ 189 result = kzalloc(size, GFP_KERNEL); 190 if (unlikely(!result)) { 191 __putname(kname); 192 return ERR_PTR(-ENOMEM); 193 } 194 result->name = kname; 195 len = strncpy_from_user(kname, filename, PATH_MAX); 196 if (unlikely(len < 0)) { 197 __putname(kname); 198 kfree(result); 199 return ERR_PTR(len); 200 } 201 /* The empty path is special. */ 202 if (unlikely(!len) && !(flags & LOOKUP_EMPTY)) { 203 __putname(kname); 204 kfree(result); 205 return ERR_PTR(-ENOENT); 206 } 207 if (unlikely(len == PATH_MAX)) { 208 __putname(kname); 209 kfree(result); 210 return ERR_PTR(-ENAMETOOLONG); 211 } 212 } 213 initname(result, filename); 214 audit_getname(result); 215 return result; 216 } 217 218 struct filename *getname_uflags(const char __user *filename, int uflags) 219 { 220 int flags = (uflags & AT_EMPTY_PATH) ? 
LOOKUP_EMPTY : 0; 221 222 return getname_flags(filename, flags); 223 } 224 225 struct filename *__getname_maybe_null(const char __user *pathname) 226 { 227 struct filename *name; 228 char c; 229 230 /* try to save on allocations; loss on um, though */ 231 if (get_user(c, pathname)) 232 return ERR_PTR(-EFAULT); 233 if (!c) 234 return NULL; 235 236 name = getname_flags(pathname, LOOKUP_EMPTY); 237 if (!IS_ERR(name) && !(name->name[0])) { 238 putname(name); 239 name = NULL; 240 } 241 return name; 242 } 243 244 struct filename *getname_kernel(const char * filename) 245 { 246 struct filename *result; 247 int len = strlen(filename) + 1; 248 249 result = __getname(); 250 if (unlikely(!result)) 251 return ERR_PTR(-ENOMEM); 252 253 if (len <= EMBEDDED_NAME_MAX) { 254 result->name = (char *)result->iname; 255 } else if (len <= PATH_MAX) { 256 const size_t size = offsetof(struct filename, iname[1]); 257 struct filename *tmp; 258 259 tmp = kmalloc(size, GFP_KERNEL); 260 if (unlikely(!tmp)) { 261 __putname(result); 262 return ERR_PTR(-ENOMEM); 263 } 264 tmp->name = (char *)result; 265 result = tmp; 266 } else { 267 __putname(result); 268 return ERR_PTR(-ENAMETOOLONG); 269 } 270 memcpy((char *)result->name, filename, len); 271 initname(result, NULL); 272 audit_getname(result); 273 return result; 274 } 275 EXPORT_SYMBOL(getname_kernel); 276 277 void putname(struct filename *name) 278 { 279 int refcnt; 280 281 if (IS_ERR_OR_NULL(name)) 282 return; 283 284 refcnt = atomic_read(&name->refcnt); 285 if (unlikely(refcnt != 1)) { 286 if (WARN_ON_ONCE(!refcnt)) 287 return; 288 289 if (!atomic_dec_and_test(&name->refcnt)) 290 return; 291 } 292 293 if (unlikely(name->name != name->iname)) { 294 __putname(name->name); 295 kfree(name); 296 } else 297 __putname(name); 298 } 299 EXPORT_SYMBOL(putname); 300 301 /** 302 * check_acl - perform ACL permission checking 303 * @idmap: idmap of the mount the inode was found from 304 * @inode: inode to check permissions on 305 * @mask: right to check 
for (%MAY_READ, %MAY_WRITE, %MAY_EXEC ...) 306 * 307 * This function performs the ACL permission checking. Since this function 308 * retrieve POSIX acls it needs to know whether it is called from a blocking or 309 * non-blocking context and thus cares about the MAY_NOT_BLOCK bit. 310 * 311 * If the inode has been found through an idmapped mount the idmap of 312 * the vfsmount must be passed through @idmap. This function will then take 313 * care to map the inode according to @idmap before checking permissions. 314 * On non-idmapped mounts or if permission checking is to be performed on the 315 * raw inode simply pass @nop_mnt_idmap. 316 */ 317 static int check_acl(struct mnt_idmap *idmap, 318 struct inode *inode, int mask) 319 { 320 #ifdef CONFIG_FS_POSIX_ACL 321 struct posix_acl *acl; 322 323 if (mask & MAY_NOT_BLOCK) { 324 acl = get_cached_acl_rcu(inode, ACL_TYPE_ACCESS); 325 if (!acl) 326 return -EAGAIN; 327 /* no ->get_inode_acl() calls in RCU mode... */ 328 if (is_uncached_acl(acl)) 329 return -ECHILD; 330 return posix_acl_permission(idmap, inode, acl, mask); 331 } 332 333 acl = get_inode_acl(inode, ACL_TYPE_ACCESS); 334 if (IS_ERR(acl)) 335 return PTR_ERR(acl); 336 if (acl) { 337 int error = posix_acl_permission(idmap, inode, acl, mask); 338 posix_acl_release(acl); 339 return error; 340 } 341 #endif 342 343 return -EAGAIN; 344 } 345 346 /* 347 * Very quick optimistic "we know we have no ACL's" check. 348 * 349 * Note that this is purely for ACL_TYPE_ACCESS, and purely 350 * for the "we have cached that there are no ACLs" case. 351 * 352 * If this returns true, we know there are no ACLs. But if 353 * it returns false, we might still not have ACLs (it could 354 * be the is_uncached_acl() case). 
355 */ 356 static inline bool no_acl_inode(struct inode *inode) 357 { 358 #ifdef CONFIG_FS_POSIX_ACL 359 return likely(!READ_ONCE(inode->i_acl)); 360 #else 361 return true; 362 #endif 363 } 364 365 /** 366 * acl_permission_check - perform basic UNIX permission checking 367 * @idmap: idmap of the mount the inode was found from 368 * @inode: inode to check permissions on 369 * @mask: right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC ...) 370 * 371 * This function performs the basic UNIX permission checking. Since this 372 * function may retrieve POSIX acls it needs to know whether it is called from a 373 * blocking or non-blocking context and thus cares about the MAY_NOT_BLOCK bit. 374 * 375 * If the inode has been found through an idmapped mount the idmap of 376 * the vfsmount must be passed through @idmap. This function will then take 377 * care to map the inode according to @idmap before checking permissions. 378 * On non-idmapped mounts or if permission checking is to be performed on the 379 * raw inode simply pass @nop_mnt_idmap. 380 */ 381 static int acl_permission_check(struct mnt_idmap *idmap, 382 struct inode *inode, int mask) 383 { 384 unsigned int mode = inode->i_mode; 385 vfsuid_t vfsuid; 386 387 /* 388 * Common cheap case: everybody has the requested 389 * rights, and there are no ACLs to check. No need 390 * to do any owner/group checks in that case. 391 * 392 * - 'mask&7' is the requested permission bit set 393 * - multiplying by 0111 spreads them out to all of ugo 394 * - '& ~mode' looks for missing inode permission bits 395 * - the '!' is for "no missing permissions" 396 * 397 * After that, we just need to check that there are no 398 * ACL's on the inode - do the 'IS_POSIXACL()' check last 399 * because it will dereference the ->i_sb pointer and we 400 * want to avoid that if at all possible. 
401 */ 402 if (!((mask & 7) * 0111 & ~mode)) { 403 if (no_acl_inode(inode)) 404 return 0; 405 if (!IS_POSIXACL(inode)) 406 return 0; 407 } 408 409 /* Are we the owner? If so, ACL's don't matter */ 410 vfsuid = i_uid_into_vfsuid(idmap, inode); 411 if (likely(vfsuid_eq_kuid(vfsuid, current_fsuid()))) { 412 mask &= 7; 413 mode >>= 6; 414 return (mask & ~mode) ? -EACCES : 0; 415 } 416 417 /* Do we have ACL's? */ 418 if (IS_POSIXACL(inode) && (mode & S_IRWXG)) { 419 int error = check_acl(idmap, inode, mask); 420 if (error != -EAGAIN) 421 return error; 422 } 423 424 /* Only RWX matters for group/other mode bits */ 425 mask &= 7; 426 427 /* 428 * Are the group permissions different from 429 * the other permissions in the bits we care 430 * about? Need to check group ownership if so. 431 */ 432 if (mask & (mode ^ (mode >> 3))) { 433 vfsgid_t vfsgid = i_gid_into_vfsgid(idmap, inode); 434 if (vfsgid_in_group_p(vfsgid)) 435 mode >>= 3; 436 } 437 438 /* Bits in 'mode' clear that we require? */ 439 return (mask & ~mode) ? -EACCES : 0; 440 } 441 442 /** 443 * generic_permission - check for access rights on a Posix-like filesystem 444 * @idmap: idmap of the mount the inode was found from 445 * @inode: inode to check access rights for 446 * @mask: right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC, 447 * %MAY_NOT_BLOCK ...) 448 * 449 * Used to check for read/write/execute permissions on a file. 450 * We use "fsuid" for this, letting us set arbitrary permissions 451 * for filesystem access without changing the "normal" uids which 452 * are used for other things. 453 * 454 * generic_permission is rcu-walk aware. It returns -ECHILD in case an rcu-walk 455 * request cannot be satisfied (eg. requires blocking or too much complexity). 456 * It would then be called again in ref-walk mode. 457 * 458 * If the inode has been found through an idmapped mount the idmap of 459 * the vfsmount must be passed through @idmap. 
This function will then take 460 * care to map the inode according to @idmap before checking permissions. 461 * On non-idmapped mounts or if permission checking is to be performed on the 462 * raw inode simply pass @nop_mnt_idmap. 463 */ 464 int generic_permission(struct mnt_idmap *idmap, struct inode *inode, 465 int mask) 466 { 467 int ret; 468 469 /* 470 * Do the basic permission checks. 471 */ 472 ret = acl_permission_check(idmap, inode, mask); 473 if (ret != -EACCES) 474 return ret; 475 476 if (S_ISDIR(inode->i_mode)) { 477 /* DACs are overridable for directories */ 478 if (!(mask & MAY_WRITE)) 479 if (capable_wrt_inode_uidgid(idmap, inode, 480 CAP_DAC_READ_SEARCH)) 481 return 0; 482 if (capable_wrt_inode_uidgid(idmap, inode, 483 CAP_DAC_OVERRIDE)) 484 return 0; 485 return -EACCES; 486 } 487 488 /* 489 * Searching includes executable on directories, else just read. 490 */ 491 mask &= MAY_READ | MAY_WRITE | MAY_EXEC; 492 if (mask == MAY_READ) 493 if (capable_wrt_inode_uidgid(idmap, inode, 494 CAP_DAC_READ_SEARCH)) 495 return 0; 496 /* 497 * Read/write DACs are always overridable. 498 * Executable DACs are overridable when there is 499 * at least one exec bit set. 500 */ 501 if (!(mask & MAY_EXEC) || (inode->i_mode & S_IXUGO)) 502 if (capable_wrt_inode_uidgid(idmap, inode, 503 CAP_DAC_OVERRIDE)) 504 return 0; 505 506 return -EACCES; 507 } 508 EXPORT_SYMBOL(generic_permission); 509 510 /** 511 * do_inode_permission - UNIX permission checking 512 * @idmap: idmap of the mount the inode was found from 513 * @inode: inode to check permissions on 514 * @mask: right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC ...) 515 * 516 * We _really_ want to just do "generic_permission()" without 517 * even looking at the inode->i_op values. So we keep a cache 518 * flag in inode->i_opflags, that says "this has not special 519 * permission function, use the fast case". 
520 */ 521 static inline int do_inode_permission(struct mnt_idmap *idmap, 522 struct inode *inode, int mask) 523 { 524 if (unlikely(!(inode->i_opflags & IOP_FASTPERM))) { 525 if (likely(inode->i_op->permission)) 526 return inode->i_op->permission(idmap, inode, mask); 527 528 /* This gets set once for the inode lifetime */ 529 spin_lock(&inode->i_lock); 530 inode->i_opflags |= IOP_FASTPERM; 531 spin_unlock(&inode->i_lock); 532 } 533 return generic_permission(idmap, inode, mask); 534 } 535 536 /** 537 * sb_permission - Check superblock-level permissions 538 * @sb: Superblock of inode to check permission on 539 * @inode: Inode to check permission on 540 * @mask: Right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC) 541 * 542 * Separate out file-system wide checks from inode-specific permission checks. 543 * 544 * Note: lookup_inode_permission_may_exec() does not call here. If you add 545 * MAY_EXEC checks, adjust it. 546 */ 547 static int sb_permission(struct super_block *sb, struct inode *inode, int mask) 548 { 549 if (mask & MAY_WRITE) { 550 umode_t mode = inode->i_mode; 551 552 /* Nobody gets write access to a read-only fs. */ 553 if (sb_rdonly(sb) && (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode))) 554 return -EROFS; 555 } 556 return 0; 557 } 558 559 /** 560 * inode_permission - Check for access rights to a given inode 561 * @idmap: idmap of the mount the inode was found from 562 * @inode: Inode to check permission on 563 * @mask: Right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC) 564 * 565 * Check for read/write/execute permissions on an inode. We use fs[ug]id for 566 * this, letting us set arbitrary permissions for filesystem access without 567 * changing the "normal" UIDs which are used for other things. 568 * 569 * When checking for MAY_APPEND, MAY_WRITE must also be set in @mask. 
570 */ 571 int inode_permission(struct mnt_idmap *idmap, 572 struct inode *inode, int mask) 573 { 574 int retval; 575 576 retval = sb_permission(inode->i_sb, inode, mask); 577 if (unlikely(retval)) 578 return retval; 579 580 if (mask & MAY_WRITE) { 581 /* 582 * Nobody gets write access to an immutable file. 583 */ 584 if (unlikely(IS_IMMUTABLE(inode))) 585 return -EPERM; 586 587 /* 588 * Updating mtime will likely cause i_uid and i_gid to be 589 * written back improperly if their true value is unknown 590 * to the vfs. 591 */ 592 if (unlikely(HAS_UNMAPPED_ID(idmap, inode))) 593 return -EACCES; 594 } 595 596 retval = do_inode_permission(idmap, inode, mask); 597 if (unlikely(retval)) 598 return retval; 599 600 retval = devcgroup_inode_permission(inode, mask); 601 if (unlikely(retval)) 602 return retval; 603 604 return security_inode_permission(inode, mask); 605 } 606 EXPORT_SYMBOL(inode_permission); 607 608 /* 609 * lookup_inode_permission_may_exec - Check traversal right for given inode 610 * 611 * This is a special case routine for may_lookup() making assumptions specific 612 * to path traversal. Use inode_permission() if you are doing something else. 613 * 614 * Work is shaved off compared to inode_permission() as follows: 615 * - we know for a fact there is no MAY_WRITE to worry about 616 * - it is an invariant the inode is a directory 617 * 618 * Since majority of real-world traversal happens on inodes which grant it for 619 * everyone, we check it upfront and only resort to more expensive work if it 620 * fails. 621 * 622 * Filesystems which have their own ->permission hook and consequently miss out 623 * on IOP_FASTPERM can still get the optimization if they set IOP_FASTPERM_MAY_EXEC 624 * on their directory inodes. 
625 */ 626 static __always_inline int lookup_inode_permission_may_exec(struct mnt_idmap *idmap, 627 struct inode *inode, int mask) 628 { 629 /* Lookup already checked this to return -ENOTDIR */ 630 VFS_BUG_ON_INODE(!S_ISDIR(inode->i_mode), inode); 631 VFS_BUG_ON((mask & ~MAY_NOT_BLOCK) != 0); 632 633 mask |= MAY_EXEC; 634 635 if (unlikely(!(inode->i_opflags & (IOP_FASTPERM | IOP_FASTPERM_MAY_EXEC)))) 636 return inode_permission(idmap, inode, mask); 637 638 if (unlikely(((inode->i_mode & 0111) != 0111) || !no_acl_inode(inode))) 639 return inode_permission(idmap, inode, mask); 640 641 return security_inode_permission(inode, mask); 642 } 643 644 /** 645 * path_get - get a reference to a path 646 * @path: path to get the reference to 647 * 648 * Given a path increment the reference count to the dentry and the vfsmount. 649 */ 650 void path_get(const struct path *path) 651 { 652 mntget(path->mnt); 653 dget(path->dentry); 654 } 655 EXPORT_SYMBOL(path_get); 656 657 /** 658 * path_put - put a reference to a path 659 * @path: path to put the reference to 660 * 661 * Given a path decrement the reference count to the dentry and the vfsmount. 
662 */ 663 void path_put(const struct path *path) 664 { 665 dput(path->dentry); 666 mntput(path->mnt); 667 } 668 EXPORT_SYMBOL(path_put); 669 670 #define EMBEDDED_LEVELS 2 671 struct nameidata { 672 struct path path; 673 struct qstr last; 674 struct path root; 675 struct inode *inode; /* path.dentry.d_inode */ 676 unsigned int flags, state; 677 unsigned seq, next_seq, m_seq, r_seq; 678 int last_type; 679 unsigned depth; 680 int total_link_count; 681 struct saved { 682 struct path link; 683 struct delayed_call done; 684 const char *name; 685 unsigned seq; 686 } *stack, internal[EMBEDDED_LEVELS]; 687 struct filename *name; 688 const char *pathname; 689 struct nameidata *saved; 690 unsigned root_seq; 691 int dfd; 692 vfsuid_t dir_vfsuid; 693 umode_t dir_mode; 694 } __randomize_layout; 695 696 #define ND_ROOT_PRESET 1 697 #define ND_ROOT_GRABBED 2 698 #define ND_JUMPED 4 699 700 static void __set_nameidata(struct nameidata *p, int dfd, struct filename *name) 701 { 702 struct nameidata *old = current->nameidata; 703 p->stack = p->internal; 704 p->depth = 0; 705 p->dfd = dfd; 706 p->name = name; 707 p->pathname = likely(name) ? name->name : ""; 708 p->path.mnt = NULL; 709 p->path.dentry = NULL; 710 p->total_link_count = old ? 
old->total_link_count : 0; 711 p->saved = old; 712 current->nameidata = p; 713 } 714 715 static inline void set_nameidata(struct nameidata *p, int dfd, struct filename *name, 716 const struct path *root) 717 { 718 __set_nameidata(p, dfd, name); 719 p->state = 0; 720 if (unlikely(root)) { 721 p->state = ND_ROOT_PRESET; 722 p->root = *root; 723 } 724 } 725 726 static void restore_nameidata(void) 727 { 728 struct nameidata *now = current->nameidata, *old = now->saved; 729 730 current->nameidata = old; 731 if (old) 732 old->total_link_count = now->total_link_count; 733 if (now->stack != now->internal) 734 kfree(now->stack); 735 } 736 737 static bool nd_alloc_stack(struct nameidata *nd) 738 { 739 struct saved *p; 740 741 p= kmalloc_array(MAXSYMLINKS, sizeof(struct saved), 742 nd->flags & LOOKUP_RCU ? GFP_ATOMIC : GFP_KERNEL); 743 if (unlikely(!p)) 744 return false; 745 memcpy(p, nd->internal, sizeof(nd->internal)); 746 nd->stack = p; 747 return true; 748 } 749 750 /** 751 * path_connected - Verify that a dentry is below mnt.mnt_root 752 * @mnt: The mountpoint to check. 753 * @dentry: The dentry to check. 754 * 755 * Rename can sometimes move a file or directory outside of a bind 756 * mount, path_connected allows those cases to be detected. 
*/
static bool path_connected(struct vfsmount *mnt, struct dentry *dentry)
{
	struct super_block *sb = mnt->mnt_sb;

	/*
	 * Bind mounts can have disconnected paths.  A mount whose root is
	 * the superblock root is accepted without the is_subdir() walk.
	 */
	if (mnt->mnt_root == sb->s_root)
		return true;

	return is_subdir(dentry, mnt->mnt_root);
}

/*
 * Run and then clear the delayed call of every saved symlink, from the
 * innermost entry (depth - 1) down to entry 0.  nd->depth itself is not
 * modified and no path references are dropped here.
 */
static void drop_links(struct nameidata *nd)
{
	int i = nd->depth;
	while (i--) {
		struct saved *last = nd->stack + i;
		do_delayed_call(&last->done);
		clear_delayed_call(&last->done);
	}
}

/*
 * Leave rcu-walk mode: clear LOOKUP_RCU, zero the cached sequence
 * numbers and release the RCU read lock.
 */
static void leave_rcu(struct nameidata *nd)
{
	nd->flags &= ~LOOKUP_RCU;
	nd->seq = nd->next_seq = 0;
	rcu_read_unlock();
}

/*
 * End a walk and release whatever it holds.
 *
 * The delayed calls of all saved links are run first.  In ref-walk mode
 * the references on nd->path, on every saved link and - if
 * ND_ROOT_GRABBED is set - on nd->root are then dropped; in rcu-walk
 * mode there is nothing to put and we only leave RCU.  Either way
 * nd->depth and nd->path are reset afterwards.
 */
static void terminate_walk(struct nameidata *nd)
{
	if (unlikely(nd->depth))
		drop_links(nd);
	if (!(nd->flags & LOOKUP_RCU)) {
		int i;
		path_put(&nd->path);
		for (i = 0; i < nd->depth; i++)
			path_put(&nd->stack[i].link);
		if (nd->state & ND_ROOT_GRABBED) {
			path_put(&nd->root);
			nd->state &= ~ND_ROOT_GRABBED;
		}
	} else {
		leave_rcu(nd);
	}
	nd->depth = 0;
	nd->path.mnt = NULL;
	nd->path.dentry = NULL;
}

/*
 * Try to turn an RCU-sampled path into a properly referenced one.
 *
 * path_put is needed afterwards regardless of success or failure.
 * To keep that path_put() balanced, the members for which no reference
 * was obtained are set to NULL on the failure paths:
 *  - __legitimize_mnt() returned non-zero: ->dentry is cleared, and
 *    ->mnt as well when the result is positive;
 *  - the dentry reference could not be taken: ->dentry is cleared.
 * If both references were taken the result is decided by rechecking
 * the dentry's d_seq against @seq.
 */
static bool __legitimize_path(struct path *path, unsigned seq, unsigned mseq)
{
	int res = __legitimize_mnt(path->mnt, mseq);
	if (unlikely(res)) {
		if (res > 0)
			path->mnt = NULL;
		path->dentry = NULL;
		return false;
	}
	if (unlikely(!lockref_get_not_dead(&path->dentry->d_lockref))) {
		path->dentry = NULL;
		return false;
	}
	return !read_seqcount_retry(&path->dentry->d_seq, seq);
}

/* __legitimize_path() using the mount sequence recorded in @nd. */
static inline bool legitimize_path(struct nameidata *nd,
			    struct path *path, unsigned seq)
{
	return __legitimize_path(path, seq, nd->m_seq);
}

/*
 * Legitimize every saved symlink on the stack, outermost first.
 *
 * Returns true if all of them were legitimized.  On failure the delayed
 * calls are run via drop_links() and nd->depth is adjusted so that it
 * counts exactly the entries legitimize_path() has been called on:
 *  - 0 with LOOKUP_CACHED, where no attempt is made at all;
 *  - i + 1 when entry i fails, i.e. the failing entry is included.
 */
static bool legitimize_links(struct nameidata *nd)
{
	int i;
	if (unlikely(nd->flags & LOOKUP_CACHED)) {
		drop_links(nd);
		nd->depth = 0;
		return false;
	}
	for (i = 0; i < nd->depth; i++) {
		struct saved *last = nd->stack + i;
		if (unlikely(!legitimize_path(nd, &last->link, last->seq))) {
			drop_links(nd);
			nd->depth = i + 1;
			return false;
		}
	}
	return true;
}

/*
 * Legitimize nd->root if this walk is responsible for it.
 *
 * ND_ROOT_GRABBED is set before the attempt, so it is set on failure
 * as well as on success.
 */
static bool legitimize_root(struct nameidata *nd)
{
	/* Nothing to do if nd->root is zero or is managed by the VFS user. */
	if (!nd->root.mnt || (nd->state & ND_ROOT_PRESET))
		return true;
	nd->state |= ND_ROOT_GRABBED;
	return legitimize_path(nd, &nd->root, nd->root_seq);
}

/*
 * Path walking has 2 modes, rcu-walk and ref-walk (see
 * Documentation/filesystems/path-lookup.txt).  In situations when we can't
 * continue in RCU mode, we attempt to drop out of rcu-walk mode and grab
 * normal reference counts on dentries and vfsmounts to transition to ref-walk
 * mode.  Refcounts are grabbed at the last known good point before rcu-walk
 * got stuck, so ref-walk may continue from there. If this is not successful
 * (eg. a seqcount has changed), then failure is returned and it's up to caller
 * to restart the path walk from the beginning in ref-walk mode.
 */

/**
 * try_to_unlazy - try to switch to ref-walk mode.
 * @nd: nameidata pathwalk data
 * Returns: true on success, false on failure
 *
 * try_to_unlazy attempts to legitimize the current nd->path and nd->root
 * for ref-walk mode.
 * Must be called from rcu-walk context.
 * Nothing should touch nameidata between try_to_unlazy() failure and
 * terminate_walk().
*/
static bool try_to_unlazy(struct nameidata *nd)
{
	struct dentry *parent = nd->path.dentry;

	BUG_ON(!(nd->flags & LOOKUP_RCU));

	/* Saved links first, then the current path, then the root. */
	if (unlikely(nd->depth && !legitimize_links(nd)))
		goto out1;
	if (unlikely(!legitimize_path(nd, &nd->path, nd->seq)))
		goto out;
	if (unlikely(!legitimize_root(nd)))
		goto out;
	leave_rcu(nd);
	BUG_ON(nd->inode != parent->d_inode);
	return true;

out1:
	/*
	 * nd->path was never legitimized: clear it so that the
	 * path_put() in terminate_walk() has nothing to drop.
	 */
	nd->path.mnt = NULL;
	nd->path.dentry = NULL;
out:
	leave_rcu(nd);
	return false;
}

/**
 * try_to_unlazy_next - try to switch to ref-walk mode.
 * @nd: nameidata pathwalk data
 * @dentry: next dentry to step into
 * Returns: true on success, false on failure
 *
 * Similar to try_to_unlazy(), but here we have the next dentry already
 * picked by rcu-walk and want to legitimize that in addition to the current
 * nd->path and nd->root for ref-walk mode.  Must be called from rcu-walk context.
 * Nothing should touch nameidata between try_to_unlazy_next() failure and
 * terminate_walk().
 */
static bool try_to_unlazy_next(struct nameidata *nd, struct dentry *dentry)
{
	int res;
	BUG_ON(!(nd->flags & LOOKUP_RCU));

	if (unlikely(nd->depth && !legitimize_links(nd)))
		goto out2;
	/*
	 * Open-coded counterpart of __legitimize_path() for nd->path: a
	 * positive result clears both ->mnt and ->dentry (out2), any
	 * other failure clears ->dentry only (out1).
	 */
	res = __legitimize_mnt(nd->path.mnt, nd->m_seq);
	if (unlikely(res)) {
		if (res > 0)
			goto out2;
		goto out1;
	}
	if (unlikely(!lockref_get_not_dead(&nd->path.dentry->d_lockref)))
		goto out1;

	/*
	 * We need to move both the parent and the dentry from the RCU domain
	 * to be properly refcounted. And the sequence number in the dentry
	 * validates *both* dentry counters, since we checked the sequence
	 * number of the parent after we got the child sequence number. So we
	 * know the parent must still be valid if the child sequence number is
	 * still valid.
	 */
	if (unlikely(!lockref_get_not_dead(&dentry->d_lockref)))
		goto out;
	if (read_seqcount_retry(&dentry->d_seq, nd->next_seq))
		goto out_dput;
	/*
	 * Sequence counts matched. Now make sure that the root is
	 * still valid and get it if required.
	 */
	if (unlikely(!legitimize_root(nd)))
		goto out_dput;
	leave_rcu(nd);
	return true;

out2:
	nd->path.mnt = NULL;
out1:
	nd->path.dentry = NULL;
out:
	leave_rcu(nd);
	return false;
out_dput:
	/* A reference on @dentry was taken above; drop it once out of RCU. */
	leave_rcu(nd);
	dput(dentry);
	return false;
}

/*
 * Call the dentry's ->d_revalidate() method if DCACHE_OP_REVALIDATE is
 * set in its flags; otherwise report 1 without calling anything.
 */
static inline int d_revalidate(struct inode *dir, const struct qstr *name,
			       struct dentry *dentry, unsigned int flags)
{
	if (unlikely(dentry->d_flags & DCACHE_OP_REVALIDATE))
		return dentry->d_op->d_revalidate(dir, name, dentry, flags);
	else
		return 1;
}

/**
 * complete_walk - successful completion of path walk
 * @nd:  pointer nameidata
 *
 * If we had been in RCU mode, drop out of it and legitimize nd->path.
 * Revalidate the final result, unless we'd already done that during
 * the path walk or the filesystem doesn't ask for it.  Return 0 on
 * success, -error on failure.  In case of failure caller does not
 * need to drop nd->path.
 *
 * Errors produced here: -ECHILD if leaving RCU mode fails, -EXDEV if a
 * scoped lookup ended up outside nd->root, and -ESTALE (or whatever
 * negative value ->d_weak_revalidate() returned) if weak revalidation
 * fails.
 */
static int complete_walk(struct nameidata *nd)
{
	struct dentry *dentry = nd->path.dentry;
	int status;

	if (nd->flags & LOOKUP_RCU) {
		/*
		 * We don't want to zero nd->root for scoped-lookups or
		 * externally-managed nd->root.
		 */
		if (likely(!(nd->state & ND_ROOT_PRESET)))
			if (likely(!(nd->flags & LOOKUP_IS_SCOPED)))
				nd->root.mnt = NULL;
		/*
		 * Cleared before unlazying: with LOOKUP_CACHED still set,
		 * legitimize_links() fails unconditionally.
		 */
		nd->flags &= ~LOOKUP_CACHED;
		if (!try_to_unlazy(nd))
			return -ECHILD;
	}

	if (unlikely(nd->flags & LOOKUP_IS_SCOPED)) {
		/*
		 * While the guarantee of LOOKUP_IS_SCOPED is (roughly) "don't
		 * ever step outside the root during lookup" and should already
		 * be guaranteed by the rest of namei, we want to avoid a namei
		 * BUG resulting in userspace being given a path that was not
		 * scoped within the root at some point during the lookup.
		 *
		 * So, do a final sanity-check to make sure that in the
		 * worst-case scenario (a complete bypass of LOOKUP_IS_SCOPED)
		 * we won't silently return an fd completely outside of the
		 * requested root to userspace.
		 *
		 * Userspace could move the path outside the root after this
		 * check, but as discussed elsewhere this is not a concern (the
		 * resolved file was inside the root at some point).
		 */
		if (!path_is_under(&nd->path, &nd->root))
			return -EXDEV;
	}

	/* Weak revalidation only applies if the walk jumped (ND_JUMPED). */
	if (likely(!(nd->state & ND_JUMPED)))
		return 0;

	if (likely(!(dentry->d_flags & DCACHE_OP_WEAK_REVALIDATE)))
		return 0;

	status = dentry->d_op->d_weak_revalidate(dentry, nd->flags);
	if (status > 0)
		return 0;

	/* A zero return is reported as -ESTALE; negatives pass through. */
	if (!status)
		status = -ESTALE;

	return status;
}

/*
 * Load nd->root from current->fs.
 *
 * In rcu-walk mode the root is sampled without taking references, under
 * the fs->seq retry loop, and the root dentry's sequence number is
 * recorded in nd->root_seq.  In ref-walk mode a reference is taken with
 * get_fs_root() and ND_ROOT_GRABBED is set.
 *
 * Returns 0, or -ENOTRECOVERABLE (with a warning) for a scoped lookup.
 */
static int set_root(struct nameidata *nd)
{
	struct fs_struct *fs = current->fs;

	/*
	 * Jumping to the real root in a scoped-lookup is a BUG in namei, but we
	 * still have to ensure it doesn't happen because it will cause a breakout
	 * from the dirfd.
	 */
	if (WARN_ON(nd->flags & LOOKUP_IS_SCOPED))
		return -ENOTRECOVERABLE;

	if (nd->flags & LOOKUP_RCU) {
		unsigned seq;

		do {
			seq = read_seqbegin(&fs->seq);
			nd->root = fs->root;
			nd->root_seq = __read_seqcount_begin(&nd->root.dentry->d_seq);
		} while (read_seqretry(&fs->seq, seq));
	} else {
		get_fs_root(fs, &nd->root);
		nd->state |= ND_ROOT_GRABBED;
	}
	return 0;
}

/*
 * Move the walk to nd->root, loading the root first if it is not set.
 *
 * Returns 0 and sets ND_JUMPED on success.  Fails with -EXDEV under
 * LOOKUP_BENEATH, or under LOOKUP_NO_XDEV when the current mount differs
 * from the root's; with -ECHILD in rcu-walk mode when the root dentry's
 * sequence number no longer matches; or with the error from set_root().
 */
static int nd_jump_root(struct nameidata *nd)
{
	if (unlikely(nd->flags & LOOKUP_BENEATH))
		return -EXDEV;
	if (unlikely(nd->flags & LOOKUP_NO_XDEV)) {
		/* Absolute path arguments to path_init() are allowed. */
		if (nd->path.mnt != NULL && nd->path.mnt != nd->root.mnt)
			return -EXDEV;
	}
	if (!nd->root.mnt) {
		int error = set_root(nd);
		if (unlikely(error))
			return error;
	}
	if (nd->flags & LOOKUP_RCU) {
		struct dentry *d;
		/* No references in rcu-walk: copy the root and revalidate. */
		nd->path = nd->root;
		d = nd->path.dentry;
		nd->inode = d->d_inode;
		nd->seq = nd->root_seq;
		if (read_seqcount_retry(&d->d_seq, nd->seq))
			return -ECHILD;
	} else {
		/* Swap the reference on the old path for one on the root. */
		path_put(&nd->path);
		nd->path = nd->root;
		path_get(&nd->path);
		nd->inode = nd->path.dentry->d_inode;
	}
	nd->state |= ND_JUMPED;
	return 0;
}

/*
 * Helper to directly jump to a known parsed path from ->get_link,
 * caller must have taken a reference to path beforehand.
 *
 * On success the walk takes over that reference (nd->path becomes
 * *path) and ND_JUMPED is set.  On failure the reference is dropped
 * here and an error is returned: -ELOOP under LOOKUP_NO_MAGICLINKS,
 * -EXDEV if LOOKUP_NO_XDEV is set and the mount differs, or -EXDEV for
 * any scoped lookup.
 */
int nd_jump_link(const struct path *path)
{
	int error = -ELOOP;
	struct nameidata *nd = current->nameidata;

	if (unlikely(nd->flags & LOOKUP_NO_MAGICLINKS))
		goto err;

	error = -EXDEV;
	if (unlikely(nd->flags & LOOKUP_NO_XDEV)) {
		if (nd->path.mnt != path->mnt)
			goto err;
	}
	/* Not currently safe for scoped-lookups. */
	if (unlikely(nd->flags & LOOKUP_IS_SCOPED))
		goto err;

	path_put(&nd->path);
	nd->path = *path;
	nd->inode = nd->path.dentry->d_inode;
	nd->state |= ND_JUMPED;
	return 0;

err:
	path_put(path);
	return error;
}

/*
 * Pop the innermost saved symlink: decrement nd->depth, run the entry's
 * delayed call and, unless in rcu-walk mode, drop the reference on its
 * link path.
 */
static inline void put_link(struct nameidata *nd)
{
	struct saved *last = nd->stack + --nd->depth;
	do_delayed_call(&last->done);
	if (!(nd->flags & LOOKUP_RCU))
		path_put(&last->link);
}

/* Backing variables for the fs.protected_* sysctls registered below. */
static int sysctl_protected_symlinks __read_mostly;
static int sysctl_protected_hardlinks __read_mostly;
static int sysctl_protected_fifos __read_mostly;
static int sysctl_protected_regular __read_mostly;

#ifdef CONFIG_SYSCTL
/*
 * Integer sysctls with mode 0644.  extra1/extra2 give the bounds used
 * by proc_dointvec_minmax: ZERO..ONE for protected_symlinks and
 * protected_hardlinks, ZERO..TWO for protected_fifos and
 * protected_regular.
 */
static const struct ctl_table namei_sysctls[] = {
	{
		.procname	= "protected_symlinks",
		.data		= &sysctl_protected_symlinks,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_minmax,
		.extra1		= SYSCTL_ZERO,
		.extra2		= SYSCTL_ONE,
	},
	{
		.procname	= "protected_hardlinks",
		.data		= &sysctl_protected_hardlinks,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_minmax,
		.extra1		= SYSCTL_ZERO,
		.extra2		= SYSCTL_ONE,
	},
	{
		.procname	= "protected_fifos",
		.data		= &sysctl_protected_fifos,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_minmax,
		.extra1		= SYSCTL_ZERO,
		.extra2		= SYSCTL_TWO,
	},
	{
		.procname	= "protected_regular",
		.data		= &sysctl_protected_regular,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_minmax,
		.extra1		= SYSCTL_ZERO,
		.extra2		= SYSCTL_TWO,
	},
};

/* Register the table above under "fs" at fs_initcall time. */
static int __init init_fs_namei_sysctls(void)
{
	register_sysctl_init("fs", namei_sysctls);
	return 0;
}
fs_initcall(init_fs_namei_sysctls);

#endif /* CONFIG_SYSCTL */

/**
1193 * may_follow_link - Check symlink following for unsafe situations 1194 * @nd: nameidata pathwalk data 1195 * @inode: Used for idmapping. 1196 * 1197 * In the case of the sysctl_protected_symlinks sysctl being enabled, 1198 * CAP_DAC_OVERRIDE needs to be specifically ignored if the symlink is 1199 * in a sticky world-writable directory. This is to protect privileged 1200 * processes from failing races against path names that may change out 1201 * from under them by way of other users creating malicious symlinks. 1202 * It will permit symlinks to be followed only when outside a sticky 1203 * world-writable directory, or when the uid of the symlink and follower 1204 * match, or when the directory owner matches the symlink's owner. 1205 * 1206 * Returns 0 if following the symlink is allowed, -ve on error. 1207 */ 1208 static inline int may_follow_link(struct nameidata *nd, const struct inode *inode) 1209 { 1210 struct mnt_idmap *idmap; 1211 vfsuid_t vfsuid; 1212 1213 if (!sysctl_protected_symlinks) 1214 return 0; 1215 1216 idmap = mnt_idmap(nd->path.mnt); 1217 vfsuid = i_uid_into_vfsuid(idmap, inode); 1218 /* Allowed if owner and follower match. */ 1219 if (vfsuid_eq_kuid(vfsuid, current_fsuid())) 1220 return 0; 1221 1222 /* Allowed if parent directory not sticky and world-writable. */ 1223 if ((nd->dir_mode & (S_ISVTX|S_IWOTH)) != (S_ISVTX|S_IWOTH)) 1224 return 0; 1225 1226 /* Allowed if parent directory and link owner match. 
*/ 1227 if (vfsuid_valid(nd->dir_vfsuid) && vfsuid_eq(nd->dir_vfsuid, vfsuid)) 1228 return 0; 1229 1230 if (nd->flags & LOOKUP_RCU) 1231 return -ECHILD; 1232 1233 audit_inode(nd->name, nd->stack[0].link.dentry, 0); 1234 audit_log_path_denied(AUDIT_ANOM_LINK, "follow_link"); 1235 return -EACCES; 1236 } 1237 1238 /** 1239 * safe_hardlink_source - Check for safe hardlink conditions 1240 * @idmap: idmap of the mount the inode was found from 1241 * @inode: the source inode to hardlink from 1242 * 1243 * Return false if at least one of the following conditions: 1244 * - inode is not a regular file 1245 * - inode is setuid 1246 * - inode is setgid and group-exec 1247 * - access failure for read and write 1248 * 1249 * Otherwise returns true. 1250 */ 1251 static bool safe_hardlink_source(struct mnt_idmap *idmap, 1252 struct inode *inode) 1253 { 1254 umode_t mode = inode->i_mode; 1255 1256 /* Special files should not get pinned to the filesystem. */ 1257 if (!S_ISREG(mode)) 1258 return false; 1259 1260 /* Setuid files should not get pinned to the filesystem. */ 1261 if (mode & S_ISUID) 1262 return false; 1263 1264 /* Executable setgid files should not get pinned to the filesystem. */ 1265 if ((mode & (S_ISGID | S_IXGRP)) == (S_ISGID | S_IXGRP)) 1266 return false; 1267 1268 /* Hardlinking to unreadable or unwritable sources is dangerous. 
*/ 1269 if (inode_permission(idmap, inode, MAY_READ | MAY_WRITE)) 1270 return false; 1271 1272 return true; 1273 } 1274 1275 /** 1276 * may_linkat - Check permissions for creating a hardlink 1277 * @idmap: idmap of the mount the inode was found from 1278 * @link: the source to hardlink from 1279 * 1280 * Block hardlink when all of: 1281 * - sysctl_protected_hardlinks enabled 1282 * - fsuid does not match inode 1283 * - hardlink source is unsafe (see safe_hardlink_source() above) 1284 * - not CAP_FOWNER in a namespace with the inode owner uid mapped 1285 * 1286 * If the inode has been found through an idmapped mount the idmap of 1287 * the vfsmount must be passed through @idmap. This function will then take 1288 * care to map the inode according to @idmap before checking permissions. 1289 * On non-idmapped mounts or if permission checking is to be performed on the 1290 * raw inode simply pass @nop_mnt_idmap. 1291 * 1292 * Returns 0 if successful, -ve on error. 1293 */ 1294 int may_linkat(struct mnt_idmap *idmap, const struct path *link) 1295 { 1296 struct inode *inode = link->dentry->d_inode; 1297 1298 /* Inode writeback is not safe when the uid or gid are invalid. */ 1299 if (!vfsuid_valid(i_uid_into_vfsuid(idmap, inode)) || 1300 !vfsgid_valid(i_gid_into_vfsgid(idmap, inode))) 1301 return -EOVERFLOW; 1302 1303 if (!sysctl_protected_hardlinks) 1304 return 0; 1305 1306 /* Source inode owner (or CAP_FOWNER) can hardlink all they like, 1307 * otherwise, it must be a safe source. 1308 */ 1309 if (safe_hardlink_source(idmap, inode) || 1310 inode_owner_or_capable(idmap, inode)) 1311 return 0; 1312 1313 audit_log_path_denied(AUDIT_ANOM_LINK, "linkat"); 1314 return -EPERM; 1315 } 1316 1317 /** 1318 * may_create_in_sticky - Check whether an O_CREAT open in a sticky directory 1319 * should be allowed, or not, on files that already 1320 * exist. 
1321 * @idmap: idmap of the mount the inode was found from 1322 * @nd: nameidata pathwalk data 1323 * @inode: the inode of the file to open 1324 * 1325 * Block an O_CREAT open of a FIFO (or a regular file) when: 1326 * - sysctl_protected_fifos (or sysctl_protected_regular) is enabled 1327 * - the file already exists 1328 * - we are in a sticky directory 1329 * - we don't own the file 1330 * - the owner of the directory doesn't own the file 1331 * - the directory is world writable 1332 * If the sysctl_protected_fifos (or sysctl_protected_regular) is set to 2 1333 * the directory doesn't have to be world writable: being group writable will 1334 * be enough. 1335 * 1336 * If the inode has been found through an idmapped mount the idmap of 1337 * the vfsmount must be passed through @idmap. This function will then take 1338 * care to map the inode according to @idmap before checking permissions. 1339 * On non-idmapped mounts or if permission checking is to be performed on the 1340 * raw inode simply pass @nop_mnt_idmap. 1341 * 1342 * Returns 0 if the open is allowed, -ve on error. 
1343 */ 1344 static int may_create_in_sticky(struct mnt_idmap *idmap, struct nameidata *nd, 1345 struct inode *const inode) 1346 { 1347 umode_t dir_mode = nd->dir_mode; 1348 vfsuid_t dir_vfsuid = nd->dir_vfsuid, i_vfsuid; 1349 1350 if (likely(!(dir_mode & S_ISVTX))) 1351 return 0; 1352 1353 if (S_ISREG(inode->i_mode) && !sysctl_protected_regular) 1354 return 0; 1355 1356 if (S_ISFIFO(inode->i_mode) && !sysctl_protected_fifos) 1357 return 0; 1358 1359 i_vfsuid = i_uid_into_vfsuid(idmap, inode); 1360 1361 if (vfsuid_eq(i_vfsuid, dir_vfsuid)) 1362 return 0; 1363 1364 if (vfsuid_eq_kuid(i_vfsuid, current_fsuid())) 1365 return 0; 1366 1367 if (likely(dir_mode & 0002)) { 1368 audit_log_path_denied(AUDIT_ANOM_CREAT, "sticky_create"); 1369 return -EACCES; 1370 } 1371 1372 if (dir_mode & 0020) { 1373 if (sysctl_protected_fifos >= 2 && S_ISFIFO(inode->i_mode)) { 1374 audit_log_path_denied(AUDIT_ANOM_CREAT, 1375 "sticky_create_fifo"); 1376 return -EACCES; 1377 } 1378 1379 if (sysctl_protected_regular >= 2 && S_ISREG(inode->i_mode)) { 1380 audit_log_path_denied(AUDIT_ANOM_CREAT, 1381 "sticky_create_regular"); 1382 return -EACCES; 1383 } 1384 } 1385 1386 return 0; 1387 } 1388 1389 /* 1390 * follow_up - Find the mountpoint of path's vfsmount 1391 * 1392 * Given a path, find the mountpoint of its source file system. 1393 * Replace @path with the path of the mountpoint in the parent mount. 1394 * Up is towards /. 1395 * 1396 * Return 1 if we went up a level and 0 if we were already at the 1397 * root. 
 */
int follow_up(struct path *path)
{
	struct mount *mnt = real_mount(path->mnt);
	struct mount *parent;
	struct dentry *mountpoint;

	read_seqlock_excl(&mount_lock);
	parent = mnt->mnt_parent;
	/* A mount that is its own parent has nothing above it. */
	if (parent == mnt) {
		read_sequnlock_excl(&mount_lock);
		return 0;
	}
	/* Pin the parent mount and the mountpoint while mount_lock is held. */
	mntget(&parent->mnt);
	mountpoint = dget(mnt->mnt_mountpoint);
	read_sequnlock_excl(&mount_lock);
	/* Swap the new references into @path, dropping the old ones. */
	dput(path->dentry);
	path->dentry = mountpoint;
	mntput(path->mnt);
	path->mnt = &parent->mnt;
	return 1;
}
EXPORT_SYMBOL(follow_up);

/*
 * Climb the chain of parent mounts starting at @m, looking for a
 * mountpoint to step onto.
 *
 * The climb stops with false when there is no parent left or when the
 * mountpoint reached is exactly @root (same dentry, same parent mount).
 * A mountpoint that is itself the root dentry of the parent mount is
 * skipped and the climb continues from the parent.
 *
 * On success @path is filled in by plain assignment (no references are
 * taken here) and *@seqp receives a d_seq sample of the mountpoint dentry.
 */
static bool choose_mountpoint_rcu(struct mount *m, const struct path *root,
				  struct path *path, unsigned *seqp)
{
	while (mnt_has_parent(m)) {
		struct dentry *mountpoint = m->mnt_mountpoint;

		m = m->mnt_parent;
		if (unlikely(root->dentry == mountpoint &&
			     root->mnt == &m->mnt))
			break;
		if (mountpoint != m->mnt.mnt_root) {
			path->mnt = &m->mnt;
			path->dentry = mountpoint;
			*seqp = read_seqcount_begin(&mountpoint->d_seq);
			return true;
		}
	}
	return false;
}

/*
 * Wrapper around choose_mountpoint_rcu() for callers that are not in
 * RCU-walk mode.  The search runs under rcu_read_lock() and is repeated
 * until either
 *  - nothing was found and mount_lock did not change meanwhile, or
 *  - the path that was found has been legitimized by __legitimize_path().
 * When legitimization fails, @path is put outside the RCU read section
 * and the search starts over.
 *
 * Returns whether a mountpoint was found; @path is only meaningful when
 * the result is true.
 */
static bool choose_mountpoint(struct mount *m, const struct path *root,
			      struct path *path)
{
	bool found;

	rcu_read_lock();
	while (1) {
		unsigned seq, mseq = read_seqbegin(&mount_lock);

		found = choose_mountpoint_rcu(m, root, path, &seq);
		if (unlikely(!found)) {
			if (!read_seqretry(&mount_lock, mseq))
				break;
		} else {
			if (likely(__legitimize_path(path, seq, mseq)))
				break;
			rcu_read_unlock();
			path_put(path);
			rcu_read_lock();
		}
	}
	rcu_read_unlock();
	return found;
}

/*
 * Perform an automount
 * - return -EISDIR to tell follow_managed() to stop and return the path we
 *   were called with.
 */
static int follow_automount(struct path *path, int *count, unsigned lookup_flags)
{
	struct dentry *dentry = path->dentry;

	/* We don't want to mount if someone's just doing a stat -
	 * unless they're stat'ing a directory and appended a '/' to
	 * the name.
	 *
	 * We do, however, want to mount if someone wants to open or
	 * create a file of any type under the mountpoint, wants to
	 * traverse through the mountpoint or wants to open the
	 * mounted directory.  Also, autofs may mark negative dentries
	 * as being automount points.  These will need the attentions
	 * of the daemon to instantiate them before they can be used.
	 */
	if (!(lookup_flags & (LOOKUP_PARENT | LOOKUP_DIRECTORY |
			   LOOKUP_OPEN | LOOKUP_CREATE | LOOKUP_AUTOMOUNT)) &&
	    dentry->d_inode)
		return -EISDIR;

	/* No need to trigger automounts if mountpoint crossing is disabled. */
	if (lookup_flags & LOOKUP_NO_XDEV)
		return -EXDEV;

	/* @count may be NULL (follow_down() passes none): no limit applied then. */
	if (count && (*count)++ >= MAXSYMLINKS)
		return -ELOOP;

	return finish_automount(dentry->d_op->d_automount(path), path);
}

/*
 * mount traversal - out-of-line part.  One note on ->d_flags accesses -
 * dentries are pinned but not locked here, so negative dentry can go
 * positive right under us.  Use of smp_load_acquire() provides a barrier
 * sufficient for ->d_inode and ->d_flags consistency.
 *
 * @flags is the caller's sample of path->dentry->d_flags and is
 * refreshed as the loop moves along.  *@jumped reports whether at least
 * one mount was crossed.  -EISDIR from ->d_manage() or
 * follow_automount() ends the loop and is turned into success.  If the
 * walk otherwise succeeded but the final dentry is negative, -ENOENT is
 * returned.
 */
static int __traverse_mounts(struct path *path, unsigned flags, bool *jumped,
			     int *count, unsigned lookup_flags)
{
	struct vfsmount *mnt = path->mnt;
	bool need_mntput = false;
	int ret = 0;

	while (flags & DCACHE_MANAGED_DENTRY) {
		/* Allow the filesystem to manage the transit without i_rwsem
		 * being held. */
		if (flags & DCACHE_MANAGE_TRANSIT) {
			if (lookup_flags & LOOKUP_NO_XDEV) {
				ret = -EXDEV;
				break;
			}
			ret = path->dentry->d_op->d_manage(path, false);
			flags = smp_load_acquire(&path->dentry->d_flags);
			if (ret < 0)
				break;
		}

		if (flags & DCACHE_MOUNTED) {	// something's mounted on it..
			struct vfsmount *mounted = lookup_mnt(path);
			if (mounted) {		// ... in our namespace
				dput(path->dentry);
				if (need_mntput)
					mntput(path->mnt);
				path->mnt = mounted;
				path->dentry = dget(mounted->mnt_root);
				// here we know it's positive
				flags = path->dentry->d_flags;
				need_mntput = true;
				if (unlikely(lookup_flags & LOOKUP_NO_XDEV)) {
					ret = -EXDEV;
					break;
				}
				continue;
			}
		}

		if (!(flags & DCACHE_NEED_AUTOMOUNT))
			break;

		// uncovered automount point
		ret = follow_automount(path, count, lookup_flags);
		flags = smp_load_acquire(&path->dentry->d_flags);
		if (ret < 0)
			break;
	}

	if (ret == -EISDIR)
		ret = 0;
	// possible if you race with several mount --move
	if (need_mntput && path->mnt == mnt)
		mntput(path->mnt);
	if (!ret && unlikely(d_flags_negative(flags)))
		ret = -ENOENT;
	*jumped = need_mntput;
	return ret;
}

/*
 * Inline front end of __traverse_mounts(): a dentry without any
 * DCACHE_MANAGED_DENTRY bits is handled right here (only the negative
 * dentry check is needed); everything else goes out of line.
 */
static inline int traverse_mounts(struct path *path, bool *jumped,
				  int *count, unsigned lookup_flags)
{
	unsigned flags = smp_load_acquire(&path->dentry->d_flags);

	/* fastpath */
	if (likely(!(flags & DCACHE_MANAGED_DENTRY))) {
		*jumped = false;
		if (unlikely(d_flags_negative(flags)))
			return -ENOENT;
		return 0;
	}
	return __traverse_mounts(path, flags, jumped, count, lookup_flags);
}

/*
 * Step onto the mount found by lookup_mnt() for @path, if there is one.
 * Returns 1 after replacing @path with the root of that mount (the old
 * dentry and mount references are dropped), or 0 with @path untouched.
 */
int follow_down_one(struct path *path)
{
	struct vfsmount *mounted;

	mounted = lookup_mnt(path);
	if (mounted) {
		dput(path->dentry);
		mntput(path->mnt);
		path->mnt = mounted;
		path->dentry = dget(mounted->mnt_root);
		return 1;
	}
	return 0;
}
EXPORT_SYMBOL(follow_down_one);

/*
 * Follow down to the covering mount currently visible to userspace.  At each
 * point, the filesystem owning that dentry may be queried as to whether the
 * caller is permitted to proceed or not.
 *
 * Returns the result of traverse_mounts().  When @path ends up on a
 * different mount, the reference on the mount it started with is dropped.
 */
int follow_down(struct path *path, unsigned int flags)
{
	struct vfsmount *mnt = path->mnt;
	bool jumped;
	int ret = traverse_mounts(path, &jumped, NULL, flags);

	if (path->mnt != mnt)
		mntput(mnt);
	return ret;
}
EXPORT_SYMBOL(follow_down);

/*
 * Try to skip to top of mountpoint pile in rcuwalk mode.  Fail if
 * we meet a managed dentry that would need blocking.
 *
 * Returns true when the walk may carry on in RCU mode with @path (and
 * nd->next_seq) describing where we ended up.  On false the caller must
 * treat *@path and nd->next_seq as clobbered.
 */
static bool __follow_mount_rcu(struct nameidata *nd, struct path *path)
{
	struct dentry *dentry = path->dentry;
	unsigned int flags = dentry->d_flags;

	if (likely(!(flags & DCACHE_MANAGED_DENTRY)))
		return true;

	if (unlikely(nd->flags & LOOKUP_NO_XDEV))
		return false;

	for (;;) {
		/*
		 * Don't forget we might have a non-mountpoint managed dentry
		 * that wants to block transit.
		 */
		if (unlikely(flags & DCACHE_MANAGE_TRANSIT)) {
			int res = dentry->d_op->d_manage(path, true);
			/* Of the non-zero results only -EISDIR counts as success. */
			if (res)
				return res == -EISDIR;
			flags = dentry->d_flags;
		}

		if (flags & DCACHE_MOUNTED) {
			struct mount *mounted = __lookup_mnt(path->mnt, dentry);
			if (mounted) {
				path->mnt = &mounted->mnt;
				dentry = path->dentry = mounted->mnt.mnt_root;
				nd->state |= ND_JUMPED;
				nd->next_seq = read_seqcount_begin(&dentry->d_seq);
				flags = dentry->d_flags;
				// makes sure that non-RCU pathwalk could reach
				// this state.
				if (read_seqretry(&mount_lock, nd->m_seq))
					return false;
				continue;
			}
			if (read_seqretry(&mount_lock, nd->m_seq))
				return false;
		}
		return !(flags & DCACHE_NEED_AUTOMOUNT);
	}
}

/*
 * Point @path at (nd->path.mnt, @dentry) and cross whatever is mounted
 * there.
 *
 * In RCU mode an unmanaged dentry needs nothing more, and a managed one
 * is first tried with __follow_mount_rcu().  If that fails, @path and
 * nd->next_seq are restored, RCU mode is left via try_to_unlazy_next()
 * (-ECHILD if that fails) and the blocking traverse_mounts() path is
 * taken.
 *
 * Returns 0 on success or a negative errno.  On an error from
 * traverse_mounts() the dentry reference in @path is dropped, and so is
 * the mount reference if it differs from nd->path.mnt.
 */
static inline int handle_mounts(struct nameidata *nd, struct dentry *dentry,
				struct path *path)
{
	bool jumped;
	int ret;

	path->mnt = nd->path.mnt;
	path->dentry = dentry;
	if (nd->flags & LOOKUP_RCU) {
		unsigned int seq = nd->next_seq;
		if (likely(!d_managed(dentry)))
			return 0;
		if (likely(__follow_mount_rcu(nd, path)))
			return 0;
		// *path and nd->next_seq might've been clobbered
		path->mnt = nd->path.mnt;
		path->dentry = dentry;
		nd->next_seq = seq;
		if (unlikely(!try_to_unlazy_next(nd, dentry)))
			return -ECHILD;
	}
	ret = traverse_mounts(path, &jumped, &nd->total_link_count, nd->flags);
	if (jumped)
		nd->state |= ND_JUMPED;
	if (unlikely(ret)) {
		dput(path->dentry);
		if (path->mnt != nd->path.mnt)
			mntput(path->mnt);
	}
	return ret;
}

/*
 * This looks up the name in dcache and possibly revalidates the found dentry.
 * NULL is returned if the dentry does not exist in the cache.
 *
 * A dentry that fails revalidation is put and ERR_PTR(error) is returned
 * in its place; when the revalidation result is 0 the dentry is also
 * invalidated first, and the value returned is ERR_PTR(0), i.e. NULL.
 */
static struct dentry *lookup_dcache(const struct qstr *name,
				    struct dentry *dir,
				    unsigned int flags)
{
	struct dentry *dentry = d_lookup(dir, name);
	if (dentry) {
		int error = d_revalidate(dir->d_inode, name, dentry, flags);
		if (unlikely(error <= 0)) {
			if (!error)
				d_invalidate(dentry);
			dput(dentry);
			return ERR_PTR(error);
		}
	}
	return dentry;
}

/*
 * Parent directory has inode locked exclusive.  This is one
 * and only case when ->lookup() gets called on non in-lookup
 * dentries - as the matter of fact, this only gets called
 * when directory is guaranteed to have no in-lookup children
 * at all.
1724 * Will return -ENOENT if name isn't found and LOOKUP_CREATE wasn't passed. 1725 * Will return -EEXIST if name is found and LOOKUP_EXCL was passed. 1726 */ 1727 struct dentry *lookup_one_qstr_excl(const struct qstr *name, 1728 struct dentry *base, unsigned int flags) 1729 { 1730 struct dentry *dentry; 1731 struct dentry *old; 1732 struct inode *dir; 1733 1734 dentry = lookup_dcache(name, base, flags); 1735 if (dentry) 1736 goto found; 1737 1738 /* Don't create child dentry for a dead directory. */ 1739 dir = base->d_inode; 1740 if (unlikely(IS_DEADDIR(dir))) 1741 return ERR_PTR(-ENOENT); 1742 1743 dentry = d_alloc(base, name); 1744 if (unlikely(!dentry)) 1745 return ERR_PTR(-ENOMEM); 1746 1747 old = dir->i_op->lookup(dir, dentry, flags); 1748 if (unlikely(old)) { 1749 dput(dentry); 1750 dentry = old; 1751 } 1752 found: 1753 if (IS_ERR(dentry)) 1754 return dentry; 1755 if (d_is_negative(dentry) && !(flags & LOOKUP_CREATE)) { 1756 dput(dentry); 1757 return ERR_PTR(-ENOENT); 1758 } 1759 if (d_is_positive(dentry) && (flags & LOOKUP_EXCL)) { 1760 dput(dentry); 1761 return ERR_PTR(-EEXIST); 1762 } 1763 return dentry; 1764 } 1765 EXPORT_SYMBOL(lookup_one_qstr_excl); 1766 1767 /** 1768 * lookup_fast - do fast lockless (but racy) lookup of a dentry 1769 * @nd: current nameidata 1770 * 1771 * Do a fast, but racy lookup in the dcache for the given dentry, and 1772 * revalidate it. Returns a valid dentry pointer or NULL if one wasn't 1773 * found. On error, an ERR_PTR will be returned. 1774 * 1775 * If this function returns a valid dentry and the walk is no longer 1776 * lazy, the dentry will carry a reference that must later be put. If 1777 * RCU mode is still in force, then this is not the case and the dentry 1778 * must be legitimized before use. If this returns NULL, then the walk 1779 * will no longer be in RCU mode. 
 */
static struct dentry *lookup_fast(struct nameidata *nd)
{
	struct dentry *dentry, *parent = nd->path.dentry;
	int status = 1;

	/*
	 * Rename seqlock is not required here because in the off chance
	 * of a false negative due to a concurrent rename, the caller is
	 * going to fall back to non-racy lookup.
	 */
	if (nd->flags & LOOKUP_RCU) {
		dentry = __d_lookup_rcu(parent, &nd->last, &nd->next_seq);
		if (unlikely(!dentry)) {
			if (!try_to_unlazy(nd))
				return ERR_PTR(-ECHILD);
			return NULL;
		}

		/*
		 * This sequence count validates that the parent had no
		 * changes while we did the lookup of the dentry above.
		 */
		if (read_seqcount_retry(&parent->d_seq, nd->seq))
			return ERR_PTR(-ECHILD);

		status = d_revalidate(nd->inode, &nd->last, dentry, nd->flags);
		if (likely(status > 0))
			return dentry;
		/* Revalidation did not succeed in RCU mode: leave it, keeping dentry. */
		if (!try_to_unlazy_next(nd, dentry))
			return ERR_PTR(-ECHILD);
		if (status == -ECHILD)
			/* we'd been told to redo it in non-rcu mode */
			status = d_revalidate(nd->inode, &nd->last,
					      dentry, nd->flags);
	} else {
		dentry = __d_lookup(parent, &nd->last);
		if (unlikely(!dentry))
			return NULL;
		status = d_revalidate(nd->inode, &nd->last, dentry, nd->flags);
	}
	/* Only reached outside RCU mode; a zero status also invalidates the dentry. */
	if (unlikely(status <= 0)) {
		if (!status)
			d_invalidate(dentry);
		dput(dentry);
		return ERR_PTR(status);
	}
	return dentry;
}

/*
 * Fast lookup failed, do it the slow way
 *
 * Both callers in this file, lookup_slow() and lookup_slow_killable(),
 * hold the directory's inode lock shared around this call.
 */
static struct dentry *__lookup_slow(const struct qstr *name,
				    struct dentry *dir,
				    unsigned int flags)
{
	struct dentry *dentry, *old;
	struct inode *inode = dir->d_inode;
	DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq);

	/* Don't go there if it's already dead */
	if (unlikely(IS_DEADDIR(inode)))
		return ERR_PTR(-ENOENT);
again:
	dentry = d_alloc_parallel(dir, name, &wq);
	if (IS_ERR(dentry))
		return dentry;
	if (unlikely(!d_in_lookup(dentry))) {
		/* Got an existing dentry rather than a fresh in-lookup one. */
		int error = d_revalidate(inode, name, dentry, flags);
		if (unlikely(error <= 0)) {
			if (!error) {
				/* Stale: invalidate it and start over. */
				d_invalidate(dentry);
				dput(dentry);
				goto again;
			}
			dput(dentry);
			dentry = ERR_PTR(error);
		}
	} else {
		old = inode->i_op->lookup(inode, dentry, flags);
		d_lookup_done(dentry);
		/* A non-NULL result from ->lookup() replaces our dentry. */
		if (unlikely(old)) {
			dput(dentry);
			dentry = old;
		}
	}
	return dentry;
}

/* Run __lookup_slow() with the directory's inode lock held shared. */
static noinline struct dentry *lookup_slow(const struct qstr *name,
				  struct dentry *dir,
				  unsigned int flags)
{
	struct inode *inode = dir->d_inode;
	struct dentry *res;
	inode_lock_shared(inode);
	res = __lookup_slow(name, dir, flags);
	inode_unlock_shared(inode);
	return res;
}

/*
 * Like lookup_slow(), but takes the lock with
 * inode_lock_shared_killable() and returns ERR_PTR(-EINTR) when that
 * call reports failure.
 */
static struct dentry *lookup_slow_killable(const struct qstr *name,
					   struct dentry *dir,
					   unsigned int flags)
{
	struct inode *inode = dir->d_inode;
	struct dentry *res;

	if (inode_lock_shared_killable(inode))
		return ERR_PTR(-EINTR);
	res = __lookup_slow(name, dir, flags);
	inode_unlock_shared(inode);
	return res;
}

/*
 * Run lookup_inode_permission_may_exec() on nd->inode.  In RCU mode the
 * check is made with MAY_NOT_BLOCK; if it fails there, RCU mode is left
 * and, when the failure was -ECHILD, the check is repeated without
 * MAY_NOT_BLOCK.  Returns 0 or a negative errno.
 */
static inline int may_lookup(struct mnt_idmap *idmap,
			     struct nameidata *restrict nd)
{
	int err, mask;

	mask = nd->flags & LOOKUP_RCU ? MAY_NOT_BLOCK : 0;
	err = lookup_inode_permission_may_exec(idmap, nd->inode, mask);
	if (likely(!err))
		return 0;

	// If we failed, and we weren't in LOOKUP_RCU, it's final
	if (!(nd->flags & LOOKUP_RCU))
		return err;

	// Drop out of RCU mode to make sure it wasn't transient
	if (!try_to_unlazy(nd))
		return -ECHILD;	// redo it all non-lazy

	if (err != -ECHILD)	// hard error
		return err;

	return lookup_inode_permission_may_exec(idmap, nd->inode, 0);
}

/*
 * Count one more link against MAXSYMLINKS and make sure the link stack
 * can take another entry.
 *
 * More space is only needed when nd->depth has reached EMBEDDED_LEVELS
 * while nd->stack is still nd->internal.  If nd_alloc_stack() fails in
 * RCU mode, @link is legitimized, RCU mode is left and the allocation is
 * tried once more.
 *
 * Returns 0 on success, -ELOOP when the link count limit is hit, -ECHILD
 * when leaving RCU mode or grabbing @link fails, -ENOMEM otherwise.
 */
static int reserve_stack(struct nameidata *nd, struct path *link)
{
	if (unlikely(nd->total_link_count++ >= MAXSYMLINKS))
		return -ELOOP;

	if (likely(nd->depth != EMBEDDED_LEVELS))
		return 0;
	if (likely(nd->stack != nd->internal))
		return 0;
	if (likely(nd_alloc_stack(nd)))
		return 0;

	if (nd->flags & LOOKUP_RCU) {
		// we need to grab link before we do unlazy.  And we can't skip
		// unlazy even if we fail to grab the link - cleanup needs it
		bool grabbed_link = legitimize_path(nd, link, nd->next_seq);

		if (!try_to_unlazy(nd) || !grabbed_link)
			return -ECHILD;

		if (nd_alloc_stack(nd))
			return 0;
	}
	return -ENOMEM;
}

/* Flag bits for the "flags" argument of the step/walk helpers in this file. */
enum {WALK_TRAILING = 1, WALK_MORE = 2, WALK_NOFOLLOW = 4};

/*
 * Push the symlink @link (with inode @inode) onto the link stack and
 * fetch its body.
 *
 * Returns the string to continue the walk with, NULL when nothing is
 * left to walk (a "pure jump"; the stack entry has been popped again in
 * that case), or an ERR_PTR().  A body starting with '/' resets the walk
 * to the root through nd_jump_root() and the leading slashes are skipped.
 */
static noinline const char *pick_link(struct nameidata *nd, struct path *link,
		     struct inode *inode, int flags)
{
	struct saved *last;
	const char *res;
	int error;

	if (nd->flags & LOOKUP_RCU) {
		/* make sure that d_is_symlink from step_into_slowpath() matches the inode */
		if (read_seqcount_retry(&link->dentry->d_seq, nd->next_seq))
			return ERR_PTR(-ECHILD);
	} else {
		if (link->mnt == nd->path.mnt)
			mntget(link->mnt);
	}

	error = reserve_stack(nd, link);
	if (unlikely(error)) {
		if (!(nd->flags & LOOKUP_RCU))
			path_put(link);
		return ERR_PTR(error);
	}
	last = nd->stack + nd->depth++;
	last->link = *link;
	clear_delayed_call(&last->done);
	last->seq = nd->next_seq;

	/* The protected_symlinks policy is applied to trailing links only. */
	if (flags & WALK_TRAILING) {
		error = may_follow_link(nd, inode);
		if (unlikely(error))
			return ERR_PTR(error);
	}

	if (unlikely(nd->flags & LOOKUP_NO_SYMLINKS) ||
			unlikely(link->mnt->mnt_flags & MNT_NOSYMFOLLOW))
		return ERR_PTR(-ELOOP);

	/* RCU mode is left before touch_atime() is called. */
	if (unlikely(atime_needs_update(&last->link, inode))) {
		if (nd->flags & LOOKUP_RCU) {
			if (!try_to_unlazy(nd))
				return ERR_PTR(-ECHILD);
		}
		touch_atime(&last->link);
		cond_resched();
	}

	error = security_inode_follow_link(link->dentry, inode,
					   nd->flags & LOOKUP_RCU);
	if (unlikely(error))
		return ERR_PTR(error);

	/* Use the cached body in ->i_link if there is one, else ask ->get_link(). */
	res = READ_ONCE(inode->i_link);
	if (!res) {
		const char * (*get)(struct dentry *, struct inode *,
				struct delayed_call *);
		get = inode->i_op->get_link;
		if (nd->flags & LOOKUP_RCU) {
			/* RCU mode passes a NULL dentry; on -ECHILD retry after unlazy. */
			res = get(NULL, inode, &last->done);
			if (res == ERR_PTR(-ECHILD) && try_to_unlazy(nd))
				res = get(link->dentry, inode, &last->done);
		} else {
			res = get(link->dentry, inode, &last->done);
		}
		if (!res)
			goto all_done;
		if (IS_ERR(res))
			return res;
	}
	if (*res == '/') {
		error = nd_jump_root(nd);
		if (unlikely(error))
			return ERR_PTR(error);
		while (unlikely(*++res == '/'))
			;
	}
	if (*res)
		return res;
all_done: // pure jump
	put_link(nd);
	return NULL;
}

/*
 * Do we need to follow links? We _really_ want to be able
 * to do this check without having to look at inode->i_op,
 * so we keep a cache of "no, this doesn't need follow_link"
 * for the common case.
 *
 * NOTE: dentry must be what nd->next_seq had been sampled from.
 */
static noinline const char *step_into_slowpath(struct nameidata *nd, int flags,
		     struct dentry *dentry)
{
	struct path path;
	struct inode *inode;
	int err;

	err = handle_mounts(nd, dentry, &path);
	if (unlikely(err < 0))
		return ERR_PTR(err);
	inode = path.dentry->d_inode;
	if (likely(!d_is_symlink(path.dentry)) ||
	   ((flags & WALK_TRAILING) && !(nd->flags & LOOKUP_FOLLOW)) ||
	   (flags & WALK_NOFOLLOW)) {
		/* not a symlink or should not follow */
		if (nd->flags & LOOKUP_RCU) {
			if (read_seqcount_retry(&path.dentry->d_seq, nd->next_seq))
				return ERR_PTR(-ECHILD);
			if (unlikely(!inode))
				return ERR_PTR(-ENOENT);
		} else {
			/* Drop the old position; the references in "path" take over. */
			dput(nd->path.dentry);
			if (nd->path.mnt != path.mnt)
				mntput(nd->path.mnt);
		}
		nd->path = path;
		nd->inode = inode;
		nd->seq = nd->next_seq;
		return NULL;
	}
	return pick_link(nd, &path, inode, flags);
}

/*
 * Move the walk onto @dentry.  Returns NULL once nd->path/nd->inode
 * describe the new position, an ERR_PTR() on failure, or whatever
 * step_into_slowpath() returns for the cases not handled inline.
 */
static __always_inline const char *step_into(struct nameidata *nd, int flags,
                    struct dentry *dentry)
{
	/*
	 * In the common case we are in rcu-walk and traversing over a non-mounted on
	 * directory (as opposed to e.g., a symlink).
	 *
	 * We can handle that and negative entries with the checks below.
	 */
	if (likely((nd->flags & LOOKUP_RCU) &&
	    !d_managed(dentry) && !d_is_symlink(dentry))) {
		struct inode *inode = dentry->d_inode;
		if (read_seqcount_retry(&dentry->d_seq, nd->next_seq))
			return ERR_PTR(-ECHILD);
		if (unlikely(!inode))
			return ERR_PTR(-ENOENT);
		nd->path.dentry = dentry;
		/* nd->path.mnt is retained on purpose */
		nd->inode = inode;
		nd->seq = nd->next_seq;
		return NULL;
	}
	return step_into_slowpath(nd, flags, dentry);
}

/*
 * RCU-mode handling of "..".  Returns the dentry to step into, with
 * nd->next_seq sampled for it, or ERR_PTR(-ECHILD) whenever the answer
 * cannot be given in RCU mode (including the LOOKUP_NO_XDEV and
 * LOOKUP_BENEATH cases).  At nd->root the current dentry itself is
 * returned.
 */
static struct dentry *follow_dotdot_rcu(struct nameidata *nd)
{
	struct dentry *parent, *old;

	if (path_equal(&nd->path, &nd->root))
		goto in_root;
	if (unlikely(nd->path.dentry == nd->path.mnt->mnt_root)) {
		/* At the root of a mount: move to its mountpoint first. */
		struct path path;
		unsigned seq;
		if (!choose_mountpoint_rcu(real_mount(nd->path.mnt),
					   &nd->root, &path, &seq))
			goto in_root;
		if (unlikely(nd->flags & LOOKUP_NO_XDEV))
			return ERR_PTR(-ECHILD);
		nd->path = path;
		nd->inode = path.dentry->d_inode;
		nd->seq = seq;
		// makes sure that non-RCU pathwalk could reach this state
		if (read_seqretry(&mount_lock, nd->m_seq))
			return ERR_PTR(-ECHILD);
		/* we know that mountpoint was pinned */
	}
	old = nd->path.dentry;
	parent = old->d_parent;
	nd->next_seq = read_seqcount_begin(&parent->d_seq);
	// makes sure that non-RCU pathwalk could reach this state
	if (read_seqcount_retry(&old->d_seq, nd->seq))
		return ERR_PTR(-ECHILD);
	if (unlikely(!path_connected(nd->path.mnt, parent)))
		return ERR_PTR(-ECHILD);
	return parent;
in_root:
	if (read_seqretry(&mount_lock, nd->m_seq))
		return ERR_PTR(-ECHILD);
	if (unlikely(nd->flags & LOOKUP_BENEATH))
		return ERR_PTR(-ECHILD);
	nd->next_seq = nd->seq;
	return nd->path.dentry;
}

/*
 * Non-RCU handling of "..".  Returns a referenced dentry to step into:
 * the parent of the current position (after moving to the mountpoint if
 * we were at the root of a mount), or the current dentry when already at
 * nd->root.  Errors: -EXDEV for LOOKUP_NO_XDEV / LOOKUP_BENEATH
 * violations, -ENOENT when path_connected() fails for the parent.
 */
static struct dentry *follow_dotdot(struct nameidata *nd)
{
	struct dentry *parent;

	if (path_equal(&nd->path, &nd->root))
		goto in_root;
	if (unlikely(nd->path.dentry == nd->path.mnt->mnt_root)) {
		struct path path;

		if (!choose_mountpoint(real_mount(nd->path.mnt),
				       &nd->root, &path))
			goto in_root;
		path_put(&nd->path);
		nd->path = path;
		nd->inode = path.dentry->d_inode;
		if (unlikely(nd->flags & LOOKUP_NO_XDEV))
			return ERR_PTR(-EXDEV);
	}
	/* rare case of legitimate dget_parent()... */
	parent = dget_parent(nd->path.dentry);
	if (unlikely(!path_connected(nd->path.mnt, parent))) {
		dput(parent);
		return ERR_PTR(-ENOENT);
	}
	return parent;

in_root:
	if (unlikely(nd->flags & LOOKUP_BENEATH))
		return ERR_PTR(-EXDEV);
	return dget(nd->path.dentry);
}

/*
 * Handle a "." or ".." component.  Only LAST_DOTDOT needs work: make
 * sure nd->root is set, find the parent in the current walk mode and
 * step into it without following symlinks.  For scoped lookups the mount
 * and rename sequence counts are rechecked afterwards.
 *
 * Returns NULL on success or an ERR_PTR().
 */
static const char *handle_dots(struct nameidata *nd, int type)
{
	if (type == LAST_DOTDOT) {
		const char *error = NULL;
		struct dentry *parent;

		if (!nd->root.mnt) {
			error = ERR_PTR(set_root(nd));
			if (unlikely(error))
				return error;
		}
		if (nd->flags & LOOKUP_RCU)
			parent = follow_dotdot_rcu(nd);
		else
			parent = follow_dotdot(nd);
		if (IS_ERR(parent))
			return ERR_CAST(parent);
		error = step_into(nd, WALK_NOFOLLOW, parent);
		if (unlikely(error))
			return error;

		if (unlikely(nd->flags & LOOKUP_IS_SCOPED)) {
			/*
			 * If there was a racing rename or mount along our
			 * path, then we can't be sure that ".." hasn't jumped
			 * above nd->root (and so userspace should retry or use
			 * some fallback).
			 */
			smp_rmb();
			if (__read_seqcount_retry(&mount_lock.seqcount, nd->m_seq))
				return ERR_PTR(-EAGAIN);
			if (__read_seqcount_retry(&rename_lock.seqcount, nd->r_seq))
				return ERR_PTR(-EAGAIN);
		}
	}
	return NULL;
}

/*
 * Walk one component (nd->last / nd->last_type): dots go to
 * handle_dots(); normal names are looked up with lookup_fast(), falling
 * back to lookup_slow(), and the result is handed to step_into().
 * Unless WALK_MORE is set, a pending link stack entry (nd->depth != 0)
 * is popped first.
 *
 * Returns what handle_dots()/step_into() return, or an ERR_PTR() from
 * the lookup.
 */
static __always_inline const char *walk_component(struct nameidata *nd, int flags)
{
	struct dentry *dentry;
	/*
	 * "." and ".." are special - ".." especially so because it has
	 * to be able to know about the current root directory and
	 * parent relationships.
	 */
	if (unlikely(nd->last_type != LAST_NORM)) {
		if (unlikely(nd->depth) && !(flags & WALK_MORE))
			put_link(nd);
		return handle_dots(nd, nd->last_type);
	}
	dentry = lookup_fast(nd);
	if (IS_ERR(dentry))
		return ERR_CAST(dentry);
	if (unlikely(!dentry)) {
		dentry = lookup_slow(&nd->last, nd->path.dentry, nd->flags);
		if (IS_ERR(dentry))
			return ERR_CAST(dentry);
	}
	if (unlikely(nd->depth) && !(flags & WALK_MORE))
		put_link(nd);
	return step_into(nd, flags, dentry);
}

/*
 * We can do the critical dentry name comparison and hashing
 * operations one word at a time, but we are limited to:
 *
 * - Architectures with fast unaligned word accesses. We could
 *   do a "get_unaligned()" if this helps and is sufficiently
 *   fast.
 *
 * - non-CONFIG_DEBUG_PAGEALLOC configurations (so that we
 *   do not trap on the (extremely unlikely) case of a page
 *   crossing operation).
 *
 * - Furthermore, we need an efficient 64-bit compile for the
 *   64-bit case in order to generate the "number of bytes in
 *   the final mask". Again, that could be replaced with an
 *   efficient population count instruction or similar.
 */
#ifdef CONFIG_DCACHE_WORD_ACCESS

#include <asm/word-at-a-time.h>

#ifdef HASH_MIX

/* Architecture provides HASH_MIX and fold_hash() in <asm/hash.h> */

#elif defined(CONFIG_64BIT)
/*
 * Register pressure in the mixing function is an issue, particularly
 * on 32-bit x86, but almost any function requires one state value and
 * one temporary.  Instead, use a function designed for two state values
 * and no temporaries.
2261 * 2262 * This function cannot create a collision in only two iterations, so 2263 * we have two iterations to achieve avalanche. In those two iterations, 2264 * we have six layers of mixing, which is enough to spread one bit's 2265 * influence out to 2^6 = 64 state bits. 2266 * 2267 * Rotate constants are scored by considering either 64 one-bit input 2268 * deltas or 64*63/2 = 2016 two-bit input deltas, and finding the 2269 * probability of that delta causing a change to each of the 128 output 2270 * bits, using a sample of random initial states. 2271 * 2272 * The Shannon entropy of the computed probabilities is then summed 2273 * to produce a score. Ideally, any input change has a 50% chance of 2274 * toggling any given output bit. 2275 * 2276 * Mixing scores (in bits) for (12,45): 2277 * Input delta: 1-bit 2-bit 2278 * 1 round: 713.3 42542.6 2279 * 2 rounds: 2753.7 140389.8 2280 * 3 rounds: 5954.1 233458.2 2281 * 4 rounds: 7862.6 256672.2 2282 * Perfect: 8192 258048 2283 * (64*128) (64*63/2 * 128) 2284 */ 2285 #define HASH_MIX(x, y, a) \ 2286 ( x ^= (a), \ 2287 y ^= x, x = rol64(x,12),\ 2288 x += y, y = rol64(y,45),\ 2289 y *= 9 ) 2290 2291 /* 2292 * Fold two longs into one 32-bit hash value. This must be fast, but 2293 * latency isn't quite as critical, as there is a fair bit of additional 2294 * work done before the hash value is used. 
2295 */ 2296 static inline unsigned int fold_hash(unsigned long x, unsigned long y) 2297 { 2298 y ^= x * GOLDEN_RATIO_64; 2299 y *= GOLDEN_RATIO_64; 2300 return y >> 32; 2301 } 2302 2303 #else /* 32-bit case */ 2304 2305 /* 2306 * Mixing scores (in bits) for (7,20): 2307 * Input delta: 1-bit 2-bit 2308 * 1 round: 330.3 9201.6 2309 * 2 rounds: 1246.4 25475.4 2310 * 3 rounds: 1907.1 31295.1 2311 * 4 rounds: 2042.3 31718.6 2312 * Perfect: 2048 31744 2313 * (32*64) (32*31/2 * 64) 2314 */ 2315 #define HASH_MIX(x, y, a) \ 2316 ( x ^= (a), \ 2317 y ^= x, x = rol32(x, 7),\ 2318 x += y, y = rol32(y,20),\ 2319 y *= 9 ) 2320 2321 static inline unsigned int fold_hash(unsigned long x, unsigned long y) 2322 { 2323 /* Use arch-optimized multiply if one exists */ 2324 return __hash_32(y ^ __hash_32(x)); 2325 } 2326 2327 #endif 2328 2329 /* 2330 * Return the hash of a string of known length. This is carfully 2331 * designed to match hash_name(), which is the more critical function. 2332 * In particular, we must end by hashing a final word containing 0..7 2333 * payload bytes, to match the way that hash_name() iterates until it 2334 * finds the delimiter after the name. 
2335 */ 2336 unsigned int full_name_hash(const void *salt, const char *name, unsigned int len) 2337 { 2338 unsigned long a, x = 0, y = (unsigned long)salt; 2339 2340 for (;;) { 2341 if (!len) 2342 goto done; 2343 a = load_unaligned_zeropad(name); 2344 if (len < sizeof(unsigned long)) 2345 break; 2346 HASH_MIX(x, y, a); 2347 name += sizeof(unsigned long); 2348 len -= sizeof(unsigned long); 2349 } 2350 x ^= a & bytemask_from_count(len); 2351 done: 2352 return fold_hash(x, y); 2353 } 2354 EXPORT_SYMBOL(full_name_hash); 2355 2356 /* Return the "hash_len" (hash and length) of a null-terminated string */ 2357 u64 hashlen_string(const void *salt, const char *name) 2358 { 2359 unsigned long a = 0, x = 0, y = (unsigned long)salt; 2360 unsigned long adata, mask, len; 2361 const struct word_at_a_time constants = WORD_AT_A_TIME_CONSTANTS; 2362 2363 len = 0; 2364 goto inside; 2365 2366 do { 2367 HASH_MIX(x, y, a); 2368 len += sizeof(unsigned long); 2369 inside: 2370 a = load_unaligned_zeropad(name+len); 2371 } while (!has_zero(a, &adata, &constants)); 2372 2373 adata = prep_zero_mask(a, adata, &constants); 2374 mask = create_zero_mask(adata); 2375 x ^= a & zero_bytemask(mask); 2376 2377 return hashlen_create(fold_hash(x, y), len + find_zero(mask)); 2378 } 2379 EXPORT_SYMBOL(hashlen_string); 2380 2381 /* 2382 * Calculate the length and hash of the path component, and 2383 * return the length as the result. 2384 */ 2385 static inline const char *hash_name(struct nameidata *nd, 2386 const char *name, 2387 unsigned long *lastword) 2388 { 2389 unsigned long a, b, x, y = (unsigned long)nd->path.dentry; 2390 unsigned long adata, bdata, mask, len; 2391 const struct word_at_a_time constants = WORD_AT_A_TIME_CONSTANTS; 2392 2393 /* 2394 * The first iteration is special, because it can result in 2395 * '.' and '..' and has no mixing other than the final fold. 
2396 */ 2397 a = load_unaligned_zeropad(name); 2398 b = a ^ REPEAT_BYTE('/'); 2399 if (has_zero(a, &adata, &constants) | has_zero(b, &bdata, &constants)) { 2400 adata = prep_zero_mask(a, adata, &constants); 2401 bdata = prep_zero_mask(b, bdata, &constants); 2402 mask = create_zero_mask(adata | bdata); 2403 a &= zero_bytemask(mask); 2404 *lastword = a; 2405 len = find_zero(mask); 2406 nd->last.hash = fold_hash(a, y); 2407 nd->last.len = len; 2408 return name + len; 2409 } 2410 2411 len = 0; 2412 x = 0; 2413 do { 2414 HASH_MIX(x, y, a); 2415 len += sizeof(unsigned long); 2416 a = load_unaligned_zeropad(name+len); 2417 b = a ^ REPEAT_BYTE('/'); 2418 } while (!(has_zero(a, &adata, &constants) | has_zero(b, &bdata, &constants))); 2419 2420 adata = prep_zero_mask(a, adata, &constants); 2421 bdata = prep_zero_mask(b, bdata, &constants); 2422 mask = create_zero_mask(adata | bdata); 2423 a &= zero_bytemask(mask); 2424 x ^= a; 2425 len += find_zero(mask); 2426 *lastword = 0; // Multi-word components cannot be DOT or DOTDOT 2427 2428 nd->last.hash = fold_hash(x, y); 2429 nd->last.len = len; 2430 return name + len; 2431 } 2432 2433 /* 2434 * Note that the 'last' word is always zero-masked, but 2435 * was loaded as a possibly big-endian word. 
2436 */ 2437 #ifdef __BIG_ENDIAN 2438 #define LAST_WORD_IS_DOT (0x2eul << (BITS_PER_LONG-8)) 2439 #define LAST_WORD_IS_DOTDOT (0x2e2eul << (BITS_PER_LONG-16)) 2440 #endif 2441 2442 #else /* !CONFIG_DCACHE_WORD_ACCESS: Slow, byte-at-a-time version */ 2443 2444 /* Return the hash of a string of known length */ 2445 unsigned int full_name_hash(const void *salt, const char *name, unsigned int len) 2446 { 2447 unsigned long hash = init_name_hash(salt); 2448 while (len--) 2449 hash = partial_name_hash((unsigned char)*name++, hash); 2450 return end_name_hash(hash); 2451 } 2452 EXPORT_SYMBOL(full_name_hash); 2453 2454 /* Return the "hash_len" (hash and length) of a null-terminated string */ 2455 u64 hashlen_string(const void *salt, const char *name) 2456 { 2457 unsigned long hash = init_name_hash(salt); 2458 unsigned long len = 0, c; 2459 2460 c = (unsigned char)*name; 2461 while (c) { 2462 len++; 2463 hash = partial_name_hash(c, hash); 2464 c = (unsigned char)name[len]; 2465 } 2466 return hashlen_create(end_name_hash(hash), len); 2467 } 2468 EXPORT_SYMBOL(hashlen_string); 2469 2470 /* 2471 * We know there's a real path component here of at least 2472 * one character. 2473 */ 2474 static inline const char *hash_name(struct nameidata *nd, const char *name, unsigned long *lastword) 2475 { 2476 unsigned long hash = init_name_hash(nd->path.dentry); 2477 unsigned long len = 0, c, last = 0; 2478 2479 c = (unsigned char)*name; 2480 do { 2481 last = (last << 8) + c; 2482 len++; 2483 hash = partial_name_hash(c, hash); 2484 c = (unsigned char)name[len]; 2485 } while (c && c != '/'); 2486 2487 // This is reliable for DOT or DOTDOT, since the component 2488 // cannot contain NUL characters - top bits being zero means 2489 // we cannot have had any other pathnames. 
2490 *lastword = last; 2491 nd->last.hash = end_name_hash(hash); 2492 nd->last.len = len; 2493 return name + len; 2494 } 2495 2496 #endif 2497 2498 #ifndef LAST_WORD_IS_DOT 2499 #define LAST_WORD_IS_DOT 0x2e 2500 #define LAST_WORD_IS_DOTDOT 0x2e2e 2501 #endif 2502 2503 /* 2504 * Name resolution. 2505 * This is the basic name resolution function, turning a pathname into 2506 * the final dentry. We expect 'base' to be positive and a directory. 2507 * 2508 * Returns 0 and nd will have valid dentry and mnt on success. 2509 * Returns error and drops reference to input namei data on failure. 2510 */ 2511 static int link_path_walk(const char *name, struct nameidata *nd) 2512 { 2513 int depth = 0; // depth <= nd->depth 2514 int err; 2515 2516 nd->last_type = LAST_ROOT; 2517 nd->flags |= LOOKUP_PARENT; 2518 if (IS_ERR(name)) 2519 return PTR_ERR(name); 2520 if (*name == '/') { 2521 do { 2522 name++; 2523 } while (unlikely(*name == '/')); 2524 } 2525 if (unlikely(!*name)) { 2526 nd->dir_mode = 0; // short-circuit the 'hardening' idiocy 2527 return 0; 2528 } 2529 2530 /* At this point we know we have a real path component. 
*/ 2531 for(;;) { 2532 struct mnt_idmap *idmap; 2533 const char *link; 2534 unsigned long lastword; 2535 2536 idmap = mnt_idmap(nd->path.mnt); 2537 err = may_lookup(idmap, nd); 2538 if (unlikely(err)) 2539 return err; 2540 2541 nd->last.name = name; 2542 name = hash_name(nd, name, &lastword); 2543 2544 switch(lastword) { 2545 case LAST_WORD_IS_DOTDOT: 2546 nd->last_type = LAST_DOTDOT; 2547 nd->state |= ND_JUMPED; 2548 break; 2549 2550 case LAST_WORD_IS_DOT: 2551 nd->last_type = LAST_DOT; 2552 break; 2553 2554 default: 2555 nd->last_type = LAST_NORM; 2556 nd->state &= ~ND_JUMPED; 2557 2558 struct dentry *parent = nd->path.dentry; 2559 if (unlikely(parent->d_flags & DCACHE_OP_HASH)) { 2560 err = parent->d_op->d_hash(parent, &nd->last); 2561 if (err < 0) 2562 return err; 2563 } 2564 } 2565 2566 if (!*name) 2567 goto OK; 2568 /* 2569 * If it wasn't NUL, we know it was '/'. Skip that 2570 * slash, and continue until no more slashes. 2571 */ 2572 do { 2573 name++; 2574 } while (unlikely(*name == '/')); 2575 if (unlikely(!*name)) { 2576 OK: 2577 /* pathname or trailing symlink, done */ 2578 if (likely(!depth)) { 2579 nd->dir_vfsuid = i_uid_into_vfsuid(idmap, nd->inode); 2580 nd->dir_mode = nd->inode->i_mode; 2581 nd->flags &= ~LOOKUP_PARENT; 2582 return 0; 2583 } 2584 /* last component of nested symlink */ 2585 name = nd->stack[--depth].name; 2586 link = walk_component(nd, 0); 2587 } else { 2588 /* not the last component */ 2589 link = walk_component(nd, WALK_MORE); 2590 } 2591 if (unlikely(link)) { 2592 if (IS_ERR(link)) 2593 return PTR_ERR(link); 2594 /* a symlink to follow */ 2595 nd->stack[depth++].name = name; 2596 name = link; 2597 continue; 2598 } 2599 if (unlikely(!d_can_lookup(nd->path.dentry))) { 2600 if (nd->flags & LOOKUP_RCU) { 2601 if (!try_to_unlazy(nd)) 2602 return -ECHILD; 2603 } 2604 return -ENOTDIR; 2605 } 2606 } 2607 } 2608 2609 /* must be paired with terminate_walk() */ 2610 static const char *path_init(struct nameidata *nd, unsigned flags) 2611 { 
2612 int error; 2613 const char *s = nd->pathname; 2614 2615 /* LOOKUP_CACHED requires RCU, ask caller to retry */ 2616 if (unlikely((flags & (LOOKUP_RCU | LOOKUP_CACHED)) == LOOKUP_CACHED)) 2617 return ERR_PTR(-EAGAIN); 2618 2619 if (unlikely(!*s)) 2620 flags &= ~LOOKUP_RCU; 2621 if (flags & LOOKUP_RCU) 2622 rcu_read_lock(); 2623 else 2624 nd->seq = nd->next_seq = 0; 2625 2626 nd->flags = flags; 2627 nd->state |= ND_JUMPED; 2628 2629 nd->m_seq = __read_seqcount_begin(&mount_lock.seqcount); 2630 nd->r_seq = __read_seqcount_begin(&rename_lock.seqcount); 2631 smp_rmb(); 2632 2633 if (unlikely(nd->state & ND_ROOT_PRESET)) { 2634 struct dentry *root = nd->root.dentry; 2635 struct inode *inode = root->d_inode; 2636 if (*s && unlikely(!d_can_lookup(root))) 2637 return ERR_PTR(-ENOTDIR); 2638 nd->path = nd->root; 2639 nd->inode = inode; 2640 if (flags & LOOKUP_RCU) { 2641 nd->seq = read_seqcount_begin(&nd->path.dentry->d_seq); 2642 nd->root_seq = nd->seq; 2643 } else { 2644 path_get(&nd->path); 2645 } 2646 return s; 2647 } 2648 2649 nd->root.mnt = NULL; 2650 2651 /* Absolute pathname -- fetch the root (LOOKUP_IN_ROOT uses nd->dfd). */ 2652 if (*s == '/' && likely(!(flags & LOOKUP_IN_ROOT))) { 2653 error = nd_jump_root(nd); 2654 if (unlikely(error)) 2655 return ERR_PTR(error); 2656 return s; 2657 } 2658 2659 /* Relative pathname -- get the starting-point it is relative to. 
*/ 2660 if (nd->dfd == AT_FDCWD) { 2661 if (flags & LOOKUP_RCU) { 2662 struct fs_struct *fs = current->fs; 2663 unsigned seq; 2664 2665 do { 2666 seq = read_seqbegin(&fs->seq); 2667 nd->path = fs->pwd; 2668 nd->inode = nd->path.dentry->d_inode; 2669 nd->seq = __read_seqcount_begin(&nd->path.dentry->d_seq); 2670 } while (read_seqretry(&fs->seq, seq)); 2671 } else { 2672 get_fs_pwd(current->fs, &nd->path); 2673 nd->inode = nd->path.dentry->d_inode; 2674 } 2675 } else { 2676 /* Caller must check execute permissions on the starting path component */ 2677 CLASS(fd_raw, f)(nd->dfd); 2678 struct dentry *dentry; 2679 2680 if (fd_empty(f)) 2681 return ERR_PTR(-EBADF); 2682 2683 if (flags & LOOKUP_LINKAT_EMPTY) { 2684 if (fd_file(f)->f_cred != current_cred() && 2685 !ns_capable(fd_file(f)->f_cred->user_ns, CAP_DAC_READ_SEARCH)) 2686 return ERR_PTR(-ENOENT); 2687 } 2688 2689 dentry = fd_file(f)->f_path.dentry; 2690 2691 if (*s && unlikely(!d_can_lookup(dentry))) 2692 return ERR_PTR(-ENOTDIR); 2693 2694 nd->path = fd_file(f)->f_path; 2695 if (flags & LOOKUP_RCU) { 2696 nd->inode = nd->path.dentry->d_inode; 2697 nd->seq = read_seqcount_begin(&nd->path.dentry->d_seq); 2698 } else { 2699 path_get(&nd->path); 2700 nd->inode = nd->path.dentry->d_inode; 2701 } 2702 } 2703 2704 /* For scoped-lookups we need to set the root to the dirfd as well. 
*/ 2705 if (unlikely(flags & LOOKUP_IS_SCOPED)) { 2706 nd->root = nd->path; 2707 if (flags & LOOKUP_RCU) { 2708 nd->root_seq = nd->seq; 2709 } else { 2710 path_get(&nd->root); 2711 nd->state |= ND_ROOT_GRABBED; 2712 } 2713 } 2714 return s; 2715 } 2716 2717 static inline const char *lookup_last(struct nameidata *nd) 2718 { 2719 if (nd->last_type == LAST_NORM && nd->last.name[nd->last.len]) 2720 nd->flags |= LOOKUP_FOLLOW | LOOKUP_DIRECTORY; 2721 2722 return walk_component(nd, WALK_TRAILING); 2723 } 2724 2725 static int handle_lookup_down(struct nameidata *nd) 2726 { 2727 if (!(nd->flags & LOOKUP_RCU)) 2728 dget(nd->path.dentry); 2729 nd->next_seq = nd->seq; 2730 return PTR_ERR(step_into(nd, WALK_NOFOLLOW, nd->path.dentry)); 2731 } 2732 2733 /* Returns 0 and nd will be valid on success; Returns error, otherwise. */ 2734 static int path_lookupat(struct nameidata *nd, unsigned flags, struct path *path) 2735 { 2736 const char *s = path_init(nd, flags); 2737 int err; 2738 2739 if (unlikely(flags & LOOKUP_DOWN) && !IS_ERR(s)) { 2740 err = handle_lookup_down(nd); 2741 if (unlikely(err < 0)) 2742 s = ERR_PTR(err); 2743 } 2744 2745 while (!(err = link_path_walk(s, nd)) && 2746 (s = lookup_last(nd)) != NULL) 2747 ; 2748 if (!err && unlikely(nd->flags & LOOKUP_MOUNTPOINT)) { 2749 err = handle_lookup_down(nd); 2750 nd->state &= ~ND_JUMPED; // no d_weak_revalidate(), please... 
2751 } 2752 if (!err) 2753 err = complete_walk(nd); 2754 2755 if (!err && nd->flags & LOOKUP_DIRECTORY) 2756 if (!d_can_lookup(nd->path.dentry)) 2757 err = -ENOTDIR; 2758 if (!err) { 2759 *path = nd->path; 2760 nd->path.mnt = NULL; 2761 nd->path.dentry = NULL; 2762 } 2763 terminate_walk(nd); 2764 return err; 2765 } 2766 2767 int filename_lookup(int dfd, struct filename *name, unsigned flags, 2768 struct path *path, const struct path *root) 2769 { 2770 int retval; 2771 struct nameidata nd; 2772 if (IS_ERR(name)) 2773 return PTR_ERR(name); 2774 set_nameidata(&nd, dfd, name, root); 2775 retval = path_lookupat(&nd, flags | LOOKUP_RCU, path); 2776 if (unlikely(retval == -ECHILD)) 2777 retval = path_lookupat(&nd, flags, path); 2778 if (unlikely(retval == -ESTALE)) 2779 retval = path_lookupat(&nd, flags | LOOKUP_REVAL, path); 2780 2781 if (likely(!retval)) 2782 audit_inode(name, path->dentry, 2783 flags & LOOKUP_MOUNTPOINT ? AUDIT_INODE_NOEVAL : 0); 2784 restore_nameidata(); 2785 return retval; 2786 } 2787 2788 /* Returns 0 and nd will be valid on success; Returns error, otherwise. 
*/ 2789 static int path_parentat(struct nameidata *nd, unsigned flags, 2790 struct path *parent) 2791 { 2792 const char *s = path_init(nd, flags); 2793 int err = link_path_walk(s, nd); 2794 if (!err) 2795 err = complete_walk(nd); 2796 if (!err) { 2797 *parent = nd->path; 2798 nd->path.mnt = NULL; 2799 nd->path.dentry = NULL; 2800 } 2801 terminate_walk(nd); 2802 return err; 2803 } 2804 2805 /* Note: this does not consume "name" */ 2806 static int __filename_parentat(int dfd, struct filename *name, 2807 unsigned int flags, struct path *parent, 2808 struct qstr *last, int *type, 2809 const struct path *root) 2810 { 2811 int retval; 2812 struct nameidata nd; 2813 2814 if (IS_ERR(name)) 2815 return PTR_ERR(name); 2816 set_nameidata(&nd, dfd, name, root); 2817 retval = path_parentat(&nd, flags | LOOKUP_RCU, parent); 2818 if (unlikely(retval == -ECHILD)) 2819 retval = path_parentat(&nd, flags, parent); 2820 if (unlikely(retval == -ESTALE)) 2821 retval = path_parentat(&nd, flags | LOOKUP_REVAL, parent); 2822 if (likely(!retval)) { 2823 *last = nd.last; 2824 *type = nd.last_type; 2825 audit_inode(name, parent->dentry, AUDIT_INODE_PARENT); 2826 } 2827 restore_nameidata(); 2828 return retval; 2829 } 2830 2831 static int filename_parentat(int dfd, struct filename *name, 2832 unsigned int flags, struct path *parent, 2833 struct qstr *last, int *type) 2834 { 2835 return __filename_parentat(dfd, name, flags, parent, last, type, NULL); 2836 } 2837 2838 /** 2839 * start_dirop - begin a create or remove dirop, performing locking and lookup 2840 * @parent: the dentry of the parent in which the operation will occur 2841 * @name: a qstr holding the name within that parent 2842 * @lookup_flags: intent and other lookup flags. 2843 * 2844 * The lookup is performed and necessary locks are taken so that, on success, 2845 * the returned dentry can be operated on safely. 2846 * The qstr must already have the hash value calculated. 2847 * 2848 * Returns: a locked dentry, or an error. 
2849 * 2850 */ 2851 static struct dentry *__start_dirop(struct dentry *parent, struct qstr *name, 2852 unsigned int lookup_flags, 2853 unsigned int state) 2854 { 2855 struct dentry *dentry; 2856 struct inode *dir = d_inode(parent); 2857 2858 if (state == TASK_KILLABLE) { 2859 int ret = down_write_killable_nested(&dir->i_rwsem, 2860 I_MUTEX_PARENT); 2861 if (ret) 2862 return ERR_PTR(ret); 2863 } else { 2864 inode_lock_nested(dir, I_MUTEX_PARENT); 2865 } 2866 dentry = lookup_one_qstr_excl(name, parent, lookup_flags); 2867 if (IS_ERR(dentry)) 2868 inode_unlock(dir); 2869 return dentry; 2870 } 2871 2872 struct dentry *start_dirop(struct dentry *parent, struct qstr *name, 2873 unsigned int lookup_flags) 2874 { 2875 return __start_dirop(parent, name, lookup_flags, TASK_NORMAL); 2876 } 2877 2878 /** 2879 * end_dirop - signal completion of a dirop 2880 * @de: the dentry which was returned by start_dirop or similar. 2881 * 2882 * If the de is an error, nothing happens. Otherwise any lock taken to 2883 * protect the dentry is dropped and the dentry itself is release (dput()). 
2884 */ 2885 void end_dirop(struct dentry *de) 2886 { 2887 if (!IS_ERR(de)) { 2888 inode_unlock(de->d_parent->d_inode); 2889 dput(de); 2890 } 2891 } 2892 EXPORT_SYMBOL(end_dirop); 2893 2894 /* does lookup, returns the object with parent locked */ 2895 static struct dentry *__start_removing_path(int dfd, struct filename *name, 2896 struct path *path) 2897 { 2898 struct path parent_path __free(path_put) = {}; 2899 struct dentry *d; 2900 struct qstr last; 2901 int type, error; 2902 2903 error = filename_parentat(dfd, name, 0, &parent_path, &last, &type); 2904 if (error) 2905 return ERR_PTR(error); 2906 if (unlikely(type != LAST_NORM)) 2907 return ERR_PTR(-EINVAL); 2908 /* don't fail immediately if it's r/o, at least try to report other errors */ 2909 error = mnt_want_write(parent_path.mnt); 2910 d = start_dirop(parent_path.dentry, &last, 0); 2911 if (IS_ERR(d)) 2912 goto drop; 2913 if (error) 2914 goto fail; 2915 path->dentry = no_free_ptr(parent_path.dentry); 2916 path->mnt = no_free_ptr(parent_path.mnt); 2917 return d; 2918 2919 fail: 2920 end_dirop(d); 2921 d = ERR_PTR(error); 2922 drop: 2923 if (!error) 2924 mnt_drop_write(parent_path.mnt); 2925 return d; 2926 } 2927 2928 /** 2929 * kern_path_parent: lookup path returning parent and target 2930 * @name: path name 2931 * @path: path to store parent in 2932 * 2933 * The path @name should end with a normal component, not "." or ".." or "/". 2934 * A lookup is performed and if successful the parent information 2935 * is store in @parent and the dentry is returned. 2936 * 2937 * The dentry maybe negative, the parent will be positive. 2938 * 2939 * Returns: dentry or error. 
2940 */ 2941 struct dentry *kern_path_parent(const char *name, struct path *path) 2942 { 2943 struct path parent_path __free(path_put) = {}; 2944 struct filename *filename __free(putname) = getname_kernel(name); 2945 struct dentry *d; 2946 struct qstr last; 2947 int type, error; 2948 2949 error = filename_parentat(AT_FDCWD, filename, 0, &parent_path, &last, &type); 2950 if (error) 2951 return ERR_PTR(error); 2952 if (unlikely(type != LAST_NORM)) 2953 return ERR_PTR(-EINVAL); 2954 2955 d = lookup_noperm_unlocked(&last, parent_path.dentry); 2956 if (IS_ERR(d)) 2957 return d; 2958 path->dentry = no_free_ptr(parent_path.dentry); 2959 path->mnt = no_free_ptr(parent_path.mnt); 2960 return d; 2961 } 2962 2963 struct dentry *start_removing_path(const char *name, struct path *path) 2964 { 2965 struct filename *filename = getname_kernel(name); 2966 struct dentry *res = __start_removing_path(AT_FDCWD, filename, path); 2967 2968 putname(filename); 2969 return res; 2970 } 2971 2972 struct dentry *start_removing_user_path_at(int dfd, 2973 const char __user *name, 2974 struct path *path) 2975 { 2976 struct filename *filename = getname(name); 2977 struct dentry *res = __start_removing_path(dfd, filename, path); 2978 2979 putname(filename); 2980 return res; 2981 } 2982 EXPORT_SYMBOL(start_removing_user_path_at); 2983 2984 int kern_path(const char *name, unsigned int flags, struct path *path) 2985 { 2986 struct filename *filename = getname_kernel(name); 2987 int ret = filename_lookup(AT_FDCWD, filename, flags, path, NULL); 2988 2989 putname(filename); 2990 return ret; 2991 2992 } 2993 EXPORT_SYMBOL(kern_path); 2994 2995 /** 2996 * vfs_path_parent_lookup - lookup a parent path relative to a dentry-vfsmount pair 2997 * @filename: filename structure 2998 * @flags: lookup flags 2999 * @parent: pointer to struct path to fill 3000 * @last: last component 3001 * @type: type of the last component 3002 * @root: pointer to struct path of the base directory 3003 */ 3004 int 
vfs_path_parent_lookup(struct filename *filename, unsigned int flags, 3005 struct path *parent, struct qstr *last, int *type, 3006 const struct path *root) 3007 { 3008 return __filename_parentat(AT_FDCWD, filename, flags, parent, last, 3009 type, root); 3010 } 3011 EXPORT_SYMBOL(vfs_path_parent_lookup); 3012 3013 /** 3014 * vfs_path_lookup - lookup a file path relative to a dentry-vfsmount pair 3015 * @dentry: pointer to dentry of the base directory 3016 * @mnt: pointer to vfs mount of the base directory 3017 * @name: pointer to file name 3018 * @flags: lookup flags 3019 * @path: pointer to struct path to fill 3020 */ 3021 int vfs_path_lookup(struct dentry *dentry, struct vfsmount *mnt, 3022 const char *name, unsigned int flags, 3023 struct path *path) 3024 { 3025 struct filename *filename; 3026 struct path root = {.mnt = mnt, .dentry = dentry}; 3027 int ret; 3028 3029 filename = getname_kernel(name); 3030 /* the first argument of filename_lookup() is ignored with root */ 3031 ret = filename_lookup(AT_FDCWD, filename, flags, path, &root); 3032 putname(filename); 3033 return ret; 3034 } 3035 EXPORT_SYMBOL(vfs_path_lookup); 3036 3037 int lookup_noperm_common(struct qstr *qname, struct dentry *base) 3038 { 3039 const char *name = qname->name; 3040 u32 len = qname->len; 3041 3042 qname->hash = full_name_hash(base, name, len); 3043 if (!len) 3044 return -EACCES; 3045 3046 if (is_dot_dotdot(name, len)) 3047 return -EACCES; 3048 3049 while (len--) { 3050 unsigned int c = *(const unsigned char *)name++; 3051 if (c == '/' || c == '\0') 3052 return -EACCES; 3053 } 3054 /* 3055 * See if the low-level filesystem might want 3056 * to use its own hash.. 
3057 */ 3058 if (base->d_flags & DCACHE_OP_HASH) { 3059 int err = base->d_op->d_hash(base, qname); 3060 if (err < 0) 3061 return err; 3062 } 3063 return 0; 3064 } 3065 3066 static int lookup_one_common(struct mnt_idmap *idmap, 3067 struct qstr *qname, struct dentry *base) 3068 { 3069 int err; 3070 err = lookup_noperm_common(qname, base); 3071 if (err < 0) 3072 return err; 3073 return inode_permission(idmap, base->d_inode, MAY_EXEC); 3074 } 3075 3076 /** 3077 * try_lookup_noperm - filesystem helper to lookup single pathname component 3078 * @name: qstr storing pathname component to lookup 3079 * @base: base directory to lookup from 3080 * 3081 * Look up a dentry by name in the dcache, returning NULL if it does not 3082 * currently exist. The function does not try to create a dentry and if one 3083 * is found it doesn't try to revalidate it. 3084 * 3085 * Note that this routine is purely a helper for filesystem usage and should 3086 * not be called by generic code. It does no permission checking. 3087 * 3088 * No locks need be held - only a counted reference to @base is needed. 3089 * 3090 */ 3091 struct dentry *try_lookup_noperm(struct qstr *name, struct dentry *base) 3092 { 3093 int err; 3094 3095 err = lookup_noperm_common(name, base); 3096 if (err) 3097 return ERR_PTR(err); 3098 3099 return d_lookup(base, name); 3100 } 3101 EXPORT_SYMBOL(try_lookup_noperm); 3102 3103 /** 3104 * lookup_noperm - filesystem helper to lookup single pathname component 3105 * @name: qstr storing pathname component to lookup 3106 * @base: base directory to lookup from 3107 * 3108 * Note that this routine is purely a helper for filesystem usage and should 3109 * not be called by generic code. It does no permission checking. 3110 * 3111 * The caller must hold base->i_rwsem. 
3112 */ 3113 struct dentry *lookup_noperm(struct qstr *name, struct dentry *base) 3114 { 3115 struct dentry *dentry; 3116 int err; 3117 3118 WARN_ON_ONCE(!inode_is_locked(base->d_inode)); 3119 3120 err = lookup_noperm_common(name, base); 3121 if (err) 3122 return ERR_PTR(err); 3123 3124 dentry = lookup_dcache(name, base, 0); 3125 return dentry ? dentry : __lookup_slow(name, base, 0); 3126 } 3127 EXPORT_SYMBOL(lookup_noperm); 3128 3129 /** 3130 * lookup_one - lookup single pathname component 3131 * @idmap: idmap of the mount the lookup is performed from 3132 * @name: qstr holding pathname component to lookup 3133 * @base: base directory to lookup from 3134 * 3135 * This can be used for in-kernel filesystem clients such as file servers. 3136 * 3137 * The caller must hold base->i_rwsem. 3138 */ 3139 struct dentry *lookup_one(struct mnt_idmap *idmap, struct qstr *name, 3140 struct dentry *base) 3141 { 3142 struct dentry *dentry; 3143 int err; 3144 3145 WARN_ON_ONCE(!inode_is_locked(base->d_inode)); 3146 3147 err = lookup_one_common(idmap, name, base); 3148 if (err) 3149 return ERR_PTR(err); 3150 3151 dentry = lookup_dcache(name, base, 0); 3152 return dentry ? dentry : __lookup_slow(name, base, 0); 3153 } 3154 EXPORT_SYMBOL(lookup_one); 3155 3156 /** 3157 * lookup_one_unlocked - lookup single pathname component 3158 * @idmap: idmap of the mount the lookup is performed from 3159 * @name: qstr olding pathname component to lookup 3160 * @base: base directory to lookup from 3161 * 3162 * This can be used for in-kernel filesystem clients such as file servers. 3163 * 3164 * Unlike lookup_one, it should be called without the parent 3165 * i_rwsem held, and will take the i_rwsem itself if necessary. 
3166 */ 3167 struct dentry *lookup_one_unlocked(struct mnt_idmap *idmap, struct qstr *name, 3168 struct dentry *base) 3169 { 3170 int err; 3171 struct dentry *ret; 3172 3173 err = lookup_one_common(idmap, name, base); 3174 if (err) 3175 return ERR_PTR(err); 3176 3177 ret = lookup_dcache(name, base, 0); 3178 if (!ret) 3179 ret = lookup_slow(name, base, 0); 3180 return ret; 3181 } 3182 EXPORT_SYMBOL(lookup_one_unlocked); 3183 3184 /** 3185 * lookup_one_positive_killable - lookup single pathname component 3186 * @idmap: idmap of the mount the lookup is performed from 3187 * @name: qstr olding pathname component to lookup 3188 * @base: base directory to lookup from 3189 * 3190 * This helper will yield ERR_PTR(-ENOENT) on negatives. The helper returns 3191 * known positive or ERR_PTR(). This is what most of the users want. 3192 * 3193 * Note that pinned negative with unlocked parent _can_ become positive at any 3194 * time, so callers of lookup_one_unlocked() need to be very careful; pinned 3195 * positives have >d_inode stable, so this one avoids such problems. 3196 * 3197 * This can be used for in-kernel filesystem clients such as file servers. 3198 * 3199 * It should be called without the parent i_rwsem held, and will take 3200 * the i_rwsem itself if necessary. If a fatal signal is pending or 3201 * delivered, it will return %-EINTR if the lock is needed. 
3202 */ 3203 struct dentry *lookup_one_positive_killable(struct mnt_idmap *idmap, 3204 struct qstr *name, 3205 struct dentry *base) 3206 { 3207 int err; 3208 struct dentry *ret; 3209 3210 err = lookup_one_common(idmap, name, base); 3211 if (err) 3212 return ERR_PTR(err); 3213 3214 ret = lookup_dcache(name, base, 0); 3215 if (!ret) 3216 ret = lookup_slow_killable(name, base, 0); 3217 if (!IS_ERR(ret) && d_flags_negative(smp_load_acquire(&ret->d_flags))) { 3218 dput(ret); 3219 ret = ERR_PTR(-ENOENT); 3220 } 3221 return ret; 3222 } 3223 EXPORT_SYMBOL(lookup_one_positive_killable); 3224 3225 /** 3226 * lookup_one_positive_unlocked - lookup single pathname component 3227 * @idmap: idmap of the mount the lookup is performed from 3228 * @name: qstr holding pathname component to lookup 3229 * @base: base directory to lookup from 3230 * 3231 * This helper will yield ERR_PTR(-ENOENT) on negatives. The helper returns 3232 * known positive or ERR_PTR(). This is what most of the users want. 3233 * 3234 * Note that pinned negative with unlocked parent _can_ become positive at any 3235 * time, so callers of lookup_one_unlocked() need to be very careful; pinned 3236 * positives have >d_inode stable, so this one avoids such problems. 3237 * 3238 * This can be used for in-kernel filesystem clients such as file servers. 3239 * 3240 * The helper should be called without i_rwsem held. 
3241 */ 3242 struct dentry *lookup_one_positive_unlocked(struct mnt_idmap *idmap, 3243 struct qstr *name, 3244 struct dentry *base) 3245 { 3246 struct dentry *ret = lookup_one_unlocked(idmap, name, base); 3247 3248 if (!IS_ERR(ret) && d_flags_negative(smp_load_acquire(&ret->d_flags))) { 3249 dput(ret); 3250 ret = ERR_PTR(-ENOENT); 3251 } 3252 return ret; 3253 } 3254 EXPORT_SYMBOL(lookup_one_positive_unlocked); 3255 3256 /** 3257 * lookup_noperm_unlocked - filesystem helper to lookup single pathname component 3258 * @name: pathname component to lookup 3259 * @base: base directory to lookup from 3260 * 3261 * Note that this routine is purely a helper for filesystem usage and should 3262 * not be called by generic code. It does no permission checking. 3263 * 3264 * Unlike lookup_noperm(), it should be called without the parent 3265 * i_rwsem held, and will take the i_rwsem itself if necessary. 3266 * 3267 * Unlike try_lookup_noperm() it *does* revalidate the dentry if it already 3268 * existed. 3269 */ 3270 struct dentry *lookup_noperm_unlocked(struct qstr *name, struct dentry *base) 3271 { 3272 struct dentry *ret; 3273 int err; 3274 3275 err = lookup_noperm_common(name, base); 3276 if (err) 3277 return ERR_PTR(err); 3278 3279 ret = lookup_dcache(name, base, 0); 3280 if (!ret) 3281 ret = lookup_slow(name, base, 0); 3282 return ret; 3283 } 3284 EXPORT_SYMBOL(lookup_noperm_unlocked); 3285 3286 /* 3287 * Like lookup_noperm_unlocked(), except that it yields ERR_PTR(-ENOENT) 3288 * on negatives. Returns known positive or ERR_PTR(); that's what 3289 * most of the users want. Note that pinned negative with unlocked parent 3290 * _can_ become positive at any time, so callers of lookup_noperm_unlocked() 3291 * need to be very careful; pinned positives have ->d_inode stable, so 3292 * this one avoids such problems. 
3293 */ 3294 struct dentry *lookup_noperm_positive_unlocked(struct qstr *name, 3295 struct dentry *base) 3296 { 3297 struct dentry *ret; 3298 3299 ret = lookup_noperm_unlocked(name, base); 3300 if (!IS_ERR(ret) && d_flags_negative(smp_load_acquire(&ret->d_flags))) { 3301 dput(ret); 3302 ret = ERR_PTR(-ENOENT); 3303 } 3304 return ret; 3305 } 3306 EXPORT_SYMBOL(lookup_noperm_positive_unlocked); 3307 3308 /** 3309 * start_creating - prepare to create a given name with permission checking 3310 * @idmap: idmap of the mount 3311 * @parent: directory in which to prepare to create the name 3312 * @name: the name to be created 3313 * 3314 * Locks are taken and a lookup is performed prior to creating 3315 * an object in a directory. Permission checking (MAY_EXEC) is performed 3316 * against @idmap. 3317 * 3318 * If the name already exists, a positive dentry is returned, so 3319 * behaviour is similar to O_CREAT without O_EXCL, which doesn't fail 3320 * with -EEXIST. 3321 * 3322 * Returns: a negative or positive dentry, or an error. 3323 */ 3324 struct dentry *start_creating(struct mnt_idmap *idmap, struct dentry *parent, 3325 struct qstr *name) 3326 { 3327 int err = lookup_one_common(idmap, name, parent); 3328 3329 if (err) 3330 return ERR_PTR(err); 3331 return start_dirop(parent, name, LOOKUP_CREATE); 3332 } 3333 EXPORT_SYMBOL(start_creating); 3334 3335 /** 3336 * start_removing - prepare to remove a given name with permission checking 3337 * @idmap: idmap of the mount 3338 * @parent: directory in which to find the name 3339 * @name: the name to be removed 3340 * 3341 * Locks are taken and a lookup in performed prior to removing 3342 * an object from a directory. Permission checking (MAY_EXEC) is performed 3343 * against @idmap. 3344 * 3345 * If the name doesn't exist, an error is returned. 3346 * 3347 * end_removing() should be called when removal is complete, or aborted. 3348 * 3349 * Returns: a positive dentry, or an error. 
3350 */ 3351 struct dentry *start_removing(struct mnt_idmap *idmap, struct dentry *parent, 3352 struct qstr *name) 3353 { 3354 int err = lookup_one_common(idmap, name, parent); 3355 3356 if (err) 3357 return ERR_PTR(err); 3358 return start_dirop(parent, name, 0); 3359 } 3360 EXPORT_SYMBOL(start_removing); 3361 3362 /** 3363 * start_creating_killable - prepare to create a given name with permission checking 3364 * @idmap: idmap of the mount 3365 * @parent: directory in which to prepare to create the name 3366 * @name: the name to be created 3367 * 3368 * Locks are taken and a lookup in performed prior to creating 3369 * an object in a directory. Permission checking (MAY_EXEC) is performed 3370 * against @idmap. 3371 * 3372 * If the name already exists, a positive dentry is returned. 3373 * 3374 * If a signal is received or was already pending, the function aborts 3375 * with -EINTR; 3376 * 3377 * Returns: a negative or positive dentry, or an error. 3378 */ 3379 struct dentry *start_creating_killable(struct mnt_idmap *idmap, 3380 struct dentry *parent, 3381 struct qstr *name) 3382 { 3383 int err = lookup_one_common(idmap, name, parent); 3384 3385 if (err) 3386 return ERR_PTR(err); 3387 return __start_dirop(parent, name, LOOKUP_CREATE, TASK_KILLABLE); 3388 } 3389 EXPORT_SYMBOL(start_creating_killable); 3390 3391 /** 3392 * start_removing_killable - prepare to remove a given name with permission checking 3393 * @idmap: idmap of the mount 3394 * @parent: directory in which to find the name 3395 * @name: the name to be removed 3396 * 3397 * Locks are taken and a lookup in performed prior to removing 3398 * an object from a directory. Permission checking (MAY_EXEC) is performed 3399 * against @idmap. 3400 * 3401 * If the name doesn't exist, an error is returned. 3402 * 3403 * end_removing() should be called when removal is complete, or aborted. 
3404 * 3405 * If a signal is received or was already pending, the function aborts 3406 * with -EINTR; 3407 * 3408 * Returns: a positive dentry, or an error. 3409 */ 3410 struct dentry *start_removing_killable(struct mnt_idmap *idmap, 3411 struct dentry *parent, 3412 struct qstr *name) 3413 { 3414 int err = lookup_one_common(idmap, name, parent); 3415 3416 if (err) 3417 return ERR_PTR(err); 3418 return __start_dirop(parent, name, 0, TASK_KILLABLE); 3419 } 3420 EXPORT_SYMBOL(start_removing_killable); 3421 3422 /** 3423 * start_creating_noperm - prepare to create a given name without permission checking 3424 * @parent: directory in which to prepare to create the name 3425 * @name: the name to be created 3426 * 3427 * Locks are taken and a lookup in performed prior to creating 3428 * an object in a directory. 3429 * 3430 * If the name already exists, a positive dentry is returned. 3431 * 3432 * Returns: a negative or positive dentry, or an error. 3433 */ 3434 struct dentry *start_creating_noperm(struct dentry *parent, 3435 struct qstr *name) 3436 { 3437 int err = lookup_noperm_common(name, parent); 3438 3439 if (err) 3440 return ERR_PTR(err); 3441 return start_dirop(parent, name, LOOKUP_CREATE); 3442 } 3443 EXPORT_SYMBOL(start_creating_noperm); 3444 3445 /** 3446 * start_removing_noperm - prepare to remove a given name without permission checking 3447 * @parent: directory in which to find the name 3448 * @name: the name to be removed 3449 * 3450 * Locks are taken and a lookup in performed prior to removing 3451 * an object from a directory. 3452 * 3453 * If the name doesn't exist, an error is returned. 3454 * 3455 * end_removing() should be called when removal is complete, or aborted. 3456 * 3457 * Returns: a positive dentry, or an error. 
3458 */ 3459 struct dentry *start_removing_noperm(struct dentry *parent, 3460 struct qstr *name) 3461 { 3462 int err = lookup_noperm_common(name, parent); 3463 3464 if (err) 3465 return ERR_PTR(err); 3466 return start_dirop(parent, name, 0); 3467 } 3468 EXPORT_SYMBOL(start_removing_noperm); 3469 3470 /** 3471 * start_creating_dentry - prepare to create a given dentry 3472 * @parent: directory from which dentry should be removed 3473 * @child: the dentry to be removed 3474 * 3475 * A lock is taken to protect the dentry again other dirops and 3476 * the validity of the dentry is checked: correct parent and still hashed. 3477 * 3478 * If the dentry is valid and negative a reference is taken and 3479 * returned. If not an error is returned. 3480 * 3481 * end_creating() should be called when creation is complete, or aborted. 3482 * 3483 * Returns: the valid dentry, or an error. 3484 */ 3485 struct dentry *start_creating_dentry(struct dentry *parent, 3486 struct dentry *child) 3487 { 3488 inode_lock_nested(parent->d_inode, I_MUTEX_PARENT); 3489 if (unlikely(IS_DEADDIR(parent->d_inode) || 3490 child->d_parent != parent || 3491 d_unhashed(child))) { 3492 inode_unlock(parent->d_inode); 3493 return ERR_PTR(-EINVAL); 3494 } 3495 if (d_is_positive(child)) { 3496 inode_unlock(parent->d_inode); 3497 return ERR_PTR(-EEXIST); 3498 } 3499 return dget(child); 3500 } 3501 EXPORT_SYMBOL(start_creating_dentry); 3502 3503 /** 3504 * start_removing_dentry - prepare to remove a given dentry 3505 * @parent: directory from which dentry should be removed 3506 * @child: the dentry to be removed 3507 * 3508 * A lock is taken to protect the dentry again other dirops and 3509 * the validity of the dentry is checked: correct parent and still hashed. 3510 * 3511 * If the dentry is valid and positive, a reference is taken and 3512 * returned. If not an error is returned. 3513 * 3514 * end_removing() should be called when removal is complete, or aborted. 
3515 * 3516 * Returns: the valid dentry, or an error. 3517 */ 3518 struct dentry *start_removing_dentry(struct dentry *parent, 3519 struct dentry *child) 3520 { 3521 inode_lock_nested(parent->d_inode, I_MUTEX_PARENT); 3522 if (unlikely(IS_DEADDIR(parent->d_inode) || 3523 child->d_parent != parent || 3524 d_unhashed(child))) { 3525 inode_unlock(parent->d_inode); 3526 return ERR_PTR(-EINVAL); 3527 } 3528 if (d_is_negative(child)) { 3529 inode_unlock(parent->d_inode); 3530 return ERR_PTR(-ENOENT); 3531 } 3532 return dget(child); 3533 } 3534 EXPORT_SYMBOL(start_removing_dentry); 3535 3536 #ifdef CONFIG_UNIX98_PTYS 3537 int path_pts(struct path *path) 3538 { 3539 /* Find something mounted on "pts" in the same directory as 3540 * the input path. 3541 */ 3542 struct dentry *parent = dget_parent(path->dentry); 3543 struct dentry *child; 3544 struct qstr this = QSTR_INIT("pts", 3); 3545 3546 if (unlikely(!path_connected(path->mnt, parent))) { 3547 dput(parent); 3548 return -ENOENT; 3549 } 3550 dput(path->dentry); 3551 path->dentry = parent; 3552 child = d_hash_and_lookup(parent, &this); 3553 if (IS_ERR_OR_NULL(child)) 3554 return -ENOENT; 3555 3556 path->dentry = child; 3557 dput(parent); 3558 follow_down(path, 0); 3559 return 0; 3560 } 3561 #endif 3562 3563 int user_path_at(int dfd, const char __user *name, unsigned flags, 3564 struct path *path) 3565 { 3566 struct filename *filename = getname_flags(name, flags); 3567 int ret = filename_lookup(dfd, filename, flags, path, NULL); 3568 3569 putname(filename); 3570 return ret; 3571 } 3572 EXPORT_SYMBOL(user_path_at); 3573 3574 int __check_sticky(struct mnt_idmap *idmap, struct inode *dir, 3575 struct inode *inode) 3576 { 3577 kuid_t fsuid = current_fsuid(); 3578 3579 if (vfsuid_eq_kuid(i_uid_into_vfsuid(idmap, inode), fsuid)) 3580 return 0; 3581 if (vfsuid_eq_kuid(i_uid_into_vfsuid(idmap, dir), fsuid)) 3582 return 0; 3583 return !capable_wrt_inode_uidgid(idmap, inode, CAP_FOWNER); 3584 } 3585 EXPORT_SYMBOL(__check_sticky); 
3586 3587 /* 3588 * Check whether we can remove a link victim from directory dir, check 3589 * whether the type of victim is right. 3590 * 1. We can't do it if dir is read-only (done in permission()) 3591 * 2. We should have write and exec permissions on dir 3592 * 3. We can't remove anything from append-only dir 3593 * 4. We can't do anything with immutable dir (done in permission()) 3594 * 5. If the sticky bit on dir is set we should either 3595 * a. be owner of dir, or 3596 * b. be owner of victim, or 3597 * c. have CAP_FOWNER capability 3598 * 6. If the victim is append-only or immutable we can't do antyhing with 3599 * links pointing to it. 3600 * 7. If the victim has an unknown uid or gid we can't change the inode. 3601 * 8. If we were asked to remove a directory and victim isn't one - ENOTDIR. 3602 * 9. If we were asked to remove a non-directory and victim isn't one - EISDIR. 3603 * 10. We can't remove a root or mountpoint. 3604 * 11. We don't allow removal of NFS sillyrenamed files; it's handled by 3605 * nfs_async_unlink(). 3606 */ 3607 static int may_delete(struct mnt_idmap *idmap, struct inode *dir, 3608 struct dentry *victim, bool isdir) 3609 { 3610 struct inode *inode = d_backing_inode(victim); 3611 int error; 3612 3613 if (d_is_negative(victim)) 3614 return -ENOENT; 3615 BUG_ON(!inode); 3616 3617 BUG_ON(victim->d_parent->d_inode != dir); 3618 3619 /* Inode writeback is not safe when the uid or gid are invalid. 
*/ 3620 if (!vfsuid_valid(i_uid_into_vfsuid(idmap, inode)) || 3621 !vfsgid_valid(i_gid_into_vfsgid(idmap, inode))) 3622 return -EOVERFLOW; 3623 3624 audit_inode_child(dir, victim, AUDIT_TYPE_CHILD_DELETE); 3625 3626 error = inode_permission(idmap, dir, MAY_WRITE | MAY_EXEC); 3627 if (error) 3628 return error; 3629 if (IS_APPEND(dir)) 3630 return -EPERM; 3631 3632 if (check_sticky(idmap, dir, inode) || IS_APPEND(inode) || 3633 IS_IMMUTABLE(inode) || IS_SWAPFILE(inode) || 3634 HAS_UNMAPPED_ID(idmap, inode)) 3635 return -EPERM; 3636 if (isdir) { 3637 if (!d_is_dir(victim)) 3638 return -ENOTDIR; 3639 if (IS_ROOT(victim)) 3640 return -EBUSY; 3641 } else if (d_is_dir(victim)) 3642 return -EISDIR; 3643 if (IS_DEADDIR(dir)) 3644 return -ENOENT; 3645 if (victim->d_flags & DCACHE_NFSFS_RENAMED) 3646 return -EBUSY; 3647 return 0; 3648 } 3649 3650 /* Check whether we can create an object with dentry child in directory 3651 * dir. 3652 * 1. We can't do it if child already exists (open has special treatment for 3653 * this case, but since we are inlined it's OK) 3654 * 2. We can't do it if dir is read-only (done in permission()) 3655 * 3. We can't do it if the fs can't represent the fsuid or fsgid. 3656 * 4. We should have write and exec permissions on dir 3657 * 5. 
We can't do it if dir is immutable (done in permission()) 3658 */ 3659 static inline int may_create(struct mnt_idmap *idmap, 3660 struct inode *dir, struct dentry *child) 3661 { 3662 audit_inode_child(dir, child, AUDIT_TYPE_CHILD_CREATE); 3663 if (child->d_inode) 3664 return -EEXIST; 3665 if (IS_DEADDIR(dir)) 3666 return -ENOENT; 3667 if (!fsuidgid_has_mapping(dir->i_sb, idmap)) 3668 return -EOVERFLOW; 3669 3670 return inode_permission(idmap, dir, MAY_WRITE | MAY_EXEC); 3671 } 3672 3673 // p1 != p2, both are on the same filesystem, ->s_vfs_rename_mutex is held 3674 static struct dentry *lock_two_directories(struct dentry *p1, struct dentry *p2) 3675 { 3676 struct dentry *p = p1, *q = p2, *r; 3677 3678 while ((r = p->d_parent) != p2 && r != p) 3679 p = r; 3680 if (r == p2) { 3681 // p is a child of p2 and an ancestor of p1 or p1 itself 3682 inode_lock_nested(p2->d_inode, I_MUTEX_PARENT); 3683 inode_lock_nested(p1->d_inode, I_MUTEX_PARENT2); 3684 return p; 3685 } 3686 // p is the root of connected component that contains p1 3687 // p2 does not occur on the path from p to p1 3688 while ((r = q->d_parent) != p1 && r != p && r != q) 3689 q = r; 3690 if (r == p1) { 3691 // q is a child of p1 and an ancestor of p2 or p2 itself 3692 inode_lock_nested(p1->d_inode, I_MUTEX_PARENT); 3693 inode_lock_nested(p2->d_inode, I_MUTEX_PARENT2); 3694 return q; 3695 } else if (likely(r == p)) { 3696 // both p2 and p1 are descendents of p 3697 inode_lock_nested(p1->d_inode, I_MUTEX_PARENT); 3698 inode_lock_nested(p2->d_inode, I_MUTEX_PARENT2); 3699 return NULL; 3700 } else { // no common ancestor at the time we'd been called 3701 mutex_unlock(&p1->d_sb->s_vfs_rename_mutex); 3702 return ERR_PTR(-EXDEV); 3703 } 3704 } 3705 3706 /* 3707 * p1 and p2 should be directories on the same fs. 
3708 */ 3709 struct dentry *lock_rename(struct dentry *p1, struct dentry *p2) 3710 { 3711 if (p1 == p2) { 3712 inode_lock_nested(p1->d_inode, I_MUTEX_PARENT); 3713 return NULL; 3714 } 3715 3716 mutex_lock(&p1->d_sb->s_vfs_rename_mutex); 3717 return lock_two_directories(p1, p2); 3718 } 3719 EXPORT_SYMBOL(lock_rename); 3720 3721 /* 3722 * c1 and p2 should be on the same fs. 3723 */ 3724 struct dentry *lock_rename_child(struct dentry *c1, struct dentry *p2) 3725 { 3726 if (READ_ONCE(c1->d_parent) == p2) { 3727 /* 3728 * hopefully won't need to touch ->s_vfs_rename_mutex at all. 3729 */ 3730 inode_lock_nested(p2->d_inode, I_MUTEX_PARENT); 3731 /* 3732 * now that p2 is locked, nobody can move in or out of it, 3733 * so the test below is safe. 3734 */ 3735 if (likely(c1->d_parent == p2)) 3736 return NULL; 3737 3738 /* 3739 * c1 got moved out of p2 while we'd been taking locks; 3740 * unlock and fall back to slow case. 3741 */ 3742 inode_unlock(p2->d_inode); 3743 } 3744 3745 mutex_lock(&c1->d_sb->s_vfs_rename_mutex); 3746 /* 3747 * nobody can move out of any directories on this fs. 3748 */ 3749 if (likely(c1->d_parent != p2)) 3750 return lock_two_directories(c1->d_parent, p2); 3751 3752 /* 3753 * c1 got moved into p2 while we were taking locks; 3754 * we need p2 locked and ->s_vfs_rename_mutex unlocked, 3755 * for consistency with lock_rename(). 
3756 */ 3757 inode_lock_nested(p2->d_inode, I_MUTEX_PARENT); 3758 mutex_unlock(&c1->d_sb->s_vfs_rename_mutex); 3759 return NULL; 3760 } 3761 EXPORT_SYMBOL(lock_rename_child); 3762 3763 void unlock_rename(struct dentry *p1, struct dentry *p2) 3764 { 3765 inode_unlock(p1->d_inode); 3766 if (p1 != p2) { 3767 inode_unlock(p2->d_inode); 3768 mutex_unlock(&p1->d_sb->s_vfs_rename_mutex); 3769 } 3770 } 3771 EXPORT_SYMBOL(unlock_rename); 3772 3773 /** 3774 * __start_renaming - lookup and lock names for rename 3775 * @rd: rename data containing parents and flags, and 3776 * for receiving found dentries 3777 * @lookup_flags: extra flags to pass to ->lookup (e.g. LOOKUP_REVAL, 3778 * LOOKUP_NO_SYMLINKS etc). 3779 * @old_last: name of object in @rd.old_parent 3780 * @new_last: name of object in @rd.new_parent 3781 * 3782 * Look up two names and ensure locks are in place for 3783 * rename. 3784 * 3785 * On success the found dentries are stored in @rd.old_dentry, 3786 * @rd.new_dentry and an extra ref is taken on @rd.old_parent. 3787 * These references and the lock are dropped by end_renaming(). 3788 * 3789 * The passed in qstrs must have the hash calculated, and no permission 3790 * checking is performed. 3791 * 3792 * Returns: zero or an error. 
3793 */ 3794 static int 3795 __start_renaming(struct renamedata *rd, int lookup_flags, 3796 struct qstr *old_last, struct qstr *new_last) 3797 { 3798 struct dentry *trap; 3799 struct dentry *d1, *d2; 3800 int target_flags = LOOKUP_RENAME_TARGET | LOOKUP_CREATE; 3801 int err; 3802 3803 if (rd->flags & RENAME_EXCHANGE) 3804 target_flags = 0; 3805 if (rd->flags & RENAME_NOREPLACE) 3806 target_flags |= LOOKUP_EXCL; 3807 3808 trap = lock_rename(rd->old_parent, rd->new_parent); 3809 if (IS_ERR(trap)) 3810 return PTR_ERR(trap); 3811 3812 d1 = lookup_one_qstr_excl(old_last, rd->old_parent, 3813 lookup_flags); 3814 err = PTR_ERR(d1); 3815 if (IS_ERR(d1)) 3816 goto out_unlock; 3817 3818 d2 = lookup_one_qstr_excl(new_last, rd->new_parent, 3819 lookup_flags | target_flags); 3820 err = PTR_ERR(d2); 3821 if (IS_ERR(d2)) 3822 goto out_dput_d1; 3823 3824 if (d1 == trap) { 3825 /* source is an ancestor of target */ 3826 err = -EINVAL; 3827 goto out_dput_d2; 3828 } 3829 3830 if (d2 == trap) { 3831 /* target is an ancestor of source */ 3832 if (rd->flags & RENAME_EXCHANGE) 3833 err = -EINVAL; 3834 else 3835 err = -ENOTEMPTY; 3836 goto out_dput_d2; 3837 } 3838 3839 rd->old_dentry = d1; 3840 rd->new_dentry = d2; 3841 dget(rd->old_parent); 3842 return 0; 3843 3844 out_dput_d2: 3845 dput(d2); 3846 out_dput_d1: 3847 dput(d1); 3848 out_unlock: 3849 unlock_rename(rd->old_parent, rd->new_parent); 3850 return err; 3851 } 3852 3853 /** 3854 * start_renaming - lookup and lock names for rename with permission checking 3855 * @rd: rename data containing parents and flags, and 3856 * for receiving found dentries 3857 * @lookup_flags: extra flags to pass to ->lookup (e.g. LOOKUP_REVAL, 3858 * LOOKUP_NO_SYMLINKS etc). 3859 * @old_last: name of object in @rd.old_parent 3860 * @new_last: name of object in @rd.new_parent 3861 * 3862 * Look up two names and ensure locks are in place for 3863 * rename. 3864 * 3865 * On success the found dentries are stored in @rd.old_dentry, 3866 * @rd.new_dentry. 
Also the refcount on @rd->old_parent is increased. 3867 * These references and the lock are dropped by end_renaming(). 3868 * 3869 * The passed in qstrs need not have the hash calculated, and basic 3870 * eXecute permission checking is performed against @rd.mnt_idmap. 3871 * 3872 * Returns: zero or an error. 3873 */ 3874 int start_renaming(struct renamedata *rd, int lookup_flags, 3875 struct qstr *old_last, struct qstr *new_last) 3876 { 3877 int err; 3878 3879 err = lookup_one_common(rd->mnt_idmap, old_last, rd->old_parent); 3880 if (err) 3881 return err; 3882 err = lookup_one_common(rd->mnt_idmap, new_last, rd->new_parent); 3883 if (err) 3884 return err; 3885 return __start_renaming(rd, lookup_flags, old_last, new_last); 3886 } 3887 EXPORT_SYMBOL(start_renaming); 3888 3889 static int 3890 __start_renaming_dentry(struct renamedata *rd, int lookup_flags, 3891 struct dentry *old_dentry, struct qstr *new_last) 3892 { 3893 struct dentry *trap; 3894 struct dentry *d2; 3895 int target_flags = LOOKUP_RENAME_TARGET | LOOKUP_CREATE; 3896 int err; 3897 3898 if (rd->flags & RENAME_EXCHANGE) 3899 target_flags = 0; 3900 if (rd->flags & RENAME_NOREPLACE) 3901 target_flags |= LOOKUP_EXCL; 3902 3903 /* Already have the dentry - need to be sure to lock the correct parent */ 3904 trap = lock_rename_child(old_dentry, rd->new_parent); 3905 if (IS_ERR(trap)) 3906 return PTR_ERR(trap); 3907 if (d_unhashed(old_dentry) || 3908 (rd->old_parent && rd->old_parent != old_dentry->d_parent)) { 3909 /* dentry was removed, or moved and explicit parent requested */ 3910 err = -EINVAL; 3911 goto out_unlock; 3912 } 3913 3914 d2 = lookup_one_qstr_excl(new_last, rd->new_parent, 3915 lookup_flags | target_flags); 3916 err = PTR_ERR(d2); 3917 if (IS_ERR(d2)) 3918 goto out_unlock; 3919 3920 if (old_dentry == trap) { 3921 /* source is an ancestor of target */ 3922 err = -EINVAL; 3923 goto out_dput_d2; 3924 } 3925 3926 if (d2 == trap) { 3927 /* target is an ancestor of source */ 3928 if (rd->flags & 
RENAME_EXCHANGE) 3929 err = -EINVAL; 3930 else 3931 err = -ENOTEMPTY; 3932 goto out_dput_d2; 3933 } 3934 3935 rd->old_dentry = dget(old_dentry); 3936 rd->new_dentry = d2; 3937 rd->old_parent = dget(old_dentry->d_parent); 3938 return 0; 3939 3940 out_dput_d2: 3941 dput(d2); 3942 out_unlock: 3943 unlock_rename(old_dentry->d_parent, rd->new_parent); 3944 return err; 3945 } 3946 3947 /** 3948 * start_renaming_dentry - lookup and lock name for rename with permission checking 3949 * @rd: rename data containing parents and flags, and 3950 * for receiving found dentries 3951 * @lookup_flags: extra flags to pass to ->lookup (e.g. LOOKUP_REVAL, 3952 * LOOKUP_NO_SYMLINKS etc). 3953 * @old_dentry: dentry of name to move 3954 * @new_last: name of target in @rd.new_parent 3955 * 3956 * Look up target name and ensure locks are in place for 3957 * rename. 3958 * 3959 * On success the found dentry is stored in @rd.new_dentry and 3960 * @rd.old_parent is confirmed to be the parent of @old_dentry. If it 3961 * was originally %NULL, it is set. In either case a reference is taken 3962 * so that end_renaming() can have a stable reference to unlock. 3963 * 3964 * References and the lock can be dropped with end_renaming() 3965 * 3966 * The passed in qstr need not have the hash calculated, and basic 3967 * eXecute permission checking is performed against @rd.mnt_idmap. 3968 * 3969 * Returns: zero or an error. 
3970 */ 3971 int start_renaming_dentry(struct renamedata *rd, int lookup_flags, 3972 struct dentry *old_dentry, struct qstr *new_last) 3973 { 3974 int err; 3975 3976 err = lookup_one_common(rd->mnt_idmap, new_last, rd->new_parent); 3977 if (err) 3978 return err; 3979 return __start_renaming_dentry(rd, lookup_flags, old_dentry, new_last); 3980 } 3981 EXPORT_SYMBOL(start_renaming_dentry); 3982 3983 /** 3984 * start_renaming_two_dentries - Lock to dentries in given parents for rename 3985 * @rd: rename data containing parent 3986 * @old_dentry: dentry of name to move 3987 * @new_dentry: dentry to move to 3988 * 3989 * Ensure locks are in place for rename and check parentage is still correct. 3990 * 3991 * On success the two dentries are stored in @rd.old_dentry and 3992 * @rd.new_dentry and @rd.old_parent and @rd.new_parent are confirmed to 3993 * be the parents of the dentries. 3994 * 3995 * References and the lock can be dropped with end_renaming() 3996 * 3997 * Returns: zero or an error. 
3998 */ 3999 int 4000 start_renaming_two_dentries(struct renamedata *rd, 4001 struct dentry *old_dentry, struct dentry *new_dentry) 4002 { 4003 struct dentry *trap; 4004 int err; 4005 4006 /* Already have the dentry - need to be sure to lock the correct parent */ 4007 trap = lock_rename_child(old_dentry, rd->new_parent); 4008 if (IS_ERR(trap)) 4009 return PTR_ERR(trap); 4010 err = -EINVAL; 4011 if (d_unhashed(old_dentry) || 4012 (rd->old_parent && rd->old_parent != old_dentry->d_parent)) 4013 /* old_dentry was removed, or moved and explicit parent requested */ 4014 goto out_unlock; 4015 if (d_unhashed(new_dentry) || 4016 rd->new_parent != new_dentry->d_parent) 4017 /* new_dentry was removed or moved */ 4018 goto out_unlock; 4019 4020 if (old_dentry == trap) 4021 /* source is an ancestor of target */ 4022 goto out_unlock; 4023 4024 if (new_dentry == trap) { 4025 /* target is an ancestor of source */ 4026 if (rd->flags & RENAME_EXCHANGE) 4027 err = -EINVAL; 4028 else 4029 err = -ENOTEMPTY; 4030 goto out_unlock; 4031 } 4032 4033 err = -EEXIST; 4034 if (d_is_positive(new_dentry) && (rd->flags & RENAME_NOREPLACE)) 4035 goto out_unlock; 4036 4037 rd->old_dentry = dget(old_dentry); 4038 rd->new_dentry = dget(new_dentry); 4039 rd->old_parent = dget(old_dentry->d_parent); 4040 return 0; 4041 4042 out_unlock: 4043 unlock_rename(old_dentry->d_parent, rd->new_parent); 4044 return err; 4045 } 4046 EXPORT_SYMBOL(start_renaming_two_dentries); 4047 4048 void end_renaming(struct renamedata *rd) 4049 { 4050 unlock_rename(rd->old_parent, rd->new_parent); 4051 dput(rd->old_dentry); 4052 dput(rd->new_dentry); 4053 dput(rd->old_parent); 4054 } 4055 EXPORT_SYMBOL(end_renaming); 4056 4057 /** 4058 * vfs_prepare_mode - prepare the mode to be used for a new inode 4059 * @idmap: idmap of the mount the inode was found from 4060 * @dir: parent directory of the new inode 4061 * @mode: mode of the new inode 4062 * @mask_perms: allowed permission by the vfs 4063 * @type: type of file to be 
created 4064 * 4065 * This helper consolidates and enforces vfs restrictions on the @mode of a new 4066 * object to be created. 4067 * 4068 * Umask stripping depends on whether the filesystem supports POSIX ACLs (see 4069 * the kernel documentation for mode_strip_umask()). Moving umask stripping 4070 * after setgid stripping allows the same ordering for both non-POSIX ACL and 4071 * POSIX ACL supporting filesystems. 4072 * 4073 * Note that it's currently valid for @type to be 0 if a directory is created. 4074 * Filesystems raise that flag individually and we need to check whether each 4075 * filesystem can deal with receiving S_IFDIR from the vfs before we enforce a 4076 * non-zero type. 4077 * 4078 * Returns: mode to be passed to the filesystem 4079 */ 4080 static inline umode_t vfs_prepare_mode(struct mnt_idmap *idmap, 4081 const struct inode *dir, umode_t mode, 4082 umode_t mask_perms, umode_t type) 4083 { 4084 mode = mode_strip_sgid(idmap, dir, mode); 4085 mode = mode_strip_umask(dir, mode); 4086 4087 /* 4088 * Apply the vfs mandated allowed permission mask and set the type of 4089 * file to be created before we call into the filesystem. 4090 */ 4091 mode &= (mask_perms & ~S_IFMT); 4092 mode |= (type & S_IFMT); 4093 4094 return mode; 4095 } 4096 4097 /** 4098 * vfs_create - create new file 4099 * @idmap: idmap of the mount the inode was found from 4100 * @dentry: dentry of the child file 4101 * @mode: mode of the child file 4102 * @di: returns parent inode, if the inode is delegated. 4103 * 4104 * Create a new file. 4105 * 4106 * If the inode has been found through an idmapped mount the idmap of 4107 * the vfsmount must be passed through @idmap. This function will then take 4108 * care to map the inode according to @idmap before checking permissions. 4109 * On non-idmapped mounts or if permission checking is to be performed on the 4110 * raw inode simply pass @nop_mnt_idmap. 
4111 */ 4112 int vfs_create(struct mnt_idmap *idmap, struct dentry *dentry, umode_t mode, 4113 struct delegated_inode *di) 4114 { 4115 struct inode *dir = d_inode(dentry->d_parent); 4116 int error; 4117 4118 error = may_create(idmap, dir, dentry); 4119 if (error) 4120 return error; 4121 4122 if (!dir->i_op->create) 4123 return -EACCES; /* shouldn't it be ENOSYS? */ 4124 4125 mode = vfs_prepare_mode(idmap, dir, mode, S_IALLUGO, S_IFREG); 4126 error = security_inode_create(dir, dentry, mode); 4127 if (error) 4128 return error; 4129 error = try_break_deleg(dir, di); 4130 if (error) 4131 return error; 4132 error = dir->i_op->create(idmap, dir, dentry, mode, true); 4133 if (!error) 4134 fsnotify_create(dir, dentry); 4135 return error; 4136 } 4137 EXPORT_SYMBOL(vfs_create); 4138 4139 int vfs_mkobj(struct dentry *dentry, umode_t mode, 4140 int (*f)(struct dentry *, umode_t, void *), 4141 void *arg) 4142 { 4143 struct inode *dir = dentry->d_parent->d_inode; 4144 int error = may_create(&nop_mnt_idmap, dir, dentry); 4145 if (error) 4146 return error; 4147 4148 mode &= S_IALLUGO; 4149 mode |= S_IFREG; 4150 error = security_inode_create(dir, dentry, mode); 4151 if (error) 4152 return error; 4153 error = f(dentry, mode, arg); 4154 if (!error) 4155 fsnotify_create(dir, dentry); 4156 return error; 4157 } 4158 EXPORT_SYMBOL(vfs_mkobj); 4159 4160 bool may_open_dev(const struct path *path) 4161 { 4162 return !(path->mnt->mnt_flags & MNT_NODEV) && 4163 !(path->mnt->mnt_sb->s_iflags & SB_I_NODEV); 4164 } 4165 4166 static int may_open(struct mnt_idmap *idmap, const struct path *path, 4167 int acc_mode, int flag) 4168 { 4169 struct dentry *dentry = path->dentry; 4170 struct inode *inode = dentry->d_inode; 4171 int error; 4172 4173 if (!inode) 4174 return -ENOENT; 4175 4176 switch (inode->i_mode & S_IFMT) { 4177 case S_IFLNK: 4178 return -ELOOP; 4179 case S_IFDIR: 4180 if (acc_mode & MAY_WRITE) 4181 return -EISDIR; 4182 if (acc_mode & MAY_EXEC) 4183 return -EACCES; 4184 break; 4185 case 
S_IFBLK: 4186 case S_IFCHR: 4187 if (!may_open_dev(path)) 4188 return -EACCES; 4189 fallthrough; 4190 case S_IFIFO: 4191 case S_IFSOCK: 4192 if (acc_mode & MAY_EXEC) 4193 return -EACCES; 4194 flag &= ~O_TRUNC; 4195 break; 4196 case S_IFREG: 4197 if ((acc_mode & MAY_EXEC) && path_noexec(path)) 4198 return -EACCES; 4199 break; 4200 default: 4201 VFS_BUG_ON_INODE(!IS_ANON_FILE(inode), inode); 4202 } 4203 4204 error = inode_permission(idmap, inode, MAY_OPEN | acc_mode); 4205 if (error) 4206 return error; 4207 4208 /* 4209 * An append-only file must be opened in append mode for writing. 4210 */ 4211 if (IS_APPEND(inode)) { 4212 if ((flag & O_ACCMODE) != O_RDONLY && !(flag & O_APPEND)) 4213 return -EPERM; 4214 if (flag & O_TRUNC) 4215 return -EPERM; 4216 } 4217 4218 /* O_NOATIME can only be set by the owner or superuser */ 4219 if (flag & O_NOATIME && !inode_owner_or_capable(idmap, inode)) 4220 return -EPERM; 4221 4222 return 0; 4223 } 4224 4225 static int handle_truncate(struct mnt_idmap *idmap, struct file *filp) 4226 { 4227 const struct path *path = &filp->f_path; 4228 struct inode *inode = path->dentry->d_inode; 4229 int error = get_write_access(inode); 4230 if (error) 4231 return error; 4232 4233 error = security_file_truncate(filp); 4234 if (!error) { 4235 error = do_truncate(idmap, path->dentry, 0, 4236 ATTR_MTIME|ATTR_CTIME|ATTR_OPEN, 4237 filp); 4238 } 4239 put_write_access(inode); 4240 return error; 4241 } 4242 4243 static inline int open_to_namei_flags(int flag) 4244 { 4245 if ((flag & O_ACCMODE) == 3) 4246 flag--; 4247 return flag; 4248 } 4249 4250 static int may_o_create(struct mnt_idmap *idmap, 4251 const struct path *dir, struct dentry *dentry, 4252 umode_t mode) 4253 { 4254 int error = security_path_mknod(dir, dentry, mode, 0); 4255 if (error) 4256 return error; 4257 4258 if (!fsuidgid_has_mapping(dir->dentry->d_sb, idmap)) 4259 return -EOVERFLOW; 4260 4261 error = inode_permission(idmap, dir->dentry->d_inode, 4262 MAY_WRITE | MAY_EXEC); 4263 if (error) 
4264 return error; 4265 4266 return security_inode_create(dir->dentry->d_inode, dentry, mode); 4267 } 4268 4269 /* 4270 * Attempt to atomically look up, create and open a file from a negative 4271 * dentry. 4272 * 4273 * Returns 0 if successful. The file will have been created and attached to 4274 * @file by the filesystem calling finish_open(). 4275 * 4276 * If the file was looked up only or didn't need creating, FMODE_OPENED won't 4277 * be set. The caller will need to perform the open themselves. @path will 4278 * have been updated to point to the new dentry. This may be negative. 4279 * 4280 * Returns an error code otherwise. 4281 */ 4282 static struct dentry *atomic_open(struct nameidata *nd, struct dentry *dentry, 4283 struct file *file, 4284 int open_flag, umode_t mode) 4285 { 4286 struct dentry *const DENTRY_NOT_SET = (void *) -1UL; 4287 struct inode *dir = nd->path.dentry->d_inode; 4288 int error; 4289 4290 if (nd->flags & LOOKUP_DIRECTORY) 4291 open_flag |= O_DIRECTORY; 4292 4293 file->__f_path.dentry = DENTRY_NOT_SET; 4294 file->__f_path.mnt = nd->path.mnt; 4295 error = dir->i_op->atomic_open(dir, dentry, file, 4296 open_to_namei_flags(open_flag), mode); 4297 d_lookup_done(dentry); 4298 if (!error) { 4299 if (file->f_mode & FMODE_OPENED) { 4300 if (unlikely(dentry != file->f_path.dentry)) { 4301 dput(dentry); 4302 dentry = dget(file->f_path.dentry); 4303 } 4304 } else if (WARN_ON(file->f_path.dentry == DENTRY_NOT_SET)) { 4305 error = -EIO; 4306 } else { 4307 if (file->f_path.dentry) { 4308 dput(dentry); 4309 dentry = file->f_path.dentry; 4310 } 4311 if (unlikely(d_is_negative(dentry))) 4312 error = -ENOENT; 4313 } 4314 } 4315 if (error) { 4316 dput(dentry); 4317 dentry = ERR_PTR(error); 4318 } 4319 return dentry; 4320 } 4321 4322 /* 4323 * Look up and maybe create and open the last component. 4324 * 4325 * Must be called with parent locked (exclusive in O_CREAT case). 
4326 * 4327 * Returns 0 on success, that is, if 4328 * the file was successfully atomically created (if necessary) and opened, or 4329 * the file was not completely opened at this time, though lookups and 4330 * creations were performed. 4331 * These case are distinguished by presence of FMODE_OPENED on file->f_mode. 4332 * In the latter case dentry returned in @path might be negative if O_CREAT 4333 * hadn't been specified. 4334 * 4335 * An error code is returned on failure. 4336 */ 4337 static struct dentry *lookup_open(struct nameidata *nd, struct file *file, 4338 const struct open_flags *op, 4339 bool got_write, struct delegated_inode *delegated_inode) 4340 { 4341 struct mnt_idmap *idmap; 4342 struct dentry *dir = nd->path.dentry; 4343 struct inode *dir_inode = dir->d_inode; 4344 int open_flag = op->open_flag; 4345 struct dentry *dentry; 4346 int error, create_error = 0; 4347 umode_t mode = op->mode; 4348 DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq); 4349 4350 if (unlikely(IS_DEADDIR(dir_inode))) 4351 return ERR_PTR(-ENOENT); 4352 4353 file->f_mode &= ~FMODE_CREATED; 4354 dentry = d_lookup(dir, &nd->last); 4355 for (;;) { 4356 if (!dentry) { 4357 dentry = d_alloc_parallel(dir, &nd->last, &wq); 4358 if (IS_ERR(dentry)) 4359 return dentry; 4360 } 4361 if (d_in_lookup(dentry)) 4362 break; 4363 4364 error = d_revalidate(dir_inode, &nd->last, dentry, nd->flags); 4365 if (likely(error > 0)) 4366 break; 4367 if (error) 4368 goto out_dput; 4369 d_invalidate(dentry); 4370 dput(dentry); 4371 dentry = NULL; 4372 } 4373 if (dentry->d_inode) { 4374 /* Cached positive dentry: will open in f_op->open */ 4375 return dentry; 4376 } 4377 4378 if (open_flag & O_CREAT) 4379 audit_inode(nd->name, dir, AUDIT_INODE_PARENT); 4380 4381 /* 4382 * Checking write permission is tricky, bacuse we don't know if we are 4383 * going to actually need it: O_CREAT opens should work as long as the 4384 * file exists. But checking existence breaks atomicity. 
The trick is 4385 * to check access and if not granted clear O_CREAT from the flags. 4386 * 4387 * Another problem is returing the "right" error value (e.g. for an 4388 * O_EXCL open we want to return EEXIST not EROFS). 4389 */ 4390 if (unlikely(!got_write)) 4391 open_flag &= ~O_TRUNC; 4392 idmap = mnt_idmap(nd->path.mnt); 4393 if (open_flag & O_CREAT) { 4394 if (open_flag & O_EXCL) 4395 open_flag &= ~O_TRUNC; 4396 mode = vfs_prepare_mode(idmap, dir->d_inode, mode, mode, mode); 4397 if (likely(got_write)) 4398 create_error = may_o_create(idmap, &nd->path, 4399 dentry, mode); 4400 else 4401 create_error = -EROFS; 4402 } 4403 if (create_error) 4404 open_flag &= ~O_CREAT; 4405 if (dir_inode->i_op->atomic_open) { 4406 dentry = atomic_open(nd, dentry, file, open_flag, mode); 4407 if (unlikely(create_error) && dentry == ERR_PTR(-ENOENT)) 4408 dentry = ERR_PTR(create_error); 4409 return dentry; 4410 } 4411 4412 if (d_in_lookup(dentry)) { 4413 struct dentry *res = dir_inode->i_op->lookup(dir_inode, dentry, 4414 nd->flags); 4415 d_lookup_done(dentry); 4416 if (unlikely(res)) { 4417 if (IS_ERR(res)) { 4418 error = PTR_ERR(res); 4419 goto out_dput; 4420 } 4421 dput(dentry); 4422 dentry = res; 4423 } 4424 } 4425 4426 /* Negative dentry, just create the file */ 4427 if (!dentry->d_inode && (open_flag & O_CREAT)) { 4428 /* but break the directory lease first! 
*/ 4429 error = try_break_deleg(dir_inode, delegated_inode); 4430 if (error) 4431 goto out_dput; 4432 4433 file->f_mode |= FMODE_CREATED; 4434 audit_inode_child(dir_inode, dentry, AUDIT_TYPE_CHILD_CREATE); 4435 if (!dir_inode->i_op->create) { 4436 error = -EACCES; 4437 goto out_dput; 4438 } 4439 4440 error = dir_inode->i_op->create(idmap, dir_inode, dentry, 4441 mode, open_flag & O_EXCL); 4442 if (error) 4443 goto out_dput; 4444 } 4445 if (unlikely(create_error) && !dentry->d_inode) { 4446 error = create_error; 4447 goto out_dput; 4448 } 4449 return dentry; 4450 4451 out_dput: 4452 dput(dentry); 4453 return ERR_PTR(error); 4454 } 4455 4456 static inline bool trailing_slashes(struct nameidata *nd) 4457 { 4458 return (bool)nd->last.name[nd->last.len]; 4459 } 4460 4461 static struct dentry *lookup_fast_for_open(struct nameidata *nd, int open_flag) 4462 { 4463 struct dentry *dentry; 4464 4465 if (open_flag & O_CREAT) { 4466 if (trailing_slashes(nd)) 4467 return ERR_PTR(-EISDIR); 4468 4469 /* Don't bother on an O_EXCL create */ 4470 if (open_flag & O_EXCL) 4471 return NULL; 4472 } 4473 4474 if (trailing_slashes(nd)) 4475 nd->flags |= LOOKUP_FOLLOW | LOOKUP_DIRECTORY; 4476 4477 dentry = lookup_fast(nd); 4478 if (IS_ERR_OR_NULL(dentry)) 4479 return dentry; 4480 4481 if (open_flag & O_CREAT) { 4482 /* Discard negative dentries. 
Need inode_lock to do the create */ 4483 if (!dentry->d_inode) { 4484 if (!(nd->flags & LOOKUP_RCU)) 4485 dput(dentry); 4486 dentry = NULL; 4487 } 4488 } 4489 return dentry; 4490 } 4491 4492 static const char *open_last_lookups(struct nameidata *nd, 4493 struct file *file, const struct open_flags *op) 4494 { 4495 struct delegated_inode delegated_inode = { }; 4496 struct dentry *dir = nd->path.dentry; 4497 int open_flag = op->open_flag; 4498 bool got_write = false; 4499 struct dentry *dentry; 4500 const char *res; 4501 4502 nd->flags |= op->intent; 4503 4504 if (nd->last_type != LAST_NORM) { 4505 if (nd->depth) 4506 put_link(nd); 4507 return handle_dots(nd, nd->last_type); 4508 } 4509 4510 /* We _can_ be in RCU mode here */ 4511 dentry = lookup_fast_for_open(nd, open_flag); 4512 if (IS_ERR(dentry)) 4513 return ERR_CAST(dentry); 4514 4515 if (likely(dentry)) 4516 goto finish_lookup; 4517 4518 if (!(open_flag & O_CREAT)) { 4519 if (WARN_ON_ONCE(nd->flags & LOOKUP_RCU)) 4520 return ERR_PTR(-ECHILD); 4521 } else { 4522 if (nd->flags & LOOKUP_RCU) { 4523 if (!try_to_unlazy(nd)) 4524 return ERR_PTR(-ECHILD); 4525 } 4526 } 4527 retry: 4528 if (open_flag & (O_CREAT | O_TRUNC | O_WRONLY | O_RDWR)) { 4529 got_write = !mnt_want_write(nd->path.mnt); 4530 /* 4531 * do _not_ fail yet - we might not need that or fail with 4532 * a different error; let lookup_open() decide; we'll be 4533 * dropping this one anyway. 
4534 */ 4535 } 4536 if (open_flag & O_CREAT) 4537 inode_lock(dir->d_inode); 4538 else 4539 inode_lock_shared(dir->d_inode); 4540 dentry = lookup_open(nd, file, op, got_write, &delegated_inode); 4541 if (!IS_ERR(dentry)) { 4542 if (file->f_mode & FMODE_CREATED) 4543 fsnotify_create(dir->d_inode, dentry); 4544 if (file->f_mode & FMODE_OPENED) 4545 fsnotify_open(file); 4546 } 4547 if (open_flag & O_CREAT) 4548 inode_unlock(dir->d_inode); 4549 else 4550 inode_unlock_shared(dir->d_inode); 4551 4552 if (got_write) 4553 mnt_drop_write(nd->path.mnt); 4554 4555 if (IS_ERR(dentry)) { 4556 if (is_delegated(&delegated_inode)) { 4557 int error = break_deleg_wait(&delegated_inode); 4558 4559 if (!error) 4560 goto retry; 4561 return ERR_PTR(error); 4562 } 4563 return ERR_CAST(dentry); 4564 } 4565 4566 if (file->f_mode & (FMODE_OPENED | FMODE_CREATED)) { 4567 dput(nd->path.dentry); 4568 nd->path.dentry = dentry; 4569 return NULL; 4570 } 4571 4572 finish_lookup: 4573 if (nd->depth) 4574 put_link(nd); 4575 res = step_into(nd, WALK_TRAILING, dentry); 4576 if (unlikely(res)) 4577 nd->flags &= ~(LOOKUP_OPEN|LOOKUP_CREATE|LOOKUP_EXCL); 4578 return res; 4579 } 4580 4581 /* 4582 * Handle the last step of open() 4583 */ 4584 static int do_open(struct nameidata *nd, 4585 struct file *file, const struct open_flags *op) 4586 { 4587 struct mnt_idmap *idmap; 4588 int open_flag = op->open_flag; 4589 bool do_truncate; 4590 int acc_mode; 4591 int error; 4592 4593 if (!(file->f_mode & (FMODE_OPENED | FMODE_CREATED))) { 4594 error = complete_walk(nd); 4595 if (error) 4596 return error; 4597 } 4598 if (!(file->f_mode & FMODE_CREATED)) 4599 audit_inode(nd->name, nd->path.dentry, 0); 4600 idmap = mnt_idmap(nd->path.mnt); 4601 if (open_flag & O_CREAT) { 4602 if ((open_flag & O_EXCL) && !(file->f_mode & FMODE_CREATED)) 4603 return -EEXIST; 4604 if (d_is_dir(nd->path.dentry)) 4605 return -EISDIR; 4606 error = may_create_in_sticky(idmap, nd, 4607 d_backing_inode(nd->path.dentry)); 4608 if (unlikely(error)) 
4609 return error; 4610 } 4611 if ((nd->flags & LOOKUP_DIRECTORY) && !d_can_lookup(nd->path.dentry)) 4612 return -ENOTDIR; 4613 4614 do_truncate = false; 4615 acc_mode = op->acc_mode; 4616 if (file->f_mode & FMODE_CREATED) { 4617 /* Don't check for write permission, don't truncate */ 4618 open_flag &= ~O_TRUNC; 4619 acc_mode = 0; 4620 } else if (d_is_reg(nd->path.dentry) && open_flag & O_TRUNC) { 4621 error = mnt_want_write(nd->path.mnt); 4622 if (error) 4623 return error; 4624 do_truncate = true; 4625 } 4626 error = may_open(idmap, &nd->path, acc_mode, open_flag); 4627 if (!error && !(file->f_mode & FMODE_OPENED)) 4628 error = vfs_open(&nd->path, file); 4629 if (!error) 4630 error = security_file_post_open(file, op->acc_mode); 4631 if (!error && do_truncate) 4632 error = handle_truncate(idmap, file); 4633 if (unlikely(error > 0)) { 4634 WARN_ON(1); 4635 error = -EINVAL; 4636 } 4637 if (do_truncate) 4638 mnt_drop_write(nd->path.mnt); 4639 return error; 4640 } 4641 4642 /** 4643 * vfs_tmpfile - create tmpfile 4644 * @idmap: idmap of the mount the inode was found from 4645 * @parentpath: pointer to the path of the base directory 4646 * @file: file descriptor of the new tmpfile 4647 * @mode: mode of the new tmpfile 4648 * 4649 * Create a temporary file. 4650 * 4651 * If the inode has been found through an idmapped mount the idmap of 4652 * the vfsmount must be passed through @idmap. This function will then take 4653 * care to map the inode according to @idmap before checking permissions. 4654 * On non-idmapped mounts or if permission checking is to be performed on the 4655 * raw inode simply pass @nop_mnt_idmap. 
4656 */ 4657 int vfs_tmpfile(struct mnt_idmap *idmap, 4658 const struct path *parentpath, 4659 struct file *file, umode_t mode) 4660 { 4661 struct dentry *child; 4662 struct inode *dir = d_inode(parentpath->dentry); 4663 struct inode *inode; 4664 int error; 4665 int open_flag = file->f_flags; 4666 4667 /* we want directory to be writable */ 4668 error = inode_permission(idmap, dir, MAY_WRITE | MAY_EXEC); 4669 if (error) 4670 return error; 4671 if (!dir->i_op->tmpfile) 4672 return -EOPNOTSUPP; 4673 child = d_alloc(parentpath->dentry, &slash_name); 4674 if (unlikely(!child)) 4675 return -ENOMEM; 4676 file->__f_path.mnt = parentpath->mnt; 4677 file->__f_path.dentry = child; 4678 mode = vfs_prepare_mode(idmap, dir, mode, mode, mode); 4679 error = dir->i_op->tmpfile(idmap, dir, file, mode); 4680 dput(child); 4681 if (file->f_mode & FMODE_OPENED) 4682 fsnotify_open(file); 4683 if (error) 4684 return error; 4685 /* Don't check for other permissions, the inode was just created */ 4686 error = may_open(idmap, &file->f_path, 0, file->f_flags); 4687 if (error) 4688 return error; 4689 inode = file_inode(file); 4690 if (!(open_flag & O_EXCL)) { 4691 spin_lock(&inode->i_lock); 4692 inode_state_set(inode, I_LINKABLE); 4693 spin_unlock(&inode->i_lock); 4694 } 4695 security_inode_post_create_tmpfile(idmap, inode); 4696 return 0; 4697 } 4698 4699 /** 4700 * kernel_tmpfile_open - open a tmpfile for kernel internal use 4701 * @idmap: idmap of the mount the inode was found from 4702 * @parentpath: path of the base directory 4703 * @mode: mode of the new tmpfile 4704 * @open_flag: flags 4705 * @cred: credentials for open 4706 * 4707 * Create and open a temporary file. The file is not accounted in nr_files, 4708 * hence this is only for kernel internal use, and must not be installed into 4709 * file tables or such. 
4710 */ 4711 struct file *kernel_tmpfile_open(struct mnt_idmap *idmap, 4712 const struct path *parentpath, 4713 umode_t mode, int open_flag, 4714 const struct cred *cred) 4715 { 4716 struct file *file; 4717 int error; 4718 4719 file = alloc_empty_file_noaccount(open_flag, cred); 4720 if (IS_ERR(file)) 4721 return file; 4722 4723 error = vfs_tmpfile(idmap, parentpath, file, mode); 4724 if (error) { 4725 fput(file); 4726 file = ERR_PTR(error); 4727 } 4728 return file; 4729 } 4730 EXPORT_SYMBOL(kernel_tmpfile_open); 4731 4732 static int do_tmpfile(struct nameidata *nd, unsigned flags, 4733 const struct open_flags *op, 4734 struct file *file) 4735 { 4736 struct path path; 4737 int error = path_lookupat(nd, flags | LOOKUP_DIRECTORY, &path); 4738 4739 if (unlikely(error)) 4740 return error; 4741 error = mnt_want_write(path.mnt); 4742 if (unlikely(error)) 4743 goto out; 4744 error = vfs_tmpfile(mnt_idmap(path.mnt), &path, file, op->mode); 4745 if (error) 4746 goto out2; 4747 audit_inode(nd->name, file->f_path.dentry, 0); 4748 out2: 4749 mnt_drop_write(path.mnt); 4750 out: 4751 path_put(&path); 4752 return error; 4753 } 4754 4755 static int do_o_path(struct nameidata *nd, unsigned flags, struct file *file) 4756 { 4757 struct path path; 4758 int error = path_lookupat(nd, flags, &path); 4759 if (!error) { 4760 audit_inode(nd->name, path.dentry, 0); 4761 error = vfs_open(&path, file); 4762 path_put(&path); 4763 } 4764 return error; 4765 } 4766 4767 static struct file *path_openat(struct nameidata *nd, 4768 const struct open_flags *op, unsigned flags) 4769 { 4770 struct file *file; 4771 int error; 4772 4773 file = alloc_empty_file(op->open_flag, current_cred()); 4774 if (IS_ERR(file)) 4775 return file; 4776 4777 if (unlikely(file->f_flags & __O_TMPFILE)) { 4778 error = do_tmpfile(nd, flags, op, file); 4779 } else if (unlikely(file->f_flags & O_PATH)) { 4780 error = do_o_path(nd, flags, file); 4781 } else { 4782 const char *s = path_init(nd, flags); 4783 while (!(error = 
link_path_walk(s, nd)) && 4784 (s = open_last_lookups(nd, file, op)) != NULL) 4785 ; 4786 if (!error) 4787 error = do_open(nd, file, op); 4788 terminate_walk(nd); 4789 } 4790 if (likely(!error)) { 4791 if (likely(file->f_mode & FMODE_OPENED)) 4792 return file; 4793 WARN_ON(1); 4794 error = -EINVAL; 4795 } 4796 fput_close(file); 4797 if (error == -EOPENSTALE) { 4798 if (flags & LOOKUP_RCU) 4799 error = -ECHILD; 4800 else 4801 error = -ESTALE; 4802 } 4803 return ERR_PTR(error); 4804 } 4805 4806 struct file *do_filp_open(int dfd, struct filename *pathname, 4807 const struct open_flags *op) 4808 { 4809 struct nameidata nd; 4810 int flags = op->lookup_flags; 4811 struct file *filp; 4812 4813 set_nameidata(&nd, dfd, pathname, NULL); 4814 filp = path_openat(&nd, op, flags | LOOKUP_RCU); 4815 if (unlikely(filp == ERR_PTR(-ECHILD))) 4816 filp = path_openat(&nd, op, flags); 4817 if (unlikely(filp == ERR_PTR(-ESTALE))) 4818 filp = path_openat(&nd, op, flags | LOOKUP_REVAL); 4819 restore_nameidata(); 4820 return filp; 4821 } 4822 4823 struct file *do_file_open_root(const struct path *root, 4824 const char *name, const struct open_flags *op) 4825 { 4826 struct nameidata nd; 4827 struct file *file; 4828 struct filename *filename; 4829 int flags = op->lookup_flags; 4830 4831 if (d_is_symlink(root->dentry) && op->intent & LOOKUP_OPEN) 4832 return ERR_PTR(-ELOOP); 4833 4834 filename = getname_kernel(name); 4835 if (IS_ERR(filename)) 4836 return ERR_CAST(filename); 4837 4838 set_nameidata(&nd, -1, filename, root); 4839 file = path_openat(&nd, op, flags | LOOKUP_RCU); 4840 if (unlikely(file == ERR_PTR(-ECHILD))) 4841 file = path_openat(&nd, op, flags); 4842 if (unlikely(file == ERR_PTR(-ESTALE))) 4843 file = path_openat(&nd, op, flags | LOOKUP_REVAL); 4844 restore_nameidata(); 4845 putname(filename); 4846 return file; 4847 } 4848 4849 static struct dentry *filename_create(int dfd, struct filename *name, 4850 struct path *path, unsigned int lookup_flags) 4851 { 4852 struct dentry 
*dentry = ERR_PTR(-EEXIST); 4853 struct qstr last; 4854 bool want_dir = lookup_flags & LOOKUP_DIRECTORY; 4855 unsigned int reval_flag = lookup_flags & LOOKUP_REVAL; 4856 unsigned int create_flags = LOOKUP_CREATE | LOOKUP_EXCL; 4857 int type; 4858 int error; 4859 4860 error = filename_parentat(dfd, name, reval_flag, path, &last, &type); 4861 if (error) 4862 return ERR_PTR(error); 4863 4864 /* 4865 * Yucky last component or no last component at all? 4866 * (foo/., foo/.., /////) 4867 */ 4868 if (unlikely(type != LAST_NORM)) 4869 goto out; 4870 4871 /* don't fail immediately if it's r/o, at least try to report other errors */ 4872 error = mnt_want_write(path->mnt); 4873 /* 4874 * Do the final lookup. Suppress 'create' if there is a trailing 4875 * '/', and a directory wasn't requested. 4876 */ 4877 if (last.name[last.len] && !want_dir) 4878 create_flags &= ~LOOKUP_CREATE; 4879 dentry = start_dirop(path->dentry, &last, reval_flag | create_flags); 4880 if (IS_ERR(dentry)) 4881 goto out_drop_write; 4882 4883 if (unlikely(error)) 4884 goto fail; 4885 4886 return dentry; 4887 fail: 4888 end_dirop(dentry); 4889 dentry = ERR_PTR(error); 4890 out_drop_write: 4891 if (!error) 4892 mnt_drop_write(path->mnt); 4893 out: 4894 path_put(path); 4895 return dentry; 4896 } 4897 4898 struct dentry *start_creating_path(int dfd, const char *pathname, 4899 struct path *path, unsigned int lookup_flags) 4900 { 4901 struct filename *filename = getname_kernel(pathname); 4902 struct dentry *res = filename_create(dfd, filename, path, lookup_flags); 4903 4904 putname(filename); 4905 return res; 4906 } 4907 EXPORT_SYMBOL(start_creating_path); 4908 4909 /** 4910 * end_creating_path - finish a code section started by start_creating_path() 4911 * @path: the path instantiated by start_creating_path() 4912 * @dentry: the dentry returned by start_creating_path() 4913 * 4914 * end_creating_path() will unlock and locks taken by start_creating_path() 4915 * and drop an references that were taken. 
It should only be called 4916 * if start_creating_path() returned a non-error. 4917 * If vfs_mkdir() was called and it returned an error, that error *should* 4918 * be passed to end_creating_path() together with the path. 4919 */ 4920 void end_creating_path(const struct path *path, struct dentry *dentry) 4921 { 4922 end_creating(dentry); 4923 mnt_drop_write(path->mnt); 4924 path_put(path); 4925 } 4926 EXPORT_SYMBOL(end_creating_path); 4927 4928 inline struct dentry *start_creating_user_path( 4929 int dfd, const char __user *pathname, 4930 struct path *path, unsigned int lookup_flags) 4931 { 4932 struct filename *filename = getname(pathname); 4933 struct dentry *res = filename_create(dfd, filename, path, lookup_flags); 4934 4935 putname(filename); 4936 return res; 4937 } 4938 EXPORT_SYMBOL(start_creating_user_path); 4939 4940 4941 /** 4942 * vfs_mknod - create device node or file 4943 * @idmap: idmap of the mount the inode was found from 4944 * @dir: inode of the parent directory 4945 * @dentry: dentry of the child device node 4946 * @mode: mode of the child device node 4947 * @dev: device number of device to create 4948 * @delegated_inode: returns parent inode, if the inode is delegated. 4949 * 4950 * Create a device node or file. 4951 * 4952 * If the inode has been found through an idmapped mount the idmap of 4953 * the vfsmount must be passed through @idmap. This function will then take 4954 * care to map the inode according to @idmap before checking permissions. 4955 * On non-idmapped mounts or if permission checking is to be performed on the 4956 * raw inode simply pass @nop_mnt_idmap. 
4957 */ 4958 int vfs_mknod(struct mnt_idmap *idmap, struct inode *dir, 4959 struct dentry *dentry, umode_t mode, dev_t dev, 4960 struct delegated_inode *delegated_inode) 4961 { 4962 bool is_whiteout = S_ISCHR(mode) && dev == WHITEOUT_DEV; 4963 int error = may_create(idmap, dir, dentry); 4964 4965 if (error) 4966 return error; 4967 4968 if ((S_ISCHR(mode) || S_ISBLK(mode)) && !is_whiteout && 4969 !capable(CAP_MKNOD)) 4970 return -EPERM; 4971 4972 if (!dir->i_op->mknod) 4973 return -EPERM; 4974 4975 mode = vfs_prepare_mode(idmap, dir, mode, mode, mode); 4976 error = devcgroup_inode_mknod(mode, dev); 4977 if (error) 4978 return error; 4979 4980 error = security_inode_mknod(dir, dentry, mode, dev); 4981 if (error) 4982 return error; 4983 4984 error = try_break_deleg(dir, delegated_inode); 4985 if (error) 4986 return error; 4987 4988 error = dir->i_op->mknod(idmap, dir, dentry, mode, dev); 4989 if (!error) 4990 fsnotify_create(dir, dentry); 4991 return error; 4992 } 4993 EXPORT_SYMBOL(vfs_mknod); 4994 4995 static int may_mknod(umode_t mode) 4996 { 4997 switch (mode & S_IFMT) { 4998 case S_IFREG: 4999 case S_IFCHR: 5000 case S_IFBLK: 5001 case S_IFIFO: 5002 case S_IFSOCK: 5003 case 0: /* zero mode translates to S_IFREG */ 5004 return 0; 5005 case S_IFDIR: 5006 return -EPERM; 5007 default: 5008 return -EINVAL; 5009 } 5010 } 5011 5012 static int do_mknodat(int dfd, struct filename *name, umode_t mode, 5013 unsigned int dev) 5014 { 5015 struct delegated_inode di = { }; 5016 struct mnt_idmap *idmap; 5017 struct dentry *dentry; 5018 struct path path; 5019 int error; 5020 unsigned int lookup_flags = 0; 5021 5022 error = may_mknod(mode); 5023 if (error) 5024 goto out1; 5025 retry: 5026 dentry = filename_create(dfd, name, &path, lookup_flags); 5027 error = PTR_ERR(dentry); 5028 if (IS_ERR(dentry)) 5029 goto out1; 5030 5031 error = security_path_mknod(&path, dentry, 5032 mode_strip_umask(path.dentry->d_inode, mode), dev); 5033 if (error) 5034 goto out2; 5035 5036 idmap = 
mnt_idmap(path.mnt); 5037 switch (mode & S_IFMT) { 5038 case 0: case S_IFREG: 5039 error = vfs_create(idmap, dentry, mode, &di); 5040 if (!error) 5041 security_path_post_mknod(idmap, dentry); 5042 break; 5043 case S_IFCHR: case S_IFBLK: 5044 error = vfs_mknod(idmap, path.dentry->d_inode, 5045 dentry, mode, new_decode_dev(dev), &di); 5046 break; 5047 case S_IFIFO: case S_IFSOCK: 5048 error = vfs_mknod(idmap, path.dentry->d_inode, 5049 dentry, mode, 0, &di); 5050 break; 5051 } 5052 out2: 5053 end_creating_path(&path, dentry); 5054 if (is_delegated(&di)) { 5055 error = break_deleg_wait(&di); 5056 if (!error) 5057 goto retry; 5058 } 5059 if (retry_estale(error, lookup_flags)) { 5060 lookup_flags |= LOOKUP_REVAL; 5061 goto retry; 5062 } 5063 out1: 5064 putname(name); 5065 return error; 5066 } 5067 5068 SYSCALL_DEFINE4(mknodat, int, dfd, const char __user *, filename, umode_t, mode, 5069 unsigned int, dev) 5070 { 5071 return do_mknodat(dfd, getname(filename), mode, dev); 5072 } 5073 5074 SYSCALL_DEFINE3(mknod, const char __user *, filename, umode_t, mode, unsigned, dev) 5075 { 5076 return do_mknodat(AT_FDCWD, getname(filename), mode, dev); 5077 } 5078 5079 /** 5080 * vfs_mkdir - create directory returning correct dentry if possible 5081 * @idmap: idmap of the mount the inode was found from 5082 * @dir: inode of the parent directory 5083 * @dentry: dentry of the child directory 5084 * @mode: mode of the child directory 5085 * @delegated_inode: returns parent inode, if the inode is delegated. 5086 * 5087 * Create a directory. 5088 * 5089 * If the inode has been found through an idmapped mount the idmap of 5090 * the vfsmount must be passed through @idmap. This function will then take 5091 * care to map the inode according to @idmap before checking permissions. 5092 * On non-idmapped mounts or if permission checking is to be performed on the 5093 * raw inode simply pass @nop_mnt_idmap. 
5094 * 5095 * In the event that the filesystem does not use the *@dentry but leaves it 5096 * negative or unhashes it and possibly splices a different one returning it, 5097 * the original dentry is dput() and the alternate is returned. 5098 * 5099 * In case of an error the dentry is dput() and an ERR_PTR() is returned. 5100 */ 5101 struct dentry *vfs_mkdir(struct mnt_idmap *idmap, struct inode *dir, 5102 struct dentry *dentry, umode_t mode, 5103 struct delegated_inode *delegated_inode) 5104 { 5105 int error; 5106 unsigned max_links = dir->i_sb->s_max_links; 5107 struct dentry *de; 5108 5109 error = may_create(idmap, dir, dentry); 5110 if (error) 5111 goto err; 5112 5113 error = -EPERM; 5114 if (!dir->i_op->mkdir) 5115 goto err; 5116 5117 mode = vfs_prepare_mode(idmap, dir, mode, S_IRWXUGO | S_ISVTX, 0); 5118 error = security_inode_mkdir(dir, dentry, mode); 5119 if (error) 5120 goto err; 5121 5122 error = -EMLINK; 5123 if (max_links && dir->i_nlink >= max_links) 5124 goto err; 5125 5126 error = try_break_deleg(dir, delegated_inode); 5127 if (error) 5128 goto err; 5129 5130 de = dir->i_op->mkdir(idmap, dir, dentry, mode); 5131 error = PTR_ERR(de); 5132 if (IS_ERR(de)) 5133 goto err; 5134 if (de) { 5135 dput(dentry); 5136 dentry = de; 5137 } 5138 fsnotify_mkdir(dir, dentry); 5139 return dentry; 5140 5141 err: 5142 end_creating(dentry); 5143 return ERR_PTR(error); 5144 } 5145 EXPORT_SYMBOL(vfs_mkdir); 5146 5147 int do_mkdirat(int dfd, struct filename *name, umode_t mode) 5148 { 5149 struct dentry *dentry; 5150 struct path path; 5151 int error; 5152 unsigned int lookup_flags = LOOKUP_DIRECTORY; 5153 struct delegated_inode delegated_inode = { }; 5154 5155 retry: 5156 dentry = filename_create(dfd, name, &path, lookup_flags); 5157 error = PTR_ERR(dentry); 5158 if (IS_ERR(dentry)) 5159 goto out_putname; 5160 5161 error = security_path_mkdir(&path, dentry, 5162 mode_strip_umask(path.dentry->d_inode, mode)); 5163 if (!error) { 5164 dentry = vfs_mkdir(mnt_idmap(path.mnt), 
path.dentry->d_inode, 5165 dentry, mode, &delegated_inode); 5166 if (IS_ERR(dentry)) 5167 error = PTR_ERR(dentry); 5168 } 5169 end_creating_path(&path, dentry); 5170 if (is_delegated(&delegated_inode)) { 5171 error = break_deleg_wait(&delegated_inode); 5172 if (!error) 5173 goto retry; 5174 } 5175 if (retry_estale(error, lookup_flags)) { 5176 lookup_flags |= LOOKUP_REVAL; 5177 goto retry; 5178 } 5179 out_putname: 5180 putname(name); 5181 return error; 5182 } 5183 5184 SYSCALL_DEFINE3(mkdirat, int, dfd, const char __user *, pathname, umode_t, mode) 5185 { 5186 return do_mkdirat(dfd, getname(pathname), mode); 5187 } 5188 5189 SYSCALL_DEFINE2(mkdir, const char __user *, pathname, umode_t, mode) 5190 { 5191 return do_mkdirat(AT_FDCWD, getname(pathname), mode); 5192 } 5193 5194 /** 5195 * vfs_rmdir - remove directory 5196 * @idmap: idmap of the mount the inode was found from 5197 * @dir: inode of the parent directory 5198 * @dentry: dentry of the child directory 5199 * @delegated_inode: returns parent inode, if it's delegated. 5200 * 5201 * Remove a directory. 5202 * 5203 * If the inode has been found through an idmapped mount the idmap of 5204 * the vfsmount must be passed through @idmap. This function will then take 5205 * care to map the inode according to @idmap before checking permissions. 5206 * On non-idmapped mounts or if permission checking is to be performed on the 5207 * raw inode simply pass @nop_mnt_idmap. 
5208 */ 5209 int vfs_rmdir(struct mnt_idmap *idmap, struct inode *dir, 5210 struct dentry *dentry, struct delegated_inode *delegated_inode) 5211 { 5212 int error = may_delete(idmap, dir, dentry, 1); 5213 5214 if (error) 5215 return error; 5216 5217 if (!dir->i_op->rmdir) 5218 return -EPERM; 5219 5220 dget(dentry); 5221 inode_lock(dentry->d_inode); 5222 5223 error = -EBUSY; 5224 if (is_local_mountpoint(dentry) || 5225 (dentry->d_inode->i_flags & S_KERNEL_FILE)) 5226 goto out; 5227 5228 error = security_inode_rmdir(dir, dentry); 5229 if (error) 5230 goto out; 5231 5232 error = try_break_deleg(dir, delegated_inode); 5233 if (error) 5234 goto out; 5235 5236 error = dir->i_op->rmdir(dir, dentry); 5237 if (error) 5238 goto out; 5239 5240 shrink_dcache_parent(dentry); 5241 dentry->d_inode->i_flags |= S_DEAD; 5242 dont_mount(dentry); 5243 detach_mounts(dentry); 5244 5245 out: 5246 inode_unlock(dentry->d_inode); 5247 dput(dentry); 5248 if (!error) 5249 d_delete_notify(dir, dentry); 5250 return error; 5251 } 5252 EXPORT_SYMBOL(vfs_rmdir); 5253 5254 int do_rmdir(int dfd, struct filename *name) 5255 { 5256 int error; 5257 struct dentry *dentry; 5258 struct path path; 5259 struct qstr last; 5260 int type; 5261 unsigned int lookup_flags = 0; 5262 struct delegated_inode delegated_inode = { }; 5263 retry: 5264 error = filename_parentat(dfd, name, lookup_flags, &path, &last, &type); 5265 if (error) 5266 goto exit1; 5267 5268 switch (type) { 5269 case LAST_DOTDOT: 5270 error = -ENOTEMPTY; 5271 goto exit2; 5272 case LAST_DOT: 5273 error = -EINVAL; 5274 goto exit2; 5275 case LAST_ROOT: 5276 error = -EBUSY; 5277 goto exit2; 5278 } 5279 5280 error = mnt_want_write(path.mnt); 5281 if (error) 5282 goto exit2; 5283 5284 dentry = start_dirop(path.dentry, &last, lookup_flags); 5285 error = PTR_ERR(dentry); 5286 if (IS_ERR(dentry)) 5287 goto exit3; 5288 error = security_path_rmdir(&path, dentry); 5289 if (error) 5290 goto exit4; 5291 error = vfs_rmdir(mnt_idmap(path.mnt), 
path.dentry->d_inode, 5292 dentry, &delegated_inode); 5293 exit4: 5294 end_dirop(dentry); 5295 exit3: 5296 mnt_drop_write(path.mnt); 5297 exit2: 5298 path_put(&path); 5299 if (is_delegated(&delegated_inode)) { 5300 error = break_deleg_wait(&delegated_inode); 5301 if (!error) 5302 goto retry; 5303 } 5304 if (retry_estale(error, lookup_flags)) { 5305 lookup_flags |= LOOKUP_REVAL; 5306 goto retry; 5307 } 5308 exit1: 5309 putname(name); 5310 return error; 5311 } 5312 5313 SYSCALL_DEFINE1(rmdir, const char __user *, pathname) 5314 { 5315 return do_rmdir(AT_FDCWD, getname(pathname)); 5316 } 5317 5318 /** 5319 * vfs_unlink - unlink a filesystem object 5320 * @idmap: idmap of the mount the inode was found from 5321 * @dir: parent directory 5322 * @dentry: victim 5323 * @delegated_inode: returns victim inode, if the inode is delegated. 5324 * 5325 * The caller must hold dir->i_rwsem exclusively. 5326 * 5327 * If vfs_unlink discovers a delegation, it will return -EWOULDBLOCK and 5328 * return a reference to the inode in delegated_inode. The caller 5329 * should then break the delegation on that inode and retry. Because 5330 * breaking a delegation may take a long time, the caller should drop 5331 * dir->i_rwsem before doing so. 5332 * 5333 * Alternatively, a caller may pass NULL for delegated_inode. This may 5334 * be appropriate for callers that expect the underlying filesystem not 5335 * to be NFS exported. 5336 * 5337 * If the inode has been found through an idmapped mount the idmap of 5338 * the vfsmount must be passed through @idmap. This function will then take 5339 * care to map the inode according to @idmap before checking permissions. 5340 * On non-idmapped mounts or if permission checking is to be performed on the 5341 * raw inode simply pass @nop_mnt_idmap. 
5342 */ 5343 int vfs_unlink(struct mnt_idmap *idmap, struct inode *dir, 5344 struct dentry *dentry, struct delegated_inode *delegated_inode) 5345 { 5346 struct inode *target = dentry->d_inode; 5347 int error = may_delete(idmap, dir, dentry, 0); 5348 5349 if (error) 5350 return error; 5351 5352 if (!dir->i_op->unlink) 5353 return -EPERM; 5354 5355 inode_lock(target); 5356 if (IS_SWAPFILE(target)) 5357 error = -EPERM; 5358 else if (is_local_mountpoint(dentry)) 5359 error = -EBUSY; 5360 else { 5361 error = security_inode_unlink(dir, dentry); 5362 if (!error) { 5363 error = try_break_deleg(dir, delegated_inode); 5364 if (error) 5365 goto out; 5366 error = try_break_deleg(target, delegated_inode); 5367 if (error) 5368 goto out; 5369 error = dir->i_op->unlink(dir, dentry); 5370 if (!error) { 5371 dont_mount(dentry); 5372 detach_mounts(dentry); 5373 } 5374 } 5375 } 5376 out: 5377 inode_unlock(target); 5378 5379 /* We don't d_delete() NFS sillyrenamed files--they still exist. */ 5380 if (!error && dentry->d_flags & DCACHE_NFSFS_RENAMED) { 5381 fsnotify_unlink(dir, dentry); 5382 } else if (!error) { 5383 fsnotify_link_count(target); 5384 d_delete_notify(dir, dentry); 5385 } 5386 5387 return error; 5388 } 5389 EXPORT_SYMBOL(vfs_unlink); 5390 5391 /* 5392 * Make sure that the actual truncation of the file will occur outside its 5393 * directory's i_rwsem. Truncate can take a long time if there is a lot of 5394 * writeout happening, and we don't want to prevent access to the directory 5395 * while waiting on the I/O. 
5396 */ 5397 int do_unlinkat(int dfd, struct filename *name) 5398 { 5399 int error; 5400 struct dentry *dentry; 5401 struct path path; 5402 struct qstr last; 5403 int type; 5404 struct inode *inode; 5405 struct delegated_inode delegated_inode = { }; 5406 unsigned int lookup_flags = 0; 5407 retry: 5408 error = filename_parentat(dfd, name, lookup_flags, &path, &last, &type); 5409 if (error) 5410 goto exit_putname; 5411 5412 error = -EISDIR; 5413 if (type != LAST_NORM) 5414 goto exit_path_put; 5415 5416 error = mnt_want_write(path.mnt); 5417 if (error) 5418 goto exit_path_put; 5419 retry_deleg: 5420 dentry = start_dirop(path.dentry, &last, lookup_flags); 5421 error = PTR_ERR(dentry); 5422 if (IS_ERR(dentry)) 5423 goto exit_drop_write; 5424 5425 /* Why not before? Because we want correct error value */ 5426 if (unlikely(last.name[last.len])) { 5427 if (d_is_dir(dentry)) 5428 error = -EISDIR; 5429 else 5430 error = -ENOTDIR; 5431 end_dirop(dentry); 5432 goto exit_drop_write; 5433 } 5434 inode = dentry->d_inode; 5435 ihold(inode); 5436 error = security_path_unlink(&path, dentry); 5437 if (error) 5438 goto exit_end_dirop; 5439 error = vfs_unlink(mnt_idmap(path.mnt), path.dentry->d_inode, 5440 dentry, &delegated_inode); 5441 exit_end_dirop: 5442 end_dirop(dentry); 5443 iput(inode); /* truncate the inode here */ 5444 if (is_delegated(&delegated_inode)) { 5445 error = break_deleg_wait(&delegated_inode); 5446 if (!error) 5447 goto retry_deleg; 5448 } 5449 exit_drop_write: 5450 mnt_drop_write(path.mnt); 5451 exit_path_put: 5452 path_put(&path); 5453 if (retry_estale(error, lookup_flags)) { 5454 lookup_flags |= LOOKUP_REVAL; 5455 goto retry; 5456 } 5457 exit_putname: 5458 putname(name); 5459 return error; 5460 } 5461 5462 SYSCALL_DEFINE3(unlinkat, int, dfd, const char __user *, pathname, int, flag) 5463 { 5464 if ((flag & ~AT_REMOVEDIR) != 0) 5465 return -EINVAL; 5466 5467 if (flag & AT_REMOVEDIR) 5468 return do_rmdir(dfd, getname(pathname)); 5469 return do_unlinkat(dfd, 
getname(pathname)); 5470 } 5471 5472 SYSCALL_DEFINE1(unlink, const char __user *, pathname) 5473 { 5474 return do_unlinkat(AT_FDCWD, getname(pathname)); 5475 } 5476 5477 /** 5478 * vfs_symlink - create symlink 5479 * @idmap: idmap of the mount the inode was found from 5480 * @dir: inode of the parent directory 5481 * @dentry: dentry of the child symlink file 5482 * @oldname: name of the file to link to 5483 * @delegated_inode: returns victim inode, if the inode is delegated. 5484 * 5485 * Create a symlink. 5486 * 5487 * If the inode has been found through an idmapped mount the idmap of 5488 * the vfsmount must be passed through @idmap. This function will then take 5489 * care to map the inode according to @idmap before checking permissions. 5490 * On non-idmapped mounts or if permission checking is to be performed on the 5491 * raw inode simply pass @nop_mnt_idmap. 5492 */ 5493 int vfs_symlink(struct mnt_idmap *idmap, struct inode *dir, 5494 struct dentry *dentry, const char *oldname, 5495 struct delegated_inode *delegated_inode) 5496 { 5497 int error; 5498 5499 error = may_create(idmap, dir, dentry); 5500 if (error) 5501 return error; 5502 5503 if (!dir->i_op->symlink) 5504 return -EPERM; 5505 5506 error = security_inode_symlink(dir, dentry, oldname); 5507 if (error) 5508 return error; 5509 5510 error = try_break_deleg(dir, delegated_inode); 5511 if (error) 5512 return error; 5513 5514 error = dir->i_op->symlink(idmap, dir, dentry, oldname); 5515 if (!error) 5516 fsnotify_create(dir, dentry); 5517 return error; 5518 } 5519 EXPORT_SYMBOL(vfs_symlink); 5520 5521 int do_symlinkat(struct filename *from, int newdfd, struct filename *to) 5522 { 5523 int error; 5524 struct dentry *dentry; 5525 struct path path; 5526 unsigned int lookup_flags = 0; 5527 struct delegated_inode delegated_inode = { }; 5528 5529 if (IS_ERR(from)) { 5530 error = PTR_ERR(from); 5531 goto out_putnames; 5532 } 5533 retry: 5534 dentry = filename_create(newdfd, to, &path, lookup_flags); 5535 error = 
PTR_ERR(dentry); 5536 if (IS_ERR(dentry)) 5537 goto out_putnames; 5538 5539 error = security_path_symlink(&path, dentry, from->name); 5540 if (!error) 5541 error = vfs_symlink(mnt_idmap(path.mnt), path.dentry->d_inode, 5542 dentry, from->name, &delegated_inode); 5543 end_creating_path(&path, dentry); 5544 if (is_delegated(&delegated_inode)) { 5545 error = break_deleg_wait(&delegated_inode); 5546 if (!error) 5547 goto retry; 5548 } 5549 if (retry_estale(error, lookup_flags)) { 5550 lookup_flags |= LOOKUP_REVAL; 5551 goto retry; 5552 } 5553 out_putnames: 5554 putname(to); 5555 putname(from); 5556 return error; 5557 } 5558 5559 SYSCALL_DEFINE3(symlinkat, const char __user *, oldname, 5560 int, newdfd, const char __user *, newname) 5561 { 5562 return do_symlinkat(getname(oldname), newdfd, getname(newname)); 5563 } 5564 5565 SYSCALL_DEFINE2(symlink, const char __user *, oldname, const char __user *, newname) 5566 { 5567 return do_symlinkat(getname(oldname), AT_FDCWD, getname(newname)); 5568 } 5569 5570 /** 5571 * vfs_link - create a new link 5572 * @old_dentry: object to be linked 5573 * @idmap: idmap of the mount 5574 * @dir: new parent 5575 * @new_dentry: where to create the new link 5576 * @delegated_inode: returns inode needing a delegation break 5577 * 5578 * The caller must hold dir->i_rwsem exclusively. 5579 * 5580 * If vfs_link discovers a delegation on the to-be-linked file in need 5581 * of breaking, it will return -EWOULDBLOCK and return a reference to the 5582 * inode in delegated_inode. The caller should then break the delegation 5583 * and retry. Because breaking a delegation may take a long time, the 5584 * caller should drop the i_rwsem before doing so. 5585 * 5586 * Alternatively, a caller may pass NULL for delegated_inode. This may 5587 * be appropriate for callers that expect the underlying filesystem not 5588 * to be NFS exported. 
5589 * 5590 * If the inode has been found through an idmapped mount the idmap of 5591 * the vfsmount must be passed through @idmap. This function will then take 5592 * care to map the inode according to @idmap before checking permissions. 5593 * On non-idmapped mounts or if permission checking is to be performed on the 5594 * raw inode simply pass @nop_mnt_idmap. 5595 */ 5596 int vfs_link(struct dentry *old_dentry, struct mnt_idmap *idmap, 5597 struct inode *dir, struct dentry *new_dentry, 5598 struct delegated_inode *delegated_inode) 5599 { 5600 struct inode *inode = old_dentry->d_inode; 5601 unsigned max_links = dir->i_sb->s_max_links; 5602 int error; 5603 5604 if (!inode) 5605 return -ENOENT; 5606 5607 error = may_create(idmap, dir, new_dentry); 5608 if (error) 5609 return error; 5610 5611 if (dir->i_sb != inode->i_sb) 5612 return -EXDEV; 5613 5614 /* 5615 * A link to an append-only or immutable file cannot be created. 5616 */ 5617 if (IS_APPEND(inode) || IS_IMMUTABLE(inode)) 5618 return -EPERM; 5619 /* 5620 * Updating the link count will likely cause i_uid and i_gid to 5621 * be written back improperly if their true value is unknown to 5622 * the vfs. 
5623 */ 5624 if (HAS_UNMAPPED_ID(idmap, inode)) 5625 return -EPERM; 5626 if (!dir->i_op->link) 5627 return -EPERM; 5628 if (S_ISDIR(inode->i_mode)) 5629 return -EPERM; 5630 5631 error = security_inode_link(old_dentry, dir, new_dentry); 5632 if (error) 5633 return error; 5634 5635 inode_lock(inode); 5636 /* Make sure we don't allow creating hardlink to an unlinked file */ 5637 if (inode->i_nlink == 0 && !(inode_state_read_once(inode) & I_LINKABLE)) 5638 error = -ENOENT; 5639 else if (max_links && inode->i_nlink >= max_links) 5640 error = -EMLINK; 5641 else { 5642 error = try_break_deleg(dir, delegated_inode); 5643 if (!error) 5644 error = try_break_deleg(inode, delegated_inode); 5645 if (!error) 5646 error = dir->i_op->link(old_dentry, dir, new_dentry); 5647 } 5648 5649 if (!error && (inode_state_read_once(inode) & I_LINKABLE)) { 5650 spin_lock(&inode->i_lock); 5651 inode_state_clear(inode, I_LINKABLE); 5652 spin_unlock(&inode->i_lock); 5653 } 5654 inode_unlock(inode); 5655 if (!error) 5656 fsnotify_link(dir, inode, new_dentry); 5657 return error; 5658 } 5659 EXPORT_SYMBOL(vfs_link); 5660 5661 /* 5662 * Hardlinks are often used in delicate situations. We avoid 5663 * security-related surprises by not following symlinks on the 5664 * newname. --KAB 5665 * 5666 * We don't follow them on the oldname either to be compatible 5667 * with linux 2.0, and to avoid hard-linking to directories 5668 * and other special files. --ADM 5669 */ 5670 int do_linkat(int olddfd, struct filename *old, int newdfd, 5671 struct filename *new, int flags) 5672 { 5673 struct mnt_idmap *idmap; 5674 struct dentry *new_dentry; 5675 struct path old_path, new_path; 5676 struct delegated_inode delegated_inode = { }; 5677 int how = 0; 5678 int error; 5679 5680 if ((flags & ~(AT_SYMLINK_FOLLOW | AT_EMPTY_PATH)) != 0) { 5681 error = -EINVAL; 5682 goto out_putnames; 5683 } 5684 /* 5685 * To use null names we require CAP_DAC_READ_SEARCH or 5686 * that the open-time creds of the dfd matches current. 
5687 * This ensures that not everyone will be able to create 5688 * a hardlink using the passed file descriptor. 5689 */ 5690 if (flags & AT_EMPTY_PATH) 5691 how |= LOOKUP_LINKAT_EMPTY; 5692 5693 if (flags & AT_SYMLINK_FOLLOW) 5694 how |= LOOKUP_FOLLOW; 5695 retry: 5696 error = filename_lookup(olddfd, old, how, &old_path, NULL); 5697 if (error) 5698 goto out_putnames; 5699 5700 new_dentry = filename_create(newdfd, new, &new_path, 5701 (how & LOOKUP_REVAL)); 5702 error = PTR_ERR(new_dentry); 5703 if (IS_ERR(new_dentry)) 5704 goto out_putpath; 5705 5706 error = -EXDEV; 5707 if (old_path.mnt != new_path.mnt) 5708 goto out_dput; 5709 idmap = mnt_idmap(new_path.mnt); 5710 error = may_linkat(idmap, &old_path); 5711 if (unlikely(error)) 5712 goto out_dput; 5713 error = security_path_link(old_path.dentry, &new_path, new_dentry); 5714 if (error) 5715 goto out_dput; 5716 error = vfs_link(old_path.dentry, idmap, new_path.dentry->d_inode, 5717 new_dentry, &delegated_inode); 5718 out_dput: 5719 end_creating_path(&new_path, new_dentry); 5720 if (is_delegated(&delegated_inode)) { 5721 error = break_deleg_wait(&delegated_inode); 5722 if (!error) { 5723 path_put(&old_path); 5724 goto retry; 5725 } 5726 } 5727 if (retry_estale(error, how)) { 5728 path_put(&old_path); 5729 how |= LOOKUP_REVAL; 5730 goto retry; 5731 } 5732 out_putpath: 5733 path_put(&old_path); 5734 out_putnames: 5735 putname(old); 5736 putname(new); 5737 5738 return error; 5739 } 5740 5741 SYSCALL_DEFINE5(linkat, int, olddfd, const char __user *, oldname, 5742 int, newdfd, const char __user *, newname, int, flags) 5743 { 5744 return do_linkat(olddfd, getname_uflags(oldname, flags), 5745 newdfd, getname(newname), flags); 5746 } 5747 5748 SYSCALL_DEFINE2(link, const char __user *, oldname, const char __user *, newname) 5749 { 5750 return do_linkat(AT_FDCWD, getname(oldname), AT_FDCWD, getname(newname), 0); 5751 } 5752 5753 /** 5754 * vfs_rename - rename a filesystem object 5755 * @rd: pointer to &struct renamedata info 
 *
 * The caller must hold multiple mutexes--see lock_rename().
 *
 * If vfs_rename discovers a delegation in need of breaking at either
 * the source or destination, it will return -EWOULDBLOCK and return a
 * reference to the inode in delegated_inode.  The caller should then
 * break the delegation and retry.  Because breaking a delegation may
 * take a long time, the caller should drop all locks before doing
 * so.
 *
 * Alternatively, a caller may pass NULL for delegated_inode.  This may
 * be appropriate for callers that expect the underlying filesystem not
 * to be NFS exported.
 *
 * The worst of all namespace operations - renaming directory. "Perverted"
 * doesn't even start to describe it. Somebody in UCB had a heck of a trip...
 * Problems:
 *
 *	a) we can get into loop creation.
 *	b) race potential - two innocent renames can create a loop together.
 *	   That's where 4.4BSD screws up. Current fix: serialization on
 *	   sb->s_vfs_rename_mutex. We might be more accurate, but that's another
 *	   story.
 *	c) we may have to lock up to _four_ objects - parents and victim (if it exists),
 *	   and source (if it's a non-directory or a subdirectory that moves to
 *	   different parent).
 *	   And that - after we got ->i_rwsem on parents (until then we don't know
 *	   whether the target exists).  Solution: try to be smart with locking
 *	   order for inodes.  We rely on the fact that tree topology may change
 *	   only under ->s_vfs_rename_mutex _and_ that parent of the object we
 *	   move will be locked.  Thus we can rank directories by the tree
 *	   (ancestors first) and rank all non-directories after them.
 *	   That works since everybody except rename does "lock parent, lookup,
 *	   lock child" and rename is under ->s_vfs_rename_mutex.
 *	   HOWEVER, it relies on the assumption that any object with ->lookup()
 *	   has no more than 1 dentry.
If "hybrid" objects will ever appear, 5792 * we'd better make sure that there's no link(2) for them. 5793 * d) conversion from fhandle to dentry may come in the wrong moment - when 5794 * we are removing the target. Solution: we will have to grab ->i_rwsem 5795 * in the fhandle_to_dentry code. [FIXME - current nfsfh.c relies on 5796 * ->i_rwsem on parents, which works but leads to some truly excessive 5797 * locking]. 5798 */ 5799 int vfs_rename(struct renamedata *rd) 5800 { 5801 int error; 5802 struct inode *old_dir = d_inode(rd->old_parent); 5803 struct inode *new_dir = d_inode(rd->new_parent); 5804 struct dentry *old_dentry = rd->old_dentry; 5805 struct dentry *new_dentry = rd->new_dentry; 5806 struct delegated_inode *delegated_inode = rd->delegated_inode; 5807 unsigned int flags = rd->flags; 5808 bool is_dir = d_is_dir(old_dentry); 5809 struct inode *source = old_dentry->d_inode; 5810 struct inode *target = new_dentry->d_inode; 5811 bool new_is_dir = false; 5812 unsigned max_links = new_dir->i_sb->s_max_links; 5813 struct name_snapshot old_name; 5814 bool lock_old_subdir, lock_new_subdir; 5815 5816 if (source == target) 5817 return 0; 5818 5819 error = may_delete(rd->mnt_idmap, old_dir, old_dentry, is_dir); 5820 if (error) 5821 return error; 5822 5823 if (!target) { 5824 error = may_create(rd->mnt_idmap, new_dir, new_dentry); 5825 } else { 5826 new_is_dir = d_is_dir(new_dentry); 5827 5828 if (!(flags & RENAME_EXCHANGE)) 5829 error = may_delete(rd->mnt_idmap, new_dir, 5830 new_dentry, is_dir); 5831 else 5832 error = may_delete(rd->mnt_idmap, new_dir, 5833 new_dentry, new_is_dir); 5834 } 5835 if (error) 5836 return error; 5837 5838 if (!old_dir->i_op->rename) 5839 return -EPERM; 5840 5841 /* 5842 * If we are going to change the parent - check write permissions, 5843 * we'll need to flip '..'. 
5844 */ 5845 if (new_dir != old_dir) { 5846 if (is_dir) { 5847 error = inode_permission(rd->mnt_idmap, source, 5848 MAY_WRITE); 5849 if (error) 5850 return error; 5851 } 5852 if ((flags & RENAME_EXCHANGE) && new_is_dir) { 5853 error = inode_permission(rd->mnt_idmap, target, 5854 MAY_WRITE); 5855 if (error) 5856 return error; 5857 } 5858 } 5859 5860 error = security_inode_rename(old_dir, old_dentry, new_dir, new_dentry, 5861 flags); 5862 if (error) 5863 return error; 5864 5865 take_dentry_name_snapshot(&old_name, old_dentry); 5866 dget(new_dentry); 5867 /* 5868 * Lock children. 5869 * The source subdirectory needs to be locked on cross-directory 5870 * rename or cross-directory exchange since its parent changes. 5871 * The target subdirectory needs to be locked on cross-directory 5872 * exchange due to parent change and on any rename due to becoming 5873 * a victim. 5874 * Non-directories need locking in all cases (for NFS reasons); 5875 * they get locked after any subdirectories (in inode address order). 5876 * 5877 * NOTE: WE ONLY LOCK UNRELATED DIRECTORIES IN CROSS-DIRECTORY CASE. 5878 * NEVER, EVER DO THAT WITHOUT ->s_vfs_rename_mutex. 
5879 */ 5880 lock_old_subdir = new_dir != old_dir; 5881 lock_new_subdir = new_dir != old_dir || !(flags & RENAME_EXCHANGE); 5882 if (is_dir) { 5883 if (lock_old_subdir) 5884 inode_lock_nested(source, I_MUTEX_CHILD); 5885 if (target && (!new_is_dir || lock_new_subdir)) 5886 inode_lock(target); 5887 } else if (new_is_dir) { 5888 if (lock_new_subdir) 5889 inode_lock_nested(target, I_MUTEX_CHILD); 5890 inode_lock(source); 5891 } else { 5892 lock_two_nondirectories(source, target); 5893 } 5894 5895 error = -EPERM; 5896 if (IS_SWAPFILE(source) || (target && IS_SWAPFILE(target))) 5897 goto out; 5898 5899 error = -EBUSY; 5900 if (is_local_mountpoint(old_dentry) || is_local_mountpoint(new_dentry)) 5901 goto out; 5902 5903 if (max_links && new_dir != old_dir) { 5904 error = -EMLINK; 5905 if (is_dir && !new_is_dir && new_dir->i_nlink >= max_links) 5906 goto out; 5907 if ((flags & RENAME_EXCHANGE) && !is_dir && new_is_dir && 5908 old_dir->i_nlink >= max_links) 5909 goto out; 5910 } 5911 error = try_break_deleg(old_dir, delegated_inode); 5912 if (error) 5913 goto out; 5914 if (new_dir != old_dir) { 5915 error = try_break_deleg(new_dir, delegated_inode); 5916 if (error) 5917 goto out; 5918 } 5919 if (!is_dir) { 5920 error = try_break_deleg(source, delegated_inode); 5921 if (error) 5922 goto out; 5923 } 5924 if (target && !new_is_dir) { 5925 error = try_break_deleg(target, delegated_inode); 5926 if (error) 5927 goto out; 5928 } 5929 error = old_dir->i_op->rename(rd->mnt_idmap, old_dir, old_dentry, 5930 new_dir, new_dentry, flags); 5931 if (error) 5932 goto out; 5933 5934 if (!(flags & RENAME_EXCHANGE) && target) { 5935 if (is_dir) { 5936 shrink_dcache_parent(new_dentry); 5937 target->i_flags |= S_DEAD; 5938 } 5939 dont_mount(new_dentry); 5940 detach_mounts(new_dentry); 5941 } 5942 if (!(old_dir->i_sb->s_type->fs_flags & FS_RENAME_DOES_D_MOVE)) { 5943 if (!(flags & RENAME_EXCHANGE)) 5944 d_move(old_dentry, new_dentry); 5945 else 5946 d_exchange(old_dentry, new_dentry); 5947 } 5948 
out: 5949 if (!is_dir || lock_old_subdir) 5950 inode_unlock(source); 5951 if (target && (!new_is_dir || lock_new_subdir)) 5952 inode_unlock(target); 5953 dput(new_dentry); 5954 if (!error) { 5955 fsnotify_move(old_dir, new_dir, &old_name.name, is_dir, 5956 !(flags & RENAME_EXCHANGE) ? target : NULL, old_dentry); 5957 if (flags & RENAME_EXCHANGE) { 5958 fsnotify_move(new_dir, old_dir, &old_dentry->d_name, 5959 new_is_dir, NULL, new_dentry); 5960 } 5961 } 5962 release_dentry_name_snapshot(&old_name); 5963 5964 return error; 5965 } 5966 EXPORT_SYMBOL(vfs_rename); 5967 5968 int do_renameat2(int olddfd, struct filename *from, int newdfd, 5969 struct filename *to, unsigned int flags) 5970 { 5971 struct renamedata rd; 5972 struct path old_path, new_path; 5973 struct qstr old_last, new_last; 5974 int old_type, new_type; 5975 struct delegated_inode delegated_inode = { }; 5976 unsigned int lookup_flags = 0; 5977 bool should_retry = false; 5978 int error = -EINVAL; 5979 5980 if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE | RENAME_WHITEOUT)) 5981 goto put_names; 5982 5983 if ((flags & (RENAME_NOREPLACE | RENAME_WHITEOUT)) && 5984 (flags & RENAME_EXCHANGE)) 5985 goto put_names; 5986 5987 retry: 5988 error = filename_parentat(olddfd, from, lookup_flags, &old_path, 5989 &old_last, &old_type); 5990 if (error) 5991 goto put_names; 5992 5993 error = filename_parentat(newdfd, to, lookup_flags, &new_path, &new_last, 5994 &new_type); 5995 if (error) 5996 goto exit1; 5997 5998 error = -EXDEV; 5999 if (old_path.mnt != new_path.mnt) 6000 goto exit2; 6001 6002 error = -EBUSY; 6003 if (old_type != LAST_NORM) 6004 goto exit2; 6005 6006 if (flags & RENAME_NOREPLACE) 6007 error = -EEXIST; 6008 if (new_type != LAST_NORM) 6009 goto exit2; 6010 6011 error = mnt_want_write(old_path.mnt); 6012 if (error) 6013 goto exit2; 6014 6015 retry_deleg: 6016 rd.old_parent = old_path.dentry; 6017 rd.mnt_idmap = mnt_idmap(old_path.mnt); 6018 rd.new_parent = new_path.dentry; 6019 rd.delegated_inode = 
&delegated_inode; 6020 rd.flags = flags; 6021 6022 error = __start_renaming(&rd, lookup_flags, &old_last, &new_last); 6023 if (error) 6024 goto exit_lock_rename; 6025 6026 if (flags & RENAME_EXCHANGE) { 6027 if (!d_is_dir(rd.new_dentry)) { 6028 error = -ENOTDIR; 6029 if (new_last.name[new_last.len]) 6030 goto exit_unlock; 6031 } 6032 } 6033 /* unless the source is a directory trailing slashes give -ENOTDIR */ 6034 if (!d_is_dir(rd.old_dentry)) { 6035 error = -ENOTDIR; 6036 if (old_last.name[old_last.len]) 6037 goto exit_unlock; 6038 if (!(flags & RENAME_EXCHANGE) && new_last.name[new_last.len]) 6039 goto exit_unlock; 6040 } 6041 6042 error = security_path_rename(&old_path, rd.old_dentry, 6043 &new_path, rd.new_dentry, flags); 6044 if (error) 6045 goto exit_unlock; 6046 6047 error = vfs_rename(&rd); 6048 exit_unlock: 6049 end_renaming(&rd); 6050 exit_lock_rename: 6051 if (is_delegated(&delegated_inode)) { 6052 error = break_deleg_wait(&delegated_inode); 6053 if (!error) 6054 goto retry_deleg; 6055 } 6056 mnt_drop_write(old_path.mnt); 6057 exit2: 6058 if (retry_estale(error, lookup_flags)) 6059 should_retry = true; 6060 path_put(&new_path); 6061 exit1: 6062 path_put(&old_path); 6063 if (should_retry) { 6064 should_retry = false; 6065 lookup_flags |= LOOKUP_REVAL; 6066 goto retry; 6067 } 6068 put_names: 6069 putname(from); 6070 putname(to); 6071 return error; 6072 } 6073 6074 SYSCALL_DEFINE5(renameat2, int, olddfd, const char __user *, oldname, 6075 int, newdfd, const char __user *, newname, unsigned int, flags) 6076 { 6077 return do_renameat2(olddfd, getname(oldname), newdfd, getname(newname), 6078 flags); 6079 } 6080 6081 SYSCALL_DEFINE4(renameat, int, olddfd, const char __user *, oldname, 6082 int, newdfd, const char __user *, newname) 6083 { 6084 return do_renameat2(olddfd, getname(oldname), newdfd, getname(newname), 6085 0); 6086 } 6087 6088 SYSCALL_DEFINE2(rename, const char __user *, oldname, const char __user *, newname) 6089 { 6090 return 
do_renameat2(AT_FDCWD, getname(oldname), AT_FDCWD, 6091 getname(newname), 0); 6092 } 6093 6094 int readlink_copy(char __user *buffer, int buflen, const char *link, int linklen) 6095 { 6096 int copylen; 6097 6098 copylen = linklen; 6099 if (unlikely(copylen > (unsigned) buflen)) 6100 copylen = buflen; 6101 if (copy_to_user(buffer, link, copylen)) 6102 copylen = -EFAULT; 6103 return copylen; 6104 } 6105 6106 /** 6107 * vfs_readlink - copy symlink body into userspace buffer 6108 * @dentry: dentry on which to get symbolic link 6109 * @buffer: user memory pointer 6110 * @buflen: size of buffer 6111 * 6112 * Does not touch atime. That's up to the caller if necessary 6113 * 6114 * Does not call security hook. 6115 */ 6116 int vfs_readlink(struct dentry *dentry, char __user *buffer, int buflen) 6117 { 6118 struct inode *inode = d_inode(dentry); 6119 DEFINE_DELAYED_CALL(done); 6120 const char *link; 6121 int res; 6122 6123 if (inode->i_opflags & IOP_CACHED_LINK) 6124 return readlink_copy(buffer, buflen, inode->i_link, inode->i_linklen); 6125 6126 if (unlikely(!(inode->i_opflags & IOP_DEFAULT_READLINK))) { 6127 if (unlikely(inode->i_op->readlink)) 6128 return inode->i_op->readlink(dentry, buffer, buflen); 6129 6130 if (!d_is_symlink(dentry)) 6131 return -EINVAL; 6132 6133 spin_lock(&inode->i_lock); 6134 inode->i_opflags |= IOP_DEFAULT_READLINK; 6135 spin_unlock(&inode->i_lock); 6136 } 6137 6138 link = READ_ONCE(inode->i_link); 6139 if (!link) { 6140 link = inode->i_op->get_link(dentry, inode, &done); 6141 if (IS_ERR(link)) 6142 return PTR_ERR(link); 6143 } 6144 res = readlink_copy(buffer, buflen, link, strlen(link)); 6145 do_delayed_call(&done); 6146 return res; 6147 } 6148 EXPORT_SYMBOL(vfs_readlink); 6149 6150 /** 6151 * vfs_get_link - get symlink body 6152 * @dentry: dentry on which to get symbolic link 6153 * @done: caller needs to free returned data with this 6154 * 6155 * Calls security hook and i_op->get_link() on the supplied inode. 
6156 * 6157 * It does not touch atime. That's up to the caller if necessary. 6158 * 6159 * Does not work on "special" symlinks like /proc/$$/fd/N 6160 */ 6161 const char *vfs_get_link(struct dentry *dentry, struct delayed_call *done) 6162 { 6163 const char *res = ERR_PTR(-EINVAL); 6164 struct inode *inode = d_inode(dentry); 6165 6166 if (d_is_symlink(dentry)) { 6167 res = ERR_PTR(security_inode_readlink(dentry)); 6168 if (!res) 6169 res = inode->i_op->get_link(dentry, inode, done); 6170 } 6171 return res; 6172 } 6173 EXPORT_SYMBOL(vfs_get_link); 6174 6175 /* get the link contents into pagecache */ 6176 static char *__page_get_link(struct dentry *dentry, struct inode *inode, 6177 struct delayed_call *callback) 6178 { 6179 struct folio *folio; 6180 struct address_space *mapping = inode->i_mapping; 6181 6182 if (!dentry) { 6183 folio = filemap_get_folio(mapping, 0); 6184 if (IS_ERR(folio)) 6185 return ERR_PTR(-ECHILD); 6186 if (!folio_test_uptodate(folio)) { 6187 folio_put(folio); 6188 return ERR_PTR(-ECHILD); 6189 } 6190 } else { 6191 folio = read_mapping_folio(mapping, 0, NULL); 6192 if (IS_ERR(folio)) 6193 return ERR_CAST(folio); 6194 } 6195 set_delayed_call(callback, page_put_link, folio); 6196 BUG_ON(mapping_gfp_mask(mapping) & __GFP_HIGHMEM); 6197 return folio_address(folio); 6198 } 6199 6200 const char *page_get_link_raw(struct dentry *dentry, struct inode *inode, 6201 struct delayed_call *callback) 6202 { 6203 return __page_get_link(dentry, inode, callback); 6204 } 6205 EXPORT_SYMBOL_GPL(page_get_link_raw); 6206 6207 /** 6208 * page_get_link() - An implementation of the get_link inode_operation. 6209 * @dentry: The directory entry which is the symlink. 6210 * @inode: The inode for the symlink. 6211 * @callback: Used to drop the reference to the symlink. 6212 * 6213 * Filesystems which store their symlinks in the page cache should use 6214 * this to implement the get_link() member of their inode_operations. 
6215 * 6216 * Return: A pointer to the NUL-terminated symlink. 6217 */ 6218 const char *page_get_link(struct dentry *dentry, struct inode *inode, 6219 struct delayed_call *callback) 6220 { 6221 char *kaddr = __page_get_link(dentry, inode, callback); 6222 6223 if (!IS_ERR(kaddr)) 6224 nd_terminate_link(kaddr, inode->i_size, PAGE_SIZE - 1); 6225 return kaddr; 6226 } 6227 EXPORT_SYMBOL(page_get_link); 6228 6229 /** 6230 * page_put_link() - Drop the reference to the symlink. 6231 * @arg: The folio which contains the symlink. 6232 * 6233 * This is used internally by page_get_link(). It is exported for use 6234 * by filesystems which need to implement a variant of page_get_link() 6235 * themselves. Despite the apparent symmetry, filesystems which use 6236 * page_get_link() do not need to call page_put_link(). 6237 * 6238 * The argument, while it has a void pointer type, must be a pointer to 6239 * the folio which was retrieved from the page cache. The delayed_call 6240 * infrastructure is used to drop the reference count once the caller 6241 * is done with the symlink. 
6242 */ 6243 void page_put_link(void *arg) 6244 { 6245 folio_put(arg); 6246 } 6247 EXPORT_SYMBOL(page_put_link); 6248 6249 int page_readlink(struct dentry *dentry, char __user *buffer, int buflen) 6250 { 6251 const char *link; 6252 int res; 6253 6254 DEFINE_DELAYED_CALL(done); 6255 link = page_get_link(dentry, d_inode(dentry), &done); 6256 res = PTR_ERR(link); 6257 if (!IS_ERR(link)) 6258 res = readlink_copy(buffer, buflen, link, strlen(link)); 6259 do_delayed_call(&done); 6260 return res; 6261 } 6262 EXPORT_SYMBOL(page_readlink); 6263 6264 int page_symlink(struct inode *inode, const char *symname, int len) 6265 { 6266 struct address_space *mapping = inode->i_mapping; 6267 const struct address_space_operations *aops = mapping->a_ops; 6268 bool nofs = !mapping_gfp_constraint(mapping, __GFP_FS); 6269 struct folio *folio; 6270 void *fsdata = NULL; 6271 int err; 6272 unsigned int flags; 6273 6274 retry: 6275 if (nofs) 6276 flags = memalloc_nofs_save(); 6277 err = aops->write_begin(NULL, mapping, 0, len-1, &folio, &fsdata); 6278 if (nofs) 6279 memalloc_nofs_restore(flags); 6280 if (err) 6281 goto fail; 6282 6283 memcpy(folio_address(folio), symname, len - 1); 6284 6285 err = aops->write_end(NULL, mapping, 0, len - 1, len - 1, 6286 folio, fsdata); 6287 if (err < 0) 6288 goto fail; 6289 if (err < len-1) 6290 goto retry; 6291 6292 mark_inode_dirty(inode); 6293 return 0; 6294 fail: 6295 return err; 6296 } 6297 EXPORT_SYMBOL(page_symlink); 6298 6299 const struct inode_operations page_symlink_inode_operations = { 6300 .get_link = page_get_link, 6301 }; 6302 EXPORT_SYMBOL(page_symlink_inode_operations); 6303