1 // SPDX-License-Identifier: GPL-2.0-only 2 /* 3 * linux/fs/open.c 4 * 5 * Copyright (C) 1991, 1992 Linus Torvalds 6 */ 7 8 #include <linux/string.h> 9 #include <linux/mm.h> 10 #include <linux/file.h> 11 #include <linux/fdtable.h> 12 #include <linux/fsnotify.h> 13 #include <linux/module.h> 14 #include <linux/tty.h> 15 #include <linux/namei.h> 16 #include <linux/backing-dev.h> 17 #include <linux/capability.h> 18 #include <linux/securebits.h> 19 #include <linux/security.h> 20 #include <linux/mount.h> 21 #include <linux/fcntl.h> 22 #include <linux/slab.h> 23 #include <linux/uaccess.h> 24 #include <linux/fs.h> 25 #include <linux/personality.h> 26 #include <linux/pagemap.h> 27 #include <linux/syscalls.h> 28 #include <linux/rcupdate.h> 29 #include <linux/audit.h> 30 #include <linux/falloc.h> 31 #include <linux/fs_struct.h> 32 #include <linux/ima.h> 33 #include <linux/dnotify.h> 34 #include <linux/compat.h> 35 #include <linux/mnt_idmapping.h> 36 #include <linux/filelock.h> 37 38 #include "internal.h" 39 40 int do_truncate(struct mnt_idmap *idmap, struct dentry *dentry, 41 loff_t length, unsigned int time_attrs, struct file *filp) 42 { 43 int ret; 44 struct iattr newattrs; 45 46 /* Not pretty: "inode->i_size" shouldn't really be signed. But it is. */ 47 if (length < 0) 48 return -EINVAL; 49 50 newattrs.ia_size = length; 51 newattrs.ia_valid = ATTR_SIZE | time_attrs; 52 if (filp) { 53 newattrs.ia_file = filp; 54 newattrs.ia_valid |= ATTR_FILE; 55 } 56 57 /* Remove suid, sgid, and file capabilities on truncate too */ 58 ret = dentry_needs_remove_privs(idmap, dentry); 59 if (ret < 0) 60 return ret; 61 if (ret) 62 newattrs.ia_valid |= ret | ATTR_FORCE; 63 64 inode_lock(dentry->d_inode); 65 /* Note any delegations or leases have already been broken: */ 66 ret = notify_change(idmap, dentry, &newattrs, NULL); 67 inode_unlock(dentry->d_inode); 68 return ret; 69 } 70 71 long vfs_truncate(const struct path *path, loff_t length) 72 { 73 struct mnt_idmap *idmap; 74 struct inode *inode; 75 long error; 76 77 inode = path->dentry->d_inode; 78 79 /* For directories it's -EISDIR, for other non-regulars - -EINVAL */ 80 if (S_ISDIR(inode->i_mode)) 81 return -EISDIR; 82 if (!S_ISREG(inode->i_mode)) 83 return -EINVAL; 84 85 error = mnt_want_write(path->mnt); 86 if (error) 87 goto out; 88 89 idmap = mnt_idmap(path->mnt); 90 error = inode_permission(idmap, inode, MAY_WRITE); 91 if (error) 92 goto mnt_drop_write_and_out; 93 94 error = -EPERM; 95 if (IS_APPEND(inode)) 96 goto mnt_drop_write_and_out; 97 98 error = get_write_access(inode); 99 if (error) 100 goto mnt_drop_write_and_out; 101 102 /* 103 * Make sure that there are no leases. get_write_access() protects 104 * against the truncate racing with a lease-granting setlease(). 105 */ 106 error = break_lease(inode, O_WRONLY); 107 if (error) 108 goto put_write_and_out; 109 110 error = security_path_truncate(path); 111 if (!error) 112 error = do_truncate(idmap, path->dentry, length, 0, NULL); 113 114 put_write_and_out: 115 put_write_access(inode); 116 mnt_drop_write_and_out: 117 mnt_drop_write(path->mnt); 118 out: 119 return error; 120 } 121 EXPORT_SYMBOL_GPL(vfs_truncate); 122 123 long do_sys_truncate(const char __user *pathname, loff_t length) 124 { 125 unsigned int lookup_flags = LOOKUP_FOLLOW; 126 struct path path; 127 int error; 128 129 if (length < 0) /* sorry, but loff_t says... */ 130 return -EINVAL; 131 132 retry: 133 error = user_path_at(AT_FDCWD, pathname, lookup_flags, &path); 134 if (!error) { 135 error = vfs_truncate(&path, length); 136 path_put(&path); 137 } 138 if (retry_estale(error, lookup_flags)) { 139 lookup_flags |= LOOKUP_REVAL; 140 goto retry; 141 } 142 return error; 143 } 144 145 SYSCALL_DEFINE2(truncate, const char __user *, path, long, length) 146 { 147 return do_sys_truncate(path, length); 148 } 149 150 #ifdef CONFIG_COMPAT 151 COMPAT_SYSCALL_DEFINE2(truncate, const char __user *, path, compat_off_t, length) 152 { 153 return do_sys_truncate(path, length); 154 } 155 #endif 156 157 long do_ftruncate(struct file *file, loff_t length, int small) 158 { 159 struct inode *inode; 160 struct dentry *dentry; 161 int error; 162 163 /* explicitly opened as large or we are on 64-bit box */ 164 if (file->f_flags & O_LARGEFILE) 165 small = 0; 166 167 dentry = file->f_path.dentry; 168 inode = dentry->d_inode; 169 if (!S_ISREG(inode->i_mode) || !(file->f_mode & FMODE_WRITE)) 170 return -EINVAL; 171 172 /* Cannot ftruncate over 2^31 bytes without large file support */ 173 if (small && length > MAX_NON_LFS) 174 return -EINVAL; 175 176 /* Check IS_APPEND on real upper inode */ 177 if (IS_APPEND(file_inode(file))) 178 return -EPERM; 179 sb_start_write(inode->i_sb); 180 error = security_file_truncate(file); 181 if (!error) 182 error = do_truncate(file_mnt_idmap(file), dentry, length, 183 ATTR_MTIME | ATTR_CTIME, file); 184 sb_end_write(inode->i_sb); 185 186 return error; 187 } 188 189 long do_sys_ftruncate(unsigned int fd, loff_t length, int small) 190 { 191 struct fd f; 192 int error; 193 194 if (length < 0) 195 return -EINVAL; 196 f = fdget(fd); 197 if (!f.file) 198 return -EBADF; 199 200 error = do_ftruncate(f.file, length, small); 201 202 fdput(f); 203 return error; 204 } 205 206 SYSCALL_DEFINE2(ftruncate, unsigned int, fd, unsigned long, length) 207 { 208 return do_sys_ftruncate(fd, length, 1); 209 } 210 211 #ifdef CONFIG_COMPAT 212 COMPAT_SYSCALL_DEFINE2(ftruncate, unsigned int, fd, compat_ulong_t, length) 213 { 214 return do_sys_ftruncate(fd, length, 1); 215 } 216 #endif 217 218 /* LFS versions of truncate are only needed on 32 bit machines */ 219 #if BITS_PER_LONG == 32 220 SYSCALL_DEFINE2(truncate64, const char __user *, path, loff_t, length) 221 { 222 return do_sys_truncate(path, length); 223 } 224 225 SYSCALL_DEFINE2(ftruncate64, unsigned int, fd, loff_t, length) 226 { 227 return do_sys_ftruncate(fd, length, 0); 228 } 229 #endif /* BITS_PER_LONG == 32 */ 230 231 #if defined(CONFIG_COMPAT) && defined(__ARCH_WANT_COMPAT_TRUNCATE64) 232 COMPAT_SYSCALL_DEFINE3(truncate64, const char __user *, pathname, 233 compat_arg_u64_dual(length)) 234 { 235 return ksys_truncate(pathname, compat_arg_u64_glue(length)); 236 } 237 #endif 238 239 #if defined(CONFIG_COMPAT) && defined(__ARCH_WANT_COMPAT_FTRUNCATE64) 240 COMPAT_SYSCALL_DEFINE3(ftruncate64, unsigned int, fd, 241 compat_arg_u64_dual(length)) 242 { 243 return ksys_ftruncate(fd, compat_arg_u64_glue(length)); 244 } 245 #endif 246 247 int vfs_fallocate(struct file *file, int mode, loff_t offset, loff_t len) 248 { 249 struct inode *inode = file_inode(file); 250 long ret; 251 252 if (offset < 0 || len <= 0) 253 return -EINVAL; 254 255 /* Return error if mode is not supported */ 256 if (mode & ~FALLOC_FL_SUPPORTED_MASK) 257 return -EOPNOTSUPP; 258 259 /* Punch hole and zero range are mutually exclusive */ 260 if ((mode & (FALLOC_FL_PUNCH_HOLE | FALLOC_FL_ZERO_RANGE)) == 261 (FALLOC_FL_PUNCH_HOLE | FALLOC_FL_ZERO_RANGE)) 262 return -EOPNOTSUPP; 263 264 /* Punch hole must have keep size set */ 265 if ((mode & FALLOC_FL_PUNCH_HOLE) && 266 !(mode & FALLOC_FL_KEEP_SIZE)) 267 return -EOPNOTSUPP; 268 269 /* Collapse range should only be used exclusively. */ 270 if ((mode & FALLOC_FL_COLLAPSE_RANGE) && 271 (mode & ~FALLOC_FL_COLLAPSE_RANGE)) 272 return -EINVAL; 273 274 /* Insert range should only be used exclusively. */ 275 if ((mode & FALLOC_FL_INSERT_RANGE) && 276 (mode & ~FALLOC_FL_INSERT_RANGE)) 277 return -EINVAL; 278 279 /* Unshare range should only be used with allocate mode. */ 280 if ((mode & FALLOC_FL_UNSHARE_RANGE) && 281 (mode & ~(FALLOC_FL_UNSHARE_RANGE | FALLOC_FL_KEEP_SIZE))) 282 return -EINVAL; 283 284 if (!(file->f_mode & FMODE_WRITE)) 285 return -EBADF; 286 287 /* 288 * We can only allow pure fallocate on append only files 289 */ 290 if ((mode & ~FALLOC_FL_KEEP_SIZE) && IS_APPEND(inode)) 291 return -EPERM; 292 293 if (IS_IMMUTABLE(inode)) 294 return -EPERM; 295 296 /* 297 * We cannot allow any fallocate operation on an active swapfile 298 */ 299 if (IS_SWAPFILE(inode)) 300 return -ETXTBSY; 301 302 /* 303 * Revalidate the write permissions, in case security policy has 304 * changed since the files were opened. 305 */ 306 ret = security_file_permission(file, MAY_WRITE); 307 if (ret) 308 return ret; 309 310 ret = fsnotify_file_area_perm(file, MAY_WRITE, &offset, len); 311 if (ret) 312 return ret; 313 314 if (S_ISFIFO(inode->i_mode)) 315 return -ESPIPE; 316 317 if (S_ISDIR(inode->i_mode)) 318 return -EISDIR; 319 320 if (!S_ISREG(inode->i_mode) && !S_ISBLK(inode->i_mode)) 321 return -ENODEV; 322 323 /* Check for wrap through zero too */ 324 if (((offset + len) > inode->i_sb->s_maxbytes) || ((offset + len) < 0)) 325 return -EFBIG; 326 327 if (!file->f_op->fallocate) 328 return -EOPNOTSUPP; 329 330 file_start_write(file); 331 ret = file->f_op->fallocate(file, mode, offset, len); 332 333 /* 334 * Create inotify and fanotify events. 335 * 336 * To keep the logic simple always create events if fallocate succeeds. 337 * This implies that events are even created if the file size remains 338 * unchanged, e.g. when using flag FALLOC_FL_KEEP_SIZE. 339 */ 340 if (ret == 0) 341 fsnotify_modify(file); 342 343 file_end_write(file); 344 return ret; 345 } 346 EXPORT_SYMBOL_GPL(vfs_fallocate); 347 348 int ksys_fallocate(int fd, int mode, loff_t offset, loff_t len) 349 { 350 struct fd f = fdget(fd); 351 int error = -EBADF; 352 353 if (f.file) { 354 error = vfs_fallocate(f.file, mode, offset, len); 355 fdput(f); 356 } 357 return error; 358 } 359 360 SYSCALL_DEFINE4(fallocate, int, fd, int, mode, loff_t, offset, loff_t, len) 361 { 362 return ksys_fallocate(fd, mode, offset, len); 363 } 364 365 #if defined(CONFIG_COMPAT) && defined(__ARCH_WANT_COMPAT_FALLOCATE) 366 COMPAT_SYSCALL_DEFINE6(fallocate, int, fd, int, mode, compat_arg_u64_dual(offset), 367 compat_arg_u64_dual(len)) 368 { 369 return ksys_fallocate(fd, mode, compat_arg_u64_glue(offset), 370 compat_arg_u64_glue(len)); 371 } 372 #endif 373 374 /* 375 * access() needs to use the real uid/gid, not the effective uid/gid. 376 * We do this by temporarily clearing all FS-related capabilities and 377 * switching the fsuid/fsgid around to the real ones. 378 * 379 * Creating new credentials is expensive, so we try to skip doing it, 380 * which we can if the result would match what we already got. 381 */ 382 static bool access_need_override_creds(int flags) 383 { 384 const struct cred *cred; 385 386 if (flags & AT_EACCESS) 387 return false; 388 389 cred = current_cred(); 390 if (!uid_eq(cred->fsuid, cred->uid) || 391 !gid_eq(cred->fsgid, cred->gid)) 392 return true; 393 394 if (!issecure(SECURE_NO_SETUID_FIXUP)) { 395 kuid_t root_uid = make_kuid(cred->user_ns, 0); 396 if (!uid_eq(cred->uid, root_uid)) { 397 if (!cap_isclear(cred->cap_effective)) 398 return true; 399 } else { 400 if (!cap_isidentical(cred->cap_effective, 401 cred->cap_permitted)) 402 return true; 403 } 404 } 405 406 return false; 407 } 408 409 static const struct cred *access_override_creds(void) 410 { 411 const struct cred *old_cred; 412 struct cred *override_cred; 413 414 override_cred = prepare_creds(); 415 if (!override_cred) 416 return NULL; 417 418 /* 419 * XXX access_need_override_creds performs checks in hopes of skipping 420 * this work. Make sure it stays in sync if making any changes in this 421 * routine. 422 */ 423 424 override_cred->fsuid = override_cred->uid; 425 override_cred->fsgid = override_cred->gid; 426 427 if (!issecure(SECURE_NO_SETUID_FIXUP)) { 428 /* Clear the capabilities if we switch to a non-root user */ 429 kuid_t root_uid = make_kuid(override_cred->user_ns, 0); 430 if (!uid_eq(override_cred->uid, root_uid)) 431 cap_clear(override_cred->cap_effective); 432 else 433 override_cred->cap_effective = 434 override_cred->cap_permitted; 435 } 436 437 /* 438 * The new set of credentials can *only* be used in 439 * task-synchronous circumstances, and does not need 440 * RCU freeing, unless somebody then takes a separate 441 * reference to it. 442 * 443 * NOTE! This is _only_ true because this credential 444 * is used purely for override_creds() that installs 445 * it as the subjective cred. Other threads will be 446 * accessing ->real_cred, not the subjective cred. 447 * 448 * If somebody _does_ make a copy of this (using the 449 * 'get_current_cred()' function), that will clear the 450 * non_rcu field, because now that other user may be 451 * expecting RCU freeing. But normal thread-synchronous 452 * cred accesses will keep things non-racy to avoid RCU 453 * freeing. 454 */ 455 override_cred->non_rcu = 1; 456 457 old_cred = override_creds(override_cred); 458 459 /* override_cred() gets its own ref */ 460 put_cred(override_cred); 461 462 return old_cred; 463 } 464 465 static long do_faccessat(int dfd, const char __user *filename, int mode, int flags) 466 { 467 struct path path; 468 struct inode *inode; 469 int res; 470 unsigned int lookup_flags = LOOKUP_FOLLOW; 471 const struct cred *old_cred = NULL; 472 473 if (mode & ~S_IRWXO) /* where's F_OK, X_OK, W_OK, R_OK? */ 474 return -EINVAL; 475 476 if (flags & ~(AT_EACCESS | AT_SYMLINK_NOFOLLOW | AT_EMPTY_PATH)) 477 return -EINVAL; 478 479 if (flags & AT_SYMLINK_NOFOLLOW) 480 lookup_flags &= ~LOOKUP_FOLLOW; 481 if (flags & AT_EMPTY_PATH) 482 lookup_flags |= LOOKUP_EMPTY; 483 484 if (access_need_override_creds(flags)) { 485 old_cred = access_override_creds(); 486 if (!old_cred) 487 return -ENOMEM; 488 } 489 490 retry: 491 res = user_path_at(dfd, filename, lookup_flags, &path); 492 if (res) 493 goto out; 494 495 inode = d_backing_inode(path.dentry); 496 497 if ((mode & MAY_EXEC) && S_ISREG(inode->i_mode)) { 498 /* 499 * MAY_EXEC on regular files is denied if the fs is mounted 500 * with the "noexec" flag. 501 */ 502 res = -EACCES; 503 if (path_noexec(&path)) 504 goto out_path_release; 505 } 506 507 res = inode_permission(mnt_idmap(path.mnt), inode, mode | MAY_ACCESS); 508 /* SuS v2 requires we report a read only fs too */ 509 if (res || !(mode & S_IWOTH) || special_file(inode->i_mode)) 510 goto out_path_release; 511 /* 512 * This is a rare case where using __mnt_is_readonly() 513 * is OK without a mnt_want/drop_write() pair. Since 514 * no actual write to the fs is performed here, we do 515 * not need to telegraph to that to anyone. 516 * 517 * By doing this, we accept that this access is 518 * inherently racy and know that the fs may change 519 * state before we even see this result. 520 */ 521 if (__mnt_is_readonly(path.mnt)) 522 res = -EROFS; 523 524 out_path_release: 525 path_put(&path); 526 if (retry_estale(res, lookup_flags)) { 527 lookup_flags |= LOOKUP_REVAL; 528 goto retry; 529 } 530 out: 531 if (old_cred) 532 revert_creds(old_cred); 533 534 return res; 535 } 536 537 SYSCALL_DEFINE3(faccessat, int, dfd, const char __user *, filename, int, mode) 538 { 539 return do_faccessat(dfd, filename, mode, 0); 540 } 541 542 SYSCALL_DEFINE4(faccessat2, int, dfd, const char __user *, filename, int, mode, 543 int, flags) 544 { 545 return do_faccessat(dfd, filename, mode, flags); 546 } 547 548 SYSCALL_DEFINE2(access, const char __user *, filename, int, mode) 549 { 550 return do_faccessat(AT_FDCWD, filename, mode, 0); 551 } 552 553 SYSCALL_DEFINE1(chdir, const char __user *, filename) 554 { 555 struct path path; 556 int error; 557 unsigned int lookup_flags = LOOKUP_FOLLOW | LOOKUP_DIRECTORY; 558 retry: 559 error = user_path_at(AT_FDCWD, filename, lookup_flags, &path); 560 if (error) 561 goto out; 562 563 error = path_permission(&path, MAY_EXEC | MAY_CHDIR); 564 if (error) 565 goto dput_and_out; 566 567 set_fs_pwd(current->fs, &path); 568 569 dput_and_out: 570 path_put(&path); 571 if (retry_estale(error, lookup_flags)) { 572 lookup_flags |= LOOKUP_REVAL; 573 goto retry; 574 } 575 out: 576 return error; 577 } 578 579 SYSCALL_DEFINE1(fchdir, unsigned int, fd) 580 { 581 struct fd f = fdget_raw(fd); 582 int error; 583 584 error = -EBADF; 585 if (!f.file) 586 goto out; 587 588 error = -ENOTDIR; 589 if (!d_can_lookup(f.file->f_path.dentry)) 590 goto out_putf; 591 592 error = file_permission(f.file, MAY_EXEC | MAY_CHDIR); 593 if (!error) 594 set_fs_pwd(current->fs, &f.file->f_path); 595 out_putf: 596 fdput(f); 597 out: 598 return error; 599 } 600 601 SYSCALL_DEFINE1(chroot, const char __user *, filename) 602 { 603 struct path path; 604 int error; 605 unsigned int lookup_flags = LOOKUP_FOLLOW | LOOKUP_DIRECTORY; 606 retry: 607 error = user_path_at(AT_FDCWD, filename, lookup_flags, &path); 608 if (error) 609 goto out; 610 611 error = path_permission(&path, MAY_EXEC | MAY_CHDIR); 612 if (error) 613 goto dput_and_out; 614 615 error = -EPERM; 616 if (!ns_capable(current_user_ns(), CAP_SYS_CHROOT)) 617 goto dput_and_out; 618 error = security_path_chroot(&path); 619 if (error) 620 goto dput_and_out; 621 622 set_fs_root(current->fs, &path); 623 error = 0; 624 dput_and_out: 625 path_put(&path); 626 if (retry_estale(error, lookup_flags)) { 627 lookup_flags |= LOOKUP_REVAL; 628 goto retry; 629 } 630 out: 631 return error; 632 } 633 634 int chmod_common(const struct path *path, umode_t mode) 635 { 636 struct inode *inode = path->dentry->d_inode; 637 struct inode *delegated_inode = NULL; 638 struct iattr newattrs; 639 int error; 640 641 error = mnt_want_write(path->mnt); 642 if (error) 643 return error; 644 retry_deleg: 645 inode_lock(inode); 646 error = security_path_chmod(path, mode); 647 if (error) 648 goto out_unlock; 649 newattrs.ia_mode = (mode & S_IALLUGO) | (inode->i_mode & ~S_IALLUGO); 650 newattrs.ia_valid = ATTR_MODE | ATTR_CTIME; 651 error = notify_change(mnt_idmap(path->mnt), path->dentry, 652 &newattrs, &delegated_inode); 653 out_unlock: 654 inode_unlock(inode); 655 if (delegated_inode) { 656 error = break_deleg_wait(&delegated_inode); 657 if (!error) 658 goto retry_deleg; 659 } 660 mnt_drop_write(path->mnt); 661 return error; 662 } 663 664 int vfs_fchmod(struct file *file, umode_t mode) 665 { 666 audit_file(file); 667 return chmod_common(&file->f_path, mode); 668 } 669 670 SYSCALL_DEFINE2(fchmod, unsigned int, fd, umode_t, mode) 671 { 672 struct fd f = fdget(fd); 673 int err = -EBADF; 674 675 if (f.file) { 676 err = vfs_fchmod(f.file, mode); 677 fdput(f); 678 } 679 return err; 680 } 681 682 static int do_fchmodat(int dfd, const char __user *filename, umode_t mode, 683 unsigned int flags) 684 { 685 struct path path; 686 int error; 687 unsigned int lookup_flags; 688 689 if (unlikely(flags & ~(AT_SYMLINK_NOFOLLOW | AT_EMPTY_PATH))) 690 return -EINVAL; 691 692 lookup_flags = (flags & AT_SYMLINK_NOFOLLOW) ? 0 : LOOKUP_FOLLOW; 693 if (flags & AT_EMPTY_PATH) 694 lookup_flags |= LOOKUP_EMPTY; 695 696 retry: 697 error = user_path_at(dfd, filename, lookup_flags, &path); 698 if (!error) { 699 error = chmod_common(&path, mode); 700 path_put(&path); 701 if (retry_estale(error, lookup_flags)) { 702 lookup_flags |= LOOKUP_REVAL; 703 goto retry; 704 } 705 } 706 return error; 707 } 708 709 SYSCALL_DEFINE4(fchmodat2, int, dfd, const char __user *, filename, 710 umode_t, mode, unsigned int, flags) 711 { 712 return do_fchmodat(dfd, filename, mode, flags); 713 } 714 715 SYSCALL_DEFINE3(fchmodat, int, dfd, const char __user *, filename, 716 umode_t, mode) 717 { 718 return do_fchmodat(dfd, filename, mode, 0); 719 } 720 721 SYSCALL_DEFINE2(chmod, const char __user *, filename, umode_t, mode) 722 { 723 return do_fchmodat(AT_FDCWD, filename, mode, 0); 724 } 725 726 /* 727 * Check whether @kuid is valid and if so generate and set vfsuid_t in 728 * ia_vfsuid. 729 * 730 * Return: true if @kuid is valid, false if not. 731 */ 732 static inline bool setattr_vfsuid(struct iattr *attr, kuid_t kuid) 733 { 734 if (!uid_valid(kuid)) 735 return false; 736 attr->ia_valid |= ATTR_UID; 737 attr->ia_vfsuid = VFSUIDT_INIT(kuid); 738 return true; 739 } 740 741 /* 742 * Check whether @kgid is valid and if so generate and set vfsgid_t in 743 * ia_vfsgid. 744 * 745 * Return: true if @kgid is valid, false if not. 746 */ 747 static inline bool setattr_vfsgid(struct iattr *attr, kgid_t kgid) 748 { 749 if (!gid_valid(kgid)) 750 return false; 751 attr->ia_valid |= ATTR_GID; 752 attr->ia_vfsgid = VFSGIDT_INIT(kgid); 753 return true; 754 } 755 756 int chown_common(const struct path *path, uid_t user, gid_t group) 757 { 758 struct mnt_idmap *idmap; 759 struct user_namespace *fs_userns; 760 struct inode *inode = path->dentry->d_inode; 761 struct inode *delegated_inode = NULL; 762 int error; 763 struct iattr newattrs; 764 kuid_t uid; 765 kgid_t gid; 766 767 uid = make_kuid(current_user_ns(), user); 768 gid = make_kgid(current_user_ns(), group); 769 770 idmap = mnt_idmap(path->mnt); 771 fs_userns = i_user_ns(inode); 772 773 retry_deleg: 774 newattrs.ia_vfsuid = INVALID_VFSUID; 775 newattrs.ia_vfsgid = INVALID_VFSGID; 776 newattrs.ia_valid = ATTR_CTIME; 777 if ((user != (uid_t)-1) && !setattr_vfsuid(&newattrs, uid)) 778 return -EINVAL; 779 if ((group != (gid_t)-1) && !setattr_vfsgid(&newattrs, gid)) 780 return -EINVAL; 781 inode_lock(inode); 782 if (!S_ISDIR(inode->i_mode)) 783 newattrs.ia_valid |= ATTR_KILL_SUID | ATTR_KILL_PRIV | 784 setattr_should_drop_sgid(idmap, inode); 785 /* Continue to send actual fs values, not the mount values. */ 786 error = security_path_chown( 787 path, 788 from_vfsuid(idmap, fs_userns, newattrs.ia_vfsuid), 789 from_vfsgid(idmap, fs_userns, newattrs.ia_vfsgid)); 790 if (!error) 791 error = notify_change(idmap, path->dentry, &newattrs, 792 &delegated_inode); 793 inode_unlock(inode); 794 if (delegated_inode) { 795 error = break_deleg_wait(&delegated_inode); 796 if (!error) 797 goto retry_deleg; 798 } 799 return error; 800 } 801 802 int do_fchownat(int dfd, const char __user *filename, uid_t user, gid_t group, 803 int flag) 804 { 805 struct path path; 806 int error = -EINVAL; 807 int lookup_flags; 808 809 if ((flag & ~(AT_SYMLINK_NOFOLLOW | AT_EMPTY_PATH)) != 0) 810 goto out; 811 812 lookup_flags = (flag & AT_SYMLINK_NOFOLLOW) ? 0 : LOOKUP_FOLLOW; 813 if (flag & AT_EMPTY_PATH) 814 lookup_flags |= LOOKUP_EMPTY; 815 retry: 816 error = user_path_at(dfd, filename, lookup_flags, &path); 817 if (error) 818 goto out; 819 error = mnt_want_write(path.mnt); 820 if (error) 821 goto out_release; 822 error = chown_common(&path, user, group); 823 mnt_drop_write(path.mnt); 824 out_release: 825 path_put(&path); 826 if (retry_estale(error, lookup_flags)) { 827 lookup_flags |= LOOKUP_REVAL; 828 goto retry; 829 } 830 out: 831 return error; 832 } 833 834 SYSCALL_DEFINE5(fchownat, int, dfd, const char __user *, filename, uid_t, user, 835 gid_t, group, int, flag) 836 { 837 return do_fchownat(dfd, filename, user, group, flag); 838 } 839 840 SYSCALL_DEFINE3(chown, const char __user *, filename, uid_t, user, gid_t, group) 841 { 842 return do_fchownat(AT_FDCWD, filename, user, group, 0); 843 } 844 845 SYSCALL_DEFINE3(lchown, const char __user *, filename, uid_t, user, gid_t, group) 846 { 847 return do_fchownat(AT_FDCWD, filename, user, group, 848 AT_SYMLINK_NOFOLLOW); 849 } 850 851 int vfs_fchown(struct file *file, uid_t user, gid_t group) 852 { 853 int error; 854 855 error = mnt_want_write_file(file); 856 if (error) 857 return error; 858 audit_file(file); 859 error = chown_common(&file->f_path, user, group); 860 mnt_drop_write_file(file); 861 return error; 862 } 863 864 int ksys_fchown(unsigned int fd, uid_t user, gid_t group) 865 { 866 struct fd f = fdget(fd); 867 int error = -EBADF; 868 869 if (f.file) { 870 error = vfs_fchown(f.file, user, group); 871 fdput(f); 872 } 873 return error; 874 } 875 876 SYSCALL_DEFINE3(fchown, unsigned int, fd, uid_t, user, gid_t, group) 877 { 878 return ksys_fchown(fd, user, group); 879 } 880 881 static inline int file_get_write_access(struct file *f) 882 { 883 int error; 884 885 error = get_write_access(f->f_inode); 886 if (unlikely(error)) 887 return error; 888 error = mnt_get_write_access(f->f_path.mnt); 889 if (unlikely(error)) 890 goto cleanup_inode; 891 if (unlikely(f->f_mode & FMODE_BACKING)) { 892 error = mnt_get_write_access(backing_file_user_path(f)->mnt); 893 if (unlikely(error)) 894 goto cleanup_mnt; 895 } 896 return 0; 897 898 cleanup_mnt: 899 mnt_put_write_access(f->f_path.mnt); 900 cleanup_inode: 901 put_write_access(f->f_inode); 902 return error; 903 } 904 905 static int do_dentry_open(struct file *f, 906 struct inode *inode, 907 int (*open)(struct inode *, struct file *)) 908 { 909 static const struct file_operations empty_fops = {}; 910 int error; 911 912 path_get(&f->f_path); 913 f->f_inode = inode; 914 f->f_mapping = inode->i_mapping; 915 f->f_wb_err = filemap_sample_wb_err(f->f_mapping); 916 f->f_sb_err = file_sample_sb_err(f); 917 918 if (unlikely(f->f_flags & O_PATH)) { 919 f->f_mode = FMODE_PATH | FMODE_OPENED; 920 f->f_op = &empty_fops; 921 return 0; 922 } 923 924 if ((f->f_mode & (FMODE_READ | FMODE_WRITE)) == FMODE_READ) { 925 i_readcount_inc(inode); 926 } else if (f->f_mode & FMODE_WRITE && !special_file(inode->i_mode)) { 927 error = file_get_write_access(f); 928 if (unlikely(error)) 929 goto cleanup_file; 930 f->f_mode |= FMODE_WRITER; 931 } 932 933 /* POSIX.1-2008/SUSv4 Section XSI 2.9.7 */ 934 if (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode)) 935 f->f_mode |= FMODE_ATOMIC_POS; 936 937 f->f_op = fops_get(inode->i_fop); 938 if (WARN_ON(!f->f_op)) { 939 error = -ENODEV; 940 goto cleanup_all; 941 } 942 943 error = security_file_open(f); 944 if (error) 945 goto cleanup_all; 946 947 error = break_lease(file_inode(f), f->f_flags); 948 if (error) 949 goto cleanup_all; 950 951 /* normally all 3 are set; ->open() can clear them if needed */ 952 f->f_mode |= FMODE_LSEEK | FMODE_PREAD | FMODE_PWRITE; 953 if (!open) 954 open = f->f_op->open; 955 if (open) { 956 error = open(inode, f); 957 if (error) 958 goto cleanup_all; 959 } 960 f->f_mode |= FMODE_OPENED; 961 if ((f->f_mode & FMODE_READ) && 962 likely(f->f_op->read || f->f_op->read_iter)) 963 f->f_mode |= FMODE_CAN_READ; 964 if ((f->f_mode & FMODE_WRITE) && 965 likely(f->f_op->write || f->f_op->write_iter)) 966 f->f_mode |= FMODE_CAN_WRITE; 967 if ((f->f_mode & FMODE_LSEEK) && !f->f_op->llseek) 968 f->f_mode &= ~FMODE_LSEEK; 969 if (f->f_mapping->a_ops && f->f_mapping->a_ops->direct_IO) 970 f->f_mode |= FMODE_CAN_ODIRECT; 971 972 f->f_flags &= ~(O_CREAT | O_EXCL | O_NOCTTY | O_TRUNC); 973 f->f_iocb_flags = iocb_flags(f); 974 975 file_ra_state_init(&f->f_ra, f->f_mapping->host->i_mapping); 976 977 if ((f->f_flags & O_DIRECT) && !(f->f_mode & FMODE_CAN_ODIRECT)) 978 return -EINVAL; 979 980 /* 981 * XXX: Huge page cache doesn't support writing yet. Drop all page 982 * cache for this file before processing writes. 983 */ 984 if (f->f_mode & FMODE_WRITE) { 985 /* 986 * Paired with smp_mb() in collapse_file() to ensure nr_thps 987 * is up to date and the update to i_writecount by 988 * get_write_access() is visible. Ensures subsequent insertion 989 * of THPs into the page cache will fail. 990 */ 991 smp_mb(); 992 if (filemap_nr_thps(inode->i_mapping)) { 993 struct address_space *mapping = inode->i_mapping; 994 995 filemap_invalidate_lock(inode->i_mapping); 996 /* 997 * unmap_mapping_range just need to be called once 998 * here, because the private pages is not need to be 999 * unmapped mapping (e.g. data segment of dynamic 1000 * shared libraries here). 1001 */ 1002 unmap_mapping_range(mapping, 0, 0, 0); 1003 truncate_inode_pages(mapping, 0); 1004 filemap_invalidate_unlock(inode->i_mapping); 1005 } 1006 } 1007 1008 /* 1009 * Once we return a file with FMODE_OPENED, __fput() will call 1010 * fsnotify_close(), so we need fsnotify_open() here for symmetry. 1011 */ 1012 fsnotify_open(f); 1013 return 0; 1014 1015 cleanup_all: 1016 if (WARN_ON_ONCE(error > 0)) 1017 error = -EINVAL; 1018 fops_put(f->f_op); 1019 put_file_access(f); 1020 cleanup_file: 1021 path_put(&f->f_path); 1022 f->f_path.mnt = NULL; 1023 f->f_path.dentry = NULL; 1024 f->f_inode = NULL; 1025 return error; 1026 } 1027 1028 /** 1029 * finish_open - finish opening a file 1030 * @file: file pointer 1031 * @dentry: pointer to dentry 1032 * @open: open callback 1033 * 1034 * This can be used to finish opening a file passed to i_op->atomic_open(). 1035 * 1036 * If the open callback is set to NULL, then the standard f_op->open() 1037 * filesystem callback is substituted. 1038 * 1039 * NB: the dentry reference is _not_ consumed. If, for example, the dentry is 1040 * the return value of d_splice_alias(), then the caller needs to perform dput() 1041 * on it after finish_open(). 1042 * 1043 * Returns zero on success or -errno if the open failed. 1044 */ 1045 int finish_open(struct file *file, struct dentry *dentry, 1046 int (*open)(struct inode *, struct file *)) 1047 { 1048 BUG_ON(file->f_mode & FMODE_OPENED); /* once it's opened, it's opened */ 1049 1050 file->f_path.dentry = dentry; 1051 return do_dentry_open(file, d_backing_inode(dentry), open); 1052 } 1053 EXPORT_SYMBOL(finish_open); 1054 1055 /** 1056 * finish_no_open - finish ->atomic_open() without opening the file 1057 * 1058 * @file: file pointer 1059 * @dentry: dentry or NULL (as returned from ->lookup()) 1060 * 1061 * This can be used to set the result of a successful lookup in ->atomic_open(). 1062 * 1063 * NB: unlike finish_open() this function does consume the dentry reference and 1064 * the caller need not dput() it. 1065 * 1066 * Returns "0" which must be the return value of ->atomic_open() after having 1067 * called this function. 1068 */ 1069 int finish_no_open(struct file *file, struct dentry *dentry) 1070 { 1071 file->f_path.dentry = dentry; 1072 return 0; 1073 } 1074 EXPORT_SYMBOL(finish_no_open); 1075 1076 char *file_path(struct file *filp, char *buf, int buflen) 1077 { 1078 return d_path(&filp->f_path, buf, buflen); 1079 } 1080 EXPORT_SYMBOL(file_path); 1081 1082 /** 1083 * vfs_open - open the file at the given path 1084 * @path: path to open 1085 * @file: newly allocated file with f_flag initialized 1086 */ 1087 int vfs_open(const struct path *path, struct file *file) 1088 { 1089 file->f_path = *path; 1090 return do_dentry_open(file, d_backing_inode(path->dentry), NULL); 1091 } 1092 1093 struct file *dentry_open(const struct path *path, int flags, 1094 const struct cred *cred) 1095 { 1096 int error; 1097 struct file *f; 1098 1099 /* We must always pass in a valid mount pointer. */ 1100 BUG_ON(!path->mnt); 1101 1102 f = alloc_empty_file(flags, cred); 1103 if (!IS_ERR(f)) { 1104 error = vfs_open(path, f); 1105 if (error) { 1106 fput(f); 1107 f = ERR_PTR(error); 1108 } 1109 } 1110 return f; 1111 } 1112 EXPORT_SYMBOL(dentry_open); 1113 1114 /** 1115 * dentry_create - Create and open a file 1116 * @path: path to create 1117 * @flags: O_ flags 1118 * @mode: mode bits for new file 1119 * @cred: credentials to use 1120 * 1121 * Caller must hold the parent directory's lock, and have prepared 1122 * a negative dentry, placed in @path->dentry, for the new file. 1123 * 1124 * Caller sets @path->mnt to the vfsmount of the filesystem where 1125 * the new file is to be created. The parent directory and the 1126 * negative dentry must reside on the same filesystem instance. 1127 * 1128 * On success, returns a "struct file *". Otherwise a ERR_PTR 1129 * is returned. 1130 */ 1131 struct file *dentry_create(const struct path *path, int flags, umode_t mode, 1132 const struct cred *cred) 1133 { 1134 struct file *f; 1135 int error; 1136 1137 f = alloc_empty_file(flags, cred); 1138 if (IS_ERR(f)) 1139 return f; 1140 1141 error = vfs_create(mnt_idmap(path->mnt), 1142 d_inode(path->dentry->d_parent), 1143 path->dentry, mode, true); 1144 if (!error) 1145 error = vfs_open(path, f); 1146 1147 if (unlikely(error)) { 1148 fput(f); 1149 return ERR_PTR(error); 1150 } 1151 return f; 1152 } 1153 EXPORT_SYMBOL(dentry_create); 1154 1155 /** 1156 * kernel_file_open - open a file for kernel internal use 1157 * @path: path of the file to open 1158 * @flags: open flags 1159 * @inode: the inode 1160 * @cred: credentials for open 1161 * 1162 * Open a file for use by in-kernel consumers. The file is not accounted 1163 * against nr_files and must not be installed into the file descriptor 1164 * table. 1165 * 1166 * Return: Opened file on success, an error pointer on failure. 1167 */ 1168 struct file *kernel_file_open(const struct path *path, int flags, 1169 struct inode *inode, const struct cred *cred) 1170 { 1171 struct file *f; 1172 int error; 1173 1174 f = alloc_empty_file_noaccount(flags, cred); 1175 if (IS_ERR(f)) 1176 return f; 1177 1178 f->f_path = *path; 1179 error = do_dentry_open(f, inode, NULL); 1180 if (error) { 1181 fput(f); 1182 f = ERR_PTR(error); 1183 } 1184 return f; 1185 } 1186 EXPORT_SYMBOL_GPL(kernel_file_open); 1187 1188 #define WILL_CREATE(flags) (flags & (O_CREAT | __O_TMPFILE)) 1189 #define O_PATH_FLAGS (O_DIRECTORY | O_NOFOLLOW | O_PATH | O_CLOEXEC) 1190 1191 inline struct open_how build_open_how(int flags, umode_t mode) 1192 { 1193 struct open_how how = { 1194 .flags = flags & VALID_OPEN_FLAGS, 1195 .mode = mode & S_IALLUGO, 1196 }; 1197 1198 /* O_PATH beats everything else. */ 1199 if (how.flags & O_PATH) 1200 how.flags &= O_PATH_FLAGS; 1201 /* Modes should only be set for create-like flags. */ 1202 if (!WILL_CREATE(how.flags)) 1203 how.mode = 0; 1204 return how; 1205 } 1206 1207 inline int build_open_flags(const struct open_how *how, struct open_flags *op) 1208 { 1209 u64 flags = how->flags; 1210 u64 strip = __FMODE_NONOTIFY | O_CLOEXEC; 1211 int lookup_flags = 0; 1212 int acc_mode = ACC_MODE(flags); 1213 1214 BUILD_BUG_ON_MSG(upper_32_bits(VALID_OPEN_FLAGS), 1215 "struct open_flags doesn't yet handle flags > 32 bits"); 1216 1217 /* 1218 * Strip flags that either shouldn't be set by userspace like 1219 * FMODE_NONOTIFY or that aren't relevant in determining struct 1220 * open_flags like O_CLOEXEC. 1221 */ 1222 flags &= ~strip; 1223 1224 /* 1225 * Older syscalls implicitly clear all of the invalid flags or argument 1226 * values before calling build_open_flags(), but openat2(2) checks all 1227 * of its arguments. 1228 */ 1229 if (flags & ~VALID_OPEN_FLAGS) 1230 return -EINVAL; 1231 if (how->resolve & ~VALID_RESOLVE_FLAGS) 1232 return -EINVAL; 1233 1234 /* Scoping flags are mutually exclusive. */ 1235 if ((how->resolve & RESOLVE_BENEATH) && (how->resolve & RESOLVE_IN_ROOT)) 1236 return -EINVAL; 1237 1238 /* Deal with the mode. */ 1239 if (WILL_CREATE(flags)) { 1240 if (how->mode & ~S_IALLUGO) 1241 return -EINVAL; 1242 op->mode = how->mode | S_IFREG; 1243 } else { 1244 if (how->mode != 0) 1245 return -EINVAL; 1246 op->mode = 0; 1247 } 1248 1249 /* 1250 * Block bugs where O_DIRECTORY | O_CREAT created regular files. 1251 * Note, that blocking O_DIRECTORY | O_CREAT here also protects 1252 * O_TMPFILE below which requires O_DIRECTORY being raised. 1253 */ 1254 if ((flags & (O_DIRECTORY | O_CREAT)) == (O_DIRECTORY | O_CREAT)) 1255 return -EINVAL; 1256 1257 /* Now handle the creative implementation of O_TMPFILE. */ 1258 if (flags & __O_TMPFILE) { 1259 /* 1260 * In order to ensure programs get explicit errors when trying 1261 * to use O_TMPFILE on old kernels we enforce that O_DIRECTORY 1262 * is raised alongside __O_TMPFILE. 1263 */ 1264 if (!(flags & O_DIRECTORY)) 1265 return -EINVAL; 1266 if (!(acc_mode & MAY_WRITE)) 1267 return -EINVAL; 1268 } 1269 if (flags & O_PATH) { 1270 /* O_PATH only permits certain other flags to be set. */ 1271 if (flags & ~O_PATH_FLAGS) 1272 return -EINVAL; 1273 acc_mode = 0; 1274 } 1275 1276 /* 1277 * O_SYNC is implemented as __O_SYNC|O_DSYNC. As many places only 1278 * check for O_DSYNC if the need any syncing at all we enforce it's 1279 * always set instead of having to deal with possibly weird behaviour 1280 * for malicious applications setting only __O_SYNC. 1281 */ 1282 if (flags & __O_SYNC) 1283 flags |= O_DSYNC; 1284 1285 op->open_flag = flags; 1286 1287 /* O_TRUNC implies we need access checks for write permissions */ 1288 if (flags & O_TRUNC) 1289 acc_mode |= MAY_WRITE; 1290 1291 /* Allow the LSM permission hook to distinguish append 1292 access from general write access. */ 1293 if (flags & O_APPEND) 1294 acc_mode |= MAY_APPEND; 1295 1296 op->acc_mode = acc_mode; 1297 1298 op->intent = flags & O_PATH ? 0 : LOOKUP_OPEN; 1299 1300 if (flags & O_CREAT) { 1301 op->intent |= LOOKUP_CREATE; 1302 if (flags & O_EXCL) { 1303 op->intent |= LOOKUP_EXCL; 1304 flags |= O_NOFOLLOW; 1305 } 1306 } 1307 1308 if (flags & O_DIRECTORY) 1309 lookup_flags |= LOOKUP_DIRECTORY; 1310 if (!(flags & O_NOFOLLOW)) 1311 lookup_flags |= LOOKUP_FOLLOW; 1312 1313 if (how->resolve & RESOLVE_NO_XDEV) 1314 lookup_flags |= LOOKUP_NO_XDEV; 1315 if (how->resolve & RESOLVE_NO_MAGICLINKS) 1316 lookup_flags |= LOOKUP_NO_MAGICLINKS; 1317 if (how->resolve & RESOLVE_NO_SYMLINKS) 1318 lookup_flags |= LOOKUP_NO_SYMLINKS; 1319 if (how->resolve & RESOLVE_BENEATH) 1320 lookup_flags |= LOOKUP_BENEATH; 1321 if (how->resolve & RESOLVE_IN_ROOT) 1322 lookup_flags |= LOOKUP_IN_ROOT; 1323 if (how->resolve & RESOLVE_CACHED) { 1324 /* Don't bother even trying for create/truncate/tmpfile open */ 1325 if (flags & (O_TRUNC | O_CREAT | __O_TMPFILE)) 1326 return -EAGAIN; 1327 lookup_flags |= LOOKUP_CACHED; 1328 } 1329 1330 op->lookup_flags = lookup_flags; 1331 return 0; 1332 } 1333 1334 /** 1335 * file_open_name - open file and return file pointer 1336 * 1337 * @name: struct filename containing path to open 1338 * @flags: open flags as per the open(2) second argument 1339 * @mode: mode for the new file if O_CREAT is set, else ignored 1340 * 1341 * This is the helper to open a file from kernelspace if you really 1342 * have to. But in generally you should not do this, so please move 1343 * along, nothing to see here.. 1344 */ 1345 struct file *file_open_name(struct filename *name, int flags, umode_t mode) 1346 { 1347 struct open_flags op; 1348 struct open_how how = build_open_how(flags, mode); 1349 int err = build_open_flags(&how, &op); 1350 if (err) 1351 return ERR_PTR(err); 1352 return do_filp_open(AT_FDCWD, name, &op); 1353 } 1354 1355 /** 1356 * filp_open - open file and return file pointer 1357 * 1358 * @filename: path to open 1359 * @flags: open flags as per the open(2) second argument 1360 * @mode: mode for the new file if O_CREAT is set, else ignored 1361 * 1362 * This is the helper to open a file from kernelspace if you really 1363 * have to. But in generally you should not do this, so please move 1364 * along, nothing to see here.. 1365 */ 1366 struct file *filp_open(const char *filename, int flags, umode_t mode) 1367 { 1368 struct filename *name = getname_kernel(filename); 1369 struct file *file = ERR_CAST(name); 1370 1371 if (!IS_ERR(name)) { 1372 file = file_open_name(name, flags, mode); 1373 putname(name); 1374 } 1375 return file; 1376 } 1377 EXPORT_SYMBOL(filp_open); 1378 1379 struct file *file_open_root(const struct path *root, 1380 const char *filename, int flags, umode_t mode) 1381 { 1382 struct open_flags op; 1383 struct open_how how = build_open_how(flags, mode); 1384 int err = build_open_flags(&how, &op); 1385 if (err) 1386 return ERR_PTR(err); 1387 return do_file_open_root(root, filename, &op); 1388 } 1389 EXPORT_SYMBOL(file_open_root); 1390 1391 static long do_sys_openat2(int dfd, const char __user *filename, 1392 struct open_how *how) 1393 { 1394 struct open_flags op; 1395 int fd = build_open_flags(how, &op); 1396 struct filename *tmp; 1397 1398 if (fd) 1399 return fd; 1400 1401 tmp = getname(filename); 1402 if (IS_ERR(tmp)) 1403 return PTR_ERR(tmp); 1404 1405 fd = get_unused_fd_flags(how->flags); 1406 if (fd >= 0) { 1407 struct file *f = do_filp_open(dfd, tmp, &op); 1408 if (IS_ERR(f)) { 1409 put_unused_fd(fd); 1410 fd = PTR_ERR(f); 1411 } else { 1412 fd_install(fd, f); 1413 } 1414 } 1415 putname(tmp); 1416 return fd; 1417 } 1418 1419 long do_sys_open(int dfd, const char __user *filename, int flags, umode_t mode) 1420 { 1421 struct open_how how = build_open_how(flags, mode); 1422 return do_sys_openat2(dfd, filename, &how); 1423 } 1424 1425 1426 SYSCALL_DEFINE3(open, const char __user *, filename, int, flags, umode_t, mode) 1427 { 1428 if (force_o_largefile()) 1429 flags |= O_LARGEFILE; 1430 return do_sys_open(AT_FDCWD, filename, flags, mode); 1431 } 1432 1433 SYSCALL_DEFINE4(openat, int, dfd, const char __user *, filename, int, flags, 1434 umode_t, mode) 1435 { 1436 if (force_o_largefile()) 1437 flags |= O_LARGEFILE; 1438 return do_sys_open(dfd, filename, flags, mode); 1439 } 1440 1441 SYSCALL_DEFINE4(openat2, int, dfd, const char __user *, filename, 1442 struct open_how __user *, how, size_t, usize) 1443 { 1444 int err; 1445 struct open_how tmp; 1446 1447 BUILD_BUG_ON(sizeof(struct open_how) < OPEN_HOW_SIZE_VER0); 1448 BUILD_BUG_ON(sizeof(struct open_how) != OPEN_HOW_SIZE_LATEST); 1449 1450 if (unlikely(usize < OPEN_HOW_SIZE_VER0)) 1451 return -EINVAL; 1452 1453 err = copy_struct_from_user(&tmp, sizeof(tmp), how, usize); 1454 if (err) 1455 return err; 1456 1457 audit_openat2_how(&tmp); 1458 1459 /* O_LARGEFILE is only allowed for non-O_PATH. */ 1460 if (!(tmp.flags & O_PATH) && force_o_largefile()) 1461 tmp.flags |= O_LARGEFILE; 1462 1463 return do_sys_openat2(dfd, filename, &tmp); 1464 } 1465 1466 #ifdef CONFIG_COMPAT 1467 /* 1468 * Exactly like sys_open(), except that it doesn't set the 1469 * O_LARGEFILE flag. 1470 */ 1471 COMPAT_SYSCALL_DEFINE3(open, const char __user *, filename, int, flags, umode_t, mode) 1472 { 1473 return do_sys_open(AT_FDCWD, filename, flags, mode); 1474 } 1475 1476 /* 1477 * Exactly like sys_openat(), except that it doesn't set the 1478 * O_LARGEFILE flag. 1479 */ 1480 COMPAT_SYSCALL_DEFINE4(openat, int, dfd, const char __user *, filename, int, flags, umode_t, mode) 1481 { 1482 return do_sys_open(dfd, filename, flags, mode); 1483 } 1484 #endif 1485 1486 #ifndef __alpha__ 1487 1488 /* 1489 * For backward compatibility? Maybe this should be moved 1490 * into arch/i386 instead? 1491 */ 1492 SYSCALL_DEFINE2(creat, const char __user *, pathname, umode_t, mode) 1493 { 1494 int flags = O_CREAT | O_WRONLY | O_TRUNC; 1495 1496 if (force_o_largefile()) 1497 flags |= O_LARGEFILE; 1498 return do_sys_open(AT_FDCWD, pathname, flags, mode); 1499 } 1500 #endif 1501 1502 /* 1503 * "id" is the POSIX thread ID. We use the 1504 * files pointer for this.. 1505 */ 1506 static int filp_flush(struct file *filp, fl_owner_t id) 1507 { 1508 int retval = 0; 1509 1510 if (CHECK_DATA_CORRUPTION(file_count(filp) == 0, 1511 "VFS: Close: file count is 0 (f_op=%ps)", 1512 filp->f_op)) { 1513 return 0; 1514 } 1515 1516 if (filp->f_op->flush) 1517 retval = filp->f_op->flush(filp, id); 1518 1519 if (likely(!(filp->f_mode & FMODE_PATH))) { 1520 dnotify_flush(filp, id); 1521 locks_remove_posix(filp, id); 1522 } 1523 return retval; 1524 } 1525 1526 int filp_close(struct file *filp, fl_owner_t id) 1527 { 1528 int retval; 1529 1530 retval = filp_flush(filp, id); 1531 fput(filp); 1532 1533 return retval; 1534 } 1535 EXPORT_SYMBOL(filp_close); 1536 1537 /* 1538 * Careful here! We test whether the file pointer is NULL before 1539 * releasing the fd. This ensures that one clone task can't release 1540 * an fd while another clone is opening it. 1541 */ 1542 SYSCALL_DEFINE1(close, unsigned int, fd) 1543 { 1544 int retval; 1545 struct file *file; 1546 1547 file = file_close_fd(fd); 1548 if (!file) 1549 return -EBADF; 1550 1551 retval = filp_flush(file, current->files); 1552 1553 /* 1554 * We're returning to user space. Don't bother 1555 * with any delayed fput() cases. 1556 */ 1557 __fput_sync(file); 1558 1559 /* can't restart close syscall because file table entry was cleared */ 1560 if (unlikely(retval == -ERESTARTSYS || 1561 retval == -ERESTARTNOINTR || 1562 retval == -ERESTARTNOHAND || 1563 retval == -ERESTART_RESTARTBLOCK)) 1564 retval = -EINTR; 1565 1566 return retval; 1567 } 1568 1569 /** 1570 * sys_close_range() - Close all file descriptors in a given range. 1571 * 1572 * @fd: starting file descriptor to close 1573 * @max_fd: last file descriptor to close 1574 * @flags: reserved for future extensions 1575 * 1576 * This closes a range of file descriptors. All file descriptors 1577 * from @fd up to and including @max_fd are closed. 1578 * Currently, errors to close a given file descriptor are ignored. 1579 */ 1580 SYSCALL_DEFINE3(close_range, unsigned int, fd, unsigned int, max_fd, 1581 unsigned int, flags) 1582 { 1583 return __close_range(fd, max_fd, flags); 1584 } 1585 1586 /* 1587 * This routine simulates a hangup on the tty, to arrange that users 1588 * are given clean terminals at login time. 1589 */ 1590 SYSCALL_DEFINE0(vhangup) 1591 { 1592 if (capable(CAP_SYS_TTY_CONFIG)) { 1593 tty_vhangup_self(); 1594 return 0; 1595 } 1596 return -EPERM; 1597 } 1598 1599 /* 1600 * Called when an inode is about to be open. 1601 * We use this to disallow opening large files on 32bit systems if 1602 * the caller didn't specify O_LARGEFILE. On 64bit systems we force 1603 * on this flag in sys_open. 1604 */ 1605 int generic_file_open(struct inode * inode, struct file * filp) 1606 { 1607 if (!(filp->f_flags & O_LARGEFILE) && i_size_read(inode) > MAX_NON_LFS) 1608 return -EOVERFLOW; 1609 return 0; 1610 } 1611 1612 EXPORT_SYMBOL(generic_file_open); 1613 1614 /* 1615 * This is used by subsystems that don't want seekable 1616 * file descriptors. The function is not supposed to ever fail, the only 1617 * reason it returns an 'int' and not 'void' is so that it can be plugged 1618 * directly into file_operations structure. 1619 */ 1620 int nonseekable_open(struct inode *inode, struct file *filp) 1621 { 1622 filp->f_mode &= ~(FMODE_LSEEK | FMODE_PREAD | FMODE_PWRITE); 1623 return 0; 1624 } 1625 1626 EXPORT_SYMBOL(nonseekable_open); 1627 1628 /* 1629 * stream_open is used by subsystems that want stream-like file descriptors. 1630 * Such file descriptors are not seekable and don't have notion of position 1631 * (file.f_pos is always 0 and ppos passed to .read()/.write() is always NULL). 1632 * Contrary to file descriptors of other regular files, .read() and .write() 1633 * can run simultaneously. 1634 * 1635 * stream_open never fails and is marked to return int so that it could be 1636 * directly used as file_operations.open . 1637 */ 1638 int stream_open(struct inode *inode, struct file *filp) 1639 { 1640 filp->f_mode &= ~(FMODE_LSEEK | FMODE_PREAD | FMODE_PWRITE | FMODE_ATOMIC_POS); 1641 filp->f_mode |= FMODE_STREAM; 1642 return 0; 1643 } 1644 1645 EXPORT_SYMBOL(stream_open); 1646