// SPDX-License-Identifier: GPL-2.0-only
/*
 *  linux/fs/open.c
 *
 *  Copyright (C) 1991, 1992  Linus Torvalds
 */

#include <linux/string.h>
#include <linux/mm.h>
#include <linux/file.h>
#include <linux/fdtable.h>
#include <linux/fsnotify.h>
#include <linux/module.h>
#include <linux/tty.h>
#include <linux/namei.h>
#include <linux/backing-dev.h>
#include <linux/capability.h>
#include <linux/securebits.h>
#include <linux/security.h>
#include <linux/mount.h>
#include <linux/fcntl.h>
#include <linux/slab.h>
#include <linux/uaccess.h>
#include <linux/fs.h>
#include <linux/personality.h>
#include <linux/pagemap.h>
#include <linux/syscalls.h>
#include <linux/rcupdate.h>
#include <linux/audit.h>
#include <linux/falloc.h>
#include <linux/fs_struct.h>
#include <linux/dnotify.h>
#include <linux/compat.h>
#include <linux/mnt_idmapping.h>
#include <linux/filelock.h>

#include "internal.h"

/**
 * do_truncate - change the size of the inode behind @dentry
 * @idmap:	idmap handed through to dentry_needs_remove_privs() and
 *		notify_change()
 * @dentry:	dentry whose inode is resized
 * @length:	new size; negative values are rejected
 * @time_attrs:	extra ATTR_* bits OR-ed into the attribute mask next to
 *		ATTR_SIZE
 * @filp:	optional open file; when non-NULL it is stored in ia_file and
 *		ATTR_FILE is set
 *
 * Builds a struct iattr and applies it with notify_change() while holding
 * the inode lock (taken with inode_lock_killable()).
 *
 * Return: -EINVAL for a negative @length, an error from
 * dentry_needs_remove_privs() or inode_lock_killable(), otherwise the
 * result of notify_change().
 */
int do_truncate(struct mnt_idmap *idmap, struct dentry *dentry,
		loff_t length, unsigned int time_attrs, struct file *filp)
{
	int ret;
	struct iattr newattrs;

	/* Not pretty: "inode->i_size" shouldn't really be signed. But it is. */
	if (length < 0)
		return -EINVAL;

	newattrs.ia_size = length;
	newattrs.ia_valid = ATTR_SIZE | time_attrs;
	if (filp) {
		newattrs.ia_file = filp;
		newattrs.ia_valid |= ATTR_FILE;
	}

	/* Remove suid, sgid, and file capabilities on truncate too */
	ret = dentry_needs_remove_privs(idmap, dentry);
	if (ret < 0)
		return ret;
	/* A positive value is a set of attribute bits to add to the mask. */
	if (ret)
		newattrs.ia_valid |= ret | ATTR_FORCE;

	ret = inode_lock_killable(dentry->d_inode);
	if (ret)
		return ret;

	/* Note any delegations or leases have already been broken: */
	ret = notify_change(idmap, dentry, &newattrs, NULL);
	inode_unlock(dentry->d_inode);
	return ret;
}

/**
 * vfs_truncate - truncate the regular file at @path to @length
 * @path:	path of the file to truncate
 * @length:	new file size
 *
 * Runs the checks below in order and, if all pass, calls do_truncate()
 * with no extra time attributes and no struct file.  Mount write access
 * and inode write access taken here are dropped again before returning.
 *
 * Return: -EISDIR for a directory, -EINVAL for any other non-regular
 * file, -EPERM if IS_APPEND() is true for the inode, the first error
 * reported by one of the permission/notification/lease helpers, or the
 * result of do_truncate().
 */
int vfs_truncate(const struct path *path, loff_t length)
{
	struct mnt_idmap *idmap;
	struct inode *inode;
	int error;

	inode = path->dentry->d_inode;

	/* For directories it's -EISDIR, for other non-regulars - -EINVAL */
	if (S_ISDIR(inode->i_mode))
		return -EISDIR;
	if (!S_ISREG(inode->i_mode))
		return -EINVAL;

	idmap = mnt_idmap(path->mnt);
	error = inode_permission(idmap, inode, MAY_WRITE);
	if (error)
		return error;

	error = fsnotify_truncate_perm(path, length);
	if (error)
		return error;

	/* From here on every exit has to pair with mnt_drop_write(). */
	error = mnt_want_write(path->mnt);
	if (error)
		return error;

	error = -EPERM;
	if (IS_APPEND(inode))
		goto mnt_drop_write_and_out;

	error = get_write_access(inode);
	if (error)
		goto mnt_drop_write_and_out;

	/*
	 * Make sure that there are no leases.  get_write_access() protects
	 * against the truncate racing with a lease-granting setlease().
	 */
	error = break_lease(inode, O_WRONLY);
	if (error)
		goto put_write_and_out;

	error = security_path_truncate(path);
	if (!error)
		error = do_truncate(idmap, path->dentry, length, 0, NULL);

put_write_and_out:
	put_write_access(inode);
mnt_drop_write_and_out:
	mnt_drop_write(path->mnt);

	return error;
}
EXPORT_SYMBOL_GPL(vfs_truncate);

/**
 * ksys_truncate - truncate the file named by a user-space path
 * @pathname:	user pointer to the path
 * @length:	new file size; negative values are rejected with -EINVAL
 *
 * Looks the path up relative to AT_FDCWD with LOOKUP_FOLLOW and calls
 * vfs_truncate().  When retry_estale() asks for it, the lookup is repeated
 * once more with LOOKUP_REVAL added.
 *
 * Return: 0 or a negative errno from the lookup or vfs_truncate().
 */
int ksys_truncate(const char __user *pathname, loff_t length)
{
	unsigned int lookup_flags = LOOKUP_FOLLOW;
	struct path path;
	int error;

	if (length < 0)	/* sorry, but loff_t says... */
		return -EINVAL;

	/*
	 * NOTE(review): "name" is declared through CLASS(), so no explicit
	 * release appears in this function - presumably it is released when
	 * it goes out of scope; confirm against the CLASS(filename)
	 * definition.
	 */
	CLASS(filename, name)(pathname);
retry:
	error = filename_lookup(AT_FDCWD, name, lookup_flags, &path, NULL);
	if (!error) {
		error = vfs_truncate(&path, length);
		path_put(&path);
		if (retry_estale(error, lookup_flags)) {
			lookup_flags |= LOOKUP_REVAL;
			goto retry;
		}
	}
	return error;
}

/* truncate(2): @length arrives as a long and is passed on as loff_t. */
SYSCALL_DEFINE2(truncate, const char __user *, path, long, length)
{
	return ksys_truncate(path, length);
}

#ifdef CONFIG_COMPAT
/* Compat truncate(2): same as above with a compat_off_t length. */
COMPAT_SYSCALL_DEFINE2(truncate, const char __user *, path, compat_off_t, length)
{
	return ksys_truncate(path, length);
}
#endif

/**
 * do_ftruncate - truncate an open file to @length
 * @file:	open file; must refer to a regular file and have FMODE_WRITE
 * @length:	new file size
 * @flags:	FTRUNCATE_LFS lifts the MAX_NON_LFS limit for this call
 *
 * Return: -EINVAL if @file is not a writable regular file or if @length
 * exceeds MAX_NON_LFS while neither O_LARGEFILE is set on the file nor
 * FTRUNCATE_LFS in @flags; -EPERM if IS_APPEND() is true for
 * file_inode(@file); an error from security_file_truncate() or
 * fsnotify_truncate_perm(); otherwise the result of do_truncate(), which
 * is called with ATTR_MTIME | ATTR_CTIME and @file.
 */
int do_ftruncate(struct file *file, loff_t length, unsigned int flags)
{
	struct dentry *dentry = file->f_path.dentry;
	struct inode *inode = dentry->d_inode;
	int error;

	if (!S_ISREG(inode->i_mode) || !(file->f_mode & FMODE_WRITE))
		return -EINVAL;

	/*
	 * Cannot ftruncate over 2^31 bytes without large file support, either
	 * through opening with O_LARGEFILE or by using ftruncate64().
	 */
	if (length > MAX_NON_LFS &&
	    !(file->f_flags & O_LARGEFILE) && !(flags & FTRUNCATE_LFS))
		return -EINVAL;

	/* Check IS_APPEND on real upper inode */
	if (IS_APPEND(file_inode(file)))
		return -EPERM;

	error = security_file_truncate(file);
	if (error)
		return error;

	error = fsnotify_truncate_perm(&file->f_path, length);
	if (error)
		return error;

	/* The truncate itself runs under the super_write guard for i_sb. */
	scoped_guard(super_write, inode->i_sb)
		return do_truncate(file_mnt_idmap(file), dentry, length,
				   ATTR_MTIME | ATTR_CTIME, file);
}

/**
 * ksys_ftruncate - truncate the file behind a file descriptor
 * @fd:		file descriptor
 * @length:	new file size; negative values are rejected with -EINVAL
 * @flags:	passed through to do_ftruncate()
 *
 * Return: -EINVAL for a negative @length, -EBADF if @fd does not resolve
 * to a file, otherwise the result of do_ftruncate().
 */
int ksys_ftruncate(unsigned int fd, loff_t length, unsigned int flags)
{
	if (length < 0)
		return -EINVAL;
	CLASS(fd, f)(fd);
	if (fd_empty(f))
		return -EBADF;

	return do_ftruncate(fd_file(f), length, flags);
}

/* ftruncate(2): no FTRUNCATE_LFS, so the MAX_NON_LFS check applies. */
SYSCALL_DEFINE2(ftruncate, unsigned int, fd, off_t, length)
{
	return ksys_ftruncate(fd, length, 0);
}

#ifdef CONFIG_COMPAT
/* Compat ftruncate(2): same as above with a compat_off_t length. */
COMPAT_SYSCALL_DEFINE2(ftruncate, unsigned int, fd, compat_off_t, length)
{
	return ksys_ftruncate(fd, length, 0);
}
#endif

/* LFS versions of truncate are only needed on 32 bit machines */
#if BITS_PER_LONG == 32
SYSCALL_DEFINE2(truncate64, const char __user *, path, loff_t, length)
{
	return ksys_truncate(path, length);
}

SYSCALL_DEFINE2(ftruncate64, unsigned int, fd, loff_t, length)
{
	return ksys_ftruncate(fd, length, FTRUNCATE_LFS);
}
#endif /* BITS_PER_LONG == 32 */

#if defined(CONFIG_COMPAT) && defined(__ARCH_WANT_COMPAT_TRUNCATE64)
/* Compat truncate64: the 64-bit length is glued from its two halves. */
COMPAT_SYSCALL_DEFINE3(truncate64, const char __user *, pathname,
		       compat_arg_u64_dual(length))
{
	return ksys_truncate(pathname, compat_arg_u64_glue(length));
}
#endif

#if defined(CONFIG_COMPAT) && defined(__ARCH_WANT_COMPAT_FTRUNCATE64)
/* Compat ftruncate64: glued 64-bit length, always with FTRUNCATE_LFS. */
COMPAT_SYSCALL_DEFINE3(ftruncate64, unsigned int, fd,
		       compat_arg_u64_dual(length))
{
	return ksys_ftruncate(fd, compat_arg_u64_glue(length), FTRUNCATE_LFS);
}
#endif

/**
 * vfs_fallocate - validate a fallocate request and hand it to the file
 * @file:	open file; must have FMODE_WRITE
 * @mode:	one FALLOC_FL_* mode, optionally combined with
 *		FALLOC_FL_KEEP_SIZE
 * @offset:	start of the range; must not be negative
 * @len:	length of the range; must be greater than zero
 *
 * After all checks pass, ->fallocate() is called between
 * file_start_write() and file_end_write(), and fsnotify_modify() is
 * called when it returns 0.
 *
 * Return: 0 on success or a negative errno.  The checks below produce
 * -EINVAL, -EOPNOTSUPP, -EBADF, -EPERM, -ETXTBSY, -ESPIPE, -EISDIR,
 * -ENODEV and -EFBIG; other values come from the security/fsnotify hooks
 * or from ->fallocate() itself.
 */
int vfs_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
{
	struct inode *inode = file_inode(file);
	int ret;
	loff_t sum;

	if (offset < 0 || len <= 0)
		return -EINVAL;

	if (mode & ~(FALLOC_FL_MODE_MASK | FALLOC_FL_KEEP_SIZE))
		return -EOPNOTSUPP;

	/*
	 * Modes are exclusive, even if that is not obvious from the encoding
	 * as bit masks and the mix with the flag in the same namespace.
	 *
	 * To make things even more complicated, FALLOC_FL_ALLOCATE_RANGE is
	 * encoded as no bit set.
	 */
	switch (mode & FALLOC_FL_MODE_MASK) {
	case FALLOC_FL_ALLOCATE_RANGE:
	case FALLOC_FL_UNSHARE_RANGE:
	case FALLOC_FL_ZERO_RANGE:
		break;
	case FALLOC_FL_PUNCH_HOLE:
		/* Punching a hole is only accepted together with KEEP_SIZE. */
		if (!(mode & FALLOC_FL_KEEP_SIZE))
			return -EOPNOTSUPP;
		break;
	case FALLOC_FL_COLLAPSE_RANGE:
	case FALLOC_FL_INSERT_RANGE:
	case FALLOC_FL_WRITE_ZEROES:
		/* These modes are rejected when combined with KEEP_SIZE. */
		if (mode & FALLOC_FL_KEEP_SIZE)
			return -EOPNOTSUPP;
		break;
	default:
		return -EOPNOTSUPP;
	}

	if (!(file->f_mode & FMODE_WRITE))
		return -EBADF;

	/*
	 * On append-only files only space preallocation is supported.
	 */
	if ((mode & ~FALLOC_FL_KEEP_SIZE) && IS_APPEND(inode))
		return -EPERM;

	if (IS_IMMUTABLE(inode))
		return -EPERM;

	/*
	 * We cannot allow any fallocate operation on an active swapfile
	 */
	if (IS_SWAPFILE(inode))
		return -ETXTBSY;

	/*
	 * Revalidate the write permissions, in case security policy has
	 * changed since the files were opened.
	 */
	ret = security_file_permission(file, MAY_WRITE);
	if (ret)
		return ret;

	ret = fsnotify_file_area_perm(file, MAY_WRITE, &offset, len);
	if (ret)
		return ret;

	if (S_ISFIFO(inode->i_mode))
		return -ESPIPE;

	if (S_ISDIR(inode->i_mode))
		return -EISDIR;

	if (!S_ISREG(inode->i_mode) && !S_ISBLK(inode->i_mode))
		return -ENODEV;

	/* Check for wraparound */
	if (check_add_overflow(offset, len, &sum))
		return -EFBIG;

	if (sum > inode->i_sb->s_maxbytes)
		return -EFBIG;

	if (!file->f_op->fallocate)
		return -EOPNOTSUPP;

	file_start_write(file);
	ret = file->f_op->fallocate(file, mode, offset, len);

	/*
	 * Create inotify and fanotify events.
	 *
	 * To keep the logic simple always create events if fallocate succeeds.
	 * This implies that events are even created if the file size remains
	 * unchanged, e.g. when using flag FALLOC_FL_KEEP_SIZE.
	 */
	if (ret == 0)
		fsnotify_modify(file);

	file_end_write(file);
	return ret;
}
EXPORT_SYMBOL_GPL(vfs_fallocate);

/**
 * ksys_fallocate - fallocate on a file descriptor
 * @fd:		file descriptor
 * @mode:	passed through to vfs_fallocate()
 * @offset:	passed through to vfs_fallocate()
 * @len:	passed through to vfs_fallocate()
 *
 * Return: -EBADF if @fd does not resolve to a file, otherwise the result
 * of vfs_fallocate().
 */
int ksys_fallocate(int fd, int mode, loff_t offset, loff_t len)
{
	CLASS(fd, f)(fd);

	if (fd_empty(f))
		return -EBADF;

	return vfs_fallocate(fd_file(f), mode, offset, len);
}

SYSCALL_DEFINE4(fallocate, int, fd, int, mode, loff_t, offset, loff_t, len)
{
	return ksys_fallocate(fd, mode, offset, len);
}

#if defined(CONFIG_COMPAT) && defined(__ARCH_WANT_COMPAT_FALLOCATE)
/* Compat fallocate: offset and len are each glued from two halves. */
COMPAT_SYSCALL_DEFINE6(fallocate, int, fd, int, mode, compat_arg_u64_dual(offset),
		       compat_arg_u64_dual(len))
{
	return ksys_fallocate(fd, mode, compat_arg_u64_glue(offset),
			      compat_arg_u64_glue(len));
}
#endif

/*
 * access() needs to use the real uid/gid, not the effective uid/gid.
 * We do this by temporarily clearing all FS-related capabilities and
 * switching the fsuid/fsgid around to the real ones.
 *
 * Creating new credentials is expensive, so we try to skip doing it,
 * which we can if the result would match what we already got.
 *
 * Returns false when AT_EACCESS is set in @flags or when the current
 * credentials already have the values access_override_creds() would
 * install; returns true otherwise.
 */
static bool access_need_override_creds(int flags)
{
	const struct cred *cred;

	if (flags & AT_EACCESS)
		return false;

	cred = current_cred();
	if (!uid_eq(cred->fsuid, cred->uid) ||
	    !gid_eq(cred->fsgid, cred->gid))
		return true;

	/* Mirrors the capability adjustment in access_override_creds(). */
	if (!issecure(SECURE_NO_SETUID_FIXUP)) {
		kuid_t root_uid = make_kuid(cred->user_ns, 0);
		if (!uid_eq(cred->uid, root_uid)) {
			if (!cap_isclear(cred->cap_effective))
				return true;
		} else {
			if (!cap_isidentical(cred->cap_effective,
					     cred->cap_permitted))
				return true;
		}
	}

	return false;
}

/*
 * Prepare a copy of the current credentials with fsuid/fsgid set to
 * uid/gid (and the effective capabilities adjusted unless
 * SECURE_NO_SETUID_FIXUP is set) and install it with override_creds().
 *
 * Returns NULL if prepare_creds() fails, otherwise the value returned by
 * override_creds(); do_faccessat() later hands that value to
 * revert_creds().
 */
static const struct cred *access_override_creds(void)
{
	struct cred *override_cred;

	override_cred = prepare_creds();
	if (!override_cred)
		return NULL;

	/*
	 * XXX access_need_override_creds performs checks in hopes of skipping
	 * this work. Make sure it stays in sync if making any changes in this
	 * routine.
	 */

	override_cred->fsuid = override_cred->uid;
	override_cred->fsgid = override_cred->gid;

	if (!issecure(SECURE_NO_SETUID_FIXUP)) {
		/* Clear the capabilities if we switch to a non-root user */
		kuid_t root_uid = make_kuid(override_cred->user_ns, 0);
		if (!uid_eq(override_cred->uid, root_uid))
			cap_clear(override_cred->cap_effective);
		else
			override_cred->cap_effective =
				override_cred->cap_permitted;
	}

	/*
	 * The new set of credentials can *only* be used in
	 * task-synchronous circumstances, and does not need
	 * RCU freeing, unless somebody then takes a separate
	 * reference to it.
	 *
	 * NOTE! This is _only_ true because this credential
	 * is used purely for override_creds() that installs
	 * it as the subjective cred. Other threads will be
	 * accessing ->real_cred, not the subjective cred.
	 *
	 * If somebody _does_ make a copy of this (using the
	 * 'get_current_cred()' function), that will clear the
	 * non_rcu field, because now that other user may be
	 * expecting RCU freeing. But normal thread-synchronous
	 * cred accesses will keep things non-racy to avoid RCU
	 * freeing.
	 */
	override_cred->non_rcu = 1;
	return override_creds(override_cred);
}

/*
 * Common implementation of access(2), faccessat(2) and faccessat2(2).
 *
 * @mode may only contain bits from S_IRWXO and @flags only AT_EACCESS,
 * AT_SYMLINK_NOFOLLOW and AT_EMPTY_PATH; anything else is -EINVAL.
 * Credentials are overridden for the duration of the check when
 * access_need_override_creds() says so, and restored before returning.
 * The lookup is retried with LOOKUP_REVAL when retry_estale() asks for it.
 *
 * Returns 0 if access is granted, -ENOMEM if the credential override could
 * not be set up, -EACCES for an exec check on a regular file under a
 * noexec path, -EROFS for a write check on a read-only mount, or the error
 * from the lookup or inode_permission().
 */
static int do_faccessat(int dfd, const char __user *filename, int mode, int flags)
{
	struct path path;
	struct inode *inode;
	int res;
	unsigned int lookup_flags = LOOKUP_FOLLOW;
	const struct cred *old_cred = NULL;

	if (mode & ~S_IRWXO)	/* where's F_OK, X_OK, W_OK, R_OK? */
		return -EINVAL;

	if (flags & ~(AT_EACCESS | AT_SYMLINK_NOFOLLOW | AT_EMPTY_PATH))
		return -EINVAL;

	if (flags & AT_SYMLINK_NOFOLLOW)
		lookup_flags &= ~LOOKUP_FOLLOW;

	if (access_need_override_creds(flags)) {
		old_cred = access_override_creds();
		if (!old_cred)
			return -ENOMEM;
	}

	CLASS(filename_uflags, name)(filename, flags);
retry:
	res = filename_lookup(dfd, name, lookup_flags, &path, NULL);
	if (res)
		goto out;

	inode = d_backing_inode(path.dentry);

	if ((mode & MAY_EXEC) && S_ISREG(inode->i_mode)) {
		/*
		 * MAY_EXEC on regular files is denied if the fs is mounted
		 * with the "noexec" flag.
		 */
		res = -EACCES;
		if (path_noexec(&path))
			goto out_path_release;
	}

	res = inode_permission(mnt_idmap(path.mnt), inode, mode | MAY_ACCESS);
	/* SuS v2 requires we report a read only fs too */
	if (res || !(mode & S_IWOTH) || special_file(inode->i_mode))
		goto out_path_release;
	/*
	 * This is a rare case where using __mnt_is_readonly()
	 * is OK without a mnt_want/drop_write() pair.  Since
	 * no actual write to the fs is performed here, we do
	 * not need to telegraph that to anyone.
	 *
	 * By doing this, we accept that this access is
	 * inherently racy and know that the fs may change
	 * state before we even see this result.
	 */
	if (__mnt_is_readonly(path.mnt))
		res = -EROFS;

out_path_release:
	path_put(&path);
	if (retry_estale(res, lookup_flags)) {
		lookup_flags |= LOOKUP_REVAL;
		goto retry;
	}
out:
	/* Undo the override installed above, if any, and drop that cred. */
	if (old_cred)
		put_cred(revert_creds(old_cred));

	return res;
}

SYSCALL_DEFINE3(faccessat, int, dfd, const char __user *, filename, int, mode)
{
	return do_faccessat(dfd, filename, mode, 0);
}

SYSCALL_DEFINE4(faccessat2, int, dfd, const char __user *, filename, int, mode,
		int, flags)
{
	return do_faccessat(dfd, filename, mode, flags);
}

SYSCALL_DEFINE2(access, const char __user *, filename, int, mode)
{
	return do_faccessat(AT_FDCWD, filename, mode, 0);
}

/*
 * chdir(2): look the path up as a directory (LOOKUP_FOLLOW |
 * LOOKUP_DIRECTORY), check MAY_EXEC | MAY_CHDIR on it and, if permitted,
 * make it the pwd of current->fs.  Retries the lookup with LOOKUP_REVAL
 * when retry_estale() asks for it.
 */
SYSCALL_DEFINE1(chdir, const char __user *, filename)
{
	struct path path;
	int error;
	unsigned int lookup_flags = LOOKUP_FOLLOW | LOOKUP_DIRECTORY;
	CLASS(filename, name)(filename);
retry:
	error = filename_lookup(AT_FDCWD, name, lookup_flags, &path, NULL);
	if (!error) {
		error = path_permission(&path, MAY_EXEC | MAY_CHDIR);
		if (!error)
			set_fs_pwd(current->fs, &path);
		path_put(&path);
		if (retry_estale(error, lookup_flags)) {
			lookup_flags |= LOOKUP_REVAL;
			goto retry;
		}
	}
	return error;
}

/*
 * fchdir(2): like chdir(2) but starting from a file descriptor.  Returns
 * -EBADF if @fd does not resolve to a file and -ENOTDIR if d_can_lookup()
 * is false for its dentry.
 */
SYSCALL_DEFINE1(fchdir, unsigned int, fd)
{
	CLASS(fd_raw, f)(fd);
	int error;

	if (fd_empty(f))
		return -EBADF;

	if (!d_can_lookup(fd_file(f)->f_path.dentry))
		return -ENOTDIR;

	error = file_permission(fd_file(f), MAY_EXEC | MAY_CHDIR);
	if (!error)
		set_fs_pwd(current->fs, &fd_file(f)->f_path);
	return error;
}

/*
 * chroot(2): look the path up as a directory, check MAY_EXEC | MAY_CHDIR,
 * require CAP_SYS_CHROOT in the current user namespace (-EPERM otherwise)
 * and consult security_path_chroot() before making it the root of
 * current->fs.  Retries the lookup with LOOKUP_REVAL when retry_estale()
 * asks for it.
 */
SYSCALL_DEFINE1(chroot, const char __user *, filename)
{
	struct path path;
	int error;
	unsigned int lookup_flags = LOOKUP_FOLLOW | LOOKUP_DIRECTORY;
	CLASS(filename, name)(filename);
retry:
	error = filename_lookup(AT_FDCWD, name, lookup_flags, &path, NULL);
	if (error)
		return error;

	error = path_permission(&path, MAY_EXEC | MAY_CHDIR);
	if (error)
		goto dput_and_out;

	error = -EPERM;
	if (!ns_capable(current_user_ns(), CAP_SYS_CHROOT))
		goto dput_and_out;
	error = security_path_chroot(&path);
	if (!error)
		set_fs_root(current->fs, &path);
dput_and_out:
	path_put(&path);
	if (retry_estale(error, lookup_flags)) {
		lookup_flags |= LOOKUP_REVAL;
		goto retry;
	}
	return error;
}

/**
 * chmod_common - change the permission bits of the inode at @path
 * @path:	path whose inode is changed
 * @mode:	new mode; only the S_IALLUGO bits are taken from it
 *
 * Holds mount write access for the whole operation and the inode lock
 * around the security hook and notify_change().  If notify_change()
 * reports a delegation, break_deleg_wait() is called and, when that
 * succeeds, the locked section is retried.
 *
 * Return: 0 or a negative errno from mnt_want_write(),
 * inode_lock_killable(), security_path_chmod(), notify_change() or
 * break_deleg_wait().
 */
int chmod_common(const struct path *path, umode_t mode)
{
	struct inode *inode = path->dentry->d_inode;
	struct delegated_inode delegated_inode = { };
	struct iattr newattrs;
	int error;

	error = mnt_want_write(path->mnt);
	if (error)
		return error;
retry_deleg:
	error = inode_lock_killable(inode);
	if (error)
		goto out_mnt_unlock;
	error = security_path_chmod(path, mode);
	if (error)
		goto out_unlock;
	/* Replace the S_IALLUGO bits, keep every other bit of i_mode. */
	newattrs.ia_mode = (mode & S_IALLUGO) | (inode->i_mode & ~S_IALLUGO);
	newattrs.ia_valid = ATTR_MODE | ATTR_CTIME;
	error = notify_change(mnt_idmap(path->mnt), path->dentry,
			      &newattrs, &delegated_inode);
out_unlock:
	inode_unlock(inode);
	if (is_delegated(&delegated_inode)) {
		error = break_deleg_wait(&delegated_inode);
		if (!error)
			goto retry_deleg;
	}
out_mnt_unlock:
	mnt_drop_write(path->mnt);
	return error;
}

/**
 * vfs_fchmod - chmod on an open file
 * @file:	open file
 * @mode:	new mode, see chmod_common()
 *
 * Calls audit_file() and then chmod_common() on the file's path.
 */
int vfs_fchmod(struct file *file, umode_t mode)
{
	audit_file(file);
	return chmod_common(&file->f_path, mode);
}

SYSCALL_DEFINE2(fchmod, unsigned int, fd, umode_t, mode)
{
	CLASS(fd, f)(fd);

	if (fd_empty(f))
		return -EBADF;

	return vfs_fchmod(fd_file(f), mode);
}

/*
 * Common implementation of chmod(2), fchmodat(2) and fchmodat2(2).
 *
 * Only AT_SYMLINK_NOFOLLOW and AT_EMPTY_PATH are accepted in @flags
 * (-EINVAL otherwise).  Symlinks are followed unless AT_SYMLINK_NOFOLLOW
 * is set.  The lookup is retried with LOOKUP_REVAL when retry_estale()
 * asks for it.
 */
static int do_fchmodat(int dfd, const char __user *filename, umode_t mode,
		       unsigned int flags)
{
	struct path path;
	int error;
	unsigned int lookup_flags;

	if (unlikely(flags & ~(AT_SYMLINK_NOFOLLOW | AT_EMPTY_PATH)))
		return -EINVAL;

	lookup_flags = (flags & AT_SYMLINK_NOFOLLOW) ? 0 : LOOKUP_FOLLOW;
	CLASS(filename_uflags, name)(filename, flags);
retry:
	error = filename_lookup(dfd, name, lookup_flags, &path, NULL);
	if (!error) {
		error = chmod_common(&path, mode);
		path_put(&path);
		if (retry_estale(error, lookup_flags)) {
			lookup_flags |= LOOKUP_REVAL;
			goto retry;
		}
	}
	return error;
}

SYSCALL_DEFINE4(fchmodat2, int, dfd, const char __user *, filename,
		umode_t, mode, unsigned int, flags)
{
	return do_fchmodat(dfd, filename, mode, flags);
}

SYSCALL_DEFINE3(fchmodat, int, dfd, const char __user *, filename,
		umode_t, mode)
{
	return do_fchmodat(dfd, filename, mode, 0);
}

SYSCALL_DEFINE2(chmod, const char __user *, filename, umode_t, mode)
{
	return do_fchmodat(AT_FDCWD, filename, mode, 0);
}

/*
 * Check whether @kuid is valid and if so generate and set vfsuid_t in
 * ia_vfsuid.
 *
 * Return: true if @kuid is valid, false if not.
 */
static inline bool setattr_vfsuid(struct iattr *attr, kuid_t kuid)
{
	if (!uid_valid(kuid))
		return false;
	attr->ia_valid |= ATTR_UID;
	attr->ia_vfsuid = VFSUIDT_INIT(kuid);
	return true;
}

/*
 * Check whether @kgid is valid and if so generate and set vfsgid_t in
 * ia_vfsgid.
 *
 * Return: true if @kgid is valid, false if not.
 */
static inline bool setattr_vfsgid(struct iattr *attr, kgid_t kgid)
{
	if (!gid_valid(kgid))
		return false;
	attr->ia_valid |= ATTR_GID;
	attr->ia_vfsgid = VFSGIDT_INIT(kgid);
	return true;
}

/**
 * chown_common - change owner and/or group of the inode at @path
 * @path:	path whose inode is changed
 * @user:	new owner as seen from the caller's user namespace, or
 *		(uid_t)-1 to leave the owner alone
 * @group:	new group as seen from the caller's user namespace, or
 *		(gid_t)-1 to leave the group alone
 *
 * This function does not take mount write access itself; the callers in
 * this file (do_fchownat() and vfs_fchown()) do that around the call.
 * If notify_change() reports a delegation, break_deleg_wait() is called
 * and, when that succeeds, the operation is retried.
 *
 * Return: -EINVAL if a requested id does not map to a valid kuid/kgid,
 * otherwise 0 or a negative errno from inode_lock_killable(),
 * security_path_chown(), notify_change() or break_deleg_wait().
 */
int chown_common(const struct path *path, uid_t user, gid_t group)
{
	struct mnt_idmap *idmap;
	struct user_namespace *fs_userns;
	struct inode *inode = path->dentry->d_inode;
	struct delegated_inode delegated_inode = { };
	int error;
	struct iattr newattrs;
	kuid_t uid;
	kgid_t gid;

	uid = make_kuid(current_user_ns(), user);
	gid = make_kgid(current_user_ns(), group);

	idmap = mnt_idmap(path->mnt);
	fs_userns = i_user_ns(inode);

retry_deleg:
	/* newattrs is rebuilt from scratch on every (re)try. */
	newattrs.ia_vfsuid = INVALID_VFSUID;
	newattrs.ia_vfsgid = INVALID_VFSGID;
	newattrs.ia_valid =  ATTR_CTIME;
	if ((user != (uid_t)-1) && !setattr_vfsuid(&newattrs, uid))
		return -EINVAL;
	if ((group != (gid_t)-1) && !setattr_vfsgid(&newattrs, gid))
		return -EINVAL;
	error = inode_lock_killable(inode);
	if (error)
		return error;
	/* For everything but directories also request the kill bits. */
	if (!S_ISDIR(inode->i_mode))
		newattrs.ia_valid |= ATTR_KILL_SUID | ATTR_KILL_PRIV |
				     setattr_should_drop_sgid(idmap, inode);
	/* Continue to send actual fs values, not the mount values. */
	error = security_path_chown(
		path,
		from_vfsuid(idmap, fs_userns, newattrs.ia_vfsuid),
		from_vfsgid(idmap, fs_userns, newattrs.ia_vfsgid));
	if (!error)
		error = notify_change(idmap, path->dentry, &newattrs,
				      &delegated_inode);
	inode_unlock(inode);
	if (is_delegated(&delegated_inode)) {
		error = break_deleg_wait(&delegated_inode);
		if (!error)
			goto retry_deleg;
	}
	return error;
}

/**
 * do_fchownat - common implementation of chown(2), lchown(2), fchownat(2)
 * @dfd:	directory file descriptor for the lookup
 * @filename:	user pointer to the path
 * @user:	passed through to chown_common()
 * @group:	passed through to chown_common()
 * @flag:	AT_SYMLINK_NOFOLLOW and/or AT_EMPTY_PATH; any other bit is
 *		rejected with -EINVAL
 *
 * Takes mount write access around chown_common() and retries the lookup
 * with LOOKUP_REVAL when retry_estale() asks for it.
 *
 * Return: 0 or a negative errno.
 */
int do_fchownat(int dfd, const char __user *filename, uid_t user, gid_t group,
		int flag)
{
	struct path path;
	int error;
	int lookup_flags;

	if ((flag & ~(AT_SYMLINK_NOFOLLOW | AT_EMPTY_PATH)) != 0)
		return -EINVAL;

	lookup_flags = (flag & AT_SYMLINK_NOFOLLOW) ? 0 : LOOKUP_FOLLOW;
	CLASS(filename_uflags, name)(filename, flag);
retry:
	error = filename_lookup(dfd, name, lookup_flags, &path, NULL);
	if (!error) {
		error = mnt_want_write(path.mnt);
		if (!error) {
			error = chown_common(&path, user, group);
			mnt_drop_write(path.mnt);
		}
		path_put(&path);
		if (retry_estale(error, lookup_flags)) {
			lookup_flags |= LOOKUP_REVAL;
			goto retry;
		}
	}
	return error;
}

SYSCALL_DEFINE5(fchownat, int, dfd, const char __user *, filename, uid_t, user,
		gid_t, group, int, flag)
{
	return do_fchownat(dfd, filename, user, group, flag);
}

SYSCALL_DEFINE3(chown, const char __user *, filename, uid_t, user, gid_t, group)
{
	return do_fchownat(AT_FDCWD, filename, user, group, 0);
}

/* lchown(2): chown without following a trailing symlink. */
SYSCALL_DEFINE3(lchown, const char __user *, filename, uid_t, user, gid_t, group)
{
	return do_fchownat(AT_FDCWD, filename, user, group,
			   AT_SYMLINK_NOFOLLOW);
}

/**
 * vfs_fchown - chown on an open file
 * @file:	open file
 * @user:	passed through to chown_common()
 * @group:	passed through to chown_common()
 *
 * Takes write access via mnt_want_write_file(), calls audit_file() and
 * chown_common() on the file's path, then drops the write access again.
 *
 * Return: 0 or a negative errno.
 */
int vfs_fchown(struct file *file, uid_t user, gid_t group)
{
	int error;

	error = mnt_want_write_file(file);
	if (error)
		return error;
	audit_file(file);
	error = chown_common(&file->f_path, user, group);
	mnt_drop_write_file(file);
	return error;
}

/**
 * ksys_fchown - chown on a file descriptor
 * @fd:		file descriptor
 * @user:	passed through to vfs_fchown()
 * @group:	passed through to vfs_fchown()
 *
 * Return: -EBADF if @fd does not resolve to a file, otherwise the result
 * of vfs_fchown().
 */
int ksys_fchown(unsigned int fd, uid_t user, gid_t group)
{
	CLASS(fd, f)(fd);

	if (fd_empty(f))
		return -EBADF;

	return vfs_fchown(fd_file(f), user, group);
}

SYSCALL_DEFINE3(fchown, unsigned int, fd, uid_t, user, gid_t, group)
{
	return ksys_fchown(fd, user, group);
}

/*
 * Take the write-access references needed for a file opened for writing:
 * on the inode, on the mount of f->f_path and, for FMODE_BACKING files,
 * on the mount of backing_file_user_path(f) as well.
 *
 * On failure everything taken so far is released again and the error is
 * returned; on success 0 is returned with all references held.
 */
static inline int file_get_write_access(struct file *f)
{
	int error;

	error = get_write_access(f->f_inode);
	if (unlikely(error))
		return error;
	error = mnt_get_write_access(f->f_path.mnt);
	if (unlikely(error))
		goto cleanup_inode;
	if (unlikely(f->f_mode & FMODE_BACKING)) {
		error = mnt_get_write_access(backing_file_user_path(f)->mnt);
		if (unlikely(error))
			goto cleanup_mnt;
	}
	return 0;

cleanup_mnt:
	mnt_put_write_access(f->f_path.mnt);
cleanup_inode:
	put_write_access(f->f_inode);
	return error;
}

/*
 * Do the actual work of opening @f, whose path has already been filled in
 * by the caller.
 *
 * @open, when non-NULL, is called instead of f->f_op->open.
 *
 * Takes a reference on f->f_path, initialises the inode/mapping/error
 * fields, accounts read or write access, runs the security, fsnotify and
 * lease hooks, calls the open method and finally derives the FMODE_CAN_*
 * and related mode bits from the file operations.
 *
 * O_PATH opens return early with FMODE_PATH | FMODE_OPENED and an empty
 * set of file operations.
 *
 * Returns 0 on success.  On the failures routed through the cleanup
 * labels the path reference is dropped and the path/inode fields of @f
 * are reset to NULL before the negative errno is returned.
 */
static int do_dentry_open(struct file *f,
			  int (*open)(struct inode *, struct file *))
{
	static const struct file_operations empty_fops = {};
	struct inode *inode = f->f_path.dentry->d_inode;
	int error;

	path_get(&f->f_path);
	f->f_inode = inode;
	f->f_mapping = inode->i_mapping;
	f->f_wb_err = filemap_sample_wb_err(f->f_mapping);
	f->f_sb_err = file_sample_sb_err(f);

	if (unlikely(f->f_flags & O_PATH)) {
		f->f_mode = FMODE_PATH | FMODE_OPENED;
		file_set_fsnotify_mode(f, FMODE_NONOTIFY);
		f->f_op = &empty_fops;
		return 0;
	}

	/* Read-only opens bump the read count; writers take write access. */
	if ((f->f_mode & (FMODE_READ | FMODE_WRITE)) == FMODE_READ) {
		i_readcount_inc(inode);
	} else if (f->f_mode & FMODE_WRITE && !special_file(inode->i_mode)) {
		error = file_get_write_access(f);
		if (unlikely(error))
			goto cleanup_file;
		f->f_mode |= FMODE_WRITER;
	}

	/* POSIX.1-2008/SUSv4 Section XSI 2.9.7 */
	if (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode))
		f->f_mode |= FMODE_ATOMIC_POS;

	f->f_op = fops_get(inode->i_fop);
	if (WARN_ON(!f->f_op)) {
		error = -ENODEV;
		goto cleanup_all;
	}

	error = security_file_open(f);
	if (unlikely(error))
		goto cleanup_all;

	/*
	 * Call fsnotify open permission hook and set FMODE_NONOTIFY_* bits
	 * according to existing permission watches.
	 * If FMODE_NONOTIFY mode was already set for an fanotify fd or for a
	 * pseudo file, this call will not change the mode.
	 */
	error = fsnotify_open_perm_and_set_mode(f);
	if (unlikely(error))
		goto cleanup_all;

	error = break_lease(file_inode(f), f->f_flags);
	if (unlikely(error))
		goto cleanup_all;

	/* normally all 3 are set; ->open() can clear them if needed */
	f->f_mode |= FMODE_LSEEK | FMODE_PREAD | FMODE_PWRITE;
	if (!open)
		open = f->f_op->open;
	if (open) {
		error = open(inode, f);
		if (error)
			goto cleanup_all;
	}
	f->f_mode |= FMODE_OPENED;
	if ((f->f_mode & FMODE_READ) &&
	     likely(f->f_op->read || f->f_op->read_iter))
		f->f_mode |= FMODE_CAN_READ;
	if ((f->f_mode & FMODE_WRITE) &&
	     likely(f->f_op->write || f->f_op->write_iter))
		f->f_mode |= FMODE_CAN_WRITE;
	if ((f->f_mode & FMODE_LSEEK) && !f->f_op->llseek)
		f->f_mode &= ~FMODE_LSEEK;
	if (f->f_mapping->a_ops && f->f_mapping->a_ops->direct_IO)
		f->f_mode |= FMODE_CAN_ODIRECT;

	/* These flags only matter while opening; do not keep them around. */
	f->f_flags &= ~(O_CREAT | O_EXCL | O_NOCTTY | O_TRUNC);
	f->f_iocb_flags = iocb_flags(f);

	file_ra_state_init(&f->f_ra, f->f_mapping->host->i_mapping);

	/*
	 * This error is returned directly, not via the cleanup labels, and
	 * FMODE_OPENED is already set at this point.
	 * NOTE(review): presumably the caller's fput() does the teardown in
	 * that case - confirm against the callers.
	 */
	if ((f->f_flags & O_DIRECT) && !(f->f_mode & FMODE_CAN_ODIRECT))
		return -EINVAL;

	/*
	 * XXX: Huge page cache doesn't support writing yet. Drop all page
	 * cache for this file before processing writes.
	 */
	if (f->f_mode & FMODE_WRITE) {
		/*
		 * Depends on full fence from get_write_access() to synchronize
		 * against collapse_file() regarding i_writecount and nr_thps
		 * updates. Ensures subsequent insertion of THPs into the page
		 * cache will fail.
		 */
		if (filemap_nr_thps(inode->i_mapping)) {
			struct address_space *mapping = inode->i_mapping;

			filemap_invalidate_lock(inode->i_mapping);
			/*
			 * unmap_mapping_range() only needs to be called once
			 * here, because private pages do not need to be
			 * unmapped (e.g. the data segment of dynamic shared
			 * libraries).
			 */
			unmap_mapping_range(mapping, 0, 0, 0);
			truncate_inode_pages(mapping, 0);
			filemap_invalidate_unlock(inode->i_mapping);
		}
	}

	return 0;

cleanup_all:
	/* A positive "error" is a bug in whoever produced it; sanitise it. */
	if (WARN_ON_ONCE(error > 0))
		error = -EINVAL;
	fops_put(f->f_op);
	put_file_access(f);
cleanup_file:
	path_put(&f->f_path);
	f->__f_path.mnt = NULL;
	f->__f_path.dentry = NULL;
	f->f_inode = NULL;
	return error;
}

/**
 * finish_open - finish opening a file
 * @file: file pointer
 * @dentry: pointer to dentry
 * @open: open callback
 *
 * This can be used to finish opening a file passed to i_op->atomic_open().
 *
 * If the open callback is set to NULL, then the standard f_op->open()
 * filesystem callback is substituted.
 *
 * NB: the dentry reference is _not_ consumed.  If, for example, the dentry is
 * the return value of d_splice_alias(), then the caller needs to perform dput()
 * on it after finish_open().
 *
 * Returns zero on success or -errno if the open failed.
 */
int finish_open(struct file *file, struct dentry *dentry,
		int (*open)(struct inode *, struct file *))
{
	BUG_ON(file->f_mode & FMODE_OPENED); /* once it's opened, it's opened */

	file->__f_path.dentry = dentry;
	return do_dentry_open(file, open);
}
EXPORT_SYMBOL(finish_open);

/**
 * finish_no_open - finish ->atomic_open() without opening the file
 *
 * @file: file pointer
 * @dentry: dentry, ERR_PTR(-E...) or NULL (as returned from ->lookup())
 *
 * This can be used to set the result of a lookup in ->atomic_open().
 *
 * NB: unlike finish_open() this function does consume the dentry reference and
 * the caller need not dput() it.
 *
 * Returns 0 or -E..., which must be the return value of ->atomic_open() after
 * having called this function.
 */
int finish_no_open(struct file *file, struct dentry *dentry)
{
	if (IS_ERR(dentry))
		return PTR_ERR(dentry);
	file->__f_path.dentry = dentry;
	return 0;
}
EXPORT_SYMBOL(finish_no_open);

/**
 * file_path - format the path of an open file into a buffer
 * @filp:	open file
 * @buf:	buffer handed to d_path()
 * @buflen:	size of @buf
 *
 * Thin wrapper around d_path() on the file's f_path.
 *
 * Return: whatever d_path() returns.
 */
char *file_path(struct file *filp, char *buf, int buflen)
{
	return d_path(&filp->f_path, buf, buflen);
}
EXPORT_SYMBOL(file_path);

/**
 * vfs_open - open the file at the given path
 * @path: path to open
 * @file: newly allocated file with f_flag initialized
 *
 * Return: 0 on success or the error from do_dentry_open().
 */
int vfs_open(const struct path *path, struct file *file)
{
	int ret;

	file->__f_path = *path;
	ret = do_dentry_open(file, NULL);
	if (!ret) {
		/*
		 * Once we return a file with FMODE_OPENED, __fput() will call
		 * fsnotify_close(), so we need fsnotify_open() here for
		 * symmetry.
		 */
		fsnotify_open(file);
	}
	return ret;
}

/**
 * dentry_open - allocate a file and open it on @path
 * @path:	path to open; path->mnt must not be NULL
 * @flags:	open flags handed to alloc_empty_file()
 * @cred:	credentials handed to alloc_empty_file()
 *
 * Return: the opened file, the error pointer from alloc_empty_file(), or
 * ERR_PTR() of the vfs_open() error (the file is released with fput() in
 * that case).
 */
struct file *dentry_open(const struct path *path, int flags,
			 const struct cred *cred)
{
	int error;
	struct file *f;

	/* We must always pass in a valid mount pointer. */
	BUG_ON(!path->mnt);

	f = alloc_empty_file(flags, cred);
	if (!IS_ERR(f)) {
		error = vfs_open(path, f);
		if (error) {
			fput(f);
			f = ERR_PTR(error);
		}
	}
	return f;
}
EXPORT_SYMBOL(dentry_open);

/**
 * dentry_open_nonotify - like dentry_open() with FMODE_NONOTIFY preset
 * @path:	path to open
 * @flags:	open flags handed to alloc_empty_file()
 * @cred:	credentials handed to alloc_empty_file()
 *
 * Sets the fsnotify mode of the new file to FMODE_NONOTIFY before calling
 * vfs_open().  Unlike dentry_open() there is no BUG_ON() on path->mnt.
 *
 * Return: the opened file or an error pointer.
 */
struct file *dentry_open_nonotify(const struct path *path, int flags,
				  const struct cred *cred)
{
	struct file *f = alloc_empty_file(flags, cred);
	if (!IS_ERR(f)) {
		int error;

		file_set_fsnotify_mode(f, FMODE_NONOTIFY);
		error = vfs_open(path, f);
		if (error) {
			fput(f);
			f = ERR_PTR(error);
		}
	}
	return f;
}

/**
 * kernel_file_open - open a file for kernel internal use
 * @path:	path of the file to open
 * @flags:	open flags
 * @cred:	credentials for open
 *
 * Open a file for use by in-kernel consumers. The file is not accounted
 * against nr_files and must not be installed into the file descriptor
 * table.
 *
 * Return: Opened file on success, an error pointer on failure.
 */
struct file *kernel_file_open(const struct path *path, int flags,
				const struct cred *cred)
{
	struct file *f;
	int error;

	f = alloc_empty_file_noaccount(flags, cred);
	if (IS_ERR(f))
		return f;

	error = vfs_open(path, f);
	if (error) {
		fput(f);
		return ERR_PTR(error);
	}
	return f;
}
EXPORT_SYMBOL_GPL(kernel_file_open);

/* Non-zero when @flags asks for a file to be created (O_CREAT/O_TMPFILE). */
#define WILL_CREATE(flags)	(flags & (O_CREAT | __O_TMPFILE))
/* The only flags that may accompany O_PATH. */
#define O_PATH_FLAGS		(O_DIRECTORY | O_NOFOLLOW | O_PATH | O_CLOEXEC)

/*
 * Build a struct open_how from classic open(2)-style @flags and @mode.
 *
 * Unknown flag bits and mode bits outside S_IALLUGO are silently dropped.
 * With O_PATH only the O_PATH_FLAGS are kept, and the mode is zeroed
 * unless the flags request creation.
 */
inline struct open_how build_open_how(int flags, umode_t mode)
{
	struct open_how how = {
		.flags = flags & VALID_OPEN_FLAGS,
		.mode = mode & S_IALLUGO,
	};

	/* O_PATH beats everything else. */
	if (how.flags & O_PATH)
		how.flags &= O_PATH_FLAGS;
	/* Modes should only be set for create-like flags. */
	if (!WILL_CREATE(how.flags))
		how.mode = 0;
	return how;
}

/*
 * Validate @how and translate it into the struct open_flags @op used by
 * the path walk.
 *
 * Returns 0 on success, -EINVAL for invalid flag/mode/resolve
 * combinations, or -EAGAIN when RESOLVE_CACHED is combined with
 * O_TRUNC, O_CREAT or __O_TMPFILE.
 *
 * Note that @op is filled in incrementally: op->mode is written before
 * the later checks run, so on an error return @op may be partially
 * updated.
 */
inline int build_open_flags(const struct open_how *how, struct open_flags *op)
{
	u64 flags = how->flags;
	u64 strip = O_CLOEXEC;
	int lookup_flags = 0;
	int acc_mode = ACC_MODE(flags);

	BUILD_BUG_ON_MSG(upper_32_bits(VALID_OPEN_FLAGS),
			 "struct open_flags doesn't yet handle flags > 32 bits");

	/*
	 * Strip flags that aren't relevant in determining struct open_flags.
	 */
	flags &= ~strip;

	/*
	 * Older syscalls implicitly clear all of the invalid flags or argument
	 * values before calling build_open_flags(), but openat2(2) checks all
	 * of its arguments.
	 */
	if (flags & ~VALID_OPEN_FLAGS)
		return -EINVAL;
	if (how->resolve & ~VALID_RESOLVE_FLAGS)
		return -EINVAL;

	/* Scoping flags are mutually exclusive. */
	if ((how->resolve & RESOLVE_BENEATH) && (how->resolve & RESOLVE_IN_ROOT))
		return -EINVAL;

	/* Deal with the mode. */
	if (WILL_CREATE(flags)) {
		if (how->mode & ~S_IALLUGO)
			return -EINVAL;
		op->mode = how->mode | S_IFREG;
	} else {
		if (how->mode != 0)
			return -EINVAL;
		op->mode = 0;
	}

	/*
	 * Block bugs where O_DIRECTORY | O_CREAT created regular files.
	 * Note, that blocking O_DIRECTORY | O_CREAT here also protects
	 * O_TMPFILE below which requires O_DIRECTORY being raised.
	 */
	if ((flags & (O_DIRECTORY | O_CREAT)) == (O_DIRECTORY | O_CREAT))
		return -EINVAL;

	/* Now handle the creative implementation of O_TMPFILE. */
	if (flags & __O_TMPFILE) {
		/*
		 * In order to ensure programs get explicit errors when trying
		 * to use O_TMPFILE on old kernels we enforce that O_DIRECTORY
		 * is raised alongside __O_TMPFILE.
		 */
		if (!(flags & O_DIRECTORY))
			return -EINVAL;
		if (!(acc_mode & MAY_WRITE))
			return -EINVAL;
	}
	if (flags & O_PATH) {
		/* O_PATH only permits certain other flags to be set. */
		if (flags & ~O_PATH_FLAGS)
			return -EINVAL;
		acc_mode = 0;
	}

	/*
	 * O_SYNC is implemented as __O_SYNC|O_DSYNC.  As many places only
	 * check for O_DSYNC if they need any syncing at all we enforce it's
	 * always set instead of having to deal with possibly weird behaviour
	 * for malicious applications setting only __O_SYNC.
	 */
	if (flags & __O_SYNC)
		flags |= O_DSYNC;

	op->open_flag = flags;

	/* O_TRUNC implies we need access checks for write permissions */
	if (flags & O_TRUNC)
		acc_mode |= MAY_WRITE;

	/* Allow the LSM permission hook to distinguish append
	   access from general write access. */
	if (flags & O_APPEND)
		acc_mode |= MAY_APPEND;

	op->acc_mode = acc_mode;

	op->intent = flags & O_PATH ? 0 : LOOKUP_OPEN;

	if (flags & O_CREAT) {
		op->intent |= LOOKUP_CREATE;
		if (flags & O_EXCL) {
			op->intent |= LOOKUP_EXCL;
			/*
			 * Only the local copy is changed; op->open_flag was
			 * stored above.  This makes the LOOKUP_FOLLOW test
			 * below skip following for O_CREAT | O_EXCL.
			 */
			flags |= O_NOFOLLOW;
		}
	}

	if (flags & O_DIRECTORY)
		lookup_flags |= LOOKUP_DIRECTORY;
	if (!(flags & O_NOFOLLOW))
		lookup_flags |= LOOKUP_FOLLOW;

	/* Translate the openat2(2) RESOLVE_* bits one to one. */
	if (how->resolve & RESOLVE_NO_XDEV)
		lookup_flags |= LOOKUP_NO_XDEV;
	if (how->resolve & RESOLVE_NO_MAGICLINKS)
		lookup_flags |= LOOKUP_NO_MAGICLINKS;
	if (how->resolve & RESOLVE_NO_SYMLINKS)
		lookup_flags |= LOOKUP_NO_SYMLINKS;
	if (how->resolve & RESOLVE_BENEATH)
		lookup_flags |= LOOKUP_BENEATH;
	if (how->resolve & RESOLVE_IN_ROOT)
		lookup_flags |= LOOKUP_IN_ROOT;
	if (how->resolve & RESOLVE_CACHED) {
		/* Don't bother even trying for create/truncate/tmpfile open */
		if (flags & (O_TRUNC | O_CREAT | __O_TMPFILE))
			return -EAGAIN;
		lookup_flags |= LOOKUP_CACHED;
	}

	op->lookup_flags = lookup_flags;
	return 0;
}

/**
 * file_open_name - open file and return file pointer
 *
 * @name:	struct filename containing path to open
 * @flags:	open flags as per the open(2) second argument
 * @mode:	mode for the new file if O_CREAT is set, else ignored
 *
 * This is the helper to open a file from kernelspace if you really
 * have to.  But in generally you should not do this, so please move
 * along, nothing to see here..
1314 */ 1315 struct file *file_open_name(struct filename *name, int flags, umode_t mode) 1316 { 1317 struct open_flags op; 1318 struct open_how how = build_open_how(flags, mode); 1319 int err = build_open_flags(&how, &op); 1320 if (err) 1321 return ERR_PTR(err); 1322 return do_file_open(AT_FDCWD, name, &op); 1323 } 1324 1325 /** 1326 * filp_open - open file and return file pointer 1327 * 1328 * @filename: path to open 1329 * @flags: open flags as per the open(2) second argument 1330 * @mode: mode for the new file if O_CREAT is set, else ignored 1331 * 1332 * This is the helper to open a file from kernelspace if you really 1333 * have to. But in generally you should not do this, so please move 1334 * along, nothing to see here.. 1335 */ 1336 struct file *filp_open(const char *filename, int flags, umode_t mode) 1337 { 1338 CLASS(filename_kernel, name)(filename); 1339 return file_open_name(name, flags, mode); 1340 } 1341 EXPORT_SYMBOL(filp_open); 1342 1343 struct file *file_open_root(const struct path *root, 1344 const char *filename, int flags, umode_t mode) 1345 { 1346 struct open_flags op; 1347 struct open_how how = build_open_how(flags, mode); 1348 int err = build_open_flags(&how, &op); 1349 if (err) 1350 return ERR_PTR(err); 1351 return do_file_open_root(root, filename, &op); 1352 } 1353 EXPORT_SYMBOL(file_open_root); 1354 1355 static int do_sys_openat2(int dfd, const char __user *filename, 1356 struct open_how *how) 1357 { 1358 struct open_flags op; 1359 int err = build_open_flags(how, &op); 1360 if (unlikely(err)) 1361 return err; 1362 1363 CLASS(filename, name)(filename); 1364 return FD_ADD(how->flags, do_file_open(dfd, name, &op)); 1365 } 1366 1367 int do_sys_open(int dfd, const char __user *filename, int flags, umode_t mode) 1368 { 1369 struct open_how how = build_open_how(flags, mode); 1370 return do_sys_openat2(dfd, filename, &how); 1371 } 1372 1373 1374 SYSCALL_DEFINE3(open, const char __user *, filename, int, flags, umode_t, mode) 1375 { 1376 if 
(force_o_largefile()) 1377 flags |= O_LARGEFILE; 1378 return do_sys_open(AT_FDCWD, filename, flags, mode); 1379 } 1380 1381 SYSCALL_DEFINE4(openat, int, dfd, const char __user *, filename, int, flags, 1382 umode_t, mode) 1383 { 1384 if (force_o_largefile()) 1385 flags |= O_LARGEFILE; 1386 return do_sys_open(dfd, filename, flags, mode); 1387 } 1388 1389 SYSCALL_DEFINE4(openat2, int, dfd, const char __user *, filename, 1390 struct open_how __user *, how, size_t, usize) 1391 { 1392 int err; 1393 struct open_how tmp; 1394 1395 BUILD_BUG_ON(sizeof(struct open_how) < OPEN_HOW_SIZE_VER0); 1396 BUILD_BUG_ON(sizeof(struct open_how) != OPEN_HOW_SIZE_LATEST); 1397 1398 if (unlikely(usize < OPEN_HOW_SIZE_VER0)) 1399 return -EINVAL; 1400 if (unlikely(usize > PAGE_SIZE)) 1401 return -E2BIG; 1402 1403 err = copy_struct_from_user(&tmp, sizeof(tmp), how, usize); 1404 if (err) 1405 return err; 1406 1407 audit_openat2_how(&tmp); 1408 1409 /* O_LARGEFILE is only allowed for non-O_PATH. */ 1410 if (!(tmp.flags & O_PATH) && force_o_largefile()) 1411 tmp.flags |= O_LARGEFILE; 1412 1413 return do_sys_openat2(dfd, filename, &tmp); 1414 } 1415 1416 #ifdef CONFIG_COMPAT 1417 /* 1418 * Exactly like sys_open(), except that it doesn't set the 1419 * O_LARGEFILE flag. 1420 */ 1421 COMPAT_SYSCALL_DEFINE3(open, const char __user *, filename, int, flags, umode_t, mode) 1422 { 1423 return do_sys_open(AT_FDCWD, filename, flags, mode); 1424 } 1425 1426 /* 1427 * Exactly like sys_openat(), except that it doesn't set the 1428 * O_LARGEFILE flag. 1429 */ 1430 COMPAT_SYSCALL_DEFINE4(openat, int, dfd, const char __user *, filename, int, flags, umode_t, mode) 1431 { 1432 return do_sys_open(dfd, filename, flags, mode); 1433 } 1434 #endif 1435 1436 #ifndef __alpha__ 1437 1438 /* 1439 * For backward compatibility? Maybe this should be moved 1440 * into arch/i386 instead? 
1441 */ 1442 SYSCALL_DEFINE2(creat, const char __user *, pathname, umode_t, mode) 1443 { 1444 int flags = O_CREAT | O_WRONLY | O_TRUNC; 1445 1446 if (force_o_largefile()) 1447 flags |= O_LARGEFILE; 1448 return do_sys_open(AT_FDCWD, pathname, flags, mode); 1449 } 1450 #endif 1451 1452 /* 1453 * "id" is the POSIX thread ID. We use the 1454 * files pointer for this.. 1455 */ 1456 static int filp_flush(struct file *filp, fl_owner_t id) 1457 { 1458 int retval = 0; 1459 1460 if (CHECK_DATA_CORRUPTION(file_count(filp) == 0, filp, 1461 "VFS: Close: file count is 0 (f_op=%ps)", 1462 filp->f_op)) { 1463 return 0; 1464 } 1465 1466 if (filp->f_op->flush) 1467 retval = filp->f_op->flush(filp, id); 1468 1469 if (likely(!(filp->f_mode & FMODE_PATH))) { 1470 dnotify_flush(filp, id); 1471 locks_remove_posix(filp, id); 1472 } 1473 return retval; 1474 } 1475 1476 int filp_close(struct file *filp, fl_owner_t id) 1477 { 1478 int retval; 1479 1480 retval = filp_flush(filp, id); 1481 fput_close(filp); 1482 1483 return retval; 1484 } 1485 EXPORT_SYMBOL(filp_close); 1486 1487 /* 1488 * Careful here! We test whether the file pointer is NULL before 1489 * releasing the fd. This ensures that one clone task can't release 1490 * an fd while another clone is opening it. 1491 */ 1492 SYSCALL_DEFINE1(close, unsigned int, fd) 1493 { 1494 int retval; 1495 struct file *file; 1496 1497 file = file_close_fd(fd); 1498 if (!file) 1499 return -EBADF; 1500 1501 retval = filp_flush(file, current->files); 1502 1503 /* 1504 * We're returning to user space. Don't bother 1505 * with any delayed fput() cases. 
1506 */ 1507 fput_close_sync(file); 1508 1509 if (likely(retval == 0)) 1510 return 0; 1511 1512 /* can't restart close syscall because file table entry was cleared */ 1513 if (retval == -ERESTARTSYS || 1514 retval == -ERESTARTNOINTR || 1515 retval == -ERESTARTNOHAND || 1516 retval == -ERESTART_RESTARTBLOCK) 1517 retval = -EINTR; 1518 1519 return retval; 1520 } 1521 1522 /* 1523 * This routine simulates a hangup on the tty, to arrange that users 1524 * are given clean terminals at login time. 1525 */ 1526 SYSCALL_DEFINE0(vhangup) 1527 { 1528 if (capable(CAP_SYS_TTY_CONFIG)) { 1529 tty_vhangup_self(); 1530 return 0; 1531 } 1532 return -EPERM; 1533 } 1534 1535 /* 1536 * Called when an inode is about to be open. 1537 * We use this to disallow opening large files on 32bit systems if 1538 * the caller didn't specify O_LARGEFILE. On 64bit systems we force 1539 * on this flag in sys_open. 1540 */ 1541 int generic_file_open(struct inode * inode, struct file * filp) 1542 { 1543 if (!(filp->f_flags & O_LARGEFILE) && i_size_read(inode) > MAX_NON_LFS) 1544 return -EOVERFLOW; 1545 return 0; 1546 } 1547 1548 EXPORT_SYMBOL(generic_file_open); 1549 1550 /* 1551 * This is used by subsystems that don't want seekable 1552 * file descriptors. The function is not supposed to ever fail, the only 1553 * reason it returns an 'int' and not 'void' is so that it can be plugged 1554 * directly into file_operations structure. 1555 */ 1556 int nonseekable_open(struct inode *inode, struct file *filp) 1557 { 1558 filp->f_mode &= ~(FMODE_LSEEK | FMODE_PREAD | FMODE_PWRITE); 1559 return 0; 1560 } 1561 1562 EXPORT_SYMBOL(nonseekable_open); 1563 1564 /* 1565 * stream_open is used by subsystems that want stream-like file descriptors. 1566 * Such file descriptors are not seekable and don't have notion of position 1567 * (file.f_pos is always 0 and ppos passed to .read()/.write() is always NULL). 
1568 * Contrary to file descriptors of other regular files, .read() and .write() 1569 * can run simultaneously. 1570 * 1571 * stream_open never fails and is marked to return int so that it could be 1572 * directly used as file_operations.open . 1573 */ 1574 int stream_open(struct inode *inode, struct file *filp) 1575 { 1576 filp->f_mode &= ~(FMODE_LSEEK | FMODE_PREAD | FMODE_PWRITE | FMODE_ATOMIC_POS); 1577 filp->f_mode |= FMODE_STREAM; 1578 return 0; 1579 } 1580 1581 EXPORT_SYMBOL(stream_open); 1582