xref: /linux/fs/open.c (revision 520615e1f5b2e617845238c650b58b43592fa923)
1  // SPDX-License-Identifier: GPL-2.0-only
2  /*
3   *  linux/fs/open.c
4   *
5   *  Copyright (C) 1991, 1992  Linus Torvalds
6   */
7  
8  #include <linux/string.h>
9  #include <linux/mm.h>
10  #include <linux/file.h>
11  #include <linux/fdtable.h>
12  #include <linux/fsnotify.h>
13  #include <linux/module.h>
14  #include <linux/tty.h>
15  #include <linux/namei.h>
16  #include <linux/backing-dev.h>
17  #include <linux/capability.h>
18  #include <linux/securebits.h>
19  #include <linux/security.h>
20  #include <linux/mount.h>
21  #include <linux/fcntl.h>
22  #include <linux/slab.h>
23  #include <linux/uaccess.h>
24  #include <linux/fs.h>
25  #include <linux/personality.h>
26  #include <linux/pagemap.h>
27  #include <linux/syscalls.h>
28  #include <linux/rcupdate.h>
29  #include <linux/audit.h>
30  #include <linux/falloc.h>
31  #include <linux/fs_struct.h>
32  #include <linux/ima.h>
33  #include <linux/dnotify.h>
34  #include <linux/compat.h>
35  
36  #include "internal.h"
37  
38  int do_truncate(struct dentry *dentry, loff_t length, unsigned int time_attrs,
39  	struct file *filp)
40  {
41  	int ret;
42  	struct iattr newattrs;
43  
44  	/* Not pretty: "inode->i_size" shouldn't really be signed. But it is. */
45  	if (length < 0)
46  		return -EINVAL;
47  
48  	newattrs.ia_size = length;
49  	newattrs.ia_valid = ATTR_SIZE | time_attrs;
50  	if (filp) {
51  		newattrs.ia_file = filp;
52  		newattrs.ia_valid |= ATTR_FILE;
53  	}
54  
55  	/* Remove suid, sgid, and file capabilities on truncate too */
56  	ret = dentry_needs_remove_privs(dentry);
57  	if (ret < 0)
58  		return ret;
59  	if (ret)
60  		newattrs.ia_valid |= ret | ATTR_FORCE;
61  
62  	inode_lock(dentry->d_inode);
63  	/* Note any delegations or leases have already been broken: */
64  	ret = notify_change(dentry, &newattrs, NULL);
65  	inode_unlock(dentry->d_inode);
66  	return ret;
67  }
68  
69  long vfs_truncate(const struct path *path, loff_t length)
70  {
71  	struct inode *inode;
72  	long error;
73  
74  	inode = path->dentry->d_inode;
75  
76  	/* For directories it's -EISDIR, for other non-regulars - -EINVAL */
77  	if (S_ISDIR(inode->i_mode))
78  		return -EISDIR;
79  	if (!S_ISREG(inode->i_mode))
80  		return -EINVAL;
81  
82  	error = mnt_want_write(path->mnt);
83  	if (error)
84  		goto out;
85  
86  	error = inode_permission(inode, MAY_WRITE);
87  	if (error)
88  		goto mnt_drop_write_and_out;
89  
90  	error = -EPERM;
91  	if (IS_APPEND(inode))
92  		goto mnt_drop_write_and_out;
93  
94  	error = get_write_access(inode);
95  	if (error)
96  		goto mnt_drop_write_and_out;
97  
98  	/*
99  	 * Make sure that there are no leases.  get_write_access() protects
100  	 * against the truncate racing with a lease-granting setlease().
101  	 */
102  	error = break_lease(inode, O_WRONLY);
103  	if (error)
104  		goto put_write_and_out;
105  
106  	error = locks_verify_truncate(inode, NULL, length);
107  	if (!error)
108  		error = security_path_truncate(path);
109  	if (!error)
110  		error = do_truncate(path->dentry, length, 0, NULL);
111  
112  put_write_and_out:
113  	put_write_access(inode);
114  mnt_drop_write_and_out:
115  	mnt_drop_write(path->mnt);
116  out:
117  	return error;
118  }
119  EXPORT_SYMBOL_GPL(vfs_truncate);
120  
121  long do_sys_truncate(const char __user *pathname, loff_t length)
122  {
123  	unsigned int lookup_flags = LOOKUP_FOLLOW;
124  	struct path path;
125  	int error;
126  
127  	if (length < 0)	/* sorry, but loff_t says... */
128  		return -EINVAL;
129  
130  retry:
131  	error = user_path_at(AT_FDCWD, pathname, lookup_flags, &path);
132  	if (!error) {
133  		error = vfs_truncate(&path, length);
134  		path_put(&path);
135  	}
136  	if (retry_estale(error, lookup_flags)) {
137  		lookup_flags |= LOOKUP_REVAL;
138  		goto retry;
139  	}
140  	return error;
141  }
142  
143  SYSCALL_DEFINE2(truncate, const char __user *, path, long, length)
144  {
145  	return do_sys_truncate(path, length);
146  }
147  
#ifdef CONFIG_COMPAT
/* truncate(2) for compat tasks: @length arrives as a compat_off_t. */
COMPAT_SYSCALL_DEFINE2(truncate, const char __user *, path, compat_off_t, length)
{
	loff_t new_size = length;

	return do_sys_truncate(path, new_size);
}
#endif
154  
155  long do_sys_ftruncate(unsigned int fd, loff_t length, int small)
156  {
157  	struct inode *inode;
158  	struct dentry *dentry;
159  	struct fd f;
160  	int error;
161  
162  	error = -EINVAL;
163  	if (length < 0)
164  		goto out;
165  	error = -EBADF;
166  	f = fdget(fd);
167  	if (!f.file)
168  		goto out;
169  
170  	/* explicitly opened as large or we are on 64-bit box */
171  	if (f.file->f_flags & O_LARGEFILE)
172  		small = 0;
173  
174  	dentry = f.file->f_path.dentry;
175  	inode = dentry->d_inode;
176  	error = -EINVAL;
177  	if (!S_ISREG(inode->i_mode) || !(f.file->f_mode & FMODE_WRITE))
178  		goto out_putf;
179  
180  	error = -EINVAL;
181  	/* Cannot ftruncate over 2^31 bytes without large file support */
182  	if (small && length > MAX_NON_LFS)
183  		goto out_putf;
184  
185  	error = -EPERM;
186  	/* Check IS_APPEND on real upper inode */
187  	if (IS_APPEND(file_inode(f.file)))
188  		goto out_putf;
189  
190  	sb_start_write(inode->i_sb);
191  	error = locks_verify_truncate(inode, f.file, length);
192  	if (!error)
193  		error = security_path_truncate(&f.file->f_path);
194  	if (!error)
195  		error = do_truncate(dentry, length, ATTR_MTIME|ATTR_CTIME, f.file);
196  	sb_end_write(inode->i_sb);
197  out_putf:
198  	fdput(f);
199  out:
200  	return error;
201  }
202  
203  SYSCALL_DEFINE2(ftruncate, unsigned int, fd, unsigned long, length)
204  {
205  	return do_sys_ftruncate(fd, length, 1);
206  }
207  
#ifdef CONFIG_COMPAT
/* ftruncate(2) for compat tasks; the "small" limit applies here too. */
COMPAT_SYSCALL_DEFINE2(ftruncate, unsigned int, fd, compat_ulong_t, length)
{
	const int small = 1;

	return do_sys_ftruncate(fd, length, small);
}
#endif
214  
/* Only 32-bit machines need separate LFS variants of the truncate calls. */
#if BITS_PER_LONG == 32
/* truncate64(2): @length is already a full loff_t. */
SYSCALL_DEFINE2(truncate64, const char __user *, path, loff_t, length)
{
	return do_sys_truncate(path, length);
}

/* ftruncate64(2): large-file aware, so the "small" limit is switched off. */
SYSCALL_DEFINE2(ftruncate64, unsigned int, fd, loff_t, length)
{
	const int small = 0;

	return do_sys_ftruncate(fd, length, small);
}
#endif /* BITS_PER_LONG == 32 */
227  
228  
229  int vfs_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
230  {
231  	struct inode *inode = file_inode(file);
232  	long ret;
233  
234  	if (offset < 0 || len <= 0)
235  		return -EINVAL;
236  
237  	/* Return error if mode is not supported */
238  	if (mode & ~FALLOC_FL_SUPPORTED_MASK)
239  		return -EOPNOTSUPP;
240  
241  	/* Punch hole and zero range are mutually exclusive */
242  	if ((mode & (FALLOC_FL_PUNCH_HOLE | FALLOC_FL_ZERO_RANGE)) ==
243  	    (FALLOC_FL_PUNCH_HOLE | FALLOC_FL_ZERO_RANGE))
244  		return -EOPNOTSUPP;
245  
246  	/* Punch hole must have keep size set */
247  	if ((mode & FALLOC_FL_PUNCH_HOLE) &&
248  	    !(mode & FALLOC_FL_KEEP_SIZE))
249  		return -EOPNOTSUPP;
250  
251  	/* Collapse range should only be used exclusively. */
252  	if ((mode & FALLOC_FL_COLLAPSE_RANGE) &&
253  	    (mode & ~FALLOC_FL_COLLAPSE_RANGE))
254  		return -EINVAL;
255  
256  	/* Insert range should only be used exclusively. */
257  	if ((mode & FALLOC_FL_INSERT_RANGE) &&
258  	    (mode & ~FALLOC_FL_INSERT_RANGE))
259  		return -EINVAL;
260  
261  	/* Unshare range should only be used with allocate mode. */
262  	if ((mode & FALLOC_FL_UNSHARE_RANGE) &&
263  	    (mode & ~(FALLOC_FL_UNSHARE_RANGE | FALLOC_FL_KEEP_SIZE)))
264  		return -EINVAL;
265  
266  	if (!(file->f_mode & FMODE_WRITE))
267  		return -EBADF;
268  
269  	/*
270  	 * We can only allow pure fallocate on append only files
271  	 */
272  	if ((mode & ~FALLOC_FL_KEEP_SIZE) && IS_APPEND(inode))
273  		return -EPERM;
274  
275  	if (IS_IMMUTABLE(inode))
276  		return -EPERM;
277  
278  	/*
279  	 * We cannot allow any fallocate operation on an active swapfile
280  	 */
281  	if (IS_SWAPFILE(inode))
282  		return -ETXTBSY;
283  
284  	/*
285  	 * Revalidate the write permissions, in case security policy has
286  	 * changed since the files were opened.
287  	 */
288  	ret = security_file_permission(file, MAY_WRITE);
289  	if (ret)
290  		return ret;
291  
292  	if (S_ISFIFO(inode->i_mode))
293  		return -ESPIPE;
294  
295  	if (S_ISDIR(inode->i_mode))
296  		return -EISDIR;
297  
298  	if (!S_ISREG(inode->i_mode) && !S_ISBLK(inode->i_mode))
299  		return -ENODEV;
300  
301  	/* Check for wrap through zero too */
302  	if (((offset + len) > inode->i_sb->s_maxbytes) || ((offset + len) < 0))
303  		return -EFBIG;
304  
305  	if (!file->f_op->fallocate)
306  		return -EOPNOTSUPP;
307  
308  	file_start_write(file);
309  	ret = file->f_op->fallocate(file, mode, offset, len);
310  
311  	/*
312  	 * Create inotify and fanotify events.
313  	 *
314  	 * To keep the logic simple always create events if fallocate succeeds.
315  	 * This implies that events are even created if the file size remains
316  	 * unchanged, e.g. when using flag FALLOC_FL_KEEP_SIZE.
317  	 */
318  	if (ret == 0)
319  		fsnotify_modify(file);
320  
321  	file_end_write(file);
322  	return ret;
323  }
324  EXPORT_SYMBOL_GPL(vfs_fallocate);
325  
326  int ksys_fallocate(int fd, int mode, loff_t offset, loff_t len)
327  {
328  	struct fd f = fdget(fd);
329  	int error = -EBADF;
330  
331  	if (f.file) {
332  		error = vfs_fallocate(f.file, mode, offset, len);
333  		fdput(f);
334  	}
335  	return error;
336  }
337  
338  SYSCALL_DEFINE4(fallocate, int, fd, int, mode, loff_t, offset, loff_t, len)
339  {
340  	return ksys_fallocate(fd, mode, offset, len);
341  }
342  
343  /*
344   * access() needs to use the real uid/gid, not the effective uid/gid.
345   * We do this by temporarily clearing all FS-related capabilities and
346   * switching the fsuid/fsgid around to the real ones.
347   */
348  static const struct cred *access_override_creds(void)
349  {
350  	const struct cred *old_cred;
351  	struct cred *override_cred;
352  
353  	override_cred = prepare_creds();
354  	if (!override_cred)
355  		return NULL;
356  
357  	override_cred->fsuid = override_cred->uid;
358  	override_cred->fsgid = override_cred->gid;
359  
360  	if (!issecure(SECURE_NO_SETUID_FIXUP)) {
361  		/* Clear the capabilities if we switch to a non-root user */
362  		kuid_t root_uid = make_kuid(override_cred->user_ns, 0);
363  		if (!uid_eq(override_cred->uid, root_uid))
364  			cap_clear(override_cred->cap_effective);
365  		else
366  			override_cred->cap_effective =
367  				override_cred->cap_permitted;
368  	}
369  
370  	/*
371  	 * The new set of credentials can *only* be used in
372  	 * task-synchronous circumstances, and does not need
373  	 * RCU freeing, unless somebody then takes a separate
374  	 * reference to it.
375  	 *
376  	 * NOTE! This is _only_ true because this credential
377  	 * is used purely for override_creds() that installs
378  	 * it as the subjective cred. Other threads will be
379  	 * accessing ->real_cred, not the subjective cred.
380  	 *
381  	 * If somebody _does_ make a copy of this (using the
382  	 * 'get_current_cred()' function), that will clear the
383  	 * non_rcu field, because now that other user may be
384  	 * expecting RCU freeing. But normal thread-synchronous
385  	 * cred accesses will keep things non-RCY.
386  	 */
387  	override_cred->non_rcu = 1;
388  
389  	old_cred = override_creds(override_cred);
390  
391  	/* override_cred() gets its own ref */
392  	put_cred(override_cred);
393  
394  	return old_cred;
395  }
396  
397  long do_faccessat(int dfd, const char __user *filename, int mode, int flags)
398  {
399  	struct path path;
400  	struct inode *inode;
401  	int res;
402  	unsigned int lookup_flags = LOOKUP_FOLLOW;
403  	const struct cred *old_cred = NULL;
404  
405  	if (mode & ~S_IRWXO)	/* where's F_OK, X_OK, W_OK, R_OK? */
406  		return -EINVAL;
407  
408  	if (flags & ~(AT_EACCESS | AT_SYMLINK_NOFOLLOW | AT_EMPTY_PATH))
409  		return -EINVAL;
410  
411  	if (flags & AT_SYMLINK_NOFOLLOW)
412  		lookup_flags &= ~LOOKUP_FOLLOW;
413  	if (flags & AT_EMPTY_PATH)
414  		lookup_flags |= LOOKUP_EMPTY;
415  
416  	if (!(flags & AT_EACCESS)) {
417  		old_cred = access_override_creds();
418  		if (!old_cred)
419  			return -ENOMEM;
420  	}
421  
422  retry:
423  	res = user_path_at(dfd, filename, lookup_flags, &path);
424  	if (res)
425  		goto out;
426  
427  	inode = d_backing_inode(path.dentry);
428  
429  	if ((mode & MAY_EXEC) && S_ISREG(inode->i_mode)) {
430  		/*
431  		 * MAY_EXEC on regular files is denied if the fs is mounted
432  		 * with the "noexec" flag.
433  		 */
434  		res = -EACCES;
435  		if (path_noexec(&path))
436  			goto out_path_release;
437  	}
438  
439  	res = inode_permission(inode, mode | MAY_ACCESS);
440  	/* SuS v2 requires we report a read only fs too */
441  	if (res || !(mode & S_IWOTH) || special_file(inode->i_mode))
442  		goto out_path_release;
443  	/*
444  	 * This is a rare case where using __mnt_is_readonly()
445  	 * is OK without a mnt_want/drop_write() pair.  Since
446  	 * no actual write to the fs is performed here, we do
447  	 * not need to telegraph to that to anyone.
448  	 *
449  	 * By doing this, we accept that this access is
450  	 * inherently racy and know that the fs may change
451  	 * state before we even see this result.
452  	 */
453  	if (__mnt_is_readonly(path.mnt))
454  		res = -EROFS;
455  
456  out_path_release:
457  	path_put(&path);
458  	if (retry_estale(res, lookup_flags)) {
459  		lookup_flags |= LOOKUP_REVAL;
460  		goto retry;
461  	}
462  out:
463  	if (old_cred)
464  		revert_creds(old_cred);
465  
466  	return res;
467  }
468  
469  SYSCALL_DEFINE3(faccessat, int, dfd, const char __user *, filename, int, mode)
470  {
471  	return do_faccessat(dfd, filename, mode, 0);
472  }
473  
474  SYSCALL_DEFINE4(faccessat2, int, dfd, const char __user *, filename, int, mode,
475  		int, flags)
476  {
477  	return do_faccessat(dfd, filename, mode, flags);
478  }
479  
480  SYSCALL_DEFINE2(access, const char __user *, filename, int, mode)
481  {
482  	return do_faccessat(AT_FDCWD, filename, mode, 0);
483  }
484  
485  int ksys_chdir(const char __user *filename)
486  {
487  	struct path path;
488  	int error;
489  	unsigned int lookup_flags = LOOKUP_FOLLOW | LOOKUP_DIRECTORY;
490  retry:
491  	error = user_path_at(AT_FDCWD, filename, lookup_flags, &path);
492  	if (error)
493  		goto out;
494  
495  	error = inode_permission(path.dentry->d_inode, MAY_EXEC | MAY_CHDIR);
496  	if (error)
497  		goto dput_and_out;
498  
499  	set_fs_pwd(current->fs, &path);
500  
501  dput_and_out:
502  	path_put(&path);
503  	if (retry_estale(error, lookup_flags)) {
504  		lookup_flags |= LOOKUP_REVAL;
505  		goto retry;
506  	}
507  out:
508  	return error;
509  }
510  
511  SYSCALL_DEFINE1(chdir, const char __user *, filename)
512  {
513  	return ksys_chdir(filename);
514  }
515  
516  SYSCALL_DEFINE1(fchdir, unsigned int, fd)
517  {
518  	struct fd f = fdget_raw(fd);
519  	int error;
520  
521  	error = -EBADF;
522  	if (!f.file)
523  		goto out;
524  
525  	error = -ENOTDIR;
526  	if (!d_can_lookup(f.file->f_path.dentry))
527  		goto out_putf;
528  
529  	error = inode_permission(file_inode(f.file), MAY_EXEC | MAY_CHDIR);
530  	if (!error)
531  		set_fs_pwd(current->fs, &f.file->f_path);
532  out_putf:
533  	fdput(f);
534  out:
535  	return error;
536  }
537  
538  int ksys_chroot(const char __user *filename)
539  {
540  	struct path path;
541  	int error;
542  	unsigned int lookup_flags = LOOKUP_FOLLOW | LOOKUP_DIRECTORY;
543  retry:
544  	error = user_path_at(AT_FDCWD, filename, lookup_flags, &path);
545  	if (error)
546  		goto out;
547  
548  	error = inode_permission(path.dentry->d_inode, MAY_EXEC | MAY_CHDIR);
549  	if (error)
550  		goto dput_and_out;
551  
552  	error = -EPERM;
553  	if (!ns_capable(current_user_ns(), CAP_SYS_CHROOT))
554  		goto dput_and_out;
555  	error = security_path_chroot(&path);
556  	if (error)
557  		goto dput_and_out;
558  
559  	set_fs_root(current->fs, &path);
560  	error = 0;
561  dput_and_out:
562  	path_put(&path);
563  	if (retry_estale(error, lookup_flags)) {
564  		lookup_flags |= LOOKUP_REVAL;
565  		goto retry;
566  	}
567  out:
568  	return error;
569  }
570  
571  SYSCALL_DEFINE1(chroot, const char __user *, filename)
572  {
573  	return ksys_chroot(filename);
574  }
575  
576  static int chmod_common(const struct path *path, umode_t mode)
577  {
578  	struct inode *inode = path->dentry->d_inode;
579  	struct inode *delegated_inode = NULL;
580  	struct iattr newattrs;
581  	int error;
582  
583  	error = mnt_want_write(path->mnt);
584  	if (error)
585  		return error;
586  retry_deleg:
587  	inode_lock(inode);
588  	error = security_path_chmod(path, mode);
589  	if (error)
590  		goto out_unlock;
591  	newattrs.ia_mode = (mode & S_IALLUGO) | (inode->i_mode & ~S_IALLUGO);
592  	newattrs.ia_valid = ATTR_MODE | ATTR_CTIME;
593  	error = notify_change(path->dentry, &newattrs, &delegated_inode);
594  out_unlock:
595  	inode_unlock(inode);
596  	if (delegated_inode) {
597  		error = break_deleg_wait(&delegated_inode);
598  		if (!error)
599  			goto retry_deleg;
600  	}
601  	mnt_drop_write(path->mnt);
602  	return error;
603  }
604  
605  int ksys_fchmod(unsigned int fd, umode_t mode)
606  {
607  	struct fd f = fdget(fd);
608  	int err = -EBADF;
609  
610  	if (f.file) {
611  		audit_file(f.file);
612  		err = chmod_common(&f.file->f_path, mode);
613  		fdput(f);
614  	}
615  	return err;
616  }
617  
618  SYSCALL_DEFINE2(fchmod, unsigned int, fd, umode_t, mode)
619  {
620  	return ksys_fchmod(fd, mode);
621  }
622  
623  int do_fchmodat(int dfd, const char __user *filename, umode_t mode)
624  {
625  	struct path path;
626  	int error;
627  	unsigned int lookup_flags = LOOKUP_FOLLOW;
628  retry:
629  	error = user_path_at(dfd, filename, lookup_flags, &path);
630  	if (!error) {
631  		error = chmod_common(&path, mode);
632  		path_put(&path);
633  		if (retry_estale(error, lookup_flags)) {
634  			lookup_flags |= LOOKUP_REVAL;
635  			goto retry;
636  		}
637  	}
638  	return error;
639  }
640  
641  SYSCALL_DEFINE3(fchmodat, int, dfd, const char __user *, filename,
642  		umode_t, mode)
643  {
644  	return do_fchmodat(dfd, filename, mode);
645  }
646  
647  SYSCALL_DEFINE2(chmod, const char __user *, filename, umode_t, mode)
648  {
649  	return do_fchmodat(AT_FDCWD, filename, mode);
650  }
651  
652  static int chown_common(const struct path *path, uid_t user, gid_t group)
653  {
654  	struct inode *inode = path->dentry->d_inode;
655  	struct inode *delegated_inode = NULL;
656  	int error;
657  	struct iattr newattrs;
658  	kuid_t uid;
659  	kgid_t gid;
660  
661  	uid = make_kuid(current_user_ns(), user);
662  	gid = make_kgid(current_user_ns(), group);
663  
664  retry_deleg:
665  	newattrs.ia_valid =  ATTR_CTIME;
666  	if (user != (uid_t) -1) {
667  		if (!uid_valid(uid))
668  			return -EINVAL;
669  		newattrs.ia_valid |= ATTR_UID;
670  		newattrs.ia_uid = uid;
671  	}
672  	if (group != (gid_t) -1) {
673  		if (!gid_valid(gid))
674  			return -EINVAL;
675  		newattrs.ia_valid |= ATTR_GID;
676  		newattrs.ia_gid = gid;
677  	}
678  	if (!S_ISDIR(inode->i_mode))
679  		newattrs.ia_valid |=
680  			ATTR_KILL_SUID | ATTR_KILL_SGID | ATTR_KILL_PRIV;
681  	inode_lock(inode);
682  	error = security_path_chown(path, uid, gid);
683  	if (!error)
684  		error = notify_change(path->dentry, &newattrs, &delegated_inode);
685  	inode_unlock(inode);
686  	if (delegated_inode) {
687  		error = break_deleg_wait(&delegated_inode);
688  		if (!error)
689  			goto retry_deleg;
690  	}
691  	return error;
692  }
693  
694  int do_fchownat(int dfd, const char __user *filename, uid_t user, gid_t group,
695  		int flag)
696  {
697  	struct path path;
698  	int error = -EINVAL;
699  	int lookup_flags;
700  
701  	if ((flag & ~(AT_SYMLINK_NOFOLLOW | AT_EMPTY_PATH)) != 0)
702  		goto out;
703  
704  	lookup_flags = (flag & AT_SYMLINK_NOFOLLOW) ? 0 : LOOKUP_FOLLOW;
705  	if (flag & AT_EMPTY_PATH)
706  		lookup_flags |= LOOKUP_EMPTY;
707  retry:
708  	error = user_path_at(dfd, filename, lookup_flags, &path);
709  	if (error)
710  		goto out;
711  	error = mnt_want_write(path.mnt);
712  	if (error)
713  		goto out_release;
714  	error = chown_common(&path, user, group);
715  	mnt_drop_write(path.mnt);
716  out_release:
717  	path_put(&path);
718  	if (retry_estale(error, lookup_flags)) {
719  		lookup_flags |= LOOKUP_REVAL;
720  		goto retry;
721  	}
722  out:
723  	return error;
724  }
725  
726  SYSCALL_DEFINE5(fchownat, int, dfd, const char __user *, filename, uid_t, user,
727  		gid_t, group, int, flag)
728  {
729  	return do_fchownat(dfd, filename, user, group, flag);
730  }
731  
732  SYSCALL_DEFINE3(chown, const char __user *, filename, uid_t, user, gid_t, group)
733  {
734  	return do_fchownat(AT_FDCWD, filename, user, group, 0);
735  }
736  
737  SYSCALL_DEFINE3(lchown, const char __user *, filename, uid_t, user, gid_t, group)
738  {
739  	return do_fchownat(AT_FDCWD, filename, user, group,
740  			   AT_SYMLINK_NOFOLLOW);
741  }
742  
743  int ksys_fchown(unsigned int fd, uid_t user, gid_t group)
744  {
745  	struct fd f = fdget(fd);
746  	int error = -EBADF;
747  
748  	if (!f.file)
749  		goto out;
750  
751  	error = mnt_want_write_file(f.file);
752  	if (error)
753  		goto out_fput;
754  	audit_file(f.file);
755  	error = chown_common(&f.file->f_path, user, group);
756  	mnt_drop_write_file(f.file);
757  out_fput:
758  	fdput(f);
759  out:
760  	return error;
761  }
762  
763  SYSCALL_DEFINE3(fchown, unsigned int, fd, uid_t, user, gid_t, group)
764  {
765  	return ksys_fchown(fd, user, group);
766  }
767  
768  static int do_dentry_open(struct file *f,
769  			  struct inode *inode,
770  			  int (*open)(struct inode *, struct file *))
771  {
772  	static const struct file_operations empty_fops = {};
773  	int error;
774  
775  	path_get(&f->f_path);
776  	f->f_inode = inode;
777  	f->f_mapping = inode->i_mapping;
778  	f->f_wb_err = filemap_sample_wb_err(f->f_mapping);
779  	f->f_sb_err = file_sample_sb_err(f);
780  
781  	if (unlikely(f->f_flags & O_PATH)) {
782  		f->f_mode = FMODE_PATH | FMODE_OPENED;
783  		f->f_op = &empty_fops;
784  		return 0;
785  	}
786  
787  	/* Any file opened for execve()/uselib() has to be a regular file. */
788  	if (unlikely(f->f_flags & FMODE_EXEC && !S_ISREG(inode->i_mode))) {
789  		error = -EACCES;
790  		goto cleanup_file;
791  	}
792  
793  	if (f->f_mode & FMODE_WRITE && !special_file(inode->i_mode)) {
794  		error = get_write_access(inode);
795  		if (unlikely(error))
796  			goto cleanup_file;
797  		error = __mnt_want_write(f->f_path.mnt);
798  		if (unlikely(error)) {
799  			put_write_access(inode);
800  			goto cleanup_file;
801  		}
802  		f->f_mode |= FMODE_WRITER;
803  	}
804  
805  	/* POSIX.1-2008/SUSv4 Section XSI 2.9.7 */
806  	if (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode))
807  		f->f_mode |= FMODE_ATOMIC_POS;
808  
809  	f->f_op = fops_get(inode->i_fop);
810  	if (WARN_ON(!f->f_op)) {
811  		error = -ENODEV;
812  		goto cleanup_all;
813  	}
814  
815  	error = security_file_open(f);
816  	if (error)
817  		goto cleanup_all;
818  
819  	error = break_lease(locks_inode(f), f->f_flags);
820  	if (error)
821  		goto cleanup_all;
822  
823  	/* normally all 3 are set; ->open() can clear them if needed */
824  	f->f_mode |= FMODE_LSEEK | FMODE_PREAD | FMODE_PWRITE;
825  	if (!open)
826  		open = f->f_op->open;
827  	if (open) {
828  		error = open(inode, f);
829  		if (error)
830  			goto cleanup_all;
831  	}
832  	f->f_mode |= FMODE_OPENED;
833  	if ((f->f_mode & (FMODE_READ | FMODE_WRITE)) == FMODE_READ)
834  		i_readcount_inc(inode);
835  	if ((f->f_mode & FMODE_READ) &&
836  	     likely(f->f_op->read || f->f_op->read_iter))
837  		f->f_mode |= FMODE_CAN_READ;
838  	if ((f->f_mode & FMODE_WRITE) &&
839  	     likely(f->f_op->write || f->f_op->write_iter))
840  		f->f_mode |= FMODE_CAN_WRITE;
841  
842  	f->f_write_hint = WRITE_LIFE_NOT_SET;
843  	f->f_flags &= ~(O_CREAT | O_EXCL | O_NOCTTY | O_TRUNC);
844  
845  	file_ra_state_init(&f->f_ra, f->f_mapping->host->i_mapping);
846  
847  	/* NB: we're sure to have correct a_ops only after f_op->open */
848  	if (f->f_flags & O_DIRECT) {
849  		if (!f->f_mapping->a_ops || !f->f_mapping->a_ops->direct_IO)
850  			return -EINVAL;
851  	}
852  
853  	/*
854  	 * XXX: Huge page cache doesn't support writing yet. Drop all page
855  	 * cache for this file before processing writes.
856  	 */
857  	if ((f->f_mode & FMODE_WRITE) && filemap_nr_thps(inode->i_mapping))
858  		truncate_pagecache(inode, 0);
859  
860  	return 0;
861  
862  cleanup_all:
863  	if (WARN_ON_ONCE(error > 0))
864  		error = -EINVAL;
865  	fops_put(f->f_op);
866  	if (f->f_mode & FMODE_WRITER) {
867  		put_write_access(inode);
868  		__mnt_drop_write(f->f_path.mnt);
869  	}
870  cleanup_file:
871  	path_put(&f->f_path);
872  	f->f_path.mnt = NULL;
873  	f->f_path.dentry = NULL;
874  	f->f_inode = NULL;
875  	return error;
876  }
877  
878  /**
879   * finish_open - finish opening a file
880   * @file: file pointer
881   * @dentry: pointer to dentry
882   * @open: open callback
883   * @opened: state of open
884   *
885   * This can be used to finish opening a file passed to i_op->atomic_open().
886   *
887   * If the open callback is set to NULL, then the standard f_op->open()
888   * filesystem callback is substituted.
889   *
890   * NB: the dentry reference is _not_ consumed.  If, for example, the dentry is
891   * the return value of d_splice_alias(), then the caller needs to perform dput()
892   * on it after finish_open().
893   *
894   * Returns zero on success or -errno if the open failed.
895   */
896  int finish_open(struct file *file, struct dentry *dentry,
897  		int (*open)(struct inode *, struct file *))
898  {
899  	BUG_ON(file->f_mode & FMODE_OPENED); /* once it's opened, it's opened */
900  
901  	file->f_path.dentry = dentry;
902  	return do_dentry_open(file, d_backing_inode(dentry), open);
903  }
904  EXPORT_SYMBOL(finish_open);
905  
906  /**
907   * finish_no_open - finish ->atomic_open() without opening the file
908   *
909   * @file: file pointer
910   * @dentry: dentry or NULL (as returned from ->lookup())
911   *
912   * This can be used to set the result of a successful lookup in ->atomic_open().
913   *
914   * NB: unlike finish_open() this function does consume the dentry reference and
915   * the caller need not dput() it.
916   *
917   * Returns "0" which must be the return value of ->atomic_open() after having
918   * called this function.
919   */
920  int finish_no_open(struct file *file, struct dentry *dentry)
921  {
922  	file->f_path.dentry = dentry;
923  	return 0;
924  }
925  EXPORT_SYMBOL(finish_no_open);
926  
927  char *file_path(struct file *filp, char *buf, int buflen)
928  {
929  	return d_path(&filp->f_path, buf, buflen);
930  }
931  EXPORT_SYMBOL(file_path);
932  
933  /**
934   * vfs_open - open the file at the given path
935   * @path: path to open
936   * @file: newly allocated file with f_flag initialized
937   * @cred: credentials to use
938   */
939  int vfs_open(const struct path *path, struct file *file)
940  {
941  	file->f_path = *path;
942  	return do_dentry_open(file, d_backing_inode(path->dentry), NULL);
943  }
944  
945  struct file *dentry_open(const struct path *path, int flags,
946  			 const struct cred *cred)
947  {
948  	int error;
949  	struct file *f;
950  
951  	validate_creds(cred);
952  
953  	/* We must always pass in a valid mount pointer. */
954  	BUG_ON(!path->mnt);
955  
956  	f = alloc_empty_file(flags, cred);
957  	if (!IS_ERR(f)) {
958  		error = vfs_open(path, f);
959  		if (error) {
960  			fput(f);
961  			f = ERR_PTR(error);
962  		}
963  	}
964  	return f;
965  }
966  EXPORT_SYMBOL(dentry_open);
967  
968  struct file *open_with_fake_path(const struct path *path, int flags,
969  				struct inode *inode, const struct cred *cred)
970  {
971  	struct file *f = alloc_empty_file_noaccount(flags, cred);
972  	if (!IS_ERR(f)) {
973  		int error;
974  
975  		f->f_path = *path;
976  		error = do_dentry_open(f, inode, NULL);
977  		if (error) {
978  			fput(f);
979  			f = ERR_PTR(error);
980  		}
981  	}
982  	return f;
983  }
984  EXPORT_SYMBOL(open_with_fake_path);
985  
/*
 * WILL_CREATE() is non-zero when @flags request creation of a file, i.e.
 * contain O_CREAT or __O_TMPFILE.  The argument is parenthesized so that
 * compound expressions such as WILL_CREATE(a | b) are masked as a whole.
 */
#define WILL_CREATE(flags)	((flags) & (O_CREAT | __O_TMPFILE))
/* The only flags that may accompany O_PATH. */
#define O_PATH_FLAGS		(O_DIRECTORY | O_NOFOLLOW | O_PATH | O_CLOEXEC)
988  
989  inline struct open_how build_open_how(int flags, umode_t mode)
990  {
991  	struct open_how how = {
992  		.flags = flags & VALID_OPEN_FLAGS,
993  		.mode = mode & S_IALLUGO,
994  	};
995  
996  	/* O_PATH beats everything else. */
997  	if (how.flags & O_PATH)
998  		how.flags &= O_PATH_FLAGS;
999  	/* Modes should only be set for create-like flags. */
1000  	if (!WILL_CREATE(how.flags))
1001  		how.mode = 0;
1002  	return how;
1003  }
1004  
1005  inline int build_open_flags(const struct open_how *how, struct open_flags *op)
1006  {
1007  	int flags = how->flags;
1008  	int lookup_flags = 0;
1009  	int acc_mode = ACC_MODE(flags);
1010  
1011  	/* Must never be set by userspace */
1012  	flags &= ~(FMODE_NONOTIFY | O_CLOEXEC);
1013  
1014  	/*
1015  	 * Older syscalls implicitly clear all of the invalid flags or argument
1016  	 * values before calling build_open_flags(), but openat2(2) checks all
1017  	 * of its arguments.
1018  	 */
1019  	if (flags & ~VALID_OPEN_FLAGS)
1020  		return -EINVAL;
1021  	if (how->resolve & ~VALID_RESOLVE_FLAGS)
1022  		return -EINVAL;
1023  
1024  	/* Deal with the mode. */
1025  	if (WILL_CREATE(flags)) {
1026  		if (how->mode & ~S_IALLUGO)
1027  			return -EINVAL;
1028  		op->mode = how->mode | S_IFREG;
1029  	} else {
1030  		if (how->mode != 0)
1031  			return -EINVAL;
1032  		op->mode = 0;
1033  	}
1034  
1035  	/*
1036  	 * In order to ensure programs get explicit errors when trying to use
1037  	 * O_TMPFILE on old kernels, O_TMPFILE is implemented such that it
1038  	 * looks like (O_DIRECTORY|O_RDWR & ~O_CREAT) to old kernels. But we
1039  	 * have to require userspace to explicitly set it.
1040  	 */
1041  	if (flags & __O_TMPFILE) {
1042  		if ((flags & O_TMPFILE_MASK) != O_TMPFILE)
1043  			return -EINVAL;
1044  		if (!(acc_mode & MAY_WRITE))
1045  			return -EINVAL;
1046  	}
1047  	if (flags & O_PATH) {
1048  		/* O_PATH only permits certain other flags to be set. */
1049  		if (flags & ~O_PATH_FLAGS)
1050  			return -EINVAL;
1051  		acc_mode = 0;
1052  	}
1053  
1054  	/*
1055  	 * O_SYNC is implemented as __O_SYNC|O_DSYNC.  As many places only
1056  	 * check for O_DSYNC if the need any syncing at all we enforce it's
1057  	 * always set instead of having to deal with possibly weird behaviour
1058  	 * for malicious applications setting only __O_SYNC.
1059  	 */
1060  	if (flags & __O_SYNC)
1061  		flags |= O_DSYNC;
1062  
1063  	op->open_flag = flags;
1064  
1065  	/* O_TRUNC implies we need access checks for write permissions */
1066  	if (flags & O_TRUNC)
1067  		acc_mode |= MAY_WRITE;
1068  
1069  	/* Allow the LSM permission hook to distinguish append
1070  	   access from general write access. */
1071  	if (flags & O_APPEND)
1072  		acc_mode |= MAY_APPEND;
1073  
1074  	op->acc_mode = acc_mode;
1075  
1076  	op->intent = flags & O_PATH ? 0 : LOOKUP_OPEN;
1077  
1078  	if (flags & O_CREAT) {
1079  		op->intent |= LOOKUP_CREATE;
1080  		if (flags & O_EXCL) {
1081  			op->intent |= LOOKUP_EXCL;
1082  			flags |= O_NOFOLLOW;
1083  		}
1084  	}
1085  
1086  	if (flags & O_DIRECTORY)
1087  		lookup_flags |= LOOKUP_DIRECTORY;
1088  	if (!(flags & O_NOFOLLOW))
1089  		lookup_flags |= LOOKUP_FOLLOW;
1090  
1091  	if (how->resolve & RESOLVE_NO_XDEV)
1092  		lookup_flags |= LOOKUP_NO_XDEV;
1093  	if (how->resolve & RESOLVE_NO_MAGICLINKS)
1094  		lookup_flags |= LOOKUP_NO_MAGICLINKS;
1095  	if (how->resolve & RESOLVE_NO_SYMLINKS)
1096  		lookup_flags |= LOOKUP_NO_SYMLINKS;
1097  	if (how->resolve & RESOLVE_BENEATH)
1098  		lookup_flags |= LOOKUP_BENEATH;
1099  	if (how->resolve & RESOLVE_IN_ROOT)
1100  		lookup_flags |= LOOKUP_IN_ROOT;
1101  
1102  	op->lookup_flags = lookup_flags;
1103  	return 0;
1104  }
1105  
1106  /**
1107   * file_open_name - open file and return file pointer
1108   *
1109   * @name:	struct filename containing path to open
1110   * @flags:	open flags as per the open(2) second argument
1111   * @mode:	mode for the new file if O_CREAT is set, else ignored
1112   *
1113   * This is the helper to open a file from kernelspace if you really
1114   * have to.  But in generally you should not do this, so please move
1115   * along, nothing to see here..
1116   */
1117  struct file *file_open_name(struct filename *name, int flags, umode_t mode)
1118  {
1119  	struct open_flags op;
1120  	struct open_how how = build_open_how(flags, mode);
1121  	int err = build_open_flags(&how, &op);
1122  	if (err)
1123  		return ERR_PTR(err);
1124  	return do_filp_open(AT_FDCWD, name, &op);
1125  }
1126  
1127  /**
1128   * filp_open - open file and return file pointer
1129   *
1130   * @filename:	path to open
1131   * @flags:	open flags as per the open(2) second argument
1132   * @mode:	mode for the new file if O_CREAT is set, else ignored
1133   *
1134   * This is the helper to open a file from kernelspace if you really
1135   * have to.  But in generally you should not do this, so please move
1136   * along, nothing to see here..
1137   */
1138  struct file *filp_open(const char *filename, int flags, umode_t mode)
1139  {
1140  	struct filename *name = getname_kernel(filename);
1141  	struct file *file = ERR_CAST(name);
1142  
1143  	if (!IS_ERR(name)) {
1144  		file = file_open_name(name, flags, mode);
1145  		putname(name);
1146  	}
1147  	return file;
1148  }
1149  EXPORT_SYMBOL(filp_open);
1150  
1151  struct file *file_open_root(struct dentry *dentry, struct vfsmount *mnt,
1152  			    const char *filename, int flags, umode_t mode)
1153  {
1154  	struct open_flags op;
1155  	struct open_how how = build_open_how(flags, mode);
1156  	int err = build_open_flags(&how, &op);
1157  	if (err)
1158  		return ERR_PTR(err);
1159  	return do_file_open_root(dentry, mnt, filename, &op);
1160  }
1161  EXPORT_SYMBOL(file_open_root);
1162  
1163  static long do_sys_openat2(int dfd, const char __user *filename,
1164  			   struct open_how *how)
1165  {
1166  	struct open_flags op;
1167  	int fd = build_open_flags(how, &op);
1168  	struct filename *tmp;
1169  
1170  	if (fd)
1171  		return fd;
1172  
1173  	tmp = getname(filename);
1174  	if (IS_ERR(tmp))
1175  		return PTR_ERR(tmp);
1176  
1177  	fd = get_unused_fd_flags(how->flags);
1178  	if (fd >= 0) {
1179  		struct file *f = do_filp_open(dfd, tmp, &op);
1180  		if (IS_ERR(f)) {
1181  			put_unused_fd(fd);
1182  			fd = PTR_ERR(f);
1183  		} else {
1184  			fsnotify_open(f);
1185  			fd_install(fd, f);
1186  		}
1187  	}
1188  	putname(tmp);
1189  	return fd;
1190  }
1191  
1192  long do_sys_open(int dfd, const char __user *filename, int flags, umode_t mode)
1193  {
1194  	struct open_how how = build_open_how(flags, mode);
1195  	return do_sys_openat2(dfd, filename, &how);
1196  }
1197  
1198  
1199  SYSCALL_DEFINE3(open, const char __user *, filename, int, flags, umode_t, mode)
1200  {
1201  	return ksys_open(filename, flags, mode);
1202  }
1203  
1204  SYSCALL_DEFINE4(openat, int, dfd, const char __user *, filename, int, flags,
1205  		umode_t, mode)
1206  {
1207  	if (force_o_largefile())
1208  		flags |= O_LARGEFILE;
1209  	return do_sys_open(dfd, filename, flags, mode);
1210  }
1211  
1212  SYSCALL_DEFINE4(openat2, int, dfd, const char __user *, filename,
1213  		struct open_how __user *, how, size_t, usize)
1214  {
1215  	int err;
1216  	struct open_how tmp;
1217  
1218  	BUILD_BUG_ON(sizeof(struct open_how) < OPEN_HOW_SIZE_VER0);
1219  	BUILD_BUG_ON(sizeof(struct open_how) != OPEN_HOW_SIZE_LATEST);
1220  
1221  	if (unlikely(usize < OPEN_HOW_SIZE_VER0))
1222  		return -EINVAL;
1223  
1224  	err = copy_struct_from_user(&tmp, sizeof(tmp), how, usize);
1225  	if (err)
1226  		return err;
1227  
1228  	/* O_LARGEFILE is only allowed for non-O_PATH. */
1229  	if (!(tmp.flags & O_PATH) && force_o_largefile())
1230  		tmp.flags |= O_LARGEFILE;
1231  
1232  	return do_sys_openat2(dfd, filename, &tmp);
1233  }
1234  
1235  #ifdef CONFIG_COMPAT
1236  /*
1237   * Exactly like sys_open(), except that it doesn't set the
1238   * O_LARGEFILE flag.
1239   */
1240  COMPAT_SYSCALL_DEFINE3(open, const char __user *, filename, int, flags, umode_t, mode)
1241  {
1242  	return do_sys_open(AT_FDCWD, filename, flags, mode);
1243  }
1244  
1245  /*
1246   * Exactly like sys_openat(), except that it doesn't set the
1247   * O_LARGEFILE flag.
1248   */
1249  COMPAT_SYSCALL_DEFINE4(openat, int, dfd, const char __user *, filename, int, flags, umode_t, mode)
1250  {
1251  	return do_sys_open(dfd, filename, flags, mode);
1252  }
1253  #endif
1254  
1255  #ifndef __alpha__
1256  
1257  /*
1258   * For backward compatibility?  Maybe this should be moved
1259   * into arch/i386 instead?
1260   */
1261  SYSCALL_DEFINE2(creat, const char __user *, pathname, umode_t, mode)
1262  {
1263  	return ksys_open(pathname, O_CREAT | O_WRONLY | O_TRUNC, mode);
1264  }
1265  
1266  #endif
1267  
1268  /*
1269   * "id" is the POSIX thread ID. We use the
1270   * files pointer for this..
1271   */
1272  int filp_close(struct file *filp, fl_owner_t id)
1273  {
1274  	int retval = 0;
1275  
1276  	if (!file_count(filp)) {
1277  		printk(KERN_ERR "VFS: Close: file count is 0\n");
1278  		return 0;
1279  	}
1280  
1281  	if (filp->f_op->flush)
1282  		retval = filp->f_op->flush(filp, id);
1283  
1284  	if (likely(!(filp->f_mode & FMODE_PATH))) {
1285  		dnotify_flush(filp, id);
1286  		locks_remove_posix(filp, id);
1287  	}
1288  	fput(filp);
1289  	return retval;
1290  }
1291  
1292  EXPORT_SYMBOL(filp_close);
1293  
1294  /*
1295   * Careful here! We test whether the file pointer is NULL before
1296   * releasing the fd. This ensures that one clone task can't release
1297   * an fd while another clone is opening it.
1298   */
1299  SYSCALL_DEFINE1(close, unsigned int, fd)
1300  {
1301  	int retval = __close_fd(current->files, fd);
1302  
1303  	/* can't restart close syscall because file table entry was cleared */
1304  	if (unlikely(retval == -ERESTARTSYS ||
1305  		     retval == -ERESTARTNOINTR ||
1306  		     retval == -ERESTARTNOHAND ||
1307  		     retval == -ERESTART_RESTARTBLOCK))
1308  		retval = -EINTR;
1309  
1310  	return retval;
1311  }
1312  
1313  /*
1314   * This routine simulates a hangup on the tty, to arrange that users
1315   * are given clean terminals at login time.
1316   */
1317  SYSCALL_DEFINE0(vhangup)
1318  {
1319  	if (capable(CAP_SYS_TTY_CONFIG)) {
1320  		tty_vhangup_self();
1321  		return 0;
1322  	}
1323  	return -EPERM;
1324  }
1325  
1326  /*
1327   * Called when an inode is about to be open.
1328   * We use this to disallow opening large files on 32bit systems if
1329   * the caller didn't specify O_LARGEFILE.  On 64bit systems we force
1330   * on this flag in sys_open.
1331   */
1332  int generic_file_open(struct inode * inode, struct file * filp)
1333  {
1334  	if (!(filp->f_flags & O_LARGEFILE) && i_size_read(inode) > MAX_NON_LFS)
1335  		return -EOVERFLOW;
1336  	return 0;
1337  }
1338  
1339  EXPORT_SYMBOL(generic_file_open);
1340  
1341  /*
1342   * This is used by subsystems that don't want seekable
1343   * file descriptors. The function is not supposed to ever fail, the only
1344   * reason it returns an 'int' and not 'void' is so that it can be plugged
1345   * directly into file_operations structure.
1346   */
1347  int nonseekable_open(struct inode *inode, struct file *filp)
1348  {
1349  	filp->f_mode &= ~(FMODE_LSEEK | FMODE_PREAD | FMODE_PWRITE);
1350  	return 0;
1351  }
1352  
1353  EXPORT_SYMBOL(nonseekable_open);
1354  
1355  /*
1356   * stream_open is used by subsystems that want stream-like file descriptors.
1357   * Such file descriptors are not seekable and don't have notion of position
1358   * (file.f_pos is always 0 and ppos passed to .read()/.write() is always NULL).
1359   * Contrary to file descriptors of other regular files, .read() and .write()
1360   * can run simultaneously.
1361   *
1362   * stream_open never fails and is marked to return int so that it could be
1363   * directly used as file_operations.open .
1364   */
1365  int stream_open(struct inode *inode, struct file *filp)
1366  {
1367  	filp->f_mode &= ~(FMODE_LSEEK | FMODE_PREAD | FMODE_PWRITE | FMODE_ATOMIC_POS);
1368  	filp->f_mode |= FMODE_STREAM;
1369  	return 0;
1370  }
1371  
1372  EXPORT_SYMBOL(stream_open);
1373