xref: /linux/fs/namei.c (revision 357660d7596bd40d1004762739e426b1fbe10a14)
1  // SPDX-License-Identifier: GPL-2.0
2  /*
3   *  linux/fs/namei.c
4   *
5   *  Copyright (C) 1991, 1992  Linus Torvalds
6   */
7  
8  /*
9   * Some corrections by tytso.
10   */
11  
12  /* [Feb 1997 T. Schoebel-Theuer] Complete rewrite of the pathname
13   * lookup logic.
14   */
15  /* [Feb-Apr 2000, AV] Rewrite to the new namespace architecture.
16   */
17  
18  #include <linux/init.h>
19  #include <linux/export.h>
20  #include <linux/slab.h>
21  #include <linux/wordpart.h>
22  #include <linux/fs.h>
23  #include <linux/filelock.h>
24  #include <linux/namei.h>
25  #include <linux/pagemap.h>
26  #include <linux/sched/mm.h>
27  #include <linux/fsnotify.h>
28  #include <linux/personality.h>
29  #include <linux/security.h>
30  #include <linux/syscalls.h>
31  #include <linux/mount.h>
32  #include <linux/audit.h>
33  #include <linux/capability.h>
34  #include <linux/file.h>
35  #include <linux/fcntl.h>
36  #include <linux/device_cgroup.h>
37  #include <linux/fs_struct.h>
38  #include <linux/posix_acl.h>
39  #include <linux/hash.h>
40  #include <linux/bitops.h>
41  #include <linux/init_task.h>
42  #include <linux/uaccess.h>
43  
44  #include "internal.h"
45  #include "mount.h"
46  
47  /* [Feb-1997 T. Schoebel-Theuer]
48   * Fundamental changes in the pathname lookup mechanisms (namei)
49   * were necessary because of omirr.  The reason is that omirr needs
50   * to know the _real_ pathname, not the user-supplied one, in case
51   * of symlinks (and also when transname replacements occur).
52   *
53   * The new code replaces the old recursive symlink resolution with
54   * an iterative one (in case of non-nested symlink chains).  It does
55   * this with calls to <fs>_follow_link().
56   * As a side effect, dir_namei(), _namei() and follow_link() are now
57   * replaced with a single function lookup_dentry() that can handle all
58   * the special cases of the former code.
59   *
60   * With the new dcache, the pathname is stored at each inode, at least as
61   * long as the refcount of the inode is positive.  As a side effect, the
62   * size of the dcache depends on the inode cache and thus is dynamic.
63   *
64   * [29-Apr-1998 C. Scott Ananian] Updated above description of symlink
65   * resolution to correspond with current state of the code.
66   *
67   * Note that the symlink resolution is not *completely* iterative.
68   * There is still a significant amount of tail- and mid- recursion in
69   * the algorithm.  Also, note that <fs>_readlink() is not used in
70   * lookup_dentry(): lookup_dentry() on the result of <fs>_readlink()
71   * may return different results than <fs>_follow_link().  Many virtual
72   * filesystems (including /proc) exhibit this behavior.
73   */
74  
75  /* [24-Feb-97 T. Schoebel-Theuer] Side effects caused by new implementation:
76   * New symlink semantics: when open() is called with flags O_CREAT | O_EXCL
77   * and the name already exists in form of a symlink, try to create the new
78   * name indicated by the symlink. The old code always complained that the
79   * name already exists, due to not following the symlink even if its target
80   * is nonexistent.  The new semantics affects also mknod() and link() when
81   * the name is a symlink pointing to a non-existent name.
82   *
83   * I don't know which semantics is the right one, since I have no access
84   * to standards. But I found by trial that HP-UX 9.0 has the full "new"
85   * semantics implemented, while SunOS 4.1.1 and Solaris (SunOS 5.4) have the
86   * "old" one. Personally, I think the new semantics is much more logical.
87   * Note that "ln old new" where "new" is a symlink pointing to a non-existing
 88   * file does succeed in both HP-UX and SunOS, but not in Solaris
89   * and in the old Linux semantics.
90   */
91  
92  /* [16-Dec-97 Kevin Buhr] For security reasons, we change some symlink
93   * semantics.  See the comments in "open_namei" and "do_link" below.
94   *
95   * [10-Sep-98 Alan Modra] Another symlink change.
96   */
97  
98  /* [Feb-Apr 2000 AV] Complete rewrite. Rules for symlinks:
99   *	inside the path - always follow.
100   *	in the last component in creation/removal/renaming - never follow.
101   *	if LOOKUP_FOLLOW passed - follow.
102   *	if the pathname has trailing slashes - follow.
103   *	otherwise - don't follow.
104   * (applied in that order).
105   *
106   * [Jun 2000 AV] Inconsistent behaviour of open() in case if flags==O_CREAT
107   * restored for 2.4. This is the last surviving part of old 4.2BSD bug.
108   * During the 2.4 we need to fix the userland stuff depending on it -
109   * hopefully we will be able to get rid of that wart in 2.5. So far only
110   * XEmacs seems to be relying on it...
111   */
112  /*
113   * [Sep 2001 AV] Single-semaphore locking scheme (kudos to David Holland)
114   * implemented.  Let's see if raised priority of ->s_vfs_rename_mutex gives
115   * any extra contention...
116   */
117  
118  /* In order to reduce some races, while at the same time doing additional
119   * checking and hopefully speeding things up, we copy filenames to the
120   * kernel data space before using them..
121   *
122   * POSIX.1 2.4: an empty pathname is invalid (ENOENT).
123   * PATH_MAX includes the nul terminator --RR.
124   */
125  
126  #define EMBEDDED_NAME_MAX	(PATH_MAX - offsetof(struct filename, iname))
127  
initname(struct filename * name)128  static inline void initname(struct filename *name)
129  {
130  	name->uptr = NULL;
131  	name->aname = NULL;
132  	atomic_set(&name->refcnt, 1);
133  }
134  
/*
 * Copy a pathname from userland into a kernel-side struct filename.
 *
 * Returns an ERR_PTR on copy faults (-EFAULT et al.), on an empty path
 * unless LOOKUP_EMPTY is set (-ENOENT), or when the name reaches
 * PATH_MAX (-ENAMETOOLONG).  Otherwise returns a refcounted filename
 * that must be released with putname().
 */
struct filename *
getname_flags(const char __user *filename, int flags)
{
	struct filename *result;
	char *kname;
	int len;

	/* Reuse a name already copied in for this audit context, if any. */
	result = audit_reusename(filename);
	if (result)
		return result;

	result = __getname();
	if (unlikely(!result))
		return ERR_PTR(-ENOMEM);

	/*
	 * First, try to embed the struct filename inside the names_cache
	 * allocation
	 */
	kname = (char *)result->iname;
	result->name = kname;

	len = strncpy_from_user(kname, filename, EMBEDDED_NAME_MAX);
	/*
	 * Handle both empty path and copy failure in one go.
	 */
	if (unlikely(len <= 0)) {
		if (unlikely(len < 0)) {
			__putname(result);
			return ERR_PTR(len);
		}

		/* The empty path is special. */
		if (!(flags & LOOKUP_EMPTY)) {
			__putname(result);
			return ERR_PTR(-ENOENT);
		}
	}

	/*
	 * Uh-oh. We have a name that's approaching PATH_MAX. Allocate a
	 * separate struct filename so we can dedicate the entire
	 * names_cache allocation for the pathname, and re-do the copy from
	 * userland.
	 */
	if (unlikely(len == EMBEDDED_NAME_MAX)) {
		const size_t size = offsetof(struct filename, iname[1]);
		/* The old names_cache allocation now holds only the string. */
		kname = (char *)result;

		/*
		 * size is chosen so that we can guarantee that
		 * result->iname[0] is within the same object and that
		 * kname can't be equal to result->iname, no matter what.
		 */
		result = kzalloc(size, GFP_KERNEL);
		if (unlikely(!result)) {
			__putname(kname);
			return ERR_PTR(-ENOMEM);
		}
		result->name = kname;
		len = strncpy_from_user(kname, filename, PATH_MAX);
		if (unlikely(len < 0)) {
			__putname(kname);
			kfree(result);
			return ERR_PTR(len);
		}
		/* The empty path is special. */
		if (unlikely(!len) && !(flags & LOOKUP_EMPTY)) {
			__putname(kname);
			kfree(result);
			return ERR_PTR(-ENOENT);
		}
		/* Hitting the limit again means the name is still too long. */
		if (unlikely(len == PATH_MAX)) {
			__putname(kname);
			kfree(result);
			return ERR_PTR(-ENAMETOOLONG);
		}
	}
	initname(result);
	audit_getname(result);
	return result;
}
217  
/*
 * Like getname_flags(), but takes AT_* userspace flags: AT_EMPTY_PATH
 * is translated to the internal LOOKUP_EMPTY bit.
 */
struct filename *getname_uflags(const char __user *filename, int uflags)
{
	if (uflags & AT_EMPTY_PATH)
		return getname_flags(filename, LOOKUP_EMPTY);
	return getname_flags(filename, 0);
}
224  
/*
 * Copy a pathname from userland, mapping the empty string to NULL
 * rather than an allocated filename.
 */
struct filename *__getname_maybe_null(const char __user *pathname)
{
	struct filename *name;
	char first;

	/* Peek at the first byte to avoid an allocation for "".
	 * (try to save on allocations; loss on um, though) */
	if (get_user(first, pathname))
		return ERR_PTR(-EFAULT);
	if (!first)
		return NULL;

	name = getname_flags(pathname, LOOKUP_EMPTY);
	if (IS_ERR(name))
		return name;
	/* The string may have become empty between the peek and the copy. */
	if (!name->name[0]) {
		putname(name);
		return NULL;
	}
	return name;
}
243  
/*
 * Kernel-internal counterpart of getname_flags(): wrap an in-kernel
 * path string in a refcounted struct filename.  Release with putname().
 */
struct filename *getname_kernel(const char * filename)
{
	struct filename *result;
	int len = strlen(filename) + 1;

	result = __getname();
	if (unlikely(!result))
		return ERR_PTR(-ENOMEM);

	if (len <= EMBEDDED_NAME_MAX) {
		/* Short name: store the string right inside the struct. */
		result->name = (char *)result->iname;
	} else if (len <= PATH_MAX) {
		const size_t size = offsetof(struct filename, iname[1]);
		struct filename *tmp;

		/*
		 * Long name: allocate a minimal struct filename and let
		 * the entire names_cache allocation hold the string.
		 */
		tmp = kmalloc(size, GFP_KERNEL);
		if (unlikely(!tmp)) {
			__putname(result);
			return ERR_PTR(-ENOMEM);
		}
		tmp->name = (char *)result;
		result = tmp;
	} else {
		__putname(result);
		return ERR_PTR(-ENAMETOOLONG);
	}
	memcpy((char *)result->name, filename, len);
	initname(result);
	audit_getname(result);
	return result;
}
EXPORT_SYMBOL(getname_kernel);
276  
/*
 * Drop a reference to a struct filename, freeing it on the final put.
 * NULL and ERR_PTR values are accepted and ignored.
 */
void putname(struct filename *name)
{
	int refcnt;

	if (IS_ERR_OR_NULL(name))
		return;

	/* refcnt == 1 is the common, uncontended case - skip the atomic dec */
	refcnt = atomic_read(&name->refcnt);
	if (refcnt != 1) {
		if (WARN_ON_ONCE(!refcnt))
			return;	/* already freed - don't make it worse */

		if (!atomic_dec_and_test(&name->refcnt))
			return;	/* other references remain */
	}

	/* Names that didn't fit in ->iname live in a separate allocation. */
	if (name->name != name->iname) {
		__putname(name->name);
		kfree(name);
	} else
		__putname(name);
}
EXPORT_SYMBOL(putname);
300  
/**
 * check_acl - perform ACL permission checking
 * @idmap:	idmap of the mount the inode was found from
 * @inode:	inode to check permissions on
 * @mask:	right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC ...)
 *
 * This function performs the ACL permission checking. Since this function
 * retrieves POSIX acls it needs to know whether it is called from a blocking or
 * non-blocking context and thus cares about the MAY_NOT_BLOCK bit.
 *
 * If the inode has been found through an idmapped mount the idmap of
 * the vfsmount must be passed through @idmap. This function will then take
 * care to map the inode according to @idmap before checking permissions.
 * On non-idmapped mounts or if permission checking is to be performed on the
 * raw inode simply pass @nop_mnt_idmap.
 *
 * Returns -EAGAIN when there is no ACL to consult (caller falls back to
 * plain mode-bit checking) and -ECHILD when an rcu-walk caller must retry
 * in ref-walk mode.
 */
static int check_acl(struct mnt_idmap *idmap,
		     struct inode *inode, int mask)
{
#ifdef CONFIG_FS_POSIX_ACL
	struct posix_acl *acl;

	if (mask & MAY_NOT_BLOCK) {
		/* Non-blocking: only a cached ACL may be consulted. */
		acl = get_cached_acl_rcu(inode, ACL_TYPE_ACCESS);
		if (!acl)
			return -EAGAIN;
		/* no ->get_inode_acl() calls in RCU mode... */
		if (is_uncached_acl(acl))
			return -ECHILD;
		return posix_acl_permission(idmap, inode, acl, mask);
	}

	acl = get_inode_acl(inode, ACL_TYPE_ACCESS);
	if (IS_ERR(acl))
		return PTR_ERR(acl);
	if (acl) {
		int error = posix_acl_permission(idmap, inode, acl, mask);
		posix_acl_release(acl);
		return error;
	}
#endif

	/* No ACL support compiled in, or no access ACL on this inode. */
	return -EAGAIN;
}
345  
/*
 * Very quick optimistic "we know we have no ACL's" check.
 *
 * Note that this is purely for ACL_TYPE_ACCESS, and purely
 * for the "we have cached that there are no ACLs" case.
 *
 * If this returns true, we know there are no ACLs. But if
 * it returns false, we might still not have ACLs (it could
 * be the is_uncached_acl() case).
 */
static inline bool no_acl_inode(struct inode *inode)
{
#ifdef CONFIG_FS_POSIX_ACL
	/* Lockless peek; a false "no" just sends callers to the slow path. */
	return likely(!READ_ONCE(inode->i_acl));
#else
	/* No ACL support compiled in - trivially no ACLs. */
	return true;
#endif
}
364  
/**
 * acl_permission_check - perform basic UNIX permission checking
 * @idmap:	idmap of the mount the inode was found from
 * @inode:	inode to check permissions on
 * @mask:	right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC ...)
 *
 * This function performs the basic UNIX permission checking. Since this
 * function may retrieve POSIX acls it needs to know whether it is called from a
 * blocking or non-blocking context and thus cares about the MAY_NOT_BLOCK bit.
 *
 * If the inode has been found through an idmapped mount the idmap of
 * the vfsmount must be passed through @idmap. This function will then take
 * care to map the inode according to @idmap before checking permissions.
 * On non-idmapped mounts or if permission checking is to be performed on the
 * raw inode simply pass @nop_mnt_idmap.
 */
static int acl_permission_check(struct mnt_idmap *idmap,
				struct inode *inode, int mask)
{
	unsigned int mode = inode->i_mode;
	vfsuid_t vfsuid;

	/*
	 * Common cheap case: everybody has the requested
	 * rights, and there are no ACLs to check. No need
	 * to do any owner/group checks in that case.
	 *
	 *  - 'mask&7' is the requested permission bit set
	 *  - multiplying by 0111 spreads them out to all of ugo
	 *  - '& ~mode' looks for missing inode permission bits
	 *  - the '!' is for "no missing permissions"
	 *
	 * After that, we just need to check that there are no
	 * ACL's on the inode - do the 'IS_POSIXACL()' check last
	 * because it will dereference the ->i_sb pointer and we
	 * want to avoid that if at all possible.
	 */
	if (!((mask & 7) * 0111 & ~mode)) {
		if (no_acl_inode(inode))
			return 0;
		if (!IS_POSIXACL(inode))
			return 0;
	}

	/* Are we the owner? If so, ACL's don't matter */
	vfsuid = i_uid_into_vfsuid(idmap, inode);
	if (likely(vfsuid_eq_kuid(vfsuid, current_fsuid()))) {
		mask &= 7;
		mode >>= 6;	/* use the "owner" rwx bits */
		return (mask & ~mode) ? -EACCES : 0;
	}

	/* Do we have ACL's? */
	if (IS_POSIXACL(inode) && (mode & S_IRWXG)) {
		int error = check_acl(idmap, inode, mask);
		/* -EAGAIN means "no ACL": fall through to mode bits. */
		if (error != -EAGAIN)
			return error;
	}

	/* Only RWX matters for group/other mode bits */
	mask &= 7;

	/*
	 * Are the group permissions different from
	 * the other permissions in the bits we care
	 * about? Need to check group ownership if so.
	 */
	if (mask & (mode ^ (mode >> 3))) {
		vfsgid_t vfsgid = i_gid_into_vfsgid(idmap, inode);
		if (vfsgid_in_group_p(vfsgid))
			mode >>= 3;	/* use the "group" rwx bits */
	}

	/* Bits in 'mode' clear that we require? */
	return (mask & ~mode) ? -EACCES : 0;
}
441  
/**
 * generic_permission -  check for access rights on a Posix-like filesystem
 * @idmap:	idmap of the mount the inode was found from
 * @inode:	inode to check access rights for
 * @mask:	right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC,
 *		%MAY_NOT_BLOCK ...)
 *
 * Used to check for read/write/execute permissions on a file.
 * We use "fsuid" for this, letting us set arbitrary permissions
 * for filesystem access without changing the "normal" uids which
 * are used for other things.
 *
 * generic_permission is rcu-walk aware. It returns -ECHILD in case an rcu-walk
 * request cannot be satisfied (eg. requires blocking or too much complexity).
 * It would then be called again in ref-walk mode.
 *
 * If the inode has been found through an idmapped mount the idmap of
 * the vfsmount must be passed through @idmap. This function will then take
 * care to map the inode according to @idmap before checking permissions.
 * On non-idmapped mounts or if permission checking is to be performed on the
 * raw inode simply pass @nop_mnt_idmap.
 */
int generic_permission(struct mnt_idmap *idmap, struct inode *inode,
		       int mask)
{
	int ret;

	/*
	 * Do the basic permission checks.
	 */
	ret = acl_permission_check(idmap, inode, mask);
	/* Only -EACCES may be overridden by capabilities below. */
	if (ret != -EACCES)
		return ret;

	if (S_ISDIR(inode->i_mode)) {
		/* DACs are overridable for directories */
		if (!(mask & MAY_WRITE))
			if (capable_wrt_inode_uidgid(idmap, inode,
						     CAP_DAC_READ_SEARCH))
				return 0;
		if (capable_wrt_inode_uidgid(idmap, inode,
					     CAP_DAC_OVERRIDE))
			return 0;
		return -EACCES;
	}

	/*
	 * Searching includes executable on directories, else just read.
	 */
	mask &= MAY_READ | MAY_WRITE | MAY_EXEC;
	if (mask == MAY_READ)
		if (capable_wrt_inode_uidgid(idmap, inode,
					     CAP_DAC_READ_SEARCH))
			return 0;
	/*
	 * Read/write DACs are always overridable.
	 * Executable DACs are overridable when there is
	 * at least one exec bit set.
	 */
	if (!(mask & MAY_EXEC) || (inode->i_mode & S_IXUGO))
		if (capable_wrt_inode_uidgid(idmap, inode,
					     CAP_DAC_OVERRIDE))
			return 0;

	return -EACCES;
}
EXPORT_SYMBOL(generic_permission);
509  
/**
 * do_inode_permission - UNIX permission checking
 * @idmap:	idmap of the mount the inode was found from
 * @inode:	inode to check permissions on
 * @mask:	right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC ...)
 *
 * We _really_ want to just do "generic_permission()" without
 * even looking at the inode->i_op values. So we keep a cache
 * flag in inode->i_opflags, that says "this has not special
 * permission function, use the fast case".
 */
static inline int do_inode_permission(struct mnt_idmap *idmap,
				      struct inode *inode, int mask)
{
	if (unlikely(!(inode->i_opflags & IOP_FASTPERM))) {
		if (likely(inode->i_op->permission))
			return inode->i_op->permission(idmap, inode, mask);

		/* This gets set once for the inode lifetime */
		spin_lock(&inode->i_lock);
		inode->i_opflags |= IOP_FASTPERM;
		spin_unlock(&inode->i_lock);
	}
	/* Fast path: no filesystem-specific ->permission() hook. */
	return generic_permission(idmap, inode, mask);
}
535  
536  /**
537   * sb_permission - Check superblock-level permissions
538   * @sb: Superblock of inode to check permission on
539   * @inode: Inode to check permission on
540   * @mask: Right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC)
541   *
542   * Separate out file-system wide checks from inode-specific permission checks.
543   */
sb_permission(struct super_block * sb,struct inode * inode,int mask)544  static int sb_permission(struct super_block *sb, struct inode *inode, int mask)
545  {
546  	if (unlikely(mask & MAY_WRITE)) {
547  		umode_t mode = inode->i_mode;
548  
549  		/* Nobody gets write access to a read-only fs. */
550  		if (sb_rdonly(sb) && (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode)))
551  			return -EROFS;
552  	}
553  	return 0;
554  }
555  
/**
 * inode_permission - Check for access rights to a given inode
 * @idmap:	idmap of the mount the inode was found from
 * @inode:	Inode to check permission on
 * @mask:	Right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC)
 *
 * Check for read/write/execute permissions on an inode.  We use fs[ug]id for
 * this, letting us set arbitrary permissions for filesystem access without
 * changing the "normal" UIDs which are used for other things.
 *
 * When checking for MAY_APPEND, MAY_WRITE must also be set in @mask.
 */
int inode_permission(struct mnt_idmap *idmap,
		     struct inode *inode, int mask)
{
	int retval;

	/* Filesystem-wide checks (e.g. read-only sb) come first. */
	retval = sb_permission(inode->i_sb, inode, mask);
	if (retval)
		return retval;

	if (unlikely(mask & MAY_WRITE)) {
		/*
		 * Nobody gets write access to an immutable file.
		 */
		if (IS_IMMUTABLE(inode))
			return -EPERM;

		/*
		 * Updating mtime will likely cause i_uid and i_gid to be
		 * written back improperly if their true value is unknown
		 * to the vfs.
		 */
		if (HAS_UNMAPPED_ID(idmap, inode))
			return -EACCES;
	}

	/* Per-inode permission check (filesystem hook or generic). */
	retval = do_inode_permission(idmap, inode, mask);
	if (retval)
		return retval;

	retval = devcgroup_inode_permission(inode, mask);
	if (retval)
		return retval;

	/* Finally, give the LSMs a chance to veto the access. */
	return security_inode_permission(inode, mask);
}
EXPORT_SYMBOL(inode_permission);
604  
/**
 * path_get - get a reference to a path
 * @path: path to get the reference to
 *
 * Given a path increment the reference count to the dentry and the vfsmount.
 */
void path_get(const struct path *path)
{
	/* Grab the mount before the dentry; path_put() reverses this order. */
	mntget(path->mnt);
	dget(path->dentry);
}
EXPORT_SYMBOL(path_get);
617  
/**
 * path_put - put a reference to a path
 * @path: path to put the reference to
 *
 * Given a path decrement the reference count to the dentry and the vfsmount.
 */
void path_put(const struct path *path)
{
	/* Drop in reverse order of path_get(): dentry first, then mount. */
	dput(path->dentry);
	mntput(path->mnt);
}
EXPORT_SYMBOL(path_put);
630  
/* Symlink stack frames kept inline in the nameidata itself. */
#define EMBEDDED_LEVELS 2
/*
 * Per-lookup pathwalk state.  One of these lives on the caller's stack
 * for the duration of a path lookup and is linked into
 * current->nameidata (see __set_nameidata()/restore_nameidata()).
 */
struct nameidata {
	struct path	path;
	struct qstr	last;
	struct path	root;
	struct inode	*inode; /* path.dentry.d_inode */
	unsigned int	flags, state;	/* LOOKUP_* flags and ND_* state bits */
	unsigned	seq, next_seq, m_seq, r_seq;	/* rcu-walk seqcounts */
	int		last_type;
	unsigned	depth;		/* number of in-use entries in ->stack */
	int		total_link_count;
	struct saved {
		/* one frame per symlink currently being traversed */
		struct path link;
		struct delayed_call done;
		const char *name;
		unsigned seq;
	} *stack, internal[EMBEDDED_LEVELS];
	struct filename	*name;
	const char *pathname;
	struct nameidata *saved;	/* outer nameidata, for nested lookups */
	unsigned	root_seq;
	int		dfd;		/* starting dirfd, as passed by the caller */
	vfsuid_t	dir_vfsuid;
	umode_t		dir_mode;
} __randomize_layout;

/* nameidata.state bits */
#define ND_ROOT_PRESET 1	/* ->root was supplied by the caller */
#define ND_ROOT_GRABBED 2	/* we hold a reference on ->root */
#define ND_JUMPED 4
660  
/*
 * Install @p as the task's current nameidata, saving any outer one so
 * nested lookups can be unwound by restore_nameidata().
 */
static void __set_nameidata(struct nameidata *p, int dfd, struct filename *name)
{
	struct nameidata *old = current->nameidata;
	p->stack = p->internal;	/* start with the embedded symlink stack */
	p->depth = 0;
	p->dfd = dfd;
	p->name = name;
	p->pathname = likely(name) ? name->name : "";
	p->path.mnt = NULL;
	p->path.dentry = NULL;
	/* The symlink traversal budget is shared across nested lookups. */
	p->total_link_count = old ? old->total_link_count : 0;
	p->saved = old;
	current->nameidata = p;
}
675  
set_nameidata(struct nameidata * p,int dfd,struct filename * name,const struct path * root)676  static inline void set_nameidata(struct nameidata *p, int dfd, struct filename *name,
677  			  const struct path *root)
678  {
679  	__set_nameidata(p, dfd, name);
680  	p->state = 0;
681  	if (unlikely(root)) {
682  		p->state = ND_ROOT_PRESET;
683  		p->root = *root;
684  	}
685  }
686  
/*
 * Undo __set_nameidata(): pop the current nameidata and restore the
 * outer one, propagating the shared link count back to it.
 */
static void restore_nameidata(void)
{
	struct nameidata *now = current->nameidata, *old = now->saved;

	current->nameidata = old;
	if (old)
		old->total_link_count = now->total_link_count;
	/* Free the symlink stack if it spilled out of the embedded array. */
	if (now->stack != now->internal)
		kfree(now->stack);
}
697  
nd_alloc_stack(struct nameidata * nd)698  static bool nd_alloc_stack(struct nameidata *nd)
699  {
700  	struct saved *p;
701  
702  	p= kmalloc_array(MAXSYMLINKS, sizeof(struct saved),
703  			 nd->flags & LOOKUP_RCU ? GFP_ATOMIC : GFP_KERNEL);
704  	if (unlikely(!p))
705  		return false;
706  	memcpy(p, nd->internal, sizeof(nd->internal));
707  	nd->stack = p;
708  	return true;
709  }
710  
711  /**
712   * path_connected - Verify that a dentry is below mnt.mnt_root
713   * @mnt: The mountpoint to check.
714   * @dentry: The dentry to check.
715   *
716   * Rename can sometimes move a file or directory outside of a bind
717   * mount, path_connected allows those cases to be detected.
718   */
path_connected(struct vfsmount * mnt,struct dentry * dentry)719  static bool path_connected(struct vfsmount *mnt, struct dentry *dentry)
720  {
721  	struct super_block *sb = mnt->mnt_sb;
722  
723  	/* Bind mounts can have disconnected paths */
724  	if (mnt->mnt_root == sb->s_root)
725  		return true;
726  
727  	return is_subdir(dentry, mnt->mnt_root);
728  }
729  
drop_links(struct nameidata * nd)730  static void drop_links(struct nameidata *nd)
731  {
732  	int i = nd->depth;
733  	while (i--) {
734  		struct saved *last = nd->stack + i;
735  		do_delayed_call(&last->done);
736  		clear_delayed_call(&last->done);
737  	}
738  }
739  
/* Leave rcu-walk mode: clear LOOKUP_RCU and end the RCU read-side section. */
static void leave_rcu(struct nameidata *nd)
{
	nd->flags &= ~LOOKUP_RCU;
	/* The seqcounts are only meaningful while in rcu-walk mode. */
	nd->seq = nd->next_seq = 0;
	rcu_read_unlock();
}
746  
/*
 * Abandon a path walk, releasing everything the nameidata holds.
 * Safe to call in either rcu-walk or ref-walk mode.
 */
static void terminate_walk(struct nameidata *nd)
{
	drop_links(nd);
	if (!(nd->flags & LOOKUP_RCU)) {
		int i;
		/* ref-walk: drop every reference we are holding */
		path_put(&nd->path);
		for (i = 0; i < nd->depth; i++)
			path_put(&nd->stack[i].link);
		if (nd->state & ND_ROOT_GRABBED) {
			path_put(&nd->root);
			nd->state &= ~ND_ROOT_GRABBED;
		}
	} else {
		/* rcu-walk holds no references - just leave the RCU section */
		leave_rcu(nd);
	}
	nd->depth = 0;
	nd->path.mnt = NULL;
	nd->path.dentry = NULL;
}
766  
/* path_put is needed afterwards regardless of success or failure */
static bool __legitimize_path(struct path *path, unsigned seq, unsigned mseq)
{
	/* First try to pin the mount against @mseq... */
	int res = __legitimize_mnt(path->mnt, mseq);
	if (unlikely(res)) {
		/* positive result: clear ->mnt so the later path_put()
		 * won't touch it */
		if (res > 0)
			path->mnt = NULL;
		path->dentry = NULL;
		return false;
	}
	/* ...then grab a reference on the dentry... */
	if (unlikely(!lockref_get_not_dead(&path->dentry->d_lockref))) {
		path->dentry = NULL;
		return false;
	}
	/* ...and only trust the result if the dentry hasn't changed since
	 * @seq was sampled. */
	return !read_seqcount_retry(&path->dentry->d_seq, seq);
}
783  
/* __legitimize_path() against the mount seqcount sampled at walk start. */
static inline bool legitimize_path(struct nameidata *nd,
			    struct path *path, unsigned seq)
{
	return __legitimize_path(path, seq, nd->m_seq);
}
789  
/*
 * Grab references on every saved symlink on the stack so the walk can
 * continue in ref-walk mode.  On failure, the frames already handled
 * remain accounted in ->depth so terminate_walk() can release them.
 */
static bool legitimize_links(struct nameidata *nd)
{
	int i;
	/* LOOKUP_CACHED: don't legitimize anything, fail the unlazy attempt. */
	if (unlikely(nd->flags & LOOKUP_CACHED)) {
		drop_links(nd);
		nd->depth = 0;
		return false;
	}
	for (i = 0; i < nd->depth; i++) {
		struct saved *last = nd->stack + i;
		if (unlikely(!legitimize_path(nd, &last->link, last->seq))) {
			drop_links(nd);
			/* entries 0..i have (possibly partial) references */
			nd->depth = i + 1;
			return false;
		}
	}
	return true;
}
808  
/*
 * Pin nd->root for ref-walk mode, unless it is unset or was supplied
 * (and is therefore owned) by the caller.
 */
static bool legitimize_root(struct nameidata *nd)
{
	/* Nothing to do if nd->root is zero or is managed by the VFS user. */
	if (!nd->root.mnt || (nd->state & ND_ROOT_PRESET))
		return true;
	/* Set the flag first so terminate_walk() drops root even on failure. */
	nd->state |= ND_ROOT_GRABBED;
	return legitimize_path(nd, &nd->root, nd->root_seq);
}
817  
/*
 * Path walking has 2 modes, rcu-walk and ref-walk (see
 * Documentation/filesystems/path-lookup.txt).  In situations when we can't
 * continue in RCU mode, we attempt to drop out of rcu-walk mode and grab
 * normal reference counts on dentries and vfsmounts to transition to ref-walk
 * mode.  Refcounts are grabbed at the last known good point before rcu-walk
 * got stuck, so ref-walk may continue from there. If this is not successful
 * (eg. a seqcount has changed), then failure is returned and it's up to caller
 * to restart the path walk from the beginning in ref-walk mode.
 */

/**
 * try_to_unlazy - try to switch to ref-walk mode.
 * @nd: nameidata pathwalk data
 * Returns: true on success, false on failure
 *
 * try_to_unlazy attempts to legitimize the current nd->path and nd->root
 * for ref-walk mode.
 * Must be called from rcu-walk context.
 * Nothing should touch nameidata between try_to_unlazy() failure and
 * terminate_walk().
 */
static bool try_to_unlazy(struct nameidata *nd)
{
	struct dentry *parent = nd->path.dentry;

	BUG_ON(!(nd->flags & LOOKUP_RCU));

	/* Pin the saved symlinks first, then the current path and root. */
	if (unlikely(!legitimize_links(nd)))
		goto out1;
	if (unlikely(!legitimize_path(nd, &nd->path, nd->seq)))
		goto out;
	if (unlikely(!legitimize_root(nd)))
		goto out;
	leave_rcu(nd);
	BUG_ON(nd->inode != parent->d_inode);
	return true;

out1:
	/* The links failed before nd->path was grabbed - clear it. */
	nd->path.mnt = NULL;
	nd->path.dentry = NULL;
out:
	leave_rcu(nd);
	return false;
}
863  
/**
 * try_to_unlazy_next - try to switch to ref-walk mode.
 * @nd: nameidata pathwalk data
 * @dentry: next dentry to step into
 * Returns: true on success, false on failure
 *
 * Similar to try_to_unlazy(), but here we have the next dentry already
 * picked by rcu-walk and want to legitimize that in addition to the current
 * nd->path and nd->root for ref-walk mode.  Must be called from rcu-walk context.
 * Nothing should touch nameidata between try_to_unlazy_next() failure and
 * terminate_walk().
 */
static bool try_to_unlazy_next(struct nameidata *nd, struct dentry *dentry)
{
	int res;
	BUG_ON(!(nd->flags & LOOKUP_RCU));

	if (unlikely(!legitimize_links(nd)))
		goto out2;
	res = __legitimize_mnt(nd->path.mnt, nd->m_seq);
	if (unlikely(res)) {
		/* positive: mount reference was not taken - clear ->mnt too */
		if (res > 0)
			goto out2;
		goto out1;
	}
	if (unlikely(!lockref_get_not_dead(&nd->path.dentry->d_lockref)))
		goto out1;

	/*
	 * We need to move both the parent and the dentry from the RCU domain
	 * to be properly refcounted. And the sequence number in the dentry
	 * validates *both* dentry counters, since we checked the sequence
	 * number of the parent after we got the child sequence number. So we
	 * know the parent must still be valid if the child sequence number is
	 */
	if (unlikely(!lockref_get_not_dead(&dentry->d_lockref)))
		goto out;
	if (read_seqcount_retry(&dentry->d_seq, nd->next_seq))
		goto out_dput;
	/*
	 * Sequence counts matched. Now make sure that the root is
	 * still valid and get it if required.
	 */
	if (unlikely(!legitimize_root(nd)))
		goto out_dput;
	leave_rcu(nd);
	return true;

	/* Error labels: clear exactly the parts that were never grabbed. */
out2:
	nd->path.mnt = NULL;
out1:
	nd->path.dentry = NULL;
out:
	leave_rcu(nd);
	return false;
out_dput:
	/* @dentry was grabbed but turned out stale - drop it after RCU. */
	leave_rcu(nd);
	dput(dentry);
	return false;
}
924  
d_revalidate(struct inode * dir,const struct qstr * name,struct dentry * dentry,unsigned int flags)925  static inline int d_revalidate(struct inode *dir, const struct qstr *name,
926  			       struct dentry *dentry, unsigned int flags)
927  {
928  	if (unlikely(dentry->d_flags & DCACHE_OP_REVALIDATE))
929  		return dentry->d_op->d_revalidate(dir, name, dentry, flags);
930  	else
931  		return 1;
932  }
933  
/**
 * complete_walk - successful completion of path walk
 * @nd:  pointer nameidata
 *
 * If we had been in RCU mode, drop out of it and legitimize nd->path.
 * Revalidate the final result, unless we'd already done that during
 * the path walk or the filesystem doesn't ask for it.  Return 0 on
 * success, -error on failure.  In case of failure caller does not
 * need to drop nd->path.
 */
static int complete_walk(struct nameidata *nd)
{
	struct dentry *dentry = nd->path.dentry;
	int status;

	if (nd->flags & LOOKUP_RCU) {
		/*
		 * We don't want to zero nd->root for scoped-lookups or
		 * externally-managed nd->root.
		 */
		if (!(nd->state & ND_ROOT_PRESET))
			if (!(nd->flags & LOOKUP_IS_SCOPED))
				nd->root.mnt = NULL;
		nd->flags &= ~LOOKUP_CACHED;
		if (!try_to_unlazy(nd))
			return -ECHILD;
	}

	if (unlikely(nd->flags & LOOKUP_IS_SCOPED)) {
		/*
		 * While the guarantee of LOOKUP_IS_SCOPED is (roughly) "don't
		 * ever step outside the root during lookup" and should already
		 * be guaranteed by the rest of namei, we want to avoid a namei
		 * BUG resulting in userspace being given a path that was not
		 * scoped within the root at some point during the lookup.
		 *
		 * So, do a final sanity-check to make sure that in the
		 * worst-case scenario (a complete bypass of LOOKUP_IS_SCOPED)
		 * we won't silently return an fd completely outside of the
		 * requested root to userspace.
		 *
		 * Userspace could move the path outside the root after this
		 * check, but as discussed elsewhere this is not a concern (the
		 * resolved file was inside the root at some point).
		 */
		if (!path_is_under(&nd->path, &nd->root))
			return -EXDEV;
	}

	/*
	 * Re-checking the result is only needed if we "jumped" during the
	 * walk (ND_JUMPED is set by nd_jump_root(), nd_jump_link() and
	 * mount traversal), and only for filesystems that ask for it via
	 * ->d_weak_revalidate().
	 */
	if (likely(!(nd->state & ND_JUMPED)))
		return 0;

	if (likely(!(dentry->d_flags & DCACHE_OP_WEAK_REVALIDATE)))
		return 0;

	/* >0 means valid, 0 means stale, <0 is an error from the fs. */
	status = dentry->d_op->d_weak_revalidate(dentry, nd->flags);
	if (status > 0)
		return 0;

	if (!status)
		status = -ESTALE;

	return status;
}
998  
/*
 * Lazily resolve nd->root to the caller's fs root.  In RCU mode take a
 * seqcount-validated snapshot without grabbing references; otherwise pin
 * the root and record that fact (ND_ROOT_GRABBED) so cleanup knows to
 * drop it.
 */
static int set_root(struct nameidata *nd)
{
	struct fs_struct *fs = current->fs;

	/*
	 * Jumping to the real root in a scoped-lookup is a BUG in namei, but we
	 * still have to ensure it doesn't happen because it will cause a breakout
	 * from the dirfd.
	 */
	if (WARN_ON(nd->flags & LOOKUP_IS_SCOPED))
		return -ENOTRECOVERABLE;

	if (nd->flags & LOOKUP_RCU) {
		unsigned seq;

		/* retry until fs->root was stable across the snapshot */
		do {
			seq = read_seqcount_begin(&fs->seq);
			nd->root = fs->root;
			nd->root_seq = __read_seqcount_begin(&nd->root.dentry->d_seq);
		} while (read_seqcount_retry(&fs->seq, seq));
	} else {
		get_fs_root(fs, &nd->root);
		nd->state |= ND_ROOT_GRABBED;
	}
	return 0;
}
1025  
/*
 * Restart the walk from nd->root (e.g. for an absolute path), honouring
 * the LOOKUP_BENEATH / LOOKUP_NO_XDEV restrictions.  Marks the walk as
 * having jumped (ND_JUMPED) on success.
 */
static int nd_jump_root(struct nameidata *nd)
{
	if (unlikely(nd->flags & LOOKUP_BENEATH))
		return -EXDEV;
	if (unlikely(nd->flags & LOOKUP_NO_XDEV)) {
		/* Absolute path arguments to path_init() are allowed. */
		if (nd->path.mnt != NULL && nd->path.mnt != nd->root.mnt)
			return -EXDEV;
	}
	if (!nd->root.mnt) {
		int error = set_root(nd);
		if (error)
			return error;
	}
	if (nd->flags & LOOKUP_RCU) {
		struct dentry *d;
		nd->path = nd->root;
		d = nd->path.dentry;
		nd->inode = d->d_inode;
		nd->seq = nd->root_seq;
		/* root changed under us - bail out and redo in ref-walk mode */
		if (read_seqcount_retry(&d->d_seq, nd->seq))
			return -ECHILD;
	} else {
		/* swap the current position for a pinned copy of the root */
		path_put(&nd->path);
		nd->path = nd->root;
		path_get(&nd->path);
		nd->inode = nd->path.dentry->d_inode;
	}
	nd->state |= ND_JUMPED;
	return 0;
}
1057  
/*
 * Helper to directly jump to a known parsed path from ->get_link,
 * caller must have taken a reference to path beforehand.
 *
 * On success ownership of the caller's reference on @path transfers to
 * nd->path; on failure the reference is dropped here.
 */
int nd_jump_link(const struct path *path)
{
	int error = -ELOOP;
	struct nameidata *nd = current->nameidata;

	/* a jump like this is a "magic link" - refuse if those are forbidden */
	if (unlikely(nd->flags & LOOKUP_NO_MAGICLINKS))
		goto err;

	error = -EXDEV;
	if (unlikely(nd->flags & LOOKUP_NO_XDEV)) {
		if (nd->path.mnt != path->mnt)
			goto err;
	}
	/* Not currently safe for scoped-lookups. */
	if (unlikely(nd->flags & LOOKUP_IS_SCOPED))
		goto err;

	path_put(&nd->path);
	nd->path = *path;
	nd->inode = nd->path.dentry->d_inode;
	nd->state |= ND_JUMPED;
	return 0;

err:
	path_put(path);
	return error;
}
1089  
/*
 * Pop the most recent symlink off nd->stack: run the filesystem's delayed
 * ->put_link cleanup and drop the reference on the link's path.  In RCU
 * mode no reference was taken on the link, so there is nothing to put.
 */
static inline void put_link(struct nameidata *nd)
{
	struct saved *last = nd->stack + --nd->depth;
	do_delayed_call(&last->done);
	if (!(nd->flags & LOOKUP_RCU))
		path_put(&last->link);
}
1097  
/*
 * Policy knobs for the symlink/hardlink/FIFO/regular-file protections
 * enforced by may_follow_link(), may_linkat() and may_create_in_sticky()
 * below; exposed under /proc/sys/fs via namei_sysctls (CONFIG_SYSCTL).
 */
static int sysctl_protected_symlinks __read_mostly;
static int sysctl_protected_hardlinks __read_mostly;
static int sysctl_protected_fifos __read_mostly;
static int sysctl_protected_regular __read_mostly;
1102  
#ifdef CONFIG_SYSCTL
/*
 * /proc/sys/fs entries backing the protected_* policy variables above.
 * The symlink/hardlink knobs are booleans (0..1); the fifo/regular knobs
 * additionally accept 2 ("also apply to group-writable sticky dirs").
 */
static const struct ctl_table namei_sysctls[] = {
	{
		.procname	= "protected_symlinks",
		.data		= &sysctl_protected_symlinks,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_minmax,
		.extra1		= SYSCTL_ZERO,
		.extra2		= SYSCTL_ONE,
	},
	{
		.procname	= "protected_hardlinks",
		.data		= &sysctl_protected_hardlinks,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_minmax,
		.extra1		= SYSCTL_ZERO,
		.extra2		= SYSCTL_ONE,
	},
	{
		.procname	= "protected_fifos",
		.data		= &sysctl_protected_fifos,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_minmax,
		.extra1		= SYSCTL_ZERO,
		.extra2		= SYSCTL_TWO,
	},
	{
		.procname	= "protected_regular",
		.data		= &sysctl_protected_regular,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_minmax,
		.extra1		= SYSCTL_ZERO,
		.extra2		= SYSCTL_TWO,
	},
};

/* Register the table at fs-initcall time. */
static int __init init_fs_namei_sysctls(void)
{
	register_sysctl_init("fs", namei_sysctls);
	return 0;
}
fs_initcall(init_fs_namei_sysctls);

#endif /* CONFIG_SYSCTL */
1151  
/**
 * may_follow_link - Check symlink following for unsafe situations
 * @nd: nameidata pathwalk data
 * @inode: Used for idmapping.
 *
 * In the case of the sysctl_protected_symlinks sysctl being enabled,
 * CAP_DAC_OVERRIDE needs to be specifically ignored if the symlink is
 * in a sticky world-writable directory. This is to protect privileged
 * processes from failing races against path names that may change out
 * from under them by way of other users creating malicious symlinks.
 * It will permit symlinks to be followed only when outside a sticky
 * world-writable directory, or when the uid of the symlink and follower
 * match, or when the directory owner matches the symlink's owner.
 *
 * Returns 0 if following the symlink is allowed, -ve on error.
 */
static inline int may_follow_link(struct nameidata *nd, const struct inode *inode)
{
	struct mnt_idmap *idmap;
	vfsuid_t vfsuid;

	if (!sysctl_protected_symlinks)
		return 0;

	idmap = mnt_idmap(nd->path.mnt);
	vfsuid = i_uid_into_vfsuid(idmap, inode);
	/* Allowed if owner and follower match. */
	if (vfsuid_eq_kuid(vfsuid, current_fsuid()))
		return 0;

	/* Allowed if parent directory not sticky and world-writable. */
	if ((nd->dir_mode & (S_ISVTX|S_IWOTH)) != (S_ISVTX|S_IWOTH))
		return 0;

	/* Allowed if parent directory and link owner match. */
	if (vfsuid_valid(nd->dir_vfsuid) && vfsuid_eq(nd->dir_vfsuid, vfsuid))
		return 0;

	/* Denial path does audit logging - redo the check in ref-walk mode. */
	if (nd->flags & LOOKUP_RCU)
		return -ECHILD;

	audit_inode(nd->name, nd->stack[0].link.dentry, 0);
	audit_log_path_denied(AUDIT_ANOM_LINK, "follow_link");
	return -EACCES;
}
1197  
1198  /**
1199   * safe_hardlink_source - Check for safe hardlink conditions
1200   * @idmap: idmap of the mount the inode was found from
1201   * @inode: the source inode to hardlink from
1202   *
1203   * Return false if at least one of the following conditions:
1204   *    - inode is not a regular file
1205   *    - inode is setuid
1206   *    - inode is setgid and group-exec
1207   *    - access failure for read and write
1208   *
1209   * Otherwise returns true.
1210   */
safe_hardlink_source(struct mnt_idmap * idmap,struct inode * inode)1211  static bool safe_hardlink_source(struct mnt_idmap *idmap,
1212  				 struct inode *inode)
1213  {
1214  	umode_t mode = inode->i_mode;
1215  
1216  	/* Special files should not get pinned to the filesystem. */
1217  	if (!S_ISREG(mode))
1218  		return false;
1219  
1220  	/* Setuid files should not get pinned to the filesystem. */
1221  	if (mode & S_ISUID)
1222  		return false;
1223  
1224  	/* Executable setgid files should not get pinned to the filesystem. */
1225  	if ((mode & (S_ISGID | S_IXGRP)) == (S_ISGID | S_IXGRP))
1226  		return false;
1227  
1228  	/* Hardlinking to unreadable or unwritable sources is dangerous. */
1229  	if (inode_permission(idmap, inode, MAY_READ | MAY_WRITE))
1230  		return false;
1231  
1232  	return true;
1233  }
1234  
/**
 * may_linkat - Check permissions for creating a hardlink
 * @idmap: idmap of the mount the inode was found from
 * @link:  the source to hardlink from
 *
 * Block hardlink when all of:
 *  - sysctl_protected_hardlinks enabled
 *  - fsuid does not match inode
 *  - hardlink source is unsafe (see safe_hardlink_source() above)
 *  - not CAP_FOWNER in a namespace with the inode owner uid mapped
 *
 * If the inode has been found through an idmapped mount the idmap of
 * the vfsmount must be passed through @idmap. This function will then take
 * care to map the inode according to @idmap before checking permissions.
 * On non-idmapped mounts or if permission checking is to be performed on the
 * raw inode simply pass @nop_mnt_idmap.
 *
 * Returns 0 if successful, -ve on error.
 */
int may_linkat(struct mnt_idmap *idmap, const struct path *link)
{
	struct inode *inode = link->dentry->d_inode;

	/* Inode writeback is not safe when the uid or gid are invalid. */
	if (!vfsuid_valid(i_uid_into_vfsuid(idmap, inode)) ||
	    !vfsgid_valid(i_gid_into_vfsgid(idmap, inode)))
		return -EOVERFLOW;

	if (!sysctl_protected_hardlinks)
		return 0;

	/* Source inode owner (or CAP_FOWNER) can hardlink all they like,
	 * otherwise, it must be a safe source.
	 */
	if (safe_hardlink_source(idmap, inode) ||
	    inode_owner_or_capable(idmap, inode))
		return 0;

	audit_log_path_denied(AUDIT_ANOM_LINK, "linkat");
	return -EPERM;
}
1276  
/**
 * may_create_in_sticky - Check whether an O_CREAT open in a sticky directory
 *			  should be allowed, or not, on files that already
 *			  exist.
 * @idmap: idmap of the mount the inode was found from
 * @nd: nameidata pathwalk data
 * @inode: the inode of the file to open
 *
 * Block an O_CREAT open of a FIFO (or a regular file) when:
 *   - sysctl_protected_fifos (or sysctl_protected_regular) is enabled
 *   - the file already exists
 *   - we are in a sticky directory
 *   - we don't own the file
 *   - the owner of the directory doesn't own the file
 *   - the directory is world writable
 * If the sysctl_protected_fifos (or sysctl_protected_regular) is set to 2
 * the directory doesn't have to be world writable: being group writable will
 * be enough.
 *
 * If the inode has been found through an idmapped mount the idmap of
 * the vfsmount must be passed through @idmap. This function will then take
 * care to map the inode according to @idmap before checking permissions.
 * On non-idmapped mounts or if permission checking is to be performed on the
 * raw inode simply pass @nop_mnt_idmap.
 *
 * Returns 0 if the open is allowed, -ve on error.
 */
static int may_create_in_sticky(struct mnt_idmap *idmap, struct nameidata *nd,
				struct inode *const inode)
{
	umode_t dir_mode = nd->dir_mode;
	vfsuid_t dir_vfsuid = nd->dir_vfsuid, i_vfsuid;

	/* Only sticky directories are subject to these protections. */
	if (likely(!(dir_mode & S_ISVTX)))
		return 0;

	if (S_ISREG(inode->i_mode) && !sysctl_protected_regular)
		return 0;

	if (S_ISFIFO(inode->i_mode) && !sysctl_protected_fifos)
		return 0;

	i_vfsuid = i_uid_into_vfsuid(idmap, inode);

	/* Allowed if the directory owner owns the file... */
	if (vfsuid_eq(i_vfsuid, dir_vfsuid))
		return 0;

	/* ... or we own the file ourselves. */
	if (vfsuid_eq_kuid(i_vfsuid, current_fsuid()))
		return 0;

	/* World-writable sticky directory: always deny at level >= 1. */
	if (likely(dir_mode & 0002)) {
		audit_log_path_denied(AUDIT_ANOM_CREAT, "sticky_create");
		return -EACCES;
	}

	/* Group-writable sticky directory: deny only at level 2. */
	if (dir_mode & 0020) {
		if (sysctl_protected_fifos >= 2 && S_ISFIFO(inode->i_mode)) {
			audit_log_path_denied(AUDIT_ANOM_CREAT,
					      "sticky_create_fifo");
			return -EACCES;
		}

		if (sysctl_protected_regular >= 2 && S_ISREG(inode->i_mode)) {
			audit_log_path_denied(AUDIT_ANOM_CREAT,
					      "sticky_create_regular");
			return -EACCES;
		}
	}

	return 0;
}
1348  
/*
 * follow_up - Find the mountpoint of path's vfsmount
 *
 * Given a path, find the mountpoint of its source file system.
 * Replace @path with the path of the mountpoint in the parent mount.
 * Up is towards /.
 *
 * Return 1 if we went up a level and 0 if we were already at the
 * root.
 */
int follow_up(struct path *path)
{
	struct mount *mnt = real_mount(path->mnt);
	struct mount *parent;
	struct dentry *mountpoint;

	/* mount_lock stabilizes mnt_parent / mnt_mountpoint */
	read_seqlock_excl(&mount_lock);
	parent = mnt->mnt_parent;
	if (parent == mnt) {
		/* a mount that is its own parent is the root of the tree */
		read_sequnlock_excl(&mount_lock);
		return 0;
	}
	/* pin the parent mount and mountpoint before dropping the lock */
	mntget(&parent->mnt);
	mountpoint = dget(mnt->mnt_mountpoint);
	read_sequnlock_excl(&mount_lock);
	dput(path->dentry);
	path->dentry = mountpoint;
	mntput(path->mnt);
	path->mnt = &parent->mnt;
	return 1;
}
EXPORT_SYMBOL(follow_up);
1381  
/*
 * Walk up the mount tree from @m looking for the first mountpoint that is
 * not itself the root of its parent mount, stopping if we reach @root.
 * On success fills *@path with the mountpoint in the parent mount and
 * records its d_seq snapshot in *@seqp for later legitimization.
 * Caller provides RCU protection; no references are taken here.
 */
static bool choose_mountpoint_rcu(struct mount *m, const struct path *root,
				  struct path *path, unsigned *seqp)
{
	while (mnt_has_parent(m)) {
		struct dentry *mountpoint = m->mnt_mountpoint;

		m = m->mnt_parent;
		/* don't climb past the caller's root */
		if (unlikely(root->dentry == mountpoint &&
			     root->mnt == &m->mnt))
			break;
		if (mountpoint != m->mnt.mnt_root) {
			path->mnt = &m->mnt;
			path->dentry = mountpoint;
			*seqp = read_seqcount_begin(&mountpoint->d_seq);
			return true;
		}
	}
	return false;
}
1401  
/*
 * Reference-taking wrapper around choose_mountpoint_rcu(): retry the RCU
 * walk until either a stable "no mountpoint" answer is seen (mount_lock
 * unchanged) or the found path is successfully legitimized with real
 * references.
 */
static bool choose_mountpoint(struct mount *m, const struct path *root,
			      struct path *path)
{
	bool found;

	rcu_read_lock();
	while (1) {
		unsigned seq, mseq = read_seqbegin(&mount_lock);

		found = choose_mountpoint_rcu(m, root, path, &seq);
		if (unlikely(!found)) {
			/* negative result only counts if mounts didn't move */
			if (!read_seqretry(&mount_lock, mseq))
				break;
		} else {
			if (likely(__legitimize_path(path, seq, mseq)))
				break;
			/* legitimization failed - drop refs and retry */
			rcu_read_unlock();
			path_put(path);
			rcu_read_lock();
		}
	}
	rcu_read_unlock();
	return found;
}
1426  
/*
 * Perform an automount
 * - return -EISDIR to tell follow_managed() to stop and return the path we
 *   were called with.
 */
static int follow_automount(struct path *path, int *count, unsigned lookup_flags)
{
	struct dentry *dentry = path->dentry;

	/* We don't want to mount if someone's just doing a stat -
	 * unless they're stat'ing a directory and appended a '/' to
	 * the name.
	 *
	 * We do, however, want to mount if someone wants to open or
	 * create a file of any type under the mountpoint, wants to
	 * traverse through the mountpoint or wants to open the
	 * mounted directory.  Also, autofs may mark negative dentries
	 * as being automount points.  These will need the attentions
	 * of the daemon to instantiate them before they can be used.
	 */
	if (!(lookup_flags & (LOOKUP_PARENT | LOOKUP_DIRECTORY |
			   LOOKUP_OPEN | LOOKUP_CREATE | LOOKUP_AUTOMOUNT)) &&
	    dentry->d_inode)
		return -EISDIR;

	/* bound the number of automounts per walk, reusing the symlink limit */
	if (count && (*count)++ >= MAXSYMLINKS)
		return -ELOOP;

	return finish_automount(dentry->d_op->d_automount(path), path);
}
1457  
/*
 * mount traversal - out-of-line part.  One note on ->d_flags accesses -
 * dentries are pinned but not locked here, so negative dentry can go
 * positive right under us.  Use of smp_load_acquire() provides a barrier
 * sufficient for ->d_inode and ->d_flags consistency.
 *
 * Repeatedly crosses mounts stacked on *@path (and triggers automounts)
 * until an unmanaged dentry is reached.  *@jumped reports whether we
 * ended up on a different mount than we started on.
 */
static int __traverse_mounts(struct path *path, unsigned flags, bool *jumped,
			     int *count, unsigned lookup_flags)
{
	struct vfsmount *mnt = path->mnt;
	bool need_mntput = false;
	int ret = 0;

	while (flags & DCACHE_MANAGED_DENTRY) {
		/* Allow the filesystem to manage the transit without i_mutex
		 * being held. */
		if (flags & DCACHE_MANAGE_TRANSIT) {
			ret = path->dentry->d_op->d_manage(path, false);
			flags = smp_load_acquire(&path->dentry->d_flags);
			if (ret < 0)
				break;
		}

		if (flags & DCACHE_MOUNTED) {	// something's mounted on it..
			struct vfsmount *mounted = lookup_mnt(path);
			if (mounted) {		// ... in our namespace
				dput(path->dentry);
				if (need_mntput)
					mntput(path->mnt);
				path->mnt = mounted;
				path->dentry = dget(mounted->mnt_root);
				// here we know it's positive
				flags = path->dentry->d_flags;
				need_mntput = true;
				continue;
			}
		}

		if (!(flags & DCACHE_NEED_AUTOMOUNT))
			break;

		// uncovered automount point
		ret = follow_automount(path, count, lookup_flags);
		flags = smp_load_acquire(&path->dentry->d_flags);
		if (ret < 0)
			break;
	}

	/* -EISDIR from follow_automount() means "stop here, not an error" */
	if (ret == -EISDIR)
		ret = 0;
	// possible if you race with several mount --move
	if (need_mntput && path->mnt == mnt)
		mntput(path->mnt);
	if (!ret && unlikely(d_flags_negative(flags)))
		ret = -ENOENT;
	*jumped = need_mntput;
	return ret;
}
1516  
/*
 * Cross whatever is mounted on *@path.  Inlined fastpath for the common
 * case of an unmanaged dentry; everything else is handled out of line by
 * __traverse_mounts().
 */
static inline int traverse_mounts(struct path *path, bool *jumped,
				  int *count, unsigned lookup_flags)
{
	unsigned flags = smp_load_acquire(&path->dentry->d_flags);

	/* fastpath */
	if (likely(!(flags & DCACHE_MANAGED_DENTRY))) {
		*jumped = false;
		if (unlikely(d_flags_negative(flags)))
			return -ENOENT;
		return 0;
	}
	return __traverse_mounts(path, flags, jumped, count, lookup_flags);
}
1531  
follow_down_one(struct path * path)1532  int follow_down_one(struct path *path)
1533  {
1534  	struct vfsmount *mounted;
1535  
1536  	mounted = lookup_mnt(path);
1537  	if (mounted) {
1538  		dput(path->dentry);
1539  		mntput(path->mnt);
1540  		path->mnt = mounted;
1541  		path->dentry = dget(mounted->mnt_root);
1542  		return 1;
1543  	}
1544  	return 0;
1545  }
1546  EXPORT_SYMBOL(follow_down_one);
1547  
/*
 * Follow down to the covering mount currently visible to userspace.  At each
 * point, the filesystem owning that dentry may be queried as to whether the
 * caller is permitted to proceed or not.
 */
int follow_down(struct path *path, unsigned int flags)
{
	struct vfsmount *mnt = path->mnt;
	bool jumped;
	int ret = traverse_mounts(path, &jumped, NULL, flags);

	/* traverse_mounts() pinned the new mount; drop the one we came with */
	if (path->mnt != mnt)
		mntput(mnt);
	return ret;
}
EXPORT_SYMBOL(follow_down);
1564  
/*
 * Try to skip to top of mountpoint pile in rcuwalk mode.  Fail if
 * we meet a managed dentry that would need blocking.
 *
 * Returning false makes the caller fall back to ref-walk (or fail);
 * returning true means *path (and nd->next_seq) describe the topmost
 * mount reachable without blocking.
 */
static bool __follow_mount_rcu(struct nameidata *nd, struct path *path)
{
	struct dentry *dentry = path->dentry;
	unsigned int flags = dentry->d_flags;

	if (likely(!(flags & DCACHE_MANAGED_DENTRY)))
		return true;

	/* crossing a mount would violate LOOKUP_NO_XDEV - let caller sort it out */
	if (unlikely(nd->flags & LOOKUP_NO_XDEV))
		return false;

	for (;;) {
		/*
		 * Don't forget we might have a non-mountpoint managed dentry
		 * that wants to block transit.
		 */
		if (unlikely(flags & DCACHE_MANAGE_TRANSIT)) {
			int res = dentry->d_op->d_manage(path, true);
			if (res)
				return res == -EISDIR;
			flags = dentry->d_flags;
		}

		if (flags & DCACHE_MOUNTED) {
			struct mount *mounted = __lookup_mnt(path->mnt, dentry);
			if (mounted) {
				path->mnt = &mounted->mnt;
				dentry = path->dentry = mounted->mnt.mnt_root;
				nd->state |= ND_JUMPED;
				nd->next_seq = read_seqcount_begin(&dentry->d_seq);
				flags = dentry->d_flags;
				// makes sure that non-RCU pathwalk could reach
				// this state.
				if (read_seqretry(&mount_lock, nd->m_seq))
					return false;
				continue;
			}
			if (read_seqretry(&mount_lock, nd->m_seq))
				return false;
		}
		/* an uncovered automount point can't be handled locklessly */
		return !(flags & DCACHE_NEED_AUTOMOUNT);
	}
}
1612  
/*
 * Cross mounts on @dentry, producing the resulting *@path.  In RCU mode
 * first try the lockless variant; if that fails, legitimize @dentry and
 * fall through to the reference-taking traversal.  On error any
 * references taken into *@path are dropped here.
 */
static inline int handle_mounts(struct nameidata *nd, struct dentry *dentry,
			  struct path *path)
{
	bool jumped;
	int ret;

	path->mnt = nd->path.mnt;
	path->dentry = dentry;
	if (nd->flags & LOOKUP_RCU) {
		unsigned int seq = nd->next_seq;
		if (likely(__follow_mount_rcu(nd, path)))
			return 0;
		// *path and nd->next_seq might've been clobbered
		path->mnt = nd->path.mnt;
		path->dentry = dentry;
		nd->next_seq = seq;
		if (!try_to_unlazy_next(nd, dentry))
			return -ECHILD;
	}
	ret = traverse_mounts(path, &jumped, &nd->total_link_count, nd->flags);
	if (jumped) {
		if (unlikely(nd->flags & LOOKUP_NO_XDEV))
			ret = -EXDEV;
		else
			nd->state |= ND_JUMPED;
	}
	if (unlikely(ret)) {
		dput(path->dentry);
		if (path->mnt != nd->path.mnt)
			mntput(path->mnt);
	}
	return ret;
}
1646  
1647  /*
1648   * This looks up the name in dcache and possibly revalidates the found dentry.
1649   * NULL is returned if the dentry does not exist in the cache.
1650   */
lookup_dcache(const struct qstr * name,struct dentry * dir,unsigned int flags)1651  static struct dentry *lookup_dcache(const struct qstr *name,
1652  				    struct dentry *dir,
1653  				    unsigned int flags)
1654  {
1655  	struct dentry *dentry = d_lookup(dir, name);
1656  	if (dentry) {
1657  		int error = d_revalidate(dir->d_inode, name, dentry, flags);
1658  		if (unlikely(error <= 0)) {
1659  			if (!error)
1660  				d_invalidate(dentry);
1661  			dput(dentry);
1662  			return ERR_PTR(error);
1663  		}
1664  	}
1665  	return dentry;
1666  }
1667  
/*
 * Parent directory has inode locked exclusive.  This is one
 * and only case when ->lookup() gets called on non in-lookup
 * dentries - as the matter of fact, this only gets called
 * when directory is guaranteed to have no in-lookup children
 * at all.
 * Will return -ENOENT if name isn't found and LOOKUP_CREATE wasn't passed.
 * Will return -EEXIST if name is found and LOOKUP_EXCL was passed.
 */
struct dentry *lookup_one_qstr_excl(const struct qstr *name,
				    struct dentry *base,
				    unsigned int flags)
{
	struct dentry *dentry = lookup_dcache(name, base, flags);
	struct dentry *old;
	struct inode *dir = base->d_inode;

	/* dcache hit (or revalidation error) - skip the ->lookup() call */
	if (dentry)
		goto found;

	/* Don't create child dentry for a dead directory. */
	if (unlikely(IS_DEADDIR(dir)))
		return ERR_PTR(-ENOENT);

	dentry = d_alloc(base, name);
	if (unlikely(!dentry))
		return ERR_PTR(-ENOMEM);

	/* ->lookup() may return a different dentry to use instead of ours */
	old = dir->i_op->lookup(dir, dentry, flags);
	if (unlikely(old)) {
		dput(dentry);
		dentry = old;
	}
found:
	if (IS_ERR(dentry))
		return dentry;
	if (d_is_negative(dentry) && !(flags & LOOKUP_CREATE)) {
		dput(dentry);
		return ERR_PTR(-ENOENT);
	}
	if (d_is_positive(dentry) && (flags & LOOKUP_EXCL)) {
		dput(dentry);
		return ERR_PTR(-EEXIST);
	}
	return dentry;
}
EXPORT_SYMBOL(lookup_one_qstr_excl);
1715  
/**
 * lookup_fast - do fast lockless (but racy) lookup of a dentry
 * @nd: current nameidata
 *
 * Do a fast, but racy lookup in the dcache for the given dentry, and
 * revalidate it. Returns a valid dentry pointer or NULL if one wasn't
 * found. On error, an ERR_PTR will be returned.
 *
 * If this function returns a valid dentry and the walk is no longer
 * lazy, the dentry will carry a reference that must later be put. If
 * RCU mode is still in force, then this is not the case and the dentry
 * must be legitimized before use. If this returns NULL, then the walk
 * will no longer be in RCU mode.
 */
static struct dentry *lookup_fast(struct nameidata *nd)
{
	struct dentry *dentry, *parent = nd->path.dentry;
	int status = 1;

	/*
	 * Rename seqlock is not required here because in the off chance
	 * of a false negative due to a concurrent rename, the caller is
	 * going to fall back to non-racy lookup.
	 */
	if (nd->flags & LOOKUP_RCU) {
		dentry = __d_lookup_rcu(parent, &nd->last, &nd->next_seq);
		if (unlikely(!dentry)) {
			/* cache miss: drop to ref-walk before returning NULL */
			if (!try_to_unlazy(nd))
				return ERR_PTR(-ECHILD);
			return NULL;
		}

		/*
		 * This sequence count validates that the parent had no
		 * changes while we did the lookup of the dentry above.
		 */
		if (read_seqcount_retry(&parent->d_seq, nd->seq))
			return ERR_PTR(-ECHILD);

		status = d_revalidate(nd->inode, &nd->last, dentry, nd->flags);
		if (likely(status > 0))
			return dentry;
		/* revalidation needs refs (or said -ECHILD): leave RCU mode */
		if (!try_to_unlazy_next(nd, dentry))
			return ERR_PTR(-ECHILD);
		if (status == -ECHILD)
			/* we'd been told to redo it in non-rcu mode */
			status = d_revalidate(nd->inode, &nd->last,
					      dentry, nd->flags);
	} else {
		dentry = __d_lookup(parent, &nd->last);
		if (unlikely(!dentry))
			return NULL;
		status = d_revalidate(nd->inode, &nd->last, dentry, nd->flags);
	}
	/* status: >0 valid, 0 stale (invalidate), <0 error */
	if (unlikely(status <= 0)) {
		if (!status)
			d_invalidate(dentry);
		dput(dentry);
		return ERR_PTR(status);
	}
	return dentry;
}
1778  
/*
 * Fast lookup failed, do it the slow way: allocate an in-lookup dentry
 * (coordinating with concurrent lookups of the same name) and call the
 * filesystem's ->lookup().  Caller holds the directory's inode lock at
 * least shared.
 */
static struct dentry *__lookup_slow(const struct qstr *name,
				    struct dentry *dir,
				    unsigned int flags)
{
	struct dentry *dentry, *old;
	struct inode *inode = dir->d_inode;
	DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq);

	/* Don't go there if it's already dead */
	if (unlikely(IS_DEADDIR(inode)))
		return ERR_PTR(-ENOENT);
again:
	dentry = d_alloc_parallel(dir, name, &wq);
	if (IS_ERR(dentry))
		return dentry;
	if (unlikely(!d_in_lookup(dentry))) {
		/* somebody else finished the lookup first - revalidate theirs */
		int error = d_revalidate(inode, name, dentry, flags);
		if (unlikely(error <= 0)) {
			if (!error) {
				/* stale: invalidate and retry from scratch */
				d_invalidate(dentry);
				dput(dentry);
				goto again;
			}
			dput(dentry);
			dentry = ERR_PTR(error);
		}
	} else {
		/* we own the in-lookup dentry - ask the filesystem */
		old = inode->i_op->lookup(inode, dentry, flags);
		d_lookup_done(dentry);
		if (unlikely(old)) {
			/* fs supplied a different dentry to use instead */
			dput(dentry);
			dentry = old;
		}
	}
	return dentry;
}
1816  
/*
 * Locking wrapper for __lookup_slow(): takes the directory's inode lock
 * shared for the duration of the lookup.
 */
static struct dentry *lookup_slow(const struct qstr *name,
				  struct dentry *dir,
				  unsigned int flags)
{
	struct inode *inode = dir->d_inode;
	struct dentry *res;
	inode_lock_shared(inode);
	res = __lookup_slow(name, dir, flags);
	inode_unlock_shared(inode);
	return res;
}
1828  
/*
 * Check MAY_EXEC ("search") permission on the directory we are about to
 * walk through.  In RCU mode the check runs with MAY_NOT_BLOCK, so a
 * failure may be transient; drop to ref-walk and retry before treating
 * it as final.
 */
static inline int may_lookup(struct mnt_idmap *idmap,
			     struct nameidata *restrict nd)
{
	int err, mask;

	mask = nd->flags & LOOKUP_RCU ? MAY_NOT_BLOCK : 0;
	err = inode_permission(idmap, nd->inode, mask | MAY_EXEC);
	if (likely(!err))
		return 0;

	// If we failed, and we weren't in LOOKUP_RCU, it's final
	if (!(nd->flags & LOOKUP_RCU))
		return err;

	// Drop out of RCU mode to make sure it wasn't transient
	if (!try_to_unlazy(nd))
		return -ECHILD;	// redo it all non-lazy

	if (err != -ECHILD)	// hard error
		return err;

	return inode_permission(idmap, nd->inode, MAY_EXEC);
}
1852  
/*
 * Make room on nd->stack for one more saved symlink and account it
 * against MAXSYMLINKS.  May drop out of RCU mode if the embedded
 * stack has to be replaced by a heap allocation.
 */
static int reserve_stack(struct nameidata *nd, struct path *link)
{
	if (unlikely(nd->total_link_count++ >= MAXSYMLINKS))
		return -ELOOP;

	/* still room in the current stack */
	if (likely(nd->depth != EMBEDDED_LEVELS))
		return 0;
	/* already switched to a heap-allocated stack */
	if (likely(nd->stack != nd->internal))
		return 0;
	if (likely(nd_alloc_stack(nd)))
		return 0;

	/*
	 * Allocation failed; under RCU, grab the link, leave RCU mode
	 * and retry the allocation once in ref-walk.
	 */
	if (nd->flags & LOOKUP_RCU) {
		// we need to grab link before we do unlazy.  And we can't skip
		// unlazy even if we fail to grab the link - cleanup needs it
		bool grabbed_link = legitimize_path(nd, link, nd->next_seq);

		if (!try_to_unlazy(nd) || !grabbed_link)
			return -ECHILD;

		if (nd_alloc_stack(nd))
			return 0;
	}
	return -ENOMEM;
}
1878  
/* flags for walk_component() / step_into() / pick_link() */
enum {WALK_TRAILING = 1, WALK_MORE = 2, WALK_NOFOLLOW = 4};
1880  
/*
 * Push a symlink onto nd->stack and return its body to be walked,
 * NULL if the link resolved to nothing more to do, or ERR_PTR() on
 * failure.
 */
static const char *pick_link(struct nameidata *nd, struct path *link,
		     struct inode *inode, int flags)
{
	struct saved *last;
	const char *res;
	int error = reserve_stack(nd, link);

	if (unlikely(error)) {
		/* in ref-walk mode we hold a reference on *link - drop it */
		if (!(nd->flags & LOOKUP_RCU))
			path_put(link);
		return ERR_PTR(error);
	}
	last = nd->stack + nd->depth++;
	last->link = *link;
	clear_delayed_call(&last->done);
	last->seq = nd->next_seq;

	if (flags & WALK_TRAILING) {
		error = may_follow_link(nd, inode);
		if (unlikely(error))
			return ERR_PTR(error);
	}

	/* caller or mount may forbid following symlinks entirely */
	if (unlikely(nd->flags & LOOKUP_NO_SYMLINKS) ||
			unlikely(link->mnt->mnt_flags & MNT_NOSYMFOLLOW))
		return ERR_PTR(-ELOOP);

	if (!(nd->flags & LOOKUP_RCU)) {
		touch_atime(&last->link);
		cond_resched();
	} else if (atime_needs_update(&last->link, inode)) {
		/* atime update may block - leave RCU mode first */
		if (!try_to_unlazy(nd))
			return ERR_PTR(-ECHILD);
		touch_atime(&last->link);
	}

	error = security_inode_follow_link(link->dentry, inode,
					   nd->flags & LOOKUP_RCU);
	if (unlikely(error))
		return ERR_PTR(error);

	/* fast path: body cached in ->i_link; otherwise ask ->get_link() */
	res = READ_ONCE(inode->i_link);
	if (!res) {
		const char * (*get)(struct dentry *, struct inode *,
				struct delayed_call *);
		get = inode->i_op->get_link;
		if (nd->flags & LOOKUP_RCU) {
			res = get(NULL, inode, &last->done);
			/* -ECHILD from ->get_link() means "retry in ref-walk" */
			if (res == ERR_PTR(-ECHILD) && try_to_unlazy(nd))
				res = get(link->dentry, inode, &last->done);
		} else {
			res = get(link->dentry, inode, &last->done);
		}
		if (!res)
			goto all_done;
		if (IS_ERR(res))
			return res;
	}
	if (*res == '/') {
		/* absolute symlink: restart from root, skip extra slashes */
		error = nd_jump_root(nd);
		if (unlikely(error))
			return ERR_PTR(error);
		while (unlikely(*++res == '/'))
			;
	}
	if (*res)
		return res;
all_done: // pure jump
	/* empty body - the link is fully consumed */
	put_link(nd);
	return NULL;
}
1952  
/*
 * Do we need to follow links? We _really_ want to be able
 * to do this check without having to look at inode->i_op,
 * so we keep a cache of "no, this doesn't need follow_link"
 * for the common case.
 *
 * NOTE: dentry must be what nd->next_seq had been sampled from.
 */
static const char *step_into(struct nameidata *nd, int flags,
		     struct dentry *dentry)
{
	struct path path;
	struct inode *inode;
	int err = handle_mounts(nd, dentry, &path);

	if (err < 0)
		return ERR_PTR(err);
	inode = path.dentry->d_inode;
	if (likely(!d_is_symlink(path.dentry)) ||
	   ((flags & WALK_TRAILING) && !(nd->flags & LOOKUP_FOLLOW)) ||
	   (flags & WALK_NOFOLLOW)) {
		/* not a symlink or should not follow */
		if (nd->flags & LOOKUP_RCU) {
			/* check the dentry/inode pair is still current */
			if (read_seqcount_retry(&path.dentry->d_seq, nd->next_seq))
				return ERR_PTR(-ECHILD);
			if (unlikely(!inode))
				return ERR_PTR(-ENOENT);
		} else {
			/* drop references to the old position */
			dput(nd->path.dentry);
			if (nd->path.mnt != path.mnt)
				mntput(nd->path.mnt);
		}
		nd->path = path;
		nd->inode = inode;
		nd->seq = nd->next_seq;
		return NULL;
	}
	if (nd->flags & LOOKUP_RCU) {
		/* make sure that d_is_symlink above matches inode */
		if (read_seqcount_retry(&path.dentry->d_seq, nd->next_seq))
			return ERR_PTR(-ECHILD);
	} else {
		/* pick_link() takes ownership of path's references */
		if (path.mnt == nd->path.mnt)
			mntget(path.mnt);
	}
	return pick_link(nd, &path, inode, flags);
}
2000  
/*
 * RCU-walk "..": step to the parent, crossing a mount boundary if we
 * are at a mount root.  Every transition is validated against the
 * mount/dentry seqcounts; any doubt yields -ECHILD (retry in ref-walk).
 */
static struct dentry *follow_dotdot_rcu(struct nameidata *nd)
{
	struct dentry *parent, *old;

	if (path_equal(&nd->path, &nd->root))
		goto in_root;
	if (unlikely(nd->path.dentry == nd->path.mnt->mnt_root)) {
		/* at a mount root - hop down to the mountpoint beneath */
		struct path path;
		unsigned seq;
		if (!choose_mountpoint_rcu(real_mount(nd->path.mnt),
					   &nd->root, &path, &seq))
			goto in_root;
		/* -ECHILD here; the ref-walk fallback reports -EXDEV */
		if (unlikely(nd->flags & LOOKUP_NO_XDEV))
			return ERR_PTR(-ECHILD);
		nd->path = path;
		nd->inode = path.dentry->d_inode;
		nd->seq = seq;
		// makes sure that non-RCU pathwalk could reach this state
		if (read_seqretry(&mount_lock, nd->m_seq))
			return ERR_PTR(-ECHILD);
		/* we know that mountpoint was pinned */
	}
	old = nd->path.dentry;
	parent = old->d_parent;
	nd->next_seq = read_seqcount_begin(&parent->d_seq);
	// makes sure that non-RCU pathwalk could reach this state
	if (read_seqcount_retry(&old->d_seq, nd->seq))
		return ERR_PTR(-ECHILD);
	if (unlikely(!path_connected(nd->path.mnt, parent)))
		return ERR_PTR(-ECHILD);
	return parent;
in_root:
	/* ".." at (or above) the root stays put */
	if (read_seqretry(&mount_lock, nd->m_seq))
		return ERR_PTR(-ECHILD);
	if (unlikely(nd->flags & LOOKUP_BENEATH))
		return ERR_PTR(-ECHILD);
	nd->next_seq = nd->seq;
	return nd->path.dentry;
}
2040  
/*
 * Ref-walk "..": cross a mount boundary if we're at a mount root,
 * then take a reference on the parent dentry.
 */
static struct dentry *follow_dotdot(struct nameidata *nd)
{
	struct dentry *parent;

	if (path_equal(&nd->path, &nd->root))
		goto in_root;
	if (unlikely(nd->path.dentry == nd->path.mnt->mnt_root)) {
		/* at a mount root - move down to the mountpoint beneath */
		struct path path;

		if (!choose_mountpoint(real_mount(nd->path.mnt),
				       &nd->root, &path))
			goto in_root;
		path_put(&nd->path);
		nd->path = path;
		nd->inode = path.dentry->d_inode;
		if (unlikely(nd->flags & LOOKUP_NO_XDEV))
			return ERR_PTR(-EXDEV);
	}
	/* rare case of legitimate dget_parent()... */
	parent = dget_parent(nd->path.dentry);
	if (unlikely(!path_connected(nd->path.mnt, parent))) {
		dput(parent);
		return ERR_PTR(-ENOENT);
	}
	return parent;

in_root:
	/* ".." at (or above) the root stays put */
	if (unlikely(nd->flags & LOOKUP_BENEATH))
		return ERR_PTR(-EXDEV);
	return dget(nd->path.dentry);
}
2072  
/*
 * Handle "." and ".." components.  "." needs no work; ".." steps into
 * the parent and, for scoped lookups, verifies that no concurrent
 * rename/mount could have carried us above nd->root.
 */
static const char *handle_dots(struct nameidata *nd, int type)
{
	if (type == LAST_DOTDOT) {
		const char *error = NULL;
		struct dentry *parent;

		if (!nd->root.mnt) {
			error = ERR_PTR(set_root(nd));
			if (error)
				return error;
		}
		if (nd->flags & LOOKUP_RCU)
			parent = follow_dotdot_rcu(nd);
		else
			parent = follow_dotdot(nd);
		if (IS_ERR(parent))
			return ERR_CAST(parent);
		/* step into the parent without symlink following */
		error = step_into(nd, WALK_NOFOLLOW, parent);
		if (unlikely(error))
			return error;

		if (unlikely(nd->flags & LOOKUP_IS_SCOPED)) {
			/*
			 * If there was a racing rename or mount along our
			 * path, then we can't be sure that ".." hasn't jumped
			 * above nd->root (and so userspace should retry or use
			 * some fallback).
			 */
			smp_rmb();
			if (__read_seqcount_retry(&mount_lock.seqcount, nd->m_seq))
				return ERR_PTR(-EAGAIN);
			if (__read_seqcount_retry(&rename_lock.seqcount, nd->r_seq))
				return ERR_PTR(-EAGAIN);
		}
	}
	return NULL;
}
2110  
/*
 * Look up nd->last in the current directory and advance into the
 * result.  Returns NULL on success, the body of a symlink that must
 * be walked, or ERR_PTR() on error.
 */
static const char *walk_component(struct nameidata *nd, int flags)
{
	struct dentry *dentry;
	/*
	 * "." and ".." are special - ".." especially so because it has
	 * to be able to know about the current root directory and
	 * parent relationships.
	 */
	if (unlikely(nd->last_type != LAST_NORM)) {
		if (!(flags & WALK_MORE) && nd->depth)
			put_link(nd);
		return handle_dots(nd, nd->last_type);
	}
	/* try the dcache first, fall back to ->lookup() */
	dentry = lookup_fast(nd);
	if (IS_ERR(dentry))
		return ERR_CAST(dentry);
	if (unlikely(!dentry)) {
		dentry = lookup_slow(&nd->last, nd->path.dentry, nd->flags);
		if (IS_ERR(dentry))
			return ERR_CAST(dentry);
	}
	/* done with the previous link unless the caller still needs it */
	if (!(flags & WALK_MORE) && nd->depth)
		put_link(nd);
	return step_into(nd, flags, dentry);
}
2136  
2137  /*
2138   * We can do the critical dentry name comparison and hashing
2139   * operations one word at a time, but we are limited to:
2140   *
2141   * - Architectures with fast unaligned word accesses. We could
2142   *   do a "get_unaligned()" if this helps and is sufficiently
2143   *   fast.
2144   *
2145   * - non-CONFIG_DEBUG_PAGEALLOC configurations (so that we
2146   *   do not trap on the (extremely unlikely) case of a page
2147   *   crossing operation.
2148   *
2149   * - Furthermore, we need an efficient 64-bit compile for the
2150   *   64-bit case in order to generate the "number of bytes in
2151   *   the final mask". Again, that could be replaced with a
2152   *   efficient population count instruction or similar.
2153   */
2154  #ifdef CONFIG_DCACHE_WORD_ACCESS
2155  
2156  #include <asm/word-at-a-time.h>
2157  
2158  #ifdef HASH_MIX
2159  
2160  /* Architecture provides HASH_MIX and fold_hash() in <asm/hash.h> */
2161  
2162  #elif defined(CONFIG_64BIT)
2163  /*
2164   * Register pressure in the mixing function is an issue, particularly
2165   * on 32-bit x86, but almost any function requires one state value and
2166   * one temporary.  Instead, use a function designed for two state values
2167   * and no temporaries.
2168   *
2169   * This function cannot create a collision in only two iterations, so
2170   * we have two iterations to achieve avalanche.  In those two iterations,
2171   * we have six layers of mixing, which is enough to spread one bit's
2172   * influence out to 2^6 = 64 state bits.
2173   *
2174   * Rotate constants are scored by considering either 64 one-bit input
2175   * deltas or 64*63/2 = 2016 two-bit input deltas, and finding the
2176   * probability of that delta causing a change to each of the 128 output
2177   * bits, using a sample of random initial states.
2178   *
2179   * The Shannon entropy of the computed probabilities is then summed
2180   * to produce a score.  Ideally, any input change has a 50% chance of
2181   * toggling any given output bit.
2182   *
2183   * Mixing scores (in bits) for (12,45):
2184   * Input delta: 1-bit      2-bit
2185   * 1 round:     713.3    42542.6
2186   * 2 rounds:   2753.7   140389.8
2187   * 3 rounds:   5954.1   233458.2
2188   * 4 rounds:   7862.6   256672.2
2189   * Perfect:    8192     258048
2190   *            (64*128) (64*63/2 * 128)
2191   */
2192  #define HASH_MIX(x, y, a)	\
2193  	(	x ^= (a),	\
2194  	y ^= x,	x = rol64(x,12),\
2195  	x += y,	y = rol64(y,45),\
2196  	y *= 9			)
2197  
2198  /*
2199   * Fold two longs into one 32-bit hash value.  This must be fast, but
2200   * latency isn't quite as critical, as there is a fair bit of additional
2201   * work done before the hash value is used.
2202   */
fold_hash(unsigned long x,unsigned long y)2203  static inline unsigned int fold_hash(unsigned long x, unsigned long y)
2204  {
2205  	y ^= x * GOLDEN_RATIO_64;
2206  	y *= GOLDEN_RATIO_64;
2207  	return y >> 32;
2208  }
2209  
2210  #else	/* 32-bit case */
2211  
2212  /*
2213   * Mixing scores (in bits) for (7,20):
2214   * Input delta: 1-bit      2-bit
2215   * 1 round:     330.3     9201.6
2216   * 2 rounds:   1246.4    25475.4
2217   * 3 rounds:   1907.1    31295.1
2218   * 4 rounds:   2042.3    31718.6
2219   * Perfect:    2048      31744
2220   *            (32*64)   (32*31/2 * 64)
2221   */
2222  #define HASH_MIX(x, y, a)	\
2223  	(	x ^= (a),	\
2224  	y ^= x,	x = rol32(x, 7),\
2225  	x += y,	y = rol32(y,20),\
2226  	y *= 9			)
2227  
static inline unsigned int fold_hash(unsigned long x, unsigned long y)
{
	/* Use arch-optimized multiply if one exists */
	unsigned int folded_x = __hash_32(x);

	return __hash_32(y ^ folded_x);
}
2233  
2234  #endif
2235  
/*
 * Return the hash of a string of known length.  This is carefully
 * designed to match hash_name(), which is the more critical function.
 * In particular, we must end by hashing a final word containing 0..7
 * payload bytes, to match the way that hash_name() iterates until it
 * finds the delimiter after the name.
 */
unsigned int full_name_hash(const void *salt, const char *name, unsigned int len)
{
	unsigned long a, x = 0, y = (unsigned long)salt;

	for (;;) {
		if (!len)
			goto done;
		a = load_unaligned_zeropad(name);
		if (len < sizeof(unsigned long))
			break;
		/* a full word: mix it in and advance */
		HASH_MIX(x, y, a);
		name += sizeof(unsigned long);
		len -= sizeof(unsigned long);
	}
	/* final partial word: keep only the 'len' valid bytes */
	x ^= a & bytemask_from_count(len);
done:
	return fold_hash(x, y);
}
EXPORT_SYMBOL(full_name_hash);
2262  
/* Return the "hash_len" (hash and length) of a null-terminated string */
u64 hashlen_string(const void *salt, const char *name)
{
	unsigned long a = 0, x = 0, y = (unsigned long)salt;
	unsigned long adata, mask, len;
	const struct word_at_a_time constants = WORD_AT_A_TIME_CONSTANTS;

	/* jump into the loop so the first word is loaded before mixing */
	len = 0;
	goto inside;

	do {
		HASH_MIX(x, y, a);
		len += sizeof(unsigned long);
inside:
		a = load_unaligned_zeropad(name+len);
	} while (!has_zero(a, &adata, &constants));

	/* mask the final word down to its valid (pre-NUL) bytes */
	adata = prep_zero_mask(a, adata, &constants);
	mask = create_zero_mask(adata);
	x ^= a & zero_bytemask(mask);

	return hashlen_create(fold_hash(x, y), len + find_zero(mask));
}
EXPORT_SYMBOL(hashlen_string);
2287  
/*
 * Calculate the length and hash of the path component, and
 * return the length as the result.
 */
static inline const char *hash_name(struct nameidata *nd,
				    const char *name,
				    unsigned long *lastword)
{
	unsigned long a, b, x, y = (unsigned long)nd->path.dentry;
	unsigned long adata, bdata, mask, len;
	const struct word_at_a_time constants = WORD_AT_A_TIME_CONSTANTS;

	/*
	 * The first iteration is special, because it can result in
	 * '.' and '..' and has no mixing other than the final fold.
	 */
	a = load_unaligned_zeropad(name);
	b = a ^ REPEAT_BYTE('/');	/* zero byte wherever there's a '/' */
	if (has_zero(a, &adata, &constants) | has_zero(b, &bdata, &constants)) {
		/* component fits in one word: mask it and report it */
		adata = prep_zero_mask(a, adata, &constants);
		bdata = prep_zero_mask(b, bdata, &constants);
		mask = create_zero_mask(adata | bdata);
		a &= zero_bytemask(mask);
		*lastword = a;
		len = find_zero(mask);
		nd->last.hash = fold_hash(a, y);
		nd->last.len = len;
		return name + len;
	}

	/* multi-word component: mix whole words until a NUL or '/' shows up */
	len = 0;
	x = 0;
	do {
		HASH_MIX(x, y, a);
		len += sizeof(unsigned long);
		a = load_unaligned_zeropad(name+len);
		b = a ^ REPEAT_BYTE('/');
	} while (!(has_zero(a, &adata, &constants) | has_zero(b, &bdata, &constants)));

	adata = prep_zero_mask(a, adata, &constants);
	bdata = prep_zero_mask(b, bdata, &constants);
	mask = create_zero_mask(adata | bdata);
	a &= zero_bytemask(mask);
	x ^= a;
	len += find_zero(mask);
	*lastword = 0;		// Multi-word components cannot be DOT or DOTDOT

	nd->last.hash = fold_hash(x, y);
	nd->last.len = len;
	return name + len;
}
2339  
2340  /*
2341   * Note that the 'last' word is always zero-masked, but
2342   * was loaded as a possibly big-endian word.
2343   */
2344  #ifdef __BIG_ENDIAN
2345    #define LAST_WORD_IS_DOT	(0x2eul << (BITS_PER_LONG-8))
2346    #define LAST_WORD_IS_DOTDOT	(0x2e2eul << (BITS_PER_LONG-16))
2347  #endif
2348  
2349  #else	/* !CONFIG_DCACHE_WORD_ACCESS: Slow, byte-at-a-time version */
2350  
/* Return the hash of a string of known length */
unsigned int full_name_hash(const void *salt, const char *name, unsigned int len)
{
	unsigned long hash = init_name_hash(salt);
	unsigned int i;

	/* feed each byte of the name into the running hash */
	for (i = 0; i < len; i++)
		hash = partial_name_hash((unsigned char)name[i], hash);
	return end_name_hash(hash);
}
EXPORT_SYMBOL(full_name_hash);
2360  
2361  /* Return the "hash_len" (hash and length) of a null-terminated string */
hashlen_string(const void * salt,const char * name)2362  u64 hashlen_string(const void *salt, const char *name)
2363  {
2364  	unsigned long hash = init_name_hash(salt);
2365  	unsigned long len = 0, c;
2366  
2367  	c = (unsigned char)*name;
2368  	while (c) {
2369  		len++;
2370  		hash = partial_name_hash(c, hash);
2371  		c = (unsigned char)name[len];
2372  	}
2373  	return hashlen_create(end_name_hash(hash), len);
2374  }
2375  EXPORT_SYMBOL(hashlen_string);
2376  
/*
 * We know there's a real path component here of at least
 * one character.
 */
static inline const char *hash_name(struct nameidata *nd, const char *name, unsigned long *lastword)
{
	unsigned long hash = init_name_hash(nd->path.dentry);
	unsigned long len = 0, c, last = 0;

	c = (unsigned char)*name;
	do {
		/* accumulate the bytes so "." / ".." can be recognized */
		last = (last << 8) + c;
		len++;
		hash = partial_name_hash(c, hash);
		c = (unsigned char)name[len];
	} while (c && c != '/');

	// This is reliable for DOT or DOTDOT, since the component
	// cannot contain NUL characters - top bits being zero means
	// we cannot have had any other pathnames.
	*lastword = last;
	nd->last.hash = end_name_hash(hash);
	nd->last.len = len;
	return name + len;
}
2402  
2403  #endif
2404  
2405  #ifndef LAST_WORD_IS_DOT
2406    #define LAST_WORD_IS_DOT	0x2e
2407    #define LAST_WORD_IS_DOTDOT	0x2e2e
2408  #endif
2409  
/*
 * Name resolution.
 * This is the basic name resolution function, turning a pathname into
 * the final dentry. We expect 'base' to be positive and a directory.
 *
 * Returns 0 and nd will have valid dentry and mnt on success.
 * Returns error and drops reference to input namei data on failure.
 */
static int link_path_walk(const char *name, struct nameidata *nd)
{
	int depth = 0; // depth <= nd->depth
	int err;

	nd->last_type = LAST_ROOT;
	nd->flags |= LOOKUP_PARENT;
	if (IS_ERR(name))
		return PTR_ERR(name);
	/* skip any leading slashes */
	while (*name=='/')
		name++;
	if (!*name) {
		nd->dir_mode = 0; // short-circuit the 'hardening' idiocy
		return 0;
	}

	/* At this point we know we have a real path component. */
	for(;;) {
		struct mnt_idmap *idmap;
		const char *link;
		unsigned long lastword;

		/* may we search the current directory? */
		idmap = mnt_idmap(nd->path.mnt);
		err = may_lookup(idmap, nd);
		if (err)
			return err;

		nd->last.name = name;
		name = hash_name(nd, name, &lastword);

		/* classify the component: "..", "." or a normal name */
		switch(lastword) {
		case LAST_WORD_IS_DOTDOT:
			nd->last_type = LAST_DOTDOT;
			nd->state |= ND_JUMPED;
			break;

		case LAST_WORD_IS_DOT:
			nd->last_type = LAST_DOT;
			break;

		default:
			nd->last_type = LAST_NORM;
			nd->state &= ~ND_JUMPED;

			struct dentry *parent = nd->path.dentry;
			if (unlikely(parent->d_flags & DCACHE_OP_HASH)) {
				/* filesystem supplies its own name hash */
				err = parent->d_op->d_hash(parent, &nd->last);
				if (err < 0)
					return err;
			}
		}

		if (!*name)
			goto OK;
		/*
		 * If it wasn't NUL, we know it was '/'. Skip that
		 * slash, and continue until no more slashes.
		 */
		do {
			name++;
		} while (unlikely(*name == '/'));
		if (unlikely(!*name)) {
OK:
			/* pathname or trailing symlink, done */
			if (!depth) {
				nd->dir_vfsuid = i_uid_into_vfsuid(idmap, nd->inode);
				nd->dir_mode = nd->inode->i_mode;
				nd->flags &= ~LOOKUP_PARENT;
				return 0;
			}
			/* last component of nested symlink */
			name = nd->stack[--depth].name;
			link = walk_component(nd, 0);
		} else {
			/* not the last component */
			link = walk_component(nd, WALK_MORE);
		}
		if (unlikely(link)) {
			if (IS_ERR(link))
				return PTR_ERR(link);
			/* a symlink to follow */
			nd->stack[depth++].name = name;
			name = link;
			continue;
		}
		/* component resolved to a non-directory mid-path */
		if (unlikely(!d_can_lookup(nd->path.dentry))) {
			if (nd->flags & LOOKUP_RCU) {
				if (!try_to_unlazy(nd))
					return -ECHILD;
			}
			return -ENOTDIR;
		}
	}
}
2512  
/* must be paired with terminate_walk() */
static const char *path_init(struct nameidata *nd, unsigned flags)
{
	int error;
	const char *s = nd->pathname;

	/* LOOKUP_CACHED requires RCU, ask caller to retry */
	if ((flags & (LOOKUP_RCU | LOOKUP_CACHED)) == LOOKUP_CACHED)
		return ERR_PTR(-EAGAIN);

	/* empty pathnames never use RCU mode */
	if (!*s)
		flags &= ~LOOKUP_RCU;
	if (flags & LOOKUP_RCU)
		rcu_read_lock();
	else
		nd->seq = nd->next_seq = 0;

	nd->flags = flags;
	nd->state |= ND_JUMPED;

	/* sample mount and rename seqcounts for later race detection */
	nd->m_seq = __read_seqcount_begin(&mount_lock.seqcount);
	nd->r_seq = __read_seqcount_begin(&rename_lock.seqcount);
	smp_rmb();

	if (nd->state & ND_ROOT_PRESET) {
		/* caller supplied nd->root; start the walk there */
		struct dentry *root = nd->root.dentry;
		struct inode *inode = root->d_inode;
		if (*s && unlikely(!d_can_lookup(root)))
			return ERR_PTR(-ENOTDIR);
		nd->path = nd->root;
		nd->inode = inode;
		if (flags & LOOKUP_RCU) {
			nd->seq = read_seqcount_begin(&nd->path.dentry->d_seq);
			nd->root_seq = nd->seq;
		} else {
			path_get(&nd->path);
		}
		return s;
	}

	nd->root.mnt = NULL;

	/* Absolute pathname -- fetch the root (LOOKUP_IN_ROOT uses nd->dfd). */
	if (*s == '/' && !(flags & LOOKUP_IN_ROOT)) {
		error = nd_jump_root(nd);
		if (unlikely(error))
			return ERR_PTR(error);
		return s;
	}

	/* Relative pathname -- get the starting-point it is relative to. */
	if (nd->dfd == AT_FDCWD) {
		if (flags & LOOKUP_RCU) {
			struct fs_struct *fs = current->fs;
			unsigned seq;

			/* sample pwd consistently against fs->seq */
			do {
				seq = read_seqcount_begin(&fs->seq);
				nd->path = fs->pwd;
				nd->inode = nd->path.dentry->d_inode;
				nd->seq = __read_seqcount_begin(&nd->path.dentry->d_seq);
			} while (read_seqcount_retry(&fs->seq, seq));
		} else {
			get_fs_pwd(current->fs, &nd->path);
			nd->inode = nd->path.dentry->d_inode;
		}
	} else {
		/* Caller must check execute permissions on the starting path component */
		CLASS(fd_raw, f)(nd->dfd);
		struct dentry *dentry;

		if (fd_empty(f))
			return ERR_PTR(-EBADF);

		if (flags & LOOKUP_LINKAT_EMPTY) {
			if (fd_file(f)->f_cred != current_cred() &&
			    !ns_capable(fd_file(f)->f_cred->user_ns, CAP_DAC_READ_SEARCH))
				return ERR_PTR(-ENOENT);
		}

		dentry = fd_file(f)->f_path.dentry;

		/* a non-empty path needs a searchable directory to start from */
		if (*s && unlikely(!d_can_lookup(dentry)))
			return ERR_PTR(-ENOTDIR);

		nd->path = fd_file(f)->f_path;
		if (flags & LOOKUP_RCU) {
			nd->inode = nd->path.dentry->d_inode;
			nd->seq = read_seqcount_begin(&nd->path.dentry->d_seq);
		} else {
			path_get(&nd->path);
			nd->inode = nd->path.dentry->d_inode;
		}
	}

	/* For scoped-lookups we need to set the root to the dirfd as well. */
	if (flags & LOOKUP_IS_SCOPED) {
		nd->root = nd->path;
		if (flags & LOOKUP_RCU) {
			nd->root_seq = nd->seq;
		} else {
			path_get(&nd->root);
			nd->state |= ND_ROOT_GRABBED;
		}
	}
	return s;
}
2620  
lookup_last(struct nameidata * nd)2621  static inline const char *lookup_last(struct nameidata *nd)
2622  {
2623  	if (nd->last_type == LAST_NORM && nd->last.name[nd->last.len])
2624  		nd->flags |= LOOKUP_FOLLOW | LOOKUP_DIRECTORY;
2625  
2626  	return walk_component(nd, WALK_TRAILING);
2627  }
2628  
handle_lookup_down(struct nameidata * nd)2629  static int handle_lookup_down(struct nameidata *nd)
2630  {
2631  	if (!(nd->flags & LOOKUP_RCU))
2632  		dget(nd->path.dentry);
2633  	nd->next_seq = nd->seq;
2634  	return PTR_ERR(step_into(nd, WALK_NOFOLLOW, nd->path.dentry));
2635  }
2636  
/* Returns 0 and nd will be valid on success; Returns error, otherwise. */
static int path_lookupat(struct nameidata *nd, unsigned flags, struct path *path)
{
	const char *s = path_init(nd, flags);
	int err;

	/* LOOKUP_DOWN: descend through whatever is mounted on the start */
	if (unlikely(flags & LOOKUP_DOWN) && !IS_ERR(s)) {
		err = handle_lookup_down(nd);
		if (unlikely(err < 0))
			s = ERR_PTR(err);
	}

	/* walk components; loop once more for each trailing symlink */
	while (!(err = link_path_walk(s, nd)) &&
	       (s = lookup_last(nd)) != NULL)
		;
	if (!err && unlikely(nd->flags & LOOKUP_MOUNTPOINT)) {
		err = handle_lookup_down(nd);
		nd->state &= ~ND_JUMPED; // no d_weak_revalidate(), please...
	}
	if (!err)
		err = complete_walk(nd);

	if (!err && nd->flags & LOOKUP_DIRECTORY)
		if (!d_can_lookup(nd->path.dentry))
			err = -ENOTDIR;
	if (!err) {
		/* transfer ownership of nd->path to the caller */
		*path = nd->path;
		nd->path.mnt = NULL;
		nd->path.dentry = NULL;
	}
	terminate_walk(nd);
	return err;
}
2670  
/*
 * Resolve a filename to a path.  Tries RCU-walk first, falls back to
 * ref-walk on -ECHILD and to forced revalidation on -ESTALE.
 * Note: does not consume "name" - callers putname() it themselves.
 */
int filename_lookup(int dfd, struct filename *name, unsigned flags,
		    struct path *path, struct path *root)
{
	int retval;
	struct nameidata nd;
	if (IS_ERR(name))
		return PTR_ERR(name);
	set_nameidata(&nd, dfd, name, root);
	retval = path_lookupat(&nd, flags | LOOKUP_RCU, path);
	if (unlikely(retval == -ECHILD))
		retval = path_lookupat(&nd, flags, path);
	if (unlikely(retval == -ESTALE))
		retval = path_lookupat(&nd, flags | LOOKUP_REVAL, path);

	if (likely(!retval))
		audit_inode(name, path->dentry,
			    flags & LOOKUP_MOUNTPOINT ? AUDIT_INODE_NOEVAL : 0);
	restore_nameidata();
	return retval;
}
2691  
2692  /* Returns 0 and nd will be valid on success; Returns error, otherwise. */
path_parentat(struct nameidata * nd,unsigned flags,struct path * parent)2693  static int path_parentat(struct nameidata *nd, unsigned flags,
2694  				struct path *parent)
2695  {
2696  	const char *s = path_init(nd, flags);
2697  	int err = link_path_walk(s, nd);
2698  	if (!err)
2699  		err = complete_walk(nd);
2700  	if (!err) {
2701  		*parent = nd->path;
2702  		nd->path.mnt = NULL;
2703  		nd->path.dentry = NULL;
2704  	}
2705  	terminate_walk(nd);
2706  	return err;
2707  }
2708  
/* Note: this does not consume "name" */
static int __filename_parentat(int dfd, struct filename *name,
			       unsigned int flags, struct path *parent,
			       struct qstr *last, int *type,
			       const struct path *root)
{
	int retval;
	struct nameidata nd;

	if (IS_ERR(name))
		return PTR_ERR(name);
	set_nameidata(&nd, dfd, name, root);
	/* RCU first; fall back to ref-walk, then to forced revalidation */
	retval = path_parentat(&nd, flags | LOOKUP_RCU, parent);
	if (unlikely(retval == -ECHILD))
		retval = path_parentat(&nd, flags, parent);
	if (unlikely(retval == -ESTALE))
		retval = path_parentat(&nd, flags | LOOKUP_REVAL, parent);
	if (likely(!retval)) {
		/* report the final component and its type to the caller */
		*last = nd.last;
		*type = nd.last_type;
		audit_inode(name, parent->dentry, AUDIT_INODE_PARENT);
	}
	restore_nameidata();
	return retval;
}
2734  
/* As __filename_parentat(), with no explicit lookup root */
static int filename_parentat(int dfd, struct filename *name,
			     unsigned int flags, struct path *parent,
			     struct qstr *last, int *type)
{
	return __filename_parentat(dfd, name, flags, parent, last, type, NULL);
}
2741  
/* does lookup, returns the object with parent locked */
static struct dentry *__kern_path_locked(int dfd, struct filename *name, struct path *path)
{
	struct dentry *d;
	struct qstr last;
	int type, error;

	error = filename_parentat(dfd, name, 0, path, &last, &type);
	if (error)
		return ERR_PTR(error);
	/* only a normal last component can be looked up under the parent */
	if (unlikely(type != LAST_NORM)) {
		path_put(path);
		return ERR_PTR(-EINVAL);
	}
	/* lock the parent and look the child up under that lock */
	inode_lock_nested(path->dentry->d_inode, I_MUTEX_PARENT);
	d = lookup_one_qstr_excl(&last, path->dentry, 0);
	if (IS_ERR(d)) {
		/* on failure, drop both the lock and the parent reference */
		inode_unlock(path->dentry->d_inode);
		path_put(path);
	}
	return d;
}
2764  
kern_path_locked(const char * name,struct path * path)2765  struct dentry *kern_path_locked(const char *name, struct path *path)
2766  {
2767  	struct filename *filename = getname_kernel(name);
2768  	struct dentry *res = __kern_path_locked(AT_FDCWD, filename, path);
2769  
2770  	putname(filename);
2771  	return res;
2772  }
2773  
/*
 * user_path_locked_at - resolve a user-space pathname relative to @dfd,
 * returning the final dentry with its parent (stored in *path) locked.
 */
struct dentry *user_path_locked_at(int dfd, const char __user *name, struct path *path)
{
	struct filename *filename;
	struct dentry *res;

	filename = getname(name);
	res = __kern_path_locked(dfd, filename, path);
	putname(filename);
	return res;
}
EXPORT_SYMBOL(user_path_locked_at);
2783  
kern_path(const char * name,unsigned int flags,struct path * path)2784  int kern_path(const char *name, unsigned int flags, struct path *path)
2785  {
2786  	struct filename *filename = getname_kernel(name);
2787  	int ret = filename_lookup(AT_FDCWD, filename, flags, path, NULL);
2788  
2789  	putname(filename);
2790  	return ret;
2791  
2792  }
2793  EXPORT_SYMBOL(kern_path);
2794  
/**
 * vfs_path_parent_lookup - lookup a parent path relative to a dentry-vfsmount pair
 * @filename: filename structure
 * @flags: lookup flags
 * @parent: pointer to struct path to fill
 * @last: last component
 * @type: type of the last component
 * @root: pointer to struct path of the base directory
 *
 * Returns: 0 on success, a negative errno otherwise.
 */
int vfs_path_parent_lookup(struct filename *filename, unsigned int flags,
			   struct path *parent, struct qstr *last, int *type,
			   const struct path *root)
{
	/* the dfd argument is ignored when an explicit root is supplied */
	return  __filename_parentat(AT_FDCWD, filename, flags, parent, last,
				    type, root);
}
EXPORT_SYMBOL(vfs_path_parent_lookup);
2812  
/**
 * vfs_path_lookup - lookup a file path relative to a dentry-vfsmount pair
 * @dentry:  pointer to dentry of the base directory
 * @mnt: pointer to vfs mount of the base directory
 * @name: pointer to file name
 * @flags: lookup flags
 * @path: pointer to struct path to fill
 *
 * Returns: 0 on success, a negative errno otherwise.
 */
int vfs_path_lookup(struct dentry *dentry, struct vfsmount *mnt,
		    const char *name, unsigned int flags,
		    struct path *path)
{
	struct filename *filename;
	struct path root = {.mnt = mnt, .dentry = dentry};
	int ret;

	filename = getname_kernel(name);
	/* the first argument of filename_lookup() is ignored with root */
	ret = filename_lookup(AT_FDCWD, filename, flags, path, &root);
	putname(filename);
	return ret;
}
EXPORT_SYMBOL(vfs_path_lookup);
2836  
lookup_one_common(struct mnt_idmap * idmap,const char * name,struct dentry * base,int len,struct qstr * this)2837  static int lookup_one_common(struct mnt_idmap *idmap,
2838  			     const char *name, struct dentry *base, int len,
2839  			     struct qstr *this)
2840  {
2841  	this->name = name;
2842  	this->len = len;
2843  	this->hash = full_name_hash(base, name, len);
2844  	if (!len)
2845  		return -EACCES;
2846  
2847  	if (is_dot_dotdot(name, len))
2848  		return -EACCES;
2849  
2850  	while (len--) {
2851  		unsigned int c = *(const unsigned char *)name++;
2852  		if (c == '/' || c == '\0')
2853  			return -EACCES;
2854  	}
2855  	/*
2856  	 * See if the low-level filesystem might want
2857  	 * to use its own hash..
2858  	 */
2859  	if (base->d_flags & DCACHE_OP_HASH) {
2860  		int err = base->d_op->d_hash(base, this);
2861  		if (err < 0)
2862  			return err;
2863  	}
2864  
2865  	return inode_permission(idmap, base->d_inode, MAY_EXEC);
2866  }
2867  
2868  /**
2869   * try_lookup_one_len - filesystem helper to lookup single pathname component
2870   * @name:	pathname component to lookup
2871   * @base:	base directory to lookup from
2872   * @len:	maximum length @len should be interpreted to
2873   *
2874   * Look up a dentry by name in the dcache, returning NULL if it does not
2875   * currently exist.  The function does not try to create a dentry.
2876   *
2877   * Note that this routine is purely a helper for filesystem usage and should
2878   * not be called by generic code.
2879   *
2880   * No locks need be held - only a counted reference to @base is needed.
2881   *
2882   */
try_lookup_one_len(const char * name,struct dentry * base,int len)2883  struct dentry *try_lookup_one_len(const char *name, struct dentry *base, int len)
2884  {
2885  	struct qstr this;
2886  	int err;
2887  
2888  	err = lookup_one_common(&nop_mnt_idmap, name, base, len, &this);
2889  	if (err)
2890  		return ERR_PTR(err);
2891  
2892  	return lookup_dcache(&this, base, 0);
2893  }
2894  EXPORT_SYMBOL(try_lookup_one_len);
2895  
/**
 * lookup_one_len - filesystem helper to lookup single pathname component
 * @name:	pathname component to lookup
 * @base:	base directory to lookup from
 * @len:	maximum length @len should be interpreted to
 *
 * Note that this routine is purely a helper for filesystem usage and should
 * not be called by generic code.
 *
 * The caller must hold base->i_mutex.
 */
struct dentry *lookup_one_len(const char *name, struct dentry *base, int len)
{
	struct dentry *dentry;
	struct qstr this;
	int err;

	/* the caller is responsible for holding the parent's lock */
	WARN_ON_ONCE(!inode_is_locked(base->d_inode));

	err = lookup_one_common(&nop_mnt_idmap, name, base, len, &this);
	if (err)
		return ERR_PTR(err);

	/* try the dcache first, fall back to a full ->lookup() */
	dentry = lookup_dcache(&this, base, 0);
	return dentry ? dentry : __lookup_slow(&this, base, 0);
}
EXPORT_SYMBOL(lookup_one_len);
2923  
/**
 * lookup_one - filesystem helper to lookup single pathname component
 * @idmap:	idmap of the mount the lookup is performed from
 * @name:	pathname component to lookup
 * @base:	base directory to lookup from
 * @len:	maximum length @len should be interpreted to
 *
 * Note that this routine is purely a helper for filesystem usage and should
 * not be called by generic code.
 *
 * The caller must hold base->i_mutex.
 */
struct dentry *lookup_one(struct mnt_idmap *idmap, const char *name,
			  struct dentry *base, int len)
{
	struct dentry *dentry;
	struct qstr this;
	int err;

	/* the caller is responsible for holding the parent's lock */
	WARN_ON_ONCE(!inode_is_locked(base->d_inode));

	err = lookup_one_common(idmap, name, base, len, &this);
	if (err)
		return ERR_PTR(err);

	/* try the dcache first, fall back to a full ->lookup() */
	dentry = lookup_dcache(&this, base, 0);
	return dentry ? dentry : __lookup_slow(&this, base, 0);
}
EXPORT_SYMBOL(lookup_one);
2953  
/**
 * lookup_one_unlocked - filesystem helper to lookup single pathname component
 * @idmap:	idmap of the mount the lookup is performed from
 * @name:	pathname component to lookup
 * @base:	base directory to lookup from
 * @len:	maximum length @len should be interpreted to
 *
 * Note that this routine is purely a helper for filesystem usage and should
 * not be called by generic code.
 *
 * Unlike lookup_one_len, it should be called without the parent
 * i_mutex held, and will take the i_mutex itself if necessary.
 */
struct dentry *lookup_one_unlocked(struct mnt_idmap *idmap,
				   const char *name, struct dentry *base,
				   int len)
{
	struct qstr this;
	int err;
	struct dentry *ret;

	err = lookup_one_common(idmap, name, base, len, &this);
	if (err)
		return ERR_PTR(err);

	/* try the dcache first; lookup_slow() takes the parent lock itself */
	ret = lookup_dcache(&this, base, 0);
	if (!ret)
		ret = lookup_slow(&this, base, 0);
	return ret;
}
EXPORT_SYMBOL(lookup_one_unlocked);
2985  
/**
 * lookup_one_positive_unlocked - filesystem helper to lookup single
 *				  pathname component
 * @idmap:	idmap of the mount the lookup is performed from
 * @name:	pathname component to lookup
 * @base:	base directory to lookup from
 * @len:	maximum length @len should be interpreted to
 *
 * This helper will yield ERR_PTR(-ENOENT) on negatives. The helper returns
 * known positive or ERR_PTR(). This is what most of the users want.
 *
 * Note that pinned negative with unlocked parent _can_ become positive at any
 * time, so callers of lookup_one_unlocked() need to be very careful; pinned
 * positives have ->d_inode stable, so this one avoids such problems.
 *
 * Note that this routine is purely a helper for filesystem usage and should
 * not be called by generic code.
 *
 * The helper should be called without i_mutex held.
 */
struct dentry *lookup_one_positive_unlocked(struct mnt_idmap *idmap,
					    const char *name,
					    struct dentry *base, int len)
{
	struct dentry *ret = lookup_one_unlocked(idmap, name, base, len);

	/* acquire orders the d_flags read against a later ->d_inode read */
	if (!IS_ERR(ret) && d_flags_negative(smp_load_acquire(&ret->d_flags))) {
		dput(ret);
		ret = ERR_PTR(-ENOENT);
	}
	return ret;
}
EXPORT_SYMBOL(lookup_one_positive_unlocked);
3019  
/**
 * lookup_one_len_unlocked - filesystem helper to lookup single pathname component
 * @name:	pathname component to lookup
 * @base:	base directory to lookup from
 * @len:	maximum length @len should be interpreted to
 *
 * Note that this routine is purely a helper for filesystem usage and should
 * not be called by generic code.
 *
 * Unlike lookup_one_len, it should be called without the parent
 * i_mutex held, and will take the i_mutex itself if necessary.
 */
struct dentry *lookup_one_len_unlocked(const char *name,
				       struct dentry *base, int len)
{
	/* non-idmapped variant: delegate with the no-op idmap */
	return lookup_one_unlocked(&nop_mnt_idmap, name, base, len);
}
EXPORT_SYMBOL(lookup_one_len_unlocked);
3038  
/*
 * Like lookup_one_len_unlocked(), except that it yields ERR_PTR(-ENOENT)
 * on negatives.  Returns known positive or ERR_PTR(); that's what
 * most of the users want.  Note that pinned negative with unlocked parent
 * _can_ become positive at any time, so callers of lookup_one_len_unlocked()
 * need to be very careful; pinned positives have ->d_inode stable, so
 * this one avoids such problems.
 */
struct dentry *lookup_positive_unlocked(const char *name,
				       struct dentry *base, int len)
{
	/* non-idmapped variant: delegate with the no-op idmap */
	return lookup_one_positive_unlocked(&nop_mnt_idmap, name, base, len);
}
EXPORT_SYMBOL(lookup_positive_unlocked);
3053  
3054  #ifdef CONFIG_UNIX98_PTYS
int path_pts(struct path *path)
{
	/* Find something mounted on "pts" in the same directory as
	 * the input path.
	 */
	struct dentry *parent = dget_parent(path->dentry);
	struct dentry *child;
	struct qstr this = QSTR_INIT("pts", 3);

	/* make sure the parent is still reachable from the mount root */
	if (unlikely(!path_connected(path->mnt, parent))) {
		dput(parent);
		return -ENOENT;
	}
	/* transfer the parent reference into *path */
	dput(path->dentry);
	path->dentry = parent;
	child = d_hash_and_lookup(parent, &this);
	if (IS_ERR_OR_NULL(child))
		return -ENOENT;

	/* swap in the "pts" child; drop the extra parent reference */
	path->dentry = child;
	dput(parent);
	/* descend into whatever is mounted on "pts" */
	follow_down(path, 0);
	return 0;
}
3079  #endif
3080  
/*
 * user_path_at - resolve a user-space pathname relative to @dfd into *path.
 */
int user_path_at(int dfd, const char __user *name, unsigned flags,
		 struct path *path)
{
	struct filename *filename;
	int ret;

	filename = getname_flags(name, flags);
	ret = filename_lookup(dfd, filename, flags, path, NULL);
	putname(filename);
	return ret;
}
EXPORT_SYMBOL(user_path_at);
3091  
__check_sticky(struct mnt_idmap * idmap,struct inode * dir,struct inode * inode)3092  int __check_sticky(struct mnt_idmap *idmap, struct inode *dir,
3093  		   struct inode *inode)
3094  {
3095  	kuid_t fsuid = current_fsuid();
3096  
3097  	if (vfsuid_eq_kuid(i_uid_into_vfsuid(idmap, inode), fsuid))
3098  		return 0;
3099  	if (vfsuid_eq_kuid(i_uid_into_vfsuid(idmap, dir), fsuid))
3100  		return 0;
3101  	return !capable_wrt_inode_uidgid(idmap, inode, CAP_FOWNER);
3102  }
3103  EXPORT_SYMBOL(__check_sticky);
3104  
/*
 *	Check whether we can remove a link victim from directory dir, check
 *  whether the type of victim is right.
 *  1. We can't do it if dir is read-only (done in permission())
 *  2. We should have write and exec permissions on dir
 *  3. We can't remove anything from append-only dir
 *  4. We can't do anything with immutable dir (done in permission())
 *  5. If the sticky bit on dir is set we should either
 *	a. be owner of dir, or
 *	b. be owner of victim, or
 *	c. have CAP_FOWNER capability
 *  6. If the victim is append-only or immutable we can't do anything with
 *     links pointing to it.
 *  7. If the victim has an unknown uid or gid we can't change the inode.
 *  8. If we were asked to remove a directory and victim isn't one - ENOTDIR.
 *  9. If we were asked to remove a non-directory and victim isn't one - EISDIR.
 * 10. We can't remove a root or mountpoint.
 * 11. We don't allow removal of NFS sillyrenamed files; it's handled by
 *     nfs_async_unlink().
 */
static int may_delete(struct mnt_idmap *idmap, struct inode *dir,
		      struct dentry *victim, bool isdir)
{
	struct inode *inode = d_backing_inode(victim);
	int error;

	if (d_is_negative(victim))
		return -ENOENT;
	BUG_ON(!inode);

	BUG_ON(victim->d_parent->d_inode != dir);

	/* Inode writeback is not safe when the uid or gid are invalid. */
	if (!vfsuid_valid(i_uid_into_vfsuid(idmap, inode)) ||
	    !vfsgid_valid(i_gid_into_vfsgid(idmap, inode)))
		return -EOVERFLOW;

	audit_inode_child(dir, victim, AUDIT_TYPE_CHILD_DELETE);

	error = inode_permission(idmap, dir, MAY_WRITE | MAY_EXEC);
	if (error)
		return error;
	if (IS_APPEND(dir))
		return -EPERM;

	/* rules 5-7 above: sticky bit, victim flags, unmapped ids */
	if (check_sticky(idmap, dir, inode) || IS_APPEND(inode) ||
	    IS_IMMUTABLE(inode) || IS_SWAPFILE(inode) ||
	    HAS_UNMAPPED_ID(idmap, inode))
		return -EPERM;
	if (isdir) {
		if (!d_is_dir(victim))
			return -ENOTDIR;
		if (IS_ROOT(victim))
			return -EBUSY;
	} else if (d_is_dir(victim))
		return -EISDIR;
	if (IS_DEADDIR(dir))
		return -ENOENT;
	if (victim->d_flags & DCACHE_NFSFS_RENAMED)
		return -EBUSY;
	return 0;
}
3167  
/*	Check whether we can create an object with dentry child in directory
 *  dir.
 *  1. We can't do it if child already exists (open has special treatment for
 *     this case, but since we are inlined it's OK)
 *  2. We can't do it if dir is read-only (done in permission())
 *  3. We can't do it if the fs can't represent the fsuid or fsgid.
 *  4. We should have write and exec permissions on dir
 *  5. We can't do it if dir is immutable (done in permission())
 */
static inline int may_create(struct mnt_idmap *idmap,
			     struct inode *dir, struct dentry *child)
{
	audit_inode_child(dir, child, AUDIT_TYPE_CHILD_CREATE);
	if (child->d_inode)
		return -EEXIST;
	if (IS_DEADDIR(dir))
		return -ENOENT;
	/* the fs must be able to represent the caller's fsuid/fsgid */
	if (!fsuidgid_has_mapping(dir->i_sb, idmap))
		return -EOVERFLOW;

	return inode_permission(idmap, dir, MAY_WRITE | MAY_EXEC);
}
3190  
// p1 != p2, both are on the same filesystem, ->s_vfs_rename_mutex is held
static struct dentry *lock_two_directories(struct dentry *p1, struct dentry *p2)
{
	struct dentry *p = p1, *q = p2, *r;

	/* walk up from p1 until we hit p2 or the root of its component */
	while ((r = p->d_parent) != p2 && r != p)
		p = r;
	if (r == p2) {
		// p is a child of p2 and an ancestor of p1 or p1 itself
		inode_lock_nested(p2->d_inode, I_MUTEX_PARENT);
		inode_lock_nested(p1->d_inode, I_MUTEX_PARENT2);
		return p;
	}
	// p is the root of connected component that contains p1
	// p2 does not occur on the path from p to p1
	while ((r = q->d_parent) != p1 && r != p && r != q)
		q = r;
	if (r == p1) {
		// q is a child of p1 and an ancestor of p2 or p2 itself
		inode_lock_nested(p1->d_inode, I_MUTEX_PARENT);
		inode_lock_nested(p2->d_inode, I_MUTEX_PARENT2);
		return q;
	} else if (likely(r == p)) {
		// both p2 and p1 are descendents of p
		inode_lock_nested(p1->d_inode, I_MUTEX_PARENT);
		inode_lock_nested(p2->d_inode, I_MUTEX_PARENT2);
		return NULL;
	} else { // no common ancestor at the time we'd been called
		mutex_unlock(&p1->d_sb->s_vfs_rename_mutex);
		return ERR_PTR(-EXDEV);
	}
}
3223  
/*
 * p1 and p2 should be directories on the same fs.
 */
struct dentry *lock_rename(struct dentry *p1, struct dentry *p2)
{
	if (p1 == p2) {
		/* same directory - a single lock suffices */
		inode_lock_nested(p1->d_inode, I_MUTEX_PARENT);
		return NULL;
	}

	/* serialize against other cross-directory renames on this fs */
	mutex_lock(&p1->d_sb->s_vfs_rename_mutex);
	return lock_two_directories(p1, p2);
}
EXPORT_SYMBOL(lock_rename);
3238  
/*
 * c1 and p2 should be on the same fs.
 */
struct dentry *lock_rename_child(struct dentry *c1, struct dentry *p2)
{
	/* racy read of ->d_parent; re-checked under the lock below */
	if (READ_ONCE(c1->d_parent) == p2) {
		/*
		 * hopefully won't need to touch ->s_vfs_rename_mutex at all.
		 */
		inode_lock_nested(p2->d_inode, I_MUTEX_PARENT);
		/*
		 * now that p2 is locked, nobody can move in or out of it,
		 * so the test below is safe.
		 */
		if (likely(c1->d_parent == p2))
			return NULL;

		/*
		 * c1 got moved out of p2 while we'd been taking locks;
		 * unlock and fall back to slow case.
		 */
		inode_unlock(p2->d_inode);
	}

	mutex_lock(&c1->d_sb->s_vfs_rename_mutex);
	/*
	 * nobody can move out of any directories on this fs.
	 */
	if (likely(c1->d_parent != p2))
		return lock_two_directories(c1->d_parent, p2);

	/*
	 * c1 got moved into p2 while we were taking locks;
	 * we need p2 locked and ->s_vfs_rename_mutex unlocked,
	 * for consistency with lock_rename().
	 */
	inode_lock_nested(p2->d_inode, I_MUTEX_PARENT);
	mutex_unlock(&c1->d_sb->s_vfs_rename_mutex);
	return NULL;
}
EXPORT_SYMBOL(lock_rename_child);
3280  
/* Undo lock_rename()/lock_rename_child(): drop inode locks and, for the
 * two-directory case, the per-sb rename mutex. */
void unlock_rename(struct dentry *p1, struct dentry *p2)
{
	inode_unlock(p1->d_inode);
	if (p1 != p2) {
		inode_unlock(p2->d_inode);
		/* matches the mutex_lock() taken in lock_rename() */
		mutex_unlock(&p1->d_sb->s_vfs_rename_mutex);
	}
}
EXPORT_SYMBOL(unlock_rename);
3290  
/**
 * vfs_prepare_mode - prepare the mode to be used for a new inode
 * @idmap:	idmap of the mount the inode was found from
 * @dir:	parent directory of the new inode
 * @mode:	mode of the new inode
 * @mask_perms:	allowed permission by the vfs
 * @type:	type of file to be created
 *
 * This helper consolidates and enforces vfs restrictions on the @mode of a new
 * object to be created.
 *
 * Umask stripping depends on whether the filesystem supports POSIX ACLs (see
 * the kernel documentation for mode_strip_umask()). Moving umask stripping
 * after setgid stripping allows the same ordering for both non-POSIX ACL and
 * POSIX ACL supporting filesystems.
 *
 * Note that it's currently valid for @type to be 0 if a directory is created.
 * Filesystems raise that flag individually and we need to check whether each
 * filesystem can deal with receiving S_IFDIR from the vfs before we enforce a
 * non-zero type.
 *
 * Returns: mode to be passed to the filesystem
 */
static inline umode_t vfs_prepare_mode(struct mnt_idmap *idmap,
				       const struct inode *dir, umode_t mode,
				       umode_t mask_perms, umode_t type)
{
	/* order matters: strip setgid before umask (see comment above) */
	mode = mode_strip_sgid(idmap, dir, mode);
	mode = mode_strip_umask(dir, mode);

	/*
	 * Apply the vfs mandated allowed permission mask and set the type of
	 * file to be created before we call into the filesystem.
	 */
	mode &= (mask_perms & ~S_IFMT);
	mode |= (type & S_IFMT);

	return mode;
}
3330  
/**
 * vfs_create - create new file
 * @idmap:	idmap of the mount the inode was found from
 * @dir:	inode of the parent directory
 * @dentry:	dentry of the child file
 * @mode:	mode of the child file
 * @want_excl:	whether the file must not yet exist
 *
 * Create a new file.
 *
 * If the inode has been found through an idmapped mount the idmap of
 * the vfsmount must be passed through @idmap. This function will then take
 * care to map the inode according to @idmap before checking permissions.
 * On non-idmapped mounts or if permission checking is to be performed on the
 * raw inode simply pass @nop_mnt_idmap.
 *
 * Returns: 0 on success, a negative errno otherwise.
 */
int vfs_create(struct mnt_idmap *idmap, struct inode *dir,
	       struct dentry *dentry, umode_t mode, bool want_excl)
{
	int error;

	error = may_create(idmap, dir, dentry);
	if (error)
		return error;

	if (!dir->i_op->create)
		return -EACCES;	/* shouldn't it be ENOSYS? */

	mode = vfs_prepare_mode(idmap, dir, mode, S_IALLUGO, S_IFREG);
	error = security_inode_create(dir, dentry, mode);
	if (error)
		return error;
	error = dir->i_op->create(idmap, dir, dentry, mode, want_excl);
	if (!error)
		fsnotify_create(dir, dentry);
	return error;
}
EXPORT_SYMBOL(vfs_create);
3369  
/*
 * vfs_mkobj - create an object at @dentry, with the callback @f doing the
 * actual creation, wrapped in the usual may_create()/security checks and
 * followed by an fsnotify create event on success.
 */
int vfs_mkobj(struct dentry *dentry, umode_t mode,
		int (*f)(struct dentry *, umode_t, void *),
		void *arg)
{
	struct inode *dir = dentry->d_parent->d_inode;
	int error = may_create(&nop_mnt_idmap, dir, dentry);
	if (error)
		return error;

	/* force a regular-file mode, limited to permission bits */
	mode &= S_IALLUGO;
	mode |= S_IFREG;
	error = security_inode_create(dir, dentry, mode);
	if (error)
		return error;
	error = f(dentry, mode, arg);
	if (!error)
		fsnotify_create(dir, dentry);
	return error;
}
EXPORT_SYMBOL(vfs_mkobj);
3390  
may_open_dev(const struct path * path)3391  bool may_open_dev(const struct path *path)
3392  {
3393  	return !(path->mnt->mnt_flags & MNT_NODEV) &&
3394  		!(path->mnt->mnt_sb->s_iflags & SB_I_NODEV);
3395  }
3396  
/* Check whether @path may be opened with @acc_mode/@flag; 0 or -errno. */
static int may_open(struct mnt_idmap *idmap, const struct path *path,
		    int acc_mode, int flag)
{
	struct dentry *dentry = path->dentry;
	struct inode *inode = dentry->d_inode;
	int error;

	if (!inode)
		return -ENOENT;

	switch (inode->i_mode & S_IFMT) {
	case S_IFLNK:
		return -ELOOP;
	case S_IFDIR:
		if (acc_mode & MAY_WRITE)
			return -EISDIR;
		if (acc_mode & MAY_EXEC)
			return -EACCES;
		break;
	case S_IFBLK:
	case S_IFCHR:
		if (!may_open_dev(path))
			return -EACCES;
		fallthrough;
	case S_IFIFO:
	case S_IFSOCK:
		if (acc_mode & MAY_EXEC)
			return -EACCES;
		/* O_TRUNC is meaningless for devices, FIFOs and sockets */
		flag &= ~O_TRUNC;
		break;
	case S_IFREG:
		if ((acc_mode & MAY_EXEC) && path_noexec(path))
			return -EACCES;
		break;
	default:
		VFS_BUG_ON_INODE(1, inode);
	}

	error = inode_permission(idmap, inode, MAY_OPEN | acc_mode);
	if (error)
		return error;

	/*
	 * An append-only file must be opened in append mode for writing.
	 */
	if (IS_APPEND(inode)) {
		if  ((flag & O_ACCMODE) != O_RDONLY && !(flag & O_APPEND))
			return -EPERM;
		if (flag & O_TRUNC)
			return -EPERM;
	}

	/* O_NOATIME can only be set by the owner or superuser */
	if (flag & O_NOATIME && !inode_owner_or_capable(idmap, inode))
		return -EPERM;

	return 0;
}
3455  
/* Truncate a just-opened file to zero length (O_TRUNC handling). */
static int handle_truncate(struct mnt_idmap *idmap, struct file *filp)
{
	const struct path *path = &filp->f_path;
	struct inode *inode = path->dentry->d_inode;
	int error = get_write_access(inode);
	if (error)
		return error;

	error = security_file_truncate(filp);
	if (!error) {
		error = do_truncate(idmap, path->dentry, 0,
				    ATTR_MTIME|ATTR_CTIME|ATTR_OPEN,
				    filp);
	}
	/* balances the get_write_access() above */
	put_write_access(inode);
	return error;
}
3473  
/*
 * Map the open(2) access mode (where 3 is the "special" value) to the
 * internal namei representation: 00 - none, 01 - read, 02 - write.
 */
static inline int open_to_namei_flags(int flag)
{
	if ((flag & O_ACCMODE) == 3)
		return flag - 1;
	return flag;
}
3480  
/*
 * Checks preceding an O_CREAT file creation: path_mknod security hook,
 * fs[ug]id representability, write+exec on the parent directory, and the
 * inode_create security hook.  Returns 0 or a negative errno.
 */
static int may_o_create(struct mnt_idmap *idmap,
			const struct path *dir, struct dentry *dentry,
			umode_t mode)
{
	int error = security_path_mknod(dir, dentry, mode, 0);
	if (error)
		return error;

	if (!fsuidgid_has_mapping(dir->dentry->d_sb, idmap))
		return -EOVERFLOW;

	error = inode_permission(idmap, dir->dentry->d_inode,
				 MAY_WRITE | MAY_EXEC);
	if (error)
		return error;

	return security_inode_create(dir->dentry->d_inode, dentry, mode);
}
3499  
/*
 * Attempt to atomically look up, create and open a file from a negative
 * dentry.
 *
 * Returns 0 if successful.  The file will have been created and attached to
 * @file by the filesystem calling finish_open().
 *
 * If the file was looked up only or didn't need creating, FMODE_OPENED won't
 * be set.  The caller will need to perform the open themselves.  @path will
 * have been updated to point to the new dentry.  This may be negative.
 *
 * Returns an error code otherwise.
 */
static struct dentry *atomic_open(struct nameidata *nd, struct dentry *dentry,
				  struct file *file,
				  int open_flag, umode_t mode)
{
	/* sentinel for detecting filesystems that never set f_path.dentry */
	struct dentry *const DENTRY_NOT_SET = (void *) -1UL;
	struct inode *dir =  nd->path.dentry->d_inode;
	int error;

	if (nd->flags & LOOKUP_DIRECTORY)
		open_flag |= O_DIRECTORY;

	file->f_path.dentry = DENTRY_NOT_SET;
	file->f_path.mnt = nd->path.mnt;
	error = dir->i_op->atomic_open(dir, dentry, file,
				       open_to_namei_flags(open_flag), mode);
	d_lookup_done(dentry);
	if (!error) {
		if (file->f_mode & FMODE_OPENED) {
			/* fs opened a different dentry - switch to it */
			if (unlikely(dentry != file->f_path.dentry)) {
				dput(dentry);
				dentry = dget(file->f_path.dentry);
			}
		} else if (WARN_ON(file->f_path.dentry == DENTRY_NOT_SET)) {
			error = -EIO;
		} else {
			/* lookup-only result: adopt the fs-provided dentry */
			if (file->f_path.dentry) {
				dput(dentry);
				dentry = file->f_path.dentry;
			}
			if (unlikely(d_is_negative(dentry)))
				error = -ENOENT;
		}
	}
	if (error) {
		dput(dentry);
		dentry = ERR_PTR(error);
	}
	return dentry;
}
3552  
/*
 * Look up and maybe create and open the last component.
 *
 * Must be called with parent locked (exclusive in O_CREAT case).
 *
 * Returns 0 on success, that is, if
 *  the file was successfully atomically created (if necessary) and opened, or
 *  the file was not completely opened at this time, though lookups and
 *  creations were performed.
 * These cases are distinguished by presence of FMODE_OPENED on file->f_mode.
 * In the latter case dentry returned in @path might be negative if O_CREAT
 * hadn't been specified.
 *
 * An error code is returned on failure.
 */
static struct dentry *lookup_open(struct nameidata *nd, struct file *file,
				  const struct open_flags *op,
				  bool got_write)
{
	struct mnt_idmap *idmap;
	struct dentry *dir = nd->path.dentry;
	struct inode *dir_inode = dir->d_inode;
	int open_flag = op->open_flag;
	struct dentry *dentry;
	int error, create_error = 0;
	umode_t mode = op->mode;
	DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq);

	if (unlikely(IS_DEADDIR(dir_inode)))
		return ERR_PTR(-ENOENT);

	file->f_mode &= ~FMODE_CREATED;
	/* find or allocate the dentry, revalidating cached ones */
	dentry = d_lookup(dir, &nd->last);
	for (;;) {
		if (!dentry) {
			dentry = d_alloc_parallel(dir, &nd->last, &wq);
			if (IS_ERR(dentry))
				return dentry;
		}
		if (d_in_lookup(dentry))
			break;

		error = d_revalidate(dir_inode, &nd->last, dentry, nd->flags);
		if (likely(error > 0))
			break;
		if (error)
			goto out_dput;
		d_invalidate(dentry);
		dput(dentry);
		dentry = NULL;
	}
	if (dentry->d_inode) {
		/* Cached positive dentry: will open in f_op->open */
		return dentry;
	}

	if (open_flag & O_CREAT)
		audit_inode(nd->name, dir, AUDIT_INODE_PARENT);

	/*
	 * Checking write permission is tricky, because we don't know if we are
	 * going to actually need it: O_CREAT opens should work as long as the
	 * file exists.  But checking existence breaks atomicity.  The trick is
	 * to check access and if not granted clear O_CREAT from the flags.
	 *
	 * Another problem is returning the "right" error value (e.g. for an
	 * O_EXCL open we want to return EEXIST not EROFS).
	 */
	if (unlikely(!got_write))
		open_flag &= ~O_TRUNC;
	idmap = mnt_idmap(nd->path.mnt);
	if (open_flag & O_CREAT) {
		if (open_flag & O_EXCL)
			open_flag &= ~O_TRUNC;
		mode = vfs_prepare_mode(idmap, dir->d_inode, mode, mode, mode);
		if (likely(got_write))
			create_error = may_o_create(idmap, &nd->path,
						    dentry, mode);
		else
			create_error = -EROFS;
	}
	if (create_error)
		open_flag &= ~O_CREAT;
	if (dir_inode->i_op->atomic_open) {
		dentry = atomic_open(nd, dentry, file, open_flag, mode);
		/* surface the saved create error instead of plain ENOENT */
		if (unlikely(create_error) && dentry == ERR_PTR(-ENOENT))
			dentry = ERR_PTR(create_error);
		return dentry;
	}

	if (d_in_lookup(dentry)) {
		struct dentry *res = dir_inode->i_op->lookup(dir_inode, dentry,
							     nd->flags);
		d_lookup_done(dentry);
		if (unlikely(res)) {
			if (IS_ERR(res)) {
				error = PTR_ERR(res);
				goto out_dput;
			}
			dput(dentry);
			dentry = res;
		}
	}

	/* Negative dentry, just create the file */
	if (!dentry->d_inode && (open_flag & O_CREAT)) {
		file->f_mode |= FMODE_CREATED;
		audit_inode_child(dir_inode, dentry, AUDIT_TYPE_CHILD_CREATE);
		if (!dir_inode->i_op->create) {
			error = -EACCES;
			goto out_dput;
		}

		error = dir_inode->i_op->create(idmap, dir_inode, dentry,
						mode, open_flag & O_EXCL);
		if (error)
			goto out_dput;
	}
	/* deliver the saved create error if the file still doesn't exist */
	if (unlikely(create_error) && !dentry->d_inode) {
		error = create_error;
		goto out_dput;
	}
	return dentry;

out_dput:
	dput(dentry);
	return ERR_PTR(error);
}
3681  
trailing_slashes(struct nameidata * nd)3682  static inline bool trailing_slashes(struct nameidata *nd)
3683  {
3684  	return (bool)nd->last.name[nd->last.len];
3685  }
3686  
/*
 * Try the fast-path (possibly RCU-mode) lookup of the last component of
 * an open().  Returns a usable dentry, NULL if the slow path must be
 * taken, or an ERR_PTR on error.
 */
lookup_fast_for_open(struct nameidata * nd,int open_flag)3687  static struct dentry *lookup_fast_for_open(struct nameidata *nd, int open_flag)
3688  {
3689  	struct dentry *dentry;
3690  
3691  	if (open_flag & O_CREAT) {
		/* Can't create something named "foo/" - must be a lookup. */
3692  		if (trailing_slashes(nd))
3693  			return ERR_PTR(-EISDIR);
3694  
3695  		/* Don't bother on an O_EXCL create */
3696  		if (open_flag & O_EXCL)
3697  			return NULL;
3698  	}
3699  
	/* A trailing '/' means the target must resolve to a directory. */
3700  	if (trailing_slashes(nd))
3701  		nd->flags |= LOOKUP_FOLLOW | LOOKUP_DIRECTORY;
3702  
3703  	dentry = lookup_fast(nd);
3704  	if (IS_ERR_OR_NULL(dentry))
3705  		return dentry;
3706  
3707  	if (open_flag & O_CREAT) {
3708  		/* Discard negative dentries. Need inode_lock to do the create */
3709  		if (!dentry->d_inode) {
3710  			if (!(nd->flags & LOOKUP_RCU))
3711  				dput(dentry);
3712  			dentry = NULL;
3713  		}
3714  	}
3715  	return dentry;
3716  }
3717  
/*
 * Handle the last component of an open(): try the fast path first, else
 * leave RCU mode, take the parent inode lock and go through lookup_open().
 * Returns NULL when the walk is finished, the body of a trailing symlink
 * to continue walking (from step_into()), or an ERR_PTR-encoded error.
 */
open_last_lookups(struct nameidata * nd,struct file * file,const struct open_flags * op)3718  static const char *open_last_lookups(struct nameidata *nd,
3719  		   struct file *file, const struct open_flags *op)
3720  {
3721  	struct dentry *dir = nd->path.dentry;
3722  	int open_flag = op->open_flag;
3723  	bool got_write = false;
3724  	struct dentry *dentry;
3725  	const char *res;
3726  
3727  	nd->flags |= op->intent;
3728  
	/* "." / ".." / "/" trailing components never need a real lookup. */
3729  	if (nd->last_type != LAST_NORM) {
3730  		if (nd->depth)
3731  			put_link(nd);
3732  		return handle_dots(nd, nd->last_type);
3733  	}
3734  
3735  	/* We _can_ be in RCU mode here */
3736  	dentry = lookup_fast_for_open(nd, open_flag);
3737  	if (IS_ERR(dentry))
3738  		return ERR_CAST(dentry);
3739  
3740  	if (likely(dentry))
3741  		goto finish_lookup;
3742  
	/*
	 * Slow path: for O_CREAT we must leave RCU mode to take locks;
	 * without O_CREAT, lookup_fast_for_open() returning NULL should
	 * already have unlazied us.
	 */
3743  	if (!(open_flag & O_CREAT)) {
3744  		if (WARN_ON_ONCE(nd->flags & LOOKUP_RCU))
3745  			return ERR_PTR(-ECHILD);
3746  	} else {
3747  		if (nd->flags & LOOKUP_RCU) {
3748  			if (!try_to_unlazy(nd))
3749  				return ERR_PTR(-ECHILD);
3750  		}
3751  	}
3752  
3753  	if (open_flag & (O_CREAT | O_TRUNC | O_WRONLY | O_RDWR)) {
3754  		got_write = !mnt_want_write(nd->path.mnt);
3755  		/*
3756  		 * do _not_ fail yet - we might not need that or fail with
3757  		 * a different error; let lookup_open() decide; we'll be
3758  		 * dropping this one anyway.
3759  		 */
3760  	}
	/* Creation needs the exclusive lock; plain lookup only shared. */
3761  	if (open_flag & O_CREAT)
3762  		inode_lock(dir->d_inode);
3763  	else
3764  		inode_lock_shared(dir->d_inode);
3765  	dentry = lookup_open(nd, file, op, got_write);
3766  	if (!IS_ERR(dentry)) {
		/* Send the notifications while still holding the lock. */
3767  		if (file->f_mode & FMODE_CREATED)
3768  			fsnotify_create(dir->d_inode, dentry);
3769  		if (file->f_mode & FMODE_OPENED)
3770  			fsnotify_open(file);
3771  	}
3772  	if (open_flag & O_CREAT)
3773  		inode_unlock(dir->d_inode);
3774  	else
3775  		inode_unlock_shared(dir->d_inode);
3776  
3777  	if (got_write)
3778  		mnt_drop_write(nd->path.mnt);
3779  
3780  	if (IS_ERR(dentry))
3781  		return ERR_CAST(dentry);
3782  
	/* Already opened/created by ->atomic_open(): walk is complete. */
3783  	if (file->f_mode & (FMODE_OPENED | FMODE_CREATED)) {
3784  		dput(nd->path.dentry);
3785  		nd->path.dentry = dentry;
3786  		return NULL;
3787  	}
3788  
3789  finish_lookup:
3790  	if (nd->depth)
3791  		put_link(nd);
3792  	res = step_into(nd, WALK_TRAILING, dentry);
	/* Non-NULL res means a trailing symlink: further lookups are not
	 * opens of the final target, so drop the open-intent flags. */
3793  	if (unlikely(res))
3794  		nd->flags &= ~(LOOKUP_OPEN|LOOKUP_CREATE|LOOKUP_EXCL);
3795  	return res;
3796  }
3797  
3798  /*
3799   * Handle the last step of open()
3800   */
do_open(struct nameidata * nd,struct file * file,const struct open_flags * op)3801  static int do_open(struct nameidata *nd,
3802  		   struct file *file, const struct open_flags *op)
3803  {
3804  	struct mnt_idmap *idmap;
3805  	int open_flag = op->open_flag;
3806  	bool do_truncate;
3807  	int acc_mode;
3808  	int error;
3809  
3810  	if (!(file->f_mode & (FMODE_OPENED | FMODE_CREATED))) {
3811  		error = complete_walk(nd);
3812  		if (error)
3813  			return error;
3814  	}
3815  	if (!(file->f_mode & FMODE_CREATED))
3816  		audit_inode(nd->name, nd->path.dentry, 0);
3817  	idmap = mnt_idmap(nd->path.mnt);
3818  	if (open_flag & O_CREAT) {
3819  		if ((open_flag & O_EXCL) && !(file->f_mode & FMODE_CREATED))
3820  			return -EEXIST;
3821  		if (d_is_dir(nd->path.dentry))
3822  			return -EISDIR;
3823  		error = may_create_in_sticky(idmap, nd,
3824  					     d_backing_inode(nd->path.dentry));
3825  		if (unlikely(error))
3826  			return error;
3827  	}
3828  	if ((nd->flags & LOOKUP_DIRECTORY) && !d_can_lookup(nd->path.dentry))
3829  		return -ENOTDIR;
3830  
3831  	do_truncate = false;
3832  	acc_mode = op->acc_mode;
3833  	if (file->f_mode & FMODE_CREATED) {
3834  		/* Don't check for write permission, don't truncate */
3835  		open_flag &= ~O_TRUNC;
3836  		acc_mode = 0;
3837  	} else if (d_is_reg(nd->path.dentry) && open_flag & O_TRUNC) {
3838  		error = mnt_want_write(nd->path.mnt);
3839  		if (error)
3840  			return error;
3841  		do_truncate = true;
3842  	}
3843  	error = may_open(idmap, &nd->path, acc_mode, open_flag);
3844  	if (!error && !(file->f_mode & FMODE_OPENED))
3845  		error = vfs_open(&nd->path, file);
3846  	if (!error)
3847  		error = security_file_post_open(file, op->acc_mode);
3848  	if (!error && do_truncate)
3849  		error = handle_truncate(idmap, file);
3850  	if (unlikely(error > 0)) {
3851  		WARN_ON(1);
3852  		error = -EINVAL;
3853  	}
3854  	if (do_truncate)
3855  		mnt_drop_write(nd->path.mnt);
3856  	return error;
3857  }
3858  
3859  /**
3860   * vfs_tmpfile - create tmpfile
3861   * @idmap:	idmap of the mount the inode was found from
3862   * @parentpath:	pointer to the path of the base directory
3863   * @file:	file descriptor of the new tmpfile
3864   * @mode:	mode of the new tmpfile
3865   *
3866   * Create a temporary file.
3867   *
3868   * If the inode has been found through an idmapped mount the idmap of
3869   * the vfsmount must be passed through @idmap. This function will then take
3870   * care to map the inode according to @idmap before checking permissions.
3871   * On non-idmapped mounts or if permission checking is to be performed on the
3872   * raw inode simply pass @nop_mnt_idmap.
3873   */
vfs_tmpfile(struct mnt_idmap * idmap,const struct path * parentpath,struct file * file,umode_t mode)3874  int vfs_tmpfile(struct mnt_idmap *idmap,
3875  		const struct path *parentpath,
3876  		struct file *file, umode_t mode)
3877  {
3878  	struct dentry *child;
3879  	struct inode *dir = d_inode(parentpath->dentry);
3880  	struct inode *inode;
3881  	int error;
3882  	int open_flag = file->f_flags;
3883  
3884  	/* we want directory to be writable */
3885  	error = inode_permission(idmap, dir, MAY_WRITE | MAY_EXEC);
3886  	if (error)
3887  		return error;
3888  	if (!dir->i_op->tmpfile)
3889  		return -EOPNOTSUPP;
3890  	child = d_alloc(parentpath->dentry, &slash_name);
3891  	if (unlikely(!child))
3892  		return -ENOMEM;
3893  	file->f_path.mnt = parentpath->mnt;
3894  	file->f_path.dentry = child;
3895  	mode = vfs_prepare_mode(idmap, dir, mode, mode, mode);
3896  	error = dir->i_op->tmpfile(idmap, dir, file, mode);
3897  	dput(child);
3898  	if (file->f_mode & FMODE_OPENED)
3899  		fsnotify_open(file);
3900  	if (error)
3901  		return error;
3902  	/* Don't check for other permissions, the inode was just created */
3903  	error = may_open(idmap, &file->f_path, 0, file->f_flags);
3904  	if (error)
3905  		return error;
3906  	inode = file_inode(file);
3907  	if (!(open_flag & O_EXCL)) {
3908  		spin_lock(&inode->i_lock);
3909  		inode->i_state |= I_LINKABLE;
3910  		spin_unlock(&inode->i_lock);
3911  	}
3912  	security_inode_post_create_tmpfile(idmap, inode);
3913  	return 0;
3914  }
3915  
3916  /**
3917   * kernel_tmpfile_open - open a tmpfile for kernel internal use
3918   * @idmap:	idmap of the mount the inode was found from
3919   * @parentpath:	path of the base directory
3920   * @mode:	mode of the new tmpfile
3921   * @open_flag:	flags
3922   * @cred:	credentials for open
3923   *
3924   * Create and open a temporary file.  The file is not accounted in nr_files,
3925   * hence this is only for kernel internal use, and must not be installed into
3926   * file tables or such.
3927   */
kernel_tmpfile_open(struct mnt_idmap * idmap,const struct path * parentpath,umode_t mode,int open_flag,const struct cred * cred)3928  struct file *kernel_tmpfile_open(struct mnt_idmap *idmap,
3929  				 const struct path *parentpath,
3930  				 umode_t mode, int open_flag,
3931  				 const struct cred *cred)
3932  {
3933  	struct file *file;
3934  	int error;
3935  
3936  	file = alloc_empty_file_noaccount(open_flag, cred);
3937  	if (IS_ERR(file))
3938  		return file;
3939  
3940  	error = vfs_tmpfile(idmap, parentpath, file, mode);
3941  	if (error) {
3942  		fput(file);
3943  		file = ERR_PTR(error);
3944  	}
3945  	return file;
3946  }
3947  EXPORT_SYMBOL(kernel_tmpfile_open);
3948  
/*
 * O_TMPFILE handling for path_openat(): resolve the (directory) path,
 * take write access on the mount and create the anonymous file there.
 */
do_tmpfile(struct nameidata * nd,unsigned flags,const struct open_flags * op,struct file * file)3949  static int do_tmpfile(struct nameidata *nd, unsigned flags,
3950  		const struct open_flags *op,
3951  		struct file *file)
3952  {
3953  	struct path path;
3954  	int error = path_lookupat(nd, flags | LOOKUP_DIRECTORY, &path);
3955  
3956  	if (unlikely(error))
3957  		return error;
3958  	error = mnt_want_write(path.mnt);
3959  	if (unlikely(error))
3960  		goto out;
3961  	error = vfs_tmpfile(mnt_idmap(path.mnt), &path, file, op->mode);
3962  	if (error)
3963  		goto out2;
3964  	audit_inode(nd->name, file->f_path.dentry, 0);
3965  out2:
3966  	mnt_drop_write(path.mnt);
3967  out:
3968  	path_put(&path);
3969  	return error;
3970  }
3971  
/*
 * O_PATH open: just resolve the path and pin it to the file - no
 * permission checks beyond the lookup itself, no ->open() of the target.
 */
do_o_path(struct nameidata * nd,unsigned flags,struct file * file)3972  static int do_o_path(struct nameidata *nd, unsigned flags, struct file *file)
3973  {
3974  	struct path path;
3975  	int error = path_lookupat(nd, flags, &path);
3976  	if (!error) {
3977  		audit_inode(nd->name, path.dentry, 0);
3978  		error = vfs_open(&path, file);
3979  		path_put(&path);
3980  	}
3981  	return error;
3982  }
3983  
/*
 * Core of open(): allocate the struct file and dispatch on the flavour
 * of open (O_TMPFILE, O_PATH, or a normal pathwalk-then-open).
 * Returns the opened file or an ERR_PTR.
 */
path_openat(struct nameidata * nd,const struct open_flags * op,unsigned flags)3984  static struct file *path_openat(struct nameidata *nd,
3985  			const struct open_flags *op, unsigned flags)
3986  {
3987  	struct file *file;
3988  	int error;
3989  
3990  	file = alloc_empty_file(op->open_flag, current_cred());
3991  	if (IS_ERR(file))
3992  		return file;
3993  
3994  	if (unlikely(file->f_flags & __O_TMPFILE)) {
3995  		error = do_tmpfile(nd, flags, op, file);
3996  	} else if (unlikely(file->f_flags & O_PATH)) {
3997  		error = do_o_path(nd, flags, file);
3998  	} else {
3999  		const char *s = path_init(nd, flags);
		/*
		 * Walk the path; each non-NULL return from
		 * open_last_lookups() is a trailing symlink body that
		 * must be walked in turn.
		 */
4000  		while (!(error = link_path_walk(s, nd)) &&
4001  		       (s = open_last_lookups(nd, file, op)) != NULL)
4002  			;
4003  		if (!error)
4004  			error = do_open(nd, file, op);
4005  		terminate_walk(nd);
4006  	}
4007  	if (likely(!error)) {
4008  		if (likely(file->f_mode & FMODE_OPENED))
4009  			return file;
		/* Success without FMODE_OPENED would be a VFS-internal bug. */
4010  		WARN_ON(1);
4011  		error = -EINVAL;
4012  	}
4013  	fput_close(file);
	/*
	 * -EOPENSTALE is internal; translate it so do_filp_open() retries
	 * in ref-walk (-ECHILD) or LOOKUP_REVAL (-ESTALE) mode.
	 */
4014  	if (error == -EOPENSTALE) {
4015  		if (flags & LOOKUP_RCU)
4016  			error = -ECHILD;
4017  		else
4018  			error = -ESTALE;
4019  	}
4020  	return ERR_PTR(error);
4021  }
4022  
/*
 * Open @pathname relative to @dfd.  Try the lockless RCU walk first,
 * fall back to ref-walk on -ECHILD, and to a LOOKUP_REVAL walk on
 * -ESTALE (stale NFS handles etc.).
 */
do_filp_open(int dfd,struct filename * pathname,const struct open_flags * op)4023  struct file *do_filp_open(int dfd, struct filename *pathname,
4024  		const struct open_flags *op)
4025  {
4026  	struct nameidata nd;
4027  	int flags = op->lookup_flags;
4028  	struct file *filp;
4029  
4030  	set_nameidata(&nd, dfd, pathname, NULL);
4031  	filp = path_openat(&nd, op, flags | LOOKUP_RCU);
4032  	if (unlikely(filp == ERR_PTR(-ECHILD)))
4033  		filp = path_openat(&nd, op, flags);
4034  	if (unlikely(filp == ERR_PTR(-ESTALE)))
4035  		filp = path_openat(&nd, op, flags | LOOKUP_REVAL);
4036  	restore_nameidata();
4037  	return filp;
4038  }
4039  
/*
 * Open @name relative to an explicit @root path (kernel-internal users).
 * Same RCU -> ref-walk -> LOOKUP_REVAL retry ladder as do_filp_open().
 */
do_file_open_root(const struct path * root,const char * name,const struct open_flags * op)4040  struct file *do_file_open_root(const struct path *root,
4041  		const char *name, const struct open_flags *op)
4042  {
4043  	struct nameidata nd;
4044  	struct file *file;
4045  	struct filename *filename;
4046  	int flags = op->lookup_flags;
4047  
	/* A symlink as root would have nothing meaningful to resolve against. */
4048  	if (d_is_symlink(root->dentry) && op->intent & LOOKUP_OPEN)
4049  		return ERR_PTR(-ELOOP);
4050  
4051  	filename = getname_kernel(name);
4052  	if (IS_ERR(filename))
4053  		return ERR_CAST(filename);
4054  
4055  	set_nameidata(&nd, -1, filename, root);
4056  	file = path_openat(&nd, op, flags | LOOKUP_RCU);
4057  	if (unlikely(file == ERR_PTR(-ECHILD)))
4058  		file = path_openat(&nd, op, flags);
4059  	if (unlikely(file == ERR_PTR(-ESTALE)))
4060  		file = path_openat(&nd, op, flags | LOOKUP_REVAL);
4061  	restore_nameidata();
4062  	putname(filename);
4063  	return file;
4064  }
4065  
/*
 * Resolve the parent directory of @name and return the dentry at which
 * the last component is to be created.  On success the parent inode is
 * left locked and write access on the mount is held - both are undone
 * by done_path_create().  Returns an ERR_PTR on failure.
 */
filename_create(int dfd,struct filename * name,struct path * path,unsigned int lookup_flags)4066  static struct dentry *filename_create(int dfd, struct filename *name,
4067  				      struct path *path, unsigned int lookup_flags)
4068  {
4069  	struct dentry *dentry = ERR_PTR(-EEXIST);
4070  	struct qstr last;
4071  	bool want_dir = lookup_flags & LOOKUP_DIRECTORY;
4072  	unsigned int reval_flag = lookup_flags & LOOKUP_REVAL;
4073  	unsigned int create_flags = LOOKUP_CREATE | LOOKUP_EXCL;
4074  	int type;
4075  	int err2;
4076  	int error;
4077  
4078  	error = filename_parentat(dfd, name, reval_flag, path, &last, &type);
4079  	if (error)
4080  		return ERR_PTR(error);
4081  
4082  	/*
4083  	 * Yucky last component or no last component at all?
4084  	 * (foo/., foo/.., /////)
4085  	 */
4086  	if (unlikely(type != LAST_NORM))
4087  		goto out;
4088  
4089  	/* don't fail immediately if it's r/o, at least try to report other errors */
4090  	err2 = mnt_want_write(path->mnt);
4091  	/*
4092  	 * Do the final lookup.  Suppress 'create' if there is a trailing
4093  	 * '/', and a directory wasn't requested.
4094  	 */
4095  	if (last.name[last.len] && !want_dir)
4096  		create_flags &= ~LOOKUP_CREATE;
4097  	inode_lock_nested(path->dentry->d_inode, I_MUTEX_PARENT);
4098  	dentry = lookup_one_qstr_excl(&last, path->dentry,
4099  				      reval_flag | create_flags);
4100  	if (IS_ERR(dentry))
4101  		goto unlock;
4102  
	/* Now it's safe to report the deferred r/o filesystem error. */
4103  	if (unlikely(err2)) {
4104  		error = err2;
4105  		goto fail;
4106  	}
4107  	return dentry;
4108  fail:
4109  	dput(dentry);
4110  	dentry = ERR_PTR(error);
4111  unlock:
4112  	inode_unlock(path->dentry->d_inode);
4113  	if (!err2)
4114  		mnt_drop_write(path->mnt);
4115  out:
4116  	path_put(path);
4117  	return dentry;
4118  }
4119  
/*
 * Kernel-string wrapper around filename_create(); pair a successful
 * return with done_path_create().
 */
kern_path_create(int dfd,const char * pathname,struct path * path,unsigned int lookup_flags)4120  struct dentry *kern_path_create(int dfd, const char *pathname,
4121  				struct path *path, unsigned int lookup_flags)
4122  {
4123  	struct filename *filename = getname_kernel(pathname);
4124  	struct dentry *res = filename_create(dfd, filename, path, lookup_flags);
4125  
4126  	putname(filename);
4127  	return res;
4128  }
4129  EXPORT_SYMBOL(kern_path_create);
4130  
/*
 * Undo what filename_create() set up: release the child dentry, unlock
 * the parent inode, drop mount write access and the path reference.
 */
done_path_create(struct path * path,struct dentry * dentry)4131  void done_path_create(struct path *path, struct dentry *dentry)
4132  {
4133  	if (!IS_ERR(dentry))
4134  		dput(dentry);
4135  	inode_unlock(path->dentry->d_inode);
4136  	mnt_drop_write(path->mnt);
4137  	path_put(path);
4138  }
4139  EXPORT_SYMBOL(done_path_create);
4140  
/*
 * Userspace-string wrapper around filename_create(); pair a successful
 * return with done_path_create().
 */
user_path_create(int dfd,const char __user * pathname,struct path * path,unsigned int lookup_flags)4141  inline struct dentry *user_path_create(int dfd, const char __user *pathname,
4142  				struct path *path, unsigned int lookup_flags)
4143  {
4144  	struct filename *filename = getname(pathname);
4145  	struct dentry *res = filename_create(dfd, filename, path, lookup_flags);
4146  
4147  	putname(filename);
4148  	return res;
4149  }
4150  EXPORT_SYMBOL(user_path_create);
4151  
4152  /**
4153   * vfs_mknod - create device node or file
4154   * @idmap:	idmap of the mount the inode was found from
4155   * @dir:	inode of the parent directory
4156   * @dentry:	dentry of the child device node
4157   * @mode:	mode of the child device node
4158   * @dev:	device number of device to create
4159   *
4160   * Create a device node or file.
4161   *
4162   * If the inode has been found through an idmapped mount the idmap of
4163   * the vfsmount must be passed through @idmap. This function will then take
4164   * care to map the inode according to @idmap before checking permissions.
4165   * On non-idmapped mounts or if permission checking is to be performed on the
4166   * raw inode simply pass @nop_mnt_idmap.
4167   */
vfs_mknod(struct mnt_idmap * idmap,struct inode * dir,struct dentry * dentry,umode_t mode,dev_t dev)4168  int vfs_mknod(struct mnt_idmap *idmap, struct inode *dir,
4169  	      struct dentry *dentry, umode_t mode, dev_t dev)
4170  {
4171  	bool is_whiteout = S_ISCHR(mode) && dev == WHITEOUT_DEV;
4172  	int error = may_create(idmap, dir, dentry);
4173  
4174  	if (error)
4175  		return error;
4176  
4177  	if ((S_ISCHR(mode) || S_ISBLK(mode)) && !is_whiteout &&
4178  	    !capable(CAP_MKNOD))
4179  		return -EPERM;
4180  
4181  	if (!dir->i_op->mknod)
4182  		return -EPERM;
4183  
4184  	mode = vfs_prepare_mode(idmap, dir, mode, mode, mode);
4185  	error = devcgroup_inode_mknod(mode, dev);
4186  	if (error)
4187  		return error;
4188  
4189  	error = security_inode_mknod(dir, dentry, mode, dev);
4190  	if (error)
4191  		return error;
4192  
4193  	error = dir->i_op->mknod(idmap, dir, dentry, mode, dev);
4194  	if (!error)
4195  		fsnotify_create(dir, dentry);
4196  	return error;
4197  }
4198  EXPORT_SYMBOL(vfs_mknod);
4199  
/*
 * Check whether @mode names a file type that mknod(2) may create.
 * Regular files (including the zero mode, which means S_IFREG), device
 * nodes, FIFOs and sockets are allowed; directories must go through
 * mkdir() (-EPERM); anything else is invalid (-EINVAL).
 */
static int may_mknod(umode_t mode)
{
	umode_t type = mode & S_IFMT;

	if (type == S_IFDIR)
		return -EPERM;
	if (type == 0 || type == S_IFREG || type == S_IFCHR ||
	    type == S_IFBLK || type == S_IFIFO || type == S_IFSOCK)
		return 0;
	return -EINVAL;
}
4216  
/*
 * Common implementation of mknod(2)/mknodat(2): validate the requested
 * file type, create the parent-locked dentry and dispatch to vfs_create()
 * or vfs_mknod().  Consumes @name.
 */
do_mknodat(int dfd,struct filename * name,umode_t mode,unsigned int dev)4217  static int do_mknodat(int dfd, struct filename *name, umode_t mode,
4218  		unsigned int dev)
4219  {
4220  	struct mnt_idmap *idmap;
4221  	struct dentry *dentry;
4222  	struct path path;
4223  	int error;
4224  	unsigned int lookup_flags = 0;
4225  
4226  	error = may_mknod(mode);
4227  	if (error)
4228  		goto out1;
retry:
4230  	dentry = filename_create(dfd, name, &path, lookup_flags);
4231  	error = PTR_ERR(dentry);
4232  	if (IS_ERR(dentry))
4233  		goto out1;
4234  
4235  	error = security_path_mknod(&path, dentry,
4236  			mode_strip_umask(path.dentry->d_inode, mode), dev);
4237  	if (error)
4238  		goto out2;
4239  
4240  	idmap = mnt_idmap(path.mnt);
4241  	switch (mode & S_IFMT) {
4242  		case 0: case S_IFREG:
4243  			error = vfs_create(idmap, path.dentry->d_inode,
4244  					   dentry, mode, true);
4245  			if (!error)
4246  				security_path_post_mknod(idmap, dentry);
4247  			break;
4248  		case S_IFCHR: case S_IFBLK:
4249  			error = vfs_mknod(idmap, path.dentry->d_inode,
4250  					  dentry, mode, new_decode_dev(dev));
4251  			break;
4252  		case S_IFIFO: case S_IFSOCK:
4253  			error = vfs_mknod(idmap, path.dentry->d_inode,
4254  					  dentry, mode, 0);
4255  			break;
4256  	}
out2:
4258  	done_path_create(&path, dentry);
	/* On a stale handle, redo the whole lookup with revalidation. */
4259  	if (retry_estale(error, lookup_flags)) {
4260  		lookup_flags |= LOOKUP_REVAL;
4261  		goto retry;
4262  	}
out1:
4264  	putname(name);
4265  	return error;
4266  }
4267  
/* mknodat(2): create a device node or file relative to @dfd. */
SYSCALL_DEFINE4(mknodat,int,dfd,const char __user *,filename,umode_t,mode,unsigned int,dev)4268  SYSCALL_DEFINE4(mknodat, int, dfd, const char __user *, filename, umode_t, mode,
4269  		unsigned int, dev)
4270  {
4271  	return do_mknodat(dfd, getname(filename), mode, dev);
4272  }
4273  
/* mknod(2): like mknodat(2) with the current working directory as base. */
SYSCALL_DEFINE3(mknod,const char __user *,filename,umode_t,mode,unsigned,dev)4274  SYSCALL_DEFINE3(mknod, const char __user *, filename, umode_t, mode, unsigned, dev)
4275  {
4276  	return do_mknodat(AT_FDCWD, getname(filename), mode, dev);
4277  }
4278  
4279  /**
4280   * vfs_mkdir - create directory returning correct dentry if possible
4281   * @idmap:	idmap of the mount the inode was found from
4282   * @dir:	inode of the parent directory
4283   * @dentry:	dentry of the child directory
4284   * @mode:	mode of the child directory
4285   *
4286   * Create a directory.
4287   *
4288   * If the inode has been found through an idmapped mount the idmap of
4289   * the vfsmount must be passed through @idmap. This function will then take
4290   * care to map the inode according to @idmap before checking permissions.
4291   * On non-idmapped mounts or if permission checking is to be performed on the
4292   * raw inode simply pass @nop_mnt_idmap.
4293   *
4294   * In the event that the filesystem does not use the *@dentry but leaves it
4295   * negative or unhashes it and possibly splices a different one returning it,
4296   * the original dentry is dput() and the alternate is returned.
4297   *
4298   * In case of an error the dentry is dput() and an ERR_PTR() is returned.
4299   */
vfs_mkdir(struct mnt_idmap * idmap,struct inode * dir,struct dentry * dentry,umode_t mode)4300  struct dentry *vfs_mkdir(struct mnt_idmap *idmap, struct inode *dir,
4301  			 struct dentry *dentry, umode_t mode)
4302  {
4303  	int error;
4304  	unsigned max_links = dir->i_sb->s_max_links;
4305  	struct dentry *de;
4306  
4307  	error = may_create(idmap, dir, dentry);
4308  	if (error)
4309  		goto err;
4310  
4311  	error = -EPERM;
4312  	if (!dir->i_op->mkdir)
4313  		goto err;
4314  
4315  	mode = vfs_prepare_mode(idmap, dir, mode, S_IRWXUGO | S_ISVTX, 0);
4316  	error = security_inode_mkdir(dir, dentry, mode);
4317  	if (error)
4318  		goto err;
4319  
4320  	error = -EMLINK;
4321  	if (max_links && dir->i_nlink >= max_links)
4322  		goto err;
4323  
4324  	de = dir->i_op->mkdir(idmap, dir, dentry, mode);
4325  	error = PTR_ERR(de);
4326  	if (IS_ERR(de))
4327  		goto err;
4328  	if (de) {
4329  		dput(dentry);
4330  		dentry = de;
4331  	}
4332  	fsnotify_mkdir(dir, dentry);
4333  	return dentry;
4334  
4335  err:
4336  	dput(dentry);
4337  	return ERR_PTR(error);
4338  }
4339  EXPORT_SYMBOL(vfs_mkdir);
4340  
/*
 * Common implementation of mkdir(2)/mkdirat(2).  Consumes @name and
 * retries with LOOKUP_REVAL on a stale filesystem handle.
 */
do_mkdirat(int dfd,struct filename * name,umode_t mode)4341  int do_mkdirat(int dfd, struct filename *name, umode_t mode)
4342  {
4343  	struct dentry *dentry;
4344  	struct path path;
4345  	int error;
4346  	unsigned int lookup_flags = LOOKUP_DIRECTORY;
4347  
retry:
4349  	dentry = filename_create(dfd, name, &path, lookup_flags);
4350  	error = PTR_ERR(dentry);
4351  	if (IS_ERR(dentry))
4352  		goto out_putname;
4353  
4354  	error = security_path_mkdir(&path, dentry,
4355  			mode_strip_umask(path.dentry->d_inode, mode));
4356  	if (!error) {
		/* vfs_mkdir() may return a spliced dentry or an ERR_PTR. */
4357  		dentry = vfs_mkdir(mnt_idmap(path.mnt), path.dentry->d_inode,
4358  				  dentry, mode);
4359  		if (IS_ERR(dentry))
4360  			error = PTR_ERR(dentry);
4361  	}
4362  	done_path_create(&path, dentry);
4363  	if (retry_estale(error, lookup_flags)) {
4364  		lookup_flags |= LOOKUP_REVAL;
4365  		goto retry;
4366  	}
out_putname:
4368  	putname(name);
4369  	return error;
4370  }
4371  
/* mkdirat(2): create a directory relative to @dfd. */
SYSCALL_DEFINE3(mkdirat,int,dfd,const char __user *,pathname,umode_t,mode)4372  SYSCALL_DEFINE3(mkdirat, int, dfd, const char __user *, pathname, umode_t, mode)
4373  {
4374  	return do_mkdirat(dfd, getname(pathname), mode);
4375  }
4376  
/* mkdir(2): like mkdirat(2) with the current working directory as base. */
SYSCALL_DEFINE2(mkdir,const char __user *,pathname,umode_t,mode)4377  SYSCALL_DEFINE2(mkdir, const char __user *, pathname, umode_t, mode)
4378  {
4379  	return do_mkdirat(AT_FDCWD, getname(pathname), mode);
4380  }
4381  
4382  /**
4383   * vfs_rmdir - remove directory
4384   * @idmap:	idmap of the mount the inode was found from
4385   * @dir:	inode of the parent directory
4386   * @dentry:	dentry of the child directory
4387   *
4388   * Remove a directory.
4389   *
4390   * If the inode has been found through an idmapped mount the idmap of
4391   * the vfsmount must be passed through @idmap. This function will then take
4392   * care to map the inode according to @idmap before checking permissions.
4393   * On non-idmapped mounts or if permission checking is to be performed on the
4394   * raw inode simply pass @nop_mnt_idmap.
4395   */
vfs_rmdir(struct mnt_idmap * idmap,struct inode * dir,struct dentry * dentry)4396  int vfs_rmdir(struct mnt_idmap *idmap, struct inode *dir,
4397  		     struct dentry *dentry)
4398  {
4399  	int error = may_delete(idmap, dir, dentry, 1);
4400  
4401  	if (error)
4402  		return error;
4403  
4404  	if (!dir->i_op->rmdir)
4405  		return -EPERM;
4406  
4407  	dget(dentry);
4408  	inode_lock(dentry->d_inode);
4409  
4410  	error = -EBUSY;
4411  	if (is_local_mountpoint(dentry) ||
4412  	    (dentry->d_inode->i_flags & S_KERNEL_FILE))
4413  		goto out;
4414  
4415  	error = security_inode_rmdir(dir, dentry);
4416  	if (error)
4417  		goto out;
4418  
4419  	error = dir->i_op->rmdir(dir, dentry);
4420  	if (error)
4421  		goto out;
4422  
4423  	shrink_dcache_parent(dentry);
4424  	dentry->d_inode->i_flags |= S_DEAD;
4425  	dont_mount(dentry);
4426  	detach_mounts(dentry);
4427  
4428  out:
4429  	inode_unlock(dentry->d_inode);
4430  	dput(dentry);
4431  	if (!error)
4432  		d_delete_notify(dir, dentry);
4433  	return error;
4434  }
4435  EXPORT_SYMBOL(vfs_rmdir);
4436  
/*
 * Common implementation of rmdir(2).  Consumes @name; retries with
 * LOOKUP_REVAL on a stale filesystem handle.
 */
do_rmdir(int dfd,struct filename * name)4437  int do_rmdir(int dfd, struct filename *name)
4438  {
4439  	int error;
4440  	struct dentry *dentry;
4441  	struct path path;
4442  	struct qstr last;
4443  	int type;
4444  	unsigned int lookup_flags = 0;
retry:
4446  	error = filename_parentat(dfd, name, lookup_flags, &path, &last, &type);
4447  	if (error)
4448  		goto exit1;
4449  
	/* "..", "." and the root itself can never be removed. */
4450  	switch (type) {
4451  	case LAST_DOTDOT:
4452  		error = -ENOTEMPTY;
4453  		goto exit2;
4454  	case LAST_DOT:
4455  		error = -EINVAL;
4456  		goto exit2;
4457  	case LAST_ROOT:
4458  		error = -EBUSY;
4459  		goto exit2;
4460  	}
4461  
4462  	error = mnt_want_write(path.mnt);
4463  	if (error)
4464  		goto exit2;
4465  
4466  	inode_lock_nested(path.dentry->d_inode, I_MUTEX_PARENT);
4467  	dentry = lookup_one_qstr_excl(&last, path.dentry, lookup_flags);
4468  	error = PTR_ERR(dentry);
4469  	if (IS_ERR(dentry))
4470  		goto exit3;
4471  	error = security_path_rmdir(&path, dentry);
4472  	if (error)
4473  		goto exit4;
4474  	error = vfs_rmdir(mnt_idmap(path.mnt), path.dentry->d_inode, dentry);
exit4:
4476  	dput(dentry);
exit3:
4478  	inode_unlock(path.dentry->d_inode);
4479  	mnt_drop_write(path.mnt);
exit2:
4481  	path_put(&path);
4482  	if (retry_estale(error, lookup_flags)) {
4483  		lookup_flags |= LOOKUP_REVAL;
4484  		goto retry;
4485  	}
exit1:
4487  	putname(name);
4488  	return error;
4489  }
4490  
/* rmdir(2): remove a directory relative to the current working directory. */
SYSCALL_DEFINE1(rmdir,const char __user *,pathname)4491  SYSCALL_DEFINE1(rmdir, const char __user *, pathname)
4492  {
4493  	return do_rmdir(AT_FDCWD, getname(pathname));
4494  }
4495  
4496  /**
4497   * vfs_unlink - unlink a filesystem object
4498   * @idmap:	idmap of the mount the inode was found from
4499   * @dir:	parent directory
4500   * @dentry:	victim
4501   * @delegated_inode: returns victim inode, if the inode is delegated.
4502   *
4503   * The caller must hold dir->i_mutex.
4504   *
4505   * If vfs_unlink discovers a delegation, it will return -EWOULDBLOCK and
4506   * return a reference to the inode in delegated_inode.  The caller
4507   * should then break the delegation on that inode and retry.  Because
4508   * breaking a delegation may take a long time, the caller should drop
4509   * dir->i_mutex before doing so.
4510   *
4511   * Alternatively, a caller may pass NULL for delegated_inode.  This may
4512   * be appropriate for callers that expect the underlying filesystem not
4513   * to be NFS exported.
4514   *
4515   * If the inode has been found through an idmapped mount the idmap of
4516   * the vfsmount must be passed through @idmap. This function will then take
4517   * care to map the inode according to @idmap before checking permissions.
4518   * On non-idmapped mounts or if permission checking is to be performed on the
4519   * raw inode simply pass @nop_mnt_idmap.
4520   */
vfs_unlink(struct mnt_idmap * idmap,struct inode * dir,struct dentry * dentry,struct inode ** delegated_inode)4521  int vfs_unlink(struct mnt_idmap *idmap, struct inode *dir,
4522  	       struct dentry *dentry, struct inode **delegated_inode)
4523  {
4524  	struct inode *target = dentry->d_inode;
4525  	int error = may_delete(idmap, dir, dentry, 0);
4526  
4527  	if (error)
4528  		return error;
4529  
4530  	if (!dir->i_op->unlink)
4531  		return -EPERM;
4532  
4533  	inode_lock(target);
4534  	if (IS_SWAPFILE(target))
4535  		error = -EPERM;
4536  	else if (is_local_mountpoint(dentry))
4537  		error = -EBUSY;
4538  	else {
4539  		error = security_inode_unlink(dir, dentry);
4540  		if (!error) {
4541  			error = try_break_deleg(target, delegated_inode);
4542  			if (error)
4543  				goto out;
4544  			error = dir->i_op->unlink(dir, dentry);
4545  			if (!error) {
4546  				dont_mount(dentry);
4547  				detach_mounts(dentry);
4548  			}
4549  		}
4550  	}
4551  out:
4552  	inode_unlock(target);
4553  
4554  	/* We don't d_delete() NFS sillyrenamed files--they still exist. */
4555  	if (!error && dentry->d_flags & DCACHE_NFSFS_RENAMED) {
4556  		fsnotify_unlink(dir, dentry);
4557  	} else if (!error) {
4558  		fsnotify_link_count(target);
4559  		d_delete_notify(dir, dentry);
4560  	}
4561  
4562  	return error;
4563  }
4564  EXPORT_SYMBOL(vfs_unlink);
4565  
4566  /*
4567   * Make sure that the actual truncation of the file will occur outside its
4568   * directory's i_mutex.  Truncate can take a long time if there is a lot of
4569   * writeout happening, and we don't want to prevent access to the directory
4570   * while waiting on the I/O.
4571   */
int do_unlinkat(int dfd, struct filename *name)
{
	int error;
	struct dentry *dentry;
	struct path path;
	struct qstr last;
	int type;
	struct inode *inode = NULL;
	struct inode *delegated_inode = NULL;
	unsigned int lookup_flags = 0;
retry:
	error = filename_parentat(dfd, name, lookup_flags, &path, &last, &type);
	if (error)
		goto exit1;

	/* last component must be a normal name (not ".", ".." or the root) */
	error = -EISDIR;
	if (type != LAST_NORM)
		goto exit2;

	error = mnt_want_write(path.mnt);
	if (error)
		goto exit2;
retry_deleg:
	inode_lock_nested(path.dentry->d_inode, I_MUTEX_PARENT);
	dentry = lookup_one_qstr_excl(&last, path.dentry, lookup_flags);
	error = PTR_ERR(dentry);
	if (!IS_ERR(dentry)) {

		/* Why not before? Because we want correct error value */
		if (last.name[last.len])
			goto slashes;
		inode = dentry->d_inode;
		/* hold the inode so final truncation happens after unlock */
		ihold(inode);
		error = security_path_unlink(&path, dentry);
		if (error)
			goto exit3;
		error = vfs_unlink(mnt_idmap(path.mnt), path.dentry->d_inode,
				   dentry, &delegated_inode);
exit3:
		dput(dentry);
	}
	inode_unlock(path.dentry->d_inode);
	if (inode)
		iput(inode);	/* truncate the inode here */
	inode = NULL;
	if (delegated_inode) {
		/* a delegation got in the way; wait for it, then redo the lookup */
		error = break_deleg_wait(&delegated_inode);
		if (!error)
			goto retry_deleg;
	}
	mnt_drop_write(path.mnt);
exit2:
	path_put(&path);
	if (retry_estale(error, lookup_flags)) {
		/* stale file handle: repeat the walk with revalidation forced */
		lookup_flags |= LOOKUP_REVAL;
		inode = NULL;
		goto retry;
	}
exit1:
	putname(name);
	return error;

slashes:
	/* name had trailing slashes: pick the error matching what we found */
	if (d_is_dir(dentry))
		error = -EISDIR;
	else
		error = -ENOTDIR;
	goto exit3;
}
4641  
/* unlinkat(2): the only permitted flag is AT_REMOVEDIR, which means rmdir */
SYSCALL_DEFINE3(unlinkat, int, dfd, const char __user *, pathname, int, flag)
{
	if ((flag & ~AT_REMOVEDIR) != 0)
		return -EINVAL;

	if (flag & AT_REMOVEDIR)
		return do_rmdir(dfd, getname(pathname));
	return do_unlinkat(dfd, getname(pathname));
}
4651  
/* unlink(2): unlinkat() relative to the current working directory */
SYSCALL_DEFINE1(unlink, const char __user *, pathname)
{
	return do_unlinkat(AT_FDCWD, getname(pathname));
}
4656  
4657  /**
4658   * vfs_symlink - create symlink
4659   * @idmap:	idmap of the mount the inode was found from
4660   * @dir:	inode of the parent directory
4661   * @dentry:	dentry of the child symlink file
4662   * @oldname:	name of the file to link to
4663   *
4664   * Create a symlink.
4665   *
4666   * If the inode has been found through an idmapped mount the idmap of
4667   * the vfsmount must be passed through @idmap. This function will then take
4668   * care to map the inode according to @idmap before checking permissions.
4669   * On non-idmapped mounts or if permission checking is to be performed on the
4670   * raw inode simply pass @nop_mnt_idmap.
4671   */
int vfs_symlink(struct mnt_idmap *idmap, struct inode *dir,
		struct dentry *dentry, const char *oldname)
{
	int error;

	/* may we create an entry in @dir at all? */
	error = may_create(idmap, dir, dentry);
	if (error)
		return error;

	if (!dir->i_op->symlink)
		return -EPERM;

	/* security hook runs after permission checks, before the fs op */
	error = security_inode_symlink(dir, dentry, oldname);
	if (error)
		return error;

	error = dir->i_op->symlink(idmap, dir, dentry, oldname);
	if (!error)
		fsnotify_create(dir, dentry);
	return error;
}
EXPORT_SYMBOL(vfs_symlink);
4694  
int do_symlinkat(struct filename *from, int newdfd, struct filename *to)
{
	int error;
	struct dentry *dentry;
	struct path path;
	unsigned int lookup_flags = 0;

	/* getname() of the link body may already have failed */
	if (IS_ERR(from)) {
		error = PTR_ERR(from);
		goto out_putnames;
	}
retry:
	dentry = filename_create(newdfd, to, &path, lookup_flags);
	error = PTR_ERR(dentry);
	if (IS_ERR(dentry))
		goto out_putnames;

	error = security_path_symlink(&path, dentry, from->name);
	if (!error)
		error = vfs_symlink(mnt_idmap(path.mnt), path.dentry->d_inode,
				    dentry, from->name);
	done_path_create(&path, dentry);
	if (retry_estale(error, lookup_flags)) {
		/* stale file handle: retry with revalidation forced */
		lookup_flags |= LOOKUP_REVAL;
		goto retry;
	}
out_putnames:
	putname(to);
	putname(from);
	return error;
}
4726  
/* symlinkat(2) */
SYSCALL_DEFINE3(symlinkat, const char __user *, oldname,
		int, newdfd, const char __user *, newname)
{
	return do_symlinkat(getname(oldname), newdfd, getname(newname));
}
4732  
/* symlink(2): symlinkat() relative to the current working directory */
SYSCALL_DEFINE2(symlink, const char __user *, oldname, const char __user *, newname)
{
	return do_symlinkat(getname(oldname), AT_FDCWD, getname(newname));
}
4737  
4738  /**
4739   * vfs_link - create a new link
4740   * @old_dentry:	object to be linked
4741   * @idmap:	idmap of the mount
4742   * @dir:	new parent
4743   * @new_dentry:	where to create the new link
4744   * @delegated_inode: returns inode needing a delegation break
4745   *
4746   * The caller must hold dir->i_mutex
4747   *
4748   * If vfs_link discovers a delegation on the to-be-linked file in need
4749   * of breaking, it will return -EWOULDBLOCK and return a reference to the
4750   * inode in delegated_inode.  The caller should then break the delegation
4751   * and retry.  Because breaking a delegation may take a long time, the
4752   * caller should drop the i_mutex before doing so.
4753   *
4754   * Alternatively, a caller may pass NULL for delegated_inode.  This may
4755   * be appropriate for callers that expect the underlying filesystem not
4756   * to be NFS exported.
4757   *
4758   * If the inode has been found through an idmapped mount the idmap of
4759   * the vfsmount must be passed through @idmap. This function will then take
4760   * care to map the inode according to @idmap before checking permissions.
4761   * On non-idmapped mounts or if permission checking is to be performed on the
4762   * raw inode simply pass @nop_mnt_idmap.
4763   */
int vfs_link(struct dentry *old_dentry, struct mnt_idmap *idmap,
	     struct inode *dir, struct dentry *new_dentry,
	     struct inode **delegated_inode)
{
	struct inode *inode = old_dentry->d_inode;
	unsigned max_links = dir->i_sb->s_max_links;
	int error;

	if (!inode)
		return -ENOENT;

	error = may_create(idmap, dir, new_dentry);
	if (error)
		return error;

	/* hard links may not cross superblocks */
	if (dir->i_sb != inode->i_sb)
		return -EXDEV;

	/*
	 * A link to an append-only or immutable file cannot be created.
	 */
	if (IS_APPEND(inode) || IS_IMMUTABLE(inode))
		return -EPERM;
	/*
	 * Updating the link count will likely cause i_uid and i_gid to
	 * be writen back improperly if their true value is unknown to
	 * the vfs.
	 */
	if (HAS_UNMAPPED_ID(idmap, inode))
		return -EPERM;
	if (!dir->i_op->link)
		return -EPERM;
	/* hard links to directories are not allowed */
	if (S_ISDIR(inode->i_mode))
		return -EPERM;

	error = security_inode_link(old_dentry, dir, new_dentry);
	if (error)
		return error;

	inode_lock(inode);
	/* Make sure we don't allow creating hardlink to an unlinked file */
	if (inode->i_nlink == 0 && !(inode->i_state & I_LINKABLE))
		error =  -ENOENT;
	else if (max_links && inode->i_nlink >= max_links)
		error = -EMLINK;
	else {
		/* break any delegation before the link count changes */
		error = try_break_deleg(inode, delegated_inode);
		if (!error)
			error = dir->i_op->link(old_dentry, dir, new_dentry);
	}

	/* I_LINKABLE is one-shot: clear it once a link has been created */
	if (!error && (inode->i_state & I_LINKABLE)) {
		spin_lock(&inode->i_lock);
		inode->i_state &= ~I_LINKABLE;
		spin_unlock(&inode->i_lock);
	}
	inode_unlock(inode);
	if (!error)
		fsnotify_link(dir, inode, new_dentry);
	return error;
}
EXPORT_SYMBOL(vfs_link);
4826  
4827  /*
4828   * Hardlinks are often used in delicate situations.  We avoid
4829   * security-related surprises by not following symlinks on the
4830   * newname.  --KAB
4831   *
4832   * We don't follow them on the oldname either to be compatible
4833   * with linux 2.0, and to avoid hard-linking to directories
4834   * and other special files.  --ADM
4835   */
int do_linkat(int olddfd, struct filename *old, int newdfd,
	      struct filename *new, int flags)
{
	struct mnt_idmap *idmap;
	struct dentry *new_dentry;
	struct path old_path, new_path;
	struct inode *delegated_inode = NULL;
	int how = 0;
	int error;

	if ((flags & ~(AT_SYMLINK_FOLLOW | AT_EMPTY_PATH)) != 0) {
		error = -EINVAL;
		goto out_putnames;
	}
	/*
	 * To use null names we require CAP_DAC_READ_SEARCH or
	 * that the open-time creds of the dfd matches current.
	 * This ensures that not everyone will be able to create
	 * a hardlink using the passed file descriptor.
	 */
	if (flags & AT_EMPTY_PATH)
		how |= LOOKUP_LINKAT_EMPTY;

	if (flags & AT_SYMLINK_FOLLOW)
		how |= LOOKUP_FOLLOW;
retry:
	error = filename_lookup(olddfd, old, how, &old_path, NULL);
	if (error)
		goto out_putnames;

	/* only LOOKUP_REVAL (set on estale retry) is forwarded to the target */
	new_dentry = filename_create(newdfd, new, &new_path,
					(how & LOOKUP_REVAL));
	error = PTR_ERR(new_dentry);
	if (IS_ERR(new_dentry))
		goto out_putpath;

	/* hard links may not cross mounts */
	error = -EXDEV;
	if (old_path.mnt != new_path.mnt)
		goto out_dput;
	idmap = mnt_idmap(new_path.mnt);
	error = may_linkat(idmap, &old_path);
	if (unlikely(error))
		goto out_dput;
	error = security_path_link(old_path.dentry, &new_path, new_dentry);
	if (error)
		goto out_dput;
	error = vfs_link(old_path.dentry, idmap, new_path.dentry->d_inode,
			 new_dentry, &delegated_inode);
out_dput:
	done_path_create(&new_path, new_dentry);
	if (delegated_inode) {
		/* wait for the delegation break, then start over */
		error = break_deleg_wait(&delegated_inode);
		if (!error) {
			path_put(&old_path);
			goto retry;
		}
	}
	if (retry_estale(error, how)) {
		path_put(&old_path);
		how |= LOOKUP_REVAL;
		goto retry;
	}
out_putpath:
	path_put(&old_path);
out_putnames:
	putname(old);
	putname(new);

	return error;
}
4906  
/* linkat(2): getname_uflags() honours AT_EMPTY_PATH for a NULL oldname */
SYSCALL_DEFINE5(linkat, int, olddfd, const char __user *, oldname,
		int, newdfd, const char __user *, newname, int, flags)
{
	return do_linkat(olddfd, getname_uflags(oldname, flags),
		newdfd, getname(newname), flags);
}
4913  
/* link(2): linkat() with no flags, relative to the current working directory */
SYSCALL_DEFINE2(link, const char __user *, oldname, const char __user *, newname)
{
	return do_linkat(AT_FDCWD, getname(oldname), AT_FDCWD, getname(newname), 0);
}
4918  
4919  /**
4920   * vfs_rename - rename a filesystem object
4921   * @rd:		pointer to &struct renamedata info
4922   *
4923   * The caller must hold multiple mutexes--see lock_rename()).
4924   *
4925   * If vfs_rename discovers a delegation in need of breaking at either
4926   * the source or destination, it will return -EWOULDBLOCK and return a
4927   * reference to the inode in delegated_inode.  The caller should then
4928   * break the delegation and retry.  Because breaking a delegation may
4929   * take a long time, the caller should drop all locks before doing
4930   * so.
4931   *
4932   * Alternatively, a caller may pass NULL for delegated_inode.  This may
4933   * be appropriate for callers that expect the underlying filesystem not
4934   * to be NFS exported.
4935   *
4936   * The worst of all namespace operations - renaming directory. "Perverted"
4937   * doesn't even start to describe it. Somebody in UCB had a heck of a trip...
4938   * Problems:
4939   *
4940   *	a) we can get into loop creation.
4941   *	b) race potential - two innocent renames can create a loop together.
4942   *	   That's where 4.4BSD screws up. Current fix: serialization on
4943   *	   sb->s_vfs_rename_mutex. We might be more accurate, but that's another
4944   *	   story.
4945   *	c) we may have to lock up to _four_ objects - parents and victim (if it exists),
4946   *	   and source (if it's a non-directory or a subdirectory that moves to
4947   *	   different parent).
4948   *	   And that - after we got ->i_mutex on parents (until then we don't know
4949   *	   whether the target exists).  Solution: try to be smart with locking
4950   *	   order for inodes.  We rely on the fact that tree topology may change
4951   *	   only under ->s_vfs_rename_mutex _and_ that parent of the object we
4952   *	   move will be locked.  Thus we can rank directories by the tree
4953   *	   (ancestors first) and rank all non-directories after them.
4954   *	   That works since everybody except rename does "lock parent, lookup,
4955   *	   lock child" and rename is under ->s_vfs_rename_mutex.
4956   *	   HOWEVER, it relies on the assumption that any object with ->lookup()
4957   *	   has no more than 1 dentry.  If "hybrid" objects will ever appear,
4958   *	   we'd better make sure that there's no link(2) for them.
4959   *	d) conversion from fhandle to dentry may come in the wrong moment - when
4960   *	   we are removing the target. Solution: we will have to grab ->i_mutex
4961   *	   in the fhandle_to_dentry code. [FIXME - current nfsfh.c relies on
4962   *	   ->i_mutex on parents, which works but leads to some truly excessive
4963   *	   locking].
4964   */
int vfs_rename(struct renamedata *rd)
{
	int error;
	struct inode *old_dir = rd->old_dir, *new_dir = rd->new_dir;
	struct dentry *old_dentry = rd->old_dentry;
	struct dentry *new_dentry = rd->new_dentry;
	struct inode **delegated_inode = rd->delegated_inode;
	unsigned int flags = rd->flags;
	bool is_dir = d_is_dir(old_dentry);
	struct inode *source = old_dentry->d_inode;
	struct inode *target = new_dentry->d_inode;
	bool new_is_dir = false;
	unsigned max_links = new_dir->i_sb->s_max_links;
	struct name_snapshot old_name;
	bool lock_old_subdir, lock_new_subdir;

	/* both names refer to the same inode: nothing to do */
	if (source == target)
		return 0;

	error = may_delete(rd->old_mnt_idmap, old_dir, old_dentry, is_dir);
	if (error)
		return error;

	if (!target) {
		error = may_create(rd->new_mnt_idmap, new_dir, new_dentry);
	} else {
		new_is_dir = d_is_dir(new_dentry);

		/* on exchange the victim check uses the target's own type */
		if (!(flags & RENAME_EXCHANGE))
			error = may_delete(rd->new_mnt_idmap, new_dir,
					   new_dentry, is_dir);
		else
			error = may_delete(rd->new_mnt_idmap, new_dir,
					   new_dentry, new_is_dir);
	}
	if (error)
		return error;

	if (!old_dir->i_op->rename)
		return -EPERM;

	/*
	 * If we are going to change the parent - check write permissions,
	 * we'll need to flip '..'.
	 */
	if (new_dir != old_dir) {
		if (is_dir) {
			error = inode_permission(rd->old_mnt_idmap, source,
						 MAY_WRITE);
			if (error)
				return error;
		}
		if ((flags & RENAME_EXCHANGE) && new_is_dir) {
			error = inode_permission(rd->new_mnt_idmap, target,
						 MAY_WRITE);
			if (error)
				return error;
		}
	}

	error = security_inode_rename(old_dir, old_dentry, new_dir, new_dentry,
				      flags);
	if (error)
		return error;

	/* keep the old name alive for fsnotify; d_move() will change it */
	take_dentry_name_snapshot(&old_name, old_dentry);
	dget(new_dentry);
	/*
	 * Lock children.
	 * The source subdirectory needs to be locked on cross-directory
	 * rename or cross-directory exchange since its parent changes.
	 * The target subdirectory needs to be locked on cross-directory
	 * exchange due to parent change and on any rename due to becoming
	 * a victim.
	 * Non-directories need locking in all cases (for NFS reasons);
	 * they get locked after any subdirectories (in inode address order).
	 *
	 * NOTE: WE ONLY LOCK UNRELATED DIRECTORIES IN CROSS-DIRECTORY CASE.
	 * NEVER, EVER DO THAT WITHOUT ->s_vfs_rename_mutex.
	 */
	lock_old_subdir = new_dir != old_dir;
	lock_new_subdir = new_dir != old_dir || !(flags & RENAME_EXCHANGE);
	if (is_dir) {
		if (lock_old_subdir)
			inode_lock_nested(source, I_MUTEX_CHILD);
		if (target && (!new_is_dir || lock_new_subdir))
			inode_lock(target);
	} else if (new_is_dir) {
		if (lock_new_subdir)
			inode_lock_nested(target, I_MUTEX_CHILD);
		inode_lock(source);
	} else {
		lock_two_nondirectories(source, target);
	}

	/* neither end may be an active swap file */
	error = -EPERM;
	if (IS_SWAPFILE(source) || (target && IS_SWAPFILE(target)))
		goto out;

	/* neither end may be a mountpoint in this namespace */
	error = -EBUSY;
	if (is_local_mountpoint(old_dentry) || is_local_mountpoint(new_dentry))
		goto out;

	if (max_links && new_dir != old_dir) {
		error = -EMLINK;
		if (is_dir && !new_is_dir && new_dir->i_nlink >= max_links)
			goto out;
		if ((flags & RENAME_EXCHANGE) && !is_dir && new_is_dir &&
		    old_dir->i_nlink >= max_links)
			goto out;
	}
	/* break delegations on any non-directory inode involved */
	if (!is_dir) {
		error = try_break_deleg(source, delegated_inode);
		if (error)
			goto out;
	}
	if (target && !new_is_dir) {
		error = try_break_deleg(target, delegated_inode);
		if (error)
			goto out;
	}
	error = old_dir->i_op->rename(rd->new_mnt_idmap, old_dir, old_dentry,
				      new_dir, new_dentry, flags);
	if (error)
		goto out;

	if (!(flags & RENAME_EXCHANGE) && target) {
		if (is_dir) {
			shrink_dcache_parent(new_dentry);
			target->i_flags |= S_DEAD;	/* victim directory is dead */
		}
		dont_mount(new_dentry);
		detach_mounts(new_dentry);
	}
	if (!(old_dir->i_sb->s_type->fs_flags & FS_RENAME_DOES_D_MOVE)) {
		if (!(flags & RENAME_EXCHANGE))
			d_move(old_dentry, new_dentry);
		else
			d_exchange(old_dentry, new_dentry);
	}
out:
	/* unlock exactly what was locked above, in the same conditions */
	if (!is_dir || lock_old_subdir)
		inode_unlock(source);
	if (target && (!new_is_dir || lock_new_subdir))
		inode_unlock(target);
	dput(new_dentry);
	if (!error) {
		fsnotify_move(old_dir, new_dir, &old_name.name, is_dir,
			      !(flags & RENAME_EXCHANGE) ? target : NULL, old_dentry);
		if (flags & RENAME_EXCHANGE) {
			fsnotify_move(new_dir, old_dir, &old_dentry->d_name,
				      new_is_dir, NULL, new_dentry);
		}
	}
	release_dentry_name_snapshot(&old_name);

	return error;
}
EXPORT_SYMBOL(vfs_rename);
5124  
int do_renameat2(int olddfd, struct filename *from, int newdfd,
		 struct filename *to, unsigned int flags)
{
	struct renamedata rd;
	struct dentry *old_dentry, *new_dentry;
	struct dentry *trap;
	struct path old_path, new_path;
	struct qstr old_last, new_last;
	int old_type, new_type;
	struct inode *delegated_inode = NULL;
	unsigned int lookup_flags = 0, target_flags =
		LOOKUP_RENAME_TARGET | LOOKUP_CREATE;
	bool should_retry = false;
	int error = -EINVAL;

	if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE | RENAME_WHITEOUT))
		goto put_names;

	/* RENAME_EXCHANGE is incompatible with NOREPLACE and WHITEOUT */
	if ((flags & (RENAME_NOREPLACE | RENAME_WHITEOUT)) &&
	    (flags & RENAME_EXCHANGE))
		goto put_names;

	/* exchange never creates the target, so drop create semantics */
	if (flags & RENAME_EXCHANGE)
		target_flags = 0;
	if (flags & RENAME_NOREPLACE)
		target_flags |= LOOKUP_EXCL;

retry:
	error = filename_parentat(olddfd, from, lookup_flags, &old_path,
				  &old_last, &old_type);
	if (error)
		goto put_names;

	error = filename_parentat(newdfd, to, lookup_flags, &new_path, &new_last,
				  &new_type);
	if (error)
		goto exit1;

	/* rename may not cross mounts */
	error = -EXDEV;
	if (old_path.mnt != new_path.mnt)
		goto exit2;

	/* source's last component must be a normal name */
	error = -EBUSY;
	if (old_type != LAST_NORM)
		goto exit2;

	if (flags & RENAME_NOREPLACE)
		error = -EEXIST;
	if (new_type != LAST_NORM)
		goto exit2;

	error = mnt_want_write(old_path.mnt);
	if (error)
		goto exit2;

retry_deleg:
	trap = lock_rename(new_path.dentry, old_path.dentry);
	if (IS_ERR(trap)) {
		error = PTR_ERR(trap);
		goto exit_lock_rename;
	}

	old_dentry = lookup_one_qstr_excl(&old_last, old_path.dentry,
					  lookup_flags);
	error = PTR_ERR(old_dentry);
	if (IS_ERR(old_dentry))
		goto exit3;
	new_dentry = lookup_one_qstr_excl(&new_last, new_path.dentry,
					  lookup_flags | target_flags);
	error = PTR_ERR(new_dentry);
	if (IS_ERR(new_dentry))
		goto exit4;
	if (flags & RENAME_EXCHANGE) {
		if (!d_is_dir(new_dentry)) {
			error = -ENOTDIR;
			if (new_last.name[new_last.len])
				goto exit5;
		}
	}
	/* unless the source is a directory trailing slashes give -ENOTDIR */
	if (!d_is_dir(old_dentry)) {
		error = -ENOTDIR;
		if (old_last.name[old_last.len])
			goto exit5;
		if (!(flags & RENAME_EXCHANGE) && new_last.name[new_last.len])
			goto exit5;
	}
	/* source should not be ancestor of target */
	error = -EINVAL;
	if (old_dentry == trap)
		goto exit5;
	/* target should not be an ancestor of source */
	if (!(flags & RENAME_EXCHANGE))
		error = -ENOTEMPTY;
	if (new_dentry == trap)
		goto exit5;

	error = security_path_rename(&old_path, old_dentry,
				     &new_path, new_dentry, flags);
	if (error)
		goto exit5;

	rd.old_dir	   = old_path.dentry->d_inode;
	rd.old_dentry	   = old_dentry;
	rd.old_mnt_idmap   = mnt_idmap(old_path.mnt);
	rd.new_dir	   = new_path.dentry->d_inode;
	rd.new_dentry	   = new_dentry;
	rd.new_mnt_idmap   = mnt_idmap(new_path.mnt);
	rd.delegated_inode = &delegated_inode;
	rd.flags	   = flags;
	error = vfs_rename(&rd);
exit5:
	dput(new_dentry);
exit4:
	dput(old_dentry);
exit3:
	unlock_rename(new_path.dentry, old_path.dentry);
exit_lock_rename:
	if (delegated_inode) {
		/* wait for the delegation break, then redo both lookups */
		error = break_deleg_wait(&delegated_inode);
		if (!error)
			goto retry_deleg;
	}
	mnt_drop_write(old_path.mnt);
exit2:
	if (retry_estale(error, lookup_flags))
		should_retry = true;
	path_put(&new_path);
exit1:
	path_put(&old_path);
	if (should_retry) {
		should_retry = false;
		lookup_flags |= LOOKUP_REVAL;
		goto retry;
	}
put_names:
	putname(from);
	putname(to);
	return error;
}
5265  
/* renameat2(2) */
SYSCALL_DEFINE5(renameat2, int, olddfd, const char __user *, oldname,
		int, newdfd, const char __user *, newname, unsigned int, flags)
{
	return do_renameat2(olddfd, getname(oldname), newdfd, getname(newname),
				flags);
}
5272  
/* renameat(2): renameat2() without flags */
SYSCALL_DEFINE4(renameat, int, olddfd, const char __user *, oldname,
		int, newdfd, const char __user *, newname)
{
	return do_renameat2(olddfd, getname(oldname), newdfd, getname(newname),
				0);
}
5279  
/* rename(2): renameat2() without flags, relative to the cwd */
SYSCALL_DEFINE2(rename, const char __user *, oldname, const char __user *, newname)
{
	return do_renameat2(AT_FDCWD, getname(oldname), AT_FDCWD,
				getname(newname), 0);
}
5285  
/*
 * Copy a symlink body of @linklen bytes to userspace, clamped to @buflen.
 * Returns the number of bytes copied, or -EFAULT on a faulting copy.
 */
int readlink_copy(char __user *buffer, int buflen, const char *link, int linklen)
{
	int out = linklen;

	/* clamp to the user buffer (unsigned compare, as before) */
	if (unlikely(out > (unsigned) buflen))
		out = buflen;
	if (copy_to_user(buffer, link, out))
		return -EFAULT;
	return out;
}
5297  
5298  /**
5299   * vfs_readlink - copy symlink body into userspace buffer
5300   * @dentry: dentry on which to get symbolic link
5301   * @buffer: user memory pointer
5302   * @buflen: size of buffer
5303   *
5304   * Does not touch atime.  That's up to the caller if necessary
5305   *
5306   * Does not call security hook.
5307   */
int vfs_readlink(struct dentry *dentry, char __user *buffer, int buflen)
{
	struct inode *inode = d_inode(dentry);
	DEFINE_DELAYED_CALL(done);
	const char *link;
	int res;

	/* fast path: the link body is cached in inode->i_link */
	if (inode->i_opflags & IOP_CACHED_LINK)
		return readlink_copy(buffer, buflen, inode->i_link, inode->i_linklen);

	if (unlikely(!(inode->i_opflags & IOP_DEFAULT_READLINK))) {
		/* a filesystem-provided ->readlink() takes over entirely */
		if (unlikely(inode->i_op->readlink))
			return inode->i_op->readlink(dentry, buffer, buflen);

		if (!d_is_symlink(dentry))
			return -EINVAL;

		/* remember that this inode uses the default implementation */
		spin_lock(&inode->i_lock);
		inode->i_opflags |= IOP_DEFAULT_READLINK;
		spin_unlock(&inode->i_lock);
	}

	link = READ_ONCE(inode->i_link);
	if (!link) {
		/* no inline body: ask ->get_link(); @done releases it later */
		link = inode->i_op->get_link(dentry, inode, &done);
		if (IS_ERR(link))
			return PTR_ERR(link);
	}
	res = readlink_copy(buffer, buflen, link, strlen(link));
	do_delayed_call(&done);
	return res;
}
EXPORT_SYMBOL(vfs_readlink);
5341  
5342  /**
5343   * vfs_get_link - get symlink body
5344   * @dentry: dentry on which to get symbolic link
5345   * @done: caller needs to free returned data with this
5346   *
5347   * Calls security hook and i_op->get_link() on the supplied inode.
5348   *
5349   * It does not touch atime.  That's up to the caller if necessary.
5350   *
5351   * Does not work on "special" symlinks like /proc/$$/fd/N
5352   */
vfs_get_link(struct dentry * dentry,struct delayed_call * done)5353  const char *vfs_get_link(struct dentry *dentry, struct delayed_call *done)
5354  {
5355  	const char *res = ERR_PTR(-EINVAL);
5356  	struct inode *inode = d_inode(dentry);
5357  
5358  	if (d_is_symlink(dentry)) {
5359  		res = ERR_PTR(security_inode_readlink(dentry));
5360  		if (!res)
5361  			res = inode->i_op->get_link(dentry, inode, done);
5362  	}
5363  	return res;
5364  }
5365  EXPORT_SYMBOL(vfs_get_link);
5366  
5367  /* get the link contents into pagecache */
__page_get_link(struct dentry * dentry,struct inode * inode,struct delayed_call * callback)5368  static char *__page_get_link(struct dentry *dentry, struct inode *inode,
5369  			     struct delayed_call *callback)
5370  {
5371  	struct page *page;
5372  	struct address_space *mapping = inode->i_mapping;
5373  
5374  	if (!dentry) {
5375  		page = find_get_page(mapping, 0);
5376  		if (!page)
5377  			return ERR_PTR(-ECHILD);
5378  		if (!PageUptodate(page)) {
5379  			put_page(page);
5380  			return ERR_PTR(-ECHILD);
5381  		}
5382  	} else {
5383  		page = read_mapping_page(mapping, 0, NULL);
5384  		if (IS_ERR(page))
5385  			return (char*)page;
5386  	}
5387  	set_delayed_call(callback, page_put_link, page);
5388  	BUG_ON(mapping_gfp_mask(mapping) & __GFP_HIGHMEM);
5389  	return page_address(page);
5390  }
5391  
/*
 * Like page_get_link(), but without NUL-terminating the body -
 * callers get the raw page contents.
 */
const char *page_get_link_raw(struct dentry *dentry, struct inode *inode,
			      struct delayed_call *callback)
{
	return __page_get_link(dentry, inode, callback);
}
EXPORT_SYMBOL_GPL(page_get_link_raw);
5398  
page_get_link(struct dentry * dentry,struct inode * inode,struct delayed_call * callback)5399  const char *page_get_link(struct dentry *dentry, struct inode *inode,
5400  					struct delayed_call *callback)
5401  {
5402  	char *kaddr = __page_get_link(dentry, inode, callback);
5403  
5404  	if (!IS_ERR(kaddr))
5405  		nd_terminate_link(kaddr, inode->i_size, PAGE_SIZE - 1);
5406  	return kaddr;
5407  }
5408  
5409  EXPORT_SYMBOL(page_get_link);
5410  
/* delayed_call callback: drop the page reference taken by __page_get_link() */
void page_put_link(void *arg)
{
	put_page(arg);
}
EXPORT_SYMBOL(page_put_link);
5416  
/*
 * ->readlink() helper for pagecache-backed symlinks: fetch the body via
 * page_get_link() and copy it out to userspace.
 */
int page_readlink(struct dentry *dentry, char __user *buffer, int buflen)
{
	DEFINE_DELAYED_CALL(done);
	const char *body;
	int ret;

	body = page_get_link(dentry, d_inode(dentry), &done);
	if (IS_ERR(body))
		ret = PTR_ERR(body);
	else
		ret = readlink_copy(buffer, buflen, body, strlen(body));
	do_delayed_call(&done);
	return ret;
}
EXPORT_SYMBOL(page_readlink);
5431  
int page_symlink(struct inode *inode, const char *symname, int len)
{
	struct address_space *mapping = inode->i_mapping;
	const struct address_space_operations *aops = mapping->a_ops;
	bool nofs = !mapping_gfp_constraint(mapping, __GFP_FS);
	struct folio *folio;
	void *fsdata = NULL;
	int err;
	unsigned int flags;

retry:
	/* honour a mapping whose gfp mask forbids __GFP_FS allocations */
	if (nofs)
		flags = memalloc_nofs_save();
	err = aops->write_begin(NULL, mapping, 0, len-1, &folio, &fsdata);
	if (nofs)
		memalloc_nofs_restore(flags);
	if (err)
		goto fail;

	/* only len - 1 bytes of @symname are stored (terminator excluded) */
	memcpy(folio_address(folio), symname, len - 1);

	err = aops->write_end(NULL, mapping, 0, len - 1, len - 1,
						folio, fsdata);
	if (err < 0)
		goto fail;
	if (err < len-1)
		goto retry;	/* short write: start over from write_begin */

	mark_inode_dirty(inode);
	return 0;
fail:
	return err;
}
EXPORT_SYMBOL(page_symlink);
5466  
/* default inode_operations for pagecache-backed symlinks */
const struct inode_operations page_symlink_inode_operations = {
	.get_link	= page_get_link,
};
EXPORT_SYMBOL(page_symlink_inode_operations);
5471