xref: /linux/fs/namei.c (revision a5a64498c194c82ecad3a2d67cff6231cda8d3dd)
1 /*
2  *  linux/fs/namei.c
3  *
4  *  Copyright (C) 1991, 1992  Linus Torvalds
5  */
6 
7 /*
8  * Some corrections by tytso.
9  */
10 
11 /* [Feb 1997 T. Schoebel-Theuer] Complete rewrite of the pathname
12  * lookup logic.
13  */
14 /* [Feb-Apr 2000, AV] Rewrite to the new namespace architecture.
15  */
16 
17 #include <linux/init.h>
18 #include <linux/module.h>
19 #include <linux/slab.h>
20 #include <linux/fs.h>
21 #include <linux/namei.h>
22 #include <linux/quotaops.h>
23 #include <linux/pagemap.h>
24 #include <linux/fsnotify.h>
25 #include <linux/personality.h>
26 #include <linux/security.h>
27 #include <linux/syscalls.h>
28 #include <linux/mount.h>
29 #include <linux/audit.h>
30 #include <linux/capability.h>
31 #include <linux/file.h>
32 #include <linux/fcntl.h>
33 #include <linux/device_cgroup.h>
34 #include <asm/uaccess.h>
35 
/*
 * ACC_MODE(x): use the access-mode bits of x (x & O_ACCMODE) as an index
 * into the string literal below, whose first four bytes are octal
 * 0, 4, 2 and 6 (in that order).
 * NOTE(review): 4 / 2 / 6 presumably stand for MAY_READ, MAY_WRITE and
 * MAY_READ|MAY_WRITE -- confirm against the users of this macro.
 */
#define ACC_MODE(x) ("\000\004\002\006"[(x)&O_ACCMODE])
37 
38 /* [Feb-1997 T. Schoebel-Theuer]
39  * Fundamental changes in the pathname lookup mechanisms (namei)
40  * were necessary because of omirr.  The reason is that omirr needs
41  * to know the _real_ pathname, not the user-supplied one, in case
42  * of symlinks (and also when transname replacements occur).
43  *
44  * The new code replaces the old recursive symlink resolution with
45  * an iterative one (in case of non-nested symlink chains).  It does
46  * this with calls to <fs>_follow_link().
47  * As a side effect, dir_namei(), _namei() and follow_link() are now
48  * replaced with a single function lookup_dentry() that can handle all
49  * the special cases of the former code.
50  *
51  * With the new dcache, the pathname is stored at each inode, at least as
52  * long as the refcount of the inode is positive.  As a side effect, the
53  * size of the dcache depends on the inode cache and thus is dynamic.
54  *
55  * [29-Apr-1998 C. Scott Ananian] Updated above description of symlink
56  * resolution to correspond with current state of the code.
57  *
58  * Note that the symlink resolution is not *completely* iterative.
59  * There is still a significant amount of tail- and mid- recursion in
60  * the algorithm.  Also, note that <fs>_readlink() is not used in
61  * lookup_dentry(): lookup_dentry() on the result of <fs>_readlink()
62  * may return different results than <fs>_follow_link().  Many virtual
63  * filesystems (including /proc) exhibit this behavior.
64  */
65 
66 /* [24-Feb-97 T. Schoebel-Theuer] Side effects caused by new implementation:
67  * New symlink semantics: when open() is called with flags O_CREAT | O_EXCL
68  * and the name already exists in form of a symlink, try to create the new
69  * name indicated by the symlink. The old code always complained that the
70  * name already exists, due to not following the symlink even if its target
71  * is nonexistent.  The new semantics affects also mknod() and link() when
 * the name is a symlink pointing to a non-existent name.
73  *
74  * I don't know which semantics is the right one, since I have no access
75  * to standards. But I found by trial that HP-UX 9.0 has the full "new"
76  * semantics implemented, while SunOS 4.1.1 and Solaris (SunOS 5.4) have the
77  * "old" one. Personally, I think the new semantics is much more logical.
78  * Note that "ln old new" where "new" is a symlink pointing to a non-existing
 * file does succeed in both HP-UX and SunOS, but not in Solaris
80  * and in the old Linux semantics.
81  */
82 
83 /* [16-Dec-97 Kevin Buhr] For security reasons, we change some symlink
84  * semantics.  See the comments in "open_namei" and "do_link" below.
85  *
86  * [10-Sep-98 Alan Modra] Another symlink change.
87  */
88 
89 /* [Feb-Apr 2000 AV] Complete rewrite. Rules for symlinks:
90  *	inside the path - always follow.
91  *	in the last component in creation/removal/renaming - never follow.
92  *	if LOOKUP_FOLLOW passed - follow.
93  *	if the pathname has trailing slashes - follow.
94  *	otherwise - don't follow.
95  * (applied in that order).
96  *
97  * [Jun 2000 AV] Inconsistent behaviour of open() in case if flags==O_CREAT
98  * restored for 2.4. This is the last surviving part of old 4.2BSD bug.
99  * During the 2.4 we need to fix the userland stuff depending on it -
100  * hopefully we will be able to get rid of that wart in 2.5. So far only
101  * XEmacs seems to be relying on it...
102  */
103 /*
104  * [Sep 2001 AV] Single-semaphore locking scheme (kudos to David Holland)
105  * implemented.  Let's see if raised priority of ->s_vfs_rename_mutex gives
106  * any extra contention...
107  */
108 
109 static int __link_path_walk(const char *name, struct nameidata *nd);
110 
111 /* In order to reduce some races, while at the same time doing additional
112  * checking and hopefully speeding things up, we copy filenames to the
113  * kernel data space before using them..
114  *
115  * POSIX.1 2.4: an empty pathname is invalid (ENOENT).
116  * PATH_MAX includes the nul terminator --RR.
117  */
118 static int do_getname(const char __user *filename, char *page)
119 {
120 	int retval;
121 	unsigned long len = PATH_MAX;
122 
123 	if (!segment_eq(get_fs(), KERNEL_DS)) {
124 		if ((unsigned long) filename >= TASK_SIZE)
125 			return -EFAULT;
126 		if (TASK_SIZE - (unsigned long) filename < PATH_MAX)
127 			len = TASK_SIZE - (unsigned long) filename;
128 	}
129 
130 	retval = strncpy_from_user(page, filename, len);
131 	if (retval > 0) {
132 		if (retval < len)
133 			return 0;
134 		return -ENAMETOOLONG;
135 	} else if (!retval)
136 		retval = -ENOENT;
137 	return retval;
138 }
139 
140 char * getname(const char __user * filename)
141 {
142 	char *tmp, *result;
143 
144 	result = ERR_PTR(-ENOMEM);
145 	tmp = __getname();
146 	if (tmp)  {
147 		int retval = do_getname(filename, tmp);
148 
149 		result = tmp;
150 		if (retval < 0) {
151 			__putname(tmp);
152 			result = ERR_PTR(retval);
153 		}
154 	}
155 	audit_getname(result);
156 	return result;
157 }
158 
#ifdef CONFIG_AUDITSYSCALL
/*
 * Release @name: it is passed to audit_putname() when
 * audit_dummy_context() is false, and released directly with __putname()
 * otherwise.
 */
void putname(const char *name)
{
	if (unlikely(!audit_dummy_context())) {
		audit_putname(name);
		return;
	}
	__putname(name);
}
EXPORT_SYMBOL(putname);
#endif
169 
170 
171 /**
172  * generic_permission  -  check for access rights on a Posix-like filesystem
173  * @inode:	inode to check access rights for
174  * @mask:	right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC)
175  * @check_acl:	optional callback to check for Posix ACLs
176  *
177  * Used to check for read/write/execute permissions on a file.
178  * We use "fsuid" for this, letting us set arbitrary permissions
179  * for filesystem access without changing the "normal" uids which
180  * are used for other things..
181  */
182 int generic_permission(struct inode *inode, int mask,
183 		int (*check_acl)(struct inode *inode, int mask))
184 {
185 	umode_t			mode = inode->i_mode;
186 
187 	mask &= MAY_READ | MAY_WRITE | MAY_EXEC;
188 
189 	if (current->fsuid == inode->i_uid)
190 		mode >>= 6;
191 	else {
192 		if (IS_POSIXACL(inode) && (mode & S_IRWXG) && check_acl) {
193 			int error = check_acl(inode, mask);
194 			if (error == -EACCES)
195 				goto check_capabilities;
196 			else if (error != -EAGAIN)
197 				return error;
198 		}
199 
200 		if (in_group_p(inode->i_gid))
201 			mode >>= 3;
202 	}
203 
204 	/*
205 	 * If the DACs are ok we don't need any capability check.
206 	 */
207 	if ((mask & ~mode) == 0)
208 		return 0;
209 
210  check_capabilities:
211 	/*
212 	 * Read/write DACs are always overridable.
213 	 * Executable DACs are overridable if at least one exec bit is set.
214 	 */
215 	if (!(mask & MAY_EXEC) || execute_ok(inode))
216 		if (capable(CAP_DAC_OVERRIDE))
217 			return 0;
218 
219 	/*
220 	 * Searching includes executable on directories, else just read.
221 	 */
222 	if (mask == MAY_READ || (S_ISDIR(inode->i_mode) && !(mask & MAY_WRITE)))
223 		if (capable(CAP_DAC_READ_SEARCH))
224 			return 0;
225 
226 	return -EACCES;
227 }
228 
229 int inode_permission(struct inode *inode, int mask)
230 {
231 	int retval;
232 
233 	if (mask & MAY_WRITE) {
234 		umode_t mode = inode->i_mode;
235 
236 		/*
237 		 * Nobody gets write access to a read-only fs.
238 		 */
239 		if (IS_RDONLY(inode) &&
240 		    (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode)))
241 			return -EROFS;
242 
243 		/*
244 		 * Nobody gets write access to an immutable file.
245 		 */
246 		if (IS_IMMUTABLE(inode))
247 			return -EACCES;
248 	}
249 
250 	/* Ordinary permission routines do not understand MAY_APPEND. */
251 	if (inode->i_op && inode->i_op->permission)
252 		retval = inode->i_op->permission(inode, mask);
253 	else
254 		retval = generic_permission(inode, mask, NULL);
255 
256 	if (retval)
257 		return retval;
258 
259 	retval = devcgroup_inode_permission(inode, mask);
260 	if (retval)
261 		return retval;
262 
263 	return security_inode_permission(inode,
264 			mask & (MAY_READ|MAY_WRITE|MAY_EXEC|MAY_APPEND));
265 }
266 
267 /**
268  * vfs_permission  -  check for access rights to a given path
269  * @nd:		lookup result that describes the path
270  * @mask:	right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC)
271  *
272  * Used to check for read/write/execute permissions on a path.
273  * We use "fsuid" for this, letting us set arbitrary permissions
274  * for filesystem access without changing the "normal" uids which
275  * are used for other things.
276  */
277 int vfs_permission(struct nameidata *nd, int mask)
278 {
279 	return inode_permission(nd->path.dentry->d_inode, mask);
280 }
281 
282 /**
283  * file_permission  -  check for additional access rights to a given file
284  * @file:	file to check access rights for
285  * @mask:	right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC)
286  *
287  * Used to check for read/write/execute permissions on an already opened
288  * file.
289  *
290  * Note:
291  *	Do not use this function in new code.  All access checks should
292  *	be done using vfs_permission().
293  */
294 int file_permission(struct file *file, int mask)
295 {
296 	return inode_permission(file->f_path.dentry->d_inode, mask);
297 }
298 
299 /*
300  * get_write_access() gets write permission for a file.
301  * put_write_access() releases this write permission.
302  * This is used for regular files.
303  * We cannot support write (and maybe mmap read-write shared) accesses and
304  * MAP_DENYWRITE mmappings simultaneously. The i_writecount field of an inode
305  * can have the following values:
306  * 0: no writers, no VM_DENYWRITE mappings
307  * < 0: (-i_writecount) vm_area_structs with VM_DENYWRITE set exist
308  * > 0: (i_writecount) users are writing to the file.
309  *
310  * Normally we operate on that counter with atomic_{inc,dec} and it's safe
311  * except for the cases where we don't hold i_writecount yet. Then we need to
312  * use {get,deny}_write_access() - these functions check the sign and refuse
313  * to do the change if sign is wrong. Exclusion between them is provided by
314  * the inode->i_lock spinlock.
315  */
316 
317 int get_write_access(struct inode * inode)
318 {
319 	spin_lock(&inode->i_lock);
320 	if (atomic_read(&inode->i_writecount) < 0) {
321 		spin_unlock(&inode->i_lock);
322 		return -ETXTBSY;
323 	}
324 	atomic_inc(&inode->i_writecount);
325 	spin_unlock(&inode->i_lock);
326 
327 	return 0;
328 }
329 
330 int deny_write_access(struct file * file)
331 {
332 	struct inode *inode = file->f_path.dentry->d_inode;
333 
334 	spin_lock(&inode->i_lock);
335 	if (atomic_read(&inode->i_writecount) > 0) {
336 		spin_unlock(&inode->i_lock);
337 		return -ETXTBSY;
338 	}
339 	atomic_dec(&inode->i_writecount);
340 	spin_unlock(&inode->i_lock);
341 
342 	return 0;
343 }
344 
345 /**
346  * path_get - get a reference to a path
347  * @path: path to get the reference to
348  *
349  * Given a path increment the reference count to the dentry and the vfsmount.
350  */
351 void path_get(struct path *path)
352 {
353 	mntget(path->mnt);
354 	dget(path->dentry);
355 }
356 EXPORT_SYMBOL(path_get);
357 
358 /**
359  * path_put - put a reference to a path
360  * @path: path to put the reference to
361  *
362  * Given a path decrement the reference count to the dentry and the vfsmount.
363  */
364 void path_put(struct path *path)
365 {
366 	dput(path->dentry);
367 	mntput(path->mnt);
368 }
369 EXPORT_SYMBOL(path_put);
370 
371 /**
372  * release_open_intent - free up open intent resources
373  * @nd: pointer to nameidata
374  */
375 void release_open_intent(struct nameidata *nd)
376 {
377 	if (nd->intent.open.file->f_path.dentry == NULL)
378 		put_filp(nd->intent.open.file);
379 	else
380 		fput(nd->intent.open.file);
381 }
382 
383 static inline struct dentry *
384 do_revalidate(struct dentry *dentry, struct nameidata *nd)
385 {
386 	int status = dentry->d_op->d_revalidate(dentry, nd);
387 	if (unlikely(status <= 0)) {
388 		/*
389 		 * The dentry failed validation.
390 		 * If d_revalidate returned 0 attempt to invalidate
391 		 * the dentry otherwise d_revalidate is asking us
392 		 * to return a fail status.
393 		 */
394 		if (!status) {
395 			if (!d_invalidate(dentry)) {
396 				dput(dentry);
397 				dentry = NULL;
398 			}
399 		} else {
400 			dput(dentry);
401 			dentry = ERR_PTR(status);
402 		}
403 	}
404 	return dentry;
405 }
406 
407 /*
408  * Internal lookup() using the new generic dcache.
409  * SMP-safe
410  */
411 static struct dentry * cached_lookup(struct dentry * parent, struct qstr * name, struct nameidata *nd)
412 {
413 	struct dentry * dentry = __d_lookup(parent, name);
414 
415 	/* lockess __d_lookup may fail due to concurrent d_move()
416 	 * in some unrelated directory, so try with d_lookup
417 	 */
418 	if (!dentry)
419 		dentry = d_lookup(parent, name);
420 
421 	if (dentry && dentry->d_op && dentry->d_op->d_revalidate)
422 		dentry = do_revalidate(dentry, nd);
423 
424 	return dentry;
425 }
426 
427 /*
428  * Short-cut version of permission(), for calling by
429  * path_walk(), when dcache lock is held.  Combines parts
430  * of permission() and generic_permission(), and tests ONLY for
431  * MAY_EXEC permission.
432  *
433  * If appropriate, check DAC only.  If not appropriate, or
434  * short-cut DAC fails, then call permission() to do more
435  * complete permission check.
436  */
437 static int exec_permission_lite(struct inode *inode)
438 {
439 	umode_t	mode = inode->i_mode;
440 
441 	if (inode->i_op && inode->i_op->permission)
442 		return -EAGAIN;
443 
444 	if (current->fsuid == inode->i_uid)
445 		mode >>= 6;
446 	else if (in_group_p(inode->i_gid))
447 		mode >>= 3;
448 
449 	if (mode & MAY_EXEC)
450 		goto ok;
451 
452 	if ((inode->i_mode & S_IXUGO) && capable(CAP_DAC_OVERRIDE))
453 		goto ok;
454 
455 	if (S_ISDIR(inode->i_mode) && capable(CAP_DAC_OVERRIDE))
456 		goto ok;
457 
458 	if (S_ISDIR(inode->i_mode) && capable(CAP_DAC_READ_SEARCH))
459 		goto ok;
460 
461 	return -EACCES;
462 ok:
463 	return security_inode_permission(inode, MAY_EXEC);
464 }
465 
466 /*
467  * This is called when everything else fails, and we actually have
468  * to go to the low-level filesystem to find out what we should do..
469  *
470  * We get the directory semaphore, and after getting that we also
471  * make sure that nobody added the entry to the dcache in the meantime..
472  * SMP-safe
473  */
474 static struct dentry * real_lookup(struct dentry * parent, struct qstr * name, struct nameidata *nd)
475 {
476 	struct dentry * result;
477 	struct inode *dir = parent->d_inode;
478 
479 	mutex_lock(&dir->i_mutex);
480 	/*
481 	 * First re-do the cached lookup just in case it was created
482 	 * while we waited for the directory semaphore..
483 	 *
484 	 * FIXME! This could use version numbering or similar to
485 	 * avoid unnecessary cache lookups.
486 	 *
487 	 * The "dcache_lock" is purely to protect the RCU list walker
488 	 * from concurrent renames at this point (we mustn't get false
489 	 * negatives from the RCU list walk here, unlike the optimistic
490 	 * fast walk).
491 	 *
492 	 * so doing d_lookup() (with seqlock), instead of lockfree __d_lookup
493 	 */
494 	result = d_lookup(parent, name);
495 	if (!result) {
496 		struct dentry *dentry;
497 
498 		/* Don't create child dentry for a dead directory. */
499 		result = ERR_PTR(-ENOENT);
500 		if (IS_DEADDIR(dir))
501 			goto out_unlock;
502 
503 		dentry = d_alloc(parent, name);
504 		result = ERR_PTR(-ENOMEM);
505 		if (dentry) {
506 			result = dir->i_op->lookup(dir, dentry, nd);
507 			if (result)
508 				dput(dentry);
509 			else
510 				result = dentry;
511 		}
512 out_unlock:
513 		mutex_unlock(&dir->i_mutex);
514 		return result;
515 	}
516 
517 	/*
518 	 * Uhhuh! Nasty case: the cache was re-populated while
519 	 * we waited on the semaphore. Need to revalidate.
520 	 */
521 	mutex_unlock(&dir->i_mutex);
522 	if (result->d_op && result->d_op->d_revalidate) {
523 		result = do_revalidate(result, nd);
524 		if (!result)
525 			result = ERR_PTR(-ENOENT);
526 	}
527 	return result;
528 }
529 
530 /* SMP-safe */
531 static __always_inline void
532 walk_init_root(const char *name, struct nameidata *nd)
533 {
534 	struct fs_struct *fs = current->fs;
535 
536 	read_lock(&fs->lock);
537 	nd->path = fs->root;
538 	path_get(&fs->root);
539 	read_unlock(&fs->lock);
540 }
541 
542 /*
543  * Wrapper to retry pathname resolution whenever the underlying
544  * file system returns an ESTALE.
545  *
546  * Retry the whole path once, forcing real lookup requests
547  * instead of relying on the dcache.
548  */
549 static __always_inline int link_path_walk(const char *name, struct nameidata *nd)
550 {
551 	struct path save = nd->path;
552 	int result;
553 
554 	/* make sure the stuff we saved doesn't go away */
555 	path_get(&save);
556 
557 	result = __link_path_walk(name, nd);
558 	if (result == -ESTALE) {
559 		/* nd->path had been dropped */
560 		nd->path = save;
561 		path_get(&nd->path);
562 		nd->flags |= LOOKUP_REVAL;
563 		result = __link_path_walk(name, nd);
564 	}
565 
566 	path_put(&save);
567 
568 	return result;
569 }
570 
571 static __always_inline int __vfs_follow_link(struct nameidata *nd, const char *link)
572 {
573 	int res = 0;
574 	char *name;
575 	if (IS_ERR(link))
576 		goto fail;
577 
578 	if (*link == '/') {
579 		path_put(&nd->path);
580 		walk_init_root(link, nd);
581 	}
582 	res = link_path_walk(link, nd);
583 	if (nd->depth || res || nd->last_type!=LAST_NORM)
584 		return res;
585 	/*
586 	 * If it is an iterative symlinks resolution in open_namei() we
587 	 * have to copy the last component. And all that crap because of
588 	 * bloody create() on broken symlinks. Furrfu...
589 	 */
590 	name = __getname();
591 	if (unlikely(!name)) {
592 		path_put(&nd->path);
593 		return -ENOMEM;
594 	}
595 	strcpy(name, nd->last.name);
596 	nd->last.name = name;
597 	return 0;
598 fail:
599 	path_put(&nd->path);
600 	return PTR_ERR(link);
601 }
602 
603 static void path_put_conditional(struct path *path, struct nameidata *nd)
604 {
605 	dput(path->dentry);
606 	if (path->mnt != nd->path.mnt)
607 		mntput(path->mnt);
608 }
609 
610 static inline void path_to_nameidata(struct path *path, struct nameidata *nd)
611 {
612 	dput(nd->path.dentry);
613 	if (nd->path.mnt != path->mnt)
614 		mntput(nd->path.mnt);
615 	nd->path.mnt = path->mnt;
616 	nd->path.dentry = path->dentry;
617 }
618 
619 static __always_inline int __do_follow_link(struct path *path, struct nameidata *nd)
620 {
621 	int error;
622 	void *cookie;
623 	struct dentry *dentry = path->dentry;
624 
625 	touch_atime(path->mnt, dentry);
626 	nd_set_link(nd, NULL);
627 
628 	if (path->mnt != nd->path.mnt) {
629 		path_to_nameidata(path, nd);
630 		dget(dentry);
631 	}
632 	mntget(path->mnt);
633 	cookie = dentry->d_inode->i_op->follow_link(dentry, nd);
634 	error = PTR_ERR(cookie);
635 	if (!IS_ERR(cookie)) {
636 		char *s = nd_get_link(nd);
637 		error = 0;
638 		if (s)
639 			error = __vfs_follow_link(nd, s);
640 		if (dentry->d_inode->i_op->put_link)
641 			dentry->d_inode->i_op->put_link(dentry, nd, cookie);
642 	}
643 	path_put(path);
644 
645 	return error;
646 }
647 
648 /*
649  * This limits recursive symlink follows to 8, while
650  * limiting consecutive symlinks to 40.
651  *
652  * Without that kind of total limit, nasty chains of consecutive
653  * symlinks can cause almost arbitrarily long lookups.
654  */
655 static inline int do_follow_link(struct path *path, struct nameidata *nd)
656 {
657 	int err = -ELOOP;
658 	if (current->link_count >= MAX_NESTED_LINKS)
659 		goto loop;
660 	if (current->total_link_count >= 40)
661 		goto loop;
662 	BUG_ON(nd->depth >= MAX_NESTED_LINKS);
663 	cond_resched();
664 	err = security_inode_follow_link(path->dentry, nd);
665 	if (err)
666 		goto loop;
667 	current->link_count++;
668 	current->total_link_count++;
669 	nd->depth++;
670 	err = __do_follow_link(path, nd);
671 	current->link_count--;
672 	nd->depth--;
673 	return err;
674 loop:
675 	path_put_conditional(path, nd);
676 	path_put(&nd->path);
677 	return err;
678 }
679 
680 int follow_up(struct vfsmount **mnt, struct dentry **dentry)
681 {
682 	struct vfsmount *parent;
683 	struct dentry *mountpoint;
684 	spin_lock(&vfsmount_lock);
685 	parent=(*mnt)->mnt_parent;
686 	if (parent == *mnt) {
687 		spin_unlock(&vfsmount_lock);
688 		return 0;
689 	}
690 	mntget(parent);
691 	mountpoint=dget((*mnt)->mnt_mountpoint);
692 	spin_unlock(&vfsmount_lock);
693 	dput(*dentry);
694 	*dentry = mountpoint;
695 	mntput(*mnt);
696 	*mnt = parent;
697 	return 1;
698 }
699 
700 /* no need for dcache_lock, as serialization is taken care in
701  * namespace.c
702  */
703 static int __follow_mount(struct path *path)
704 {
705 	int res = 0;
706 	while (d_mountpoint(path->dentry)) {
707 		struct vfsmount *mounted = lookup_mnt(path->mnt, path->dentry);
708 		if (!mounted)
709 			break;
710 		dput(path->dentry);
711 		if (res)
712 			mntput(path->mnt);
713 		path->mnt = mounted;
714 		path->dentry = dget(mounted->mnt_root);
715 		res = 1;
716 	}
717 	return res;
718 }
719 
720 static void follow_mount(struct vfsmount **mnt, struct dentry **dentry)
721 {
722 	while (d_mountpoint(*dentry)) {
723 		struct vfsmount *mounted = lookup_mnt(*mnt, *dentry);
724 		if (!mounted)
725 			break;
726 		dput(*dentry);
727 		mntput(*mnt);
728 		*mnt = mounted;
729 		*dentry = dget(mounted->mnt_root);
730 	}
731 }
732 
733 /* no need for dcache_lock, as serialization is taken care in
734  * namespace.c
735  */
736 int follow_down(struct vfsmount **mnt, struct dentry **dentry)
737 {
738 	struct vfsmount *mounted;
739 
740 	mounted = lookup_mnt(*mnt, *dentry);
741 	if (mounted) {
742 		dput(*dentry);
743 		mntput(*mnt);
744 		*mnt = mounted;
745 		*dentry = dget(mounted->mnt_root);
746 		return 1;
747 	}
748 	return 0;
749 }
750 
751 static __always_inline void follow_dotdot(struct nameidata *nd)
752 {
753 	struct fs_struct *fs = current->fs;
754 
755 	while(1) {
756 		struct vfsmount *parent;
757 		struct dentry *old = nd->path.dentry;
758 
759                 read_lock(&fs->lock);
760 		if (nd->path.dentry == fs->root.dentry &&
761 		    nd->path.mnt == fs->root.mnt) {
762                         read_unlock(&fs->lock);
763 			break;
764 		}
765                 read_unlock(&fs->lock);
766 		spin_lock(&dcache_lock);
767 		if (nd->path.dentry != nd->path.mnt->mnt_root) {
768 			nd->path.dentry = dget(nd->path.dentry->d_parent);
769 			spin_unlock(&dcache_lock);
770 			dput(old);
771 			break;
772 		}
773 		spin_unlock(&dcache_lock);
774 		spin_lock(&vfsmount_lock);
775 		parent = nd->path.mnt->mnt_parent;
776 		if (parent == nd->path.mnt) {
777 			spin_unlock(&vfsmount_lock);
778 			break;
779 		}
780 		mntget(parent);
781 		nd->path.dentry = dget(nd->path.mnt->mnt_mountpoint);
782 		spin_unlock(&vfsmount_lock);
783 		dput(old);
784 		mntput(nd->path.mnt);
785 		nd->path.mnt = parent;
786 	}
787 	follow_mount(&nd->path.mnt, &nd->path.dentry);
788 }
789 
790 /*
791  *  It's more convoluted than I'd like it to be, but... it's still fairly
792  *  small and for now I'd prefer to have fast path as straight as possible.
793  *  It _is_ time-critical.
794  */
795 static int do_lookup(struct nameidata *nd, struct qstr *name,
796 		     struct path *path)
797 {
798 	struct vfsmount *mnt = nd->path.mnt;
799 	struct dentry *dentry = __d_lookup(nd->path.dentry, name);
800 
801 	if (!dentry)
802 		goto need_lookup;
803 	if (dentry->d_op && dentry->d_op->d_revalidate)
804 		goto need_revalidate;
805 done:
806 	path->mnt = mnt;
807 	path->dentry = dentry;
808 	__follow_mount(path);
809 	return 0;
810 
811 need_lookup:
812 	dentry = real_lookup(nd->path.dentry, name, nd);
813 	if (IS_ERR(dentry))
814 		goto fail;
815 	goto done;
816 
817 need_revalidate:
818 	dentry = do_revalidate(dentry, nd);
819 	if (!dentry)
820 		goto need_lookup;
821 	if (IS_ERR(dentry))
822 		goto fail;
823 	goto done;
824 
825 fail:
826 	return PTR_ERR(dentry);
827 }
828 
/*
 * Name resolution.
 * This is the basic name resolution function, turning a pathname into
 * the final dentry. We expect 'base' to be positive and a directory.
 *
 * Returns 0 and nd will have valid dentry and mnt on success.
 * Returns error and drops reference to input namei data on failure.
 *
 * Control-flow notes:
 *  - 'break' inside the main loop leaves it and reaches the
 *    path_put(&nd->path) that follows the loop;
 *  - 'goto return_err' skips that path_put().  It is only used after
 *    do_follow_link() failed (NOTE(review): presumably because nd->path
 *    has already been released on that path -- confirm);
 *  - 'return_reval' is also entered from before the loop, when nothing
 *    but slashes was passed in @name.
 */
static int __link_path_walk(const char *name, struct nameidata *nd)
{
	struct path next;
	struct inode *inode;
	int err;
	unsigned int lookup_flags = nd->flags;

	/* Skip leading slashes; a name with nothing else needs no walk. */
	while (*name=='/')
		name++;
	if (!*name)
		goto return_reval;

	inode = nd->path.dentry->d_inode;
	/* Inside a nested symlink (depth != 0) links are always followed. */
	if (nd->depth)
		lookup_flags = LOOKUP_FOLLOW | (nd->flags & LOOKUP_CONTINUE);

	/* At this point we know we have a real path component. */
	for(;;) {
		unsigned long hash;
		struct qstr this;
		unsigned int c;

		/*
		 * Search permission on the current directory: try the cheap
		 * check first and fall back to vfs_permission() when it
		 * answers -EAGAIN.
		 */
		nd->flags |= LOOKUP_CONTINUE;
		err = exec_permission_lite(inode);
		if (err == -EAGAIN)
			err = vfs_permission(nd, MAY_EXEC);
 		if (err)
			break;

		/* Scan one component, hashing it up to the next '/' or NUL. */
		this.name = name;
		c = *(const unsigned char *)name;

		hash = init_name_hash();
		do {
			name++;
			hash = partial_name_hash(c, hash);
			c = *(const unsigned char *)name;
		} while (c && (c != '/'));
		this.len = name - (const char *) this.name;
		this.hash = end_name_hash(hash);

		/* remove trailing slashes? */
		if (!c)
			goto last_component;
		while (*++name == '/');
		if (!*name)
			goto last_with_slashes;

		/*
		 * "." and ".." are special - ".." especially so because it has
		 * to be able to know about the current root directory and
		 * parent relationships.
		 */
		if (this.name[0] == '.') switch (this.len) {
			default:
				break;
			case 2:
				if (this.name[1] != '.')
					break;
				follow_dotdot(nd);
				inode = nd->path.dentry->d_inode;
				/* fallthrough */
			case 1:
				/* 'continue' restarts the for(;;) loop on the next component */
				continue;
		}
		/*
		 * See if the low-level filesystem might want
		 * to use its own hash..
		 */
		if (nd->path.dentry->d_op && nd->path.dentry->d_op->d_hash) {
			err = nd->path.dentry->d_op->d_hash(nd->path.dentry,
							    &this);
			if (err < 0)
				break;
		}
		/* This does the actual lookups.. */
		err = do_lookup(nd, &this, &next);
		if (err)
			break;

		/* An intermediate component must exist and have inode ops. */
		err = -ENOENT;
		inode = next.dentry->d_inode;
		if (!inode)
			goto out_dput;
		err = -ENOTDIR;
		if (!inode->i_op)
			goto out_dput;

		if (inode->i_op->follow_link) {
			/* Symlink inside the path: always follow it. */
			err = do_follow_link(&next, nd);
			if (err)
				goto return_err;
			err = -ENOENT;
			inode = nd->path.dentry->d_inode;
			if (!inode)
				break;
			err = -ENOTDIR;
			if (!inode->i_op)
				break;
		} else
			path_to_nameidata(&next, nd);
		/* Without a ->lookup method we cannot descend any further. */
		err = -ENOTDIR;
		if (!inode->i_op->lookup)
			break;
		continue;
		/* here ends the main loop */

last_with_slashes:
		/* Trailing slashes: follow a final symlink and demand a directory. */
		lookup_flags |= LOOKUP_FOLLOW | LOOKUP_DIRECTORY;
last_component:
		/* Clear LOOKUP_CONTINUE iff it was previously unset */
		nd->flags &= lookup_flags | ~LOOKUP_CONTINUE;
		if (lookup_flags & LOOKUP_PARENT)
			goto lookup_parent;
		if (this.name[0] == '.') switch (this.len) {
			default:
				break;
			case 2:
				if (this.name[1] != '.')
					break;
				follow_dotdot(nd);
				inode = nd->path.dentry->d_inode;
				/* fallthrough */
			case 1:
				goto return_reval;
		}
		if (nd->path.dentry->d_op && nd->path.dentry->d_op->d_hash) {
			err = nd->path.dentry->d_op->d_hash(nd->path.dentry,
							    &this);
			if (err < 0)
				break;
		}
		err = do_lookup(nd, &this, &next);
		if (err)
			break;
		inode = next.dentry->d_inode;
		/* A final symlink is followed only when LOOKUP_FOLLOW is set. */
		if ((lookup_flags & LOOKUP_FOLLOW)
		    && inode && inode->i_op && inode->i_op->follow_link) {
			err = do_follow_link(&next, nd);
			if (err)
				goto return_err;
			inode = nd->path.dentry->d_inode;
		} else
			path_to_nameidata(&next, nd);
		err = -ENOENT;
		if (!inode)
			break;
		if (lookup_flags & LOOKUP_DIRECTORY) {
			err = -ENOTDIR;
			if (!inode->i_op || !inode->i_op->lookup)
				break;
		}
		goto return_base;
lookup_parent:
		/*
		 * LOOKUP_PARENT: do not look the last component up; record
		 * it and its type (normal, "." or "..") in nd instead.
		 */
		nd->last = this;
		nd->last_type = LAST_NORM;
		if (this.name[0] != '.')
			goto return_base;
		if (this.len == 1)
			nd->last_type = LAST_DOT;
		else if (this.len == 2 && this.name[1] == '.')
			nd->last_type = LAST_DOTDOT;
		else
			goto return_base;
return_reval:
		/*
		 * We bypassed the ordinary revalidation routines.
		 * We may need to check the cached dentry for staleness.
		 */
		if (nd->path.dentry && nd->path.dentry->d_sb &&
		    (nd->path.dentry->d_sb->s_type->fs_flags & FS_REVAL_DOT)) {
			err = -ESTALE;
			/* Note: we do not d_invalidate() */
			if (!nd->path.dentry->d_op->d_revalidate(
					nd->path.dentry, nd))
				break;
		}
return_base:
		return 0;
out_dput:
		/* Release the looked-up 'next' before the common error exit. */
		path_put_conditional(&next, nd);
		break;
	}
	/* Common error exit: drop the reference on nd->path. */
	path_put(&nd->path);
return_err:
	return err;
}
1024 
1025 static int path_walk(const char *name, struct nameidata *nd)
1026 {
1027 	current->total_link_count = 0;
1028 	return link_path_walk(name, nd);
1029 }
1030 
1031 /* Returns 0 and nd will be valid on success; Retuns error, otherwise. */
1032 static int do_path_lookup(int dfd, const char *name,
1033 				unsigned int flags, struct nameidata *nd)
1034 {
1035 	int retval = 0;
1036 	int fput_needed;
1037 	struct file *file;
1038 	struct fs_struct *fs = current->fs;
1039 
1040 	nd->last_type = LAST_ROOT; /* if there are only slashes... */
1041 	nd->flags = flags;
1042 	nd->depth = 0;
1043 
1044 	if (*name=='/') {
1045 		read_lock(&fs->lock);
1046 		nd->path = fs->root;
1047 		path_get(&fs->root);
1048 		read_unlock(&fs->lock);
1049 	} else if (dfd == AT_FDCWD) {
1050 		read_lock(&fs->lock);
1051 		nd->path = fs->pwd;
1052 		path_get(&fs->pwd);
1053 		read_unlock(&fs->lock);
1054 	} else {
1055 		struct dentry *dentry;
1056 
1057 		file = fget_light(dfd, &fput_needed);
1058 		retval = -EBADF;
1059 		if (!file)
1060 			goto out_fail;
1061 
1062 		dentry = file->f_path.dentry;
1063 
1064 		retval = -ENOTDIR;
1065 		if (!S_ISDIR(dentry->d_inode->i_mode))
1066 			goto fput_fail;
1067 
1068 		retval = file_permission(file, MAY_EXEC);
1069 		if (retval)
1070 			goto fput_fail;
1071 
1072 		nd->path = file->f_path;
1073 		path_get(&file->f_path);
1074 
1075 		fput_light(file, fput_needed);
1076 	}
1077 
1078 	retval = path_walk(name, nd);
1079 	if (unlikely(!retval && !audit_dummy_context() && nd->path.dentry &&
1080 				nd->path.dentry->d_inode))
1081 		audit_inode(name, nd->path.dentry);
1082 out_fail:
1083 	return retval;
1084 
1085 fput_fail:
1086 	fput_light(file, fput_needed);
1087 	goto out_fail;
1088 }
1089 
1090 int path_lookup(const char *name, unsigned int flags,
1091 			struct nameidata *nd)
1092 {
1093 	return do_path_lookup(AT_FDCWD, name, flags, nd);
1094 }
1095 
1096 int kern_path(const char *name, unsigned int flags, struct path *path)
1097 {
1098 	struct nameidata nd;
1099 	int res = do_path_lookup(AT_FDCWD, name, flags, &nd);
1100 	if (!res)
1101 		*path = nd.path;
1102 	return res;
1103 }
1104 
1105 /**
1106  * vfs_path_lookup - lookup a file path relative to a dentry-vfsmount pair
1107  * @dentry:  pointer to dentry of the base directory
1108  * @mnt: pointer to vfs mount of the base directory
1109  * @name: pointer to file name
1110  * @flags: lookup flags
1111  * @nd: pointer to nameidata
1112  */
1113 int vfs_path_lookup(struct dentry *dentry, struct vfsmount *mnt,
1114 		    const char *name, unsigned int flags,
1115 		    struct nameidata *nd)
1116 {
1117 	int retval;
1118 
1119 	/* same as do_path_lookup */
1120 	nd->last_type = LAST_ROOT;
1121 	nd->flags = flags;
1122 	nd->depth = 0;
1123 
1124 	nd->path.dentry = dentry;
1125 	nd->path.mnt = mnt;
1126 	path_get(&nd->path);
1127 
1128 	retval = path_walk(name, nd);
1129 	if (unlikely(!retval && !audit_dummy_context() && nd->path.dentry &&
1130 				nd->path.dentry->d_inode))
1131 		audit_inode(name, nd->path.dentry);
1132 
1133 	return retval;
1134 
1135 }
1136 
1137 /**
1138  * path_lookup_open - lookup a file path with open intent
1139  * @dfd: the directory to use as base, or AT_FDCWD
1140  * @name: pointer to file name
1141  * @lookup_flags: lookup intent flags
1142  * @nd: pointer to nameidata
1143  * @open_flags: open intent flags
1144  */
1145 int path_lookup_open(int dfd, const char *name, unsigned int lookup_flags,
1146 		struct nameidata *nd, int open_flags)
1147 {
1148 	struct file *filp = get_empty_filp();
1149 	int err;
1150 
1151 	if (filp == NULL)
1152 		return -ENFILE;
1153 	nd->intent.open.file = filp;
1154 	nd->intent.open.flags = open_flags;
1155 	nd->intent.open.create_mode = 0;
1156 	err = do_path_lookup(dfd, name, lookup_flags|LOOKUP_OPEN, nd);
1157 	if (IS_ERR(nd->intent.open.file)) {
1158 		if (err == 0) {
1159 			err = PTR_ERR(nd->intent.open.file);
1160 			path_put(&nd->path);
1161 		}
1162 	} else if (err != 0)
1163 		release_open_intent(nd);
1164 	return err;
1165 }
1166 
1167 static struct dentry *__lookup_hash(struct qstr *name,
1168 		struct dentry *base, struct nameidata *nd)
1169 {
1170 	struct dentry *dentry;
1171 	struct inode *inode;
1172 	int err;
1173 
1174 	inode = base->d_inode;
1175 
1176 	/*
1177 	 * See if the low-level filesystem might want
1178 	 * to use its own hash..
1179 	 */
1180 	if (base->d_op && base->d_op->d_hash) {
1181 		err = base->d_op->d_hash(base, name);
1182 		dentry = ERR_PTR(err);
1183 		if (err < 0)
1184 			goto out;
1185 	}
1186 
1187 	dentry = cached_lookup(base, name, nd);
1188 	if (!dentry) {
1189 		struct dentry *new;
1190 
1191 		/* Don't create child dentry for a dead directory. */
1192 		dentry = ERR_PTR(-ENOENT);
1193 		if (IS_DEADDIR(inode))
1194 			goto out;
1195 
1196 		new = d_alloc(base, name);
1197 		dentry = ERR_PTR(-ENOMEM);
1198 		if (!new)
1199 			goto out;
1200 		dentry = inode->i_op->lookup(inode, new, nd);
1201 		if (!dentry)
1202 			dentry = new;
1203 		else
1204 			dput(new);
1205 	}
1206 out:
1207 	return dentry;
1208 }
1209 
1210 /*
1211  * Restricted form of lookup. Doesn't follow links, single-component only,
1212  * needs parent already locked. Doesn't follow mounts.
1213  * SMP-safe.
1214  */
1215 static struct dentry *lookup_hash(struct nameidata *nd)
1216 {
1217 	int err;
1218 
1219 	err = inode_permission(nd->path.dentry->d_inode, MAY_EXEC);
1220 	if (err)
1221 		return ERR_PTR(err);
1222 	return __lookup_hash(&nd->last, nd->path.dentry, nd);
1223 }
1224 
1225 static int __lookup_one_len(const char *name, struct qstr *this,
1226 		struct dentry *base, int len)
1227 {
1228 	unsigned long hash;
1229 	unsigned int c;
1230 
1231 	this->name = name;
1232 	this->len = len;
1233 	if (!len)
1234 		return -EACCES;
1235 
1236 	hash = init_name_hash();
1237 	while (len--) {
1238 		c = *(const unsigned char *)name++;
1239 		if (c == '/' || c == '\0')
1240 			return -EACCES;
1241 		hash = partial_name_hash(c, hash);
1242 	}
1243 	this->hash = end_name_hash(hash);
1244 	return 0;
1245 }
1246 
1247 /**
1248  * lookup_one_len - filesystem helper to lookup single pathname component
1249  * @name:	pathname component to lookup
1250  * @base:	base directory to lookup from
1251  * @len:	maximum length @len should be interpreted to
1252  *
1253  * Note that this routine is purely a helper for filesystem usage and should
1254  * not be called by generic code.  Also note that by using this function the
1255  * nameidata argument is passed to the filesystem methods and a filesystem
1256  * using this helper needs to be prepared for that.
1257  */
1258 struct dentry *lookup_one_len(const char *name, struct dentry *base, int len)
1259 {
1260 	int err;
1261 	struct qstr this;
1262 
1263 	err = __lookup_one_len(name, &this, base, len);
1264 	if (err)
1265 		return ERR_PTR(err);
1266 
1267 	err = inode_permission(base->d_inode, MAY_EXEC);
1268 	if (err)
1269 		return ERR_PTR(err);
1270 	return __lookup_hash(&this, base, NULL);
1271 }
1272 
1273 /**
1274  * lookup_one_noperm - bad hack for sysfs
1275  * @name:	pathname component to lookup
1276  * @base:	base directory to lookup from
1277  *
1278  * This is a variant of lookup_one_len that doesn't perform any permission
1279  * checks.   It's a horrible hack to work around the braindead sysfs
1280  * architecture and should not be used anywhere else.
1281  *
1282  * DON'T USE THIS FUNCTION EVER, thanks.
1283  */
1284 struct dentry *lookup_one_noperm(const char *name, struct dentry *base)
1285 {
1286 	int err;
1287 	struct qstr this;
1288 
1289 	err = __lookup_one_len(name, &this, base, strlen(name));
1290 	if (err)
1291 		return ERR_PTR(err);
1292 	return __lookup_hash(&this, base, NULL);
1293 }
1294 
1295 int user_path_at(int dfd, const char __user *name, unsigned flags,
1296 		 struct path *path)
1297 {
1298 	struct nameidata nd;
1299 	char *tmp = getname(name);
1300 	int err = PTR_ERR(tmp);
1301 	if (!IS_ERR(tmp)) {
1302 
1303 		BUG_ON(flags & LOOKUP_PARENT);
1304 
1305 		err = do_path_lookup(dfd, tmp, flags, &nd);
1306 		putname(tmp);
1307 		if (!err)
1308 			*path = nd.path;
1309 	}
1310 	return err;
1311 }
1312 
1313 static int user_path_parent(int dfd, const char __user *path,
1314 			struct nameidata *nd, char **name)
1315 {
1316 	char *s = getname(path);
1317 	int error;
1318 
1319 	if (IS_ERR(s))
1320 		return PTR_ERR(s);
1321 
1322 	error = do_path_lookup(dfd, s, LOOKUP_PARENT, nd);
1323 	if (error)
1324 		putname(s);
1325 	else
1326 		*name = s;
1327 
1328 	return error;
1329 }
1330 
1331 /*
1332  * It's inline, so penalty for filesystems that don't use sticky bit is
1333  * minimal.
1334  */
1335 static inline int check_sticky(struct inode *dir, struct inode *inode)
1336 {
1337 	if (!(dir->i_mode & S_ISVTX))
1338 		return 0;
1339 	if (inode->i_uid == current->fsuid)
1340 		return 0;
1341 	if (dir->i_uid == current->fsuid)
1342 		return 0;
1343 	return !capable(CAP_FOWNER);
1344 }
1345 
1346 /*
1347  *	Check whether we can remove a link victim from directory dir, check
1348  *  whether the type of victim is right.
1349  *  1. We can't do it if dir is read-only (done in permission())
1350  *  2. We should have write and exec permissions on dir
1351  *  3. We can't remove anything from append-only dir
1352  *  4. We can't do anything with immutable dir (done in permission())
1353  *  5. If the sticky bit on dir is set we should either
1354  *	a. be owner of dir, or
1355  *	b. be owner of victim, or
1356  *	c. have CAP_FOWNER capability
1357  *  6. If the victim is append-only or immutable we can't do antyhing with
1358  *     links pointing to it.
1359  *  7. If we were asked to remove a directory and victim isn't one - ENOTDIR.
1360  *  8. If we were asked to remove a non-directory and victim isn't one - EISDIR.
1361  *  9. We can't remove a root or mountpoint.
1362  * 10. We don't allow removal of NFS sillyrenamed files; it's handled by
1363  *     nfs_async_unlink().
1364  */
1365 static int may_delete(struct inode *dir,struct dentry *victim,int isdir)
1366 {
1367 	int error;
1368 
1369 	if (!victim->d_inode)
1370 		return -ENOENT;
1371 
1372 	BUG_ON(victim->d_parent->d_inode != dir);
1373 	audit_inode_child(victim->d_name.name, victim, dir);
1374 
1375 	error = inode_permission(dir, MAY_WRITE | MAY_EXEC);
1376 	if (error)
1377 		return error;
1378 	if (IS_APPEND(dir))
1379 		return -EPERM;
1380 	if (check_sticky(dir, victim->d_inode)||IS_APPEND(victim->d_inode)||
1381 	    IS_IMMUTABLE(victim->d_inode))
1382 		return -EPERM;
1383 	if (isdir) {
1384 		if (!S_ISDIR(victim->d_inode->i_mode))
1385 			return -ENOTDIR;
1386 		if (IS_ROOT(victim))
1387 			return -EBUSY;
1388 	} else if (S_ISDIR(victim->d_inode->i_mode))
1389 		return -EISDIR;
1390 	if (IS_DEADDIR(dir))
1391 		return -ENOENT;
1392 	if (victim->d_flags & DCACHE_NFSFS_RENAMED)
1393 		return -EBUSY;
1394 	return 0;
1395 }
1396 
1397 /*	Check whether we can create an object with dentry child in directory
1398  *  dir.
1399  *  1. We can't do it if child already exists (open has special treatment for
1400  *     this case, but since we are inlined it's OK)
1401  *  2. We can't do it if dir is read-only (done in permission())
1402  *  3. We should have write and exec permissions on dir
1403  *  4. We can't do it if dir is immutable (done in permission())
1404  */
1405 static inline int may_create(struct inode *dir, struct dentry *child)
1406 {
1407 	if (child->d_inode)
1408 		return -EEXIST;
1409 	if (IS_DEADDIR(dir))
1410 		return -ENOENT;
1411 	return inode_permission(dir, MAY_WRITE | MAY_EXEC);
1412 }
1413 
1414 /*
1415  * O_DIRECTORY translates into forcing a directory lookup.
1416  */
1417 static inline int lookup_flags(unsigned int f)
1418 {
1419 	unsigned long retval = LOOKUP_FOLLOW;
1420 
1421 	if (f & O_NOFOLLOW)
1422 		retval &= ~LOOKUP_FOLLOW;
1423 
1424 	if (f & O_DIRECTORY)
1425 		retval |= LOOKUP_DIRECTORY;
1426 
1427 	return retval;
1428 }
1429 
1430 /*
1431  * p1 and p2 should be directories on the same fs.
1432  */
1433 struct dentry *lock_rename(struct dentry *p1, struct dentry *p2)
1434 {
1435 	struct dentry *p;
1436 
1437 	if (p1 == p2) {
1438 		mutex_lock_nested(&p1->d_inode->i_mutex, I_MUTEX_PARENT);
1439 		return NULL;
1440 	}
1441 
1442 	mutex_lock(&p1->d_inode->i_sb->s_vfs_rename_mutex);
1443 
1444 	p = d_ancestor(p2, p1);
1445 	if (p) {
1446 		mutex_lock_nested(&p2->d_inode->i_mutex, I_MUTEX_PARENT);
1447 		mutex_lock_nested(&p1->d_inode->i_mutex, I_MUTEX_CHILD);
1448 		return p;
1449 	}
1450 
1451 	p = d_ancestor(p1, p2);
1452 	if (p) {
1453 		mutex_lock_nested(&p1->d_inode->i_mutex, I_MUTEX_PARENT);
1454 		mutex_lock_nested(&p2->d_inode->i_mutex, I_MUTEX_CHILD);
1455 		return p;
1456 	}
1457 
1458 	mutex_lock_nested(&p1->d_inode->i_mutex, I_MUTEX_PARENT);
1459 	mutex_lock_nested(&p2->d_inode->i_mutex, I_MUTEX_CHILD);
1460 	return NULL;
1461 }
1462 
1463 void unlock_rename(struct dentry *p1, struct dentry *p2)
1464 {
1465 	mutex_unlock(&p1->d_inode->i_mutex);
1466 	if (p1 != p2) {
1467 		mutex_unlock(&p2->d_inode->i_mutex);
1468 		mutex_unlock(&p1->d_inode->i_sb->s_vfs_rename_mutex);
1469 	}
1470 }
1471 
1472 int vfs_create(struct inode *dir, struct dentry *dentry, int mode,
1473 		struct nameidata *nd)
1474 {
1475 	int error = may_create(dir, dentry);
1476 
1477 	if (error)
1478 		return error;
1479 
1480 	if (!dir->i_op || !dir->i_op->create)
1481 		return -EACCES;	/* shouldn't it be ENOSYS? */
1482 	mode &= S_IALLUGO;
1483 	mode |= S_IFREG;
1484 	error = security_inode_create(dir, dentry, mode);
1485 	if (error)
1486 		return error;
1487 	DQUOT_INIT(dir);
1488 	error = dir->i_op->create(dir, dentry, mode, nd);
1489 	if (!error)
1490 		fsnotify_create(dir, dentry);
1491 	return error;
1492 }
1493 
1494 int may_open(struct nameidata *nd, int acc_mode, int flag)
1495 {
1496 	struct dentry *dentry = nd->path.dentry;
1497 	struct inode *inode = dentry->d_inode;
1498 	int error;
1499 
1500 	if (!inode)
1501 		return -ENOENT;
1502 
1503 	if (S_ISLNK(inode->i_mode))
1504 		return -ELOOP;
1505 
1506 	if (S_ISDIR(inode->i_mode) && (acc_mode & MAY_WRITE))
1507 		return -EISDIR;
1508 
1509 	/*
1510 	 * FIFO's, sockets and device files are special: they don't
1511 	 * actually live on the filesystem itself, and as such you
1512 	 * can write to them even if the filesystem is read-only.
1513 	 */
1514 	if (S_ISFIFO(inode->i_mode) || S_ISSOCK(inode->i_mode)) {
1515 	    	flag &= ~O_TRUNC;
1516 	} else if (S_ISBLK(inode->i_mode) || S_ISCHR(inode->i_mode)) {
1517 		if (nd->path.mnt->mnt_flags & MNT_NODEV)
1518 			return -EACCES;
1519 
1520 		flag &= ~O_TRUNC;
1521 	}
1522 
1523 	error = vfs_permission(nd, acc_mode);
1524 	if (error)
1525 		return error;
1526 	/*
1527 	 * An append-only file must be opened in append mode for writing.
1528 	 */
1529 	if (IS_APPEND(inode)) {
1530 		if  ((flag & FMODE_WRITE) && !(flag & O_APPEND))
1531 			return -EPERM;
1532 		if (flag & O_TRUNC)
1533 			return -EPERM;
1534 	}
1535 
1536 	/* O_NOATIME can only be set by the owner or superuser */
1537 	if (flag & O_NOATIME)
1538 		if (!is_owner_or_cap(inode))
1539 			return -EPERM;
1540 
1541 	/*
1542 	 * Ensure there are no outstanding leases on the file.
1543 	 */
1544 	error = break_lease(inode, flag);
1545 	if (error)
1546 		return error;
1547 
1548 	if (flag & O_TRUNC) {
1549 		error = get_write_access(inode);
1550 		if (error)
1551 			return error;
1552 
1553 		/*
1554 		 * Refuse to truncate files with mandatory locks held on them.
1555 		 */
1556 		error = locks_verify_locked(inode);
1557 		if (!error) {
1558 			DQUOT_INIT(inode);
1559 
1560 			error = do_truncate(dentry, 0,
1561 					    ATTR_MTIME|ATTR_CTIME|ATTR_OPEN,
1562 					    NULL);
1563 		}
1564 		put_write_access(inode);
1565 		if (error)
1566 			return error;
1567 	} else
1568 		if (flag & FMODE_WRITE)
1569 			DQUOT_INIT(inode);
1570 
1571 	return 0;
1572 }
1573 
1574 /*
1575  * Be careful about ever adding any more callers of this
1576  * function.  Its flags must be in the namei format, not
1577  * what get passed to sys_open().
1578  */
1579 static int __open_namei_create(struct nameidata *nd, struct path *path,
1580 				int flag, int mode)
1581 {
1582 	int error;
1583 	struct dentry *dir = nd->path.dentry;
1584 
1585 	if (!IS_POSIXACL(dir->d_inode))
1586 		mode &= ~current->fs->umask;
1587 	error = vfs_create(dir->d_inode, path->dentry, mode, nd);
1588 	mutex_unlock(&dir->d_inode->i_mutex);
1589 	dput(nd->path.dentry);
1590 	nd->path.dentry = path->dentry;
1591 	if (error)
1592 		return error;
1593 	/* Don't check for write permission, don't truncate */
1594 	return may_open(nd, 0, flag & ~O_TRUNC);
1595 }
1596 
/*
 * Note that while the flag value (low two bits) for sys_open means:
 *	00 - read-only
 *	01 - write-only
 *	10 - read-write
 *	11 - special
 * it is changed into
 *	00 - no permissions needed
 *	01 - read-permission
 *	10 - write-permission
 *	11 - read-write
 * for the internal routines (ie open_namei()/follow_link() etc)
 * This is more logical, and also allows the 00 "no perm needed"
 * to be used for symlinks (where the permissions are checked
 * later).
 *
 * The conversion is a plain increment, skipped when the access-mode
 * bits are already 11 so that they do not carry into the other flags.
 */
static inline int open_to_namei_flags(int flag)
{
	int bumped = flag + 1;

	return (bumped & O_ACCMODE) ? bumped : flag;
}
1620 
1621 static int open_will_write_to_fs(int flag, struct inode *inode)
1622 {
1623 	/*
1624 	 * We'll never write to the fs underlying
1625 	 * a device file.
1626 	 */
1627 	if (special_file(inode->i_mode))
1628 		return 0;
1629 	return (flag & O_TRUNC);
1630 }
1631 
1632 /*
1633  * Note that the low bits of the passed in "open_flag"
1634  * are not the same as in the local variable "flag". See
1635  * open_to_namei_flags() for more details.
1636  */
/*
 * do_filp_open - look up @pathname (creating it if O_CREAT asks for that)
 * and turn the result into an open struct file.
 * @dfd: base directory fd for relative names, or AT_FDCWD
 * @pathname: name to open
 * @open_flag: flags in sys_open() format; converted to namei format below
 * @mode: used as the create mode when O_CREAT is set
 *
 * Returns the struct file produced by nameidata_to_filp(), or an
 * ERR_PTR() carrying the error.
 */
1637 struct file *do_filp_open(int dfd, const char *pathname,
1638 		int open_flag, int mode)
1639 {
1640 	struct file *filp;
1641 	struct nameidata nd;
1642 	int acc_mode, error;
1643 	struct path path;
1644 	struct dentry *dir;
1645 	int count = 0;
1646 	int will_write;
1647 	int flag = open_to_namei_flags(open_flag);
1648 
	/* permission bits that may_open() will be asked to check */
1649 	acc_mode = MAY_OPEN | ACC_MODE(flag);
1650 
1651 	/* O_TRUNC implies we need access checks for write permissions */
1652 	if (flag & O_TRUNC)
1653 		acc_mode |= MAY_WRITE;
1654 
1655 	/* Allow the LSM permission hook to distinguish append
1656 	   access from general write access. */
1657 	if (flag & O_APPEND)
1658 		acc_mode |= MAY_APPEND;
1659 
1660 	/*
1661 	 * The simplest case - just a plain lookup.
1662 	 */
1663 	if (!(flag & O_CREAT)) {
1664 		error = path_lookup_open(dfd, pathname, lookup_flags(flag),
1665 					 &nd, flag);
1666 		if (error)
1667 			return ERR_PTR(error);
1668 		goto ok;
1669 	}
1670 
1671 	/*
1672 	 * Create - we need to know the parent.
1673 	 */
1674 	error = do_path_lookup(dfd, pathname, LOOKUP_PARENT, &nd);
1675 	if (error)
1676 		return ERR_PTR(error);
1677 
1678 	/*
1679 	 * We have the parent and last component. First of all, check
1680 	 * that we are not asked to creat(2) an obvious directory - that
1681 	 * will not do.
1682 	 */
	/*
	 * A non-NUL byte right after the last component means the name
	 * does not end there (presumably trailing slashes).
	 */
1683 	error = -EISDIR;
1684 	if (nd.last_type != LAST_NORM || nd.last.name[nd.last.len])
1685 		goto exit_parent;
1686 
1687 	error = -ENFILE;
1688 	filp = get_empty_filp();
1689 	if (filp == NULL)
1690 		goto exit_parent;
1691 	nd.intent.open.file = filp;
1692 	nd.intent.open.flags = flag;
1693 	nd.intent.open.create_mode = mode;
1694 	dir = nd.path.dentry;
1695 	nd.flags &= ~LOOKUP_PARENT;
1696 	nd.flags |= LOOKUP_CREATE | LOOKUP_OPEN;
1697 	if (flag & O_EXCL)
1698 		nd.flags |= LOOKUP_EXCL;
	/* the parent's i_mutex is held across the last-component lookup */
1699 	mutex_lock(&dir->d_inode->i_mutex);
1700 	path.dentry = lookup_hash(&nd);
1701 	path.mnt = nd.path.mnt;
1702 
	/*
	 * Entered with dir->d_inode->i_mutex held and path.dentry holding
	 * the result (dentry or ERR_PTR) of the last-component lookup.
	 */
1703 do_last:
1704 	error = PTR_ERR(path.dentry);
1705 	if (IS_ERR(path.dentry)) {
1706 		mutex_unlock(&dir->d_inode->i_mutex);
1707 		goto exit;
1708 	}
1709 
1710 	if (IS_ERR(nd.intent.open.file)) {
1711 		error = PTR_ERR(nd.intent.open.file);
1712 		goto exit_mutex_unlock;
1713 	}
1714 
1715 	/* Negative dentry, just create the file */
1716 	if (!path.dentry->d_inode) {
1717 		/*
1718 		 * This write is needed to ensure that a
1719 		 * ro->rw transition does not occur between
1720 		 * the time when the file is created and when
1721 		 * a permanent write count is taken through
1722 		 * the 'struct file' in nameidata_to_filp().
1723 		 */
1724 		error = mnt_want_write(nd.path.mnt);
1725 		if (error)
1726 			goto exit_mutex_unlock;
		/*
		 * __open_namei_create() releases dir's i_mutex and moves
		 * nd.path.dentry over to path.dentry, so the error path
		 * below skips both the unlock and the dput of path.
		 */
1727 		error = __open_namei_create(&nd, &path, flag, mode);
1728 		if (error) {
1729 			mnt_drop_write(nd.path.mnt);
1730 			goto exit;
1731 		}
1732 		filp = nameidata_to_filp(&nd, open_flag);
1733 		mnt_drop_write(nd.path.mnt);
1734 		return filp;
1735 	}
1736 
1737 	/*
1738 	 * It already exists.
1739 	 */
1740 	mutex_unlock(&dir->d_inode->i_mutex);
1741 	audit_inode(pathname, path.dentry);
1742 
1743 	error = -EEXIST;
1744 	if (flag & O_EXCL)
1745 		goto exit_dput;
1746 
1747 	if (__follow_mount(&path)) {
1748 		error = -ELOOP;
1749 		if (flag & O_NOFOLLOW)
1750 			goto exit_dput;
1751 	}
1752 
1753 	error = -ENOENT;
1754 	if (!path.dentry->d_inode)
1755 		goto exit_dput;
	/* an inode with ->follow_link is handled as a link, at do_link */
1756 	if (path.dentry->d_inode->i_op && path.dentry->d_inode->i_op->follow_link)
1757 		goto do_link;
1758 
1759 	path_to_nameidata(&path, &nd);
1760 	error = -EISDIR;
1761 	if (path.dentry->d_inode && S_ISDIR(path.dentry->d_inode->i_mode))
1762 		goto exit;
1763 ok:
1764 	/*
1765 	 * Consider:
1766 	 * 1. may_open() truncates a file
1767 	 * 2. a rw->ro mount transition occurs
1768 	 * 3. nameidata_to_filp() fails due to
1769 	 *    the ro mount.
1770 	 * That would be inconsistent, and should
1771 	 * be avoided. Taking this mnt write here
1772 	 * ensures that (2) can not occur.
1773 	 */
1774 	will_write = open_will_write_to_fs(flag, nd.path.dentry->d_inode);
1775 	if (will_write) {
1776 		error = mnt_want_write(nd.path.mnt);
1777 		if (error)
1778 			goto exit;
1779 	}
1780 	error = may_open(&nd, acc_mode, flag);
1781 	if (error) {
1782 		if (will_write)
1783 			mnt_drop_write(nd.path.mnt);
1784 		goto exit;
1785 	}
1786 	filp = nameidata_to_filp(&nd, open_flag);
1787 	/*
1788 	 * It is now safe to drop the mnt write
1789 	 * because the filp has had a write taken
1790 	 * on its behalf.
1791 	 */
1792 	if (will_write)
1793 		mnt_drop_write(nd.path.mnt);
1794 	return filp;
1795 
	/*
	 * Error exits, from most to least state to undo: unlock the
	 * parent, drop path, release the open intent (unless it already
	 * holds an error), drop nd.path.
	 */
1796 exit_mutex_unlock:
1797 	mutex_unlock(&dir->d_inode->i_mutex);
1798 exit_dput:
1799 	path_put_conditional(&path, &nd);
1800 exit:
1801 	if (!IS_ERR(nd.intent.open.file))
1802 		release_open_intent(&nd);
1803 exit_parent:
1804 	path_put(&nd.path);
1805 	return ERR_PTR(error);
1806 
1807 do_link:
1808 	error = -ELOOP;
1809 	if (flag & O_NOFOLLOW)
1810 		goto exit_dput;
1811 	/*
1812 	 * This is subtle. Instead of calling do_follow_link() we do the
1813 	 * thing by hands. The reason is that this way we have zero link_count
1814 	 * and path_walk() (called from ->follow_link) honoring LOOKUP_PARENT.
1815 	 * After that we have the parent and last component, i.e.
1816 	 * we are in the same situation as after the first path_walk().
1817 	 * Well, almost - if the last component is normal we get its copy
1818 	 * stored in nd->last.name and we will have to putname() it when we
1819 	 * are done. Procfs-like symlinks just set LAST_BIND.
1820 	 */
1821 	nd.flags |= LOOKUP_PARENT;
1822 	error = security_inode_follow_link(path.dentry, &nd);
1823 	if (error)
1824 		goto exit_dput;
1825 	error = __do_follow_link(&path, &nd);
1826 	if (error) {
1827 		/* Does someone understand code flow here? Or it is only
1828 		 * me so stupid? Anathema to whoever designed this non-sense
1829 		 * with "intent.open".
1830 		 */
1831 		release_open_intent(&nd);
1832 		return ERR_PTR(error);
1833 	}
1834 	nd.flags &= ~LOOKUP_PARENT;
1835 	if (nd.last_type == LAST_BIND)
1836 		goto ok;
1837 	error = -EISDIR;
1838 	if (nd.last_type != LAST_NORM)
1839 		goto exit;
1840 	if (nd.last.name[nd.last.len]) {
1841 		__putname(nd.last.name);
1842 		goto exit;
1843 	}
	/* give up with -ELOOP once count has reached 32 trailing links */
1844 	error = -ELOOP;
1845 	if (count++==32) {
1846 		__putname(nd.last.name);
1847 		goto exit;
1848 	}
	/* same state as after the first parent lookup: retry the last step */
1849 	dir = nd.path.dentry;
1850 	mutex_lock(&dir->d_inode->i_mutex);
1851 	path.dentry = lookup_hash(&nd);
1852 	path.mnt = nd.path.mnt;
1853 	__putname(nd.last.name);
1854 	goto do_last;
1855 }
1856 
1857 /**
1858  * filp_open - open file and return file pointer
1859  *
1860  * @filename:	path to open
1861  * @flags:	open flags as per the open(2) second argument
1862  * @mode:	mode for the new file if O_CREAT is set, else ignored
1863  *
1864  * This is the helper to open a file from kernelspace if you really
1865  * have to.  But in generally you should not do this, so please move
1866  * along, nothing to see here..
1867  */
1868 struct file *filp_open(const char *filename, int flags, int mode)
1869 {
1870 	return do_filp_open(AT_FDCWD, filename, flags, mode);
1871 }
1872 EXPORT_SYMBOL(filp_open);
1873 
1874 /**
1875  * lookup_create - lookup a dentry, creating it if it doesn't exist
1876  * @nd: nameidata info
1877  * @is_dir: directory flag
1878  *
1879  * Simple function to lookup and return a dentry and create it
1880  * if it doesn't exist.  Is SMP-safe.
1881  *
1882  * Returns with nd->path.dentry->d_inode->i_mutex locked.
1883  */
1884 struct dentry *lookup_create(struct nameidata *nd, int is_dir)
1885 {
1886 	struct dentry *dentry = ERR_PTR(-EEXIST);
1887 
1888 	mutex_lock_nested(&nd->path.dentry->d_inode->i_mutex, I_MUTEX_PARENT);
1889 	/*
1890 	 * Yucky last component or no last component at all?
1891 	 * (foo/., foo/.., /////)
1892 	 */
1893 	if (nd->last_type != LAST_NORM)
1894 		goto fail;
1895 	nd->flags &= ~LOOKUP_PARENT;
1896 	nd->flags |= LOOKUP_CREATE | LOOKUP_EXCL;
1897 	nd->intent.open.flags = O_EXCL;
1898 
1899 	/*
1900 	 * Do the final lookup.
1901 	 */
1902 	dentry = lookup_hash(nd);
1903 	if (IS_ERR(dentry))
1904 		goto fail;
1905 
1906 	if (dentry->d_inode)
1907 		goto eexist;
1908 	/*
1909 	 * Special case - lookup gave negative, but... we had foo/bar/
1910 	 * From the vfs_mknod() POV we just have a negative dentry -
1911 	 * all is fine. Let's be bastards - you had / on the end, you've
1912 	 * been asking for (non-existent) directory. -ENOENT for you.
1913 	 */
1914 	if (unlikely(!is_dir && nd->last.name[nd->last.len])) {
1915 		dput(dentry);
1916 		dentry = ERR_PTR(-ENOENT);
1917 	}
1918 	return dentry;
1919 eexist:
1920 	dput(dentry);
1921 	dentry = ERR_PTR(-EEXIST);
1922 fail:
1923 	return dentry;
1924 }
1925 EXPORT_SYMBOL_GPL(lookup_create);
1926 
1927 int vfs_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t dev)
1928 {
1929 	int error = may_create(dir, dentry);
1930 
1931 	if (error)
1932 		return error;
1933 
1934 	if ((S_ISCHR(mode) || S_ISBLK(mode)) && !capable(CAP_MKNOD))
1935 		return -EPERM;
1936 
1937 	if (!dir->i_op || !dir->i_op->mknod)
1938 		return -EPERM;
1939 
1940 	error = devcgroup_inode_mknod(mode, dev);
1941 	if (error)
1942 		return error;
1943 
1944 	error = security_inode_mknod(dir, dentry, mode, dev);
1945 	if (error)
1946 		return error;
1947 
1948 	DQUOT_INIT(dir);
1949 	error = dir->i_op->mknod(dir, dentry, mode, dev);
1950 	if (!error)
1951 		fsnotify_create(dir, dentry);
1952 	return error;
1953 }
1954 
/*
 * Check the file type encoded in @mode for mknod: regular files,
 * devices, FIFOs and sockets are fine (a zero type counts as a regular
 * file), directories give -EPERM, anything else -EINVAL.
 */
static int may_mknod(mode_t mode)
{
	mode_t type = mode & S_IFMT;

	if (type == S_IFDIR)
		return -EPERM;

	/* zero mode translates to S_IFREG */
	if (type == 0 || type == S_IFREG ||
	    type == S_IFCHR || type == S_IFBLK ||
	    type == S_IFIFO || type == S_IFSOCK)
		return 0;

	return -EINVAL;
}
1971 
1972 asmlinkage long sys_mknodat(int dfd, const char __user *filename, int mode,
1973 				unsigned dev)
1974 {
1975 	int error;
1976 	char *tmp;
1977 	struct dentry *dentry;
1978 	struct nameidata nd;
1979 
1980 	if (S_ISDIR(mode))
1981 		return -EPERM;
1982 
1983 	error = user_path_parent(dfd, filename, &nd, &tmp);
1984 	if (error)
1985 		return error;
1986 
1987 	dentry = lookup_create(&nd, 0);
1988 	if (IS_ERR(dentry)) {
1989 		error = PTR_ERR(dentry);
1990 		goto out_unlock;
1991 	}
1992 	if (!IS_POSIXACL(nd.path.dentry->d_inode))
1993 		mode &= ~current->fs->umask;
1994 	error = may_mknod(mode);
1995 	if (error)
1996 		goto out_dput;
1997 	error = mnt_want_write(nd.path.mnt);
1998 	if (error)
1999 		goto out_dput;
2000 	switch (mode & S_IFMT) {
2001 		case 0: case S_IFREG:
2002 			error = vfs_create(nd.path.dentry->d_inode,dentry,mode,&nd);
2003 			break;
2004 		case S_IFCHR: case S_IFBLK:
2005 			error = vfs_mknod(nd.path.dentry->d_inode,dentry,mode,
2006 					new_decode_dev(dev));
2007 			break;
2008 		case S_IFIFO: case S_IFSOCK:
2009 			error = vfs_mknod(nd.path.dentry->d_inode,dentry,mode,0);
2010 			break;
2011 	}
2012 	mnt_drop_write(nd.path.mnt);
2013 out_dput:
2014 	dput(dentry);
2015 out_unlock:
2016 	mutex_unlock(&nd.path.dentry->d_inode->i_mutex);
2017 	path_put(&nd.path);
2018 	putname(tmp);
2019 
2020 	return error;
2021 }
2022 
2023 asmlinkage long sys_mknod(const char __user *filename, int mode, unsigned dev)
2024 {
2025 	return sys_mknodat(AT_FDCWD, filename, mode, dev);
2026 }
2027 
2028 int vfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
2029 {
2030 	int error = may_create(dir, dentry);
2031 
2032 	if (error)
2033 		return error;
2034 
2035 	if (!dir->i_op || !dir->i_op->mkdir)
2036 		return -EPERM;
2037 
2038 	mode &= (S_IRWXUGO|S_ISVTX);
2039 	error = security_inode_mkdir(dir, dentry, mode);
2040 	if (error)
2041 		return error;
2042 
2043 	DQUOT_INIT(dir);
2044 	error = dir->i_op->mkdir(dir, dentry, mode);
2045 	if (!error)
2046 		fsnotify_mkdir(dir, dentry);
2047 	return error;
2048 }
2049 
2050 asmlinkage long sys_mkdirat(int dfd, const char __user *pathname, int mode)
2051 {
2052 	int error = 0;
2053 	char * tmp;
2054 	struct dentry *dentry;
2055 	struct nameidata nd;
2056 
2057 	error = user_path_parent(dfd, pathname, &nd, &tmp);
2058 	if (error)
2059 		goto out_err;
2060 
2061 	dentry = lookup_create(&nd, 1);
2062 	error = PTR_ERR(dentry);
2063 	if (IS_ERR(dentry))
2064 		goto out_unlock;
2065 
2066 	if (!IS_POSIXACL(nd.path.dentry->d_inode))
2067 		mode &= ~current->fs->umask;
2068 	error = mnt_want_write(nd.path.mnt);
2069 	if (error)
2070 		goto out_dput;
2071 	error = vfs_mkdir(nd.path.dentry->d_inode, dentry, mode);
2072 	mnt_drop_write(nd.path.mnt);
2073 out_dput:
2074 	dput(dentry);
2075 out_unlock:
2076 	mutex_unlock(&nd.path.dentry->d_inode->i_mutex);
2077 	path_put(&nd.path);
2078 	putname(tmp);
2079 out_err:
2080 	return error;
2081 }
2082 
2083 asmlinkage long sys_mkdir(const char __user *pathname, int mode)
2084 {
2085 	return sys_mkdirat(AT_FDCWD, pathname, mode);
2086 }
2087 
2088 /*
2089  * We try to drop the dentry early: we should have
2090  * a usage count of 2 if we're the only user of this
2091  * dentry, and if that is true (possibly after pruning
2092  * the dcache), then we drop the dentry now.
2093  *
2094  * A low-level filesystem can, if it choses, legally
2095  * do a
2096  *
2097  *	if (!d_unhashed(dentry))
2098  *		return -EBUSY;
2099  *
2100  * if it cannot handle the case of removing a directory
2101  * that is still in use by something else..
2102  */
2103 void dentry_unhash(struct dentry *dentry)
2104 {
2105 	dget(dentry);
2106 	shrink_dcache_parent(dentry);
2107 	spin_lock(&dcache_lock);
2108 	spin_lock(&dentry->d_lock);
2109 	if (atomic_read(&dentry->d_count) == 2)
2110 		__d_drop(dentry);
2111 	spin_unlock(&dentry->d_lock);
2112 	spin_unlock(&dcache_lock);
2113 }
2114 
2115 int vfs_rmdir(struct inode *dir, struct dentry *dentry)
2116 {
2117 	int error = may_delete(dir, dentry, 1);
2118 
2119 	if (error)
2120 		return error;
2121 
2122 	if (!dir->i_op || !dir->i_op->rmdir)
2123 		return -EPERM;
2124 
2125 	DQUOT_INIT(dir);
2126 
2127 	mutex_lock(&dentry->d_inode->i_mutex);
2128 	dentry_unhash(dentry);
2129 	if (d_mountpoint(dentry))
2130 		error = -EBUSY;
2131 	else {
2132 		error = security_inode_rmdir(dir, dentry);
2133 		if (!error) {
2134 			error = dir->i_op->rmdir(dir, dentry);
2135 			if (!error)
2136 				dentry->d_inode->i_flags |= S_DEAD;
2137 		}
2138 	}
2139 	mutex_unlock(&dentry->d_inode->i_mutex);
2140 	if (!error) {
2141 		d_delete(dentry);
2142 	}
2143 	dput(dentry);
2144 
2145 	return error;
2146 }
2147 
2148 static long do_rmdir(int dfd, const char __user *pathname)
2149 {
2150 	int error = 0;
2151 	char * name;
2152 	struct dentry *dentry;
2153 	struct nameidata nd;
2154 
2155 	error = user_path_parent(dfd, pathname, &nd, &name);
2156 	if (error)
2157 		return error;
2158 
2159 	switch(nd.last_type) {
2160 	case LAST_DOTDOT:
2161 		error = -ENOTEMPTY;
2162 		goto exit1;
2163 	case LAST_DOT:
2164 		error = -EINVAL;
2165 		goto exit1;
2166 	case LAST_ROOT:
2167 		error = -EBUSY;
2168 		goto exit1;
2169 	}
2170 
2171 	nd.flags &= ~LOOKUP_PARENT;
2172 
2173 	mutex_lock_nested(&nd.path.dentry->d_inode->i_mutex, I_MUTEX_PARENT);
2174 	dentry = lookup_hash(&nd);
2175 	error = PTR_ERR(dentry);
2176 	if (IS_ERR(dentry))
2177 		goto exit2;
2178 	error = mnt_want_write(nd.path.mnt);
2179 	if (error)
2180 		goto exit3;
2181 	error = vfs_rmdir(nd.path.dentry->d_inode, dentry);
2182 	mnt_drop_write(nd.path.mnt);
2183 exit3:
2184 	dput(dentry);
2185 exit2:
2186 	mutex_unlock(&nd.path.dentry->d_inode->i_mutex);
2187 exit1:
2188 	path_put(&nd.path);
2189 	putname(name);
2190 	return error;
2191 }
2192 
2193 asmlinkage long sys_rmdir(const char __user *pathname)
2194 {
2195 	return do_rmdir(AT_FDCWD, pathname);
2196 }
2197 
2198 int vfs_unlink(struct inode *dir, struct dentry *dentry)
2199 {
2200 	int error = may_delete(dir, dentry, 0);
2201 
2202 	if (error)
2203 		return error;
2204 
2205 	if (!dir->i_op || !dir->i_op->unlink)
2206 		return -EPERM;
2207 
2208 	DQUOT_INIT(dir);
2209 
2210 	mutex_lock(&dentry->d_inode->i_mutex);
2211 	if (d_mountpoint(dentry))
2212 		error = -EBUSY;
2213 	else {
2214 		error = security_inode_unlink(dir, dentry);
2215 		if (!error)
2216 			error = dir->i_op->unlink(dir, dentry);
2217 	}
2218 	mutex_unlock(&dentry->d_inode->i_mutex);
2219 
2220 	/* We don't d_delete() NFS sillyrenamed files--they still exist. */
2221 	if (!error && !(dentry->d_flags & DCACHE_NFSFS_RENAMED)) {
2222 		fsnotify_link_count(dentry->d_inode);
2223 		d_delete(dentry);
2224 	}
2225 
2226 	return error;
2227 }
2228 
2229 /*
2230  * Make sure that the actual truncation of the file will occur outside its
2231  * directory's i_mutex.  Truncate can take a long time if there is a lot of
2232  * writeout happening, and we don't want to prevent access to the directory
2233  * while waiting on the I/O.
2234  */
2235 static long do_unlinkat(int dfd, const char __user *pathname)
2236 {
2237 	int error;
2238 	char *name;
2239 	struct dentry *dentry;
2240 	struct nameidata nd;
2241 	struct inode *inode = NULL;
2242 
2243 	error = user_path_parent(dfd, pathname, &nd, &name);
2244 	if (error)
2245 		return error;
2246 
2247 	error = -EISDIR;
2248 	if (nd.last_type != LAST_NORM)
2249 		goto exit1;
2250 
2251 	nd.flags &= ~LOOKUP_PARENT;
2252 
2253 	mutex_lock_nested(&nd.path.dentry->d_inode->i_mutex, I_MUTEX_PARENT);
2254 	dentry = lookup_hash(&nd);
2255 	error = PTR_ERR(dentry);
2256 	if (!IS_ERR(dentry)) {
2257 		/* Why not before? Because we want correct error value */
2258 		if (nd.last.name[nd.last.len])
2259 			goto slashes;
2260 		inode = dentry->d_inode;
2261 		if (inode)
2262 			atomic_inc(&inode->i_count);
2263 		error = mnt_want_write(nd.path.mnt);
2264 		if (error)
2265 			goto exit2;
2266 		error = vfs_unlink(nd.path.dentry->d_inode, dentry);
2267 		mnt_drop_write(nd.path.mnt);
2268 	exit2:
2269 		dput(dentry);
2270 	}
2271 	mutex_unlock(&nd.path.dentry->d_inode->i_mutex);
2272 	if (inode)
2273 		iput(inode);	/* truncate the inode here */
2274 exit1:
2275 	path_put(&nd.path);
2276 	putname(name);
2277 	return error;
2278 
2279 slashes:
2280 	error = !dentry->d_inode ? -ENOENT :
2281 		S_ISDIR(dentry->d_inode->i_mode) ? -EISDIR : -ENOTDIR;
2282 	goto exit2;
2283 }
2284 
2285 asmlinkage long sys_unlinkat(int dfd, const char __user *pathname, int flag)
2286 {
2287 	if ((flag & ~AT_REMOVEDIR) != 0)
2288 		return -EINVAL;
2289 
2290 	if (flag & AT_REMOVEDIR)
2291 		return do_rmdir(dfd, pathname);
2292 
2293 	return do_unlinkat(dfd, pathname);
2294 }
2295 
2296 asmlinkage long sys_unlink(const char __user *pathname)
2297 {
2298 	return do_unlinkat(AT_FDCWD, pathname);
2299 }
2300 
2301 int vfs_symlink(struct inode *dir, struct dentry *dentry, const char *oldname)
2302 {
2303 	int error = may_create(dir, dentry);
2304 
2305 	if (error)
2306 		return error;
2307 
2308 	if (!dir->i_op || !dir->i_op->symlink)
2309 		return -EPERM;
2310 
2311 	error = security_inode_symlink(dir, dentry, oldname);
2312 	if (error)
2313 		return error;
2314 
2315 	DQUOT_INIT(dir);
2316 	error = dir->i_op->symlink(dir, dentry, oldname);
2317 	if (!error)
2318 		fsnotify_create(dir, dentry);
2319 	return error;
2320 }
2321 
2322 asmlinkage long sys_symlinkat(const char __user *oldname,
2323 			      int newdfd, const char __user *newname)
2324 {
2325 	int error;
2326 	char *from;
2327 	char *to;
2328 	struct dentry *dentry;
2329 	struct nameidata nd;
2330 
2331 	from = getname(oldname);
2332 	if (IS_ERR(from))
2333 		return PTR_ERR(from);
2334 
2335 	error = user_path_parent(newdfd, newname, &nd, &to);
2336 	if (error)
2337 		goto out_putname;
2338 
2339 	dentry = lookup_create(&nd, 0);
2340 	error = PTR_ERR(dentry);
2341 	if (IS_ERR(dentry))
2342 		goto out_unlock;
2343 
2344 	error = mnt_want_write(nd.path.mnt);
2345 	if (error)
2346 		goto out_dput;
2347 	error = vfs_symlink(nd.path.dentry->d_inode, dentry, from);
2348 	mnt_drop_write(nd.path.mnt);
2349 out_dput:
2350 	dput(dentry);
2351 out_unlock:
2352 	mutex_unlock(&nd.path.dentry->d_inode->i_mutex);
2353 	path_put(&nd.path);
2354 	putname(to);
2355 out_putname:
2356 	putname(from);
2357 	return error;
2358 }
2359 
2360 asmlinkage long sys_symlink(const char __user *oldname, const char __user *newname)
2361 {
2362 	return sys_symlinkat(oldname, AT_FDCWD, newname);
2363 }
2364 
2365 int vfs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *new_dentry)
2366 {
2367 	struct inode *inode = old_dentry->d_inode;
2368 	int error;
2369 
2370 	if (!inode)
2371 		return -ENOENT;
2372 
2373 	error = may_create(dir, new_dentry);
2374 	if (error)
2375 		return error;
2376 
2377 	if (dir->i_sb != inode->i_sb)
2378 		return -EXDEV;
2379 
2380 	/*
2381 	 * A link to an append-only or immutable file cannot be created.
2382 	 */
2383 	if (IS_APPEND(inode) || IS_IMMUTABLE(inode))
2384 		return -EPERM;
2385 	if (!dir->i_op || !dir->i_op->link)
2386 		return -EPERM;
2387 	if (S_ISDIR(inode->i_mode))
2388 		return -EPERM;
2389 
2390 	error = security_inode_link(old_dentry, dir, new_dentry);
2391 	if (error)
2392 		return error;
2393 
2394 	mutex_lock(&inode->i_mutex);
2395 	DQUOT_INIT(dir);
2396 	error = dir->i_op->link(old_dentry, dir, new_dentry);
2397 	mutex_unlock(&inode->i_mutex);
2398 	if (!error)
2399 		fsnotify_link(dir, inode, new_dentry);
2400 	return error;
2401 }
2402 
2403 /*
2404  * Hardlinks are often used in delicate situations.  We avoid
2405  * security-related surprises by not following symlinks on the
2406  * newname.  --KAB
2407  *
2408  * We don't follow them on the oldname either to be compatible
2409  * with linux 2.0, and to avoid hard-linking to directories
2410  * and other special files.  --ADM
2411  */
2412 asmlinkage long sys_linkat(int olddfd, const char __user *oldname,
2413 			   int newdfd, const char __user *newname,
2414 			   int flags)
2415 {
2416 	struct dentry *new_dentry;
2417 	struct nameidata nd;
2418 	struct path old_path;
2419 	int error;
2420 	char *to;
2421 
2422 	if ((flags & ~AT_SYMLINK_FOLLOW) != 0)
2423 		return -EINVAL;
2424 
2425 	error = user_path_at(olddfd, oldname,
2426 			     flags & AT_SYMLINK_FOLLOW ? LOOKUP_FOLLOW : 0,
2427 			     &old_path);
2428 	if (error)
2429 		return error;
2430 
2431 	error = user_path_parent(newdfd, newname, &nd, &to);
2432 	if (error)
2433 		goto out;
2434 	error = -EXDEV;
2435 	if (old_path.mnt != nd.path.mnt)
2436 		goto out_release;
2437 	new_dentry = lookup_create(&nd, 0);
2438 	error = PTR_ERR(new_dentry);
2439 	if (IS_ERR(new_dentry))
2440 		goto out_unlock;
2441 	error = mnt_want_write(nd.path.mnt);
2442 	if (error)
2443 		goto out_dput;
2444 	error = vfs_link(old_path.dentry, nd.path.dentry->d_inode, new_dentry);
2445 	mnt_drop_write(nd.path.mnt);
2446 out_dput:
2447 	dput(new_dentry);
2448 out_unlock:
2449 	mutex_unlock(&nd.path.dentry->d_inode->i_mutex);
2450 out_release:
2451 	path_put(&nd.path);
2452 	putname(to);
2453 out:
2454 	path_put(&old_path);
2455 
2456 	return error;
2457 }
2458 
2459 asmlinkage long sys_link(const char __user *oldname, const char __user *newname)
2460 {
2461 	return sys_linkat(AT_FDCWD, oldname, AT_FDCWD, newname, 0);
2462 }
2463 
2464 /*
2465  * The worst of all namespace operations - renaming directory. "Perverted"
2466  * doesn't even start to describe it. Somebody in UCB had a heck of a trip...
2467  * Problems:
2468  *	a) we can get into loop creation. Check is done in is_subdir().
2469  *	b) race potential - two innocent renames can create a loop together.
2470  *	   That's where 4.4 screws up. Current fix: serialization on
2471  *	   sb->s_vfs_rename_mutex. We might be more accurate, but that's another
2472  *	   story.
2473  *	c) we have to lock _three_ objects - parents and victim (if it exists).
2474  *	   And that - after we got ->i_mutex on parents (until then we don't know
2475  *	   whether the target exists).  Solution: try to be smart with locking
2476  *	   order for inodes.  We rely on the fact that tree topology may change
2477  *	   only under ->s_vfs_rename_mutex _and_ that parent of the object we
2478  *	   move will be locked.  Thus we can rank directories by the tree
2479  *	   (ancestors first) and rank all non-directories after them.
2480  *	   That works since everybody except rename does "lock parent, lookup,
2481  *	   lock child" and rename is under ->s_vfs_rename_mutex.
2482  *	   HOWEVER, it relies on the assumption that any object with ->lookup()
2483  *	   has no more than 1 dentry.  If "hybrid" objects will ever appear,
2484  *	   we'd better make sure that there's no link(2) for them.
2485  *	d) some filesystems don't support opened-but-unlinked directories,
2486  *	   either because of layout or because they are not ready to deal with
2487  *	   all cases correctly. The latter will be fixed (taking this sort of
2488  *	   stuff into VFS), but the former is not going away. Solution: the same
2489  *	   trick as in rmdir().
2490  *	e) conversion from fhandle to dentry may come in the wrong moment - when
2491  *	   we are removing the target. Solution: we will have to grab ->i_mutex
2492  *	   in the fhandle_to_dentry code. [FIXME - current nfsfh.c relies on
 *	   ->i_mutex on parents, which works but leads to some truly excessive
2494  *	   locking].
2495  */
/*
 * Directory flavour of rename, called from vfs_rename() when the source
 * is a directory.  Returns 0 on success or a negative errno.
 */
static int vfs_rename_dir(struct inode *old_dir, struct dentry *old_dentry,
			  struct inode *new_dir, struct dentry *new_dentry)
{
	int error = 0;
	struct inode *target;

	/*
	 * If we are going to change the parent - check write permissions,
	 * we'll need to flip '..'.
	 */
	if (new_dir != old_dir) {
		error = inode_permission(old_dentry->d_inode, MAY_WRITE);
		if (error)
			return error;
	}

	error = security_inode_rename(old_dir, old_dentry, new_dir, new_dentry);
	if (error)
		return error;

	/*
	 * If the destination name already has an inode, lock that victim and
	 * unhash its dentry for the duration of the ->rename() call.
	 * NOTE(review): dentry_unhash() presumably also takes the reference
	 * that the dput() below drops - confirm against its definition.
	 */
	target = new_dentry->d_inode;
	if (target) {
		mutex_lock(&target->i_mutex);
		dentry_unhash(new_dentry);
	}
	/* Neither end of the rename may be a mountpoint. */
	if (d_mountpoint(old_dentry)||d_mountpoint(new_dentry))
		error = -EBUSY;
	else
		error = old_dir->i_op->rename(old_dir, old_dentry, new_dir, new_dentry);
	if (target) {
		/* The replaced directory is gone: flag its inode S_DEAD. */
		if (!error)
			target->i_flags |= S_DEAD;
		mutex_unlock(&target->i_mutex);
		/* Put the destination dentry back in the hash if it is still unhashed. */
		if (d_unhashed(new_dentry))
			d_rehash(new_dentry);
		dput(new_dentry);
	}
	/* Update the dcache ourselves unless the filesystem says it does d_move(). */
	if (!error)
		if (!(old_dir->i_sb->s_type->fs_flags & FS_RENAME_DOES_D_MOVE))
			d_move(old_dentry,new_dentry);
	return error;
}
2538 
2539 static int vfs_rename_other(struct inode *old_dir, struct dentry *old_dentry,
2540 			    struct inode *new_dir, struct dentry *new_dentry)
2541 {
2542 	struct inode *target;
2543 	int error;
2544 
2545 	error = security_inode_rename(old_dir, old_dentry, new_dir, new_dentry);
2546 	if (error)
2547 		return error;
2548 
2549 	dget(new_dentry);
2550 	target = new_dentry->d_inode;
2551 	if (target)
2552 		mutex_lock(&target->i_mutex);
2553 	if (d_mountpoint(old_dentry)||d_mountpoint(new_dentry))
2554 		error = -EBUSY;
2555 	else
2556 		error = old_dir->i_op->rename(old_dir, old_dentry, new_dir, new_dentry);
2557 	if (!error) {
2558 		if (!(old_dir->i_sb->s_type->fs_flags & FS_RENAME_DOES_D_MOVE))
2559 			d_move(old_dentry, new_dentry);
2560 	}
2561 	if (target)
2562 		mutex_unlock(&target->i_mutex);
2563 	dput(new_dentry);
2564 	return error;
2565 }
2566 
2567 int vfs_rename(struct inode *old_dir, struct dentry *old_dentry,
2568 	       struct inode *new_dir, struct dentry *new_dentry)
2569 {
2570 	int error;
2571 	int is_dir = S_ISDIR(old_dentry->d_inode->i_mode);
2572 	const char *old_name;
2573 
2574 	if (old_dentry->d_inode == new_dentry->d_inode)
2575  		return 0;
2576 
2577 	error = may_delete(old_dir, old_dentry, is_dir);
2578 	if (error)
2579 		return error;
2580 
2581 	if (!new_dentry->d_inode)
2582 		error = may_create(new_dir, new_dentry);
2583 	else
2584 		error = may_delete(new_dir, new_dentry, is_dir);
2585 	if (error)
2586 		return error;
2587 
2588 	if (!old_dir->i_op || !old_dir->i_op->rename)
2589 		return -EPERM;
2590 
2591 	DQUOT_INIT(old_dir);
2592 	DQUOT_INIT(new_dir);
2593 
2594 	old_name = fsnotify_oldname_init(old_dentry->d_name.name);
2595 
2596 	if (is_dir)
2597 		error = vfs_rename_dir(old_dir,old_dentry,new_dir,new_dentry);
2598 	else
2599 		error = vfs_rename_other(old_dir,old_dentry,new_dir,new_dentry);
2600 	if (!error) {
2601 		const char *new_name = old_dentry->d_name.name;
2602 		fsnotify_move(old_dir, new_dir, old_name, new_name, is_dir,
2603 			      new_dentry->d_inode, old_dentry);
2604 	}
2605 	fsnotify_oldname_free(old_name);
2606 
2607 	return error;
2608 }
2609 
/*
 * renameat(2) entry point: rename @oldname (relative to @olddfd) to
 * @newname (relative to @newdfd).
 *
 * Both parents are resolved first, then locked together with lock_rename();
 * source and destination are looked up under those locks and handed to
 * vfs_rename().  The exit labels unwind in reverse order of acquisition.
 *
 * Returns 0 on success or a negative errno.
 */
asmlinkage long sys_renameat(int olddfd, const char __user *oldname,
			     int newdfd, const char __user *newname)
{
	struct dentry *old_dir, *new_dir;
	struct dentry *old_dentry, *new_dentry;
	struct dentry *trap;
	struct nameidata oldnd, newnd;
	char *from;
	char *to;
	int error;

	error = user_path_parent(olddfd, oldname, &oldnd, &from);
	if (error)
		goto exit;

	error = user_path_parent(newdfd, newname, &newnd, &to);
	if (error)
		goto exit1;

	/* Both parents must be on the same vfsmount. */
	error = -EXDEV;
	if (oldnd.path.mnt != newnd.path.mnt)
		goto exit2;

	/* -EBUSY covers both last-component checks below. */
	old_dir = oldnd.path.dentry;
	error = -EBUSY;
	if (oldnd.last_type != LAST_NORM)
		goto exit2;

	new_dir = newnd.path.dentry;
	if (newnd.last_type != LAST_NORM)
		goto exit2;

	oldnd.flags &= ~LOOKUP_PARENT;
	newnd.flags &= ~LOOKUP_PARENT;
	newnd.flags |= LOOKUP_RENAME_TARGET;

	/*
	 * Lock both parents; the returned dentry is compared against the
	 * source and the target below to reject ancestor/descendant renames.
	 */
	trap = lock_rename(new_dir, old_dir);

	old_dentry = lookup_hash(&oldnd);
	error = PTR_ERR(old_dentry);
	if (IS_ERR(old_dentry))
		goto exit3;
	/* source must exist */
	error = -ENOENT;
	if (!old_dentry->d_inode)
		goto exit4;
	/* unless the source is a directory trailing slashes give -ENOTDIR */
	if (!S_ISDIR(old_dentry->d_inode->i_mode)) {
		error = -ENOTDIR;
		if (oldnd.last.name[oldnd.last.len])
			goto exit4;
		if (newnd.last.name[newnd.last.len])
			goto exit4;
	}
	/* source should not be ancestor of target */
	error = -EINVAL;
	if (old_dentry == trap)
		goto exit4;
	new_dentry = lookup_hash(&newnd);
	error = PTR_ERR(new_dentry);
	if (IS_ERR(new_dentry))
		goto exit4;
	/* target should not be an ancestor of source */
	error = -ENOTEMPTY;
	if (new_dentry == trap)
		goto exit5;

	/* Hold write access to the mount for the duration of the rename. */
	error = mnt_want_write(oldnd.path.mnt);
	if (error)
		goto exit5;
	error = vfs_rename(old_dir->d_inode, old_dentry,
				   new_dir->d_inode, new_dentry);
	mnt_drop_write(oldnd.path.mnt);
exit5:
	dput(new_dentry);
exit4:
	dput(old_dentry);
exit3:
	unlock_rename(new_dir, old_dir);
exit2:
	path_put(&newnd.path);
	putname(to);
exit1:
	path_put(&oldnd.path);
	putname(from);
exit:
	return error;
}
2698 
2699 asmlinkage long sys_rename(const char __user *oldname, const char __user *newname)
2700 {
2701 	return sys_renameat(AT_FDCWD, oldname, AT_FDCWD, newname);
2702 }
2703 
2704 int vfs_readlink(struct dentry *dentry, char __user *buffer, int buflen, const char *link)
2705 {
2706 	int len;
2707 
2708 	len = PTR_ERR(link);
2709 	if (IS_ERR(link))
2710 		goto out;
2711 
2712 	len = strlen(link);
2713 	if (len > (unsigned) buflen)
2714 		len = buflen;
2715 	if (copy_to_user(buffer, link, len))
2716 		len = -EFAULT;
2717 out:
2718 	return len;
2719 }
2720 
2721 /*
2722  * A helper for ->readlink().  This should be used *ONLY* for symlinks that
2723  * have ->follow_link() touching nd only in nd_set_link().  Using (or not
2724  * using) it for any given inode is up to filesystem.
2725  */
2726 int generic_readlink(struct dentry *dentry, char __user *buffer, int buflen)
2727 {
2728 	struct nameidata nd;
2729 	void *cookie;
2730 	int res;
2731 
2732 	nd.depth = 0;
2733 	cookie = dentry->d_inode->i_op->follow_link(dentry, &nd);
2734 	if (IS_ERR(cookie))
2735 		return PTR_ERR(cookie);
2736 
2737 	res = vfs_readlink(dentry, buffer, buflen, nd_get_link(&nd));
2738 	if (dentry->d_inode->i_op->put_link)
2739 		dentry->d_inode->i_op->put_link(dentry, &nd, cookie);
2740 	return res;
2741 }
2742 
/* Exported wrapper around __vfs_follow_link(); returns its result unchanged. */
int vfs_follow_link(struct nameidata *nd, const char *link)
{
	int ret;

	ret = __vfs_follow_link(nd, link);
	return ret;
}
2747 
2748 /* get the link contents into pagecache */
2749 static char *page_getlink(struct dentry * dentry, struct page **ppage)
2750 {
2751 	struct page * page;
2752 	struct address_space *mapping = dentry->d_inode->i_mapping;
2753 	page = read_mapping_page(mapping, 0, NULL);
2754 	if (IS_ERR(page))
2755 		return (char*)page;
2756 	*ppage = page;
2757 	return kmap(page);
2758 }
2759 
2760 int page_readlink(struct dentry *dentry, char __user *buffer, int buflen)
2761 {
2762 	struct page *page = NULL;
2763 	char *s = page_getlink(dentry, &page);
2764 	int res = vfs_readlink(dentry,buffer,buflen,s);
2765 	if (page) {
2766 		kunmap(page);
2767 		page_cache_release(page);
2768 	}
2769 	return res;
2770 }
2771 
2772 void *page_follow_link_light(struct dentry *dentry, struct nameidata *nd)
2773 {
2774 	struct page *page = NULL;
2775 	nd_set_link(nd, page_getlink(dentry, &page));
2776 	return page;
2777 }
2778 
/*
 * ->put_link() counterpart of page_follow_link_light(): @cookie is the
 * page returned there (possibly NULL); unmap and release it.
 */
void page_put_link(struct dentry *dentry, struct nameidata *nd, void *cookie)
{
	struct page *pg = cookie;

	if (!pg)
		return;

	kunmap(pg);
	page_cache_release(pg);
}
2788 
2789 int __page_symlink(struct inode *inode, const char *symname, int len,
2790 		gfp_t gfp_mask)
2791 {
2792 	struct address_space *mapping = inode->i_mapping;
2793 	struct page *page;
2794 	void *fsdata;
2795 	int err;
2796 	char *kaddr;
2797 
2798 retry:
2799 	err = pagecache_write_begin(NULL, mapping, 0, len-1,
2800 				AOP_FLAG_UNINTERRUPTIBLE, &page, &fsdata);
2801 	if (err)
2802 		goto fail;
2803 
2804 	kaddr = kmap_atomic(page, KM_USER0);
2805 	memcpy(kaddr, symname, len-1);
2806 	kunmap_atomic(kaddr, KM_USER0);
2807 
2808 	err = pagecache_write_end(NULL, mapping, 0, len-1, len-1,
2809 							page, fsdata);
2810 	if (err < 0)
2811 		goto fail;
2812 	if (err < len-1)
2813 		goto retry;
2814 
2815 	mark_inode_dirty(inode);
2816 	return 0;
2817 fail:
2818 	return err;
2819 }
2820 
2821 int page_symlink(struct inode *inode, const char *symname, int len)
2822 {
2823 	return __page_symlink(inode, symname, len,
2824 			mapping_gfp_mask(inode->i_mapping));
2825 }
2826 
2827 const struct inode_operations page_symlink_inode_operations = {
2828 	.readlink	= generic_readlink,
2829 	.follow_link	= page_follow_link_light,
2830 	.put_link	= page_put_link,
2831 };
2832 
/* Path lookup and tree walking */
EXPORT_SYMBOL(user_path_at);
EXPORT_SYMBOL(path_lookup);
EXPORT_SYMBOL(kern_path);
EXPORT_SYMBOL(vfs_path_lookup);
EXPORT_SYMBOL(lookup_one_len);
EXPORT_SYMBOL(follow_down);
EXPORT_SYMBOL(follow_up);
EXPORT_SYMBOL(getname);

/* Permission checks and write access */
EXPORT_SYMBOL(inode_permission);
EXPORT_SYMBOL(vfs_permission);
EXPORT_SYMBOL(file_permission);
EXPORT_SYMBOL(generic_permission);
EXPORT_SYMBOL(get_write_access); /* binfmt_aout */

/* Directory locking and dcache helpers */
EXPORT_SYMBOL(lock_rename);
EXPORT_SYMBOL(unlock_rename);
EXPORT_SYMBOL(dentry_unhash);

/* Namespace-modifying operations */
EXPORT_SYMBOL(vfs_create);
EXPORT_SYMBOL(vfs_mkdir);
EXPORT_SYMBOL(vfs_mknod);
EXPORT_SYMBOL(vfs_symlink);
EXPORT_SYMBOL(vfs_link);
EXPORT_SYMBOL(vfs_unlink);
EXPORT_SYMBOL(vfs_rmdir);
EXPORT_SYMBOL(vfs_rename);

/* Symlink helpers */
EXPORT_SYMBOL(vfs_follow_link);
EXPORT_SYMBOL(vfs_readlink);
EXPORT_SYMBOL(generic_readlink);
EXPORT_SYMBOL(page_follow_link_light);
EXPORT_SYMBOL(page_put_link);
EXPORT_SYMBOL(page_readlink);
EXPORT_SYMBOL(__page_symlink);
EXPORT_SYMBOL(page_symlink);
EXPORT_SYMBOL(page_symlink_inode_operations);
2866