xref: /freebsd/sys/kern/vfs_lookup.c (revision 6af83ee0d2941d18880b6aaa2b4facd1d30c6106)
1 /*-
2  * Copyright (c) 1982, 1986, 1989, 1993
3  *	The Regents of the University of California.  All rights reserved.
4  * (c) UNIX System Laboratories, Inc.
5  * All or some portions of this file are derived from material licensed
6  * to the University of California by American Telephone and Telegraph
7  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
8  * the permission of UNIX System Laboratories, Inc.
9  *
10  * Redistribution and use in source and binary forms, with or without
11  * modification, are permitted provided that the following conditions
12  * are met:
13  * 1. Redistributions of source code must retain the above copyright
14  *    notice, this list of conditions and the following disclaimer.
15  * 2. Redistributions in binary form must reproduce the above copyright
16  *    notice, this list of conditions and the following disclaimer in the
17  *    documentation and/or other materials provided with the distribution.
18  * 4. Neither the name of the University nor the names of its contributors
19  *    may be used to endorse or promote products derived from this software
20  *    without specific prior written permission.
21  *
22  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
23  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
24  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
25  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
26  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
27  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
28  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
29  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
30  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
31  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32  * SUCH DAMAGE.
33  *
34  *	@(#)vfs_lookup.c	8.4 (Berkeley) 2/16/94
35  */
36 
37 #include <sys/cdefs.h>
38 __FBSDID("$FreeBSD$");
39 
40 #include "opt_ktrace.h"
41 #include "opt_mac.h"
42 
43 #include <sys/param.h>
44 #include <sys/systm.h>
45 #include <sys/kernel.h>
46 #include <sys/lock.h>
47 #include <sys/mac.h>
48 #include <sys/mutex.h>
49 #include <sys/namei.h>
50 #include <sys/vnode.h>
51 #include <sys/mount.h>
52 #include <sys/filedesc.h>
53 #include <sys/proc.h>
54 #include <sys/syscallsubr.h>
55 #ifdef KTRACE
56 #include <sys/ktrace.h>
57 #endif
58 
59 #include <vm/uma.h>
60 
61 #define NAMEI_DIAGNOSTIC 1
62 #undef NAMEI_DIAGNOSTIC
63 
64 /*
65  * Allocation zone for namei
66  */
67 uma_zone_t namei_zone;
68 
69 static void
70 nameiinit(void *dummy __unused)
71 {
72 	namei_zone = uma_zcreate("NAMEI", MAXPATHLEN, NULL, NULL, NULL, NULL,
73 	    UMA_ALIGN_PTR, 0);
74 
75 }
76 SYSINIT(vfs, SI_SUB_VFS, SI_ORDER_SECOND, nameiinit, NULL)
77 
78 /*
79  * Convert a pathname into a pointer to a locked inode.
80  *
81  * The FOLLOW flag is set when symbolic links are to be followed
82  * when they occur at the end of the name translation process.
83  * Symbolic links are always followed for all other pathname
84  * components other than the last.
85  *
86  * The segflg defines whether the name is to be copied from user
87  * space or kernel space.
88  *
89  * Overall outline of namei:
90  *
91  *	copy in name
92  *	get starting directory
93  *	while (!done && !error) {
94  *		call lookup to search path.
95  *		if symbolic link, massage name in buffer and continue
96  *	}
97  */
98 int
99 namei(ndp)
100 	register struct nameidata *ndp;
101 {
102 	register struct filedesc *fdp;	/* pointer to file descriptor state */
103 	register char *cp;		/* pointer into pathname argument */
104 	register struct vnode *dp;	/* the directory we are searching */
105 	struct iovec aiov;		/* uio for reading symbolic links */
106 	struct uio auio;
107 	int error, linklen;
108 	struct componentname *cnp = &ndp->ni_cnd;
109 	struct thread *td = cnp->cn_thread;
110 	struct proc *p = td->td_proc;
111 	struct mount *mp;
112 	int vfslocked;
113 
114 	ndp->ni_cnd.cn_cred = ndp->ni_cnd.cn_thread->td_ucred;
115 	KASSERT(cnp->cn_cred && p, ("namei: bad cred/proc"));
116 	KASSERT((cnp->cn_nameiop & (~OPMASK)) == 0,
117 	    ("namei: nameiop contaminated with flags"));
118 	KASSERT((cnp->cn_flags & OPMASK) == 0,
119 	    ("namei: flags contaminated with nameiops"));
120 	fdp = p->p_fd;
121 
122 	/*
123 	 * Get a buffer for the name to be translated, and copy the
124 	 * name into the buffer.
125 	 */
126 	if ((cnp->cn_flags & HASBUF) == 0)
127 		cnp->cn_pnbuf = uma_zalloc(namei_zone, M_WAITOK);
128 	if (ndp->ni_segflg == UIO_SYSSPACE)
129 		error = copystr(ndp->ni_dirp, cnp->cn_pnbuf,
130 			    MAXPATHLEN, (size_t *)&ndp->ni_pathlen);
131 	else
132 		error = copyinstr(ndp->ni_dirp, cnp->cn_pnbuf,
133 			    MAXPATHLEN, (size_t *)&ndp->ni_pathlen);
134 
135 	/*
136 	 * Don't allow empty pathnames.
137 	 */
138 	if (!error && *cnp->cn_pnbuf == '\0')
139 		error = ENOENT;
140 
141 	if (error) {
142 		uma_zfree(namei_zone, cnp->cn_pnbuf);
143 #ifdef DIAGNOSTIC
144 		cnp->cn_pnbuf = NULL;
145 		cnp->cn_nameptr = NULL;
146 #endif
147 		ndp->ni_vp = NULL;
148 		return (error);
149 	}
150 	ndp->ni_loopcnt = 0;
151 #ifdef KTRACE
152 	if (KTRPOINT(td, KTR_NAMEI)) {
153 		KASSERT(cnp->cn_thread == curthread,
154 		    ("namei not using curthread"));
155 		ktrnamei(cnp->cn_pnbuf);
156 	}
157 #endif
158 
159 	/*
160 	 * Get starting point for the translation.
161 	 */
162 	FILEDESC_LOCK(fdp);
163 	ndp->ni_rootdir = fdp->fd_rdir;
164 	ndp->ni_topdir = fdp->fd_jdir;
165 
166 	dp = fdp->fd_cdir;
167 	vfslocked = VFS_LOCK_GIANT(dp->v_mount);
168 	VREF(dp);
169 	FILEDESC_UNLOCK(fdp);
170 	for (;;) {
171 		/*
172 		 * Check if root directory should replace current directory.
173 		 * Done at start of translation and after symbolic link.
174 		 */
175 		cnp->cn_nameptr = cnp->cn_pnbuf;
176 		if (*(cnp->cn_nameptr) == '/') {
177 			vrele(dp);
178 			VFS_UNLOCK_GIANT(vfslocked);
179 			while (*(cnp->cn_nameptr) == '/') {
180 				cnp->cn_nameptr++;
181 				ndp->ni_pathlen--;
182 			}
183 			dp = ndp->ni_rootdir;
184 			vfslocked = VFS_LOCK_GIANT(dp->v_mount);
185 			VREF(dp);
186 		}
187 		if (vfslocked)
188 			ndp->ni_cnd.cn_flags |= GIANTHELD;
189 		ndp->ni_startdir = dp;
190 		error = lookup(ndp);
191 		if (error) {
192 			uma_zfree(namei_zone, cnp->cn_pnbuf);
193 #ifdef DIAGNOSTIC
194 			cnp->cn_pnbuf = NULL;
195 			cnp->cn_nameptr = NULL;
196 #endif
197 			return (error);
198 		}
199 		vfslocked = (ndp->ni_cnd.cn_flags & GIANTHELD) != 0;
200 		ndp->ni_cnd.cn_flags &= ~GIANTHELD;
201 		/*
202 		 * Check for symbolic link
203 		 */
204 		if ((cnp->cn_flags & ISSYMLINK) == 0) {
205 			if ((cnp->cn_flags & (SAVENAME | SAVESTART)) == 0) {
206 				uma_zfree(namei_zone, cnp->cn_pnbuf);
207 #ifdef DIAGNOSTIC
208 				cnp->cn_pnbuf = NULL;
209 				cnp->cn_nameptr = NULL;
210 #endif
211 			} else
212 				cnp->cn_flags |= HASBUF;
213 
214 			if ((cnp->cn_flags & MPSAFE) == 0) {
215 				VFS_UNLOCK_GIANT(vfslocked);
216 			} else if (vfslocked)
217 				ndp->ni_cnd.cn_flags |= GIANTHELD;
218 			return (0);
219 		}
220 		if ((cnp->cn_flags & LOCKPARENT) && ndp->ni_pathlen == 1)
221 			VOP_UNLOCK(ndp->ni_dvp, 0, td);
222 		if (ndp->ni_loopcnt++ >= MAXSYMLINKS) {
223 			error = ELOOP;
224 			break;
225 		}
226 #ifdef MAC
227 		if ((cnp->cn_flags & NOMACCHECK) == 0) {
228 			error = mac_check_vnode_readlink(td->td_ucred,
229 			    ndp->ni_vp);
230 			if (error)
231 				break;
232 		}
233 #endif
234 		if (ndp->ni_pathlen > 1)
235 			cp = uma_zalloc(namei_zone, M_WAITOK);
236 		else
237 			cp = cnp->cn_pnbuf;
238 		aiov.iov_base = cp;
239 		aiov.iov_len = MAXPATHLEN;
240 		auio.uio_iov = &aiov;
241 		auio.uio_iovcnt = 1;
242 		auio.uio_offset = 0;
243 		auio.uio_rw = UIO_READ;
244 		auio.uio_segflg = UIO_SYSSPACE;
245 		auio.uio_td = (struct thread *)0;
246 		auio.uio_resid = MAXPATHLEN;
247 		error = VOP_READLINK(ndp->ni_vp, &auio, cnp->cn_cred);
248 		if (error) {
249 			if (ndp->ni_pathlen > 1)
250 				uma_zfree(namei_zone, cp);
251 			break;
252 		}
253 		linklen = MAXPATHLEN - auio.uio_resid;
254 		if (linklen == 0) {
255 			if (ndp->ni_pathlen > 1)
256 				uma_zfree(namei_zone, cp);
257 			error = ENOENT;
258 			break;
259 		}
260 		if (linklen + ndp->ni_pathlen >= MAXPATHLEN) {
261 			if (ndp->ni_pathlen > 1)
262 				uma_zfree(namei_zone, cp);
263 			error = ENAMETOOLONG;
264 			break;
265 		}
266 		if (ndp->ni_pathlen > 1) {
267 			bcopy(ndp->ni_next, cp + linklen, ndp->ni_pathlen);
268 			uma_zfree(namei_zone, cnp->cn_pnbuf);
269 			cnp->cn_pnbuf = cp;
270 		} else
271 			cnp->cn_pnbuf[linklen] = '\0';
272 		ndp->ni_pathlen += linklen;
273 		vput(ndp->ni_vp);
274 		dp = ndp->ni_dvp;
275 	}
276 	uma_zfree(namei_zone, cnp->cn_pnbuf);
277 #ifdef DIAGNOSTIC
278 	cnp->cn_pnbuf = NULL;
279 	cnp->cn_nameptr = NULL;
280 #endif
281 	vrele(ndp->ni_dvp);
282 	mp = ndp->ni_vp->v_mount;
283 	vput(ndp->ni_vp);
284 	VFS_UNLOCK_GIANT(vfslocked);
285 	ndp->ni_vp = NULL;
286 	return (error);
287 }
288 
289 /*
290  * Search a pathname.
291  * This is a very central and rather complicated routine.
292  *
293  * The pathname is pointed to by ni_ptr and is of length ni_pathlen.
294  * The starting directory is taken from ni_startdir. The pathname is
295  * descended until done, or a symbolic link is encountered. The variable
296  * ni_more is clear if the path is completed; it is set to one if a
297  * symbolic link needing interpretation is encountered.
298  *
299  * The flag argument is LOOKUP, CREATE, RENAME, or DELETE depending on
300  * whether the name is to be looked up, created, renamed, or deleted.
301  * When CREATE, RENAME, or DELETE is specified, information usable in
302  * creating, renaming, or deleting a directory entry may be calculated.
303  * If flag has LOCKPARENT or'ed into it, the parent directory is returned
304  * locked. If flag has WANTPARENT or'ed into it, the parent directory is
305  * returned unlocked. Otherwise the parent directory is not returned. If
306  * the target of the pathname exists and LOCKLEAF is or'ed into the flag
307  * the target is returned locked, otherwise it is returned unlocked.
308  * When creating or renaming and LOCKPARENT is specified, the target may not
309  * be ".".  When deleting and LOCKPARENT is specified, the target may be ".".
310  *
311  * Overall outline of lookup:
312  *
313  * dirloop:
314  *	identify next component of name at ndp->ni_ptr
315  *	handle degenerate case where name is null string
316  *	if .. and crossing mount points and on mounted filesys, find parent
317  *	call VOP_LOOKUP routine for next component name
318  *	    directory vnode returned in ni_dvp, unlocked unless LOCKPARENT set
319  *	    component vnode returned in ni_vp (if it exists), locked.
320  *	if result vnode is mounted on and crossing mount points,
321  *	    find mounted on vnode
322  *	if more components of name, do next level at dirloop
323  *	return the answer in ni_vp, locked if LOCKLEAF set
324  *	    if LOCKPARENT set, return locked parent in ni_dvp
325  *	    if WANTPARENT set, return unlocked parent in ni_dvp
326  */
327 int
328 lookup(ndp)
329 	register struct nameidata *ndp;
330 {
331 	register char *cp;		/* pointer into pathname argument */
332 	register struct vnode *dp = 0;	/* the directory we are searching */
333 	struct vnode *tdp;		/* saved dp */
334 	struct mount *mp;		/* mount table entry */
335 	int docache;			/* == 0 do not cache last component */
336 	int wantparent;			/* 1 => wantparent or lockparent flag */
337 	int rdonly;			/* lookup read-only flag bit */
338 	int trailing_slash;
339 	int error = 0;
340 	int dpunlocked = 0;		/* dp has already been unlocked */
341 	struct componentname *cnp = &ndp->ni_cnd;
342 	struct thread *td = cnp->cn_thread;
343 	int vfslocked;
344 	int tvfslocked;
345 
346 	/*
347 	 * Setup: break out flag bits into variables.
348 	 */
349 	vfslocked = (ndp->ni_cnd.cn_flags & GIANTHELD) != 0;
350 	ndp->ni_cnd.cn_flags &= ~GIANTHELD;
351 	wantparent = cnp->cn_flags & (LOCKPARENT | WANTPARENT);
352 	docache = (cnp->cn_flags & NOCACHE) ^ NOCACHE;
353 	if (cnp->cn_nameiop == DELETE ||
354 	    (wantparent && cnp->cn_nameiop != CREATE &&
355 	     cnp->cn_nameiop != LOOKUP))
356 		docache = 0;
357 	rdonly = cnp->cn_flags & RDONLY;
358 	ndp->ni_dvp = NULL;
359 	cnp->cn_flags &= ~ISSYMLINK;
360 	dp = ndp->ni_startdir;
361 	ndp->ni_startdir = NULLVP;
362 	vn_lock(dp, LK_EXCLUSIVE | LK_RETRY, td);
363 
364 dirloop:
365 	/*
366 	 * Search a new directory.
367 	 *
368 	 * The last component of the filename is left accessible via
369 	 * cnp->cn_nameptr for callers that need the name. Callers needing
370 	 * the name set the SAVENAME flag. When done, they assume
371 	 * responsibility for freeing the pathname buffer.
372 	 */
373 	cnp->cn_consume = 0;
374 	for (cp = cnp->cn_nameptr; *cp != 0 && *cp != '/'; cp++)
375 		continue;
376 	cnp->cn_namelen = cp - cnp->cn_nameptr;
377 	if (cnp->cn_namelen > NAME_MAX) {
378 		error = ENAMETOOLONG;
379 		goto bad;
380 	}
381 #ifdef NAMEI_DIAGNOSTIC
382 	{ char c = *cp;
383 	*cp = '\0';
384 	printf("{%s}: ", cnp->cn_nameptr);
385 	*cp = c; }
386 #endif
387 	ndp->ni_pathlen -= cnp->cn_namelen;
388 	ndp->ni_next = cp;
389 
390 	/*
391 	 * Replace multiple slashes by a single slash and trailing slashes
392 	 * by a null.  This must be done before VOP_LOOKUP() because some
393 	 * fs's don't know about trailing slashes.  Remember if there were
394 	 * trailing slashes to handle symlinks, existing non-directories
395 	 * and non-existing files that won't be directories specially later.
396 	 */
397 	trailing_slash = 0;
398 	while (*cp == '/' && (cp[1] == '/' || cp[1] == '\0')) {
399 		cp++;
400 		ndp->ni_pathlen--;
401 		if (*cp == '\0') {
402 			trailing_slash = 1;
403 			*ndp->ni_next = '\0';	/* XXX for direnter() ... */
404 		}
405 	}
406 	ndp->ni_next = cp;
407 
408 	cnp->cn_flags |= MAKEENTRY;
409 	if (*cp == '\0' && docache == 0)
410 		cnp->cn_flags &= ~MAKEENTRY;
411 	if (cnp->cn_namelen == 2 &&
412 	    cnp->cn_nameptr[1] == '.' && cnp->cn_nameptr[0] == '.')
413 		cnp->cn_flags |= ISDOTDOT;
414 	else
415 		cnp->cn_flags &= ~ISDOTDOT;
416 	if (*ndp->ni_next == 0)
417 		cnp->cn_flags |= ISLASTCN;
418 	else
419 		cnp->cn_flags &= ~ISLASTCN;
420 
421 
422 	/*
423 	 * Check for degenerate name (e.g. / or "")
424 	 * which is a way of talking about a directory,
425 	 * e.g. like "/." or ".".
426 	 */
427 	if (cnp->cn_nameptr[0] == '\0') {
428 		if (dp->v_type != VDIR) {
429 			error = ENOTDIR;
430 			goto bad;
431 		}
432 		if (cnp->cn_nameiop != LOOKUP) {
433 			error = EISDIR;
434 			goto bad;
435 		}
436 		if (wantparent) {
437 			ndp->ni_dvp = dp;
438 			VREF(dp);
439 		}
440 		ndp->ni_vp = dp;
441 		if (!(cnp->cn_flags & (LOCKPARENT | LOCKLEAF)))
442 			VOP_UNLOCK(dp, 0, td);
443 		/* XXX This should probably move to the top of function. */
444 		if (cnp->cn_flags & SAVESTART)
445 			panic("lookup: SAVESTART");
446 		goto success;
447 	}
448 
449 	/*
450 	 * Handle "..": two special cases.
451 	 * 1. If at root directory (e.g. after chroot)
452 	 *    or at absolute root directory
453 	 *    then ignore it so can't get out.
454 	 * 2. If this vnode is the root of a mounted
455 	 *    filesystem, then replace it with the
456 	 *    vnode which was mounted on so we take the
457 	 *    .. in the other filesystem.
458 	 * 3. If the vnode is the top directory of
459 	 *    the jail or chroot, don't let them out.
460 	 */
461 	if (cnp->cn_flags & ISDOTDOT) {
462 		for (;;) {
463 			if (dp == ndp->ni_rootdir ||
464 			    dp == ndp->ni_topdir ||
465 			    dp == rootvnode) {
466 				ndp->ni_dvp = dp;
467 				ndp->ni_vp = dp;
468 				VREF(dp);
469 				goto nextname;
470 			}
471 			if ((dp->v_vflag & VV_ROOT) == 0 ||
472 			    (cnp->cn_flags & NOCROSSMOUNT))
473 				break;
474 			if (dp->v_mount == NULL) {	/* forced unmount */
475 				error = EBADF;
476 				goto bad;
477 			}
478 			tdp = dp;
479 			tvfslocked = vfslocked;
480 			dp = dp->v_mount->mnt_vnodecovered;
481 			vput(tdp);
482 			vfslocked = VFS_LOCK_GIANT(dp->v_mount);
483 			VFS_UNLOCK_GIANT(tvfslocked);
484 			VREF(dp);
485 			vn_lock(dp, LK_EXCLUSIVE | LK_RETRY, td);
486 		}
487 	}
488 
489 	/*
490 	 * We now have a segment name to search for, and a directory to search.
491 	 */
492 unionlookup:
493 #ifdef MAC
494 	if ((cnp->cn_flags & NOMACCHECK) == 0) {
495 		error = mac_check_vnode_lookup(td->td_ucred, dp, cnp);
496 		if (error)
497 			goto bad;
498 	}
499 #endif
500 	ndp->ni_dvp = dp;
501 	ndp->ni_vp = NULL;
502 	cnp->cn_flags &= ~PDIRUNLOCK;
503 	ASSERT_VOP_LOCKED(dp, "lookup");
504 #ifdef NAMEI_DIAGNOSTIC
505 	vprint("lookup in", dp);
506 #endif
507 	if ((error = VOP_LOOKUP(dp, &ndp->ni_vp, cnp)) != 0) {
508 		KASSERT(ndp->ni_vp == NULL, ("leaf should be empty"));
509 #ifdef NAMEI_DIAGNOSTIC
510 		printf("not found\n");
511 #endif
512 		if ((error == ENOENT) &&
513 		    (dp->v_vflag & VV_ROOT) && (dp->v_mount != NULL) &&
514 		    (dp->v_mount->mnt_flag & MNT_UNION)) {
515 			tdp = dp;
516 			tvfslocked = vfslocked;
517 			dp = dp->v_mount->mnt_vnodecovered;
518 			if (cnp->cn_flags & PDIRUNLOCK)
519 				vrele(tdp);
520 			else
521 				vput(tdp);
522 			vfslocked = VFS_LOCK_GIANT(dp->v_mount);
523 			VFS_UNLOCK_GIANT(tvfslocked);
524 			VREF(dp);
525 			vn_lock(dp, LK_EXCLUSIVE | LK_RETRY, td);
526 			goto unionlookup;
527 		}
528 
529 		if (error != EJUSTRETURN)
530 			goto bad;
531 		/*
532 		 * If creating and at end of pathname, then can consider
533 		 * allowing file to be created.
534 		 */
535 		if (rdonly) {
536 			error = EROFS;
537 			goto bad;
538 		}
539 		if (*cp == '\0' && trailing_slash &&
540 		     !(cnp->cn_flags & WILLBEDIR)) {
541 			error = ENOENT;
542 			goto bad;
543 		}
544 		/*
545 		 * We return with ni_vp NULL to indicate that the entry
546 		 * doesn't currently exist, leaving a pointer to the
547 		 * (possibly locked) directory inode in ndp->ni_dvp.
548 		 */
549 		if (cnp->cn_flags & SAVESTART) {
550 			ndp->ni_startdir = ndp->ni_dvp;
551 			VREF(ndp->ni_startdir);
552 		}
553 		goto success;
554 	}
555 #ifdef NAMEI_DIAGNOSTIC
556 	printf("found\n");
557 #endif
558 
559 	ASSERT_VOP_LOCKED(ndp->ni_vp, "lookup");
560 
561 	/*
562 	 * Take into account any additional components consumed by
563 	 * the underlying filesystem.
564 	 */
565 	if (cnp->cn_consume > 0) {
566 		cnp->cn_nameptr += cnp->cn_consume;
567 		ndp->ni_next += cnp->cn_consume;
568 		ndp->ni_pathlen -= cnp->cn_consume;
569 		cnp->cn_consume = 0;
570 	}
571 
572 	dp = ndp->ni_vp;
573 
574 	/*
575 	 * Check to see if the vnode has been mounted on;
576 	 * if so find the root of the mounted filesystem.
577 	 */
578 	while (dp->v_type == VDIR && (mp = dp->v_mountedhere) &&
579 	       (cnp->cn_flags & NOCROSSMOUNT) == 0) {
580 		if (vfs_busy(mp, 0, 0, td))
581 			continue;
582 		VOP_UNLOCK(dp, 0, td);
583 		tvfslocked = VFS_LOCK_GIANT(mp);
584 		error = VFS_ROOT(mp, &tdp, td);
585 		vfs_unbusy(mp, td);
586 		if (error) {
587 			VFS_UNLOCK_GIANT(tvfslocked);
588 			dpunlocked = 1;
589 			goto bad2;
590 		}
591 		vrele(dp);
592 		VFS_UNLOCK_GIANT(vfslocked);
593 		ndp->ni_vp = dp = tdp;
594 		vfslocked = tvfslocked;
595 	}
596 
597 	/*
598 	 * Check for symbolic link
599 	 */
600 	if ((dp->v_type == VLNK) &&
601 	    ((cnp->cn_flags & FOLLOW) || trailing_slash ||
602 	     *ndp->ni_next == '/')) {
603 		cnp->cn_flags |= ISSYMLINK;
604 		if (dp->v_mount == NULL) {
605 			/* We can't know whether the directory was mounted with
606 			 * NOSYMFOLLOW, so we can't follow safely. */
607 			error = EBADF;
608 			goto bad2;
609 		}
610 		if (dp->v_mount->mnt_flag & MNT_NOSYMFOLLOW) {
611 			error = EACCES;
612 			goto bad2;
613 		}
614 		goto success;
615 	}
616 
617 	/*
618 	 * Check for bogus trailing slashes.
619 	 */
620 	if (trailing_slash && dp->v_type != VDIR) {
621 		error = ENOTDIR;
622 		goto bad2;
623 	}
624 
625 nextname:
626 	/*
627 	 * Not a symbolic link.  If more pathname,
628 	 * continue at next component, else return.
629 	 */
630 	if (*ndp->ni_next == '/') {
631 		cnp->cn_nameptr = ndp->ni_next;
632 		while (*cnp->cn_nameptr == '/') {
633 			cnp->cn_nameptr++;
634 			ndp->ni_pathlen--;
635 		}
636 		if (ndp->ni_dvp != ndp->ni_vp)
637 			ASSERT_VOP_UNLOCKED(ndp->ni_dvp, "lookup");
638 		vrele(ndp->ni_dvp);
639 		goto dirloop;
640 	}
641 	/*
642 	 * Disallow directory write attempts on read-only filesystems.
643 	 */
644 	if (rdonly &&
645 	    (cnp->cn_nameiop == DELETE || cnp->cn_nameiop == RENAME)) {
646 		error = EROFS;
647 		goto bad2;
648 	}
649 	if (cnp->cn_flags & SAVESTART) {
650 		ndp->ni_startdir = ndp->ni_dvp;
651 		VREF(ndp->ni_startdir);
652 	}
653 	if (!wantparent)
654 		vrele(ndp->ni_dvp);
655 
656 	if ((cnp->cn_flags & LOCKLEAF) == 0)
657 		VOP_UNLOCK(dp, 0, td);
658 success:
659 	if (vfslocked)
660 		ndp->ni_cnd.cn_flags |= GIANTHELD;
661 	return (0);
662 
663 bad2:
664 	if ((cnp->cn_flags & (LOCKPARENT | PDIRUNLOCK)) == LOCKPARENT &&
665 	    *ndp->ni_next == '\0')
666 		VOP_UNLOCK(ndp->ni_dvp, 0, td);
667 	vrele(ndp->ni_dvp);
668 bad:
669 	if (dpunlocked)
670 		vrele(dp);
671 	else
672 		vput(dp);
673 	VFS_UNLOCK_GIANT(vfslocked);
674 	ndp->ni_cnd.cn_flags &= ~GIANTHELD;
675 	ndp->ni_vp = NULL;
676 	return (error);
677 }
678 
679 /*
680  * relookup - lookup a path name component
681  *    Used by lookup to re-aquire things.
682  */
683 int
684 relookup(dvp, vpp, cnp)
685 	struct vnode *dvp, **vpp;
686 	struct componentname *cnp;
687 {
688 	struct thread *td = cnp->cn_thread;
689 	struct vnode *dp = 0;		/* the directory we are searching */
690 	int docache;			/* == 0 do not cache last component */
691 	int wantparent;			/* 1 => wantparent or lockparent flag */
692 	int rdonly;			/* lookup read-only flag bit */
693 	int error = 0;
694 
695 	/*
696 	 * Setup: break out flag bits into variables.
697 	 */
698 	wantparent = cnp->cn_flags & (LOCKPARENT|WANTPARENT);
699 	docache = (cnp->cn_flags & NOCACHE) ^ NOCACHE;
700 	if (cnp->cn_nameiop == DELETE ||
701 	    (wantparent && cnp->cn_nameiop != CREATE))
702 		docache = 0;
703 	rdonly = cnp->cn_flags & RDONLY;
704 	cnp->cn_flags &= ~ISSYMLINK;
705 	dp = dvp;
706 	vn_lock(dp, LK_EXCLUSIVE | LK_RETRY, td);
707 
708 /* dirloop: */
709 	/*
710 	 * Search a new directory.
711 	 *
712 	 * The last component of the filename is left accessible via
713 	 * cnp->cn_nameptr for callers that need the name. Callers needing
714 	 * the name set the SAVENAME flag. When done, they assume
715 	 * responsibility for freeing the pathname buffer.
716 	 */
717 #ifdef NAMEI_DIAGNOSTIC
718 	printf("{%s}: ", cnp->cn_nameptr);
719 #endif
720 
721 	/*
722 	 * Check for degenerate name (e.g. / or "")
723 	 * which is a way of talking about a directory,
724 	 * e.g. like "/." or ".".
725 	 */
726 	if (cnp->cn_nameptr[0] == '\0') {
727 		if (cnp->cn_nameiop != LOOKUP || wantparent) {
728 			error = EISDIR;
729 			goto bad;
730 		}
731 		if (dp->v_type != VDIR) {
732 			error = ENOTDIR;
733 			goto bad;
734 		}
735 		if (!(cnp->cn_flags & LOCKLEAF))
736 			VOP_UNLOCK(dp, 0, td);
737 		*vpp = dp;
738 		/* XXX This should probably move to the top of function. */
739 		if (cnp->cn_flags & SAVESTART)
740 			panic("lookup: SAVESTART");
741 		return (0);
742 	}
743 
744 	if (cnp->cn_flags & ISDOTDOT)
745 		panic ("relookup: lookup on dot-dot");
746 
747 	/*
748 	 * We now have a segment name to search for, and a directory to search.
749 	 */
750 #ifdef NAMEI_DIAGNOSTIC
751 	vprint("search in:", dp);
752 #endif
753 	if ((error = VOP_LOOKUP(dp, vpp, cnp)) != 0) {
754 		KASSERT(*vpp == NULL, ("leaf should be empty"));
755 		if (error != EJUSTRETURN)
756 			goto bad;
757 		/*
758 		 * If creating and at end of pathname, then can consider
759 		 * allowing file to be created.
760 		 */
761 		if (rdonly) {
762 			error = EROFS;
763 			goto bad;
764 		}
765 		/* ASSERT(dvp == ndp->ni_startdir) */
766 		if (cnp->cn_flags & SAVESTART)
767 			VREF(dvp);
768 		/*
769 		 * We return with ni_vp NULL to indicate that the entry
770 		 * doesn't currently exist, leaving a pointer to the
771 		 * (possibly locked) directory inode in ndp->ni_dvp.
772 		 */
773 		return (0);
774 	}
775 	dp = *vpp;
776 
777 	/*
778 	 * Check for symbolic link
779 	 */
780 	KASSERT(dp->v_type != VLNK || !(cnp->cn_flags & FOLLOW),
781 	    ("relookup: symlink found.\n"));
782 
783 	/*
784 	 * Disallow directory write attempts on read-only filesystems.
785 	 */
786 	if (rdonly &&
787 	    (cnp->cn_nameiop == DELETE || cnp->cn_nameiop == RENAME)) {
788 		error = EROFS;
789 		goto bad2;
790 	}
791 	/* ASSERT(dvp == ndp->ni_startdir) */
792 	if (cnp->cn_flags & SAVESTART)
793 		VREF(dvp);
794 
795 	if (!wantparent)
796 		vrele(dvp);
797 
798 	if ((cnp->cn_flags & LOCKLEAF) == 0)
799 		VOP_UNLOCK(dp, 0, td);
800 	return (0);
801 
802 bad2:
803 	if ((cnp->cn_flags & LOCKPARENT) && (cnp->cn_flags & ISLASTCN))
804 		VOP_UNLOCK(dvp, 0, td);
805 	vrele(dvp);
806 bad:
807 	vput(dp);
808 	*vpp = NULL;
809 	return (error);
810 }
811 
812 /*
813  * Determine if there is a suitable alternate filename under the specified
814  * prefix for the specified path.  If the create flag is set, then the
815  * alternate prefix will be used so long as the parent directory exists.
816  * This is used by the various compatiblity ABIs so that Linux binaries prefer
817  * files under /compat/linux for example.  The chosen path (whether under
818  * the prefix or under /) is returned in a kernel malloc'd buffer pointed
819  * to by pathbuf.  The caller is responsible for free'ing the buffer from
820  * the M_TEMP bucket if one is returned.
821  */
822 int
823 kern_alternate_path(struct thread *td, const char *prefix, char *path,
824     enum uio_seg pathseg, char **pathbuf, int create)
825 {
826 	struct nameidata nd, ndroot;
827 	char *ptr, *buf, *cp;
828 	size_t len, sz;
829 	int error;
830 
831 	buf = (char *) malloc(MAXPATHLEN, M_TEMP, M_WAITOK);
832 	*pathbuf = buf;
833 
834 	/* Copy the prefix into the new pathname as a starting point. */
835 	len = strlcpy(buf, prefix, MAXPATHLEN);
836 	if (len >= MAXPATHLEN) {
837 		*pathbuf = NULL;
838 		free(buf, M_TEMP);
839 		return (EINVAL);
840 	}
841 	sz = MAXPATHLEN - len;
842 	ptr = buf + len;
843 
844 	/* Append the filename to the prefix. */
845 	if (pathseg == UIO_SYSSPACE)
846 		error = copystr(path, ptr, sz, &len);
847 	else
848 		error = copyinstr(path, ptr, sz, &len);
849 
850 	if (error) {
851 		*pathbuf = NULL;
852 		free(buf, M_TEMP);
853 		return (error);
854 	}
855 
856 	/* Only use a prefix with absolute pathnames. */
857 	if (*ptr != '/') {
858 		error = EINVAL;
859 		goto keeporig;
860 	}
861 
862 	/* XXX: VFS_LOCK_GIANT? */
863 	mtx_lock(&Giant);
864 
865 	/*
866 	 * We know that there is a / somewhere in this pathname.
867 	 * Search backwards for it, to find the file's parent dir
868 	 * to see if it exists in the alternate tree. If it does,
869 	 * and we want to create a file (cflag is set). We don't
870 	 * need to worry about the root comparison in this case.
871 	 */
872 
873 	if (create) {
874 		for (cp = &ptr[len] - 1; *cp != '/'; cp--);
875 		*cp = '\0';
876 
877 		NDINIT(&nd, LOOKUP, FOLLOW, UIO_SYSSPACE, buf, td);
878 		error = namei(&nd);
879 		*cp = '/';
880 		if (error != 0)
881 			goto nd_failed;
882 	} else {
883 		NDINIT(&nd, LOOKUP, FOLLOW, UIO_SYSSPACE, buf, td);
884 
885 		error = namei(&nd);
886 		if (error != 0)
887 			goto nd_failed;
888 
889 		/*
890 		 * We now compare the vnode of the prefix to the one
891 		 * vnode asked. If they resolve to be the same, then we
892 		 * ignore the match so that the real root gets used.
893 		 * This avoids the problem of traversing "../.." to find the
894 		 * root directory and never finding it, because "/" resolves
895 		 * to the emulation root directory. This is expensive :-(
896 		 */
897 		NDINIT(&ndroot, LOOKUP, FOLLOW, UIO_SYSSPACE, prefix, td);
898 
899 		/* We shouldn't ever get an error from this namei(). */
900 		error = namei(&ndroot);
901 		if (error == 0) {
902 			if (nd.ni_vp == ndroot.ni_vp)
903 				error = ENOENT;
904 
905 			NDFREE(&ndroot, NDF_ONLY_PNBUF);
906 			vrele(ndroot.ni_vp);
907 		}
908 	}
909 
910 	NDFREE(&nd, NDF_ONLY_PNBUF);
911 	vrele(nd.ni_vp);
912 
913 nd_failed:
914 	/* XXX: VFS_UNLOCK_GIANT? */
915 	mtx_unlock(&Giant);
916 
917 keeporig:
918 	/* If there was an error, use the original path name. */
919 	if (error)
920 		bcopy(ptr, buf, len);
921 	return (error);
922 }
923