xref: /freebsd/sys/kern/vfs_lookup.c (revision f391d6bc1d0464f62f1b8264666c897a680156b1)
1 /*-
2  * Copyright (c) 1982, 1986, 1989, 1993
3  *	The Regents of the University of California.  All rights reserved.
4  * (c) UNIX System Laboratories, Inc.
5  * All or some portions of this file are derived from material licensed
6  * to the University of California by American Telephone and Telegraph
7  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
8  * the permission of UNIX System Laboratories, Inc.
9  *
10  * Redistribution and use in source and binary forms, with or without
11  * modification, are permitted provided that the following conditions
12  * are met:
13  * 1. Redistributions of source code must retain the above copyright
14  *    notice, this list of conditions and the following disclaimer.
15  * 2. Redistributions in binary form must reproduce the above copyright
16  *    notice, this list of conditions and the following disclaimer in the
17  *    documentation and/or other materials provided with the distribution.
18  * 3. Neither the name of the University nor the names of its contributors
19  *    may be used to endorse or promote products derived from this software
20  *    without specific prior written permission.
21  *
22  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
23  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
24  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
25  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
26  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
27  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
28  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
29  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
30  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
31  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32  * SUCH DAMAGE.
33  *
34  *	@(#)vfs_lookup.c	8.4 (Berkeley) 2/16/94
35  */
36 
37 #include <sys/cdefs.h>
38 __FBSDID("$FreeBSD$");
39 
40 #include "opt_capsicum.h"
41 #include "opt_ktrace.h"
42 
43 #include <sys/param.h>
44 #include <sys/systm.h>
45 #include <sys/kernel.h>
46 #include <sys/capsicum.h>
47 #include <sys/fcntl.h>
48 #include <sys/jail.h>
49 #include <sys/lock.h>
50 #include <sys/mutex.h>
51 #include <sys/namei.h>
52 #include <sys/vnode.h>
53 #include <sys/mount.h>
54 #include <sys/filedesc.h>
55 #include <sys/proc.h>
56 #include <sys/sdt.h>
57 #include <sys/syscallsubr.h>
58 #include <sys/sysctl.h>
59 #ifdef KTRACE
60 #include <sys/ktrace.h>
61 #endif
62 
63 #include <security/audit/audit.h>
64 #include <security/mac/mac_framework.h>
65 
66 #include <vm/uma.h>
67 
68 #define	NAMEI_DIAGNOSTIC 1
69 #undef NAMEI_DIAGNOSTIC
70 
71 SDT_PROVIDER_DECLARE(vfs);
   /*
    * DTrace probes for namei().  The entry probe is fired with the starting
    * directory vnode, the pathname buffer and cn_flags just before the lookup
    * loop begins; the return probe is fired with the error code and the
    * resulting vnode (NULL when an error is returned).
    */
72 SDT_PROBE_DEFINE3(vfs, namei, lookup, entry, "struct vnode *", "char *",
73     "unsigned long");
74 SDT_PROBE_DEFINE2(vfs, namei, lookup, return, "int", "struct vnode *");
75 
76 /*
    * Allocation zone for namei.  Items are MAXPATHLEN bytes and are used as
    * pathname buffers (cn_pnbuf) and as scratch buffers for symlink expansion.
    */
77 uma_zone_t namei_zone;
78 
79 /*
    * Placeholder vnode for mp traversal.  It is created in nameiinit() with
    * dead_vnodeops and installed as ni_dvp by lookup() while it steps from a
    * mounted-on directory to the root of the mounted filesystem.
    */
80 static struct vnode *vp_crossmp;
81 
82 struct nameicap_tracker {
   	/*
   	 * One element is queued on ndp->ni_cap_tracker for every directory
   	 * visited by a lookup that has NI_LCF_CAP_DOTDOT set (see
   	 * nameicap_tracker_add()); nameicap_check_dotdot() searches the list
   	 * to validate ".." components.
   	 */
83 	struct vnode *dp;	/* visited directory; kept alive with vhold() */
84 	TAILQ_ENTRY(nameicap_tracker) nm_link;	/* linkage on ni_cap_tracker */
85 };
86 
87 /* Zone for cap mode tracker elements used for dotdot capability checks. */
88 static uma_zone_t nt_zone;
89 
90 static void
91 nameiinit(void *dummy __unused)
92 {
93 
94 	namei_zone = uma_zcreate("NAMEI", MAXPATHLEN, NULL, NULL, NULL, NULL,
95 	    UMA_ALIGN_PTR, 0);
96 	nt_zone = uma_zcreate("rentr", sizeof(struct nameicap_tracker),
97 	    NULL, NULL, NULL, NULL, sizeof(void *), 0);
98 	getnewvnode("crossmp", NULL, &dead_vnodeops, &vp_crossmp);
99 	vn_lock(vp_crossmp, LK_EXCLUSIVE);
100 	VN_LOCK_ASHARE(vp_crossmp);
101 	VOP_UNLOCK(vp_crossmp, 0);
102 }
103 SYSINIT(vfs, SI_SUB_VFS, SI_ORDER_SECOND, nameiinit, NULL);
104 
105 static int lookup_shared = 1;	/* consulted by namei() and lookup() */
    /*
     * vfs.lookup_shared: when zero, namei() strips LOCKSHARED from the
     * request and lookup() starts out with LK_EXCLUSIVE instead of LK_SHARED.
     */
106 SYSCTL_INT(_vfs, OID_AUTO, lookup_shared, CTLFLAG_RWTUN, &lookup_shared, 0,
107     "enables shared locks for path name translation");
108 
109 /*
110  * Intent is that lookup_cap_dotdot becomes unconditionally enabled,
111  * but it defaults to the disabled state until verification efforts
112  * are complete.
113  */
114 static int lookup_cap_dotdot = 0;
    /*
     * vfs.lookup_cap_dotdot: when non-zero, namei() sets NI_LCF_CAP_DOTDOT
     * for strictly relative lookups, so ".." is validated against the
     * directories already visited instead of being rejected outright.
     */
115 SYSCTL_INT(_vfs, OID_AUTO, lookup_cap_dotdot, CTLFLAG_RWTUN,
116     &lookup_cap_dotdot, 0,
117     "enables \"..\" components in path lookup in capability mode");
118 static int lookup_cap_dotdot_nonlocal = 0;
    /*
     * vfs.lookup_cap_dotdot_nonlocal: when zero, nameicap_check_dotdot()
     * fails ".." lookups whose result lives on a mount without MNT_LOCAL.
     */
119 SYSCTL_INT(_vfs, OID_AUTO, lookup_cap_dotdot_nonlocal, CTLFLAG_RWTUN,
120     &lookup_cap_dotdot_nonlocal, 0,
121     "enables \"..\" components in path lookup in capability mode "
122     "on non-local mount");
123 
124 static void
125 nameicap_tracker_add(struct nameidata *ndp, struct vnode *dp)
126 {
127 	struct nameicap_tracker *nt;
128 
129 	if ((ndp->ni_lcf & NI_LCF_CAP_DOTDOT) == 0 || dp->v_type != VDIR)
130 		return;
131 	nt = uma_zalloc(nt_zone, M_WAITOK);
132 	vhold(dp);
133 	nt->dp = dp;
134 	TAILQ_INSERT_TAIL(&ndp->ni_cap_tracker, nt, nm_link);
135 }
136 
137 static void
138 nameicap_cleanup(struct nameidata *ndp)
139 {
140 	struct nameicap_tracker *nt, *nt1;
141 
142 	KASSERT(TAILQ_EMPTY(&ndp->ni_cap_tracker) ||
143 	    (ndp->ni_lcf & NI_LCF_CAP_DOTDOT) != 0, ("not strictrelative"));
144 	TAILQ_FOREACH_SAFE(nt, &ndp->ni_cap_tracker, nm_link, nt1) {
145 		TAILQ_REMOVE(&ndp->ni_cap_tracker, nt, nm_link);
146 		vdrop(nt->dp);
147 		uma_zfree(nt_zone, nt);
148 	}
149 }
150 
151 /*
152  * For dotdot lookups in capability mode, only allow the component
153  * lookup to succeed if the resulting directory was already traversed
154  * during the operation.  Also fail dotdot lookups for non-local
155  * filesystems, where external agents might assist local lookups to
156  * escape the compartment.
157  */
158 static int
159 nameicap_check_dotdot(struct nameidata *ndp, struct vnode *dp)
160 {
161 	struct nameicap_tracker *nt;
162 	struct mount *mp;
163 
164 	if ((ndp->ni_lcf & NI_LCF_CAP_DOTDOT) == 0 || dp == NULL ||
165 	    dp->v_type != VDIR)
166 		return (0);
167 	mp = dp->v_mount;
168 	if (lookup_cap_dotdot_nonlocal == 0 && mp != NULL &&
169 	    (mp->mnt_flag & MNT_LOCAL) == 0)
170 		return (ENOTCAPABLE);
171 	TAILQ_FOREACH_REVERSE(nt, &ndp->ni_cap_tracker, nameicap_tracker_head,
172 	    nm_link) {
173 		if (dp == nt->dp)
174 			return (0);
175 	}
176 	return (ENOTCAPABLE);
177 }
178 
179 static void
180 namei_cleanup_cnp(struct componentname *cnp)
181 {
182 
183 	uma_zfree(namei_zone, cnp->cn_pnbuf);
184 #ifdef DIAGNOSTIC
185 	cnp->cn_pnbuf = NULL;
186 	cnp->cn_nameptr = NULL;
187 #endif
188 }
189 
190 static int
191 namei_handle_root(struct nameidata *ndp, struct vnode **dpp)
192 {
193 	struct componentname *cnp;
194 
195 	cnp = &ndp->ni_cnd;
196 	if ((ndp->ni_lcf & NI_LCF_STRICTRELATIVE) != 0) {
197 #ifdef KTRACE
198 		if (KTRPOINT(curthread, KTR_CAPFAIL))
199 			ktrcapfail(CAPFAIL_LOOKUP, NULL, NULL);
200 #endif
201 		return (ENOTCAPABLE);
202 	}
203 	while (*(cnp->cn_nameptr) == '/') {
204 		cnp->cn_nameptr++;
205 		ndp->ni_pathlen--;
206 	}
207 	*dpp = ndp->ni_rootdir;
208 	VREF(*dpp);
209 	return (0);
210 }
211 
212 /*
213  * Convert a pathname into a pointer to a locked vnode.
214  *
215  * The FOLLOW flag is set when symbolic links are to be followed
216  * when they occur at the end of the name translation process.
217  * Symbolic links are always followed for all other pathname
218  * components other than the last.
219  *
220  * The segflg defines whether the name is to be copied from user
221  * space or kernel space.
222  *
223  * Overall outline of namei:
224  *
225  *	copy in name
226  *	get starting directory
227  *	while (!done && !error) {
228  *		call lookup to search path.
229  *		if symbolic link, massage name in buffer and continue
230  *	}
231  */
232 int
233 namei(struct nameidata *ndp)
234 {
235 	struct filedesc *fdp;	/* pointer to file descriptor state */
236 	char *cp;		/* pointer into pathname argument */
237 	struct vnode *dp;	/* the directory we are searching */
238 	struct iovec aiov;		/* uio for reading symbolic links */
239 	struct componentname *cnp;
240 	struct thread *td;
241 	struct proc *p;
242 	cap_rights_t rights;
243 	struct uio auio;
244 	int error, linklen, startdir_used;
245 
246 	cnp = &ndp->ni_cnd;
247 	td = cnp->cn_thread;
248 	p = td->td_proc;
249 	ndp->ni_cnd.cn_cred = ndp->ni_cnd.cn_thread->td_ucred;
250 	KASSERT(cnp->cn_cred && p, ("namei: bad cred/proc"));
251 	KASSERT((cnp->cn_nameiop & (~OPMASK)) == 0,
252 	    ("namei: nameiop contaminated with flags"));
253 	KASSERT((cnp->cn_flags & OPMASK) == 0,
254 	    ("namei: flags contaminated with nameiops"));
255 	MPASS(ndp->ni_startdir == NULL || ndp->ni_startdir->v_type == VDIR ||
256 	    ndp->ni_startdir->v_type == VBAD);
257 	if (!lookup_shared)
258 		cnp->cn_flags &= ~LOCKSHARED;
259 	fdp = p->p_fd;
260 	TAILQ_INIT(&ndp->ni_cap_tracker);
261 	ndp->ni_lcf = 0;
262 
263 	/* We will set this ourselves if we need it. */
264 	cnp->cn_flags &= ~TRAILINGSLASH;
265 
266 	/*
267 	 * Get a buffer for the name to be translated, and copy the
268 	 * name into the buffer.
269 	 */
270 	if ((cnp->cn_flags & HASBUF) == 0)
271 		cnp->cn_pnbuf = uma_zalloc(namei_zone, M_WAITOK);
272 	if (ndp->ni_segflg == UIO_SYSSPACE)
273 		error = copystr(ndp->ni_dirp, cnp->cn_pnbuf, MAXPATHLEN,
274 		    &ndp->ni_pathlen);
275 	else
276 		error = copyinstr(ndp->ni_dirp, cnp->cn_pnbuf, MAXPATHLEN,
277 		    &ndp->ni_pathlen);
278 
279 	/*
280 	 * Don't allow empty pathnames.
281 	 */
282 	if (error == 0 && *cnp->cn_pnbuf == '\0')
283 		error = ENOENT;
284 
285 #ifdef CAPABILITY_MODE
286 	/*
287 	 * In capability mode, lookups must be restricted to happen in
288 	 * the subtree with the root specified by the file descriptor:
289 	 * - The root must be real file descriptor, not the pseudo-descriptor
290 	 *   AT_FDCWD.
291 	 * - The passed path must be relative and not absolute.
292 	 * - If lookup_cap_dotdot is disabled, path must not contain the
293 	 *   '..' components.
294 	 * - If lookup_cap_dotdot is enabled, we verify that all '..'
295 	 *   components lookups result in the directories which were
296 	 *   previously walked by us, which prevents an escape from
297 	 *   the relative root.
298 	 */
299 	if (error == 0 && IN_CAPABILITY_MODE(td) &&
300 	    (cnp->cn_flags & NOCAPCHECK) == 0) {
301 		ndp->ni_lcf |= NI_LCF_STRICTRELATIVE;
302 		if (ndp->ni_dirfd == AT_FDCWD) {
303 #ifdef KTRACE
304 			if (KTRPOINT(td, KTR_CAPFAIL))
305 				ktrcapfail(CAPFAIL_LOOKUP, NULL, NULL);
306 #endif
307 			error = ECAPMODE;
308 		}
309 	}
310 #endif
311 	if (error != 0) {
312 		namei_cleanup_cnp(cnp);
313 		ndp->ni_vp = NULL;
314 		return (error);
315 	}
316 	ndp->ni_loopcnt = 0;
317 #ifdef KTRACE
318 	if (KTRPOINT(td, KTR_NAMEI)) {
319 		KASSERT(cnp->cn_thread == curthread,
320 		    ("namei not using curthread"));
321 		ktrnamei(cnp->cn_pnbuf);
322 	}
323 #endif
324 	/*
325 	 * Get starting point for the translation.
326 	 */
327 	FILEDESC_SLOCK(fdp);
328 	ndp->ni_rootdir = fdp->fd_rdir;
329 	VREF(ndp->ni_rootdir);
330 	ndp->ni_topdir = fdp->fd_jdir;
331 
332 	/*
333 	 * If we are auditing the kernel pathname, save the user pathname.
334 	 */
335 	if (cnp->cn_flags & AUDITVNODE1)
336 		AUDIT_ARG_UPATH1(td, ndp->ni_dirfd, cnp->cn_pnbuf);
337 	if (cnp->cn_flags & AUDITVNODE2)
338 		AUDIT_ARG_UPATH2(td, ndp->ni_dirfd, cnp->cn_pnbuf);
339 
340 	startdir_used = 0;
341 	dp = NULL;
342 	cnp->cn_nameptr = cnp->cn_pnbuf;
343 	if (cnp->cn_pnbuf[0] == '/') {
344 		error = namei_handle_root(ndp, &dp);
345 	} else {
346 		if (ndp->ni_startdir != NULL) {
347 			dp = ndp->ni_startdir;
348 			startdir_used = 1;
349 		} else if (ndp->ni_dirfd == AT_FDCWD) {
350 			dp = fdp->fd_cdir;
351 			VREF(dp);
352 		} else {
353 			rights = ndp->ni_rightsneeded;
354 			cap_rights_set(&rights, CAP_LOOKUP);
355 
356 			if (cnp->cn_flags & AUDITVNODE1)
357 				AUDIT_ARG_ATFD1(ndp->ni_dirfd);
358 			if (cnp->cn_flags & AUDITVNODE2)
359 				AUDIT_ARG_ATFD2(ndp->ni_dirfd);
360 			error = fgetvp_rights(td, ndp->ni_dirfd,
361 			    &rights, &ndp->ni_filecaps, &dp);
362 			if (error == EINVAL)
363 				error = ENOTDIR;
364 #ifdef CAPABILITIES
365 			/*
366 			 * If file descriptor doesn't have all rights,
367 			 * all lookups relative to it must also be
368 			 * strictly relative.
369 			 */
370 			CAP_ALL(&rights);
371 			if (!cap_rights_contains(&ndp->ni_filecaps.fc_rights,
372 			    &rights) ||
373 			    ndp->ni_filecaps.fc_fcntls != CAP_FCNTL_ALL ||
374 			    ndp->ni_filecaps.fc_nioctls != -1) {
375 				ndp->ni_lcf |= NI_LCF_STRICTRELATIVE;
376 			}
377 #endif
378 		}
379 		if (error == 0 && dp->v_type != VDIR)
380 			error = ENOTDIR;
381 	}
382 	FILEDESC_SUNLOCK(fdp);
383 	if (ndp->ni_startdir != NULL && !startdir_used)
384 		vrele(ndp->ni_startdir);
385 	if (error != 0) {
386 		if (dp != NULL)
387 			vrele(dp);
388 		goto out;
389 	}
390 	if ((ndp->ni_lcf & NI_LCF_STRICTRELATIVE) != 0 &&
391 	    lookup_cap_dotdot != 0)
392 		ndp->ni_lcf |= NI_LCF_CAP_DOTDOT;
393 	SDT_PROBE3(vfs, namei, lookup, entry, dp, cnp->cn_pnbuf,
394 	    cnp->cn_flags);
395 	for (;;) {
396 		ndp->ni_startdir = dp;
397 		error = lookup(ndp);
398 		if (error != 0)
399 			goto out;
400 		/*
401 		 * If not a symbolic link, we're done.
402 		 */
403 		if ((cnp->cn_flags & ISSYMLINK) == 0) {
404 			vrele(ndp->ni_rootdir);
405 			if ((cnp->cn_flags & (SAVENAME | SAVESTART)) == 0) {
406 				namei_cleanup_cnp(cnp);
407 			} else
408 				cnp->cn_flags |= HASBUF;
409 			nameicap_cleanup(ndp);
410 			SDT_PROBE2(vfs, namei, lookup, return, 0, ndp->ni_vp);
411 			return (0);
412 		}
413 		if (ndp->ni_loopcnt++ >= MAXSYMLINKS) {
414 			error = ELOOP;
415 			break;
416 		}
417 #ifdef MAC
418 		if ((cnp->cn_flags & NOMACCHECK) == 0) {
419 			error = mac_vnode_check_readlink(td->td_ucred,
420 			    ndp->ni_vp);
421 			if (error != 0)
422 				break;
423 		}
424 #endif
425 		if (ndp->ni_pathlen > 1)
426 			cp = uma_zalloc(namei_zone, M_WAITOK);
427 		else
428 			cp = cnp->cn_pnbuf;
429 		aiov.iov_base = cp;
430 		aiov.iov_len = MAXPATHLEN;
431 		auio.uio_iov = &aiov;
432 		auio.uio_iovcnt = 1;
433 		auio.uio_offset = 0;
434 		auio.uio_rw = UIO_READ;
435 		auio.uio_segflg = UIO_SYSSPACE;
436 		auio.uio_td = td;
437 		auio.uio_resid = MAXPATHLEN;
438 		error = VOP_READLINK(ndp->ni_vp, &auio, cnp->cn_cred);
439 		if (error != 0) {
440 			if (ndp->ni_pathlen > 1)
441 				uma_zfree(namei_zone, cp);
442 			break;
443 		}
444 		linklen = MAXPATHLEN - auio.uio_resid;
445 		if (linklen == 0) {
446 			if (ndp->ni_pathlen > 1)
447 				uma_zfree(namei_zone, cp);
448 			error = ENOENT;
449 			break;
450 		}
451 		if (linklen + ndp->ni_pathlen >= MAXPATHLEN) {
452 			if (ndp->ni_pathlen > 1)
453 				uma_zfree(namei_zone, cp);
454 			error = ENAMETOOLONG;
455 			break;
456 		}
457 		if (ndp->ni_pathlen > 1) {
458 			bcopy(ndp->ni_next, cp + linklen, ndp->ni_pathlen);
459 			uma_zfree(namei_zone, cnp->cn_pnbuf);
460 			cnp->cn_pnbuf = cp;
461 		} else
462 			cnp->cn_pnbuf[linklen] = '\0';
463 		ndp->ni_pathlen += linklen;
464 		vput(ndp->ni_vp);
465 		dp = ndp->ni_dvp;
466 		/*
467 		 * Check if root directory should replace current directory.
468 		 */
469 		cnp->cn_nameptr = cnp->cn_pnbuf;
470 		if (*(cnp->cn_nameptr) == '/') {
471 			vrele(dp);
472 			error = namei_handle_root(ndp, &dp);
473 			if (error != 0)
474 				goto out;
475 		}
476 	}
477 	vput(ndp->ni_vp);
478 	ndp->ni_vp = NULL;
479 	vrele(ndp->ni_dvp);
480 out:
481 	vrele(ndp->ni_rootdir);
482 	namei_cleanup_cnp(cnp);
483 	nameicap_cleanup(ndp);
484 	SDT_PROBE2(vfs, namei, lookup, return, error, NULL);
485 	return (error);
486 }
487 
488 static int
489 compute_cn_lkflags(struct mount *mp, int lkflags, int cnflags)
490 {
491 
492 	if (mp == NULL || ((lkflags & LK_SHARED) &&
493 	    (!(mp->mnt_kern_flag & MNTK_LOOKUP_SHARED) ||
494 	    ((cnflags & ISDOTDOT) &&
495 	    (mp->mnt_kern_flag & MNTK_LOOKUP_EXCL_DOTDOT))))) {
496 		lkflags &= ~LK_SHARED;
497 		lkflags |= LK_EXCLUSIVE;
498 	}
499 	lkflags |= LK_NODDLKTREAT;
500 	return (lkflags);
501 }
502 
503 static __inline int
504 needs_exclusive_leaf(struct mount *mp, int flags)
505 {
506 
507 	/*
508 	 * Intermediate nodes can use shared locks, we only need to
509 	 * force an exclusive lock for leaf nodes.
510 	 */
511 	if ((flags & (ISLASTCN | LOCKLEAF)) != (ISLASTCN | LOCKLEAF))
512 		return (0);
513 
514 	/* Always use exclusive locks if LOCKSHARED isn't set. */
515 	if (!(flags & LOCKSHARED))
516 		return (1);
517 
518 	/*
519 	 * For lookups during open(), if the mount point supports
520 	 * extended shared operations, then use a shared lock for the
521 	 * leaf node, otherwise use an exclusive lock.
522 	 */
523 	if ((flags & ISOPEN) != 0)
524 		return (!MNT_EXTENDED_SHARED(mp));
525 
526 	/*
527 	 * Lookup requests outside of open() that specify LOCKSHARED
528 	 * only need a shared lock on the leaf vnode.
529 	 */
530 	return (0);
531 }
532 
533 /*
534  * Search a pathname.
535  * This is a very central and rather complicated routine.
536  *
537  * The pathname is pointed to by ni_ptr and is of length ni_pathlen.
538  * The starting directory is taken from ni_startdir. The pathname is
539  * descended until done, or a symbolic link is encountered. The variable
540  * ni_more is clear if the path is completed; it is set to one if a
541  * symbolic link needing interpretation is encountered.
542  *
543  * The flag argument is LOOKUP, CREATE, RENAME, or DELETE depending on
544  * whether the name is to be looked up, created, renamed, or deleted.
545  * When CREATE, RENAME, or DELETE is specified, information usable in
546  * creating, renaming, or deleting a directory entry may be calculated.
547  * If flag has LOCKPARENT or'ed into it, the parent directory is returned
548  * locked. If flag has WANTPARENT or'ed into it, the parent directory is
549  * returned unlocked. Otherwise the parent directory is not returned. If
550  * the target of the pathname exists and LOCKLEAF is or'ed into the flag
551  * the target is returned locked, otherwise it is returned unlocked.
552  * When creating or renaming and LOCKPARENT is specified, the target may not
553  * be ".".  When deleting and LOCKPARENT is specified, the target may be ".".
554  *
555  * Overall outline of lookup:
556  *
557  * dirloop:
558  *	identify next component of name at ndp->ni_ptr
559  *	handle degenerate case where name is null string
560  *	if .. and crossing mount points and on mounted filesys, find parent
561  *	call VOP_LOOKUP routine for next component name
562  *	    directory vnode returned in ni_dvp, unlocked unless LOCKPARENT set
563  *	    component vnode returned in ni_vp (if it exists), locked.
564  *	if result vnode is mounted on and crossing mount points,
565  *	    find mounted on vnode
566  *	if more components of name, do next level at dirloop
567  *	return the answer in ni_vp, locked if LOCKLEAF set
568  *	    if LOCKPARENT set, return locked parent in ni_dvp
569  *	    if WANTPARENT set, return unlocked parent in ni_dvp
570  */
571 int
572 lookup(struct nameidata *ndp)
573 {
574 	char *cp;		/* pointer into pathname argument */
575 	struct vnode *dp = NULL;	/* the directory we are searching */
576 	struct vnode *tdp;		/* saved dp */
577 	struct mount *mp;		/* mount table entry */
578 	struct prison *pr;
579 	int docache;			/* == 0 do not cache last component */
580 	int wantparent;			/* 1 => wantparent or lockparent flag */
581 	int rdonly;			/* lookup read-only flag bit */
582 	int error = 0;
583 	int dpunlocked = 0;		/* dp has already been unlocked */
584 	int relookup = 0;		/* do not consume the path component */
585 	struct componentname *cnp = &ndp->ni_cnd;
586 	int lkflags_save;
587 	int ni_dvp_unlocked;
588 
589 	/*
590 	 * Setup: break out flag bits into variables.
591 	 */
592 	ni_dvp_unlocked = 0;
593 	wantparent = cnp->cn_flags & (LOCKPARENT | WANTPARENT);
594 	KASSERT(cnp->cn_nameiop == LOOKUP || wantparent,
595 	    ("CREATE, DELETE, RENAME require LOCKPARENT or WANTPARENT."));
596 	docache = (cnp->cn_flags & NOCACHE) ^ NOCACHE;
597 	if (cnp->cn_nameiop == DELETE ||
598 	    (wantparent && cnp->cn_nameiop != CREATE &&
599 	     cnp->cn_nameiop != LOOKUP))
600 		docache = 0;
601 	rdonly = cnp->cn_flags & RDONLY;
602 	cnp->cn_flags &= ~ISSYMLINK;
603 	ndp->ni_dvp = NULL;
604 	/*
605 	 * We use shared locks until we hit the parent of the last cn then
606 	 * we adjust based on the requesting flags.
607 	 */
608 	if (lookup_shared)
609 		cnp->cn_lkflags = LK_SHARED;
610 	else
611 		cnp->cn_lkflags = LK_EXCLUSIVE;
612 	dp = ndp->ni_startdir;
613 	ndp->ni_startdir = NULLVP;
614 	vn_lock(dp,
615 	    compute_cn_lkflags(dp->v_mount, cnp->cn_lkflags | LK_RETRY,
616 	    cnp->cn_flags));
617 
618 dirloop:
619 	/*
620 	 * Search a new directory.
621 	 *
622 	 * The last component of the filename is left accessible via
623 	 * cnp->cn_nameptr for callers that need the name. Callers needing
624 	 * the name set the SAVENAME flag. When done, they assume
625 	 * responsibility for freeing the pathname buffer.
626 	 */
627 	for (cp = cnp->cn_nameptr; *cp != 0 && *cp != '/'; cp++)
628 		continue;
629 	cnp->cn_namelen = cp - cnp->cn_nameptr;
630 	if (cnp->cn_namelen > NAME_MAX) {
631 		error = ENAMETOOLONG;
632 		goto bad;
633 	}
634 #ifdef NAMEI_DIAGNOSTIC
635 	{ char c = *cp;
636 	*cp = '\0';
637 	printf("{%s}: ", cnp->cn_nameptr);
638 	*cp = c; }
639 #endif
640 	ndp->ni_pathlen -= cnp->cn_namelen;
641 	ndp->ni_next = cp;
642 
643 	/*
644 	 * Replace multiple slashes by a single slash and trailing slashes
645 	 * by a null.  This must be done before VOP_LOOKUP() because some
646 	 * fs's don't know about trailing slashes.  Remember if there were
647 	 * trailing slashes to handle symlinks, existing non-directories
648 	 * and non-existing files that won't be directories specially later.
649 	 */
650 	while (*cp == '/' && (cp[1] == '/' || cp[1] == '\0')) {
651 		cp++;
652 		ndp->ni_pathlen--;
653 		if (*cp == '\0') {
654 			*ndp->ni_next = '\0';
655 			cnp->cn_flags |= TRAILINGSLASH;
656 		}
657 	}
658 	ndp->ni_next = cp;
659 
660 	cnp->cn_flags |= MAKEENTRY;
661 	if (*cp == '\0' && docache == 0)
662 		cnp->cn_flags &= ~MAKEENTRY;
663 	if (cnp->cn_namelen == 2 &&
664 	    cnp->cn_nameptr[1] == '.' && cnp->cn_nameptr[0] == '.')
665 		cnp->cn_flags |= ISDOTDOT;
666 	else
667 		cnp->cn_flags &= ~ISDOTDOT;
668 	if (*ndp->ni_next == 0)
669 		cnp->cn_flags |= ISLASTCN;
670 	else
671 		cnp->cn_flags &= ~ISLASTCN;
672 
673 	if ((cnp->cn_flags & ISLASTCN) != 0 &&
674 	    cnp->cn_namelen == 1 && cnp->cn_nameptr[0] == '.' &&
675 	    (cnp->cn_nameiop == DELETE || cnp->cn_nameiop == RENAME)) {
676 		error = EINVAL;
677 		goto bad;
678 	}
679 
680 	nameicap_tracker_add(ndp, dp);
681 
682 	/*
683 	 * Check for degenerate name (e.g. / or "")
684 	 * which is a way of talking about a directory,
685 	 * e.g. like "/." or ".".
686 	 */
687 	if (cnp->cn_nameptr[0] == '\0') {
688 		if (dp->v_type != VDIR) {
689 			error = ENOTDIR;
690 			goto bad;
691 		}
692 		if (cnp->cn_nameiop != LOOKUP) {
693 			error = EISDIR;
694 			goto bad;
695 		}
696 		if (wantparent) {
697 			ndp->ni_dvp = dp;
698 			VREF(dp);
699 		}
700 		ndp->ni_vp = dp;
701 
702 		if (cnp->cn_flags & AUDITVNODE1)
703 			AUDIT_ARG_VNODE1(dp);
704 		else if (cnp->cn_flags & AUDITVNODE2)
705 			AUDIT_ARG_VNODE2(dp);
706 
707 		if (!(cnp->cn_flags & (LOCKPARENT | LOCKLEAF)))
708 			VOP_UNLOCK(dp, 0);
709 		/* XXX This should probably move to the top of function. */
710 		if (cnp->cn_flags & SAVESTART)
711 			panic("lookup: SAVESTART");
712 		goto success;
713 	}
714 
715 	/*
716 	 * Handle "..": five special cases.
717 	 * 0. If doing a capability lookup and lookup_cap_dotdot is
718 	 *    disabled, return ENOTCAPABLE.
719 	 * 1. Return an error if this is the last component of
720 	 *    the name and the operation is DELETE or RENAME.
721 	 * 2. If at root directory (e.g. after chroot)
722 	 *    or at absolute root directory
723 	 *    then ignore it so can't get out.
724 	 * 3. If this vnode is the root of a mounted
725 	 *    filesystem, then replace it with the
726 	 *    vnode which was mounted on so we take the
727 	 *    .. in the other filesystem.
728 	 * 4. If the vnode is the top directory of
729 	 *    the jail or chroot, don't let them out.
730 	 * 5. If doing a capability lookup and lookup_cap_dotdot is
731 	 *    enabled, return ENOTCAPABLE if the lookup would escape
732 	 *    from the initial file descriptor directory.  Checks are
733 	 *    done by ensuring that namei() already traversed the
734 	 *    result of dotdot lookup.
735 	 */
736 	if (cnp->cn_flags & ISDOTDOT) {
737 		if ((ndp->ni_lcf & (NI_LCF_STRICTRELATIVE | NI_LCF_CAP_DOTDOT))
738 		    == NI_LCF_STRICTRELATIVE) {
739 #ifdef KTRACE
740 			if (KTRPOINT(curthread, KTR_CAPFAIL))
741 				ktrcapfail(CAPFAIL_LOOKUP, NULL, NULL);
742 #endif
743 			error = ENOTCAPABLE;
744 			goto bad;
745 		}
746 		if ((cnp->cn_flags & ISLASTCN) != 0 &&
747 		    (cnp->cn_nameiop == DELETE || cnp->cn_nameiop == RENAME)) {
748 			error = EINVAL;
749 			goto bad;
750 		}
751 		for (;;) {
752 			for (pr = cnp->cn_cred->cr_prison; pr != NULL;
753 			     pr = pr->pr_parent)
754 				if (dp == pr->pr_root)
755 					break;
756 			if (dp == ndp->ni_rootdir ||
757 			    dp == ndp->ni_topdir ||
758 			    dp == rootvnode ||
759 			    pr != NULL ||
760 			    ((dp->v_vflag & VV_ROOT) != 0 &&
761 			     (cnp->cn_flags & NOCROSSMOUNT) != 0)) {
762 				ndp->ni_dvp = dp;
763 				ndp->ni_vp = dp;
764 				VREF(dp);
765 				goto nextname;
766 			}
767 			if ((dp->v_vflag & VV_ROOT) == 0)
768 				break;
769 			if (dp->v_iflag & VI_DOOMED) {	/* forced unmount */
770 				error = ENOENT;
771 				goto bad;
772 			}
773 			tdp = dp;
774 			dp = dp->v_mount->mnt_vnodecovered;
775 			VREF(dp);
776 			vput(tdp);
777 			vn_lock(dp,
778 			    compute_cn_lkflags(dp->v_mount, cnp->cn_lkflags |
779 			    LK_RETRY, ISDOTDOT));
780 			error = nameicap_check_dotdot(ndp, dp);
781 			if (error != 0) {
782 #ifdef KTRACE
783 				if (KTRPOINT(curthread, KTR_CAPFAIL))
784 					ktrcapfail(CAPFAIL_LOOKUP, NULL, NULL);
785 #endif
786 				goto bad;
787 			}
788 		}
789 	}
790 
791 	/*
792 	 * We now have a segment name to search for, and a directory to search.
793 	 */
794 unionlookup:
795 #ifdef MAC
796 	if ((cnp->cn_flags & NOMACCHECK) == 0) {
797 		error = mac_vnode_check_lookup(cnp->cn_thread->td_ucred, dp,
798 		    cnp);
799 		if (error)
800 			goto bad;
801 	}
802 #endif
803 	ndp->ni_dvp = dp;
804 	ndp->ni_vp = NULL;
805 	ASSERT_VOP_LOCKED(dp, "lookup");
806 	/*
807 	 * If we have a shared lock we may need to upgrade the lock for the
808 	 * last operation.
809 	 */
810 	if (dp != vp_crossmp &&
811 	    VOP_ISLOCKED(dp) == LK_SHARED &&
812 	    (cnp->cn_flags & ISLASTCN) && (cnp->cn_flags & LOCKPARENT))
813 		vn_lock(dp, LK_UPGRADE|LK_RETRY);
814 	if ((dp->v_iflag & VI_DOOMED) != 0) {
815 		error = ENOENT;
816 		goto bad;
817 	}
818 	/*
819 	 * If we're looking up the last component and we need an exclusive
820 	 * lock, adjust our lkflags.
821 	 */
822 	if (needs_exclusive_leaf(dp->v_mount, cnp->cn_flags))
823 		cnp->cn_lkflags = LK_EXCLUSIVE;
824 #ifdef NAMEI_DIAGNOSTIC
825 	vn_printf(dp, "lookup in ");
826 #endif
827 	lkflags_save = cnp->cn_lkflags;
828 	cnp->cn_lkflags = compute_cn_lkflags(dp->v_mount, cnp->cn_lkflags,
829 	    cnp->cn_flags);
830 	error = VOP_LOOKUP(dp, &ndp->ni_vp, cnp);
831 	cnp->cn_lkflags = lkflags_save;
832 	if (error != 0) {
833 		KASSERT(ndp->ni_vp == NULL, ("leaf should be empty"));
834 #ifdef NAMEI_DIAGNOSTIC
835 		printf("not found\n");
836 #endif
837 		if ((error == ENOENT) &&
838 		    (dp->v_vflag & VV_ROOT) && (dp->v_mount != NULL) &&
839 		    (dp->v_mount->mnt_flag & MNT_UNION)) {
840 			tdp = dp;
841 			dp = dp->v_mount->mnt_vnodecovered;
842 			VREF(dp);
843 			vput(tdp);
844 			vn_lock(dp,
845 			    compute_cn_lkflags(dp->v_mount, cnp->cn_lkflags |
846 			    LK_RETRY, cnp->cn_flags));
847 			nameicap_tracker_add(ndp, dp);
848 			goto unionlookup;
849 		}
850 
851 		if (error == ERELOOKUP) {
852 			vref(dp);
853 			ndp->ni_vp = dp;
854 			error = 0;
855 			relookup = 1;
856 			goto good;
857 		}
858 
859 		if (error != EJUSTRETURN)
860 			goto bad;
861 		/*
862 		 * At this point, we know we're at the end of the
863 		 * pathname.  If creating / renaming, we can consider
864 		 * allowing the file or directory to be created / renamed,
865 		 * provided we're not on a read-only filesystem.
866 		 */
867 		if (rdonly) {
868 			error = EROFS;
869 			goto bad;
870 		}
871 		/* trailing slash only allowed for directories */
872 		if ((cnp->cn_flags & TRAILINGSLASH) &&
873 		    !(cnp->cn_flags & WILLBEDIR)) {
874 			error = ENOENT;
875 			goto bad;
876 		}
877 		if ((cnp->cn_flags & LOCKPARENT) == 0)
878 			VOP_UNLOCK(dp, 0);
879 		/*
880 		 * We return with ni_vp NULL to indicate that the entry
881 		 * doesn't currently exist, leaving a pointer to the
882 		 * (possibly locked) directory vnode in ndp->ni_dvp.
883 		 */
884 		if (cnp->cn_flags & SAVESTART) {
885 			ndp->ni_startdir = ndp->ni_dvp;
886 			VREF(ndp->ni_startdir);
887 		}
888 		goto success;
889 	}
890 
891 good:
892 #ifdef NAMEI_DIAGNOSTIC
893 	printf("found\n");
894 #endif
895 	dp = ndp->ni_vp;
896 
897 	/*
898 	 * Check to see if the vnode has been mounted on;
899 	 * if so find the root of the mounted filesystem.
900 	 */
901 	while (dp->v_type == VDIR && (mp = dp->v_mountedhere) &&
902 	       (cnp->cn_flags & NOCROSSMOUNT) == 0) {
903 		if (vfs_busy(mp, 0))
904 			continue;
905 		vput(dp);
906 		if (dp != ndp->ni_dvp)
907 			vput(ndp->ni_dvp);
908 		else
909 			vrele(ndp->ni_dvp);
910 		vref(vp_crossmp);
911 		ndp->ni_dvp = vp_crossmp;
912 		error = VFS_ROOT(mp, compute_cn_lkflags(mp, cnp->cn_lkflags,
913 		    cnp->cn_flags), &tdp);
914 		vfs_unbusy(mp);
915 		if (vn_lock(vp_crossmp, LK_SHARED | LK_NOWAIT))
916 			panic("vp_crossmp exclusively locked or reclaimed");
917 		if (error) {
918 			dpunlocked = 1;
919 			goto bad2;
920 		}
921 		ndp->ni_vp = dp = tdp;
922 	}
923 
924 	/*
925 	 * Check for symbolic link
926 	 */
927 	if ((dp->v_type == VLNK) &&
928 	    ((cnp->cn_flags & FOLLOW) || (cnp->cn_flags & TRAILINGSLASH) ||
929 	     *ndp->ni_next == '/')) {
930 		cnp->cn_flags |= ISSYMLINK;
931 		if (dp->v_iflag & VI_DOOMED) {
932 			/*
933 			 * We can't know whether the directory was mounted with
934 			 * NOSYMFOLLOW, so we can't follow safely.
935 			 */
936 			error = ENOENT;
937 			goto bad2;
938 		}
939 		if (dp->v_mount->mnt_flag & MNT_NOSYMFOLLOW) {
940 			error = EACCES;
941 			goto bad2;
942 		}
943 		/*
944 		 * Symlink code always expects an unlocked dvp.
945 		 */
946 		if (ndp->ni_dvp != ndp->ni_vp) {
947 			VOP_UNLOCK(ndp->ni_dvp, 0);
948 			ni_dvp_unlocked = 1;
949 		}
950 		goto success;
951 	}
952 
953 nextname:
954 	/*
955 	 * Not a symbolic link that we will follow.  Continue with the
956 	 * next component if there is any; otherwise, we're done.
957 	 */
958 	KASSERT((cnp->cn_flags & ISLASTCN) || *ndp->ni_next == '/',
959 	    ("lookup: invalid path state."));
960 	if (relookup) {
961 		relookup = 0;
962 		if (ndp->ni_dvp != dp)
963 			vput(ndp->ni_dvp);
964 		else
965 			vrele(ndp->ni_dvp);
966 		goto dirloop;
967 	}
968 	if (cnp->cn_flags & ISDOTDOT) {
969 		error = nameicap_check_dotdot(ndp, ndp->ni_vp);
970 		if (error != 0) {
971 #ifdef KTRACE
972 			if (KTRPOINT(curthread, KTR_CAPFAIL))
973 				ktrcapfail(CAPFAIL_LOOKUP, NULL, NULL);
974 #endif
975 			goto bad2;
976 		}
977 	}
978 	if (*ndp->ni_next == '/') {
979 		cnp->cn_nameptr = ndp->ni_next;
980 		while (*cnp->cn_nameptr == '/') {
981 			cnp->cn_nameptr++;
982 			ndp->ni_pathlen--;
983 		}
984 		if (ndp->ni_dvp != dp)
985 			vput(ndp->ni_dvp);
986 		else
987 			vrele(ndp->ni_dvp);
988 		goto dirloop;
989 	}
990 	/*
991 	 * If we're processing a path with a trailing slash,
992 	 * check that the end result is a directory.
993 	 */
994 	if ((cnp->cn_flags & TRAILINGSLASH) && dp->v_type != VDIR) {
995 		error = ENOTDIR;
996 		goto bad2;
997 	}
998 	/*
999 	 * Disallow directory write attempts on read-only filesystems.
1000 	 */
1001 	if (rdonly &&
1002 	    (cnp->cn_nameiop == DELETE || cnp->cn_nameiop == RENAME)) {
1003 		error = EROFS;
1004 		goto bad2;
1005 	}
1006 	if (cnp->cn_flags & SAVESTART) {
1007 		ndp->ni_startdir = ndp->ni_dvp;
1008 		VREF(ndp->ni_startdir);
1009 	}
1010 	if (!wantparent) {
1011 		ni_dvp_unlocked = 2;
1012 		if (ndp->ni_dvp != dp)
1013 			vput(ndp->ni_dvp);
1014 		else
1015 			vrele(ndp->ni_dvp);
1016 	} else if ((cnp->cn_flags & LOCKPARENT) == 0 && ndp->ni_dvp != dp) {
1017 		VOP_UNLOCK(ndp->ni_dvp, 0);
1018 		ni_dvp_unlocked = 1;
1019 	}
1020 
1021 	if (cnp->cn_flags & AUDITVNODE1)
1022 		AUDIT_ARG_VNODE1(dp);
1023 	else if (cnp->cn_flags & AUDITVNODE2)
1024 		AUDIT_ARG_VNODE2(dp);
1025 
1026 	if ((cnp->cn_flags & LOCKLEAF) == 0)
1027 		VOP_UNLOCK(dp, 0);
1028 success:
1029 	/*
1030 	 * Because of lookup_shared we may have the vnode shared locked, but
1031 	 * the caller may want it to be exclusively locked.
1032 	 */
1033 	if (needs_exclusive_leaf(dp->v_mount, cnp->cn_flags) &&
1034 	    VOP_ISLOCKED(dp) != LK_EXCLUSIVE) {
1035 		vn_lock(dp, LK_UPGRADE | LK_RETRY);
1036 		if (dp->v_iflag & VI_DOOMED) {
1037 			error = ENOENT;
1038 			goto bad2;
1039 		}
1040 	}
1041 	return (0);
1042 
1043 bad2:
1044 	if (ni_dvp_unlocked != 2) {
1045 		if (dp != ndp->ni_dvp && !ni_dvp_unlocked)
1046 			vput(ndp->ni_dvp);
1047 		else
1048 			vrele(ndp->ni_dvp);
1049 	}
1050 bad:
1051 	if (!dpunlocked)
1052 		vput(dp);
1053 	ndp->ni_vp = NULL;
1054 	return (error);
1055 }
1056 
/*
 * relookup - lookup a path name component
 *    Used by lookup to re-acquire things.
 *
 * Looks up the single component named by cnp->cn_nameptr in directory dvp
 * and stores the resulting vnode in *vpp.  cnp->cn_flags must have
 * ISLASTCN and at least one of LOCKPARENT/WANTPARENT set; both are
 * asserted below.  dvp is locked exclusively here before the lookup and
 * cnp->cn_lkflags is forced to LK_EXCLUSIVE.
 *
 * Returns 0 on success.  That includes the case where VOP_LOOKUP()
 * returns EJUSTRETURN on a writable lookup: the entry does not exist, and
 * dvp stays locked only if LOCKPARENT was requested.  On any other failure
 * an errno value is returned, the vnodes handled here are dropped with
 * vput()/vrele(), and *vpp is set to NULL.
 */
int
relookup(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp)
{
	struct vnode *dp = NULL;		/* the directory we are searching */
	int wantparent;			/* 1 => wantparent or lockparent flag */
	int rdonly;			/* lookup read-only flag bit */
	int error = 0;

	KASSERT(cnp->cn_flags & ISLASTCN,
	    ("relookup: Not given last component."));
	/*
	 * Setup: break out flag bits into variables.
	 */
	wantparent = cnp->cn_flags & (LOCKPARENT|WANTPARENT);
	KASSERT(wantparent, ("relookup: parent not wanted."));
	rdonly = cnp->cn_flags & RDONLY;
	/* Clear any stale ISSYMLINK state before looking up again. */
	cnp->cn_flags &= ~ISSYMLINK;
	dp = dvp;
	/* The directory is always taken exclusively for the re-lookup. */
	cnp->cn_lkflags = LK_EXCLUSIVE;
	vn_lock(dp, LK_EXCLUSIVE | LK_RETRY);

	/*
	 * Search a new directory.
	 *
	 * The last component of the filename is left accessible via
	 * cnp->cn_nameptr for callers that need the name. Callers needing
	 * the name set the SAVENAME flag. When done, they assume
	 * responsibility for freeing the pathname buffer.
	 */
#ifdef NAMEI_DIAGNOSTIC
	printf("{%s}: ", cnp->cn_nameptr);
#endif

	/*
	 * Check for "" which represents the root directory after slash
	 * removal.
	 */
	if (cnp->cn_nameptr[0] == '\0') {
		/*
		 * Support only LOOKUP for "/" because lookup()
		 * can't succeed for CREATE, DELETE and RENAME.
		 */
		KASSERT(cnp->cn_nameiop == LOOKUP, ("nameiop must be LOOKUP"));
		KASSERT(dp->v_type == VDIR, ("dp is not a directory"));

		/* The directory itself is the result; honor LOCKLEAF for it. */
		if (!(cnp->cn_flags & LOCKLEAF))
			VOP_UNLOCK(dp, 0);
		*vpp = dp;
		/* XXX This should probably move to the top of function. */
		if (cnp->cn_flags & SAVESTART)
			panic("lookup: SAVESTART");
		return (0);
	}

	/* ".." is not supported on this path. */
	if (cnp->cn_flags & ISDOTDOT)
		panic ("relookup: lookup on dot-dot");

	/*
	 * We now have a segment name to search for, and a directory to search.
	 */
#ifdef NAMEI_DIAGNOSTIC
	vn_printf(dp, "search in ");
#endif
	if ((error = VOP_LOOKUP(dp, vpp, cnp)) != 0) {
		KASSERT(*vpp == NULL, ("leaf should be empty"));
		/*
		 * Anything other than EJUSTRETURN is a hard failure.  dp is
		 * still dvp here, so the "bad" path unlocks and releases the
		 * directory.
		 */
		if (error != EJUSTRETURN)
			goto bad;
		/*
		 * If creating and at end of pathname, then can consider
		 * allowing file to be created.
		 */
		if (rdonly) {
			error = EROFS;
			goto bad;
		}
		/* ASSERT(dvp == ndp->ni_startdir) */
		/* SAVESTART: take an extra reference on the directory. */
		if (cnp->cn_flags & SAVESTART)
			VREF(dvp);
		/* Keep the directory locked only when LOCKPARENT was asked for. */
		if ((cnp->cn_flags & LOCKPARENT) == 0)
			VOP_UNLOCK(dp, 0);
		/*
		 * We return with ni_vp NULL to indicate that the entry
		 * doesn't currently exist, leaving a pointer to the
		 * (possibly locked) directory vnode in ndp->ni_dvp.
		 */
		return (0);
	}

	/* From here on dp is the vnode that was found, not the directory. */
	dp = *vpp;

	/*
	 * Disallow directory write attempts on read-only filesystems.
	 */
	if (rdonly &&
	    (cnp->cn_nameiop == DELETE || cnp->cn_nameiop == RENAME)) {
		/*
		 * Drop the directory first.  When it is the same vnode as
		 * the result, only the reference is dropped here and the
		 * vput(dp) at "bad" performs the unlock.
		 */
		if (dvp == dp)
			vrele(dvp);
		else
			vput(dvp);
		error = EROFS;
		goto bad;
	}
	/*
	 * Set the parent lock/ref state to the requested state.
	 */
	if ((cnp->cn_flags & LOCKPARENT) == 0 && dvp != dp) {
		if (wantparent)
			VOP_UNLOCK(dvp, 0);
		else
			vput(dvp);
	} else if (!wantparent)
		vrele(dvp);
	/*
	 * Check for symbolic link
	 */
	KASSERT(dp->v_type != VLNK || !(cnp->cn_flags & FOLLOW),
	    ("relookup: symlink found.\n"));

	/* ASSERT(dvp == ndp->ni_startdir) */
	/* SAVESTART: take an extra reference on the directory. */
	if (cnp->cn_flags & SAVESTART)
		VREF(dvp);

	/* Leave the result locked only when LOCKLEAF was requested. */
	if ((cnp->cn_flags & LOCKLEAF) == 0)
		VOP_UNLOCK(dp, 0);
	return (0);
bad:
	/* Unlock and release whatever dp currently names; report no result. */
	vput(dp);
	*vpp = NULL;
	return (error);
}
1191 
1192 void
1193 NDINIT_ALL(struct nameidata *ndp, u_long op, u_long flags, enum uio_seg segflg,
1194     const char *namep, int dirfd, struct vnode *startdir, cap_rights_t *rightsp,
1195     struct thread *td)
1196 {
1197 
1198 	ndp->ni_cnd.cn_nameiop = op;
1199 	ndp->ni_cnd.cn_flags = flags;
1200 	ndp->ni_segflg = segflg;
1201 	ndp->ni_dirp = namep;
1202 	ndp->ni_dirfd = dirfd;
1203 	ndp->ni_startdir = startdir;
1204 	if (rightsp != NULL)
1205 		ndp->ni_rightsneeded = *rightsp;
1206 	else
1207 		cap_rights_init(&ndp->ni_rightsneeded);
1208 	filecaps_init(&ndp->ni_filecaps);
1209 	ndp->ni_cnd.cn_thread = td;
1210 }
1211 
1212 /*
1213  * Free data allocated by namei(); see namei(9) for details.
1214  */
1215 void
1216 NDFREE(struct nameidata *ndp, const u_int flags)
1217 {
1218 	int unlock_dvp;
1219 	int unlock_vp;
1220 
1221 	unlock_dvp = 0;
1222 	unlock_vp = 0;
1223 
1224 	if (!(flags & NDF_NO_FREE_PNBUF) &&
1225 	    (ndp->ni_cnd.cn_flags & HASBUF)) {
1226 		uma_zfree(namei_zone, ndp->ni_cnd.cn_pnbuf);
1227 		ndp->ni_cnd.cn_flags &= ~HASBUF;
1228 	}
1229 	if (!(flags & NDF_NO_VP_UNLOCK) &&
1230 	    (ndp->ni_cnd.cn_flags & LOCKLEAF) && ndp->ni_vp)
1231 		unlock_vp = 1;
1232 	if (!(flags & NDF_NO_VP_RELE) && ndp->ni_vp) {
1233 		if (unlock_vp) {
1234 			vput(ndp->ni_vp);
1235 			unlock_vp = 0;
1236 		} else
1237 			vrele(ndp->ni_vp);
1238 		ndp->ni_vp = NULL;
1239 	}
1240 	if (unlock_vp)
1241 		VOP_UNLOCK(ndp->ni_vp, 0);
1242 	if (!(flags & NDF_NO_DVP_UNLOCK) &&
1243 	    (ndp->ni_cnd.cn_flags & LOCKPARENT) &&
1244 	    ndp->ni_dvp != ndp->ni_vp)
1245 		unlock_dvp = 1;
1246 	if (!(flags & NDF_NO_DVP_RELE) &&
1247 	    (ndp->ni_cnd.cn_flags & (LOCKPARENT|WANTPARENT))) {
1248 		if (unlock_dvp) {
1249 			vput(ndp->ni_dvp);
1250 			unlock_dvp = 0;
1251 		} else
1252 			vrele(ndp->ni_dvp);
1253 		ndp->ni_dvp = NULL;
1254 	}
1255 	if (unlock_dvp)
1256 		VOP_UNLOCK(ndp->ni_dvp, 0);
1257 	if (!(flags & NDF_NO_STARTDIR_RELE) &&
1258 	    (ndp->ni_cnd.cn_flags & SAVESTART)) {
1259 		vrele(ndp->ni_startdir);
1260 		ndp->ni_startdir = NULL;
1261 	}
1262 }
1263 
1264 /*
1265  * Determine if there is a suitable alternate filename under the specified
1266  * prefix for the specified path.  If the create flag is set, then the
1267  * alternate prefix will be used so long as the parent directory exists.
1268  * This is used by the various compatibility ABIs so that Linux binaries prefer
1269  * files under /compat/linux for example.  The chosen path (whether under
1270  * the prefix or under /) is returned in a kernel malloc'd buffer pointed
1271  * to by pathbuf.  The caller is responsible for free'ing the buffer from
1272  * the M_TEMP bucket if one is returned.
1273  */
1274 int
1275 kern_alternate_path(struct thread *td, const char *prefix, const char *path,
1276     enum uio_seg pathseg, char **pathbuf, int create, int dirfd)
1277 {
1278 	struct nameidata nd, ndroot;
1279 	char *ptr, *buf, *cp;
1280 	size_t len, sz;
1281 	int error;
1282 
1283 	buf = (char *) malloc(MAXPATHLEN, M_TEMP, M_WAITOK);
1284 	*pathbuf = buf;
1285 
1286 	/* Copy the prefix into the new pathname as a starting point. */
1287 	len = strlcpy(buf, prefix, MAXPATHLEN);
1288 	if (len >= MAXPATHLEN) {
1289 		*pathbuf = NULL;
1290 		free(buf, M_TEMP);
1291 		return (EINVAL);
1292 	}
1293 	sz = MAXPATHLEN - len;
1294 	ptr = buf + len;
1295 
1296 	/* Append the filename to the prefix. */
1297 	if (pathseg == UIO_SYSSPACE)
1298 		error = copystr(path, ptr, sz, &len);
1299 	else
1300 		error = copyinstr(path, ptr, sz, &len);
1301 
1302 	if (error) {
1303 		*pathbuf = NULL;
1304 		free(buf, M_TEMP);
1305 		return (error);
1306 	}
1307 
1308 	/* Only use a prefix with absolute pathnames. */
1309 	if (*ptr != '/') {
1310 		error = EINVAL;
1311 		goto keeporig;
1312 	}
1313 
1314 	if (dirfd != AT_FDCWD) {
1315 		/*
1316 		 * We want the original because the "prefix" is
1317 		 * included in the already opened dirfd.
1318 		 */
1319 		bcopy(ptr, buf, len);
1320 		return (0);
1321 	}
1322 
1323 	/*
1324 	 * We know that there is a / somewhere in this pathname.
1325 	 * Search backwards for it, to find the file's parent dir
1326 	 * to see if it exists in the alternate tree. If it does,
1327 	 * and we want to create a file (cflag is set). We don't
1328 	 * need to worry about the root comparison in this case.
1329 	 */
1330 
1331 	if (create) {
1332 		for (cp = &ptr[len] - 1; *cp != '/'; cp--);
1333 		*cp = '\0';
1334 
1335 		NDINIT(&nd, LOOKUP, FOLLOW, UIO_SYSSPACE, buf, td);
1336 		error = namei(&nd);
1337 		*cp = '/';
1338 		if (error != 0)
1339 			goto keeporig;
1340 	} else {
1341 		NDINIT(&nd, LOOKUP, FOLLOW, UIO_SYSSPACE, buf, td);
1342 
1343 		error = namei(&nd);
1344 		if (error != 0)
1345 			goto keeporig;
1346 
1347 		/*
1348 		 * We now compare the vnode of the prefix to the one
1349 		 * vnode asked. If they resolve to be the same, then we
1350 		 * ignore the match so that the real root gets used.
1351 		 * This avoids the problem of traversing "../.." to find the
1352 		 * root directory and never finding it, because "/" resolves
1353 		 * to the emulation root directory. This is expensive :-(
1354 		 */
1355 		NDINIT(&ndroot, LOOKUP, FOLLOW, UIO_SYSSPACE, prefix,
1356 		    td);
1357 
1358 		/* We shouldn't ever get an error from this namei(). */
1359 		error = namei(&ndroot);
1360 		if (error == 0) {
1361 			if (nd.ni_vp == ndroot.ni_vp)
1362 				error = ENOENT;
1363 
1364 			NDFREE(&ndroot, NDF_ONLY_PNBUF);
1365 			vrele(ndroot.ni_vp);
1366 		}
1367 	}
1368 
1369 	NDFREE(&nd, NDF_ONLY_PNBUF);
1370 	vrele(nd.ni_vp);
1371 
1372 keeporig:
1373 	/* If there was an error, use the original path name. */
1374 	if (error)
1375 		bcopy(ptr, buf, len);
1376 	return (error);
1377 }
1378