1 /*- 2 * SPDX-License-Identifier: BSD-3-Clause 3 * 4 * Copyright (c) 1982, 1986, 1989, 1993 5 * The Regents of the University of California. All rights reserved. 6 * (c) UNIX System Laboratories, Inc. 7 * All or some portions of this file are derived from material licensed 8 * to the University of California by American Telephone and Telegraph 9 * Co. or Unix System Laboratories, Inc. and are reproduced herein with 10 * the permission of UNIX System Laboratories, Inc. 11 * 12 * Redistribution and use in source and binary forms, with or without 13 * modification, are permitted provided that the following conditions 14 * are met: 15 * 1. Redistributions of source code must retain the above copyright 16 * notice, this list of conditions and the following disclaimer. 17 * 2. Redistributions in binary form must reproduce the above copyright 18 * notice, this list of conditions and the following disclaimer in the 19 * documentation and/or other materials provided with the distribution. 20 * 3. Neither the name of the University nor the names of its contributors 21 * may be used to endorse or promote products derived from this software 22 * without specific prior written permission. 23 * 24 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 25 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 26 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 27 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 28 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 29 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 30 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 31 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 32 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 33 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 34 * SUCH DAMAGE. 
35 */ 36 37 #include <sys/cdefs.h> 38 #include "opt_capsicum.h" 39 #include "opt_ktrace.h" 40 41 #include <sys/param.h> 42 #include <sys/systm.h> 43 #include <sys/dirent.h> 44 #include <sys/kernel.h> 45 #include <sys/capsicum.h> 46 #include <sys/fcntl.h> 47 #include <sys/jail.h> 48 #include <sys/lock.h> 49 #include <sys/mutex.h> 50 #include <sys/namei.h> 51 #include <sys/vnode.h> 52 #include <sys/mount.h> 53 #include <sys/filedesc.h> 54 #include <sys/proc.h> 55 #include <sys/sdt.h> 56 #include <sys/syscallsubr.h> 57 #include <sys/sysctl.h> 58 #ifdef KTRACE 59 #include <sys/ktrace.h> 60 #endif 61 #ifdef INVARIANTS 62 #include <machine/_inttypes.h> 63 #endif 64 65 #include <security/audit/audit.h> 66 #include <security/mac/mac_framework.h> 67 68 #include <vm/uma.h> 69 70 #ifdef INVARIANTS 71 static void NDVALIDATE_impl(struct nameidata *, int); 72 #define NDVALIDATE(ndp) NDVALIDATE_impl(ndp, __LINE__) 73 #else 74 #define NDVALIDATE(ndp) 75 #endif 76 77 /* 78 * Prepare namei() to restart. Reset components to its original state and set 79 * ISRESTARTED flag which signals the underlying lookup code to change the root 80 * from ABI root to actual root and prevents a further restarts. 81 */ 82 #define NDRESTART(ndp) do { \ 83 NDREINIT_DBG(ndp); \ 84 ndp->ni_resflags = 0; \ 85 ndp->ni_cnd.cn_flags &= ~NAMEI_INTERNAL_FLAGS; \ 86 ndp->ni_cnd.cn_flags |= ISRESTARTED; \ 87 } while (0) 88 89 SDT_PROVIDER_DEFINE(vfs); 90 SDT_PROBE_DEFINE4(vfs, namei, lookup, entry, "struct vnode *", "char *", 91 "unsigned long", "bool"); 92 SDT_PROBE_DEFINE4(vfs, namei, lookup, return, "int", "struct vnode *", "bool", 93 "struct nameidata"); 94 95 /* Allocation zone for namei. */ 96 uma_zone_t namei_zone; 97 98 /* Placeholder vnode for mp traversal. 
*/ 99 static struct vnode *vp_crossmp; 100 101 static int 102 crossmp_vop_islocked(struct vop_islocked_args *ap) 103 { 104 105 return (LK_SHARED); 106 } 107 108 static int 109 crossmp_vop_lock1(struct vop_lock1_args *ap) 110 { 111 struct vnode *vp; 112 struct lock *lk __diagused; 113 int flags; 114 115 vp = ap->a_vp; 116 lk = vp->v_vnlock; 117 flags = ap->a_flags; 118 119 KASSERT((flags & (LK_SHARED | LK_NOWAIT)) == (LK_SHARED | LK_NOWAIT), 120 ("%s: invalid lock request 0x%x for crossmp", __func__, flags)); 121 122 if ((flags & LK_INTERLOCK) != 0) 123 VI_UNLOCK(vp); 124 LOCK_LOG_LOCK("SLOCK", &lk->lock_object, 0, 0, ap->a_file, ap->a_line); 125 return (0); 126 } 127 128 static int 129 crossmp_vop_unlock(struct vop_unlock_args *ap) 130 { 131 struct vnode *vp; 132 struct lock *lk __diagused; 133 134 vp = ap->a_vp; 135 lk = vp->v_vnlock; 136 137 LOCK_LOG_LOCK("SUNLOCK", &lk->lock_object, 0, 0, LOCK_FILE, 138 LOCK_LINE); 139 return (0); 140 } 141 142 static struct vop_vector crossmp_vnodeops = { 143 .vop_default = &default_vnodeops, 144 .vop_islocked = crossmp_vop_islocked, 145 .vop_lock1 = crossmp_vop_lock1, 146 .vop_unlock = crossmp_vop_unlock, 147 }; 148 /* 149 * VFS_VOP_VECTOR_REGISTER(crossmp_vnodeops) is not used here since the vnode 150 * gets allocated early. See nameiinit for the direct call below. 151 */ 152 153 struct nameicap_tracker { 154 struct vnode *dp; 155 TAILQ_ENTRY(nameicap_tracker) nm_link; 156 }; 157 158 /* Zone for cap mode tracker elements used for dotdot capability checks. 
*/ 159 MALLOC_DEFINE(M_NAMEITRACKER, "namei_tracker", "namei tracking for dotdot"); 160 161 static void 162 nameiinit(void *dummy __unused) 163 { 164 165 namei_zone = uma_zcreate("NAMEI", MAXPATHLEN, NULL, NULL, NULL, NULL, 166 UMA_ALIGN_PTR, 0); 167 vfs_vector_op_register(&crossmp_vnodeops); 168 getnewvnode("crossmp", NULL, &crossmp_vnodeops, &vp_crossmp); 169 vp_crossmp->v_state = VSTATE_CONSTRUCTED; 170 vp_crossmp->v_irflag |= VIRF_CROSSMP; 171 } 172 SYSINIT(vfs, SI_SUB_VFS, SI_ORDER_SECOND, nameiinit, NULL); 173 174 static int lookup_cap_dotdot = 1; 175 SYSCTL_INT(_vfs, OID_AUTO, lookup_cap_dotdot, CTLFLAG_RWTUN, 176 &lookup_cap_dotdot, 0, 177 "enables \"..\" components in path lookup in capability mode"); 178 static int lookup_cap_dotdot_nonlocal = 1; 179 SYSCTL_INT(_vfs, OID_AUTO, lookup_cap_dotdot_nonlocal, CTLFLAG_RWTUN, 180 &lookup_cap_dotdot_nonlocal, 0, 181 "enables \"..\" components in path lookup in capability mode " 182 "on non-local mount"); 183 184 static void 185 nameicap_tracker_add(struct nameidata *ndp, struct vnode *dp) 186 { 187 struct nameicap_tracker *nt; 188 189 if ((ndp->ni_lcf & NI_LCF_CAP_DOTDOT) == 0 || dp->v_type != VDIR) 190 return; 191 nt = TAILQ_LAST(&ndp->ni_cap_tracker, nameicap_tracker_head); 192 if (nt != NULL && nt->dp == dp) 193 return; 194 nt = malloc(sizeof(*nt), M_NAMEITRACKER, M_WAITOK); 195 vhold(dp); 196 nt->dp = dp; 197 TAILQ_INSERT_TAIL(&ndp->ni_cap_tracker, nt, nm_link); 198 } 199 200 static void 201 nameicap_cleanup_from(struct nameidata *ndp, struct nameicap_tracker *first) 202 { 203 struct nameicap_tracker *nt, *nt1; 204 205 nt = first; 206 TAILQ_FOREACH_FROM_SAFE(nt, &ndp->ni_cap_tracker, nm_link, nt1) { 207 TAILQ_REMOVE(&ndp->ni_cap_tracker, nt, nm_link); 208 vdrop(nt->dp); 209 free(nt, M_NAMEITRACKER); 210 } 211 } 212 213 static void 214 nameicap_cleanup(struct nameidata *ndp) 215 { 216 KASSERT(TAILQ_EMPTY(&ndp->ni_cap_tracker) || 217 (ndp->ni_lcf & NI_LCF_CAP_DOTDOT) != 0, ("not strictrelative")); 218 
nameicap_cleanup_from(ndp, NULL); 219 } 220 221 /* 222 * For dotdot lookups in capability mode, only allow the component 223 * lookup to succeed if the resulting directory was already traversed 224 * during the operation. This catches situations where already 225 * traversed directory is moved to different parent, and then we walk 226 * over it with dotdots. 227 * 228 * Also allow to force failure of dotdot lookups for non-local 229 * filesystems, where external agents might assist local lookups to 230 * escape the compartment. 231 */ 232 static int 233 nameicap_check_dotdot(struct nameidata *ndp, struct vnode *dp) 234 { 235 struct nameicap_tracker *nt; 236 struct mount *mp; 237 238 if (dp == NULL || dp->v_type != VDIR || (ndp->ni_lcf & 239 NI_LCF_STRICTRELATIVE) == 0) 240 return (0); 241 if ((ndp->ni_lcf & NI_LCF_CAP_DOTDOT) == 0) 242 return (ENOTCAPABLE); 243 mp = dp->v_mount; 244 if (lookup_cap_dotdot_nonlocal == 0 && mp != NULL && 245 (mp->mnt_flag & MNT_LOCAL) == 0) 246 return (ENOTCAPABLE); 247 TAILQ_FOREACH_REVERSE(nt, &ndp->ni_cap_tracker, nameicap_tracker_head, 248 nm_link) { 249 if (dp == nt->dp) { 250 nt = TAILQ_NEXT(nt, nm_link); 251 if (nt != NULL) 252 nameicap_cleanup_from(ndp, nt); 253 return (0); 254 } 255 } 256 return (ENOTCAPABLE); 257 } 258 259 static void 260 namei_cleanup_cnp(struct componentname *cnp) 261 { 262 263 uma_zfree(namei_zone, cnp->cn_pnbuf); 264 cnp->cn_pnbuf = NULL; 265 cnp->cn_nameptr = NULL; 266 } 267 268 static int 269 namei_handle_root(struct nameidata *ndp, struct vnode **dpp) 270 { 271 struct componentname *cnp; 272 273 cnp = &ndp->ni_cnd; 274 if ((ndp->ni_lcf & NI_LCF_STRICTRELATIVE) != 0) { 275 #ifdef KTRACE 276 if (KTRPOINT(curthread, KTR_CAPFAIL)) 277 ktrcapfail(CAPFAIL_LOOKUP, NULL, NULL); 278 #endif 279 return (ENOTCAPABLE); 280 } 281 while (*(cnp->cn_nameptr) == '/') { 282 cnp->cn_nameptr++; 283 ndp->ni_pathlen--; 284 } 285 *dpp = ndp->ni_rootdir; 286 vrefact(*dpp); 287 return (0); 288 } 289 290 static int 291 
namei_setup(struct nameidata *ndp, struct vnode **dpp, struct pwd **pwdp) 292 { 293 struct componentname *cnp; 294 struct thread *td; 295 struct pwd *pwd; 296 int error; 297 bool startdir_used; 298 299 cnp = &ndp->ni_cnd; 300 td = curthread; 301 302 startdir_used = false; 303 *pwdp = NULL; 304 *dpp = NULL; 305 306 #ifdef CAPABILITY_MODE 307 /* 308 * In capability mode, lookups must be restricted to happen in 309 * the subtree with the root specified by the file descriptor: 310 * - The root must be real file descriptor, not the pseudo-descriptor 311 * AT_FDCWD. 312 * - The passed path must be relative and not absolute. 313 * - If lookup_cap_dotdot is disabled, path must not contain the 314 * '..' components. 315 * - If lookup_cap_dotdot is enabled, we verify that all '..' 316 * components lookups result in the directories which were 317 * previously walked by us, which prevents an escape from 318 * the relative root. 319 */ 320 if (IN_CAPABILITY_MODE(td) && (cnp->cn_flags & NOCAPCHECK) == 0) { 321 ndp->ni_lcf |= NI_LCF_STRICTRELATIVE; 322 ndp->ni_resflags |= NIRES_STRICTREL; 323 if (ndp->ni_dirfd == AT_FDCWD) { 324 #ifdef KTRACE 325 if (KTRPOINT(td, KTR_CAPFAIL)) 326 ktrcapfail(CAPFAIL_LOOKUP, NULL, NULL); 327 #endif 328 return (ECAPMODE); 329 } 330 } 331 #endif 332 error = 0; 333 334 /* 335 * Get starting point for the translation. 336 */ 337 pwd = pwd_hold(td); 338 /* 339 * The reference on ni_rootdir is acquired in the block below to avoid 340 * back-to-back atomics for absolute lookups. 
341 */ 342 namei_setup_rootdir(ndp, cnp, pwd); 343 ndp->ni_topdir = pwd->pwd_jdir; 344 345 if (cnp->cn_pnbuf[0] == '/') { 346 ndp->ni_resflags |= NIRES_ABS; 347 error = namei_handle_root(ndp, dpp); 348 } else { 349 if (ndp->ni_startdir != NULL) { 350 *dpp = ndp->ni_startdir; 351 startdir_used = true; 352 } else if (ndp->ni_dirfd == AT_FDCWD) { 353 *dpp = pwd->pwd_cdir; 354 vrefact(*dpp); 355 } else { 356 if (cnp->cn_flags & AUDITVNODE1) 357 AUDIT_ARG_ATFD1(ndp->ni_dirfd); 358 if (cnp->cn_flags & AUDITVNODE2) 359 AUDIT_ARG_ATFD2(ndp->ni_dirfd); 360 361 error = fgetvp_lookup(ndp, dpp); 362 } 363 if (error == 0 && (*dpp)->v_type != VDIR && 364 (cnp->cn_pnbuf[0] != '\0' || 365 (cnp->cn_flags & EMPTYPATH) == 0)) 366 error = ENOTDIR; 367 } 368 if (error == 0 && (cnp->cn_flags & RBENEATH) != 0) { 369 if (cnp->cn_pnbuf[0] == '/') { 370 error = ENOTCAPABLE; 371 } else if ((ndp->ni_lcf & NI_LCF_STRICTRELATIVE) == 0) { 372 ndp->ni_lcf |= NI_LCF_STRICTRELATIVE | 373 NI_LCF_CAP_DOTDOT; 374 } 375 } 376 377 /* 378 * If we are auditing the kernel pathname, save the user pathname. 
379 */ 380 if (AUDITING_TD(td)) { 381 if (cnp->cn_flags & AUDITVNODE1) 382 AUDIT_ARG_UPATH1_VP(td, ndp->ni_rootdir, *dpp, cnp->cn_pnbuf); 383 if (cnp->cn_flags & AUDITVNODE2) 384 AUDIT_ARG_UPATH2_VP(td, ndp->ni_rootdir, *dpp, cnp->cn_pnbuf); 385 } 386 if (ndp->ni_startdir != NULL && !startdir_used) 387 vrele(ndp->ni_startdir); 388 if (error != 0) { 389 if (*dpp != NULL) 390 vrele(*dpp); 391 pwd_drop(pwd); 392 return (error); 393 } 394 if ((ndp->ni_lcf & NI_LCF_STRICTRELATIVE) != 0 && 395 lookup_cap_dotdot != 0) 396 ndp->ni_lcf |= NI_LCF_CAP_DOTDOT; 397 SDT_PROBE4(vfs, namei, lookup, entry, *dpp, cnp->cn_pnbuf, 398 cnp->cn_flags, false); 399 *pwdp = pwd; 400 return (0); 401 } 402 403 static int 404 namei_getpath(struct nameidata *ndp) 405 { 406 struct componentname *cnp; 407 int error; 408 409 cnp = &ndp->ni_cnd; 410 411 /* 412 * Get a buffer for the name to be translated, and copy the 413 * name into the buffer. 414 */ 415 cnp->cn_pnbuf = uma_zalloc(namei_zone, M_WAITOK); 416 if (ndp->ni_segflg == UIO_SYSSPACE) { 417 error = copystr(ndp->ni_dirp, cnp->cn_pnbuf, MAXPATHLEN, 418 &ndp->ni_pathlen); 419 } else { 420 error = copyinstr(ndp->ni_dirp, cnp->cn_pnbuf, MAXPATHLEN, 421 &ndp->ni_pathlen); 422 } 423 424 return (error); 425 } 426 427 static int 428 namei_emptypath(struct nameidata *ndp) 429 { 430 struct componentname *cnp; 431 struct pwd *pwd; 432 struct vnode *dp; 433 int error; 434 435 cnp = &ndp->ni_cnd; 436 MPASS(*cnp->cn_pnbuf == '\0'); 437 MPASS((cnp->cn_flags & EMPTYPATH) != 0); 438 MPASS((cnp->cn_flags & (LOCKPARENT | WANTPARENT)) == 0); 439 440 ndp->ni_resflags |= NIRES_EMPTYPATH; 441 error = namei_setup(ndp, &dp, &pwd); 442 if (error != 0) { 443 goto errout; 444 } 445 446 /* 447 * Usecount on dp already provided by namei_setup. 448 */ 449 ndp->ni_vp = dp; 450 pwd_drop(pwd); 451 NDVALIDATE(ndp); 452 if ((cnp->cn_flags & LOCKLEAF) != 0) { 453 VOP_LOCK(dp, (cnp->cn_flags & LOCKSHARED) != 0 ? 
454 LK_SHARED : LK_EXCLUSIVE); 455 if (VN_IS_DOOMED(dp)) { 456 vput(dp); 457 error = ENOENT; 458 goto errout; 459 } 460 } 461 SDT_PROBE4(vfs, namei, lookup, return, 0, ndp->ni_vp, false, ndp); 462 return (0); 463 464 errout: 465 SDT_PROBE4(vfs, namei, lookup, return, error, NULL, false, ndp); 466 namei_cleanup_cnp(cnp); 467 return (error); 468 } 469 470 static int __noinline 471 namei_follow_link(struct nameidata *ndp) 472 { 473 char *cp; 474 struct iovec aiov; 475 struct uio auio; 476 struct componentname *cnp; 477 struct thread *td; 478 int error, linklen; 479 480 error = 0; 481 cnp = &ndp->ni_cnd; 482 td = curthread; 483 484 if (ndp->ni_loopcnt++ >= MAXSYMLINKS) { 485 error = ELOOP; 486 goto out; 487 } 488 #ifdef MAC 489 if ((cnp->cn_flags & NOMACCHECK) == 0) { 490 error = mac_vnode_check_readlink(td->td_ucred, ndp->ni_vp); 491 if (error != 0) 492 goto out; 493 } 494 #endif 495 if (ndp->ni_pathlen > 1) 496 cp = uma_zalloc(namei_zone, M_WAITOK); 497 else 498 cp = cnp->cn_pnbuf; 499 aiov.iov_base = cp; 500 aiov.iov_len = MAXPATHLEN; 501 auio.uio_iov = &aiov; 502 auio.uio_iovcnt = 1; 503 auio.uio_offset = 0; 504 auio.uio_rw = UIO_READ; 505 auio.uio_segflg = UIO_SYSSPACE; 506 auio.uio_td = td; 507 auio.uio_resid = MAXPATHLEN; 508 error = VOP_READLINK(ndp->ni_vp, &auio, cnp->cn_cred); 509 if (error != 0) { 510 if (ndp->ni_pathlen > 1) 511 uma_zfree(namei_zone, cp); 512 goto out; 513 } 514 linklen = MAXPATHLEN - auio.uio_resid; 515 if (linklen == 0) { 516 if (ndp->ni_pathlen > 1) 517 uma_zfree(namei_zone, cp); 518 error = ENOENT; 519 goto out; 520 } 521 if (linklen + ndp->ni_pathlen > MAXPATHLEN) { 522 if (ndp->ni_pathlen > 1) 523 uma_zfree(namei_zone, cp); 524 error = ENAMETOOLONG; 525 goto out; 526 } 527 if (ndp->ni_pathlen > 1) { 528 bcopy(ndp->ni_next, cp + linklen, ndp->ni_pathlen); 529 uma_zfree(namei_zone, cnp->cn_pnbuf); 530 cnp->cn_pnbuf = cp; 531 } else 532 cnp->cn_pnbuf[linklen] = '\0'; 533 ndp->ni_pathlen += linklen; 534 out: 535 return (error); 536 } 537 
538 /* 539 * Convert a pathname into a pointer to a locked vnode. 540 * 541 * The FOLLOW flag is set when symbolic links are to be followed 542 * when they occur at the end of the name translation process. 543 * Symbolic links are always followed for all other pathname 544 * components other than the last. 545 * 546 * The segflg defines whether the name is to be copied from user 547 * space or kernel space. 548 * 549 * Overall outline of namei: 550 * 551 * copy in name 552 * get starting directory 553 * while (!done && !error) { 554 * call lookup to search path. 555 * if symbolic link, massage name in buffer and continue 556 * } 557 */ 558 int 559 namei(struct nameidata *ndp) 560 { 561 struct vnode *dp; /* the directory we are searching */ 562 struct componentname *cnp; 563 struct thread *td; 564 struct pwd *pwd; 565 int error; 566 enum cache_fpl_status status; 567 568 cnp = &ndp->ni_cnd; 569 td = curthread; 570 #ifdef INVARIANTS 571 KASSERT((ndp->ni_debugflags & NAMEI_DBG_CALLED) == 0, 572 ("%s: repeated call to namei without NDREINIT", __func__)); 573 KASSERT(ndp->ni_debugflags == NAMEI_DBG_INITED, 574 ("%s: bad debugflags %d", __func__, ndp->ni_debugflags)); 575 ndp->ni_debugflags |= NAMEI_DBG_CALLED; 576 if (ndp->ni_startdir != NULL) 577 ndp->ni_debugflags |= NAMEI_DBG_HADSTARTDIR; 578 if (cnp->cn_flags & FAILIFEXISTS) { 579 KASSERT(cnp->cn_nameiop == CREATE, 580 ("%s: FAILIFEXISTS passed for op %d", __func__, cnp->cn_nameiop)); 581 /* 582 * The limitation below is to restrict hairy corner cases. 
583 */ 584 KASSERT((cnp->cn_flags & (LOCKPARENT | LOCKLEAF)) == LOCKPARENT, 585 ("%s: FAILIFEXISTS must be passed with LOCKPARENT and without LOCKLEAF", 586 __func__)); 587 } 588 #endif 589 ndp->ni_cnd.cn_cred = td->td_ucred; 590 KASSERT(ndp->ni_resflags == 0, ("%s: garbage in ni_resflags: %x\n", 591 __func__, ndp->ni_resflags)); 592 KASSERT(cnp->cn_cred && td->td_proc, ("namei: bad cred/proc")); 593 KASSERT((cnp->cn_flags & NAMEI_INTERNAL_FLAGS) == 0, 594 ("namei: unexpected flags: %" PRIx64 "\n", 595 cnp->cn_flags & NAMEI_INTERNAL_FLAGS)); 596 if (cnp->cn_flags & NOCACHE) 597 KASSERT(cnp->cn_nameiop != LOOKUP, 598 ("%s: NOCACHE passed with LOOKUP", __func__)); 599 MPASS(ndp->ni_startdir == NULL || ndp->ni_startdir->v_type == VDIR || 600 ndp->ni_startdir->v_type == VBAD); 601 602 restart: 603 ndp->ni_lcf = 0; 604 ndp->ni_loopcnt = 0; 605 ndp->ni_vp = NULL; 606 607 error = namei_getpath(ndp); 608 if (__predict_false(error != 0)) { 609 namei_cleanup_cnp(cnp); 610 SDT_PROBE4(vfs, namei, lookup, return, error, NULL, 611 false, ndp); 612 return (error); 613 } 614 615 cnp->cn_nameptr = cnp->cn_pnbuf; 616 617 #ifdef KTRACE 618 if (KTRPOINT(td, KTR_NAMEI)) { 619 ktrnamei(cnp->cn_pnbuf); 620 } 621 #endif 622 TSNAMEI(curthread->td_proc->p_pid, cnp->cn_pnbuf); 623 624 /* 625 * First try looking up the target without locking any vnodes. 626 * 627 * We may need to start from scratch or pick up where it left off. 
628 */ 629 error = cache_fplookup(ndp, &status, &pwd); 630 switch (status) { 631 case CACHE_FPL_STATUS_UNSET: 632 __assert_unreachable(); 633 break; 634 case CACHE_FPL_STATUS_HANDLED: 635 if (error == 0) 636 NDVALIDATE(ndp); 637 else if (__predict_false(pwd->pwd_adir != pwd->pwd_rdir && 638 (cnp->cn_flags & ISRESTARTED) == 0)) { 639 namei_cleanup_cnp(cnp); 640 NDRESTART(ndp); 641 goto restart; 642 } 643 return (error); 644 case CACHE_FPL_STATUS_PARTIAL: 645 TAILQ_INIT(&ndp->ni_cap_tracker); 646 dp = ndp->ni_startdir; 647 break; 648 case CACHE_FPL_STATUS_DESTROYED: 649 ndp->ni_loopcnt = 0; 650 error = namei_getpath(ndp); 651 if (__predict_false(error != 0)) { 652 namei_cleanup_cnp(cnp); 653 return (error); 654 } 655 cnp->cn_nameptr = cnp->cn_pnbuf; 656 /* FALLTHROUGH */ 657 case CACHE_FPL_STATUS_ABORTED: 658 TAILQ_INIT(&ndp->ni_cap_tracker); 659 MPASS(ndp->ni_lcf == 0); 660 if (*cnp->cn_pnbuf == '\0') { 661 if ((cnp->cn_flags & EMPTYPATH) != 0) { 662 return (namei_emptypath(ndp)); 663 } 664 namei_cleanup_cnp(cnp); 665 SDT_PROBE4(vfs, namei, lookup, return, ENOENT, NULL, 666 false, ndp); 667 return (ENOENT); 668 } 669 error = namei_setup(ndp, &dp, &pwd); 670 if (error != 0) { 671 namei_cleanup_cnp(cnp); 672 return (error); 673 } 674 break; 675 } 676 677 /* 678 * Locked lookup. 679 */ 680 for (;;) { 681 ndp->ni_startdir = dp; 682 error = vfs_lookup(ndp); 683 if (error != 0) { 684 if (__predict_false(pwd->pwd_adir != pwd->pwd_rdir && 685 error == ENOENT && 686 (cnp->cn_flags & ISRESTARTED) == 0)) { 687 nameicap_cleanup(ndp); 688 pwd_drop(pwd); 689 namei_cleanup_cnp(cnp); 690 NDRESTART(ndp); 691 goto restart; 692 } else 693 goto out; 694 } 695 696 /* 697 * If not a symbolic link, we're done. 
698 */ 699 if ((cnp->cn_flags & ISSYMLINK) == 0) { 700 SDT_PROBE4(vfs, namei, lookup, return, error, 701 ndp->ni_vp, false, ndp); 702 nameicap_cleanup(ndp); 703 pwd_drop(pwd); 704 NDVALIDATE(ndp); 705 return (0); 706 } 707 error = namei_follow_link(ndp); 708 if (error != 0) 709 break; 710 vput(ndp->ni_vp); 711 dp = ndp->ni_dvp; 712 /* 713 * Check if root directory should replace current directory. 714 */ 715 cnp->cn_nameptr = cnp->cn_pnbuf; 716 if (*(cnp->cn_nameptr) == '/') { 717 /* 718 * Reset the lookup to start from the real root without 719 * origin path name reloading. 720 */ 721 if (__predict_false(ndp->ni_rootdir != pwd->pwd_rdir)) { 722 cnp->cn_flags |= ISRESTARTED; 723 ndp->ni_rootdir = pwd->pwd_rdir; 724 } 725 vrele(dp); 726 error = namei_handle_root(ndp, &dp); 727 if (error != 0) 728 goto out; 729 } 730 } 731 vput(ndp->ni_vp); 732 ndp->ni_vp = NULL; 733 vrele(ndp->ni_dvp); 734 out: 735 MPASS(error != 0); 736 SDT_PROBE4(vfs, namei, lookup, return, error, NULL, false, ndp); 737 namei_cleanup_cnp(cnp); 738 nameicap_cleanup(ndp); 739 pwd_drop(pwd); 740 return (error); 741 } 742 743 static int 744 enforce_lkflags(struct mount *mp, int lkflags) 745 { 746 747 if (mp == NULL || ((lkflags & LK_SHARED) && 748 !(mp->mnt_kern_flag & MNTK_LOOKUP_SHARED))) { 749 lkflags &= ~LK_SHARED; 750 lkflags |= LK_EXCLUSIVE; 751 } 752 lkflags |= LK_NODDLKTREAT; 753 return (lkflags); 754 } 755 756 static __inline int 757 needs_exclusive_leaf(struct mount *mp, int flags) 758 { 759 760 /* 761 * Intermediate nodes can use shared locks, we only need to 762 * force an exclusive lock for leaf nodes. 763 */ 764 if ((flags & (ISLASTCN | LOCKLEAF)) != (ISLASTCN | LOCKLEAF)) 765 return (0); 766 767 /* Always use exclusive locks if LOCKSHARED isn't set. */ 768 if (!(flags & LOCKSHARED)) 769 return (1); 770 771 /* 772 * For lookups during open(), if the mount point supports 773 * extended shared operations, then use a shared lock for the 774 * leaf node, otherwise use an exclusive lock. 
775 */ 776 if ((flags & ISOPEN) != 0) 777 return (!MNT_EXTENDED_SHARED(mp)); 778 779 /* 780 * Lookup requests outside of open() that specify LOCKSHARED 781 * only need a shared lock on the leaf vnode. 782 */ 783 return (0); 784 } 785 786 /* 787 * Various filesystems expect to be able to copy a name component with length 788 * bounded by NAME_MAX into a directory entry buffer of size MAXNAMLEN. Make 789 * sure that these are the same size. 790 */ 791 _Static_assert(MAXNAMLEN == NAME_MAX, 792 "MAXNAMLEN and NAME_MAX have different values"); 793 794 static int __noinline 795 vfs_lookup_degenerate(struct nameidata *ndp, struct vnode *dp, int wantparent) 796 { 797 struct componentname *cnp; 798 struct mount *mp; 799 int error; 800 801 cnp = &ndp->ni_cnd; 802 803 cnp->cn_flags |= ISLASTCN; 804 805 mp = atomic_load_ptr(&dp->v_mount); 806 if (needs_exclusive_leaf(mp, cnp->cn_flags)) { 807 cnp->cn_lkflags &= ~LK_SHARED; 808 cnp->cn_lkflags |= LK_EXCLUSIVE; 809 } 810 811 vn_lock(dp, enforce_lkflags(mp, cnp->cn_lkflags | LK_RETRY)); 812 813 if (dp->v_type != VDIR) { 814 error = ENOTDIR; 815 goto bad; 816 } 817 if (cnp->cn_nameiop != LOOKUP) { 818 error = EISDIR; 819 goto bad; 820 } 821 if (wantparent) { 822 ndp->ni_dvp = dp; 823 VREF(dp); 824 } 825 ndp->ni_vp = dp; 826 cnp->cn_namelen = 0; 827 828 if (cnp->cn_flags & AUDITVNODE1) 829 AUDIT_ARG_VNODE1(dp); 830 else if (cnp->cn_flags & AUDITVNODE2) 831 AUDIT_ARG_VNODE2(dp); 832 833 if (!(cnp->cn_flags & (LOCKPARENT | LOCKLEAF))) 834 VOP_UNLOCK(dp); 835 return (0); 836 bad: 837 VOP_UNLOCK(dp); 838 return (error); 839 } 840 841 /* 842 * FAILIFEXISTS handling. 843 * 844 * XXX namei called with LOCKPARENT but not LOCKLEAF has the strange 845 * behaviour of leaving the vnode unlocked if the target is the same 846 * vnode as the parent. 
847 */ 848 static int __noinline 849 vfs_lookup_failifexists(struct nameidata *ndp) 850 { 851 struct componentname *cnp __diagused; 852 853 cnp = &ndp->ni_cnd; 854 855 MPASS((cnp->cn_flags & ISSYMLINK) == 0); 856 if (ndp->ni_vp == ndp->ni_dvp) 857 vrele(ndp->ni_dvp); 858 else 859 vput(ndp->ni_dvp); 860 vrele(ndp->ni_vp); 861 ndp->ni_dvp = NULL; 862 ndp->ni_vp = NULL; 863 NDFREE_PNBUF(ndp); 864 return (EEXIST); 865 } 866 867 static int __noinline 868 vfs_lookup_cross_mount(struct nameidata *ndp) 869 { 870 struct componentname *cnp; 871 struct mount *mp; 872 struct vnode *dp, *tdp; 873 int error, crosslkflags; 874 bool crosslock; 875 876 cnp = &ndp->ni_cnd; 877 dp = ndp->ni_vp; 878 879 /* 880 * The vnode has been mounted on, find the root of the mounted 881 * filesystem. 882 */ 883 do { 884 mp = dp->v_mountedhere; 885 ASSERT_VOP_LOCKED(dp, __func__); 886 VNPASS((vn_irflag_read(dp) & VIRF_MOUNTPOINT) != 0 && mp != NULL, dp); 887 888 crosslock = (dp->v_vflag & VV_CROSSLOCK) != 0; 889 crosslkflags = enforce_lkflags(mp, cnp->cn_lkflags); 890 if (__predict_false(crosslock)) { 891 /* 892 * We are going to be holding the vnode lock, which 893 * in this case is shared by the root vnode of the 894 * filesystem mounted at mp, across the call to 895 * VFS_ROOT(). Make the situation clear to the 896 * filesystem by passing LK_CANRECURSE if the 897 * lock is held exclusive, or by clearinng 898 * LK_NODDLKTREAT to allow recursion on the shared 899 * lock in the presence of an exclusive waiter. 
900 */ 901 if (VOP_ISLOCKED(dp) == LK_EXCLUSIVE) { 902 crosslkflags &= ~LK_SHARED; 903 crosslkflags |= LK_EXCLUSIVE | LK_CANRECURSE; 904 } else if ((crosslkflags & LK_EXCLUSIVE) != 0) { 905 error = vn_lock(dp, LK_UPGRADE); 906 if (error != 0) { 907 MPASS(error == ENOENT); 908 vrele(dp); 909 if (dp != ndp->ni_dvp) 910 vput(ndp->ni_dvp); 911 else 912 vrele(ndp->ni_dvp); 913 break; 914 } 915 if (dp->v_mountedhere != mp) { 916 /* 917 * Note that we rely on the 918 * VIRF_MOUNTPOINT loop condition to 919 * ensure we stop iterating if dp is 920 * no longer a mountpoint at all. 921 */ 922 continue; 923 } 924 } else 925 crosslkflags &= ~LK_NODDLKTREAT; 926 } 927 if (vfs_busy(mp, 0) != 0) 928 continue; 929 if (__predict_true(!crosslock)) 930 vput(dp); 931 if (dp != ndp->ni_dvp) 932 vput(ndp->ni_dvp); 933 else 934 vrele(ndp->ni_dvp); 935 vrefact(vp_crossmp); 936 ndp->ni_dvp = vp_crossmp; 937 error = VFS_ROOT(mp, crosslkflags, &tdp); 938 vfs_unbusy(mp); 939 if (__predict_false(crosslock)) 940 vput(dp); 941 if (vn_lock(vp_crossmp, LK_SHARED | LK_NOWAIT)) 942 panic("vp_crossmp exclusively locked or reclaimed"); 943 if (error != 0) 944 break; 945 ndp->ni_vp = dp = tdp; 946 } while ((vn_irflag_read(dp) & VIRF_MOUNTPOINT) != 0); 947 948 return (error); 949 } 950 951 /* 952 * Search a pathname. 953 * This is a very central and rather complicated routine. 954 * 955 * The pathname is pointed to by cn_nameptr and is of length ni_pathlen. 956 * The starting directory is taken from ni_startdir. The pathname is 957 * descended until done, or a symbolic link is encountered. The cn_flags 958 * has ISLASTCN or'ed if the path is completed or ISSYMLINK or'ed if a 959 * symbolic link needing interpretation is encountered. 960 * 961 * The cn_nameiop is LOOKUP, CREATE, RENAME, or DELETE depending on 962 * whether the name is to be looked up, created, renamed, or deleted. 
963 * When CREATE, RENAME, or DELETE is specified, information usable in 964 * creating, renaming, or deleting a directory entry may be calculated. 965 * If cn_flags has LOCKPARENT or'ed into it, the parent directory is returned 966 * locked. If it has WANTPARENT or'ed into it, the parent directory is 967 * returned unlocked. Otherwise the parent directory is not returned. If 968 * the target of the pathname exists and LOCKLEAF is or'ed into the cn_flags 969 * the target is returned locked, otherwise it is returned unlocked. 970 * 971 * Overall outline of lookup: 972 * 973 * handle degenerate case where name is null string 974 * 975 * dirloop: 976 * identify next component of name at ndp->ni_cnd.cn_nameptr 977 * handle .. special cases related to capabilities, chroot, jail 978 * if .. and crossing mount points and on mounted filesys, find parent 979 * call VOP_LOOKUP routine for next component name 980 * directory vnode returned in ni_dvp, unlocked unless LOCKPARENT set 981 * component vnode returned in ni_vp (if it exists), locked. 
*	if result vnode is mounted on and crossing mount points,
*	    find mounted on vnode
*	if more components of name, do next level at dirloop
*	if VOP_LOOKUP returns ERELOOKUP, repeat the same level at dirloop
*	return the answer in ni_vp, locked if LOCKLEAF set
*	    if LOCKPARENT set, return locked parent in ni_dvp
*	    if WANTPARENT set, return unlocked parent in ni_dvp
*/
/*
 * Error contract visible in the body below: returns 0 on success, otherwise
 * an errno value.  Every failure path funnels through the bad2/bad/
 * bad_unlocked labels, which release whatever is still held and set
 * ndp->ni_vp to NULL before returning.
 */
int
vfs_lookup(struct nameidata *ndp)
{
	char *cp;			/* pointer into pathname argument */
	char *prev_ni_next;		/* saved ndp->ni_next */
	char *nulchar;			/* location of '\0' in cn_pnbuf */
	char *lastchar;			/* location of the last character */
	struct vnode *dp = NULL;	/* the directory we are searching */
	struct vnode *tdp;		/* saved dp */
	struct prison *pr;		/* cursor for the ".." prison-root walk */
	size_t prev_ni_pathlen;		/* saved ndp->ni_pathlen */
	int docache;			/* == 0 do not cache last component */
	int wantparent;			/* nonzero => LOCKPARENT or WANTPARENT set */
	int rdonly;			/* lookup read-only flag bit */
	int error = 0;
	int relookup = 0;		/* do not consume the path component */
	struct componentname *cnp = &ndp->ni_cnd;
	int lkflags_save;		/* cn_lkflags saved across VOP_LOOKUP() */
	/*
	 * State of ndp->ni_dvp, consumed by the bad2 cleanup:
	 *   0 - still locked and referenced,
	 *   1 - unlocked here but still referenced,
	 *   2 - already released (vput/vrele), nothing left to drop.
	 */
	int ni_dvp_unlocked;

	/*
	 * Setup: break out flag bits into variables.
	 */
	ni_dvp_unlocked = 0;
	wantparent = cnp->cn_flags & (LOCKPARENT | WANTPARENT);
	KASSERT(cnp->cn_nameiop == LOOKUP || wantparent,
	    ("CREATE, DELETE, RENAME require LOCKPARENT or WANTPARENT."));
	/*
	 * When set to zero, docache causes the last component of the
	 * pathname to be deleted from the cache and the full lookup
	 * of the name to be done (via VOP_CACHEDLOOKUP()). Often
	 * filesystems need some pre-computed values that are made
	 * during the full lookup, for instance UFS sets dp->i_offset.
	 *
	 * The docache variable is set to zero when requested by the
	 * NOCACHE flag and for all modifying operations except CREATE.
	 */
	docache = (cnp->cn_flags & NOCACHE) ^ NOCACHE;
	if (cnp->cn_nameiop == DELETE ||
	    (wantparent && cnp->cn_nameiop != CREATE &&
	     cnp->cn_nameiop != LOOKUP))
		docache = 0;
	rdonly = cnp->cn_flags & RDONLY;
	cnp->cn_flags &= ~ISSYMLINK;
	ndp->ni_dvp = NULL;

	cnp->cn_lkflags = LK_SHARED;
	/*
	 * Take the starting directory over from ndp; from here on it is
	 * tracked only through dp.
	 */
	dp = ndp->ni_startdir;
	ndp->ni_startdir = NULLVP;

	/*
	 * Leading slashes, if any, are supposed to be skipped by the caller.
	 */
	MPASS(cnp->cn_nameptr[0] != '/');

	/*
	 * Check for degenerate name (e.g. / or "") which is a way of talking
	 * about a directory, e.g. like "/." or ".".
	 */
	if (__predict_false(cnp->cn_nameptr[0] == '\0')) {
		error = vfs_lookup_degenerate(ndp, dp, wantparent);
		if (error == 0)
			goto success_right_lock;
		goto bad_unlocked;
	}

	/*
	 * Nul-out trailing slashes (e.g., "foo///" -> "foo").
	 *
	 * This must be done before VOP_LOOKUP() because some fs's don't know
	 * about trailing slashes.  Remember if there were trailing slashes to
	 * handle symlinks, existing non-directories and non-existing files
	 * that won't be directories specially later.
	 *
	 * cn_nameptr[ni_pathlen - 1] is the terminating nul (asserted at
	 * dirloop below), so index ni_pathlen - 2 is the last real character.
	 */
	MPASS(ndp->ni_pathlen >= 2);
	lastchar = &cnp->cn_nameptr[ndp->ni_pathlen - 2];
	if (*lastchar == '/') {
		while (lastchar >= cnp->cn_pnbuf) {
			*lastchar = '\0';
			lastchar--;
			ndp->ni_pathlen--;
			if (*lastchar != '/') {
				break;
			}
		}
		cnp->cn_flags |= TRAILINGSLASH;
	}

	/*
	 * We use shared locks until we hit the parent of the last cn then
	 * we adjust based on the requesting flags.
	 */
	vn_lock(dp,
	    enforce_lkflags(dp->v_mount, cnp->cn_lkflags | LK_RETRY));

dirloop:
	/*
	 * Search a new directory.
	 *
	 * The last component of the filename is left accessible via
	 * cnp->cn_nameptr. It has to be freed with a call to NDFREE*.
	 *
	 * Temporarily store '/' over the terminating nul as a sentinel so
	 * that the scan below only has to test for a single character.
	 * Pathnames tend to be short, so this should not cause cache misses.
	 */
	nulchar = &cnp->cn_nameptr[ndp->ni_pathlen - 1];
	KASSERT(*nulchar == '\0',
	    ("%s: expected nul at %p; string [%s]\n", __func__, nulchar,
	    cnp->cn_pnbuf));
	*nulchar = '/';
	for (cp = cnp->cn_nameptr; *cp != '/'; cp++) {
		KASSERT(*cp != '\0',
		    ("%s: encountered unexpected nul; string [%s]\n", __func__,
		    cnp->cn_nameptr));
		continue;
	}
	/* Restore the terminator before anything else looks at the path. */
	*nulchar = '\0';
	cnp->cn_namelen = cp - cnp->cn_nameptr;
	if (__predict_false(cnp->cn_namelen > NAME_MAX)) {
		error = ENAMETOOLONG;
		goto bad;
	}
	/*
	 * Remember the pre-component path state so that an ERELOOKUP from
	 * VOP_LOOKUP() can rewind and replay this same component.
	 */
	prev_ni_pathlen = ndp->ni_pathlen;
	ndp->ni_pathlen -= cnp->cn_namelen;
	KASSERT(ndp->ni_pathlen <= PATH_MAX,
	    ("%s: ni_pathlen underflow to %zd\n", __func__, ndp->ni_pathlen));
	prev_ni_next = ndp->ni_next;
	ndp->ni_next = cp;

	/*
	 * Something else should be clearing this.
	 */
	cnp->cn_flags &= ~(ISDOTDOT|ISLASTCN);

	/* Do not enter the last component when caching of it is disabled. */
	cnp->cn_flags |= MAKEENTRY;
	if (*cp == '\0' && docache == 0)
		cnp->cn_flags &= ~MAKEENTRY;
	if (cnp->cn_namelen == 2 &&
	    cnp->cn_nameptr[1] == '.' && cnp->cn_nameptr[0] == '.')
		cnp->cn_flags |= ISDOTDOT;
	if (*ndp->ni_next == 0) {
		cnp->cn_flags |= ISLASTCN;

		/* A final "." may not be the target of DELETE or RENAME. */
		if (__predict_false(cnp->cn_namelen == 1 && cnp->cn_nameptr[0] == '.' &&
		    (cnp->cn_nameiop == DELETE || cnp->cn_nameiop == RENAME))) {
			error = EINVAL;
			goto bad;
		}
	}

	nameicap_tracker_add(ndp, dp);

	/*
	 * Make sure degenerate names don't get here, their handling was
	 * previously found in this spot.
	 */
	MPASS(cnp->cn_nameptr[0] != '\0');

	/*
	 * Handle "..": six special cases (numbered 0 through 5).
	 * 0. If doing a capability lookup and lookup_cap_dotdot is
	 *    disabled, return ENOTCAPABLE.
	 * 1. Return an error if this is the last component of
	 *    the name and the operation is DELETE or RENAME.
	 * 2. If at root directory (e.g. after chroot)
	 *    or at absolute root directory
	 *    then ignore it so can't get out.
	 * 3. If this vnode is the root of a mounted
	 *    filesystem, then replace it with the
	 *    vnode which was mounted on so we take the
	 *    .. in the other filesystem.
	 * 4. If the vnode is the top directory of
	 *    the jail or chroot, don't let them out.
	 * 5. If doing a capability lookup and lookup_cap_dotdot is
	 *    enabled, return ENOTCAPABLE if the lookup would escape
	 *    from the initial file descriptor directory.  Checks are
	 *    done by ensuring that namei() already traversed the
	 *    result of dotdot lookup.
	 */
	if (cnp->cn_flags & ISDOTDOT) {
		if ((ndp->ni_lcf & (NI_LCF_STRICTRELATIVE | NI_LCF_CAP_DOTDOT))
		    == NI_LCF_STRICTRELATIVE) {
#ifdef KTRACE
			if (KTRPOINT(curthread, KTR_CAPFAIL))
				ktrcapfail(CAPFAIL_LOOKUP, NULL, NULL);
#endif
			error = ENOTCAPABLE;
			goto bad;
		}
		if ((cnp->cn_flags & ISLASTCN) != 0 &&
		    (cnp->cn_nameiop == DELETE || cnp->cn_nameiop == RENAME)) {
			error = EINVAL;
			goto bad;
		}
		for (;;) {
			/*
			 * Walk the credential's prison chain; pr ends up
			 * non-NULL iff dp is the root of one of them.
			 */
			for (pr = cnp->cn_cred->cr_prison; pr != NULL;
			     pr = pr->pr_parent)
				if (dp == pr->pr_root)
					break;
			bool isroot = dp == ndp->ni_rootdir ||
			    dp == ndp->ni_topdir || dp == rootvnode ||
			    pr != NULL;
			if (isroot && (ndp->ni_lcf &
			    NI_LCF_STRICTRELATIVE) != 0) {
				error = ENOTCAPABLE;
				goto capdotdot;
			}
			/*
			 * At a root we must not leave (or at a mount root
			 * with NOCROSSMOUNT): ".." resolves to dp itself, so
			 * skip VOP_LOOKUP() and take an extra reference for
			 * the ni_vp alias.
			 */
			if (isroot || ((dp->v_vflag & VV_ROOT) != 0 &&
			    (cnp->cn_flags & NOCROSSMOUNT) != 0)) {
				ndp->ni_dvp = dp;
				ndp->ni_vp = dp;
				VREF(dp);
				goto nextname;
			}
			if ((dp->v_vflag & VV_ROOT) == 0)
				break;
			if (VN_IS_DOOMED(dp)) {	/* forced unmount */
				error = ENOENT;
				goto bad;
			}
			/*
			 * dp is the root of a mounted filesystem: step down
			 * to the vnode it covers and retry from there.
			 */
			tdp = dp;
			dp = dp->v_mount->mnt_vnodecovered;
			VREF(dp);
			vput(tdp);
			vn_lock(dp,
			    enforce_lkflags(dp->v_mount, cnp->cn_lkflags |
			    LK_RETRY));
			error = nameicap_check_dotdot(ndp, dp);
			if (error != 0) {
capdotdot:
#ifdef KTRACE
				if (KTRPOINT(curthread, KTR_CAPFAIL))
					ktrcapfail(CAPFAIL_LOOKUP, NULL, NULL);
#endif
				goto bad;
			}
		}
	}

	/*
	 * We now have a segment name to search for, and a directory to search.
	 */
unionlookup:
#ifdef MAC
	error = mac_vnode_check_lookup(cnp->cn_cred, dp, cnp);
	if (__predict_false(error))
		goto bad;
#endif
	ndp->ni_dvp = dp;
	ndp->ni_vp = NULL;
	ASSERT_VOP_LOCKED(dp, "lookup");
	/*
	 * If we have a shared lock we may need to upgrade the lock for the
	 * last operation.
	 */
	if ((cnp->cn_flags & LOCKPARENT) && (cnp->cn_flags & ISLASTCN) &&
	    dp != vp_crossmp && VOP_ISLOCKED(dp) == LK_SHARED)
		vn_lock(dp, LK_UPGRADE|LK_RETRY);
	if (VN_IS_DOOMED(dp)) {
		error = ENOENT;
		goto bad;
	}
	/*
	 * If we're looking up the last component and we need an exclusive
	 * lock, adjust our lkflags.
	 */
	if (needs_exclusive_leaf(dp->v_mount, cnp->cn_flags))
		cnp->cn_lkflags = LK_EXCLUSIVE;
	/*
	 * The enforce_lkflags() result is only in effect for the duration of
	 * the VOP_LOOKUP() call; the previous value is restored afterwards.
	 */
	lkflags_save = cnp->cn_lkflags;
	cnp->cn_lkflags = enforce_lkflags(dp->v_mount, cnp->cn_lkflags);
	error = VOP_LOOKUP(dp, &ndp->ni_vp, cnp);
	cnp->cn_lkflags = lkflags_save;
	if (error != 0) {
		KASSERT(ndp->ni_vp == NULL, ("leaf should be empty"));
		/*
		 * Name not found in the root of a union mount: fall through
		 * to the covered vnode and search there.
		 */
		if ((error == ENOENT) &&
		    (dp->v_vflag & VV_ROOT) && (dp->v_mount != NULL) &&
		    (dp->v_mount->mnt_flag & MNT_UNION)) {
			tdp = dp;
			dp = dp->v_mount->mnt_vnodecovered;
			VREF(dp);
			vput(tdp);
			vn_lock(dp,
			    enforce_lkflags(dp->v_mount, cnp->cn_lkflags |
			    LK_RETRY));
			nameicap_tracker_add(ndp, dp);
			goto unionlookup;
		}

		/*
		 * The filesystem asked for the component to be replayed.
		 * Present dp as the "result" (with its own reference) so the
		 * common code below runs, and let nextname rewind the path
		 * state and jump back to dirloop.
		 */
		if (error == ERELOOKUP) {
			vref(dp);
			ndp->ni_vp = dp;
			error = 0;
			relookup = 1;
			goto good;
		}

		if (error != EJUSTRETURN)
			goto bad;
		/*
		 * At this point, we know we're at the end of the
		 * pathname.  If creating / renaming, we can consider
		 * allowing the file or directory to be created / renamed,
		 * provided we're not on a read-only filesystem.
		 */
		if (rdonly) {
			error = EROFS;
			goto bad;
		}
		/* trailing slash only allowed for directories */
		if ((cnp->cn_flags & TRAILINGSLASH) &&
		    !(cnp->cn_flags & WILLBEDIR)) {
			error = ENOENT;
			goto bad;
		}
		if ((cnp->cn_flags & LOCKPARENT) == 0)
			VOP_UNLOCK(dp);
		/*
		 * We return with ni_vp NULL to indicate that the entry
		 * doesn't currently exist, leaving a pointer to the
		 * (possibly locked) directory vnode in ndp->ni_dvp.
		 */
		goto success;
	}

good:
	/* From here on dp is the vnode just found; the parent is ni_dvp. */
	dp = ndp->ni_vp;

	/*
	 * Check for symbolic link
	 */
	if ((dp->v_type == VLNK) &&
	    ((cnp->cn_flags & FOLLOW) || (cnp->cn_flags & TRAILINGSLASH) ||
	     *ndp->ni_next == '/')) {
		cnp->cn_flags |= ISSYMLINK;
		if (VN_IS_DOOMED(dp)) {
			/*
			 * We can't know whether the directory was mounted with
			 * NOSYMFOLLOW, so we can't follow safely.
			 */
			error = ENOENT;
			goto bad2;
		}
		if (dp->v_mount->mnt_flag & MNT_NOSYMFOLLOW) {
			error = EACCES;
			goto bad2;
		}
		/*
		 * Symlink code always expects an unlocked dvp.
		 */
		if (ndp->ni_dvp != ndp->ni_vp) {
			VOP_UNLOCK(ndp->ni_dvp);
			ni_dvp_unlocked = 1;
		}
		goto success;
	}

	if ((vn_irflag_read(dp) & VIRF_MOUNTPOINT) != 0 &&
	    (cnp->cn_flags & NOCROSSMOUNT) == 0) {
		error = vfs_lookup_cross_mount(ndp);
		if (error != 0)
			goto bad_unlocked;
		/*
		 * FALLTHROUGH to nextname
		 */
		dp = ndp->ni_vp;
	}

nextname:
	/*
	 * Not a symbolic link that we will follow.  Continue with the
	 * next component if there is any; otherwise, we're done.
	 */
	KASSERT((cnp->cn_flags & ISLASTCN) || *ndp->ni_next == '/',
	    ("lookup: invalid path state."));
	if (relookup) {
		/*
		 * ERELOOKUP replay: rewind the path cursor to where it was
		 * before this component, drop the parent, and start over.
		 */
		relookup = 0;
		ndp->ni_pathlen = prev_ni_pathlen;
		ndp->ni_next = prev_ni_next;
		if (ndp->ni_dvp != dp)
			vput(ndp->ni_dvp);
		else
			vrele(ndp->ni_dvp);
		goto dirloop;
	}
	if (cnp->cn_flags & ISDOTDOT) {
		error = nameicap_check_dotdot(ndp, ndp->ni_vp);
		if (error != 0) {
#ifdef KTRACE
			if (KTRPOINT(curthread, KTR_CAPFAIL))
				ktrcapfail(CAPFAIL_LOOKUP, NULL, NULL);
#endif
			goto bad2;
		}
	}
	if (*ndp->ni_next == '/') {
		/*
		 * More components follow: skip the run of slashes, release
		 * the parent (vrele only when it aliases dp, which must stay
		 * locked), and descend.
		 */
		cnp->cn_nameptr = ndp->ni_next;
		while (*cnp->cn_nameptr == '/') {
			cnp->cn_nameptr++;
			ndp->ni_pathlen--;
		}
		if (ndp->ni_dvp != dp)
			vput(ndp->ni_dvp);
		else
			vrele(ndp->ni_dvp);
		goto dirloop;
	}
	/*
	 * If we're processing a path with a trailing slash,
	 * check that the end result is a directory.
	 */
	if ((cnp->cn_flags & TRAILINGSLASH) && dp->v_type != VDIR) {
		error = ENOTDIR;
		goto bad2;
	}
	/*
	 * Disallow directory write attempts on read-only filesystems.
	 */
	if (rdonly &&
	    (cnp->cn_nameiop == DELETE || cnp->cn_nameiop == RENAME)) {
		error = EROFS;
		goto bad2;
	}
	/*
	 * Bring the parent into the state the caller asked for and record
	 * what was done in ni_dvp_unlocked for the bad2 cleanup.
	 */
	if (!wantparent) {
		ni_dvp_unlocked = 2;
		if (ndp->ni_dvp != dp)
			vput(ndp->ni_dvp);
		else
			vrele(ndp->ni_dvp);
	} else if ((cnp->cn_flags & LOCKPARENT) == 0 && ndp->ni_dvp != dp) {
		VOP_UNLOCK(ndp->ni_dvp);
		ni_dvp_unlocked = 1;
	}

	if (cnp->cn_flags & AUDITVNODE1)
		AUDIT_ARG_VNODE1(dp);
	else if (cnp->cn_flags & AUDITVNODE2)
		AUDIT_ARG_VNODE2(dp);

	if ((cnp->cn_flags & LOCKLEAF) == 0)
		VOP_UNLOCK(dp);
success:
	/*
	 * FIXME: for lookups which only cross a mount point to fetch the
	 * root vnode, ni_dvp will be set to vp_crossmp. This can be a problem
	 * if either WANTPARENT or LOCKPARENT is set.
	 */
	/*
	 * Because of shared lookup we may have the vnode shared locked, but
	 * the caller may want it to be exclusively locked.
	 */
	if (needs_exclusive_leaf(dp->v_mount, cnp->cn_flags) &&
	    VOP_ISLOCKED(dp) != LK_EXCLUSIVE) {
		vn_lock(dp, LK_UPGRADE | LK_RETRY);
		if (VN_IS_DOOMED(dp)) {
			error = ENOENT;
			goto bad2;
		}
	}
success_right_lock:
	if (ndp->ni_vp != NULL) {
		if ((cnp->cn_flags & ISDOTDOT) == 0)
			nameicap_tracker_add(ndp, ndp->ni_vp);
		/* FAILIFEXISTS applies only when the result is not a symlink. */
		if ((cnp->cn_flags & (FAILIFEXISTS | ISSYMLINK)) == FAILIFEXISTS)
			return (vfs_lookup_failifexists(ndp));
	}
	return (0);

	/*
	 * Error exits, from most to least state held:
	 *   bad2         - drop the parent according to ni_dvp_unlocked,
	 *                  then fall into bad;
	 *   bad          - unlock and release dp;
	 *   bad_unlocked - nothing left to release here.
	 */
bad2:
	if (ni_dvp_unlocked != 2) {
		if (dp != ndp->ni_dvp && !ni_dvp_unlocked)
			vput(ndp->ni_dvp);
		else
			vrele(ndp->ni_dvp);
	}
bad:
	vput(dp);
bad_unlocked:
	ndp->ni_vp = NULL;
	return (error);
}

/*
 * relookup - look up a single, final path name component again.
 *
 * dvp      - directory to search; it is locked exclusively here.
 * vpp      - out: the vnode found, or NULL if the entry does not exist
 *            (EJUSTRETURN case) or on error.
 * cnp      - component to look up; must have ISLASTCN and at least one of
 *            LOCKPARENT / WANTPARENT set (asserted).  ISDOTDOT panics.
 * refstart - when true, an extra reference is taken on dvp on the
 *            successful return paths of a non-empty name; passing it
 *            together with an empty name panics.
 *
 * Returns 0 on success, otherwise an errno value with *vpp set to NULL.
 * On success dvp is left locked only if LOCKPARENT is set (or it is the
 * same vnode as the result), and the result is left locked only if
 * LOCKLEAF is set.
 */
int
vfs_relookup(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp,
    bool refstart)
{
	struct vnode *dp = NULL;	/* the directory we are searching */
	int rdonly;			/* lookup read-only flag bit */
	int error = 0;

	KASSERT(cnp->cn_flags & ISLASTCN,
	    ("relookup: Not given last component."));
	/*
	 * Setup: break out flag bits into variables.
	 */
	KASSERT((cnp->cn_flags & (LOCKPARENT | WANTPARENT)) != 0,
	    ("relookup: parent not wanted"));
	rdonly = cnp->cn_flags & RDONLY;
	cnp->cn_flags &= ~ISSYMLINK;
	dp = dvp;
	cnp->cn_lkflags = LK_EXCLUSIVE;
	vn_lock(dp, LK_EXCLUSIVE | LK_RETRY);

	/*
	 * Search a new directory.
	 *
	 * See a comment in vfs_lookup for cnp->cn_nameptr.
	 *
	 * Check for "" which represents the root directory after slash
	 * removal.
	 */
	if (cnp->cn_nameptr[0] == '\0') {
		/*
		 * Support only LOOKUP for "/" because lookup()
		 * can't succeed for CREATE, DELETE and RENAME.
		 */
		KASSERT(cnp->cn_nameiop == LOOKUP, ("nameiop must be LOOKUP"));
		KASSERT(dp->v_type == VDIR, ("dp is not a directory"));

		if (!(cnp->cn_flags & LOCKLEAF))
			VOP_UNLOCK(dp);
		*vpp = dp;
		/* XXX This should probably move to the top of function. */
		if (refstart)
			panic("lookup: SAVESTART");
		return (0);
	}

	if (cnp->cn_flags & ISDOTDOT)
		panic ("relookup: lookup on dot-dot");

	/*
	 * We now have a segment name to search for, and a directory to search.
	 */
	if ((error = VOP_LOOKUP(dp, vpp, cnp)) != 0) {
		KASSERT(*vpp == NULL, ("leaf should be empty"));
		if (error != EJUSTRETURN)
			goto bad;
		/*
		 * If creating and at end of pathname, then can consider
		 * allowing file to be created.
		 */
		if (rdonly) {
			error = EROFS;
			goto bad;
		}
		/* ASSERT(dvp == ndp->ni_startdir) */
		if (refstart)
			VREF(dvp);
		if ((cnp->cn_flags & LOCKPARENT) == 0)
			VOP_UNLOCK(dp);
		/*
		 * We return with *vpp NULL to indicate that the entry
		 * doesn't currently exist, leaving the directory vnode dvp
		 * (possibly locked) with the caller.
		 */
		return (0);
	}

	/* From here on dp is the vnode found; the directory is dvp. */
	dp = *vpp;

	/*
	 * Disallow directory write attempts on read-only filesystems.
	 */
	if (rdonly &&
	    (cnp->cn_nameiop == DELETE || cnp->cn_nameiop == RENAME)) {
		/*
		 * Drop the directory here; the bad label only releases dp.
		 * When both are the same vnode, just drop the extra reference
		 * and let bad do the unlock.
		 */
		if (dvp == dp)
			vrele(dvp);
		else
			vput(dvp);
		error = EROFS;
		goto bad;
	}
	/*
	 * Set the parent lock/ref state to the requested state.
	 */
	if ((cnp->cn_flags & LOCKPARENT) == 0 && dvp != dp)
		VOP_UNLOCK(dvp);
	/*
	 * Check for symbolic link
	 */
	KASSERT(dp->v_type != VLNK || !(cnp->cn_flags & FOLLOW),
	    ("relookup: symlink found.\n"));

	/* ASSERT(dvp == ndp->ni_startdir) */
	if (refstart)
		VREF(dvp);

	if ((cnp->cn_flags & LOCKLEAF) == 0)
		VOP_UNLOCK(dp);
	return (0);
bad:
	vput(dp);
	*vpp = NULL;
	return (error);
}

#ifdef INVARIANTS
/*
 * Validate the final state of ndp after the lookup.
 *
 * Panics if the component name has no pathname buffer.  The line argument
 * is the source line of the NDVALIDATE() use (supplied via __LINE__) and is
 * only used in the panic message.
 */
static void
NDVALIDATE_impl(struct nameidata *ndp, int line)
{
	struct componentname *cnp;

	cnp = &ndp->ni_cnd;
	if (cnp->cn_pnbuf == NULL)
		panic("%s: got no buf! called from %d", __func__, line);
}

#endif