/*-
 * SPDX-License-Identifier: BSD-3-Clause
 *
 * Copyright (c) 1994 Jan-Simon Pendry
 * Copyright (c) 1994
 *	The Regents of the University of California.  All rights reserved.
 * Copyright (c) 2005, 2006, 2012 Masanori Ozawa <ozawa@ongs.co.jp>, ONGS Inc.
 * Copyright (c) 2006, 2012 Daichi Goto <daichi@freebsd.org>
 *
 * This code is derived from software contributed to Berkeley by
 * Jan-Simon Pendry.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/ktr.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/malloc.h>
#include <sys/mount.h>
#include <sys/namei.h>
#include <sys/proc.h>
#include <sys/vnode.h>
#include <sys/dirent.h>
#include <sys/fcntl.h>
#include <sys/filedesc.h>
#include <sys/stat.h>
#include <sys/sysctl.h>
#include <sys/taskqueue.h>
#include <sys/resourcevar.h>

#include <machine/atomic.h>

#include <security/mac/mac_framework.h>

#include <vm/uma.h>

#include <fs/unionfs/union.h>

#define NUNIONFSNODECACHE 16
#define UNIONFSHASHMASK (NUNIONFSNODECACHE - 1)

static MALLOC_DEFINE(M_UNIONFSHASH, "UNIONFS hash", "UNIONFS hash table");
MALLOC_DEFINE(M_UNIONFSNODE, "UNIONFS node", "UNIONFS vnode private part");
MALLOC_DEFINE(M_UNIONFSPATH, "UNIONFS path", "UNIONFS path private part");

static struct task unionfs_deferred_rele_task;
static struct mtx unionfs_deferred_rele_lock;
static STAILQ_HEAD(, unionfs_node) unionfs_deferred_rele_list =
    STAILQ_HEAD_INITIALIZER(unionfs_deferred_rele_list);
static TASKQUEUE_DEFINE_THREAD(unionfs_rele);

unsigned int unionfs_ndeferred = 0;
SYSCTL_UINT(_vfs, OID_AUTO, unionfs_ndeferred, CTLFLAG_RD,
    &unionfs_ndeferred, 0, "unionfs deferred vnode release");

static void unionfs_deferred_rele(void *, int);

/*
 * Initialize
 */
int
unionfs_init(struct vfsconf *vfsp)
{
	UNIONFSDEBUG("unionfs_init\n");	/* printed during system boot */
	TASK_INIT(&unionfs_deferred_rele_task, 0, unionfs_deferred_rele, NULL);
	mtx_init(&unionfs_deferred_rele_lock, "uniondefr", NULL, MTX_DEF);
	return (0);
}

/*
 * Uninitialize
 */
int
unionfs_uninit(struct vfsconf *vfsp)
{
	taskqueue_quiesce(taskqueue_unionfs_rele);
	taskqueue_free(taskqueue_unionfs_rele);
	mtx_destroy(&unionfs_deferred_rele_lock);
	return (0);
}

static void
unionfs_deferred_rele(void *arg __unused, int pending __unused)
{
	STAILQ_HEAD(, unionfs_node) local_rele_list;
	struct unionfs_node *unp, *tunp;
	unsigned int ndeferred;

	ndeferred = 0;
	STAILQ_INIT(&local_rele_list);
	mtx_lock(&unionfs_deferred_rele_lock);
	STAILQ_CONCAT(&local_rele_list, &unionfs_deferred_rele_list);
	mtx_unlock(&unionfs_deferred_rele_lock);
	STAILQ_FOREACH_SAFE(unp, &local_rele_list, un_rele, tunp) {
		++ndeferred;
		MPASS(unp->un_dvp != NULL);
		vrele(unp->un_dvp);
		free(unp, M_UNIONFSNODE);
	}

	/* We expect this function to be single-threaded, thus no atomic */
	unionfs_ndeferred += ndeferred;
}

static struct unionfs_node_hashhead *
unionfs_get_hashhead(struct vnode *dvp, struct vnode *lookup)
{
	struct unionfs_node *unp;

	unp = VTOUNIONFS(dvp);

	return (&(unp->un_hashtbl[vfs_hash_index(lookup) & UNIONFSHASHMASK]));
}

/*
 * Attempt to lookup a cached unionfs vnode by upper/lower vp
 * from dvp, with dvp's interlock held.
 */
static struct vnode *
unionfs_get_cached_vnode_locked(struct vnode *lookup, struct vnode *dvp)
{
	struct unionfs_node *unp;
	struct unionfs_node_hashhead *hd;
	struct vnode *vp;

	hd = unionfs_get_hashhead(dvp, lookup);

	LIST_FOREACH(unp, hd, un_hash) {
		if (unp->un_uppervp == lookup ||
		    unp->un_lowervp == lookup) {
			vp = UNIONFSTOV(unp);
			VI_LOCK_FLAGS(vp, MTX_DUPOK);
			vp->v_iflag &= ~VI_OWEINACT;
			if (VN_IS_DOOMED(vp) ||
			    ((vp->v_iflag & VI_DOINGINACT) != 0)) {
				VI_UNLOCK(vp);
				vp = NULLVP;
			} else {
				vrefl(vp);
				VI_UNLOCK(vp);
			}
			return (vp);
		}
	}

	return (NULLVP);
}

/*
 * Get the cached vnode.
 */
static struct vnode *
unionfs_get_cached_vnode(struct vnode *uvp, struct vnode *lvp,
    struct vnode *dvp)
{
	struct vnode *vp;

	vp = NULLVP;
	VI_LOCK(dvp);
	if (uvp != NULLVP)
		vp = unionfs_get_cached_vnode_locked(uvp, dvp);
	else if (lvp != NULLVP)
		vp = unionfs_get_cached_vnode_locked(lvp, dvp);
	VI_UNLOCK(dvp);

	return (vp);
}

/*
 * Add the new vnode into cache.
 */
static struct vnode *
unionfs_ins_cached_vnode(struct unionfs_node *uncp,
    struct vnode *dvp)
{
	struct unionfs_node_hashhead *hd;
	struct vnode *vp;

	vp = NULLVP;
	VI_LOCK(dvp);
	if (uncp->un_uppervp != NULLVP) {
		ASSERT_VOP_ELOCKED(uncp->un_uppervp, __func__);
		KASSERT(uncp->un_uppervp->v_type == VDIR,
		    ("%s: v_type != VDIR", __func__));
		vp = unionfs_get_cached_vnode_locked(uncp->un_uppervp, dvp);
	} else if (uncp->un_lowervp != NULLVP) {
		ASSERT_VOP_ELOCKED(uncp->un_lowervp, __func__);
		KASSERT(uncp->un_lowervp->v_type == VDIR,
		    ("%s: v_type != VDIR", __func__));
		vp = unionfs_get_cached_vnode_locked(uncp->un_lowervp, dvp);
	}
	if (vp == NULLVP) {
		hd = unionfs_get_hashhead(dvp, (uncp->un_uppervp != NULLVP ?
		    uncp->un_uppervp : uncp->un_lowervp));
		LIST_INSERT_HEAD(hd, uncp, un_hash);
	}
	VI_UNLOCK(dvp);

	return (vp);
}

/*
 * Remove the vnode.
 */
static void
unionfs_rem_cached_vnode(struct unionfs_node *unp, struct vnode *dvp)
{
	KASSERT(unp != NULL, ("%s: null node", __func__));
	KASSERT(dvp != NULLVP,
	    ("%s: null parent vnode", __func__));

	VI_LOCK(dvp);
	if (unp->un_hash.le_prev != NULL) {
		LIST_REMOVE(unp, un_hash);
		unp->un_hash.le_next = NULL;
		unp->un_hash.le_prev = NULL;
	}
	VI_UNLOCK(dvp);
}

/*
 * Common cleanup handling for unionfs_nodeget
 * Upper, lower, and parent directory vnodes are expected to be referenced by
 * the caller.  Upper and lower vnodes, if non-NULL, are also expected to be
 * exclusively locked by the caller.
 * This function will return with the caller's locks and references undone.
 */
static void
unionfs_nodeget_cleanup(struct vnode *vp, struct unionfs_node *unp)
{

	/*
	 * Lock and reset the default vnode lock; vgone() expects a locked
	 * vnode, and we're going to reset the vnode ops.
	 */
	lockmgr(&vp->v_lock, LK_EXCLUSIVE, NULL);

	/*
	 * Clear out private data and reset the vnode ops to avoid use of
	 * unionfs vnode ops on a partially constructed vnode.
	 */
	VI_LOCK(vp);
	vp->v_data = NULL;
	vp->v_vnlock = &vp->v_lock;
	vp->v_op = &dead_vnodeops;
	VI_UNLOCK(vp);
	vgone(vp);
	vput(vp);

	if (unp->un_dvp != NULLVP)
		vrele(unp->un_dvp);
	if (unp->un_uppervp != NULLVP) {
		vput(unp->un_uppervp);
		if (unp->un_lowervp != NULLVP)
			vrele(unp->un_lowervp);
	} else if (unp->un_lowervp != NULLVP)
		vput(unp->un_lowervp);
	if (unp->un_hashtbl != NULL)
		hashdestroy(unp->un_hashtbl, M_UNIONFSHASH, UNIONFSHASHMASK);
	free(unp->un_path, M_UNIONFSPATH);
	free(unp, M_UNIONFSNODE);
}

/*
 * Make a new unionfs node or get an existing one.
 *
 * uppervp and lowervp should be unlocked, because locking the new unionfs
 * vnode effectively locks uppervp or lowervp as well.  To prevent deadlock,
 * do not hold more than one of these vnode locks at the same time.
 */
int
unionfs_nodeget(struct mount *mp, struct vnode *uppervp,
    struct vnode *lowervp, struct vnode *dvp, struct vnode **vpp,
    struct componentname *cnp)
{
	char *path;
	struct unionfs_mount *ump;
	struct unionfs_node *unp;
	struct vnode *vp;
	u_long hashmask;
	int error;
	int lkflags;
	__enum_uint8(vtype) vt;

	error = 0;
	ump = MOUNTTOUNIONFSMOUNT(mp);
	lkflags = (cnp ? cnp->cn_lkflags : 0);
	path = (cnp ? cnp->cn_nameptr : NULL);
	*vpp = NULLVP;

	if (uppervp == NULLVP && lowervp == NULLVP)
		panic("%s: upper and lower are both null", __func__);

	vt = (uppervp != NULLVP ? uppervp->v_type : lowervp->v_type);

	/* If it has no ISLASTCN flag, path check is skipped. */
	if (cnp && !(cnp->cn_flags & ISLASTCN))
		path = NULL;

	/* check the cache */
	if (dvp != NULLVP && vt == VDIR) {
		vp = unionfs_get_cached_vnode(uppervp, lowervp, dvp);
		if (vp != NULLVP) {
			*vpp = vp;
			if (lkflags != 0)
				vn_lock(*vpp, lkflags | LK_RETRY);
			return (0);
		}
	}

	unp = malloc(sizeof(struct unionfs_node),
	    M_UNIONFSNODE, M_WAITOK | M_ZERO);

	error = getnewvnode("unionfs", mp, &unionfs_vnodeops, &vp);
	if (error != 0) {
		free(unp, M_UNIONFSNODE);
		return (error);
	}
	if (dvp != NULLVP)
		vref(dvp);
	if (uppervp != NULLVP)
		vref(uppervp);
	if (lowervp != NULLVP)
		vref(lowervp);

	if (vt == VDIR) {
		unp->un_hashtbl = hashinit(NUNIONFSNODECACHE, M_UNIONFSHASH,
		    &hashmask);
		KASSERT(hashmask == UNIONFSHASHMASK,
		    ("unexpected unionfs hash mask 0x%lx", hashmask));
	}

	unp->un_vnode = vp;
	unp->un_uppervp = uppervp;
	unp->un_lowervp = lowervp;
	unp->un_dvp = dvp;
	if (uppervp != NULLVP)
		vp->v_vnlock = uppervp->v_vnlock;
	else
		vp->v_vnlock = lowervp->v_vnlock;

	if (path != NULL) {
		unp->un_path = malloc(cnp->cn_namelen + 1,
		    M_UNIONFSPATH, M_WAITOK | M_ZERO);
		bcopy(cnp->cn_nameptr, unp->un_path, cnp->cn_namelen);
		unp->un_path[cnp->cn_namelen] = '\0';
		unp->un_pathlen = cnp->cn_namelen;
	}
	vp->v_type = vt;
	vp->v_data = unp;

	/*
	 * TODO: This is an imperfect check, as there's no guarantee that
	 * the underlying filesystems will always return vnode pointers
	 * for the root inodes that match our cached values.  To reduce
	 * the likelihood of failure, for example in the case where either
	 * vnode has been forcibly doomed, we check both pointers and set
	 * VV_ROOT if either matches.
	 */
	if (ump->um_uppervp == uppervp || ump->um_lowervp == lowervp)
		vp->v_vflag |= VV_ROOT;
	KASSERT(dvp != NULL || (vp->v_vflag & VV_ROOT) != 0,
	    ("%s: NULL dvp for non-root vp %p", __func__, vp));

	/*
	 * NOTE: There is still a possibility for cross-filesystem locking here.
	 * If dvp has an upper FS component and is locked, while the new vnode
	 * created here only has a lower-layer FS component, then we will end
	 * up taking a lower-FS lock while holding an upper-FS lock.
	 * That situation could be dealt with here using vn_lock_pair().
	 * However, that would only address one instance out of many in which
	 * a child vnode lock is taken while holding a lock on its parent
	 * directory.  This is done in many places in common VFS code, as well
	 * as a few places within unionfs (which could lead to the same cross-FS
	 * locking issue if, for example, the upper FS is another nested unionfs
	 * instance).  Additionally, it is unclear under what circumstances this
	 * specific lock sequence (a directory on one FS followed by a child of
	 * its 'peer' directory on another FS) would present the practical
	 * possibility of deadlock due to some other agent on the system
	 * attempting to lock those two specific vnodes in the opposite order.
	 */
	if (uppervp != NULLVP)
		vn_lock(uppervp, LK_EXCLUSIVE | LK_RETRY);
	else
		vn_lock(lowervp, LK_EXCLUSIVE | LK_RETRY);
	error = insmntque1(vp, mp);
	if (error != 0) {
		unionfs_nodeget_cleanup(vp, unp);
		return (error);
	}
	/*
	 * lowervp and uppervp should only be doomed by a forced unmount of
	 * their respective filesystems, but that can only happen if the
	 * unionfs instance is first unmounted.  We also effectively hold the
	 * lock on the new unionfs vnode at this point.  Therefore, if a
	 * unionfs umount has not yet reached the point at which the above
	 * insmntque1() would fail, then its vflush() call will end up
	 * blocked on our vnode lock, effectively also preventing unmount
	 * of the underlying filesystems.
	 */
	VNASSERT(lowervp == NULLVP || !VN_IS_DOOMED(lowervp), vp,
	    ("%s: doomed lowervp %p", __func__, lowervp));
	VNASSERT(uppervp == NULLVP || !VN_IS_DOOMED(uppervp), vp,
	    ("%s: doomed uppervp %p", __func__, uppervp));

	vn_set_state(vp, VSTATE_CONSTRUCTED);

	if (dvp != NULLVP && vt == VDIR)
		*vpp = unionfs_ins_cached_vnode(unp, dvp);
	if (*vpp != NULLVP) {
		unionfs_nodeget_cleanup(vp, unp);
		if (lkflags != 0)
			vn_lock(*vpp, lkflags | LK_RETRY);
		return (0);
	} else
		*vpp = vp;

	if ((lkflags & LK_SHARED) != 0)
		vn_lock(vp, LK_DOWNGRADE);
	else if ((lkflags & LK_EXCLUSIVE) == 0)
		VOP_UNLOCK(vp);

	return (0);
}

/*
 * Clean up the unionfs node.
 */
void
unionfs_noderem(struct vnode *vp)
{
	struct unionfs_node *unp, *unp_t1, *unp_t2;
	struct unionfs_node_hashhead *hd;
	struct unionfs_node_status *unsp, *unsp_tmp;
	struct vnode *lvp;
	struct vnode *uvp;
	struct vnode *dvp;
	int count;
	int writerefs;
	bool unlock_lvp;

	/*
	 * The root vnode lock may be recursed during unmount, because
	 * it may share the same lock as the unionfs mount's covered vnode,
	 * which is locked across VFS_UNMOUNT().  This lock will then be
	 * recursively taken during the vflush() issued by unionfs_unmount().
	 * But we still only need to lock the unionfs lock once, because only
	 * one of those lock operations was taken against a unionfs vnode and
	 * will be undone against a unionfs vnode.
	 */
	KASSERT(vp->v_vnlock->lk_recurse == 0 || (vp->v_vflag & VV_ROOT) != 0,
	    ("%s: vnode %p locked recursively", __func__, vp));

	unp = VTOUNIONFS(vp);
	VNASSERT(unp != NULL, vp, ("%s: already reclaimed", __func__));
	lvp = unp->un_lowervp;
	uvp = unp->un_uppervp;
	dvp = unp->un_dvp;
	unlock_lvp = (uvp == NULLVP);

	/*
	 * Lock the lower vnode in addition to the upper vnode lock in order
	 * to synchronize against any unionfs_lock() operation which may still
	 * hold the lower vnode lock.  We do not need to do this for the root
	 * vnode, as the root vnode should always have both upper and lower
	 * base vnodes for its entire lifecycle, so unionfs_lock() should
	 * never attempt to lock its lower vnode in the first place.
	 * Moreover, during unmount of a non-"below" unionfs mount, the lower
	 * root vnode will already be locked as it is the covered vnode.
	 */
	if (uvp != NULLVP && lvp != NULLVP && (vp->v_vflag & VV_ROOT) == 0) {
		vn_lock_pair(uvp, true, LK_EXCLUSIVE, lvp, false, LK_EXCLUSIVE);
		unlock_lvp = true;
	}

	if (lockmgr(&vp->v_lock, LK_EXCLUSIVE | LK_NOWAIT, NULL) != 0)
		panic("%s: failed to acquire lock for vnode lock", __func__);
	/*
	 * Use the interlock to protect the clearing of v_data to
	 * prevent faults in unionfs_lock().
	 */
	VI_LOCK(vp);
	unp->un_lowervp = unp->un_uppervp = NULLVP;
	vp->v_vnlock = &(vp->v_lock);
	vp->v_data = NULL;
	vp->v_object = NULL;
	if (unp->un_hashtbl != NULL) {
		/*
		 * Clear out any cached child vnodes.  This should only
		 * be necessary during forced unmount, when the vnode may
		 * be reclaimed with a non-zero use count.  Otherwise the
		 * reference held by each child should prevent reclamation.
		 */
		for (count = 0; count <= UNIONFSHASHMASK; count++) {
			hd = unp->un_hashtbl + count;
			LIST_FOREACH_SAFE(unp_t1, hd, un_hash, unp_t2) {
				LIST_REMOVE(unp_t1, un_hash);
				unp_t1->un_hash.le_next = NULL;
				unp_t1->un_hash.le_prev = NULL;
			}
		}
	}
	VI_UNLOCK(vp);

	writerefs = atomic_load_int(&vp->v_writecount);
	VNASSERT(writerefs >= 0, vp,
	    ("%s: write count %d, unexpected text ref", __func__, writerefs));
	/*
	 * If we were opened for write, we leased the write reference
	 * to the upper vnode.  If this is a reclamation due to the
	 * forced unmount, undo the reference now.
	 */
	if (writerefs > 0) {
		VNASSERT(uvp != NULL, vp,
		    ("%s: write reference without upper vnode", __func__));
		VOP_ADD_WRITECOUNT(uvp, -writerefs);
	}
	if (uvp != NULLVP)
		vput(uvp);
	if (unlock_lvp)
		vput(lvp);
	else if (lvp != NULLVP)
		vrele(lvp);

	if (dvp != NULLVP)
		unionfs_rem_cached_vnode(unp, dvp);

	if (unp->un_path != NULL) {
		free(unp->un_path, M_UNIONFSPATH);
		unp->un_path = NULL;
		unp->un_pathlen = 0;
	}

	if (unp->un_hashtbl != NULL) {
		hashdestroy(unp->un_hashtbl, M_UNIONFSHASH, UNIONFSHASHMASK);
	}

	LIST_FOREACH_SAFE(unsp, &(unp->un_unshead), uns_list, unsp_tmp) {
		LIST_REMOVE(unsp, uns_list);
		free(unsp, M_TEMP);
	}
	if (dvp != NULLVP) {
		mtx_lock(&unionfs_deferred_rele_lock);
		STAILQ_INSERT_TAIL(&unionfs_deferred_rele_list, unp, un_rele);
		mtx_unlock(&unionfs_deferred_rele_lock);
		taskqueue_enqueue(taskqueue_unionfs_rele,
		    &unionfs_deferred_rele_task);
	} else
		free(unp, M_UNIONFSNODE);
}

/*
 * Find the unionfs node status object for the vnode corresponding to unp,
 * for the process that owns td.  Return NULL if no such object exists.
 */
struct unionfs_node_status *
unionfs_find_node_status(struct unionfs_node *unp, struct thread *td)
{
	struct unionfs_node_status *unsp;
	pid_t pid;

	pid = td->td_proc->p_pid;

	ASSERT_VOP_ELOCKED(UNIONFSTOV(unp), __func__);

	LIST_FOREACH(unsp, &(unp->un_unshead), uns_list) {
		if (unsp->uns_pid == pid) {
			return (unsp);
		}
	}

	return (NULL);
}

/*
 * Get the unionfs node status object for the vnode corresponding to unp,
 * for the process that owns td.  Allocate a new status object if one
 * does not already exist.
 */
void
unionfs_get_node_status(struct unionfs_node *unp, struct thread *td,
    struct unionfs_node_status **unspp)
{
	struct unionfs_node_status *unsp;
	pid_t pid;

	pid = td->td_proc->p_pid;

	KASSERT(NULL != unspp, ("%s: NULL status", __func__));
	unsp = unionfs_find_node_status(unp, td);
	if (unsp == NULL) {
		/* create a new unionfs node status */
		unsp = malloc(sizeof(struct unionfs_node_status),
		    M_TEMP, M_WAITOK | M_ZERO);

		unsp->uns_pid = pid;
		LIST_INSERT_HEAD(&(unp->un_unshead), unsp, uns_list);
	}

	*unspp = unsp;
}

/*
 * Remove the unionfs node status, if possible.
 * The vnode must be exclusively locked.
 */
void
unionfs_tryrem_node_status(struct unionfs_node *unp,
    struct unionfs_node_status *unsp)
{
	KASSERT(NULL != unsp, ("%s: NULL status", __func__));
	ASSERT_VOP_ELOCKED(UNIONFSTOV(unp), __func__);

	if (0 < unsp->uns_lower_opencnt || 0 < unsp->uns_upper_opencnt)
		return;

	LIST_REMOVE(unsp, uns_list);
	free(unsp, M_TEMP);
}

/*
 * Create upper node attr.
 */
void
unionfs_create_uppervattr_core(struct unionfs_mount *ump, struct vattr *lva,
    struct vattr *uva, struct thread *td)
{
	VATTR_NULL(uva);
	uva->va_type = lva->va_type;
	uva->va_atime = lva->va_atime;
	uva->va_mtime = lva->va_mtime;
	uva->va_ctime = lva->va_ctime;

	switch (ump->um_copymode) {
	case UNIONFS_TRANSPARENT:
		uva->va_mode = lva->va_mode;
		uva->va_uid = lva->va_uid;
		uva->va_gid = lva->va_gid;
		break;
	case UNIONFS_MASQUERADE:
		if (ump->um_uid == lva->va_uid) {
			uva->va_mode = lva->va_mode & 077077;
			uva->va_mode |= (lva->va_type == VDIR ?
			    ump->um_udir : ump->um_ufile) & 0700;
			uva->va_uid = lva->va_uid;
			uva->va_gid = lva->va_gid;
		} else {
			uva->va_mode = (lva->va_type == VDIR ?
			    ump->um_udir : ump->um_ufile);
			uva->va_uid = ump->um_uid;
			uva->va_gid = ump->um_gid;
		}
		break;
	default:		/* UNIONFS_TRADITIONAL */
		uva->va_mode = 0777 & ~td->td_proc->p_pd->pd_cmask;
		uva->va_uid = ump->um_uid;
		uva->va_gid = ump->um_gid;
		break;
	}
}

/*
 * Create upper node attr.
 */
int
unionfs_create_uppervattr(struct unionfs_mount *ump, struct vnode *lvp,
    struct vattr *uva, struct ucred *cred, struct thread *td)
{
	struct vattr lva;
	int error;

	if ((error = VOP_GETATTR(lvp, &lva, cred)))
		return (error);

	unionfs_create_uppervattr_core(ump, &lva, uva, td);

	return (error);
}

/*
 * relookup
 *
 * dvp should be locked on entry and will be locked on return.
 *
 * If an error is returned, *vpp will be invalid, otherwise it will hold a
 * locked, referenced vnode.  If *vpp == dvp then remember that only one
 * LK_EXCLUSIVE lock is held.
 */
int
unionfs_relookup(struct vnode *dvp, struct vnode **vpp,
    struct componentname *cnp, struct componentname *cn, struct thread *td,
    char *path, int pathlen, u_long nameiop)
{
	int error;
	bool refstart;

	cn->cn_namelen = pathlen;
	cn->cn_pnbuf = path;
	cn->cn_nameiop = nameiop;
	cn->cn_flags = (LOCKPARENT | LOCKLEAF | ISLASTCN);
	cn->cn_lkflags = LK_EXCLUSIVE;
	cn->cn_cred = cnp->cn_cred;
	cn->cn_nameptr = cn->cn_pnbuf;

	refstart = false;
	if (nameiop == DELETE) {
		cn->cn_flags |= (cnp->cn_flags & DOWHITEOUT);
	} else if (nameiop == RENAME) {
		refstart = true;
	} else if (nameiop == CREATE) {
		cn->cn_flags |= NOCACHE;
	}

	vref(dvp);
	VOP_UNLOCK(dvp);

	if ((error = vfs_relookup(dvp, vpp, cn, refstart))) {
		vn_lock(dvp, LK_EXCLUSIVE | LK_RETRY);
	} else
		vrele(dvp);

	KASSERT(cn->cn_pnbuf == path, ("%s: cn_pnbuf changed", __func__));

	return (error);
}

/*
 * Update the unionfs_node.
 *
 * uvp is the new, locked upper vnode.  The unionfs vnode's lock will be
 * switched to uvp's lock, and the lower vnode's lock will be released.
 */
static void
unionfs_node_update(struct unionfs_node *unp, struct vnode *uvp,
    struct thread *td)
{
	struct unionfs_node_hashhead *hd;
	struct vnode *vp;
	struct vnode *lvp;
	struct vnode *dvp;
	unsigned count, lockrec;

	vp = UNIONFSTOV(unp);
	lvp = unp->un_lowervp;
	ASSERT_VOP_ELOCKED(lvp, __func__);
	ASSERT_VOP_ELOCKED(uvp, __func__);
	dvp = unp->un_dvp;

	VNASSERT(vp->v_writecount == 0, vp,
	    ("%s: non-zero writecount", __func__));
	/*
	 * Update the upper vnode's lock state to match the lower vnode,
	 * and then switch the unionfs vnode's lock to the upper vnode.
	 */
	lockrec = lvp->v_vnlock->lk_recurse;
	for (count = 0; count < lockrec; count++)
		vn_lock(uvp, LK_EXCLUSIVE | LK_CANRECURSE | LK_RETRY);
	VI_LOCK(vp);
	unp->un_uppervp = uvp;
	vp->v_vnlock = uvp->v_vnlock;
	VI_UNLOCK(vp);

	for (count = 0; count < lockrec + 1; count++)
		VOP_UNLOCK(lvp);
	/*
	 * Re-cache the unionfs vnode against the upper vnode
	 */
	if (dvp != NULLVP && vp->v_type == VDIR) {
		VI_LOCK(dvp);
		if (unp->un_hash.le_prev != NULL) {
			LIST_REMOVE(unp, un_hash);
			hd = unionfs_get_hashhead(dvp, uvp);
			LIST_INSERT_HEAD(hd, unp, un_hash);
		}
		VI_UNLOCK(unp->un_dvp);
	}
}

/*
 * Mark a unionfs operation as being in progress, sleeping if the
 * same operation is already in progress.
 * This is useful, for example, during copy-up operations in which
 * we may drop the target vnode lock, but we want to avoid the
 * possibility of a concurrent copy-up on the same vnode triggering
 * a spurious failure.
 */
int
unionfs_set_in_progress_flag(struct vnode *vp, unsigned int flag)
{
	struct unionfs_node *unp;
	int error;

	error = 0;
	ASSERT_VOP_ELOCKED(vp, __func__);
	VI_LOCK(vp);
	unp = VTOUNIONFS(vp);
	while (error == 0 && (unp->un_flag & flag) != 0) {
		VOP_UNLOCK(vp);
		error = msleep(vp, VI_MTX(vp), PCATCH | PDROP, "unioncp", 0);
		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
		VI_LOCK(vp);
		if (error == 0) {
			/*
			 * If we waited on a concurrent copy-up and that
			 * copy-up was successful, return a non-fatal
			 * indication that the desired operation is already
			 * complete.  If we waited on a concurrent lookup,
			 * return ERELOOKUP to indicate the VFS cache should
			 * be re-queried to avoid creating a duplicate unionfs
			 * vnode.
			 */
			unp = VTOUNIONFS(vp);
			if (unp == NULL)
				error = ENOENT;
			else if (flag == UNIONFS_COPY_IN_PROGRESS &&
			    unp->un_uppervp != NULLVP)
				error = EJUSTRETURN;
			else if (flag == UNIONFS_LOOKUP_IN_PROGRESS)
				error = ERELOOKUP;
		}
	}
	if (error == 0)
		unp->un_flag |= flag;
	VI_UNLOCK(vp);

	return (error);
}

void
unionfs_clear_in_progress_flag(struct vnode *vp, unsigned int flag)
{
	struct unionfs_node *unp;

	ASSERT_VOP_ELOCKED(vp, __func__);
	unp = VTOUNIONFS(vp);
	VI_LOCK(vp);
	if (unp != NULL) {
		VNASSERT((unp->un_flag & flag) != 0, vp,
		    ("%s: copy not in progress", __func__));
		unp->un_flag &= ~flag;
	}
	wakeup(vp);
	VI_UNLOCK(vp);
}

/*
 * Create a new shadow dir.
 *
 * dvp and vp are unionfs vnodes representing a parent directory and
 * child file, should be locked on entry, and will be locked on return.
 *
 * If no error is returned, unp will be updated.
 */
int
unionfs_mkshadowdir(struct vnode *dvp, struct vnode *vp,
    struct componentname *cnp, struct thread *td)
{
	struct vnode *lvp;
	struct vnode *uvp;
	struct vnode *udvp;
	struct vattr va;
	struct vattr lva;
	struct nameidata nd;
	struct mount *mp;
	struct ucred *cred;
	struct ucred *credbk;
	struct uidinfo *rootinfo;
	struct unionfs_mount *ump;
	struct unionfs_node *dunp;
	struct unionfs_node *unp;
	int error;

	ASSERT_VOP_ELOCKED(dvp, __func__);
	ASSERT_VOP_ELOCKED(vp, __func__);
	ump = MOUNTTOUNIONFSMOUNT(vp->v_mount);
	unp = VTOUNIONFS(vp);
	if (unp->un_uppervp != NULLVP)
		return (EEXIST);
	dunp = VTOUNIONFS(dvp);
	udvp = dunp->un_uppervp;

	error = unionfs_set_in_progress_flag(vp, UNIONFS_COPY_IN_PROGRESS);
	if (error == EJUSTRETURN)
		return (0);
	else if (error != 0)
		return (error);

	lvp = unp->un_lowervp;
	uvp = NULLVP;
	credbk = cnp->cn_cred;

	/* Authority change to root */
	rootinfo = uifind((uid_t)0);
	cred = crdup(cnp->cn_cred);
	/*
	 * The calls to chgproccnt() are needed to compensate for change_ruid()
	 * calling chgproccnt().
	 */
	chgproccnt(cred->cr_ruidinfo, 1, 0);
	change_euid(cred, rootinfo);
	change_ruid(cred, rootinfo);
	change_svuid(cred, (uid_t)0);
	uifree(rootinfo);
	cnp->cn_cred = cred;

	memset(&nd.ni_cnd, 0, sizeof(struct componentname));
	NDPREINIT(&nd);

	if ((error = VOP_GETATTR(lvp, &lva, cnp->cn_cred)))
		goto unionfs_mkshadowdir_finish;

	vref(udvp);
	VOP_UNLOCK(vp);
	if ((error = unionfs_relookup(udvp, &uvp, cnp, &nd.ni_cnd, td,
	    cnp->cn_nameptr, cnp->cn_namelen, CREATE))) {
		/*
		 * When handling error cases here, we drop udvp's lock and
		 * then jump to exit code that relocks dvp, which in most
		 * cases will effectively relock udvp.  However, this is
		 * not guaranteed to be the case, as various calls made
		 * here (such as unionfs_relookup() above and VOP_MKDIR()
		 * below) may unlock and then relock udvp, allowing dvp to
		 * be reclaimed in the meantime.  In such a situation dvp
		 * will no longer share its lock with udvp.  Since
		 * performance isn't a concern for these error cases, it
		 * makes more sense to reuse the common code that locks
		 * dvp on exit than to explicitly check for reclamation
		 * of dvp.
		 */
		vput(udvp);
		goto unionfs_mkshadowdir_relock;
	}
	if (uvp != NULLVP) {
		if (udvp == uvp)
			vrele(uvp);
		else
			vput(uvp);

		error = EEXIST;
		vput(udvp);
		goto unionfs_mkshadowdir_relock;
	}

	if ((error = vn_start_write(udvp, &mp, V_WAIT | V_PCATCH))) {
		vput(udvp);
		goto unionfs_mkshadowdir_relock;
	}
	unionfs_create_uppervattr_core(ump, &lva, &va, td);

	/*
	 * Temporarily NUL-terminate the current pathname component.
	 * This function may be called during lookup operations in which
	 * the current pathname component is not the leaf, meaning that
	 * the NUL terminator is some distance beyond the end of the current
	 * component.  This *should* be fine, as cn_namelen will still
	 * correctly indicate the length of only the current component,
	 * but ZFS in particular does not respect cn_namelen in its VOP_MKDIR
	 * implementation.
	 * Note that this assumes nd.ni_cnd.cn_pnbuf was allocated by
	 * something like a local namei() operation and the temporary
	 * NUL-termination will not have an effect on other threads.
	 */
	char *pathend = &nd.ni_cnd.cn_nameptr[nd.ni_cnd.cn_namelen];
	char pathterm = *pathend;
	*pathend = '\0';
	error = VOP_MKDIR(udvp, &uvp, &nd.ni_cnd, &va);
	*pathend = pathterm;
	if (error != 0) {
		/*
		 * See the comment after unionfs_relookup() above for an
		 * explanation of why we unlock udvp here only to relock
		 * dvp on exit.
		 */
		vput(udvp);
		vn_finished_write(mp);
		goto unionfs_mkshadowdir_relock;
	}

	/*
	 * XXX The bug which cannot set uid/gid was corrected.
	 * Ignore errors.
	 */
	va.va_type = VNON;
	/*
	 * VOP_SETATTR() may transiently drop uvp's lock, so it's
	 * important to call it before unionfs_node_update() transfers
	 * the unionfs vnode's lock from lvp to uvp; otherwise the
	 * unionfs vnode itself would be transiently unlocked and
	 * potentially doomed.
	 */
	VOP_SETATTR(uvp, &va, nd.ni_cnd.cn_cred);

	/*
	 * uvp may become doomed during VOP_VPUT_PAIR() if the implementation
	 * must temporarily drop uvp's lock.  However, since we hold a
	 * reference to uvp from the VOP_MKDIR() call above, this would require
	 * a forcible unmount of uvp's filesystem, which in turn can only
	 * happen if our unionfs instance is first forcibly unmounted.  We'll
	 * therefore catch this case in the NULL check of unp below.
	 */
	VOP_VPUT_PAIR(udvp, &uvp, false);
	vn_finished_write(mp);
	vn_lock_pair(vp, false, LK_EXCLUSIVE, uvp, true, LK_EXCLUSIVE);
	unp = VTOUNIONFS(vp);
	if (unp == NULL) {
		vput(uvp);
		error = ENOENT;
	} else
		unionfs_node_update(unp, uvp, td);
	VOP_UNLOCK(vp);

unionfs_mkshadowdir_relock:
	vn_lock(dvp, LK_EXCLUSIVE | LK_RETRY);
	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
	if (error == 0 && (VN_IS_DOOMED(dvp) || VN_IS_DOOMED(vp)))
		error = ENOENT;

unionfs_mkshadowdir_finish:
	unionfs_clear_in_progress_flag(vp, UNIONFS_COPY_IN_PROGRESS);
	cnp->cn_cred = credbk;
	chgproccnt(cred->cr_ruidinfo, -1, 0);
	crfree(cred);

	return (error);
}

static inline void
unionfs_forward_vop_ref(struct vnode *basevp, int *lkflags)
{
	ASSERT_VOP_LOCKED(basevp, __func__);
	*lkflags = VOP_ISLOCKED(basevp);
	vref(basevp);
}

/*
 * Prepare unionfs to issue a forwarded VOP to either the upper or lower
 * FS.  This should be used for any VOP which may drop the vnode lock;
 * it is not required otherwise.
 * The unionfs vnode shares its lock with the base-layer vnode(s); if the
 * base FS must transiently drop its vnode lock, the unionfs vnode may
 * effectively become unlocked.  During that window, a concurrent forced
 * unmount may doom the unionfs vnode, which leads to two significant
 * issues:
 * 1) Completion of, and return from, the unionfs VOP with the unionfs
 *    vnode completely unlocked.  When the unionfs vnode becomes doomed
 *    it stops sharing its lock with the base vnode, so even if the
 *    forwarded VOP reacquires the base vnode lock the unionfs vnode
 *    lock will no longer be held.  This can lead to violation of the
 *    caller's synchronization requirements as well as various failed
 *    locking assertions when DEBUG_VFS_LOCKS is enabled.
 * 2) Loss of reference on the base vnode.  The caller is expected to
 *    hold a v_usecount reference on the unionfs vnode, while the
 *    unionfs vnode holds a reference on the base-layer vnode(s).  But
 *    these references are released when the unionfs vnode becomes
 *    doomed, violating the base layer's expectation that its caller
 *    must hold a reference to prevent vnode recycling.
 *
 * basevp1 and basevp2 represent two base-layer vnodes which are
 * expected to be locked when this function is called.  basevp2
 * may be NULL, but if not NULL basevp1 and basevp2 should represent
 * a parent directory and a file linked to it, respectively.
 * lkflags1 and lkflags2 are output parameters that will store the
 * current lock status of basevp1 and basevp2, respectively.  They
 * are intended to be passed as the lkflags1 and lkflags2 parameters
 * in the subsequent call to unionfs_forward_vop_finish_pair().
 * lkflags2 may be NULL iff basevp2 is NULL.
 */
void
unionfs_forward_vop_start_pair(struct vnode *basevp1, int *lkflags1,
    struct vnode *basevp2, int *lkflags2)
{
	/*
	 * Take an additional reference on the base-layer vnodes to
	 * avoid loss of reference if the unionfs vnodes are doomed.
	 */
	unionfs_forward_vop_ref(basevp1, lkflags1);
	if (basevp2 != NULL)
		unionfs_forward_vop_ref(basevp2, lkflags2);
}

static inline bool
unionfs_forward_vop_rele(struct vnode *unionvp, struct vnode *basevp,
    int lkflags)
{
	bool unionvp_doomed;

	if (__predict_false(VTOUNIONFS(unionvp) == NULL)) {
		if ((lkflags & LK_EXCLUSIVE) != 0)
			ASSERT_VOP_ELOCKED(basevp, __func__);
		else
			ASSERT_VOP_LOCKED(basevp, __func__);
		unionvp_doomed = true;
	} else {
		vrele(basevp);
		unionvp_doomed = false;
	}

	return (unionvp_doomed);
}

/*
 * Indicate completion of a forwarded VOP previously prepared by
 * unionfs_forward_vop_start_pair().
 * basevp1 and basevp2 must be the same values passed to the prior
 * call to unionfs_forward_vop_start_pair().  unionvp1 and unionvp2
 * must be the unionfs vnodes that were initially above basevp1 and
 * basevp2, respectively.
 * basevp1 and basevp2 (if not NULL) must be locked when this function
 * is called, while unionvp1 and/or unionvp2 may be unlocked if either
 * unionfs vnode has become doomed.
 * lkflags1 and lkflags2 represent the locking flags that should be
 * used to re-lock unionvp1 and unionvp2, respectively, if either
 * vnode has become doomed.
 *
 * Returns true if any unionfs vnode was found to be doomed, false
 * otherwise.
 */
bool
unionfs_forward_vop_finish_pair(
    struct vnode *unionvp1, struct vnode *basevp1, int lkflags1,
    struct vnode *unionvp2, struct vnode *basevp2, int lkflags2)
{
	bool vp1_doomed, vp2_doomed;

	/*
	 * If either vnode is found to have been doomed, set
	 * a flag indicating that it needs to be re-locked.
	 * Otherwise, simply drop the base-vnode reference that
	 * was taken in unionfs_forward_vop_start().
	 */
	vp1_doomed = unionfs_forward_vop_rele(unionvp1, basevp1, lkflags1);

	if (unionvp2 != NULL)
		vp2_doomed = unionfs_forward_vop_rele(unionvp2, basevp2, lkflags2);
	else
		vp2_doomed = false;

	/*
	 * If any of the unionfs vnodes need to be re-locked, that
	 * means the unionfs vnode's lock is now de-coupled from the
	 * corresponding base vnode.  We therefore need to drop the
	 * base vnode lock (since nothing else will after this point),
	 * and also release the reference taken in
	 * unionfs_forward_vop_start_pair().
	 */
	if (__predict_false(vp1_doomed && vp2_doomed))
		VOP_VPUT_PAIR(basevp1, &basevp2, true);
	else if (__predict_false(vp1_doomed)) {
		/*
		 * If basevp1 needs to be unlocked, then we may not
		 * be able to safely unlock it with basevp2 still locked,
		 * for the same reason that an ordinary VFS call would
		 * need to use VOP_VPUT_PAIR() here.  We might be able
		 * to use VOP_VPUT_PAIR(..., false) here, but then we
		 * would need to deal with the possibility of basevp2
		 * changing out from under us, which could result in
		 * either the unionfs vnode becoming doomed or its
		 * upper/lower vp no longer matching basevp2.  Either
		 * scenario would require at least re-locking the unionfs
		 * vnode anyway.
		 */
		if (unionvp2 != NULL) {
			VOP_UNLOCK(unionvp2);
			vp2_doomed = true;
		}
		vput(basevp1);
	} else if (__predict_false(vp2_doomed))
		vput(basevp2);

	if (__predict_false(vp1_doomed || vp2_doomed))
		vn_lock_pair(unionvp1, !vp1_doomed, lkflags1,
		    unionvp2, !vp2_doomed, lkflags2);

	return (vp1_doomed || vp2_doomed);
}

/*
 * Create a new whiteout.
 *
 * dvp and vp are unionfs vnodes representing a parent directory and
 * child file, should be locked on entry, and will be locked on return.
 */
int
unionfs_mkwhiteout(struct vnode *dvp, struct vnode *vp,
    struct componentname *cnp, struct thread *td, char *path, int pathlen)
{
	struct vnode *udvp;
	struct vnode *wvp;
	struct nameidata nd;
	struct mount *mp;
	int error;
	bool dvp_locked;

	ASSERT_VOP_ELOCKED(dvp, __func__);
	ASSERT_VOP_ELOCKED(vp, __func__);

	udvp = VTOUNIONFS(dvp)->un_uppervp;
	wvp = NULLVP;
	NDPREINIT(&nd);
	vref(udvp);
	VOP_UNLOCK(vp);
	if ((error = unionfs_relookup(udvp, &wvp, cnp, &nd.ni_cnd, td, path,
	    pathlen, CREATE))) {
		goto unionfs_mkwhiteout_cleanup;
	}
	if (wvp != NULLVP) {
		if (udvp == wvp)
			vrele(wvp);
		else
			vput(wvp);

		if (nd.ni_cnd.cn_flags & ISWHITEOUT)
			error = 0;
		else
			error = EEXIST;
		goto unionfs_mkwhiteout_cleanup;
	}

	if ((error = vn_start_write(udvp, &mp, V_WAIT | V_PCATCH)))
		goto unionfs_mkwhiteout_cleanup;
	error = VOP_WHITEOUT(udvp, &nd.ni_cnd, CREATE);
	vn_finished_write(mp);

unionfs_mkwhiteout_cleanup:
	if (VTOUNIONFS(dvp) == NULL) {
		vput(udvp);
		dvp_locked = false;
	} else {
		vrele(udvp);
		dvp_locked = true;
	}
	vn_lock_pair(dvp, dvp_locked, LK_EXCLUSIVE, vp, false, LK_EXCLUSIVE);
	return (error);
}

/*
 * Create a new vnode for creating a new shadow file.
 *
 * If an error is returned, *vpp will be invalid, otherwise it will hold a
 * locked, referenced and opened vnode.
 *
 * unp is never updated.
 */
static int
unionfs_vn_create_on_upper(struct vnode **vpp, struct vnode *udvp,
    struct vnode *vp, struct vattr *uvap, struct thread *td)
{
	struct unionfs_mount *ump;
	struct unionfs_node *unp;
	struct vnode *uvp;
	struct vnode *lvp;
	struct ucred *cred;
	struct vattr lva;
	struct nameidata nd;
	int fmode;
	int error;

	ASSERT_VOP_ELOCKED(vp, __func__);
	unp = VTOUNIONFS(vp);
	ump = MOUNTTOUNIONFSMOUNT(UNIONFSTOV(unp)->v_mount);
	uvp = NULLVP;
	lvp = unp->un_lowervp;
	cred = td->td_ucred;
	fmode = FFLAGS(O_WRONLY | O_CREAT | O_TRUNC | O_EXCL);
	error = 0;

	if ((error = VOP_GETATTR(lvp, &lva, cred)) != 0)
		return (error);
	unionfs_create_uppervattr_core(ump, &lva, uvap, td);

	if (unp->un_path == NULL)
		panic("%s: NULL un_path", __func__);

	nd.ni_cnd.cn_namelen = unp->un_pathlen;
	nd.ni_cnd.cn_pnbuf = unp->un_path;
	nd.ni_cnd.cn_nameiop = CREATE;
	nd.ni_cnd.cn_flags = LOCKPARENT | LOCKLEAF | ISLASTCN;
	nd.ni_cnd.cn_lkflags = LK_EXCLUSIVE;
	nd.ni_cnd.cn_cred = cred;
	nd.ni_cnd.cn_nameptr = nd.ni_cnd.cn_pnbuf;
	NDPREINIT(&nd);

	vref(udvp);
	VOP_UNLOCK(vp);
	if ((error = vfs_relookup(udvp, &uvp, &nd.ni_cnd, false)) != 0) {
		vrele(udvp);
		return (error);
	}

	if (uvp != NULLVP) {
		if (uvp == udvp)
			vrele(uvp);
		else
			vput(uvp);
		error = EEXIST;
		goto unionfs_vn_create_on_upper_cleanup;
	}

	if ((error = VOP_CREATE(udvp, &uvp, &nd.ni_cnd, uvap)) != 0)
		goto unionfs_vn_create_on_upper_cleanup;

	if ((error = VOP_OPEN(uvp, fmode, cred, td, NULL)) != 0) {
		vput(uvp);
		goto unionfs_vn_create_on_upper_cleanup;
	}
	error = VOP_ADD_WRITECOUNT(uvp, 1);
	CTR3(KTR_VFS, "%s: vp %p v_writecount increased to %d",
	    __func__, uvp, uvp->v_writecount);
	if (error == 0) {
		*vpp = uvp;
	} else {
		VOP_CLOSE(uvp, fmode, cred, td);
	}

unionfs_vn_create_on_upper_cleanup:
	vput(udvp);
	return (error);
}

/*
 * Copy from lvp to uvp.
 *
 * lvp and uvp should be locked and opened on entry and will be locked and
 * opened on return.
 */
static int
unionfs_copyfile_core(struct vnode *lvp, struct vnode *uvp,
    struct ucred *cred, struct thread *td)
{
	char *buf;
	struct uio uio;
	struct iovec iov;
	off_t offset;
	int count;
	int error;
	int bufoffset;

	error = 0;
	memset(&uio, 0, sizeof(uio));

	uio.uio_td = td;
	uio.uio_segflg = UIO_SYSSPACE;
	uio.uio_offset = 0;

	buf = malloc(MAXBSIZE, M_TEMP, M_WAITOK);

	while (error == 0) {
		offset = uio.uio_offset;

		uio.uio_iov = &iov;
		uio.uio_iovcnt = 1;
		iov.iov_base = buf;
		iov.iov_len = MAXBSIZE;
		uio.uio_resid = iov.iov_len;
		uio.uio_rw = UIO_READ;

		if ((error = VOP_READ(lvp, &uio, 0, cred)) != 0)
			break;
		if ((count = MAXBSIZE - uio.uio_resid) == 0)
			break;

		bufoffset = 0;
		while (bufoffset < count) {
			uio.uio_iov = &iov;
			uio.uio_iovcnt = 1;
			iov.iov_base = buf + bufoffset;
			iov.iov_len = count - bufoffset;
			uio.uio_offset = offset + bufoffset;
			uio.uio_resid = iov.iov_len;
			uio.uio_rw = UIO_WRITE;

			if ((error = VOP_WRITE(uvp, &uio, 0, cred)) != 0)
				break;

			bufoffset += (count - bufoffset) - uio.uio_resid;
		}

		uio.uio_offset = offset + bufoffset;
	}

	free(buf, M_TEMP);

	return (error);
}

/*
 * Copy file from lower to upper.
 *
 * If docopy is nonzero, the file contents are copied as well; otherwise
 * only the shadow file is created on the upper layer.
 *
 * vp is a unionfs vnode that should be locked on entry and will be
 * locked on return.
 *
 * If no error is returned, unp will be updated.
 */
int
unionfs_copyfile(struct vnode *vp, int docopy, struct ucred *cred,
    struct thread *td)
{
	struct unionfs_node *unp;
	struct unionfs_node *dunp;
	struct mount *mp;
	struct vnode *udvp;
	struct vnode *lvp;
	struct vnode *uvp;
	struct vattr uva;
	int error;

	ASSERT_VOP_ELOCKED(vp, __func__);
	unp = VTOUNIONFS(vp);
	lvp = unp->un_lowervp;
	uvp = NULLVP;

	if ((UNIONFSTOV(unp)->v_mount->mnt_flag & MNT_RDONLY))
		return (EROFS);
	if (unp->un_dvp == NULLVP)
		return (EINVAL);
	if (unp->un_uppervp != NULLVP)
		return (EEXIST);

	udvp = NULLVP;
	VI_LOCK(unp->un_dvp);
	dunp = VTOUNIONFS(unp->un_dvp);
	if (dunp != NULL)
		udvp = dunp->un_uppervp;
	VI_UNLOCK(unp->un_dvp);

	if (udvp == NULLVP)
		return (EROFS);
	if ((udvp->v_mount->mnt_flag & MNT_RDONLY))
		return (EROFS);
	ASSERT_VOP_UNLOCKED(udvp, __func__);

	error = unionfs_set_in_progress_flag(vp, UNIONFS_COPY_IN_PROGRESS);
	if (error == EJUSTRETURN)
		return (0);
	else if (error != 0)
		return (error);

	error = VOP_ACCESS(lvp, VREAD, cred, td);
	if (error != 0)
		goto unionfs_copyfile_cleanup;

	if ((error = vn_start_write(udvp, &mp, V_WAIT | V_PCATCH)) != 0)
		goto unionfs_copyfile_cleanup;
	error = unionfs_vn_create_on_upper(&uvp, udvp, vp, &uva, td);
	if (error != 0) {
		vn_finished_write(mp);
		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
		goto unionfs_copyfile_cleanup;
	}

	/*
	 * Note that it's still possible for e.g. VOP_WRITE to relock
	 * uvp below while holding vp[=lvp] locked.  Replacing
	 * unionfs_copyfile_core with vn_generic_copy_file_range() will
	 * allow us to avoid the problem by moving this vn_lock_pair()
	 * call much later.
	 */
	vn_lock_pair(vp, false, LK_EXCLUSIVE, uvp, true, LK_EXCLUSIVE);
	unp = VTOUNIONFS(vp);
	if (unp == NULL) {
		error = ENOENT;
		goto unionfs_copyfile_cleanup;
	}

	if (docopy != 0) {
		error = VOP_OPEN(lvp, FREAD, cred, td, NULL);
		if (error == 0) {
			error = unionfs_copyfile_core(lvp, uvp, cred, td);
			VOP_CLOSE(lvp, FREAD, cred, td);
		}
	}
	VOP_CLOSE(uvp, FWRITE, cred, td);
	VOP_ADD_WRITECOUNT_CHECKED(uvp, -1);
	CTR3(KTR_VFS, "%s: vp %p v_writecount decreased to %d",
	    __func__, uvp, uvp->v_writecount);

	vn_finished_write(mp);

	if (error == 0) {
		/* Reset the attributes.  Ignore errors. */
		uva.va_type = VNON;
		VOP_SETATTR(uvp, &uva, cred);
		unionfs_node_update(unp, uvp, td);
	}

unionfs_copyfile_cleanup:
	unionfs_clear_in_progress_flag(vp, UNIONFS_COPY_IN_PROGRESS);
	return (error);
}

/*
 * Determine if the unionfs view of a directory is empty such that
 * an rmdir operation can be permitted.
 *
 * We assume the VOP_RMDIR() against the upper layer vnode will take
 * care of this check for us where the upper FS is concerned, so here
 * we concentrate on the lower FS.  We need to check for the presence
 * of files other than "." and ".." in the lower FS directory and
 * then cross-check any files we find against the upper FS to see if
 * a whiteout is present (in which case we treat the lower file as
 * non-present).
 *
 * The logic here is based heavily on vn_dir_check_empty().
 *
 * vp should be a locked unionfs node, and vp's lowervp should also be
 * locked.
 */
int
unionfs_check_rmdir(struct vnode *vp, struct ucred *cred, struct thread *td)
{
	struct vnode *uvp;
	struct vnode *lvp;
	struct vnode *tvp;
	char *dirbuf;
	size_t dirbuflen, len;
	off_t off;
	struct dirent *dp;
	struct componentname cn;
	struct vattr va;
	int error;
	int eofflag;

	eofflag = 0;
	lvp = UNIONFSVPTOLOWERVP(vp);
	uvp = UNIONFSVPTOUPPERVP(vp);

	/*
	 * Note that the locking here still isn't ideal: We expect the caller
	 * to hold both the upper and lower layer locks as well as the upper
	 * parent directory lock, which it can do in a manner that avoids
	 * deadlock.  However, if the cross-check logic below needs to call
	 * VOP_LOOKUP(), that may relock the upper vnode and lock any found
	 * child vnode in a way that doesn't protect against deadlock given
	 * the other held locks.  Beyond that, the various other VOPs we issue
	 * below, such as VOP_OPEN() and VOP_READDIR(), may also re-lock the
	 * lower vnode.
	 * We might instead just hand off between the upper vnode lock
	 * (and its parent directory lock) and the lower vnode lock as needed,
	 * so that the lower lock is never held at the same time as the upper
	 * locks, but that opens up a wider window in which the upper
	 * directory (and also the lower directory if it isn't truly
	 * read-only) may change while the relevant lock is dropped.  But
	 * since re-locking may happen here and open up such a window anyway,
	 * perhaps that is a worthwhile tradeoff?  Or perhaps we can ultimately
	 * do sufficient tracking of empty state within the unionfs vnode
	 * (in conjunction with upcalls from the lower FSes to notify us
	 * of out-of-band state changes) that we can avoid these costly checks
	 * altogether.
	 */
	ASSERT_VOP_LOCKED(lvp, __func__);
	ASSERT_VOP_ELOCKED(uvp, __func__);

	if ((error = VOP_GETATTR(uvp, &va, cred)) != 0)
		return (error);
	if (va.va_flags & OPAQUE)
		return (0);

#ifdef MAC
	if ((error = mac_vnode_check_open(cred, lvp, VEXEC | VREAD)) != 0)
		return (error);
#endif
	if ((error = VOP_ACCESS(lvp, VEXEC | VREAD, cred, td)) != 0)
		return (error);
	if ((error = VOP_OPEN(lvp, FREAD, cred, td, NULL)) != 0)
		return (error);
	if ((error = VOP_GETATTR(lvp, &va, cred)) != 0)
		return (error);

	dirbuflen = max(DEV_BSIZE, GENERIC_MAXDIRSIZ);
	if (dirbuflen < va.va_blocksize)
		dirbuflen = va.va_blocksize;
	dirbuf = malloc(dirbuflen, M_TEMP, M_WAITOK);

	len = 0;
	off = 0;
	eofflag = 0;

	for (;;) {
		error = vn_dir_next_dirent(lvp, td, dirbuf, dirbuflen,
		    &dp, &len, &off, &eofflag);
		if (error != 0)
			break;

		if (len == 0) {
			/* EOF */
			error = 0;
			break;
		}

		if (dp->d_type == DT_WHT)
			continue;

		/*
		 * Any file in the directory which is not '.' or '..' indicates
		 * the directory is not empty.
		 */
		switch (dp->d_namlen) {
		case 2:
			if (dp->d_name[1] != '.') {
				/* Can't be '..' (nor '.') */
				break;
			}
			/* FALLTHROUGH */
		case 1:
			if (dp->d_name[0] != '.') {
				/* Can't be '..' nor '.' */
				break;
			}
			continue;
		default:
			break;
		}

		cn.cn_namelen = dp->d_namlen;
		cn.cn_pnbuf = NULL;
		cn.cn_nameptr = dp->d_name;
		cn.cn_nameiop = LOOKUP;
		cn.cn_flags = LOCKPARENT | LOCKLEAF | RDONLY | ISLASTCN;
		cn.cn_lkflags = LK_EXCLUSIVE;
		cn.cn_cred = cred;

		error = VOP_LOOKUP(uvp, &tvp, &cn);
		if (tvp != NULLVP)
			vput(tvp);
		if (error != 0 && error != ENOENT && error != EJUSTRETURN)
			break;
		else if ((cn.cn_flags & ISWHITEOUT) == 0) {
			error = ENOTEMPTY;
			break;
		} else
			error = 0;
	}

	VOP_CLOSE(lvp, FREAD, cred, td);
	free(dirbuf, M_TEMP);
	return (error);
}