/*-
 * SPDX-License-Identifier: BSD-3-Clause
 *
 * Copyright (c) 1994 Jan-Simon Pendry
 * Copyright (c) 1994
 *	The Regents of the University of California.  All rights reserved.
 * Copyright (c) 2005, 2006, 2012 Masanori Ozawa <ozawa@ongs.co.jp>, ONGS Inc.
 * Copyright (c) 2006, 2012 Daichi Goto <daichi@freebsd.org>
 *
 * This code is derived from software contributed to Berkeley by
 * Jan-Simon Pendry.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/ktr.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/malloc.h>
#include <sys/mount.h>
#include <sys/namei.h>
#include <sys/proc.h>
#include <sys/vnode.h>
#include <sys/dirent.h>
#include <sys/fcntl.h>
#include <sys/filedesc.h>
#include <sys/stat.h>
#include <sys/sysctl.h>
#include <sys/taskqueue.h>
#include <sys/resourcevar.h>

#include <machine/atomic.h>

#include <security/mac/mac_framework.h>

#include <vm/uma.h>

#include <fs/unionfs/union.h>

#define	NUNIONFSNODECACHE 16
#define	UNIONFSHASHMASK (NUNIONFSNODECACHE - 1)

static MALLOC_DEFINE(M_UNIONFSHASH, "UNIONFS hash", "UNIONFS hash table");
MALLOC_DEFINE(M_UNIONFSNODE, "UNIONFS node", "UNIONFS vnode private part");
MALLOC_DEFINE(M_UNIONFSPATH, "UNIONFS path", "UNIONFS path private part");

static struct task unionfs_deferred_rele_task;
static struct mtx unionfs_deferred_rele_lock;
static STAILQ_HEAD(, unionfs_node) unionfs_deferred_rele_list =
    STAILQ_HEAD_INITIALIZER(unionfs_deferred_rele_list);
static TASKQUEUE_DEFINE_THREAD(unionfs_rele);

unsigned int unionfs_ndeferred = 0;
SYSCTL_UINT(_vfs, OID_AUTO, unionfs_ndeferred, CTLFLAG_RD,
    &unionfs_ndeferred, 0, "unionfs deferred vnode release");

static void unionfs_deferred_rele(void *, int);

/*
 * Initialize
 */
int
unionfs_init(struct vfsconf *vfsp)
{
	UNIONFSDEBUG("unionfs_init\n");	/* printed during system boot */
	TASK_INIT(&unionfs_deferred_rele_task, 0, unionfs_deferred_rele, NULL);
	mtx_init(&unionfs_deferred_rele_lock, "uniondefr", NULL, MTX_DEF);
	return (0);
}

/*
 * Uninitialize
 */
int
unionfs_uninit(struct vfsconf *vfsp)
{
	taskqueue_quiesce(taskqueue_unionfs_rele);
	taskqueue_free(taskqueue_unionfs_rele);
	mtx_destroy(&unionfs_deferred_rele_lock);
	return (0);
}

/*
 * Taskqueue handler: release the parent directory references queued by
 * unionfs_noderem() and free the corresponding unionfs nodes.
 */
static void
unionfs_deferred_rele(void *arg __unused, int pending __unused)
{
	STAILQ_HEAD(, unionfs_node) local_rele_list;
	struct unionfs_node *unp, *tunp;
	unsigned int ndeferred;

	ndeferred = 0;
	STAILQ_INIT(&local_rele_list);
	mtx_lock(&unionfs_deferred_rele_lock);
	STAILQ_CONCAT(&local_rele_list, &unionfs_deferred_rele_list);
	mtx_unlock(&unionfs_deferred_rele_lock);
	STAILQ_FOREACH_SAFE(unp, &local_rele_list, un_rele, tunp) {
		++ndeferred;
		MPASS(unp->un_dvp != NULL);
		vrele(unp->un_dvp);
		free(unp, M_UNIONFSNODE);
	}

	/* We expect this function to be single-threaded, thus no atomic */
	unionfs_ndeferred += ndeferred;
}

/*
 * Return the hash chain in dvp's child-vnode cache for the given upper or
 * lower base vnode.
 */
static struct unionfs_node_hashhead *
unionfs_get_hashhead(struct vnode *dvp, struct vnode *lookup)
{
	struct unionfs_node *unp;

	unp = VTOUNIONFS(dvp);

	return (&(unp->un_hashtbl[vfs_hash_index(lookup) & UNIONFSHASHMASK]));
}

/*
 * Attempt to lookup a cached unionfs vnode by upper/lower vp
 * from dvp, with dvp's interlock held.
 */
static struct vnode *
unionfs_get_cached_vnode_locked(struct vnode *lookup, struct vnode *dvp)
{
	struct unionfs_node *unp;
	struct unionfs_node_hashhead *hd;
	struct vnode *vp;

	hd = unionfs_get_hashhead(dvp, lookup);

	LIST_FOREACH(unp, hd, un_hash) {
		if (unp->un_uppervp == lookup ||
		    unp->un_lowervp == lookup) {
			vp = UNIONFSTOV(unp);
			VI_LOCK_FLAGS(vp, MTX_DUPOK);
			vp->v_iflag &= ~VI_OWEINACT;
			if (VN_IS_DOOMED(vp) ||
			    ((vp->v_iflag & VI_DOINGINACT) != 0)) {
				VI_UNLOCK(vp);
				vp = NULL;
			} else {
				vrefl(vp);
				VI_UNLOCK(vp);
			}
			return (vp);
		}
	}

	return (NULL);
}

/*
 * Get the cached vnode.
 */
static struct vnode *
unionfs_get_cached_vnode(struct vnode *uvp, struct vnode *lvp,
    struct vnode *dvp)
{
	struct vnode *vp;

	vp = NULL;
	VI_LOCK(dvp);
	if (uvp != NULL)
		vp = unionfs_get_cached_vnode_locked(uvp, dvp);
	else if (lvp != NULL)
		vp = unionfs_get_cached_vnode_locked(lvp, dvp);
	VI_UNLOCK(dvp);

	return (vp);
}

/*
 * Add the new vnode into cache.
 */
static struct vnode *
unionfs_ins_cached_vnode(struct unionfs_node *uncp,
    struct vnode *dvp)
{
	struct unionfs_node_hashhead *hd;
	struct vnode *vp;

	vp = NULL;
	VI_LOCK(dvp);
	if (uncp->un_uppervp != NULL) {
		ASSERT_VOP_ELOCKED(uncp->un_uppervp, __func__);
		KASSERT(uncp->un_uppervp->v_type == VDIR,
		    ("%s: v_type != VDIR", __func__));
		vp = unionfs_get_cached_vnode_locked(uncp->un_uppervp, dvp);
	} else if (uncp->un_lowervp != NULL) {
		ASSERT_VOP_ELOCKED(uncp->un_lowervp, __func__);
		KASSERT(uncp->un_lowervp->v_type == VDIR,
		    ("%s: v_type != VDIR", __func__));
		vp = unionfs_get_cached_vnode_locked(uncp->un_lowervp, dvp);
	}
	if (vp == NULL) {
		hd = unionfs_get_hashhead(dvp, (uncp->un_uppervp != NULL ?
		    uncp->un_uppervp : uncp->un_lowervp));
		LIST_INSERT_HEAD(hd, uncp, un_hash);
	}
	VI_UNLOCK(dvp);

	return (vp);
}

/*
 * Remove the vnode.
 */
static void
unionfs_rem_cached_vnode(struct unionfs_node *unp, struct vnode *dvp)
{
	KASSERT(unp != NULL, ("%s: null node", __func__));
	KASSERT(dvp != NULL,
	    ("%s: null parent vnode", __func__));

	VI_LOCK(dvp);
	if (unp->un_hash.le_prev != NULL) {
		LIST_REMOVE(unp, un_hash);
		unp->un_hash.le_next = NULL;
		unp->un_hash.le_prev = NULL;
	}
	VI_UNLOCK(dvp);
}

/*
 * Common cleanup handling for unionfs_nodeget
 * Upper, lower, and parent directory vnodes are expected to be referenced by
 * the caller.  Upper and lower vnodes, if non-NULL, are also expected to be
 * exclusively locked by the caller.
 * This function will return with the caller's locks and references undone.
 */
static void
unionfs_nodeget_cleanup(struct vnode *vp, struct unionfs_node *unp)
{

	/*
	 * Lock and reset the default vnode lock; vgone() expects a locked
	 * vnode, and we're going to reset the vnode ops.
	 */
	lockmgr(&vp->v_lock, LK_EXCLUSIVE, NULL);

	/*
	 * Clear out private data and reset the vnode ops to avoid use of
	 * unionfs vnode ops on a partially constructed vnode.
	 */
	VI_LOCK(vp);
	vp->v_data = NULL;
	vp->v_vnlock = &vp->v_lock;
	vp->v_op = &dead_vnodeops;
	VI_UNLOCK(vp);
	vgone(vp);
	vput(vp);

	if (unp->un_dvp != NULL)
		vrele(unp->un_dvp);
	if (unp->un_uppervp != NULL) {
		vput(unp->un_uppervp);
		if (unp->un_lowervp != NULL)
			vrele(unp->un_lowervp);
	} else if (unp->un_lowervp != NULL)
		vput(unp->un_lowervp);
	if (unp->un_hashtbl != NULL)
		hashdestroy(unp->un_hashtbl, M_UNIONFSHASH, UNIONFSHASHMASK);
	free(unp->un_path, M_UNIONFSPATH);
	free(unp, M_UNIONFSNODE);
}

/*
 * Make a new or get existing unionfs node.
 *
 * uppervp and lowervp should be unlocked.  Locking the new unionfs vnode
 * also locks uppervp or lowervp, so to prevent deadlock, do not hold more
 * than one of these locks at the same time.
 */
int
unionfs_nodeget(struct mount *mp, struct vnode *uppervp,
    struct vnode *lowervp, struct vnode *dvp, struct vnode **vpp,
    struct componentname *cnp)
{
	char *path;
	struct unionfs_mount *ump;
	struct unionfs_node *unp;
	struct vnode *vp;
	u_long hashmask;
	int error;
	int lkflags;
	__enum_uint8(vtype) vt;

	error = 0;
	ump = MOUNTTOUNIONFSMOUNT(mp);
	lkflags = (cnp ? cnp->cn_lkflags : 0);
	path = (cnp ? cnp->cn_nameptr : NULL);
	*vpp = NULL;

	if (uppervp == NULL && lowervp == NULL)
		panic("%s: upper and lower are both null", __func__);

	vt = (uppervp != NULL ? uppervp->v_type : lowervp->v_type);

	/* If it has no ISLASTCN flag, path check is skipped. */
	if (cnp && !(cnp->cn_flags & ISLASTCN))
		path = NULL;

	/* check the cache */
	if (dvp != NULL && vt == VDIR) {
		vp = unionfs_get_cached_vnode(uppervp, lowervp, dvp);
		if (vp != NULL) {
			*vpp = vp;
			if (lkflags != 0)
				vn_lock(*vpp, lkflags | LK_RETRY);
			return (0);
		}
	}

	unp = malloc(sizeof(struct unionfs_node),
	    M_UNIONFSNODE, M_WAITOK | M_ZERO);

	error = getnewvnode("unionfs", mp, &unionfs_vnodeops, &vp);
	if (error != 0) {
		free(unp, M_UNIONFSNODE);
		return (error);
	}
	if (dvp != NULL)
		vref(dvp);
	if (uppervp != NULL)
		vref(uppervp);
	if (lowervp != NULL)
		vref(lowervp);

	if (vt == VDIR) {
		unp->un_hashtbl = hashinit(NUNIONFSNODECACHE, M_UNIONFSHASH,
		    &hashmask);
		KASSERT(hashmask == UNIONFSHASHMASK,
		    ("unexpected unionfs hash mask 0x%lx", hashmask));
	}

	unp->un_vnode = vp;
	unp->un_uppervp = uppervp;
	unp->un_lowervp = lowervp;
	unp->un_dvp = dvp;
	if (uppervp != NULL)
		vp->v_vnlock = uppervp->v_vnlock;
	else
		vp->v_vnlock = lowervp->v_vnlock;

	if (path != NULL) {
		unp->un_path = malloc(cnp->cn_namelen + 1,
		    M_UNIONFSPATH, M_WAITOK | M_ZERO);
		bcopy(cnp->cn_nameptr, unp->un_path, cnp->cn_namelen);
		unp->un_path[cnp->cn_namelen] = '\0';
		unp->un_pathlen = cnp->cn_namelen;
	}
	vp->v_type = vt;
	vp->v_data = unp;

	/*
	 * TODO: This is an imperfect check, as there's no guarantee that
	 * the underlying filesystems will always return vnode pointers
	 * for the root inodes that match our cached values.  To reduce
	 * the likelihood of failure, for example in the case where either
	 * vnode has been forcibly doomed, we check both pointers and set
	 * VV_ROOT if either matches.
	 */
	if (ump->um_uppervp == uppervp || ump->um_lowervp == lowervp)
		vp->v_vflag |= VV_ROOT;
	KASSERT(dvp != NULL || (vp->v_vflag & VV_ROOT) != 0,
	    ("%s: NULL dvp for non-root vp %p", __func__, vp));

	/*
	 * NOTE: There is still a possibility for cross-filesystem locking
	 * here.  If dvp has an upper FS component and is locked, while the
	 * new vnode created here only has a lower-layer FS component, then
	 * we will end up taking a lower-FS lock while holding an upper-FS
	 * lock.  That situation could be dealt with here using
	 * vn_lock_pair().  However, that would only address one instance out
	 * of many in which a child vnode lock is taken while holding a lock
	 * on its parent directory.  This is done in many places in common
	 * VFS code, as well as a few places within unionfs (which could lead
	 * to the same cross-FS locking issue if, for example, the upper FS
	 * is another nested unionfs instance).  Additionally, it is unclear
	 * under what circumstances this specific lock sequence (a directory
	 * on one FS followed by a child of its 'peer' directory on another
	 * FS) would present the practical possibility of deadlock due to
	 * some other agent on the system attempting to lock those two
	 * specific vnodes in the opposite order.
	 */
	if (uppervp != NULL)
		vn_lock(uppervp, LK_EXCLUSIVE | LK_RETRY);
	else
		vn_lock(lowervp, LK_EXCLUSIVE | LK_RETRY);
	error = insmntque1(vp, mp);
	if (error != 0) {
		unionfs_nodeget_cleanup(vp, unp);
		return (error);
	}
	/*
	 * lowervp and uppervp should only be doomed by a forced unmount of
	 * their respective filesystems, but that can only happen if the
	 * unionfs instance is first unmounted.  We also effectively hold the
	 * lock on the new unionfs vnode at this point.  Therefore, if a
	 * unionfs umount has not yet reached the point at which the above
	 * insmntque1() would fail, then its vflush() call will end up
	 * blocked on our vnode lock, effectively also preventing unmount
	 * of the underlying filesystems.
	 */
	VNASSERT(lowervp == NULL || !VN_IS_DOOMED(lowervp), vp,
	    ("%s: doomed lowervp %p", __func__, lowervp));
	VNASSERT(uppervp == NULL || !VN_IS_DOOMED(uppervp), vp,
	    ("%s: doomed uppervp %p", __func__, uppervp));

	vn_set_state(vp, VSTATE_CONSTRUCTED);

	if (dvp != NULL && vt == VDIR)
		*vpp = unionfs_ins_cached_vnode(unp, dvp);
	if (*vpp != NULL) {
		unionfs_nodeget_cleanup(vp, unp);
		if (lkflags != 0)
			vn_lock(*vpp, lkflags | LK_RETRY);
		return (0);
	} else
		*vpp = vp;

	if ((lkflags & LK_SHARED) != 0)
		vn_lock(vp, LK_DOWNGRADE);
	else if ((lkflags & LK_EXCLUSIVE) == 0)
		VOP_UNLOCK(vp);

	return (0);
}

/*
 * Clean up the unionfs node.
 */
void
unionfs_noderem(struct vnode *vp)
{
	struct unionfs_node *unp, *unp_t1, *unp_t2;
	struct unionfs_node_hashhead *hd;
	struct unionfs_node_status *unsp, *unsp_tmp;
	struct vnode *lvp;
	struct vnode *uvp;
	struct vnode *dvp;
	int count;
	int writerefs;
	bool unlock_lvp;

	/*
	 * The root vnode lock may be recursed during unmount, because
	 * it may share the same lock as the unionfs mount's covered vnode,
	 * which is locked across VFS_UNMOUNT().  This lock will then be
	 * recursively taken during the vflush() issued by unionfs_unmount().
	 * But we still only need to lock the unionfs lock once, because only
	 * one of those lock operations was taken against a unionfs vnode and
	 * will be undone against a unionfs vnode.
	 */
	KASSERT(vp->v_vnlock->lk_recurse == 0 || (vp->v_vflag & VV_ROOT) != 0,
	    ("%s: vnode %p locked recursively", __func__, vp));

	unp = VTOUNIONFS(vp);
	VNASSERT(unp != NULL, vp, ("%s: already reclaimed", __func__));
	lvp = unp->un_lowervp;
	uvp = unp->un_uppervp;
	dvp = unp->un_dvp;
	unlock_lvp = (uvp == NULL);

	/*
	 * Lock the lower vnode in addition to the upper vnode lock in order
	 * to synchronize against any unionfs_lock() operation which may still
	 * hold the lower vnode lock.  We do not need to do this for the root
	 * vnode, as the root vnode should always have both upper and lower
	 * base vnodes for its entire lifecycle, so unionfs_lock() should
	 * never attempt to lock its lower vnode in the first place.
	 * Moreover, during unmount of a non-"below" unionfs mount, the lower
	 * root vnode will already be locked as it is the covered vnode.
	 */
	if (uvp != NULL && lvp != NULL && (vp->v_vflag & VV_ROOT) == 0) {
		vn_lock_pair(uvp, true, LK_EXCLUSIVE, lvp, false, LK_EXCLUSIVE);
		unlock_lvp = true;
	}

	if (lockmgr(&vp->v_lock, LK_EXCLUSIVE | LK_NOWAIT, NULL) != 0)
		panic("%s: failed to acquire lock for vnode lock", __func__);
	/*
	 * Use the interlock to protect the clearing of v_data to
	 * prevent faults in unionfs_lock().
	 */
	VI_LOCK(vp);
	unp->un_lowervp = unp->un_uppervp = NULL;
	vp->v_vnlock = &(vp->v_lock);
	vp->v_data = NULL;
	vp->v_object = NULL;
	if (unp->un_hashtbl != NULL) {
		/*
		 * Clear out any cached child vnodes.  This should only
		 * be necessary during forced unmount, when the vnode may
		 * be reclaimed with a non-zero use count.  Otherwise the
		 * reference held by each child should prevent reclamation.
		 */
		for (count = 0; count <= UNIONFSHASHMASK; count++) {
			hd = unp->un_hashtbl + count;
			LIST_FOREACH_SAFE(unp_t1, hd, un_hash, unp_t2) {
				LIST_REMOVE(unp_t1, un_hash);
				unp_t1->un_hash.le_next = NULL;
				unp_t1->un_hash.le_prev = NULL;
			}
		}
	}
	VI_UNLOCK(vp);

	writerefs = atomic_load_int(&vp->v_writecount);
	VNASSERT(writerefs >= 0, vp,
	    ("%s: write count %d, unexpected text ref", __func__, writerefs));
	/*
	 * If we were opened for write, we leased the write reference
	 * to the upper vnode.  If this is a reclamation due to the
	 * forced unmount, undo the reference now.
	 */
	if (writerefs > 0) {
		VNASSERT(uvp != NULL, vp,
		    ("%s: write reference without upper vnode", __func__));
		VOP_ADD_WRITECOUNT(uvp, -writerefs);
	}
	if (uvp != NULL)
		vput(uvp);
	if (unlock_lvp)
		vput(lvp);
	else if (lvp != NULL)
		vrele(lvp);

	if (dvp != NULL)
		unionfs_rem_cached_vnode(unp, dvp);

	if (unp->un_path != NULL) {
		free(unp->un_path, M_UNIONFSPATH);
		unp->un_path = NULL;
		unp->un_pathlen = 0;
	}

	if (unp->un_hashtbl != NULL) {
		hashdestroy(unp->un_hashtbl, M_UNIONFSHASH, UNIONFSHASHMASK);
	}

	LIST_FOREACH_SAFE(unsp, &(unp->un_unshead), uns_list, unsp_tmp) {
		LIST_REMOVE(unsp, uns_list);
		free(unsp, M_TEMP);
	}
	if (dvp != NULL) {
		mtx_lock(&unionfs_deferred_rele_lock);
		STAILQ_INSERT_TAIL(&unionfs_deferred_rele_list, unp, un_rele);
		mtx_unlock(&unionfs_deferred_rele_lock);
		taskqueue_enqueue(taskqueue_unionfs_rele,
		    &unionfs_deferred_rele_task);
	} else
		free(unp, M_UNIONFSNODE);
}

/*
 * Find the unionfs node status object for the vnode corresponding to unp,
 * for the process that owns td.  Return NULL if no such object exists.
 */
struct unionfs_node_status *
unionfs_find_node_status(struct unionfs_node *unp, struct thread *td)
{
	struct unionfs_node_status *unsp;
	pid_t pid;

	MPASS(td != NULL);
	pid = td->td_proc->p_pid;

	ASSERT_VOP_ELOCKED(UNIONFSTOV(unp), __func__);

	LIST_FOREACH(unsp, &(unp->un_unshead), uns_list) {
		if (unsp->uns_pid == pid) {
			return (unsp);
		}
	}

	return (NULL);
}

/*
 * Get the unionfs node status object for the vnode corresponding to unp,
 * for the process that owns td.  Allocate a new status object if one
 * does not already exist.
 */
void
unionfs_get_node_status(struct unionfs_node *unp, struct thread *td,
    struct unionfs_node_status **unspp)
{
	struct unionfs_node_status *unsp;
	pid_t pid;

	MPASS(td != NULL);
	pid = td->td_proc->p_pid;

	KASSERT(NULL != unspp, ("%s: NULL status", __func__));
	unsp = unionfs_find_node_status(unp, td);
	if (unsp == NULL) {
		/* create a new unionfs node status */
		unsp = malloc(sizeof(struct unionfs_node_status),
		    M_TEMP, M_WAITOK | M_ZERO);

		unsp->uns_pid = pid;
		LIST_INSERT_HEAD(&(unp->un_unshead), unsp, uns_list);
	}

	*unspp = unsp;
}

/*
 * Remove the unionfs node status, if possible.
 * The vnode must be exclusively locked.
 */
void
unionfs_tryrem_node_status(struct unionfs_node *unp,
    struct unionfs_node_status *unsp)
{
	KASSERT(NULL != unsp, ("%s: NULL status", __func__));
	ASSERT_VOP_ELOCKED(UNIONFSTOV(unp), __func__);

	if (0 < unsp->uns_lower_opencnt || 0 < unsp->uns_upper_opencnt)
		return;

	LIST_REMOVE(unsp, uns_list);
	free(unsp, M_TEMP);
}

/*
 * Create upper node attr.
 */
void
unionfs_create_uppervattr_core(struct unionfs_mount *ump, struct vattr *lva,
    struct vattr *uva, struct thread *td)
{
	VATTR_NULL(uva);
	uva->va_type = lva->va_type;
	uva->va_atime = lva->va_atime;
	uva->va_mtime = lva->va_mtime;
	uva->va_ctime = lva->va_ctime;

	switch (ump->um_copymode) {
	case UNIONFS_TRANSPARENT:
		uva->va_mode = lva->va_mode;
		uva->va_uid = lva->va_uid;
		uva->va_gid = lva->va_gid;
		break;
	case UNIONFS_MASQUERADE:
		if (ump->um_uid == lva->va_uid) {
			uva->va_mode = lva->va_mode & 077077;
			uva->va_mode |= (lva->va_type == VDIR ?
			    ump->um_udir : ump->um_ufile) & 0700;
			uva->va_uid = lva->va_uid;
			uva->va_gid = lva->va_gid;
		} else {
			uva->va_mode = (lva->va_type == VDIR ?
			    ump->um_udir : ump->um_ufile);
			uva->va_uid = ump->um_uid;
			uva->va_gid = ump->um_gid;
		}
		break;
	default:		/* UNIONFS_TRADITIONAL */
		uva->va_mode = 0777 & ~td->td_proc->p_pd->pd_cmask;
		uva->va_uid = ump->um_uid;
		uva->va_gid = ump->um_gid;
		break;
	}
}

/*
 * Create upper node attr.
 */
int
unionfs_create_uppervattr(struct unionfs_mount *ump, struct vnode *lvp,
    struct vattr *uva, struct ucred *cred, struct thread *td)
{
	struct vattr lva;
	int error;

	if ((error = VOP_GETATTR(lvp, &lva, cred)))
		return (error);

	unionfs_create_uppervattr_core(ump, &lva, uva, td);

	return (error);
}

/*
 * relookup
 *
 * dvp should be locked on entry and will be locked on return.
 *
 * If an error is returned, *vpp will be invalid, otherwise it will hold a
 * locked, referenced vnode.  If *vpp == dvp then remember that only one
 * LK_EXCLUSIVE lock is held.
 */
int
unionfs_relookup(struct vnode *dvp, struct vnode **vpp,
    struct componentname *cnp, struct componentname *cn, struct thread *td,
    char *path, int pathlen, u_long nameiop)
{
	int error;
	bool refstart;

	cn->cn_namelen = pathlen;
	cn->cn_pnbuf = path;
	cn->cn_nameiop = nameiop;
	cn->cn_flags = (LOCKPARENT | LOCKLEAF | ISLASTCN);
	cn->cn_lkflags = LK_EXCLUSIVE;
	cn->cn_cred = cnp->cn_cred;
	cn->cn_nameptr = cn->cn_pnbuf;

	refstart = false;
	if (nameiop == DELETE) {
		cn->cn_flags |= (cnp->cn_flags & DOWHITEOUT);
	} else if (nameiop == RENAME) {
		refstart = true;
	} else if (nameiop == CREATE) {
		cn->cn_flags |= NOCACHE;
	}

	vref(dvp);
	VOP_UNLOCK(dvp);

	if ((error = vfs_relookup(dvp, vpp, cn, refstart))) {
		vn_lock(dvp, LK_EXCLUSIVE | LK_RETRY);
	} else
		vrele(dvp);

	KASSERT(cn->cn_pnbuf == path, ("%s: cn_pnbuf changed", __func__));

	return (error);
}

/*
 * Update the unionfs_node.
 *
 * uvp is the new, locked upper vnode.  The unionfs vnode's lock will be
 * exchanged for uvp's lock, and the lower vnode's lock will be unlocked.
 */
static void
unionfs_node_update(struct unionfs_node *unp, struct vnode *uvp,
    struct thread *td)
{
	struct unionfs_node_hashhead *hd;
	struct vnode *vp;
	struct vnode *lvp;
	struct vnode *dvp;
	unsigned count, lockrec;

	vp = UNIONFSTOV(unp);
	lvp = unp->un_lowervp;
	ASSERT_VOP_ELOCKED(lvp, __func__);
	ASSERT_VOP_ELOCKED(uvp, __func__);
	dvp = unp->un_dvp;

	VNASSERT(vp->v_writecount == 0, vp,
	    ("%s: non-zero writecount", __func__));
	/*
	 * Update the upper vnode's lock state to match the lower vnode,
	 * and then switch the unionfs vnode's lock to the upper vnode.
	 */
	lockrec = lvp->v_vnlock->lk_recurse;
	for (count = 0; count < lockrec; count++)
		vn_lock(uvp, LK_EXCLUSIVE | LK_CANRECURSE | LK_RETRY);
	VI_LOCK(vp);
	unp->un_uppervp = uvp;
	vp->v_vnlock = uvp->v_vnlock;
	VI_UNLOCK(vp);

	for (count = 0; count < lockrec + 1; count++)
		VOP_UNLOCK(lvp);
	/*
	 * Re-cache the unionfs vnode against the upper vnode
	 */
	if (dvp != NULL && vp->v_type == VDIR) {
		VI_LOCK(dvp);
		if (unp->un_hash.le_prev != NULL) {
			LIST_REMOVE(unp, un_hash);
			hd = unionfs_get_hashhead(dvp, uvp);
			LIST_INSERT_HEAD(hd, unp, un_hash);
		}
		VI_UNLOCK(unp->un_dvp);
	}
}

/*
 * Mark a unionfs operation as being in progress, sleeping if the
 * same operation is already in progress.
 * This is useful, for example, during copy-up operations in which
 * we may drop the target vnode lock, but we want to avoid the
 * possibility of a concurrent copy-up on the same vnode triggering
 * a spurious failure.
 */
int
unionfs_set_in_progress_flag(struct vnode *vp, unsigned int flag)
{
	struct unionfs_node *unp;
	int error;

	error = 0;
	ASSERT_VOP_ELOCKED(vp, __func__);
	VI_LOCK(vp);
	unp = VTOUNIONFS(vp);
	while (error == 0 && (unp->un_flag & flag) != 0) {
		VOP_UNLOCK(vp);
		error = msleep(vp, VI_MTX(vp), PCATCH | PDROP, "unioncp", 0);
		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
		VI_LOCK(vp);
		if (error == 0) {
			/*
			 * If we waited on a concurrent copy-up and that
			 * copy-up was successful, return a non-fatal
			 * indication that the desired operation is already
			 * complete.  If we waited on a concurrent lookup,
			 * return ERELOOKUP to indicate the VFS cache should
			 * be re-queried to avoid creating a duplicate unionfs
			 * vnode.
			 */
			unp = VTOUNIONFS(vp);
			if (unp == NULL)
				error = ENOENT;
			else if (flag == UNIONFS_COPY_IN_PROGRESS &&
			    unp->un_uppervp != NULL)
				error = EJUSTRETURN;
			else if (flag == UNIONFS_LOOKUP_IN_PROGRESS)
				error = ERELOOKUP;
		}
	}
	if (error == 0)
		unp->un_flag |= flag;
	VI_UNLOCK(vp);

	return (error);
}

void
unionfs_clear_in_progress_flag(struct vnode *vp, unsigned int flag)
{
	struct unionfs_node *unp;

	ASSERT_VOP_ELOCKED(vp, __func__);
	unp = VTOUNIONFS(vp);
	VI_LOCK(vp);
	if (unp != NULL) {
		VNASSERT((unp->un_flag & flag) != 0, vp,
		    ("%s: copy not in progress", __func__));
		unp->un_flag &= ~flag;
	}
	wakeup(vp);
	VI_UNLOCK(vp);
}

/*
 * Create a new shadow dir.
 *
 * dvp and vp are unionfs vnodes representing a parent directory and
 * child file, should be locked on entry, and will be locked on return.
 *
 * If no error is returned, unp will be updated.
 */
int
unionfs_mkshadowdir(struct vnode *dvp, struct vnode *vp,
    struct componentname *cnp, struct thread *td)
{
	struct vnode *lvp;
	struct vnode *uvp;
	struct vnode *udvp;
	struct vattr va;
	struct vattr lva;
	struct nameidata nd;
	struct mount *mp;
	struct ucred *cred;
	struct ucred *credbk;
	struct uidinfo *rootinfo;
	struct unionfs_mount *ump;
	struct unionfs_node *dunp;
	struct unionfs_node *unp;
	int error;

	ASSERT_VOP_ELOCKED(dvp, __func__);
	ASSERT_VOP_ELOCKED(vp, __func__);
	ump = MOUNTTOUNIONFSMOUNT(vp->v_mount);
	unp = VTOUNIONFS(vp);
	if (unp->un_uppervp != NULL)
		return (EEXIST);
	dunp = VTOUNIONFS(dvp);
	udvp = dunp->un_uppervp;

	error = unionfs_set_in_progress_flag(vp, UNIONFS_COPY_IN_PROGRESS);
	if (error == EJUSTRETURN)
		return (0);
	else if (error != 0)
		return (error);

	lvp = unp->un_lowervp;
	uvp = NULL;
	credbk = cnp->cn_cred;

	/* Authority change to root */
	rootinfo = uifind((uid_t)0);
	cred = crdup(cnp->cn_cred);
	change_euid(cred, rootinfo);
	change_ruid(cred, rootinfo);
	change_svuid(cred, (uid_t)0);
	uifree(rootinfo);
	cnp->cn_cred = cred;

	memset(&nd.ni_cnd, 0, sizeof(struct componentname));
	NDPREINIT(&nd);

	if ((error = VOP_GETATTR(lvp, &lva, cnp->cn_cred)))
		goto unionfs_mkshadowdir_finish;

	vref(udvp);
	VOP_UNLOCK(vp);
	if ((error = unionfs_relookup(udvp, &uvp, cnp, &nd.ni_cnd, td,
	    cnp->cn_nameptr, cnp->cn_namelen, CREATE))) {
		/*
		 * When handling error cases here, we drop udvp's lock and
		 * then jump to exit code that relocks dvp, which in most
		 * cases will effectively relock udvp.  However, this is
		 * not guaranteed to be the case, as various calls made
		 * here (such as unionfs_relookup() above and VOP_MKDIR()
		 * below) may unlock and then relock udvp, allowing dvp to
		 * be reclaimed in the meantime.  In such a situation dvp
		 * will no longer share its lock with udvp.  Since
		 * performance isn't a concern for these error cases, it
		 * makes more sense to reuse the common code that locks
		 * dvp on exit than to explicitly check for reclamation
		 * of dvp.
		 */
		vput(udvp);
		goto unionfs_mkshadowdir_relock;
	}
	if (uvp != NULL) {
		if (udvp == uvp)
			vrele(uvp);
		else
			vput(uvp);

		error = EEXIST;
		vput(udvp);
		goto unionfs_mkshadowdir_relock;
	}

	if ((error = vn_start_write(udvp, &mp, V_WAIT | V_PCATCH))) {
		vput(udvp);
		goto unionfs_mkshadowdir_relock;
	}
	unionfs_create_uppervattr_core(ump, &lva, &va, td);

	/*
	 * Temporarily NUL-terminate the current pathname component.
	 * This function may be called during lookup operations in which
	 * the current pathname component is not the leaf, meaning that
	 * the NUL terminator is some distance beyond the end of the current
	 * component.  This *should* be fine, as cn_namelen will still
	 * correctly indicate the length of only the current component,
	 * but ZFS in particular does not respect cn_namelen in its VOP_MKDIR
	 * implementation.
	 * Note that this assumes nd.ni_cnd.cn_pnbuf was allocated by
	 * something like a local namei() operation and the temporary
	 * NUL-termination will not have an effect on other threads.
	 */
	char *pathend = &nd.ni_cnd.cn_nameptr[nd.ni_cnd.cn_namelen];
	char pathterm = *pathend;
	*pathend = '\0';
	error = VOP_MKDIR(udvp, &uvp, &nd.ni_cnd, &va);
	*pathend = pathterm;
	if (error != 0) {
		/*
		 * See the comment after unionfs_relookup() above for an
		 * explanation of why we unlock udvp here only to relock
		 * dvp on exit.
		 */
		vput(udvp);
		vn_finished_write(mp);
		goto unionfs_mkshadowdir_relock;
	}

	/*
	 * XXX The bug which cannot set uid/gid was corrected.
	 * Ignore errors.
	 */
	va.va_type = VNON;
	/*
	 * VOP_SETATTR() may transiently drop uvp's lock, so it's
	 * important to call it before unionfs_node_update() transfers
	 * the unionfs vnode's lock from lvp to uvp; otherwise the
	 * unionfs vnode itself would be transiently unlocked and
	 * potentially doomed.
	 */
	VOP_SETATTR(uvp, &va, nd.ni_cnd.cn_cred);

	/*
	 * uvp may become doomed during VOP_VPUT_PAIR() if the implementation
	 * must temporarily drop uvp's lock.  However, since we hold a
	 * reference to uvp from the VOP_MKDIR() call above, this would require
	 * a forcible unmount of uvp's filesystem, which in turn can only
	 * happen if our unionfs instance is first forcibly unmounted.  We'll
	 * therefore catch this case in the NULL check of unp below.
	 */
	VOP_VPUT_PAIR(udvp, &uvp, false);
	vn_finished_write(mp);
	vn_lock_pair(vp, false, LK_EXCLUSIVE, uvp, true, LK_EXCLUSIVE);
	unp = VTOUNIONFS(vp);
	if (unp == NULL) {
		vput(uvp);
		error = ENOENT;
	} else
		unionfs_node_update(unp, uvp, td);
	VOP_UNLOCK(vp);

unionfs_mkshadowdir_relock:
	vn_lock(dvp, LK_EXCLUSIVE | LK_RETRY);
	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
	if (error == 0 && (VN_IS_DOOMED(dvp) || VN_IS_DOOMED(vp)))
		error = ENOENT;

unionfs_mkshadowdir_finish:
	unionfs_clear_in_progress_flag(vp, UNIONFS_COPY_IN_PROGRESS);
	cnp->cn_cred = credbk;
	crfree(cred);

	return (error);
}

/*
 * Record basevp's current lock state and take an extra reference on it.
 */
static inline void
unionfs_forward_vop_ref(struct vnode *basevp, int *lkflags)
{
	ASSERT_VOP_LOCKED(basevp, __func__);
	*lkflags = VOP_ISLOCKED(basevp);
	vref(basevp);
}

/*
 * Prepare unionfs to issue a forwarded VOP to either the upper or lower
 * FS.  This should be used for any VOP which may drop the vnode lock;
 * it is not required otherwise.
 * The unionfs vnode shares its lock with the base-layer vnode(s); if the
 * base FS must transiently drop its vnode lock, the unionfs vnode may
 * effectively become unlocked.  During that window, a concurrent forced
 * unmount may doom the unionfs vnode, which leads to two significant
 * issues:
 * 1) Completion of, and return from, the unionfs VOP with the unionfs
 *    vnode completely unlocked.  When the unionfs vnode becomes doomed
 *    it stops sharing its lock with the base vnode, so even if the
 *    forwarded VOP reacquires the base vnode lock the unionfs vnode
 *    lock will no longer be held.  This can lead to violation of the
 *    caller's synchronization requirements as well as various failed
 *    locking assertions when DEBUG_VFS_LOCKS is enabled.
 * 2) Loss of reference on the base vnode.  The caller is expected to
 *    hold a v_usecount reference on the unionfs vnode, while the
 *    unionfs vnode holds a reference on the base-layer vnode(s).
 *    But these references are released when the unionfs vnode becomes
 *    doomed, violating the base layer's expectation that its caller
 *    must hold a reference to prevent vnode recycling.
 *
 * basevp1 and basevp2 represent two base-layer vnodes which are
 * expected to be locked when this function is called.  basevp2
 * may be NULL, but if not NULL basevp1 and basevp2 should represent
 * a parent directory and a file linked to it, respectively.
 * lkflags1 and lkflags2 are output parameters that will store the
 * current lock status of basevp1 and basevp2, respectively.  They
 * are intended to be passed as the lkflags1 and lkflags2 parameters
 * in the subsequent call to unionfs_forward_vop_finish_pair().
 * lkflags2 may be NULL iff basevp2 is NULL.
 */
void
unionfs_forward_vop_start_pair(struct vnode *basevp1, int *lkflags1,
    struct vnode *basevp2, int *lkflags2)
{
	/*
	 * Take an additional reference on the base-layer vnodes to
	 * avoid loss of reference if the unionfs vnodes are doomed.
	 */
	unionfs_forward_vop_ref(basevp1, lkflags1);
	if (basevp2 != NULL)
		unionfs_forward_vop_ref(basevp2, lkflags2);
}

/*
 * Drop the extra base vnode reference taken in unionfs_forward_vop_ref(),
 * unless the corresponding unionfs vnode has been doomed in the meantime.
 */
static inline bool
unionfs_forward_vop_rele(struct vnode *unionvp, struct vnode *basevp,
    int lkflags)
{
	bool unionvp_doomed;

	if (__predict_false(VTOUNIONFS(unionvp) == NULL)) {
		if ((lkflags & LK_EXCLUSIVE) != 0)
			ASSERT_VOP_ELOCKED(basevp, __func__);
		else
			ASSERT_VOP_LOCKED(basevp, __func__);
		unionvp_doomed = true;
	} else {
		vrele(basevp);
		unionvp_doomed = false;
	}

	return (unionvp_doomed);
}

/*
 * Indicate completion of a forwarded VOP previously prepared by
 * unionfs_forward_vop_start_pair().
 * basevp1 and basevp2 must be the same values passed to the prior
 * call to unionfs_forward_vop_start_pair().  unionvp1 and unionvp2
 * must be the unionfs vnodes that were initially above basevp1 and
 * basevp2, respectively.
 * basevp1 and basevp2 (if not NULL) must be locked when this function
 * is called, while unionvp1 and/or unionvp2 may be unlocked if either
 * unionfs vnode has become doomed.
 * lkflags1 and lkflags2 represent the locking flags that should be
 * used to re-lock unionvp1 and unionvp2, respectively, if either
 * vnode has become doomed.
 *
 * Returns true if any unionfs vnode was found to be doomed, false
 * otherwise.
 */
bool
unionfs_forward_vop_finish_pair(
    struct vnode *unionvp1, struct vnode *basevp1, int lkflags1,
    struct vnode *unionvp2, struct vnode *basevp2, int lkflags2)
{
	bool vp1_doomed, vp2_doomed;

	/*
	 * If either vnode is found to have been doomed, set
	 * a flag indicating that it needs to be re-locked.
	 * Otherwise, simply drop the base-vnode reference that
	 * was taken in unionfs_forward_vop_start().
	 */
	vp1_doomed = unionfs_forward_vop_rele(unionvp1, basevp1, lkflags1);

	if (unionvp2 != NULL)
		vp2_doomed = unionfs_forward_vop_rele(unionvp2, basevp2,
		    lkflags2);
	else
		vp2_doomed = false;

	/*
	 * If any of the unionfs vnodes need to be re-locked, that
	 * means the unionfs vnode's lock is now de-coupled from the
	 * corresponding base vnode.
	 * We therefore need to drop the base vnode lock (since nothing
	 * else will after this point), and also release the reference
	 * taken in unionfs_forward_vop_start_pair().
	 */
	if (__predict_false(vp1_doomed && vp2_doomed))
		VOP_VPUT_PAIR(basevp1, &basevp2, true);
	else if (__predict_false(vp1_doomed)) {
		/*
		 * If basevp1 needs to be unlocked, then we may not
		 * be able to safely unlock it with basevp2 still locked,
		 * for the same reason that an ordinary VFS call would
		 * need to use VOP_VPUT_PAIR() here.  We might be able
		 * to use VOP_VPUT_PAIR(..., false) here, but then we
		 * would need to deal with the possibility of basevp2
		 * changing out from under us, which could result in
		 * either the unionfs vnode becoming doomed or its
		 * upper/lower vp no longer matching basevp2.  Either
		 * scenario would require at least re-locking the unionfs
		 * vnode anyway.
		 */
		if (unionvp2 != NULL) {
			VOP_UNLOCK(unionvp2);
			vp2_doomed = true;
		}
		vput(basevp1);
	} else if (__predict_false(vp2_doomed))
		vput(basevp2);

	if (__predict_false(vp1_doomed || vp2_doomed))
		vn_lock_pair(unionvp1, !vp1_doomed, lkflags1,
		    unionvp2, !vp2_doomed, lkflags2);

	return (vp1_doomed || vp2_doomed);
}

/*
 * Create a new whiteout.
 *
 * dvp and vp are unionfs vnodes representing a parent directory and
 * child file, should be locked on entry, and will be locked on return.
 */
int
unionfs_mkwhiteout(struct vnode *dvp, struct vnode *vp,
    struct componentname *cnp, struct thread *td, char *path, int pathlen)
{
	struct vnode *udvp;
	struct vnode *wvp;
	struct nameidata nd;
	struct mount *mp;
	int error;
	bool dvp_locked;

	ASSERT_VOP_ELOCKED(dvp, __func__);
	ASSERT_VOP_ELOCKED(vp, __func__);

	udvp = VTOUNIONFS(dvp)->un_uppervp;
	wvp = NULL;
	NDPREINIT(&nd);
	vref(udvp);
	VOP_UNLOCK(vp);
	if ((error = unionfs_relookup(udvp, &wvp, cnp, &nd.ni_cnd, td, path,
	    pathlen, CREATE))) {
		goto unionfs_mkwhiteout_cleanup;
	}
	if (wvp != NULL) {
		if (udvp == wvp)
			vrele(wvp);
		else
			vput(wvp);

		if (nd.ni_cnd.cn_flags & ISWHITEOUT)
			error = 0;
		else
			error = EEXIST;
		goto unionfs_mkwhiteout_cleanup;
	}

	if ((error = vn_start_write(udvp, &mp, V_WAIT | V_PCATCH)))
		goto unionfs_mkwhiteout_cleanup;
	error = VOP_WHITEOUT(udvp, &nd.ni_cnd, CREATE);
	vn_finished_write(mp);

unionfs_mkwhiteout_cleanup:
	if (VTOUNIONFS(dvp) == NULL) {
		vput(udvp);
		dvp_locked = false;
	} else {
		vrele(udvp);
		dvp_locked = true;
	}
	vn_lock_pair(dvp, dvp_locked, LK_EXCLUSIVE, vp, false, LK_EXCLUSIVE);
	return (error);
}

/*
 * Create a new vnode for creating a new shadow file.
 *
 * If an error is returned, *vpp will be invalid, otherwise it will hold a
 * locked, referenced and opened vnode.
 *
 * unp is never updated.
 */
static int
unionfs_vn_create_on_upper(struct vnode **vpp, struct vnode *udvp,
    struct vnode *vp, struct vattr *uvap, struct thread *td)
{
	struct unionfs_mount *ump;
	struct unionfs_node *unp;
	struct vnode *uvp;
	struct vnode *lvp;
	struct ucred *cred;
	struct vattr lva;
	struct nameidata nd;
	int fmode;
	int error;

	ASSERT_VOP_ELOCKED(vp, __func__);
	unp = VTOUNIONFS(vp);
	ump = MOUNTTOUNIONFSMOUNT(UNIONFSTOV(unp)->v_mount);
	uvp = NULL;
	lvp = unp->un_lowervp;
	cred = td->td_ucred;
	fmode = FFLAGS(O_WRONLY | O_CREAT | O_TRUNC | O_EXCL);
	error = 0;

	if ((error = VOP_GETATTR(lvp, &lva, cred)) != 0)
		return (error);
	unionfs_create_uppervattr_core(ump, &lva, uvap, td);

	if (unp->un_path == NULL)
		panic("%s: NULL un_path", __func__);

	nd.ni_cnd.cn_namelen = unp->un_pathlen;
	nd.ni_cnd.cn_pnbuf = unp->un_path;
	nd.ni_cnd.cn_nameiop = CREATE;
	nd.ni_cnd.cn_flags = LOCKPARENT | LOCKLEAF | ISLASTCN;
	nd.ni_cnd.cn_lkflags = LK_EXCLUSIVE;
	nd.ni_cnd.cn_cred = cred;
	nd.ni_cnd.cn_nameptr = nd.ni_cnd.cn_pnbuf;
	NDPREINIT(&nd);

	vref(udvp);
	VOP_UNLOCK(vp);
	if ((error = vfs_relookup(udvp, &uvp, &nd.ni_cnd, false)) != 0) {
		vrele(udvp);
		return (error);
	}

	if (uvp != NULL) {
		if (uvp == udvp)
			vrele(uvp);
		else
			vput(uvp);
		error = EEXIST;
		goto unionfs_vn_create_on_upper_cleanup;
	}

	if ((error = VOP_CREATE(udvp, &uvp, &nd.ni_cnd, uvap)) != 0)
		goto unionfs_vn_create_on_upper_cleanup;

	if ((error = VOP_OPEN(uvp, fmode, cred, td, NULL)) != 0) {
		vput(uvp);
		goto unionfs_vn_create_on_upper_cleanup;
	}
	error = VOP_ADD_WRITECOUNT(uvp, 1);
	CTR3(KTR_VFS, "%s: vp %p v_writecount increased to %d",
	    __func__, uvp, uvp->v_writecount);
	if (error == 0) {
		*vpp = uvp;
	} else {
		VOP_CLOSE(uvp, fmode, cred, td);
	}

unionfs_vn_create_on_upper_cleanup:
	vput(udvp);
	return (error);
}

/*
 * Copy from lvp to uvp.
 *
 * lvp and uvp should be locked and opened on entry and will be locked and
 * opened on return.
 */
static int
unionfs_copyfile_core(struct vnode *lvp, struct vnode *uvp,
    struct ucred *cred, struct thread *td)
{
	char *buf;
	struct uio uio;
	struct iovec iov;
	off_t offset;
	int count;
	int error;
	int bufoffset;

	error = 0;
	memset(&uio, 0, sizeof(uio));

	uio.uio_td = td;
	uio.uio_segflg = UIO_SYSSPACE;
	uio.uio_offset = 0;

	buf = malloc(MAXBSIZE, M_TEMP, M_WAITOK);

	while (error == 0) {
		offset = uio.uio_offset;

		uio.uio_iov = &iov;
		uio.uio_iovcnt = 1;
		iov.iov_base = buf;
		iov.iov_len = MAXBSIZE;
		uio.uio_resid = iov.iov_len;
		uio.uio_rw = UIO_READ;

		if ((error = VOP_READ(lvp, &uio, 0, cred)) != 0)
			break;
		if ((count = MAXBSIZE - uio.uio_resid) == 0)
			break;

		bufoffset = 0;
		while (bufoffset < count) {
			uio.uio_iov = &iov;
			uio.uio_iovcnt = 1;
			iov.iov_base = buf + bufoffset;
			iov.iov_len = count - bufoffset;
			uio.uio_offset = offset + bufoffset;
			uio.uio_resid = iov.iov_len;
			uio.uio_rw = UIO_WRITE;

			if ((error = VOP_WRITE(uvp, &uio, 0, cred)) != 0)
				break;

			bufoffset += (count - bufoffset) - uio.uio_resid;
		}

		uio.uio_offset = offset + bufoffset;
	}

	free(buf, M_TEMP);

	return (error);
}

/*
 * Copy file from lower to upper.
 *
 * If you need a copy of the file contents, set docopy to 1; otherwise,
 * set it to 0.
 *
 * vp is a unionfs vnode that should be locked on entry and will be
 * locked on return.
 *
 * If no error is returned, unp will be updated.
 */
int
unionfs_copyfile(struct vnode *vp, int docopy, struct ucred *cred,
    struct thread *td)
{
	struct unionfs_node *unp;
	struct unionfs_node *dunp;
	struct mount *mp;
	struct vnode *udvp;
	struct vnode *lvp;
	struct vnode *uvp;
	struct vattr uva;
	int error;

	ASSERT_VOP_ELOCKED(vp, __func__);
	unp = VTOUNIONFS(vp);
	lvp = unp->un_lowervp;
	uvp = NULL;

	if ((UNIONFSTOV(unp)->v_mount->mnt_flag & MNT_RDONLY))
		return (EROFS);
	if (unp->un_dvp == NULL)
		return (EINVAL);
	if (unp->un_uppervp != NULL)
		return (EEXIST);

	udvp = NULL;
	VI_LOCK(unp->un_dvp);
	dunp = VTOUNIONFS(unp->un_dvp);
	if (dunp != NULL)
		udvp = dunp->un_uppervp;
	VI_UNLOCK(unp->un_dvp);

	if (udvp == NULL)
		return (EROFS);
	if ((udvp->v_mount->mnt_flag & MNT_RDONLY))
		return (EROFS);
	ASSERT_VOP_UNLOCKED(udvp, __func__);

	error = unionfs_set_in_progress_flag(vp, UNIONFS_COPY_IN_PROGRESS);
	if (error == EJUSTRETURN)
		return (0);
	else if (error != 0)
		return (error);

	error = VOP_ACCESS(lvp, VREAD, cred, td);
	if (error != 0)
		goto unionfs_copyfile_cleanup;

	if ((error = vn_start_write(udvp, &mp, V_WAIT | V_PCATCH)) != 0)
		goto unionfs_copyfile_cleanup;
	error = unionfs_vn_create_on_upper(&uvp, udvp, vp, &uva, td);
	if (error != 0) {
		vn_finished_write(mp);
		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
		goto unionfs_copyfile_cleanup;
	}

	/*
	 * Note that it's still possible for e.g. VOP_WRITE to relock
	 * uvp below while holding vp[=lvp] locked.  Replacing
	 * unionfs_copyfile_core with vn_generic_copy_file_range() will
	 * allow us to avoid the problem by moving this vn_lock_pair()
	 * call much later.
	 */
	vn_lock_pair(vp, false, LK_EXCLUSIVE, uvp, true, LK_EXCLUSIVE);
	unp = VTOUNIONFS(vp);
	if (unp == NULL) {
		error = ENOENT;
		goto unionfs_copyfile_cleanup;
	}

	if (docopy != 0) {
		error = VOP_OPEN(lvp, FREAD, cred, td, NULL);
		if (error == 0) {
			error = unionfs_copyfile_core(lvp, uvp, cred, td);
			VOP_CLOSE(lvp, FREAD, cred, td);
		}
	}
	VOP_CLOSE(uvp, FWRITE, cred, td);
	VOP_ADD_WRITECOUNT_CHECKED(uvp, -1);
	CTR3(KTR_VFS, "%s: vp %p v_writecount decreased to %d",
	    __func__, uvp, uvp->v_writecount);

	vn_finished_write(mp);

	if (error == 0) {
		/* Reset the attributes.  Ignore errors. */
		uva.va_type = VNON;
		VOP_SETATTR(uvp, &uva, cred);
		unionfs_node_update(unp, uvp, td);
	}

unionfs_copyfile_cleanup:
	unionfs_clear_in_progress_flag(vp, UNIONFS_COPY_IN_PROGRESS);
	return (error);
}

/*
 * Determine if the unionfs view of a directory is empty such that
 * an rmdir operation can be permitted.
 *
 * We assume the VOP_RMDIR() against the upper layer vnode will take
 * care of this check for us where the upper FS is concerned, so here
 * we concentrate on the lower FS.  We need to check for the presence
 * of files other than "." and ".." in the lower FS directory and
 * then cross-check any files we find against the upper FS to see if
 * a whiteout is present (in which case we treat the lower file as
 * non-present).
 *
 * The logic here is based heavily on vn_dir_check_empty().
 *
 * vp should be a locked unionfs node, and vp's lowervp should also be
 * locked.
 */
int
unionfs_check_rmdir(struct vnode *vp, struct ucred *cred, struct thread *td)
{
	struct vnode *uvp;
	struct vnode *lvp;
	struct vnode *tvp;
	char *dirbuf;
	size_t dirbuflen, len;
	off_t off;
	struct dirent *dp;
	struct componentname cn;
	struct vattr va;
	int error;
	int eofflag;

	eofflag = 0;
	lvp = UNIONFSVPTOLOWERVP(vp);
	uvp = UNIONFSVPTOUPPERVP(vp);

	/*
	 * Note that the locking here still isn't ideal: We expect the caller
	 * to hold both the upper and lower layer locks as well as the upper
	 * parent directory lock, which it can do in a manner that avoids
	 * deadlock.  However, if the cross-check logic below needs to call
	 * VOP_LOOKUP(), that may relock the upper vnode and lock any found
	 * child vnode in a way that doesn't protect against deadlock given
	 * the other held locks.  Beyond that, the various other VOPs we issue
	 * below, such as VOP_OPEN() and VOP_READDIR(), may also re-lock the
	 * lower vnode.
	 * We might instead just hand off between the upper vnode lock
	 * (and its parent directory lock) and the lower vnode lock as needed,
	 * so that the lower lock is never held at the same time as the upper
	 * locks, but that opens up a wider window in which the upper
	 * directory (and also the lower directory if it isn't truly
	 * read-only) may change while the relevant lock is dropped.  But
	 * since re-locking may happen here and open up such a window anyway,
	 * perhaps that is a worthwhile tradeoff?  Or perhaps we can
	 * ultimately do sufficient tracking of empty state within the
	 * unionfs vnode (in conjunction with upcalls from the lower FSes to
	 * notify us of out-of-band state changes) that we can avoid these
	 * costly checks altogether.
	 */
	ASSERT_VOP_LOCKED(lvp, __func__);
	ASSERT_VOP_ELOCKED(uvp, __func__);

	if ((error = VOP_GETATTR(uvp, &va, cred)) != 0)
		return (error);
	if (va.va_flags & OPAQUE)
		return (0);

#ifdef MAC
	if ((error = mac_vnode_check_open(cred, lvp, VEXEC | VREAD)) != 0)
		return (error);
#endif
	if ((error = VOP_ACCESS(lvp, VEXEC | VREAD, cred, td)) != 0)
		return (error);
	if ((error = VOP_OPEN(lvp, FREAD, cred, td, NULL)) != 0)
		return (error);
	if ((error = VOP_GETATTR(lvp, &va, cred)) != 0)
		return (error);

	dirbuflen = max(DEV_BSIZE, GENERIC_MAXDIRSIZ);
	if (dirbuflen < va.va_blocksize)
		dirbuflen = va.va_blocksize;
	dirbuf = malloc(dirbuflen, M_TEMP, M_WAITOK);

	len = 0;
	off = 0;
	eofflag = 0;

	for (;;) {
		error = vn_dir_next_dirent(lvp, td, dirbuf, dirbuflen,
		    &dp, &len, &off, &eofflag);
		if (error != 0)
			break;

		if (len == 0) {
			/* EOF */
			error = 0;
			break;
		}

		if (dp->d_type == DT_WHT)
			continue;

		/*
		 * Any file in the directory which is not '.' or '..' indicates
		 * the directory is not empty.
		 */
		switch (dp->d_namlen) {
		case 2:
			if (dp->d_name[1] != '.') {
				/* Can't be '..' (nor '.') */
				break;
			}
			/* FALLTHROUGH */
		case 1:
			if (dp->d_name[0] != '.') {
				/* Can't be '..' nor '.' */
				break;
			}
			continue;
		default:
			break;
		}

		cn.cn_namelen = dp->d_namlen;
		cn.cn_pnbuf = NULL;
		cn.cn_nameptr = dp->d_name;
		cn.cn_nameiop = LOOKUP;
		cn.cn_flags = LOCKPARENT | LOCKLEAF | RDONLY | ISLASTCN;
		cn.cn_lkflags = LK_EXCLUSIVE;
		cn.cn_cred = cred;

		error = VOP_LOOKUP(uvp, &tvp, &cn);
		if (tvp != NULL)
			vput(tvp);
		if (error != 0 && error != ENOENT && error != EJUSTRETURN)
			break;
		else if ((cn.cn_flags & ISWHITEOUT) == 0) {
			error = ENOTEMPTY;
			break;
		} else
			error = 0;
	}

	VOP_CLOSE(lvp, FREAD, cred, td);
	free(dirbuf, M_TEMP);
	return (error);
}