1 /*- 2 * SPDX-License-Identifier: BSD-3-Clause 3 * 4 * Copyright (c) 1994 Jan-Simon Pendry 5 * Copyright (c) 1994 6 * The Regents of the University of California. All rights reserved. 7 * Copyright (c) 2005, 2006, 2012 Masanori Ozawa <ozawa@ongs.co.jp>, ONGS Inc. 8 * Copyright (c) 2006, 2012 Daichi Goto <daichi@freebsd.org> 9 * 10 * This code is derived from software contributed to Berkeley by 11 * Jan-Simon Pendry. 12 * 13 * Redistribution and use in source and binary forms, with or without 14 * modification, are permitted provided that the following conditions 15 * are met: 16 * 1. Redistributions of source code must retain the above copyright 17 * notice, this list of conditions and the following disclaimer. 18 * 2. Redistributions in binary form must reproduce the above copyright 19 * notice, this list of conditions and the following disclaimer in the 20 * documentation and/or other materials provided with the distribution. 21 * 3. Neither the name of the University nor the names of its contributors 22 * may be used to endorse or promote products derived from this software 23 * without specific prior written permission. 24 * 25 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 26 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 27 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 28 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 29 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 30 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 31 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 32 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 33 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 34 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 35 * SUCH DAMAGE. 
36 */ 37 38 #include <sys/param.h> 39 #include <sys/systm.h> 40 #include <sys/kernel.h> 41 #include <sys/ktr.h> 42 #include <sys/lock.h> 43 #include <sys/mutex.h> 44 #include <sys/malloc.h> 45 #include <sys/mount.h> 46 #include <sys/namei.h> 47 #include <sys/proc.h> 48 #include <sys/vnode.h> 49 #include <sys/dirent.h> 50 #include <sys/fcntl.h> 51 #include <sys/filedesc.h> 52 #include <sys/stat.h> 53 #include <sys/sysctl.h> 54 #include <sys/taskqueue.h> 55 #include <sys/resourcevar.h> 56 57 #include <machine/atomic.h> 58 59 #include <security/mac/mac_framework.h> 60 61 #include <vm/uma.h> 62 63 #include <fs/unionfs/union.h> 64 65 #define NUNIONFSNODECACHE 16 66 #define UNIONFSHASHMASK (NUNIONFSNODECACHE - 1) 67 68 static MALLOC_DEFINE(M_UNIONFSHASH, "UNIONFS hash", "UNIONFS hash table"); 69 MALLOC_DEFINE(M_UNIONFSNODE, "UNIONFS node", "UNIONFS vnode private part"); 70 MALLOC_DEFINE(M_UNIONFSPATH, "UNIONFS path", "UNIONFS path private part"); 71 72 static struct task unionfs_deferred_rele_task; 73 static struct mtx unionfs_deferred_rele_lock; 74 static STAILQ_HEAD(, unionfs_node) unionfs_deferred_rele_list = 75 STAILQ_HEAD_INITIALIZER(unionfs_deferred_rele_list); 76 static TASKQUEUE_DEFINE_THREAD(unionfs_rele); 77 78 unsigned int unionfs_ndeferred = 0; 79 SYSCTL_UINT(_vfs, OID_AUTO, unionfs_ndeferred, CTLFLAG_RD, 80 &unionfs_ndeferred, 0, "unionfs deferred vnode release"); 81 82 static void unionfs_deferred_rele(void *, int); 83 84 /* 85 * Initialize 86 */ 87 int 88 unionfs_init(struct vfsconf *vfsp) 89 { 90 UNIONFSDEBUG("unionfs_init\n"); /* printed during system boot */ 91 TASK_INIT(&unionfs_deferred_rele_task, 0, unionfs_deferred_rele, NULL); 92 mtx_init(&unionfs_deferred_rele_lock, "uniondefr", NULL, MTX_DEF); 93 return (0); 94 } 95 96 /* 97 * Uninitialize 98 */ 99 int 100 unionfs_uninit(struct vfsconf *vfsp) 101 { 102 taskqueue_quiesce(taskqueue_unionfs_rele); 103 taskqueue_free(taskqueue_unionfs_rele); 104 mtx_destroy(&unionfs_deferred_rele_lock); 105 return 
(0); 106 } 107 108 static void 109 unionfs_deferred_rele(void *arg __unused, int pending __unused) 110 { 111 STAILQ_HEAD(, unionfs_node) local_rele_list; 112 struct unionfs_node *unp, *tunp; 113 unsigned int ndeferred; 114 115 ndeferred = 0; 116 STAILQ_INIT(&local_rele_list); 117 mtx_lock(&unionfs_deferred_rele_lock); 118 STAILQ_CONCAT(&local_rele_list, &unionfs_deferred_rele_list); 119 mtx_unlock(&unionfs_deferred_rele_lock); 120 STAILQ_FOREACH_SAFE(unp, &local_rele_list, un_rele, tunp) { 121 ++ndeferred; 122 MPASS(unp->un_dvp != NULL); 123 vrele(unp->un_dvp); 124 free(unp, M_UNIONFSNODE); 125 } 126 127 /* We expect this function to be single-threaded, thus no atomic */ 128 unionfs_ndeferred += ndeferred; 129 } 130 131 static struct unionfs_node_hashhead * 132 unionfs_get_hashhead(struct vnode *dvp, struct vnode *lookup) 133 { 134 struct unionfs_node *unp; 135 136 unp = VTOUNIONFS(dvp); 137 138 return (&(unp->un_hashtbl[vfs_hash_index(lookup) & UNIONFSHASHMASK])); 139 } 140 141 /* 142 * Attempt to lookup a cached unionfs vnode by upper/lower vp 143 * from dvp, with dvp's interlock held. 144 */ 145 static struct vnode * 146 unionfs_get_cached_vnode_locked(struct vnode *lookup, struct vnode *dvp) 147 { 148 struct unionfs_node *unp; 149 struct unionfs_node_hashhead *hd; 150 struct vnode *vp; 151 152 hd = unionfs_get_hashhead(dvp, lookup); 153 154 LIST_FOREACH(unp, hd, un_hash) { 155 if (unp->un_uppervp == lookup || 156 unp->un_lowervp == lookup) { 157 vp = UNIONFSTOV(unp); 158 VI_LOCK_FLAGS(vp, MTX_DUPOK); 159 vp->v_iflag &= ~VI_OWEINACT; 160 if (VN_IS_DOOMED(vp) || 161 ((vp->v_iflag & VI_DOINGINACT) != 0)) { 162 VI_UNLOCK(vp); 163 vp = NULLVP; 164 } else { 165 vrefl(vp); 166 VI_UNLOCK(vp); 167 } 168 return (vp); 169 } 170 } 171 172 return (NULLVP); 173 } 174 175 176 /* 177 * Get the cached vnode. 
178 */ 179 static struct vnode * 180 unionfs_get_cached_vnode(struct vnode *uvp, struct vnode *lvp, 181 struct vnode *dvp) 182 { 183 struct vnode *vp; 184 185 vp = NULLVP; 186 VI_LOCK(dvp); 187 if (uvp != NULLVP) 188 vp = unionfs_get_cached_vnode_locked(uvp, dvp); 189 else if (lvp != NULLVP) 190 vp = unionfs_get_cached_vnode_locked(lvp, dvp); 191 VI_UNLOCK(dvp); 192 193 return (vp); 194 } 195 196 /* 197 * Add the new vnode into cache. 198 */ 199 static struct vnode * 200 unionfs_ins_cached_vnode(struct unionfs_node *uncp, 201 struct vnode *dvp) 202 { 203 struct unionfs_node_hashhead *hd; 204 struct vnode *vp; 205 206 vp = NULLVP; 207 VI_LOCK(dvp); 208 if (uncp->un_uppervp != NULLVP) { 209 ASSERT_VOP_ELOCKED(uncp->un_uppervp, __func__); 210 KASSERT(uncp->un_uppervp->v_type == VDIR, 211 ("%s: v_type != VDIR", __func__)); 212 vp = unionfs_get_cached_vnode_locked(uncp->un_uppervp, dvp); 213 } else if (uncp->un_lowervp != NULLVP) { 214 ASSERT_VOP_ELOCKED(uncp->un_lowervp, __func__); 215 KASSERT(uncp->un_lowervp->v_type == VDIR, 216 ("%s: v_type != VDIR", __func__)); 217 vp = unionfs_get_cached_vnode_locked(uncp->un_lowervp, dvp); 218 } 219 if (vp == NULLVP) { 220 hd = unionfs_get_hashhead(dvp, (uncp->un_uppervp != NULLVP ? 221 uncp->un_uppervp : uncp->un_lowervp)); 222 LIST_INSERT_HEAD(hd, uncp, un_hash); 223 } 224 VI_UNLOCK(dvp); 225 226 return (vp); 227 } 228 229 /* 230 * Remove the vnode. 231 */ 232 static void 233 unionfs_rem_cached_vnode(struct unionfs_node *unp, struct vnode *dvp) 234 { 235 KASSERT(unp != NULL, ("%s: null node", __func__)); 236 KASSERT(dvp != NULLVP, 237 ("%s: null parent vnode", __func__)); 238 239 VI_LOCK(dvp); 240 if (unp->un_hash.le_prev != NULL) { 241 LIST_REMOVE(unp, un_hash); 242 unp->un_hash.le_next = NULL; 243 unp->un_hash.le_prev = NULL; 244 } 245 VI_UNLOCK(dvp); 246 } 247 248 /* 249 * Common cleanup handling for unionfs_nodeget 250 * Upper, lower, and parent directory vnodes are expected to be referenced by 251 * the caller. 
Upper and lower vnodes, if non-NULL, are also expected to be 252 * exclusively locked by the caller. 253 * This function will return with the caller's locks and references undone. 254 */ 255 static void 256 unionfs_nodeget_cleanup(struct vnode *vp, struct unionfs_node *unp) 257 { 258 259 /* 260 * Lock and reset the default vnode lock; vgone() expects a locked 261 * vnode, and we're going to reset the vnode ops. 262 */ 263 lockmgr(&vp->v_lock, LK_EXCLUSIVE, NULL); 264 265 /* 266 * Clear out private data and reset the vnode ops to avoid use of 267 * unionfs vnode ops on a partially constructed vnode. 268 */ 269 VI_LOCK(vp); 270 vp->v_data = NULL; 271 vp->v_vnlock = &vp->v_lock; 272 vp->v_op = &dead_vnodeops; 273 VI_UNLOCK(vp); 274 vgone(vp); 275 vput(vp); 276 277 if (unp->un_dvp != NULLVP) 278 vrele(unp->un_dvp); 279 if (unp->un_uppervp != NULLVP) { 280 vput(unp->un_uppervp); 281 if (unp->un_lowervp != NULLVP) 282 vrele(unp->un_lowervp); 283 } else if (unp->un_lowervp != NULLVP) 284 vput(unp->un_lowervp); 285 if (unp->un_hashtbl != NULL) 286 hashdestroy(unp->un_hashtbl, M_UNIONFSHASH, UNIONFSHASHMASK); 287 free(unp->un_path, M_UNIONFSPATH); 288 free(unp, M_UNIONFSNODE); 289 } 290 291 /* 292 * Make a new or get existing unionfs node. 293 * 294 * uppervp and lowervp should be unlocked. Because if new unionfs vnode is 295 * locked, uppervp or lowervp is locked too. In order to prevent dead lock, 296 * you should not lock plurality simultaneously. 297 */ 298 int 299 unionfs_nodeget(struct mount *mp, struct vnode *uppervp, 300 struct vnode *lowervp, struct vnode *dvp, struct vnode **vpp, 301 struct componentname *cnp) 302 { 303 char *path; 304 struct unionfs_mount *ump; 305 struct unionfs_node *unp; 306 struct vnode *vp; 307 u_long hashmask; 308 int error; 309 int lkflags; 310 __enum_uint8(vtype) vt; 311 312 error = 0; 313 ump = MOUNTTOUNIONFSMOUNT(mp); 314 lkflags = (cnp ? cnp->cn_lkflags : 0); 315 path = (cnp ? 
cnp->cn_nameptr : NULL); 316 *vpp = NULLVP; 317 318 if (uppervp == NULLVP && lowervp == NULLVP) 319 panic("%s: upper and lower are both null", __func__); 320 321 vt = (uppervp != NULLVP ? uppervp->v_type : lowervp->v_type); 322 323 /* If it has no ISLASTCN flag, path check is skipped. */ 324 if (cnp && !(cnp->cn_flags & ISLASTCN)) 325 path = NULL; 326 327 /* check the cache */ 328 if (dvp != NULLVP && vt == VDIR) { 329 vp = unionfs_get_cached_vnode(uppervp, lowervp, dvp); 330 if (vp != NULLVP) { 331 *vpp = vp; 332 if (lkflags != 0) 333 vn_lock(*vpp, lkflags | LK_RETRY); 334 return (0); 335 } 336 } 337 338 unp = malloc(sizeof(struct unionfs_node), 339 M_UNIONFSNODE, M_WAITOK | M_ZERO); 340 341 error = getnewvnode("unionfs", mp, &unionfs_vnodeops, &vp); 342 if (error != 0) { 343 free(unp, M_UNIONFSNODE); 344 return (error); 345 } 346 if (dvp != NULLVP) 347 vref(dvp); 348 if (uppervp != NULLVP) 349 vref(uppervp); 350 if (lowervp != NULLVP) 351 vref(lowervp); 352 353 if (vt == VDIR) { 354 unp->un_hashtbl = hashinit(NUNIONFSNODECACHE, M_UNIONFSHASH, 355 &hashmask); 356 KASSERT(hashmask == UNIONFSHASHMASK, 357 ("unexpected unionfs hash mask 0x%lx", hashmask)); 358 } 359 360 unp->un_vnode = vp; 361 unp->un_uppervp = uppervp; 362 unp->un_lowervp = lowervp; 363 unp->un_dvp = dvp; 364 if (uppervp != NULLVP) 365 vp->v_vnlock = uppervp->v_vnlock; 366 else 367 vp->v_vnlock = lowervp->v_vnlock; 368 369 if (path != NULL) { 370 unp->un_path = malloc(cnp->cn_namelen + 1, 371 M_UNIONFSPATH, M_WAITOK | M_ZERO); 372 bcopy(cnp->cn_nameptr, unp->un_path, cnp->cn_namelen); 373 unp->un_path[cnp->cn_namelen] = '\0'; 374 unp->un_pathlen = cnp->cn_namelen; 375 } 376 vp->v_type = vt; 377 vp->v_data = unp; 378 379 /* 380 * TODO: This is an imperfect check, as there's no guarantee that 381 * the underlying filesystems will always return vnode pointers 382 * for the root inodes that match our cached values. 
To reduce 383 * the likelihood of failure, for example in the case where either 384 * vnode has been forcibly doomed, we check both pointers and set 385 * VV_ROOT if either matches. 386 */ 387 if (ump->um_uppervp == uppervp || ump->um_lowervp == lowervp) 388 vp->v_vflag |= VV_ROOT; 389 KASSERT(dvp != NULL || (vp->v_vflag & VV_ROOT) != 0, 390 ("%s: NULL dvp for non-root vp %p", __func__, vp)); 391 392 393 /* 394 * NOTE: There is still a possibility for cross-filesystem locking here. 395 * If dvp has an upper FS component and is locked, while the new vnode 396 * created here only has a lower-layer FS component, then we will end 397 * up taking a lower-FS lock while holding an upper-FS lock. 398 * That situation could be dealt with here using vn_lock_pair(). 399 * However, that would only address one instance out of many in which 400 * a child vnode lock is taken while holding a lock on its parent 401 * directory. This is done in many places in common VFS code, as well as 402 * a few places within unionfs (which could lead to the same cross-FS 403 * locking issue if, for example, the upper FS is another nested unionfs 404 * instance). Additionally, it is unclear under what circumstances this 405 * specific lock sequence (a directory on one FS followed by a child of 406 * its 'peer' directory on another FS) would present the practical 407 * possibility of deadlock due to some other agent on the system 408 * attempting to lock those two specific vnodes in the opposite order. 409 */ 410 if (uppervp != NULLVP) 411 vn_lock(uppervp, LK_EXCLUSIVE | LK_RETRY); 412 else 413 vn_lock(lowervp, LK_EXCLUSIVE | LK_RETRY); 414 error = insmntque1(vp, mp); 415 if (error != 0) { 416 unionfs_nodeget_cleanup(vp, unp); 417 return (error); 418 } 419 /* 420 * lowervp and uppervp should only be doomed by a forced unmount of 421 * their respective filesystems, but that can only happen if the 422 * unionfs instance is first unmounted. 
We also effectively hold the 423 * lock on the new unionfs vnode at this point. Therefore, if a 424 * unionfs umount has not yet reached the point at which the above 425 * insmntque1() would fail, then its vflush() call will end up 426 * blocked on our vnode lock, effectively also preventing unmount 427 * of the underlying filesystems. 428 */ 429 VNASSERT(lowervp == NULLVP || !VN_IS_DOOMED(lowervp), vp, 430 ("%s: doomed lowervp %p", __func__, lowervp)); 431 VNASSERT(uppervp == NULLVP || !VN_IS_DOOMED(uppervp), vp, 432 ("%s: doomed lowervp %p", __func__, uppervp)); 433 434 vn_set_state(vp, VSTATE_CONSTRUCTED); 435 436 if (dvp != NULLVP && vt == VDIR) 437 *vpp = unionfs_ins_cached_vnode(unp, dvp); 438 if (*vpp != NULLVP) { 439 unionfs_nodeget_cleanup(vp, unp); 440 if (lkflags != 0) 441 vn_lock(*vpp, lkflags | LK_RETRY); 442 return (0); 443 } else 444 *vpp = vp; 445 446 if ((lkflags & LK_SHARED) != 0) 447 vn_lock(vp, LK_DOWNGRADE); 448 else if ((lkflags & LK_EXCLUSIVE) == 0) 449 VOP_UNLOCK(vp); 450 451 return (0); 452 } 453 454 /* 455 * Clean up the unionfs node. 456 */ 457 void 458 unionfs_noderem(struct vnode *vp) 459 { 460 struct unionfs_node *unp, *unp_t1, *unp_t2; 461 struct unionfs_node_hashhead *hd; 462 struct unionfs_node_status *unsp, *unsp_tmp; 463 struct vnode *lvp; 464 struct vnode *uvp; 465 struct vnode *dvp; 466 int count; 467 int writerefs; 468 bool unlock_lvp; 469 470 /* 471 * The root vnode lock may be recursed during unmount, because 472 * it may share the same lock as the unionfs mount's covered vnode, 473 * which is locked across VFS_UNMOUNT(). This lock will then be 474 * recursively taken during the vflush() issued by unionfs_unmount(). 475 * But we still only need to lock the unionfs lock once, because only 476 * one of those lock operations was taken against a unionfs vnode and 477 * will be undone against a unionfs vnode. 
478 */ 479 KASSERT(vp->v_vnlock->lk_recurse == 0 || (vp->v_vflag & VV_ROOT) != 0, 480 ("%s: vnode %p locked recursively", __func__, vp)); 481 482 unp = VTOUNIONFS(vp); 483 VNASSERT(unp != NULL, vp, ("%s: already reclaimed", __func__)); 484 lvp = unp->un_lowervp; 485 uvp = unp->un_uppervp; 486 dvp = unp->un_dvp; 487 unlock_lvp = (uvp == NULLVP); 488 489 /* 490 * Lock the lower vnode in addition to the upper vnode lock in order 491 * to synchronize against any unionfs_lock() operation which may still 492 * hold the lower vnode lock. We do not need to do this for the root 493 * vnode, as the root vnode should always have both upper and lower 494 * base vnodes for its entire lifecycled, so unionfs_lock() should 495 * never attempt to lock its lower vnode in the first place. 496 * Moreover, during unmount of a non-"below" unionfs mount, the lower 497 * root vnode will already be locked as it is the covered vnode. 498 */ 499 if (uvp != NULLVP && lvp != NULLVP && (vp->v_vflag & VV_ROOT) == 0) { 500 vn_lock_pair(uvp, true, LK_EXCLUSIVE, lvp, false, LK_EXCLUSIVE); 501 unlock_lvp = true; 502 } 503 504 if (lockmgr(&vp->v_lock, LK_EXCLUSIVE | LK_NOWAIT, NULL) != 0) 505 panic("%s: failed to acquire lock for vnode lock", __func__); 506 /* 507 * Use the interlock to protect the clearing of v_data to 508 * prevent faults in unionfs_lock(). 509 */ 510 VI_LOCK(vp); 511 unp->un_lowervp = unp->un_uppervp = NULLVP; 512 vp->v_vnlock = &(vp->v_lock); 513 vp->v_data = NULL; 514 vp->v_object = NULL; 515 if (unp->un_hashtbl != NULL) { 516 /* 517 * Clear out any cached child vnodes. This should only 518 * be necessary during forced unmount, when the vnode may 519 * be reclaimed with a non-zero use count. Otherwise the 520 * reference held by each child should prevent reclamation. 
521 */ 522 for (count = 0; count <= UNIONFSHASHMASK; count++) { 523 hd = unp->un_hashtbl + count; 524 LIST_FOREACH_SAFE(unp_t1, hd, un_hash, unp_t2) { 525 LIST_REMOVE(unp_t1, un_hash); 526 unp_t1->un_hash.le_next = NULL; 527 unp_t1->un_hash.le_prev = NULL; 528 } 529 } 530 } 531 VI_UNLOCK(vp); 532 533 writerefs = atomic_load_int(&vp->v_writecount); 534 VNASSERT(writerefs >= 0, vp, 535 ("%s: write count %d, unexpected text ref", __func__, writerefs)); 536 /* 537 * If we were opened for write, we leased the write reference 538 * to the lower vnode. If this is a reclamation due to the 539 * forced unmount, undo the reference now. 540 */ 541 if (writerefs > 0) { 542 VNASSERT(uvp != NULL, vp, 543 ("%s: write reference without upper vnode", __func__)); 544 VOP_ADD_WRITECOUNT(uvp, -writerefs); 545 } 546 if (uvp != NULLVP) 547 vput(uvp); 548 if (unlock_lvp) 549 vput(lvp); 550 else if (lvp != NULLVP) 551 vrele(lvp); 552 553 if (dvp != NULLVP) 554 unionfs_rem_cached_vnode(unp, dvp); 555 556 if (unp->un_path != NULL) { 557 free(unp->un_path, M_UNIONFSPATH); 558 unp->un_path = NULL; 559 unp->un_pathlen = 0; 560 } 561 562 if (unp->un_hashtbl != NULL) { 563 hashdestroy(unp->un_hashtbl, M_UNIONFSHASH, UNIONFSHASHMASK); 564 } 565 566 LIST_FOREACH_SAFE(unsp, &(unp->un_unshead), uns_list, unsp_tmp) { 567 LIST_REMOVE(unsp, uns_list); 568 free(unsp, M_TEMP); 569 } 570 if (dvp != NULLVP) { 571 mtx_lock(&unionfs_deferred_rele_lock); 572 STAILQ_INSERT_TAIL(&unionfs_deferred_rele_list, unp, un_rele); 573 mtx_unlock(&unionfs_deferred_rele_lock); 574 taskqueue_enqueue(taskqueue_unionfs_rele, 575 &unionfs_deferred_rele_task); 576 } else 577 free(unp, M_UNIONFSNODE); 578 } 579 580 /* 581 * Find the unionfs node status object for the vnode corresponding to unp, 582 * for the process that owns td. Return NULL if no such object exists. 
583 */ 584 struct unionfs_node_status * 585 unionfs_find_node_status(struct unionfs_node *unp, struct thread *td) 586 { 587 struct unionfs_node_status *unsp; 588 pid_t pid; 589 590 pid = td->td_proc->p_pid; 591 592 ASSERT_VOP_ELOCKED(UNIONFSTOV(unp), __func__); 593 594 LIST_FOREACH(unsp, &(unp->un_unshead), uns_list) { 595 if (unsp->uns_pid == pid) { 596 return (unsp); 597 } 598 } 599 600 return (NULL); 601 } 602 603 /* 604 * Get the unionfs node status object for the vnode corresponding to unp, 605 * for the process that owns td. Allocate a new status object if one 606 * does not already exist. 607 */ 608 void 609 unionfs_get_node_status(struct unionfs_node *unp, struct thread *td, 610 struct unionfs_node_status **unspp) 611 { 612 struct unionfs_node_status *unsp; 613 pid_t pid; 614 615 pid = td->td_proc->p_pid; 616 617 KASSERT(NULL != unspp, ("%s: NULL status", __func__)); 618 unsp = unionfs_find_node_status(unp, td); 619 if (unsp == NULL) { 620 /* create a new unionfs node status */ 621 unsp = malloc(sizeof(struct unionfs_node_status), 622 M_TEMP, M_WAITOK | M_ZERO); 623 624 unsp->uns_pid = pid; 625 LIST_INSERT_HEAD(&(unp->un_unshead), unsp, uns_list); 626 } 627 628 *unspp = unsp; 629 } 630 631 /* 632 * Remove the unionfs node status, if you can. 633 * You need exclusive lock this vnode. 634 */ 635 void 636 unionfs_tryrem_node_status(struct unionfs_node *unp, 637 struct unionfs_node_status *unsp) 638 { 639 KASSERT(NULL != unsp, ("%s: NULL status", __func__)); 640 ASSERT_VOP_ELOCKED(UNIONFSTOV(unp), __func__); 641 642 if (0 < unsp->uns_lower_opencnt || 0 < unsp->uns_upper_opencnt) 643 return; 644 645 LIST_REMOVE(unsp, uns_list); 646 free(unsp, M_TEMP); 647 } 648 649 /* 650 * Create upper node attr. 
651 */ 652 void 653 unionfs_create_uppervattr_core(struct unionfs_mount *ump, struct vattr *lva, 654 struct vattr *uva, struct thread *td) 655 { 656 VATTR_NULL(uva); 657 uva->va_type = lva->va_type; 658 uva->va_atime = lva->va_atime; 659 uva->va_mtime = lva->va_mtime; 660 uva->va_ctime = lva->va_ctime; 661 662 switch (ump->um_copymode) { 663 case UNIONFS_TRANSPARENT: 664 uva->va_mode = lva->va_mode; 665 uva->va_uid = lva->va_uid; 666 uva->va_gid = lva->va_gid; 667 break; 668 case UNIONFS_MASQUERADE: 669 if (ump->um_uid == lva->va_uid) { 670 uva->va_mode = lva->va_mode & 077077; 671 uva->va_mode |= (lva->va_type == VDIR ? 672 ump->um_udir : ump->um_ufile) & 0700; 673 uva->va_uid = lva->va_uid; 674 uva->va_gid = lva->va_gid; 675 } else { 676 uva->va_mode = (lva->va_type == VDIR ? 677 ump->um_udir : ump->um_ufile); 678 uva->va_uid = ump->um_uid; 679 uva->va_gid = ump->um_gid; 680 } 681 break; 682 default: /* UNIONFS_TRADITIONAL */ 683 uva->va_mode = 0777 & ~td->td_proc->p_pd->pd_cmask; 684 uva->va_uid = ump->um_uid; 685 uva->va_gid = ump->um_gid; 686 break; 687 } 688 } 689 690 /* 691 * Create upper node attr. 692 */ 693 int 694 unionfs_create_uppervattr(struct unionfs_mount *ump, struct vnode *lvp, 695 struct vattr *uva, struct ucred *cred, struct thread *td) 696 { 697 struct vattr lva; 698 int error; 699 700 if ((error = VOP_GETATTR(lvp, &lva, cred))) 701 return (error); 702 703 unionfs_create_uppervattr_core(ump, &lva, uva, td); 704 705 return (error); 706 } 707 708 /* 709 * relookup 710 * 711 * dvp should be locked on entry and will be locked on return. 712 * 713 * If an error is returned, *vpp will be invalid, otherwise it will hold a 714 * locked, referenced vnode. If *vpp == dvp then remember that only one 715 * LK_EXCLUSIVE lock is held. 
716 */ 717 int 718 unionfs_relookup(struct vnode *dvp, struct vnode **vpp, 719 struct componentname *cnp, struct componentname *cn, struct thread *td, 720 char *path, int pathlen, u_long nameiop) 721 { 722 int error; 723 bool refstart; 724 725 cn->cn_namelen = pathlen; 726 cn->cn_pnbuf = path; 727 cn->cn_nameiop = nameiop; 728 cn->cn_flags = (LOCKPARENT | LOCKLEAF | ISLASTCN); 729 cn->cn_lkflags = LK_EXCLUSIVE; 730 cn->cn_cred = cnp->cn_cred; 731 cn->cn_nameptr = cn->cn_pnbuf; 732 733 refstart = false; 734 if (nameiop == DELETE) { 735 cn->cn_flags |= (cnp->cn_flags & DOWHITEOUT); 736 } else if (nameiop == RENAME) { 737 refstart = true; 738 } else if (nameiop == CREATE) { 739 cn->cn_flags |= NOCACHE; 740 } 741 742 vref(dvp); 743 VOP_UNLOCK(dvp); 744 745 if ((error = vfs_relookup(dvp, vpp, cn, refstart))) { 746 vn_lock(dvp, LK_EXCLUSIVE | LK_RETRY); 747 } else 748 vrele(dvp); 749 750 KASSERT(cn->cn_pnbuf == path, ("%s: cn_pnbuf changed", __func__)); 751 752 return (error); 753 } 754 755 /* 756 * Update the unionfs_node. 757 * 758 * uvp is new locked upper vnode. unionfs vnode's lock will be exchanged to the 759 * uvp's lock and lower's lock will be unlocked. 760 */ 761 static void 762 unionfs_node_update(struct unionfs_node *unp, struct vnode *uvp, 763 struct thread *td) 764 { 765 struct unionfs_node_hashhead *hd; 766 struct vnode *vp; 767 struct vnode *lvp; 768 struct vnode *dvp; 769 unsigned count, lockrec; 770 771 vp = UNIONFSTOV(unp); 772 lvp = unp->un_lowervp; 773 ASSERT_VOP_ELOCKED(lvp, __func__); 774 ASSERT_VOP_ELOCKED(uvp, __func__); 775 dvp = unp->un_dvp; 776 777 VNASSERT(vp->v_writecount == 0, vp, 778 ("%s: non-zero writecount", __func__)); 779 /* 780 * Update the upper vnode's lock state to match the lower vnode, 781 * and then switch the unionfs vnode's lock to the upper vnode. 
782 */ 783 lockrec = lvp->v_vnlock->lk_recurse; 784 for (count = 0; count < lockrec; count++) 785 vn_lock(uvp, LK_EXCLUSIVE | LK_CANRECURSE | LK_RETRY); 786 VI_LOCK(vp); 787 unp->un_uppervp = uvp; 788 vp->v_vnlock = uvp->v_vnlock; 789 VI_UNLOCK(vp); 790 791 for (count = 0; count < lockrec + 1; count++) 792 VOP_UNLOCK(lvp); 793 /* 794 * Re-cache the unionfs vnode against the upper vnode 795 */ 796 if (dvp != NULLVP && vp->v_type == VDIR) { 797 VI_LOCK(dvp); 798 if (unp->un_hash.le_prev != NULL) { 799 LIST_REMOVE(unp, un_hash); 800 hd = unionfs_get_hashhead(dvp, uvp); 801 LIST_INSERT_HEAD(hd, unp, un_hash); 802 } 803 VI_UNLOCK(unp->un_dvp); 804 } 805 } 806 807 /* 808 * Mark a unionfs operation as being in progress, sleeping if the 809 * same operation is already in progress. 810 * This is useful, for example, during copy-up operations in which 811 * we may drop the target vnode lock, but we want to avoid the 812 * possibility of a concurrent copy-up on the same vnode triggering 813 * a spurious failure. 814 */ 815 int 816 unionfs_set_in_progress_flag(struct vnode *vp, unsigned int flag) 817 { 818 struct unionfs_node *unp; 819 int error; 820 821 error = 0; 822 ASSERT_VOP_ELOCKED(vp, __func__); 823 VI_LOCK(vp); 824 unp = VTOUNIONFS(vp); 825 while (error == 0 && (unp->un_flag & flag) != 0) { 826 VOP_UNLOCK(vp); 827 error = msleep(vp, VI_MTX(vp), PCATCH | PDROP, "unioncp", 0); 828 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); 829 VI_LOCK(vp); 830 if (error == 0) { 831 /* 832 * If we waited on a concurrent copy-up and that 833 * copy-up was successful, return a non-fatal 834 * indication that the desired operation is already 835 * complete. If we waited on a concurrent lookup, 836 * return ERELOOKUP to indicate the VFS cache should 837 * be re-queried to avoid creating a duplicate unionfs 838 * vnode. 
839 */ 840 unp = VTOUNIONFS(vp); 841 if (unp == NULL) 842 error = ENOENT; 843 else if (flag == UNIONFS_COPY_IN_PROGRESS && 844 unp->un_uppervp != NULLVP) 845 error = EJUSTRETURN; 846 else if (flag == UNIONFS_LOOKUP_IN_PROGRESS) 847 error = ERELOOKUP; 848 } 849 } 850 if (error == 0) 851 unp->un_flag |= flag; 852 VI_UNLOCK(vp); 853 854 return (error); 855 } 856 857 void 858 unionfs_clear_in_progress_flag(struct vnode *vp, unsigned int flag) 859 { 860 struct unionfs_node *unp; 861 862 ASSERT_VOP_ELOCKED(vp, __func__); 863 unp = VTOUNIONFS(vp); 864 VI_LOCK(vp); 865 if (unp != NULL) { 866 VNASSERT((unp->un_flag & flag) != 0, vp, 867 ("%s: copy not in progress", __func__)); 868 unp->un_flag &= ~flag; 869 } 870 wakeup(vp); 871 VI_UNLOCK(vp); 872 } 873 874 /* 875 * Create a new shadow dir. 876 * 877 * dvp and vp are unionfs vnodes representing a parent directory and 878 * child file, should be locked on entry, and will be locked on return. 879 * 880 * If no error returned, unp will be updated. 881 */ 882 int 883 unionfs_mkshadowdir(struct vnode *dvp, struct vnode *vp, 884 struct componentname *cnp, struct thread *td) 885 { 886 struct vnode *lvp; 887 struct vnode *uvp; 888 struct vnode *udvp; 889 struct vattr va; 890 struct vattr lva; 891 struct nameidata nd; 892 struct mount *mp; 893 struct ucred *cred; 894 struct ucred *credbk; 895 struct uidinfo *rootinfo; 896 struct unionfs_mount *ump; 897 struct unionfs_node *dunp; 898 struct unionfs_node *unp; 899 int error; 900 901 ASSERT_VOP_ELOCKED(dvp, __func__); 902 ASSERT_VOP_ELOCKED(vp, __func__); 903 ump = MOUNTTOUNIONFSMOUNT(vp->v_mount); 904 unp = VTOUNIONFS(vp); 905 if (unp->un_uppervp != NULLVP) 906 return (EEXIST); 907 dunp = VTOUNIONFS(dvp); 908 udvp = dunp->un_uppervp; 909 910 error = unionfs_set_in_progress_flag(vp, UNIONFS_COPY_IN_PROGRESS); 911 if (error == EJUSTRETURN) 912 return (0); 913 else if (error != 0) 914 return (error); 915 916 lvp = unp->un_lowervp; 917 uvp = NULLVP; 918 credbk = cnp->cn_cred; 919 920 /* 
Authority change to root */ 921 rootinfo = uifind((uid_t)0); 922 cred = crdup(cnp->cn_cred); 923 change_euid(cred, rootinfo); 924 change_ruid(cred, rootinfo); 925 change_svuid(cred, (uid_t)0); 926 uifree(rootinfo); 927 cnp->cn_cred = cred; 928 929 memset(&nd.ni_cnd, 0, sizeof(struct componentname)); 930 NDPREINIT(&nd); 931 932 if ((error = VOP_GETATTR(lvp, &lva, cnp->cn_cred))) 933 goto unionfs_mkshadowdir_finish; 934 935 vref(udvp); 936 VOP_UNLOCK(vp); 937 if ((error = unionfs_relookup(udvp, &uvp, cnp, &nd.ni_cnd, td, 938 cnp->cn_nameptr, cnp->cn_namelen, CREATE))) { 939 /* 940 * When handling error cases here, we drop udvp's lock and 941 * then jump to exit code that relocks dvp, which in most 942 * cases will effectively relock udvp. However, this is 943 * not guaranteed to be the case, as various calls made 944 * here (such as unionfs_relookup() above and VOP_MKDIR() 945 * below) may unlock and then relock udvp, allowing dvp to 946 * be reclaimed in the meantime. In such a situation dvp 947 * will no longer share its lock with udvp. Since 948 * performance isn't a concern for these error cases, it 949 * makes more sense to reuse the common code that locks 950 * dvp on exit than to explicitly check for reclamation 951 * of dvp. 952 */ 953 vput(udvp); 954 goto unionfs_mkshadowdir_relock; 955 } 956 if (uvp != NULLVP) { 957 if (udvp == uvp) 958 vrele(uvp); 959 else 960 vput(uvp); 961 962 error = EEXIST; 963 vput(udvp); 964 goto unionfs_mkshadowdir_relock; 965 } 966 967 if ((error = vn_start_write(udvp, &mp, V_WAIT | V_PCATCH))) { 968 vput(udvp); 969 goto unionfs_mkshadowdir_relock; 970 } 971 unionfs_create_uppervattr_core(ump, &lva, &va, td); 972 973 /* 974 * Temporarily NUL-terminate the current pathname component. 975 * This function may be called during lookup operations in which 976 * the current pathname component is not the leaf, meaning that 977 * the NUL terminator is some distance beyond the end of the current 978 * component. 
This *should* be fine, as cn_namelen will still 979 * correctly indicate the length of only the current component, 980 * but ZFS in particular does not respect cn_namelen in its VOP_MKDIR 981 * implementation. 982 * Note that this assumes nd.ni_cnd.cn_pnbuf was allocated by 983 * something like a local namei() operation and the temporary 984 * NUL-termination will not have an effect on other threads. 985 */ 986 char *pathend = &nd.ni_cnd.cn_nameptr[nd.ni_cnd.cn_namelen]; 987 char pathterm = *pathend; 988 *pathend = '\0'; 989 error = VOP_MKDIR(udvp, &uvp, &nd.ni_cnd, &va); 990 *pathend = pathterm; 991 if (error != 0) { 992 /* 993 * See the comment after unionfs_relookup() above for an 994 * explanation of why we unlock udvp here only to relock 995 * dvp on exit. 996 */ 997 vput(udvp); 998 vn_finished_write(mp); 999 goto unionfs_mkshadowdir_relock; 1000 } 1001 1002 /* 1003 * XXX The bug which cannot set uid/gid was corrected. 1004 * Ignore errors. 1005 */ 1006 va.va_type = VNON; 1007 /* 1008 * VOP_SETATTR() may transiently drop uvp's lock, so it's 1009 * important to call it before unionfs_node_update() transfers 1010 * the unionfs vnode's lock from lvp to uvp; otherwise the 1011 * unionfs vnode itself would be transiently unlocked and 1012 * potentially doomed. 1013 */ 1014 VOP_SETATTR(uvp, &va, nd.ni_cnd.cn_cred); 1015 1016 /* 1017 * uvp may become doomed during VOP_VPUT_PAIR() if the implementation 1018 * must temporarily drop uvp's lock. However, since we hold a 1019 * reference to uvp from the VOP_MKDIR() call above, this would require 1020 * a forcible unmount of uvp's filesystem, which in turn can only 1021 * happen if our unionfs instance is first forcibly unmounted. We'll 1022 * therefore catch this case in the NULL check of unp below. 
	 */
	VOP_VPUT_PAIR(udvp, &uvp, false);
	vn_finished_write(mp);
	vn_lock_pair(vp, false, LK_EXCLUSIVE, uvp, true, LK_EXCLUSIVE);
	unp = VTOUNIONFS(vp);
	if (unp == NULL) {
		vput(uvp);
		error = ENOENT;
	} else
		unionfs_node_update(unp, uvp, td);
	VOP_UNLOCK(vp);

unionfs_mkshadowdir_relock:
	vn_lock(dvp, LK_EXCLUSIVE | LK_RETRY);
	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
	if (error == 0 && (VN_IS_DOOMED(dvp) || VN_IS_DOOMED(vp)))
		error = ENOENT;

unionfs_mkshadowdir_finish:
	unionfs_clear_in_progress_flag(vp, UNIONFS_COPY_IN_PROGRESS);
	cnp->cn_cred = credbk;
	crfree(cred);

	return (error);
}

/*
 * Helper for unionfs_forward_vop_start_pair(): record the current lock
 * state of basevp (as reported by VOP_ISLOCKED()) in *lkflags and take
 * an extra use reference on basevp.
 *
 * basevp must be locked on entry.
 */
static inline void
unionfs_forward_vop_ref(struct vnode *basevp, int *lkflags)
{
	ASSERT_VOP_LOCKED(basevp, __func__);
	*lkflags = VOP_ISLOCKED(basevp);
	vref(basevp);
}

/*
 * Prepare unionfs to issue a forwarded VOP to either the upper or lower
 * FS.  This should be used for any VOP which may drop the vnode lock;
 * it is not required otherwise.
 * The unionfs vnode shares its lock with the base-layer vnode(s); if the
 * base FS must transiently drop its vnode lock, the unionfs vnode may
 * effectively become unlocked.  During that window, a concurrent forced
 * unmount may doom the unionfs vnode, which leads to two significant
 * issues:
 * 1) Completion of, and return from, the unionfs VOP with the unionfs
 *    vnode completely unlocked.  When the unionfs vnode becomes doomed
 *    it stops sharing its lock with the base vnode, so even if the
 *    forwarded VOP reacquires the base vnode lock the unionfs vnode
 *    lock will no longer be held.  This can lead to violation of the
 *    caller's synchronization requirements as well as various failed
 *    locking assertions when DEBUG_VFS_LOCKS is enabled.
 * 2) Loss of reference on the base vnode.  The caller is expected to
 *    hold a v_usecount reference on the unionfs vnode, while the
 *    unionfs vnode holds a reference on the base-layer vnode(s).  But
 *    these references are released when the unionfs vnode becomes
 *    doomed, violating the base layer's expectation that its caller
 *    must hold a reference to prevent vnode recycling.
 *
 * basevp1 and basevp2 represent two base-layer vnodes which are
 * expected to be locked when this function is called.  basevp2
 * may be NULL, but if not NULL basevp1 and basevp2 should represent
 * a parent directory and a file linked to it, respectively.
 * lkflags1 and lkflags2 are output parameters that will store the
 * current lock status of basevp1 and basevp2, respectively.  They
 * are intended to be passed as the lkflags1 and lkflags2 parameters
 * in the subsequent call to unionfs_forward_vop_finish_pair().
 * lkflags2 may be NULL iff basevp2 is NULL.
 */
void
unionfs_forward_vop_start_pair(struct vnode *basevp1, int *lkflags1,
    struct vnode *basevp2, int *lkflags2)
{
	/*
	 * Take an additional reference on the base-layer vnodes to
	 * avoid loss of reference if the unionfs vnodes are doomed.
	 */
	unionfs_forward_vop_ref(basevp1, lkflags1);
	if (basevp2 != NULL)
		unionfs_forward_vop_ref(basevp2, lkflags2);
}

/*
 * Helper for unionfs_forward_vop_finish_pair(): decide what to do with
 * the extra base-vnode reference taken by unionfs_forward_vop_ref().
 *
 * If unionvp no longer has unionfs private data (VTOUNIONFS() returns
 * NULL, i.e. it was doomed), the reference on basevp is kept for the
 * caller to dispose of, basevp is asserted to still be locked (exclusively
 * if lkflags contains LK_EXCLUSIVE), and true is returned.  Otherwise the
 * extra reference on basevp is dropped and false is returned.
 */
static inline bool
unionfs_forward_vop_rele(struct vnode *unionvp, struct vnode *basevp,
    int lkflags)
{
	bool unionvp_doomed;

	if (__predict_false(VTOUNIONFS(unionvp) == NULL)) {
		if ((lkflags & LK_EXCLUSIVE) != 0)
			ASSERT_VOP_ELOCKED(basevp, __func__);
		else
			ASSERT_VOP_LOCKED(basevp, __func__);
		unionvp_doomed = true;
	} else {
		vrele(basevp);
		unionvp_doomed = false;
	}

	return (unionvp_doomed);
}


/*
 * Indicate completion of a forwarded VOP previously prepared by
 * unionfs_forward_vop_start_pair().
 * basevp1 and basevp2 must be the same values passed to the prior
 * call to unionfs_forward_vop_start_pair().  unionvp1 and unionvp2
 * must be the unionfs vnodes that were initially above basevp1 and
 * basevp2, respectively.
 * basevp1 and basevp2 (if not NULL) must be locked when this function
 * is called, while unionvp1 and/or unionvp2 may be unlocked if either
 * unionfs vnode has become doomed.
 * lkflags1 and lkflags2 represent the locking flags that should be
 * used to re-lock unionvp1 and unionvp2, respectively, if either
 * vnode has become doomed.
 *
 * Returns true if any unionfs vnode was found to be doomed, false
 * otherwise.
 */
bool
unionfs_forward_vop_finish_pair(
    struct vnode *unionvp1, struct vnode *basevp1, int lkflags1,
    struct vnode *unionvp2, struct vnode *basevp2, int lkflags2)
{
	bool vp1_doomed, vp2_doomed;

	/*
	 * If either vnode is found to have been doomed, set
	 * a flag indicating that it needs to be re-locked.
	 * Otherwise, simply drop the base-vnode reference that
	 * was taken in unionfs_forward_vop_start_pair().
	 */
	vp1_doomed = unionfs_forward_vop_rele(unionvp1, basevp1, lkflags1);

	if (unionvp2 != NULL)
		vp2_doomed = unionfs_forward_vop_rele(unionvp2, basevp2, lkflags2);
	else
		vp2_doomed = false;

	/*
	 * If any of the unionfs vnodes need to be re-locked, that
	 * means the unionfs vnode's lock is now de-coupled from the
	 * corresponding base vnode.  We therefore need to drop the
	 * base vnode lock (since nothing else will after this point),
	 * and also release the reference taken in
	 * unionfs_forward_vop_start_pair().
	 */
	if (__predict_false(vp1_doomed && vp2_doomed))
		VOP_VPUT_PAIR(basevp1, &basevp2, true);
	else if (__predict_false(vp1_doomed)) {
		/*
		 * If basevp1 needs to be unlocked, then we may not
		 * be able to safely unlock it with basevp2 still locked,
		 * for the same reason that an ordinary VFS call would
		 * need to use VOP_VPUT_PAIR() here.  We might be able
		 * to use VOP_VPUT_PAIR(..., false) here, but then we
		 * would need to deal with the possibility of basevp2
		 * changing out from under us, which could result in
		 * either the unionfs vnode becoming doomed or its
		 * upper/lower vp no longer matching basevp2.  Either
		 * scenario would require at least re-locking the unionfs
		 * vnode anyway.
		 */
		if (unionvp2 != NULL) {
			VOP_UNLOCK(unionvp2);
			vp2_doomed = true;
		}
		vput(basevp1);
	} else if (__predict_false(vp2_doomed))
		vput(basevp2);

	/*
	 * Re-lock whichever unionfs vnode(s) were flagged above; a vnode
	 * that was not flagged is passed to vn_lock_pair() as already locked.
	 */
	if (__predict_false(vp1_doomed || vp2_doomed))
		vn_lock_pair(unionvp1, !vp1_doomed, lkflags1,
		    unionvp2, !vp2_doomed, lkflags2);

	return (vp1_doomed || vp2_doomed);
}

/*
 * Create a new whiteout.
 *
 * dvp and vp are unionfs vnodes representing a parent directory and
 * child file, should be locked on entry, and will be locked on return.
1205 */ 1206 int 1207 unionfs_mkwhiteout(struct vnode *dvp, struct vnode *vp, 1208 struct componentname *cnp, struct thread *td, char *path, int pathlen) 1209 { 1210 struct vnode *udvp; 1211 struct vnode *wvp; 1212 struct nameidata nd; 1213 struct mount *mp; 1214 int error; 1215 bool dvp_locked; 1216 1217 ASSERT_VOP_ELOCKED(dvp, __func__); 1218 ASSERT_VOP_ELOCKED(vp, __func__); 1219 1220 udvp = VTOUNIONFS(dvp)->un_uppervp; 1221 wvp = NULLVP; 1222 NDPREINIT(&nd); 1223 vref(udvp); 1224 VOP_UNLOCK(vp); 1225 if ((error = unionfs_relookup(udvp, &wvp, cnp, &nd.ni_cnd, td, path, 1226 pathlen, CREATE))) { 1227 goto unionfs_mkwhiteout_cleanup; 1228 } 1229 if (wvp != NULLVP) { 1230 if (udvp == wvp) 1231 vrele(wvp); 1232 else 1233 vput(wvp); 1234 1235 if (nd.ni_cnd.cn_flags & ISWHITEOUT) 1236 error = 0; 1237 else 1238 error = EEXIST; 1239 goto unionfs_mkwhiteout_cleanup; 1240 } 1241 1242 if ((error = vn_start_write(udvp, &mp, V_WAIT | V_PCATCH))) 1243 goto unionfs_mkwhiteout_cleanup; 1244 error = VOP_WHITEOUT(udvp, &nd.ni_cnd, CREATE); 1245 vn_finished_write(mp); 1246 1247 unionfs_mkwhiteout_cleanup: 1248 if (VTOUNIONFS(dvp) == NULL) { 1249 vput(udvp); 1250 dvp_locked = false; 1251 } else { 1252 vrele(udvp); 1253 dvp_locked = true; 1254 } 1255 vn_lock_pair(dvp, dvp_locked, LK_EXCLUSIVE, vp, false, LK_EXCLUSIVE); 1256 return (error); 1257 } 1258 1259 /* 1260 * Create a new vnode for create a new shadow file. 1261 * 1262 * If an error is returned, *vpp will be invalid, otherwise it will hold a 1263 * locked, referenced and opened vnode. 1264 * 1265 * unp is never updated. 
1266 */ 1267 static int 1268 unionfs_vn_create_on_upper(struct vnode **vpp, struct vnode *udvp, 1269 struct vnode *vp, struct vattr *uvap, struct thread *td) 1270 { 1271 struct unionfs_mount *ump; 1272 struct unionfs_node *unp; 1273 struct vnode *uvp; 1274 struct vnode *lvp; 1275 struct ucred *cred; 1276 struct vattr lva; 1277 struct nameidata nd; 1278 int fmode; 1279 int error; 1280 1281 ASSERT_VOP_ELOCKED(vp, __func__); 1282 unp = VTOUNIONFS(vp); 1283 ump = MOUNTTOUNIONFSMOUNT(UNIONFSTOV(unp)->v_mount); 1284 uvp = NULLVP; 1285 lvp = unp->un_lowervp; 1286 cred = td->td_ucred; 1287 fmode = FFLAGS(O_WRONLY | O_CREAT | O_TRUNC | O_EXCL); 1288 error = 0; 1289 1290 if ((error = VOP_GETATTR(lvp, &lva, cred)) != 0) 1291 return (error); 1292 unionfs_create_uppervattr_core(ump, &lva, uvap, td); 1293 1294 if (unp->un_path == NULL) 1295 panic("%s: NULL un_path", __func__); 1296 1297 nd.ni_cnd.cn_namelen = unp->un_pathlen; 1298 nd.ni_cnd.cn_pnbuf = unp->un_path; 1299 nd.ni_cnd.cn_nameiop = CREATE; 1300 nd.ni_cnd.cn_flags = LOCKPARENT | LOCKLEAF | ISLASTCN; 1301 nd.ni_cnd.cn_lkflags = LK_EXCLUSIVE; 1302 nd.ni_cnd.cn_cred = cred; 1303 nd.ni_cnd.cn_nameptr = nd.ni_cnd.cn_pnbuf; 1304 NDPREINIT(&nd); 1305 1306 vref(udvp); 1307 VOP_UNLOCK(vp); 1308 if ((error = vfs_relookup(udvp, &uvp, &nd.ni_cnd, false)) != 0) { 1309 vrele(udvp); 1310 return (error); 1311 } 1312 1313 if (uvp != NULLVP) { 1314 if (uvp == udvp) 1315 vrele(uvp); 1316 else 1317 vput(uvp); 1318 error = EEXIST; 1319 goto unionfs_vn_create_on_upper_cleanup; 1320 } 1321 1322 if ((error = VOP_CREATE(udvp, &uvp, &nd.ni_cnd, uvap)) != 0) 1323 goto unionfs_vn_create_on_upper_cleanup; 1324 1325 if ((error = VOP_OPEN(uvp, fmode, cred, td, NULL)) != 0) { 1326 vput(uvp); 1327 goto unionfs_vn_create_on_upper_cleanup; 1328 } 1329 error = VOP_ADD_WRITECOUNT(uvp, 1); 1330 CTR3(KTR_VFS, "%s: vp %p v_writecount increased to %d", 1331 __func__, uvp, uvp->v_writecount); 1332 if (error == 0) { 1333 *vpp = uvp; 1334 } else { 1335 
VOP_CLOSE(uvp, fmode, cred, td); 1336 } 1337 1338 unionfs_vn_create_on_upper_cleanup: 1339 vput(udvp); 1340 return (error); 1341 } 1342 1343 /* 1344 * Copy from lvp to uvp. 1345 * 1346 * lvp and uvp should be locked and opened on entry and will be locked and 1347 * opened on return. 1348 */ 1349 static int 1350 unionfs_copyfile_core(struct vnode *lvp, struct vnode *uvp, 1351 struct ucred *cred, struct thread *td) 1352 { 1353 char *buf; 1354 struct uio uio; 1355 struct iovec iov; 1356 off_t offset; 1357 int count; 1358 int error; 1359 int bufoffset; 1360 1361 error = 0; 1362 memset(&uio, 0, sizeof(uio)); 1363 1364 uio.uio_td = td; 1365 uio.uio_segflg = UIO_SYSSPACE; 1366 uio.uio_offset = 0; 1367 1368 buf = malloc(MAXBSIZE, M_TEMP, M_WAITOK); 1369 1370 while (error == 0) { 1371 offset = uio.uio_offset; 1372 1373 uio.uio_iov = &iov; 1374 uio.uio_iovcnt = 1; 1375 iov.iov_base = buf; 1376 iov.iov_len = MAXBSIZE; 1377 uio.uio_resid = iov.iov_len; 1378 uio.uio_rw = UIO_READ; 1379 1380 if ((error = VOP_READ(lvp, &uio, 0, cred)) != 0) 1381 break; 1382 if ((count = MAXBSIZE - uio.uio_resid) == 0) 1383 break; 1384 1385 bufoffset = 0; 1386 while (bufoffset < count) { 1387 uio.uio_iov = &iov; 1388 uio.uio_iovcnt = 1; 1389 iov.iov_base = buf + bufoffset; 1390 iov.iov_len = count - bufoffset; 1391 uio.uio_offset = offset + bufoffset; 1392 uio.uio_resid = iov.iov_len; 1393 uio.uio_rw = UIO_WRITE; 1394 1395 if ((error = VOP_WRITE(uvp, &uio, 0, cred)) != 0) 1396 break; 1397 1398 bufoffset += (count - bufoffset) - uio.uio_resid; 1399 } 1400 1401 uio.uio_offset = offset + bufoffset; 1402 } 1403 1404 free(buf, M_TEMP); 1405 1406 return (error); 1407 } 1408 1409 /* 1410 * Copy file from lower to upper. 1411 * 1412 * If you need copy of the contents, set 1 to docopy. Otherwise, set 0 to 1413 * docopy. 1414 * 1415 * vp is a unionfs vnode that should be locked on entry and will be 1416 * locked on return. 1417 * 1418 * If no error returned, unp will be updated. 
1419 */ 1420 int 1421 unionfs_copyfile(struct vnode *vp, int docopy, struct ucred *cred, 1422 struct thread *td) 1423 { 1424 struct unionfs_node *unp; 1425 struct unionfs_node *dunp; 1426 struct mount *mp; 1427 struct vnode *udvp; 1428 struct vnode *lvp; 1429 struct vnode *uvp; 1430 struct vattr uva; 1431 int error; 1432 1433 ASSERT_VOP_ELOCKED(vp, __func__); 1434 unp = VTOUNIONFS(vp); 1435 lvp = unp->un_lowervp; 1436 uvp = NULLVP; 1437 1438 if ((UNIONFSTOV(unp)->v_mount->mnt_flag & MNT_RDONLY)) 1439 return (EROFS); 1440 if (unp->un_dvp == NULLVP) 1441 return (EINVAL); 1442 if (unp->un_uppervp != NULLVP) 1443 return (EEXIST); 1444 1445 udvp = NULLVP; 1446 VI_LOCK(unp->un_dvp); 1447 dunp = VTOUNIONFS(unp->un_dvp); 1448 if (dunp != NULL) 1449 udvp = dunp->un_uppervp; 1450 VI_UNLOCK(unp->un_dvp); 1451 1452 if (udvp == NULLVP) 1453 return (EROFS); 1454 if ((udvp->v_mount->mnt_flag & MNT_RDONLY)) 1455 return (EROFS); 1456 ASSERT_VOP_UNLOCKED(udvp, __func__); 1457 1458 error = unionfs_set_in_progress_flag(vp, UNIONFS_COPY_IN_PROGRESS); 1459 if (error == EJUSTRETURN) 1460 return (0); 1461 else if (error != 0) 1462 return (error); 1463 1464 error = VOP_ACCESS(lvp, VREAD, cred, td); 1465 if (error != 0) 1466 goto unionfs_copyfile_cleanup; 1467 1468 if ((error = vn_start_write(udvp, &mp, V_WAIT | V_PCATCH)) != 0) 1469 goto unionfs_copyfile_cleanup; 1470 error = unionfs_vn_create_on_upper(&uvp, udvp, vp, &uva, td); 1471 if (error != 0) { 1472 vn_finished_write(mp); 1473 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); 1474 goto unionfs_copyfile_cleanup; 1475 } 1476 1477 /* 1478 * Note that it's still possible for e.g. VOP_WRITE to relock 1479 * uvp below while holding vp[=lvp] locked. Replacing 1480 * unionfs_copyfile_core with vn_generic_copy_file_range() will 1481 * allow us to avoid the problem by moving this vn_lock_pair() 1482 * call much later. 
1483 */ 1484 vn_lock_pair(vp, false, LK_EXCLUSIVE, uvp, true, LK_EXCLUSIVE); 1485 unp = VTOUNIONFS(vp); 1486 if (unp == NULL) { 1487 error = ENOENT; 1488 goto unionfs_copyfile_cleanup; 1489 } 1490 1491 if (docopy != 0) { 1492 error = VOP_OPEN(lvp, FREAD, cred, td, NULL); 1493 if (error == 0) { 1494 error = unionfs_copyfile_core(lvp, uvp, cred, td); 1495 VOP_CLOSE(lvp, FREAD, cred, td); 1496 } 1497 } 1498 VOP_CLOSE(uvp, FWRITE, cred, td); 1499 VOP_ADD_WRITECOUNT_CHECKED(uvp, -1); 1500 CTR3(KTR_VFS, "%s: vp %p v_writecount decreased to %d", 1501 __func__, uvp, uvp->v_writecount); 1502 1503 vn_finished_write(mp); 1504 1505 if (error == 0) { 1506 /* Reset the attributes. Ignore errors. */ 1507 uva.va_type = VNON; 1508 VOP_SETATTR(uvp, &uva, cred); 1509 unionfs_node_update(unp, uvp, td); 1510 } 1511 1512 unionfs_copyfile_cleanup: 1513 unionfs_clear_in_progress_flag(vp, UNIONFS_COPY_IN_PROGRESS); 1514 return (error); 1515 } 1516 1517 /* 1518 * Determine if the unionfs view of a directory is empty such that 1519 * an rmdir operation can be permitted. 1520 * 1521 * We assume the VOP_RMDIR() against the upper layer vnode will take 1522 * care of this check for us where the upper FS is concerned, so here 1523 * we concentrate on the lower FS. We need to check for the presence 1524 * of files other than "." and ".." in the lower FS directory and 1525 * then cross-check any files we find against the upper FS to see if 1526 * a whiteout is present (in which case we treat the lower file as 1527 * non-present). 1528 * 1529 * The logic here is based heavily on vn_dir_check_empty(). 1530 * 1531 * vp should be a locked unionfs node, and vp's lowervp should also be 1532 * locked. 
1533 */ 1534 int 1535 unionfs_check_rmdir(struct vnode *vp, struct ucred *cred, struct thread *td) 1536 { 1537 struct vnode *uvp; 1538 struct vnode *lvp; 1539 struct vnode *tvp; 1540 char *dirbuf; 1541 size_t dirbuflen, len; 1542 off_t off; 1543 struct dirent *dp; 1544 struct componentname cn; 1545 struct vattr va; 1546 int error; 1547 int eofflag; 1548 1549 eofflag = 0; 1550 lvp = UNIONFSVPTOLOWERVP(vp); 1551 uvp = UNIONFSVPTOUPPERVP(vp); 1552 1553 /* 1554 * Note that the locking here still isn't ideal: We expect the caller 1555 * to hold both the upper and lower layer locks as well as the upper 1556 * parent directory lock, which it can do in a manner that avoids 1557 * deadlock. However, if the cross-check logic below needs to call 1558 * VOP_LOOKUP(), that may relock the upper vnode and lock any found 1559 * child vnode in a way that doesn't protect against deadlock given 1560 * the other held locks. Beyond that, the various other VOPs we issue 1561 * below, such as VOP_OPEN() and VOP_READDIR(), may also re-lock the 1562 * lower vnode. 1563 * We might instead just handoff between the upper vnode lock 1564 * (and its parent directory lock) and the lower vnode lock as needed, 1565 * so that the lower lock is never held at the same time as the upper 1566 * locks, but that opens up a wider window in which the upper 1567 * directory (and also the lower directory if it isn't truly 1568 * read-only) may change while the relevant lock is dropped. But 1569 * since re-locking may happen here and open up such a window anyway, 1570 * perhaps that is a worthwile tradeoff? Or perhaps we can ultimately 1571 * do sufficient tracking of empty state within the unionfs vnode 1572 * (in conjunction with upcalls from the lower FSes to notify us 1573 * of out-of-band state changes) that we can avoid these costly checks 1574 * altogether. 
1575 */ 1576 ASSERT_VOP_LOCKED(lvp, __func__); 1577 ASSERT_VOP_ELOCKED(uvp, __func__); 1578 1579 if ((error = VOP_GETATTR(uvp, &va, cred)) != 0) 1580 return (error); 1581 if (va.va_flags & OPAQUE) 1582 return (0); 1583 1584 #ifdef MAC 1585 if ((error = mac_vnode_check_open(cred, lvp, VEXEC | VREAD)) != 0) 1586 return (error); 1587 #endif 1588 if ((error = VOP_ACCESS(lvp, VEXEC | VREAD, cred, td)) != 0) 1589 return (error); 1590 if ((error = VOP_OPEN(lvp, FREAD, cred, td, NULL)) != 0) 1591 return (error); 1592 if ((error = VOP_GETATTR(lvp, &va, cred)) != 0) 1593 return (error); 1594 1595 dirbuflen = max(DEV_BSIZE, GENERIC_MAXDIRSIZ); 1596 if (dirbuflen < va.va_blocksize) 1597 dirbuflen = va.va_blocksize; 1598 dirbuf = malloc(dirbuflen, M_TEMP, M_WAITOK); 1599 1600 len = 0; 1601 off = 0; 1602 eofflag = 0; 1603 1604 for (;;) { 1605 error = vn_dir_next_dirent(lvp, td, dirbuf, dirbuflen, 1606 &dp, &len, &off, &eofflag); 1607 if (error != 0) 1608 break; 1609 1610 if (len == 0) { 1611 /* EOF */ 1612 error = 0; 1613 break; 1614 } 1615 1616 if (dp->d_type == DT_WHT) 1617 continue; 1618 1619 /* 1620 * Any file in the directory which is not '.' or '..' indicates 1621 * the directory is not empty. 1622 */ 1623 switch (dp->d_namlen) { 1624 case 2: 1625 if (dp->d_name[1] != '.') { 1626 /* Can't be '..' (nor '.') */ 1627 break; 1628 } 1629 /* FALLTHROUGH */ 1630 case 1: 1631 if (dp->d_name[0] != '.') { 1632 /* Can't be '..' nor '.' 
*/ 1633 break; 1634 } 1635 continue; 1636 default: 1637 break; 1638 } 1639 1640 cn.cn_namelen = dp->d_namlen; 1641 cn.cn_pnbuf = NULL; 1642 cn.cn_nameptr = dp->d_name; 1643 cn.cn_nameiop = LOOKUP; 1644 cn.cn_flags = LOCKPARENT | LOCKLEAF | RDONLY | ISLASTCN; 1645 cn.cn_lkflags = LK_EXCLUSIVE; 1646 cn.cn_cred = cred; 1647 1648 error = VOP_LOOKUP(uvp, &tvp, &cn); 1649 if (tvp != NULLVP) 1650 vput(tvp); 1651 if (error != 0 && error != ENOENT && error != EJUSTRETURN) 1652 break; 1653 else if ((cn.cn_flags & ISWHITEOUT) == 0) { 1654 error = ENOTEMPTY; 1655 break; 1656 } else 1657 error = 0; 1658 } 1659 1660 VOP_CLOSE(lvp, FREAD, cred, td); 1661 free(dirbuf, M_TEMP); 1662 return (error); 1663 } 1664