/*
 * Copyright (c) 1994 Jan-Simon Pendry
 * Copyright (c) 1994
 *	The Regents of the University of California.  All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * Jan-Simon Pendry.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)union_subr.c	8.20 (Berkeley) 5/20/95
 * $FreeBSD$
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/fcntl.h>
#include <sys/file.h>
#include <sys/filedesc.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/module.h>
#include <sys/mount.h>
#include <sys/mutex.h>
#include <sys/namei.h>
#include <sys/stat.h>
#include <sys/vnode.h>

#include <vm/vm.h>
#include <vm/vm_extern.h>	/* for vnode_pager_setsize */
#include <vm/vm_object.h>	/* for vm cache coherency */
#include <vm/uma.h>

#include <fs/unionfs/union.h>

#include <sys/proc.h>

extern int	union_init(void);

/* must be power of two, otherwise change UNION_HASH() */
#define NHASH 32

/* unsigned int ... */
#define UNION_HASH(u, l) \
	(((((uintptr_t) (u)) + ((uintptr_t) l)) >> 8) & (NHASH-1))
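
/*
 * Union nodes are kept on NHASH hash chains, keyed on the pair of
 * layer vnode pointers; unvplock[] holds a busy/wanted bit lock for
 * each chain (see union_list_lock() and union_list_unlock() below).
 */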
static LIST_HEAD(unhead, union_node) unhead[NHASH];
static int unvplock[NHASH];

static void	union_dircache_r(struct vnode *vp, struct vnode ***vppp,
				      int *cntp);
static int	union_list_lock(int ix);
static void	union_list_unlock(int ix);
static int	union_relookup(struct union_mount *um, struct vnode *dvp,
				    struct vnode **vpp,
				    struct componentname *cnp,
				    struct componentname *cn, char *path,
				    int pathlen);
static void	union_updatevp(struct union_node *un,
				    struct vnode *uppervp,
				    struct vnode *lowervp);
static void	union_newlower(struct union_node *, struct vnode *);
static void	union_newupper(struct union_node *, struct vnode *);
static int	union_copyfile(struct vnode *, struct vnode *,
				    struct ucred *, struct thread *);
static int	union_vn_create(struct vnode **, struct union_node *,
				    struct thread *);
static int	union_vn_close(struct vnode *, int, struct ucred *,
				    struct thread *);

int
union_init()
{
	int i;

	for (i = 0; i < NHASH; i++)
		LIST_INIT(&unhead[i]);
	bzero((caddr_t)unvplock, sizeof(unvplock));
	return (0);
}

static int
union_list_lock(ix)
	int ix;
{
	if (unvplock[ix] & UNVP_LOCKED) {
		unvplock[ix] |= UNVP_WANT;
		(void) tsleep( &unvplock[ix], PINOD, "unllck", 0);
		return (1);
	}
	unvplock[ix] |= UNVP_LOCKED;
	return (0);
}

static void
union_list_unlock(ix)
	int ix;
{
	unvplock[ix] &= ~UNVP_LOCKED;

	if (unvplock[ix] & UNVP_WANT) {
		unvplock[ix] &= ~UNVP_WANT;
		wakeup( &unvplock[ix]);
	}
}

/*
 * union_updatevp:
 *
 *	The uppervp, if not NULL, must be referenced and not locked by us
 *	The lowervp, if not NULL, must be referenced.
 *
 *	If uppervp and lowervp match pointers already installed, then
 *	nothing happens. The passed vp's (when matching) are not adjusted.
 *
 *	This routine may only be called by union_newupper() and
 *	union_newlower().
 */

static void
union_updatevp(un, uppervp, lowervp)
	struct union_node *un;
	struct vnode *uppervp;
	struct vnode *lowervp;
{
	int ohash = UNION_HASH(un->un_uppervp, un->un_lowervp);
	int nhash = UNION_HASH(uppervp, lowervp);
	int docache = (lowervp != NULLVP || uppervp != NULLVP);
	int lhash, uhash;
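
	/*
	 * The node may have to move from its old hash chain to a new
	 * one, so both chains are locked for the duration of the update.
	 */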
	/*
	 * Ensure locking is ordered from lower to higher
	 * to avoid deadlocks.
	 */
	if (nhash < ohash) {
		lhash = nhash;
		uhash = ohash;
	} else {
		lhash = ohash;
		uhash = nhash;
	}

	if (lhash != uhash) {
		while (union_list_lock(lhash))
			continue;
	}

	while (union_list_lock(uhash))
		continue;

	if (ohash != nhash || !docache) {
		if (un->un_flags & UN_CACHED) {
			un->un_flags &= ~UN_CACHED;
			LIST_REMOVE(un, un_cache);
		}
	}

	if (ohash != nhash)
		union_list_unlock(ohash);

	if (un->un_lowervp != lowervp) {
		if (un->un_lowervp) {
			vrele(un->un_lowervp);
			if (un->un_path) {
				free(un->un_path, M_TEMP);
				un->un_path = 0;
			}
		}
		un->un_lowervp = lowervp;
		un->un_lowersz = VNOVAL;
	}

	if (un->un_uppervp != uppervp) {
		if (un->un_uppervp)
			vrele(un->un_uppervp);
		un->un_uppervp = uppervp;
		un->un_uppersz = VNOVAL;
	}

	if (docache && (ohash != nhash)) {
		LIST_INSERT_HEAD(&unhead[nhash], un, un_cache);
		un->un_flags |= UN_CACHED;
	}

	union_list_unlock(nhash);
}

/*
 * Set a new lowervp.  The passed lowervp must be referenced and will be
 * stored in the vp in a referenced state.
 */

static void
union_newlower(un, lowervp)
	struct union_node *un;
	struct vnode *lowervp;
{
	union_updatevp(un, un->un_uppervp, lowervp);
}

/*
 * Set a new uppervp.  The passed uppervp must be locked and will be
 * stored in the vp in a locked state.  The caller should not unlock
 * uppervp.
 */

static void
union_newupper(un, uppervp)
	struct union_node *un;
	struct vnode *uppervp;
{
	union_updatevp(un, uppervp, un->un_lowervp);
}

/*
 * Keep track of size changes in the underlying vnodes.
 * If the size changes, then callback to the vm layer
 * giving priority to the upper layer size.
 */
void
union_newsize(vp, uppersz, lowersz)
	struct vnode *vp;
	off_t uppersz, lowersz;
{
	struct union_node *un;
	off_t sz;

	/* only interested in regular files */
	if (vp->v_type != VREG)
		return;

	un = VTOUNION(vp);
	sz = VNOVAL;

	if ((uppersz != VNOVAL) && (un->un_uppersz != uppersz)) {
		un->un_uppersz = uppersz;
		if (sz == VNOVAL)
			sz = un->un_uppersz;
	}

	if ((lowersz != VNOVAL) && (un->un_lowersz != lowersz)) {
		un->un_lowersz = lowersz;
		if (sz == VNOVAL)
			sz = un->un_lowersz;
	}

	if (sz != VNOVAL) {
		UDEBUG(("union: %s size now %ld\n",
			(uppersz != VNOVAL ? "upper" : "lower"), (long)sz));
		/*
		 * There is no need to change size of non-existent object.
		 */
		/* vnode_pager_setsize(vp, sz); */
	}
}

/*
 * union_allocvp: allocate a union_node and associate it with a
 * parent union_node and one or two vnodes.
 *
 *	vpp	Holds the returned vnode locked and referenced if no
 *		error occurs.
 *
 *	mp	Holds the mount point.  mp may or may not be busied.
 *		allocvp() makes no changes to mp.
 *
 *	dvp	Holds the parent union_node to the one we wish to create.
 *		XXX may only be used to traverse an uncopied lowervp-based
 *		tree? XXX
 *
 *		dvp may or may not be locked.  allocvp() makes no changes
 *		to dvp.
 *
 *	upperdvp Holds the parent vnode to uppervp, generally used along
 *		with path component information to create a shadow of
 *		lowervp when uppervp does not exist.
 *
 *		upperdvp is referenced but unlocked on entry, and will be
 *		dereferenced on return.
 *
 *	uppervp	Holds the new uppervp vnode to be stored in the
 *		union_node we are allocating.  uppervp is referenced but
 *		not locked, and will be dereferenced on return.
 *
 *	lowervp	Holds the new lowervp vnode to be stored in the
 *		union_node we are allocating.  lowervp is referenced but
 *		not locked, and will be dereferenced on return.
 *
 *	cnp	Holds path component information to be coupled with
 *		lowervp and upperdvp to allow unionfs to create an uppervp
 *		later on.  Only used if lowervp is valid.  The contents
 *		of cnp are only valid for the duration of the call.
 *
 *	docache	Determine whether this node should be entered in the
 *		cache or whether it should be destroyed as soon as possible.
 *
 * All union_nodes are maintained on a singly-linked
 * list.  New nodes are only allocated when they cannot
 * be found on this list.  Entries on the list are
 * removed when the vfs reclaim entry is called.
 *
 * A single lock is kept for the entire list.  This is
 * needed because the getnewvnode() function can block
 * waiting for a vnode to become free, in which case there
 * may be more than one process trying to get the same
 * vnode.  This lock is only taken if we are going to
 * call getnewvnode(), since the kernel itself is single-threaded.
 *
 * If an entry is found on the list, then call vget() to
 * take a reference.  This is done because there may be
 * zero references to it and so it needs to be removed from
 * the vnode free list.
 */

int
union_allocvp(vpp, mp, dvp, upperdvp, cnp, uppervp, lowervp, docache)
	struct vnode **vpp;
	struct mount *mp;
	struct vnode *dvp;		/* parent union vnode */
	struct vnode *upperdvp;		/* parent vnode of uppervp */
	struct componentname *cnp;	/* may be null */
	struct vnode *uppervp;		/* may be null */
	struct vnode *lowervp;		/* may be null */
	int docache;
{
	int error;
	struct union_node *un = 0;
	struct union_mount *um = MOUNTTOUNIONMOUNT(mp);
	struct thread *td = (cnp) ? cnp->cn_thread : curthread;
	int hash = 0;
	int vflag;
	int try;

	if (uppervp == NULLVP && lowervp == NULLVP)
		panic("union: unidentifiable allocation");

	if (uppervp && lowervp && (uppervp->v_type != lowervp->v_type)) {
		vrele(lowervp);
		lowervp = NULLVP;
	}

	/* detect the root vnode (and aliases) */
	vflag = 0;
	if ((uppervp == um->um_uppervp) &&
	    ((lowervp == NULLVP) || lowervp == um->um_lowervp)) {
		if (lowervp == NULLVP) {
			lowervp = um->um_lowervp;
			if (lowervp != NULLVP)
				VREF(lowervp);
		}
		vflag = VV_ROOT;
	}
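
	/*
	 * Look for an existing node in the cache.  The node may have
	 * been entered under any of three hash keys, depending on which
	 * layer vnodes were known when it was created, so try each key
	 * in turn: (upper, lower), (upper, NULL) and (NULL, lower).
	 */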
loop:
	if (!docache) {
		un = 0;
	} else for (try = 0; try < 3; try++) {
		switch (try) {
		case 0:
			if (lowervp == NULLVP)
				continue;
			hash = UNION_HASH(uppervp, lowervp);
			break;

		case 1:
			if (uppervp == NULLVP)
				continue;
			hash = UNION_HASH(uppervp, NULLVP);
			break;

		case 2:
			if (lowervp == NULLVP)
				continue;
			hash = UNION_HASH(NULLVP, lowervp);
			break;
		}

		while (union_list_lock(hash))
			continue;
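
		/*
		 * A cached node matches if each of its layer vnodes
		 * either equals the one we were passed or is not set,
		 * and it belongs to the same mount.  vget() may sleep,
		 * in which case the chain can change under us and the
		 * whole lookup is retried from "loop".
		 */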
		LIST_FOREACH(un, &unhead[hash], un_cache) {
			if ((un->un_lowervp == lowervp ||
			     un->un_lowervp == NULLVP) &&
			    (un->un_uppervp == uppervp ||
			     un->un_uppervp == NULLVP) &&
			    (UNIONTOV(un)->v_mount == mp)) {
				if (vget(UNIONTOV(un), 0,
				    cnp ? cnp->cn_thread : NULL)) {
					union_list_unlock(hash);
					goto loop;
				}
				break;
			}
		}

		union_list_unlock(hash);

		if (un)
			break;
	}

	if (un) {
		/*
		 * Obtain a lock on the union_node.  Everything is unlocked
		 * except for dvp, so check that case.  If they match, our
		 * new un is already locked.  Otherwise we have to lock our
		 * new un.
		 *
		 * A potential deadlock situation occurs when we are holding
		 * one lock while trying to get another.  We must follow
		 * strict ordering rules to avoid it.  We try to locate dvp
		 * by scanning up from un_vnode, since the most likely
		 * scenario is un being under dvp.
		 */

		if (dvp && un->un_vnode != dvp) {
			struct vnode *scan = un->un_vnode;

			do {
				scan = VTOUNION(scan)->un_pvp;
			} while (scan && scan->v_op == union_vnodeop_p &&
				 scan != dvp);
			if (scan != dvp) {
				/*
				 * our new un is above dvp (we never saw dvp
				 * while moving up the tree).
				 */
				VREF(dvp);
				VOP_UNLOCK(dvp, 0, td);
				error = vn_lock(un->un_vnode, LK_EXCLUSIVE, td);
				vn_lock(dvp, LK_EXCLUSIVE | LK_RETRY, td);
				vrele(dvp);
			} else {
				/*
				 * our new un is under dvp
				 */
				error = vn_lock(un->un_vnode, LK_EXCLUSIVE, td);
			}
		} else if (dvp == NULLVP) {
			/*
			 * dvp is NULL, we need to lock un.
			 */
			error = vn_lock(un->un_vnode, LK_EXCLUSIVE, td);
		} else {
			/*
			 * dvp == un->un_vnode, we are already locked.
			 */
			error = 0;
		}

		if (error)
			goto loop;

		/*
		 * At this point, the union_node is locked and referenced.
		 *
		 * uppervp is locked and referenced or NULL, lowervp is
		 * referenced or NULL.
		 */
		UDEBUG(("Modify existing un %p vn %p upper %p(refs %d) -> %p(refs %d)\n",
			un, un->un_vnode, un->un_uppervp,
			(un->un_uppervp ? vrefcnt(un->un_uppervp) : -99),
			uppervp,
			(uppervp ? vrefcnt(uppervp) : -99)
		));

		if (uppervp != un->un_uppervp) {
			KASSERT(uppervp == NULL || vrefcnt(uppervp) > 0,
			    ("union_allocvp: too few refs %d (at least 1 "
			    "required) on uppervp", vrefcnt(uppervp)));
			union_newupper(un, uppervp);
		} else if (uppervp) {
			KASSERT(vrefcnt(uppervp) > 1,
			    ("union_allocvp: too few refs %d (at least 2 "
			    "required) on uppervp", vrefcnt(uppervp)));
			vrele(uppervp);
		}

		/*
		 * Save information about the lower layer.
		 * This needs to keep track of pathname
		 * and directory information which union_vn_create()
		 * might need.
		 */
		if (lowervp != un->un_lowervp) {
			union_newlower(un, lowervp);
			if (cnp && (lowervp != NULLVP)) {
				un->un_path = malloc(cnp->cn_namelen+1,
						M_TEMP, M_WAITOK);
				bcopy(cnp->cn_nameptr, un->un_path,
						cnp->cn_namelen);
				un->un_path[cnp->cn_namelen] = '\0';
			}
		} else if (lowervp) {
			vrele(lowervp);
		}

		/*
		 * and upperdvp
		 */
		if (upperdvp != un->un_dirvp) {
			if (un->un_dirvp)
				vrele(un->un_dirvp);
			un->un_dirvp = upperdvp;
		} else if (upperdvp) {
			vrele(upperdvp);
		}

		*vpp = UNIONTOV(un);
		return (0);
	}

	if (docache) {
		/*
		 * Otherwise lock the vp list while we call getnewvnode()
		 * since that can block.
		 */
		hash = UNION_HASH(uppervp, lowervp);

		if (union_list_lock(hash))
			goto loop;
	}

	/*
	 * Create new node rather than replace old node.
	 */

	error = getnewvnode("union", mp, union_vnodeop_p, vpp);
	if (error) {
		/*
		 * If an error occurs, clear out vnodes.
		 */
		if (lowervp)
			vrele(lowervp);
		if (uppervp)
			vrele(uppervp);
		if (upperdvp)
			vrele(upperdvp);
		*vpp = NULL;
		goto out;
	}
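
	/*
	 * Hang the union_node off the new vnode as its private data,
	 * then initialize it and install the layer vnodes.
	 */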
	MALLOC((*vpp)->v_data, void *, sizeof(struct union_node),
		M_TEMP, M_WAITOK);

	ASSERT_VOP_LOCKED(*vpp, "union_allocvp");
	(*vpp)->v_vflag |= vflag;
	if (uppervp)
		(*vpp)->v_type = uppervp->v_type;
	else
		(*vpp)->v_type = lowervp->v_type;

	un = VTOUNION(*vpp);
	bzero(un, sizeof(*un));

	vn_lock(*vpp, LK_EXCLUSIVE | LK_RETRY, td);

	un->un_vnode = *vpp;
	un->un_uppervp = uppervp;
	un->un_uppersz = VNOVAL;
	un->un_lowervp = lowervp;
	un->un_lowersz = VNOVAL;
	un->un_dirvp = upperdvp;
	un->un_pvp = dvp;		/* only parent dir in new allocation */
	if (dvp != NULLVP)
		VREF(dvp);
	un->un_dircache = 0;
	un->un_openl = 0;

	if (cnp && (lowervp != NULLVP)) {
		un->un_path = malloc(cnp->cn_namelen+1, M_TEMP, M_WAITOK);
		bcopy(cnp->cn_nameptr, un->un_path, cnp->cn_namelen);
		un->un_path[cnp->cn_namelen] = '\0';
	} else {
		un->un_path = 0;
		un->un_dirvp = NULL;
	}

	if (docache) {
		LIST_INSERT_HEAD(&unhead[hash], un, un_cache);
		un->un_flags |= UN_CACHED;
	}

out:
	if (docache)
		union_list_unlock(hash);

	return (error);
}

int
union_freevp(vp)
	struct vnode *vp;
{
	struct union_node *un = VTOUNION(vp);

	if (un->un_flags & UN_CACHED) {
		un->un_flags &= ~UN_CACHED;
		LIST_REMOVE(un, un_cache);
	}

	if (un->un_pvp != NULLVP) {
		vrele(un->un_pvp);
		un->un_pvp = NULL;
	}
	if (un->un_uppervp != NULLVP) {
		vrele(un->un_uppervp);
		un->un_uppervp = NULL;
	}
	if (un->un_lowervp != NULLVP) {
		vrele(un->un_lowervp);
		un->un_lowervp = NULL;
	}
	if (un->un_dirvp != NULLVP) {
		vrele(un->un_dirvp);
		un->un_dirvp = NULL;
	}
	if (un->un_path) {
		free(un->un_path, M_TEMP);
		un->un_path = NULL;
	}

	FREE(vp->v_data, M_TEMP);
	vp->v_data = 0;

	return (0);
}

/*
 * copyfile.  Copy the vnode (fvp) to the vnode (tvp)
 * using a sequence of reads and writes.  Both (fvp)
 * and (tvp) are locked on entry and exit.
 *
 * fvp and tvp are both exclusive locked on call, but their refcounts
 * haven't been bumped at all.
 */
static int
union_copyfile(fvp, tvp, cred, td)
	struct vnode *fvp;
	struct vnode *tvp;
	struct ucred *cred;
	struct thread *td;
{
	char *buf;
	struct uio uio;
	struct iovec iov;
	int error = 0;

	/*
	 * strategy:
	 * Allocate a buffer of size MAXBSIZE.
	 * Loop doing reads and writes, keeping track
	 * of the current uio offset.
	 * Give up at the first sign of trouble.
	 */

	bzero(&uio, sizeof(uio));

	uio.uio_td = td;
	uio.uio_segflg = UIO_SYSSPACE;
	uio.uio_offset = 0;

	VOP_LEASE(fvp, td, cred, LEASE_READ);
	VOP_LEASE(tvp, td, cred, LEASE_WRITE);

	buf = malloc(MAXBSIZE, M_TEMP, M_WAITOK);

	/* ugly loop follows... */
	do {
		off_t offset = uio.uio_offset;
		int count;
		int bufoffset;

		/*
		 * Setup for big read.
		 */
		uio.uio_iov = &iov;
		uio.uio_iovcnt = 1;
		iov.iov_base = buf;
		iov.iov_len = MAXBSIZE;
		uio.uio_resid = iov.iov_len;
		uio.uio_rw = UIO_READ;

		if ((error = VOP_READ(fvp, &uio, 0, cred)) != 0)
			break;

		/*
		 * Get bytes read, handle read eof case and setup for
		 * write loop.
		 */
		if ((count = MAXBSIZE - uio.uio_resid) == 0)
			break;
		bufoffset = 0;

		/*
		 * Write until an error occurs or our buffer has been
		 * exhausted, then update the offset for the next read.
		 */
		while (bufoffset < count) {
			uio.uio_iov = &iov;
			uio.uio_iovcnt = 1;
			iov.iov_base = buf + bufoffset;
			iov.iov_len = count - bufoffset;
			uio.uio_offset = offset + bufoffset;
			uio.uio_rw = UIO_WRITE;
			uio.uio_resid = iov.iov_len;

			if ((error = VOP_WRITE(tvp, &uio, 0, cred)) != 0)
				break;
			bufoffset += (count - bufoffset) - uio.uio_resid;
		}
		uio.uio_offset = offset + bufoffset;
	} while (error == 0);

	free(buf, M_TEMP);
	return (error);
}
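
/*
 * union_copyup: create the file in the upper layer and, when docopy is
 * non-zero, copy the lower file's contents up into it.  Any opens that
 * went to the lower layer are then replayed against the upper vnode so
 * that subsequent I/O goes to the top layer.
 */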
/*
 * un's vnode is assumed to be locked on entry and remains locked on exit.
 */

int
union_copyup(un, docopy, cred, td)
	struct union_node *un;
	int docopy;
	struct ucred *cred;
	struct thread *td;
{
	int error;
	struct mount *mp;
	struct vnode *lvp, *uvp;

	/*
	 * If the user does not have read permission, the vnode should not
	 * be copied to upper layer.
	 */
	vn_lock(un->un_lowervp, LK_EXCLUSIVE | LK_RETRY, td);
	error = VOP_ACCESS(un->un_lowervp, VREAD, cred, td);
	VOP_UNLOCK(un->un_lowervp, 0, td);
	if (error)
		return (error);

	if ((error = vn_start_write(un->un_dirvp, &mp, V_WAIT | PCATCH)) != 0)
		return (error);
	if ((error = union_vn_create(&uvp, un, td)) != 0) {
		vn_finished_write(mp);
		return (error);
	}

	lvp = un->un_lowervp;

	KASSERT(vrefcnt(uvp) > 0, ("copy: uvp refcount 0: %d", vrefcnt(uvp)));
	if (docopy) {
		/*
		 * XX - should not ignore errors
		 * from VOP_CLOSE()
		 */
		vn_lock(lvp, LK_EXCLUSIVE | LK_RETRY, td);
		error = VOP_OPEN(lvp, FREAD, cred, td);
		if (error == 0 && vn_canvmio(lvp) == TRUE)
			error = vfs_object_create(lvp, td, cred);
		if (error == 0) {
			error = union_copyfile(lvp, uvp, cred, td);
			VOP_UNLOCK(lvp, 0, td);
			(void) VOP_CLOSE(lvp, FREAD, cred, td);
		}
		if (error == 0)
			UDEBUG(("union: copied up %s\n", un->un_path));

	}
	VOP_UNLOCK(uvp, 0, td);
	vn_finished_write(mp);
	union_newupper(un, uvp);
	KASSERT(vrefcnt(uvp) > 0, ("copy: uvp refcount 0: %d", vrefcnt(uvp)));
	union_vn_close(uvp, FWRITE, cred, td);
	KASSERT(vrefcnt(uvp) > 0, ("copy: uvp refcount 0: %d", vrefcnt(uvp)));
	/*
	 * Subsequent IOs will go to the top layer, so
	 * call close on the lower vnode and open on the
	 * upper vnode to ensure that the filesystem keeps
	 * its reference counts right.  This doesn't do
	 * the right thing with (cred) and (FREAD) though.
	 * Ignoring error returns is not right, either.
	 */
	if (error == 0) {
		int i;

		for (i = 0; i < un->un_openl; i++) {
			(void) VOP_CLOSE(lvp, FREAD, cred, td);
			(void) VOP_OPEN(uvp, FREAD, cred, td);
		}
		if (un->un_openl) {
			if (vn_canvmio(uvp) == TRUE)
				error = vfs_object_create(uvp, td, cred);
		}
		un->un_openl = 0;
	}

	return (error);

}

/*
 * union_relookup:
 *
 *	dvp should be locked on entry and will be locked on return.  No
 *	net change in the ref count will occur.
 *
 *	If an error is returned, *vpp will be invalid, otherwise it
 *	will hold a locked, referenced vnode.  If *vpp == dvp then
 *	remember that only one exclusive lock is held.
 */

static int
union_relookup(um, dvp, vpp, cnp, cn, path, pathlen)
	struct union_mount *um;
	struct vnode *dvp;
	struct vnode **vpp;
	struct componentname *cnp;
	struct componentname *cn;
	char *path;
	int pathlen;
{
	int error;

	/*
	 * A new componentname structure must be faked up because
	 * there is no way to know where the upper level cnp came
	 * from or what it is being used for.  This must duplicate
	 * some of the work done by NDINIT(), some of the work done
	 * by namei(), some of the work done by lookup() and some of
	 * the work done by VOP_LOOKUP() when given a CREATE flag.
	 * Conclusion: Horrible.
	 */
	cn->cn_namelen = pathlen;
	cn->cn_pnbuf = uma_zalloc(namei_zone, M_WAITOK);
	bcopy(path, cn->cn_pnbuf, cn->cn_namelen);
	cn->cn_pnbuf[cn->cn_namelen] = '\0';

	cn->cn_nameiop = CREATE;
	cn->cn_flags = (LOCKPARENT|LOCKLEAF|HASBUF|SAVENAME|ISLASTCN);
	cn->cn_thread = cnp->cn_thread;
	if (um->um_op == UNMNT_ABOVE)
		cn->cn_cred = cnp->cn_cred;
	else
		cn->cn_cred = um->um_cred;
	cn->cn_nameptr = cn->cn_pnbuf;
	cn->cn_consume = cnp->cn_consume;

	VREF(dvp);
	VOP_UNLOCK(dvp, 0, cnp->cn_thread);

	/*
	 * Pass dvp unlocked and referenced on call to relookup().
	 *
	 * If an error occurs, dvp will be returned unlocked and dereferenced.
	 */

	if ((error = relookup(dvp, vpp, cn)) != 0) {
		vn_lock(dvp, LK_EXCLUSIVE | LK_RETRY, cnp->cn_thread);
		return (error);
	}

	/*
	 * If no error occurs, dvp will be returned locked with the reference
	 * left as before, and vpp will be returned referenced and locked.
	 *
	 * We want to return with dvp as it was passed to us, so we get
	 * rid of our reference.
	 */
	vrele(dvp);
	return (0);
}

/*
 * Create a shadow directory in the upper layer.
 * The new vnode is returned locked.
 *
 * (um) points to the union mount structure for access to
 * the mounting process's credentials.
 * (dvp) is the directory in which to create the shadow directory.
 * It is locked (but not ref'd) on entry and return.
 * (cnp) is the component name to be created.
 * (vpp) is the returned newly created shadow directory, which
 * is returned locked and ref'd
 */
int
union_mkshadow(um, dvp, cnp, vpp)
	struct union_mount *um;
	struct vnode *dvp;
	struct componentname *cnp;
	struct vnode **vpp;
{
	int error;
	struct vattr va;
	struct thread *td = cnp->cn_thread;
	struct componentname cn;
	struct mount *mp;

	if ((error = vn_start_write(dvp, &mp, V_WAIT | PCATCH)) != 0)
		return (error);
	if ((error = union_relookup(um, dvp, vpp, cnp, &cn,
			cnp->cn_nameptr, cnp->cn_namelen)) != 0) {
		vn_finished_write(mp);
		return (error);
	}
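
	/*
	 * relookup() found an existing entry, so the shadow directory
	 * is already present: release everything and report EEXIST.
	 */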
	if (*vpp) {
		if (cn.cn_flags & HASBUF) {
			uma_zfree(namei_zone, cn.cn_pnbuf);
			cn.cn_flags &= ~HASBUF;
		}
		if (dvp == *vpp)
			vrele(*vpp);
		else
			vput(*vpp);
		vn_finished_write(mp);
		*vpp = NULLVP;
		return (EEXIST);
	}

	/*
	 * Policy: when creating the shadow directory in the
	 * upper layer, create it owned by the user who did
	 * the mount, group from parent directory, and mode
	 * 777 modified by umask (ie mostly identical to the
	 * mkdir syscall).  (jsp, kb)
	 */

	VATTR_NULL(&va);
	va.va_type = VDIR;
	va.va_mode = um->um_cmode;

	/* VOP_LEASE: dvp is locked */
	VOP_LEASE(dvp, td, cn.cn_cred, LEASE_WRITE);

	error = VOP_MKDIR(dvp, vpp, &cn, &va);
	if (cn.cn_flags & HASBUF) {
		uma_zfree(namei_zone, cn.cn_pnbuf);
		cn.cn_flags &= ~HASBUF;
	}
	/*vput(dvp);*/
	vn_finished_write(mp);
	return (error);
}

/*
 * Create a whiteout entry in the upper layer.
 *
 * (um) points to the union mount structure for access to
 * the mounting process's credentials.
 * (dvp) is the directory in which to create the whiteout.
 * It is locked on entry and return.
 * (cnp) is the component name to be created.
 */
int
union_mkwhiteout(um, dvp, cnp, path)
	struct union_mount *um;
	struct vnode *dvp;
	struct componentname *cnp;
	char *path;
{
	int error;
	struct thread *td = cnp->cn_thread;
	struct vnode *wvp;
	struct componentname cn;
	struct mount *mp;

	if ((error = vn_start_write(dvp, &mp, V_WAIT | PCATCH)) != 0)
		return (error);
	error = union_relookup(um, dvp, &wvp, cnp, &cn, path, strlen(path));
	if (error) {
		vn_finished_write(mp);
		return (error);
	}

	if (wvp) {
		if (cn.cn_flags & HASBUF) {
			uma_zfree(namei_zone, cn.cn_pnbuf);
			cn.cn_flags &= ~HASBUF;
		}
		if (wvp == dvp)
			vrele(wvp);
		else
			vput(wvp);
		vn_finished_write(mp);
		return (EEXIST);
	}

	/* VOP_LEASE: dvp is locked */
	VOP_LEASE(dvp, td, td->td_ucred, LEASE_WRITE);

	error = VOP_WHITEOUT(dvp, &cn, CREATE);
	if (cn.cn_flags & HASBUF) {
		uma_zfree(namei_zone, cn.cn_pnbuf);
		cn.cn_flags &= ~HASBUF;
	}
	vn_finished_write(mp);
	return (error);
}

/*
 * union_vn_create: creates and opens a new shadow file
 * on the upper union layer.  This function is similar
 * in spirit to calling vn_open() but it avoids calling namei().
 * The problem with calling namei() is that a) it locks too many
 * things, and b) it doesn't start at the "right" directory,
 * whereas relookup() is told where to start.
 *
 * On entry, the vnode associated with un is locked.  It remains locked
 * on return.
 *
 * If no error occurs, *vpp contains a locked referenced vnode for your
 * use.  If an error occurs *vpp is undefined.
 */
static int
union_vn_create(vpp, un, td)
	struct vnode **vpp;
	struct union_node *un;
	struct thread *td;
{
	struct vnode *vp;
	struct ucred *cred = td->td_ucred;
	struct vattr vat;
	struct vattr *vap = &vat;
	int fmode = FFLAGS(O_WRONLY|O_CREAT|O_TRUNC|O_EXCL);
	int error;
	int cmode;
	struct componentname cn;

	*vpp = NULLVP;
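	/*
	 * The creation mode comes from UN_FILEMODE masked by the
	 * calling process's umask, just as open(2) would apply it.
	 */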
	FILEDESC_LOCK(td->td_proc->p_fd);
	cmode = UN_FILEMODE & ~td->td_proc->p_fd->fd_cmask;
	FILEDESC_UNLOCK(td->td_proc->p_fd);

	/*
	 * Build a new componentname structure (for the same
	 * reasons outlined in union_mkshadow()).
	 * The difference here is that the file is owned by
	 * the current user, rather than by the person who
	 * did the mount, since the current user needs to be
	 * able to write the file (that's why it is being
	 * copied in the first place).
	 */
	cn.cn_namelen = strlen(un->un_path);
	cn.cn_pnbuf = uma_zalloc(namei_zone, M_WAITOK);
	bcopy(un->un_path, cn.cn_pnbuf, cn.cn_namelen+1);
	cn.cn_nameiop = CREATE;
	cn.cn_flags = (LOCKPARENT|LOCKLEAF|HASBUF|SAVENAME|ISLASTCN);
	cn.cn_thread = td;
	cn.cn_cred = td->td_ucred;
	cn.cn_nameptr = cn.cn_pnbuf;
	cn.cn_consume = 0;

	/*
	 * Pass dvp unlocked and referenced on call to relookup().
	 *
	 * If an error occurs, dvp will be returned unlocked and dereferenced.
	 */
	VREF(un->un_dirvp);
	error = relookup(un->un_dirvp, &vp, &cn);
	if (error)
		return (error);

	/*
	 * If no error occurs, dvp will be returned locked with the reference
	 * left as before, and vpp will be returned referenced and locked.
	 */
	if (vp) {
		vput(un->un_dirvp);
		if (cn.cn_flags & HASBUF) {
			uma_zfree(namei_zone, cn.cn_pnbuf);
			cn.cn_flags &= ~HASBUF;
		}
		if (vp == un->un_dirvp)
			vrele(vp);
		else
			vput(vp);
		return (EEXIST);
	}

	/*
	 * Good - there was no race to create the file
	 * so go ahead and create it.  The permissions
	 * on the file will be 0666 modified by the
	 * current user's umask.  Access to the file, while
	 * it is unioned, will require access to the top *and*
	 * bottom files.  Access when not unioned will simply
	 * require access to the top-level file.
	 * TODO: confirm choice of access permissions.
	 */
	VATTR_NULL(vap);
	vap->va_type = VREG;
	vap->va_mode = cmode;
	VOP_LEASE(un->un_dirvp, td, cred, LEASE_WRITE);
	error = VOP_CREATE(un->un_dirvp, &vp, &cn, vap);
	if (cn.cn_flags & HASBUF) {
		uma_zfree(namei_zone, cn.cn_pnbuf);
		cn.cn_flags &= ~HASBUF;
	}
	vput(un->un_dirvp);
	if (error)
		return (error);
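
	/*
	 * Open the new file and account for the write reference by
	 * hand, mirroring what vn_open() would have done for us;
	 * union_vn_close() drops it again.
	 */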
	error = VOP_OPEN(vp, fmode, cred, td);
	if (error == 0 && vn_canvmio(vp) == TRUE)
		error = vfs_object_create(vp, td, cred);
	if (error) {
		vput(vp);
		return (error);
	}
	vp->v_writecount++;
	*vpp = vp;
	return (0);
}

static int
union_vn_close(vp, fmode, cred, td)
	struct vnode *vp;
	int fmode;
	struct ucred *cred;
	struct thread *td;
{

	if (fmode & FWRITE)
		--vp->v_writecount;
	return (VOP_CLOSE(vp, fmode, cred, td));
}

#if 0

/*
 * union_removed_upper:
 *
 *	called with union_node unlocked. XXX
 */

void
union_removed_upper(un)
	struct union_node *un;
{
	struct thread *td = curthread;	/* XXX */
	struct vnode **vpp;

	/*
	 * Do not set the uppervp to NULLVP.  If lowervp is NULLVP,
	 * union node will have neither uppervp nor lowervp.  We remove
	 * the union node from cache, so that it will not be referenced.
	 */
	union_newupper(un, NULLVP);
	if (un->un_dircache != 0) {
		for (vpp = un->un_dircache; *vpp != NULLVP; vpp++)
			vrele(*vpp);
		free(un->un_dircache, M_TEMP);
		un->un_dircache = 0;
	}

	if (un->un_flags & UN_CACHED) {
		un->un_flags &= ~UN_CACHED;
		LIST_REMOVE(un, un_cache);
	}
}

#endif

/*
 * Determine whether a whiteout is needed
 * during a remove/rmdir operation.
 */
int
union_dowhiteout(un, cred, td)
	struct union_node *un;
	struct ucred *cred;
	struct thread *td;
{
	struct vattr va;

	if (un->un_lowervp != NULLVP)
		return (1);

	if (VOP_GETATTR(un->un_uppervp, &va, cred, td) == 0 &&
	    (va.va_flags & OPAQUE))
		return (1);

	return (0);
}
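
/*
 * Walk the union stack below vp.  With a count-only pass (vppp == NULL)
 * the number of non-union layer vnodes is accumulated in *cntp; with a
 * fill pass each layer vnode is referenced and appended to the array
 * that *vppp walks through.
 */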
static void
union_dircache_r(vp, vppp, cntp)
	struct vnode *vp;
	struct vnode ***vppp;
	int *cntp;
{
	struct union_node *un;

	if (vp->v_op != union_vnodeop_p) {
		if (vppp) {
			VREF(vp);
			*(*vppp)++ = vp;
			if (--(*cntp) == 0)
				panic("union: dircache table too small");
		} else {
			(*cntp)++;
		}

		return;
	}

	un = VTOUNION(vp);
	if (un->un_uppervp != NULLVP)
		union_dircache_r(un->un_uppervp, vppp, cntp);
	if (un->un_lowervp != NULLVP)
		union_dircache_r(un->un_lowervp, vppp, cntp);
}
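
/*
 * union_dircache: build (or advance through) the NULLVP-terminated
 * un_dircache array of referenced layer vnodes for a union directory,
 * and return a union vnode for the layer below the current one.  This
 * is what lets the readdir path (union_dircheck() below) fall through
 * to lower layers once the upper layer is exhausted.
 */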
struct vnode *
union_dircache(vp, td)
	struct vnode *vp;
	struct thread *td;
{
	int cnt;
	struct vnode *nvp;
	struct vnode **vpp;
	struct vnode **dircache;
	struct union_node *un;
	int error;

	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
	dircache = VTOUNION(vp)->un_dircache;

	nvp = NULLVP;

	if (dircache == NULL) {
		cnt = 0;
		union_dircache_r(vp, 0, &cnt);
		cnt++;
		dircache = malloc(cnt * sizeof(struct vnode *),
				M_TEMP, M_WAITOK);
		vpp = dircache;
		union_dircache_r(vp, &vpp, &cnt);
		*vpp = NULLVP;
		vpp = dircache + 1;
	} else {
		vpp = dircache;
		do {
			if (*vpp++ == VTOUNION(vp)->un_uppervp)
				break;
		} while (*vpp != NULLVP);
	}

	if (*vpp == NULLVP)
		goto out;

	/*vn_lock(*vpp, LK_EXCLUSIVE | LK_RETRY, td);*/
	UDEBUG(("ALLOCVP-3 %p ref %d\n", *vpp, (*vpp ? vrefcnt(*vpp) : -99)));
	VREF(*vpp);
	error = union_allocvp(&nvp, vp->v_mount, NULLVP, NULLVP, NULL,
	    *vpp, NULLVP, 0);
	UDEBUG(("ALLOCVP-3B %p ref %d\n", nvp, (*vpp ? vrefcnt(*vpp) : -99)));
	if (error)
		goto out;

	VTOUNION(vp)->un_dircache = 0;
	un = VTOUNION(nvp);
	un->un_dircache = dircache;

out:
	VOP_UNLOCK(vp, 0, td);
	return (nvp);
}

/*
 * Module glue to remove #ifdef UNION from vfs_syscalls.c
 */
static int
union_dircheck(struct thread *td, struct vnode **vp, struct file *fp)
{
	int error = 0;

	if ((*vp)->v_op == union_vnodeop_p) {
		struct vnode *lvp;

		lvp = union_dircache(*vp, td);
		if (lvp != NULLVP) {
			struct vattr va;

			/*
			 * If the directory is opaque,
			 * then don't show lower entries
			 */
			error = VOP_GETATTR(*vp, &va, fp->f_cred, td);
			if (error == 0 && (va.va_flags & OPAQUE)) {
				vput(lvp);
				lvp = NULLVP;
			}
		}

		if (lvp != NULLVP) {
			error = VOP_OPEN(lvp, FREAD, fp->f_cred, td);
			if (error == 0 && vn_canvmio(lvp) == TRUE)
				error = vfs_object_create(lvp, td, fp->f_cred);
			if (error) {
				vput(lvp);
				return (error);
			}
			VOP_UNLOCK(lvp, 0, td);
			FILE_LOCK(fp);
			fp->f_data = lvp;
			fp->f_offset = 0;
			FILE_UNLOCK(fp);
			error = vn_close(*vp, FREAD, fp->f_cred, td);
			if (error)
				return (error);
			*vp = lvp;
			return (-1);	/* goto unionread */
		}
	}
	return (error);
}

static int
union_modevent(module_t mod, int type, void *data)
{
	switch (type) {
	case MOD_LOAD:
		union_dircheckp = union_dircheck;
		break;
	case MOD_UNLOAD:
		union_dircheckp = NULL;
		break;
	default:
		break;
	}
	return (0);
}

static moduledata_t union_mod = {
	"union_dircheck",
	union_modevent,
	NULL
};

DECLARE_MODULE(union_dircheck, union_mod, SI_SUB_VFS, SI_ORDER_ANY);