/*
 * Copyright (c) 1994 Jan-Simon Pendry
 * Copyright (c) 1994
 *	The Regents of the University of California.  All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * Jan-Simon Pendry.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)union_subr.c	8.20 (Berkeley) 5/20/95
 * $Id: union_subr.c,v 1.20 1997/08/14 03:57:46 kato Exp $
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/vnode.h>
#include <sys/namei.h>
#include <sys/malloc.h>
#include <sys/fcntl.h>
#include <sys/filedesc.h>
#include <sys/mount.h>
#include <sys/stat.h>
#include <vm/vm.h>
#include <vm/vm_extern.h>	/* for vnode_pager_setsize */
#include <miscfs/union/union.h>

#include <sys/proc.h>

extern int	union_init __P((void));

/* must be power of two, otherwise change UNION_HASH() */
#define NHASH 32

/* unsigned int ... */
#define UNION_HASH(u, l) \
	(((((unsigned long) (u)) + ((unsigned long) (l))) >> 8) & (NHASH-1))
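
/*
 * Illustrative note (not from the original source): the hash sums the
 * two vnode addresses, shifts right by 8 to discard the low-order bits
 * (vnode addresses are pool-allocated, so those bits carry little
 * entropy), and masks with NHASH-1, which only works because NHASH is
 * a power of two.  For example, with uppervp == (struct vnode *)
 * 0xf0123400 and lowervp == NULLVP:
 *
 *	((0xf0123400 + 0) >> 8) & (NHASH-1)  ==  0xf01234 & 31  ==  20
 */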
static LIST_HEAD(unhead, union_node) unhead[NHASH];
static int unvplock[NHASH];

static void	union_dircache_r __P((struct vnode *vp, struct vnode ***vppp,
				      int *cntp));
static int	union_list_lock __P((int ix));
static void	union_list_unlock __P((int ix));
static int	union_relookup __P((struct union_mount *um, struct vnode *dvp,
				    struct vnode **vpp,
				    struct componentname *cnp,
				    struct componentname *cn, char *path,
				    int pathlen));
extern void	union_updatevp __P((struct union_node *un,
				    struct vnode *uppervp,
				    struct vnode *lowervp));

int
union_init()
{
	int i;

	for (i = 0; i < NHASH; i++)
		LIST_INIT(&unhead[i]);
	bzero((caddr_t) unvplock, sizeof(unvplock));
	return (0);
}

static int
union_list_lock(ix)
	int ix;
{

	if (unvplock[ix] & UN_LOCKED) {
		unvplock[ix] |= UN_WANT;
		(void) tsleep((caddr_t) &unvplock[ix], PINOD, "unllck", 0);
		return (1);
	}

	unvplock[ix] |= UN_LOCKED;

	return (0);
}

static void
union_list_unlock(ix)
	int ix;
{

	unvplock[ix] &= ~UN_LOCKED;

	if (unvplock[ix] & UN_WANT) {
		unvplock[ix] &= ~UN_WANT;
		wakeup((caddr_t) &unvplock[ix]);
	}
}
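
/*
 * Usage sketch for the two routines above (illustrative; the same
 * pattern appears in union_updatevp() below): union_list_lock()
 * returns 1 when it slept without acquiring the lock, so callers
 * must loop until it returns 0:
 *
 *	while (union_list_lock(hash))
 *		continue;
 *	... examine or modify unhead[hash] ...
 *	union_list_unlock(hash);
 */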

void
union_updatevp(un, uppervp, lowervp)
	struct union_node *un;
	struct vnode *uppervp;
	struct vnode *lowervp;
{
	int ohash = UNION_HASH(un->un_uppervp, un->un_lowervp);
	int nhash = UNION_HASH(uppervp, lowervp);
	int docache = (lowervp != NULLVP || uppervp != NULLVP);
	int lhash, uhash;

	/*
	 * Ensure locking is ordered from lower to higher
	 * to avoid deadlocks.
	 */
	if (nhash < ohash) {
		lhash = nhash;
		uhash = ohash;
	} else {
		lhash = ohash;
		uhash = nhash;
	}

	if (lhash != uhash)
		while (union_list_lock(lhash))
			continue;

	while (union_list_lock(uhash))
		continue;

	if (ohash != nhash || !docache) {
		if (un->un_flags & UN_CACHED) {
			un->un_flags &= ~UN_CACHED;
			LIST_REMOVE(un, un_cache);
		}
	}

	if (ohash != nhash)
		union_list_unlock(ohash);

	if (un->un_lowervp != lowervp) {
		if (un->un_lowervp) {
			vrele(un->un_lowervp);
			if (un->un_path) {
				free(un->un_path, M_TEMP);
				un->un_path = 0;
			}
			if (un->un_dirvp) {
				vrele(un->un_dirvp);
				un->un_dirvp = NULLVP;
			}
		}
		un->un_lowervp = lowervp;
		un->un_lowersz = VNOVAL;
	}

	if (un->un_uppervp != uppervp) {
		if (un->un_uppervp)
			vrele(un->un_uppervp);

		un->un_uppervp = uppervp;
		un->un_uppersz = VNOVAL;
	}

	if (docache && (ohash != nhash)) {
		LIST_INSERT_HEAD(&unhead[nhash], un, un_cache);
		un->un_flags |= UN_CACHED;
	}

	union_list_unlock(nhash);
}

void
union_newlower(un, lowervp)
	struct union_node *un;
	struct vnode *lowervp;
{

	union_updatevp(un, un->un_uppervp, lowervp);
}

void
union_newupper(un, uppervp)
	struct union_node *un;
	struct vnode *uppervp;
{

	union_updatevp(un, uppervp, un->un_lowervp);
}

/*
 * Keep track of size changes in the underlying vnodes.
 * If the size changes, then callback to the vm layer
 * giving priority to the upper layer size.
 */
void
union_newsize(vp, uppersz, lowersz)
	struct vnode *vp;
	off_t uppersz, lowersz;
{
	struct union_node *un;
	off_t sz;

	/* only interested in regular files */
	if (vp->v_type != VREG)
		return;

	un = VTOUNION(vp);
	sz = VNOVAL;

	if ((uppersz != VNOVAL) && (un->un_uppersz != uppersz)) {
		un->un_uppersz = uppersz;
		if (sz == VNOVAL)
			sz = un->un_uppersz;
	}

	if ((lowersz != VNOVAL) && (un->un_lowersz != lowersz)) {
		un->un_lowersz = lowersz;
		if (sz == VNOVAL)
			sz = un->un_lowersz;
	}

	if (sz != VNOVAL) {
#ifdef UNION_DIAGNOSTIC
		printf("union: %s size now %ld\n",
			uppersz != VNOVAL ? "upper" : "lower", (long) sz);
#endif
		vnode_pager_setsize(vp, sz);
	}
}
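
/*
 * Hypothetical caller sketch (not from this file): a vnode operation
 * that learns a fresh size for one layer, e.g. via VOP_GETATTR on the
 * upper vnode, would report it here and pass VNOVAL for the layer it
 * knows nothing about:
 *
 *	if (VOP_GETATTR(un->un_uppervp, &va, cred, p) == 0)
 *		union_newsize(vp, va.va_size, VNOVAL);
 */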

/*
 * allocate a union_node/vnode pair.  the vnode is
 * referenced and locked.  the new vnode is returned
 * via (vpp).  (mp) is the mountpoint of the union filesystem,
 * (dvp) is the parent directory where the upper layer object
 * should exist (but doesn't) and (cnp) is the componentname
 * information which is partially copied to allow the upper
 * layer object to be created at a later time.  (uppervp)
 * and (lowervp) reference the upper and lower layer objects
 * being mapped.  either, but not both, can be nil.
 * if supplied, (uppervp) is locked.
 * the references are either maintained in the newly allocated
 * union_node object, or they are vrele'd.
 *
 * all union_nodes are maintained on a hash of lists
 * (unhead[]).  new nodes are only allocated when they
 * cannot be found there.  entries are removed when the
 * vfs reclaim entry is called.
 *
 * a lock is kept per hash chain.  this is needed because
 * the getnewvnode() function can block waiting for a
 * vnode to become free, in which case there may be more
 * than one process trying to get the same vnode.  this
 * lock is only taken if we are going to call getnewvnode(),
 * since the kernel itself is single-threaded.
 *
 * if an entry is found on the list, then call vget() to
 * take a reference.  this is done because there may be
 * zero references to it and so it needs to be removed from
 * the vnode free list.
 */
int
union_allocvp(vpp, mp, undvp, dvp, cnp, uppervp, lowervp, docache)
	struct vnode **vpp;
	struct mount *mp;
	struct vnode *undvp;		/* parent union vnode */
	struct vnode *dvp;		/* may be null */
	struct componentname *cnp;	/* may be null */
	struct vnode *uppervp;		/* may be null */
	struct vnode *lowervp;		/* may be null */
	int docache;
{
	int error;
	struct union_node *un = 0;
	struct vnode *xlowervp = NULLVP;
	struct union_mount *um = MOUNTTOUNIONMOUNT(mp);
	int hash;
	int vflag;
	int try;

	if (uppervp == NULLVP && lowervp == NULLVP)
		panic("union: unidentifiable allocation");

	if (uppervp && lowervp && (uppervp->v_type != lowervp->v_type)) {
		xlowervp = lowervp;
		lowervp = NULLVP;
	}

	/* detect the root vnode (and aliases) */
	vflag = 0;
	if ((uppervp == um->um_uppervp) &&
	    ((lowervp == NULLVP) || lowervp == um->um_lowervp)) {
		if (lowervp == NULLVP) {
			lowervp = um->um_lowervp;
			if (lowervp != NULLVP)
				VREF(lowervp);
		}
		vflag = VROOT;
	}

loop:
	if (!docache) {
		un = 0;
	} else for (try = 0; try < 3; try++) {
		switch (try) {
		case 0:
			if (lowervp == NULLVP)
				continue;
			hash = UNION_HASH(uppervp, lowervp);
			break;

		case 1:
			if (uppervp == NULLVP)
				continue;
			hash = UNION_HASH(uppervp, NULLVP);
			break;

		case 2:
			if (lowervp == NULLVP)
				continue;
			hash = UNION_HASH(NULLVP, lowervp);
			break;
		}

		while (union_list_lock(hash))
			continue;

		for (un = unhead[hash].lh_first; un != 0;
					un = un->un_cache.le_next) {
			if ((un->un_lowervp == lowervp ||
			     un->un_lowervp == NULLVP) &&
			    (un->un_uppervp == uppervp ||
			     un->un_uppervp == NULLVP) &&
			    (UNIONTOV(un)->v_mount == mp)) {
				if (vget(UNIONTOV(un), 0,
				    cnp ? cnp->cn_proc : NULL)) {
					union_list_unlock(hash);
					goto loop;
				}
				break;
			}
		}

		union_list_unlock(hash);

		if (un)
			break;
	}

	if (un) {
		/*
		 * Obtain a lock on the union_node.
		 * uppervp is locked, though un->un_uppervp
		 * may not be.  this doesn't break the locking
		 * hierarchy since in the case that un->un_uppervp
		 * is not yet locked it will be vrele'd and replaced
		 * with uppervp.
		 */

		if ((dvp != NULLVP) && (uppervp == dvp)) {
			/*
			 * Access ``.'', so (un) will already
			 * be locked.  Since this process has
			 * the lock on (uppervp) no other
			 * process can hold the lock on (un).
			 */
#ifdef DIAGNOSTIC
			if ((un->un_flags & UN_LOCKED) == 0)
				panic("union: . not locked");
			else if (curproc && un->un_pid != curproc->p_pid &&
				    un->un_pid > -1 && curproc->p_pid > -1)
				panic("union: allocvp not lock owner");
#endif
		} else {
			if (un->un_flags & UN_LOCKED) {
				vrele(UNIONTOV(un));
				un->un_flags |= UN_WANT;
				(void) tsleep((caddr_t) &un->un_flags, PINOD, "unalvp", 0);
				goto loop;
			}
			un->un_flags |= UN_LOCKED;

#ifdef DIAGNOSTIC
			if (curproc)
				un->un_pid = curproc->p_pid;
			else
				un->un_pid = -1;
#endif
		}

		/*
		 * At this point, the union_node is locked,
		 * un->un_uppervp may not be locked, and uppervp
		 * is locked or nil.
		 */

		/*
		 * Save information about the upper layer.
		 */
		if (uppervp != un->un_uppervp) {
			union_newupper(un, uppervp);
		} else if (uppervp) {
			vrele(uppervp);
		}

		if (un->un_uppervp) {
			un->un_flags |= UN_ULOCK;
			un->un_flags &= ~UN_KLOCK;
		}

		/*
		 * Save information about the lower layer.
		 * This needs to keep track of pathname
		 * and directory information which union_vn_create
		 * might need.
		 */
		if (lowervp != un->un_lowervp) {
			union_newlower(un, lowervp);
			if (cnp && (lowervp != NULLVP)) {
				un->un_hash = cnp->cn_hash;
				un->un_path = malloc(cnp->cn_namelen+1,
						M_TEMP, M_WAITOK);
				bcopy(cnp->cn_nameptr, un->un_path,
						cnp->cn_namelen);
				un->un_path[cnp->cn_namelen] = '\0';
				VREF(dvp);
				un->un_dirvp = dvp;
			}
		} else if (lowervp) {
			vrele(lowervp);
		}
		*vpp = UNIONTOV(un);
		return (0);
	}

	if (docache) {
		/*
		 * otherwise lock the vp list while we call getnewvnode
		 * since that can block.
		 */
		hash = UNION_HASH(uppervp, lowervp);

		if (union_list_lock(hash))
			goto loop;
	}

	error = getnewvnode(VT_UNION, mp, union_vnodeop_p, vpp);
	if (error) {
		if (uppervp) {
			if (dvp == uppervp)
				vrele(uppervp);
			else
				vput(uppervp);
		}
		if (lowervp)
			vrele(lowervp);

		goto out;
	}

	MALLOC((*vpp)->v_data, void *, sizeof(struct union_node),
		M_TEMP, M_WAITOK);

	(*vpp)->v_flag |= vflag;
	if (uppervp)
		(*vpp)->v_type = uppervp->v_type;
	else
		(*vpp)->v_type = lowervp->v_type;
	un = VTOUNION(*vpp);
	un->un_vnode = *vpp;
	un->un_uppervp = uppervp;
	un->un_uppersz = VNOVAL;
	un->un_lowervp = lowervp;
	un->un_lowersz = VNOVAL;
	un->un_pvp = undvp;
	if (undvp != NULLVP)
		VREF(undvp);
	un->un_dircache = 0;
	un->un_openl = 0;
	un->un_flags = UN_LOCKED;
	if (un->un_uppervp)
		un->un_flags |= UN_ULOCK;
#ifdef DIAGNOSTIC
	if (curproc)
		un->un_pid = curproc->p_pid;
	else
		un->un_pid = -1;
#endif
	if (cnp && (lowervp != NULLVP)) {
		un->un_hash = cnp->cn_hash;
		un->un_path = malloc(cnp->cn_namelen+1, M_TEMP, M_WAITOK);
		bcopy(cnp->cn_nameptr, un->un_path, cnp->cn_namelen);
		un->un_path[cnp->cn_namelen] = '\0';
		VREF(dvp);
		un->un_dirvp = dvp;
	} else {
		un->un_hash = 0;
		un->un_path = 0;
		un->un_dirvp = 0;
	}

	if (docache) {
		LIST_INSERT_HEAD(&unhead[hash], un, un_cache);
		un->un_flags |= UN_CACHED;
	}

	if (xlowervp)
		vrele(xlowervp);

out:
	if (docache)
		union_list_unlock(hash);

	return (error);
}
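
/*
 * Caller sketch for union_allocvp() (hypothetical and simplified;
 * `upperdvp' names the upper-layer parent directory, per the (dvp)
 * argument described above):
 *
 *	error = union_allocvp(&vp, mp, undvp, upperdvp, cnp,
 *			uppervp, lowervp, 1);
 *	if (error)
 *		return (error);
 *	*ap->a_vpp = vp;	(vp comes back locked and referenced)
 */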

int
union_freevp(vp)
	struct vnode *vp;
{
	struct union_node *un = VTOUNION(vp);

	if (un->un_flags & UN_CACHED) {
		un->un_flags &= ~UN_CACHED;
		LIST_REMOVE(un, un_cache);
	}

	if (un->un_pvp != NULLVP)
		vrele(un->un_pvp);
	if (un->un_uppervp != NULLVP)
		vrele(un->un_uppervp);
	if (un->un_lowervp != NULLVP)
		vrele(un->un_lowervp);
	if (un->un_dirvp != NULLVP)
		vrele(un->un_dirvp);
	if (un->un_path)
		free(un->un_path, M_TEMP);

	FREE(vp->v_data, M_TEMP);
	vp->v_data = 0;

	return (0);
}

/*
 * copyfile.  copy the vnode (fvp) to the vnode (tvp)
 * using a sequence of reads and writes.  both (fvp)
 * and (tvp) are locked on entry and exit.
 */
int
union_copyfile(fvp, tvp, cred, p)
	struct vnode *fvp;
	struct vnode *tvp;
	struct ucred *cred;
	struct proc *p;
{
	char *buf;
	struct uio uio;
	struct iovec iov;
	int error = 0;

	/*
	 * strategy:
	 * allocate a buffer of size MAXBSIZE.
	 * loop doing reads and writes, keeping track
	 * of the current uio offset.
	 * give up at the first sign of trouble.
	 */

	uio.uio_procp = p;
	uio.uio_segflg = UIO_SYSSPACE;
	uio.uio_offset = 0;

	VOP_UNLOCK(fvp, 0, p);				/* XXX */
	VOP_LEASE(fvp, p, cred, LEASE_READ);
	vn_lock(fvp, LK_EXCLUSIVE | LK_RETRY, p);	/* XXX */
	VOP_UNLOCK(tvp, 0, p);				/* XXX */
	VOP_LEASE(tvp, p, cred, LEASE_WRITE);
	vn_lock(tvp, LK_EXCLUSIVE | LK_RETRY, p);	/* XXX */

	buf = malloc(MAXBSIZE, M_TEMP, M_WAITOK);

	/* ugly loop follows... */
	do {
		off_t offset = uio.uio_offset;

		uio.uio_iov = &iov;
		uio.uio_iovcnt = 1;
		iov.iov_base = buf;
		iov.iov_len = MAXBSIZE;
		uio.uio_resid = iov.iov_len;
		uio.uio_rw = UIO_READ;
		error = VOP_READ(fvp, &uio, 0, cred);

		if (error == 0) {
			uio.uio_iov = &iov;
			uio.uio_iovcnt = 1;
			iov.iov_base = buf;
			iov.iov_len = MAXBSIZE - uio.uio_resid;
			uio.uio_offset = offset;
			uio.uio_rw = UIO_WRITE;
			uio.uio_resid = iov.iov_len;

			if (uio.uio_resid == 0)
				break;

			do {
				error = VOP_WRITE(tvp, &uio, 0, cred);
			} while ((uio.uio_resid > 0) && (error == 0));
		}

	} while (error == 0);

	free(buf, M_TEMP);
	return (error);
}

/*
 * (un) is assumed to be locked on entry and remains
 * locked on exit.
 */
int
union_copyup(un, docopy, cred, p)
	struct union_node *un;
	int docopy;
	struct ucred *cred;
	struct proc *p;
{
	int error;
	struct vnode *lvp, *uvp;

	/*
	 * If the user does not have read permission, the vnode should not
	 * be copied to upper layer.
	 */
	vn_lock(un->un_lowervp, LK_EXCLUSIVE | LK_RETRY, p);
	error = VOP_ACCESS(un->un_lowervp, VREAD, cred, p);
	VOP_UNLOCK(un->un_lowervp, 0, p);
	if (error)
		return (error);

	error = union_vn_create(&uvp, un, p);
	if (error)
		return (error);

	/* at this point, uppervp is locked */
	union_newupper(un, uvp);
	un->un_flags |= UN_ULOCK;

	lvp = un->un_lowervp;

	if (docopy) {
		/*
		 * XXX - should not ignore errors
		 * from VOP_CLOSE
		 */
		vn_lock(lvp, LK_EXCLUSIVE | LK_RETRY, p);
		error = VOP_OPEN(lvp, FREAD, cred, p);
		if (error == 0) {
			error = union_copyfile(lvp, uvp, cred, p);
			VOP_UNLOCK(lvp, 0, p);
			(void) VOP_CLOSE(lvp, FREAD, cred, p);
		}
#ifdef UNION_DIAGNOSTIC
		if (error == 0)
			uprintf("union: copied up %s\n", un->un_path);
#endif

	}
	un->un_flags &= ~UN_ULOCK;
	VOP_UNLOCK(uvp, 0, p);
	union_vn_close(uvp, FWRITE, cred, p);
	vn_lock(uvp, LK_EXCLUSIVE | LK_RETRY, p);
	un->un_flags |= UN_ULOCK;

	/*
	 * Subsequent IOs will go to the top layer, so
	 * call close on the lower vnode and open on the
	 * upper vnode to ensure that the filesystem keeps
	 * its reference counts right.  This doesn't do
	 * the right thing with (cred) and (FREAD) though.
	 * Ignoring error returns is not right, either.
	 */
	if (error == 0) {
		int i;

		for (i = 0; i < un->un_openl; i++) {
			(void) VOP_CLOSE(lvp, FREAD, cred, p);
			(void) VOP_OPEN(uvp, FREAD, cred, p);
		}
		un->un_openl = 0;
	}

	return (error);
}
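
/*
 * Caller sketch (hypothetical, simplified): an operation that is about
 * to modify the file through the union layer, open-for-write say,
 * would copy the lower object up first if no upper object exists:
 *
 *	if (un->un_uppervp == NULLVP && (mode & FWRITE)) {
 *		error = union_copyup(un, 1, cred, p);
 *		if (error)
 *			return (error);
 *	}
 */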

static int
union_relookup(um, dvp, vpp, cnp, cn, path, pathlen)
	struct union_mount *um;
	struct vnode *dvp;
	struct vnode **vpp;
	struct componentname *cnp;
	struct componentname *cn;
	char *path;
	int pathlen;
{
	int error;

	/*
	 * A new componentname structure must be faked up because
	 * there is no way to know where the upper level cnp came
	 * from or what it is being used for.  This must duplicate
	 * some of the work done by NDINIT, some of the work done
	 * by namei, some of the work done by lookup and some of
	 * the work done by VOP_LOOKUP when given a CREATE flag.
	 * Conclusion: Horrible.
	 *
	 * The pathname buffer will be FREEed by VOP_MKDIR.
	 */
	cn->cn_namelen = pathlen;
	cn->cn_pnbuf = zalloc(namei_zone);
	bcopy(path, cn->cn_pnbuf, cn->cn_namelen);
	cn->cn_pnbuf[cn->cn_namelen] = '\0';

	cn->cn_nameiop = CREATE;
	cn->cn_flags = (LOCKPARENT|HASBUF|SAVENAME|SAVESTART|ISLASTCN);
	cn->cn_proc = cnp->cn_proc;
	if (um->um_op == UNMNT_ABOVE)
		cn->cn_cred = cnp->cn_cred;
	else
		cn->cn_cred = um->um_cred;
	cn->cn_nameptr = cn->cn_pnbuf;
	cn->cn_hash = cnp->cn_hash;
	cn->cn_consume = cnp->cn_consume;

	VREF(dvp);
	error = relookup(dvp, vpp, cn);
	if (!error)
		vrele(dvp);
	else {
		zfree(namei_zone, cn->cn_pnbuf);
		cn->cn_pnbuf = NULL;
	}

	return (error);
}

/*
 * Create a shadow directory in the upper layer.
 * The new vnode is returned locked.
 *
 * (um) points to the union mount structure for access to
 * the mounting process's credentials.
 * (dvp) is the directory in which to create the shadow directory.
 * it is unlocked on entry and exit.
 * (cnp) is the componentname to be created.
 * (vpp) is the returned newly created shadow directory, which
 * is returned locked.
 */
int
union_mkshadow(um, dvp, cnp, vpp)
	struct union_mount *um;
	struct vnode *dvp;
	struct componentname *cnp;
	struct vnode **vpp;
{
	int error;
	struct vattr va;
	struct proc *p = cnp->cn_proc;
	struct componentname cn;

	error = union_relookup(um, dvp, vpp, cnp, &cn,
			cnp->cn_nameptr, cnp->cn_namelen);
	if (error)
		return (error);

	if (*vpp) {
		VOP_ABORTOP(dvp, &cn);
		VOP_UNLOCK(dvp, 0, p);
		vrele(*vpp);
		*vpp = NULLVP;
		return (EEXIST);
	}

	/*
	 * policy: when creating the shadow directory in the
	 * upper layer, create it owned by the user who did
	 * the mount, group from parent directory, and mode
	 * 777 modified by umask (ie mostly identical to the
	 * mkdir syscall).  (jsp, kb)
	 */

	VATTR_NULL(&va);
	va.va_type = VDIR;
	va.va_mode = um->um_cmode;

	/* VOP_LEASE: dvp is locked */
	VOP_LEASE(dvp, p, cn.cn_cred, LEASE_WRITE);

	error = VOP_MKDIR(dvp, vpp, &cn, &va);
	return (error);
}
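
/*
 * Caller sketch (hypothetical, simplified; `upperdvp' is the upper
 * layer parent directory): when a lookup finds a directory only in
 * the lower layer, a shadow directory is created in the upper layer
 * so that later upper-layer objects have somewhere to live:
 *
 *	if (uppervp == NULLVP && lowervp->v_type == VDIR) {
 *		error = union_mkshadow(um, upperdvp, cnp, &uppervp);
 *		if (error)
 *			return (error);
 *	}
 */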

/*
 * Create a whiteout entry in the upper layer.
 *
 * (um) points to the union mount structure for access to
 * the mounting process's credentials.
 * (dvp) is the directory in which to create the whiteout.
 * it is locked on entry and exit.
 * (cnp) is the componentname to be created.
 */
int
union_mkwhiteout(um, dvp, cnp, path)
	struct union_mount *um;
	struct vnode *dvp;
	struct componentname *cnp;
	char *path;
{
	int error;
	struct proc *p = cnp->cn_proc;
	struct vnode *wvp;
	struct componentname cn;

	VOP_UNLOCK(dvp, 0, p);
	error = union_relookup(um, dvp, &wvp, cnp, &cn, path, strlen(path));
	if (error) {
		vn_lock(dvp, LK_EXCLUSIVE | LK_RETRY, p);
		return (error);
	}

	if (wvp) {
		VOP_ABORTOP(dvp, &cn);
		vrele(dvp);
		vrele(wvp);
		return (EEXIST);
	}

	/* VOP_LEASE: dvp is locked */
	VOP_LEASE(dvp, p, p->p_ucred, LEASE_WRITE);

	error = VOP_WHITEOUT(dvp, &cn, CREATE);
	if (error)
		VOP_ABORTOP(dvp, &cn);

	vrele(dvp);

	return (error);
}
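
/*
 * Caller sketch (hypothetical, simplified): a remove of an object
 * which also exists in the lower layer must leave a whiteout behind,
 * otherwise the lower object would show through again:
 *
 *	if (union_dowhiteout(un, cnp->cn_cred, p))
 *		error = union_mkwhiteout(um, upperdvp, cnp, un->un_path);
 */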

/*
 * union_vn_create: creates and opens a new shadow file
 * on the upper union layer.  this function is similar
 * in spirit to calling vn_open but it avoids calling namei().
 * the problem with calling namei is that a) it locks too many
 * things, and b) it doesn't start at the "right" directory,
 * whereas relookup is told where to start.
 */
int
union_vn_create(vpp, un, p)
	struct vnode **vpp;
	struct union_node *un;
	struct proc *p;
{
	struct vnode *vp;
	struct ucred *cred = p->p_ucred;
	struct vattr vat;
	struct vattr *vap = &vat;
	int fmode = FFLAGS(O_WRONLY|O_CREAT|O_TRUNC|O_EXCL);
	int error;
	int cmode = UN_FILEMODE & ~p->p_fd->fd_cmask;
	struct componentname cn;

	*vpp = NULLVP;

	/*
	 * Build a new componentname structure (for the same
	 * reasons outlined in union_mkshadow).
	 * The difference here is that the file is owned by
	 * the current user, rather than by the person who
	 * did the mount, since the current user needs to be
	 * able to write the file (that's why it is being
	 * copied in the first place).
	 */
	cn.cn_namelen = strlen(un->un_path);
	cn.cn_pnbuf = zalloc(namei_zone);
	bcopy(un->un_path, cn.cn_pnbuf, cn.cn_namelen+1);
	cn.cn_nameiop = CREATE;
	cn.cn_flags = (LOCKPARENT|HASBUF|SAVENAME|SAVESTART|ISLASTCN);
	cn.cn_proc = p;
	cn.cn_cred = p->p_ucred;
	cn.cn_nameptr = cn.cn_pnbuf;
	cn.cn_hash = un->un_hash;
	cn.cn_consume = 0;

	VREF(un->un_dirvp);
	error = relookup(un->un_dirvp, &vp, &cn);
	if (error)
		return (error);
	vrele(un->un_dirvp);

	if (vp) {
		VOP_ABORTOP(un->un_dirvp, &cn);
		if (un->un_dirvp == vp)
			vrele(un->un_dirvp);
		else
			vput(un->un_dirvp);
		vrele(vp);
		return (EEXIST);
	}

	/*
	 * Good - there was no race to create the file
	 * so go ahead and create it.  The permissions
	 * on the file will be 0666 modified by the
	 * current user's umask.  Access to the file, while
	 * it is unioned, will require access to the top *and*
	 * bottom files.  Access when not unioned will simply
	 * require access to the top-level file.
	 * TODO: confirm choice of access permissions.
	 */
	VATTR_NULL(vap);
	vap->va_type = VREG;
	vap->va_mode = cmode;
	VOP_LEASE(un->un_dirvp, p, cred, LEASE_WRITE);
	error = VOP_CREATE(un->un_dirvp, &vp, &cn, vap);
	if (error)
		return (error);

	error = VOP_OPEN(vp, fmode, cred, p);
	if (error) {
		vput(vp);
		return (error);
	}

	vp->v_writecount++;
	*vpp = vp;
	return (0);
}

int
union_vn_close(vp, fmode, cred, p)
	struct vnode *vp;
	int fmode;
	struct ucred *cred;
	struct proc *p;
{

	if (fmode & FWRITE)
		--vp->v_writecount;
	return (VOP_CLOSE(vp, fmode, cred, p));
}

void
union_removed_upper(un)
	struct union_node *un;
{
	struct proc *p = curproc;	/* XXX */
	struct vnode **vpp;

	/*
	 * Do not set the uppervp to NULLVP.  If lowervp is NULLVP,
	 * the union node will have neither uppervp nor lowervp.  We
	 * remove the union node from the cache, so that it will not
	 * be referenced.
	 */
989 	union_newupper(un, NULLVP);
990 #endif
991 	if (un->un_dircache != 0) {
992 		for (vpp = un->un_dircache; *vpp != NULLVP; vpp++)
993 			vrele(*vpp);
994 		free(un->un_dircache, M_TEMP);
995 		un->un_dircache = 0;
996 	}
997 
998 	if (un->un_flags & UN_CACHED) {
999 		un->un_flags &= ~UN_CACHED;
1000 		LIST_REMOVE(un, un_cache);
1001 	}
1002 
1003 	if (un->un_flags & UN_ULOCK) {
1004 		un->un_flags &= ~UN_ULOCK;
1005 		VOP_UNLOCK(un->un_uppervp, 0, p);
1006 	}
1007 }

#if 0
struct vnode *
union_lowervp(vp)
	struct vnode *vp;
{
	struct union_node *un = VTOUNION(vp);

	if ((un->un_lowervp != NULLVP) &&
	    (vp->v_type == un->un_lowervp->v_type)) {
		if (vget(un->un_lowervp, 0) == 0)
			return (un->un_lowervp);
	}

	return (NULLVP);
}
#endif

/*
 * determine whether a whiteout is needed
 * during a remove/rmdir operation.
 */
int
union_dowhiteout(un, cred, p)
	struct union_node *un;
	struct ucred *cred;
	struct proc *p;
{
	struct vattr va;

	if (un->un_lowervp != NULLVP)
		return (1);

	if (VOP_GETATTR(un->un_uppervp, &va, cred, p) == 0 &&
	    (va.va_flags & OPAQUE))
		return (1);

	return (0);
}
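
/*
 * Caller sketch (hypothetical, simplified): remove/rmdir code consults
 * this predicate and, when it answers yes, arranges for the underlying
 * filesystem to leave a whiteout, e.g. by setting DOWHITEOUT in the
 * componentname flags before calling down:
 *
 *	if (union_dowhiteout(un, cnp->cn_cred, p))
 *		cnp->cn_flags |= DOWHITEOUT;
 *	error = VOP_REMOVE(upperdvp, uppervp, cnp);
 */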

static void
union_dircache_r(vp, vppp, cntp)
	struct vnode *vp;
	struct vnode ***vppp;
	int *cntp;
{
	struct union_node *un;

	if (vp->v_op != union_vnodeop_p) {
		if (vppp) {
			VREF(vp);
			*(*vppp)++ = vp;
			if (--(*cntp) == 0)
				panic("union: dircache table too small");
		} else {
			(*cntp)++;
		}

		return;
	}

	un = VTOUNION(vp);
	if (un->un_uppervp != NULLVP)
		union_dircache_r(un->un_uppervp, vppp, cntp);
	if (un->un_lowervp != NULLVP)
		union_dircache_r(un->un_lowervp, vppp, cntp);
}

struct vnode *
union_dircache(vp, p)
	struct vnode *vp;
	struct proc *p;
{
	int cnt;
	struct vnode *nvp;
	struct vnode **vpp;
	struct vnode **dircache;
	struct union_node *un;
	int error;

	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p);
	dircache = VTOUNION(vp)->un_dircache;

	nvp = NULLVP;

	if (dircache == 0) {
		cnt = 0;
		union_dircache_r(vp, 0, &cnt);
		cnt++;
		dircache = (struct vnode **)
				malloc(cnt * sizeof(struct vnode *),
					M_TEMP, M_WAITOK);
		vpp = dircache;
		union_dircache_r(vp, &vpp, &cnt);
		*vpp = NULLVP;
		vpp = dircache + 1;
	} else {
		vpp = dircache;
		do {
			if (*vpp++ == VTOUNION(vp)->un_uppervp)
				break;
		} while (*vpp != NULLVP);
	}

	if (*vpp == NULLVP)
		goto out;

	vn_lock(*vpp, LK_EXCLUSIVE | LK_RETRY, p);
	VREF(*vpp);
	error = union_allocvp(&nvp, vp->v_mount, NULLVP, NULLVP, 0, *vpp, NULLVP, 0);
	if (error)
		goto out;

	VTOUNION(vp)->un_dircache = 0;
	un = VTOUNION(nvp);
	un->un_dircache = dircache;

out:
	VOP_UNLOCK(vp, 0, p);
	return (nvp);
}
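
/*
 * Caller sketch (hypothetical, simplified): directory reads over a
 * union mount can use the dircache to visit each layer's directory
 * vnode in turn; the returned vnode is locked, like any result of
 * union_allocvp():
 *
 *	struct vnode *dvp = union_dircache(vp, p);
 *	if (dvp != NULLVP) {
 *		... read entries through dvp rather than vp ...
 *		VOP_UNLOCK(dvp, 0, p);
 *	}
 */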
1129