xref: /titanic_52/usr/src/uts/common/fs/nfs/nfs4_client.c (revision 1fac5a6088d9f8a16d0a302d57227a80031f002d)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License, Version 1.0 only
6  * (the "License").  You may not use this file except in compliance
7  * with the License.
8  *
9  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10  * or http://www.opensolaris.org/os/licensing.
11  * See the License for the specific language governing permissions
12  * and limitations under the License.
13  *
14  * When distributing Covered Code, include this CDDL HEADER in each
15  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16  * If applicable, add the following below this CDDL HEADER, with the
17  * fields enclosed by brackets "[]" replaced with your own identifying
18  * information: Portions Copyright [yyyy] [name of copyright owner]
19  *
20  * CDDL HEADER END
21  */
22 /*
23  * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 /*
28  *  	Copyright (c) 1983,1984,1985,1986,1987,1988,1989  AT&T.
29  *	All Rights Reserved
30  */
31 
32 #pragma ident	"%Z%%M%	%I%	%E% SMI"
33 
34 #include <sys/param.h>
35 #include <sys/types.h>
36 #include <sys/systm.h>
37 #include <sys/thread.h>
38 #include <sys/t_lock.h>
39 #include <sys/time.h>
40 #include <sys/vnode.h>
41 #include <sys/vfs.h>
42 #include <sys/errno.h>
43 #include <sys/buf.h>
44 #include <sys/stat.h>
45 #include <sys/cred.h>
46 #include <sys/kmem.h>
47 #include <sys/debug.h>
48 #include <sys/dnlc.h>
49 #include <sys/vmsystm.h>
50 #include <sys/flock.h>
51 #include <sys/share.h>
52 #include <sys/cmn_err.h>
53 #include <sys/tiuser.h>
54 #include <sys/sysmacros.h>
55 #include <sys/callb.h>
56 #include <sys/acl.h>
57 #include <sys/kstat.h>
58 #include <sys/signal.h>
59 #include <sys/disp.h>
60 #include <sys/atomic.h>
61 #include <sys/list.h>
62 #include <sys/sdt.h>
63 
64 #include <rpc/types.h>
65 #include <rpc/xdr.h>
66 #include <rpc/auth.h>
67 #include <rpc/clnt.h>
68 
69 #include <nfs/nfs.h>
70 #include <nfs/nfs_clnt.h>
71 #include <nfs/nfs_acl.h>
72 
73 #include <nfs/nfs4.h>
74 #include <nfs/rnode4.h>
75 #include <nfs/nfs4_clnt.h>
76 
77 #include <vm/hat.h>
78 #include <vm/as.h>
79 #include <vm/page.h>
80 #include <vm/pvn.h>
81 #include <vm/seg.h>
82 #include <vm/seg_map.h>
83 #include <vm/seg_vn.h>
84 
85 #include <sys/ddi.h>
86 
87 /*
88  * Arguments to page-flush thread.
89  */
typedef struct {
	vnode_t *vp;	/* vnode to flush; held by the creator, released by the thread */
	cred_t *cr;	/* credentials passed on to flush_pages(); likewise held */
} pgflush_t;
94 
#ifdef DEBUG
/*
 * Debug-only variables (compiled in only when DEBUG is defined).
 * NOTE(review): presumably each flag gates the debug output its name
 * suggests -- confirm against the users of these variables.
 */
int nfs4_client_lease_debug;
int nfs4_sharedfh_debug;
int nfs4_fname_debug;

/* temporary: panic if v_type is inconsistent with r_attr va_type */
int nfs4_vtype_debug;

/* TSD key; nfs4_purge_stale_fh() asserts that its value is NULL */
uint_t nfs4_tsd_key;
#endif
105 
/*
 * NOTE(review): presumably bookkeeping for the CPR callback declared
 * below (time of the last resume and the callback's id) -- confirm
 * where these are set.
 */
static time_t	nfs4_client_resumed = 0;
static	callb_id_t cid = 0;

/* Forward declarations of routines private to this file. */
static int	nfs4renew(nfs4_server_t *);
static void	nfs4_attrcache_va(vnode_t *, nfs4_ga_res_t *, int);
static void	nfs4_pgflush_thread(pgflush_t *);
static void	flush_pages(vnode_t *, cred_t *);

static boolean_t nfs4_client_cpr_callb(void *, int);
115 
/*
 * Per-zone bookkeeping for NFS v4 mounts.
 */
struct mi4_globals {
	kmutex_t	mig_lock;  /* lock protecting mig_list */
	list_t		mig_list;  /* list of NFS v4 mounts in zone */
	/* NOTE(review): presumably set once the zone destructor ran -- confirm */
	boolean_t	mig_destructor_called;
};

/*
 * NOTE(review): presumably the zone key under which a struct mi4_globals
 * is registered -- confirm against the zone_key_create() caller.
 */
static zone_key_t mi4_list_key;
123 
124 /*
125  * Attributes caching:
126  *
127  * Attributes are cached in the rnode in struct vattr form.
128  * There is a time associated with the cached attributes (r_time_attr_inval)
129  * which tells whether the attributes are valid. The time is initialized
130  * to the difference between current time and the modify time of the vnode
131  * when new attributes are cached. This allows the attributes for
132  * files that have changed recently to be timed out sooner than for files
133  * that have not changed for a long time. There are minimum and maximum
134  * timeout values that can be set per mount point.
135  */
136 
137 /*
138  * If a cache purge is in progress, wait for it to finish.
139  *
140  * The current thread must not be in the middle of an
141  * nfs4_start_op/nfs4_end_op region.  Otherwise, there could be a deadlock
142  * between this thread, a recovery thread, and the page flush thread.
143  */
144 int
145 nfs4_waitfor_purge_complete(vnode_t *vp)
146 {
147 	rnode4_t *rp;
148 	k_sigset_t smask;
149 
150 	rp = VTOR4(vp);
151 	if ((rp->r_serial != NULL && rp->r_serial != curthread) ||
152 	    ((rp->r_flags & R4PGFLUSH) && rp->r_pgflush != curthread)) {
153 		mutex_enter(&rp->r_statelock);
154 		sigintr(&smask, VTOMI4(vp)->mi_flags & MI4_INT);
155 		while ((rp->r_serial != NULL && rp->r_serial != curthread) ||
156 		    ((rp->r_flags & R4PGFLUSH) &&
157 		    rp->r_pgflush != curthread)) {
158 			if (!cv_wait_sig(&rp->r_cv, &rp->r_statelock)) {
159 				sigunintr(&smask);
160 				mutex_exit(&rp->r_statelock);
161 				return (EINTR);
162 			}
163 		}
164 		sigunintr(&smask);
165 		mutex_exit(&rp->r_statelock);
166 	}
167 	return (0);
168 }
169 
170 /*
171  * Validate caches by checking cached attributes. If they have timed out,
172  * then get new attributes from the server.  As a side effect, cache
173  * invalidation is done if the attributes have changed.
174  *
175  * If the attributes have not timed out and if there is a cache
176  * invalidation being done by some other thread, then wait until that
177  * thread has completed the cache invalidation.
178  */
179 int
180 nfs4_validate_caches(vnode_t *vp, cred_t *cr)
181 {
182 	int error;
183 	nfs4_ga_res_t gar;
184 
185 	if (ATTRCACHE4_VALID(vp)) {
186 		error = nfs4_waitfor_purge_complete(vp);
187 		if (error)
188 			return (error);
189 		return (0);
190 	}
191 
192 	gar.n4g_va.va_mask = AT_ALL;
193 	return (nfs4_getattr_otw(vp, &gar, cr, 0));
194 }
195 
196 /*
197  * Fill in attribute from the cache.
198  * If valid, then return 0 to indicate that no error occurred,
199  * otherwise return 1 to indicate that an error occurred.
200  */
201 static int
202 nfs4_getattr_cache(vnode_t *vp, struct vattr *vap)
203 {
204 	rnode4_t *rp;
205 
206 	rp = VTOR4(vp);
207 	mutex_enter(&rp->r_statelock);
208 	mutex_enter(&rp->r_statev4_lock);
209 	if (ATTRCACHE4_VALID(vp)) {
210 		mutex_exit(&rp->r_statev4_lock);
211 		/*
212 		 * Cached attributes are valid
213 		 */
214 		*vap = rp->r_attr;
215 		mutex_exit(&rp->r_statelock);
216 		return (0);
217 	}
218 	mutex_exit(&rp->r_statev4_lock);
219 	mutex_exit(&rp->r_statelock);
220 	return (1);
221 }
222 
223 
224 /*
225  * If returned error is ESTALE flush all caches.  The nfs4_purge_caches()
226  * call is synchronous because all the pages were invalidated by the
227  * nfs4_invalidate_pages() call.
228  */
229 void
230 nfs4_purge_stale_fh(int errno, vnode_t *vp, cred_t *cr)
231 {
232 	struct rnode4 *rp = VTOR4(vp);
233 
234 	/* Ensure that the ..._end_op() call has been done */
235 	ASSERT(tsd_get(nfs4_tsd_key) == NULL);
236 
237 	if (errno != ESTALE)
238 		return;
239 
240 	mutex_enter(&rp->r_statelock);
241 	rp->r_flags |= R4STALE;
242 	if (!rp->r_error)
243 		rp->r_error = errno;
244 	mutex_exit(&rp->r_statelock);
245 	if (nfs4_has_pages(vp))
246 		nfs4_invalidate_pages(vp, (u_offset_t)0, cr);
247 	nfs4_purge_caches(vp, NFS4_PURGE_DNLC, cr, FALSE);
248 }
249 
250 /*
251  * Purge all of the various NFS `data' caches.  If "asyncpg" is TRUE, the
252  * page purge is done asynchronously.
253  */
254 void
255 nfs4_purge_caches(vnode_t *vp, int purge_dnlc, cred_t *cr, int asyncpg)
256 {
257 	rnode4_t *rp;
258 	char *contents;
259 	vnode_t *xattr;
260 	int size;
261 	int pgflush;			/* are we the page flush thread? */
262 
263 	/*
264 	 * Purge the DNLC for any entries which refer to this file.
265 	 */
266 	if (vp->v_count > 1 &&
267 	    (vp->v_type == VDIR || purge_dnlc == NFS4_PURGE_DNLC))
268 		dnlc_purge_vp(vp);
269 
270 	/*
271 	 * Clear any readdir state bits and purge the readlink response cache.
272 	 */
273 	rp = VTOR4(vp);
274 	mutex_enter(&rp->r_statelock);
275 	rp->r_flags &= ~R4LOOKUP;
276 	contents = rp->r_symlink.contents;
277 	size = rp->r_symlink.size;
278 	rp->r_symlink.contents = NULL;
279 
280 	xattr = rp->r_xattr_dir;
281 	rp->r_xattr_dir = NULL;
282 
283 	/*
284 	 * Purge pathconf cache too.
285 	 */
286 	rp->r_pathconf.pc4_xattr_valid = 0;
287 	rp->r_pathconf.pc4_cache_valid = 0;
288 
289 	pgflush = (curthread == rp->r_pgflush);
290 	mutex_exit(&rp->r_statelock);
291 
292 	if (contents != NULL) {
293 
294 		kmem_free((void *)contents, size);
295 	}
296 
297 	if (xattr != NULL)
298 		VN_RELE(xattr);
299 
300 	/*
301 	 * Flush the page cache.  If the current thread is the page flush
302 	 * thread, don't initiate a new page flush.  There's no need for
303 	 * it, and doing it correctly is hard.
304 	 */
305 	if (nfs4_has_pages(vp) && !pgflush) {
306 		if (!asyncpg) {
307 			(void) nfs4_waitfor_purge_complete(vp);
308 			flush_pages(vp, cr);
309 		} else {
310 			pgflush_t *args;
311 
312 			/*
313 			 * We don't hold r_statelock while creating the
314 			 * thread, in case the call blocks.  So we use a
315 			 * flag to indicate that a page flush thread is
316 			 * active.
317 			 */
318 			mutex_enter(&rp->r_statelock);
319 			if (rp->r_flags & R4PGFLUSH) {
320 				mutex_exit(&rp->r_statelock);
321 			} else {
322 				rp->r_flags |= R4PGFLUSH;
323 				mutex_exit(&rp->r_statelock);
324 
325 				args = kmem_alloc(sizeof (pgflush_t),
326 						KM_SLEEP);
327 				args->vp = vp;
328 				VN_HOLD(args->vp);
329 				args->cr = cr;
330 				crhold(args->cr);
331 				(void) zthread_create(NULL, 0,
332 						nfs4_pgflush_thread, args, 0,
333 						minclsyspri);
334 			}
335 		}
336 	}
337 
338 	/*
339 	 * Flush the readdir response cache.
340 	 */
341 	nfs4_purge_rddir_cache(vp);
342 }
343 
344 /*
345  * Invalidate all pages for the given file, after writing back the dirty
346  * ones.
347  */
348 
349 static void
350 flush_pages(vnode_t *vp, cred_t *cr)
351 {
352 	int error;
353 	rnode4_t *rp = VTOR4(vp);
354 
355 	error = VOP_PUTPAGE(vp, (u_offset_t)0, 0, B_INVAL, cr);
356 	if (error == ENOSPC || error == EDQUOT) {
357 		mutex_enter(&rp->r_statelock);
358 		if (!rp->r_error)
359 			rp->r_error = error;
360 		mutex_exit(&rp->r_statelock);
361 	}
362 }
363 
364 /*
365  * Page flush thread.
366  */
367 
368 static void
369 nfs4_pgflush_thread(pgflush_t *args)
370 {
371 	rnode4_t *rp = VTOR4(args->vp);
372 
373 	/* remember which thread we are, so we don't deadlock ourselves */
374 	mutex_enter(&rp->r_statelock);
375 	ASSERT(rp->r_pgflush == NULL);
376 	rp->r_pgflush = curthread;
377 	mutex_exit(&rp->r_statelock);
378 
379 	flush_pages(args->vp, args->cr);
380 
381 	mutex_enter(&rp->r_statelock);
382 	rp->r_pgflush = NULL;
383 	rp->r_flags &= ~R4PGFLUSH;
384 	cv_broadcast(&rp->r_cv);
385 	mutex_exit(&rp->r_statelock);
386 
387 	VN_RELE(args->vp);
388 	crfree(args->cr);
389 	kmem_free(args, sizeof (pgflush_t));
390 	zthread_exit();
391 }
392 
393 /*
394  * Purge the readdir cache of all entries which are not currently
395  * being filled.
396  */
397 void
398 nfs4_purge_rddir_cache(vnode_t *vp)
399 {
400 	rnode4_t *rp;
401 
402 	rp = VTOR4(vp);
403 
404 	mutex_enter(&rp->r_statelock);
405 	rp->r_direof = NULL;
406 	rp->r_flags &= ~R4LOOKUP;
407 	rp->r_flags |= R4READDIRWATTR;
408 	rddir4_cache_purge(rp);
409 	mutex_exit(&rp->r_statelock);
410 }
411 
412 /*
413  * Set attributes cache for given vnode using virtual attributes.  There is
414  * no cache validation, but if the attributes are deemed to be stale, they
415  * are ignored.  This corresponds to nfs3_attrcache().
416  *
417  * Set the timeout value on the attribute cache and fill it
418  * with the passed in attributes.
419  */
420 void
421 nfs4_attrcache_noinval(vnode_t *vp, nfs4_ga_res_t *garp, hrtime_t t)
422 {
423 	rnode4_t *rp = VTOR4(vp);
424 
425 	mutex_enter(&rp->r_statelock);
426 	if (rp->r_time_attr_saved <= t)
427 		nfs4_attrcache_va(vp, garp, FALSE);
428 	mutex_exit(&rp->r_statelock);
429 }
430 
431 /*
432  * Use the passed in virtual attributes to check to see whether the
433  * data and metadata caches are valid, cache the new attributes, and
434  * then do the cache invalidation if required.
435  *
436  * The cache validation and caching of the new attributes is done
437  * atomically via the use of the mutex, r_statelock.  If required,
438  * the cache invalidation is done atomically w.r.t. the cache
439  * validation and caching of the attributes via the pseudo lock,
440  * r_serial.
441  *
442  * This routine is used to do cache validation and attributes caching
443  * for operations with a single set of post operation attributes.
444  */
445 
446 void
447 nfs4_attr_cache(vnode_t *vp, nfs4_ga_res_t *garp,
448 		hrtime_t t, cred_t *cr, int async,
449 		change_info4 *cinfo)
450 {
451 	rnode4_t *rp;
452 	int mtime_changed;
453 	int ctime_changed;
454 	vsecattr_t *vsp;
455 	int was_serial, set_time_cache_inval, recov;
456 	vattr_t *vap = &garp->n4g_va;
457 	mntinfo4_t *mi = VTOMI4(vp);
458 
459 	ASSERT(mi->mi_vfsp->vfs_dev == garp->n4g_va.va_fsid);
460 
461 	/* Is curthread the recovery thread? */
462 	mutex_enter(&mi->mi_lock);
463 	recov = (VTOMI4(vp)->mi_recovthread == curthread);
464 	mutex_exit(&mi->mi_lock);
465 
466 	rp = VTOR4(vp);
467 	mutex_enter(&rp->r_statelock);
468 	was_serial = (rp->r_serial == curthread);
469 	if (rp->r_serial && !was_serial) {
470 		klwp_t *lwp = ttolwp(curthread);
471 
472 		/*
473 		 * If we're the recovery thread, then purge current attrs
474 		 * and bail out to avoid potential deadlock between another
475 		 * thread caching attrs (r_serial thread), recov thread,
476 		 * and an async writer thread.
477 		 */
478 		if (recov) {
479 			PURGE_ATTRCACHE4_LOCKED(rp);
480 			mutex_exit(&rp->r_statelock);
481 			return;
482 		}
483 
484 		if (lwp != NULL)
485 			lwp->lwp_nostop++;
486 		while (rp->r_serial != NULL) {
487 			if (!cv_wait_sig(&rp->r_cv, &rp->r_statelock)) {
488 				mutex_exit(&rp->r_statelock);
489 				if (lwp != NULL)
490 					lwp->lwp_nostop--;
491 				return;
492 			}
493 		}
494 		if (lwp != NULL)
495 			lwp->lwp_nostop--;
496 	}
497 
498 	/*
499 	 * If there is a page flush thread, the current thread needs to
500 	 * bail out, to prevent a possible deadlock between the current
501 	 * thread (which might be in a start_op/end_op region), the
502 	 * recovery thread, and the page flush thread.  Expire the
503 	 * attribute cache, so that any attributes the current thread was
504 	 * going to set are not lost.
505 	 */
506 	if ((rp->r_flags & R4PGFLUSH) && rp->r_pgflush != curthread) {
507 		PURGE_ATTRCACHE4_LOCKED(rp);
508 		mutex_exit(&rp->r_statelock);
509 		return;
510 	}
511 
512 	if (rp->r_time_attr_saved > t) {
513 		/*
514 		 * Attributes have been cached since these attributes were
515 		 * made, so don't act on them.
516 		 */
517 		mutex_exit(&rp->r_statelock);
518 		return;
519 	}
520 	set_time_cache_inval = 0;
521 	if (cinfo) {
522 		/*
523 		 * Only directory modifying callers pass non-NULL cinfo.
524 		 */
525 		ASSERT(vp->v_type == VDIR);
526 		/*
527 		 * If the cache timeout either doesn't exist or hasn't expired,
528 		 * and dir didn't changed on server before dirmod op
529 		 * and dir didn't change after dirmod op but before getattr
530 		 * then there's a chance that the client's cached data for
531 		 * this object is current (not stale).  No immediate cache
532 		 * flush is required.
533 		 *
534 		 */
535 		if ((! rp->r_time_cache_inval || t < rp->r_time_cache_inval) &&
536 		    cinfo->before == rp->r_change &&
537 		    (garp->n4g_change_valid &&
538 		    cinfo->after == garp->n4g_change)) {
539 
540 			/*
541 			 * If atomic isn't set, then the before/after info
542 			 * cannot be blindly trusted.  For this case, we tell
543 			 * nfs4_attrcache_va to cache the attrs but also
544 			 * establish an absolute maximum cache timeout.  When
545 			 * the timeout is reached, caches will be flushed.
546 			 */
547 			if (! cinfo->atomic)
548 				set_time_cache_inval = 1;
549 
550 			mtime_changed = 0;
551 			ctime_changed = 0;
552 		} else {
553 
554 			/*
555 			 * We're not sure exactly what changed, but we know
556 			 * what to do.  flush all caches for dir.  remove the
557 			 * attr timeout.
558 			 *
559 			 * a) timeout expired.  flush all caches.
560 			 * b) r_change != cinfo.before.  flush all caches.
561 			 * c) r_change == cinfo.before, but cinfo.after !=
562 			 *    post-op getattr(change).  flush all caches.
563 			 * d) post-op getattr(change) not provided by server.
564 			 *    flush all caches.
565 			 */
566 			mtime_changed = 1;
567 			ctime_changed = 1;
568 			rp->r_time_cache_inval = 0;
569 		}
570 	} else {
571 		if (!(rp->r_flags & R4WRITEMODIFIED)) {
572 			if (!CACHE4_VALID(rp, vap->va_mtime, vap->va_size))
573 				mtime_changed = 1;
574 			else
575 				mtime_changed = 0;
576 			if (rp->r_attr.va_ctime.tv_sec !=
577 			    vap->va_ctime.tv_sec ||
578 			    rp->r_attr.va_ctime.tv_nsec !=
579 			    vap->va_ctime.tv_nsec)
580 				ctime_changed = 1;
581 			else
582 				ctime_changed = 0;
583 		} else {
584 			mtime_changed = 0;
585 			ctime_changed = 0;
586 		}
587 	}
588 
589 	nfs4_attrcache_va(vp, garp, set_time_cache_inval);
590 
591 	if (!mtime_changed && !ctime_changed) {
592 		mutex_exit(&rp->r_statelock);
593 		return;
594 	}
595 
596 	rp->r_serial = curthread;
597 
598 	mutex_exit(&rp->r_statelock);
599 
600 	/*
601 	 * If we're the recov thread, then force async nfs4_purge_caches
602 	 * to avoid potential deadlock.
603 	 */
604 	if (mtime_changed)
605 		nfs4_purge_caches(vp, NFS4_NOPURGE_DNLC, cr, recov ? 1 : async);
606 
607 	if (ctime_changed) {
608 		(void) nfs4_access_purge_rp(rp);
609 		if (rp->r_secattr != NULL) {
610 			mutex_enter(&rp->r_statelock);
611 			vsp = rp->r_secattr;
612 			rp->r_secattr = NULL;
613 			mutex_exit(&rp->r_statelock);
614 			if (vsp != NULL)
615 				nfs4_acl_free_cache(vsp);
616 		}
617 	}
618 
619 	if (!was_serial) {
620 		mutex_enter(&rp->r_statelock);
621 		rp->r_serial = NULL;
622 		cv_broadcast(&rp->r_cv);
623 		mutex_exit(&rp->r_statelock);
624 	}
625 }
626 
627 /*
628  * Set attributes cache for given vnode using virtual attributes.
629  *
630  * Set the timeout value on the attribute cache and fill it
631  * with the passed in attributes.
632  *
633  * The caller must be holding r_statelock.
634  */
635 static void
636 nfs4_attrcache_va(vnode_t *vp, nfs4_ga_res_t *garp, int set_cache_timeout)
637 {
638 	rnode4_t *rp;
639 	mntinfo4_t *mi;
640 	hrtime_t delta;
641 	hrtime_t now;
642 	vattr_t *vap = &garp->n4g_va;
643 
644 	rp = VTOR4(vp);
645 
646 	ASSERT(MUTEX_HELD(&rp->r_statelock));
647 	ASSERT(vap->va_mask == AT_ALL);
648 
649 	/* Switch to master before checking v_flag */
650 	if (IS_SHADOW(vp, rp))
651 		vp = RTOV4(rp);
652 
653 	now = gethrtime();
654 
655 	mi = VTOMI4(vp);
656 
657 	/*
658 	 * Only establish a new cache timeout (if requested).  Never
659 	 * extend a timeout.  Never clear a timeout.  Clearing a timeout
660 	 * is done by nfs4_update_dircaches (ancestor in our call chain)
661 	 */
662 	if (set_cache_timeout && ! rp->r_time_cache_inval)
663 		rp->r_time_cache_inval = now + mi->mi_acdirmax;
664 
665 	/*
666 	 * Delta is the number of nanoseconds that we will
667 	 * cache the attributes of the file.  It is based on
668 	 * the number of nanoseconds since the last time that
669 	 * we detected a change.  The assumption is that files
670 	 * that changed recently are likely to change again.
671 	 * There is a minimum and a maximum for regular files
672 	 * and for directories which is enforced though.
673 	 *
674 	 * Using the time since last change was detected
675 	 * eliminates direct comparison or calculation
676 	 * using mixed client and server times.  NFS does
677 	 * not make any assumptions regarding the client
678 	 * and server clocks being synchronized.
679 	 */
680 	if (vap->va_mtime.tv_sec != rp->r_attr.va_mtime.tv_sec ||
681 	    vap->va_mtime.tv_nsec != rp->r_attr.va_mtime.tv_nsec ||
682 	    vap->va_size != rp->r_attr.va_size) {
683 		rp->r_time_attr_saved = now;
684 	}
685 
686 	if ((mi->mi_flags & MI4_NOAC) || (vp->v_flag & VNOCACHE))
687 		delta = 0;
688 	else {
689 		delta = now - rp->r_time_attr_saved;
690 		if (vp->v_type == VDIR) {
691 			if (delta < mi->mi_acdirmin)
692 				delta = mi->mi_acdirmin;
693 			else if (delta > mi->mi_acdirmax)
694 				delta = mi->mi_acdirmax;
695 		} else {
696 			if (delta < mi->mi_acregmin)
697 				delta = mi->mi_acregmin;
698 			else if (delta > mi->mi_acregmax)
699 				delta = mi->mi_acregmax;
700 		}
701 	}
702 	rp->r_time_attr_inval = now + delta;
703 
704 	rp->r_attr = *vap;
705 	if (garp->n4g_change_valid)
706 		rp->r_change = garp->n4g_change;
707 
708 	/*
709 	 * The attributes that were returned may be valid and can
710 	 * be used, but they may not be allowed to be cached.
711 	 * Reset the timers to cause immediate invalidation and
712 	 * clear r_change so no VERIFY operations will suceed
713 	 */
714 	if (garp->n4g_attrwhy == NFS4_GETATTR_NOCACHE_OK) {
715 		rp->r_time_attr_inval = now;
716 		rp->r_time_attr_saved = now;
717 		rp->r_change = 0;
718 	}
719 
720 	/*
721 	 * If mounted_on_fileid returned AND the object is a stub,
722 	 * then set object's va_nodeid to the mounted over fid
723 	 * returned by server.
724 	 *
725 	 * If mounted_on_fileid not provided/supported, then
726 	 * just set it to 0 for now.  Eventually it would be
727 	 * better to set it to a hashed version of FH.  This
728 	 * would probably be good enough to provide a unique
729 	 * fid/d_ino within a dir.
730 	 *
731 	 * We don't need to carry mounted_on_fileid in the
732 	 * rnode as long as the client never requests fileid
733 	 * without also requesting mounted_on_fileid.  For
734 	 * now, it stays.
735 	 */
736 	if (garp->n4g_mon_fid_valid) {
737 		rp->r_mntd_fid = garp->n4g_mon_fid;
738 
739 		if (rp->r_flags & R4SRVSTUB)
740 			rp->r_attr.va_nodeid = rp->r_mntd_fid;
741 	}
742 
743 	/*
744 	 * Check to see if there are valid pathconf bits to
745 	 * cache in the rnode.
746 	 */
747 	if (garp->n4g_ext_res) {
748 		if (garp->n4g_ext_res->n4g_pc4.pc4_cache_valid) {
749 			rp->r_pathconf = garp->n4g_ext_res->n4g_pc4;
750 		} else {
751 			if (garp->n4g_ext_res->n4g_pc4.pc4_xattr_valid) {
752 				rp->r_pathconf.pc4_xattr_valid = TRUE;
753 				rp->r_pathconf.pc4_xattr_exists =
754 				    garp->n4g_ext_res->n4g_pc4.pc4_xattr_exists;
755 			}
756 		}
757 	}
758 	/*
759 	 * Update the size of the file if there is no cached data or if
760 	 * the cached data is clean and there is no data being written
761 	 * out.
762 	 */
763 	if (rp->r_size != vap->va_size &&
764 	    (!vn_has_cached_data(vp) ||
765 	    (!(rp->r_flags & R4DIRTY) && rp->r_count == 0))) {
766 		rp->r_size = vap->va_size;
767 	}
768 	nfs_setswaplike(vp, vap);
769 	rp->r_flags &= ~R4WRITEMODIFIED;
770 }
771 
772 /*
773  * Get attributes over-the-wire and update attributes cache
774  * if no error occurred in the over-the-wire operation.
775  * Return 0 if successful, otherwise error.
776  */
777 int
778 nfs4_getattr_otw(vnode_t *vp, nfs4_ga_res_t *garp, cred_t *cr, int get_acl)
779 {
780 	mntinfo4_t *mi = VTOMI4(vp);
781 	hrtime_t t;
782 	nfs4_recov_state_t recov_state;
783 	nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };
784 
785 	recov_state.rs_flags = 0;
786 	recov_state.rs_num_retry_despite_err = 0;
787 
788 	/* Save the original mount point security flavor */
789 	(void) save_mnt_secinfo(mi->mi_curr_serv);
790 
791 recov_retry:
792 	if ((e.error = nfs4_start_fop(mi, vp, NULL, OH_GETATTR,
793 						&recov_state, NULL))) {
794 		(void) check_mnt_secinfo(mi->mi_curr_serv, vp);
795 		return (e.error);
796 	}
797 
798 	t = gethrtime();
799 
800 	nfs4_getattr_otw_norecovery(vp, garp, &e, cr, get_acl);
801 
802 	if (nfs4_needs_recovery(&e, FALSE, vp->v_vfsp)) {
803 		if (nfs4_start_recovery(&e, VTOMI4(vp), vp, NULL, NULL,
804 		    NULL, OP_GETATTR, NULL) == FALSE)  {
805 			nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_GETATTR,
806 					&recov_state, 1);
807 			goto recov_retry;
808 		}
809 	}
810 
811 	nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_GETATTR, &recov_state, 0);
812 
813 	if (!e.error) {
814 		if (e.stat == NFS4_OK) {
815 			nfs4_attr_cache(vp, garp, t, cr, FALSE, NULL);
816 		} else {
817 			e.error = geterrno4(e.stat);
818 
819 			nfs4_purge_stale_fh(e.error, vp, cr);
820 		}
821 	}
822 
823 	/*
824 	 * If getattr a node that is a stub for a crossed
825 	 * mount point, keep the original secinfo flavor for
826 	 * the current file system, not the crossed one.
827 	 */
828 	(void) check_mnt_secinfo(mi->mi_curr_serv, vp);
829 
830 	return (e.error);
831 }
832 
833 /*
834  * Generate a compound to get attributes over-the-wire.
835  */
836 void
837 nfs4_getattr_otw_norecovery(vnode_t *vp, nfs4_ga_res_t *garp,
838 		nfs4_error_t *ep, cred_t *cr, int get_acl)
839 {
840 	COMPOUND4args_clnt args;
841 	COMPOUND4res_clnt res;
842 	int doqueue;
843 	rnode4_t *rp = VTOR4(vp);
844 	nfs_argop4 argop[2];
845 
846 	args.ctag = TAG_GETATTR;
847 
848 	args.array_len = 2;
849 	args.array = argop;
850 
851 	/* putfh */
852 	argop[0].argop = OP_CPUTFH;
853 	argop[0].nfs_argop4_u.opcputfh.sfh = rp->r_fh;
854 
855 	/* getattr */
856 	/*
857 	 * Unlike nfs version 2 and 3, where getattr returns all the
858 	 * attributes, nfs version 4 returns only the ones explicitely
859 	 * asked for. This creates problems, as some system functions
860 	 * (e.g. cache check) require certain attributes and if the
861 	 * cached node lacks some attributes such as uid/gid, it can
862 	 * affect system utilities (e.g. "ls") that rely on the information
863 	 * to be there. This can lead to anything from system crashes to
864 	 * corrupted information processed by user apps.
865 	 * So to ensure that all bases are covered, request at least
866 	 * the AT_ALL attribute mask.
867 	 */
868 	argop[1].argop = OP_GETATTR;
869 	argop[1].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK;
870 	if (get_acl)
871 		argop[1].nfs_argop4_u.opgetattr.attr_request |= FATTR4_ACL_MASK;
872 	argop[1].nfs_argop4_u.opgetattr.mi = VTOMI4(vp);
873 
874 	doqueue = 1;
875 
876 	rfs4call(VTOMI4(vp), &args, &res, cr, &doqueue, 0, ep);
877 
878 	if (ep->error)
879 		return;
880 
881 	if (res.status != NFS4_OK) {
882 		(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
883 		return;
884 	}
885 
886 	*garp = res.array[1].nfs_resop4_u.opgetattr.ga_res;
887 
888 	(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
889 }
890 
891 /*
892  * Return either cached or remote attributes. If get remote attr
893  * use them to check and invalidate caches, then cache the new attributes.
894  */
895 int
896 nfs4getattr(vnode_t *vp, vattr_t *vap, cred_t *cr)
897 {
898 	int error;
899 	rnode4_t *rp;
900 	nfs4_ga_res_t gar;
901 
902 	ASSERT(nfs4_consistent_type(vp));
903 
904 	/*
905 	 * If we've got cached attributes, we're done, otherwise go
906 	 * to the server to get attributes, which will update the cache
907 	 * in the process.
908 	 */
909 	rp = VTOR4(vp);
910 	mutex_enter(&rp->r_statelock);
911 	mutex_enter(&rp->r_statev4_lock);
912 	if (ATTRCACHE4_VALID(vp)) {
913 		mutex_exit(&rp->r_statev4_lock);
914 		/*
915 		 * Cached attributes are valid
916 		 * Return the client's view of file size
917 		 */
918 		*vap = rp->r_attr;
919 		vap->va_size = rp->r_size;
920 		mutex_exit(&rp->r_statelock);
921 
922 		ASSERT(nfs4_consistent_type(vp));
923 
924 		return (0);
925 	}
926 	mutex_exit(&rp->r_statev4_lock);
927 	mutex_exit(&rp->r_statelock);
928 
929 	error = nfs4_getattr_otw(vp, &gar, cr, 0);
930 	if (!error)
931 		*vap = gar.n4g_va;
932 
933 	/* Return the client's view of file size */
934 	mutex_enter(&rp->r_statelock);
935 	vap->va_size = rp->r_size;
936 	mutex_exit(&rp->r_statelock);
937 
938 	ASSERT(nfs4_consistent_type(vp));
939 
940 	return (error);
941 }
942 
943 int
944 nfs4_attr_otw(vnode_t *vp, nfs4_tag_type_t tag_type,
945 		nfs4_ga_res_t *garp, bitmap4 reqbitmap, cred_t *cr)
946 {
947 	COMPOUND4args_clnt args;
948 	COMPOUND4res_clnt res;
949 	int doqueue;
950 	nfs_argop4 argop[2];
951 	mntinfo4_t *mi = VTOMI4(vp);
952 	bool_t needrecov = FALSE;
953 	nfs4_recov_state_t recov_state;
954 	nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };
955 	nfs4_ga_ext_res_t *gerp;
956 
957 	recov_state.rs_flags = 0;
958 	recov_state.rs_num_retry_despite_err = 0;
959 
960 recov_retry:
961 	args.ctag = tag_type;
962 
963 	args.array_len = 2;
964 	args.array = argop;
965 
966 	e.error = nfs4_start_fop(mi, vp, NULL, OH_GETATTR, &recov_state, NULL);
967 	if (e.error)
968 		return (e.error);
969 
970 	/* putfh */
971 	argop[0].argop = OP_CPUTFH;
972 	argop[0].nfs_argop4_u.opcputfh.sfh = VTOR4(vp)->r_fh;
973 
974 	/* getattr */
975 	argop[1].argop = OP_GETATTR;
976 	argop[1].nfs_argop4_u.opgetattr.attr_request = reqbitmap;
977 	argop[1].nfs_argop4_u.opgetattr.mi = mi;
978 
979 	doqueue = 1;
980 
981 	NFS4_DEBUG(nfs4_client_call_debug, (CE_NOTE,
982 	    "nfs4_attr_otw: %s call, rp %s", needrecov ? "recov" : "first",
983 	    rnode4info(VTOR4(vp))));
984 
985 	rfs4call(mi, &args, &res, cr, &doqueue, 0, &e);
986 
987 	needrecov = nfs4_needs_recovery(&e, FALSE, vp->v_vfsp);
988 	if (!needrecov && e.error) {
989 		nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_GETATTR, &recov_state,
990 			    needrecov);
991 		return (e.error);
992 	}
993 
994 	if (needrecov) {
995 		bool_t abort;
996 
997 		NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
998 		    "nfs4_attr_otw: initiating recovery\n"));
999 
1000 		abort = nfs4_start_recovery(&e, VTOMI4(vp), vp, NULL, NULL,
1001 			    NULL, OP_GETATTR, NULL);
1002 		nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_GETATTR, &recov_state,
1003 				needrecov);
1004 		if (!e.error) {
1005 			(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
1006 			e.error = geterrno4(res.status);
1007 		}
1008 		if (abort == FALSE)
1009 			goto recov_retry;
1010 		return (e.error);
1011 	}
1012 
1013 	if (res.status) {
1014 		e.error = geterrno4(res.status);
1015 	} else {
1016 		gerp = garp->n4g_ext_res;
1017 		bcopy(&res.array[1].nfs_resop4_u.opgetattr.ga_res,
1018 			garp, sizeof (nfs4_ga_res_t));
1019 		garp->n4g_ext_res = gerp;
1020 		if (garp->n4g_ext_res &&
1021 		    res.array[1].nfs_resop4_u.opgetattr.ga_res.n4g_ext_res)
1022 			bcopy(res.array[1].nfs_resop4_u.opgetattr.
1023 				ga_res.n4g_ext_res,
1024 				garp->n4g_ext_res, sizeof (nfs4_ga_ext_res_t));
1025 	}
1026 	(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
1027 	nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_GETATTR, &recov_state,
1028 		    needrecov);
1029 	return (e.error);
1030 }
1031 
1032 /*
1033  * Asynchronous I/O parameters.  nfs_async_threads is the high-water mark
1034  * for the demand-based allocation of async threads per-mount.  The
1035  * nfs_async_timeout is the amount of time a thread will live after it
1036  * becomes idle, unless new I/O requests are received before the thread
1037  * dies.  See nfs4_async_putpage and nfs4_async_start.
1038  */
1039 
/* Forward declaration; the routine is defined later in this file. */
static void	nfs4_async_start(struct vfs *);
1041 
1042 static void
1043 free_async_args4(struct nfs4_async_reqs *args)
1044 {
1045 	rnode4_t *rp;
1046 
1047 	if (args->a_io != NFS4_INACTIVE) {
1048 		rp = VTOR4(args->a_vp);
1049 		mutex_enter(&rp->r_statelock);
1050 		rp->r_count--;
1051 		if (args->a_io == NFS4_PUTAPAGE ||
1052 		    args->a_io == NFS4_PAGEIO)
1053 			rp->r_awcount--;
1054 		cv_broadcast(&rp->r_cv);
1055 		mutex_exit(&rp->r_statelock);
1056 		VN_RELE(args->a_vp);
1057 	}
1058 	crfree(args->a_cred);
1059 	kmem_free(args, sizeof (*args));
1060 }
1061 
1062 /*
1063  * Cross-zone thread creation and NFS access is disallowed, yet fsflush() and
1064  * pageout(), running in the global zone, have legitimate reasons to do
1065  * VOP_PUTPAGE(B_ASYNC) on other zones' NFS mounts.  We avoid the problem by
1066  * use of a per-mount "asynchronous requests manager thread" which is
1067  * signaled by the various asynchronous work routines when there is
1068  * asynchronous work to be done.  It is responsible for creating new
1069  * worker threads if necessary, and notifying existing worker threads
1070  * that there is work to be done.
1071  *
1072  * In other words, it will "take the specifications from the customers and
1073  * give them to the engineers."
1074  *
1075  * Worker threads die off of their own accord if they are no longer
1076  * needed.
1077  *
1078  * This thread is killed when the zone is going away or the filesystem
1079  * is being unmounted.
1080  */
1081 void
1082 nfs4_async_manager(vfs_t *vfsp)
1083 {
1084 	callb_cpr_t cprinfo;
1085 	mntinfo4_t *mi;
1086 	uint_t max_threads;
1087 
1088 	mi = VFTOMI4(vfsp);
1089 
1090 	CALLB_CPR_INIT(&cprinfo, &mi->mi_async_lock, callb_generic_cpr,
1091 		    "nfs4_async_manager");
1092 
1093 	mutex_enter(&mi->mi_async_lock);
1094 	/*
1095 	 * We want to stash the max number of threads that this mount was
1096 	 * allowed so we can use it later when the variable is set to zero as
1097 	 * part of the zone/mount going away.
1098 	 *
1099 	 * We want to be able to create at least one thread to handle
1100 	 * asyncrhonous inactive calls.
1101 	 */
1102 	max_threads = MAX(mi->mi_max_threads, 1);
1103 	mutex_enter(&mi->mi_lock);
1104 	/*
1105 	 * We don't want to wait for mi_max_threads to go to zero, since that
1106 	 * happens as part of a failed unmount, but this thread should only
1107 	 * exit when the mount is really going away.
1108 	 *
1109 	 * Once MI4_ASYNC_MGR_STOP is set, no more async operations will be
1110 	 * attempted: the various _async_*() functions know to do things
1111 	 * inline if mi_max_threads == 0.  Henceforth we just drain out the
1112 	 * outstanding requests.
1113 	 *
1114 	 * Note that we still create zthreads even if we notice the zone is
1115 	 * shutting down (MI4_ASYNC_MGR_STOP is set); this may cause the zone
1116 	 * shutdown sequence to take slightly longer in some cases, but
1117 	 * doesn't violate the protocol, as all threads will exit as soon as
1118 	 * they're done processing the remaining requests.
1119 	 */
1120 	while (!(mi->mi_flags & MI4_ASYNC_MGR_STOP) ||
1121 	    mi->mi_async_req_count > 0) {
1122 		mutex_exit(&mi->mi_lock);
1123 		CALLB_CPR_SAFE_BEGIN(&cprinfo);
1124 		cv_wait(&mi->mi_async_reqs_cv, &mi->mi_async_lock);
1125 		CALLB_CPR_SAFE_END(&cprinfo, &mi->mi_async_lock);
1126 		while (mi->mi_async_req_count > 0) {
1127 			/*
1128 			 * Paranoia: If the mount started out having
1129 			 * (mi->mi_max_threads == 0), and the value was
1130 			 * later changed (via a debugger or somesuch),
1131 			 * we could be confused since we will think we
1132 			 * can't create any threads, and the calling
1133 			 * code (which looks at the current value of
1134 			 * mi->mi_max_threads, now non-zero) thinks we
1135 			 * can.
1136 			 *
1137 			 * So, because we're paranoid, we create threads
1138 			 * up to the maximum of the original and the
1139 			 * current value. This means that future
1140 			 * (debugger-induced) alterations of
1141 			 * mi->mi_max_threads are ignored for our
1142 			 * purposes, but who told them they could change
1143 			 * random values on a live kernel anyhow?
1144 			 */
1145 			if (mi->mi_threads <
1146 			    MAX(mi->mi_max_threads, max_threads)) {
1147 				mi->mi_threads++;
1148 				mutex_exit(&mi->mi_async_lock);
1149 				VFS_HOLD(vfsp);	/* hold for new thread */
1150 				(void) zthread_create(NULL, 0, nfs4_async_start,
1151 				    vfsp, 0, minclsyspri);
1152 				mutex_enter(&mi->mi_async_lock);
1153 			}
1154 			cv_signal(&mi->mi_async_work_cv);
1155 			ASSERT(mi->mi_async_req_count != 0);
1156 			mi->mi_async_req_count--;
1157 		}
1158 		mutex_enter(&mi->mi_lock);
1159 	}
1160 	mutex_exit(&mi->mi_lock);
1161 
1162 	NFS4_DEBUG(nfs4_client_zone_debug, (CE_NOTE,
1163 	    "nfs4_async_manager exiting for vfs %p\n", (void *)mi->mi_vfsp));
1164 	/*
1165 	 * Let everyone know we're done.
1166 	 */
1167 	mi->mi_manager_thread = NULL;
1168 	/*
1169 	 * Wake up the inactive thread.
1170 	 */
1171 	cv_broadcast(&mi->mi_inact_req_cv);
1172 	/*
1173 	 * Wake up anyone sitting in nfs4_async_manager_stop()
1174 	 */
1175 	cv_broadcast(&mi->mi_async_cv);
1176 	/*
1177 	 * There is no explicit call to mutex_exit(&mi->mi_async_lock)
1178 	 * since CALLB_CPR_EXIT is actually responsible for releasing
1179 	 * 'mi_async_lock'.
1180 	 */
1181 	CALLB_CPR_EXIT(&cprinfo);
1182 	VFS_RELE(vfsp);	/* release thread's hold */
1183 	zthread_exit();
1184 }
1185 
1186 /*
1187  * Signal (and wait for) the async manager thread to clean up and go away.
1188  */
1189 void
1190 nfs4_async_manager_stop(vfs_t *vfsp)
1191 {
1192 	mntinfo4_t *mi = VFTOMI4(vfsp);
1193 
1194 	mutex_enter(&mi->mi_async_lock);
1195 	mutex_enter(&mi->mi_lock);
1196 	mi->mi_flags |= MI4_ASYNC_MGR_STOP;
1197 	mutex_exit(&mi->mi_lock);
1198 	cv_broadcast(&mi->mi_async_reqs_cv);
1199 	/*
1200 	 * Wait for the async manager thread to die.
1201 	 */
1202 	while (mi->mi_manager_thread != NULL)
1203 		cv_wait(&mi->mi_async_cv, &mi->mi_async_lock);
1204 	mutex_exit(&mi->mi_async_lock);
1205 }
1206 
1207 int
1208 nfs4_async_readahead(vnode_t *vp, u_offset_t blkoff, caddr_t addr,
1209 	struct seg *seg, cred_t *cr, void (*readahead)(vnode_t *,
1210 	u_offset_t, caddr_t, struct seg *, cred_t *))
1211 {
1212 	rnode4_t *rp;
1213 	mntinfo4_t *mi;
1214 	struct nfs4_async_reqs *args;
1215 
1216 	rp = VTOR4(vp);
1217 	ASSERT(rp->r_freef == NULL);
1218 
1219 	mi = VTOMI4(vp);
1220 
1221 	/*
1222 	 * If addr falls in a different segment, don't bother doing readahead.
1223 	 */
1224 	if (addr >= seg->s_base + seg->s_size)
1225 		return (-1);
1226 
1227 	/*
1228 	 * If we can't allocate a request structure, punt on the readahead.
1229 	 */
1230 	if ((args = kmem_alloc(sizeof (*args), KM_NOSLEEP)) == NULL)
1231 		return (-1);
1232 
1233 	/*
1234 	 * If a lock operation is pending, don't initiate any new
1235 	 * readaheads.  Otherwise, bump r_count to indicate the new
1236 	 * asynchronous I/O.
1237 	 */
1238 	if (!nfs_rw_tryenter(&rp->r_lkserlock, RW_READER)) {
1239 		kmem_free(args, sizeof (*args));
1240 		return (-1);
1241 	}
1242 	mutex_enter(&rp->r_statelock);
1243 	rp->r_count++;
1244 	mutex_exit(&rp->r_statelock);
1245 	nfs_rw_exit(&rp->r_lkserlock);
1246 
1247 	args->a_next = NULL;
1248 #ifdef DEBUG
1249 	args->a_queuer = curthread;
1250 #endif
1251 	VN_HOLD(vp);
1252 	args->a_vp = vp;
1253 	ASSERT(cr != NULL);
1254 	crhold(cr);
1255 	args->a_cred = cr;
1256 	args->a_io = NFS4_READ_AHEAD;
1257 	args->a_nfs4_readahead = readahead;
1258 	args->a_nfs4_blkoff = blkoff;
1259 	args->a_nfs4_seg = seg;
1260 	args->a_nfs4_addr = addr;
1261 
1262 	mutex_enter(&mi->mi_async_lock);
1263 
1264 	/*
1265 	 * If asyncio has been disabled, don't bother readahead.
1266 	 */
1267 	if (mi->mi_max_threads == 0) {
1268 		mutex_exit(&mi->mi_async_lock);
1269 		goto noasync;
1270 	}
1271 
1272 	/*
1273 	 * Link request structure into the async list and
1274 	 * wakeup async thread to do the i/o.
1275 	 */
1276 	if (mi->mi_async_reqs[NFS4_READ_AHEAD] == NULL) {
1277 		mi->mi_async_reqs[NFS4_READ_AHEAD] = args;
1278 		mi->mi_async_tail[NFS4_READ_AHEAD] = args;
1279 	} else {
1280 		mi->mi_async_tail[NFS4_READ_AHEAD]->a_next = args;
1281 		mi->mi_async_tail[NFS4_READ_AHEAD] = args;
1282 	}
1283 
1284 	if (mi->mi_io_kstats) {
1285 		mutex_enter(&mi->mi_lock);
1286 		kstat_waitq_enter(KSTAT_IO_PTR(mi->mi_io_kstats));
1287 		mutex_exit(&mi->mi_lock);
1288 	}
1289 
1290 	mi->mi_async_req_count++;
1291 	ASSERT(mi->mi_async_req_count != 0);
1292 	cv_signal(&mi->mi_async_reqs_cv);
1293 	mutex_exit(&mi->mi_async_lock);
1294 	return (0);
1295 
1296 noasync:
1297 	mutex_enter(&rp->r_statelock);
1298 	rp->r_count--;
1299 	cv_broadcast(&rp->r_cv);
1300 	mutex_exit(&rp->r_statelock);
1301 	VN_RELE(vp);
1302 	crfree(cr);
1303 	kmem_free(args, sizeof (*args));
1304 	return (-1);
1305 }
1306 
1307 /*
1308  * The async queues for each mounted file system are arranged as a
1309  * set of queues, one for each async i/o type.  Requests are taken
1310  * from the queues in a round-robin fashion.  A number of consecutive
1311  * requests are taken from each queue before moving on to the next
1312  * queue.  This functionality may allow the NFS Version 2 server to do
1313  * write clustering, even if the client is mixing writes and reads
1314  * because it will take multiple write requests from the queue
1315  * before processing any of the other async i/o types.
1316  *
1317  * XXX The nfs4_async_start thread is unsafe in the light of the present
1318  * model defined by cpr to suspend the system. Specifically over the
1319  * wire calls are cpr-unsafe. The thread should be reevaluated in
1320  * case of future updates to the cpr model.
1321  */
1322 static void
1323 nfs4_async_start(struct vfs *vfsp)
1324 {
1325 	struct nfs4_async_reqs *args;
1326 	mntinfo4_t *mi = VFTOMI4(vfsp);
1327 	clock_t time_left = 1;
1328 	callb_cpr_t cprinfo;
1329 	int i;
1330 	extern int nfs_async_timeout;
1331 
1332 	/*
1333 	 * Dynamic initialization of nfs_async_timeout to allow nfs to be
1334 	 * built in an implementation independent manner.
1335 	 */
1336 	if (nfs_async_timeout == -1)
1337 		nfs_async_timeout = NFS_ASYNC_TIMEOUT;
1338 
1339 	CALLB_CPR_INIT(&cprinfo, &mi->mi_async_lock, callb_generic_cpr, "nas");
1340 
1341 	mutex_enter(&mi->mi_async_lock);
1342 	for (;;) {
1343 		/*
1344 		 * Find the next queue containing an entry.  We start
1345 		 * at the current queue pointer and then round robin
1346 		 * through all of them until we either find a non-empty
1347 		 * queue or have looked through all of them.
1348 		 */
1349 		for (i = 0; i < NFS4_ASYNC_TYPES; i++) {
1350 			args = *mi->mi_async_curr;
1351 			if (args != NULL)
1352 				break;
1353 			mi->mi_async_curr++;
1354 			if (mi->mi_async_curr ==
1355 			    &mi->mi_async_reqs[NFS4_ASYNC_TYPES])
1356 				mi->mi_async_curr = &mi->mi_async_reqs[0];
1357 		}
1358 		/*
1359 		 * If we didn't find a entry, then block until woken up
1360 		 * again and then look through the queues again.
1361 		 */
1362 		if (args == NULL) {
1363 			/*
1364 			 * Exiting is considered to be safe for CPR as well
1365 			 */
1366 			CALLB_CPR_SAFE_BEGIN(&cprinfo);
1367 
1368 			/*
1369 			 * Wakeup thread waiting to unmount the file
1370 			 * system only if all async threads are inactive.
1371 			 *
1372 			 * If we've timed-out and there's nothing to do,
1373 			 * then get rid of this thread.
1374 			 */
1375 			if (mi->mi_max_threads == 0 || time_left <= 0) {
1376 				if (--mi->mi_threads == 0)
1377 					cv_signal(&mi->mi_async_cv);
1378 				CALLB_CPR_EXIT(&cprinfo);
1379 				VFS_RELE(vfsp);	/* release thread's hold */
1380 				zthread_exit();
1381 				/* NOTREACHED */
1382 			}
1383 			time_left = cv_timedwait(&mi->mi_async_work_cv,
1384 			    &mi->mi_async_lock, nfs_async_timeout + lbolt);
1385 
1386 			CALLB_CPR_SAFE_END(&cprinfo, &mi->mi_async_lock);
1387 
1388 			continue;
1389 		} else {
1390 			time_left = 1;
1391 		}
1392 
1393 		/*
1394 		 * Remove the request from the async queue and then
1395 		 * update the current async request queue pointer.  If
1396 		 * the current queue is empty or we have removed enough
1397 		 * consecutive entries from it, then reset the counter
1398 		 * for this queue and then move the current pointer to
1399 		 * the next queue.
1400 		 */
1401 		*mi->mi_async_curr = args->a_next;
1402 		if (*mi->mi_async_curr == NULL ||
1403 		    --mi->mi_async_clusters[args->a_io] == 0) {
1404 			mi->mi_async_clusters[args->a_io] =
1405 						mi->mi_async_init_clusters;
1406 			mi->mi_async_curr++;
1407 			if (mi->mi_async_curr ==
1408 			    &mi->mi_async_reqs[NFS4_ASYNC_TYPES])
1409 				mi->mi_async_curr = &mi->mi_async_reqs[0];
1410 		}
1411 
1412 		if (args->a_io != NFS4_INACTIVE && mi->mi_io_kstats) {
1413 			mutex_enter(&mi->mi_lock);
1414 			kstat_waitq_exit(KSTAT_IO_PTR(mi->mi_io_kstats));
1415 			mutex_exit(&mi->mi_lock);
1416 		}
1417 
1418 		mutex_exit(&mi->mi_async_lock);
1419 
1420 		/*
1421 		 * Obtain arguments from the async request structure.
1422 		 */
1423 		if (args->a_io == NFS4_READ_AHEAD && mi->mi_max_threads > 0) {
1424 			(*args->a_nfs4_readahead)(args->a_vp,
1425 					args->a_nfs4_blkoff,
1426 					args->a_nfs4_addr, args->a_nfs4_seg,
1427 					args->a_cred);
1428 		} else if (args->a_io == NFS4_PUTAPAGE) {
1429 			(void) (*args->a_nfs4_putapage)(args->a_vp,
1430 					args->a_nfs4_pp, args->a_nfs4_off,
1431 					args->a_nfs4_len, args->a_nfs4_flags,
1432 					args->a_cred);
1433 		} else if (args->a_io == NFS4_PAGEIO) {
1434 			(void) (*args->a_nfs4_pageio)(args->a_vp,
1435 					args->a_nfs4_pp, args->a_nfs4_off,
1436 					args->a_nfs4_len, args->a_nfs4_flags,
1437 					args->a_cred);
1438 		} else if (args->a_io == NFS4_READDIR) {
1439 			(void) ((*args->a_nfs4_readdir)(args->a_vp,
1440 					args->a_nfs4_rdc, args->a_cred));
1441 		} else if (args->a_io == NFS4_COMMIT) {
1442 			(*args->a_nfs4_commit)(args->a_vp, args->a_nfs4_plist,
1443 					args->a_nfs4_offset, args->a_nfs4_count,
1444 					args->a_cred);
1445 		} else if (args->a_io == NFS4_INACTIVE) {
1446 			nfs4_inactive_otw(args->a_vp, args->a_cred);
1447 		}
1448 
1449 		/*
1450 		 * Now, release the vnode and free the credentials
1451 		 * structure.
1452 		 */
1453 		free_async_args4(args);
1454 		/*
1455 		 * Reacquire the mutex because it will be needed above.
1456 		 */
1457 		mutex_enter(&mi->mi_async_lock);
1458 	}
1459 }
1460 
1461 /*
1462  * nfs4_inactive_thread - look for vnodes that need over-the-wire calls as
1463  * part of VOP_INACTIVE.
1464  */
1465 
1466 void
1467 nfs4_inactive_thread(mntinfo4_t *mi)
1468 {
1469 	struct nfs4_async_reqs *args;
1470 	callb_cpr_t cprinfo;
1471 	int call_nfs_free_mi4 = 0;
1472 	vfs_t *vfsp = mi->mi_vfsp;
1473 
1474 	CALLB_CPR_INIT(&cprinfo, &mi->mi_async_lock, callb_generic_cpr,
1475 		    "nfs4_inactive_thread");
1476 
1477 	for (;;) {
1478 		mutex_enter(&mi->mi_async_lock);
1479 		args = mi->mi_async_reqs[NFS4_INACTIVE];
1480 		if (args == NULL) {
1481 			mutex_enter(&mi->mi_lock);
1482 			/*
1483 			 * During regular operation (ie, unmount
1484 			 * or a failed mount), the async manager thread always
1485 			 * exits before MI4_DEAD is set by nfs_free_mi4().
1486 			 *
1487 			 * When a zone is shutting down, however, we set
1488 			 * MI4_DEAD before the async manager thread is done, and
1489 			 * we don't want to exit until the async manager is done
1490 			 * with its work; hence the check for mi_manager_thread
1491 			 * being NULL.
1492 			 *
1493 			 * The async manager thread will cv_broadcast() on
1494 			 * mi_inact_req_cv when it's done, at which point we'll
1495 			 * wake up and exit.
1496 			 */
1497 			if (mi->mi_manager_thread == NULL &&
1498 			    (mi->mi_flags & MI4_DEAD))
1499 				goto die;
1500 			mi->mi_flags |= MI4_INACTIVE_IDLE;
1501 			mutex_exit(&mi->mi_lock);
1502 			cv_signal(&mi->mi_async_cv);
1503 			CALLB_CPR_SAFE_BEGIN(&cprinfo);
1504 			cv_wait(&mi->mi_inact_req_cv, &mi->mi_async_lock);
1505 			CALLB_CPR_SAFE_END(&cprinfo, &mi->mi_async_lock);
1506 			mutex_exit(&mi->mi_async_lock);
1507 		} else {
1508 			mutex_enter(&mi->mi_lock);
1509 			mi->mi_flags &= ~MI4_INACTIVE_IDLE;
1510 			mutex_exit(&mi->mi_lock);
1511 			mi->mi_async_reqs[NFS4_INACTIVE] = args->a_next;
1512 			mutex_exit(&mi->mi_async_lock);
1513 			nfs4_inactive_otw(args->a_vp, args->a_cred);
1514 			crfree(args->a_cred);
1515 			kmem_free(args, sizeof (*args));
1516 		}
1517 	}
1518 die:
1519 	mutex_exit(&mi->mi_lock);
1520 	call_nfs_free_mi4 = (mi->mi_inactive_thread == NULL);
1521 	mi->mi_inactive_thread = NULL;
1522 	cv_signal(&mi->mi_async_cv);
1523 	/*
1524 	 * There is no explicit call to mutex_exit(&mi->mi_async_lock) since
1525 	 * CALLB_CPR_EXIT is actually responsible for releasing 'mi_async_lock'.
1526 	 */
1527 	CALLB_CPR_EXIT(&cprinfo);
1528 	if (call_nfs_free_mi4) {
1529 		if (mi->mi_io_kstats) {
1530 			kstat_delete(mi->mi_io_kstats);
1531 			mi->mi_io_kstats = NULL;
1532 		}
1533 		if (mi->mi_ro_kstats) {
1534 			kstat_delete(mi->mi_ro_kstats);
1535 			mi->mi_ro_kstats = NULL;
1536 		}
1537 		if (mi->mi_recov_ksp) {
1538 			kstat_delete(mi->mi_recov_ksp);
1539 			mi->mi_recov_ksp = NULL;
1540 		}
1541 		nfs_free_mi4(mi);
1542 	}
1543 	NFS4_DEBUG(nfs4_client_zone_debug, (CE_NOTE,
1544 	    "nfs4_inactive_thread exiting for vfs %p\n", (void *)vfsp));
1545 	zthread_exit();
1546 	/* NOTREACHED */
1547 }
1548 
1549 /*
1550  * nfs_async_stop:
1551  * Wait for all outstanding putpage operations and the inactive thread to
1552  * complete; nfs4_async_stop_sig() without interruptibility.
1553  */
1554 void
1555 nfs4_async_stop(struct vfs *vfsp)
1556 {
1557 	mntinfo4_t *mi = VFTOMI4(vfsp);
1558 
1559 	/*
1560 	 * Wait for all outstanding async operations to complete and for
1561 	 * worker threads to exit.
1562 	 */
1563 	mutex_enter(&mi->mi_async_lock);
1564 	mi->mi_max_threads = 0;
1565 	cv_broadcast(&mi->mi_async_work_cv);
1566 	while (mi->mi_threads != 0)
1567 		cv_wait(&mi->mi_async_cv, &mi->mi_async_lock);
1568 
1569 	/*
1570 	 * Wait for the inactive thread to finish doing what it's doing.  It
1571 	 * won't exit until the last reference to the vfs_t goes away.
1572 	 */
1573 	if (mi->mi_inactive_thread != NULL) {
1574 		mutex_enter(&mi->mi_lock);
1575 		while (!(mi->mi_flags & MI4_INACTIVE_IDLE) ||
1576 		    (mi->mi_async_reqs[NFS4_INACTIVE] != NULL)) {
1577 			mutex_exit(&mi->mi_lock);
1578 			cv_wait(&mi->mi_async_cv, &mi->mi_async_lock);
1579 			mutex_enter(&mi->mi_lock);
1580 		}
1581 		mutex_exit(&mi->mi_lock);
1582 	}
1583 	mutex_exit(&mi->mi_async_lock);
1584 }
1585 
1586 /*
1587  * nfs_async_stop_sig:
1588  * Wait for all outstanding putpage operations and the inactive thread to
1589  * complete. If a signal is delivered we will abort and return non-zero;
1590  * otherwise return 0. Since this routine is called from nfs4_unmount, we
1591  * need to make it interruptable.
1592  */
1593 int
1594 nfs4_async_stop_sig(struct vfs *vfsp)
1595 {
1596 	mntinfo4_t *mi = VFTOMI4(vfsp);
1597 	ushort_t omax;
1598 	bool_t intr = FALSE;
1599 
1600 	/*
1601 	 * Wait for all outstanding putpage operations to complete and for
1602 	 * worker threads to exit.
1603 	 */
1604 	mutex_enter(&mi->mi_async_lock);
1605 	omax = mi->mi_max_threads;
1606 	mi->mi_max_threads = 0;
1607 	cv_broadcast(&mi->mi_async_work_cv);
1608 	while (mi->mi_threads != 0) {
1609 		if (!cv_wait_sig(&mi->mi_async_cv, &mi->mi_async_lock)) {
1610 			intr = TRUE;
1611 			goto interrupted;
1612 		}
1613 	}
1614 
1615 	/*
1616 	 * Wait for the inactive thread to finish doing what it's doing.  It
1617 	 * won't exit until the a last reference to the vfs_t goes away.
1618 	 */
1619 	if (mi->mi_inactive_thread != NULL) {
1620 		mutex_enter(&mi->mi_lock);
1621 		while (!(mi->mi_flags & MI4_INACTIVE_IDLE) ||
1622 		    (mi->mi_async_reqs[NFS4_INACTIVE] != NULL)) {
1623 			mutex_exit(&mi->mi_lock);
1624 			if (!cv_wait_sig(&mi->mi_async_cv,
1625 			    &mi->mi_async_lock)) {
1626 				intr = TRUE;
1627 				goto interrupted;
1628 			}
1629 			mutex_enter(&mi->mi_lock);
1630 		}
1631 		mutex_exit(&mi->mi_lock);
1632 	}
1633 interrupted:
1634 	if (intr)
1635 		mi->mi_max_threads = omax;
1636 	mutex_exit(&mi->mi_async_lock);
1637 
1638 	return (intr);
1639 }
1640 
1641 int
1642 nfs4_async_putapage(vnode_t *vp, page_t *pp, u_offset_t off, size_t len,
1643 	int flags, cred_t *cr, int (*putapage)(vnode_t *, page_t *,
1644 	u_offset_t, size_t, int, cred_t *))
1645 {
1646 	rnode4_t *rp;
1647 	mntinfo4_t *mi;
1648 	struct nfs4_async_reqs *args;
1649 
1650 	ASSERT(flags & B_ASYNC);
1651 	ASSERT(vp->v_vfsp != NULL);
1652 
1653 	rp = VTOR4(vp);
1654 	ASSERT(rp->r_count > 0);
1655 
1656 	mi = VTOMI4(vp);
1657 
1658 	/*
1659 	 * If we can't allocate a request structure, do the putpage
1660 	 * operation synchronously in this thread's context.
1661 	 */
1662 	if ((args = kmem_alloc(sizeof (*args), KM_NOSLEEP)) == NULL)
1663 		goto noasync;
1664 
1665 	args->a_next = NULL;
1666 #ifdef DEBUG
1667 	args->a_queuer = curthread;
1668 #endif
1669 	VN_HOLD(vp);
1670 	args->a_vp = vp;
1671 	ASSERT(cr != NULL);
1672 	crhold(cr);
1673 	args->a_cred = cr;
1674 	args->a_io = NFS4_PUTAPAGE;
1675 	args->a_nfs4_putapage = putapage;
1676 	args->a_nfs4_pp = pp;
1677 	args->a_nfs4_off = off;
1678 	args->a_nfs4_len = (uint_t)len;
1679 	args->a_nfs4_flags = flags;
1680 
1681 	mutex_enter(&mi->mi_async_lock);
1682 
1683 	/*
1684 	 * If asyncio has been disabled, then make a synchronous request.
1685 	 * This check is done a second time in case async io was diabled
1686 	 * while this thread was blocked waiting for memory pressure to
1687 	 * reduce or for the queue to drain.
1688 	 */
1689 	if (mi->mi_max_threads == 0) {
1690 		mutex_exit(&mi->mi_async_lock);
1691 
1692 		VN_RELE(vp);
1693 		crfree(cr);
1694 		kmem_free(args, sizeof (*args));
1695 		goto noasync;
1696 	}
1697 
1698 	/*
1699 	 * Link request structure into the async list and
1700 	 * wakeup async thread to do the i/o.
1701 	 */
1702 	if (mi->mi_async_reqs[NFS4_PUTAPAGE] == NULL) {
1703 		mi->mi_async_reqs[NFS4_PUTAPAGE] = args;
1704 		mi->mi_async_tail[NFS4_PUTAPAGE] = args;
1705 	} else {
1706 		mi->mi_async_tail[NFS4_PUTAPAGE]->a_next = args;
1707 		mi->mi_async_tail[NFS4_PUTAPAGE] = args;
1708 	}
1709 
1710 	mutex_enter(&rp->r_statelock);
1711 	rp->r_count++;
1712 	rp->r_awcount++;
1713 	mutex_exit(&rp->r_statelock);
1714 
1715 	if (mi->mi_io_kstats) {
1716 		mutex_enter(&mi->mi_lock);
1717 		kstat_waitq_enter(KSTAT_IO_PTR(mi->mi_io_kstats));
1718 		mutex_exit(&mi->mi_lock);
1719 	}
1720 
1721 	mi->mi_async_req_count++;
1722 	ASSERT(mi->mi_async_req_count != 0);
1723 	cv_signal(&mi->mi_async_reqs_cv);
1724 	mutex_exit(&mi->mi_async_lock);
1725 	return (0);
1726 
1727 noasync:
1728 
1729 	if (curproc == proc_pageout || curproc == proc_fsflush ||
1730 	    nfs_zone() == mi->mi_zone) {
1731 		/*
1732 		 * If we get here in the context of the pageout/fsflush,
1733 		 * or we have run out of memory or we're attempting to
1734 		 * unmount we refuse to do a sync write, because this may
1735 		 * hang pageout/fsflush and the machine. In this case,
1736 		 * we just re-mark the page as dirty and punt on the page.
1737 		 *
1738 		 * Make sure B_FORCE isn't set.  We can re-mark the
1739 		 * pages as dirty and unlock the pages in one swoop by
1740 		 * passing in B_ERROR to pvn_write_done().  However,
1741 		 * we should make sure B_FORCE isn't set - we don't
1742 		 * want the page tossed before it gets written out.
1743 		 */
1744 		if (flags & B_FORCE)
1745 			flags &= ~(B_INVAL | B_FORCE);
1746 		pvn_write_done(pp, flags | B_ERROR);
1747 		return (0);
1748 	}
1749 
1750 	/*
1751 	 * We'll get here only if (nfs_zone() != mi->mi_zone)
1752 	 * which means that this was a cross-zone sync putpage.
1753 	 *
1754 	 * We pass in B_ERROR to pvn_write_done() to re-mark the pages
1755 	 * as dirty and unlock them.
1756 	 *
1757 	 * We don't want to clear B_FORCE here as the caller presumably
1758 	 * knows what they're doing if they set it.
1759 	 */
1760 	pvn_write_done(pp, flags | B_ERROR);
1761 	return (EPERM);
1762 }
1763 
1764 int
1765 nfs4_async_pageio(vnode_t *vp, page_t *pp, u_offset_t io_off, size_t io_len,
1766 	int flags, cred_t *cr, int (*pageio)(vnode_t *, page_t *, u_offset_t,
1767 	size_t, int, cred_t *))
1768 {
1769 	rnode4_t *rp;
1770 	mntinfo4_t *mi;
1771 	struct nfs4_async_reqs *args;
1772 
1773 	ASSERT(flags & B_ASYNC);
1774 	ASSERT(vp->v_vfsp != NULL);
1775 
1776 	rp = VTOR4(vp);
1777 	ASSERT(rp->r_count > 0);
1778 
1779 	mi = VTOMI4(vp);
1780 
1781 	/*
1782 	 * If we can't allocate a request structure, do the pageio
1783 	 * request synchronously in this thread's context.
1784 	 */
1785 	if ((args = kmem_alloc(sizeof (*args), KM_NOSLEEP)) == NULL)
1786 		goto noasync;
1787 
1788 	args->a_next = NULL;
1789 #ifdef DEBUG
1790 	args->a_queuer = curthread;
1791 #endif
1792 	VN_HOLD(vp);
1793 	args->a_vp = vp;
1794 	ASSERT(cr != NULL);
1795 	crhold(cr);
1796 	args->a_cred = cr;
1797 	args->a_io = NFS4_PAGEIO;
1798 	args->a_nfs4_pageio = pageio;
1799 	args->a_nfs4_pp = pp;
1800 	args->a_nfs4_off = io_off;
1801 	args->a_nfs4_len = (uint_t)io_len;
1802 	args->a_nfs4_flags = flags;
1803 
1804 	mutex_enter(&mi->mi_async_lock);
1805 
1806 	/*
1807 	 * If asyncio has been disabled, then make a synchronous request.
1808 	 * This check is done a second time in case async io was diabled
1809 	 * while this thread was blocked waiting for memory pressure to
1810 	 * reduce or for the queue to drain.
1811 	 */
1812 	if (mi->mi_max_threads == 0) {
1813 		mutex_exit(&mi->mi_async_lock);
1814 
1815 		VN_RELE(vp);
1816 		crfree(cr);
1817 		kmem_free(args, sizeof (*args));
1818 		goto noasync;
1819 	}
1820 
1821 	/*
1822 	 * Link request structure into the async list and
1823 	 * wakeup async thread to do the i/o.
1824 	 */
1825 	if (mi->mi_async_reqs[NFS4_PAGEIO] == NULL) {
1826 		mi->mi_async_reqs[NFS4_PAGEIO] = args;
1827 		mi->mi_async_tail[NFS4_PAGEIO] = args;
1828 	} else {
1829 		mi->mi_async_tail[NFS4_PAGEIO]->a_next = args;
1830 		mi->mi_async_tail[NFS4_PAGEIO] = args;
1831 	}
1832 
1833 	mutex_enter(&rp->r_statelock);
1834 	rp->r_count++;
1835 	rp->r_awcount++;
1836 	mutex_exit(&rp->r_statelock);
1837 
1838 	if (mi->mi_io_kstats) {
1839 		mutex_enter(&mi->mi_lock);
1840 		kstat_waitq_enter(KSTAT_IO_PTR(mi->mi_io_kstats));
1841 		mutex_exit(&mi->mi_lock);
1842 	}
1843 
1844 	mi->mi_async_req_count++;
1845 	ASSERT(mi->mi_async_req_count != 0);
1846 	cv_signal(&mi->mi_async_reqs_cv);
1847 	mutex_exit(&mi->mi_async_lock);
1848 	return (0);
1849 
1850 noasync:
1851 	/*
1852 	 * If we can't do it ASYNC, for reads we do nothing (but cleanup
1853 	 * the page list), for writes we do it synchronously, except for
1854 	 * proc_pageout/proc_fsflush as described below.
1855 	 */
1856 	if (flags & B_READ) {
1857 		pvn_read_done(pp, flags | B_ERROR);
1858 		return (0);
1859 	}
1860 
1861 	if (curproc == proc_pageout || curproc == proc_fsflush) {
1862 		/*
1863 		 * If we get here in the context of the pageout/fsflush,
1864 		 * we refuse to do a sync write, because this may hang
1865 		 * pageout/fsflush (and the machine). In this case, we just
1866 		 * re-mark the page as dirty and punt on the page.
1867 		 *
1868 		 * Make sure B_FORCE isn't set.  We can re-mark the
1869 		 * pages as dirty and unlock the pages in one swoop by
1870 		 * passing in B_ERROR to pvn_write_done().  However,
1871 		 * we should make sure B_FORCE isn't set - we don't
1872 		 * want the page tossed before it gets written out.
1873 		 */
1874 		if (flags & B_FORCE)
1875 			flags &= ~(B_INVAL | B_FORCE);
1876 		pvn_write_done(pp, flags | B_ERROR);
1877 		return (0);
1878 	}
1879 
1880 	if (nfs_zone() != mi->mi_zone) {
1881 		/*
1882 		 * So this was a cross-zone sync pageio.  We pass in B_ERROR
1883 		 * to pvn_write_done() to re-mark the pages as dirty and unlock
1884 		 * them.
1885 		 *
1886 		 * We don't want to clear B_FORCE here as the caller presumably
1887 		 * knows what they're doing if they set it.
1888 		 */
1889 		pvn_write_done(pp, flags | B_ERROR);
1890 		return (EPERM);
1891 	}
1892 	return ((*pageio)(vp, pp, io_off, io_len, flags, cr));
1893 }
1894 
1895 void
1896 nfs4_async_readdir(vnode_t *vp, rddir4_cache *rdc, cred_t *cr,
1897 	int (*readdir)(vnode_t *, rddir4_cache *, cred_t *))
1898 {
1899 	rnode4_t *rp;
1900 	mntinfo4_t *mi;
1901 	struct nfs4_async_reqs *args;
1902 
1903 	rp = VTOR4(vp);
1904 	ASSERT(rp->r_freef == NULL);
1905 
1906 	mi = VTOMI4(vp);
1907 
1908 	/*
1909 	 * If we can't allocate a request structure, skip the readdir.
1910 	 */
1911 	if ((args = kmem_alloc(sizeof (*args), KM_NOSLEEP)) == NULL)
1912 		goto noasync;
1913 
1914 	args->a_next = NULL;
1915 #ifdef DEBUG
1916 	args->a_queuer = curthread;
1917 #endif
1918 	VN_HOLD(vp);
1919 	args->a_vp = vp;
1920 	ASSERT(cr != NULL);
1921 	crhold(cr);
1922 	args->a_cred = cr;
1923 	args->a_io = NFS4_READDIR;
1924 	args->a_nfs4_readdir = readdir;
1925 	args->a_nfs4_rdc = rdc;
1926 
1927 	mutex_enter(&mi->mi_async_lock);
1928 
1929 	/*
1930 	 * If asyncio has been disabled, then skip this request
1931 	 */
1932 	if (mi->mi_max_threads == 0) {
1933 		mutex_exit(&mi->mi_async_lock);
1934 
1935 		VN_RELE(vp);
1936 		crfree(cr);
1937 		kmem_free(args, sizeof (*args));
1938 		goto noasync;
1939 	}
1940 
1941 	/*
1942 	 * Link request structure into the async list and
1943 	 * wakeup async thread to do the i/o.
1944 	 */
1945 	if (mi->mi_async_reqs[NFS4_READDIR] == NULL) {
1946 		mi->mi_async_reqs[NFS4_READDIR] = args;
1947 		mi->mi_async_tail[NFS4_READDIR] = args;
1948 	} else {
1949 		mi->mi_async_tail[NFS4_READDIR]->a_next = args;
1950 		mi->mi_async_tail[NFS4_READDIR] = args;
1951 	}
1952 
1953 	mutex_enter(&rp->r_statelock);
1954 	rp->r_count++;
1955 	mutex_exit(&rp->r_statelock);
1956 
1957 	if (mi->mi_io_kstats) {
1958 		mutex_enter(&mi->mi_lock);
1959 		kstat_waitq_enter(KSTAT_IO_PTR(mi->mi_io_kstats));
1960 		mutex_exit(&mi->mi_lock);
1961 	}
1962 
1963 	mi->mi_async_req_count++;
1964 	ASSERT(mi->mi_async_req_count != 0);
1965 	cv_signal(&mi->mi_async_reqs_cv);
1966 	mutex_exit(&mi->mi_async_lock);
1967 	return;
1968 
1969 noasync:
1970 	mutex_enter(&rp->r_statelock);
1971 	rdc->entries = NULL;
1972 	/*
1973 	 * Indicate that no one is trying to fill this entry and
1974 	 * it still needs to be filled.
1975 	 */
1976 	rdc->flags &= ~RDDIR;
1977 	rdc->flags |= RDDIRREQ;
1978 	rddir4_cache_rele(rp, rdc);
1979 	mutex_exit(&rp->r_statelock);
1980 }
1981 
1982 void
1983 nfs4_async_commit(vnode_t *vp, page_t *plist, offset3 offset, count3 count,
1984 	cred_t *cr, void (*commit)(vnode_t *, page_t *, offset3, count3,
1985 	cred_t *))
1986 {
1987 	rnode4_t *rp;
1988 	mntinfo4_t *mi;
1989 	struct nfs4_async_reqs *args;
1990 	page_t *pp;
1991 
1992 	rp = VTOR4(vp);
1993 	mi = VTOMI4(vp);
1994 
1995 	/*
1996 	 * If we can't allocate a request structure, do the commit
1997 	 * operation synchronously in this thread's context.
1998 	 */
1999 	if ((args = kmem_alloc(sizeof (*args), KM_NOSLEEP)) == NULL)
2000 		goto noasync;
2001 
2002 	args->a_next = NULL;
2003 #ifdef DEBUG
2004 	args->a_queuer = curthread;
2005 #endif
2006 	VN_HOLD(vp);
2007 	args->a_vp = vp;
2008 	ASSERT(cr != NULL);
2009 	crhold(cr);
2010 	args->a_cred = cr;
2011 	args->a_io = NFS4_COMMIT;
2012 	args->a_nfs4_commit = commit;
2013 	args->a_nfs4_plist = plist;
2014 	args->a_nfs4_offset = offset;
2015 	args->a_nfs4_count = count;
2016 
2017 	mutex_enter(&mi->mi_async_lock);
2018 
2019 	/*
2020 	 * If asyncio has been disabled, then make a synchronous request.
2021 	 * This check is done a second time in case async io was diabled
2022 	 * while this thread was blocked waiting for memory pressure to
2023 	 * reduce or for the queue to drain.
2024 	 */
2025 	if (mi->mi_max_threads == 0) {
2026 		mutex_exit(&mi->mi_async_lock);
2027 
2028 		VN_RELE(vp);
2029 		crfree(cr);
2030 		kmem_free(args, sizeof (*args));
2031 		goto noasync;
2032 	}
2033 
2034 	/*
2035 	 * Link request structure into the async list and
2036 	 * wakeup async thread to do the i/o.
2037 	 */
2038 	if (mi->mi_async_reqs[NFS4_COMMIT] == NULL) {
2039 		mi->mi_async_reqs[NFS4_COMMIT] = args;
2040 		mi->mi_async_tail[NFS4_COMMIT] = args;
2041 	} else {
2042 		mi->mi_async_tail[NFS4_COMMIT]->a_next = args;
2043 		mi->mi_async_tail[NFS4_COMMIT] = args;
2044 	}
2045 
2046 	mutex_enter(&rp->r_statelock);
2047 	rp->r_count++;
2048 	mutex_exit(&rp->r_statelock);
2049 
2050 	if (mi->mi_io_kstats) {
2051 		mutex_enter(&mi->mi_lock);
2052 		kstat_waitq_enter(KSTAT_IO_PTR(mi->mi_io_kstats));
2053 		mutex_exit(&mi->mi_lock);
2054 	}
2055 
2056 	mi->mi_async_req_count++;
2057 	ASSERT(mi->mi_async_req_count != 0);
2058 	cv_signal(&mi->mi_async_reqs_cv);
2059 	mutex_exit(&mi->mi_async_lock);
2060 	return;
2061 
2062 noasync:
2063 	if (curproc == proc_pageout || curproc == proc_fsflush ||
2064 	    nfs_zone() != mi->mi_zone) {
2065 		while (plist != NULL) {
2066 			pp = plist;
2067 			page_sub(&plist, pp);
2068 			pp->p_fsdata = C_COMMIT;
2069 			page_unlock(pp);
2070 		}
2071 		return;
2072 	}
2073 	(*commit)(vp, plist, offset, count, cr);
2074 }
2075 
2076 /*
2077  * nfs4_async_inactive - hand off a VOP_INACTIVE call to a thread.  The
2078  * reference to the vnode is handed over to the thread; the caller should
2079  * no longer refer to the vnode.
2080  *
2081  * Unlike most of the async routines, this handoff is needed for
2082  * correctness reasons, not just performance.  So doing operations in the
2083  * context of the current thread is not an option.
2084  */
2085 void
2086 nfs4_async_inactive(vnode_t *vp, cred_t *cr)
2087 {
2088 	mntinfo4_t *mi;
2089 	struct nfs4_async_reqs *args;
2090 	boolean_t signal_inactive_thread = B_FALSE;
2091 
2092 	mi = VTOMI4(vp);
2093 
2094 	args = kmem_alloc(sizeof (*args), KM_SLEEP);
2095 	args->a_next = NULL;
2096 #ifdef DEBUG
2097 	args->a_queuer = curthread;
2098 #endif
2099 	args->a_vp = vp;
2100 	ASSERT(cr != NULL);
2101 	crhold(cr);
2102 	args->a_cred = cr;
2103 	args->a_io = NFS4_INACTIVE;
2104 
2105 	/*
2106 	 * Note that we don't check mi->mi_max_threads here, since we
2107 	 * *need* to get rid of this vnode regardless of whether someone
2108 	 * set nfs4_max_threads to zero in /etc/system.
2109 	 *
2110 	 * The manager thread knows about this and is willing to create
2111 	 * at least one thread to accomodate us.
2112 	 */
2113 	mutex_enter(&mi->mi_async_lock);
2114 	if (mi->mi_inactive_thread == NULL) {
2115 		rnode4_t *rp;
2116 		vnode_t *unldvp = NULL;
2117 		char *unlname;
2118 		cred_t *unlcred;
2119 
2120 		mutex_exit(&mi->mi_async_lock);
2121 		/*
2122 		 * We just need to free up the memory associated with the
2123 		 * vnode, which can be safely done from within the current
2124 		 * context.
2125 		 */
2126 		crfree(cr);	/* drop our reference */
2127 		kmem_free(args, sizeof (*args));
2128 		rp = VTOR4(vp);
2129 		mutex_enter(&rp->r_statelock);
2130 		if (rp->r_unldvp != NULL) {
2131 			unldvp = rp->r_unldvp;
2132 			rp->r_unldvp = NULL;
2133 			unlname = rp->r_unlname;
2134 			rp->r_unlname = NULL;
2135 			unlcred = rp->r_unlcred;
2136 			rp->r_unlcred = NULL;
2137 		}
2138 		mutex_exit(&rp->r_statelock);
2139 		/*
2140 		 * No need to explicitly throw away any cached pages.  The
2141 		 * eventual r4inactive() will attempt a synchronous
2142 		 * VOP_PUTPAGE() which will immediately fail since the request
2143 		 * is coming from the wrong zone, and then will proceed to call
2144 		 * nfs4_invalidate_pages() which will clean things up for us.
2145 		 *
2146 		 * Throw away the delegation here so rp4_addfree()'s attempt to
2147 		 * return any existing delegations becomes a no-op.
2148 		 */
2149 		if (rp->r_deleg_type != OPEN_DELEGATE_NONE)
2150 			(void) nfs4delegreturn(rp, NFS4_DR_DISCARD);
2151 		nfs4_clear_open_streams(rp);
2152 
2153 		rp4_addfree(rp, cr);
2154 		if (unldvp != NULL) {
2155 			kmem_free(unlname, MAXNAMELEN);
2156 			VN_RELE(unldvp);
2157 			crfree(unlcred);
2158 		}
2159 		return;
2160 	}
2161 
2162 	if (mi->mi_manager_thread == NULL) {
2163 		/*
2164 		 * We want to talk to the inactive thread.
2165 		 */
2166 		signal_inactive_thread = B_TRUE;
2167 	}
2168 
2169 	/*
2170 	 * Enqueue the vnode and wake up either the special thread (empty
2171 	 * list) or an async thread.
2172 	 */
2173 	if (mi->mi_async_reqs[NFS4_INACTIVE] == NULL) {
2174 		mi->mi_async_reqs[NFS4_INACTIVE] = args;
2175 		mi->mi_async_tail[NFS4_INACTIVE] = args;
2176 		signal_inactive_thread = B_TRUE;
2177 	} else {
2178 		mi->mi_async_tail[NFS4_INACTIVE]->a_next = args;
2179 		mi->mi_async_tail[NFS4_INACTIVE] = args;
2180 	}
2181 	if (signal_inactive_thread) {
2182 		cv_signal(&mi->mi_inact_req_cv);
2183 	} else  {
2184 		mi->mi_async_req_count++;
2185 		ASSERT(mi->mi_async_req_count != 0);
2186 		cv_signal(&mi->mi_async_reqs_cv);
2187 	}
2188 
2189 	mutex_exit(&mi->mi_async_lock);
2190 }
2191 
/*
 * writerp4 - copy data from a uio into a segmap window of an NFSv4 file.
 *
 * Moves up to tcount bytes from uio to the kernel address base, at most
 * one page per uiomove() call, raising rp->r_size and setting R4DIRTY as
 * it goes.  The caller must hold rp->r_rwlock as writer (asserted below).
 *
 *	rp		rnode of the file being written
 *	base		destination address; base plus tcount must stay
 *			within one MAXBSIZE window (asserted below)
 *	tcount		bytes to move; at most MAXBSIZE and at most
 *			uio->uio_resid (asserted below)
 *	uio		source of the data; uio_loffset is the file offset
 *	pgcreated	nonzero if the caller has already created and
 *			mapped the first page at base
 *
 * Returns 0 on success; otherwise the error from uiomove() or, when
 * uiomove() succeeded, from the segmap_fault() call that unlocks a page
 * created in this routine.
 */
2192 int
2193 writerp4(rnode4_t *rp, caddr_t base, int tcount, struct uio *uio, int pgcreated)
2194 {
2195 	int pagecreate;
2196 	int n;
2197 	int saved_n;
2198 	caddr_t saved_base;
2199 	u_offset_t offset;
2200 	int error;
2201 	int sm_error;
2202 
2203 	ASSERT(tcount <= MAXBSIZE && tcount <= uio->uio_resid);
2204 	ASSERT(((uintptr_t)base & MAXBOFFSET) + tcount <= MAXBSIZE);
2205 	ASSERT(nfs_rw_lock_held(&rp->r_rwlock, RW_WRITER));
2206 
2207 	/*
2208 	 * Move bytes in at most PAGESIZE chunks. We must avoid
2209 	 * spanning pages in uiomove() because page faults may cause
2210 	 * the cache to be invalidated out from under us. The r_size is not
2211 	 * updated until after the uiomove. If we push the last page of a
2212 	 * file before r_size is correct, we will lose the data written past
2213 	 * the current (and invalid) r_size.
2214 	 */
2215 	do {
2216 		offset = uio->uio_loffset;
2217 		pagecreate = 0;
2218 
2219 		/*
2220 		 * n is the number of bytes required to satisfy the request
2221 		 *   or the number of bytes to fill out the page.
2222 		 */
2223 		n = (int)MIN((PAGESIZE - ((uintptr_t)base & PAGEOFFSET)),
2224 		    tcount);
2225 
2226 		/*
2227 		 * Check to see if we can skip reading in the page
2228 		 * and just allocate the memory.  We can do this
2229 		 * if we are going to rewrite the entire mapping
2230 		 * or if we are going to write to or beyond the current
2231 		 * end of file from the beginning of the mapping.
2232 		 *
2233 		 * The read of r_size is now protected by r_statelock.
2234 		 */
2235 		mutex_enter(&rp->r_statelock);
2236 		/*
2237 		 * When pgcreated is nonzero the caller has already done
2238 		 * a segmap_getmapflt with forcefault 0 and S_WRITE. With
2239 		 * segkpm this means we already have at least one page
2240 		 * created and mapped at base.
2241 		 */
2242 		pagecreate = pgcreated ||
2243 			(((uintptr_t)base & PAGEOFFSET) == 0 &&
2244 			(n == PAGESIZE || ((offset + n) >= rp->r_size)));
2245 
2246 		mutex_exit(&rp->r_statelock);
2247 
2248 		if (pagecreate) {
2249 			/*
2250 			 * The last argument tells segmap_pagecreate() to
2251 			 * always lock the page, as opposed to sometimes
2252 			 * returning with the page locked. This way we avoid a
2253 			 * fault on the ensuing uiomove(), but also
2254 			 * more importantly (to fix bug 1094402) we can
2255 			 * call segmap_fault() to unlock the page in all
2256 			 * cases. An alternative would be to modify
2257 			 * segmap_pagecreate() to tell us when it is
2258 			 * locking a page, but that's a fairly major
2259 			 * interface change.
2260 			 */
2261 			if (pgcreated == 0)
2262 				(void) segmap_pagecreate(segkmap, base,
2263 							(uint_t)n, 1);
			/*
			 * Remember the pre-copy address and length: base and
			 * n are both modified after the uiomove() below, and
			 * the unlocking segmap_fault() needs the originals.
			 */
2264 			saved_base = base;
2265 			saved_n = n;
2266 		}
2267 
2268 		/*
2269 		 * The number of bytes of data in the last page cannot
2270 		 * be accurately determined while the page is being
2271 		 * uiomove'd to and the size of the file being updated.
2272 		 * Thus, inform threads which need to know accurately
2273 		 * how much data is in the last page of the file.  They
2274 		 * will not do the i/o immediately, but will arrange for
2275 		 * the i/o to happen later when this modify operation
2276 		 * will have finished.
2277 		 */
2278 		ASSERT(!(rp->r_flags & R4MODINPROGRESS));
2279 		mutex_enter(&rp->r_statelock);
2280 		rp->r_flags |= R4MODINPROGRESS;
2281 		rp->r_modaddr = (offset & MAXBMASK);
2282 		mutex_exit(&rp->r_statelock);
2283 
2284 		error = uiomove(base, n, UIO_WRITE, uio);
2285 
2286 		/*
2287 		 * r_size is the maximum number of
2288 		 * bytes known to be in the file.
2289 		 * Make sure it is at least as high as the
2290 		 * first unwritten byte pointed to by uio_loffset.
2291 		 */
2292 		mutex_enter(&rp->r_statelock);
2293 		if (rp->r_size < uio->uio_loffset)
2294 			rp->r_size = uio->uio_loffset;
2295 		rp->r_flags &= ~R4MODINPROGRESS;
2296 		rp->r_flags |= R4DIRTY;
2297 		mutex_exit(&rp->r_statelock);
2298 
2299 		/* n = # of bytes written */
2300 		n = (int)(uio->uio_loffset - offset);
2301 		base += n;
2302 		tcount -= n;
2303 		/*
2304 		 * If we created pages w/o initializing them completely,
2305 		 * we need to zero the part that wasn't set up.
2306 		 * This happens in most EOF write cases and if
2307 		 * we had some sort of error during the uiomove.
2308 		 */
2309 		if (pagecreate) {
2310 			if ((uio->uio_loffset & PAGEOFFSET) || n == 0)
2311 				(void) kzero(base, PAGESIZE - n);
2312 
2313 			if (pgcreated) {
2314 				/*
2315 				 * Caller is responsible for this page,
2316 				 * it was not created in this loop.
2317 				 */
2318 				pgcreated = 0;
2319 			} else {
2320 				/*
2321 				 * For bug 1094402: segmap_pagecreate locks
2322 				 * page. Unlock it. This also unlocks the
2323 				 * pages allocated by page_create_va() in
2324 				 * segmap_pagecreate().
2325 				 */
2326 				sm_error = segmap_fault(kas.a_hat, segkmap,
2327 						saved_base, saved_n,
2328 						F_SOFTUNLOCK, S_WRITE);
				/* A uiomove() error takes precedence. */
2329 				if (error == 0)
2330 					error = sm_error;
2331 			}
2332 		}
2333 	} while (tcount > 0 && error == 0);
2334 
2335 	return (error);
2336 }
2337 
2338 int
2339 nfs4_putpages(vnode_t *vp, u_offset_t off, size_t len, int flags, cred_t *cr)
2340 {
2341 	rnode4_t *rp;
2342 	page_t *pp;
2343 	u_offset_t eoff;
2344 	u_offset_t io_off;
2345 	size_t io_len;
2346 	int error;
2347 	int rdirty;
2348 	int err;
2349 
2350 	rp = VTOR4(vp);
2351 	ASSERT(rp->r_count > 0);
2352 
2353 	if (!nfs4_has_pages(vp))
2354 		return (0);
2355 
2356 	ASSERT(vp->v_type != VCHR);
2357 
2358 	/*
2359 	 * If R4OUTOFSPACE is set, then all writes turn into B_INVAL
2360 	 * writes.  B_FORCE is set to force the VM system to actually
2361 	 * invalidate the pages, even if the i/o failed.  The pages
2362 	 * need to get invalidated because they can't be written out
2363 	 * because there isn't any space left on either the server's
2364 	 * file system or in the user's disk quota.  The B_FREE bit
2365 	 * is cleared to avoid confusion as to whether this is a
2366 	 * request to place the page on the freelist or to destroy
2367 	 * it.
2368 	 */
2369 	if ((rp->r_flags & R4OUTOFSPACE) ||
2370 	    (vp->v_vfsp->vfs_flag & VFS_UNMOUNTED))
2371 		flags = (flags & ~B_FREE) | B_INVAL | B_FORCE;
2372 
2373 	if (len == 0) {
2374 		/*
2375 		 * If doing a full file synchronous operation, then clear
2376 		 * the R4DIRTY bit.  If a page gets dirtied while the flush
2377 		 * is happening, then R4DIRTY will get set again.  The
2378 		 * R4DIRTY bit must get cleared before the flush so that
2379 		 * we don't lose this information.
2380 		 */
2381 		if (off == (u_offset_t)0 &&
2382 		    !(flags & B_ASYNC) &&
2383 		    (rp->r_flags & R4DIRTY)) {
2384 			mutex_enter(&rp->r_statelock);
2385 			rdirty = (rp->r_flags & R4DIRTY);
2386 			rp->r_flags &= ~R4DIRTY;
2387 			mutex_exit(&rp->r_statelock);
2388 		} else
2389 			rdirty = 0;
2390 
2391 		/*
2392 		 * Search the entire vp list for pages >= off, and flush
2393 		 * the dirty pages.
2394 		 */
2395 		error = pvn_vplist_dirty(vp, off, rp->r_putapage,
2396 					flags, cr);
2397 
2398 		/*
2399 		 * If an error occured and the file was marked as dirty
2400 		 * before and we aren't forcibly invalidating pages, then
2401 		 * reset the R4DIRTY flag.
2402 		 */
2403 		if (error && rdirty &&
2404 		    (flags & (B_INVAL | B_FORCE)) != (B_INVAL | B_FORCE)) {
2405 			mutex_enter(&rp->r_statelock);
2406 			rp->r_flags |= R4DIRTY;
2407 			mutex_exit(&rp->r_statelock);
2408 		}
2409 	} else {
2410 		/*
2411 		 * Do a range from [off...off + len) looking for pages
2412 		 * to deal with.
2413 		 */
2414 		error = 0;
2415 		io_len = 0;
2416 		eoff = off + len;
2417 		mutex_enter(&rp->r_statelock);
2418 		for (io_off = off; io_off < eoff && io_off < rp->r_size;
2419 		    io_off += io_len) {
2420 			mutex_exit(&rp->r_statelock);
2421 			/*
2422 			 * If we are not invalidating, synchronously
2423 			 * freeing or writing pages use the routine
2424 			 * page_lookup_nowait() to prevent reclaiming
2425 			 * them from the free list.
2426 			 */
2427 			if ((flags & B_INVAL) || !(flags & B_ASYNC)) {
2428 				pp = page_lookup(vp, io_off,
2429 				    (flags & (B_INVAL | B_FREE)) ?
2430 				    SE_EXCL : SE_SHARED);
2431 			} else {
2432 				pp = page_lookup_nowait(vp, io_off,
2433 				    (flags & B_FREE) ? SE_EXCL : SE_SHARED);
2434 			}
2435 
2436 			if (pp == NULL || !pvn_getdirty(pp, flags))
2437 				io_len = PAGESIZE;
2438 			else {
2439 				err = (*rp->r_putapage)(vp, pp, &io_off,
2440 				    &io_len, flags, cr);
2441 				if (!error)
2442 					error = err;
2443 				/*
2444 				 * "io_off" and "io_len" are returned as
2445 				 * the range of pages we actually wrote.
2446 				 * This allows us to skip ahead more quickly
2447 				 * since several pages may've been dealt
2448 				 * with by this iteration of the loop.
2449 				 */
2450 			}
2451 			mutex_enter(&rp->r_statelock);
2452 		}
2453 		mutex_exit(&rp->r_statelock);
2454 	}
2455 
2456 	return (error);
2457 }
2458 
2459 void
2460 nfs4_invalidate_pages(vnode_t *vp, u_offset_t off, cred_t *cr)
2461 {
2462 	rnode4_t *rp;
2463 
2464 	rp = VTOR4(vp);
2465 	if (IS_SHADOW(vp, rp))
2466 		vp = RTOV4(rp);
2467 	mutex_enter(&rp->r_statelock);
2468 	while (rp->r_flags & R4TRUNCATE)
2469 		cv_wait(&rp->r_cv, &rp->r_statelock);
2470 	rp->r_flags |= R4TRUNCATE;
2471 	if (off == (u_offset_t)0) {
2472 		rp->r_flags &= ~R4DIRTY;
2473 		if (!(rp->r_flags & R4STALE))
2474 			rp->r_error = 0;
2475 	}
2476 	rp->r_truncaddr = off;
2477 	mutex_exit(&rp->r_statelock);
2478 	(void) pvn_vplist_dirty(vp, off, rp->r_putapage,
2479 		B_INVAL | B_TRUNC, cr);
2480 	mutex_enter(&rp->r_statelock);
2481 	rp->r_flags &= ~R4TRUNCATE;
2482 	cv_broadcast(&rp->r_cv);
2483 	mutex_exit(&rp->r_statelock);
2484 }
2485 
2486 static int
2487 nfs4_mnt_kstat_update(kstat_t *ksp, int rw)
2488 {
2489 	mntinfo4_t *mi;
2490 	struct mntinfo_kstat *mik;
2491 	vfs_t *vfsp;
2492 
2493 	/* this is a read-only kstat. Bail out on a write */
2494 	if (rw == KSTAT_WRITE)
2495 		return (EACCES);
2496 
2497 
2498 	/*
2499 	 * We don't want to wait here as kstat_chain_lock could be held by
2500 	 * dounmount(). dounmount() takes vfs_reflock before the chain lock
2501 	 * and thus could lead to a deadlock.
2502 	 */
2503 	vfsp = (struct vfs *)ksp->ks_private;
2504 
2505 	mi = VFTOMI4(vfsp);
2506 	mik = (struct mntinfo_kstat *)ksp->ks_data;
2507 
2508 	(void) strcpy(mik->mik_proto, mi->mi_curr_serv->sv_knconf->knc_proto);
2509 
2510 	mik->mik_vers = (uint32_t)mi->mi_vers;
2511 	mik->mik_flags = mi->mi_flags;
2512 	/*
2513 	 * The sv_secdata holds the flavor the client specifies.
2514 	 * If the client uses default and a security negotiation
2515 	 * occurs, sv_currsec will point to the current flavor
2516 	 * selected from the server flavor list.
2517 	 * sv_currsec is NULL if no security negotiation takes place.
2518 	 */
2519 	mik->mik_secmod = mi->mi_curr_serv->sv_currsec ?
2520 			mi->mi_curr_serv->sv_currsec->secmod :
2521 			mi->mi_curr_serv->sv_secdata->secmod;
2522 	mik->mik_curread = (uint32_t)mi->mi_curread;
2523 	mik->mik_curwrite = (uint32_t)mi->mi_curwrite;
2524 	mik->mik_retrans = mi->mi_retrans;
2525 	mik->mik_timeo = mi->mi_timeo;
2526 	mik->mik_acregmin = HR2SEC(mi->mi_acregmin);
2527 	mik->mik_acregmax = HR2SEC(mi->mi_acregmax);
2528 	mik->mik_acdirmin = HR2SEC(mi->mi_acdirmin);
2529 	mik->mik_acdirmax = HR2SEC(mi->mi_acdirmax);
2530 	mik->mik_noresponse = (uint32_t)mi->mi_noresponse;
2531 	mik->mik_failover = (uint32_t)mi->mi_failover;
2532 	mik->mik_remap = (uint32_t)mi->mi_remap;
2533 
2534 	(void) strcpy(mik->mik_curserver, mi->mi_curr_serv->sv_hostname);
2535 
2536 	return (0);
2537 }
2538 
2539 void
2540 nfs4_mnt_kstat_init(struct vfs *vfsp)
2541 {
2542 	mntinfo4_t *mi = VFTOMI4(vfsp);
2543 
2544 	/*
2545 	 * PSARC 2001/697 Contract Private Interface
2546 	 * All nfs kstats are under SunMC contract
2547 	 * Please refer to the PSARC listed above and contact
2548 	 * SunMC before making any changes!
2549 	 *
2550 	 * Changes must be reviewed by Solaris File Sharing
2551 	 * Changes must be communicated to contract-2001-697@sun.com
2552 	 *
2553 	 */
2554 
2555 	mi->mi_io_kstats = kstat_create_zone("nfs", getminor(vfsp->vfs_dev),
2556 	    NULL, "nfs", KSTAT_TYPE_IO, 1, 0, mi->mi_zone->zone_id);
2557 	if (mi->mi_io_kstats) {
2558 		if (mi->mi_zone->zone_id != GLOBAL_ZONEID)
2559 			kstat_zone_add(mi->mi_io_kstats, GLOBAL_ZONEID);
2560 		mi->mi_io_kstats->ks_lock = &mi->mi_lock;
2561 		kstat_install(mi->mi_io_kstats);
2562 	}
2563 
2564 	if ((mi->mi_ro_kstats = kstat_create_zone("nfs",
2565 	    getminor(vfsp->vfs_dev), "mntinfo", "misc", KSTAT_TYPE_RAW,
2566 	    sizeof (struct mntinfo_kstat), 0, mi->mi_zone->zone_id)) != NULL) {
2567 		if (mi->mi_zone->zone_id != GLOBAL_ZONEID)
2568 			kstat_zone_add(mi->mi_ro_kstats, GLOBAL_ZONEID);
2569 		mi->mi_ro_kstats->ks_update = nfs4_mnt_kstat_update;
2570 		mi->mi_ro_kstats->ks_private = (void *)vfsp;
2571 		kstat_install(mi->mi_ro_kstats);
2572 	}
2573 
2574 	nfs4_mnt_recov_kstat_init(vfsp);
2575 }
2576 
2577 void
2578 nfs4_write_error(vnode_t *vp, int error, cred_t *cr)
2579 {
2580 	mntinfo4_t *mi;
2581 
2582 	mi = VTOMI4(vp);
2583 	/*
2584 	 * In case of forced unmount, do not print any messages
2585 	 * since it can flood the console with error messages.
2586 	 */
2587 	if (mi->mi_vfsp->vfs_flag & VFS_UNMOUNTED)
2588 		return;
2589 
2590 	/*
2591 	 * If the mount point is dead, not recoverable, do not
2592 	 * print error messages that can flood the console.
2593 	 */
2594 	if (mi->mi_flags & MI4_RECOV_FAIL)
2595 		return;
2596 
2597 	/*
2598 	 * No use in flooding the console with ENOSPC
2599 	 * messages from the same file system.
2600 	 */
2601 	if ((error != ENOSPC && error != EDQUOT) ||
2602 	    lbolt - mi->mi_printftime > 0) {
2603 		zoneid_t zoneid = mi->mi_zone->zone_id;
2604 
2605 #ifdef DEBUG
2606 		nfs_perror(error, "NFS%ld write error on host %s: %m.\n",
2607 		    mi->mi_vers, VTOR4(vp)->r_server->sv_hostname, NULL);
2608 #else
2609 		nfs_perror(error, "NFS write error on host %s: %m.\n",
2610 		    VTOR4(vp)->r_server->sv_hostname, NULL);
2611 #endif
2612 		if (error == ENOSPC || error == EDQUOT) {
2613 			zcmn_err(zoneid, CE_CONT,
2614 			    "^File: userid=%d, groupid=%d\n",
2615 			    crgetuid(cr), crgetgid(cr));
2616 			if (crgetuid(curthread->t_cred) != crgetuid(cr) ||
2617 			    crgetgid(curthread->t_cred) != crgetgid(cr)) {
2618 				zcmn_err(zoneid, CE_CONT,
2619 				    "^User: userid=%d, groupid=%d\n",
2620 				    crgetuid(curthread->t_cred),
2621 				    crgetgid(curthread->t_cred));
2622 			}
2623 			mi->mi_printftime = lbolt +
2624 			    nfs_write_error_interval * hz;
2625 		}
2626 		sfh4_printfhandle(VTOR4(vp)->r_fh);
2627 #ifdef DEBUG
2628 		if (error == EACCES) {
2629 			zcmn_err(zoneid, CE_CONT,
2630 			    "nfs_bio: cred is%s kcred\n",
2631 			    cr == kcred ? "" : " not");
2632 		}
2633 #endif
2634 	}
2635 }
2636 
2637 /*
2638  * Return non-zero if the given file can be safely memory mapped.  Locks
2639  * are safe if whole-file (length and offset are both zero).
2640  */
2641 
2642 #define	SAFE_LOCK(flk)	((flk).l_start == 0 && (flk).l_len == 0)
2643 
2644 static int
2645 nfs4_safemap(const vnode_t *vp)
2646 {
2647 	locklist_t	*llp, *next_llp;
2648 	int		safe = 1;
2649 	rnode4_t	*rp = VTOR4(vp);
2650 
2651 	ASSERT(nfs_rw_lock_held(&rp->r_lkserlock, RW_WRITER));
2652 
2653 	NFS4_DEBUG(nfs4_client_map_debug, (CE_NOTE, "nfs4_safemap: "
2654 		"vp = %p", (void *)vp));
2655 
2656 	/*
2657 	 * Review all the locks for the vnode, both ones that have been
2658 	 * acquired and ones that are pending.  We assume that
2659 	 * flk_active_locks_for_vp() has merged any locks that can be
2660 	 * merged (so that if a process has the entire file locked, it is
2661 	 * represented as a single lock).
2662 	 *
2663 	 * Note that we can't bail out of the loop if we find a non-safe
2664 	 * lock, because we have to free all the elements in the llp list.
2665 	 * We might be able to speed up this code slightly by not looking
2666 	 * at each lock's l_start and l_len fields once we've found a
2667 	 * non-safe lock.
2668 	 */
2669 
2670 	llp = flk_active_locks_for_vp(vp);
2671 	while (llp) {
2672 		NFS4_DEBUG(nfs4_client_map_debug, (CE_NOTE,
2673 		    "nfs4_safemap: active lock (%" PRId64 ", %" PRId64 ")",
2674 		    llp->ll_flock.l_start, llp->ll_flock.l_len));
2675 		if (!SAFE_LOCK(llp->ll_flock)) {
2676 			safe = 0;
2677 			NFS4_DEBUG(nfs4_client_map_debug, (CE_NOTE,
2678 			    "nfs4_safemap: unsafe active lock (%" PRId64
2679 			    ", %" PRId64 ")", llp->ll_flock.l_start,
2680 			    llp->ll_flock.l_len));
2681 		}
2682 		next_llp = llp->ll_next;
2683 		VN_RELE(llp->ll_vp);
2684 		kmem_free(llp, sizeof (*llp));
2685 		llp = next_llp;
2686 	}
2687 
2688 	NFS4_DEBUG(nfs4_client_map_debug, (CE_NOTE, "nfs4_safemap: %s",
2689 		safe ? "safe" : "unsafe"));
2690 	return (safe);
2691 }
2692 
2693 /*
2694  * Return whether there is a lost LOCK or LOCKU queued up for the given
2695  * file that would make an mmap request unsafe.  cf. nfs4_safemap().
2696  */
2697 
2698 bool_t
2699 nfs4_map_lost_lock_conflict(vnode_t *vp)
2700 {
2701 	bool_t conflict = FALSE;
2702 	nfs4_lost_rqst_t *lrp;
2703 	mntinfo4_t *mi = VTOMI4(vp);
2704 
2705 	mutex_enter(&mi->mi_lock);
2706 	for (lrp = list_head(&mi->mi_lost_state); lrp != NULL;
2707 	    lrp = list_next(&mi->mi_lost_state, lrp)) {
2708 		if (lrp->lr_op != OP_LOCK && lrp->lr_op != OP_LOCKU)
2709 			continue;
2710 		ASSERT(lrp->lr_vp != NULL);
2711 		if (!VOP_CMP(lrp->lr_vp, vp))
2712 			continue;	/* different file */
2713 		if (!SAFE_LOCK(*lrp->lr_flk)) {
2714 			conflict = TRUE;
2715 			break;
2716 		}
2717 	}
2718 
2719 	mutex_exit(&mi->mi_lock);
2720 	return (conflict);
2721 }
2722 
2723 /*
2724  * nfs_lockcompletion:
2725  *
2726  * If the vnode has a lock that makes it unsafe to cache the file, mark it
2727  * as non cachable (set VNOCACHE bit).
2728  */
2729 
2730 void
2731 nfs4_lockcompletion(vnode_t *vp, int cmd)
2732 {
2733 	rnode4_t *rp = VTOR4(vp);
2734 
2735 	ASSERT(nfs_rw_lock_held(&rp->r_lkserlock, RW_WRITER));
2736 	ASSERT(!IS_SHADOW(vp, rp));
2737 
2738 	if (cmd == F_SETLK || cmd == F_SETLKW) {
2739 
2740 		if (!nfs4_safemap(vp)) {
2741 			mutex_enter(&vp->v_lock);
2742 			vp->v_flag |= VNOCACHE;
2743 			mutex_exit(&vp->v_lock);
2744 		} else {
2745 			mutex_enter(&vp->v_lock);
2746 			vp->v_flag &= ~VNOCACHE;
2747 			mutex_exit(&vp->v_lock);
2748 		}
2749 	}
2750 	/*
2751 	 * The cached attributes of the file are stale after acquiring
2752 	 * the lock on the file. They were updated when the file was
2753 	 * opened, but not updated when the lock was acquired. Therefore the
2754 	 * cached attributes are invalidated after the lock is obtained.
2755 	 */
2756 	PURGE_ATTRCACHE4(vp);
2757 }
2758 
2759 /* ARGSUSED */
2760 static void *
2761 nfs4_mi_init(zoneid_t zoneid)
2762 {
2763 	struct mi4_globals *mig;
2764 
2765 	mig = kmem_alloc(sizeof (*mig), KM_SLEEP);
2766 	mutex_init(&mig->mig_lock, NULL, MUTEX_DEFAULT, NULL);
2767 	list_create(&mig->mig_list, sizeof (mntinfo4_t),
2768 	    offsetof(mntinfo4_t, mi_zone_node));
2769 	mig->mig_destructor_called = B_FALSE;
2770 	return (mig);
2771 }
2772 
2773 /*
2774  * Callback routine to tell all NFSv4 mounts in the zone to start tearing down
2775  * state and killing off threads.
2776  */
2777 /* ARGSUSED */
2778 static void
2779 nfs4_mi_shutdown(zoneid_t zoneid, void *data)
2780 {
2781 	struct mi4_globals *mig = data;
2782 	mntinfo4_t *mi;
2783 	nfs4_server_t *np;
2784 
2785 	NFS4_DEBUG(nfs4_client_zone_debug, (CE_NOTE,
2786 	    "nfs4_mi_shutdown zone %d\n", zoneid));
2787 	ASSERT(mig != NULL);
2788 again:
2789 	mutex_enter(&mig->mig_lock);
2790 	for (mi = list_head(&mig->mig_list); mi != NULL;
2791 	    mi = list_next(&mig->mig_list, mi)) {
2792 		/*
2793 		 * If we've done the shutdown work for this FS, skip.
2794 		 * Once we go off the end of the list, we're done.
2795 		 */
2796 		if (mi->mi_flags & MI4_DEAD)
2797 			continue;
2798 
2799 		/*
2800 		 * We will do work, so not done.  Get a hold on the FS.
2801 		 */
2802 		NFS4_DEBUG(nfs4_client_zone_debug, (CE_NOTE,
2803 		    "nfs4_mi_shutdown stopping vfs %p\n", (void *)mi->mi_vfsp));
2804 		VFS_HOLD(mi->mi_vfsp);
2805 
2806 		/*
2807 		 * purge the DNLC for this filesystem
2808 		 */
2809 		(void) dnlc_purge_vfsp(mi->mi_vfsp, 0);
2810 
2811 		mutex_enter(&mi->mi_async_lock);
2812 		/*
2813 		 * Tell existing async worker threads to exit.
2814 		 */
2815 		mi->mi_max_threads = 0;
2816 		cv_broadcast(&mi->mi_async_work_cv);
2817 		/*
2818 		 * Set the appropriate flags so both the async manager and the
2819 		 * inactive thread start getting ready to exit when they're done
2820 		 * with their current work.
2821 		 */
2822 		mutex_enter(&mi->mi_lock);
2823 		mi->mi_flags |= (MI4_ASYNC_MGR_STOP|MI4_DEAD);
2824 		mutex_exit(&mi->mi_lock);
2825 		/*
2826 		 * Wake up async manager thread.  When it is done it will wake
2827 		 * up the inactive thread which will then exit.
2828 		 */
2829 		cv_broadcast(&mi->mi_async_reqs_cv);
2830 		mutex_exit(&mi->mi_async_lock);
2831 
2832 		/*
2833 		 * Drop lock and release FS, which may change list, then repeat.
2834 		 * We're done when every mi has been done or the list is empty.
2835 		 */
2836 		mutex_exit(&mig->mig_lock);
2837 		VFS_RELE(mi->mi_vfsp);
2838 		goto again;
2839 	}
2840 	mutex_exit(&mig->mig_lock);
2841 	/*
2842 	 * Tell each renew thread in the zone to exit
2843 	 */
2844 	mutex_enter(&nfs4_server_lst_lock);
2845 	for (np = nfs4_server_lst.forw; np != &nfs4_server_lst; np = np->forw) {
2846 		mutex_enter(&np->s_lock);
2847 		if (np->zoneid == zoneid) {
2848 			/*
2849 			 * We add another hold onto the nfs4_server_t
2850 			 * because this will make sure tha the nfs4_server_t
2851 			 * stays around until nfs4_callback_fini_zone destroys
2852 			 * the zone. This way, the renew thread can
2853 			 * unconditionally release its holds on the
2854 			 * nfs4_server_t.
2855 			 */
2856 			np->s_refcnt++;
2857 			nfs4_mark_srv_dead(np);
2858 		}
2859 		mutex_exit(&np->s_lock);
2860 	}
2861 	mutex_exit(&nfs4_server_lst_lock);
2862 }
2863 
2864 static void
2865 nfs4_mi_free_globals(struct mi4_globals *mig)
2866 {
2867 	list_destroy(&mig->mig_list);	/* makes sure the list is empty */
2868 	mutex_destroy(&mig->mig_lock);
2869 	kmem_free(mig, sizeof (*mig));
2870 
2871 }
2872 
2873 /* ARGSUSED */
2874 static void
2875 nfs4_mi_destroy(zoneid_t zoneid, void *data)
2876 {
2877 	struct mi4_globals *mig = data;
2878 
2879 	NFS4_DEBUG(nfs4_client_zone_debug, (CE_NOTE,
2880 	    "nfs4_mi_destroy zone %d\n", zoneid));
2881 	ASSERT(mig != NULL);
2882 	mutex_enter(&mig->mig_lock);
2883 	if (list_head(&mig->mig_list) != NULL) {
2884 		/* Still waiting for VFS_FREEVFS() */
2885 		mig->mig_destructor_called = B_TRUE;
2886 		mutex_exit(&mig->mig_lock);
2887 		return;
2888 	}
2889 	nfs4_mi_free_globals(mig);
2890 }
2891 
2892 /*
2893  * Add an NFS mount to the per-zone list of NFS mounts.
2894  */
2895 void
2896 nfs4_mi_zonelist_add(mntinfo4_t *mi)
2897 {
2898 	struct mi4_globals *mig;
2899 
2900 	mig = zone_getspecific(mi4_list_key, mi->mi_zone);
2901 	mutex_enter(&mig->mig_lock);
2902 	list_insert_head(&mig->mig_list, mi);
2903 	mutex_exit(&mig->mig_lock);
2904 }
2905 
2906 /*
2907  * Remove an NFS mount from the per-zone list of NFS mounts.
2908  */
2909 static void
2910 nfs4_mi_zonelist_remove(mntinfo4_t *mi)
2911 {
2912 	struct mi4_globals *mig;
2913 
2914 	mig = zone_getspecific(mi4_list_key, mi->mi_zone);
2915 	mutex_enter(&mig->mig_lock);
2916 	list_remove(&mig->mig_list, mi);
2917 	/*
2918 	 * We can be called asynchronously by VFS_FREEVFS() after the zone
2919 	 * shutdown/destroy callbacks have executed; if so, clean up the zone's
2920 	 * mi globals.
2921 	 */
2922 	if (list_head(&mig->mig_list) == NULL &&
2923 	    mig->mig_destructor_called == B_TRUE) {
2924 		nfs4_mi_free_globals(mig);
2925 		return;
2926 	}
2927 	mutex_exit(&mig->mig_lock);
2928 }
2929 
/*
 * nfs_free_mi4 - tear down and free a mntinfo4_t.
 *
 * Marks the mount MI4_DEAD, wakes the inactive thread and waits for it
 * to exit, waits until mi_in_recovery drops to zero, and then destroys
 * everything hanging off "mi" (debug messages, file handles, locks,
 * condition variables, open owners, lists) before removing it from the
 * zone list and freeing it.
 *
 * When the caller is the inactive thread itself, the routine only clears
 * mi->mi_inactive_thread and returns; nothing is freed in that case.
 */
2930 void
2931 nfs_free_mi4(mntinfo4_t *mi)
2932 {
2933 	nfs4_open_owner_t	*foop;
2934 	nfs4_oo_hash_bucket_t	*bucketp;
2935 	nfs4_debug_msg_t	*msgp;
2936 	int i;
2937 
2938 	/*
2939 	 * Tell the thread for over the wire inactive calls to exit.
2940 	 *
2941 	 * By the time we get here the last VFS_RELE() has already been called,
2942 	 * or this is an aborted mount; in either case the async manager thread
2943 	 * should be dead by now.  The recovery thread has called recov_done(),
2944 	 * but may not have exited yet.
2945 	 */
2946 	mutex_enter(&mi->mi_lock);
2947 	ASSERT(mi->mi_recovthread == NULL);
2948 	ASSERT(mi->mi_flags & MI4_ASYNC_MGR_STOP);
2949 	mi->mi_flags |= MI4_DEAD;
2950 	mutex_exit(&mi->mi_lock);
2951 
2952 	mutex_enter(&mi->mi_async_lock);
2953 	ASSERT(mi->mi_threads == 0);
2954 	ASSERT(mi->mi_manager_thread == NULL);
2955 
2956 	/*
2957 	 * If we are the inactive thread, clear mi_inactive_thread
2958 	 * and return. The inactive thread will detect MI4_DEAD
2959 	 * and call nfs_free_mi4 directly so that the cleanup and
2960 	 * thread exit can occur.
2961 	 */
2962 	if (mi->mi_inactive_thread == curthread) {
2963 		mi->mi_inactive_thread = NULL;
2964 		mutex_exit(&mi->mi_async_lock);
2965 		return;
2966 	}
2967 
2968 	/*
2969 	 * Wake up the inactive thread.
2970 	 */
2971 	cv_signal(&mi->mi_inact_req_cv);
2972 
2973 	/*
2974 	 * Wait for the inactive thread to exit.
2975 	 */
2976 	while (mi->mi_inactive_thread != NULL) {
2977 		cv_wait(&mi->mi_async_cv, &mi->mi_async_lock);
2978 	}
2979 
2980 	mutex_exit(&mi->mi_async_lock);
2981 
2982 	/*
2983 	 * Wait for the recovery thread to complete, that is, it will signal
2984 	 * when it is done using the "mi" structure and about to exit.
2985 	 */
2986 	mutex_enter(&mi->mi_lock);
2987 	while (mi->mi_in_recovery > 0)
2988 		cv_wait(&mi->mi_cv_in_recov, &mi->mi_lock);
2989 	mutex_exit(&mi->mi_lock);
2990 
	/*
	 * Free every queued debug message.  The assignment in the loop
	 * condition is intentional: keep taking the head until the list
	 * is empty.
	 */
2991 	mutex_enter(&mi->mi_msg_list_lock);
2992 	while (msgp = list_head(&mi->mi_msg_list)) {
2993 		list_remove(&mi->mi_msg_list, msgp);
2994 		nfs4_free_msg(msgp);
2995 	}
2996 	mutex_exit(&mi->mi_msg_list_lock);
2997 	list_destroy(&mi->mi_msg_list);
2998 
	/* Drop the file handle references held by the mount, if any. */
2999 	if (mi->mi_rootfh != NULL)
3000 		sfh4_rele(&mi->mi_rootfh);
3001 	if (mi->mi_srvparentfh != NULL)
3002 		sfh4_rele(&mi->mi_srvparentfh);
3003 
	/* Destroy the mount's mutexes, rwlocks and condition variables. */
3004 	mutex_destroy(&mi->mi_lock);
3005 	mutex_destroy(&mi->mi_async_lock);
3006 	mutex_destroy(&mi->mi_msg_list_lock);
3007 	nfs_rw_destroy(&mi->mi_recovlock);
3008 	nfs_rw_destroy(&mi->mi_rename_lock);
3009 	nfs_rw_destroy(&mi->mi_fh_lock);
3010 	cv_destroy(&mi->mi_failover_cv);
3011 	cv_destroy(&mi->mi_async_reqs_cv);
3012 	cv_destroy(&mi->mi_async_work_cv);
3013 	cv_destroy(&mi->mi_async_cv);
3014 	cv_destroy(&mi->mi_inact_req_cv);
3015 
3016 	/*
3017 	 * Destroy the oo hash lists and mutexes for the cred hash table.
3018 	 */
3019 	for (i = 0; i < NFS4_NUM_OO_BUCKETS; i++) {
3020 		bucketp = &(mi->mi_oo_list[i]);
3021 		/* Destroy any remaining open owners on the list */
3022 		foop = list_head(&bucketp->b_oo_hash_list);
3023 		while (foop != NULL) {
3024 			list_remove(&bucketp->b_oo_hash_list, foop);
3025 			nfs4_destroy_open_owner(foop);
3026 			foop = list_head(&bucketp->b_oo_hash_list);
3027 		}
3028 		list_destroy(&bucketp->b_oo_hash_list);
3029 		mutex_destroy(&bucketp->b_lock);
3030 	}
3031 
3032 	/*
3033 	 * Empty and destroy the freed open owner list.
3034 	 */
3035 	foop = list_head(&mi->mi_foo_list);
3036 	while (foop != NULL) {
3037 		list_remove(&mi->mi_foo_list, foop);
3038 		nfs4_destroy_open_owner(foop);
3039 		foop = list_head(&mi->mi_foo_list);
3040 	}
3041 
	/*
	 * Destroy the remaining containers, then unlink the mount from its
	 * zone's list and drop the zone hold before freeing "mi" itself.
	 */
3042 	list_destroy(&mi->mi_foo_list);
3043 	list_destroy(&mi->mi_bseqid_list);
3044 	list_destroy(&mi->mi_lost_state);
3045 	avl_destroy(&mi->mi_filehandles);
3046 	fn_rele(&mi->mi_fname);
3047 	nfs4_mi_zonelist_remove(mi);
3048 	zone_rele(mi->mi_zone);
3049 
3050 	kmem_free(mi, sizeof (*mi));
3051 }
3052 
3053 vnode_t    nfs4_xattr_notsupp_vnode;
3054 
3055 void
3056 nfs4_clnt_init(void)
3057 {
3058 	nfs4_vnops_init();
3059 	(void) nfs4_rnode_init();
3060 	(void) nfs4_shadow_init();
3061 	(void) nfs4_acache_init();
3062 	(void) nfs4_subr_init();
3063 	nfs4_acl_init();
3064 	nfs_idmap_init();
3065 	nfs4_callback_init();
3066 	nfs4_secinfo_init();
3067 #ifdef	DEBUG
3068 	tsd_create(&nfs4_tsd_key, NULL);
3069 #endif
3070 
3071 	/*
3072 	 * Add a CPR callback so that we can update client
3073 	 * lease after a suspend and resume.
3074 	 */
3075 	cid = callb_add(nfs4_client_cpr_callb, 0, CB_CL_CPR_RPC, "nfs4");
3076 
3077 	zone_key_create(&mi4_list_key, nfs4_mi_init, nfs4_mi_shutdown,
3078 	    nfs4_mi_destroy);
3079 
3080 	/*
3081 	 * Initialise the reference count of the notsupp xattr cache vnode to 1
3082 	 * so that it never goes away (VOP_INACTIVE isn't called on it).
3083 	 */
3084 	nfs4_xattr_notsupp_vnode.v_count = 1;
3085 }
3086 
3087 void
3088 nfs4_clnt_fini(void)
3089 {
3090 	(void) zone_key_delete(mi4_list_key);
3091 	nfs4_vnops_fini();
3092 	(void) nfs4_rnode_fini();
3093 	(void) nfs4_shadow_fini();
3094 	(void) nfs4_acache_fini();
3095 	(void) nfs4_subr_fini();
3096 	nfs_idmap_fini();
3097 	nfs4_callback_fini();
3098 	nfs4_secinfo_fini();
3099 #ifdef	DEBUG
3100 	tsd_destroy(&nfs4_tsd_key);
3101 #endif
3102 	if (cid)
3103 		(void) callb_delete(cid);
3104 }
3105 
3106 /*ARGSUSED*/
3107 static boolean_t
3108 nfs4_client_cpr_callb(void *arg, int code)
3109 {
3110 	/*
3111 	 * We get called for Suspend and Resume events.
3112 	 * For the suspend case we simply don't care!
3113 	 */
3114 	if (code == CB_CODE_CPR_CHKPT) {
3115 		return (B_TRUE);
3116 	}
3117 
3118 	/*
3119 	 * When we get to here we are in the process of
3120 	 * resuming the system from a previous suspend.
3121 	 */
3122 	nfs4_client_resumed = gethrestime_sec();
3123 	return (B_TRUE);
3124 }
3125 
3126 void
3127 nfs4_renew_lease_thread(nfs4_server_t *sp)
3128 {
3129 	int	error = 0;
3130 	time_t	tmp_last_renewal_time, tmp_time, tmp_now_time, kip_secs;
3131 	clock_t	tick_delay = 0;
3132 	clock_t time_left = 0;
3133 	callb_cpr_t cpr_info;
3134 	kmutex_t cpr_lock;
3135 
3136 	NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE,
3137 		"nfs4_renew_lease_thread: acting on sp 0x%p", (void*)sp));
3138 	mutex_init(&cpr_lock, NULL, MUTEX_DEFAULT, NULL);
3139 	CALLB_CPR_INIT(&cpr_info, &cpr_lock, callb_generic_cpr, "nfsv4Lease");
3140 
3141 	mutex_enter(&sp->s_lock);
3142 	/* sp->s_lease_time is set via a GETATTR */
3143 	sp->last_renewal_time = gethrestime_sec();
3144 	sp->lease_valid = NFS4_LEASE_UNINITIALIZED;
3145 	ASSERT(sp->s_refcnt >= 1);
3146 
3147 	for (;;) {
3148 		if (!sp->state_ref_count ||
3149 			sp->lease_valid != NFS4_LEASE_VALID) {
3150 
3151 			kip_secs = MAX((sp->s_lease_time >> 1) -
3152 				(3 * sp->propagation_delay.tv_sec), 1);
3153 
3154 			tick_delay = SEC_TO_TICK(kip_secs);
3155 
3156 			NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE,
3157 				"nfs4_renew_lease_thread: no renew : thread "
3158 				"wait %ld secs", kip_secs));
3159 
3160 			NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE,
3161 				"nfs4_renew_lease_thread: no renew : "
3162 				"state_ref_count %d, lease_valid %d",
3163 				sp->state_ref_count, sp->lease_valid));
3164 
3165 			mutex_enter(&cpr_lock);
3166 			CALLB_CPR_SAFE_BEGIN(&cpr_info);
3167 			mutex_exit(&cpr_lock);
3168 			time_left = cv_timedwait(&sp->cv_thread_exit,
3169 				&sp->s_lock, tick_delay + lbolt);
3170 			mutex_enter(&cpr_lock);
3171 			CALLB_CPR_SAFE_END(&cpr_info, &cpr_lock);
3172 			mutex_exit(&cpr_lock);
3173 
3174 			NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE,
3175 				"nfs4_renew_lease_thread: no renew: "
3176 				"time left %ld", time_left));
3177 
3178 			if (sp->s_thread_exit == NFS4_THREAD_EXIT)
3179 				goto die;
3180 			continue;
3181 		}
3182 
3183 		tmp_last_renewal_time = sp->last_renewal_time;
3184 
3185 		tmp_time = gethrestime_sec() - sp->last_renewal_time +
3186 			(3 * sp->propagation_delay.tv_sec);
3187 
3188 		NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE,
3189 			"nfs4_renew_lease_thread: tmp_time %ld, "
3190 			"sp->last_renewal_time %ld", tmp_time,
3191 			sp->last_renewal_time));
3192 
3193 		kip_secs = MAX((sp->s_lease_time >> 1) - tmp_time, 1);
3194 
3195 		tick_delay = SEC_TO_TICK(kip_secs);
3196 
3197 		NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE,
3198 			"nfs4_renew_lease_thread: valid lease: sleep for %ld "
3199 			"secs", kip_secs));
3200 
3201 		mutex_enter(&cpr_lock);
3202 		CALLB_CPR_SAFE_BEGIN(&cpr_info);
3203 		mutex_exit(&cpr_lock);
3204 		time_left = cv_timedwait(&sp->cv_thread_exit, &sp->s_lock,
3205 			tick_delay + lbolt);
3206 		mutex_enter(&cpr_lock);
3207 		CALLB_CPR_SAFE_END(&cpr_info, &cpr_lock);
3208 		mutex_exit(&cpr_lock);
3209 
3210 		NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE,
3211 			"nfs4_renew_lease_thread: valid lease: time left %ld :"
3212 			"sp last_renewal_time %ld, nfs4_client_resumed %ld, "
3213 			"tmp_last_renewal_time %ld", time_left,
3214 			sp->last_renewal_time, nfs4_client_resumed,
3215 			tmp_last_renewal_time));
3216 
3217 		if (sp->s_thread_exit == NFS4_THREAD_EXIT)
3218 			goto die;
3219 
3220 		if (tmp_last_renewal_time == sp->last_renewal_time ||
3221 			(nfs4_client_resumed != 0 &&
3222 			nfs4_client_resumed > sp->last_renewal_time)) {
3223 			/*
3224 			 * Issue RENEW op since we haven't renewed the lease
3225 			 * since we slept.
3226 			 */
3227 			tmp_now_time = gethrestime_sec();
3228 			error = nfs4renew(sp);
3229 			/*
3230 			 * Need to re-acquire sp's lock, nfs4renew()
3231 			 * relinqueshes it.
3232 			 */
3233 			mutex_enter(&sp->s_lock);
3234 
3235 			/*
3236 			 * See if someone changed s_thread_exit while we gave
3237 			 * up s_lock.
3238 			 */
3239 			if (sp->s_thread_exit == NFS4_THREAD_EXIT)
3240 				goto die;
3241 
3242 			if (!error) {
3243 				/*
3244 				 * check to see if we implicitly renewed while
3245 				 * we waited for a reply for our RENEW call.
3246 				 */
3247 				if (tmp_last_renewal_time ==
3248 					sp->last_renewal_time) {
3249 					/* no implicit renew came */
3250 					sp->last_renewal_time = tmp_now_time;
3251 				} else {
3252 					NFS4_DEBUG(nfs4_client_lease_debug,
3253 						(CE_NOTE, "renew_thread: did "
3254 						"implicit renewal before reply "
3255 						"from server for RENEW"));
3256 				}
3257 			} else {
3258 				/* figure out error */
3259 				NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE,
3260 					"renew_thread: nfs4renew returned error"
3261 					" %d", error));
3262 			}
3263 
3264 		}
3265 	}
3266 
3267 die:
3268 	NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE,
3269 		"nfs4_renew_lease_thread: thread exiting"));
3270 
3271 	while (sp->s_otw_call_count != 0) {
3272 		NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE,
3273 			"nfs4_renew_lease_thread: waiting for outstanding "
3274 			"otw calls to finish for sp 0x%p, current "
3275 			"s_otw_call_count %d", (void *)sp,
3276 			sp->s_otw_call_count));
3277 		mutex_enter(&cpr_lock);
3278 		CALLB_CPR_SAFE_BEGIN(&cpr_info);
3279 		mutex_exit(&cpr_lock);
3280 		cv_wait(&sp->s_cv_otw_count, &sp->s_lock);
3281 		mutex_enter(&cpr_lock);
3282 		CALLB_CPR_SAFE_END(&cpr_info, &cpr_lock);
3283 		mutex_exit(&cpr_lock);
3284 	}
3285 	mutex_exit(&sp->s_lock);
3286 
3287 	nfs4_server_rele(sp);		/* free the thread's reference */
3288 	nfs4_server_rele(sp);		/* free the list's reference */
3289 	sp = NULL;
3290 
3291 done:
3292 	mutex_enter(&cpr_lock);
3293 	CALLB_CPR_EXIT(&cpr_info);	/* drops cpr_lock */
3294 	mutex_destroy(&cpr_lock);
3295 
3296 	NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE,
3297 		"nfs4_renew_lease_thread: renew thread exit officially"));
3298 
3299 	zthread_exit();
3300 	/* NOT REACHED */
3301 }
3302 
3303 /*
3304  * Send out a RENEW op to the server.
3305  * Assumes sp is locked down.
3306  */
3307 static int
3308 nfs4renew(nfs4_server_t *sp)
3309 {
3310 	COMPOUND4args_clnt args;
3311 	COMPOUND4res_clnt res;
3312 	nfs_argop4 argop[1];
3313 	int doqueue = 1;
3314 	int rpc_error;
3315 	cred_t *cr;
3316 	mntinfo4_t *mi;
3317 	timespec_t prop_time, after_time;
3318 	int needrecov = FALSE;
3319 	nfs4_recov_state_t recov_state;
3320 	nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };
3321 
3322 	NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE, "nfs4renew"));
3323 
3324 	recov_state.rs_flags = 0;
3325 	recov_state.rs_num_retry_despite_err = 0;
3326 
3327 recov_retry:
3328 	mi = sp->mntinfo4_list;
3329 	VFS_HOLD(mi->mi_vfsp);
3330 	mutex_exit(&sp->s_lock);
3331 	ASSERT(mi != NULL);
3332 
3333 	e.error = nfs4_start_op(mi, NULL, NULL, &recov_state);
3334 	if (e.error) {
3335 		VFS_RELE(mi->mi_vfsp);
3336 		return (e.error);
3337 	}
3338 
3339 	/* Check to see if we're dealing with a marked-dead sp */
3340 	mutex_enter(&sp->s_lock);
3341 	if (sp->s_thread_exit == NFS4_THREAD_EXIT) {
3342 		mutex_exit(&sp->s_lock);
3343 		nfs4_end_op(mi, NULL, NULL, &recov_state, needrecov);
3344 		VFS_RELE(mi->mi_vfsp);
3345 		return (0);
3346 	}
3347 
3348 	/* Make sure mi hasn't changed on us */
3349 	if (mi != sp->mntinfo4_list) {
3350 		/* Must drop sp's lock to avoid a recursive mutex enter */
3351 		mutex_exit(&sp->s_lock);
3352 		nfs4_end_op(mi, NULL, NULL, &recov_state, needrecov);
3353 		VFS_RELE(mi->mi_vfsp);
3354 		mutex_enter(&sp->s_lock);
3355 		goto recov_retry;
3356 	}
3357 	mutex_exit(&sp->s_lock);
3358 
3359 	args.ctag = TAG_RENEW;
3360 
3361 	args.array_len = 1;
3362 	args.array = argop;
3363 
3364 	argop[0].argop = OP_RENEW;
3365 
3366 	mutex_enter(&sp->s_lock);
3367 	argop[0].nfs_argop4_u.oprenew.clientid = sp->clientid;
3368 	cr = sp->s_cred;
3369 	crhold(cr);
3370 	mutex_exit(&sp->s_lock);
3371 
3372 	ASSERT(cr != NULL);
3373 
3374 	/* used to figure out RTT for sp */
3375 	gethrestime(&prop_time);
3376 
3377 	NFS4_DEBUG(nfs4_client_call_debug, (CE_NOTE,
3378 	    "nfs4renew: %s call, sp 0x%p", needrecov ? "recov" : "first",
3379 	    (void*)sp));
3380 	NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE, "before: %ld s %ld ns ",
3381 		prop_time.tv_sec, prop_time.tv_nsec));
3382 
3383 	DTRACE_PROBE2(nfs4__renew__start, nfs4_server_t *, sp,
3384 			mntinfo4_t *, mi);
3385 
3386 	rfs4call(mi, &args, &res, cr, &doqueue, 0, &e);
3387 	crfree(cr);
3388 
3389 	DTRACE_PROBE2(nfs4__renew__end, nfs4_server_t *, sp,
3390 			mntinfo4_t *, mi);
3391 
3392 	gethrestime(&after_time);
3393 
3394 	mutex_enter(&sp->s_lock);
3395 	sp->propagation_delay.tv_sec =
3396 		MAX(1, after_time.tv_sec - prop_time.tv_sec);
3397 	mutex_exit(&sp->s_lock);
3398 
3399 	NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE, "after : %ld s %ld ns ",
3400 		after_time.tv_sec, after_time.tv_nsec));
3401 
3402 	if (e.error == 0 && res.status == NFS4ERR_CB_PATH_DOWN) {
3403 		(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
3404 		nfs4_delegreturn_all(sp);
3405 		nfs4_end_op(mi, NULL, NULL, &recov_state, needrecov);
3406 		VFS_RELE(mi->mi_vfsp);
3407 		/*
3408 		 * If the server returns CB_PATH_DOWN, it has renewed
3409 		 * the lease and informed us that the callback path is
3410 		 * down.  Since the lease is renewed, just return 0 and
3411 		 * let the renew thread proceed as normal.
3412 		 */
3413 		return (0);
3414 	}
3415 
3416 	needrecov = nfs4_needs_recovery(&e, FALSE, mi->mi_vfsp);
3417 	if (!needrecov && e.error) {
3418 		nfs4_end_op(mi, NULL, NULL, &recov_state, needrecov);
3419 		VFS_RELE(mi->mi_vfsp);
3420 		return (e.error);
3421 	}
3422 
3423 	rpc_error = e.error;
3424 
3425 	if (needrecov) {
3426 		NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
3427 		    "nfs4renew: initiating recovery\n"));
3428 
3429 		if (nfs4_start_recovery(&e, mi, NULL, NULL, NULL, NULL,
3430 		    OP_RENEW, NULL) == FALSE) {
3431 			nfs4_end_op(mi, NULL, NULL, &recov_state, needrecov);
3432 			VFS_RELE(mi->mi_vfsp);
3433 			if (!e.error)
3434 				(void) xdr_free(xdr_COMPOUND4res_clnt,
3435 								(caddr_t)&res);
3436 			mutex_enter(&sp->s_lock);
3437 			goto recov_retry;
3438 		}
3439 		/* fall through for res.status case */
3440 	}
3441 
3442 	if (res.status) {
3443 		if (res.status == NFS4ERR_LEASE_MOVED) {
3444 			/*EMPTY*/
3445 			/*
3446 			 * XXX need to try every mntinfo4 in sp->mntinfo4_list
3447 			 * to renew the lease on that server
3448 			 */
3449 		}
3450 		e.error = geterrno4(res.status);
3451 	}
3452 
3453 	if (!rpc_error)
3454 		(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
3455 
3456 	nfs4_end_op(mi, NULL, NULL, &recov_state, needrecov);
3457 
3458 	VFS_RELE(mi->mi_vfsp);
3459 
3460 	return (e.error);
3461 }
3462 
3463 void
3464 nfs4_inc_state_ref_count(mntinfo4_t *mi)
3465 {
3466 	nfs4_server_t	*sp;
3467 
3468 	/* this locks down sp if it is found */
3469 	sp = find_nfs4_server(mi);
3470 
3471 	if (sp != NULL) {
3472 		nfs4_inc_state_ref_count_nolock(sp, mi);
3473 		mutex_exit(&sp->s_lock);
3474 		nfs4_server_rele(sp);
3475 	}
3476 }
3477 
3478 /*
3479  * Bump the number of OPEN files (ie: those with state) so we know if this
3480  * nfs4_server has any state to maintain a lease for or not.
3481  *
3482  * Also, marks the nfs4_server's lease valid if it hasn't been done so already.
3483  */
3484 void
3485 nfs4_inc_state_ref_count_nolock(nfs4_server_t *sp, mntinfo4_t *mi)
3486 {
3487 	ASSERT(mutex_owned(&sp->s_lock));
3488 
3489 	sp->state_ref_count++;
3490 	NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE,
3491 		"nfs4_inc_state_ref_count: state_ref_count now %d",
3492 		sp->state_ref_count));
3493 
3494 	if (sp->lease_valid == NFS4_LEASE_UNINITIALIZED)
3495 		sp->lease_valid = NFS4_LEASE_VALID;
3496 
3497 	/*
3498 	 * If this call caused the lease to be marked valid and/or
3499 	 * took the state_ref_count from 0 to 1, then start the time
3500 	 * on lease renewal.
3501 	 */
3502 	if (sp->lease_valid == NFS4_LEASE_VALID && sp->state_ref_count == 1)
3503 		sp->last_renewal_time = gethrestime_sec();
3504 
3505 	/* update the number of open files for mi */
3506 	mi->mi_open_files++;
3507 }
3508 
3509 void
3510 nfs4_dec_state_ref_count(mntinfo4_t *mi)
3511 {
3512 	nfs4_server_t	*sp;
3513 
3514 	/* this locks down sp if it is found */
3515 	sp = find_nfs4_server_all(mi, 1);
3516 
3517 	if (sp != NULL) {
3518 		nfs4_dec_state_ref_count_nolock(sp, mi);
3519 		mutex_exit(&sp->s_lock);
3520 		nfs4_server_rele(sp);
3521 	}
3522 }
3523 
3524 /*
3525  * Decrement the number of OPEN files (ie: those with state) so we know if
3526  * this nfs4_server has any state to maintain a lease for or not.
3527  */
3528 void
3529 nfs4_dec_state_ref_count_nolock(nfs4_server_t *sp, mntinfo4_t *mi)
3530 {
3531 	ASSERT(mutex_owned(&sp->s_lock));
3532 	ASSERT(sp->state_ref_count != 0);
3533 	sp->state_ref_count--;
3534 
3535 	NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE,
3536 		"nfs4_dec_state_ref_count: state ref count now %d",
3537 		sp->state_ref_count));
3538 
3539 	mi->mi_open_files--;
3540 	NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE,
3541 		"nfs4_dec_state_ref_count: mi open files %d, v4 flags 0x%x",
3542 		mi->mi_open_files, mi->mi_flags));
3543 
3544 	/* We don't have to hold the mi_lock to test mi_flags */
3545 	if (mi->mi_open_files == 0 &&
3546 	    (mi->mi_flags & MI4_REMOVE_ON_LAST_CLOSE)) {
3547 		NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE,
3548 			"nfs4_dec_state_ref_count: remove mntinfo4 %p since "
3549 			"we have closed the last open file", (void*)mi));
3550 		nfs4_remove_mi_from_server(mi, sp);
3551 	}
3552 }
3553 
3554 bool_t
3555 inlease(nfs4_server_t *sp)
3556 {
3557 	bool_t result;
3558 
3559 	ASSERT(mutex_owned(&sp->s_lock));
3560 
3561 	if (sp->lease_valid == NFS4_LEASE_VALID &&
3562 	    gethrestime_sec() < sp->last_renewal_time + sp->s_lease_time)
3563 		result = TRUE;
3564 	else
3565 		result = FALSE;
3566 
3567 	return (result);
3568 }
3569 
3570 
3571 /*
3572  * Return non-zero if the given nfs4_server_t is going through recovery.
3573  */
3574 
3575 int
3576 nfs4_server_in_recovery(nfs4_server_t *sp)
3577 {
3578 	return (nfs_rw_lock_held(&sp->s_recovlock, RW_WRITER));
3579 }
3580 
3581 /*
3582  * Compare two shared filehandle objects.  Returns -1, 0, or +1, if the
3583  * first is less than, equal to, or greater than the second.
3584  */
3585 
3586 int
3587 sfh4cmp(const void *p1, const void *p2)
3588 {
3589 	const nfs4_sharedfh_t *sfh1 = (const nfs4_sharedfh_t *)p1;
3590 	const nfs4_sharedfh_t *sfh2 = (const nfs4_sharedfh_t *)p2;
3591 
3592 	return (nfs4cmpfh(&sfh1->sfh_fh, &sfh2->sfh_fh));
3593 }
3594 
3595 /*
3596  * Create a table for shared filehandle objects.
3597  */
3598 
3599 void
3600 sfh4_createtab(avl_tree_t *tab)
3601 {
3602 	avl_create(tab, sfh4cmp, sizeof (nfs4_sharedfh_t),
3603 		offsetof(nfs4_sharedfh_t, sfh_tree));
3604 }
3605 
3606 /*
3607  * Return a shared filehandle object for the given filehandle.  The caller
3608  * is responsible for eventually calling sfh4_rele().
3609  */
3610 
3611 nfs4_sharedfh_t *
3612 sfh4_put(const nfs_fh4 *fh, mntinfo4_t *mi, nfs4_sharedfh_t *key)
3613 {
3614 	nfs4_sharedfh_t *sfh, *nsfh;
3615 	avl_index_t where;
3616 	nfs4_sharedfh_t skey;
3617 
3618 	if (!key) {
3619 		skey.sfh_fh = *fh;
3620 		key = &skey;
3621 	}
3622 
3623 	nsfh = kmem_alloc(sizeof (nfs4_sharedfh_t), KM_SLEEP);
3624 	nsfh->sfh_fh.nfs_fh4_len = fh->nfs_fh4_len;
3625 	/*
3626 	 * We allocate the largest possible filehandle size because it's
3627 	 * not that big, and it saves us from possibly having to resize the
3628 	 * buffer later.
3629 	 */
3630 	nsfh->sfh_fh.nfs_fh4_val = kmem_alloc(NFS4_FHSIZE, KM_SLEEP);
3631 	bcopy(fh->nfs_fh4_val, nsfh->sfh_fh.nfs_fh4_val, fh->nfs_fh4_len);
3632 	mutex_init(&nsfh->sfh_lock, NULL, MUTEX_DEFAULT, NULL);
3633 	nsfh->sfh_refcnt = 1;
3634 	nsfh->sfh_flags = SFH4_IN_TREE;
3635 	nsfh->sfh_mi = mi;
3636 	NFS4_DEBUG(nfs4_sharedfh_debug, (CE_NOTE, "sfh4_get: new object (%p)",
3637 			(void *)nsfh));
3638 
3639 	(void) nfs_rw_enter_sig(&mi->mi_fh_lock, RW_WRITER, 0);
3640 	sfh = avl_find(&mi->mi_filehandles, key, &where);
3641 	if (sfh != NULL) {
3642 		mutex_enter(&sfh->sfh_lock);
3643 		sfh->sfh_refcnt++;
3644 		mutex_exit(&sfh->sfh_lock);
3645 		nfs_rw_exit(&mi->mi_fh_lock);
3646 		/* free our speculative allocs */
3647 		kmem_free(nsfh->sfh_fh.nfs_fh4_val, NFS4_FHSIZE);
3648 		kmem_free(nsfh, sizeof (nfs4_sharedfh_t));
3649 		return (sfh);
3650 	}
3651 
3652 	avl_insert(&mi->mi_filehandles, nsfh, where);
3653 	nfs_rw_exit(&mi->mi_fh_lock);
3654 
3655 	return (nsfh);
3656 }
3657 
3658 /*
3659  * Return a shared filehandle object for the given filehandle.  The caller
3660  * is responsible for eventually calling sfh4_rele().
3661  */
3662 
3663 nfs4_sharedfh_t *
3664 sfh4_get(const nfs_fh4 *fh, mntinfo4_t *mi)
3665 {
3666 	nfs4_sharedfh_t *sfh;
3667 	nfs4_sharedfh_t key;
3668 
3669 	ASSERT(fh->nfs_fh4_len <= NFS4_FHSIZE);
3670 
3671 #ifdef DEBUG
3672 	if (nfs4_sharedfh_debug) {
3673 		nfs4_fhandle_t fhandle;
3674 
3675 		fhandle.fh_len = fh->nfs_fh4_len;
3676 		bcopy(fh->nfs_fh4_val, fhandle.fh_buf, fhandle.fh_len);
3677 		zcmn_err(mi->mi_zone->zone_id, CE_NOTE, "sfh4_get:");
3678 		nfs4_printfhandle(&fhandle);
3679 	}
3680 #endif
3681 
3682 	/*
3683 	 * If there's already an object for the given filehandle, bump the
3684 	 * reference count and return it.  Otherwise, create a new object
3685 	 * and add it to the AVL tree.
3686 	 */
3687 
3688 	key.sfh_fh = *fh;
3689 
3690 	(void) nfs_rw_enter_sig(&mi->mi_fh_lock, RW_READER, 0);
3691 	sfh = avl_find(&mi->mi_filehandles, &key, NULL);
3692 	if (sfh != NULL) {
3693 		mutex_enter(&sfh->sfh_lock);
3694 		sfh->sfh_refcnt++;
3695 		NFS4_DEBUG(nfs4_sharedfh_debug, (CE_NOTE,
3696 			"sfh4_get: found existing %p, new refcnt=%d",
3697 			(void *)sfh, sfh->sfh_refcnt));
3698 		mutex_exit(&sfh->sfh_lock);
3699 		nfs_rw_exit(&mi->mi_fh_lock);
3700 		return (sfh);
3701 	}
3702 	nfs_rw_exit(&mi->mi_fh_lock);
3703 
3704 	return (sfh4_put(fh, mi, &key));
3705 }
3706 
3707 /*
3708  * Get a reference to the given shared filehandle object.
3709  */
3710 
3711 void
3712 sfh4_hold(nfs4_sharedfh_t *sfh)
3713 {
3714 	ASSERT(sfh->sfh_refcnt > 0);
3715 
3716 	mutex_enter(&sfh->sfh_lock);
3717 	sfh->sfh_refcnt++;
3718 	NFS4_DEBUG(nfs4_sharedfh_debug,
3719 		(CE_NOTE, "sfh4_hold %p, new refcnt=%d",
3720 		(void *)sfh, sfh->sfh_refcnt));
3721 	mutex_exit(&sfh->sfh_lock);
3722 }
3723 
3724 /*
3725  * Release a reference to the given shared filehandle object and null out
3726  * the given pointer.
3727  */
3728 
3729 void
3730 sfh4_rele(nfs4_sharedfh_t **sfhpp)
3731 {
3732 	mntinfo4_t *mi;
3733 	nfs4_sharedfh_t *sfh = *sfhpp;
3734 
3735 	ASSERT(sfh->sfh_refcnt > 0);
3736 
3737 	mutex_enter(&sfh->sfh_lock);
3738 	if (sfh->sfh_refcnt > 1) {
3739 		sfh->sfh_refcnt--;
3740 		NFS4_DEBUG(nfs4_sharedfh_debug, (CE_NOTE,
3741 		    "sfh4_rele %p, new refcnt=%d",
3742 		    (void *)sfh, sfh->sfh_refcnt));
3743 		mutex_exit(&sfh->sfh_lock);
3744 		goto finish;
3745 	}
3746 	mutex_exit(&sfh->sfh_lock);
3747 
3748 	/*
3749 	 * Possibly the last reference, so get the lock for the table in
3750 	 * case it's time to remove the object from the table.
3751 	 */
3752 	mi = sfh->sfh_mi;
3753 	(void) nfs_rw_enter_sig(&mi->mi_fh_lock, RW_WRITER, 0);
3754 	mutex_enter(&sfh->sfh_lock);
3755 	sfh->sfh_refcnt--;
3756 	if (sfh->sfh_refcnt > 0) {
3757 		NFS4_DEBUG(nfs4_sharedfh_debug, (CE_NOTE,
3758 		    "sfh4_rele %p, new refcnt=%d",
3759 		    (void *)sfh, sfh->sfh_refcnt));
3760 		mutex_exit(&sfh->sfh_lock);
3761 		nfs_rw_exit(&mi->mi_fh_lock);
3762 		goto finish;
3763 	}
3764 
3765 	NFS4_DEBUG(nfs4_sharedfh_debug, (CE_NOTE,
3766 		"sfh4_rele %p, last ref", (void *)sfh));
3767 	if (sfh->sfh_flags & SFH4_IN_TREE) {
3768 		avl_remove(&mi->mi_filehandles, sfh);
3769 		sfh->sfh_flags &= ~SFH4_IN_TREE;
3770 	}
3771 	mutex_exit(&sfh->sfh_lock);
3772 	nfs_rw_exit(&mi->mi_fh_lock);
3773 	mutex_destroy(&sfh->sfh_lock);
3774 	kmem_free(sfh->sfh_fh.nfs_fh4_val, NFS4_FHSIZE);
3775 	kmem_free(sfh, sizeof (nfs4_sharedfh_t));
3776 
3777 finish:
3778 	*sfhpp = NULL;
3779 }
3780 
3781 /*
3782  * Update the filehandle for the given shared filehandle object.
3783  */
3784 
3785 int nfs4_warn_dupfh = 0;	/* if set, always warn about dup fhs below */
3786 
3787 void
3788 sfh4_update(nfs4_sharedfh_t *sfh, const nfs_fh4 *newfh)
3789 {
3790 	mntinfo4_t *mi = sfh->sfh_mi;
3791 	nfs4_sharedfh_t *dupsfh;
3792 	avl_index_t where;
3793 	nfs4_sharedfh_t key;
3794 
3795 #ifdef DEBUG
3796 	mutex_enter(&sfh->sfh_lock);
3797 	ASSERT(sfh->sfh_refcnt > 0);
3798 	mutex_exit(&sfh->sfh_lock);
3799 #endif
3800 	ASSERT(newfh->nfs_fh4_len <= NFS4_FHSIZE);
3801 
3802 	/*
3803 	 * The basic plan is to remove the shared filehandle object from
3804 	 * the table, update it to have the new filehandle, then reinsert
3805 	 * it.
3806 	 */
3807 
3808 	(void) nfs_rw_enter_sig(&mi->mi_fh_lock, RW_WRITER, 0);
3809 	mutex_enter(&sfh->sfh_lock);
3810 	if (sfh->sfh_flags & SFH4_IN_TREE) {
3811 		avl_remove(&mi->mi_filehandles, sfh);
3812 		sfh->sfh_flags &= ~SFH4_IN_TREE;
3813 	}
3814 	mutex_exit(&sfh->sfh_lock);
3815 	sfh->sfh_fh.nfs_fh4_len = newfh->nfs_fh4_len;
3816 	bcopy(newfh->nfs_fh4_val, sfh->sfh_fh.nfs_fh4_val,
3817 	    sfh->sfh_fh.nfs_fh4_len);
3818 
3819 	/*
3820 	 * XXX If there is already a shared filehandle object with the new
3821 	 * filehandle, we're in trouble, because the rnode code assumes
3822 	 * that there is only one shared filehandle object for a given
3823 	 * filehandle.  So issue a warning (for read-write mounts only)
3824 	 * and don't try to re-insert the given object into the table.
3825 	 * Hopefully the given object will quickly go away and everyone
3826 	 * will use the new object.
3827 	 */
3828 	key.sfh_fh = *newfh;
3829 	dupsfh = avl_find(&mi->mi_filehandles, &key, &where);
3830 	if (dupsfh != NULL) {
3831 		if (!(mi->mi_vfsp->vfs_flag & VFS_RDONLY) || nfs4_warn_dupfh) {
3832 			zcmn_err(mi->mi_zone->zone_id, CE_WARN, "sfh4_update: "
3833 			    "duplicate filehandle detected");
3834 			sfh4_printfhandle(dupsfh);
3835 		}
3836 	} else {
3837 		avl_insert(&mi->mi_filehandles, sfh, where);
3838 		mutex_enter(&sfh->sfh_lock);
3839 		sfh->sfh_flags |= SFH4_IN_TREE;
3840 		mutex_exit(&sfh->sfh_lock);
3841 	}
3842 	nfs_rw_exit(&mi->mi_fh_lock);
3843 }
3844 
3845 /*
3846  * Copy out the current filehandle for the given shared filehandle object.
3847  */
3848 
3849 void
3850 sfh4_copyval(const nfs4_sharedfh_t *sfh, nfs4_fhandle_t *fhp)
3851 {
3852 	mntinfo4_t *mi = sfh->sfh_mi;
3853 
3854 	ASSERT(sfh->sfh_refcnt > 0);
3855 
3856 	(void) nfs_rw_enter_sig(&mi->mi_fh_lock, RW_READER, 0);
3857 	fhp->fh_len = sfh->sfh_fh.nfs_fh4_len;
3858 	ASSERT(fhp->fh_len <= NFS4_FHSIZE);
3859 	bcopy(sfh->sfh_fh.nfs_fh4_val, fhp->fh_buf, fhp->fh_len);
3860 	nfs_rw_exit(&mi->mi_fh_lock);
3861 }
3862 
3863 /*
3864  * Print out the filehandle for the given shared filehandle object.
3865  */
3866 
3867 void
3868 sfh4_printfhandle(const nfs4_sharedfh_t *sfh)
3869 {
3870 	nfs4_fhandle_t fhandle;
3871 
3872 	sfh4_copyval(sfh, &fhandle);
3873 	nfs4_printfhandle(&fhandle);
3874 }
3875 
3876 /*
3877  * Compare 2 fnames.  Returns -1 if the first is "less" than the second, 0
3878  * if they're the same, +1 if the first is "greater" than the second.  The
3879  * caller (or whoever's calling the AVL package) is responsible for
3880  * handling locking issues.
3881  */
3882 
3883 static int
3884 fncmp(const void *p1, const void *p2)
3885 {
3886 	const nfs4_fname_t *f1 = p1;
3887 	const nfs4_fname_t *f2 = p2;
3888 	int res;
3889 
3890 	res = strcmp(f1->fn_name, f2->fn_name);
3891 	/*
3892 	 * The AVL package wants +/-1, not arbitrary positive or negative
3893 	 * integers.
3894 	 */
3895 	if (res > 0)
3896 		res = 1;
3897 	else if (res < 0)
3898 		res = -1;
3899 	return (res);
3900 }
3901 
3902 /*
3903  * Get or create an fname with the given name, as a child of the given
3904  * fname.  The caller is responsible for eventually releasing the reference
3905  * (fn_rele()).  parent may be NULL.
3906  */
3907 
3908 nfs4_fname_t *
3909 fn_get(nfs4_fname_t *parent, char *name)
3910 {
3911 	nfs4_fname_t key;
3912 	nfs4_fname_t *fnp;
3913 	avl_index_t where;
3914 
3915 	key.fn_name = name;
3916 
3917 	/*
3918 	 * If there's already an fname registered with the given name, bump
3919 	 * its reference count and return it.  Otherwise, create a new one
3920 	 * and add it to the parent's AVL tree.
3921 	 */
3922 
3923 	if (parent != NULL) {
3924 		mutex_enter(&parent->fn_lock);
3925 		fnp = avl_find(&parent->fn_children, &key, &where);
3926 		if (fnp != NULL) {
3927 			fn_hold(fnp);
3928 			mutex_exit(&parent->fn_lock);
3929 			return (fnp);
3930 		}
3931 	}
3932 
3933 	fnp = kmem_alloc(sizeof (nfs4_fname_t), KM_SLEEP);
3934 	mutex_init(&fnp->fn_lock, NULL, MUTEX_DEFAULT, NULL);
3935 	fnp->fn_parent = parent;
3936 	if (parent != NULL)
3937 		fn_hold(parent);
3938 	fnp->fn_len = strlen(name);
3939 	ASSERT(fnp->fn_len < MAXNAMELEN);
3940 	fnp->fn_name = kmem_alloc(fnp->fn_len + 1, KM_SLEEP);
3941 	(void) strcpy(fnp->fn_name, name);
3942 	fnp->fn_refcnt = 1;
3943 	avl_create(&fnp->fn_children, fncmp, sizeof (nfs4_fname_t),
3944 	    offsetof(nfs4_fname_t, fn_tree));
3945 	NFS4_DEBUG(nfs4_fname_debug, (CE_NOTE,
3946 		"fn_get %p:%s, a new nfs4_fname_t!",
3947 		(void *)fnp, fnp->fn_name));
3948 	if (parent != NULL) {
3949 		avl_insert(&parent->fn_children, fnp, where);
3950 		mutex_exit(&parent->fn_lock);
3951 	}
3952 
3953 	return (fnp);
3954 }
3955 
3956 void
3957 fn_hold(nfs4_fname_t *fnp)
3958 {
3959 	atomic_add_32(&fnp->fn_refcnt, 1);
3960 	NFS4_DEBUG(nfs4_fname_debug, (CE_NOTE,
3961 		"fn_hold %p:%s, new refcnt=%d",
3962 		(void *)fnp, fnp->fn_name, fnp->fn_refcnt));
3963 }
3964 
3965 /*
3966  * Decrement the reference count of the given fname, and destroy it if its
3967  * reference count goes to zero.  Nulls out the given pointer.
3968  */
3969 
3970 void
3971 fn_rele(nfs4_fname_t **fnpp)
3972 {
3973 	nfs4_fname_t *parent;
3974 	uint32_t newref;
3975 	nfs4_fname_t *fnp;
3976 
3977 recur:
3978 	fnp = *fnpp;
3979 	*fnpp = NULL;
3980 
3981 	mutex_enter(&fnp->fn_lock);
3982 	parent = fnp->fn_parent;
3983 	if (parent != NULL)
3984 		mutex_enter(&parent->fn_lock);	/* prevent new references */
3985 	newref = atomic_add_32_nv(&fnp->fn_refcnt, -1);
3986 	if (newref > 0) {
3987 		NFS4_DEBUG(nfs4_fname_debug, (CE_NOTE,
3988 			"fn_rele %p:%s, new refcnt=%d",
3989 			(void *)fnp, fnp->fn_name, fnp->fn_refcnt));
3990 		if (parent != NULL)
3991 			mutex_exit(&parent->fn_lock);
3992 		mutex_exit(&fnp->fn_lock);
3993 		return;
3994 	}
3995 
3996 	NFS4_DEBUG(nfs4_fname_debug, (CE_NOTE,
3997 		"fn_rele %p:%s, last reference, deleting...",
3998 		(void *)fnp, fnp->fn_name));
3999 	if (parent != NULL) {
4000 		avl_remove(&parent->fn_children, fnp);
4001 		mutex_exit(&parent->fn_lock);
4002 	}
4003 	kmem_free(fnp->fn_name, fnp->fn_len + 1);
4004 	mutex_destroy(&fnp->fn_lock);
4005 	avl_destroy(&fnp->fn_children);
4006 	kmem_free(fnp, sizeof (nfs4_fname_t));
4007 	/*
4008 	 * Recursivly fn_rele the parent.
4009 	 * Use goto instead of a recursive call to avoid stack overflow.
4010 	 */
4011 	if (parent != NULL) {
4012 		fnpp = &parent;
4013 		goto recur;
4014 	}
4015 }
4016 
4017 /*
4018  * Returns the single component name of the given fname, in a MAXNAMELEN
4019  * string buffer, which the caller is responsible for freeing.  Note that
4020  * the name may become invalid as a result of fn_move().
4021  */
4022 
4023 char *
4024 fn_name(nfs4_fname_t *fnp)
4025 {
4026 	char *name;
4027 
4028 	ASSERT(fnp->fn_len < MAXNAMELEN);
4029 	name = kmem_alloc(MAXNAMELEN, KM_SLEEP);
4030 	mutex_enter(&fnp->fn_lock);
4031 	(void) strcpy(name, fnp->fn_name);
4032 	mutex_exit(&fnp->fn_lock);
4033 
4034 	return (name);
4035 }
4036 
4037 
4038 /*
4039  * fn_path_realloc
4040  *
4041  * This function, used only by fn_path, constructs
4042  * a new string which looks like "prepend" + "/" + "current".
4043  * by allocating a new string and freeing the old one.
4044  */
4045 static void
4046 fn_path_realloc(char **curses, char *prepend)
4047 {
4048 	int len, curlen = 0;
4049 	char *news;
4050 
4051 	if (*curses == NULL) {
4052 		/*
4053 		 * Prime the pump, allocate just the
4054 		 * space for prepend and return that.
4055 		 */
4056 		len = strlen(prepend) + 1;
4057 		news = kmem_alloc(len, KM_SLEEP);
4058 		(void) strncpy(news, prepend, len);
4059 	} else {
4060 		/*
4061 		 * Allocate the space  for a new string
4062 		 * +1 +1 is for the "/" and the NULL
4063 		 * byte at the end of it all.
4064 		 */
4065 		curlen = strlen(*curses);
4066 		len = curlen + strlen(prepend) + 1 + 1;
4067 		news = kmem_alloc(len, KM_SLEEP);
4068 		(void) strncpy(news, prepend, len);
4069 		(void) strcat(news, "/");
4070 		(void) strcat(news, *curses);
4071 		kmem_free(*curses, curlen + 1);
4072 	}
4073 	*curses = news;
4074 }
4075 
4076 /*
4077  * Returns the path name (starting from the fs root) for the given fname.
4078  * The caller is responsible for freeing.  Note that the path may be or
4079  * become invalid as a result of fn_move().
4080  */
4081 
4082 char *
4083 fn_path(nfs4_fname_t *fnp)
4084 {
4085 	char *path;
4086 	nfs4_fname_t *nextfnp;
4087 
4088 	if (fnp == NULL)
4089 		return (NULL);
4090 
4091 	path = NULL;
4092 
4093 	/* walk up the tree constructing the pathname.  */
4094 
4095 	fn_hold(fnp);			/* adjust for later rele */
4096 	do {
4097 		mutex_enter(&fnp->fn_lock);
4098 		/*
4099 		 * Add fn_name in front of the current path
4100 		 */
4101 		fn_path_realloc(&path, fnp->fn_name);
4102 		nextfnp = fnp->fn_parent;
4103 		if (nextfnp != NULL)
4104 			fn_hold(nextfnp);
4105 		mutex_exit(&fnp->fn_lock);
4106 		fn_rele(&fnp);
4107 		fnp = nextfnp;
4108 	} while (fnp != NULL);
4109 
4110 	return (path);
4111 }
4112 
4113 /*
4114  * Return a reference to the parent of the given fname, which the caller is
4115  * responsible for eventually releasing.
4116  */
4117 
4118 nfs4_fname_t *
4119 fn_parent(nfs4_fname_t *fnp)
4120 {
4121 	nfs4_fname_t *parent;
4122 
4123 	mutex_enter(&fnp->fn_lock);
4124 	parent = fnp->fn_parent;
4125 	if (parent != NULL)
4126 		fn_hold(parent);
4127 	mutex_exit(&fnp->fn_lock);
4128 
4129 	return (parent);
4130 }
4131 
4132 /*
4133  * Update fnp so that its parent is newparent and its name is newname.
4134  */
4135 
4136 void
4137 fn_move(nfs4_fname_t *fnp, nfs4_fname_t *newparent, char *newname)
4138 {
4139 	nfs4_fname_t *parent, *tmpfnp;
4140 	ssize_t newlen;
4141 	nfs4_fname_t key;
4142 	avl_index_t where;
4143 
4144 	/*
4145 	 * This assert exists to catch the client trying to rename
4146 	 * a dir to be a child of itself.  This happened at a recent
4147 	 * bakeoff against a 3rd party (broken) server which allowed
4148 	 * the rename to succeed.  If it trips it means that:
4149 	 *	a) the code in nfs4rename that detects this case is broken
4150 	 *	b) the server is broken (since it allowed the bogus rename)
4151 	 *
4152 	 * For non-DEBUG kernels, prepare for a recursive mutex_enter
4153 	 * panic below from:  mutex_enter(&newparent->fn_lock);
4154 	 */
4155 	ASSERT(fnp != newparent);
4156 
4157 	/*
4158 	 * Remove fnp from its current parent, change its name, then add it
4159 	 * to newparent.
4160 	 */
4161 	mutex_enter(&fnp->fn_lock);
4162 	parent = fnp->fn_parent;
4163 	mutex_enter(&parent->fn_lock);
4164 	avl_remove(&parent->fn_children, fnp);
4165 	mutex_exit(&parent->fn_lock);
4166 	fn_rele(&fnp->fn_parent);
4167 
4168 	newlen = strlen(newname);
4169 	if (newlen != fnp->fn_len) {
4170 		ASSERT(newlen < MAXNAMELEN);
4171 		kmem_free(fnp->fn_name, fnp->fn_len + 1);
4172 		fnp->fn_name = kmem_alloc(newlen + 1, KM_SLEEP);
4173 		fnp->fn_len = newlen;
4174 	}
4175 	(void) strcpy(fnp->fn_name, newname);
4176 
4177 again:
4178 	mutex_enter(&newparent->fn_lock);
4179 	key.fn_name = fnp->fn_name;
4180 	tmpfnp = avl_find(&newparent->fn_children, &key, &where);
4181 	if (tmpfnp != NULL) {
4182 		/*
4183 		 * This could be due to a file that was unlinked while
4184 		 * open, or perhaps the rnode is in the free list.  Remove
4185 		 * it from newparent and let it go away on its own.  The
4186 		 * contorted code is to deal with lock order issues and
4187 		 * race conditions.
4188 		 */
4189 		fn_hold(tmpfnp);
4190 		mutex_exit(&newparent->fn_lock);
4191 		mutex_enter(&tmpfnp->fn_lock);
4192 		if (tmpfnp->fn_parent == newparent) {
4193 			mutex_enter(&newparent->fn_lock);
4194 			avl_remove(&newparent->fn_children, tmpfnp);
4195 			mutex_exit(&newparent->fn_lock);
4196 			fn_rele(&tmpfnp->fn_parent);
4197 		}
4198 		mutex_exit(&tmpfnp->fn_lock);
4199 		fn_rele(&tmpfnp);
4200 		goto again;
4201 	}
4202 	fnp->fn_parent = newparent;
4203 	fn_hold(newparent);
4204 	avl_insert(&newparent->fn_children, fnp, where);
4205 	mutex_exit(&newparent->fn_lock);
4206 	mutex_exit(&fnp->fn_lock);
4207 }
4208 
4209 #ifdef DEBUG
4210 /*
4211  * Return non-zero if the type information makes sense for the given vnode.
4212  * Otherwise panic.
4213  */
4214 int
4215 nfs4_consistent_type(vnode_t *vp)
4216 {
4217 	rnode4_t *rp = VTOR4(vp);
4218 
4219 	if (nfs4_vtype_debug && vp->v_type != VNON &&
4220 	    rp->r_attr.va_type != VNON && vp->v_type != rp->r_attr.va_type) {
4221 		cmn_err(CE_PANIC, "vnode %p type mismatch; v_type=%d, "
4222 			"rnode attr type=%d", (void *)vp, vp->v_type,
4223 			rp->r_attr.va_type);
4224 	}
4225 
4226 	return (1);
4227 }
4228 #endif /* DEBUG */
4229