xref: /titanic_41/usr/src/uts/common/fs/nfs/nfs4_client.c (revision 582271e8d649568c83e9a016cc0d54265389c5d9)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 /*
27  *  	Copyright (c) 1983,1984,1985,1986,1987,1988,1989  AT&T.
28  *	All Rights Reserved
29  */
30 
31 #pragma ident	"%Z%%M%	%I%	%E% SMI"
32 
33 #include <sys/param.h>
34 #include <sys/types.h>
35 #include <sys/systm.h>
36 #include <sys/thread.h>
37 #include <sys/t_lock.h>
38 #include <sys/time.h>
39 #include <sys/vnode.h>
40 #include <sys/vfs.h>
41 #include <sys/errno.h>
42 #include <sys/buf.h>
43 #include <sys/stat.h>
44 #include <sys/cred.h>
45 #include <sys/kmem.h>
46 #include <sys/debug.h>
47 #include <sys/dnlc.h>
48 #include <sys/vmsystm.h>
49 #include <sys/flock.h>
50 #include <sys/share.h>
51 #include <sys/cmn_err.h>
52 #include <sys/tiuser.h>
53 #include <sys/sysmacros.h>
54 #include <sys/callb.h>
55 #include <sys/acl.h>
56 #include <sys/kstat.h>
57 #include <sys/signal.h>
58 #include <sys/disp.h>
59 #include <sys/atomic.h>
60 #include <sys/list.h>
61 #include <sys/sdt.h>
62 
63 #include <rpc/types.h>
64 #include <rpc/xdr.h>
65 #include <rpc/auth.h>
66 #include <rpc/clnt.h>
67 
68 #include <nfs/nfs.h>
69 #include <nfs/nfs_clnt.h>
70 #include <nfs/nfs_acl.h>
71 
72 #include <nfs/nfs4.h>
73 #include <nfs/rnode4.h>
74 #include <nfs/nfs4_clnt.h>
75 
76 #include <vm/hat.h>
77 #include <vm/as.h>
78 #include <vm/page.h>
79 #include <vm/pvn.h>
80 #include <vm/seg.h>
81 #include <vm/seg_map.h>
82 #include <vm/seg_vn.h>
83 
84 #include <sys/ddi.h>
85 
86 /*
87  * Arguments to page-flush thread.
88  */
89 typedef struct {
90 	vnode_t *vp;	/* file to flush; held by the creator, released by the thread */
91 	cred_t *cr;	/* creds for the flush; held by the creator, freed by the thread */
92 } pgflush_t;
93 
94 #ifdef DEBUG
95 int nfs4_client_lease_debug;	/* NOTE(review): debug knob; consumer not in this part of the file */
96 int nfs4_sharedfh_debug;	/* NOTE(review): debug knob; consumer not in this part of the file */
97 int nfs4_fname_debug;		/* NOTE(review): debug knob; consumer not in this part of the file */
98 
99 /* temporary: panic if v_type is inconsistent with r_attr va_type */
100 int nfs4_vtype_debug;
101 
102 uint_t nfs4_tsd_key;	/* TSD key; nfs4_purge_stale_fh() asserts its value is NULL */
103 #endif
104 
105 static time_t	nfs4_client_resumed = 0;	/* NOTE(review): presumably set on CPR resume; not used in this part of the file */
106 static	callb_id_t cid = 0;	/* NOTE(review): presumably the id of the registered CPR callback -- confirm */
107 
108 static int	nfs4renew(nfs4_server_t *);
109 static void	nfs4_attrcache_va(vnode_t *, nfs4_ga_res_t *, int);	/* r_statelock must be held */
110 static void	nfs4_pgflush_thread(pgflush_t *);	/* thread body; frees its pgflush_t */
111 static void	flush_pages(vnode_t *, cred_t *);	/* write back and invalidate all pages */
112 
113 static boolean_t nfs4_client_cpr_callb(void *, int);
114 
115 struct mi4_globals {	/* bookkeeping for the NFS v4 mounts of one zone */
116 	kmutex_t	mig_lock;  /* lock protecting mig_list */
117 	list_t		mig_list;  /* list of NFS v4 mounts in zone */
118 	boolean_t	mig_destructor_called;	/* NOTE(review): presumably set once zone teardown has run -- confirm */
119 };
120 
121 static zone_key_t mi4_list_key;	/* NOTE(review): presumably the zone key for the per-zone mi4_globals -- confirm */
122 
123 /*
124  * Attributes caching:
125  *
126  * Attributes are cached in the rnode in struct vattr form.
127  * There is a time associated with the cached attributes (r_time_attr_inval)
128  * which tells whether the attributes are valid. When new attributes are
129  * cached, the time is set to the current time plus the time elapsed since
130  * a change to the file was last detected. This allows the attributes for
131  * files that have changed recently to be timed out sooner than for files
132  * that have not changed for a long time. There are minimum and maximum
133  * timeout values that can be set per mount point.
134  */
135 
136 /*
137  * If a cache purge is in progress, wait for it to finish.
138  *
139  * The current thread must not be in the middle of an
140  * nfs4_start_op/nfs4_end_op region.  Otherwise, there could be a deadlock
141  * between this thread, a recovery thread, and the page flush thread.
142  */
143 int
144 nfs4_waitfor_purge_complete(vnode_t *vp)
145 {
146 	rnode4_t *rp;
147 	k_sigset_t smask;
148 
149 	rp = VTOR4(vp);
150 	if ((rp->r_serial != NULL && rp->r_serial != curthread) ||
151 	    ((rp->r_flags & R4PGFLUSH) && rp->r_pgflush != curthread)) {
152 		mutex_enter(&rp->r_statelock);
153 		sigintr(&smask, VTOMI4(vp)->mi_flags & MI4_INT);
154 		while ((rp->r_serial != NULL && rp->r_serial != curthread) ||
155 		    ((rp->r_flags & R4PGFLUSH) &&
156 		    rp->r_pgflush != curthread)) {
157 			if (!cv_wait_sig(&rp->r_cv, &rp->r_statelock)) {
158 				sigunintr(&smask);
159 				mutex_exit(&rp->r_statelock);
160 				return (EINTR);
161 			}
162 		}
163 		sigunintr(&smask);
164 		mutex_exit(&rp->r_statelock);
165 	}
166 	return (0);
167 }
168 
169 /*
170  * Validate caches by checking cached attributes. If they have timed out,
171  * then get new attributes from the server.  As a side effect, cache
172  * invalidation is done if the attributes have changed.
173  *
174  * If the attributes have not timed out and if there is a cache
175  * invalidation being done by some other thread, then wait until that
176  * thread has completed the cache invalidation.
177  */
178 int
179 nfs4_validate_caches(vnode_t *vp, cred_t *cr)
180 {
181 	int error;
182 	nfs4_ga_res_t gar;
183 
184 	if (ATTRCACHE4_VALID(vp)) {
185 		error = nfs4_waitfor_purge_complete(vp);
186 		if (error)
187 			return (error);
188 		return (0);
189 	}
190 
191 	gar.n4g_va.va_mask = AT_ALL;
192 	return (nfs4_getattr_otw(vp, &gar, cr, 0));
193 }
194 
195 /*
196  * Fill in attribute from the cache.
197  * If valid, then return 0 to indicate that no error occurred,
198  * otherwise return 1 to indicate that an error occurred.
199  */
200 static int
201 nfs4_getattr_cache(vnode_t *vp, struct vattr *vap)
202 {
203 	rnode4_t *rp;
204 
205 	rp = VTOR4(vp);
206 	mutex_enter(&rp->r_statelock);
207 	mutex_enter(&rp->r_statev4_lock);
208 	if (ATTRCACHE4_VALID(vp)) {
209 		mutex_exit(&rp->r_statev4_lock);
210 		/*
211 		 * Cached attributes are valid
212 		 */
213 		*vap = rp->r_attr;
214 		mutex_exit(&rp->r_statelock);
215 		return (0);
216 	}
217 	mutex_exit(&rp->r_statev4_lock);
218 	mutex_exit(&rp->r_statelock);
219 	return (1);
220 }
221 
222 
223 /*
224  * If returned error is ESTALE flush all caches.  The nfs4_purge_caches()
225  * call is synchronous because all the pages were invalidated by the
226  * nfs4_invalidate_pages() call.
227  */
228 void
229 nfs4_purge_stale_fh(int errno, vnode_t *vp, cred_t *cr)
230 {
231 	struct rnode4 *rp = VTOR4(vp);
232 
233 	/* Ensure that the ..._end_op() call has been done */
234 	ASSERT(tsd_get(nfs4_tsd_key) == NULL);
235 
236 	if (errno != ESTALE)
237 		return;
238 
239 	mutex_enter(&rp->r_statelock);
240 	rp->r_flags |= R4STALE;
241 	if (!rp->r_error)
242 		rp->r_error = errno;
243 	mutex_exit(&rp->r_statelock);
244 	if (nfs4_has_pages(vp))
245 		nfs4_invalidate_pages(vp, (u_offset_t)0, cr);
246 	nfs4_purge_caches(vp, NFS4_PURGE_DNLC, cr, FALSE);
247 }
248 
249 /*
250  * Purge all of the various NFS `data' caches.  If "asyncpg" is TRUE, the
251  * page purge is done asynchronously.
252  */
253 void
254 nfs4_purge_caches(vnode_t *vp, int purge_dnlc, cred_t *cr, int asyncpg)
255 {
256 	rnode4_t *rp;
257 	char *contents;
258 	vnode_t *xattr;
259 	int size;
260 	int pgflush;			/* are we the page flush thread? */
261 
262 	/*
263 	 * Purge the DNLC for any entries which refer to this file.
264 	 */
265 	if (vp->v_count > 1 &&
266 	    (vp->v_type == VDIR || purge_dnlc == NFS4_PURGE_DNLC))
267 		dnlc_purge_vp(vp);
268 
269 	/*
270 	 * Clear any readdir state bits and purge the readlink response cache.
271 	 */
272 	rp = VTOR4(vp);
273 	mutex_enter(&rp->r_statelock);
274 	rp->r_flags &= ~R4LOOKUP;
275 	contents = rp->r_symlink.contents;
276 	size = rp->r_symlink.size;
277 	rp->r_symlink.contents = NULL;
278 
279 	xattr = rp->r_xattr_dir;
280 	rp->r_xattr_dir = NULL;
281 
282 	/*
283 	 * Purge pathconf cache too.
284 	 */
285 	rp->r_pathconf.pc4_xattr_valid = 0;
286 	rp->r_pathconf.pc4_cache_valid = 0;
287 
288 	pgflush = (curthread == rp->r_pgflush);
289 	mutex_exit(&rp->r_statelock);
290 
291 	if (contents != NULL) {
292 
293 		kmem_free((void *)contents, size);
294 	}
295 
296 	if (xattr != NULL)
297 		VN_RELE(xattr);
298 
299 	/*
300 	 * Flush the page cache.  If the current thread is the page flush
301 	 * thread, don't initiate a new page flush.  There's no need for
302 	 * it, and doing it correctly is hard.
303 	 */
304 	if (nfs4_has_pages(vp) && !pgflush) {
305 		if (!asyncpg) {
306 			(void) nfs4_waitfor_purge_complete(vp);
307 			flush_pages(vp, cr);
308 		} else {
309 			pgflush_t *args;
310 
311 			/*
312 			 * We don't hold r_statelock while creating the
313 			 * thread, in case the call blocks.  So we use a
314 			 * flag to indicate that a page flush thread is
315 			 * active.
316 			 */
317 			mutex_enter(&rp->r_statelock);
318 			if (rp->r_flags & R4PGFLUSH) {
319 				mutex_exit(&rp->r_statelock);
320 			} else {
321 				rp->r_flags |= R4PGFLUSH;
322 				mutex_exit(&rp->r_statelock);
323 
324 				args = kmem_alloc(sizeof (pgflush_t),
325 				    KM_SLEEP);
326 				args->vp = vp;
327 				VN_HOLD(args->vp);
328 				args->cr = cr;
329 				crhold(args->cr);
330 				(void) zthread_create(NULL, 0,
331 				    nfs4_pgflush_thread, args, 0,
332 				    minclsyspri);
333 			}
334 		}
335 	}
336 
337 	/*
338 	 * Flush the readdir response cache.
339 	 */
340 	nfs4_purge_rddir_cache(vp);
341 }
342 
343 /*
344  * Invalidate all pages for the given file, after writing back the dirty
345  * ones.
346  */
347 
348 static void
349 flush_pages(vnode_t *vp, cred_t *cr)
350 {
351 	int error;
352 	rnode4_t *rp = VTOR4(vp);
353 
354 	error = VOP_PUTPAGE(vp, (u_offset_t)0, 0, B_INVAL, cr, NULL);
355 	if (error == ENOSPC || error == EDQUOT) {
356 		mutex_enter(&rp->r_statelock);
357 		if (!rp->r_error)
358 			rp->r_error = error;
359 		mutex_exit(&rp->r_statelock);
360 	}
361 }
362 
363 /*
364  * Page flush thread.
365  */
366 
367 static void
368 nfs4_pgflush_thread(pgflush_t *args)
369 {
370 	rnode4_t *rp = VTOR4(args->vp);
371 
372 	/* remember which thread we are, so we don't deadlock ourselves */
373 	mutex_enter(&rp->r_statelock);
374 	ASSERT(rp->r_pgflush == NULL);
375 	rp->r_pgflush = curthread;
376 	mutex_exit(&rp->r_statelock);
377 
378 	flush_pages(args->vp, args->cr);
379 
380 	mutex_enter(&rp->r_statelock);
381 	rp->r_pgflush = NULL;
382 	rp->r_flags &= ~R4PGFLUSH;
383 	cv_broadcast(&rp->r_cv);
384 	mutex_exit(&rp->r_statelock);
385 
386 	VN_RELE(args->vp);
387 	crfree(args->cr);
388 	kmem_free(args, sizeof (pgflush_t));
389 	zthread_exit();
390 }
391 
392 /*
393  * Purge the readdir cache of all entries which are not currently
394  * being filled.
395  */
396 void
397 nfs4_purge_rddir_cache(vnode_t *vp)
398 {
399 	rnode4_t *rp;
400 
401 	rp = VTOR4(vp);
402 
403 	mutex_enter(&rp->r_statelock);
404 	rp->r_direof = NULL;
405 	rp->r_flags &= ~R4LOOKUP;
406 	rp->r_flags |= R4READDIRWATTR;
407 	rddir4_cache_purge(rp);
408 	mutex_exit(&rp->r_statelock);
409 }
410 
411 /*
412  * Set attributes cache for given vnode using virtual attributes.  There is
413  * no cache validation, but if the attributes are deemed to be stale, they
414  * are ignored.  This corresponds to nfs3_attrcache().
415  *
416  * Set the timeout value on the attribute cache and fill it
417  * with the passed in attributes.
418  */
419 void
420 nfs4_attrcache_noinval(vnode_t *vp, nfs4_ga_res_t *garp, hrtime_t t)
421 {
422 	rnode4_t *rp = VTOR4(vp);
423 
424 	mutex_enter(&rp->r_statelock);
425 	if (rp->r_time_attr_saved <= t)
426 		nfs4_attrcache_va(vp, garp, FALSE);
427 	mutex_exit(&rp->r_statelock);
428 }
429 
430 /*
431  * Use the passed in virtual attributes to check to see whether the
432  * data and metadata caches are valid, cache the new attributes, and
433  * then do the cache invalidation if required.
434  *
435  * The cache validation and caching of the new attributes is done
436  * atomically via the use of the mutex, r_statelock.  If required,
437  * the cache invalidation is done atomically w.r.t. the cache
438  * validation and caching of the attributes via the pseudo lock,
439  * r_serial.
440  *
441  * This routine is used to do cache validation and attributes caching
442  * for operations with a single set of post operation attributes.
443  */
444 
445 void
446 nfs4_attr_cache(vnode_t *vp, nfs4_ga_res_t *garp,
447     hrtime_t t, cred_t *cr, int async,
448     change_info4 *cinfo)
449 {
450 	rnode4_t *rp;
451 	int mtime_changed;
452 	int ctime_changed;
453 	vsecattr_t *vsp;
454 	int was_serial, set_time_cache_inval, recov;
455 	vattr_t *vap = &garp->n4g_va;
456 	mntinfo4_t *mi = VTOMI4(vp);
457 
458 	ASSERT(mi->mi_vfsp->vfs_dev == garp->n4g_va.va_fsid);
459 
460 	/* Is curthread the recovery thread? */
461 	mutex_enter(&mi->mi_lock);
462 	recov = (VTOMI4(vp)->mi_recovthread == curthread);
463 	mutex_exit(&mi->mi_lock);
464 
465 	rp = VTOR4(vp);
466 	mutex_enter(&rp->r_statelock);
467 	was_serial = (rp->r_serial == curthread);
468 	if (rp->r_serial && !was_serial) {
469 		klwp_t *lwp = ttolwp(curthread);
470 
471 		/*
472 		 * If we're the recovery thread, then purge current attrs
473 		 * and bail out to avoid potential deadlock between another
474 		 * thread caching attrs (r_serial thread), recov thread,
475 		 * and an async writer thread.
476 		 */
477 		if (recov) {
478 			PURGE_ATTRCACHE4_LOCKED(rp);
479 			mutex_exit(&rp->r_statelock);
480 			return;
481 		}
482 
483 		if (lwp != NULL)
484 			lwp->lwp_nostop++;
485 		while (rp->r_serial != NULL) {
486 			if (!cv_wait_sig(&rp->r_cv, &rp->r_statelock)) {
487 				mutex_exit(&rp->r_statelock);
488 				if (lwp != NULL)
489 					lwp->lwp_nostop--;
490 				return;
491 			}
492 		}
493 		if (lwp != NULL)
494 			lwp->lwp_nostop--;
495 	}
496 
497 	/*
498 	 * If there is a page flush thread, the current thread needs to
499 	 * bail out, to prevent a possible deadlock between the current
500 	 * thread (which might be in a start_op/end_op region), the
501 	 * recovery thread, and the page flush thread.  Expire the
502 	 * attribute cache, so that any attributes the current thread was
503 	 * going to set are not lost.
504 	 */
505 	if ((rp->r_flags & R4PGFLUSH) && rp->r_pgflush != curthread) {
506 		PURGE_ATTRCACHE4_LOCKED(rp);
507 		mutex_exit(&rp->r_statelock);
508 		return;
509 	}
510 
511 	if (rp->r_time_attr_saved > t) {
512 		/*
513 		 * Attributes have been cached since these attributes were
514 		 * probably made. If there is an inconsistency in what is
515 		 * cached, mark them invalid. If not, don't act on them.
516 		 */
517 		if (!CACHE4_VALID(rp, vap->va_mtime, vap->va_size))
518 			PURGE_ATTRCACHE4_LOCKED(rp);
519 		mutex_exit(&rp->r_statelock);
520 		return;
521 	}
522 	set_time_cache_inval = 0;
523 	if (cinfo) {
524 		/*
525 		 * Only directory modifying callers pass non-NULL cinfo.
526 		 */
527 		ASSERT(vp->v_type == VDIR);
528 		/*
529 		 * If the cache timeout either doesn't exist or hasn't expired,
530 		 * and dir didn't changed on server before dirmod op
531 		 * and dir didn't change after dirmod op but before getattr
532 		 * then there's a chance that the client's cached data for
533 		 * this object is current (not stale).  No immediate cache
534 		 * flush is required.
535 		 *
536 		 */
537 		if ((! rp->r_time_cache_inval || t < rp->r_time_cache_inval) &&
538 		    cinfo->before == rp->r_change &&
539 		    (garp->n4g_change_valid &&
540 		    cinfo->after == garp->n4g_change)) {
541 
542 			/*
543 			 * If atomic isn't set, then the before/after info
544 			 * cannot be blindly trusted.  For this case, we tell
545 			 * nfs4_attrcache_va to cache the attrs but also
546 			 * establish an absolute maximum cache timeout.  When
547 			 * the timeout is reached, caches will be flushed.
548 			 */
549 			if (! cinfo->atomic)
550 				set_time_cache_inval = 1;
551 
552 			mtime_changed = 0;
553 			ctime_changed = 0;
554 		} else {
555 
556 			/*
557 			 * We're not sure exactly what changed, but we know
558 			 * what to do.  flush all caches for dir.  remove the
559 			 * attr timeout.
560 			 *
561 			 * a) timeout expired.  flush all caches.
562 			 * b) r_change != cinfo.before.  flush all caches.
563 			 * c) r_change == cinfo.before, but cinfo.after !=
564 			 *    post-op getattr(change).  flush all caches.
565 			 * d) post-op getattr(change) not provided by server.
566 			 *    flush all caches.
567 			 */
568 			mtime_changed = 1;
569 			ctime_changed = 1;
570 			rp->r_time_cache_inval = 0;
571 		}
572 	} else {
573 		if (!(rp->r_flags & R4WRITEMODIFIED)) {
574 			if (!CACHE4_VALID(rp, vap->va_mtime, vap->va_size))
575 				mtime_changed = 1;
576 			else
577 				mtime_changed = 0;
578 			if (rp->r_attr.va_ctime.tv_sec !=
579 			    vap->va_ctime.tv_sec ||
580 			    rp->r_attr.va_ctime.tv_nsec !=
581 			    vap->va_ctime.tv_nsec)
582 				ctime_changed = 1;
583 			else
584 				ctime_changed = 0;
585 		} else {
586 			mtime_changed = 0;
587 			ctime_changed = 0;
588 		}
589 	}
590 
591 	nfs4_attrcache_va(vp, garp, set_time_cache_inval);
592 
593 	if (!mtime_changed && !ctime_changed) {
594 		mutex_exit(&rp->r_statelock);
595 		return;
596 	}
597 
598 	rp->r_serial = curthread;
599 
600 	mutex_exit(&rp->r_statelock);
601 
602 	/*
603 	 * If we're the recov thread, then force async nfs4_purge_caches
604 	 * to avoid potential deadlock.
605 	 */
606 	if (mtime_changed)
607 		nfs4_purge_caches(vp, NFS4_NOPURGE_DNLC, cr, recov ? 1 : async);
608 
609 	if (ctime_changed) {
610 		(void) nfs4_access_purge_rp(rp);
611 		if (rp->r_secattr != NULL) {
612 			mutex_enter(&rp->r_statelock);
613 			vsp = rp->r_secattr;
614 			rp->r_secattr = NULL;
615 			mutex_exit(&rp->r_statelock);
616 			if (vsp != NULL)
617 				nfs4_acl_free_cache(vsp);
618 		}
619 	}
620 
621 	if (!was_serial) {
622 		mutex_enter(&rp->r_statelock);
623 		rp->r_serial = NULL;
624 		cv_broadcast(&rp->r_cv);
625 		mutex_exit(&rp->r_statelock);
626 	}
627 }
628 
629 /*
630  * Set attributes cache for given vnode using virtual attributes.
631  *
632  * Set the timeout value on the attribute cache and fill it
633  * with the passed in attributes.
634  *
635  * The caller must be holding r_statelock.
636  */
637 static void
638 nfs4_attrcache_va(vnode_t *vp, nfs4_ga_res_t *garp, int set_cache_timeout)
639 {
640 	rnode4_t *rp;
641 	mntinfo4_t *mi;
642 	hrtime_t delta;
643 	hrtime_t now;
644 	vattr_t *vap = &garp->n4g_va;
645 
646 	rp = VTOR4(vp);
647 
648 	ASSERT(MUTEX_HELD(&rp->r_statelock));
649 	ASSERT(vap->va_mask == AT_ALL);
650 
651 	/* Switch to master before checking v_flag */
652 	if (IS_SHADOW(vp, rp))
653 		vp = RTOV4(rp);
654 
655 	now = gethrtime();
656 
657 	mi = VTOMI4(vp);
658 
659 	/*
660 	 * Only establish a new cache timeout (if requested).  Never
661 	 * extend a timeout.  Never clear a timeout.  Clearing a timeout
662 	 * is done by nfs4_update_dircaches (ancestor in our call chain)
663 	 */
664 	if (set_cache_timeout && ! rp->r_time_cache_inval)
665 		rp->r_time_cache_inval = now + mi->mi_acdirmax;
666 
667 	/*
668 	 * Delta is the number of nanoseconds that we will
669 	 * cache the attributes of the file.  It is based on
670 	 * the number of nanoseconds since the last time that
671 	 * we detected a change.  The assumption is that files
672 	 * that changed recently are likely to change again.
673 	 * There is a minimum and a maximum for regular files
674 	 * and for directories which is enforced though.
675 	 *
676 	 * Using the time since last change was detected
677 	 * eliminates direct comparison or calculation
678 	 * using mixed client and server times.  NFS does
679 	 * not make any assumptions regarding the client
680 	 * and server clocks being synchronized.
681 	 */
682 	if (vap->va_mtime.tv_sec != rp->r_attr.va_mtime.tv_sec ||
683 	    vap->va_mtime.tv_nsec != rp->r_attr.va_mtime.tv_nsec ||
684 	    vap->va_size != rp->r_attr.va_size) {
685 		rp->r_time_attr_saved = now;
686 	}
687 
688 	if ((mi->mi_flags & MI4_NOAC) || (vp->v_flag & VNOCACHE))
689 		delta = 0;
690 	else {
691 		delta = now - rp->r_time_attr_saved;
692 		if (vp->v_type == VDIR) {
693 			if (delta < mi->mi_acdirmin)
694 				delta = mi->mi_acdirmin;
695 			else if (delta > mi->mi_acdirmax)
696 				delta = mi->mi_acdirmax;
697 		} else {
698 			if (delta < mi->mi_acregmin)
699 				delta = mi->mi_acregmin;
700 			else if (delta > mi->mi_acregmax)
701 				delta = mi->mi_acregmax;
702 		}
703 	}
704 	rp->r_time_attr_inval = now + delta;
705 
706 	rp->r_attr = *vap;
707 	if (garp->n4g_change_valid)
708 		rp->r_change = garp->n4g_change;
709 
710 	/*
711 	 * The attributes that were returned may be valid and can
712 	 * be used, but they may not be allowed to be cached.
713 	 * Reset the timers to cause immediate invalidation and
714 	 * clear r_change so no VERIFY operations will suceed
715 	 */
716 	if (garp->n4g_attrwhy == NFS4_GETATTR_NOCACHE_OK) {
717 		rp->r_time_attr_inval = now;
718 		rp->r_time_attr_saved = now;
719 		rp->r_change = 0;
720 	}
721 
722 	/*
723 	 * If mounted_on_fileid returned AND the object is a stub,
724 	 * then set object's va_nodeid to the mounted over fid
725 	 * returned by server.
726 	 *
727 	 * If mounted_on_fileid not provided/supported, then
728 	 * just set it to 0 for now.  Eventually it would be
729 	 * better to set it to a hashed version of FH.  This
730 	 * would probably be good enough to provide a unique
731 	 * fid/d_ino within a dir.
732 	 *
733 	 * We don't need to carry mounted_on_fileid in the
734 	 * rnode as long as the client never requests fileid
735 	 * without also requesting mounted_on_fileid.  For
736 	 * now, it stays.
737 	 */
738 	if (garp->n4g_mon_fid_valid) {
739 		rp->r_mntd_fid = garp->n4g_mon_fid;
740 
741 		if (RP_ISSTUB(rp))
742 			rp->r_attr.va_nodeid = rp->r_mntd_fid;
743 	}
744 
745 	/*
746 	 * Check to see if there are valid pathconf bits to
747 	 * cache in the rnode.
748 	 */
749 	if (garp->n4g_ext_res) {
750 		if (garp->n4g_ext_res->n4g_pc4.pc4_cache_valid) {
751 			rp->r_pathconf = garp->n4g_ext_res->n4g_pc4;
752 		} else {
753 			if (garp->n4g_ext_res->n4g_pc4.pc4_xattr_valid) {
754 				rp->r_pathconf.pc4_xattr_valid = TRUE;
755 				rp->r_pathconf.pc4_xattr_exists =
756 				    garp->n4g_ext_res->n4g_pc4.pc4_xattr_exists;
757 			}
758 		}
759 	}
760 	/*
761 	 * Update the size of the file if there is no cached data or if
762 	 * the cached data is clean and there is no data being written
763 	 * out.
764 	 */
765 	if (rp->r_size != vap->va_size &&
766 	    (!vn_has_cached_data(vp) ||
767 	    (!(rp->r_flags & R4DIRTY) && rp->r_count == 0))) {
768 		rp->r_size = vap->va_size;
769 	}
770 	nfs_setswaplike(vp, vap);
771 	rp->r_flags &= ~R4WRITEMODIFIED;
772 }
773 
774 /*
775  * Get attributes over-the-wire and update attributes cache
776  * if no error occurred in the over-the-wire operation.
777  * Return 0 if successful, otherwise error.
778  */
779 int
780 nfs4_getattr_otw(vnode_t *vp, nfs4_ga_res_t *garp, cred_t *cr, int get_acl)
781 {
782 	mntinfo4_t *mi = VTOMI4(vp);
783 	hrtime_t t;
784 	nfs4_recov_state_t recov_state;
785 	nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };
786 
787 	recov_state.rs_flags = 0;
788 	recov_state.rs_num_retry_despite_err = 0;
789 
790 	/* Save the original mount point security flavor */
791 	(void) save_mnt_secinfo(mi->mi_curr_serv);
792 
793 recov_retry:
794 
795 	if ((e.error = nfs4_start_fop(mi, vp, NULL, OH_GETATTR,
796 	    &recov_state, NULL))) {
797 		(void) check_mnt_secinfo(mi->mi_curr_serv, vp);
798 		return (e.error);
799 	}
800 
801 	t = gethrtime();
802 
803 	nfs4_getattr_otw_norecovery(vp, garp, &e, cr, get_acl);
804 
805 	if (nfs4_needs_recovery(&e, FALSE, vp->v_vfsp)) {
806 		if (nfs4_start_recovery(&e, VTOMI4(vp), vp, NULL, NULL,
807 		    NULL, OP_GETATTR, NULL) == FALSE)  {
808 			nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_GETATTR,
809 			    &recov_state, 1);
810 			goto recov_retry;
811 		}
812 	}
813 
814 	nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_GETATTR, &recov_state, 0);
815 
816 	if (!e.error) {
817 		if (e.stat == NFS4_OK) {
818 			nfs4_attr_cache(vp, garp, t, cr, FALSE, NULL);
819 		} else {
820 			e.error = geterrno4(e.stat);
821 
822 			nfs4_purge_stale_fh(e.error, vp, cr);
823 		}
824 	}
825 
826 	/*
827 	 * If getattr a node that is a stub for a crossed
828 	 * mount point, keep the original secinfo flavor for
829 	 * the current file system, not the crossed one.
830 	 */
831 	(void) check_mnt_secinfo(mi->mi_curr_serv, vp);
832 
833 	return (e.error);
834 }
835 
836 /*
837  * Generate a compound to get attributes over-the-wire.
838  */
839 void
840 nfs4_getattr_otw_norecovery(vnode_t *vp, nfs4_ga_res_t *garp,
841     nfs4_error_t *ep, cred_t *cr, int get_acl)
842 {
843 	COMPOUND4args_clnt args;
844 	COMPOUND4res_clnt res;
845 	int doqueue;
846 	rnode4_t *rp = VTOR4(vp);
847 	nfs_argop4 argop[2];
848 
849 	args.ctag = TAG_GETATTR;
850 
851 	args.array_len = 2;
852 	args.array = argop;
853 
854 	/* putfh */
855 	argop[0].argop = OP_CPUTFH;
856 	argop[0].nfs_argop4_u.opcputfh.sfh = rp->r_fh;
857 
858 	/* getattr */
859 	/*
860 	 * Unlike nfs version 2 and 3, where getattr returns all the
861 	 * attributes, nfs version 4 returns only the ones explicitly
862 	 * asked for. This creates problems, as some system functions
863 	 * (e.g. cache check) require certain attributes and if the
864 	 * cached node lacks some attributes such as uid/gid, it can
865 	 * affect system utilities (e.g. "ls") that rely on the information
866 	 * to be there. This can lead to anything from system crashes to
867 	 * corrupted information processed by user apps.
868 	 * So to ensure that all bases are covered, request at least
869 	 * the AT_ALL attribute mask.
870 	 */
871 	argop[1].argop = OP_GETATTR;
872 	argop[1].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK;
873 	if (get_acl)
874 		argop[1].nfs_argop4_u.opgetattr.attr_request |= FATTR4_ACL_MASK;
875 	argop[1].nfs_argop4_u.opgetattr.mi = VTOMI4(vp);
876 
877 	doqueue = 1;
878 
879 	rfs4call(VTOMI4(vp), &args, &res, cr, &doqueue, 0, ep);
880 
881 	if (ep->error)
882 		return;
883 
884 	if (res.status != NFS4_OK) {
885 		(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
886 		return;
887 	}
888 
889 	*garp = res.array[1].nfs_resop4_u.opgetattr.ga_res;
890 
891 	(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
892 }
893 
894 /*
895  * Return either cached or remote attributes. If get remote attr
896  * use them to check and invalidate caches, then cache the new attributes.
897  */
898 int
899 nfs4getattr(vnode_t *vp, vattr_t *vap, cred_t *cr)
900 {
901 	int error;
902 	rnode4_t *rp;
903 	nfs4_ga_res_t gar;
904 
905 	ASSERT(nfs4_consistent_type(vp));
906 
907 	/*
908 	 * If we've got cached attributes, we're done, otherwise go
909 	 * to the server to get attributes, which will update the cache
910 	 * in the process. Either way, use the cached attributes for
911 	 * the caller's vattr_t.
912 	 *
913 	 * Note that we ignore the gar set by the OTW call: the attr caching
914 	 * code may make adjustments when storing to the rnode, and we want
915 	 * to see those changes here.
916 	 */
917 	rp = VTOR4(vp);
918 	error = 0;
919 	mutex_enter(&rp->r_statelock);
920 	if (!ATTRCACHE4_VALID(vp)) {
921 		mutex_exit(&rp->r_statelock);
922 		error = nfs4_getattr_otw(vp, &gar, cr, 0);
923 		mutex_enter(&rp->r_statelock);
924 	}
925 
926 	if (!error)
927 		*vap = rp->r_attr;
928 
929 	/* Return the client's view of file size */
930 	vap->va_size = rp->r_size;
931 
932 	mutex_exit(&rp->r_statelock);
933 
934 	ASSERT(nfs4_consistent_type(vp));
935 
936 	return (error);
937 }
938 
939 int
940 nfs4_attr_otw(vnode_t *vp, nfs4_tag_type_t tag_type,
941     nfs4_ga_res_t *garp, bitmap4 reqbitmap, cred_t *cr)
942 {
943 	COMPOUND4args_clnt args;
944 	COMPOUND4res_clnt res;
945 	int doqueue;
946 	nfs_argop4 argop[2];
947 	mntinfo4_t *mi = VTOMI4(vp);
948 	bool_t needrecov = FALSE;
949 	nfs4_recov_state_t recov_state;
950 	nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };
951 	nfs4_ga_ext_res_t *gerp;
952 
953 	recov_state.rs_flags = 0;
954 	recov_state.rs_num_retry_despite_err = 0;
955 
956 recov_retry:
957 	args.ctag = tag_type;
958 
959 	args.array_len = 2;
960 	args.array = argop;
961 
962 	e.error = nfs4_start_fop(mi, vp, NULL, OH_GETATTR, &recov_state, NULL);
963 	if (e.error)
964 		return (e.error);
965 
966 	/* putfh */
967 	argop[0].argop = OP_CPUTFH;
968 	argop[0].nfs_argop4_u.opcputfh.sfh = VTOR4(vp)->r_fh;
969 
970 	/* getattr */
971 	argop[1].argop = OP_GETATTR;
972 	argop[1].nfs_argop4_u.opgetattr.attr_request = reqbitmap;
973 	argop[1].nfs_argop4_u.opgetattr.mi = mi;
974 
975 	doqueue = 1;
976 
977 	NFS4_DEBUG(nfs4_client_call_debug, (CE_NOTE,
978 	    "nfs4_attr_otw: %s call, rp %s", needrecov ? "recov" : "first",
979 	    rnode4info(VTOR4(vp))));
980 
981 	rfs4call(mi, &args, &res, cr, &doqueue, 0, &e);
982 
983 	needrecov = nfs4_needs_recovery(&e, FALSE, vp->v_vfsp);
984 	if (!needrecov && e.error) {
985 		nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_GETATTR, &recov_state,
986 		    needrecov);
987 		return (e.error);
988 	}
989 
990 	if (needrecov) {
991 		bool_t abort;
992 
993 		NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
994 		    "nfs4_attr_otw: initiating recovery\n"));
995 
996 		abort = nfs4_start_recovery(&e, VTOMI4(vp), vp, NULL, NULL,
997 		    NULL, OP_GETATTR, NULL);
998 		nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_GETATTR, &recov_state,
999 		    needrecov);
1000 		if (!e.error) {
1001 			(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
1002 			e.error = geterrno4(res.status);
1003 		}
1004 		if (abort == FALSE)
1005 			goto recov_retry;
1006 		return (e.error);
1007 	}
1008 
1009 	if (res.status) {
1010 		e.error = geterrno4(res.status);
1011 	} else {
1012 		gerp = garp->n4g_ext_res;
1013 		bcopy(&res.array[1].nfs_resop4_u.opgetattr.ga_res,
1014 		    garp, sizeof (nfs4_ga_res_t));
1015 		garp->n4g_ext_res = gerp;
1016 		if (garp->n4g_ext_res &&
1017 		    res.array[1].nfs_resop4_u.opgetattr.ga_res.n4g_ext_res)
1018 			bcopy(res.array[1].nfs_resop4_u.opgetattr.
1019 			    ga_res.n4g_ext_res,
1020 			    garp->n4g_ext_res, sizeof (nfs4_ga_ext_res_t));
1021 	}
1022 	(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
1023 	nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_GETATTR, &recov_state,
1024 	    needrecov);
1025 	return (e.error);
1026 }
1027 
1028 /*
1029  * Asynchronous I/O parameters.  nfs_async_threads is the high-water mark
1030  * for the demand-based allocation of async threads per-mount.  The
1031  * nfs_async_timeout is the amount of time a thread will live after it
1032  * becomes idle, unless new I/O requests are received before the thread
1033  * dies.  See nfs4_async_putpage and nfs4_async_start.
1034  */
1035 
1036 static void	nfs4_async_start(struct vfs *);	/* forward declaration; the definition is not in this part of the file */
1037 
1038 static void
1039 free_async_args4(struct nfs4_async_reqs *args)
1040 {
1041 	rnode4_t *rp;
1042 
1043 	if (args->a_io != NFS4_INACTIVE) {
1044 		rp = VTOR4(args->a_vp);
1045 		mutex_enter(&rp->r_statelock);
1046 		rp->r_count--;
1047 		if (args->a_io == NFS4_PUTAPAGE ||
1048 		    args->a_io == NFS4_PAGEIO)
1049 			rp->r_awcount--;
1050 		cv_broadcast(&rp->r_cv);
1051 		mutex_exit(&rp->r_statelock);
1052 		VN_RELE(args->a_vp);
1053 	}
1054 	crfree(args->a_cred);
1055 	kmem_free(args, sizeof (*args));
1056 }
1057 
1058 /*
1059  * Cross-zone thread creation and NFS access is disallowed, yet fsflush() and
1060  * pageout(), running in the global zone, have legitimate reasons to do
1061  * VOP_PUTPAGE(B_ASYNC) on other zones' NFS mounts.  We avoid the problem by
1062  * use of a a per-mount "asynchronous requests manager thread" which is
1063  * signaled by the various asynchronous work routines when there is
1064  * asynchronous work to be done.  It is responsible for creating new
1065  * worker threads if necessary, and notifying existing worker threads
1066  * that there is work to be done.
1067  *
1068  * In other words, it will "take the specifications from the customers and
1069  * give them to the engineers."
1070  *
1071  * Worker threads die off of their own accord if they are no longer
1072  * needed.
1073  *
1074  * This thread is killed when the zone is going away or the filesystem
1075  * is being unmounted.
1076  */
1077 void
1078 nfs4_async_manager(vfs_t *vfsp)
1079 {
1080 	callb_cpr_t cprinfo;
1081 	mntinfo4_t *mi;
1082 	uint_t max_threads;
1083 
1084 	mi = VFTOMI4(vfsp);
1085 
1086 	CALLB_CPR_INIT(&cprinfo, &mi->mi_async_lock, callb_generic_cpr,
1087 	    "nfs4_async_manager");
1088 
1089 	mutex_enter(&mi->mi_async_lock);
1090 	/*
1091 	 * We want to stash the max number of threads that this mount was
1092 	 * allowed so we can use it later when the variable is set to zero as
1093 	 * part of the zone/mount going away.
1094 	 *
1095 	 * We want to be able to create at least one thread to handle
1096 	 * asyncrhonous inactive calls.
1097 	 */
1098 	max_threads = MAX(mi->mi_max_threads, 1);
1099 	mutex_enter(&mi->mi_lock);
1100 	/*
1101 	 * We don't want to wait for mi_max_threads to go to zero, since that
1102 	 * happens as part of a failed unmount, but this thread should only
1103 	 * exit when the mount is really going away.
1104 	 *
1105 	 * Once MI4_ASYNC_MGR_STOP is set, no more async operations will be
1106 	 * attempted: the various _async_*() functions know to do things
1107 	 * inline if mi_max_threads == 0.  Henceforth we just drain out the
1108 	 * outstanding requests.
1109 	 *
1110 	 * Note that we still create zthreads even if we notice the zone is
1111 	 * shutting down (MI4_ASYNC_MGR_STOP is set); this may cause the zone
1112 	 * shutdown sequence to take slightly longer in some cases, but
1113 	 * doesn't violate the protocol, as all threads will exit as soon as
1114 	 * they're done processing the remaining requests.
1115 	 */
1116 	while (!(mi->mi_flags & MI4_ASYNC_MGR_STOP) ||
1117 	    mi->mi_async_req_count > 0) {
1118 		mutex_exit(&mi->mi_lock);
1119 		CALLB_CPR_SAFE_BEGIN(&cprinfo);
1120 		cv_wait(&mi->mi_async_reqs_cv, &mi->mi_async_lock);
1121 		CALLB_CPR_SAFE_END(&cprinfo, &mi->mi_async_lock);
1122 		while (mi->mi_async_req_count > 0) {
1123 			/*
1124 			 * Paranoia: If the mount started out having
1125 			 * (mi->mi_max_threads == 0), and the value was
1126 			 * later changed (via a debugger or somesuch),
1127 			 * we could be confused since we will think we
1128 			 * can't create any threads, and the calling
1129 			 * code (which looks at the current value of
1130 			 * mi->mi_max_threads, now non-zero) thinks we
1131 			 * can.
1132 			 *
1133 			 * So, because we're paranoid, we create threads
1134 			 * up to the maximum of the original and the
1135 			 * current value. This means that future
1136 			 * (debugger-induced) alterations of
1137 			 * mi->mi_max_threads are ignored for our
1138 			 * purposes, but who told them they could change
1139 			 * random values on a live kernel anyhow?
1140 			 */
1141 			if (mi->mi_threads <
1142 			    MAX(mi->mi_max_threads, max_threads)) {
1143 				mi->mi_threads++;
1144 				mutex_exit(&mi->mi_async_lock);
1145 				MI4_HOLD(mi);
1146 				VFS_HOLD(vfsp);	/* hold for new thread */
1147 				(void) zthread_create(NULL, 0, nfs4_async_start,
1148 				    vfsp, 0, minclsyspri);
1149 				mutex_enter(&mi->mi_async_lock);
1150 			}
1151 			cv_signal(&mi->mi_async_work_cv);
1152 			ASSERT(mi->mi_async_req_count != 0);
1153 			mi->mi_async_req_count--;
1154 		}
1155 		mutex_enter(&mi->mi_lock);
1156 	}
1157 	mutex_exit(&mi->mi_lock);
1158 
1159 	NFS4_DEBUG(nfs4_client_zone_debug, (CE_NOTE,
1160 	    "nfs4_async_manager exiting for vfs %p\n", (void *)mi->mi_vfsp));
1161 	/*
1162 	 * Let everyone know we're done.
1163 	 */
1164 	mi->mi_manager_thread = NULL;
1165 	/*
1166 	 * Wake up the inactive thread.
1167 	 */
1168 	cv_broadcast(&mi->mi_inact_req_cv);
1169 	/*
1170 	 * Wake up anyone sitting in nfs4_async_manager_stop()
1171 	 */
1172 	cv_broadcast(&mi->mi_async_cv);
1173 	/*
1174 	 * There is no explicit call to mutex_exit(&mi->mi_async_lock)
1175 	 * since CALLB_CPR_EXIT is actually responsible for releasing
1176 	 * 'mi_async_lock'.
1177 	 */
1178 	CALLB_CPR_EXIT(&cprinfo);
1179 	VFS_RELE(vfsp);	/* release thread's hold */
1180 	MI4_RELE(mi);
1181 	zthread_exit();
1182 }
1183 
1184 /*
1185  * Signal (and wait for) the async manager thread to clean up and go away.
1186  */
1187 void
1188 nfs4_async_manager_stop(vfs_t *vfsp)
1189 {
1190 	mntinfo4_t *mi = VFTOMI4(vfsp);
1191 
1192 	mutex_enter(&mi->mi_async_lock);
1193 	mutex_enter(&mi->mi_lock);
1194 	mi->mi_flags |= MI4_ASYNC_MGR_STOP;
1195 	mutex_exit(&mi->mi_lock);
1196 	cv_broadcast(&mi->mi_async_reqs_cv);
1197 	/*
1198 	 * Wait for the async manager thread to die.
1199 	 */
1200 	while (mi->mi_manager_thread != NULL)
1201 		cv_wait(&mi->mi_async_cv, &mi->mi_async_lock);
1202 	mutex_exit(&mi->mi_async_lock);
1203 }
1204 
1205 int
1206 nfs4_async_readahead(vnode_t *vp, u_offset_t blkoff, caddr_t addr,
1207     struct seg *seg, cred_t *cr, void (*readahead)(vnode_t *,
1208     u_offset_t, caddr_t, struct seg *, cred_t *))
1209 {
1210 	rnode4_t *rp;
1211 	mntinfo4_t *mi;
1212 	struct nfs4_async_reqs *args;
1213 
1214 	rp = VTOR4(vp);
1215 	ASSERT(rp->r_freef == NULL);
1216 
1217 	mi = VTOMI4(vp);
1218 
1219 	/*
1220 	 * If addr falls in a different segment, don't bother doing readahead.
1221 	 */
1222 	if (addr >= seg->s_base + seg->s_size)
1223 		return (-1);
1224 
1225 	/*
1226 	 * If we can't allocate a request structure, punt on the readahead.
1227 	 */
1228 	if ((args = kmem_alloc(sizeof (*args), KM_NOSLEEP)) == NULL)
1229 		return (-1);
1230 
1231 	/*
1232 	 * If a lock operation is pending, don't initiate any new
1233 	 * readaheads.  Otherwise, bump r_count to indicate the new
1234 	 * asynchronous I/O.
1235 	 */
1236 	if (!nfs_rw_tryenter(&rp->r_lkserlock, RW_READER)) {
1237 		kmem_free(args, sizeof (*args));
1238 		return (-1);
1239 	}
1240 	mutex_enter(&rp->r_statelock);
1241 	rp->r_count++;
1242 	mutex_exit(&rp->r_statelock);
1243 	nfs_rw_exit(&rp->r_lkserlock);
1244 
1245 	args->a_next = NULL;
1246 #ifdef DEBUG
1247 	args->a_queuer = curthread;
1248 #endif
1249 	VN_HOLD(vp);
1250 	args->a_vp = vp;
1251 	ASSERT(cr != NULL);
1252 	crhold(cr);
1253 	args->a_cred = cr;
1254 	args->a_io = NFS4_READ_AHEAD;
1255 	args->a_nfs4_readahead = readahead;
1256 	args->a_nfs4_blkoff = blkoff;
1257 	args->a_nfs4_seg = seg;
1258 	args->a_nfs4_addr = addr;
1259 
1260 	mutex_enter(&mi->mi_async_lock);
1261 
1262 	/*
1263 	 * If asyncio has been disabled, don't bother readahead.
1264 	 */
1265 	if (mi->mi_max_threads == 0) {
1266 		mutex_exit(&mi->mi_async_lock);
1267 		goto noasync;
1268 	}
1269 
1270 	/*
1271 	 * Link request structure into the async list and
1272 	 * wakeup async thread to do the i/o.
1273 	 */
1274 	if (mi->mi_async_reqs[NFS4_READ_AHEAD] == NULL) {
1275 		mi->mi_async_reqs[NFS4_READ_AHEAD] = args;
1276 		mi->mi_async_tail[NFS4_READ_AHEAD] = args;
1277 	} else {
1278 		mi->mi_async_tail[NFS4_READ_AHEAD]->a_next = args;
1279 		mi->mi_async_tail[NFS4_READ_AHEAD] = args;
1280 	}
1281 
1282 	if (mi->mi_io_kstats) {
1283 		mutex_enter(&mi->mi_lock);
1284 		kstat_waitq_enter(KSTAT_IO_PTR(mi->mi_io_kstats));
1285 		mutex_exit(&mi->mi_lock);
1286 	}
1287 
1288 	mi->mi_async_req_count++;
1289 	ASSERT(mi->mi_async_req_count != 0);
1290 	cv_signal(&mi->mi_async_reqs_cv);
1291 	mutex_exit(&mi->mi_async_lock);
1292 	return (0);
1293 
1294 noasync:
1295 	mutex_enter(&rp->r_statelock);
1296 	rp->r_count--;
1297 	cv_broadcast(&rp->r_cv);
1298 	mutex_exit(&rp->r_statelock);
1299 	VN_RELE(vp);
1300 	crfree(cr);
1301 	kmem_free(args, sizeof (*args));
1302 	return (-1);
1303 }
1304 
1305 /*
1306  * The async queues for each mounted file system are arranged as a
1307  * set of queues, one for each async i/o type.  Requests are taken
1308  * from the queues in a round-robin fashion.  A number of consecutive
1309  * requests are taken from each queue before moving on to the next
1310  * queue.  This functionality may allow the NFS Version 2 server to do
1311  * write clustering, even if the client is mixing writes and reads
1312  * because it will take multiple write requests from the queue
1313  * before processing any of the other async i/o types.
1314  *
1315  * XXX The nfs4_async_start thread is unsafe in the light of the present
1316  * model defined by cpr to suspend the system. Specifically over the
1317  * wire calls are cpr-unsafe. The thread should be reevaluated in
1318  * case of future updates to the cpr model.
1319  */
1320 static void
1321 nfs4_async_start(struct vfs *vfsp)
1322 {
1323 	struct nfs4_async_reqs *args;
1324 	mntinfo4_t *mi = VFTOMI4(vfsp);
1325 	clock_t time_left = 1;
1326 	callb_cpr_t cprinfo;
1327 	int i;
1328 	extern int nfs_async_timeout;
1329 
1330 	/*
1331 	 * Dynamic initialization of nfs_async_timeout to allow nfs to be
1332 	 * built in an implementation independent manner.
1333 	 */
1334 	if (nfs_async_timeout == -1)
1335 		nfs_async_timeout = NFS_ASYNC_TIMEOUT;
1336 
1337 	CALLB_CPR_INIT(&cprinfo, &mi->mi_async_lock, callb_generic_cpr, "nas");
1338 
1339 	mutex_enter(&mi->mi_async_lock);
1340 	for (;;) {
1341 		/*
1342 		 * Find the next queue containing an entry.  We start
1343 		 * at the current queue pointer and then round robin
1344 		 * through all of them until we either find a non-empty
1345 		 * queue or have looked through all of them.
1346 		 */
1347 		for (i = 0; i < NFS4_ASYNC_TYPES; i++) {
1348 			args = *mi->mi_async_curr;
1349 			if (args != NULL)
1350 				break;
1351 			mi->mi_async_curr++;
1352 			if (mi->mi_async_curr ==
1353 			    &mi->mi_async_reqs[NFS4_ASYNC_TYPES])
1354 				mi->mi_async_curr = &mi->mi_async_reqs[0];
1355 		}
1356 		/*
1357 		 * If we didn't find a entry, then block until woken up
1358 		 * again and then look through the queues again.
1359 		 */
1360 		if (args == NULL) {
1361 			/*
1362 			 * Exiting is considered to be safe for CPR as well
1363 			 */
1364 			CALLB_CPR_SAFE_BEGIN(&cprinfo);
1365 
1366 			/*
1367 			 * Wakeup thread waiting to unmount the file
1368 			 * system only if all async threads are inactive.
1369 			 *
1370 			 * If we've timed-out and there's nothing to do,
1371 			 * then get rid of this thread.
1372 			 */
1373 			if (mi->mi_max_threads == 0 || time_left <= 0) {
1374 				if (--mi->mi_threads == 0)
1375 					cv_signal(&mi->mi_async_cv);
1376 				CALLB_CPR_EXIT(&cprinfo);
1377 				VFS_RELE(vfsp);	/* release thread's hold */
1378 				MI4_RELE(mi);
1379 				zthread_exit();
1380 				/* NOTREACHED */
1381 			}
1382 			time_left = cv_timedwait(&mi->mi_async_work_cv,
1383 			    &mi->mi_async_lock, nfs_async_timeout + lbolt);
1384 
1385 			CALLB_CPR_SAFE_END(&cprinfo, &mi->mi_async_lock);
1386 
1387 			continue;
1388 		} else {
1389 			time_left = 1;
1390 		}
1391 
1392 		/*
1393 		 * Remove the request from the async queue and then
1394 		 * update the current async request queue pointer.  If
1395 		 * the current queue is empty or we have removed enough
1396 		 * consecutive entries from it, then reset the counter
1397 		 * for this queue and then move the current pointer to
1398 		 * the next queue.
1399 		 */
1400 		*mi->mi_async_curr = args->a_next;
1401 		if (*mi->mi_async_curr == NULL ||
1402 		    --mi->mi_async_clusters[args->a_io] == 0) {
1403 			mi->mi_async_clusters[args->a_io] =
1404 			    mi->mi_async_init_clusters;
1405 			mi->mi_async_curr++;
1406 			if (mi->mi_async_curr ==
1407 			    &mi->mi_async_reqs[NFS4_ASYNC_TYPES])
1408 				mi->mi_async_curr = &mi->mi_async_reqs[0];
1409 		}
1410 
1411 		if (args->a_io != NFS4_INACTIVE && mi->mi_io_kstats) {
1412 			mutex_enter(&mi->mi_lock);
1413 			kstat_waitq_exit(KSTAT_IO_PTR(mi->mi_io_kstats));
1414 			mutex_exit(&mi->mi_lock);
1415 		}
1416 
1417 		mutex_exit(&mi->mi_async_lock);
1418 
1419 		/*
1420 		 * Obtain arguments from the async request structure.
1421 		 */
1422 		if (args->a_io == NFS4_READ_AHEAD && mi->mi_max_threads > 0) {
1423 			(*args->a_nfs4_readahead)(args->a_vp,
1424 			    args->a_nfs4_blkoff, args->a_nfs4_addr,
1425 			    args->a_nfs4_seg, args->a_cred);
1426 		} else if (args->a_io == NFS4_PUTAPAGE) {
1427 			(void) (*args->a_nfs4_putapage)(args->a_vp,
1428 			    args->a_nfs4_pp, args->a_nfs4_off,
1429 			    args->a_nfs4_len, args->a_nfs4_flags,
1430 			    args->a_cred);
1431 		} else if (args->a_io == NFS4_PAGEIO) {
1432 			(void) (*args->a_nfs4_pageio)(args->a_vp,
1433 			    args->a_nfs4_pp, args->a_nfs4_off,
1434 			    args->a_nfs4_len, args->a_nfs4_flags,
1435 			    args->a_cred);
1436 		} else if (args->a_io == NFS4_READDIR) {
1437 			(void) ((*args->a_nfs4_readdir)(args->a_vp,
1438 			    args->a_nfs4_rdc, args->a_cred));
1439 		} else if (args->a_io == NFS4_COMMIT) {
1440 			(*args->a_nfs4_commit)(args->a_vp, args->a_nfs4_plist,
1441 			    args->a_nfs4_offset, args->a_nfs4_count,
1442 			    args->a_cred);
1443 		} else if (args->a_io == NFS4_INACTIVE) {
1444 			nfs4_inactive_otw(args->a_vp, args->a_cred);
1445 		}
1446 
1447 		/*
1448 		 * Now, release the vnode and free the credentials
1449 		 * structure.
1450 		 */
1451 		free_async_args4(args);
1452 		/*
1453 		 * Reacquire the mutex because it will be needed above.
1454 		 */
1455 		mutex_enter(&mi->mi_async_lock);
1456 	}
1457 }
1458 
1459 /*
1460  * nfs4_inactive_thread - look for vnodes that need over-the-wire calls as
1461  * part of VOP_INACTIVE.
1462  */
1463 
1464 void
1465 nfs4_inactive_thread(mntinfo4_t *mi)
1466 {
1467 	struct nfs4_async_reqs *args;
1468 	callb_cpr_t cprinfo;
1469 	vfs_t *vfsp = mi->mi_vfsp;
1470 
1471 	CALLB_CPR_INIT(&cprinfo, &mi->mi_async_lock, callb_generic_cpr,
1472 	    "nfs4_inactive_thread");
1473 
1474 	for (;;) {
1475 		mutex_enter(&mi->mi_async_lock);
1476 		args = mi->mi_async_reqs[NFS4_INACTIVE];
1477 		if (args == NULL) {
1478 			mutex_enter(&mi->mi_lock);
1479 			/*
1480 			 * We don't want to exit until the async manager is done
1481 			 * with its work; hence the check for mi_manager_thread
1482 			 * being NULL.
1483 			 *
1484 			 * The async manager thread will cv_broadcast() on
1485 			 * mi_inact_req_cv when it's done, at which point we'll
1486 			 * wake up and exit.
1487 			 */
1488 			if (mi->mi_manager_thread == NULL)
1489 				goto die;
1490 			mi->mi_flags |= MI4_INACTIVE_IDLE;
1491 			mutex_exit(&mi->mi_lock);
1492 			cv_signal(&mi->mi_async_cv);
1493 			CALLB_CPR_SAFE_BEGIN(&cprinfo);
1494 			cv_wait(&mi->mi_inact_req_cv, &mi->mi_async_lock);
1495 			CALLB_CPR_SAFE_END(&cprinfo, &mi->mi_async_lock);
1496 			mutex_exit(&mi->mi_async_lock);
1497 		} else {
1498 			mutex_enter(&mi->mi_lock);
1499 			mi->mi_flags &= ~MI4_INACTIVE_IDLE;
1500 			mutex_exit(&mi->mi_lock);
1501 			mi->mi_async_reqs[NFS4_INACTIVE] = args->a_next;
1502 			mutex_exit(&mi->mi_async_lock);
1503 			nfs4_inactive_otw(args->a_vp, args->a_cred);
1504 			crfree(args->a_cred);
1505 			kmem_free(args, sizeof (*args));
1506 		}
1507 	}
1508 die:
1509 	mutex_exit(&mi->mi_lock);
1510 	mi->mi_inactive_thread = NULL;
1511 	cv_signal(&mi->mi_async_cv);
1512 
1513 	/*
1514 	 * There is no explicit call to mutex_exit(&mi->mi_async_lock) since
1515 	 * CALLB_CPR_EXIT is actually responsible for releasing 'mi_async_lock'.
1516 	 */
1517 	CALLB_CPR_EXIT(&cprinfo);
1518 
1519 	NFS4_DEBUG(nfs4_client_zone_debug, (CE_NOTE,
1520 	    "nfs4_inactive_thread exiting for vfs %p\n", (void *)vfsp));
1521 
1522 	MI4_RELE(mi);
1523 	zthread_exit();
1524 	/* NOTREACHED */
1525 }
1526 
1527 /*
1528  * nfs_async_stop:
1529  * Wait for all outstanding putpage operations and the inactive thread to
1530  * complete; nfs4_async_stop_sig() without interruptibility.
1531  */
1532 void
1533 nfs4_async_stop(struct vfs *vfsp)
1534 {
1535 	mntinfo4_t *mi = VFTOMI4(vfsp);
1536 
1537 	/*
1538 	 * Wait for all outstanding async operations to complete and for
1539 	 * worker threads to exit.
1540 	 */
1541 	mutex_enter(&mi->mi_async_lock);
1542 	mi->mi_max_threads = 0;
1543 	cv_broadcast(&mi->mi_async_work_cv);
1544 	while (mi->mi_threads != 0)
1545 		cv_wait(&mi->mi_async_cv, &mi->mi_async_lock);
1546 
1547 	/*
1548 	 * Wait for the inactive thread to finish doing what it's doing.  It
1549 	 * won't exit until the last reference to the vfs_t goes away.
1550 	 */
1551 	if (mi->mi_inactive_thread != NULL) {
1552 		mutex_enter(&mi->mi_lock);
1553 		while (!(mi->mi_flags & MI4_INACTIVE_IDLE) ||
1554 		    (mi->mi_async_reqs[NFS4_INACTIVE] != NULL)) {
1555 			mutex_exit(&mi->mi_lock);
1556 			cv_wait(&mi->mi_async_cv, &mi->mi_async_lock);
1557 			mutex_enter(&mi->mi_lock);
1558 		}
1559 		mutex_exit(&mi->mi_lock);
1560 	}
1561 	mutex_exit(&mi->mi_async_lock);
1562 }
1563 
1564 /*
1565  * nfs_async_stop_sig:
1566  * Wait for all outstanding putpage operations and the inactive thread to
1567  * complete. If a signal is delivered we will abort and return non-zero;
1568  * otherwise return 0. Since this routine is called from nfs4_unmount, we
1569  * need to make it interruptible.
1570  */
1571 int
1572 nfs4_async_stop_sig(struct vfs *vfsp)
1573 {
1574 	mntinfo4_t *mi = VFTOMI4(vfsp);
1575 	ushort_t omax;
1576 	bool_t intr = FALSE;
1577 
1578 	/*
1579 	 * Wait for all outstanding putpage operations to complete and for
1580 	 * worker threads to exit.
1581 	 */
1582 	mutex_enter(&mi->mi_async_lock);
1583 	omax = mi->mi_max_threads;
1584 	mi->mi_max_threads = 0;
1585 	cv_broadcast(&mi->mi_async_work_cv);
1586 	while (mi->mi_threads != 0) {
1587 		if (!cv_wait_sig(&mi->mi_async_cv, &mi->mi_async_lock)) {
1588 			intr = TRUE;
1589 			goto interrupted;
1590 		}
1591 	}
1592 
1593 	/*
1594 	 * Wait for the inactive thread to finish doing what it's doing.  It
1595 	 * won't exit until the a last reference to the vfs_t goes away.
1596 	 */
1597 	if (mi->mi_inactive_thread != NULL) {
1598 		mutex_enter(&mi->mi_lock);
1599 		while (!(mi->mi_flags & MI4_INACTIVE_IDLE) ||
1600 		    (mi->mi_async_reqs[NFS4_INACTIVE] != NULL)) {
1601 			mutex_exit(&mi->mi_lock);
1602 			if (!cv_wait_sig(&mi->mi_async_cv,
1603 			    &mi->mi_async_lock)) {
1604 				intr = TRUE;
1605 				goto interrupted;
1606 			}
1607 			mutex_enter(&mi->mi_lock);
1608 		}
1609 		mutex_exit(&mi->mi_lock);
1610 	}
1611 interrupted:
1612 	if (intr)
1613 		mi->mi_max_threads = omax;
1614 	mutex_exit(&mi->mi_async_lock);
1615 
1616 	return (intr);
1617 }
1618 
1619 int
1620 nfs4_async_putapage(vnode_t *vp, page_t *pp, u_offset_t off, size_t len,
1621     int flags, cred_t *cr, int (*putapage)(vnode_t *, page_t *,
1622     u_offset_t, size_t, int, cred_t *))
1623 {
1624 	rnode4_t *rp;
1625 	mntinfo4_t *mi;
1626 	struct nfs4_async_reqs *args;
1627 
1628 	ASSERT(flags & B_ASYNC);
1629 	ASSERT(vp->v_vfsp != NULL);
1630 
1631 	rp = VTOR4(vp);
1632 	ASSERT(rp->r_count > 0);
1633 
1634 	mi = VTOMI4(vp);
1635 
1636 	/*
1637 	 * If we can't allocate a request structure, do the putpage
1638 	 * operation synchronously in this thread's context.
1639 	 */
1640 	if ((args = kmem_alloc(sizeof (*args), KM_NOSLEEP)) == NULL)
1641 		goto noasync;
1642 
1643 	args->a_next = NULL;
1644 #ifdef DEBUG
1645 	args->a_queuer = curthread;
1646 #endif
1647 	VN_HOLD(vp);
1648 	args->a_vp = vp;
1649 	ASSERT(cr != NULL);
1650 	crhold(cr);
1651 	args->a_cred = cr;
1652 	args->a_io = NFS4_PUTAPAGE;
1653 	args->a_nfs4_putapage = putapage;
1654 	args->a_nfs4_pp = pp;
1655 	args->a_nfs4_off = off;
1656 	args->a_nfs4_len = (uint_t)len;
1657 	args->a_nfs4_flags = flags;
1658 
1659 	mutex_enter(&mi->mi_async_lock);
1660 
1661 	/*
1662 	 * If asyncio has been disabled, then make a synchronous request.
1663 	 * This check is done a second time in case async io was diabled
1664 	 * while this thread was blocked waiting for memory pressure to
1665 	 * reduce or for the queue to drain.
1666 	 */
1667 	if (mi->mi_max_threads == 0) {
1668 		mutex_exit(&mi->mi_async_lock);
1669 
1670 		VN_RELE(vp);
1671 		crfree(cr);
1672 		kmem_free(args, sizeof (*args));
1673 		goto noasync;
1674 	}
1675 
1676 	/*
1677 	 * Link request structure into the async list and
1678 	 * wakeup async thread to do the i/o.
1679 	 */
1680 	if (mi->mi_async_reqs[NFS4_PUTAPAGE] == NULL) {
1681 		mi->mi_async_reqs[NFS4_PUTAPAGE] = args;
1682 		mi->mi_async_tail[NFS4_PUTAPAGE] = args;
1683 	} else {
1684 		mi->mi_async_tail[NFS4_PUTAPAGE]->a_next = args;
1685 		mi->mi_async_tail[NFS4_PUTAPAGE] = args;
1686 	}
1687 
1688 	mutex_enter(&rp->r_statelock);
1689 	rp->r_count++;
1690 	rp->r_awcount++;
1691 	mutex_exit(&rp->r_statelock);
1692 
1693 	if (mi->mi_io_kstats) {
1694 		mutex_enter(&mi->mi_lock);
1695 		kstat_waitq_enter(KSTAT_IO_PTR(mi->mi_io_kstats));
1696 		mutex_exit(&mi->mi_lock);
1697 	}
1698 
1699 	mi->mi_async_req_count++;
1700 	ASSERT(mi->mi_async_req_count != 0);
1701 	cv_signal(&mi->mi_async_reqs_cv);
1702 	mutex_exit(&mi->mi_async_lock);
1703 	return (0);
1704 
1705 noasync:
1706 
1707 	if (curproc == proc_pageout || curproc == proc_fsflush ||
1708 	    nfs_zone() == mi->mi_zone) {
1709 		/*
1710 		 * If we get here in the context of the pageout/fsflush,
1711 		 * or we have run out of memory or we're attempting to
1712 		 * unmount we refuse to do a sync write, because this may
1713 		 * hang pageout/fsflush and the machine. In this case,
1714 		 * we just re-mark the page as dirty and punt on the page.
1715 		 *
1716 		 * Make sure B_FORCE isn't set.  We can re-mark the
1717 		 * pages as dirty and unlock the pages in one swoop by
1718 		 * passing in B_ERROR to pvn_write_done().  However,
1719 		 * we should make sure B_FORCE isn't set - we don't
1720 		 * want the page tossed before it gets written out.
1721 		 */
1722 		if (flags & B_FORCE)
1723 			flags &= ~(B_INVAL | B_FORCE);
1724 		pvn_write_done(pp, flags | B_ERROR);
1725 		return (0);
1726 	}
1727 
1728 	/*
1729 	 * We'll get here only if (nfs_zone() != mi->mi_zone)
1730 	 * which means that this was a cross-zone sync putpage.
1731 	 *
1732 	 * We pass in B_ERROR to pvn_write_done() to re-mark the pages
1733 	 * as dirty and unlock them.
1734 	 *
1735 	 * We don't want to clear B_FORCE here as the caller presumably
1736 	 * knows what they're doing if they set it.
1737 	 */
1738 	pvn_write_done(pp, flags | B_ERROR);
1739 	return (EPERM);
1740 }
1741 
1742 int
1743 nfs4_async_pageio(vnode_t *vp, page_t *pp, u_offset_t io_off, size_t io_len,
1744     int flags, cred_t *cr, int (*pageio)(vnode_t *, page_t *, u_offset_t,
1745     size_t, int, cred_t *))
1746 {
1747 	rnode4_t *rp;
1748 	mntinfo4_t *mi;
1749 	struct nfs4_async_reqs *args;
1750 
1751 	ASSERT(flags & B_ASYNC);
1752 	ASSERT(vp->v_vfsp != NULL);
1753 
1754 	rp = VTOR4(vp);
1755 	ASSERT(rp->r_count > 0);
1756 
1757 	mi = VTOMI4(vp);
1758 
1759 	/*
1760 	 * If we can't allocate a request structure, do the pageio
1761 	 * request synchronously in this thread's context.
1762 	 */
1763 	if ((args = kmem_alloc(sizeof (*args), KM_NOSLEEP)) == NULL)
1764 		goto noasync;
1765 
1766 	args->a_next = NULL;
1767 #ifdef DEBUG
1768 	args->a_queuer = curthread;
1769 #endif
1770 	VN_HOLD(vp);
1771 	args->a_vp = vp;
1772 	ASSERT(cr != NULL);
1773 	crhold(cr);
1774 	args->a_cred = cr;
1775 	args->a_io = NFS4_PAGEIO;
1776 	args->a_nfs4_pageio = pageio;
1777 	args->a_nfs4_pp = pp;
1778 	args->a_nfs4_off = io_off;
1779 	args->a_nfs4_len = (uint_t)io_len;
1780 	args->a_nfs4_flags = flags;
1781 
1782 	mutex_enter(&mi->mi_async_lock);
1783 
1784 	/*
1785 	 * If asyncio has been disabled, then make a synchronous request.
1786 	 * This check is done a second time in case async io was diabled
1787 	 * while this thread was blocked waiting for memory pressure to
1788 	 * reduce or for the queue to drain.
1789 	 */
1790 	if (mi->mi_max_threads == 0) {
1791 		mutex_exit(&mi->mi_async_lock);
1792 
1793 		VN_RELE(vp);
1794 		crfree(cr);
1795 		kmem_free(args, sizeof (*args));
1796 		goto noasync;
1797 	}
1798 
1799 	/*
1800 	 * Link request structure into the async list and
1801 	 * wakeup async thread to do the i/o.
1802 	 */
1803 	if (mi->mi_async_reqs[NFS4_PAGEIO] == NULL) {
1804 		mi->mi_async_reqs[NFS4_PAGEIO] = args;
1805 		mi->mi_async_tail[NFS4_PAGEIO] = args;
1806 	} else {
1807 		mi->mi_async_tail[NFS4_PAGEIO]->a_next = args;
1808 		mi->mi_async_tail[NFS4_PAGEIO] = args;
1809 	}
1810 
1811 	mutex_enter(&rp->r_statelock);
1812 	rp->r_count++;
1813 	rp->r_awcount++;
1814 	mutex_exit(&rp->r_statelock);
1815 
1816 	if (mi->mi_io_kstats) {
1817 		mutex_enter(&mi->mi_lock);
1818 		kstat_waitq_enter(KSTAT_IO_PTR(mi->mi_io_kstats));
1819 		mutex_exit(&mi->mi_lock);
1820 	}
1821 
1822 	mi->mi_async_req_count++;
1823 	ASSERT(mi->mi_async_req_count != 0);
1824 	cv_signal(&mi->mi_async_reqs_cv);
1825 	mutex_exit(&mi->mi_async_lock);
1826 	return (0);
1827 
1828 noasync:
1829 	/*
1830 	 * If we can't do it ASYNC, for reads we do nothing (but cleanup
1831 	 * the page list), for writes we do it synchronously, except for
1832 	 * proc_pageout/proc_fsflush as described below.
1833 	 */
1834 	if (flags & B_READ) {
1835 		pvn_read_done(pp, flags | B_ERROR);
1836 		return (0);
1837 	}
1838 
1839 	if (curproc == proc_pageout || curproc == proc_fsflush) {
1840 		/*
1841 		 * If we get here in the context of the pageout/fsflush,
1842 		 * we refuse to do a sync write, because this may hang
1843 		 * pageout/fsflush (and the machine). In this case, we just
1844 		 * re-mark the page as dirty and punt on the page.
1845 		 *
1846 		 * Make sure B_FORCE isn't set.  We can re-mark the
1847 		 * pages as dirty and unlock the pages in one swoop by
1848 		 * passing in B_ERROR to pvn_write_done().  However,
1849 		 * we should make sure B_FORCE isn't set - we don't
1850 		 * want the page tossed before it gets written out.
1851 		 */
1852 		if (flags & B_FORCE)
1853 			flags &= ~(B_INVAL | B_FORCE);
1854 		pvn_write_done(pp, flags | B_ERROR);
1855 		return (0);
1856 	}
1857 
1858 	if (nfs_zone() != mi->mi_zone) {
1859 		/*
1860 		 * So this was a cross-zone sync pageio.  We pass in B_ERROR
1861 		 * to pvn_write_done() to re-mark the pages as dirty and unlock
1862 		 * them.
1863 		 *
1864 		 * We don't want to clear B_FORCE here as the caller presumably
1865 		 * knows what they're doing if they set it.
1866 		 */
1867 		pvn_write_done(pp, flags | B_ERROR);
1868 		return (EPERM);
1869 	}
1870 	return ((*pageio)(vp, pp, io_off, io_len, flags, cr));
1871 }
1872 
1873 void
1874 nfs4_async_readdir(vnode_t *vp, rddir4_cache *rdc, cred_t *cr,
1875     int (*readdir)(vnode_t *, rddir4_cache *, cred_t *))
1876 {
1877 	rnode4_t *rp;
1878 	mntinfo4_t *mi;
1879 	struct nfs4_async_reqs *args;
1880 
1881 	rp = VTOR4(vp);
1882 	ASSERT(rp->r_freef == NULL);
1883 
1884 	mi = VTOMI4(vp);
1885 
1886 	/*
1887 	 * If we can't allocate a request structure, skip the readdir.
1888 	 */
1889 	if ((args = kmem_alloc(sizeof (*args), KM_NOSLEEP)) == NULL)
1890 		goto noasync;
1891 
1892 	args->a_next = NULL;
1893 #ifdef DEBUG
1894 	args->a_queuer = curthread;
1895 #endif
1896 	VN_HOLD(vp);
1897 	args->a_vp = vp;
1898 	ASSERT(cr != NULL);
1899 	crhold(cr);
1900 	args->a_cred = cr;
1901 	args->a_io = NFS4_READDIR;
1902 	args->a_nfs4_readdir = readdir;
1903 	args->a_nfs4_rdc = rdc;
1904 
1905 	mutex_enter(&mi->mi_async_lock);
1906 
1907 	/*
1908 	 * If asyncio has been disabled, then skip this request
1909 	 */
1910 	if (mi->mi_max_threads == 0) {
1911 		mutex_exit(&mi->mi_async_lock);
1912 
1913 		VN_RELE(vp);
1914 		crfree(cr);
1915 		kmem_free(args, sizeof (*args));
1916 		goto noasync;
1917 	}
1918 
1919 	/*
1920 	 * Link request structure into the async list and
1921 	 * wakeup async thread to do the i/o.
1922 	 */
1923 	if (mi->mi_async_reqs[NFS4_READDIR] == NULL) {
1924 		mi->mi_async_reqs[NFS4_READDIR] = args;
1925 		mi->mi_async_tail[NFS4_READDIR] = args;
1926 	} else {
1927 		mi->mi_async_tail[NFS4_READDIR]->a_next = args;
1928 		mi->mi_async_tail[NFS4_READDIR] = args;
1929 	}
1930 
1931 	mutex_enter(&rp->r_statelock);
1932 	rp->r_count++;
1933 	mutex_exit(&rp->r_statelock);
1934 
1935 	if (mi->mi_io_kstats) {
1936 		mutex_enter(&mi->mi_lock);
1937 		kstat_waitq_enter(KSTAT_IO_PTR(mi->mi_io_kstats));
1938 		mutex_exit(&mi->mi_lock);
1939 	}
1940 
1941 	mi->mi_async_req_count++;
1942 	ASSERT(mi->mi_async_req_count != 0);
1943 	cv_signal(&mi->mi_async_reqs_cv);
1944 	mutex_exit(&mi->mi_async_lock);
1945 	return;
1946 
1947 noasync:
1948 	mutex_enter(&rp->r_statelock);
1949 	rdc->entries = NULL;
1950 	/*
1951 	 * Indicate that no one is trying to fill this entry and
1952 	 * it still needs to be filled.
1953 	 */
1954 	rdc->flags &= ~RDDIR;
1955 	rdc->flags |= RDDIRREQ;
1956 	rddir4_cache_rele(rp, rdc);
1957 	mutex_exit(&rp->r_statelock);
1958 }
1959 
1960 void
1961 nfs4_async_commit(vnode_t *vp, page_t *plist, offset3 offset, count3 count,
1962     cred_t *cr, void (*commit)(vnode_t *, page_t *, offset3, count3,
1963     cred_t *))
1964 {
1965 	rnode4_t *rp;
1966 	mntinfo4_t *mi;
1967 	struct nfs4_async_reqs *args;
1968 	page_t *pp;
1969 
1970 	rp = VTOR4(vp);
1971 	mi = VTOMI4(vp);
1972 
1973 	/*
1974 	 * If we can't allocate a request structure, do the commit
1975 	 * operation synchronously in this thread's context.
1976 	 */
1977 	if ((args = kmem_alloc(sizeof (*args), KM_NOSLEEP)) == NULL)
1978 		goto noasync;
1979 
1980 	args->a_next = NULL;
1981 #ifdef DEBUG
1982 	args->a_queuer = curthread;
1983 #endif
1984 	VN_HOLD(vp);
1985 	args->a_vp = vp;
1986 	ASSERT(cr != NULL);
1987 	crhold(cr);
1988 	args->a_cred = cr;
1989 	args->a_io = NFS4_COMMIT;
1990 	args->a_nfs4_commit = commit;
1991 	args->a_nfs4_plist = plist;
1992 	args->a_nfs4_offset = offset;
1993 	args->a_nfs4_count = count;
1994 
1995 	mutex_enter(&mi->mi_async_lock);
1996 
1997 	/*
1998 	 * If asyncio has been disabled, then make a synchronous request.
1999 	 * This check is done a second time in case async io was diabled
2000 	 * while this thread was blocked waiting for memory pressure to
2001 	 * reduce or for the queue to drain.
2002 	 */
2003 	if (mi->mi_max_threads == 0) {
2004 		mutex_exit(&mi->mi_async_lock);
2005 
2006 		VN_RELE(vp);
2007 		crfree(cr);
2008 		kmem_free(args, sizeof (*args));
2009 		goto noasync;
2010 	}
2011 
2012 	/*
2013 	 * Link request structure into the async list and
2014 	 * wakeup async thread to do the i/o.
2015 	 */
2016 	if (mi->mi_async_reqs[NFS4_COMMIT] == NULL) {
2017 		mi->mi_async_reqs[NFS4_COMMIT] = args;
2018 		mi->mi_async_tail[NFS4_COMMIT] = args;
2019 	} else {
2020 		mi->mi_async_tail[NFS4_COMMIT]->a_next = args;
2021 		mi->mi_async_tail[NFS4_COMMIT] = args;
2022 	}
2023 
2024 	mutex_enter(&rp->r_statelock);
2025 	rp->r_count++;
2026 	mutex_exit(&rp->r_statelock);
2027 
2028 	if (mi->mi_io_kstats) {
2029 		mutex_enter(&mi->mi_lock);
2030 		kstat_waitq_enter(KSTAT_IO_PTR(mi->mi_io_kstats));
2031 		mutex_exit(&mi->mi_lock);
2032 	}
2033 
2034 	mi->mi_async_req_count++;
2035 	ASSERT(mi->mi_async_req_count != 0);
2036 	cv_signal(&mi->mi_async_reqs_cv);
2037 	mutex_exit(&mi->mi_async_lock);
2038 	return;
2039 
2040 noasync:
2041 	if (curproc == proc_pageout || curproc == proc_fsflush ||
2042 	    nfs_zone() != mi->mi_zone) {
2043 		while (plist != NULL) {
2044 			pp = plist;
2045 			page_sub(&plist, pp);
2046 			pp->p_fsdata = C_COMMIT;
2047 			page_unlock(pp);
2048 		}
2049 		return;
2050 	}
2051 	(*commit)(vp, plist, offset, count, cr);
2052 }
2053 
2054 /*
2055  * nfs4_async_inactive - hand off a VOP_INACTIVE call to a thread.  The
2056  * reference to the vnode is handed over to the thread; the caller should
2057  * no longer refer to the vnode.
2058  *
2059  * Unlike most of the async routines, this handoff is needed for
2060  * correctness reasons, not just performance.  So doing operations in the
2061  * context of the current thread is not an option.
2062  */
2063 void
2064 nfs4_async_inactive(vnode_t *vp, cred_t *cr)
2065 {
2066 	mntinfo4_t *mi;
2067 	struct nfs4_async_reqs *args;
2068 	boolean_t signal_inactive_thread = B_FALSE;
2069 
2070 	mi = VTOMI4(vp);
2071 
2072 	args = kmem_alloc(sizeof (*args), KM_SLEEP);
2073 	args->a_next = NULL;
2074 #ifdef DEBUG
2075 	args->a_queuer = curthread;
2076 #endif
2077 	args->a_vp = vp;
2078 	ASSERT(cr != NULL);
2079 	crhold(cr);
2080 	args->a_cred = cr;
2081 	args->a_io = NFS4_INACTIVE;
2082 
2083 	/*
2084 	 * Note that we don't check mi->mi_max_threads here, since we
2085 	 * *need* to get rid of this vnode regardless of whether someone
2086 	 * set nfs4_max_threads to zero in /etc/system.
2087 	 *
2088 	 * The manager thread knows about this and is willing to create
2089 	 * at least one thread to accommodate us.
2090 	 */
2091 	mutex_enter(&mi->mi_async_lock);
2092 	if (mi->mi_inactive_thread == NULL) {
2093 		rnode4_t *rp;
2094 		vnode_t *unldvp = NULL;
2095 		char *unlname;
2096 		cred_t *unlcred;
2097 
2098 		mutex_exit(&mi->mi_async_lock);
2099 		/*
2100 		 * We just need to free up the memory associated with the
2101 		 * vnode, which can be safely done from within the current
2102 		 * context.
2103 		 */
2104 		crfree(cr);	/* drop our reference */
2105 		kmem_free(args, sizeof (*args));
2106 		rp = VTOR4(vp);
2107 		mutex_enter(&rp->r_statelock);
2108 		if (rp->r_unldvp != NULL) {
2109 			unldvp = rp->r_unldvp;
2110 			rp->r_unldvp = NULL;
2111 			unlname = rp->r_unlname;
2112 			rp->r_unlname = NULL;
2113 			unlcred = rp->r_unlcred;
2114 			rp->r_unlcred = NULL;
2115 		}
2116 		mutex_exit(&rp->r_statelock);
2117 		/*
2118 		 * No need to explicitly throw away any cached pages.  The
2119 		 * eventual r4inactive() will attempt a synchronous
2120 		 * VOP_PUTPAGE() which will immediately fail since the request
2121 		 * is coming from the wrong zone, and then will proceed to call
2122 		 * nfs4_invalidate_pages() which will clean things up for us.
2123 		 *
2124 		 * Throw away the delegation here so rp4_addfree()'s attempt to
2125 		 * return any existing delegations becomes a no-op.
2126 		 */
2127 		if (rp->r_deleg_type != OPEN_DELEGATE_NONE) {
2128 			(void) nfs_rw_enter_sig(&mi->mi_recovlock, RW_READER,
2129 			    FALSE);
2130 			(void) nfs4delegreturn(rp, NFS4_DR_DISCARD);
2131 			nfs_rw_exit(&mi->mi_recovlock);
2132 		}
2133 		nfs4_clear_open_streams(rp);
2134 
2135 		rp4_addfree(rp, cr);
2136 		if (unldvp != NULL) {
2137 			kmem_free(unlname, MAXNAMELEN);
2138 			VN_RELE(unldvp);
2139 			crfree(unlcred);
2140 		}
2141 		return;
2142 	}
2143 
2144 	if (mi->mi_manager_thread == NULL) {
2145 		/*
2146 		 * We want to talk to the inactive thread.
2147 		 */
2148 		signal_inactive_thread = B_TRUE;
2149 	}
2150 
2151 	/*
2152 	 * Enqueue the vnode and wake up either the special thread (empty
2153 	 * list) or an async thread.
2154 	 */
2155 	if (mi->mi_async_reqs[NFS4_INACTIVE] == NULL) {
2156 		mi->mi_async_reqs[NFS4_INACTIVE] = args;
2157 		mi->mi_async_tail[NFS4_INACTIVE] = args;
2158 		signal_inactive_thread = B_TRUE;
2159 	} else {
2160 		mi->mi_async_tail[NFS4_INACTIVE]->a_next = args;
2161 		mi->mi_async_tail[NFS4_INACTIVE] = args;
2162 	}
2163 	if (signal_inactive_thread) {
2164 		cv_signal(&mi->mi_inact_req_cv);
2165 	} else  {
2166 		mi->mi_async_req_count++;
2167 		ASSERT(mi->mi_async_req_count != 0);
2168 		cv_signal(&mi->mi_async_reqs_cv);
2169 	}
2170 
2171 	mutex_exit(&mi->mi_async_lock);
2172 }
2173 
2174 int
2175 writerp4(rnode4_t *rp, caddr_t base, int tcount, struct uio *uio, int pgcreated)
2176 {
2177 	int pagecreate;
2178 	int n;
2179 	int saved_n;
2180 	caddr_t saved_base;
2181 	u_offset_t offset;
2182 	int error;
2183 	int sm_error;
2184 	vnode_t *vp = RTOV(rp);
2185 
2186 	ASSERT(tcount <= MAXBSIZE && tcount <= uio->uio_resid);
2187 	ASSERT(nfs_rw_lock_held(&rp->r_rwlock, RW_WRITER));
2188 	if (!vpm_enable) {
2189 		ASSERT(((uintptr_t)base & MAXBOFFSET) + tcount <= MAXBSIZE);
2190 	}
2191 
2192 	/*
2193 	 * Move bytes in at most PAGESIZE chunks. We must avoid
2194 	 * spanning pages in uiomove() because page faults may cause
2195 	 * the cache to be invalidated out from under us. The r_size is not
2196 	 * updated until after the uiomove. If we push the last page of a
2197 	 * file before r_size is correct, we will lose the data written past
2198 	 * the current (and invalid) r_size.
2199 	 */
2200 	do {
2201 		offset = uio->uio_loffset;
2202 		pagecreate = 0;
2203 
2204 		/*
2205 		 * n is the number of bytes required to satisfy the request
2206 		 *   or the number of bytes to fill out the page.
2207 		 */
2208 		n = (int)MIN((PAGESIZE - (offset & PAGEOFFSET)), tcount);
2209 
2210 		/*
2211 		 * Check to see if we can skip reading in the page
2212 		 * and just allocate the memory.  We can do this
2213 		 * if we are going to rewrite the entire mapping
2214 		 * or if we are going to write to or beyond the current
2215 		 * end of file from the beginning of the mapping.
2216 		 *
2217 		 * The read of r_size is now protected by r_statelock.
2218 		 */
2219 		mutex_enter(&rp->r_statelock);
2220 		/*
2221 		 * When pgcreated is nonzero the caller has already done
2222 		 * a segmap_getmapflt with forcefault 0 and S_WRITE. With
2223 		 * segkpm this means we already have at least one page
2224 		 * created and mapped at base.
2225 		 */
2226 		pagecreate = pgcreated ||
2227 		    ((offset & PAGEOFFSET) == 0 &&
2228 		    (n == PAGESIZE || ((offset + n) >= rp->r_size)));
2229 
2230 		mutex_exit(&rp->r_statelock);
2231 
2232 		if (!vpm_enable && pagecreate) {
2233 			/*
2234 			 * The last argument tells segmap_pagecreate() to
2235 			 * always lock the page, as opposed to sometimes
2236 			 * returning with the page locked. This way we avoid a
2237 			 * fault on the ensuing uiomove(), but also
2238 			 * more importantly (to fix bug 1094402) we can
2239 			 * call segmap_fault() to unlock the page in all
2240 			 * cases. An alternative would be to modify
2241 			 * segmap_pagecreate() to tell us when it is
2242 			 * locking a page, but that's a fairly major
2243 			 * interface change.
2244 			 */
2245 			if (pgcreated == 0)
2246 				(void) segmap_pagecreate(segkmap, base,
2247 				    (uint_t)n, 1);
2248 			saved_base = base;
2249 			saved_n = n;
2250 		}
2251 
2252 		/*
2253 		 * The number of bytes of data in the last page can not
2254 		 * be accurately be determined while page is being
2255 		 * uiomove'd to and the size of the file being updated.
2256 		 * Thus, inform threads which need to know accurately
2257 		 * how much data is in the last page of the file.  They
2258 		 * will not do the i/o immediately, but will arrange for
2259 		 * the i/o to happen later when this modify operation
2260 		 * will have finished.
2261 		 */
2262 		ASSERT(!(rp->r_flags & R4MODINPROGRESS));
2263 		mutex_enter(&rp->r_statelock);
2264 		rp->r_flags |= R4MODINPROGRESS;
2265 		rp->r_modaddr = (offset & MAXBMASK);
2266 		mutex_exit(&rp->r_statelock);
2267 
2268 		if (vpm_enable) {
2269 			/*
2270 			 * Copy data. If new pages are created, part of
2271 			 * the page that is not written will be initizliazed
2272 			 * with zeros.
2273 			 */
2274 			error = vpm_data_copy(vp, offset, n, uio,
2275 			    !pagecreate, NULL, 0, S_WRITE);
2276 		} else {
2277 			error = uiomove(base, n, UIO_WRITE, uio);
2278 		}
2279 
2280 		/*
2281 		 * r_size is the maximum number of
2282 		 * bytes known to be in the file.
2283 		 * Make sure it is at least as high as the
2284 		 * first unwritten byte pointed to by uio_loffset.
2285 		 */
2286 		mutex_enter(&rp->r_statelock);
2287 		if (rp->r_size < uio->uio_loffset)
2288 			rp->r_size = uio->uio_loffset;
2289 		rp->r_flags &= ~R4MODINPROGRESS;
2290 		rp->r_flags |= R4DIRTY;
2291 		mutex_exit(&rp->r_statelock);
2292 
2293 		/* n = # of bytes written */
2294 		n = (int)(uio->uio_loffset - offset);
2295 
2296 		if (!vpm_enable) {
2297 			base += n;
2298 		}
2299 
2300 		tcount -= n;
2301 		/*
2302 		 * If we created pages w/o initializing them completely,
2303 		 * we need to zero the part that wasn't set up.
2304 		 * This happens on a most EOF write cases and if
2305 		 * we had some sort of error during the uiomove.
2306 		 */
2307 		if (!vpm_enable && pagecreate) {
2308 			if ((uio->uio_loffset & PAGEOFFSET) || n == 0)
2309 				(void) kzero(base, PAGESIZE - n);
2310 
2311 			if (pgcreated) {
2312 				/*
2313 				 * Caller is responsible for this page,
2314 				 * it was not created in this loop.
2315 				 */
2316 				pgcreated = 0;
2317 			} else {
2318 				/*
2319 				 * For bug 1094402: segmap_pagecreate locks
2320 				 * page. Unlock it. This also unlocks the
2321 				 * pages allocated by page_create_va() in
2322 				 * segmap_pagecreate().
2323 				 */
2324 				sm_error = segmap_fault(kas.a_hat, segkmap,
2325 				    saved_base, saved_n,
2326 				    F_SOFTUNLOCK, S_WRITE);
2327 				if (error == 0)
2328 					error = sm_error;
2329 			}
2330 		}
2331 	} while (tcount > 0 && error == 0);
2332 
2333 	return (error);
2334 }
2335 
2336 int
2337 nfs4_putpages(vnode_t *vp, u_offset_t off, size_t len, int flags, cred_t *cr)
2338 {
2339 	rnode4_t *rp;
2340 	page_t *pp;
2341 	u_offset_t eoff;
2342 	u_offset_t io_off;
2343 	size_t io_len;
2344 	int error;
2345 	int rdirty;
2346 	int err;
2347 
2348 	rp = VTOR4(vp);
2349 	ASSERT(rp->r_count > 0);
2350 
2351 	if (!nfs4_has_pages(vp))
2352 		return (0);
2353 
2354 	ASSERT(vp->v_type != VCHR);
2355 
2356 	/*
2357 	 * If R4OUTOFSPACE is set, then all writes turn into B_INVAL
2358 	 * writes.  B_FORCE is set to force the VM system to actually
2359 	 * invalidate the pages, even if the i/o failed.  The pages
2360 	 * need to get invalidated because they can't be written out
2361 	 * because there isn't any space left on either the server's
2362 	 * file system or in the user's disk quota.  The B_FREE bit
2363 	 * is cleared to avoid confusion as to whether this is a
2364 	 * request to place the page on the freelist or to destroy
2365 	 * it.
2366 	 */
2367 	if ((rp->r_flags & R4OUTOFSPACE) ||
2368 	    (vp->v_vfsp->vfs_flag & VFS_UNMOUNTED))
2369 		flags = (flags & ~B_FREE) | B_INVAL | B_FORCE;
2370 
2371 	if (len == 0) {
2372 		/*
2373 		 * If doing a full file synchronous operation, then clear
2374 		 * the R4DIRTY bit.  If a page gets dirtied while the flush
2375 		 * is happening, then R4DIRTY will get set again.  The
2376 		 * R4DIRTY bit must get cleared before the flush so that
2377 		 * we don't lose this information.
2378 		 *
2379 		 * If there are no full file async write operations
2380 		 * pending and RDIRTY bit is set, clear it.
2381 		 */
2382 		if (off == (u_offset_t)0 &&
2383 		    !(flags & B_ASYNC) &&
2384 		    (rp->r_flags & R4DIRTY)) {
2385 			mutex_enter(&rp->r_statelock);
2386 			rdirty = (rp->r_flags & R4DIRTY);
2387 			rp->r_flags &= ~R4DIRTY;
2388 			mutex_exit(&rp->r_statelock);
2389 		} else if (flags & B_ASYNC && off == (u_offset_t)0) {
2390 			mutex_enter(&rp->r_statelock);
2391 			if (rp->r_flags & R4DIRTY && rp->r_awcount == 0) {
2392 				rdirty = (rp->r_flags & R4DIRTY);
2393 				rp->r_flags &= ~R4DIRTY;
2394 			}
2395 			mutex_exit(&rp->r_statelock);
2396 		} else
2397 			rdirty = 0;
2398 
2399 		/*
2400 		 * Search the entire vp list for pages >= off, and flush
2401 		 * the dirty pages.
2402 		 */
2403 		error = pvn_vplist_dirty(vp, off, rp->r_putapage,
2404 		    flags, cr);
2405 
2406 		/*
2407 		 * If an error occurred and the file was marked as dirty
2408 		 * before and we aren't forcibly invalidating pages, then
2409 		 * reset the R4DIRTY flag.
2410 		 */
2411 		if (error && rdirty &&
2412 		    (flags & (B_INVAL | B_FORCE)) != (B_INVAL | B_FORCE)) {
2413 			mutex_enter(&rp->r_statelock);
2414 			rp->r_flags |= R4DIRTY;
2415 			mutex_exit(&rp->r_statelock);
2416 		}
2417 	} else {
2418 		/*
2419 		 * Do a range from [off...off + len) looking for pages
2420 		 * to deal with.
2421 		 */
2422 		error = 0;
2423 		io_len = 0;
2424 		eoff = off + len;
2425 		mutex_enter(&rp->r_statelock);
2426 		for (io_off = off; io_off < eoff && io_off < rp->r_size;
2427 		    io_off += io_len) {
2428 			mutex_exit(&rp->r_statelock);
2429 			/*
2430 			 * If we are not invalidating, synchronously
2431 			 * freeing or writing pages use the routine
2432 			 * page_lookup_nowait() to prevent reclaiming
2433 			 * them from the free list.
2434 			 */
2435 			if ((flags & B_INVAL) || !(flags & B_ASYNC)) {
2436 				pp = page_lookup(vp, io_off,
2437 				    (flags & (B_INVAL | B_FREE)) ?
2438 				    SE_EXCL : SE_SHARED);
2439 			} else {
2440 				pp = page_lookup_nowait(vp, io_off,
2441 				    (flags & B_FREE) ? SE_EXCL : SE_SHARED);
2442 			}
2443 
2444 			if (pp == NULL || !pvn_getdirty(pp, flags))
2445 				io_len = PAGESIZE;
2446 			else {
2447 				err = (*rp->r_putapage)(vp, pp, &io_off,
2448 				    &io_len, flags, cr);
2449 				if (!error)
2450 					error = err;
2451 				/*
2452 				 * "io_off" and "io_len" are returned as
2453 				 * the range of pages we actually wrote.
2454 				 * This allows us to skip ahead more quickly
2455 				 * since several pages may've been dealt
2456 				 * with by this iteration of the loop.
2457 				 */
2458 			}
2459 			mutex_enter(&rp->r_statelock);
2460 		}
2461 		mutex_exit(&rp->r_statelock);
2462 	}
2463 
2464 	return (error);
2465 }
2466 
2467 void
2468 nfs4_invalidate_pages(vnode_t *vp, u_offset_t off, cred_t *cr)
2469 {
2470 	rnode4_t *rp;
2471 
2472 	rp = VTOR4(vp);
2473 	if (IS_SHADOW(vp, rp))
2474 		vp = RTOV4(rp);
2475 	mutex_enter(&rp->r_statelock);
2476 	while (rp->r_flags & R4TRUNCATE)
2477 		cv_wait(&rp->r_cv, &rp->r_statelock);
2478 	rp->r_flags |= R4TRUNCATE;
2479 	if (off == (u_offset_t)0) {
2480 		rp->r_flags &= ~R4DIRTY;
2481 		if (!(rp->r_flags & R4STALE))
2482 			rp->r_error = 0;
2483 	}
2484 	rp->r_truncaddr = off;
2485 	mutex_exit(&rp->r_statelock);
2486 	(void) pvn_vplist_dirty(vp, off, rp->r_putapage,
2487 	    B_INVAL | B_TRUNC, cr);
2488 	mutex_enter(&rp->r_statelock);
2489 	rp->r_flags &= ~R4TRUNCATE;
2490 	cv_broadcast(&rp->r_cv);
2491 	mutex_exit(&rp->r_statelock);
2492 }
2493 
2494 static int
2495 nfs4_mnt_kstat_update(kstat_t *ksp, int rw)
2496 {
2497 	mntinfo4_t *mi;
2498 	struct mntinfo_kstat *mik;
2499 	vfs_t *vfsp;
2500 
2501 	/* this is a read-only kstat. Bail out on a write */
2502 	if (rw == KSTAT_WRITE)
2503 		return (EACCES);
2504 
2505 
2506 	/*
2507 	 * We don't want to wait here as kstat_chain_lock could be held by
2508 	 * dounmount(). dounmount() takes vfs_reflock before the chain lock
2509 	 * and thus could lead to a deadlock.
2510 	 */
2511 	vfsp = (struct vfs *)ksp->ks_private;
2512 
2513 	mi = VFTOMI4(vfsp);
2514 	mik = (struct mntinfo_kstat *)ksp->ks_data;
2515 
2516 	(void) strcpy(mik->mik_proto, mi->mi_curr_serv->sv_knconf->knc_proto);
2517 
2518 	mik->mik_vers = (uint32_t)mi->mi_vers;
2519 	mik->mik_flags = mi->mi_flags;
2520 	/*
2521 	 * The sv_secdata holds the flavor the client specifies.
2522 	 * If the client uses default and a security negotiation
2523 	 * occurs, sv_currsec will point to the current flavor
2524 	 * selected from the server flavor list.
2525 	 * sv_currsec is NULL if no security negotiation takes place.
2526 	 */
2527 	mik->mik_secmod = mi->mi_curr_serv->sv_currsec ?
2528 	    mi->mi_curr_serv->sv_currsec->secmod :
2529 	    mi->mi_curr_serv->sv_secdata->secmod;
2530 	mik->mik_curread = (uint32_t)mi->mi_curread;
2531 	mik->mik_curwrite = (uint32_t)mi->mi_curwrite;
2532 	mik->mik_retrans = mi->mi_retrans;
2533 	mik->mik_timeo = mi->mi_timeo;
2534 	mik->mik_acregmin = HR2SEC(mi->mi_acregmin);
2535 	mik->mik_acregmax = HR2SEC(mi->mi_acregmax);
2536 	mik->mik_acdirmin = HR2SEC(mi->mi_acdirmin);
2537 	mik->mik_acdirmax = HR2SEC(mi->mi_acdirmax);
2538 	mik->mik_noresponse = (uint32_t)mi->mi_noresponse;
2539 	mik->mik_failover = (uint32_t)mi->mi_failover;
2540 	mik->mik_remap = (uint32_t)mi->mi_remap;
2541 
2542 	(void) strcpy(mik->mik_curserver, mi->mi_curr_serv->sv_hostname);
2543 
2544 	return (0);
2545 }
2546 
2547 void
2548 nfs4_mnt_kstat_init(struct vfs *vfsp)
2549 {
2550 	mntinfo4_t *mi = VFTOMI4(vfsp);
2551 
2552 	/*
2553 	 * PSARC 2001/697 Contract Private Interface
2554 	 * All nfs kstats are under SunMC contract
2555 	 * Please refer to the PSARC listed above and contact
2556 	 * SunMC before making any changes!
2557 	 *
2558 	 * Changes must be reviewed by Solaris File Sharing
2559 	 * Changes must be communicated to contract-2001-697@sun.com
2560 	 *
2561 	 */
2562 
2563 	mi->mi_io_kstats = kstat_create_zone("nfs", getminor(vfsp->vfs_dev),
2564 	    NULL, "nfs", KSTAT_TYPE_IO, 1, 0, mi->mi_zone->zone_id);
2565 	if (mi->mi_io_kstats) {
2566 		if (mi->mi_zone->zone_id != GLOBAL_ZONEID)
2567 			kstat_zone_add(mi->mi_io_kstats, GLOBAL_ZONEID);
2568 		mi->mi_io_kstats->ks_lock = &mi->mi_lock;
2569 		kstat_install(mi->mi_io_kstats);
2570 	}
2571 
2572 	if ((mi->mi_ro_kstats = kstat_create_zone("nfs",
2573 	    getminor(vfsp->vfs_dev), "mntinfo", "misc", KSTAT_TYPE_RAW,
2574 	    sizeof (struct mntinfo_kstat), 0, mi->mi_zone->zone_id)) != NULL) {
2575 		if (mi->mi_zone->zone_id != GLOBAL_ZONEID)
2576 			kstat_zone_add(mi->mi_ro_kstats, GLOBAL_ZONEID);
2577 		mi->mi_ro_kstats->ks_update = nfs4_mnt_kstat_update;
2578 		mi->mi_ro_kstats->ks_private = (void *)vfsp;
2579 		kstat_install(mi->mi_ro_kstats);
2580 	}
2581 
2582 	nfs4_mnt_recov_kstat_init(vfsp);
2583 }
2584 
2585 void
2586 nfs4_write_error(vnode_t *vp, int error, cred_t *cr)
2587 {
2588 	mntinfo4_t *mi;
2589 
2590 	mi = VTOMI4(vp);
2591 	/*
2592 	 * In case of forced unmount, do not print any messages
2593 	 * since it can flood the console with error messages.
2594 	 */
2595 	if (mi->mi_vfsp->vfs_flag & VFS_UNMOUNTED)
2596 		return;
2597 
2598 	/*
2599 	 * If the mount point is dead, not recoverable, do not
2600 	 * print error messages that can flood the console.
2601 	 */
2602 	if (mi->mi_flags & MI4_RECOV_FAIL)
2603 		return;
2604 
2605 	/*
2606 	 * No use in flooding the console with ENOSPC
2607 	 * messages from the same file system.
2608 	 */
2609 	if ((error != ENOSPC && error != EDQUOT) ||
2610 	    lbolt - mi->mi_printftime > 0) {
2611 		zoneid_t zoneid = mi->mi_zone->zone_id;
2612 
2613 #ifdef DEBUG
2614 		nfs_perror(error, "NFS%ld write error on host %s: %m.\n",
2615 		    mi->mi_vers, VTOR4(vp)->r_server->sv_hostname, NULL);
2616 #else
2617 		nfs_perror(error, "NFS write error on host %s: %m.\n",
2618 		    VTOR4(vp)->r_server->sv_hostname, NULL);
2619 #endif
2620 		if (error == ENOSPC || error == EDQUOT) {
2621 			zcmn_err(zoneid, CE_CONT,
2622 			    "^File: userid=%d, groupid=%d\n",
2623 			    crgetuid(cr), crgetgid(cr));
2624 			if (crgetuid(curthread->t_cred) != crgetuid(cr) ||
2625 			    crgetgid(curthread->t_cred) != crgetgid(cr)) {
2626 				zcmn_err(zoneid, CE_CONT,
2627 				    "^User: userid=%d, groupid=%d\n",
2628 				    crgetuid(curthread->t_cred),
2629 				    crgetgid(curthread->t_cred));
2630 			}
2631 			mi->mi_printftime = lbolt +
2632 			    nfs_write_error_interval * hz;
2633 		}
2634 		sfh4_printfhandle(VTOR4(vp)->r_fh);
2635 #ifdef DEBUG
2636 		if (error == EACCES) {
2637 			zcmn_err(zoneid, CE_CONT,
2638 			    "nfs_bio: cred is%s kcred\n",
2639 			    cr == kcred ? "" : " not");
2640 		}
2641 #endif
2642 	}
2643 }
2644 
2645 /*
2646  * Return non-zero if the given file can be safely memory mapped.  Locks
2647  * are safe if whole-file (length and offset are both zero).
2648  */
2649 
2650 #define	SAFE_LOCK(flk)	((flk).l_start == 0 && (flk).l_len == 0)
2651 
2652 static int
2653 nfs4_safemap(const vnode_t *vp)
2654 {
2655 	locklist_t	*llp, *next_llp;
2656 	int		safe = 1;
2657 	rnode4_t	*rp = VTOR4(vp);
2658 
2659 	ASSERT(nfs_rw_lock_held(&rp->r_lkserlock, RW_WRITER));
2660 
2661 	NFS4_DEBUG(nfs4_client_map_debug, (CE_NOTE, "nfs4_safemap: "
2662 	    "vp = %p", (void *)vp));
2663 
2664 	/*
2665 	 * Review all the locks for the vnode, both ones that have been
2666 	 * acquired and ones that are pending.  We assume that
2667 	 * flk_active_locks_for_vp() has merged any locks that can be
2668 	 * merged (so that if a process has the entire file locked, it is
2669 	 * represented as a single lock).
2670 	 *
2671 	 * Note that we can't bail out of the loop if we find a non-safe
2672 	 * lock, because we have to free all the elements in the llp list.
2673 	 * We might be able to speed up this code slightly by not looking
2674 	 * at each lock's l_start and l_len fields once we've found a
2675 	 * non-safe lock.
2676 	 */
2677 
2678 	llp = flk_active_locks_for_vp(vp);
2679 	while (llp) {
2680 		NFS4_DEBUG(nfs4_client_map_debug, (CE_NOTE,
2681 		    "nfs4_safemap: active lock (%" PRId64 ", %" PRId64 ")",
2682 		    llp->ll_flock.l_start, llp->ll_flock.l_len));
2683 		if (!SAFE_LOCK(llp->ll_flock)) {
2684 			safe = 0;
2685 			NFS4_DEBUG(nfs4_client_map_debug, (CE_NOTE,
2686 			    "nfs4_safemap: unsafe active lock (%" PRId64
2687 			    ", %" PRId64 ")", llp->ll_flock.l_start,
2688 			    llp->ll_flock.l_len));
2689 		}
2690 		next_llp = llp->ll_next;
2691 		VN_RELE(llp->ll_vp);
2692 		kmem_free(llp, sizeof (*llp));
2693 		llp = next_llp;
2694 	}
2695 
2696 	NFS4_DEBUG(nfs4_client_map_debug, (CE_NOTE, "nfs4_safemap: %s",
2697 	    safe ? "safe" : "unsafe"));
2698 	return (safe);
2699 }
2700 
2701 /*
2702  * Return whether there is a lost LOCK or LOCKU queued up for the given
2703  * file that would make an mmap request unsafe.  cf. nfs4_safemap().
2704  */
2705 
2706 bool_t
2707 nfs4_map_lost_lock_conflict(vnode_t *vp)
2708 {
2709 	bool_t conflict = FALSE;
2710 	nfs4_lost_rqst_t *lrp;
2711 	mntinfo4_t *mi = VTOMI4(vp);
2712 
2713 	mutex_enter(&mi->mi_lock);
2714 	for (lrp = list_head(&mi->mi_lost_state); lrp != NULL;
2715 	    lrp = list_next(&mi->mi_lost_state, lrp)) {
2716 		if (lrp->lr_op != OP_LOCK && lrp->lr_op != OP_LOCKU)
2717 			continue;
2718 		ASSERT(lrp->lr_vp != NULL);
2719 		if (!VOP_CMP(lrp->lr_vp, vp, NULL))
2720 			continue;	/* different file */
2721 		if (!SAFE_LOCK(*lrp->lr_flk)) {
2722 			conflict = TRUE;
2723 			break;
2724 		}
2725 	}
2726 
2727 	mutex_exit(&mi->mi_lock);
2728 	return (conflict);
2729 }
2730 
2731 /*
2732  * nfs_lockcompletion:
2733  *
2734  * If the vnode has a lock that makes it unsafe to cache the file, mark it
2735  * as non cachable (set VNOCACHE bit).
2736  */
2737 
2738 void
2739 nfs4_lockcompletion(vnode_t *vp, int cmd)
2740 {
2741 	rnode4_t *rp = VTOR4(vp);
2742 
2743 	ASSERT(nfs_rw_lock_held(&rp->r_lkserlock, RW_WRITER));
2744 	ASSERT(!IS_SHADOW(vp, rp));
2745 
2746 	if (cmd == F_SETLK || cmd == F_SETLKW) {
2747 
2748 		if (!nfs4_safemap(vp)) {
2749 			mutex_enter(&vp->v_lock);
2750 			vp->v_flag |= VNOCACHE;
2751 			mutex_exit(&vp->v_lock);
2752 		} else {
2753 			mutex_enter(&vp->v_lock);
2754 			vp->v_flag &= ~VNOCACHE;
2755 			mutex_exit(&vp->v_lock);
2756 		}
2757 	}
2758 	/*
2759 	 * The cached attributes of the file are stale after acquiring
2760 	 * the lock on the file. They were updated when the file was
2761 	 * opened, but not updated when the lock was acquired. Therefore the
2762 	 * cached attributes are invalidated after the lock is obtained.
2763 	 */
2764 	PURGE_ATTRCACHE4(vp);
2765 }
2766 
2767 /* ARGSUSED */
2768 static void *
2769 nfs4_mi_init(zoneid_t zoneid)
2770 {
2771 	struct mi4_globals *mig;
2772 
2773 	mig = kmem_alloc(sizeof (*mig), KM_SLEEP);
2774 	mutex_init(&mig->mig_lock, NULL, MUTEX_DEFAULT, NULL);
2775 	list_create(&mig->mig_list, sizeof (mntinfo4_t),
2776 	    offsetof(mntinfo4_t, mi_zone_node));
2777 	mig->mig_destructor_called = B_FALSE;
2778 	return (mig);
2779 }
2780 
2781 /*
2782  * Callback routine to tell all NFSv4 mounts in the zone to start tearing down
2783  * state and killing off threads.
2784  */
2785 /* ARGSUSED */
2786 static void
2787 nfs4_mi_shutdown(zoneid_t zoneid, void *data)
2788 {
2789 	struct mi4_globals *mig = data;
2790 	mntinfo4_t *mi;
2791 	nfs4_server_t *np;
2792 
2793 	NFS4_DEBUG(nfs4_client_zone_debug, (CE_NOTE,
2794 	    "nfs4_mi_shutdown zone %d\n", zoneid));
2795 	ASSERT(mig != NULL);
2796 	for (;;) {
2797 		mutex_enter(&mig->mig_lock);
2798 		mi = list_head(&mig->mig_list);
2799 		if (mi == NULL) {
2800 			mutex_exit(&mig->mig_lock);
2801 			break;
2802 		}
2803 
2804 		NFS4_DEBUG(nfs4_client_zone_debug, (CE_NOTE,
2805 		    "nfs4_mi_shutdown stopping vfs %p\n", (void *)mi->mi_vfsp));
2806 		/*
2807 		 * purge the DNLC for this filesystem
2808 		 */
2809 		(void) dnlc_purge_vfsp(mi->mi_vfsp, 0);
2810 		/*
2811 		 * Tell existing async worker threads to exit.
2812 		 */
2813 		mutex_enter(&mi->mi_async_lock);
2814 		mi->mi_max_threads = 0;
2815 		cv_broadcast(&mi->mi_async_work_cv);
2816 		/*
2817 		 * Set the appropriate flags, signal and wait for both the
2818 		 * async manager and the inactive thread to exit when they're
2819 		 * done with their current work.
2820 		 */
2821 		mutex_enter(&mi->mi_lock);
2822 		mi->mi_flags |= (MI4_ASYNC_MGR_STOP|MI4_DEAD);
2823 		mutex_exit(&mi->mi_lock);
2824 		mutex_exit(&mi->mi_async_lock);
2825 		if (mi->mi_manager_thread) {
2826 			nfs4_async_manager_stop(mi->mi_vfsp);
2827 		}
2828 		if (mi->mi_inactive_thread) {
2829 			mutex_enter(&mi->mi_async_lock);
2830 			cv_signal(&mi->mi_inact_req_cv);
2831 			/*
2832 			 * Wait for the inactive thread to exit.
2833 			 */
2834 			while (mi->mi_inactive_thread != NULL) {
2835 				cv_wait(&mi->mi_async_cv, &mi->mi_async_lock);
2836 			}
2837 			mutex_exit(&mi->mi_async_lock);
2838 		}
2839 		/*
2840 		 * Wait for the recovery thread to complete, that is, it will
2841 		 * signal when it is done using the "mi" structure and about
2842 		 * to exit
2843 		 */
2844 		mutex_enter(&mi->mi_lock);
2845 		while (mi->mi_in_recovery > 0)
2846 			cv_wait(&mi->mi_cv_in_recov, &mi->mi_lock);
2847 		mutex_exit(&mi->mi_lock);
2848 		/*
2849 		 * We're done when every mi has been done or the list is empty.
2850 		 * This one is done, remove it from the list.
2851 		 */
2852 		list_remove(&mig->mig_list, mi);
2853 		mutex_exit(&mig->mig_lock);
2854 		zone_rele(mi->mi_zone);
2855 		/*
2856 		 * Release hold on vfs and mi done to prevent race with zone
2857 		 * shutdown. This releases the hold in nfs4_mi_zonelist_add.
2858 		 */
2859 		VFS_RELE(mi->mi_vfsp);
2860 		MI4_RELE(mi);
2861 	}
2862 	/*
2863 	 * Tell each renew thread in the zone to exit
2864 	 */
2865 	mutex_enter(&nfs4_server_lst_lock);
2866 	for (np = nfs4_server_lst.forw; np != &nfs4_server_lst; np = np->forw) {
2867 		mutex_enter(&np->s_lock);
2868 		if (np->zoneid == zoneid) {
2869 			/*
2870 			 * We add another hold onto the nfs4_server_t
2871 			 * because this will make sure tha the nfs4_server_t
2872 			 * stays around until nfs4_callback_fini_zone destroys
2873 			 * the zone. This way, the renew thread can
2874 			 * unconditionally release its holds on the
2875 			 * nfs4_server_t.
2876 			 */
2877 			np->s_refcnt++;
2878 			nfs4_mark_srv_dead(np);
2879 		}
2880 		mutex_exit(&np->s_lock);
2881 	}
2882 	mutex_exit(&nfs4_server_lst_lock);
2883 }
2884 
2885 static void
2886 nfs4_mi_free_globals(struct mi4_globals *mig)
2887 {
2888 	list_destroy(&mig->mig_list);	/* makes sure the list is empty */
2889 	mutex_destroy(&mig->mig_lock);
2890 	kmem_free(mig, sizeof (*mig));
2891 }
2892 
2893 /* ARGSUSED */
2894 static void
2895 nfs4_mi_destroy(zoneid_t zoneid, void *data)
2896 {
2897 	struct mi4_globals *mig = data;
2898 
2899 	NFS4_DEBUG(nfs4_client_zone_debug, (CE_NOTE,
2900 	    "nfs4_mi_destroy zone %d\n", zoneid));
2901 	ASSERT(mig != NULL);
2902 	mutex_enter(&mig->mig_lock);
2903 	if (list_head(&mig->mig_list) != NULL) {
2904 		/* Still waiting for VFS_FREEVFS() */
2905 		mig->mig_destructor_called = B_TRUE;
2906 		mutex_exit(&mig->mig_lock);
2907 		return;
2908 	}
2909 	nfs4_mi_free_globals(mig);
2910 }
2911 
2912 /*
2913  * Add an NFS mount to the per-zone list of NFS mounts.
2914  */
2915 void
2916 nfs4_mi_zonelist_add(mntinfo4_t *mi)
2917 {
2918 	struct mi4_globals *mig;
2919 
2920 	mig = zone_getspecific(mi4_list_key, mi->mi_zone);
2921 	mutex_enter(&mig->mig_lock);
2922 	list_insert_head(&mig->mig_list, mi);
2923 	/*
2924 	 * hold added to eliminate race with zone shutdown -this will be
2925 	 * released in mi_shutdown
2926 	 */
2927 	MI4_HOLD(mi);
2928 	VFS_HOLD(mi->mi_vfsp);
2929 	mutex_exit(&mig->mig_lock);
2930 }
2931 
2932 /*
2933  * Remove an NFS mount from the per-zone list of NFS mounts.
2934  */
2935 int
2936 nfs4_mi_zonelist_remove(mntinfo4_t *mi)
2937 {
2938 	struct mi4_globals *mig;
2939 	int ret = 0;
2940 
2941 	mig = zone_getspecific(mi4_list_key, mi->mi_zone);
2942 	mutex_enter(&mig->mig_lock);
2943 	mutex_enter(&mi->mi_lock);
2944 	/* if this mi is marked dead, then the zone already released it */
2945 	if (!(mi->mi_flags & MI4_DEAD)) {
2946 		list_remove(&mig->mig_list, mi);
2947 
2948 		/* release the holds put on in zonelist_add(). */
2949 		VFS_RELE(mi->mi_vfsp);
2950 		MI4_RELE(mi);
2951 		ret = 1;
2952 	}
2953 	mutex_exit(&mi->mi_lock);
2954 
2955 	/*
2956 	 * We can be called asynchronously by VFS_FREEVFS() after the zone
2957 	 * shutdown/destroy callbacks have executed; if so, clean up the zone's
2958 	 * mi globals.
2959 	 */
2960 	if (list_head(&mig->mig_list) == NULL &&
2961 	    mig->mig_destructor_called == B_TRUE) {
2962 		nfs4_mi_free_globals(mig);
2963 		return (ret);
2964 	}
2965 	mutex_exit(&mig->mig_lock);
2966 	return (ret);
2967 }
2968 
2969 void
2970 nfs_free_mi4(mntinfo4_t *mi)
2971 {
2972 	nfs4_open_owner_t	*foop;
2973 	nfs4_oo_hash_bucket_t   *bucketp;
2974 	nfs4_debug_msg_t	*msgp;
2975 	int i;
2976 	servinfo4_t 		*svp;
2977 
2978 	mutex_enter(&mi->mi_lock);
2979 	ASSERT(mi->mi_recovthread == NULL);
2980 	ASSERT(mi->mi_flags & MI4_ASYNC_MGR_STOP);
2981 	mutex_exit(&mi->mi_lock);
2982 	mutex_enter(&mi->mi_async_lock);
2983 	ASSERT(mi->mi_threads == 0);
2984 	ASSERT(mi->mi_manager_thread == NULL);
2985 	mutex_exit(&mi->mi_async_lock);
2986 	svp = mi->mi_servers;
2987 	sv4_free(svp);
2988 	if (mi->mi_io_kstats) {
2989 		kstat_delete(mi->mi_io_kstats);
2990 		mi->mi_io_kstats = NULL;
2991 	}
2992 	if (mi->mi_ro_kstats) {
2993 		kstat_delete(mi->mi_ro_kstats);
2994 		mi->mi_ro_kstats = NULL;
2995 	}
2996 	if (mi->mi_recov_ksp) {
2997 		kstat_delete(mi->mi_recov_ksp);
2998 		mi->mi_recov_ksp = NULL;
2999 	}
3000 	mutex_enter(&mi->mi_msg_list_lock);
3001 	while (msgp = list_head(&mi->mi_msg_list)) {
3002 		list_remove(&mi->mi_msg_list, msgp);
3003 		nfs4_free_msg(msgp);
3004 	}
3005 	mutex_exit(&mi->mi_msg_list_lock);
3006 	list_destroy(&mi->mi_msg_list);
3007 	if (mi->mi_rootfh != NULL)
3008 		sfh4_rele(&mi->mi_rootfh);
3009 	if (mi->mi_srvparentfh != NULL)
3010 		sfh4_rele(&mi->mi_srvparentfh);
3011 	mutex_destroy(&mi->mi_lock);
3012 	mutex_destroy(&mi->mi_async_lock);
3013 	mutex_destroy(&mi->mi_msg_list_lock);
3014 	nfs_rw_destroy(&mi->mi_recovlock);
3015 	nfs_rw_destroy(&mi->mi_rename_lock);
3016 	nfs_rw_destroy(&mi->mi_fh_lock);
3017 	cv_destroy(&mi->mi_failover_cv);
3018 	cv_destroy(&mi->mi_async_reqs_cv);
3019 	cv_destroy(&mi->mi_async_work_cv);
3020 	cv_destroy(&mi->mi_async_cv);
3021 	cv_destroy(&mi->mi_inact_req_cv);
3022 	/*
3023 	 * Destroy the oo hash lists and mutexes for the cred hash table.
3024 	 */
3025 	for (i = 0; i < NFS4_NUM_OO_BUCKETS; i++) {
3026 		bucketp = &(mi->mi_oo_list[i]);
3027 		/* Destroy any remaining open owners on the list */
3028 		foop = list_head(&bucketp->b_oo_hash_list);
3029 		while (foop != NULL) {
3030 			list_remove(&bucketp->b_oo_hash_list, foop);
3031 			nfs4_destroy_open_owner(foop);
3032 			foop = list_head(&bucketp->b_oo_hash_list);
3033 		}
3034 		list_destroy(&bucketp->b_oo_hash_list);
3035 		mutex_destroy(&bucketp->b_lock);
3036 	}
3037 	/*
3038 	 * Empty and destroy the freed open owner list.
3039 	 */
3040 	foop = list_head(&mi->mi_foo_list);
3041 	while (foop != NULL) {
3042 		list_remove(&mi->mi_foo_list, foop);
3043 		nfs4_destroy_open_owner(foop);
3044 		foop = list_head(&mi->mi_foo_list);
3045 	}
3046 	list_destroy(&mi->mi_foo_list);
3047 	list_destroy(&mi->mi_bseqid_list);
3048 	list_destroy(&mi->mi_lost_state);
3049 	avl_destroy(&mi->mi_filehandles);
3050 	fn_rele(&mi->mi_fname);
3051 	kmem_free(mi, sizeof (*mi));
3052 }
3053 void
3054 mi_hold(mntinfo4_t *mi)
3055 {
3056 	atomic_add_32(&mi->mi_count, 1);
3057 	ASSERT(mi->mi_count != 0);
3058 }
3059 
3060 void
3061 mi_rele(mntinfo4_t *mi)
3062 {
3063 	ASSERT(mi->mi_count != 0);
3064 	if (atomic_add_32_nv(&mi->mi_count, -1) == 0) {
3065 		nfs_free_mi4(mi);
3066 	}
3067 }
3068 
3069 vnode_t    nfs4_xattr_notsupp_vnode;
3070 
3071 void
3072 nfs4_clnt_init(void)
3073 {
3074 	nfs4_vnops_init();
3075 	(void) nfs4_rnode_init();
3076 	(void) nfs4_shadow_init();
3077 	(void) nfs4_acache_init();
3078 	(void) nfs4_subr_init();
3079 	nfs4_acl_init();
3080 	nfs_idmap_init();
3081 	nfs4_callback_init();
3082 	nfs4_secinfo_init();
3083 #ifdef	DEBUG
3084 	tsd_create(&nfs4_tsd_key, NULL);
3085 #endif
3086 
3087 	/*
3088 	 * Add a CPR callback so that we can update client
3089 	 * lease after a suspend and resume.
3090 	 */
3091 	cid = callb_add(nfs4_client_cpr_callb, 0, CB_CL_CPR_RPC, "nfs4");
3092 
3093 	zone_key_create(&mi4_list_key, nfs4_mi_init, nfs4_mi_shutdown,
3094 	    nfs4_mi_destroy);
3095 
3096 	/*
3097 	 * Initialise the reference count of the notsupp xattr cache vnode to 1
3098 	 * so that it never goes away (VOP_INACTIVE isn't called on it).
3099 	 */
3100 	nfs4_xattr_notsupp_vnode.v_count = 1;
3101 }
3102 
3103 void
3104 nfs4_clnt_fini(void)
3105 {
3106 	(void) zone_key_delete(mi4_list_key);
3107 	nfs4_vnops_fini();
3108 	(void) nfs4_rnode_fini();
3109 	(void) nfs4_shadow_fini();
3110 	(void) nfs4_acache_fini();
3111 	(void) nfs4_subr_fini();
3112 	nfs_idmap_fini();
3113 	nfs4_callback_fini();
3114 	nfs4_secinfo_fini();
3115 #ifdef	DEBUG
3116 	tsd_destroy(&nfs4_tsd_key);
3117 #endif
3118 	if (cid)
3119 		(void) callb_delete(cid);
3120 }
3121 
3122 /*ARGSUSED*/
3123 static boolean_t
3124 nfs4_client_cpr_callb(void *arg, int code)
3125 {
3126 	/*
3127 	 * We get called for Suspend and Resume events.
3128 	 * For the suspend case we simply don't care!
3129 	 */
3130 	if (code == CB_CODE_CPR_CHKPT) {
3131 		return (B_TRUE);
3132 	}
3133 
3134 	/*
3135 	 * When we get to here we are in the process of
3136 	 * resuming the system from a previous suspend.
3137 	 */
3138 	nfs4_client_resumed = gethrestime_sec();
3139 	return (B_TRUE);
3140 }
3141 
3142 void
3143 nfs4_renew_lease_thread(nfs4_server_t *sp)
3144 {
3145 	int	error = 0;
3146 	time_t	tmp_last_renewal_time, tmp_time, tmp_now_time, kip_secs;
3147 	clock_t	tick_delay = 0;
3148 	clock_t time_left = 0;
3149 	callb_cpr_t cpr_info;
3150 	kmutex_t cpr_lock;
3151 
3152 	NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE,
3153 	    "nfs4_renew_lease_thread: acting on sp 0x%p", (void*)sp));
3154 	mutex_init(&cpr_lock, NULL, MUTEX_DEFAULT, NULL);
3155 	CALLB_CPR_INIT(&cpr_info, &cpr_lock, callb_generic_cpr, "nfsv4Lease");
3156 
3157 	mutex_enter(&sp->s_lock);
3158 	/* sp->s_lease_time is set via a GETATTR */
3159 	sp->last_renewal_time = gethrestime_sec();
3160 	sp->lease_valid = NFS4_LEASE_UNINITIALIZED;
3161 	ASSERT(sp->s_refcnt >= 1);
3162 
3163 	for (;;) {
3164 		if (!sp->state_ref_count ||
3165 		    sp->lease_valid != NFS4_LEASE_VALID) {
3166 
3167 			kip_secs = MAX((sp->s_lease_time >> 1) -
3168 			    (3 * sp->propagation_delay.tv_sec), 1);
3169 
3170 			tick_delay = SEC_TO_TICK(kip_secs);
3171 
3172 			NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE,
3173 			    "nfs4_renew_lease_thread: no renew : thread "
3174 			    "wait %ld secs", kip_secs));
3175 
3176 			NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE,
3177 			    "nfs4_renew_lease_thread: no renew : "
3178 			    "state_ref_count %d, lease_valid %d",
3179 			    sp->state_ref_count, sp->lease_valid));
3180 
3181 			mutex_enter(&cpr_lock);
3182 			CALLB_CPR_SAFE_BEGIN(&cpr_info);
3183 			mutex_exit(&cpr_lock);
3184 			time_left = cv_timedwait(&sp->cv_thread_exit,
3185 			    &sp->s_lock, tick_delay + lbolt);
3186 			mutex_enter(&cpr_lock);
3187 			CALLB_CPR_SAFE_END(&cpr_info, &cpr_lock);
3188 			mutex_exit(&cpr_lock);
3189 
3190 			NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE,
3191 			    "nfs4_renew_lease_thread: no renew: "
3192 			    "time left %ld", time_left));
3193 
3194 			if (sp->s_thread_exit == NFS4_THREAD_EXIT)
3195 				goto die;
3196 			continue;
3197 		}
3198 
3199 		tmp_last_renewal_time = sp->last_renewal_time;
3200 
3201 		tmp_time = gethrestime_sec() - sp->last_renewal_time +
3202 		    (3 * sp->propagation_delay.tv_sec);
3203 
3204 		NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE,
3205 		    "nfs4_renew_lease_thread: tmp_time %ld, "
3206 		    "sp->last_renewal_time %ld", tmp_time,
3207 		    sp->last_renewal_time));
3208 
3209 		kip_secs = MAX((sp->s_lease_time >> 1) - tmp_time, 1);
3210 
3211 		tick_delay = SEC_TO_TICK(kip_secs);
3212 
3213 		NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE,
3214 		    "nfs4_renew_lease_thread: valid lease: sleep for %ld "
3215 		    "secs", kip_secs));
3216 
3217 		mutex_enter(&cpr_lock);
3218 		CALLB_CPR_SAFE_BEGIN(&cpr_info);
3219 		mutex_exit(&cpr_lock);
3220 		time_left = cv_timedwait(&sp->cv_thread_exit, &sp->s_lock,
3221 		    tick_delay + lbolt);
3222 		mutex_enter(&cpr_lock);
3223 		CALLB_CPR_SAFE_END(&cpr_info, &cpr_lock);
3224 		mutex_exit(&cpr_lock);
3225 
3226 		NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE,
3227 		    "nfs4_renew_lease_thread: valid lease: time left %ld :"
3228 		    "sp last_renewal_time %ld, nfs4_client_resumed %ld, "
3229 		    "tmp_last_renewal_time %ld", time_left,
3230 		    sp->last_renewal_time, nfs4_client_resumed,
3231 		    tmp_last_renewal_time));
3232 
3233 		if (sp->s_thread_exit == NFS4_THREAD_EXIT)
3234 			goto die;
3235 
3236 		if (tmp_last_renewal_time == sp->last_renewal_time ||
3237 		    (nfs4_client_resumed != 0 &&
3238 		    nfs4_client_resumed > sp->last_renewal_time)) {
3239 			/*
3240 			 * Issue RENEW op since we haven't renewed the lease
3241 			 * since we slept.
3242 			 */
3243 			tmp_now_time = gethrestime_sec();
3244 			error = nfs4renew(sp);
3245 			/*
3246 			 * Need to re-acquire sp's lock, nfs4renew()
3247 			 * relinqueshes it.
3248 			 */
3249 			mutex_enter(&sp->s_lock);
3250 
3251 			/*
3252 			 * See if someone changed s_thread_exit while we gave
3253 			 * up s_lock.
3254 			 */
3255 			if (sp->s_thread_exit == NFS4_THREAD_EXIT)
3256 				goto die;
3257 
3258 			if (!error) {
3259 				/*
3260 				 * check to see if we implicitly renewed while
3261 				 * we waited for a reply for our RENEW call.
3262 				 */
3263 				if (tmp_last_renewal_time ==
3264 				    sp->last_renewal_time) {
3265 					/* no implicit renew came */
3266 					sp->last_renewal_time = tmp_now_time;
3267 				} else {
3268 					NFS4_DEBUG(nfs4_client_lease_debug,
3269 					    (CE_NOTE, "renew_thread: did "
3270 					    "implicit renewal before reply "
3271 					    "from server for RENEW"));
3272 				}
3273 			} else {
3274 				/* figure out error */
3275 				NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE,
3276 				    "renew_thread: nfs4renew returned error"
3277 				    " %d", error));
3278 			}
3279 
3280 		}
3281 	}
3282 
3283 die:
3284 	NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE,
3285 	    "nfs4_renew_lease_thread: thread exiting"));
3286 
3287 	while (sp->s_otw_call_count != 0) {
3288 		NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE,
3289 		    "nfs4_renew_lease_thread: waiting for outstanding "
3290 		    "otw calls to finish for sp 0x%p, current "
3291 		    "s_otw_call_count %d", (void *)sp,
3292 		    sp->s_otw_call_count));
3293 		mutex_enter(&cpr_lock);
3294 		CALLB_CPR_SAFE_BEGIN(&cpr_info);
3295 		mutex_exit(&cpr_lock);
3296 		cv_wait(&sp->s_cv_otw_count, &sp->s_lock);
3297 		mutex_enter(&cpr_lock);
3298 		CALLB_CPR_SAFE_END(&cpr_info, &cpr_lock);
3299 		mutex_exit(&cpr_lock);
3300 	}
3301 	mutex_exit(&sp->s_lock);
3302 
3303 	nfs4_server_rele(sp);		/* free the thread's reference */
3304 	nfs4_server_rele(sp);		/* free the list's reference */
3305 	sp = NULL;
3306 
3307 done:
3308 	mutex_enter(&cpr_lock);
3309 	CALLB_CPR_EXIT(&cpr_info);	/* drops cpr_lock */
3310 	mutex_destroy(&cpr_lock);
3311 
3312 	NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE,
3313 	    "nfs4_renew_lease_thread: renew thread exit officially"));
3314 
3315 	zthread_exit();
3316 	/* NOT REACHED */
3317 }
3318 
3319 /*
3320  * Send out a RENEW op to the server.
3321  * Assumes sp is locked down.
3322  */
3323 static int
3324 nfs4renew(nfs4_server_t *sp)
3325 {
3326 	COMPOUND4args_clnt args;
3327 	COMPOUND4res_clnt res;
3328 	nfs_argop4 argop[1];
3329 	int doqueue = 1;
3330 	int rpc_error;
3331 	cred_t *cr;
3332 	mntinfo4_t *mi;
3333 	timespec_t prop_time, after_time;
3334 	int needrecov = FALSE;
3335 	nfs4_recov_state_t recov_state;
3336 	nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };
3337 
3338 	NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE, "nfs4renew"));
3339 
3340 	recov_state.rs_flags = 0;
3341 	recov_state.rs_num_retry_despite_err = 0;
3342 
3343 recov_retry:
3344 	mi = sp->mntinfo4_list;
3345 	VFS_HOLD(mi->mi_vfsp);
3346 	mutex_exit(&sp->s_lock);
3347 	ASSERT(mi != NULL);
3348 
3349 	e.error = nfs4_start_op(mi, NULL, NULL, &recov_state);
3350 	if (e.error) {
3351 		VFS_RELE(mi->mi_vfsp);
3352 		return (e.error);
3353 	}
3354 
3355 	/* Check to see if we're dealing with a marked-dead sp */
3356 	mutex_enter(&sp->s_lock);
3357 	if (sp->s_thread_exit == NFS4_THREAD_EXIT) {
3358 		mutex_exit(&sp->s_lock);
3359 		nfs4_end_op(mi, NULL, NULL, &recov_state, needrecov);
3360 		VFS_RELE(mi->mi_vfsp);
3361 		return (0);
3362 	}
3363 
3364 	/* Make sure mi hasn't changed on us */
3365 	if (mi != sp->mntinfo4_list) {
3366 		/* Must drop sp's lock to avoid a recursive mutex enter */
3367 		mutex_exit(&sp->s_lock);
3368 		nfs4_end_op(mi, NULL, NULL, &recov_state, needrecov);
3369 		VFS_RELE(mi->mi_vfsp);
3370 		mutex_enter(&sp->s_lock);
3371 		goto recov_retry;
3372 	}
3373 	mutex_exit(&sp->s_lock);
3374 
3375 	args.ctag = TAG_RENEW;
3376 
3377 	args.array_len = 1;
3378 	args.array = argop;
3379 
3380 	argop[0].argop = OP_RENEW;
3381 
3382 	mutex_enter(&sp->s_lock);
3383 	argop[0].nfs_argop4_u.oprenew.clientid = sp->clientid;
3384 	cr = sp->s_cred;
3385 	crhold(cr);
3386 	mutex_exit(&sp->s_lock);
3387 
3388 	ASSERT(cr != NULL);
3389 
3390 	/* used to figure out RTT for sp */
3391 	gethrestime(&prop_time);
3392 
3393 	NFS4_DEBUG(nfs4_client_call_debug, (CE_NOTE,
3394 	    "nfs4renew: %s call, sp 0x%p", needrecov ? "recov" : "first",
3395 	    (void*)sp));
3396 	NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE, "before: %ld s %ld ns ",
3397 	    prop_time.tv_sec, prop_time.tv_nsec));
3398 
3399 	DTRACE_PROBE2(nfs4__renew__start, nfs4_server_t *, sp,
3400 	    mntinfo4_t *, mi);
3401 
3402 	rfs4call(mi, &args, &res, cr, &doqueue, 0, &e);
3403 	crfree(cr);
3404 
3405 	DTRACE_PROBE2(nfs4__renew__end, nfs4_server_t *, sp,
3406 	    mntinfo4_t *, mi);
3407 
3408 	gethrestime(&after_time);
3409 
3410 	mutex_enter(&sp->s_lock);
3411 	sp->propagation_delay.tv_sec =
3412 	    MAX(1, after_time.tv_sec - prop_time.tv_sec);
3413 	mutex_exit(&sp->s_lock);
3414 
3415 	NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE, "after : %ld s %ld ns ",
3416 	    after_time.tv_sec, after_time.tv_nsec));
3417 
3418 	if (e.error == 0 && res.status == NFS4ERR_CB_PATH_DOWN) {
3419 		(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
3420 		nfs4_delegreturn_all(sp);
3421 		nfs4_end_op(mi, NULL, NULL, &recov_state, needrecov);
3422 		VFS_RELE(mi->mi_vfsp);
3423 		/*
3424 		 * If the server returns CB_PATH_DOWN, it has renewed
3425 		 * the lease and informed us that the callback path is
3426 		 * down.  Since the lease is renewed, just return 0 and
3427 		 * let the renew thread proceed as normal.
3428 		 */
3429 		return (0);
3430 	}
3431 
3432 	needrecov = nfs4_needs_recovery(&e, FALSE, mi->mi_vfsp);
3433 	if (!needrecov && e.error) {
3434 		nfs4_end_op(mi, NULL, NULL, &recov_state, needrecov);
3435 		VFS_RELE(mi->mi_vfsp);
3436 		return (e.error);
3437 	}
3438 
3439 	rpc_error = e.error;
3440 
3441 	if (needrecov) {
3442 		NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
3443 		    "nfs4renew: initiating recovery\n"));
3444 
3445 		if (nfs4_start_recovery(&e, mi, NULL, NULL, NULL, NULL,
3446 		    OP_RENEW, NULL) == FALSE) {
3447 			nfs4_end_op(mi, NULL, NULL, &recov_state, needrecov);
3448 			VFS_RELE(mi->mi_vfsp);
3449 			if (!e.error)
3450 				(void) xdr_free(xdr_COMPOUND4res_clnt,
3451 				    (caddr_t)&res);
3452 			mutex_enter(&sp->s_lock);
3453 			goto recov_retry;
3454 		}
3455 		/* fall through for res.status case */
3456 	}
3457 
3458 	if (res.status) {
3459 		if (res.status == NFS4ERR_LEASE_MOVED) {
3460 			/*EMPTY*/
3461 			/*
3462 			 * XXX need to try every mntinfo4 in sp->mntinfo4_list
3463 			 * to renew the lease on that server
3464 			 */
3465 		}
3466 		e.error = geterrno4(res.status);
3467 	}
3468 
3469 	if (!rpc_error)
3470 		(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
3471 
3472 	nfs4_end_op(mi, NULL, NULL, &recov_state, needrecov);
3473 
3474 	VFS_RELE(mi->mi_vfsp);
3475 
3476 	return (e.error);
3477 }
3478 
3479 void
3480 nfs4_inc_state_ref_count(mntinfo4_t *mi)
3481 {
3482 	nfs4_server_t	*sp;
3483 
3484 	/* this locks down sp if it is found */
3485 	sp = find_nfs4_server(mi);
3486 
3487 	if (sp != NULL) {
3488 		nfs4_inc_state_ref_count_nolock(sp, mi);
3489 		mutex_exit(&sp->s_lock);
3490 		nfs4_server_rele(sp);
3491 	}
3492 }
3493 
3494 /*
3495  * Bump the number of OPEN files (ie: those with state) so we know if this
3496  * nfs4_server has any state to maintain a lease for or not.
3497  *
3498  * Also, marks the nfs4_server's lease valid if it hasn't been done so already.
3499  */
3500 void
3501 nfs4_inc_state_ref_count_nolock(nfs4_server_t *sp, mntinfo4_t *mi)
3502 {
3503 	ASSERT(mutex_owned(&sp->s_lock));
3504 
3505 	sp->state_ref_count++;
3506 	NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE,
3507 	    "nfs4_inc_state_ref_count: state_ref_count now %d",
3508 	    sp->state_ref_count));
3509 
3510 	if (sp->lease_valid == NFS4_LEASE_UNINITIALIZED)
3511 		sp->lease_valid = NFS4_LEASE_VALID;
3512 
3513 	/*
3514 	 * If this call caused the lease to be marked valid and/or
3515 	 * took the state_ref_count from 0 to 1, then start the time
3516 	 * on lease renewal.
3517 	 */
3518 	if (sp->lease_valid == NFS4_LEASE_VALID && sp->state_ref_count == 1)
3519 		sp->last_renewal_time = gethrestime_sec();
3520 
3521 	/* update the number of open files for mi */
3522 	mi->mi_open_files++;
3523 }
3524 
3525 void
3526 nfs4_dec_state_ref_count(mntinfo4_t *mi)
3527 {
3528 	nfs4_server_t	*sp;
3529 
3530 	/* this locks down sp if it is found */
3531 	sp = find_nfs4_server_all(mi, 1);
3532 
3533 	if (sp != NULL) {
3534 		nfs4_dec_state_ref_count_nolock(sp, mi);
3535 		mutex_exit(&sp->s_lock);
3536 		nfs4_server_rele(sp);
3537 	}
3538 }
3539 
3540 /*
3541  * Decrement the number of OPEN files (ie: those with state) so we know if
3542  * this nfs4_server has any state to maintain a lease for or not.
3543  */
3544 void
3545 nfs4_dec_state_ref_count_nolock(nfs4_server_t *sp, mntinfo4_t *mi)
3546 {
3547 	ASSERT(mutex_owned(&sp->s_lock));
3548 	ASSERT(sp->state_ref_count != 0);
3549 	sp->state_ref_count--;
3550 
3551 	NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE,
3552 	    "nfs4_dec_state_ref_count: state ref count now %d",
3553 	    sp->state_ref_count));
3554 
3555 	mi->mi_open_files--;
3556 	NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE,
3557 	    "nfs4_dec_state_ref_count: mi open files %d, v4 flags 0x%x",
3558 	    mi->mi_open_files, mi->mi_flags));
3559 
3560 	/* We don't have to hold the mi_lock to test mi_flags */
3561 	if (mi->mi_open_files == 0 &&
3562 	    (mi->mi_flags & MI4_REMOVE_ON_LAST_CLOSE)) {
3563 		NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE,
3564 		    "nfs4_dec_state_ref_count: remove mntinfo4 %p since "
3565 		    "we have closed the last open file", (void*)mi));
3566 		nfs4_remove_mi_from_server(mi, sp);
3567 	}
3568 }
3569 
3570 bool_t
3571 inlease(nfs4_server_t *sp)
3572 {
3573 	bool_t result;
3574 
3575 	ASSERT(mutex_owned(&sp->s_lock));
3576 
3577 	if (sp->lease_valid == NFS4_LEASE_VALID &&
3578 	    gethrestime_sec() < sp->last_renewal_time + sp->s_lease_time)
3579 		result = TRUE;
3580 	else
3581 		result = FALSE;
3582 
3583 	return (result);
3584 }
3585 
3586 
3587 /*
3588  * Return non-zero if the given nfs4_server_t is going through recovery.
3589  */
3590 
3591 int
3592 nfs4_server_in_recovery(nfs4_server_t *sp)
3593 {
3594 	return (nfs_rw_lock_held(&sp->s_recovlock, RW_WRITER));
3595 }
3596 
3597 /*
3598  * Compare two shared filehandle objects.  Returns -1, 0, or +1, if the
3599  * first is less than, equal to, or greater than the second.
3600  */
3601 
3602 int
3603 sfh4cmp(const void *p1, const void *p2)
3604 {
3605 	const nfs4_sharedfh_t *sfh1 = (const nfs4_sharedfh_t *)p1;
3606 	const nfs4_sharedfh_t *sfh2 = (const nfs4_sharedfh_t *)p2;
3607 
3608 	return (nfs4cmpfh(&sfh1->sfh_fh, &sfh2->sfh_fh));
3609 }
3610 
3611 /*
3612  * Create a table for shared filehandle objects.
3613  */
3614 
3615 void
3616 sfh4_createtab(avl_tree_t *tab)
3617 {
3618 	avl_create(tab, sfh4cmp, sizeof (nfs4_sharedfh_t),
3619 	    offsetof(nfs4_sharedfh_t, sfh_tree));
3620 }
3621 
3622 /*
3623  * Return a shared filehandle object for the given filehandle.  The caller
3624  * is responsible for eventually calling sfh4_rele().
3625  */
3626 
3627 nfs4_sharedfh_t *
3628 sfh4_put(const nfs_fh4 *fh, mntinfo4_t *mi, nfs4_sharedfh_t *key)
3629 {
3630 	nfs4_sharedfh_t *sfh, *nsfh;
3631 	avl_index_t where;
3632 	nfs4_sharedfh_t skey;
3633 
3634 	if (!key) {
3635 		skey.sfh_fh = *fh;
3636 		key = &skey;
3637 	}
3638 
3639 	nsfh = kmem_alloc(sizeof (nfs4_sharedfh_t), KM_SLEEP);
3640 	nsfh->sfh_fh.nfs_fh4_len = fh->nfs_fh4_len;
3641 	/*
3642 	 * We allocate the largest possible filehandle size because it's
3643 	 * not that big, and it saves us from possibly having to resize the
3644 	 * buffer later.
3645 	 */
3646 	nsfh->sfh_fh.nfs_fh4_val = kmem_alloc(NFS4_FHSIZE, KM_SLEEP);
3647 	bcopy(fh->nfs_fh4_val, nsfh->sfh_fh.nfs_fh4_val, fh->nfs_fh4_len);
3648 	mutex_init(&nsfh->sfh_lock, NULL, MUTEX_DEFAULT, NULL);
3649 	nsfh->sfh_refcnt = 1;
3650 	nsfh->sfh_flags = SFH4_IN_TREE;
3651 	nsfh->sfh_mi = mi;
3652 	NFS4_DEBUG(nfs4_sharedfh_debug, (CE_NOTE, "sfh4_get: new object (%p)",
3653 	    (void *)nsfh));
3654 
3655 	(void) nfs_rw_enter_sig(&mi->mi_fh_lock, RW_WRITER, 0);
3656 	sfh = avl_find(&mi->mi_filehandles, key, &where);
3657 	if (sfh != NULL) {
3658 		mutex_enter(&sfh->sfh_lock);
3659 		sfh->sfh_refcnt++;
3660 		mutex_exit(&sfh->sfh_lock);
3661 		nfs_rw_exit(&mi->mi_fh_lock);
3662 		/* free our speculative allocs */
3663 		kmem_free(nsfh->sfh_fh.nfs_fh4_val, NFS4_FHSIZE);
3664 		kmem_free(nsfh, sizeof (nfs4_sharedfh_t));
3665 		return (sfh);
3666 	}
3667 
3668 	avl_insert(&mi->mi_filehandles, nsfh, where);
3669 	nfs_rw_exit(&mi->mi_fh_lock);
3670 
3671 	return (nsfh);
3672 }
3673 
3674 /*
3675  * Return a shared filehandle object for the given filehandle.  The caller
3676  * is responsible for eventually calling sfh4_rele().
3677  */
3678 
3679 nfs4_sharedfh_t *
3680 sfh4_get(const nfs_fh4 *fh, mntinfo4_t *mi)
3681 {
3682 	nfs4_sharedfh_t *sfh;
3683 	nfs4_sharedfh_t key;
3684 
3685 	ASSERT(fh->nfs_fh4_len <= NFS4_FHSIZE);
3686 
3687 #ifdef DEBUG
3688 	if (nfs4_sharedfh_debug) {
3689 		nfs4_fhandle_t fhandle;
3690 
3691 		fhandle.fh_len = fh->nfs_fh4_len;
3692 		bcopy(fh->nfs_fh4_val, fhandle.fh_buf, fhandle.fh_len);
3693 		zcmn_err(mi->mi_zone->zone_id, CE_NOTE, "sfh4_get:");
3694 		nfs4_printfhandle(&fhandle);
3695 	}
3696 #endif
3697 
3698 	/*
3699 	 * If there's already an object for the given filehandle, bump the
3700 	 * reference count and return it.  Otherwise, create a new object
3701 	 * and add it to the AVL tree.
3702 	 */
3703 
3704 	key.sfh_fh = *fh;
3705 
3706 	(void) nfs_rw_enter_sig(&mi->mi_fh_lock, RW_READER, 0);
3707 	sfh = avl_find(&mi->mi_filehandles, &key, NULL);
3708 	if (sfh != NULL) {
3709 		mutex_enter(&sfh->sfh_lock);
3710 		sfh->sfh_refcnt++;
3711 		NFS4_DEBUG(nfs4_sharedfh_debug, (CE_NOTE,
3712 		    "sfh4_get: found existing %p, new refcnt=%d",
3713 		    (void *)sfh, sfh->sfh_refcnt));
3714 		mutex_exit(&sfh->sfh_lock);
3715 		nfs_rw_exit(&mi->mi_fh_lock);
3716 		return (sfh);
3717 	}
3718 	nfs_rw_exit(&mi->mi_fh_lock);
3719 
3720 	return (sfh4_put(fh, mi, &key));
3721 }
3722 
3723 /*
3724  * Get a reference to the given shared filehandle object.
3725  */
3726 
3727 void
3728 sfh4_hold(nfs4_sharedfh_t *sfh)
3729 {
3730 	ASSERT(sfh->sfh_refcnt > 0);
3731 
3732 	mutex_enter(&sfh->sfh_lock);
3733 	sfh->sfh_refcnt++;
3734 	NFS4_DEBUG(nfs4_sharedfh_debug,
3735 	    (CE_NOTE, "sfh4_hold %p, new refcnt=%d",
3736 	    (void *)sfh, sfh->sfh_refcnt));
3737 	mutex_exit(&sfh->sfh_lock);
3738 }
3739 
3740 /*
3741  * Release a reference to the given shared filehandle object and null out
3742  * the given pointer.
3743  */
3744 
3745 void
3746 sfh4_rele(nfs4_sharedfh_t **sfhpp)
3747 {
3748 	mntinfo4_t *mi;
3749 	nfs4_sharedfh_t *sfh = *sfhpp;
3750 
3751 	ASSERT(sfh->sfh_refcnt > 0);
3752 
3753 	mutex_enter(&sfh->sfh_lock);
3754 	if (sfh->sfh_refcnt > 1) {
3755 		sfh->sfh_refcnt--;
3756 		NFS4_DEBUG(nfs4_sharedfh_debug, (CE_NOTE,
3757 		    "sfh4_rele %p, new refcnt=%d",
3758 		    (void *)sfh, sfh->sfh_refcnt));
3759 		mutex_exit(&sfh->sfh_lock);
3760 		goto finish;
3761 	}
3762 	mutex_exit(&sfh->sfh_lock);
3763 
3764 	/*
3765 	 * Possibly the last reference, so get the lock for the table in
3766 	 * case it's time to remove the object from the table.
3767 	 */
3768 	mi = sfh->sfh_mi;
3769 	(void) nfs_rw_enter_sig(&mi->mi_fh_lock, RW_WRITER, 0);
3770 	mutex_enter(&sfh->sfh_lock);
3771 	sfh->sfh_refcnt--;
3772 	if (sfh->sfh_refcnt > 0) {
3773 		NFS4_DEBUG(nfs4_sharedfh_debug, (CE_NOTE,
3774 		    "sfh4_rele %p, new refcnt=%d",
3775 		    (void *)sfh, sfh->sfh_refcnt));
3776 		mutex_exit(&sfh->sfh_lock);
3777 		nfs_rw_exit(&mi->mi_fh_lock);
3778 		goto finish;
3779 	}
3780 
3781 	NFS4_DEBUG(nfs4_sharedfh_debug, (CE_NOTE,
3782 	    "sfh4_rele %p, last ref", (void *)sfh));
3783 	if (sfh->sfh_flags & SFH4_IN_TREE) {
3784 		avl_remove(&mi->mi_filehandles, sfh);
3785 		sfh->sfh_flags &= ~SFH4_IN_TREE;
3786 	}
3787 	mutex_exit(&sfh->sfh_lock);
3788 	nfs_rw_exit(&mi->mi_fh_lock);
3789 	mutex_destroy(&sfh->sfh_lock);
3790 	kmem_free(sfh->sfh_fh.nfs_fh4_val, NFS4_FHSIZE);
3791 	kmem_free(sfh, sizeof (nfs4_sharedfh_t));
3792 
3793 finish:
3794 	*sfhpp = NULL;
3795 }
3796 
3797 /*
3798  * Update the filehandle for the given shared filehandle object.
3799  */
3800 
3801 int nfs4_warn_dupfh = 0;	/* if set, always warn about dup fhs below */
3802 
3803 void
3804 sfh4_update(nfs4_sharedfh_t *sfh, const nfs_fh4 *newfh)
3805 {
3806 	mntinfo4_t *mi = sfh->sfh_mi;
3807 	nfs4_sharedfh_t *dupsfh;
3808 	avl_index_t where;
3809 	nfs4_sharedfh_t key;
3810 
3811 #ifdef DEBUG
3812 	mutex_enter(&sfh->sfh_lock);
3813 	ASSERT(sfh->sfh_refcnt > 0);
3814 	mutex_exit(&sfh->sfh_lock);
3815 #endif
3816 	ASSERT(newfh->nfs_fh4_len <= NFS4_FHSIZE);
3817 
3818 	/*
3819 	 * The basic plan is to remove the shared filehandle object from
3820 	 * the table, update it to have the new filehandle, then reinsert
3821 	 * it.
3822 	 */
3823 
3824 	(void) nfs_rw_enter_sig(&mi->mi_fh_lock, RW_WRITER, 0);
3825 	mutex_enter(&sfh->sfh_lock);
3826 	if (sfh->sfh_flags & SFH4_IN_TREE) {
3827 		avl_remove(&mi->mi_filehandles, sfh);
3828 		sfh->sfh_flags &= ~SFH4_IN_TREE;
3829 	}
3830 	mutex_exit(&sfh->sfh_lock);
3831 	sfh->sfh_fh.nfs_fh4_len = newfh->nfs_fh4_len;
3832 	bcopy(newfh->nfs_fh4_val, sfh->sfh_fh.nfs_fh4_val,
3833 	    sfh->sfh_fh.nfs_fh4_len);
3834 
3835 	/*
3836 	 * XXX If there is already a shared filehandle object with the new
3837 	 * filehandle, we're in trouble, because the rnode code assumes
3838 	 * that there is only one shared filehandle object for a given
3839 	 * filehandle.  So issue a warning (for read-write mounts only)
3840 	 * and don't try to re-insert the given object into the table.
3841 	 * Hopefully the given object will quickly go away and everyone
3842 	 * will use the new object.
3843 	 */
3844 	key.sfh_fh = *newfh;
3845 	dupsfh = avl_find(&mi->mi_filehandles, &key, &where);
3846 	if (dupsfh != NULL) {
3847 		if (!(mi->mi_vfsp->vfs_flag & VFS_RDONLY) || nfs4_warn_dupfh) {
3848 			zcmn_err(mi->mi_zone->zone_id, CE_WARN, "sfh4_update: "
3849 			    "duplicate filehandle detected");
3850 			sfh4_printfhandle(dupsfh);
3851 		}
3852 	} else {
3853 		avl_insert(&mi->mi_filehandles, sfh, where);
3854 		mutex_enter(&sfh->sfh_lock);
3855 		sfh->sfh_flags |= SFH4_IN_TREE;
3856 		mutex_exit(&sfh->sfh_lock);
3857 	}
3858 	nfs_rw_exit(&mi->mi_fh_lock);
3859 }
3860 
3861 /*
3862  * Copy out the current filehandle for the given shared filehandle object.
3863  */
3864 
3865 void
3866 sfh4_copyval(const nfs4_sharedfh_t *sfh, nfs4_fhandle_t *fhp)
3867 {
3868 	mntinfo4_t *mi = sfh->sfh_mi;
3869 
3870 	ASSERT(sfh->sfh_refcnt > 0);
3871 
3872 	(void) nfs_rw_enter_sig(&mi->mi_fh_lock, RW_READER, 0);
3873 	fhp->fh_len = sfh->sfh_fh.nfs_fh4_len;
3874 	ASSERT(fhp->fh_len <= NFS4_FHSIZE);
3875 	bcopy(sfh->sfh_fh.nfs_fh4_val, fhp->fh_buf, fhp->fh_len);
3876 	nfs_rw_exit(&mi->mi_fh_lock);
3877 }
3878 
3879 /*
3880  * Print out the filehandle for the given shared filehandle object.
3881  */
3882 
3883 void
3884 sfh4_printfhandle(const nfs4_sharedfh_t *sfh)
3885 {
3886 	nfs4_fhandle_t fhandle;
3887 
3888 	sfh4_copyval(sfh, &fhandle);
3889 	nfs4_printfhandle(&fhandle);
3890 }
3891 
3892 /*
3893  * Compare 2 fnames.  Returns -1 if the first is "less" than the second, 0
3894  * if they're the same, +1 if the first is "greater" than the second.  The
3895  * caller (or whoever's calling the AVL package) is responsible for
3896  * handling locking issues.
3897  */
3898 
3899 static int
3900 fncmp(const void *p1, const void *p2)
3901 {
3902 	const nfs4_fname_t *f1 = p1;
3903 	const nfs4_fname_t *f2 = p2;
3904 	int res;
3905 
3906 	res = strcmp(f1->fn_name, f2->fn_name);
3907 	/*
3908 	 * The AVL package wants +/-1, not arbitrary positive or negative
3909 	 * integers.
3910 	 */
3911 	if (res > 0)
3912 		res = 1;
3913 	else if (res < 0)
3914 		res = -1;
3915 	return (res);
3916 }
3917 
3918 /*
3919  * Get or create an fname with the given name, as a child of the given
3920  * fname.  The caller is responsible for eventually releasing the reference
3921  * (fn_rele()).  parent may be NULL.
3922  */
3923 
3924 nfs4_fname_t *
3925 fn_get(nfs4_fname_t *parent, char *name)
3926 {
3927 	nfs4_fname_t key;
3928 	nfs4_fname_t *fnp;
3929 	avl_index_t where;
3930 
3931 	key.fn_name = name;
3932 
3933 	/*
3934 	 * If there's already an fname registered with the given name, bump
3935 	 * its reference count and return it.  Otherwise, create a new one
3936 	 * and add it to the parent's AVL tree.
3937 	 */
3938 
3939 	if (parent != NULL) {
3940 		mutex_enter(&parent->fn_lock);
3941 		fnp = avl_find(&parent->fn_children, &key, &where);
3942 		if (fnp != NULL) {
3943 			fn_hold(fnp);
3944 			mutex_exit(&parent->fn_lock);
3945 			return (fnp);
3946 		}
3947 	}
3948 
3949 	fnp = kmem_alloc(sizeof (nfs4_fname_t), KM_SLEEP);
3950 	mutex_init(&fnp->fn_lock, NULL, MUTEX_DEFAULT, NULL);
3951 	fnp->fn_parent = parent;
3952 	if (parent != NULL)
3953 		fn_hold(parent);
3954 	fnp->fn_len = strlen(name);
3955 	ASSERT(fnp->fn_len < MAXNAMELEN);
3956 	fnp->fn_name = kmem_alloc(fnp->fn_len + 1, KM_SLEEP);
3957 	(void) strcpy(fnp->fn_name, name);
3958 	fnp->fn_refcnt = 1;
3959 	avl_create(&fnp->fn_children, fncmp, sizeof (nfs4_fname_t),
3960 	    offsetof(nfs4_fname_t, fn_tree));
3961 	NFS4_DEBUG(nfs4_fname_debug, (CE_NOTE,
3962 	    "fn_get %p:%s, a new nfs4_fname_t!",
3963 	    (void *)fnp, fnp->fn_name));
3964 	if (parent != NULL) {
3965 		avl_insert(&parent->fn_children, fnp, where);
3966 		mutex_exit(&parent->fn_lock);
3967 	}
3968 
3969 	return (fnp);
3970 }
3971 
3972 void
3973 fn_hold(nfs4_fname_t *fnp)
3974 {
3975 	atomic_add_32(&fnp->fn_refcnt, 1);
3976 	NFS4_DEBUG(nfs4_fname_debug, (CE_NOTE,
3977 	    "fn_hold %p:%s, new refcnt=%d",
3978 	    (void *)fnp, fnp->fn_name, fnp->fn_refcnt));
3979 }
3980 
3981 /*
3982  * Decrement the reference count of the given fname, and destroy it if its
3983  * reference count goes to zero.  Nulls out the given pointer.
3984  */
3985 
3986 void
3987 fn_rele(nfs4_fname_t **fnpp)
3988 {
3989 	nfs4_fname_t *parent;
3990 	uint32_t newref;
3991 	nfs4_fname_t *fnp;
3992 
3993 recur:
3994 	fnp = *fnpp;
3995 	*fnpp = NULL;
3996 
3997 	mutex_enter(&fnp->fn_lock);
3998 	parent = fnp->fn_parent;
3999 	if (parent != NULL)
4000 		mutex_enter(&parent->fn_lock);	/* prevent new references */
4001 	newref = atomic_add_32_nv(&fnp->fn_refcnt, -1);
4002 	if (newref > 0) {
4003 		NFS4_DEBUG(nfs4_fname_debug, (CE_NOTE,
4004 		    "fn_rele %p:%s, new refcnt=%d",
4005 		    (void *)fnp, fnp->fn_name, fnp->fn_refcnt));
4006 		if (parent != NULL)
4007 			mutex_exit(&parent->fn_lock);
4008 		mutex_exit(&fnp->fn_lock);
4009 		return;
4010 	}
4011 
4012 	NFS4_DEBUG(nfs4_fname_debug, (CE_NOTE,
4013 	    "fn_rele %p:%s, last reference, deleting...",
4014 	    (void *)fnp, fnp->fn_name));
4015 	if (parent != NULL) {
4016 		avl_remove(&parent->fn_children, fnp);
4017 		mutex_exit(&parent->fn_lock);
4018 	}
4019 	kmem_free(fnp->fn_name, fnp->fn_len + 1);
4020 	mutex_destroy(&fnp->fn_lock);
4021 	avl_destroy(&fnp->fn_children);
4022 	kmem_free(fnp, sizeof (nfs4_fname_t));
4023 	/*
4024 	 * Recursivly fn_rele the parent.
4025 	 * Use goto instead of a recursive call to avoid stack overflow.
4026 	 */
4027 	if (parent != NULL) {
4028 		fnpp = &parent;
4029 		goto recur;
4030 	}
4031 }
4032 
4033 /*
4034  * Returns the single component name of the given fname, in a MAXNAMELEN
4035  * string buffer, which the caller is responsible for freeing.  Note that
4036  * the name may become invalid as a result of fn_move().
4037  */
4038 
4039 char *
4040 fn_name(nfs4_fname_t *fnp)
4041 {
4042 	char *name;
4043 
4044 	ASSERT(fnp->fn_len < MAXNAMELEN);
4045 	name = kmem_alloc(MAXNAMELEN, KM_SLEEP);
4046 	mutex_enter(&fnp->fn_lock);
4047 	(void) strcpy(name, fnp->fn_name);
4048 	mutex_exit(&fnp->fn_lock);
4049 
4050 	return (name);
4051 }
4052 
4053 
4054 /*
4055  * fn_path_realloc
4056  *
4057  * This function, used only by fn_path, constructs
4058  * a new string which looks like "prepend" + "/" + "current".
4059  * by allocating a new string and freeing the old one.
4060  */
4061 static void
4062 fn_path_realloc(char **curses, char *prepend)
4063 {
4064 	int len, curlen = 0;
4065 	char *news;
4066 
4067 	if (*curses == NULL) {
4068 		/*
4069 		 * Prime the pump, allocate just the
4070 		 * space for prepend and return that.
4071 		 */
4072 		len = strlen(prepend) + 1;
4073 		news = kmem_alloc(len, KM_SLEEP);
4074 		(void) strncpy(news, prepend, len);
4075 	} else {
4076 		/*
4077 		 * Allocate the space  for a new string
4078 		 * +1 +1 is for the "/" and the NULL
4079 		 * byte at the end of it all.
4080 		 */
4081 		curlen = strlen(*curses);
4082 		len = curlen + strlen(prepend) + 1 + 1;
4083 		news = kmem_alloc(len, KM_SLEEP);
4084 		(void) strncpy(news, prepend, len);
4085 		(void) strcat(news, "/");
4086 		(void) strcat(news, *curses);
4087 		kmem_free(*curses, curlen + 1);
4088 	}
4089 	*curses = news;
4090 }
4091 
4092 /*
4093  * Returns the path name (starting from the fs root) for the given fname.
4094  * The caller is responsible for freeing.  Note that the path may be or
4095  * become invalid as a result of fn_move().
4096  */
4097 
4098 char *
4099 fn_path(nfs4_fname_t *fnp)
4100 {
4101 	char *path;
4102 	nfs4_fname_t *nextfnp;
4103 
4104 	if (fnp == NULL)
4105 		return (NULL);
4106 
4107 	path = NULL;
4108 
4109 	/* walk up the tree constructing the pathname.  */
4110 
4111 	fn_hold(fnp);			/* adjust for later rele */
4112 	do {
4113 		mutex_enter(&fnp->fn_lock);
4114 		/*
4115 		 * Add fn_name in front of the current path
4116 		 */
4117 		fn_path_realloc(&path, fnp->fn_name);
4118 		nextfnp = fnp->fn_parent;
4119 		if (nextfnp != NULL)
4120 			fn_hold(nextfnp);
4121 		mutex_exit(&fnp->fn_lock);
4122 		fn_rele(&fnp);
4123 		fnp = nextfnp;
4124 	} while (fnp != NULL);
4125 
4126 	return (path);
4127 }
4128 
4129 /*
4130  * Return a reference to the parent of the given fname, which the caller is
4131  * responsible for eventually releasing.
4132  */
4133 
4134 nfs4_fname_t *
4135 fn_parent(nfs4_fname_t *fnp)
4136 {
4137 	nfs4_fname_t *parent;
4138 
4139 	mutex_enter(&fnp->fn_lock);
4140 	parent = fnp->fn_parent;
4141 	if (parent != NULL)
4142 		fn_hold(parent);
4143 	mutex_exit(&fnp->fn_lock);
4144 
4145 	return (parent);
4146 }
4147 
4148 /*
4149  * Update fnp so that its parent is newparent and its name is newname.
4150  */
4151 
4152 void
4153 fn_move(nfs4_fname_t *fnp, nfs4_fname_t *newparent, char *newname)
4154 {
4155 	nfs4_fname_t *parent, *tmpfnp;
4156 	ssize_t newlen;
4157 	nfs4_fname_t key;
4158 	avl_index_t where;
4159 
4160 	/*
4161 	 * This assert exists to catch the client trying to rename
4162 	 * a dir to be a child of itself.  This happened at a recent
4163 	 * bakeoff against a 3rd party (broken) server which allowed
4164 	 * the rename to succeed.  If it trips it means that:
4165 	 *	a) the code in nfs4rename that detects this case is broken
4166 	 *	b) the server is broken (since it allowed the bogus rename)
4167 	 *
4168 	 * For non-DEBUG kernels, prepare for a recursive mutex_enter
4169 	 * panic below from:  mutex_enter(&newparent->fn_lock);
4170 	 */
4171 	ASSERT(fnp != newparent);
4172 
4173 	/*
4174 	 * Remove fnp from its current parent, change its name, then add it
4175 	 * to newparent.
4176 	 */
4177 	mutex_enter(&fnp->fn_lock);
4178 	parent = fnp->fn_parent;
4179 	mutex_enter(&parent->fn_lock);
4180 	avl_remove(&parent->fn_children, fnp);
4181 	mutex_exit(&parent->fn_lock);
4182 	fn_rele(&fnp->fn_parent);
4183 
4184 	newlen = strlen(newname);
4185 	if (newlen != fnp->fn_len) {
4186 		ASSERT(newlen < MAXNAMELEN);
4187 		kmem_free(fnp->fn_name, fnp->fn_len + 1);
4188 		fnp->fn_name = kmem_alloc(newlen + 1, KM_SLEEP);
4189 		fnp->fn_len = newlen;
4190 	}
4191 	(void) strcpy(fnp->fn_name, newname);
4192 
4193 again:
4194 	mutex_enter(&newparent->fn_lock);
4195 	key.fn_name = fnp->fn_name;
4196 	tmpfnp = avl_find(&newparent->fn_children, &key, &where);
4197 	if (tmpfnp != NULL) {
4198 		/*
4199 		 * This could be due to a file that was unlinked while
4200 		 * open, or perhaps the rnode is in the free list.  Remove
4201 		 * it from newparent and let it go away on its own.  The
4202 		 * contorted code is to deal with lock order issues and
4203 		 * race conditions.
4204 		 */
4205 		fn_hold(tmpfnp);
4206 		mutex_exit(&newparent->fn_lock);
4207 		mutex_enter(&tmpfnp->fn_lock);
4208 		if (tmpfnp->fn_parent == newparent) {
4209 			mutex_enter(&newparent->fn_lock);
4210 			avl_remove(&newparent->fn_children, tmpfnp);
4211 			mutex_exit(&newparent->fn_lock);
4212 			fn_rele(&tmpfnp->fn_parent);
4213 		}
4214 		mutex_exit(&tmpfnp->fn_lock);
4215 		fn_rele(&tmpfnp);
4216 		goto again;
4217 	}
4218 	fnp->fn_parent = newparent;
4219 	fn_hold(newparent);
4220 	avl_insert(&newparent->fn_children, fnp, where);
4221 	mutex_exit(&newparent->fn_lock);
4222 	mutex_exit(&fnp->fn_lock);
4223 }
4224 
4225 #ifdef DEBUG
4226 /*
4227  * Return non-zero if the type information makes sense for the given vnode.
4228  * Otherwise panic.
4229  */
4230 int
4231 nfs4_consistent_type(vnode_t *vp)
4232 {
4233 	rnode4_t *rp = VTOR4(vp);
4234 
4235 	if (nfs4_vtype_debug && vp->v_type != VNON &&
4236 	    rp->r_attr.va_type != VNON && vp->v_type != rp->r_attr.va_type) {
4237 		cmn_err(CE_PANIC, "vnode %p type mismatch; v_type=%d, "
4238 		    "rnode attr type=%d", (void *)vp, vp->v_type,
4239 		    rp->r_attr.va_type);
4240 	}
4241 
4242 	return (1);
4243 }
4244 #endif /* DEBUG */
4245