xref: /titanic_51/usr/src/uts/common/fs/nfs/nfs4_client.c (revision 0cd13cbfb4270b840b4bd22ec5f673b2b6a2c02b)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 /*
27  *  	Copyright (c) 1983,1984,1985,1986,1987,1988,1989  AT&T.
28  *	All Rights Reserved
29  */
30 
31 #pragma ident	"%Z%%M%	%I%	%E% SMI"
32 
33 #include <sys/param.h>
34 #include <sys/types.h>
35 #include <sys/systm.h>
36 #include <sys/thread.h>
37 #include <sys/t_lock.h>
38 #include <sys/time.h>
39 #include <sys/vnode.h>
40 #include <sys/vfs.h>
41 #include <sys/errno.h>
42 #include <sys/buf.h>
43 #include <sys/stat.h>
44 #include <sys/cred.h>
45 #include <sys/kmem.h>
46 #include <sys/debug.h>
47 #include <sys/dnlc.h>
48 #include <sys/vmsystm.h>
49 #include <sys/flock.h>
50 #include <sys/share.h>
51 #include <sys/cmn_err.h>
52 #include <sys/tiuser.h>
53 #include <sys/sysmacros.h>
54 #include <sys/callb.h>
55 #include <sys/acl.h>
56 #include <sys/kstat.h>
57 #include <sys/signal.h>
58 #include <sys/disp.h>
59 #include <sys/atomic.h>
60 #include <sys/list.h>
61 #include <sys/sdt.h>
62 
63 #include <rpc/types.h>
64 #include <rpc/xdr.h>
65 #include <rpc/auth.h>
66 #include <rpc/clnt.h>
67 
68 #include <nfs/nfs.h>
69 #include <nfs/nfs_clnt.h>
70 #include <nfs/nfs_acl.h>
71 
72 #include <nfs/nfs4.h>
73 #include <nfs/rnode4.h>
74 #include <nfs/nfs4_clnt.h>
75 
76 #include <vm/hat.h>
77 #include <vm/as.h>
78 #include <vm/page.h>
79 #include <vm/pvn.h>
80 #include <vm/seg.h>
81 #include <vm/seg_map.h>
82 #include <vm/seg_vn.h>
83 
84 #include <sys/ddi.h>
85 
86 /*
87  * Arguments to page-flush thread.
 *
 * Allocated by nfs4_purge_caches() and freed by nfs4_pgflush_thread(),
 * which also drops the vnode and credential holds taken on its behalf.
88  */
89 typedef struct {
90 	vnode_t *vp;	/* file to flush; held via VN_HOLD by the creator */
91 	cred_t *cr;	/* creds given to flush_pages(); held via crhold */
92 } pgflush_t;
93 
94 #ifdef DEBUG
/*
 * Debug-only globals.  NOTE(review): the three *_debug flags below are not
 * referenced in this part of the file; presumably they gate debug output
 * elsewhere -- confirm.
 */
95 int nfs4_client_lease_debug;
96 int nfs4_sharedfh_debug;
97 int nfs4_fname_debug;
98 
99 /* temporary: panic if v_type is inconsistent with r_attr va_type */
100 int nfs4_vtype_debug;
101 
/*
 * Thread-specific-data key.  nfs4_purge_stale_fh() asserts that tsd_get()
 * on this key returns NULL, i.e. that the ..._end_op() call has been done.
 */
102 uint_t nfs4_tsd_key;
103 #endif
104 
/*
 * NOTE(review): nfs4_client_resumed and cid are not used in this part of
 * the file; presumably they belong to the CPR callback declared below --
 * confirm.
 */
105 static time_t	nfs4_client_resumed = 0;
106 static	callb_id_t cid = 0;
107 
/* Forward declarations of file-local routines. */
108 static int	nfs4renew(nfs4_server_t *);
109 static void	nfs4_attrcache_va(vnode_t *, nfs4_ga_res_t *, int);
110 static void	nfs4_pgflush_thread(pgflush_t *);
111 static void	flush_pages(vnode_t *, cred_t *);
112 
113 static boolean_t nfs4_client_cpr_callb(void *, int);
114 
/*
 * Bookkeeping for the NFS v4 mounts in a zone.
 * NOTE(review): presumably stored as zone-specific data under
 * mi4_list_key below -- confirm against the zone key setup code.
 */
115 struct mi4_globals {
116 	kmutex_t	mig_lock;  /* lock protecting mig_list */
117 	list_t		mig_list;  /* list of NFS v4 mounts in zone */
118 	boolean_t	mig_destructor_called;
119 };
120 
121 static zone_key_t mi4_list_key;
122 
123 /*
124  * Attributes caching:
125  *
126  * Attributes are cached in the rnode in struct vattr form.
127  * There is a time associated with the cached attributes (r_time_attr_inval)
128  * which tells whether the attributes are valid. When new attributes are
129  * cached, it is set to the current time plus the time elapsed since a
130  * change to the file was last detected. This allows the attributes for
131  * files that have changed recently to be timed out sooner than for files
132  * that have not changed for a long time. There are minimum and maximum
133  * timeout values that can be set per mount point.
134  */
135 
136 /*
137  * If a cache purge is in progress, wait for it to finish.
138  *
139  * The current thread must not be in the middle of an
140  * nfs4_start_op/nfs4_end_op region.  Otherwise, there could be a deadlock
141  * between this thread, a recovery thread, and the page flush thread.
142  */
143 int
144 nfs4_waitfor_purge_complete(vnode_t *vp)
145 {
146 	rnode4_t *rp;
147 	k_sigset_t smask;
148 
149 	rp = VTOR4(vp);
150 	if ((rp->r_serial != NULL && rp->r_serial != curthread) ||
151 	    ((rp->r_flags & R4PGFLUSH) && rp->r_pgflush != curthread)) {
152 		mutex_enter(&rp->r_statelock);
153 		sigintr(&smask, VTOMI4(vp)->mi_flags & MI4_INT);
154 		while ((rp->r_serial != NULL && rp->r_serial != curthread) ||
155 		    ((rp->r_flags & R4PGFLUSH) &&
156 		    rp->r_pgflush != curthread)) {
157 			if (!cv_wait_sig(&rp->r_cv, &rp->r_statelock)) {
158 				sigunintr(&smask);
159 				mutex_exit(&rp->r_statelock);
160 				return (EINTR);
161 			}
162 		}
163 		sigunintr(&smask);
164 		mutex_exit(&rp->r_statelock);
165 	}
166 	return (0);
167 }
168 
169 /*
170  * Validate caches by checking cached attributes. If they have timed out,
171  * then get new attributes from the server.  As a side effect, cache
172  * invalidation is done if the attributes have changed.
173  *
174  * If the attributes have not timed out and if there is a cache
175  * invalidation being done by some other thread, then wait until that
176  * thread has completed the cache invalidation.
177  */
178 int
179 nfs4_validate_caches(vnode_t *vp, cred_t *cr)
180 {
181 	int error;
182 	nfs4_ga_res_t gar;
183 
184 	if (ATTRCACHE4_VALID(vp)) {
185 		error = nfs4_waitfor_purge_complete(vp);
186 		if (error)
187 			return (error);
188 		return (0);
189 	}
190 
191 	gar.n4g_va.va_mask = AT_ALL;
192 	return (nfs4_getattr_otw(vp, &gar, cr, 0));
193 }
194 
195 /*
196  * Fill in attribute from the cache.
197  * If valid, then return 0 to indicate that no error occurred,
198  * otherwise return 1 to indicate that an error occurred.
199  */
200 static int
201 nfs4_getattr_cache(vnode_t *vp, struct vattr *vap)
202 {
203 	rnode4_t *rp;
204 
205 	rp = VTOR4(vp);
206 	mutex_enter(&rp->r_statelock);
207 	mutex_enter(&rp->r_statev4_lock);
208 	if (ATTRCACHE4_VALID(vp)) {
209 		mutex_exit(&rp->r_statev4_lock);
210 		/*
211 		 * Cached attributes are valid
212 		 */
213 		*vap = rp->r_attr;
214 		mutex_exit(&rp->r_statelock);
215 		return (0);
216 	}
217 	mutex_exit(&rp->r_statev4_lock);
218 	mutex_exit(&rp->r_statelock);
219 	return (1);
220 }
221 
222 
223 /*
224  * If returned error is ESTALE flush all caches.  The nfs4_purge_caches()
225  * call is synchronous because all the pages were invalidated by the
226  * nfs4_invalidate_pages() call.
227  */
228 void
229 nfs4_purge_stale_fh(int errno, vnode_t *vp, cred_t *cr)
230 {
231 	struct rnode4 *rp = VTOR4(vp);
232 
233 	/* Ensure that the ..._end_op() call has been done */
234 	ASSERT(tsd_get(nfs4_tsd_key) == NULL);
235 
236 	if (errno != ESTALE)
237 		return;
238 
239 	mutex_enter(&rp->r_statelock);
240 	rp->r_flags |= R4STALE;
241 	if (!rp->r_error)
242 		rp->r_error = errno;
243 	mutex_exit(&rp->r_statelock);
244 	if (nfs4_has_pages(vp))
245 		nfs4_invalidate_pages(vp, (u_offset_t)0, cr);
246 	nfs4_purge_caches(vp, NFS4_PURGE_DNLC, cr, FALSE);
247 }
248 
249 /*
250  * Purge all of the various NFS `data' caches.  If "asyncpg" is TRUE, the
251  * page purge is done asynchronously.
252  */
253 void
254 nfs4_purge_caches(vnode_t *vp, int purge_dnlc, cred_t *cr, int asyncpg)
255 {
256 	rnode4_t *rp;
257 	char *contents;
258 	vnode_t *xattr;
259 	int size;
260 	int pgflush;			/* are we the page flush thread? */
261 
262 	/*
263 	 * Purge the DNLC for any entries which refer to this file.
264 	 */
265 	if (vp->v_count > 1 &&
266 	    (vp->v_type == VDIR || purge_dnlc == NFS4_PURGE_DNLC))
267 		dnlc_purge_vp(vp);
268 
269 	/*
270 	 * Clear any readdir state bits and purge the readlink response cache.
271 	 */
272 	rp = VTOR4(vp);
273 	mutex_enter(&rp->r_statelock);
274 	rp->r_flags &= ~R4LOOKUP;
275 	contents = rp->r_symlink.contents;
276 	size = rp->r_symlink.size;
277 	rp->r_symlink.contents = NULL;
278 
279 	xattr = rp->r_xattr_dir;
280 	rp->r_xattr_dir = NULL;
281 
282 	/*
283 	 * Purge pathconf cache too.
284 	 */
285 	rp->r_pathconf.pc4_xattr_valid = 0;
286 	rp->r_pathconf.pc4_cache_valid = 0;
287 
288 	pgflush = (curthread == rp->r_pgflush);
289 	mutex_exit(&rp->r_statelock);
290 
291 	if (contents != NULL) {
292 
293 		kmem_free((void *)contents, size);
294 	}
295 
296 	if (xattr != NULL)
297 		VN_RELE(xattr);
298 
299 	/*
300 	 * Flush the page cache.  If the current thread is the page flush
301 	 * thread, don't initiate a new page flush.  There's no need for
302 	 * it, and doing it correctly is hard.
303 	 */
304 	if (nfs4_has_pages(vp) && !pgflush) {
305 		if (!asyncpg) {
306 			(void) nfs4_waitfor_purge_complete(vp);
307 			flush_pages(vp, cr);
308 		} else {
309 			pgflush_t *args;
310 
311 			/*
312 			 * We don't hold r_statelock while creating the
313 			 * thread, in case the call blocks.  So we use a
314 			 * flag to indicate that a page flush thread is
315 			 * active.
316 			 */
317 			mutex_enter(&rp->r_statelock);
318 			if (rp->r_flags & R4PGFLUSH) {
319 				mutex_exit(&rp->r_statelock);
320 			} else {
321 				rp->r_flags |= R4PGFLUSH;
322 				mutex_exit(&rp->r_statelock);
323 
324 				args = kmem_alloc(sizeof (pgflush_t),
325 				    KM_SLEEP);
326 				args->vp = vp;
327 				VN_HOLD(args->vp);
328 				args->cr = cr;
329 				crhold(args->cr);
330 				(void) zthread_create(NULL, 0,
331 				    nfs4_pgflush_thread, args, 0,
332 				    minclsyspri);
333 			}
334 		}
335 	}
336 
337 	/*
338 	 * Flush the readdir response cache.
339 	 */
340 	nfs4_purge_rddir_cache(vp);
341 }
342 
343 /*
344  * Invalidate all pages for the given file, after writing back the dirty
345  * ones.
346  */
347 
348 static void
349 flush_pages(vnode_t *vp, cred_t *cr)
350 {
351 	int error;
352 	rnode4_t *rp = VTOR4(vp);
353 
354 	error = VOP_PUTPAGE(vp, (u_offset_t)0, 0, B_INVAL, cr, NULL);
355 	if (error == ENOSPC || error == EDQUOT) {
356 		mutex_enter(&rp->r_statelock);
357 		if (!rp->r_error)
358 			rp->r_error = error;
359 		mutex_exit(&rp->r_statelock);
360 	}
361 }
362 
363 /*
364  * Page flush thread.
365  */
366 
367 static void
368 nfs4_pgflush_thread(pgflush_t *args)
369 {
370 	rnode4_t *rp = VTOR4(args->vp);
371 
372 	/* remember which thread we are, so we don't deadlock ourselves */
373 	mutex_enter(&rp->r_statelock);
374 	ASSERT(rp->r_pgflush == NULL);
375 	rp->r_pgflush = curthread;
376 	mutex_exit(&rp->r_statelock);
377 
378 	flush_pages(args->vp, args->cr);
379 
380 	mutex_enter(&rp->r_statelock);
381 	rp->r_pgflush = NULL;
382 	rp->r_flags &= ~R4PGFLUSH;
383 	cv_broadcast(&rp->r_cv);
384 	mutex_exit(&rp->r_statelock);
385 
386 	VN_RELE(args->vp);
387 	crfree(args->cr);
388 	kmem_free(args, sizeof (pgflush_t));
389 	zthread_exit();
390 }
391 
392 /*
393  * Purge the readdir cache of all entries which are not currently
394  * being filled.
395  */
396 void
397 nfs4_purge_rddir_cache(vnode_t *vp)
398 {
399 	rnode4_t *rp;
400 
401 	rp = VTOR4(vp);
402 
403 	mutex_enter(&rp->r_statelock);
404 	rp->r_direof = NULL;
405 	rp->r_flags &= ~R4LOOKUP;
406 	rp->r_flags |= R4READDIRWATTR;
407 	rddir4_cache_purge(rp);
408 	mutex_exit(&rp->r_statelock);
409 }
410 
411 /*
412  * Set attributes cache for given vnode using virtual attributes.  There is
413  * no cache validation, but if the attributes are deemed to be stale, they
414  * are ignored.  This corresponds to nfs3_attrcache().
415  *
416  * Set the timeout value on the attribute cache and fill it
417  * with the passed in attributes.
418  */
419 void
420 nfs4_attrcache_noinval(vnode_t *vp, nfs4_ga_res_t *garp, hrtime_t t)
421 {
422 	rnode4_t *rp = VTOR4(vp);
423 
424 	mutex_enter(&rp->r_statelock);
425 	if (rp->r_time_attr_saved <= t)
426 		nfs4_attrcache_va(vp, garp, FALSE);
427 	mutex_exit(&rp->r_statelock);
428 }
429 
430 /*
431  * Use the passed in virtual attributes to check to see whether the
432  * data and metadata caches are valid, cache the new attributes, and
433  * then do the cache invalidation if required.
434  *
435  * The cache validation and caching of the new attributes is done
436  * atomically via the use of the mutex, r_statelock.  If required,
437  * the cache invalidation is done atomically w.r.t. the cache
438  * validation and caching of the attributes via the pseudo lock,
439  * r_serial.
440  *
441  * This routine is used to do cache validation and attributes caching
442  * for operations with a single set of post operation attributes.
 *
 * Arguments:
 *	vp	vnode whose rnode caches are validated and updated.
 *	garp	attributes to cache; n4g_va.va_fsid must match the
 *		mount's vfs_dev (asserted).
 *	t	time associated with the attributes; it is compared with
 *		r_time_attr_saved to detect attributes that are older
 *		than what is already cached.
 *	cr	credentials handed to nfs4_purge_caches().
 *	async	passed to nfs4_purge_caches() as its asyncpg argument
 *		(forced to 1 when the caller is the recovery thread).
 *	cinfo	change info from a directory modifying operation, or
 *		NULL for all other callers.
 *
 * The attributes are NOT cached, and the function returns early, when:
 *	- the recovery thread finds r_serial owned by another thread
 *	  (cached attributes are purged),
 *	- the wait for r_serial is interrupted,
 *	- a page flush thread other than the caller is active
 *	  (cached attributes are purged), or
 *	- newer attributes have been saved since time t (cached
 *	  attributes are purged only if they are inconsistent with garp).
443  */
444 
445 void
446 nfs4_attr_cache(vnode_t *vp, nfs4_ga_res_t *garp,
447     hrtime_t t, cred_t *cr, int async,
448     change_info4 *cinfo)
449 {
450 	rnode4_t *rp;
451 	int mtime_changed = 0;
452 	int ctime_changed = 0;
453 	vsecattr_t *vsp;
454 	int was_serial, set_time_cache_inval, recov;
455 	vattr_t *vap = &garp->n4g_va;
456 	mntinfo4_t *mi = VTOMI4(vp);
457 	len_t preattr_rsize;
458 	boolean_t writemodify_set = B_FALSE;
459 	boolean_t cachepurge_set = B_FALSE;
460 
461 	ASSERT(mi->mi_vfsp->vfs_dev == garp->n4g_va.va_fsid);
462 
463 	/* Is curthread the recovery thread? */
464 	mutex_enter(&mi->mi_lock);
465 	recov = (VTOMI4(vp)->mi_recovthread == curthread);
466 	mutex_exit(&mi->mi_lock);
467 
468 	rp = VTOR4(vp);
469 	mutex_enter(&rp->r_statelock);
	/*
	 * was_serial records whether this thread already owned r_serial
	 * on entry; in that case it is not released at the end.
	 */
470 	was_serial = (rp->r_serial == curthread);
471 	if (rp->r_serial && !was_serial) {
472 		klwp_t *lwp = ttolwp(curthread);
473 
474 		/*
475 		 * If we're the recovery thread, then purge current attrs
476 		 * and bail out to avoid potential deadlock between another
477 		 * thread caching attrs (r_serial thread), recov thread,
478 		 * and an async writer thread.
479 		 */
480 		if (recov) {
481 			PURGE_ATTRCACHE4_LOCKED(rp);
482 			mutex_exit(&rp->r_statelock);
483 			return;
484 		}
485 
		/*
		 * Wait for the current r_serial owner to finish.  If the
		 * wait is interrupted, give up without caching anything.
		 */
486 		if (lwp != NULL)
487 			lwp->lwp_nostop++;
488 		while (rp->r_serial != NULL) {
489 			if (!cv_wait_sig(&rp->r_cv, &rp->r_statelock)) {
490 				mutex_exit(&rp->r_statelock);
491 				if (lwp != NULL)
492 					lwp->lwp_nostop--;
493 				return;
494 			}
495 		}
496 		if (lwp != NULL)
497 			lwp->lwp_nostop--;
498 	}
499 
500 	/*
501 	 * If there is a page flush thread, the current thread needs to
502 	 * bail out, to prevent a possible deadlock between the current
503 	 * thread (which might be in a start_op/end_op region), the
504 	 * recovery thread, and the page flush thread.  Expire the
505 	 * attribute cache, so that any attributes the current thread was
506 	 * going to set are not lost.
507 	 */
508 	if ((rp->r_flags & R4PGFLUSH) && rp->r_pgflush != curthread) {
509 		PURGE_ATTRCACHE4_LOCKED(rp);
510 		mutex_exit(&rp->r_statelock);
511 		return;
512 	}
513 
514 	if (rp->r_time_attr_saved > t) {
515 		/*
516 		 * Attributes have been cached since these attributes were
517 		 * probably made. If there is an inconsistency in what is
518 		 * cached, mark them invalid. If not, don't act on them.
519 		 */
520 		if (!CACHE4_VALID(rp, vap->va_mtime, vap->va_size))
521 			PURGE_ATTRCACHE4_LOCKED(rp);
522 		mutex_exit(&rp->r_statelock);
523 		return;
524 	}
525 	set_time_cache_inval = 0;
526 	if (cinfo) {
527 		/*
528 		 * Only directory modifying callers pass non-NULL cinfo.
529 		 */
530 		ASSERT(vp->v_type == VDIR);
531 		/*
532 		 * If the cache timeout either doesn't exist or hasn't expired,
533 		 * and dir didn't change on server before dirmod op
534 		 * and dir didn't change after dirmod op but before getattr
535 		 * then there's a chance that the client's cached data for
536 		 * this object is current (not stale).  No immediate cache
537 		 * flush is required.
538 		 *
539 		 */
540 		if ((! rp->r_time_cache_inval || t < rp->r_time_cache_inval) &&
541 		    cinfo->before == rp->r_change &&
542 		    (garp->n4g_change_valid &&
543 		    cinfo->after == garp->n4g_change)) {
544 
545 			/*
546 			 * If atomic isn't set, then the before/after info
547 			 * cannot be blindly trusted.  For this case, we tell
548 			 * nfs4_attrcache_va to cache the attrs but also
549 			 * establish an absolute maximum cache timeout.  When
550 			 * the timeout is reached, caches will be flushed.
551 			 */
552 			if (! cinfo->atomic)
553 				set_time_cache_inval = 1;
554 		} else {
555 
556 			/*
557 			 * We're not sure exactly what changed, but we know
558 			 * what to do.  flush all caches for dir.  remove the
559 			 * attr timeout.
560 			 *
561 			 * a) timeout expired.  flush all caches.
562 			 * b) r_change != cinfo.before.  flush all caches.
563 			 * c) r_change == cinfo.before, but cinfo.after !=
564 			 *    post-op getattr(change).  flush all caches.
565 			 * d) post-op getattr(change) not provided by server.
566 			 *    flush all caches.
567 			 */
568 			mtime_changed = 1;
569 			ctime_changed = 1;
570 			rp->r_time_cache_inval = 0;
571 		}
572 	} else {
573 		/*
574 		 * Write thread after writing data to file on remote server,
575 		 * will always set R4WRITEMODIFIED to indicate that file on
576 		 * remote server was modified with a WRITE operation and would
577 		 * have marked attribute cache as timed out. If R4WRITEMODIFIED
578 		 * is set, then do not check for mtime and ctime change.
579 		 */
580 		if (!(rp->r_flags & R4WRITEMODIFIED)) {
581 			if (!CACHE4_VALID(rp, vap->va_mtime, vap->va_size))
582 				mtime_changed = 1;
583 
584 			if (rp->r_attr.va_ctime.tv_sec !=
585 			    vap->va_ctime.tv_sec ||
586 			    rp->r_attr.va_ctime.tv_nsec !=
587 			    vap->va_ctime.tv_nsec)
588 				ctime_changed = 1;
589 		} else {
590 			writemodify_set = B_TRUE;
591 		}
592 	}
593 
	/* Remember r_size so a size change made by the call below shows. */
594 	preattr_rsize = rp->r_size;
595 
596 	nfs4_attrcache_va(vp, garp, set_time_cache_inval);
597 
598 	/*
599 	 * If we have updated filesize in nfs4_attrcache_va, as soon as we
600 	 * drop statelock we will be in transition of purging all
601 	 * our caches and updating them. It is possible for another
602 	 * thread to pick this new file size and read in zeroed data.
603 	 * stall other threads till cache purge is complete.
604 	 */
605 	if ((!cinfo) && (rp->r_size != preattr_rsize)) {
606 		/*
607 		 * If R4WRITEMODIFIED was set and we have updated the file
608 		 * size, Server's returned file size need not necessarily
609 		 * be because of this Client's WRITE. We need to purge
610 		 * all caches.
611 		 */
612 		if (writemodify_set)
613 			mtime_changed = 1;
614 
		/*
		 * cachepurge_set remembers that this call set
		 * R4INCACHEPURGE, so only this call clears it below.
		 */
615 		if (mtime_changed && !(rp->r_flags & R4INCACHEPURGE)) {
616 			rp->r_flags |= R4INCACHEPURGE;
617 			cachepurge_set = B_TRUE;
618 		}
619 	}
620 
	/* Nothing changed: the attributes are cached and we are done. */
621 	if (!mtime_changed && !ctime_changed) {
622 		mutex_exit(&rp->r_statelock);
623 		return;
624 	}
625 
	/*
	 * Take the r_serial pseudo lock so that the invalidation below,
	 * done without r_statelock, is serialized against other callers.
	 */
626 	rp->r_serial = curthread;
627 
628 	mutex_exit(&rp->r_statelock);
629 
630 	/*
631 	 * If we're the recov thread, then force async nfs4_purge_caches
632 	 * to avoid potential deadlock.
633 	 */
634 	if (mtime_changed)
635 		nfs4_purge_caches(vp, NFS4_NOPURGE_DNLC, cr, recov ? 1 : async);
636 
637 	if ((rp->r_flags & R4INCACHEPURGE) && cachepurge_set) {
638 		mutex_enter(&rp->r_statelock);
639 		rp->r_flags &= ~R4INCACHEPURGE;
640 		cv_broadcast(&rp->r_cv);
641 		mutex_exit(&rp->r_statelock);
642 		cachepurge_set = B_FALSE;
643 	}
644 
	/*
	 * A ctime change invalidates the access cache and the cached
	 * ACL.  r_secattr is detached under r_statelock and freed after
	 * the lock is dropped.
	 */
645 	if (ctime_changed) {
646 		(void) nfs4_access_purge_rp(rp);
647 		if (rp->r_secattr != NULL) {
648 			mutex_enter(&rp->r_statelock);
649 			vsp = rp->r_secattr;
650 			rp->r_secattr = NULL;
651 			mutex_exit(&rp->r_statelock);
652 			if (vsp != NULL)
653 				nfs4_acl_free_cache(vsp);
654 		}
655 	}
656 
	/* Release r_serial only if this call was the one that took it. */
657 	if (!was_serial) {
658 		mutex_enter(&rp->r_statelock);
659 		rp->r_serial = NULL;
660 		cv_broadcast(&rp->r_cv);
661 		mutex_exit(&rp->r_statelock);
662 	}
663 }
664 
665 /*
666  * Set attributes cache for given vnode using virtual attributes.
667  *
668  * Set the timeout value on the attribute cache and fill it
669  * with the passed in attributes.
670  *
671  * The caller must be holding r_statelock.
 *
 * Arguments:
 *	vp	vnode to cache attributes for; a shadow vnode is replaced
 *		by its master vnode before v_flag and v_type are examined.
 *	garp	attributes to cache; n4g_va.va_mask must be AT_ALL
 *		(asserted).
 *	set_cache_timeout
 *		if nonzero and no r_time_cache_inval is currently set,
 *		establish one at now + mi_acdirmax.
 *
 * Besides r_attr and the attribute timers, this may update r_change,
 * r_mntd_fid, r_pathconf and r_size, and it always clears
 * R4WRITEMODIFIED.
672  */
673 static void
674 nfs4_attrcache_va(vnode_t *vp, nfs4_ga_res_t *garp, int set_cache_timeout)
675 {
676 	rnode4_t *rp;
677 	mntinfo4_t *mi;
678 	hrtime_t delta;
679 	hrtime_t now;
680 	vattr_t *vap = &garp->n4g_va;
681 
682 	rp = VTOR4(vp);
683 
684 	ASSERT(MUTEX_HELD(&rp->r_statelock));
685 	ASSERT(vap->va_mask == AT_ALL);
686 
687 	/* Switch to master before checking v_flag */
688 	if (IS_SHADOW(vp, rp))
689 		vp = RTOV4(rp);
690 
691 	now = gethrtime();
692 
693 	mi = VTOMI4(vp);
694 
695 	/*
696 	 * Only establish a new cache timeout (if requested).  Never
697 	 * extend a timeout.  Never clear a timeout.  Clearing a timeout
698 	 * is done by nfs4_update_dircaches (ancestor in our call chain)
	 *
	 * NOTE(review): nfs4_attr_cache() in this file also resets
	 * r_time_cache_inval to 0 before calling here -- confirm that
	 * the nfs4_update_dircaches reference is still accurate.
699 	 */
700 	if (set_cache_timeout && ! rp->r_time_cache_inval)
701 		rp->r_time_cache_inval = now + mi->mi_acdirmax;
702 
703 	/*
704 	 * Delta is the number of nanoseconds that we will
705 	 * cache the attributes of the file.  It is based on
706 	 * the number of nanoseconds since the last time that
707 	 * we detected a change.  The assumption is that files
708 	 * that changed recently are likely to change again.
709 	 * There is a minimum and a maximum for regular files
710 	 * and for directories which is enforced though.
711 	 *
712 	 * Using the time since last change was detected
713 	 * eliminates direct comparison or calculation
714 	 * using mixed client and server times.  NFS does
715 	 * not make any assumptions regarding the client
716 	 * and server clocks being synchronized.
717 	 */
718 	if (vap->va_mtime.tv_sec != rp->r_attr.va_mtime.tv_sec ||
719 	    vap->va_mtime.tv_nsec != rp->r_attr.va_mtime.tv_nsec ||
720 	    vap->va_size != rp->r_attr.va_size) {
721 		rp->r_time_attr_saved = now;
722 	}
723 
	/*
	 * No attribute caching (mount-wide MI4_NOAC or per-vnode VNOCACHE)
	 * means the attributes expire immediately; otherwise clamp delta
	 * to the directory or regular-file min/max of the mount.
	 */
724 	if ((mi->mi_flags & MI4_NOAC) || (vp->v_flag & VNOCACHE))
725 		delta = 0;
726 	else {
727 		delta = now - rp->r_time_attr_saved;
728 		if (vp->v_type == VDIR) {
729 			if (delta < mi->mi_acdirmin)
730 				delta = mi->mi_acdirmin;
731 			else if (delta > mi->mi_acdirmax)
732 				delta = mi->mi_acdirmax;
733 		} else {
734 			if (delta < mi->mi_acregmin)
735 				delta = mi->mi_acregmin;
736 			else if (delta > mi->mi_acregmax)
737 				delta = mi->mi_acregmax;
738 		}
739 	}
740 	rp->r_time_attr_inval = now + delta;
741 
742 	rp->r_attr = *vap;
743 	if (garp->n4g_change_valid)
744 		rp->r_change = garp->n4g_change;
745 
746 	/*
747 	 * The attributes that were returned may be valid and can
748 	 * be used, but they may not be allowed to be cached.
749 	 * Reset the timers to cause immediate invalidation and
750 	 * clear r_change so no VERIFY operations will succeed
751 	 */
752 	if (garp->n4g_attrwhy == NFS4_GETATTR_NOCACHE_OK) {
753 		rp->r_time_attr_inval = now;
754 		rp->r_time_attr_saved = now;
755 		rp->r_change = 0;
756 	}
757 
758 	/*
759 	 * If mounted_on_fileid returned AND the object is a stub,
760 	 * then set object's va_nodeid to the mounted over fid
761 	 * returned by server.
762 	 *
763 	 * If mounted_on_fileid not provided/supported, then
764 	 * just set it to 0 for now.  Eventually it would be
765 	 * better to set it to a hashed version of FH.  This
766 	 * would probably be good enough to provide a unique
767 	 * fid/d_ino within a dir.
768 	 *
769 	 * We don't need to carry mounted_on_fileid in the
770 	 * rnode as long as the client never requests fileid
771 	 * without also requesting mounted_on_fileid.  For
772 	 * now, it stays.
773 	 */
774 	if (garp->n4g_mon_fid_valid) {
775 		rp->r_mntd_fid = garp->n4g_mon_fid;
776 
777 		if (RP_ISSTUB(rp))
778 			rp->r_attr.va_nodeid = rp->r_mntd_fid;
779 	}
780 
781 	/*
782 	 * Check to see if there are valid pathconf bits to
783 	 * cache in the rnode.
	 *
	 * A fully valid pathconf result replaces r_pathconf; otherwise
	 * only the xattr-exists information is taken, if it is valid.
784 	 */
785 	if (garp->n4g_ext_res) {
786 		if (garp->n4g_ext_res->n4g_pc4.pc4_cache_valid) {
787 			rp->r_pathconf = garp->n4g_ext_res->n4g_pc4;
788 		} else {
789 			if (garp->n4g_ext_res->n4g_pc4.pc4_xattr_valid) {
790 				rp->r_pathconf.pc4_xattr_valid = TRUE;
791 				rp->r_pathconf.pc4_xattr_exists =
792 				    garp->n4g_ext_res->n4g_pc4.pc4_xattr_exists;
793 			}
794 		}
795 	}
796 	/*
797 	 * Update the size of the file if there is no cached data or if
798 	 * the cached data is clean and there is no data being written
799 	 * out.
800 	 */
801 	if (rp->r_size != vap->va_size &&
802 	    (!vn_has_cached_data(vp) ||
803 	    (!(rp->r_flags & R4DIRTY) && rp->r_count == 0))) {
804 		rp->r_size = vap->va_size;
805 	}
806 	nfs_setswaplike(vp, vap);
	/* Fresh attributes are now cached; drop the WRITE-modified mark. */
807 	rp->r_flags &= ~R4WRITEMODIFIED;
808 }
809 
810 /*
811  * Get attributes over-the-wire and update attributes cache
812  * if no error occurred in the over-the-wire operation.
813  * Return 0 if successful, otherwise error.
814  */
815 int
816 nfs4_getattr_otw(vnode_t *vp, nfs4_ga_res_t *garp, cred_t *cr, int get_acl)
817 {
818 	mntinfo4_t *mi = VTOMI4(vp);
819 	hrtime_t t;
820 	nfs4_recov_state_t recov_state;
821 	nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };
822 
823 	recov_state.rs_flags = 0;
824 	recov_state.rs_num_retry_despite_err = 0;
825 
826 	/* Save the original mount point security flavor */
827 	(void) save_mnt_secinfo(mi->mi_curr_serv);
828 
829 recov_retry:
830 
831 	if ((e.error = nfs4_start_fop(mi, vp, NULL, OH_GETATTR,
832 	    &recov_state, NULL))) {
833 		(void) check_mnt_secinfo(mi->mi_curr_serv, vp);
834 		return (e.error);
835 	}
836 
837 	t = gethrtime();
838 
839 	nfs4_getattr_otw_norecovery(vp, garp, &e, cr, get_acl);
840 
841 	if (nfs4_needs_recovery(&e, FALSE, vp->v_vfsp)) {
842 		if (nfs4_start_recovery(&e, VTOMI4(vp), vp, NULL, NULL,
843 		    NULL, OP_GETATTR, NULL) == FALSE)  {
844 			nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_GETATTR,
845 			    &recov_state, 1);
846 			goto recov_retry;
847 		}
848 	}
849 
850 	nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_GETATTR, &recov_state, 0);
851 
852 	if (!e.error) {
853 		if (e.stat == NFS4_OK) {
854 			nfs4_attr_cache(vp, garp, t, cr, FALSE, NULL);
855 		} else {
856 			e.error = geterrno4(e.stat);
857 
858 			nfs4_purge_stale_fh(e.error, vp, cr);
859 		}
860 	}
861 
862 	/*
863 	 * If getattr a node that is a stub for a crossed
864 	 * mount point, keep the original secinfo flavor for
865 	 * the current file system, not the crossed one.
866 	 */
867 	(void) check_mnt_secinfo(mi->mi_curr_serv, vp);
868 
869 	return (e.error);
870 }
871 
872 /*
873  * Generate a compound to get attributes over-the-wire.
874  */
875 void
876 nfs4_getattr_otw_norecovery(vnode_t *vp, nfs4_ga_res_t *garp,
877     nfs4_error_t *ep, cred_t *cr, int get_acl)
878 {
879 	COMPOUND4args_clnt args;
880 	COMPOUND4res_clnt res;
881 	int doqueue;
882 	rnode4_t *rp = VTOR4(vp);
883 	nfs_argop4 argop[2];
884 
885 	args.ctag = TAG_GETATTR;
886 
887 	args.array_len = 2;
888 	args.array = argop;
889 
890 	/* putfh */
891 	argop[0].argop = OP_CPUTFH;
892 	argop[0].nfs_argop4_u.opcputfh.sfh = rp->r_fh;
893 
894 	/* getattr */
895 	/*
896 	 * Unlike nfs version 2 and 3, where getattr returns all the
897 	 * attributes, nfs version 4 returns only the ones explicitly
898 	 * asked for. This creates problems, as some system functions
899 	 * (e.g. cache check) require certain attributes and if the
900 	 * cached node lacks some attributes such as uid/gid, it can
901 	 * affect system utilities (e.g. "ls") that rely on the information
902 	 * to be there. This can lead to anything from system crashes to
903 	 * corrupted information processed by user apps.
904 	 * So to ensure that all bases are covered, request at least
905 	 * the AT_ALL attribute mask.
906 	 */
907 	argop[1].argop = OP_GETATTR;
908 	argop[1].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK;
909 	if (get_acl)
910 		argop[1].nfs_argop4_u.opgetattr.attr_request |= FATTR4_ACL_MASK;
911 	argop[1].nfs_argop4_u.opgetattr.mi = VTOMI4(vp);
912 
913 	doqueue = 1;
914 
915 	rfs4call(VTOMI4(vp), &args, &res, cr, &doqueue, 0, ep);
916 
917 	if (ep->error)
918 		return;
919 
920 	if (res.status != NFS4_OK) {
921 		(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
922 		return;
923 	}
924 
925 	*garp = res.array[1].nfs_resop4_u.opgetattr.ga_res;
926 
927 	(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
928 }
929 
930 /*
931  * Return either cached or remote attributes. If get remote attr
932  * use them to check and invalidate caches, then cache the new attributes.
933  */
934 int
935 nfs4getattr(vnode_t *vp, vattr_t *vap, cred_t *cr)
936 {
937 	int error;
938 	rnode4_t *rp;
939 	nfs4_ga_res_t gar;
940 
941 	ASSERT(nfs4_consistent_type(vp));
942 
943 	/*
944 	 * If we've got cached attributes, we're done, otherwise go
945 	 * to the server to get attributes, which will update the cache
946 	 * in the process. Either way, use the cached attributes for
947 	 * the caller's vattr_t.
948 	 *
949 	 * Note that we ignore the gar set by the OTW call: the attr caching
950 	 * code may make adjustments when storing to the rnode, and we want
951 	 * to see those changes here.
952 	 */
953 	rp = VTOR4(vp);
954 	error = 0;
955 	mutex_enter(&rp->r_statelock);
956 	if (!ATTRCACHE4_VALID(vp)) {
957 		mutex_exit(&rp->r_statelock);
958 		error = nfs4_getattr_otw(vp, &gar, cr, 0);
959 		mutex_enter(&rp->r_statelock);
960 	}
961 
962 	if (!error)
963 		*vap = rp->r_attr;
964 
965 	/* Return the client's view of file size */
966 	vap->va_size = rp->r_size;
967 
968 	mutex_exit(&rp->r_statelock);
969 
970 	ASSERT(nfs4_consistent_type(vp));
971 
972 	return (error);
973 }
974 
975 int
976 nfs4_attr_otw(vnode_t *vp, nfs4_tag_type_t tag_type,
977     nfs4_ga_res_t *garp, bitmap4 reqbitmap, cred_t *cr)
978 {
979 	COMPOUND4args_clnt args;
980 	COMPOUND4res_clnt res;
981 	int doqueue;
982 	nfs_argop4 argop[2];
983 	mntinfo4_t *mi = VTOMI4(vp);
984 	bool_t needrecov = FALSE;
985 	nfs4_recov_state_t recov_state;
986 	nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };
987 	nfs4_ga_ext_res_t *gerp;
988 
989 	recov_state.rs_flags = 0;
990 	recov_state.rs_num_retry_despite_err = 0;
991 
992 recov_retry:
993 	args.ctag = tag_type;
994 
995 	args.array_len = 2;
996 	args.array = argop;
997 
998 	e.error = nfs4_start_fop(mi, vp, NULL, OH_GETATTR, &recov_state, NULL);
999 	if (e.error)
1000 		return (e.error);
1001 
1002 	/* putfh */
1003 	argop[0].argop = OP_CPUTFH;
1004 	argop[0].nfs_argop4_u.opcputfh.sfh = VTOR4(vp)->r_fh;
1005 
1006 	/* getattr */
1007 	argop[1].argop = OP_GETATTR;
1008 	argop[1].nfs_argop4_u.opgetattr.attr_request = reqbitmap;
1009 	argop[1].nfs_argop4_u.opgetattr.mi = mi;
1010 
1011 	doqueue = 1;
1012 
1013 	NFS4_DEBUG(nfs4_client_call_debug, (CE_NOTE,
1014 	    "nfs4_attr_otw: %s call, rp %s", needrecov ? "recov" : "first",
1015 	    rnode4info(VTOR4(vp))));
1016 
1017 	rfs4call(mi, &args, &res, cr, &doqueue, 0, &e);
1018 
1019 	needrecov = nfs4_needs_recovery(&e, FALSE, vp->v_vfsp);
1020 	if (!needrecov && e.error) {
1021 		nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_GETATTR, &recov_state,
1022 		    needrecov);
1023 		return (e.error);
1024 	}
1025 
1026 	if (needrecov) {
1027 		bool_t abort;
1028 
1029 		NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
1030 		    "nfs4_attr_otw: initiating recovery\n"));
1031 
1032 		abort = nfs4_start_recovery(&e, VTOMI4(vp), vp, NULL, NULL,
1033 		    NULL, OP_GETATTR, NULL);
1034 		nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_GETATTR, &recov_state,
1035 		    needrecov);
1036 		if (!e.error) {
1037 			(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
1038 			e.error = geterrno4(res.status);
1039 		}
1040 		if (abort == FALSE)
1041 			goto recov_retry;
1042 		return (e.error);
1043 	}
1044 
1045 	if (res.status) {
1046 		e.error = geterrno4(res.status);
1047 	} else {
1048 		gerp = garp->n4g_ext_res;
1049 		bcopy(&res.array[1].nfs_resop4_u.opgetattr.ga_res,
1050 		    garp, sizeof (nfs4_ga_res_t));
1051 		garp->n4g_ext_res = gerp;
1052 		if (garp->n4g_ext_res &&
1053 		    res.array[1].nfs_resop4_u.opgetattr.ga_res.n4g_ext_res)
1054 			bcopy(res.array[1].nfs_resop4_u.opgetattr.
1055 			    ga_res.n4g_ext_res,
1056 			    garp->n4g_ext_res, sizeof (nfs4_ga_ext_res_t));
1057 	}
1058 	(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
1059 	nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_GETATTR, &recov_state,
1060 	    needrecov);
1061 	return (e.error);
1062 }
1063 
1064 /*
1065  * Asynchronous I/O parameters.  nfs_async_threads is the high-water mark
1066  * for the demand-based allocation of async threads per-mount.  The
1067  * nfs_async_timeout is the amount of time a thread will live after it
1068  * becomes idle, unless new I/O requests are received before the thread
1069  * dies.  See nfs4_async_putpage and nfs4_async_start.
1070  */
1071 
1072 static void	nfs4_async_start(struct vfs *);
1073 
1074 static void
1075 free_async_args4(struct nfs4_async_reqs *args)
1076 {
1077 	rnode4_t *rp;
1078 
1079 	if (args->a_io != NFS4_INACTIVE) {
1080 		rp = VTOR4(args->a_vp);
1081 		mutex_enter(&rp->r_statelock);
1082 		rp->r_count--;
1083 		if (args->a_io == NFS4_PUTAPAGE ||
1084 		    args->a_io == NFS4_PAGEIO)
1085 			rp->r_awcount--;
1086 		cv_broadcast(&rp->r_cv);
1087 		mutex_exit(&rp->r_statelock);
1088 		VN_RELE(args->a_vp);
1089 	}
1090 	crfree(args->a_cred);
1091 	kmem_free(args, sizeof (*args));
1092 }
1093 
1094 /*
1095  * Cross-zone thread creation and NFS access is disallowed, yet fsflush() and
1096  * pageout(), running in the global zone, have legitimate reasons to do
1097  * VOP_PUTPAGE(B_ASYNC) on other zones' NFS mounts.  We avoid the problem by
1098  * use of a a per-mount "asynchronous requests manager thread" which is
1099  * signaled by the various asynchronous work routines when there is
1100  * asynchronous work to be done.  It is responsible for creating new
1101  * worker threads if necessary, and notifying existing worker threads
1102  * that there is work to be done.
1103  *
1104  * In other words, it will "take the specifications from the customers and
1105  * give them to the engineers."
1106  *
1107  * Worker threads die off of their own accord if they are no longer
1108  * needed.
1109  *
1110  * This thread is killed when the zone is going away or the filesystem
1111  * is being unmounted.
1112  */
1113 void
1114 nfs4_async_manager(vfs_t *vfsp)
1115 {
1116 	callb_cpr_t cprinfo;
1117 	mntinfo4_t *mi;
1118 	uint_t max_threads;
1119 
1120 	mi = VFTOMI4(vfsp);
1121 
1122 	CALLB_CPR_INIT(&cprinfo, &mi->mi_async_lock, callb_generic_cpr,
1123 	    "nfs4_async_manager");
1124 
1125 	mutex_enter(&mi->mi_async_lock);
1126 	/*
1127 	 * We want to stash the max number of threads that this mount was
1128 	 * allowed so we can use it later when the variable is set to zero as
1129 	 * part of the zone/mount going away.
1130 	 *
1131 	 * We want to be able to create at least one thread to handle
1132 	 * asyncrhonous inactive calls.
1133 	 */
1134 	max_threads = MAX(mi->mi_max_threads, 1);
1135 	mutex_enter(&mi->mi_lock);
1136 	/*
1137 	 * We don't want to wait for mi_max_threads to go to zero, since that
1138 	 * happens as part of a failed unmount, but this thread should only
1139 	 * exit when the mount is really going away.
1140 	 *
1141 	 * Once MI4_ASYNC_MGR_STOP is set, no more async operations will be
1142 	 * attempted: the various _async_*() functions know to do things
1143 	 * inline if mi_max_threads == 0.  Henceforth we just drain out the
1144 	 * outstanding requests.
1145 	 *
1146 	 * Note that we still create zthreads even if we notice the zone is
1147 	 * shutting down (MI4_ASYNC_MGR_STOP is set); this may cause the zone
1148 	 * shutdown sequence to take slightly longer in some cases, but
1149 	 * doesn't violate the protocol, as all threads will exit as soon as
1150 	 * they're done processing the remaining requests.
1151 	 */
1152 	while (!(mi->mi_flags & MI4_ASYNC_MGR_STOP) ||
1153 	    mi->mi_async_req_count > 0) {
1154 		mutex_exit(&mi->mi_lock);
1155 		CALLB_CPR_SAFE_BEGIN(&cprinfo);
1156 		cv_wait(&mi->mi_async_reqs_cv, &mi->mi_async_lock);
1157 		CALLB_CPR_SAFE_END(&cprinfo, &mi->mi_async_lock);
1158 		while (mi->mi_async_req_count > 0) {
1159 			/*
1160 			 * Paranoia: If the mount started out having
1161 			 * (mi->mi_max_threads == 0), and the value was
1162 			 * later changed (via a debugger or somesuch),
1163 			 * we could be confused since we will think we
1164 			 * can't create any threads, and the calling
1165 			 * code (which looks at the current value of
1166 			 * mi->mi_max_threads, now non-zero) thinks we
1167 			 * can.
1168 			 *
1169 			 * So, because we're paranoid, we create threads
1170 			 * up to the maximum of the original and the
1171 			 * current value. This means that future
1172 			 * (debugger-induced) alterations of
1173 			 * mi->mi_max_threads are ignored for our
1174 			 * purposes, but who told them they could change
1175 			 * random values on a live kernel anyhow?
1176 			 */
1177 			if (mi->mi_threads <
1178 			    MAX(mi->mi_max_threads, max_threads)) {
1179 				mi->mi_threads++;
1180 				mutex_exit(&mi->mi_async_lock);
1181 				MI4_HOLD(mi);
1182 				VFS_HOLD(vfsp);	/* hold for new thread */
1183 				(void) zthread_create(NULL, 0, nfs4_async_start,
1184 				    vfsp, 0, minclsyspri);
1185 				mutex_enter(&mi->mi_async_lock);
1186 			}
1187 			cv_signal(&mi->mi_async_work_cv);
1188 			ASSERT(mi->mi_async_req_count != 0);
1189 			mi->mi_async_req_count--;
1190 		}
1191 		mutex_enter(&mi->mi_lock);
1192 	}
1193 	mutex_exit(&mi->mi_lock);
1194 
1195 	NFS4_DEBUG(nfs4_client_zone_debug, (CE_NOTE,
1196 	    "nfs4_async_manager exiting for vfs %p\n", (void *)mi->mi_vfsp));
1197 	/*
1198 	 * Let everyone know we're done.
1199 	 */
1200 	mi->mi_manager_thread = NULL;
1201 	/*
1202 	 * Wake up the inactive thread.
1203 	 */
1204 	cv_broadcast(&mi->mi_inact_req_cv);
1205 	/*
1206 	 * Wake up anyone sitting in nfs4_async_manager_stop()
1207 	 */
1208 	cv_broadcast(&mi->mi_async_cv);
1209 	/*
1210 	 * There is no explicit call to mutex_exit(&mi->mi_async_lock)
1211 	 * since CALLB_CPR_EXIT is actually responsible for releasing
1212 	 * 'mi_async_lock'.
1213 	 */
1214 	CALLB_CPR_EXIT(&cprinfo);
1215 	VFS_RELE(vfsp);	/* release thread's hold */
1216 	MI4_RELE(mi);
1217 	zthread_exit();
1218 }
1219 
1220 /*
1221  * Signal (and wait for) the async manager thread to clean up and go away.
1222  */
1223 void
1224 nfs4_async_manager_stop(vfs_t *vfsp)
1225 {
1226 	mntinfo4_t *mi = VFTOMI4(vfsp);
1227 
1228 	mutex_enter(&mi->mi_async_lock);
1229 	mutex_enter(&mi->mi_lock);
1230 	mi->mi_flags |= MI4_ASYNC_MGR_STOP;
1231 	mutex_exit(&mi->mi_lock);
1232 	cv_broadcast(&mi->mi_async_reqs_cv);
1233 	/*
1234 	 * Wait for the async manager thread to die.
1235 	 */
1236 	while (mi->mi_manager_thread != NULL)
1237 		cv_wait(&mi->mi_async_cv, &mi->mi_async_lock);
1238 	mutex_exit(&mi->mi_async_lock);
1239 }
1240 
1241 int
1242 nfs4_async_readahead(vnode_t *vp, u_offset_t blkoff, caddr_t addr,
1243     struct seg *seg, cred_t *cr, void (*readahead)(vnode_t *,
1244     u_offset_t, caddr_t, struct seg *, cred_t *))
1245 {
1246 	rnode4_t *rp;
1247 	mntinfo4_t *mi;
1248 	struct nfs4_async_reqs *args;
1249 
1250 	rp = VTOR4(vp);
1251 	ASSERT(rp->r_freef == NULL);
1252 
1253 	mi = VTOMI4(vp);
1254 
1255 	/*
1256 	 * If addr falls in a different segment, don't bother doing readahead.
1257 	 */
1258 	if (addr >= seg->s_base + seg->s_size)
1259 		return (-1);
1260 
1261 	/*
1262 	 * If we can't allocate a request structure, punt on the readahead.
1263 	 */
1264 	if ((args = kmem_alloc(sizeof (*args), KM_NOSLEEP)) == NULL)
1265 		return (-1);
1266 
1267 	/*
1268 	 * If a lock operation is pending, don't initiate any new
1269 	 * readaheads.  Otherwise, bump r_count to indicate the new
1270 	 * asynchronous I/O.
1271 	 */
1272 	if (!nfs_rw_tryenter(&rp->r_lkserlock, RW_READER)) {
1273 		kmem_free(args, sizeof (*args));
1274 		return (-1);
1275 	}
1276 	mutex_enter(&rp->r_statelock);
1277 	rp->r_count++;
1278 	mutex_exit(&rp->r_statelock);
1279 	nfs_rw_exit(&rp->r_lkserlock);
1280 
1281 	args->a_next = NULL;
1282 #ifdef DEBUG
1283 	args->a_queuer = curthread;
1284 #endif
1285 	VN_HOLD(vp);
1286 	args->a_vp = vp;
1287 	ASSERT(cr != NULL);
1288 	crhold(cr);
1289 	args->a_cred = cr;
1290 	args->a_io = NFS4_READ_AHEAD;
1291 	args->a_nfs4_readahead = readahead;
1292 	args->a_nfs4_blkoff = blkoff;
1293 	args->a_nfs4_seg = seg;
1294 	args->a_nfs4_addr = addr;
1295 
1296 	mutex_enter(&mi->mi_async_lock);
1297 
1298 	/*
1299 	 * If asyncio has been disabled, don't bother readahead.
1300 	 */
1301 	if (mi->mi_max_threads == 0) {
1302 		mutex_exit(&mi->mi_async_lock);
1303 		goto noasync;
1304 	}
1305 
1306 	/*
1307 	 * Link request structure into the async list and
1308 	 * wakeup async thread to do the i/o.
1309 	 */
1310 	if (mi->mi_async_reqs[NFS4_READ_AHEAD] == NULL) {
1311 		mi->mi_async_reqs[NFS4_READ_AHEAD] = args;
1312 		mi->mi_async_tail[NFS4_READ_AHEAD] = args;
1313 	} else {
1314 		mi->mi_async_tail[NFS4_READ_AHEAD]->a_next = args;
1315 		mi->mi_async_tail[NFS4_READ_AHEAD] = args;
1316 	}
1317 
1318 	if (mi->mi_io_kstats) {
1319 		mutex_enter(&mi->mi_lock);
1320 		kstat_waitq_enter(KSTAT_IO_PTR(mi->mi_io_kstats));
1321 		mutex_exit(&mi->mi_lock);
1322 	}
1323 
1324 	mi->mi_async_req_count++;
1325 	ASSERT(mi->mi_async_req_count != 0);
1326 	cv_signal(&mi->mi_async_reqs_cv);
1327 	mutex_exit(&mi->mi_async_lock);
1328 	return (0);
1329 
1330 noasync:
1331 	mutex_enter(&rp->r_statelock);
1332 	rp->r_count--;
1333 	cv_broadcast(&rp->r_cv);
1334 	mutex_exit(&rp->r_statelock);
1335 	VN_RELE(vp);
1336 	crfree(cr);
1337 	kmem_free(args, sizeof (*args));
1338 	return (-1);
1339 }
1340 
1341 /*
1342  * The async queues for each mounted file system are arranged as a
1343  * set of queues, one for each async i/o type.  Requests are taken
1344  * from the queues in a round-robin fashion.  A number of consecutive
1345  * requests are taken from each queue before moving on to the next
1346  * queue.  This functionality may allow the NFS Version 2 server to do
1347  * write clustering, even if the client is mixing writes and reads
1348  * because it will take multiple write requests from the queue
1349  * before processing any of the other async i/o types.
1350  *
1351  * XXX The nfs4_async_start thread is unsafe in the light of the present
1352  * model defined by cpr to suspend the system. Specifically over the
1353  * wire calls are cpr-unsafe. The thread should be reevaluated in
1354  * case of future updates to the cpr model.
1355  */
1356 static void
1357 nfs4_async_start(struct vfs *vfsp)
1358 {
1359 	struct nfs4_async_reqs *args;
1360 	mntinfo4_t *mi = VFTOMI4(vfsp);
1361 	clock_t time_left = 1;
1362 	callb_cpr_t cprinfo;
1363 	int i;
1364 	extern int nfs_async_timeout;
1365 
1366 	/*
1367 	 * Dynamic initialization of nfs_async_timeout to allow nfs to be
1368 	 * built in an implementation independent manner.
1369 	 */
1370 	if (nfs_async_timeout == -1)
1371 		nfs_async_timeout = NFS_ASYNC_TIMEOUT;
1372 
1373 	CALLB_CPR_INIT(&cprinfo, &mi->mi_async_lock, callb_generic_cpr, "nas");
1374 
1375 	mutex_enter(&mi->mi_async_lock);
1376 	for (;;) {
1377 		/*
1378 		 * Find the next queue containing an entry.  We start
1379 		 * at the current queue pointer and then round robin
1380 		 * through all of them until we either find a non-empty
1381 		 * queue or have looked through all of them.
1382 		 */
1383 		for (i = 0; i < NFS4_ASYNC_TYPES; i++) {
1384 			args = *mi->mi_async_curr;
1385 			if (args != NULL)
1386 				break;
1387 			mi->mi_async_curr++;
1388 			if (mi->mi_async_curr ==
1389 			    &mi->mi_async_reqs[NFS4_ASYNC_TYPES])
1390 				mi->mi_async_curr = &mi->mi_async_reqs[0];
1391 		}
1392 		/*
1393 		 * If we didn't find a entry, then block until woken up
1394 		 * again and then look through the queues again.
1395 		 */
1396 		if (args == NULL) {
1397 			/*
1398 			 * Exiting is considered to be safe for CPR as well
1399 			 */
1400 			CALLB_CPR_SAFE_BEGIN(&cprinfo);
1401 
1402 			/*
1403 			 * Wakeup thread waiting to unmount the file
1404 			 * system only if all async threads are inactive.
1405 			 *
1406 			 * If we've timed-out and there's nothing to do,
1407 			 * then get rid of this thread.
1408 			 */
1409 			if (mi->mi_max_threads == 0 || time_left <= 0) {
1410 				if (--mi->mi_threads == 0)
1411 					cv_signal(&mi->mi_async_cv);
1412 				CALLB_CPR_EXIT(&cprinfo);
1413 				VFS_RELE(vfsp);	/* release thread's hold */
1414 				MI4_RELE(mi);
1415 				zthread_exit();
1416 				/* NOTREACHED */
1417 			}
1418 			time_left = cv_timedwait(&mi->mi_async_work_cv,
1419 			    &mi->mi_async_lock, nfs_async_timeout + lbolt);
1420 
1421 			CALLB_CPR_SAFE_END(&cprinfo, &mi->mi_async_lock);
1422 
1423 			continue;
1424 		} else {
1425 			time_left = 1;
1426 		}
1427 
1428 		/*
1429 		 * Remove the request from the async queue and then
1430 		 * update the current async request queue pointer.  If
1431 		 * the current queue is empty or we have removed enough
1432 		 * consecutive entries from it, then reset the counter
1433 		 * for this queue and then move the current pointer to
1434 		 * the next queue.
1435 		 */
1436 		*mi->mi_async_curr = args->a_next;
1437 		if (*mi->mi_async_curr == NULL ||
1438 		    --mi->mi_async_clusters[args->a_io] == 0) {
1439 			mi->mi_async_clusters[args->a_io] =
1440 			    mi->mi_async_init_clusters;
1441 			mi->mi_async_curr++;
1442 			if (mi->mi_async_curr ==
1443 			    &mi->mi_async_reqs[NFS4_ASYNC_TYPES])
1444 				mi->mi_async_curr = &mi->mi_async_reqs[0];
1445 		}
1446 
1447 		if (args->a_io != NFS4_INACTIVE && mi->mi_io_kstats) {
1448 			mutex_enter(&mi->mi_lock);
1449 			kstat_waitq_exit(KSTAT_IO_PTR(mi->mi_io_kstats));
1450 			mutex_exit(&mi->mi_lock);
1451 		}
1452 
1453 		mutex_exit(&mi->mi_async_lock);
1454 
1455 		/*
1456 		 * Obtain arguments from the async request structure.
1457 		 */
1458 		if (args->a_io == NFS4_READ_AHEAD && mi->mi_max_threads > 0) {
1459 			(*args->a_nfs4_readahead)(args->a_vp,
1460 			    args->a_nfs4_blkoff, args->a_nfs4_addr,
1461 			    args->a_nfs4_seg, args->a_cred);
1462 		} else if (args->a_io == NFS4_PUTAPAGE) {
1463 			(void) (*args->a_nfs4_putapage)(args->a_vp,
1464 			    args->a_nfs4_pp, args->a_nfs4_off,
1465 			    args->a_nfs4_len, args->a_nfs4_flags,
1466 			    args->a_cred);
1467 		} else if (args->a_io == NFS4_PAGEIO) {
1468 			(void) (*args->a_nfs4_pageio)(args->a_vp,
1469 			    args->a_nfs4_pp, args->a_nfs4_off,
1470 			    args->a_nfs4_len, args->a_nfs4_flags,
1471 			    args->a_cred);
1472 		} else if (args->a_io == NFS4_READDIR) {
1473 			(void) ((*args->a_nfs4_readdir)(args->a_vp,
1474 			    args->a_nfs4_rdc, args->a_cred));
1475 		} else if (args->a_io == NFS4_COMMIT) {
1476 			(*args->a_nfs4_commit)(args->a_vp, args->a_nfs4_plist,
1477 			    args->a_nfs4_offset, args->a_nfs4_count,
1478 			    args->a_cred);
1479 		} else if (args->a_io == NFS4_INACTIVE) {
1480 			nfs4_inactive_otw(args->a_vp, args->a_cred);
1481 		}
1482 
1483 		/*
1484 		 * Now, release the vnode and free the credentials
1485 		 * structure.
1486 		 */
1487 		free_async_args4(args);
1488 		/*
1489 		 * Reacquire the mutex because it will be needed above.
1490 		 */
1491 		mutex_enter(&mi->mi_async_lock);
1492 	}
1493 }
1494 
1495 /*
1496  * nfs4_inactive_thread - look for vnodes that need over-the-wire calls as
1497  * part of VOP_INACTIVE.
1498  */
1499 
1500 void
1501 nfs4_inactive_thread(mntinfo4_t *mi)
1502 {
1503 	struct nfs4_async_reqs *args;
1504 	callb_cpr_t cprinfo;
1505 	vfs_t *vfsp = mi->mi_vfsp;
1506 
1507 	CALLB_CPR_INIT(&cprinfo, &mi->mi_async_lock, callb_generic_cpr,
1508 	    "nfs4_inactive_thread");
1509 
1510 	for (;;) {
1511 		mutex_enter(&mi->mi_async_lock);
1512 		args = mi->mi_async_reqs[NFS4_INACTIVE];
1513 		if (args == NULL) {
1514 			mutex_enter(&mi->mi_lock);
1515 			/*
1516 			 * We don't want to exit until the async manager is done
1517 			 * with its work; hence the check for mi_manager_thread
1518 			 * being NULL.
1519 			 *
1520 			 * The async manager thread will cv_broadcast() on
1521 			 * mi_inact_req_cv when it's done, at which point we'll
1522 			 * wake up and exit.
1523 			 */
1524 			if (mi->mi_manager_thread == NULL)
1525 				goto die;
1526 			mi->mi_flags |= MI4_INACTIVE_IDLE;
1527 			mutex_exit(&mi->mi_lock);
1528 			cv_signal(&mi->mi_async_cv);
1529 			CALLB_CPR_SAFE_BEGIN(&cprinfo);
1530 			cv_wait(&mi->mi_inact_req_cv, &mi->mi_async_lock);
1531 			CALLB_CPR_SAFE_END(&cprinfo, &mi->mi_async_lock);
1532 			mutex_exit(&mi->mi_async_lock);
1533 		} else {
1534 			mutex_enter(&mi->mi_lock);
1535 			mi->mi_flags &= ~MI4_INACTIVE_IDLE;
1536 			mutex_exit(&mi->mi_lock);
1537 			mi->mi_async_reqs[NFS4_INACTIVE] = args->a_next;
1538 			mutex_exit(&mi->mi_async_lock);
1539 			nfs4_inactive_otw(args->a_vp, args->a_cred);
1540 			crfree(args->a_cred);
1541 			kmem_free(args, sizeof (*args));
1542 		}
1543 	}
1544 die:
1545 	mutex_exit(&mi->mi_lock);
1546 	mi->mi_inactive_thread = NULL;
1547 	cv_signal(&mi->mi_async_cv);
1548 
1549 	/*
1550 	 * There is no explicit call to mutex_exit(&mi->mi_async_lock) since
1551 	 * CALLB_CPR_EXIT is actually responsible for releasing 'mi_async_lock'.
1552 	 */
1553 	CALLB_CPR_EXIT(&cprinfo);
1554 
1555 	NFS4_DEBUG(nfs4_client_zone_debug, (CE_NOTE,
1556 	    "nfs4_inactive_thread exiting for vfs %p\n", (void *)vfsp));
1557 
1558 	MI4_RELE(mi);
1559 	zthread_exit();
1560 	/* NOTREACHED */
1561 }
1562 
1563 /*
1564  * nfs_async_stop:
1565  * Wait for all outstanding putpage operations and the inactive thread to
1566  * complete; nfs4_async_stop_sig() without interruptibility.
1567  */
1568 void
1569 nfs4_async_stop(struct vfs *vfsp)
1570 {
1571 	mntinfo4_t *mi = VFTOMI4(vfsp);
1572 
1573 	/*
1574 	 * Wait for all outstanding async operations to complete and for
1575 	 * worker threads to exit.
1576 	 */
1577 	mutex_enter(&mi->mi_async_lock);
1578 	mi->mi_max_threads = 0;
1579 	cv_broadcast(&mi->mi_async_work_cv);
1580 	while (mi->mi_threads != 0)
1581 		cv_wait(&mi->mi_async_cv, &mi->mi_async_lock);
1582 
1583 	/*
1584 	 * Wait for the inactive thread to finish doing what it's doing.  It
1585 	 * won't exit until the last reference to the vfs_t goes away.
1586 	 */
1587 	if (mi->mi_inactive_thread != NULL) {
1588 		mutex_enter(&mi->mi_lock);
1589 		while (!(mi->mi_flags & MI4_INACTIVE_IDLE) ||
1590 		    (mi->mi_async_reqs[NFS4_INACTIVE] != NULL)) {
1591 			mutex_exit(&mi->mi_lock);
1592 			cv_wait(&mi->mi_async_cv, &mi->mi_async_lock);
1593 			mutex_enter(&mi->mi_lock);
1594 		}
1595 		mutex_exit(&mi->mi_lock);
1596 	}
1597 	mutex_exit(&mi->mi_async_lock);
1598 }
1599 
1600 /*
1601  * nfs_async_stop_sig:
1602  * Wait for all outstanding putpage operations and the inactive thread to
1603  * complete. If a signal is delivered we will abort and return non-zero;
1604  * otherwise return 0. Since this routine is called from nfs4_unmount, we
1605  * need to make it interruptible.
1606  */
1607 int
1608 nfs4_async_stop_sig(struct vfs *vfsp)
1609 {
1610 	mntinfo4_t *mi = VFTOMI4(vfsp);
1611 	ushort_t omax;
1612 	bool_t intr = FALSE;
1613 
1614 	/*
1615 	 * Wait for all outstanding putpage operations to complete and for
1616 	 * worker threads to exit.
1617 	 */
1618 	mutex_enter(&mi->mi_async_lock);
1619 	omax = mi->mi_max_threads;
1620 	mi->mi_max_threads = 0;
1621 	cv_broadcast(&mi->mi_async_work_cv);
1622 	while (mi->mi_threads != 0) {
1623 		if (!cv_wait_sig(&mi->mi_async_cv, &mi->mi_async_lock)) {
1624 			intr = TRUE;
1625 			goto interrupted;
1626 		}
1627 	}
1628 
1629 	/*
1630 	 * Wait for the inactive thread to finish doing what it's doing.  It
1631 	 * won't exit until the a last reference to the vfs_t goes away.
1632 	 */
1633 	if (mi->mi_inactive_thread != NULL) {
1634 		mutex_enter(&mi->mi_lock);
1635 		while (!(mi->mi_flags & MI4_INACTIVE_IDLE) ||
1636 		    (mi->mi_async_reqs[NFS4_INACTIVE] != NULL)) {
1637 			mutex_exit(&mi->mi_lock);
1638 			if (!cv_wait_sig(&mi->mi_async_cv,
1639 			    &mi->mi_async_lock)) {
1640 				intr = TRUE;
1641 				goto interrupted;
1642 			}
1643 			mutex_enter(&mi->mi_lock);
1644 		}
1645 		mutex_exit(&mi->mi_lock);
1646 	}
1647 interrupted:
1648 	if (intr)
1649 		mi->mi_max_threads = omax;
1650 	mutex_exit(&mi->mi_async_lock);
1651 
1652 	return (intr);
1653 }
1654 
1655 int
1656 nfs4_async_putapage(vnode_t *vp, page_t *pp, u_offset_t off, size_t len,
1657     int flags, cred_t *cr, int (*putapage)(vnode_t *, page_t *,
1658     u_offset_t, size_t, int, cred_t *))
1659 {
1660 	rnode4_t *rp;
1661 	mntinfo4_t *mi;
1662 	struct nfs4_async_reqs *args;
1663 
1664 	ASSERT(flags & B_ASYNC);
1665 	ASSERT(vp->v_vfsp != NULL);
1666 
1667 	rp = VTOR4(vp);
1668 	ASSERT(rp->r_count > 0);
1669 
1670 	mi = VTOMI4(vp);
1671 
1672 	/*
1673 	 * If we can't allocate a request structure, do the putpage
1674 	 * operation synchronously in this thread's context.
1675 	 */
1676 	if ((args = kmem_alloc(sizeof (*args), KM_NOSLEEP)) == NULL)
1677 		goto noasync;
1678 
1679 	args->a_next = NULL;
1680 #ifdef DEBUG
1681 	args->a_queuer = curthread;
1682 #endif
1683 	VN_HOLD(vp);
1684 	args->a_vp = vp;
1685 	ASSERT(cr != NULL);
1686 	crhold(cr);
1687 	args->a_cred = cr;
1688 	args->a_io = NFS4_PUTAPAGE;
1689 	args->a_nfs4_putapage = putapage;
1690 	args->a_nfs4_pp = pp;
1691 	args->a_nfs4_off = off;
1692 	args->a_nfs4_len = (uint_t)len;
1693 	args->a_nfs4_flags = flags;
1694 
1695 	mutex_enter(&mi->mi_async_lock);
1696 
1697 	/*
1698 	 * If asyncio has been disabled, then make a synchronous request.
1699 	 * This check is done a second time in case async io was diabled
1700 	 * while this thread was blocked waiting for memory pressure to
1701 	 * reduce or for the queue to drain.
1702 	 */
1703 	if (mi->mi_max_threads == 0) {
1704 		mutex_exit(&mi->mi_async_lock);
1705 
1706 		VN_RELE(vp);
1707 		crfree(cr);
1708 		kmem_free(args, sizeof (*args));
1709 		goto noasync;
1710 	}
1711 
1712 	/*
1713 	 * Link request structure into the async list and
1714 	 * wakeup async thread to do the i/o.
1715 	 */
1716 	if (mi->mi_async_reqs[NFS4_PUTAPAGE] == NULL) {
1717 		mi->mi_async_reqs[NFS4_PUTAPAGE] = args;
1718 		mi->mi_async_tail[NFS4_PUTAPAGE] = args;
1719 	} else {
1720 		mi->mi_async_tail[NFS4_PUTAPAGE]->a_next = args;
1721 		mi->mi_async_tail[NFS4_PUTAPAGE] = args;
1722 	}
1723 
1724 	mutex_enter(&rp->r_statelock);
1725 	rp->r_count++;
1726 	rp->r_awcount++;
1727 	mutex_exit(&rp->r_statelock);
1728 
1729 	if (mi->mi_io_kstats) {
1730 		mutex_enter(&mi->mi_lock);
1731 		kstat_waitq_enter(KSTAT_IO_PTR(mi->mi_io_kstats));
1732 		mutex_exit(&mi->mi_lock);
1733 	}
1734 
1735 	mi->mi_async_req_count++;
1736 	ASSERT(mi->mi_async_req_count != 0);
1737 	cv_signal(&mi->mi_async_reqs_cv);
1738 	mutex_exit(&mi->mi_async_lock);
1739 	return (0);
1740 
1741 noasync:
1742 
1743 	if (curproc == proc_pageout || curproc == proc_fsflush ||
1744 	    nfs_zone() == mi->mi_zone) {
1745 		/*
1746 		 * If we get here in the context of the pageout/fsflush,
1747 		 * or we have run out of memory or we're attempting to
1748 		 * unmount we refuse to do a sync write, because this may
1749 		 * hang pageout/fsflush and the machine. In this case,
1750 		 * we just re-mark the page as dirty and punt on the page.
1751 		 *
1752 		 * Make sure B_FORCE isn't set.  We can re-mark the
1753 		 * pages as dirty and unlock the pages in one swoop by
1754 		 * passing in B_ERROR to pvn_write_done().  However,
1755 		 * we should make sure B_FORCE isn't set - we don't
1756 		 * want the page tossed before it gets written out.
1757 		 */
1758 		if (flags & B_FORCE)
1759 			flags &= ~(B_INVAL | B_FORCE);
1760 		pvn_write_done(pp, flags | B_ERROR);
1761 		return (0);
1762 	}
1763 
1764 	/*
1765 	 * We'll get here only if (nfs_zone() != mi->mi_zone)
1766 	 * which means that this was a cross-zone sync putpage.
1767 	 *
1768 	 * We pass in B_ERROR to pvn_write_done() to re-mark the pages
1769 	 * as dirty and unlock them.
1770 	 *
1771 	 * We don't want to clear B_FORCE here as the caller presumably
1772 	 * knows what they're doing if they set it.
1773 	 */
1774 	pvn_write_done(pp, flags | B_ERROR);
1775 	return (EPERM);
1776 }
1777 
1778 int
1779 nfs4_async_pageio(vnode_t *vp, page_t *pp, u_offset_t io_off, size_t io_len,
1780     int flags, cred_t *cr, int (*pageio)(vnode_t *, page_t *, u_offset_t,
1781     size_t, int, cred_t *))
1782 {
1783 	rnode4_t *rp;
1784 	mntinfo4_t *mi;
1785 	struct nfs4_async_reqs *args;
1786 
1787 	ASSERT(flags & B_ASYNC);
1788 	ASSERT(vp->v_vfsp != NULL);
1789 
1790 	rp = VTOR4(vp);
1791 	ASSERT(rp->r_count > 0);
1792 
1793 	mi = VTOMI4(vp);
1794 
1795 	/*
1796 	 * If we can't allocate a request structure, do the pageio
1797 	 * request synchronously in this thread's context.
1798 	 */
1799 	if ((args = kmem_alloc(sizeof (*args), KM_NOSLEEP)) == NULL)
1800 		goto noasync;
1801 
1802 	args->a_next = NULL;
1803 #ifdef DEBUG
1804 	args->a_queuer = curthread;
1805 #endif
1806 	VN_HOLD(vp);
1807 	args->a_vp = vp;
1808 	ASSERT(cr != NULL);
1809 	crhold(cr);
1810 	args->a_cred = cr;
1811 	args->a_io = NFS4_PAGEIO;
1812 	args->a_nfs4_pageio = pageio;
1813 	args->a_nfs4_pp = pp;
1814 	args->a_nfs4_off = io_off;
1815 	args->a_nfs4_len = (uint_t)io_len;
1816 	args->a_nfs4_flags = flags;
1817 
1818 	mutex_enter(&mi->mi_async_lock);
1819 
1820 	/*
1821 	 * If asyncio has been disabled, then make a synchronous request.
1822 	 * This check is done a second time in case async io was diabled
1823 	 * while this thread was blocked waiting for memory pressure to
1824 	 * reduce or for the queue to drain.
1825 	 */
1826 	if (mi->mi_max_threads == 0) {
1827 		mutex_exit(&mi->mi_async_lock);
1828 
1829 		VN_RELE(vp);
1830 		crfree(cr);
1831 		kmem_free(args, sizeof (*args));
1832 		goto noasync;
1833 	}
1834 
1835 	/*
1836 	 * Link request structure into the async list and
1837 	 * wakeup async thread to do the i/o.
1838 	 */
1839 	if (mi->mi_async_reqs[NFS4_PAGEIO] == NULL) {
1840 		mi->mi_async_reqs[NFS4_PAGEIO] = args;
1841 		mi->mi_async_tail[NFS4_PAGEIO] = args;
1842 	} else {
1843 		mi->mi_async_tail[NFS4_PAGEIO]->a_next = args;
1844 		mi->mi_async_tail[NFS4_PAGEIO] = args;
1845 	}
1846 
1847 	mutex_enter(&rp->r_statelock);
1848 	rp->r_count++;
1849 	rp->r_awcount++;
1850 	mutex_exit(&rp->r_statelock);
1851 
1852 	if (mi->mi_io_kstats) {
1853 		mutex_enter(&mi->mi_lock);
1854 		kstat_waitq_enter(KSTAT_IO_PTR(mi->mi_io_kstats));
1855 		mutex_exit(&mi->mi_lock);
1856 	}
1857 
1858 	mi->mi_async_req_count++;
1859 	ASSERT(mi->mi_async_req_count != 0);
1860 	cv_signal(&mi->mi_async_reqs_cv);
1861 	mutex_exit(&mi->mi_async_lock);
1862 	return (0);
1863 
1864 noasync:
1865 	/*
1866 	 * If we can't do it ASYNC, for reads we do nothing (but cleanup
1867 	 * the page list), for writes we do it synchronously, except for
1868 	 * proc_pageout/proc_fsflush as described below.
1869 	 */
1870 	if (flags & B_READ) {
1871 		pvn_read_done(pp, flags | B_ERROR);
1872 		return (0);
1873 	}
1874 
1875 	if (curproc == proc_pageout || curproc == proc_fsflush) {
1876 		/*
1877 		 * If we get here in the context of the pageout/fsflush,
1878 		 * we refuse to do a sync write, because this may hang
1879 		 * pageout/fsflush (and the machine). In this case, we just
1880 		 * re-mark the page as dirty and punt on the page.
1881 		 *
1882 		 * Make sure B_FORCE isn't set.  We can re-mark the
1883 		 * pages as dirty and unlock the pages in one swoop by
1884 		 * passing in B_ERROR to pvn_write_done().  However,
1885 		 * we should make sure B_FORCE isn't set - we don't
1886 		 * want the page tossed before it gets written out.
1887 		 */
1888 		if (flags & B_FORCE)
1889 			flags &= ~(B_INVAL | B_FORCE);
1890 		pvn_write_done(pp, flags | B_ERROR);
1891 		return (0);
1892 	}
1893 
1894 	if (nfs_zone() != mi->mi_zone) {
1895 		/*
1896 		 * So this was a cross-zone sync pageio.  We pass in B_ERROR
1897 		 * to pvn_write_done() to re-mark the pages as dirty and unlock
1898 		 * them.
1899 		 *
1900 		 * We don't want to clear B_FORCE here as the caller presumably
1901 		 * knows what they're doing if they set it.
1902 		 */
1903 		pvn_write_done(pp, flags | B_ERROR);
1904 		return (EPERM);
1905 	}
1906 	return ((*pageio)(vp, pp, io_off, io_len, flags, cr));
1907 }
1908 
1909 void
1910 nfs4_async_readdir(vnode_t *vp, rddir4_cache *rdc, cred_t *cr,
1911     int (*readdir)(vnode_t *, rddir4_cache *, cred_t *))
1912 {
1913 	rnode4_t *rp;
1914 	mntinfo4_t *mi;
1915 	struct nfs4_async_reqs *args;
1916 
1917 	rp = VTOR4(vp);
1918 	ASSERT(rp->r_freef == NULL);
1919 
1920 	mi = VTOMI4(vp);
1921 
1922 	/*
1923 	 * If we can't allocate a request structure, skip the readdir.
1924 	 */
1925 	if ((args = kmem_alloc(sizeof (*args), KM_NOSLEEP)) == NULL)
1926 		goto noasync;
1927 
1928 	args->a_next = NULL;
1929 #ifdef DEBUG
1930 	args->a_queuer = curthread;
1931 #endif
1932 	VN_HOLD(vp);
1933 	args->a_vp = vp;
1934 	ASSERT(cr != NULL);
1935 	crhold(cr);
1936 	args->a_cred = cr;
1937 	args->a_io = NFS4_READDIR;
1938 	args->a_nfs4_readdir = readdir;
1939 	args->a_nfs4_rdc = rdc;
1940 
1941 	mutex_enter(&mi->mi_async_lock);
1942 
1943 	/*
1944 	 * If asyncio has been disabled, then skip this request
1945 	 */
1946 	if (mi->mi_max_threads == 0) {
1947 		mutex_exit(&mi->mi_async_lock);
1948 
1949 		VN_RELE(vp);
1950 		crfree(cr);
1951 		kmem_free(args, sizeof (*args));
1952 		goto noasync;
1953 	}
1954 
1955 	/*
1956 	 * Link request structure into the async list and
1957 	 * wakeup async thread to do the i/o.
1958 	 */
1959 	if (mi->mi_async_reqs[NFS4_READDIR] == NULL) {
1960 		mi->mi_async_reqs[NFS4_READDIR] = args;
1961 		mi->mi_async_tail[NFS4_READDIR] = args;
1962 	} else {
1963 		mi->mi_async_tail[NFS4_READDIR]->a_next = args;
1964 		mi->mi_async_tail[NFS4_READDIR] = args;
1965 	}
1966 
1967 	mutex_enter(&rp->r_statelock);
1968 	rp->r_count++;
1969 	mutex_exit(&rp->r_statelock);
1970 
1971 	if (mi->mi_io_kstats) {
1972 		mutex_enter(&mi->mi_lock);
1973 		kstat_waitq_enter(KSTAT_IO_PTR(mi->mi_io_kstats));
1974 		mutex_exit(&mi->mi_lock);
1975 	}
1976 
1977 	mi->mi_async_req_count++;
1978 	ASSERT(mi->mi_async_req_count != 0);
1979 	cv_signal(&mi->mi_async_reqs_cv);
1980 	mutex_exit(&mi->mi_async_lock);
1981 	return;
1982 
1983 noasync:
1984 	mutex_enter(&rp->r_statelock);
1985 	rdc->entries = NULL;
1986 	/*
1987 	 * Indicate that no one is trying to fill this entry and
1988 	 * it still needs to be filled.
1989 	 */
1990 	rdc->flags &= ~RDDIR;
1991 	rdc->flags |= RDDIRREQ;
1992 	rddir4_cache_rele(rp, rdc);
1993 	mutex_exit(&rp->r_statelock);
1994 }
1995 
1996 void
1997 nfs4_async_commit(vnode_t *vp, page_t *plist, offset3 offset, count3 count,
1998     cred_t *cr, void (*commit)(vnode_t *, page_t *, offset3, count3,
1999     cred_t *))
2000 {
2001 	rnode4_t *rp;
2002 	mntinfo4_t *mi;
2003 	struct nfs4_async_reqs *args;
2004 	page_t *pp;
2005 
2006 	rp = VTOR4(vp);
2007 	mi = VTOMI4(vp);
2008 
2009 	/*
2010 	 * If we can't allocate a request structure, do the commit
2011 	 * operation synchronously in this thread's context.
2012 	 */
2013 	if ((args = kmem_alloc(sizeof (*args), KM_NOSLEEP)) == NULL)
2014 		goto noasync;
2015 
2016 	args->a_next = NULL;
2017 #ifdef DEBUG
2018 	args->a_queuer = curthread;
2019 #endif
2020 	VN_HOLD(vp);
2021 	args->a_vp = vp;
2022 	ASSERT(cr != NULL);
2023 	crhold(cr);
2024 	args->a_cred = cr;
2025 	args->a_io = NFS4_COMMIT;
2026 	args->a_nfs4_commit = commit;
2027 	args->a_nfs4_plist = plist;
2028 	args->a_nfs4_offset = offset;
2029 	args->a_nfs4_count = count;
2030 
2031 	mutex_enter(&mi->mi_async_lock);
2032 
2033 	/*
2034 	 * If asyncio has been disabled, then make a synchronous request.
2035 	 * This check is done a second time in case async io was diabled
2036 	 * while this thread was blocked waiting for memory pressure to
2037 	 * reduce or for the queue to drain.
2038 	 */
2039 	if (mi->mi_max_threads == 0) {
2040 		mutex_exit(&mi->mi_async_lock);
2041 
2042 		VN_RELE(vp);
2043 		crfree(cr);
2044 		kmem_free(args, sizeof (*args));
2045 		goto noasync;
2046 	}
2047 
2048 	/*
2049 	 * Link request structure into the async list and
2050 	 * wakeup async thread to do the i/o.
2051 	 */
2052 	if (mi->mi_async_reqs[NFS4_COMMIT] == NULL) {
2053 		mi->mi_async_reqs[NFS4_COMMIT] = args;
2054 		mi->mi_async_tail[NFS4_COMMIT] = args;
2055 	} else {
2056 		mi->mi_async_tail[NFS4_COMMIT]->a_next = args;
2057 		mi->mi_async_tail[NFS4_COMMIT] = args;
2058 	}
2059 
2060 	mutex_enter(&rp->r_statelock);
2061 	rp->r_count++;
2062 	mutex_exit(&rp->r_statelock);
2063 
2064 	if (mi->mi_io_kstats) {
2065 		mutex_enter(&mi->mi_lock);
2066 		kstat_waitq_enter(KSTAT_IO_PTR(mi->mi_io_kstats));
2067 		mutex_exit(&mi->mi_lock);
2068 	}
2069 
2070 	mi->mi_async_req_count++;
2071 	ASSERT(mi->mi_async_req_count != 0);
2072 	cv_signal(&mi->mi_async_reqs_cv);
2073 	mutex_exit(&mi->mi_async_lock);
2074 	return;
2075 
2076 noasync:
2077 	if (curproc == proc_pageout || curproc == proc_fsflush ||
2078 	    nfs_zone() != mi->mi_zone) {
2079 		while (plist != NULL) {
2080 			pp = plist;
2081 			page_sub(&plist, pp);
2082 			pp->p_fsdata = C_COMMIT;
2083 			page_unlock(pp);
2084 		}
2085 		return;
2086 	}
2087 	(*commit)(vp, plist, offset, count, cr);
2088 }
2089 
2090 /*
2091  * nfs4_async_inactive - hand off a VOP_INACTIVE call to a thread.  The
2092  * reference to the vnode is handed over to the thread; the caller should
2093  * no longer refer to the vnode.
2094  *
2095  * Unlike most of the async routines, this handoff is needed for
2096  * correctness reasons, not just performance.  So doing operations in the
2097  * context of the current thread is not an option.
2098  */
2099 void
2100 nfs4_async_inactive(vnode_t *vp, cred_t *cr)
2101 {
2102 	mntinfo4_t *mi;
2103 	struct nfs4_async_reqs *args;
2104 	boolean_t signal_inactive_thread = B_FALSE;
2105 
2106 	mi = VTOMI4(vp);
2107 
2108 	args = kmem_alloc(sizeof (*args), KM_SLEEP);
2109 	args->a_next = NULL;
2110 #ifdef DEBUG
2111 	args->a_queuer = curthread;
2112 #endif
2113 	args->a_vp = vp;
2114 	ASSERT(cr != NULL);
2115 	crhold(cr);
2116 	args->a_cred = cr;
2117 	args->a_io = NFS4_INACTIVE;
2118 
2119 	/*
2120 	 * Note that we don't check mi->mi_max_threads here, since we
2121 	 * *need* to get rid of this vnode regardless of whether someone
2122 	 * set nfs4_max_threads to zero in /etc/system.
2123 	 *
2124 	 * The manager thread knows about this and is willing to create
2125 	 * at least one thread to accommodate us.
2126 	 */
2127 	mutex_enter(&mi->mi_async_lock);
2128 	if (mi->mi_inactive_thread == NULL) {
2129 		rnode4_t *rp;
2130 		vnode_t *unldvp = NULL;
2131 		char *unlname;
2132 		cred_t *unlcred;
2133 
2134 		mutex_exit(&mi->mi_async_lock);
2135 		/*
2136 		 * We just need to free up the memory associated with the
2137 		 * vnode, which can be safely done from within the current
2138 		 * context.
2139 		 */
2140 		crfree(cr);	/* drop our reference */
2141 		kmem_free(args, sizeof (*args));
2142 		rp = VTOR4(vp);
2143 		mutex_enter(&rp->r_statelock);
2144 		if (rp->r_unldvp != NULL) {
2145 			unldvp = rp->r_unldvp;
2146 			rp->r_unldvp = NULL;
2147 			unlname = rp->r_unlname;
2148 			rp->r_unlname = NULL;
2149 			unlcred = rp->r_unlcred;
2150 			rp->r_unlcred = NULL;
2151 		}
2152 		mutex_exit(&rp->r_statelock);
2153 		/*
2154 		 * No need to explicitly throw away any cached pages.  The
2155 		 * eventual r4inactive() will attempt a synchronous
2156 		 * VOP_PUTPAGE() which will immediately fail since the request
2157 		 * is coming from the wrong zone, and then will proceed to call
2158 		 * nfs4_invalidate_pages() which will clean things up for us.
2159 		 *
2160 		 * Throw away the delegation here so rp4_addfree()'s attempt to
2161 		 * return any existing delegations becomes a no-op.
2162 		 */
2163 		if (rp->r_deleg_type != OPEN_DELEGATE_NONE) {
2164 			(void) nfs_rw_enter_sig(&mi->mi_recovlock, RW_READER,
2165 			    FALSE);
2166 			(void) nfs4delegreturn(rp, NFS4_DR_DISCARD);
2167 			nfs_rw_exit(&mi->mi_recovlock);
2168 		}
2169 		nfs4_clear_open_streams(rp);
2170 
2171 		rp4_addfree(rp, cr);
2172 		if (unldvp != NULL) {
2173 			kmem_free(unlname, MAXNAMELEN);
2174 			VN_RELE(unldvp);
2175 			crfree(unlcred);
2176 		}
2177 		return;
2178 	}
2179 
2180 	if (mi->mi_manager_thread == NULL) {
2181 		/*
2182 		 * We want to talk to the inactive thread.
2183 		 */
2184 		signal_inactive_thread = B_TRUE;
2185 	}
2186 
2187 	/*
2188 	 * Enqueue the vnode and wake up either the special thread (empty
2189 	 * list) or an async thread.
2190 	 */
2191 	if (mi->mi_async_reqs[NFS4_INACTIVE] == NULL) {
2192 		mi->mi_async_reqs[NFS4_INACTIVE] = args;
2193 		mi->mi_async_tail[NFS4_INACTIVE] = args;
2194 		signal_inactive_thread = B_TRUE;
2195 	} else {
2196 		mi->mi_async_tail[NFS4_INACTIVE]->a_next = args;
2197 		mi->mi_async_tail[NFS4_INACTIVE] = args;
2198 	}
2199 	if (signal_inactive_thread) {
2200 		cv_signal(&mi->mi_inact_req_cv);
2201 	} else  {
2202 		mi->mi_async_req_count++;
2203 		ASSERT(mi->mi_async_req_count != 0);
2204 		cv_signal(&mi->mi_async_reqs_cv);
2205 	}
2206 
2207 	mutex_exit(&mi->mi_async_lock);
2208 }
2209 
2210 int
2211 writerp4(rnode4_t *rp, caddr_t base, int tcount, struct uio *uio, int pgcreated)
2212 {
2213 	int pagecreate;
2214 	int n;
2215 	int saved_n;
2216 	caddr_t saved_base;
2217 	u_offset_t offset;
2218 	int error;
2219 	int sm_error;
2220 	vnode_t *vp = RTOV(rp);
2221 
2222 	ASSERT(tcount <= MAXBSIZE && tcount <= uio->uio_resid);
2223 	ASSERT(nfs_rw_lock_held(&rp->r_rwlock, RW_WRITER));
2224 	if (!vpm_enable) {
2225 		ASSERT(((uintptr_t)base & MAXBOFFSET) + tcount <= MAXBSIZE);
2226 	}
2227 
2228 	/*
2229 	 * Move bytes in at most PAGESIZE chunks. We must avoid
2230 	 * spanning pages in uiomove() because page faults may cause
2231 	 * the cache to be invalidated out from under us. The r_size is not
2232 	 * updated until after the uiomove. If we push the last page of a
2233 	 * file before r_size is correct, we will lose the data written past
2234 	 * the current (and invalid) r_size.
2235 	 */
2236 	do {
2237 		offset = uio->uio_loffset;
2238 		pagecreate = 0;
2239 
2240 		/*
2241 		 * n is the number of bytes required to satisfy the request
2242 		 *   or the number of bytes to fill out the page.
2243 		 */
2244 		n = (int)MIN((PAGESIZE - (offset & PAGEOFFSET)), tcount);
2245 
2246 		/*
2247 		 * Check to see if we can skip reading in the page
2248 		 * and just allocate the memory.  We can do this
2249 		 * if we are going to rewrite the entire mapping
2250 		 * or if we are going to write to or beyond the current
2251 		 * end of file from the beginning of the mapping.
2252 		 *
2253 		 * The read of r_size is now protected by r_statelock.
2254 		 */
2255 		mutex_enter(&rp->r_statelock);
2256 		/*
2257 		 * When pgcreated is nonzero the caller has already done
2258 		 * a segmap_getmapflt with forcefault 0 and S_WRITE. With
2259 		 * segkpm this means we already have at least one page
2260 		 * created and mapped at base.
2261 		 */
2262 		pagecreate = pgcreated ||
2263 		    ((offset & PAGEOFFSET) == 0 &&
2264 		    (n == PAGESIZE || ((offset + n) >= rp->r_size)));
2265 
2266 		mutex_exit(&rp->r_statelock);
2267 
2268 		if (!vpm_enable && pagecreate) {
2269 			/*
2270 			 * The last argument tells segmap_pagecreate() to
2271 			 * always lock the page, as opposed to sometimes
2272 			 * returning with the page locked. This way we avoid a
2273 			 * fault on the ensuing uiomove(), but also
2274 			 * more importantly (to fix bug 1094402) we can
2275 			 * call segmap_fault() to unlock the page in all
2276 			 * cases. An alternative would be to modify
2277 			 * segmap_pagecreate() to tell us when it is
2278 			 * locking a page, but that's a fairly major
2279 			 * interface change.
2280 			 */
2281 			if (pgcreated == 0)
2282 				(void) segmap_pagecreate(segkmap, base,
2283 				    (uint_t)n, 1);
2284 			saved_base = base;
2285 			saved_n = n;
2286 		}
2287 
2288 		/*
2289 		 * The number of bytes of data in the last page can not
2290 		 * be accurately be determined while page is being
2291 		 * uiomove'd to and the size of the file being updated.
2292 		 * Thus, inform threads which need to know accurately
2293 		 * how much data is in the last page of the file.  They
2294 		 * will not do the i/o immediately, but will arrange for
2295 		 * the i/o to happen later when this modify operation
2296 		 * will have finished.
2297 		 */
2298 		ASSERT(!(rp->r_flags & R4MODINPROGRESS));
2299 		mutex_enter(&rp->r_statelock);
2300 		rp->r_flags |= R4MODINPROGRESS;
2301 		rp->r_modaddr = (offset & MAXBMASK);
2302 		mutex_exit(&rp->r_statelock);
2303 
2304 		if (vpm_enable) {
2305 			/*
2306 			 * Copy data. If new pages are created, part of
2307 			 * the page that is not written will be initizliazed
2308 			 * with zeros.
2309 			 */
2310 			error = vpm_data_copy(vp, offset, n, uio,
2311 			    !pagecreate, NULL, 0, S_WRITE);
2312 		} else {
2313 			error = uiomove(base, n, UIO_WRITE, uio);
2314 		}
2315 
2316 		/*
2317 		 * r_size is the maximum number of
2318 		 * bytes known to be in the file.
2319 		 * Make sure it is at least as high as the
2320 		 * first unwritten byte pointed to by uio_loffset.
2321 		 */
2322 		mutex_enter(&rp->r_statelock);
2323 		if (rp->r_size < uio->uio_loffset)
2324 			rp->r_size = uio->uio_loffset;
2325 		rp->r_flags &= ~R4MODINPROGRESS;
2326 		rp->r_flags |= R4DIRTY;
2327 		mutex_exit(&rp->r_statelock);
2328 
2329 		/* n = # of bytes written */
2330 		n = (int)(uio->uio_loffset - offset);
2331 
2332 		if (!vpm_enable) {
2333 			base += n;
2334 		}
2335 
2336 		tcount -= n;
2337 		/*
2338 		 * If we created pages w/o initializing them completely,
2339 		 * we need to zero the part that wasn't set up.
2340 		 * This happens on a most EOF write cases and if
2341 		 * we had some sort of error during the uiomove.
2342 		 */
2343 		if (!vpm_enable && pagecreate) {
2344 			if ((uio->uio_loffset & PAGEOFFSET) || n == 0)
2345 				(void) kzero(base, PAGESIZE - n);
2346 
2347 			if (pgcreated) {
2348 				/*
2349 				 * Caller is responsible for this page,
2350 				 * it was not created in this loop.
2351 				 */
2352 				pgcreated = 0;
2353 			} else {
2354 				/*
2355 				 * For bug 1094402: segmap_pagecreate locks
2356 				 * page. Unlock it. This also unlocks the
2357 				 * pages allocated by page_create_va() in
2358 				 * segmap_pagecreate().
2359 				 */
2360 				sm_error = segmap_fault(kas.a_hat, segkmap,
2361 				    saved_base, saved_n,
2362 				    F_SOFTUNLOCK, S_WRITE);
2363 				if (error == 0)
2364 					error = sm_error;
2365 			}
2366 		}
2367 	} while (tcount > 0 && error == 0);
2368 
2369 	return (error);
2370 }
2371 
2372 int
2373 nfs4_putpages(vnode_t *vp, u_offset_t off, size_t len, int flags, cred_t *cr)
2374 {
2375 	rnode4_t *rp;
2376 	page_t *pp;
2377 	u_offset_t eoff;
2378 	u_offset_t io_off;
2379 	size_t io_len;
2380 	int error;
2381 	int rdirty;
2382 	int err;
2383 
2384 	rp = VTOR4(vp);
2385 	ASSERT(rp->r_count > 0);
2386 
2387 	if (!nfs4_has_pages(vp))
2388 		return (0);
2389 
2390 	ASSERT(vp->v_type != VCHR);
2391 
2392 	/*
2393 	 * If R4OUTOFSPACE is set, then all writes turn into B_INVAL
2394 	 * writes.  B_FORCE is set to force the VM system to actually
2395 	 * invalidate the pages, even if the i/o failed.  The pages
2396 	 * need to get invalidated because they can't be written out
2397 	 * because there isn't any space left on either the server's
2398 	 * file system or in the user's disk quota.  The B_FREE bit
2399 	 * is cleared to avoid confusion as to whether this is a
2400 	 * request to place the page on the freelist or to destroy
2401 	 * it.
2402 	 */
2403 	if ((rp->r_flags & R4OUTOFSPACE) ||
2404 	    (vp->v_vfsp->vfs_flag & VFS_UNMOUNTED))
2405 		flags = (flags & ~B_FREE) | B_INVAL | B_FORCE;
2406 
2407 	if (len == 0) {
2408 		/*
2409 		 * If doing a full file synchronous operation, then clear
2410 		 * the R4DIRTY bit.  If a page gets dirtied while the flush
2411 		 * is happening, then R4DIRTY will get set again.  The
2412 		 * R4DIRTY bit must get cleared before the flush so that
2413 		 * we don't lose this information.
2414 		 *
2415 		 * If there are no full file async write operations
2416 		 * pending and RDIRTY bit is set, clear it.
2417 		 */
2418 		if (off == (u_offset_t)0 &&
2419 		    !(flags & B_ASYNC) &&
2420 		    (rp->r_flags & R4DIRTY)) {
2421 			mutex_enter(&rp->r_statelock);
2422 			rdirty = (rp->r_flags & R4DIRTY);
2423 			rp->r_flags &= ~R4DIRTY;
2424 			mutex_exit(&rp->r_statelock);
2425 		} else if (flags & B_ASYNC && off == (u_offset_t)0) {
2426 			mutex_enter(&rp->r_statelock);
2427 			if (rp->r_flags & R4DIRTY && rp->r_awcount == 0) {
2428 				rdirty = (rp->r_flags & R4DIRTY);
2429 				rp->r_flags &= ~R4DIRTY;
2430 			}
2431 			mutex_exit(&rp->r_statelock);
2432 		} else
2433 			rdirty = 0;
2434 
2435 		/*
2436 		 * Search the entire vp list for pages >= off, and flush
2437 		 * the dirty pages.
2438 		 */
2439 		error = pvn_vplist_dirty(vp, off, rp->r_putapage,
2440 		    flags, cr);
2441 
2442 		/*
2443 		 * If an error occurred and the file was marked as dirty
2444 		 * before and we aren't forcibly invalidating pages, then
2445 		 * reset the R4DIRTY flag.
2446 		 */
2447 		if (error && rdirty &&
2448 		    (flags & (B_INVAL | B_FORCE)) != (B_INVAL | B_FORCE)) {
2449 			mutex_enter(&rp->r_statelock);
2450 			rp->r_flags |= R4DIRTY;
2451 			mutex_exit(&rp->r_statelock);
2452 		}
2453 	} else {
2454 		/*
2455 		 * Do a range from [off...off + len) looking for pages
2456 		 * to deal with.
2457 		 */
2458 		error = 0;
2459 		io_len = 0;
2460 		eoff = off + len;
2461 		mutex_enter(&rp->r_statelock);
2462 		for (io_off = off; io_off < eoff && io_off < rp->r_size;
2463 		    io_off += io_len) {
2464 			mutex_exit(&rp->r_statelock);
2465 			/*
2466 			 * If we are not invalidating, synchronously
2467 			 * freeing or writing pages use the routine
2468 			 * page_lookup_nowait() to prevent reclaiming
2469 			 * them from the free list.
2470 			 */
2471 			if ((flags & B_INVAL) || !(flags & B_ASYNC)) {
2472 				pp = page_lookup(vp, io_off,
2473 				    (flags & (B_INVAL | B_FREE)) ?
2474 				    SE_EXCL : SE_SHARED);
2475 			} else {
2476 				pp = page_lookup_nowait(vp, io_off,
2477 				    (flags & B_FREE) ? SE_EXCL : SE_SHARED);
2478 			}
2479 
2480 			if (pp == NULL || !pvn_getdirty(pp, flags))
2481 				io_len = PAGESIZE;
2482 			else {
2483 				err = (*rp->r_putapage)(vp, pp, &io_off,
2484 				    &io_len, flags, cr);
2485 				if (!error)
2486 					error = err;
2487 				/*
2488 				 * "io_off" and "io_len" are returned as
2489 				 * the range of pages we actually wrote.
2490 				 * This allows us to skip ahead more quickly
2491 				 * since several pages may've been dealt
2492 				 * with by this iteration of the loop.
2493 				 */
2494 			}
2495 			mutex_enter(&rp->r_statelock);
2496 		}
2497 		mutex_exit(&rp->r_statelock);
2498 	}
2499 
2500 	return (error);
2501 }
2502 
2503 void
2504 nfs4_invalidate_pages(vnode_t *vp, u_offset_t off, cred_t *cr)
2505 {
2506 	rnode4_t *rp;
2507 
2508 	rp = VTOR4(vp);
2509 	if (IS_SHADOW(vp, rp))
2510 		vp = RTOV4(rp);
2511 	mutex_enter(&rp->r_statelock);
2512 	while (rp->r_flags & R4TRUNCATE)
2513 		cv_wait(&rp->r_cv, &rp->r_statelock);
2514 	rp->r_flags |= R4TRUNCATE;
2515 	if (off == (u_offset_t)0) {
2516 		rp->r_flags &= ~R4DIRTY;
2517 		if (!(rp->r_flags & R4STALE))
2518 			rp->r_error = 0;
2519 	}
2520 	rp->r_truncaddr = off;
2521 	mutex_exit(&rp->r_statelock);
2522 	(void) pvn_vplist_dirty(vp, off, rp->r_putapage,
2523 	    B_INVAL | B_TRUNC, cr);
2524 	mutex_enter(&rp->r_statelock);
2525 	rp->r_flags &= ~R4TRUNCATE;
2526 	cv_broadcast(&rp->r_cv);
2527 	mutex_exit(&rp->r_statelock);
2528 }
2529 
2530 static int
2531 nfs4_mnt_kstat_update(kstat_t *ksp, int rw)
2532 {
2533 	mntinfo4_t *mi;
2534 	struct mntinfo_kstat *mik;
2535 	vfs_t *vfsp;
2536 
2537 	/* this is a read-only kstat. Bail out on a write */
2538 	if (rw == KSTAT_WRITE)
2539 		return (EACCES);
2540 
2541 
2542 	/*
2543 	 * We don't want to wait here as kstat_chain_lock could be held by
2544 	 * dounmount(). dounmount() takes vfs_reflock before the chain lock
2545 	 * and thus could lead to a deadlock.
2546 	 */
2547 	vfsp = (struct vfs *)ksp->ks_private;
2548 
2549 	mi = VFTOMI4(vfsp);
2550 	mik = (struct mntinfo_kstat *)ksp->ks_data;
2551 
2552 	(void) strcpy(mik->mik_proto, mi->mi_curr_serv->sv_knconf->knc_proto);
2553 
2554 	mik->mik_vers = (uint32_t)mi->mi_vers;
2555 	mik->mik_flags = mi->mi_flags;
2556 	/*
2557 	 * The sv_secdata holds the flavor the client specifies.
2558 	 * If the client uses default and a security negotiation
2559 	 * occurs, sv_currsec will point to the current flavor
2560 	 * selected from the server flavor list.
2561 	 * sv_currsec is NULL if no security negotiation takes place.
2562 	 */
2563 	mik->mik_secmod = mi->mi_curr_serv->sv_currsec ?
2564 	    mi->mi_curr_serv->sv_currsec->secmod :
2565 	    mi->mi_curr_serv->sv_secdata->secmod;
2566 	mik->mik_curread = (uint32_t)mi->mi_curread;
2567 	mik->mik_curwrite = (uint32_t)mi->mi_curwrite;
2568 	mik->mik_retrans = mi->mi_retrans;
2569 	mik->mik_timeo = mi->mi_timeo;
2570 	mik->mik_acregmin = HR2SEC(mi->mi_acregmin);
2571 	mik->mik_acregmax = HR2SEC(mi->mi_acregmax);
2572 	mik->mik_acdirmin = HR2SEC(mi->mi_acdirmin);
2573 	mik->mik_acdirmax = HR2SEC(mi->mi_acdirmax);
2574 	mik->mik_noresponse = (uint32_t)mi->mi_noresponse;
2575 	mik->mik_failover = (uint32_t)mi->mi_failover;
2576 	mik->mik_remap = (uint32_t)mi->mi_remap;
2577 
2578 	(void) strcpy(mik->mik_curserver, mi->mi_curr_serv->sv_hostname);
2579 
2580 	return (0);
2581 }
2582 
2583 void
2584 nfs4_mnt_kstat_init(struct vfs *vfsp)
2585 {
2586 	mntinfo4_t *mi = VFTOMI4(vfsp);
2587 
2588 	/*
2589 	 * PSARC 2001/697 Contract Private Interface
2590 	 * All nfs kstats are under SunMC contract
2591 	 * Please refer to the PSARC listed above and contact
2592 	 * SunMC before making any changes!
2593 	 *
2594 	 * Changes must be reviewed by Solaris File Sharing
2595 	 * Changes must be communicated to contract-2001-697@sun.com
2596 	 *
2597 	 */
2598 
2599 	mi->mi_io_kstats = kstat_create_zone("nfs", getminor(vfsp->vfs_dev),
2600 	    NULL, "nfs", KSTAT_TYPE_IO, 1, 0, mi->mi_zone->zone_id);
2601 	if (mi->mi_io_kstats) {
2602 		if (mi->mi_zone->zone_id != GLOBAL_ZONEID)
2603 			kstat_zone_add(mi->mi_io_kstats, GLOBAL_ZONEID);
2604 		mi->mi_io_kstats->ks_lock = &mi->mi_lock;
2605 		kstat_install(mi->mi_io_kstats);
2606 	}
2607 
2608 	if ((mi->mi_ro_kstats = kstat_create_zone("nfs",
2609 	    getminor(vfsp->vfs_dev), "mntinfo", "misc", KSTAT_TYPE_RAW,
2610 	    sizeof (struct mntinfo_kstat), 0, mi->mi_zone->zone_id)) != NULL) {
2611 		if (mi->mi_zone->zone_id != GLOBAL_ZONEID)
2612 			kstat_zone_add(mi->mi_ro_kstats, GLOBAL_ZONEID);
2613 		mi->mi_ro_kstats->ks_update = nfs4_mnt_kstat_update;
2614 		mi->mi_ro_kstats->ks_private = (void *)vfsp;
2615 		kstat_install(mi->mi_ro_kstats);
2616 	}
2617 
2618 	nfs4_mnt_recov_kstat_init(vfsp);
2619 }
2620 
2621 void
2622 nfs4_write_error(vnode_t *vp, int error, cred_t *cr)
2623 {
2624 	mntinfo4_t *mi;
2625 
2626 	mi = VTOMI4(vp);
2627 	/*
2628 	 * In case of forced unmount, do not print any messages
2629 	 * since it can flood the console with error messages.
2630 	 */
2631 	if (mi->mi_vfsp->vfs_flag & VFS_UNMOUNTED)
2632 		return;
2633 
2634 	/*
2635 	 * If the mount point is dead, not recoverable, do not
2636 	 * print error messages that can flood the console.
2637 	 */
2638 	if (mi->mi_flags & MI4_RECOV_FAIL)
2639 		return;
2640 
2641 	/*
2642 	 * No use in flooding the console with ENOSPC
2643 	 * messages from the same file system.
2644 	 */
2645 	if ((error != ENOSPC && error != EDQUOT) ||
2646 	    lbolt - mi->mi_printftime > 0) {
2647 		zoneid_t zoneid = mi->mi_zone->zone_id;
2648 
2649 #ifdef DEBUG
2650 		nfs_perror(error, "NFS%ld write error on host %s: %m.\n",
2651 		    mi->mi_vers, VTOR4(vp)->r_server->sv_hostname, NULL);
2652 #else
2653 		nfs_perror(error, "NFS write error on host %s: %m.\n",
2654 		    VTOR4(vp)->r_server->sv_hostname, NULL);
2655 #endif
2656 		if (error == ENOSPC || error == EDQUOT) {
2657 			zcmn_err(zoneid, CE_CONT,
2658 			    "^File: userid=%d, groupid=%d\n",
2659 			    crgetuid(cr), crgetgid(cr));
2660 			if (crgetuid(curthread->t_cred) != crgetuid(cr) ||
2661 			    crgetgid(curthread->t_cred) != crgetgid(cr)) {
2662 				zcmn_err(zoneid, CE_CONT,
2663 				    "^User: userid=%d, groupid=%d\n",
2664 				    crgetuid(curthread->t_cred),
2665 				    crgetgid(curthread->t_cred));
2666 			}
2667 			mi->mi_printftime = lbolt +
2668 			    nfs_write_error_interval * hz;
2669 		}
2670 		sfh4_printfhandle(VTOR4(vp)->r_fh);
2671 #ifdef DEBUG
2672 		if (error == EACCES) {
2673 			zcmn_err(zoneid, CE_CONT,
2674 			    "nfs_bio: cred is%s kcred\n",
2675 			    cr == kcred ? "" : " not");
2676 		}
2677 #endif
2678 	}
2679 }
2680 
2681 /*
2682  * Return non-zero if the given file can be safely memory mapped.  Locks
2683  * are safe if whole-file (length and offset are both zero).
2684  */
2685 
2686 #define	SAFE_LOCK(flk)	((flk).l_start == 0 && (flk).l_len == 0)
2687 
2688 static int
2689 nfs4_safemap(const vnode_t *vp)
2690 {
2691 	locklist_t	*llp, *next_llp;
2692 	int		safe = 1;
2693 	rnode4_t	*rp = VTOR4(vp);
2694 
2695 	ASSERT(nfs_rw_lock_held(&rp->r_lkserlock, RW_WRITER));
2696 
2697 	NFS4_DEBUG(nfs4_client_map_debug, (CE_NOTE, "nfs4_safemap: "
2698 	    "vp = %p", (void *)vp));
2699 
2700 	/*
2701 	 * Review all the locks for the vnode, both ones that have been
2702 	 * acquired and ones that are pending.  We assume that
2703 	 * flk_active_locks_for_vp() has merged any locks that can be
2704 	 * merged (so that if a process has the entire file locked, it is
2705 	 * represented as a single lock).
2706 	 *
2707 	 * Note that we can't bail out of the loop if we find a non-safe
2708 	 * lock, because we have to free all the elements in the llp list.
2709 	 * We might be able to speed up this code slightly by not looking
2710 	 * at each lock's l_start and l_len fields once we've found a
2711 	 * non-safe lock.
2712 	 */
2713 
2714 	llp = flk_active_locks_for_vp(vp);
2715 	while (llp) {
2716 		NFS4_DEBUG(nfs4_client_map_debug, (CE_NOTE,
2717 		    "nfs4_safemap: active lock (%" PRId64 ", %" PRId64 ")",
2718 		    llp->ll_flock.l_start, llp->ll_flock.l_len));
2719 		if (!SAFE_LOCK(llp->ll_flock)) {
2720 			safe = 0;
2721 			NFS4_DEBUG(nfs4_client_map_debug, (CE_NOTE,
2722 			    "nfs4_safemap: unsafe active lock (%" PRId64
2723 			    ", %" PRId64 ")", llp->ll_flock.l_start,
2724 			    llp->ll_flock.l_len));
2725 		}
2726 		next_llp = llp->ll_next;
2727 		VN_RELE(llp->ll_vp);
2728 		kmem_free(llp, sizeof (*llp));
2729 		llp = next_llp;
2730 	}
2731 
2732 	NFS4_DEBUG(nfs4_client_map_debug, (CE_NOTE, "nfs4_safemap: %s",
2733 	    safe ? "safe" : "unsafe"));
2734 	return (safe);
2735 }
2736 
2737 /*
2738  * Return whether there is a lost LOCK or LOCKU queued up for the given
2739  * file that would make an mmap request unsafe.  cf. nfs4_safemap().
2740  */
2741 
2742 bool_t
2743 nfs4_map_lost_lock_conflict(vnode_t *vp)
2744 {
2745 	bool_t conflict = FALSE;
2746 	nfs4_lost_rqst_t *lrp;
2747 	mntinfo4_t *mi = VTOMI4(vp);
2748 
2749 	mutex_enter(&mi->mi_lock);
2750 	for (lrp = list_head(&mi->mi_lost_state); lrp != NULL;
2751 	    lrp = list_next(&mi->mi_lost_state, lrp)) {
2752 		if (lrp->lr_op != OP_LOCK && lrp->lr_op != OP_LOCKU)
2753 			continue;
2754 		ASSERT(lrp->lr_vp != NULL);
2755 		if (!VOP_CMP(lrp->lr_vp, vp, NULL))
2756 			continue;	/* different file */
2757 		if (!SAFE_LOCK(*lrp->lr_flk)) {
2758 			conflict = TRUE;
2759 			break;
2760 		}
2761 	}
2762 
2763 	mutex_exit(&mi->mi_lock);
2764 	return (conflict);
2765 }
2766 
2767 /*
2768  * nfs_lockcompletion:
2769  *
2770  * If the vnode has a lock that makes it unsafe to cache the file, mark it
2771  * as non cachable (set VNOCACHE bit).
2772  */
2773 
2774 void
2775 nfs4_lockcompletion(vnode_t *vp, int cmd)
2776 {
2777 	rnode4_t *rp = VTOR4(vp);
2778 
2779 	ASSERT(nfs_rw_lock_held(&rp->r_lkserlock, RW_WRITER));
2780 	ASSERT(!IS_SHADOW(vp, rp));
2781 
2782 	if (cmd == F_SETLK || cmd == F_SETLKW) {
2783 
2784 		if (!nfs4_safemap(vp)) {
2785 			mutex_enter(&vp->v_lock);
2786 			vp->v_flag |= VNOCACHE;
2787 			mutex_exit(&vp->v_lock);
2788 		} else {
2789 			mutex_enter(&vp->v_lock);
2790 			vp->v_flag &= ~VNOCACHE;
2791 			mutex_exit(&vp->v_lock);
2792 		}
2793 	}
2794 	/*
2795 	 * The cached attributes of the file are stale after acquiring
2796 	 * the lock on the file. They were updated when the file was
2797 	 * opened, but not updated when the lock was acquired. Therefore the
2798 	 * cached attributes are invalidated after the lock is obtained.
2799 	 */
2800 	PURGE_ATTRCACHE4(vp);
2801 }
2802 
2803 /* ARGSUSED */
2804 static void *
2805 nfs4_mi_init(zoneid_t zoneid)
2806 {
2807 	struct mi4_globals *mig;
2808 
2809 	mig = kmem_alloc(sizeof (*mig), KM_SLEEP);
2810 	mutex_init(&mig->mig_lock, NULL, MUTEX_DEFAULT, NULL);
2811 	list_create(&mig->mig_list, sizeof (mntinfo4_t),
2812 	    offsetof(mntinfo4_t, mi_zone_node));
2813 	mig->mig_destructor_called = B_FALSE;
2814 	return (mig);
2815 }
2816 
2817 /*
2818  * Callback routine to tell all NFSv4 mounts in the zone to start tearing down
2819  * state and killing off threads.
2820  */
2821 /* ARGSUSED */
2822 static void
2823 nfs4_mi_shutdown(zoneid_t zoneid, void *data)
2824 {
2825 	struct mi4_globals *mig = data;
2826 	mntinfo4_t *mi;
2827 	nfs4_server_t *np;
2828 
2829 	NFS4_DEBUG(nfs4_client_zone_debug, (CE_NOTE,
2830 	    "nfs4_mi_shutdown zone %d\n", zoneid));
2831 	ASSERT(mig != NULL);
2832 	for (;;) {
2833 		mutex_enter(&mig->mig_lock);
2834 		mi = list_head(&mig->mig_list);
2835 		if (mi == NULL) {
2836 			mutex_exit(&mig->mig_lock);
2837 			break;
2838 		}
2839 
2840 		NFS4_DEBUG(nfs4_client_zone_debug, (CE_NOTE,
2841 		    "nfs4_mi_shutdown stopping vfs %p\n", (void *)mi->mi_vfsp));
2842 		/*
2843 		 * purge the DNLC for this filesystem
2844 		 */
2845 		(void) dnlc_purge_vfsp(mi->mi_vfsp, 0);
2846 		/*
2847 		 * Tell existing async worker threads to exit.
2848 		 */
2849 		mutex_enter(&mi->mi_async_lock);
2850 		mi->mi_max_threads = 0;
2851 		cv_broadcast(&mi->mi_async_work_cv);
2852 		/*
2853 		 * Set the appropriate flags, signal and wait for both the
2854 		 * async manager and the inactive thread to exit when they're
2855 		 * done with their current work.
2856 		 */
2857 		mutex_enter(&mi->mi_lock);
2858 		mi->mi_flags |= (MI4_ASYNC_MGR_STOP|MI4_DEAD);
2859 		mutex_exit(&mi->mi_lock);
2860 		mutex_exit(&mi->mi_async_lock);
2861 		if (mi->mi_manager_thread) {
2862 			nfs4_async_manager_stop(mi->mi_vfsp);
2863 		}
2864 		if (mi->mi_inactive_thread) {
2865 			mutex_enter(&mi->mi_async_lock);
2866 			cv_signal(&mi->mi_inact_req_cv);
2867 			/*
2868 			 * Wait for the inactive thread to exit.
2869 			 */
2870 			while (mi->mi_inactive_thread != NULL) {
2871 				cv_wait(&mi->mi_async_cv, &mi->mi_async_lock);
2872 			}
2873 			mutex_exit(&mi->mi_async_lock);
2874 		}
2875 		/*
2876 		 * Wait for the recovery thread to complete, that is, it will
2877 		 * signal when it is done using the "mi" structure and about
2878 		 * to exit
2879 		 */
2880 		mutex_enter(&mi->mi_lock);
2881 		while (mi->mi_in_recovery > 0)
2882 			cv_wait(&mi->mi_cv_in_recov, &mi->mi_lock);
2883 		mutex_exit(&mi->mi_lock);
2884 		/*
2885 		 * We're done when every mi has been done or the list is empty.
2886 		 * This one is done, remove it from the list.
2887 		 */
2888 		list_remove(&mig->mig_list, mi);
2889 		mutex_exit(&mig->mig_lock);
2890 		zone_rele(mi->mi_zone);
2891 		/*
2892 		 * Release hold on vfs and mi done to prevent race with zone
2893 		 * shutdown. This releases the hold in nfs4_mi_zonelist_add.
2894 		 */
2895 		VFS_RELE(mi->mi_vfsp);
2896 		MI4_RELE(mi);
2897 	}
2898 	/*
2899 	 * Tell each renew thread in the zone to exit
2900 	 */
2901 	mutex_enter(&nfs4_server_lst_lock);
2902 	for (np = nfs4_server_lst.forw; np != &nfs4_server_lst; np = np->forw) {
2903 		mutex_enter(&np->s_lock);
2904 		if (np->zoneid == zoneid) {
2905 			/*
2906 			 * We add another hold onto the nfs4_server_t
2907 			 * because this will make sure tha the nfs4_server_t
2908 			 * stays around until nfs4_callback_fini_zone destroys
2909 			 * the zone. This way, the renew thread can
2910 			 * unconditionally release its holds on the
2911 			 * nfs4_server_t.
2912 			 */
2913 			np->s_refcnt++;
2914 			nfs4_mark_srv_dead(np);
2915 		}
2916 		mutex_exit(&np->s_lock);
2917 	}
2918 	mutex_exit(&nfs4_server_lst_lock);
2919 }
2920 
2921 static void
2922 nfs4_mi_free_globals(struct mi4_globals *mig)
2923 {
2924 	list_destroy(&mig->mig_list);	/* makes sure the list is empty */
2925 	mutex_destroy(&mig->mig_lock);
2926 	kmem_free(mig, sizeof (*mig));
2927 }
2928 
2929 /* ARGSUSED */
2930 static void
2931 nfs4_mi_destroy(zoneid_t zoneid, void *data)
2932 {
2933 	struct mi4_globals *mig = data;
2934 
2935 	NFS4_DEBUG(nfs4_client_zone_debug, (CE_NOTE,
2936 	    "nfs4_mi_destroy zone %d\n", zoneid));
2937 	ASSERT(mig != NULL);
2938 	mutex_enter(&mig->mig_lock);
2939 	if (list_head(&mig->mig_list) != NULL) {
2940 		/* Still waiting for VFS_FREEVFS() */
2941 		mig->mig_destructor_called = B_TRUE;
2942 		mutex_exit(&mig->mig_lock);
2943 		return;
2944 	}
2945 	nfs4_mi_free_globals(mig);
2946 }
2947 
2948 /*
2949  * Add an NFS mount to the per-zone list of NFS mounts.
2950  */
2951 void
2952 nfs4_mi_zonelist_add(mntinfo4_t *mi)
2953 {
2954 	struct mi4_globals *mig;
2955 
2956 	mig = zone_getspecific(mi4_list_key, mi->mi_zone);
2957 	mutex_enter(&mig->mig_lock);
2958 	list_insert_head(&mig->mig_list, mi);
2959 	/*
2960 	 * hold added to eliminate race with zone shutdown -this will be
2961 	 * released in mi_shutdown
2962 	 */
2963 	MI4_HOLD(mi);
2964 	VFS_HOLD(mi->mi_vfsp);
2965 	mutex_exit(&mig->mig_lock);
2966 }
2967 
2968 /*
2969  * Remove an NFS mount from the per-zone list of NFS mounts.
2970  */
2971 int
2972 nfs4_mi_zonelist_remove(mntinfo4_t *mi)
2973 {
2974 	struct mi4_globals *mig;
2975 	int ret = 0;
2976 
2977 	mig = zone_getspecific(mi4_list_key, mi->mi_zone);
2978 	mutex_enter(&mig->mig_lock);
2979 	mutex_enter(&mi->mi_lock);
2980 	/* if this mi is marked dead, then the zone already released it */
2981 	if (!(mi->mi_flags & MI4_DEAD)) {
2982 		list_remove(&mig->mig_list, mi);
2983 
2984 		/* release the holds put on in zonelist_add(). */
2985 		VFS_RELE(mi->mi_vfsp);
2986 		MI4_RELE(mi);
2987 		ret = 1;
2988 	}
2989 	mutex_exit(&mi->mi_lock);
2990 
2991 	/*
2992 	 * We can be called asynchronously by VFS_FREEVFS() after the zone
2993 	 * shutdown/destroy callbacks have executed; if so, clean up the zone's
2994 	 * mi globals.
2995 	 */
2996 	if (list_head(&mig->mig_list) == NULL &&
2997 	    mig->mig_destructor_called == B_TRUE) {
2998 		nfs4_mi_free_globals(mig);
2999 		return (ret);
3000 	}
3001 	mutex_exit(&mig->mig_lock);
3002 	return (ret);
3003 }
3004 
3005 void
3006 nfs_free_mi4(mntinfo4_t *mi)
3007 {
3008 	nfs4_open_owner_t	*foop;
3009 	nfs4_oo_hash_bucket_t   *bucketp;
3010 	nfs4_debug_msg_t	*msgp;
3011 	int i;
3012 	servinfo4_t 		*svp;
3013 
3014 	mutex_enter(&mi->mi_lock);
3015 	ASSERT(mi->mi_recovthread == NULL);
3016 	ASSERT(mi->mi_flags & MI4_ASYNC_MGR_STOP);
3017 	mutex_exit(&mi->mi_lock);
3018 	mutex_enter(&mi->mi_async_lock);
3019 	ASSERT(mi->mi_threads == 0);
3020 	ASSERT(mi->mi_manager_thread == NULL);
3021 	mutex_exit(&mi->mi_async_lock);
3022 	svp = mi->mi_servers;
3023 	sv4_free(svp);
3024 	if (mi->mi_io_kstats) {
3025 		kstat_delete(mi->mi_io_kstats);
3026 		mi->mi_io_kstats = NULL;
3027 	}
3028 	if (mi->mi_ro_kstats) {
3029 		kstat_delete(mi->mi_ro_kstats);
3030 		mi->mi_ro_kstats = NULL;
3031 	}
3032 	if (mi->mi_recov_ksp) {
3033 		kstat_delete(mi->mi_recov_ksp);
3034 		mi->mi_recov_ksp = NULL;
3035 	}
3036 	mutex_enter(&mi->mi_msg_list_lock);
3037 	while (msgp = list_head(&mi->mi_msg_list)) {
3038 		list_remove(&mi->mi_msg_list, msgp);
3039 		nfs4_free_msg(msgp);
3040 	}
3041 	mutex_exit(&mi->mi_msg_list_lock);
3042 	list_destroy(&mi->mi_msg_list);
3043 	if (mi->mi_rootfh != NULL)
3044 		sfh4_rele(&mi->mi_rootfh);
3045 	if (mi->mi_srvparentfh != NULL)
3046 		sfh4_rele(&mi->mi_srvparentfh);
3047 	mutex_destroy(&mi->mi_lock);
3048 	mutex_destroy(&mi->mi_async_lock);
3049 	mutex_destroy(&mi->mi_msg_list_lock);
3050 	nfs_rw_destroy(&mi->mi_recovlock);
3051 	nfs_rw_destroy(&mi->mi_rename_lock);
3052 	nfs_rw_destroy(&mi->mi_fh_lock);
3053 	cv_destroy(&mi->mi_failover_cv);
3054 	cv_destroy(&mi->mi_async_reqs_cv);
3055 	cv_destroy(&mi->mi_async_work_cv);
3056 	cv_destroy(&mi->mi_async_cv);
3057 	cv_destroy(&mi->mi_inact_req_cv);
3058 	/*
3059 	 * Destroy the oo hash lists and mutexes for the cred hash table.
3060 	 */
3061 	for (i = 0; i < NFS4_NUM_OO_BUCKETS; i++) {
3062 		bucketp = &(mi->mi_oo_list[i]);
3063 		/* Destroy any remaining open owners on the list */
3064 		foop = list_head(&bucketp->b_oo_hash_list);
3065 		while (foop != NULL) {
3066 			list_remove(&bucketp->b_oo_hash_list, foop);
3067 			nfs4_destroy_open_owner(foop);
3068 			foop = list_head(&bucketp->b_oo_hash_list);
3069 		}
3070 		list_destroy(&bucketp->b_oo_hash_list);
3071 		mutex_destroy(&bucketp->b_lock);
3072 	}
3073 	/*
3074 	 * Empty and destroy the freed open owner list.
3075 	 */
3076 	foop = list_head(&mi->mi_foo_list);
3077 	while (foop != NULL) {
3078 		list_remove(&mi->mi_foo_list, foop);
3079 		nfs4_destroy_open_owner(foop);
3080 		foop = list_head(&mi->mi_foo_list);
3081 	}
3082 	list_destroy(&mi->mi_foo_list);
3083 	list_destroy(&mi->mi_bseqid_list);
3084 	list_destroy(&mi->mi_lost_state);
3085 	avl_destroy(&mi->mi_filehandles);
3086 	fn_rele(&mi->mi_fname);
3087 	kmem_free(mi, sizeof (*mi));
3088 }
/*
 * Take a reference on the given mntinfo4_t by atomically incrementing
 * mi_count.  The matching release is mi_rele().
 */
void
mi_hold(mntinfo4_t *mi)
{
	atomic_add_32(&mi->mi_count, 1);
	/* a count of zero right after an increment means it wrapped */
	ASSERT(mi->mi_count != 0);
}
3095 
3096 void
3097 mi_rele(mntinfo4_t *mi)
3098 {
3099 	ASSERT(mi->mi_count != 0);
3100 	if (atomic_add_32_nv(&mi->mi_count, -1) == 0) {
3101 		nfs_free_mi4(mi);
3102 	}
3103 }
3104 
/*
 * The "notsupp" xattr cache vnode.  nfs4_clnt_init() sets its v_count to 1
 * so that it never goes away.
 */
vnode_t    nfs4_xattr_notsupp_vnode;
3106 
/*
 * Initialization for the NFSv4 client: runs the per-subsystem init
 * routines, registers the CPR (suspend/resume) callback, creates the zone
 * key used for the per-zone mntinfo4 lists, and pins the notsupp xattr
 * vnode.  The counterpart is nfs4_clnt_fini().
 */
void
nfs4_clnt_init(void)
{
	nfs4_vnops_init();
	(void) nfs4_rnode_init();
	(void) nfs4_shadow_init();
	(void) nfs4_acache_init();
	(void) nfs4_subr_init();
	nfs4_acl_init();
	nfs_idmap_init();
	nfs4_callback_init();
	nfs4_secinfo_init();
#ifdef	DEBUG
	/* thread-specific data key, only present in DEBUG builds */
	tsd_create(&nfs4_tsd_key, NULL);
#endif

	/*
	 * Add a CPR callback so that we can update client
	 * lease after a suspend and resume.
	 */
	cid = callb_add(nfs4_client_cpr_callb, 0, CB_CL_CPR_RPC, "nfs4");

	/* per-zone init/shutdown/destroy hooks for the mntinfo4 list */
	zone_key_create(&mi4_list_key, nfs4_mi_init, nfs4_mi_shutdown,
	    nfs4_mi_destroy);

	/*
	 * Initialise the reference count of the notsupp xattr cache vnode to 1
	 * so that it never goes away (VOP_INACTIVE isn't called on it).
	 */
	nfs4_xattr_notsupp_vnode.v_count = 1;
}
3138 
/*
 * Teardown for the NFSv4 client: deletes the zone key, runs the
 * per-subsystem fini routines, destroys the DEBUG-only tsd key, and
 * removes the CPR callback if one was registered by nfs4_clnt_init().
 */
void
nfs4_clnt_fini(void)
{
	(void) zone_key_delete(mi4_list_key);
	nfs4_vnops_fini();
	(void) nfs4_rnode_fini();
	(void) nfs4_shadow_fini();
	(void) nfs4_acache_fini();
	(void) nfs4_subr_fini();
	nfs_idmap_fini();
	nfs4_callback_fini();
	nfs4_secinfo_fini();
#ifdef	DEBUG
	tsd_destroy(&nfs4_tsd_key);
#endif
	/* only delete the CPR callback if callb_add() gave us an id */
	if (cid)
		(void) callb_delete(cid);
}
3157 
3158 /*ARGSUSED*/
3159 static boolean_t
3160 nfs4_client_cpr_callb(void *arg, int code)
3161 {
3162 	/*
3163 	 * We get called for Suspend and Resume events.
3164 	 * For the suspend case we simply don't care!
3165 	 */
3166 	if (code == CB_CODE_CPR_CHKPT) {
3167 		return (B_TRUE);
3168 	}
3169 
3170 	/*
3171 	 * When we get to here we are in the process of
3172 	 * resuming the system from a previous suspend.
3173 	 */
3174 	nfs4_client_resumed = gethrestime_sec();
3175 	return (B_TRUE);
3176 }
3177 
3178 void
3179 nfs4_renew_lease_thread(nfs4_server_t *sp)
3180 {
3181 	int	error = 0;
3182 	time_t	tmp_last_renewal_time, tmp_time, tmp_now_time, kip_secs;
3183 	clock_t	tick_delay = 0;
3184 	clock_t time_left = 0;
3185 	callb_cpr_t cpr_info;
3186 	kmutex_t cpr_lock;
3187 
3188 	NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE,
3189 	    "nfs4_renew_lease_thread: acting on sp 0x%p", (void*)sp));
3190 	mutex_init(&cpr_lock, NULL, MUTEX_DEFAULT, NULL);
3191 	CALLB_CPR_INIT(&cpr_info, &cpr_lock, callb_generic_cpr, "nfsv4Lease");
3192 
3193 	mutex_enter(&sp->s_lock);
3194 	/* sp->s_lease_time is set via a GETATTR */
3195 	sp->last_renewal_time = gethrestime_sec();
3196 	sp->lease_valid = NFS4_LEASE_UNINITIALIZED;
3197 	ASSERT(sp->s_refcnt >= 1);
3198 
3199 	for (;;) {
3200 		if (!sp->state_ref_count ||
3201 		    sp->lease_valid != NFS4_LEASE_VALID) {
3202 
3203 			kip_secs = MAX((sp->s_lease_time >> 1) -
3204 			    (3 * sp->propagation_delay.tv_sec), 1);
3205 
3206 			tick_delay = SEC_TO_TICK(kip_secs);
3207 
3208 			NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE,
3209 			    "nfs4_renew_lease_thread: no renew : thread "
3210 			    "wait %ld secs", kip_secs));
3211 
3212 			NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE,
3213 			    "nfs4_renew_lease_thread: no renew : "
3214 			    "state_ref_count %d, lease_valid %d",
3215 			    sp->state_ref_count, sp->lease_valid));
3216 
3217 			mutex_enter(&cpr_lock);
3218 			CALLB_CPR_SAFE_BEGIN(&cpr_info);
3219 			mutex_exit(&cpr_lock);
3220 			time_left = cv_timedwait(&sp->cv_thread_exit,
3221 			    &sp->s_lock, tick_delay + lbolt);
3222 			mutex_enter(&cpr_lock);
3223 			CALLB_CPR_SAFE_END(&cpr_info, &cpr_lock);
3224 			mutex_exit(&cpr_lock);
3225 
3226 			NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE,
3227 			    "nfs4_renew_lease_thread: no renew: "
3228 			    "time left %ld", time_left));
3229 
3230 			if (sp->s_thread_exit == NFS4_THREAD_EXIT)
3231 				goto die;
3232 			continue;
3233 		}
3234 
3235 		tmp_last_renewal_time = sp->last_renewal_time;
3236 
3237 		tmp_time = gethrestime_sec() - sp->last_renewal_time +
3238 		    (3 * sp->propagation_delay.tv_sec);
3239 
3240 		NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE,
3241 		    "nfs4_renew_lease_thread: tmp_time %ld, "
3242 		    "sp->last_renewal_time %ld", tmp_time,
3243 		    sp->last_renewal_time));
3244 
3245 		kip_secs = MAX((sp->s_lease_time >> 1) - tmp_time, 1);
3246 
3247 		tick_delay = SEC_TO_TICK(kip_secs);
3248 
3249 		NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE,
3250 		    "nfs4_renew_lease_thread: valid lease: sleep for %ld "
3251 		    "secs", kip_secs));
3252 
3253 		mutex_enter(&cpr_lock);
3254 		CALLB_CPR_SAFE_BEGIN(&cpr_info);
3255 		mutex_exit(&cpr_lock);
3256 		time_left = cv_timedwait(&sp->cv_thread_exit, &sp->s_lock,
3257 		    tick_delay + lbolt);
3258 		mutex_enter(&cpr_lock);
3259 		CALLB_CPR_SAFE_END(&cpr_info, &cpr_lock);
3260 		mutex_exit(&cpr_lock);
3261 
3262 		NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE,
3263 		    "nfs4_renew_lease_thread: valid lease: time left %ld :"
3264 		    "sp last_renewal_time %ld, nfs4_client_resumed %ld, "
3265 		    "tmp_last_renewal_time %ld", time_left,
3266 		    sp->last_renewal_time, nfs4_client_resumed,
3267 		    tmp_last_renewal_time));
3268 
3269 		if (sp->s_thread_exit == NFS4_THREAD_EXIT)
3270 			goto die;
3271 
3272 		if (tmp_last_renewal_time == sp->last_renewal_time ||
3273 		    (nfs4_client_resumed != 0 &&
3274 		    nfs4_client_resumed > sp->last_renewal_time)) {
3275 			/*
3276 			 * Issue RENEW op since we haven't renewed the lease
3277 			 * since we slept.
3278 			 */
3279 			tmp_now_time = gethrestime_sec();
3280 			error = nfs4renew(sp);
3281 			/*
3282 			 * Need to re-acquire sp's lock, nfs4renew()
3283 			 * relinqueshes it.
3284 			 */
3285 			mutex_enter(&sp->s_lock);
3286 
3287 			/*
3288 			 * See if someone changed s_thread_exit while we gave
3289 			 * up s_lock.
3290 			 */
3291 			if (sp->s_thread_exit == NFS4_THREAD_EXIT)
3292 				goto die;
3293 
3294 			if (!error) {
3295 				/*
3296 				 * check to see if we implicitly renewed while
3297 				 * we waited for a reply for our RENEW call.
3298 				 */
3299 				if (tmp_last_renewal_time ==
3300 				    sp->last_renewal_time) {
3301 					/* no implicit renew came */
3302 					sp->last_renewal_time = tmp_now_time;
3303 				} else {
3304 					NFS4_DEBUG(nfs4_client_lease_debug,
3305 					    (CE_NOTE, "renew_thread: did "
3306 					    "implicit renewal before reply "
3307 					    "from server for RENEW"));
3308 				}
3309 			} else {
3310 				/* figure out error */
3311 				NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE,
3312 				    "renew_thread: nfs4renew returned error"
3313 				    " %d", error));
3314 			}
3315 
3316 		}
3317 	}
3318 
3319 die:
3320 	NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE,
3321 	    "nfs4_renew_lease_thread: thread exiting"));
3322 
3323 	while (sp->s_otw_call_count != 0) {
3324 		NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE,
3325 		    "nfs4_renew_lease_thread: waiting for outstanding "
3326 		    "otw calls to finish for sp 0x%p, current "
3327 		    "s_otw_call_count %d", (void *)sp,
3328 		    sp->s_otw_call_count));
3329 		mutex_enter(&cpr_lock);
3330 		CALLB_CPR_SAFE_BEGIN(&cpr_info);
3331 		mutex_exit(&cpr_lock);
3332 		cv_wait(&sp->s_cv_otw_count, &sp->s_lock);
3333 		mutex_enter(&cpr_lock);
3334 		CALLB_CPR_SAFE_END(&cpr_info, &cpr_lock);
3335 		mutex_exit(&cpr_lock);
3336 	}
3337 	mutex_exit(&sp->s_lock);
3338 
3339 	nfs4_server_rele(sp);		/* free the thread's reference */
3340 	nfs4_server_rele(sp);		/* free the list's reference */
3341 	sp = NULL;
3342 
3343 done:
3344 	mutex_enter(&cpr_lock);
3345 	CALLB_CPR_EXIT(&cpr_info);	/* drops cpr_lock */
3346 	mutex_destroy(&cpr_lock);
3347 
3348 	NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE,
3349 	    "nfs4_renew_lease_thread: renew thread exit officially"));
3350 
3351 	zthread_exit();
3352 	/* NOT REACHED */
3353 }
3354 
3355 /*
3356  * Send out a RENEW op to the server.
3357  * Assumes sp is locked down.
3358  */
3359 static int
3360 nfs4renew(nfs4_server_t *sp)
3361 {
3362 	COMPOUND4args_clnt args;
3363 	COMPOUND4res_clnt res;
3364 	nfs_argop4 argop[1];
3365 	int doqueue = 1;
3366 	int rpc_error;
3367 	cred_t *cr;
3368 	mntinfo4_t *mi;
3369 	timespec_t prop_time, after_time;
3370 	int needrecov = FALSE;
3371 	nfs4_recov_state_t recov_state;
3372 	nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };
3373 
3374 	NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE, "nfs4renew"));
3375 
3376 	recov_state.rs_flags = 0;
3377 	recov_state.rs_num_retry_despite_err = 0;
3378 
3379 recov_retry:
3380 	mi = sp->mntinfo4_list;
3381 	VFS_HOLD(mi->mi_vfsp);
3382 	mutex_exit(&sp->s_lock);
3383 	ASSERT(mi != NULL);
3384 
3385 	e.error = nfs4_start_op(mi, NULL, NULL, &recov_state);
3386 	if (e.error) {
3387 		VFS_RELE(mi->mi_vfsp);
3388 		return (e.error);
3389 	}
3390 
3391 	/* Check to see if we're dealing with a marked-dead sp */
3392 	mutex_enter(&sp->s_lock);
3393 	if (sp->s_thread_exit == NFS4_THREAD_EXIT) {
3394 		mutex_exit(&sp->s_lock);
3395 		nfs4_end_op(mi, NULL, NULL, &recov_state, needrecov);
3396 		VFS_RELE(mi->mi_vfsp);
3397 		return (0);
3398 	}
3399 
3400 	/* Make sure mi hasn't changed on us */
3401 	if (mi != sp->mntinfo4_list) {
3402 		/* Must drop sp's lock to avoid a recursive mutex enter */
3403 		mutex_exit(&sp->s_lock);
3404 		nfs4_end_op(mi, NULL, NULL, &recov_state, needrecov);
3405 		VFS_RELE(mi->mi_vfsp);
3406 		mutex_enter(&sp->s_lock);
3407 		goto recov_retry;
3408 	}
3409 	mutex_exit(&sp->s_lock);
3410 
3411 	args.ctag = TAG_RENEW;
3412 
3413 	args.array_len = 1;
3414 	args.array = argop;
3415 
3416 	argop[0].argop = OP_RENEW;
3417 
3418 	mutex_enter(&sp->s_lock);
3419 	argop[0].nfs_argop4_u.oprenew.clientid = sp->clientid;
3420 	cr = sp->s_cred;
3421 	crhold(cr);
3422 	mutex_exit(&sp->s_lock);
3423 
3424 	ASSERT(cr != NULL);
3425 
3426 	/* used to figure out RTT for sp */
3427 	gethrestime(&prop_time);
3428 
3429 	NFS4_DEBUG(nfs4_client_call_debug, (CE_NOTE,
3430 	    "nfs4renew: %s call, sp 0x%p", needrecov ? "recov" : "first",
3431 	    (void*)sp));
3432 	NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE, "before: %ld s %ld ns ",
3433 	    prop_time.tv_sec, prop_time.tv_nsec));
3434 
3435 	DTRACE_PROBE2(nfs4__renew__start, nfs4_server_t *, sp,
3436 	    mntinfo4_t *, mi);
3437 
3438 	rfs4call(mi, &args, &res, cr, &doqueue, 0, &e);
3439 	crfree(cr);
3440 
3441 	DTRACE_PROBE2(nfs4__renew__end, nfs4_server_t *, sp,
3442 	    mntinfo4_t *, mi);
3443 
3444 	gethrestime(&after_time);
3445 
3446 	mutex_enter(&sp->s_lock);
3447 	sp->propagation_delay.tv_sec =
3448 	    MAX(1, after_time.tv_sec - prop_time.tv_sec);
3449 	mutex_exit(&sp->s_lock);
3450 
3451 	NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE, "after : %ld s %ld ns ",
3452 	    after_time.tv_sec, after_time.tv_nsec));
3453 
3454 	if (e.error == 0 && res.status == NFS4ERR_CB_PATH_DOWN) {
3455 		(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
3456 		nfs4_delegreturn_all(sp);
3457 		nfs4_end_op(mi, NULL, NULL, &recov_state, needrecov);
3458 		VFS_RELE(mi->mi_vfsp);
3459 		/*
3460 		 * If the server returns CB_PATH_DOWN, it has renewed
3461 		 * the lease and informed us that the callback path is
3462 		 * down.  Since the lease is renewed, just return 0 and
3463 		 * let the renew thread proceed as normal.
3464 		 */
3465 		return (0);
3466 	}
3467 
3468 	needrecov = nfs4_needs_recovery(&e, FALSE, mi->mi_vfsp);
3469 	if (!needrecov && e.error) {
3470 		nfs4_end_op(mi, NULL, NULL, &recov_state, needrecov);
3471 		VFS_RELE(mi->mi_vfsp);
3472 		return (e.error);
3473 	}
3474 
3475 	rpc_error = e.error;
3476 
3477 	if (needrecov) {
3478 		NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
3479 		    "nfs4renew: initiating recovery\n"));
3480 
3481 		if (nfs4_start_recovery(&e, mi, NULL, NULL, NULL, NULL,
3482 		    OP_RENEW, NULL) == FALSE) {
3483 			nfs4_end_op(mi, NULL, NULL, &recov_state, needrecov);
3484 			VFS_RELE(mi->mi_vfsp);
3485 			if (!e.error)
3486 				(void) xdr_free(xdr_COMPOUND4res_clnt,
3487 				    (caddr_t)&res);
3488 			mutex_enter(&sp->s_lock);
3489 			goto recov_retry;
3490 		}
3491 		/* fall through for res.status case */
3492 	}
3493 
3494 	if (res.status) {
3495 		if (res.status == NFS4ERR_LEASE_MOVED) {
3496 			/*EMPTY*/
3497 			/*
3498 			 * XXX need to try every mntinfo4 in sp->mntinfo4_list
3499 			 * to renew the lease on that server
3500 			 */
3501 		}
3502 		e.error = geterrno4(res.status);
3503 	}
3504 
3505 	if (!rpc_error)
3506 		(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
3507 
3508 	nfs4_end_op(mi, NULL, NULL, &recov_state, needrecov);
3509 
3510 	VFS_RELE(mi->mi_vfsp);
3511 
3512 	return (e.error);
3513 }
3514 
3515 void
3516 nfs4_inc_state_ref_count(mntinfo4_t *mi)
3517 {
3518 	nfs4_server_t	*sp;
3519 
3520 	/* this locks down sp if it is found */
3521 	sp = find_nfs4_server(mi);
3522 
3523 	if (sp != NULL) {
3524 		nfs4_inc_state_ref_count_nolock(sp, mi);
3525 		mutex_exit(&sp->s_lock);
3526 		nfs4_server_rele(sp);
3527 	}
3528 }
3529 
3530 /*
3531  * Bump the number of OPEN files (ie: those with state) so we know if this
3532  * nfs4_server has any state to maintain a lease for or not.
3533  *
3534  * Also, marks the nfs4_server's lease valid if it hasn't been done so already.
3535  */
3536 void
3537 nfs4_inc_state_ref_count_nolock(nfs4_server_t *sp, mntinfo4_t *mi)
3538 {
3539 	ASSERT(mutex_owned(&sp->s_lock));
3540 
3541 	sp->state_ref_count++;
3542 	NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE,
3543 	    "nfs4_inc_state_ref_count: state_ref_count now %d",
3544 	    sp->state_ref_count));
3545 
3546 	if (sp->lease_valid == NFS4_LEASE_UNINITIALIZED)
3547 		sp->lease_valid = NFS4_LEASE_VALID;
3548 
3549 	/*
3550 	 * If this call caused the lease to be marked valid and/or
3551 	 * took the state_ref_count from 0 to 1, then start the time
3552 	 * on lease renewal.
3553 	 */
3554 	if (sp->lease_valid == NFS4_LEASE_VALID && sp->state_ref_count == 1)
3555 		sp->last_renewal_time = gethrestime_sec();
3556 
3557 	/* update the number of open files for mi */
3558 	mi->mi_open_files++;
3559 }
3560 
3561 void
3562 nfs4_dec_state_ref_count(mntinfo4_t *mi)
3563 {
3564 	nfs4_server_t	*sp;
3565 
3566 	/* this locks down sp if it is found */
3567 	sp = find_nfs4_server_all(mi, 1);
3568 
3569 	if (sp != NULL) {
3570 		nfs4_dec_state_ref_count_nolock(sp, mi);
3571 		mutex_exit(&sp->s_lock);
3572 		nfs4_server_rele(sp);
3573 	}
3574 }
3575 
3576 /*
3577  * Decrement the number of OPEN files (ie: those with state) so we know if
3578  * this nfs4_server has any state to maintain a lease for or not.
3579  */
3580 void
3581 nfs4_dec_state_ref_count_nolock(nfs4_server_t *sp, mntinfo4_t *mi)
3582 {
3583 	ASSERT(mutex_owned(&sp->s_lock));
3584 	ASSERT(sp->state_ref_count != 0);
3585 	sp->state_ref_count--;
3586 
3587 	NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE,
3588 	    "nfs4_dec_state_ref_count: state ref count now %d",
3589 	    sp->state_ref_count));
3590 
3591 	mi->mi_open_files--;
3592 	NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE,
3593 	    "nfs4_dec_state_ref_count: mi open files %d, v4 flags 0x%x",
3594 	    mi->mi_open_files, mi->mi_flags));
3595 
3596 	/* We don't have to hold the mi_lock to test mi_flags */
3597 	if (mi->mi_open_files == 0 &&
3598 	    (mi->mi_flags & MI4_REMOVE_ON_LAST_CLOSE)) {
3599 		NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE,
3600 		    "nfs4_dec_state_ref_count: remove mntinfo4 %p since "
3601 		    "we have closed the last open file", (void*)mi));
3602 		nfs4_remove_mi_from_server(mi, sp);
3603 	}
3604 }
3605 
3606 bool_t
3607 inlease(nfs4_server_t *sp)
3608 {
3609 	bool_t result;
3610 
3611 	ASSERT(mutex_owned(&sp->s_lock));
3612 
3613 	if (sp->lease_valid == NFS4_LEASE_VALID &&
3614 	    gethrestime_sec() < sp->last_renewal_time + sp->s_lease_time)
3615 		result = TRUE;
3616 	else
3617 		result = FALSE;
3618 
3619 	return (result);
3620 }
3621 
3622 
/*
 * Return non-zero if the given nfs4_server_t is going through recovery.
 * The test is whether sp->s_recovlock is currently held as a writer; the
 * result is simply what nfs_rw_lock_held() reports.
 */

int
nfs4_server_in_recovery(nfs4_server_t *sp)
{
	return (nfs_rw_lock_held(&sp->s_recovlock, RW_WRITER));
}
3632 
3633 /*
3634  * Compare two shared filehandle objects.  Returns -1, 0, or +1, if the
3635  * first is less than, equal to, or greater than the second.
3636  */
3637 
3638 int
3639 sfh4cmp(const void *p1, const void *p2)
3640 {
3641 	const nfs4_sharedfh_t *sfh1 = (const nfs4_sharedfh_t *)p1;
3642 	const nfs4_sharedfh_t *sfh2 = (const nfs4_sharedfh_t *)p2;
3643 
3644 	return (nfs4cmpfh(&sfh1->sfh_fh, &sfh2->sfh_fh));
3645 }
3646 
/*
 * Create a table for shared filehandle objects.  The table is an AVL tree
 * of nfs4_sharedfh_t, ordered by sfh4cmp() and linked through each
 * object's sfh_tree member.
 */

void
sfh4_createtab(avl_tree_t *tab)
{
	avl_create(tab, sfh4cmp, sizeof (nfs4_sharedfh_t),
	    offsetof(nfs4_sharedfh_t, sfh_tree));
}
3657 
3658 /*
3659  * Return a shared filehandle object for the given filehandle.  The caller
3660  * is responsible for eventually calling sfh4_rele().
3661  */
3662 
3663 nfs4_sharedfh_t *
3664 sfh4_put(const nfs_fh4 *fh, mntinfo4_t *mi, nfs4_sharedfh_t *key)
3665 {
3666 	nfs4_sharedfh_t *sfh, *nsfh;
3667 	avl_index_t where;
3668 	nfs4_sharedfh_t skey;
3669 
3670 	if (!key) {
3671 		skey.sfh_fh = *fh;
3672 		key = &skey;
3673 	}
3674 
3675 	nsfh = kmem_alloc(sizeof (nfs4_sharedfh_t), KM_SLEEP);
3676 	nsfh->sfh_fh.nfs_fh4_len = fh->nfs_fh4_len;
3677 	/*
3678 	 * We allocate the largest possible filehandle size because it's
3679 	 * not that big, and it saves us from possibly having to resize the
3680 	 * buffer later.
3681 	 */
3682 	nsfh->sfh_fh.nfs_fh4_val = kmem_alloc(NFS4_FHSIZE, KM_SLEEP);
3683 	bcopy(fh->nfs_fh4_val, nsfh->sfh_fh.nfs_fh4_val, fh->nfs_fh4_len);
3684 	mutex_init(&nsfh->sfh_lock, NULL, MUTEX_DEFAULT, NULL);
3685 	nsfh->sfh_refcnt = 1;
3686 	nsfh->sfh_flags = SFH4_IN_TREE;
3687 	nsfh->sfh_mi = mi;
3688 	NFS4_DEBUG(nfs4_sharedfh_debug, (CE_NOTE, "sfh4_get: new object (%p)",
3689 	    (void *)nsfh));
3690 
3691 	(void) nfs_rw_enter_sig(&mi->mi_fh_lock, RW_WRITER, 0);
3692 	sfh = avl_find(&mi->mi_filehandles, key, &where);
3693 	if (sfh != NULL) {
3694 		mutex_enter(&sfh->sfh_lock);
3695 		sfh->sfh_refcnt++;
3696 		mutex_exit(&sfh->sfh_lock);
3697 		nfs_rw_exit(&mi->mi_fh_lock);
3698 		/* free our speculative allocs */
3699 		kmem_free(nsfh->sfh_fh.nfs_fh4_val, NFS4_FHSIZE);
3700 		kmem_free(nsfh, sizeof (nfs4_sharedfh_t));
3701 		return (sfh);
3702 	}
3703 
3704 	avl_insert(&mi->mi_filehandles, nsfh, where);
3705 	nfs_rw_exit(&mi->mi_fh_lock);
3706 
3707 	return (nsfh);
3708 }
3709 
3710 /*
3711  * Return a shared filehandle object for the given filehandle.  The caller
3712  * is responsible for eventually calling sfh4_rele().
3713  */
3714 
3715 nfs4_sharedfh_t *
3716 sfh4_get(const nfs_fh4 *fh, mntinfo4_t *mi)
3717 {
3718 	nfs4_sharedfh_t *sfh;
3719 	nfs4_sharedfh_t key;
3720 
3721 	ASSERT(fh->nfs_fh4_len <= NFS4_FHSIZE);
3722 
3723 #ifdef DEBUG
3724 	if (nfs4_sharedfh_debug) {
3725 		nfs4_fhandle_t fhandle;
3726 
3727 		fhandle.fh_len = fh->nfs_fh4_len;
3728 		bcopy(fh->nfs_fh4_val, fhandle.fh_buf, fhandle.fh_len);
3729 		zcmn_err(mi->mi_zone->zone_id, CE_NOTE, "sfh4_get:");
3730 		nfs4_printfhandle(&fhandle);
3731 	}
3732 #endif
3733 
3734 	/*
3735 	 * If there's already an object for the given filehandle, bump the
3736 	 * reference count and return it.  Otherwise, create a new object
3737 	 * and add it to the AVL tree.
3738 	 */
3739 
3740 	key.sfh_fh = *fh;
3741 
3742 	(void) nfs_rw_enter_sig(&mi->mi_fh_lock, RW_READER, 0);
3743 	sfh = avl_find(&mi->mi_filehandles, &key, NULL);
3744 	if (sfh != NULL) {
3745 		mutex_enter(&sfh->sfh_lock);
3746 		sfh->sfh_refcnt++;
3747 		NFS4_DEBUG(nfs4_sharedfh_debug, (CE_NOTE,
3748 		    "sfh4_get: found existing %p, new refcnt=%d",
3749 		    (void *)sfh, sfh->sfh_refcnt));
3750 		mutex_exit(&sfh->sfh_lock);
3751 		nfs_rw_exit(&mi->mi_fh_lock);
3752 		return (sfh);
3753 	}
3754 	nfs_rw_exit(&mi->mi_fh_lock);
3755 
3756 	return (sfh4_put(fh, mi, &key));
3757 }
3758 
/*
 * Get a reference to the given shared filehandle object.  The object must
 * already have at least one reference; the count is incremented under
 * sfh_lock.  Release with sfh4_rele().
 */

void
sfh4_hold(nfs4_sharedfh_t *sfh)
{
	/* checked before the lock is taken */
	ASSERT(sfh->sfh_refcnt > 0);

	mutex_enter(&sfh->sfh_lock);
	sfh->sfh_refcnt++;
	NFS4_DEBUG(nfs4_sharedfh_debug,
	    (CE_NOTE, "sfh4_hold %p, new refcnt=%d",
	    (void *)sfh, sfh->sfh_refcnt));
	mutex_exit(&sfh->sfh_lock);
}
3775 
3776 /*
3777  * Release a reference to the given shared filehandle object and null out
3778  * the given pointer.
3779  */
3780 
3781 void
3782 sfh4_rele(nfs4_sharedfh_t **sfhpp)
3783 {
3784 	mntinfo4_t *mi;
3785 	nfs4_sharedfh_t *sfh = *sfhpp;
3786 
3787 	ASSERT(sfh->sfh_refcnt > 0);
3788 
3789 	mutex_enter(&sfh->sfh_lock);
3790 	if (sfh->sfh_refcnt > 1) {
3791 		sfh->sfh_refcnt--;
3792 		NFS4_DEBUG(nfs4_sharedfh_debug, (CE_NOTE,
3793 		    "sfh4_rele %p, new refcnt=%d",
3794 		    (void *)sfh, sfh->sfh_refcnt));
3795 		mutex_exit(&sfh->sfh_lock);
3796 		goto finish;
3797 	}
3798 	mutex_exit(&sfh->sfh_lock);
3799 
3800 	/*
3801 	 * Possibly the last reference, so get the lock for the table in
3802 	 * case it's time to remove the object from the table.
3803 	 */
3804 	mi = sfh->sfh_mi;
3805 	(void) nfs_rw_enter_sig(&mi->mi_fh_lock, RW_WRITER, 0);
3806 	mutex_enter(&sfh->sfh_lock);
3807 	sfh->sfh_refcnt--;
3808 	if (sfh->sfh_refcnt > 0) {
3809 		NFS4_DEBUG(nfs4_sharedfh_debug, (CE_NOTE,
3810 		    "sfh4_rele %p, new refcnt=%d",
3811 		    (void *)sfh, sfh->sfh_refcnt));
3812 		mutex_exit(&sfh->sfh_lock);
3813 		nfs_rw_exit(&mi->mi_fh_lock);
3814 		goto finish;
3815 	}
3816 
3817 	NFS4_DEBUG(nfs4_sharedfh_debug, (CE_NOTE,
3818 	    "sfh4_rele %p, last ref", (void *)sfh));
3819 	if (sfh->sfh_flags & SFH4_IN_TREE) {
3820 		avl_remove(&mi->mi_filehandles, sfh);
3821 		sfh->sfh_flags &= ~SFH4_IN_TREE;
3822 	}
3823 	mutex_exit(&sfh->sfh_lock);
3824 	nfs_rw_exit(&mi->mi_fh_lock);
3825 	mutex_destroy(&sfh->sfh_lock);
3826 	kmem_free(sfh->sfh_fh.nfs_fh4_val, NFS4_FHSIZE);
3827 	kmem_free(sfh, sizeof (nfs4_sharedfh_t));
3828 
3829 finish:
3830 	*sfhpp = NULL;
3831 }
3832 
3833 /*
3834  * Update the filehandle for the given shared filehandle object.
3835  */
3836 
3837 int nfs4_warn_dupfh = 0;	/* if set, always warn about dup fhs below */
3838 
3839 void
3840 sfh4_update(nfs4_sharedfh_t *sfh, const nfs_fh4 *newfh)
3841 {
3842 	mntinfo4_t *mi = sfh->sfh_mi;
3843 	nfs4_sharedfh_t *dupsfh;
3844 	avl_index_t where;
3845 	nfs4_sharedfh_t key;
3846 
3847 #ifdef DEBUG
3848 	mutex_enter(&sfh->sfh_lock);
3849 	ASSERT(sfh->sfh_refcnt > 0);
3850 	mutex_exit(&sfh->sfh_lock);
3851 #endif
3852 	ASSERT(newfh->nfs_fh4_len <= NFS4_FHSIZE);
3853 
3854 	/*
3855 	 * The basic plan is to remove the shared filehandle object from
3856 	 * the table, update it to have the new filehandle, then reinsert
3857 	 * it.
3858 	 */
3859 
3860 	(void) nfs_rw_enter_sig(&mi->mi_fh_lock, RW_WRITER, 0);
3861 	mutex_enter(&sfh->sfh_lock);
3862 	if (sfh->sfh_flags & SFH4_IN_TREE) {
3863 		avl_remove(&mi->mi_filehandles, sfh);
3864 		sfh->sfh_flags &= ~SFH4_IN_TREE;
3865 	}
3866 	mutex_exit(&sfh->sfh_lock);
3867 	sfh->sfh_fh.nfs_fh4_len = newfh->nfs_fh4_len;
3868 	bcopy(newfh->nfs_fh4_val, sfh->sfh_fh.nfs_fh4_val,
3869 	    sfh->sfh_fh.nfs_fh4_len);
3870 
3871 	/*
3872 	 * XXX If there is already a shared filehandle object with the new
3873 	 * filehandle, we're in trouble, because the rnode code assumes
3874 	 * that there is only one shared filehandle object for a given
3875 	 * filehandle.  So issue a warning (for read-write mounts only)
3876 	 * and don't try to re-insert the given object into the table.
3877 	 * Hopefully the given object will quickly go away and everyone
3878 	 * will use the new object.
3879 	 */
3880 	key.sfh_fh = *newfh;
3881 	dupsfh = avl_find(&mi->mi_filehandles, &key, &where);
3882 	if (dupsfh != NULL) {
3883 		if (!(mi->mi_vfsp->vfs_flag & VFS_RDONLY) || nfs4_warn_dupfh) {
3884 			zcmn_err(mi->mi_zone->zone_id, CE_WARN, "sfh4_update: "
3885 			    "duplicate filehandle detected");
3886 			sfh4_printfhandle(dupsfh);
3887 		}
3888 	} else {
3889 		avl_insert(&mi->mi_filehandles, sfh, where);
3890 		mutex_enter(&sfh->sfh_lock);
3891 		sfh->sfh_flags |= SFH4_IN_TREE;
3892 		mutex_exit(&sfh->sfh_lock);
3893 	}
3894 	nfs_rw_exit(&mi->mi_fh_lock);
3895 }
3896 
3897 /*
3898  * Copy out the current filehandle for the given shared filehandle object.
3899  */
3900 
3901 void
3902 sfh4_copyval(const nfs4_sharedfh_t *sfh, nfs4_fhandle_t *fhp)
3903 {
3904 	mntinfo4_t *mi = sfh->sfh_mi;
3905 
3906 	ASSERT(sfh->sfh_refcnt > 0);
3907 
3908 	(void) nfs_rw_enter_sig(&mi->mi_fh_lock, RW_READER, 0);
3909 	fhp->fh_len = sfh->sfh_fh.nfs_fh4_len;
3910 	ASSERT(fhp->fh_len <= NFS4_FHSIZE);
3911 	bcopy(sfh->sfh_fh.nfs_fh4_val, fhp->fh_buf, fhp->fh_len);
3912 	nfs_rw_exit(&mi->mi_fh_lock);
3913 }
3914 
3915 /*
3916  * Print out the filehandle for the given shared filehandle object.
3917  */
3918 
3919 void
3920 sfh4_printfhandle(const nfs4_sharedfh_t *sfh)
3921 {
3922 	nfs4_fhandle_t fhandle;
3923 
3924 	sfh4_copyval(sfh, &fhandle);
3925 	nfs4_printfhandle(&fhandle);
3926 }
3927 
3928 /*
3929  * Compare 2 fnames.  Returns -1 if the first is "less" than the second, 0
3930  * if they're the same, +1 if the first is "greater" than the second.  The
3931  * caller (or whoever's calling the AVL package) is responsible for
3932  * handling locking issues.
3933  */
3934 
3935 static int
3936 fncmp(const void *p1, const void *p2)
3937 {
3938 	const nfs4_fname_t *f1 = p1;
3939 	const nfs4_fname_t *f2 = p2;
3940 	int res;
3941 
3942 	res = strcmp(f1->fn_name, f2->fn_name);
3943 	/*
3944 	 * The AVL package wants +/-1, not arbitrary positive or negative
3945 	 * integers.
3946 	 */
3947 	if (res > 0)
3948 		res = 1;
3949 	else if (res < 0)
3950 		res = -1;
3951 	return (res);
3952 }
3953 
3954 /*
3955  * Get or create an fname with the given name, as a child of the given
3956  * fname.  The caller is responsible for eventually releasing the reference
3957  * (fn_rele()).  parent may be NULL.
3958  */
3959 
3960 nfs4_fname_t *
3961 fn_get(nfs4_fname_t *parent, char *name)
3962 {
3963 	nfs4_fname_t key;
3964 	nfs4_fname_t *fnp;
3965 	avl_index_t where;
3966 
3967 	key.fn_name = name;
3968 
3969 	/*
3970 	 * If there's already an fname registered with the given name, bump
3971 	 * its reference count and return it.  Otherwise, create a new one
3972 	 * and add it to the parent's AVL tree.
3973 	 */
3974 
3975 	if (parent != NULL) {
3976 		mutex_enter(&parent->fn_lock);
3977 		fnp = avl_find(&parent->fn_children, &key, &where);
3978 		if (fnp != NULL) {
3979 			fn_hold(fnp);
3980 			mutex_exit(&parent->fn_lock);
3981 			return (fnp);
3982 		}
3983 	}
3984 
3985 	fnp = kmem_alloc(sizeof (nfs4_fname_t), KM_SLEEP);
3986 	mutex_init(&fnp->fn_lock, NULL, MUTEX_DEFAULT, NULL);
3987 	fnp->fn_parent = parent;
3988 	if (parent != NULL)
3989 		fn_hold(parent);
3990 	fnp->fn_len = strlen(name);
3991 	ASSERT(fnp->fn_len < MAXNAMELEN);
3992 	fnp->fn_name = kmem_alloc(fnp->fn_len + 1, KM_SLEEP);
3993 	(void) strcpy(fnp->fn_name, name);
3994 	fnp->fn_refcnt = 1;
3995 	avl_create(&fnp->fn_children, fncmp, sizeof (nfs4_fname_t),
3996 	    offsetof(nfs4_fname_t, fn_tree));
3997 	NFS4_DEBUG(nfs4_fname_debug, (CE_NOTE,
3998 	    "fn_get %p:%s, a new nfs4_fname_t!",
3999 	    (void *)fnp, fnp->fn_name));
4000 	if (parent != NULL) {
4001 		avl_insert(&parent->fn_children, fnp, where);
4002 		mutex_exit(&parent->fn_lock);
4003 	}
4004 
4005 	return (fnp);
4006 }
4007 
/*
 * Take a reference on the given fname by atomically incrementing
 * fn_refcnt.  Release with fn_rele().
 */
void
fn_hold(nfs4_fname_t *fnp)
{
	atomic_add_32(&fnp->fn_refcnt, 1);
	NFS4_DEBUG(nfs4_fname_debug, (CE_NOTE,
	    "fn_hold %p:%s, new refcnt=%d",
	    (void *)fnp, fnp->fn_name, fnp->fn_refcnt));
}
4016 
4017 /*
4018  * Decrement the reference count of the given fname, and destroy it if its
4019  * reference count goes to zero.  Nulls out the given pointer.
4020  */
4021 
4022 void
4023 fn_rele(nfs4_fname_t **fnpp)
4024 {
4025 	nfs4_fname_t *parent;
4026 	uint32_t newref;
4027 	nfs4_fname_t *fnp;
4028 
4029 recur:
4030 	fnp = *fnpp;
4031 	*fnpp = NULL;
4032 
4033 	mutex_enter(&fnp->fn_lock);
4034 	parent = fnp->fn_parent;
4035 	if (parent != NULL)
4036 		mutex_enter(&parent->fn_lock);	/* prevent new references */
4037 	newref = atomic_add_32_nv(&fnp->fn_refcnt, -1);
4038 	if (newref > 0) {
4039 		NFS4_DEBUG(nfs4_fname_debug, (CE_NOTE,
4040 		    "fn_rele %p:%s, new refcnt=%d",
4041 		    (void *)fnp, fnp->fn_name, fnp->fn_refcnt));
4042 		if (parent != NULL)
4043 			mutex_exit(&parent->fn_lock);
4044 		mutex_exit(&fnp->fn_lock);
4045 		return;
4046 	}
4047 
4048 	NFS4_DEBUG(nfs4_fname_debug, (CE_NOTE,
4049 	    "fn_rele %p:%s, last reference, deleting...",
4050 	    (void *)fnp, fnp->fn_name));
4051 	if (parent != NULL) {
4052 		avl_remove(&parent->fn_children, fnp);
4053 		mutex_exit(&parent->fn_lock);
4054 	}
4055 	kmem_free(fnp->fn_name, fnp->fn_len + 1);
4056 	mutex_destroy(&fnp->fn_lock);
4057 	avl_destroy(&fnp->fn_children);
4058 	kmem_free(fnp, sizeof (nfs4_fname_t));
4059 	/*
4060 	 * Recursivly fn_rele the parent.
4061 	 * Use goto instead of a recursive call to avoid stack overflow.
4062 	 */
4063 	if (parent != NULL) {
4064 		fnpp = &parent;
4065 		goto recur;
4066 	}
4067 }
4068 
4069 /*
4070  * Returns the single component name of the given fname, in a MAXNAMELEN
4071  * string buffer, which the caller is responsible for freeing.  Note that
4072  * the name may become invalid as a result of fn_move().
4073  */
4074 
4075 char *
4076 fn_name(nfs4_fname_t *fnp)
4077 {
4078 	char *name;
4079 
4080 	ASSERT(fnp->fn_len < MAXNAMELEN);
4081 	name = kmem_alloc(MAXNAMELEN, KM_SLEEP);
4082 	mutex_enter(&fnp->fn_lock);
4083 	(void) strcpy(name, fnp->fn_name);
4084 	mutex_exit(&fnp->fn_lock);
4085 
4086 	return (name);
4087 }
4088 
4089 
4090 /*
4091  * fn_path_realloc
4092  *
4093  * This function, used only by fn_path, constructs
4094  * a new string which looks like "prepend" + "/" + "current".
4095  * by allocating a new string and freeing the old one.
4096  */
4097 static void
4098 fn_path_realloc(char **curses, char *prepend)
4099 {
4100 	int len, curlen = 0;
4101 	char *news;
4102 
4103 	if (*curses == NULL) {
4104 		/*
4105 		 * Prime the pump, allocate just the
4106 		 * space for prepend and return that.
4107 		 */
4108 		len = strlen(prepend) + 1;
4109 		news = kmem_alloc(len, KM_SLEEP);
4110 		(void) strncpy(news, prepend, len);
4111 	} else {
4112 		/*
4113 		 * Allocate the space  for a new string
4114 		 * +1 +1 is for the "/" and the NULL
4115 		 * byte at the end of it all.
4116 		 */
4117 		curlen = strlen(*curses);
4118 		len = curlen + strlen(prepend) + 1 + 1;
4119 		news = kmem_alloc(len, KM_SLEEP);
4120 		(void) strncpy(news, prepend, len);
4121 		(void) strcat(news, "/");
4122 		(void) strcat(news, *curses);
4123 		kmem_free(*curses, curlen + 1);
4124 	}
4125 	*curses = news;
4126 }
4127 
4128 /*
4129  * Returns the path name (starting from the fs root) for the given fname.
4130  * The caller is responsible for freeing.  Note that the path may be or
4131  * become invalid as a result of fn_move().
4132  */
4133 
4134 char *
4135 fn_path(nfs4_fname_t *fnp)
4136 {
4137 	char *path;
4138 	nfs4_fname_t *nextfnp;
4139 
4140 	if (fnp == NULL)
4141 		return (NULL);
4142 
4143 	path = NULL;
4144 
4145 	/* walk up the tree constructing the pathname.  */
4146 
4147 	fn_hold(fnp);			/* adjust for later rele */
4148 	do {
4149 		mutex_enter(&fnp->fn_lock);
4150 		/*
4151 		 * Add fn_name in front of the current path
4152 		 */
4153 		fn_path_realloc(&path, fnp->fn_name);
4154 		nextfnp = fnp->fn_parent;
4155 		if (nextfnp != NULL)
4156 			fn_hold(nextfnp);
4157 		mutex_exit(&fnp->fn_lock);
4158 		fn_rele(&fnp);
4159 		fnp = nextfnp;
4160 	} while (fnp != NULL);
4161 
4162 	return (path);
4163 }
4164 
4165 /*
4166  * Return a reference to the parent of the given fname, which the caller is
4167  * responsible for eventually releasing.
4168  */
4169 
4170 nfs4_fname_t *
4171 fn_parent(nfs4_fname_t *fnp)
4172 {
4173 	nfs4_fname_t *parent;
4174 
4175 	mutex_enter(&fnp->fn_lock);
4176 	parent = fnp->fn_parent;
4177 	if (parent != NULL)
4178 		fn_hold(parent);
4179 	mutex_exit(&fnp->fn_lock);
4180 
4181 	return (parent);
4182 }
4183 
4184 /*
4185  * Update fnp so that its parent is newparent and its name is newname.
4186  */
4187 
4188 void
4189 fn_move(nfs4_fname_t *fnp, nfs4_fname_t *newparent, char *newname)
4190 {
4191 	nfs4_fname_t *parent, *tmpfnp;
4192 	ssize_t newlen;
4193 	nfs4_fname_t key;
4194 	avl_index_t where;
4195 
4196 	/*
4197 	 * This assert exists to catch the client trying to rename
4198 	 * a dir to be a child of itself.  This happened at a recent
4199 	 * bakeoff against a 3rd party (broken) server which allowed
4200 	 * the rename to succeed.  If it trips it means that:
4201 	 *	a) the code in nfs4rename that detects this case is broken
4202 	 *	b) the server is broken (since it allowed the bogus rename)
4203 	 *
4204 	 * For non-DEBUG kernels, prepare for a recursive mutex_enter
4205 	 * panic below from:  mutex_enter(&newparent->fn_lock);
4206 	 */
4207 	ASSERT(fnp != newparent);
4208 
4209 	/*
4210 	 * Remove fnp from its current parent, change its name, then add it
4211 	 * to newparent.
4212 	 */
4213 	mutex_enter(&fnp->fn_lock);
4214 	parent = fnp->fn_parent;
4215 	mutex_enter(&parent->fn_lock);
4216 	avl_remove(&parent->fn_children, fnp);
4217 	mutex_exit(&parent->fn_lock);
4218 	fn_rele(&fnp->fn_parent);
4219 
4220 	newlen = strlen(newname);
4221 	if (newlen != fnp->fn_len) {
4222 		ASSERT(newlen < MAXNAMELEN);
4223 		kmem_free(fnp->fn_name, fnp->fn_len + 1);
4224 		fnp->fn_name = kmem_alloc(newlen + 1, KM_SLEEP);
4225 		fnp->fn_len = newlen;
4226 	}
4227 	(void) strcpy(fnp->fn_name, newname);
4228 
4229 again:
4230 	mutex_enter(&newparent->fn_lock);
4231 	key.fn_name = fnp->fn_name;
4232 	tmpfnp = avl_find(&newparent->fn_children, &key, &where);
4233 	if (tmpfnp != NULL) {
4234 		/*
4235 		 * This could be due to a file that was unlinked while
4236 		 * open, or perhaps the rnode is in the free list.  Remove
4237 		 * it from newparent and let it go away on its own.  The
4238 		 * contorted code is to deal with lock order issues and
4239 		 * race conditions.
4240 		 */
4241 		fn_hold(tmpfnp);
4242 		mutex_exit(&newparent->fn_lock);
4243 		mutex_enter(&tmpfnp->fn_lock);
4244 		if (tmpfnp->fn_parent == newparent) {
4245 			mutex_enter(&newparent->fn_lock);
4246 			avl_remove(&newparent->fn_children, tmpfnp);
4247 			mutex_exit(&newparent->fn_lock);
4248 			fn_rele(&tmpfnp->fn_parent);
4249 		}
4250 		mutex_exit(&tmpfnp->fn_lock);
4251 		fn_rele(&tmpfnp);
4252 		goto again;
4253 	}
4254 	fnp->fn_parent = newparent;
4255 	fn_hold(newparent);
4256 	avl_insert(&newparent->fn_children, fnp, where);
4257 	mutex_exit(&newparent->fn_lock);
4258 	mutex_exit(&fnp->fn_lock);
4259 }
4260 
#ifdef DEBUG
/*
 * Sanity check the file type recorded for a vnode (DEBUG kernels only).
 *
 * Always returns 1.  When nfs4_vtype_debug is set and both the vnode
 * type and the rnode's cached attribute type are known (not VNON) but
 * disagree with each other, the system panics instead of returning.
 */
int
nfs4_consistent_type(vnode_t *vp)
{
	rnode4_t *rp = VTOR4(vp);

	/* Checking is switched off. */
	if (!nfs4_vtype_debug)
		return (1);

	/* Nothing to compare unless both types have been set. */
	if (vp->v_type == VNON || rp->r_attr.va_type == VNON)
		return (1);

	if (vp->v_type != rp->r_attr.va_type) {
		cmn_err(CE_PANIC, "vnode %p type mismatch; v_type=%d, "
		    "rnode attr type=%d", (void *)vp, vp->v_type,
		    rp->r_attr.va_type);
	}

	return (1);
}
#endif /* DEBUG */
4281