xref: /titanic_50/usr/src/uts/common/fs/nfs/nfs4_client.c (revision 08399995051c04db70a4f07eed94812105b52053)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 /*
27  *  	Copyright (c) 1983,1984,1985,1986,1987,1988,1989  AT&T.
28  *	All Rights Reserved
29  */
30 
31 #include <sys/param.h>
32 #include <sys/types.h>
33 #include <sys/systm.h>
34 #include <sys/thread.h>
35 #include <sys/t_lock.h>
36 #include <sys/time.h>
37 #include <sys/vnode.h>
38 #include <sys/vfs.h>
39 #include <sys/errno.h>
40 #include <sys/buf.h>
41 #include <sys/stat.h>
42 #include <sys/cred.h>
43 #include <sys/kmem.h>
44 #include <sys/debug.h>
45 #include <sys/dnlc.h>
46 #include <sys/vmsystm.h>
47 #include <sys/flock.h>
48 #include <sys/share.h>
49 #include <sys/cmn_err.h>
50 #include <sys/tiuser.h>
51 #include <sys/sysmacros.h>
52 #include <sys/callb.h>
53 #include <sys/acl.h>
54 #include <sys/kstat.h>
55 #include <sys/signal.h>
56 #include <sys/disp.h>
57 #include <sys/atomic.h>
58 #include <sys/list.h>
59 #include <sys/sdt.h>
60 
61 #include <rpc/types.h>
62 #include <rpc/xdr.h>
63 #include <rpc/auth.h>
64 #include <rpc/clnt.h>
65 
66 #include <nfs/nfs.h>
67 #include <nfs/nfs_clnt.h>
68 #include <nfs/nfs_acl.h>
69 
70 #include <nfs/nfs4.h>
71 #include <nfs/rnode4.h>
72 #include <nfs/nfs4_clnt.h>
73 
74 #include <vm/hat.h>
75 #include <vm/as.h>
76 #include <vm/page.h>
77 #include <vm/pvn.h>
78 #include <vm/seg.h>
79 #include <vm/seg_map.h>
80 #include <vm/seg_vn.h>
81 
82 #include <sys/ddi.h>
83 
84 /*
85  * Arguments to page-flush thread.
86  */
87 typedef struct {
88 	vnode_t *vp;
89 	cred_t *cr;
90 } pgflush_t;
91 
#ifdef DEBUG
/* Debug switches; only present in DEBUG builds. */
int nfs4_client_lease_debug;
int nfs4_sharedfh_debug;
int nfs4_fname_debug;
int nfs4_vtype_debug;	/* temporary: panic if v_type != r_attr va_type */

/* Thread-specific-data key; checked by the ASSERT in nfs4_purge_stale_fh(). */
uint_t nfs4_tsd_key;
#endif
102 
103 static time_t	nfs4_client_resumed = 0;
104 static	callb_id_t cid = 0;
105 
106 static int	nfs4renew(nfs4_server_t *);
107 static void	nfs4_attrcache_va(vnode_t *, nfs4_ga_res_t *, int);
108 static void	nfs4_pgflush_thread(pgflush_t *);
109 static void	flush_pages(vnode_t *, cred_t *);
110 
111 static boolean_t nfs4_client_cpr_callb(void *, int);
112 
113 struct mi4_globals {
114 	kmutex_t	mig_lock;  /* lock protecting mig_list */
115 	list_t		mig_list;  /* list of NFS v4 mounts in zone */
116 	boolean_t	mig_destructor_called;
117 };
118 
119 static zone_key_t mi4_list_key;
120 
121 /*
122  * Attributes caching:
123  *
124  * Attributes are cached in the rnode in struct vattr form.
125  * There is a time associated with the cached attributes (r_time_attr_inval)
126  * which tells whether the attributes are valid. The time is initialized
127  * to the difference between current time and the modify time of the vnode
128  * when new attributes are cached. This allows the attributes for
129  * files that have changed recently to be timed out sooner than for files
130  * that have not changed for a long time. There are minimum and maximum
131  * timeout values that can be set per mount point.
132  */
133 
134 /*
135  * If a cache purge is in progress, wait for it to finish.
136  *
137  * The current thread must not be in the middle of an
138  * nfs4_start_op/nfs4_end_op region.  Otherwise, there could be a deadlock
139  * between this thread, a recovery thread, and the page flush thread.
140  */
141 int
142 nfs4_waitfor_purge_complete(vnode_t *vp)
143 {
144 	rnode4_t *rp;
145 	k_sigset_t smask;
146 
147 	rp = VTOR4(vp);
148 	if ((rp->r_serial != NULL && rp->r_serial != curthread) ||
149 	    ((rp->r_flags & R4PGFLUSH) && rp->r_pgflush != curthread)) {
150 		mutex_enter(&rp->r_statelock);
151 		sigintr(&smask, VTOMI4(vp)->mi_flags & MI4_INT);
152 		while ((rp->r_serial != NULL && rp->r_serial != curthread) ||
153 		    ((rp->r_flags & R4PGFLUSH) &&
154 		    rp->r_pgflush != curthread)) {
155 			if (!cv_wait_sig(&rp->r_cv, &rp->r_statelock)) {
156 				sigunintr(&smask);
157 				mutex_exit(&rp->r_statelock);
158 				return (EINTR);
159 			}
160 		}
161 		sigunintr(&smask);
162 		mutex_exit(&rp->r_statelock);
163 	}
164 	return (0);
165 }
166 
167 /*
168  * Validate caches by checking cached attributes. If they have timed out,
169  * then get new attributes from the server.  As a side effect, cache
170  * invalidation is done if the attributes have changed.
171  *
172  * If the attributes have not timed out and if there is a cache
173  * invalidation being done by some other thread, then wait until that
174  * thread has completed the cache invalidation.
175  */
176 int
177 nfs4_validate_caches(vnode_t *vp, cred_t *cr)
178 {
179 	int error;
180 	nfs4_ga_res_t gar;
181 
182 	if (ATTRCACHE4_VALID(vp)) {
183 		error = nfs4_waitfor_purge_complete(vp);
184 		if (error)
185 			return (error);
186 		return (0);
187 	}
188 
189 	gar.n4g_va.va_mask = AT_ALL;
190 	return (nfs4_getattr_otw(vp, &gar, cr, 0));
191 }
192 
193 /*
194  * Fill in attribute from the cache.
195  * If valid, then return 0 to indicate that no error occurred,
196  * otherwise return 1 to indicate that an error occurred.
197  */
198 static int
199 nfs4_getattr_cache(vnode_t *vp, struct vattr *vap)
200 {
201 	rnode4_t *rp;
202 
203 	rp = VTOR4(vp);
204 	mutex_enter(&rp->r_statelock);
205 	mutex_enter(&rp->r_statev4_lock);
206 	if (ATTRCACHE4_VALID(vp)) {
207 		mutex_exit(&rp->r_statev4_lock);
208 		/*
209 		 * Cached attributes are valid
210 		 */
211 		*vap = rp->r_attr;
212 		mutex_exit(&rp->r_statelock);
213 		return (0);
214 	}
215 	mutex_exit(&rp->r_statev4_lock);
216 	mutex_exit(&rp->r_statelock);
217 	return (1);
218 }
219 
220 
221 /*
222  * If returned error is ESTALE flush all caches.  The nfs4_purge_caches()
223  * call is synchronous because all the pages were invalidated by the
224  * nfs4_invalidate_pages() call.
225  */
226 void
227 nfs4_purge_stale_fh(int errno, vnode_t *vp, cred_t *cr)
228 {
229 	struct rnode4 *rp = VTOR4(vp);
230 
231 	/* Ensure that the ..._end_op() call has been done */
232 	ASSERT(tsd_get(nfs4_tsd_key) == NULL);
233 
234 	if (errno != ESTALE)
235 		return;
236 
237 	mutex_enter(&rp->r_statelock);
238 	rp->r_flags |= R4STALE;
239 	if (!rp->r_error)
240 		rp->r_error = errno;
241 	mutex_exit(&rp->r_statelock);
242 	if (nfs4_has_pages(vp))
243 		nfs4_invalidate_pages(vp, (u_offset_t)0, cr);
244 	nfs4_purge_caches(vp, NFS4_PURGE_DNLC, cr, FALSE);
245 }
246 
247 /*
248  * Purge all of the various NFS `data' caches.  If "asyncpg" is TRUE, the
249  * page purge is done asynchronously.
250  */
251 void
252 nfs4_purge_caches(vnode_t *vp, int purge_dnlc, cred_t *cr, int asyncpg)
253 {
254 	rnode4_t *rp;
255 	char *contents;
256 	vnode_t *xattr;
257 	int size;
258 	int pgflush;			/* are we the page flush thread? */
259 
260 	/*
261 	 * Purge the DNLC for any entries which refer to this file.
262 	 */
263 	if (vp->v_count > 1 &&
264 	    (vp->v_type == VDIR || purge_dnlc == NFS4_PURGE_DNLC))
265 		dnlc_purge_vp(vp);
266 
267 	/*
268 	 * Clear any readdir state bits and purge the readlink response cache.
269 	 */
270 	rp = VTOR4(vp);
271 	mutex_enter(&rp->r_statelock);
272 	rp->r_flags &= ~R4LOOKUP;
273 	contents = rp->r_symlink.contents;
274 	size = rp->r_symlink.size;
275 	rp->r_symlink.contents = NULL;
276 
277 	xattr = rp->r_xattr_dir;
278 	rp->r_xattr_dir = NULL;
279 
280 	/*
281 	 * Purge pathconf cache too.
282 	 */
283 	rp->r_pathconf.pc4_xattr_valid = 0;
284 	rp->r_pathconf.pc4_cache_valid = 0;
285 
286 	pgflush = (curthread == rp->r_pgflush);
287 	mutex_exit(&rp->r_statelock);
288 
289 	if (contents != NULL) {
290 
291 		kmem_free((void *)contents, size);
292 	}
293 
294 	if (xattr != NULL)
295 		VN_RELE(xattr);
296 
297 	/*
298 	 * Flush the page cache.  If the current thread is the page flush
299 	 * thread, don't initiate a new page flush.  There's no need for
300 	 * it, and doing it correctly is hard.
301 	 */
302 	if (nfs4_has_pages(vp) && !pgflush) {
303 		if (!asyncpg) {
304 			(void) nfs4_waitfor_purge_complete(vp);
305 			flush_pages(vp, cr);
306 		} else {
307 			pgflush_t *args;
308 
309 			/*
310 			 * We don't hold r_statelock while creating the
311 			 * thread, in case the call blocks.  So we use a
312 			 * flag to indicate that a page flush thread is
313 			 * active.
314 			 */
315 			mutex_enter(&rp->r_statelock);
316 			if (rp->r_flags & R4PGFLUSH) {
317 				mutex_exit(&rp->r_statelock);
318 			} else {
319 				rp->r_flags |= R4PGFLUSH;
320 				mutex_exit(&rp->r_statelock);
321 
322 				args = kmem_alloc(sizeof (pgflush_t),
323 				    KM_SLEEP);
324 				args->vp = vp;
325 				VN_HOLD(args->vp);
326 				args->cr = cr;
327 				crhold(args->cr);
328 				(void) zthread_create(NULL, 0,
329 				    nfs4_pgflush_thread, args, 0,
330 				    minclsyspri);
331 			}
332 		}
333 	}
334 
335 	/*
336 	 * Flush the readdir response cache.
337 	 */
338 	nfs4_purge_rddir_cache(vp);
339 }
340 
341 /*
342  * Invalidate all pages for the given file, after writing back the dirty
343  * ones.
344  */
345 
346 static void
347 flush_pages(vnode_t *vp, cred_t *cr)
348 {
349 	int error;
350 	rnode4_t *rp = VTOR4(vp);
351 
352 	error = VOP_PUTPAGE(vp, (u_offset_t)0, 0, B_INVAL, cr, NULL);
353 	if (error == ENOSPC || error == EDQUOT) {
354 		mutex_enter(&rp->r_statelock);
355 		if (!rp->r_error)
356 			rp->r_error = error;
357 		mutex_exit(&rp->r_statelock);
358 	}
359 }
360 
361 /*
362  * Page flush thread.
363  */
364 
365 static void
366 nfs4_pgflush_thread(pgflush_t *args)
367 {
368 	rnode4_t *rp = VTOR4(args->vp);
369 
370 	/* remember which thread we are, so we don't deadlock ourselves */
371 	mutex_enter(&rp->r_statelock);
372 	ASSERT(rp->r_pgflush == NULL);
373 	rp->r_pgflush = curthread;
374 	mutex_exit(&rp->r_statelock);
375 
376 	flush_pages(args->vp, args->cr);
377 
378 	mutex_enter(&rp->r_statelock);
379 	rp->r_pgflush = NULL;
380 	rp->r_flags &= ~R4PGFLUSH;
381 	cv_broadcast(&rp->r_cv);
382 	mutex_exit(&rp->r_statelock);
383 
384 	VN_RELE(args->vp);
385 	crfree(args->cr);
386 	kmem_free(args, sizeof (pgflush_t));
387 	zthread_exit();
388 }
389 
390 /*
391  * Purge the readdir cache of all entries which are not currently
392  * being filled.
393  */
394 void
395 nfs4_purge_rddir_cache(vnode_t *vp)
396 {
397 	rnode4_t *rp;
398 
399 	rp = VTOR4(vp);
400 
401 	mutex_enter(&rp->r_statelock);
402 	rp->r_direof = NULL;
403 	rp->r_flags &= ~R4LOOKUP;
404 	rp->r_flags |= R4READDIRWATTR;
405 	rddir4_cache_purge(rp);
406 	mutex_exit(&rp->r_statelock);
407 }
408 
409 /*
410  * Set attributes cache for given vnode using virtual attributes.  There is
411  * no cache validation, but if the attributes are deemed to be stale, they
412  * are ignored.  This corresponds to nfs3_attrcache().
413  *
414  * Set the timeout value on the attribute cache and fill it
415  * with the passed in attributes.
416  */
417 void
418 nfs4_attrcache_noinval(vnode_t *vp, nfs4_ga_res_t *garp, hrtime_t t)
419 {
420 	rnode4_t *rp = VTOR4(vp);
421 
422 	mutex_enter(&rp->r_statelock);
423 	if (rp->r_time_attr_saved <= t)
424 		nfs4_attrcache_va(vp, garp, FALSE);
425 	mutex_exit(&rp->r_statelock);
426 }
427 
428 /*
429  * Use the passed in virtual attributes to check to see whether the
430  * data and metadata caches are valid, cache the new attributes, and
431  * then do the cache invalidation if required.
432  *
433  * The cache validation and caching of the new attributes is done
434  * atomically via the use of the mutex, r_statelock.  If required,
435  * the cache invalidation is done atomically w.r.t. the cache
436  * validation and caching of the attributes via the pseudo lock,
437  * r_serial.
438  *
439  * This routine is used to do cache validation and attributes caching
440  * for operations with a single set of post operation attributes.
441  */
442 
443 void
444 nfs4_attr_cache(vnode_t *vp, nfs4_ga_res_t *garp,
445     hrtime_t t, cred_t *cr, int async,
446     change_info4 *cinfo)
447 {
448 	rnode4_t *rp;
449 	int mtime_changed = 0;
450 	int ctime_changed = 0;
451 	vsecattr_t *vsp;
452 	int was_serial, set_time_cache_inval, recov;
453 	vattr_t *vap = &garp->n4g_va;
454 	mntinfo4_t *mi = VTOMI4(vp);
455 	len_t preattr_rsize;
456 	boolean_t writemodify_set = B_FALSE;
457 	boolean_t cachepurge_set = B_FALSE;
458 
459 	ASSERT(mi->mi_vfsp->vfs_dev == garp->n4g_va.va_fsid);
460 
461 	/* Is curthread the recovery thread? */
462 	mutex_enter(&mi->mi_lock);
463 	recov = (VTOMI4(vp)->mi_recovthread == curthread);
464 	mutex_exit(&mi->mi_lock);
465 
466 	rp = VTOR4(vp);
467 	mutex_enter(&rp->r_statelock);
468 	was_serial = (rp->r_serial == curthread);
469 	if (rp->r_serial && !was_serial) {
470 		klwp_t *lwp = ttolwp(curthread);
471 
472 		/*
473 		 * If we're the recovery thread, then purge current attrs
474 		 * and bail out to avoid potential deadlock between another
475 		 * thread caching attrs (r_serial thread), recov thread,
476 		 * and an async writer thread.
477 		 */
478 		if (recov) {
479 			PURGE_ATTRCACHE4_LOCKED(rp);
480 			mutex_exit(&rp->r_statelock);
481 			return;
482 		}
483 
484 		if (lwp != NULL)
485 			lwp->lwp_nostop++;
486 		while (rp->r_serial != NULL) {
487 			if (!cv_wait_sig(&rp->r_cv, &rp->r_statelock)) {
488 				mutex_exit(&rp->r_statelock);
489 				if (lwp != NULL)
490 					lwp->lwp_nostop--;
491 				return;
492 			}
493 		}
494 		if (lwp != NULL)
495 			lwp->lwp_nostop--;
496 	}
497 
498 	/*
499 	 * If there is a page flush thread, the current thread needs to
500 	 * bail out, to prevent a possible deadlock between the current
501 	 * thread (which might be in a start_op/end_op region), the
502 	 * recovery thread, and the page flush thread.  Expire the
503 	 * attribute cache, so that any attributes the current thread was
504 	 * going to set are not lost.
505 	 */
506 	if ((rp->r_flags & R4PGFLUSH) && rp->r_pgflush != curthread) {
507 		PURGE_ATTRCACHE4_LOCKED(rp);
508 		mutex_exit(&rp->r_statelock);
509 		return;
510 	}
511 
512 	if (rp->r_time_attr_saved > t) {
513 		/*
514 		 * Attributes have been cached since these attributes were
515 		 * probably made. If there is an inconsistency in what is
516 		 * cached, mark them invalid. If not, don't act on them.
517 		 */
518 		if (!CACHE4_VALID(rp, vap->va_mtime, vap->va_size))
519 			PURGE_ATTRCACHE4_LOCKED(rp);
520 		mutex_exit(&rp->r_statelock);
521 		return;
522 	}
523 	set_time_cache_inval = 0;
524 	if (cinfo) {
525 		/*
526 		 * Only directory modifying callers pass non-NULL cinfo.
527 		 */
528 		ASSERT(vp->v_type == VDIR);
529 		/*
530 		 * If the cache timeout either doesn't exist or hasn't expired,
531 		 * and dir didn't changed on server before dirmod op
532 		 * and dir didn't change after dirmod op but before getattr
533 		 * then there's a chance that the client's cached data for
534 		 * this object is current (not stale).  No immediate cache
535 		 * flush is required.
536 		 *
537 		 */
538 		if ((! rp->r_time_cache_inval || t < rp->r_time_cache_inval) &&
539 		    cinfo->before == rp->r_change &&
540 		    (garp->n4g_change_valid &&
541 		    cinfo->after == garp->n4g_change)) {
542 
543 			/*
544 			 * If atomic isn't set, then the before/after info
545 			 * cannot be blindly trusted.  For this case, we tell
546 			 * nfs4_attrcache_va to cache the attrs but also
547 			 * establish an absolute maximum cache timeout.  When
548 			 * the timeout is reached, caches will be flushed.
549 			 */
550 			if (! cinfo->atomic)
551 				set_time_cache_inval = 1;
552 		} else {
553 
554 			/*
555 			 * We're not sure exactly what changed, but we know
556 			 * what to do.  flush all caches for dir.  remove the
557 			 * attr timeout.
558 			 *
559 			 * a) timeout expired.  flush all caches.
560 			 * b) r_change != cinfo.before.  flush all caches.
561 			 * c) r_change == cinfo.before, but cinfo.after !=
562 			 *    post-op getattr(change).  flush all caches.
563 			 * d) post-op getattr(change) not provided by server.
564 			 *    flush all caches.
565 			 */
566 			mtime_changed = 1;
567 			ctime_changed = 1;
568 			rp->r_time_cache_inval = 0;
569 		}
570 	} else {
571 		/*
572 		 * Write thread after writing data to file on remote server,
573 		 * will always set R4WRITEMODIFIED to indicate that file on
574 		 * remote server was modified with a WRITE operation and would
575 		 * have marked attribute cache as timed out. If R4WRITEMODIFIED
576 		 * is set, then do not check for mtime and ctime change.
577 		 */
578 		if (!(rp->r_flags & R4WRITEMODIFIED)) {
579 			if (!CACHE4_VALID(rp, vap->va_mtime, vap->va_size))
580 				mtime_changed = 1;
581 
582 			if (rp->r_attr.va_ctime.tv_sec !=
583 			    vap->va_ctime.tv_sec ||
584 			    rp->r_attr.va_ctime.tv_nsec !=
585 			    vap->va_ctime.tv_nsec)
586 				ctime_changed = 1;
587 		} else {
588 			writemodify_set = B_TRUE;
589 		}
590 	}
591 
592 	preattr_rsize = rp->r_size;
593 
594 	nfs4_attrcache_va(vp, garp, set_time_cache_inval);
595 
596 	/*
597 	 * If we have updated filesize in nfs4_attrcache_va, as soon as we
598 	 * drop statelock we will be in transition of purging all
599 	 * our caches and updating them. It is possible for another
600 	 * thread to pick this new file size and read in zeroed data.
601 	 * stall other threads till cache purge is complete.
602 	 */
603 	if ((!cinfo) && (rp->r_size != preattr_rsize)) {
604 		/*
605 		 * If R4WRITEMODIFIED was set and we have updated the file
606 		 * size, Server's returned file size need not necessarily
607 		 * be because of this Client's WRITE. We need to purge
608 		 * all caches.
609 		 */
610 		if (writemodify_set)
611 			mtime_changed = 1;
612 
613 		if (mtime_changed && !(rp->r_flags & R4INCACHEPURGE)) {
614 			rp->r_flags |= R4INCACHEPURGE;
615 			cachepurge_set = B_TRUE;
616 		}
617 	}
618 
619 	if (!mtime_changed && !ctime_changed) {
620 		mutex_exit(&rp->r_statelock);
621 		return;
622 	}
623 
624 	rp->r_serial = curthread;
625 
626 	mutex_exit(&rp->r_statelock);
627 
628 	/*
629 	 * If we're the recov thread, then force async nfs4_purge_caches
630 	 * to avoid potential deadlock.
631 	 */
632 	if (mtime_changed)
633 		nfs4_purge_caches(vp, NFS4_NOPURGE_DNLC, cr, recov ? 1 : async);
634 
635 	if ((rp->r_flags & R4INCACHEPURGE) && cachepurge_set) {
636 		mutex_enter(&rp->r_statelock);
637 		rp->r_flags &= ~R4INCACHEPURGE;
638 		cv_broadcast(&rp->r_cv);
639 		mutex_exit(&rp->r_statelock);
640 		cachepurge_set = B_FALSE;
641 	}
642 
643 	if (ctime_changed) {
644 		(void) nfs4_access_purge_rp(rp);
645 		if (rp->r_secattr != NULL) {
646 			mutex_enter(&rp->r_statelock);
647 			vsp = rp->r_secattr;
648 			rp->r_secattr = NULL;
649 			mutex_exit(&rp->r_statelock);
650 			if (vsp != NULL)
651 				nfs4_acl_free_cache(vsp);
652 		}
653 	}
654 
655 	if (!was_serial) {
656 		mutex_enter(&rp->r_statelock);
657 		rp->r_serial = NULL;
658 		cv_broadcast(&rp->r_cv);
659 		mutex_exit(&rp->r_statelock);
660 	}
661 }
662 
663 /*
664  * Set attributes cache for given vnode using virtual attributes.
665  *
666  * Set the timeout value on the attribute cache and fill it
667  * with the passed in attributes.
668  *
669  * The caller must be holding r_statelock.
670  */
671 static void
672 nfs4_attrcache_va(vnode_t *vp, nfs4_ga_res_t *garp, int set_cache_timeout)
673 {
674 	rnode4_t *rp;
675 	mntinfo4_t *mi;
676 	hrtime_t delta;
677 	hrtime_t now;
678 	vattr_t *vap = &garp->n4g_va;
679 
680 	rp = VTOR4(vp);
681 
682 	ASSERT(MUTEX_HELD(&rp->r_statelock));
683 	ASSERT(vap->va_mask == AT_ALL);
684 
685 	/* Switch to master before checking v_flag */
686 	if (IS_SHADOW(vp, rp))
687 		vp = RTOV4(rp);
688 
689 	now = gethrtime();
690 
691 	mi = VTOMI4(vp);
692 
693 	/*
694 	 * Only establish a new cache timeout (if requested).  Never
695 	 * extend a timeout.  Never clear a timeout.  Clearing a timeout
696 	 * is done by nfs4_update_dircaches (ancestor in our call chain)
697 	 */
698 	if (set_cache_timeout && ! rp->r_time_cache_inval)
699 		rp->r_time_cache_inval = now + mi->mi_acdirmax;
700 
701 	/*
702 	 * Delta is the number of nanoseconds that we will
703 	 * cache the attributes of the file.  It is based on
704 	 * the number of nanoseconds since the last time that
705 	 * we detected a change.  The assumption is that files
706 	 * that changed recently are likely to change again.
707 	 * There is a minimum and a maximum for regular files
708 	 * and for directories which is enforced though.
709 	 *
710 	 * Using the time since last change was detected
711 	 * eliminates direct comparison or calculation
712 	 * using mixed client and server times.  NFS does
713 	 * not make any assumptions regarding the client
714 	 * and server clocks being synchronized.
715 	 */
716 	if (vap->va_mtime.tv_sec != rp->r_attr.va_mtime.tv_sec ||
717 	    vap->va_mtime.tv_nsec != rp->r_attr.va_mtime.tv_nsec ||
718 	    vap->va_size != rp->r_attr.va_size) {
719 		rp->r_time_attr_saved = now;
720 	}
721 
722 	if ((mi->mi_flags & MI4_NOAC) || (vp->v_flag & VNOCACHE))
723 		delta = 0;
724 	else {
725 		delta = now - rp->r_time_attr_saved;
726 		if (vp->v_type == VDIR) {
727 			if (delta < mi->mi_acdirmin)
728 				delta = mi->mi_acdirmin;
729 			else if (delta > mi->mi_acdirmax)
730 				delta = mi->mi_acdirmax;
731 		} else {
732 			if (delta < mi->mi_acregmin)
733 				delta = mi->mi_acregmin;
734 			else if (delta > mi->mi_acregmax)
735 				delta = mi->mi_acregmax;
736 		}
737 	}
738 	rp->r_time_attr_inval = now + delta;
739 
740 	rp->r_attr = *vap;
741 	if (garp->n4g_change_valid)
742 		rp->r_change = garp->n4g_change;
743 
744 	/*
745 	 * The attributes that were returned may be valid and can
746 	 * be used, but they may not be allowed to be cached.
747 	 * Reset the timers to cause immediate invalidation and
748 	 * clear r_change so no VERIFY operations will suceed
749 	 */
750 	if (garp->n4g_attrwhy == NFS4_GETATTR_NOCACHE_OK) {
751 		rp->r_time_attr_inval = now;
752 		rp->r_time_attr_saved = now;
753 		rp->r_change = 0;
754 	}
755 
756 	/*
757 	 * If mounted_on_fileid returned AND the object is a stub,
758 	 * then set object's va_nodeid to the mounted over fid
759 	 * returned by server.
760 	 *
761 	 * If mounted_on_fileid not provided/supported, then
762 	 * just set it to 0 for now.  Eventually it would be
763 	 * better to set it to a hashed version of FH.  This
764 	 * would probably be good enough to provide a unique
765 	 * fid/d_ino within a dir.
766 	 *
767 	 * We don't need to carry mounted_on_fileid in the
768 	 * rnode as long as the client never requests fileid
769 	 * without also requesting mounted_on_fileid.  For
770 	 * now, it stays.
771 	 */
772 	if (garp->n4g_mon_fid_valid) {
773 		rp->r_mntd_fid = garp->n4g_mon_fid;
774 
775 		if (RP_ISSTUB(rp))
776 			rp->r_attr.va_nodeid = rp->r_mntd_fid;
777 	}
778 
779 	/*
780 	 * Check to see if there are valid pathconf bits to
781 	 * cache in the rnode.
782 	 */
783 	if (garp->n4g_ext_res) {
784 		if (garp->n4g_ext_res->n4g_pc4.pc4_cache_valid) {
785 			rp->r_pathconf = garp->n4g_ext_res->n4g_pc4;
786 		} else {
787 			if (garp->n4g_ext_res->n4g_pc4.pc4_xattr_valid) {
788 				rp->r_pathconf.pc4_xattr_valid = TRUE;
789 				rp->r_pathconf.pc4_xattr_exists =
790 				    garp->n4g_ext_res->n4g_pc4.pc4_xattr_exists;
791 			}
792 		}
793 	}
794 	/*
795 	 * Update the size of the file if there is no cached data or if
796 	 * the cached data is clean and there is no data being written
797 	 * out.
798 	 */
799 	if (rp->r_size != vap->va_size &&
800 	    (!vn_has_cached_data(vp) ||
801 	    (!(rp->r_flags & R4DIRTY) && rp->r_count == 0))) {
802 		rp->r_size = vap->va_size;
803 	}
804 	nfs_setswaplike(vp, vap);
805 	rp->r_flags &= ~R4WRITEMODIFIED;
806 }
807 
808 /*
809  * Get attributes over-the-wire and update attributes cache
810  * if no error occurred in the over-the-wire operation.
811  * Return 0 if successful, otherwise error.
812  */
813 int
814 nfs4_getattr_otw(vnode_t *vp, nfs4_ga_res_t *garp, cred_t *cr, int get_acl)
815 {
816 	mntinfo4_t *mi = VTOMI4(vp);
817 	hrtime_t t;
818 	nfs4_recov_state_t recov_state;
819 	nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };
820 
821 	recov_state.rs_flags = 0;
822 	recov_state.rs_num_retry_despite_err = 0;
823 
824 	/* Save the original mount point security flavor */
825 	(void) save_mnt_secinfo(mi->mi_curr_serv);
826 
827 recov_retry:
828 
829 	if ((e.error = nfs4_start_fop(mi, vp, NULL, OH_GETATTR,
830 	    &recov_state, NULL))) {
831 		(void) check_mnt_secinfo(mi->mi_curr_serv, vp);
832 		return (e.error);
833 	}
834 
835 	t = gethrtime();
836 
837 	nfs4_getattr_otw_norecovery(vp, garp, &e, cr, get_acl);
838 
839 	if (nfs4_needs_recovery(&e, FALSE, vp->v_vfsp)) {
840 		if (nfs4_start_recovery(&e, VTOMI4(vp), vp, NULL, NULL,
841 		    NULL, OP_GETATTR, NULL) == FALSE)  {
842 			nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_GETATTR,
843 			    &recov_state, 1);
844 			goto recov_retry;
845 		}
846 	}
847 
848 	nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_GETATTR, &recov_state, 0);
849 
850 	if (!e.error) {
851 		if (e.stat == NFS4_OK) {
852 			nfs4_attr_cache(vp, garp, t, cr, FALSE, NULL);
853 		} else {
854 			e.error = geterrno4(e.stat);
855 
856 			nfs4_purge_stale_fh(e.error, vp, cr);
857 		}
858 	}
859 
860 	/*
861 	 * If getattr a node that is a stub for a crossed
862 	 * mount point, keep the original secinfo flavor for
863 	 * the current file system, not the crossed one.
864 	 */
865 	(void) check_mnt_secinfo(mi->mi_curr_serv, vp);
866 
867 	return (e.error);
868 }
869 
870 /*
871  * Generate a compound to get attributes over-the-wire.
872  */
873 void
874 nfs4_getattr_otw_norecovery(vnode_t *vp, nfs4_ga_res_t *garp,
875     nfs4_error_t *ep, cred_t *cr, int get_acl)
876 {
877 	COMPOUND4args_clnt args;
878 	COMPOUND4res_clnt res;
879 	int doqueue;
880 	rnode4_t *rp = VTOR4(vp);
881 	nfs_argop4 argop[2];
882 
883 	args.ctag = TAG_GETATTR;
884 
885 	args.array_len = 2;
886 	args.array = argop;
887 
888 	/* putfh */
889 	argop[0].argop = OP_CPUTFH;
890 	argop[0].nfs_argop4_u.opcputfh.sfh = rp->r_fh;
891 
892 	/* getattr */
893 	/*
894 	 * Unlike nfs version 2 and 3, where getattr returns all the
895 	 * attributes, nfs version 4 returns only the ones explicitly
896 	 * asked for. This creates problems, as some system functions
897 	 * (e.g. cache check) require certain attributes and if the
898 	 * cached node lacks some attributes such as uid/gid, it can
899 	 * affect system utilities (e.g. "ls") that rely on the information
900 	 * to be there. This can lead to anything from system crashes to
901 	 * corrupted information processed by user apps.
902 	 * So to ensure that all bases are covered, request at least
903 	 * the AT_ALL attribute mask.
904 	 */
905 	argop[1].argop = OP_GETATTR;
906 	argop[1].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK;
907 	if (get_acl)
908 		argop[1].nfs_argop4_u.opgetattr.attr_request |= FATTR4_ACL_MASK;
909 	argop[1].nfs_argop4_u.opgetattr.mi = VTOMI4(vp);
910 
911 	doqueue = 1;
912 
913 	rfs4call(VTOMI4(vp), &args, &res, cr, &doqueue, 0, ep);
914 
915 	if (ep->error)
916 		return;
917 
918 	if (res.status != NFS4_OK) {
919 		(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
920 		return;
921 	}
922 
923 	*garp = res.array[1].nfs_resop4_u.opgetattr.ga_res;
924 
925 	(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
926 }
927 
928 /*
929  * Return either cached or remote attributes. If get remote attr
930  * use them to check and invalidate caches, then cache the new attributes.
931  */
932 int
933 nfs4getattr(vnode_t *vp, vattr_t *vap, cred_t *cr)
934 {
935 	int error;
936 	rnode4_t *rp;
937 	nfs4_ga_res_t gar;
938 
939 	ASSERT(nfs4_consistent_type(vp));
940 
941 	/*
942 	 * If we've got cached attributes, we're done, otherwise go
943 	 * to the server to get attributes, which will update the cache
944 	 * in the process. Either way, use the cached attributes for
945 	 * the caller's vattr_t.
946 	 *
947 	 * Note that we ignore the gar set by the OTW call: the attr caching
948 	 * code may make adjustments when storing to the rnode, and we want
949 	 * to see those changes here.
950 	 */
951 	rp = VTOR4(vp);
952 	error = 0;
953 	mutex_enter(&rp->r_statelock);
954 	if (!ATTRCACHE4_VALID(vp)) {
955 		mutex_exit(&rp->r_statelock);
956 		error = nfs4_getattr_otw(vp, &gar, cr, 0);
957 		mutex_enter(&rp->r_statelock);
958 	}
959 
960 	if (!error)
961 		*vap = rp->r_attr;
962 
963 	/* Return the client's view of file size */
964 	vap->va_size = rp->r_size;
965 
966 	mutex_exit(&rp->r_statelock);
967 
968 	ASSERT(nfs4_consistent_type(vp));
969 
970 	return (error);
971 }
972 
973 int
974 nfs4_attr_otw(vnode_t *vp, nfs4_tag_type_t tag_type,
975     nfs4_ga_res_t *garp, bitmap4 reqbitmap, cred_t *cr)
976 {
977 	COMPOUND4args_clnt args;
978 	COMPOUND4res_clnt res;
979 	int doqueue;
980 	nfs_argop4 argop[2];
981 	mntinfo4_t *mi = VTOMI4(vp);
982 	bool_t needrecov = FALSE;
983 	nfs4_recov_state_t recov_state;
984 	nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };
985 	nfs4_ga_ext_res_t *gerp;
986 
987 	recov_state.rs_flags = 0;
988 	recov_state.rs_num_retry_despite_err = 0;
989 
990 recov_retry:
991 	args.ctag = tag_type;
992 
993 	args.array_len = 2;
994 	args.array = argop;
995 
996 	e.error = nfs4_start_fop(mi, vp, NULL, OH_GETATTR, &recov_state, NULL);
997 	if (e.error)
998 		return (e.error);
999 
1000 	/* putfh */
1001 	argop[0].argop = OP_CPUTFH;
1002 	argop[0].nfs_argop4_u.opcputfh.sfh = VTOR4(vp)->r_fh;
1003 
1004 	/* getattr */
1005 	argop[1].argop = OP_GETATTR;
1006 	argop[1].nfs_argop4_u.opgetattr.attr_request = reqbitmap;
1007 	argop[1].nfs_argop4_u.opgetattr.mi = mi;
1008 
1009 	doqueue = 1;
1010 
1011 	NFS4_DEBUG(nfs4_client_call_debug, (CE_NOTE,
1012 	    "nfs4_attr_otw: %s call, rp %s", needrecov ? "recov" : "first",
1013 	    rnode4info(VTOR4(vp))));
1014 
1015 	rfs4call(mi, &args, &res, cr, &doqueue, 0, &e);
1016 
1017 	needrecov = nfs4_needs_recovery(&e, FALSE, vp->v_vfsp);
1018 	if (!needrecov && e.error) {
1019 		nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_GETATTR, &recov_state,
1020 		    needrecov);
1021 		return (e.error);
1022 	}
1023 
1024 	if (needrecov) {
1025 		bool_t abort;
1026 
1027 		NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
1028 		    "nfs4_attr_otw: initiating recovery\n"));
1029 
1030 		abort = nfs4_start_recovery(&e, VTOMI4(vp), vp, NULL, NULL,
1031 		    NULL, OP_GETATTR, NULL);
1032 		nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_GETATTR, &recov_state,
1033 		    needrecov);
1034 		if (!e.error) {
1035 			(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
1036 			e.error = geterrno4(res.status);
1037 		}
1038 		if (abort == FALSE)
1039 			goto recov_retry;
1040 		return (e.error);
1041 	}
1042 
1043 	if (res.status) {
1044 		e.error = geterrno4(res.status);
1045 	} else {
1046 		gerp = garp->n4g_ext_res;
1047 		bcopy(&res.array[1].nfs_resop4_u.opgetattr.ga_res,
1048 		    garp, sizeof (nfs4_ga_res_t));
1049 		garp->n4g_ext_res = gerp;
1050 		if (garp->n4g_ext_res &&
1051 		    res.array[1].nfs_resop4_u.opgetattr.ga_res.n4g_ext_res)
1052 			bcopy(res.array[1].nfs_resop4_u.opgetattr.
1053 			    ga_res.n4g_ext_res,
1054 			    garp->n4g_ext_res, sizeof (nfs4_ga_ext_res_t));
1055 	}
1056 	(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
1057 	nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_GETATTR, &recov_state,
1058 	    needrecov);
1059 	return (e.error);
1060 }
1061 
1062 /*
1063  * Asynchronous I/O parameters.  nfs_async_threads is the high-water mark
1064  * for the demand-based allocation of async threads per-mount.  The
1065  * nfs_async_timeout is the amount of time a thread will live after it
1066  * becomes idle, unless new I/O requests are received before the thread
1067  * dies.  See nfs4_async_putpage and nfs4_async_start.
1068  */
1069 
1070 static void	nfs4_async_start(struct vfs *);
1071 
1072 static void
1073 free_async_args4(struct nfs4_async_reqs *args)
1074 {
1075 	rnode4_t *rp;
1076 
1077 	if (args->a_io != NFS4_INACTIVE) {
1078 		rp = VTOR4(args->a_vp);
1079 		mutex_enter(&rp->r_statelock);
1080 		rp->r_count--;
1081 		if (args->a_io == NFS4_PUTAPAGE ||
1082 		    args->a_io == NFS4_PAGEIO)
1083 			rp->r_awcount--;
1084 		cv_broadcast(&rp->r_cv);
1085 		mutex_exit(&rp->r_statelock);
1086 		VN_RELE(args->a_vp);
1087 	}
1088 	crfree(args->a_cred);
1089 	kmem_free(args, sizeof (*args));
1090 }
1091 
1092 /*
1093  * Cross-zone thread creation and NFS access is disallowed, yet fsflush() and
1094  * pageout(), running in the global zone, have legitimate reasons to do
1095  * VOP_PUTPAGE(B_ASYNC) on other zones' NFS mounts.  We avoid the problem by
1096  * use of a a per-mount "asynchronous requests manager thread" which is
1097  * signaled by the various asynchronous work routines when there is
1098  * asynchronous work to be done.  It is responsible for creating new
1099  * worker threads if necessary, and notifying existing worker threads
1100  * that there is work to be done.
1101  *
1102  * In other words, it will "take the specifications from the customers and
1103  * give them to the engineers."
1104  *
1105  * Worker threads die off of their own accord if they are no longer
1106  * needed.
1107  *
1108  * This thread is killed when the zone is going away or the filesystem
1109  * is being unmounted.
1110  */
1111 void
1112 nfs4_async_manager(vfs_t *vfsp)
1113 {
1114 	callb_cpr_t cprinfo;
1115 	mntinfo4_t *mi;
1116 	uint_t max_threads;
1117 
1118 	mi = VFTOMI4(vfsp);
1119 
1120 	CALLB_CPR_INIT(&cprinfo, &mi->mi_async_lock, callb_generic_cpr,
1121 	    "nfs4_async_manager");
1122 
1123 	mutex_enter(&mi->mi_async_lock);
1124 	/*
1125 	 * We want to stash the max number of threads that this mount was
1126 	 * allowed so we can use it later when the variable is set to zero as
1127 	 * part of the zone/mount going away.
1128 	 *
1129 	 * We want to be able to create at least one thread to handle
1130 	 * asyncrhonous inactive calls.
1131 	 */
1132 	max_threads = MAX(mi->mi_max_threads, 1);
1133 	mutex_enter(&mi->mi_lock);
1134 	/*
1135 	 * We don't want to wait for mi_max_threads to go to zero, since that
1136 	 * happens as part of a failed unmount, but this thread should only
1137 	 * exit when the mount is really going away.
1138 	 *
1139 	 * Once MI4_ASYNC_MGR_STOP is set, no more async operations will be
1140 	 * attempted: the various _async_*() functions know to do things
1141 	 * inline if mi_max_threads == 0.  Henceforth we just drain out the
1142 	 * outstanding requests.
1143 	 *
1144 	 * Note that we still create zthreads even if we notice the zone is
1145 	 * shutting down (MI4_ASYNC_MGR_STOP is set); this may cause the zone
1146 	 * shutdown sequence to take slightly longer in some cases, but
1147 	 * doesn't violate the protocol, as all threads will exit as soon as
1148 	 * they're done processing the remaining requests.
1149 	 */
1150 	while (!(mi->mi_flags & MI4_ASYNC_MGR_STOP) ||
1151 	    mi->mi_async_req_count > 0) {
1152 		mutex_exit(&mi->mi_lock);
1153 		CALLB_CPR_SAFE_BEGIN(&cprinfo);
1154 		cv_wait(&mi->mi_async_reqs_cv, &mi->mi_async_lock);
1155 		CALLB_CPR_SAFE_END(&cprinfo, &mi->mi_async_lock);
1156 		while (mi->mi_async_req_count > 0) {
1157 			/*
1158 			 * Paranoia: If the mount started out having
1159 			 * (mi->mi_max_threads == 0), and the value was
1160 			 * later changed (via a debugger or somesuch),
1161 			 * we could be confused since we will think we
1162 			 * can't create any threads, and the calling
1163 			 * code (which looks at the current value of
1164 			 * mi->mi_max_threads, now non-zero) thinks we
1165 			 * can.
1166 			 *
1167 			 * So, because we're paranoid, we create threads
1168 			 * up to the maximum of the original and the
1169 			 * current value. This means that future
1170 			 * (debugger-induced) alterations of
1171 			 * mi->mi_max_threads are ignored for our
1172 			 * purposes, but who told them they could change
1173 			 * random values on a live kernel anyhow?
1174 			 */
1175 			if (mi->mi_threads <
1176 			    MAX(mi->mi_max_threads, max_threads)) {
1177 				mi->mi_threads++;
1178 				mutex_exit(&mi->mi_async_lock);
1179 				MI4_HOLD(mi);
1180 				VFS_HOLD(vfsp);	/* hold for new thread */
1181 				(void) zthread_create(NULL, 0, nfs4_async_start,
1182 				    vfsp, 0, minclsyspri);
1183 				mutex_enter(&mi->mi_async_lock);
1184 			}
1185 			cv_signal(&mi->mi_async_work_cv);
1186 			ASSERT(mi->mi_async_req_count != 0);
1187 			mi->mi_async_req_count--;
1188 		}
1189 		mutex_enter(&mi->mi_lock);
1190 	}
1191 	mutex_exit(&mi->mi_lock);
1192 
1193 	NFS4_DEBUG(nfs4_client_zone_debug, (CE_NOTE,
1194 	    "nfs4_async_manager exiting for vfs %p\n", (void *)mi->mi_vfsp));
1195 	/*
1196 	 * Let everyone know we're done.
1197 	 */
1198 	mi->mi_manager_thread = NULL;
1199 	/*
1200 	 * Wake up the inactive thread.
1201 	 */
1202 	cv_broadcast(&mi->mi_inact_req_cv);
1203 	/*
1204 	 * Wake up anyone sitting in nfs4_async_manager_stop()
1205 	 */
1206 	cv_broadcast(&mi->mi_async_cv);
1207 	/*
1208 	 * There is no explicit call to mutex_exit(&mi->mi_async_lock)
1209 	 * since CALLB_CPR_EXIT is actually responsible for releasing
1210 	 * 'mi_async_lock'.
1211 	 */
1212 	CALLB_CPR_EXIT(&cprinfo);
1213 	VFS_RELE(vfsp);	/* release thread's hold */
1214 	MI4_RELE(mi);
1215 	zthread_exit();
1216 }
1217 
1218 /*
1219  * Signal (and wait for) the async manager thread to clean up and go away.
1220  */
1221 void
1222 nfs4_async_manager_stop(vfs_t *vfsp)
1223 {
1224 	mntinfo4_t *mi = VFTOMI4(vfsp);
1225 
1226 	mutex_enter(&mi->mi_async_lock);
1227 	mutex_enter(&mi->mi_lock);
1228 	mi->mi_flags |= MI4_ASYNC_MGR_STOP;
1229 	mutex_exit(&mi->mi_lock);
1230 	cv_broadcast(&mi->mi_async_reqs_cv);
1231 	/*
1232 	 * Wait for the async manager thread to die.
1233 	 */
1234 	while (mi->mi_manager_thread != NULL)
1235 		cv_wait(&mi->mi_async_cv, &mi->mi_async_lock);
1236 	mutex_exit(&mi->mi_async_lock);
1237 }
1238 
1239 int
1240 nfs4_async_readahead(vnode_t *vp, u_offset_t blkoff, caddr_t addr,
1241     struct seg *seg, cred_t *cr, void (*readahead)(vnode_t *,
1242     u_offset_t, caddr_t, struct seg *, cred_t *))
1243 {
1244 	rnode4_t *rp;
1245 	mntinfo4_t *mi;
1246 	struct nfs4_async_reqs *args;
1247 
1248 	rp = VTOR4(vp);
1249 	ASSERT(rp->r_freef == NULL);
1250 
1251 	mi = VTOMI4(vp);
1252 
1253 	/*
1254 	 * If addr falls in a different segment, don't bother doing readahead.
1255 	 */
1256 	if (addr >= seg->s_base + seg->s_size)
1257 		return (-1);
1258 
1259 	/*
1260 	 * If we can't allocate a request structure, punt on the readahead.
1261 	 */
1262 	if ((args = kmem_alloc(sizeof (*args), KM_NOSLEEP)) == NULL)
1263 		return (-1);
1264 
1265 	/*
1266 	 * If a lock operation is pending, don't initiate any new
1267 	 * readaheads.  Otherwise, bump r_count to indicate the new
1268 	 * asynchronous I/O.
1269 	 */
1270 	if (!nfs_rw_tryenter(&rp->r_lkserlock, RW_READER)) {
1271 		kmem_free(args, sizeof (*args));
1272 		return (-1);
1273 	}
1274 	mutex_enter(&rp->r_statelock);
1275 	rp->r_count++;
1276 	mutex_exit(&rp->r_statelock);
1277 	nfs_rw_exit(&rp->r_lkserlock);
1278 
1279 	args->a_next = NULL;
1280 #ifdef DEBUG
1281 	args->a_queuer = curthread;
1282 #endif
1283 	VN_HOLD(vp);
1284 	args->a_vp = vp;
1285 	ASSERT(cr != NULL);
1286 	crhold(cr);
1287 	args->a_cred = cr;
1288 	args->a_io = NFS4_READ_AHEAD;
1289 	args->a_nfs4_readahead = readahead;
1290 	args->a_nfs4_blkoff = blkoff;
1291 	args->a_nfs4_seg = seg;
1292 	args->a_nfs4_addr = addr;
1293 
1294 	mutex_enter(&mi->mi_async_lock);
1295 
1296 	/*
1297 	 * If asyncio has been disabled, don't bother readahead.
1298 	 */
1299 	if (mi->mi_max_threads == 0) {
1300 		mutex_exit(&mi->mi_async_lock);
1301 		goto noasync;
1302 	}
1303 
1304 	/*
1305 	 * Link request structure into the async list and
1306 	 * wakeup async thread to do the i/o.
1307 	 */
1308 	if (mi->mi_async_reqs[NFS4_READ_AHEAD] == NULL) {
1309 		mi->mi_async_reqs[NFS4_READ_AHEAD] = args;
1310 		mi->mi_async_tail[NFS4_READ_AHEAD] = args;
1311 	} else {
1312 		mi->mi_async_tail[NFS4_READ_AHEAD]->a_next = args;
1313 		mi->mi_async_tail[NFS4_READ_AHEAD] = args;
1314 	}
1315 
1316 	if (mi->mi_io_kstats) {
1317 		mutex_enter(&mi->mi_lock);
1318 		kstat_waitq_enter(KSTAT_IO_PTR(mi->mi_io_kstats));
1319 		mutex_exit(&mi->mi_lock);
1320 	}
1321 
1322 	mi->mi_async_req_count++;
1323 	ASSERT(mi->mi_async_req_count != 0);
1324 	cv_signal(&mi->mi_async_reqs_cv);
1325 	mutex_exit(&mi->mi_async_lock);
1326 	return (0);
1327 
1328 noasync:
1329 	mutex_enter(&rp->r_statelock);
1330 	rp->r_count--;
1331 	cv_broadcast(&rp->r_cv);
1332 	mutex_exit(&rp->r_statelock);
1333 	VN_RELE(vp);
1334 	crfree(cr);
1335 	kmem_free(args, sizeof (*args));
1336 	return (-1);
1337 }
1338 
1339 /*
1340  * The async queues for each mounted file system are arranged as a
1341  * set of queues, one for each async i/o type.  Requests are taken
1342  * from the queues in a round-robin fashion.  A number of consecutive
1343  * requests are taken from each queue before moving on to the next
1344  * queue.  This functionality may allow the NFS Version 2 server to do
1345  * write clustering, even if the client is mixing writes and reads
1346  * because it will take multiple write requests from the queue
1347  * before processing any of the other async i/o types.
1348  *
1349  * XXX The nfs4_async_start thread is unsafe in the light of the present
1350  * model defined by cpr to suspend the system. Specifically over the
1351  * wire calls are cpr-unsafe. The thread should be reevaluated in
1352  * case of future updates to the cpr model.
1353  */
1354 static void
1355 nfs4_async_start(struct vfs *vfsp)
1356 {
1357 	struct nfs4_async_reqs *args;
1358 	mntinfo4_t *mi = VFTOMI4(vfsp);
1359 	clock_t time_left = 1;
1360 	callb_cpr_t cprinfo;
1361 	int i;
1362 	extern int nfs_async_timeout;
1363 
1364 	/*
1365 	 * Dynamic initialization of nfs_async_timeout to allow nfs to be
1366 	 * built in an implementation independent manner.
1367 	 */
1368 	if (nfs_async_timeout == -1)
1369 		nfs_async_timeout = NFS_ASYNC_TIMEOUT;
1370 
1371 	CALLB_CPR_INIT(&cprinfo, &mi->mi_async_lock, callb_generic_cpr, "nas");
1372 
1373 	mutex_enter(&mi->mi_async_lock);
1374 	for (;;) {
1375 		/*
1376 		 * Find the next queue containing an entry.  We start
1377 		 * at the current queue pointer and then round robin
1378 		 * through all of them until we either find a non-empty
1379 		 * queue or have looked through all of them.
1380 		 */
1381 		for (i = 0; i < NFS4_ASYNC_TYPES; i++) {
1382 			args = *mi->mi_async_curr;
1383 			if (args != NULL)
1384 				break;
1385 			mi->mi_async_curr++;
1386 			if (mi->mi_async_curr ==
1387 			    &mi->mi_async_reqs[NFS4_ASYNC_TYPES])
1388 				mi->mi_async_curr = &mi->mi_async_reqs[0];
1389 		}
1390 		/*
1391 		 * If we didn't find a entry, then block until woken up
1392 		 * again and then look through the queues again.
1393 		 */
1394 		if (args == NULL) {
1395 			/*
1396 			 * Exiting is considered to be safe for CPR as well
1397 			 */
1398 			CALLB_CPR_SAFE_BEGIN(&cprinfo);
1399 
1400 			/*
1401 			 * Wakeup thread waiting to unmount the file
1402 			 * system only if all async threads are inactive.
1403 			 *
1404 			 * If we've timed-out and there's nothing to do,
1405 			 * then get rid of this thread.
1406 			 */
1407 			if (mi->mi_max_threads == 0 || time_left <= 0) {
1408 				if (--mi->mi_threads == 0)
1409 					cv_signal(&mi->mi_async_cv);
1410 				CALLB_CPR_EXIT(&cprinfo);
1411 				VFS_RELE(vfsp);	/* release thread's hold */
1412 				MI4_RELE(mi);
1413 				zthread_exit();
1414 				/* NOTREACHED */
1415 			}
1416 			time_left = cv_timedwait(&mi->mi_async_work_cv,
1417 			    &mi->mi_async_lock, nfs_async_timeout + lbolt);
1418 
1419 			CALLB_CPR_SAFE_END(&cprinfo, &mi->mi_async_lock);
1420 
1421 			continue;
1422 		} else {
1423 			time_left = 1;
1424 		}
1425 
1426 		/*
1427 		 * Remove the request from the async queue and then
1428 		 * update the current async request queue pointer.  If
1429 		 * the current queue is empty or we have removed enough
1430 		 * consecutive entries from it, then reset the counter
1431 		 * for this queue and then move the current pointer to
1432 		 * the next queue.
1433 		 */
1434 		*mi->mi_async_curr = args->a_next;
1435 		if (*mi->mi_async_curr == NULL ||
1436 		    --mi->mi_async_clusters[args->a_io] == 0) {
1437 			mi->mi_async_clusters[args->a_io] =
1438 			    mi->mi_async_init_clusters;
1439 			mi->mi_async_curr++;
1440 			if (mi->mi_async_curr ==
1441 			    &mi->mi_async_reqs[NFS4_ASYNC_TYPES])
1442 				mi->mi_async_curr = &mi->mi_async_reqs[0];
1443 		}
1444 
1445 		if (args->a_io != NFS4_INACTIVE && mi->mi_io_kstats) {
1446 			mutex_enter(&mi->mi_lock);
1447 			kstat_waitq_exit(KSTAT_IO_PTR(mi->mi_io_kstats));
1448 			mutex_exit(&mi->mi_lock);
1449 		}
1450 
1451 		mutex_exit(&mi->mi_async_lock);
1452 
1453 		/*
1454 		 * Obtain arguments from the async request structure.
1455 		 */
1456 		if (args->a_io == NFS4_READ_AHEAD && mi->mi_max_threads > 0) {
1457 			(*args->a_nfs4_readahead)(args->a_vp,
1458 			    args->a_nfs4_blkoff, args->a_nfs4_addr,
1459 			    args->a_nfs4_seg, args->a_cred);
1460 		} else if (args->a_io == NFS4_PUTAPAGE) {
1461 			(void) (*args->a_nfs4_putapage)(args->a_vp,
1462 			    args->a_nfs4_pp, args->a_nfs4_off,
1463 			    args->a_nfs4_len, args->a_nfs4_flags,
1464 			    args->a_cred);
1465 		} else if (args->a_io == NFS4_PAGEIO) {
1466 			(void) (*args->a_nfs4_pageio)(args->a_vp,
1467 			    args->a_nfs4_pp, args->a_nfs4_off,
1468 			    args->a_nfs4_len, args->a_nfs4_flags,
1469 			    args->a_cred);
1470 		} else if (args->a_io == NFS4_READDIR) {
1471 			(void) ((*args->a_nfs4_readdir)(args->a_vp,
1472 			    args->a_nfs4_rdc, args->a_cred));
1473 		} else if (args->a_io == NFS4_COMMIT) {
1474 			(*args->a_nfs4_commit)(args->a_vp, args->a_nfs4_plist,
1475 			    args->a_nfs4_offset, args->a_nfs4_count,
1476 			    args->a_cred);
1477 		} else if (args->a_io == NFS4_INACTIVE) {
1478 			nfs4_inactive_otw(args->a_vp, args->a_cred);
1479 		}
1480 
1481 		/*
1482 		 * Now, release the vnode and free the credentials
1483 		 * structure.
1484 		 */
1485 		free_async_args4(args);
1486 		/*
1487 		 * Reacquire the mutex because it will be needed above.
1488 		 */
1489 		mutex_enter(&mi->mi_async_lock);
1490 	}
1491 }
1492 
1493 /*
1494  * nfs4_inactive_thread - look for vnodes that need over-the-wire calls as
1495  * part of VOP_INACTIVE.
1496  */
1497 
1498 void
1499 nfs4_inactive_thread(mntinfo4_t *mi)
1500 {
1501 	struct nfs4_async_reqs *args;
1502 	callb_cpr_t cprinfo;
1503 	vfs_t *vfsp = mi->mi_vfsp;
1504 
1505 	CALLB_CPR_INIT(&cprinfo, &mi->mi_async_lock, callb_generic_cpr,
1506 	    "nfs4_inactive_thread");
1507 
1508 	for (;;) {
1509 		mutex_enter(&mi->mi_async_lock);
1510 		args = mi->mi_async_reqs[NFS4_INACTIVE];
1511 		if (args == NULL) {
1512 			mutex_enter(&mi->mi_lock);
1513 			/*
1514 			 * We don't want to exit until the async manager is done
1515 			 * with its work; hence the check for mi_manager_thread
1516 			 * being NULL.
1517 			 *
1518 			 * The async manager thread will cv_broadcast() on
1519 			 * mi_inact_req_cv when it's done, at which point we'll
1520 			 * wake up and exit.
1521 			 */
1522 			if (mi->mi_manager_thread == NULL)
1523 				goto die;
1524 			mi->mi_flags |= MI4_INACTIVE_IDLE;
1525 			mutex_exit(&mi->mi_lock);
1526 			cv_signal(&mi->mi_async_cv);
1527 			CALLB_CPR_SAFE_BEGIN(&cprinfo);
1528 			cv_wait(&mi->mi_inact_req_cv, &mi->mi_async_lock);
1529 			CALLB_CPR_SAFE_END(&cprinfo, &mi->mi_async_lock);
1530 			mutex_exit(&mi->mi_async_lock);
1531 		} else {
1532 			mutex_enter(&mi->mi_lock);
1533 			mi->mi_flags &= ~MI4_INACTIVE_IDLE;
1534 			mutex_exit(&mi->mi_lock);
1535 			mi->mi_async_reqs[NFS4_INACTIVE] = args->a_next;
1536 			mutex_exit(&mi->mi_async_lock);
1537 			nfs4_inactive_otw(args->a_vp, args->a_cred);
1538 			crfree(args->a_cred);
1539 			kmem_free(args, sizeof (*args));
1540 		}
1541 	}
1542 die:
1543 	mutex_exit(&mi->mi_lock);
1544 	mi->mi_inactive_thread = NULL;
1545 	cv_signal(&mi->mi_async_cv);
1546 
1547 	/*
1548 	 * There is no explicit call to mutex_exit(&mi->mi_async_lock) since
1549 	 * CALLB_CPR_EXIT is actually responsible for releasing 'mi_async_lock'.
1550 	 */
1551 	CALLB_CPR_EXIT(&cprinfo);
1552 
1553 	NFS4_DEBUG(nfs4_client_zone_debug, (CE_NOTE,
1554 	    "nfs4_inactive_thread exiting for vfs %p\n", (void *)vfsp));
1555 
1556 	MI4_RELE(mi);
1557 	zthread_exit();
1558 	/* NOTREACHED */
1559 }
1560 
1561 /*
1562  * nfs_async_stop:
1563  * Wait for all outstanding putpage operations and the inactive thread to
1564  * complete; nfs4_async_stop_sig() without interruptibility.
1565  */
1566 void
1567 nfs4_async_stop(struct vfs *vfsp)
1568 {
1569 	mntinfo4_t *mi = VFTOMI4(vfsp);
1570 
1571 	/*
1572 	 * Wait for all outstanding async operations to complete and for
1573 	 * worker threads to exit.
1574 	 */
1575 	mutex_enter(&mi->mi_async_lock);
1576 	mi->mi_max_threads = 0;
1577 	cv_broadcast(&mi->mi_async_work_cv);
1578 	while (mi->mi_threads != 0)
1579 		cv_wait(&mi->mi_async_cv, &mi->mi_async_lock);
1580 
1581 	/*
1582 	 * Wait for the inactive thread to finish doing what it's doing.  It
1583 	 * won't exit until the last reference to the vfs_t goes away.
1584 	 */
1585 	if (mi->mi_inactive_thread != NULL) {
1586 		mutex_enter(&mi->mi_lock);
1587 		while (!(mi->mi_flags & MI4_INACTIVE_IDLE) ||
1588 		    (mi->mi_async_reqs[NFS4_INACTIVE] != NULL)) {
1589 			mutex_exit(&mi->mi_lock);
1590 			cv_wait(&mi->mi_async_cv, &mi->mi_async_lock);
1591 			mutex_enter(&mi->mi_lock);
1592 		}
1593 		mutex_exit(&mi->mi_lock);
1594 	}
1595 	mutex_exit(&mi->mi_async_lock);
1596 }
1597 
1598 /*
1599  * nfs_async_stop_sig:
1600  * Wait for all outstanding putpage operations and the inactive thread to
1601  * complete. If a signal is delivered we will abort and return non-zero;
1602  * otherwise return 0. Since this routine is called from nfs4_unmount, we
1603  * need to make it interruptible.
1604  */
1605 int
1606 nfs4_async_stop_sig(struct vfs *vfsp)
1607 {
1608 	mntinfo4_t *mi = VFTOMI4(vfsp);
1609 	ushort_t omax;
1610 	bool_t intr = FALSE;
1611 
1612 	/*
1613 	 * Wait for all outstanding putpage operations to complete and for
1614 	 * worker threads to exit.
1615 	 */
1616 	mutex_enter(&mi->mi_async_lock);
1617 	omax = mi->mi_max_threads;
1618 	mi->mi_max_threads = 0;
1619 	cv_broadcast(&mi->mi_async_work_cv);
1620 	while (mi->mi_threads != 0) {
1621 		if (!cv_wait_sig(&mi->mi_async_cv, &mi->mi_async_lock)) {
1622 			intr = TRUE;
1623 			goto interrupted;
1624 		}
1625 	}
1626 
1627 	/*
1628 	 * Wait for the inactive thread to finish doing what it's doing.  It
1629 	 * won't exit until the a last reference to the vfs_t goes away.
1630 	 */
1631 	if (mi->mi_inactive_thread != NULL) {
1632 		mutex_enter(&mi->mi_lock);
1633 		while (!(mi->mi_flags & MI4_INACTIVE_IDLE) ||
1634 		    (mi->mi_async_reqs[NFS4_INACTIVE] != NULL)) {
1635 			mutex_exit(&mi->mi_lock);
1636 			if (!cv_wait_sig(&mi->mi_async_cv,
1637 			    &mi->mi_async_lock)) {
1638 				intr = TRUE;
1639 				goto interrupted;
1640 			}
1641 			mutex_enter(&mi->mi_lock);
1642 		}
1643 		mutex_exit(&mi->mi_lock);
1644 	}
1645 interrupted:
1646 	if (intr)
1647 		mi->mi_max_threads = omax;
1648 	mutex_exit(&mi->mi_async_lock);
1649 
1650 	return (intr);
1651 }
1652 
1653 int
1654 nfs4_async_putapage(vnode_t *vp, page_t *pp, u_offset_t off, size_t len,
1655     int flags, cred_t *cr, int (*putapage)(vnode_t *, page_t *,
1656     u_offset_t, size_t, int, cred_t *))
1657 {
1658 	rnode4_t *rp;
1659 	mntinfo4_t *mi;
1660 	struct nfs4_async_reqs *args;
1661 
1662 	ASSERT(flags & B_ASYNC);
1663 	ASSERT(vp->v_vfsp != NULL);
1664 
1665 	rp = VTOR4(vp);
1666 	ASSERT(rp->r_count > 0);
1667 
1668 	mi = VTOMI4(vp);
1669 
1670 	/*
1671 	 * If we can't allocate a request structure, do the putpage
1672 	 * operation synchronously in this thread's context.
1673 	 */
1674 	if ((args = kmem_alloc(sizeof (*args), KM_NOSLEEP)) == NULL)
1675 		goto noasync;
1676 
1677 	args->a_next = NULL;
1678 #ifdef DEBUG
1679 	args->a_queuer = curthread;
1680 #endif
1681 	VN_HOLD(vp);
1682 	args->a_vp = vp;
1683 	ASSERT(cr != NULL);
1684 	crhold(cr);
1685 	args->a_cred = cr;
1686 	args->a_io = NFS4_PUTAPAGE;
1687 	args->a_nfs4_putapage = putapage;
1688 	args->a_nfs4_pp = pp;
1689 	args->a_nfs4_off = off;
1690 	args->a_nfs4_len = (uint_t)len;
1691 	args->a_nfs4_flags = flags;
1692 
1693 	mutex_enter(&mi->mi_async_lock);
1694 
1695 	/*
1696 	 * If asyncio has been disabled, then make a synchronous request.
1697 	 * This check is done a second time in case async io was diabled
1698 	 * while this thread was blocked waiting for memory pressure to
1699 	 * reduce or for the queue to drain.
1700 	 */
1701 	if (mi->mi_max_threads == 0) {
1702 		mutex_exit(&mi->mi_async_lock);
1703 
1704 		VN_RELE(vp);
1705 		crfree(cr);
1706 		kmem_free(args, sizeof (*args));
1707 		goto noasync;
1708 	}
1709 
1710 	/*
1711 	 * Link request structure into the async list and
1712 	 * wakeup async thread to do the i/o.
1713 	 */
1714 	if (mi->mi_async_reqs[NFS4_PUTAPAGE] == NULL) {
1715 		mi->mi_async_reqs[NFS4_PUTAPAGE] = args;
1716 		mi->mi_async_tail[NFS4_PUTAPAGE] = args;
1717 	} else {
1718 		mi->mi_async_tail[NFS4_PUTAPAGE]->a_next = args;
1719 		mi->mi_async_tail[NFS4_PUTAPAGE] = args;
1720 	}
1721 
1722 	mutex_enter(&rp->r_statelock);
1723 	rp->r_count++;
1724 	rp->r_awcount++;
1725 	mutex_exit(&rp->r_statelock);
1726 
1727 	if (mi->mi_io_kstats) {
1728 		mutex_enter(&mi->mi_lock);
1729 		kstat_waitq_enter(KSTAT_IO_PTR(mi->mi_io_kstats));
1730 		mutex_exit(&mi->mi_lock);
1731 	}
1732 
1733 	mi->mi_async_req_count++;
1734 	ASSERT(mi->mi_async_req_count != 0);
1735 	cv_signal(&mi->mi_async_reqs_cv);
1736 	mutex_exit(&mi->mi_async_lock);
1737 	return (0);
1738 
1739 noasync:
1740 
1741 	if (curproc == proc_pageout || curproc == proc_fsflush ||
1742 	    nfs_zone() == mi->mi_zone) {
1743 		/*
1744 		 * If we get here in the context of the pageout/fsflush,
1745 		 * or we have run out of memory or we're attempting to
1746 		 * unmount we refuse to do a sync write, because this may
1747 		 * hang pageout/fsflush and the machine. In this case,
1748 		 * we just re-mark the page as dirty and punt on the page.
1749 		 *
1750 		 * Make sure B_FORCE isn't set.  We can re-mark the
1751 		 * pages as dirty and unlock the pages in one swoop by
1752 		 * passing in B_ERROR to pvn_write_done().  However,
1753 		 * we should make sure B_FORCE isn't set - we don't
1754 		 * want the page tossed before it gets written out.
1755 		 */
1756 		if (flags & B_FORCE)
1757 			flags &= ~(B_INVAL | B_FORCE);
1758 		pvn_write_done(pp, flags | B_ERROR);
1759 		return (0);
1760 	}
1761 
1762 	/*
1763 	 * We'll get here only if (nfs_zone() != mi->mi_zone)
1764 	 * which means that this was a cross-zone sync putpage.
1765 	 *
1766 	 * We pass in B_ERROR to pvn_write_done() to re-mark the pages
1767 	 * as dirty and unlock them.
1768 	 *
1769 	 * We don't want to clear B_FORCE here as the caller presumably
1770 	 * knows what they're doing if they set it.
1771 	 */
1772 	pvn_write_done(pp, flags | B_ERROR);
1773 	return (EPERM);
1774 }
1775 
1776 int
1777 nfs4_async_pageio(vnode_t *vp, page_t *pp, u_offset_t io_off, size_t io_len,
1778     int flags, cred_t *cr, int (*pageio)(vnode_t *, page_t *, u_offset_t,
1779     size_t, int, cred_t *))
1780 {
1781 	rnode4_t *rp;
1782 	mntinfo4_t *mi;
1783 	struct nfs4_async_reqs *args;
1784 
1785 	ASSERT(flags & B_ASYNC);
1786 	ASSERT(vp->v_vfsp != NULL);
1787 
1788 	rp = VTOR4(vp);
1789 	ASSERT(rp->r_count > 0);
1790 
1791 	mi = VTOMI4(vp);
1792 
1793 	/*
1794 	 * If we can't allocate a request structure, do the pageio
1795 	 * request synchronously in this thread's context.
1796 	 */
1797 	if ((args = kmem_alloc(sizeof (*args), KM_NOSLEEP)) == NULL)
1798 		goto noasync;
1799 
1800 	args->a_next = NULL;
1801 #ifdef DEBUG
1802 	args->a_queuer = curthread;
1803 #endif
1804 	VN_HOLD(vp);
1805 	args->a_vp = vp;
1806 	ASSERT(cr != NULL);
1807 	crhold(cr);
1808 	args->a_cred = cr;
1809 	args->a_io = NFS4_PAGEIO;
1810 	args->a_nfs4_pageio = pageio;
1811 	args->a_nfs4_pp = pp;
1812 	args->a_nfs4_off = io_off;
1813 	args->a_nfs4_len = (uint_t)io_len;
1814 	args->a_nfs4_flags = flags;
1815 
1816 	mutex_enter(&mi->mi_async_lock);
1817 
1818 	/*
1819 	 * If asyncio has been disabled, then make a synchronous request.
1820 	 * This check is done a second time in case async io was diabled
1821 	 * while this thread was blocked waiting for memory pressure to
1822 	 * reduce or for the queue to drain.
1823 	 */
1824 	if (mi->mi_max_threads == 0) {
1825 		mutex_exit(&mi->mi_async_lock);
1826 
1827 		VN_RELE(vp);
1828 		crfree(cr);
1829 		kmem_free(args, sizeof (*args));
1830 		goto noasync;
1831 	}
1832 
1833 	/*
1834 	 * Link request structure into the async list and
1835 	 * wakeup async thread to do the i/o.
1836 	 */
1837 	if (mi->mi_async_reqs[NFS4_PAGEIO] == NULL) {
1838 		mi->mi_async_reqs[NFS4_PAGEIO] = args;
1839 		mi->mi_async_tail[NFS4_PAGEIO] = args;
1840 	} else {
1841 		mi->mi_async_tail[NFS4_PAGEIO]->a_next = args;
1842 		mi->mi_async_tail[NFS4_PAGEIO] = args;
1843 	}
1844 
1845 	mutex_enter(&rp->r_statelock);
1846 	rp->r_count++;
1847 	rp->r_awcount++;
1848 	mutex_exit(&rp->r_statelock);
1849 
1850 	if (mi->mi_io_kstats) {
1851 		mutex_enter(&mi->mi_lock);
1852 		kstat_waitq_enter(KSTAT_IO_PTR(mi->mi_io_kstats));
1853 		mutex_exit(&mi->mi_lock);
1854 	}
1855 
1856 	mi->mi_async_req_count++;
1857 	ASSERT(mi->mi_async_req_count != 0);
1858 	cv_signal(&mi->mi_async_reqs_cv);
1859 	mutex_exit(&mi->mi_async_lock);
1860 	return (0);
1861 
1862 noasync:
1863 	/*
1864 	 * If we can't do it ASYNC, for reads we do nothing (but cleanup
1865 	 * the page list), for writes we do it synchronously, except for
1866 	 * proc_pageout/proc_fsflush as described below.
1867 	 */
1868 	if (flags & B_READ) {
1869 		pvn_read_done(pp, flags | B_ERROR);
1870 		return (0);
1871 	}
1872 
1873 	if (curproc == proc_pageout || curproc == proc_fsflush) {
1874 		/*
1875 		 * If we get here in the context of the pageout/fsflush,
1876 		 * we refuse to do a sync write, because this may hang
1877 		 * pageout/fsflush (and the machine). In this case, we just
1878 		 * re-mark the page as dirty and punt on the page.
1879 		 *
1880 		 * Make sure B_FORCE isn't set.  We can re-mark the
1881 		 * pages as dirty and unlock the pages in one swoop by
1882 		 * passing in B_ERROR to pvn_write_done().  However,
1883 		 * we should make sure B_FORCE isn't set - we don't
1884 		 * want the page tossed before it gets written out.
1885 		 */
1886 		if (flags & B_FORCE)
1887 			flags &= ~(B_INVAL | B_FORCE);
1888 		pvn_write_done(pp, flags | B_ERROR);
1889 		return (0);
1890 	}
1891 
1892 	if (nfs_zone() != mi->mi_zone) {
1893 		/*
1894 		 * So this was a cross-zone sync pageio.  We pass in B_ERROR
1895 		 * to pvn_write_done() to re-mark the pages as dirty and unlock
1896 		 * them.
1897 		 *
1898 		 * We don't want to clear B_FORCE here as the caller presumably
1899 		 * knows what they're doing if they set it.
1900 		 */
1901 		pvn_write_done(pp, flags | B_ERROR);
1902 		return (EPERM);
1903 	}
1904 	return ((*pageio)(vp, pp, io_off, io_len, flags, cr));
1905 }
1906 
1907 void
1908 nfs4_async_readdir(vnode_t *vp, rddir4_cache *rdc, cred_t *cr,
1909     int (*readdir)(vnode_t *, rddir4_cache *, cred_t *))
1910 {
1911 	rnode4_t *rp;
1912 	mntinfo4_t *mi;
1913 	struct nfs4_async_reqs *args;
1914 
1915 	rp = VTOR4(vp);
1916 	ASSERT(rp->r_freef == NULL);
1917 
1918 	mi = VTOMI4(vp);
1919 
1920 	/*
1921 	 * If we can't allocate a request structure, skip the readdir.
1922 	 */
1923 	if ((args = kmem_alloc(sizeof (*args), KM_NOSLEEP)) == NULL)
1924 		goto noasync;
1925 
1926 	args->a_next = NULL;
1927 #ifdef DEBUG
1928 	args->a_queuer = curthread;
1929 #endif
1930 	VN_HOLD(vp);
1931 	args->a_vp = vp;
1932 	ASSERT(cr != NULL);
1933 	crhold(cr);
1934 	args->a_cred = cr;
1935 	args->a_io = NFS4_READDIR;
1936 	args->a_nfs4_readdir = readdir;
1937 	args->a_nfs4_rdc = rdc;
1938 
1939 	mutex_enter(&mi->mi_async_lock);
1940 
1941 	/*
1942 	 * If asyncio has been disabled, then skip this request
1943 	 */
1944 	if (mi->mi_max_threads == 0) {
1945 		mutex_exit(&mi->mi_async_lock);
1946 
1947 		VN_RELE(vp);
1948 		crfree(cr);
1949 		kmem_free(args, sizeof (*args));
1950 		goto noasync;
1951 	}
1952 
1953 	/*
1954 	 * Link request structure into the async list and
1955 	 * wakeup async thread to do the i/o.
1956 	 */
1957 	if (mi->mi_async_reqs[NFS4_READDIR] == NULL) {
1958 		mi->mi_async_reqs[NFS4_READDIR] = args;
1959 		mi->mi_async_tail[NFS4_READDIR] = args;
1960 	} else {
1961 		mi->mi_async_tail[NFS4_READDIR]->a_next = args;
1962 		mi->mi_async_tail[NFS4_READDIR] = args;
1963 	}
1964 
1965 	mutex_enter(&rp->r_statelock);
1966 	rp->r_count++;
1967 	mutex_exit(&rp->r_statelock);
1968 
1969 	if (mi->mi_io_kstats) {
1970 		mutex_enter(&mi->mi_lock);
1971 		kstat_waitq_enter(KSTAT_IO_PTR(mi->mi_io_kstats));
1972 		mutex_exit(&mi->mi_lock);
1973 	}
1974 
1975 	mi->mi_async_req_count++;
1976 	ASSERT(mi->mi_async_req_count != 0);
1977 	cv_signal(&mi->mi_async_reqs_cv);
1978 	mutex_exit(&mi->mi_async_lock);
1979 	return;
1980 
1981 noasync:
1982 	mutex_enter(&rp->r_statelock);
1983 	rdc->entries = NULL;
1984 	/*
1985 	 * Indicate that no one is trying to fill this entry and
1986 	 * it still needs to be filled.
1987 	 */
1988 	rdc->flags &= ~RDDIR;
1989 	rdc->flags |= RDDIRREQ;
1990 	rddir4_cache_rele(rp, rdc);
1991 	mutex_exit(&rp->r_statelock);
1992 }
1993 
1994 void
1995 nfs4_async_commit(vnode_t *vp, page_t *plist, offset3 offset, count3 count,
1996     cred_t *cr, void (*commit)(vnode_t *, page_t *, offset3, count3,
1997     cred_t *))
1998 {
1999 	rnode4_t *rp;
2000 	mntinfo4_t *mi;
2001 	struct nfs4_async_reqs *args;
2002 	page_t *pp;
2003 
2004 	rp = VTOR4(vp);
2005 	mi = VTOMI4(vp);
2006 
2007 	/*
2008 	 * If we can't allocate a request structure, do the commit
2009 	 * operation synchronously in this thread's context.
2010 	 */
2011 	if ((args = kmem_alloc(sizeof (*args), KM_NOSLEEP)) == NULL)
2012 		goto noasync;
2013 
2014 	args->a_next = NULL;
2015 #ifdef DEBUG
2016 	args->a_queuer = curthread;
2017 #endif
2018 	VN_HOLD(vp);
2019 	args->a_vp = vp;
2020 	ASSERT(cr != NULL);
2021 	crhold(cr);
2022 	args->a_cred = cr;
2023 	args->a_io = NFS4_COMMIT;
2024 	args->a_nfs4_commit = commit;
2025 	args->a_nfs4_plist = plist;
2026 	args->a_nfs4_offset = offset;
2027 	args->a_nfs4_count = count;
2028 
2029 	mutex_enter(&mi->mi_async_lock);
2030 
2031 	/*
2032 	 * If asyncio has been disabled, then make a synchronous request.
2033 	 * This check is done a second time in case async io was diabled
2034 	 * while this thread was blocked waiting for memory pressure to
2035 	 * reduce or for the queue to drain.
2036 	 */
2037 	if (mi->mi_max_threads == 0) {
2038 		mutex_exit(&mi->mi_async_lock);
2039 
2040 		VN_RELE(vp);
2041 		crfree(cr);
2042 		kmem_free(args, sizeof (*args));
2043 		goto noasync;
2044 	}
2045 
2046 	/*
2047 	 * Link request structure into the async list and
2048 	 * wakeup async thread to do the i/o.
2049 	 */
2050 	if (mi->mi_async_reqs[NFS4_COMMIT] == NULL) {
2051 		mi->mi_async_reqs[NFS4_COMMIT] = args;
2052 		mi->mi_async_tail[NFS4_COMMIT] = args;
2053 	} else {
2054 		mi->mi_async_tail[NFS4_COMMIT]->a_next = args;
2055 		mi->mi_async_tail[NFS4_COMMIT] = args;
2056 	}
2057 
2058 	mutex_enter(&rp->r_statelock);
2059 	rp->r_count++;
2060 	mutex_exit(&rp->r_statelock);
2061 
2062 	if (mi->mi_io_kstats) {
2063 		mutex_enter(&mi->mi_lock);
2064 		kstat_waitq_enter(KSTAT_IO_PTR(mi->mi_io_kstats));
2065 		mutex_exit(&mi->mi_lock);
2066 	}
2067 
2068 	mi->mi_async_req_count++;
2069 	ASSERT(mi->mi_async_req_count != 0);
2070 	cv_signal(&mi->mi_async_reqs_cv);
2071 	mutex_exit(&mi->mi_async_lock);
2072 	return;
2073 
2074 noasync:
2075 	if (curproc == proc_pageout || curproc == proc_fsflush ||
2076 	    nfs_zone() != mi->mi_zone) {
2077 		while (plist != NULL) {
2078 			pp = plist;
2079 			page_sub(&plist, pp);
2080 			pp->p_fsdata = C_COMMIT;
2081 			page_unlock(pp);
2082 		}
2083 		return;
2084 	}
2085 	(*commit)(vp, plist, offset, count, cr);
2086 }
2087 
2088 /*
2089  * nfs4_async_inactive - hand off a VOP_INACTIVE call to a thread.  The
2090  * reference to the vnode is handed over to the thread; the caller should
2091  * no longer refer to the vnode.
2092  *
2093  * Unlike most of the async routines, this handoff is needed for
2094  * correctness reasons, not just performance.  So doing operations in the
2095  * context of the current thread is not an option.
2096  */
2097 void
2098 nfs4_async_inactive(vnode_t *vp, cred_t *cr)
2099 {
2100 	mntinfo4_t *mi;
2101 	struct nfs4_async_reqs *args;
2102 	boolean_t signal_inactive_thread = B_FALSE;
2103 
2104 	mi = VTOMI4(vp);
2105 
2106 	args = kmem_alloc(sizeof (*args), KM_SLEEP);
2107 	args->a_next = NULL;
2108 #ifdef DEBUG
2109 	args->a_queuer = curthread;
2110 #endif
2111 	args->a_vp = vp;
2112 	ASSERT(cr != NULL);
2113 	crhold(cr);
2114 	args->a_cred = cr;
2115 	args->a_io = NFS4_INACTIVE;
2116 
2117 	/*
2118 	 * Note that we don't check mi->mi_max_threads here, since we
2119 	 * *need* to get rid of this vnode regardless of whether someone
2120 	 * set nfs4_max_threads to zero in /etc/system.
2121 	 *
2122 	 * The manager thread knows about this and is willing to create
2123 	 * at least one thread to accommodate us.
2124 	 */
2125 	mutex_enter(&mi->mi_async_lock);
2126 	if (mi->mi_inactive_thread == NULL) {
2127 		rnode4_t *rp;
2128 		vnode_t *unldvp = NULL;
2129 		char *unlname;
2130 		cred_t *unlcred;
2131 
2132 		mutex_exit(&mi->mi_async_lock);
2133 		/*
2134 		 * We just need to free up the memory associated with the
2135 		 * vnode, which can be safely done from within the current
2136 		 * context.
2137 		 */
2138 		crfree(cr);	/* drop our reference */
2139 		kmem_free(args, sizeof (*args));
2140 		rp = VTOR4(vp);
2141 		mutex_enter(&rp->r_statelock);
2142 		if (rp->r_unldvp != NULL) {
2143 			unldvp = rp->r_unldvp;
2144 			rp->r_unldvp = NULL;
2145 			unlname = rp->r_unlname;
2146 			rp->r_unlname = NULL;
2147 			unlcred = rp->r_unlcred;
2148 			rp->r_unlcred = NULL;
2149 		}
2150 		mutex_exit(&rp->r_statelock);
2151 		/*
2152 		 * No need to explicitly throw away any cached pages.  The
2153 		 * eventual r4inactive() will attempt a synchronous
2154 		 * VOP_PUTPAGE() which will immediately fail since the request
2155 		 * is coming from the wrong zone, and then will proceed to call
2156 		 * nfs4_invalidate_pages() which will clean things up for us.
2157 		 *
2158 		 * Throw away the delegation here so rp4_addfree()'s attempt to
2159 		 * return any existing delegations becomes a no-op.
2160 		 */
2161 		if (rp->r_deleg_type != OPEN_DELEGATE_NONE) {
2162 			(void) nfs_rw_enter_sig(&mi->mi_recovlock, RW_READER,
2163 			    FALSE);
2164 			(void) nfs4delegreturn(rp, NFS4_DR_DISCARD);
2165 			nfs_rw_exit(&mi->mi_recovlock);
2166 		}
2167 		nfs4_clear_open_streams(rp);
2168 
2169 		rp4_addfree(rp, cr);
2170 		if (unldvp != NULL) {
2171 			kmem_free(unlname, MAXNAMELEN);
2172 			VN_RELE(unldvp);
2173 			crfree(unlcred);
2174 		}
2175 		return;
2176 	}
2177 
2178 	if (mi->mi_manager_thread == NULL) {
2179 		/*
2180 		 * We want to talk to the inactive thread.
2181 		 */
2182 		signal_inactive_thread = B_TRUE;
2183 	}
2184 
2185 	/*
2186 	 * Enqueue the vnode and wake up either the special thread (empty
2187 	 * list) or an async thread.
2188 	 */
2189 	if (mi->mi_async_reqs[NFS4_INACTIVE] == NULL) {
2190 		mi->mi_async_reqs[NFS4_INACTIVE] = args;
2191 		mi->mi_async_tail[NFS4_INACTIVE] = args;
2192 		signal_inactive_thread = B_TRUE;
2193 	} else {
2194 		mi->mi_async_tail[NFS4_INACTIVE]->a_next = args;
2195 		mi->mi_async_tail[NFS4_INACTIVE] = args;
2196 	}
2197 	if (signal_inactive_thread) {
2198 		cv_signal(&mi->mi_inact_req_cv);
2199 	} else  {
2200 		mi->mi_async_req_count++;
2201 		ASSERT(mi->mi_async_req_count != 0);
2202 		cv_signal(&mi->mi_async_reqs_cv);
2203 	}
2204 
2205 	mutex_exit(&mi->mi_async_lock);
2206 }
2207 
2208 int
2209 writerp4(rnode4_t *rp, caddr_t base, int tcount, struct uio *uio, int pgcreated)
2210 {
2211 	int pagecreate;
2212 	int n;
2213 	int saved_n;
2214 	caddr_t saved_base;
2215 	u_offset_t offset;
2216 	int error;
2217 	int sm_error;
2218 	vnode_t *vp = RTOV(rp);
2219 
2220 	ASSERT(tcount <= MAXBSIZE && tcount <= uio->uio_resid);
2221 	ASSERT(nfs_rw_lock_held(&rp->r_rwlock, RW_WRITER));
2222 	if (!vpm_enable) {
2223 		ASSERT(((uintptr_t)base & MAXBOFFSET) + tcount <= MAXBSIZE);
2224 	}
2225 
2226 	/*
2227 	 * Move bytes in at most PAGESIZE chunks. We must avoid
2228 	 * spanning pages in uiomove() because page faults may cause
2229 	 * the cache to be invalidated out from under us. The r_size is not
2230 	 * updated until after the uiomove. If we push the last page of a
2231 	 * file before r_size is correct, we will lose the data written past
2232 	 * the current (and invalid) r_size.
2233 	 */
2234 	do {
2235 		offset = uio->uio_loffset;
2236 		pagecreate = 0;
2237 
2238 		/*
2239 		 * n is the number of bytes required to satisfy the request
2240 		 *   or the number of bytes to fill out the page.
2241 		 */
2242 		n = (int)MIN((PAGESIZE - (offset & PAGEOFFSET)), tcount);
2243 
2244 		/*
2245 		 * Check to see if we can skip reading in the page
2246 		 * and just allocate the memory.  We can do this
2247 		 * if we are going to rewrite the entire mapping
2248 		 * or if we are going to write to or beyond the current
2249 		 * end of file from the beginning of the mapping.
2250 		 *
2251 		 * The read of r_size is now protected by r_statelock.
2252 		 */
2253 		mutex_enter(&rp->r_statelock);
2254 		/*
2255 		 * When pgcreated is nonzero the caller has already done
2256 		 * a segmap_getmapflt with forcefault 0 and S_WRITE. With
2257 		 * segkpm this means we already have at least one page
2258 		 * created and mapped at base.
2259 		 */
2260 		pagecreate = pgcreated ||
2261 		    ((offset & PAGEOFFSET) == 0 &&
2262 		    (n == PAGESIZE || ((offset + n) >= rp->r_size)));
2263 
2264 		mutex_exit(&rp->r_statelock);
2265 
2266 		if (!vpm_enable && pagecreate) {
2267 			/*
2268 			 * The last argument tells segmap_pagecreate() to
2269 			 * always lock the page, as opposed to sometimes
2270 			 * returning with the page locked. This way we avoid a
2271 			 * fault on the ensuing uiomove(), but also
2272 			 * more importantly (to fix bug 1094402) we can
2273 			 * call segmap_fault() to unlock the page in all
2274 			 * cases. An alternative would be to modify
2275 			 * segmap_pagecreate() to tell us when it is
2276 			 * locking a page, but that's a fairly major
2277 			 * interface change.
2278 			 */
2279 			if (pgcreated == 0)
2280 				(void) segmap_pagecreate(segkmap, base,
2281 				    (uint_t)n, 1);
2282 			saved_base = base;
2283 			saved_n = n;
2284 		}
2285 
2286 		/*
2287 		 * The number of bytes of data in the last page can not
2288 		 * be accurately be determined while page is being
2289 		 * uiomove'd to and the size of the file being updated.
2290 		 * Thus, inform threads which need to know accurately
2291 		 * how much data is in the last page of the file.  They
2292 		 * will not do the i/o immediately, but will arrange for
2293 		 * the i/o to happen later when this modify operation
2294 		 * will have finished.
2295 		 */
2296 		ASSERT(!(rp->r_flags & R4MODINPROGRESS));
2297 		mutex_enter(&rp->r_statelock);
2298 		rp->r_flags |= R4MODINPROGRESS;
2299 		rp->r_modaddr = (offset & MAXBMASK);
2300 		mutex_exit(&rp->r_statelock);
2301 
2302 		if (vpm_enable) {
2303 			/*
2304 			 * Copy data. If new pages are created, part of
2305 			 * the page that is not written will be initizliazed
2306 			 * with zeros.
2307 			 */
2308 			error = vpm_data_copy(vp, offset, n, uio,
2309 			    !pagecreate, NULL, 0, S_WRITE);
2310 		} else {
2311 			error = uiomove(base, n, UIO_WRITE, uio);
2312 		}
2313 
2314 		/*
2315 		 * r_size is the maximum number of
2316 		 * bytes known to be in the file.
2317 		 * Make sure it is at least as high as the
2318 		 * first unwritten byte pointed to by uio_loffset.
2319 		 */
2320 		mutex_enter(&rp->r_statelock);
2321 		if (rp->r_size < uio->uio_loffset)
2322 			rp->r_size = uio->uio_loffset;
2323 		rp->r_flags &= ~R4MODINPROGRESS;
2324 		rp->r_flags |= R4DIRTY;
2325 		mutex_exit(&rp->r_statelock);
2326 
2327 		/* n = # of bytes written */
2328 		n = (int)(uio->uio_loffset - offset);
2329 
2330 		if (!vpm_enable) {
2331 			base += n;
2332 		}
2333 
2334 		tcount -= n;
2335 		/*
2336 		 * If we created pages w/o initializing them completely,
2337 		 * we need to zero the part that wasn't set up.
2338 		 * This happens on a most EOF write cases and if
2339 		 * we had some sort of error during the uiomove.
2340 		 */
2341 		if (!vpm_enable && pagecreate) {
2342 			if ((uio->uio_loffset & PAGEOFFSET) || n == 0)
2343 				(void) kzero(base, PAGESIZE - n);
2344 
2345 			if (pgcreated) {
2346 				/*
2347 				 * Caller is responsible for this page,
2348 				 * it was not created in this loop.
2349 				 */
2350 				pgcreated = 0;
2351 			} else {
2352 				/*
2353 				 * For bug 1094402: segmap_pagecreate locks
2354 				 * page. Unlock it. This also unlocks the
2355 				 * pages allocated by page_create_va() in
2356 				 * segmap_pagecreate().
2357 				 */
2358 				sm_error = segmap_fault(kas.a_hat, segkmap,
2359 				    saved_base, saved_n,
2360 				    F_SOFTUNLOCK, S_WRITE);
2361 				if (error == 0)
2362 					error = sm_error;
2363 			}
2364 		}
2365 	} while (tcount > 0 && error == 0);
2366 
2367 	return (error);
2368 }
2369 
2370 int
2371 nfs4_putpages(vnode_t *vp, u_offset_t off, size_t len, int flags, cred_t *cr)
2372 {
2373 	rnode4_t *rp;
2374 	page_t *pp;
2375 	u_offset_t eoff;
2376 	u_offset_t io_off;
2377 	size_t io_len;
2378 	int error;
2379 	int rdirty;
2380 	int err;
2381 
2382 	rp = VTOR4(vp);
2383 	ASSERT(rp->r_count > 0);
2384 
2385 	if (!nfs4_has_pages(vp))
2386 		return (0);
2387 
2388 	ASSERT(vp->v_type != VCHR);
2389 
2390 	/*
2391 	 * If R4OUTOFSPACE is set, then all writes turn into B_INVAL
2392 	 * writes.  B_FORCE is set to force the VM system to actually
2393 	 * invalidate the pages, even if the i/o failed.  The pages
2394 	 * need to get invalidated because they can't be written out
2395 	 * because there isn't any space left on either the server's
2396 	 * file system or in the user's disk quota.  The B_FREE bit
2397 	 * is cleared to avoid confusion as to whether this is a
2398 	 * request to place the page on the freelist or to destroy
2399 	 * it.
2400 	 */
2401 	if ((rp->r_flags & R4OUTOFSPACE) ||
2402 	    (vp->v_vfsp->vfs_flag & VFS_UNMOUNTED))
2403 		flags = (flags & ~B_FREE) | B_INVAL | B_FORCE;
2404 
2405 	if (len == 0) {
2406 		/*
2407 		 * If doing a full file synchronous operation, then clear
2408 		 * the R4DIRTY bit.  If a page gets dirtied while the flush
2409 		 * is happening, then R4DIRTY will get set again.  The
2410 		 * R4DIRTY bit must get cleared before the flush so that
2411 		 * we don't lose this information.
2412 		 *
2413 		 * If there are no full file async write operations
2414 		 * pending and RDIRTY bit is set, clear it.
2415 		 */
2416 		if (off == (u_offset_t)0 &&
2417 		    !(flags & B_ASYNC) &&
2418 		    (rp->r_flags & R4DIRTY)) {
2419 			mutex_enter(&rp->r_statelock);
2420 			rdirty = (rp->r_flags & R4DIRTY);
2421 			rp->r_flags &= ~R4DIRTY;
2422 			mutex_exit(&rp->r_statelock);
2423 		} else if (flags & B_ASYNC && off == (u_offset_t)0) {
2424 			mutex_enter(&rp->r_statelock);
2425 			if (rp->r_flags & R4DIRTY && rp->r_awcount == 0) {
2426 				rdirty = (rp->r_flags & R4DIRTY);
2427 				rp->r_flags &= ~R4DIRTY;
2428 			}
2429 			mutex_exit(&rp->r_statelock);
2430 		} else
2431 			rdirty = 0;
2432 
2433 		/*
2434 		 * Search the entire vp list for pages >= off, and flush
2435 		 * the dirty pages.
2436 		 */
2437 		error = pvn_vplist_dirty(vp, off, rp->r_putapage,
2438 		    flags, cr);
2439 
2440 		/*
2441 		 * If an error occurred and the file was marked as dirty
2442 		 * before and we aren't forcibly invalidating pages, then
2443 		 * reset the R4DIRTY flag.
2444 		 */
2445 		if (error && rdirty &&
2446 		    (flags & (B_INVAL | B_FORCE)) != (B_INVAL | B_FORCE)) {
2447 			mutex_enter(&rp->r_statelock);
2448 			rp->r_flags |= R4DIRTY;
2449 			mutex_exit(&rp->r_statelock);
2450 		}
2451 	} else {
2452 		/*
2453 		 * Do a range from [off...off + len) looking for pages
2454 		 * to deal with.
2455 		 */
2456 		error = 0;
2457 		io_len = 0;
2458 		eoff = off + len;
2459 		mutex_enter(&rp->r_statelock);
2460 		for (io_off = off; io_off < eoff && io_off < rp->r_size;
2461 		    io_off += io_len) {
2462 			mutex_exit(&rp->r_statelock);
2463 			/*
2464 			 * If we are not invalidating, synchronously
2465 			 * freeing or writing pages use the routine
2466 			 * page_lookup_nowait() to prevent reclaiming
2467 			 * them from the free list.
2468 			 */
2469 			if ((flags & B_INVAL) || !(flags & B_ASYNC)) {
2470 				pp = page_lookup(vp, io_off,
2471 				    (flags & (B_INVAL | B_FREE)) ?
2472 				    SE_EXCL : SE_SHARED);
2473 			} else {
2474 				pp = page_lookup_nowait(vp, io_off,
2475 				    (flags & B_FREE) ? SE_EXCL : SE_SHARED);
2476 			}
2477 
2478 			if (pp == NULL || !pvn_getdirty(pp, flags))
2479 				io_len = PAGESIZE;
2480 			else {
2481 				err = (*rp->r_putapage)(vp, pp, &io_off,
2482 				    &io_len, flags, cr);
2483 				if (!error)
2484 					error = err;
2485 				/*
2486 				 * "io_off" and "io_len" are returned as
2487 				 * the range of pages we actually wrote.
2488 				 * This allows us to skip ahead more quickly
2489 				 * since several pages may've been dealt
2490 				 * with by this iteration of the loop.
2491 				 */
2492 			}
2493 			mutex_enter(&rp->r_statelock);
2494 		}
2495 		mutex_exit(&rp->r_statelock);
2496 	}
2497 
2498 	return (error);
2499 }
2500 
2501 void
2502 nfs4_invalidate_pages(vnode_t *vp, u_offset_t off, cred_t *cr)
2503 {
2504 	rnode4_t *rp;
2505 
2506 	rp = VTOR4(vp);
2507 	if (IS_SHADOW(vp, rp))
2508 		vp = RTOV4(rp);
2509 	mutex_enter(&rp->r_statelock);
2510 	while (rp->r_flags & R4TRUNCATE)
2511 		cv_wait(&rp->r_cv, &rp->r_statelock);
2512 	rp->r_flags |= R4TRUNCATE;
2513 	if (off == (u_offset_t)0) {
2514 		rp->r_flags &= ~R4DIRTY;
2515 		if (!(rp->r_flags & R4STALE))
2516 			rp->r_error = 0;
2517 	}
2518 	rp->r_truncaddr = off;
2519 	mutex_exit(&rp->r_statelock);
2520 	(void) pvn_vplist_dirty(vp, off, rp->r_putapage,
2521 	    B_INVAL | B_TRUNC, cr);
2522 	mutex_enter(&rp->r_statelock);
2523 	rp->r_flags &= ~R4TRUNCATE;
2524 	cv_broadcast(&rp->r_cv);
2525 	mutex_exit(&rp->r_statelock);
2526 }
2527 
2528 static int
2529 nfs4_mnt_kstat_update(kstat_t *ksp, int rw)
2530 {
2531 	mntinfo4_t *mi;
2532 	struct mntinfo_kstat *mik;
2533 	vfs_t *vfsp;
2534 
2535 	/* this is a read-only kstat. Bail out on a write */
2536 	if (rw == KSTAT_WRITE)
2537 		return (EACCES);
2538 
2539 
2540 	/*
2541 	 * We don't want to wait here as kstat_chain_lock could be held by
2542 	 * dounmount(). dounmount() takes vfs_reflock before the chain lock
2543 	 * and thus could lead to a deadlock.
2544 	 */
2545 	vfsp = (struct vfs *)ksp->ks_private;
2546 
2547 	mi = VFTOMI4(vfsp);
2548 	mik = (struct mntinfo_kstat *)ksp->ks_data;
2549 
2550 	(void) strcpy(mik->mik_proto, mi->mi_curr_serv->sv_knconf->knc_proto);
2551 
2552 	mik->mik_vers = (uint32_t)mi->mi_vers;
2553 	mik->mik_flags = mi->mi_flags;
2554 	/*
2555 	 * The sv_secdata holds the flavor the client specifies.
2556 	 * If the client uses default and a security negotiation
2557 	 * occurs, sv_currsec will point to the current flavor
2558 	 * selected from the server flavor list.
2559 	 * sv_currsec is NULL if no security negotiation takes place.
2560 	 */
2561 	mik->mik_secmod = mi->mi_curr_serv->sv_currsec ?
2562 	    mi->mi_curr_serv->sv_currsec->secmod :
2563 	    mi->mi_curr_serv->sv_secdata->secmod;
2564 	mik->mik_curread = (uint32_t)mi->mi_curread;
2565 	mik->mik_curwrite = (uint32_t)mi->mi_curwrite;
2566 	mik->mik_retrans = mi->mi_retrans;
2567 	mik->mik_timeo = mi->mi_timeo;
2568 	mik->mik_acregmin = HR2SEC(mi->mi_acregmin);
2569 	mik->mik_acregmax = HR2SEC(mi->mi_acregmax);
2570 	mik->mik_acdirmin = HR2SEC(mi->mi_acdirmin);
2571 	mik->mik_acdirmax = HR2SEC(mi->mi_acdirmax);
2572 	mik->mik_noresponse = (uint32_t)mi->mi_noresponse;
2573 	mik->mik_failover = (uint32_t)mi->mi_failover;
2574 	mik->mik_remap = (uint32_t)mi->mi_remap;
2575 
2576 	(void) strcpy(mik->mik_curserver, mi->mi_curr_serv->sv_hostname);
2577 
2578 	return (0);
2579 }
2580 
2581 void
2582 nfs4_mnt_kstat_init(struct vfs *vfsp)
2583 {
2584 	mntinfo4_t *mi = VFTOMI4(vfsp);
2585 
2586 	/*
2587 	 * PSARC 2001/697 Contract Private Interface
2588 	 * All nfs kstats are under SunMC contract
2589 	 * Please refer to the PSARC listed above and contact
2590 	 * SunMC before making any changes!
2591 	 *
2592 	 * Changes must be reviewed by Solaris File Sharing
2593 	 * Changes must be communicated to contract-2001-697@sun.com
2594 	 *
2595 	 */
2596 
2597 	mi->mi_io_kstats = kstat_create_zone("nfs", getminor(vfsp->vfs_dev),
2598 	    NULL, "nfs", KSTAT_TYPE_IO, 1, 0, mi->mi_zone->zone_id);
2599 	if (mi->mi_io_kstats) {
2600 		if (mi->mi_zone->zone_id != GLOBAL_ZONEID)
2601 			kstat_zone_add(mi->mi_io_kstats, GLOBAL_ZONEID);
2602 		mi->mi_io_kstats->ks_lock = &mi->mi_lock;
2603 		kstat_install(mi->mi_io_kstats);
2604 	}
2605 
2606 	if ((mi->mi_ro_kstats = kstat_create_zone("nfs",
2607 	    getminor(vfsp->vfs_dev), "mntinfo", "misc", KSTAT_TYPE_RAW,
2608 	    sizeof (struct mntinfo_kstat), 0, mi->mi_zone->zone_id)) != NULL) {
2609 		if (mi->mi_zone->zone_id != GLOBAL_ZONEID)
2610 			kstat_zone_add(mi->mi_ro_kstats, GLOBAL_ZONEID);
2611 		mi->mi_ro_kstats->ks_update = nfs4_mnt_kstat_update;
2612 		mi->mi_ro_kstats->ks_private = (void *)vfsp;
2613 		kstat_install(mi->mi_ro_kstats);
2614 	}
2615 
2616 	nfs4_mnt_recov_kstat_init(vfsp);
2617 }
2618 
2619 void
2620 nfs4_write_error(vnode_t *vp, int error, cred_t *cr)
2621 {
2622 	mntinfo4_t *mi;
2623 
2624 	mi = VTOMI4(vp);
2625 	/*
2626 	 * In case of forced unmount, do not print any messages
2627 	 * since it can flood the console with error messages.
2628 	 */
2629 	if (mi->mi_vfsp->vfs_flag & VFS_UNMOUNTED)
2630 		return;
2631 
2632 	/*
2633 	 * If the mount point is dead, not recoverable, do not
2634 	 * print error messages that can flood the console.
2635 	 */
2636 	if (mi->mi_flags & MI4_RECOV_FAIL)
2637 		return;
2638 
2639 	/*
2640 	 * No use in flooding the console with ENOSPC
2641 	 * messages from the same file system.
2642 	 */
2643 	if ((error != ENOSPC && error != EDQUOT) ||
2644 	    lbolt - mi->mi_printftime > 0) {
2645 		zoneid_t zoneid = mi->mi_zone->zone_id;
2646 
2647 #ifdef DEBUG
2648 		nfs_perror(error, "NFS%ld write error on host %s: %m.\n",
2649 		    mi->mi_vers, VTOR4(vp)->r_server->sv_hostname, NULL);
2650 #else
2651 		nfs_perror(error, "NFS write error on host %s: %m.\n",
2652 		    VTOR4(vp)->r_server->sv_hostname, NULL);
2653 #endif
2654 		if (error == ENOSPC || error == EDQUOT) {
2655 			zcmn_err(zoneid, CE_CONT,
2656 			    "^File: userid=%d, groupid=%d\n",
2657 			    crgetuid(cr), crgetgid(cr));
2658 			if (crgetuid(curthread->t_cred) != crgetuid(cr) ||
2659 			    crgetgid(curthread->t_cred) != crgetgid(cr)) {
2660 				zcmn_err(zoneid, CE_CONT,
2661 				    "^User: userid=%d, groupid=%d\n",
2662 				    crgetuid(curthread->t_cred),
2663 				    crgetgid(curthread->t_cred));
2664 			}
2665 			mi->mi_printftime = lbolt +
2666 			    nfs_write_error_interval * hz;
2667 		}
2668 		sfh4_printfhandle(VTOR4(vp)->r_fh);
2669 #ifdef DEBUG
2670 		if (error == EACCES) {
2671 			zcmn_err(zoneid, CE_CONT,
2672 			    "nfs_bio: cred is%s kcred\n",
2673 			    cr == kcred ? "" : " not");
2674 		}
2675 #endif
2676 	}
2677 }
2678 
2679 /*
2680  * Return non-zero if the given file can be safely memory mapped.  Locks
2681  * are safe if whole-file (length and offset are both zero).
2682  */
2683 
2684 #define	SAFE_LOCK(flk)	((flk).l_start == 0 && (flk).l_len == 0)
2685 
2686 static int
2687 nfs4_safemap(const vnode_t *vp)
2688 {
2689 	locklist_t	*llp, *next_llp;
2690 	int		safe = 1;
2691 	rnode4_t	*rp = VTOR4(vp);
2692 
2693 	ASSERT(nfs_rw_lock_held(&rp->r_lkserlock, RW_WRITER));
2694 
2695 	NFS4_DEBUG(nfs4_client_map_debug, (CE_NOTE, "nfs4_safemap: "
2696 	    "vp = %p", (void *)vp));
2697 
2698 	/*
2699 	 * Review all the locks for the vnode, both ones that have been
2700 	 * acquired and ones that are pending.  We assume that
2701 	 * flk_active_locks_for_vp() has merged any locks that can be
2702 	 * merged (so that if a process has the entire file locked, it is
2703 	 * represented as a single lock).
2704 	 *
2705 	 * Note that we can't bail out of the loop if we find a non-safe
2706 	 * lock, because we have to free all the elements in the llp list.
2707 	 * We might be able to speed up this code slightly by not looking
2708 	 * at each lock's l_start and l_len fields once we've found a
2709 	 * non-safe lock.
2710 	 */
2711 
2712 	llp = flk_active_locks_for_vp(vp);
2713 	while (llp) {
2714 		NFS4_DEBUG(nfs4_client_map_debug, (CE_NOTE,
2715 		    "nfs4_safemap: active lock (%" PRId64 ", %" PRId64 ")",
2716 		    llp->ll_flock.l_start, llp->ll_flock.l_len));
2717 		if (!SAFE_LOCK(llp->ll_flock)) {
2718 			safe = 0;
2719 			NFS4_DEBUG(nfs4_client_map_debug, (CE_NOTE,
2720 			    "nfs4_safemap: unsafe active lock (%" PRId64
2721 			    ", %" PRId64 ")", llp->ll_flock.l_start,
2722 			    llp->ll_flock.l_len));
2723 		}
2724 		next_llp = llp->ll_next;
2725 		VN_RELE(llp->ll_vp);
2726 		kmem_free(llp, sizeof (*llp));
2727 		llp = next_llp;
2728 	}
2729 
2730 	NFS4_DEBUG(nfs4_client_map_debug, (CE_NOTE, "nfs4_safemap: %s",
2731 	    safe ? "safe" : "unsafe"));
2732 	return (safe);
2733 }
2734 
2735 /*
2736  * Return whether there is a lost LOCK or LOCKU queued up for the given
2737  * file that would make an mmap request unsafe.  cf. nfs4_safemap().
2738  */
2739 
2740 bool_t
2741 nfs4_map_lost_lock_conflict(vnode_t *vp)
2742 {
2743 	bool_t conflict = FALSE;
2744 	nfs4_lost_rqst_t *lrp;
2745 	mntinfo4_t *mi = VTOMI4(vp);
2746 
2747 	mutex_enter(&mi->mi_lock);
2748 	for (lrp = list_head(&mi->mi_lost_state); lrp != NULL;
2749 	    lrp = list_next(&mi->mi_lost_state, lrp)) {
2750 		if (lrp->lr_op != OP_LOCK && lrp->lr_op != OP_LOCKU)
2751 			continue;
2752 		ASSERT(lrp->lr_vp != NULL);
2753 		if (!VOP_CMP(lrp->lr_vp, vp, NULL))
2754 			continue;	/* different file */
2755 		if (!SAFE_LOCK(*lrp->lr_flk)) {
2756 			conflict = TRUE;
2757 			break;
2758 		}
2759 	}
2760 
2761 	mutex_exit(&mi->mi_lock);
2762 	return (conflict);
2763 }
2764 
2765 /*
2766  * nfs_lockcompletion:
2767  *
2768  * If the vnode has a lock that makes it unsafe to cache the file, mark it
2769  * as non cachable (set VNOCACHE bit).
2770  */
2771 
2772 void
2773 nfs4_lockcompletion(vnode_t *vp, int cmd)
2774 {
2775 	rnode4_t *rp = VTOR4(vp);
2776 
2777 	ASSERT(nfs_rw_lock_held(&rp->r_lkserlock, RW_WRITER));
2778 	ASSERT(!IS_SHADOW(vp, rp));
2779 
2780 	if (cmd == F_SETLK || cmd == F_SETLKW) {
2781 
2782 		if (!nfs4_safemap(vp)) {
2783 			mutex_enter(&vp->v_lock);
2784 			vp->v_flag |= VNOCACHE;
2785 			mutex_exit(&vp->v_lock);
2786 		} else {
2787 			mutex_enter(&vp->v_lock);
2788 			vp->v_flag &= ~VNOCACHE;
2789 			mutex_exit(&vp->v_lock);
2790 		}
2791 	}
2792 	/*
2793 	 * The cached attributes of the file are stale after acquiring
2794 	 * the lock on the file. They were updated when the file was
2795 	 * opened, but not updated when the lock was acquired. Therefore the
2796 	 * cached attributes are invalidated after the lock is obtained.
2797 	 */
2798 	PURGE_ATTRCACHE4(vp);
2799 }
2800 
2801 /* ARGSUSED */
2802 static void *
2803 nfs4_mi_init(zoneid_t zoneid)
2804 {
2805 	struct mi4_globals *mig;
2806 
2807 	mig = kmem_alloc(sizeof (*mig), KM_SLEEP);
2808 	mutex_init(&mig->mig_lock, NULL, MUTEX_DEFAULT, NULL);
2809 	list_create(&mig->mig_list, sizeof (mntinfo4_t),
2810 	    offsetof(mntinfo4_t, mi_zone_node));
2811 	mig->mig_destructor_called = B_FALSE;
2812 	return (mig);
2813 }
2814 
2815 /*
2816  * Callback routine to tell all NFSv4 mounts in the zone to start tearing down
2817  * state and killing off threads.
2818  */
2819 /* ARGSUSED */
2820 static void
2821 nfs4_mi_shutdown(zoneid_t zoneid, void *data)
2822 {
2823 	struct mi4_globals *mig = data;
2824 	mntinfo4_t *mi;
2825 	nfs4_server_t *np;
2826 
2827 	NFS4_DEBUG(nfs4_client_zone_debug, (CE_NOTE,
2828 	    "nfs4_mi_shutdown zone %d\n", zoneid));
2829 	ASSERT(mig != NULL);
2830 	for (;;) {
2831 		mutex_enter(&mig->mig_lock);
2832 		mi = list_head(&mig->mig_list);
2833 		if (mi == NULL) {
2834 			mutex_exit(&mig->mig_lock);
2835 			break;
2836 		}
2837 
2838 		NFS4_DEBUG(nfs4_client_zone_debug, (CE_NOTE,
2839 		    "nfs4_mi_shutdown stopping vfs %p\n", (void *)mi->mi_vfsp));
2840 		/*
2841 		 * purge the DNLC for this filesystem
2842 		 */
2843 		(void) dnlc_purge_vfsp(mi->mi_vfsp, 0);
2844 		/*
2845 		 * Tell existing async worker threads to exit.
2846 		 */
2847 		mutex_enter(&mi->mi_async_lock);
2848 		mi->mi_max_threads = 0;
2849 		cv_broadcast(&mi->mi_async_work_cv);
2850 		/*
2851 		 * Set the appropriate flags, signal and wait for both the
2852 		 * async manager and the inactive thread to exit when they're
2853 		 * done with their current work.
2854 		 */
2855 		mutex_enter(&mi->mi_lock);
2856 		mi->mi_flags |= (MI4_ASYNC_MGR_STOP|MI4_DEAD);
2857 		mutex_exit(&mi->mi_lock);
2858 		mutex_exit(&mi->mi_async_lock);
2859 		if (mi->mi_manager_thread) {
2860 			nfs4_async_manager_stop(mi->mi_vfsp);
2861 		}
2862 		if (mi->mi_inactive_thread) {
2863 			mutex_enter(&mi->mi_async_lock);
2864 			cv_signal(&mi->mi_inact_req_cv);
2865 			/*
2866 			 * Wait for the inactive thread to exit.
2867 			 */
2868 			while (mi->mi_inactive_thread != NULL) {
2869 				cv_wait(&mi->mi_async_cv, &mi->mi_async_lock);
2870 			}
2871 			mutex_exit(&mi->mi_async_lock);
2872 		}
2873 		/*
2874 		 * Wait for the recovery thread to complete, that is, it will
2875 		 * signal when it is done using the "mi" structure and about
2876 		 * to exit
2877 		 */
2878 		mutex_enter(&mi->mi_lock);
2879 		while (mi->mi_in_recovery > 0)
2880 			cv_wait(&mi->mi_cv_in_recov, &mi->mi_lock);
2881 		mutex_exit(&mi->mi_lock);
2882 		/*
2883 		 * We're done when every mi has been done or the list is empty.
2884 		 * This one is done, remove it from the list.
2885 		 */
2886 		list_remove(&mig->mig_list, mi);
2887 		mutex_exit(&mig->mig_lock);
2888 		zone_rele(mi->mi_zone);
2889 		/*
2890 		 * Release hold on vfs and mi done to prevent race with zone
2891 		 * shutdown. This releases the hold in nfs4_mi_zonelist_add.
2892 		 */
2893 		VFS_RELE(mi->mi_vfsp);
2894 		MI4_RELE(mi);
2895 	}
2896 	/*
2897 	 * Tell each renew thread in the zone to exit
2898 	 */
2899 	mutex_enter(&nfs4_server_lst_lock);
2900 	for (np = nfs4_server_lst.forw; np != &nfs4_server_lst; np = np->forw) {
2901 		mutex_enter(&np->s_lock);
2902 		if (np->zoneid == zoneid) {
2903 			/*
2904 			 * We add another hold onto the nfs4_server_t
2905 			 * because this will make sure tha the nfs4_server_t
2906 			 * stays around until nfs4_callback_fini_zone destroys
2907 			 * the zone. This way, the renew thread can
2908 			 * unconditionally release its holds on the
2909 			 * nfs4_server_t.
2910 			 */
2911 			np->s_refcnt++;
2912 			nfs4_mark_srv_dead(np);
2913 		}
2914 		mutex_exit(&np->s_lock);
2915 	}
2916 	mutex_exit(&nfs4_server_lst_lock);
2917 }
2918 
2919 static void
2920 nfs4_mi_free_globals(struct mi4_globals *mig)
2921 {
2922 	list_destroy(&mig->mig_list);	/* makes sure the list is empty */
2923 	mutex_destroy(&mig->mig_lock);
2924 	kmem_free(mig, sizeof (*mig));
2925 }
2926 
2927 /* ARGSUSED */
2928 static void
2929 nfs4_mi_destroy(zoneid_t zoneid, void *data)
2930 {
2931 	struct mi4_globals *mig = data;
2932 
2933 	NFS4_DEBUG(nfs4_client_zone_debug, (CE_NOTE,
2934 	    "nfs4_mi_destroy zone %d\n", zoneid));
2935 	ASSERT(mig != NULL);
2936 	mutex_enter(&mig->mig_lock);
2937 	if (list_head(&mig->mig_list) != NULL) {
2938 		/* Still waiting for VFS_FREEVFS() */
2939 		mig->mig_destructor_called = B_TRUE;
2940 		mutex_exit(&mig->mig_lock);
2941 		return;
2942 	}
2943 	nfs4_mi_free_globals(mig);
2944 }
2945 
2946 /*
2947  * Add an NFS mount to the per-zone list of NFS mounts.
2948  */
2949 void
2950 nfs4_mi_zonelist_add(mntinfo4_t *mi)
2951 {
2952 	struct mi4_globals *mig;
2953 
2954 	mig = zone_getspecific(mi4_list_key, mi->mi_zone);
2955 	mutex_enter(&mig->mig_lock);
2956 	list_insert_head(&mig->mig_list, mi);
2957 	/*
2958 	 * hold added to eliminate race with zone shutdown -this will be
2959 	 * released in mi_shutdown
2960 	 */
2961 	MI4_HOLD(mi);
2962 	VFS_HOLD(mi->mi_vfsp);
2963 	mutex_exit(&mig->mig_lock);
2964 }
2965 
2966 /*
2967  * Remove an NFS mount from the per-zone list of NFS mounts.
2968  */
2969 int
2970 nfs4_mi_zonelist_remove(mntinfo4_t *mi)
2971 {
2972 	struct mi4_globals *mig;
2973 	int ret = 0;
2974 
2975 	mig = zone_getspecific(mi4_list_key, mi->mi_zone);
2976 	mutex_enter(&mig->mig_lock);
2977 	mutex_enter(&mi->mi_lock);
2978 	/* if this mi is marked dead, then the zone already released it */
2979 	if (!(mi->mi_flags & MI4_DEAD)) {
2980 		list_remove(&mig->mig_list, mi);
2981 
2982 		/* release the holds put on in zonelist_add(). */
2983 		VFS_RELE(mi->mi_vfsp);
2984 		MI4_RELE(mi);
2985 		ret = 1;
2986 	}
2987 	mutex_exit(&mi->mi_lock);
2988 
2989 	/*
2990 	 * We can be called asynchronously by VFS_FREEVFS() after the zone
2991 	 * shutdown/destroy callbacks have executed; if so, clean up the zone's
2992 	 * mi globals.
2993 	 */
2994 	if (list_head(&mig->mig_list) == NULL &&
2995 	    mig->mig_destructor_called == B_TRUE) {
2996 		nfs4_mi_free_globals(mig);
2997 		return (ret);
2998 	}
2999 	mutex_exit(&mig->mig_lock);
3000 	return (ret);
3001 }
3002 
3003 void
3004 nfs_free_mi4(mntinfo4_t *mi)
3005 {
3006 	nfs4_open_owner_t	*foop;
3007 	nfs4_oo_hash_bucket_t   *bucketp;
3008 	nfs4_debug_msg_t	*msgp;
3009 	int i;
3010 	servinfo4_t 		*svp;
3011 
3012 	mutex_enter(&mi->mi_lock);
3013 	ASSERT(mi->mi_recovthread == NULL);
3014 	ASSERT(mi->mi_flags & MI4_ASYNC_MGR_STOP);
3015 	mutex_exit(&mi->mi_lock);
3016 	mutex_enter(&mi->mi_async_lock);
3017 	ASSERT(mi->mi_threads == 0);
3018 	ASSERT(mi->mi_manager_thread == NULL);
3019 	mutex_exit(&mi->mi_async_lock);
3020 	svp = mi->mi_servers;
3021 	sv4_free(svp);
3022 	if (mi->mi_io_kstats) {
3023 		kstat_delete(mi->mi_io_kstats);
3024 		mi->mi_io_kstats = NULL;
3025 	}
3026 	if (mi->mi_ro_kstats) {
3027 		kstat_delete(mi->mi_ro_kstats);
3028 		mi->mi_ro_kstats = NULL;
3029 	}
3030 	if (mi->mi_recov_ksp) {
3031 		kstat_delete(mi->mi_recov_ksp);
3032 		mi->mi_recov_ksp = NULL;
3033 	}
3034 	mutex_enter(&mi->mi_msg_list_lock);
3035 	while (msgp = list_head(&mi->mi_msg_list)) {
3036 		list_remove(&mi->mi_msg_list, msgp);
3037 		nfs4_free_msg(msgp);
3038 	}
3039 	mutex_exit(&mi->mi_msg_list_lock);
3040 	list_destroy(&mi->mi_msg_list);
3041 	if (mi->mi_fname != NULL)
3042 		fn_rele(&mi->mi_fname);
3043 	if (mi->mi_rootfh != NULL)
3044 		sfh4_rele(&mi->mi_rootfh);
3045 	if (mi->mi_srvparentfh != NULL)
3046 		sfh4_rele(&mi->mi_srvparentfh);
3047 	mutex_destroy(&mi->mi_lock);
3048 	mutex_destroy(&mi->mi_async_lock);
3049 	mutex_destroy(&mi->mi_msg_list_lock);
3050 	nfs_rw_destroy(&mi->mi_recovlock);
3051 	nfs_rw_destroy(&mi->mi_rename_lock);
3052 	nfs_rw_destroy(&mi->mi_fh_lock);
3053 	cv_destroy(&mi->mi_failover_cv);
3054 	cv_destroy(&mi->mi_async_reqs_cv);
3055 	cv_destroy(&mi->mi_async_work_cv);
3056 	cv_destroy(&mi->mi_async_cv);
3057 	cv_destroy(&mi->mi_inact_req_cv);
3058 	/*
3059 	 * Destroy the oo hash lists and mutexes for the cred hash table.
3060 	 */
3061 	for (i = 0; i < NFS4_NUM_OO_BUCKETS; i++) {
3062 		bucketp = &(mi->mi_oo_list[i]);
3063 		/* Destroy any remaining open owners on the list */
3064 		foop = list_head(&bucketp->b_oo_hash_list);
3065 		while (foop != NULL) {
3066 			list_remove(&bucketp->b_oo_hash_list, foop);
3067 			nfs4_destroy_open_owner(foop);
3068 			foop = list_head(&bucketp->b_oo_hash_list);
3069 		}
3070 		list_destroy(&bucketp->b_oo_hash_list);
3071 		mutex_destroy(&bucketp->b_lock);
3072 	}
3073 	/*
3074 	 * Empty and destroy the freed open owner list.
3075 	 */
3076 	foop = list_head(&mi->mi_foo_list);
3077 	while (foop != NULL) {
3078 		list_remove(&mi->mi_foo_list, foop);
3079 		nfs4_destroy_open_owner(foop);
3080 		foop = list_head(&mi->mi_foo_list);
3081 	}
3082 	list_destroy(&mi->mi_foo_list);
3083 	list_destroy(&mi->mi_bseqid_list);
3084 	list_destroy(&mi->mi_lost_state);
3085 	avl_destroy(&mi->mi_filehandles);
3086 	kmem_free(mi, sizeof (*mi));
3087 }
/*
 * Take a reference on the given mntinfo4_t by atomically incrementing
 * mi_count.  The matching release is mi_rele().
 */
void
mi_hold(mntinfo4_t *mi)
{
	atomic_add_32(&mi->mi_count, 1);
	/* A zero count right after an increment means the counter wrapped. */
	ASSERT(mi->mi_count != 0);
}
3094 
3095 void
3096 mi_rele(mntinfo4_t *mi)
3097 {
3098 	ASSERT(mi->mi_count != 0);
3099 	if (atomic_add_32_nv(&mi->mi_count, -1) == 0) {
3100 		nfs_free_mi4(mi);
3101 	}
3102 }
3103 
/*
 * Placeholder vnode for the "xattrs not supported" cache entry.
 * nfs4_clnt_init() pins its v_count at 1.
 */
vnode_t    nfs4_xattr_notsupp_vnode;

/*
 * One-time initialisation of the NFSv4 client: bring up each client
 * subsystem in turn, register the CPR (suspend/resume) callback, and
 * create the per-zone key whose callbacks are nfs4_mi_init,
 * nfs4_mi_shutdown and nfs4_mi_destroy.
 *
 * The return values of the (void)-cast init routines are deliberately
 * ignored here.
 */
void
nfs4_clnt_init(void)
{
	nfs4_vnops_init();
	(void) nfs4_rnode_init();
	(void) nfs4_shadow_init();
	(void) nfs4_acache_init();
	(void) nfs4_subr_init();
	nfs4_acl_init();
	nfs_idmap_init();
	nfs4_callback_init();
	nfs4_secinfo_init();
#ifdef	DEBUG
	/* Thread-specific data key, only present on DEBUG kernels. */
	tsd_create(&nfs4_tsd_key, NULL);
#endif

	/*
	 * Add a CPR callback so that we can update client
	 * lease after a suspend and resume.
	 */
	cid = callb_add(nfs4_client_cpr_callb, 0, CB_CL_CPR_RPC, "nfs4");

	zone_key_create(&mi4_list_key, nfs4_mi_init, nfs4_mi_shutdown,
	    nfs4_mi_destroy);

	/*
	 * Initialise the reference count of the notsupp xattr cache vnode to 1
	 * so that it never goes away (VOP_INACTIVE isn't called on it).
	 */
	nfs4_xattr_notsupp_vnode.v_count = 1;
}
3137 
/*
 * Undo nfs4_clnt_init(): delete the per-zone key first, shut down the
 * client subsystems, and finally remove the CPR callback if one was
 * registered.
 *
 * NOTE(review): nfs4_clnt_init() calls nfs4_acl_init() but no matching
 * fini call is made here -- confirm that none is required.
 */
void
nfs4_clnt_fini(void)
{
	(void) zone_key_delete(mi4_list_key);
	nfs4_vnops_fini();
	(void) nfs4_rnode_fini();
	(void) nfs4_shadow_fini();
	(void) nfs4_acache_fini();
	(void) nfs4_subr_fini();
	nfs_idmap_fini();
	nfs4_callback_fini();
	nfs4_secinfo_fini();
#ifdef	DEBUG
	/* Matches the DEBUG-only tsd_create() in nfs4_clnt_init(). */
	tsd_destroy(&nfs4_tsd_key);
#endif
	/* cid is only non-zero if callb_add() handed back an id. */
	if (cid)
		(void) callb_delete(cid);
}
3156 
3157 /*ARGSUSED*/
3158 static boolean_t
3159 nfs4_client_cpr_callb(void *arg, int code)
3160 {
3161 	/*
3162 	 * We get called for Suspend and Resume events.
3163 	 * For the suspend case we simply don't care!
3164 	 */
3165 	if (code == CB_CODE_CPR_CHKPT) {
3166 		return (B_TRUE);
3167 	}
3168 
3169 	/*
3170 	 * When we get to here we are in the process of
3171 	 * resuming the system from a previous suspend.
3172 	 */
3173 	nfs4_client_resumed = gethrestime_sec();
3174 	return (B_TRUE);
3175 }
3176 
3177 void
3178 nfs4_renew_lease_thread(nfs4_server_t *sp)
3179 {
3180 	int	error = 0;
3181 	time_t	tmp_last_renewal_time, tmp_time, tmp_now_time, kip_secs;
3182 	clock_t	tick_delay = 0;
3183 	clock_t time_left = 0;
3184 	callb_cpr_t cpr_info;
3185 	kmutex_t cpr_lock;
3186 
3187 	NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE,
3188 	    "nfs4_renew_lease_thread: acting on sp 0x%p", (void*)sp));
3189 	mutex_init(&cpr_lock, NULL, MUTEX_DEFAULT, NULL);
3190 	CALLB_CPR_INIT(&cpr_info, &cpr_lock, callb_generic_cpr, "nfsv4Lease");
3191 
3192 	mutex_enter(&sp->s_lock);
3193 	/* sp->s_lease_time is set via a GETATTR */
3194 	sp->last_renewal_time = gethrestime_sec();
3195 	sp->lease_valid = NFS4_LEASE_UNINITIALIZED;
3196 	ASSERT(sp->s_refcnt >= 1);
3197 
3198 	for (;;) {
3199 		if (!sp->state_ref_count ||
3200 		    sp->lease_valid != NFS4_LEASE_VALID) {
3201 
3202 			kip_secs = MAX((sp->s_lease_time >> 1) -
3203 			    (3 * sp->propagation_delay.tv_sec), 1);
3204 
3205 			tick_delay = SEC_TO_TICK(kip_secs);
3206 
3207 			NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE,
3208 			    "nfs4_renew_lease_thread: no renew : thread "
3209 			    "wait %ld secs", kip_secs));
3210 
3211 			NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE,
3212 			    "nfs4_renew_lease_thread: no renew : "
3213 			    "state_ref_count %d, lease_valid %d",
3214 			    sp->state_ref_count, sp->lease_valid));
3215 
3216 			mutex_enter(&cpr_lock);
3217 			CALLB_CPR_SAFE_BEGIN(&cpr_info);
3218 			mutex_exit(&cpr_lock);
3219 			time_left = cv_timedwait(&sp->cv_thread_exit,
3220 			    &sp->s_lock, tick_delay + lbolt);
3221 			mutex_enter(&cpr_lock);
3222 			CALLB_CPR_SAFE_END(&cpr_info, &cpr_lock);
3223 			mutex_exit(&cpr_lock);
3224 
3225 			NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE,
3226 			    "nfs4_renew_lease_thread: no renew: "
3227 			    "time left %ld", time_left));
3228 
3229 			if (sp->s_thread_exit == NFS4_THREAD_EXIT)
3230 				goto die;
3231 			continue;
3232 		}
3233 
3234 		tmp_last_renewal_time = sp->last_renewal_time;
3235 
3236 		tmp_time = gethrestime_sec() - sp->last_renewal_time +
3237 		    (3 * sp->propagation_delay.tv_sec);
3238 
3239 		NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE,
3240 		    "nfs4_renew_lease_thread: tmp_time %ld, "
3241 		    "sp->last_renewal_time %ld", tmp_time,
3242 		    sp->last_renewal_time));
3243 
3244 		kip_secs = MAX((sp->s_lease_time >> 1) - tmp_time, 1);
3245 
3246 		tick_delay = SEC_TO_TICK(kip_secs);
3247 
3248 		NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE,
3249 		    "nfs4_renew_lease_thread: valid lease: sleep for %ld "
3250 		    "secs", kip_secs));
3251 
3252 		mutex_enter(&cpr_lock);
3253 		CALLB_CPR_SAFE_BEGIN(&cpr_info);
3254 		mutex_exit(&cpr_lock);
3255 		time_left = cv_timedwait(&sp->cv_thread_exit, &sp->s_lock,
3256 		    tick_delay + lbolt);
3257 		mutex_enter(&cpr_lock);
3258 		CALLB_CPR_SAFE_END(&cpr_info, &cpr_lock);
3259 		mutex_exit(&cpr_lock);
3260 
3261 		NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE,
3262 		    "nfs4_renew_lease_thread: valid lease: time left %ld :"
3263 		    "sp last_renewal_time %ld, nfs4_client_resumed %ld, "
3264 		    "tmp_last_renewal_time %ld", time_left,
3265 		    sp->last_renewal_time, nfs4_client_resumed,
3266 		    tmp_last_renewal_time));
3267 
3268 		if (sp->s_thread_exit == NFS4_THREAD_EXIT)
3269 			goto die;
3270 
3271 		if (tmp_last_renewal_time == sp->last_renewal_time ||
3272 		    (nfs4_client_resumed != 0 &&
3273 		    nfs4_client_resumed > sp->last_renewal_time)) {
3274 			/*
3275 			 * Issue RENEW op since we haven't renewed the lease
3276 			 * since we slept.
3277 			 */
3278 			tmp_now_time = gethrestime_sec();
3279 			error = nfs4renew(sp);
3280 			/*
3281 			 * Need to re-acquire sp's lock, nfs4renew()
3282 			 * relinqueshes it.
3283 			 */
3284 			mutex_enter(&sp->s_lock);
3285 
3286 			/*
3287 			 * See if someone changed s_thread_exit while we gave
3288 			 * up s_lock.
3289 			 */
3290 			if (sp->s_thread_exit == NFS4_THREAD_EXIT)
3291 				goto die;
3292 
3293 			if (!error) {
3294 				/*
3295 				 * check to see if we implicitly renewed while
3296 				 * we waited for a reply for our RENEW call.
3297 				 */
3298 				if (tmp_last_renewal_time ==
3299 				    sp->last_renewal_time) {
3300 					/* no implicit renew came */
3301 					sp->last_renewal_time = tmp_now_time;
3302 				} else {
3303 					NFS4_DEBUG(nfs4_client_lease_debug,
3304 					    (CE_NOTE, "renew_thread: did "
3305 					    "implicit renewal before reply "
3306 					    "from server for RENEW"));
3307 				}
3308 			} else {
3309 				/* figure out error */
3310 				NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE,
3311 				    "renew_thread: nfs4renew returned error"
3312 				    " %d", error));
3313 			}
3314 
3315 		}
3316 	}
3317 
3318 die:
3319 	NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE,
3320 	    "nfs4_renew_lease_thread: thread exiting"));
3321 
3322 	while (sp->s_otw_call_count != 0) {
3323 		NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE,
3324 		    "nfs4_renew_lease_thread: waiting for outstanding "
3325 		    "otw calls to finish for sp 0x%p, current "
3326 		    "s_otw_call_count %d", (void *)sp,
3327 		    sp->s_otw_call_count));
3328 		mutex_enter(&cpr_lock);
3329 		CALLB_CPR_SAFE_BEGIN(&cpr_info);
3330 		mutex_exit(&cpr_lock);
3331 		cv_wait(&sp->s_cv_otw_count, &sp->s_lock);
3332 		mutex_enter(&cpr_lock);
3333 		CALLB_CPR_SAFE_END(&cpr_info, &cpr_lock);
3334 		mutex_exit(&cpr_lock);
3335 	}
3336 	mutex_exit(&sp->s_lock);
3337 
3338 	nfs4_server_rele(sp);		/* free the thread's reference */
3339 	nfs4_server_rele(sp);		/* free the list's reference */
3340 	sp = NULL;
3341 
3342 done:
3343 	mutex_enter(&cpr_lock);
3344 	CALLB_CPR_EXIT(&cpr_info);	/* drops cpr_lock */
3345 	mutex_destroy(&cpr_lock);
3346 
3347 	NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE,
3348 	    "nfs4_renew_lease_thread: renew thread exit officially"));
3349 
3350 	zthread_exit();
3351 	/* NOT REACHED */
3352 }
3353 
3354 /*
3355  * Send out a RENEW op to the server.
3356  * Assumes sp is locked down.
3357  */
3358 static int
3359 nfs4renew(nfs4_server_t *sp)
3360 {
3361 	COMPOUND4args_clnt args;
3362 	COMPOUND4res_clnt res;
3363 	nfs_argop4 argop[1];
3364 	int doqueue = 1;
3365 	int rpc_error;
3366 	cred_t *cr;
3367 	mntinfo4_t *mi;
3368 	timespec_t prop_time, after_time;
3369 	int needrecov = FALSE;
3370 	nfs4_recov_state_t recov_state;
3371 	nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };
3372 
3373 	NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE, "nfs4renew"));
3374 
3375 	recov_state.rs_flags = 0;
3376 	recov_state.rs_num_retry_despite_err = 0;
3377 
3378 recov_retry:
3379 	mi = sp->mntinfo4_list;
3380 	VFS_HOLD(mi->mi_vfsp);
3381 	mutex_exit(&sp->s_lock);
3382 	ASSERT(mi != NULL);
3383 
3384 	e.error = nfs4_start_op(mi, NULL, NULL, &recov_state);
3385 	if (e.error) {
3386 		VFS_RELE(mi->mi_vfsp);
3387 		return (e.error);
3388 	}
3389 
3390 	/* Check to see if we're dealing with a marked-dead sp */
3391 	mutex_enter(&sp->s_lock);
3392 	if (sp->s_thread_exit == NFS4_THREAD_EXIT) {
3393 		mutex_exit(&sp->s_lock);
3394 		nfs4_end_op(mi, NULL, NULL, &recov_state, needrecov);
3395 		VFS_RELE(mi->mi_vfsp);
3396 		return (0);
3397 	}
3398 
3399 	/* Make sure mi hasn't changed on us */
3400 	if (mi != sp->mntinfo4_list) {
3401 		/* Must drop sp's lock to avoid a recursive mutex enter */
3402 		mutex_exit(&sp->s_lock);
3403 		nfs4_end_op(mi, NULL, NULL, &recov_state, needrecov);
3404 		VFS_RELE(mi->mi_vfsp);
3405 		mutex_enter(&sp->s_lock);
3406 		goto recov_retry;
3407 	}
3408 	mutex_exit(&sp->s_lock);
3409 
3410 	args.ctag = TAG_RENEW;
3411 
3412 	args.array_len = 1;
3413 	args.array = argop;
3414 
3415 	argop[0].argop = OP_RENEW;
3416 
3417 	mutex_enter(&sp->s_lock);
3418 	argop[0].nfs_argop4_u.oprenew.clientid = sp->clientid;
3419 	cr = sp->s_cred;
3420 	crhold(cr);
3421 	mutex_exit(&sp->s_lock);
3422 
3423 	ASSERT(cr != NULL);
3424 
3425 	/* used to figure out RTT for sp */
3426 	gethrestime(&prop_time);
3427 
3428 	NFS4_DEBUG(nfs4_client_call_debug, (CE_NOTE,
3429 	    "nfs4renew: %s call, sp 0x%p", needrecov ? "recov" : "first",
3430 	    (void*)sp));
3431 	NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE, "before: %ld s %ld ns ",
3432 	    prop_time.tv_sec, prop_time.tv_nsec));
3433 
3434 	DTRACE_PROBE2(nfs4__renew__start, nfs4_server_t *, sp,
3435 	    mntinfo4_t *, mi);
3436 
3437 	rfs4call(mi, &args, &res, cr, &doqueue, 0, &e);
3438 	crfree(cr);
3439 
3440 	DTRACE_PROBE2(nfs4__renew__end, nfs4_server_t *, sp,
3441 	    mntinfo4_t *, mi);
3442 
3443 	gethrestime(&after_time);
3444 
3445 	mutex_enter(&sp->s_lock);
3446 	sp->propagation_delay.tv_sec =
3447 	    MAX(1, after_time.tv_sec - prop_time.tv_sec);
3448 	mutex_exit(&sp->s_lock);
3449 
3450 	NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE, "after : %ld s %ld ns ",
3451 	    after_time.tv_sec, after_time.tv_nsec));
3452 
3453 	if (e.error == 0 && res.status == NFS4ERR_CB_PATH_DOWN) {
3454 		(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
3455 		nfs4_delegreturn_all(sp);
3456 		nfs4_end_op(mi, NULL, NULL, &recov_state, needrecov);
3457 		VFS_RELE(mi->mi_vfsp);
3458 		/*
3459 		 * If the server returns CB_PATH_DOWN, it has renewed
3460 		 * the lease and informed us that the callback path is
3461 		 * down.  Since the lease is renewed, just return 0 and
3462 		 * let the renew thread proceed as normal.
3463 		 */
3464 		return (0);
3465 	}
3466 
3467 	needrecov = nfs4_needs_recovery(&e, FALSE, mi->mi_vfsp);
3468 	if (!needrecov && e.error) {
3469 		nfs4_end_op(mi, NULL, NULL, &recov_state, needrecov);
3470 		VFS_RELE(mi->mi_vfsp);
3471 		return (e.error);
3472 	}
3473 
3474 	rpc_error = e.error;
3475 
3476 	if (needrecov) {
3477 		NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
3478 		    "nfs4renew: initiating recovery\n"));
3479 
3480 		if (nfs4_start_recovery(&e, mi, NULL, NULL, NULL, NULL,
3481 		    OP_RENEW, NULL) == FALSE) {
3482 			nfs4_end_op(mi, NULL, NULL, &recov_state, needrecov);
3483 			VFS_RELE(mi->mi_vfsp);
3484 			if (!e.error)
3485 				(void) xdr_free(xdr_COMPOUND4res_clnt,
3486 				    (caddr_t)&res);
3487 			mutex_enter(&sp->s_lock);
3488 			goto recov_retry;
3489 		}
3490 		/* fall through for res.status case */
3491 	}
3492 
3493 	if (res.status) {
3494 		if (res.status == NFS4ERR_LEASE_MOVED) {
3495 			/*EMPTY*/
3496 			/*
3497 			 * XXX need to try every mntinfo4 in sp->mntinfo4_list
3498 			 * to renew the lease on that server
3499 			 */
3500 		}
3501 		e.error = geterrno4(res.status);
3502 	}
3503 
3504 	if (!rpc_error)
3505 		(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
3506 
3507 	nfs4_end_op(mi, NULL, NULL, &recov_state, needrecov);
3508 
3509 	VFS_RELE(mi->mi_vfsp);
3510 
3511 	return (e.error);
3512 }
3513 
3514 void
3515 nfs4_inc_state_ref_count(mntinfo4_t *mi)
3516 {
3517 	nfs4_server_t	*sp;
3518 
3519 	/* this locks down sp if it is found */
3520 	sp = find_nfs4_server(mi);
3521 
3522 	if (sp != NULL) {
3523 		nfs4_inc_state_ref_count_nolock(sp, mi);
3524 		mutex_exit(&sp->s_lock);
3525 		nfs4_server_rele(sp);
3526 	}
3527 }
3528 
3529 /*
3530  * Bump the number of OPEN files (ie: those with state) so we know if this
3531  * nfs4_server has any state to maintain a lease for or not.
3532  *
3533  * Also, marks the nfs4_server's lease valid if it hasn't been done so already.
3534  */
3535 void
3536 nfs4_inc_state_ref_count_nolock(nfs4_server_t *sp, mntinfo4_t *mi)
3537 {
3538 	ASSERT(mutex_owned(&sp->s_lock));
3539 
3540 	sp->state_ref_count++;
3541 	NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE,
3542 	    "nfs4_inc_state_ref_count: state_ref_count now %d",
3543 	    sp->state_ref_count));
3544 
3545 	if (sp->lease_valid == NFS4_LEASE_UNINITIALIZED)
3546 		sp->lease_valid = NFS4_LEASE_VALID;
3547 
3548 	/*
3549 	 * If this call caused the lease to be marked valid and/or
3550 	 * took the state_ref_count from 0 to 1, then start the time
3551 	 * on lease renewal.
3552 	 */
3553 	if (sp->lease_valid == NFS4_LEASE_VALID && sp->state_ref_count == 1)
3554 		sp->last_renewal_time = gethrestime_sec();
3555 
3556 	/* update the number of open files for mi */
3557 	mi->mi_open_files++;
3558 }
3559 
3560 void
3561 nfs4_dec_state_ref_count(mntinfo4_t *mi)
3562 {
3563 	nfs4_server_t	*sp;
3564 
3565 	/* this locks down sp if it is found */
3566 	sp = find_nfs4_server_all(mi, 1);
3567 
3568 	if (sp != NULL) {
3569 		nfs4_dec_state_ref_count_nolock(sp, mi);
3570 		mutex_exit(&sp->s_lock);
3571 		nfs4_server_rele(sp);
3572 	}
3573 }
3574 
3575 /*
3576  * Decrement the number of OPEN files (ie: those with state) so we know if
3577  * this nfs4_server has any state to maintain a lease for or not.
3578  */
3579 void
3580 nfs4_dec_state_ref_count_nolock(nfs4_server_t *sp, mntinfo4_t *mi)
3581 {
3582 	ASSERT(mutex_owned(&sp->s_lock));
3583 	ASSERT(sp->state_ref_count != 0);
3584 	sp->state_ref_count--;
3585 
3586 	NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE,
3587 	    "nfs4_dec_state_ref_count: state ref count now %d",
3588 	    sp->state_ref_count));
3589 
3590 	mi->mi_open_files--;
3591 	NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE,
3592 	    "nfs4_dec_state_ref_count: mi open files %d, v4 flags 0x%x",
3593 	    mi->mi_open_files, mi->mi_flags));
3594 
3595 	/* We don't have to hold the mi_lock to test mi_flags */
3596 	if (mi->mi_open_files == 0 &&
3597 	    (mi->mi_flags & MI4_REMOVE_ON_LAST_CLOSE)) {
3598 		NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE,
3599 		    "nfs4_dec_state_ref_count: remove mntinfo4 %p since "
3600 		    "we have closed the last open file", (void*)mi));
3601 		nfs4_remove_mi_from_server(mi, sp);
3602 	}
3603 }
3604 
3605 bool_t
3606 inlease(nfs4_server_t *sp)
3607 {
3608 	bool_t result;
3609 
3610 	ASSERT(mutex_owned(&sp->s_lock));
3611 
3612 	if (sp->lease_valid == NFS4_LEASE_VALID &&
3613 	    gethrestime_sec() < sp->last_renewal_time + sp->s_lease_time)
3614 		result = TRUE;
3615 	else
3616 		result = FALSE;
3617 
3618 	return (result);
3619 }
3620 
3621 
3622 /*
3623  * Return non-zero if the given nfs4_server_t is going through recovery.
3624  */
3625 
3626 int
3627 nfs4_server_in_recovery(nfs4_server_t *sp)
3628 {
3629 	return (nfs_rw_lock_held(&sp->s_recovlock, RW_WRITER));
3630 }
3631 
3632 /*
3633  * Compare two shared filehandle objects.  Returns -1, 0, or +1, if the
3634  * first is less than, equal to, or greater than the second.
3635  */
3636 
3637 int
3638 sfh4cmp(const void *p1, const void *p2)
3639 {
3640 	const nfs4_sharedfh_t *sfh1 = (const nfs4_sharedfh_t *)p1;
3641 	const nfs4_sharedfh_t *sfh2 = (const nfs4_sharedfh_t *)p2;
3642 
3643 	return (nfs4cmpfh(&sfh1->sfh_fh, &sfh2->sfh_fh));
3644 }
3645 
3646 /*
3647  * Create a table for shared filehandle objects.
3648  */
3649 
3650 void
3651 sfh4_createtab(avl_tree_t *tab)
3652 {
3653 	avl_create(tab, sfh4cmp, sizeof (nfs4_sharedfh_t),
3654 	    offsetof(nfs4_sharedfh_t, sfh_tree));
3655 }
3656 
3657 /*
3658  * Return a shared filehandle object for the given filehandle.  The caller
3659  * is responsible for eventually calling sfh4_rele().
3660  */
3661 
3662 nfs4_sharedfh_t *
3663 sfh4_put(const nfs_fh4 *fh, mntinfo4_t *mi, nfs4_sharedfh_t *key)
3664 {
3665 	nfs4_sharedfh_t *sfh, *nsfh;
3666 	avl_index_t where;
3667 	nfs4_sharedfh_t skey;
3668 
3669 	if (!key) {
3670 		skey.sfh_fh = *fh;
3671 		key = &skey;
3672 	}
3673 
3674 	nsfh = kmem_alloc(sizeof (nfs4_sharedfh_t), KM_SLEEP);
3675 	nsfh->sfh_fh.nfs_fh4_len = fh->nfs_fh4_len;
3676 	/*
3677 	 * We allocate the largest possible filehandle size because it's
3678 	 * not that big, and it saves us from possibly having to resize the
3679 	 * buffer later.
3680 	 */
3681 	nsfh->sfh_fh.nfs_fh4_val = kmem_alloc(NFS4_FHSIZE, KM_SLEEP);
3682 	bcopy(fh->nfs_fh4_val, nsfh->sfh_fh.nfs_fh4_val, fh->nfs_fh4_len);
3683 	mutex_init(&nsfh->sfh_lock, NULL, MUTEX_DEFAULT, NULL);
3684 	nsfh->sfh_refcnt = 1;
3685 	nsfh->sfh_flags = SFH4_IN_TREE;
3686 	nsfh->sfh_mi = mi;
3687 	NFS4_DEBUG(nfs4_sharedfh_debug, (CE_NOTE, "sfh4_get: new object (%p)",
3688 	    (void *)nsfh));
3689 
3690 	(void) nfs_rw_enter_sig(&mi->mi_fh_lock, RW_WRITER, 0);
3691 	sfh = avl_find(&mi->mi_filehandles, key, &where);
3692 	if (sfh != NULL) {
3693 		mutex_enter(&sfh->sfh_lock);
3694 		sfh->sfh_refcnt++;
3695 		mutex_exit(&sfh->sfh_lock);
3696 		nfs_rw_exit(&mi->mi_fh_lock);
3697 		/* free our speculative allocs */
3698 		kmem_free(nsfh->sfh_fh.nfs_fh4_val, NFS4_FHSIZE);
3699 		kmem_free(nsfh, sizeof (nfs4_sharedfh_t));
3700 		return (sfh);
3701 	}
3702 
3703 	avl_insert(&mi->mi_filehandles, nsfh, where);
3704 	nfs_rw_exit(&mi->mi_fh_lock);
3705 
3706 	return (nsfh);
3707 }
3708 
3709 /*
3710  * Return a shared filehandle object for the given filehandle.  The caller
3711  * is responsible for eventually calling sfh4_rele().
3712  */
3713 
3714 nfs4_sharedfh_t *
3715 sfh4_get(const nfs_fh4 *fh, mntinfo4_t *mi)
3716 {
3717 	nfs4_sharedfh_t *sfh;
3718 	nfs4_sharedfh_t key;
3719 
3720 	ASSERT(fh->nfs_fh4_len <= NFS4_FHSIZE);
3721 
3722 #ifdef DEBUG
3723 	if (nfs4_sharedfh_debug) {
3724 		nfs4_fhandle_t fhandle;
3725 
3726 		fhandle.fh_len = fh->nfs_fh4_len;
3727 		bcopy(fh->nfs_fh4_val, fhandle.fh_buf, fhandle.fh_len);
3728 		zcmn_err(mi->mi_zone->zone_id, CE_NOTE, "sfh4_get:");
3729 		nfs4_printfhandle(&fhandle);
3730 	}
3731 #endif
3732 
3733 	/*
3734 	 * If there's already an object for the given filehandle, bump the
3735 	 * reference count and return it.  Otherwise, create a new object
3736 	 * and add it to the AVL tree.
3737 	 */
3738 
3739 	key.sfh_fh = *fh;
3740 
3741 	(void) nfs_rw_enter_sig(&mi->mi_fh_lock, RW_READER, 0);
3742 	sfh = avl_find(&mi->mi_filehandles, &key, NULL);
3743 	if (sfh != NULL) {
3744 		mutex_enter(&sfh->sfh_lock);
3745 		sfh->sfh_refcnt++;
3746 		NFS4_DEBUG(nfs4_sharedfh_debug, (CE_NOTE,
3747 		    "sfh4_get: found existing %p, new refcnt=%d",
3748 		    (void *)sfh, sfh->sfh_refcnt));
3749 		mutex_exit(&sfh->sfh_lock);
3750 		nfs_rw_exit(&mi->mi_fh_lock);
3751 		return (sfh);
3752 	}
3753 	nfs_rw_exit(&mi->mi_fh_lock);
3754 
3755 	return (sfh4_put(fh, mi, &key));
3756 }
3757 
/*
 * Get a reference to the given shared filehandle object.  The caller must
 * already own a reference (the count is asserted to be positive); the
 * increment itself is done under sfh_lock.
 */

void
sfh4_hold(nfs4_sharedfh_t *sfh)
{
	/* Checked before sfh_lock is taken. */
	ASSERT(sfh->sfh_refcnt > 0);

	mutex_enter(&sfh->sfh_lock);
	sfh->sfh_refcnt++;
	NFS4_DEBUG(nfs4_sharedfh_debug,
	    (CE_NOTE, "sfh4_hold %p, new refcnt=%d",
	    (void *)sfh, sfh->sfh_refcnt));
	mutex_exit(&sfh->sfh_lock);
}
3774 
3775 /*
3776  * Release a reference to the given shared filehandle object and null out
3777  * the given pointer.
3778  */
3779 
3780 void
3781 sfh4_rele(nfs4_sharedfh_t **sfhpp)
3782 {
3783 	mntinfo4_t *mi;
3784 	nfs4_sharedfh_t *sfh = *sfhpp;
3785 
3786 	ASSERT(sfh->sfh_refcnt > 0);
3787 
3788 	mutex_enter(&sfh->sfh_lock);
3789 	if (sfh->sfh_refcnt > 1) {
3790 		sfh->sfh_refcnt--;
3791 		NFS4_DEBUG(nfs4_sharedfh_debug, (CE_NOTE,
3792 		    "sfh4_rele %p, new refcnt=%d",
3793 		    (void *)sfh, sfh->sfh_refcnt));
3794 		mutex_exit(&sfh->sfh_lock);
3795 		goto finish;
3796 	}
3797 	mutex_exit(&sfh->sfh_lock);
3798 
3799 	/*
3800 	 * Possibly the last reference, so get the lock for the table in
3801 	 * case it's time to remove the object from the table.
3802 	 */
3803 	mi = sfh->sfh_mi;
3804 	(void) nfs_rw_enter_sig(&mi->mi_fh_lock, RW_WRITER, 0);
3805 	mutex_enter(&sfh->sfh_lock);
3806 	sfh->sfh_refcnt--;
3807 	if (sfh->sfh_refcnt > 0) {
3808 		NFS4_DEBUG(nfs4_sharedfh_debug, (CE_NOTE,
3809 		    "sfh4_rele %p, new refcnt=%d",
3810 		    (void *)sfh, sfh->sfh_refcnt));
3811 		mutex_exit(&sfh->sfh_lock);
3812 		nfs_rw_exit(&mi->mi_fh_lock);
3813 		goto finish;
3814 	}
3815 
3816 	NFS4_DEBUG(nfs4_sharedfh_debug, (CE_NOTE,
3817 	    "sfh4_rele %p, last ref", (void *)sfh));
3818 	if (sfh->sfh_flags & SFH4_IN_TREE) {
3819 		avl_remove(&mi->mi_filehandles, sfh);
3820 		sfh->sfh_flags &= ~SFH4_IN_TREE;
3821 	}
3822 	mutex_exit(&sfh->sfh_lock);
3823 	nfs_rw_exit(&mi->mi_fh_lock);
3824 	mutex_destroy(&sfh->sfh_lock);
3825 	kmem_free(sfh->sfh_fh.nfs_fh4_val, NFS4_FHSIZE);
3826 	kmem_free(sfh, sizeof (nfs4_sharedfh_t));
3827 
3828 finish:
3829 	*sfhpp = NULL;
3830 }
3831 
3832 /*
3833  * Update the filehandle for the given shared filehandle object.
3834  */
3835 
3836 int nfs4_warn_dupfh = 0;	/* if set, always warn about dup fhs below */
3837 
3838 void
3839 sfh4_update(nfs4_sharedfh_t *sfh, const nfs_fh4 *newfh)
3840 {
3841 	mntinfo4_t *mi = sfh->sfh_mi;
3842 	nfs4_sharedfh_t *dupsfh;
3843 	avl_index_t where;
3844 	nfs4_sharedfh_t key;
3845 
3846 #ifdef DEBUG
3847 	mutex_enter(&sfh->sfh_lock);
3848 	ASSERT(sfh->sfh_refcnt > 0);
3849 	mutex_exit(&sfh->sfh_lock);
3850 #endif
3851 	ASSERT(newfh->nfs_fh4_len <= NFS4_FHSIZE);
3852 
3853 	/*
3854 	 * The basic plan is to remove the shared filehandle object from
3855 	 * the table, update it to have the new filehandle, then reinsert
3856 	 * it.
3857 	 */
3858 
3859 	(void) nfs_rw_enter_sig(&mi->mi_fh_lock, RW_WRITER, 0);
3860 	mutex_enter(&sfh->sfh_lock);
3861 	if (sfh->sfh_flags & SFH4_IN_TREE) {
3862 		avl_remove(&mi->mi_filehandles, sfh);
3863 		sfh->sfh_flags &= ~SFH4_IN_TREE;
3864 	}
3865 	mutex_exit(&sfh->sfh_lock);
3866 	sfh->sfh_fh.nfs_fh4_len = newfh->nfs_fh4_len;
3867 	bcopy(newfh->nfs_fh4_val, sfh->sfh_fh.nfs_fh4_val,
3868 	    sfh->sfh_fh.nfs_fh4_len);
3869 
3870 	/*
3871 	 * XXX If there is already a shared filehandle object with the new
3872 	 * filehandle, we're in trouble, because the rnode code assumes
3873 	 * that there is only one shared filehandle object for a given
3874 	 * filehandle.  So issue a warning (for read-write mounts only)
3875 	 * and don't try to re-insert the given object into the table.
3876 	 * Hopefully the given object will quickly go away and everyone
3877 	 * will use the new object.
3878 	 */
3879 	key.sfh_fh = *newfh;
3880 	dupsfh = avl_find(&mi->mi_filehandles, &key, &where);
3881 	if (dupsfh != NULL) {
3882 		if (!(mi->mi_vfsp->vfs_flag & VFS_RDONLY) || nfs4_warn_dupfh) {
3883 			zcmn_err(mi->mi_zone->zone_id, CE_WARN, "sfh4_update: "
3884 			    "duplicate filehandle detected");
3885 			sfh4_printfhandle(dupsfh);
3886 		}
3887 	} else {
3888 		avl_insert(&mi->mi_filehandles, sfh, where);
3889 		mutex_enter(&sfh->sfh_lock);
3890 		sfh->sfh_flags |= SFH4_IN_TREE;
3891 		mutex_exit(&sfh->sfh_lock);
3892 	}
3893 	nfs_rw_exit(&mi->mi_fh_lock);
3894 }
3895 
/*
 * Copy out the current filehandle for the given shared filehandle object.
 *
 * sfh: object to read; the caller must hold a reference on it.
 * fhp: destination; fh_len and the first fh_len bytes of fh_buf are
 *      filled in.
 *
 * The copy is made with the mount's mi_fh_lock held as reader, which is
 * the lock sfh4_update() takes as writer while changing the filehandle.
 */

void
sfh4_copyval(const nfs4_sharedfh_t *sfh, nfs4_fhandle_t *fhp)
{
	mntinfo4_t *mi = sfh->sfh_mi;

	ASSERT(sfh->sfh_refcnt > 0);

	(void) nfs_rw_enter_sig(&mi->mi_fh_lock, RW_READER, 0);
	fhp->fh_len = sfh->sfh_fh.nfs_fh4_len;
	ASSERT(fhp->fh_len <= NFS4_FHSIZE);
	bcopy(sfh->sfh_fh.nfs_fh4_val, fhp->fh_buf, fhp->fh_len);
	nfs_rw_exit(&mi->mi_fh_lock);
}
3913 
3914 /*
3915  * Print out the filehandle for the given shared filehandle object.
3916  */
3917 
3918 void
3919 sfh4_printfhandle(const nfs4_sharedfh_t *sfh)
3920 {
3921 	nfs4_fhandle_t fhandle;
3922 
3923 	sfh4_copyval(sfh, &fhandle);
3924 	nfs4_printfhandle(&fhandle);
3925 }
3926 
3927 /*
3928  * Compare 2 fnames.  Returns -1 if the first is "less" than the second, 0
3929  * if they're the same, +1 if the first is "greater" than the second.  The
3930  * caller (or whoever's calling the AVL package) is responsible for
3931  * handling locking issues.
3932  */
3933 
3934 static int
3935 fncmp(const void *p1, const void *p2)
3936 {
3937 	const nfs4_fname_t *f1 = p1;
3938 	const nfs4_fname_t *f2 = p2;
3939 	int res;
3940 
3941 	res = strcmp(f1->fn_name, f2->fn_name);
3942 	/*
3943 	 * The AVL package wants +/-1, not arbitrary positive or negative
3944 	 * integers.
3945 	 */
3946 	if (res > 0)
3947 		res = 1;
3948 	else if (res < 0)
3949 		res = -1;
3950 	return (res);
3951 }
3952 
/*
 * Get or create an fname with the given name, as a child of the given
 * fname.  The caller is responsible for eventually releasing the reference
 * (fn_rele()).  parent may be NULL.
 *
 * parent: fname whose fn_children tree is searched/updated, or NULL, in
 *         which case a new unparented fname is always created.
 * name:   single component name; it is copied into the new fname.
 * sfh:    shared filehandle the fname must be associated with; a new
 *         fname takes its own hold on it.
 *
 * Locking note: when parent is non-NULL and no usable entry is found,
 * parent->fn_lock is deliberately left held from the avl_find() until
 * after the avl_insert() at the bottom, so that the insertion point
 * ("where") stays valid.
 */

nfs4_fname_t *
fn_get(nfs4_fname_t *parent, char *name, nfs4_sharedfh_t *sfh)
{
	nfs4_fname_t key;
	nfs4_fname_t *fnp;
	avl_index_t where;

	/* Only fn_name is set in the key; fncmp() looks at nothing else. */
	key.fn_name = name;

	/*
	 * If there's already an fname registered with the given name, bump
	 * its reference count and return it.  Otherwise, create a new one
	 * and add it to the parent's AVL tree.
	 *
	 * fname entries we are looking for should match both name
	 * and sfh stored in the fname.
	 */
again:
	if (parent != NULL) {
		mutex_enter(&parent->fn_lock);
		fnp = avl_find(&parent->fn_children, &key, &where);
		if (fnp != NULL) {
			/*
			 * This hold on fnp is released below later,
			 * in case this is not the fnp we want.
			 */
			fn_hold(fnp);

			if (fnp->fn_sfh == sfh) {
				/*
				 * We have found our entry.  The hold taken
				 * above is the one handed to the caller.
				 */
				mutex_exit(&parent->fn_lock);
				return (fnp);
			}

			/*
			 * We have found an entry that has a mismatching
			 * fn_sfh. This could be a stale entry due to
			 * server side rename. We will remove this entry
			 * and make sure no such entries exist.
			 */
			mutex_exit(&parent->fn_lock);
			mutex_enter(&fnp->fn_lock);
			if (fnp->fn_parent == parent) {
				/*
				 * Remove ourselves from parent's
				 * fn_children tree.
				 */
				mutex_enter(&parent->fn_lock);
				avl_remove(&parent->fn_children, fnp);
				mutex_exit(&parent->fn_lock);
				/* Drops the stale entry's hold on parent. */
				fn_rele(&fnp->fn_parent);
			}
			mutex_exit(&fnp->fn_lock);
			/* Drop the temporary hold and search again. */
			fn_rele(&fnp);
			goto again;
		}
		/* Not found: fall through with parent->fn_lock still held. */
	}

	fnp = kmem_alloc(sizeof (nfs4_fname_t), KM_SLEEP);
	mutex_init(&fnp->fn_lock, NULL, MUTEX_DEFAULT, NULL);
	fnp->fn_parent = parent;
	if (parent != NULL)
		fn_hold(parent);
	fnp->fn_len = strlen(name);
	ASSERT(fnp->fn_len < MAXNAMELEN);
	fnp->fn_name = kmem_alloc(fnp->fn_len + 1, KM_SLEEP);
	(void) strcpy(fnp->fn_name, name);
	fnp->fn_refcnt = 1;

	/*
	 * This hold on sfh is later released
	 * when we do the final fn_rele() on this fname.
	 */
	sfh4_hold(sfh);
	fnp->fn_sfh = sfh;

	avl_create(&fnp->fn_children, fncmp, sizeof (nfs4_fname_t),
	    offsetof(nfs4_fname_t, fn_tree));
	NFS4_DEBUG(nfs4_fname_debug, (CE_NOTE,
	    "fn_get %p:%s, a new nfs4_fname_t!",
	    (void *)fnp, fnp->fn_name));
	if (parent != NULL) {
		/* "where" was computed by the avl_find() above. */
		avl_insert(&parent->fn_children, fnp, where);
		mutex_exit(&parent->fn_lock);
	}

	return (fnp);
}
4050 
/*
 * Take a reference on the given fname by atomically incrementing
 * fn_refcnt.  The matching release is fn_rele().
 */
void
fn_hold(nfs4_fname_t *fnp)
{
	atomic_add_32(&fnp->fn_refcnt, 1);
	NFS4_DEBUG(nfs4_fname_debug, (CE_NOTE,
	    "fn_hold %p:%s, new refcnt=%d",
	    (void *)fnp, fnp->fn_name, fnp->fn_refcnt));
}
4059 
4060 /*
4061  * Decrement the reference count of the given fname, and destroy it if its
4062  * reference count goes to zero.  Nulls out the given pointer.
4063  */
4064 
4065 void
4066 fn_rele(nfs4_fname_t **fnpp)
4067 {
4068 	nfs4_fname_t *parent;
4069 	uint32_t newref;
4070 	nfs4_fname_t *fnp;
4071 
4072 recur:
4073 	fnp = *fnpp;
4074 	*fnpp = NULL;
4075 
4076 	mutex_enter(&fnp->fn_lock);
4077 	parent = fnp->fn_parent;
4078 	if (parent != NULL)
4079 		mutex_enter(&parent->fn_lock);	/* prevent new references */
4080 	newref = atomic_add_32_nv(&fnp->fn_refcnt, -1);
4081 	if (newref > 0) {
4082 		NFS4_DEBUG(nfs4_fname_debug, (CE_NOTE,
4083 		    "fn_rele %p:%s, new refcnt=%d",
4084 		    (void *)fnp, fnp->fn_name, fnp->fn_refcnt));
4085 		if (parent != NULL)
4086 			mutex_exit(&parent->fn_lock);
4087 		mutex_exit(&fnp->fn_lock);
4088 		return;
4089 	}
4090 
4091 	NFS4_DEBUG(nfs4_fname_debug, (CE_NOTE,
4092 	    "fn_rele %p:%s, last reference, deleting...",
4093 	    (void *)fnp, fnp->fn_name));
4094 	if (parent != NULL) {
4095 		avl_remove(&parent->fn_children, fnp);
4096 		mutex_exit(&parent->fn_lock);
4097 	}
4098 	kmem_free(fnp->fn_name, fnp->fn_len + 1);
4099 	sfh4_rele(&fnp->fn_sfh);
4100 	mutex_destroy(&fnp->fn_lock);
4101 	avl_destroy(&fnp->fn_children);
4102 	kmem_free(fnp, sizeof (nfs4_fname_t));
4103 	/*
4104 	 * Recursivly fn_rele the parent.
4105 	 * Use goto instead of a recursive call to avoid stack overflow.
4106 	 */
4107 	if (parent != NULL) {
4108 		fnpp = &parent;
4109 		goto recur;
4110 	}
4111 }
4112 
4113 /*
4114  * Returns the single component name of the given fname, in a MAXNAMELEN
4115  * string buffer, which the caller is responsible for freeing.  Note that
4116  * the name may become invalid as a result of fn_move().
4117  */
4118 
4119 char *
4120 fn_name(nfs4_fname_t *fnp)
4121 {
4122 	char *name;
4123 
4124 	ASSERT(fnp->fn_len < MAXNAMELEN);
4125 	name = kmem_alloc(MAXNAMELEN, KM_SLEEP);
4126 	mutex_enter(&fnp->fn_lock);
4127 	(void) strcpy(name, fnp->fn_name);
4128 	mutex_exit(&fnp->fn_lock);
4129 
4130 	return (name);
4131 }
4132 
4133 
4134 /*
4135  * fn_path_realloc
4136  *
4137  * This function, used only by fn_path, constructs
4138  * a new string which looks like "prepend" + "/" + "current".
4139  * by allocating a new string and freeing the old one.
4140  */
4141 static void
4142 fn_path_realloc(char **curses, char *prepend)
4143 {
4144 	int len, curlen = 0;
4145 	char *news;
4146 
4147 	if (*curses == NULL) {
4148 		/*
4149 		 * Prime the pump, allocate just the
4150 		 * space for prepend and return that.
4151 		 */
4152 		len = strlen(prepend) + 1;
4153 		news = kmem_alloc(len, KM_SLEEP);
4154 		(void) strncpy(news, prepend, len);
4155 	} else {
4156 		/*
4157 		 * Allocate the space  for a new string
4158 		 * +1 +1 is for the "/" and the NULL
4159 		 * byte at the end of it all.
4160 		 */
4161 		curlen = strlen(*curses);
4162 		len = curlen + strlen(prepend) + 1 + 1;
4163 		news = kmem_alloc(len, KM_SLEEP);
4164 		(void) strncpy(news, prepend, len);
4165 		(void) strcat(news, "/");
4166 		(void) strcat(news, *curses);
4167 		kmem_free(*curses, curlen + 1);
4168 	}
4169 	*curses = news;
4170 }
4171 
4172 /*
4173  * Returns the path name (starting from the fs root) for the given fname.
4174  * The caller is responsible for freeing.  Note that the path may be or
4175  * become invalid as a result of fn_move().
4176  */
4177 
4178 char *
4179 fn_path(nfs4_fname_t *fnp)
4180 {
4181 	char *path;
4182 	nfs4_fname_t *nextfnp;
4183 
4184 	if (fnp == NULL)
4185 		return (NULL);
4186 
4187 	path = NULL;
4188 
4189 	/* walk up the tree constructing the pathname.  */
4190 
4191 	fn_hold(fnp);			/* adjust for later rele */
4192 	do {
4193 		mutex_enter(&fnp->fn_lock);
4194 		/*
4195 		 * Add fn_name in front of the current path
4196 		 */
4197 		fn_path_realloc(&path, fnp->fn_name);
4198 		nextfnp = fnp->fn_parent;
4199 		if (nextfnp != NULL)
4200 			fn_hold(nextfnp);
4201 		mutex_exit(&fnp->fn_lock);
4202 		fn_rele(&fnp);
4203 		fnp = nextfnp;
4204 	} while (fnp != NULL);
4205 
4206 	return (path);
4207 }
4208 
4209 /*
4210  * Return a reference to the parent of the given fname, which the caller is
4211  * responsible for eventually releasing.
4212  */
4213 
4214 nfs4_fname_t *
4215 fn_parent(nfs4_fname_t *fnp)
4216 {
4217 	nfs4_fname_t *parent;
4218 
4219 	mutex_enter(&fnp->fn_lock);
4220 	parent = fnp->fn_parent;
4221 	if (parent != NULL)
4222 		fn_hold(parent);
4223 	mutex_exit(&fnp->fn_lock);
4224 
4225 	return (parent);
4226 }
4227 
4228 /*
4229  * Update fnp so that its parent is newparent and its name is newname.
4230  */
4231 
4232 void
4233 fn_move(nfs4_fname_t *fnp, nfs4_fname_t *newparent, char *newname)
4234 {
4235 	nfs4_fname_t *parent, *tmpfnp;
4236 	ssize_t newlen;
4237 	nfs4_fname_t key;
4238 	avl_index_t where;
4239 
4240 	/*
4241 	 * This assert exists to catch the client trying to rename
4242 	 * a dir to be a child of itself.  This happened at a recent
4243 	 * bakeoff against a 3rd party (broken) server which allowed
4244 	 * the rename to succeed.  If it trips it means that:
4245 	 *	a) the code in nfs4rename that detects this case is broken
4246 	 *	b) the server is broken (since it allowed the bogus rename)
4247 	 *
4248 	 * For non-DEBUG kernels, prepare for a recursive mutex_enter
4249 	 * panic below from:  mutex_enter(&newparent->fn_lock);
4250 	 */
4251 	ASSERT(fnp != newparent);
4252 
4253 	/*
4254 	 * Remove fnp from its current parent, change its name, then add it
4255 	 * to newparent.
4256 	 */
4257 	mutex_enter(&fnp->fn_lock);
4258 	parent = fnp->fn_parent;
4259 	mutex_enter(&parent->fn_lock);
4260 	avl_remove(&parent->fn_children, fnp);
4261 	mutex_exit(&parent->fn_lock);
4262 	fn_rele(&fnp->fn_parent);
4263 
4264 	newlen = strlen(newname);
4265 	if (newlen != fnp->fn_len) {
4266 		ASSERT(newlen < MAXNAMELEN);
4267 		kmem_free(fnp->fn_name, fnp->fn_len + 1);
4268 		fnp->fn_name = kmem_alloc(newlen + 1, KM_SLEEP);
4269 		fnp->fn_len = newlen;
4270 	}
4271 	(void) strcpy(fnp->fn_name, newname);
4272 
4273 again:
4274 	mutex_enter(&newparent->fn_lock);
4275 	key.fn_name = fnp->fn_name;
4276 	tmpfnp = avl_find(&newparent->fn_children, &key, &where);
4277 	if (tmpfnp != NULL) {
4278 		/*
4279 		 * This could be due to a file that was unlinked while
4280 		 * open, or perhaps the rnode is in the free list.  Remove
4281 		 * it from newparent and let it go away on its own.  The
4282 		 * contorted code is to deal with lock order issues and
4283 		 * race conditions.
4284 		 */
4285 		fn_hold(tmpfnp);
4286 		mutex_exit(&newparent->fn_lock);
4287 		mutex_enter(&tmpfnp->fn_lock);
4288 		if (tmpfnp->fn_parent == newparent) {
4289 			mutex_enter(&newparent->fn_lock);
4290 			avl_remove(&newparent->fn_children, tmpfnp);
4291 			mutex_exit(&newparent->fn_lock);
4292 			fn_rele(&tmpfnp->fn_parent);
4293 		}
4294 		mutex_exit(&tmpfnp->fn_lock);
4295 		fn_rele(&tmpfnp);
4296 		goto again;
4297 	}
4298 	fnp->fn_parent = newparent;
4299 	fn_hold(newparent);
4300 	avl_insert(&newparent->fn_children, fnp, where);
4301 	mutex_exit(&newparent->fn_lock);
4302 	mutex_exit(&fnp->fn_lock);
4303 }
4304 
4305 #ifdef DEBUG
4306 /*
4307  * Return non-zero if the type information makes sense for the given vnode.
4308  * Otherwise panic.
4309  */
4310 int
4311 nfs4_consistent_type(vnode_t *vp)
4312 {
4313 	rnode4_t *rp = VTOR4(vp);
4314 
4315 	if (nfs4_vtype_debug && vp->v_type != VNON &&
4316 	    rp->r_attr.va_type != VNON && vp->v_type != rp->r_attr.va_type) {
4317 		cmn_err(CE_PANIC, "vnode %p type mismatch; v_type=%d, "
4318 		    "rnode attr type=%d", (void *)vp, vp->v_type,
4319 		    rp->r_attr.va_type);
4320 	}
4321 
4322 	return (1);
4323 }
4324 #endif /* DEBUG */
4325