xref: /titanic_52/usr/src/uts/common/fs/nfs/nfs4_client.c (revision 263f549e5da8b32c4922f586afb365b8ae388a6c)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright (c) 1986, 2010, Oracle and/or its affiliates. All rights reserved.
23  */
24 
25 /*
26  *  	Copyright (c) 1983,1984,1985,1986,1987,1988,1989  AT&T.
27  *	All Rights Reserved
28  */
29 
30 #include <sys/param.h>
31 #include <sys/types.h>
32 #include <sys/systm.h>
33 #include <sys/thread.h>
34 #include <sys/t_lock.h>
35 #include <sys/time.h>
36 #include <sys/vnode.h>
37 #include <sys/vfs.h>
38 #include <sys/errno.h>
39 #include <sys/buf.h>
40 #include <sys/stat.h>
41 #include <sys/cred.h>
42 #include <sys/kmem.h>
43 #include <sys/debug.h>
44 #include <sys/dnlc.h>
45 #include <sys/vmsystm.h>
46 #include <sys/flock.h>
47 #include <sys/share.h>
48 #include <sys/cmn_err.h>
49 #include <sys/tiuser.h>
50 #include <sys/sysmacros.h>
51 #include <sys/callb.h>
52 #include <sys/acl.h>
53 #include <sys/kstat.h>
54 #include <sys/signal.h>
55 #include <sys/disp.h>
56 #include <sys/atomic.h>
57 #include <sys/list.h>
58 #include <sys/sdt.h>
59 
60 #include <rpc/types.h>
61 #include <rpc/xdr.h>
62 #include <rpc/auth.h>
63 #include <rpc/clnt.h>
64 
65 #include <nfs/nfs.h>
66 #include <nfs/nfs_clnt.h>
67 #include <nfs/nfs_acl.h>
68 
69 #include <nfs/nfs4.h>
70 #include <nfs/rnode4.h>
71 #include <nfs/nfs4_clnt.h>
72 
73 #include <vm/hat.h>
74 #include <vm/as.h>
75 #include <vm/page.h>
76 #include <vm/pvn.h>
77 #include <vm/seg.h>
78 #include <vm/seg_map.h>
79 #include <vm/seg_vn.h>
80 
81 #include <sys/ddi.h>
82 
83 /*
84  * Arguments to page-flush thread.
 *
 * nfs4_purge_caches() allocates one of these, takes a hold on both
 * members and passes it to nfs4_pgflush_thread(), which drops the holds
 * and frees the structure once the flush is done.
85  */
86 typedef struct {
87 	vnode_t *vp;		/* file whose pages get flushed */
88 	cred_t *cr;		/* credentials handed to nfs4_flush_pages() */
89 } pgflush_t;
90 
91 #ifdef DEBUG
/* DEBUG-only variables; their consumers are not in this part of the file. */
92 int nfs4_client_lease_debug;
93 int nfs4_sharedfh_debug;
94 int nfs4_fname_debug;
95 
96 /* temporary: panic if v_type is inconsistent with r_attr va_type */
97 int nfs4_vtype_debug;
98 
/*
 * Thread-specific-data key.  nfs4_purge_stale_fh() asserts that the value
 * stored under it is NULL to check that the ..._end_op() call was done.
 */
99 uint_t nfs4_tsd_key;
100 #endif
101 
/* NOTE(review): presumably updated by nfs4_client_cpr_callb() -- confirm. */
102 static time_t	nfs4_client_resumed = 0;
/* NOTE(review): presumably the id of the registered CPR callback -- confirm. */
103 static	callb_id_t cid = 0;
104 
/* Forward declarations of file-local routines. */
105 static int	nfs4renew(nfs4_server_t *);
106 static void	nfs4_attrcache_va(vnode_t *, nfs4_ga_res_t *, int);
107 static void	nfs4_pgflush_thread(pgflush_t *);
108 
109 static boolean_t nfs4_client_cpr_callb(void *, int);
110 
/*
 * List of NFS v4 mounts in a zone, together with the lock protecting it.
 * NOTE(review): presumably one instance per zone, looked up through
 * mi4_list_key -- confirm against the zone key setup code.
 */
111 struct mi4_globals {
112 	kmutex_t	mig_lock;  /* lock protecting mig_list */
113 	list_t		mig_list;  /* list of NFS v4 mounts in zone */
114 	boolean_t	mig_destructor_called;
115 };
116 
117 static zone_key_t mi4_list_key;
118 
119 /*
120  * Attributes caching:
121  *
122  * Attributes are cached in the rnode in struct vattr form.
123  * There is a time associated with the cached attributes (r_time_attr_inval)
124  * which tells whether the attributes are valid. The time is initialized
125  * to the difference between current time and the modify time of the vnode
126  * when new attributes are cached. This allows the attributes for
127  * files that have changed recently to be timed out sooner than for files
128  * that have not changed for a long time. There are minimum and maximum
129  * timeout values that can be set per mount point.
130  */
131 
132 /*
133  * If a cache purge is in progress, wait for it to finish.
134  *
135  * The current thread must not be in the middle of an
136  * nfs4_start_op/nfs4_end_op region.  Otherwise, there could be a deadlock
137  * between this thread, a recovery thread, and the page flush thread.
138  */
139 int
140 nfs4_waitfor_purge_complete(vnode_t *vp)
141 {
142 	rnode4_t *rp;
143 	k_sigset_t smask;
144 
145 	rp = VTOR4(vp);
146 	if ((rp->r_serial != NULL && rp->r_serial != curthread) ||
147 	    ((rp->r_flags & R4PGFLUSH) && rp->r_pgflush != curthread)) {
148 		mutex_enter(&rp->r_statelock);
149 		sigintr(&smask, VTOMI4(vp)->mi_flags & MI4_INT);
150 		while ((rp->r_serial != NULL && rp->r_serial != curthread) ||
151 		    ((rp->r_flags & R4PGFLUSH) &&
152 		    rp->r_pgflush != curthread)) {
153 			if (!cv_wait_sig(&rp->r_cv, &rp->r_statelock)) {
154 				sigunintr(&smask);
155 				mutex_exit(&rp->r_statelock);
156 				return (EINTR);
157 			}
158 		}
159 		sigunintr(&smask);
160 		mutex_exit(&rp->r_statelock);
161 	}
162 	return (0);
163 }
164 
165 /*
166  * Validate caches by checking cached attributes. If they have timed out,
167  * then get new attributes from the server.  As a side effect, cache
168  * invalidation is done if the attributes have changed.
169  *
170  * If the attributes have not timed out and if there is a cache
171  * invalidation being done by some other thread, then wait until that
172  * thread has completed the cache invalidation.
173  */
174 int
175 nfs4_validate_caches(vnode_t *vp, cred_t *cr)
176 {
177 	int error;
178 	nfs4_ga_res_t gar;
179 
180 	if (ATTRCACHE4_VALID(vp)) {
181 		error = nfs4_waitfor_purge_complete(vp);
182 		if (error)
183 			return (error);
184 		return (0);
185 	}
186 
187 	return (nfs4_getattr_otw(vp, &gar, cr, 0));
188 }
189 
190 /*
191  * Fill in attribute from the cache.
192  * If valid, then return 0 to indicate that no error occurred,
193  * otherwise return 1 to indicate that an error occurred.
194  */
195 static int
196 nfs4_getattr_cache(vnode_t *vp, struct vattr *vap)
197 {
198 	rnode4_t *rp;
199 
200 	rp = VTOR4(vp);
201 	mutex_enter(&rp->r_statelock);
202 	mutex_enter(&rp->r_statev4_lock);
203 	if (ATTRCACHE4_VALID(vp)) {
204 		mutex_exit(&rp->r_statev4_lock);
205 		/*
206 		 * Cached attributes are valid
207 		 */
208 		*vap = rp->r_attr;
209 		mutex_exit(&rp->r_statelock);
210 		return (0);
211 	}
212 	mutex_exit(&rp->r_statev4_lock);
213 	mutex_exit(&rp->r_statelock);
214 	return (1);
215 }
216 
217 
218 /*
219  * If returned error is ESTALE flush all caches.  The nfs4_purge_caches()
220  * call is synchronous because all the pages were invalidated by the
221  * nfs4_invalidate_pages() call.
222  */
223 void
224 nfs4_purge_stale_fh(int errno, vnode_t *vp, cred_t *cr)
225 {
226 	struct rnode4 *rp = VTOR4(vp);
227 
228 	/* Ensure that the ..._end_op() call has been done */
229 	ASSERT(tsd_get(nfs4_tsd_key) == NULL);
230 
231 	if (errno != ESTALE)
232 		return;
233 
234 	mutex_enter(&rp->r_statelock);
235 	rp->r_flags |= R4STALE;
236 	if (!rp->r_error)
237 		rp->r_error = errno;
238 	mutex_exit(&rp->r_statelock);
239 	if (nfs4_has_pages(vp))
240 		nfs4_invalidate_pages(vp, (u_offset_t)0, cr);
241 	nfs4_purge_caches(vp, NFS4_PURGE_DNLC, cr, FALSE);
242 }
243 
244 /*
245  * Purge all of the various NFS `data' caches.  If "asyncpg" is TRUE, the
246  * page purge is done asynchronously.
 *
 * vp:		file or directory whose caches are purged
 * purge_dnlc:	NFS4_PURGE_DNLC requests a DNLC purge; directories get one
 *		regardless.  Either way it happens only if v_count > 1.
 * cr:		credentials used for the page flush
 * asyncpg:	if nonzero, the pages are flushed by a separate
 *		nfs4_pgflush_thread() rather than by the caller
 *
 * Purged here: DNLC entries, the readlink response (r_symlink), the
 * cached extended attribute directory vnode (r_xattr_dir), the pathconf
 * cache, the page cache and the readdir response cache.
247  */
248 void
249 nfs4_purge_caches(vnode_t *vp, int purge_dnlc, cred_t *cr, int asyncpg)
250 {
251 	rnode4_t *rp;
252 	char *contents;
253 	vnode_t *xattr;
254 	int size;
255 	int pgflush;			/* are we the page flush thread? */
256 
257 	/*
258 	 * Purge the DNLC for any entries which refer to this file.
259 	 */
260 	if (vp->v_count > 1 &&
261 	    (vp->v_type == VDIR || purge_dnlc == NFS4_PURGE_DNLC))
262 		dnlc_purge_vp(vp);
263 
264 	/*
265 	 * Clear any readdir state bits and purge the readlink response cache.
	 * The symlink buffer and the xattr directory vnode are only detached
	 * from the rnode here; they are freed/released after r_statelock has
	 * been dropped.
266 	 */
267 	rp = VTOR4(vp);
268 	mutex_enter(&rp->r_statelock);
269 	rp->r_flags &= ~R4LOOKUP;
270 	contents = rp->r_symlink.contents;
271 	size = rp->r_symlink.size;
272 	rp->r_symlink.contents = NULL;
273 
274 	xattr = rp->r_xattr_dir;
275 	rp->r_xattr_dir = NULL;
276 
277 	/*
278 	 * Purge pathconf cache too.
279 	 */
280 	rp->r_pathconf.pc4_xattr_valid = 0;
281 	rp->r_pathconf.pc4_cache_valid = 0;
282 
283 	pgflush = (curthread == rp->r_pgflush);
284 	mutex_exit(&rp->r_statelock);
285 
286 	if (contents != NULL) {
287 
288 		kmem_free((void *)contents, size);
289 	}
290 
291 	if (xattr != NULL)
292 		VN_RELE(xattr);
293 
294 	/*
295 	 * Flush the page cache.  If the current thread is the page flush
296 	 * thread, don't initiate a new page flush.  There's no need for
297 	 * it, and doing it correctly is hard.
298 	 */
299 	if (nfs4_has_pages(vp) && !pgflush) {
300 		if (!asyncpg) {
			/*
			 * Synchronous flush.  An EINTR from the wait is
			 * ignored; the flush is done regardless.
			 */
301 			(void) nfs4_waitfor_purge_complete(vp);
302 			nfs4_flush_pages(vp, cr);
303 		} else {
304 			pgflush_t *args;
305 
306 			/*
307 			 * We don't hold r_statelock while creating the
308 			 * thread, in case the call blocks.  So we use a
309 			 * flag to indicate that a page flush thread is
310 			 * active.
311 			 */
312 			mutex_enter(&rp->r_statelock);
313 			if (rp->r_flags & R4PGFLUSH) {
				/* A flush is already pending; don't start one. */
314 				mutex_exit(&rp->r_statelock);
315 			} else {
316 				rp->r_flags |= R4PGFLUSH;
317 				mutex_exit(&rp->r_statelock);
318 
				/*
				 * The holds taken here and the args structure
				 * are released by nfs4_pgflush_thread().
				 */
319 				args = kmem_alloc(sizeof (pgflush_t),
320 				    KM_SLEEP);
321 				args->vp = vp;
322 				VN_HOLD(args->vp);
323 				args->cr = cr;
324 				crhold(args->cr);
325 				(void) zthread_create(NULL, 0,
326 				    nfs4_pgflush_thread, args, 0,
327 				    minclsyspri);
328 			}
329 		}
330 	}
331 
332 	/*
333 	 * Flush the readdir response cache.
334 	 */
335 	nfs4_purge_rddir_cache(vp);
336 }
337 
338 /*
339  * Invalidate all pages for the given file, after writing back the dirty
340  * ones.
341  */
342 
343 void
344 nfs4_flush_pages(vnode_t *vp, cred_t *cr)
345 {
346 	int error;
347 	rnode4_t *rp = VTOR4(vp);
348 
349 	error = VOP_PUTPAGE(vp, (u_offset_t)0, 0, B_INVAL, cr, NULL);
350 	if (error == ENOSPC || error == EDQUOT) {
351 		mutex_enter(&rp->r_statelock);
352 		if (!rp->r_error)
353 			rp->r_error = error;
354 		mutex_exit(&rp->r_statelock);
355 	}
356 }
357 
358 /*
359  * Page flush thread.
360  */
361 
362 static void
363 nfs4_pgflush_thread(pgflush_t *args)
364 {
365 	rnode4_t *rp = VTOR4(args->vp);
366 
367 	/* remember which thread we are, so we don't deadlock ourselves */
368 	mutex_enter(&rp->r_statelock);
369 	ASSERT(rp->r_pgflush == NULL);
370 	rp->r_pgflush = curthread;
371 	mutex_exit(&rp->r_statelock);
372 
373 	nfs4_flush_pages(args->vp, args->cr);
374 
375 	mutex_enter(&rp->r_statelock);
376 	rp->r_pgflush = NULL;
377 	rp->r_flags &= ~R4PGFLUSH;
378 	cv_broadcast(&rp->r_cv);
379 	mutex_exit(&rp->r_statelock);
380 
381 	VN_RELE(args->vp);
382 	crfree(args->cr);
383 	kmem_free(args, sizeof (pgflush_t));
384 	zthread_exit();
385 }
386 
387 /*
388  * Purge the readdir cache of all entries which are not currently
389  * being filled.
390  */
391 void
392 nfs4_purge_rddir_cache(vnode_t *vp)
393 {
394 	rnode4_t *rp;
395 
396 	rp = VTOR4(vp);
397 
398 	mutex_enter(&rp->r_statelock);
399 	rp->r_direof = NULL;
400 	rp->r_flags &= ~R4LOOKUP;
401 	rp->r_flags |= R4READDIRWATTR;
402 	rddir4_cache_purge(rp);
403 	mutex_exit(&rp->r_statelock);
404 }
405 
406 /*
407  * Set attributes cache for given vnode using virtual attributes.  There is
408  * no cache validation, but if the attributes are deemed to be stale, they
409  * are ignored.  This corresponds to nfs3_attrcache().
410  *
411  * Set the timeout value on the attribute cache and fill it
412  * with the passed in attributes.
413  */
414 void
415 nfs4_attrcache_noinval(vnode_t *vp, nfs4_ga_res_t *garp, hrtime_t t)
416 {
417 	rnode4_t *rp = VTOR4(vp);
418 
419 	mutex_enter(&rp->r_statelock);
420 	if (rp->r_time_attr_saved <= t)
421 		nfs4_attrcache_va(vp, garp, FALSE);
422 	mutex_exit(&rp->r_statelock);
423 }
424 
425 /*
426  * Use the passed in virtual attributes to check to see whether the
427  * data and metadata caches are valid, cache the new attributes, and
428  * then do the cache invalidation if required.
429  *
430  * The cache validation and caching of the new attributes is done
431  * atomically via the use of the mutex, r_statelock.  If required,
432  * the cache invalidation is done atomically w.r.t. the cache
433  * validation and caching of the attributes via the pseudo lock,
434  * r_serial.
435  *
436  * This routine is used to do cache validation and attributes caching
437  * for operations with a single set of post operation attributes.
 *
 * vp:		vnode the attributes belong to
 * garp:	attributes (and change attribute) returned by the server
 * t:		time associated with the attributes; compared against
 *		r_time_attr_saved and r_time_cache_inval
 * cr:		credentials passed on to nfs4_purge_caches()
 * async:	passed to nfs4_purge_caches() as "asyncpg", unless the
 *		caller is the recovery thread, in which case the purge is
 *		always asynchronous
 * cinfo:	change info of a directory-modifying operation, or NULL
 *
 * The routine returns without caching anything (but with the attribute
 * cache purged) when another thread owns r_serial or a page flush thread
 * is active.
438  */
439 
440 void
441 nfs4_attr_cache(vnode_t *vp, nfs4_ga_res_t *garp,
442     hrtime_t t, cred_t *cr, int async,
443     change_info4 *cinfo)
444 {
445 	rnode4_t *rp;
446 	int mtime_changed = 0;
447 	int ctime_changed = 0;
448 	vsecattr_t *vsp;
449 	int was_serial, set_time_cache_inval, recov;
450 	vattr_t *vap = &garp->n4g_va;
451 	mntinfo4_t *mi = VTOMI4(vp);
452 	len_t preattr_rsize;
453 	boolean_t writemodify_set = B_FALSE;
454 	boolean_t cachepurge_set = B_FALSE;
455 
456 	ASSERT(mi->mi_vfsp->vfs_dev == garp->n4g_va.va_fsid);
457 
458 	/* Is curthread the recovery thread? */
459 	mutex_enter(&mi->mi_lock);
460 	recov = (VTOMI4(vp)->mi_recovthread == curthread);
461 	mutex_exit(&mi->mi_lock);
462 
463 	rp = VTOR4(vp);
464 	mutex_enter(&rp->r_statelock);
465 	was_serial = (rp->r_serial == curthread);
466 	if (rp->r_serial != NULL && !was_serial) {
467 		/*
468 		 * Purge current attrs and bail out to avoid potential deadlock
469 		 * between another thread caching attrs (r_serial thread), this
470 		 * thread, and a thread trying to read or write pages.
471 		 */
472 		PURGE_ATTRCACHE4_LOCKED(rp);
473 		mutex_exit(&rp->r_statelock);
474 		return;
475 	}
476 
477 	/*
478 	 * If there is a page flush thread, the current thread needs to
479 	 * bail out, to prevent a possible deadlock between the current
480 	 * thread (which might be in a start_op/end_op region), the
481 	 * recovery thread, and the page flush thread.  Expire the
482 	 * attribute cache, so that any attributes the current thread was
483 	 * going to set are not lost.
484 	 */
485 	if ((rp->r_flags & R4PGFLUSH) && rp->r_pgflush != curthread) {
486 		PURGE_ATTRCACHE4_LOCKED(rp);
487 		mutex_exit(&rp->r_statelock);
488 		return;
489 	}
490 
491 	if (rp->r_time_attr_saved > t) {
492 		/*
493 		 * Attributes have been cached since these attributes were
494 		 * probably made. If there is an inconsistency in what is
495 		 * cached, mark them invalid. If not, don't act on them.
496 		 */
497 		if (!CACHE4_VALID(rp, vap->va_mtime, vap->va_size))
498 			PURGE_ATTRCACHE4_LOCKED(rp);
499 		mutex_exit(&rp->r_statelock);
500 		return;
501 	}
502 	set_time_cache_inval = 0;
503 	if (cinfo) {
504 		/*
505 		 * Only directory modifying callers pass non-NULL cinfo.
506 		 */
507 		ASSERT(vp->v_type == VDIR);
508 		/*
509 		 * If the cache timeout either doesn't exist or hasn't expired,
510 		 * and dir didn't change on server before dirmod op
511 		 * and dir didn't change after dirmod op but before getattr
512 		 * then there's a chance that the client's cached data for
513 		 * this object is current (not stale).  No immediate cache
514 		 * flush is required.
515 		 *
516 		 */
517 		if ((! rp->r_time_cache_inval || t < rp->r_time_cache_inval) &&
518 		    cinfo->before == rp->r_change &&
519 		    (garp->n4g_change_valid &&
520 		    cinfo->after == garp->n4g_change)) {
521 
522 			/*
523 			 * If atomic isn't set, then the before/after info
524 			 * cannot be blindly trusted.  For this case, we tell
525 			 * nfs4_attrcache_va to cache the attrs but also
526 			 * establish an absolute maximum cache timeout.  When
527 			 * the timeout is reached, caches will be flushed.
528 			 */
529 			if (! cinfo->atomic)
530 				set_time_cache_inval = 1;
531 		} else {
532 
533 			/*
534 			 * We're not sure exactly what changed, but we know
535 			 * what to do.  flush all caches for dir.  remove the
536 			 * attr timeout.
537 			 *
538 			 * a) timeout expired.  flush all caches.
539 			 * b) r_change != cinfo.before.  flush all caches.
540 			 * c) r_change == cinfo.before, but cinfo.after !=
541 			 *    post-op getattr(change).  flush all caches.
542 			 * d) post-op getattr(change) not provided by server.
543 			 *    flush all caches.
544 			 */
545 			mtime_changed = 1;
546 			ctime_changed = 1;
547 			rp->r_time_cache_inval = 0;
548 		}
549 	} else {
550 		/*
551 		 * Write thread after writing data to file on remote server,
552 		 * will always set R4WRITEMODIFIED to indicate that file on
553 		 * remote server was modified with a WRITE operation and would
554 		 * have marked attribute cache as timed out. If R4WRITEMODIFIED
555 		 * is set, then do not check for mtime and ctime change.
556 		 */
557 		if (!(rp->r_flags & R4WRITEMODIFIED)) {
558 			if (!CACHE4_VALID(rp, vap->va_mtime, vap->va_size))
559 				mtime_changed = 1;
560 
561 			if (rp->r_attr.va_ctime.tv_sec !=
562 			    vap->va_ctime.tv_sec ||
563 			    rp->r_attr.va_ctime.tv_nsec !=
564 			    vap->va_ctime.tv_nsec)
565 				ctime_changed = 1;
566 
567 			/*
568 			 * If the change attribute was not provided by server
569 			 * or it differs, then flush all caches.
570 			 */
571 			if (!garp->n4g_change_valid ||
572 			    rp->r_change != garp->n4g_change) {
573 				mtime_changed = 1;
574 				ctime_changed = 1;
575 			}
576 		} else {
577 			writemodify_set = B_TRUE;
578 		}
579 	}
580 
	/* Remember the size so a change made by nfs4_attrcache_va() shows. */
581 	preattr_rsize = rp->r_size;
582 
583 	nfs4_attrcache_va(vp, garp, set_time_cache_inval);
584 
585 	/*
586 	 * If we have updated filesize in nfs4_attrcache_va, as soon as we
587 	 * drop statelock we will be in transition of purging all
588 	 * our caches and updating them. It is possible for another
589 	 * thread to pick this new file size and read in zeroed data.
590 	 * stall other threads till cache purge is complete.
591 	 */
592 	if ((!cinfo) && (rp->r_size != preattr_rsize)) {
593 		/*
594 		 * If R4WRITEMODIFIED was set and we have updated the file
595 		 * size, Server's returned file size need not necessarily
596 		 * be because of this Client's WRITE. We need to purge
597 		 * all caches.
598 		 */
599 		if (writemodify_set)
600 			mtime_changed = 1;
601 
602 		if (mtime_changed && !(rp->r_flags & R4INCACHEPURGE)) {
603 			rp->r_flags |= R4INCACHEPURGE;
604 			cachepurge_set = B_TRUE;
605 		}
606 	}
607 
	/* Nothing changed, so there is nothing to invalidate. */
608 	if (!mtime_changed && !ctime_changed) {
609 		mutex_exit(&rp->r_statelock);
610 		return;
611 	}
612 
	/*
	 * Take the r_serial pseudo lock so that the invalidation below is
	 * serialized against other threads entering this routine.
	 */
613 	rp->r_serial = curthread;
614 
615 	mutex_exit(&rp->r_statelock);
616 
617 	/*
618 	 * If we're the recov thread, then force async nfs4_purge_caches
619 	 * to avoid potential deadlock.
620 	 */
621 	if (mtime_changed)
622 		nfs4_purge_caches(vp, NFS4_NOPURGE_DNLC, cr, recov ? 1 : async);
623 
	/* Clear R4INCACHEPURGE only if this call was the one that set it. */
624 	if ((rp->r_flags & R4INCACHEPURGE) && cachepurge_set) {
625 		mutex_enter(&rp->r_statelock);
626 		rp->r_flags &= ~R4INCACHEPURGE;
627 		cv_broadcast(&rp->r_cv);
628 		mutex_exit(&rp->r_statelock);
629 		cachepurge_set = B_FALSE;
630 	}
631 
	/* A ctime change invalidates the access cache and the cached ACL. */
632 	if (ctime_changed) {
633 		(void) nfs4_access_purge_rp(rp);
634 		if (rp->r_secattr != NULL) {
635 			mutex_enter(&rp->r_statelock);
636 			vsp = rp->r_secattr;
637 			rp->r_secattr = NULL;
638 			mutex_exit(&rp->r_statelock);
639 			if (vsp != NULL)
640 				nfs4_acl_free_cache(vsp);
641 		}
642 	}
643 
	/*
	 * If this thread already owned r_serial on entry, leave it set;
	 * otherwise release it and wake up the waiters.
	 */
644 	if (!was_serial) {
645 		mutex_enter(&rp->r_statelock);
646 		rp->r_serial = NULL;
647 		cv_broadcast(&rp->r_cv);
648 		mutex_exit(&rp->r_statelock);
649 	}
650 }
651 
652 /*
653  * Set attributes cache for given vnode using virtual attributes.
654  *
655  * Set the timeout value on the attribute cache and fill it
656  * with the passed in attributes.
657  *
658  * The caller must be holding r_statelock.
 *
 * vp:			vnode whose rnode is updated; a shadow vnode is
 *			replaced by the master vnode first
 * garp:		attributes to cache; va_mask must be AT_ALL
 * set_cache_timeout:	if nonzero and no r_time_cache_inval is set yet,
 *			establish one at now + mi_acdirmax
 *
 * Besides the attributes themselves this updates r_change, the
 * mounted-on fileid, the pathconf cache and (when safe) r_size, and it
 * clears R4WRITEMODIFIED.
659  */
660 static void
661 nfs4_attrcache_va(vnode_t *vp, nfs4_ga_res_t *garp, int set_cache_timeout)
662 {
663 	rnode4_t *rp;
664 	mntinfo4_t *mi;
665 	hrtime_t delta;
666 	hrtime_t now;
667 	vattr_t *vap = &garp->n4g_va;
668 
669 	rp = VTOR4(vp);
670 
671 	ASSERT(MUTEX_HELD(&rp->r_statelock));
672 	ASSERT(vap->va_mask == AT_ALL);
673 
674 	/* Switch to master before checking v_flag */
675 	if (IS_SHADOW(vp, rp))
676 		vp = RTOV4(rp);
677 
678 	now = gethrtime();
679 
680 	mi = VTOMI4(vp);
681 
682 	/*
683 	 * Only establish a new cache timeout (if requested).  Never
684 	 * extend a timeout.  Never clear a timeout.  Clearing a timeout
685 	 * is done by nfs4_update_dircaches (ancestor in our call chain)
686 	 */
687 	if (set_cache_timeout && ! rp->r_time_cache_inval)
688 		rp->r_time_cache_inval = now + mi->mi_acdirmax;
689 
690 	/*
691 	 * Delta is the number of nanoseconds that we will
692 	 * cache the attributes of the file.  It is based on
693 	 * the number of nanoseconds since the last time that
694 	 * we detected a change.  The assumption is that files
695 	 * that changed recently are likely to change again.
696 	 * There is a minimum and a maximum for regular files
697 	 * and for directories which is enforced though.
698 	 *
699 	 * Using the time since last change was detected
700 	 * eliminates direct comparison or calculation
701 	 * using mixed client and server times.  NFS does
702 	 * not make any assumptions regarding the client
703 	 * and server clocks being synchronized.
704 	 */
705 	if (vap->va_mtime.tv_sec != rp->r_attr.va_mtime.tv_sec ||
706 	    vap->va_mtime.tv_nsec != rp->r_attr.va_mtime.tv_nsec ||
707 	    vap->va_size != rp->r_attr.va_size) {
708 		rp->r_time_attr_saved = now;
709 	}
710 
	/* No attribute caching at all for noac mounts and VNOCACHE vnodes. */
711 	if ((mi->mi_flags & MI4_NOAC) || (vp->v_flag & VNOCACHE))
712 		delta = 0;
713 	else {
		/* Clamp delta to the per-mount min/max for the object type. */
714 		delta = now - rp->r_time_attr_saved;
715 		if (vp->v_type == VDIR) {
716 			if (delta < mi->mi_acdirmin)
717 				delta = mi->mi_acdirmin;
718 			else if (delta > mi->mi_acdirmax)
719 				delta = mi->mi_acdirmax;
720 		} else {
721 			if (delta < mi->mi_acregmin)
722 				delta = mi->mi_acregmin;
723 			else if (delta > mi->mi_acregmax)
724 				delta = mi->mi_acregmax;
725 		}
726 	}
727 	rp->r_time_attr_inval = now + delta;
728 
729 	rp->r_attr = *vap;
730 	if (garp->n4g_change_valid)
731 		rp->r_change = garp->n4g_change;
732 
733 	/*
734 	 * The attributes that were returned may be valid and can
735 	 * be used, but they may not be allowed to be cached.
736 	 * Reset the timers to cause immediate invalidation and
737 	 * clear r_change so no VERIFY operations will succeed
738 	 */
739 	if (garp->n4g_attrwhy == NFS4_GETATTR_NOCACHE_OK) {
740 		rp->r_time_attr_inval = now;
741 		rp->r_time_attr_saved = now;
742 		rp->r_change = 0;
743 	}
744 
745 	/*
746 	 * If mounted_on_fileid returned AND the object is a stub,
747 	 * then set object's va_nodeid to the mounted over fid
748 	 * returned by server.
749 	 *
750 	 * If mounted_on_fileid not provided/supported, then
751 	 * just set it to 0 for now.  Eventually it would be
752 	 * better to set it to a hashed version of FH.  This
753 	 * would probably be good enough to provide a unique
754 	 * fid/d_ino within a dir.
755 	 *
756 	 * We don't need to carry mounted_on_fileid in the
757 	 * rnode as long as the client never requests fileid
758 	 * without also requesting mounted_on_fileid.  For
759 	 * now, it stays.
760 	 */
761 	if (garp->n4g_mon_fid_valid) {
762 		rp->r_mntd_fid = garp->n4g_mon_fid;
763 
764 		if (RP_ISSTUB(rp))
765 			rp->r_attr.va_nodeid = rp->r_mntd_fid;
766 	}
767 
768 	/*
769 	 * Check to see if there are valid pathconf bits to
770 	 * cache in the rnode.
	 * A fully valid pathconf result replaces the cached one; otherwise
	 * only the xattr-existence bits are taken over, if those are valid.
771 	 */
772 	if (garp->n4g_ext_res) {
773 		if (garp->n4g_ext_res->n4g_pc4.pc4_cache_valid) {
774 			rp->r_pathconf = garp->n4g_ext_res->n4g_pc4;
775 		} else {
776 			if (garp->n4g_ext_res->n4g_pc4.pc4_xattr_valid) {
777 				rp->r_pathconf.pc4_xattr_valid = TRUE;
778 				rp->r_pathconf.pc4_xattr_exists =
779 				    garp->n4g_ext_res->n4g_pc4.pc4_xattr_exists;
780 			}
781 		}
782 	}
783 	/*
784 	 * Update the size of the file if there is no cached data or if
785 	 * the cached data is clean and there is no data being written
786 	 * out.
787 	 */
788 	if (rp->r_size != vap->va_size &&
789 	    (!vn_has_cached_data(vp) ||
790 	    (!(rp->r_flags & R4DIRTY) && rp->r_count == 0))) {
791 		rp->r_size = vap->va_size;
792 	}
793 	nfs_setswaplike(vp, vap);
794 	rp->r_flags &= ~R4WRITEMODIFIED;
795 }
796 
797 /*
798  * Get attributes over-the-wire and update attributes cache
799  * if no error occurred in the over-the-wire operation.
800  * Return 0 if successful, otherwise error.
 *
 * vp:		vnode to get the attributes for
 * garp:	filled in with the attributes returned by the server
 * cr:		credentials for the call
 * get_acl:	if nonzero, the ACL attribute is requested as well
 *
 * The call is bracketed by nfs4_start_fop()/nfs4_end_fop() and is retried
 * from the top when recovery was needed and nfs4_start_recovery()
 * returned FALSE.  An NFS-level failure is converted to an errno and
 * passed to nfs4_purge_stale_fh().
801  */
802 int
803 nfs4_getattr_otw(vnode_t *vp, nfs4_ga_res_t *garp, cred_t *cr, int get_acl)
804 {
805 	mntinfo4_t *mi = VTOMI4(vp);
806 	hrtime_t t;
807 	nfs4_recov_state_t recov_state;
808 	nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };
809 
810 	recov_state.rs_flags = 0;
811 	recov_state.rs_num_retry_despite_err = 0;
812 
813 	/* Save the original mount point security flavor */
814 	(void) save_mnt_secinfo(mi->mi_curr_serv);
815 
816 recov_retry:
817 
818 	if ((e.error = nfs4_start_fop(mi, vp, NULL, OH_GETATTR,
819 	    &recov_state, NULL))) {
820 		(void) check_mnt_secinfo(mi->mi_curr_serv, vp);
821 		return (e.error);
822 	}
823 
	/*
	 * Taken before the request is sent; nfs4_attr_cache() compares it
	 * with the time attributes were last saved in the rnode.
	 */
824 	t = gethrtime();
825 
826 	nfs4_getattr_otw_norecovery(vp, garp, &e, cr, get_acl);
827 
828 	if (nfs4_needs_recovery(&e, FALSE, vp->v_vfsp)) {
829 		if (nfs4_start_recovery(&e, VTOMI4(vp), vp, NULL, NULL,
830 		    NULL, OP_GETATTR, NULL, NULL, NULL) == FALSE)  {
831 			nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_GETATTR,
832 			    &recov_state, 1);
833 			goto recov_retry;
834 		}
835 	}
836 
837 	nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_GETATTR, &recov_state, 0);
838 
	/* The caches are only touched after the end_fop call above. */
839 	if (!e.error) {
840 		if (e.stat == NFS4_OK) {
841 			nfs4_attr_cache(vp, garp, t, cr, FALSE, NULL);
842 		} else {
843 			e.error = geterrno4(e.stat);
844 
845 			nfs4_purge_stale_fh(e.error, vp, cr);
846 		}
847 	}
848 
849 	/*
850 	 * If getattr a node that is a stub for a crossed
851 	 * mount point, keep the original secinfo flavor for
852 	 * the current file system, not the crossed one.
853 	 */
854 	(void) check_mnt_secinfo(mi->mi_curr_serv, vp);
855 
856 	return (e.error);
857 }
858 
859 /*
860  * Generate a compound to get attributes over-the-wire.
861  */
862 void
863 nfs4_getattr_otw_norecovery(vnode_t *vp, nfs4_ga_res_t *garp,
864     nfs4_error_t *ep, cred_t *cr, int get_acl)
865 {
866 	COMPOUND4args_clnt args;
867 	COMPOUND4res_clnt res;
868 	int doqueue;
869 	rnode4_t *rp = VTOR4(vp);
870 	nfs_argop4 argop[2];
871 
872 	args.ctag = TAG_GETATTR;
873 
874 	args.array_len = 2;
875 	args.array = argop;
876 
877 	/* putfh */
878 	argop[0].argop = OP_CPUTFH;
879 	argop[0].nfs_argop4_u.opcputfh.sfh = rp->r_fh;
880 
881 	/* getattr */
882 	/*
883 	 * Unlike nfs version 2 and 3, where getattr returns all the
884 	 * attributes, nfs version 4 returns only the ones explicitly
885 	 * asked for. This creates problems, as some system functions
886 	 * (e.g. cache check) require certain attributes and if the
887 	 * cached node lacks some attributes such as uid/gid, it can
888 	 * affect system utilities (e.g. "ls") that rely on the information
889 	 * to be there. This can lead to anything from system crashes to
890 	 * corrupted information processed by user apps.
891 	 * So to ensure that all bases are covered, request at least
892 	 * the AT_ALL attribute mask.
893 	 */
894 	argop[1].argop = OP_GETATTR;
895 	argop[1].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK;
896 	if (get_acl)
897 		argop[1].nfs_argop4_u.opgetattr.attr_request |= FATTR4_ACL_MASK;
898 	argop[1].nfs_argop4_u.opgetattr.mi = VTOMI4(vp);
899 
900 	doqueue = 1;
901 
902 	rfs4call(VTOMI4(vp), &args, &res, cr, &doqueue, 0, ep);
903 
904 	if (ep->error)
905 		return;
906 
907 	if (res.status != NFS4_OK) {
908 		(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
909 		return;
910 	}
911 
912 	*garp = res.array[1].nfs_resop4_u.opgetattr.ga_res;
913 
914 	(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
915 }
916 
917 /*
918  * Return either cached or remote attributes. If get remote attr
919  * use them to check and invalidate caches, then cache the new attributes.
920  */
921 int
922 nfs4getattr(vnode_t *vp, vattr_t *vap, cred_t *cr)
923 {
924 	int error;
925 	rnode4_t *rp;
926 	nfs4_ga_res_t gar;
927 
928 	ASSERT(nfs4_consistent_type(vp));
929 
930 	/*
931 	 * If we've got cached attributes, we're done, otherwise go
932 	 * to the server to get attributes, which will update the cache
933 	 * in the process. Either way, use the cached attributes for
934 	 * the caller's vattr_t.
935 	 *
936 	 * Note that we ignore the gar set by the OTW call: the attr caching
937 	 * code may make adjustments when storing to the rnode, and we want
938 	 * to see those changes here.
939 	 */
940 	rp = VTOR4(vp);
941 	error = 0;
942 	mutex_enter(&rp->r_statelock);
943 	if (!ATTRCACHE4_VALID(vp)) {
944 		mutex_exit(&rp->r_statelock);
945 		error = nfs4_getattr_otw(vp, &gar, cr, 0);
946 		mutex_enter(&rp->r_statelock);
947 	}
948 
949 	if (!error)
950 		*vap = rp->r_attr;
951 
952 	/* Return the client's view of file size */
953 	vap->va_size = rp->r_size;
954 
955 	mutex_exit(&rp->r_statelock);
956 
957 	ASSERT(nfs4_consistent_type(vp));
958 
959 	return (error);
960 }
961 
/*
 * Get the attributes selected by "reqbitmap" over-the-wire, with recovery.
 *
 * vp:		vnode to get the attributes for
 * tag_type:	compound tag to use for the call
 * garp:	receives the result; if the caller supplied an n4g_ext_res
 *		buffer, that pointer is preserved and the extended results
 *		are copied into it
 * reqbitmap:	attributes to request
 * cr:		credentials for the call
 *
 * Returns 0 on success, otherwise an errno.  The attribute cache is not
 * updated by this routine.
 */
962 int
963 nfs4_attr_otw(vnode_t *vp, nfs4_tag_type_t tag_type,
964     nfs4_ga_res_t *garp, bitmap4 reqbitmap, cred_t *cr)
965 {
966 	COMPOUND4args_clnt args;
967 	COMPOUND4res_clnt res;
968 	int doqueue;
969 	nfs_argop4 argop[2];
970 	mntinfo4_t *mi = VTOMI4(vp);
971 	bool_t needrecov = FALSE;
972 	nfs4_recov_state_t recov_state;
973 	nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };
974 	nfs4_ga_ext_res_t *gerp;
975 
976 	recov_state.rs_flags = 0;
977 	recov_state.rs_num_retry_despite_err = 0;
978 
979 recov_retry:
980 	args.ctag = tag_type;
981 
982 	args.array_len = 2;
983 	args.array = argop;
984 
985 	e.error = nfs4_start_fop(mi, vp, NULL, OH_GETATTR, &recov_state, NULL);
986 	if (e.error)
987 		return (e.error);
988 
989 	/* putfh */
990 	argop[0].argop = OP_CPUTFH;
991 	argop[0].nfs_argop4_u.opcputfh.sfh = VTOR4(vp)->r_fh;
992 
993 	/* getattr */
994 	argop[1].argop = OP_GETATTR;
995 	argop[1].nfs_argop4_u.opgetattr.attr_request = reqbitmap;
996 	argop[1].nfs_argop4_u.opgetattr.mi = mi;
997 
998 	doqueue = 1;
999 
1000 	NFS4_DEBUG(nfs4_client_call_debug, (CE_NOTE,
1001 	    "nfs4_attr_otw: %s call, rp %s", needrecov ? "recov" : "first",
1002 	    rnode4info(VTOR4(vp))));
1003 
1004 	rfs4call(mi, &args, &res, cr, &doqueue, 0, &e);
1005 
1006 	needrecov = nfs4_needs_recovery(&e, FALSE, vp->v_vfsp);
	/* A plain call failure that needs no recovery is returned as is. */
1007 	if (!needrecov && e.error) {
1008 		nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_GETATTR, &recov_state,
1009 		    needrecov);
1010 		return (e.error);
1011 	}
1012 
1013 	if (needrecov) {
1014 		bool_t abort;
1015 
1016 		NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
1017 		    "nfs4_attr_otw: initiating recovery\n"));
1018 
1019 		abort = nfs4_start_recovery(&e, VTOMI4(vp), vp, NULL, NULL,
1020 		    NULL, OP_GETATTR, NULL, NULL, NULL);
1021 		nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_GETATTR, &recov_state,
1022 		    needrecov);
		/*
		 * NOTE(review): res.status is read after xdr_free(&res);
		 * this presumes xdr_free() leaves the status member of the
		 * caller's structure intact -- confirm.
		 */
1023 		if (!e.error) {
1024 			(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
1025 			e.error = geterrno4(res.status);
1026 		}
1027 		if (abort == FALSE)
1028 			goto recov_retry;
1029 		return (e.error);
1030 	}
1031 
1032 	if (res.status) {
1033 		e.error = geterrno4(res.status);
1034 	} else {
		/*
		 * The structure copy below overwrites garp->n4g_ext_res, so
		 * save the caller's pointer first and restore it afterwards.
		 * The extended results are then copied by value into the
		 * caller's buffer, if both sides have one.
		 */
1035 		gerp = garp->n4g_ext_res;
1036 		bcopy(&res.array[1].nfs_resop4_u.opgetattr.ga_res,
1037 		    garp, sizeof (nfs4_ga_res_t));
1038 		garp->n4g_ext_res = gerp;
1039 		if (garp->n4g_ext_res &&
1040 		    res.array[1].nfs_resop4_u.opgetattr.ga_res.n4g_ext_res)
1041 			bcopy(res.array[1].nfs_resop4_u.opgetattr.
1042 			    ga_res.n4g_ext_res,
1043 			    garp->n4g_ext_res, sizeof (nfs4_ga_ext_res_t));
1044 	}
1045 	(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
1046 	nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_GETATTR, &recov_state,
1047 	    needrecov);
1048 	return (e.error);
1049 }
1050 
/*
 * Asynchronous I/O parameters.  nfs_async_threads is the high-water mark
 * for the demand-based allocation of async threads per-mount.  The
 * nfs_async_timeout is the amount of time a thread will live after it
 * becomes idle, unless new I/O requests are received before the thread
 * dies.  See nfs4_async_putapage and nfs4_async_start.
 */
1058 
/*
 * Forward declarations for the async worker entry points defined later
 * in this file.  nfs4_async_start and nfs4_async_pgops_start are thin
 * wrappers that select the queue serviced by nfs4_async_common_start.
 */
static void	nfs4_async_common_start(struct vfs *vfsp, int async_queue);
static void	nfs4_async_start(struct vfs *vfsp);
static void	nfs4_async_pgops_start(struct vfs *vfsp);
1062 
1063 static void
1064 free_async_args4(struct nfs4_async_reqs *args)
1065 {
1066 	rnode4_t *rp;
1067 
1068 	if (args->a_io != NFS4_INACTIVE) {
1069 		rp = VTOR4(args->a_vp);
1070 		mutex_enter(&rp->r_statelock);
1071 		rp->r_count--;
1072 		if (args->a_io == NFS4_PUTAPAGE ||
1073 		    args->a_io == NFS4_PAGEIO)
1074 			rp->r_awcount--;
1075 		cv_broadcast(&rp->r_cv);
1076 		mutex_exit(&rp->r_statelock);
1077 		VN_RELE(args->a_vp);
1078 	}
1079 	crfree(args->a_cred);
1080 	kmem_free(args, sizeof (*args));
1081 }
1082 
1083 /*
1084  * Cross-zone thread creation and NFS access is disallowed, yet fsflush() and
1085  * pageout(), running in the global zone, have legitimate reasons to do
1086  * VOP_PUTPAGE(B_ASYNC) on other zones' NFS mounts.  We avoid the problem by
1087  * use of a a per-mount "asynchronous requests manager thread" which is
1088  * signaled by the various asynchronous work routines when there is
1089  * asynchronous work to be done.  It is responsible for creating new
1090  * worker threads if necessary, and notifying existing worker threads
1091  * that there is work to be done.
1092  *
1093  * In other words, it will "take the specifications from the customers and
1094  * give them to the engineers."
1095  *
1096  * Worker threads die off of their own accord if they are no longer
1097  * needed.
1098  *
1099  * This thread is killed when the zone is going away or the filesystem
1100  * is being unmounted.
1101  */
1102 void
1103 nfs4_async_manager(vfs_t *vfsp)
1104 {
1105 	callb_cpr_t cprinfo;
1106 	mntinfo4_t *mi;
1107 	uint_t max_threads;
1108 
1109 	mi = VFTOMI4(vfsp);
1110 
1111 	CALLB_CPR_INIT(&cprinfo, &mi->mi_async_lock, callb_generic_cpr,
1112 	    "nfs4_async_manager");
1113 
1114 	mutex_enter(&mi->mi_async_lock);
1115 	/*
1116 	 * We want to stash the max number of threads that this mount was
1117 	 * allowed so we can use it later when the variable is set to zero as
1118 	 * part of the zone/mount going away.
1119 	 *
1120 	 * We want to be able to create at least one thread to handle
1121 	 * asynchronous inactive calls.
1122 	 */
1123 	max_threads = MAX(mi->mi_max_threads, 1);
1124 	/*
1125 	 * We don't want to wait for mi_max_threads to go to zero, since that
1126 	 * happens as part of a failed unmount, but this thread should only
1127 	 * exit when the mount is really going away.
1128 	 *
1129 	 * Once MI4_ASYNC_MGR_STOP is set, no more async operations will be
1130 	 * attempted: the various _async_*() functions know to do things
1131 	 * inline if mi_max_threads == 0.  Henceforth we just drain out the
1132 	 * outstanding requests.
1133 	 *
1134 	 * Note that we still create zthreads even if we notice the zone is
1135 	 * shutting down (MI4_ASYNC_MGR_STOP is set); this may cause the zone
1136 	 * shutdown sequence to take slightly longer in some cases, but
1137 	 * doesn't violate the protocol, as all threads will exit as soon as
1138 	 * they're done processing the remaining requests.
1139 	 */
1140 	for (;;) {
1141 		while (mi->mi_async_req_count > 0) {
1142 			/*
1143 			 * Paranoia: If the mount started out having
1144 			 * (mi->mi_max_threads == 0), and the value was
1145 			 * later changed (via a debugger or somesuch),
1146 			 * we could be confused since we will think we
1147 			 * can't create any threads, and the calling
1148 			 * code (which looks at the current value of
1149 			 * mi->mi_max_threads, now non-zero) thinks we
1150 			 * can.
1151 			 *
1152 			 * So, because we're paranoid, we create threads
1153 			 * up to the maximum of the original and the
1154 			 * current value. This means that future
1155 			 * (debugger-induced) alterations of
1156 			 * mi->mi_max_threads are ignored for our
1157 			 * purposes, but who told them they could change
1158 			 * random values on a live kernel anyhow?
1159 			 */
1160 			if (mi->mi_threads[NFS4_ASYNC_QUEUE] <
1161 			    MAX(mi->mi_max_threads, max_threads)) {
1162 				mi->mi_threads[NFS4_ASYNC_QUEUE]++;
1163 				mutex_exit(&mi->mi_async_lock);
1164 				MI4_HOLD(mi);
1165 				VFS_HOLD(vfsp);	/* hold for new thread */
1166 				(void) zthread_create(NULL, 0, nfs4_async_start,
1167 				    vfsp, 0, minclsyspri);
1168 				mutex_enter(&mi->mi_async_lock);
1169 			} else if (mi->mi_threads[NFS4_ASYNC_PGOPS_QUEUE] <
1170 			    NUM_ASYNC_PGOPS_THREADS) {
1171 				mi->mi_threads[NFS4_ASYNC_PGOPS_QUEUE]++;
1172 				mutex_exit(&mi->mi_async_lock);
1173 				MI4_HOLD(mi);
1174 				VFS_HOLD(vfsp); /* hold for new thread */
1175 				(void) zthread_create(NULL, 0,
1176 				    nfs4_async_pgops_start, vfsp, 0,
1177 				    minclsyspri);
1178 				mutex_enter(&mi->mi_async_lock);
1179 			}
1180 			NFS4_WAKE_ASYNC_WORKER(mi->mi_async_work_cv);
1181 			ASSERT(mi->mi_async_req_count != 0);
1182 			mi->mi_async_req_count--;
1183 		}
1184 
1185 		mutex_enter(&mi->mi_lock);
1186 		if (mi->mi_flags & MI4_ASYNC_MGR_STOP) {
1187 			mutex_exit(&mi->mi_lock);
1188 			break;
1189 		}
1190 		mutex_exit(&mi->mi_lock);
1191 
1192 		CALLB_CPR_SAFE_BEGIN(&cprinfo);
1193 		cv_wait(&mi->mi_async_reqs_cv, &mi->mi_async_lock);
1194 		CALLB_CPR_SAFE_END(&cprinfo, &mi->mi_async_lock);
1195 	}
1196 
1197 	NFS4_DEBUG(nfs4_client_zone_debug, (CE_NOTE,
1198 	    "nfs4_async_manager exiting for vfs %p\n", (void *)mi->mi_vfsp));
1199 	/*
1200 	 * Let everyone know we're done.
1201 	 */
1202 	mi->mi_manager_thread = NULL;
1203 	/*
1204 	 * Wake up the inactive thread.
1205 	 */
1206 	cv_broadcast(&mi->mi_inact_req_cv);
1207 	/*
1208 	 * Wake up anyone sitting in nfs4_async_manager_stop()
1209 	 */
1210 	cv_broadcast(&mi->mi_async_cv);
1211 	/*
1212 	 * There is no explicit call to mutex_exit(&mi->mi_async_lock)
1213 	 * since CALLB_CPR_EXIT is actually responsible for releasing
1214 	 * 'mi_async_lock'.
1215 	 */
1216 	CALLB_CPR_EXIT(&cprinfo);
1217 	VFS_RELE(vfsp);	/* release thread's hold */
1218 	MI4_RELE(mi);
1219 	zthread_exit();
1220 }
1221 
1222 /*
1223  * Signal (and wait for) the async manager thread to clean up and go away.
1224  */
1225 void
1226 nfs4_async_manager_stop(vfs_t *vfsp)
1227 {
1228 	mntinfo4_t *mi = VFTOMI4(vfsp);
1229 
1230 	mutex_enter(&mi->mi_async_lock);
1231 	mutex_enter(&mi->mi_lock);
1232 	mi->mi_flags |= MI4_ASYNC_MGR_STOP;
1233 	mutex_exit(&mi->mi_lock);
1234 	cv_broadcast(&mi->mi_async_reqs_cv);
1235 	/*
1236 	 * Wait for the async manager thread to die.
1237 	 */
1238 	while (mi->mi_manager_thread != NULL)
1239 		cv_wait(&mi->mi_async_cv, &mi->mi_async_lock);
1240 	mutex_exit(&mi->mi_async_lock);
1241 }
1242 
1243 int
1244 nfs4_async_readahead(vnode_t *vp, u_offset_t blkoff, caddr_t addr,
1245     struct seg *seg, cred_t *cr, void (*readahead)(vnode_t *,
1246     u_offset_t, caddr_t, struct seg *, cred_t *))
1247 {
1248 	rnode4_t *rp;
1249 	mntinfo4_t *mi;
1250 	struct nfs4_async_reqs *args;
1251 
1252 	rp = VTOR4(vp);
1253 	ASSERT(rp->r_freef == NULL);
1254 
1255 	mi = VTOMI4(vp);
1256 
1257 	/*
1258 	 * If addr falls in a different segment, don't bother doing readahead.
1259 	 */
1260 	if (addr >= seg->s_base + seg->s_size)
1261 		return (-1);
1262 
1263 	/*
1264 	 * If we can't allocate a request structure, punt on the readahead.
1265 	 */
1266 	if ((args = kmem_alloc(sizeof (*args), KM_NOSLEEP)) == NULL)
1267 		return (-1);
1268 
1269 	/*
1270 	 * If a lock operation is pending, don't initiate any new
1271 	 * readaheads.  Otherwise, bump r_count to indicate the new
1272 	 * asynchronous I/O.
1273 	 */
1274 	if (!nfs_rw_tryenter(&rp->r_lkserlock, RW_READER)) {
1275 		kmem_free(args, sizeof (*args));
1276 		return (-1);
1277 	}
1278 	mutex_enter(&rp->r_statelock);
1279 	rp->r_count++;
1280 	mutex_exit(&rp->r_statelock);
1281 	nfs_rw_exit(&rp->r_lkserlock);
1282 
1283 	args->a_next = NULL;
1284 #ifdef DEBUG
1285 	args->a_queuer = curthread;
1286 #endif
1287 	VN_HOLD(vp);
1288 	args->a_vp = vp;
1289 	ASSERT(cr != NULL);
1290 	crhold(cr);
1291 	args->a_cred = cr;
1292 	args->a_io = NFS4_READ_AHEAD;
1293 	args->a_nfs4_readahead = readahead;
1294 	args->a_nfs4_blkoff = blkoff;
1295 	args->a_nfs4_seg = seg;
1296 	args->a_nfs4_addr = addr;
1297 
1298 	mutex_enter(&mi->mi_async_lock);
1299 
1300 	/*
1301 	 * If asyncio has been disabled, don't bother readahead.
1302 	 */
1303 	if (mi->mi_max_threads == 0) {
1304 		mutex_exit(&mi->mi_async_lock);
1305 		goto noasync;
1306 	}
1307 
1308 	/*
1309 	 * Link request structure into the async list and
1310 	 * wakeup async thread to do the i/o.
1311 	 */
1312 	if (mi->mi_async_reqs[NFS4_READ_AHEAD] == NULL) {
1313 		mi->mi_async_reqs[NFS4_READ_AHEAD] = args;
1314 		mi->mi_async_tail[NFS4_READ_AHEAD] = args;
1315 	} else {
1316 		mi->mi_async_tail[NFS4_READ_AHEAD]->a_next = args;
1317 		mi->mi_async_tail[NFS4_READ_AHEAD] = args;
1318 	}
1319 
1320 	if (mi->mi_io_kstats) {
1321 		mutex_enter(&mi->mi_lock);
1322 		kstat_waitq_enter(KSTAT_IO_PTR(mi->mi_io_kstats));
1323 		mutex_exit(&mi->mi_lock);
1324 	}
1325 
1326 	mi->mi_async_req_count++;
1327 	ASSERT(mi->mi_async_req_count != 0);
1328 	cv_signal(&mi->mi_async_reqs_cv);
1329 	mutex_exit(&mi->mi_async_lock);
1330 	return (0);
1331 
1332 noasync:
1333 	mutex_enter(&rp->r_statelock);
1334 	rp->r_count--;
1335 	cv_broadcast(&rp->r_cv);
1336 	mutex_exit(&rp->r_statelock);
1337 	VN_RELE(vp);
1338 	crfree(cr);
1339 	kmem_free(args, sizeof (*args));
1340 	return (-1);
1341 }
1342 
1343 static void
1344 nfs4_async_start(struct vfs *vfsp)
1345 {
1346 	nfs4_async_common_start(vfsp, NFS4_ASYNC_QUEUE);
1347 }
1348 
1349 static void
1350 nfs4_async_pgops_start(struct vfs *vfsp)
1351 {
1352 	nfs4_async_common_start(vfsp, NFS4_ASYNC_PGOPS_QUEUE);
1353 }
1354 
1355 /*
1356  * The async queues for each mounted file system are arranged as a
1357  * set of queues, one for each async i/o type.  Requests are taken
1358  * from the queues in a round-robin fashion.  A number of consecutive
1359  * requests are taken from each queue before moving on to the next
1360  * queue.  This functionality may allow the NFS Version 2 server to do
1361  * write clustering, even if the client is mixing writes and reads
1362  * because it will take multiple write requests from the queue
1363  * before processing any of the other async i/o types.
1364  *
1365  * XXX The nfs4_async_common_start thread is unsafe in the light of the present
1366  * model defined by cpr to suspend the system. Specifically over the
1367  * wire calls are cpr-unsafe. The thread should be reevaluated in
1368  * case of future updates to the cpr model.
1369  */
1370 static void
1371 nfs4_async_common_start(struct vfs *vfsp, int async_queue)
1372 {
1373 	struct nfs4_async_reqs *args;
1374 	mntinfo4_t *mi = VFTOMI4(vfsp);
1375 	clock_t time_left = 1;
1376 	callb_cpr_t cprinfo;
1377 	int i;
1378 	extern int nfs_async_timeout;
1379 	int async_types;
1380 	kcondvar_t *async_work_cv;
1381 
1382 	if (async_queue == NFS4_ASYNC_QUEUE) {
1383 		async_types = NFS4_ASYNC_TYPES;
1384 		async_work_cv = &mi->mi_async_work_cv[NFS4_ASYNC_QUEUE];
1385 	} else {
1386 		async_types = NFS4_ASYNC_PGOPS_TYPES;
1387 		async_work_cv = &mi->mi_async_work_cv[NFS4_ASYNC_PGOPS_QUEUE];
1388 	}
1389 
1390 	/*
1391 	 * Dynamic initialization of nfs_async_timeout to allow nfs to be
1392 	 * built in an implementation independent manner.
1393 	 */
1394 	if (nfs_async_timeout == -1)
1395 		nfs_async_timeout = NFS_ASYNC_TIMEOUT;
1396 
1397 	CALLB_CPR_INIT(&cprinfo, &mi->mi_async_lock, callb_generic_cpr, "nas");
1398 
1399 	mutex_enter(&mi->mi_async_lock);
1400 	for (;;) {
1401 		/*
1402 		 * Find the next queue containing an entry.  We start
1403 		 * at the current queue pointer and then round robin
1404 		 * through all of them until we either find a non-empty
1405 		 * queue or have looked through all of them.
1406 		 */
1407 		for (i = 0; i < async_types; i++) {
1408 			args = *mi->mi_async_curr[async_queue];
1409 			if (args != NULL)
1410 				break;
1411 			mi->mi_async_curr[async_queue]++;
1412 			if (mi->mi_async_curr[async_queue] ==
1413 			    &mi->mi_async_reqs[async_types]) {
1414 				mi->mi_async_curr[async_queue] =
1415 				    &mi->mi_async_reqs[0];
1416 			}
1417 		}
1418 		/*
1419 		 * If we didn't find a entry, then block until woken up
1420 		 * again and then look through the queues again.
1421 		 */
1422 		if (args == NULL) {
1423 			/*
1424 			 * Exiting is considered to be safe for CPR as well
1425 			 */
1426 			CALLB_CPR_SAFE_BEGIN(&cprinfo);
1427 
1428 			/*
1429 			 * Wakeup thread waiting to unmount the file
1430 			 * system only if all async threads are inactive.
1431 			 *
1432 			 * If we've timed-out and there's nothing to do,
1433 			 * then get rid of this thread.
1434 			 */
1435 			if (mi->mi_max_threads == 0 || time_left <= 0) {
1436 				--mi->mi_threads[async_queue];
1437 
1438 				if (mi->mi_threads[NFS4_ASYNC_QUEUE] == 0 &&
1439 				    mi->mi_threads[NFS4_ASYNC_PGOPS_QUEUE] == 0)
1440 					cv_signal(&mi->mi_async_cv);
1441 				CALLB_CPR_EXIT(&cprinfo);
1442 				VFS_RELE(vfsp);	/* release thread's hold */
1443 				MI4_RELE(mi);
1444 				zthread_exit();
1445 				/* NOTREACHED */
1446 			}
1447 			time_left = cv_reltimedwait(async_work_cv,
1448 			    &mi->mi_async_lock, nfs_async_timeout,
1449 			    TR_CLOCK_TICK);
1450 
1451 			CALLB_CPR_SAFE_END(&cprinfo, &mi->mi_async_lock);
1452 
1453 			continue;
1454 		} else {
1455 			time_left = 1;
1456 		}
1457 
1458 		/*
1459 		 * Remove the request from the async queue and then
1460 		 * update the current async request queue pointer.  If
1461 		 * the current queue is empty or we have removed enough
1462 		 * consecutive entries from it, then reset the counter
1463 		 * for this queue and then move the current pointer to
1464 		 * the next queue.
1465 		 */
1466 		*mi->mi_async_curr[async_queue] = args->a_next;
1467 		if (*mi->mi_async_curr[async_queue] == NULL ||
1468 		    --mi->mi_async_clusters[args->a_io] == 0) {
1469 			mi->mi_async_clusters[args->a_io] =
1470 			    mi->mi_async_init_clusters;
1471 			mi->mi_async_curr[async_queue]++;
1472 			if (mi->mi_async_curr[async_queue] ==
1473 			    &mi->mi_async_reqs[async_types]) {
1474 				mi->mi_async_curr[async_queue] =
1475 				    &mi->mi_async_reqs[0];
1476 			}
1477 		}
1478 
1479 		if (args->a_io != NFS4_INACTIVE && mi->mi_io_kstats) {
1480 			mutex_enter(&mi->mi_lock);
1481 			kstat_waitq_exit(KSTAT_IO_PTR(mi->mi_io_kstats));
1482 			mutex_exit(&mi->mi_lock);
1483 		}
1484 
1485 		mutex_exit(&mi->mi_async_lock);
1486 
1487 		/*
1488 		 * Obtain arguments from the async request structure.
1489 		 */
1490 		if (args->a_io == NFS4_READ_AHEAD && mi->mi_max_threads > 0) {
1491 			(*args->a_nfs4_readahead)(args->a_vp,
1492 			    args->a_nfs4_blkoff, args->a_nfs4_addr,
1493 			    args->a_nfs4_seg, args->a_cred);
1494 		} else if (args->a_io == NFS4_PUTAPAGE) {
1495 			(void) (*args->a_nfs4_putapage)(args->a_vp,
1496 			    args->a_nfs4_pp, args->a_nfs4_off,
1497 			    args->a_nfs4_len, args->a_nfs4_flags,
1498 			    args->a_cred);
1499 		} else if (args->a_io == NFS4_PAGEIO) {
1500 			(void) (*args->a_nfs4_pageio)(args->a_vp,
1501 			    args->a_nfs4_pp, args->a_nfs4_off,
1502 			    args->a_nfs4_len, args->a_nfs4_flags,
1503 			    args->a_cred);
1504 		} else if (args->a_io == NFS4_READDIR) {
1505 			(void) ((*args->a_nfs4_readdir)(args->a_vp,
1506 			    args->a_nfs4_rdc, args->a_cred));
1507 		} else if (args->a_io == NFS4_COMMIT) {
1508 			(*args->a_nfs4_commit)(args->a_vp, args->a_nfs4_plist,
1509 			    args->a_nfs4_offset, args->a_nfs4_count,
1510 			    args->a_cred);
1511 		} else if (args->a_io == NFS4_INACTIVE) {
1512 			nfs4_inactive_otw(args->a_vp, args->a_cred);
1513 		}
1514 
1515 		/*
1516 		 * Now, release the vnode and free the credentials
1517 		 * structure.
1518 		 */
1519 		free_async_args4(args);
1520 		/*
1521 		 * Reacquire the mutex because it will be needed above.
1522 		 */
1523 		mutex_enter(&mi->mi_async_lock);
1524 	}
1525 }
1526 
1527 /*
1528  * nfs4_inactive_thread - look for vnodes that need over-the-wire calls as
1529  * part of VOP_INACTIVE.
1530  */
1531 
1532 void
1533 nfs4_inactive_thread(mntinfo4_t *mi)
1534 {
1535 	struct nfs4_async_reqs *args;
1536 	callb_cpr_t cprinfo;
1537 	vfs_t *vfsp = mi->mi_vfsp;
1538 
1539 	CALLB_CPR_INIT(&cprinfo, &mi->mi_async_lock, callb_generic_cpr,
1540 	    "nfs4_inactive_thread");
1541 
1542 	for (;;) {
1543 		mutex_enter(&mi->mi_async_lock);
1544 		args = mi->mi_async_reqs[NFS4_INACTIVE];
1545 		if (args == NULL) {
1546 			mutex_enter(&mi->mi_lock);
1547 			/*
1548 			 * We don't want to exit until the async manager is done
1549 			 * with its work; hence the check for mi_manager_thread
1550 			 * being NULL.
1551 			 *
1552 			 * The async manager thread will cv_broadcast() on
1553 			 * mi_inact_req_cv when it's done, at which point we'll
1554 			 * wake up and exit.
1555 			 */
1556 			if (mi->mi_manager_thread == NULL)
1557 				goto die;
1558 			mi->mi_flags |= MI4_INACTIVE_IDLE;
1559 			mutex_exit(&mi->mi_lock);
1560 			cv_signal(&mi->mi_async_cv);
1561 			CALLB_CPR_SAFE_BEGIN(&cprinfo);
1562 			cv_wait(&mi->mi_inact_req_cv, &mi->mi_async_lock);
1563 			CALLB_CPR_SAFE_END(&cprinfo, &mi->mi_async_lock);
1564 			mutex_exit(&mi->mi_async_lock);
1565 		} else {
1566 			mutex_enter(&mi->mi_lock);
1567 			mi->mi_flags &= ~MI4_INACTIVE_IDLE;
1568 			mutex_exit(&mi->mi_lock);
1569 			mi->mi_async_reqs[NFS4_INACTIVE] = args->a_next;
1570 			mutex_exit(&mi->mi_async_lock);
1571 			nfs4_inactive_otw(args->a_vp, args->a_cred);
1572 			crfree(args->a_cred);
1573 			kmem_free(args, sizeof (*args));
1574 		}
1575 	}
1576 die:
1577 	mutex_exit(&mi->mi_lock);
1578 	mi->mi_inactive_thread = NULL;
1579 	cv_signal(&mi->mi_async_cv);
1580 
1581 	/*
1582 	 * There is no explicit call to mutex_exit(&mi->mi_async_lock) since
1583 	 * CALLB_CPR_EXIT is actually responsible for releasing 'mi_async_lock'.
1584 	 */
1585 	CALLB_CPR_EXIT(&cprinfo);
1586 
1587 	NFS4_DEBUG(nfs4_client_zone_debug, (CE_NOTE,
1588 	    "nfs4_inactive_thread exiting for vfs %p\n", (void *)vfsp));
1589 
1590 	MI4_RELE(mi);
1591 	zthread_exit();
1592 	/* NOTREACHED */
1593 }
1594 
1595 /*
1596  * nfs_async_stop:
1597  * Wait for all outstanding putpage operations and the inactive thread to
1598  * complete; nfs4_async_stop_sig() without interruptibility.
1599  */
1600 void
1601 nfs4_async_stop(struct vfs *vfsp)
1602 {
1603 	mntinfo4_t *mi = VFTOMI4(vfsp);
1604 
1605 	/*
1606 	 * Wait for all outstanding async operations to complete and for
1607 	 * worker threads to exit.
1608 	 */
1609 	mutex_enter(&mi->mi_async_lock);
1610 	mi->mi_max_threads = 0;
1611 	NFS4_WAKEALL_ASYNC_WORKERS(mi->mi_async_work_cv);
1612 	while (mi->mi_threads[NFS4_ASYNC_QUEUE] != 0 ||
1613 	    mi->mi_threads[NFS4_ASYNC_PGOPS_QUEUE] != 0)
1614 		cv_wait(&mi->mi_async_cv, &mi->mi_async_lock);
1615 
1616 	/*
1617 	 * Wait for the inactive thread to finish doing what it's doing.  It
1618 	 * won't exit until the last reference to the vfs_t goes away.
1619 	 */
1620 	if (mi->mi_inactive_thread != NULL) {
1621 		mutex_enter(&mi->mi_lock);
1622 		while (!(mi->mi_flags & MI4_INACTIVE_IDLE) ||
1623 		    (mi->mi_async_reqs[NFS4_INACTIVE] != NULL)) {
1624 			mutex_exit(&mi->mi_lock);
1625 			cv_wait(&mi->mi_async_cv, &mi->mi_async_lock);
1626 			mutex_enter(&mi->mi_lock);
1627 		}
1628 		mutex_exit(&mi->mi_lock);
1629 	}
1630 	mutex_exit(&mi->mi_async_lock);
1631 }
1632 
1633 /*
1634  * nfs_async_stop_sig:
1635  * Wait for all outstanding putpage operations and the inactive thread to
1636  * complete. If a signal is delivered we will abort and return non-zero;
1637  * otherwise return 0. Since this routine is called from nfs4_unmount, we
1638  * need to make it interruptible.
1639  */
1640 int
1641 nfs4_async_stop_sig(struct vfs *vfsp)
1642 {
1643 	mntinfo4_t *mi = VFTOMI4(vfsp);
1644 	ushort_t omax;
1645 	bool_t intr = FALSE;
1646 
1647 	/*
1648 	 * Wait for all outstanding putpage operations to complete and for
1649 	 * worker threads to exit.
1650 	 */
1651 	mutex_enter(&mi->mi_async_lock);
1652 	omax = mi->mi_max_threads;
1653 	mi->mi_max_threads = 0;
1654 	NFS4_WAKEALL_ASYNC_WORKERS(mi->mi_async_work_cv);
1655 	while (mi->mi_threads[NFS4_ASYNC_QUEUE] != 0 ||
1656 	    mi->mi_threads[NFS4_ASYNC_PGOPS_QUEUE] != 0) {
1657 		if (!cv_wait_sig(&mi->mi_async_cv, &mi->mi_async_lock)) {
1658 			intr = TRUE;
1659 			goto interrupted;
1660 		}
1661 	}
1662 
1663 	/*
1664 	 * Wait for the inactive thread to finish doing what it's doing.  It
1665 	 * won't exit until the a last reference to the vfs_t goes away.
1666 	 */
1667 	if (mi->mi_inactive_thread != NULL) {
1668 		mutex_enter(&mi->mi_lock);
1669 		while (!(mi->mi_flags & MI4_INACTIVE_IDLE) ||
1670 		    (mi->mi_async_reqs[NFS4_INACTIVE] != NULL)) {
1671 			mutex_exit(&mi->mi_lock);
1672 			if (!cv_wait_sig(&mi->mi_async_cv,
1673 			    &mi->mi_async_lock)) {
1674 				intr = TRUE;
1675 				goto interrupted;
1676 			}
1677 			mutex_enter(&mi->mi_lock);
1678 		}
1679 		mutex_exit(&mi->mi_lock);
1680 	}
1681 interrupted:
1682 	if (intr)
1683 		mi->mi_max_threads = omax;
1684 	mutex_exit(&mi->mi_async_lock);
1685 
1686 	return (intr);
1687 }
1688 
1689 int
1690 nfs4_async_putapage(vnode_t *vp, page_t *pp, u_offset_t off, size_t len,
1691     int flags, cred_t *cr, int (*putapage)(vnode_t *, page_t *,
1692     u_offset_t, size_t, int, cred_t *))
1693 {
1694 	rnode4_t *rp;
1695 	mntinfo4_t *mi;
1696 	struct nfs4_async_reqs *args;
1697 
1698 	ASSERT(flags & B_ASYNC);
1699 	ASSERT(vp->v_vfsp != NULL);
1700 
1701 	rp = VTOR4(vp);
1702 	ASSERT(rp->r_count > 0);
1703 
1704 	mi = VTOMI4(vp);
1705 
1706 	/*
1707 	 * If we can't allocate a request structure, do the putpage
1708 	 * operation synchronously in this thread's context.
1709 	 */
1710 	if ((args = kmem_alloc(sizeof (*args), KM_NOSLEEP)) == NULL)
1711 		goto noasync;
1712 
1713 	args->a_next = NULL;
1714 #ifdef DEBUG
1715 	args->a_queuer = curthread;
1716 #endif
1717 	VN_HOLD(vp);
1718 	args->a_vp = vp;
1719 	ASSERT(cr != NULL);
1720 	crhold(cr);
1721 	args->a_cred = cr;
1722 	args->a_io = NFS4_PUTAPAGE;
1723 	args->a_nfs4_putapage = putapage;
1724 	args->a_nfs4_pp = pp;
1725 	args->a_nfs4_off = off;
1726 	args->a_nfs4_len = (uint_t)len;
1727 	args->a_nfs4_flags = flags;
1728 
1729 	mutex_enter(&mi->mi_async_lock);
1730 
1731 	/*
1732 	 * If asyncio has been disabled, then make a synchronous request.
1733 	 * This check is done a second time in case async io was diabled
1734 	 * while this thread was blocked waiting for memory pressure to
1735 	 * reduce or for the queue to drain.
1736 	 */
1737 	if (mi->mi_max_threads == 0) {
1738 		mutex_exit(&mi->mi_async_lock);
1739 
1740 		VN_RELE(vp);
1741 		crfree(cr);
1742 		kmem_free(args, sizeof (*args));
1743 		goto noasync;
1744 	}
1745 
1746 	/*
1747 	 * Link request structure into the async list and
1748 	 * wakeup async thread to do the i/o.
1749 	 */
1750 	if (mi->mi_async_reqs[NFS4_PUTAPAGE] == NULL) {
1751 		mi->mi_async_reqs[NFS4_PUTAPAGE] = args;
1752 		mi->mi_async_tail[NFS4_PUTAPAGE] = args;
1753 	} else {
1754 		mi->mi_async_tail[NFS4_PUTAPAGE]->a_next = args;
1755 		mi->mi_async_tail[NFS4_PUTAPAGE] = args;
1756 	}
1757 
1758 	mutex_enter(&rp->r_statelock);
1759 	rp->r_count++;
1760 	rp->r_awcount++;
1761 	mutex_exit(&rp->r_statelock);
1762 
1763 	if (mi->mi_io_kstats) {
1764 		mutex_enter(&mi->mi_lock);
1765 		kstat_waitq_enter(KSTAT_IO_PTR(mi->mi_io_kstats));
1766 		mutex_exit(&mi->mi_lock);
1767 	}
1768 
1769 	mi->mi_async_req_count++;
1770 	ASSERT(mi->mi_async_req_count != 0);
1771 	cv_signal(&mi->mi_async_reqs_cv);
1772 	mutex_exit(&mi->mi_async_lock);
1773 	return (0);
1774 
1775 noasync:
1776 
1777 	if (curproc == proc_pageout || curproc == proc_fsflush) {
1778 		/*
1779 		 * If we get here in the context of the pageout/fsflush,
1780 		 * or we have run out of memory or we're attempting to
1781 		 * unmount we refuse to do a sync write, because this may
1782 		 * hang pageout/fsflush and the machine. In this case,
1783 		 * we just re-mark the page as dirty and punt on the page.
1784 		 *
1785 		 * Make sure B_FORCE isn't set.  We can re-mark the
1786 		 * pages as dirty and unlock the pages in one swoop by
1787 		 * passing in B_ERROR to pvn_write_done().  However,
1788 		 * we should make sure B_FORCE isn't set - we don't
1789 		 * want the page tossed before it gets written out.
1790 		 */
1791 		if (flags & B_FORCE)
1792 			flags &= ~(B_INVAL | B_FORCE);
1793 		pvn_write_done(pp, flags | B_ERROR);
1794 		return (0);
1795 	}
1796 
1797 	if (nfs_zone() != mi->mi_zone) {
1798 		/*
1799 		 * So this was a cross-zone sync putpage.
1800 		 *
1801 		 * We pass in B_ERROR to pvn_write_done() to re-mark the pages
1802 		 * as dirty and unlock them.
1803 		 *
1804 		 * We don't want to clear B_FORCE here as the caller presumably
1805 		 * knows what they're doing if they set it.
1806 		 */
1807 		pvn_write_done(pp, flags | B_ERROR);
1808 		return (EPERM);
1809 	}
1810 	return ((*putapage)(vp, pp, off, len, flags, cr));
1811 }
1812 
1813 int
1814 nfs4_async_pageio(vnode_t *vp, page_t *pp, u_offset_t io_off, size_t io_len,
1815     int flags, cred_t *cr, int (*pageio)(vnode_t *, page_t *, u_offset_t,
1816     size_t, int, cred_t *))
1817 {
1818 	rnode4_t *rp;
1819 	mntinfo4_t *mi;
1820 	struct nfs4_async_reqs *args;
1821 
1822 	ASSERT(flags & B_ASYNC);
1823 	ASSERT(vp->v_vfsp != NULL);
1824 
1825 	rp = VTOR4(vp);
1826 	ASSERT(rp->r_count > 0);
1827 
1828 	mi = VTOMI4(vp);
1829 
1830 	/*
1831 	 * If we can't allocate a request structure, do the pageio
1832 	 * request synchronously in this thread's context.
1833 	 */
1834 	if ((args = kmem_alloc(sizeof (*args), KM_NOSLEEP)) == NULL)
1835 		goto noasync;
1836 
1837 	args->a_next = NULL;
1838 #ifdef DEBUG
1839 	args->a_queuer = curthread;
1840 #endif
1841 	VN_HOLD(vp);
1842 	args->a_vp = vp;
1843 	ASSERT(cr != NULL);
1844 	crhold(cr);
1845 	args->a_cred = cr;
1846 	args->a_io = NFS4_PAGEIO;
1847 	args->a_nfs4_pageio = pageio;
1848 	args->a_nfs4_pp = pp;
1849 	args->a_nfs4_off = io_off;
1850 	args->a_nfs4_len = (uint_t)io_len;
1851 	args->a_nfs4_flags = flags;
1852 
1853 	mutex_enter(&mi->mi_async_lock);
1854 
1855 	/*
1856 	 * If asyncio has been disabled, then make a synchronous request.
1857 	 * This check is done a second time in case async io was diabled
1858 	 * while this thread was blocked waiting for memory pressure to
1859 	 * reduce or for the queue to drain.
1860 	 */
1861 	if (mi->mi_max_threads == 0) {
1862 		mutex_exit(&mi->mi_async_lock);
1863 
1864 		VN_RELE(vp);
1865 		crfree(cr);
1866 		kmem_free(args, sizeof (*args));
1867 		goto noasync;
1868 	}
1869 
1870 	/*
1871 	 * Link request structure into the async list and
1872 	 * wakeup async thread to do the i/o.
1873 	 */
1874 	if (mi->mi_async_reqs[NFS4_PAGEIO] == NULL) {
1875 		mi->mi_async_reqs[NFS4_PAGEIO] = args;
1876 		mi->mi_async_tail[NFS4_PAGEIO] = args;
1877 	} else {
1878 		mi->mi_async_tail[NFS4_PAGEIO]->a_next = args;
1879 		mi->mi_async_tail[NFS4_PAGEIO] = args;
1880 	}
1881 
1882 	mutex_enter(&rp->r_statelock);
1883 	rp->r_count++;
1884 	rp->r_awcount++;
1885 	mutex_exit(&rp->r_statelock);
1886 
1887 	if (mi->mi_io_kstats) {
1888 		mutex_enter(&mi->mi_lock);
1889 		kstat_waitq_enter(KSTAT_IO_PTR(mi->mi_io_kstats));
1890 		mutex_exit(&mi->mi_lock);
1891 	}
1892 
1893 	mi->mi_async_req_count++;
1894 	ASSERT(mi->mi_async_req_count != 0);
1895 	cv_signal(&mi->mi_async_reqs_cv);
1896 	mutex_exit(&mi->mi_async_lock);
1897 	return (0);
1898 
1899 noasync:
1900 	/*
1901 	 * If we can't do it ASYNC, for reads we do nothing (but cleanup
1902 	 * the page list), for writes we do it synchronously, except for
1903 	 * proc_pageout/proc_fsflush as described below.
1904 	 */
1905 	if (flags & B_READ) {
1906 		pvn_read_done(pp, flags | B_ERROR);
1907 		return (0);
1908 	}
1909 
1910 	if (curproc == proc_pageout || curproc == proc_fsflush) {
1911 		/*
1912 		 * If we get here in the context of the pageout/fsflush,
1913 		 * we refuse to do a sync write, because this may hang
1914 		 * pageout/fsflush (and the machine). In this case, we just
1915 		 * re-mark the page as dirty and punt on the page.
1916 		 *
1917 		 * Make sure B_FORCE isn't set.  We can re-mark the
1918 		 * pages as dirty and unlock the pages in one swoop by
1919 		 * passing in B_ERROR to pvn_write_done().  However,
1920 		 * we should make sure B_FORCE isn't set - we don't
1921 		 * want the page tossed before it gets written out.
1922 		 */
1923 		if (flags & B_FORCE)
1924 			flags &= ~(B_INVAL | B_FORCE);
1925 		pvn_write_done(pp, flags | B_ERROR);
1926 		return (0);
1927 	}
1928 
1929 	if (nfs_zone() != mi->mi_zone) {
1930 		/*
1931 		 * So this was a cross-zone sync pageio.  We pass in B_ERROR
1932 		 * to pvn_write_done() to re-mark the pages as dirty and unlock
1933 		 * them.
1934 		 *
1935 		 * We don't want to clear B_FORCE here as the caller presumably
1936 		 * knows what they're doing if they set it.
1937 		 */
1938 		pvn_write_done(pp, flags | B_ERROR);
1939 		return (EPERM);
1940 	}
1941 	return ((*pageio)(vp, pp, io_off, io_len, flags, cr));
1942 }
1943 
1944 void
1945 nfs4_async_readdir(vnode_t *vp, rddir4_cache *rdc, cred_t *cr,
1946     int (*readdir)(vnode_t *, rddir4_cache *, cred_t *))
1947 {
1948 	rnode4_t *rp;
1949 	mntinfo4_t *mi;
1950 	struct nfs4_async_reqs *args;
1951 
1952 	rp = VTOR4(vp);
1953 	ASSERT(rp->r_freef == NULL);
1954 
1955 	mi = VTOMI4(vp);
1956 
1957 	/*
1958 	 * If we can't allocate a request structure, skip the readdir.
1959 	 */
1960 	if ((args = kmem_alloc(sizeof (*args), KM_NOSLEEP)) == NULL)
1961 		goto noasync;
1962 
1963 	args->a_next = NULL;
1964 #ifdef DEBUG
1965 	args->a_queuer = curthread;
1966 #endif
1967 	VN_HOLD(vp);
1968 	args->a_vp = vp;
1969 	ASSERT(cr != NULL);
1970 	crhold(cr);
1971 	args->a_cred = cr;
1972 	args->a_io = NFS4_READDIR;
1973 	args->a_nfs4_readdir = readdir;
1974 	args->a_nfs4_rdc = rdc;
1975 
1976 	mutex_enter(&mi->mi_async_lock);
1977 
1978 	/*
1979 	 * If asyncio has been disabled, then skip this request
1980 	 */
1981 	if (mi->mi_max_threads == 0) {
1982 		mutex_exit(&mi->mi_async_lock);
1983 
1984 		VN_RELE(vp);
1985 		crfree(cr);
1986 		kmem_free(args, sizeof (*args));
1987 		goto noasync;
1988 	}
1989 
1990 	/*
1991 	 * Link request structure into the async list and
1992 	 * wakeup async thread to do the i/o.
1993 	 */
1994 	if (mi->mi_async_reqs[NFS4_READDIR] == NULL) {
1995 		mi->mi_async_reqs[NFS4_READDIR] = args;
1996 		mi->mi_async_tail[NFS4_READDIR] = args;
1997 	} else {
1998 		mi->mi_async_tail[NFS4_READDIR]->a_next = args;
1999 		mi->mi_async_tail[NFS4_READDIR] = args;
2000 	}
2001 
2002 	mutex_enter(&rp->r_statelock);
2003 	rp->r_count++;
2004 	mutex_exit(&rp->r_statelock);
2005 
2006 	if (mi->mi_io_kstats) {
2007 		mutex_enter(&mi->mi_lock);
2008 		kstat_waitq_enter(KSTAT_IO_PTR(mi->mi_io_kstats));
2009 		mutex_exit(&mi->mi_lock);
2010 	}
2011 
2012 	mi->mi_async_req_count++;
2013 	ASSERT(mi->mi_async_req_count != 0);
2014 	cv_signal(&mi->mi_async_reqs_cv);
2015 	mutex_exit(&mi->mi_async_lock);
2016 	return;
2017 
2018 noasync:
2019 	mutex_enter(&rp->r_statelock);
2020 	rdc->entries = NULL;
2021 	/*
2022 	 * Indicate that no one is trying to fill this entry and
2023 	 * it still needs to be filled.
2024 	 */
2025 	rdc->flags &= ~RDDIR;
2026 	rdc->flags |= RDDIRREQ;
2027 	rddir4_cache_rele(rp, rdc);
2028 	mutex_exit(&rp->r_statelock);
2029 }
2030 
2031 void
2032 nfs4_async_commit(vnode_t *vp, page_t *plist, offset3 offset, count3 count,
2033     cred_t *cr, void (*commit)(vnode_t *, page_t *, offset3, count3,
2034     cred_t *))
2035 {
2036 	rnode4_t *rp;
2037 	mntinfo4_t *mi;
2038 	struct nfs4_async_reqs *args;
2039 	page_t *pp;
2040 
2041 	rp = VTOR4(vp);
2042 	mi = VTOMI4(vp);
2043 
2044 	/*
2045 	 * If we can't allocate a request structure, do the commit
2046 	 * operation synchronously in this thread's context.
2047 	 */
2048 	if ((args = kmem_alloc(sizeof (*args), KM_NOSLEEP)) == NULL)
2049 		goto noasync;
2050 
2051 	args->a_next = NULL;
2052 #ifdef DEBUG
2053 	args->a_queuer = curthread;
2054 #endif
2055 	VN_HOLD(vp);
2056 	args->a_vp = vp;
2057 	ASSERT(cr != NULL);
2058 	crhold(cr);
2059 	args->a_cred = cr;
2060 	args->a_io = NFS4_COMMIT;
2061 	args->a_nfs4_commit = commit;
2062 	args->a_nfs4_plist = plist;
2063 	args->a_nfs4_offset = offset;
2064 	args->a_nfs4_count = count;
2065 
2066 	mutex_enter(&mi->mi_async_lock);
2067 
2068 	/*
2069 	 * If asyncio has been disabled, then make a synchronous request.
2070 	 * This check is done a second time in case async io was diabled
2071 	 * while this thread was blocked waiting for memory pressure to
2072 	 * reduce or for the queue to drain.
2073 	 */
2074 	if (mi->mi_max_threads == 0) {
2075 		mutex_exit(&mi->mi_async_lock);
2076 
2077 		VN_RELE(vp);
2078 		crfree(cr);
2079 		kmem_free(args, sizeof (*args));
2080 		goto noasync;
2081 	}
2082 
2083 	/*
2084 	 * Link request structure into the async list and
2085 	 * wakeup async thread to do the i/o.
2086 	 */
2087 	if (mi->mi_async_reqs[NFS4_COMMIT] == NULL) {
2088 		mi->mi_async_reqs[NFS4_COMMIT] = args;
2089 		mi->mi_async_tail[NFS4_COMMIT] = args;
2090 	} else {
2091 		mi->mi_async_tail[NFS4_COMMIT]->a_next = args;
2092 		mi->mi_async_tail[NFS4_COMMIT] = args;
2093 	}
2094 
2095 	mutex_enter(&rp->r_statelock);
2096 	rp->r_count++;
2097 	mutex_exit(&rp->r_statelock);
2098 
2099 	if (mi->mi_io_kstats) {
2100 		mutex_enter(&mi->mi_lock);
2101 		kstat_waitq_enter(KSTAT_IO_PTR(mi->mi_io_kstats));
2102 		mutex_exit(&mi->mi_lock);
2103 	}
2104 
2105 	mi->mi_async_req_count++;
2106 	ASSERT(mi->mi_async_req_count != 0);
2107 	cv_signal(&mi->mi_async_reqs_cv);
2108 	mutex_exit(&mi->mi_async_lock);
2109 	return;
2110 
2111 noasync:
2112 	if (curproc == proc_pageout || curproc == proc_fsflush ||
2113 	    nfs_zone() != mi->mi_zone) {
2114 		while (plist != NULL) {
2115 			pp = plist;
2116 			page_sub(&plist, pp);
2117 			pp->p_fsdata = C_COMMIT;
2118 			page_unlock(pp);
2119 		}
2120 		return;
2121 	}
2122 	(*commit)(vp, plist, offset, count, cr);
2123 }
2124 
2125 /*
2126  * nfs4_async_inactive - hand off a VOP_INACTIVE call to a thread.  The
2127  * reference to the vnode is handed over to the thread; the caller should
2128  * no longer refer to the vnode.
2129  *
2130  * Unlike most of the async routines, this handoff is needed for
2131  * correctness reasons, not just performance.  So doing operations in the
2132  * context of the current thread is not an option.
2133  */
2134 void
2135 nfs4_async_inactive(vnode_t *vp, cred_t *cr)
2136 {
2137 	mntinfo4_t *mi;
2138 	struct nfs4_async_reqs *args;
2139 	boolean_t signal_inactive_thread = B_FALSE;
2140 
2141 	mi = VTOMI4(vp);
2142 
2143 	args = kmem_alloc(sizeof (*args), KM_SLEEP);
2144 	args->a_next = NULL;
2145 #ifdef DEBUG
2146 	args->a_queuer = curthread;
2147 #endif
2148 	args->a_vp = vp;
2149 	ASSERT(cr != NULL);
2150 	crhold(cr);
2151 	args->a_cred = cr;
2152 	args->a_io = NFS4_INACTIVE;
2153 
2154 	/*
2155 	 * Note that we don't check mi->mi_max_threads here, since we
2156 	 * *need* to get rid of this vnode regardless of whether someone
2157 	 * set nfs4_max_threads to zero in /etc/system.
2158 	 *
2159 	 * The manager thread knows about this and is willing to create
2160 	 * at least one thread to accommodate us.
2161 	 */
2162 	mutex_enter(&mi->mi_async_lock);
2163 	if (mi->mi_inactive_thread == NULL) {
2164 		rnode4_t *rp;
2165 		vnode_t *unldvp = NULL;
2166 		char *unlname;
2167 		cred_t *unlcred;
2168 
2169 		mutex_exit(&mi->mi_async_lock);
2170 		/*
2171 		 * We just need to free up the memory associated with the
2172 		 * vnode, which can be safely done from within the current
2173 		 * context.
2174 		 */
2175 		crfree(cr);	/* drop our reference */
2176 		kmem_free(args, sizeof (*args));
2177 		rp = VTOR4(vp);
2178 		mutex_enter(&rp->r_statelock);
2179 		if (rp->r_unldvp != NULL) {
2180 			unldvp = rp->r_unldvp;
2181 			rp->r_unldvp = NULL;
2182 			unlname = rp->r_unlname;
2183 			rp->r_unlname = NULL;
2184 			unlcred = rp->r_unlcred;
2185 			rp->r_unlcred = NULL;
2186 		}
2187 		mutex_exit(&rp->r_statelock);
2188 		/*
2189 		 * No need to explicitly throw away any cached pages.  The
2190 		 * eventual r4inactive() will attempt a synchronous
2191 		 * VOP_PUTPAGE() which will immediately fail since the request
2192 		 * is coming from the wrong zone, and then will proceed to call
2193 		 * nfs4_invalidate_pages() which will clean things up for us.
2194 		 *
2195 		 * Throw away the delegation here so rp4_addfree()'s attempt to
2196 		 * return any existing delegations becomes a no-op.
2197 		 */
2198 		if (rp->r_deleg_type != OPEN_DELEGATE_NONE) {
2199 			(void) nfs_rw_enter_sig(&mi->mi_recovlock, RW_READER,
2200 			    FALSE);
2201 			(void) nfs4delegreturn(rp, NFS4_DR_DISCARD);
2202 			nfs_rw_exit(&mi->mi_recovlock);
2203 		}
2204 		nfs4_clear_open_streams(rp);
2205 
2206 		rp4_addfree(rp, cr);
2207 		if (unldvp != NULL) {
2208 			kmem_free(unlname, MAXNAMELEN);
2209 			VN_RELE(unldvp);
2210 			crfree(unlcred);
2211 		}
2212 		return;
2213 	}
2214 
2215 	if (mi->mi_manager_thread == NULL) {
2216 		/*
2217 		 * We want to talk to the inactive thread.
2218 		 */
2219 		signal_inactive_thread = B_TRUE;
2220 	}
2221 
2222 	/*
2223 	 * Enqueue the vnode and wake up either the special thread (empty
2224 	 * list) or an async thread.
2225 	 */
2226 	if (mi->mi_async_reqs[NFS4_INACTIVE] == NULL) {
2227 		mi->mi_async_reqs[NFS4_INACTIVE] = args;
2228 		mi->mi_async_tail[NFS4_INACTIVE] = args;
2229 		signal_inactive_thread = B_TRUE;
2230 	} else {
2231 		mi->mi_async_tail[NFS4_INACTIVE]->a_next = args;
2232 		mi->mi_async_tail[NFS4_INACTIVE] = args;
2233 	}
2234 	if (signal_inactive_thread) {
2235 		cv_signal(&mi->mi_inact_req_cv);
2236 	} else  {
2237 		mi->mi_async_req_count++;
2238 		ASSERT(mi->mi_async_req_count != 0);
2239 		cv_signal(&mi->mi_async_reqs_cv);
2240 	}
2241 
2242 	mutex_exit(&mi->mi_async_lock);
2243 }
2244 
2245 int
2246 writerp4(rnode4_t *rp, caddr_t base, int tcount, struct uio *uio, int pgcreated)
2247 {
2248 	int pagecreate;
2249 	int n;
2250 	int saved_n;
2251 	caddr_t saved_base;
2252 	u_offset_t offset;
2253 	int error;
2254 	int sm_error;
2255 	vnode_t *vp = RTOV(rp);
2256 
2257 	ASSERT(tcount <= MAXBSIZE && tcount <= uio->uio_resid);
2258 	ASSERT(nfs_rw_lock_held(&rp->r_rwlock, RW_WRITER));
2259 	if (!vpm_enable) {
2260 		ASSERT(((uintptr_t)base & MAXBOFFSET) + tcount <= MAXBSIZE);
2261 	}
2262 
2263 	/*
2264 	 * Move bytes in at most PAGESIZE chunks. We must avoid
2265 	 * spanning pages in uiomove() because page faults may cause
2266 	 * the cache to be invalidated out from under us. The r_size is not
2267 	 * updated until after the uiomove. If we push the last page of a
2268 	 * file before r_size is correct, we will lose the data written past
2269 	 * the current (and invalid) r_size.
2270 	 */
2271 	do {
2272 		offset = uio->uio_loffset;
2273 		pagecreate = 0;
2274 
2275 		/*
2276 		 * n is the number of bytes required to satisfy the request
2277 		 *   or the number of bytes to fill out the page.
2278 		 */
2279 		n = (int)MIN((PAGESIZE - (offset & PAGEOFFSET)), tcount);
2280 
2281 		/*
2282 		 * Check to see if we can skip reading in the page
2283 		 * and just allocate the memory.  We can do this
2284 		 * if we are going to rewrite the entire mapping
2285 		 * or if we are going to write to or beyond the current
2286 		 * end of file from the beginning of the mapping.
2287 		 *
2288 		 * The read of r_size is now protected by r_statelock.
2289 		 */
2290 		mutex_enter(&rp->r_statelock);
2291 		/*
2292 		 * When pgcreated is nonzero the caller has already done
2293 		 * a segmap_getmapflt with forcefault 0 and S_WRITE. With
2294 		 * segkpm this means we already have at least one page
2295 		 * created and mapped at base.
2296 		 */
2297 		pagecreate = pgcreated ||
2298 		    ((offset & PAGEOFFSET) == 0 &&
2299 		    (n == PAGESIZE || ((offset + n) >= rp->r_size)));
2300 
2301 		mutex_exit(&rp->r_statelock);
2302 
2303 		if (!vpm_enable && pagecreate) {
2304 			/*
2305 			 * The last argument tells segmap_pagecreate() to
2306 			 * always lock the page, as opposed to sometimes
2307 			 * returning with the page locked. This way we avoid a
2308 			 * fault on the ensuing uiomove(), but also
2309 			 * more importantly (to fix bug 1094402) we can
2310 			 * call segmap_fault() to unlock the page in all
2311 			 * cases. An alternative would be to modify
2312 			 * segmap_pagecreate() to tell us when it is
2313 			 * locking a page, but that's a fairly major
2314 			 * interface change.
2315 			 */
2316 			if (pgcreated == 0)
2317 				(void) segmap_pagecreate(segkmap, base,
2318 				    (uint_t)n, 1);
2319 			saved_base = base;
2320 			saved_n = n;
2321 		}
2322 
2323 		/*
2324 		 * The number of bytes of data in the last page can not
2325 		 * be accurately be determined while page is being
2326 		 * uiomove'd to and the size of the file being updated.
2327 		 * Thus, inform threads which need to know accurately
2328 		 * how much data is in the last page of the file.  They
2329 		 * will not do the i/o immediately, but will arrange for
2330 		 * the i/o to happen later when this modify operation
2331 		 * will have finished.
2332 		 */
2333 		ASSERT(!(rp->r_flags & R4MODINPROGRESS));
2334 		mutex_enter(&rp->r_statelock);
2335 		rp->r_flags |= R4MODINPROGRESS;
2336 		rp->r_modaddr = (offset & MAXBMASK);
2337 		mutex_exit(&rp->r_statelock);
2338 
2339 		if (vpm_enable) {
2340 			/*
2341 			 * Copy data. If new pages are created, part of
2342 			 * the page that is not written will be initizliazed
2343 			 * with zeros.
2344 			 */
2345 			error = vpm_data_copy(vp, offset, n, uio,
2346 			    !pagecreate, NULL, 0, S_WRITE);
2347 		} else {
2348 			error = uiomove(base, n, UIO_WRITE, uio);
2349 		}
2350 
2351 		/*
2352 		 * r_size is the maximum number of
2353 		 * bytes known to be in the file.
2354 		 * Make sure it is at least as high as the
2355 		 * first unwritten byte pointed to by uio_loffset.
2356 		 */
2357 		mutex_enter(&rp->r_statelock);
2358 		if (rp->r_size < uio->uio_loffset)
2359 			rp->r_size = uio->uio_loffset;
2360 		rp->r_flags &= ~R4MODINPROGRESS;
2361 		rp->r_flags |= R4DIRTY;
2362 		mutex_exit(&rp->r_statelock);
2363 
2364 		/* n = # of bytes written */
2365 		n = (int)(uio->uio_loffset - offset);
2366 
2367 		if (!vpm_enable) {
2368 			base += n;
2369 		}
2370 
2371 		tcount -= n;
2372 		/*
2373 		 * If we created pages w/o initializing them completely,
2374 		 * we need to zero the part that wasn't set up.
2375 		 * This happens on a most EOF write cases and if
2376 		 * we had some sort of error during the uiomove.
2377 		 */
2378 		if (!vpm_enable && pagecreate) {
2379 			if ((uio->uio_loffset & PAGEOFFSET) || n == 0)
2380 				(void) kzero(base, PAGESIZE - n);
2381 
2382 			if (pgcreated) {
2383 				/*
2384 				 * Caller is responsible for this page,
2385 				 * it was not created in this loop.
2386 				 */
2387 				pgcreated = 0;
2388 			} else {
2389 				/*
2390 				 * For bug 1094402: segmap_pagecreate locks
2391 				 * page. Unlock it. This also unlocks the
2392 				 * pages allocated by page_create_va() in
2393 				 * segmap_pagecreate().
2394 				 */
2395 				sm_error = segmap_fault(kas.a_hat, segkmap,
2396 				    saved_base, saved_n,
2397 				    F_SOFTUNLOCK, S_WRITE);
2398 				if (error == 0)
2399 					error = sm_error;
2400 			}
2401 		}
2402 	} while (tcount > 0 && error == 0);
2403 
2404 	return (error);
2405 }
2406 
2407 int
2408 nfs4_putpages(vnode_t *vp, u_offset_t off, size_t len, int flags, cred_t *cr)
2409 {
2410 	rnode4_t *rp;
2411 	page_t *pp;
2412 	u_offset_t eoff;
2413 	u_offset_t io_off;
2414 	size_t io_len;
2415 	int error;
2416 	int rdirty;
2417 	int err;
2418 
2419 	rp = VTOR4(vp);
2420 	ASSERT(rp->r_count > 0);
2421 
2422 	if (!nfs4_has_pages(vp))
2423 		return (0);
2424 
2425 	ASSERT(vp->v_type != VCHR);
2426 
2427 	/*
2428 	 * If R4OUTOFSPACE is set, then all writes turn into B_INVAL
2429 	 * writes.  B_FORCE is set to force the VM system to actually
2430 	 * invalidate the pages, even if the i/o failed.  The pages
2431 	 * need to get invalidated because they can't be written out
2432 	 * because there isn't any space left on either the server's
2433 	 * file system or in the user's disk quota.  The B_FREE bit
2434 	 * is cleared to avoid confusion as to whether this is a
2435 	 * request to place the page on the freelist or to destroy
2436 	 * it.
2437 	 */
2438 	if ((rp->r_flags & R4OUTOFSPACE) ||
2439 	    (vp->v_vfsp->vfs_flag & VFS_UNMOUNTED))
2440 		flags = (flags & ~B_FREE) | B_INVAL | B_FORCE;
2441 
2442 	if (len == 0) {
2443 		/*
2444 		 * If doing a full file synchronous operation, then clear
2445 		 * the R4DIRTY bit.  If a page gets dirtied while the flush
2446 		 * is happening, then R4DIRTY will get set again.  The
2447 		 * R4DIRTY bit must get cleared before the flush so that
2448 		 * we don't lose this information.
2449 		 *
2450 		 * If there are no full file async write operations
2451 		 * pending and RDIRTY bit is set, clear it.
2452 		 */
2453 		if (off == (u_offset_t)0 &&
2454 		    !(flags & B_ASYNC) &&
2455 		    (rp->r_flags & R4DIRTY)) {
2456 			mutex_enter(&rp->r_statelock);
2457 			rdirty = (rp->r_flags & R4DIRTY);
2458 			rp->r_flags &= ~R4DIRTY;
2459 			mutex_exit(&rp->r_statelock);
2460 		} else if (flags & B_ASYNC && off == (u_offset_t)0) {
2461 			mutex_enter(&rp->r_statelock);
2462 			if (rp->r_flags & R4DIRTY && rp->r_awcount == 0) {
2463 				rdirty = (rp->r_flags & R4DIRTY);
2464 				rp->r_flags &= ~R4DIRTY;
2465 			}
2466 			mutex_exit(&rp->r_statelock);
2467 		} else
2468 			rdirty = 0;
2469 
2470 		/*
2471 		 * Search the entire vp list for pages >= off, and flush
2472 		 * the dirty pages.
2473 		 */
2474 		error = pvn_vplist_dirty(vp, off, rp->r_putapage,
2475 		    flags, cr);
2476 
2477 		/*
2478 		 * If an error occurred and the file was marked as dirty
2479 		 * before and we aren't forcibly invalidating pages, then
2480 		 * reset the R4DIRTY flag.
2481 		 */
2482 		if (error && rdirty &&
2483 		    (flags & (B_INVAL | B_FORCE)) != (B_INVAL | B_FORCE)) {
2484 			mutex_enter(&rp->r_statelock);
2485 			rp->r_flags |= R4DIRTY;
2486 			mutex_exit(&rp->r_statelock);
2487 		}
2488 	} else {
2489 		/*
2490 		 * Do a range from [off...off + len) looking for pages
2491 		 * to deal with.
2492 		 */
2493 		error = 0;
2494 		io_len = 0;
2495 		eoff = off + len;
2496 		mutex_enter(&rp->r_statelock);
2497 		for (io_off = off; io_off < eoff && io_off < rp->r_size;
2498 		    io_off += io_len) {
2499 			mutex_exit(&rp->r_statelock);
2500 			/*
2501 			 * If we are not invalidating, synchronously
2502 			 * freeing or writing pages use the routine
2503 			 * page_lookup_nowait() to prevent reclaiming
2504 			 * them from the free list.
2505 			 */
2506 			if ((flags & B_INVAL) || !(flags & B_ASYNC)) {
2507 				pp = page_lookup(vp, io_off,
2508 				    (flags & (B_INVAL | B_FREE)) ?
2509 				    SE_EXCL : SE_SHARED);
2510 			} else {
2511 				pp = page_lookup_nowait(vp, io_off,
2512 				    (flags & B_FREE) ? SE_EXCL : SE_SHARED);
2513 			}
2514 
2515 			if (pp == NULL || !pvn_getdirty(pp, flags))
2516 				io_len = PAGESIZE;
2517 			else {
2518 				err = (*rp->r_putapage)(vp, pp, &io_off,
2519 				    &io_len, flags, cr);
2520 				if (!error)
2521 					error = err;
2522 				/*
2523 				 * "io_off" and "io_len" are returned as
2524 				 * the range of pages we actually wrote.
2525 				 * This allows us to skip ahead more quickly
2526 				 * since several pages may've been dealt
2527 				 * with by this iteration of the loop.
2528 				 */
2529 			}
2530 			mutex_enter(&rp->r_statelock);
2531 		}
2532 		mutex_exit(&rp->r_statelock);
2533 	}
2534 
2535 	return (error);
2536 }
2537 
2538 void
2539 nfs4_invalidate_pages(vnode_t *vp, u_offset_t off, cred_t *cr)
2540 {
2541 	rnode4_t *rp;
2542 
2543 	rp = VTOR4(vp);
2544 	if (IS_SHADOW(vp, rp))
2545 		vp = RTOV4(rp);
2546 	mutex_enter(&rp->r_statelock);
2547 	while (rp->r_flags & R4TRUNCATE)
2548 		cv_wait(&rp->r_cv, &rp->r_statelock);
2549 	rp->r_flags |= R4TRUNCATE;
2550 	if (off == (u_offset_t)0) {
2551 		rp->r_flags &= ~R4DIRTY;
2552 		if (!(rp->r_flags & R4STALE))
2553 			rp->r_error = 0;
2554 	}
2555 	rp->r_truncaddr = off;
2556 	mutex_exit(&rp->r_statelock);
2557 	(void) pvn_vplist_dirty(vp, off, rp->r_putapage,
2558 	    B_INVAL | B_TRUNC, cr);
2559 	mutex_enter(&rp->r_statelock);
2560 	rp->r_flags &= ~R4TRUNCATE;
2561 	cv_broadcast(&rp->r_cv);
2562 	mutex_exit(&rp->r_statelock);
2563 }
2564 
2565 static int
2566 nfs4_mnt_kstat_update(kstat_t *ksp, int rw)
2567 {
2568 	mntinfo4_t *mi;
2569 	struct mntinfo_kstat *mik;
2570 	vfs_t *vfsp;
2571 
2572 	/* this is a read-only kstat. Bail out on a write */
2573 	if (rw == KSTAT_WRITE)
2574 		return (EACCES);
2575 
2576 
2577 	/*
2578 	 * We don't want to wait here as kstat_chain_lock could be held by
2579 	 * dounmount(). dounmount() takes vfs_reflock before the chain lock
2580 	 * and thus could lead to a deadlock.
2581 	 */
2582 	vfsp = (struct vfs *)ksp->ks_private;
2583 
2584 	mi = VFTOMI4(vfsp);
2585 	mik = (struct mntinfo_kstat *)ksp->ks_data;
2586 
2587 	(void) strcpy(mik->mik_proto, mi->mi_curr_serv->sv_knconf->knc_proto);
2588 
2589 	mik->mik_vers = (uint32_t)mi->mi_vers;
2590 	mik->mik_flags = mi->mi_flags;
2591 	/*
2592 	 * The sv_secdata holds the flavor the client specifies.
2593 	 * If the client uses default and a security negotiation
2594 	 * occurs, sv_currsec will point to the current flavor
2595 	 * selected from the server flavor list.
2596 	 * sv_currsec is NULL if no security negotiation takes place.
2597 	 */
2598 	mik->mik_secmod = mi->mi_curr_serv->sv_currsec ?
2599 	    mi->mi_curr_serv->sv_currsec->secmod :
2600 	    mi->mi_curr_serv->sv_secdata->secmod;
2601 	mik->mik_curread = (uint32_t)mi->mi_curread;
2602 	mik->mik_curwrite = (uint32_t)mi->mi_curwrite;
2603 	mik->mik_retrans = mi->mi_retrans;
2604 	mik->mik_timeo = mi->mi_timeo;
2605 	mik->mik_acregmin = HR2SEC(mi->mi_acregmin);
2606 	mik->mik_acregmax = HR2SEC(mi->mi_acregmax);
2607 	mik->mik_acdirmin = HR2SEC(mi->mi_acdirmin);
2608 	mik->mik_acdirmax = HR2SEC(mi->mi_acdirmax);
2609 	mik->mik_noresponse = (uint32_t)mi->mi_noresponse;
2610 	mik->mik_failover = (uint32_t)mi->mi_failover;
2611 	mik->mik_remap = (uint32_t)mi->mi_remap;
2612 
2613 	(void) strcpy(mik->mik_curserver, mi->mi_curr_serv->sv_hostname);
2614 
2615 	return (0);
2616 }
2617 
2618 void
2619 nfs4_mnt_kstat_init(struct vfs *vfsp)
2620 {
2621 	mntinfo4_t *mi = VFTOMI4(vfsp);
2622 
2623 	/*
2624 	 * PSARC 2001/697 Contract Private Interface
2625 	 * All nfs kstats are under SunMC contract
2626 	 * Please refer to the PSARC listed above and contact
2627 	 * SunMC before making any changes!
2628 	 *
2629 	 * Changes must be reviewed by Solaris File Sharing
2630 	 * Changes must be communicated to contract-2001-697@sun.com
2631 	 *
2632 	 */
2633 
2634 	mi->mi_io_kstats = kstat_create_zone("nfs", getminor(vfsp->vfs_dev),
2635 	    NULL, "nfs", KSTAT_TYPE_IO, 1, 0, mi->mi_zone->zone_id);
2636 	if (mi->mi_io_kstats) {
2637 		if (mi->mi_zone->zone_id != GLOBAL_ZONEID)
2638 			kstat_zone_add(mi->mi_io_kstats, GLOBAL_ZONEID);
2639 		mi->mi_io_kstats->ks_lock = &mi->mi_lock;
2640 		kstat_install(mi->mi_io_kstats);
2641 	}
2642 
2643 	if ((mi->mi_ro_kstats = kstat_create_zone("nfs",
2644 	    getminor(vfsp->vfs_dev), "mntinfo", "misc", KSTAT_TYPE_RAW,
2645 	    sizeof (struct mntinfo_kstat), 0, mi->mi_zone->zone_id)) != NULL) {
2646 		if (mi->mi_zone->zone_id != GLOBAL_ZONEID)
2647 			kstat_zone_add(mi->mi_ro_kstats, GLOBAL_ZONEID);
2648 		mi->mi_ro_kstats->ks_update = nfs4_mnt_kstat_update;
2649 		mi->mi_ro_kstats->ks_private = (void *)vfsp;
2650 		kstat_install(mi->mi_ro_kstats);
2651 	}
2652 
2653 	nfs4_mnt_recov_kstat_init(vfsp);
2654 }
2655 
2656 void
2657 nfs4_write_error(vnode_t *vp, int error, cred_t *cr)
2658 {
2659 	mntinfo4_t *mi;
2660 	clock_t now = ddi_get_lbolt();
2661 
2662 	mi = VTOMI4(vp);
2663 	/*
2664 	 * In case of forced unmount, do not print any messages
2665 	 * since it can flood the console with error messages.
2666 	 */
2667 	if (mi->mi_vfsp->vfs_flag & VFS_UNMOUNTED)
2668 		return;
2669 
2670 	/*
2671 	 * If the mount point is dead, not recoverable, do not
2672 	 * print error messages that can flood the console.
2673 	 */
2674 	if (mi->mi_flags & MI4_RECOV_FAIL)
2675 		return;
2676 
2677 	/*
2678 	 * No use in flooding the console with ENOSPC
2679 	 * messages from the same file system.
2680 	 */
2681 	if ((error != ENOSPC && error != EDQUOT) ||
2682 	    now - mi->mi_printftime > 0) {
2683 		zoneid_t zoneid = mi->mi_zone->zone_id;
2684 
2685 #ifdef DEBUG
2686 		nfs_perror(error, "NFS%ld write error on host %s: %m.\n",
2687 		    mi->mi_vers, VTOR4(vp)->r_server->sv_hostname, NULL);
2688 #else
2689 		nfs_perror(error, "NFS write error on host %s: %m.\n",
2690 		    VTOR4(vp)->r_server->sv_hostname, NULL);
2691 #endif
2692 		if (error == ENOSPC || error == EDQUOT) {
2693 			zcmn_err(zoneid, CE_CONT,
2694 			    "^File: userid=%d, groupid=%d\n",
2695 			    crgetuid(cr), crgetgid(cr));
2696 			if (crgetuid(curthread->t_cred) != crgetuid(cr) ||
2697 			    crgetgid(curthread->t_cred) != crgetgid(cr)) {
2698 				zcmn_err(zoneid, CE_CONT,
2699 				    "^User: userid=%d, groupid=%d\n",
2700 				    crgetuid(curthread->t_cred),
2701 				    crgetgid(curthread->t_cred));
2702 			}
2703 			mi->mi_printftime = now +
2704 			    nfs_write_error_interval * hz;
2705 		}
2706 		sfh4_printfhandle(VTOR4(vp)->r_fh);
2707 #ifdef DEBUG
2708 		if (error == EACCES) {
2709 			zcmn_err(zoneid, CE_CONT,
2710 			    "nfs_bio: cred is%s kcred\n",
2711 			    cr == kcred ? "" : " not");
2712 		}
2713 #endif
2714 	}
2715 }
2716 
2717 /*
2718  * Return non-zero if the given file can be safely memory mapped.  Locks
2719  * are safe if whole-file (length and offset are both zero).
2720  */
2721 
2722 #define	SAFE_LOCK(flk)	((flk).l_start == 0 && (flk).l_len == 0)
2723 
2724 static int
2725 nfs4_safemap(const vnode_t *vp)
2726 {
2727 	locklist_t	*llp, *next_llp;
2728 	int		safe = 1;
2729 	rnode4_t	*rp = VTOR4(vp);
2730 
2731 	ASSERT(nfs_rw_lock_held(&rp->r_lkserlock, RW_WRITER));
2732 
2733 	NFS4_DEBUG(nfs4_client_map_debug, (CE_NOTE, "nfs4_safemap: "
2734 	    "vp = %p", (void *)vp));
2735 
2736 	/*
2737 	 * Review all the locks for the vnode, both ones that have been
2738 	 * acquired and ones that are pending.  We assume that
2739 	 * flk_active_locks_for_vp() has merged any locks that can be
2740 	 * merged (so that if a process has the entire file locked, it is
2741 	 * represented as a single lock).
2742 	 *
2743 	 * Note that we can't bail out of the loop if we find a non-safe
2744 	 * lock, because we have to free all the elements in the llp list.
2745 	 * We might be able to speed up this code slightly by not looking
2746 	 * at each lock's l_start and l_len fields once we've found a
2747 	 * non-safe lock.
2748 	 */
2749 
2750 	llp = flk_active_locks_for_vp(vp);
2751 	while (llp) {
2752 		NFS4_DEBUG(nfs4_client_map_debug, (CE_NOTE,
2753 		    "nfs4_safemap: active lock (%" PRId64 ", %" PRId64 ")",
2754 		    llp->ll_flock.l_start, llp->ll_flock.l_len));
2755 		if (!SAFE_LOCK(llp->ll_flock)) {
2756 			safe = 0;
2757 			NFS4_DEBUG(nfs4_client_map_debug, (CE_NOTE,
2758 			    "nfs4_safemap: unsafe active lock (%" PRId64
2759 			    ", %" PRId64 ")", llp->ll_flock.l_start,
2760 			    llp->ll_flock.l_len));
2761 		}
2762 		next_llp = llp->ll_next;
2763 		VN_RELE(llp->ll_vp);
2764 		kmem_free(llp, sizeof (*llp));
2765 		llp = next_llp;
2766 	}
2767 
2768 	NFS4_DEBUG(nfs4_client_map_debug, (CE_NOTE, "nfs4_safemap: %s",
2769 	    safe ? "safe" : "unsafe"));
2770 	return (safe);
2771 }
2772 
2773 /*
2774  * Return whether there is a lost LOCK or LOCKU queued up for the given
2775  * file that would make an mmap request unsafe.  cf. nfs4_safemap().
2776  */
2777 
2778 bool_t
2779 nfs4_map_lost_lock_conflict(vnode_t *vp)
2780 {
2781 	bool_t conflict = FALSE;
2782 	nfs4_lost_rqst_t *lrp;
2783 	mntinfo4_t *mi = VTOMI4(vp);
2784 
2785 	mutex_enter(&mi->mi_lock);
2786 	for (lrp = list_head(&mi->mi_lost_state); lrp != NULL;
2787 	    lrp = list_next(&mi->mi_lost_state, lrp)) {
2788 		if (lrp->lr_op != OP_LOCK && lrp->lr_op != OP_LOCKU)
2789 			continue;
2790 		ASSERT(lrp->lr_vp != NULL);
2791 		if (!VOP_CMP(lrp->lr_vp, vp, NULL))
2792 			continue;	/* different file */
2793 		if (!SAFE_LOCK(*lrp->lr_flk)) {
2794 			conflict = TRUE;
2795 			break;
2796 		}
2797 	}
2798 
2799 	mutex_exit(&mi->mi_lock);
2800 	return (conflict);
2801 }
2802 
2803 /*
2804  * nfs_lockcompletion:
2805  *
2806  * If the vnode has a lock that makes it unsafe to cache the file, mark it
2807  * as non cachable (set VNOCACHE bit).
2808  */
2809 
2810 void
2811 nfs4_lockcompletion(vnode_t *vp, int cmd)
2812 {
2813 	rnode4_t *rp = VTOR4(vp);
2814 
2815 	ASSERT(nfs_rw_lock_held(&rp->r_lkserlock, RW_WRITER));
2816 	ASSERT(!IS_SHADOW(vp, rp));
2817 
2818 	if (cmd == F_SETLK || cmd == F_SETLKW) {
2819 
2820 		if (!nfs4_safemap(vp)) {
2821 			mutex_enter(&vp->v_lock);
2822 			vp->v_flag |= VNOCACHE;
2823 			mutex_exit(&vp->v_lock);
2824 		} else {
2825 			mutex_enter(&vp->v_lock);
2826 			vp->v_flag &= ~VNOCACHE;
2827 			mutex_exit(&vp->v_lock);
2828 		}
2829 	}
2830 	/*
2831 	 * The cached attributes of the file are stale after acquiring
2832 	 * the lock on the file. They were updated when the file was
2833 	 * opened, but not updated when the lock was acquired. Therefore the
2834 	 * cached attributes are invalidated after the lock is obtained.
2835 	 */
2836 	PURGE_ATTRCACHE4(vp);
2837 }
2838 
2839 /* ARGSUSED */
2840 static void *
2841 nfs4_mi_init(zoneid_t zoneid)
2842 {
2843 	struct mi4_globals *mig;
2844 
2845 	mig = kmem_alloc(sizeof (*mig), KM_SLEEP);
2846 	mutex_init(&mig->mig_lock, NULL, MUTEX_DEFAULT, NULL);
2847 	list_create(&mig->mig_list, sizeof (mntinfo4_t),
2848 	    offsetof(mntinfo4_t, mi_zone_node));
2849 	mig->mig_destructor_called = B_FALSE;
2850 	return (mig);
2851 }
2852 
2853 /*
2854  * Callback routine to tell all NFSv4 mounts in the zone to start tearing down
2855  * state and killing off threads.
2856  */
2857 /* ARGSUSED */
2858 static void
2859 nfs4_mi_shutdown(zoneid_t zoneid, void *data)
2860 {
2861 	struct mi4_globals *mig = data;
2862 	mntinfo4_t *mi;
2863 	nfs4_server_t *np;
2864 
2865 	NFS4_DEBUG(nfs4_client_zone_debug, (CE_NOTE,
2866 	    "nfs4_mi_shutdown zone %d\n", zoneid));
2867 	ASSERT(mig != NULL);
2868 	for (;;) {
2869 		mutex_enter(&mig->mig_lock);
2870 		mi = list_head(&mig->mig_list);
2871 		if (mi == NULL) {
2872 			mutex_exit(&mig->mig_lock);
2873 			break;
2874 		}
2875 
2876 		NFS4_DEBUG(nfs4_client_zone_debug, (CE_NOTE,
2877 		    "nfs4_mi_shutdown stopping vfs %p\n", (void *)mi->mi_vfsp));
2878 		/*
2879 		 * purge the DNLC for this filesystem
2880 		 */
2881 		(void) dnlc_purge_vfsp(mi->mi_vfsp, 0);
2882 		/*
2883 		 * Tell existing async worker threads to exit.
2884 		 */
2885 		mutex_enter(&mi->mi_async_lock);
2886 		mi->mi_max_threads = 0;
2887 		NFS4_WAKEALL_ASYNC_WORKERS(mi->mi_async_work_cv);
2888 		/*
2889 		 * Set the appropriate flags, signal and wait for both the
2890 		 * async manager and the inactive thread to exit when they're
2891 		 * done with their current work.
2892 		 */
2893 		mutex_enter(&mi->mi_lock);
2894 		mi->mi_flags |= (MI4_ASYNC_MGR_STOP|MI4_DEAD);
2895 		mutex_exit(&mi->mi_lock);
2896 		mutex_exit(&mi->mi_async_lock);
2897 		if (mi->mi_manager_thread) {
2898 			nfs4_async_manager_stop(mi->mi_vfsp);
2899 		}
2900 		if (mi->mi_inactive_thread) {
2901 			mutex_enter(&mi->mi_async_lock);
2902 			cv_signal(&mi->mi_inact_req_cv);
2903 			/*
2904 			 * Wait for the inactive thread to exit.
2905 			 */
2906 			while (mi->mi_inactive_thread != NULL) {
2907 				cv_wait(&mi->mi_async_cv, &mi->mi_async_lock);
2908 			}
2909 			mutex_exit(&mi->mi_async_lock);
2910 		}
2911 		/*
2912 		 * Wait for the recovery thread to complete, that is, it will
2913 		 * signal when it is done using the "mi" structure and about
2914 		 * to exit
2915 		 */
2916 		mutex_enter(&mi->mi_lock);
2917 		while (mi->mi_in_recovery > 0)
2918 			cv_wait(&mi->mi_cv_in_recov, &mi->mi_lock);
2919 		mutex_exit(&mi->mi_lock);
2920 		/*
2921 		 * We're done when every mi has been done or the list is empty.
2922 		 * This one is done, remove it from the list.
2923 		 */
2924 		list_remove(&mig->mig_list, mi);
2925 		mutex_exit(&mig->mig_lock);
2926 		zone_rele_ref(&mi->mi_zone_ref, ZONE_REF_NFSV4);
2927 
2928 		/*
2929 		 * Release hold on vfs and mi done to prevent race with zone
2930 		 * shutdown. This releases the hold in nfs4_mi_zonelist_add.
2931 		 */
2932 		VFS_RELE(mi->mi_vfsp);
2933 		MI4_RELE(mi);
2934 	}
2935 	/*
2936 	 * Tell each renew thread in the zone to exit
2937 	 */
2938 	mutex_enter(&nfs4_server_lst_lock);
2939 	for (np = nfs4_server_lst.forw; np != &nfs4_server_lst; np = np->forw) {
2940 		mutex_enter(&np->s_lock);
2941 		if (np->zoneid == zoneid) {
2942 			/*
2943 			 * We add another hold onto the nfs4_server_t
2944 			 * because this will make sure tha the nfs4_server_t
2945 			 * stays around until nfs4_callback_fini_zone destroys
2946 			 * the zone. This way, the renew thread can
2947 			 * unconditionally release its holds on the
2948 			 * nfs4_server_t.
2949 			 */
2950 			np->s_refcnt++;
2951 			nfs4_mark_srv_dead(np);
2952 		}
2953 		mutex_exit(&np->s_lock);
2954 	}
2955 	mutex_exit(&nfs4_server_lst_lock);
2956 }
2957 
2958 static void
2959 nfs4_mi_free_globals(struct mi4_globals *mig)
2960 {
2961 	list_destroy(&mig->mig_list);	/* makes sure the list is empty */
2962 	mutex_destroy(&mig->mig_lock);
2963 	kmem_free(mig, sizeof (*mig));
2964 }
2965 
2966 /* ARGSUSED */
2967 static void
2968 nfs4_mi_destroy(zoneid_t zoneid, void *data)
2969 {
2970 	struct mi4_globals *mig = data;
2971 
2972 	NFS4_DEBUG(nfs4_client_zone_debug, (CE_NOTE,
2973 	    "nfs4_mi_destroy zone %d\n", zoneid));
2974 	ASSERT(mig != NULL);
2975 	mutex_enter(&mig->mig_lock);
2976 	if (list_head(&mig->mig_list) != NULL) {
2977 		/* Still waiting for VFS_FREEVFS() */
2978 		mig->mig_destructor_called = B_TRUE;
2979 		mutex_exit(&mig->mig_lock);
2980 		return;
2981 	}
2982 	nfs4_mi_free_globals(mig);
2983 }
2984 
2985 /*
2986  * Add an NFS mount to the per-zone list of NFS mounts.
2987  */
2988 void
2989 nfs4_mi_zonelist_add(mntinfo4_t *mi)
2990 {
2991 	struct mi4_globals *mig;
2992 
2993 	mig = zone_getspecific(mi4_list_key, mi->mi_zone);
2994 	mutex_enter(&mig->mig_lock);
2995 	list_insert_head(&mig->mig_list, mi);
2996 	/*
2997 	 * hold added to eliminate race with zone shutdown -this will be
2998 	 * released in mi_shutdown
2999 	 */
3000 	MI4_HOLD(mi);
3001 	VFS_HOLD(mi->mi_vfsp);
3002 	mutex_exit(&mig->mig_lock);
3003 }
3004 
3005 /*
3006  * Remove an NFS mount from the per-zone list of NFS mounts.
3007  */
3008 int
3009 nfs4_mi_zonelist_remove(mntinfo4_t *mi)
3010 {
3011 	struct mi4_globals *mig;
3012 	int ret = 0;
3013 
3014 	mig = zone_getspecific(mi4_list_key, mi->mi_zone);
3015 	mutex_enter(&mig->mig_lock);
3016 	mutex_enter(&mi->mi_lock);
3017 	/* if this mi is marked dead, then the zone already released it */
3018 	if (!(mi->mi_flags & MI4_DEAD)) {
3019 		list_remove(&mig->mig_list, mi);
3020 		mutex_exit(&mi->mi_lock);
3021 
3022 		/* release the holds put on in zonelist_add(). */
3023 		VFS_RELE(mi->mi_vfsp);
3024 		MI4_RELE(mi);
3025 		ret = 1;
3026 	} else {
3027 		mutex_exit(&mi->mi_lock);
3028 	}
3029 
3030 	/*
3031 	 * We can be called asynchronously by VFS_FREEVFS() after the zone
3032 	 * shutdown/destroy callbacks have executed; if so, clean up the zone's
3033 	 * mi globals.
3034 	 */
3035 	if (list_head(&mig->mig_list) == NULL &&
3036 	    mig->mig_destructor_called == B_TRUE) {
3037 		nfs4_mi_free_globals(mig);
3038 		return (ret);
3039 	}
3040 	mutex_exit(&mig->mig_lock);
3041 	return (ret);
3042 }
3043 
3044 void
3045 nfs_free_mi4(mntinfo4_t *mi)
3046 {
3047 	nfs4_open_owner_t	*foop;
3048 	nfs4_oo_hash_bucket_t   *bucketp;
3049 	nfs4_debug_msg_t	*msgp;
3050 	int i;
3051 	servinfo4_t 		*svp;
3052 
3053 	/*
3054 	 * Code introduced here should be carefully evaluated to make
3055 	 * sure none of the freed resources are accessed either directly
3056 	 * or indirectly after freeing them. For eg: Introducing calls to
3057 	 * NFS4_DEBUG that use mntinfo4_t structure member after freeing
3058 	 * the structure members or other routines calling back into NFS
3059 	 * accessing freed mntinfo4_t structure member.
3060 	 */
3061 	mutex_enter(&mi->mi_lock);
3062 	ASSERT(mi->mi_recovthread == NULL);
3063 	ASSERT(mi->mi_flags & MI4_ASYNC_MGR_STOP);
3064 	mutex_exit(&mi->mi_lock);
3065 	mutex_enter(&mi->mi_async_lock);
3066 	ASSERT(mi->mi_threads[NFS4_ASYNC_QUEUE] == 0 &&
3067 	    mi->mi_threads[NFS4_ASYNC_PGOPS_QUEUE] == 0);
3068 	ASSERT(mi->mi_manager_thread == NULL);
3069 	mutex_exit(&mi->mi_async_lock);
3070 	if (mi->mi_io_kstats) {
3071 		kstat_delete(mi->mi_io_kstats);
3072 		mi->mi_io_kstats = NULL;
3073 	}
3074 	if (mi->mi_ro_kstats) {
3075 		kstat_delete(mi->mi_ro_kstats);
3076 		mi->mi_ro_kstats = NULL;
3077 	}
3078 	if (mi->mi_recov_ksp) {
3079 		kstat_delete(mi->mi_recov_ksp);
3080 		mi->mi_recov_ksp = NULL;
3081 	}
3082 	mutex_enter(&mi->mi_msg_list_lock);
3083 	while (msgp = list_head(&mi->mi_msg_list)) {
3084 		list_remove(&mi->mi_msg_list, msgp);
3085 		nfs4_free_msg(msgp);
3086 	}
3087 	mutex_exit(&mi->mi_msg_list_lock);
3088 	list_destroy(&mi->mi_msg_list);
3089 	if (mi->mi_fname != NULL)
3090 		fn_rele(&mi->mi_fname);
3091 	if (mi->mi_rootfh != NULL)
3092 		sfh4_rele(&mi->mi_rootfh);
3093 	if (mi->mi_srvparentfh != NULL)
3094 		sfh4_rele(&mi->mi_srvparentfh);
3095 	svp = mi->mi_servers;
3096 	sv4_free(svp);
3097 	mutex_destroy(&mi->mi_lock);
3098 	mutex_destroy(&mi->mi_async_lock);
3099 	mutex_destroy(&mi->mi_msg_list_lock);
3100 	mutex_destroy(&mi->mi_rnodes_lock);
3101 	nfs_rw_destroy(&mi->mi_recovlock);
3102 	nfs_rw_destroy(&mi->mi_rename_lock);
3103 	nfs_rw_destroy(&mi->mi_fh_lock);
3104 	cv_destroy(&mi->mi_failover_cv);
3105 	cv_destroy(&mi->mi_async_reqs_cv);
3106 	cv_destroy(&mi->mi_async_work_cv[NFS4_ASYNC_QUEUE]);
3107 	cv_destroy(&mi->mi_async_work_cv[NFS4_ASYNC_PGOPS_QUEUE]);
3108 	cv_destroy(&mi->mi_async_cv);
3109 	cv_destroy(&mi->mi_inact_req_cv);
3110 	/*
3111 	 * Destroy the oo hash lists and mutexes for the cred hash table.
3112 	 */
3113 	for (i = 0; i < NFS4_NUM_OO_BUCKETS; i++) {
3114 		bucketp = &(mi->mi_oo_list[i]);
3115 		/* Destroy any remaining open owners on the list */
3116 		foop = list_head(&bucketp->b_oo_hash_list);
3117 		while (foop != NULL) {
3118 			list_remove(&bucketp->b_oo_hash_list, foop);
3119 			nfs4_destroy_open_owner(foop);
3120 			foop = list_head(&bucketp->b_oo_hash_list);
3121 		}
3122 		list_destroy(&bucketp->b_oo_hash_list);
3123 		mutex_destroy(&bucketp->b_lock);
3124 	}
3125 	/*
3126 	 * Empty and destroy the freed open owner list.
3127 	 */
3128 	foop = list_head(&mi->mi_foo_list);
3129 	while (foop != NULL) {
3130 		list_remove(&mi->mi_foo_list, foop);
3131 		nfs4_destroy_open_owner(foop);
3132 		foop = list_head(&mi->mi_foo_list);
3133 	}
3134 	list_destroy(&mi->mi_foo_list);
3135 	list_destroy(&mi->mi_bseqid_list);
3136 	list_destroy(&mi->mi_lost_state);
3137 	list_destroy(&mi->mi_rnodes);
3138 	avl_destroy(&mi->mi_filehandles);
3139 	kmem_free(mi, sizeof (*mi));
3140 }
3141 void
3142 mi_hold(mntinfo4_t *mi)
3143 {
3144 	atomic_inc_32(&mi->mi_count);
3145 	ASSERT(mi->mi_count != 0);
3146 }
3147 
3148 void
3149 mi_rele(mntinfo4_t *mi)
3150 {
3151 	ASSERT(mi->mi_count != 0);
3152 	if (atomic_dec_32_nv(&mi->mi_count) == 0) {
3153 		nfs_free_mi4(mi);
3154 	}
3155 }
3156 
/*
 * Vnode used for the "xattrs not supported" cache entry.  Its reference
 * count is pinned at 1 by nfs4_clnt_init().
 */
vnode_t    nfs4_xattr_notsupp_vnode;

/*
 * One-time initialization of the NFSv4 client: runs the init routines of
 * the client sub-modules, registers the CPR (suspend/resume) callback,
 * creates the per-zone key for the mount list, and pins the notsupp
 * xattr vnode.  nfs4_clnt_fini() is the counterpart.
 */
void
nfs4_clnt_init(void)
{
	nfs4_vnops_init();
	(void) nfs4_rnode_init();
	(void) nfs4_shadow_init();
	(void) nfs4_acache_init();
	(void) nfs4_subr_init();
	nfs4_acl_init();
	nfs_idmap_init();
	nfs4_callback_init();
	nfs4_secinfo_init();
#ifdef	DEBUG
	tsd_create(&nfs4_tsd_key, NULL);
#endif

	/*
	 * Add a CPR callback so that we can update client
	 * lease after a suspend and resume.
	 */
	cid = callb_add(nfs4_client_cpr_callb, 0, CB_CL_CPR_RPC, "nfs4");

	/* Per-zone mount list: init, shutdown and destroy callbacks. */
	zone_key_create(&mi4_list_key, nfs4_mi_init, nfs4_mi_shutdown,
	    nfs4_mi_destroy);

	/*
	 * Initialise the reference count of the notsupp xattr cache vnode to 1
	 * so that it never goes away (VOP_INACTIVE isn't called on it).
	 */
	nfs4_xattr_notsupp_vnode.v_count = 1;
}
3190 
/*
 * Undo nfs4_clnt_init(): delete the per-zone key, run the fini routines
 * of the client sub-modules, and remove the CPR callback if one was
 * registered.
 */
void
nfs4_clnt_fini(void)
{
	(void) zone_key_delete(mi4_list_key);
	nfs4_vnops_fini();
	(void) nfs4_rnode_fini();
	(void) nfs4_shadow_fini();
	(void) nfs4_acache_fini();
	(void) nfs4_subr_fini();
	nfs_idmap_fini();
	nfs4_callback_fini();
	nfs4_secinfo_fini();
#ifdef	DEBUG
	tsd_destroy(&nfs4_tsd_key);
#endif
	/* cid is only non-zero if callb_add() handed back an id. */
	if (cid)
		(void) callb_delete(cid);
}
3209 
3210 /*ARGSUSED*/
3211 static boolean_t
3212 nfs4_client_cpr_callb(void *arg, int code)
3213 {
3214 	/*
3215 	 * We get called for Suspend and Resume events.
3216 	 * For the suspend case we simply don't care!
3217 	 */
3218 	if (code == CB_CODE_CPR_CHKPT) {
3219 		return (B_TRUE);
3220 	}
3221 
3222 	/*
3223 	 * When we get to here we are in the process of
3224 	 * resuming the system from a previous suspend.
3225 	 */
3226 	nfs4_client_resumed = gethrestime_sec();
3227 	return (B_TRUE);
3228 }
3229 
3230 void
3231 nfs4_renew_lease_thread(nfs4_server_t *sp)
3232 {
3233 	int	error = 0;
3234 	time_t	tmp_last_renewal_time, tmp_time, tmp_now_time, kip_secs;
3235 	clock_t	tick_delay = 0;
3236 	clock_t time_left = 0;
3237 	callb_cpr_t cpr_info;
3238 	kmutex_t cpr_lock;
3239 
3240 	NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE,
3241 	    "nfs4_renew_lease_thread: acting on sp 0x%p", (void*)sp));
3242 	mutex_init(&cpr_lock, NULL, MUTEX_DEFAULT, NULL);
3243 	CALLB_CPR_INIT(&cpr_info, &cpr_lock, callb_generic_cpr, "nfsv4Lease");
3244 
3245 	mutex_enter(&sp->s_lock);
3246 	/* sp->s_lease_time is set via a GETATTR */
3247 	sp->last_renewal_time = gethrestime_sec();
3248 	sp->lease_valid = NFS4_LEASE_UNINITIALIZED;
3249 	ASSERT(sp->s_refcnt >= 1);
3250 
3251 	for (;;) {
3252 		if (!sp->state_ref_count ||
3253 		    sp->lease_valid != NFS4_LEASE_VALID) {
3254 
3255 			kip_secs = MAX((sp->s_lease_time >> 1) -
3256 			    (3 * sp->propagation_delay.tv_sec), 1);
3257 
3258 			tick_delay = SEC_TO_TICK(kip_secs);
3259 
3260 			NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE,
3261 			    "nfs4_renew_lease_thread: no renew : thread "
3262 			    "wait %ld secs", kip_secs));
3263 
3264 			NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE,
3265 			    "nfs4_renew_lease_thread: no renew : "
3266 			    "state_ref_count %d, lease_valid %d",
3267 			    sp->state_ref_count, sp->lease_valid));
3268 
3269 			mutex_enter(&cpr_lock);
3270 			CALLB_CPR_SAFE_BEGIN(&cpr_info);
3271 			mutex_exit(&cpr_lock);
3272 			time_left = cv_reltimedwait(&sp->cv_thread_exit,
3273 			    &sp->s_lock, tick_delay, TR_CLOCK_TICK);
3274 			mutex_enter(&cpr_lock);
3275 			CALLB_CPR_SAFE_END(&cpr_info, &cpr_lock);
3276 			mutex_exit(&cpr_lock);
3277 
3278 			NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE,
3279 			    "nfs4_renew_lease_thread: no renew: "
3280 			    "time left %ld", time_left));
3281 
3282 			if (sp->s_thread_exit == NFS4_THREAD_EXIT)
3283 				goto die;
3284 			continue;
3285 		}
3286 
3287 		tmp_last_renewal_time = sp->last_renewal_time;
3288 
3289 		tmp_time = gethrestime_sec() - sp->last_renewal_time +
3290 		    (3 * sp->propagation_delay.tv_sec);
3291 
3292 		NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE,
3293 		    "nfs4_renew_lease_thread: tmp_time %ld, "
3294 		    "sp->last_renewal_time %ld", tmp_time,
3295 		    sp->last_renewal_time));
3296 
3297 		kip_secs = MAX((sp->s_lease_time >> 1) - tmp_time, 1);
3298 
3299 		tick_delay = SEC_TO_TICK(kip_secs);
3300 
3301 		NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE,
3302 		    "nfs4_renew_lease_thread: valid lease: sleep for %ld "
3303 		    "secs", kip_secs));
3304 
3305 		mutex_enter(&cpr_lock);
3306 		CALLB_CPR_SAFE_BEGIN(&cpr_info);
3307 		mutex_exit(&cpr_lock);
3308 		time_left = cv_reltimedwait(&sp->cv_thread_exit, &sp->s_lock,
3309 		    tick_delay, TR_CLOCK_TICK);
3310 		mutex_enter(&cpr_lock);
3311 		CALLB_CPR_SAFE_END(&cpr_info, &cpr_lock);
3312 		mutex_exit(&cpr_lock);
3313 
3314 		NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE,
3315 		    "nfs4_renew_lease_thread: valid lease: time left %ld :"
3316 		    "sp last_renewal_time %ld, nfs4_client_resumed %ld, "
3317 		    "tmp_last_renewal_time %ld", time_left,
3318 		    sp->last_renewal_time, nfs4_client_resumed,
3319 		    tmp_last_renewal_time));
3320 
3321 		if (sp->s_thread_exit == NFS4_THREAD_EXIT)
3322 			goto die;
3323 
3324 		if (tmp_last_renewal_time == sp->last_renewal_time ||
3325 		    (nfs4_client_resumed != 0 &&
3326 		    nfs4_client_resumed > sp->last_renewal_time)) {
3327 			/*
3328 			 * Issue RENEW op since we haven't renewed the lease
3329 			 * since we slept.
3330 			 */
3331 			tmp_now_time = gethrestime_sec();
3332 			error = nfs4renew(sp);
3333 			/*
3334 			 * Need to re-acquire sp's lock, nfs4renew()
3335 			 * relinqueshes it.
3336 			 */
3337 			mutex_enter(&sp->s_lock);
3338 
3339 			/*
3340 			 * See if someone changed s_thread_exit while we gave
3341 			 * up s_lock.
3342 			 */
3343 			if (sp->s_thread_exit == NFS4_THREAD_EXIT)
3344 				goto die;
3345 
3346 			if (!error) {
3347 				/*
3348 				 * check to see if we implicitly renewed while
3349 				 * we waited for a reply for our RENEW call.
3350 				 */
3351 				if (tmp_last_renewal_time ==
3352 				    sp->last_renewal_time) {
3353 					/* no implicit renew came */
3354 					sp->last_renewal_time = tmp_now_time;
3355 				} else {
3356 					NFS4_DEBUG(nfs4_client_lease_debug,
3357 					    (CE_NOTE, "renew_thread: did "
3358 					    "implicit renewal before reply "
3359 					    "from server for RENEW"));
3360 				}
3361 			} else {
3362 				/* figure out error */
3363 				NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE,
3364 				    "renew_thread: nfs4renew returned error"
3365 				    " %d", error));
3366 			}
3367 
3368 		}
3369 	}
3370 
3371 die:
3372 	NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE,
3373 	    "nfs4_renew_lease_thread: thread exiting"));
3374 
3375 	while (sp->s_otw_call_count != 0) {
3376 		NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE,
3377 		    "nfs4_renew_lease_thread: waiting for outstanding "
3378 		    "otw calls to finish for sp 0x%p, current "
3379 		    "s_otw_call_count %d", (void *)sp,
3380 		    sp->s_otw_call_count));
3381 		mutex_enter(&cpr_lock);
3382 		CALLB_CPR_SAFE_BEGIN(&cpr_info);
3383 		mutex_exit(&cpr_lock);
3384 		cv_wait(&sp->s_cv_otw_count, &sp->s_lock);
3385 		mutex_enter(&cpr_lock);
3386 		CALLB_CPR_SAFE_END(&cpr_info, &cpr_lock);
3387 		mutex_exit(&cpr_lock);
3388 	}
3389 	mutex_exit(&sp->s_lock);
3390 
3391 	nfs4_server_rele(sp);		/* free the thread's reference */
3392 	nfs4_server_rele(sp);		/* free the list's reference */
3393 	sp = NULL;
3394 
3395 done:
3396 	mutex_enter(&cpr_lock);
3397 	CALLB_CPR_EXIT(&cpr_info);	/* drops cpr_lock */
3398 	mutex_destroy(&cpr_lock);
3399 
3400 	NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE,
3401 	    "nfs4_renew_lease_thread: renew thread exit officially"));
3402 
3403 	zthread_exit();
3404 	/* NOT REACHED */
3405 }
3406 
3407 /*
3408  * Send out a RENEW op to the server.
3409  * Assumes sp is locked down.
3410  */
3411 static int
3412 nfs4renew(nfs4_server_t *sp)
3413 {
3414 	COMPOUND4args_clnt args;
3415 	COMPOUND4res_clnt res;
3416 	nfs_argop4 argop[1];
3417 	int doqueue = 1;
3418 	int rpc_error;
3419 	cred_t *cr;
3420 	mntinfo4_t *mi;
3421 	timespec_t prop_time, after_time;
3422 	int needrecov = FALSE;
3423 	nfs4_recov_state_t recov_state;
3424 	nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };
3425 
3426 	NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE, "nfs4renew"));
3427 
3428 	recov_state.rs_flags = 0;
3429 	recov_state.rs_num_retry_despite_err = 0;
3430 
3431 recov_retry:
3432 	mi = sp->mntinfo4_list;
3433 	VFS_HOLD(mi->mi_vfsp);
3434 	mutex_exit(&sp->s_lock);
3435 	ASSERT(mi != NULL);
3436 
3437 	e.error = nfs4_start_op(mi, NULL, NULL, &recov_state);
3438 	if (e.error) {
3439 		VFS_RELE(mi->mi_vfsp);
3440 		return (e.error);
3441 	}
3442 
3443 	/* Check to see if we're dealing with a marked-dead sp */
3444 	mutex_enter(&sp->s_lock);
3445 	if (sp->s_thread_exit == NFS4_THREAD_EXIT) {
3446 		mutex_exit(&sp->s_lock);
3447 		nfs4_end_op(mi, NULL, NULL, &recov_state, needrecov);
3448 		VFS_RELE(mi->mi_vfsp);
3449 		return (0);
3450 	}
3451 
3452 	/* Make sure mi hasn't changed on us */
3453 	if (mi != sp->mntinfo4_list) {
3454 		/* Must drop sp's lock to avoid a recursive mutex enter */
3455 		mutex_exit(&sp->s_lock);
3456 		nfs4_end_op(mi, NULL, NULL, &recov_state, needrecov);
3457 		VFS_RELE(mi->mi_vfsp);
3458 		mutex_enter(&sp->s_lock);
3459 		goto recov_retry;
3460 	}
3461 	mutex_exit(&sp->s_lock);
3462 
3463 	args.ctag = TAG_RENEW;
3464 
3465 	args.array_len = 1;
3466 	args.array = argop;
3467 
3468 	argop[0].argop = OP_RENEW;
3469 
3470 	mutex_enter(&sp->s_lock);
3471 	argop[0].nfs_argop4_u.oprenew.clientid = sp->clientid;
3472 	cr = sp->s_cred;
3473 	crhold(cr);
3474 	mutex_exit(&sp->s_lock);
3475 
3476 	ASSERT(cr != NULL);
3477 
3478 	/* used to figure out RTT for sp */
3479 	gethrestime(&prop_time);
3480 
3481 	NFS4_DEBUG(nfs4_client_call_debug, (CE_NOTE,
3482 	    "nfs4renew: %s call, sp 0x%p", needrecov ? "recov" : "first",
3483 	    (void*)sp));
3484 	NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE, "before: %ld s %ld ns ",
3485 	    prop_time.tv_sec, prop_time.tv_nsec));
3486 
3487 	DTRACE_PROBE2(nfs4__renew__start, nfs4_server_t *, sp,
3488 	    mntinfo4_t *, mi);
3489 
3490 	rfs4call(mi, &args, &res, cr, &doqueue, 0, &e);
3491 	crfree(cr);
3492 
3493 	DTRACE_PROBE2(nfs4__renew__end, nfs4_server_t *, sp,
3494 	    mntinfo4_t *, mi);
3495 
3496 	gethrestime(&after_time);
3497 
3498 	mutex_enter(&sp->s_lock);
3499 	sp->propagation_delay.tv_sec =
3500 	    MAX(1, after_time.tv_sec - prop_time.tv_sec);
3501 	mutex_exit(&sp->s_lock);
3502 
3503 	NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE, "after : %ld s %ld ns ",
3504 	    after_time.tv_sec, after_time.tv_nsec));
3505 
3506 	if (e.error == 0 && res.status == NFS4ERR_CB_PATH_DOWN) {
3507 		(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
3508 		nfs4_delegreturn_all(sp);
3509 		nfs4_end_op(mi, NULL, NULL, &recov_state, needrecov);
3510 		VFS_RELE(mi->mi_vfsp);
3511 		/*
3512 		 * If the server returns CB_PATH_DOWN, it has renewed
3513 		 * the lease and informed us that the callback path is
3514 		 * down.  Since the lease is renewed, just return 0 and
3515 		 * let the renew thread proceed as normal.
3516 		 */
3517 		return (0);
3518 	}
3519 
3520 	needrecov = nfs4_needs_recovery(&e, FALSE, mi->mi_vfsp);
3521 	if (!needrecov && e.error) {
3522 		nfs4_end_op(mi, NULL, NULL, &recov_state, needrecov);
3523 		VFS_RELE(mi->mi_vfsp);
3524 		return (e.error);
3525 	}
3526 
3527 	rpc_error = e.error;
3528 
3529 	if (needrecov) {
3530 		NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
3531 		    "nfs4renew: initiating recovery\n"));
3532 
3533 		if (nfs4_start_recovery(&e, mi, NULL, NULL, NULL, NULL,
3534 		    OP_RENEW, NULL, NULL, NULL) == FALSE) {
3535 			nfs4_end_op(mi, NULL, NULL, &recov_state, needrecov);
3536 			VFS_RELE(mi->mi_vfsp);
3537 			if (!e.error)
3538 				(void) xdr_free(xdr_COMPOUND4res_clnt,
3539 				    (caddr_t)&res);
3540 			mutex_enter(&sp->s_lock);
3541 			goto recov_retry;
3542 		}
3543 		/* fall through for res.status case */
3544 	}
3545 
3546 	if (res.status) {
3547 		if (res.status == NFS4ERR_LEASE_MOVED) {
3548 			/*EMPTY*/
3549 			/*
3550 			 * XXX need to try every mntinfo4 in sp->mntinfo4_list
3551 			 * to renew the lease on that server
3552 			 */
3553 		}
3554 		e.error = geterrno4(res.status);
3555 	}
3556 
3557 	if (!rpc_error)
3558 		(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
3559 
3560 	nfs4_end_op(mi, NULL, NULL, &recov_state, needrecov);
3561 
3562 	VFS_RELE(mi->mi_vfsp);
3563 
3564 	return (e.error);
3565 }
3566 
3567 void
3568 nfs4_inc_state_ref_count(mntinfo4_t *mi)
3569 {
3570 	nfs4_server_t	*sp;
3571 
3572 	/* this locks down sp if it is found */
3573 	sp = find_nfs4_server(mi);
3574 
3575 	if (sp != NULL) {
3576 		nfs4_inc_state_ref_count_nolock(sp, mi);
3577 		mutex_exit(&sp->s_lock);
3578 		nfs4_server_rele(sp);
3579 	}
3580 }
3581 
3582 /*
3583  * Bump the number of OPEN files (ie: those with state) so we know if this
3584  * nfs4_server has any state to maintain a lease for or not.
3585  *
3586  * Also, marks the nfs4_server's lease valid if it hasn't been done so already.
3587  */
3588 void
3589 nfs4_inc_state_ref_count_nolock(nfs4_server_t *sp, mntinfo4_t *mi)
3590 {
3591 	ASSERT(mutex_owned(&sp->s_lock));
3592 
3593 	sp->state_ref_count++;
3594 	NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE,
3595 	    "nfs4_inc_state_ref_count: state_ref_count now %d",
3596 	    sp->state_ref_count));
3597 
3598 	if (sp->lease_valid == NFS4_LEASE_UNINITIALIZED)
3599 		sp->lease_valid = NFS4_LEASE_VALID;
3600 
3601 	/*
3602 	 * If this call caused the lease to be marked valid and/or
3603 	 * took the state_ref_count from 0 to 1, then start the time
3604 	 * on lease renewal.
3605 	 */
3606 	if (sp->lease_valid == NFS4_LEASE_VALID && sp->state_ref_count == 1)
3607 		sp->last_renewal_time = gethrestime_sec();
3608 
3609 	/* update the number of open files for mi */
3610 	mi->mi_open_files++;
3611 }
3612 
3613 void
3614 nfs4_dec_state_ref_count(mntinfo4_t *mi)
3615 {
3616 	nfs4_server_t	*sp;
3617 
3618 	/* this locks down sp if it is found */
3619 	sp = find_nfs4_server_all(mi, 1);
3620 
3621 	if (sp != NULL) {
3622 		nfs4_dec_state_ref_count_nolock(sp, mi);
3623 		mutex_exit(&sp->s_lock);
3624 		nfs4_server_rele(sp);
3625 	}
3626 }
3627 
3628 /*
3629  * Decrement the number of OPEN files (ie: those with state) so we know if
3630  * this nfs4_server has any state to maintain a lease for or not.
3631  */
3632 void
3633 nfs4_dec_state_ref_count_nolock(nfs4_server_t *sp, mntinfo4_t *mi)
3634 {
3635 	ASSERT(mutex_owned(&sp->s_lock));
3636 	ASSERT(sp->state_ref_count != 0);
3637 	sp->state_ref_count--;
3638 
3639 	NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE,
3640 	    "nfs4_dec_state_ref_count: state ref count now %d",
3641 	    sp->state_ref_count));
3642 
3643 	mi->mi_open_files--;
3644 	NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE,
3645 	    "nfs4_dec_state_ref_count: mi open files %d, v4 flags 0x%x",
3646 	    mi->mi_open_files, mi->mi_flags));
3647 
3648 	/* We don't have to hold the mi_lock to test mi_flags */
3649 	if (mi->mi_open_files == 0 &&
3650 	    (mi->mi_flags & MI4_REMOVE_ON_LAST_CLOSE)) {
3651 		NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE,
3652 		    "nfs4_dec_state_ref_count: remove mntinfo4 %p since "
3653 		    "we have closed the last open file", (void*)mi));
3654 		nfs4_remove_mi_from_server(mi, sp);
3655 	}
3656 }
3657 
3658 bool_t
3659 inlease(nfs4_server_t *sp)
3660 {
3661 	bool_t result;
3662 
3663 	ASSERT(mutex_owned(&sp->s_lock));
3664 
3665 	if (sp->lease_valid == NFS4_LEASE_VALID &&
3666 	    gethrestime_sec() < sp->last_renewal_time + sp->s_lease_time)
3667 		result = TRUE;
3668 	else
3669 		result = FALSE;
3670 
3671 	return (result);
3672 }
3673 
3674 
3675 /*
3676  * Return non-zero if the given nfs4_server_t is going through recovery.
3677  */
3678 
3679 int
3680 nfs4_server_in_recovery(nfs4_server_t *sp)
3681 {
3682 	return (nfs_rw_lock_held(&sp->s_recovlock, RW_WRITER));
3683 }
3684 
3685 /*
3686  * Compare two shared filehandle objects.  Returns -1, 0, or +1, if the
3687  * first is less than, equal to, or greater than the second.
3688  */
3689 
3690 int
3691 sfh4cmp(const void *p1, const void *p2)
3692 {
3693 	const nfs4_sharedfh_t *sfh1 = (const nfs4_sharedfh_t *)p1;
3694 	const nfs4_sharedfh_t *sfh2 = (const nfs4_sharedfh_t *)p2;
3695 
3696 	return (nfs4cmpfh(&sfh1->sfh_fh, &sfh2->sfh_fh));
3697 }
3698 
3699 /*
3700  * Create a table for shared filehandle objects.
3701  */
3702 
3703 void
3704 sfh4_createtab(avl_tree_t *tab)
3705 {
3706 	avl_create(tab, sfh4cmp, sizeof (nfs4_sharedfh_t),
3707 	    offsetof(nfs4_sharedfh_t, sfh_tree));
3708 }
3709 
3710 /*
3711  * Return a shared filehandle object for the given filehandle.  The caller
3712  * is responsible for eventually calling sfh4_rele().
3713  */
3714 
3715 nfs4_sharedfh_t *
3716 sfh4_put(const nfs_fh4 *fh, mntinfo4_t *mi, nfs4_sharedfh_t *key)
3717 {
3718 	nfs4_sharedfh_t *sfh, *nsfh;
3719 	avl_index_t where;
3720 	nfs4_sharedfh_t skey;
3721 
3722 	if (!key) {
3723 		skey.sfh_fh = *fh;
3724 		key = &skey;
3725 	}
3726 
3727 	nsfh = kmem_alloc(sizeof (nfs4_sharedfh_t), KM_SLEEP);
3728 	nsfh->sfh_fh.nfs_fh4_len = fh->nfs_fh4_len;
3729 	/*
3730 	 * We allocate the largest possible filehandle size because it's
3731 	 * not that big, and it saves us from possibly having to resize the
3732 	 * buffer later.
3733 	 */
3734 	nsfh->sfh_fh.nfs_fh4_val = kmem_alloc(NFS4_FHSIZE, KM_SLEEP);
3735 	bcopy(fh->nfs_fh4_val, nsfh->sfh_fh.nfs_fh4_val, fh->nfs_fh4_len);
3736 	mutex_init(&nsfh->sfh_lock, NULL, MUTEX_DEFAULT, NULL);
3737 	nsfh->sfh_refcnt = 1;
3738 	nsfh->sfh_flags = SFH4_IN_TREE;
3739 	nsfh->sfh_mi = mi;
3740 	NFS4_DEBUG(nfs4_sharedfh_debug, (CE_NOTE, "sfh4_get: new object (%p)",
3741 	    (void *)nsfh));
3742 
3743 	(void) nfs_rw_enter_sig(&mi->mi_fh_lock, RW_WRITER, 0);
3744 	sfh = avl_find(&mi->mi_filehandles, key, &where);
3745 	if (sfh != NULL) {
3746 		mutex_enter(&sfh->sfh_lock);
3747 		sfh->sfh_refcnt++;
3748 		mutex_exit(&sfh->sfh_lock);
3749 		nfs_rw_exit(&mi->mi_fh_lock);
3750 		/* free our speculative allocs */
3751 		kmem_free(nsfh->sfh_fh.nfs_fh4_val, NFS4_FHSIZE);
3752 		kmem_free(nsfh, sizeof (nfs4_sharedfh_t));
3753 		return (sfh);
3754 	}
3755 
3756 	avl_insert(&mi->mi_filehandles, nsfh, where);
3757 	nfs_rw_exit(&mi->mi_fh_lock);
3758 
3759 	return (nsfh);
3760 }
3761 
3762 /*
3763  * Return a shared filehandle object for the given filehandle.  The caller
3764  * is responsible for eventually calling sfh4_rele().
3765  */
3766 
3767 nfs4_sharedfh_t *
3768 sfh4_get(const nfs_fh4 *fh, mntinfo4_t *mi)
3769 {
3770 	nfs4_sharedfh_t *sfh;
3771 	nfs4_sharedfh_t key;
3772 
3773 	ASSERT(fh->nfs_fh4_len <= NFS4_FHSIZE);
3774 
3775 #ifdef DEBUG
3776 	if (nfs4_sharedfh_debug) {
3777 		nfs4_fhandle_t fhandle;
3778 
3779 		fhandle.fh_len = fh->nfs_fh4_len;
3780 		bcopy(fh->nfs_fh4_val, fhandle.fh_buf, fhandle.fh_len);
3781 		zcmn_err(mi->mi_zone->zone_id, CE_NOTE, "sfh4_get:");
3782 		nfs4_printfhandle(&fhandle);
3783 	}
3784 #endif
3785 
3786 	/*
3787 	 * If there's already an object for the given filehandle, bump the
3788 	 * reference count and return it.  Otherwise, create a new object
3789 	 * and add it to the AVL tree.
3790 	 */
3791 
3792 	key.sfh_fh = *fh;
3793 
3794 	(void) nfs_rw_enter_sig(&mi->mi_fh_lock, RW_READER, 0);
3795 	sfh = avl_find(&mi->mi_filehandles, &key, NULL);
3796 	if (sfh != NULL) {
3797 		mutex_enter(&sfh->sfh_lock);
3798 		sfh->sfh_refcnt++;
3799 		NFS4_DEBUG(nfs4_sharedfh_debug, (CE_NOTE,
3800 		    "sfh4_get: found existing %p, new refcnt=%d",
3801 		    (void *)sfh, sfh->sfh_refcnt));
3802 		mutex_exit(&sfh->sfh_lock);
3803 		nfs_rw_exit(&mi->mi_fh_lock);
3804 		return (sfh);
3805 	}
3806 	nfs_rw_exit(&mi->mi_fh_lock);
3807 
3808 	return (sfh4_put(fh, mi, &key));
3809 }
3810 
3811 /*
3812  * Get a reference to the given shared filehandle object.
3813  */
3814 
3815 void
3816 sfh4_hold(nfs4_sharedfh_t *sfh)
3817 {
3818 	ASSERT(sfh->sfh_refcnt > 0);
3819 
3820 	mutex_enter(&sfh->sfh_lock);
3821 	sfh->sfh_refcnt++;
3822 	NFS4_DEBUG(nfs4_sharedfh_debug,
3823 	    (CE_NOTE, "sfh4_hold %p, new refcnt=%d",
3824 	    (void *)sfh, sfh->sfh_refcnt));
3825 	mutex_exit(&sfh->sfh_lock);
3826 }
3827 
3828 /*
3829  * Release a reference to the given shared filehandle object and null out
3830  * the given pointer.
3831  */
3832 
3833 void
3834 sfh4_rele(nfs4_sharedfh_t **sfhpp)
3835 {
3836 	mntinfo4_t *mi;
3837 	nfs4_sharedfh_t *sfh = *sfhpp;
3838 
3839 	ASSERT(sfh->sfh_refcnt > 0);
3840 
3841 	mutex_enter(&sfh->sfh_lock);
3842 	if (sfh->sfh_refcnt > 1) {
3843 		sfh->sfh_refcnt--;
3844 		NFS4_DEBUG(nfs4_sharedfh_debug, (CE_NOTE,
3845 		    "sfh4_rele %p, new refcnt=%d",
3846 		    (void *)sfh, sfh->sfh_refcnt));
3847 		mutex_exit(&sfh->sfh_lock);
3848 		goto finish;
3849 	}
3850 	mutex_exit(&sfh->sfh_lock);
3851 
3852 	/*
3853 	 * Possibly the last reference, so get the lock for the table in
3854 	 * case it's time to remove the object from the table.
3855 	 */
3856 	mi = sfh->sfh_mi;
3857 	(void) nfs_rw_enter_sig(&mi->mi_fh_lock, RW_WRITER, 0);
3858 	mutex_enter(&sfh->sfh_lock);
3859 	sfh->sfh_refcnt--;
3860 	if (sfh->sfh_refcnt > 0) {
3861 		NFS4_DEBUG(nfs4_sharedfh_debug, (CE_NOTE,
3862 		    "sfh4_rele %p, new refcnt=%d",
3863 		    (void *)sfh, sfh->sfh_refcnt));
3864 		mutex_exit(&sfh->sfh_lock);
3865 		nfs_rw_exit(&mi->mi_fh_lock);
3866 		goto finish;
3867 	}
3868 
3869 	NFS4_DEBUG(nfs4_sharedfh_debug, (CE_NOTE,
3870 	    "sfh4_rele %p, last ref", (void *)sfh));
3871 	if (sfh->sfh_flags & SFH4_IN_TREE) {
3872 		avl_remove(&mi->mi_filehandles, sfh);
3873 		sfh->sfh_flags &= ~SFH4_IN_TREE;
3874 	}
3875 	mutex_exit(&sfh->sfh_lock);
3876 	nfs_rw_exit(&mi->mi_fh_lock);
3877 	mutex_destroy(&sfh->sfh_lock);
3878 	kmem_free(sfh->sfh_fh.nfs_fh4_val, NFS4_FHSIZE);
3879 	kmem_free(sfh, sizeof (nfs4_sharedfh_t));
3880 
3881 finish:
3882 	*sfhpp = NULL;
3883 }
3884 
3885 /*
3886  * Update the filehandle for the given shared filehandle object.
3887  */
3888 
3889 int nfs4_warn_dupfh = 0;	/* if set, always warn about dup fhs below */
3890 
3891 void
3892 sfh4_update(nfs4_sharedfh_t *sfh, const nfs_fh4 *newfh)
3893 {
3894 	mntinfo4_t *mi = sfh->sfh_mi;
3895 	nfs4_sharedfh_t *dupsfh;
3896 	avl_index_t where;
3897 	nfs4_sharedfh_t key;
3898 
3899 #ifdef DEBUG
3900 	mutex_enter(&sfh->sfh_lock);
3901 	ASSERT(sfh->sfh_refcnt > 0);
3902 	mutex_exit(&sfh->sfh_lock);
3903 #endif
3904 	ASSERT(newfh->nfs_fh4_len <= NFS4_FHSIZE);
3905 
3906 	/*
3907 	 * The basic plan is to remove the shared filehandle object from
3908 	 * the table, update it to have the new filehandle, then reinsert
3909 	 * it.
3910 	 */
3911 
3912 	(void) nfs_rw_enter_sig(&mi->mi_fh_lock, RW_WRITER, 0);
3913 	mutex_enter(&sfh->sfh_lock);
3914 	if (sfh->sfh_flags & SFH4_IN_TREE) {
3915 		avl_remove(&mi->mi_filehandles, sfh);
3916 		sfh->sfh_flags &= ~SFH4_IN_TREE;
3917 	}
3918 	mutex_exit(&sfh->sfh_lock);
3919 	sfh->sfh_fh.nfs_fh4_len = newfh->nfs_fh4_len;
3920 	bcopy(newfh->nfs_fh4_val, sfh->sfh_fh.nfs_fh4_val,
3921 	    sfh->sfh_fh.nfs_fh4_len);
3922 
3923 	/*
3924 	 * XXX If there is already a shared filehandle object with the new
3925 	 * filehandle, we're in trouble, because the rnode code assumes
3926 	 * that there is only one shared filehandle object for a given
3927 	 * filehandle.  So issue a warning (for read-write mounts only)
3928 	 * and don't try to re-insert the given object into the table.
3929 	 * Hopefully the given object will quickly go away and everyone
3930 	 * will use the new object.
3931 	 */
3932 	key.sfh_fh = *newfh;
3933 	dupsfh = avl_find(&mi->mi_filehandles, &key, &where);
3934 	if (dupsfh != NULL) {
3935 		if (!(mi->mi_vfsp->vfs_flag & VFS_RDONLY) || nfs4_warn_dupfh) {
3936 			zcmn_err(mi->mi_zone->zone_id, CE_WARN, "sfh4_update: "
3937 			    "duplicate filehandle detected");
3938 			sfh4_printfhandle(dupsfh);
3939 		}
3940 	} else {
3941 		avl_insert(&mi->mi_filehandles, sfh, where);
3942 		mutex_enter(&sfh->sfh_lock);
3943 		sfh->sfh_flags |= SFH4_IN_TREE;
3944 		mutex_exit(&sfh->sfh_lock);
3945 	}
3946 	nfs_rw_exit(&mi->mi_fh_lock);
3947 }
3948 
3949 /*
3950  * Copy out the current filehandle for the given shared filehandle object.
3951  */
3952 
3953 void
3954 sfh4_copyval(const nfs4_sharedfh_t *sfh, nfs4_fhandle_t *fhp)
3955 {
3956 	mntinfo4_t *mi = sfh->sfh_mi;
3957 
3958 	ASSERT(sfh->sfh_refcnt > 0);
3959 
3960 	(void) nfs_rw_enter_sig(&mi->mi_fh_lock, RW_READER, 0);
3961 	fhp->fh_len = sfh->sfh_fh.nfs_fh4_len;
3962 	ASSERT(fhp->fh_len <= NFS4_FHSIZE);
3963 	bcopy(sfh->sfh_fh.nfs_fh4_val, fhp->fh_buf, fhp->fh_len);
3964 	nfs_rw_exit(&mi->mi_fh_lock);
3965 }
3966 
3967 /*
3968  * Print out the filehandle for the given shared filehandle object.
3969  */
3970 
3971 void
3972 sfh4_printfhandle(const nfs4_sharedfh_t *sfh)
3973 {
3974 	nfs4_fhandle_t fhandle;
3975 
3976 	sfh4_copyval(sfh, &fhandle);
3977 	nfs4_printfhandle(&fhandle);
3978 }
3979 
3980 /*
3981  * Compare 2 fnames.  Returns -1 if the first is "less" than the second, 0
3982  * if they're the same, +1 if the first is "greater" than the second.  The
3983  * caller (or whoever's calling the AVL package) is responsible for
3984  * handling locking issues.
3985  */
3986 
3987 static int
3988 fncmp(const void *p1, const void *p2)
3989 {
3990 	const nfs4_fname_t *f1 = p1;
3991 	const nfs4_fname_t *f2 = p2;
3992 	int res;
3993 
3994 	res = strcmp(f1->fn_name, f2->fn_name);
3995 	/*
3996 	 * The AVL package wants +/-1, not arbitrary positive or negative
3997 	 * integers.
3998 	 */
3999 	if (res > 0)
4000 		res = 1;
4001 	else if (res < 0)
4002 		res = -1;
4003 	return (res);
4004 }
4005 
/*
 * Get or create an fname with the given name, as a child of the given
 * fname.  The caller is responsible for eventually releasing the reference
 * (fn_rele()).  parent may be NULL.
 *
 * An existing child is reused only when both its name and its fn_sfh match
 * the arguments.  A child whose name matches but whose fn_sfh differs is
 * unlinked from parent and the lookup is retried.  A newly created fname
 * takes a hold on parent (when non-NULL) and on sfh.
 */

nfs4_fname_t *
fn_get(nfs4_fname_t *parent, char *name, nfs4_sharedfh_t *sfh)
{
	nfs4_fname_t key;
	nfs4_fname_t *fnp;
	avl_index_t where;

	/* Only fn_name of the search key is filled in. */
	key.fn_name = name;

	/*
	 * If there's already an fname registered with the given name, bump
	 * its reference count and return it.  Otherwise, create a new one
	 * and add it to the parent's AVL tree.
	 *
	 * fname entries we are looking for should match both name
	 * and sfh stored in the fname.
	 */
again:
	if (parent != NULL) {
		mutex_enter(&parent->fn_lock);
		fnp = avl_find(&parent->fn_children, &key, &where);
		if (fnp != NULL) {
			/*
			 * This hold is either handed to the caller (match)
			 * or released below (mismatch).
			 */
			fn_hold(fnp);

			if (fnp->fn_sfh == sfh) {
				/*
				 * We have found our entry.  Return it with
				 * the hold taken above.
				 */
				mutex_exit(&parent->fn_lock);
				return (fnp);
			}

			/*
			 * We have found an entry that has a mismatching
			 * fn_sfh. This could be a stale entry due to
			 * server side rename. We will remove this entry
			 * and make sure no such entries exist.
			 *
			 * parent->fn_lock is dropped first so that the
			 * locks are taken child first, then parent.
			 */
			mutex_exit(&parent->fn_lock);
			mutex_enter(&fnp->fn_lock);
			/*
			 * Re-check under fnp->fn_lock: fn_parent may have
			 * changed while no lock was held.
			 */
			if (fnp->fn_parent == parent) {
				/*
				 * Remove the stale entry from parent's
				 * fn_children tree and drop the hold it had
				 * on parent; fn_rele() also sets
				 * fnp->fn_parent to NULL.
				 */
				mutex_enter(&parent->fn_lock);
				avl_remove(&parent->fn_children, fnp);
				mutex_exit(&parent->fn_lock);
				fn_rele(&fnp->fn_parent);
			}
			mutex_exit(&fnp->fn_lock);
			fn_rele(&fnp);
			goto again;
		}
	}

	/*
	 * No matching child exists; build a new fname.  When parent is
	 * non-NULL its fn_lock is still held here and is dropped only after
	 * the avl_insert() at the end of this function.
	 */
	fnp = kmem_alloc(sizeof (nfs4_fname_t), KM_SLEEP);
	mutex_init(&fnp->fn_lock, NULL, MUTEX_DEFAULT, NULL);
	fnp->fn_parent = parent;
	if (parent != NULL)
		fn_hold(parent);
	fnp->fn_len = strlen(name);
	ASSERT(fnp->fn_len < MAXNAMELEN);
	fnp->fn_name = kmem_alloc(fnp->fn_len + 1, KM_SLEEP);
	(void) strcpy(fnp->fn_name, name);
	/* This initial reference is the one returned to the caller. */
	fnp->fn_refcnt = 1;

	/*
	 * This hold on sfh is later released
	 * when we do the final fn_rele() on this fname.
	 */
	sfh4_hold(sfh);
	fnp->fn_sfh = sfh;

	avl_create(&fnp->fn_children, fncmp, sizeof (nfs4_fname_t),
	    offsetof(nfs4_fname_t, fn_tree));
	NFS4_DEBUG(nfs4_fname_debug, (CE_NOTE,
	    "fn_get %p:%s, a new nfs4_fname_t!",
	    (void *)fnp, fnp->fn_name));
	if (parent != NULL) {
		/* "where" was set by the avl_find() that found no match. */
		avl_insert(&parent->fn_children, fnp, where);
		mutex_exit(&parent->fn_lock);
	}

	return (fnp);
}
4103 
4104 void
4105 fn_hold(nfs4_fname_t *fnp)
4106 {
4107 	atomic_inc_32(&fnp->fn_refcnt);
4108 	NFS4_DEBUG(nfs4_fname_debug, (CE_NOTE,
4109 	    "fn_hold %p:%s, new refcnt=%d",
4110 	    (void *)fnp, fnp->fn_name, fnp->fn_refcnt));
4111 }
4112 
4113 /*
4114  * Decrement the reference count of the given fname, and destroy it if its
4115  * reference count goes to zero.  Nulls out the given pointer.
4116  */
4117 
4118 void
4119 fn_rele(nfs4_fname_t **fnpp)
4120 {
4121 	nfs4_fname_t *parent;
4122 	uint32_t newref;
4123 	nfs4_fname_t *fnp;
4124 
4125 recur:
4126 	fnp = *fnpp;
4127 	*fnpp = NULL;
4128 
4129 	mutex_enter(&fnp->fn_lock);
4130 	parent = fnp->fn_parent;
4131 	if (parent != NULL)
4132 		mutex_enter(&parent->fn_lock);	/* prevent new references */
4133 	newref = atomic_dec_32_nv(&fnp->fn_refcnt);
4134 	if (newref > 0) {
4135 		NFS4_DEBUG(nfs4_fname_debug, (CE_NOTE,
4136 		    "fn_rele %p:%s, new refcnt=%d",
4137 		    (void *)fnp, fnp->fn_name, fnp->fn_refcnt));
4138 		if (parent != NULL)
4139 			mutex_exit(&parent->fn_lock);
4140 		mutex_exit(&fnp->fn_lock);
4141 		return;
4142 	}
4143 
4144 	NFS4_DEBUG(nfs4_fname_debug, (CE_NOTE,
4145 	    "fn_rele %p:%s, last reference, deleting...",
4146 	    (void *)fnp, fnp->fn_name));
4147 	if (parent != NULL) {
4148 		avl_remove(&parent->fn_children, fnp);
4149 		mutex_exit(&parent->fn_lock);
4150 	}
4151 	kmem_free(fnp->fn_name, fnp->fn_len + 1);
4152 	sfh4_rele(&fnp->fn_sfh);
4153 	mutex_destroy(&fnp->fn_lock);
4154 	avl_destroy(&fnp->fn_children);
4155 	kmem_free(fnp, sizeof (nfs4_fname_t));
4156 	/*
4157 	 * Recursivly fn_rele the parent.
4158 	 * Use goto instead of a recursive call to avoid stack overflow.
4159 	 */
4160 	if (parent != NULL) {
4161 		fnpp = &parent;
4162 		goto recur;
4163 	}
4164 }
4165 
4166 /*
4167  * Returns the single component name of the given fname, in a MAXNAMELEN
4168  * string buffer, which the caller is responsible for freeing.  Note that
4169  * the name may become invalid as a result of fn_move().
4170  */
4171 
4172 char *
4173 fn_name(nfs4_fname_t *fnp)
4174 {
4175 	char *name;
4176 
4177 	ASSERT(fnp->fn_len < MAXNAMELEN);
4178 	name = kmem_alloc(MAXNAMELEN, KM_SLEEP);
4179 	mutex_enter(&fnp->fn_lock);
4180 	(void) strcpy(name, fnp->fn_name);
4181 	mutex_exit(&fnp->fn_lock);
4182 
4183 	return (name);
4184 }
4185 
4186 
4187 /*
4188  * fn_path_realloc
4189  *
4190  * This function, used only by fn_path, constructs
4191  * a new string which looks like "prepend" + "/" + "current".
4192  * by allocating a new string and freeing the old one.
4193  */
4194 static void
4195 fn_path_realloc(char **curses, char *prepend)
4196 {
4197 	int len, curlen = 0;
4198 	char *news;
4199 
4200 	if (*curses == NULL) {
4201 		/*
4202 		 * Prime the pump, allocate just the
4203 		 * space for prepend and return that.
4204 		 */
4205 		len = strlen(prepend) + 1;
4206 		news = kmem_alloc(len, KM_SLEEP);
4207 		(void) strncpy(news, prepend, len);
4208 	} else {
4209 		/*
4210 		 * Allocate the space  for a new string
4211 		 * +1 +1 is for the "/" and the NULL
4212 		 * byte at the end of it all.
4213 		 */
4214 		curlen = strlen(*curses);
4215 		len = curlen + strlen(prepend) + 1 + 1;
4216 		news = kmem_alloc(len, KM_SLEEP);
4217 		(void) strncpy(news, prepend, len);
4218 		(void) strcat(news, "/");
4219 		(void) strcat(news, *curses);
4220 		kmem_free(*curses, curlen + 1);
4221 	}
4222 	*curses = news;
4223 }
4224 
4225 /*
4226  * Returns the path name (starting from the fs root) for the given fname.
4227  * The caller is responsible for freeing.  Note that the path may be or
4228  * become invalid as a result of fn_move().
4229  */
4230 
4231 char *
4232 fn_path(nfs4_fname_t *fnp)
4233 {
4234 	char *path;
4235 	nfs4_fname_t *nextfnp;
4236 
4237 	if (fnp == NULL)
4238 		return (NULL);
4239 
4240 	path = NULL;
4241 
4242 	/* walk up the tree constructing the pathname.  */
4243 
4244 	fn_hold(fnp);			/* adjust for later rele */
4245 	do {
4246 		mutex_enter(&fnp->fn_lock);
4247 		/*
4248 		 * Add fn_name in front of the current path
4249 		 */
4250 		fn_path_realloc(&path, fnp->fn_name);
4251 		nextfnp = fnp->fn_parent;
4252 		if (nextfnp != NULL)
4253 			fn_hold(nextfnp);
4254 		mutex_exit(&fnp->fn_lock);
4255 		fn_rele(&fnp);
4256 		fnp = nextfnp;
4257 	} while (fnp != NULL);
4258 
4259 	return (path);
4260 }
4261 
4262 /*
4263  * Return a reference to the parent of the given fname, which the caller is
4264  * responsible for eventually releasing.
4265  */
4266 
4267 nfs4_fname_t *
4268 fn_parent(nfs4_fname_t *fnp)
4269 {
4270 	nfs4_fname_t *parent;
4271 
4272 	mutex_enter(&fnp->fn_lock);
4273 	parent = fnp->fn_parent;
4274 	if (parent != NULL)
4275 		fn_hold(parent);
4276 	mutex_exit(&fnp->fn_lock);
4277 
4278 	return (parent);
4279 }
4280 
/*
 * Update fnp so that its parent is newparent and its name is newname.
 *
 * fnp is unlinked from its current parent (if it has one) and the hold on
 * that parent is dropped.  Any existing child of newparent that already
 * has the new name is unlinked from newparent first.  fnp then takes a
 * hold on newparent and is inserted into its fn_children tree.
 * fnp->fn_lock is held for the whole operation.
 */

void
fn_move(nfs4_fname_t *fnp, nfs4_fname_t *newparent, char *newname)
{
	nfs4_fname_t *parent, *tmpfnp;
	ssize_t newlen;
	nfs4_fname_t key;
	avl_index_t where;

	/*
	 * This assert exists to catch the client trying to rename
	 * a dir to be a child of itself.  This happened at a recent
	 * bakeoff against a 3rd party (broken) server which allowed
	 * the rename to succeed.  If it trips it means that:
	 *	a) the code in nfs4rename that detects this case is broken
	 *	b) the server is broken (since it allowed the bogus rename)
	 *
	 * For non-DEBUG kernels, prepare for a recursive mutex_enter
	 * panic below from:  mutex_enter(&newparent->fn_lock);
	 */
	ASSERT(fnp != newparent);

	/*
	 * Remove fnp from its current parent, change its name, then add it
	 * to newparent. It might happen that fnp was replaced by another
	 * nfs4_fname_t with the same fn_name in parent->fn_children.
	 * In such case, fnp->fn_parent is NULL and we skip the removal
	 * of fnp from its current parent.
	 */
	mutex_enter(&fnp->fn_lock);
	parent = fnp->fn_parent;
	if (parent != NULL) {
		mutex_enter(&parent->fn_lock);
		avl_remove(&parent->fn_children, fnp);
		mutex_exit(&parent->fn_lock);
		/* Drops the hold on the old parent and NULLs fn_parent. */
		fn_rele(&fnp->fn_parent);
	}

	/*
	 * The name buffer is sized to fn_len + 1, so it can be reused
	 * as-is when the new name has the same length.
	 */
	newlen = strlen(newname);
	if (newlen != fnp->fn_len) {
		ASSERT(newlen < MAXNAMELEN);
		kmem_free(fnp->fn_name, fnp->fn_len + 1);
		fnp->fn_name = kmem_alloc(newlen + 1, KM_SLEEP);
		fnp->fn_len = newlen;
	}
	(void) strcpy(fnp->fn_name, newname);

again:
	mutex_enter(&newparent->fn_lock);
	key.fn_name = fnp->fn_name;
	tmpfnp = avl_find(&newparent->fn_children, &key, &where);
	if (tmpfnp != NULL) {
		/*
		 * This could be due to a file that was unlinked while
		 * open, or perhaps the rnode is in the free list.  Remove
		 * it from newparent and let it go away on its own.  The
		 * contorted code is to deal with lock order issues and
		 * race conditions.
		 *
		 * The hold keeps tmpfnp alive while newparent->fn_lock is
		 * dropped so that tmpfnp->fn_lock can be taken first.
		 */
		fn_hold(tmpfnp);
		mutex_exit(&newparent->fn_lock);
		mutex_enter(&tmpfnp->fn_lock);
		/*
		 * Re-check under tmpfnp->fn_lock: its parent may have
		 * changed while no lock was held.
		 */
		if (tmpfnp->fn_parent == newparent) {
			mutex_enter(&newparent->fn_lock);
			avl_remove(&newparent->fn_children, tmpfnp);
			mutex_exit(&newparent->fn_lock);
			fn_rele(&tmpfnp->fn_parent);
		}
		mutex_exit(&tmpfnp->fn_lock);
		fn_rele(&tmpfnp);
		/* Look again until no child with this name remains. */
		goto again;
	}
	/*
	 * newparent->fn_lock is still held from the avl_find() above, and
	 * "where" is the insertion point it reported.
	 */
	fnp->fn_parent = newparent;
	fn_hold(newparent);
	avl_insert(&newparent->fn_children, fnp, where);
	mutex_exit(&newparent->fn_lock);
	mutex_exit(&fnp->fn_lock);
}
4362 
4363 #ifdef DEBUG
4364 /*
4365  * Return non-zero if the type information makes sense for the given vnode.
4366  * Otherwise panic.
4367  */
4368 int
4369 nfs4_consistent_type(vnode_t *vp)
4370 {
4371 	rnode4_t *rp = VTOR4(vp);
4372 
4373 	if (nfs4_vtype_debug && vp->v_type != VNON &&
4374 	    rp->r_attr.va_type != VNON && vp->v_type != rp->r_attr.va_type) {
4375 		cmn_err(CE_PANIC, "vnode %p type mismatch; v_type=%d, "
4376 		    "rnode attr type=%d", (void *)vp, vp->v_type,
4377 		    rp->r_attr.va_type);
4378 	}
4379 
4380 	return (1);
4381 }
4382 #endif /* DEBUG */
4383