xref: /titanic_50/usr/src/uts/common/fs/nfs/nfs4_client.c (revision cb6bbb52bf1ab745f552771b995842398861f875)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright (c) 1986, 2010, Oracle and/or its affiliates. All rights reserved.
23  */
24 
25 /*
26  *  	Copyright (c) 1983,1984,1985,1986,1987,1988,1989  AT&T.
27  *	All Rights Reserved
28  */
29 
30 #include <sys/param.h>
31 #include <sys/types.h>
32 #include <sys/systm.h>
33 #include <sys/thread.h>
34 #include <sys/t_lock.h>
35 #include <sys/time.h>
36 #include <sys/vnode.h>
37 #include <sys/vfs.h>
38 #include <sys/errno.h>
39 #include <sys/buf.h>
40 #include <sys/stat.h>
41 #include <sys/cred.h>
42 #include <sys/kmem.h>
43 #include <sys/debug.h>
44 #include <sys/dnlc.h>
45 #include <sys/vmsystm.h>
46 #include <sys/flock.h>
47 #include <sys/share.h>
48 #include <sys/cmn_err.h>
49 #include <sys/tiuser.h>
50 #include <sys/sysmacros.h>
51 #include <sys/callb.h>
52 #include <sys/acl.h>
53 #include <sys/kstat.h>
54 #include <sys/signal.h>
55 #include <sys/disp.h>
56 #include <sys/atomic.h>
57 #include <sys/list.h>
58 #include <sys/sdt.h>
59 
60 #include <rpc/types.h>
61 #include <rpc/xdr.h>
62 #include <rpc/auth.h>
63 #include <rpc/clnt.h>
64 
65 #include <nfs/nfs.h>
66 #include <nfs/nfs_clnt.h>
67 #include <nfs/nfs_acl.h>
68 
69 #include <nfs/nfs4.h>
70 #include <nfs/rnode4.h>
71 #include <nfs/nfs4_clnt.h>
72 
73 #include <vm/hat.h>
74 #include <vm/as.h>
75 #include <vm/page.h>
76 #include <vm/pvn.h>
77 #include <vm/seg.h>
78 #include <vm/seg_map.h>
79 #include <vm/seg_vn.h>
80 
81 #include <sys/ddi.h>
82 
/*
 * Arguments to page-flush thread.
 *
 * nfs4_purge_caches() allocates one of these when it starts an
 * asynchronous page flush; nfs4_pgflush_thread() frees it when done.
 */
typedef struct {
	vnode_t *vp;	/* file to flush; held by creator, released by thread */
	cred_t *cr;	/* creds for the flush; held by creator, freed by thread */
} pgflush_t;
90 
#ifdef DEBUG
/*
 * Debug-only tunables.  NOTE(review): none of these three are read in
 * this part of the file; presumably they gate NFS4_DEBUG output
 * elsewhere -- confirm against their users.
 */
int nfs4_client_lease_debug;
int nfs4_sharedfh_debug;
int nfs4_fname_debug;

/* temporary: panic if v_type is inconsistent with r_attr va_type */
int nfs4_vtype_debug;

/*
 * Thread-specific-data key.  nfs4_purge_stale_fh() asserts that the
 * current thread's value for this key is NULL.
 */
uint_t nfs4_tsd_key;
#endif
101 
/*
 * File-scope state.  NOTE(review): neither variable is referenced in
 * this part of the file; presumably both belong to the CPR
 * (suspend/resume) callback declared below -- confirm.
 */
static time_t	nfs4_client_resumed = 0;
static	callb_id_t cid = 0;

/* Forward declarations of file-local routines. */
static int	nfs4renew(nfs4_server_t *);
static void	nfs4_attrcache_va(vnode_t *, nfs4_ga_res_t *, int);
static void	nfs4_pgflush_thread(pgflush_t *);

static boolean_t nfs4_client_cpr_callb(void *, int);
110 
/*
 * Bookkeeping for the NFS v4 mounts that belong to one zone.
 */
struct mi4_globals {
	kmutex_t	mig_lock;  /* lock protecting mig_list */
	list_t		mig_list;  /* list of NFS v4 mounts in zone */
	/* NOTE(review): presumably set once zone teardown has run; confirm */
	boolean_t	mig_destructor_called;
};

/*
 * NOTE(review): presumably the zone key under which a struct mi4_globals
 * is registered for each zone -- confirm where the key is created.
 */
static zone_key_t mi4_list_key;
118 
/*
 * Attributes caching:
 *
 * Attributes are cached in the rnode in struct vattr form.
 * There is a time associated with the cached attributes (r_time_attr_inval)
 * which tells whether the attributes are valid. When new attributes are
 * cached, that time is set to the current time plus a timeout equal to
 * the time elapsed since a change to the file was last detected
 * (r_time_attr_saved). This allows the attributes for
 * files that have changed recently to be timed out sooner than for files
 * that have not changed for a long time. There are minimum and maximum
 * timeout values that can be set per mount point.
 */
131 
132 /*
133  * If a cache purge is in progress, wait for it to finish.
134  *
135  * The current thread must not be in the middle of an
136  * nfs4_start_op/nfs4_end_op region.  Otherwise, there could be a deadlock
137  * between this thread, a recovery thread, and the page flush thread.
138  */
139 int
140 nfs4_waitfor_purge_complete(vnode_t *vp)
141 {
142 	rnode4_t *rp;
143 	k_sigset_t smask;
144 
145 	rp = VTOR4(vp);
146 	if ((rp->r_serial != NULL && rp->r_serial != curthread) ||
147 	    ((rp->r_flags & R4PGFLUSH) && rp->r_pgflush != curthread)) {
148 		mutex_enter(&rp->r_statelock);
149 		sigintr(&smask, VTOMI4(vp)->mi_flags & MI4_INT);
150 		while ((rp->r_serial != NULL && rp->r_serial != curthread) ||
151 		    ((rp->r_flags & R4PGFLUSH) &&
152 		    rp->r_pgflush != curthread)) {
153 			if (!cv_wait_sig(&rp->r_cv, &rp->r_statelock)) {
154 				sigunintr(&smask);
155 				mutex_exit(&rp->r_statelock);
156 				return (EINTR);
157 			}
158 		}
159 		sigunintr(&smask);
160 		mutex_exit(&rp->r_statelock);
161 	}
162 	return (0);
163 }
164 
165 /*
166  * Validate caches by checking cached attributes. If they have timed out,
167  * then get new attributes from the server.  As a side effect, cache
168  * invalidation is done if the attributes have changed.
169  *
170  * If the attributes have not timed out and if there is a cache
171  * invalidation being done by some other thread, then wait until that
172  * thread has completed the cache invalidation.
173  */
174 int
175 nfs4_validate_caches(vnode_t *vp, cred_t *cr)
176 {
177 	int error;
178 	nfs4_ga_res_t gar;
179 
180 	if (ATTRCACHE4_VALID(vp)) {
181 		error = nfs4_waitfor_purge_complete(vp);
182 		if (error)
183 			return (error);
184 		return (0);
185 	}
186 
187 	return (nfs4_getattr_otw(vp, &gar, cr, 0));
188 }
189 
190 /*
191  * Fill in attribute from the cache.
192  * If valid, then return 0 to indicate that no error occurred,
193  * otherwise return 1 to indicate that an error occurred.
194  */
195 static int
196 nfs4_getattr_cache(vnode_t *vp, struct vattr *vap)
197 {
198 	rnode4_t *rp;
199 
200 	rp = VTOR4(vp);
201 	mutex_enter(&rp->r_statelock);
202 	mutex_enter(&rp->r_statev4_lock);
203 	if (ATTRCACHE4_VALID(vp)) {
204 		mutex_exit(&rp->r_statev4_lock);
205 		/*
206 		 * Cached attributes are valid
207 		 */
208 		*vap = rp->r_attr;
209 		mutex_exit(&rp->r_statelock);
210 		return (0);
211 	}
212 	mutex_exit(&rp->r_statev4_lock);
213 	mutex_exit(&rp->r_statelock);
214 	return (1);
215 }
216 
217 
218 /*
219  * If returned error is ESTALE flush all caches.  The nfs4_purge_caches()
220  * call is synchronous because all the pages were invalidated by the
221  * nfs4_invalidate_pages() call.
222  */
223 void
224 nfs4_purge_stale_fh(int errno, vnode_t *vp, cred_t *cr)
225 {
226 	struct rnode4 *rp = VTOR4(vp);
227 
228 	/* Ensure that the ..._end_op() call has been done */
229 	ASSERT(tsd_get(nfs4_tsd_key) == NULL);
230 
231 	if (errno != ESTALE)
232 		return;
233 
234 	mutex_enter(&rp->r_statelock);
235 	rp->r_flags |= R4STALE;
236 	if (!rp->r_error)
237 		rp->r_error = errno;
238 	mutex_exit(&rp->r_statelock);
239 	if (nfs4_has_pages(vp))
240 		nfs4_invalidate_pages(vp, (u_offset_t)0, cr);
241 	nfs4_purge_caches(vp, NFS4_PURGE_DNLC, cr, FALSE);
242 }
243 
244 /*
245  * Purge all of the various NFS `data' caches.  If "asyncpg" is TRUE, the
246  * page purge is done asynchronously.
247  */
248 void
249 nfs4_purge_caches(vnode_t *vp, int purge_dnlc, cred_t *cr, int asyncpg)
250 {
251 	rnode4_t *rp;
252 	char *contents;
253 	vnode_t *xattr;
254 	int size;
255 	int pgflush;			/* are we the page flush thread? */
256 
257 	/*
258 	 * Purge the DNLC for any entries which refer to this file.
259 	 */
260 	if (vp->v_count > 1 &&
261 	    (vp->v_type == VDIR || purge_dnlc == NFS4_PURGE_DNLC))
262 		dnlc_purge_vp(vp);
263 
264 	/*
265 	 * Clear any readdir state bits and purge the readlink response cache.
266 	 */
267 	rp = VTOR4(vp);
268 	mutex_enter(&rp->r_statelock);
269 	rp->r_flags &= ~R4LOOKUP;
270 	contents = rp->r_symlink.contents;
271 	size = rp->r_symlink.size;
272 	rp->r_symlink.contents = NULL;
273 
274 	xattr = rp->r_xattr_dir;
275 	rp->r_xattr_dir = NULL;
276 
277 	/*
278 	 * Purge pathconf cache too.
279 	 */
280 	rp->r_pathconf.pc4_xattr_valid = 0;
281 	rp->r_pathconf.pc4_cache_valid = 0;
282 
283 	pgflush = (curthread == rp->r_pgflush);
284 	mutex_exit(&rp->r_statelock);
285 
286 	if (contents != NULL) {
287 
288 		kmem_free((void *)contents, size);
289 	}
290 
291 	if (xattr != NULL)
292 		VN_RELE(xattr);
293 
294 	/*
295 	 * Flush the page cache.  If the current thread is the page flush
296 	 * thread, don't initiate a new page flush.  There's no need for
297 	 * it, and doing it correctly is hard.
298 	 */
299 	if (nfs4_has_pages(vp) && !pgflush) {
300 		if (!asyncpg) {
301 			(void) nfs4_waitfor_purge_complete(vp);
302 			nfs4_flush_pages(vp, cr);
303 		} else {
304 			pgflush_t *args;
305 
306 			/*
307 			 * We don't hold r_statelock while creating the
308 			 * thread, in case the call blocks.  So we use a
309 			 * flag to indicate that a page flush thread is
310 			 * active.
311 			 */
312 			mutex_enter(&rp->r_statelock);
313 			if (rp->r_flags & R4PGFLUSH) {
314 				mutex_exit(&rp->r_statelock);
315 			} else {
316 				rp->r_flags |= R4PGFLUSH;
317 				mutex_exit(&rp->r_statelock);
318 
319 				args = kmem_alloc(sizeof (pgflush_t),
320 				    KM_SLEEP);
321 				args->vp = vp;
322 				VN_HOLD(args->vp);
323 				args->cr = cr;
324 				crhold(args->cr);
325 				(void) zthread_create(NULL, 0,
326 				    nfs4_pgflush_thread, args, 0,
327 				    minclsyspri);
328 			}
329 		}
330 	}
331 
332 	/*
333 	 * Flush the readdir response cache.
334 	 */
335 	nfs4_purge_rddir_cache(vp);
336 }
337 
338 /*
339  * Invalidate all pages for the given file, after writing back the dirty
340  * ones.
341  */
342 
343 void
344 nfs4_flush_pages(vnode_t *vp, cred_t *cr)
345 {
346 	int error;
347 	rnode4_t *rp = VTOR4(vp);
348 
349 	error = VOP_PUTPAGE(vp, (u_offset_t)0, 0, B_INVAL, cr, NULL);
350 	if (error == ENOSPC || error == EDQUOT) {
351 		mutex_enter(&rp->r_statelock);
352 		if (!rp->r_error)
353 			rp->r_error = error;
354 		mutex_exit(&rp->r_statelock);
355 	}
356 }
357 
358 /*
359  * Page flush thread.
360  */
361 
362 static void
363 nfs4_pgflush_thread(pgflush_t *args)
364 {
365 	rnode4_t *rp = VTOR4(args->vp);
366 
367 	/* remember which thread we are, so we don't deadlock ourselves */
368 	mutex_enter(&rp->r_statelock);
369 	ASSERT(rp->r_pgflush == NULL);
370 	rp->r_pgflush = curthread;
371 	mutex_exit(&rp->r_statelock);
372 
373 	nfs4_flush_pages(args->vp, args->cr);
374 
375 	mutex_enter(&rp->r_statelock);
376 	rp->r_pgflush = NULL;
377 	rp->r_flags &= ~R4PGFLUSH;
378 	cv_broadcast(&rp->r_cv);
379 	mutex_exit(&rp->r_statelock);
380 
381 	VN_RELE(args->vp);
382 	crfree(args->cr);
383 	kmem_free(args, sizeof (pgflush_t));
384 	zthread_exit();
385 }
386 
387 /*
388  * Purge the readdir cache of all entries which are not currently
389  * being filled.
390  */
391 void
392 nfs4_purge_rddir_cache(vnode_t *vp)
393 {
394 	rnode4_t *rp;
395 
396 	rp = VTOR4(vp);
397 
398 	mutex_enter(&rp->r_statelock);
399 	rp->r_direof = NULL;
400 	rp->r_flags &= ~R4LOOKUP;
401 	rp->r_flags |= R4READDIRWATTR;
402 	rddir4_cache_purge(rp);
403 	mutex_exit(&rp->r_statelock);
404 }
405 
406 /*
407  * Set attributes cache for given vnode using virtual attributes.  There is
408  * no cache validation, but if the attributes are deemed to be stale, they
409  * are ignored.  This corresponds to nfs3_attrcache().
410  *
411  * Set the timeout value on the attribute cache and fill it
412  * with the passed in attributes.
413  */
414 void
415 nfs4_attrcache_noinval(vnode_t *vp, nfs4_ga_res_t *garp, hrtime_t t)
416 {
417 	rnode4_t *rp = VTOR4(vp);
418 
419 	mutex_enter(&rp->r_statelock);
420 	if (rp->r_time_attr_saved <= t)
421 		nfs4_attrcache_va(vp, garp, FALSE);
422 	mutex_exit(&rp->r_statelock);
423 }
424 
/*
 * Use the passed in virtual attributes to check to see whether the
 * data and metadata caches are valid, cache the new attributes, and
 * then do the cache invalidation if required.
 *
 * The cache validation and caching of the new attributes is done
 * atomically via the use of the mutex, r_statelock.  If required,
 * the cache invalidation is done atomically w.r.t. the cache
 * validation and caching of the attributes via the pseudo lock,
 * r_serial.
 *
 * This routine is used to do cache validation and attributes caching
 * for operations with a single set of post operation attributes.
 *
 * Arguments:
 *	vp	file the attributes belong to
 *	garp	attributes to cache; n4g_va must be for this mount's fsid
 *	t	time the attributes were obtained; compared against
 *		r_time_attr_saved to detect attributes that are older
 *		than what is already cached
 *	cr	credentials passed on to nfs4_purge_caches()
 *	async	passed on to nfs4_purge_caches() as "asyncpg"; forced to 1
 *		when the caller is the recovery thread
 *	cinfo	before/after change info; non-NULL only for directory
 *		modifying callers
 *
 * The routine returns without caching the new attributes when:
 *   - the caller is the recovery thread and another thread owns r_serial
 *     (the attribute cache is purged instead);
 *   - the wait for r_serial is interrupted (nothing is changed);
 *   - a page flush thread other than the caller is active (the attribute
 *     cache is purged instead);
 *   - attributes have been saved after time t (the attribute cache is
 *     purged only if they are inconsistent with the passed in ones).
 */

void
nfs4_attr_cache(vnode_t *vp, nfs4_ga_res_t *garp,
    hrtime_t t, cred_t *cr, int async,
    change_info4 *cinfo)
{
	rnode4_t *rp;
	int mtime_changed = 0;	/* non-zero: purge the data caches */
	int ctime_changed = 0;	/* non-zero: purge access and ACL caches */
	vsecattr_t *vsp;
	int was_serial, set_time_cache_inval, recov;
	vattr_t *vap = &garp->n4g_va;
	mntinfo4_t *mi = VTOMI4(vp);
	len_t preattr_rsize;	/* r_size before the new attrs were cached */
	boolean_t writemodify_set = B_FALSE;
	boolean_t cachepurge_set = B_FALSE; /* we set R4INCACHEPURGE */

	ASSERT(mi->mi_vfsp->vfs_dev == garp->n4g_va.va_fsid);

	/* Is curthread the recovery thread? */
	mutex_enter(&mi->mi_lock);
	recov = (VTOMI4(vp)->mi_recovthread == curthread);
	mutex_exit(&mi->mi_lock);

	rp = VTOR4(vp);
	mutex_enter(&rp->r_statelock);
	/* was_serial: we already own r_serial, so must not release it below */
	was_serial = (rp->r_serial == curthread);
	if (rp->r_serial && !was_serial) {
		klwp_t *lwp = ttolwp(curthread);

		/*
		 * If we're the recovery thread, then purge current attrs
		 * and bail out to avoid potential deadlock between another
		 * thread caching attrs (r_serial thread), recov thread,
		 * and an async writer thread.
		 */
		if (recov) {
			PURGE_ATTRCACHE4_LOCKED(rp);
			mutex_exit(&rp->r_statelock);
			return;
		}

		/*
		 * Wait for the current r_serial owner to finish.  If the
		 * wait is interrupted, give up without caching anything.
		 */
		if (lwp != NULL)
			lwp->lwp_nostop++;
		while (rp->r_serial != NULL) {
			if (!cv_wait_sig(&rp->r_cv, &rp->r_statelock)) {
				mutex_exit(&rp->r_statelock);
				if (lwp != NULL)
					lwp->lwp_nostop--;
				return;
			}
		}
		if (lwp != NULL)
			lwp->lwp_nostop--;
	}

	/*
	 * If there is a page flush thread, the current thread needs to
	 * bail out, to prevent a possible deadlock between the current
	 * thread (which might be in a start_op/end_op region), the
	 * recovery thread, and the page flush thread.  Expire the
	 * attribute cache, so that any attributes the current thread was
	 * going to set are not lost.
	 */
	if ((rp->r_flags & R4PGFLUSH) && rp->r_pgflush != curthread) {
		PURGE_ATTRCACHE4_LOCKED(rp);
		mutex_exit(&rp->r_statelock);
		return;
	}

	if (rp->r_time_attr_saved > t) {
		/*
		 * Attributes have been cached since these attributes were
		 * probably made. If there is an inconsistency in what is
		 * cached, mark them invalid. If not, don't act on them.
		 */
		if (!CACHE4_VALID(rp, vap->va_mtime, vap->va_size))
			PURGE_ATTRCACHE4_LOCKED(rp);
		mutex_exit(&rp->r_statelock);
		return;
	}
	set_time_cache_inval = 0;
	if (cinfo) {
		/*
		 * Only directory modifying callers pass non-NULL cinfo.
		 */
		ASSERT(vp->v_type == VDIR);
		/*
		 * If the cache timeout either doesn't exist or hasn't expired,
		 * and dir didn't change on server before dirmod op
		 * and dir didn't change after dirmod op but before getattr
		 * then there's a chance that the client's cached data for
		 * this object is current (not stale).  No immediate cache
		 * flush is required.
		 *
		 */
		if ((! rp->r_time_cache_inval || t < rp->r_time_cache_inval) &&
		    cinfo->before == rp->r_change &&
		    (garp->n4g_change_valid &&
		    cinfo->after == garp->n4g_change)) {

			/*
			 * If atomic isn't set, then the before/after info
			 * cannot be blindly trusted.  For this case, we tell
			 * nfs4_attrcache_va to cache the attrs but also
			 * establish an absolute maximum cache timeout.  When
			 * the timeout is reached, caches will be flushed.
			 */
			if (! cinfo->atomic)
				set_time_cache_inval = 1;
		} else {

			/*
			 * We're not sure exactly what changed, but we know
			 * what to do.  flush all caches for dir.  remove the
			 * attr timeout.
			 *
			 * a) timeout expired.  flush all caches.
			 * b) r_change != cinfo.before.  flush all caches.
			 * c) r_change == cinfo.before, but cinfo.after !=
			 *    post-op getattr(change).  flush all caches.
			 * d) post-op getattr(change) not provided by server.
			 *    flush all caches.
			 */
			mtime_changed = 1;
			ctime_changed = 1;
			rp->r_time_cache_inval = 0;
		}
	} else {
		/*
		 * Write thread after writing data to file on remote server,
		 * will always set R4WRITEMODIFIED to indicate that file on
		 * remote server was modified with a WRITE operation and would
		 * have marked attribute cache as timed out. If R4WRITEMODIFIED
		 * is set, then do not check for mtime and ctime change.
		 */
		if (!(rp->r_flags & R4WRITEMODIFIED)) {
			if (!CACHE4_VALID(rp, vap->va_mtime, vap->va_size))
				mtime_changed = 1;

			if (rp->r_attr.va_ctime.tv_sec !=
			    vap->va_ctime.tv_sec ||
			    rp->r_attr.va_ctime.tv_nsec !=
			    vap->va_ctime.tv_nsec)
				ctime_changed = 1;

			/*
			 * If the change attribute was not provided by server
			 * or it differs, then flush all caches.
			 */
			if (!garp->n4g_change_valid ||
			    rp->r_change != garp->n4g_change) {
				mtime_changed = 1;
				ctime_changed = 1;
			}
		} else {
			writemodify_set = B_TRUE;
		}
	}

	/* remember r_size so a size update by nfs4_attrcache_va is noticed */
	preattr_rsize = rp->r_size;

	nfs4_attrcache_va(vp, garp, set_time_cache_inval);

	/*
	 * If we have updated filesize in nfs4_attrcache_va, as soon as we
	 * drop statelock we will be in transition of purging all
	 * our caches and updating them. It is possible for another
	 * thread to pick this new file size and read in zeroed data.
	 * stall other threads till cache purge is complete.
	 */
	if ((!cinfo) && (rp->r_size != preattr_rsize)) {
		/*
		 * If R4WRITEMODIFIED was set and we have updated the file
		 * size, Server's returned file size need not necessarily
		 * be because of this Client's WRITE. We need to purge
		 * all caches.
		 */
		if (writemodify_set)
			mtime_changed = 1;

		if (mtime_changed && !(rp->r_flags & R4INCACHEPURGE)) {
			rp->r_flags |= R4INCACHEPURGE;
			cachepurge_set = B_TRUE;
		}
	}

	/* nothing changed: the new attributes are cached and we are done */
	if (!mtime_changed && !ctime_changed) {
		mutex_exit(&rp->r_statelock);
		return;
	}

	/*
	 * Take the r_serial pseudo lock so that the invalidation below is
	 * serialized against other threads caching attributes.
	 */
	rp->r_serial = curthread;

	mutex_exit(&rp->r_statelock);

	/*
	 * If we're the recov thread, then force async nfs4_purge_caches
	 * to avoid potential deadlock.
	 */
	if (mtime_changed)
		nfs4_purge_caches(vp, NFS4_NOPURGE_DNLC, cr, recov ? 1 : async);

	/* only the thread that set R4INCACHEPURGE clears it again */
	if ((rp->r_flags & R4INCACHEPURGE) && cachepurge_set) {
		mutex_enter(&rp->r_statelock);
		rp->r_flags &= ~R4INCACHEPURGE;
		cv_broadcast(&rp->r_cv);
		mutex_exit(&rp->r_statelock);
		cachepurge_set = B_FALSE;
	}

	if (ctime_changed) {
		(void) nfs4_access_purge_rp(rp);
		if (rp->r_secattr != NULL) {
			/*
			 * Detach the cached ACL under the lock and free it
			 * afterwards; r_secattr is re-read under the lock
			 * because the check above was made without it.
			 */
			mutex_enter(&rp->r_statelock);
			vsp = rp->r_secattr;
			rp->r_secattr = NULL;
			mutex_exit(&rp->r_statelock);
			if (vsp != NULL)
				nfs4_acl_free_cache(vsp);
		}
	}

	/*
	 * Release r_serial and wake waiters, unless this thread already
	 * owned r_serial on entry; in that case it is left set.
	 */
	if (!was_serial) {
		mutex_enter(&rp->r_statelock);
		rp->r_serial = NULL;
		cv_broadcast(&rp->r_cv);
		mutex_exit(&rp->r_statelock);
	}
}
669 
670 /*
671  * Set attributes cache for given vnode using virtual attributes.
672  *
673  * Set the timeout value on the attribute cache and fill it
674  * with the passed in attributes.
675  *
676  * The caller must be holding r_statelock.
677  */
678 static void
679 nfs4_attrcache_va(vnode_t *vp, nfs4_ga_res_t *garp, int set_cache_timeout)
680 {
681 	rnode4_t *rp;
682 	mntinfo4_t *mi;
683 	hrtime_t delta;
684 	hrtime_t now;
685 	vattr_t *vap = &garp->n4g_va;
686 
687 	rp = VTOR4(vp);
688 
689 	ASSERT(MUTEX_HELD(&rp->r_statelock));
690 	ASSERT(vap->va_mask == AT_ALL);
691 
692 	/* Switch to master before checking v_flag */
693 	if (IS_SHADOW(vp, rp))
694 		vp = RTOV4(rp);
695 
696 	now = gethrtime();
697 
698 	mi = VTOMI4(vp);
699 
700 	/*
701 	 * Only establish a new cache timeout (if requested).  Never
702 	 * extend a timeout.  Never clear a timeout.  Clearing a timeout
703 	 * is done by nfs4_update_dircaches (ancestor in our call chain)
704 	 */
705 	if (set_cache_timeout && ! rp->r_time_cache_inval)
706 		rp->r_time_cache_inval = now + mi->mi_acdirmax;
707 
708 	/*
709 	 * Delta is the number of nanoseconds that we will
710 	 * cache the attributes of the file.  It is based on
711 	 * the number of nanoseconds since the last time that
712 	 * we detected a change.  The assumption is that files
713 	 * that changed recently are likely to change again.
714 	 * There is a minimum and a maximum for regular files
715 	 * and for directories which is enforced though.
716 	 *
717 	 * Using the time since last change was detected
718 	 * eliminates direct comparison or calculation
719 	 * using mixed client and server times.  NFS does
720 	 * not make any assumptions regarding the client
721 	 * and server clocks being synchronized.
722 	 */
723 	if (vap->va_mtime.tv_sec != rp->r_attr.va_mtime.tv_sec ||
724 	    vap->va_mtime.tv_nsec != rp->r_attr.va_mtime.tv_nsec ||
725 	    vap->va_size != rp->r_attr.va_size) {
726 		rp->r_time_attr_saved = now;
727 	}
728 
729 	if ((mi->mi_flags & MI4_NOAC) || (vp->v_flag & VNOCACHE))
730 		delta = 0;
731 	else {
732 		delta = now - rp->r_time_attr_saved;
733 		if (vp->v_type == VDIR) {
734 			if (delta < mi->mi_acdirmin)
735 				delta = mi->mi_acdirmin;
736 			else if (delta > mi->mi_acdirmax)
737 				delta = mi->mi_acdirmax;
738 		} else {
739 			if (delta < mi->mi_acregmin)
740 				delta = mi->mi_acregmin;
741 			else if (delta > mi->mi_acregmax)
742 				delta = mi->mi_acregmax;
743 		}
744 	}
745 	rp->r_time_attr_inval = now + delta;
746 
747 	rp->r_attr = *vap;
748 	if (garp->n4g_change_valid)
749 		rp->r_change = garp->n4g_change;
750 
751 	/*
752 	 * The attributes that were returned may be valid and can
753 	 * be used, but they may not be allowed to be cached.
754 	 * Reset the timers to cause immediate invalidation and
755 	 * clear r_change so no VERIFY operations will suceed
756 	 */
757 	if (garp->n4g_attrwhy == NFS4_GETATTR_NOCACHE_OK) {
758 		rp->r_time_attr_inval = now;
759 		rp->r_time_attr_saved = now;
760 		rp->r_change = 0;
761 	}
762 
763 	/*
764 	 * If mounted_on_fileid returned AND the object is a stub,
765 	 * then set object's va_nodeid to the mounted over fid
766 	 * returned by server.
767 	 *
768 	 * If mounted_on_fileid not provided/supported, then
769 	 * just set it to 0 for now.  Eventually it would be
770 	 * better to set it to a hashed version of FH.  This
771 	 * would probably be good enough to provide a unique
772 	 * fid/d_ino within a dir.
773 	 *
774 	 * We don't need to carry mounted_on_fileid in the
775 	 * rnode as long as the client never requests fileid
776 	 * without also requesting mounted_on_fileid.  For
777 	 * now, it stays.
778 	 */
779 	if (garp->n4g_mon_fid_valid) {
780 		rp->r_mntd_fid = garp->n4g_mon_fid;
781 
782 		if (RP_ISSTUB(rp))
783 			rp->r_attr.va_nodeid = rp->r_mntd_fid;
784 	}
785 
786 	/*
787 	 * Check to see if there are valid pathconf bits to
788 	 * cache in the rnode.
789 	 */
790 	if (garp->n4g_ext_res) {
791 		if (garp->n4g_ext_res->n4g_pc4.pc4_cache_valid) {
792 			rp->r_pathconf = garp->n4g_ext_res->n4g_pc4;
793 		} else {
794 			if (garp->n4g_ext_res->n4g_pc4.pc4_xattr_valid) {
795 				rp->r_pathconf.pc4_xattr_valid = TRUE;
796 				rp->r_pathconf.pc4_xattr_exists =
797 				    garp->n4g_ext_res->n4g_pc4.pc4_xattr_exists;
798 			}
799 		}
800 	}
801 	/*
802 	 * Update the size of the file if there is no cached data or if
803 	 * the cached data is clean and there is no data being written
804 	 * out.
805 	 */
806 	if (rp->r_size != vap->va_size &&
807 	    (!vn_has_cached_data(vp) ||
808 	    (!(rp->r_flags & R4DIRTY) && rp->r_count == 0))) {
809 		rp->r_size = vap->va_size;
810 	}
811 	nfs_setswaplike(vp, vap);
812 	rp->r_flags &= ~R4WRITEMODIFIED;
813 }
814 
815 /*
816  * Get attributes over-the-wire and update attributes cache
817  * if no error occurred in the over-the-wire operation.
818  * Return 0 if successful, otherwise error.
819  */
820 int
821 nfs4_getattr_otw(vnode_t *vp, nfs4_ga_res_t *garp, cred_t *cr, int get_acl)
822 {
823 	mntinfo4_t *mi = VTOMI4(vp);
824 	hrtime_t t;
825 	nfs4_recov_state_t recov_state;
826 	nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };
827 
828 	recov_state.rs_flags = 0;
829 	recov_state.rs_num_retry_despite_err = 0;
830 
831 	/* Save the original mount point security flavor */
832 	(void) save_mnt_secinfo(mi->mi_curr_serv);
833 
834 recov_retry:
835 
836 	if ((e.error = nfs4_start_fop(mi, vp, NULL, OH_GETATTR,
837 	    &recov_state, NULL))) {
838 		(void) check_mnt_secinfo(mi->mi_curr_serv, vp);
839 		return (e.error);
840 	}
841 
842 	t = gethrtime();
843 
844 	nfs4_getattr_otw_norecovery(vp, garp, &e, cr, get_acl);
845 
846 	if (nfs4_needs_recovery(&e, FALSE, vp->v_vfsp)) {
847 		if (nfs4_start_recovery(&e, VTOMI4(vp), vp, NULL, NULL,
848 		    NULL, OP_GETATTR, NULL, NULL, NULL) == FALSE)  {
849 			nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_GETATTR,
850 			    &recov_state, 1);
851 			goto recov_retry;
852 		}
853 	}
854 
855 	nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_GETATTR, &recov_state, 0);
856 
857 	if (!e.error) {
858 		if (e.stat == NFS4_OK) {
859 			nfs4_attr_cache(vp, garp, t, cr, FALSE, NULL);
860 		} else {
861 			e.error = geterrno4(e.stat);
862 
863 			nfs4_purge_stale_fh(e.error, vp, cr);
864 		}
865 	}
866 
867 	/*
868 	 * If getattr a node that is a stub for a crossed
869 	 * mount point, keep the original secinfo flavor for
870 	 * the current file system, not the crossed one.
871 	 */
872 	(void) check_mnt_secinfo(mi->mi_curr_serv, vp);
873 
874 	return (e.error);
875 }
876 
877 /*
878  * Generate a compound to get attributes over-the-wire.
879  */
880 void
881 nfs4_getattr_otw_norecovery(vnode_t *vp, nfs4_ga_res_t *garp,
882     nfs4_error_t *ep, cred_t *cr, int get_acl)
883 {
884 	COMPOUND4args_clnt args;
885 	COMPOUND4res_clnt res;
886 	int doqueue;
887 	rnode4_t *rp = VTOR4(vp);
888 	nfs_argop4 argop[2];
889 
890 	args.ctag = TAG_GETATTR;
891 
892 	args.array_len = 2;
893 	args.array = argop;
894 
895 	/* putfh */
896 	argop[0].argop = OP_CPUTFH;
897 	argop[0].nfs_argop4_u.opcputfh.sfh = rp->r_fh;
898 
899 	/* getattr */
900 	/*
901 	 * Unlike nfs version 2 and 3, where getattr returns all the
902 	 * attributes, nfs version 4 returns only the ones explicitly
903 	 * asked for. This creates problems, as some system functions
904 	 * (e.g. cache check) require certain attributes and if the
905 	 * cached node lacks some attributes such as uid/gid, it can
906 	 * affect system utilities (e.g. "ls") that rely on the information
907 	 * to be there. This can lead to anything from system crashes to
908 	 * corrupted information processed by user apps.
909 	 * So to ensure that all bases are covered, request at least
910 	 * the AT_ALL attribute mask.
911 	 */
912 	argop[1].argop = OP_GETATTR;
913 	argop[1].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK;
914 	if (get_acl)
915 		argop[1].nfs_argop4_u.opgetattr.attr_request |= FATTR4_ACL_MASK;
916 	argop[1].nfs_argop4_u.opgetattr.mi = VTOMI4(vp);
917 
918 	doqueue = 1;
919 
920 	rfs4call(VTOMI4(vp), &args, &res, cr, &doqueue, 0, ep);
921 
922 	if (ep->error)
923 		return;
924 
925 	if (res.status != NFS4_OK) {
926 		(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
927 		return;
928 	}
929 
930 	*garp = res.array[1].nfs_resop4_u.opgetattr.ga_res;
931 
932 	(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
933 }
934 
935 /*
936  * Return either cached or remote attributes. If get remote attr
937  * use them to check and invalidate caches, then cache the new attributes.
938  */
939 int
940 nfs4getattr(vnode_t *vp, vattr_t *vap, cred_t *cr)
941 {
942 	int error;
943 	rnode4_t *rp;
944 	nfs4_ga_res_t gar;
945 
946 	ASSERT(nfs4_consistent_type(vp));
947 
948 	/*
949 	 * If we've got cached attributes, we're done, otherwise go
950 	 * to the server to get attributes, which will update the cache
951 	 * in the process. Either way, use the cached attributes for
952 	 * the caller's vattr_t.
953 	 *
954 	 * Note that we ignore the gar set by the OTW call: the attr caching
955 	 * code may make adjustments when storing to the rnode, and we want
956 	 * to see those changes here.
957 	 */
958 	rp = VTOR4(vp);
959 	error = 0;
960 	mutex_enter(&rp->r_statelock);
961 	if (!ATTRCACHE4_VALID(vp)) {
962 		mutex_exit(&rp->r_statelock);
963 		error = nfs4_getattr_otw(vp, &gar, cr, 0);
964 		mutex_enter(&rp->r_statelock);
965 	}
966 
967 	if (!error)
968 		*vap = rp->r_attr;
969 
970 	/* Return the client's view of file size */
971 	vap->va_size = rp->r_size;
972 
973 	mutex_exit(&rp->r_statelock);
974 
975 	ASSERT(nfs4_consistent_type(vp));
976 
977 	return (error);
978 }
979 
980 int
981 nfs4_attr_otw(vnode_t *vp, nfs4_tag_type_t tag_type,
982     nfs4_ga_res_t *garp, bitmap4 reqbitmap, cred_t *cr)
983 {
984 	COMPOUND4args_clnt args;
985 	COMPOUND4res_clnt res;
986 	int doqueue;
987 	nfs_argop4 argop[2];
988 	mntinfo4_t *mi = VTOMI4(vp);
989 	bool_t needrecov = FALSE;
990 	nfs4_recov_state_t recov_state;
991 	nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };
992 	nfs4_ga_ext_res_t *gerp;
993 
994 	recov_state.rs_flags = 0;
995 	recov_state.rs_num_retry_despite_err = 0;
996 
997 recov_retry:
998 	args.ctag = tag_type;
999 
1000 	args.array_len = 2;
1001 	args.array = argop;
1002 
1003 	e.error = nfs4_start_fop(mi, vp, NULL, OH_GETATTR, &recov_state, NULL);
1004 	if (e.error)
1005 		return (e.error);
1006 
1007 	/* putfh */
1008 	argop[0].argop = OP_CPUTFH;
1009 	argop[0].nfs_argop4_u.opcputfh.sfh = VTOR4(vp)->r_fh;
1010 
1011 	/* getattr */
1012 	argop[1].argop = OP_GETATTR;
1013 	argop[1].nfs_argop4_u.opgetattr.attr_request = reqbitmap;
1014 	argop[1].nfs_argop4_u.opgetattr.mi = mi;
1015 
1016 	doqueue = 1;
1017 
1018 	NFS4_DEBUG(nfs4_client_call_debug, (CE_NOTE,
1019 	    "nfs4_attr_otw: %s call, rp %s", needrecov ? "recov" : "first",
1020 	    rnode4info(VTOR4(vp))));
1021 
1022 	rfs4call(mi, &args, &res, cr, &doqueue, 0, &e);
1023 
1024 	needrecov = nfs4_needs_recovery(&e, FALSE, vp->v_vfsp);
1025 	if (!needrecov && e.error) {
1026 		nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_GETATTR, &recov_state,
1027 		    needrecov);
1028 		return (e.error);
1029 	}
1030 
1031 	if (needrecov) {
1032 		bool_t abort;
1033 
1034 		NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
1035 		    "nfs4_attr_otw: initiating recovery\n"));
1036 
1037 		abort = nfs4_start_recovery(&e, VTOMI4(vp), vp, NULL, NULL,
1038 		    NULL, OP_GETATTR, NULL, NULL, NULL);
1039 		nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_GETATTR, &recov_state,
1040 		    needrecov);
1041 		if (!e.error) {
1042 			(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
1043 			e.error = geterrno4(res.status);
1044 		}
1045 		if (abort == FALSE)
1046 			goto recov_retry;
1047 		return (e.error);
1048 	}
1049 
1050 	if (res.status) {
1051 		e.error = geterrno4(res.status);
1052 	} else {
1053 		gerp = garp->n4g_ext_res;
1054 		bcopy(&res.array[1].nfs_resop4_u.opgetattr.ga_res,
1055 		    garp, sizeof (nfs4_ga_res_t));
1056 		garp->n4g_ext_res = gerp;
1057 		if (garp->n4g_ext_res &&
1058 		    res.array[1].nfs_resop4_u.opgetattr.ga_res.n4g_ext_res)
1059 			bcopy(res.array[1].nfs_resop4_u.opgetattr.
1060 			    ga_res.n4g_ext_res,
1061 			    garp->n4g_ext_res, sizeof (nfs4_ga_ext_res_t));
1062 	}
1063 	(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
1064 	nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_GETATTR, &recov_state,
1065 	    needrecov);
1066 	return (e.error);
1067 }
1068 
1069 /*
1070  * Asynchronous I/O parameters.  nfs_async_threads is the high-water mark
1071  * for the demand-based allocation of async threads per-mount.  The
1072  * nfs_async_timeout is the amount of time a thread will live after it
1073  * becomes idle, unless new I/O requests are received before the thread
1074  * dies.  See nfs4_async_putpage and nfs4_async_start.
1075  */
1076 
/* Forward declarations for the async worker-thread entry points. */
static void	nfs4_async_common_start(struct vfs *vfsp, int async_queue);
static void	nfs4_async_start(struct vfs *vfsp);
static void	nfs4_async_pgops_start(struct vfs *vfsp);
1080 
1081 static void
1082 free_async_args4(struct nfs4_async_reqs *args)
1083 {
1084 	rnode4_t *rp;
1085 
1086 	if (args->a_io != NFS4_INACTIVE) {
1087 		rp = VTOR4(args->a_vp);
1088 		mutex_enter(&rp->r_statelock);
1089 		rp->r_count--;
1090 		if (args->a_io == NFS4_PUTAPAGE ||
1091 		    args->a_io == NFS4_PAGEIO)
1092 			rp->r_awcount--;
1093 		cv_broadcast(&rp->r_cv);
1094 		mutex_exit(&rp->r_statelock);
1095 		VN_RELE(args->a_vp);
1096 	}
1097 	crfree(args->a_cred);
1098 	kmem_free(args, sizeof (*args));
1099 }
1100 
1101 /*
1102  * Cross-zone thread creation and NFS access is disallowed, yet fsflush() and
1103  * pageout(), running in the global zone, have legitimate reasons to do
1104  * VOP_PUTPAGE(B_ASYNC) on other zones' NFS mounts.  We avoid the problem by
1105  * use of a a per-mount "asynchronous requests manager thread" which is
1106  * signaled by the various asynchronous work routines when there is
1107  * asynchronous work to be done.  It is responsible for creating new
1108  * worker threads if necessary, and notifying existing worker threads
1109  * that there is work to be done.
1110  *
1111  * In other words, it will "take the specifications from the customers and
1112  * give them to the engineers."
1113  *
1114  * Worker threads die off of their own accord if they are no longer
1115  * needed.
1116  *
1117  * This thread is killed when the zone is going away or the filesystem
1118  * is being unmounted.
1119  */
1120 void
1121 nfs4_async_manager(vfs_t *vfsp)
1122 {
1123 	callb_cpr_t cprinfo;
1124 	mntinfo4_t *mi;
1125 	uint_t max_threads;
1126 
1127 	mi = VFTOMI4(vfsp);
1128 
1129 	CALLB_CPR_INIT(&cprinfo, &mi->mi_async_lock, callb_generic_cpr,
1130 	    "nfs4_async_manager");
1131 
1132 	mutex_enter(&mi->mi_async_lock);
1133 	/*
1134 	 * We want to stash the max number of threads that this mount was
1135 	 * allowed so we can use it later when the variable is set to zero as
1136 	 * part of the zone/mount going away.
1137 	 *
1138 	 * We want to be able to create at least one thread to handle
1139 	 * asynchronous inactive calls.
1140 	 */
1141 	max_threads = MAX(mi->mi_max_threads, 1);
1142 	/*
1143 	 * We don't want to wait for mi_max_threads to go to zero, since that
1144 	 * happens as part of a failed unmount, but this thread should only
1145 	 * exit when the mount is really going away.
1146 	 *
1147 	 * Once MI4_ASYNC_MGR_STOP is set, no more async operations will be
1148 	 * attempted: the various _async_*() functions know to do things
1149 	 * inline if mi_max_threads == 0.  Henceforth we just drain out the
1150 	 * outstanding requests.
1151 	 *
1152 	 * Note that we still create zthreads even if we notice the zone is
1153 	 * shutting down (MI4_ASYNC_MGR_STOP is set); this may cause the zone
1154 	 * shutdown sequence to take slightly longer in some cases, but
1155 	 * doesn't violate the protocol, as all threads will exit as soon as
1156 	 * they're done processing the remaining requests.
1157 	 */
1158 	for (;;) {
1159 		while (mi->mi_async_req_count > 0) {
1160 			/*
1161 			 * Paranoia: If the mount started out having
1162 			 * (mi->mi_max_threads == 0), and the value was
1163 			 * later changed (via a debugger or somesuch),
1164 			 * we could be confused since we will think we
1165 			 * can't create any threads, and the calling
1166 			 * code (which looks at the current value of
1167 			 * mi->mi_max_threads, now non-zero) thinks we
1168 			 * can.
1169 			 *
1170 			 * So, because we're paranoid, we create threads
1171 			 * up to the maximum of the original and the
1172 			 * current value. This means that future
1173 			 * (debugger-induced) alterations of
1174 			 * mi->mi_max_threads are ignored for our
1175 			 * purposes, but who told them they could change
1176 			 * random values on a live kernel anyhow?
1177 			 */
1178 			if (mi->mi_threads[NFS4_ASYNC_QUEUE] <
1179 			    MAX(mi->mi_max_threads, max_threads)) {
1180 				mi->mi_threads[NFS4_ASYNC_QUEUE]++;
1181 				mutex_exit(&mi->mi_async_lock);
1182 				MI4_HOLD(mi);
1183 				VFS_HOLD(vfsp);	/* hold for new thread */
1184 				(void) zthread_create(NULL, 0, nfs4_async_start,
1185 				    vfsp, 0, minclsyspri);
1186 				mutex_enter(&mi->mi_async_lock);
1187 			} else if (mi->mi_threads[NFS4_ASYNC_PGOPS_QUEUE] <
1188 			    NUM_ASYNC_PGOPS_THREADS) {
1189 				mi->mi_threads[NFS4_ASYNC_PGOPS_QUEUE]++;
1190 				mutex_exit(&mi->mi_async_lock);
1191 				MI4_HOLD(mi);
1192 				VFS_HOLD(vfsp); /* hold for new thread */
1193 				(void) zthread_create(NULL, 0,
1194 				    nfs4_async_pgops_start, vfsp, 0,
1195 				    minclsyspri);
1196 				mutex_enter(&mi->mi_async_lock);
1197 			}
1198 			NFS4_WAKE_ASYNC_WORKER(mi->mi_async_work_cv);
1199 			ASSERT(mi->mi_async_req_count != 0);
1200 			mi->mi_async_req_count--;
1201 		}
1202 
1203 		mutex_enter(&mi->mi_lock);
1204 		if (mi->mi_flags & MI4_ASYNC_MGR_STOP) {
1205 			mutex_exit(&mi->mi_lock);
1206 			break;
1207 		}
1208 		mutex_exit(&mi->mi_lock);
1209 
1210 		CALLB_CPR_SAFE_BEGIN(&cprinfo);
1211 		cv_wait(&mi->mi_async_reqs_cv, &mi->mi_async_lock);
1212 		CALLB_CPR_SAFE_END(&cprinfo, &mi->mi_async_lock);
1213 	}
1214 
1215 	NFS4_DEBUG(nfs4_client_zone_debug, (CE_NOTE,
1216 	    "nfs4_async_manager exiting for vfs %p\n", (void *)mi->mi_vfsp));
1217 	/*
1218 	 * Let everyone know we're done.
1219 	 */
1220 	mi->mi_manager_thread = NULL;
1221 	/*
1222 	 * Wake up the inactive thread.
1223 	 */
1224 	cv_broadcast(&mi->mi_inact_req_cv);
1225 	/*
1226 	 * Wake up anyone sitting in nfs4_async_manager_stop()
1227 	 */
1228 	cv_broadcast(&mi->mi_async_cv);
1229 	/*
1230 	 * There is no explicit call to mutex_exit(&mi->mi_async_lock)
1231 	 * since CALLB_CPR_EXIT is actually responsible for releasing
1232 	 * 'mi_async_lock'.
1233 	 */
1234 	CALLB_CPR_EXIT(&cprinfo);
1235 	VFS_RELE(vfsp);	/* release thread's hold */
1236 	MI4_RELE(mi);
1237 	zthread_exit();
1238 }
1239 
1240 /*
1241  * Signal (and wait for) the async manager thread to clean up and go away.
1242  */
1243 void
1244 nfs4_async_manager_stop(vfs_t *vfsp)
1245 {
1246 	mntinfo4_t *mi = VFTOMI4(vfsp);
1247 
1248 	mutex_enter(&mi->mi_async_lock);
1249 	mutex_enter(&mi->mi_lock);
1250 	mi->mi_flags |= MI4_ASYNC_MGR_STOP;
1251 	mutex_exit(&mi->mi_lock);
1252 	cv_broadcast(&mi->mi_async_reqs_cv);
1253 	/*
1254 	 * Wait for the async manager thread to die.
1255 	 */
1256 	while (mi->mi_manager_thread != NULL)
1257 		cv_wait(&mi->mi_async_cv, &mi->mi_async_lock);
1258 	mutex_exit(&mi->mi_async_lock);
1259 }
1260 
1261 int
1262 nfs4_async_readahead(vnode_t *vp, u_offset_t blkoff, caddr_t addr,
1263     struct seg *seg, cred_t *cr, void (*readahead)(vnode_t *,
1264     u_offset_t, caddr_t, struct seg *, cred_t *))
1265 {
1266 	rnode4_t *rp;
1267 	mntinfo4_t *mi;
1268 	struct nfs4_async_reqs *args;
1269 
1270 	rp = VTOR4(vp);
1271 	ASSERT(rp->r_freef == NULL);
1272 
1273 	mi = VTOMI4(vp);
1274 
1275 	/*
1276 	 * If addr falls in a different segment, don't bother doing readahead.
1277 	 */
1278 	if (addr >= seg->s_base + seg->s_size)
1279 		return (-1);
1280 
1281 	/*
1282 	 * If we can't allocate a request structure, punt on the readahead.
1283 	 */
1284 	if ((args = kmem_alloc(sizeof (*args), KM_NOSLEEP)) == NULL)
1285 		return (-1);
1286 
1287 	/*
1288 	 * If a lock operation is pending, don't initiate any new
1289 	 * readaheads.  Otherwise, bump r_count to indicate the new
1290 	 * asynchronous I/O.
1291 	 */
1292 	if (!nfs_rw_tryenter(&rp->r_lkserlock, RW_READER)) {
1293 		kmem_free(args, sizeof (*args));
1294 		return (-1);
1295 	}
1296 	mutex_enter(&rp->r_statelock);
1297 	rp->r_count++;
1298 	mutex_exit(&rp->r_statelock);
1299 	nfs_rw_exit(&rp->r_lkserlock);
1300 
1301 	args->a_next = NULL;
1302 #ifdef DEBUG
1303 	args->a_queuer = curthread;
1304 #endif
1305 	VN_HOLD(vp);
1306 	args->a_vp = vp;
1307 	ASSERT(cr != NULL);
1308 	crhold(cr);
1309 	args->a_cred = cr;
1310 	args->a_io = NFS4_READ_AHEAD;
1311 	args->a_nfs4_readahead = readahead;
1312 	args->a_nfs4_blkoff = blkoff;
1313 	args->a_nfs4_seg = seg;
1314 	args->a_nfs4_addr = addr;
1315 
1316 	mutex_enter(&mi->mi_async_lock);
1317 
1318 	/*
1319 	 * If asyncio has been disabled, don't bother readahead.
1320 	 */
1321 	if (mi->mi_max_threads == 0) {
1322 		mutex_exit(&mi->mi_async_lock);
1323 		goto noasync;
1324 	}
1325 
1326 	/*
1327 	 * Link request structure into the async list and
1328 	 * wakeup async thread to do the i/o.
1329 	 */
1330 	if (mi->mi_async_reqs[NFS4_READ_AHEAD] == NULL) {
1331 		mi->mi_async_reqs[NFS4_READ_AHEAD] = args;
1332 		mi->mi_async_tail[NFS4_READ_AHEAD] = args;
1333 	} else {
1334 		mi->mi_async_tail[NFS4_READ_AHEAD]->a_next = args;
1335 		mi->mi_async_tail[NFS4_READ_AHEAD] = args;
1336 	}
1337 
1338 	if (mi->mi_io_kstats) {
1339 		mutex_enter(&mi->mi_lock);
1340 		kstat_waitq_enter(KSTAT_IO_PTR(mi->mi_io_kstats));
1341 		mutex_exit(&mi->mi_lock);
1342 	}
1343 
1344 	mi->mi_async_req_count++;
1345 	ASSERT(mi->mi_async_req_count != 0);
1346 	cv_signal(&mi->mi_async_reqs_cv);
1347 	mutex_exit(&mi->mi_async_lock);
1348 	return (0);
1349 
1350 noasync:
1351 	mutex_enter(&rp->r_statelock);
1352 	rp->r_count--;
1353 	cv_broadcast(&rp->r_cv);
1354 	mutex_exit(&rp->r_statelock);
1355 	VN_RELE(vp);
1356 	crfree(cr);
1357 	kmem_free(args, sizeof (*args));
1358 	return (-1);
1359 }
1360 
1361 static void
1362 nfs4_async_start(struct vfs *vfsp)
1363 {
1364 	nfs4_async_common_start(vfsp, NFS4_ASYNC_QUEUE);
1365 }
1366 
1367 static void
1368 nfs4_async_pgops_start(struct vfs *vfsp)
1369 {
1370 	nfs4_async_common_start(vfsp, NFS4_ASYNC_PGOPS_QUEUE);
1371 }
1372 
1373 /*
1374  * The async queues for each mounted file system are arranged as a
1375  * set of queues, one for each async i/o type.  Requests are taken
1376  * from the queues in a round-robin fashion.  A number of consecutive
1377  * requests are taken from each queue before moving on to the next
1378  * queue.  This functionality may allow the NFS Version 2 server to do
1379  * write clustering, even if the client is mixing writes and reads
1380  * because it will take multiple write requests from the queue
1381  * before processing any of the other async i/o types.
1382  *
1383  * XXX The nfs4_async_common_start thread is unsafe in the light of the present
1384  * model defined by cpr to suspend the system. Specifically over the
1385  * wire calls are cpr-unsafe. The thread should be reevaluated in
1386  * case of future updates to the cpr model.
1387  */
1388 static void
1389 nfs4_async_common_start(struct vfs *vfsp, int async_queue)
1390 {
1391 	struct nfs4_async_reqs *args;
1392 	mntinfo4_t *mi = VFTOMI4(vfsp);
1393 	clock_t time_left = 1;
1394 	callb_cpr_t cprinfo;
1395 	int i;
1396 	extern int nfs_async_timeout;
1397 	int async_types;
1398 	kcondvar_t *async_work_cv;
1399 
1400 	if (async_queue == NFS4_ASYNC_QUEUE) {
1401 		async_types = NFS4_ASYNC_TYPES;
1402 		async_work_cv = &mi->mi_async_work_cv[NFS4_ASYNC_QUEUE];
1403 	} else {
1404 		async_types = NFS4_ASYNC_PGOPS_TYPES;
1405 		async_work_cv = &mi->mi_async_work_cv[NFS4_ASYNC_PGOPS_QUEUE];
1406 	}
1407 
1408 	/*
1409 	 * Dynamic initialization of nfs_async_timeout to allow nfs to be
1410 	 * built in an implementation independent manner.
1411 	 */
1412 	if (nfs_async_timeout == -1)
1413 		nfs_async_timeout = NFS_ASYNC_TIMEOUT;
1414 
1415 	CALLB_CPR_INIT(&cprinfo, &mi->mi_async_lock, callb_generic_cpr, "nas");
1416 
1417 	mutex_enter(&mi->mi_async_lock);
1418 	for (;;) {
1419 		/*
1420 		 * Find the next queue containing an entry.  We start
1421 		 * at the current queue pointer and then round robin
1422 		 * through all of them until we either find a non-empty
1423 		 * queue or have looked through all of them.
1424 		 */
1425 		for (i = 0; i < async_types; i++) {
1426 			args = *mi->mi_async_curr[async_queue];
1427 			if (args != NULL)
1428 				break;
1429 			mi->mi_async_curr[async_queue]++;
1430 			if (mi->mi_async_curr[async_queue] ==
1431 			    &mi->mi_async_reqs[async_types]) {
1432 				mi->mi_async_curr[async_queue] =
1433 				    &mi->mi_async_reqs[0];
1434 			}
1435 		}
1436 		/*
1437 		 * If we didn't find a entry, then block until woken up
1438 		 * again and then look through the queues again.
1439 		 */
1440 		if (args == NULL) {
1441 			/*
1442 			 * Exiting is considered to be safe for CPR as well
1443 			 */
1444 			CALLB_CPR_SAFE_BEGIN(&cprinfo);
1445 
1446 			/*
1447 			 * Wakeup thread waiting to unmount the file
1448 			 * system only if all async threads are inactive.
1449 			 *
1450 			 * If we've timed-out and there's nothing to do,
1451 			 * then get rid of this thread.
1452 			 */
1453 			if (mi->mi_max_threads == 0 || time_left <= 0) {
1454 				--mi->mi_threads[async_queue];
1455 
1456 				if (mi->mi_threads[NFS4_ASYNC_QUEUE] == 0 &&
1457 				    mi->mi_threads[NFS4_ASYNC_PGOPS_QUEUE] == 0)
1458 					cv_signal(&mi->mi_async_cv);
1459 				CALLB_CPR_EXIT(&cprinfo);
1460 				VFS_RELE(vfsp);	/* release thread's hold */
1461 				MI4_RELE(mi);
1462 				zthread_exit();
1463 				/* NOTREACHED */
1464 			}
1465 			time_left = cv_reltimedwait(async_work_cv,
1466 			    &mi->mi_async_lock, nfs_async_timeout,
1467 			    TR_CLOCK_TICK);
1468 
1469 			CALLB_CPR_SAFE_END(&cprinfo, &mi->mi_async_lock);
1470 
1471 			continue;
1472 		} else {
1473 			time_left = 1;
1474 		}
1475 
1476 		/*
1477 		 * Remove the request from the async queue and then
1478 		 * update the current async request queue pointer.  If
1479 		 * the current queue is empty or we have removed enough
1480 		 * consecutive entries from it, then reset the counter
1481 		 * for this queue and then move the current pointer to
1482 		 * the next queue.
1483 		 */
1484 		*mi->mi_async_curr[async_queue] = args->a_next;
1485 		if (*mi->mi_async_curr[async_queue] == NULL ||
1486 		    --mi->mi_async_clusters[args->a_io] == 0) {
1487 			mi->mi_async_clusters[args->a_io] =
1488 			    mi->mi_async_init_clusters;
1489 			mi->mi_async_curr[async_queue]++;
1490 			if (mi->mi_async_curr[async_queue] ==
1491 			    &mi->mi_async_reqs[async_types]) {
1492 				mi->mi_async_curr[async_queue] =
1493 				    &mi->mi_async_reqs[0];
1494 			}
1495 		}
1496 
1497 		if (args->a_io != NFS4_INACTIVE && mi->mi_io_kstats) {
1498 			mutex_enter(&mi->mi_lock);
1499 			kstat_waitq_exit(KSTAT_IO_PTR(mi->mi_io_kstats));
1500 			mutex_exit(&mi->mi_lock);
1501 		}
1502 
1503 		mutex_exit(&mi->mi_async_lock);
1504 
1505 		/*
1506 		 * Obtain arguments from the async request structure.
1507 		 */
1508 		if (args->a_io == NFS4_READ_AHEAD && mi->mi_max_threads > 0) {
1509 			(*args->a_nfs4_readahead)(args->a_vp,
1510 			    args->a_nfs4_blkoff, args->a_nfs4_addr,
1511 			    args->a_nfs4_seg, args->a_cred);
1512 		} else if (args->a_io == NFS4_PUTAPAGE) {
1513 			(void) (*args->a_nfs4_putapage)(args->a_vp,
1514 			    args->a_nfs4_pp, args->a_nfs4_off,
1515 			    args->a_nfs4_len, args->a_nfs4_flags,
1516 			    args->a_cred);
1517 		} else if (args->a_io == NFS4_PAGEIO) {
1518 			(void) (*args->a_nfs4_pageio)(args->a_vp,
1519 			    args->a_nfs4_pp, args->a_nfs4_off,
1520 			    args->a_nfs4_len, args->a_nfs4_flags,
1521 			    args->a_cred);
1522 		} else if (args->a_io == NFS4_READDIR) {
1523 			(void) ((*args->a_nfs4_readdir)(args->a_vp,
1524 			    args->a_nfs4_rdc, args->a_cred));
1525 		} else if (args->a_io == NFS4_COMMIT) {
1526 			(*args->a_nfs4_commit)(args->a_vp, args->a_nfs4_plist,
1527 			    args->a_nfs4_offset, args->a_nfs4_count,
1528 			    args->a_cred);
1529 		} else if (args->a_io == NFS4_INACTIVE) {
1530 			nfs4_inactive_otw(args->a_vp, args->a_cred);
1531 		}
1532 
1533 		/*
1534 		 * Now, release the vnode and free the credentials
1535 		 * structure.
1536 		 */
1537 		free_async_args4(args);
1538 		/*
1539 		 * Reacquire the mutex because it will be needed above.
1540 		 */
1541 		mutex_enter(&mi->mi_async_lock);
1542 	}
1543 }
1544 
1545 /*
1546  * nfs4_inactive_thread - look for vnodes that need over-the-wire calls as
1547  * part of VOP_INACTIVE.
1548  */
1549 
1550 void
1551 nfs4_inactive_thread(mntinfo4_t *mi)
1552 {
1553 	struct nfs4_async_reqs *args;
1554 	callb_cpr_t cprinfo;
1555 	vfs_t *vfsp = mi->mi_vfsp;
1556 
1557 	CALLB_CPR_INIT(&cprinfo, &mi->mi_async_lock, callb_generic_cpr,
1558 	    "nfs4_inactive_thread");
1559 
1560 	for (;;) {
1561 		mutex_enter(&mi->mi_async_lock);
1562 		args = mi->mi_async_reqs[NFS4_INACTIVE];
1563 		if (args == NULL) {
1564 			mutex_enter(&mi->mi_lock);
1565 			/*
1566 			 * We don't want to exit until the async manager is done
1567 			 * with its work; hence the check for mi_manager_thread
1568 			 * being NULL.
1569 			 *
1570 			 * The async manager thread will cv_broadcast() on
1571 			 * mi_inact_req_cv when it's done, at which point we'll
1572 			 * wake up and exit.
1573 			 */
1574 			if (mi->mi_manager_thread == NULL)
1575 				goto die;
1576 			mi->mi_flags |= MI4_INACTIVE_IDLE;
1577 			mutex_exit(&mi->mi_lock);
1578 			cv_signal(&mi->mi_async_cv);
1579 			CALLB_CPR_SAFE_BEGIN(&cprinfo);
1580 			cv_wait(&mi->mi_inact_req_cv, &mi->mi_async_lock);
1581 			CALLB_CPR_SAFE_END(&cprinfo, &mi->mi_async_lock);
1582 			mutex_exit(&mi->mi_async_lock);
1583 		} else {
1584 			mutex_enter(&mi->mi_lock);
1585 			mi->mi_flags &= ~MI4_INACTIVE_IDLE;
1586 			mutex_exit(&mi->mi_lock);
1587 			mi->mi_async_reqs[NFS4_INACTIVE] = args->a_next;
1588 			mutex_exit(&mi->mi_async_lock);
1589 			nfs4_inactive_otw(args->a_vp, args->a_cred);
1590 			crfree(args->a_cred);
1591 			kmem_free(args, sizeof (*args));
1592 		}
1593 	}
1594 die:
1595 	mutex_exit(&mi->mi_lock);
1596 	mi->mi_inactive_thread = NULL;
1597 	cv_signal(&mi->mi_async_cv);
1598 
1599 	/*
1600 	 * There is no explicit call to mutex_exit(&mi->mi_async_lock) since
1601 	 * CALLB_CPR_EXIT is actually responsible for releasing 'mi_async_lock'.
1602 	 */
1603 	CALLB_CPR_EXIT(&cprinfo);
1604 
1605 	NFS4_DEBUG(nfs4_client_zone_debug, (CE_NOTE,
1606 	    "nfs4_inactive_thread exiting for vfs %p\n", (void *)vfsp));
1607 
1608 	MI4_RELE(mi);
1609 	zthread_exit();
1610 	/* NOTREACHED */
1611 }
1612 
1613 /*
1614  * nfs_async_stop:
1615  * Wait for all outstanding putpage operations and the inactive thread to
1616  * complete; nfs4_async_stop_sig() without interruptibility.
1617  */
1618 void
1619 nfs4_async_stop(struct vfs *vfsp)
1620 {
1621 	mntinfo4_t *mi = VFTOMI4(vfsp);
1622 
1623 	/*
1624 	 * Wait for all outstanding async operations to complete and for
1625 	 * worker threads to exit.
1626 	 */
1627 	mutex_enter(&mi->mi_async_lock);
1628 	mi->mi_max_threads = 0;
1629 	NFS4_WAKEALL_ASYNC_WORKERS(mi->mi_async_work_cv);
1630 	while (mi->mi_threads[NFS4_ASYNC_QUEUE] != 0 ||
1631 	    mi->mi_threads[NFS4_ASYNC_PGOPS_QUEUE] != 0)
1632 		cv_wait(&mi->mi_async_cv, &mi->mi_async_lock);
1633 
1634 	/*
1635 	 * Wait for the inactive thread to finish doing what it's doing.  It
1636 	 * won't exit until the last reference to the vfs_t goes away.
1637 	 */
1638 	if (mi->mi_inactive_thread != NULL) {
1639 		mutex_enter(&mi->mi_lock);
1640 		while (!(mi->mi_flags & MI4_INACTIVE_IDLE) ||
1641 		    (mi->mi_async_reqs[NFS4_INACTIVE] != NULL)) {
1642 			mutex_exit(&mi->mi_lock);
1643 			cv_wait(&mi->mi_async_cv, &mi->mi_async_lock);
1644 			mutex_enter(&mi->mi_lock);
1645 		}
1646 		mutex_exit(&mi->mi_lock);
1647 	}
1648 	mutex_exit(&mi->mi_async_lock);
1649 }
1650 
1651 /*
1652  * nfs_async_stop_sig:
1653  * Wait for all outstanding putpage operations and the inactive thread to
1654  * complete. If a signal is delivered we will abort and return non-zero;
1655  * otherwise return 0. Since this routine is called from nfs4_unmount, we
1656  * need to make it interruptible.
1657  */
1658 int
1659 nfs4_async_stop_sig(struct vfs *vfsp)
1660 {
1661 	mntinfo4_t *mi = VFTOMI4(vfsp);
1662 	ushort_t omax;
1663 	bool_t intr = FALSE;
1664 
1665 	/*
1666 	 * Wait for all outstanding putpage operations to complete and for
1667 	 * worker threads to exit.
1668 	 */
1669 	mutex_enter(&mi->mi_async_lock);
1670 	omax = mi->mi_max_threads;
1671 	mi->mi_max_threads = 0;
1672 	NFS4_WAKEALL_ASYNC_WORKERS(mi->mi_async_work_cv);
1673 	while (mi->mi_threads[NFS4_ASYNC_QUEUE] != 0 ||
1674 	    mi->mi_threads[NFS4_ASYNC_PGOPS_QUEUE] != 0) {
1675 		if (!cv_wait_sig(&mi->mi_async_cv, &mi->mi_async_lock)) {
1676 			intr = TRUE;
1677 			goto interrupted;
1678 		}
1679 	}
1680 
1681 	/*
1682 	 * Wait for the inactive thread to finish doing what it's doing.  It
1683 	 * won't exit until the a last reference to the vfs_t goes away.
1684 	 */
1685 	if (mi->mi_inactive_thread != NULL) {
1686 		mutex_enter(&mi->mi_lock);
1687 		while (!(mi->mi_flags & MI4_INACTIVE_IDLE) ||
1688 		    (mi->mi_async_reqs[NFS4_INACTIVE] != NULL)) {
1689 			mutex_exit(&mi->mi_lock);
1690 			if (!cv_wait_sig(&mi->mi_async_cv,
1691 			    &mi->mi_async_lock)) {
1692 				intr = TRUE;
1693 				goto interrupted;
1694 			}
1695 			mutex_enter(&mi->mi_lock);
1696 		}
1697 		mutex_exit(&mi->mi_lock);
1698 	}
1699 interrupted:
1700 	if (intr)
1701 		mi->mi_max_threads = omax;
1702 	mutex_exit(&mi->mi_async_lock);
1703 
1704 	return (intr);
1705 }
1706 
1707 int
1708 nfs4_async_putapage(vnode_t *vp, page_t *pp, u_offset_t off, size_t len,
1709     int flags, cred_t *cr, int (*putapage)(vnode_t *, page_t *,
1710     u_offset_t, size_t, int, cred_t *))
1711 {
1712 	rnode4_t *rp;
1713 	mntinfo4_t *mi;
1714 	struct nfs4_async_reqs *args;
1715 
1716 	ASSERT(flags & B_ASYNC);
1717 	ASSERT(vp->v_vfsp != NULL);
1718 
1719 	rp = VTOR4(vp);
1720 	ASSERT(rp->r_count > 0);
1721 
1722 	mi = VTOMI4(vp);
1723 
1724 	/*
1725 	 * If we can't allocate a request structure, do the putpage
1726 	 * operation synchronously in this thread's context.
1727 	 */
1728 	if ((args = kmem_alloc(sizeof (*args), KM_NOSLEEP)) == NULL)
1729 		goto noasync;
1730 
1731 	args->a_next = NULL;
1732 #ifdef DEBUG
1733 	args->a_queuer = curthread;
1734 #endif
1735 	VN_HOLD(vp);
1736 	args->a_vp = vp;
1737 	ASSERT(cr != NULL);
1738 	crhold(cr);
1739 	args->a_cred = cr;
1740 	args->a_io = NFS4_PUTAPAGE;
1741 	args->a_nfs4_putapage = putapage;
1742 	args->a_nfs4_pp = pp;
1743 	args->a_nfs4_off = off;
1744 	args->a_nfs4_len = (uint_t)len;
1745 	args->a_nfs4_flags = flags;
1746 
1747 	mutex_enter(&mi->mi_async_lock);
1748 
1749 	/*
1750 	 * If asyncio has been disabled, then make a synchronous request.
1751 	 * This check is done a second time in case async io was diabled
1752 	 * while this thread was blocked waiting for memory pressure to
1753 	 * reduce or for the queue to drain.
1754 	 */
1755 	if (mi->mi_max_threads == 0) {
1756 		mutex_exit(&mi->mi_async_lock);
1757 
1758 		VN_RELE(vp);
1759 		crfree(cr);
1760 		kmem_free(args, sizeof (*args));
1761 		goto noasync;
1762 	}
1763 
1764 	/*
1765 	 * Link request structure into the async list and
1766 	 * wakeup async thread to do the i/o.
1767 	 */
1768 	if (mi->mi_async_reqs[NFS4_PUTAPAGE] == NULL) {
1769 		mi->mi_async_reqs[NFS4_PUTAPAGE] = args;
1770 		mi->mi_async_tail[NFS4_PUTAPAGE] = args;
1771 	} else {
1772 		mi->mi_async_tail[NFS4_PUTAPAGE]->a_next = args;
1773 		mi->mi_async_tail[NFS4_PUTAPAGE] = args;
1774 	}
1775 
1776 	mutex_enter(&rp->r_statelock);
1777 	rp->r_count++;
1778 	rp->r_awcount++;
1779 	mutex_exit(&rp->r_statelock);
1780 
1781 	if (mi->mi_io_kstats) {
1782 		mutex_enter(&mi->mi_lock);
1783 		kstat_waitq_enter(KSTAT_IO_PTR(mi->mi_io_kstats));
1784 		mutex_exit(&mi->mi_lock);
1785 	}
1786 
1787 	mi->mi_async_req_count++;
1788 	ASSERT(mi->mi_async_req_count != 0);
1789 	cv_signal(&mi->mi_async_reqs_cv);
1790 	mutex_exit(&mi->mi_async_lock);
1791 	return (0);
1792 
1793 noasync:
1794 
1795 	if (curproc == proc_pageout || curproc == proc_fsflush) {
1796 		/*
1797 		 * If we get here in the context of the pageout/fsflush,
1798 		 * or we have run out of memory or we're attempting to
1799 		 * unmount we refuse to do a sync write, because this may
1800 		 * hang pageout/fsflush and the machine. In this case,
1801 		 * we just re-mark the page as dirty and punt on the page.
1802 		 *
1803 		 * Make sure B_FORCE isn't set.  We can re-mark the
1804 		 * pages as dirty and unlock the pages in one swoop by
1805 		 * passing in B_ERROR to pvn_write_done().  However,
1806 		 * we should make sure B_FORCE isn't set - we don't
1807 		 * want the page tossed before it gets written out.
1808 		 */
1809 		if (flags & B_FORCE)
1810 			flags &= ~(B_INVAL | B_FORCE);
1811 		pvn_write_done(pp, flags | B_ERROR);
1812 		return (0);
1813 	}
1814 
1815 	if (nfs_zone() != mi->mi_zone) {
1816 		/*
1817 		 * So this was a cross-zone sync putpage.
1818 		 *
1819 		 * We pass in B_ERROR to pvn_write_done() to re-mark the pages
1820 		 * as dirty and unlock them.
1821 		 *
1822 		 * We don't want to clear B_FORCE here as the caller presumably
1823 		 * knows what they're doing if they set it.
1824 		 */
1825 		pvn_write_done(pp, flags | B_ERROR);
1826 		return (EPERM);
1827 	}
1828 	return ((*putapage)(vp, pp, off, len, flags, cr));
1829 }
1830 
1831 int
1832 nfs4_async_pageio(vnode_t *vp, page_t *pp, u_offset_t io_off, size_t io_len,
1833     int flags, cred_t *cr, int (*pageio)(vnode_t *, page_t *, u_offset_t,
1834     size_t, int, cred_t *))
1835 {
1836 	rnode4_t *rp;
1837 	mntinfo4_t *mi;
1838 	struct nfs4_async_reqs *args;
1839 
1840 	ASSERT(flags & B_ASYNC);
1841 	ASSERT(vp->v_vfsp != NULL);
1842 
1843 	rp = VTOR4(vp);
1844 	ASSERT(rp->r_count > 0);
1845 
1846 	mi = VTOMI4(vp);
1847 
1848 	/*
1849 	 * If we can't allocate a request structure, do the pageio
1850 	 * request synchronously in this thread's context.
1851 	 */
1852 	if ((args = kmem_alloc(sizeof (*args), KM_NOSLEEP)) == NULL)
1853 		goto noasync;
1854 
1855 	args->a_next = NULL;
1856 #ifdef DEBUG
1857 	args->a_queuer = curthread;
1858 #endif
1859 	VN_HOLD(vp);
1860 	args->a_vp = vp;
1861 	ASSERT(cr != NULL);
1862 	crhold(cr);
1863 	args->a_cred = cr;
1864 	args->a_io = NFS4_PAGEIO;
1865 	args->a_nfs4_pageio = pageio;
1866 	args->a_nfs4_pp = pp;
1867 	args->a_nfs4_off = io_off;
1868 	args->a_nfs4_len = (uint_t)io_len;
1869 	args->a_nfs4_flags = flags;
1870 
1871 	mutex_enter(&mi->mi_async_lock);
1872 
1873 	/*
1874 	 * If asyncio has been disabled, then make a synchronous request.
1875 	 * This check is done a second time in case async io was diabled
1876 	 * while this thread was blocked waiting for memory pressure to
1877 	 * reduce or for the queue to drain.
1878 	 */
1879 	if (mi->mi_max_threads == 0) {
1880 		mutex_exit(&mi->mi_async_lock);
1881 
1882 		VN_RELE(vp);
1883 		crfree(cr);
1884 		kmem_free(args, sizeof (*args));
1885 		goto noasync;
1886 	}
1887 
1888 	/*
1889 	 * Link request structure into the async list and
1890 	 * wakeup async thread to do the i/o.
1891 	 */
1892 	if (mi->mi_async_reqs[NFS4_PAGEIO] == NULL) {
1893 		mi->mi_async_reqs[NFS4_PAGEIO] = args;
1894 		mi->mi_async_tail[NFS4_PAGEIO] = args;
1895 	} else {
1896 		mi->mi_async_tail[NFS4_PAGEIO]->a_next = args;
1897 		mi->mi_async_tail[NFS4_PAGEIO] = args;
1898 	}
1899 
1900 	mutex_enter(&rp->r_statelock);
1901 	rp->r_count++;
1902 	rp->r_awcount++;
1903 	mutex_exit(&rp->r_statelock);
1904 
1905 	if (mi->mi_io_kstats) {
1906 		mutex_enter(&mi->mi_lock);
1907 		kstat_waitq_enter(KSTAT_IO_PTR(mi->mi_io_kstats));
1908 		mutex_exit(&mi->mi_lock);
1909 	}
1910 
1911 	mi->mi_async_req_count++;
1912 	ASSERT(mi->mi_async_req_count != 0);
1913 	cv_signal(&mi->mi_async_reqs_cv);
1914 	mutex_exit(&mi->mi_async_lock);
1915 	return (0);
1916 
1917 noasync:
1918 	/*
1919 	 * If we can't do it ASYNC, for reads we do nothing (but cleanup
1920 	 * the page list), for writes we do it synchronously, except for
1921 	 * proc_pageout/proc_fsflush as described below.
1922 	 */
1923 	if (flags & B_READ) {
1924 		pvn_read_done(pp, flags | B_ERROR);
1925 		return (0);
1926 	}
1927 
1928 	if (curproc == proc_pageout || curproc == proc_fsflush) {
1929 		/*
1930 		 * If we get here in the context of the pageout/fsflush,
1931 		 * we refuse to do a sync write, because this may hang
1932 		 * pageout/fsflush (and the machine). In this case, we just
1933 		 * re-mark the page as dirty and punt on the page.
1934 		 *
1935 		 * Make sure B_FORCE isn't set.  We can re-mark the
1936 		 * pages as dirty and unlock the pages in one swoop by
1937 		 * passing in B_ERROR to pvn_write_done().  However,
1938 		 * we should make sure B_FORCE isn't set - we don't
1939 		 * want the page tossed before it gets written out.
1940 		 */
1941 		if (flags & B_FORCE)
1942 			flags &= ~(B_INVAL | B_FORCE);
1943 		pvn_write_done(pp, flags | B_ERROR);
1944 		return (0);
1945 	}
1946 
1947 	if (nfs_zone() != mi->mi_zone) {
1948 		/*
1949 		 * So this was a cross-zone sync pageio.  We pass in B_ERROR
1950 		 * to pvn_write_done() to re-mark the pages as dirty and unlock
1951 		 * them.
1952 		 *
1953 		 * We don't want to clear B_FORCE here as the caller presumably
1954 		 * knows what they're doing if they set it.
1955 		 */
1956 		pvn_write_done(pp, flags | B_ERROR);
1957 		return (EPERM);
1958 	}
1959 	return ((*pageio)(vp, pp, io_off, io_len, flags, cr));
1960 }
1961 
1962 void
1963 nfs4_async_readdir(vnode_t *vp, rddir4_cache *rdc, cred_t *cr,
1964     int (*readdir)(vnode_t *, rddir4_cache *, cred_t *))
1965 {
1966 	rnode4_t *rp;
1967 	mntinfo4_t *mi;
1968 	struct nfs4_async_reqs *args;
1969 
1970 	rp = VTOR4(vp);
1971 	ASSERT(rp->r_freef == NULL);
1972 
1973 	mi = VTOMI4(vp);
1974 
1975 	/*
1976 	 * If we can't allocate a request structure, skip the readdir.
1977 	 */
1978 	if ((args = kmem_alloc(sizeof (*args), KM_NOSLEEP)) == NULL)
1979 		goto noasync;
1980 
1981 	args->a_next = NULL;
1982 #ifdef DEBUG
1983 	args->a_queuer = curthread;
1984 #endif
1985 	VN_HOLD(vp);
1986 	args->a_vp = vp;
1987 	ASSERT(cr != NULL);
1988 	crhold(cr);
1989 	args->a_cred = cr;
1990 	args->a_io = NFS4_READDIR;
1991 	args->a_nfs4_readdir = readdir;
1992 	args->a_nfs4_rdc = rdc;
1993 
1994 	mutex_enter(&mi->mi_async_lock);
1995 
1996 	/*
1997 	 * If asyncio has been disabled, then skip this request
1998 	 */
1999 	if (mi->mi_max_threads == 0) {
2000 		mutex_exit(&mi->mi_async_lock);
2001 
2002 		VN_RELE(vp);
2003 		crfree(cr);
2004 		kmem_free(args, sizeof (*args));
2005 		goto noasync;
2006 	}
2007 
2008 	/*
2009 	 * Link request structure into the async list and
2010 	 * wakeup async thread to do the i/o.
2011 	 */
2012 	if (mi->mi_async_reqs[NFS4_READDIR] == NULL) {
2013 		mi->mi_async_reqs[NFS4_READDIR] = args;
2014 		mi->mi_async_tail[NFS4_READDIR] = args;
2015 	} else {
2016 		mi->mi_async_tail[NFS4_READDIR]->a_next = args;
2017 		mi->mi_async_tail[NFS4_READDIR] = args;
2018 	}
2019 
2020 	mutex_enter(&rp->r_statelock);
2021 	rp->r_count++;
2022 	mutex_exit(&rp->r_statelock);
2023 
2024 	if (mi->mi_io_kstats) {
2025 		mutex_enter(&mi->mi_lock);
2026 		kstat_waitq_enter(KSTAT_IO_PTR(mi->mi_io_kstats));
2027 		mutex_exit(&mi->mi_lock);
2028 	}
2029 
2030 	mi->mi_async_req_count++;
2031 	ASSERT(mi->mi_async_req_count != 0);
2032 	cv_signal(&mi->mi_async_reqs_cv);
2033 	mutex_exit(&mi->mi_async_lock);
2034 	return;
2035 
2036 noasync:
2037 	mutex_enter(&rp->r_statelock);
2038 	rdc->entries = NULL;
2039 	/*
2040 	 * Indicate that no one is trying to fill this entry and
2041 	 * it still needs to be filled.
2042 	 */
2043 	rdc->flags &= ~RDDIR;
2044 	rdc->flags |= RDDIRREQ;
2045 	rddir4_cache_rele(rp, rdc);
2046 	mutex_exit(&rp->r_statelock);
2047 }
2048 
2049 void
2050 nfs4_async_commit(vnode_t *vp, page_t *plist, offset3 offset, count3 count,
2051     cred_t *cr, void (*commit)(vnode_t *, page_t *, offset3, count3,
2052     cred_t *))
2053 {
2054 	rnode4_t *rp;
2055 	mntinfo4_t *mi;
2056 	struct nfs4_async_reqs *args;
2057 	page_t *pp;
2058 
2059 	rp = VTOR4(vp);
2060 	mi = VTOMI4(vp);
2061 
2062 	/*
2063 	 * If we can't allocate a request structure, do the commit
2064 	 * operation synchronously in this thread's context.
2065 	 */
2066 	if ((args = kmem_alloc(sizeof (*args), KM_NOSLEEP)) == NULL)
2067 		goto noasync;
2068 
2069 	args->a_next = NULL;
2070 #ifdef DEBUG
2071 	args->a_queuer = curthread;
2072 #endif
2073 	VN_HOLD(vp);
2074 	args->a_vp = vp;
2075 	ASSERT(cr != NULL);
2076 	crhold(cr);
2077 	args->a_cred = cr;
2078 	args->a_io = NFS4_COMMIT;
2079 	args->a_nfs4_commit = commit;
2080 	args->a_nfs4_plist = plist;
2081 	args->a_nfs4_offset = offset;
2082 	args->a_nfs4_count = count;
2083 
2084 	mutex_enter(&mi->mi_async_lock);
2085 
2086 	/*
2087 	 * If asyncio has been disabled, then make a synchronous request.
2088 	 * This check is done a second time in case async io was diabled
2089 	 * while this thread was blocked waiting for memory pressure to
2090 	 * reduce or for the queue to drain.
2091 	 */
2092 	if (mi->mi_max_threads == 0) {
2093 		mutex_exit(&mi->mi_async_lock);
2094 
2095 		VN_RELE(vp);
2096 		crfree(cr);
2097 		kmem_free(args, sizeof (*args));
2098 		goto noasync;
2099 	}
2100 
2101 	/*
2102 	 * Link request structure into the async list and
2103 	 * wakeup async thread to do the i/o.
2104 	 */
2105 	if (mi->mi_async_reqs[NFS4_COMMIT] == NULL) {
2106 		mi->mi_async_reqs[NFS4_COMMIT] = args;
2107 		mi->mi_async_tail[NFS4_COMMIT] = args;
2108 	} else {
2109 		mi->mi_async_tail[NFS4_COMMIT]->a_next = args;
2110 		mi->mi_async_tail[NFS4_COMMIT] = args;
2111 	}
2112 
2113 	mutex_enter(&rp->r_statelock);
2114 	rp->r_count++;
2115 	mutex_exit(&rp->r_statelock);
2116 
2117 	if (mi->mi_io_kstats) {
2118 		mutex_enter(&mi->mi_lock);
2119 		kstat_waitq_enter(KSTAT_IO_PTR(mi->mi_io_kstats));
2120 		mutex_exit(&mi->mi_lock);
2121 	}
2122 
2123 	mi->mi_async_req_count++;
2124 	ASSERT(mi->mi_async_req_count != 0);
2125 	cv_signal(&mi->mi_async_reqs_cv);
2126 	mutex_exit(&mi->mi_async_lock);
2127 	return;
2128 
2129 noasync:
2130 	if (curproc == proc_pageout || curproc == proc_fsflush ||
2131 	    nfs_zone() != mi->mi_zone) {
2132 		while (plist != NULL) {
2133 			pp = plist;
2134 			page_sub(&plist, pp);
2135 			pp->p_fsdata = C_COMMIT;
2136 			page_unlock(pp);
2137 		}
2138 		return;
2139 	}
2140 	(*commit)(vp, plist, offset, count, cr);
2141 }
2142 
2143 /*
2144  * nfs4_async_inactive - hand off a VOP_INACTIVE call to a thread.  The
2145  * reference to the vnode is handed over to the thread; the caller should
2146  * no longer refer to the vnode.
2147  *
2148  * Unlike most of the async routines, this handoff is needed for
2149  * correctness reasons, not just performance.  So doing operations in the
2150  * context of the current thread is not an option.
2151  */
2152 void
2153 nfs4_async_inactive(vnode_t *vp, cred_t *cr)
2154 {
2155 	mntinfo4_t *mi;
2156 	struct nfs4_async_reqs *args;
2157 	boolean_t signal_inactive_thread = B_FALSE;
2158 
2159 	mi = VTOMI4(vp);
2160 
2161 	args = kmem_alloc(sizeof (*args), KM_SLEEP);
2162 	args->a_next = NULL;
2163 #ifdef DEBUG
2164 	args->a_queuer = curthread;
2165 #endif
2166 	args->a_vp = vp;
2167 	ASSERT(cr != NULL);
2168 	crhold(cr);
2169 	args->a_cred = cr;
2170 	args->a_io = NFS4_INACTIVE;
2171 
2172 	/*
2173 	 * Note that we don't check mi->mi_max_threads here, since we
2174 	 * *need* to get rid of this vnode regardless of whether someone
2175 	 * set nfs4_max_threads to zero in /etc/system.
2176 	 *
2177 	 * The manager thread knows about this and is willing to create
2178 	 * at least one thread to accommodate us.
2179 	 */
2180 	mutex_enter(&mi->mi_async_lock);
2181 	if (mi->mi_inactive_thread == NULL) {
2182 		rnode4_t *rp;
2183 		vnode_t *unldvp = NULL;
2184 		char *unlname;
2185 		cred_t *unlcred;
2186 
2187 		mutex_exit(&mi->mi_async_lock);
2188 		/*
2189 		 * We just need to free up the memory associated with the
2190 		 * vnode, which can be safely done from within the current
2191 		 * context.
2192 		 */
2193 		crfree(cr);	/* drop our reference */
2194 		kmem_free(args, sizeof (*args));
2195 		rp = VTOR4(vp);
2196 		mutex_enter(&rp->r_statelock);
2197 		if (rp->r_unldvp != NULL) {
2198 			unldvp = rp->r_unldvp;
2199 			rp->r_unldvp = NULL;
2200 			unlname = rp->r_unlname;
2201 			rp->r_unlname = NULL;
2202 			unlcred = rp->r_unlcred;
2203 			rp->r_unlcred = NULL;
2204 		}
2205 		mutex_exit(&rp->r_statelock);
2206 		/*
2207 		 * No need to explicitly throw away any cached pages.  The
2208 		 * eventual r4inactive() will attempt a synchronous
2209 		 * VOP_PUTPAGE() which will immediately fail since the request
2210 		 * is coming from the wrong zone, and then will proceed to call
2211 		 * nfs4_invalidate_pages() which will clean things up for us.
2212 		 *
2213 		 * Throw away the delegation here so rp4_addfree()'s attempt to
2214 		 * return any existing delegations becomes a no-op.
2215 		 */
2216 		if (rp->r_deleg_type != OPEN_DELEGATE_NONE) {
2217 			(void) nfs_rw_enter_sig(&mi->mi_recovlock, RW_READER,
2218 			    FALSE);
2219 			(void) nfs4delegreturn(rp, NFS4_DR_DISCARD);
2220 			nfs_rw_exit(&mi->mi_recovlock);
2221 		}
2222 		nfs4_clear_open_streams(rp);
2223 
2224 		rp4_addfree(rp, cr);
2225 		if (unldvp != NULL) {
2226 			kmem_free(unlname, MAXNAMELEN);
2227 			VN_RELE(unldvp);
2228 			crfree(unlcred);
2229 		}
2230 		return;
2231 	}
2232 
2233 	if (mi->mi_manager_thread == NULL) {
2234 		/*
2235 		 * We want to talk to the inactive thread.
2236 		 */
2237 		signal_inactive_thread = B_TRUE;
2238 	}
2239 
2240 	/*
2241 	 * Enqueue the vnode and wake up either the special thread (empty
2242 	 * list) or an async thread.
2243 	 */
2244 	if (mi->mi_async_reqs[NFS4_INACTIVE] == NULL) {
2245 		mi->mi_async_reqs[NFS4_INACTIVE] = args;
2246 		mi->mi_async_tail[NFS4_INACTIVE] = args;
2247 		signal_inactive_thread = B_TRUE;
2248 	} else {
2249 		mi->mi_async_tail[NFS4_INACTIVE]->a_next = args;
2250 		mi->mi_async_tail[NFS4_INACTIVE] = args;
2251 	}
2252 	if (signal_inactive_thread) {
2253 		cv_signal(&mi->mi_inact_req_cv);
2254 	} else  {
2255 		mi->mi_async_req_count++;
2256 		ASSERT(mi->mi_async_req_count != 0);
2257 		cv_signal(&mi->mi_async_reqs_cv);
2258 	}
2259 
2260 	mutex_exit(&mi->mi_async_lock);
2261 }
2262 
/*
 * Copy up to tcount bytes of write data from uio into the cached pages
 * of the file behind rp, at most one page per loop iteration.
 *
 *	rp		rnode being written; r_rwlock must be held as writer
 *	base		kernel address the data is copied to when vpm_enable
 *			is zero (advanced as data is copied)
 *	tcount		number of bytes to move; at most MAXBSIZE and at most
 *			uio->uio_resid
 *	uio		source of the data; uio_loffset gives the file offset
 *	pgcreated	nonzero if the caller already created the first page
 *
 * For every chunk the routine decides whether the page can simply be
 * created instead of being read in, flags the rnode R4MODINPROGRESS
 * around the copy, grows r_size to cover the bytes written, and sets
 * R4DIRTY.
 *
 * Returns 0 on success; otherwise the error from the copy routine or,
 * if the copy succeeded, the error from segmap_fault().
 */
int
writerp4(rnode4_t *rp, caddr_t base, int tcount, struct uio *uio, int pgcreated)
{
	int pagecreate;
	int n;
	int saved_n;
	caddr_t saved_base;
	u_offset_t offset;
	int error;
	int sm_error;
	vnode_t *vp = RTOV(rp);

	ASSERT(tcount <= MAXBSIZE && tcount <= uio->uio_resid);
	ASSERT(nfs_rw_lock_held(&rp->r_rwlock, RW_WRITER));
	if (!vpm_enable) {
		ASSERT(((uintptr_t)base & MAXBOFFSET) + tcount <= MAXBSIZE);
	}

	/*
	 * Move bytes in at most PAGESIZE chunks. We must avoid
	 * spanning pages in uiomove() because page faults may cause
	 * the cache to be invalidated out from under us. The r_size is not
	 * updated until after the uiomove. If we push the last page of a
	 * file before r_size is correct, we will lose the data written past
	 * the current (and invalid) r_size.
	 */
	do {
		offset = uio->uio_loffset;
		pagecreate = 0;

		/*
		 * n is the number of bytes required to satisfy the request
		 *   or the number of bytes to fill out the page.
		 */
		n = (int)MIN((PAGESIZE - (offset & PAGEOFFSET)), tcount);

		/*
		 * Check to see if we can skip reading in the page
		 * and just allocate the memory.  We can do this
		 * if we are going to rewrite the entire mapping
		 * or if we are going to write to or beyond the current
		 * end of file from the beginning of the mapping.
		 *
		 * The read of r_size is now protected by r_statelock.
		 */
		mutex_enter(&rp->r_statelock);
		/*
		 * When pgcreated is nonzero the caller has already done
		 * a segmap_getmapflt with forcefault 0 and S_WRITE. With
		 * segkpm this means we already have at least one page
		 * created and mapped at base.
		 */
		pagecreate = pgcreated ||
		    ((offset & PAGEOFFSET) == 0 &&
		    (n == PAGESIZE || ((offset + n) >= rp->r_size)));

		mutex_exit(&rp->r_statelock);

		if (!vpm_enable && pagecreate) {
			/*
			 * The last argument tells segmap_pagecreate() to
			 * always lock the page, as opposed to sometimes
			 * returning with the page locked. This way we avoid a
			 * fault on the ensuing uiomove(), but also
			 * more importantly (to fix bug 1094402) we can
			 * call segmap_fault() to unlock the page in all
			 * cases. An alternative would be to modify
			 * segmap_pagecreate() to tell us when it is
			 * locking a page, but that's a fairly major
			 * interface change.
			 */
			if (pgcreated == 0)
				(void) segmap_pagecreate(segkmap, base,
				    (uint_t)n, 1);
			/*
			 * Remember where this chunk started and how long it
			 * was requested to be; base and n are both modified
			 * below, and the unlock needs the original values.
			 */
			saved_base = base;
			saved_n = n;
		}

		/*
		 * The number of bytes of data in the last page cannot
		 * be accurately determined while the page is being
		 * uiomove'd to and the size of the file is being updated.
		 * Thus, inform threads which need to know accurately
		 * how much data is in the last page of the file.  They
		 * will not do the i/o immediately, but will arrange for
		 * the i/o to happen later when this modify operation
		 * will have finished.
		 */
		ASSERT(!(rp->r_flags & R4MODINPROGRESS));
		mutex_enter(&rp->r_statelock);
		rp->r_flags |= R4MODINPROGRESS;
		rp->r_modaddr = (offset & MAXBMASK);
		mutex_exit(&rp->r_statelock);

		if (vpm_enable) {
			/*
			 * Copy data. If new pages are created, part of
			 * the page that is not written will be initialized
			 * with zeros.
			 */
			error = vpm_data_copy(vp, offset, n, uio,
			    !pagecreate, NULL, 0, S_WRITE);
		} else {
			error = uiomove(base, n, UIO_WRITE, uio);
		}

		/*
		 * r_size is the maximum number of
		 * bytes known to be in the file.
		 * Make sure it is at least as high as the
		 * first unwritten byte pointed to by uio_loffset.
		 */
		mutex_enter(&rp->r_statelock);
		if (rp->r_size < uio->uio_loffset)
			rp->r_size = uio->uio_loffset;
		rp->r_flags &= ~R4MODINPROGRESS;
		rp->r_flags |= R4DIRTY;
		mutex_exit(&rp->r_statelock);

		/* n = # of bytes written */
		n = (int)(uio->uio_loffset - offset);

		if (!vpm_enable) {
			base += n;
		}

		tcount -= n;
		/*
		 * If we created pages w/o initializing them completely,
		 * we need to zero the part that wasn't set up.
		 * This happens in most EOF write cases and if
		 * we had some sort of error during the uiomove.
		 */
		if (!vpm_enable && pagecreate) {
			if ((uio->uio_loffset & PAGEOFFSET) || n == 0)
				(void) kzero(base, PAGESIZE - n);

			if (pgcreated) {
				/*
				 * Caller is responsible for this page,
				 * it was not created in this loop.
				 */
				pgcreated = 0;
			} else {
				/*
				 * For bug 1094402: segmap_pagecreate locks
				 * page. Unlock it. This also unlocks the
				 * pages allocated by page_create_va() in
				 * segmap_pagecreate().
				 */
				sm_error = segmap_fault(kas.a_hat, segkmap,
				    saved_base, saved_n,
				    F_SOFTUNLOCK, S_WRITE);
				/* Report the unlock failure only if the copy succeeded. */
				if (error == 0)
					error = sm_error;
			}
		}
	} while (tcount > 0 && error == 0);

	return (error);
}
2424 
2425 int
2426 nfs4_putpages(vnode_t *vp, u_offset_t off, size_t len, int flags, cred_t *cr)
2427 {
2428 	rnode4_t *rp;
2429 	page_t *pp;
2430 	u_offset_t eoff;
2431 	u_offset_t io_off;
2432 	size_t io_len;
2433 	int error;
2434 	int rdirty;
2435 	int err;
2436 
2437 	rp = VTOR4(vp);
2438 	ASSERT(rp->r_count > 0);
2439 
2440 	if (!nfs4_has_pages(vp))
2441 		return (0);
2442 
2443 	ASSERT(vp->v_type != VCHR);
2444 
2445 	/*
2446 	 * If R4OUTOFSPACE is set, then all writes turn into B_INVAL
2447 	 * writes.  B_FORCE is set to force the VM system to actually
2448 	 * invalidate the pages, even if the i/o failed.  The pages
2449 	 * need to get invalidated because they can't be written out
2450 	 * because there isn't any space left on either the server's
2451 	 * file system or in the user's disk quota.  The B_FREE bit
2452 	 * is cleared to avoid confusion as to whether this is a
2453 	 * request to place the page on the freelist or to destroy
2454 	 * it.
2455 	 */
2456 	if ((rp->r_flags & R4OUTOFSPACE) ||
2457 	    (vp->v_vfsp->vfs_flag & VFS_UNMOUNTED))
2458 		flags = (flags & ~B_FREE) | B_INVAL | B_FORCE;
2459 
2460 	if (len == 0) {
2461 		/*
2462 		 * If doing a full file synchronous operation, then clear
2463 		 * the R4DIRTY bit.  If a page gets dirtied while the flush
2464 		 * is happening, then R4DIRTY will get set again.  The
2465 		 * R4DIRTY bit must get cleared before the flush so that
2466 		 * we don't lose this information.
2467 		 *
2468 		 * If there are no full file async write operations
2469 		 * pending and RDIRTY bit is set, clear it.
2470 		 */
2471 		if (off == (u_offset_t)0 &&
2472 		    !(flags & B_ASYNC) &&
2473 		    (rp->r_flags & R4DIRTY)) {
2474 			mutex_enter(&rp->r_statelock);
2475 			rdirty = (rp->r_flags & R4DIRTY);
2476 			rp->r_flags &= ~R4DIRTY;
2477 			mutex_exit(&rp->r_statelock);
2478 		} else if (flags & B_ASYNC && off == (u_offset_t)0) {
2479 			mutex_enter(&rp->r_statelock);
2480 			if (rp->r_flags & R4DIRTY && rp->r_awcount == 0) {
2481 				rdirty = (rp->r_flags & R4DIRTY);
2482 				rp->r_flags &= ~R4DIRTY;
2483 			}
2484 			mutex_exit(&rp->r_statelock);
2485 		} else
2486 			rdirty = 0;
2487 
2488 		/*
2489 		 * Search the entire vp list for pages >= off, and flush
2490 		 * the dirty pages.
2491 		 */
2492 		error = pvn_vplist_dirty(vp, off, rp->r_putapage,
2493 		    flags, cr);
2494 
2495 		/*
2496 		 * If an error occurred and the file was marked as dirty
2497 		 * before and we aren't forcibly invalidating pages, then
2498 		 * reset the R4DIRTY flag.
2499 		 */
2500 		if (error && rdirty &&
2501 		    (flags & (B_INVAL | B_FORCE)) != (B_INVAL | B_FORCE)) {
2502 			mutex_enter(&rp->r_statelock);
2503 			rp->r_flags |= R4DIRTY;
2504 			mutex_exit(&rp->r_statelock);
2505 		}
2506 	} else {
2507 		/*
2508 		 * Do a range from [off...off + len) looking for pages
2509 		 * to deal with.
2510 		 */
2511 		error = 0;
2512 		io_len = 0;
2513 		eoff = off + len;
2514 		mutex_enter(&rp->r_statelock);
2515 		for (io_off = off; io_off < eoff && io_off < rp->r_size;
2516 		    io_off += io_len) {
2517 			mutex_exit(&rp->r_statelock);
2518 			/*
2519 			 * If we are not invalidating, synchronously
2520 			 * freeing or writing pages use the routine
2521 			 * page_lookup_nowait() to prevent reclaiming
2522 			 * them from the free list.
2523 			 */
2524 			if ((flags & B_INVAL) || !(flags & B_ASYNC)) {
2525 				pp = page_lookup(vp, io_off,
2526 				    (flags & (B_INVAL | B_FREE)) ?
2527 				    SE_EXCL : SE_SHARED);
2528 			} else {
2529 				pp = page_lookup_nowait(vp, io_off,
2530 				    (flags & B_FREE) ? SE_EXCL : SE_SHARED);
2531 			}
2532 
2533 			if (pp == NULL || !pvn_getdirty(pp, flags))
2534 				io_len = PAGESIZE;
2535 			else {
2536 				err = (*rp->r_putapage)(vp, pp, &io_off,
2537 				    &io_len, flags, cr);
2538 				if (!error)
2539 					error = err;
2540 				/*
2541 				 * "io_off" and "io_len" are returned as
2542 				 * the range of pages we actually wrote.
2543 				 * This allows us to skip ahead more quickly
2544 				 * since several pages may've been dealt
2545 				 * with by this iteration of the loop.
2546 				 */
2547 			}
2548 			mutex_enter(&rp->r_statelock);
2549 		}
2550 		mutex_exit(&rp->r_statelock);
2551 	}
2552 
2553 	return (error);
2554 }
2555 
2556 void
2557 nfs4_invalidate_pages(vnode_t *vp, u_offset_t off, cred_t *cr)
2558 {
2559 	rnode4_t *rp;
2560 
2561 	rp = VTOR4(vp);
2562 	if (IS_SHADOW(vp, rp))
2563 		vp = RTOV4(rp);
2564 	mutex_enter(&rp->r_statelock);
2565 	while (rp->r_flags & R4TRUNCATE)
2566 		cv_wait(&rp->r_cv, &rp->r_statelock);
2567 	rp->r_flags |= R4TRUNCATE;
2568 	if (off == (u_offset_t)0) {
2569 		rp->r_flags &= ~R4DIRTY;
2570 		if (!(rp->r_flags & R4STALE))
2571 			rp->r_error = 0;
2572 	}
2573 	rp->r_truncaddr = off;
2574 	mutex_exit(&rp->r_statelock);
2575 	(void) pvn_vplist_dirty(vp, off, rp->r_putapage,
2576 	    B_INVAL | B_TRUNC, cr);
2577 	mutex_enter(&rp->r_statelock);
2578 	rp->r_flags &= ~R4TRUNCATE;
2579 	cv_broadcast(&rp->r_cv);
2580 	mutex_exit(&rp->r_statelock);
2581 }
2582 
2583 static int
2584 nfs4_mnt_kstat_update(kstat_t *ksp, int rw)
2585 {
2586 	mntinfo4_t *mi;
2587 	struct mntinfo_kstat *mik;
2588 	vfs_t *vfsp;
2589 
2590 	/* this is a read-only kstat. Bail out on a write */
2591 	if (rw == KSTAT_WRITE)
2592 		return (EACCES);
2593 
2594 
2595 	/*
2596 	 * We don't want to wait here as kstat_chain_lock could be held by
2597 	 * dounmount(). dounmount() takes vfs_reflock before the chain lock
2598 	 * and thus could lead to a deadlock.
2599 	 */
2600 	vfsp = (struct vfs *)ksp->ks_private;
2601 
2602 	mi = VFTOMI4(vfsp);
2603 	mik = (struct mntinfo_kstat *)ksp->ks_data;
2604 
2605 	(void) strcpy(mik->mik_proto, mi->mi_curr_serv->sv_knconf->knc_proto);
2606 
2607 	mik->mik_vers = (uint32_t)mi->mi_vers;
2608 	mik->mik_flags = mi->mi_flags;
2609 	/*
2610 	 * The sv_secdata holds the flavor the client specifies.
2611 	 * If the client uses default and a security negotiation
2612 	 * occurs, sv_currsec will point to the current flavor
2613 	 * selected from the server flavor list.
2614 	 * sv_currsec is NULL if no security negotiation takes place.
2615 	 */
2616 	mik->mik_secmod = mi->mi_curr_serv->sv_currsec ?
2617 	    mi->mi_curr_serv->sv_currsec->secmod :
2618 	    mi->mi_curr_serv->sv_secdata->secmod;
2619 	mik->mik_curread = (uint32_t)mi->mi_curread;
2620 	mik->mik_curwrite = (uint32_t)mi->mi_curwrite;
2621 	mik->mik_retrans = mi->mi_retrans;
2622 	mik->mik_timeo = mi->mi_timeo;
2623 	mik->mik_acregmin = HR2SEC(mi->mi_acregmin);
2624 	mik->mik_acregmax = HR2SEC(mi->mi_acregmax);
2625 	mik->mik_acdirmin = HR2SEC(mi->mi_acdirmin);
2626 	mik->mik_acdirmax = HR2SEC(mi->mi_acdirmax);
2627 	mik->mik_noresponse = (uint32_t)mi->mi_noresponse;
2628 	mik->mik_failover = (uint32_t)mi->mi_failover;
2629 	mik->mik_remap = (uint32_t)mi->mi_remap;
2630 
2631 	(void) strcpy(mik->mik_curserver, mi->mi_curr_serv->sv_hostname);
2632 
2633 	return (0);
2634 }
2635 
2636 void
2637 nfs4_mnt_kstat_init(struct vfs *vfsp)
2638 {
2639 	mntinfo4_t *mi = VFTOMI4(vfsp);
2640 
2641 	/*
2642 	 * PSARC 2001/697 Contract Private Interface
2643 	 * All nfs kstats are under SunMC contract
2644 	 * Please refer to the PSARC listed above and contact
2645 	 * SunMC before making any changes!
2646 	 *
2647 	 * Changes must be reviewed by Solaris File Sharing
2648 	 * Changes must be communicated to contract-2001-697@sun.com
2649 	 *
2650 	 */
2651 
2652 	mi->mi_io_kstats = kstat_create_zone("nfs", getminor(vfsp->vfs_dev),
2653 	    NULL, "nfs", KSTAT_TYPE_IO, 1, 0, mi->mi_zone->zone_id);
2654 	if (mi->mi_io_kstats) {
2655 		if (mi->mi_zone->zone_id != GLOBAL_ZONEID)
2656 			kstat_zone_add(mi->mi_io_kstats, GLOBAL_ZONEID);
2657 		mi->mi_io_kstats->ks_lock = &mi->mi_lock;
2658 		kstat_install(mi->mi_io_kstats);
2659 	}
2660 
2661 	if ((mi->mi_ro_kstats = kstat_create_zone("nfs",
2662 	    getminor(vfsp->vfs_dev), "mntinfo", "misc", KSTAT_TYPE_RAW,
2663 	    sizeof (struct mntinfo_kstat), 0, mi->mi_zone->zone_id)) != NULL) {
2664 		if (mi->mi_zone->zone_id != GLOBAL_ZONEID)
2665 			kstat_zone_add(mi->mi_ro_kstats, GLOBAL_ZONEID);
2666 		mi->mi_ro_kstats->ks_update = nfs4_mnt_kstat_update;
2667 		mi->mi_ro_kstats->ks_private = (void *)vfsp;
2668 		kstat_install(mi->mi_ro_kstats);
2669 	}
2670 
2671 	nfs4_mnt_recov_kstat_init(vfsp);
2672 }
2673 
2674 void
2675 nfs4_write_error(vnode_t *vp, int error, cred_t *cr)
2676 {
2677 	mntinfo4_t *mi;
2678 	clock_t now = ddi_get_lbolt();
2679 
2680 	mi = VTOMI4(vp);
2681 	/*
2682 	 * In case of forced unmount, do not print any messages
2683 	 * since it can flood the console with error messages.
2684 	 */
2685 	if (mi->mi_vfsp->vfs_flag & VFS_UNMOUNTED)
2686 		return;
2687 
2688 	/*
2689 	 * If the mount point is dead, not recoverable, do not
2690 	 * print error messages that can flood the console.
2691 	 */
2692 	if (mi->mi_flags & MI4_RECOV_FAIL)
2693 		return;
2694 
2695 	/*
2696 	 * No use in flooding the console with ENOSPC
2697 	 * messages from the same file system.
2698 	 */
2699 	if ((error != ENOSPC && error != EDQUOT) ||
2700 	    now - mi->mi_printftime > 0) {
2701 		zoneid_t zoneid = mi->mi_zone->zone_id;
2702 
2703 #ifdef DEBUG
2704 		nfs_perror(error, "NFS%ld write error on host %s: %m.\n",
2705 		    mi->mi_vers, VTOR4(vp)->r_server->sv_hostname, NULL);
2706 #else
2707 		nfs_perror(error, "NFS write error on host %s: %m.\n",
2708 		    VTOR4(vp)->r_server->sv_hostname, NULL);
2709 #endif
2710 		if (error == ENOSPC || error == EDQUOT) {
2711 			zcmn_err(zoneid, CE_CONT,
2712 			    "^File: userid=%d, groupid=%d\n",
2713 			    crgetuid(cr), crgetgid(cr));
2714 			if (crgetuid(curthread->t_cred) != crgetuid(cr) ||
2715 			    crgetgid(curthread->t_cred) != crgetgid(cr)) {
2716 				zcmn_err(zoneid, CE_CONT,
2717 				    "^User: userid=%d, groupid=%d\n",
2718 				    crgetuid(curthread->t_cred),
2719 				    crgetgid(curthread->t_cred));
2720 			}
2721 			mi->mi_printftime = now +
2722 			    nfs_write_error_interval * hz;
2723 		}
2724 		sfh4_printfhandle(VTOR4(vp)->r_fh);
2725 #ifdef DEBUG
2726 		if (error == EACCES) {
2727 			zcmn_err(zoneid, CE_CONT,
2728 			    "nfs_bio: cred is%s kcred\n",
2729 			    cr == kcred ? "" : " not");
2730 		}
2731 #endif
2732 	}
2733 }
2734 
2735 /*
2736  * Return non-zero if the given file can be safely memory mapped.  Locks
2737  * are safe if whole-file (length and offset are both zero).
2738  */
2739 
2740 #define	SAFE_LOCK(flk)	((flk).l_start == 0 && (flk).l_len == 0)
2741 
2742 static int
2743 nfs4_safemap(const vnode_t *vp)
2744 {
2745 	locklist_t	*llp, *next_llp;
2746 	int		safe = 1;
2747 	rnode4_t	*rp = VTOR4(vp);
2748 
2749 	ASSERT(nfs_rw_lock_held(&rp->r_lkserlock, RW_WRITER));
2750 
2751 	NFS4_DEBUG(nfs4_client_map_debug, (CE_NOTE, "nfs4_safemap: "
2752 	    "vp = %p", (void *)vp));
2753 
2754 	/*
2755 	 * Review all the locks for the vnode, both ones that have been
2756 	 * acquired and ones that are pending.  We assume that
2757 	 * flk_active_locks_for_vp() has merged any locks that can be
2758 	 * merged (so that if a process has the entire file locked, it is
2759 	 * represented as a single lock).
2760 	 *
2761 	 * Note that we can't bail out of the loop if we find a non-safe
2762 	 * lock, because we have to free all the elements in the llp list.
2763 	 * We might be able to speed up this code slightly by not looking
2764 	 * at each lock's l_start and l_len fields once we've found a
2765 	 * non-safe lock.
2766 	 */
2767 
2768 	llp = flk_active_locks_for_vp(vp);
2769 	while (llp) {
2770 		NFS4_DEBUG(nfs4_client_map_debug, (CE_NOTE,
2771 		    "nfs4_safemap: active lock (%" PRId64 ", %" PRId64 ")",
2772 		    llp->ll_flock.l_start, llp->ll_flock.l_len));
2773 		if (!SAFE_LOCK(llp->ll_flock)) {
2774 			safe = 0;
2775 			NFS4_DEBUG(nfs4_client_map_debug, (CE_NOTE,
2776 			    "nfs4_safemap: unsafe active lock (%" PRId64
2777 			    ", %" PRId64 ")", llp->ll_flock.l_start,
2778 			    llp->ll_flock.l_len));
2779 		}
2780 		next_llp = llp->ll_next;
2781 		VN_RELE(llp->ll_vp);
2782 		kmem_free(llp, sizeof (*llp));
2783 		llp = next_llp;
2784 	}
2785 
2786 	NFS4_DEBUG(nfs4_client_map_debug, (CE_NOTE, "nfs4_safemap: %s",
2787 	    safe ? "safe" : "unsafe"));
2788 	return (safe);
2789 }
2790 
2791 /*
2792  * Return whether there is a lost LOCK or LOCKU queued up for the given
2793  * file that would make an mmap request unsafe.  cf. nfs4_safemap().
2794  */
2795 
2796 bool_t
2797 nfs4_map_lost_lock_conflict(vnode_t *vp)
2798 {
2799 	bool_t conflict = FALSE;
2800 	nfs4_lost_rqst_t *lrp;
2801 	mntinfo4_t *mi = VTOMI4(vp);
2802 
2803 	mutex_enter(&mi->mi_lock);
2804 	for (lrp = list_head(&mi->mi_lost_state); lrp != NULL;
2805 	    lrp = list_next(&mi->mi_lost_state, lrp)) {
2806 		if (lrp->lr_op != OP_LOCK && lrp->lr_op != OP_LOCKU)
2807 			continue;
2808 		ASSERT(lrp->lr_vp != NULL);
2809 		if (!VOP_CMP(lrp->lr_vp, vp, NULL))
2810 			continue;	/* different file */
2811 		if (!SAFE_LOCK(*lrp->lr_flk)) {
2812 			conflict = TRUE;
2813 			break;
2814 		}
2815 	}
2816 
2817 	mutex_exit(&mi->mi_lock);
2818 	return (conflict);
2819 }
2820 
2821 /*
2822  * nfs_lockcompletion:
2823  *
2824  * If the vnode has a lock that makes it unsafe to cache the file, mark it
2825  * as non cachable (set VNOCACHE bit).
2826  */
2827 
2828 void
2829 nfs4_lockcompletion(vnode_t *vp, int cmd)
2830 {
2831 	rnode4_t *rp = VTOR4(vp);
2832 
2833 	ASSERT(nfs_rw_lock_held(&rp->r_lkserlock, RW_WRITER));
2834 	ASSERT(!IS_SHADOW(vp, rp));
2835 
2836 	if (cmd == F_SETLK || cmd == F_SETLKW) {
2837 
2838 		if (!nfs4_safemap(vp)) {
2839 			mutex_enter(&vp->v_lock);
2840 			vp->v_flag |= VNOCACHE;
2841 			mutex_exit(&vp->v_lock);
2842 		} else {
2843 			mutex_enter(&vp->v_lock);
2844 			vp->v_flag &= ~VNOCACHE;
2845 			mutex_exit(&vp->v_lock);
2846 		}
2847 	}
2848 	/*
2849 	 * The cached attributes of the file are stale after acquiring
2850 	 * the lock on the file. They were updated when the file was
2851 	 * opened, but not updated when the lock was acquired. Therefore the
2852 	 * cached attributes are invalidated after the lock is obtained.
2853 	 */
2854 	PURGE_ATTRCACHE4(vp);
2855 }
2856 
2857 /* ARGSUSED */
2858 static void *
2859 nfs4_mi_init(zoneid_t zoneid)
2860 {
2861 	struct mi4_globals *mig;
2862 
2863 	mig = kmem_alloc(sizeof (*mig), KM_SLEEP);
2864 	mutex_init(&mig->mig_lock, NULL, MUTEX_DEFAULT, NULL);
2865 	list_create(&mig->mig_list, sizeof (mntinfo4_t),
2866 	    offsetof(mntinfo4_t, mi_zone_node));
2867 	mig->mig_destructor_called = B_FALSE;
2868 	return (mig);
2869 }
2870 
/*
 * Callback routine to tell all NFSv4 mounts in the zone to start tearing down
 * state and killing off threads.
 *
 * Each mntinfo4_t on the zone's list (data is the zone's mi4_globals) is
 * taken in turn: its DNLC entries are purged, async i/o is disabled, the
 * mount is flagged MI4_ASYNC_MGR_STOP|MI4_DEAD, the manager and inactive
 * threads are stopped, any recovery in progress is waited for, and the
 * mount is removed from the list and its holds released.  Afterwards every
 * nfs4_server_t belonging to the zone is given an extra reference and
 * marked dead.
 */
/* ARGSUSED */
static void
nfs4_mi_shutdown(zoneid_t zoneid, void *data)
{
	struct mi4_globals *mig = data;
	mntinfo4_t *mi;
	nfs4_server_t *np;

	NFS4_DEBUG(nfs4_client_zone_debug, (CE_NOTE,
	    "nfs4_mi_shutdown zone %d\n", zoneid));
	ASSERT(mig != NULL);
	/*
	 * Always work on the head of the list; each pass removes the entry
	 * it processed, so the loop ends when the list is empty.  Note that
	 * mig_lock stays held for the whole of each pass.
	 */
	for (;;) {
		mutex_enter(&mig->mig_lock);
		mi = list_head(&mig->mig_list);
		if (mi == NULL) {
			mutex_exit(&mig->mig_lock);
			break;
		}

		NFS4_DEBUG(nfs4_client_zone_debug, (CE_NOTE,
		    "nfs4_mi_shutdown stopping vfs %p\n", (void *)mi->mi_vfsp));
		/*
		 * purge the DNLC for this filesystem
		 */
		(void) dnlc_purge_vfsp(mi->mi_vfsp, 0);
		/*
		 * Tell existing async worker threads to exit.
		 */
		mutex_enter(&mi->mi_async_lock);
		mi->mi_max_threads = 0;
		NFS4_WAKEALL_ASYNC_WORKERS(mi->mi_async_work_cv);
		/*
		 * Set the appropriate flags, signal and wait for both the
		 * async manager and the inactive thread to exit when they're
		 * done with their current work.
		 */
		mutex_enter(&mi->mi_lock);
		mi->mi_flags |= (MI4_ASYNC_MGR_STOP|MI4_DEAD);
		mutex_exit(&mi->mi_lock);
		mutex_exit(&mi->mi_async_lock);
		if (mi->mi_manager_thread) {
			nfs4_async_manager_stop(mi->mi_vfsp);
		}
		if (mi->mi_inactive_thread) {
			mutex_enter(&mi->mi_async_lock);
			cv_signal(&mi->mi_inact_req_cv);
			/*
			 * Wait for the inactive thread to exit.
			 */
			while (mi->mi_inactive_thread != NULL) {
				cv_wait(&mi->mi_async_cv, &mi->mi_async_lock);
			}
			mutex_exit(&mi->mi_async_lock);
		}
		/*
		 * Wait for the recovery thread to complete, that is, it will
		 * signal when it is done using the "mi" structure and about
		 * to exit
		 */
		mutex_enter(&mi->mi_lock);
		while (mi->mi_in_recovery > 0)
			cv_wait(&mi->mi_cv_in_recov, &mi->mi_lock);
		mutex_exit(&mi->mi_lock);
		/*
		 * We're done when every mi has been done or the list is empty.
		 * This one is done, remove it from the list.
		 */
		list_remove(&mig->mig_list, mi);
		mutex_exit(&mig->mig_lock);
		zone_rele_ref(&mi->mi_zone_ref, ZONE_REF_NFSV4);

		/*
		 * Release hold on vfs and mi done to prevent race with zone
		 * shutdown. This releases the hold in nfs4_mi_zonelist_add.
		 */
		VFS_RELE(mi->mi_vfsp);
		MI4_RELE(mi);
	}
	/*
	 * Tell each renew thread in the zone to exit
	 */
	mutex_enter(&nfs4_server_lst_lock);
	for (np = nfs4_server_lst.forw; np != &nfs4_server_lst; np = np->forw) {
		mutex_enter(&np->s_lock);
		if (np->zoneid == zoneid) {
			/*
			 * We add another hold onto the nfs4_server_t
			 * because this will make sure that the nfs4_server_t
			 * stays around until nfs4_callback_fini_zone destroys
			 * the zone. This way, the renew thread can
			 * unconditionally release its holds on the
			 * nfs4_server_t.
			 */
			np->s_refcnt++;
			nfs4_mark_srv_dead(np);
		}
		mutex_exit(&np->s_lock);
	}
	mutex_exit(&nfs4_server_lst_lock);
}
2975 
/*
 * Tear down and free a zone's mi4_globals: destroy its (empty) mount
 * list and its mutex, then free the structure itself.  mig must not be
 * used after this returns.
 *
 * Both callers in this file (nfs4_mi_destroy() and
 * nfs4_mi_zonelist_remove()) call this with mig->mig_lock still held and
 * do not release it afterwards.
 */
static void
nfs4_mi_free_globals(struct mi4_globals *mig)
{
	list_destroy(&mig->mig_list);	/* makes sure the list is empty */
	mutex_destroy(&mig->mig_lock);
	kmem_free(mig, sizeof (*mig));
}
2983 
2984 /* ARGSUSED */
2985 static void
2986 nfs4_mi_destroy(zoneid_t zoneid, void *data)
2987 {
2988 	struct mi4_globals *mig = data;
2989 
2990 	NFS4_DEBUG(nfs4_client_zone_debug, (CE_NOTE,
2991 	    "nfs4_mi_destroy zone %d\n", zoneid));
2992 	ASSERT(mig != NULL);
2993 	mutex_enter(&mig->mig_lock);
2994 	if (list_head(&mig->mig_list) != NULL) {
2995 		/* Still waiting for VFS_FREEVFS() */
2996 		mig->mig_destructor_called = B_TRUE;
2997 		mutex_exit(&mig->mig_lock);
2998 		return;
2999 	}
3000 	nfs4_mi_free_globals(mig);
3001 }
3002 
3003 /*
3004  * Add an NFS mount to the per-zone list of NFS mounts.
3005  */
3006 void
3007 nfs4_mi_zonelist_add(mntinfo4_t *mi)
3008 {
3009 	struct mi4_globals *mig;
3010 
3011 	mig = zone_getspecific(mi4_list_key, mi->mi_zone);
3012 	mutex_enter(&mig->mig_lock);
3013 	list_insert_head(&mig->mig_list, mi);
3014 	/*
3015 	 * hold added to eliminate race with zone shutdown -this will be
3016 	 * released in mi_shutdown
3017 	 */
3018 	MI4_HOLD(mi);
3019 	VFS_HOLD(mi->mi_vfsp);
3020 	mutex_exit(&mig->mig_lock);
3021 }
3022 
3023 /*
3024  * Remove an NFS mount from the per-zone list of NFS mounts.
3025  */
3026 int
3027 nfs4_mi_zonelist_remove(mntinfo4_t *mi)
3028 {
3029 	struct mi4_globals *mig;
3030 	int ret = 0;
3031 
3032 	mig = zone_getspecific(mi4_list_key, mi->mi_zone);
3033 	mutex_enter(&mig->mig_lock);
3034 	mutex_enter(&mi->mi_lock);
3035 	/* if this mi is marked dead, then the zone already released it */
3036 	if (!(mi->mi_flags & MI4_DEAD)) {
3037 		list_remove(&mig->mig_list, mi);
3038 		mutex_exit(&mi->mi_lock);
3039 
3040 		/* release the holds put on in zonelist_add(). */
3041 		VFS_RELE(mi->mi_vfsp);
3042 		MI4_RELE(mi);
3043 		ret = 1;
3044 	} else {
3045 		mutex_exit(&mi->mi_lock);
3046 	}
3047 
3048 	/*
3049 	 * We can be called asynchronously by VFS_FREEVFS() after the zone
3050 	 * shutdown/destroy callbacks have executed; if so, clean up the zone's
3051 	 * mi globals.
3052 	 */
3053 	if (list_head(&mig->mig_list) == NULL &&
3054 	    mig->mig_destructor_called == B_TRUE) {
3055 		nfs4_mi_free_globals(mig);
3056 		return (ret);
3057 	}
3058 	mutex_exit(&mig->mig_lock);
3059 	return (ret);
3060 }
3061 
3062 void
3063 nfs_free_mi4(mntinfo4_t *mi)
3064 {
3065 	nfs4_open_owner_t	*foop;
3066 	nfs4_oo_hash_bucket_t   *bucketp;
3067 	nfs4_debug_msg_t	*msgp;
3068 	int i;
3069 	servinfo4_t 		*svp;
3070 
3071 	/*
3072 	 * Code introduced here should be carefully evaluated to make
3073 	 * sure none of the freed resources are accessed either directly
3074 	 * or indirectly after freeing them. For eg: Introducing calls to
3075 	 * NFS4_DEBUG that use mntinfo4_t structure member after freeing
3076 	 * the structure members or other routines calling back into NFS
3077 	 * accessing freed mntinfo4_t structure member.
3078 	 */
3079 	mutex_enter(&mi->mi_lock);
3080 	ASSERT(mi->mi_recovthread == NULL);
3081 	ASSERT(mi->mi_flags & MI4_ASYNC_MGR_STOP);
3082 	mutex_exit(&mi->mi_lock);
3083 	mutex_enter(&mi->mi_async_lock);
3084 	ASSERT(mi->mi_threads[NFS4_ASYNC_QUEUE] == 0 &&
3085 	    mi->mi_threads[NFS4_ASYNC_PGOPS_QUEUE] == 0);
3086 	ASSERT(mi->mi_manager_thread == NULL);
3087 	mutex_exit(&mi->mi_async_lock);
3088 	if (mi->mi_io_kstats) {
3089 		kstat_delete(mi->mi_io_kstats);
3090 		mi->mi_io_kstats = NULL;
3091 	}
3092 	if (mi->mi_ro_kstats) {
3093 		kstat_delete(mi->mi_ro_kstats);
3094 		mi->mi_ro_kstats = NULL;
3095 	}
3096 	if (mi->mi_recov_ksp) {
3097 		kstat_delete(mi->mi_recov_ksp);
3098 		mi->mi_recov_ksp = NULL;
3099 	}
3100 	mutex_enter(&mi->mi_msg_list_lock);
3101 	while (msgp = list_head(&mi->mi_msg_list)) {
3102 		list_remove(&mi->mi_msg_list, msgp);
3103 		nfs4_free_msg(msgp);
3104 	}
3105 	mutex_exit(&mi->mi_msg_list_lock);
3106 	list_destroy(&mi->mi_msg_list);
3107 	if (mi->mi_fname != NULL)
3108 		fn_rele(&mi->mi_fname);
3109 	if (mi->mi_rootfh != NULL)
3110 		sfh4_rele(&mi->mi_rootfh);
3111 	if (mi->mi_srvparentfh != NULL)
3112 		sfh4_rele(&mi->mi_srvparentfh);
3113 	svp = mi->mi_servers;
3114 	sv4_free(svp);
3115 	mutex_destroy(&mi->mi_lock);
3116 	mutex_destroy(&mi->mi_async_lock);
3117 	mutex_destroy(&mi->mi_msg_list_lock);
3118 	nfs_rw_destroy(&mi->mi_recovlock);
3119 	nfs_rw_destroy(&mi->mi_rename_lock);
3120 	nfs_rw_destroy(&mi->mi_fh_lock);
3121 	cv_destroy(&mi->mi_failover_cv);
3122 	cv_destroy(&mi->mi_async_reqs_cv);
3123 	cv_destroy(&mi->mi_async_work_cv[NFS4_ASYNC_QUEUE]);
3124 	cv_destroy(&mi->mi_async_work_cv[NFS4_ASYNC_PGOPS_QUEUE]);
3125 	cv_destroy(&mi->mi_async_cv);
3126 	cv_destroy(&mi->mi_inact_req_cv);
3127 	/*
3128 	 * Destroy the oo hash lists and mutexes for the cred hash table.
3129 	 */
3130 	for (i = 0; i < NFS4_NUM_OO_BUCKETS; i++) {
3131 		bucketp = &(mi->mi_oo_list[i]);
3132 		/* Destroy any remaining open owners on the list */
3133 		foop = list_head(&bucketp->b_oo_hash_list);
3134 		while (foop != NULL) {
3135 			list_remove(&bucketp->b_oo_hash_list, foop);
3136 			nfs4_destroy_open_owner(foop);
3137 			foop = list_head(&bucketp->b_oo_hash_list);
3138 		}
3139 		list_destroy(&bucketp->b_oo_hash_list);
3140 		mutex_destroy(&bucketp->b_lock);
3141 	}
3142 	/*
3143 	 * Empty and destroy the freed open owner list.
3144 	 */
3145 	foop = list_head(&mi->mi_foo_list);
3146 	while (foop != NULL) {
3147 		list_remove(&mi->mi_foo_list, foop);
3148 		nfs4_destroy_open_owner(foop);
3149 		foop = list_head(&mi->mi_foo_list);
3150 	}
3151 	list_destroy(&mi->mi_foo_list);
3152 	list_destroy(&mi->mi_bseqid_list);
3153 	list_destroy(&mi->mi_lost_state);
3154 	avl_destroy(&mi->mi_filehandles);
3155 	kmem_free(mi, sizeof (*mi));
3156 }

/*
 * Take a reference on a mntinfo4_t by atomically incrementing mi_count.
 * Paired with mi_rele().
 */
void
mi_hold(mntinfo4_t *mi)
{
	atomic_inc_32(&mi->mi_count);
	/* a zero count right after the increment would mean it wrapped */
	ASSERT(mi->mi_count != 0);
}
3163 
3164 void
3165 mi_rele(mntinfo4_t *mi)
3166 {
3167 	ASSERT(mi->mi_count != 0);
3168 	if (atomic_dec_32_nv(&mi->mi_count) == 0) {
3169 		nfs_free_mi4(mi);
3170 	}
3171 }
3172 
/*
 * The "notsupp xattr cache" vnode.  nfs4_clnt_init() pins its reference
 * count at 1 so that it is never inactivated.
 */
vnode_t    nfs4_xattr_notsupp_vnode;

/*
 * One-time initialization of the NFSv4 client: brings up the client
 * subsystems in order, registers the CPR (suspend/resume) callback and
 * the per-zone key for the zone mount lists, and pins the notsupp xattr
 * vnode.  nfs4_clnt_fini() undoes this.
 */
void
nfs4_clnt_init(void)
{
	nfs4_vnops_init();
	(void) nfs4_rnode_init();
	(void) nfs4_shadow_init();
	(void) nfs4_acache_init();
	(void) nfs4_subr_init();
	nfs4_acl_init();
	nfs_idmap_init();
	nfs4_callback_init();
	nfs4_secinfo_init();
#ifdef	DEBUG
	tsd_create(&nfs4_tsd_key, NULL);
#endif

	/*
	 * Add a CPR callback so that we can update client
	 * lease after a suspend and resume.
	 */
	cid = callb_add(nfs4_client_cpr_callb, 0, CB_CL_CPR_RPC, "nfs4");

	/* per-zone list of NFSv4 mounts: init/shutdown/destroy callbacks */
	zone_key_create(&mi4_list_key, nfs4_mi_init, nfs4_mi_shutdown,
	    nfs4_mi_destroy);

	/*
	 * Initialise the reference count of the notsupp xattr cache vnode to 1
	 * so that it never goes away (VOP_INACTIVE isn't called on it).
	 */
	nfs4_xattr_notsupp_vnode.v_count = 1;
}
3206 
/*
 * Undo nfs4_clnt_init(): delete the per-zone key, shut the client
 * subsystems down, and remove the CPR callback if one was registered.
 */
void
nfs4_clnt_fini(void)
{
	(void) zone_key_delete(mi4_list_key);
	nfs4_vnops_fini();
	(void) nfs4_rnode_fini();
	(void) nfs4_shadow_fini();
	(void) nfs4_acache_fini();
	(void) nfs4_subr_fini();
	nfs_idmap_fini();
	nfs4_callback_fini();
	nfs4_secinfo_fini();
#ifdef	DEBUG
	tsd_destroy(&nfs4_tsd_key);
#endif
	/* cid is the handle returned by callb_add() in nfs4_clnt_init() */
	if (cid)
		(void) callb_delete(cid);
}
3225 
3226 /*ARGSUSED*/
3227 static boolean_t
3228 nfs4_client_cpr_callb(void *arg, int code)
3229 {
3230 	/*
3231 	 * We get called for Suspend and Resume events.
3232 	 * For the suspend case we simply don't care!
3233 	 */
3234 	if (code == CB_CODE_CPR_CHKPT) {
3235 		return (B_TRUE);
3236 	}
3237 
3238 	/*
3239 	 * When we get to here we are in the process of
3240 	 * resuming the system from a previous suspend.
3241 	 */
3242 	nfs4_client_resumed = gethrestime_sec();
3243 	return (B_TRUE);
3244 }
3245 
3246 void
3247 nfs4_renew_lease_thread(nfs4_server_t *sp)
3248 {
3249 	int	error = 0;
3250 	time_t	tmp_last_renewal_time, tmp_time, tmp_now_time, kip_secs;
3251 	clock_t	tick_delay = 0;
3252 	clock_t time_left = 0;
3253 	callb_cpr_t cpr_info;
3254 	kmutex_t cpr_lock;
3255 
3256 	NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE,
3257 	    "nfs4_renew_lease_thread: acting on sp 0x%p", (void*)sp));
3258 	mutex_init(&cpr_lock, NULL, MUTEX_DEFAULT, NULL);
3259 	CALLB_CPR_INIT(&cpr_info, &cpr_lock, callb_generic_cpr, "nfsv4Lease");
3260 
3261 	mutex_enter(&sp->s_lock);
3262 	/* sp->s_lease_time is set via a GETATTR */
3263 	sp->last_renewal_time = gethrestime_sec();
3264 	sp->lease_valid = NFS4_LEASE_UNINITIALIZED;
3265 	ASSERT(sp->s_refcnt >= 1);
3266 
3267 	for (;;) {
3268 		if (!sp->state_ref_count ||
3269 		    sp->lease_valid != NFS4_LEASE_VALID) {
3270 
3271 			kip_secs = MAX((sp->s_lease_time >> 1) -
3272 			    (3 * sp->propagation_delay.tv_sec), 1);
3273 
3274 			tick_delay = SEC_TO_TICK(kip_secs);
3275 
3276 			NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE,
3277 			    "nfs4_renew_lease_thread: no renew : thread "
3278 			    "wait %ld secs", kip_secs));
3279 
3280 			NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE,
3281 			    "nfs4_renew_lease_thread: no renew : "
3282 			    "state_ref_count %d, lease_valid %d",
3283 			    sp->state_ref_count, sp->lease_valid));
3284 
3285 			mutex_enter(&cpr_lock);
3286 			CALLB_CPR_SAFE_BEGIN(&cpr_info);
3287 			mutex_exit(&cpr_lock);
3288 			time_left = cv_reltimedwait(&sp->cv_thread_exit,
3289 			    &sp->s_lock, tick_delay, TR_CLOCK_TICK);
3290 			mutex_enter(&cpr_lock);
3291 			CALLB_CPR_SAFE_END(&cpr_info, &cpr_lock);
3292 			mutex_exit(&cpr_lock);
3293 
3294 			NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE,
3295 			    "nfs4_renew_lease_thread: no renew: "
3296 			    "time left %ld", time_left));
3297 
3298 			if (sp->s_thread_exit == NFS4_THREAD_EXIT)
3299 				goto die;
3300 			continue;
3301 		}
3302 
3303 		tmp_last_renewal_time = sp->last_renewal_time;
3304 
3305 		tmp_time = gethrestime_sec() - sp->last_renewal_time +
3306 		    (3 * sp->propagation_delay.tv_sec);
3307 
3308 		NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE,
3309 		    "nfs4_renew_lease_thread: tmp_time %ld, "
3310 		    "sp->last_renewal_time %ld", tmp_time,
3311 		    sp->last_renewal_time));
3312 
3313 		kip_secs = MAX((sp->s_lease_time >> 1) - tmp_time, 1);
3314 
3315 		tick_delay = SEC_TO_TICK(kip_secs);
3316 
3317 		NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE,
3318 		    "nfs4_renew_lease_thread: valid lease: sleep for %ld "
3319 		    "secs", kip_secs));
3320 
3321 		mutex_enter(&cpr_lock);
3322 		CALLB_CPR_SAFE_BEGIN(&cpr_info);
3323 		mutex_exit(&cpr_lock);
3324 		time_left = cv_reltimedwait(&sp->cv_thread_exit, &sp->s_lock,
3325 		    tick_delay, TR_CLOCK_TICK);
3326 		mutex_enter(&cpr_lock);
3327 		CALLB_CPR_SAFE_END(&cpr_info, &cpr_lock);
3328 		mutex_exit(&cpr_lock);
3329 
3330 		NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE,
3331 		    "nfs4_renew_lease_thread: valid lease: time left %ld :"
3332 		    "sp last_renewal_time %ld, nfs4_client_resumed %ld, "
3333 		    "tmp_last_renewal_time %ld", time_left,
3334 		    sp->last_renewal_time, nfs4_client_resumed,
3335 		    tmp_last_renewal_time));
3336 
3337 		if (sp->s_thread_exit == NFS4_THREAD_EXIT)
3338 			goto die;
3339 
3340 		if (tmp_last_renewal_time == sp->last_renewal_time ||
3341 		    (nfs4_client_resumed != 0 &&
3342 		    nfs4_client_resumed > sp->last_renewal_time)) {
3343 			/*
3344 			 * Issue RENEW op since we haven't renewed the lease
3345 			 * since we slept.
3346 			 */
3347 			tmp_now_time = gethrestime_sec();
3348 			error = nfs4renew(sp);
3349 			/*
3350 			 * Need to re-acquire sp's lock, nfs4renew()
3351 			 * relinqueshes it.
3352 			 */
3353 			mutex_enter(&sp->s_lock);
3354 
3355 			/*
3356 			 * See if someone changed s_thread_exit while we gave
3357 			 * up s_lock.
3358 			 */
3359 			if (sp->s_thread_exit == NFS4_THREAD_EXIT)
3360 				goto die;
3361 
3362 			if (!error) {
3363 				/*
3364 				 * check to see if we implicitly renewed while
3365 				 * we waited for a reply for our RENEW call.
3366 				 */
3367 				if (tmp_last_renewal_time ==
3368 				    sp->last_renewal_time) {
3369 					/* no implicit renew came */
3370 					sp->last_renewal_time = tmp_now_time;
3371 				} else {
3372 					NFS4_DEBUG(nfs4_client_lease_debug,
3373 					    (CE_NOTE, "renew_thread: did "
3374 					    "implicit renewal before reply "
3375 					    "from server for RENEW"));
3376 				}
3377 			} else {
3378 				/* figure out error */
3379 				NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE,
3380 				    "renew_thread: nfs4renew returned error"
3381 				    " %d", error));
3382 			}
3383 
3384 		}
3385 	}
3386 
3387 die:
3388 	NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE,
3389 	    "nfs4_renew_lease_thread: thread exiting"));
3390 
3391 	while (sp->s_otw_call_count != 0) {
3392 		NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE,
3393 		    "nfs4_renew_lease_thread: waiting for outstanding "
3394 		    "otw calls to finish for sp 0x%p, current "
3395 		    "s_otw_call_count %d", (void *)sp,
3396 		    sp->s_otw_call_count));
3397 		mutex_enter(&cpr_lock);
3398 		CALLB_CPR_SAFE_BEGIN(&cpr_info);
3399 		mutex_exit(&cpr_lock);
3400 		cv_wait(&sp->s_cv_otw_count, &sp->s_lock);
3401 		mutex_enter(&cpr_lock);
3402 		CALLB_CPR_SAFE_END(&cpr_info, &cpr_lock);
3403 		mutex_exit(&cpr_lock);
3404 	}
3405 	mutex_exit(&sp->s_lock);
3406 
3407 	nfs4_server_rele(sp);		/* free the thread's reference */
3408 	nfs4_server_rele(sp);		/* free the list's reference */
3409 	sp = NULL;
3410 
3411 done:
3412 	mutex_enter(&cpr_lock);
3413 	CALLB_CPR_EXIT(&cpr_info);	/* drops cpr_lock */
3414 	mutex_destroy(&cpr_lock);
3415 
3416 	NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE,
3417 	    "nfs4_renew_lease_thread: renew thread exit officially"));
3418 
3419 	zthread_exit();
3420 	/* NOT REACHED */
3421 }
3422 
3423 /*
3424  * Send out a RENEW op to the server.
3425  * Assumes sp is locked down.
3426  */
3427 static int
3428 nfs4renew(nfs4_server_t *sp)
3429 {
3430 	COMPOUND4args_clnt args;
3431 	COMPOUND4res_clnt res;
3432 	nfs_argop4 argop[1];
3433 	int doqueue = 1;
3434 	int rpc_error;
3435 	cred_t *cr;
3436 	mntinfo4_t *mi;
3437 	timespec_t prop_time, after_time;
3438 	int needrecov = FALSE;
3439 	nfs4_recov_state_t recov_state;
3440 	nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };
3441 
3442 	NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE, "nfs4renew"));
3443 
3444 	recov_state.rs_flags = 0;
3445 	recov_state.rs_num_retry_despite_err = 0;
3446 
3447 recov_retry:
3448 	mi = sp->mntinfo4_list;
3449 	VFS_HOLD(mi->mi_vfsp);
3450 	mutex_exit(&sp->s_lock);
3451 	ASSERT(mi != NULL);
3452 
3453 	e.error = nfs4_start_op(mi, NULL, NULL, &recov_state);
3454 	if (e.error) {
3455 		VFS_RELE(mi->mi_vfsp);
3456 		return (e.error);
3457 	}
3458 
3459 	/* Check to see if we're dealing with a marked-dead sp */
3460 	mutex_enter(&sp->s_lock);
3461 	if (sp->s_thread_exit == NFS4_THREAD_EXIT) {
3462 		mutex_exit(&sp->s_lock);
3463 		nfs4_end_op(mi, NULL, NULL, &recov_state, needrecov);
3464 		VFS_RELE(mi->mi_vfsp);
3465 		return (0);
3466 	}
3467 
3468 	/* Make sure mi hasn't changed on us */
3469 	if (mi != sp->mntinfo4_list) {
3470 		/* Must drop sp's lock to avoid a recursive mutex enter */
3471 		mutex_exit(&sp->s_lock);
3472 		nfs4_end_op(mi, NULL, NULL, &recov_state, needrecov);
3473 		VFS_RELE(mi->mi_vfsp);
3474 		mutex_enter(&sp->s_lock);
3475 		goto recov_retry;
3476 	}
3477 	mutex_exit(&sp->s_lock);
3478 
3479 	args.ctag = TAG_RENEW;
3480 
3481 	args.array_len = 1;
3482 	args.array = argop;
3483 
3484 	argop[0].argop = OP_RENEW;
3485 
3486 	mutex_enter(&sp->s_lock);
3487 	argop[0].nfs_argop4_u.oprenew.clientid = sp->clientid;
3488 	cr = sp->s_cred;
3489 	crhold(cr);
3490 	mutex_exit(&sp->s_lock);
3491 
3492 	ASSERT(cr != NULL);
3493 
3494 	/* used to figure out RTT for sp */
3495 	gethrestime(&prop_time);
3496 
3497 	NFS4_DEBUG(nfs4_client_call_debug, (CE_NOTE,
3498 	    "nfs4renew: %s call, sp 0x%p", needrecov ? "recov" : "first",
3499 	    (void*)sp));
3500 	NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE, "before: %ld s %ld ns ",
3501 	    prop_time.tv_sec, prop_time.tv_nsec));
3502 
3503 	DTRACE_PROBE2(nfs4__renew__start, nfs4_server_t *, sp,
3504 	    mntinfo4_t *, mi);
3505 
3506 	rfs4call(mi, &args, &res, cr, &doqueue, 0, &e);
3507 	crfree(cr);
3508 
3509 	DTRACE_PROBE2(nfs4__renew__end, nfs4_server_t *, sp,
3510 	    mntinfo4_t *, mi);
3511 
3512 	gethrestime(&after_time);
3513 
3514 	mutex_enter(&sp->s_lock);
3515 	sp->propagation_delay.tv_sec =
3516 	    MAX(1, after_time.tv_sec - prop_time.tv_sec);
3517 	mutex_exit(&sp->s_lock);
3518 
3519 	NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE, "after : %ld s %ld ns ",
3520 	    after_time.tv_sec, after_time.tv_nsec));
3521 
3522 	if (e.error == 0 && res.status == NFS4ERR_CB_PATH_DOWN) {
3523 		(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
3524 		nfs4_delegreturn_all(sp);
3525 		nfs4_end_op(mi, NULL, NULL, &recov_state, needrecov);
3526 		VFS_RELE(mi->mi_vfsp);
3527 		/*
3528 		 * If the server returns CB_PATH_DOWN, it has renewed
3529 		 * the lease and informed us that the callback path is
3530 		 * down.  Since the lease is renewed, just return 0 and
3531 		 * let the renew thread proceed as normal.
3532 		 */
3533 		return (0);
3534 	}
3535 
3536 	needrecov = nfs4_needs_recovery(&e, FALSE, mi->mi_vfsp);
3537 	if (!needrecov && e.error) {
3538 		nfs4_end_op(mi, NULL, NULL, &recov_state, needrecov);
3539 		VFS_RELE(mi->mi_vfsp);
3540 		return (e.error);
3541 	}
3542 
3543 	rpc_error = e.error;
3544 
3545 	if (needrecov) {
3546 		NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
3547 		    "nfs4renew: initiating recovery\n"));
3548 
3549 		if (nfs4_start_recovery(&e, mi, NULL, NULL, NULL, NULL,
3550 		    OP_RENEW, NULL, NULL, NULL) == FALSE) {
3551 			nfs4_end_op(mi, NULL, NULL, &recov_state, needrecov);
3552 			VFS_RELE(mi->mi_vfsp);
3553 			if (!e.error)
3554 				(void) xdr_free(xdr_COMPOUND4res_clnt,
3555 				    (caddr_t)&res);
3556 			mutex_enter(&sp->s_lock);
3557 			goto recov_retry;
3558 		}
3559 		/* fall through for res.status case */
3560 	}
3561 
3562 	if (res.status) {
3563 		if (res.status == NFS4ERR_LEASE_MOVED) {
3564 			/*EMPTY*/
3565 			/*
3566 			 * XXX need to try every mntinfo4 in sp->mntinfo4_list
3567 			 * to renew the lease on that server
3568 			 */
3569 		}
3570 		e.error = geterrno4(res.status);
3571 	}
3572 
3573 	if (!rpc_error)
3574 		(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
3575 
3576 	nfs4_end_op(mi, NULL, NULL, &recov_state, needrecov);
3577 
3578 	VFS_RELE(mi->mi_vfsp);
3579 
3580 	return (e.error);
3581 }
3582 
3583 void
3584 nfs4_inc_state_ref_count(mntinfo4_t *mi)
3585 {
3586 	nfs4_server_t	*sp;
3587 
3588 	/* this locks down sp if it is found */
3589 	sp = find_nfs4_server(mi);
3590 
3591 	if (sp != NULL) {
3592 		nfs4_inc_state_ref_count_nolock(sp, mi);
3593 		mutex_exit(&sp->s_lock);
3594 		nfs4_server_rele(sp);
3595 	}
3596 }
3597 
3598 /*
3599  * Bump the number of OPEN files (ie: those with state) so we know if this
3600  * nfs4_server has any state to maintain a lease for or not.
3601  *
3602  * Also, marks the nfs4_server's lease valid if it hasn't been done so already.
3603  */
3604 void
3605 nfs4_inc_state_ref_count_nolock(nfs4_server_t *sp, mntinfo4_t *mi)
3606 {
3607 	ASSERT(mutex_owned(&sp->s_lock));
3608 
3609 	sp->state_ref_count++;
3610 	NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE,
3611 	    "nfs4_inc_state_ref_count: state_ref_count now %d",
3612 	    sp->state_ref_count));
3613 
3614 	if (sp->lease_valid == NFS4_LEASE_UNINITIALIZED)
3615 		sp->lease_valid = NFS4_LEASE_VALID;
3616 
3617 	/*
3618 	 * If this call caused the lease to be marked valid and/or
3619 	 * took the state_ref_count from 0 to 1, then start the time
3620 	 * on lease renewal.
3621 	 */
3622 	if (sp->lease_valid == NFS4_LEASE_VALID && sp->state_ref_count == 1)
3623 		sp->last_renewal_time = gethrestime_sec();
3624 
3625 	/* update the number of open files for mi */
3626 	mi->mi_open_files++;
3627 }
3628 
3629 void
3630 nfs4_dec_state_ref_count(mntinfo4_t *mi)
3631 {
3632 	nfs4_server_t	*sp;
3633 
3634 	/* this locks down sp if it is found */
3635 	sp = find_nfs4_server_all(mi, 1);
3636 
3637 	if (sp != NULL) {
3638 		nfs4_dec_state_ref_count_nolock(sp, mi);
3639 		mutex_exit(&sp->s_lock);
3640 		nfs4_server_rele(sp);
3641 	}
3642 }
3643 
3644 /*
3645  * Decrement the number of OPEN files (ie: those with state) so we know if
3646  * this nfs4_server has any state to maintain a lease for or not.
3647  */
3648 void
3649 nfs4_dec_state_ref_count_nolock(nfs4_server_t *sp, mntinfo4_t *mi)
3650 {
3651 	ASSERT(mutex_owned(&sp->s_lock));
3652 	ASSERT(sp->state_ref_count != 0);
3653 	sp->state_ref_count--;
3654 
3655 	NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE,
3656 	    "nfs4_dec_state_ref_count: state ref count now %d",
3657 	    sp->state_ref_count));
3658 
3659 	mi->mi_open_files--;
3660 	NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE,
3661 	    "nfs4_dec_state_ref_count: mi open files %d, v4 flags 0x%x",
3662 	    mi->mi_open_files, mi->mi_flags));
3663 
3664 	/* We don't have to hold the mi_lock to test mi_flags */
3665 	if (mi->mi_open_files == 0 &&
3666 	    (mi->mi_flags & MI4_REMOVE_ON_LAST_CLOSE)) {
3667 		NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE,
3668 		    "nfs4_dec_state_ref_count: remove mntinfo4 %p since "
3669 		    "we have closed the last open file", (void*)mi));
3670 		nfs4_remove_mi_from_server(mi, sp);
3671 	}
3672 }
3673 
3674 bool_t
3675 inlease(nfs4_server_t *sp)
3676 {
3677 	bool_t result;
3678 
3679 	ASSERT(mutex_owned(&sp->s_lock));
3680 
3681 	if (sp->lease_valid == NFS4_LEASE_VALID &&
3682 	    gethrestime_sec() < sp->last_renewal_time + sp->s_lease_time)
3683 		result = TRUE;
3684 	else
3685 		result = FALSE;
3686 
3687 	return (result);
3688 }
3689 
3690 
/*
 * Return non-zero if the given nfs4_server_t is going through recovery.
 * Recovery is detected by s_recovlock being held as a writer.
 */

int
nfs4_server_in_recovery(nfs4_server_t *sp)
{
	return (nfs_rw_lock_held(&sp->s_recovlock, RW_WRITER));
}
3700 
3701 /*
3702  * Compare two shared filehandle objects.  Returns -1, 0, or +1, if the
3703  * first is less than, equal to, or greater than the second.
3704  */
3705 
3706 int
3707 sfh4cmp(const void *p1, const void *p2)
3708 {
3709 	const nfs4_sharedfh_t *sfh1 = (const nfs4_sharedfh_t *)p1;
3710 	const nfs4_sharedfh_t *sfh2 = (const nfs4_sharedfh_t *)p2;
3711 
3712 	return (nfs4cmpfh(&sfh1->sfh_fh, &sfh2->sfh_fh));
3713 }
3714 
/*
 * Create a table for shared filehandle objects.  The table is an AVL tree
 * of nfs4_sharedfh_t nodes, linked through sfh_tree and ordered by
 * sfh4cmp().
 */

void
sfh4_createtab(avl_tree_t *tab)
{
	avl_create(tab, sfh4cmp, sizeof (nfs4_sharedfh_t),
	    offsetof(nfs4_sharedfh_t, sfh_tree));
}
3725 
3726 /*
3727  * Return a shared filehandle object for the given filehandle.  The caller
3728  * is responsible for eventually calling sfh4_rele().
3729  */
3730 
3731 nfs4_sharedfh_t *
3732 sfh4_put(const nfs_fh4 *fh, mntinfo4_t *mi, nfs4_sharedfh_t *key)
3733 {
3734 	nfs4_sharedfh_t *sfh, *nsfh;
3735 	avl_index_t where;
3736 	nfs4_sharedfh_t skey;
3737 
3738 	if (!key) {
3739 		skey.sfh_fh = *fh;
3740 		key = &skey;
3741 	}
3742 
3743 	nsfh = kmem_alloc(sizeof (nfs4_sharedfh_t), KM_SLEEP);
3744 	nsfh->sfh_fh.nfs_fh4_len = fh->nfs_fh4_len;
3745 	/*
3746 	 * We allocate the largest possible filehandle size because it's
3747 	 * not that big, and it saves us from possibly having to resize the
3748 	 * buffer later.
3749 	 */
3750 	nsfh->sfh_fh.nfs_fh4_val = kmem_alloc(NFS4_FHSIZE, KM_SLEEP);
3751 	bcopy(fh->nfs_fh4_val, nsfh->sfh_fh.nfs_fh4_val, fh->nfs_fh4_len);
3752 	mutex_init(&nsfh->sfh_lock, NULL, MUTEX_DEFAULT, NULL);
3753 	nsfh->sfh_refcnt = 1;
3754 	nsfh->sfh_flags = SFH4_IN_TREE;
3755 	nsfh->sfh_mi = mi;
3756 	NFS4_DEBUG(nfs4_sharedfh_debug, (CE_NOTE, "sfh4_get: new object (%p)",
3757 	    (void *)nsfh));
3758 
3759 	(void) nfs_rw_enter_sig(&mi->mi_fh_lock, RW_WRITER, 0);
3760 	sfh = avl_find(&mi->mi_filehandles, key, &where);
3761 	if (sfh != NULL) {
3762 		mutex_enter(&sfh->sfh_lock);
3763 		sfh->sfh_refcnt++;
3764 		mutex_exit(&sfh->sfh_lock);
3765 		nfs_rw_exit(&mi->mi_fh_lock);
3766 		/* free our speculative allocs */
3767 		kmem_free(nsfh->sfh_fh.nfs_fh4_val, NFS4_FHSIZE);
3768 		kmem_free(nsfh, sizeof (nfs4_sharedfh_t));
3769 		return (sfh);
3770 	}
3771 
3772 	avl_insert(&mi->mi_filehandles, nsfh, where);
3773 	nfs_rw_exit(&mi->mi_fh_lock);
3774 
3775 	return (nsfh);
3776 }
3777 
3778 /*
3779  * Return a shared filehandle object for the given filehandle.  The caller
3780  * is responsible for eventually calling sfh4_rele().
3781  */
3782 
3783 nfs4_sharedfh_t *
3784 sfh4_get(const nfs_fh4 *fh, mntinfo4_t *mi)
3785 {
3786 	nfs4_sharedfh_t *sfh;
3787 	nfs4_sharedfh_t key;
3788 
3789 	ASSERT(fh->nfs_fh4_len <= NFS4_FHSIZE);
3790 
3791 #ifdef DEBUG
3792 	if (nfs4_sharedfh_debug) {
3793 		nfs4_fhandle_t fhandle;
3794 
3795 		fhandle.fh_len = fh->nfs_fh4_len;
3796 		bcopy(fh->nfs_fh4_val, fhandle.fh_buf, fhandle.fh_len);
3797 		zcmn_err(mi->mi_zone->zone_id, CE_NOTE, "sfh4_get:");
3798 		nfs4_printfhandle(&fhandle);
3799 	}
3800 #endif
3801 
3802 	/*
3803 	 * If there's already an object for the given filehandle, bump the
3804 	 * reference count and return it.  Otherwise, create a new object
3805 	 * and add it to the AVL tree.
3806 	 */
3807 
3808 	key.sfh_fh = *fh;
3809 
3810 	(void) nfs_rw_enter_sig(&mi->mi_fh_lock, RW_READER, 0);
3811 	sfh = avl_find(&mi->mi_filehandles, &key, NULL);
3812 	if (sfh != NULL) {
3813 		mutex_enter(&sfh->sfh_lock);
3814 		sfh->sfh_refcnt++;
3815 		NFS4_DEBUG(nfs4_sharedfh_debug, (CE_NOTE,
3816 		    "sfh4_get: found existing %p, new refcnt=%d",
3817 		    (void *)sfh, sfh->sfh_refcnt));
3818 		mutex_exit(&sfh->sfh_lock);
3819 		nfs_rw_exit(&mi->mi_fh_lock);
3820 		return (sfh);
3821 	}
3822 	nfs_rw_exit(&mi->mi_fh_lock);
3823 
3824 	return (sfh4_put(fh, mi, &key));
3825 }
3826 
3827 /*
3828  * Get a reference to the given shared filehandle object.
3829  */
3830 
3831 void
3832 sfh4_hold(nfs4_sharedfh_t *sfh)
3833 {
3834 	ASSERT(sfh->sfh_refcnt > 0);
3835 
3836 	mutex_enter(&sfh->sfh_lock);
3837 	sfh->sfh_refcnt++;
3838 	NFS4_DEBUG(nfs4_sharedfh_debug,
3839 	    (CE_NOTE, "sfh4_hold %p, new refcnt=%d",
3840 	    (void *)sfh, sfh->sfh_refcnt));
3841 	mutex_exit(&sfh->sfh_lock);
3842 }
3843 
3844 /*
3845  * Release a reference to the given shared filehandle object and null out
3846  * the given pointer.
3847  */
3848 
3849 void
3850 sfh4_rele(nfs4_sharedfh_t **sfhpp)
3851 {
3852 	mntinfo4_t *mi;
3853 	nfs4_sharedfh_t *sfh = *sfhpp;
3854 
3855 	ASSERT(sfh->sfh_refcnt > 0);
3856 
3857 	mutex_enter(&sfh->sfh_lock);
3858 	if (sfh->sfh_refcnt > 1) {
3859 		sfh->sfh_refcnt--;
3860 		NFS4_DEBUG(nfs4_sharedfh_debug, (CE_NOTE,
3861 		    "sfh4_rele %p, new refcnt=%d",
3862 		    (void *)sfh, sfh->sfh_refcnt));
3863 		mutex_exit(&sfh->sfh_lock);
3864 		goto finish;
3865 	}
3866 	mutex_exit(&sfh->sfh_lock);
3867 
3868 	/*
3869 	 * Possibly the last reference, so get the lock for the table in
3870 	 * case it's time to remove the object from the table.
3871 	 */
3872 	mi = sfh->sfh_mi;
3873 	(void) nfs_rw_enter_sig(&mi->mi_fh_lock, RW_WRITER, 0);
3874 	mutex_enter(&sfh->sfh_lock);
3875 	sfh->sfh_refcnt--;
3876 	if (sfh->sfh_refcnt > 0) {
3877 		NFS4_DEBUG(nfs4_sharedfh_debug, (CE_NOTE,
3878 		    "sfh4_rele %p, new refcnt=%d",
3879 		    (void *)sfh, sfh->sfh_refcnt));
3880 		mutex_exit(&sfh->sfh_lock);
3881 		nfs_rw_exit(&mi->mi_fh_lock);
3882 		goto finish;
3883 	}
3884 
3885 	NFS4_DEBUG(nfs4_sharedfh_debug, (CE_NOTE,
3886 	    "sfh4_rele %p, last ref", (void *)sfh));
3887 	if (sfh->sfh_flags & SFH4_IN_TREE) {
3888 		avl_remove(&mi->mi_filehandles, sfh);
3889 		sfh->sfh_flags &= ~SFH4_IN_TREE;
3890 	}
3891 	mutex_exit(&sfh->sfh_lock);
3892 	nfs_rw_exit(&mi->mi_fh_lock);
3893 	mutex_destroy(&sfh->sfh_lock);
3894 	kmem_free(sfh->sfh_fh.nfs_fh4_val, NFS4_FHSIZE);
3895 	kmem_free(sfh, sizeof (nfs4_sharedfh_t));
3896 
3897 finish:
3898 	*sfhpp = NULL;
3899 }
3900 
3901 /*
3902  * Update the filehandle for the given shared filehandle object.
3903  */
3904 
3905 int nfs4_warn_dupfh = 0;	/* if set, always warn about dup fhs below */
3906 
3907 void
3908 sfh4_update(nfs4_sharedfh_t *sfh, const nfs_fh4 *newfh)
3909 {
3910 	mntinfo4_t *mi = sfh->sfh_mi;
3911 	nfs4_sharedfh_t *dupsfh;
3912 	avl_index_t where;
3913 	nfs4_sharedfh_t key;
3914 
3915 #ifdef DEBUG
3916 	mutex_enter(&sfh->sfh_lock);
3917 	ASSERT(sfh->sfh_refcnt > 0);
3918 	mutex_exit(&sfh->sfh_lock);
3919 #endif
3920 	ASSERT(newfh->nfs_fh4_len <= NFS4_FHSIZE);
3921 
3922 	/*
3923 	 * The basic plan is to remove the shared filehandle object from
3924 	 * the table, update it to have the new filehandle, then reinsert
3925 	 * it.
3926 	 */
3927 
3928 	(void) nfs_rw_enter_sig(&mi->mi_fh_lock, RW_WRITER, 0);
3929 	mutex_enter(&sfh->sfh_lock);
3930 	if (sfh->sfh_flags & SFH4_IN_TREE) {
3931 		avl_remove(&mi->mi_filehandles, sfh);
3932 		sfh->sfh_flags &= ~SFH4_IN_TREE;
3933 	}
3934 	mutex_exit(&sfh->sfh_lock);
3935 	sfh->sfh_fh.nfs_fh4_len = newfh->nfs_fh4_len;
3936 	bcopy(newfh->nfs_fh4_val, sfh->sfh_fh.nfs_fh4_val,
3937 	    sfh->sfh_fh.nfs_fh4_len);
3938 
3939 	/*
3940 	 * XXX If there is already a shared filehandle object with the new
3941 	 * filehandle, we're in trouble, because the rnode code assumes
3942 	 * that there is only one shared filehandle object for a given
3943 	 * filehandle.  So issue a warning (for read-write mounts only)
3944 	 * and don't try to re-insert the given object into the table.
3945 	 * Hopefully the given object will quickly go away and everyone
3946 	 * will use the new object.
3947 	 */
3948 	key.sfh_fh = *newfh;
3949 	dupsfh = avl_find(&mi->mi_filehandles, &key, &where);
3950 	if (dupsfh != NULL) {
3951 		if (!(mi->mi_vfsp->vfs_flag & VFS_RDONLY) || nfs4_warn_dupfh) {
3952 			zcmn_err(mi->mi_zone->zone_id, CE_WARN, "sfh4_update: "
3953 			    "duplicate filehandle detected");
3954 			sfh4_printfhandle(dupsfh);
3955 		}
3956 	} else {
3957 		avl_insert(&mi->mi_filehandles, sfh, where);
3958 		mutex_enter(&sfh->sfh_lock);
3959 		sfh->sfh_flags |= SFH4_IN_TREE;
3960 		mutex_exit(&sfh->sfh_lock);
3961 	}
3962 	nfs_rw_exit(&mi->mi_fh_lock);
3963 }
3964 
3965 /*
3966  * Copy out the current filehandle for the given shared filehandle object.
3967  */
3968 
3969 void
3970 sfh4_copyval(const nfs4_sharedfh_t *sfh, nfs4_fhandle_t *fhp)
3971 {
3972 	mntinfo4_t *mi = sfh->sfh_mi;
3973 
3974 	ASSERT(sfh->sfh_refcnt > 0);
3975 
3976 	(void) nfs_rw_enter_sig(&mi->mi_fh_lock, RW_READER, 0);
3977 	fhp->fh_len = sfh->sfh_fh.nfs_fh4_len;
3978 	ASSERT(fhp->fh_len <= NFS4_FHSIZE);
3979 	bcopy(sfh->sfh_fh.nfs_fh4_val, fhp->fh_buf, fhp->fh_len);
3980 	nfs_rw_exit(&mi->mi_fh_lock);
3981 }
3982 
/*
 * Print out the filehandle for the given shared filehandle object.  A
 * snapshot of the filehandle is taken with sfh4_copyval() and handed to
 * nfs4_printfhandle().
 */

void
sfh4_printfhandle(const nfs4_sharedfh_t *sfh)
{
	nfs4_fhandle_t fhandle;

	sfh4_copyval(sfh, &fhandle);
	nfs4_printfhandle(&fhandle);
}
3995 
3996 /*
3997  * Compare 2 fnames.  Returns -1 if the first is "less" than the second, 0
3998  * if they're the same, +1 if the first is "greater" than the second.  The
3999  * caller (or whoever's calling the AVL package) is responsible for
4000  * handling locking issues.
4001  */
4002 
4003 static int
4004 fncmp(const void *p1, const void *p2)
4005 {
4006 	const nfs4_fname_t *f1 = p1;
4007 	const nfs4_fname_t *f2 = p2;
4008 	int res;
4009 
4010 	res = strcmp(f1->fn_name, f2->fn_name);
4011 	/*
4012 	 * The AVL package wants +/-1, not arbitrary positive or negative
4013 	 * integers.
4014 	 */
4015 	if (res > 0)
4016 		res = 1;
4017 	else if (res < 0)
4018 		res = -1;
4019 	return (res);
4020 }
4021 
/*
 * Get or create an fname with the given name, as a child of the given
 * fname.  The caller is responsible for eventually releasing the reference
 * (fn_rele()).  parent may be NULL.
 *
 * An existing child is reused only if both its name and its fn_sfh match.
 * A child with the right name but a different fn_sfh is unlinked from the
 * parent and the lookup is retried.  A newly created fname takes a hold on
 * parent (when non-NULL) and on sfh.
 */

nfs4_fname_t *
fn_get(nfs4_fname_t *parent, char *name, nfs4_sharedfh_t *sfh)
{
	nfs4_fname_t key;
	nfs4_fname_t *fnp;
	avl_index_t where;

	/* only fn_name is consulted by fncmp(), so that is all we fill in */
	key.fn_name = name;

	/*
	 * If there's already an fname registered with the given name, bump
	 * its reference count and return it.  Otherwise, create a new one
	 * and add it to the parent's AVL tree.
	 *
	 * fname entries we are looking for should match both name
	 * and sfh stored in the fname.
	 */
again:
	if (parent != NULL) {
		mutex_enter(&parent->fn_lock);
		fnp = avl_find(&parent->fn_children, &key, &where);
		if (fnp != NULL) {
			/*
			 * This hold on fnp is released below later,
			 * in case this is not the fnp we want.
			 */
			fn_hold(fnp);

			if (fnp->fn_sfh == sfh) {
				/*
				 * We have found our entry.  The hold taken
				 * just above is the caller's reference.
				 */
				mutex_exit(&parent->fn_lock);
				return (fnp);
			}

			/*
			 * We have found an entry that has a mismatching
			 * fn_sfh. This could be a stale entry due to
			 * server side rename. We will remove this entry
			 * and make sure no such entries exist.
			 */
			mutex_exit(&parent->fn_lock);
			mutex_enter(&fnp->fn_lock);
			/* recheck: parent's lock was dropped in between */
			if (fnp->fn_parent == parent) {
				/*
				 * Remove ourselves from parent's
				 * fn_children tree.
				 */
				mutex_enter(&parent->fn_lock);
				avl_remove(&parent->fn_children, fnp);
				mutex_exit(&parent->fn_lock);
				/* drop fnp's hold on its former parent */
				fn_rele(&fnp->fn_parent);
			}
			mutex_exit(&fnp->fn_lock);
			/* drop the temporary hold taken above, then retry */
			fn_rele(&fnp);
			goto again;
		}
		/*
		 * Not found: parent->fn_lock stays held until the new
		 * fname has been inserted below, so "where" remains valid.
		 */
	}

	fnp = kmem_alloc(sizeof (nfs4_fname_t), KM_SLEEP);
	mutex_init(&fnp->fn_lock, NULL, MUTEX_DEFAULT, NULL);
	fnp->fn_parent = parent;
	if (parent != NULL)
		fn_hold(parent);
	fnp->fn_len = strlen(name);
	ASSERT(fnp->fn_len < MAXNAMELEN);
	fnp->fn_name = kmem_alloc(fnp->fn_len + 1, KM_SLEEP);
	(void) strcpy(fnp->fn_name, name);
	/* the initial reference belongs to the caller */
	fnp->fn_refcnt = 1;

	/*
	 * This hold on sfh is later released
	 * when we do the final fn_rele() on this fname.
	 */
	sfh4_hold(sfh);
	fnp->fn_sfh = sfh;

	avl_create(&fnp->fn_children, fncmp, sizeof (nfs4_fname_t),
	    offsetof(nfs4_fname_t, fn_tree));
	NFS4_DEBUG(nfs4_fname_debug, (CE_NOTE,
	    "fn_get %p:%s, a new nfs4_fname_t!",
	    (void *)fnp, fnp->fn_name));
	if (parent != NULL) {
		avl_insert(&parent->fn_children, fnp, where);
		mutex_exit(&parent->fn_lock);
	}

	return (fnp);
}
4119 
4120 void
4121 fn_hold(nfs4_fname_t *fnp)
4122 {
4123 	atomic_inc_32(&fnp->fn_refcnt);
4124 	NFS4_DEBUG(nfs4_fname_debug, (CE_NOTE,
4125 	    "fn_hold %p:%s, new refcnt=%d",
4126 	    (void *)fnp, fnp->fn_name, fnp->fn_refcnt));
4127 }
4128 
4129 /*
4130  * Decrement the reference count of the given fname, and destroy it if its
4131  * reference count goes to zero.  Nulls out the given pointer.
4132  */
4133 
4134 void
4135 fn_rele(nfs4_fname_t **fnpp)
4136 {
4137 	nfs4_fname_t *parent;
4138 	uint32_t newref;
4139 	nfs4_fname_t *fnp;
4140 
4141 recur:
4142 	fnp = *fnpp;
4143 	*fnpp = NULL;
4144 
4145 	mutex_enter(&fnp->fn_lock);
4146 	parent = fnp->fn_parent;
4147 	if (parent != NULL)
4148 		mutex_enter(&parent->fn_lock);	/* prevent new references */
4149 	newref = atomic_dec_32_nv(&fnp->fn_refcnt);
4150 	if (newref > 0) {
4151 		NFS4_DEBUG(nfs4_fname_debug, (CE_NOTE,
4152 		    "fn_rele %p:%s, new refcnt=%d",
4153 		    (void *)fnp, fnp->fn_name, fnp->fn_refcnt));
4154 		if (parent != NULL)
4155 			mutex_exit(&parent->fn_lock);
4156 		mutex_exit(&fnp->fn_lock);
4157 		return;
4158 	}
4159 
4160 	NFS4_DEBUG(nfs4_fname_debug, (CE_NOTE,
4161 	    "fn_rele %p:%s, last reference, deleting...",
4162 	    (void *)fnp, fnp->fn_name));
4163 	if (parent != NULL) {
4164 		avl_remove(&parent->fn_children, fnp);
4165 		mutex_exit(&parent->fn_lock);
4166 	}
4167 	kmem_free(fnp->fn_name, fnp->fn_len + 1);
4168 	sfh4_rele(&fnp->fn_sfh);
4169 	mutex_destroy(&fnp->fn_lock);
4170 	avl_destroy(&fnp->fn_children);
4171 	kmem_free(fnp, sizeof (nfs4_fname_t));
4172 	/*
4173 	 * Recursivly fn_rele the parent.
4174 	 * Use goto instead of a recursive call to avoid stack overflow.
4175 	 */
4176 	if (parent != NULL) {
4177 		fnpp = &parent;
4178 		goto recur;
4179 	}
4180 }
4181 
4182 /*
4183  * Returns the single component name of the given fname, in a MAXNAMELEN
4184  * string buffer, which the caller is responsible for freeing.  Note that
4185  * the name may become invalid as a result of fn_move().
4186  */
4187 
4188 char *
4189 fn_name(nfs4_fname_t *fnp)
4190 {
4191 	char *name;
4192 
4193 	ASSERT(fnp->fn_len < MAXNAMELEN);
4194 	name = kmem_alloc(MAXNAMELEN, KM_SLEEP);
4195 	mutex_enter(&fnp->fn_lock);
4196 	(void) strcpy(name, fnp->fn_name);
4197 	mutex_exit(&fnp->fn_lock);
4198 
4199 	return (name);
4200 }
4201 
4202 
4203 /*
4204  * fn_path_realloc
4205  *
4206  * This function, used only by fn_path, constructs
4207  * a new string which looks like "prepend" + "/" + "current".
4208  * by allocating a new string and freeing the old one.
4209  */
4210 static void
4211 fn_path_realloc(char **curses, char *prepend)
4212 {
4213 	int len, curlen = 0;
4214 	char *news;
4215 
4216 	if (*curses == NULL) {
4217 		/*
4218 		 * Prime the pump, allocate just the
4219 		 * space for prepend and return that.
4220 		 */
4221 		len = strlen(prepend) + 1;
4222 		news = kmem_alloc(len, KM_SLEEP);
4223 		(void) strncpy(news, prepend, len);
4224 	} else {
4225 		/*
4226 		 * Allocate the space  for a new string
4227 		 * +1 +1 is for the "/" and the NULL
4228 		 * byte at the end of it all.
4229 		 */
4230 		curlen = strlen(*curses);
4231 		len = curlen + strlen(prepend) + 1 + 1;
4232 		news = kmem_alloc(len, KM_SLEEP);
4233 		(void) strncpy(news, prepend, len);
4234 		(void) strcat(news, "/");
4235 		(void) strcat(news, *curses);
4236 		kmem_free(*curses, curlen + 1);
4237 	}
4238 	*curses = news;
4239 }
4240 
4241 /*
4242  * Returns the path name (starting from the fs root) for the given fname.
4243  * The caller is responsible for freeing.  Note that the path may be or
4244  * become invalid as a result of fn_move().
4245  */
4246 
4247 char *
4248 fn_path(nfs4_fname_t *fnp)
4249 {
4250 	char *path;
4251 	nfs4_fname_t *nextfnp;
4252 
4253 	if (fnp == NULL)
4254 		return (NULL);
4255 
4256 	path = NULL;
4257 
4258 	/* walk up the tree constructing the pathname.  */
4259 
4260 	fn_hold(fnp);			/* adjust for later rele */
4261 	do {
4262 		mutex_enter(&fnp->fn_lock);
4263 		/*
4264 		 * Add fn_name in front of the current path
4265 		 */
4266 		fn_path_realloc(&path, fnp->fn_name);
4267 		nextfnp = fnp->fn_parent;
4268 		if (nextfnp != NULL)
4269 			fn_hold(nextfnp);
4270 		mutex_exit(&fnp->fn_lock);
4271 		fn_rele(&fnp);
4272 		fnp = nextfnp;
4273 	} while (fnp != NULL);
4274 
4275 	return (path);
4276 }
4277 
4278 /*
4279  * Return a reference to the parent of the given fname, which the caller is
4280  * responsible for eventually releasing.
4281  */
4282 
4283 nfs4_fname_t *
4284 fn_parent(nfs4_fname_t *fnp)
4285 {
4286 	nfs4_fname_t *parent;
4287 
4288 	mutex_enter(&fnp->fn_lock);
4289 	parent = fnp->fn_parent;
4290 	if (parent != NULL)
4291 		fn_hold(parent);
4292 	mutex_exit(&fnp->fn_lock);
4293 
4294 	return (parent);
4295 }
4296 
4297 /*
4298  * Update fnp so that its parent is newparent and its name is newname.
4299  */
4300 
4301 void
4302 fn_move(nfs4_fname_t *fnp, nfs4_fname_t *newparent, char *newname)
4303 {
4304 	nfs4_fname_t *parent, *tmpfnp;
4305 	ssize_t newlen;
4306 	nfs4_fname_t key;
4307 	avl_index_t where;
4308 
4309 	/*
4310 	 * This assert exists to catch the client trying to rename
4311 	 * a dir to be a child of itself.  This happened at a recent
4312 	 * bakeoff against a 3rd party (broken) server which allowed
4313 	 * the rename to succeed.  If it trips it means that:
4314 	 *	a) the code in nfs4rename that detects this case is broken
4315 	 *	b) the server is broken (since it allowed the bogus rename)
4316 	 *
4317 	 * For non-DEBUG kernels, prepare for a recursive mutex_enter
4318 	 * panic below from:  mutex_enter(&newparent->fn_lock);
4319 	 */
4320 	ASSERT(fnp != newparent);
4321 
4322 	/*
4323 	 * Remove fnp from its current parent, change its name, then add it
4324 	 * to newparent. It might happen that fnp was replaced by another
4325 	 * nfs4_fname_t with the same fn_name in parent->fn_children.
4326 	 * In such case, fnp->fn_parent is NULL and we skip the removal
4327 	 * of fnp from its current parent.
4328 	 */
4329 	mutex_enter(&fnp->fn_lock);
4330 	parent = fnp->fn_parent;
4331 	if (parent != NULL) {
4332 		mutex_enter(&parent->fn_lock);
4333 		avl_remove(&parent->fn_children, fnp);
4334 		mutex_exit(&parent->fn_lock);
4335 		fn_rele(&fnp->fn_parent);
4336 	}
4337 
4338 	newlen = strlen(newname);
4339 	if (newlen != fnp->fn_len) {
4340 		ASSERT(newlen < MAXNAMELEN);
4341 		kmem_free(fnp->fn_name, fnp->fn_len + 1);
4342 		fnp->fn_name = kmem_alloc(newlen + 1, KM_SLEEP);
4343 		fnp->fn_len = newlen;
4344 	}
4345 	(void) strcpy(fnp->fn_name, newname);
4346 
4347 again:
4348 	mutex_enter(&newparent->fn_lock);
4349 	key.fn_name = fnp->fn_name;
4350 	tmpfnp = avl_find(&newparent->fn_children, &key, &where);
4351 	if (tmpfnp != NULL) {
4352 		/*
4353 		 * This could be due to a file that was unlinked while
4354 		 * open, or perhaps the rnode is in the free list.  Remove
4355 		 * it from newparent and let it go away on its own.  The
4356 		 * contorted code is to deal with lock order issues and
4357 		 * race conditions.
4358 		 */
4359 		fn_hold(tmpfnp);
4360 		mutex_exit(&newparent->fn_lock);
4361 		mutex_enter(&tmpfnp->fn_lock);
4362 		if (tmpfnp->fn_parent == newparent) {
4363 			mutex_enter(&newparent->fn_lock);
4364 			avl_remove(&newparent->fn_children, tmpfnp);
4365 			mutex_exit(&newparent->fn_lock);
4366 			fn_rele(&tmpfnp->fn_parent);
4367 		}
4368 		mutex_exit(&tmpfnp->fn_lock);
4369 		fn_rele(&tmpfnp);
4370 		goto again;
4371 	}
4372 	fnp->fn_parent = newparent;
4373 	fn_hold(newparent);
4374 	avl_insert(&newparent->fn_children, fnp, where);
4375 	mutex_exit(&newparent->fn_lock);
4376 	mutex_exit(&fnp->fn_lock);
4377 }
4378 
#ifdef DEBUG
/*
 * Sanity-check the type recorded in the vnode against the type recorded in
 * the rnode's cached attributes.  Always returns 1; when checking is
 * enabled via nfs4_vtype_debug and the two types are both known (not VNON)
 * yet differ, the system is panicked instead.
 */
int
nfs4_consistent_type(vnode_t *vp)
{
	rnode4_t *rp = VTOR4(vp);

	/* Checking is switched off. */
	if (!nfs4_vtype_debug)
		return (1);

	/* Nothing to compare until both sides have a real type. */
	if (vp->v_type == VNON || rp->r_attr.va_type == VNON)
		return (1);

	if (vp->v_type != rp->r_attr.va_type) {
		cmn_err(CE_PANIC, "vnode %p type mismatch; v_type=%d, "
		    "rnode attr type=%d", (void *)vp, vp->v_type,
		    rp->r_attr.va_type);
	}

	return (1);
}
#endif /* DEBUG */
4399