xref: /illumos-gate/usr/src/uts/common/fs/nfs/nfs4_client.c (revision c5749750a3e052f1194f65a303456224c51dea63)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright (c) 1986, 2010, Oracle and/or its affiliates. All rights reserved.
23  * Copyright (c) 2017 by Delphix. All rights reserved.
24  */
25 
26 /*
27  *  	Copyright (c) 1983,1984,1985,1986,1987,1988,1989  AT&T.
28  *	All Rights Reserved
29  */
30 
31 #include <sys/param.h>
32 #include <sys/types.h>
33 #include <sys/systm.h>
34 #include <sys/thread.h>
35 #include <sys/t_lock.h>
36 #include <sys/time.h>
37 #include <sys/vnode.h>
38 #include <sys/vfs.h>
39 #include <sys/errno.h>
40 #include <sys/buf.h>
41 #include <sys/stat.h>
42 #include <sys/cred.h>
43 #include <sys/kmem.h>
44 #include <sys/debug.h>
45 #include <sys/dnlc.h>
46 #include <sys/vmsystm.h>
47 #include <sys/flock.h>
48 #include <sys/share.h>
49 #include <sys/cmn_err.h>
50 #include <sys/tiuser.h>
51 #include <sys/sysmacros.h>
52 #include <sys/callb.h>
53 #include <sys/acl.h>
54 #include <sys/kstat.h>
55 #include <sys/signal.h>
56 #include <sys/disp.h>
57 #include <sys/atomic.h>
58 #include <sys/list.h>
59 #include <sys/sdt.h>
60 
61 #include <rpc/types.h>
62 #include <rpc/xdr.h>
63 #include <rpc/auth.h>
64 #include <rpc/clnt.h>
65 
66 #include <nfs/nfs.h>
67 #include <nfs/nfs_clnt.h>
68 #include <nfs/nfs_acl.h>
69 
70 #include <nfs/nfs4.h>
71 #include <nfs/rnode4.h>
72 #include <nfs/nfs4_clnt.h>
73 
74 #include <vm/hat.h>
75 #include <vm/as.h>
76 #include <vm/page.h>
77 #include <vm/pvn.h>
78 #include <vm/seg.h>
79 #include <vm/seg_map.h>
80 #include <vm/seg_vn.h>
81 
82 #include <sys/ddi.h>
83 
/*
 * Arguments to page-flush thread.
 *
 * An instance is allocated by nfs4_purge_caches() and released by
 * nfs4_pgflush_thread().  The creator takes a hold on both members
 * (VN_HOLD/crhold) and the thread drops them (VN_RELE/crfree) when done.
 */
typedef struct {
	vnode_t *vp;	/* file whose pages are to be flushed */
	cred_t *cr;	/* credentials handed to the flush */
} pgflush_t;
91 
#ifdef DEBUG
/*
 * Debug-only tunables.
 * NOTE(review): presumably tested by NFS4_DEBUG() call sites elsewhere;
 * none of these three is referenced in this part of the file.
 */
int nfs4_client_lease_debug;
int nfs4_sharedfh_debug;
int nfs4_fname_debug;

/* temporary: panic if v_type is inconsistent with r_attr va_type */
int nfs4_vtype_debug;

/*
 * Thread-specific-data key.  nfs4_purge_stale_fh() asserts that the
 * value stored under it is NULL, i.e. that the ..._end_op() call has
 * been done.
 */
uint_t nfs4_tsd_key;
#endif
102 
/*
 * NOTE(review): going by the names, these two belong to the CPR
 * (suspend/resume) callback declared below -- confirm against its
 * definition later in the file.
 */
static time_t	nfs4_client_resumed = 0;
static	callb_id_t cid = 0;

/* Forward declarations for file-local routines. */
static int	nfs4renew(nfs4_server_t *);
static void	nfs4_attrcache_va(vnode_t *, nfs4_ga_res_t *, int);
static void	nfs4_pgflush_thread(pgflush_t *);

static boolean_t nfs4_client_cpr_callb(void *, int);

/* Per-zone bookkeeping for NFS v4 mounts. */
struct mi4_globals {
	kmutex_t	mig_lock;  /* lock protecting mig_list */
	list_t		mig_list;  /* list of NFS v4 mounts in zone */
	boolean_t	mig_destructor_called;
};

/*
 * Zone key.
 * NOTE(review): presumably used to look up the zone's struct mi4_globals.
 */
static zone_key_t mi4_list_key;
119 
120 /*
121  * Attributes caching:
122  *
123  * Attributes are cached in the rnode in struct vattr form.
124  * There is a time associated with the cached attributes (r_time_attr_inval)
125  * which tells whether the attributes are valid. The time is initialized
126  * to the difference between current time and the modify time of the vnode
127  * when new attributes are cached. This allows the attributes for
128  * files that have changed recently to be timed out sooner than for files
129  * that have not changed for a long time. There are minimum and maximum
130  * timeout values that can be set per mount point.
131  */
132 
133 /*
134  * If a cache purge is in progress, wait for it to finish.
135  *
136  * The current thread must not be in the middle of an
137  * nfs4_start_op/nfs4_end_op region.  Otherwise, there could be a deadlock
138  * between this thread, a recovery thread, and the page flush thread.
139  */
140 int
141 nfs4_waitfor_purge_complete(vnode_t *vp)
142 {
143 	rnode4_t *rp;
144 	k_sigset_t smask;
145 
146 	rp = VTOR4(vp);
147 	if ((rp->r_serial != NULL && rp->r_serial != curthread) ||
148 	    ((rp->r_flags & R4PGFLUSH) && rp->r_pgflush != curthread)) {
149 		mutex_enter(&rp->r_statelock);
150 		sigintr(&smask, VTOMI4(vp)->mi_flags & MI4_INT);
151 		while ((rp->r_serial != NULL && rp->r_serial != curthread) ||
152 		    ((rp->r_flags & R4PGFLUSH) &&
153 		    rp->r_pgflush != curthread)) {
154 			if (!cv_wait_sig(&rp->r_cv, &rp->r_statelock)) {
155 				sigunintr(&smask);
156 				mutex_exit(&rp->r_statelock);
157 				return (EINTR);
158 			}
159 		}
160 		sigunintr(&smask);
161 		mutex_exit(&rp->r_statelock);
162 	}
163 	return (0);
164 }
165 
166 /*
167  * Validate caches by checking cached attributes. If they have timed out,
168  * then get new attributes from the server.  As a side effect, cache
169  * invalidation is done if the attributes have changed.
170  *
171  * If the attributes have not timed out and if there is a cache
172  * invalidation being done by some other thread, then wait until that
173  * thread has completed the cache invalidation.
174  */
175 int
176 nfs4_validate_caches(vnode_t *vp, cred_t *cr)
177 {
178 	int error;
179 	nfs4_ga_res_t gar;
180 
181 	if (ATTRCACHE4_VALID(vp)) {
182 		error = nfs4_waitfor_purge_complete(vp);
183 		if (error)
184 			return (error);
185 		return (0);
186 	}
187 
188 	return (nfs4_getattr_otw(vp, &gar, cr, 0));
189 }
190 
191 /*
192  * Fill in attribute from the cache.
193  * If valid, then return 0 to indicate that no error occurred,
194  * otherwise return 1 to indicate that an error occurred.
195  */
196 static int
197 nfs4_getattr_cache(vnode_t *vp, struct vattr *vap)
198 {
199 	rnode4_t *rp;
200 
201 	rp = VTOR4(vp);
202 	mutex_enter(&rp->r_statelock);
203 	mutex_enter(&rp->r_statev4_lock);
204 	if (ATTRCACHE4_VALID(vp)) {
205 		mutex_exit(&rp->r_statev4_lock);
206 		/*
207 		 * Cached attributes are valid
208 		 */
209 		*vap = rp->r_attr;
210 		mutex_exit(&rp->r_statelock);
211 		return (0);
212 	}
213 	mutex_exit(&rp->r_statev4_lock);
214 	mutex_exit(&rp->r_statelock);
215 	return (1);
216 }
217 
218 
219 /*
220  * If returned error is ESTALE flush all caches.  The nfs4_purge_caches()
221  * call is synchronous because all the pages were invalidated by the
222  * nfs4_invalidate_pages() call.
223  */
224 void
225 nfs4_purge_stale_fh(int errno, vnode_t *vp, cred_t *cr)
226 {
227 	struct rnode4 *rp = VTOR4(vp);
228 
229 	/* Ensure that the ..._end_op() call has been done */
230 	ASSERT(tsd_get(nfs4_tsd_key) == NULL);
231 
232 	if (errno != ESTALE)
233 		return;
234 
235 	mutex_enter(&rp->r_statelock);
236 	rp->r_flags |= R4STALE;
237 	if (!rp->r_error)
238 		rp->r_error = errno;
239 	mutex_exit(&rp->r_statelock);
240 	if (nfs4_has_pages(vp))
241 		nfs4_invalidate_pages(vp, (u_offset_t)0, cr);
242 	nfs4_purge_caches(vp, NFS4_PURGE_DNLC, cr, FALSE);
243 }
244 
245 /*
246  * Purge all of the various NFS `data' caches.  If "asyncpg" is TRUE, the
247  * page purge is done asynchronously.
248  */
249 void
250 nfs4_purge_caches(vnode_t *vp, int purge_dnlc, cred_t *cr, int asyncpg)
251 {
252 	rnode4_t *rp;
253 	char *contents;
254 	vnode_t *xattr;
255 	int size;
256 	int pgflush;			/* are we the page flush thread? */
257 
258 	/*
259 	 * Purge the DNLC for any entries which refer to this file.
260 	 */
261 	if (vp->v_count > 1 &&
262 	    (vp->v_type == VDIR || purge_dnlc == NFS4_PURGE_DNLC))
263 		dnlc_purge_vp(vp);
264 
265 	/*
266 	 * Clear any readdir state bits and purge the readlink response cache.
267 	 */
268 	rp = VTOR4(vp);
269 	mutex_enter(&rp->r_statelock);
270 	rp->r_flags &= ~R4LOOKUP;
271 	contents = rp->r_symlink.contents;
272 	size = rp->r_symlink.size;
273 	rp->r_symlink.contents = NULL;
274 
275 	xattr = rp->r_xattr_dir;
276 	rp->r_xattr_dir = NULL;
277 
278 	/*
279 	 * Purge pathconf cache too.
280 	 */
281 	rp->r_pathconf.pc4_xattr_valid = 0;
282 	rp->r_pathconf.pc4_cache_valid = 0;
283 
284 	pgflush = (curthread == rp->r_pgflush);
285 	mutex_exit(&rp->r_statelock);
286 
287 	if (contents != NULL) {
288 
289 		kmem_free((void *)contents, size);
290 	}
291 
292 	if (xattr != NULL)
293 		VN_RELE(xattr);
294 
295 	/*
296 	 * Flush the page cache.  If the current thread is the page flush
297 	 * thread, don't initiate a new page flush.  There's no need for
298 	 * it, and doing it correctly is hard.
299 	 */
300 	if (nfs4_has_pages(vp) && !pgflush) {
301 		if (!asyncpg) {
302 			(void) nfs4_waitfor_purge_complete(vp);
303 			nfs4_flush_pages(vp, cr);
304 		} else {
305 			pgflush_t *args;
306 
307 			/*
308 			 * We don't hold r_statelock while creating the
309 			 * thread, in case the call blocks.  So we use a
310 			 * flag to indicate that a page flush thread is
311 			 * active.
312 			 */
313 			mutex_enter(&rp->r_statelock);
314 			if (rp->r_flags & R4PGFLUSH) {
315 				mutex_exit(&rp->r_statelock);
316 			} else {
317 				rp->r_flags |= R4PGFLUSH;
318 				mutex_exit(&rp->r_statelock);
319 
320 				args = kmem_alloc(sizeof (pgflush_t),
321 				    KM_SLEEP);
322 				args->vp = vp;
323 				VN_HOLD(args->vp);
324 				args->cr = cr;
325 				crhold(args->cr);
326 				(void) zthread_create(NULL, 0,
327 				    nfs4_pgflush_thread, args, 0,
328 				    minclsyspri);
329 			}
330 		}
331 	}
332 
333 	/*
334 	 * Flush the readdir response cache.
335 	 */
336 	nfs4_purge_rddir_cache(vp);
337 }
338 
339 /*
340  * Invalidate all pages for the given file, after writing back the dirty
341  * ones.
342  */
343 
344 void
345 nfs4_flush_pages(vnode_t *vp, cred_t *cr)
346 {
347 	int error;
348 	rnode4_t *rp = VTOR4(vp);
349 
350 	error = VOP_PUTPAGE(vp, (u_offset_t)0, 0, B_INVAL, cr, NULL);
351 	if (error == ENOSPC || error == EDQUOT) {
352 		mutex_enter(&rp->r_statelock);
353 		if (!rp->r_error)
354 			rp->r_error = error;
355 		mutex_exit(&rp->r_statelock);
356 	}
357 }
358 
359 /*
360  * Page flush thread.
361  */
362 
363 static void
364 nfs4_pgflush_thread(pgflush_t *args)
365 {
366 	rnode4_t *rp = VTOR4(args->vp);
367 
368 	/* remember which thread we are, so we don't deadlock ourselves */
369 	mutex_enter(&rp->r_statelock);
370 	ASSERT(rp->r_pgflush == NULL);
371 	rp->r_pgflush = curthread;
372 	mutex_exit(&rp->r_statelock);
373 
374 	nfs4_flush_pages(args->vp, args->cr);
375 
376 	mutex_enter(&rp->r_statelock);
377 	rp->r_pgflush = NULL;
378 	rp->r_flags &= ~R4PGFLUSH;
379 	cv_broadcast(&rp->r_cv);
380 	mutex_exit(&rp->r_statelock);
381 
382 	VN_RELE(args->vp);
383 	crfree(args->cr);
384 	kmem_free(args, sizeof (pgflush_t));
385 	zthread_exit();
386 }
387 
388 /*
389  * Purge the readdir cache of all entries which are not currently
390  * being filled.
391  */
392 void
393 nfs4_purge_rddir_cache(vnode_t *vp)
394 {
395 	rnode4_t *rp;
396 
397 	rp = VTOR4(vp);
398 
399 	mutex_enter(&rp->r_statelock);
400 	rp->r_direof = NULL;
401 	rp->r_flags &= ~R4LOOKUP;
402 	rp->r_flags |= R4READDIRWATTR;
403 	rddir4_cache_purge(rp);
404 	mutex_exit(&rp->r_statelock);
405 }
406 
407 /*
408  * Set attributes cache for given vnode using virtual attributes.  There is
409  * no cache validation, but if the attributes are deemed to be stale, they
410  * are ignored.  This corresponds to nfs3_attrcache().
411  *
412  * Set the timeout value on the attribute cache and fill it
413  * with the passed in attributes.
414  */
415 void
416 nfs4_attrcache_noinval(vnode_t *vp, nfs4_ga_res_t *garp, hrtime_t t)
417 {
418 	rnode4_t *rp = VTOR4(vp);
419 
420 	mutex_enter(&rp->r_statelock);
421 	if (rp->r_time_attr_saved <= t)
422 		nfs4_attrcache_va(vp, garp, FALSE);
423 	mutex_exit(&rp->r_statelock);
424 }
425 
/*
 * Use the passed in virtual attributes to check to see whether the
 * data and metadata caches are valid, cache the new attributes, and
 * then do the cache invalidation if required.
 *
 * The cache validation and caching of the new attributes is done
 * atomically via the use of the mutex, r_statelock.  If required,
 * the cache invalidation is done atomically w.r.t. the cache
 * validation and caching of the attributes via the pseudo lock,
 * r_serial.
 *
 * This routine is used to do cache validation and attributes caching
 * for operations with a single set of post operation attributes.
 *
 * Arguments:
 *	vp	- file the attributes belong to
 *	garp	- the attributes to validate against and to cache
 *	t	- time associated with the attributes; it is compared with
 *		  r_time_attr_saved to detect attributes that are older
 *		  than what is already cached
 *	cr	- credentials passed on to nfs4_purge_caches()
 *	async	- passed on as the "asyncpg" argument of
 *		  nfs4_purge_caches(); overridden to 1 when the caller is
 *		  the recovery thread
 *	cinfo	- change info of a directory modifying operation, or NULL
 *
 * The routine returns without caching anything when the wait for
 * r_serial is interrupted.  When it bails out because it is the
 * recovery thread or because a page flush thread is active, or when
 * stale attributes disagree with the cache, the attribute cache is
 * purged instead.
 */

void
nfs4_attr_cache(vnode_t *vp, nfs4_ga_res_t *garp,
    hrtime_t t, cred_t *cr, int async,
    change_info4 *cinfo)
{
	rnode4_t *rp;
	int mtime_changed = 0;
	int ctime_changed = 0;
	vsecattr_t *vsp;
	int was_serial, set_time_cache_inval, recov;
	vattr_t *vap = &garp->n4g_va;
	mntinfo4_t *mi = VTOMI4(vp);
	len_t preattr_rsize;
	boolean_t writemodify_set = B_FALSE;
	boolean_t cachepurge_set = B_FALSE;

	ASSERT(mi->mi_vfsp->vfs_dev == garp->n4g_va.va_fsid);

	/* Is curthread the recovery thread? */
	mutex_enter(&mi->mi_lock);
	recov = (VTOMI4(vp)->mi_recovthread == curthread);
	mutex_exit(&mi->mi_lock);

	rp = VTOR4(vp);
	mutex_enter(&rp->r_statelock);
	/* was_serial: we already own r_serial, i.e. this is a nested call */
	was_serial = (rp->r_serial == curthread);
	if (rp->r_serial && !was_serial) {
		klwp_t *lwp = ttolwp(curthread);

		/*
		 * If we're the recovery thread, then purge current attrs
		 * and bail out to avoid potential deadlock between another
		 * thread caching attrs (r_serial thread), recov thread,
		 * and an async writer thread.
		 */
		if (recov) {
			PURGE_ATTRCACHE4_LOCKED(rp);
			mutex_exit(&rp->r_statelock);
			return;
		}

		/*
		 * Wait for the current r_serial owner to finish.  The
		 * lwp_nostop count is raised for the duration of the wait
		 * and restored on every way out of the loop.
		 */
		if (lwp != NULL)
			lwp->lwp_nostop++;
		while (rp->r_serial != NULL) {
			if (!cv_wait_sig(&rp->r_cv, &rp->r_statelock)) {
				/* interrupted: give up without caching */
				mutex_exit(&rp->r_statelock);
				if (lwp != NULL)
					lwp->lwp_nostop--;
				return;
			}
		}
		if (lwp != NULL)
			lwp->lwp_nostop--;
	}

	/*
	 * If there is a page flush thread, the current thread needs to
	 * bail out, to prevent a possible deadlock between the current
	 * thread (which might be in a start_op/end_op region), the
	 * recovery thread, and the page flush thread.  Expire the
	 * attribute cache, so that any attributes the current thread was
	 * going to set are not lost.
	 */
	if ((rp->r_flags & R4PGFLUSH) && rp->r_pgflush != curthread) {
		PURGE_ATTRCACHE4_LOCKED(rp);
		mutex_exit(&rp->r_statelock);
		return;
	}

	if (rp->r_time_attr_saved > t) {
		/*
		 * Attributes have been cached since these attributes were
		 * probably made. If there is an inconsistency in what is
		 * cached, mark them invalid. If not, don't act on them.
		 */
		if (!CACHE4_VALID(rp, vap->va_mtime, vap->va_size))
			PURGE_ATTRCACHE4_LOCKED(rp);
		mutex_exit(&rp->r_statelock);
		return;
	}
	set_time_cache_inval = 0;
	if (cinfo) {
		/*
		 * Only directory modifying callers pass non-NULL cinfo.
		 */
		ASSERT(vp->v_type == VDIR);
		/*
		 * If the cache timeout either doesn't exist or hasn't expired,
		 * and dir didn't change on server before dirmod op
		 * and dir didn't change after dirmod op but before getattr
		 * then there's a chance that the client's cached data for
		 * this object is current (not stale).  No immediate cache
		 * flush is required.
		 *
		 */
		if ((! rp->r_time_cache_inval || t < rp->r_time_cache_inval) &&
		    cinfo->before == rp->r_change &&
		    (garp->n4g_change_valid &&
		    cinfo->after == garp->n4g_change)) {

			/*
			 * If atomic isn't set, then the before/after info
			 * cannot be blindly trusted.  For this case, we tell
			 * nfs4_attrcache_va to cache the attrs but also
			 * establish an absolute maximum cache timeout.  When
			 * the timeout is reached, caches will be flushed.
			 */
			if (! cinfo->atomic)
				set_time_cache_inval = 1;
		} else {

			/*
			 * We're not sure exactly what changed, but we know
			 * what to do.  flush all caches for dir.  remove the
			 * attr timeout.
			 *
			 * a) timeout expired.  flush all caches.
			 * b) r_change != cinfo.before.  flush all caches.
			 * c) r_change == cinfo.before, but cinfo.after !=
			 *    post-op getattr(change).  flush all caches.
			 * d) post-op getattr(change) not provided by server.
			 *    flush all caches.
			 */
			mtime_changed = 1;
			ctime_changed = 1;
			rp->r_time_cache_inval = 0;
		}
	} else {
		/*
		 * Write thread after writing data to file on remote server,
		 * will always set R4WRITEMODIFIED to indicate that file on
		 * remote server was modified with a WRITE operation and would
		 * have marked attribute cache as timed out. If R4WRITEMODIFIED
		 * is set, then do not check for mtime and ctime change.
		 */
		if (!(rp->r_flags & R4WRITEMODIFIED)) {
			if (!CACHE4_VALID(rp, vap->va_mtime, vap->va_size))
				mtime_changed = 1;

			if (rp->r_attr.va_ctime.tv_sec !=
			    vap->va_ctime.tv_sec ||
			    rp->r_attr.va_ctime.tv_nsec !=
			    vap->va_ctime.tv_nsec)
				ctime_changed = 1;

			/*
			 * If the change attribute was not provided by server
			 * or it differs, then flush all caches.
			 */
			if (!garp->n4g_change_valid ||
			    rp->r_change != garp->n4g_change) {
				mtime_changed = 1;
				ctime_changed = 1;
			}
		} else {
			writemodify_set = B_TRUE;
		}
	}

	/* remember the size so a change made by nfs4_attrcache_va shows */
	preattr_rsize = rp->r_size;

	nfs4_attrcache_va(vp, garp, set_time_cache_inval);

	/*
	 * If we have updated filesize in nfs4_attrcache_va, as soon as we
	 * drop statelock we will be in transition of purging all
	 * our caches and updating them. It is possible for another
	 * thread to pick this new file size and read in zeroed data.
	 * stall other threads till cache purge is complete.
	 */
	if ((!cinfo) && (rp->r_size != preattr_rsize)) {
		/*
		 * If R4WRITEMODIFIED was set and we have updated the file
		 * size, Server's returned file size need not necessarily
		 * be because of this Client's WRITE. We need to purge
		 * all caches.
		 */
		if (writemodify_set)
			mtime_changed = 1;

		/*
		 * Only the thread that sets R4INCACHEPURGE (tracked by
		 * cachepurge_set) clears it again further down.
		 */
		if (mtime_changed && !(rp->r_flags & R4INCACHEPURGE)) {
			rp->r_flags |= R4INCACHEPURGE;
			cachepurge_set = B_TRUE;
		}
	}

	/* Nothing changed: the attributes are cached and we are done. */
	if (!mtime_changed && !ctime_changed) {
		mutex_exit(&rp->r_statelock);
		return;
	}

	/*
	 * Take the r_serial pseudo lock so that the invalidation below,
	 * which runs without r_statelock, is not interleaved with another
	 * thread's validation.
	 */
	rp->r_serial = curthread;

	mutex_exit(&rp->r_statelock);

	/*
	 * If we're the recov thread, then force async nfs4_purge_caches
	 * to avoid potential deadlock.
	 */
	if (mtime_changed)
		nfs4_purge_caches(vp, NFS4_NOPURGE_DNLC, cr, recov ? 1 : async);

	if ((rp->r_flags & R4INCACHEPURGE) && cachepurge_set) {
		mutex_enter(&rp->r_statelock);
		rp->r_flags &= ~R4INCACHEPURGE;
		cv_broadcast(&rp->r_cv);
		mutex_exit(&rp->r_statelock);
		cachepurge_set = B_FALSE;
	}

	/* A ctime change invalidates the cached access info and ACL. */
	if (ctime_changed) {
		(void) nfs4_access_purge_rp(rp);
		if (rp->r_secattr != NULL) {
			/* detach under the lock, free after dropping it */
			mutex_enter(&rp->r_statelock);
			vsp = rp->r_secattr;
			rp->r_secattr = NULL;
			mutex_exit(&rp->r_statelock);
			if (vsp != NULL)
				nfs4_acl_free_cache(vsp);
		}
	}

	/*
	 * Release r_serial, unless we already owned it on entry, in which
	 * case it is left for the outer invocation to release.
	 */
	if (!was_serial) {
		mutex_enter(&rp->r_statelock);
		rp->r_serial = NULL;
		cv_broadcast(&rp->r_cv);
		mutex_exit(&rp->r_statelock);
	}
}
670 
671 /*
672  * Set attributes cache for given vnode using virtual attributes.
673  *
674  * Set the timeout value on the attribute cache and fill it
675  * with the passed in attributes.
676  *
677  * The caller must be holding r_statelock.
678  */
679 static void
680 nfs4_attrcache_va(vnode_t *vp, nfs4_ga_res_t *garp, int set_cache_timeout)
681 {
682 	rnode4_t *rp;
683 	mntinfo4_t *mi;
684 	hrtime_t delta;
685 	hrtime_t now;
686 	vattr_t *vap = &garp->n4g_va;
687 
688 	rp = VTOR4(vp);
689 
690 	ASSERT(MUTEX_HELD(&rp->r_statelock));
691 	ASSERT(vap->va_mask == AT_ALL);
692 
693 	/* Switch to master before checking v_flag */
694 	if (IS_SHADOW(vp, rp))
695 		vp = RTOV4(rp);
696 
697 	now = gethrtime();
698 
699 	mi = VTOMI4(vp);
700 
701 	/*
702 	 * Only establish a new cache timeout (if requested).  Never
703 	 * extend a timeout.  Never clear a timeout.  Clearing a timeout
704 	 * is done by nfs4_update_dircaches (ancestor in our call chain)
705 	 */
706 	if (set_cache_timeout && ! rp->r_time_cache_inval)
707 		rp->r_time_cache_inval = now + mi->mi_acdirmax;
708 
709 	/*
710 	 * Delta is the number of nanoseconds that we will
711 	 * cache the attributes of the file.  It is based on
712 	 * the number of nanoseconds since the last time that
713 	 * we detected a change.  The assumption is that files
714 	 * that changed recently are likely to change again.
715 	 * There is a minimum and a maximum for regular files
716 	 * and for directories which is enforced though.
717 	 *
718 	 * Using the time since last change was detected
719 	 * eliminates direct comparison or calculation
720 	 * using mixed client and server times.  NFS does
721 	 * not make any assumptions regarding the client
722 	 * and server clocks being synchronized.
723 	 */
724 	if (vap->va_mtime.tv_sec != rp->r_attr.va_mtime.tv_sec ||
725 	    vap->va_mtime.tv_nsec != rp->r_attr.va_mtime.tv_nsec ||
726 	    vap->va_size != rp->r_attr.va_size) {
727 		rp->r_time_attr_saved = now;
728 	}
729 
730 	if ((mi->mi_flags & MI4_NOAC) || (vp->v_flag & VNOCACHE))
731 		delta = 0;
732 	else {
733 		delta = now - rp->r_time_attr_saved;
734 		if (vp->v_type == VDIR) {
735 			if (delta < mi->mi_acdirmin)
736 				delta = mi->mi_acdirmin;
737 			else if (delta > mi->mi_acdirmax)
738 				delta = mi->mi_acdirmax;
739 		} else {
740 			if (delta < mi->mi_acregmin)
741 				delta = mi->mi_acregmin;
742 			else if (delta > mi->mi_acregmax)
743 				delta = mi->mi_acregmax;
744 		}
745 	}
746 	rp->r_time_attr_inval = now + delta;
747 
748 	rp->r_attr = *vap;
749 	if (garp->n4g_change_valid)
750 		rp->r_change = garp->n4g_change;
751 
752 	/*
753 	 * The attributes that were returned may be valid and can
754 	 * be used, but they may not be allowed to be cached.
755 	 * Reset the timers to cause immediate invalidation and
756 	 * clear r_change so no VERIFY operations will suceed
757 	 */
758 	if (garp->n4g_attrwhy == NFS4_GETATTR_NOCACHE_OK) {
759 		rp->r_time_attr_inval = now;
760 		rp->r_time_attr_saved = now;
761 		rp->r_change = 0;
762 	}
763 
764 	/*
765 	 * If mounted_on_fileid returned AND the object is a stub,
766 	 * then set object's va_nodeid to the mounted over fid
767 	 * returned by server.
768 	 *
769 	 * If mounted_on_fileid not provided/supported, then
770 	 * just set it to 0 for now.  Eventually it would be
771 	 * better to set it to a hashed version of FH.  This
772 	 * would probably be good enough to provide a unique
773 	 * fid/d_ino within a dir.
774 	 *
775 	 * We don't need to carry mounted_on_fileid in the
776 	 * rnode as long as the client never requests fileid
777 	 * without also requesting mounted_on_fileid.  For
778 	 * now, it stays.
779 	 */
780 	if (garp->n4g_mon_fid_valid) {
781 		rp->r_mntd_fid = garp->n4g_mon_fid;
782 
783 		if (RP_ISSTUB(rp))
784 			rp->r_attr.va_nodeid = rp->r_mntd_fid;
785 	}
786 
787 	/*
788 	 * Check to see if there are valid pathconf bits to
789 	 * cache in the rnode.
790 	 */
791 	if (garp->n4g_ext_res) {
792 		if (garp->n4g_ext_res->n4g_pc4.pc4_cache_valid) {
793 			rp->r_pathconf = garp->n4g_ext_res->n4g_pc4;
794 		} else {
795 			if (garp->n4g_ext_res->n4g_pc4.pc4_xattr_valid) {
796 				rp->r_pathconf.pc4_xattr_valid = TRUE;
797 				rp->r_pathconf.pc4_xattr_exists =
798 				    garp->n4g_ext_res->n4g_pc4.pc4_xattr_exists;
799 			}
800 		}
801 	}
802 	/*
803 	 * Update the size of the file if there is no cached data or if
804 	 * the cached data is clean and there is no data being written
805 	 * out.
806 	 */
807 	if (rp->r_size != vap->va_size &&
808 	    (!vn_has_cached_data(vp) ||
809 	    (!(rp->r_flags & R4DIRTY) && rp->r_count == 0))) {
810 		rp->r_size = vap->va_size;
811 	}
812 	nfs_setswaplike(vp, vap);
813 	rp->r_flags &= ~R4WRITEMODIFIED;
814 }
815 
816 /*
817  * Get attributes over-the-wire and update attributes cache
818  * if no error occurred in the over-the-wire operation.
819  * Return 0 if successful, otherwise error.
820  */
821 int
822 nfs4_getattr_otw(vnode_t *vp, nfs4_ga_res_t *garp, cred_t *cr, int get_acl)
823 {
824 	mntinfo4_t *mi = VTOMI4(vp);
825 	hrtime_t t;
826 	nfs4_recov_state_t recov_state;
827 	nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };
828 
829 	recov_state.rs_flags = 0;
830 	recov_state.rs_num_retry_despite_err = 0;
831 
832 	/* Save the original mount point security flavor */
833 	(void) save_mnt_secinfo(mi->mi_curr_serv);
834 
835 recov_retry:
836 
837 	if ((e.error = nfs4_start_fop(mi, vp, NULL, OH_GETATTR,
838 	    &recov_state, NULL))) {
839 		(void) check_mnt_secinfo(mi->mi_curr_serv, vp);
840 		return (e.error);
841 	}
842 
843 	t = gethrtime();
844 
845 	nfs4_getattr_otw_norecovery(vp, garp, &e, cr, get_acl);
846 
847 	if (nfs4_needs_recovery(&e, FALSE, vp->v_vfsp)) {
848 		if (nfs4_start_recovery(&e, VTOMI4(vp), vp, NULL, NULL,
849 		    NULL, OP_GETATTR, NULL, NULL, NULL) == FALSE)  {
850 			nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_GETATTR,
851 			    &recov_state, 1);
852 			goto recov_retry;
853 		}
854 	}
855 
856 	nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_GETATTR, &recov_state, 0);
857 
858 	if (!e.error) {
859 		if (e.stat == NFS4_OK) {
860 			nfs4_attr_cache(vp, garp, t, cr, FALSE, NULL);
861 		} else {
862 			e.error = geterrno4(e.stat);
863 
864 			nfs4_purge_stale_fh(e.error, vp, cr);
865 		}
866 	}
867 
868 	/*
869 	 * If getattr a node that is a stub for a crossed
870 	 * mount point, keep the original secinfo flavor for
871 	 * the current file system, not the crossed one.
872 	 */
873 	(void) check_mnt_secinfo(mi->mi_curr_serv, vp);
874 
875 	return (e.error);
876 }
877 
878 /*
879  * Generate a compound to get attributes over-the-wire.
880  */
881 void
882 nfs4_getattr_otw_norecovery(vnode_t *vp, nfs4_ga_res_t *garp,
883     nfs4_error_t *ep, cred_t *cr, int get_acl)
884 {
885 	COMPOUND4args_clnt args;
886 	COMPOUND4res_clnt res;
887 	int doqueue;
888 	rnode4_t *rp = VTOR4(vp);
889 	nfs_argop4 argop[2];
890 
891 	args.ctag = TAG_GETATTR;
892 
893 	args.array_len = 2;
894 	args.array = argop;
895 
896 	/* putfh */
897 	argop[0].argop = OP_CPUTFH;
898 	argop[0].nfs_argop4_u.opcputfh.sfh = rp->r_fh;
899 
900 	/* getattr */
901 	/*
902 	 * Unlike nfs version 2 and 3, where getattr returns all the
903 	 * attributes, nfs version 4 returns only the ones explicitly
904 	 * asked for. This creates problems, as some system functions
905 	 * (e.g. cache check) require certain attributes and if the
906 	 * cached node lacks some attributes such as uid/gid, it can
907 	 * affect system utilities (e.g. "ls") that rely on the information
908 	 * to be there. This can lead to anything from system crashes to
909 	 * corrupted information processed by user apps.
910 	 * So to ensure that all bases are covered, request at least
911 	 * the AT_ALL attribute mask.
912 	 */
913 	argop[1].argop = OP_GETATTR;
914 	argop[1].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK;
915 	if (get_acl)
916 		argop[1].nfs_argop4_u.opgetattr.attr_request |= FATTR4_ACL_MASK;
917 	argop[1].nfs_argop4_u.opgetattr.mi = VTOMI4(vp);
918 
919 	doqueue = 1;
920 
921 	rfs4call(VTOMI4(vp), &args, &res, cr, &doqueue, 0, ep);
922 
923 	if (ep->error)
924 		return;
925 
926 	if (res.status != NFS4_OK) {
927 		xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
928 		return;
929 	}
930 
931 	*garp = res.array[1].nfs_resop4_u.opgetattr.ga_res;
932 
933 	xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
934 }
935 
936 /*
937  * Return either cached or remote attributes. If get remote attr
938  * use them to check and invalidate caches, then cache the new attributes.
939  */
940 int
941 nfs4getattr(vnode_t *vp, vattr_t *vap, cred_t *cr)
942 {
943 	int error;
944 	rnode4_t *rp;
945 	nfs4_ga_res_t gar;
946 
947 	ASSERT(nfs4_consistent_type(vp));
948 
949 	/*
950 	 * If we've got cached attributes, we're done, otherwise go
951 	 * to the server to get attributes, which will update the cache
952 	 * in the process. Either way, use the cached attributes for
953 	 * the caller's vattr_t.
954 	 *
955 	 * Note that we ignore the gar set by the OTW call: the attr caching
956 	 * code may make adjustments when storing to the rnode, and we want
957 	 * to see those changes here.
958 	 */
959 	rp = VTOR4(vp);
960 	error = 0;
961 	mutex_enter(&rp->r_statelock);
962 	if (!ATTRCACHE4_VALID(vp)) {
963 		mutex_exit(&rp->r_statelock);
964 		error = nfs4_getattr_otw(vp, &gar, cr, 0);
965 		mutex_enter(&rp->r_statelock);
966 	}
967 
968 	if (!error)
969 		*vap = rp->r_attr;
970 
971 	/* Return the client's view of file size */
972 	vap->va_size = rp->r_size;
973 
974 	mutex_exit(&rp->r_statelock);
975 
976 	ASSERT(nfs4_consistent_type(vp));
977 
978 	return (error);
979 }
980 
981 int
982 nfs4_attr_otw(vnode_t *vp, nfs4_tag_type_t tag_type,
983     nfs4_ga_res_t *garp, bitmap4 reqbitmap, cred_t *cr)
984 {
985 	COMPOUND4args_clnt args;
986 	COMPOUND4res_clnt res;
987 	int doqueue;
988 	nfs_argop4 argop[2];
989 	mntinfo4_t *mi = VTOMI4(vp);
990 	bool_t needrecov = FALSE;
991 	nfs4_recov_state_t recov_state;
992 	nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };
993 	nfs4_ga_ext_res_t *gerp;
994 
995 	recov_state.rs_flags = 0;
996 	recov_state.rs_num_retry_despite_err = 0;
997 
998 recov_retry:
999 	args.ctag = tag_type;
1000 
1001 	args.array_len = 2;
1002 	args.array = argop;
1003 
1004 	e.error = nfs4_start_fop(mi, vp, NULL, OH_GETATTR, &recov_state, NULL);
1005 	if (e.error)
1006 		return (e.error);
1007 
1008 	/* putfh */
1009 	argop[0].argop = OP_CPUTFH;
1010 	argop[0].nfs_argop4_u.opcputfh.sfh = VTOR4(vp)->r_fh;
1011 
1012 	/* getattr */
1013 	argop[1].argop = OP_GETATTR;
1014 	argop[1].nfs_argop4_u.opgetattr.attr_request = reqbitmap;
1015 	argop[1].nfs_argop4_u.opgetattr.mi = mi;
1016 
1017 	doqueue = 1;
1018 
1019 	NFS4_DEBUG(nfs4_client_call_debug, (CE_NOTE,
1020 	    "nfs4_attr_otw: %s call, rp %s", needrecov ? "recov" : "first",
1021 	    rnode4info(VTOR4(vp))));
1022 
1023 	rfs4call(mi, &args, &res, cr, &doqueue, 0, &e);
1024 
1025 	needrecov = nfs4_needs_recovery(&e, FALSE, vp->v_vfsp);
1026 	if (!needrecov && e.error) {
1027 		nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_GETATTR, &recov_state,
1028 		    needrecov);
1029 		return (e.error);
1030 	}
1031 
1032 	if (needrecov) {
1033 		bool_t abort;
1034 
1035 		NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
1036 		    "nfs4_attr_otw: initiating recovery\n"));
1037 
1038 		abort = nfs4_start_recovery(&e, VTOMI4(vp), vp, NULL, NULL,
1039 		    NULL, OP_GETATTR, NULL, NULL, NULL);
1040 		nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_GETATTR, &recov_state,
1041 		    needrecov);
1042 		if (!e.error) {
1043 			xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
1044 			e.error = geterrno4(res.status);
1045 		}
1046 		if (abort == FALSE)
1047 			goto recov_retry;
1048 		return (e.error);
1049 	}
1050 
1051 	if (res.status) {
1052 		e.error = geterrno4(res.status);
1053 	} else {
1054 		gerp = garp->n4g_ext_res;
1055 		bcopy(&res.array[1].nfs_resop4_u.opgetattr.ga_res,
1056 		    garp, sizeof (nfs4_ga_res_t));
1057 		garp->n4g_ext_res = gerp;
1058 		if (garp->n4g_ext_res &&
1059 		    res.array[1].nfs_resop4_u.opgetattr.ga_res.n4g_ext_res)
1060 			bcopy(res.array[1].nfs_resop4_u.opgetattr.
1061 			    ga_res.n4g_ext_res,
1062 			    garp->n4g_ext_res, sizeof (nfs4_ga_ext_res_t));
1063 	}
1064 	xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
1065 	nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_GETATTR, &recov_state,
1066 	    needrecov);
1067 	return (e.error);
1068 }
1069 
1070 /*
1071  * Asynchronous I/O parameters.  nfs_async_threads is the high-water mark
1072  * for the demand-based allocation of async threads per-mount.  The
1073  * nfs_async_timeout is the amount of time a thread will live after it
1074  * becomes idle, unless new I/O requests are received before the thread
1075  * dies.  See nfs4_async_putpage and nfs4_async_start.
1076  */
1077 
1078 static void	nfs4_async_start(struct vfs *);
1079 static void	nfs4_async_pgops_start(struct vfs *);
1080 static void	nfs4_async_common_start(struct vfs *, int);
1081 
1082 static void
1083 free_async_args4(struct nfs4_async_reqs *args)
1084 {
1085 	rnode4_t *rp;
1086 
1087 	if (args->a_io != NFS4_INACTIVE) {
1088 		rp = VTOR4(args->a_vp);
1089 		mutex_enter(&rp->r_statelock);
1090 		rp->r_count--;
1091 		if (args->a_io == NFS4_PUTAPAGE ||
1092 		    args->a_io == NFS4_PAGEIO)
1093 			rp->r_awcount--;
1094 		cv_broadcast(&rp->r_cv);
1095 		mutex_exit(&rp->r_statelock);
1096 		VN_RELE(args->a_vp);
1097 	}
1098 	crfree(args->a_cred);
1099 	kmem_free(args, sizeof (*args));
1100 }
1101 
1102 /*
1103  * Cross-zone thread creation and NFS access is disallowed, yet fsflush() and
1104  * pageout(), running in the global zone, have legitimate reasons to do
1105  * VOP_PUTPAGE(B_ASYNC) on other zones' NFS mounts.  We avoid the problem by
1106  * use of a a per-mount "asynchronous requests manager thread" which is
1107  * signaled by the various asynchronous work routines when there is
1108  * asynchronous work to be done.  It is responsible for creating new
1109  * worker threads if necessary, and notifying existing worker threads
1110  * that there is work to be done.
1111  *
1112  * In other words, it will "take the specifications from the customers and
1113  * give them to the engineers."
1114  *
1115  * Worker threads die off of their own accord if they are no longer
1116  * needed.
1117  *
1118  * This thread is killed when the zone is going away or the filesystem
1119  * is being unmounted.
1120  */
1121 void
1122 nfs4_async_manager(vfs_t *vfsp)
1123 {
1124 	callb_cpr_t cprinfo;
1125 	mntinfo4_t *mi;
1126 	uint_t max_threads;
1127 
1128 	mi = VFTOMI4(vfsp);
1129 
1130 	CALLB_CPR_INIT(&cprinfo, &mi->mi_async_lock, callb_generic_cpr,
1131 	    "nfs4_async_manager");
1132 
1133 	mutex_enter(&mi->mi_async_lock);
1134 	/*
1135 	 * We want to stash the max number of threads that this mount was
1136 	 * allowed so we can use it later when the variable is set to zero as
1137 	 * part of the zone/mount going away.
1138 	 *
1139 	 * We want to be able to create at least one thread to handle
1140 	 * asynchronous inactive calls.
1141 	 */
1142 	max_threads = MAX(mi->mi_max_threads, 1);
1143 	/*
1144 	 * We don't want to wait for mi_max_threads to go to zero, since that
1145 	 * happens as part of a failed unmount, but this thread should only
1146 	 * exit when the mount is really going away.
1147 	 *
1148 	 * Once MI4_ASYNC_MGR_STOP is set, no more async operations will be
1149 	 * attempted: the various _async_*() functions know to do things
1150 	 * inline if mi_max_threads == 0.  Henceforth we just drain out the
1151 	 * outstanding requests.
1152 	 *
1153 	 * Note that we still create zthreads even if we notice the zone is
1154 	 * shutting down (MI4_ASYNC_MGR_STOP is set); this may cause the zone
1155 	 * shutdown sequence to take slightly longer in some cases, but
1156 	 * doesn't violate the protocol, as all threads will exit as soon as
1157 	 * they're done processing the remaining requests.
1158 	 */
1159 	for (;;) {
1160 		while (mi->mi_async_req_count > 0) {
1161 			/*
1162 			 * Paranoia: If the mount started out having
1163 			 * (mi->mi_max_threads == 0), and the value was
1164 			 * later changed (via a debugger or somesuch),
1165 			 * we could be confused since we will think we
1166 			 * can't create any threads, and the calling
1167 			 * code (which looks at the current value of
1168 			 * mi->mi_max_threads, now non-zero) thinks we
1169 			 * can.
1170 			 *
1171 			 * So, because we're paranoid, we create threads
1172 			 * up to the maximum of the original and the
1173 			 * current value. This means that future
1174 			 * (debugger-induced) alterations of
1175 			 * mi->mi_max_threads are ignored for our
1176 			 * purposes, but who told them they could change
1177 			 * random values on a live kernel anyhow?
1178 			 */
1179 			if (mi->mi_threads[NFS4_ASYNC_QUEUE] <
1180 			    MAX(mi->mi_max_threads, max_threads)) {
1181 				mi->mi_threads[NFS4_ASYNC_QUEUE]++;
1182 				mutex_exit(&mi->mi_async_lock);
1183 				MI4_HOLD(mi);
1184 				VFS_HOLD(vfsp);	/* hold for new thread */
1185 				(void) zthread_create(NULL, 0, nfs4_async_start,
1186 				    vfsp, 0, minclsyspri);
1187 				mutex_enter(&mi->mi_async_lock);
1188 			} else if (mi->mi_threads[NFS4_ASYNC_PGOPS_QUEUE] <
1189 			    NUM_ASYNC_PGOPS_THREADS) {
1190 				mi->mi_threads[NFS4_ASYNC_PGOPS_QUEUE]++;
1191 				mutex_exit(&mi->mi_async_lock);
1192 				MI4_HOLD(mi);
1193 				VFS_HOLD(vfsp); /* hold for new thread */
1194 				(void) zthread_create(NULL, 0,
1195 				    nfs4_async_pgops_start, vfsp, 0,
1196 				    minclsyspri);
1197 				mutex_enter(&mi->mi_async_lock);
1198 			}
1199 			NFS4_WAKE_ASYNC_WORKER(mi->mi_async_work_cv);
1200 			ASSERT(mi->mi_async_req_count != 0);
1201 			mi->mi_async_req_count--;
1202 		}
1203 
1204 		mutex_enter(&mi->mi_lock);
1205 		if (mi->mi_flags & MI4_ASYNC_MGR_STOP) {
1206 			mutex_exit(&mi->mi_lock);
1207 			break;
1208 		}
1209 		mutex_exit(&mi->mi_lock);
1210 
1211 		CALLB_CPR_SAFE_BEGIN(&cprinfo);
1212 		cv_wait(&mi->mi_async_reqs_cv, &mi->mi_async_lock);
1213 		CALLB_CPR_SAFE_END(&cprinfo, &mi->mi_async_lock);
1214 	}
1215 
1216 	NFS4_DEBUG(nfs4_client_zone_debug, (CE_NOTE,
1217 	    "nfs4_async_manager exiting for vfs %p\n", (void *)mi->mi_vfsp));
1218 	/*
1219 	 * Let everyone know we're done.
1220 	 */
1221 	mi->mi_manager_thread = NULL;
1222 	/*
1223 	 * Wake up the inactive thread.
1224 	 */
1225 	cv_broadcast(&mi->mi_inact_req_cv);
1226 	/*
1227 	 * Wake up anyone sitting in nfs4_async_manager_stop()
1228 	 */
1229 	cv_broadcast(&mi->mi_async_cv);
1230 	/*
1231 	 * There is no explicit call to mutex_exit(&mi->mi_async_lock)
1232 	 * since CALLB_CPR_EXIT is actually responsible for releasing
1233 	 * 'mi_async_lock'.
1234 	 */
1235 	CALLB_CPR_EXIT(&cprinfo);
1236 	VFS_RELE(vfsp);	/* release thread's hold */
1237 	MI4_RELE(mi);
1238 	zthread_exit();
1239 }
1240 
1241 /*
1242  * Signal (and wait for) the async manager thread to clean up and go away.
1243  */
1244 void
1245 nfs4_async_manager_stop(vfs_t *vfsp)
1246 {
1247 	mntinfo4_t *mi = VFTOMI4(vfsp);
1248 
1249 	mutex_enter(&mi->mi_async_lock);
1250 	mutex_enter(&mi->mi_lock);
1251 	mi->mi_flags |= MI4_ASYNC_MGR_STOP;
1252 	mutex_exit(&mi->mi_lock);
1253 	cv_broadcast(&mi->mi_async_reqs_cv);
1254 	/*
1255 	 * Wait for the async manager thread to die.
1256 	 */
1257 	while (mi->mi_manager_thread != NULL)
1258 		cv_wait(&mi->mi_async_cv, &mi->mi_async_lock);
1259 	mutex_exit(&mi->mi_async_lock);
1260 }
1261 
1262 int
1263 nfs4_async_readahead(vnode_t *vp, u_offset_t blkoff, caddr_t addr,
1264     struct seg *seg, cred_t *cr, void (*readahead)(vnode_t *,
1265     u_offset_t, caddr_t, struct seg *, cred_t *))
1266 {
1267 	rnode4_t *rp;
1268 	mntinfo4_t *mi;
1269 	struct nfs4_async_reqs *args;
1270 
1271 	rp = VTOR4(vp);
1272 	ASSERT(rp->r_freef == NULL);
1273 
1274 	mi = VTOMI4(vp);
1275 
1276 	/*
1277 	 * If addr falls in a different segment, don't bother doing readahead.
1278 	 */
1279 	if (addr >= seg->s_base + seg->s_size)
1280 		return (-1);
1281 
1282 	/*
1283 	 * If we can't allocate a request structure, punt on the readahead.
1284 	 */
1285 	if ((args = kmem_alloc(sizeof (*args), KM_NOSLEEP)) == NULL)
1286 		return (-1);
1287 
1288 	/*
1289 	 * If a lock operation is pending, don't initiate any new
1290 	 * readaheads.  Otherwise, bump r_count to indicate the new
1291 	 * asynchronous I/O.
1292 	 */
1293 	if (!nfs_rw_tryenter(&rp->r_lkserlock, RW_READER)) {
1294 		kmem_free(args, sizeof (*args));
1295 		return (-1);
1296 	}
1297 	mutex_enter(&rp->r_statelock);
1298 	rp->r_count++;
1299 	mutex_exit(&rp->r_statelock);
1300 	nfs_rw_exit(&rp->r_lkserlock);
1301 
1302 	args->a_next = NULL;
1303 #ifdef DEBUG
1304 	args->a_queuer = curthread;
1305 #endif
1306 	VN_HOLD(vp);
1307 	args->a_vp = vp;
1308 	ASSERT(cr != NULL);
1309 	crhold(cr);
1310 	args->a_cred = cr;
1311 	args->a_io = NFS4_READ_AHEAD;
1312 	args->a_nfs4_readahead = readahead;
1313 	args->a_nfs4_blkoff = blkoff;
1314 	args->a_nfs4_seg = seg;
1315 	args->a_nfs4_addr = addr;
1316 
1317 	mutex_enter(&mi->mi_async_lock);
1318 
1319 	/*
1320 	 * If asyncio has been disabled, don't bother readahead.
1321 	 */
1322 	if (mi->mi_max_threads == 0) {
1323 		mutex_exit(&mi->mi_async_lock);
1324 		goto noasync;
1325 	}
1326 
1327 	/*
1328 	 * Link request structure into the async list and
1329 	 * wakeup async thread to do the i/o.
1330 	 */
1331 	if (mi->mi_async_reqs[NFS4_READ_AHEAD] == NULL) {
1332 		mi->mi_async_reqs[NFS4_READ_AHEAD] = args;
1333 		mi->mi_async_tail[NFS4_READ_AHEAD] = args;
1334 	} else {
1335 		mi->mi_async_tail[NFS4_READ_AHEAD]->a_next = args;
1336 		mi->mi_async_tail[NFS4_READ_AHEAD] = args;
1337 	}
1338 
1339 	if (mi->mi_io_kstats) {
1340 		mutex_enter(&mi->mi_lock);
1341 		kstat_waitq_enter(KSTAT_IO_PTR(mi->mi_io_kstats));
1342 		mutex_exit(&mi->mi_lock);
1343 	}
1344 
1345 	mi->mi_async_req_count++;
1346 	ASSERT(mi->mi_async_req_count != 0);
1347 	cv_signal(&mi->mi_async_reqs_cv);
1348 	mutex_exit(&mi->mi_async_lock);
1349 	return (0);
1350 
1351 noasync:
1352 	mutex_enter(&rp->r_statelock);
1353 	rp->r_count--;
1354 	cv_broadcast(&rp->r_cv);
1355 	mutex_exit(&rp->r_statelock);
1356 	VN_RELE(vp);
1357 	crfree(cr);
1358 	kmem_free(args, sizeof (*args));
1359 	return (-1);
1360 }
1361 
1362 static void
1363 nfs4_async_start(struct vfs *vfsp)
1364 {
1365 	nfs4_async_common_start(vfsp, NFS4_ASYNC_QUEUE);
1366 }
1367 
1368 static void
1369 nfs4_async_pgops_start(struct vfs *vfsp)
1370 {
1371 	nfs4_async_common_start(vfsp, NFS4_ASYNC_PGOPS_QUEUE);
1372 }
1373 
1374 /*
1375  * The async queues for each mounted file system are arranged as a
1376  * set of queues, one for each async i/o type.  Requests are taken
1377  * from the queues in a round-robin fashion.  A number of consecutive
1378  * requests are taken from each queue before moving on to the next
1379  * queue.  This functionality may allow the NFS Version 2 server to do
1380  * write clustering, even if the client is mixing writes and reads
1381  * because it will take multiple write requests from the queue
1382  * before processing any of the other async i/o types.
1383  *
1384  * XXX The nfs4_async_common_start thread is unsafe in the light of the present
1385  * model defined by cpr to suspend the system. Specifically over the
1386  * wire calls are cpr-unsafe. The thread should be reevaluated in
1387  * case of future updates to the cpr model.
1388  */
1389 static void
1390 nfs4_async_common_start(struct vfs *vfsp, int async_queue)
1391 {
1392 	struct nfs4_async_reqs *args;
1393 	mntinfo4_t *mi = VFTOMI4(vfsp);
1394 	clock_t time_left = 1;
1395 	callb_cpr_t cprinfo;
1396 	int i;
1397 	extern int nfs_async_timeout;
1398 	int async_types;
1399 	kcondvar_t *async_work_cv;
1400 
1401 	if (async_queue == NFS4_ASYNC_QUEUE) {
1402 		async_types = NFS4_ASYNC_TYPES;
1403 		async_work_cv = &mi->mi_async_work_cv[NFS4_ASYNC_QUEUE];
1404 	} else {
1405 		async_types = NFS4_ASYNC_PGOPS_TYPES;
1406 		async_work_cv = &mi->mi_async_work_cv[NFS4_ASYNC_PGOPS_QUEUE];
1407 	}
1408 
1409 	/*
1410 	 * Dynamic initialization of nfs_async_timeout to allow nfs to be
1411 	 * built in an implementation independent manner.
1412 	 */
1413 	if (nfs_async_timeout == -1)
1414 		nfs_async_timeout = NFS_ASYNC_TIMEOUT;
1415 
1416 	CALLB_CPR_INIT(&cprinfo, &mi->mi_async_lock, callb_generic_cpr, "nas");
1417 
1418 	mutex_enter(&mi->mi_async_lock);
1419 	for (;;) {
1420 		/*
1421 		 * Find the next queue containing an entry.  We start
1422 		 * at the current queue pointer and then round robin
1423 		 * through all of them until we either find a non-empty
1424 		 * queue or have looked through all of them.
1425 		 */
1426 		for (i = 0; i < async_types; i++) {
1427 			args = *mi->mi_async_curr[async_queue];
1428 			if (args != NULL)
1429 				break;
1430 			mi->mi_async_curr[async_queue]++;
1431 			if (mi->mi_async_curr[async_queue] ==
1432 			    &mi->mi_async_reqs[async_types]) {
1433 				mi->mi_async_curr[async_queue] =
1434 				    &mi->mi_async_reqs[0];
1435 			}
1436 		}
1437 		/*
1438 		 * If we didn't find a entry, then block until woken up
1439 		 * again and then look through the queues again.
1440 		 */
1441 		if (args == NULL) {
1442 			/*
1443 			 * Exiting is considered to be safe for CPR as well
1444 			 */
1445 			CALLB_CPR_SAFE_BEGIN(&cprinfo);
1446 
1447 			/*
1448 			 * Wakeup thread waiting to unmount the file
1449 			 * system only if all async threads are inactive.
1450 			 *
1451 			 * If we've timed-out and there's nothing to do,
1452 			 * then get rid of this thread.
1453 			 */
1454 			if (mi->mi_max_threads == 0 || time_left <= 0) {
1455 				--mi->mi_threads[async_queue];
1456 
1457 				if (mi->mi_threads[NFS4_ASYNC_QUEUE] == 0 &&
1458 				    mi->mi_threads[NFS4_ASYNC_PGOPS_QUEUE] == 0)
1459 					cv_signal(&mi->mi_async_cv);
1460 				CALLB_CPR_EXIT(&cprinfo);
1461 				VFS_RELE(vfsp);	/* release thread's hold */
1462 				MI4_RELE(mi);
1463 				zthread_exit();
1464 				/* NOTREACHED */
1465 			}
1466 			time_left = cv_reltimedwait(async_work_cv,
1467 			    &mi->mi_async_lock, nfs_async_timeout,
1468 			    TR_CLOCK_TICK);
1469 
1470 			CALLB_CPR_SAFE_END(&cprinfo, &mi->mi_async_lock);
1471 
1472 			continue;
1473 		} else {
1474 			time_left = 1;
1475 		}
1476 
1477 		/*
1478 		 * Remove the request from the async queue and then
1479 		 * update the current async request queue pointer.  If
1480 		 * the current queue is empty or we have removed enough
1481 		 * consecutive entries from it, then reset the counter
1482 		 * for this queue and then move the current pointer to
1483 		 * the next queue.
1484 		 */
1485 		*mi->mi_async_curr[async_queue] = args->a_next;
1486 		if (*mi->mi_async_curr[async_queue] == NULL ||
1487 		    --mi->mi_async_clusters[args->a_io] == 0) {
1488 			mi->mi_async_clusters[args->a_io] =
1489 			    mi->mi_async_init_clusters;
1490 			mi->mi_async_curr[async_queue]++;
1491 			if (mi->mi_async_curr[async_queue] ==
1492 			    &mi->mi_async_reqs[async_types]) {
1493 				mi->mi_async_curr[async_queue] =
1494 				    &mi->mi_async_reqs[0];
1495 			}
1496 		}
1497 
1498 		if (args->a_io != NFS4_INACTIVE && mi->mi_io_kstats) {
1499 			mutex_enter(&mi->mi_lock);
1500 			kstat_waitq_exit(KSTAT_IO_PTR(mi->mi_io_kstats));
1501 			mutex_exit(&mi->mi_lock);
1502 		}
1503 
1504 		mutex_exit(&mi->mi_async_lock);
1505 
1506 		/*
1507 		 * Obtain arguments from the async request structure.
1508 		 */
1509 		if (args->a_io == NFS4_READ_AHEAD && mi->mi_max_threads > 0) {
1510 			(*args->a_nfs4_readahead)(args->a_vp,
1511 			    args->a_nfs4_blkoff, args->a_nfs4_addr,
1512 			    args->a_nfs4_seg, args->a_cred);
1513 		} else if (args->a_io == NFS4_PUTAPAGE) {
1514 			(void) (*args->a_nfs4_putapage)(args->a_vp,
1515 			    args->a_nfs4_pp, args->a_nfs4_off,
1516 			    args->a_nfs4_len, args->a_nfs4_flags,
1517 			    args->a_cred);
1518 		} else if (args->a_io == NFS4_PAGEIO) {
1519 			(void) (*args->a_nfs4_pageio)(args->a_vp,
1520 			    args->a_nfs4_pp, args->a_nfs4_off,
1521 			    args->a_nfs4_len, args->a_nfs4_flags,
1522 			    args->a_cred);
1523 		} else if (args->a_io == NFS4_READDIR) {
1524 			(void) ((*args->a_nfs4_readdir)(args->a_vp,
1525 			    args->a_nfs4_rdc, args->a_cred));
1526 		} else if (args->a_io == NFS4_COMMIT) {
1527 			(*args->a_nfs4_commit)(args->a_vp, args->a_nfs4_plist,
1528 			    args->a_nfs4_offset, args->a_nfs4_count,
1529 			    args->a_cred);
1530 		} else if (args->a_io == NFS4_INACTIVE) {
1531 			nfs4_inactive_otw(args->a_vp, args->a_cred);
1532 		}
1533 
1534 		/*
1535 		 * Now, release the vnode and free the credentials
1536 		 * structure.
1537 		 */
1538 		free_async_args4(args);
1539 		/*
1540 		 * Reacquire the mutex because it will be needed above.
1541 		 */
1542 		mutex_enter(&mi->mi_async_lock);
1543 	}
1544 }
1545 
1546 /*
1547  * nfs4_inactive_thread - look for vnodes that need over-the-wire calls as
1548  * part of VOP_INACTIVE.
1549  */
1550 
1551 void
1552 nfs4_inactive_thread(mntinfo4_t *mi)
1553 {
1554 	struct nfs4_async_reqs *args;
1555 	callb_cpr_t cprinfo;
1556 	vfs_t *vfsp = mi->mi_vfsp;
1557 
1558 	CALLB_CPR_INIT(&cprinfo, &mi->mi_async_lock, callb_generic_cpr,
1559 	    "nfs4_inactive_thread");
1560 
1561 	for (;;) {
1562 		mutex_enter(&mi->mi_async_lock);
1563 		args = mi->mi_async_reqs[NFS4_INACTIVE];
1564 		if (args == NULL) {
1565 			mutex_enter(&mi->mi_lock);
1566 			/*
1567 			 * We don't want to exit until the async manager is done
1568 			 * with its work; hence the check for mi_manager_thread
1569 			 * being NULL.
1570 			 *
1571 			 * The async manager thread will cv_broadcast() on
1572 			 * mi_inact_req_cv when it's done, at which point we'll
1573 			 * wake up and exit.
1574 			 */
1575 			if (mi->mi_manager_thread == NULL)
1576 				goto die;
1577 			mi->mi_flags |= MI4_INACTIVE_IDLE;
1578 			mutex_exit(&mi->mi_lock);
1579 			cv_signal(&mi->mi_async_cv);
1580 			CALLB_CPR_SAFE_BEGIN(&cprinfo);
1581 			cv_wait(&mi->mi_inact_req_cv, &mi->mi_async_lock);
1582 			CALLB_CPR_SAFE_END(&cprinfo, &mi->mi_async_lock);
1583 			mutex_exit(&mi->mi_async_lock);
1584 		} else {
1585 			mutex_enter(&mi->mi_lock);
1586 			mi->mi_flags &= ~MI4_INACTIVE_IDLE;
1587 			mutex_exit(&mi->mi_lock);
1588 			mi->mi_async_reqs[NFS4_INACTIVE] = args->a_next;
1589 			mutex_exit(&mi->mi_async_lock);
1590 			nfs4_inactive_otw(args->a_vp, args->a_cred);
1591 			crfree(args->a_cred);
1592 			kmem_free(args, sizeof (*args));
1593 		}
1594 	}
1595 die:
1596 	mutex_exit(&mi->mi_lock);
1597 	mi->mi_inactive_thread = NULL;
1598 	cv_signal(&mi->mi_async_cv);
1599 
1600 	/*
1601 	 * There is no explicit call to mutex_exit(&mi->mi_async_lock) since
1602 	 * CALLB_CPR_EXIT is actually responsible for releasing 'mi_async_lock'.
1603 	 */
1604 	CALLB_CPR_EXIT(&cprinfo);
1605 
1606 	NFS4_DEBUG(nfs4_client_zone_debug, (CE_NOTE,
1607 	    "nfs4_inactive_thread exiting for vfs %p\n", (void *)vfsp));
1608 
1609 	MI4_RELE(mi);
1610 	zthread_exit();
1611 	/* NOTREACHED */
1612 }
1613 
1614 /*
1615  * nfs_async_stop:
1616  * Wait for all outstanding putpage operations and the inactive thread to
1617  * complete; nfs4_async_stop_sig() without interruptibility.
1618  */
1619 void
1620 nfs4_async_stop(struct vfs *vfsp)
1621 {
1622 	mntinfo4_t *mi = VFTOMI4(vfsp);
1623 
1624 	/*
1625 	 * Wait for all outstanding async operations to complete and for
1626 	 * worker threads to exit.
1627 	 */
1628 	mutex_enter(&mi->mi_async_lock);
1629 	mi->mi_max_threads = 0;
1630 	NFS4_WAKEALL_ASYNC_WORKERS(mi->mi_async_work_cv);
1631 	while (mi->mi_threads[NFS4_ASYNC_QUEUE] != 0 ||
1632 	    mi->mi_threads[NFS4_ASYNC_PGOPS_QUEUE] != 0)
1633 		cv_wait(&mi->mi_async_cv, &mi->mi_async_lock);
1634 
1635 	/*
1636 	 * Wait for the inactive thread to finish doing what it's doing.  It
1637 	 * won't exit until the last reference to the vfs_t goes away.
1638 	 */
1639 	if (mi->mi_inactive_thread != NULL) {
1640 		mutex_enter(&mi->mi_lock);
1641 		while (!(mi->mi_flags & MI4_INACTIVE_IDLE) ||
1642 		    (mi->mi_async_reqs[NFS4_INACTIVE] != NULL)) {
1643 			mutex_exit(&mi->mi_lock);
1644 			cv_wait(&mi->mi_async_cv, &mi->mi_async_lock);
1645 			mutex_enter(&mi->mi_lock);
1646 		}
1647 		mutex_exit(&mi->mi_lock);
1648 	}
1649 	mutex_exit(&mi->mi_async_lock);
1650 }
1651 
1652 /*
1653  * nfs_async_stop_sig:
1654  * Wait for all outstanding putpage operations and the inactive thread to
1655  * complete. If a signal is delivered we will abort and return non-zero;
1656  * otherwise return 0. Since this routine is called from nfs4_unmount, we
1657  * need to make it interruptible.
1658  */
1659 int
1660 nfs4_async_stop_sig(struct vfs *vfsp)
1661 {
1662 	mntinfo4_t *mi = VFTOMI4(vfsp);
1663 	ushort_t omax;
1664 	bool_t intr = FALSE;
1665 
1666 	/*
1667 	 * Wait for all outstanding putpage operations to complete and for
1668 	 * worker threads to exit.
1669 	 */
1670 	mutex_enter(&mi->mi_async_lock);
1671 	omax = mi->mi_max_threads;
1672 	mi->mi_max_threads = 0;
1673 	NFS4_WAKEALL_ASYNC_WORKERS(mi->mi_async_work_cv);
1674 	while (mi->mi_threads[NFS4_ASYNC_QUEUE] != 0 ||
1675 	    mi->mi_threads[NFS4_ASYNC_PGOPS_QUEUE] != 0) {
1676 		if (!cv_wait_sig(&mi->mi_async_cv, &mi->mi_async_lock)) {
1677 			intr = TRUE;
1678 			goto interrupted;
1679 		}
1680 	}
1681 
1682 	/*
1683 	 * Wait for the inactive thread to finish doing what it's doing.  It
1684 	 * won't exit until the a last reference to the vfs_t goes away.
1685 	 */
1686 	if (mi->mi_inactive_thread != NULL) {
1687 		mutex_enter(&mi->mi_lock);
1688 		while (!(mi->mi_flags & MI4_INACTIVE_IDLE) ||
1689 		    (mi->mi_async_reqs[NFS4_INACTIVE] != NULL)) {
1690 			mutex_exit(&mi->mi_lock);
1691 			if (!cv_wait_sig(&mi->mi_async_cv,
1692 			    &mi->mi_async_lock)) {
1693 				intr = TRUE;
1694 				goto interrupted;
1695 			}
1696 			mutex_enter(&mi->mi_lock);
1697 		}
1698 		mutex_exit(&mi->mi_lock);
1699 	}
1700 interrupted:
1701 	if (intr)
1702 		mi->mi_max_threads = omax;
1703 	mutex_exit(&mi->mi_async_lock);
1704 
1705 	return (intr);
1706 }
1707 
1708 int
1709 nfs4_async_putapage(vnode_t *vp, page_t *pp, u_offset_t off, size_t len,
1710     int flags, cred_t *cr, int (*putapage)(vnode_t *, page_t *,
1711     u_offset_t, size_t, int, cred_t *))
1712 {
1713 	rnode4_t *rp;
1714 	mntinfo4_t *mi;
1715 	struct nfs4_async_reqs *args;
1716 
1717 	ASSERT(flags & B_ASYNC);
1718 	ASSERT(vp->v_vfsp != NULL);
1719 
1720 	rp = VTOR4(vp);
1721 	ASSERT(rp->r_count > 0);
1722 
1723 	mi = VTOMI4(vp);
1724 
1725 	/*
1726 	 * If we can't allocate a request structure, do the putpage
1727 	 * operation synchronously in this thread's context.
1728 	 */
1729 	if ((args = kmem_alloc(sizeof (*args), KM_NOSLEEP)) == NULL)
1730 		goto noasync;
1731 
1732 	args->a_next = NULL;
1733 #ifdef DEBUG
1734 	args->a_queuer = curthread;
1735 #endif
1736 	VN_HOLD(vp);
1737 	args->a_vp = vp;
1738 	ASSERT(cr != NULL);
1739 	crhold(cr);
1740 	args->a_cred = cr;
1741 	args->a_io = NFS4_PUTAPAGE;
1742 	args->a_nfs4_putapage = putapage;
1743 	args->a_nfs4_pp = pp;
1744 	args->a_nfs4_off = off;
1745 	args->a_nfs4_len = (uint_t)len;
1746 	args->a_nfs4_flags = flags;
1747 
1748 	mutex_enter(&mi->mi_async_lock);
1749 
1750 	/*
1751 	 * If asyncio has been disabled, then make a synchronous request.
1752 	 * This check is done a second time in case async io was diabled
1753 	 * while this thread was blocked waiting for memory pressure to
1754 	 * reduce or for the queue to drain.
1755 	 */
1756 	if (mi->mi_max_threads == 0) {
1757 		mutex_exit(&mi->mi_async_lock);
1758 
1759 		VN_RELE(vp);
1760 		crfree(cr);
1761 		kmem_free(args, sizeof (*args));
1762 		goto noasync;
1763 	}
1764 
1765 	/*
1766 	 * Link request structure into the async list and
1767 	 * wakeup async thread to do the i/o.
1768 	 */
1769 	if (mi->mi_async_reqs[NFS4_PUTAPAGE] == NULL) {
1770 		mi->mi_async_reqs[NFS4_PUTAPAGE] = args;
1771 		mi->mi_async_tail[NFS4_PUTAPAGE] = args;
1772 	} else {
1773 		mi->mi_async_tail[NFS4_PUTAPAGE]->a_next = args;
1774 		mi->mi_async_tail[NFS4_PUTAPAGE] = args;
1775 	}
1776 
1777 	mutex_enter(&rp->r_statelock);
1778 	rp->r_count++;
1779 	rp->r_awcount++;
1780 	mutex_exit(&rp->r_statelock);
1781 
1782 	if (mi->mi_io_kstats) {
1783 		mutex_enter(&mi->mi_lock);
1784 		kstat_waitq_enter(KSTAT_IO_PTR(mi->mi_io_kstats));
1785 		mutex_exit(&mi->mi_lock);
1786 	}
1787 
1788 	mi->mi_async_req_count++;
1789 	ASSERT(mi->mi_async_req_count != 0);
1790 	cv_signal(&mi->mi_async_reqs_cv);
1791 	mutex_exit(&mi->mi_async_lock);
1792 	return (0);
1793 
1794 noasync:
1795 
1796 	if (curproc == proc_pageout || curproc == proc_fsflush) {
1797 		/*
1798 		 * If we get here in the context of the pageout/fsflush,
1799 		 * or we have run out of memory or we're attempting to
1800 		 * unmount we refuse to do a sync write, because this may
1801 		 * hang pageout/fsflush and the machine. In this case,
1802 		 * we just re-mark the page as dirty and punt on the page.
1803 		 *
1804 		 * Make sure B_FORCE isn't set.  We can re-mark the
1805 		 * pages as dirty and unlock the pages in one swoop by
1806 		 * passing in B_ERROR to pvn_write_done().  However,
1807 		 * we should make sure B_FORCE isn't set - we don't
1808 		 * want the page tossed before it gets written out.
1809 		 */
1810 		if (flags & B_FORCE)
1811 			flags &= ~(B_INVAL | B_FORCE);
1812 		pvn_write_done(pp, flags | B_ERROR);
1813 		return (0);
1814 	}
1815 
1816 	if (nfs_zone() != mi->mi_zone) {
1817 		/*
1818 		 * So this was a cross-zone sync putpage.
1819 		 *
1820 		 * We pass in B_ERROR to pvn_write_done() to re-mark the pages
1821 		 * as dirty and unlock them.
1822 		 *
1823 		 * We don't want to clear B_FORCE here as the caller presumably
1824 		 * knows what they're doing if they set it.
1825 		 */
1826 		pvn_write_done(pp, flags | B_ERROR);
1827 		return (EPERM);
1828 	}
1829 	return ((*putapage)(vp, pp, off, len, flags, cr));
1830 }
1831 
1832 int
1833 nfs4_async_pageio(vnode_t *vp, page_t *pp, u_offset_t io_off, size_t io_len,
1834     int flags, cred_t *cr, int (*pageio)(vnode_t *, page_t *, u_offset_t,
1835     size_t, int, cred_t *))
1836 {
1837 	rnode4_t *rp;
1838 	mntinfo4_t *mi;
1839 	struct nfs4_async_reqs *args;
1840 
1841 	ASSERT(flags & B_ASYNC);
1842 	ASSERT(vp->v_vfsp != NULL);
1843 
1844 	rp = VTOR4(vp);
1845 	ASSERT(rp->r_count > 0);
1846 
1847 	mi = VTOMI4(vp);
1848 
1849 	/*
1850 	 * If we can't allocate a request structure, do the pageio
1851 	 * request synchronously in this thread's context.
1852 	 */
1853 	if ((args = kmem_alloc(sizeof (*args), KM_NOSLEEP)) == NULL)
1854 		goto noasync;
1855 
1856 	args->a_next = NULL;
1857 #ifdef DEBUG
1858 	args->a_queuer = curthread;
1859 #endif
1860 	VN_HOLD(vp);
1861 	args->a_vp = vp;
1862 	ASSERT(cr != NULL);
1863 	crhold(cr);
1864 	args->a_cred = cr;
1865 	args->a_io = NFS4_PAGEIO;
1866 	args->a_nfs4_pageio = pageio;
1867 	args->a_nfs4_pp = pp;
1868 	args->a_nfs4_off = io_off;
1869 	args->a_nfs4_len = (uint_t)io_len;
1870 	args->a_nfs4_flags = flags;
1871 
1872 	mutex_enter(&mi->mi_async_lock);
1873 
1874 	/*
1875 	 * If asyncio has been disabled, then make a synchronous request.
1876 	 * This check is done a second time in case async io was diabled
1877 	 * while this thread was blocked waiting for memory pressure to
1878 	 * reduce or for the queue to drain.
1879 	 */
1880 	if (mi->mi_max_threads == 0) {
1881 		mutex_exit(&mi->mi_async_lock);
1882 
1883 		VN_RELE(vp);
1884 		crfree(cr);
1885 		kmem_free(args, sizeof (*args));
1886 		goto noasync;
1887 	}
1888 
1889 	/*
1890 	 * Link request structure into the async list and
1891 	 * wakeup async thread to do the i/o.
1892 	 */
1893 	if (mi->mi_async_reqs[NFS4_PAGEIO] == NULL) {
1894 		mi->mi_async_reqs[NFS4_PAGEIO] = args;
1895 		mi->mi_async_tail[NFS4_PAGEIO] = args;
1896 	} else {
1897 		mi->mi_async_tail[NFS4_PAGEIO]->a_next = args;
1898 		mi->mi_async_tail[NFS4_PAGEIO] = args;
1899 	}
1900 
1901 	mutex_enter(&rp->r_statelock);
1902 	rp->r_count++;
1903 	rp->r_awcount++;
1904 	mutex_exit(&rp->r_statelock);
1905 
1906 	if (mi->mi_io_kstats) {
1907 		mutex_enter(&mi->mi_lock);
1908 		kstat_waitq_enter(KSTAT_IO_PTR(mi->mi_io_kstats));
1909 		mutex_exit(&mi->mi_lock);
1910 	}
1911 
1912 	mi->mi_async_req_count++;
1913 	ASSERT(mi->mi_async_req_count != 0);
1914 	cv_signal(&mi->mi_async_reqs_cv);
1915 	mutex_exit(&mi->mi_async_lock);
1916 	return (0);
1917 
1918 noasync:
1919 	/*
1920 	 * If we can't do it ASYNC, for reads we do nothing (but cleanup
1921 	 * the page list), for writes we do it synchronously, except for
1922 	 * proc_pageout/proc_fsflush as described below.
1923 	 */
1924 	if (flags & B_READ) {
1925 		pvn_read_done(pp, flags | B_ERROR);
1926 		return (0);
1927 	}
1928 
1929 	if (curproc == proc_pageout || curproc == proc_fsflush) {
1930 		/*
1931 		 * If we get here in the context of the pageout/fsflush,
1932 		 * we refuse to do a sync write, because this may hang
1933 		 * pageout/fsflush (and the machine). In this case, we just
1934 		 * re-mark the page as dirty and punt on the page.
1935 		 *
1936 		 * Make sure B_FORCE isn't set.  We can re-mark the
1937 		 * pages as dirty and unlock the pages in one swoop by
1938 		 * passing in B_ERROR to pvn_write_done().  However,
1939 		 * we should make sure B_FORCE isn't set - we don't
1940 		 * want the page tossed before it gets written out.
1941 		 */
1942 		if (flags & B_FORCE)
1943 			flags &= ~(B_INVAL | B_FORCE);
1944 		pvn_write_done(pp, flags | B_ERROR);
1945 		return (0);
1946 	}
1947 
1948 	if (nfs_zone() != mi->mi_zone) {
1949 		/*
1950 		 * So this was a cross-zone sync pageio.  We pass in B_ERROR
1951 		 * to pvn_write_done() to re-mark the pages as dirty and unlock
1952 		 * them.
1953 		 *
1954 		 * We don't want to clear B_FORCE here as the caller presumably
1955 		 * knows what they're doing if they set it.
1956 		 */
1957 		pvn_write_done(pp, flags | B_ERROR);
1958 		return (EPERM);
1959 	}
1960 	return ((*pageio)(vp, pp, io_off, io_len, flags, cr));
1961 }
1962 
1963 void
1964 nfs4_async_readdir(vnode_t *vp, rddir4_cache *rdc, cred_t *cr,
1965     int (*readdir)(vnode_t *, rddir4_cache *, cred_t *))
1966 {
1967 	rnode4_t *rp;
1968 	mntinfo4_t *mi;
1969 	struct nfs4_async_reqs *args;
1970 
1971 	rp = VTOR4(vp);
1972 	ASSERT(rp->r_freef == NULL);
1973 
1974 	mi = VTOMI4(vp);
1975 
1976 	/*
1977 	 * If we can't allocate a request structure, skip the readdir.
1978 	 */
1979 	if ((args = kmem_alloc(sizeof (*args), KM_NOSLEEP)) == NULL)
1980 		goto noasync;
1981 
1982 	args->a_next = NULL;
1983 #ifdef DEBUG
1984 	args->a_queuer = curthread;
1985 #endif
1986 	VN_HOLD(vp);
1987 	args->a_vp = vp;
1988 	ASSERT(cr != NULL);
1989 	crhold(cr);
1990 	args->a_cred = cr;
1991 	args->a_io = NFS4_READDIR;
1992 	args->a_nfs4_readdir = readdir;
1993 	args->a_nfs4_rdc = rdc;
1994 
1995 	mutex_enter(&mi->mi_async_lock);
1996 
1997 	/*
1998 	 * If asyncio has been disabled, then skip this request
1999 	 */
2000 	if (mi->mi_max_threads == 0) {
2001 		mutex_exit(&mi->mi_async_lock);
2002 
2003 		VN_RELE(vp);
2004 		crfree(cr);
2005 		kmem_free(args, sizeof (*args));
2006 		goto noasync;
2007 	}
2008 
2009 	/*
2010 	 * Link request structure into the async list and
2011 	 * wakeup async thread to do the i/o.
2012 	 */
2013 	if (mi->mi_async_reqs[NFS4_READDIR] == NULL) {
2014 		mi->mi_async_reqs[NFS4_READDIR] = args;
2015 		mi->mi_async_tail[NFS4_READDIR] = args;
2016 	} else {
2017 		mi->mi_async_tail[NFS4_READDIR]->a_next = args;
2018 		mi->mi_async_tail[NFS4_READDIR] = args;
2019 	}
2020 
2021 	mutex_enter(&rp->r_statelock);
2022 	rp->r_count++;
2023 	mutex_exit(&rp->r_statelock);
2024 
2025 	if (mi->mi_io_kstats) {
2026 		mutex_enter(&mi->mi_lock);
2027 		kstat_waitq_enter(KSTAT_IO_PTR(mi->mi_io_kstats));
2028 		mutex_exit(&mi->mi_lock);
2029 	}
2030 
2031 	mi->mi_async_req_count++;
2032 	ASSERT(mi->mi_async_req_count != 0);
2033 	cv_signal(&mi->mi_async_reqs_cv);
2034 	mutex_exit(&mi->mi_async_lock);
2035 	return;
2036 
2037 noasync:
2038 	mutex_enter(&rp->r_statelock);
2039 	rdc->entries = NULL;
2040 	/*
2041 	 * Indicate that no one is trying to fill this entry and
2042 	 * it still needs to be filled.
2043 	 */
2044 	rdc->flags &= ~RDDIR;
2045 	rdc->flags |= RDDIRREQ;
2046 	rddir4_cache_rele(rp, rdc);
2047 	mutex_exit(&rp->r_statelock);
2048 }
2049 
2050 void
2051 nfs4_async_commit(vnode_t *vp, page_t *plist, offset3 offset, count3 count,
2052     cred_t *cr, void (*commit)(vnode_t *, page_t *, offset3, count3,
2053     cred_t *))
2054 {
2055 	rnode4_t *rp;
2056 	mntinfo4_t *mi;
2057 	struct nfs4_async_reqs *args;
2058 	page_t *pp;
2059 
2060 	rp = VTOR4(vp);
2061 	mi = VTOMI4(vp);
2062 
2063 	/*
2064 	 * If we can't allocate a request structure, do the commit
2065 	 * operation synchronously in this thread's context.
2066 	 */
2067 	if ((args = kmem_alloc(sizeof (*args), KM_NOSLEEP)) == NULL)
2068 		goto noasync;
2069 
2070 	args->a_next = NULL;
2071 #ifdef DEBUG
2072 	args->a_queuer = curthread;
2073 #endif
2074 	VN_HOLD(vp);
2075 	args->a_vp = vp;
2076 	ASSERT(cr != NULL);
2077 	crhold(cr);
2078 	args->a_cred = cr;
2079 	args->a_io = NFS4_COMMIT;
2080 	args->a_nfs4_commit = commit;
2081 	args->a_nfs4_plist = plist;
2082 	args->a_nfs4_offset = offset;
2083 	args->a_nfs4_count = count;
2084 
2085 	mutex_enter(&mi->mi_async_lock);
2086 
2087 	/*
2088 	 * If asyncio has been disabled, then make a synchronous request.
2089 	 * This check is done a second time in case async io was diabled
2090 	 * while this thread was blocked waiting for memory pressure to
2091 	 * reduce or for the queue to drain.
2092 	 */
2093 	if (mi->mi_max_threads == 0) {
2094 		mutex_exit(&mi->mi_async_lock);
2095 
2096 		VN_RELE(vp);
2097 		crfree(cr);
2098 		kmem_free(args, sizeof (*args));
2099 		goto noasync;
2100 	}
2101 
2102 	/*
2103 	 * Link request structure into the async list and
2104 	 * wakeup async thread to do the i/o.
2105 	 */
2106 	if (mi->mi_async_reqs[NFS4_COMMIT] == NULL) {
2107 		mi->mi_async_reqs[NFS4_COMMIT] = args;
2108 		mi->mi_async_tail[NFS4_COMMIT] = args;
2109 	} else {
2110 		mi->mi_async_tail[NFS4_COMMIT]->a_next = args;
2111 		mi->mi_async_tail[NFS4_COMMIT] = args;
2112 	}
2113 
2114 	mutex_enter(&rp->r_statelock);
2115 	rp->r_count++;
2116 	mutex_exit(&rp->r_statelock);
2117 
2118 	if (mi->mi_io_kstats) {
2119 		mutex_enter(&mi->mi_lock);
2120 		kstat_waitq_enter(KSTAT_IO_PTR(mi->mi_io_kstats));
2121 		mutex_exit(&mi->mi_lock);
2122 	}
2123 
2124 	mi->mi_async_req_count++;
2125 	ASSERT(mi->mi_async_req_count != 0);
2126 	cv_signal(&mi->mi_async_reqs_cv);
2127 	mutex_exit(&mi->mi_async_lock);
2128 	return;
2129 
2130 noasync:
2131 	if (curproc == proc_pageout || curproc == proc_fsflush ||
2132 	    nfs_zone() != mi->mi_zone) {
2133 		while (plist != NULL) {
2134 			pp = plist;
2135 			page_sub(&plist, pp);
2136 			pp->p_fsdata = C_COMMIT;
2137 			page_unlock(pp);
2138 		}
2139 		return;
2140 	}
2141 	(*commit)(vp, plist, offset, count, cr);
2142 }
2143 
2144 /*
2145  * nfs4_async_inactive - hand off a VOP_INACTIVE call to a thread.  The
2146  * reference to the vnode is handed over to the thread; the caller should
2147  * no longer refer to the vnode.
2148  *
2149  * Unlike most of the async routines, this handoff is needed for
2150  * correctness reasons, not just performance.  So doing operations in the
2151  * context of the current thread is not an option.
2152  */
2153 void
2154 nfs4_async_inactive(vnode_t *vp, cred_t *cr)
2155 {
2156 	mntinfo4_t *mi;
2157 	struct nfs4_async_reqs *args;
2158 	boolean_t signal_inactive_thread = B_FALSE;
2159 
2160 	mi = VTOMI4(vp);
2161 
2162 	args = kmem_alloc(sizeof (*args), KM_SLEEP);
2163 	args->a_next = NULL;
2164 #ifdef DEBUG
2165 	args->a_queuer = curthread;
2166 #endif
2167 	args->a_vp = vp;
2168 	ASSERT(cr != NULL);
2169 	crhold(cr);
2170 	args->a_cred = cr;
2171 	args->a_io = NFS4_INACTIVE;
2172 
2173 	/*
2174 	 * Note that we don't check mi->mi_max_threads here, since we
2175 	 * *need* to get rid of this vnode regardless of whether someone
2176 	 * set nfs4_max_threads to zero in /etc/system.
2177 	 *
2178 	 * The manager thread knows about this and is willing to create
2179 	 * at least one thread to accommodate us.
2180 	 */
2181 	mutex_enter(&mi->mi_async_lock);
2182 	if (mi->mi_inactive_thread == NULL) {
2183 		rnode4_t *rp;
2184 		vnode_t *unldvp = NULL;
2185 		char *unlname;
2186 		cred_t *unlcred;
2187 
2188 		mutex_exit(&mi->mi_async_lock);
2189 		/*
2190 		 * We just need to free up the memory associated with the
2191 		 * vnode, which can be safely done from within the current
2192 		 * context.
2193 		 */
2194 		crfree(cr);	/* drop our reference */
2195 		kmem_free(args, sizeof (*args));
2196 		rp = VTOR4(vp);
2197 		mutex_enter(&rp->r_statelock);
2198 		if (rp->r_unldvp != NULL) {
2199 			unldvp = rp->r_unldvp;
2200 			rp->r_unldvp = NULL;
2201 			unlname = rp->r_unlname;
2202 			rp->r_unlname = NULL;
2203 			unlcred = rp->r_unlcred;
2204 			rp->r_unlcred = NULL;
2205 		}
2206 		mutex_exit(&rp->r_statelock);
2207 		/*
2208 		 * No need to explicitly throw away any cached pages.  The
2209 		 * eventual r4inactive() will attempt a synchronous
2210 		 * VOP_PUTPAGE() which will immediately fail since the request
2211 		 * is coming from the wrong zone, and then will proceed to call
2212 		 * nfs4_invalidate_pages() which will clean things up for us.
2213 		 *
2214 		 * Throw away the delegation here so rp4_addfree()'s attempt to
2215 		 * return any existing delegations becomes a no-op.
2216 		 */
2217 		if (rp->r_deleg_type != OPEN_DELEGATE_NONE) {
2218 			(void) nfs_rw_enter_sig(&mi->mi_recovlock, RW_READER,
2219 			    FALSE);
2220 			(void) nfs4delegreturn(rp, NFS4_DR_DISCARD);
2221 			nfs_rw_exit(&mi->mi_recovlock);
2222 		}
2223 		nfs4_clear_open_streams(rp);
2224 
2225 		rp4_addfree(rp, cr);
2226 		if (unldvp != NULL) {
2227 			kmem_free(unlname, MAXNAMELEN);
2228 			VN_RELE(unldvp);
2229 			crfree(unlcred);
2230 		}
2231 		return;
2232 	}
2233 
2234 	if (mi->mi_manager_thread == NULL) {
2235 		/*
2236 		 * We want to talk to the inactive thread.
2237 		 */
2238 		signal_inactive_thread = B_TRUE;
2239 	}
2240 
2241 	/*
2242 	 * Enqueue the vnode and wake up either the special thread (empty
2243 	 * list) or an async thread.
2244 	 */
2245 	if (mi->mi_async_reqs[NFS4_INACTIVE] == NULL) {
2246 		mi->mi_async_reqs[NFS4_INACTIVE] = args;
2247 		mi->mi_async_tail[NFS4_INACTIVE] = args;
2248 		signal_inactive_thread = B_TRUE;
2249 	} else {
2250 		mi->mi_async_tail[NFS4_INACTIVE]->a_next = args;
2251 		mi->mi_async_tail[NFS4_INACTIVE] = args;
2252 	}
2253 	if (signal_inactive_thread) {
2254 		cv_signal(&mi->mi_inact_req_cv);
2255 	} else  {
2256 		mi->mi_async_req_count++;
2257 		ASSERT(mi->mi_async_req_count != 0);
2258 		cv_signal(&mi->mi_async_reqs_cv);
2259 	}
2260 
2261 	mutex_exit(&mi->mi_async_lock);
2262 }
2263 
2264 int
2265 writerp4(rnode4_t *rp, caddr_t base, int tcount, struct uio *uio, int pgcreated)
2266 {
2267 	int pagecreate;
2268 	int n;
2269 	int saved_n;
2270 	caddr_t saved_base;
2271 	u_offset_t offset;
2272 	int error;
2273 	int sm_error;
2274 	vnode_t *vp = RTOV(rp);
2275 
2276 	ASSERT(tcount <= MAXBSIZE && tcount <= uio->uio_resid);
2277 	ASSERT(nfs_rw_lock_held(&rp->r_rwlock, RW_WRITER));
2278 	if (!vpm_enable) {
2279 		ASSERT(((uintptr_t)base & MAXBOFFSET) + tcount <= MAXBSIZE);
2280 	}
2281 
2282 	/*
2283 	 * Move bytes in at most PAGESIZE chunks. We must avoid
2284 	 * spanning pages in uiomove() because page faults may cause
2285 	 * the cache to be invalidated out from under us. The r_size is not
2286 	 * updated until after the uiomove. If we push the last page of a
2287 	 * file before r_size is correct, we will lose the data written past
2288 	 * the current (and invalid) r_size.
2289 	 */
2290 	do {
2291 		offset = uio->uio_loffset;
2292 		pagecreate = 0;
2293 
2294 		/*
2295 		 * n is the number of bytes required to satisfy the request
2296 		 *   or the number of bytes to fill out the page.
2297 		 */
2298 		n = (int)MIN((PAGESIZE - (offset & PAGEOFFSET)), tcount);
2299 
2300 		/*
2301 		 * Check to see if we can skip reading in the page
2302 		 * and just allocate the memory.  We can do this
2303 		 * if we are going to rewrite the entire mapping
2304 		 * or if we are going to write to or beyond the current
2305 		 * end of file from the beginning of the mapping.
2306 		 *
2307 		 * The read of r_size is now protected by r_statelock.
2308 		 */
2309 		mutex_enter(&rp->r_statelock);
2310 		/*
2311 		 * When pgcreated is nonzero the caller has already done
2312 		 * a segmap_getmapflt with forcefault 0 and S_WRITE. With
2313 		 * segkpm this means we already have at least one page
2314 		 * created and mapped at base.
2315 		 */
2316 		pagecreate = pgcreated ||
2317 		    ((offset & PAGEOFFSET) == 0 &&
2318 		    (n == PAGESIZE || ((offset + n) >= rp->r_size)));
2319 
2320 		mutex_exit(&rp->r_statelock);
2321 
2322 		if (!vpm_enable && pagecreate) {
2323 			/*
2324 			 * The last argument tells segmap_pagecreate() to
2325 			 * always lock the page, as opposed to sometimes
2326 			 * returning with the page locked. This way we avoid a
2327 			 * fault on the ensuing uiomove(), but also
2328 			 * more importantly (to fix bug 1094402) we can
2329 			 * call segmap_fault() to unlock the page in all
2330 			 * cases. An alternative would be to modify
2331 			 * segmap_pagecreate() to tell us when it is
2332 			 * locking a page, but that's a fairly major
2333 			 * interface change.
2334 			 */
2335 			if (pgcreated == 0)
2336 				(void) segmap_pagecreate(segkmap, base,
2337 				    (uint_t)n, 1);
2338 			saved_base = base;
2339 			saved_n = n;
2340 		}
2341 
2342 		/*
2343 		 * The number of bytes of data in the last page can not
2344 		 * be accurately be determined while page is being
2345 		 * uiomove'd to and the size of the file being updated.
2346 		 * Thus, inform threads which need to know accurately
2347 		 * how much data is in the last page of the file.  They
2348 		 * will not do the i/o immediately, but will arrange for
2349 		 * the i/o to happen later when this modify operation
2350 		 * will have finished.
2351 		 */
2352 		ASSERT(!(rp->r_flags & R4MODINPROGRESS));
2353 		mutex_enter(&rp->r_statelock);
2354 		rp->r_flags |= R4MODINPROGRESS;
2355 		rp->r_modaddr = (offset & MAXBMASK);
2356 		mutex_exit(&rp->r_statelock);
2357 
2358 		if (vpm_enable) {
2359 			/*
2360 			 * Copy data. If new pages are created, part of
2361 			 * the page that is not written will be initizliazed
2362 			 * with zeros.
2363 			 */
2364 			error = vpm_data_copy(vp, offset, n, uio,
2365 			    !pagecreate, NULL, 0, S_WRITE);
2366 		} else {
2367 			error = uiomove(base, n, UIO_WRITE, uio);
2368 		}
2369 
2370 		/*
2371 		 * r_size is the maximum number of
2372 		 * bytes known to be in the file.
2373 		 * Make sure it is at least as high as the
2374 		 * first unwritten byte pointed to by uio_loffset.
2375 		 */
2376 		mutex_enter(&rp->r_statelock);
2377 		if (rp->r_size < uio->uio_loffset)
2378 			rp->r_size = uio->uio_loffset;
2379 		rp->r_flags &= ~R4MODINPROGRESS;
2380 		rp->r_flags |= R4DIRTY;
2381 		mutex_exit(&rp->r_statelock);
2382 
2383 		/* n = # of bytes written */
2384 		n = (int)(uio->uio_loffset - offset);
2385 
2386 		if (!vpm_enable) {
2387 			base += n;
2388 		}
2389 
2390 		tcount -= n;
2391 		/*
2392 		 * If we created pages w/o initializing them completely,
2393 		 * we need to zero the part that wasn't set up.
2394 		 * This happens on a most EOF write cases and if
2395 		 * we had some sort of error during the uiomove.
2396 		 */
2397 		if (!vpm_enable && pagecreate) {
2398 			if ((uio->uio_loffset & PAGEOFFSET) || n == 0)
2399 				(void) kzero(base, PAGESIZE - n);
2400 
2401 			if (pgcreated) {
2402 				/*
2403 				 * Caller is responsible for this page,
2404 				 * it was not created in this loop.
2405 				 */
2406 				pgcreated = 0;
2407 			} else {
2408 				/*
2409 				 * For bug 1094402: segmap_pagecreate locks
2410 				 * page. Unlock it. This also unlocks the
2411 				 * pages allocated by page_create_va() in
2412 				 * segmap_pagecreate().
2413 				 */
2414 				sm_error = segmap_fault(kas.a_hat, segkmap,
2415 				    saved_base, saved_n,
2416 				    F_SOFTUNLOCK, S_WRITE);
2417 				if (error == 0)
2418 					error = sm_error;
2419 			}
2420 		}
2421 	} while (tcount > 0 && error == 0);
2422 
2423 	return (error);
2424 }
2425 
2426 int
2427 nfs4_putpages(vnode_t *vp, u_offset_t off, size_t len, int flags, cred_t *cr)
2428 {
2429 	rnode4_t *rp;
2430 	page_t *pp;
2431 	u_offset_t eoff;
2432 	u_offset_t io_off;
2433 	size_t io_len;
2434 	int error;
2435 	int rdirty;
2436 	int err;
2437 
2438 	rp = VTOR4(vp);
2439 	ASSERT(rp->r_count > 0);
2440 
2441 	if (!nfs4_has_pages(vp))
2442 		return (0);
2443 
2444 	ASSERT(vp->v_type != VCHR);
2445 
2446 	/*
2447 	 * If R4OUTOFSPACE is set, then all writes turn into B_INVAL
2448 	 * writes.  B_FORCE is set to force the VM system to actually
2449 	 * invalidate the pages, even if the i/o failed.  The pages
2450 	 * need to get invalidated because they can't be written out
2451 	 * because there isn't any space left on either the server's
2452 	 * file system or in the user's disk quota.  The B_FREE bit
2453 	 * is cleared to avoid confusion as to whether this is a
2454 	 * request to place the page on the freelist or to destroy
2455 	 * it.
2456 	 */
2457 	if ((rp->r_flags & R4OUTOFSPACE) ||
2458 	    (vp->v_vfsp->vfs_flag & VFS_UNMOUNTED))
2459 		flags = (flags & ~B_FREE) | B_INVAL | B_FORCE;
2460 
2461 	if (len == 0) {
2462 		/*
2463 		 * If doing a full file synchronous operation, then clear
2464 		 * the R4DIRTY bit.  If a page gets dirtied while the flush
2465 		 * is happening, then R4DIRTY will get set again.  The
2466 		 * R4DIRTY bit must get cleared before the flush so that
2467 		 * we don't lose this information.
2468 		 *
2469 		 * If there are no full file async write operations
2470 		 * pending and RDIRTY bit is set, clear it.
2471 		 */
2472 		if (off == (u_offset_t)0 &&
2473 		    !(flags & B_ASYNC) &&
2474 		    (rp->r_flags & R4DIRTY)) {
2475 			mutex_enter(&rp->r_statelock);
2476 			rdirty = (rp->r_flags & R4DIRTY);
2477 			rp->r_flags &= ~R4DIRTY;
2478 			mutex_exit(&rp->r_statelock);
2479 		} else if (flags & B_ASYNC && off == (u_offset_t)0) {
2480 			mutex_enter(&rp->r_statelock);
2481 			if (rp->r_flags & R4DIRTY && rp->r_awcount == 0) {
2482 				rdirty = (rp->r_flags & R4DIRTY);
2483 				rp->r_flags &= ~R4DIRTY;
2484 			}
2485 			mutex_exit(&rp->r_statelock);
2486 		} else
2487 			rdirty = 0;
2488 
2489 		/*
2490 		 * Search the entire vp list for pages >= off, and flush
2491 		 * the dirty pages.
2492 		 */
2493 		error = pvn_vplist_dirty(vp, off, rp->r_putapage,
2494 		    flags, cr);
2495 
2496 		/*
2497 		 * If an error occurred and the file was marked as dirty
2498 		 * before and we aren't forcibly invalidating pages, then
2499 		 * reset the R4DIRTY flag.
2500 		 */
2501 		if (error && rdirty &&
2502 		    (flags & (B_INVAL | B_FORCE)) != (B_INVAL | B_FORCE)) {
2503 			mutex_enter(&rp->r_statelock);
2504 			rp->r_flags |= R4DIRTY;
2505 			mutex_exit(&rp->r_statelock);
2506 		}
2507 	} else {
2508 		/*
2509 		 * Do a range from [off...off + len) looking for pages
2510 		 * to deal with.
2511 		 */
2512 		error = 0;
2513 		io_len = 0;
2514 		eoff = off + len;
2515 		mutex_enter(&rp->r_statelock);
2516 		for (io_off = off; io_off < eoff && io_off < rp->r_size;
2517 		    io_off += io_len) {
2518 			mutex_exit(&rp->r_statelock);
2519 			/*
2520 			 * If we are not invalidating, synchronously
2521 			 * freeing or writing pages use the routine
2522 			 * page_lookup_nowait() to prevent reclaiming
2523 			 * them from the free list.
2524 			 */
2525 			if ((flags & B_INVAL) || !(flags & B_ASYNC)) {
2526 				pp = page_lookup(vp, io_off,
2527 				    (flags & (B_INVAL | B_FREE)) ?
2528 				    SE_EXCL : SE_SHARED);
2529 			} else {
2530 				pp = page_lookup_nowait(vp, io_off,
2531 				    (flags & B_FREE) ? SE_EXCL : SE_SHARED);
2532 			}
2533 
2534 			if (pp == NULL || !pvn_getdirty(pp, flags))
2535 				io_len = PAGESIZE;
2536 			else {
2537 				err = (*rp->r_putapage)(vp, pp, &io_off,
2538 				    &io_len, flags, cr);
2539 				if (!error)
2540 					error = err;
2541 				/*
2542 				 * "io_off" and "io_len" are returned as
2543 				 * the range of pages we actually wrote.
2544 				 * This allows us to skip ahead more quickly
2545 				 * since several pages may've been dealt
2546 				 * with by this iteration of the loop.
2547 				 */
2548 			}
2549 			mutex_enter(&rp->r_statelock);
2550 		}
2551 		mutex_exit(&rp->r_statelock);
2552 	}
2553 
2554 	return (error);
2555 }
2556 
2557 void
2558 nfs4_invalidate_pages(vnode_t *vp, u_offset_t off, cred_t *cr)
2559 {
2560 	rnode4_t *rp;
2561 
2562 	rp = VTOR4(vp);
2563 	if (IS_SHADOW(vp, rp))
2564 		vp = RTOV4(rp);
2565 	mutex_enter(&rp->r_statelock);
2566 	while (rp->r_flags & R4TRUNCATE)
2567 		cv_wait(&rp->r_cv, &rp->r_statelock);
2568 	rp->r_flags |= R4TRUNCATE;
2569 	if (off == (u_offset_t)0) {
2570 		rp->r_flags &= ~R4DIRTY;
2571 		if (!(rp->r_flags & R4STALE))
2572 			rp->r_error = 0;
2573 	}
2574 	rp->r_truncaddr = off;
2575 	mutex_exit(&rp->r_statelock);
2576 	(void) pvn_vplist_dirty(vp, off, rp->r_putapage,
2577 	    B_INVAL | B_TRUNC, cr);
2578 	mutex_enter(&rp->r_statelock);
2579 	rp->r_flags &= ~R4TRUNCATE;
2580 	cv_broadcast(&rp->r_cv);
2581 	mutex_exit(&rp->r_statelock);
2582 }
2583 
2584 static int
2585 nfs4_mnt_kstat_update(kstat_t *ksp, int rw)
2586 {
2587 	mntinfo4_t *mi;
2588 	struct mntinfo_kstat *mik;
2589 	vfs_t *vfsp;
2590 
2591 	/* this is a read-only kstat. Bail out on a write */
2592 	if (rw == KSTAT_WRITE)
2593 		return (EACCES);
2594 
2595 
2596 	/*
2597 	 * We don't want to wait here as kstat_chain_lock could be held by
2598 	 * dounmount(). dounmount() takes vfs_reflock before the chain lock
2599 	 * and thus could lead to a deadlock.
2600 	 */
2601 	vfsp = (struct vfs *)ksp->ks_private;
2602 
2603 	mi = VFTOMI4(vfsp);
2604 	mik = (struct mntinfo_kstat *)ksp->ks_data;
2605 
2606 	(void) strcpy(mik->mik_proto, mi->mi_curr_serv->sv_knconf->knc_proto);
2607 
2608 	mik->mik_vers = (uint32_t)mi->mi_vers;
2609 	mik->mik_flags = mi->mi_flags;
2610 	/*
2611 	 * The sv_secdata holds the flavor the client specifies.
2612 	 * If the client uses default and a security negotiation
2613 	 * occurs, sv_currsec will point to the current flavor
2614 	 * selected from the server flavor list.
2615 	 * sv_currsec is NULL if no security negotiation takes place.
2616 	 */
2617 	mik->mik_secmod = mi->mi_curr_serv->sv_currsec ?
2618 	    mi->mi_curr_serv->sv_currsec->secmod :
2619 	    mi->mi_curr_serv->sv_secdata->secmod;
2620 	mik->mik_curread = (uint32_t)mi->mi_curread;
2621 	mik->mik_curwrite = (uint32_t)mi->mi_curwrite;
2622 	mik->mik_retrans = mi->mi_retrans;
2623 	mik->mik_timeo = mi->mi_timeo;
2624 	mik->mik_acregmin = HR2SEC(mi->mi_acregmin);
2625 	mik->mik_acregmax = HR2SEC(mi->mi_acregmax);
2626 	mik->mik_acdirmin = HR2SEC(mi->mi_acdirmin);
2627 	mik->mik_acdirmax = HR2SEC(mi->mi_acdirmax);
2628 	mik->mik_noresponse = (uint32_t)mi->mi_noresponse;
2629 	mik->mik_failover = (uint32_t)mi->mi_failover;
2630 	mik->mik_remap = (uint32_t)mi->mi_remap;
2631 
2632 	(void) strcpy(mik->mik_curserver, mi->mi_curr_serv->sv_hostname);
2633 
2634 	return (0);
2635 }
2636 
2637 void
2638 nfs4_mnt_kstat_init(struct vfs *vfsp)
2639 {
2640 	mntinfo4_t *mi = VFTOMI4(vfsp);
2641 
2642 	/*
2643 	 * PSARC 2001/697 Contract Private Interface
2644 	 * All nfs kstats are under SunMC contract
2645 	 * Please refer to the PSARC listed above and contact
2646 	 * SunMC before making any changes!
2647 	 *
2648 	 * Changes must be reviewed by Solaris File Sharing
2649 	 * Changes must be communicated to contract-2001-697@sun.com
2650 	 *
2651 	 */
2652 
2653 	mi->mi_io_kstats = kstat_create_zone("nfs", getminor(vfsp->vfs_dev),
2654 	    NULL, "nfs", KSTAT_TYPE_IO, 1, 0, mi->mi_zone->zone_id);
2655 	if (mi->mi_io_kstats) {
2656 		if (mi->mi_zone->zone_id != GLOBAL_ZONEID)
2657 			kstat_zone_add(mi->mi_io_kstats, GLOBAL_ZONEID);
2658 		mi->mi_io_kstats->ks_lock = &mi->mi_lock;
2659 		kstat_install(mi->mi_io_kstats);
2660 	}
2661 
2662 	if ((mi->mi_ro_kstats = kstat_create_zone("nfs",
2663 	    getminor(vfsp->vfs_dev), "mntinfo", "misc", KSTAT_TYPE_RAW,
2664 	    sizeof (struct mntinfo_kstat), 0, mi->mi_zone->zone_id)) != NULL) {
2665 		if (mi->mi_zone->zone_id != GLOBAL_ZONEID)
2666 			kstat_zone_add(mi->mi_ro_kstats, GLOBAL_ZONEID);
2667 		mi->mi_ro_kstats->ks_update = nfs4_mnt_kstat_update;
2668 		mi->mi_ro_kstats->ks_private = (void *)vfsp;
2669 		kstat_install(mi->mi_ro_kstats);
2670 	}
2671 
2672 	nfs4_mnt_recov_kstat_init(vfsp);
2673 }
2674 
2675 void
2676 nfs4_write_error(vnode_t *vp, int error, cred_t *cr)
2677 {
2678 	mntinfo4_t *mi;
2679 	clock_t now = ddi_get_lbolt();
2680 
2681 	mi = VTOMI4(vp);
2682 	/*
2683 	 * In case of forced unmount, do not print any messages
2684 	 * since it can flood the console with error messages.
2685 	 */
2686 	if (mi->mi_vfsp->vfs_flag & VFS_UNMOUNTED)
2687 		return;
2688 
2689 	/*
2690 	 * If the mount point is dead, not recoverable, do not
2691 	 * print error messages that can flood the console.
2692 	 */
2693 	if (mi->mi_flags & MI4_RECOV_FAIL)
2694 		return;
2695 
2696 	/*
2697 	 * No use in flooding the console with ENOSPC
2698 	 * messages from the same file system.
2699 	 */
2700 	if ((error != ENOSPC && error != EDQUOT) ||
2701 	    now - mi->mi_printftime > 0) {
2702 		zoneid_t zoneid = mi->mi_zone->zone_id;
2703 
2704 #ifdef DEBUG
2705 		nfs_perror(error, "NFS%ld write error on host %s: %m.\n",
2706 		    mi->mi_vers, VTOR4(vp)->r_server->sv_hostname, NULL);
2707 #else
2708 		nfs_perror(error, "NFS write error on host %s: %m.\n",
2709 		    VTOR4(vp)->r_server->sv_hostname, NULL);
2710 #endif
2711 		if (error == ENOSPC || error == EDQUOT) {
2712 			zcmn_err(zoneid, CE_CONT,
2713 			    "^File: userid=%d, groupid=%d\n",
2714 			    crgetuid(cr), crgetgid(cr));
2715 			if (crgetuid(curthread->t_cred) != crgetuid(cr) ||
2716 			    crgetgid(curthread->t_cred) != crgetgid(cr)) {
2717 				zcmn_err(zoneid, CE_CONT,
2718 				    "^User: userid=%d, groupid=%d\n",
2719 				    crgetuid(curthread->t_cred),
2720 				    crgetgid(curthread->t_cred));
2721 			}
2722 			mi->mi_printftime = now +
2723 			    nfs_write_error_interval * hz;
2724 		}
2725 		sfh4_printfhandle(VTOR4(vp)->r_fh);
2726 #ifdef DEBUG
2727 		if (error == EACCES) {
2728 			zcmn_err(zoneid, CE_CONT,
2729 			    "nfs_bio: cred is%s kcred\n",
2730 			    cr == kcred ? "" : " not");
2731 		}
2732 #endif
2733 	}
2734 }
2735 
2736 /*
2737  * Return non-zero if the given file can be safely memory mapped.  Locks
2738  * are safe if whole-file (length and offset are both zero).
2739  */
2740 
2741 #define	SAFE_LOCK(flk)	((flk).l_start == 0 && (flk).l_len == 0)
2742 
2743 static int
2744 nfs4_safemap(const vnode_t *vp)
2745 {
2746 	locklist_t	*llp, *next_llp;
2747 	int		safe = 1;
2748 	rnode4_t	*rp = VTOR4(vp);
2749 
2750 	ASSERT(nfs_rw_lock_held(&rp->r_lkserlock, RW_WRITER));
2751 
2752 	NFS4_DEBUG(nfs4_client_map_debug, (CE_NOTE, "nfs4_safemap: "
2753 	    "vp = %p", (void *)vp));
2754 
2755 	/*
2756 	 * Review all the locks for the vnode, both ones that have been
2757 	 * acquired and ones that are pending.  We assume that
2758 	 * flk_active_locks_for_vp() has merged any locks that can be
2759 	 * merged (so that if a process has the entire file locked, it is
2760 	 * represented as a single lock).
2761 	 *
2762 	 * Note that we can't bail out of the loop if we find a non-safe
2763 	 * lock, because we have to free all the elements in the llp list.
2764 	 * We might be able to speed up this code slightly by not looking
2765 	 * at each lock's l_start and l_len fields once we've found a
2766 	 * non-safe lock.
2767 	 */
2768 
2769 	llp = flk_active_locks_for_vp(vp);
2770 	while (llp) {
2771 		NFS4_DEBUG(nfs4_client_map_debug, (CE_NOTE,
2772 		    "nfs4_safemap: active lock (%" PRId64 ", %" PRId64 ")",
2773 		    llp->ll_flock.l_start, llp->ll_flock.l_len));
2774 		if (!SAFE_LOCK(llp->ll_flock)) {
2775 			safe = 0;
2776 			NFS4_DEBUG(nfs4_client_map_debug, (CE_NOTE,
2777 			    "nfs4_safemap: unsafe active lock (%" PRId64
2778 			    ", %" PRId64 ")", llp->ll_flock.l_start,
2779 			    llp->ll_flock.l_len));
2780 		}
2781 		next_llp = llp->ll_next;
2782 		VN_RELE(llp->ll_vp);
2783 		kmem_free(llp, sizeof (*llp));
2784 		llp = next_llp;
2785 	}
2786 
2787 	NFS4_DEBUG(nfs4_client_map_debug, (CE_NOTE, "nfs4_safemap: %s",
2788 	    safe ? "safe" : "unsafe"));
2789 	return (safe);
2790 }
2791 
2792 /*
2793  * Return whether there is a lost LOCK or LOCKU queued up for the given
2794  * file that would make an mmap request unsafe.  cf. nfs4_safemap().
2795  */
2796 
2797 bool_t
2798 nfs4_map_lost_lock_conflict(vnode_t *vp)
2799 {
2800 	bool_t conflict = FALSE;
2801 	nfs4_lost_rqst_t *lrp;
2802 	mntinfo4_t *mi = VTOMI4(vp);
2803 
2804 	mutex_enter(&mi->mi_lock);
2805 	for (lrp = list_head(&mi->mi_lost_state); lrp != NULL;
2806 	    lrp = list_next(&mi->mi_lost_state, lrp)) {
2807 		if (lrp->lr_op != OP_LOCK && lrp->lr_op != OP_LOCKU)
2808 			continue;
2809 		ASSERT(lrp->lr_vp != NULL);
2810 		if (!VOP_CMP(lrp->lr_vp, vp, NULL))
2811 			continue;	/* different file */
2812 		if (!SAFE_LOCK(*lrp->lr_flk)) {
2813 			conflict = TRUE;
2814 			break;
2815 		}
2816 	}
2817 
2818 	mutex_exit(&mi->mi_lock);
2819 	return (conflict);
2820 }
2821 
2822 /*
2823  * nfs_lockcompletion:
2824  *
2825  * If the vnode has a lock that makes it unsafe to cache the file, mark it
2826  * as non cachable (set VNOCACHE bit).
2827  */
2828 
2829 void
2830 nfs4_lockcompletion(vnode_t *vp, int cmd)
2831 {
2832 	rnode4_t *rp = VTOR4(vp);
2833 
2834 	ASSERT(nfs_rw_lock_held(&rp->r_lkserlock, RW_WRITER));
2835 	ASSERT(!IS_SHADOW(vp, rp));
2836 
2837 	if (cmd == F_SETLK || cmd == F_SETLKW) {
2838 
2839 		if (!nfs4_safemap(vp)) {
2840 			mutex_enter(&vp->v_lock);
2841 			vp->v_flag |= VNOCACHE;
2842 			mutex_exit(&vp->v_lock);
2843 		} else {
2844 			mutex_enter(&vp->v_lock);
2845 			vp->v_flag &= ~VNOCACHE;
2846 			mutex_exit(&vp->v_lock);
2847 		}
2848 	}
2849 	/*
2850 	 * The cached attributes of the file are stale after acquiring
2851 	 * the lock on the file. They were updated when the file was
2852 	 * opened, but not updated when the lock was acquired. Therefore the
2853 	 * cached attributes are invalidated after the lock is obtained.
2854 	 */
2855 	PURGE_ATTRCACHE4(vp);
2856 }
2857 
2858 /* ARGSUSED */
2859 static void *
2860 nfs4_mi_init(zoneid_t zoneid)
2861 {
2862 	struct mi4_globals *mig;
2863 
2864 	mig = kmem_alloc(sizeof (*mig), KM_SLEEP);
2865 	mutex_init(&mig->mig_lock, NULL, MUTEX_DEFAULT, NULL);
2866 	list_create(&mig->mig_list, sizeof (mntinfo4_t),
2867 	    offsetof(mntinfo4_t, mi_zone_node));
2868 	mig->mig_destructor_called = B_FALSE;
2869 	return (mig);
2870 }
2871 
2872 /*
2873  * Callback routine to tell all NFSv4 mounts in the zone to start tearing down
2874  * state and killing off threads.
2875  */
2876 /* ARGSUSED */
2877 static void
2878 nfs4_mi_shutdown(zoneid_t zoneid, void *data)
2879 {
2880 	struct mi4_globals *mig = data;
2881 	mntinfo4_t *mi;
2882 	nfs4_server_t *np;
2883 
2884 	NFS4_DEBUG(nfs4_client_zone_debug, (CE_NOTE,
2885 	    "nfs4_mi_shutdown zone %d\n", zoneid));
2886 	ASSERT(mig != NULL);
2887 	for (;;) {
2888 		mutex_enter(&mig->mig_lock);
2889 		mi = list_head(&mig->mig_list);
2890 		if (mi == NULL) {
2891 			mutex_exit(&mig->mig_lock);
2892 			break;
2893 		}
2894 
2895 		NFS4_DEBUG(nfs4_client_zone_debug, (CE_NOTE,
2896 		    "nfs4_mi_shutdown stopping vfs %p\n", (void *)mi->mi_vfsp));
2897 		/*
2898 		 * purge the DNLC for this filesystem
2899 		 */
2900 		(void) dnlc_purge_vfsp(mi->mi_vfsp, 0);
2901 		/*
2902 		 * Tell existing async worker threads to exit.
2903 		 */
2904 		mutex_enter(&mi->mi_async_lock);
2905 		mi->mi_max_threads = 0;
2906 		NFS4_WAKEALL_ASYNC_WORKERS(mi->mi_async_work_cv);
2907 		/*
2908 		 * Set the appropriate flags, signal and wait for both the
2909 		 * async manager and the inactive thread to exit when they're
2910 		 * done with their current work.
2911 		 */
2912 		mutex_enter(&mi->mi_lock);
2913 		mi->mi_flags |= (MI4_ASYNC_MGR_STOP|MI4_DEAD);
2914 		mutex_exit(&mi->mi_lock);
2915 		mutex_exit(&mi->mi_async_lock);
2916 		if (mi->mi_manager_thread) {
2917 			nfs4_async_manager_stop(mi->mi_vfsp);
2918 		}
2919 		if (mi->mi_inactive_thread) {
2920 			mutex_enter(&mi->mi_async_lock);
2921 			cv_signal(&mi->mi_inact_req_cv);
2922 			/*
2923 			 * Wait for the inactive thread to exit.
2924 			 */
2925 			while (mi->mi_inactive_thread != NULL) {
2926 				cv_wait(&mi->mi_async_cv, &mi->mi_async_lock);
2927 			}
2928 			mutex_exit(&mi->mi_async_lock);
2929 		}
2930 		/*
2931 		 * Wait for the recovery thread to complete, that is, it will
2932 		 * signal when it is done using the "mi" structure and about
2933 		 * to exit
2934 		 */
2935 		mutex_enter(&mi->mi_lock);
2936 		while (mi->mi_in_recovery > 0)
2937 			cv_wait(&mi->mi_cv_in_recov, &mi->mi_lock);
2938 		mutex_exit(&mi->mi_lock);
2939 		/*
2940 		 * We're done when every mi has been done or the list is empty.
2941 		 * This one is done, remove it from the list.
2942 		 */
2943 		list_remove(&mig->mig_list, mi);
2944 		mutex_exit(&mig->mig_lock);
2945 		zone_rele_ref(&mi->mi_zone_ref, ZONE_REF_NFSV4);
2946 
2947 		/*
2948 		 * Release hold on vfs and mi done to prevent race with zone
2949 		 * shutdown. This releases the hold in nfs4_mi_zonelist_add.
2950 		 */
2951 		VFS_RELE(mi->mi_vfsp);
2952 		MI4_RELE(mi);
2953 	}
2954 	/*
2955 	 * Tell each renew thread in the zone to exit
2956 	 */
2957 	mutex_enter(&nfs4_server_lst_lock);
2958 	for (np = nfs4_server_lst.forw; np != &nfs4_server_lst; np = np->forw) {
2959 		mutex_enter(&np->s_lock);
2960 		if (np->zoneid == zoneid) {
2961 			/*
2962 			 * We add another hold onto the nfs4_server_t
2963 			 * because this will make sure tha the nfs4_server_t
2964 			 * stays around until nfs4_callback_fini_zone destroys
2965 			 * the zone. This way, the renew thread can
2966 			 * unconditionally release its holds on the
2967 			 * nfs4_server_t.
2968 			 */
2969 			np->s_refcnt++;
2970 			nfs4_mark_srv_dead(np);
2971 		}
2972 		mutex_exit(&np->s_lock);
2973 	}
2974 	mutex_exit(&nfs4_server_lst_lock);
2975 }
2976 
2977 static void
2978 nfs4_mi_free_globals(struct mi4_globals *mig)
2979 {
2980 	list_destroy(&mig->mig_list);	/* makes sure the list is empty */
2981 	mutex_destroy(&mig->mig_lock);
2982 	kmem_free(mig, sizeof (*mig));
2983 }
2984 
2985 /* ARGSUSED */
2986 static void
2987 nfs4_mi_destroy(zoneid_t zoneid, void *data)
2988 {
2989 	struct mi4_globals *mig = data;
2990 
2991 	NFS4_DEBUG(nfs4_client_zone_debug, (CE_NOTE,
2992 	    "nfs4_mi_destroy zone %d\n", zoneid));
2993 	ASSERT(mig != NULL);
2994 	mutex_enter(&mig->mig_lock);
2995 	if (list_head(&mig->mig_list) != NULL) {
2996 		/* Still waiting for VFS_FREEVFS() */
2997 		mig->mig_destructor_called = B_TRUE;
2998 		mutex_exit(&mig->mig_lock);
2999 		return;
3000 	}
3001 	nfs4_mi_free_globals(mig);
3002 }
3003 
3004 /*
3005  * Add an NFS mount to the per-zone list of NFS mounts.
3006  */
3007 void
3008 nfs4_mi_zonelist_add(mntinfo4_t *mi)
3009 {
3010 	struct mi4_globals *mig;
3011 
3012 	mig = zone_getspecific(mi4_list_key, mi->mi_zone);
3013 	mutex_enter(&mig->mig_lock);
3014 	list_insert_head(&mig->mig_list, mi);
3015 	/*
3016 	 * hold added to eliminate race with zone shutdown -this will be
3017 	 * released in mi_shutdown
3018 	 */
3019 	MI4_HOLD(mi);
3020 	VFS_HOLD(mi->mi_vfsp);
3021 	mutex_exit(&mig->mig_lock);
3022 }
3023 
3024 /*
3025  * Remove an NFS mount from the per-zone list of NFS mounts.
3026  */
3027 int
3028 nfs4_mi_zonelist_remove(mntinfo4_t *mi)
3029 {
3030 	struct mi4_globals *mig;
3031 	int ret = 0;
3032 
3033 	mig = zone_getspecific(mi4_list_key, mi->mi_zone);
3034 	mutex_enter(&mig->mig_lock);
3035 	mutex_enter(&mi->mi_lock);
3036 	/* if this mi is marked dead, then the zone already released it */
3037 	if (!(mi->mi_flags & MI4_DEAD)) {
3038 		list_remove(&mig->mig_list, mi);
3039 		mutex_exit(&mi->mi_lock);
3040 
3041 		/* release the holds put on in zonelist_add(). */
3042 		VFS_RELE(mi->mi_vfsp);
3043 		MI4_RELE(mi);
3044 		ret = 1;
3045 	} else {
3046 		mutex_exit(&mi->mi_lock);
3047 	}
3048 
3049 	/*
3050 	 * We can be called asynchronously by VFS_FREEVFS() after the zone
3051 	 * shutdown/destroy callbacks have executed; if so, clean up the zone's
3052 	 * mi globals.
3053 	 */
3054 	if (list_head(&mig->mig_list) == NULL &&
3055 	    mig->mig_destructor_called == B_TRUE) {
3056 		nfs4_mi_free_globals(mig);
3057 		return (ret);
3058 	}
3059 	mutex_exit(&mig->mig_lock);
3060 	return (ret);
3061 }
3062 
3063 void
3064 nfs_free_mi4(mntinfo4_t *mi)
3065 {
3066 	nfs4_open_owner_t	*foop;
3067 	nfs4_oo_hash_bucket_t   *bucketp;
3068 	nfs4_debug_msg_t	*msgp;
3069 	int i;
3070 	servinfo4_t 		*svp;
3071 
3072 	/*
3073 	 * Code introduced here should be carefully evaluated to make
3074 	 * sure none of the freed resources are accessed either directly
3075 	 * or indirectly after freeing them. For eg: Introducing calls to
3076 	 * NFS4_DEBUG that use mntinfo4_t structure member after freeing
3077 	 * the structure members or other routines calling back into NFS
3078 	 * accessing freed mntinfo4_t structure member.
3079 	 */
3080 	mutex_enter(&mi->mi_lock);
3081 	ASSERT(mi->mi_recovthread == NULL);
3082 	ASSERT(mi->mi_flags & MI4_ASYNC_MGR_STOP);
3083 	mutex_exit(&mi->mi_lock);
3084 	mutex_enter(&mi->mi_async_lock);
3085 	ASSERT(mi->mi_threads[NFS4_ASYNC_QUEUE] == 0 &&
3086 	    mi->mi_threads[NFS4_ASYNC_PGOPS_QUEUE] == 0);
3087 	ASSERT(mi->mi_manager_thread == NULL);
3088 	mutex_exit(&mi->mi_async_lock);
3089 	if (mi->mi_io_kstats) {
3090 		kstat_delete(mi->mi_io_kstats);
3091 		mi->mi_io_kstats = NULL;
3092 	}
3093 	if (mi->mi_ro_kstats) {
3094 		kstat_delete(mi->mi_ro_kstats);
3095 		mi->mi_ro_kstats = NULL;
3096 	}
3097 	if (mi->mi_recov_ksp) {
3098 		kstat_delete(mi->mi_recov_ksp);
3099 		mi->mi_recov_ksp = NULL;
3100 	}
3101 	mutex_enter(&mi->mi_msg_list_lock);
3102 	while (msgp = list_head(&mi->mi_msg_list)) {
3103 		list_remove(&mi->mi_msg_list, msgp);
3104 		nfs4_free_msg(msgp);
3105 	}
3106 	mutex_exit(&mi->mi_msg_list_lock);
3107 	list_destroy(&mi->mi_msg_list);
3108 	if (mi->mi_fname != NULL)
3109 		fn_rele(&mi->mi_fname);
3110 	if (mi->mi_rootfh != NULL)
3111 		sfh4_rele(&mi->mi_rootfh);
3112 	if (mi->mi_srvparentfh != NULL)
3113 		sfh4_rele(&mi->mi_srvparentfh);
3114 	svp = mi->mi_servers;
3115 	sv4_free(svp);
3116 	mutex_destroy(&mi->mi_lock);
3117 	mutex_destroy(&mi->mi_async_lock);
3118 	mutex_destroy(&mi->mi_msg_list_lock);
3119 	mutex_destroy(&mi->mi_rnodes_lock);
3120 	nfs_rw_destroy(&mi->mi_recovlock);
3121 	nfs_rw_destroy(&mi->mi_rename_lock);
3122 	nfs_rw_destroy(&mi->mi_fh_lock);
3123 	cv_destroy(&mi->mi_failover_cv);
3124 	cv_destroy(&mi->mi_async_reqs_cv);
3125 	cv_destroy(&mi->mi_async_work_cv[NFS4_ASYNC_QUEUE]);
3126 	cv_destroy(&mi->mi_async_work_cv[NFS4_ASYNC_PGOPS_QUEUE]);
3127 	cv_destroy(&mi->mi_async_cv);
3128 	cv_destroy(&mi->mi_inact_req_cv);
3129 	/*
3130 	 * Destroy the oo hash lists and mutexes for the cred hash table.
3131 	 */
3132 	for (i = 0; i < NFS4_NUM_OO_BUCKETS; i++) {
3133 		bucketp = &(mi->mi_oo_list[i]);
3134 		/* Destroy any remaining open owners on the list */
3135 		foop = list_head(&bucketp->b_oo_hash_list);
3136 		while (foop != NULL) {
3137 			list_remove(&bucketp->b_oo_hash_list, foop);
3138 			nfs4_destroy_open_owner(foop);
3139 			foop = list_head(&bucketp->b_oo_hash_list);
3140 		}
3141 		list_destroy(&bucketp->b_oo_hash_list);
3142 		mutex_destroy(&bucketp->b_lock);
3143 	}
3144 	/*
3145 	 * Empty and destroy the freed open owner list.
3146 	 */
3147 	foop = list_head(&mi->mi_foo_list);
3148 	while (foop != NULL) {
3149 		list_remove(&mi->mi_foo_list, foop);
3150 		nfs4_destroy_open_owner(foop);
3151 		foop = list_head(&mi->mi_foo_list);
3152 	}
3153 	list_destroy(&mi->mi_foo_list);
3154 	list_destroy(&mi->mi_bseqid_list);
3155 	list_destroy(&mi->mi_lost_state);
3156 	list_destroy(&mi->mi_rnodes);
3157 	avl_destroy(&mi->mi_filehandles);
3158 	kmem_free(mi, sizeof (*mi));
3159 }
3160 void
3161 mi_hold(mntinfo4_t *mi)
3162 {
3163 	atomic_inc_32(&mi->mi_count);
3164 	ASSERT(mi->mi_count != 0);
3165 }
3166 
3167 void
3168 mi_rele(mntinfo4_t *mi)
3169 {
3170 	ASSERT(mi->mi_count != 0);
3171 	if (atomic_dec_32_nv(&mi->mi_count) == 0) {
3172 		nfs_free_mi4(mi);
3173 	}
3174 }
3175 
3176 vnode_t    nfs4_xattr_notsupp_vnode;
3177 
3178 void
3179 nfs4_clnt_init(void)
3180 {
3181 	nfs4_vnops_init();
3182 	(void) nfs4_rnode_init();
3183 	(void) nfs4_shadow_init();
3184 	(void) nfs4_acache_init();
3185 	(void) nfs4_subr_init();
3186 	nfs4_acl_init();
3187 	nfs_idmap_init();
3188 	nfs4_callback_init();
3189 	nfs4_secinfo_init();
3190 #ifdef	DEBUG
3191 	tsd_create(&nfs4_tsd_key, NULL);
3192 #endif
3193 
3194 	/*
3195 	 * Add a CPR callback so that we can update client
3196 	 * lease after a suspend and resume.
3197 	 */
3198 	cid = callb_add(nfs4_client_cpr_callb, 0, CB_CL_CPR_RPC, "nfs4");
3199 
3200 	zone_key_create(&mi4_list_key, nfs4_mi_init, nfs4_mi_shutdown,
3201 	    nfs4_mi_destroy);
3202 
3203 	/*
3204 	 * Initialize the reference count of the notsupp xattr cache vnode to 1
3205 	 * so that it never goes away (VOP_INACTIVE isn't called on it).
3206 	 */
3207 	vn_reinit(&nfs4_xattr_notsupp_vnode);
3208 }
3209 
3210 void
3211 nfs4_clnt_fini(void)
3212 {
3213 	(void) zone_key_delete(mi4_list_key);
3214 	nfs4_vnops_fini();
3215 	(void) nfs4_rnode_fini();
3216 	(void) nfs4_shadow_fini();
3217 	(void) nfs4_acache_fini();
3218 	(void) nfs4_subr_fini();
3219 	nfs_idmap_fini();
3220 	nfs4_callback_fini();
3221 	nfs4_secinfo_fini();
3222 #ifdef	DEBUG
3223 	tsd_destroy(&nfs4_tsd_key);
3224 #endif
3225 	if (cid)
3226 		(void) callb_delete(cid);
3227 }
3228 
3229 /*ARGSUSED*/
3230 static boolean_t
3231 nfs4_client_cpr_callb(void *arg, int code)
3232 {
3233 	/*
3234 	 * We get called for Suspend and Resume events.
3235 	 * For the suspend case we simply don't care!
3236 	 */
3237 	if (code == CB_CODE_CPR_CHKPT) {
3238 		return (B_TRUE);
3239 	}
3240 
3241 	/*
3242 	 * When we get to here we are in the process of
3243 	 * resuming the system from a previous suspend.
3244 	 */
3245 	nfs4_client_resumed = gethrestime_sec();
3246 	return (B_TRUE);
3247 }
3248 
3249 void
3250 nfs4_renew_lease_thread(nfs4_server_t *sp)
3251 {
3252 	int	error = 0;
3253 	time_t	tmp_last_renewal_time, tmp_time, tmp_now_time, kip_secs;
3254 	clock_t	tick_delay = 0;
3255 	clock_t time_left = 0;
3256 	callb_cpr_t cpr_info;
3257 	kmutex_t cpr_lock;
3258 
3259 	NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE,
3260 	    "nfs4_renew_lease_thread: acting on sp 0x%p", (void*)sp));
3261 	mutex_init(&cpr_lock, NULL, MUTEX_DEFAULT, NULL);
3262 	CALLB_CPR_INIT(&cpr_info, &cpr_lock, callb_generic_cpr, "nfsv4Lease");
3263 
3264 	mutex_enter(&sp->s_lock);
3265 	/* sp->s_lease_time is set via a GETATTR */
3266 	sp->last_renewal_time = gethrestime_sec();
3267 	sp->lease_valid = NFS4_LEASE_UNINITIALIZED;
3268 	ASSERT(sp->s_refcnt >= 1);
3269 
3270 	for (;;) {
3271 		if (!sp->state_ref_count ||
3272 		    sp->lease_valid != NFS4_LEASE_VALID) {
3273 
3274 			kip_secs = MAX((sp->s_lease_time >> 1) -
3275 			    (3 * sp->propagation_delay.tv_sec), 1);
3276 
3277 			tick_delay = SEC_TO_TICK(kip_secs);
3278 
3279 			NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE,
3280 			    "nfs4_renew_lease_thread: no renew : thread "
3281 			    "wait %ld secs", kip_secs));
3282 
3283 			NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE,
3284 			    "nfs4_renew_lease_thread: no renew : "
3285 			    "state_ref_count %d, lease_valid %d",
3286 			    sp->state_ref_count, sp->lease_valid));
3287 
3288 			mutex_enter(&cpr_lock);
3289 			CALLB_CPR_SAFE_BEGIN(&cpr_info);
3290 			mutex_exit(&cpr_lock);
3291 			time_left = cv_reltimedwait(&sp->cv_thread_exit,
3292 			    &sp->s_lock, tick_delay, TR_CLOCK_TICK);
3293 			mutex_enter(&cpr_lock);
3294 			CALLB_CPR_SAFE_END(&cpr_info, &cpr_lock);
3295 			mutex_exit(&cpr_lock);
3296 
3297 			NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE,
3298 			    "nfs4_renew_lease_thread: no renew: "
3299 			    "time left %ld", time_left));
3300 
3301 			if (sp->s_thread_exit == NFS4_THREAD_EXIT)
3302 				goto die;
3303 			continue;
3304 		}
3305 
3306 		tmp_last_renewal_time = sp->last_renewal_time;
3307 
3308 		tmp_time = gethrestime_sec() - sp->last_renewal_time +
3309 		    (3 * sp->propagation_delay.tv_sec);
3310 
3311 		NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE,
3312 		    "nfs4_renew_lease_thread: tmp_time %ld, "
3313 		    "sp->last_renewal_time %ld", tmp_time,
3314 		    sp->last_renewal_time));
3315 
3316 		kip_secs = MAX((sp->s_lease_time >> 1) - tmp_time, 1);
3317 
3318 		tick_delay = SEC_TO_TICK(kip_secs);
3319 
3320 		NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE,
3321 		    "nfs4_renew_lease_thread: valid lease: sleep for %ld "
3322 		    "secs", kip_secs));
3323 
3324 		mutex_enter(&cpr_lock);
3325 		CALLB_CPR_SAFE_BEGIN(&cpr_info);
3326 		mutex_exit(&cpr_lock);
3327 		time_left = cv_reltimedwait(&sp->cv_thread_exit, &sp->s_lock,
3328 		    tick_delay, TR_CLOCK_TICK);
3329 		mutex_enter(&cpr_lock);
3330 		CALLB_CPR_SAFE_END(&cpr_info, &cpr_lock);
3331 		mutex_exit(&cpr_lock);
3332 
3333 		NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE,
3334 		    "nfs4_renew_lease_thread: valid lease: time left %ld :"
3335 		    "sp last_renewal_time %ld, nfs4_client_resumed %ld, "
3336 		    "tmp_last_renewal_time %ld", time_left,
3337 		    sp->last_renewal_time, nfs4_client_resumed,
3338 		    tmp_last_renewal_time));
3339 
3340 		if (sp->s_thread_exit == NFS4_THREAD_EXIT)
3341 			goto die;
3342 
3343 		if (tmp_last_renewal_time == sp->last_renewal_time ||
3344 		    (nfs4_client_resumed != 0 &&
3345 		    nfs4_client_resumed > sp->last_renewal_time)) {
3346 			/*
3347 			 * Issue RENEW op since we haven't renewed the lease
3348 			 * since we slept.
3349 			 */
3350 			tmp_now_time = gethrestime_sec();
3351 			error = nfs4renew(sp);
3352 			/*
3353 			 * Need to re-acquire sp's lock, nfs4renew()
3354 			 * relinqueshes it.
3355 			 */
3356 			mutex_enter(&sp->s_lock);
3357 
3358 			/*
3359 			 * See if someone changed s_thread_exit while we gave
3360 			 * up s_lock.
3361 			 */
3362 			if (sp->s_thread_exit == NFS4_THREAD_EXIT)
3363 				goto die;
3364 
3365 			if (!error) {
3366 				/*
3367 				 * check to see if we implicitly renewed while
3368 				 * we waited for a reply for our RENEW call.
3369 				 */
3370 				if (tmp_last_renewal_time ==
3371 				    sp->last_renewal_time) {
3372 					/* no implicit renew came */
3373 					sp->last_renewal_time = tmp_now_time;
3374 				} else {
3375 					NFS4_DEBUG(nfs4_client_lease_debug,
3376 					    (CE_NOTE, "renew_thread: did "
3377 					    "implicit renewal before reply "
3378 					    "from server for RENEW"));
3379 				}
3380 			} else {
3381 				/* figure out error */
3382 				NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE,
3383 				    "renew_thread: nfs4renew returned error"
3384 				    " %d", error));
3385 			}
3386 
3387 		}
3388 	}
3389 
3390 die:
3391 	NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE,
3392 	    "nfs4_renew_lease_thread: thread exiting"));
3393 
3394 	while (sp->s_otw_call_count != 0) {
3395 		NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE,
3396 		    "nfs4_renew_lease_thread: waiting for outstanding "
3397 		    "otw calls to finish for sp 0x%p, current "
3398 		    "s_otw_call_count %d", (void *)sp,
3399 		    sp->s_otw_call_count));
3400 		mutex_enter(&cpr_lock);
3401 		CALLB_CPR_SAFE_BEGIN(&cpr_info);
3402 		mutex_exit(&cpr_lock);
3403 		cv_wait(&sp->s_cv_otw_count, &sp->s_lock);
3404 		mutex_enter(&cpr_lock);
3405 		CALLB_CPR_SAFE_END(&cpr_info, &cpr_lock);
3406 		mutex_exit(&cpr_lock);
3407 	}
3408 	mutex_exit(&sp->s_lock);
3409 
3410 	nfs4_server_rele(sp);		/* free the thread's reference */
3411 	nfs4_server_rele(sp);		/* free the list's reference */
3412 	sp = NULL;
3413 
3414 done:
3415 	mutex_enter(&cpr_lock);
3416 	CALLB_CPR_EXIT(&cpr_info);	/* drops cpr_lock */
3417 	mutex_destroy(&cpr_lock);
3418 
3419 	NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE,
3420 	    "nfs4_renew_lease_thread: renew thread exit officially"));
3421 
3422 	zthread_exit();
3423 	/* NOT REACHED */
3424 }
3425 
3426 /*
3427  * Send out a RENEW op to the server.
3428  * Assumes sp is locked down.
3429  */
3430 static int
3431 nfs4renew(nfs4_server_t *sp)
3432 {
3433 	COMPOUND4args_clnt args;
3434 	COMPOUND4res_clnt res;
3435 	nfs_argop4 argop[1];
3436 	int doqueue = 1;
3437 	int rpc_error;
3438 	cred_t *cr;
3439 	mntinfo4_t *mi;
3440 	timespec_t prop_time, after_time;
3441 	int needrecov = FALSE;
3442 	nfs4_recov_state_t recov_state;
3443 	nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };
3444 
3445 	NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE, "nfs4renew"));
3446 
3447 	recov_state.rs_flags = 0;
3448 	recov_state.rs_num_retry_despite_err = 0;
3449 
3450 recov_retry:
3451 	mi = sp->mntinfo4_list;
3452 	VFS_HOLD(mi->mi_vfsp);
3453 	mutex_exit(&sp->s_lock);
3454 	ASSERT(mi != NULL);
3455 
3456 	e.error = nfs4_start_op(mi, NULL, NULL, &recov_state);
3457 	if (e.error) {
3458 		VFS_RELE(mi->mi_vfsp);
3459 		return (e.error);
3460 	}
3461 
3462 	/* Check to see if we're dealing with a marked-dead sp */
3463 	mutex_enter(&sp->s_lock);
3464 	if (sp->s_thread_exit == NFS4_THREAD_EXIT) {
3465 		mutex_exit(&sp->s_lock);
3466 		nfs4_end_op(mi, NULL, NULL, &recov_state, needrecov);
3467 		VFS_RELE(mi->mi_vfsp);
3468 		return (0);
3469 	}
3470 
3471 	/* Make sure mi hasn't changed on us */
3472 	if (mi != sp->mntinfo4_list) {
3473 		/* Must drop sp's lock to avoid a recursive mutex enter */
3474 		mutex_exit(&sp->s_lock);
3475 		nfs4_end_op(mi, NULL, NULL, &recov_state, needrecov);
3476 		VFS_RELE(mi->mi_vfsp);
3477 		mutex_enter(&sp->s_lock);
3478 		goto recov_retry;
3479 	}
3480 	mutex_exit(&sp->s_lock);
3481 
3482 	args.ctag = TAG_RENEW;
3483 
3484 	args.array_len = 1;
3485 	args.array = argop;
3486 
3487 	argop[0].argop = OP_RENEW;
3488 
3489 	mutex_enter(&sp->s_lock);
3490 	argop[0].nfs_argop4_u.oprenew.clientid = sp->clientid;
3491 	cr = sp->s_cred;
3492 	crhold(cr);
3493 	mutex_exit(&sp->s_lock);
3494 
3495 	ASSERT(cr != NULL);
3496 
3497 	/* used to figure out RTT for sp */
3498 	gethrestime(&prop_time);
3499 
3500 	NFS4_DEBUG(nfs4_client_call_debug, (CE_NOTE,
3501 	    "nfs4renew: %s call, sp 0x%p", needrecov ? "recov" : "first",
3502 	    (void*)sp));
3503 	NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE, "before: %ld s %ld ns ",
3504 	    prop_time.tv_sec, prop_time.tv_nsec));
3505 
3506 	DTRACE_PROBE2(nfs4__renew__start, nfs4_server_t *, sp,
3507 	    mntinfo4_t *, mi);
3508 
3509 	rfs4call(mi, &args, &res, cr, &doqueue, 0, &e);
3510 	crfree(cr);
3511 
3512 	DTRACE_PROBE2(nfs4__renew__end, nfs4_server_t *, sp,
3513 	    mntinfo4_t *, mi);
3514 
3515 	gethrestime(&after_time);
3516 
3517 	mutex_enter(&sp->s_lock);
3518 	sp->propagation_delay.tv_sec =
3519 	    MAX(1, after_time.tv_sec - prop_time.tv_sec);
3520 	mutex_exit(&sp->s_lock);
3521 
3522 	NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE, "after : %ld s %ld ns ",
3523 	    after_time.tv_sec, after_time.tv_nsec));
3524 
3525 	if (e.error == 0 && res.status == NFS4ERR_CB_PATH_DOWN) {
3526 		xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
3527 		nfs4_delegreturn_all(sp);
3528 		nfs4_end_op(mi, NULL, NULL, &recov_state, needrecov);
3529 		VFS_RELE(mi->mi_vfsp);
3530 		/*
3531 		 * If the server returns CB_PATH_DOWN, it has renewed
3532 		 * the lease and informed us that the callback path is
3533 		 * down.  Since the lease is renewed, just return 0 and
3534 		 * let the renew thread proceed as normal.
3535 		 */
3536 		return (0);
3537 	}
3538 
3539 	needrecov = nfs4_needs_recovery(&e, FALSE, mi->mi_vfsp);
3540 	if (!needrecov && e.error) {
3541 		nfs4_end_op(mi, NULL, NULL, &recov_state, needrecov);
3542 		VFS_RELE(mi->mi_vfsp);
3543 		return (e.error);
3544 	}
3545 
3546 	rpc_error = e.error;
3547 
3548 	if (needrecov) {
3549 		NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
3550 		    "nfs4renew: initiating recovery\n"));
3551 
3552 		if (nfs4_start_recovery(&e, mi, NULL, NULL, NULL, NULL,
3553 		    OP_RENEW, NULL, NULL, NULL) == FALSE) {
3554 			nfs4_end_op(mi, NULL, NULL, &recov_state, needrecov);
3555 			VFS_RELE(mi->mi_vfsp);
3556 			if (!e.error)
3557 				xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
3558 			mutex_enter(&sp->s_lock);
3559 			goto recov_retry;
3560 		}
3561 		/* fall through for res.status case */
3562 	}
3563 
3564 	if (res.status) {
3565 		if (res.status == NFS4ERR_LEASE_MOVED) {
3566 			/*EMPTY*/
3567 			/*
3568 			 * XXX need to try every mntinfo4 in sp->mntinfo4_list
3569 			 * to renew the lease on that server
3570 			 */
3571 		}
3572 		e.error = geterrno4(res.status);
3573 	}
3574 
3575 	if (!rpc_error)
3576 		xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
3577 
3578 	nfs4_end_op(mi, NULL, NULL, &recov_state, needrecov);
3579 
3580 	VFS_RELE(mi->mi_vfsp);
3581 
3582 	return (e.error);
3583 }
3584 
3585 void
3586 nfs4_inc_state_ref_count(mntinfo4_t *mi)
3587 {
3588 	nfs4_server_t	*sp;
3589 
3590 	/* this locks down sp if it is found */
3591 	sp = find_nfs4_server(mi);
3592 
3593 	if (sp != NULL) {
3594 		nfs4_inc_state_ref_count_nolock(sp, mi);
3595 		mutex_exit(&sp->s_lock);
3596 		nfs4_server_rele(sp);
3597 	}
3598 }
3599 
3600 /*
3601  * Bump the number of OPEN files (ie: those with state) so we know if this
3602  * nfs4_server has any state to maintain a lease for or not.
3603  *
3604  * Also, marks the nfs4_server's lease valid if it hasn't been done so already.
3605  */
3606 void
3607 nfs4_inc_state_ref_count_nolock(nfs4_server_t *sp, mntinfo4_t *mi)
3608 {
3609 	ASSERT(mutex_owned(&sp->s_lock));
3610 
3611 	sp->state_ref_count++;
3612 	NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE,
3613 	    "nfs4_inc_state_ref_count: state_ref_count now %d",
3614 	    sp->state_ref_count));
3615 
3616 	if (sp->lease_valid == NFS4_LEASE_UNINITIALIZED)
3617 		sp->lease_valid = NFS4_LEASE_VALID;
3618 
3619 	/*
3620 	 * If this call caused the lease to be marked valid and/or
3621 	 * took the state_ref_count from 0 to 1, then start the time
3622 	 * on lease renewal.
3623 	 */
3624 	if (sp->lease_valid == NFS4_LEASE_VALID && sp->state_ref_count == 1)
3625 		sp->last_renewal_time = gethrestime_sec();
3626 
3627 	/* update the number of open files for mi */
3628 	mi->mi_open_files++;
3629 }
3630 
3631 void
3632 nfs4_dec_state_ref_count(mntinfo4_t *mi)
3633 {
3634 	nfs4_server_t	*sp;
3635 
3636 	/* this locks down sp if it is found */
3637 	sp = find_nfs4_server_all(mi, 1);
3638 
3639 	if (sp != NULL) {
3640 		nfs4_dec_state_ref_count_nolock(sp, mi);
3641 		mutex_exit(&sp->s_lock);
3642 		nfs4_server_rele(sp);
3643 	}
3644 }
3645 
3646 /*
3647  * Decrement the number of OPEN files (ie: those with state) so we know if
3648  * this nfs4_server has any state to maintain a lease for or not.
3649  */
3650 void
3651 nfs4_dec_state_ref_count_nolock(nfs4_server_t *sp, mntinfo4_t *mi)
3652 {
3653 	ASSERT(mutex_owned(&sp->s_lock));
3654 	ASSERT(sp->state_ref_count != 0);
3655 	sp->state_ref_count--;
3656 
3657 	NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE,
3658 	    "nfs4_dec_state_ref_count: state ref count now %d",
3659 	    sp->state_ref_count));
3660 
3661 	mi->mi_open_files--;
3662 	NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE,
3663 	    "nfs4_dec_state_ref_count: mi open files %d, v4 flags 0x%x",
3664 	    mi->mi_open_files, mi->mi_flags));
3665 
3666 	/* We don't have to hold the mi_lock to test mi_flags */
3667 	if (mi->mi_open_files == 0 &&
3668 	    (mi->mi_flags & MI4_REMOVE_ON_LAST_CLOSE)) {
3669 		NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE,
3670 		    "nfs4_dec_state_ref_count: remove mntinfo4 %p since "
3671 		    "we have closed the last open file", (void*)mi));
3672 		nfs4_remove_mi_from_server(mi, sp);
3673 	}
3674 }
3675 
3676 bool_t
3677 inlease(nfs4_server_t *sp)
3678 {
3679 	bool_t result;
3680 
3681 	ASSERT(mutex_owned(&sp->s_lock));
3682 
3683 	if (sp->lease_valid == NFS4_LEASE_VALID &&
3684 	    gethrestime_sec() < sp->last_renewal_time + sp->s_lease_time)
3685 		result = TRUE;
3686 	else
3687 		result = FALSE;
3688 
3689 	return (result);
3690 }
3691 
3692 
3693 /*
3694  * Return non-zero if the given nfs4_server_t is going through recovery.
3695  */
3696 
3697 int
3698 nfs4_server_in_recovery(nfs4_server_t *sp)
3699 {
3700 	return (nfs_rw_lock_held(&sp->s_recovlock, RW_WRITER));
3701 }
3702 
3703 /*
3704  * Compare two shared filehandle objects.  Returns -1, 0, or +1, if the
3705  * first is less than, equal to, or greater than the second.
3706  */
3707 
3708 int
3709 sfh4cmp(const void *p1, const void *p2)
3710 {
3711 	const nfs4_sharedfh_t *sfh1 = (const nfs4_sharedfh_t *)p1;
3712 	const nfs4_sharedfh_t *sfh2 = (const nfs4_sharedfh_t *)p2;
3713 
3714 	return (nfs4cmpfh(&sfh1->sfh_fh, &sfh2->sfh_fh));
3715 }
3716 
3717 /*
3718  * Create a table for shared filehandle objects.
3719  */
3720 
3721 void
3722 sfh4_createtab(avl_tree_t *tab)
3723 {
3724 	avl_create(tab, sfh4cmp, sizeof (nfs4_sharedfh_t),
3725 	    offsetof(nfs4_sharedfh_t, sfh_tree));
3726 }
3727 
3728 /*
3729  * Return a shared filehandle object for the given filehandle.  The caller
3730  * is responsible for eventually calling sfh4_rele().
3731  */
3732 
3733 nfs4_sharedfh_t *
3734 sfh4_put(const nfs_fh4 *fh, mntinfo4_t *mi, nfs4_sharedfh_t *key)
3735 {
3736 	nfs4_sharedfh_t *sfh, *nsfh;
3737 	avl_index_t where;
3738 	nfs4_sharedfh_t skey;
3739 
3740 	if (!key) {
3741 		skey.sfh_fh = *fh;
3742 		key = &skey;
3743 	}
3744 
3745 	nsfh = kmem_alloc(sizeof (nfs4_sharedfh_t), KM_SLEEP);
3746 	nsfh->sfh_fh.nfs_fh4_len = fh->nfs_fh4_len;
3747 	/*
3748 	 * We allocate the largest possible filehandle size because it's
3749 	 * not that big, and it saves us from possibly having to resize the
3750 	 * buffer later.
3751 	 */
3752 	nsfh->sfh_fh.nfs_fh4_val = kmem_alloc(NFS4_FHSIZE, KM_SLEEP);
3753 	bcopy(fh->nfs_fh4_val, nsfh->sfh_fh.nfs_fh4_val, fh->nfs_fh4_len);
3754 	mutex_init(&nsfh->sfh_lock, NULL, MUTEX_DEFAULT, NULL);
3755 	nsfh->sfh_refcnt = 1;
3756 	nsfh->sfh_flags = SFH4_IN_TREE;
3757 	nsfh->sfh_mi = mi;
3758 	NFS4_DEBUG(nfs4_sharedfh_debug, (CE_NOTE, "sfh4_get: new object (%p)",
3759 	    (void *)nsfh));
3760 
3761 	(void) nfs_rw_enter_sig(&mi->mi_fh_lock, RW_WRITER, 0);
3762 	sfh = avl_find(&mi->mi_filehandles, key, &where);
3763 	if (sfh != NULL) {
3764 		mutex_enter(&sfh->sfh_lock);
3765 		sfh->sfh_refcnt++;
3766 		mutex_exit(&sfh->sfh_lock);
3767 		nfs_rw_exit(&mi->mi_fh_lock);
3768 		/* free our speculative allocs */
3769 		kmem_free(nsfh->sfh_fh.nfs_fh4_val, NFS4_FHSIZE);
3770 		kmem_free(nsfh, sizeof (nfs4_sharedfh_t));
3771 		return (sfh);
3772 	}
3773 
3774 	avl_insert(&mi->mi_filehandles, nsfh, where);
3775 	nfs_rw_exit(&mi->mi_fh_lock);
3776 
3777 	return (nsfh);
3778 }
3779 
3780 /*
3781  * Return a shared filehandle object for the given filehandle.  The caller
3782  * is responsible for eventually calling sfh4_rele().
3783  */
3784 
3785 nfs4_sharedfh_t *
3786 sfh4_get(const nfs_fh4 *fh, mntinfo4_t *mi)
3787 {
3788 	nfs4_sharedfh_t *sfh;
3789 	nfs4_sharedfh_t key;
3790 
3791 	ASSERT(fh->nfs_fh4_len <= NFS4_FHSIZE);
3792 
3793 #ifdef DEBUG
3794 	if (nfs4_sharedfh_debug) {
3795 		nfs4_fhandle_t fhandle;
3796 
3797 		fhandle.fh_len = fh->nfs_fh4_len;
3798 		bcopy(fh->nfs_fh4_val, fhandle.fh_buf, fhandle.fh_len);
3799 		zcmn_err(mi->mi_zone->zone_id, CE_NOTE, "sfh4_get:");
3800 		nfs4_printfhandle(&fhandle);
3801 	}
3802 #endif
3803 
3804 	/*
3805 	 * If there's already an object for the given filehandle, bump the
3806 	 * reference count and return it.  Otherwise, create a new object
3807 	 * and add it to the AVL tree.
3808 	 */
3809 
3810 	key.sfh_fh = *fh;
3811 
3812 	(void) nfs_rw_enter_sig(&mi->mi_fh_lock, RW_READER, 0);
3813 	sfh = avl_find(&mi->mi_filehandles, &key, NULL);
3814 	if (sfh != NULL) {
3815 		mutex_enter(&sfh->sfh_lock);
3816 		sfh->sfh_refcnt++;
3817 		NFS4_DEBUG(nfs4_sharedfh_debug, (CE_NOTE,
3818 		    "sfh4_get: found existing %p, new refcnt=%d",
3819 		    (void *)sfh, sfh->sfh_refcnt));
3820 		mutex_exit(&sfh->sfh_lock);
3821 		nfs_rw_exit(&mi->mi_fh_lock);
3822 		return (sfh);
3823 	}
3824 	nfs_rw_exit(&mi->mi_fh_lock);
3825 
3826 	return (sfh4_put(fh, mi, &key));
3827 }
3828 
3829 /*
3830  * Get a reference to the given shared filehandle object.
3831  */
3832 
3833 void
3834 sfh4_hold(nfs4_sharedfh_t *sfh)
3835 {
3836 	ASSERT(sfh->sfh_refcnt > 0);
3837 
3838 	mutex_enter(&sfh->sfh_lock);
3839 	sfh->sfh_refcnt++;
3840 	NFS4_DEBUG(nfs4_sharedfh_debug,
3841 	    (CE_NOTE, "sfh4_hold %p, new refcnt=%d",
3842 	    (void *)sfh, sfh->sfh_refcnt));
3843 	mutex_exit(&sfh->sfh_lock);
3844 }
3845 
3846 /*
3847  * Release a reference to the given shared filehandle object and null out
3848  * the given pointer.
3849  */
3850 
3851 void
3852 sfh4_rele(nfs4_sharedfh_t **sfhpp)
3853 {
3854 	mntinfo4_t *mi;
3855 	nfs4_sharedfh_t *sfh = *sfhpp;
3856 
3857 	ASSERT(sfh->sfh_refcnt > 0);
3858 
3859 	mutex_enter(&sfh->sfh_lock);
3860 	if (sfh->sfh_refcnt > 1) {
3861 		sfh->sfh_refcnt--;
3862 		NFS4_DEBUG(nfs4_sharedfh_debug, (CE_NOTE,
3863 		    "sfh4_rele %p, new refcnt=%d",
3864 		    (void *)sfh, sfh->sfh_refcnt));
3865 		mutex_exit(&sfh->sfh_lock);
3866 		goto finish;
3867 	}
3868 	mutex_exit(&sfh->sfh_lock);
3869 
3870 	/*
3871 	 * Possibly the last reference, so get the lock for the table in
3872 	 * case it's time to remove the object from the table.
3873 	 */
3874 	mi = sfh->sfh_mi;
3875 	(void) nfs_rw_enter_sig(&mi->mi_fh_lock, RW_WRITER, 0);
3876 	mutex_enter(&sfh->sfh_lock);
3877 	sfh->sfh_refcnt--;
3878 	if (sfh->sfh_refcnt > 0) {
3879 		NFS4_DEBUG(nfs4_sharedfh_debug, (CE_NOTE,
3880 		    "sfh4_rele %p, new refcnt=%d",
3881 		    (void *)sfh, sfh->sfh_refcnt));
3882 		mutex_exit(&sfh->sfh_lock);
3883 		nfs_rw_exit(&mi->mi_fh_lock);
3884 		goto finish;
3885 	}
3886 
3887 	NFS4_DEBUG(nfs4_sharedfh_debug, (CE_NOTE,
3888 	    "sfh4_rele %p, last ref", (void *)sfh));
3889 	if (sfh->sfh_flags & SFH4_IN_TREE) {
3890 		avl_remove(&mi->mi_filehandles, sfh);
3891 		sfh->sfh_flags &= ~SFH4_IN_TREE;
3892 	}
3893 	mutex_exit(&sfh->sfh_lock);
3894 	nfs_rw_exit(&mi->mi_fh_lock);
3895 	mutex_destroy(&sfh->sfh_lock);
3896 	kmem_free(sfh->sfh_fh.nfs_fh4_val, NFS4_FHSIZE);
3897 	kmem_free(sfh, sizeof (nfs4_sharedfh_t));
3898 
3899 finish:
3900 	*sfhpp = NULL;
3901 }
3902 
3903 /*
3904  * Update the filehandle for the given shared filehandle object.
3905  */
3906 
3907 int nfs4_warn_dupfh = 0;	/* if set, always warn about dup fhs below */
3908 
3909 void
3910 sfh4_update(nfs4_sharedfh_t *sfh, const nfs_fh4 *newfh)
3911 {
3912 	mntinfo4_t *mi = sfh->sfh_mi;
3913 	nfs4_sharedfh_t *dupsfh;
3914 	avl_index_t where;
3915 	nfs4_sharedfh_t key;
3916 
3917 #ifdef DEBUG
3918 	mutex_enter(&sfh->sfh_lock);
3919 	ASSERT(sfh->sfh_refcnt > 0);
3920 	mutex_exit(&sfh->sfh_lock);
3921 #endif
3922 	ASSERT(newfh->nfs_fh4_len <= NFS4_FHSIZE);
3923 
3924 	/*
3925 	 * The basic plan is to remove the shared filehandle object from
3926 	 * the table, update it to have the new filehandle, then reinsert
3927 	 * it.
3928 	 */
3929 
3930 	(void) nfs_rw_enter_sig(&mi->mi_fh_lock, RW_WRITER, 0);
3931 	mutex_enter(&sfh->sfh_lock);
3932 	if (sfh->sfh_flags & SFH4_IN_TREE) {
3933 		avl_remove(&mi->mi_filehandles, sfh);
3934 		sfh->sfh_flags &= ~SFH4_IN_TREE;
3935 	}
3936 	mutex_exit(&sfh->sfh_lock);
3937 	sfh->sfh_fh.nfs_fh4_len = newfh->nfs_fh4_len;
3938 	bcopy(newfh->nfs_fh4_val, sfh->sfh_fh.nfs_fh4_val,
3939 	    sfh->sfh_fh.nfs_fh4_len);
3940 
3941 	/*
3942 	 * XXX If there is already a shared filehandle object with the new
3943 	 * filehandle, we're in trouble, because the rnode code assumes
3944 	 * that there is only one shared filehandle object for a given
3945 	 * filehandle.  So issue a warning (for read-write mounts only)
3946 	 * and don't try to re-insert the given object into the table.
3947 	 * Hopefully the given object will quickly go away and everyone
3948 	 * will use the new object.
3949 	 */
3950 	key.sfh_fh = *newfh;
3951 	dupsfh = avl_find(&mi->mi_filehandles, &key, &where);
3952 	if (dupsfh != NULL) {
3953 		if (!(mi->mi_vfsp->vfs_flag & VFS_RDONLY) || nfs4_warn_dupfh) {
3954 			zcmn_err(mi->mi_zone->zone_id, CE_WARN, "sfh4_update: "
3955 			    "duplicate filehandle detected");
3956 			sfh4_printfhandle(dupsfh);
3957 		}
3958 	} else {
3959 		avl_insert(&mi->mi_filehandles, sfh, where);
3960 		mutex_enter(&sfh->sfh_lock);
3961 		sfh->sfh_flags |= SFH4_IN_TREE;
3962 		mutex_exit(&sfh->sfh_lock);
3963 	}
3964 	nfs_rw_exit(&mi->mi_fh_lock);
3965 }
3966 
3967 /*
3968  * Copy out the current filehandle for the given shared filehandle object.
3969  */
3970 
3971 void
3972 sfh4_copyval(const nfs4_sharedfh_t *sfh, nfs4_fhandle_t *fhp)
3973 {
3974 	mntinfo4_t *mi = sfh->sfh_mi;
3975 
3976 	ASSERT(sfh->sfh_refcnt > 0);
3977 
3978 	(void) nfs_rw_enter_sig(&mi->mi_fh_lock, RW_READER, 0);
3979 	fhp->fh_len = sfh->sfh_fh.nfs_fh4_len;
3980 	ASSERT(fhp->fh_len <= NFS4_FHSIZE);
3981 	bcopy(sfh->sfh_fh.nfs_fh4_val, fhp->fh_buf, fhp->fh_len);
3982 	nfs_rw_exit(&mi->mi_fh_lock);
3983 }
3984 
3985 /*
3986  * Print out the filehandle for the given shared filehandle object.
3987  */
3988 
3989 void
3990 sfh4_printfhandle(const nfs4_sharedfh_t *sfh)
3991 {
3992 	nfs4_fhandle_t fhandle;
3993 
3994 	sfh4_copyval(sfh, &fhandle);
3995 	nfs4_printfhandle(&fhandle);
3996 }
3997 
3998 /*
3999  * Compare 2 fnames.  Returns -1 if the first is "less" than the second, 0
4000  * if they're the same, +1 if the first is "greater" than the second.  The
4001  * caller (or whoever's calling the AVL package) is responsible for
4002  * handling locking issues.
4003  */
4004 
4005 static int
4006 fncmp(const void *p1, const void *p2)
4007 {
4008 	const nfs4_fname_t *f1 = p1;
4009 	const nfs4_fname_t *f2 = p2;
4010 	int res;
4011 
4012 	res = strcmp(f1->fn_name, f2->fn_name);
4013 	/*
4014 	 * The AVL package wants +/-1, not arbitrary positive or negative
4015 	 * integers.
4016 	 */
4017 	if (res > 0)
4018 		res = 1;
4019 	else if (res < 0)
4020 		res = -1;
4021 	return (res);
4022 }
4023 
4024 /*
4025  * Get or create an fname with the given name, as a child of the given
4026  * fname.  The caller is responsible for eventually releasing the reference
4027  * (fn_rele()).  parent may be NULL.
 *
 * An existing child is reused only if both its name and its fn_sfh match
 * the arguments.  A child with the same name but a different fn_sfh is
 * unlinked from parent and the lookup is retried.  A newly created fname
 * starts with fn_refcnt == 1 and takes a hold on parent (if any) and on
 * sfh.  When parent is NULL no lookup is done and a new fname is always
 * created.
4028  */
4029 
4030 nfs4_fname_t *
4031 fn_get(nfs4_fname_t *parent, char *name, nfs4_sharedfh_t *sfh)
4032 {
4033 	nfs4_fname_t key;	/* search key; only fn_name is filled in */
4034 	nfs4_fname_t *fnp;
4035 	avl_index_t where;	/* insertion point reported by avl_find() */
4036 
4037 	key.fn_name = name;
4038 
4039 	/*
4040 	 * If there's already an fname registered with the given name, bump
4041 	 * its reference count and return it.  Otherwise, create a new one
4042 	 * and add it to the parent's AVL tree.
4043 	 *
4044 	 * fname entries we are looking for should match both name
4045 	 * and sfh stored in the fname.
4046 	 */
4047 again:
4048 	if (parent != NULL) {
4049 		mutex_enter(&parent->fn_lock);
4050 		fnp = avl_find(&parent->fn_children, &key, &where);
4051 		if (fnp != NULL) {
4052 			/*
4053 			 * This hold on fnp is released below later,
4054 			 * in case this is not the fnp we want.
4055 			 */
4056 			fn_hold(fnp);
4057 
4058 			if (fnp->fn_sfh == sfh) {
4059 				/*
4060 				 * We have found our entry.  The hold taken
4061 				 * above becomes the caller's reference.
4062 				 */
4063 				mutex_exit(&parent->fn_lock);
4064 				return (fnp);
4065 			}
4066 
4067 			/*
4068 			 * We have found an entry that has a mismatching
4069 			 * fn_sfh. This could be a stale entry due to
4070 			 * server side rename. We will remove this entry
4071 			 * and make sure no such entries exist.
4072 			 */
4073 			mutex_exit(&parent->fn_lock);
4074 			mutex_enter(&fnp->fn_lock);
			/*
			 * parent->fn_lock was dropped above, so re-check
			 * under fnp->fn_lock that fnp is still linked to
			 * parent before unlinking it.
			 */
4075 			if (fnp->fn_parent == parent) {
4076 				/*
4077 				 * Remove ourselves from parent's
4078 				 * fn_children tree.
4079 				 */
4080 				mutex_enter(&parent->fn_lock);
4081 				avl_remove(&parent->fn_children, fnp);
4082 				mutex_exit(&parent->fn_lock);
				/* Drops fnp's hold on parent; nulls fn_parent. */
4083 				fn_rele(&fnp->fn_parent);
4084 			}
4085 			mutex_exit(&fnp->fn_lock);
			/* Drop the temporary hold taken after avl_find(). */
4086 			fn_rele(&fnp);
4087 			goto again;
4088 		}
4089 	}
4090 
	/*
	 * No child with this name exists (or there is no parent).  If
	 * parent is non-NULL, parent->fn_lock is still held from the
	 * lookup above and stays held until the new fname has been
	 * inserted at "where" below.
	 */
4091 	fnp = kmem_alloc(sizeof (nfs4_fname_t), KM_SLEEP);
4092 	mutex_init(&fnp->fn_lock, NULL, MUTEX_DEFAULT, NULL);
4093 	fnp->fn_parent = parent;
4094 	if (parent != NULL)
4095 		fn_hold(parent);
4096 	fnp->fn_len = strlen(name);
4097 	ASSERT(fnp->fn_len < MAXNAMELEN);
4098 	fnp->fn_name = kmem_alloc(fnp->fn_len + 1, KM_SLEEP);
4099 	(void) strcpy(fnp->fn_name, name);
4100 	fnp->fn_refcnt = 1;	/* the reference returned to the caller */
4101 
4102 	/*
4103 	 * This hold on sfh is later released
4104 	 * when we do the final fn_rele() on this fname.
4105 	 */
4106 	sfh4_hold(sfh);
4107 	fnp->fn_sfh = sfh;
4108 
4109 	avl_create(&fnp->fn_children, fncmp, sizeof (nfs4_fname_t),
4110 	    offsetof(nfs4_fname_t, fn_tree));
4111 	NFS4_DEBUG(nfs4_fname_debug, (CE_NOTE,
4112 	    "fn_get %p:%s, a new nfs4_fname_t!",
4113 	    (void *)fnp, fnp->fn_name));
4114 	if (parent != NULL) {
4115 		avl_insert(&parent->fn_children, fnp, where);
4116 		mutex_exit(&parent->fn_lock);
4117 	}
4118 
4119 	return (fnp);
4120 }
4121 
/*
 * Take an additional reference on the given fname by atomically
 * incrementing its fn_refcnt.  The matching release is fn_rele().
 */
4122 void
4123 fn_hold(nfs4_fname_t *fnp)
4124 {
4125 	atomic_inc_32(&fnp->fn_refcnt);
	/* Debug trace only; fn_refcnt is re-read here for the message. */
4126 	NFS4_DEBUG(nfs4_fname_debug, (CE_NOTE,
4127 	    "fn_hold %p:%s, new refcnt=%d",
4128 	    (void *)fnp, fnp->fn_name, fnp->fn_refcnt));
4129 }
4130 
4131 /*
4132  * Decrement the reference count of the given fname, and destroy it if its
4133  * reference count goes to zero.  Nulls out the given pointer.
 *
 * Destroying an fname unlinks it from its parent's fn_children tree, frees
 * its name, drops its hold on fn_sfh and frees the fname itself.  The hold
 * the fname had on its parent is then released as well, which may in turn
 * destroy the parent, and so on up the tree.
4134  */
4135 
4136 void
4137 fn_rele(nfs4_fname_t **fnpp)
4138 {
4139 	nfs4_fname_t *parent;
4140 	uint32_t newref;
4141 	nfs4_fname_t *fnp;
4142 
4143 recur:
4144 	fnp = *fnpp;
4145 	*fnpp = NULL;
4146 
	/* Lock order: the fname's own fn_lock first, then its parent's. */
4147 	mutex_enter(&fnp->fn_lock);
4148 	parent = fnp->fn_parent;
4149 	if (parent != NULL)
4150 		mutex_enter(&parent->fn_lock);	/* prevent new references */
4151 	newref = atomic_dec_32_nv(&fnp->fn_refcnt);
4152 	if (newref > 0) {
		/* Other references remain; just unlock and return. */
4153 		NFS4_DEBUG(nfs4_fname_debug, (CE_NOTE,
4154 		    "fn_rele %p:%s, new refcnt=%d",
4155 		    (void *)fnp, fnp->fn_name, fnp->fn_refcnt));
4156 		if (parent != NULL)
4157 			mutex_exit(&parent->fn_lock);
4158 		mutex_exit(&fnp->fn_lock);
4159 		return;
4160 	}
4161 
4162 	NFS4_DEBUG(nfs4_fname_debug, (CE_NOTE,
4163 	    "fn_rele %p:%s, last reference, deleting...",
4164 	    (void *)fnp, fnp->fn_name));
4165 	if (parent != NULL) {
4166 		avl_remove(&parent->fn_children, fnp);
4167 		mutex_exit(&parent->fn_lock);
4168 	}
4169 	kmem_free(fnp->fn_name, fnp->fn_len + 1);
4170 	sfh4_rele(&fnp->fn_sfh);
	/*
	 * fnp->fn_lock is still held by this thread on this path; it is
	 * destroyed here rather than exited.
	 */
4171 	mutex_destroy(&fnp->fn_lock);
4172 	avl_destroy(&fnp->fn_children);
4173 	kmem_free(fnp, sizeof (nfs4_fname_t));
4174 	/*
4175 	 * Recursively fn_rele the parent.
4176 	 * Use goto instead of a recursive call to avoid stack overflow.
	 * "parent" is a local, so the "*fnpp = NULL" on the next pass
	 * only clears this local copy.
4177 	 */
4178 	if (parent != NULL) {
4179 		fnpp = &parent;
4180 		goto recur;
4181 	}
4182 }
4183 
4184 /*
4185  * Returns the single component name of the given fname, in a MAXNAMELEN
4186  * string buffer, which the caller is responsible for freeing.  Note that
4187  * the name may become invalid as a result of fn_move().
4188  */
4189 
4190 char *
4191 fn_name(nfs4_fname_t *fnp)
4192 {
4193 	char *name;
4194 
4195 	ASSERT(fnp->fn_len < MAXNAMELEN);
4196 	name = kmem_alloc(MAXNAMELEN, KM_SLEEP);
4197 	mutex_enter(&fnp->fn_lock);
4198 	(void) strcpy(name, fnp->fn_name);
4199 	mutex_exit(&fnp->fn_lock);
4200 
4201 	return (name);
4202 }
4203 
4204 
4205 /*
4206  * fn_path_realloc
4207  *
4208  * This function, used only by fn_path, constructs
4209  * a new string which looks like "prepend" + "/" + "current".
4210  * by allocating a new string and freeing the old one.
4211  */
4212 static void
4213 fn_path_realloc(char **curses, char *prepend)
4214 {
4215 	int len, curlen = 0;
4216 	char *news;
4217 
4218 	if (*curses == NULL) {
4219 		/*
4220 		 * Prime the pump, allocate just the
4221 		 * space for prepend and return that.
4222 		 */
4223 		len = strlen(prepend) + 1;
4224 		news = kmem_alloc(len, KM_SLEEP);
4225 		(void) strncpy(news, prepend, len);
4226 	} else {
4227 		/*
4228 		 * Allocate the space  for a new string
4229 		 * +1 +1 is for the "/" and the NULL
4230 		 * byte at the end of it all.
4231 		 */
4232 		curlen = strlen(*curses);
4233 		len = curlen + strlen(prepend) + 1 + 1;
4234 		news = kmem_alloc(len, KM_SLEEP);
4235 		(void) strncpy(news, prepend, len);
4236 		(void) strcat(news, "/");
4237 		(void) strcat(news, *curses);
4238 		kmem_free(*curses, curlen + 1);
4239 	}
4240 	*curses = news;
4241 }
4242 
4243 /*
4244  * Returns the path name (starting from the fs root) for the given fname.
4245  * The caller is responsible for freeing.  Note that the path may be or
4246  * become invalid as a result of fn_move().
4247  */
4248 
4249 char *
4250 fn_path(nfs4_fname_t *fnp)
4251 {
4252 	char *path;
4253 	nfs4_fname_t *nextfnp;
4254 
4255 	if (fnp == NULL)
4256 		return (NULL);
4257 
4258 	path = NULL;
4259 
4260 	/* walk up the tree constructing the pathname.  */
4261 
4262 	fn_hold(fnp);			/* adjust for later rele */
4263 	do {
4264 		mutex_enter(&fnp->fn_lock);
4265 		/*
4266 		 * Add fn_name in front of the current path
4267 		 */
4268 		fn_path_realloc(&path, fnp->fn_name);
4269 		nextfnp = fnp->fn_parent;
4270 		if (nextfnp != NULL)
4271 			fn_hold(nextfnp);
4272 		mutex_exit(&fnp->fn_lock);
4273 		fn_rele(&fnp);
4274 		fnp = nextfnp;
4275 	} while (fnp != NULL);
4276 
4277 	return (path);
4278 }
4279 
4280 /*
4281  * Return a reference to the parent of the given fname, which the caller is
4282  * responsible for eventually releasing.
4283  */
4284 
4285 nfs4_fname_t *
4286 fn_parent(nfs4_fname_t *fnp)
4287 {
4288 	nfs4_fname_t *parent;
4289 
4290 	mutex_enter(&fnp->fn_lock);
4291 	parent = fnp->fn_parent;
4292 	if (parent != NULL)
4293 		fn_hold(parent);
4294 	mutex_exit(&fnp->fn_lock);
4295 
4296 	return (parent);
4297 }
4298 
4299 /*
4300  * Update fnp so that its parent is newparent and its name is newname.
 *
 * fnp->fn_lock is held for the whole operation.  fnp is unlinked from its
 * current parent (if it still has one), renamed, and then inserted into
 * newparent's fn_children tree.  Any fname already registered under
 * newname in newparent is unlinked from newparent first.
4301  */
4302 
4303 void
4304 fn_move(nfs4_fname_t *fnp, nfs4_fname_t *newparent, char *newname)
4305 {
4306 	nfs4_fname_t *parent, *tmpfnp;
4307 	ssize_t newlen;
4308 	nfs4_fname_t key;	/* search key; only fn_name is filled in */
4309 	avl_index_t where;	/* insertion point reported by avl_find() */
4310 
4311 	/*
4312 	 * This assert exists to catch the client trying to rename
4313 	 * a dir to be a child of itself.  This happened at a recent
4314 	 * bakeoff against a 3rd party (broken) server which allowed
4315 	 * the rename to succeed.  If it trips it means that:
4316 	 *	a) the code in nfs4rename that detects this case is broken
4317 	 *	b) the server is broken (since it allowed the bogus rename)
4318 	 *
4319 	 * For non-DEBUG kernels, prepare for a recursive mutex_enter
4320 	 * panic below from:  mutex_enter(&newparent->fn_lock);
4321 	 */
4322 	ASSERT(fnp != newparent);
4323 
4324 	/*
4325 	 * Remove fnp from its current parent, change its name, then add it
4326 	 * to newparent. It might happen that fnp was replaced by another
4327 	 * nfs4_fname_t with the same fn_name in parent->fn_children.
4328 	 * In such case, fnp->fn_parent is NULL and we skip the removal
4329 	 * of fnp from its current parent.
4330 	 */
4331 	mutex_enter(&fnp->fn_lock);
4332 	parent = fnp->fn_parent;
4333 	if (parent != NULL) {
4334 		mutex_enter(&parent->fn_lock);
4335 		avl_remove(&parent->fn_children, fnp);
4336 		mutex_exit(&parent->fn_lock);
		/* Drops fnp's hold on the old parent; nulls fn_parent. */
4337 		fn_rele(&fnp->fn_parent);
4338 	}
4339 
	/*
	 * Reallocate the name buffer only when the length changes; a
	 * same-length name is copied over the existing buffer.
	 */
4340 	newlen = strlen(newname);
4341 	if (newlen != fnp->fn_len) {
4342 		ASSERT(newlen < MAXNAMELEN);
4343 		kmem_free(fnp->fn_name, fnp->fn_len + 1);
4344 		fnp->fn_name = kmem_alloc(newlen + 1, KM_SLEEP);
4345 		fnp->fn_len = newlen;
4346 	}
4347 	(void) strcpy(fnp->fn_name, newname);
4348 
4349 again:
4350 	mutex_enter(&newparent->fn_lock);
4351 	key.fn_name = fnp->fn_name;
4352 	tmpfnp = avl_find(&newparent->fn_children, &key, &where);
4353 	if (tmpfnp != NULL) {
4354 		/*
4355 		 * This could be due to a file that was unlinked while
4356 		 * open, or perhaps the rnode is in the free list.  Remove
4357 		 * it from newparent and let it go away on its own.  The
4358 		 * contorted code is to deal with lock order issues and
4359 		 * race conditions.
		 *
		 * newparent->fn_lock is dropped before tmpfnp->fn_lock is
		 * taken, so tmpfnp is held across the gap and its
		 * fn_parent is re-checked before it is unlinked.
4360 		 */
4361 		fn_hold(tmpfnp);
4362 		mutex_exit(&newparent->fn_lock);
4363 		mutex_enter(&tmpfnp->fn_lock);
4364 		if (tmpfnp->fn_parent == newparent) {
4365 			mutex_enter(&newparent->fn_lock);
4366 			avl_remove(&newparent->fn_children, tmpfnp);
4367 			mutex_exit(&newparent->fn_lock);
4368 			fn_rele(&tmpfnp->fn_parent);
4369 		}
4370 		mutex_exit(&tmpfnp->fn_lock);
		/* Drop the temporary hold and look the name up again. */
4371 		fn_rele(&tmpfnp);
4372 		goto again;
4373 	}
	/*
	 * No entry with this name is left in newparent, and
	 * newparent->fn_lock is still held, so "where" is the valid
	 * insertion point for fnp.
	 */
4374 	fnp->fn_parent = newparent;
4375 	fn_hold(newparent);
4376 	avl_insert(&newparent->fn_children, fnp, where);
4377 	mutex_exit(&newparent->fn_lock);
4378 	mutex_exit(&fnp->fn_lock);
4379 }
4380 
#ifdef DEBUG
/*
 * Sanity-check the type of the given vnode against the type cached in
 * its rnode attributes.  When nfs4_vtype_debug is set and both types are
 * known (not VNON) but differ, panic.  In every other case return 1.
 */
int
nfs4_consistent_type(vnode_t *vp)
{
	rnode4_t *rp = VTOR4(vp);

	/* Checking is off unless the debug knob is set. */
	if (!nfs4_vtype_debug)
		return (1);

	/* An unknown type on either side cannot be a mismatch. */
	if (vp->v_type == VNON || rp->r_attr.va_type == VNON)
		return (1);

	if (vp->v_type == rp->r_attr.va_type)
		return (1);

	cmn_err(CE_PANIC, "vnode %p type mismatch; v_type=%d, "
	    "rnode attr type=%d", (void *)vp, vp->v_type,
	    rp->r_attr.va_type);

	return (1);
}
#endif /* DEBUG */
4401