xref: /titanic_52/usr/src/uts/common/fs/nfs/nfs4_callback.c (revision d561bb99043ed4f82fe51b395850644c122a3867)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 /* Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */
27 /* All Rights Reserved */
28 
29 #include <sys/param.h>
30 #include <sys/types.h>
31 #include <sys/systm.h>
32 #include <sys/cred.h>
33 #include <sys/vfs.h>
34 #include <sys/vnode.h>
35 #include <sys/pathname.h>
36 #include <sys/sysmacros.h>
37 #include <sys/kmem.h>
38 #include <sys/kstat.h>
39 #include <sys/mkdev.h>
40 #include <sys/mount.h>
41 #include <sys/statvfs.h>
42 #include <sys/errno.h>
43 #include <sys/debug.h>
44 #include <sys/cmn_err.h>
45 #include <sys/utsname.h>
46 #include <sys/bootconf.h>
47 #include <sys/modctl.h>
48 #include <sys/acl.h>
49 #include <sys/flock.h>
50 #include <sys/kstr.h>
51 #include <sys/stropts.h>
52 #include <sys/strsubr.h>
53 #include <sys/atomic.h>
54 #include <sys/disp.h>
55 #include <sys/policy.h>
56 #include <sys/list.h>
57 #include <sys/zone.h>
58 
59 #include <rpc/types.h>
60 #include <rpc/auth.h>
61 #include <rpc/rpcsec_gss.h>
62 #include <rpc/clnt.h>
63 #include <rpc/xdr.h>
64 
65 #include <nfs/nfs.h>
66 #include <nfs/nfs_clnt.h>
67 #include <nfs/mount.h>
68 #include <nfs/nfs_acl.h>
69 
70 #include <fs/fs_subr.h>
71 
72 #include <nfs/nfs4.h>
73 #include <nfs/rnode4.h>
74 #include <nfs/nfs4_clnt.h>
75 #include <nfs/nfssys.h>
76 
77 #ifdef	DEBUG
78 /*
79  * These are "special" state IDs and file handles that
80  * match any delegation state ID or file handle.  This
81  * is for testing purposes only.
82  */
83 
84 stateid4 nfs4_deleg_any = { 0x7FFFFFF0 };
85 char nfs4_deleg_fh[] = "\0377\0376\0375\0374";
86 nfs_fh4 nfs4_deleg_anyfh = { sizeof (nfs4_deleg_fh)-1, nfs4_deleg_fh };
87 nfsstat4 cb4_getattr_fail = NFS4_OK;
88 nfsstat4 cb4_recall_fail = NFS4_OK;
89 
90 int nfs4_callback_debug;
91 int nfs4_recall_debug;
92 int nfs4_drat_debug;
93 
94 #endif
95 
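/*
 * Note: in DEBUG kernels the knobs above can be toggled at run time
 * with a kernel debugger; for example (illustrative only):
 *
 *	echo 'nfs4_callback_debug/W 1' | mdb -kw
 *
 * enables the CB_NOTE/CB_WARN messages defined below.
 */
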
96 #define	CB_NOTE(x)	NFS4_DEBUG(nfs4_callback_debug, (CE_NOTE, x))
97 #define	CB_WARN(x)	NFS4_DEBUG(nfs4_callback_debug, (CE_WARN, x))
98 #define	CB_WARN1(x, y)	NFS4_DEBUG(nfs4_callback_debug, (CE_WARN, x, y))
99 
100 enum nfs4_delegreturn_policy nfs4_delegreturn_policy = INACTIVE;
101 
102 static zone_key_t nfs4_callback_zone_key;
103 
104 /*
105  * NFS4_MAPSIZE is the number of bytes we are willing to consume
106  * for the block allocation map when the server grants a NFS_LIMIT_BLOCK
107  * style delegation.
108  */
109 
110 #define	NFS4_MAPSIZE	8192
111 #define	NFS4_MAPWORDS	(NFS4_MAPSIZE/sizeof (uint_t))
112 #define	NbPW		(NBBY*sizeof (uint_t))
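
/*
 * Illustrative arithmetic, assuming NBBY == 8 and a 4-byte uint_t:
 * NFS4_MAPWORDS is 8192/4 == 2048 words and NbPW is 32 bits per word,
 * so the block allocation map can describe up to 2048 * 32 == 65536
 * blocks.
 */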
113 
114 static int nfs4_num_prognums = 1024;
115 static SVC_CALLOUT_TABLE nfs4_cb_sct;
116 
117 struct nfs4_dnode {
118 	list_node_t	linkage;
119 	rnode4_t	*rnodep;
120 	int		flags;		/* Flags for nfs4delegreturn_impl() */
121 };
122 
123 static const struct nfs4_callback_stats nfs4_callback_stats_tmpl = {
124 	{ "delegations",	KSTAT_DATA_UINT64 },
125 	{ "cb_getattr",		KSTAT_DATA_UINT64 },
126 	{ "cb_recall",		KSTAT_DATA_UINT64 },
127 	{ "cb_null",		KSTAT_DATA_UINT64 },
128 	{ "cb_dispatch",	KSTAT_DATA_UINT64 },
129 	{ "delegaccept_r",	KSTAT_DATA_UINT64 },
130 	{ "delegaccept_rw",	KSTAT_DATA_UINT64 },
131 	{ "delegreturn",	KSTAT_DATA_UINT64 },
132 	{ "callbacks",		KSTAT_DATA_UINT64 },
133 	{ "claim_cur",		KSTAT_DATA_UINT64 },
134 	{ "claim_cur_ok",	KSTAT_DATA_UINT64 },
135 	{ "recall_trunc",	KSTAT_DATA_UINT64 },
136 	{ "recall_failed",	KSTAT_DATA_UINT64 },
137 	{ "return_limit_write",	KSTAT_DATA_UINT64 },
138 	{ "return_limit_addmap", KSTAT_DATA_UINT64 },
139 	{ "deleg_recover",	KSTAT_DATA_UINT64 },
140 	{ "cb_illegal",		KSTAT_DATA_UINT64 }
141 };
142 
143 struct nfs4_cb_port {
144 	list_node_t		linkage; /* linkage into per-zone port list */
145 	char			netid[KNC_STRSIZE];
146 	char			uaddr[KNC_STRSIZE];
147 	char			protofmly[KNC_STRSIZE];
148 	char			proto[KNC_STRSIZE];
149 };
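
/*
 * Example (illustrative): for a TCP/IPv4 callback endpoint an entry
 * might hold netid "tcp", protofmly "inet" and proto "tcp", with a
 * universal address of the form "h1.h2.h3.h4.p1.p2", in which the
 * final two octets encode the port number as p1 * 256 + p2.
 */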
150 
151 static int cb_getattr_bytes;
152 
153 struct cb_recall_pass {
154 	rnode4_t	*rp;
155 	int		flags;		/* Flags for nfs4delegreturn_impl() */
156 	bool_t		truncate;
157 };
158 
159 static nfs4_open_stream_t *get_next_deleg_stream(rnode4_t *, int);
160 static void nfs4delegreturn_thread(struct cb_recall_pass *);
161 static int deleg_reopen(vnode_t *, bool_t *, struct nfs4_callback_globals *,
162     int);
163 static void nfs4_dlistadd(rnode4_t *, struct nfs4_callback_globals *, int);
164 static void nfs4_dlistclean_impl(struct nfs4_callback_globals *, int);
165 static int nfs4delegreturn_impl(rnode4_t *, int,
166     struct nfs4_callback_globals *);
167 static void nfs4delegreturn_cleanup_impl(rnode4_t *, nfs4_server_t *,
168     struct nfs4_callback_globals *);
169 static void nfs4_recall_sync_wait(nfs4_server_t *);
170 
171 static void
172 cb_getattr(nfs_cb_argop4 *argop, nfs_cb_resop4 *resop, struct svc_req *req,
173 	struct compound_state *cs, struct nfs4_callback_globals *ncg)
174 {
175 	CB_GETATTR4args *args = &argop->nfs_cb_argop4_u.opcbgetattr;
176 	CB_GETATTR4res *resp = &resop->nfs_cb_resop4_u.opcbgetattr;
177 	rnode4_t *rp;
178 	vnode_t *vp;
179 	bool_t found = FALSE;
180 	struct nfs4_server *sp;
181 	struct fattr4 *fap;
182 	rpc_inline_t *fdata;
183 	long mapcnt;
184 	fattr4_change change;
185 	fattr4_size size;
186 	uint_t rflag;
187 
188 	ncg->nfs4_callback_stats.cb_getattr.value.ui64++;
189 
190 #ifdef DEBUG
191 	/*
192 	 * error injection hook: set cb_getattr_fail global to
193 	 * the NFS4 protocol error to be returned
194 	 */
195 	if (cb4_getattr_fail != NFS4_OK) {
196 		*cs->statusp = resp->status = cb4_getattr_fail;
197 		return;
198 	}
199 #endif
200 
201 	resp->obj_attributes.attrmask = 0;
202 
203 	mutex_enter(&ncg->nfs4_cb_lock);
204 	sp = ncg->nfs4prog2server[req->rq_prog - NFS4_CALLBACK];
205 	mutex_exit(&ncg->nfs4_cb_lock);
206 
207 	if (nfs4_server_vlock(sp, 0) == FALSE) {
208 
209 		CB_WARN("cb_getattr: cannot find server\n");
210 
211 		*cs->statusp = resp->status = NFS4ERR_BADHANDLE;
212 		return;
213 	}
214 
215 	/*
216 	 * In cb_compound, callback_ident was validated against rq_prog,
217 	 * but we couldn't verify that it was set to the value we provided
218 	 * at setclientid time (because we didn't have the server struct
219 	 * yet).  Now we have the server struct, but don't have
220 	 * callback_ident handy.  So, validate the server struct's program
221 	 * number against the request's RPC program number.  At this point,
222 	 * we know the RPC prog num is valid (else we wouldn't be here);
223 	 * however, we don't know that it was the prog number we supplied
224 	 * to this server at setclientid time.  If the prog numbers aren't
225 	 * equal, log the problem and fail the request because the callback
226 	 * server and/or client is confused.  This will probably never happen.
227 	 */
228 	if (sp->s_program != req->rq_prog) {
229 #ifdef DEBUG
230 		zcmn_err(getzoneid(), CE_WARN,
231 		    "cb_getattr: wrong server program number srv=%d req=%d\n",
232 		    sp->s_program, req->rq_prog);
233 #else
234 		zcmn_err(getzoneid(), CE_WARN,
235 		    "cb_getattr: wrong server program number\n");
236 #endif
237 		mutex_exit(&sp->s_lock);
238 		nfs4_server_rele(sp);
239 		*cs->statusp = resp->status = NFS4ERR_BADHANDLE;
240 		return;
241 	}
242 
243 	/*
244 	 * Search the delegation list for a matching file handle;
245 	 * mutex on sp prevents the list from changing.
246 	 */
247 
248 	rp = list_head(&sp->s_deleg_list);
249 	for (; rp != NULL; rp = list_next(&sp->s_deleg_list, rp)) {
250 		nfs4_fhandle_t fhandle;
251 
252 		sfh4_copyval(rp->r_fh, &fhandle);
253 
254 		if ((fhandle.fh_len == args->fh.nfs_fh4_len &&
255 		    bcmp(fhandle.fh_buf, args->fh.nfs_fh4_val,
256 		    fhandle.fh_len) == 0)) {
257 
258 			found = TRUE;
259 			break;
260 		}
261 #ifdef	DEBUG
262 		if (nfs4_deleg_anyfh.nfs_fh4_len == args->fh.nfs_fh4_len &&
263 		    bcmp(nfs4_deleg_anyfh.nfs_fh4_val, args->fh.nfs_fh4_val,
264 		    args->fh.nfs_fh4_len) == 0) {
265 
266 			found = TRUE;
267 			break;
268 		}
269 #endif
270 	}
271 
272 	/*
273 	 * VN_HOLD the vnode before releasing s_lock to guarantee
274 	 * we have a valid vnode reference.
275 	 */
276 	if (found == TRUE) {
277 		vp = RTOV4(rp);
278 		VN_HOLD(vp);
279 	}
280 
281 	mutex_exit(&sp->s_lock);
282 	nfs4_server_rele(sp);
283 
284 	if (found == FALSE) {
285 
286 		CB_WARN("cb_getattr: bad fhandle\n");
287 
288 		*cs->statusp = resp->status = NFS4ERR_BADHANDLE;
289 		return;
290 	}
291 
292 	/*
293 	 * Figure out which attributes the server wants.  We only
294 	 * offer FATTR4_CHANGE & FATTR4_SIZE; ignore the rest.
295 	 */
296 	fdata = kmem_alloc(cb_getattr_bytes, KM_SLEEP);
297 
298 	/*
299 	 * We don't actually need to create an XDR stream to encode
300 	 * these simple data structures; if we did, it would be:
301 	 * xdrmem_create(&xdr, fdata, cb_getattr_bytes, XDR_ENCODE);
302 	 */
303 	fap = &resp->obj_attributes;
304 
305 	fap->attrmask = 0;
306 	/* attrlist4_len starts at 0 and increases as attrs are processed */
307 	fap->attrlist4 = (char *)fdata;
308 	fap->attrlist4_len = 0;
309 
310 	/* don't supply attrs if request was zero */
311 	if (args->attr_request != 0) {
312 		if (args->attr_request & FATTR4_CHANGE_MASK) {
313 			/*
314 			 * If the file is mmapped, then increment the change
315 			 * attribute and return it.  This will guarantee that
316 			 * the server will perceive that the file has changed
317 			 * if there is any chance that the client application
318 			 * has changed it.  Otherwise, just return the change
319 			 * attribute as it has been updated by nfs4write_deleg.
320 			 */
321 
322 			mutex_enter(&rp->r_statelock);
323 			mapcnt = rp->r_mapcnt;
324 			rflag = rp->r_flags;
325 			mutex_exit(&rp->r_statelock);
326 
327 			mutex_enter(&rp->r_statev4_lock);
328 			/*
329 			 * If object mapped, then always return new change.
330 			 * Otherwise, return change if object has dirty
331 			 * pages.  If object doesn't have any dirty pages,
332 			 * then all changes have been pushed to server, so
333 			 * reset change to grant change.
334 			 */
335 			if (mapcnt)
336 				rp->r_deleg_change++;
337 			else if (! (rflag & R4DIRTY))
338 				rp->r_deleg_change = rp->r_deleg_change_grant;
339 			change = rp->r_deleg_change;
340 			mutex_exit(&rp->r_statev4_lock);
341 
342 			/*
343 	 * Use inline XDR code directly; we know that we are
344 	 * going to a memory buffer and it has enough
345 	 * space, so it cannot fail.
346 			 */
347 			IXDR_PUT_U_HYPER(fdata, change);
348 			fap->attrlist4_len += 2 * BYTES_PER_XDR_UNIT;
349 			fap->attrmask |= FATTR4_CHANGE_MASK;
350 		}
351 
352 		if (args->attr_request & FATTR4_SIZE_MASK) {
353 			/*
354 			 * Use an atomic add of 0 to fetch a consistent view
355 			 * of r_size; this avoids having to take rw_lock
356 			 * which could cause a deadlock.
357 			 */
358 			size = atomic_add_64_nv((uint64_t *)&rp->r_size, 0);
359 
360 			/*
361 	 * Use inline XDR code directly; we know that we are
362 	 * going to a memory buffer and it has enough
363 	 * space, so it cannot fail.
364 			 */
365 			IXDR_PUT_U_HYPER(fdata, size);
366 			fap->attrlist4_len += 2 * BYTES_PER_XDR_UNIT;
367 			fap->attrmask |= FATTR4_SIZE_MASK;
368 		}
369 	}
370 
371 	VN_RELE(vp);
372 
373 	*cs->statusp = resp->status = NFS4_OK;
374 }
375 
376 static void
377 cb_getattr_free(nfs_cb_resop4 *resop)
378 {
379 	if (resop->nfs_cb_resop4_u.opcbgetattr.obj_attributes.attrlist4)
380 		kmem_free(resop->nfs_cb_resop4_u.opcbgetattr.
381 		    obj_attributes.attrlist4, cb_getattr_bytes);
382 }
383 
384 static void
385 cb_recall(nfs_cb_argop4 *argop, nfs_cb_resop4 *resop, struct svc_req *req,
386 	struct compound_state *cs, struct nfs4_callback_globals *ncg)
387 {
388 	CB_RECALL4args *args = &argop->nfs_cb_argop4_u.opcbrecall;
389 	CB_RECALL4res *resp = &resop->nfs_cb_resop4_u.opcbrecall;
390 	rnode4_t *rp;
391 	vnode_t *vp;
392 	struct nfs4_server *sp;
393 	bool_t found = FALSE;
394 	bool_t retried = FALSE;
395 
396 	ncg->nfs4_callback_stats.cb_recall.value.ui64++;
397 
398 	ASSERT(req->rq_prog >= NFS4_CALLBACK);
399 	ASSERT(req->rq_prog < NFS4_CALLBACK+nfs4_num_prognums);
400 
401 #ifdef DEBUG
402 	/*
403 	 * error injection hook: set cb_recall_fail global to
404 	 * the NFS4 protocol error to be returned
405 	 */
406 	if (cb4_recall_fail != NFS4_OK) {
407 		*cs->statusp = resp->status = cb4_recall_fail;
408 		return;
409 	}
410 #endif
411 
412 	mutex_enter(&ncg->nfs4_cb_lock);
413 	sp = ncg->nfs4prog2server[req->rq_prog - NFS4_CALLBACK];
414 	mutex_exit(&ncg->nfs4_cb_lock);
415 
416 	if (nfs4_server_vlock(sp, 0) == FALSE) {
417 
418 		CB_WARN("cb_recall: cannot find server\n");
419 
420 		*cs->statusp = resp->status = NFS4ERR_BADHANDLE;
421 		return;
422 	}
423 
424 retry:
425 	/*
426 	 * Search the delegation list for a matching file handle
427 	 * AND stateid; mutex on sp prevents the list from changing.
428 	 */
429 
430 	rp = list_head(&sp->s_deleg_list);
431 	for (; rp != NULL; rp = list_next(&sp->s_deleg_list, rp)) {
432 		mutex_enter(&rp->r_statev4_lock);
433 
434 		/* check both state id and file handle! */
435 
436 		if ((bcmp(&rp->r_deleg_stateid, &args->stateid,
437 		    sizeof (stateid4)) == 0)) {
438 			nfs4_fhandle_t fhandle;
439 
440 			sfh4_copyval(rp->r_fh, &fhandle);
441 			if ((fhandle.fh_len == args->fh.nfs_fh4_len &&
442 			    bcmp(fhandle.fh_buf, args->fh.nfs_fh4_val,
443 			    fhandle.fh_len) == 0)) {
444 
445 				found = TRUE;
446 				break;
447 			} else {
448 #ifdef	DEBUG
449 				CB_WARN("cb_recall: stateid OK, bad fh");
450 #endif
451 			}
452 		}
453 #ifdef	DEBUG
454 		if (bcmp(&args->stateid, &nfs4_deleg_any,
455 		    sizeof (stateid4)) == 0) {
456 
457 			found = TRUE;
458 			break;
459 		}
460 #endif
461 		mutex_exit(&rp->r_statev4_lock);
462 	}
463 
464 	/*
465 	 * VN_HOLD the vnode before releasing s_lock to guarantee
466 	 * we have a valid vnode reference.  The async thread will
467 	 * release the hold when it's done.
468 	 */
469 	if (found == TRUE) {
470 		mutex_exit(&rp->r_statev4_lock);
471 		vp = RTOV4(rp);
472 		VN_HOLD(vp);
473 	}
474 	mutex_exit(&sp->s_lock);
475 
476 	if (found == FALSE && retried == FALSE) {
477 		nfs4_recall_sync_wait(sp);
478 		mutex_enter(&sp->s_lock);
479 		retried = TRUE;
480 		goto retry;
481 	}
482 
483 	nfs4_server_rele(sp);
484 
485 	if (found == FALSE) {
486 
487 		CB_WARN("cb_recall: bad stateid\n");
488 
489 		*cs->statusp = resp->status = NFS4ERR_BAD_STATEID;
490 		return;
491 	}
492 
493 	/* Fire up a thread to do the delegreturn */
494 	nfs4delegreturn_async(rp, NFS4_DR_RECALL|NFS4_DR_REOPEN,
495 	    args->truncate);
496 
497 	*cs->statusp = resp->status = NFS4_OK;
498 }
499 
500 /* ARGSUSED */
501 static void
502 cb_recall_free(nfs_cb_resop4 *resop)
503 {
504 	/* nothing to do here, cb_recall doesn't kmem_alloc */
505 }
506 
507 /*
508  * This function handles the CB_NULL proc call from an NFSv4 Server.
509  *
510  * We take note that the server has sent a CB_NULL for later processing
511  * in the recovery logic. It is noted so we may pause slightly after the
512  * setclientid and before reopening files. The pause is to allow the
513  * NFSv4 Server time to receive the CB_NULL reply and adjust any of
514  * its internal structures such that it has the opportunity to grant
515  * delegations to reopened files.
516  *
517  */
518 
519 /* ARGSUSED */
520 static void
521 cb_null(CB_COMPOUND4args *args, CB_COMPOUND4res *resp, struct svc_req *req,
522     struct nfs4_callback_globals *ncg)
523 {
524 	struct nfs4_server *sp;
525 
526 	ncg->nfs4_callback_stats.cb_null.value.ui64++;
527 
528 	ASSERT(req->rq_prog >= NFS4_CALLBACK);
529 	ASSERT(req->rq_prog < NFS4_CALLBACK+nfs4_num_prognums);
530 
531 	mutex_enter(&ncg->nfs4_cb_lock);
532 	sp = ncg->nfs4prog2server[req->rq_prog - NFS4_CALLBACK];
533 	mutex_exit(&ncg->nfs4_cb_lock);
534 
535 	if (nfs4_server_vlock(sp, 0) != FALSE) {
536 		sp->s_flags |= N4S_CB_PINGED;
537 		cv_broadcast(&sp->wait_cb_null);
538 		mutex_exit(&sp->s_lock);
539 		nfs4_server_rele(sp);
540 	}
541 }
542 
543 /*
544  * cb_illegal	args: void
545  *		res : status (NFS4ERR_OP_CB_ILLEGAL)
546  *		res : status (NFS4ERR_OP_ILLEGAL)
547 /* ARGSUSED */
548 static void
549 cb_illegal(nfs_cb_argop4 *argop, nfs_cb_resop4 *resop, struct svc_req *req,
550 	struct compound_state *cs, struct nfs4_callback_globals *ncg)
551 {
552 	CB_ILLEGAL4res *resp = &resop->nfs_cb_resop4_u.opcbillegal;
553 
554 	ncg->nfs4_callback_stats.cb_illegal.value.ui64++;
555 	resop->resop = OP_CB_ILLEGAL;
556 	*cs->statusp = resp->status = NFS4ERR_OP_ILLEGAL;
557 }
558 
559 static void
560 cb_compound(CB_COMPOUND4args *args, CB_COMPOUND4res *resp, struct svc_req *req,
561 	struct nfs4_callback_globals *ncg)
562 {
563 	uint_t i;
564 	struct compound_state cs;
565 	nfs_cb_argop4 *argop;
566 	nfs_cb_resop4 *resop, *new_res;
567 	uint_t op;
568 
569 	bzero(&cs, sizeof (cs));
570 	cs.statusp = &resp->status;
571 	cs.cont = TRUE;
572 
573 	/*
574 	 * Form a reply tag by copying over the request tag.
575 	 */
576 	resp->tag.utf8string_len = args->tag.utf8string_len;
577 	resp->tag.utf8string_val = kmem_alloc(resp->tag.utf8string_len,
578 	    KM_SLEEP);
579 	bcopy(args->tag.utf8string_val, resp->tag.utf8string_val,
580 	    args->tag.utf8string_len);
581 
582 	/*
583 	 * XXX for now, minorversion should be zero
584 	 */
585 	if (args->minorversion != CB4_MINORVERSION) {
586 		resp->array_len = 0;
587 		resp->array = NULL;
588 		resp->status = NFS4ERR_MINOR_VERS_MISMATCH;
589 		return;
590 	}
591 
592 #ifdef DEBUG
593 	/*
594 	 * Verify callback_ident.  It doesn't really matter if it's wrong
595 	 * because we don't really use callback_ident -- we use prog number
596 	 * of the RPC request instead.  In this case, just print a DEBUG
597 	 * console message to reveal brokenness of cbclient (at bkoff/cthon).
598 	 */
599 	if (args->callback_ident != req->rq_prog)
600 		zcmn_err(getzoneid(), CE_WARN,
601 		    "cb_compound: cb_client using wrong "
602 		    "callback_ident(%d), should be %d",
603 		    args->callback_ident, req->rq_prog);
604 #endif
605 
606 	resp->array_len = args->array_len;
607 	resp->array = kmem_zalloc(args->array_len * sizeof (nfs_cb_resop4),
608 	    KM_SLEEP);
609 
610 	for (i = 0; i < args->array_len && cs.cont; i++) {
611 
612 		argop = &args->array[i];
613 		resop = &resp->array[i];
614 		resop->resop = argop->argop;
615 		op = (uint_t)resop->resop;
616 
617 		switch (op) {
618 
619 		case OP_CB_GETATTR:
620 
621 			cb_getattr(argop, resop, req, &cs, ncg);
622 			break;
623 
624 		case OP_CB_RECALL:
625 
626 			cb_recall(argop, resop, req, &cs, ncg);
627 			break;
628 
629 		case OP_CB_ILLEGAL:
630 
631 			/* fall through */
632 
633 		default:
634 			/*
635 			 * Handle OP_CB_ILLEGAL and any undefined opcode.
636 			 * Currently, the XDR code will return BADXDR
637 			 * if cb op doesn't decode to legal value, so
638 			 * it really only handles OP_CB_ILLEGAL.
639 			 */
640 			op = OP_CB_ILLEGAL;
641 			cb_illegal(argop, resop, req, &cs, ncg);
642 		}
643 
644 		if (*cs.statusp != NFS4_OK)
645 			cs.cont = FALSE;
646 
647 		/*
648 		 * If not at last op, and if we are to stop, then
649 		 * compact the results array.
650 		 */
651 		if ((i + 1) < args->array_len && !cs.cont) {
652 
653 			new_res = kmem_alloc(
654 			    (i+1) * sizeof (nfs_cb_resop4), KM_SLEEP);
655 			bcopy(resp->array,
656 			    new_res, (i+1) * sizeof (nfs_cb_resop4));
657 			kmem_free(resp->array,
658 			    args->array_len * sizeof (nfs_cb_resop4));
659 
660 			resp->array_len = i + 1;
661 			resp->array = new_res;
662 		}
663 	}
664 
665 }
666 
667 static void
668 cb_compound_free(CB_COMPOUND4res *resp)
669 {
670 	uint_t i, op;
671 	nfs_cb_resop4 *resop;
672 
673 	if (resp->tag.utf8string_val) {
674 		UTF8STRING_FREE(resp->tag)
675 	}
676 
677 	for (i = 0; i < resp->array_len; i++) {
678 
679 		resop = &resp->array[i];
680 		op = (uint_t)resop->resop;
681 
682 		switch (op) {
683 
684 		case OP_CB_GETATTR:
685 
686 			cb_getattr_free(resop);
687 			break;
688 
689 		case OP_CB_RECALL:
690 
691 			cb_recall_free(resop);
692 			break;
693 
694 		default:
695 			break;
696 		}
697 	}
698 
699 	if (resp->array != NULL) {
700 		kmem_free(resp->array,
701 		    resp->array_len * sizeof (nfs_cb_resop4));
702 	}
703 }
704 
705 static void
706 cb_dispatch(struct svc_req *req, SVCXPRT *xprt)
707 {
708 	CB_COMPOUND4args args;
709 	CB_COMPOUND4res res;
710 	struct nfs4_callback_globals *ncg;
711 
712 	bool_t (*xdr_args)(), (*xdr_res)();
713 	void (*proc)(CB_COMPOUND4args *, CB_COMPOUND4res *, struct svc_req *,
714 	    struct nfs4_callback_globals *);
715 	void (*freeproc)(CB_COMPOUND4res *);
716 
717 	ncg = zone_getspecific(nfs4_callback_zone_key, nfs_zone());
718 	ASSERT(ncg != NULL);
719 
720 	ncg->nfs4_callback_stats.cb_dispatch.value.ui64++;
721 
722 	switch (req->rq_proc) {
723 	case CB_NULL:
724 		xdr_args = xdr_void;
725 		xdr_res = xdr_void;
726 		proc = cb_null;
727 		freeproc = NULL;
728 		break;
729 
730 	case CB_COMPOUND:
731 		xdr_args = xdr_CB_COMPOUND4args_clnt;
732 		xdr_res = xdr_CB_COMPOUND4res;
733 		proc = cb_compound;
734 		freeproc = cb_compound_free;
735 		break;
736 
737 	default:
738 		CB_WARN("cb_dispatch: no proc\n");
739 		svcerr_noproc(xprt);
740 		return;
741 	}
742 
743 	args.tag.utf8string_val = NULL;
744 	args.array = NULL;
745 
746 	if (!SVC_GETARGS(xprt, xdr_args, (caddr_t)&args)) {
747 
748 		CB_WARN("cb_dispatch: cannot getargs\n");
749 		svcerr_decode(xprt);
750 		return;
751 	}
752 
753 	(*proc)(&args, &res, req, ncg);
754 
755 	if (svc_sendreply(xprt, xdr_res, (caddr_t)&res) == FALSE) {
756 
757 		CB_WARN("cb_dispatch: bad sendreply\n");
758 		svcerr_systemerr(xprt);
759 	}
760 
761 	if (freeproc)
762 		(*freeproc)(&res);
763 
764 	if (!SVC_FREEARGS(xprt, xdr_args, (caddr_t)&args)) {
765 
766 		CB_WARN("cb_dispatch: bad freeargs\n");
767 	}
768 }
769 
770 static rpcprog_t
771 nfs4_getnextprogram(struct nfs4_callback_globals *ncg)
772 {
773 	int i, j;
774 
775 	j = ncg->nfs4_program_hint;
776 	for (i = 0; i < nfs4_num_prognums; i++, j++) {
777 
778 		if (j >= nfs4_num_prognums)
779 			j = 0;
780 
781 		if (ncg->nfs4prog2server[j] == NULL) {
782 			ncg->nfs4_program_hint = j+1;
783 			return (j+NFS4_CALLBACK);
784 		}
785 	}
786 
787 	return (0);
788 }
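
/*
 * Illustrative example: with nfs4_program_hint == 5 and slots 5 and 6
 * still in use, the loop above settles on slot 7, advances the hint
 * to 8 and returns NFS4_CALLBACK + 7.  If all nfs4_num_prognums slots
 * are taken, 0 is returned and nfs4_cb_args() fails the callback setup.
 */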
789 
790 void
791 nfs4callback_destroy(nfs4_server_t *np)
792 {
793 	struct nfs4_callback_globals *ncg;
794 	int i;
795 
796 	if (np->s_program == 0)
797 		return;
798 
799 	ncg = np->zone_globals;
800 	i = np->s_program - NFS4_CALLBACK;
801 
802 	mutex_enter(&ncg->nfs4_cb_lock);
803 
804 	ASSERT(ncg->nfs4prog2server[i] == np);
805 
806 	ncg->nfs4prog2server[i] = NULL;
807 
808 	if (i < ncg->nfs4_program_hint)
809 		ncg->nfs4_program_hint = i;
810 
811 	mutex_exit(&ncg->nfs4_cb_lock);
812 }
813 
814 /*
815  * nfs4_setport - This function saves a netid and universal address for
816  * the callback program.  These values will be used during setclientid.
817  */
818 static void
819 nfs4_setport(char *netid, char *uaddr, char *protofmly, char *proto,
820 	struct nfs4_callback_globals *ncg)
821 {
822 	struct nfs4_cb_port *p;
823 	bool_t found = FALSE;
824 
825 	ASSERT(MUTEX_HELD(&ncg->nfs4_cb_lock));
826 
827 	p = list_head(&ncg->nfs4_cb_ports);
828 	for (; p != NULL; p = list_next(&ncg->nfs4_cb_ports, p)) {
829 		if (strcmp(p->netid, netid) == 0) {
830 			found = TRUE;
831 			break;
832 		}
833 	}
834 	if (found == TRUE)
835 		(void) strcpy(p->uaddr, uaddr);
836 	else {
837 		p = kmem_alloc(sizeof (*p), KM_SLEEP);
838 
839 		(void) strcpy(p->uaddr, uaddr);
840 		(void) strcpy(p->netid, netid);
841 		(void) strcpy(p->protofmly, protofmly);
842 		(void) strcpy(p->proto, proto);
843 		list_insert_head(&ncg->nfs4_cb_ports, p);
844 	}
845 }
846 
847 /*
848  * nfs4_cb_args - This function is used to construct the callback
849  * portion of the arguments needed for setclientid.
850  */
851 
852 void
853 nfs4_cb_args(nfs4_server_t *np, struct knetconfig *knc, SETCLIENTID4args *args)
854 {
855 	struct nfs4_cb_port *p;
856 	bool_t found = FALSE;
857 	rpcprog_t pgm;
858 	struct nfs4_callback_globals *ncg = np->zone_globals;
859 
860 	/*
861 	 * This server structure may already have a program number
862 	 * assigned to it.  This happens when the client has to
863 	 * re-issue SETCLIENTID.  Just re-use the information.
864 	 */
865 	if (np->s_program >= NFS4_CALLBACK &&
866 	    np->s_program < NFS4_CALLBACK + nfs4_num_prognums)
867 		nfs4callback_destroy(np);
868 
869 	mutex_enter(&ncg->nfs4_cb_lock);
870 
871 	p = list_head(&ncg->nfs4_cb_ports);
872 	for (; p != NULL; p = list_next(&ncg->nfs4_cb_ports, p)) {
873 		if (strcmp(p->protofmly, knc->knc_protofmly) == 0 &&
874 		    strcmp(p->proto, knc->knc_proto) == 0) {
875 			found = TRUE;
876 			break;
877 		}
878 	}
879 
880 	if (found == FALSE) {
881 
882 		NFS4_DEBUG(nfs4_callback_debug,
883 		    (CE_WARN, "nfs4_cb_args: could not find netid for %s/%s\n",
884 		    knc->knc_protofmly, knc->knc_proto));
885 
886 		args->callback.cb_program = 0;
887 		args->callback.cb_location.r_netid = NULL;
888 		args->callback.cb_location.r_addr = NULL;
889 		args->callback_ident = 0;
890 		mutex_exit(&ncg->nfs4_cb_lock);
891 		return;
892 	}
893 
894 	if ((pgm = nfs4_getnextprogram(ncg)) == 0) {
895 		CB_WARN("nfs4_cb_args: out of program numbers\n");
896 
897 		args->callback.cb_program = 0;
898 		args->callback.cb_location.r_netid = NULL;
899 		args->callback.cb_location.r_addr = NULL;
900 		args->callback_ident = 0;
901 		mutex_exit(&ncg->nfs4_cb_lock);
902 		return;
903 	}
904 
905 	ncg->nfs4prog2server[pgm-NFS4_CALLBACK] = np;
906 	args->callback.cb_program = pgm;
907 	args->callback.cb_location.r_netid = p->netid;
908 	args->callback.cb_location.r_addr = p->uaddr;
909 	args->callback_ident = pgm;
910 
911 	np->s_program = pgm;
912 
913 	mutex_exit(&ncg->nfs4_cb_lock);
914 }
915 
916 static int
917 nfs4_dquery(struct nfs4_svc_args *arg, model_t model)
918 {
919 	file_t *fp;
920 	vnode_t *vp;
921 	rnode4_t *rp;
922 	int error;
923 	STRUCT_HANDLE(nfs4_svc_args, uap);
924 
925 	STRUCT_SET_HANDLE(uap, model, arg);
926 
927 	if ((fp = getf(STRUCT_FGET(uap, fd))) == NULL)
928 		return (EBADF);
929 
930 	vp = fp->f_vnode;
931 
932 	if (vp == NULL || vp->v_type != VREG ||
933 	    !vn_matchops(vp, nfs4_vnodeops)) {
934 		releasef(STRUCT_FGET(uap, fd));
935 		return (EBADF);
936 	}
937 
938 	rp = VTOR4(vp);
939 
940 	/*
941 	 * I can't convince myself that we need locking here.  The
942 	 * rnode cannot disappear and the value returned is instantly
943 	 * stale anyway, so why bother?
944 	 */
945 
946 	error = suword32(STRUCT_FGETP(uap, netid), rp->r_deleg_type);
947 	releasef(STRUCT_FGET(uap, fd));
948 	return (error);
949 }
950 
951 
952 /*
953  * NFS4 client system call.  This service does the
954  * necessary initialization for the callback program.
955  * This is fashioned after the server side interaction
956  * between nfsd and the kernel.  On the client, the
957  * mount command forks and the child process does the
958  * necessary interaction with the kernel.
959  *
960  * uap->fd is the fd of an open transport provider
961  */
962 int
963 nfs4_svc(struct nfs4_svc_args *arg, model_t model)
964 {
965 	file_t *fp;
966 	int error;
967 	int readsize;
968 	char buf[KNC_STRSIZE], uaddr[KNC_STRSIZE];
969 	char protofmly[KNC_STRSIZE], proto[KNC_STRSIZE];
970 	size_t len;
971 	STRUCT_HANDLE(nfs4_svc_args, uap);
972 	struct netbuf addrmask;
973 	int cmd;
974 	SVCMASTERXPRT *cb_xprt;
975 	struct nfs4_callback_globals *ncg;
976 
977 #ifdef lint
978 	model = model;		/* STRUCT macros don't always refer to it */
979 #endif
980 
981 	STRUCT_SET_HANDLE(uap, model, arg);
982 
983 	if (STRUCT_FGET(uap, cmd) == NFS4_DQUERY)
984 		return (nfs4_dquery(arg, model));
985 
986 	if (secpolicy_nfs(CRED()) != 0)
987 		return (EPERM);
988 
989 	if ((fp = getf(STRUCT_FGET(uap, fd))) == NULL)
990 		return (EBADF);
991 
992 	/*
993 	 * Set read buffer size to rsize
994 	 * and add room for RPC headers.
995 	 */
996 	readsize = nfs3tsize() + (RPC_MAXDATASIZE - NFS_MAXDATA);
997 	if (readsize < RPC_MAXDATASIZE)
998 		readsize = RPC_MAXDATASIZE;
999 
1000 	error = copyinstr((const char *)STRUCT_FGETP(uap, netid), buf,
1001 	    KNC_STRSIZE, &len);
1002 	if (error) {
1003 		releasef(STRUCT_FGET(uap, fd));
1004 		return (error);
1005 	}
1006 
1007 	cmd = STRUCT_FGET(uap, cmd);
1008 
1009 	if (cmd & NFS4_KRPC_START) {
1010 		addrmask.len = STRUCT_FGET(uap, addrmask.len);
1011 		addrmask.maxlen = STRUCT_FGET(uap, addrmask.maxlen);
1012 		addrmask.buf = kmem_alloc(addrmask.maxlen, KM_SLEEP);
1013 		error = copyin(STRUCT_FGETP(uap, addrmask.buf), addrmask.buf,
1014 		    addrmask.len);
1015 		if (error) {
1016 			releasef(STRUCT_FGET(uap, fd));
1017 			kmem_free(addrmask.buf, addrmask.maxlen);
1018 			return (error);
1019 		}
1020 	} else {
1021 		addrmask.buf = NULL;
1022 	}
1023 
1024 	error = copyinstr((const char *)STRUCT_FGETP(uap, addr), uaddr,
1025 	    sizeof (uaddr), &len);
1026 	if (error) {
1027 		releasef(STRUCT_FGET(uap, fd));
1028 		if (addrmask.buf)
1029 			kmem_free(addrmask.buf, addrmask.maxlen);
1030 		return (error);
1031 	}
1032 
1033 	error = copyinstr((const char *)STRUCT_FGETP(uap, protofmly), protofmly,
1034 	    sizeof (protofmly), &len);
1035 	if (error) {
1036 		releasef(STRUCT_FGET(uap, fd));
1037 		if (addrmask.buf)
1038 			kmem_free(addrmask.buf, addrmask.maxlen);
1039 		return (error);
1040 	}
1041 
1042 	error = copyinstr((const char *)STRUCT_FGETP(uap, proto), proto,
1043 	    sizeof (proto), &len);
1044 	if (error) {
1045 		releasef(STRUCT_FGET(uap, fd));
1046 		if (addrmask.buf)
1047 			kmem_free(addrmask.buf, addrmask.maxlen);
1048 		return (error);
1049 	}
1050 
1051 	ncg = zone_getspecific(nfs4_callback_zone_key, nfs_zone());
1052 	ASSERT(ncg != NULL);
1053 
1054 	mutex_enter(&ncg->nfs4_cb_lock);
1055 	if (cmd & NFS4_SETPORT)
1056 		nfs4_setport(buf, uaddr, protofmly, proto, ncg);
1057 
1058 	if (cmd & NFS4_KRPC_START) {
1059 		error = svc_tli_kcreate(fp, readsize, buf, &addrmask, &cb_xprt,
1060 		    &nfs4_cb_sct, NULL, NFS_CB_SVCPOOL_ID, FALSE);
1061 		if (error) {
1062 			CB_WARN1("nfs4_svc: svc_tli_kcreate failed %d\n",
1063 			    error);
1064 			kmem_free(addrmask.buf, addrmask.maxlen);
1065 		}
1066 	}
1067 
1068 	mutex_exit(&ncg->nfs4_cb_lock);
1069 	releasef(STRUCT_FGET(uap, fd));
1070 	return (error);
1071 }
1072 
1073 struct nfs4_callback_globals *
1074 nfs4_get_callback_globals(void)
1075 {
1076 	return (zone_getspecific(nfs4_callback_zone_key, nfs_zone()));
1077 }
1078 
1079 static void *
1080 nfs4_callback_init_zone(zoneid_t zoneid)
1081 {
1082 	kstat_t *nfs4_callback_kstat;
1083 	struct nfs4_callback_globals *ncg;
1084 
1085 	ncg = kmem_zalloc(sizeof (*ncg), KM_SLEEP);
1086 
1087 	ncg->nfs4prog2server = kmem_zalloc(nfs4_num_prognums *
1088 	    sizeof (struct nfs4_server *), KM_SLEEP);
1089 
1090 	/* initialize the dlist */
1091 	mutex_init(&ncg->nfs4_dlist_lock, NULL, MUTEX_DEFAULT, NULL);
1092 	list_create(&ncg->nfs4_dlist, sizeof (struct nfs4_dnode),
1093 	    offsetof(struct nfs4_dnode, linkage));
1094 
1095 	/* initialize cb_port list */
1096 	mutex_init(&ncg->nfs4_cb_lock, NULL, MUTEX_DEFAULT, NULL);
1097 	list_create(&ncg->nfs4_cb_ports, sizeof (struct nfs4_cb_port),
1098 	    offsetof(struct nfs4_cb_port, linkage));
1099 
1100 	/* get our own copy of the kstats */
1101 	bcopy(&nfs4_callback_stats_tmpl, &ncg->nfs4_callback_stats,
1102 	    sizeof (nfs4_callback_stats_tmpl));
1103 	/* register "nfs:0:nfs4_callback_stats" for this zone */
1104 	if ((nfs4_callback_kstat =
1105 	    kstat_create_zone("nfs", 0, "nfs4_callback_stats", "misc",
1106 	    KSTAT_TYPE_NAMED,
1107 	    sizeof (ncg->nfs4_callback_stats) / sizeof (kstat_named_t),
1108 	    KSTAT_FLAG_VIRTUAL | KSTAT_FLAG_WRITABLE,
1109 	    zoneid)) != NULL) {
1110 		nfs4_callback_kstat->ks_data = &ncg->nfs4_callback_stats;
1111 		kstat_install(nfs4_callback_kstat);
1112 	}
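
	/*
	 * The counters registered above can be inspected from userland
	 * with, e.g., kstat(1M): "kstat nfs:0:nfs4_callback_stats".
	 */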
1113 	return (ncg);
1114 }
1115 
1116 static void
1117 nfs4_discard_delegations(struct nfs4_callback_globals *ncg)
1118 {
1119 	nfs4_server_t *sp;
1120 	int i, num_removed;
1121 
1122 	/*
1123 	 * It's OK here to just run through the registered "programs", as
1124 	 * servers without programs won't have any delegations to handle.
1125 	 */
1126 	for (i = 0; i < nfs4_num_prognums; i++) {
1127 		rnode4_t *rp;
1128 
1129 		mutex_enter(&ncg->nfs4_cb_lock);
1130 		sp = ncg->nfs4prog2server[i];
1131 		mutex_exit(&ncg->nfs4_cb_lock);
1132 
1133 		if (nfs4_server_vlock(sp, 1) == FALSE)
1134 			continue;
1135 		num_removed = 0;
1136 		while ((rp = list_head(&sp->s_deleg_list)) != NULL) {
1137 			mutex_enter(&rp->r_statev4_lock);
1138 			if (rp->r_deleg_type == OPEN_DELEGATE_NONE) {
1139 				/*
1140 				 * We need to take matters into our own hands,
1141 				 * as nfs4delegreturn_cleanup_impl() won't
1142 				 * remove this from the list.
1143 				 */
1144 				list_remove(&sp->s_deleg_list, rp);
1145 				mutex_exit(&rp->r_statev4_lock);
1146 				nfs4_dec_state_ref_count_nolock(sp,
1147 				    VTOMI4(RTOV4(rp)));
1148 				num_removed++;
1149 				continue;
1150 			}
1151 			mutex_exit(&rp->r_statev4_lock);
1152 			VN_HOLD(RTOV4(rp));
1153 			mutex_exit(&sp->s_lock);
1154 			/*
1155 			 * The following will remove the node from the list.
1156 			 */
1157 			nfs4delegreturn_cleanup_impl(rp, sp, ncg);
1158 			VN_RELE(RTOV4(rp));
1159 			mutex_enter(&sp->s_lock);
1160 		}
1161 		mutex_exit(&sp->s_lock);
1162 		/* each removed list node reles a reference */
1163 		while (num_removed-- > 0)
1164 			nfs4_server_rele(sp);
1165 		/* remove our reference for nfs4_server_vlock */
1166 		nfs4_server_rele(sp);
1167 	}
1168 }
1169 
1170 /* ARGSUSED */
1171 static void
1172 nfs4_callback_shutdown_zone(zoneid_t zoneid, void *data)
1173 {
1174 	struct nfs4_callback_globals *ncg = data;
1175 
1176 	/*
1177 	 * Clean pending delegation return list.
1178 	 */
1179 	nfs4_dlistclean_impl(ncg, NFS4_DR_DISCARD);
1180 
1181 	/*
1182 	 * Discard all delegations.
1183 	 */
1184 	nfs4_discard_delegations(ncg);
1185 }
1186 
1187 static void
1188 nfs4_callback_fini_zone(zoneid_t zoneid, void *data)
1189 {
1190 	struct nfs4_callback_globals *ncg = data;
1191 	struct nfs4_cb_port *p;
1192 	nfs4_server_t *sp, *next;
1193 	nfs4_server_t freelist;
1194 	int i;
1195 
1196 	kstat_delete_byname_zone("nfs", 0, "nfs4_callback_stats", zoneid);
1197 
1198 	/*
1199 	 * Discard all delegations that may have crept in since we did the
1200 	 * _shutdown.
1201 	 */
1202 	nfs4_discard_delegations(ncg);
1203 	/*
1204 	 * We're completely done with this zone and all associated
1205 	 * nfs4_server_t's.  Any remaining nfs4_server_ts should only have one
1206 	 * more reference outstanding -- the reference we didn't release in
1207 	 * nfs4_renew_lease_thread().
1208 	 *
1209 	 * Here we need to run through the global nfs4_server_lst as we need to
1210  * deal with nfs4_server_t's without programs, as they also have threads
1211 	 * created for them, and so have outstanding references that we need to
1212 	 * release.
1213 	 */
1214 	freelist.forw = &freelist;
1215 	freelist.back = &freelist;
1216 	mutex_enter(&nfs4_server_lst_lock);
1217 	sp = nfs4_server_lst.forw;
1218 	while (sp != &nfs4_server_lst) {
1219 		next = sp->forw;
1220 		if (sp->zoneid == zoneid) {
1221 			remque(sp);
1222 			insque(sp, &freelist);
1223 		}
1224 		sp = next;
1225 	}
1226 	mutex_exit(&nfs4_server_lst_lock);
1227 
1228 	sp = freelist.forw;
1229 	while (sp != &freelist) {
1230 		next = sp->forw;
1231 		nfs4_server_rele(sp);	/* free the list's reference */
1232 		sp = next;
1233 	}
1234 
1235 #ifdef DEBUG
1236 	for (i = 0; i < nfs4_num_prognums; i++) {
1237 		ASSERT(ncg->nfs4prog2server[i] == NULL);
1238 	}
1239 #endif
1240 	kmem_free(ncg->nfs4prog2server, nfs4_num_prognums *
1241 	    sizeof (struct nfs4_server *));
1242 
1243 	mutex_enter(&ncg->nfs4_cb_lock);
1244 	while ((p = list_head(&ncg->nfs4_cb_ports)) != NULL) {
1245 		list_remove(&ncg->nfs4_cb_ports, p);
1246 		kmem_free(p, sizeof (*p));
1247 	}
1248 	list_destroy(&ncg->nfs4_cb_ports);
1249 	mutex_destroy(&ncg->nfs4_cb_lock);
1250 	list_destroy(&ncg->nfs4_dlist);
1251 	mutex_destroy(&ncg->nfs4_dlist_lock);
1252 	kmem_free(ncg, sizeof (*ncg));
1253 }
1254 
1255 void
1256 nfs4_callback_init(void)
1257 {
1258 	int i;
1259 	SVC_CALLOUT *nfs4_cb_sc;
1260 
1261 	/* initialize the callback table */
1262 	nfs4_cb_sc = kmem_alloc(nfs4_num_prognums *
1263 	    sizeof (SVC_CALLOUT), KM_SLEEP);
1264 
1265 	for (i = 0; i < nfs4_num_prognums; i++) {
1266 		nfs4_cb_sc[i].sc_prog = NFS4_CALLBACK+i;
1267 		nfs4_cb_sc[i].sc_versmin = NFS_CB;
1268 		nfs4_cb_sc[i].sc_versmax = NFS_CB;
1269 		nfs4_cb_sc[i].sc_dispatch = cb_dispatch;
1270 	}
1271 
1272 	nfs4_cb_sct.sct_size = nfs4_num_prognums;
1273 	nfs4_cb_sct.sct_free = FALSE;
1274 	nfs4_cb_sct.sct_sc = nfs4_cb_sc;
1275 
1276 	/*
1277 	 * Compute max bytes required for dynamically allocated parts
1278 	 * of cb_getattr reply.  Only size and change are supported now.
1279 	 * If CB_GETATTR is changed to reply with additional attrs,
1280 	 * additional sizes must be added below.
1281 	 *
1282 	 * fattr4_change + fattr4_size == uint64_t + uint64_t
1283 	 */
1284 	cb_getattr_bytes = 2 * BYTES_PER_XDR_UNIT + 2 * BYTES_PER_XDR_UNIT;
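
	/*
	 * Illustrative arithmetic: BYTES_PER_XDR_UNIT is 4, so this
	 * reserves 8 bytes for fattr4_change plus 8 bytes for
	 * fattr4_size, i.e. a 16-byte attrlist4 buffer for cb_getattr()
	 * to fill in.
	 */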
1285 
1286 	zone_key_create(&nfs4_callback_zone_key, nfs4_callback_init_zone,
1287 	    nfs4_callback_shutdown_zone, nfs4_callback_fini_zone);
1288 }
1289 
1290 void
1291 nfs4_callback_fini(void)
1292 {
1293 }
1294 
1295 /*
1296  * NB: This function can be called from the *wrong* zone (i.e., the zone
1297  * that 'rp' belongs to may not be the caller's zone).  This can happen
1298  * if the zone is going away and we get called from nfs4_async_inactive().  In
1299  * this case the globals will be NULL and we won't update the counters, which
1300  * doesn't matter as the zone is going away anyhow.
1301  */
1302 static void
1303 nfs4delegreturn_cleanup_impl(rnode4_t *rp, nfs4_server_t *np,
1304 	struct nfs4_callback_globals *ncg)
1305 {
1306 	mntinfo4_t *mi = VTOMI4(RTOV4(rp));
1307 	boolean_t need_rele = B_FALSE;
1308 
1309 	/*
1310 	 * Caller must be holding mi_recovlock in read mode
1311 	 * to call here.  This is provided by start_op.
1312 	 * Delegation management requires grabbing s_lock
1313 	 * first and then r_statev4_lock.
1314 	 */
1315 
1316 	if (np == NULL) {
1317 		np = find_nfs4_server_all(mi, 1);
1318 		if (np == NULL)
1319 			return;
1320 		need_rele = B_TRUE;
1321 	} else {
1322 		mutex_enter(&np->s_lock);
1323 	}
1324 
1325 	mutex_enter(&rp->r_statev4_lock);
1326 
1327 	if (rp->r_deleg_type == OPEN_DELEGATE_NONE) {
1328 		mutex_exit(&rp->r_statev4_lock);
1329 		mutex_exit(&np->s_lock);
1330 		if (need_rele)
1331 			nfs4_server_rele(np);
1332 		return;
1333 	}
1334 
1335 	/*
1336 	 * Free the cred originally held when
1337 	 * the delegation was granted.  Caller must
1338 	 * hold this cred if it wants to use it after
1339 	 * this call.
1340 	 */
1341 	crfree(rp->r_deleg_cred);
1342 	rp->r_deleg_cred = NULL;
1343 	rp->r_deleg_type = OPEN_DELEGATE_NONE;
1344 	rp->r_deleg_needs_recovery = OPEN_DELEGATE_NONE;
1345 	rp->r_deleg_needs_recall = FALSE;
1346 	rp->r_deleg_return_pending = FALSE;
1347 
1348 	/*
1349 	 * Remove the rnode from the server's list and
1350 	 * update the ref counts.
1351 	 */
1352 	list_remove(&np->s_deleg_list, rp);
1353 	mutex_exit(&rp->r_statev4_lock);
1354 	nfs4_dec_state_ref_count_nolock(np, mi);
1355 	mutex_exit(&np->s_lock);
1356 	/* removed list node removes a reference */
1357 	nfs4_server_rele(np);
1358 	if (need_rele)
1359 		nfs4_server_rele(np);
1360 	if (ncg != NULL)
1361 		ncg->nfs4_callback_stats.delegations.value.ui64--;
1362 }
1363 
1364 void
1365 nfs4delegreturn_cleanup(rnode4_t *rp, nfs4_server_t *np)
1366 {
1367 	struct nfs4_callback_globals *ncg;
1368 
1369 	if (np != NULL) {
1370 		ncg = np->zone_globals;
1371 	} else if (nfs_zone() == VTOMI4(RTOV4(rp))->mi_zone) {
1372 		ncg = zone_getspecific(nfs4_callback_zone_key, nfs_zone());
1373 		ASSERT(ncg != NULL);
1374 	} else {
1375 		/*
1376 		 * Request coming from the wrong zone.
1377 		 */
1378 		ASSERT(getzoneid() == GLOBAL_ZONEID);
1379 		ncg = NULL;
1380 	}
1381 
1382 	nfs4delegreturn_cleanup_impl(rp, np, ncg);
1383 }
1384 
1385 static void
1386 nfs4delegreturn_save_lost_rqst(int error, nfs4_lost_rqst_t *lost_rqstp,
1387 	cred_t *cr, vnode_t *vp)
1388 {
1389 	if (error != ETIMEDOUT && error != EINTR &&
1390 	    !NFS4_FRC_UNMT_ERR(error, vp->v_vfsp)) {
1391 		lost_rqstp->lr_op = 0;
1392 		return;
1393 	}
1394 
1395 	NFS4_DEBUG(nfs4_lost_rqst_debug, (CE_NOTE,
1396 	    "nfs4delegreturn_save_lost_rqst: error %d", error));
1397 
1398 	lost_rqstp->lr_op = OP_DELEGRETURN;
1399 	/*
1400 	 * The vp is held and rele'd via the recovery code.
1401 	 * See nfs4_save_lost_rqst.
1402 	 */
1403 	lost_rqstp->lr_vp = vp;
1404 	lost_rqstp->lr_dvp = NULL;
1405 	lost_rqstp->lr_oop = NULL;
1406 	lost_rqstp->lr_osp = NULL;
1407 	lost_rqstp->lr_lop = NULL;
1408 	lost_rqstp->lr_cr = cr;
1409 	lost_rqstp->lr_flk = NULL;
1410 	lost_rqstp->lr_putfirst = FALSE;
1411 }
1412 
1413 static void
1414 nfs4delegreturn_otw(rnode4_t *rp, cred_t *cr, nfs4_error_t *ep)
1415 {
1416 	COMPOUND4args_clnt args;
1417 	COMPOUND4res_clnt res;
1418 	nfs_argop4 argops[3];
1419 	nfs4_ga_res_t *garp = NULL;
1420 	hrtime_t t;
1421 	int numops;
1422 	int doqueue = 1;
1423 
1424 	args.ctag = TAG_DELEGRETURN;
1425 
1426 	numops = 3;		/* PUTFH, GETATTR, DELEGRETURN */
1427 
1428 	args.array = argops;
1429 	args.array_len = numops;
1430 
1431 	argops[0].argop = OP_CPUTFH;
1432 	argops[0].nfs_argop4_u.opcputfh.sfh = rp->r_fh;
1433 
1434 	argops[1].argop = OP_GETATTR;
1435 	argops[1].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK;
1436 	argops[1].nfs_argop4_u.opgetattr.mi = VTOMI4(RTOV4(rp));
1437 
1438 	argops[2].argop = OP_DELEGRETURN;
1439 	argops[2].nfs_argop4_u.opdelegreturn.deleg_stateid =
1440 	    rp->r_deleg_stateid;
1441 
1442 	t = gethrtime();
1443 	rfs4call(VTOMI4(RTOV4(rp)), &args, &res, cr, &doqueue, 0, ep);
1444 
1445 	if (ep->error)
1446 		return;
1447 
1448 	if (res.status == NFS4_OK) {
1449 		garp = &res.array[1].nfs_resop4_u.opgetattr.ga_res;
1450 		nfs4_attr_cache(RTOV4(rp), garp, t, cr, TRUE, NULL);
1451 
1452 	}
1453 	(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
1454 }
1455 
1456 int
1457 nfs4_do_delegreturn(rnode4_t *rp, int flags, cred_t *cr,
1458 	struct nfs4_callback_globals *ncg)
1459 {
1460 	vnode_t *vp = RTOV4(rp);
1461 	mntinfo4_t *mi = VTOMI4(vp);
1462 	nfs4_lost_rqst_t lost_rqst;
1463 	nfs4_recov_state_t recov_state;
1464 	bool_t needrecov = FALSE, recovonly, done = FALSE;
1465 	nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };
1466 
1467 	ncg->nfs4_callback_stats.delegreturn.value.ui64++;
1468 
1469 	while (!done) {
1470 		e.error = nfs4_start_fop(mi, vp, NULL, OH_DELEGRETURN,
1471 		    &recov_state, &recovonly);
1472 
1473 		if (e.error) {
1474 			if (flags & NFS4_DR_FORCE) {
1475 				(void) nfs_rw_enter_sig(&mi->mi_recovlock,
1476 				    RW_READER, 0);
1477 				nfs4delegreturn_cleanup_impl(rp, NULL, ncg);
1478 				nfs_rw_exit(&mi->mi_recovlock);
1479 			}
1480 			break;
1481 		}
1482 
1483 		/*
1484 		 * Check to see if the delegation has already been
1485 		 * returned by the recovery thread.   The state of
1486 		 * the delegation cannot change at this point due
1487 		 * to start_fop and the r_deleg_recall_lock.
1488 		 */
1489 		if (rp->r_deleg_type == OPEN_DELEGATE_NONE) {
1490 			e.error = 0;
1491 			nfs4_end_op(mi, vp, NULL, &recov_state, needrecov);
1492 			break;
1493 		}
1494 
1495 		if (recovonly) {
1496 			/*
1497 			 * Delegation will be returned via the
1498 			 * recovery framework.  Build a lost request
1499 			 * structure, start recovery and get out.
1500 			 */
1501 			nfs4_error_init(&e, EINTR);
1502 			nfs4delegreturn_save_lost_rqst(e.error, &lost_rqst,
1503 			    cr, vp);
1504 			(void) nfs4_start_recovery(&e, mi, vp,
1505 			    NULL, &rp->r_deleg_stateid,
1506 			    lost_rqst.lr_op == OP_DELEGRETURN ?
1507 			    &lost_rqst : NULL, OP_DELEGRETURN, NULL,
1508 			    NULL, NULL);
1509 			nfs4_end_op(mi, vp, NULL, &recov_state, needrecov);
1510 			break;
1511 		}
1512 
1513 		nfs4delegreturn_otw(rp, cr, &e);
1514 
1515 		/*
1516 		 * Ignore some errors on delegreturn; no point in marking
1517 		 * the file dead on a state destroying operation.
1518 		 */
1519 		if (e.error == 0 && (nfs4_recov_marks_dead(e.stat) ||
1520 		    e.stat == NFS4ERR_BADHANDLE ||
1521 		    e.stat == NFS4ERR_STALE ||
1522 		    (e.stat == NFS4ERR_STALE_STATEID &&
1523 		     !(rp->r_flags & R4HASHED))))
1524 			needrecov = FALSE;
1525 		else
1526 			needrecov = nfs4_needs_recovery(&e, TRUE, vp->v_vfsp);
1527 
1528 		if (needrecov) {
1529 			nfs4delegreturn_save_lost_rqst(e.error, &lost_rqst,
1530 			    cr, vp);
1531 			(void) nfs4_start_recovery(&e, mi, vp,
1532 			    NULL, &rp->r_deleg_stateid,
1533 			    lost_rqst.lr_op == OP_DELEGRETURN ?
1534 			    &lost_rqst : NULL, OP_DELEGRETURN, NULL,
1535 			    NULL, NULL);
1536 		} else {
1537 			nfs4delegreturn_cleanup_impl(rp, NULL, ncg);
1538 			done = TRUE;
1539 		}
1540 
1541 		nfs4_end_op(mi, vp, NULL, &recov_state, needrecov);
1542 	}
1543 	return (e.error);
1544 }
1545 
1546 /*
1547  * nfs4_resend_delegreturn - used to drive the delegreturn
1548  * operation via the recovery thread.
1549  */
1550 void
1551 nfs4_resend_delegreturn(nfs4_lost_rqst_t *lorp, nfs4_error_t *ep,
1552 	nfs4_server_t *np)
1553 {
1554 	rnode4_t *rp = VTOR4(lorp->lr_vp);
1555 
1556 	/* If the file failed recovery, just quit. */
1557 	mutex_enter(&rp->r_statelock);
1558 	if (rp->r_flags & R4RECOVERR) {
1559 		ep->error = EIO;
1560 	}
1561 	mutex_exit(&rp->r_statelock);
1562 
1563 	if (!ep->error)
1564 		nfs4delegreturn_otw(rp, lorp->lr_cr, ep);
1565 
1566 	/*
1567 	 * If recovery is now needed, then return the error
1568 	 * and status and let the recovery thread handle it,
1569 	 * including re-driving another delegreturn.  Otherwise,
1570 	 * just give up and clean up the delegation.
1571 	 */
1572 	if (nfs4_needs_recovery(ep, TRUE, lorp->lr_vp->v_vfsp))
1573 		return;
1574 
1575 	if (rp->r_deleg_type != OPEN_DELEGATE_NONE)
1576 		nfs4delegreturn_cleanup(rp, np);
1577 
1578 	nfs4_error_zinit(ep);
1579 }
1580 
1581 /*
1582  * nfs4delegreturn - general function to return a delegation.
1583  *
1584  * NFS4_DR_FORCE - return the delegation even if start_op fails
1585  * NFS4_DR_PUSH - push modified data back to the server via VOP_PUTPAGE
1586  * NFS4_DR_DISCARD - discard the delegation w/o delegreturn
1587  * NFS4_DR_DID_OP - calling function already did nfs4_start_op
1588  * NFS4_DR_RECALL - delegreturn initiated via CB_RECALL
1589  * NFS4_DR_REOPEN - do file reopens, if applicable
1590  */
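/*
 * For example, the CB_RECALL handler above calls
 * nfs4delegreturn_async(rp, NFS4_DR_RECALL|NFS4_DR_REOPEN, trunc),
 * which arrives here via nfs4delegreturn_thread() with both the
 * reopen and the recall behavior enabled.
 */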
1591 static int
1592 nfs4delegreturn_impl(rnode4_t *rp, int flags, struct nfs4_callback_globals *ncg)
1593 {
1594 	int error = 0;
1595 	cred_t *cr = NULL;
1596 	vnode_t *vp;
1597 	bool_t needrecov = FALSE;
1598 	bool_t rw_entered = FALSE;
1599 	bool_t do_reopen;
1600 
1601 	vp = RTOV4(rp);
1602 
1603 	/*
1604 	 * If NFS4_DR_DISCARD is set by itself, take a short-cut and
1605 	 * discard without doing an otw DELEGRETURN.  This may only be used
1606 	 * by the recovery thread because it bypasses the synchronization
1607 	 * with r_deleg_recall_lock and mi->mi_recovlock.
1608 	 */
1609 	if (flags == NFS4_DR_DISCARD) {
1610 		nfs4delegreturn_cleanup_impl(rp, NULL, ncg);
1611 		return (0);
1612 	}
1613 
1614 	if (flags & NFS4_DR_DID_OP) {
1615 		/*
1616 		 * Caller had already done start_op, which means the
1617 		 * r_deleg_recall_lock is already held in READ mode
1618 		 * so we cannot take it in write mode.  Return the
1619 		 * delegation asynchronously.
1620 		 *
1621 		 * Remove the NFS4_DR_DID_OP flag so we don't
1622 		 * get stuck looping through here.
1623 		 */
1624 		VN_HOLD(vp);
1625 		nfs4delegreturn_async(rp, (flags & ~NFS4_DR_DID_OP), FALSE);
1626 		return (0);
1627 	}
1628 
1629 	/*
1630 	 * Verify we still have a delegation and crhold the credential.
1631 	 */
1632 	mutex_enter(&rp->r_statev4_lock);
1633 	if (rp->r_deleg_type == OPEN_DELEGATE_NONE) {
1634 		mutex_exit(&rp->r_statev4_lock);
1635 		goto out;
1636 	}
1637 	cr = rp->r_deleg_cred;
1638 	ASSERT(cr != NULL);
1639 	crhold(cr);
1640 	mutex_exit(&rp->r_statev4_lock);
1641 
1642 	/*
1643 	 * Push the modified data back to the server synchronously
1644 	 * before doing DELEGRETURN.
1645 	 */
1646 	if (flags & NFS4_DR_PUSH)
1647 		(void) VOP_PUTPAGE(vp, 0, 0, 0, cr, NULL);
1648 
1649 	/*
1650 	 * Take r_deleg_recall_lock in WRITE mode, this will prevent
1651 	 * nfs4_is_otw_open_necessary from trying to use the delegation
1652 	 * while the DELEGRETURN is in progress.
1653 	 */
1654 	(void) nfs_rw_enter_sig(&rp->r_deleg_recall_lock, RW_WRITER, FALSE);
1655 
1656 	rw_entered = TRUE;
1657 
1658 	if (rp->r_deleg_type == OPEN_DELEGATE_NONE)
1659 		goto out;
1660 
1661 	if (flags & NFS4_DR_REOPEN) {
1662 		/*
1663 		 * If R4RECOVERRP is already set, then skip re-opening
1664 		 * the delegation open streams and go straight to doing
1665 		 * delegreturn.  (XXX if the file has failed recovery, then the
1666 		 * delegreturn attempt is likely to be futile.)
1667 		 */
1668 		mutex_enter(&rp->r_statelock);
1669 		do_reopen = !(rp->r_flags & R4RECOVERRP);
1670 		mutex_exit(&rp->r_statelock);
1671 
1672 		if (do_reopen) {
1673 			error = deleg_reopen(vp, &needrecov, ncg, flags);
1674 			if (error != 0) {
1675 				if ((flags & (NFS4_DR_FORCE | NFS4_DR_RECALL))
1676 				    == 0)
1677 					goto out;
1678 			} else if (needrecov) {
1679 				if ((flags & NFS4_DR_FORCE) == 0)
1680 					goto out;
1681 			}
1682 		}
1683 	}
1684 
1685 	if (flags & NFS4_DR_DISCARD) {
1686 		mntinfo4_t *mi = VTOMI4(RTOV4(rp));
1687 
1688 		mutex_enter(&rp->r_statelock);
1689 		/*
1690 		 * deleg_return_pending is cleared inside of delegation_accept
1691 		 * when a delegation is accepted.  If this flag has been
1692 		 * cleared, then a new delegation has overwritten the one we
1693 		 * were about to throw away.
1694 		 */
1695 		if (!rp->r_deleg_return_pending) {
1696 			mutex_exit(&rp->r_statelock);
1697 			goto out;
1698 		}
1699 		mutex_exit(&rp->r_statelock);
1700 		(void) nfs_rw_enter_sig(&mi->mi_recovlock, RW_READER, FALSE);
1701 		nfs4delegreturn_cleanup_impl(rp, NULL, ncg);
1702 		nfs_rw_exit(&mi->mi_recovlock);
1703 	} else {
1704 		error = nfs4_do_delegreturn(rp, flags, cr, ncg);
1705 	}
1706 
1707 out:
1708 	if (cr)
1709 		crfree(cr);
1710 	if (rw_entered)
1711 		nfs_rw_exit(&rp->r_deleg_recall_lock);
1712 	return (error);
1713 }
1714 
1715 int
1716 nfs4delegreturn(rnode4_t *rp, int flags)
1717 {
1718 	struct nfs4_callback_globals *ncg;
1719 
1720 	ncg = zone_getspecific(nfs4_callback_zone_key, nfs_zone());
1721 	ASSERT(ncg != NULL);
1722 
1723 	return (nfs4delegreturn_impl(rp, flags, ncg));
1724 }
1725 
1726 void
1727 nfs4delegreturn_async(rnode4_t *rp, int flags, bool_t trunc)
1728 {
1729 	struct cb_recall_pass *pp;
1730 
1731 	pp = kmem_alloc(sizeof (struct cb_recall_pass), KM_SLEEP);
1732 	pp->rp = rp;
1733 	pp->flags = flags;
1734 	pp->truncate = trunc;
1735 
1736 	/*
1737 	 * Fire up a thread to do the actual delegreturn
1738 	 * Caller must guarantee that the rnode doesn't
1739 	 * vanish (by calling VN_HOLD).
1740 	 */
1741 
1742 	(void) zthread_create(NULL, 0, nfs4delegreturn_thread, pp, 0,
1743 	    minclsyspri);
1744 }
1745 
1746 static void
1747 delegreturn_all_thread(rpcprog_t *pp)
1748 {
1749 	nfs4_server_t *np;
1750 	bool_t found = FALSE;
1751 	rpcprog_t prog;
1752 	rnode4_t *rp;
1753 	vnode_t *vp;
1754 	zoneid_t zoneid = getzoneid();
1755 	struct nfs4_callback_globals *ncg;
1756 
1757 	NFS4_DEBUG(nfs4_drat_debug,
1758 	    (CE_NOTE, "delegreturn_all_thread: prog %d\n", *pp));
1759 
1760 	prog = *pp;
1761 	kmem_free(pp, sizeof (*pp));
1762 	pp = NULL;
1763 
1764 	mutex_enter(&nfs4_server_lst_lock);
1765 	for (np = nfs4_server_lst.forw; np != &nfs4_server_lst; np = np->forw) {
1766 		if (np->zoneid == zoneid && np->s_program == prog) {
1767 			mutex_enter(&np->s_lock);
1768 			found = TRUE;
1769 			break;
1770 		}
1771 	}
1772 	mutex_exit(&nfs4_server_lst_lock);
1773 
1774 	/*
1775 	 * It's possible that the nfs4_server which was using this
1776 	 * program number has vanished since this thread is async.
1777 	 * If so, just return.  Your work here is finished, my friend.
1778 	 */
1779 	if (!found)
1780 		goto out;
1781 
1782 	ncg = np->zone_globals;
1783 	while ((rp = list_head(&np->s_deleg_list)) != NULL) {
1784 		vp = RTOV4(rp);
1785 		VN_HOLD(vp);
1786 		mutex_exit(&np->s_lock);
1787 		(void) nfs4delegreturn_impl(rp, NFS4_DR_PUSH|NFS4_DR_REOPEN,
1788 		    ncg);
1789 		VN_RELE(vp);
1790 
1791 		/* retake the s_lock for next trip through the loop */
1792 		mutex_enter(&np->s_lock);
1793 	}
1794 	mutex_exit(&np->s_lock);
1795 out:
1796 	NFS4_DEBUG(nfs4_drat_debug,
1797 	    (CE_NOTE, "delegreturn_all_thread: complete\n"));
1798 	zthread_exit();
1799 }
1800 
1801 void
1802 nfs4_delegreturn_all(nfs4_server_t *sp)
1803 {
1804 	rpcprog_t pro, *pp;
1805 
1806 	mutex_enter(&sp->s_lock);
1807 
1808 	/* Check to see if the delegation list is empty */
1809 
1810 	if (list_head(&sp->s_deleg_list) == NULL) {
1811 		mutex_exit(&sp->s_lock);
1812 		return;
1813 	}
1814 	/*
1815 	 * Grab the program number; the async thread will use this
1816 	 * to find the nfs4_server.
1817 	 */
1818 	pro = sp->s_program;
1819 	mutex_exit(&sp->s_lock);
1820 	pp = kmem_alloc(sizeof (rpcprog_t), KM_SLEEP);
1821 	*pp = pro;
1822 	(void) zthread_create(NULL, 0, delegreturn_all_thread, pp, 0,
1823 	    minclsyspri);
1824 }
1825 
1826 
1827 /*
1828  * Discard any delegations
1829  *
1830  * Iterate over the server's s_deleg_list and
1831  * for matching mount-point rnodes discard
1832  * the delegation.
1833  */
1834 void
1835 nfs4_deleg_discard(mntinfo4_t *mi, nfs4_server_t *sp)
1836 {
1837 	rnode4_t *rp, *next;
1838 	mntinfo4_t *r_mi;
1839 	struct nfs4_callback_globals *ncg;
1840 
1841 	ASSERT(mutex_owned(&sp->s_lock));
1842 	ncg = sp->zone_globals;
1843 
1844 	for (rp = list_head(&sp->s_deleg_list); rp != NULL; rp = next) {
1845 		r_mi = VTOMI4(RTOV4(rp));
1846 		next = list_next(&sp->s_deleg_list, rp);
1847 
1848 		if (r_mi != mi) {
1849 			/*
1850 			 * Skip if this rnode is not on the
1851 			 * same mount-point
1852 			 */
1853 			continue;
1854 		}
1855 
1856 		ASSERT(rp->r_deleg_type == OPEN_DELEGATE_READ);
1857 
1858 #ifdef DEBUG
1859 		if (nfs4_client_recov_debug) {
1860 			zprintf(getzoneid(),
1861 			    "nfs4_deleg_discard: matched rnode %p "
1862 			    "-- discarding delegation\n", (void *)rp);
1863 		}
1864 #endif
1865 		mutex_enter(&rp->r_statev4_lock);
1866 		/*
1867 		 * Free the cred originally held when the delegation
1868 		 * was granted. Also need to decrement the refcnt
1869 		 * on this server for each delegation we discard
1870 		 */
1871 		if (rp->r_deleg_cred)
1872 			crfree(rp->r_deleg_cred);
1873 		rp->r_deleg_cred = NULL;
1874 		rp->r_deleg_type = OPEN_DELEGATE_NONE;
1875 		rp->r_deleg_needs_recovery = OPEN_DELEGATE_NONE;
1876 		rp->r_deleg_needs_recall = FALSE;
1877 		ASSERT(sp->s_refcnt > 1);
1878 		sp->s_refcnt--;
1879 		list_remove(&sp->s_deleg_list, rp);
1880 		mutex_exit(&rp->r_statev4_lock);
1881 		nfs4_dec_state_ref_count_nolock(sp, mi);
1882 		ncg->nfs4_callback_stats.delegations.value.ui64--;
1883 	}
1884 }
1885 
1886 /*
1887  * Reopen any open streams that were covered by the given file's
1888  * delegation.
1889  * Returns zero or an errno value.  If there was no error, *recovp
1890  * indicates whether recovery was initiated.
1891  */
1892 
1893 static int
1894 deleg_reopen(vnode_t *vp, bool_t *recovp, struct nfs4_callback_globals *ncg,
1895 	int flags)
1896 {
1897 	nfs4_open_stream_t *osp;
1898 	nfs4_recov_state_t recov_state;
1899 	bool_t needrecov = FALSE;
1900 	mntinfo4_t *mi;
1901 	rnode4_t *rp;
1902 	nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };
1903 	int claimnull;
1904 
1905 	mi = VTOMI4(vp);
1906 	rp = VTOR4(vp);
1907 
1908 	recov_state.rs_flags = 0;
1909 	recov_state.rs_num_retry_despite_err = 0;
1910 
1911 retry:
1912 	if ((e.error = nfs4_start_op(mi, vp, NULL, &recov_state)) != 0) {
1913 		return (e.error);
1914 	}
1915 
1916 	/*
1917 	 * If we mean to discard the delegation, it must be BAD, so don't
1918 	 * use it when doing the reopen or it will fail too.
1919 	 */
1920 	claimnull = (flags & NFS4_DR_DISCARD);
1921 	/*
1922 	 * Loop through the open streams for this rnode to find
1923 	 * all of the ones created using the delegation state ID.
1924 	 * Each of these needs to be re-opened.
1925 	 */
1926 
1927 	while ((osp = get_next_deleg_stream(rp, claimnull)) != NULL) {
1928 
1929 		if (claimnull) {
1930 			nfs4_reopen(vp, osp, &e, CLAIM_NULL, FALSE, FALSE);
1931 		} else {
1932 			ncg->nfs4_callback_stats.claim_cur.value.ui64++;
1933 
1934 			nfs4_reopen(vp, osp, &e, CLAIM_DELEGATE_CUR, FALSE,
1935 			    FALSE);
1936 			if (e.error == 0 && e.stat == NFS4_OK)
1937 				ncg->nfs4_callback_stats.
1938 				    claim_cur_ok.value.ui64++;
1939 		}
1940 
1941 		if (e.error == EAGAIN) {
1942 			open_stream_rele(osp, rp);
1943 			nfs4_end_op(mi, vp, NULL, &recov_state, TRUE);
1944 			goto retry;
1945 		}
1946 
1947 		/*
1948 		 * if error is EINTR, ETIMEDOUT, or NFS4_FRC_UNMT_ERR, then
1949 		 * recovery has already been started inside of nfs4_reopen.
1950 		 */
1951 		if (e.error == EINTR || e.error == ETIMEDOUT ||
1952 		    NFS4_FRC_UNMT_ERR(e.error, vp->v_vfsp)) {
1953 			open_stream_rele(osp, rp);
1954 			break;
1955 		}
1956 
1957 		needrecov = nfs4_needs_recovery(&e, TRUE, vp->v_vfsp);
1958 
1959 		if (e.error != 0 && !needrecov) {
1960 			/*
1961 			 * Recovery is not possible, but don't give up yet;
1962 			 * we'd still like to do delegreturn after
1963 			 * reopening as many streams as possible.
1964 			 * Continue processing the open streams.
1965 			 */
1966 
1967 			ncg->nfs4_callback_stats.recall_failed.value.ui64++;
1968 
1969 		} else if (needrecov) {
1970 			/*
1971 			 * Start recovery and bail out.  The recovery
1972 			 * thread will take it from here.
1973 			 */
1974 			(void) nfs4_start_recovery(&e, mi, vp, NULL, NULL,
1975 			    NULL, OP_OPEN, NULL, NULL, NULL);
1976 			open_stream_rele(osp, rp);
1977 			*recovp = TRUE;
1978 			break;
1979 		}
1980 
1981 		open_stream_rele(osp, rp);
1982 	}
1983 
1984 	nfs4_end_op(mi, vp, NULL, &recov_state, needrecov);
1985 
1986 	return (e.error);
1987 }
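/*
 * Editor's illustrative sketch: the start_op/end_op retry bracket used
 * by deleg_reopen() above.  EAGAIN from nfs4_reopen() means "give up the
 * operation and try again"; the bracket must be closed with nfs4_end_op()
 * before jumping back to nfs4_start_op(), or the recovery framework's
 * bookkeeping would be left unbalanced.
 */
#if 0
retry:
	if ((e.error = nfs4_start_op(mi, vp, NULL, &recov_state)) != 0)
		return (e.error);
	/* ... over-the-wire work; on EAGAIN, close and reopen the bracket */
	if (e.error == EAGAIN) {
		nfs4_end_op(mi, vp, NULL, &recov_state, TRUE);
		goto retry;
	}
	nfs4_end_op(mi, vp, NULL, &recov_state, needrecov);
	return (e.error);
#endif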
1988 
1989 /*
1990  * get_next_deleg_stream - returns the next open stream which
1991  * represents a delegation for this rnode.  In order to assure
1992  * forward progress, the caller must guarantee that each open
1993  * stream returned is changed so that a future call won't return
1994  * it again.
1995  *
1996  * There are several ways for the open stream to change.  If the open
1997  * stream is !os_delegation, then we aren't interested in it.  Also, if
1998  * either os_failed_reopen or !os_valid, then don't return the osp.
1999  *
2000  * If claimnull is false (doing reopen CLAIM_DELEGATE_CUR) then return
2001  * the osp if it is an os_delegation open stream.  Also, if the rnode still
2002  * has r_deleg_return_pending, then return the os_delegation osp.  Lastly,
2003  * if the rnode's r_deleg_stateid is different from the osp's open_stateid,
2004  * then return the osp.
2005  *
2006  * We have already taken the 'r_deleg_recall_lock' as WRITER, which
2007  * prevents new OPENs from going OTW (as start_fop takes this
2008  * lock in READ mode); thus, no new open streams can be created
2009  * (which inherently means no new delegation open streams are
2010  * being created).
2011  */
2012 
2013 static nfs4_open_stream_t *
2014 get_next_deleg_stream(rnode4_t *rp, int claimnull)
2015 {
2016 	nfs4_open_stream_t	*osp;
2017 
2018 	ASSERT(nfs_rw_lock_held(&rp->r_deleg_recall_lock, RW_WRITER));
2019 
2020 	/*
2021 	 * Search through the list of open streams looking for
2022 	 * one that was created while holding the delegation.
2023 	 */
2024 	mutex_enter(&rp->r_os_lock);
2025 	for (osp = list_head(&rp->r_open_streams); osp != NULL;
2026 	    osp = list_next(&rp->r_open_streams, osp)) {
2027 		mutex_enter(&osp->os_sync_lock);
2028 		if (!osp->os_delegation || osp->os_failed_reopen ||
2029 		    !osp->os_valid) {
2030 			mutex_exit(&osp->os_sync_lock);
2031 			continue;
2032 		}
2033 		if (!claimnull || rp->r_deleg_return_pending ||
2034 		    !stateid4_cmp(&osp->open_stateid, &rp->r_deleg_stateid)) {
2035 			osp->os_ref_count++;
2036 			mutex_exit(&osp->os_sync_lock);
2037 			mutex_exit(&rp->r_os_lock);
2038 			return (osp);
2039 		}
2040 		mutex_exit(&osp->os_sync_lock);
2041 	}
2042 	mutex_exit(&rp->r_os_lock);
2043 
2044 	return (NULL);
2045 }
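/*
 * Editor's illustrative sketch: the forward-progress contract of
 * get_next_deleg_stream().  Each returned stream must be changed before
 * the next call (here by reopening it, which replaces its open_stateid,
 * or by the reopen failure marking os_failed_reopen); otherwise the same
 * osp would be returned on every iteration and the loop would never end.
 */
#if 0
	while ((osp = get_next_deleg_stream(rp, claimnull)) != NULL) {
		nfs4_reopen(vp, osp, &e, CLAIM_NULL, FALSE, FALSE);
		open_stream_rele(osp, rp);	/* drop the hold taken above */
	}
#endif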
2046 
2047 static void
2048 nfs4delegreturn_thread(struct cb_recall_pass *args)
2049 {
2050 	rnode4_t *rp;
2051 	vnode_t *vp;
2052 	cred_t *cr;
2053 	int dtype, error, flags;
2054 	bool_t rdirty, rip;
2055 	kmutex_t cpr_lock;
2056 	callb_cpr_t cpr_info;
2057 	struct nfs4_callback_globals *ncg;
2058 
2059 	ncg = zone_getspecific(nfs4_callback_zone_key, nfs_zone());
2060 	ASSERT(ncg != NULL);
2061 
2062 	mutex_init(&cpr_lock, NULL, MUTEX_DEFAULT, NULL);
2063 
2064 	CALLB_CPR_INIT(&cpr_info, &cpr_lock, callb_generic_cpr,
2065 	    "nfsv4delegRtn");
2066 
2067 	rp = args->rp;
2068 	vp = RTOV4(rp);
2069 
2070 	mutex_enter(&rp->r_statev4_lock);
2071 	if (rp->r_deleg_type == OPEN_DELEGATE_NONE) {
2072 		mutex_exit(&rp->r_statev4_lock);
2073 		goto out;
2074 	}
2075 	mutex_exit(&rp->r_statev4_lock);
2076 
2077 	/*
2078 	 * Take the read-write lock in read mode to prevent other
2079 	 * threads from modifying the data during the recall.  This
2080 	 * doesn't affect mmappers.
2081 	 */
2082 	(void) nfs_rw_enter_sig(&rp->r_rwlock, RW_READER, FALSE);
2083 
2084 	/* Proceed with delegreturn */
2085 
2086 	mutex_enter(&rp->r_statev4_lock);
2087 	if (rp->r_deleg_type == OPEN_DELEGATE_NONE) {
2088 		mutex_exit(&rp->r_statev4_lock);
2089 		nfs_rw_exit(&rp->r_rwlock);
2090 		goto out;
2091 	}
2092 	dtype = rp->r_deleg_type;
2093 	cr = rp->r_deleg_cred;
2094 	ASSERT(cr != NULL);
2095 	crhold(cr);
2096 	mutex_exit(&rp->r_statev4_lock);
2097 
2098 	flags = args->flags;
2099 
2100 	/*
2101 	 * If the file is being truncated at the server, then throw
2102 	 * away all of the pages, it doesn't matter what flavor of
2103 	 * delegation we have.
2104 	 */
2105 
2106 	if (args->truncate) {
2107 		ncg->nfs4_callback_stats.recall_trunc.value.ui64++;
2108 		nfs4_invalidate_pages(vp, 0, cr);
2109 	} else if (dtype == OPEN_DELEGATE_WRITE) {
2110 
2111 		mutex_enter(&rp->r_statelock);
2112 		rdirty = rp->r_flags & R4DIRTY;
2113 		mutex_exit(&rp->r_statelock);
2114 
2115 		if (rdirty) {
2116 			error = VOP_PUTPAGE(vp, 0, 0, 0, cr, NULL);
2117 
2118 			if (error)
2119 				CB_WARN1("nfs4delegreturn_thread:"
2120 				" VOP_PUTPAGE: %d\n", error);
2121 		}
2122 		/* turn off NFS4_DR_PUSH because we just did that above. */
2123 		flags &= ~NFS4_DR_PUSH;
2124 	}
2125 
2126 	mutex_enter(&rp->r_statelock);
2127 	rip = rp->r_flags & R4RECOVERRP;
2128 	mutex_exit(&rp->r_statelock);
2129 
2130 	/* If a failed recovery is indicated, discard the pages */
2131 
2132 	if (rip) {
2133 
2134 		error = VOP_PUTPAGE(vp, 0, 0, B_INVAL, cr, NULL);
2135 
2136 		if (error)
2137 			CB_WARN1("nfs4delegreturn_thread: VOP_PUTPAGE: %d\n",
2138 			    error);
2139 	}
2140 
2141 	/*
2142 	 * Pass the flags to nfs4delegreturn_impl, but be sure not to pass
2143 	 * NFS4_DR_DID_OP, which just calls nfs4delegreturn_async again.
2144 	 */
2145 	flags &= ~NFS4_DR_DID_OP;
2146 
2147 	(void) nfs4delegreturn_impl(rp, flags, ncg);
2148 
2149 	nfs_rw_exit(&rp->r_rwlock);
2150 	crfree(cr);
2151 out:
2152 	kmem_free(args, sizeof (struct cb_recall_pass));
2153 	VN_RELE(vp);
2154 	mutex_enter(&cpr_lock);
2155 	CALLB_CPR_EXIT(&cpr_info);
2156 	mutex_destroy(&cpr_lock);
2157 	zthread_exit();
2158 }
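/*
 * Editor's illustrative sketch: the CPR (checkpoint/resume) bracket used
 * by nfs4delegreturn_thread() above.  A long-running kernel thread
 * registers a callback so a system suspend can park it; CALLB_CPR_EXIT()
 * both deregisters the callback and drops cpr_lock, which is why the
 * mutex is destroyed afterwards without an explicit mutex_exit().
 */
#if 0
	kmutex_t cpr_lock;
	callb_cpr_t cpr_info;

	mutex_init(&cpr_lock, NULL, MUTEX_DEFAULT, NULL);
	CALLB_CPR_INIT(&cpr_info, &cpr_lock, callb_generic_cpr, "example");

	/* ... thread body ... */

	mutex_enter(&cpr_lock);
	CALLB_CPR_EXIT(&cpr_info);	/* also exits cpr_lock */
	mutex_destroy(&cpr_lock);
#endif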
2159 
2160 /*
2161  * This function assumes that its caller is either doing recovery
2162  * (and therefore cannot call nfs4_start_op) or has already called
2163  * nfs4_start_op().
2164  */
2165 void
2166 nfs4_delegation_accept(rnode4_t *rp, open_claim_type4 claim, OPEN4res *res,
2167 	nfs4_ga_res_t *garp, cred_t *cr)
2168 {
2169 	open_read_delegation4 *orp;
2170 	open_write_delegation4 *owp;
2171 	nfs4_server_t *np;
2172 	bool_t already = FALSE;
2173 	bool_t recall = FALSE;
2174 	bool_t valid_garp = TRUE;
2175 	bool_t delegation_granted = FALSE;
2176 	bool_t dr_needed = FALSE;
2177 	bool_t recov;
2178 	int dr_flags = 0;
2179 	long mapcnt;
2180 	uint_t rflag;
2181 	mntinfo4_t *mi;
2182 	struct nfs4_callback_globals *ncg;
2183 	open_delegation_type4 odt;
2184 
2185 	ncg = zone_getspecific(nfs4_callback_zone_key, nfs_zone());
2186 	ASSERT(ncg != NULL);
2187 
2188 	mi = VTOMI4(RTOV4(rp));
2189 
2190 	/*
2191 	 * Accept a delegation granted to the client via an OPEN.
2192 	 * Set the delegation fields in the rnode and insert the
2193 	 * rnode onto the list anchored in the nfs4_server_t.  The
2194 	 * proper locking order requires the nfs4_server_t first,
2195 	 * even though it may not be needed in all cases.
2196 	 *
2197 	 * NB: find_nfs4_server returns with s_lock held.
2198 	 */
2199 
2200 	if ((np = find_nfs4_server(mi)) == NULL)
2201 		return;
2202 
2203 	/* grab the statelock too, for examining r_mapcnt */
2204 	mutex_enter(&rp->r_statelock);
2205 	mutex_enter(&rp->r_statev4_lock);
2206 
2207 	if (rp->r_deleg_type == OPEN_DELEGATE_READ ||
2208 	    rp->r_deleg_type == OPEN_DELEGATE_WRITE)
2209 		already = TRUE;
2210 
2211 	odt = res->delegation.delegation_type;
2212 
2213 	if (odt == OPEN_DELEGATE_READ) {
2214 
2215 		rp->r_deleg_type = res->delegation.delegation_type;
2216 		orp = &res->delegation.open_delegation4_u.read;
2217 		rp->r_deleg_stateid = orp->stateid;
2218 		rp->r_deleg_perms = orp->permissions;
2219 		if (claim == CLAIM_PREVIOUS)
2220 			if ((recall = orp->recall) != 0)
2221 				dr_needed = TRUE;
2222 
2223 		delegation_granted = TRUE;
2224 
2225 		ncg->nfs4_callback_stats.delegations.value.ui64++;
2226 		ncg->nfs4_callback_stats.delegaccept_r.value.ui64++;
2227 
2228 	} else if (odt == OPEN_DELEGATE_WRITE) {
2229 
2230 		rp->r_deleg_type = res->delegation.delegation_type;
2231 		owp = &res->delegation.open_delegation4_u.write;
2232 		rp->r_deleg_stateid = owp->stateid;
2233 		rp->r_deleg_perms = owp->permissions;
2234 		rp->r_deleg_limit = owp->space_limit;
2235 		if (claim == CLAIM_PREVIOUS)
2236 			if ((recall = owp->recall) != 0)
2237 				dr_needed = TRUE;
2238 
2239 		delegation_granted = TRUE;
2240 
2241 		if (garp == NULL || !garp->n4g_change_valid) {
2242 			valid_garp = FALSE;
2243 			rp->r_deleg_change = 0;
2244 			rp->r_deleg_change_grant = 0;
2245 		} else {
2246 			rp->r_deleg_change = garp->n4g_change;
2247 			rp->r_deleg_change_grant = garp->n4g_change;
2248 		}
2249 		mapcnt = rp->r_mapcnt;
2250 		rflag = rp->r_flags;
2251 
2252 		/*
2253 		 * Update the delegation change attribute if
2254 		 * there are mappers for the file or it is dirty.  This
2255 		 * might be the case during recovery after server
2256 		 * reboot.
2257 		 */
2258 		if (mapcnt > 0 || rflag & R4DIRTY)
2259 			rp->r_deleg_change++;
2260 
2261 		NFS4_DEBUG(nfs4_callback_debug, (CE_NOTE,
2262 		    "nfs4_delegation_accept: r_deleg_change: 0x%x\n",
2263 		    (int)(rp->r_deleg_change >> 32)));
2264 		NFS4_DEBUG(nfs4_callback_debug, (CE_NOTE,
2265 		    "nfs4_delegation_accept: r_delg_change_grant: 0x%x\n",
2266 		    (int)(rp->r_deleg_change_grant >> 32)));
2267 
2268 
2269 		ncg->nfs4_callback_stats.delegations.value.ui64++;
2270 		ncg->nfs4_callback_stats.delegaccept_rw.value.ui64++;
2271 	} else if (already) {
2272 		/*
2273 		 * No delegation granted.  If the rnode currently
2274 		 * has one, then consider it tainted and return it.
2275 		 */
2276 		dr_needed = TRUE;
2277 	}
2278 
2279 	if (delegation_granted) {
2280 		/* Add the rnode to the list. */
2281 		if (!already) {
2282 			crhold(cr);
2283 			rp->r_deleg_cred = cr;
2284 
2285 			ASSERT(mutex_owned(&np->s_lock));
2286 			list_insert_head(&np->s_deleg_list, rp);
2287 			/* added list node gets a reference */
2288 			np->s_refcnt++;
2289 			nfs4_inc_state_ref_count_nolock(np, mi);
2290 		}
2291 		rp->r_deleg_needs_recovery = OPEN_DELEGATE_NONE;
2292 	}
2293 
2294 	/*
2295 	 * We've now safely accepted the delegation, if any.  Drop the
2296 	 * locks and figure out what post-processing is needed.  We'd
2297 	 * like to retain r_statev4_lock, but nfs4_server_rele takes
2298 	 * s_lock which would be a lock ordering violation.
2299 	 */
2300 	mutex_exit(&rp->r_statev4_lock);
2301 	mutex_exit(&rp->r_statelock);
2302 	mutex_exit(&np->s_lock);
2303 	nfs4_server_rele(np);
2304 
2305 	/*
2306 	 * Check to see if we are in recovery.  Remember that
2307 	 * this function is protected by start_op, so a recovery
2308 	 * cannot begin until we are out of here.
2309 	 */
2310 	mutex_enter(&mi->mi_lock);
2311 	recov = mi->mi_recovflags & MI4_RECOV_ACTIV;
2312 	mutex_exit(&mi->mi_lock);
2313 
2314 	mutex_enter(&rp->r_statev4_lock);
2315 
2316 	if (nfs4_delegreturn_policy == IMMEDIATE || !valid_garp)
2317 		dr_needed = TRUE;
2318 
2319 	if (dr_needed && rp->r_deleg_return_pending == FALSE) {
2320 		if (recov) {
2321 			/*
2322 			 * We cannot call delegreturn from inside
2323 			 * of recovery or VOP_PUTPAGE will hang
2324 			 * due to nfs4_start_fop call in
2325 			 * nfs4write.  Use dlistadd to add the
2326 			 * rnode to the list of rnodes needing
2327 			 * cleaning.  We do not need to do reopen
2328 			 * here because recov_openfiles will do it.
2329 			 * In the non-recall case, just discard the
2330 			 * delegation as it is no longer valid.
2331 			 */
2332 			if (recall)
2333 				dr_flags = NFS4_DR_PUSH;
2334 			else
2335 				dr_flags = NFS4_DR_PUSH|NFS4_DR_DISCARD;
2336 
2337 			nfs4_dlistadd(rp, ncg, dr_flags);
2338 			dr_flags = 0;
2339 		} else {
2340 			/*
2341 			 * Push the modified data back to the server,
2342 			 * reopen any delegation open streams, and return
2343 			 * the delegation.  Drop the statev4_lock first!
2344 			 */
2345 			dr_flags = NFS4_DR_PUSH|NFS4_DR_DID_OP|NFS4_DR_REOPEN;
2346 		}
2347 	}
2348 	mutex_exit(&rp->r_statev4_lock);
2349 	if (dr_flags)
2350 		(void) nfs4delegreturn_impl(rp, dr_flags, ncg);
2351 }
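/*
 * Editor's illustrative sketch: the lock-ordering discipline applied at
 * the end of nfs4_delegation_accept() above.  The acquisition order used
 * there is s_lock, then r_statelock, then r_statev4_lock, and
 * nfs4_server_rele() may itself take s_lock; a thread must therefore
 * drop the rnode locks and s_lock before calling it, and may only then
 * retake r_statev4_lock for the post-processing checks.
 */
#if 0
	mutex_exit(&rp->r_statev4_lock);
	mutex_exit(&rp->r_statelock);
	mutex_exit(&np->s_lock);
	nfs4_server_rele(np);			/* may take s_lock again */
	mutex_enter(&rp->r_statev4_lock);	/* safe: nothing else held */
#endif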
2352 
2353 /*
2354  * nfs4delegabandon - Abandon the delegation on an rnode4.  This code
2355  * is called when the client receives EXPIRED, BAD_STATEID, OLD_STATEID
2356  * or BADSEQID and the recovery code is unable to recover.  Push any
2357  * dirty data back to the server and return the delegation (if any).
2358  */
2359 
2360 void
2361 nfs4delegabandon(rnode4_t *rp)
2362 {
2363 	vnode_t *vp;
2364 	struct cb_recall_pass *pp;
2365 	open_delegation_type4 dt;
2366 
2367 	mutex_enter(&rp->r_statev4_lock);
2368 	dt = rp->r_deleg_type;
2369 	mutex_exit(&rp->r_statev4_lock);
2370 
2371 	if (dt == OPEN_DELEGATE_NONE)
2372 		return;
2373 
2374 	vp = RTOV4(rp);
2375 	VN_HOLD(vp);
2376 
2377 	pp = kmem_alloc(sizeof (struct cb_recall_pass), KM_SLEEP);
2378 	pp->rp = rp;
2379 	/*
2380 	 * Recovery on the file has failed and we want to return
2381 	 * the delegation.  We don't want to reopen files and
2382 	 * nfs4delegreturn_thread() figures out what to do about
2383 	 * the data.  The only thing to do is attempt to return
2384 	 * the delegation.
2385 	 */
2386 	pp->flags = 0;
2387 	pp->truncate = FALSE;
2388 
2389 	/*
2390 	 * Fire up a thread to do the delegreturn; this is
2391 	 * necessary because we could be inside a GETPAGE or
2392 	 * PUTPAGE and we cannot do another one.
2393 	 */
2394 
2395 	(void) zthread_create(NULL, 0, nfs4delegreturn_thread, pp, 0,
2396 	    minclsyspri);
2397 }
2398 
2399 static int
2400 wait_for_recall1(vnode_t *vp, nfs4_op_hint_t op, nfs4_recov_state_t *rsp,
2401 	int flg)
2402 {
2403 	rnode4_t *rp;
2404 	int error = 0;
2405 
2406 #ifdef lint
2407 	op = op;
2408 #endif
2409 
2410 	if (vp && vp->v_type == VREG) {
2411 		rp = VTOR4(vp);
2412 
2413 		/*
2414 		 * Take r_deleg_recall_lock in read mode to synchronize
2415 		 * with delegreturn.
2416 		 */
2417 		error = nfs_rw_enter_sig(&rp->r_deleg_recall_lock,
2418 		    RW_READER, INTR4(vp));
2419 
2420 		if (error == 0)
2421 			rsp->rs_flags |= flg;
2422 
2423 	}
2424 	return (error);
2425 }
2426 
2427 void
2428 nfs4_end_op_recall(vnode_t *vp1, vnode_t *vp2, nfs4_recov_state_t *rsp)
2429 {
2430 	NFS4_DEBUG(nfs4_recall_debug,
2431 	    (CE_NOTE, "nfs4_end_op_recall: 0x%p, 0x%p\n",
2432 	    (void *)vp1, (void *)vp2));
2433 
2434 	if (vp2 && rsp->rs_flags & NFS4_RS_RECALL_HELD2)
2435 		nfs_rw_exit(&VTOR4(vp2)->r_deleg_recall_lock);
2436 	if (vp1 && rsp->rs_flags & NFS4_RS_RECALL_HELD1)
2437 		nfs_rw_exit(&VTOR4(vp1)->r_deleg_recall_lock);
2438 }
2439 
2440 int
2441 wait_for_recall(vnode_t *vp1, vnode_t *vp2, nfs4_op_hint_t op,
2442 	nfs4_recov_state_t *rsp)
2443 {
2444 	int error;
2445 
2446 	NFS4_DEBUG(nfs4_recall_debug,
2447 	    (CE_NOTE, "wait_for_recall:    0x%p, 0x%p\n",
2448 	    (void *)vp1, (void *)vp2));
2449 
2450 	rsp->rs_flags &= ~(NFS4_RS_RECALL_HELD1|NFS4_RS_RECALL_HELD2);
2451 
2452 	if ((error = wait_for_recall1(vp1, op, rsp, NFS4_RS_RECALL_HELD1)) != 0)
2453 		return (error);
2454 
2455 	if ((error = wait_for_recall1(vp2, op, rsp, NFS4_RS_RECALL_HELD2))
2456 	    != 0) {
2457 		if (rsp->rs_flags & NFS4_RS_RECALL_HELD1) {
2458 			nfs_rw_exit(&VTOR4(vp1)->r_deleg_recall_lock);
2459 			rsp->rs_flags &= ~NFS4_RS_RECALL_HELD1;
2460 		}
2461 
2462 		return (error);
2463 	}
2464 
2465 	return (0);
2466 }
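/*
 * Editor's illustrative sketch: the unwind-on-partial-failure pattern in
 * wait_for_recall() above.  When the second of two acquisitions fails,
 * the first must be released and its flag cleared before the error is
 * returned, so the caller never inherits a half-held state.  The names
 * take_lock1/take_lock2/drop_lock1 are hypothetical.
 */
#if 0
	if ((error = take_lock1()) != 0)
		return (error);			/* nothing held yet */
	if ((error = take_lock2()) != 0) {
		drop_lock1();			/* undo the partial state */
		return (error);
	}
	return (0);				/* both held */
#endif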
2467 
2468 /*
2469  * nfs4_dlistadd - Add this rnode to a list of rnodes to be
2470  * DELEGRETURN'd at the end of recovery.
2471  */
2472 
2473 static void
2474 nfs4_dlistadd(rnode4_t *rp, struct nfs4_callback_globals *ncg, int flags)
2475 {
2476 	struct nfs4_dnode *dp;
2477 
2478 	ASSERT(mutex_owned(&rp->r_statev4_lock));
2479 	/*
2480 	 * Mark the delegation as having a return pending.
2481 	 * This will prevent the use of the delegation stateID
2482 	 * by read, write, setattr and open.
2483 	 */
2484 	rp->r_deleg_return_pending = TRUE;
2485 	dp = kmem_alloc(sizeof (*dp), KM_SLEEP);
2486 	VN_HOLD(RTOV4(rp));
2487 	dp->rnodep = rp;
2488 	dp->flags = flags;
2489 	mutex_enter(&ncg->nfs4_dlist_lock);
2490 	list_insert_head(&ncg->nfs4_dlist, dp);
2491 #ifdef	DEBUG
2492 	ncg->nfs4_dlistadd_c++;
2493 #endif
2494 	mutex_exit(&ncg->nfs4_dlist_lock);
2495 }
2496 
2497 /*
2498  * nfs4_dlistclean_impl - Do DELEGRETURN for each rnode on the list
2499  * of files awaiting cleaning.  If override_flags is non-zero,
2500  * use it rather than the flags that were set when the rnode
2501  * was added to the dlist.
2502  */
2503 static void
2504 nfs4_dlistclean_impl(struct nfs4_callback_globals *ncg, int override_flags)
2505 {
2506 	rnode4_t *rp;
2507 	struct nfs4_dnode *dp;
2508 	int flags;
2509 
2510 	ASSERT(override_flags == 0 || override_flags == NFS4_DR_DISCARD);
2511 
2512 	mutex_enter(&ncg->nfs4_dlist_lock);
2513 	while ((dp = list_head(&ncg->nfs4_dlist)) != NULL) {
2514 #ifdef	DEBUG
2515 		ncg->nfs4_dlistclean_c++;
2516 #endif
2517 		list_remove(&ncg->nfs4_dlist, dp);
2518 		mutex_exit(&ncg->nfs4_dlist_lock);
2519 		rp = dp->rnodep;
2520 		flags = (override_flags != 0) ? override_flags : dp->flags;
2521 		kmem_free(dp, sizeof (*dp));
2522 		(void) nfs4delegreturn_impl(rp, flags, ncg);
2523 		VN_RELE(RTOV4(rp));
2524 		mutex_enter(&ncg->nfs4_dlist_lock);
2525 	}
2526 	mutex_exit(&ncg->nfs4_dlist_lock);
2527 }
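/*
 * Editor's illustrative sketch: the drop-and-retake idiom used by
 * nfs4_dlistclean_impl() above.  The list lock cannot be held across
 * nfs4delegreturn_impl(), which may block on over-the-wire traffic, so
 * each node is unlinked under the lock, processed with the lock dropped,
 * and the lock is retaken before the next list_head() peek.  The names
 * lock, work_list and process() are hypothetical.
 */
#if 0
	mutex_enter(&lock);
	while ((dp = list_head(&work_list)) != NULL) {
		list_remove(&work_list, dp);	/* unlink under the lock */
		mutex_exit(&lock);
		process(dp);			/* may block; lock dropped */
		mutex_enter(&lock);
	}
	mutex_exit(&lock);
#endif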
2528 
2529 void
2530 nfs4_dlistclean(void)
2531 {
2532 	struct nfs4_callback_globals *ncg;
2533 
2534 	ncg = zone_getspecific(nfs4_callback_zone_key, nfs_zone());
2535 	ASSERT(ncg != NULL);
2536 
2537 	nfs4_dlistclean_impl(ncg, 0);
2538 }
2539 
2540 nfs4_rcsync_t *
2541 nfs4_recall_sync_start(mntinfo4_t *mi)
2542 {
2543 	nfs4_server_t *sp = mi->mi_srv;
2544 	nfs4_rcsync_t *rcp = kmem_zalloc(sizeof (*rcp), KM_SLEEP);
2545 
2546 	if (nfs4_server_vlock(sp, 0) == FALSE) {
2547 		rcp->rs_flags = RS_SERVER_GONE;
2548 		return (rcp);
2549 	}
2550 	rcp->rs_mi = mi;
2551 
2552 	mutex_enter(&sp->s_rcsync_lock);
2553 	rcp->rs_seq = sp->s_rcsync_seq++;
2554 	list_insert_tail(&sp->s_rcsync_list, rcp);
2555 	mutex_exit(&sp->s_rcsync_lock);
2556 
2557 	mutex_exit(&sp->s_lock);
2558 	nfs4_server_rele(sp);
2559 
2560 	return (rcp);
2561 }
2562 
2563 void
2564 nfs4_recall_sync_end(mntinfo4_t *mi, nfs4_rcsync_t *rcp)
2565 {
2566 	nfs4_server_t *sp = mi->mi_srv;
2567 
2568 	if (nfs4_server_vlock(sp, 1) == FALSE) {
2569 		ASSERT((rcp->rs_flags & RS_SERVER_GONE) != 0);
2570 		kmem_free(rcp, sizeof (*rcp));
2571 		return;
2572 	}
2573 
2574 	mutex_enter(&sp->s_rcsync_lock);
2575 	if ((rcp->rs_flags & RS_SERVER_GONE) == 0) {
2576 		list_remove(&sp->s_rcsync_list, rcp);
2577 		cv_broadcast(&sp->s_rcsync_cv);
2578 	}
2579 	mutex_exit(&sp->s_rcsync_lock);
2580 	mutex_exit(&sp->s_lock);
2581 	nfs4_server_rele(sp);
2582 	kmem_free(rcp, sizeof (*rcp));
2583 }
2584 
2585 static void
2586 nfs4_recall_sync_wait(nfs4_server_t *sp)
2587 {
2588 	uint64_t seq;
2589 
2590 	mutex_enter(&sp->s_rcsync_lock);
2591 	seq = sp->s_rcsync_seq;
2592 
2593 	while (!list_is_empty(&sp->s_rcsync_list)) {
2594 		nfs4_rcsync_t *rcp = list_head(&sp->s_rcsync_list);
2595 
2596 		if (rcp->rs_seq >= seq)
2597 			break;
2598 
2599 		cv_wait(&sp->s_rcsync_cv, &sp->s_rcsync_lock);
2600 	}
2601 
2602 	mutex_exit(&sp->s_rcsync_lock);
2603 }
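/*
 * Editor's illustrative sketch: how the three s_rcsync routines above are
 * assumed to fit together.  A recall thread brackets its work with
 * start/end; a waiter calls nfs4_recall_sync_wait(), which snapshots
 * s_rcsync_seq and sleeps only until entries older than the snapshot
 * (rs_seq < seq) have drained, so later arrivals cannot extend the wait
 * indefinitely.
 */
#if 0
	/* recall thread */
	rcp = nfs4_recall_sync_start(mi);	/* enqueue with next rs_seq */
	/* ... process the CB_RECALL ... */
	nfs4_recall_sync_end(mi, rcp);		/* dequeue + cv_broadcast() */

	/* waiting thread (s_rcsync_lock taken inside) */
	nfs4_recall_sync_wait(sp);
#endif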
2604