xref: /illumos-gate/usr/src/uts/common/fs/nfs/nfs4_callback.c (revision 9514bcf4c37a9b87200462594803414d12cdd29d)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 /* Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */
27 /* All Rights Reserved */
28 
29 #include <sys/param.h>
30 #include <sys/types.h>
31 #include <sys/systm.h>
32 #include <sys/cred.h>
33 #include <sys/vfs.h>
34 #include <sys/vnode.h>
35 #include <sys/pathname.h>
36 #include <sys/sysmacros.h>
37 #include <sys/kmem.h>
38 #include <sys/kstat.h>
39 #include <sys/mkdev.h>
40 #include <sys/mount.h>
41 #include <sys/statvfs.h>
42 #include <sys/errno.h>
43 #include <sys/debug.h>
44 #include <sys/cmn_err.h>
45 #include <sys/utsname.h>
46 #include <sys/bootconf.h>
47 #include <sys/modctl.h>
48 #include <sys/acl.h>
49 #include <sys/flock.h>
50 #include <sys/kstr.h>
51 #include <sys/stropts.h>
52 #include <sys/strsubr.h>
53 #include <sys/atomic.h>
54 #include <sys/disp.h>
55 #include <sys/policy.h>
56 #include <sys/list.h>
57 #include <sys/zone.h>
58 
59 #include <rpc/types.h>
60 #include <rpc/auth.h>
61 #include <rpc/rpcsec_gss.h>
62 #include <rpc/clnt.h>
63 #include <rpc/xdr.h>
64 
65 #include <nfs/nfs.h>
66 #include <nfs/nfs_clnt.h>
67 #include <nfs/mount.h>
68 #include <nfs/nfs_acl.h>
69 
70 #include <fs/fs_subr.h>
71 
72 #include <nfs/nfs4.h>
73 #include <nfs/rnode4.h>
74 #include <nfs/nfs4_clnt.h>
75 #include <nfs/nfssys.h>
76 
77 #ifdef	DEBUG
/*
 * These are "special" state IDs and file handles that
 * match any delegation state ID or file handle.  This
 * is for testing purposes only.
 *
 * nfs4_deleg_any is compared against the stateid in cb_recall() and
 * nfs4_deleg_anyfh against the file handle in cb_getattr().
 *
 * Note that an octal escape consumes at most three digits, so each
 * "\0377"-style group in nfs4_deleg_fh is the byte \037 followed by a
 * literal digit character; the wildcard handle is therefore 8 bytes long
 * (sizeof - 1), not 4.
 * NOTE(review): "\377\376\375\374" may have been intended -- confirm
 * against whatever test tool sends this handle before changing it.
 */

stateid4 nfs4_deleg_any = { 0x7FFFFFF0 };
char nfs4_deleg_fh[] = "\0377\0376\0375\0374";
nfs_fh4 nfs4_deleg_anyfh = { sizeof (nfs4_deleg_fh)-1, nfs4_deleg_fh };
/* error injection: when not NFS4_OK, cb_getattr() fails with this status */
nfsstat4 cb4_getattr_fail = NFS4_OK;
/* error injection: when not NFS4_OK, cb_recall() fails with this status */
nfsstat4 cb4_recall_fail = NFS4_OK;

/* debug switch tested by the CB_NOTE/CB_WARN/CB_WARN1 macros */
int nfs4_callback_debug;
int nfs4_recall_debug;
int nfs4_drat_debug;
93 
94 #endif
95 
/*
 * Logging shorthands for this file.  Each one passes its message through
 * NFS4_DEBUG() keyed on nfs4_callback_debug: CB_NOTE at CE_NOTE level,
 * CB_WARN and CB_WARN1 at CE_WARN level (CB_WARN1 takes one extra
 * argument for the format string).
 */
#define	CB_NOTE(x)	NFS4_DEBUG(nfs4_callback_debug, (CE_NOTE, x))
#define	CB_WARN(x)	NFS4_DEBUG(nfs4_callback_debug, (CE_WARN, x))
#define	CB_WARN1(x, y)	NFS4_DEBUG(nfs4_callback_debug, (CE_WARN, x, y))
99 
/* Delegation return policy; starts out as INACTIVE. */
enum nfs4_delegreturn_policy nfs4_delegreturn_policy = INACTIVE;

/*
 * Zone key passed to zone_getspecific() to locate the per-zone
 * struct nfs4_callback_globals.
 */
static zone_key_t nfs4_callback_zone_key;
103 
/*
 * NFS4_MAPSIZE is the number of bytes we are willing to consume
 * for the block allocation map when the server grants a NFS_LIMIT_BLOCK
 * style delegation.
 *
 * NFS4_MAPWORDS is that size expressed in uint_t words and NbPW is the
 * number of bits in one such word (NBBY times the word size in bytes).
 * Every expansion is fully parenthesized so the macros evaluate as a
 * single value when used inside a larger expression such as
 * "x / NFS4_MAPWORDS" or "x % NFS4_MAPWORDS".
 */

#define	NFS4_MAPSIZE	8192
#define	NFS4_MAPWORDS	(NFS4_MAPSIZE / sizeof (uint_t))
#define	NbPW		(NBBY * sizeof (uint_t))
113 
/*
 * Number of callback RPC program numbers available in each zone.  It
 * sizes the nfs4prog2server[] table and bounds the valid program range
 * [NFS4_CALLBACK, NFS4_CALLBACK + nfs4_num_prognums).
 */
static int nfs4_num_prognums = 1024;
/* Callout table handed to svc_tli_kcreate() by nfs4_svc(). */
static SVC_CALLOUT_TABLE nfs4_cb_sct;
116 
/*
 * Element type of the per-zone nfs4_dlist, which
 * nfs4_callback_init_zone() creates with this structure's size and the
 * offset of its linkage member.
 */
struct nfs4_dnode {
	list_node_t	linkage;	/* linkage into nfs4_dlist */
	/* NOTE(review): presumably the rnode queued on the dlist - confirm */
	rnode4_t	*rnodep;
	int		flags;		/* Flags for nfs4delegreturn_impl() */
};
122 
/*
 * Template for the per-zone "nfs4_callback_stats" named kstat.
 * nfs4_callback_init_zone() copies it into each zone's
 * nfs4_callback_globals, so the initializers below must stay in the same
 * order as the members of struct nfs4_callback_stats.  All counters are
 * 64-bit unsigned values.
 */
static const struct nfs4_callback_stats nfs4_callback_stats_tmpl = {
	{ "delegations",	KSTAT_DATA_UINT64 },
	{ "cb_getattr",		KSTAT_DATA_UINT64 },
	{ "cb_recall",		KSTAT_DATA_UINT64 },
	{ "cb_null",		KSTAT_DATA_UINT64 },
	{ "cb_dispatch",	KSTAT_DATA_UINT64 },
	{ "delegaccept_r",	KSTAT_DATA_UINT64 },
	{ "delegaccept_rw",	KSTAT_DATA_UINT64 },
	{ "delegreturn",	KSTAT_DATA_UINT64 },
	{ "callbacks",		KSTAT_DATA_UINT64 },
	{ "claim_cur",		KSTAT_DATA_UINT64 },
	{ "claim_cur_ok",	KSTAT_DATA_UINT64 },
	{ "recall_trunc",	KSTAT_DATA_UINT64 },
	{ "recall_failed",	KSTAT_DATA_UINT64 },
	{ "return_limit_write",	KSTAT_DATA_UINT64 },
	{ "return_limit_addmap", KSTAT_DATA_UINT64 },
	{ "deleg_recover",	KSTAT_DATA_UINT64 },
	{ "cb_illegal",		KSTAT_DATA_UINT64 }
};
142 
/*
 * One callback transport endpoint, as recorded by nfs4_setport().
 * nfs4_cb_args() selects an entry by protofmly/proto and passes its
 * netid and uaddr to the server in the SETCLIENTID arguments.  Every
 * string is a fixed KNC_STRSIZE-byte buffer.
 */
struct nfs4_cb_port {
	list_node_t		linkage; /* linkage into per-zone port list */
	char			netid[KNC_STRSIZE];
	char			uaddr[KNC_STRSIZE];
	char			protofmly[KNC_STRSIZE];
	char			proto[KNC_STRSIZE];
};
150 
/*
 * Size in bytes of the attribute buffer that cb_getattr() allocates and
 * cb_getattr_free() releases.  It is assigned outside this part of the
 * file.
 */
static int cb_getattr_bytes;
152 
/*
 * Argument block taken by nfs4delegreturn_thread() (see its prototype
 * below).
 */
struct cb_recall_pass {
	rnode4_t	*rp;
	int		flags;		/* Flags for nfs4delegreturn_impl() */
	/* NOTE(review): presumably the CB_RECALL truncate flag - confirm */
	bool_t		truncate;
};
158 
/* Forward declarations of file-local helpers. */
static nfs4_open_stream_t *get_next_deleg_stream(rnode4_t *, int);
static void nfs4delegreturn_thread(struct cb_recall_pass *);
static int deleg_reopen(vnode_t *, bool_t *, struct nfs4_callback_globals *,
    int);
static void nfs4_dlistadd(rnode4_t *, struct nfs4_callback_globals *, int);
static void nfs4_dlistclean_impl(struct nfs4_callback_globals *, int);
static int nfs4delegreturn_impl(rnode4_t *, int,
    struct nfs4_callback_globals *);
static void nfs4delegreturn_cleanup_impl(rnode4_t *, nfs4_server_t *,
    struct nfs4_callback_globals *);
169 
170 static void
171 cb_getattr(nfs_cb_argop4 *argop, nfs_cb_resop4 *resop, struct svc_req *req,
172     struct compound_state *cs, struct nfs4_callback_globals *ncg)
173 {
174 	CB_GETATTR4args *args = &argop->nfs_cb_argop4_u.opcbgetattr;
175 	CB_GETATTR4res *resp = &resop->nfs_cb_resop4_u.opcbgetattr;
176 	rnode4_t *rp;
177 	vnode_t *vp;
178 	bool_t found = FALSE;
179 	struct nfs4_server *sp;
180 	struct fattr4 *fap;
181 	rpc_inline_t *fdata;
182 	long mapcnt;
183 	fattr4_change change;
184 	fattr4_size size;
185 	uint_t rflag;
186 
187 	ncg->nfs4_callback_stats.cb_getattr.value.ui64++;
188 
189 #ifdef DEBUG
190 	/*
191 	 * error injection hook: set cb_getattr_fail global to
192 	 * NFS4 pcol error to be returned
193 	 */
194 	if (cb4_getattr_fail != NFS4_OK) {
195 		*cs->statusp = resp->status = cb4_getattr_fail;
196 		return;
197 	}
198 #endif
199 
200 	resp->obj_attributes.attrmask = 0;
201 
202 	mutex_enter(&ncg->nfs4_cb_lock);
203 	sp = ncg->nfs4prog2server[req->rq_prog - NFS4_CALLBACK];
204 	mutex_exit(&ncg->nfs4_cb_lock);
205 
206 	if (nfs4_server_vlock(sp, 0) == FALSE) {
207 
208 		CB_WARN("cb_getattr: cannot find server\n");
209 
210 		*cs->statusp = resp->status = NFS4ERR_BADHANDLE;
211 		return;
212 	}
213 
214 	/*
215 	 * In cb_compound, callback_ident was validated against rq_prog,
216 	 * but we couldn't verify that it was set to the value we provided
217 	 * at setclientid time (because we didn't have server struct yet).
218 	 * Now we have the server struct, but don't have callback_ident
219 	 * handy.  So, validate server struct program number against req
220 	 * RPC's prog number.  At this point, we know the RPC prog num
221 	 * is valid (else we wouldn't be here); however, we don't know
222 	 * that it was the prog number we supplied to this server at
223 	 * setclientid time.  If the prog numbers aren't equivalent, then
224 	 * log the problem and fail the request because either cbserv
225 	 * and/or cbclient are confused.  This will probably never happen.
226 	 */
227 	if (sp->s_program != req->rq_prog) {
228 #ifdef DEBUG
229 		zcmn_err(getzoneid(), CE_WARN,
230 		    "cb_getattr: wrong server program number srv=%d req=%d\n",
231 		    sp->s_program, req->rq_prog);
232 #else
233 		zcmn_err(getzoneid(), CE_WARN,
234 		    "cb_getattr: wrong server program number\n");
235 #endif
236 		mutex_exit(&sp->s_lock);
237 		nfs4_server_rele(sp);
238 		*cs->statusp = resp->status = NFS4ERR_BADHANDLE;
239 		return;
240 	}
241 
242 	/*
243 	 * Search the delegation list for a matching file handle;
244 	 * mutex on sp prevents the list from changing.
245 	 */
246 
247 	rp = list_head(&sp->s_deleg_list);
248 	for (; rp != NULL; rp = list_next(&sp->s_deleg_list, rp)) {
249 		nfs4_fhandle_t fhandle;
250 
251 		sfh4_copyval(rp->r_fh, &fhandle);
252 
253 		if ((fhandle.fh_len == args->fh.nfs_fh4_len &&
254 		    bcmp(fhandle.fh_buf, args->fh.nfs_fh4_val,
255 		    fhandle.fh_len) == 0)) {
256 
257 			found = TRUE;
258 			break;
259 		}
260 #ifdef	DEBUG
261 		if (nfs4_deleg_anyfh.nfs_fh4_len == args->fh.nfs_fh4_len &&
262 		    bcmp(nfs4_deleg_anyfh.nfs_fh4_val, args->fh.nfs_fh4_val,
263 		    args->fh.nfs_fh4_len) == 0) {
264 
265 			found = TRUE;
266 			break;
267 		}
268 #endif
269 	}
270 
271 	/*
272 	 * VN_HOLD the vnode before releasing s_lock to guarantee
273 	 * we have a valid vnode reference.
274 	 */
275 	if (found == TRUE) {
276 		vp = RTOV4(rp);
277 		VN_HOLD(vp);
278 	}
279 
280 	mutex_exit(&sp->s_lock);
281 	nfs4_server_rele(sp);
282 
283 	if (found == FALSE) {
284 
285 		CB_WARN("cb_getattr: bad fhandle\n");
286 
287 		*cs->statusp = resp->status = NFS4ERR_BADHANDLE;
288 		return;
289 	}
290 
291 	/*
292 	 * Figure out which attributes the server wants.  We only
293 	 * offer FATTR4_CHANGE & FATTR4_SIZE; ignore the rest.
294 	 */
295 	fdata = kmem_alloc(cb_getattr_bytes, KM_SLEEP);
296 
297 	/*
298 	 * Don't actually need to create XDR to encode these
299 	 * simple data structures.
300 	 * xdrmem_create(&xdr, fdata, cb_getattr_bytes, XDR_ENCODE);
301 	 */
302 	fap = &resp->obj_attributes;
303 
304 	fap->attrmask = 0;
305 	/* attrlist4_len starts at 0 and increases as attrs are processed */
306 	fap->attrlist4 = (char *)fdata;
307 	fap->attrlist4_len = 0;
308 
309 	/* don't supply attrs if request was zero */
310 	if (args->attr_request != 0) {
311 		if (args->attr_request & FATTR4_CHANGE_MASK) {
312 			/*
313 			 * If the file is mmapped, then increment the change
314 			 * attribute and return it.  This will guarantee that
315 			 * the server will perceive that the file has changed
316 			 * if there is any chance that the client application
317 			 * has changed it.  Otherwise, just return the change
318 			 * attribute as it has been updated by nfs4write_deleg.
319 			 */
320 
321 			mutex_enter(&rp->r_statelock);
322 			mapcnt = rp->r_mapcnt;
323 			rflag = rp->r_flags;
324 			mutex_exit(&rp->r_statelock);
325 
326 			mutex_enter(&rp->r_statev4_lock);
327 			/*
328 			 * If object mapped, then always return new change.
329 			 * Otherwise, return change if object has dirty
330 			 * pages.  If object doesn't have any dirty pages,
331 			 * then all changes have been pushed to server, so
332 			 * reset change to grant change.
333 			 */
334 			if (mapcnt)
335 				rp->r_deleg_change++;
336 			else if (! (rflag & R4DIRTY))
337 				rp->r_deleg_change = rp->r_deleg_change_grant;
338 			change = rp->r_deleg_change;
339 			mutex_exit(&rp->r_statev4_lock);
340 
341 			/*
342 			 * Use inline XDR code directly, we know that we
343 			 * going to a memory buffer and it has enough
344 			 * space so it cannot fail.
345 			 */
346 			IXDR_PUT_U_HYPER(fdata, change);
347 			fap->attrlist4_len += 2 * BYTES_PER_XDR_UNIT;
348 			fap->attrmask |= FATTR4_CHANGE_MASK;
349 		}
350 
351 		if (args->attr_request & FATTR4_SIZE_MASK) {
352 			/*
353 			 * Use an atomic add of 0 to fetch a consistent view
354 			 * of r_size; this avoids having to take rw_lock
355 			 * which could cause a deadlock.
356 			 */
357 			size = atomic_add_64_nv((uint64_t *)&rp->r_size, 0);
358 
359 			/*
360 			 * Use inline XDR code directly, we know that we
361 			 * going to a memory buffer and it has enough
362 			 * space so it cannot fail.
363 			 */
364 			IXDR_PUT_U_HYPER(fdata, size);
365 			fap->attrlist4_len += 2 * BYTES_PER_XDR_UNIT;
366 			fap->attrmask |= FATTR4_SIZE_MASK;
367 		}
368 	}
369 
370 	VN_RELE(vp);
371 
372 	*cs->statusp = resp->status = NFS4_OK;
373 }
374 
375 static void
376 cb_getattr_free(nfs_cb_resop4 *resop)
377 {
378 	if (resop->nfs_cb_resop4_u.opcbgetattr.obj_attributes.attrlist4)
379 		kmem_free(resop->nfs_cb_resop4_u.opcbgetattr.
380 		    obj_attributes.attrlist4, cb_getattr_bytes);
381 }
382 
383 static void
384 cb_recall(nfs_cb_argop4 *argop, nfs_cb_resop4 *resop, struct svc_req *req,
385     struct compound_state *cs, struct nfs4_callback_globals *ncg)
386 {
387 	CB_RECALL4args * args = &argop->nfs_cb_argop4_u.opcbrecall;
388 	CB_RECALL4res *resp = &resop->nfs_cb_resop4_u.opcbrecall;
389 	rnode4_t *rp;
390 	vnode_t *vp;
391 	struct nfs4_server *sp;
392 	bool_t found = FALSE;
393 
394 	ncg->nfs4_callback_stats.cb_recall.value.ui64++;
395 
396 	ASSERT(req->rq_prog >= NFS4_CALLBACK);
397 	ASSERT(req->rq_prog < NFS4_CALLBACK+nfs4_num_prognums);
398 
399 #ifdef DEBUG
400 	/*
401 	 * error injection hook: set cb_recall_fail global to
402 	 * NFS4 pcol error to be returned
403 	 */
404 	if (cb4_recall_fail != NFS4_OK) {
405 		*cs->statusp = resp->status = cb4_recall_fail;
406 		return;
407 	}
408 #endif
409 
410 	mutex_enter(&ncg->nfs4_cb_lock);
411 	sp = ncg->nfs4prog2server[req->rq_prog - NFS4_CALLBACK];
412 	mutex_exit(&ncg->nfs4_cb_lock);
413 
414 	if (nfs4_server_vlock(sp, 0) == FALSE) {
415 
416 		CB_WARN("cb_recall: cannot find server\n");
417 
418 		*cs->statusp = resp->status = NFS4ERR_BADHANDLE;
419 		return;
420 	}
421 
422 	/*
423 	 * Search the delegation list for a matching file handle
424 	 * AND stateid; mutex on sp prevents the list from changing.
425 	 */
426 
427 	rp = list_head(&sp->s_deleg_list);
428 	for (; rp != NULL; rp = list_next(&sp->s_deleg_list, rp)) {
429 		mutex_enter(&rp->r_statev4_lock);
430 
431 		/* check both state id and file handle! */
432 
433 		if ((bcmp(&rp->r_deleg_stateid, &args->stateid,
434 		    sizeof (stateid4)) == 0)) {
435 			nfs4_fhandle_t fhandle;
436 
437 			sfh4_copyval(rp->r_fh, &fhandle);
438 			if ((fhandle.fh_len == args->fh.nfs_fh4_len &&
439 			    bcmp(fhandle.fh_buf, args->fh.nfs_fh4_val,
440 			    fhandle.fh_len) == 0)) {
441 
442 				found = TRUE;
443 				break;
444 			} else {
445 #ifdef	DEBUG
446 				CB_WARN("cb_recall: stateid OK, bad fh");
447 #endif
448 			}
449 		}
450 #ifdef	DEBUG
451 		if (bcmp(&args->stateid, &nfs4_deleg_any,
452 		    sizeof (stateid4)) == 0) {
453 
454 			found = TRUE;
455 			break;
456 		}
457 #endif
458 		mutex_exit(&rp->r_statev4_lock);
459 	}
460 
461 	/*
462 	 * VN_HOLD the vnode before releasing s_lock to guarantee
463 	 * we have a valid vnode reference.  The async thread will
464 	 * release the hold when it's done.
465 	 */
466 	if (found == TRUE) {
467 		mutex_exit(&rp->r_statev4_lock);
468 		vp = RTOV4(rp);
469 		VN_HOLD(vp);
470 	}
471 	mutex_exit(&sp->s_lock);
472 	nfs4_server_rele(sp);
473 
474 	if (found == FALSE) {
475 
476 		CB_WARN("cb_recall: bad stateid\n");
477 
478 		*cs->statusp = resp->status = NFS4ERR_BAD_STATEID;
479 		return;
480 	}
481 
482 	/* Fire up a thread to do the delegreturn */
483 	nfs4delegreturn_async(rp, NFS4_DR_RECALL|NFS4_DR_REOPEN,
484 	    args->truncate);
485 
486 	*cs->statusp = resp->status = 0;
487 }
488 
/*
 * Counterpart of cb_getattr_free() for CB_RECALL results; called from
 * cb_compound_free().  It is intentionally empty.
 */
/* ARGSUSED */
static void
cb_recall_free(nfs_cb_resop4 *resop)
{
	/* nothing to do here, cb_recall doesn't kmem_alloc */
}
495 
496 /*
497  * This function handles the CB_NULL proc call from an NFSv4 Server.
498  *
499  * We take note that the server has sent a CB_NULL for later processing
500  * in the recovery logic. It is noted so we may pause slightly after the
501  * setclientid and before reopening files. The pause is to allow the
502  * NFSv4 Server time to receive the CB_NULL reply and adjust any of
503  * its internal structures such that it has the opportunity to grant
504  * delegations to reopened files.
505  *
506  */
507 
508 /* ARGSUSED */
509 static void
510 cb_null(CB_COMPOUND4args *args, CB_COMPOUND4res *resp, struct svc_req *req,
511     struct nfs4_callback_globals *ncg)
512 {
513 	struct nfs4_server *sp;
514 
515 	ncg->nfs4_callback_stats.cb_null.value.ui64++;
516 
517 	ASSERT(req->rq_prog >= NFS4_CALLBACK);
518 	ASSERT(req->rq_prog < NFS4_CALLBACK+nfs4_num_prognums);
519 
520 	mutex_enter(&ncg->nfs4_cb_lock);
521 	sp = ncg->nfs4prog2server[req->rq_prog - NFS4_CALLBACK];
522 	mutex_exit(&ncg->nfs4_cb_lock);
523 
524 	if (nfs4_server_vlock(sp, 0) != FALSE) {
525 		sp->s_flags |= N4S_CB_PINGED;
526 		cv_broadcast(&sp->wait_cb_null);
527 		mutex_exit(&sp->s_lock);
528 		nfs4_server_rele(sp);
529 	}
530 }
531 
532 /*
533  * cb_illegal	args: void
534  *		res : status (NFS4ERR_OP_CB_ILLEGAL)
535  */
536 /* ARGSUSED */
537 static void
538 cb_illegal(nfs_cb_argop4 *argop, nfs_cb_resop4 *resop, struct svc_req *req,
539     struct compound_state *cs, struct nfs4_callback_globals *ncg)
540 {
541 	CB_ILLEGAL4res *resp = &resop->nfs_cb_resop4_u.opcbillegal;
542 
543 	ncg->nfs4_callback_stats.cb_illegal.value.ui64++;
544 	resop->resop = OP_CB_ILLEGAL;
545 	*cs->statusp = resp->status = NFS4ERR_OP_ILLEGAL;
546 }
547 
548 static void
549 cb_compound(CB_COMPOUND4args *args, CB_COMPOUND4res *resp, struct svc_req *req,
550     struct nfs4_callback_globals *ncg)
551 {
552 	uint_t i;
553 	struct compound_state cs;
554 	nfs_cb_argop4 *argop;
555 	nfs_cb_resop4 *resop, *new_res;
556 	uint_t op;
557 
558 	bzero(&cs, sizeof (cs));
559 	cs.statusp = &resp->status;
560 	cs.cont = TRUE;
561 
562 	/*
563 	 * Form a reply tag by copying over the request tag.
564 	 */
565 	resp->tag.utf8string_len = args->tag.utf8string_len;
566 	if (args->tag.utf8string_len != 0) {
567 		resp->tag.utf8string_val =
568 		    kmem_alloc(resp->tag.utf8string_len, KM_SLEEP);
569 		bcopy(args->tag.utf8string_val, resp->tag.utf8string_val,
570 		    args->tag.utf8string_len);
571 	} else {
572 		resp->tag.utf8string_val = NULL;
573 	}
574 
575 	/*
576 	 * XXX for now, minorversion should be zero
577 	 */
578 	if (args->minorversion != CB4_MINORVERSION) {
579 		resp->array_len = 0;
580 		resp->array = NULL;
581 		resp->status = NFS4ERR_MINOR_VERS_MISMATCH;
582 		return;
583 	}
584 
585 #ifdef DEBUG
586 	/*
587 	 * Verify callback_ident.  It doesn't really matter if it's wrong
588 	 * because we don't really use callback_ident -- we use prog number
589 	 * of the RPC request instead.  In this case, just print a DEBUG
590 	 * console message to reveal brokenness of cbclient (at bkoff/cthon).
591 	 */
592 	if (args->callback_ident != req->rq_prog)
593 		zcmn_err(getzoneid(), CE_WARN,
594 		    "cb_compound: cb_client using wrong "
595 		    "callback_ident(%d), should be %d",
596 		    args->callback_ident, req->rq_prog);
597 #endif
598 
599 	resp->array_len = args->array_len;
600 	resp->array = kmem_zalloc(args->array_len * sizeof (nfs_cb_resop4),
601 	    KM_SLEEP);
602 
603 	for (i = 0; i < args->array_len && cs.cont; i++) {
604 
605 		argop = &args->array[i];
606 		resop = &resp->array[i];
607 		resop->resop = argop->argop;
608 		op = (uint_t)resop->resop;
609 
610 		switch (op) {
611 
612 		case OP_CB_GETATTR:
613 
614 			cb_getattr(argop, resop, req, &cs, ncg);
615 			break;
616 
617 		case OP_CB_RECALL:
618 
619 			cb_recall(argop, resop, req, &cs, ncg);
620 			break;
621 
622 		case OP_CB_ILLEGAL:
623 
624 			/* fall through */
625 
626 		default:
627 			/*
628 			 * Handle OP_CB_ILLEGAL and any undefined opcode.
629 			 * Currently, the XDR code will return BADXDR
630 			 * if cb op doesn't decode to legal value, so
631 			 * it really only handles OP_CB_ILLEGAL.
632 			 */
633 			op = OP_CB_ILLEGAL;
634 			cb_illegal(argop, resop, req, &cs, ncg);
635 		}
636 
637 		if (*cs.statusp != NFS4_OK)
638 			cs.cont = FALSE;
639 
640 		/*
641 		 * If not at last op, and if we are to stop, then
642 		 * compact the results array.
643 		 */
644 		if ((i + 1) < args->array_len && !cs.cont) {
645 
646 			new_res = kmem_alloc(
647 			    (i+1) * sizeof (nfs_cb_resop4), KM_SLEEP);
648 			bcopy(resp->array,
649 			    new_res, (i+1) * sizeof (nfs_cb_resop4));
650 			kmem_free(resp->array,
651 			    args->array_len * sizeof (nfs_cb_resop4));
652 
653 			resp->array_len =  i + 1;
654 			resp->array = new_res;
655 		}
656 	}
657 
658 }
659 
660 static void
661 cb_compound_free(CB_COMPOUND4res *resp)
662 {
663 	uint_t i, op;
664 	nfs_cb_resop4 *resop;
665 
666 	if (resp->tag.utf8string_val) {
667 		UTF8STRING_FREE(resp->tag)
668 	}
669 
670 	for (i = 0; i < resp->array_len; i++) {
671 
672 		resop = &resp->array[i];
673 		op = (uint_t)resop->resop;
674 
675 		switch (op) {
676 
677 		case OP_CB_GETATTR:
678 
679 			cb_getattr_free(resop);
680 			break;
681 
682 		case OP_CB_RECALL:
683 
684 			cb_recall_free(resop);
685 			break;
686 
687 		default:
688 			break;
689 		}
690 	}
691 
692 	if (resp->array != NULL) {
693 		kmem_free(resp->array,
694 		    resp->array_len * sizeof (nfs_cb_resop4));
695 	}
696 }
697 
698 static void
699 cb_dispatch(struct svc_req *req, SVCXPRT *xprt)
700 {
701 	CB_COMPOUND4args args;
702 	CB_COMPOUND4res res;
703 	struct nfs4_callback_globals *ncg;
704 
705 	bool_t (*xdr_args)(), (*xdr_res)();
706 	void (*proc)(CB_COMPOUND4args *, CB_COMPOUND4res *, struct svc_req *,
707 	    struct nfs4_callback_globals *);
708 	void (*freeproc)(CB_COMPOUND4res *);
709 
710 	ncg = zone_getspecific(nfs4_callback_zone_key, nfs_zone());
711 	ASSERT(ncg != NULL);
712 
713 	ncg->nfs4_callback_stats.cb_dispatch.value.ui64++;
714 
715 	switch (req->rq_proc) {
716 	case CB_NULL:
717 		xdr_args = xdr_void;
718 		xdr_res = xdr_void;
719 		proc = cb_null;
720 		freeproc = NULL;
721 		break;
722 
723 	case CB_COMPOUND:
724 		xdr_args = xdr_CB_COMPOUND4args_clnt;
725 		xdr_res = xdr_CB_COMPOUND4res;
726 		proc = cb_compound;
727 		freeproc = cb_compound_free;
728 		break;
729 
730 	default:
731 		CB_WARN("cb_dispatch: no proc\n");
732 		svcerr_noproc(xprt);
733 		return;
734 	}
735 
736 	args.tag.utf8string_val = NULL;
737 	args.array = NULL;
738 
739 	if (!SVC_GETARGS(xprt, xdr_args, (caddr_t)&args)) {
740 
741 		CB_WARN("cb_dispatch: cannot getargs\n");
742 		svcerr_decode(xprt);
743 		return;
744 	}
745 
746 	(*proc)(&args, &res, req, ncg);
747 
748 	if (svc_sendreply(xprt, xdr_res, (caddr_t)&res) == FALSE) {
749 
750 		CB_WARN("cb_dispatch: bad sendreply\n");
751 		svcerr_systemerr(xprt);
752 	}
753 
754 	if (freeproc)
755 		(*freeproc)(&res);
756 
757 	if (!SVC_FREEARGS(xprt, xdr_args, (caddr_t)&args)) {
758 
759 		CB_WARN("cb_dispatch: bad freeargs\n");
760 	}
761 }
762 
763 static rpcprog_t
764 nfs4_getnextprogram(struct nfs4_callback_globals *ncg)
765 {
766 	int i, j;
767 
768 	j = ncg->nfs4_program_hint;
769 	for (i = 0; i < nfs4_num_prognums; i++, j++) {
770 
771 		if (j >= nfs4_num_prognums)
772 			j = 0;
773 
774 		if (ncg->nfs4prog2server[j] == NULL) {
775 			ncg->nfs4_program_hint = j+1;
776 			return (j+NFS4_CALLBACK);
777 		}
778 	}
779 
780 	return (0);
781 }
782 
783 void
784 nfs4callback_destroy(nfs4_server_t *np)
785 {
786 	struct nfs4_callback_globals *ncg;
787 	int i;
788 
789 	if (np->s_program == 0)
790 		return;
791 
792 	ncg = np->zone_globals;
793 	i = np->s_program - NFS4_CALLBACK;
794 
795 	mutex_enter(&ncg->nfs4_cb_lock);
796 
797 	ASSERT(ncg->nfs4prog2server[i] == np);
798 
799 	ncg->nfs4prog2server[i] = NULL;
800 
801 	if (i < ncg->nfs4_program_hint)
802 		ncg->nfs4_program_hint = i;
803 
804 	mutex_exit(&ncg->nfs4_cb_lock);
805 }
806 
807 /*
808  * nfs4_setport - This function saves a netid and univeral address for
809  * the callback program.  These values will be used during setclientid.
810  */
811 static void
812 nfs4_setport(char *netid, char *uaddr, char *protofmly, char *proto,
813     struct nfs4_callback_globals *ncg)
814 {
815 	struct nfs4_cb_port *p;
816 	bool_t found = FALSE;
817 
818 	ASSERT(MUTEX_HELD(&ncg->nfs4_cb_lock));
819 
820 	p = list_head(&ncg->nfs4_cb_ports);
821 	for (; p != NULL; p = list_next(&ncg->nfs4_cb_ports, p)) {
822 		if (strcmp(p->netid, netid) == 0) {
823 			found = TRUE;
824 			break;
825 		}
826 	}
827 	if (found == TRUE)
828 		(void) strcpy(p->uaddr, uaddr);
829 	else {
830 		p = kmem_alloc(sizeof (*p), KM_SLEEP);
831 
832 		(void) strcpy(p->uaddr, uaddr);
833 		(void) strcpy(p->netid, netid);
834 		(void) strcpy(p->protofmly, protofmly);
835 		(void) strcpy(p->proto, proto);
836 		list_insert_head(&ncg->nfs4_cb_ports, p);
837 	}
838 }
839 
840 /*
841  * nfs4_cb_args - This function is used to construct the callback
842  * portion of the arguments needed for setclientid.
843  */
844 
845 void
846 nfs4_cb_args(nfs4_server_t *np, struct knetconfig *knc, SETCLIENTID4args *args)
847 {
848 	struct nfs4_cb_port *p;
849 	bool_t found = FALSE;
850 	rpcprog_t pgm;
851 	struct nfs4_callback_globals *ncg = np->zone_globals;
852 
853 	/*
854 	 * This server structure may already have a program number
855 	 * assigned to it.  This happens when the client has to
856 	 * re-issue SETCLIENTID.  Just re-use the information.
857 	 */
858 	if (np->s_program >= NFS4_CALLBACK &&
859 	    np->s_program < NFS4_CALLBACK + nfs4_num_prognums)
860 		nfs4callback_destroy(np);
861 
862 	mutex_enter(&ncg->nfs4_cb_lock);
863 
864 	p = list_head(&ncg->nfs4_cb_ports);
865 	for (; p != NULL; p = list_next(&ncg->nfs4_cb_ports, p)) {
866 		if (strcmp(p->protofmly, knc->knc_protofmly) == 0 &&
867 		    strcmp(p->proto, knc->knc_proto) == 0) {
868 			found = TRUE;
869 			break;
870 		}
871 	}
872 
873 	if (found == FALSE) {
874 
875 		NFS4_DEBUG(nfs4_callback_debug,
876 		    (CE_WARN, "nfs4_cb_args: could not find netid for %s/%s\n",
877 		    knc->knc_protofmly, knc->knc_proto));
878 
879 		args->callback.cb_program = 0;
880 		args->callback.cb_location.r_netid = NULL;
881 		args->callback.cb_location.r_addr = NULL;
882 		args->callback_ident = 0;
883 		mutex_exit(&ncg->nfs4_cb_lock);
884 		return;
885 	}
886 
887 	if ((pgm = nfs4_getnextprogram(ncg)) == 0) {
888 		CB_WARN("nfs4_cb_args: out of program numbers\n");
889 
890 		args->callback.cb_program = 0;
891 		args->callback.cb_location.r_netid = NULL;
892 		args->callback.cb_location.r_addr = NULL;
893 		args->callback_ident = 0;
894 		mutex_exit(&ncg->nfs4_cb_lock);
895 		return;
896 	}
897 
898 	ncg->nfs4prog2server[pgm-NFS4_CALLBACK] = np;
899 	args->callback.cb_program = pgm;
900 	args->callback.cb_location.r_netid = p->netid;
901 	args->callback.cb_location.r_addr = p->uaddr;
902 	args->callback_ident = pgm;
903 
904 	np->s_program = pgm;
905 
906 	mutex_exit(&ncg->nfs4_cb_lock);
907 }
908 
909 static int
910 nfs4_dquery(struct nfs4_svc_args *arg, model_t model)
911 {
912 	file_t *fp;
913 	vnode_t *vp;
914 	rnode4_t *rp;
915 	int error;
916 	STRUCT_HANDLE(nfs4_svc_args, uap);
917 
918 	STRUCT_SET_HANDLE(uap, model, arg);
919 
920 	if ((fp = getf(STRUCT_FGET(uap, fd))) == NULL)
921 		return (EBADF);
922 
923 	vp = fp->f_vnode;
924 
925 	if (vp == NULL || vp->v_type != VREG ||
926 	    !vn_matchops(vp, nfs4_vnodeops)) {
927 		releasef(STRUCT_FGET(uap, fd));
928 		return (EBADF);
929 	}
930 
931 	rp = VTOR4(vp);
932 
933 	/*
934 	 * I can't convince myself that we need locking here.  The
935 	 * rnode cannot disappear and the value returned is instantly
936 	 * stale anway, so why bother?
937 	 */
938 
939 	error = suword32(STRUCT_FGETP(uap, netid), rp->r_deleg_type);
940 	releasef(STRUCT_FGET(uap, fd));
941 	return (error);
942 }
943 
944 
945 /*
946  * NFS4 client system call.  This service does the
947  * necessary initialization for the callback program.
948  * This is fashioned after the server side interaction
949  * between nfsd and the kernel.  On the client, the
950  * mount command forks and the child process does the
951  * necessary interaction with the kernel.
952  *
953  * uap->fd is the fd of an open transport provider
954  */
955 int
956 nfs4_svc(struct nfs4_svc_args *arg, model_t model)
957 {
958 	file_t *fp;
959 	int error;
960 	int readsize;
961 	char buf[KNC_STRSIZE], uaddr[KNC_STRSIZE];
962 	char protofmly[KNC_STRSIZE], proto[KNC_STRSIZE];
963 	size_t len;
964 	STRUCT_HANDLE(nfs4_svc_args, uap);
965 	struct netbuf addrmask;
966 	int cmd;
967 	SVCMASTERXPRT *cb_xprt;
968 	struct nfs4_callback_globals *ncg;
969 
970 #ifdef lint
971 	model = model;		/* STRUCT macros don't always refer to it */
972 #endif
973 
974 	STRUCT_SET_HANDLE(uap, model, arg);
975 
976 	if (STRUCT_FGET(uap, cmd) == NFS4_DQUERY)
977 		return (nfs4_dquery(arg, model));
978 
979 	if (secpolicy_nfs(CRED()) != 0)
980 		return (EPERM);
981 
982 	if ((fp = getf(STRUCT_FGET(uap, fd))) == NULL)
983 		return (EBADF);
984 
985 	/*
986 	 * Set read buffer size to rsize
987 	 * and add room for RPC headers.
988 	 */
989 	readsize = nfs3tsize() + (RPC_MAXDATASIZE - NFS_MAXDATA);
990 	if (readsize < RPC_MAXDATASIZE)
991 		readsize = RPC_MAXDATASIZE;
992 
993 	error = copyinstr((const char *)STRUCT_FGETP(uap, netid), buf,
994 	    KNC_STRSIZE, &len);
995 	if (error) {
996 		releasef(STRUCT_FGET(uap, fd));
997 		return (error);
998 	}
999 
1000 	cmd = STRUCT_FGET(uap, cmd);
1001 
1002 	if (cmd & NFS4_KRPC_START) {
1003 		addrmask.len = STRUCT_FGET(uap, addrmask.len);
1004 		addrmask.maxlen = STRUCT_FGET(uap, addrmask.maxlen);
1005 		addrmask.buf = kmem_alloc(addrmask.maxlen, KM_SLEEP);
1006 		error = copyin(STRUCT_FGETP(uap, addrmask.buf), addrmask.buf,
1007 		    addrmask.len);
1008 		if (error) {
1009 			releasef(STRUCT_FGET(uap, fd));
1010 			kmem_free(addrmask.buf, addrmask.maxlen);
1011 			return (error);
1012 		}
1013 	}
1014 	else
1015 		addrmask.buf = NULL;
1016 
1017 	error = copyinstr((const char *)STRUCT_FGETP(uap, addr), uaddr,
1018 	    sizeof (uaddr), &len);
1019 	if (error) {
1020 		releasef(STRUCT_FGET(uap, fd));
1021 		if (addrmask.buf)
1022 			kmem_free(addrmask.buf, addrmask.maxlen);
1023 		return (error);
1024 	}
1025 
1026 	error = copyinstr((const char *)STRUCT_FGETP(uap, protofmly), protofmly,
1027 	    sizeof (protofmly), &len);
1028 	if (error) {
1029 		releasef(STRUCT_FGET(uap, fd));
1030 		if (addrmask.buf)
1031 			kmem_free(addrmask.buf, addrmask.maxlen);
1032 		return (error);
1033 	}
1034 
1035 	error = copyinstr((const char *)STRUCT_FGETP(uap, proto), proto,
1036 	    sizeof (proto), &len);
1037 	if (error) {
1038 		releasef(STRUCT_FGET(uap, fd));
1039 		if (addrmask.buf)
1040 			kmem_free(addrmask.buf, addrmask.maxlen);
1041 		return (error);
1042 	}
1043 
1044 	ncg = zone_getspecific(nfs4_callback_zone_key, nfs_zone());
1045 	ASSERT(ncg != NULL);
1046 
1047 	mutex_enter(&ncg->nfs4_cb_lock);
1048 	if (cmd & NFS4_SETPORT)
1049 		nfs4_setport(buf, uaddr, protofmly, proto, ncg);
1050 
1051 	if (cmd & NFS4_KRPC_START) {
1052 		error = svc_tli_kcreate(fp, readsize, buf, &addrmask, &cb_xprt,
1053 		    &nfs4_cb_sct, NULL, NFS_CB_SVCPOOL_ID, FALSE);
1054 		if (error) {
1055 			CB_WARN1("nfs4_svc: svc_tli_kcreate failed %d\n",
1056 			    error);
1057 			kmem_free(addrmask.buf, addrmask.maxlen);
1058 		}
1059 	}
1060 
1061 	mutex_exit(&ncg->nfs4_cb_lock);
1062 	releasef(STRUCT_FGET(uap, fd));
1063 	return (error);
1064 }
1065 
1066 struct nfs4_callback_globals *
1067 nfs4_get_callback_globals(void)
1068 {
1069 	return (zone_getspecific(nfs4_callback_zone_key, nfs_zone()));
1070 }
1071 
1072 static void *
1073 nfs4_callback_init_zone(zoneid_t zoneid)
1074 {
1075 	kstat_t *nfs4_callback_kstat;
1076 	struct nfs4_callback_globals *ncg;
1077 
1078 	ncg = kmem_zalloc(sizeof (*ncg), KM_SLEEP);
1079 
1080 	ncg->nfs4prog2server = kmem_zalloc(nfs4_num_prognums *
1081 	    sizeof (struct nfs4_server *), KM_SLEEP);
1082 
1083 	/* initialize the dlist */
1084 	mutex_init(&ncg->nfs4_dlist_lock, NULL, MUTEX_DEFAULT, NULL);
1085 	list_create(&ncg->nfs4_dlist, sizeof (struct nfs4_dnode),
1086 	    offsetof(struct nfs4_dnode, linkage));
1087 
1088 	/* initialize cb_port list */
1089 	mutex_init(&ncg->nfs4_cb_lock, NULL, MUTEX_DEFAULT, NULL);
1090 	list_create(&ncg->nfs4_cb_ports, sizeof (struct nfs4_cb_port),
1091 	    offsetof(struct nfs4_cb_port, linkage));
1092 
1093 	/* get our own copy of the kstats */
1094 	bcopy(&nfs4_callback_stats_tmpl, &ncg->nfs4_callback_stats,
1095 	    sizeof (nfs4_callback_stats_tmpl));
1096 	/* register "nfs:0:nfs4_callback_stats" for this zone */
1097 	if ((nfs4_callback_kstat =
1098 	    kstat_create_zone("nfs", 0, "nfs4_callback_stats", "misc",
1099 	    KSTAT_TYPE_NAMED,
1100 	    sizeof (ncg->nfs4_callback_stats) / sizeof (kstat_named_t),
1101 	    KSTAT_FLAG_VIRTUAL | KSTAT_FLAG_WRITABLE,
1102 	    zoneid)) != NULL) {
1103 		nfs4_callback_kstat->ks_data = &ncg->nfs4_callback_stats;
1104 		kstat_install(nfs4_callback_kstat);
1105 	}
1106 	return (ncg);
1107 }
1108 
1109 static void
1110 nfs4_discard_delegations(struct nfs4_callback_globals *ncg)
1111 {
1112 	nfs4_server_t *sp;
1113 	int i, num_removed;
1114 
1115 	/*
1116 	 * It's OK here to just run through the registered "programs", as
1117 	 * servers without programs won't have any delegations to handle.
1118 	 */
1119 	for (i = 0; i < nfs4_num_prognums; i++) {
1120 		rnode4_t *rp;
1121 
1122 		mutex_enter(&ncg->nfs4_cb_lock);
1123 		sp = ncg->nfs4prog2server[i];
1124 		mutex_exit(&ncg->nfs4_cb_lock);
1125 
1126 		if (nfs4_server_vlock(sp, 1) == FALSE)
1127 			continue;
1128 		num_removed = 0;
1129 		while ((rp = list_head(&sp->s_deleg_list)) != NULL) {
1130 			mutex_enter(&rp->r_statev4_lock);
1131 			if (rp->r_deleg_type == OPEN_DELEGATE_NONE) {
1132 				/*
1133 				 * We need to take matters into our own hands,
1134 				 * as nfs4delegreturn_cleanup_impl() won't
1135 				 * remove this from the list.
1136 				 */
1137 				list_remove(&sp->s_deleg_list, rp);
1138 				mutex_exit(&rp->r_statev4_lock);
1139 				nfs4_dec_state_ref_count_nolock(sp,
1140 				    VTOMI4(RTOV4(rp)));
1141 				num_removed++;
1142 				continue;
1143 			}
1144 			mutex_exit(&rp->r_statev4_lock);
1145 			VN_HOLD(RTOV4(rp));
1146 			mutex_exit(&sp->s_lock);
1147 			/*
1148 			 * The following will remove the node from the list.
1149 			 */
1150 			nfs4delegreturn_cleanup_impl(rp, sp, ncg);
1151 			VN_RELE(RTOV4(rp));
1152 			mutex_enter(&sp->s_lock);
1153 		}
1154 		mutex_exit(&sp->s_lock);
1155 		/* each removed list node reles a reference */
1156 		while (num_removed-- > 0)
1157 			nfs4_server_rele(sp);
1158 		/* remove our reference for nfs4_server_vlock */
1159 		nfs4_server_rele(sp);
1160 	}
1161 }
1162 
1163 /* ARGSUSED */
1164 static void
1165 nfs4_callback_shutdown_zone(zoneid_t zoneid, void *data)
1166 {
1167 	struct nfs4_callback_globals *ncg = data;
1168 
1169 	/*
1170 	 * Clean pending delegation return list.
1171 	 */
1172 	nfs4_dlistclean_impl(ncg, NFS4_DR_DISCARD);
1173 
1174 	/*
1175 	 * Discard all delegations.
1176 	 */
1177 	nfs4_discard_delegations(ncg);
1178 }
1179 
1180 static void
1181 nfs4_callback_fini_zone(zoneid_t zoneid, void *data)
1182 {
1183 	struct nfs4_callback_globals *ncg = data;
1184 	struct nfs4_cb_port *p;
1185 	nfs4_server_t *sp, *next;
1186 	nfs4_server_t freelist;
1187 	int i;
1188 
1189 	kstat_delete_byname_zone("nfs", 0, "nfs4_callback_stats", zoneid);
1190 
1191 	/*
1192 	 * Discard all delegations that may have crept in since we did the
1193 	 * _shutdown.
1194 	 */
1195 	nfs4_discard_delegations(ncg);
1196 	/*
1197 	 * We're completely done with this zone and all associated
1198 	 * nfs4_server_t's.  Any remaining nfs4_server_ts should only have one
1199 	 * more reference outstanding -- the reference we didn't release in
1200 	 * nfs4_renew_lease_thread().
1201 	 *
1202 	 * Here we need to run through the global nfs4_server_lst as we need to
1203 	 * deal with nfs4_server_ts without programs, as they also have threads
1204 	 * created for them, and so have outstanding references that we need to
1205 	 * release.
1206 	 */
1207 	freelist.forw = &freelist;
1208 	freelist.back = &freelist;
1209 	mutex_enter(&nfs4_server_lst_lock);
1210 	sp = nfs4_server_lst.forw;
1211 	while (sp != &nfs4_server_lst) {
1212 		next = sp->forw;
1213 		if (sp->zoneid == zoneid) {
1214 			remque(sp);
1215 			insque(sp, &freelist);
1216 		}
1217 		sp = next;
1218 	}
1219 	mutex_exit(&nfs4_server_lst_lock);
1220 
1221 	sp = freelist.forw;
1222 	while (sp != &freelist) {
1223 		next = sp->forw;
1224 		nfs4_server_rele(sp);	/* free the list's reference */
1225 		sp = next;
1226 	}
1227 
1228 #ifdef DEBUG
1229 	for (i = 0; i < nfs4_num_prognums; i++) {
1230 		ASSERT(ncg->nfs4prog2server[i] == NULL);
1231 	}
1232 #endif
1233 	kmem_free(ncg->nfs4prog2server, nfs4_num_prognums *
1234 	    sizeof (struct nfs4_server *));
1235 
1236 	mutex_enter(&ncg->nfs4_cb_lock);
1237 	while ((p = list_head(&ncg->nfs4_cb_ports)) != NULL) {
1238 		list_remove(&ncg->nfs4_cb_ports, p);
1239 		kmem_free(p, sizeof (*p));
1240 	}
1241 	list_destroy(&ncg->nfs4_cb_ports);
1242 	mutex_destroy(&ncg->nfs4_cb_lock);
1243 	list_destroy(&ncg->nfs4_dlist);
1244 	mutex_destroy(&ncg->nfs4_dlist_lock);
1245 	kmem_free(ncg, sizeof (*ncg));
1246 }
1247 
1248 void
1249 nfs4_callback_init(void)
1250 {
1251 	int i;
1252 	SVC_CALLOUT *nfs4_cb_sc;
1253 
1254 	/* initialize the callback table */
1255 	nfs4_cb_sc = kmem_alloc(nfs4_num_prognums *
1256 	    sizeof (SVC_CALLOUT), KM_SLEEP);
1257 
1258 	for (i = 0; i < nfs4_num_prognums; i++) {
1259 		nfs4_cb_sc[i].sc_prog = NFS4_CALLBACK+i;
1260 		nfs4_cb_sc[i].sc_versmin = NFS_CB;
1261 		nfs4_cb_sc[i].sc_versmax = NFS_CB;
1262 		nfs4_cb_sc[i].sc_dispatch = cb_dispatch;
1263 	}
1264 
1265 	nfs4_cb_sct.sct_size = nfs4_num_prognums;
1266 	nfs4_cb_sct.sct_free = FALSE;
1267 	nfs4_cb_sct.sct_sc = nfs4_cb_sc;
1268 
1269 	/*
1270 	 * Compute max bytes required for dyamically allocated parts
1271 	 * of cb_getattr reply.  Only size and change are supported now.
1272 	 * If CB_GETATTR is changed to reply with additional attrs,
1273 	 * additional sizes must be added below.
1274 	 *
1275 	 * fattr4_change + fattr4_size == uint64_t + uint64_t
1276 	 */
1277 	cb_getattr_bytes = 2 * BYTES_PER_XDR_UNIT + 2 * BYTES_PER_XDR_UNIT;
1278 
1279 	zone_key_create(&nfs4_callback_zone_key, nfs4_callback_init_zone,
1280 	    nfs4_callback_shutdown_zone, nfs4_callback_fini_zone);
1281 }
1282 
/*
 * Teardown hook for the NFSv4 callback service.  Currently a no-op: it
 * releases nothing and has no side effects.
 */
void
nfs4_callback_fini(void)
{
	/* intentionally empty */
}
1287 
1288 /*
1289  * NB: This function can be called from the *wrong* zone (ie, the zone that
1290  * 'rp' belongs to and the caller's zone may not be the same).  This can happen
1291  * if the zone is going away and we get called from nfs4_async_inactive().  In
1292  * this case the globals will be NULL and we won't update the counters, which
1293  * doesn't matter as the zone is going away anyhow.
1294  */
1295 static void
1296 nfs4delegreturn_cleanup_impl(rnode4_t *rp, nfs4_server_t *np,
1297     struct nfs4_callback_globals *ncg)
1298 {
1299 	mntinfo4_t *mi = VTOMI4(RTOV4(rp));
1300 	boolean_t need_rele = B_FALSE;
1301 
1302 	/*
1303 	 * Caller must be holding mi_recovlock in read mode
1304 	 * to call here.  This is provided by start_op.
1305 	 * Delegation management requires to grab s_lock
1306 	 * first and then r_statev4_lock.
1307 	 */
1308 
1309 	if (np == NULL) {
1310 		np = find_nfs4_server_all(mi, 1);
1311 		if (np == NULL)
1312 			return;
1313 		need_rele = B_TRUE;
1314 	} else {
1315 		mutex_enter(&np->s_lock);
1316 	}
1317 
1318 	mutex_enter(&rp->r_statev4_lock);
1319 
1320 	if (rp->r_deleg_type == OPEN_DELEGATE_NONE) {
1321 		mutex_exit(&rp->r_statev4_lock);
1322 		mutex_exit(&np->s_lock);
1323 		if (need_rele)
1324 			nfs4_server_rele(np);
1325 		return;
1326 	}
1327 
1328 	/*
1329 	 * Free the cred originally held when
1330 	 * the delegation was granted.  Caller must
1331 	 * hold this cred if it wants to use it after
1332 	 * this call.
1333 	 */
1334 	crfree(rp->r_deleg_cred);
1335 	rp->r_deleg_cred = NULL;
1336 	rp->r_deleg_type = OPEN_DELEGATE_NONE;
1337 	rp->r_deleg_needs_recovery = OPEN_DELEGATE_NONE;
1338 	rp->r_deleg_needs_recall = FALSE;
1339 	rp->r_deleg_return_pending = FALSE;
1340 
1341 	/*
1342 	 * Remove the rnode from the server's list and
1343 	 * update the ref counts.
1344 	 */
1345 	list_remove(&np->s_deleg_list, rp);
1346 	mutex_exit(&rp->r_statev4_lock);
1347 	nfs4_dec_state_ref_count_nolock(np, mi);
1348 	mutex_exit(&np->s_lock);
1349 	/* removed list node removes a reference */
1350 	nfs4_server_rele(np);
1351 	if (need_rele)
1352 		nfs4_server_rele(np);
1353 	if (ncg != NULL)
1354 		ncg->nfs4_callback_stats.delegations.value.ui64--;
1355 }
1356 
1357 void
1358 nfs4delegreturn_cleanup(rnode4_t *rp, nfs4_server_t *np)
1359 {
1360 	struct nfs4_callback_globals *ncg;
1361 
1362 	if (np != NULL) {
1363 		ncg = np->zone_globals;
1364 	} else if (nfs_zone() == VTOMI4(RTOV4(rp))->mi_zone) {
1365 		ncg = zone_getspecific(nfs4_callback_zone_key, nfs_zone());
1366 		ASSERT(ncg != NULL);
1367 	} else {
1368 		/*
1369 		 * Request coming from the wrong zone.
1370 		 */
1371 		ASSERT(getzoneid() == GLOBAL_ZONEID);
1372 		ncg = NULL;
1373 	}
1374 
1375 	nfs4delegreturn_cleanup_impl(rp, np, ncg);
1376 }
1377 
1378 static void
1379 nfs4delegreturn_save_lost_rqst(int error, nfs4_lost_rqst_t *lost_rqstp,
1380     cred_t *cr, vnode_t *vp)
1381 {
1382 	if (error != ETIMEDOUT && error != EINTR &&
1383 	    !NFS4_FRC_UNMT_ERR(error, vp->v_vfsp)) {
1384 		lost_rqstp->lr_op = 0;
1385 		return;
1386 	}
1387 
1388 	NFS4_DEBUG(nfs4_lost_rqst_debug, (CE_NOTE,
1389 	    "nfs4close_save_lost_rqst: error %d", error));
1390 
1391 	lost_rqstp->lr_op = OP_DELEGRETURN;
1392 	/*
1393 	 * The vp is held and rele'd via the recovery code.
1394 	 * See nfs4_save_lost_rqst.
1395 	 */
1396 	lost_rqstp->lr_vp = vp;
1397 	lost_rqstp->lr_dvp = NULL;
1398 	lost_rqstp->lr_oop = NULL;
1399 	lost_rqstp->lr_osp = NULL;
1400 	lost_rqstp->lr_lop = NULL;
1401 	lost_rqstp->lr_cr = cr;
1402 	lost_rqstp->lr_flk = NULL;
1403 	lost_rqstp->lr_putfirst = FALSE;
1404 }
1405 
1406 static void
1407 nfs4delegreturn_otw(rnode4_t *rp, cred_t *cr, nfs4_error_t *ep)
1408 {
1409 	COMPOUND4args_clnt args;
1410 	COMPOUND4res_clnt res;
1411 	nfs_argop4 argops[3];
1412 	nfs4_ga_res_t *garp = NULL;
1413 	hrtime_t t;
1414 	int numops;
1415 	int doqueue = 1;
1416 
1417 	args.ctag = TAG_DELEGRETURN;
1418 
1419 	numops = 3;		/* PUTFH, GETATTR, DELEGRETURN */
1420 
1421 	args.array = argops;
1422 	args.array_len = numops;
1423 
1424 	argops[0].argop = OP_CPUTFH;
1425 	argops[0].nfs_argop4_u.opcputfh.sfh = rp->r_fh;
1426 
1427 	argops[1].argop = OP_GETATTR;
1428 	argops[1].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK;
1429 	argops[1].nfs_argop4_u.opgetattr.mi = VTOMI4(RTOV4(rp));
1430 
1431 	argops[2].argop = OP_DELEGRETURN;
1432 	argops[2].nfs_argop4_u.opdelegreturn.deleg_stateid =
1433 	    rp->r_deleg_stateid;
1434 
1435 	t = gethrtime();
1436 	rfs4call(VTOMI4(RTOV4(rp)), &args, &res, cr, &doqueue, 0, ep);
1437 
1438 	if (ep->error)
1439 		return;
1440 
1441 	if (res.status == NFS4_OK) {
1442 		garp = &res.array[1].nfs_resop4_u.opgetattr.ga_res;
1443 		nfs4_attr_cache(RTOV4(rp), garp, t, cr, TRUE, NULL);
1444 
1445 	}
1446 	xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
1447 }
1448 
1449 int
1450 nfs4_do_delegreturn(rnode4_t *rp, int flags, cred_t *cr,
1451     struct nfs4_callback_globals *ncg)
1452 {
1453 	vnode_t *vp = RTOV4(rp);
1454 	mntinfo4_t *mi = VTOMI4(vp);
1455 	nfs4_lost_rqst_t lost_rqst;
1456 	nfs4_recov_state_t recov_state;
1457 	bool_t needrecov = FALSE, recovonly, done = FALSE;
1458 	nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };
1459 
1460 	ncg->nfs4_callback_stats.delegreturn.value.ui64++;
1461 
1462 	while (!done) {
1463 		e.error = nfs4_start_fop(mi, vp, NULL, OH_DELEGRETURN,
1464 		    &recov_state, &recovonly);
1465 
1466 		if (e.error) {
1467 			if (flags & NFS4_DR_FORCE) {
1468 				(void) nfs_rw_enter_sig(&mi->mi_recovlock,
1469 				    RW_READER, 0);
1470 				nfs4delegreturn_cleanup_impl(rp, NULL, ncg);
1471 				nfs_rw_exit(&mi->mi_recovlock);
1472 			}
1473 			break;
1474 		}
1475 
1476 		/*
1477 		 * Check to see if the delegation has already been
1478 		 * returned by the recovery thread.   The state of
1479 		 * the delegation cannot change at this point due
1480 		 * to start_fop and the r_deleg_recall_lock.
1481 		 */
1482 		if (rp->r_deleg_type == OPEN_DELEGATE_NONE) {
1483 			e.error = 0;
1484 			nfs4_end_op(mi, vp, NULL, &recov_state, needrecov);
1485 			break;
1486 		}
1487 
1488 		if (recovonly) {
1489 			/*
1490 			 * Delegation will be returned via the
1491 			 * recovery framework.  Build a lost request
1492 			 * structure, start recovery and get out.
1493 			 */
1494 			nfs4_error_init(&e, EINTR);
1495 			nfs4delegreturn_save_lost_rqst(e.error, &lost_rqst,
1496 			    cr, vp);
1497 			(void) nfs4_start_recovery(&e, mi, vp,
1498 			    NULL, &rp->r_deleg_stateid,
1499 			    lost_rqst.lr_op == OP_DELEGRETURN ?
1500 			    &lost_rqst : NULL, OP_DELEGRETURN, NULL,
1501 			    NULL, NULL);
1502 			nfs4_end_op(mi, vp, NULL, &recov_state, needrecov);
1503 			break;
1504 		}
1505 
1506 		nfs4delegreturn_otw(rp, cr, &e);
1507 
1508 		/*
1509 		 * Ignore some errors on delegreturn; no point in marking
1510 		 * the file dead on a state destroying operation.
1511 		 */
1512 		if (e.error == 0 && (nfs4_recov_marks_dead(e.stat) ||
1513 		    e.stat == NFS4ERR_BADHANDLE ||
1514 		    e.stat == NFS4ERR_STALE))
1515 			needrecov = FALSE;
1516 		else
1517 			needrecov = nfs4_needs_recovery(&e, TRUE, vp->v_vfsp);
1518 
1519 		if (needrecov) {
1520 			nfs4delegreturn_save_lost_rqst(e.error, &lost_rqst,
1521 			    cr, vp);
1522 			(void) nfs4_start_recovery(&e, mi, vp,
1523 			    NULL, &rp->r_deleg_stateid,
1524 			    lost_rqst.lr_op == OP_DELEGRETURN ?
1525 			    &lost_rqst : NULL, OP_DELEGRETURN, NULL,
1526 			    NULL, NULL);
1527 		} else {
1528 			nfs4delegreturn_cleanup_impl(rp, NULL, ncg);
1529 			done = TRUE;
1530 		}
1531 
1532 		nfs4_end_op(mi, vp, NULL, &recov_state, needrecov);
1533 	}
1534 	return (e.error);
1535 }
1536 
1537 /*
1538  * nfs4_resend_delegreturn - used to drive the delegreturn
1539  * operation via the recovery thread.
1540  */
1541 void
1542 nfs4_resend_delegreturn(nfs4_lost_rqst_t *lorp, nfs4_error_t *ep,
1543     nfs4_server_t *np)
1544 {
1545 	rnode4_t *rp = VTOR4(lorp->lr_vp);
1546 
1547 	/* If the file failed recovery, just quit. */
1548 	mutex_enter(&rp->r_statelock);
1549 	if (rp->r_flags & R4RECOVERR) {
1550 		ep->error = EIO;
1551 	}
1552 	mutex_exit(&rp->r_statelock);
1553 
1554 	if (!ep->error)
1555 		nfs4delegreturn_otw(rp, lorp->lr_cr, ep);
1556 
1557 	/*
1558 	 * If recovery is now needed, then return the error
1559 	 * and status and let the recovery thread handle it,
1560 	 * including re-driving another delegreturn.  Otherwise,
1561 	 * just give up and clean up the delegation.
1562 	 */
1563 	if (nfs4_needs_recovery(ep, TRUE, lorp->lr_vp->v_vfsp))
1564 		return;
1565 
1566 	if (rp->r_deleg_type != OPEN_DELEGATE_NONE)
1567 		nfs4delegreturn_cleanup(rp, np);
1568 
1569 	nfs4_error_zinit(ep);
1570 }
1571 
1572 /*
1573  * nfs4delegreturn - general function to return a delegation.
1574  *
1575  * NFS4_DR_FORCE - return the delegation even if start_op fails
1576  * NFS4_DR_PUSH - push modified data back to the server via VOP_PUTPAGE
1577  * NFS4_DR_DISCARD - discard the delegation w/o delegreturn
1578  * NFS4_DR_DID_OP - calling function already did nfs4_start_op
1579  * NFS4_DR_RECALL - delegreturned initiated via CB_RECALL
1580  * NFS4_DR_REOPEN - do file reopens, if applicable
1581  */
1582 static int
1583 nfs4delegreturn_impl(rnode4_t *rp, int flags, struct nfs4_callback_globals *ncg)
1584 {
1585 	int error = 0;
1586 	cred_t *cr = NULL;
1587 	vnode_t *vp;
1588 	bool_t needrecov = FALSE;
1589 	bool_t rw_entered = FALSE;
1590 	bool_t do_reopen;
1591 
1592 	vp = RTOV4(rp);
1593 
1594 	/*
1595 	 * If NFS4_DR_DISCARD is set by itself, take a short-cut and
1596 	 * discard without doing an otw DELEGRETURN.  This may only be used
1597 	 * by the recovery thread because it bypasses the synchronization
1598 	 * with r_deleg_recall_lock and mi->mi_recovlock.
1599 	 */
1600 	if (flags == NFS4_DR_DISCARD) {
1601 		nfs4delegreturn_cleanup_impl(rp, NULL, ncg);
1602 		return (0);
1603 	}
1604 
1605 	if (flags & NFS4_DR_DID_OP) {
1606 		/*
1607 		 * Caller had already done start_op, which means the
1608 		 * r_deleg_recall_lock is already held in READ mode
1609 		 * so we cannot take it in write mode.  Return the
1610 		 * delegation asynchronously.
1611 		 *
1612 		 * Remove the NFS4_DR_DID_OP flag so we don't
1613 		 * get stuck looping through here.
1614 		 */
1615 		VN_HOLD(vp);
1616 		nfs4delegreturn_async(rp, (flags & ~NFS4_DR_DID_OP), FALSE);
1617 		return (0);
1618 	}
1619 
1620 	/*
1621 	 * Verify we still have a delegation and crhold the credential.
1622 	 */
1623 	mutex_enter(&rp->r_statev4_lock);
1624 	if (rp->r_deleg_type == OPEN_DELEGATE_NONE) {
1625 		mutex_exit(&rp->r_statev4_lock);
1626 		goto out;
1627 	}
1628 	cr = rp->r_deleg_cred;
1629 	ASSERT(cr != NULL);
1630 	crhold(cr);
1631 	mutex_exit(&rp->r_statev4_lock);
1632 
1633 	/*
1634 	 * Push the modified data back to the server synchronously
1635 	 * before doing DELEGRETURN.
1636 	 */
1637 	if (flags & NFS4_DR_PUSH)
1638 		(void) VOP_PUTPAGE(vp, 0, 0, 0, cr, NULL);
1639 
1640 	/*
1641 	 * Take r_deleg_recall_lock in WRITE mode, this will prevent
1642 	 * nfs4_is_otw_open_necessary from trying to use the delegation
1643 	 * while the DELEGRETURN is in progress.
1644 	 */
1645 	(void) nfs_rw_enter_sig(&rp->r_deleg_recall_lock, RW_WRITER, FALSE);
1646 
1647 	rw_entered = TRUE;
1648 
1649 	if (rp->r_deleg_type == OPEN_DELEGATE_NONE)
1650 		goto out;
1651 
1652 	if (flags & NFS4_DR_REOPEN) {
1653 		/*
1654 		 * If R4RECOVERRP is already set, then skip re-opening
1655 		 * the delegation open streams and go straight to doing
1656 		 * delegreturn.  (XXX if the file has failed recovery, then the
1657 		 * delegreturn attempt is likely to be futile.)
1658 		 */
1659 		mutex_enter(&rp->r_statelock);
1660 		do_reopen = !(rp->r_flags & R4RECOVERRP);
1661 		mutex_exit(&rp->r_statelock);
1662 
1663 		if (do_reopen) {
1664 			error = deleg_reopen(vp, &needrecov, ncg, flags);
1665 			if (error != 0) {
1666 				if ((flags & (NFS4_DR_FORCE | NFS4_DR_RECALL))
1667 				    == 0)
1668 					goto out;
1669 			} else if (needrecov) {
1670 				if ((flags & NFS4_DR_FORCE) == 0)
1671 					goto out;
1672 			}
1673 		}
1674 	}
1675 
1676 	if (flags & NFS4_DR_DISCARD) {
1677 		mntinfo4_t *mi = VTOMI4(RTOV4(rp));
1678 
1679 		mutex_enter(&rp->r_statelock);
1680 		/*
1681 		 * deleg_return_pending is cleared inside of delegation_accept
1682 		 * when a delegation is accepted.  if this flag has been
1683 		 * cleared, then a new delegation has overwritten the one we
1684 		 * were about to throw away.
1685 		 */
1686 		if (!rp->r_deleg_return_pending) {
1687 			mutex_exit(&rp->r_statelock);
1688 			goto out;
1689 		}
1690 		mutex_exit(&rp->r_statelock);
1691 		(void) nfs_rw_enter_sig(&mi->mi_recovlock, RW_READER, FALSE);
1692 		nfs4delegreturn_cleanup_impl(rp, NULL, ncg);
1693 		nfs_rw_exit(&mi->mi_recovlock);
1694 	} else {
1695 		error = nfs4_do_delegreturn(rp, flags, cr, ncg);
1696 	}
1697 
1698 out:
1699 	if (cr)
1700 		crfree(cr);
1701 	if (rw_entered)
1702 		nfs_rw_exit(&rp->r_deleg_recall_lock);
1703 	return (error);
1704 }
1705 
1706 int
1707 nfs4delegreturn(rnode4_t *rp, int flags)
1708 {
1709 	struct nfs4_callback_globals *ncg;
1710 
1711 	ncg = zone_getspecific(nfs4_callback_zone_key, nfs_zone());
1712 	ASSERT(ncg != NULL);
1713 
1714 	return (nfs4delegreturn_impl(rp, flags, ncg));
1715 }
1716 
1717 void
1718 nfs4delegreturn_async(rnode4_t *rp, int flags, bool_t trunc)
1719 {
1720 	struct cb_recall_pass *pp;
1721 
1722 	pp = kmem_alloc(sizeof (struct cb_recall_pass), KM_SLEEP);
1723 	pp->rp = rp;
1724 	pp->flags = flags;
1725 	pp->truncate = trunc;
1726 
1727 	/*
1728 	 * Fire up a thread to do the actual delegreturn
1729 	 * Caller must guarantee that the rnode doesn't
1730 	 * vanish (by calling VN_HOLD).
1731 	 */
1732 
1733 	(void) zthread_create(NULL, 0, nfs4delegreturn_thread, pp, 0,
1734 	    minclsyspri);
1735 }
1736 
1737 static void
1738 delegreturn_all_thread(rpcprog_t *pp)
1739 {
1740 	nfs4_server_t *np;
1741 	bool_t found = FALSE;
1742 	rpcprog_t prog;
1743 	rnode4_t *rp;
1744 	vnode_t *vp;
1745 	zoneid_t zoneid = getzoneid();
1746 	struct nfs4_callback_globals *ncg;
1747 
1748 	NFS4_DEBUG(nfs4_drat_debug,
1749 	    (CE_NOTE, "delereturn_all_thread: prog %d\n", *pp));
1750 
1751 	prog = *pp;
1752 	kmem_free(pp, sizeof (*pp));
1753 	pp = NULL;
1754 
1755 	mutex_enter(&nfs4_server_lst_lock);
1756 	for (np = nfs4_server_lst.forw; np != &nfs4_server_lst; np = np->forw) {
1757 		if (np->zoneid == zoneid && np->s_program == prog) {
1758 			mutex_enter(&np->s_lock);
1759 			found = TRUE;
1760 			break;
1761 		}
1762 	}
1763 	mutex_exit(&nfs4_server_lst_lock);
1764 
1765 	/*
1766 	 * It's possible that the nfs4_server which was using this
1767 	 * program number has vanished since this thread is async.
1768 	 * If so, just return.  Your work here is finished, my friend.
1769 	 */
1770 	if (!found)
1771 		goto out;
1772 
1773 	ncg = np->zone_globals;
1774 	while ((rp = list_head(&np->s_deleg_list)) != NULL) {
1775 		vp = RTOV4(rp);
1776 		VN_HOLD(vp);
1777 		mutex_exit(&np->s_lock);
1778 		(void) nfs4delegreturn_impl(rp, NFS4_DR_PUSH|NFS4_DR_REOPEN,
1779 		    ncg);
1780 		VN_RELE(vp);
1781 
1782 		/* retake the s_lock for next trip through the loop */
1783 		mutex_enter(&np->s_lock);
1784 	}
1785 	mutex_exit(&np->s_lock);
1786 out:
1787 	NFS4_DEBUG(nfs4_drat_debug,
1788 	    (CE_NOTE, "delereturn_all_thread: complete\n"));
1789 	zthread_exit();
1790 }
1791 
1792 void
1793 nfs4_delegreturn_all(nfs4_server_t *sp)
1794 {
1795 	rpcprog_t pro, *pp;
1796 
1797 	mutex_enter(&sp->s_lock);
1798 
1799 	/* Check to see if the delegation list is empty */
1800 
1801 	if (list_head(&sp->s_deleg_list) == NULL) {
1802 		mutex_exit(&sp->s_lock);
1803 		return;
1804 	}
1805 	/*
1806 	 * Grab the program number; the async thread will use this
1807 	 * to find the nfs4_server.
1808 	 */
1809 	pro = sp->s_program;
1810 	mutex_exit(&sp->s_lock);
1811 	pp = kmem_alloc(sizeof (rpcprog_t), KM_SLEEP);
1812 	*pp = pro;
1813 	(void) zthread_create(NULL, 0, delegreturn_all_thread, pp, 0,
1814 	    minclsyspri);
1815 }
1816 
1817 
1818 /*
1819  * Discard any delegations
1820  *
1821  * Iterate over the servers s_deleg_list and
1822  * for matching mount-point rnodes discard
1823  * the delegation.
1824  */
1825 void
1826 nfs4_deleg_discard(mntinfo4_t *mi, nfs4_server_t *sp)
1827 {
1828 	rnode4_t *rp, *next;
1829 	mntinfo4_t *r_mi;
1830 	struct nfs4_callback_globals *ncg;
1831 
1832 	ASSERT(mutex_owned(&sp->s_lock));
1833 	ncg = sp->zone_globals;
1834 
1835 	for (rp = list_head(&sp->s_deleg_list); rp != NULL; rp = next) {
1836 		r_mi = VTOMI4(RTOV4(rp));
1837 		next = list_next(&sp->s_deleg_list, rp);
1838 
1839 		if (r_mi != mi) {
1840 			/*
1841 			 * Skip if this rnode is in not on the
1842 			 * same mount-point
1843 			 */
1844 			continue;
1845 		}
1846 
1847 		ASSERT(rp->r_deleg_type == OPEN_DELEGATE_READ);
1848 
1849 #ifdef DEBUG
1850 		if (nfs4_client_recov_debug) {
1851 			zprintf(getzoneid(),
1852 			    "nfs4_deleg_discard: matched rnode %p "
1853 			"-- discarding delegation\n", (void *)rp);
1854 		}
1855 #endif
1856 		mutex_enter(&rp->r_statev4_lock);
1857 		/*
1858 		 * Free the cred originally held when the delegation
1859 		 * was granted. Also need to decrement the refcnt
1860 		 * on this server for each delegation we discard
1861 		 */
1862 		if (rp->r_deleg_cred)
1863 			crfree(rp->r_deleg_cred);
1864 		rp->r_deleg_cred = NULL;
1865 		rp->r_deleg_type = OPEN_DELEGATE_NONE;
1866 		rp->r_deleg_needs_recovery = OPEN_DELEGATE_NONE;
1867 		rp->r_deleg_needs_recall = FALSE;
1868 		ASSERT(sp->s_refcnt > 1);
1869 		sp->s_refcnt--;
1870 		list_remove(&sp->s_deleg_list, rp);
1871 		mutex_exit(&rp->r_statev4_lock);
1872 		nfs4_dec_state_ref_count_nolock(sp, mi);
1873 		ncg->nfs4_callback_stats.delegations.value.ui64--;
1874 	}
1875 }
1876 
1877 /*
1878  * Reopen any open streams that were covered by the given file's
1879  * delegation.
1880  * Returns zero or an errno value.  If there was no error, *recovp
1881  * indicates whether recovery was initiated.
1882  */
1883 
1884 static int
1885 deleg_reopen(vnode_t *vp, bool_t *recovp, struct nfs4_callback_globals *ncg,
1886     int flags)
1887 {
1888 	nfs4_open_stream_t *osp;
1889 	nfs4_recov_state_t recov_state;
1890 	bool_t needrecov = FALSE;
1891 	mntinfo4_t *mi;
1892 	rnode4_t *rp;
1893 	nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };
1894 	int claimnull;
1895 
1896 	mi = VTOMI4(vp);
1897 	rp = VTOR4(vp);
1898 
1899 	recov_state.rs_flags = 0;
1900 	recov_state.rs_num_retry_despite_err = 0;
1901 
1902 retry:
1903 	if ((e.error = nfs4_start_op(mi, vp, NULL, &recov_state)) != 0) {
1904 		return (e.error);
1905 	}
1906 
1907 	/*
1908 	 * if we mean to discard the delegation, it must be BAD, so don't
1909 	 * use it when doing the reopen or it will fail too.
1910 	 */
1911 	claimnull = (flags & NFS4_DR_DISCARD);
1912 	/*
1913 	 * Loop through the open streams for this rnode to find
1914 	 * all of the ones created using the delegation state ID.
1915 	 * Each of these needs to be re-opened.
1916 	 */
1917 
1918 	while ((osp = get_next_deleg_stream(rp, claimnull)) != NULL) {
1919 
1920 		if (claimnull) {
1921 			nfs4_reopen(vp, osp, &e, CLAIM_NULL, FALSE, FALSE);
1922 		} else {
1923 			ncg->nfs4_callback_stats.claim_cur.value.ui64++;
1924 
1925 			nfs4_reopen(vp, osp, &e, CLAIM_DELEGATE_CUR, FALSE,
1926 			    FALSE);
1927 			if (e.error == 0 && e.stat == NFS4_OK)
1928 				ncg->nfs4_callback_stats.
1929 				    claim_cur_ok.value.ui64++;
1930 		}
1931 
1932 		if (e.error == EAGAIN) {
1933 			open_stream_rele(osp, rp);
1934 			nfs4_end_op(mi, vp, NULL, &recov_state, TRUE);
1935 			goto retry;
1936 		}
1937 
1938 		/*
1939 		 * if error is EINTR, ETIMEDOUT, or NFS4_FRC_UNMT_ERR, then
1940 		 * recovery has already been started inside of nfs4_reopen.
1941 		 */
1942 		if (e.error == EINTR || e.error == ETIMEDOUT ||
1943 		    NFS4_FRC_UNMT_ERR(e.error, vp->v_vfsp)) {
1944 			open_stream_rele(osp, rp);
1945 			break;
1946 		}
1947 
1948 		needrecov = nfs4_needs_recovery(&e, TRUE, vp->v_vfsp);
1949 
1950 		if (e.error != 0 && !needrecov) {
1951 			/*
1952 			 * Recovery is not possible, but don't give up yet;
1953 			 * we'd still like to do delegreturn after
1954 			 * reopening as many streams as possible.
1955 			 * Continue processing the open streams.
1956 			 */
1957 
1958 			ncg->nfs4_callback_stats.recall_failed.value.ui64++;
1959 
1960 		} else if (needrecov) {
1961 			/*
1962 			 * Start recovery and bail out.  The recovery
1963 			 * thread will take it from here.
1964 			 */
1965 			(void) nfs4_start_recovery(&e, mi, vp, NULL, NULL,
1966 			    NULL, OP_OPEN, NULL, NULL, NULL);
1967 			open_stream_rele(osp, rp);
1968 			*recovp = TRUE;
1969 			break;
1970 		}
1971 
1972 		open_stream_rele(osp, rp);
1973 	}
1974 
1975 	nfs4_end_op(mi, vp, NULL, &recov_state, needrecov);
1976 
1977 	return (e.error);
1978 }
1979 
1980 /*
1981  * get_next_deleg_stream - returns the next open stream which
1982  * represents a delegation for this rnode.  In order to assure
1983  * forward progress, the caller must guarantee that each open
1984  * stream returned is changed so that a future call won't return
1985  * it again.
1986  *
1987  * There are several ways for the open stream to change.  If the open
1988  * stream is !os_delegation, then we aren't interested in it.  Also, if
1989  * either os_failed_reopen or !os_valid, then don't return the osp.
1990  *
1991  * If claimnull is false (doing reopen CLAIM_DELEGATE_CUR) then return
1992  * the osp if it is an os_delegation open stream.  Also, if the rnode still
1993  * has r_deleg_return_pending, then return the os_delegation osp.  Lastly,
1994  * if the rnode's r_deleg_stateid is different from the osp's open_stateid,
1995  * then return the osp.
1996  *
1997  * We have already taken the 'r_deleg_recall_lock' as WRITER, which
1998  * prevents new OPENs from going OTW (as start_fop takes this
1999  * lock in READ mode); thus, no new open streams can be created
2000  * (which inherently means no new delegation open streams are
2001  * being created).
2002  */
2003 
2004 static nfs4_open_stream_t *
2005 get_next_deleg_stream(rnode4_t *rp, int claimnull)
2006 {
2007 	nfs4_open_stream_t	*osp;
2008 
2009 	ASSERT(nfs_rw_lock_held(&rp->r_deleg_recall_lock, RW_WRITER));
2010 
2011 	/*
2012 	 * Search through the list of open streams looking for
2013 	 * one that was created while holding the delegation.
2014 	 */
2015 	mutex_enter(&rp->r_os_lock);
2016 	for (osp = list_head(&rp->r_open_streams); osp != NULL;
2017 	    osp = list_next(&rp->r_open_streams, osp)) {
2018 		mutex_enter(&osp->os_sync_lock);
2019 		if (!osp->os_delegation || osp->os_failed_reopen ||
2020 		    !osp->os_valid) {
2021 			mutex_exit(&osp->os_sync_lock);
2022 			continue;
2023 		}
2024 		if (!claimnull || rp->r_deleg_return_pending ||
2025 		    !stateid4_cmp(&osp->open_stateid, &rp->r_deleg_stateid)) {
2026 			osp->os_ref_count++;
2027 			mutex_exit(&osp->os_sync_lock);
2028 			mutex_exit(&rp->r_os_lock);
2029 			return (osp);
2030 		}
2031 		mutex_exit(&osp->os_sync_lock);
2032 	}
2033 	mutex_exit(&rp->r_os_lock);
2034 
2035 	return (NULL);
2036 }
2037 
2038 static void
2039 nfs4delegreturn_thread(struct cb_recall_pass *args)
2040 {
2041 	rnode4_t *rp;
2042 	vnode_t *vp;
2043 	cred_t *cr;
2044 	int dtype, error, flags;
2045 	bool_t rdirty, rip;
2046 	kmutex_t cpr_lock;
2047 	callb_cpr_t cpr_info;
2048 	struct nfs4_callback_globals *ncg;
2049 
2050 	ncg = zone_getspecific(nfs4_callback_zone_key, nfs_zone());
2051 	ASSERT(ncg != NULL);
2052 
2053 	mutex_init(&cpr_lock, NULL, MUTEX_DEFAULT, NULL);
2054 
2055 	CALLB_CPR_INIT(&cpr_info, &cpr_lock, callb_generic_cpr,
2056 	    "nfsv4delegRtn");
2057 
2058 	rp = args->rp;
2059 	vp = RTOV4(rp);
2060 
2061 	mutex_enter(&rp->r_statev4_lock);
2062 	if (rp->r_deleg_type == OPEN_DELEGATE_NONE) {
2063 		mutex_exit(&rp->r_statev4_lock);
2064 		goto out;
2065 	}
2066 	mutex_exit(&rp->r_statev4_lock);
2067 
2068 	/*
2069 	 * Take the read-write lock in read mode to prevent other
2070 	 * threads from modifying the data during the recall.  This
2071 	 * doesn't affect mmappers.
2072 	 */
2073 	(void) nfs_rw_enter_sig(&rp->r_rwlock, RW_READER, FALSE);
2074 
2075 	/* Proceed with delegreturn */
2076 
2077 	mutex_enter(&rp->r_statev4_lock);
2078 	if (rp->r_deleg_type == OPEN_DELEGATE_NONE) {
2079 		mutex_exit(&rp->r_statev4_lock);
2080 		nfs_rw_exit(&rp->r_rwlock);
2081 		goto out;
2082 	}
2083 	dtype = rp->r_deleg_type;
2084 	cr = rp->r_deleg_cred;
2085 	ASSERT(cr != NULL);
2086 	crhold(cr);
2087 	mutex_exit(&rp->r_statev4_lock);
2088 
2089 	flags = args->flags;
2090 
2091 	/*
2092 	 * If the file is being truncated at the server, then throw
2093 	 * away all of the pages, it doesn't matter what flavor of
2094 	 * delegation we have.
2095 	 */
2096 
2097 	if (args->truncate) {
2098 		ncg->nfs4_callback_stats.recall_trunc.value.ui64++;
2099 		nfs4_invalidate_pages(vp, 0, cr);
2100 	} else if (dtype == OPEN_DELEGATE_WRITE) {
2101 
2102 		mutex_enter(&rp->r_statelock);
2103 		rdirty = rp->r_flags & R4DIRTY;
2104 		mutex_exit(&rp->r_statelock);
2105 
2106 		if (rdirty) {
2107 			error = VOP_PUTPAGE(vp, 0, 0, 0, cr, NULL);
2108 
2109 			if (error)
2110 				CB_WARN1("nfs4delegreturn_thread:"
2111 				" VOP_PUTPAGE: %d\n", error);
2112 		}
2113 		/* turn off NFS4_DR_PUSH because we just did that above. */
2114 		flags &= ~NFS4_DR_PUSH;
2115 	}
2116 
2117 	mutex_enter(&rp->r_statelock);
2118 	rip =  rp->r_flags & R4RECOVERRP;
2119 	mutex_exit(&rp->r_statelock);
2120 
2121 	/* If a failed recovery is indicated, discard the pages */
2122 
2123 	if (rip) {
2124 
2125 		error = VOP_PUTPAGE(vp, 0, 0, B_INVAL, cr, NULL);
2126 
2127 		if (error)
2128 			CB_WARN1("nfs4delegreturn_thread: VOP_PUTPAGE: %d\n",
2129 			    error);
2130 	}
2131 
2132 	/*
2133 	 * Pass the flags to nfs4delegreturn_impl, but be sure not to pass
2134 	 * NFS4_DR_DID_OP, which just calls nfs4delegreturn_async again.
2135 	 */
2136 	flags &= ~NFS4_DR_DID_OP;
2137 
2138 	(void) nfs4delegreturn_impl(rp, flags, ncg);
2139 
2140 	nfs_rw_exit(&rp->r_rwlock);
2141 	crfree(cr);
2142 out:
2143 	kmem_free(args, sizeof (struct cb_recall_pass));
2144 	VN_RELE(vp);
2145 	mutex_enter(&cpr_lock);
2146 	CALLB_CPR_EXIT(&cpr_info);
2147 	mutex_destroy(&cpr_lock);
2148 	zthread_exit();
2149 }
2150 
2151 /*
2152  * This function has one assumption that the caller of this function is
2153  * either doing recovery (therefore cannot call nfs4_start_op) or has
2154  * already called nfs4_start_op().
2155  */
2156 void
2157 nfs4_delegation_accept(rnode4_t *rp, open_claim_type4 claim, OPEN4res *res,
2158     nfs4_ga_res_t *garp, cred_t *cr)
2159 {
2160 	open_read_delegation4 *orp;
2161 	open_write_delegation4 *owp;
2162 	nfs4_server_t *np;
2163 	bool_t already = FALSE;
2164 	bool_t recall = FALSE;
2165 	bool_t valid_garp = TRUE;
2166 	bool_t delegation_granted = FALSE;
2167 	bool_t dr_needed = FALSE;
2168 	bool_t recov;
2169 	int dr_flags = 0;
2170 	long mapcnt;
2171 	uint_t rflag;
2172 	mntinfo4_t *mi;
2173 	struct nfs4_callback_globals *ncg;
2174 	open_delegation_type4 odt;
2175 
2176 	ncg = zone_getspecific(nfs4_callback_zone_key, nfs_zone());
2177 	ASSERT(ncg != NULL);
2178 
2179 	mi = VTOMI4(RTOV4(rp));
2180 
2181 	/*
2182 	 * Accept a delegation granted to the client via an OPEN.
2183 	 * Set the delegation fields in the rnode and insert the
2184 	 * rnode onto the list anchored in the nfs4_server_t.  The
2185 	 * proper locking order requires the nfs4_server_t first,
2186 	 * even though it may not be needed in all cases.
2187 	 *
2188 	 * NB: find_nfs4_server returns with s_lock held.
2189 	 */
2190 
2191 	if ((np = find_nfs4_server(mi)) == NULL)
2192 		return;
2193 
2194 	/* grab the statelock too, for examining r_mapcnt */
2195 	mutex_enter(&rp->r_statelock);
2196 	mutex_enter(&rp->r_statev4_lock);
2197 
2198 	if (rp->r_deleg_type == OPEN_DELEGATE_READ ||
2199 	    rp->r_deleg_type == OPEN_DELEGATE_WRITE)
2200 		already = TRUE;
2201 
2202 	odt = res->delegation.delegation_type;
2203 
2204 	if (odt == OPEN_DELEGATE_READ) {
2205 
2206 		rp->r_deleg_type = res->delegation.delegation_type;
2207 		orp = &res->delegation.open_delegation4_u.read;
2208 		rp->r_deleg_stateid = orp->stateid;
2209 		rp->r_deleg_perms = orp->permissions;
2210 		if (claim == CLAIM_PREVIOUS)
2211 			if ((recall = orp->recall) != 0)
2212 				dr_needed = TRUE;
2213 
2214 		delegation_granted = TRUE;
2215 
2216 		ncg->nfs4_callback_stats.delegations.value.ui64++;
2217 		ncg->nfs4_callback_stats.delegaccept_r.value.ui64++;
2218 
2219 	} else if (odt == OPEN_DELEGATE_WRITE) {
2220 
2221 		rp->r_deleg_type = res->delegation.delegation_type;
2222 		owp = &res->delegation.open_delegation4_u.write;
2223 		rp->r_deleg_stateid = owp->stateid;
2224 		rp->r_deleg_perms = owp->permissions;
2225 		rp->r_deleg_limit = owp->space_limit;
2226 		if (claim == CLAIM_PREVIOUS)
2227 			if ((recall = owp->recall) != 0)
2228 				dr_needed = TRUE;
2229 
2230 		delegation_granted = TRUE;
2231 
2232 		if (garp == NULL || !garp->n4g_change_valid) {
2233 			valid_garp = FALSE;
2234 			rp->r_deleg_change = 0;
2235 			rp->r_deleg_change_grant = 0;
2236 		} else {
2237 			rp->r_deleg_change = garp->n4g_change;
2238 			rp->r_deleg_change_grant = garp->n4g_change;
2239 		}
2240 		mapcnt = rp->r_mapcnt;
2241 		rflag = rp->r_flags;
2242 
2243 		/*
2244 		 * Update the delegation change attribute if
2245 		 * there are mappers for the file is dirty.  This
2246 		 * might be the case during recovery after server
2247 		 * reboot.
2248 		 */
2249 		if (mapcnt > 0 || rflag & R4DIRTY)
2250 			rp->r_deleg_change++;
2251 
2252 		NFS4_DEBUG(nfs4_callback_debug, (CE_NOTE,
2253 		    "nfs4_delegation_accept: r_deleg_change: 0x%x\n",
2254 		    (int)(rp->r_deleg_change >> 32)));
2255 		NFS4_DEBUG(nfs4_callback_debug, (CE_NOTE,
2256 		    "nfs4_delegation_accept: r_delg_change_grant: 0x%x\n",
2257 		    (int)(rp->r_deleg_change_grant >> 32)));
2258 
2259 
2260 		ncg->nfs4_callback_stats.delegations.value.ui64++;
2261 		ncg->nfs4_callback_stats.delegaccept_rw.value.ui64++;
2262 	} else if (already) {
2263 		/*
2264 		 * No delegation granted.  If the rnode currently has
2265 		 * has one, then consider it tainted and return it.
2266 		 */
2267 		dr_needed = TRUE;
2268 	}
2269 
2270 	if (delegation_granted) {
2271 		/* Add the rnode to the list. */
2272 		if (!already) {
2273 			crhold(cr);
2274 			rp->r_deleg_cred = cr;
2275 
2276 			ASSERT(mutex_owned(&np->s_lock));
2277 			list_insert_head(&np->s_deleg_list, rp);
2278 			/* added list node gets a reference */
2279 			np->s_refcnt++;
2280 			nfs4_inc_state_ref_count_nolock(np, mi);
2281 		}
2282 		rp->r_deleg_needs_recovery = OPEN_DELEGATE_NONE;
2283 	}
2284 
2285 	/*
2286 	 * We've now safely accepted the delegation, if any.  Drop the
2287 	 * locks and figure out what post-processing is needed.  We'd
2288 	 * like to retain r_statev4_lock, but nfs4_server_rele takes
2289 	 * s_lock which would be a lock ordering violation.
2290 	 */
2291 	mutex_exit(&rp->r_statev4_lock);
2292 	mutex_exit(&rp->r_statelock);
2293 	mutex_exit(&np->s_lock);
2294 	nfs4_server_rele(np);
2295 
2296 	/*
2297 	 * Check to see if we are in recovery.  Remember that
2298 	 * this function is protected by start_op, so a recovery
2299 	 * cannot begin until we are out of here.
2300 	 */
2301 	mutex_enter(&mi->mi_lock);
2302 	recov = mi->mi_recovflags & MI4_RECOV_ACTIV;
2303 	mutex_exit(&mi->mi_lock);
2304 
2305 	mutex_enter(&rp->r_statev4_lock);
2306 
2307 	if (nfs4_delegreturn_policy == IMMEDIATE || !valid_garp)
2308 		dr_needed = TRUE;
2309 
2310 	if (dr_needed && rp->r_deleg_return_pending == FALSE) {
2311 		if (recov) {
2312 			/*
2313 			 * We cannot call delegreturn from inside
2314 			 * of recovery or VOP_PUTPAGE will hang
2315 			 * due to nfs4_start_fop call in
2316 			 * nfs4write.  Use dlistadd to add the
2317 			 * rnode to the list of rnodes needing
2318 			 * cleaning.  We do not need to do reopen
2319 			 * here because recov_openfiles will do it.
2320 			 * In the non-recall case, just discard the
2321 			 * delegation as it is no longer valid.
2322 			 */
2323 			if (recall)
2324 				dr_flags = NFS4_DR_PUSH;
2325 			else
2326 				dr_flags = NFS4_DR_PUSH|NFS4_DR_DISCARD;
2327 
2328 			nfs4_dlistadd(rp, ncg, dr_flags);
2329 			dr_flags = 0;
2330 		} else {
2331 			/*
2332 			 * Push the modified data back to the server,
2333 			 * reopen any delegation open streams, and return
2334 			 * the delegation.  Drop the statev4_lock first!
2335 			 */
2336 			dr_flags =  NFS4_DR_PUSH|NFS4_DR_DID_OP|NFS4_DR_REOPEN;
2337 		}
2338 	}
2339 	mutex_exit(&rp->r_statev4_lock);
2340 	if (dr_flags)
2341 		(void) nfs4delegreturn_impl(rp, dr_flags, ncg);
2342 }
2343 
2344 /*
2345  * nfs4delegabandon - Abandon the delegation on an rnode4.  This code
2346  * is called when the client receives EXPIRED, BAD_STATEID, OLD_STATEID
2347  * or BADSEQID and the recovery code is unable to recover.  Push any
2348  * dirty data back to the server and return the delegation (if any).
2349  */
2350 
2351 void
2352 nfs4delegabandon(rnode4_t *rp)
2353 {
2354 	vnode_t *vp;
2355 	struct cb_recall_pass *pp;
2356 	open_delegation_type4 dt;
2357 
2358 	mutex_enter(&rp->r_statev4_lock);
2359 	dt = rp->r_deleg_type;
2360 	mutex_exit(&rp->r_statev4_lock);
2361 
2362 	if (dt == OPEN_DELEGATE_NONE)
2363 		return;
2364 
2365 	vp = RTOV4(rp);
2366 	VN_HOLD(vp);
2367 
2368 	pp = kmem_alloc(sizeof (struct cb_recall_pass), KM_SLEEP);
2369 	pp->rp = rp;
2370 	/*
2371 	 * Recovery on the file has failed and we want to return
2372 	 * the delegation.  We don't want to reopen files and
2373 	 * nfs4delegreturn_thread() figures out what to do about
2374 	 * the data.  The only thing to do is attempt to return
2375 	 * the delegation.
2376 	 */
2377 	pp->flags = 0;
2378 	pp->truncate = FALSE;
2379 
2380 	/*
2381 	 * Fire up a thread to do the delegreturn; this is
2382 	 * necessary because we could be inside a GETPAGE or
2383 	 * PUTPAGE and we cannot do another one.
2384 	 */
2385 
2386 	(void) zthread_create(NULL, 0, nfs4delegreturn_thread, pp, 0,
2387 	    minclsyspri);
2388 }
2389 
2390 static int
2391 wait_for_recall1(vnode_t *vp, nfs4_op_hint_t op, nfs4_recov_state_t *rsp,
2392     int flg)
2393 {
2394 	rnode4_t *rp;
2395 	int error = 0;
2396 
2397 #ifdef lint
2398 	op = op;
2399 #endif
2400 
2401 	if (vp && vp->v_type == VREG) {
2402 		rp = VTOR4(vp);
2403 
2404 		/*
2405 		 * Take r_deleg_recall_lock in read mode to synchronize
2406 		 * with delegreturn.
2407 		 */
2408 		error = nfs_rw_enter_sig(&rp->r_deleg_recall_lock,
2409 		    RW_READER, INTR4(vp));
2410 
2411 		if (error == 0)
2412 			rsp->rs_flags |= flg;
2413 
2414 	}
2415 	return (error);
2416 }
2417 
2418 void
2419 nfs4_end_op_recall(vnode_t *vp1, vnode_t *vp2, nfs4_recov_state_t *rsp)
2420 {
2421 	NFS4_DEBUG(nfs4_recall_debug,
2422 	    (CE_NOTE, "nfs4_end_op_recall: 0x%p, 0x%p\n",
2423 	    (void *)vp1, (void *)vp2));
2424 
2425 	if (vp2 && rsp->rs_flags & NFS4_RS_RECALL_HELD2)
2426 		nfs_rw_exit(&VTOR4(vp2)->r_deleg_recall_lock);
2427 	if (vp1 && rsp->rs_flags & NFS4_RS_RECALL_HELD1)
2428 		nfs_rw_exit(&VTOR4(vp1)->r_deleg_recall_lock);
2429 }
2430 
2431 int
2432 wait_for_recall(vnode_t *vp1, vnode_t *vp2, nfs4_op_hint_t op,
2433     nfs4_recov_state_t *rsp)
2434 {
2435 	int error;
2436 
2437 	NFS4_DEBUG(nfs4_recall_debug,
2438 	    (CE_NOTE, "wait_for_recall:    0x%p, 0x%p\n",
2439 	    (void *)vp1, (void *) vp2));
2440 
2441 	rsp->rs_flags &= ~(NFS4_RS_RECALL_HELD1|NFS4_RS_RECALL_HELD2);
2442 
2443 	if ((error = wait_for_recall1(vp1, op, rsp, NFS4_RS_RECALL_HELD1)) != 0)
2444 		return (error);
2445 
2446 	if ((error = wait_for_recall1(vp2, op, rsp, NFS4_RS_RECALL_HELD2))
2447 	    != 0) {
2448 		if (rsp->rs_flags & NFS4_RS_RECALL_HELD1) {
2449 			nfs_rw_exit(&VTOR4(vp1)->r_deleg_recall_lock);
2450 			rsp->rs_flags &= ~NFS4_RS_RECALL_HELD1;
2451 		}
2452 
2453 		return (error);
2454 	}
2455 
2456 	return (0);
2457 }
2458 
2459 /*
2460  * nfs4_dlistadd - Add this rnode to a list of rnodes to be
2461  * DELEGRETURN'd at the end of recovery.
2462  */
2463 
2464 static void
2465 nfs4_dlistadd(rnode4_t *rp, struct nfs4_callback_globals *ncg, int flags)
2466 {
2467 	struct nfs4_dnode *dp;
2468 
2469 	ASSERT(mutex_owned(&rp->r_statev4_lock));
2470 	/*
2471 	 * Mark the delegation as having a return pending.
2472 	 * This will prevent the use of the delegation stateID
2473 	 * by read, write, setattr and open.
2474 	 */
2475 	rp->r_deleg_return_pending = TRUE;
2476 	dp = kmem_alloc(sizeof (*dp), KM_SLEEP);
2477 	VN_HOLD(RTOV4(rp));
2478 	dp->rnodep = rp;
2479 	dp->flags = flags;
2480 	mutex_enter(&ncg->nfs4_dlist_lock);
2481 	list_insert_head(&ncg->nfs4_dlist, dp);
2482 #ifdef	DEBUG
2483 	ncg->nfs4_dlistadd_c++;
2484 #endif
2485 	mutex_exit(&ncg->nfs4_dlist_lock);
2486 }
2487 
2488 /*
2489  * nfs4_dlistclean_impl - Do DELEGRETURN for each rnode on the list.
2490  * of files awaiting cleaning.  If the override_flags are non-zero
2491  * then use them rather than the flags that were set when the rnode
2492  * was added to the dlist.
2493  */
2494 static void
2495 nfs4_dlistclean_impl(struct nfs4_callback_globals *ncg, int override_flags)
2496 {
2497 	rnode4_t *rp;
2498 	struct nfs4_dnode *dp;
2499 	int flags;
2500 
2501 	ASSERT(override_flags == 0 || override_flags == NFS4_DR_DISCARD);
2502 
2503 	mutex_enter(&ncg->nfs4_dlist_lock);
2504 	while ((dp = list_head(&ncg->nfs4_dlist)) != NULL) {
2505 #ifdef	DEBUG
2506 		ncg->nfs4_dlistclean_c++;
2507 #endif
2508 		list_remove(&ncg->nfs4_dlist, dp);
2509 		mutex_exit(&ncg->nfs4_dlist_lock);
2510 		rp = dp->rnodep;
2511 		flags = (override_flags != 0) ? override_flags : dp->flags;
2512 		kmem_free(dp, sizeof (*dp));
2513 		(void) nfs4delegreturn_impl(rp, flags, ncg);
2514 		VN_RELE(RTOV4(rp));
2515 		mutex_enter(&ncg->nfs4_dlist_lock);
2516 	}
2517 	mutex_exit(&ncg->nfs4_dlist_lock);
2518 }
2519 
2520 void
2521 nfs4_dlistclean(void)
2522 {
2523 	struct nfs4_callback_globals *ncg;
2524 
2525 	ncg = zone_getspecific(nfs4_callback_zone_key, nfs_zone());
2526 	ASSERT(ncg != NULL);
2527 
2528 	nfs4_dlistclean_impl(ncg, 0);
2529 }
2530