xref: /titanic_52/usr/src/uts/common/fs/nfs/nfs4_callback.c (revision bde3d612a7c090234c60e6e4578821237a5db135)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 /* Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */
27 /* All Rights Reserved */
28 
29 #include <sys/param.h>
30 #include <sys/types.h>
31 #include <sys/systm.h>
32 #include <sys/cred.h>
33 #include <sys/vfs.h>
34 #include <sys/vnode.h>
35 #include <sys/pathname.h>
36 #include <sys/sysmacros.h>
37 #include <sys/kmem.h>
38 #include <sys/kstat.h>
39 #include <sys/mkdev.h>
40 #include <sys/mount.h>
41 #include <sys/statvfs.h>
42 #include <sys/errno.h>
43 #include <sys/debug.h>
44 #include <sys/cmn_err.h>
45 #include <sys/utsname.h>
46 #include <sys/bootconf.h>
47 #include <sys/modctl.h>
48 #include <sys/acl.h>
49 #include <sys/flock.h>
50 #include <sys/kstr.h>
51 #include <sys/stropts.h>
52 #include <sys/strsubr.h>
53 #include <sys/atomic.h>
54 #include <sys/disp.h>
55 #include <sys/policy.h>
56 #include <sys/list.h>
57 #include <sys/zone.h>
58 
59 #include <rpc/types.h>
60 #include <rpc/auth.h>
61 #include <rpc/rpcsec_gss.h>
62 #include <rpc/clnt.h>
63 #include <rpc/xdr.h>
64 
65 #include <nfs/nfs.h>
66 #include <nfs/nfs_clnt.h>
67 #include <nfs/mount.h>
68 #include <nfs/nfs_acl.h>
69 
70 #include <fs/fs_subr.h>
71 
72 #include <nfs/nfs4.h>
73 #include <nfs/rnode4.h>
74 #include <nfs/nfs4_clnt.h>
75 #include <nfs/nfssys.h>
76 
#ifdef	DEBUG
/*
 * These are "special" state IDs and file handles that
 * match any delegation state ID or file handle.  This
 * is for testing purposes only.
 */

stateid4 nfs4_deleg_any = { 0x7FFFFFF0 };
/*
 * NOTE(review): a C octal escape consumes at most three digits, so
 * "\0377" is the byte 037 followed by the character '7'.  The literal
 * below is therefore eight bytes long (plus the terminating NUL), not
 * four -- confirm that this is what test tools send.
 */
char nfs4_deleg_fh[] = "\0377\0376\0375\0374";
nfs_fh4 nfs4_deleg_anyfh = { sizeof (nfs4_deleg_fh)-1, nfs4_deleg_fh };
/*
 * Error injection: when set to something other than NFS4_OK, cb_getattr()
 * and cb_recall() respectively return that status immediately.
 */
nfsstat4 cb4_getattr_fail = NFS4_OK;
nfsstat4 cb4_recall_fail = NFS4_OK;

/* nfs4_callback_debug gates the CB_NOTE/CB_WARN/CB_WARN1 macros. */
int nfs4_callback_debug;
int nfs4_recall_debug;
int nfs4_drat_debug;

#endif
95 
/*
 * Shorthands for callback debug messages.  Each one expands to
 * NFS4_DEBUG() keyed on nfs4_callback_debug, with the cmn_err level
 * (CE_NOTE or CE_WARN) and the caller's format string; CB_WARN1 passes
 * one additional format argument.
 */
#define	CB_NOTE(x)	NFS4_DEBUG(nfs4_callback_debug, (CE_NOTE, x))
#define	CB_WARN(x)	NFS4_DEBUG(nfs4_callback_debug, (CE_WARN, x))
#define	CB_WARN1(x, y)	NFS4_DEBUG(nfs4_callback_debug, (CE_WARN, x, y))

/* Delegation return policy; starts out as INACTIVE. */
enum nfs4_delegreturn_policy nfs4_delegreturn_policy = INACTIVE;

/* Key handed to zone_getspecific() to find a zone's callback globals. */
static zone_key_t nfs4_callback_zone_key;
103 
/*
 * NFS4_MAPSIZE is the number of bytes we are willing to consume
 * for the block allocation map when the server grants a NFS_LIMIT_BLOCK
 * style delegation.
 *
 * NFS4_MAPWORDS is that size expressed in uint_t words and NbPW is the
 * number of bits in one such word.  Both expansions are fully
 * parenthesized so they can be used safely inside larger expressions
 * (e.g. "x % NFS4_MAPWORDS").
 */

#define	NFS4_MAPSIZE	8192
#define	NFS4_MAPWORDS	(NFS4_MAPSIZE / sizeof (uint_t))
#define	NbPW		(NBBY * sizeof (uint_t))
113 
/*
 * Number of callback RPC program numbers available per zone, starting at
 * NFS4_CALLBACK.  It also sizes the per-zone nfs4prog2server[] table.
 */
static int nfs4_num_prognums = 1024;
/* Callout table handed to svc_tli_kcreate() when the callback service starts */
static SVC_CALLOUT_TABLE nfs4_cb_sct;

/*
 * Element type of the per-zone nfs4_dlist (see the list_create() call in
 * nfs4_callback_init_zone()).
 */
struct nfs4_dnode {
	list_node_t	linkage;
	rnode4_t	*rnodep;
	int		flags;		/* Flags for nfs4delegreturn_impl() */
};
122 
/*
 * Template for the per-zone callback kstats.  nfs4_callback_init_zone()
 * copies it into each zone's globals before registering the kstat.  The
 * entries are positional initializers, so their order follows the member
 * order of struct nfs4_callback_stats; every counter is a 64-bit unsigned
 * value.
 */
static const struct nfs4_callback_stats nfs4_callback_stats_tmpl = {
	{ "delegations",	KSTAT_DATA_UINT64 },
	{ "cb_getattr",		KSTAT_DATA_UINT64 },
	{ "cb_recall",		KSTAT_DATA_UINT64 },
	{ "cb_null",		KSTAT_DATA_UINT64 },
	{ "cb_dispatch",	KSTAT_DATA_UINT64 },
	{ "delegaccept_r",	KSTAT_DATA_UINT64 },
	{ "delegaccept_rw",	KSTAT_DATA_UINT64 },
	{ "delegreturn",	KSTAT_DATA_UINT64 },
	{ "callbacks",		KSTAT_DATA_UINT64 },
	{ "claim_cur",		KSTAT_DATA_UINT64 },
	{ "claim_cur_ok",	KSTAT_DATA_UINT64 },
	{ "recall_trunc",	KSTAT_DATA_UINT64 },
	{ "recall_failed",	KSTAT_DATA_UINT64 },
	{ "return_limit_write",	KSTAT_DATA_UINT64 },
	{ "return_limit_addmap", KSTAT_DATA_UINT64 },
	{ "deleg_recover",	KSTAT_DATA_UINT64 },
	{ "cb_illegal",		KSTAT_DATA_UINT64 }
};
142 
/*
 * One callback endpoint recorded by nfs4_setport().  nfs4_cb_args() picks
 * the entry whose protofmly/proto match the mount's knetconfig and hands
 * its netid and uaddr to SETCLIENTID.  Every string field is a fixed
 * KNC_STRSIZE buffer.
 */
struct nfs4_cb_port {
	list_node_t		linkage; /* linkage into per-zone port list */
	char			netid[KNC_STRSIZE];
	char			uaddr[KNC_STRSIZE];
	char			protofmly[KNC_STRSIZE];
	char			proto[KNC_STRSIZE];
};
150 
/*
 * Size in bytes of the attribute buffer that cb_getattr() allocates and
 * cb_getattr_free() releases.  It is assigned outside the part of the file
 * shown here.
 */
static int cb_getattr_bytes;

/* Argument bundle handed to nfs4delegreturn_thread(). */
struct cb_recall_pass {
	rnode4_t	*rp;
	int		flags;		/* Flags for nfs4delegreturn_impl() */
	bool_t		truncate;
};

/* Forward declarations for static helpers defined later in this file. */
static nfs4_open_stream_t *get_next_deleg_stream(rnode4_t *, int);
static void nfs4delegreturn_thread(struct cb_recall_pass *);
static int deleg_reopen(vnode_t *, bool_t *, struct nfs4_callback_globals *,
    int);
static void nfs4_dlistadd(rnode4_t *, struct nfs4_callback_globals *, int);
static void nfs4_dlistclean_impl(struct nfs4_callback_globals *, int);
static int nfs4delegreturn_impl(rnode4_t *, int,
    struct nfs4_callback_globals *);
static void nfs4delegreturn_cleanup_impl(rnode4_t *, nfs4_server_t *,
    struct nfs4_callback_globals *);
169 
170 static void
171 cb_getattr(nfs_cb_argop4 *argop, nfs_cb_resop4 *resop, struct svc_req *req,
172 	struct compound_state *cs, struct nfs4_callback_globals *ncg)
173 {
174 	CB_GETATTR4args *args = &argop->nfs_cb_argop4_u.opcbgetattr;
175 	CB_GETATTR4res *resp = &resop->nfs_cb_resop4_u.opcbgetattr;
176 	rnode4_t *rp;
177 	vnode_t *vp;
178 	bool_t found = FALSE;
179 	struct nfs4_server *sp;
180 	struct fattr4 *fap;
181 	rpc_inline_t *fdata;
182 	long mapcnt;
183 	fattr4_change change;
184 	fattr4_size size;
185 	uint_t rflag;
186 
187 	ncg->nfs4_callback_stats.cb_getattr.value.ui64++;
188 
189 #ifdef DEBUG
190 	/*
191 	 * error injection hook: set cb_getattr_fail global to
192 	 * NFS4 pcol error to be returned
193 	 */
194 	if (cb4_getattr_fail != NFS4_OK) {
195 		*cs->statusp = resp->status = cb4_getattr_fail;
196 		return;
197 	}
198 #endif
199 
200 	resp->obj_attributes.attrmask = 0;
201 
202 	mutex_enter(&ncg->nfs4_cb_lock);
203 	sp = ncg->nfs4prog2server[req->rq_prog - NFS4_CALLBACK];
204 	mutex_exit(&ncg->nfs4_cb_lock);
205 
206 	if (nfs4_server_vlock(sp, 0) == FALSE) {
207 
208 		CB_WARN("cb_getattr: cannot find server\n");
209 
210 		*cs->statusp = resp->status = NFS4ERR_BADHANDLE;
211 		return;
212 	}
213 
214 	/*
215 	 * In cb_compound, callback_ident was validated against rq_prog,
216 	 * but we couldn't verify that it was set to the value we provided
217 	 * at setclientid time (because we didn't have server struct yet).
218 	 * Now we have the server struct, but don't have callback_ident
219 	 * handy.  So, validate server struct program number against req
220 	 * RPC's prog number.  At this point, we know the RPC prog num
221 	 * is valid (else we wouldn't be here); however, we don't know
222 	 * that it was the prog number we supplied to this server at
223 	 * setclientid time.  If the prog numbers aren't equivalent, then
224 	 * log the problem and fail the request because either cbserv
225 	 * and/or cbclient are confused.  This will probably never happen.
226 	 */
227 	if (sp->s_program != req->rq_prog) {
228 #ifdef DEBUG
229 		zcmn_err(getzoneid(), CE_WARN,
230 		    "cb_getattr: wrong server program number srv=%d req=%d\n",
231 		    sp->s_program, req->rq_prog);
232 #else
233 		zcmn_err(getzoneid(), CE_WARN,
234 		    "cb_getattr: wrong server program number\n");
235 #endif
236 		mutex_exit(&sp->s_lock);
237 		nfs4_server_rele(sp);
238 		*cs->statusp = resp->status = NFS4ERR_BADHANDLE;
239 		return;
240 	}
241 
242 	/*
243 	 * Search the delegation list for a matching file handle;
244 	 * mutex on sp prevents the list from changing.
245 	 */
246 
247 	rp = list_head(&sp->s_deleg_list);
248 	for (; rp != NULL; rp = list_next(&sp->s_deleg_list, rp)) {
249 		nfs4_fhandle_t fhandle;
250 
251 		sfh4_copyval(rp->r_fh, &fhandle);
252 
253 		if ((fhandle.fh_len == args->fh.nfs_fh4_len &&
254 		    bcmp(fhandle.fh_buf, args->fh.nfs_fh4_val,
255 		    fhandle.fh_len) == 0)) {
256 
257 			found = TRUE;
258 			break;
259 		}
260 #ifdef	DEBUG
261 		if (nfs4_deleg_anyfh.nfs_fh4_len == args->fh.nfs_fh4_len &&
262 		    bcmp(nfs4_deleg_anyfh.nfs_fh4_val, args->fh.nfs_fh4_val,
263 		    args->fh.nfs_fh4_len) == 0) {
264 
265 			found = TRUE;
266 			break;
267 		}
268 #endif
269 	}
270 
271 	/*
272 	 * VN_HOLD the vnode before releasing s_lock to guarantee
273 	 * we have a valid vnode reference.
274 	 */
275 	if (found == TRUE) {
276 		vp = RTOV4(rp);
277 		VN_HOLD(vp);
278 	}
279 
280 	mutex_exit(&sp->s_lock);
281 	nfs4_server_rele(sp);
282 
283 	if (found == FALSE) {
284 
285 		CB_WARN("cb_getattr: bad fhandle\n");
286 
287 		*cs->statusp = resp->status = NFS4ERR_BADHANDLE;
288 		return;
289 	}
290 
291 	/*
292 	 * Figure out which attributes the server wants.  We only
293 	 * offer FATTR4_CHANGE & FATTR4_SIZE; ignore the rest.
294 	 */
295 	fdata = kmem_alloc(cb_getattr_bytes, KM_SLEEP);
296 
297 	/*
298 	 * Don't actually need to create XDR to encode these
299 	 * simple data structures.
300 	 * xdrmem_create(&xdr, fdata, cb_getattr_bytes, XDR_ENCODE);
301 	 */
302 	fap = &resp->obj_attributes;
303 
304 	fap->attrmask = 0;
305 	/* attrlist4_len starts at 0 and increases as attrs are processed */
306 	fap->attrlist4 = (char *)fdata;
307 	fap->attrlist4_len = 0;
308 
309 	/* don't supply attrs if request was zero */
310 	if (args->attr_request != 0) {
311 		if (args->attr_request & FATTR4_CHANGE_MASK) {
312 			/*
313 			 * If the file is mmapped, then increment the change
314 			 * attribute and return it.  This will guarantee that
315 			 * the server will perceive that the file has changed
316 			 * if there is any chance that the client application
317 			 * has changed it.  Otherwise, just return the change
318 			 * attribute as it has been updated by nfs4write_deleg.
319 			 */
320 
321 			mutex_enter(&rp->r_statelock);
322 			mapcnt = rp->r_mapcnt;
323 			rflag = rp->r_flags;
324 			mutex_exit(&rp->r_statelock);
325 
326 			mutex_enter(&rp->r_statev4_lock);
327 			/*
328 			 * If object mapped, then always return new change.
329 			 * Otherwise, return change if object has dirty
330 			 * pages.  If object doesn't have any dirty pages,
331 			 * then all changes have been pushed to server, so
332 			 * reset change to grant change.
333 			 */
334 			if (mapcnt)
335 				rp->r_deleg_change++;
336 			else if (! (rflag & R4DIRTY))
337 				rp->r_deleg_change = rp->r_deleg_change_grant;
338 			change = rp->r_deleg_change;
339 			mutex_exit(&rp->r_statev4_lock);
340 
341 			/*
342 			 * Use inline XDR code directly, we know that we
343 			 * going to a memory buffer and it has enough
344 			 * space so it cannot fail.
345 			 */
346 			IXDR_PUT_U_HYPER(fdata, change);
347 			fap->attrlist4_len += 2 * BYTES_PER_XDR_UNIT;
348 			fap->attrmask |= FATTR4_CHANGE_MASK;
349 		}
350 
351 		if (args->attr_request & FATTR4_SIZE_MASK) {
352 			/*
353 			 * Use an atomic add of 0 to fetch a consistent view
354 			 * of r_size; this avoids having to take rw_lock
355 			 * which could cause a deadlock.
356 			 */
357 			size = atomic_add_64_nv((uint64_t *)&rp->r_size, 0);
358 
359 			/*
360 			 * Use inline XDR code directly, we know that we
361 			 * going to a memory buffer and it has enough
362 			 * space so it cannot fail.
363 			 */
364 			IXDR_PUT_U_HYPER(fdata, size);
365 			fap->attrlist4_len += 2 * BYTES_PER_XDR_UNIT;
366 			fap->attrmask |= FATTR4_SIZE_MASK;
367 		}
368 	}
369 
370 	VN_RELE(vp);
371 
372 	*cs->statusp = resp->status = NFS4_OK;
373 }
374 
375 static void
376 cb_getattr_free(nfs_cb_resop4 *resop)
377 {
378 	if (resop->nfs_cb_resop4_u.opcbgetattr.obj_attributes.attrlist4)
379 		kmem_free(resop->nfs_cb_resop4_u.opcbgetattr.
380 		    obj_attributes.attrlist4, cb_getattr_bytes);
381 }
382 
383 static void
384 cb_recall(nfs_cb_argop4 *argop, nfs_cb_resop4 *resop, struct svc_req *req,
385 	struct compound_state *cs, struct nfs4_callback_globals *ncg)
386 {
387 	CB_RECALL4args * args = &argop->nfs_cb_argop4_u.opcbrecall;
388 	CB_RECALL4res *resp = &resop->nfs_cb_resop4_u.opcbrecall;
389 	rnode4_t *rp;
390 	vnode_t *vp;
391 	struct nfs4_server *sp;
392 	bool_t found = FALSE;
393 
394 	ncg->nfs4_callback_stats.cb_recall.value.ui64++;
395 
396 	ASSERT(req->rq_prog >= NFS4_CALLBACK);
397 	ASSERT(req->rq_prog < NFS4_CALLBACK+nfs4_num_prognums);
398 
399 #ifdef DEBUG
400 	/*
401 	 * error injection hook: set cb_recall_fail global to
402 	 * NFS4 pcol error to be returned
403 	 */
404 	if (cb4_recall_fail != NFS4_OK) {
405 		*cs->statusp = resp->status = cb4_recall_fail;
406 		return;
407 	}
408 #endif
409 
410 	mutex_enter(&ncg->nfs4_cb_lock);
411 	sp = ncg->nfs4prog2server[req->rq_prog - NFS4_CALLBACK];
412 	mutex_exit(&ncg->nfs4_cb_lock);
413 
414 	if (nfs4_server_vlock(sp, 0) == FALSE) {
415 
416 		CB_WARN("cb_recall: cannot find server\n");
417 
418 		*cs->statusp = resp->status = NFS4ERR_BADHANDLE;
419 		return;
420 	}
421 
422 	/*
423 	 * Search the delegation list for a matching file handle
424 	 * AND stateid; mutex on sp prevents the list from changing.
425 	 */
426 
427 	rp = list_head(&sp->s_deleg_list);
428 	for (; rp != NULL; rp = list_next(&sp->s_deleg_list, rp)) {
429 		mutex_enter(&rp->r_statev4_lock);
430 
431 		/* check both state id and file handle! */
432 
433 		if ((bcmp(&rp->r_deleg_stateid, &args->stateid,
434 		    sizeof (stateid4)) == 0)) {
435 			nfs4_fhandle_t fhandle;
436 
437 			sfh4_copyval(rp->r_fh, &fhandle);
438 			if ((fhandle.fh_len == args->fh.nfs_fh4_len &&
439 			    bcmp(fhandle.fh_buf, args->fh.nfs_fh4_val,
440 			    fhandle.fh_len) == 0)) {
441 
442 				found = TRUE;
443 				break;
444 			} else {
445 #ifdef	DEBUG
446 				CB_WARN("cb_recall: stateid OK, bad fh");
447 #endif
448 			}
449 		}
450 #ifdef	DEBUG
451 		if (bcmp(&args->stateid, &nfs4_deleg_any,
452 		    sizeof (stateid4)) == 0) {
453 
454 			found = TRUE;
455 			break;
456 		}
457 #endif
458 		mutex_exit(&rp->r_statev4_lock);
459 	}
460 
461 	/*
462 	 * VN_HOLD the vnode before releasing s_lock to guarantee
463 	 * we have a valid vnode reference.  The async thread will
464 	 * release the hold when it's done.
465 	 */
466 	if (found == TRUE) {
467 		mutex_exit(&rp->r_statev4_lock);
468 		vp = RTOV4(rp);
469 		VN_HOLD(vp);
470 	}
471 	mutex_exit(&sp->s_lock);
472 	nfs4_server_rele(sp);
473 
474 	if (found == FALSE) {
475 
476 		CB_WARN("cb_recall: bad stateid\n");
477 
478 		*cs->statusp = resp->status = NFS4ERR_BAD_STATEID;
479 		return;
480 	}
481 
482 	/* Fire up a thread to do the delegreturn */
483 	nfs4delegreturn_async(rp, NFS4_DR_RECALL|NFS4_DR_REOPEN,
484 	    args->truncate);
485 
486 	*cs->statusp = resp->status = 0;
487 }
488 
/*
 * cb_recall_free - counterpart of cb_getattr_free() for CB_RECALL results.
 * It is called from cb_compound_free() for every OP_CB_RECALL result and is
 * intentionally empty.
 */
/* ARGSUSED */
static void
cb_recall_free(nfs_cb_resop4 *resop)
{
	/* nothing to do here, cb_recall doesn't kmem_alloc */
}
495 
496 /*
497  * This function handles the CB_NULL proc call from an NFSv4 Server.
498  *
499  * We take note that the server has sent a CB_NULL for later processing
500  * in the recovery logic. It is noted so we may pause slightly after the
501  * setclientid and before reopening files. The pause is to allow the
502  * NFSv4 Server time to receive the CB_NULL reply and adjust any of
503  * its internal structures such that it has the opportunity to grant
504  * delegations to reopened files.
505  *
506  */
507 
508 /* ARGSUSED */
509 static void
510 cb_null(CB_COMPOUND4args *args, CB_COMPOUND4res *resp, struct svc_req *req,
511     struct nfs4_callback_globals *ncg)
512 {
513 	struct nfs4_server *sp;
514 
515 	ncg->nfs4_callback_stats.cb_null.value.ui64++;
516 
517 	ASSERT(req->rq_prog >= NFS4_CALLBACK);
518 	ASSERT(req->rq_prog < NFS4_CALLBACK+nfs4_num_prognums);
519 
520 	mutex_enter(&ncg->nfs4_cb_lock);
521 	sp = ncg->nfs4prog2server[req->rq_prog - NFS4_CALLBACK];
522 	mutex_exit(&ncg->nfs4_cb_lock);
523 
524 	if (nfs4_server_vlock(sp, 0) != FALSE) {
525 		sp->s_flags |= N4S_CB_PINGED;
526 		cv_broadcast(&sp->wait_cb_null);
527 		mutex_exit(&sp->s_lock);
528 		nfs4_server_rele(sp);
529 	}
530 }
531 
532 /*
533  * cb_illegal	args: void
534  *		res : status (NFS4ERR_OP_CB_ILLEGAL)
535  */
536 /* ARGSUSED */
537 static void
538 cb_illegal(nfs_cb_argop4 *argop, nfs_cb_resop4 *resop, struct svc_req *req,
539 	struct compound_state *cs, struct nfs4_callback_globals *ncg)
540 {
541 	CB_ILLEGAL4res *resp = &resop->nfs_cb_resop4_u.opcbillegal;
542 
543 	ncg->nfs4_callback_stats.cb_illegal.value.ui64++;
544 	resop->resop = OP_CB_ILLEGAL;
545 	*cs->statusp = resp->status = NFS4ERR_OP_ILLEGAL;
546 }
547 
548 static void
549 cb_compound(CB_COMPOUND4args *args, CB_COMPOUND4res *resp, struct svc_req *req,
550 	struct nfs4_callback_globals *ncg)
551 {
552 	uint_t i;
553 	struct compound_state cs;
554 	nfs_cb_argop4 *argop;
555 	nfs_cb_resop4 *resop, *new_res;
556 	uint_t op;
557 
558 	bzero(&cs, sizeof (cs));
559 	cs.statusp = &resp->status;
560 	cs.cont = TRUE;
561 
562 	/*
563 	 * Form a reply tag by copying over the reqeuest tag.
564 	 */
565 	resp->tag.utf8string_len = args->tag.utf8string_len;
566 	resp->tag.utf8string_val = kmem_alloc(resp->tag.utf8string_len,
567 	    KM_SLEEP);
568 	bcopy(args->tag.utf8string_val, resp->tag.utf8string_val,
569 	    args->tag.utf8string_len);
570 
571 	/*
572 	 * XXX for now, minorversion should be zero
573 	 */
574 	if (args->minorversion != CB4_MINORVERSION) {
575 		resp->array_len = 0;
576 		resp->array = NULL;
577 		resp->status = NFS4ERR_MINOR_VERS_MISMATCH;
578 		return;
579 	}
580 
581 #ifdef DEBUG
582 	/*
583 	 * Verify callback_ident.  It doesn't really matter if it's wrong
584 	 * because we don't really use callback_ident -- we use prog number
585 	 * of the RPC request instead.  In this case, just print a DEBUG
586 	 * console message to reveal brokenness of cbclient (at bkoff/cthon).
587 	 */
588 	if (args->callback_ident != req->rq_prog)
589 		zcmn_err(getzoneid(), CE_WARN,
590 		    "cb_compound: cb_client using wrong "
591 		    "callback_ident(%d), should be %d",
592 		    args->callback_ident, req->rq_prog);
593 #endif
594 
595 	resp->array_len = args->array_len;
596 	resp->array = kmem_zalloc(args->array_len * sizeof (nfs_cb_resop4),
597 	    KM_SLEEP);
598 
599 	for (i = 0; i < args->array_len && cs.cont; i++) {
600 
601 		argop = &args->array[i];
602 		resop = &resp->array[i];
603 		resop->resop = argop->argop;
604 		op = (uint_t)resop->resop;
605 
606 		switch (op) {
607 
608 		case OP_CB_GETATTR:
609 
610 			cb_getattr(argop, resop, req, &cs, ncg);
611 			break;
612 
613 		case OP_CB_RECALL:
614 
615 			cb_recall(argop, resop, req, &cs, ncg);
616 			break;
617 
618 		case OP_CB_ILLEGAL:
619 
620 			/* fall through */
621 
622 		default:
623 			/*
624 			 * Handle OP_CB_ILLEGAL and any undefined opcode.
625 			 * Currently, the XDR code will return BADXDR
626 			 * if cb op doesn't decode to legal value, so
627 			 * it really only handles OP_CB_ILLEGAL.
628 			 */
629 			op = OP_CB_ILLEGAL;
630 			cb_illegal(argop, resop, req, &cs, ncg);
631 		}
632 
633 		if (*cs.statusp != NFS4_OK)
634 			cs.cont = FALSE;
635 
636 		/*
637 		 * If not at last op, and if we are to stop, then
638 		 * compact the results array.
639 		 */
640 		if ((i + 1) < args->array_len && !cs.cont) {
641 
642 			new_res = kmem_alloc(
643 			    (i+1) * sizeof (nfs_cb_resop4), KM_SLEEP);
644 			bcopy(resp->array,
645 			    new_res, (i+1) * sizeof (nfs_cb_resop4));
646 			kmem_free(resp->array,
647 			    args->array_len * sizeof (nfs_cb_resop4));
648 
649 			resp->array_len =  i + 1;
650 			resp->array = new_res;
651 		}
652 	}
653 
654 }
655 
656 static void
657 cb_compound_free(CB_COMPOUND4res *resp)
658 {
659 	uint_t i, op;
660 	nfs_cb_resop4 *resop;
661 
662 	if (resp->tag.utf8string_val) {
663 		UTF8STRING_FREE(resp->tag)
664 	}
665 
666 	for (i = 0; i < resp->array_len; i++) {
667 
668 		resop = &resp->array[i];
669 		op = (uint_t)resop->resop;
670 
671 		switch (op) {
672 
673 		case OP_CB_GETATTR:
674 
675 			cb_getattr_free(resop);
676 			break;
677 
678 		case OP_CB_RECALL:
679 
680 			cb_recall_free(resop);
681 			break;
682 
683 		default:
684 			break;
685 		}
686 	}
687 
688 	if (resp->array != NULL) {
689 		kmem_free(resp->array,
690 		    resp->array_len * sizeof (nfs_cb_resop4));
691 	}
692 }
693 
694 static void
695 cb_dispatch(struct svc_req *req, SVCXPRT *xprt)
696 {
697 	CB_COMPOUND4args args;
698 	CB_COMPOUND4res res;
699 	struct nfs4_callback_globals *ncg;
700 
701 	bool_t (*xdr_args)(), (*xdr_res)();
702 	void (*proc)(CB_COMPOUND4args *, CB_COMPOUND4res *, struct svc_req *,
703 	    struct nfs4_callback_globals *);
704 	void (*freeproc)(CB_COMPOUND4res *);
705 
706 	ncg = zone_getspecific(nfs4_callback_zone_key, nfs_zone());
707 	ASSERT(ncg != NULL);
708 
709 	ncg->nfs4_callback_stats.cb_dispatch.value.ui64++;
710 
711 	switch (req->rq_proc) {
712 	case CB_NULL:
713 		xdr_args = xdr_void;
714 		xdr_res = xdr_void;
715 		proc = cb_null;
716 		freeproc = NULL;
717 		break;
718 
719 	case CB_COMPOUND:
720 		xdr_args = xdr_CB_COMPOUND4args_clnt;
721 		xdr_res = xdr_CB_COMPOUND4res;
722 		proc = cb_compound;
723 		freeproc = cb_compound_free;
724 		break;
725 
726 	default:
727 		CB_WARN("cb_dispatch: no proc\n");
728 		svcerr_noproc(xprt);
729 		return;
730 	}
731 
732 	args.tag.utf8string_val = NULL;
733 	args.array = NULL;
734 
735 	if (!SVC_GETARGS(xprt, xdr_args, (caddr_t)&args)) {
736 
737 		CB_WARN("cb_dispatch: cannot getargs\n");
738 		svcerr_decode(xprt);
739 		return;
740 	}
741 
742 	(*proc)(&args, &res, req, ncg);
743 
744 	if (svc_sendreply(xprt, xdr_res, (caddr_t)&res) == FALSE) {
745 
746 		CB_WARN("cb_dispatch: bad sendreply\n");
747 		svcerr_systemerr(xprt);
748 	}
749 
750 	if (freeproc)
751 		(*freeproc)(&res);
752 
753 	if (!SVC_FREEARGS(xprt, xdr_args, (caddr_t)&args)) {
754 
755 		CB_WARN("cb_dispatch: bad freeargs\n");
756 	}
757 }
758 
759 static rpcprog_t
760 nfs4_getnextprogram(struct nfs4_callback_globals *ncg)
761 {
762 	int i, j;
763 
764 	j = ncg->nfs4_program_hint;
765 	for (i = 0; i < nfs4_num_prognums; i++, j++) {
766 
767 		if (j >= nfs4_num_prognums)
768 			j = 0;
769 
770 		if (ncg->nfs4prog2server[j] == NULL) {
771 			ncg->nfs4_program_hint = j+1;
772 			return (j+NFS4_CALLBACK);
773 		}
774 	}
775 
776 	return (0);
777 }
778 
779 void
780 nfs4callback_destroy(nfs4_server_t *np)
781 {
782 	struct nfs4_callback_globals *ncg;
783 	int i;
784 
785 	if (np->s_program == 0)
786 		return;
787 
788 	ncg = np->zone_globals;
789 	i = np->s_program - NFS4_CALLBACK;
790 
791 	mutex_enter(&ncg->nfs4_cb_lock);
792 
793 	ASSERT(ncg->nfs4prog2server[i] == np);
794 
795 	ncg->nfs4prog2server[i] = NULL;
796 
797 	if (i < ncg->nfs4_program_hint)
798 		ncg->nfs4_program_hint = i;
799 
800 	mutex_exit(&ncg->nfs4_cb_lock);
801 }
802 
803 /*
804  * nfs4_setport - This function saves a netid and univeral address for
805  * the callback program.  These values will be used during setclientid.
806  */
807 static void
808 nfs4_setport(char *netid, char *uaddr, char *protofmly, char *proto,
809 	struct nfs4_callback_globals *ncg)
810 {
811 	struct nfs4_cb_port *p;
812 	bool_t found = FALSE;
813 
814 	ASSERT(MUTEX_HELD(&ncg->nfs4_cb_lock));
815 
816 	p = list_head(&ncg->nfs4_cb_ports);
817 	for (; p != NULL; p = list_next(&ncg->nfs4_cb_ports, p)) {
818 		if (strcmp(p->netid, netid) == 0) {
819 			found = TRUE;
820 			break;
821 		}
822 	}
823 	if (found == TRUE)
824 		(void) strcpy(p->uaddr, uaddr);
825 	else {
826 		p = kmem_alloc(sizeof (*p), KM_SLEEP);
827 
828 		(void) strcpy(p->uaddr, uaddr);
829 		(void) strcpy(p->netid, netid);
830 		(void) strcpy(p->protofmly, protofmly);
831 		(void) strcpy(p->proto, proto);
832 		list_insert_head(&ncg->nfs4_cb_ports, p);
833 	}
834 }
835 
836 /*
837  * nfs4_cb_args - This function is used to construct the callback
838  * portion of the arguments needed for setclientid.
839  */
840 
841 void
842 nfs4_cb_args(nfs4_server_t *np, struct knetconfig *knc, SETCLIENTID4args *args)
843 {
844 	struct nfs4_cb_port *p;
845 	bool_t found = FALSE;
846 	rpcprog_t pgm;
847 	struct nfs4_callback_globals *ncg = np->zone_globals;
848 
849 	/*
850 	 * This server structure may already have a program number
851 	 * assigned to it.  This happens when the client has to
852 	 * re-issue SETCLIENTID.  Just re-use the information.
853 	 */
854 	if (np->s_program >= NFS4_CALLBACK &&
855 	    np->s_program < NFS4_CALLBACK + nfs4_num_prognums)
856 		nfs4callback_destroy(np);
857 
858 	mutex_enter(&ncg->nfs4_cb_lock);
859 
860 	p = list_head(&ncg->nfs4_cb_ports);
861 	for (; p != NULL; p = list_next(&ncg->nfs4_cb_ports, p)) {
862 		if (strcmp(p->protofmly, knc->knc_protofmly) == 0 &&
863 		    strcmp(p->proto, knc->knc_proto) == 0) {
864 			found = TRUE;
865 			break;
866 		}
867 	}
868 
869 	if (found == FALSE) {
870 
871 		NFS4_DEBUG(nfs4_callback_debug,
872 		    (CE_WARN, "nfs4_cb_args: could not find netid for %s/%s\n",
873 		    knc->knc_protofmly, knc->knc_proto));
874 
875 		args->callback.cb_program = 0;
876 		args->callback.cb_location.r_netid = NULL;
877 		args->callback.cb_location.r_addr = NULL;
878 		args->callback_ident = 0;
879 		mutex_exit(&ncg->nfs4_cb_lock);
880 		return;
881 	}
882 
883 	if ((pgm = nfs4_getnextprogram(ncg)) == 0) {
884 		CB_WARN("nfs4_cb_args: out of program numbers\n");
885 
886 		args->callback.cb_program = 0;
887 		args->callback.cb_location.r_netid = NULL;
888 		args->callback.cb_location.r_addr = NULL;
889 		args->callback_ident = 0;
890 		mutex_exit(&ncg->nfs4_cb_lock);
891 		return;
892 	}
893 
894 	ncg->nfs4prog2server[pgm-NFS4_CALLBACK] = np;
895 	args->callback.cb_program = pgm;
896 	args->callback.cb_location.r_netid = p->netid;
897 	args->callback.cb_location.r_addr = p->uaddr;
898 	args->callback_ident = pgm;
899 
900 	np->s_program = pgm;
901 
902 	mutex_exit(&ncg->nfs4_cb_lock);
903 }
904 
905 static int
906 nfs4_dquery(struct nfs4_svc_args *arg, model_t model)
907 {
908 	file_t *fp;
909 	vnode_t *vp;
910 	rnode4_t *rp;
911 	int error;
912 	STRUCT_HANDLE(nfs4_svc_args, uap);
913 
914 	STRUCT_SET_HANDLE(uap, model, arg);
915 
916 	if ((fp = getf(STRUCT_FGET(uap, fd))) == NULL)
917 		return (EBADF);
918 
919 	vp = fp->f_vnode;
920 
921 	if (vp == NULL || vp->v_type != VREG ||
922 	    !vn_matchops(vp, nfs4_vnodeops)) {
923 		releasef(STRUCT_FGET(uap, fd));
924 		return (EBADF);
925 	}
926 
927 	rp = VTOR4(vp);
928 
929 	/*
930 	 * I can't convince myself that we need locking here.  The
931 	 * rnode cannot disappear and the value returned is instantly
932 	 * stale anway, so why bother?
933 	 */
934 
935 	error = suword32(STRUCT_FGETP(uap, netid), rp->r_deleg_type);
936 	releasef(STRUCT_FGET(uap, fd));
937 	return (error);
938 }
939 
940 
941 /*
942  * NFS4 client system call.  This service does the
943  * necessary initialization for the callback program.
944  * This is fashioned after the server side interaction
945  * between nfsd and the kernel.  On the client, the
946  * mount command forks and the child process does the
947  * necessary interaction with the kernel.
948  *
949  * uap->fd is the fd of an open transport provider
950  */
951 int
952 nfs4_svc(struct nfs4_svc_args *arg, model_t model)
953 {
954 	file_t *fp;
955 	int error;
956 	int readsize;
957 	char buf[KNC_STRSIZE], uaddr[KNC_STRSIZE];
958 	char protofmly[KNC_STRSIZE], proto[KNC_STRSIZE];
959 	size_t len;
960 	STRUCT_HANDLE(nfs4_svc_args, uap);
961 	struct netbuf addrmask;
962 	int cmd;
963 	SVCMASTERXPRT *cb_xprt;
964 	struct nfs4_callback_globals *ncg;
965 
966 #ifdef lint
967 	model = model;		/* STRUCT macros don't always refer to it */
968 #endif
969 
970 	STRUCT_SET_HANDLE(uap, model, arg);
971 
972 	if (STRUCT_FGET(uap, cmd) == NFS4_DQUERY)
973 		return (nfs4_dquery(arg, model));
974 
975 	if (secpolicy_nfs(CRED()) != 0)
976 		return (EPERM);
977 
978 	if ((fp = getf(STRUCT_FGET(uap, fd))) == NULL)
979 		return (EBADF);
980 
981 	/*
982 	 * Set read buffer size to rsize
983 	 * and add room for RPC headers.
984 	 */
985 	readsize = nfs3tsize() + (RPC_MAXDATASIZE - NFS_MAXDATA);
986 	if (readsize < RPC_MAXDATASIZE)
987 		readsize = RPC_MAXDATASIZE;
988 
989 	error = copyinstr((const char *)STRUCT_FGETP(uap, netid), buf,
990 	    KNC_STRSIZE, &len);
991 	if (error) {
992 		releasef(STRUCT_FGET(uap, fd));
993 		return (error);
994 	}
995 
996 	cmd = STRUCT_FGET(uap, cmd);
997 
998 	if (cmd & NFS4_KRPC_START) {
999 		addrmask.len = STRUCT_FGET(uap, addrmask.len);
1000 		addrmask.maxlen = STRUCT_FGET(uap, addrmask.maxlen);
1001 		addrmask.buf = kmem_alloc(addrmask.maxlen, KM_SLEEP);
1002 		error = copyin(STRUCT_FGETP(uap, addrmask.buf), addrmask.buf,
1003 		    addrmask.len);
1004 		if (error) {
1005 			releasef(STRUCT_FGET(uap, fd));
1006 			kmem_free(addrmask.buf, addrmask.maxlen);
1007 			return (error);
1008 		}
1009 	}
1010 	else
1011 		addrmask.buf = NULL;
1012 
1013 	error = copyinstr((const char *)STRUCT_FGETP(uap, addr), uaddr,
1014 	    sizeof (uaddr), &len);
1015 	if (error) {
1016 		releasef(STRUCT_FGET(uap, fd));
1017 		if (addrmask.buf)
1018 			kmem_free(addrmask.buf, addrmask.maxlen);
1019 		return (error);
1020 	}
1021 
1022 	error = copyinstr((const char *)STRUCT_FGETP(uap, protofmly), protofmly,
1023 	    sizeof (protofmly), &len);
1024 	if (error) {
1025 		releasef(STRUCT_FGET(uap, fd));
1026 		if (addrmask.buf)
1027 			kmem_free(addrmask.buf, addrmask.maxlen);
1028 		return (error);
1029 	}
1030 
1031 	error = copyinstr((const char *)STRUCT_FGETP(uap, proto), proto,
1032 	    sizeof (proto), &len);
1033 	if (error) {
1034 		releasef(STRUCT_FGET(uap, fd));
1035 		if (addrmask.buf)
1036 			kmem_free(addrmask.buf, addrmask.maxlen);
1037 		return (error);
1038 	}
1039 
1040 	ncg = zone_getspecific(nfs4_callback_zone_key, nfs_zone());
1041 	ASSERT(ncg != NULL);
1042 
1043 	mutex_enter(&ncg->nfs4_cb_lock);
1044 	if (cmd & NFS4_SETPORT)
1045 		nfs4_setport(buf, uaddr, protofmly, proto, ncg);
1046 
1047 	if (cmd & NFS4_KRPC_START) {
1048 		error = svc_tli_kcreate(fp, readsize, buf, &addrmask, &cb_xprt,
1049 		    &nfs4_cb_sct, NULL, NFS_CB_SVCPOOL_ID, FALSE);
1050 		if (error) {
1051 			CB_WARN1("nfs4_svc: svc_tli_kcreate failed %d\n",
1052 			    error);
1053 			kmem_free(addrmask.buf, addrmask.maxlen);
1054 		}
1055 	}
1056 
1057 	mutex_exit(&ncg->nfs4_cb_lock);
1058 	releasef(STRUCT_FGET(uap, fd));
1059 	return (error);
1060 }
1061 
1062 struct nfs4_callback_globals *
1063 nfs4_get_callback_globals(void)
1064 {
1065 	return (zone_getspecific(nfs4_callback_zone_key, nfs_zone()));
1066 }
1067 
1068 static void *
1069 nfs4_callback_init_zone(zoneid_t zoneid)
1070 {
1071 	kstat_t *nfs4_callback_kstat;
1072 	struct nfs4_callback_globals *ncg;
1073 
1074 	ncg = kmem_zalloc(sizeof (*ncg), KM_SLEEP);
1075 
1076 	ncg->nfs4prog2server = kmem_zalloc(nfs4_num_prognums *
1077 	    sizeof (struct nfs4_server *), KM_SLEEP);
1078 
1079 	/* initialize the dlist */
1080 	mutex_init(&ncg->nfs4_dlist_lock, NULL, MUTEX_DEFAULT, NULL);
1081 	list_create(&ncg->nfs4_dlist, sizeof (struct nfs4_dnode),
1082 	    offsetof(struct nfs4_dnode, linkage));
1083 
1084 	/* initialize cb_port list */
1085 	mutex_init(&ncg->nfs4_cb_lock, NULL, MUTEX_DEFAULT, NULL);
1086 	list_create(&ncg->nfs4_cb_ports, sizeof (struct nfs4_cb_port),
1087 	    offsetof(struct nfs4_cb_port, linkage));
1088 
1089 	/* get our own copy of the kstats */
1090 	bcopy(&nfs4_callback_stats_tmpl, &ncg->nfs4_callback_stats,
1091 	    sizeof (nfs4_callback_stats_tmpl));
1092 	/* register "nfs:0:nfs4_callback_stats" for this zone */
1093 	if ((nfs4_callback_kstat =
1094 	    kstat_create_zone("nfs", 0, "nfs4_callback_stats", "misc",
1095 	    KSTAT_TYPE_NAMED,
1096 	    sizeof (ncg->nfs4_callback_stats) / sizeof (kstat_named_t),
1097 	    KSTAT_FLAG_VIRTUAL | KSTAT_FLAG_WRITABLE,
1098 	    zoneid)) != NULL) {
1099 		nfs4_callback_kstat->ks_data = &ncg->nfs4_callback_stats;
1100 		kstat_install(nfs4_callback_kstat);
1101 	}
1102 	return (ncg);
1103 }
1104 
/*
 * Discard every delegation held from the servers recorded in this zone's
 * nfs4prog2server[] table.
 *
 * For each program slot the server pointer is sampled under nfs4_cb_lock and
 * handed to nfs4_server_vlock(); slots for which that returns FALSE are
 * skipped.  Every rnode on the server's s_deleg_list is then either unlinked
 * here (when it no longer records a delegation) or passed to
 * nfs4delegreturn_cleanup_impl(), which unlinks it.
 *
 * ncg - callback globals of the zone whose delegations are being discarded.
 */
static void
nfs4_discard_delegations(struct nfs4_callback_globals *ncg)
{
	nfs4_server_t *sp;
	int i, num_removed;

	/*
	 * It's OK here to just run through the registered "programs", as
	 * servers without programs won't have any delegations to handle.
	 */
	for (i = 0; i < nfs4_num_prognums; i++) {
		rnode4_t *rp;

		mutex_enter(&ncg->nfs4_cb_lock);
		sp = ncg->nfs4prog2server[i];
		mutex_exit(&ncg->nfs4_cb_lock);

		/*
		 * NOTE(review): on success this presumably returns with
		 * sp->s_lock held and a reference taken (both are released
		 * below) -- confirm against nfs4_server_vlock().
		 */
		if (nfs4_server_vlock(sp, 1) == FALSE)
			continue;
		num_removed = 0;
		while ((rp = list_head(&sp->s_deleg_list)) != NULL) {
			mutex_enter(&rp->r_statev4_lock);
			if (rp->r_deleg_type == OPEN_DELEGATE_NONE) {
				/*
				 * We need to take matters into our own hands,
				 * as nfs4delegreturn_cleanup_impl() won't
				 * remove this from the list.
				 */
				list_remove(&sp->s_deleg_list, rp);
				mutex_exit(&rp->r_statev4_lock);
				nfs4_dec_state_ref_count_nolock(sp,
				    VTOMI4(RTOV4(rp)));
				/* the matching server rele is deferred */
				num_removed++;
				continue;
			}
			mutex_exit(&rp->r_statev4_lock);
			/* keep the vnode held for the duration of the call */
			VN_HOLD(RTOV4(rp));
			/*
			 * Drop s_lock: nfs4delegreturn_cleanup_impl() takes
			 * it itself when given a non-NULL server.
			 */
			mutex_exit(&sp->s_lock);
			/*
			 * The following will remove the node from the list.
			 */
			nfs4delegreturn_cleanup_impl(rp, sp, ncg);
			VN_RELE(RTOV4(rp));
			mutex_enter(&sp->s_lock);
		}
		mutex_exit(&sp->s_lock);
		/* each removed list node reles a reference */
		while (num_removed-- > 0)
			nfs4_server_rele(sp);
		/* remove our reference for nfs4_server_vlock */
		nfs4_server_rele(sp);
	}
}
1158 
1159 /* ARGSUSED */
1160 static void
1161 nfs4_callback_shutdown_zone(zoneid_t zoneid, void *data)
1162 {
1163 	struct nfs4_callback_globals *ncg = data;
1164 
1165 	/*
1166 	 * Clean pending delegation return list.
1167 	 */
1168 	nfs4_dlistclean_impl(ncg, NFS4_DR_DISCARD);
1169 
1170 	/*
1171 	 * Discard all delegations.
1172 	 */
1173 	nfs4_discard_delegations(ncg);
1174 }
1175 
/*
 * Zone-key destroy callback: tear down everything nfs4_callback_init_zone()
 * set up for this zone.
 *
 * Deletes the zone's "nfs4_callback_stats" kstat, discards any remaining
 * delegations, moves every nfs4_server_t belonging to 'zoneid' off the global
 * nfs4_server_lst and releases a reference on each, then frees the
 * program-to-server table, the callback-port list entries, the locks and
 * finally the globals structure itself.
 *
 * zoneid - id of the zone being destroyed.
 * data   - the zone's struct nfs4_callback_globals.
 */
static void
nfs4_callback_fini_zone(zoneid_t zoneid, void *data)
{
	struct nfs4_callback_globals *ncg = data;
	struct nfs4_cb_port *p;
	nfs4_server_t *sp, *next;
	nfs4_server_t freelist;		/* local list head; only forw/back used */
	int i;		/* NOTE(review): only referenced under DEBUG below */

	kstat_delete_byname_zone("nfs", 0, "nfs4_callback_stats", zoneid);

	/*
	 * Discard all delegations that may have crept in since we did the
	 * _shutdown.
	 */
	nfs4_discard_delegations(ncg);
	/*
	 * We're completely done with this zone and all associated
	 * nfs4_server_t's.  Any remaining nfs4_server_ts should only have one
	 * more reference outstanding -- the reference we didn't release in
	 * nfs4_renew_lease_thread().
	 *
	 * Here we need to run through the global nfs4_server_lst as we need to
	 * deal with nfs4_server_ts without programs, as they also have threads
	 * created for them, and so have outstanding references that we need to
	 * release.
	 */
	freelist.forw = &freelist;
	freelist.back = &freelist;
	mutex_enter(&nfs4_server_lst_lock);
	sp = nfs4_server_lst.forw;
	while (sp != &nfs4_server_lst) {
		/* remember the successor before sp is unlinked */
		next = sp->forw;
		if (sp->zoneid == zoneid) {
			remque(sp);
			insque(sp, &freelist);
		}
		sp = next;
	}
	mutex_exit(&nfs4_server_lst_lock);

	/* release the servers outside nfs4_server_lst_lock */
	sp = freelist.forw;
	while (sp != &freelist) {
		/* fetch the successor before releasing sp */
		next = sp->forw;
		nfs4_server_rele(sp);	/* free the list's reference */
		sp = next;
	}

#ifdef DEBUG
	/* every program slot is expected to have been cleared by now */
	for (i = 0; i < nfs4_num_prognums; i++) {
		ASSERT(ncg->nfs4prog2server[i] == NULL);
	}
#endif
	kmem_free(ncg->nfs4prog2server, nfs4_num_prognums *
	    sizeof (struct nfs4_server *));

	/*
	 * NOTE(review): nfs4_cb_lock is entered here and destroyed below
	 * without an intervening mutex_exit() -- confirm this is intended.
	 */
	mutex_enter(&ncg->nfs4_cb_lock);
	while ((p = list_head(&ncg->nfs4_cb_ports)) != NULL) {
		list_remove(&ncg->nfs4_cb_ports, p);
		kmem_free(p, sizeof (*p));
	}
	list_destroy(&ncg->nfs4_cb_ports);
	mutex_destroy(&ncg->nfs4_cb_lock);
	list_destroy(&ncg->nfs4_dlist);
	mutex_destroy(&ncg->nfs4_dlist_lock);
	kmem_free(ncg, sizeof (*ncg));
}
1243 
1244 void
1245 nfs4_callback_init(void)
1246 {
1247 	int i;
1248 	SVC_CALLOUT *nfs4_cb_sc;
1249 
1250 	/* initialize the callback table */
1251 	nfs4_cb_sc = kmem_alloc(nfs4_num_prognums *
1252 	    sizeof (SVC_CALLOUT), KM_SLEEP);
1253 
1254 	for (i = 0; i < nfs4_num_prognums; i++) {
1255 		nfs4_cb_sc[i].sc_prog = NFS4_CALLBACK+i;
1256 		nfs4_cb_sc[i].sc_versmin = NFS_CB;
1257 		nfs4_cb_sc[i].sc_versmax = NFS_CB;
1258 		nfs4_cb_sc[i].sc_dispatch = cb_dispatch;
1259 	}
1260 
1261 	nfs4_cb_sct.sct_size = nfs4_num_prognums;
1262 	nfs4_cb_sct.sct_free = FALSE;
1263 	nfs4_cb_sct.sct_sc = nfs4_cb_sc;
1264 
1265 	/*
1266 	 * Compute max bytes required for dyamically allocated parts
1267 	 * of cb_getattr reply.  Only size and change are supported now.
1268 	 * If CB_GETATTR is changed to reply with additional attrs,
1269 	 * additional sizes must be added below.
1270 	 *
1271 	 * fattr4_change + fattr4_size == uint64_t + uint64_t
1272 	 */
1273 	cb_getattr_bytes = 2 * BYTES_PER_XDR_UNIT + 2 * BYTES_PER_XDR_UNIT;
1274 
1275 	zone_key_create(&nfs4_callback_zone_key, nfs4_callback_init_zone,
1276 	    nfs4_callback_shutdown_zone, nfs4_callback_fini_zone);
1277 }
1278 
/*
 * Counterpart of nfs4_callback_init().  Currently a no-op: the body is empty.
 *
 * NOTE(review): the SVC_CALLOUT table allocated in nfs4_callback_init() and
 * the zone key created there are not released here -- confirm whether that
 * is intended.
 */
void
nfs4_callback_fini(void)
{
}
1283 
/*
 * Clear the delegation state recorded in 'rp' and unlink the rnode from the
 * server's s_deleg_list, dropping the associated references.  Does nothing
 * if the rnode no longer records a delegation.
 *
 * rp  - rnode whose delegation is being cleaned up.
 * np  - server the delegation belongs to, or NULL to have it looked up via
 *       find_nfs4_server_all().  When non-NULL, s_lock is acquired here, so
 *       the caller must not hold it.
 * ncg - callback globals whose "delegations" counter is decremented; may be
 *       NULL (see below), in which case the counter is left alone.
 *
 * NB: This function can be called from the *wrong* zone (ie, the zone that
 * 'rp' belongs to and the caller's zone may not be the same).  This can happen
 * if the zone is going away and we get called from nfs4_async_inactive().  In
 * this case the globals will be NULL and we won't update the counters, which
 * doesn't matter as the zone is going away anyhow.
 */
static void
nfs4delegreturn_cleanup_impl(rnode4_t *rp, nfs4_server_t *np,
	struct nfs4_callback_globals *ncg)
{
	mntinfo4_t *mi = VTOMI4(RTOV4(rp));
	/* B_TRUE when the server was looked up here and must be released */
	boolean_t need_rele = B_FALSE;

	/*
	 * Caller must be holding mi_recovlock in read mode
	 * to call here.  This is provided by start_op.
	 * Delegation management requires to grab s_lock
	 * first and then r_statev4_lock.
	 */

	if (np == NULL) {
		/*
		 * NOTE(review): presumably returns with s_lock held and a
		 * reference taken (both are released below) -- confirm.
		 */
		np = find_nfs4_server_all(mi, 1);
		if (np == NULL)
			return;
		need_rele = B_TRUE;
	} else {
		mutex_enter(&np->s_lock);
	}

	mutex_enter(&rp->r_statev4_lock);

	/* Nothing to clean up if the delegation is already gone. */
	if (rp->r_deleg_type == OPEN_DELEGATE_NONE) {
		mutex_exit(&rp->r_statev4_lock);
		mutex_exit(&np->s_lock);
		if (need_rele)
			nfs4_server_rele(np);
		return;
	}

	/*
	 * Free the cred originally held when
	 * the delegation was granted.  Caller must
	 * hold this cred if it wants to use it after
	 * this call.
	 */
	crfree(rp->r_deleg_cred);
	rp->r_deleg_cred = NULL;
	rp->r_deleg_type = OPEN_DELEGATE_NONE;
	rp->r_deleg_needs_recovery = OPEN_DELEGATE_NONE;
	rp->r_deleg_needs_recall = FALSE;
	rp->r_deleg_return_pending = FALSE;

	/*
	 * Remove the rnode from the server's list and
	 * update the ref counts.
	 */
	list_remove(&np->s_deleg_list, rp);
	mutex_exit(&rp->r_statev4_lock);
	nfs4_dec_state_ref_count_nolock(np, mi);
	mutex_exit(&np->s_lock);
	/* removed list node removes a reference */
	nfs4_server_rele(np);
	/* drop the reference from the lookup above, if we did one */
	if (need_rele)
		nfs4_server_rele(np);
	if (ncg != NULL)
		ncg->nfs4_callback_stats.delegations.value.ui64--;
}
1352 
1353 void
1354 nfs4delegreturn_cleanup(rnode4_t *rp, nfs4_server_t *np)
1355 {
1356 	struct nfs4_callback_globals *ncg;
1357 
1358 	if (np != NULL) {
1359 		ncg = np->zone_globals;
1360 	} else if (nfs_zone() == VTOMI4(RTOV4(rp))->mi_zone) {
1361 		ncg = zone_getspecific(nfs4_callback_zone_key, nfs_zone());
1362 		ASSERT(ncg != NULL);
1363 	} else {
1364 		/*
1365 		 * Request coming from the wrong zone.
1366 		 */
1367 		ASSERT(getzoneid() == GLOBAL_ZONEID);
1368 		ncg = NULL;
1369 	}
1370 
1371 	nfs4delegreturn_cleanup_impl(rp, np, ncg);
1372 }
1373 
1374 static void
1375 nfs4delegreturn_save_lost_rqst(int error, nfs4_lost_rqst_t *lost_rqstp,
1376 	cred_t *cr, vnode_t *vp)
1377 {
1378 	if (error != ETIMEDOUT && error != EINTR &&
1379 	    !NFS4_FRC_UNMT_ERR(error, vp->v_vfsp)) {
1380 		lost_rqstp->lr_op = 0;
1381 		return;
1382 	}
1383 
1384 	NFS4_DEBUG(nfs4_lost_rqst_debug, (CE_NOTE,
1385 	    "nfs4close_save_lost_rqst: error %d", error));
1386 
1387 	lost_rqstp->lr_op = OP_DELEGRETURN;
1388 	/*
1389 	 * The vp is held and rele'd via the recovery code.
1390 	 * See nfs4_save_lost_rqst.
1391 	 */
1392 	lost_rqstp->lr_vp = vp;
1393 	lost_rqstp->lr_dvp = NULL;
1394 	lost_rqstp->lr_oop = NULL;
1395 	lost_rqstp->lr_osp = NULL;
1396 	lost_rqstp->lr_lop = NULL;
1397 	lost_rqstp->lr_cr = cr;
1398 	lost_rqstp->lr_flk = NULL;
1399 	lost_rqstp->lr_putfirst = FALSE;
1400 }
1401 
1402 static void
1403 nfs4delegreturn_otw(rnode4_t *rp, cred_t *cr, nfs4_error_t *ep)
1404 {
1405 	COMPOUND4args_clnt args;
1406 	COMPOUND4res_clnt res;
1407 	nfs_argop4 argops[3];
1408 	nfs4_ga_res_t *garp = NULL;
1409 	hrtime_t t;
1410 	int numops;
1411 	int doqueue = 1;
1412 
1413 	args.ctag = TAG_DELEGRETURN;
1414 
1415 	numops = 3;		/* PUTFH, GETATTR, DELEGRETURN */
1416 
1417 	args.array = argops;
1418 	args.array_len = numops;
1419 
1420 	argops[0].argop = OP_CPUTFH;
1421 	argops[0].nfs_argop4_u.opcputfh.sfh = rp->r_fh;
1422 
1423 	argops[1].argop = OP_GETATTR;
1424 	argops[1].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK;
1425 	argops[1].nfs_argop4_u.opgetattr.mi = VTOMI4(RTOV4(rp));
1426 
1427 	argops[2].argop = OP_DELEGRETURN;
1428 	argops[2].nfs_argop4_u.opdelegreturn.deleg_stateid =
1429 	    rp->r_deleg_stateid;
1430 
1431 	t = gethrtime();
1432 	rfs4call(VTOMI4(RTOV4(rp)), &args, &res, cr, &doqueue, 0, ep);
1433 
1434 	if (ep->error)
1435 		return;
1436 
1437 	if (res.status == NFS4_OK) {
1438 		garp = &res.array[1].nfs_resop4_u.opgetattr.ga_res;
1439 		nfs4_attr_cache(RTOV4(rp), garp, t, cr, TRUE, NULL);
1440 
1441 	}
1442 	(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
1443 }
1444 
/*
 * Return the delegation held on 'rp' to the server, cooperating with the
 * recovery framework.
 *
 * Each pass of the loop brackets one attempt with nfs4_start_fop() /
 * nfs4_end_op().  The loop ends when the delegation has been cleaned up,
 * when it is found to be gone already, when the return has been handed to
 * recovery (recovonly), or when nfs4_start_fop() fails.  If an attempt needs
 * recovery, recovery is started and the loop goes around again.
 *
 * rp    - rnode holding the delegation.
 * flags - NFS4_DR_* flags; only NFS4_DR_FORCE is examined here.
 * cr    - credential used for the over-the-wire call and lost requests.
 * ncg   - zone callback globals; dereferenced unconditionally, so it must
 *         not be NULL.
 *
 * Returns the errno value left in the nfs4_error_t by the last step taken.
 */
int
nfs4_do_delegreturn(rnode4_t *rp, int flags, cred_t *cr,
	struct nfs4_callback_globals *ncg)
{
	vnode_t *vp = RTOV4(rp);
	mntinfo4_t *mi = VTOMI4(vp);
	nfs4_lost_rqst_t lost_rqst;
	nfs4_recov_state_t recov_state;
	bool_t needrecov = FALSE, recovonly, done = FALSE;
	nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };

	ncg->nfs4_callback_stats.delegreturn.value.ui64++;

	while (!done) {
		e.error = nfs4_start_fop(mi, vp, NULL, OH_DELEGRETURN,
		    &recov_state, &recovonly);

		if (e.error) {
			/*
			 * start_fop failed, so mi_recovlock (which the
			 * cleanup routine requires in read mode) is taken
			 * by hand for a forced cleanup.
			 */
			if (flags & NFS4_DR_FORCE) {
				(void) nfs_rw_enter_sig(&mi->mi_recovlock,
				    RW_READER, 0);
				nfs4delegreturn_cleanup_impl(rp, NULL, ncg);
				nfs_rw_exit(&mi->mi_recovlock);
			}
			break;
		}

		/*
		 * Check to see if the delegation has already been
		 * returned by the recovery thread.   The state of
		 * the delegation cannot change at this point due
		 * to start_fop and the r_deleg_recall_lock.
		 */
		if (rp->r_deleg_type == OPEN_DELEGATE_NONE) {
			e.error = 0;
			nfs4_end_op(mi, vp, NULL, &recov_state, needrecov);
			break;
		}

		if (recovonly) {
			/*
			 * Delegation will be returned via the
			 * recovery framework.  Build a lost request
			 * structure, start recovery and get out.
			 */
			nfs4_error_init(&e, EINTR);
			nfs4delegreturn_save_lost_rqst(e.error, &lost_rqst,
			    cr, vp);
			/* pass the lost request only if one was recorded */
			(void) nfs4_start_recovery(&e, mi, vp,
			    NULL, &rp->r_deleg_stateid,
			    lost_rqst.lr_op == OP_DELEGRETURN ?
			    &lost_rqst : NULL, OP_DELEGRETURN, NULL,
			    NULL, NULL);
			nfs4_end_op(mi, vp, NULL, &recov_state, needrecov);
			break;
		}

		nfs4delegreturn_otw(rp, cr, &e);

		/*
		 * Ignore some errors on delegreturn; no point in marking
		 * the file dead on a state destroying operation.
		 */
		if (e.error == 0 && (nfs4_recov_marks_dead(e.stat) ||
		    e.stat == NFS4ERR_BADHANDLE ||
		    e.stat == NFS4ERR_STALE))
			needrecov = FALSE;
		else
			needrecov = nfs4_needs_recovery(&e, TRUE, vp->v_vfsp);

		if (needrecov) {
			/* start recovery, then loop for another attempt */
			nfs4delegreturn_save_lost_rqst(e.error, &lost_rqst,
			    cr, vp);
			/* pass the lost request only if one was recorded */
			(void) nfs4_start_recovery(&e, mi, vp,
			    NULL, &rp->r_deleg_stateid,
			    lost_rqst.lr_op == OP_DELEGRETURN ?
			    &lost_rqst : NULL, OP_DELEGRETURN, NULL,
			    NULL, NULL);
		} else {
			/* no recovery needed: drop the local state, finish */
			nfs4delegreturn_cleanup_impl(rp, NULL, ncg);
			done = TRUE;
		}

		nfs4_end_op(mi, vp, NULL, &recov_state, needrecov);
	}
	return (e.error);
}
1532 
1533 /*
1534  * nfs4_resend_delegreturn - used to drive the delegreturn
1535  * operation via the recovery thread.
1536  */
1537 void
1538 nfs4_resend_delegreturn(nfs4_lost_rqst_t *lorp, nfs4_error_t *ep,
1539 	nfs4_server_t *np)
1540 {
1541 	rnode4_t *rp = VTOR4(lorp->lr_vp);
1542 
1543 	/* If the file failed recovery, just quit. */
1544 	mutex_enter(&rp->r_statelock);
1545 	if (rp->r_flags & R4RECOVERR) {
1546 		ep->error = EIO;
1547 	}
1548 	mutex_exit(&rp->r_statelock);
1549 
1550 	if (!ep->error)
1551 		nfs4delegreturn_otw(rp, lorp->lr_cr, ep);
1552 
1553 	/*
1554 	 * If recovery is now needed, then return the error
1555 	 * and status and let the recovery thread handle it,
1556 	 * including re-driving another delegreturn.  Otherwise,
1557 	 * just give up and clean up the delegation.
1558 	 */
1559 	if (nfs4_needs_recovery(ep, TRUE, lorp->lr_vp->v_vfsp))
1560 		return;
1561 
1562 	if (rp->r_deleg_type != OPEN_DELEGATE_NONE)
1563 		nfs4delegreturn_cleanup(rp, np);
1564 
1565 	nfs4_error_zinit(ep);
1566 }
1567 
/*
 * nfs4delegreturn - general function to return a delegation.
 *
 * NFS4_DR_FORCE - return the delegation even if start_op fails
 * NFS4_DR_PUSH - push modified data back to the server via VOP_PUTPAGE
 * NFS4_DR_DISCARD - discard the delegation w/o delegreturn
 * NFS4_DR_DID_OP - calling function already did nfs4_start_op
 * NFS4_DR_RECALL - delegreturn initiated via CB_RECALL
 * NFS4_DR_REOPEN - do file reopens, if applicable
 *
 * rp    - rnode holding the delegation.
 * flags - combination of the NFS4_DR_* flags above.
 * ncg   - zone callback globals, passed on to the helpers.
 *
 * Returns 0 when the work was short-circuited, handed to an async thread or
 * there was no delegation; otherwise the error from deleg_reopen() or
 * nfs4_do_delegreturn().
 */
static int
nfs4delegreturn_impl(rnode4_t *rp, int flags, struct nfs4_callback_globals *ncg)
{
	int error = 0;
	cred_t *cr = NULL;		/* held copy of r_deleg_cred, if any */
	vnode_t *vp;
	bool_t needrecov = FALSE;
	bool_t rw_entered = FALSE;	/* r_deleg_recall_lock taken below */
	bool_t do_reopen;

	vp = RTOV4(rp);

	/*
	 * If NFS4_DR_DISCARD is set by itself, take a short-cut and
	 * discard without doing an otw DELEGRETURN.  This may only be used
	 * by the recovery thread because it bypasses the synchronization
	 * with r_deleg_recall_lock and mi->mi_recovlock.
	 */
	if (flags == NFS4_DR_DISCARD) {
		nfs4delegreturn_cleanup_impl(rp, NULL, ncg);
		return (0);
	}

	if (flags & NFS4_DR_DID_OP) {
		/*
		 * Caller had already done start_op, which means the
		 * r_deleg_recall_lock is already held in READ mode
		 * so we cannot take it in write mode.  Return the
		 * delegation asynchronously.
		 *
		 * Remove the NFS4_DR_DID_OP flag so we don't
		 * get stuck looping through here.
		 */
		/* this hold is released by the async thread when it exits */
		VN_HOLD(vp);
		nfs4delegreturn_async(rp, (flags & ~NFS4_DR_DID_OP), FALSE);
		return (0);
	}

	/*
	 * Verify we still have a delegation and crhold the credential.
	 */
	mutex_enter(&rp->r_statev4_lock);
	if (rp->r_deleg_type == OPEN_DELEGATE_NONE) {
		mutex_exit(&rp->r_statev4_lock);
		goto out;
	}
	cr = rp->r_deleg_cred;
	ASSERT(cr != NULL);
	crhold(cr);
	mutex_exit(&rp->r_statev4_lock);

	/*
	 * Push the modified data back to the server synchronously
	 * before doing DELEGRETURN.
	 */
	if (flags & NFS4_DR_PUSH)
		(void) VOP_PUTPAGE(vp, 0, 0, 0, cr, NULL);

	/*
	 * Take r_deleg_recall_lock in WRITE mode, this will prevent
	 * nfs4_is_otw_open_necessary from trying to use the delegation
	 * while the DELEGRETURN is in progress.
	 */
	(void) nfs_rw_enter_sig(&rp->r_deleg_recall_lock, RW_WRITER, FALSE);

	rw_entered = TRUE;

	/* re-check: the delegation may have gone away in the meantime */
	if (rp->r_deleg_type == OPEN_DELEGATE_NONE)
		goto out;

	if (flags & NFS4_DR_REOPEN) {
		/*
		 * If R4RECOVERRP is already set, then skip re-opening
		 * the delegation open streams and go straight to doing
		 * delegreturn.  (XXX if the file has failed recovery, then the
		 * delegreturn attempt is likely to be futile.)
		 */
		mutex_enter(&rp->r_statelock);
		do_reopen = !(rp->r_flags & R4RECOVERRP);
		mutex_exit(&rp->r_statelock);

		if (do_reopen) {
			error = deleg_reopen(vp, &needrecov, ncg, flags);
			/*
			 * A reopen error is only ignored for FORCE or
			 * RECALL; started recovery is only ignored for FORCE.
			 */
			if (error != 0) {
				if ((flags & (NFS4_DR_FORCE | NFS4_DR_RECALL))
				    == 0)
					goto out;
			} else if (needrecov) {
				if ((flags & NFS4_DR_FORCE) == 0)
					goto out;
			}
		}
	}

	if (flags & NFS4_DR_DISCARD) {
		mntinfo4_t *mi = VTOMI4(RTOV4(rp));

		mutex_enter(&rp->r_statelock);
		/*
		 * deleg_return_pending is cleared inside of delegation_accept
		 * when a delegation is accepted.  if this flag has been
		 * cleared, then a new delegation has overwritten the one we
		 * were about to throw away.
		 */
		if (!rp->r_deleg_return_pending) {
			mutex_exit(&rp->r_statelock);
			goto out;
		}
		mutex_exit(&rp->r_statelock);
		/* the cleanup routine requires mi_recovlock in read mode */
		(void) nfs_rw_enter_sig(&mi->mi_recovlock, RW_READER, FALSE);
		nfs4delegreturn_cleanup_impl(rp, NULL, ncg);
		nfs_rw_exit(&mi->mi_recovlock);
	} else {
		error = nfs4_do_delegreturn(rp, flags, cr, ncg);
	}

out:
	/* common exit: undo the crhold and the recall lock, if taken */
	if (cr)
		crfree(cr);
	if (rw_entered)
		nfs_rw_exit(&rp->r_deleg_recall_lock);
	return (error);
}
1701 
1702 int
1703 nfs4delegreturn(rnode4_t *rp, int flags)
1704 {
1705 	struct nfs4_callback_globals *ncg;
1706 
1707 	ncg = zone_getspecific(nfs4_callback_zone_key, nfs_zone());
1708 	ASSERT(ncg != NULL);
1709 
1710 	return (nfs4delegreturn_impl(rp, flags, ncg));
1711 }
1712 
1713 void
1714 nfs4delegreturn_async(rnode4_t *rp, int flags, bool_t trunc)
1715 {
1716 	struct cb_recall_pass *pp;
1717 
1718 	pp = kmem_alloc(sizeof (struct cb_recall_pass), KM_SLEEP);
1719 	pp->rp = rp;
1720 	pp->flags = flags;
1721 	pp->truncate = trunc;
1722 
1723 	/*
1724 	 * Fire up a thread to do the actual delegreturn
1725 	 * Caller must guarantee that the rnode doesn't
1726 	 * vanish (by calling VN_HOLD).
1727 	 */
1728 
1729 	(void) zthread_create(NULL, 0, nfs4delegreturn_thread, pp, 0,
1730 	    minclsyspri);
1731 }
1732 
1733 static void
1734 delegreturn_all_thread(rpcprog_t *pp)
1735 {
1736 	nfs4_server_t *np;
1737 	bool_t found = FALSE;
1738 	rpcprog_t prog;
1739 	rnode4_t *rp;
1740 	vnode_t *vp;
1741 	zoneid_t zoneid = getzoneid();
1742 	struct nfs4_callback_globals *ncg;
1743 
1744 	NFS4_DEBUG(nfs4_drat_debug,
1745 	    (CE_NOTE, "delereturn_all_thread: prog %d\n", *pp));
1746 
1747 	prog = *pp;
1748 	kmem_free(pp, sizeof (*pp));
1749 	pp = NULL;
1750 
1751 	mutex_enter(&nfs4_server_lst_lock);
1752 	for (np = nfs4_server_lst.forw; np != &nfs4_server_lst; np = np->forw) {
1753 		if (np->zoneid == zoneid && np->s_program == prog) {
1754 			mutex_enter(&np->s_lock);
1755 			found = TRUE;
1756 			break;
1757 		}
1758 	}
1759 	mutex_exit(&nfs4_server_lst_lock);
1760 
1761 	/*
1762 	 * It's possible that the nfs4_server which was using this
1763 	 * program number has vanished since this thread is async.
1764 	 * If so, just return.  Your work here is finished, my friend.
1765 	 */
1766 	if (!found)
1767 		goto out;
1768 
1769 	ncg = np->zone_globals;
1770 	while ((rp = list_head(&np->s_deleg_list)) != NULL) {
1771 		vp = RTOV4(rp);
1772 		VN_HOLD(vp);
1773 		mutex_exit(&np->s_lock);
1774 		(void) nfs4delegreturn_impl(rp, NFS4_DR_PUSH|NFS4_DR_REOPEN,
1775 		    ncg);
1776 		VN_RELE(vp);
1777 
1778 		/* retake the s_lock for next trip through the loop */
1779 		mutex_enter(&np->s_lock);
1780 	}
1781 	mutex_exit(&np->s_lock);
1782 out:
1783 	NFS4_DEBUG(nfs4_drat_debug,
1784 	    (CE_NOTE, "delereturn_all_thread: complete\n"));
1785 	zthread_exit();
1786 }
1787 
1788 void
1789 nfs4_delegreturn_all(nfs4_server_t *sp)
1790 {
1791 	rpcprog_t pro, *pp;
1792 
1793 	mutex_enter(&sp->s_lock);
1794 
1795 	/* Check to see if the delegation list is empty */
1796 
1797 	if (list_head(&sp->s_deleg_list) == NULL) {
1798 		mutex_exit(&sp->s_lock);
1799 		return;
1800 	}
1801 	/*
1802 	 * Grab the program number; the async thread will use this
1803 	 * to find the nfs4_server.
1804 	 */
1805 	pro = sp->s_program;
1806 	mutex_exit(&sp->s_lock);
1807 	pp = kmem_alloc(sizeof (rpcprog_t), KM_SLEEP);
1808 	*pp = pro;
1809 	(void) zthread_create(NULL, 0, delegreturn_all_thread, pp, 0,
1810 	    minclsyspri);
1811 }
1812 
1813 
/*
 * Discard any delegations
 *
 * Iterate over the servers s_deleg_list and
 * for matching mount-point rnodes discard
 * the delegation.
 *
 * mi - mount whose rnodes are to lose their delegations.
 * sp - server whose s_deleg_list is walked; the caller must hold sp->s_lock
 *      (asserted below).
 *
 * Unlike nfs4delegreturn_cleanup_impl(), the server reference for each
 * discarded delegation is dropped by decrementing s_refcnt directly.
 */
void
nfs4_deleg_discard(mntinfo4_t *mi, nfs4_server_t *sp)
{
	rnode4_t *rp, *next;
	mntinfo4_t *r_mi;
	struct nfs4_callback_globals *ncg;

	ASSERT(mutex_owned(&sp->s_lock));
	ncg = sp->zone_globals;

	for (rp = list_head(&sp->s_deleg_list); rp != NULL; rp = next) {
		r_mi = VTOMI4(RTOV4(rp));
		/* fetch the successor now; rp may be unlinked below */
		next = list_next(&sp->s_deleg_list, rp);

		if (r_mi != mi) {
			/*
			 * Skip if this rnode is not on the
			 * same mount-point
			 */
			continue;
		}

		ASSERT(rp->r_deleg_type == OPEN_DELEGATE_READ);

#ifdef DEBUG
		if (nfs4_client_recov_debug) {
			zprintf(getzoneid(),
			    "nfs4_deleg_discard: matched rnode %p "
			"-- discarding delegation\n", (void *)rp);
		}
#endif
		mutex_enter(&rp->r_statev4_lock);
		/*
		 * Free the cred originally held when the delegation
		 * was granted. Also need to decrement the refcnt
		 * on this server for each delegation we discard
		 */
		if (rp->r_deleg_cred)
			crfree(rp->r_deleg_cred);
		rp->r_deleg_cred = NULL;
		rp->r_deleg_type = OPEN_DELEGATE_NONE;
		rp->r_deleg_needs_recovery = OPEN_DELEGATE_NONE;
		rp->r_deleg_needs_recall = FALSE;
		/* the count must stay above zero after this decrement */
		ASSERT(sp->s_refcnt > 1);
		sp->s_refcnt--;
		list_remove(&sp->s_deleg_list, rp);
		mutex_exit(&rp->r_statev4_lock);
		nfs4_dec_state_ref_count_nolock(sp, mi);
		ncg->nfs4_callback_stats.delegations.value.ui64--;
	}
}
1872 
/*
 * Reopen any open streams that were covered by the given file's
 * delegation.
 * Returns zero or an errno value.  If there was no error, *recovp
 * indicates whether recovery was initiated.
 *
 * vp     - vnode whose delegation open streams are reopened.
 * recovp - set to TRUE when recovery is started here; never cleared, so the
 *          caller must initialize it.
 * ncg    - zone callback globals, used for the claim_cur, claim_cur_ok and
 *          recall_failed counters.
 * flags  - NFS4_DR_* flags; NFS4_DR_DISCARD selects CLAIM_NULL reopens
 *          instead of CLAIM_DELEGATE_CUR.
 */

static int
deleg_reopen(vnode_t *vp, bool_t *recovp, struct nfs4_callback_globals *ncg,
	int flags)
{
	nfs4_open_stream_t *osp;
	nfs4_recov_state_t recov_state;
	bool_t needrecov = FALSE;
	mntinfo4_t *mi;
	rnode4_t *rp;
	nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };
	int claimnull;

	mi = VTOMI4(vp);
	rp = VTOR4(vp);

	recov_state.rs_flags = 0;
	recov_state.rs_num_retry_despite_err = 0;

retry:
	if ((e.error = nfs4_start_op(mi, vp, NULL, &recov_state)) != 0) {
		return (e.error);
	}

	/*
	 * if we mean to discard the delegation, it must be BAD, so don't
	 * use it when doing the reopen or it will fail too.
	 */
	claimnull = (flags & NFS4_DR_DISCARD);
	/*
	 * Loop through the open streams for this rnode to find
	 * all of the ones created using the delegation state ID.
	 * Each of these needs to be re-opened.
	 */

	/* each stream returned carries a reference that must be released */
	while ((osp = get_next_deleg_stream(rp, claimnull)) != NULL) {

		if (claimnull) {
			nfs4_reopen(vp, osp, &e, CLAIM_NULL, FALSE, FALSE);
		} else {
			ncg->nfs4_callback_stats.claim_cur.value.ui64++;

			nfs4_reopen(vp, osp, &e, CLAIM_DELEGATE_CUR, FALSE,
			    FALSE);
			if (e.error == 0 && e.stat == NFS4_OK)
				ncg->nfs4_callback_stats.
				    claim_cur_ok.value.ui64++;
		}

		/* EAGAIN: end this op and start over from nfs4_start_op */
		if (e.error == EAGAIN) {
			open_stream_rele(osp, rp);
			nfs4_end_op(mi, vp, NULL, &recov_state, TRUE);
			goto retry;
		}

		/*
		 * if error is EINTR, ETIMEDOUT, or NFS4_FRC_UNMT_ERR, then
		 * recovery has already been started inside of nfs4_reopen.
		 */
		if (e.error == EINTR || e.error == ETIMEDOUT ||
		    NFS4_FRC_UNMT_ERR(e.error, vp->v_vfsp)) {
			open_stream_rele(osp, rp);
			break;
		}

		needrecov = nfs4_needs_recovery(&e, TRUE, vp->v_vfsp);

		if (e.error != 0 && !needrecov) {
			/*
			 * Recovery is not possible, but don't give up yet;
			 * we'd still like to do delegreturn after
			 * reopening as many streams as possible.
			 * Continue processing the open streams.
			 */

			ncg->nfs4_callback_stats.recall_failed.value.ui64++;

		} else if (needrecov) {
			/*
			 * Start recovery and bail out.  The recovery
			 * thread will take it from here.
			 */
			(void) nfs4_start_recovery(&e, mi, vp, NULL, NULL,
			    NULL, OP_OPEN, NULL, NULL, NULL);
			open_stream_rele(osp, rp);
			*recovp = TRUE;
			break;
		}

		open_stream_rele(osp, rp);
	}

	nfs4_end_op(mi, vp, NULL, &recov_state, needrecov);

	return (e.error);
}
1975 
1976 /*
1977  * get_next_deleg_stream - returns the next open stream which
1978  * represents a delegation for this rnode.  In order to assure
1979  * forward progress, the caller must guarantee that each open
1980  * stream returned is changed so that a future call won't return
1981  * it again.
1982  *
1983  * There are several ways for the open stream to change.  If the open
1984  * stream is !os_delegation, then we aren't interested in it.  Also, if
1985  * either os_failed_reopen or !os_valid, then don't return the osp.
1986  *
1987  * If claimnull is false (doing reopen CLAIM_DELEGATE_CUR) then return
1988  * the osp if it is an os_delegation open stream.  Also, if the rnode still
1989  * has r_deleg_return_pending, then return the os_delegation osp.  Lastly,
1990  * if the rnode's r_deleg_stateid is different from the osp's open_stateid,
1991  * then return the osp.
1992  *
1993  * We have already taken the 'r_deleg_recall_lock' as WRITER, which
1994  * prevents new OPENs from going OTW (as start_fop takes this
1995  * lock in READ mode); thus, no new open streams can be created
1996  * (which inherently means no new delegation open streams are
1997  * being created).
1998  */
1999 
2000 static nfs4_open_stream_t *
2001 get_next_deleg_stream(rnode4_t *rp, int claimnull)
2002 {
2003 	nfs4_open_stream_t	*osp;
2004 
2005 	ASSERT(nfs_rw_lock_held(&rp->r_deleg_recall_lock, RW_WRITER));
2006 
2007 	/*
2008 	 * Search through the list of open streams looking for
2009 	 * one that was created while holding the delegation.
2010 	 */
2011 	mutex_enter(&rp->r_os_lock);
2012 	for (osp = list_head(&rp->r_open_streams); osp != NULL;
2013 	    osp = list_next(&rp->r_open_streams, osp)) {
2014 		mutex_enter(&osp->os_sync_lock);
2015 		if (!osp->os_delegation || osp->os_failed_reopen ||
2016 		    !osp->os_valid) {
2017 			mutex_exit(&osp->os_sync_lock);
2018 			continue;
2019 		}
2020 		if (!claimnull || rp->r_deleg_return_pending ||
2021 		    !stateid4_cmp(&osp->open_stateid, &rp->r_deleg_stateid)) {
2022 			osp->os_ref_count++;
2023 			mutex_exit(&osp->os_sync_lock);
2024 			mutex_exit(&rp->r_os_lock);
2025 			return (osp);
2026 		}
2027 		mutex_exit(&osp->os_sync_lock);
2028 	}
2029 	mutex_exit(&rp->r_os_lock);
2030 
2031 	return (NULL);
2032 }
2033 
/*
 * Thread body started by nfs4delegreturn_async(): flush or invalidate the
 * file's pages as appropriate and then return the delegation through
 * nfs4delegreturn_impl().
 *
 * args - kmem-allocated cb_recall_pass carrying the rnode, the NFS4_DR_*
 *        flags and the truncate indication; it is freed here.
 *
 * On exit the vnode hold taken by the caller of nfs4delegreturn_async() is
 * released and the thread terminates via zthread_exit().
 */
static void
nfs4delegreturn_thread(struct cb_recall_pass *args)
{
	rnode4_t *rp;
	vnode_t *vp;
	cred_t *cr;
	int dtype, error, flags;
	bool_t rdirty, rip;
	kmutex_t cpr_lock;
	callb_cpr_t cpr_info;
	struct nfs4_callback_globals *ncg;

	ncg = zone_getspecific(nfs4_callback_zone_key, nfs_zone());
	ASSERT(ncg != NULL);

	mutex_init(&cpr_lock, NULL, MUTEX_DEFAULT, NULL);

	CALLB_CPR_INIT(&cpr_info, &cpr_lock, callb_generic_cpr,
	    "nfsv4delegRtn");

	rp = args->rp;
	vp = RTOV4(rp);

	/* cheap early exit if the delegation is already gone */
	mutex_enter(&rp->r_statev4_lock);
	if (rp->r_deleg_type == OPEN_DELEGATE_NONE) {
		mutex_exit(&rp->r_statev4_lock);
		goto out;
	}
	mutex_exit(&rp->r_statev4_lock);

	/*
	 * Take the read-write lock in read mode to prevent other
	 * threads from modifying the data during the recall.  This
	 * doesn't affect mmappers.
	 */
	(void) nfs_rw_enter_sig(&rp->r_rwlock, RW_READER, FALSE);

	/* Proceed with delegreturn */

	/* re-check now that r_rwlock is held, and capture type and cred */
	mutex_enter(&rp->r_statev4_lock);
	if (rp->r_deleg_type == OPEN_DELEGATE_NONE) {
		mutex_exit(&rp->r_statev4_lock);
		nfs_rw_exit(&rp->r_rwlock);
		goto out;
	}
	dtype = rp->r_deleg_type;
	cr = rp->r_deleg_cred;
	ASSERT(cr != NULL);
	crhold(cr);
	mutex_exit(&rp->r_statev4_lock);

	flags = args->flags;

	/*
	 * If the file is being truncated at the server, then throw
	 * away all of the pages, it doesn't matter what flavor of
	 * delegation we have.
	 */

	if (args->truncate) {
		ncg->nfs4_callback_stats.recall_trunc.value.ui64++;
		nfs4_invalidate_pages(vp, 0, cr);
	} else if (dtype == OPEN_DELEGATE_WRITE) {

		mutex_enter(&rp->r_statelock);
		rdirty = rp->r_flags & R4DIRTY;
		mutex_exit(&rp->r_statelock);

		if (rdirty) {
			error = VOP_PUTPAGE(vp, 0, 0, 0, cr, NULL);

			/* a failed push is only logged */
			if (error)
				CB_WARN1("nfs4delegreturn_thread:"
				" VOP_PUTPAGE: %d\n", error);
		}
		/* turn off NFS4_DR_PUSH because we just did that above. */
		flags &= ~NFS4_DR_PUSH;
	}

	mutex_enter(&rp->r_statelock);
	rip =  rp->r_flags & R4RECOVERRP;
	mutex_exit(&rp->r_statelock);

	/* If a failed recovery is indicated, discard the pages */

	if (rip) {

		error = VOP_PUTPAGE(vp, 0, 0, B_INVAL, cr, NULL);

		if (error)
			CB_WARN1("nfs4delegreturn_thread: VOP_PUTPAGE: %d\n",
			    error);
	}

	/*
	 * Pass the flags to nfs4delegreturn_impl, but be sure not to pass
	 * NFS4_DR_DID_OP, which just calls nfs4delegreturn_async again.
	 */
	flags &= ~NFS4_DR_DID_OP;

	(void) nfs4delegreturn_impl(rp, flags, ncg);

	nfs_rw_exit(&rp->r_rwlock);
	crfree(cr);
out:
	kmem_free(args, sizeof (struct cb_recall_pass));
	/* drop the hold the caller of nfs4delegreturn_async() took */
	VN_RELE(vp);
	/*
	 * NOTE(review): CALLB_CPR_EXIT() is presumably what releases
	 * cpr_lock before it is destroyed -- confirm against the macro.
	 */
	mutex_enter(&cpr_lock);
	CALLB_CPR_EXIT(&cpr_info);
	mutex_destroy(&cpr_lock);
	zthread_exit();
}
2146 
2147 /*
2148  * This function has one assumption that the caller of this function is
2149  * either doing recovery (therefore cannot call nfs4_start_op) or has
2150  * already called nfs4_start_op().
2151  */
2152 void
2153 nfs4_delegation_accept(rnode4_t *rp, open_claim_type4 claim, OPEN4res *res,
2154 	nfs4_ga_res_t *garp, cred_t *cr)
2155 {
2156 	open_read_delegation4 *orp;
2157 	open_write_delegation4 *owp;
2158 	nfs4_server_t *np;
2159 	bool_t already = FALSE;
2160 	bool_t recall = FALSE;
2161 	bool_t valid_garp = TRUE;
2162 	bool_t delegation_granted = FALSE;
2163 	bool_t dr_needed = FALSE;
2164 	bool_t recov;
2165 	int dr_flags = 0;
2166 	long mapcnt;
2167 	uint_t rflag;
2168 	mntinfo4_t *mi;
2169 	struct nfs4_callback_globals *ncg;
2170 	open_delegation_type4 odt;
2171 
2172 	ncg = zone_getspecific(nfs4_callback_zone_key, nfs_zone());
2173 	ASSERT(ncg != NULL);
2174 
2175 	mi = VTOMI4(RTOV4(rp));
2176 
2177 	/*
2178 	 * Accept a delegation granted to the client via an OPEN.
2179 	 * Set the delegation fields in the rnode and insert the
2180 	 * rnode onto the list anchored in the nfs4_server_t.  The
2181 	 * proper locking order requires the nfs4_server_t first,
2182 	 * even though it may not be needed in all cases.
2183 	 *
2184 	 * NB: find_nfs4_server returns with s_lock held.
2185 	 */
2186 
2187 	if ((np = find_nfs4_server(mi)) == NULL)
2188 		return;
2189 
2190 	/* grab the statelock too, for examining r_mapcnt */
2191 	mutex_enter(&rp->r_statelock);
2192 	mutex_enter(&rp->r_statev4_lock);
2193 
2194 	if (rp->r_deleg_type == OPEN_DELEGATE_READ ||
2195 	    rp->r_deleg_type == OPEN_DELEGATE_WRITE)
2196 		already = TRUE;
2197 
2198 	odt = res->delegation.delegation_type;
2199 
2200 	if (odt == OPEN_DELEGATE_READ) {
2201 
2202 		rp->r_deleg_type = res->delegation.delegation_type;
2203 		orp = &res->delegation.open_delegation4_u.read;
2204 		rp->r_deleg_stateid = orp->stateid;
2205 		rp->r_deleg_perms = orp->permissions;
2206 		if (claim == CLAIM_PREVIOUS)
2207 			if ((recall = orp->recall) != 0)
2208 				dr_needed = TRUE;
2209 
2210 		delegation_granted = TRUE;
2211 
2212 		ncg->nfs4_callback_stats.delegations.value.ui64++;
2213 		ncg->nfs4_callback_stats.delegaccept_r.value.ui64++;
2214 
2215 	} else if (odt == OPEN_DELEGATE_WRITE) {
2216 
2217 		rp->r_deleg_type = res->delegation.delegation_type;
2218 		owp = &res->delegation.open_delegation4_u.write;
2219 		rp->r_deleg_stateid = owp->stateid;
2220 		rp->r_deleg_perms = owp->permissions;
2221 		rp->r_deleg_limit = owp->space_limit;
2222 		if (claim == CLAIM_PREVIOUS)
2223 			if ((recall = owp->recall) != 0)
2224 				dr_needed = TRUE;
2225 
2226 		delegation_granted = TRUE;
2227 
2228 		if (garp == NULL || !garp->n4g_change_valid) {
2229 			valid_garp = FALSE;
2230 			rp->r_deleg_change = 0;
2231 			rp->r_deleg_change_grant = 0;
2232 		} else {
2233 			rp->r_deleg_change = garp->n4g_change;
2234 			rp->r_deleg_change_grant = garp->n4g_change;
2235 		}
2236 		mapcnt = rp->r_mapcnt;
2237 		rflag = rp->r_flags;
2238 
2239 		/*
2240 		 * Update the delegation change attribute if
2241 		 * there are mappers for the file is dirty.  This
2242 		 * might be the case during recovery after server
2243 		 * reboot.
2244 		 */
2245 		if (mapcnt > 0 || rflag & R4DIRTY)
2246 			rp->r_deleg_change++;
2247 
2248 		NFS4_DEBUG(nfs4_callback_debug, (CE_NOTE,
2249 		    "nfs4_delegation_accept: r_deleg_change: 0x%x\n",
2250 		    (int)(rp->r_deleg_change >> 32)));
2251 		NFS4_DEBUG(nfs4_callback_debug, (CE_NOTE,
2252 		    "nfs4_delegation_accept: r_delg_change_grant: 0x%x\n",
2253 		    (int)(rp->r_deleg_change_grant >> 32)));
2254 
2255 
2256 		ncg->nfs4_callback_stats.delegations.value.ui64++;
2257 		ncg->nfs4_callback_stats.delegaccept_rw.value.ui64++;
2258 	} else if (already) {
2259 		/*
2260 		 * No delegation granted.  If the rnode currently has
2261 		 * has one, then consider it tainted and return it.
2262 		 */
2263 		dr_needed = TRUE;
2264 	}
2265 
2266 	if (delegation_granted) {
2267 		/* Add the rnode to the list. */
2268 		if (!already) {
2269 			crhold(cr);
2270 			rp->r_deleg_cred = cr;
2271 
2272 			ASSERT(mutex_owned(&np->s_lock));
2273 			list_insert_head(&np->s_deleg_list, rp);
2274 			/* added list node gets a reference */
2275 			np->s_refcnt++;
2276 			nfs4_inc_state_ref_count_nolock(np, mi);
2277 		}
2278 		rp->r_deleg_needs_recovery = OPEN_DELEGATE_NONE;
2279 	}
2280 
2281 	/*
2282 	 * We've now safely accepted the delegation, if any.  Drop the
2283 	 * locks and figure out what post-processing is needed.  We'd
2284 	 * like to retain r_statev4_lock, but nfs4_server_rele takes
2285 	 * s_lock which would be a lock ordering violation.
2286 	 */
2287 	mutex_exit(&rp->r_statev4_lock);
2288 	mutex_exit(&rp->r_statelock);
2289 	mutex_exit(&np->s_lock);
2290 	nfs4_server_rele(np);
2291 
2292 	/*
2293 	 * Check to see if we are in recovery.  Remember that
2294 	 * this function is protected by start_op, so a recovery
2295 	 * cannot begin until we are out of here.
2296 	 */
2297 	mutex_enter(&mi->mi_lock);
2298 	recov = mi->mi_recovflags & MI4_RECOV_ACTIV;
2299 	mutex_exit(&mi->mi_lock);
2300 
2301 	mutex_enter(&rp->r_statev4_lock);
2302 
2303 	if (nfs4_delegreturn_policy == IMMEDIATE || !valid_garp)
2304 		dr_needed = TRUE;
2305 
2306 	if (dr_needed && rp->r_deleg_return_pending == FALSE) {
2307 		if (recov) {
2308 			/*
2309 			 * We cannot call delegreturn from inside
2310 			 * of recovery or VOP_PUTPAGE will hang
2311 			 * due to nfs4_start_fop call in
2312 			 * nfs4write.  Use dlistadd to add the
2313 			 * rnode to the list of rnodes needing
2314 			 * cleaning.  We do not need to do reopen
2315 			 * here because recov_openfiles will do it.
2316 			 * In the non-recall case, just discard the
2317 			 * delegation as it is no longer valid.
2318 			 */
2319 			if (recall)
2320 				dr_flags = NFS4_DR_PUSH;
2321 			else
2322 				dr_flags = NFS4_DR_PUSH|NFS4_DR_DISCARD;
2323 
2324 			nfs4_dlistadd(rp, ncg, dr_flags);
2325 			dr_flags = 0;
2326 		} else {
2327 			/*
2328 			 * Push the modified data back to the server,
2329 			 * reopen any delegation open streams, and return
2330 			 * the delegation.  Drop the statev4_lock first!
2331 			 */
2332 			dr_flags =  NFS4_DR_PUSH|NFS4_DR_DID_OP|NFS4_DR_REOPEN;
2333 		}
2334 	}
2335 	mutex_exit(&rp->r_statev4_lock);
2336 	if (dr_flags)
2337 		(void) nfs4delegreturn_impl(rp, dr_flags, ncg);
2338 }
2339 
2340 /*
2341  * nfs4delegabandon - Abandon the delegation on an rnode4.  This code
2342  * is called when the client receives EXPIRED, BAD_STATEID, OLD_STATEID
2343  * or BADSEQID and the recovery code is unable to recover.  Push any
2344  * dirty data back to the server and return the delegation (if any).
2345  */
2346 
2347 void
2348 nfs4delegabandon(rnode4_t *rp)
2349 {
2350 	vnode_t *vp;
2351 	struct cb_recall_pass *pp;
2352 	open_delegation_type4 dt;
2353 
2354 	mutex_enter(&rp->r_statev4_lock);
2355 	dt = rp->r_deleg_type;
2356 	mutex_exit(&rp->r_statev4_lock);
2357 
2358 	if (dt == OPEN_DELEGATE_NONE)
2359 		return;
2360 
2361 	vp = RTOV4(rp);
2362 	VN_HOLD(vp);
2363 
2364 	pp = kmem_alloc(sizeof (struct cb_recall_pass), KM_SLEEP);
2365 	pp->rp = rp;
2366 	/*
2367 	 * Recovery on the file has failed and we want to return
2368 	 * the delegation.  We don't want to reopen files and
2369 	 * nfs4delegreturn_thread() figures out what to do about
2370 	 * the data.  The only thing to do is attempt to return
2371 	 * the delegation.
2372 	 */
2373 	pp->flags = 0;
2374 	pp->truncate = FALSE;
2375 
2376 	/*
2377 	 * Fire up a thread to do the delegreturn; this is
2378 	 * necessary because we could be inside a GETPAGE or
2379 	 * PUTPAGE and we cannot do another one.
2380 	 */
2381 
2382 	(void) zthread_create(NULL, 0, nfs4delegreturn_thread, pp, 0,
2383 	    minclsyspri);
2384 }
2385 
2386 static int
2387 wait_for_recall1(vnode_t *vp, nfs4_op_hint_t op, nfs4_recov_state_t *rsp,
2388 	int flg)
2389 {
2390 	rnode4_t *rp;
2391 	int error = 0;
2392 
2393 #ifdef lint
2394 	op = op;
2395 #endif
2396 
2397 	if (vp && vp->v_type == VREG) {
2398 		rp = VTOR4(vp);
2399 
2400 		/*
2401 		 * Take r_deleg_recall_lock in read mode to synchronize
2402 		 * with delegreturn.
2403 		 */
2404 		error = nfs_rw_enter_sig(&rp->r_deleg_recall_lock,
2405 		    RW_READER, INTR4(vp));
2406 
2407 		if (error == 0)
2408 			rsp->rs_flags |= flg;
2409 
2410 	}
2411 	return (error);
2412 }
2413 
2414 void
2415 nfs4_end_op_recall(vnode_t *vp1, vnode_t *vp2, nfs4_recov_state_t *rsp)
2416 {
2417 	NFS4_DEBUG(nfs4_recall_debug,
2418 	    (CE_NOTE, "nfs4_end_op_recall: 0x%p, 0x%p\n",
2419 	    (void *)vp1, (void *)vp2));
2420 
2421 	if (vp2 && rsp->rs_flags & NFS4_RS_RECALL_HELD2)
2422 		nfs_rw_exit(&VTOR4(vp2)->r_deleg_recall_lock);
2423 	if (vp1 && rsp->rs_flags & NFS4_RS_RECALL_HELD1)
2424 		nfs_rw_exit(&VTOR4(vp1)->r_deleg_recall_lock);
2425 }
2426 
2427 int
2428 wait_for_recall(vnode_t *vp1, vnode_t *vp2, nfs4_op_hint_t op,
2429 	nfs4_recov_state_t *rsp)
2430 {
2431 	int error;
2432 
2433 	NFS4_DEBUG(nfs4_recall_debug,
2434 	    (CE_NOTE, "wait_for_recall:    0x%p, 0x%p\n",
2435 	    (void *)vp1, (void *) vp2));
2436 
2437 	rsp->rs_flags &= ~(NFS4_RS_RECALL_HELD1|NFS4_RS_RECALL_HELD2);
2438 
2439 	if ((error = wait_for_recall1(vp1, op, rsp, NFS4_RS_RECALL_HELD1)) != 0)
2440 		return (error);
2441 
2442 	if ((error = wait_for_recall1(vp2, op, rsp, NFS4_RS_RECALL_HELD2))
2443 	    != 0) {
2444 		if (rsp->rs_flags & NFS4_RS_RECALL_HELD1) {
2445 			nfs_rw_exit(&VTOR4(vp1)->r_deleg_recall_lock);
2446 			rsp->rs_flags &= ~NFS4_RS_RECALL_HELD1;
2447 		}
2448 
2449 		return (error);
2450 	}
2451 
2452 	return (0);
2453 }
2454 
2455 /*
2456  * nfs4_dlistadd - Add this rnode to a list of rnodes to be
2457  * DELEGRETURN'd at the end of recovery.
2458  */
2459 
2460 static void
2461 nfs4_dlistadd(rnode4_t *rp, struct nfs4_callback_globals *ncg, int flags)
2462 {
2463 	struct nfs4_dnode *dp;
2464 
2465 	ASSERT(mutex_owned(&rp->r_statev4_lock));
2466 	/*
2467 	 * Mark the delegation as having a return pending.
2468 	 * This will prevent the use of the delegation stateID
2469 	 * by read, write, setattr and open.
2470 	 */
2471 	rp->r_deleg_return_pending = TRUE;
2472 	dp = kmem_alloc(sizeof (*dp), KM_SLEEP);
2473 	VN_HOLD(RTOV4(rp));
2474 	dp->rnodep = rp;
2475 	dp->flags = flags;
2476 	mutex_enter(&ncg->nfs4_dlist_lock);
2477 	list_insert_head(&ncg->nfs4_dlist, dp);
2478 #ifdef	DEBUG
2479 	ncg->nfs4_dlistadd_c++;
2480 #endif
2481 	mutex_exit(&ncg->nfs4_dlist_lock);
2482 }
2483 
2484 /*
2485  * nfs4_dlistclean_impl - Do DELEGRETURN for each rnode on the list.
2486  * of files awaiting cleaning.  If the override_flags are non-zero
2487  * then use them rather than the flags that were set when the rnode
2488  * was added to the dlist.
2489  */
2490 static void
2491 nfs4_dlistclean_impl(struct nfs4_callback_globals *ncg, int override_flags)
2492 {
2493 	rnode4_t *rp;
2494 	struct nfs4_dnode *dp;
2495 	int flags;
2496 
2497 	ASSERT(override_flags == 0 || override_flags == NFS4_DR_DISCARD);
2498 
2499 	mutex_enter(&ncg->nfs4_dlist_lock);
2500 	while ((dp = list_head(&ncg->nfs4_dlist)) != NULL) {
2501 #ifdef	DEBUG
2502 		ncg->nfs4_dlistclean_c++;
2503 #endif
2504 		list_remove(&ncg->nfs4_dlist, dp);
2505 		mutex_exit(&ncg->nfs4_dlist_lock);
2506 		rp = dp->rnodep;
2507 		flags = (override_flags != 0) ? override_flags : dp->flags;
2508 		kmem_free(dp, sizeof (*dp));
2509 		(void) nfs4delegreturn_impl(rp, flags, ncg);
2510 		VN_RELE(RTOV4(rp));
2511 		mutex_enter(&ncg->nfs4_dlist_lock);
2512 	}
2513 	mutex_exit(&ncg->nfs4_dlist_lock);
2514 }
2515 
2516 void
2517 nfs4_dlistclean(void)
2518 {
2519 	struct nfs4_callback_globals *ncg;
2520 
2521 	ncg = zone_getspecific(nfs4_callback_zone_key, nfs_zone());
2522 	ASSERT(ncg != NULL);
2523 
2524 	nfs4_dlistclean_impl(ncg, 0);
2525 }
2526