xref: /titanic_52/usr/src/uts/common/fs/nfs/nfs4_callback.c (revision 30a4e2aa5b7fcba6115c2e8edec9f5d7ee22085d)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License, Version 1.0 only
6  * (the "License").  You may not use this file except in compliance
7  * with the License.
8  *
9  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10  * or http://www.opensolaris.org/os/licensing.
11  * See the License for the specific language governing permissions
12  * and limitations under the License.
13  *
14  * When distributing Covered Code, include this CDDL HEADER in each
15  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16  * If applicable, add the following below this CDDL HEADER, with the
17  * fields enclosed by brackets "[]" replaced with your own identifying
18  * information: Portions Copyright [yyyy] [name of copyright owner]
19  *
20  * CDDL HEADER END
21  */
22 /*
23  * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 /* Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */
28 /* All Rights Reserved */
29 
30 #pragma ident	"%Z%%M%	%I%	%E% SMI"
31 
32 #include <sys/param.h>
33 #include <sys/types.h>
34 #include <sys/systm.h>
35 #include <sys/cred.h>
36 #include <sys/vfs.h>
37 #include <sys/vnode.h>
38 #include <sys/pathname.h>
39 #include <sys/sysmacros.h>
40 #include <sys/kmem.h>
41 #include <sys/kstat.h>
42 #include <sys/mkdev.h>
43 #include <sys/mount.h>
44 #include <sys/statvfs.h>
45 #include <sys/errno.h>
46 #include <sys/debug.h>
47 #include <sys/cmn_err.h>
48 #include <sys/utsname.h>
49 #include <sys/bootconf.h>
50 #include <sys/modctl.h>
51 #include <sys/acl.h>
52 #include <sys/flock.h>
53 #include <sys/kstr.h>
54 #include <sys/stropts.h>
55 #include <sys/strsubr.h>
56 #include <sys/atomic.h>
57 #include <sys/disp.h>
58 #include <sys/policy.h>
59 #include <sys/list.h>
60 #include <sys/zone.h>
61 
62 #include <rpc/types.h>
63 #include <rpc/auth.h>
64 #include <rpc/rpcsec_gss.h>
65 #include <rpc/clnt.h>
66 #include <rpc/xdr.h>
67 
68 #include <nfs/nfs.h>
69 #include <nfs/nfs_clnt.h>
70 #include <nfs/mount.h>
71 #include <nfs/nfs_acl.h>
72 
73 #include <fs/fs_subr.h>
74 
75 #include <nfs/nfs4.h>
76 #include <nfs/rnode4.h>
77 #include <nfs/nfs4_clnt.h>
78 #include <nfs/nfssys.h>
79 
80 #ifdef	DEBUG
/*
 * These are "special" state IDs and file handles that
 * match any delegation state ID or file handle.  This
 * is for testing purposes only.
 */
86 
/* Wildcard stateid: cb_recall() accepts it as matching any delegation. */
stateid4 nfs4_deleg_any = { 0x7FFFFFF0 };
/*
 * Bytes of the wildcard file handle (terminating NUL excluded below).
 * NOTE(review): an octal escape takes at most three digits, so "\0377"
 * is the byte \037 followed by the character '7'; if \377 was intended
 * the literal does not say so -- confirm before relying on the contents.
 */
char nfs4_deleg_fh[] = "\0377\0376\0375\0374";
/* Wildcard file handle: cb_getattr() accepts it as matching any delegation. */
nfs_fh4 nfs4_deleg_anyfh = { sizeof (nfs4_deleg_fh)-1, nfs4_deleg_fh };
/*
 * Phony delegation parameters and switches.  None of them is referenced
 * by the callback handlers in this part of the file; presumably they are
 * consumed by delegation test code elsewhere (TODO confirm).
 */
int nfs4_deleg_accept_phony = OPEN_DELEGATE_NONE;
nfsace4 nfs4_deleg_ace_phony;
nfs_space_limit4 nfs4_deleg_space_phony = { NFS_LIMIT_SIZE, 8192 };
nfs_space_limit4 nfs4_deleg_space_phony2 = { NFS_LIMIT_BLOCKS, 0 };
nfs_modified_limit4 nfs4_deleg_space_phonyl = { 8, 512 };
changeid4 nfs4_deleg_change_phony = 0x7eeeeeee76666660LL;
int nfs4_use_phony_limit;
int nfs4_use_phony_recall;
int nfs4_phony_recall_v;
/*
 * Error injection: when set to something other than NFS4_OK,
 * cb_getattr() / cb_recall() return that status immediately.
 */
nfsstat4 cb4_getattr_fail = NFS4_OK;
nfsstat4 cb4_recall_fail = NFS4_OK;

/* Debug switches; nfs4_callback_debug gates the CB_NOTE/CB_WARN macros. */
int nfs4_callback_debug;
int nfs4_recall_debug;
int nfs4_drat_debug;
105 
106 #endif
107 
/*
 * Shorthand for NFS4_DEBUG() messages controlled by nfs4_callback_debug.
 * x is the message string; CB_WARN1 additionally takes one argument y
 * for a format specifier in x.
 */
#define	CB_NOTE(x)	NFS4_DEBUG(nfs4_callback_debug, (CE_NOTE, x))
#define	CB_WARN(x)	NFS4_DEBUG(nfs4_callback_debug, (CE_WARN, x))
#define	CB_WARN1(x, y)	NFS4_DEBUG(nfs4_callback_debug, (CE_WARN, x, y))
111 
/* Delegation return policy; starts out as INACTIVE. */
enum nfs4_delegreturn_policy nfs4_delegreturn_policy = INACTIVE;

/*
 * Zone key under which each zone's struct nfs4_callback_globals is
 * stored; it is looked up with zone_getspecific().
 */
static zone_key_t nfs4_callback_zone_key;
115 
/*
 * NFS4_MAPSIZE is the number of bytes we are willing to consume
 * for the block allocation map when the server grants a NFS_LIMIT_BLOCKS
 * style delegation.  NFS4_MAPWORDS is that same map expressed as a
 * number of uint_t words, and NbPW is the number of bits per word.
 *
 * Both derived macros are fully parenthesized so that they expand
 * correctly inside larger expressions (e.g. "x / NFS4_MAPWORDS").
 */

#define	NFS4_MAPSIZE	8192
#define	NFS4_MAPWORDS	(NFS4_MAPSIZE / sizeof (uint_t))
#define	NbPW		(NBBY * sizeof (uint_t))
125 
/*
 * Number of callback program numbers available per zone, starting at
 * NFS4_CALLBACK; also the number of slots in nfs4prog2server[].
 */
static int nfs4_num_prognums = 1024;
/* Callout table handed to svc_tli_kcreate() by nfs4_svc(). */
static SVC_CALLOUT_TABLE nfs4_cb_sct;
128 
/*
 * Element of the per-zone nfs4_dlist (created in
 * nfs4_callback_init_zone() with this structure's linkage offset).
 */
struct nfs4_dnode {
	list_node_t	linkage;
	rnode4_t	*rnodep;	/* presumably the queued rnode */
	int		flags;		/* Flags for nfs4delegreturn_impl() */
};
134 
/*
 * Template for the per-zone "nfs4_callback_stats" named kstats;
 * nfs4_callback_init_zone() copies it into each zone's globals.  The
 * initializers are positional, so their order has to follow the member
 * order of struct nfs4_callback_stats (declared elsewhere).
 */
static const struct nfs4_callback_stats nfs4_callback_stats_tmpl = {
	{ "delegations",	KSTAT_DATA_UINT64 },
	{ "cb_getattr",		KSTAT_DATA_UINT64 },
	{ "cb_recall",		KSTAT_DATA_UINT64 },
	{ "cb_null",		KSTAT_DATA_UINT64 },
	{ "cb_dispatch",	KSTAT_DATA_UINT64 },
	{ "delegaccept_r",	KSTAT_DATA_UINT64 },
	{ "delegaccept_rw",	KSTAT_DATA_UINT64 },
	{ "delegreturn",	KSTAT_DATA_UINT64 },
	{ "callbacks",		KSTAT_DATA_UINT64 },
	{ "claim_cur",		KSTAT_DATA_UINT64 },
	{ "claim_cur_ok",	KSTAT_DATA_UINT64 },
	{ "recall_trunc",	KSTAT_DATA_UINT64 },
	{ "recall_failed",	KSTAT_DATA_UINT64 },
	{ "return_limit_write",	KSTAT_DATA_UINT64 },
	{ "return_limit_addmap", KSTAT_DATA_UINT64 },
	{ "deleg_recover",	KSTAT_DATA_UINT64 },
	{ "cb_illegal",		KSTAT_DATA_UINT64 }
};
154 
/*
 * One callback transport endpoint, recorded by nfs4_setport() and keyed
 * by netid.  nfs4_cb_args() selects an entry by protofmly/proto and
 * passes its netid and uaddr in the SETCLIENTID arguments.
 */
struct nfs4_cb_port {
	list_node_t		linkage; /* linkage into per-zone port list */
	char			netid[KNC_STRSIZE];
	char			uaddr[KNC_STRSIZE];
	char			protofmly[KNC_STRSIZE];
	char			proto[KNC_STRSIZE];
};
162 
/*
 * Size in bytes of the attribute buffer that cb_getattr() allocates and
 * cb_getattr_free() frees.  It is not assigned in the code visible here;
 * presumably it is set during module initialization (TODO confirm).
 */
static int cb_getattr_bytes;
164 
/*
 * Argument bundle handed to nfs4delegreturn_thread().
 */
struct cb_recall_pass {
	rnode4_t	*rp;
	int		flags;		/* Flags for nfs4delegreturn_impl() */
	bool_t		truncate;	/* presumably CB_RECALL's truncate */
};
170 
171 static nfs4_open_stream_t *get_next_deleg_stream(rnode4_t *, int);
172 static void nfs4delegreturn_thread(struct cb_recall_pass *);
173 static int deleg_reopen(vnode_t *, bool_t *, struct nfs4_callback_globals *,
174     int);
175 static void nfs4_dlistadd(rnode4_t *, struct nfs4_callback_globals *, int);
176 static void nfs4_dlistclean_impl(struct nfs4_callback_globals *, int);
177 static int nfs4delegreturn_impl(rnode4_t *, int,
178     struct nfs4_callback_globals *);
179 static void nfs4delegreturn_cleanup_impl(rnode4_t *, nfs4_server_t *,
180     struct nfs4_callback_globals *);
181 
182 static void
183 cb_getattr(nfs_cb_argop4 *argop, nfs_cb_resop4 *resop, struct svc_req *req,
184 	struct compound_state *cs, struct nfs4_callback_globals *ncg)
185 {
186 	CB_GETATTR4args *args = &argop->nfs_cb_argop4_u.opcbgetattr;
187 	CB_GETATTR4res *resp = &resop->nfs_cb_resop4_u.opcbgetattr;
188 	rnode4_t *rp;
189 	vnode_t *vp;
190 	bool_t found = FALSE;
191 	struct nfs4_server *sp;
192 	struct fattr4 *fap;
193 	rpc_inline_t *fdata;
194 	long mapcnt;
195 	fattr4_change change;
196 	fattr4_size size;
197 	uint_t rflag;
198 
199 	ncg->nfs4_callback_stats.cb_getattr.value.ui64++;
200 
201 #ifdef DEBUG
202 	/*
203 	 * error injection hook: set cb_getattr_fail global to
204 	 * NFS4 pcol error to be returned
205 	 */
206 	if (cb4_getattr_fail != NFS4_OK) {
207 		*cs->statusp = resp->status = cb4_getattr_fail;
208 		return;
209 	}
210 #endif
211 
212 	resp->obj_attributes.attrmask = 0;
213 
214 	mutex_enter(&ncg->nfs4_cb_lock);
215 	sp = ncg->nfs4prog2server[req->rq_prog - NFS4_CALLBACK];
216 	mutex_exit(&ncg->nfs4_cb_lock);
217 
218 	if (nfs4_server_vlock(sp, 0) == FALSE) {
219 
220 		CB_WARN("cb_getattr: cannot find server\n");
221 
222 		*cs->statusp = resp->status = NFS4ERR_BADHANDLE;
223 		return;
224 	}
225 
226 	/*
227 	 * In cb_compound, callback_ident was validated against rq_prog,
228 	 * but we couldn't verify that it was set to the value we provided
229 	 * at setclientid time (because we didn't have server struct yet).
230 	 * Now we have the server struct, but don't have callback_ident
231 	 * handy.  So, validate server struct program number against req
232 	 * RPC's prog number.  At this point, we know the RPC prog num
233 	 * is valid (else we wouldn't be here); however, we don't know
234 	 * that it was the prog number we supplied to this server at
235 	 * setclientid time.  If the prog numbers aren't equivalent, then
236 	 * log the problem and fail the request because either cbserv
237 	 * and/or cbclient are confused.  This will probably never happen.
238 	 */
239 	if (sp->s_program != req->rq_prog) {
240 #ifdef DEBUG
241 		zcmn_err(getzoneid(), CE_WARN,
242 		    "cb_getattr: wrong server program number srv=%d req=%d\n",
243 		    sp->s_program, req->rq_prog);
244 #else
245 		zcmn_err(getzoneid(), CE_WARN,
246 		    "cb_getattr: wrong server program number\n");
247 #endif
248 		mutex_exit(&sp->s_lock);
249 		nfs4_server_rele(sp);
250 		*cs->statusp = resp->status = NFS4ERR_BADHANDLE;
251 		return;
252 	}
253 
254 	/*
255 	 * Search the delegation list for a matching file handle;
256 	 * mutex on sp prevents the list from changing.
257 	 */
258 
259 	rp = list_head(&sp->s_deleg_list);
260 	for (; rp != NULL; rp = list_next(&sp->s_deleg_list, rp)) {
261 		nfs4_fhandle_t fhandle;
262 
263 		sfh4_copyval(rp->r_fh, &fhandle);
264 
265 		if ((fhandle.fh_len == args->fh.nfs_fh4_len &&
266 		    bcmp(fhandle.fh_buf, args->fh.nfs_fh4_val,
267 		    fhandle.fh_len) == 0)) {
268 
269 			found = TRUE;
270 			break;
271 		}
272 #ifdef	DEBUG
273 		if (nfs4_deleg_anyfh.nfs_fh4_len == args->fh.nfs_fh4_len &&
274 		    bcmp(nfs4_deleg_anyfh.nfs_fh4_val, args->fh.nfs_fh4_val,
275 		    args->fh.nfs_fh4_len) == 0) {
276 
277 			found = TRUE;
278 			break;
279 		}
280 #endif
281 	}
282 
283 	/*
284 	 * VN_HOLD the vnode before releasing s_lock to guarantee
285 	 * we have a valid vnode reference.
286 	 */
287 	if (found == TRUE) {
288 		vp = RTOV4(rp);
289 		VN_HOLD(vp);
290 	}
291 
292 	mutex_exit(&sp->s_lock);
293 	nfs4_server_rele(sp);
294 
295 	if (found == FALSE) {
296 
297 		CB_WARN("cb_getattr: bad fhandle\n");
298 
299 		*cs->statusp = resp->status = NFS4ERR_BADHANDLE;
300 		return;
301 	}
302 
303 	/*
304 	 * Figure out which attributes the server wants.  We only
305 	 * offer FATTR4_CHANGE & FATTR4_SIZE; ignore the rest.
306 	 */
307 	fdata = kmem_alloc(cb_getattr_bytes, KM_SLEEP);
308 
309 	/*
310 	 * Don't actually need to create XDR to encode these
311 	 * simple data structures.
312 	 * xdrmem_create(&xdr, fdata, cb_getattr_bytes, XDR_ENCODE);
313 	 */
314 	fap = &resp->obj_attributes;
315 
316 	fap->attrmask = 0;
317 	/* attrlist4_len starts at 0 and increases as attrs are processed */
318 	fap->attrlist4 = (char *)fdata;
319 	fap->attrlist4_len = 0;
320 
321 	/* don't supply attrs if request was zero */
322 	if (args->attr_request != 0) {
323 		if (args->attr_request & FATTR4_CHANGE_MASK) {
324 			/*
325 			 * If the file is mmapped, then increment the change
326 			 * attribute and return it.  This will guarantee that
327 			 * the server will perceive that the file has changed
328 			 * if there is any chance that the client application
329 			 * has changed it.  Otherwise, just return the change
330 			 * attribute as it has been updated by nfs4write_deleg.
331 			 */
332 
333 			mutex_enter(&rp->r_statelock);
334 			mapcnt = rp->r_mapcnt;
335 			rflag = rp->r_flags;
336 			mutex_exit(&rp->r_statelock);
337 
338 			mutex_enter(&rp->r_statev4_lock);
339 			/*
340 			 * If object mapped, then always return new change.
341 			 * Otherwise, return change if object has dirty
342 			 * pages.  If object doesn't have any dirty pages,
343 			 * then all changes have been pushed to server, so
344 			 * reset change to grant change.
345 			 */
346 			if (mapcnt)
347 				rp->r_deleg_change++;
348 			else if (! (rflag & R4DIRTY))
349 				rp->r_deleg_change = rp->r_deleg_change_grant;
350 			change = rp->r_deleg_change;
351 			mutex_exit(&rp->r_statev4_lock);
352 
353 			/*
354 			 * Use inline XDR code directly, we know that we
355 			 * going to a memory buffer and it has enough
356 			 * space so it cannot fail.
357 			 */
358 			IXDR_PUT_U_HYPER(fdata, change);
359 			fap->attrlist4_len += 2 * BYTES_PER_XDR_UNIT;
360 			fap->attrmask |= FATTR4_CHANGE_MASK;
361 		}
362 
363 		if (args->attr_request & FATTR4_SIZE_MASK) {
364 			/*
365 			 * Use an atomic add of 0 to fetch a consistent view
366 			 * of r_size; this avoids having to take rw_lock
367 			 * which could cause a deadlock.
368 			 */
369 			size = atomic_add_64_nv((uint64_t *)&rp->r_size, 0);
370 
371 			/*
372 			 * Use inline XDR code directly, we know that we
373 			 * going to a memory buffer and it has enough
374 			 * space so it cannot fail.
375 			 */
376 			IXDR_PUT_U_HYPER(fdata, size);
377 			fap->attrlist4_len += 2 * BYTES_PER_XDR_UNIT;
378 			fap->attrmask |= FATTR4_SIZE_MASK;
379 		}
380 	}
381 
382 	VN_RELE(vp);
383 
384 	*cs->statusp = resp->status = NFS4_OK;
385 }
386 
387 static void
388 cb_getattr_free(nfs_cb_resop4 *resop)
389 {
390 	if (resop->nfs_cb_resop4_u.opcbgetattr.obj_attributes.attrlist4)
391 		kmem_free(resop->nfs_cb_resop4_u.opcbgetattr.
392 			obj_attributes.attrlist4,
393 			cb_getattr_bytes);
394 }
395 
396 static void
397 cb_recall(nfs_cb_argop4 *argop, nfs_cb_resop4 *resop, struct svc_req *req,
398 	struct compound_state *cs, struct nfs4_callback_globals *ncg)
399 {
400 	CB_RECALL4args * args = &argop->nfs_cb_argop4_u.opcbrecall;
401 	CB_RECALL4res *resp = &resop->nfs_cb_resop4_u.opcbrecall;
402 	rnode4_t *rp;
403 	vnode_t *vp;
404 	struct nfs4_server *sp;
405 	bool_t found = FALSE;
406 
407 	ncg->nfs4_callback_stats.cb_recall.value.ui64++;
408 
409 	ASSERT(req->rq_prog >= NFS4_CALLBACK);
410 	ASSERT(req->rq_prog < NFS4_CALLBACK+nfs4_num_prognums);
411 
412 #ifdef DEBUG
413 	/*
414 	 * error injection hook: set cb_recall_fail global to
415 	 * NFS4 pcol error to be returned
416 	 */
417 	if (cb4_recall_fail != NFS4_OK) {
418 		*cs->statusp = resp->status = cb4_recall_fail;
419 		return;
420 	}
421 #endif
422 
423 	mutex_enter(&ncg->nfs4_cb_lock);
424 	sp = ncg->nfs4prog2server[req->rq_prog - NFS4_CALLBACK];
425 	mutex_exit(&ncg->nfs4_cb_lock);
426 
427 	if (nfs4_server_vlock(sp, 0) == FALSE) {
428 
429 		CB_WARN("cb_recall: cannot find server\n");
430 
431 		*cs->statusp = resp->status = NFS4ERR_BADHANDLE;
432 		return;
433 	}
434 
435 	/*
436 	 * Search the delegation list for a matching file handle
437 	 * AND stateid; mutex on sp prevents the list from changing.
438 	 */
439 
440 	rp = list_head(&sp->s_deleg_list);
441 	for (; rp != NULL; rp = list_next(&sp->s_deleg_list, rp)) {
442 		mutex_enter(&rp->r_statev4_lock);
443 
444 		/* check both state id and file handle! */
445 
446 		if ((bcmp(&rp->r_deleg_stateid, &args->stateid,
447 		    sizeof (stateid4)) == 0)) {
448 			nfs4_fhandle_t fhandle;
449 
450 			sfh4_copyval(rp->r_fh, &fhandle);
451 			if ((fhandle.fh_len == args->fh.nfs_fh4_len &&
452 			    bcmp(fhandle.fh_buf, args->fh.nfs_fh4_val,
453 			    fhandle.fh_len) == 0)) {
454 
455 				found = TRUE;
456 				break;
457 			} else {
458 #ifdef	DEBUG
459 				CB_WARN("cb_recall: stateid OK, bad fh");
460 #endif
461 			}
462 		}
463 #ifdef	DEBUG
464 		if (bcmp(&args->stateid, &nfs4_deleg_any,
465 		    sizeof (stateid4)) == 0) {
466 
467 			found = TRUE;
468 			break;
469 		}
470 #endif
471 		mutex_exit(&rp->r_statev4_lock);
472 	}
473 
474 	/*
475 	 * VN_HOLD the vnode before releasing s_lock to guarantee
476 	 * we have a valid vnode reference.  The async thread will
477 	 * release the hold when it's done.
478 	 */
479 	if (found == TRUE) {
480 		mutex_exit(&rp->r_statev4_lock);
481 		vp = RTOV4(rp);
482 		VN_HOLD(vp);
483 	}
484 	mutex_exit(&sp->s_lock);
485 	nfs4_server_rele(sp);
486 
487 	if (found == FALSE) {
488 
489 		CB_WARN("cb_recall: bad stateid\n");
490 
491 		*cs->statusp = resp->status = NFS4ERR_BAD_STATEID;
492 		return;
493 	}
494 
495 	/* Fire up a thread to do the delegreturn */
496 	nfs4delegreturn_async(rp, NFS4_DR_RECALL|NFS4_DR_REOPEN,
497 					args->truncate);
498 
499 	*cs->statusp = resp->status = 0;
500 }
501 
502 /* ARGSUSED */
503 static void
504 cb_recall_free(nfs_cb_resop4 *resop)
505 {
506 	/* nothing to do here, cb_recall doesn't kmem_alloc */
507 }
508 
509 /*
510  * This function handles the CB_NULL proc call from an NFSv4 Server.
511  *
512  * We take note that the server has sent a CB_NULL for later processing
513  * in the recovery logic. It is noted so we may pause slightly after the
514  * setclientid and before reopening files. The pause is to allow the
515  * NFSv4 Server time to receive the CB_NULL reply and adjust any of
516  * its internal structures such that it has the opportunity to grant
517  * delegations to reopened files.
518  *
519  */
520 
521 /* ARGSUSED */
522 static void
523 cb_null(CB_COMPOUND4args *args, CB_COMPOUND4res *resp, struct svc_req *req,
524     struct nfs4_callback_globals *ncg)
525 {
526 	struct nfs4_server *sp;
527 
528 	ncg->nfs4_callback_stats.cb_null.value.ui64++;
529 
530 	ASSERT(req->rq_prog >= NFS4_CALLBACK);
531 	ASSERT(req->rq_prog < NFS4_CALLBACK+nfs4_num_prognums);
532 
533 	mutex_enter(&ncg->nfs4_cb_lock);
534 	sp = ncg->nfs4prog2server[req->rq_prog - NFS4_CALLBACK];
535 	mutex_exit(&ncg->nfs4_cb_lock);
536 
537 	if (nfs4_server_vlock(sp, 0) != FALSE) {
538 		sp->s_flags |= N4S_CB_PINGED;
539 		cv_broadcast(&sp->wait_cb_null);
540 		mutex_exit(&sp->s_lock);
541 		nfs4_server_rele(sp);
542 	}
543 }
544 
545 /*
546  * cb_illegal	args: void
547  *		res : status (NFS4ERR_OP_CB_ILLEGAL)
548  */
549 /* ARGSUSED */
550 static void
551 cb_illegal(nfs_cb_argop4 *argop, nfs_cb_resop4 *resop, struct svc_req *req,
552 	struct compound_state *cs, struct nfs4_callback_globals *ncg)
553 {
554 	CB_ILLEGAL4res *resp = &resop->nfs_cb_resop4_u.opcbillegal;
555 
556 	ncg->nfs4_callback_stats.cb_illegal.value.ui64++;
557 	resop->resop = OP_CB_ILLEGAL;
558 	*cs->statusp = resp->status = NFS4ERR_OP_ILLEGAL;
559 }
560 
561 static void
562 cb_compound(CB_COMPOUND4args *args, CB_COMPOUND4res *resp, struct svc_req *req,
563 	struct nfs4_callback_globals *ncg)
564 {
565 	uint_t i;
566 	struct compound_state cs;
567 	nfs_cb_argop4 *argop;
568 	nfs_cb_resop4 *resop, *new_res;
569 	uint_t op;
570 
571 	bzero(&cs, sizeof (cs));
572 	cs.statusp = &resp->status;
573 	cs.cont = TRUE;
574 
575 	/*
576 	 * Form a reply tag by copying over the reqeuest tag.
577 	 */
578 	resp->tag.utf8string_len = args->tag.utf8string_len;
579 	resp->tag.utf8string_val = kmem_alloc(resp->tag.utf8string_len,
580 					KM_SLEEP);
581 	bcopy(args->tag.utf8string_val, resp->tag.utf8string_val,
582 		args->tag.utf8string_len);
583 
584 	/*
585 	 * XXX for now, minorversion should be zero
586 	 */
587 	if (args->minorversion != CB4_MINORVERSION) {
588 		resp->array_len = 0;
589 		resp->array = NULL;
590 		resp->status = NFS4ERR_MINOR_VERS_MISMATCH;
591 		return;
592 	}
593 
594 #ifdef DEBUG
595 	/*
596 	 * Verify callback_ident.  It doesn't really matter if it's wrong
597 	 * because we don't really use callback_ident -- we use prog number
598 	 * of the RPC request instead.  In this case, just print a DEBUG
599 	 * console message to reveal brokenness of cbclient (at bkoff/cthon).
600 	 */
601 	if (args->callback_ident != req->rq_prog)
602 		zcmn_err(getzoneid(), CE_WARN,
603 		    "cb_compound: cb_client using wrong "
604 		    "callback_ident(%d), should be %d",
605 		    args->callback_ident, req->rq_prog);
606 #endif
607 
608 	resp->array_len = args->array_len;
609 	resp->array = kmem_zalloc(args->array_len * sizeof (nfs_cb_resop4),
610 							KM_SLEEP);
611 
612 	for (i = 0; i < args->array_len && cs.cont; i++) {
613 
614 		argop = &args->array[i];
615 		resop = &resp->array[i];
616 		resop->resop = argop->argop;
617 		op = (uint_t)resop->resop;
618 
619 		switch (op) {
620 
621 		case OP_CB_GETATTR:
622 
623 			cb_getattr(argop, resop, req, &cs, ncg);
624 			break;
625 
626 		case OP_CB_RECALL:
627 
628 			cb_recall(argop, resop, req, &cs, ncg);
629 			break;
630 
631 		case OP_CB_ILLEGAL:
632 
633 			/* fall through */
634 
635 		default:
636 			/*
637 			 * Handle OP_CB_ILLEGAL and any undefined opcode.
638 			 * Currently, the XDR code will return BADXDR
639 			 * if cb op doesn't decode to legal value, so
640 			 * it really only handles OP_CB_ILLEGAL.
641 			 */
642 			op = OP_CB_ILLEGAL;
643 			cb_illegal(argop, resop, req, &cs, ncg);
644 		}
645 
646 		if (*cs.statusp != NFS4_OK)
647 			cs.cont = FALSE;
648 
649 		/*
650 		 * If not at last op, and if we are to stop, then
651 		 * compact the results array.
652 		 */
653 		if ((i + 1) < args->array_len && !cs.cont) {
654 
655 			new_res = kmem_alloc(
656 				(i+1) * sizeof (nfs_cb_resop4), KM_SLEEP);
657 			bcopy(resp->array,
658 				new_res, (i+1) * sizeof (nfs_cb_resop4));
659 			kmem_free(resp->array,
660 				args->array_len * sizeof (nfs_cb_resop4));
661 
662 			resp->array_len =  i + 1;
663 			resp->array = new_res;
664 		}
665 	}
666 
667 }
668 
669 static void
670 cb_compound_free(CB_COMPOUND4res *resp)
671 {
672 	uint_t i, op;
673 	nfs_cb_resop4 *resop;
674 
675 	if (resp->tag.utf8string_val) {
676 		UTF8STRING_FREE(resp->tag)
677 	}
678 
679 	for (i = 0; i < resp->array_len; i++) {
680 
681 		resop = &resp->array[i];
682 		op = (uint_t)resop->resop;
683 
684 		switch (op) {
685 
686 		case OP_CB_GETATTR:
687 
688 			cb_getattr_free(resop);
689 			break;
690 
691 		case OP_CB_RECALL:
692 
693 			cb_recall_free(resop);
694 			break;
695 
696 		default:
697 			break;
698 		}
699 	}
700 
701 	if (resp->array != NULL) {
702 		kmem_free(resp->array,
703 			resp->array_len * sizeof (nfs_cb_resop4));
704 	}
705 }
706 
707 static void
708 cb_dispatch(struct svc_req *req, SVCXPRT *xprt)
709 {
710 	CB_COMPOUND4args args;
711 	CB_COMPOUND4res res;
712 	struct nfs4_callback_globals *ncg;
713 
714 	bool_t (*xdr_args)(), (*xdr_res)();
715 	void (*proc)(CB_COMPOUND4args *, CB_COMPOUND4res *, struct svc_req *,
716 	    struct nfs4_callback_globals *);
717 	void (*freeproc)(CB_COMPOUND4res *);
718 
719 	ncg = zone_getspecific(nfs4_callback_zone_key, nfs_zone());
720 	ASSERT(ncg != NULL);
721 
722 	ncg->nfs4_callback_stats.cb_dispatch.value.ui64++;
723 
724 	switch (req->rq_proc) {
725 	case CB_NULL:
726 		xdr_args = xdr_void;
727 		xdr_res = xdr_void;
728 		proc = cb_null;
729 		freeproc = NULL;
730 		break;
731 
732 	case CB_COMPOUND:
733 		xdr_args = xdr_CB_COMPOUND4args_clnt;
734 		xdr_res = xdr_CB_COMPOUND4res;
735 		proc = cb_compound;
736 		freeproc = cb_compound_free;
737 		break;
738 
739 	default:
740 		CB_WARN("cb_dispatch: no proc\n");
741 		svcerr_noproc(xprt);
742 		return;
743 	}
744 
745 	args.tag.utf8string_val = NULL;
746 	args.array = NULL;
747 
748 	if (!SVC_GETARGS(xprt, xdr_args, (caddr_t)&args)) {
749 
750 		CB_WARN("cb_dispatch: cannot getargs\n");
751 		svcerr_decode(xprt);
752 		return;
753 	}
754 
755 	(*proc)(&args, &res, req, ncg);
756 
757 	if (svc_sendreply(xprt, xdr_res, (caddr_t)&res) == FALSE) {
758 
759 		CB_WARN("cb_dispatch: bad sendreply\n");
760 
761 		/*
762 		 * svcerr_systemerr(xprt);
763 		 */
764 	}
765 
766 	if (freeproc)
767 		(*freeproc)(&res);
768 
769 	if (!SVC_FREEARGS(xprt, xdr_args, (caddr_t)&args)) {
770 
771 		CB_WARN("cb_dispatch: bad freeargs\n");
772 	}
773 }
774 
775 static rpcprog_t
776 nfs4_getnextprogram(struct nfs4_callback_globals *ncg)
777 {
778 	int i, j;
779 
780 	j = ncg->nfs4_program_hint;
781 	for (i = 0; i < nfs4_num_prognums; i++, j++) {
782 
783 		if (j >= nfs4_num_prognums)
784 			j = 0;
785 
786 		if (ncg->nfs4prog2server[j] == NULL) {
787 			ncg->nfs4_program_hint = j+1;
788 			return (j+NFS4_CALLBACK);
789 		}
790 	}
791 
792 	return (0);
793 }
794 
795 void
796 nfs4callback_destroy(nfs4_server_t *np)
797 {
798 	struct nfs4_callback_globals *ncg;
799 	int i;
800 
801 	if (np->s_program == 0)
802 		return;
803 
804 	ncg = np->zone_globals;
805 	i = np->s_program - NFS4_CALLBACK;
806 
807 	mutex_enter(&ncg->nfs4_cb_lock);
808 
809 	ASSERT(ncg->nfs4prog2server[i] == np);
810 
811 	ncg->nfs4prog2server[i] = NULL;
812 
813 	if (i < ncg->nfs4_program_hint)
814 		ncg->nfs4_program_hint = i;
815 
816 	mutex_exit(&ncg->nfs4_cb_lock);
817 }
818 
819 /*
820  * nfs4_setport - This function saves a netid and univeral address for
821  * the callback program.  These values will be used during setclientid.
822  */
823 static void
824 nfs4_setport(char *netid, char *uaddr, char *protofmly, char *proto,
825 	struct nfs4_callback_globals *ncg)
826 {
827 	struct nfs4_cb_port *p;
828 	bool_t found = FALSE;
829 
830 	ASSERT(MUTEX_HELD(&ncg->nfs4_cb_lock));
831 
832 	p = list_head(&ncg->nfs4_cb_ports);
833 	for (; p != NULL; p = list_next(&ncg->nfs4_cb_ports, p)) {
834 		if (strcmp(p->netid, netid) == 0) {
835 			found = TRUE;
836 			break;
837 		}
838 	}
839 	if (found == TRUE)
840 		(void) strcpy(p->uaddr, uaddr);
841 	else {
842 		p = kmem_alloc(sizeof (*p), KM_SLEEP);
843 
844 		(void) strcpy(p->uaddr, uaddr);
845 		(void) strcpy(p->netid, netid);
846 		(void) strcpy(p->protofmly, protofmly);
847 		(void) strcpy(p->proto, proto);
848 		list_insert_head(&ncg->nfs4_cb_ports, p);
849 	}
850 }
851 
852 /*
853  * nfs4_cb_args - This function is used to construct the callback
854  * portion of the arguments needed for setclientid.
855  */
856 
857 void
858 nfs4_cb_args(nfs4_server_t *np, struct knetconfig *knc, SETCLIENTID4args *args)
859 {
860 	struct nfs4_cb_port *p;
861 	bool_t found = FALSE;
862 	rpcprog_t pgm;
863 	struct nfs4_callback_globals *ncg = np->zone_globals;
864 
865 	/*
866 	 * This server structure may already have a program number
867 	 * assigned to it.  This happens when the client has to
868 	 * re-issue SETCLIENTID.  Just re-use the information.
869 	 */
870 	if (np->s_program >= NFS4_CALLBACK &&
871 	    np->s_program < NFS4_CALLBACK + nfs4_num_prognums)
872 		nfs4callback_destroy(np);
873 
874 	mutex_enter(&ncg->nfs4_cb_lock);
875 
876 	p = list_head(&ncg->nfs4_cb_ports);
877 	for (; p != NULL; p = list_next(&ncg->nfs4_cb_ports, p)) {
878 		if (strcmp(p->protofmly, knc->knc_protofmly) == 0 &&
879 		    strcmp(p->proto, knc->knc_proto) == 0) {
880 			found = TRUE;
881 			break;
882 		}
883 	}
884 
885 	if (found == FALSE) {
886 
887 		NFS4_DEBUG(nfs4_callback_debug,
888 		(CE_WARN, "nfs4_cb_args: could not find netid for %s/%s\n",
889 			knc->knc_protofmly, knc->knc_proto));
890 
891 		args->callback.cb_program = 0;
892 		args->callback.cb_location.r_netid = NULL;
893 		args->callback.cb_location.r_addr = NULL;
894 		args->callback_ident = 0;
895 		mutex_exit(&ncg->nfs4_cb_lock);
896 		return;
897 	}
898 
899 	if ((pgm = nfs4_getnextprogram(ncg)) == 0) {
900 		CB_WARN("nfs4_cb_args: out of program numbers\n");
901 
902 		args->callback.cb_program = 0;
903 		args->callback.cb_location.r_netid = NULL;
904 		args->callback.cb_location.r_addr = NULL;
905 		args->callback_ident = 0;
906 		mutex_exit(&ncg->nfs4_cb_lock);
907 		return;
908 	}
909 
910 	ncg->nfs4prog2server[pgm-NFS4_CALLBACK] = np;
911 	args->callback.cb_program = pgm;
912 	args->callback.cb_location.r_netid = p->netid;
913 	args->callback.cb_location.r_addr = p->uaddr;
914 	args->callback_ident = pgm;
915 
916 	np->s_program = pgm;
917 
918 	mutex_exit(&ncg->nfs4_cb_lock);
919 }
920 
921 static int
922 nfs4_dquery(struct nfs4_svc_args *arg, model_t model)
923 {
924 	file_t *fp;
925 	vnode_t *vp;
926 	rnode4_t *rp;
927 	int error;
928 	STRUCT_HANDLE(nfs4_svc_args, uap);
929 
930 	STRUCT_SET_HANDLE(uap, model, arg);
931 
932 	if ((fp = getf(STRUCT_FGET(uap, fd))) == NULL)
933 		return (EBADF);
934 
935 	vp = fp->f_vnode;
936 
937 	if (vp == NULL || vp->v_type != VREG ||
938 	    !vn_matchops(vp, nfs4_vnodeops)) {
939 		releasef(STRUCT_FGET(uap, fd));
940 		return (EBADF);
941 	}
942 
943 	rp = VTOR4(vp);
944 
945 	/*
946 	 * I can't convince myself that we need locking here.  The
947 	 * rnode cannot disappear and the value returned is instantly
948 	 * stale anway, so why bother?
949 	 */
950 
951 	error = suword32(STRUCT_FGETP(uap, netid), rp->r_deleg_type);
952 	releasef(STRUCT_FGET(uap, fd));
953 	return (error);
954 }
955 
956 
957 /*
958  * NFS4 client system call.  This service does the
959  * necessary initialization for the callback program.
960  * This is fashioned after the server side interaction
961  * between nfsd and the kernel.  On the client, the
962  * mount command forks and the child process does the
963  * necessary interaction with the kernel.
964  *
965  * uap->fd is the fd of an open transport provider
966  */
967 int
968 nfs4_svc(struct nfs4_svc_args *arg, model_t model)
969 {
970 	file_t *fp;
971 	int error;
972 	int readsize;
973 	char buf[KNC_STRSIZE], uaddr[KNC_STRSIZE];
974 	char protofmly[KNC_STRSIZE], proto[KNC_STRSIZE];
975 	size_t len;
976 	STRUCT_HANDLE(nfs4_svc_args, uap);
977 	struct netbuf addrmask;
978 	int cmd;
979 	SVCMASTERXPRT *cb_xprt;
980 	struct nfs4_callback_globals *ncg;
981 
982 #ifdef lint
983 	model = model;		/* STRUCT macros don't always refer to it */
984 #endif
985 
986 	STRUCT_SET_HANDLE(uap, model, arg);
987 
988 	if (STRUCT_FGET(uap, cmd) == NFS4_DQUERY)
989 		return (nfs4_dquery(arg, model));
990 
991 	if (secpolicy_nfs(CRED()) != 0)
992 		return (EPERM);
993 
994 	if ((fp = getf(STRUCT_FGET(uap, fd))) == NULL)
995 		return (EBADF);
996 
997 	/*
998 	 * Set read buffer size to rsize
999 	 * and add room for RPC headers.
1000 	 */
1001 	readsize = nfs3tsize() + (RPC_MAXDATASIZE - NFS_MAXDATA);
1002 	if (readsize < RPC_MAXDATASIZE)
1003 		readsize = RPC_MAXDATASIZE;
1004 
1005 	error = copyinstr((const char *)STRUCT_FGETP(uap, netid), buf,
1006 	    KNC_STRSIZE, &len);
1007 	if (error) {
1008 		releasef(STRUCT_FGET(uap, fd));
1009 		return (error);
1010 	}
1011 
1012 	cmd = STRUCT_FGET(uap, cmd);
1013 
1014 	if (cmd & NFS4_KRPC_START) {
1015 		addrmask.len = STRUCT_FGET(uap, addrmask.len);
1016 		addrmask.maxlen = STRUCT_FGET(uap, addrmask.maxlen);
1017 		addrmask.buf = kmem_alloc(addrmask.maxlen, KM_SLEEP);
1018 		error = copyin(STRUCT_FGETP(uap, addrmask.buf), addrmask.buf,
1019 		    addrmask.len);
1020 		if (error) {
1021 			releasef(STRUCT_FGET(uap, fd));
1022 			kmem_free(addrmask.buf, addrmask.maxlen);
1023 			return (error);
1024 		}
1025 	}
1026 	else
1027 		addrmask.buf = NULL;
1028 
1029 	error = copyinstr((const char *)STRUCT_FGETP(uap, addr), uaddr,
1030 	    sizeof (uaddr), &len);
1031 	if (error) {
1032 		releasef(STRUCT_FGET(uap, fd));
1033 		if (addrmask.buf)
1034 			kmem_free(addrmask.buf, addrmask.maxlen);
1035 		return (error);
1036 	}
1037 
1038 	error = copyinstr((const char *)STRUCT_FGETP(uap, protofmly), protofmly,
1039 	    sizeof (protofmly), &len);
1040 	if (error) {
1041 		releasef(STRUCT_FGET(uap, fd));
1042 		if (addrmask.buf)
1043 			kmem_free(addrmask.buf, addrmask.maxlen);
1044 		return (error);
1045 	}
1046 
1047 	error = copyinstr((const char *)STRUCT_FGETP(uap, proto), proto,
1048 	    sizeof (proto), &len);
1049 	if (error) {
1050 		releasef(STRUCT_FGET(uap, fd));
1051 		if (addrmask.buf)
1052 			kmem_free(addrmask.buf, addrmask.maxlen);
1053 		return (error);
1054 	}
1055 
1056 	ncg = zone_getspecific(nfs4_callback_zone_key, nfs_zone());
1057 	ASSERT(ncg != NULL);
1058 
1059 	mutex_enter(&ncg->nfs4_cb_lock);
1060 	if (cmd & NFS4_SETPORT)
1061 		nfs4_setport(buf, uaddr, protofmly, proto, ncg);
1062 
1063 	if (cmd & NFS4_KRPC_START) {
1064 		error = svc_tli_kcreate(fp, readsize, buf, &addrmask, &cb_xprt,
1065 		    &nfs4_cb_sct, NULL, NFS_CB_SVCPOOL_ID, FALSE);
1066 		if (error) {
1067 			CB_WARN1("nfs4_svc: svc_tli_kcreate failed %d\n",
1068 				error);
1069 			kmem_free(addrmask.buf, addrmask.maxlen);
1070 		}
1071 	}
1072 
1073 	mutex_exit(&ncg->nfs4_cb_lock);
1074 	releasef(STRUCT_FGET(uap, fd));
1075 	return (error);
1076 }
1077 
1078 struct nfs4_callback_globals *
1079 nfs4_get_callback_globals(void)
1080 {
1081 	return (zone_getspecific(nfs4_callback_zone_key, nfs_zone()));
1082 }
1083 
1084 static void *
1085 nfs4_callback_init_zone(zoneid_t zoneid)
1086 {
1087 	kstat_t *nfs4_callback_kstat;
1088 	struct nfs4_callback_globals *ncg;
1089 
1090 	ncg = kmem_zalloc(sizeof (*ncg), KM_SLEEP);
1091 
1092 	ncg->nfs4prog2server = kmem_zalloc(nfs4_num_prognums *
1093 		sizeof (struct nfs4_server *), KM_SLEEP);
1094 
1095 	/* initialize the dlist */
1096 	mutex_init(&ncg->nfs4_dlist_lock, NULL, MUTEX_DEFAULT, NULL);
1097 	list_create(&ncg->nfs4_dlist, sizeof (struct nfs4_dnode),
1098 	    offsetof(struct nfs4_dnode, linkage));
1099 
1100 	/* initialize cb_port list */
1101 	mutex_init(&ncg->nfs4_cb_lock, NULL, MUTEX_DEFAULT, NULL);
1102 	list_create(&ncg->nfs4_cb_ports, sizeof (struct nfs4_cb_port),
1103 	    offsetof(struct nfs4_cb_port, linkage));
1104 
1105 	/* get our own copy of the kstats */
1106 	bcopy(&nfs4_callback_stats_tmpl, &ncg->nfs4_callback_stats,
1107 	    sizeof (nfs4_callback_stats_tmpl));
1108 	/* register "nfs:0:nfs4_callback_stats" for this zone */
1109 	if ((nfs4_callback_kstat =
1110 		kstat_create_zone("nfs", 0, "nfs4_callback_stats", "misc",
1111 		    KSTAT_TYPE_NAMED,
1112 		    sizeof (ncg->nfs4_callback_stats) / sizeof (kstat_named_t),
1113 		    KSTAT_FLAG_VIRTUAL | KSTAT_FLAG_WRITABLE,
1114 		    zoneid)) != NULL) {
1115 		nfs4_callback_kstat->ks_data = &ncg->nfs4_callback_stats;
1116 		kstat_install(nfs4_callback_kstat);
1117 	}
1118 	return (ncg);
1119 }
1120 
/*
 * Discard every delegation held by the servers registered in this zone's
 * program-number table (ncg->nfs4prog2server).
 *
 * For each table slot the server pointer is read under nfs4_cb_lock and
 * then validated with nfs4_server_vlock(); slots for which that returns
 * FALSE are skipped.  The code that follows a successful
 * nfs4_server_vlock() treats sp->s_lock as held and one extra reference
 * as taken (it drops both before moving to the next slot).
 */
static void
nfs4_discard_delegations(struct nfs4_callback_globals *ncg)
{
	nfs4_server_t *sp;
	int i, num_removed;

	/*
	 * It's OK here to just run through the registered "programs", as
	 * servers without programs won't have any delegations to handle.
	 */
	for (i = 0; i < nfs4_num_prognums; i++) {
		rnode4_t *rp;

		mutex_enter(&ncg->nfs4_cb_lock);
		sp = ncg->nfs4prog2server[i];
		mutex_exit(&ncg->nfs4_cb_lock);

		/*
		 * NOTE(review): sp may be NULL for an unused slot; this
		 * relies on nfs4_server_vlock() returning FALSE for it --
		 * confirm against its definition.
		 */
		if (nfs4_server_vlock(sp, 1) == FALSE)
			continue;
		num_removed = 0;
		/* Drain the server's delegation list from the head. */
		while ((rp = list_head(&sp->s_deleg_list)) != NULL) {
			mutex_enter(&rp->r_statev4_lock);
			if (rp->r_deleg_type == OPEN_DELEGATE_NONE) {
				/*
				 * We need to take matters into our own hands,
				 * as nfs4delegreturn_cleanup_impl() won't
				 * remove this from the list.
				 */
				list_remove(&sp->s_deleg_list, rp);
				mutex_exit(&rp->r_statev4_lock);
				nfs4_dec_state_ref_count_nolock(sp,
				    VTOMI4(RTOV4(rp)));
				/* the matching rele is deferred until s_lock is dropped */
				num_removed++;
				continue;
			}
			mutex_exit(&rp->r_statev4_lock);
			/* keep the vnode around while s_lock is dropped */
			VN_HOLD(RTOV4(rp));
			/* cleanup_impl takes s_lock itself when np != NULL */
			mutex_exit(&sp->s_lock);
			/*
			 * The following will remove the node from the list.
			 */
			nfs4delegreturn_cleanup_impl(rp, sp, ncg);
			VN_RELE(RTOV4(rp));
			mutex_enter(&sp->s_lock);
		}
		mutex_exit(&sp->s_lock);
		/* each removed list node reles a reference */
		while (num_removed-- > 0)
			nfs4_server_rele(sp);
		/* remove our reference for nfs4_server_vlock */
		nfs4_server_rele(sp);
	}
}
1174 
1175 /* ARGSUSED */
1176 static void
1177 nfs4_callback_shutdown_zone(zoneid_t zoneid, void *data)
1178 {
1179 	struct nfs4_callback_globals *ncg = data;
1180 
1181 	/*
1182 	 * Clean pending delegation return list.
1183 	 */
1184 	nfs4_dlistclean_impl(ncg, NFS4_DR_DISCARD);
1185 
1186 	/*
1187 	 * Discard all delegations.
1188 	 */
1189 	nfs4_discard_delegations(ncg);
1190 }
1191 
1192 static void
1193 nfs4_callback_fini_zone(zoneid_t zoneid, void *data)
1194 {
1195 	struct nfs4_callback_globals *ncg = data;
1196 	struct nfs4_cb_port *p;
1197 	nfs4_server_t *sp, *next;
1198 	nfs4_server_t freelist;
1199 	int i;
1200 
1201 	kstat_delete_byname_zone("nfs", 0, "nfs4_callback_stats", zoneid);
1202 
1203 	/*
1204 	 * Discard all delegations that may have crept in since we did the
1205 	 * _shutdown.
1206 	 */
1207 	nfs4_discard_delegations(ncg);
1208 	/*
1209 	 * We're completely done with this zone and all associated
1210 	 * nfs4_server_t's.  Any remaining nfs4_server_ts should only have one
1211 	 * more reference outstanding -- the reference we didn't release in
1212 	 * nfs4_renew_lease_thread().
1213 	 *
1214 	 * Here we need to run through the global nfs4_server_lst as we need to
1215 	 * deal with nfs4_server_ts without programs, as they also have threads
1216 	 * created for them, and so have outstanding references that we need to
1217 	 * release.
1218 	 */
1219 	freelist.forw = &freelist;
1220 	freelist.back = &freelist;
1221 	mutex_enter(&nfs4_server_lst_lock);
1222 	sp = nfs4_server_lst.forw;
1223 	while (sp != &nfs4_server_lst) {
1224 		next = sp->forw;
1225 		if (sp->zoneid == zoneid) {
1226 			remque(sp);
1227 			insque(sp, &freelist);
1228 			mutex_enter(&sp->s_lock);
1229 			sp->s_flags &= ~N4S_INSERTED;
1230 			mutex_exit(&sp->s_lock);
1231 		}
1232 		sp = next;
1233 	}
1234 	mutex_exit(&nfs4_server_lst_lock);
1235 
1236 	sp = freelist.forw;
1237 	while (sp != &freelist) {
1238 		next = sp->forw;
1239 		sp->forw = sp->back = NULL;
1240 		nfs4_server_rele(sp);	/* free the list's reference */
1241 		sp = next;
1242 	}
1243 
1244 #ifdef DEBUG
1245 	for (i = 0; i < nfs4_num_prognums; i++) {
1246 		ASSERT(ncg->nfs4prog2server[i] == NULL);
1247 	}
1248 #endif
1249 	kmem_free(ncg->nfs4prog2server, nfs4_num_prognums *
1250 	    sizeof (struct nfs4_server *));
1251 
1252 	mutex_enter(&ncg->nfs4_cb_lock);
1253 	while ((p = list_head(&ncg->nfs4_cb_ports)) != NULL) {
1254 		list_remove(&ncg->nfs4_cb_ports, p);
1255 		kmem_free(p, sizeof (*p));
1256 	}
1257 	list_destroy(&ncg->nfs4_cb_ports);
1258 	mutex_destroy(&ncg->nfs4_cb_lock);
1259 	list_destroy(&ncg->nfs4_dlist);
1260 	mutex_destroy(&ncg->nfs4_dlist_lock);
1261 	kmem_free(ncg, sizeof (*ncg));
1262 }
1263 
1264 void
1265 nfs4_callback_init(void)
1266 {
1267 	int i;
1268 	SVC_CALLOUT *nfs4_cb_sc;
1269 
1270 	/* initialize the callback table */
1271 	nfs4_cb_sc = kmem_alloc(nfs4_num_prognums *
1272 		sizeof (SVC_CALLOUT), KM_SLEEP);
1273 
1274 	for (i = 0; i < nfs4_num_prognums; i++) {
1275 		nfs4_cb_sc[i].sc_prog = NFS4_CALLBACK+i;
1276 		nfs4_cb_sc[i].sc_versmin = NFS_CB;
1277 		nfs4_cb_sc[i].sc_versmax = NFS_CB;
1278 		nfs4_cb_sc[i].sc_dispatch = cb_dispatch;
1279 	}
1280 
1281 	nfs4_cb_sct.sct_size = nfs4_num_prognums;
1282 	nfs4_cb_sct.sct_free = FALSE;
1283 	nfs4_cb_sct.sct_sc = nfs4_cb_sc;
1284 
1285 	/*
1286 	 * Compute max bytes required for dyamically allocated parts
1287 	 * of cb_getattr reply.  Only size and change are supported now.
1288 	 * If CB_GETATTR is changed to reply with additional attrs,
1289 	 * additional sizes must be added below.
1290 	 *
1291 	 * fattr4_change + fattr4_size == uint64_t + uint64_t
1292 	 */
1293 	cb_getattr_bytes = 2 * BYTES_PER_XDR_UNIT + 2 * BYTES_PER_XDR_UNIT;
1294 
1295 	zone_key_create(&nfs4_callback_zone_key, nfs4_callback_init_zone,
1296 	    nfs4_callback_shutdown_zone, nfs4_callback_fini_zone);
1297 }
1298 
/*
 * Teardown hook; currently does nothing.
 *
 * NOTE(review): presumably the counterpart of nfs4_callback_init().
 * Nothing set up there (the callout table, the zone key) is released
 * here -- confirm that this is intentional.
 */
void
nfs4_callback_fini(void)
{
}
1303 
1304 /*
1305  * NB: This function can be called from the *wrong* zone (ie, the zone that
1306  * 'rp' belongs to and the caller's zone may not be the same).  This can happen
1307  * if the zone is going away and we get called from nfs4_async_inactive().  In
1308  * this case the globals will be NULL and we won't update the counters, which
1309  * doesn't matter as the zone is going away anyhow.
1310  */
1311 static void
1312 nfs4delegreturn_cleanup_impl(rnode4_t *rp, nfs4_server_t *np,
1313 	struct nfs4_callback_globals *ncg)
1314 {
1315 	mntinfo4_t *mi = VTOMI4(RTOV4(rp));
1316 	boolean_t need_rele = B_FALSE;
1317 
1318 	mutex_enter(&rp->r_statev4_lock);
1319 
1320 	if (rp->r_deleg_type == OPEN_DELEGATE_NONE) {
1321 		mutex_exit(&rp->r_statev4_lock);
1322 		return;
1323 	}
1324 
1325 	/*
1326 	 * Free the cred originally held when
1327 	 * the delegation was granted.  Caller must
1328 	 * hold this cred if it wants to use it after
1329 	 * this call.
1330 	 */
1331 	crfree(rp->r_deleg_cred);
1332 	rp->r_deleg_cred = NULL;
1333 	rp->r_deleg_type = OPEN_DELEGATE_NONE;
1334 	rp->r_deleg_needs_recovery = OPEN_DELEGATE_NONE;
1335 	rp->r_deleg_needs_recall = FALSE;
1336 	rp->r_deleg_return_pending = FALSE;
1337 	mutex_exit(&rp->r_statev4_lock);
1338 
1339 	/*
1340 	 * Caller must be holding mi_recovlock in read mode
1341 	 * to call here.  This is provided by start_op.
1342 	 */
1343 
1344 	if (np == NULL) {
1345 		np = find_nfs4_server_all(mi, 1);
1346 		ASSERT(np != NULL);
1347 		need_rele = B_TRUE;
1348 	} else {
1349 		mutex_enter(&np->s_lock);
1350 	}
1351 
1352 	/*
1353 	 * Remove the rnode from the server's list and
1354 	 * update the ref counts.
1355 	 */
1356 	list_remove(&np->s_deleg_list, rp);
1357 	nfs4_dec_state_ref_count_nolock(np, mi);
1358 	mutex_exit(&np->s_lock);
1359 	/* removed list node removes a reference */
1360 	nfs4_server_rele(np);
1361 	if (need_rele)
1362 		nfs4_server_rele(np);
1363 	if (ncg != NULL)
1364 		ncg->nfs4_callback_stats.delegations.value.ui64--;
1365 }
1366 
1367 void
1368 nfs4delegreturn_cleanup(rnode4_t *rp, nfs4_server_t *np)
1369 {
1370 	struct nfs4_callback_globals *ncg;
1371 
1372 	if (np != NULL) {
1373 		ncg = np->zone_globals;
1374 	} else if (nfs_zone() == VTOMI4(RTOV4(rp))->mi_zone) {
1375 		ncg = zone_getspecific(nfs4_callback_zone_key, nfs_zone());
1376 		ASSERT(ncg != NULL);
1377 	} else {
1378 		/*
1379 		 * Request coming from the wrong zone.
1380 		 */
1381 		ASSERT(getzoneid() == GLOBAL_ZONEID);
1382 		ncg = NULL;
1383 	}
1384 
1385 	nfs4delegreturn_cleanup_impl(rp, np, ncg);
1386 }
1387 
1388 static void
1389 nfs4delegreturn_save_lost_rqst(int error, nfs4_lost_rqst_t *lost_rqstp,
1390 	cred_t *cr, vnode_t *vp)
1391 {
1392 	if (error != ETIMEDOUT && error != EINTR &&
1393 	    !NFS4_FRC_UNMT_ERR(error, vp->v_vfsp)) {
1394 		lost_rqstp->lr_op = 0;
1395 		return;
1396 	}
1397 
1398 	NFS4_DEBUG(nfs4_lost_rqst_debug, (CE_NOTE,
1399 			"nfs4close_save_lost_rqst: error %d", error));
1400 
1401 	lost_rqstp->lr_op = OP_DELEGRETURN;
1402 	/*
1403 	 * The vp is held and rele'd via the recovery code.
1404 	 * See nfs4_save_lost_rqst.
1405 	 */
1406 	lost_rqstp->lr_vp = vp;
1407 	lost_rqstp->lr_dvp = NULL;
1408 	lost_rqstp->lr_oop = NULL;
1409 	lost_rqstp->lr_osp = NULL;
1410 	lost_rqstp->lr_lop = NULL;
1411 	lost_rqstp->lr_cr = cr;
1412 	lost_rqstp->lr_flk = NULL;
1413 	lost_rqstp->lr_putfirst = FALSE;
1414 }
1415 
1416 static void
1417 nfs4delegreturn_otw(rnode4_t *rp, cred_t *cr, nfs4_error_t *ep)
1418 {
1419 	COMPOUND4args_clnt args;
1420 	COMPOUND4res_clnt res;
1421 	nfs_argop4 argops[3];
1422 	nfs4_ga_res_t *garp = NULL;
1423 	hrtime_t t;
1424 	int numops;
1425 	int doqueue = 1;
1426 
1427 	args.ctag = TAG_DELEGRETURN;
1428 
1429 	numops = 3;		/* PUTFH, GETATTR, DELEGRETURN */
1430 
1431 	args.array = argops;
1432 	args.array_len = numops;
1433 
1434 	argops[0].argop = OP_CPUTFH;
1435 	argops[0].nfs_argop4_u.opcputfh.sfh = rp->r_fh;
1436 
1437 	argops[1].argop = OP_GETATTR;
1438 	argops[1].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK;
1439 	argops[1].nfs_argop4_u.opgetattr.mi = VTOMI4(RTOV4(rp));
1440 
1441 	argops[2].argop = OP_DELEGRETURN;
1442 	argops[2].nfs_argop4_u.opdelegreturn.deleg_stateid =
1443 		rp->r_deleg_stateid;
1444 
1445 	t = gethrtime();
1446 	rfs4call(VTOMI4(RTOV4(rp)), &args, &res, cr, &doqueue, 0, ep);
1447 
1448 	if (ep->error)
1449 		return;
1450 
1451 	if (res.status == NFS4_OK) {
1452 		garp = &res.array[1].nfs_resop4_u.opgetattr.ga_res;
1453 		nfs4_attr_cache(RTOV4(rp), garp, t, cr, TRUE, NULL);
1454 
1455 	}
1456 	(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
1457 }
1458 
1459 int
1460 nfs4_do_delegreturn(rnode4_t *rp, int flags, cred_t *cr,
1461 	struct nfs4_callback_globals *ncg)
1462 {
1463 	vnode_t *vp = RTOV4(rp);
1464 	mntinfo4_t *mi = VTOMI4(vp);
1465 	nfs4_lost_rqst_t lost_rqst;
1466 	nfs4_recov_state_t recov_state;
1467 	bool_t needrecov = FALSE, recovonly, done = FALSE;
1468 	nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };
1469 
1470 	ncg->nfs4_callback_stats.delegreturn.value.ui64++;
1471 
1472 	while (!done) {
1473 		e.error = nfs4_start_fop(mi, vp, NULL, OH_DELEGRETURN,
1474 				&recov_state, &recovonly);
1475 
1476 		if (e.error) {
1477 			if (flags & NFS4_DR_FORCE) {
1478 				(void) nfs_rw_enter_sig(&mi->mi_recovlock,
1479 				    RW_READER, 0);
1480 				nfs4delegreturn_cleanup_impl(rp, NULL, ncg);
1481 				nfs_rw_exit(&mi->mi_recovlock);
1482 			}
1483 			break;
1484 		}
1485 
1486 		/*
1487 		 * Check to see if the delegation has already been
1488 		 * returned by the recovery thread.   The state of
1489 		 * the delegation cannot change at this point due
1490 		 * to start_fop and the r_deleg_recall_lock.
1491 		 */
1492 		if (rp->r_deleg_type == OPEN_DELEGATE_NONE) {
1493 			e.error = 0;
1494 			nfs4_end_op(mi, vp, NULL, &recov_state, needrecov);
1495 			break;
1496 		}
1497 
1498 		if (recovonly) {
1499 			/*
1500 			 * Delegation will be returned via the
1501 			 * recovery framework.  Build a lost request
1502 			 * structure, start recovery and get out.
1503 			 */
1504 			nfs4_error_init(&e, EINTR);
1505 			nfs4delegreturn_save_lost_rqst(e.error, &lost_rqst,
1506 				cr, vp);
1507 			(void) nfs4_start_recovery(&e, mi, vp,
1508 				NULL, &rp->r_deleg_stateid,
1509 				lost_rqst.lr_op == OP_DELEGRETURN ?
1510 				&lost_rqst : NULL, OP_DELEGRETURN, NULL);
1511 			nfs4_end_op(mi, vp, NULL, &recov_state, needrecov);
1512 			break;
1513 		}
1514 
1515 		nfs4delegreturn_otw(rp, cr, &e);
1516 
1517 		/*
1518 		 * Ignore some errors on delegreturn; no point in marking
1519 		 * the file dead on a state destroying operation.
1520 		 */
1521 		if (e.error == 0 && (nfs4_recov_marks_dead(e.stat) ||
1522 		    e.stat == NFS4ERR_BADHANDLE ||
1523 		    e.stat == NFS4ERR_STALE))
1524 			needrecov = FALSE;
1525 		else
1526 			needrecov = nfs4_needs_recovery(&e, TRUE, vp->v_vfsp);
1527 
1528 		if (needrecov) {
1529 			nfs4delegreturn_save_lost_rqst(e.error, &lost_rqst,
1530 				cr, vp);
1531 			(void) nfs4_start_recovery(&e, mi, vp,
1532 				NULL, &rp->r_deleg_stateid,
1533 				lost_rqst.lr_op == OP_DELEGRETURN ?
1534 				&lost_rqst : NULL, OP_DELEGRETURN, NULL);
1535 		} else {
1536 			nfs4delegreturn_cleanup_impl(rp, NULL, ncg);
1537 			done = TRUE;
1538 		}
1539 
1540 		nfs4_end_op(mi, vp, NULL, &recov_state, needrecov);
1541 	}
1542 	return (e.error);
1543 }
1544 
1545 /*
1546  * nfs4_resend_delegreturn - used to drive the delegreturn
1547  * operation via the recovery thread.
1548  */
1549 void
1550 nfs4_resend_delegreturn(nfs4_lost_rqst_t *lorp, nfs4_error_t *ep,
1551 	nfs4_server_t *np)
1552 {
1553 	rnode4_t *rp = VTOR4(lorp->lr_vp);
1554 
1555 	/* If the file failed recovery, just quit. */
1556 	mutex_enter(&rp->r_statelock);
1557 	if (rp->r_flags & R4RECOVERR) {
1558 		ep->error = EIO;
1559 	}
1560 	mutex_exit(&rp->r_statelock);
1561 
1562 	if (!ep->error)
1563 		nfs4delegreturn_otw(rp, lorp->lr_cr, ep);
1564 
1565 	/*
1566 	 * If recovery is now needed, then return the error
1567 	 * and status and let the recovery thread handle it,
1568 	 * including re-driving another delegreturn.  Otherwise,
1569 	 * just give up and clean up the delegation.
1570 	 */
1571 	if (nfs4_needs_recovery(ep, TRUE, lorp->lr_vp->v_vfsp))
1572 		return;
1573 
1574 	if (rp->r_deleg_type != OPEN_DELEGATE_NONE)
1575 		nfs4delegreturn_cleanup(rp, np);
1576 
1577 	nfs4_error_zinit(ep);
1578 }
1579 
/*
 * nfs4delegreturn - general function to return a delegation.
 *
 * rp    - rnode whose delegation is to be returned
 * flags - any combination of the NFS4_DR_* flags below
 * ncg   - callback globals passed on to the helpers that update counters
 *
 * NFS4_DR_FORCE - return the delegation even if start_op fails
 * NFS4_DR_PUSH - push modified data back to the server via VOP_PUTPAGE
 * NFS4_DR_DISCARD - discard the delegation w/o delegreturn
 * NFS4_DR_DID_OP - calling function already did nfs4_start_op
 * NFS4_DR_RECALL - delegreturn initiated via CB_RECALL
 * NFS4_DR_REOPEN - do file reopens, if applicable
 *
 * Returns 0, or the error from deleg_reopen() / nfs4_do_delegreturn().
 * The NFS4_DR_DID_OP path only queues an asynchronous return and always
 * returns 0.
 */
static int
nfs4delegreturn_impl(rnode4_t *rp, int flags, struct nfs4_callback_globals *ncg)
{
	int error = 0;
	cred_t *cr = NULL;
	vnode_t *vp;
	bool_t needrecov = FALSE;
	bool_t rw_entered = FALSE;
	bool_t do_reopen;

	vp = RTOV4(rp);

	/*
	 * If NFS4_DR_DISCARD is set by itself, take a short-cut and
	 * discard without doing an otw DELEGRETURN.  This may only be used
	 * by the recovery thread because it bypasses the synchronization
	 * with r_deleg_recall_lock and mi->mi_recovlock.
	 */
	if (flags == NFS4_DR_DISCARD) {
		nfs4delegreturn_cleanup_impl(rp, NULL, ncg);
		return (0);
	}

	if (flags & NFS4_DR_DID_OP) {
		/*
		 * Caller had already done start_op, which means the
		 * r_deleg_recall_lock is already held in READ mode
		 * so we cannot take it in write mode.  Return the
		 * delegation asynchronously.
		 *
		 * Remove the NFS4_DR_DID_OP flag so we don't
		 * get stuck looping through here.
		 */
		/* this hold is released by the async thread when it finishes */
		VN_HOLD(vp);
		nfs4delegreturn_async(rp, (flags & ~NFS4_DR_DID_OP), FALSE);
		return (0);
	}

	/*
	 * Take r_deleg_recall_lock to verify we still have a delegation
	 * and to crhold the credential.  We have to release the lock
	 * before we call VOP_PUTPAGE or else we'll deadlock.
	 */
	(void) nfs_rw_enter_sig(&rp->r_deleg_recall_lock, RW_WRITER, FALSE);
	rw_entered = TRUE;
	if (rp->r_deleg_type == OPEN_DELEGATE_NONE)
		goto out;
	cr = rp->r_deleg_cred;
	crhold(cr);
	nfs_rw_exit(&rp->r_deleg_recall_lock);
	rw_entered = FALSE;

	/*
	 * Push the modified data back to the server synchronously
	 * before doing DELEGRETURN.
	 */
	if (flags & NFS4_DR_PUSH)
		(void) VOP_PUTPAGE(vp, 0, 0, 0, cr);

	/*
	 * Take r_deleg_recall_lock in WRITE mode, this will prevent
	 * nfs4_is_otw_open_necessary from trying to use the delegation
	 * while the DELEGRETURN is in progress.
	 */
	(void) nfs_rw_enter_sig(&rp->r_deleg_recall_lock, RW_WRITER, FALSE);

	rw_entered = TRUE;

	/* the lock was dropped above, so re-check that we still hold one */
	if (rp->r_deleg_type == OPEN_DELEGATE_NONE)
		goto out;

	if (flags & NFS4_DR_REOPEN) {
		/*
		 * If R4RECOVERRP is already set, then skip re-opening
		 * the delegation open streams and go straight to doing
		 * delegreturn.  (XXX if the file has failed recovery, then the
		 * delegreturn attempt is likely to be futile.)
		 */
		mutex_enter(&rp->r_statelock);
		do_reopen = !(rp->r_flags & R4RECOVERRP);
		mutex_exit(&rp->r_statelock);

		if (do_reopen) {
			error = deleg_reopen(vp, &needrecov, ncg, flags);
			/*
			 * A reopen error stops us unless FORCE or RECALL is
			 * set; started recovery stops us unless FORCE is set.
			 */
			if (error != 0) {
				if ((flags & (NFS4_DR_FORCE | NFS4_DR_RECALL))
									== 0)
					goto out;
			} else if (needrecov) {
				if ((flags & NFS4_DR_FORCE) == 0)
					goto out;
			}
		}
	}

	if (flags & NFS4_DR_DISCARD) {
		mntinfo4_t *mi = VTOMI4(RTOV4(rp));

		mutex_enter(&rp->r_statelock);
		/*
		 * deleg_return_pending is cleared inside of delegation_accept
		 * when a delegation is accepted.  if this flag has been
		 * cleared, then a new delegation has overwritten the one we
		 * were about to throw away.
		 */
		if (!rp->r_deleg_return_pending) {
			mutex_exit(&rp->r_statelock);
			goto out;
		}
		mutex_exit(&rp->r_statelock);
		/* cleanup_impl requires mi_recovlock held as reader */
		(void) nfs_rw_enter_sig(&mi->mi_recovlock, RW_READER, FALSE);
		nfs4delegreturn_cleanup_impl(rp, NULL, ncg);
		nfs_rw_exit(&mi->mi_recovlock);
	} else {
		error = nfs4_do_delegreturn(rp, flags, cr, ncg);
	}

out:
	/* common exit: drop our cred hold and the recall lock if we own them */
	if (cr)
		crfree(cr);
	if (rw_entered)
		nfs_rw_exit(&rp->r_deleg_recall_lock);
	return (error);
}
1714 
1715 int
1716 nfs4delegreturn(rnode4_t *rp, int flags)
1717 {
1718 	struct nfs4_callback_globals *ncg;
1719 
1720 	ncg = zone_getspecific(nfs4_callback_zone_key, nfs_zone());
1721 	ASSERT(ncg != NULL);
1722 
1723 	return (nfs4delegreturn_impl(rp, flags, ncg));
1724 }
1725 
1726 void
1727 nfs4delegreturn_async(rnode4_t *rp, int flags, bool_t trunc)
1728 {
1729 	struct cb_recall_pass *pp;
1730 
1731 	pp = kmem_alloc(sizeof (struct cb_recall_pass), KM_SLEEP);
1732 	pp->rp = rp;
1733 	pp->flags = flags;
1734 	pp->truncate = trunc;
1735 
1736 	/*
1737 	 * Fire up a thread to do the actual delegreturn
1738 	 * Caller must guarantee that the rnode doesn't
1739 	 * vanish (by calling VN_HOLD).
1740 	 */
1741 
1742 	(void) zthread_create(NULL, 0, nfs4delegreturn_thread, pp, 0,
1743 				minclsyspri);
1744 }
1745 
1746 static void
1747 delegreturn_all_thread(rpcprog_t *pp)
1748 {
1749 	nfs4_server_t *np;
1750 	bool_t found = FALSE;
1751 	rpcprog_t prog;
1752 	rnode4_t *rp;
1753 	vnode_t *vp;
1754 	zoneid_t zoneid = getzoneid();
1755 	struct nfs4_callback_globals *ncg;
1756 
1757 	NFS4_DEBUG(nfs4_drat_debug,
1758 		(CE_NOTE, "delereturn_all_thread: prog %d\n", *pp));
1759 
1760 	prog = *pp;
1761 	kmem_free(pp, sizeof (*pp));
1762 	pp = NULL;
1763 
1764 	mutex_enter(&nfs4_server_lst_lock);
1765 	for (np = nfs4_server_lst.forw; np != &nfs4_server_lst; np = np->forw) {
1766 		if (np->zoneid == zoneid && np->s_program == prog) {
1767 			mutex_enter(&np->s_lock);
1768 			found = TRUE;
1769 			break;
1770 		}
1771 	}
1772 	mutex_exit(&nfs4_server_lst_lock);
1773 
1774 	/*
1775 	 * It's possible that the nfs4_server which was using this
1776 	 * program number has vanished since this thread is async.
1777 	 * If so, just return.  Your work here is finished, my friend.
1778 	 */
1779 	if (!found)
1780 		goto out;
1781 
1782 	ncg = np->zone_globals;
1783 	while ((rp = list_head(&np->s_deleg_list)) != NULL) {
1784 		vp = RTOV4(rp);
1785 		VN_HOLD(vp);
1786 		mutex_exit(&np->s_lock);
1787 		(void) nfs4delegreturn_impl(rp, NFS4_DR_PUSH|NFS4_DR_REOPEN,
1788 									ncg);
1789 		VN_RELE(vp);
1790 
1791 		/* retake the s_lock for next trip through the loop */
1792 		mutex_enter(&np->s_lock);
1793 	}
1794 	mutex_exit(&np->s_lock);
1795 out:
1796 	NFS4_DEBUG(nfs4_drat_debug,
1797 		(CE_NOTE, "delereturn_all_thread: complete\n"));
1798 	zthread_exit();
1799 }
1800 
1801 void
1802 nfs4_delegreturn_all(nfs4_server_t *sp)
1803 {
1804 	rpcprog_t pro, *pp;
1805 
1806 	mutex_enter(&sp->s_lock);
1807 
1808 	/* Check to see if the delegation list is empty */
1809 
1810 	if (list_head(&sp->s_deleg_list) == NULL) {
1811 		mutex_exit(&sp->s_lock);
1812 		return;
1813 	}
1814 	/*
1815 	 * Grab the program number; the async thread will use this
1816 	 * to find the nfs4_server.
1817 	 */
1818 	pro = sp->s_program;
1819 	mutex_exit(&sp->s_lock);
1820 	pp = kmem_alloc(sizeof (rpcprog_t), KM_SLEEP);
1821 	*pp = pro;
1822 	(void) zthread_create(NULL, 0, delegreturn_all_thread, pp, 0,
1823 	    minclsyspri);
1824 }
1825 
1826 
1827 /*
1828  * Discard any delegations
1829  *
1830  * Iterate over the servers s_deleg_list and
1831  * for matching mount-point rnodes discard
1832  * the delegation.
1833  */
1834 void
1835 nfs4_deleg_discard(mntinfo4_t *mi, nfs4_server_t *sp)
1836 {
1837 	rnode4_t *rp, *next;
1838 	mntinfo4_t *r_mi;
1839 	struct nfs4_callback_globals *ncg;
1840 
1841 	ASSERT(mutex_owned(&sp->s_lock));
1842 	ncg = sp->zone_globals;
1843 
1844 	for (rp = list_head(&sp->s_deleg_list); rp != NULL; rp = next) {
1845 		r_mi = VTOMI4(RTOV4(rp));
1846 		next = list_next(&sp->s_deleg_list, rp);
1847 
1848 		if (r_mi != mi) {
1849 			/*
1850 			 * Skip if this rnode is in not on the
1851 			 * same mount-point
1852 			 */
1853 			continue;
1854 		}
1855 
1856 		ASSERT(rp->r_deleg_type == OPEN_DELEGATE_READ);
1857 
1858 #ifdef DEBUG
1859 		if (nfs4_client_recov_debug) {
1860 			zprintf(getzoneid(),
1861 			    "nfs4_deleg_discard: matched rnode %p "
1862 			"-- discarding delegation\n", (void *)rp);
1863 		}
1864 #endif
1865 		mutex_enter(&rp->r_statev4_lock);
1866 		/*
1867 		 * Free the cred originally held when the delegation
1868 		 * was granted. Also need to decrement the refcnt
1869 		 * on this server for each delegation we discard
1870 		 */
1871 		if (rp->r_deleg_cred)
1872 			crfree(rp->r_deleg_cred);
1873 		rp->r_deleg_cred = NULL;
1874 		rp->r_deleg_type = OPEN_DELEGATE_NONE;
1875 		rp->r_deleg_needs_recovery = OPEN_DELEGATE_NONE;
1876 		rp->r_deleg_needs_recall = FALSE;
1877 		ASSERT(sp->s_refcnt > 1);
1878 		sp->s_refcnt--;
1879 		list_remove(&sp->s_deleg_list, rp);
1880 		mutex_exit(&rp->r_statev4_lock);
1881 		nfs4_dec_state_ref_count_nolock(sp, mi);
1882 		ncg->nfs4_callback_stats.delegations.value.ui64--;
1883 	}
1884 }
1885 
1886 /*
1887  * Reopen any open streams that were covered by the given file's
1888  * delegation.
1889  * Returns zero or an errno value.  If there was no error, *recovp
1890  * indicates whether recovery was initiated.
1891  */
1892 
1893 static int
1894 deleg_reopen(vnode_t *vp, bool_t *recovp, struct nfs4_callback_globals *ncg,
1895 	int flags)
1896 {
1897 	nfs4_open_stream_t *osp;
1898 	nfs4_recov_state_t recov_state;
1899 	bool_t needrecov = FALSE;
1900 	mntinfo4_t *mi;
1901 	rnode4_t *rp;
1902 	nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };
1903 	int claimnull;
1904 
1905 	mi = VTOMI4(vp);
1906 	rp = VTOR4(vp);
1907 
1908 	recov_state.rs_flags = 0;
1909 	recov_state.rs_num_retry_despite_err = 0;
1910 
1911 retry:
1912 	if ((e.error = nfs4_start_op(mi, vp, NULL, &recov_state)) != 0) {
1913 		return (e.error);
1914 	}
1915 
1916 	/*
1917 	 * if we mean to discard the delegation, it must be BAD, so don't
1918 	 * use it when doing the reopen or it will fail too.
1919 	 */
1920 	claimnull = (flags & NFS4_DR_DISCARD);
1921 	/*
1922 	 * Loop through the open streams for this rnode to find
1923 	 * all of the ones created using the delegation state ID.
1924 	 * Each of these needs to be re-opened.
1925 	 */
1926 
1927 	while ((osp = get_next_deleg_stream(rp, claimnull)) != NULL) {
1928 
1929 		if (claimnull) {
1930 			nfs4_reopen(vp, osp, &e, CLAIM_NULL, FALSE, FALSE);
1931 		} else {
1932 			ncg->nfs4_callback_stats.claim_cur.value.ui64++;
1933 
1934 			nfs4_reopen(vp, osp, &e, CLAIM_DELEGATE_CUR, FALSE,
1935 					FALSE);
1936 			if (e.error == 0 && e.stat == NFS4_OK)
1937 				ncg->nfs4_callback_stats.
1938 					claim_cur_ok.value.ui64++;
1939 		}
1940 
1941 		if (e.error == EAGAIN) {
1942 			nfs4_end_op(mi, vp, NULL, &recov_state, TRUE);
1943 			goto retry;
1944 		}
1945 
1946 		/*
1947 		 * if error is EINTR, ETIMEDOUT, or NFS4_FRC_UNMT_ERR, then
1948 		 * recovery has already been started inside of nfs4_reopen.
1949 		 */
1950 		if (e.error == EINTR || e.error == ETIMEDOUT ||
1951 		    NFS4_FRC_UNMT_ERR(e.error, vp->v_vfsp)) {
1952 			open_stream_rele(osp, rp);
1953 			break;
1954 		}
1955 
1956 		needrecov = nfs4_needs_recovery(&e, TRUE, vp->v_vfsp);
1957 
1958 		if (e.error != 0 && !needrecov) {
1959 			/*
1960 			 * Recovery is not possible, but don't give up yet;
1961 			 * we'd still like to do delegreturn after
1962 			 * reopening as many streams as possible.
1963 			 * Continue processing the open streams.
1964 			 */
1965 
1966 			ncg->nfs4_callback_stats.recall_failed.value.ui64++;
1967 
1968 		} else if (needrecov) {
1969 			/*
1970 			 * Start recovery and bail out.  The recovery
1971 			 * thread will take it from here.
1972 			 */
1973 			(void) nfs4_start_recovery(&e, mi, vp, NULL, NULL,
1974 				NULL, OP_OPEN, NULL);
1975 			open_stream_rele(osp, rp);
1976 			*recovp = TRUE;
1977 			break;
1978 		}
1979 
1980 		open_stream_rele(osp, rp);
1981 	}
1982 
1983 	nfs4_end_op(mi, vp, NULL, &recov_state, needrecov);
1984 
1985 	return (e.error);
1986 }
1987 
1988 /*
1989  * get_next_deleg_stream - returns the next open stream which
1990  * represents a delegation for this rnode.  In order to assure
1991  * forward progress, the caller must guarantee that each open
1992  * stream returned is changed so that a future call won't return
1993  * it again.
1994  *
1995  * There are several ways for the open stream to change.  If the open
1996  * stream is !os_delegation, then we aren't interested in it.  Also, if
1997  * either os_failed_reopen or !os_valid, then don't return the osp.
1998  *
1999  * If claimnull is false (doing reopen CLAIM_DELEGATE_CUR) then return
2000  * the osp if it is an os_delegation open stream.  Also, if the rnode still
2001  * has r_deleg_return_pending, then return the os_delegation osp.  Lastly,
2002  * if the rnode's r_deleg_stateid is different from the osp's open_stateid,
2003  * then return the osp.
2004  *
2005  * We have already taken the 'r_deleg_recall_lock' as WRITER, which
2006  * prevents new OPENs from going OTW (as start_fop takes this
2007  * lock in READ mode); thus, no new open streams can be created
2008  * (which inheretly means no new delegation open streams are
2009  * being created).
2010  */
2011 
2012 static nfs4_open_stream_t *
2013 get_next_deleg_stream(rnode4_t *rp, int claimnull)
2014 {
2015 	nfs4_open_stream_t	*osp;
2016 
2017 	ASSERT(nfs_rw_lock_held(&rp->r_deleg_recall_lock, RW_WRITER));
2018 
2019 	/*
2020 	 * Search through the list of open streams looking for
2021 	 * one that was created while holding the delegation.
2022 	 */
2023 	mutex_enter(&rp->r_os_lock);
2024 	for (osp = list_head(&rp->r_open_streams); osp != NULL;
2025 	    osp = list_next(&rp->r_open_streams, osp)) {
2026 		mutex_enter(&osp->os_sync_lock);
2027 		if (!osp->os_delegation || osp->os_failed_reopen ||
2028 		    !osp->os_valid) {
2029 			mutex_exit(&osp->os_sync_lock);
2030 			continue;
2031 		}
2032 		if (!claimnull || rp->r_deleg_return_pending ||
2033 		    !stateid4_cmp(&osp->open_stateid, &rp->r_deleg_stateid)) {
2034 			osp->os_ref_count++;
2035 			mutex_exit(&osp->os_sync_lock);
2036 			mutex_exit(&rp->r_os_lock);
2037 			return (osp);
2038 		}
2039 		mutex_exit(&osp->os_sync_lock);
2040 	}
2041 	mutex_exit(&rp->r_os_lock);
2042 
2043 	return (NULL);
2044 }
2045 
/*
 * Thread body started by nfs4delegreturn_async().
 *
 * 'args' carries the rnode, the NFS4_DR_* flags and the truncate
 * indication.  If the rnode still holds a delegation, dirty or stale
 * pages are flushed or invalidated as needed and the delegation is
 * returned through nfs4delegreturn_impl().  On every path 'args' is
 * freed, one hold on the vnode is released (the one the creator of this
 * thread was required to take), and the thread exits via zthread_exit().
 */
static void
nfs4delegreturn_thread(struct cb_recall_pass *args)
{
	rnode4_t *rp;
	vnode_t *vp;
	cred_t *cr;
	int dtype, error, flags;
	bool_t rdirty, rip;
	kmutex_t cpr_lock;
	callb_cpr_t cpr_info;
	struct nfs4_callback_globals *ncg;

	ncg = zone_getspecific(nfs4_callback_zone_key, nfs_zone());
	ASSERT(ncg != NULL);

	mutex_init(&cpr_lock, NULL, MUTEX_DEFAULT, NULL);

	CALLB_CPR_INIT(&cpr_info, &cpr_lock, callb_generic_cpr,
			"nfsv4delegRtn");

	rp = args->rp;
	vp = RTOV4(rp);

	/* cheap early exit before taking r_rwlock */
	mutex_enter(&rp->r_statev4_lock);
	if (rp->r_deleg_type == OPEN_DELEGATE_NONE) {
		mutex_exit(&rp->r_statev4_lock);
		goto out;
	}
	mutex_exit(&rp->r_statev4_lock);

	/*
	 * Take the read-write lock in read mode to prevent other
	 * threads from modifying the data during the recall.  This
	 * doesn't affect mmappers.
	 */
	(void) nfs_rw_enter_sig(&rp->r_rwlock, RW_READER, FALSE);

	/* Proceed with delegreturn */

	/* re-check: the delegation may have gone while we waited for r_rwlock */
	mutex_enter(&rp->r_statev4_lock);
	if (rp->r_deleg_type == OPEN_DELEGATE_NONE) {
		mutex_exit(&rp->r_statev4_lock);
		nfs_rw_exit(&rp->r_rwlock);
		goto out;
	}
	dtype = rp->r_deleg_type;
	cr = rp->r_deleg_cred;
	ASSERT(cr != NULL);
	/* our own hold; released with crfree() below */
	crhold(cr);
	mutex_exit(&rp->r_statev4_lock);

	flags = args->flags;

	/*
	 * If the file is being truncated at the server, then throw
	 * away all of the pages, it doesn't matter what flavor of
	 * delegation we have.
	 */

	if (args->truncate) {
		ncg->nfs4_callback_stats.recall_trunc.value.ui64++;
		nfs4_invalidate_pages(vp, 0, cr);
	} else if (dtype == OPEN_DELEGATE_WRITE) {

		mutex_enter(&rp->r_statelock);
		rdirty = rp->r_flags & R4DIRTY;
		mutex_exit(&rp->r_statelock);

		if (rdirty) {
			error = VOP_PUTPAGE(vp, 0, 0, 0, cr);

			/* a failed push is only logged; the return goes ahead */
			if (error)
				CB_WARN1("nfs4delegreturn_thread:"
				" VOP_PUTPAGE: %d\n", error);
		}
		/* turn off NFS4_DR_PUSH because we just did that above. */
		flags &= ~NFS4_DR_PUSH;
	}

	mutex_enter(&rp->r_statelock);
	rip =  rp->r_flags & R4RECOVERRP;
	mutex_exit(&rp->r_statelock);

	/* If a failed recovery is indicated, discard the pages */

	if (rip) {

		error = VOP_PUTPAGE(vp, 0, 0, B_INVAL, cr);

		if (error)
			CB_WARN1("nfs4delegreturn_thread: VOP_PUTPAGE: %d\n",
				error);
	}

	/*
	 * Pass the flags to nfs4delegreturn_impl, but be sure not to pass
	 * NFS4_DR_DID_OP, which just calls nfs4delegreturn_async again.
	 */
	flags &= ~NFS4_DR_DID_OP;

	(void) nfs4delegreturn_impl(rp, flags, ncg);

	nfs_rw_exit(&rp->r_rwlock);
	crfree(cr);
out:
	/* common exit for all paths, including the early no-delegation ones */
	kmem_free(args, sizeof (struct cb_recall_pass));
	VN_RELE(vp);
	/* cpr_lock is entered before CALLB_CPR_EXIT and never exited here */
	mutex_enter(&cpr_lock);
	CALLB_CPR_EXIT(&cpr_info);
	mutex_destroy(&cpr_lock);
	zthread_exit();
}
2158 
/*
 * nfs4_delegation_accept - Record on the rnode the delegation (if any)
 * described by an OPEN result.
 *
 * This function has one assumption that the caller of this function is
 * either doing recovery (therefore cannot call nfs4_start_op) or has
 * already called nfs4_start_op().
 *
 *	rp	rnode of the file the OPEN was done on
 *	claim	claim type of the OPEN.  It selects what happens when no
 *		delegation is granted but one is already held, and the
 *		server's recall flag is only honored for CLAIM_PREVIOUS.
 *	res	OPEN result; res->delegation describes what was granted
 *	garp	attributes; may be NULL.  For a write delegation,
 *		n4g_change seeds r_deleg_change when n4g_change_valid is
 *		set.
 *	cr	credential saved (with a hold) in r_deleg_cred when the
 *		rnode did not already hold a delegation
 *
 * Depending on the outcome this may call nfs4delegreturn_impl() directly
 * or queue the rnode with nfs4_dlistadd() for a later delegreturn.
 */
void
nfs4_delegation_accept(rnode4_t *rp, open_claim_type4 claim,  OPEN4res *res,
	nfs4_ga_res_t *garp, cred_t *cr)
{
	open_read_delegation4 *orp;
	open_write_delegation4 *owp;
	nfs4_server_t *np;
	bool_t already = FALSE;
	bool_t recall = FALSE;
	bool_t valid_garp = TRUE;
	long mapcnt;
	uint_t rflag;
	mntinfo4_t *mi;
	bool_t recov;
	struct nfs4_callback_globals *ncg;
	open_delegation_type4 odt;

	ncg = zone_getspecific(nfs4_callback_zone_key, nfs_zone());
	ASSERT(ncg != NULL);

	mutex_enter(&rp->r_statev4_lock);

	/* Remember whether this rnode already holds a delegation. */
	if (rp->r_deleg_type == OPEN_DELEGATE_READ ||
	    rp->r_deleg_type == OPEN_DELEGATE_WRITE)
		already = TRUE;

	odt = res->delegation.delegation_type;
	mutex_exit(&rp->r_statev4_lock);

	if (odt == OPEN_DELEGATE_READ) {

		/* Read delegation: save type, stateid and permissions. */
		mutex_enter(&rp->r_statev4_lock);
		rp->r_deleg_type = res->delegation.delegation_type;
		orp = &res->delegation.open_delegation4_u.read;
		rp->r_deleg_stateid = orp->stateid;
		rp->r_deleg_perms = orp->permissions;
		recall = orp->recall;
		mutex_exit(&rp->r_statev4_lock);

		ncg->nfs4_callback_stats.delegations.value.ui64++;
		ncg->nfs4_callback_stats.delegaccept_r.value.ui64++;

	} else if (odt == OPEN_DELEGATE_WRITE) {

		/*
		 * Write delegation.  r_statelock is taken before
		 * r_statev4_lock; r_mapcnt and r_flags are sampled below
		 * while both are held.
		 */
		mutex_enter(&rp->r_statelock);
		mutex_enter(&rp->r_statev4_lock);
		rp->r_deleg_type = res->delegation.delegation_type;
		owp = &res->delegation.open_delegation4_u.write;
		rp->r_deleg_stateid = owp->stateid;
		rp->r_deleg_perms = owp->permissions;
		rp->r_deleg_limit = owp->space_limit;
		recall = owp->recall;

		/*
		 * Without a valid change attribute the delegation is
		 * handed straight back at the end of this function
		 * (see the !valid_garp test there).
		 */
		if (garp == NULL || !garp->n4g_change_valid) {
			valid_garp = FALSE;
			rp->r_deleg_change = 0;
			rp->r_deleg_change_grant = 0;
		} else {
			rp->r_deleg_change = garp->n4g_change;
			rp->r_deleg_change_grant = garp->n4g_change;
		}
		mapcnt = rp->r_mapcnt;
		rflag = rp->r_flags;

		/*
		 * Update the delegation change attribute if
		 * there are mappers or the file is dirty.  This
		 * might be the case during recovery after server
		 * reboot.
		 */
		if (mapcnt > 0 || rflag & R4DIRTY)
			rp->r_deleg_change++;

		/* Only the upper 32 bits of each value are printed. */
		NFS4_DEBUG(nfs4_callback_debug, (CE_NOTE,
			"nfs4_delegation_accept: r_deleg_change: 0x%x\n",
			(int)(rp->r_deleg_change >> 32)));
		NFS4_DEBUG(nfs4_callback_debug, (CE_NOTE,
			"nfs4_delegation_accept: r_delg_change_grant: 0x%x\n",
			(int)(rp->r_deleg_change_grant >> 32)));

#ifdef	DEBUG
		/* DEBUG only: replace the space limit with a phony one. */
		if (nfs4_use_phony_limit == 1)
			rp->r_deleg_limit = nfs4_deleg_space_phony;
		if (nfs4_use_phony_limit == 2) {
			rp->r_deleg_limit = nfs4_deleg_space_phony2;
			rp->r_deleg_limit.nfs_space_limit4_u.mod_blocks =
				nfs4_deleg_space_phonyl;
		}
#endif
		mutex_exit(&rp->r_statev4_lock);
		mutex_exit(&rp->r_statelock);

		ncg->nfs4_callback_stats.delegations.value.ui64++;
		ncg->nfs4_callback_stats.delegaccept_rw.value.ui64++;

#ifdef	DEBUG

	/*
	 * DEBUG only: no delegation was granted, but the
	 * nfs4_deleg_accept_phony tunable asks us to fabricate one
	 * from the phony values.
	 */
	} else if (nfs4_deleg_accept_phony == OPEN_DELEGATE_READ) {

		mutex_enter(&rp->r_statev4_lock);
		rp->r_deleg_type = OPEN_DELEGATE_READ;
		rp->r_deleg_stateid = nfs4_deleg_any;
		rp->r_deleg_perms = nfs4_deleg_ace_phony;
		rp->r_deleg_change = nfs4_deleg_change_phony;
		rp->r_deleg_change_grant = rp->r_deleg_change;
		mutex_exit(&rp->r_statev4_lock);

	} else if (nfs4_deleg_accept_phony == OPEN_DELEGATE_WRITE) {

		mutex_enter(&rp->r_statev4_lock);
		rp->r_deleg_type = OPEN_DELEGATE_WRITE;
		rp->r_deleg_stateid = nfs4_deleg_any;
		rp->r_deleg_perms = nfs4_deleg_ace_phony;
		rp->r_deleg_limit = nfs4_deleg_space_phony;
		rp->r_deleg_change = nfs4_deleg_change_phony;
		rp->r_deleg_change_grant = rp->r_deleg_change;
		mutex_exit(&rp->r_statev4_lock);

#endif
	} else {

		/*
		 * No delegation came back with this OPEN.  If one is
		 * already held, the claim type decides whether to try
		 * to return it.
		 */
		if (already) {
			switch (claim) {

			case CLAIM_NULL:
			case CLAIM_PREVIOUS:
				/*
				 * The file may already have a delegation when
				 * it is reopened during recovery.  In this
				 * case, we consider the delegation to no longer
				 * be valid.  As a courtesy, attempt to return
				 * the delegation.
				 */
				mi = VTOMI4(RTOV4(rp));
				mutex_enter(&mi->mi_lock);
				recov = mi->mi_recovflags & MI4R_REOPEN_FILES;
				mutex_exit(&mi->mi_lock);

				/*
				 * We need to hold rp->r_statev4_lock while
				 * checking rp->r_deleg_return_pending and
				 * when calling nfs4_dlistadd() if we're in
				 * recovery.
				 */
				mutex_enter(&rp->r_statev4_lock);
				if (rp->r_deleg_return_pending == TRUE) {
					/*
					 * We're already in the throes of
					 * returning a delegation.  Drop
					 * the lock and head for the return.
					 */
					mutex_exit(&rp->r_statev4_lock);
				} else if (recov) {
					/*
					 * Cannot call delegreturn from inside
					 * of recovery or VOP_PUTPAGE will hang
					 * due to nfs4_start_fop call in
					 * nfs4write.  Use dlistadd to add the
					 * rnode to the list of rnodes needing
					 * cleaning.
					 *
					 * NB: We're in recover so don't reopen
					 */
					nfs4_dlistadd(rp, ncg,
						NFS4_DR_PUSH|NFS4_DR_DISCARD);
					mutex_exit(&rp->r_statev4_lock);
				} else {
					mutex_exit(&rp->r_statev4_lock);
					/* XXX - Do we need to reopen? */
					(void) nfs4delegreturn_impl(rp,
						(NFS4_DR_PUSH |
						    NFS4_DR_DID_OP |
						    NFS4_DR_REOPEN),
						ncg);
				}
				break;

			default:
				/*
				 * CLAIM_DELEGATE_CUR, CLAIM_DELEGATE_PREV
				 * fall through here
				 */
				break;
			}
		}

		/* No delegation granted, get out. */
		return;
	}

	/*
	 * A delegation (real or phony) was recorded above; reset the
	 * per-delegation bookkeeping.  The server's recall flag is only
	 * copied for CLAIM_PREVIOUS opens.
	 */
	mutex_enter(&rp->r_statev4_lock);
	rp->r_deleg_return_pending = FALSE;
	rp->r_deleg_needs_recovery = OPEN_DELEGATE_NONE;
	if (claim == CLAIM_PREVIOUS)
		rp->r_deleg_needs_recall = recall;

#ifdef	DEBUG
	/* DEBUG only: force the recall indication from a tunable. */
	if (nfs4_use_phony_recall)
		rp->r_deleg_needs_recall = nfs4_phony_recall_v;
#endif

	/*
	 * If the server has requested a recall, then put the
	 * vnode on a list of files which need to be cleaned.
	 * This will be done later by the recovery thread to
	 * avoid a deadlock.  If this were a CLAIM_NULL open
	 * and the server set recall, then the server is just
	 * confused; the delegation will be returned eventually.
	 */
	if (rp->r_deleg_needs_recall)
		nfs4_dlistadd(rp, ncg, NFS4_DR_PUSH|NFS4_DR_REOPEN);

	/* First delegation on this rnode: keep a hold on the cred. */
	if (already == FALSE) {
		rp->r_deleg_cred = cr;
		crhold(cr);
	}
	mutex_exit(&rp->r_statev4_lock);

	if (already == FALSE) {

		/*
		 * Add this rnode to the list of rnodes with delegations
		 * for this nfs4_server.  find_nfs4_server returns with
		 * the mutex locked, so don't forget to mutex exit.
		 */

		if ((np = find_nfs4_server(VTOMI4(RTOV4(rp)))) == NULL) {

			/*
			 * No nfs4_server to attach to; forget the
			 * delegation type.
			 *
			 * NOTE(review): the r_deleg_cred hold taken above
			 * and any entry queued by nfs4_dlistadd() are left
			 * in place on this path -- confirm they are
			 * released elsewhere.
			 */
			mutex_enter(&rp->r_statev4_lock);
			rp->r_deleg_type = OPEN_DELEGATE_NONE;
			mutex_exit(&rp->r_statev4_lock);
			return;
		}

		list_insert_head(&np->s_deleg_list, rp);
		/* added list node gets a reference */
		np->s_refcnt++;
		nfs4_inc_state_ref_count_nolock(np, VTOMI4(RTOV4(rp)));
		mutex_exit(&np->s_lock);
		nfs4_server_rele(np);
	}

	/*
	 * This call to nfs4delegreturn assumes that nfs4_start_op MUST
	 * not be called by nfs4delegreturn.
	 *
	 * The delegation is returned right away when the policy is
	 * IMMEDIATE or when a write delegation arrived without a valid
	 * change attribute.
	 */
	if (nfs4_delegreturn_policy == IMMEDIATE || !valid_garp)
		(void) nfs4delegreturn_impl(rp,
			NFS4_DR_PUSH|NFS4_DR_DID_OP|NFS4_DR_REOPEN, ncg);
}
2414 
2415 /*
2416  * nfs4delegabandon - Abandon the delegation on an rnode4.  This code
2417  * is called when the client receives EXPIRED, BAD_STATEID, OLD_STATEID
2418  * or BADSEQID and the recovery code is unable to recover.  Push any
2419  * dirty data back to the server and return the delegation (if any).
2420  */
2421 
2422 void
2423 nfs4delegabandon(rnode4_t *rp)
2424 {
2425 	vnode_t *vp;
2426 	struct cb_recall_pass *pp;
2427 	open_delegation_type4 dt;
2428 
2429 	mutex_enter(&rp->r_statev4_lock);
2430 	dt = rp->r_deleg_type;
2431 	mutex_exit(&rp->r_statev4_lock);
2432 
2433 	if (dt == OPEN_DELEGATE_NONE)
2434 		return;
2435 
2436 	vp = RTOV4(rp);
2437 	VN_HOLD(vp);
2438 
2439 	pp = kmem_alloc(sizeof (struct cb_recall_pass), KM_SLEEP);
2440 	pp->rp = rp;
2441 	/*
2442 	 * Recovery on the file has failed and we want to return
2443 	 * the delegation.  We don't want to reopen files and
2444 	 * nfs4delegreturn_thread() figures out what to do about
2445 	 * the data.  The only thing to do is attempt to return
2446 	 * the delegation.
2447 	 */
2448 	pp->flags = 0;
2449 	pp->truncate = FALSE;
2450 
2451 	/*
2452 	 * Fire up a thread to do the delegreturn; this is
2453 	 * necessary because we could be inside a GETPAGE or
2454 	 * PUTPAGE and we cannot do another one.
2455 	 */
2456 
2457 	(void) zthread_create(NULL, 0, nfs4delegreturn_thread, pp, 0,
2458 				minclsyspri);
2459 }
2460 
2461 static int
2462 wait_for_recall1(vnode_t *vp, nfs4_op_hint_t op, nfs4_recov_state_t *rsp,
2463 	int flg)
2464 {
2465 	rnode4_t *rp;
2466 	int error = 0;
2467 
2468 #ifdef lint
2469 	op = op;
2470 #endif
2471 
2472 	if (vp && vp->v_type == VREG) {
2473 		rp = VTOR4(vp);
2474 
2475 		/*
2476 		 * Take r_deleg_recall_lock in read mode to synchronize
2477 		 * with delegreturn.
2478 		 */
2479 		error = nfs_rw_enter_sig(&rp->r_deleg_recall_lock,
2480 			RW_READER, INTR4(vp));
2481 
2482 		if (error == 0)
2483 			rsp->rs_flags |= flg;
2484 
2485 	}
2486 	return (error);
2487 }
2488 
2489 void
2490 nfs4_end_op_recall(vnode_t *vp1, vnode_t *vp2, nfs4_recov_state_t *rsp)
2491 {
2492 	NFS4_DEBUG(nfs4_recall_debug,
2493 		(CE_NOTE, "nfs4_end_op_recall: 0x%p, 0x%p\n",
2494 		(void *)vp1, (void *)vp2));
2495 
2496 	if (vp2 && rsp->rs_flags & NFS4_RS_RECALL_HELD2)
2497 		nfs_rw_exit(&VTOR4(vp2)->r_deleg_recall_lock);
2498 	if (vp1 && rsp->rs_flags & NFS4_RS_RECALL_HELD1)
2499 		nfs_rw_exit(&VTOR4(vp1)->r_deleg_recall_lock);
2500 }
2501 
2502 int
2503 wait_for_recall(vnode_t *vp1, vnode_t *vp2, nfs4_op_hint_t op,
2504 	nfs4_recov_state_t *rsp)
2505 {
2506 	int error;
2507 
2508 	NFS4_DEBUG(nfs4_recall_debug,
2509 		(CE_NOTE, "wait_for_recall:    0x%p, 0x%p\n",
2510 		(void *)vp1, (void *) vp2));
2511 
2512 	rsp->rs_flags &= ~(NFS4_RS_RECALL_HELD1|NFS4_RS_RECALL_HELD2);
2513 
2514 	if ((error = wait_for_recall1(vp1, op, rsp, NFS4_RS_RECALL_HELD1)) != 0)
2515 		return (error);
2516 
2517 	if ((error = wait_for_recall1(vp2, op, rsp, NFS4_RS_RECALL_HELD2))
2518 	    != 0) {
2519 		if (rsp->rs_flags & NFS4_RS_RECALL_HELD1) {
2520 			nfs_rw_exit(&VTOR4(vp1)->r_deleg_recall_lock);
2521 			rsp->rs_flags &= ~NFS4_RS_RECALL_HELD1;
2522 		}
2523 
2524 		return (error);
2525 	}
2526 
2527 	return (0);
2528 }
2529 
2530 /*
2531  * nfs4_dlistadd - Add this rnode to a list of rnodes to be
2532  * DELEGRETURN'd at the end of recovery.
2533  */
2534 
2535 static void
2536 nfs4_dlistadd(rnode4_t *rp, struct nfs4_callback_globals *ncg, int flags)
2537 {
2538 	struct nfs4_dnode *dp;
2539 
2540 	ASSERT(mutex_owned(&rp->r_statev4_lock));
2541 	/*
2542 	 * Mark the delegation as having a return pending.
2543 	 * This will prevent the use of the delegation stateID
2544 	 * by read, write, setattr and open.
2545 	 */
2546 	rp->r_deleg_return_pending = TRUE;
2547 	dp = kmem_alloc(sizeof (*dp), KM_SLEEP);
2548 	VN_HOLD(RTOV4(rp));
2549 	dp->rnodep = rp;
2550 	dp->flags = flags;
2551 	mutex_enter(&ncg->nfs4_dlist_lock);
2552 	list_insert_head(&ncg->nfs4_dlist, dp);
2553 #ifdef	DEBUG
2554 	ncg->nfs4_dlistadd_c++;
2555 #endif
2556 	mutex_exit(&ncg->nfs4_dlist_lock);
2557 }
2558 
2559 /*
2560  * nfs4_dlistclean_impl - Do DELEGRETURN for each rnode on the list.
2561  * of files awaiting cleaning.  If the override_flags are non-zero
2562  * then use them rather than the flags that were set when the rnode
2563  * was added to the dlist.
2564  */
2565 static void
2566 nfs4_dlistclean_impl(struct nfs4_callback_globals *ncg, int override_flags)
2567 {
2568 	rnode4_t *rp;
2569 	struct nfs4_dnode *dp;
2570 	int flags;
2571 
2572 	ASSERT(override_flags == 0 || override_flags == NFS4_DR_DISCARD);
2573 
2574 	mutex_enter(&ncg->nfs4_dlist_lock);
2575 	while ((dp = list_head(&ncg->nfs4_dlist)) != NULL) {
2576 #ifdef	DEBUG
2577 		ncg->nfs4_dlistclean_c++;
2578 #endif
2579 		list_remove(&ncg->nfs4_dlist, dp);
2580 		mutex_exit(&ncg->nfs4_dlist_lock);
2581 		rp = dp->rnodep;
2582 		flags = (override_flags != 0) ? override_flags : dp->flags;
2583 		kmem_free(dp, sizeof (*dp));
2584 		(void) nfs4delegreturn_impl(rp, flags, ncg);
2585 		VN_RELE(RTOV4(rp));
2586 		mutex_enter(&ncg->nfs4_dlist_lock);
2587 	}
2588 	mutex_exit(&ncg->nfs4_dlist_lock);
2589 }
2590 
2591 void
2592 nfs4_dlistclean(void)
2593 {
2594 	struct nfs4_callback_globals *ncg;
2595 
2596 	ncg = zone_getspecific(nfs4_callback_zone_key, nfs_zone());
2597 	ASSERT(ncg != NULL);
2598 
2599 	nfs4_dlistclean_impl(ncg, 0);
2600 }
2601