xref: /titanic_41/usr/src/uts/common/fs/nfs/nfs4_callback.c (revision 445f2479fe3d7435daab18bf2cdc310b86cd6738)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 /* Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */
27 /* All Rights Reserved */
28 
29 #pragma ident	"%Z%%M%	%I%	%E% SMI"
30 
31 #include <sys/param.h>
32 #include <sys/types.h>
33 #include <sys/systm.h>
34 #include <sys/cred.h>
35 #include <sys/vfs.h>
36 #include <sys/vnode.h>
37 #include <sys/pathname.h>
38 #include <sys/sysmacros.h>
39 #include <sys/kmem.h>
40 #include <sys/kstat.h>
41 #include <sys/mkdev.h>
42 #include <sys/mount.h>
43 #include <sys/statvfs.h>
44 #include <sys/errno.h>
45 #include <sys/debug.h>
46 #include <sys/cmn_err.h>
47 #include <sys/utsname.h>
48 #include <sys/bootconf.h>
49 #include <sys/modctl.h>
50 #include <sys/acl.h>
51 #include <sys/flock.h>
52 #include <sys/kstr.h>
53 #include <sys/stropts.h>
54 #include <sys/strsubr.h>
55 #include <sys/atomic.h>
56 #include <sys/disp.h>
57 #include <sys/policy.h>
58 #include <sys/list.h>
59 #include <sys/zone.h>
60 
61 #include <rpc/types.h>
62 #include <rpc/auth.h>
63 #include <rpc/rpcsec_gss.h>
64 #include <rpc/clnt.h>
65 #include <rpc/xdr.h>
66 
67 #include <nfs/nfs.h>
68 #include <nfs/nfs_clnt.h>
69 #include <nfs/mount.h>
70 #include <nfs/nfs_acl.h>
71 
72 #include <fs/fs_subr.h>
73 
74 #include <nfs/nfs4.h>
75 #include <nfs/rnode4.h>
76 #include <nfs/nfs4_clnt.h>
77 #include <nfs/nfssys.h>
78 
79 #ifdef	DEBUG
80 /*
81  * These are "special" state IDs and file handles that
82  * match any delegation state ID or file handle.  This
83  * is for testing purposes only.
84  */
85 
86 stateid4 nfs4_deleg_any = { 0x7FFFFFF0 };
87 char nfs4_deleg_fh[] = "\0377\0376\0375\0374";
88 nfs_fh4 nfs4_deleg_anyfh = { sizeof (nfs4_deleg_fh)-1, nfs4_deleg_fh };
89 int nfs4_deleg_accept_phony = OPEN_DELEGATE_NONE;
90 nfsace4 nfs4_deleg_ace_phony;
91 nfs_space_limit4 nfs4_deleg_space_phony = { NFS_LIMIT_SIZE, 8192 };
92 nfs_space_limit4 nfs4_deleg_space_phony2 = { NFS_LIMIT_BLOCKS, 0 };
93 nfs_modified_limit4 nfs4_deleg_space_phonyl = { 8, 512 };
94 changeid4 nfs4_deleg_change_phony = 0x7eeeeeee76666660LL;
95 int nfs4_use_phony_limit;
96 int nfs4_use_phony_recall;
97 int nfs4_phony_recall_v;
98 nfsstat4 cb4_getattr_fail = NFS4_OK;
99 nfsstat4 cb4_recall_fail = NFS4_OK;
100 
101 int nfs4_callback_debug;
102 int nfs4_recall_debug;
103 int nfs4_drat_debug;
104 
105 #endif
106 
107 #define	CB_NOTE(x)	NFS4_DEBUG(nfs4_callback_debug, (CE_NOTE, x))
108 #define	CB_WARN(x)	NFS4_DEBUG(nfs4_callback_debug, (CE_WARN, x))
109 #define	CB_WARN1(x, y)	NFS4_DEBUG(nfs4_callback_debug, (CE_WARN, x, y))
110 
111 enum nfs4_delegreturn_policy nfs4_delegreturn_policy = INACTIVE;
112 
113 static zone_key_t nfs4_callback_zone_key;
114 
115 /*
116  * NFS4_MAPSIZE is the number of bytes we are willing to consume
117  * for the block allocation map when the server grants an NFS_LIMIT_BLOCKS
118  * style delegation.
119  */
120 
121 #define	NFS4_MAPSIZE	8192
122 #define	NFS4_MAPWORDS	NFS4_MAPSIZE/sizeof (uint_t)
123 #define	NbPW		(NBBY*sizeof (uint_t))
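
/*
 * Worked sizing, assuming the defaults above and a 4-byte uint_t
 * (illustrative only): the map spans NFS4_MAPSIZE * NBBY =
 * 8192 * 8 = 65536 bits, i.e. one bit per block for up to 64K
 * blocks, stored as NFS4_MAPWORDS = 8192 / 4 = 2048 words of
 * NbPW = 32 bits apiece.
 */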
124 
125 static int nfs4_num_prognums = 1024;
126 static SVC_CALLOUT_TABLE nfs4_cb_sct;
127 
128 struct nfs4_dnode {
129 	list_node_t	linkage;
130 	rnode4_t	*rnodep;
131 	int		flags;		/* Flags for nfs4delegreturn_impl() */
132 };
133 
134 static const struct nfs4_callback_stats nfs4_callback_stats_tmpl = {
135 	{ "delegations",	KSTAT_DATA_UINT64 },
136 	{ "cb_getattr",		KSTAT_DATA_UINT64 },
137 	{ "cb_recall",		KSTAT_DATA_UINT64 },
138 	{ "cb_null",		KSTAT_DATA_UINT64 },
139 	{ "cb_dispatch",	KSTAT_DATA_UINT64 },
140 	{ "delegaccept_r",	KSTAT_DATA_UINT64 },
141 	{ "delegaccept_rw",	KSTAT_DATA_UINT64 },
142 	{ "delegreturn",	KSTAT_DATA_UINT64 },
143 	{ "callbacks",		KSTAT_DATA_UINT64 },
144 	{ "claim_cur",		KSTAT_DATA_UINT64 },
145 	{ "claim_cur_ok",	KSTAT_DATA_UINT64 },
146 	{ "recall_trunc",	KSTAT_DATA_UINT64 },
147 	{ "recall_failed",	KSTAT_DATA_UINT64 },
148 	{ "return_limit_write",	KSTAT_DATA_UINT64 },
149 	{ "return_limit_addmap", KSTAT_DATA_UINT64 },
150 	{ "deleg_recover",	KSTAT_DATA_UINT64 },
151 	{ "cb_illegal",		KSTAT_DATA_UINT64 }
152 };
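
/*
 * Each zone gets a private copy of this template, registered as
 * "nfs:0:nfs4_callback_stats" in nfs4_callback_init_zone() below.
 * From userland the counters can be read with, for example:
 *
 *	kstat -m nfs -i 0 -n nfs4_callback_stats
 */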
153 
154 struct nfs4_cb_port {
155 	list_node_t		linkage; /* linkage into per-zone port list */
156 	char			netid[KNC_STRSIZE];
157 	char			uaddr[KNC_STRSIZE];
158 	char			protofmly[KNC_STRSIZE];
159 	char			proto[KNC_STRSIZE];
160 };
161 
162 static int cb_getattr_bytes;
163 
164 struct cb_recall_pass {
165 	rnode4_t	*rp;
166 	int		flags;		/* Flags for nfs4delegreturn_impl() */
167 	bool_t		truncate;
168 };
169 
170 static nfs4_open_stream_t *get_next_deleg_stream(rnode4_t *, int);
171 static void nfs4delegreturn_thread(struct cb_recall_pass *);
172 static int deleg_reopen(vnode_t *, bool_t *, struct nfs4_callback_globals *,
173     int);
174 static void nfs4_dlistadd(rnode4_t *, struct nfs4_callback_globals *, int);
175 static void nfs4_dlistclean_impl(struct nfs4_callback_globals *, int);
176 static int nfs4delegreturn_impl(rnode4_t *, int,
177     struct nfs4_callback_globals *);
178 static void nfs4delegreturn_cleanup_impl(rnode4_t *, nfs4_server_t *,
179     struct nfs4_callback_globals *);
180 
181 static void
182 cb_getattr(nfs_cb_argop4 *argop, nfs_cb_resop4 *resop, struct svc_req *req,
183 	struct compound_state *cs, struct nfs4_callback_globals *ncg)
184 {
185 	CB_GETATTR4args *args = &argop->nfs_cb_argop4_u.opcbgetattr;
186 	CB_GETATTR4res *resp = &resop->nfs_cb_resop4_u.opcbgetattr;
187 	rnode4_t *rp;
188 	vnode_t *vp;
189 	bool_t found = FALSE;
190 	struct nfs4_server *sp;
191 	struct fattr4 *fap;
192 	rpc_inline_t *fdata;
193 	long mapcnt;
194 	fattr4_change change;
195 	fattr4_size size;
196 	uint_t rflag;
197 
198 	ncg->nfs4_callback_stats.cb_getattr.value.ui64++;
199 
200 #ifdef DEBUG
201 	/*
202 	 * error injection hook: set cb_getattr_fail global to
203 	 * the NFS4 protocol error to be returned
204 	 */
205 	if (cb4_getattr_fail != NFS4_OK) {
206 		*cs->statusp = resp->status = cb4_getattr_fail;
207 		return;
208 	}
209 #endif
210 
211 	resp->obj_attributes.attrmask = 0;
212 
213 	mutex_enter(&ncg->nfs4_cb_lock);
214 	sp = ncg->nfs4prog2server[req->rq_prog - NFS4_CALLBACK];
215 	mutex_exit(&ncg->nfs4_cb_lock);
216 
217 	if (nfs4_server_vlock(sp, 0) == FALSE) {
218 
219 		CB_WARN("cb_getattr: cannot find server\n");
220 
221 		*cs->statusp = resp->status = NFS4ERR_BADHANDLE;
222 		return;
223 	}
224 
225 	/*
226 	 * In cb_compound, callback_ident was validated against rq_prog,
227 	 * but we couldn't verify that it was set to the value we provided
228 	 * at setclientid time (because we didn't have the server struct yet).
229 	 * Now we have the server struct, but don't have callback_ident
230 	 * handy.  So, validate the server struct's program number against
231 	 * the request's RPC prog number.  At this point, we know the prog num
232 	 * is valid (else we wouldn't be here); however, we don't know
233 	 * that it was the prog number we supplied to this server at
234 	 * setclientid time.  If the prog numbers aren't equivalent, then
235 	 * log the problem and fail the request because cbserv and/or
236 	 * cbclient is confused.  This will probably never happen.
237 	 */
238 	if (sp->s_program != req->rq_prog) {
239 #ifdef DEBUG
240 		zcmn_err(getzoneid(), CE_WARN,
241 		    "cb_getattr: wrong server program number srv=%d req=%d\n",
242 		    sp->s_program, req->rq_prog);
243 #else
244 		zcmn_err(getzoneid(), CE_WARN,
245 		    "cb_getattr: wrong server program number\n");
246 #endif
247 		mutex_exit(&sp->s_lock);
248 		nfs4_server_rele(sp);
249 		*cs->statusp = resp->status = NFS4ERR_BADHANDLE;
250 		return;
251 	}
252 
253 	/*
254 	 * Search the delegation list for a matching file handle;
255 	 * mutex on sp prevents the list from changing.
256 	 */
257 
258 	rp = list_head(&sp->s_deleg_list);
259 	for (; rp != NULL; rp = list_next(&sp->s_deleg_list, rp)) {
260 		nfs4_fhandle_t fhandle;
261 
262 		sfh4_copyval(rp->r_fh, &fhandle);
263 
264 		if ((fhandle.fh_len == args->fh.nfs_fh4_len &&
265 		    bcmp(fhandle.fh_buf, args->fh.nfs_fh4_val,
266 		    fhandle.fh_len) == 0)) {
267 
268 			found = TRUE;
269 			break;
270 		}
271 #ifdef	DEBUG
272 		if (nfs4_deleg_anyfh.nfs_fh4_len == args->fh.nfs_fh4_len &&
273 		    bcmp(nfs4_deleg_anyfh.nfs_fh4_val, args->fh.nfs_fh4_val,
274 		    args->fh.nfs_fh4_len) == 0) {
275 
276 			found = TRUE;
277 			break;
278 		}
279 #endif
280 	}
281 
282 	/*
283 	 * VN_HOLD the vnode before releasing s_lock to guarantee
284 	 * we have a valid vnode reference.
285 	 */
286 	if (found == TRUE) {
287 		vp = RTOV4(rp);
288 		VN_HOLD(vp);
289 	}
290 
291 	mutex_exit(&sp->s_lock);
292 	nfs4_server_rele(sp);
293 
294 	if (found == FALSE) {
295 
296 		CB_WARN("cb_getattr: bad fhandle\n");
297 
298 		*cs->statusp = resp->status = NFS4ERR_BADHANDLE;
299 		return;
300 	}
301 
302 	/*
303 	 * Figure out which attributes the server wants.  We only
304 	 * offer FATTR4_CHANGE & FATTR4_SIZE; ignore the rest.
305 	 */
306 	fdata = kmem_alloc(cb_getattr_bytes, KM_SLEEP);
307 
308 	/*
309 	 * Don't actually need to create XDR to encode these
310 	 * simple data structures.
311 	 * xdrmem_create(&xdr, fdata, cb_getattr_bytes, XDR_ENCODE);
312 	 */
313 	fap = &resp->obj_attributes;
314 
315 	fap->attrmask = 0;
316 	/* attrlist4_len starts at 0 and increases as attrs are processed */
317 	fap->attrlist4 = (char *)fdata;
318 	fap->attrlist4_len = 0;
319 
320 	/* don't supply attrs if request was zero */
321 	if (args->attr_request != 0) {
322 		if (args->attr_request & FATTR4_CHANGE_MASK) {
323 			/*
324 			 * If the file is mmapped, then increment the change
325 			 * attribute and return it.  This will guarantee that
326 			 * the server will perceive that the file has changed
327 			 * if there is any chance that the client application
328 			 * has changed it.  Otherwise, just return the change
329 			 * attribute as it has been updated by nfs4write_deleg.
330 			 */
331 
332 			mutex_enter(&rp->r_statelock);
333 			mapcnt = rp->r_mapcnt;
334 			rflag = rp->r_flags;
335 			mutex_exit(&rp->r_statelock);
336 
337 			mutex_enter(&rp->r_statev4_lock);
338 			/*
339 			 * If object mapped, then always return new change.
340 			 * Otherwise, return change if object has dirty
341 			 * pages.  If object doesn't have any dirty pages,
342 			 * then all changes have been pushed to server, so
343 			 * reset change to grant change.
344 			 */
345 			if (mapcnt)
346 				rp->r_deleg_change++;
347 			else if (! (rflag & R4DIRTY))
348 				rp->r_deleg_change = rp->r_deleg_change_grant;
349 			change = rp->r_deleg_change;
350 			mutex_exit(&rp->r_statev4_lock);
351 
352 			/*
353 			 * Use inline XDR code directly; we know that we are
354 			 * going to a memory buffer and it has enough
355 			 * space, so it cannot fail.
356 			 */
357 			IXDR_PUT_U_HYPER(fdata, change);
358 			fap->attrlist4_len += 2 * BYTES_PER_XDR_UNIT;
359 			fap->attrmask |= FATTR4_CHANGE_MASK;
360 		}
361 
362 		if (args->attr_request & FATTR4_SIZE_MASK) {
363 			/*
364 			 * Use an atomic add of 0 to fetch a consistent view
365 			 * of r_size; this avoids having to take rw_lock
366 			 * which could cause a deadlock.
367 			 */
368 			size = atomic_add_64_nv((uint64_t *)&rp->r_size, 0);
369 
370 			/*
371 			 * Use inline XDR code directly; we know that we are
372 			 * going to a memory buffer and it has enough
373 			 * space, so it cannot fail.
374 			 */
375 			IXDR_PUT_U_HYPER(fdata, size);
376 			fap->attrlist4_len += 2 * BYTES_PER_XDR_UNIT;
377 			fap->attrmask |= FATTR4_SIZE_MASK;
378 		}
379 	}
380 
381 	VN_RELE(vp);
382 
383 	*cs->statusp = resp->status = NFS4_OK;
384 }
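
/*
 * A minimal sketch of the inline XDR idiom used above (illustrative
 * only; "buf", "ptr" and "val" are made-up names).  IXDR_PUT_U_HYPER()
 * stores the two 32-bit XDR units of a 64-bit value and advances the
 * rpc_inline_t pointer, so consecutive attributes can be encoded back
 * to back without constructing an XDR stream:
 *
 *	rpc_inline_t *buf, *ptr;
 *	uint64_t val = 42;
 *
 *	ptr = buf = kmem_alloc(2 * BYTES_PER_XDR_UNIT, KM_SLEEP);
 *	IXDR_PUT_U_HYPER(ptr, val);
 *	ASSERT((char *)ptr - (char *)buf == 2 * BYTES_PER_XDR_UNIT);
 *	kmem_free(buf, 2 * BYTES_PER_XDR_UNIT);
 */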
385 
386 static void
387 cb_getattr_free(nfs_cb_resop4 *resop)
388 {
389 	if (resop->nfs_cb_resop4_u.opcbgetattr.obj_attributes.attrlist4)
390 		kmem_free(resop->nfs_cb_resop4_u.opcbgetattr.
391 			obj_attributes.attrlist4,
392 			cb_getattr_bytes);
393 }
394 
395 static void
396 cb_recall(nfs_cb_argop4 *argop, nfs_cb_resop4 *resop, struct svc_req *req,
397 	struct compound_state *cs, struct nfs4_callback_globals *ncg)
398 {
399 	CB_RECALL4args *args = &argop->nfs_cb_argop4_u.opcbrecall;
400 	CB_RECALL4res *resp = &resop->nfs_cb_resop4_u.opcbrecall;
401 	rnode4_t *rp;
402 	vnode_t *vp;
403 	struct nfs4_server *sp;
404 	bool_t found = FALSE;
405 
406 	ncg->nfs4_callback_stats.cb_recall.value.ui64++;
407 
408 	ASSERT(req->rq_prog >= NFS4_CALLBACK);
409 	ASSERT(req->rq_prog < NFS4_CALLBACK+nfs4_num_prognums);
410 
411 #ifdef DEBUG
412 	/*
413 	 * error injection hook: set cb_recall_fail global to
414 	 * the NFS4 protocol error to be returned
415 	 */
416 	if (cb4_recall_fail != NFS4_OK) {
417 		*cs->statusp = resp->status = cb4_recall_fail;
418 		return;
419 	}
420 #endif
421 
422 	mutex_enter(&ncg->nfs4_cb_lock);
423 	sp = ncg->nfs4prog2server[req->rq_prog - NFS4_CALLBACK];
424 	mutex_exit(&ncg->nfs4_cb_lock);
425 
426 	if (nfs4_server_vlock(sp, 0) == FALSE) {
427 
428 		CB_WARN("cb_recall: cannot find server\n");
429 
430 		*cs->statusp = resp->status = NFS4ERR_BADHANDLE;
431 		return;
432 	}
433 
434 	/*
435 	 * Search the delegation list for a matching file handle
436 	 * AND stateid; mutex on sp prevents the list from changing.
437 	 */
438 
439 	rp = list_head(&sp->s_deleg_list);
440 	for (; rp != NULL; rp = list_next(&sp->s_deleg_list, rp)) {
441 		mutex_enter(&rp->r_statev4_lock);
442 
443 		/* check both state id and file handle! */
444 
445 		if ((bcmp(&rp->r_deleg_stateid, &args->stateid,
446 		    sizeof (stateid4)) == 0)) {
447 			nfs4_fhandle_t fhandle;
448 
449 			sfh4_copyval(rp->r_fh, &fhandle);
450 			if ((fhandle.fh_len == args->fh.nfs_fh4_len &&
451 			    bcmp(fhandle.fh_buf, args->fh.nfs_fh4_val,
452 			    fhandle.fh_len) == 0)) {
453 
454 				found = TRUE;
455 				break;
456 			} else {
457 #ifdef	DEBUG
458 				CB_WARN("cb_recall: stateid OK, bad fh");
459 #endif
460 			}
461 		}
462 #ifdef	DEBUG
463 		if (bcmp(&args->stateid, &nfs4_deleg_any,
464 		    sizeof (stateid4)) == 0) {
465 
466 			found = TRUE;
467 			break;
468 		}
469 #endif
470 		mutex_exit(&rp->r_statev4_lock);
471 	}
472 
473 	/*
474 	 * VN_HOLD the vnode before releasing s_lock to guarantee
475 	 * we have a valid vnode reference.  The async thread will
476 	 * release the hold when it's done.
477 	 */
478 	if (found == TRUE) {
479 		mutex_exit(&rp->r_statev4_lock);
480 		vp = RTOV4(rp);
481 		VN_HOLD(vp);
482 	}
483 	mutex_exit(&sp->s_lock);
484 	nfs4_server_rele(sp);
485 
486 	if (found == FALSE) {
487 
488 		CB_WARN("cb_recall: bad stateid\n");
489 
490 		*cs->statusp = resp->status = NFS4ERR_BAD_STATEID;
491 		return;
492 	}
493 
494 	/* Fire up a thread to do the delegreturn */
495 	nfs4delegreturn_async(rp, NFS4_DR_RECALL|NFS4_DR_REOPEN,
496 					args->truncate);
497 
498 	*cs->statusp = resp->status = 0;
499 }
500 
501 /* ARGSUSED */
502 static void
503 cb_recall_free(nfs_cb_resop4 *resop)
504 {
505 	/* nothing to do here, cb_recall doesn't kmem_alloc */
506 }
507 
508 /*
509  * This function handles the CB_NULL proc call from an NFSv4 Server.
510  *
511  * We record that the server has sent a CB_NULL for later processing
512  * in the recovery logic: it lets us pause slightly after the
513  * setclientid and before reopening files. The pause is to allow the
514  * NFSv4 Server time to receive the CB_NULL reply and adjust any of
515  * its internal structures such that it has the opportunity to grant
516  * delegations to reopened files.
517  *
518  */
519 
520 /* ARGSUSED */
521 static void
522 cb_null(CB_COMPOUND4args *args, CB_COMPOUND4res *resp, struct svc_req *req,
523     struct nfs4_callback_globals *ncg)
524 {
525 	struct nfs4_server *sp;
526 
527 	ncg->nfs4_callback_stats.cb_null.value.ui64++;
528 
529 	ASSERT(req->rq_prog >= NFS4_CALLBACK);
530 	ASSERT(req->rq_prog < NFS4_CALLBACK+nfs4_num_prognums);
531 
532 	mutex_enter(&ncg->nfs4_cb_lock);
533 	sp = ncg->nfs4prog2server[req->rq_prog - NFS4_CALLBACK];
534 	mutex_exit(&ncg->nfs4_cb_lock);
535 
536 	if (nfs4_server_vlock(sp, 0) != FALSE) {
537 		sp->s_flags |= N4S_CB_PINGED;
538 		cv_broadcast(&sp->wait_cb_null);
539 		mutex_exit(&sp->s_lock);
540 		nfs4_server_rele(sp);
541 	}
542 }
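
/*
 * A sketch of the waiting side (illustrative only; the real logic
 * lives in the recovery code): a thread that has sent a CB_NULL probe
 * and holds sp->s_lock could wait for the flag set above with
 * something like:
 *
 *	while (!(sp->s_flags & N4S_CB_PINGED))
 *		(void) cv_timedwait(&sp->wait_cb_null, &sp->s_lock,
 *		    ddi_get_lbolt() + 5 * hz);
 */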
543 
544 /*
545  * cb_illegal	args: void
546  *		res : status (NFS4ERR_OP_CB_ILLEGAL)
547  */
548 /* ARGSUSED */
549 static void
550 cb_illegal(nfs_cb_argop4 *argop, nfs_cb_resop4 *resop, struct svc_req *req,
551 	struct compound_state *cs, struct nfs4_callback_globals *ncg)
552 {
553 	CB_ILLEGAL4res *resp = &resop->nfs_cb_resop4_u.opcbillegal;
554 
555 	ncg->nfs4_callback_stats.cb_illegal.value.ui64++;
556 	resop->resop = OP_CB_ILLEGAL;
557 	*cs->statusp = resp->status = NFS4ERR_OP_ILLEGAL;
558 }
559 
560 static void
561 cb_compound(CB_COMPOUND4args *args, CB_COMPOUND4res *resp, struct svc_req *req,
562 	struct nfs4_callback_globals *ncg)
563 {
564 	uint_t i;
565 	struct compound_state cs;
566 	nfs_cb_argop4 *argop;
567 	nfs_cb_resop4 *resop, *new_res;
568 	uint_t op;
569 
570 	bzero(&cs, sizeof (cs));
571 	cs.statusp = &resp->status;
572 	cs.cont = TRUE;
573 
574 	/*
575 	 * Form a reply tag by copying over the request tag.
576 	 */
577 	resp->tag.utf8string_len = args->tag.utf8string_len;
578 	resp->tag.utf8string_val = kmem_alloc(resp->tag.utf8string_len,
579 					KM_SLEEP);
580 	bcopy(args->tag.utf8string_val, resp->tag.utf8string_val,
581 		args->tag.utf8string_len);
582 
583 	/*
584 	 * XXX for now, minorversion should be zero
585 	 */
586 	if (args->minorversion != CB4_MINORVERSION) {
587 		resp->array_len = 0;
588 		resp->array = NULL;
589 		resp->status = NFS4ERR_MINOR_VERS_MISMATCH;
590 		return;
591 	}
592 
593 #ifdef DEBUG
594 	/*
595 	 * Verify callback_ident.  It doesn't really matter if it's wrong
596 	 * because we don't really use callback_ident -- we use prog number
597 	 * of the RPC request instead.  In this case, just print a DEBUG
598 	 * console message to reveal brokenness of cbclient (at bkoff/cthon).
599 	 */
600 	if (args->callback_ident != req->rq_prog)
601 		zcmn_err(getzoneid(), CE_WARN,
602 		    "cb_compound: cb_client using wrong "
603 		    "callback_ident(%d), should be %d",
604 		    args->callback_ident, req->rq_prog);
605 #endif
606 
607 	resp->array_len = args->array_len;
608 	resp->array = kmem_zalloc(args->array_len * sizeof (nfs_cb_resop4),
609 							KM_SLEEP);
610 
611 	for (i = 0; i < args->array_len && cs.cont; i++) {
612 
613 		argop = &args->array[i];
614 		resop = &resp->array[i];
615 		resop->resop = argop->argop;
616 		op = (uint_t)resop->resop;
617 
618 		switch (op) {
619 
620 		case OP_CB_GETATTR:
621 
622 			cb_getattr(argop, resop, req, &cs, ncg);
623 			break;
624 
625 		case OP_CB_RECALL:
626 
627 			cb_recall(argop, resop, req, &cs, ncg);
628 			break;
629 
630 		case OP_CB_ILLEGAL:
631 
632 			/* fall through */
633 
634 		default:
635 			/*
636 			 * Handle OP_CB_ILLEGAL and any undefined opcode.
637 			 * Currently, the XDR code will return BADXDR
638 			 * if cb op doesn't decode to legal value, so
639 			 * it really only handles OP_CB_ILLEGAL.
640 			 */
641 			op = OP_CB_ILLEGAL;
642 			cb_illegal(argop, resop, req, &cs, ncg);
643 		}
644 
645 		if (*cs.statusp != NFS4_OK)
646 			cs.cont = FALSE;
647 
648 		/*
649 		 * If not at last op, and if we are to stop, then
650 		 * compact the results array.
651 		 */
652 		if ((i + 1) < args->array_len && !cs.cont) {
653 
654 			new_res = kmem_alloc(
655 				(i+1) * sizeof (nfs_cb_resop4), KM_SLEEP);
656 			bcopy(resp->array,
657 				new_res, (i+1) * sizeof (nfs_cb_resop4));
658 			kmem_free(resp->array,
659 				args->array_len * sizeof (nfs_cb_resop4));
660 
661 			resp->array_len =  i + 1;
662 			resp->array = new_res;
663 		}
664 	}
665 
666 }
667 
668 static void
669 cb_compound_free(CB_COMPOUND4res *resp)
670 {
671 	uint_t i, op;
672 	nfs_cb_resop4 *resop;
673 
674 	if (resp->tag.utf8string_val) {
675 		UTF8STRING_FREE(resp->tag)
676 	}
677 
678 	for (i = 0; i < resp->array_len; i++) {
679 
680 		resop = &resp->array[i];
681 		op = (uint_t)resop->resop;
682 
683 		switch (op) {
684 
685 		case OP_CB_GETATTR:
686 
687 			cb_getattr_free(resop);
688 			break;
689 
690 		case OP_CB_RECALL:
691 
692 			cb_recall_free(resop);
693 			break;
694 
695 		default:
696 			break;
697 		}
698 	}
699 
700 	if (resp->array != NULL) {
701 		kmem_free(resp->array,
702 			resp->array_len * sizeof (nfs_cb_resop4));
703 	}
704 }
705 
706 static void
707 cb_dispatch(struct svc_req *req, SVCXPRT *xprt)
708 {
709 	CB_COMPOUND4args args;
710 	CB_COMPOUND4res res;
711 	struct nfs4_callback_globals *ncg;
712 
713 	bool_t (*xdr_args)(), (*xdr_res)();
714 	void (*proc)(CB_COMPOUND4args *, CB_COMPOUND4res *, struct svc_req *,
715 	    struct nfs4_callback_globals *);
716 	void (*freeproc)(CB_COMPOUND4res *);
717 
718 	ncg = zone_getspecific(nfs4_callback_zone_key, nfs_zone());
719 	ASSERT(ncg != NULL);
720 
721 	ncg->nfs4_callback_stats.cb_dispatch.value.ui64++;
722 
723 	switch (req->rq_proc) {
724 	case CB_NULL:
725 		xdr_args = xdr_void;
726 		xdr_res = xdr_void;
727 		proc = cb_null;
728 		freeproc = NULL;
729 		break;
730 
731 	case CB_COMPOUND:
732 		xdr_args = xdr_CB_COMPOUND4args_clnt;
733 		xdr_res = xdr_CB_COMPOUND4res;
734 		proc = cb_compound;
735 		freeproc = cb_compound_free;
736 		break;
737 
738 	default:
739 		CB_WARN("cb_dispatch: no proc\n");
740 		svcerr_noproc(xprt);
741 		return;
742 	}
743 
744 	args.tag.utf8string_val = NULL;
745 	args.array = NULL;
746 
747 	if (!SVC_GETARGS(xprt, xdr_args, (caddr_t)&args)) {
748 
749 		CB_WARN("cb_dispatch: cannot getargs\n");
750 		svcerr_decode(xprt);
751 		return;
752 	}
753 
754 	(*proc)(&args, &res, req, ncg);
755 
756 	if (svc_sendreply(xprt, xdr_res, (caddr_t)&res) == FALSE) {
757 
758 		CB_WARN("cb_dispatch: bad sendreply\n");
759 
760 		/*
761 		 * svcerr_systemerr(xprt);
762 		 */
763 	}
764 
765 	if (freeproc)
766 		(*freeproc)(&res);
767 
768 	if (!SVC_FREEARGS(xprt, xdr_args, (caddr_t)&args)) {
769 
770 		CB_WARN("cb_dispatch: bad freeargs\n");
771 	}
772 }
773 
774 static rpcprog_t
775 nfs4_getnextprogram(struct nfs4_callback_globals *ncg)
776 {
777 	int i, j;
778 
779 	j = ncg->nfs4_program_hint;
780 	for (i = 0; i < nfs4_num_prognums; i++, j++) {
781 
782 		if (j >= nfs4_num_prognums)
783 			j = 0;
784 
785 		if (ncg->nfs4prog2server[j] == NULL) {
786 			ncg->nfs4_program_hint = j+1;
787 			return (j+NFS4_CALLBACK);
788 		}
789 	}
790 
791 	return (0);
792 }
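
/*
 * Example (illustrative): if nfs4_program_hint is 5 and slots 5 and 6
 * are taken but slot 7 is free, the scan above returns
 * NFS4_CALLBACK + 7 and leaves the hint at 8 for the next caller.
 * Zero is returned only when all nfs4_num_prognums slots are in use.
 */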
793 
794 void
795 nfs4callback_destroy(nfs4_server_t *np)
796 {
797 	struct nfs4_callback_globals *ncg;
798 	int i;
799 
800 	if (np->s_program == 0)
801 		return;
802 
803 	ncg = np->zone_globals;
804 	i = np->s_program - NFS4_CALLBACK;
805 
806 	mutex_enter(&ncg->nfs4_cb_lock);
807 
808 	ASSERT(ncg->nfs4prog2server[i] == np);
809 
810 	ncg->nfs4prog2server[i] = NULL;
811 
812 	if (i < ncg->nfs4_program_hint)
813 		ncg->nfs4_program_hint = i;
814 
815 	mutex_exit(&ncg->nfs4_cb_lock);
816 }
817 
818 /*
819  * nfs4_setport - This function saves a netid and universal address for
820  * the callback program.  These values will be used during setclientid.
821  */
822 static void
823 nfs4_setport(char *netid, char *uaddr, char *protofmly, char *proto,
824 	struct nfs4_callback_globals *ncg)
825 {
826 	struct nfs4_cb_port *p;
827 	bool_t found = FALSE;
828 
829 	ASSERT(MUTEX_HELD(&ncg->nfs4_cb_lock));
830 
831 	p = list_head(&ncg->nfs4_cb_ports);
832 	for (; p != NULL; p = list_next(&ncg->nfs4_cb_ports, p)) {
833 		if (strcmp(p->netid, netid) == 0) {
834 			found = TRUE;
835 			break;
836 		}
837 	}
838 	if (found == TRUE)
839 		(void) strcpy(p->uaddr, uaddr);
840 	else {
841 		p = kmem_alloc(sizeof (*p), KM_SLEEP);
842 
843 		(void) strcpy(p->uaddr, uaddr);
844 		(void) strcpy(p->netid, netid);
845 		(void) strcpy(p->protofmly, protofmly);
846 		(void) strcpy(p->proto, proto);
847 		list_insert_head(&ncg->nfs4_cb_ports, p);
848 	}
849 }
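
/*
 * Example values (illustrative): for a callback listener on TCP/IPv4
 * at 192.0.2.1 port 32772, userland would hand us netid "tcp",
 * protofmly "inet", proto "tcp", and the universal address
 * "192.0.2.1.128.4", where the trailing two octets encode the port
 * (128 * 256 + 4 == 32772).
 */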
850 
851 /*
852  * nfs4_cb_args - This function is used to construct the callback
853  * portion of the arguments needed for setclientid.
854  */
855 
856 void
857 nfs4_cb_args(nfs4_server_t *np, struct knetconfig *knc, SETCLIENTID4args *args)
858 {
859 	struct nfs4_cb_port *p;
860 	bool_t found = FALSE;
861 	rpcprog_t pgm;
862 	struct nfs4_callback_globals *ncg = np->zone_globals;
863 
864 	/*
865 	 * This server structure may already have a program number
866 	 * assigned to it.  This happens when the client has to
867 	 * re-issue SETCLIENTID.  Just re-use the information.
868 	 */
869 	if (np->s_program >= NFS4_CALLBACK &&
870 	    np->s_program < NFS4_CALLBACK + nfs4_num_prognums)
871 		nfs4callback_destroy(np);
872 
873 	mutex_enter(&ncg->nfs4_cb_lock);
874 
875 	p = list_head(&ncg->nfs4_cb_ports);
876 	for (; p != NULL; p = list_next(&ncg->nfs4_cb_ports, p)) {
877 		if (strcmp(p->protofmly, knc->knc_protofmly) == 0 &&
878 		    strcmp(p->proto, knc->knc_proto) == 0) {
879 			found = TRUE;
880 			break;
881 		}
882 	}
883 
884 	if (found == FALSE) {
885 
886 		NFS4_DEBUG(nfs4_callback_debug,
887 		(CE_WARN, "nfs4_cb_args: could not find netid for %s/%s\n",
888 			knc->knc_protofmly, knc->knc_proto));
889 
890 		args->callback.cb_program = 0;
891 		args->callback.cb_location.r_netid = NULL;
892 		args->callback.cb_location.r_addr = NULL;
893 		args->callback_ident = 0;
894 		mutex_exit(&ncg->nfs4_cb_lock);
895 		return;
896 	}
897 
898 	if ((pgm = nfs4_getnextprogram(ncg)) == 0) {
899 		CB_WARN("nfs4_cb_args: out of program numbers\n");
900 
901 		args->callback.cb_program = 0;
902 		args->callback.cb_location.r_netid = NULL;
903 		args->callback.cb_location.r_addr = NULL;
904 		args->callback_ident = 0;
905 		mutex_exit(&ncg->nfs4_cb_lock);
906 		return;
907 	}
908 
909 	ncg->nfs4prog2server[pgm-NFS4_CALLBACK] = np;
910 	args->callback.cb_program = pgm;
911 	args->callback.cb_location.r_netid = p->netid;
912 	args->callback.cb_location.r_addr = p->uaddr;
913 	args->callback_ident = pgm;
914 
915 	np->s_program = pgm;
916 
917 	mutex_exit(&ncg->nfs4_cb_lock);
918 }
919 
920 static int
921 nfs4_dquery(struct nfs4_svc_args *arg, model_t model)
922 {
923 	file_t *fp;
924 	vnode_t *vp;
925 	rnode4_t *rp;
926 	int error;
927 	STRUCT_HANDLE(nfs4_svc_args, uap);
928 
929 	STRUCT_SET_HANDLE(uap, model, arg);
930 
931 	if ((fp = getf(STRUCT_FGET(uap, fd))) == NULL)
932 		return (EBADF);
933 
934 	vp = fp->f_vnode;
935 
936 	if (vp == NULL || vp->v_type != VREG ||
937 	    !vn_matchops(vp, nfs4_vnodeops)) {
938 		releasef(STRUCT_FGET(uap, fd));
939 		return (EBADF);
940 	}
941 
942 	rp = VTOR4(vp);
943 
944 	/*
945 	 * I can't convince myself that we need locking here.  The
946 	 * rnode cannot disappear and the value returned is instantly
947 	 * stale anyway, so why bother?
948 	 */
949 
950 	error = suword32(STRUCT_FGETP(uap, netid), rp->r_deleg_type);
951 	releasef(STRUCT_FGET(uap, fd));
952 	return (error);
953 }
954 
955 
956 /*
957  * NFS4 client system call.  This service does the
958  * necessary initialization for the callback program.
959  * This is fashioned after the server side interaction
960  * between nfsd and the kernel.  On the client, the
961  * mount command forks and the child process does the
962  * necessary interaction with the kernel.
963  *
964  * uap->fd is the fd of an open transport provider
965  */
966 int
967 nfs4_svc(struct nfs4_svc_args *arg, model_t model)
968 {
969 	file_t *fp;
970 	int error;
971 	int readsize;
972 	char buf[KNC_STRSIZE], uaddr[KNC_STRSIZE];
973 	char protofmly[KNC_STRSIZE], proto[KNC_STRSIZE];
974 	size_t len;
975 	STRUCT_HANDLE(nfs4_svc_args, uap);
976 	struct netbuf addrmask;
977 	int cmd;
978 	SVCMASTERXPRT *cb_xprt;
979 	struct nfs4_callback_globals *ncg;
980 
981 #ifdef lint
982 	model = model;		/* STRUCT macros don't always refer to it */
983 #endif
984 
985 	STRUCT_SET_HANDLE(uap, model, arg);
986 
987 	if (STRUCT_FGET(uap, cmd) == NFS4_DQUERY)
988 		return (nfs4_dquery(arg, model));
989 
990 	if (secpolicy_nfs(CRED()) != 0)
991 		return (EPERM);
992 
993 	if ((fp = getf(STRUCT_FGET(uap, fd))) == NULL)
994 		return (EBADF);
995 
996 	/*
997 	 * Set read buffer size to rsize
998 	 * and add room for RPC headers.
999 	 */
1000 	readsize = nfs3tsize() + (RPC_MAXDATASIZE - NFS_MAXDATA);
1001 	if (readsize < RPC_MAXDATASIZE)
1002 		readsize = RPC_MAXDATASIZE;
1003 
1004 	error = copyinstr((const char *)STRUCT_FGETP(uap, netid), buf,
1005 	    KNC_STRSIZE, &len);
1006 	if (error) {
1007 		releasef(STRUCT_FGET(uap, fd));
1008 		return (error);
1009 	}
1010 
1011 	cmd = STRUCT_FGET(uap, cmd);
1012 
1013 	if (cmd & NFS4_KRPC_START) {
1014 		addrmask.len = STRUCT_FGET(uap, addrmask.len);
1015 		addrmask.maxlen = STRUCT_FGET(uap, addrmask.maxlen);
1016 		addrmask.buf = kmem_alloc(addrmask.maxlen, KM_SLEEP);
1017 		error = copyin(STRUCT_FGETP(uap, addrmask.buf), addrmask.buf,
1018 		    addrmask.len);
1019 		if (error) {
1020 			releasef(STRUCT_FGET(uap, fd));
1021 			kmem_free(addrmask.buf, addrmask.maxlen);
1022 			return (error);
1023 		}
1024 	}
1025 	else
1026 		addrmask.buf = NULL;
1027 
1028 	error = copyinstr((const char *)STRUCT_FGETP(uap, addr), uaddr,
1029 	    sizeof (uaddr), &len);
1030 	if (error) {
1031 		releasef(STRUCT_FGET(uap, fd));
1032 		if (addrmask.buf)
1033 			kmem_free(addrmask.buf, addrmask.maxlen);
1034 		return (error);
1035 	}
1036 
1037 	error = copyinstr((const char *)STRUCT_FGETP(uap, protofmly), protofmly,
1038 	    sizeof (protofmly), &len);
1039 	if (error) {
1040 		releasef(STRUCT_FGET(uap, fd));
1041 		if (addrmask.buf)
1042 			kmem_free(addrmask.buf, addrmask.maxlen);
1043 		return (error);
1044 	}
1045 
1046 	error = copyinstr((const char *)STRUCT_FGETP(uap, proto), proto,
1047 	    sizeof (proto), &len);
1048 	if (error) {
1049 		releasef(STRUCT_FGET(uap, fd));
1050 		if (addrmask.buf)
1051 			kmem_free(addrmask.buf, addrmask.maxlen);
1052 		return (error);
1053 	}
1054 
1055 	ncg = zone_getspecific(nfs4_callback_zone_key, nfs_zone());
1056 	ASSERT(ncg != NULL);
1057 
1058 	mutex_enter(&ncg->nfs4_cb_lock);
1059 	if (cmd & NFS4_SETPORT)
1060 		nfs4_setport(buf, uaddr, protofmly, proto, ncg);
1061 
1062 	if (cmd & NFS4_KRPC_START) {
1063 		error = svc_tli_kcreate(fp, readsize, buf, &addrmask, &cb_xprt,
1064 		    &nfs4_cb_sct, NULL, NFS_CB_SVCPOOL_ID, FALSE);
1065 		if (error) {
1066 			CB_WARN1("nfs4_svc: svc_tli_kcreate failed %d\n",
1067 				error);
1068 			kmem_free(addrmask.buf, addrmask.maxlen);
1069 		}
1070 	}
1071 
1072 	mutex_exit(&ncg->nfs4_cb_lock);
1073 	releasef(STRUCT_FGET(uap, fd));
1074 	return (error);
1075 }
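
/*
 * A minimal userland sketch of driving this service (illustrative
 * only; it assumes "fd" is an open transport endpoint and "mask" is
 * the address mask, with field names as in struct nfs4_svc_args).
 * As the comment above this function notes, the forked mount helper
 * does this in practice:
 *
 *	struct nfs4_svc_args nsa;
 *
 *	nsa.fd = fd;
 *	nsa.cmd = NFS4_SETPORT | NFS4_KRPC_START;
 *	nsa.netid = "tcp";
 *	nsa.addr = "192.0.2.1.128.4";
 *	nsa.protofmly = "inet";
 *	nsa.proto = "tcp";
 *	nsa.addrmask.len = nsa.addrmask.maxlen = sizeof (mask);
 *	nsa.addrmask.buf = (char *)&mask;
 *	(void) _nfssys(NFS4_SVC, &nsa);
 */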
1076 
1077 struct nfs4_callback_globals *
1078 nfs4_get_callback_globals(void)
1079 {
1080 	return (zone_getspecific(nfs4_callback_zone_key, nfs_zone()));
1081 }
1082 
1083 static void *
1084 nfs4_callback_init_zone(zoneid_t zoneid)
1085 {
1086 	kstat_t *nfs4_callback_kstat;
1087 	struct nfs4_callback_globals *ncg;
1088 
1089 	ncg = kmem_zalloc(sizeof (*ncg), KM_SLEEP);
1090 
1091 	ncg->nfs4prog2server = kmem_zalloc(nfs4_num_prognums *
1092 		sizeof (struct nfs4_server *), KM_SLEEP);
1093 
1094 	/* initialize the dlist */
1095 	mutex_init(&ncg->nfs4_dlist_lock, NULL, MUTEX_DEFAULT, NULL);
1096 	list_create(&ncg->nfs4_dlist, sizeof (struct nfs4_dnode),
1097 	    offsetof(struct nfs4_dnode, linkage));
1098 
1099 	/* initialize cb_port list */
1100 	mutex_init(&ncg->nfs4_cb_lock, NULL, MUTEX_DEFAULT, NULL);
1101 	list_create(&ncg->nfs4_cb_ports, sizeof (struct nfs4_cb_port),
1102 	    offsetof(struct nfs4_cb_port, linkage));
1103 
1104 	/* get our own copy of the kstats */
1105 	bcopy(&nfs4_callback_stats_tmpl, &ncg->nfs4_callback_stats,
1106 	    sizeof (nfs4_callback_stats_tmpl));
1107 	/* register "nfs:0:nfs4_callback_stats" for this zone */
1108 	if ((nfs4_callback_kstat =
1109 		kstat_create_zone("nfs", 0, "nfs4_callback_stats", "misc",
1110 		    KSTAT_TYPE_NAMED,
1111 		    sizeof (ncg->nfs4_callback_stats) / sizeof (kstat_named_t),
1112 		    KSTAT_FLAG_VIRTUAL | KSTAT_FLAG_WRITABLE,
1113 		    zoneid)) != NULL) {
1114 		nfs4_callback_kstat->ks_data = &ncg->nfs4_callback_stats;
1115 		kstat_install(nfs4_callback_kstat);
1116 	}
1117 	return (ncg);
1118 }
1119 
1120 static void
1121 nfs4_discard_delegations(struct nfs4_callback_globals *ncg)
1122 {
1123 	nfs4_server_t *sp;
1124 	int i, num_removed;
1125 
1126 	/*
1127 	 * It's OK here to just run through the registered "programs", as
1128 	 * servers without programs won't have any delegations to handle.
1129 	 */
1130 	for (i = 0; i < nfs4_num_prognums; i++) {
1131 		rnode4_t *rp;
1132 
1133 		mutex_enter(&ncg->nfs4_cb_lock);
1134 		sp = ncg->nfs4prog2server[i];
1135 		mutex_exit(&ncg->nfs4_cb_lock);
1136 
1137 		if (nfs4_server_vlock(sp, 1) == FALSE)
1138 			continue;
1139 		num_removed = 0;
1140 		while ((rp = list_head(&sp->s_deleg_list)) != NULL) {
1141 			mutex_enter(&rp->r_statev4_lock);
1142 			if (rp->r_deleg_type == OPEN_DELEGATE_NONE) {
1143 				/*
1144 				 * We need to take matters into our own hands,
1145 				 * as nfs4delegreturn_cleanup_impl() won't
1146 				 * remove this from the list.
1147 				 */
1148 				list_remove(&sp->s_deleg_list, rp);
1149 				mutex_exit(&rp->r_statev4_lock);
1150 				nfs4_dec_state_ref_count_nolock(sp,
1151 				    VTOMI4(RTOV4(rp)));
1152 				num_removed++;
1153 				continue;
1154 			}
1155 			mutex_exit(&rp->r_statev4_lock);
1156 			VN_HOLD(RTOV4(rp));
1157 			mutex_exit(&sp->s_lock);
1158 			/*
1159 			 * The following will remove the node from the list.
1160 			 */
1161 			nfs4delegreturn_cleanup_impl(rp, sp, ncg);
1162 			VN_RELE(RTOV4(rp));
1163 			mutex_enter(&sp->s_lock);
1164 		}
1165 		mutex_exit(&sp->s_lock);
1166 		/* each removed list node reles a reference */
1167 		while (num_removed-- > 0)
1168 			nfs4_server_rele(sp);
1169 		/* remove our reference for nfs4_server_vlock */
1170 		nfs4_server_rele(sp);
1171 	}
1172 }
1173 
1174 /* ARGSUSED */
1175 static void
1176 nfs4_callback_shutdown_zone(zoneid_t zoneid, void *data)
1177 {
1178 	struct nfs4_callback_globals *ncg = data;
1179 
1180 	/*
1181 	 * Clean pending delegation return list.
1182 	 */
1183 	nfs4_dlistclean_impl(ncg, NFS4_DR_DISCARD);
1184 
1185 	/*
1186 	 * Discard all delegations.
1187 	 */
1188 	nfs4_discard_delegations(ncg);
1189 }
1190 
1191 static void
1192 nfs4_callback_fini_zone(zoneid_t zoneid, void *data)
1193 {
1194 	struct nfs4_callback_globals *ncg = data;
1195 	struct nfs4_cb_port *p;
1196 	nfs4_server_t *sp, *next;
1197 	nfs4_server_t freelist;
1198 	int i;
1199 
1200 	kstat_delete_byname_zone("nfs", 0, "nfs4_callback_stats", zoneid);
1201 
1202 	/*
1203 	 * Discard all delegations that may have crept in since we did the
1204 	 * _shutdown.
1205 	 */
1206 	nfs4_discard_delegations(ncg);
1207 	/*
1208 	 * We're completely done with this zone and all associated
1209 	 * nfs4_server_t's.  Any remaining nfs4_server_ts should only have one
1210 	 * more reference outstanding -- the reference we didn't release in
1211 	 * nfs4_renew_lease_thread().
1212 	 *
1213 	 * Here we need to run through the global nfs4_server_lst as we need to
1214 	 * deal with nfs4_server_ts without programs, as they also have threads
1215 	 * created for them, and so have outstanding references that we need to
1216 	 * release.
1217 	 */
1218 	freelist.forw = &freelist;
1219 	freelist.back = &freelist;
1220 	mutex_enter(&nfs4_server_lst_lock);
1221 	sp = nfs4_server_lst.forw;
1222 	while (sp != &nfs4_server_lst) {
1223 		next = sp->forw;
1224 		if (sp->zoneid == zoneid) {
1225 			remque(sp);
1226 			insque(sp, &freelist);
1227 		}
1228 		sp = next;
1229 	}
1230 	mutex_exit(&nfs4_server_lst_lock);
1231 
1232 	sp = freelist.forw;
1233 	while (sp != &freelist) {
1234 		next = sp->forw;
1235 		sp->forw = sp->back = NULL;
1236 		nfs4_server_rele(sp);	/* free the list's reference */
1237 		sp = next;
1238 	}
1239 
1240 #ifdef DEBUG
1241 	for (i = 0; i < nfs4_num_prognums; i++) {
1242 		ASSERT(ncg->nfs4prog2server[i] == NULL);
1243 	}
1244 #endif
1245 	kmem_free(ncg->nfs4prog2server, nfs4_num_prognums *
1246 	    sizeof (struct nfs4_server *));
1247 
1248 	mutex_enter(&ncg->nfs4_cb_lock);
1249 	while ((p = list_head(&ncg->nfs4_cb_ports)) != NULL) {
1250 		list_remove(&ncg->nfs4_cb_ports, p);
1251 		kmem_free(p, sizeof (*p));
1252 	}
1253 	list_destroy(&ncg->nfs4_cb_ports);
1254 	mutex_destroy(&ncg->nfs4_cb_lock);
1255 	list_destroy(&ncg->nfs4_dlist);
1256 	mutex_destroy(&ncg->nfs4_dlist_lock);
1257 	kmem_free(ncg, sizeof (*ncg));
1258 }
1259 
1260 void
1261 nfs4_callback_init(void)
1262 {
1263 	int i;
1264 	SVC_CALLOUT *nfs4_cb_sc;
1265 
1266 	/* initialize the callback table */
1267 	nfs4_cb_sc = kmem_alloc(nfs4_num_prognums *
1268 		sizeof (SVC_CALLOUT), KM_SLEEP);
1269 
1270 	for (i = 0; i < nfs4_num_prognums; i++) {
1271 		nfs4_cb_sc[i].sc_prog = NFS4_CALLBACK+i;
1272 		nfs4_cb_sc[i].sc_versmin = NFS_CB;
1273 		nfs4_cb_sc[i].sc_versmax = NFS_CB;
1274 		nfs4_cb_sc[i].sc_dispatch = cb_dispatch;
1275 	}
1276 
1277 	nfs4_cb_sct.sct_size = nfs4_num_prognums;
1278 	nfs4_cb_sct.sct_free = FALSE;
1279 	nfs4_cb_sct.sct_sc = nfs4_cb_sc;
1280 
1281 	/*
1282 	 * Compute max bytes required for dynamically allocated parts
1283 	 * of cb_getattr reply.  Only size and change are supported now.
1284 	 * If CB_GETATTR is changed to reply with additional attrs,
1285 	 * additional sizes must be added below.
1286 	 *
1287 	 * fattr4_change + fattr4_size == uint64_t + uint64_t
1288 	 */
1289 	cb_getattr_bytes = 2 * BYTES_PER_XDR_UNIT + 2 * BYTES_PER_XDR_UNIT;
1290 
1291 	zone_key_create(&nfs4_callback_zone_key, nfs4_callback_init_zone,
1292 	    nfs4_callback_shutdown_zone, nfs4_callback_fini_zone);
1293 }
1294 
1295 void
1296 nfs4_callback_fini(void)
1297 {
1298 }
1299 
1300 /*
1301  * NB: This function can be called from the *wrong* zone (ie, the zone that
1302  * 'rp' belongs to and the caller's zone may not be the same).  This can happen
1303  * if the zone is going away and we get called from nfs4_async_inactive().  In
1304  * this case the globals will be NULL and we won't update the counters, which
1305  * doesn't matter as the zone is going away anyhow.
1306  */
1307 static void
1308 nfs4delegreturn_cleanup_impl(rnode4_t *rp, nfs4_server_t *np,
1309 	struct nfs4_callback_globals *ncg)
1310 {
1311 	mntinfo4_t *mi = VTOMI4(RTOV4(rp));
1312 	boolean_t need_rele = B_FALSE;
1313 
1314 	mutex_enter(&rp->r_statev4_lock);
1315 
1316 	if (rp->r_deleg_type == OPEN_DELEGATE_NONE) {
1317 		mutex_exit(&rp->r_statev4_lock);
1318 		return;
1319 	}
1320 
1321 	/*
1322 	 * Free the cred originally held when
1323 	 * the delegation was granted.  Caller must
1324 	 * hold this cred if it wants to use it after
1325 	 * this call.
1326 	 */
1327 	crfree(rp->r_deleg_cred);
1328 	rp->r_deleg_cred = NULL;
1329 	rp->r_deleg_type = OPEN_DELEGATE_NONE;
1330 	rp->r_deleg_needs_recovery = OPEN_DELEGATE_NONE;
1331 	rp->r_deleg_needs_recall = FALSE;
1332 	rp->r_deleg_return_pending = FALSE;
1333 	mutex_exit(&rp->r_statev4_lock);
1334 
1335 	/*
1336 	 * Caller must be holding mi_recovlock in read mode
1337 	 * to call here.  This is provided by start_op.
1338 	 */
1339 
1340 	if (np == NULL) {
1341 		np = find_nfs4_server_all(mi, 1);
1342 		ASSERT(np != NULL);
1343 		need_rele = B_TRUE;
1344 	} else {
1345 		mutex_enter(&np->s_lock);
1346 	}
1347 
1348 	/*
1349 	 * Remove the rnode from the server's list and
1350 	 * update the ref counts.
1351 	 */
1352 	list_remove(&np->s_deleg_list, rp);
1353 	nfs4_dec_state_ref_count_nolock(np, mi);
1354 	mutex_exit(&np->s_lock);
1355 	/* removed list node removes a reference */
1356 	nfs4_server_rele(np);
1357 	if (need_rele)
1358 		nfs4_server_rele(np);
1359 	if (ncg != NULL)
1360 		ncg->nfs4_callback_stats.delegations.value.ui64--;
1361 }
1362 
1363 void
1364 nfs4delegreturn_cleanup(rnode4_t *rp, nfs4_server_t *np)
1365 {
1366 	struct nfs4_callback_globals *ncg;
1367 
1368 	if (np != NULL) {
1369 		ncg = np->zone_globals;
1370 	} else if (nfs_zone() == VTOMI4(RTOV4(rp))->mi_zone) {
1371 		ncg = zone_getspecific(nfs4_callback_zone_key, nfs_zone());
1372 		ASSERT(ncg != NULL);
1373 	} else {
1374 		/*
1375 		 * Request coming from the wrong zone.
1376 		 */
1377 		ASSERT(getzoneid() == GLOBAL_ZONEID);
1378 		ncg = NULL;
1379 	}
1380 
1381 	nfs4delegreturn_cleanup_impl(rp, np, ncg);
1382 }
1383 
1384 static void
1385 nfs4delegreturn_save_lost_rqst(int error, nfs4_lost_rqst_t *lost_rqstp,
1386 	cred_t *cr, vnode_t *vp)
1387 {
1388 	if (error != ETIMEDOUT && error != EINTR &&
1389 	    !NFS4_FRC_UNMT_ERR(error, vp->v_vfsp)) {
1390 		lost_rqstp->lr_op = 0;
1391 		return;
1392 	}
1393 
1394 	NFS4_DEBUG(nfs4_lost_rqst_debug, (CE_NOTE,
1395 			"nfs4close_save_lost_rqst: error %d", error));
1396 
1397 	lost_rqstp->lr_op = OP_DELEGRETURN;
1398 	/*
1399 	 * The vp is held and rele'd via the recovery code.
1400 	 * See nfs4_save_lost_rqst.
1401 	 */
1402 	lost_rqstp->lr_vp = vp;
1403 	lost_rqstp->lr_dvp = NULL;
1404 	lost_rqstp->lr_oop = NULL;
1405 	lost_rqstp->lr_osp = NULL;
1406 	lost_rqstp->lr_lop = NULL;
1407 	lost_rqstp->lr_cr = cr;
1408 	lost_rqstp->lr_flk = NULL;
1409 	lost_rqstp->lr_putfirst = FALSE;
1410 }
1411 
1412 static void
1413 nfs4delegreturn_otw(rnode4_t *rp, cred_t *cr, nfs4_error_t *ep)
1414 {
1415 	COMPOUND4args_clnt args;
1416 	COMPOUND4res_clnt res;
1417 	nfs_argop4 argops[3];
1418 	nfs4_ga_res_t *garp = NULL;
1419 	hrtime_t t;
1420 	int numops;
1421 	int doqueue = 1;
1422 
1423 	args.ctag = TAG_DELEGRETURN;
1424 
1425 	numops = 3;		/* PUTFH, GETATTR, DELEGRETURN */
1426 
1427 	args.array = argops;
1428 	args.array_len = numops;
1429 
1430 	argops[0].argop = OP_CPUTFH;
1431 	argops[0].nfs_argop4_u.opcputfh.sfh = rp->r_fh;
1432 
1433 	argops[1].argop = OP_GETATTR;
1434 	argops[1].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK;
1435 	argops[1].nfs_argop4_u.opgetattr.mi = VTOMI4(RTOV4(rp));
1436 
1437 	argops[2].argop = OP_DELEGRETURN;
1438 	argops[2].nfs_argop4_u.opdelegreturn.deleg_stateid =
1439 		rp->r_deleg_stateid;
1440 
1441 	t = gethrtime();
1442 	rfs4call(VTOMI4(RTOV4(rp)), &args, &res, cr, &doqueue, 0, ep);
1443 
1444 	if (ep->error)
1445 		return;
1446 
1447 	if (res.status == NFS4_OK) {
1448 		garp = &res.array[1].nfs_resop4_u.opgetattr.ga_res;
1449 		nfs4_attr_cache(RTOV4(rp), garp, t, cr, TRUE, NULL);
1450 
1451 	}
1452 	(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
1453 }
1454 
1455 int
1456 nfs4_do_delegreturn(rnode4_t *rp, int flags, cred_t *cr,
1457 	struct nfs4_callback_globals *ncg)
1458 {
1459 	vnode_t *vp = RTOV4(rp);
1460 	mntinfo4_t *mi = VTOMI4(vp);
1461 	nfs4_lost_rqst_t lost_rqst;
1462 	nfs4_recov_state_t recov_state;
1463 	bool_t needrecov = FALSE, recovonly, done = FALSE;
1464 	nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };
1465 
1466 	ncg->nfs4_callback_stats.delegreturn.value.ui64++;
1467 
1468 	while (!done) {
1469 		e.error = nfs4_start_fop(mi, vp, NULL, OH_DELEGRETURN,
1470 				&recov_state, &recovonly);
1471 
1472 		if (e.error) {
1473 			if (flags & NFS4_DR_FORCE) {
1474 				(void) nfs_rw_enter_sig(&mi->mi_recovlock,
1475 				    RW_READER, 0);
1476 				nfs4delegreturn_cleanup_impl(rp, NULL, ncg);
1477 				nfs_rw_exit(&mi->mi_recovlock);
1478 			}
1479 			break;
1480 		}
1481 
1482 		/*
1483 		 * Check to see if the delegation has already been
1484 		 * returned by the recovery thread.   The state of
1485 		 * the delegation cannot change at this point due
1486 		 * to start_fop and the r_deleg_recall_lock.
1487 		 */
1488 		if (rp->r_deleg_type == OPEN_DELEGATE_NONE) {
1489 			e.error = 0;
1490 			nfs4_end_op(mi, vp, NULL, &recov_state, needrecov);
1491 			break;
1492 		}
1493 
1494 		if (recovonly) {
1495 			/*
1496 			 * Delegation will be returned via the
1497 			 * recovery framework.  Build a lost request
1498 			 * structure, start recovery and get out.
1499 			 */
1500 			nfs4_error_init(&e, EINTR);
1501 			nfs4delegreturn_save_lost_rqst(e.error, &lost_rqst,
1502 				cr, vp);
1503 			(void) nfs4_start_recovery(&e, mi, vp,
1504 				NULL, &rp->r_deleg_stateid,
1505 				lost_rqst.lr_op == OP_DELEGRETURN ?
1506 				&lost_rqst : NULL, OP_DELEGRETURN, NULL);
1507 			nfs4_end_op(mi, vp, NULL, &recov_state, needrecov);
1508 			break;
1509 		}
1510 
1511 		nfs4delegreturn_otw(rp, cr, &e);
1512 
1513 		/*
1514 		 * Ignore some errors on delegreturn; no point in marking
1515 		 * the file dead on a state destroying operation.
1516 		 */
1517 		if (e.error == 0 && (nfs4_recov_marks_dead(e.stat) ||
1518 		    e.stat == NFS4ERR_BADHANDLE ||
1519 		    e.stat == NFS4ERR_STALE))
1520 			needrecov = FALSE;
1521 		else
1522 			needrecov = nfs4_needs_recovery(&e, TRUE, vp->v_vfsp);
1523 
1524 		if (needrecov) {
1525 			nfs4delegreturn_save_lost_rqst(e.error, &lost_rqst,
1526 				cr, vp);
1527 			(void) nfs4_start_recovery(&e, mi, vp,
1528 				NULL, &rp->r_deleg_stateid,
1529 				lost_rqst.lr_op == OP_DELEGRETURN ?
1530 				&lost_rqst : NULL, OP_DELEGRETURN, NULL);
1531 		} else {
1532 			nfs4delegreturn_cleanup_impl(rp, NULL, ncg);
1533 			done = TRUE;
1534 		}
1535 
1536 		nfs4_end_op(mi, vp, NULL, &recov_state, needrecov);
1537 	}
1538 	return (e.error);
1539 }
1540 
1541 /*
1542  * nfs4_resend_delegreturn - used to drive the delegreturn
1543  * operation via the recovery thread.
1544  */
1545 void
1546 nfs4_resend_delegreturn(nfs4_lost_rqst_t *lorp, nfs4_error_t *ep,
1547 	nfs4_server_t *np)
1548 {
1549 	rnode4_t *rp = VTOR4(lorp->lr_vp);
1550 
1551 	/* If the file failed recovery, just quit. */
1552 	mutex_enter(&rp->r_statelock);
1553 	if (rp->r_flags & R4RECOVERR) {
1554 		ep->error = EIO;
1555 	}
1556 	mutex_exit(&rp->r_statelock);
1557 
1558 	if (!ep->error)
1559 		nfs4delegreturn_otw(rp, lorp->lr_cr, ep);
1560 
1561 	/*
1562 	 * If recovery is now needed, then return the error
1563 	 * and status and let the recovery thread handle it,
1564 	 * including re-driving another delegreturn.  Otherwise,
1565 	 * just give up and clean up the delegation.
1566 	 */
1567 	if (nfs4_needs_recovery(ep, TRUE, lorp->lr_vp->v_vfsp))
1568 		return;
1569 
1570 	if (rp->r_deleg_type != OPEN_DELEGATE_NONE)
1571 		nfs4delegreturn_cleanup(rp, np);
1572 
1573 	nfs4_error_zinit(ep);
1574 }
1575 
1576 /*
1577  * nfs4delegreturn - general function to return a delegation.
1578  *
1579  * NFS4_DR_FORCE - return the delegation even if start_op fails
1580  * NFS4_DR_PUSH - push modified data back to the server via VOP_PUTPAGE
1581  * NFS4_DR_DISCARD - discard the delegation w/o delegreturn
1582  * NFS4_DR_DID_OP - calling function already did nfs4_start_op
1583  * NFS4_DR_RECALL - delegreturn initiated via CB_RECALL
1584  * NFS4_DR_REOPEN - do file reopens, if applicable
1585  */
1586 static int
1587 nfs4delegreturn_impl(rnode4_t *rp, int flags, struct nfs4_callback_globals *ncg)
1588 {
1589 	int error = 0;
1590 	cred_t *cr = NULL;
1591 	vnode_t *vp;
1592 	bool_t needrecov = FALSE;
1593 	bool_t rw_entered = FALSE;
1594 	bool_t do_reopen;
1595 
1596 	vp = RTOV4(rp);
1597 
1598 	/*
1599 	 * If NFS4_DR_DISCARD is set by itself, take a short-cut and
1600 	 * discard without doing an otw DELEGRETURN.  This may only be used
1601 	 * by the recovery thread because it bypasses the synchronization
1602 	 * with r_deleg_recall_lock and mi->mi_recovlock.
1603 	 */
1604 	if (flags == NFS4_DR_DISCARD) {
1605 		nfs4delegreturn_cleanup_impl(rp, NULL, ncg);
1606 		return (0);
1607 	}
1608 
1609 	if (flags & NFS4_DR_DID_OP) {
1610 		/*
1611 		 * Caller had already done start_op, which means the
1612 		 * r_deleg_recall_lock is already held in READ mode
1613 		 * so we cannot take it in write mode.  Return the
1614 		 * delegation asynchronously.
1615 		 *
1616 		 * Remove the NFS4_DR_DID_OP flag so we don't
1617 		 * get stuck looping through here.
1618 		 */
1619 		VN_HOLD(vp);
1620 		nfs4delegreturn_async(rp, (flags & ~NFS4_DR_DID_OP), FALSE);
1621 		return (0);
1622 	}
1623 
1624 	/*
1625 	 * Take r_deleg_recall_lock to verify we still have a delegation
1626 	 * and to crhold the credential.  We have to release the lock
1627 	 * before we call VOP_PUTPAGE or else we'll deadlock.
1628 	 */
1629 	(void) nfs_rw_enter_sig(&rp->r_deleg_recall_lock, RW_WRITER, FALSE);
1630 	rw_entered = TRUE;
1631 	if (rp->r_deleg_type == OPEN_DELEGATE_NONE)
1632 		goto out;
1633 	cr = rp->r_deleg_cred;
1634 	crhold(cr);
1635 	nfs_rw_exit(&rp->r_deleg_recall_lock);
1636 	rw_entered = FALSE;
1637 
1638 	/*
1639 	 * Push the modified data back to the server synchronously
1640 	 * before doing DELEGRETURN.
1641 	 */
1642 	if (flags & NFS4_DR_PUSH)
1643 		(void) VOP_PUTPAGE(vp, 0, 0, 0, cr);
1644 
1645 	/*
1646 	 * Take r_deleg_recall_lock in WRITE mode, this will prevent
1647 	 * nfs4_is_otw_open_necessary from trying to use the delegation
1648 	 * while the DELEGRETURN is in progress.
1649 	 */
1650 	(void) nfs_rw_enter_sig(&rp->r_deleg_recall_lock, RW_WRITER, FALSE);
1651 
1652 	rw_entered = TRUE;
1653 
1654 	if (rp->r_deleg_type == OPEN_DELEGATE_NONE)
1655 		goto out;
1656 
1657 	if (flags & NFS4_DR_REOPEN) {
1658 		/*
1659 		 * If R4RECOVERRP is already set, then skip re-opening
1660 		 * the delegation open streams and go straight to doing
1661 		 * delegreturn.  (XXX if the file has failed recovery, then the
1662 		 * delegreturn attempt is likely to be futile.)
1663 		 */
1664 		mutex_enter(&rp->r_statelock);
1665 		do_reopen = !(rp->r_flags & R4RECOVERRP);
1666 		mutex_exit(&rp->r_statelock);
1667 
1668 		if (do_reopen) {
1669 			error = deleg_reopen(vp, &needrecov, ncg, flags);
1670 			if (error != 0) {
1671 				if ((flags & (NFS4_DR_FORCE | NFS4_DR_RECALL))
1672 									== 0)
1673 					goto out;
1674 			} else if (needrecov) {
1675 				if ((flags & NFS4_DR_FORCE) == 0)
1676 					goto out;
1677 			}
1678 		}
1679 	}
1680 
1681 	if (flags & NFS4_DR_DISCARD) {
1682 		mntinfo4_t *mi = VTOMI4(RTOV4(rp));
1683 
1684 		mutex_enter(&rp->r_statelock);
1685 		/*
1686 		 * r_deleg_return_pending is cleared inside of delegation_accept()
1687 		 * when a delegation is accepted.  If this flag has been
1688 		 * cleared, then a new delegation has overwritten the one we
1689 		 * were about to throw away.
1690 		 */
1691 		if (!rp->r_deleg_return_pending) {
1692 			mutex_exit(&rp->r_statelock);
1693 			goto out;
1694 		}
1695 		mutex_exit(&rp->r_statelock);
1696 		(void) nfs_rw_enter_sig(&mi->mi_recovlock, RW_READER, FALSE);
1697 		nfs4delegreturn_cleanup_impl(rp, NULL, ncg);
1698 		nfs_rw_exit(&mi->mi_recovlock);
1699 	} else {
1700 		error = nfs4_do_delegreturn(rp, flags, cr, ncg);
1701 	}
1702 
1703 out:
1704 	if (cr)
1705 		crfree(cr);
1706 	if (rw_entered)
1707 		nfs_rw_exit(&rp->r_deleg_recall_lock);
1708 	return (error);
1709 }
1710 
1711 int
1712 nfs4delegreturn(rnode4_t *rp, int flags)
1713 {
1714 	struct nfs4_callback_globals *ncg;
1715 
1716 	ncg = zone_getspecific(nfs4_callback_zone_key, nfs_zone());
1717 	ASSERT(ncg != NULL);
1718 
1719 	return (nfs4delegreturn_impl(rp, flags, ncg));
1720 }
1721 
1722 void
1723 nfs4delegreturn_async(rnode4_t *rp, int flags, bool_t trunc)
1724 {
1725 	struct cb_recall_pass *pp;
1726 
1727 	pp = kmem_alloc(sizeof (struct cb_recall_pass), KM_SLEEP);
1728 	pp->rp = rp;
1729 	pp->flags = flags;
1730 	pp->truncate = trunc;
1731 
1732 	/*
1733 	 * Fire up a thread to do the actual delegreturn
1734 	 * Caller must guarantee that the rnode doesn't
1735 	 * vanish (by calling VN_HOLD).
1736 	 */
1737 
1738 	(void) zthread_create(NULL, 0, nfs4delegreturn_thread, pp, 0,
1739 				minclsyspri);
1740 }
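
/*
 * Flag combinations seen in this file (for reference): cb_recall()
 * fires the async return with NFS4_DR_RECALL | NFS4_DR_REOPEN,
 * delegreturn_all_thread() returns each delegation with
 * NFS4_DR_PUSH | NFS4_DR_REOPEN, and the zone shutdown path discards
 * with NFS4_DR_DISCARD alone.
 */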
1741 
1742 static void
1743 delegreturn_all_thread(rpcprog_t *pp)
1744 {
1745 	nfs4_server_t *np;
1746 	bool_t found = FALSE;
1747 	rpcprog_t prog;
1748 	rnode4_t *rp;
1749 	vnode_t *vp;
1750 	zoneid_t zoneid = getzoneid();
1751 	struct nfs4_callback_globals *ncg;
1752 
1753 	NFS4_DEBUG(nfs4_drat_debug,
1754 		(CE_NOTE, "delegreturn_all_thread: prog %d\n", *pp));
1755 
1756 	prog = *pp;
1757 	kmem_free(pp, sizeof (*pp));
1758 	pp = NULL;
1759 
1760 	mutex_enter(&nfs4_server_lst_lock);
1761 	for (np = nfs4_server_lst.forw; np != &nfs4_server_lst; np = np->forw) {
1762 		if (np->zoneid == zoneid && np->s_program == prog) {
1763 			mutex_enter(&np->s_lock);
1764 			found = TRUE;
1765 			break;
1766 		}
1767 	}
1768 	mutex_exit(&nfs4_server_lst_lock);
1769 
1770 	/*
1771 	 * It's possible that the nfs4_server which was using this
1772 	 * program number has vanished since this thread is async.
1773 	 * If so, just return.  Your work here is finished, my friend.
1774 	 */
1775 	if (!found)
1776 		goto out;
1777 
1778 	ncg = np->zone_globals;
1779 	while ((rp = list_head(&np->s_deleg_list)) != NULL) {
1780 		vp = RTOV4(rp);
1781 		VN_HOLD(vp);
1782 		mutex_exit(&np->s_lock);
1783 		(void) nfs4delegreturn_impl(rp, NFS4_DR_PUSH|NFS4_DR_REOPEN,
1784 									ncg);
1785 		VN_RELE(vp);
1786 
1787 		/* retake the s_lock for next trip through the loop */
1788 		mutex_enter(&np->s_lock);
1789 	}
1790 	mutex_exit(&np->s_lock);
1791 out:
1792 	NFS4_DEBUG(nfs4_drat_debug,
1793 		(CE_NOTE, "delegreturn_all_thread: complete\n"));
1794 	zthread_exit();
1795 }
1796 
1797 void
1798 nfs4_delegreturn_all(nfs4_server_t *sp)
1799 {
1800 	rpcprog_t pro, *pp;
1801 
1802 	mutex_enter(&sp->s_lock);
1803 
1804 	/* Check to see if the delegation list is empty */
1805 
1806 	if (list_head(&sp->s_deleg_list) == NULL) {
1807 		mutex_exit(&sp->s_lock);
1808 		return;
1809 	}
1810 	/*
1811 	 * Grab the program number; the async thread will use this
1812 	 * to find the nfs4_server.
1813 	 */
1814 	pro = sp->s_program;
1815 	mutex_exit(&sp->s_lock);
1816 	pp = kmem_alloc(sizeof (rpcprog_t), KM_SLEEP);
1817 	*pp = pro;
1818 	(void) zthread_create(NULL, 0, delegreturn_all_thread, pp, 0,
1819 	    minclsyspri);
1820 }
1821 
1822 
1823 /*
1824  * Discard any delegations
1825  *
1826  * Iterate over the server's s_deleg_list and
1827  * for matching mount-point rnodes discard
1828  * the delegation.
1829  */
1830 void
1831 nfs4_deleg_discard(mntinfo4_t *mi, nfs4_server_t *sp)
1832 {
1833 	rnode4_t *rp, *next;
1834 	mntinfo4_t *r_mi;
1835 	struct nfs4_callback_globals *ncg;
1836 
1837 	ASSERT(mutex_owned(&sp->s_lock));
1838 	ncg = sp->zone_globals;
1839 
1840 	for (rp = list_head(&sp->s_deleg_list); rp != NULL; rp = next) {
1841 		r_mi = VTOMI4(RTOV4(rp));
1842 		next = list_next(&sp->s_deleg_list, rp);
1843 
1844 		if (r_mi != mi) {
1845 			/*
1846 			 * Skip if this rnode is not on the
1847 			 * same mount-point
1848 			 */
1849 			continue;
1850 		}
1851 
1852 		ASSERT(rp->r_deleg_type == OPEN_DELEGATE_READ);
1853 
1854 #ifdef DEBUG
1855 		if (nfs4_client_recov_debug) {
1856 			zprintf(getzoneid(),
1857 			    "nfs4_deleg_discard: matched rnode %p "
1858 			"-- discarding delegation\n", (void *)rp);
1859 		}
1860 #endif
1861 		mutex_enter(&rp->r_statev4_lock);
1862 		/*
1863 		 * Free the cred originally held when the delegation
1864 		 * was granted. Also need to decrement the refcnt
1865 		 * on this server for each delegation we discard
1866 		 */
1867 		if (rp->r_deleg_cred)
1868 			crfree(rp->r_deleg_cred);
1869 		rp->r_deleg_cred = NULL;
1870 		rp->r_deleg_type = OPEN_DELEGATE_NONE;
1871 		rp->r_deleg_needs_recovery = OPEN_DELEGATE_NONE;
1872 		rp->r_deleg_needs_recall = FALSE;
1873 		ASSERT(sp->s_refcnt > 1);
1874 		sp->s_refcnt--;
1875 		list_remove(&sp->s_deleg_list, rp);
1876 		mutex_exit(&rp->r_statev4_lock);
1877 		nfs4_dec_state_ref_count_nolock(sp, mi);
1878 		ncg->nfs4_callback_stats.delegations.value.ui64--;
1879 	}
1880 }
1881 
1882 /*
1883  * Reopen any open streams that were covered by the given file's
1884  * delegation.
1885  * Returns zero or an errno value.  If there was no error, *recovp
1886  * indicates whether recovery was initiated.
1887  */
1888 
1889 static int
1890 deleg_reopen(vnode_t *vp, bool_t *recovp, struct nfs4_callback_globals *ncg,
1891 	int flags)
1892 {
1893 	nfs4_open_stream_t *osp;
1894 	nfs4_recov_state_t recov_state;
1895 	bool_t needrecov = FALSE;
1896 	mntinfo4_t *mi;
1897 	rnode4_t *rp;
1898 	nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };
1899 	int claimnull;
1900 
1901 	mi = VTOMI4(vp);
1902 	rp = VTOR4(vp);
1903 
1904 	recov_state.rs_flags = 0;
1905 	recov_state.rs_num_retry_despite_err = 0;
1906 
1907 retry:
1908 	if ((e.error = nfs4_start_op(mi, vp, NULL, &recov_state)) != 0) {
1909 		return (e.error);
1910 	}
1911 
1912 	/*
1913 	 * If we mean to discard the delegation, it must be BAD, so don't
1914 	 * use it when doing the reopen or the reopen will fail too.
1915 	 */
1916 	claimnull = (flags & NFS4_DR_DISCARD);
1917 	/*
1918 	 * Loop through the open streams for this rnode to find
1919 	 * all of the ones created using the delegation state ID.
1920 	 * Each of these needs to be re-opened.
1921 	 */
1922 
1923 	while ((osp = get_next_deleg_stream(rp, claimnull)) != NULL) {
1924 
1925 		if (claimnull) {
1926 			nfs4_reopen(vp, osp, &e, CLAIM_NULL, FALSE, FALSE);
1927 		} else {
1928 			ncg->nfs4_callback_stats.claim_cur.value.ui64++;
1929 
1930 			nfs4_reopen(vp, osp, &e, CLAIM_DELEGATE_CUR, FALSE,
1931 					FALSE);
1932 			if (e.error == 0 && e.stat == NFS4_OK)
1933 				ncg->nfs4_callback_stats.
1934 					claim_cur_ok.value.ui64++;
1935 		}
1936 
1937 		if (e.error == EAGAIN) {
1938 			nfs4_end_op(mi, vp, NULL, &recov_state, TRUE);
1939 			goto retry;
1940 		}
1941 
1942 		/*
1943 		 * if error is EINTR, ETIMEDOUT, or NFS4_FRC_UNMT_ERR, then
1944 		 * recovery has already been started inside of nfs4_reopen.
1945 		 */
1946 		if (e.error == EINTR || e.error == ETIMEDOUT ||
1947 		    NFS4_FRC_UNMT_ERR(e.error, vp->v_vfsp)) {
1948 			open_stream_rele(osp, rp);
1949 			break;
1950 		}
1951 
1952 		needrecov = nfs4_needs_recovery(&e, TRUE, vp->v_vfsp);
1953 
1954 		if (e.error != 0 && !needrecov) {
1955 			/*
1956 			 * Recovery is not possible, but don't give up yet;
1957 			 * we'd still like to do delegreturn after
1958 			 * reopening as many streams as possible.
1959 			 * Continue processing the open streams.
1960 			 */
1961 
1962 			ncg->nfs4_callback_stats.recall_failed.value.ui64++;
1963 
1964 		} else if (needrecov) {
1965 			/*
1966 			 * Start recovery and bail out.  The recovery
1967 			 * thread will take it from here.
1968 			 */
1969 			(void) nfs4_start_recovery(&e, mi, vp, NULL, NULL,
1970 				NULL, OP_OPEN, NULL);
1971 			open_stream_rele(osp, rp);
1972 			*recovp = TRUE;
1973 			break;
1974 		}
1975 
1976 		open_stream_rele(osp, rp);
1977 	}
1978 
1979 	nfs4_end_op(mi, vp, NULL, &recov_state, needrecov);
1980 
1981 	return (e.error);
1982 }
1983 
1984 /*
1985  * get_next_deleg_stream - returns the next open stream which
1986  * represents a delegation for this rnode.  In order to ensure
1987  * forward progress, the caller must guarantee that each open
1988  * stream returned is changed so that a future call won't return
1989  * it again.
1990  *
1991  * There are several ways for the open stream to change.  If the open
1992  * stream is !os_delegation, then we aren't interested in it.  Also, if
1993  * either os_failed_reopen or !os_valid, then don't return the osp.
1994  *
1995  * If claimnull is false (the reopen will use CLAIM_DELEGATE_CUR),
1996  * then any os_delegation open stream is returned.  If claimnull is
1997  * true, an os_delegation osp is returned only if the rnode still has
1998  * r_deleg_return_pending set, or if the rnode's r_deleg_stateid
1999  * differs from the osp's open_stateid.
2000  *
2001  * We have already taken the 'r_deleg_recall_lock' as WRITER, which
2002  * prevents new OPENs from going OTW (as start_fop takes this
2003  * lock in READ mode); thus, no new open streams can be created
2004  * (which inherently means no new delegation open streams are
2005  * being created).
2006  */
2007 
2008 static nfs4_open_stream_t *
2009 get_next_deleg_stream(rnode4_t *rp, int claimnull)
2010 {
2011 	nfs4_open_stream_t	*osp;
2012 
2013 	ASSERT(nfs_rw_lock_held(&rp->r_deleg_recall_lock, RW_WRITER));
2014 
2015 	/*
2016 	 * Search through the list of open streams looking for
2017 	 * one that was created while holding the delegation.
2018 	 */
2019 	mutex_enter(&rp->r_os_lock);
2020 	for (osp = list_head(&rp->r_open_streams); osp != NULL;
2021 	    osp = list_next(&rp->r_open_streams, osp)) {
2022 		mutex_enter(&osp->os_sync_lock);
2023 		if (!osp->os_delegation || osp->os_failed_reopen ||
2024 		    !osp->os_valid) {
2025 			mutex_exit(&osp->os_sync_lock);
2026 			continue;
2027 		}
2028 		if (!claimnull || rp->r_deleg_return_pending ||
2029 		    !stateid4_cmp(&osp->open_stateid, &rp->r_deleg_stateid)) {
2030 			osp->os_ref_count++;
2031 			mutex_exit(&osp->os_sync_lock);
2032 			mutex_exit(&rp->r_os_lock);
2033 			return (osp);
2034 		}
2035 		mutex_exit(&osp->os_sync_lock);
2036 	}
2037 	mutex_exit(&rp->r_os_lock);
2038 
2039 	return (NULL);
2040 }
2041 
2042 static void
2043 nfs4delegreturn_thread(struct cb_recall_pass *args)
2044 {
2045 	rnode4_t *rp;
2046 	vnode_t *vp;
2047 	cred_t *cr;
2048 	int dtype, error, flags;
2049 	bool_t rdirty, rip;
2050 	kmutex_t cpr_lock;
2051 	callb_cpr_t cpr_info;
2052 	struct nfs4_callback_globals *ncg;
2053 
2054 	ncg = zone_getspecific(nfs4_callback_zone_key, nfs_zone());
2055 	ASSERT(ncg != NULL);
2056 
2057 	mutex_init(&cpr_lock, NULL, MUTEX_DEFAULT, NULL);
2058 
2059 	CALLB_CPR_INIT(&cpr_info, &cpr_lock, callb_generic_cpr,
2060 			"nfsv4delegRtn");
2061 
2062 	rp = args->rp;
2063 	vp = RTOV4(rp);
2064 
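	/*
	 * Quick check without r_rwlock: if the delegation is already
	 * gone there is nothing to do.  The type is re-checked below
	 * once r_rwlock is held, since it may change in the interim.
	 */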
2065 	mutex_enter(&rp->r_statev4_lock);
2066 	if (rp->r_deleg_type == OPEN_DELEGATE_NONE) {
2067 		mutex_exit(&rp->r_statev4_lock);
2068 		goto out;
2069 	}
2070 	mutex_exit(&rp->r_statev4_lock);
2071 
2072 	/*
2073 	 * Take the read-write lock in read mode to prevent other
2074 	 * threads from modifying the data during the recall.  This
2075 	 * doesn't affect mmappers.
2076 	 */
2077 	(void) nfs_rw_enter_sig(&rp->r_rwlock, RW_READER, FALSE);
2078 
2079 	/* Proceed with delegreturn */
2080 
2081 	mutex_enter(&rp->r_statev4_lock);
2082 	if (rp->r_deleg_type == OPEN_DELEGATE_NONE) {
2083 		mutex_exit(&rp->r_statev4_lock);
2084 		nfs_rw_exit(&rp->r_rwlock);
2085 		goto out;
2086 	}
2087 	dtype = rp->r_deleg_type;
2088 	cr = rp->r_deleg_cred;
2089 	ASSERT(cr != NULL);
2090 	crhold(cr);
2091 	mutex_exit(&rp->r_statev4_lock);
2092 
2093 	flags = args->flags;
2094 
2095 	/*
2096 	 * If the file is being truncated at the server, then throw
2097 	 * away all of the pages; it doesn't matter what flavor of
2098 	 * delegation we have.
2099 	 */
2100 
2101 	if (args->truncate) {
2102 		ncg->nfs4_callback_stats.recall_trunc.value.ui64++;
2103 		nfs4_invalidate_pages(vp, 0, cr);
2104 	} else if (dtype == OPEN_DELEGATE_WRITE) {
2105 
2106 		mutex_enter(&rp->r_statelock);
2107 		rdirty = rp->r_flags & R4DIRTY;
2108 		mutex_exit(&rp->r_statelock);
2109 
2110 		if (rdirty) {
2111 			error = VOP_PUTPAGE(vp, 0, 0, 0, cr);
2112 
2113 			if (error)
2114 				CB_WARN1("nfs4delegreturn_thread:"
2115 				" VOP_PUTPAGE: %d\n", error);
2116 		}
2117 		/* turn off NFS4_DR_PUSH because we just did that above. */
2118 		flags &= ~NFS4_DR_PUSH;
2119 	}
2120 
2121 	mutex_enter(&rp->r_statelock);
2122 	rip = rp->r_flags & R4RECOVERRP;
2123 	mutex_exit(&rp->r_statelock);
2124 
2125 	/* If a failed recovery is indicated, discard the pages */
2126 
2127 	if (rip) {
2128 
2129 		error = VOP_PUTPAGE(vp, 0, 0, B_INVAL, cr);
2130 
2131 		if (error)
2132 			CB_WARN1("nfs4delegreturn_thread: VOP_PUTPAGE: %d\n",
2133 				error);
2134 	}
2135 
2136 	/*
2137 	 * Pass the flags to nfs4delegreturn_impl, but be sure not to pass
2138 	 * NFS4_DR_DID_OP, or it will just call nfs4delegreturn_async again.
2139 	 */
2140 	flags &= ~NFS4_DR_DID_OP;
2141 
2142 	(void) nfs4delegreturn_impl(rp, flags, ncg);
2143 
2144 	nfs_rw_exit(&rp->r_rwlock);
2145 	crfree(cr);
2146 out:
2147 	kmem_free(args, sizeof (struct cb_recall_pass));
2148 	VN_RELE(vp);
2149 	mutex_enter(&cpr_lock);
2150 	CALLB_CPR_EXIT(&cpr_info);
2151 	mutex_destroy(&cpr_lock);
2152 	zthread_exit();
2153 }
2154 
2155 /*
2156  * This function has one assumption that the caller of this function is
2157  * either doing recovery (therefore cannot call nfs4_start_op) or has
2158  * already called nfs4_start_op().
2159  */
2160 void
2161 nfs4_delegation_accept(rnode4_t *rp, open_claim_type4 claim, OPEN4res *res,
2162 	nfs4_ga_res_t *garp, cred_t *cr)
2163 {
2164 	open_read_delegation4 *orp;
2165 	open_write_delegation4 *owp;
2166 	nfs4_server_t *np;
2167 	bool_t already = FALSE;
2168 	bool_t recall = FALSE;
2169 	bool_t valid_garp = TRUE;
2170 	long mapcnt;
2171 	uint_t rflag;
2172 	mntinfo4_t *mi;
2173 	bool_t recov;
2174 	struct nfs4_callback_globals *ncg;
2175 	open_delegation_type4 odt;
2176 
2177 	ncg = zone_getspecific(nfs4_callback_zone_key, nfs_zone());
2178 	ASSERT(ncg != NULL);
2179 
2180 	mutex_enter(&rp->r_statev4_lock);
2181 
2182 	if (rp->r_deleg_type == OPEN_DELEGATE_READ ||
2183 	    rp->r_deleg_type == OPEN_DELEGATE_WRITE)
2184 		already = TRUE;
2185 
2186 	odt = res->delegation.delegation_type;
2187 	mutex_exit(&rp->r_statev4_lock);
2188 
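	/*
	 * Dispatch on the delegation type in the OPEN result: record a
	 * read or write delegation below, or fall through to the final
	 * else arm for the no-delegation case (which may also have to
	 * dispose of a stale delegation left over from before a
	 * recovery reopen).
	 */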
2189 	if (odt == OPEN_DELEGATE_READ) {
2190 
2191 		mutex_enter(&rp->r_statev4_lock);
2192 		rp->r_deleg_type = res->delegation.delegation_type;
2193 		orp = &res->delegation.open_delegation4_u.read;
2194 		rp->r_deleg_stateid = orp->stateid;
2195 		rp->r_deleg_perms = orp->permissions;
2196 		recall = orp->recall;
2197 		mutex_exit(&rp->r_statev4_lock);
2198 
2199 		ncg->nfs4_callback_stats.delegations.value.ui64++;
2200 		ncg->nfs4_callback_stats.delegaccept_r.value.ui64++;
2201 
2202 	} else if (odt == OPEN_DELEGATE_WRITE) {
2203 
2204 		mutex_enter(&rp->r_statelock);
2205 		mutex_enter(&rp->r_statev4_lock);
2206 		rp->r_deleg_type = res->delegation.delegation_type;
2207 		owp = &res->delegation.open_delegation4_u.write;
2208 		rp->r_deleg_stateid = owp->stateid;
2209 		rp->r_deleg_perms = owp->permissions;
2210 		rp->r_deleg_limit = owp->space_limit;
2211 		recall = owp->recall;
2212 
2213 		if (garp == NULL || !garp->n4g_change_valid) {
2214 			valid_garp = FALSE;
2215 			rp->r_deleg_change = 0;
2216 			rp->r_deleg_change_grant = 0;
2217 		} else {
2218 			rp->r_deleg_change = garp->n4g_change;
2219 			rp->r_deleg_change_grant = garp->n4g_change;
2220 		}
2221 		mapcnt = rp->r_mapcnt;
2222 		rflag = rp->r_flags;
2223 
2224 		/*
2225 		 * Update the delegation change attribute if
2226 		 * there are mappers for the file or the file is
2227 		 * dirty.  This might be the case during recovery
2228 		 * after a server reboot.
2229 		 */
2230 		if (mapcnt > 0 || rflag & R4DIRTY)
2231 			rp->r_deleg_change++;
2232 
2233 		NFS4_DEBUG(nfs4_callback_debug, (CE_NOTE,
2234 			"nfs4_delegation_accept: r_deleg_change: 0x%x\n",
2235 			(int)(rp->r_deleg_change >> 32)));
2236 		NFS4_DEBUG(nfs4_callback_debug, (CE_NOTE,
2237 			"nfs4_delegation_accept: r_delg_change_grant: 0x%x\n",
2238 			"nfs4_delegation_accept: r_deleg_change_grant: 0x%x\n",
2239 
2240 #ifdef	DEBUG
2241 		if (nfs4_use_phony_limit == 1)
2242 			rp->r_deleg_limit = nfs4_deleg_space_phony;
2243 		if (nfs4_use_phony_limit == 2) {
2244 			rp->r_deleg_limit = nfs4_deleg_space_phony2;
2245 			rp->r_deleg_limit.nfs_space_limit4_u.mod_blocks =
2246 				nfs4_deleg_space_phonyl;
2247 		}
2248 #endif
2249 		mutex_exit(&rp->r_statev4_lock);
2250 		mutex_exit(&rp->r_statelock);
2251 
2252 		ncg->nfs4_callback_stats.delegations.value.ui64++;
2253 		ncg->nfs4_callback_stats.delegaccept_rw.value.ui64++;
2254 
2255 #ifdef	DEBUG
2256 
2257 	} else if (nfs4_deleg_accept_phony == OPEN_DELEGATE_READ) {
2258 
2259 		mutex_enter(&rp->r_statev4_lock);
2260 		rp->r_deleg_type = OPEN_DELEGATE_READ;
2261 		rp->r_deleg_stateid = nfs4_deleg_any;
2262 		rp->r_deleg_perms = nfs4_deleg_ace_phony;
2263 		rp->r_deleg_change = nfs4_deleg_change_phony;
2264 		rp->r_deleg_change_grant = rp->r_deleg_change;
2265 		mutex_exit(&rp->r_statev4_lock);
2266 
2267 	} else if (nfs4_deleg_accept_phony == OPEN_DELEGATE_WRITE) {
2268 
2269 		mutex_enter(&rp->r_statev4_lock);
2270 		rp->r_deleg_type = OPEN_DELEGATE_WRITE;
2271 		rp->r_deleg_stateid = nfs4_deleg_any;
2272 		rp->r_deleg_perms = nfs4_deleg_ace_phony;
2273 		rp->r_deleg_limit = nfs4_deleg_space_phony;
2274 		rp->r_deleg_change = nfs4_deleg_change_phony;
2275 		rp->r_deleg_change_grant = rp->r_deleg_change;
2276 		mutex_exit(&rp->r_statev4_lock);
2277 
2278 #endif
2279 	} else {
2280 
2281 		if (already) {
2282 			switch (claim) {
2283 
2284 			case CLAIM_NULL:
2285 			case CLAIM_PREVIOUS:
2286 				/*
2287 				 * The file may already have a delegation when
2288 				 * it is reopened during recovery.  In this
2289 				 * case, we consider the delegation to no longer
2290 				 * be valid.  As a courtesy, attempt to return
2291 				 * the delegation.
2292 				 */
2293 				mi = VTOMI4(RTOV4(rp));
2294 				mutex_enter(&mi->mi_lock);
2295 				recov = mi->mi_recovflags & MI4R_REOPEN_FILES;
2296 				mutex_exit(&mi->mi_lock);
2297 
2298 				/*
2299 				 * We need to hold rp->r_statev4_lock while
2300 				 * checking rp->r_deleg_return_pending and
2301 				 * when calling nfs4_dlistadd() if we're in
2302 				 * recovery.
2303 				 */
2304 				mutex_enter(&rp->r_statev4_lock);
2305 				if (rp->r_deleg_return_pending == TRUE) {
2306 					/*
2307 					 * We're already in the throes of
2308 					 * returning a delegation.  Drop
2309 					 * the lock and head for the return.
2310 					 */
2311 					mutex_exit(&rp->r_statev4_lock);
2312 				} else if (recov) {
2313 					/*
2314 					 * Cannot call delegreturn from inside
2315 					 * of recovery or VOP_PUTPAGE will hang
2316 					 * due to nfs4_start_fop call in
2317 					 * nfs4write.  Use dlistadd to add the
2318 					 * rnode to the list of rnodes needing
2319 					 * cleaning.
2320 					 *
2321 					 * NB: We're in recovery, so don't reopen.
2322 					 */
2323 					nfs4_dlistadd(rp, ncg,
2324 						NFS4_DR_PUSH|NFS4_DR_DISCARD);
2325 					mutex_exit(&rp->r_statev4_lock);
2326 				} else {
2327 					mutex_exit(&rp->r_statev4_lock);
2328 					/* XXX - Do we need to reopen? */
2329 					(void) nfs4delegreturn_impl(rp,
2330 						(NFS4_DR_PUSH |
2331 						    NFS4_DR_DID_OP |
2332 						    NFS4_DR_REOPEN),
2333 						ncg);
2334 				}
2335 				break;
2336 
2337 			default:
2338 				/*
2339 				 * CLAIM_DELEGATE_CUR, CLAIM_DELEGATE_PREV
2340 				 * fall through here
2341 				 */
2342 				break;
2343 			}
2344 		}
2345 
2346 		/* No delegation granted, get out. */
2347 		return;
2348 	}
2349 
2350 	mutex_enter(&rp->r_statev4_lock);
2351 	rp->r_deleg_return_pending = FALSE;
2352 	rp->r_deleg_needs_recovery = OPEN_DELEGATE_NONE;
2353 	if (claim == CLAIM_PREVIOUS)
2354 		rp->r_deleg_needs_recall = recall;
2355 
2356 #ifdef	DEBUG
2357 	if (nfs4_use_phony_recall)
2358 		rp->r_deleg_needs_recall = nfs4_phony_recall_v;
2359 #endif
2360 
2361 	/*
2362 	 * If the server has requested a recall, then put the
2363 	 * vnode on a list of files which need to be cleaned.
2364 	 * This will be done later by the recovery thread to
2365 	 * avoid a deadlock.  If this were a CLAIM_NULL open
2366 	 * and the server set recall, then the server is just
2367 	 * confused; the delegation will be returned eventually.
2368 	 */
2369 	if (rp->r_deleg_needs_recall)
2370 		nfs4_dlistadd(rp, ncg, NFS4_DR_PUSH|NFS4_DR_REOPEN);
2371 
2372 	if (already == FALSE) {
2373 		rp->r_deleg_cred = cr;
2374 		crhold(cr);
2375 	}
2376 	mutex_exit(&rp->r_statev4_lock);
2377 
2378 	if (already == FALSE) {
2379 
2380 		/*
2381 		 * Add this rnode to the list of rnodes with delegations
2382 		 * for this nfs4_server.  find_nfs4_server returns with
2383 		 * the mutex locked, so don't forget to mutex exit.
2384 		 */
2385 
2386 		if ((np = find_nfs4_server(VTOMI4(RTOV4(rp)))) == NULL) {
2387 
2388 			mutex_enter(&rp->r_statev4_lock);
2389 			rp->r_deleg_type = OPEN_DELEGATE_NONE;
2390 			mutex_exit(&rp->r_statev4_lock);
2391 			return;
2392 		}
2393 
2394 		list_insert_head(&np->s_deleg_list, rp);
2395 		/* added list node gets a reference */
2396 		np->s_refcnt++;
2397 		nfs4_inc_state_ref_count_nolock(np, VTOMI4(RTOV4(rp)));
2398 		mutex_exit(&np->s_lock);
2399 		nfs4_server_rele(np);
2400 	}
2401 
2402 	/*
2403 	 * Pass NFS4_DR_DID_OP so that nfs4delegreturn does not call
2404 	 * nfs4_start_op; the caller did so already (or is in recovery).
2405 	 */
2406 	if (nfs4_delegreturn_policy == IMMEDIATE || !valid_garp)
2407 		(void) nfs4delegreturn_impl(rp,
2408 			NFS4_DR_PUSH|NFS4_DR_DID_OP|NFS4_DR_REOPEN, ncg);
2409 }
2410 
2411 /*
2412  * nfs4delegabandon - Abandon the delegation on an rnode4.  This code
2413  * is called when the client receives EXPIRED, BAD_STATEID, OLD_STATEID
2414  * or BADSEQID and the recovery code is unable to recover.  Push any
2415  * dirty data back to the server and return the delegation (if any).
2416  */
2417 
2418 void
2419 nfs4delegabandon(rnode4_t *rp)
2420 {
2421 	vnode_t *vp;
2422 	struct cb_recall_pass *pp;
2423 	open_delegation_type4 dt;
2424 
2425 	mutex_enter(&rp->r_statev4_lock);
2426 	dt = rp->r_deleg_type;
2427 	mutex_exit(&rp->r_statev4_lock);
2428 
2429 	if (dt == OPEN_DELEGATE_NONE)
2430 		return;
2431 
2432 	vp = RTOV4(rp);
2433 	VN_HOLD(vp);
2434 
2435 	pp = kmem_alloc(sizeof (struct cb_recall_pass), KM_SLEEP);
2436 	pp->rp = rp;
2437 	/*
2438 	 * Recovery on the file has failed and we want to return
2439 	 * the delegation.  We don't want to reopen files, and
2440 	 * nfs4delegreturn_thread() will figure out what to do
2441 	 * about the data; the only thing left to do is attempt
2442 	 * to return the delegation.
2443 	 */
2444 	pp->flags = 0;
2445 	pp->truncate = FALSE;
2446 
2447 	/*
2448 	 * Fire up a thread to do the delegreturn; this is
2449 	 * necessary because we could be inside a GETPAGE or
2450 	 * PUTPAGE and we cannot do another one.
2451 	 */
2452 
2453 	(void) zthread_create(NULL, 0, nfs4delegreturn_thread, pp, 0,
2454 				minclsyspri);
2455 }
2456 
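/*
 * wait_for_recall1 - helper for wait_for_recall.  For a regular file,
 * take r_deleg_recall_lock in read mode (interruptibly), which blocks
 * while delegreturn holds the lock as writer; on success, record the
 * hold in rsp->rs_flags using the caller-supplied flag bit.
 */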
2457 static int
2458 wait_for_recall1(vnode_t *vp, nfs4_op_hint_t op, nfs4_recov_state_t *rsp,
2459 	int flg)
2460 {
2461 	rnode4_t *rp;
2462 	int error = 0;
2463 
2464 #ifdef lint
2465 	op = op;
2466 #endif
2467 
2468 	if (vp && vp->v_type == VREG) {
2469 		rp = VTOR4(vp);
2470 
2471 		/*
2472 		 * Take r_deleg_recall_lock in read mode to synchronize
2473 		 * with delegreturn.
2474 		 */
2475 		error = nfs_rw_enter_sig(&rp->r_deleg_recall_lock,
2476 			RW_READER, INTR4(vp));
2477 
2478 		if (error == 0)
2479 			rsp->rs_flags |= flg;
2480 
2481 	}
2482 	return (error);
2483 }
2484 
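/*
 * nfs4_end_op_recall - drop the r_deleg_recall_lock holds that
 * wait_for_recall() recorded in rsp->rs_flags, second vnode first.
 */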
2485 void
2486 nfs4_end_op_recall(vnode_t *vp1, vnode_t *vp2, nfs4_recov_state_t *rsp)
2487 {
2488 	NFS4_DEBUG(nfs4_recall_debug,
2489 		(CE_NOTE, "nfs4_end_op_recall: 0x%p, 0x%p\n",
2490 		(void *)vp1, (void *)vp2));
2491 
2492 	if (vp2 && rsp->rs_flags & NFS4_RS_RECALL_HELD2)
2493 		nfs_rw_exit(&VTOR4(vp2)->r_deleg_recall_lock);
2494 	if (vp1 && rsp->rs_flags & NFS4_RS_RECALL_HELD1)
2495 		nfs_rw_exit(&VTOR4(vp1)->r_deleg_recall_lock);
2496 }
2497 
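/*
 * wait_for_recall - synchronize with any delegation recall in progress
 * on up to two vnodes.  On error, any partially acquired lock is
 * dropped, so the caller need not call nfs4_end_op_recall.
 */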
2498 int
2499 wait_for_recall(vnode_t *vp1, vnode_t *vp2, nfs4_op_hint_t op,
2500 	nfs4_recov_state_t *rsp)
2501 {
2502 	int error;
2503 
2504 	NFS4_DEBUG(nfs4_recall_debug,
2505 		(CE_NOTE, "wait_for_recall:    0x%p, 0x%p\n",
2506 		(void *)vp1, (void *)vp2));
2507 
2508 	rsp->rs_flags &= ~(NFS4_RS_RECALL_HELD1|NFS4_RS_RECALL_HELD2);
2509 
2510 	if ((error = wait_for_recall1(vp1, op, rsp, NFS4_RS_RECALL_HELD1)) != 0)
2511 		return (error);
2512 
2513 	if ((error = wait_for_recall1(vp2, op, rsp, NFS4_RS_RECALL_HELD2))
2514 	    != 0) {
2515 		if (rsp->rs_flags & NFS4_RS_RECALL_HELD1) {
2516 			nfs_rw_exit(&VTOR4(vp1)->r_deleg_recall_lock);
2517 			rsp->rs_flags &= ~NFS4_RS_RECALL_HELD1;
2518 		}
2519 
2520 		return (error);
2521 	}
2522 
2523 	return (0);
2524 }
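
/*
 * Illustrative pairing of the two routines above (a sketch only; the
 * real callers are the start_fop/end_fop paths, which supply their own
 * vnodes, op hint and recovery state):
 *
 *	nfs4_recov_state_t rs;
 *
 *	rs.rs_flags = 0;
 *	rs.rs_num_retry_despite_err = 0;
 *	if (wait_for_recall(vp1, vp2, OH_OTHER, &rs) == 0) {
 *		... perform the over-the-wire operation ...
 *		nfs4_end_op_recall(vp1, vp2, &rs);
 *	}
 */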
2525 
2526 /*
2527  * nfs4_dlistadd - Add this rnode to a list of rnodes to be
2528  * DELEGRETURN'd at the end of recovery.
2529  */
2530 
2531 static void
2532 nfs4_dlistadd(rnode4_t *rp, struct nfs4_callback_globals *ncg, int flags)
2533 {
2534 	struct nfs4_dnode *dp;
2535 
2536 	ASSERT(mutex_owned(&rp->r_statev4_lock));
2537 	/*
2538 	 * Mark the delegation as having a return pending.
2539 	 * This will prevent the use of the delegation stateID
2540 	 * by read, write, setattr and open.
2541 	 */
2542 	rp->r_deleg_return_pending = TRUE;
2543 	dp = kmem_alloc(sizeof (*dp), KM_SLEEP);
2544 	VN_HOLD(RTOV4(rp));
2545 	dp->rnodep = rp;
2546 	dp->flags = flags;
2547 	mutex_enter(&ncg->nfs4_dlist_lock);
2548 	list_insert_head(&ncg->nfs4_dlist, dp);
2549 #ifdef	DEBUG
2550 	ncg->nfs4_dlistadd_c++;
2551 #endif
2552 	mutex_exit(&ncg->nfs4_dlist_lock);
2553 }
2554 
2555 /*
2556  * nfs4_dlistclean_impl - Do DELEGRETURN for each rnode on the list
2557  * of files awaiting cleaning.  If the override_flags are non-zero
2558  * then use them rather than the flags that were set when the rnode
2559  * was added to the dlist.
2560  */
2561 static void
2562 nfs4_dlistclean_impl(struct nfs4_callback_globals *ncg, int override_flags)
2563 {
2564 	rnode4_t *rp;
2565 	struct nfs4_dnode *dp;
2566 	int flags;
2567 
2568 	ASSERT(override_flags == 0 || override_flags == NFS4_DR_DISCARD);
2569 
2570 	mutex_enter(&ncg->nfs4_dlist_lock);
2571 	while ((dp = list_head(&ncg->nfs4_dlist)) != NULL) {
2572 #ifdef	DEBUG
2573 		ncg->nfs4_dlistclean_c++;
2574 #endif
2575 		list_remove(&ncg->nfs4_dlist, dp);
2576 		mutex_exit(&ncg->nfs4_dlist_lock);
2577 		rp = dp->rnodep;
2578 		flags = (override_flags != 0) ? override_flags : dp->flags;
2579 		kmem_free(dp, sizeof (*dp));
2580 		(void) nfs4delegreturn_impl(rp, flags, ncg);
2581 		VN_RELE(RTOV4(rp));
2582 		mutex_enter(&ncg->nfs4_dlist_lock);
2583 	}
2584 	mutex_exit(&ncg->nfs4_dlist_lock);
2585 }
2586 
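/*
 * nfs4_dlistclean - clean the current zone's delegation return list,
 * honoring the flags recorded when each rnode was added.
 */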
2587 void
2588 nfs4_dlistclean(void)
2589 {
2590 	struct nfs4_callback_globals *ncg;
2591 
2592 	ncg = zone_getspecific(nfs4_callback_zone_key, nfs_zone());
2593 	ASSERT(ncg != NULL);
2594 
2595 	nfs4_dlistclean_impl(ncg, 0);
2596 }
2597