xref: /titanic_50/usr/src/uts/common/fs/nfs/nfs4_callback.c (revision 554ff184129088135ad2643c1c9832174a17be88)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License, Version 1.0 only
6  * (the "License").  You may not use this file except in compliance
7  * with the License.
8  *
9  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10  * or http://www.opensolaris.org/os/licensing.
11  * See the License for the specific language governing permissions
12  * and limitations under the License.
13  *
14  * When distributing Covered Code, include this CDDL HEADER in each
15  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16  * If applicable, add the following below this CDDL HEADER, with the
17  * fields enclosed by brackets "[]" replaced with your own identifying
18  * information: Portions Copyright [yyyy] [name of copyright owner]
19  *
20  * CDDL HEADER END
21  */
22 /*
23  * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 /* Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */
28 /* All Rights Reserved */
29 
30 #pragma ident	"%Z%%M%	%I%	%E% SMI"
31 
32 #include <sys/param.h>
33 #include <sys/types.h>
34 #include <sys/systm.h>
35 #include <sys/cred.h>
36 #include <sys/vfs.h>
37 #include <sys/vnode.h>
38 #include <sys/pathname.h>
39 #include <sys/sysmacros.h>
40 #include <sys/kmem.h>
41 #include <sys/kstat.h>
42 #include <sys/mkdev.h>
43 #include <sys/mount.h>
44 #include <sys/statvfs.h>
45 #include <sys/errno.h>
46 #include <sys/debug.h>
47 #include <sys/cmn_err.h>
48 #include <sys/utsname.h>
49 #include <sys/bootconf.h>
50 #include <sys/modctl.h>
51 #include <sys/acl.h>
52 #include <sys/flock.h>
53 #include <sys/kstr.h>
54 #include <sys/stropts.h>
55 #include <sys/strsubr.h>
56 #include <sys/atomic.h>
57 #include <sys/disp.h>
58 #include <sys/policy.h>
59 #include <sys/list.h>
60 #include <sys/zone.h>
61 
62 #include <rpc/types.h>
63 #include <rpc/auth.h>
64 #include <rpc/rpcsec_gss.h>
65 #include <rpc/clnt.h>
66 #include <rpc/xdr.h>
67 
68 #include <nfs/nfs.h>
69 #include <nfs/nfs_clnt.h>
70 #include <nfs/mount.h>
71 #include <nfs/nfs_acl.h>
72 
73 #include <fs/fs_subr.h>
74 
75 #include <nfs/nfs4.h>
76 #include <nfs/rnode4.h>
77 #include <nfs/nfs4_clnt.h>
78 #include <nfs/nfssys.h>
79 
80 #ifdef	DEBUG
81 /*
82  * These are "special" state IDs and file handles that
83  * match any delegation state ID or file handle.  This
84  * is for testing purposes only.
85  */
86 
87 stateid4 nfs4_deleg_any = { 0x7FFFFFF0 };
88 char nfs4_deleg_fh[] = "\0377\0376\0375\0374";
89 nfs_fh4 nfs4_deleg_anyfh = { sizeof (nfs4_deleg_fh)-1, nfs4_deleg_fh };
90 int nfs4_deleg_accept_phony = OPEN_DELEGATE_NONE;
91 nfsace4 nfs4_deleg_ace_phony;
92 nfs_space_limit4 nfs4_deleg_space_phony = { NFS_LIMIT_SIZE, 8192 };
93 nfs_space_limit4 nfs4_deleg_space_phony2 = { NFS_LIMIT_BLOCKS, 0 };
94 nfs_modified_limit4 nfs4_deleg_space_phonyl = { 8, 512 };
95 changeid4 nfs4_deleg_change_phony = 0x7eeeeeee76666660LL;
96 /* ignore malformed file handle on recall; bug in the Linux server */
97 int nfs4_linux_bug = 1;
98 int nfs4_use_phony_limit;
99 int nfs4_use_phony_recall;
100 int nfs4_phony_recall_v;
101 nfsstat4 cb4_getattr_fail = NFS4_OK;
102 nfsstat4 cb4_recall_fail = NFS4_OK;
103 
104 int nfs4_callback_debug;
105 int nfs4_recall_debug;
106 int nfs4_drat_debug;
107 
108 #endif
109 
110 #define	CB_NOTE(x)	NFS4_DEBUG(nfs4_callback_debug, (CE_NOTE, x))
111 #define	CB_WARN(x)	NFS4_DEBUG(nfs4_callback_debug, (CE_WARN, x))
112 #define	CB_WARN1(x, y)	NFS4_DEBUG(nfs4_callback_debug, (CE_WARN, x, y))
113 
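/*
 * Policy controlling when delegations are voluntarily returned;
 * the default, INACTIVE, holds a delegation until the rnode goes
 * inactive.
 */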
114 enum nfs4_delegreturn_policy nfs4_delegreturn_policy = INACTIVE;
115 
116 static zone_key_t nfs4_callback_zone_key;
117 
118 /*
119  * NFS4_MAPSIZE is the number of bytes we are willing to consume
120  * for the block allocation map when the server grants an
121  * NFS_LIMIT_BLOCKS style delegation.
122  */
123 
124 #define	NFS4_MAPSIZE	8192
125 #define	NFS4_MAPWORDS	(NFS4_MAPSIZE/sizeof (uint_t))
126 #define	NbPW		(NBBY*sizeof (uint_t))
127 
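/*
 * Each nfs4_server_t is assigned its own transient callback program
 * number in the range [NFS4_CALLBACK, NFS4_CALLBACK + nfs4_num_prognums);
 * the program number of an incoming callback identifies which server
 * sent it (see the per-zone nfs4prog2server table).
 */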
128 static int nfs4_num_prognums = 1024;
129 static SVC_CALLOUT_TABLE nfs4_cb_sct;
130 
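/*
 * Entry on the per-zone delegation-return work list ("dlist");
 * see nfs4_dlistadd() and nfs4_dlistclean_impl().
 */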
131 struct nfs4_dnode {
132 	list_node_t	linkage;
133 	rnode4_t	*rnodep;
134 	int		flags;		/* Flags for nfs4delegreturn_impl() */
135 };
136 
137 static const struct nfs4_callback_stats nfs4_callback_stats_tmpl = {
138 	{ "delegations",	KSTAT_DATA_UINT64 },
139 	{ "cb_getattr",		KSTAT_DATA_UINT64 },
140 	{ "cb_recall",		KSTAT_DATA_UINT64 },
141 	{ "cb_null",		KSTAT_DATA_UINT64 },
142 	{ "cb_dispatch",	KSTAT_DATA_UINT64 },
143 	{ "delegaccept_r",	KSTAT_DATA_UINT64 },
144 	{ "delegaccept_rw",	KSTAT_DATA_UINT64 },
145 	{ "delegreturn",	KSTAT_DATA_UINT64 },
146 	{ "callbacks",		KSTAT_DATA_UINT64 },
147 	{ "claim_cur",		KSTAT_DATA_UINT64 },
148 	{ "claim_cur_ok",	KSTAT_DATA_UINT64 },
149 	{ "recall_trunc",	KSTAT_DATA_UINT64 },
150 	{ "recall_failed",	KSTAT_DATA_UINT64 },
151 	{ "return_limit_write",	KSTAT_DATA_UINT64 },
152 	{ "return_limit_addmap", KSTAT_DATA_UINT64 },
153 	{ "deleg_recover",	KSTAT_DATA_UINT64 },
154 	{ "cb_illegal",		KSTAT_DATA_UINT64 }
155 };
156 
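/*
 * A transport endpoint registered for the callback service via
 * nfs4_setport(); one entry per netid on the zone's nfs4_cb_ports list.
 */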
157 struct nfs4_cb_port {
158 	list_node_t		linkage; /* linkage into per-zone port list */
159 	char			netid[KNC_STRSIZE];
160 	char			uaddr[KNC_STRSIZE];
161 	char			protofmly[KNC_STRSIZE];
162 	char			proto[KNC_STRSIZE];
163 };
164 
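/*
 * Max bytes needed for the variable-length part of a cb_getattr
 * reply; computed in nfs4_callback_init().
 */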
165 static int cb_getattr_bytes;
166 
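/*
 * Argument block passed to nfs4delegreturn_thread() by
 * nfs4delegreturn_async().
 */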
167 struct cb_recall_pass {
168 	rnode4_t	*rp;
169 	int		flags;		/* Flags for nfs4delegreturn_impl() */
170 	bool_t		truncate;
171 };
172 
173 static nfs4_open_stream_t *get_next_deleg_stream(rnode4_t *, int);
174 static void nfs4delegreturn_thread(struct cb_recall_pass *);
175 static int deleg_reopen(vnode_t *, bool_t *, struct nfs4_callback_globals *,
176     int);
177 static void nfs4_dlistadd(rnode4_t *, struct nfs4_callback_globals *, int);
178 static void nfs4_dlistclean_impl(struct nfs4_callback_globals *, int);
179 static int nfs4delegreturn_impl(rnode4_t *, int,
180     struct nfs4_callback_globals *);
181 static void nfs4delegreturn_cleanup_impl(rnode4_t *, nfs4_server_t *,
182     struct nfs4_callback_globals *);
183 
184 static void
185 cb_getattr(nfs_cb_argop4 *argop, nfs_cb_resop4 *resop, struct svc_req *req,
186 	struct compound_state *cs, struct nfs4_callback_globals *ncg)
187 {
188 	CB_GETATTR4args *args = &argop->nfs_cb_argop4_u.opcbgetattr;
189 	CB_GETATTR4res *resp = &resop->nfs_cb_resop4_u.opcbgetattr;
190 	rnode4_t *rp;
191 	vnode_t *vp;
192 	bool_t found = FALSE;
193 	struct nfs4_server *sp;
194 	struct fattr4 *fap;
195 	char *fdata = NULL;
196 	long mapcnt;
197 	fattr4_change change;
198 	fattr4_size size;
199 	uint_t rflag;
200 
201 	ncg->nfs4_callback_stats.cb_getattr.value.ui64++;
202 
203 #ifdef DEBUG
204 	/*
205 	 * Error injection hook: set the cb_getattr_fail global to
206 	 * the NFS4 protocol error to be returned.
207 	 */
208 	if (cb4_getattr_fail != NFS4_OK) {
209 		*cs->statusp = resp->status = cb4_getattr_fail;
210 		return;
211 	}
212 #endif
213 
214 	resp->obj_attributes.attrmask = 0;
215 
216 	mutex_enter(&ncg->nfs4_cb_lock);
217 	sp = ncg->nfs4prog2server[req->rq_prog - NFS4_CALLBACK];
218 	mutex_exit(&ncg->nfs4_cb_lock);
219 
220 	if (nfs4_server_vlock(sp, 0) == FALSE) {
221 
222 		CB_WARN("cb_getattr: cannot find server\n");
223 
224 		*cs->statusp = resp->status = NFS4ERR_BADHANDLE;
225 		return;
226 	}
227 
228 	/*
229 	 * In cb_compound, callback_ident was validated against rq_prog,
230 	 * but we couldn't verify that it was set to the value we provided
231 	 * at setclientid time (because we didn't have the server struct
232 	 * yet).  Now we have the server struct, but callback_ident is no
233 	 * longer handy, so validate the server struct's program number
234 	 * against the request's RPC program number.  At this point we
235 	 * know the RPC program number is valid (else we wouldn't be
236 	 * here), but not that it is the one we supplied to this server
237 	 * at setclientid time.  If the program numbers aren't equal, log
238 	 * the problem and fail the request, because the cb server
239 	 * and/or cb client are confused.  This will probably never happen.
240 	 */
241 	if (sp->s_program != req->rq_prog) {
242 #ifdef DEBUG
243 		zcmn_err(getzoneid(), CE_WARN,
244 		    "cb_getattr: wrong server program number srv=%d req=%d\n",
245 		    sp->s_program, req->rq_prog);
246 #else
247 		zcmn_err(getzoneid(), CE_WARN,
248 		    "cb_getattr: wrong server program number\n");
249 #endif
250 		mutex_exit(&sp->s_lock);
251 		nfs4_server_rele(sp);
252 		*cs->statusp = resp->status = NFS4ERR_BADHANDLE;
253 		return;
254 	}
255 
256 	/*
257 	 * Search the delegation list for a matching file handle;
258 	 * mutex on sp prevents the list from changing.
259 	 */
260 
261 	rp = list_head(&sp->s_deleg_list);
262 	for (; rp != NULL; rp = list_next(&sp->s_deleg_list, rp)) {
263 		nfs4_fhandle_t fhandle;
264 
265 		sfh4_copyval(rp->r_fh, &fhandle);
266 
267 		if ((fhandle.fh_len == args->fh.nfs_fh4_len &&
268 		    bcmp(fhandle.fh_buf, args->fh.nfs_fh4_val,
269 		    fhandle.fh_len) == 0)) {
270 
271 			found = TRUE;
272 			break;
273 		}
274 #ifdef	DEBUG
275 		if (nfs4_deleg_anyfh.nfs_fh4_len == args->fh.nfs_fh4_len &&
276 		    bcmp(nfs4_deleg_anyfh.nfs_fh4_val, args->fh.nfs_fh4_val,
277 		    args->fh.nfs_fh4_len) == 0) {
278 
279 			found = TRUE;
280 			break;
281 		}
282 #endif
283 	}
284 
285 	/*
286 	 * VN_HOLD the vnode before releasing s_lock to guarantee
287 	 * we have a valid vnode reference.
288 	 */
289 	if (found == TRUE) {
290 		vp = RTOV4(rp);
291 		VN_HOLD(vp);
292 	}
293 
294 	mutex_exit(&sp->s_lock);
295 	nfs4_server_rele(sp);
296 
297 	if (found == FALSE) {
298 
299 		CB_WARN("cb_getattr: bad fhandle\n");
300 
301 		*cs->statusp = resp->status = NFS4ERR_BADHANDLE;
302 		return;
303 	}
304 
305 	/*
306 	 * Figure out which attributes the server wants.  We only
307 	 * offer FATTR4_CHANGE & FATTR4_SIZE; ignore the rest.
308 	 */
309 	fdata = kmem_alloc(cb_getattr_bytes, KM_SLEEP);
310 
311 	/*
312 	 * We don't actually need to create an XDR stream to encode these
313 	 * simple data structures.
314 	 * xdrmem_create(&xdr, fdata, cb_getattr_bytes, XDR_ENCODE);
315 	 */
316 	fap = &resp->obj_attributes;
317 
318 	fap->attrmask = 0;
319 	/* attrlist4_len starts at 0 and increases as attrs are processed */
320 	fap->attrlist4 = fdata;
321 	fap->attrlist4_len = 0;
322 
323 	/* don't supply attrs if request was zero */
324 	if (args->attr_request != 0) {
325 		if (args->attr_request & FATTR4_CHANGE_MASK) {
326 			/*
327 			 * If the file is mmapped, then increment the change
328 			 * attribute and return it.  This will guarantee that
329 			 * the server will perceive that the file has changed
330 			 * if there is any chance that the client application
331 			 * has changed it.  Otherwise, just return the change
332 			 * attribute as it has been updated by nfs4write_deleg.
333 			 */
334 
335 			mutex_enter(&rp->r_statelock);
336 			mapcnt = rp->r_mapcnt;
337 			rflag = rp->r_flags;
338 			mutex_exit(&rp->r_statelock);
339 
340 			mutex_enter(&rp->r_statev4_lock);
341 			/*
342 			 * If object mapped, then always return new change.
343 			 * Otherwise, return change if object has dirty
344 			 * pages.  If object doesn't have any dirty pages,
345 			 * then all changes have been pushed to server, so
346 			 * reset change to grant change.
347 			 */
348 			if (mapcnt)
349 				rp->r_deleg_change++;
350 			else if (! (rflag & R4DIRTY))
351 				rp->r_deleg_change = rp->r_deleg_change_grant;
352 			change = rp->r_deleg_change;
353 			mutex_exit(&rp->r_statev4_lock);
354 
355 			/*
356 	 * Use inline XDR code directly; we know we are writing to a
357 	 * memory buffer and it has enough space, so the encode cannot
358 	 * fail.
359 			 */
360 			IXDR_PUT_U_HYPER(fdata, change);
361 			fap->attrlist4_len += 2 * BYTES_PER_XDR_UNIT;
362 		}
363 
364 		if (args->attr_request & FATTR4_SIZE_MASK) {
365 			/*
366 			 * Use an atomic add of 0 to fetch a consistent view
367 			 * of r_size; this avoids having to take rw_lock
368 			 * which could cause a deadlock.
369 			 */
370 			size = atomic_add_64_nv((uint64_t *)&rp->r_size, 0);
371 
372 			/*
373 	 * Use inline XDR code directly; we know we are writing to a
374 	 * memory buffer and it has enough space, so the encode cannot
375 	 * fail.
376 			 */
377 			IXDR_PUT_U_HYPER(fdata, size);
378 			fap->attrlist4_len += 2 * BYTES_PER_XDR_UNIT;
379 		}
380 	}
381 
382 	VN_RELE(vp);
383 
384 	*cs->statusp = resp->status = NFS4_OK;
385 }
386 
387 static void
388 cb_getattr_free(nfs_cb_resop4 *resop)
389 {
390 	if (resop->nfs_cb_resop4_u.opcbgetattr.obj_attributes.attrlist4)
391 		kmem_free(resop->nfs_cb_resop4_u.opcbgetattr.
392 			obj_attributes.attrlist4,
393 			cb_getattr_bytes);
394 }
395 
396 static void
397 cb_recall(nfs_cb_argop4 *argop, nfs_cb_resop4 *resop, struct svc_req *req,
398 	struct compound_state *cs, struct nfs4_callback_globals *ncg)
399 {
400 	CB_RECALL4args * args = &argop->nfs_cb_argop4_u.opcbrecall;
401 	CB_RECALL4res *resp = &resop->nfs_cb_resop4_u.opcbrecall;
402 	rnode4_t *rp;
403 	vnode_t *vp;
404 	struct nfs4_server *sp;
405 	bool_t found = FALSE;
406 
407 	ncg->nfs4_callback_stats.cb_recall.value.ui64++;
408 
409 	ASSERT(req->rq_prog >= NFS4_CALLBACK);
410 	ASSERT(req->rq_prog < NFS4_CALLBACK+nfs4_num_prognums);
411 
412 #ifdef DEBUG
413 	/*
414 	 * Error injection hook: set the cb_recall_fail global to
415 	 * the NFS4 protocol error to be returned.
416 	 */
417 	if (cb4_recall_fail != NFS4_OK) {
418 		*cs->statusp = resp->status = cb4_recall_fail;
419 		return;
420 	}
421 #endif
422 
423 	mutex_enter(&ncg->nfs4_cb_lock);
424 	sp = ncg->nfs4prog2server[req->rq_prog - NFS4_CALLBACK];
425 	mutex_exit(&ncg->nfs4_cb_lock);
426 
427 	if (nfs4_server_vlock(sp, 0) == FALSE) {
428 
429 		CB_WARN("cb_recall: cannot find server\n");
430 
431 		*cs->statusp = resp->status = NFS4ERR_BADHANDLE;
432 		return;
433 	}
434 
435 	/*
436 	 * Search the delegation list for a matching file handle
437 	 * AND stateid; mutex on sp prevents the list from changing.
438 	 */
439 
440 	rp = list_head(&sp->s_deleg_list);
441 	for (; rp != NULL; rp = list_next(&sp->s_deleg_list, rp)) {
442 		mutex_enter(&rp->r_statev4_lock);
443 
444 		/* check both state id and file handle! */
445 
446 		if ((bcmp(&rp->r_deleg_stateid, &args->stateid,
447 		    sizeof (stateid4)) == 0)) {
448 			nfs4_fhandle_t fhandle;
449 
450 			sfh4_copyval(rp->r_fh, &fhandle);
451 			if ((fhandle.fh_len == args->fh.nfs_fh4_len &&
452 			    bcmp(fhandle.fh_buf, args->fh.nfs_fh4_val,
453 			    fhandle.fh_len) == 0)) {
454 
455 				found = TRUE;
456 				break;
457 			} else {
458 #ifdef	DEBUG
459 				CB_WARN("cb_recall: stateid OK, bad fh");
460 
461 				if (nfs4_linux_bug) {
462 					found = TRUE;
463 					break;
464 				}
465 #endif
466 			}
467 		}
468 #ifdef	DEBUG
469 		if (bcmp(&args->stateid, &nfs4_deleg_any,
470 		    sizeof (stateid4)) == 0) {
471 
472 			found = TRUE;
473 			break;
474 		}
475 #endif
476 		mutex_exit(&rp->r_statev4_lock);
477 	}
478 
479 	/*
480 	 * VN_HOLD the vnode before releasing s_lock to guarantee
481 	 * we have a valid vnode reference.  The async thread will
482 	 * release the hold when it's done.
483 	 */
484 	if (found == TRUE) {
485 		mutex_exit(&rp->r_statev4_lock);
486 		vp = RTOV4(rp);
487 		VN_HOLD(vp);
488 	}
489 	mutex_exit(&sp->s_lock);
490 	nfs4_server_rele(sp);
491 
492 	if (found == FALSE) {
493 
494 		CB_WARN("cb_recall: bad stateid\n");
495 
496 		*cs->statusp = resp->status = NFS4ERR_BAD_STATEID;
497 		return;
498 	}
499 
500 	/* Fire up a thread to do the delegreturn */
501 	nfs4delegreturn_async(rp, NFS4_DR_RECALL|NFS4_DR_REOPEN,
502 					args->truncate);
503 
504 	*cs->statusp = resp->status = 0;
505 }
506 
507 /* ARGSUSED */
508 static void
509 cb_recall_free(nfs_cb_resop4 *resop)
510 {
511 	/* nothing to do here, cb_recall doesn't kmem_alloc */
512 }
513 
514 /*
515  * This function handles the CB_NULL proc call from an NFSv4 Server.
516  *
517  * We take note that the server has sent a CB_NULL for later processing
518  * in the recovery logic. It is noted so we may pause slightly after the
519  * setclientid and before reopening files. The pause is to allow the
520  * NFSv4 Server time to receive the CB_NULL reply and adjust any of
521  * its internal structures such that it has the opportunity to grant
522  * delegations to reopened files.
523  *
524  */
525 
526 /* ARGSUSED */
527 static void
528 cb_null(CB_COMPOUND4args *args, CB_COMPOUND4res *resp, struct svc_req *req,
529     struct nfs4_callback_globals *ncg)
530 {
531 	struct nfs4_server *sp;
532 
533 	ncg->nfs4_callback_stats.cb_null.value.ui64++;
534 
535 	ASSERT(req->rq_prog >= NFS4_CALLBACK);
536 	ASSERT(req->rq_prog < NFS4_CALLBACK+nfs4_num_prognums);
537 
538 	mutex_enter(&ncg->nfs4_cb_lock);
539 	sp = ncg->nfs4prog2server[req->rq_prog - NFS4_CALLBACK];
540 	mutex_exit(&ncg->nfs4_cb_lock);
541 
542 	if (nfs4_server_vlock(sp, 0) != FALSE) {
543 		sp->s_flags |= N4S_CB_PINGED;
544 		cv_broadcast(&sp->wait_cb_null);
545 		mutex_exit(&sp->s_lock);
546 		nfs4_server_rele(sp);
547 	}
548 }
549 
550 /*
551  * cb_illegal	args: void
552  *		res : status (NFS4ERR_OP_CB_ILLEGAL)
553  */
554 /* ARGSUSED */
555 static void
556 cb_illegal(nfs_cb_argop4 *argop, nfs_cb_resop4 *resop, struct svc_req *req,
557 	struct compound_state *cs, struct nfs4_callback_globals *ncg)
558 {
559 	CB_ILLEGAL4res *resp = &resop->nfs_cb_resop4_u.opcbillegal;
560 
561 	ncg->nfs4_callback_stats.cb_illegal.value.ui64++;
562 	resop->resop = OP_CB_ILLEGAL;
563 	*cs->statusp = resp->status = NFS4ERR_OP_ILLEGAL;
564 }
565 
566 static void
567 cb_compound(CB_COMPOUND4args *args, CB_COMPOUND4res *resp, struct svc_req *req,
568 	struct nfs4_callback_globals *ncg)
569 {
570 	uint_t i;
571 	struct compound_state cs;
572 	nfs_cb_argop4 *argop;
573 	nfs_cb_resop4 *resop, *new_res;
574 	uint_t op;
575 
576 	bzero(&cs, sizeof (cs));
577 	cs.statusp = &resp->status;
578 	cs.cont = TRUE;
579 
580 	/*
581 	 * Form a reply tag by copying over the request tag.
582 	 */
583 	resp->tag.utf8string_len = args->tag.utf8string_len;
584 	resp->tag.utf8string_val = kmem_alloc(resp->tag.utf8string_len,
585 					KM_SLEEP);
586 	bcopy(args->tag.utf8string_val, resp->tag.utf8string_val,
587 		args->tag.utf8string_len);
588 
589 	/*
590 	 * XXX for now, minorversion should be zero
591 	 */
592 	if (args->minorversion != CB4_MINORVERSION) {
593 		resp->array_len = 0;
594 		resp->array = NULL;
595 		resp->status = NFS4ERR_MINOR_VERS_MISMATCH;
596 		return;
597 	}
598 
599 #ifdef DEBUG
600 	/*
601 	 * Verify callback_ident.  It doesn't really matter if it's wrong
602 	 * because we don't really use callback_ident -- we use prog number
603 	 * of the RPC request instead.  In this case, just print a DEBUG
604 	 * console message to reveal brokenness of cbclient (at bakeoff/Connectathon).
605 	 */
606 	if (args->callback_ident != req->rq_prog)
607 		zcmn_err(getzoneid(), CE_WARN,
608 		    "cb_compound: cb_client using wrong "
609 		    "callback_ident(%d), should be %d",
610 		    args->callback_ident, req->rq_prog);
611 #endif
612 
613 	resp->array_len = args->array_len;
614 	resp->array = kmem_zalloc(args->array_len * sizeof (nfs_cb_resop4),
615 							KM_SLEEP);
616 
617 	for (i = 0; i < args->array_len && cs.cont; i++) {
618 
619 		argop = &args->array[i];
620 		resop = &resp->array[i];
621 		resop->resop = argop->argop;
622 		op = (uint_t)resop->resop;
623 
624 		switch (op) {
625 
626 		case OP_CB_GETATTR:
627 
628 			cb_getattr(argop, resop, req, &cs, ncg);
629 			break;
630 
631 		case OP_CB_RECALL:
632 
633 			cb_recall(argop, resop, req, &cs, ncg);
634 			break;
635 
636 		case OP_CB_ILLEGAL:
637 
638 			/* fall through */
639 
640 		default:
641 			/*
642 			 * Handle OP_CB_ILLEGAL and any undefined opcode.
643 			 * Currently, the XDR code will return BADXDR
644 			 * if cb op doesn't decode to legal value, so
645 			 * it really only handles OP_CB_ILLEGAL.
646 			 */
647 			op = OP_CB_ILLEGAL;
648 			cb_illegal(argop, resop, req, &cs, ncg);
649 		}
650 
651 		if (*cs.statusp != NFS4_OK)
652 			cs.cont = FALSE;
653 
654 		/*
655 		 * If not at last op, and if we are to stop, then
656 		 * compact the results array.
657 		 */
658 		if ((i + 1) < args->array_len && !cs.cont) {
659 
660 			new_res = kmem_alloc(
661 				(i+1) * sizeof (nfs_cb_resop4), KM_SLEEP);
662 			bcopy(resp->array,
663 				new_res, (i+1) * sizeof (nfs_cb_resop4));
664 			kmem_free(resp->array,
665 				args->array_len * sizeof (nfs_cb_resop4));
666 
667 			resp->array_len =  i + 1;
668 			resp->array = new_res;
669 		}
670 	}
671 
672 }
673 
674 static void
675 cb_compound_free(CB_COMPOUND4res *resp)
676 {
677 	uint_t i, op;
678 	nfs_cb_resop4 *resop;
679 
680 	if (resp->tag.utf8string_val) {
681 		UTF8STRING_FREE(resp->tag)
682 	}
683 
684 	for (i = 0; i < resp->array_len; i++) {
685 
686 		resop = &resp->array[i];
687 		op = (uint_t)resop->resop;
688 
689 		switch (op) {
690 
691 		case OP_CB_GETATTR:
692 
693 			cb_getattr_free(resop);
694 			break;
695 
696 		case OP_CB_RECALL:
697 
698 			cb_recall_free(resop);
699 			break;
700 
701 		default:
702 			break;
703 		}
704 	}
705 
706 	if (resp->array != NULL) {
707 		kmem_free(resp->array,
708 			resp->array_len * sizeof (nfs_cb_resop4));
709 	}
710 }
711 
712 static void
713 cb_dispatch(struct svc_req *req, SVCXPRT *xprt)
714 {
715 	CB_COMPOUND4args args;
716 	CB_COMPOUND4res res;
717 	struct nfs4_callback_globals *ncg;
718 
719 	bool_t (*xdr_args)(), (*xdr_res)();
720 	void (*proc)(CB_COMPOUND4args *, CB_COMPOUND4res *, struct svc_req *,
721 	    struct nfs4_callback_globals *);
722 	void (*freeproc)(CB_COMPOUND4res *);
723 
724 	ncg = zone_getspecific(nfs4_callback_zone_key, curproc->p_zone);
725 	ASSERT(ncg != NULL);
726 
727 	ncg->nfs4_callback_stats.cb_dispatch.value.ui64++;
728 
729 	switch (req->rq_proc) {
730 	case CB_NULL:
731 		xdr_args = xdr_void;
732 		xdr_res = xdr_void;
733 		proc = cb_null;
734 		freeproc = NULL;
735 		break;
736 
737 	case CB_COMPOUND:
738 		xdr_args = xdr_CB_COMPOUND4args;
739 		xdr_res = xdr_CB_COMPOUND4res;
740 		proc = cb_compound;
741 		freeproc = cb_compound_free;
742 		break;
743 
744 	default:
745 		CB_WARN("cb_dispatch: no proc\n");
746 		svcerr_noproc(xprt);
747 		return;
748 	}
749 
750 	args.tag.utf8string_val = NULL;
751 	args.array = NULL;
752 
753 	if (!SVC_GETARGS(xprt, xdr_args, (caddr_t)&args)) {
754 
755 		CB_WARN("cb_dispatch: cannot getargs\n");
756 		svcerr_decode(xprt);
757 		return;
758 	}
759 
760 	(*proc)(&args, &res, req, ncg);
761 
762 	if (svc_sendreply(xprt, xdr_res, (caddr_t)&res) == FALSE) {
763 
764 		CB_WARN("cb_dispatch: bad sendreply\n");
765 
766 		/*
767 		 * svcerr_systemerr(xprt);
768 		 */
769 	}
770 
771 	if (freeproc)
772 		(*freeproc)(&res);
773 
774 	if (!SVC_FREEARGS(xprt, xdr_args, (caddr_t)&args)) {
775 
776 		CB_WARN("cb_dispatch: bad freeargs\n");
777 	}
778 }
779 
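/*
 * nfs4_getnextprogram - round-robin search, starting at the zone's
 * program hint, for an unused callback program number.  Returns the
 * RPC program number, or 0 if all nfs4_num_prognums slots are in use.
 * Caller must hold nfs4_cb_lock.
 */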
780 static rpcprog_t
781 nfs4_getnextprogram(struct nfs4_callback_globals *ncg)
782 {
783 	int i, j;
784 
785 	j = ncg->nfs4_program_hint;
786 	for (i = 0; i < nfs4_num_prognums; i++, j++) {
787 
788 		if (j >= nfs4_num_prognums)
789 			j = 0;
790 
791 		if (ncg->nfs4prog2server[j] == NULL) {
792 			ncg->nfs4_program_hint = j+1;
793 			return (j+NFS4_CALLBACK);
794 		}
795 	}
796 
797 	return (0);
798 }
799 
800 void
801 nfs4callback_destroy(nfs4_server_t *np)
802 {
803 	struct nfs4_callback_globals *ncg;
804 	int i;
805 
806 	if (np->s_program == 0)
807 		return;
808 
809 	ncg = np->zone_globals;
810 	i = np->s_program - NFS4_CALLBACK;
811 
812 	mutex_enter(&ncg->nfs4_cb_lock);
813 
814 	ASSERT(ncg->nfs4prog2server[i] == np);
815 
816 	ncg->nfs4prog2server[i] = NULL;
817 
818 	if (i < ncg->nfs4_program_hint)
819 		ncg->nfs4_program_hint = i;
820 
821 	mutex_exit(&ncg->nfs4_cb_lock);
822 }
823 
824 /*
825  * nfs4_setport - This function saves a netid and universal address for
826  * the callback program.  These values will be used during setclientid.
827  */
828 static void
829 nfs4_setport(char *netid, char *uaddr, char *protofmly, char *proto,
830 	struct nfs4_callback_globals *ncg)
831 {
832 	struct nfs4_cb_port *p;
833 	bool_t found = FALSE;
834 
835 	ASSERT(MUTEX_HELD(&ncg->nfs4_cb_lock));
836 
837 	p = list_head(&ncg->nfs4_cb_ports);
838 	for (; p != NULL; p = list_next(&ncg->nfs4_cb_ports, p)) {
839 		if (strcmp(p->netid, netid) == 0) {
840 			found = TRUE;
841 			break;
842 		}
843 	}
844 	if (found == TRUE)
845 		(void) strcpy(p->uaddr, uaddr);
846 	else {
847 		p = kmem_alloc(sizeof (*p), KM_SLEEP);
848 
849 		(void) strcpy(p->uaddr, uaddr);
850 		(void) strcpy(p->netid, netid);
851 		(void) strcpy(p->protofmly, protofmly);
852 		(void) strcpy(p->proto, proto);
853 		list_insert_head(&ncg->nfs4_cb_ports, p);
854 	}
855 }
856 
857 /*
858  * nfs4_cb_args - This function is used to construct the callback
859  * portion of the arguments needed for setclientid.
860  */
861 
862 void
863 nfs4_cb_args(nfs4_server_t *np, struct knetconfig *knc, SETCLIENTID4args *args)
864 {
865 	struct nfs4_cb_port *p;
866 	bool_t found = FALSE;
867 	rpcprog_t pgm;
868 	struct nfs4_callback_globals *ncg = np->zone_globals;
869 
870 	/*
871 	 * This server structure may already have a program number
872 	 * assigned to it.  This happens when the client has to
873 	 * re-issue SETCLIENTID.  Release the old program number first.
874 	 */
875 	if (np->s_program >= NFS4_CALLBACK &&
876 	    np->s_program < NFS4_CALLBACK + nfs4_num_prognums)
877 		nfs4callback_destroy(np);
878 
879 	mutex_enter(&ncg->nfs4_cb_lock);
880 
881 	p = list_head(&ncg->nfs4_cb_ports);
882 	for (; p != NULL; p = list_next(&ncg->nfs4_cb_ports, p)) {
883 		if (strcmp(p->protofmly, knc->knc_protofmly) == 0 &&
884 		    strcmp(p->proto, knc->knc_proto) == 0) {
885 			found = TRUE;
886 			break;
887 		}
888 	}
889 
890 	if (found == FALSE) {
891 
892 		NFS4_DEBUG(nfs4_callback_debug,
893 		(CE_WARN, "nfs4_cb_args: could not find netid for %s/%s\n",
894 			knc->knc_protofmly, knc->knc_proto));
895 
896 		args->callback.cb_program = 0;
897 		args->callback.cb_location.r_netid = NULL;
898 		args->callback.cb_location.r_addr = NULL;
899 		args->callback_ident = 0;
900 		mutex_exit(&ncg->nfs4_cb_lock);
901 		return;
902 	}
903 
904 	if ((pgm = nfs4_getnextprogram(ncg)) == 0) {
905 		CB_WARN("nfs4_cb_args: out of program numbers\n");
906 
907 		args->callback.cb_program = 0;
908 		args->callback.cb_location.r_netid = NULL;
909 		args->callback.cb_location.r_addr = NULL;
910 		args->callback_ident = 0;
911 		mutex_exit(&ncg->nfs4_cb_lock);
912 		return;
913 	}
914 
915 	ncg->nfs4prog2server[pgm-NFS4_CALLBACK] = np;
916 	args->callback.cb_program = pgm;
917 	args->callback.cb_location.r_netid = p->netid;
918 	args->callback.cb_location.r_addr = p->uaddr;
919 	args->callback_ident = pgm;
920 
921 	np->s_program = pgm;
922 
923 	mutex_exit(&ncg->nfs4_cb_lock);
924 }
925 
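/*
 * nfs4_dquery - handle the NFS4_DQUERY command: report the type of
 * delegation (if any) held on the file behind uap->fd by storing
 * r_deleg_type through the user pointer passed in the netid field.
 */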
926 static int
927 nfs4_dquery(struct nfs4_svc_args *arg, model_t model)
928 {
929 	file_t *fp;
930 	vnode_t *vp;
931 	rnode4_t *rp;
932 	int error;
933 	STRUCT_HANDLE(nfs4_svc_args, uap);
934 
935 	STRUCT_SET_HANDLE(uap, model, arg);
936 
937 	if ((fp = getf(STRUCT_FGET(uap, fd))) == NULL)
938 		return (EBADF);
939 
940 	vp = fp->f_vnode;
941 
942 	if (vp == NULL || vp->v_type != VREG ||
943 	    !vn_matchops(vp, nfs4_vnodeops)) {
944 		releasef(STRUCT_FGET(uap, fd));
945 		return (EBADF);
946 	}
947 
948 	rp = VTOR4(vp);
949 
950 	/*
951 	 * I can't convince myself that we need locking here.  The
952 	 * rnode cannot disappear and the value returned is instantly
953 	 * stale anyway, so why bother?
954 	 */
955 
956 	error = suword32(STRUCT_FGETP(uap, netid), rp->r_deleg_type);
957 	releasef(STRUCT_FGET(uap, fd));
958 	return (error);
959 }
960 
961 
962 /*
963  * NFS4 client system call.  This service does the
964  * necessary initialization for the callback program.
965  * This is fashioned after the server side interaction
966  * between nfsd and the kernel.  On the client, the
967  * mount command forks and the child process does the
968  * necessary interaction with the kernel.
969  *
970  * uap->fd is the fd of an open transport provider
971  */
972 int
973 nfs4_svc(struct nfs4_svc_args *arg, model_t model)
974 {
975 	file_t *fp;
976 	int error;
977 	int readsize;
978 	char buf[KNC_STRSIZE], uaddr[KNC_STRSIZE];
979 	char protofmly[KNC_STRSIZE], proto[KNC_STRSIZE];
980 	size_t len;
981 	STRUCT_HANDLE(nfs4_svc_args, uap);
982 	struct netbuf addrmask;
983 	int cmd;
984 	SVCMASTERXPRT *cb_xprt;
985 	struct nfs4_callback_globals *ncg;
986 
987 #ifdef lint
988 	model = model;		/* STRUCT macros don't always refer to it */
989 #endif
990 
991 	STRUCT_SET_HANDLE(uap, model, arg);
992 
993 	if (STRUCT_FGET(uap, cmd) == NFS4_DQUERY)
994 		return (nfs4_dquery(arg, model));
995 
996 	if (secpolicy_nfs(CRED()) != 0)
997 		return (EPERM);
998 
999 	if ((fp = getf(STRUCT_FGET(uap, fd))) == NULL)
1000 		return (EBADF);
1001 
1002 	/*
1003 	 * Set read buffer size to rsize
1004 	 * and add room for RPC headers.
1005 	 */
1006 	readsize = nfs3tsize() + (RPC_MAXDATASIZE - NFS_MAXDATA);
1007 	if (readsize < RPC_MAXDATASIZE)
1008 		readsize = RPC_MAXDATASIZE;
1009 
1010 	error = copyinstr((const char *)STRUCT_FGETP(uap, netid), buf,
1011 	    KNC_STRSIZE, &len);
1012 	if (error) {
1013 		releasef(STRUCT_FGET(uap, fd));
1014 		return (error);
1015 	}
1016 
1017 	cmd = STRUCT_FGET(uap, cmd);
1018 
1019 	if (cmd & NFS4_KRPC_START) {
1020 		addrmask.len = STRUCT_FGET(uap, addrmask.len);
1021 		addrmask.maxlen = STRUCT_FGET(uap, addrmask.maxlen);
1022 		addrmask.buf = kmem_alloc(addrmask.maxlen, KM_SLEEP);
1023 		error = copyin(STRUCT_FGETP(uap, addrmask.buf), addrmask.buf,
1024 		    addrmask.len);
1025 		if (error) {
1026 			releasef(STRUCT_FGET(uap, fd));
1027 			kmem_free(addrmask.buf, addrmask.maxlen);
1028 			return (error);
1029 		}
1030 	} else {
1031 		addrmask.buf = NULL;
1032 	}
1033 
1034 	error = copyinstr((const char *)STRUCT_FGETP(uap, addr), uaddr,
1035 	    sizeof (uaddr), &len);
1036 	if (error) {
1037 		releasef(STRUCT_FGET(uap, fd));
1038 		if (addrmask.buf)
1039 			kmem_free(addrmask.buf, addrmask.maxlen);
1040 		return (error);
1041 	}
1042 
1043 	error = copyinstr((const char *)STRUCT_FGETP(uap, protofmly), protofmly,
1044 	    sizeof (protofmly), &len);
1045 	if (error) {
1046 		releasef(STRUCT_FGET(uap, fd));
1047 		if (addrmask.buf)
1048 			kmem_free(addrmask.buf, addrmask.maxlen);
1049 		return (error);
1050 	}
1051 
1052 	error = copyinstr((const char *)STRUCT_FGETP(uap, proto), proto,
1053 	    sizeof (proto), &len);
1054 	if (error) {
1055 		releasef(STRUCT_FGET(uap, fd));
1056 		if (addrmask.buf)
1057 			kmem_free(addrmask.buf, addrmask.maxlen);
1058 		return (error);
1059 	}
1060 
1061 	ncg = zone_getspecific(nfs4_callback_zone_key, curproc->p_zone);
1062 	ASSERT(ncg != NULL);
1063 
1064 	mutex_enter(&ncg->nfs4_cb_lock);
1065 	if (cmd & NFS4_SETPORT)
1066 		nfs4_setport(buf, uaddr, protofmly, proto, ncg);
1067 
1068 	if (cmd & NFS4_KRPC_START) {
1069 		error = svc_tli_kcreate(fp, readsize, buf, &addrmask, &cb_xprt,
1070 		    &nfs4_cb_sct, NULL, NFS_CB_SVCPOOL_ID, FALSE);
1071 		if (error) {
1072 			CB_WARN1("nfs4_svc: svc_tli_kcreate failed %d\n",
1073 				error);
1074 			kmem_free(addrmask.buf, addrmask.maxlen);
1075 		}
1076 	}
1077 
1078 	mutex_exit(&ncg->nfs4_cb_lock);
1079 	releasef(STRUCT_FGET(uap, fd));
1080 	return (error);
1081 }
1082 
1083 struct nfs4_callback_globals *
1084 nfs4_get_callback_globals(void)
1085 {
1086 	return (zone_getspecific(nfs4_callback_zone_key, curproc->p_zone));
1087 }
1088 
1089 static void *
1090 nfs4_callback_init_zone(zoneid_t zoneid)
1091 {
1092 	kstat_t *nfs4_callback_kstat;
1093 	struct nfs4_callback_globals *ncg;
1094 
1095 	ncg = kmem_zalloc(sizeof (*ncg), KM_SLEEP);
1096 
1097 	ncg->nfs4prog2server = kmem_zalloc(nfs4_num_prognums *
1098 		sizeof (struct nfs4_server *), KM_SLEEP);
1099 
1100 	/* initialize the dlist */
1101 	mutex_init(&ncg->nfs4_dlist_lock, NULL, MUTEX_DEFAULT, NULL);
1102 	list_create(&ncg->nfs4_dlist, sizeof (struct nfs4_dnode),
1103 	    offsetof(struct nfs4_dnode, linkage));
1104 
1105 	/* initialize cb_port list */
1106 	mutex_init(&ncg->nfs4_cb_lock, NULL, MUTEX_DEFAULT, NULL);
1107 	list_create(&ncg->nfs4_cb_ports, sizeof (struct nfs4_cb_port),
1108 	    offsetof(struct nfs4_cb_port, linkage));
1109 
1110 	/* get our own copy of the kstats */
1111 	bcopy(&nfs4_callback_stats_tmpl, &ncg->nfs4_callback_stats,
1112 	    sizeof (nfs4_callback_stats_tmpl));
1113 	/* register "nfs:0:nfs4_callback_stats" for this zone */
1114 	if ((nfs4_callback_kstat =
1115 		kstat_create_zone("nfs", 0, "nfs4_callback_stats", "misc",
1116 		    KSTAT_TYPE_NAMED,
1117 		    sizeof (ncg->nfs4_callback_stats) / sizeof (kstat_named_t),
1118 		    KSTAT_FLAG_VIRTUAL | KSTAT_FLAG_WRITABLE,
1119 		    zoneid)) != NULL) {
1120 		nfs4_callback_kstat->ks_data = &ncg->nfs4_callback_stats;
1121 		kstat_install(nfs4_callback_kstat);
1122 	}
1123 	return (ncg);
1124 }
1125 
1126 static void
1127 nfs4_discard_delegations(struct nfs4_callback_globals *ncg)
1128 {
1129 	nfs4_server_t *sp;
1130 	int i, num_removed;
1131 
1132 	/*
1133 	 * It's OK here to just run through the registered "programs", as
1134 	 * servers without programs won't have any delegations to handle.
1135 	 */
1136 	for (i = 0; i < nfs4_num_prognums; i++) {
1137 		rnode4_t *rp;
1138 
1139 		mutex_enter(&ncg->nfs4_cb_lock);
1140 		sp = ncg->nfs4prog2server[i];
1141 		mutex_exit(&ncg->nfs4_cb_lock);
1142 
1143 		if (nfs4_server_vlock(sp, 1) == FALSE)
1144 			continue;
1145 		num_removed = 0;
1146 		while ((rp = list_head(&sp->s_deleg_list)) != NULL) {
1147 			mutex_enter(&rp->r_statev4_lock);
1148 			if (rp->r_deleg_type == OPEN_DELEGATE_NONE) {
1149 				/*
1150 				 * We need to take matters into our own hands,
1151 				 * as nfs4delegreturn_cleanup_impl() won't
1152 				 * remove this from the list.
1153 				 */
1154 				list_remove(&sp->s_deleg_list, rp);
1155 				mutex_exit(&rp->r_statev4_lock);
1156 				nfs4_dec_state_ref_count_nolock(sp,
1157 				    VTOMI4(RTOV4(rp)));
1158 				num_removed++;
1159 				continue;
1160 			}
1161 			mutex_exit(&rp->r_statev4_lock);
1162 			VN_HOLD(RTOV4(rp));
1163 			mutex_exit(&sp->s_lock);
1164 			/*
1165 			 * The following will remove the node from the list.
1166 			 */
1167 			nfs4delegreturn_cleanup_impl(rp, sp, ncg);
1168 			VN_RELE(RTOV4(rp));
1169 			mutex_enter(&sp->s_lock);
1170 		}
1171 		mutex_exit(&sp->s_lock);
1172 		/* each removed list node reles a reference */
1173 		while (num_removed-- > 0)
1174 			nfs4_server_rele(sp);
1175 		/* remove our reference for nfs4_server_vlock */
1176 		nfs4_server_rele(sp);
1177 	}
1178 }
1179 
1180 /* ARGSUSED */
1181 static void
1182 nfs4_callback_shutdown_zone(zoneid_t zoneid, void *data)
1183 {
1184 	struct nfs4_callback_globals *ncg = data;
1185 
1186 	/*
1187 	 * Clean pending delegation return list.
1188 	 */
1189 	nfs4_dlistclean_impl(ncg, NFS4_DR_DISCARD);
1190 
1191 	/*
1192 	 * Discard all delegations.
1193 	 */
1194 	nfs4_discard_delegations(ncg);
1195 }
1196 
1197 static void
1198 nfs4_callback_fini_zone(zoneid_t zoneid, void *data)
1199 {
1200 	struct nfs4_callback_globals *ncg = data;
1201 	struct nfs4_cb_port *p;
1202 	nfs4_server_t *sp, *next;
1203 	nfs4_server_t freelist;
1204 	int i;
1205 
1206 	kstat_delete_byname_zone("nfs", 0, "nfs4_callback_stats", zoneid);
1207 
1208 	/*
1209 	 * Discard all delegations that may have crept in since we did the
1210 	 * _shutdown.
1211 	 */
1212 	nfs4_discard_delegations(ncg);
1213 	/*
1214 	 * We're completely done with this zone and all associated
1215 	 * nfs4_server_t's.  Any remaining nfs4_server_ts should only have one
1216 	 * more reference outstanding -- the reference we didn't release in
1217 	 * nfs4_renew_lease_thread().
1218 	 *
1219 	 * Here we need to run through the global nfs4_server_lst as we need to
1220 	 * deal with nfs4_server_ts without programs, as they also have threads
1221 	 * created for them, and so have outstanding references that we need to
1222 	 * release.
1223 	 */
1224 	freelist.forw = &freelist;
1225 	freelist.back = &freelist;
1226 	mutex_enter(&nfs4_server_lst_lock);
1227 	sp = nfs4_server_lst.forw;
1228 	while (sp != &nfs4_server_lst) {
1229 		next = sp->forw;
1230 		if (sp->zoneid == zoneid) {
1231 			remque(sp);
1232 			insque(sp, &freelist);
1233 			mutex_enter(&sp->s_lock);
1234 			sp->s_flags &= ~N4S_INSERTED;
1235 			mutex_exit(&sp->s_lock);
1236 		}
1237 		sp = next;
1238 	}
1239 	mutex_exit(&nfs4_server_lst_lock);
1240 
1241 	sp = freelist.forw;
1242 	while (sp != &freelist) {
1243 		next = sp->forw;
1244 		sp->forw = sp->back = NULL;
1245 		nfs4_server_rele(sp);	/* free the list's reference */
1246 		sp = next;
1247 	}
1248 
1249 #ifdef DEBUG
1250 	for (i = 0; i < nfs4_num_prognums; i++) {
1251 		ASSERT(ncg->nfs4prog2server[i] == NULL);
1252 	}
1253 #endif
1254 	kmem_free(ncg->nfs4prog2server, nfs4_num_prognums *
1255 	    sizeof (struct nfs4_server *));
1256 
1257 	mutex_enter(&ncg->nfs4_cb_lock);
1258 	while ((p = list_head(&ncg->nfs4_cb_ports)) != NULL) {
1259 		list_remove(&ncg->nfs4_cb_ports, p);
1260 		kmem_free(p, sizeof (*p));
1261 	}
1262 	list_destroy(&ncg->nfs4_cb_ports);
1263 	mutex_destroy(&ncg->nfs4_cb_lock);
1264 	list_destroy(&ncg->nfs4_dlist);
1265 	mutex_destroy(&ncg->nfs4_dlist_lock);
1266 	kmem_free(ncg, sizeof (*ncg));
1267 }
1268 
1269 void
1270 nfs4_callback_init(void)
1271 {
1272 	int i;
1273 	SVC_CALLOUT *nfs4_cb_sc;
1274 
1275 	/* initialize the callback table */
1276 	nfs4_cb_sc = kmem_alloc(nfs4_num_prognums *
1277 		sizeof (SVC_CALLOUT), KM_SLEEP);
1278 
1279 	for (i = 0; i < nfs4_num_prognums; i++) {
1280 		nfs4_cb_sc[i].sc_prog = NFS4_CALLBACK+i;
1281 		nfs4_cb_sc[i].sc_versmin = NFS_CB;
1282 		nfs4_cb_sc[i].sc_versmax = NFS_CB;
1283 		nfs4_cb_sc[i].sc_dispatch = cb_dispatch;
1284 	}
1285 
1286 	nfs4_cb_sct.sct_size = nfs4_num_prognums;
1287 	nfs4_cb_sct.sct_free = FALSE;
1288 	nfs4_cb_sct.sct_sc = nfs4_cb_sc;
1289 
1290 	/*
1291 	 * Compute max bytes required for dynamically allocated parts
1292 	 * of cb_getattr reply.  Only size and change are supported now.
1293 	 * If CB_GETATTR is changed to reply with additional attrs,
1294 	 * additional sizes must be added below.
1295 	 *
1296 	 * fattr4_change + fattr4_size == uint64_t + uint64_t
1297 	 */
1298 	cb_getattr_bytes = 2 * BYTES_PER_XDR_UNIT + 2 * BYTES_PER_XDR_UNIT;
1299 
1300 	zone_key_create(&nfs4_callback_zone_key, nfs4_callback_init_zone,
1301 	    nfs4_callback_shutdown_zone, nfs4_callback_fini_zone);
1302 }
1303 
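/*
 * Nothing to tear down at module-fini time; per-zone state is
 * released by the zone shutdown/destroy callbacks registered in
 * nfs4_callback_init().
 */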
1304 void
1305 nfs4_callback_fini(void)
1306 {
1307 }
1308 
1309 /*
1310  * NB: This function can be called from the *wrong* zone (ie, the zone that
1311  * 'rp' belongs to and the caller's zone may not be the same).  This can happen
1312  * if the zone is going away and we get called from nfs4_async_inactive().  In
1313  * this case the globals will be NULL and we won't update the counters, which
1314  * doesn't matter as the zone is going away anyhow.
1315  */
1316 static void
1317 nfs4delegreturn_cleanup_impl(rnode4_t *rp, nfs4_server_t *np,
1318 	struct nfs4_callback_globals *ncg)
1319 {
1320 	mntinfo4_t *mi = VTOMI4(RTOV4(rp));
1321 	boolean_t need_rele = B_FALSE;
1322 
1323 	mutex_enter(&rp->r_statev4_lock);
1324 
1325 	if (rp->r_deleg_type == OPEN_DELEGATE_NONE) {
1326 		mutex_exit(&rp->r_statev4_lock);
1327 		return;
1328 	}
1329 
1330 	/*
1331 	 * Free the cred originally held when
1332 	 * the delegation was granted.  Caller must
1333 	 * hold this cred if it wants to use it after
1334 	 * this call.
1335 	 */
1336 	crfree(rp->r_deleg_cred);
1337 	rp->r_deleg_cred = NULL;
1338 	rp->r_deleg_type = OPEN_DELEGATE_NONE;
1339 	rp->r_deleg_needs_recovery = OPEN_DELEGATE_NONE;
1340 	rp->r_deleg_needs_recall = FALSE;
1341 	rp->r_deleg_return_pending = FALSE;
1342 	mutex_exit(&rp->r_statev4_lock);
1343 
1344 	/*
1345 	 * Caller must be holding mi_recovlock in read mode
1346 	 * to call here.  This is provided by start_op.
1347 	 */
1348 
1349 	if (np == NULL) {
1350 		np = find_nfs4_server_all(mi, 1);
1351 		ASSERT(np != NULL);
1352 		need_rele = B_TRUE;
1353 	} else {
1354 		mutex_enter(&np->s_lock);
1355 	}
1356 
1357 	/*
1358 	 * Remove the rnode from the server's list and
1359 	 * update the ref counts.
1360 	 */
1361 	list_remove(&np->s_deleg_list, rp);
1362 	nfs4_dec_state_ref_count_nolock(np, mi);
1363 	mutex_exit(&np->s_lock);
1364 	/* removed list node removes a reference */
1365 	nfs4_server_rele(np);
1366 	if (need_rele)
1367 		nfs4_server_rele(np);
1368 	if (ncg != NULL)
1369 		ncg->nfs4_callback_stats.delegations.value.ui64--;
1370 }
1371 
1372 void
1373 nfs4delegreturn_cleanup(rnode4_t *rp, nfs4_server_t *np)
1374 {
1375 	struct nfs4_callback_globals *ncg;
1376 
1377 	if (np != NULL) {
1378 		ncg = np->zone_globals;
1379 	} else if (curproc->p_zone == VTOMI4(RTOV4(rp))->mi_zone) {
1380 		ncg = zone_getspecific(nfs4_callback_zone_key, curproc->p_zone);
1381 		ASSERT(ncg != NULL);
1382 	} else {
1383 		/*
1384 		 * Request coming from the wrong zone.
1385 		 */
1386 		ASSERT(getzoneid() == GLOBAL_ZONEID);
1387 		ncg = NULL;
1388 	}
1389 
1390 	nfs4delegreturn_cleanup_impl(rp, np, ncg);
1391 }
1392 
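/*
 * nfs4delegreturn_save_lost_rqst - if the error means the DELEGRETURN
 * was interrupted, timed out, or hit a forced unmount, record it as a
 * lost request so the recovery framework can re-drive it; otherwise
 * set lr_op to 0 so the caller knows nothing was saved.
 */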
1393 static void
1394 nfs4delegreturn_save_lost_rqst(int error, nfs4_lost_rqst_t *lost_rqstp,
1395 	cred_t *cr, vnode_t *vp)
1396 {
1397 	if (error != ETIMEDOUT && error != EINTR &&
1398 	    !NFS4_FRC_UNMT_ERR(error, vp->v_vfsp)) {
1399 		lost_rqstp->lr_op = 0;
1400 		return;
1401 	}
1402 
1403 	NFS4_DEBUG(nfs4_lost_rqst_debug, (CE_NOTE,
1404 			"nfs4delegreturn_save_lost_rqst: error %d", error));
1405 
1406 	lost_rqstp->lr_op = OP_DELEGRETURN;
1407 	/*
1408 	 * The vp is held and rele'd via the recovery code.
1409 	 * See nfs4_save_lost_rqst.
1410 	 */
1411 	lost_rqstp->lr_vp = vp;
1412 	lost_rqstp->lr_dvp = NULL;
1413 	lost_rqstp->lr_oop = NULL;
1414 	lost_rqstp->lr_osp = NULL;
1415 	lost_rqstp->lr_lop = NULL;
1416 	lost_rqstp->lr_cr = cr;
1417 	lost_rqstp->lr_flk = NULL;
1418 	lost_rqstp->lr_putfirst = FALSE;
1419 }
1420 
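/*
 * nfs4delegreturn_otw - issue the over-the-wire compound
 * (PUTFH, GETATTR, DELEGRETURN) that returns the delegation to the
 * server, caching the post-op attributes on success.
 */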
1421 static void
1422 nfs4delegreturn_otw(rnode4_t *rp, cred_t *cr, nfs4_error_t *ep)
1423 {
1424 	COMPOUND4args_clnt args;
1425 	COMPOUND4res_clnt res;
1426 	nfs_argop4 argops[3];
1427 	nfs4_ga_res_t *garp = NULL;
1428 	hrtime_t t;
1429 	int numops;
1430 	int doqueue = 1;
1431 
1432 	args.ctag = TAG_DELEGRETURN;
1433 
1434 	numops = 3;		/* PUTFH, GETATTR, DELEGRETURN */
1435 
1436 	args.array = argops;
1437 	args.array_len = numops;
1438 
1439 	argops[0].argop = OP_CPUTFH;
1440 	argops[0].nfs_argop4_u.opcputfh.sfh = rp->r_fh;
1441 
1442 	argops[1].argop = OP_GETATTR;
1443 	argops[1].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK;
1444 	argops[1].nfs_argop4_u.opgetattr.mi = VTOMI4(RTOV4(rp));
1445 
1446 	argops[2].argop = OP_DELEGRETURN;
1447 	argops[2].nfs_argop4_u.opdelegreturn.deleg_stateid =
1448 		rp->r_deleg_stateid;
1449 
1450 	t = gethrtime();
1451 	rfs4call(VTOMI4(RTOV4(rp)), &args, &res, cr, &doqueue, 0, ep);
1452 
1453 	if (ep->error)
1454 		return;
1455 
1456 	if (res.status == NFS4_OK) {
1457 		garp = &res.array[1].nfs_resop4_u.opgetattr.ga_res;
1458 		nfs4_attr_cache(RTOV4(rp), garp, t, cr, TRUE, NULL);
1459 
1460 	}
1461 	(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
1462 }
1463 
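/*
 * nfs4_do_delegreturn - perform a synchronous DELEGRETURN under the
 * start_fop/end_op protocol, handing the operation off to the
 * recovery framework when necessary.  Returns an errno value.
 */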
1464 int
1465 nfs4_do_delegreturn(rnode4_t *rp, int flags, cred_t *cr,
1466 	struct nfs4_callback_globals *ncg)
1467 {
1468 	vnode_t *vp = RTOV4(rp);
1469 	mntinfo4_t *mi = VTOMI4(vp);
1470 	nfs4_lost_rqst_t lost_rqst;
1471 	nfs4_recov_state_t recov_state;
1472 	bool_t needrecov = FALSE, recovonly, done = FALSE;
1473 	nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };
1474 
1475 	ncg->nfs4_callback_stats.delegreturn.value.ui64++;
1476 
1477 	while (!done) {
1478 		e.error = nfs4_start_fop(mi, vp, NULL, OH_DELEGRETURN,
1479 				&recov_state, &recovonly);
1480 
1481 		if (e.error) {
1482 			if (flags & NFS4_DR_FORCE) {
1483 				(void) nfs_rw_enter_sig(&mi->mi_recovlock,
1484 				    RW_READER, 0);
1485 				nfs4delegreturn_cleanup_impl(rp, NULL, ncg);
1486 				nfs_rw_exit(&mi->mi_recovlock);
1487 			}
1488 			break;
1489 		}
1490 
1491 		/*
1492 		 * Check to see if the delegation has already been
1493 		 * returned by the recovery thread.   The state of
1494 		 * the delegation cannot change at this point due
1495 		 * to start_fop and the r_deleg_recall_lock.
1496 		 */
1497 		if (rp->r_deleg_type == OPEN_DELEGATE_NONE) {
1498 			e.error = 0;
1499 			nfs4_end_op(mi, vp, NULL, &recov_state, needrecov);
1500 			break;
1501 		}
1502 
1503 		if (recovonly) {
1504 			/*
1505 			 * Delegation will be returned via the
1506 			 * recovery framework.  Build a lost request
1507 			 * structure, start recovery and get out.
1508 			 */
1509 			nfs4_error_init(&e, EINTR);
1510 			nfs4delegreturn_save_lost_rqst(e.error, &lost_rqst,
1511 				cr, vp);
1512 			(void) nfs4_start_recovery(&e, mi, vp,
1513 				NULL, &rp->r_deleg_stateid,
1514 				lost_rqst.lr_op == OP_DELEGRETURN ?
1515 				&lost_rqst : NULL, OP_DELEGRETURN, NULL);
1516 			nfs4_end_op(mi, vp, NULL, &recov_state, needrecov);
1517 			break;
1518 		}
1519 
1520 		nfs4delegreturn_otw(rp, cr, &e);
1521 
1522 		/*
1523 		 * Ignore some errors on delegreturn; no point in marking
1524 		 * the file dead on a state destroying operation.
1525 		 */
1526 		if (e.error == 0 && (nfs4_recov_marks_dead(e.stat) ||
1527 		    e.stat == NFS4ERR_BADHANDLE ||
1528 		    e.stat == NFS4ERR_STALE))
1529 			needrecov = FALSE;
1530 		else
1531 			needrecov = nfs4_needs_recovery(&e, TRUE, vp->v_vfsp);
1532 
1533 		if (needrecov) {
1534 			nfs4delegreturn_save_lost_rqst(e.error, &lost_rqst,
1535 				cr, vp);
1536 			(void) nfs4_start_recovery(&e, mi, vp,
1537 				NULL, &rp->r_deleg_stateid,
1538 				lost_rqst.lr_op == OP_DELEGRETURN ?
1539 				&lost_rqst : NULL, OP_DELEGRETURN, NULL);
1540 		} else {
1541 			nfs4delegreturn_cleanup_impl(rp, NULL, ncg);
1542 			done = TRUE;
1543 		}
1544 
1545 		nfs4_end_op(mi, vp, NULL, &recov_state, needrecov);
1546 	}
1547 	return (e.error);
1548 }
1549 
1550 /*
1551  * nfs4_resend_delegreturn - used to drive the delegreturn
1552  * operation via the recovery thread.
1553  */
1554 void
1555 nfs4_resend_delegreturn(nfs4_lost_rqst_t *lorp, nfs4_error_t *ep,
1556 	nfs4_server_t *np)
1557 {
1558 	rnode4_t *rp = VTOR4(lorp->lr_vp);
1559 
1560 	/* If the file failed recovery, just quit. */
1561 	mutex_enter(&rp->r_statelock);
1562 	if (rp->r_flags & R4RECOVERR) {
1563 		ep->error = EIO;
1564 	}
1565 	mutex_exit(&rp->r_statelock);
1566 
1567 	if (!ep->error)
1568 		nfs4delegreturn_otw(rp, lorp->lr_cr, ep);
1569 
1570 	/*
1571 	 * If recovery is now needed, then return the error
1572 	 * and status and let the recovery thread handle it,
1573 	 * including re-driving another delegreturn.  Otherwise,
1574 	 * just give up and clean up the delegation.
1575 	 */
1576 	if (nfs4_needs_recovery(ep, TRUE, lorp->lr_vp->v_vfsp))
1577 		return;
1578 
1579 	if (rp->r_deleg_type != OPEN_DELEGATE_NONE)
1580 		nfs4delegreturn_cleanup(rp, np);
1581 
1582 	nfs4_error_zinit(ep);
1583 }
1584 
1585 /*
1586  * nfs4delegreturn - general function to return a delegation.
1587  *
1588  * NFS4_DR_FORCE - return the delegation even if start_op fails
1589  * NFS4_DR_PUSH - push modified data back to the server via VOP_PUTPAGE
1590  * NFS4_DR_DISCARD - discard the delegation w/o delegreturn
1591  * NFS4_DR_DID_OP - calling function already did nfs4_start_op
1592  * NFS4_DR_RECALL - delegreturn initiated via CB_RECALL
1593  * NFS4_DR_REOPEN - do file reopens, if applicable
1594  */
1595 static int
1596 nfs4delegreturn_impl(rnode4_t *rp, int flags, struct nfs4_callback_globals *ncg)
1597 {
1598 	int error = 0;
1599 	cred_t *cr = NULL;
1600 	vnode_t *vp;
1601 	bool_t needrecov = FALSE;
1602 	bool_t rw_entered = FALSE;
1603 	bool_t do_reopen;
1604 
1605 	vp = RTOV4(rp);
1606 
1607 	/*
1608 	 * If NFS4_DR_DISCARD is set by itself, take a short-cut and
1609 	 * discard without doing an otw DELEGRETURN.  This may only be used
1610 	 * by the recovery thread because it bypasses the synchronization
1611 	 * with r_deleg_recall_lock and mi->mi_recovlock.
1612 	 */
1613 	if (flags == NFS4_DR_DISCARD) {
1614 		nfs4delegreturn_cleanup_impl(rp, NULL, ncg);
1615 		return (0);
1616 	}
1617 
1618 	if (flags & NFS4_DR_DID_OP) {
1619 		/*
1620 		 * Caller had already done start_op, which means the
1621 		 * r_deleg_recall_lock is already held in READ mode
1622 		 * so we cannot take it in write mode.  Return the
1623 		 * delegation asynchronously.
1624 		 *
1625 		 * Remove the NFS4_DR_DID_OP flag so we don't
1626 		 * get stuck looping through here.
1627 		 */
1628 		VN_HOLD(vp);
1629 		nfs4delegreturn_async(rp, (flags & ~NFS4_DR_DID_OP), FALSE);
1630 		return (0);
1631 	}
1632 
1633 	/*
1634 	 * Take r_deleg_recall_lock to verify we still have a delegation
1635 	 * and to crhold the credential.  We have to release the lock
1636 	 * before we call VOP_PUTPAGE or else we'll deadlock.
1637 	 */
1638 	(void) nfs_rw_enter_sig(&rp->r_deleg_recall_lock, RW_WRITER, FALSE);
1639 	rw_entered = TRUE;
1640 	if (rp->r_deleg_type == OPEN_DELEGATE_NONE)
1641 		goto out;
1642 	cr = rp->r_deleg_cred;
1643 	crhold(cr);
1644 	nfs_rw_exit(&rp->r_deleg_recall_lock);
1645 	rw_entered = FALSE;
1646 
1647 	/*
1648 	 * Push the modified data back to the server synchronously
1649 	 * before doing DELEGRETURN.
1650 	 */
1651 	if (flags & NFS4_DR_PUSH)
1652 		(void) VOP_PUTPAGE(vp, 0, 0, 0, cr);
1653 
1654 	/*
1655 	 * Take r_deleg_recall_lock in WRITE mode, this will prevent
1656 	 * nfs4_is_otw_open_necessary from trying to use the delegation
1657 	 * while the DELEGRETURN is in progress.
1658 	 */
1659 	(void) nfs_rw_enter_sig(&rp->r_deleg_recall_lock, RW_WRITER, FALSE);
1660 
1661 	rw_entered = TRUE;
1662 
1663 	if (rp->r_deleg_type == OPEN_DELEGATE_NONE)
1664 		goto out;
1665 
1666 	if (flags & NFS4_DR_REOPEN) {
1667 		/*
1668 		 * If R4RECOVERRP is already set, then skip re-opening
1669 		 * the delegation open streams and go straight to doing
1670 		 * delegreturn.  (XXX if the file has failed recovery, then the
1671 		 * delegreturn attempt is likely to be futile.)
1672 		 */
1673 		mutex_enter(&rp->r_statelock);
1674 		do_reopen = !(rp->r_flags & R4RECOVERRP);
1675 		mutex_exit(&rp->r_statelock);
1676 
1677 		if (do_reopen) {
1678 			error = deleg_reopen(vp, &needrecov, ncg, flags);
1679 			if (error != 0) {
1680 				if ((flags & (NFS4_DR_FORCE | NFS4_DR_RECALL))
1681 									== 0)
1682 					goto out;
1683 			} else if (needrecov) {
1684 				if ((flags & NFS4_DR_FORCE) == 0)
1685 					goto out;
1686 			}
1687 		}
1688 	}
1689 
1690 	if (flags & NFS4_DR_DISCARD) {
1691 		mntinfo4_t *mi = VTOMI4(RTOV4(rp));
1692 
1693 		mutex_enter(&rp->r_statelock);
1694 		/*
1695 		 * deleg_return_pending is cleared inside delegation_accept
1696 		 * when a delegation is accepted.  If this flag has been
1697 		 * cleared, then a new delegation has overwritten the one we
1698 		 * were about to throw away.
1699 		 */
1700 		if (!rp->r_deleg_return_pending) {
1701 			mutex_exit(&rp->r_statelock);
1702 			goto out;
1703 		}
1704 		mutex_exit(&rp->r_statelock);
1705 		(void) nfs_rw_enter_sig(&mi->mi_recovlock, RW_READER, FALSE);
1706 		nfs4delegreturn_cleanup_impl(rp, NULL, ncg);
1707 		nfs_rw_exit(&mi->mi_recovlock);
1708 	} else {
1709 		error = nfs4_do_delegreturn(rp, flags, cr, ncg);
1710 	}
1711 
1712 out:
1713 	if (cr)
1714 		crfree(cr);
1715 	if (rw_entered)
1716 		nfs_rw_exit(&rp->r_deleg_recall_lock);
1717 	return (error);
1718 }
1719 
1720 int
1721 nfs4delegreturn(rnode4_t *rp, int flags)
1722 {
1723 	struct nfs4_callback_globals *ncg;
1724 
1725 	ncg = zone_getspecific(nfs4_callback_zone_key, curproc->p_zone);
1726 	ASSERT(ncg != NULL);
1727 
1728 	return (nfs4delegreturn_impl(rp, flags, ncg));
1729 }
1730 
1731 void
1732 nfs4delegreturn_async(rnode4_t *rp, int flags, bool_t trunc)
1733 {
1734 	struct cb_recall_pass *pp;
1735 
1736 	pp = kmem_alloc(sizeof (struct cb_recall_pass), KM_SLEEP);
1737 	pp->rp = rp;
1738 	pp->flags = flags;
1739 	pp->truncate = trunc;
1740 
1741 	/*
1742 	 * Fire up a thread to do the actual delegreturn
1743 	 * Caller must guarantee that the rnode doesn't
1744 	 * vanish (by calling VN_HOLD).
1745 	 */
1746 
1747 	(void) zthread_create(NULL, 0, nfs4delegreturn_thread, pp, 0,
1748 				minclsyspri);
1749 }
1750 
1751 static void
1752 delegreturn_all_thread(rpcprog_t *pp)
1753 {
1754 	nfs4_server_t *np;
1755 	bool_t found = FALSE;
1756 	rpcprog_t prog;
1757 	rnode4_t *rp;
1758 	vnode_t *vp;
1759 	zoneid_t zoneid = getzoneid();
1760 	struct nfs4_callback_globals *ncg;
1761 
1762 	NFS4_DEBUG(nfs4_drat_debug,
1763 		(CE_NOTE, "delegreturn_all_thread: prog %d\n", *pp));
1764 
1765 	prog = *pp;
1766 	kmem_free(pp, sizeof (*pp));
1767 	pp = NULL;
1768 
1769 	mutex_enter(&nfs4_server_lst_lock);
1770 	for (np = nfs4_server_lst.forw; np != &nfs4_server_lst; np = np->forw) {
1771 		if (np->zoneid == zoneid && np->s_program == prog) {
1772 			mutex_enter(&np->s_lock);
1773 			found = TRUE;
1774 			break;
1775 		}
1776 	}
1777 	mutex_exit(&nfs4_server_lst_lock);
1778 
1779 	/*
1780 	 * It's possible that the nfs4_server which was using this
1781 	 * program number has vanished since this thread is async.
1782 	 * If so, just return.  Your work here is finished, my friend.
1783 	 */
1784 	if (!found)
1785 		goto out;
1786 
1787 	ncg = np->zone_globals;
1788 	while ((rp = list_head(&np->s_deleg_list)) != NULL) {
1789 		vp = RTOV4(rp);
1790 		VN_HOLD(vp);
1791 		mutex_exit(&np->s_lock);
1792 		(void) nfs4delegreturn_impl(rp, NFS4_DR_PUSH|NFS4_DR_REOPEN,
1793 									ncg);
1794 		VN_RELE(vp);
1795 
1796 		/* retake the s_lock for next trip through the loop */
1797 		mutex_enter(&np->s_lock);
1798 	}
1799 	mutex_exit(&np->s_lock);
1800 out:
1801 	NFS4_DEBUG(nfs4_drat_debug,
1802 		(CE_NOTE, "delegreturn_all_thread: complete\n"));
1803 	zthread_exit();
1804 }
1805 
1806 void
1807 nfs4_delegreturn_all(nfs4_server_t *sp)
1808 {
1809 	rpcprog_t pro, *pp;
1810 
1811 	mutex_enter(&sp->s_lock);
1812 
1813 	/* Check to see if the delegation list is empty */
1814 
1815 	if (list_head(&sp->s_deleg_list) == NULL) {
1816 		mutex_exit(&sp->s_lock);
1817 		return;
1818 	}
1819 	/*
1820 	 * Grab the program number; the async thread will use this
1821 	 * to find the nfs4_server.
1822 	 */
1823 	pro = sp->s_program;
1824 	mutex_exit(&sp->s_lock);
1825 	pp = kmem_alloc(sizeof (rpcprog_t), KM_SLEEP);
1826 	*pp = pro;
1827 	(void) zthread_create(NULL, 0, delegreturn_all_thread, pp, 0,
1828 	    minclsyspri);
1829 }
1830 
1831 
1832 /*
1833  * Discard any delegations
1834  *
1835  * Iterate over the servers s_deleg_list and
1836  * for matching mount-point rnodes discard
1837  * the delegation.
1838  */
1839 void
1840 nfs4_deleg_discard(mntinfo4_t *mi, nfs4_server_t *sp)
1841 {
1842 	rnode4_t *rp, *next;
1843 	mntinfo4_t *r_mi;
1844 	struct nfs4_callback_globals *ncg;
1845 
1846 	ASSERT(mutex_owned(&sp->s_lock));
1847 	ncg = sp->zone_globals;
1848 
1849 	for (rp = list_head(&sp->s_deleg_list); rp != NULL; rp = next) {
1850 		r_mi = VTOMI4(RTOV4(rp));
1851 		next = list_next(&sp->s_deleg_list, rp);
1852 
1853 		if (r_mi != mi) {
1854 			/*
1855 			 * Skip if this rnode is not on the
1856 			 * same mount-point.
1857 			 */
1858 			continue;
1859 		}
1860 
1861 		ASSERT(rp->r_deleg_type == OPEN_DELEGATE_READ);
1862 
1863 #ifdef DEBUG
1864 		if (nfs4_client_recov_debug) {
1865 			zprintf(getzoneid(),
1866 			    "nfs4_deleg_discard: matched rnode %p "
1867 			    "-- discarding delegation\n", (void *)rp);
1868 		}
1869 #endif
1870 		mutex_enter(&rp->r_statev4_lock);
1871 		/*
1872 		 * Free the cred originally held when the delegation
1873 		 * was granted. Also need to decrement the refcnt
1874 		 * on this server for each delegation we discard
1875 		 */
1876 		if (rp->r_deleg_cred)
1877 			crfree(rp->r_deleg_cred);
1878 		rp->r_deleg_cred = NULL;
1879 		rp->r_deleg_type = OPEN_DELEGATE_NONE;
1880 		rp->r_deleg_needs_recovery = OPEN_DELEGATE_NONE;
1881 		rp->r_deleg_needs_recall = FALSE;
1882 		ASSERT(sp->s_refcnt > 1);
1883 		sp->s_refcnt--;
1884 		list_remove(&sp->s_deleg_list, rp);
1885 		mutex_exit(&rp->r_statev4_lock);
1886 		nfs4_dec_state_ref_count_nolock(sp, mi);
1887 		ncg->nfs4_callback_stats.delegations.value.ui64--;
1888 	}
1889 }
1890 
1891 /*
1892  * Reopen any open streams that were covered by the given file's
1893  * delegation.
1894  * Returns zero or an errno value.  If there was no error, *recovp
1895  * indicates whether recovery was initiated.
1896  */
1897 
1898 static int
1899 deleg_reopen(vnode_t *vp, bool_t *recovp, struct nfs4_callback_globals *ncg,
1900 	int flags)
1901 {
1902 	nfs4_open_stream_t *osp;
1903 	nfs4_recov_state_t recov_state;
1904 	bool_t needrecov = FALSE;
1905 	mntinfo4_t *mi;
1906 	rnode4_t *rp;
1907 	nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };
1908 	int claimnull;
1909 
1910 	mi = VTOMI4(vp);
1911 	rp = VTOR4(vp);
1912 
1913 	recov_state.rs_flags = 0;
1914 	recov_state.rs_num_retry_despite_err = 0;
1915 
1916 retry:
1917 	if ((e.error = nfs4_start_op(mi, vp, NULL, &recov_state)) != 0) {
1918 		return (e.error);
1919 	}
1920 
1921 	/*
1922 	 * if we mean to discard the delegation, it must be BAD, so don't
1923 	 * use it when doing the reopen or it will fail too.
1924 	 */
1925 	claimnull = (flags & NFS4_DR_DISCARD);
1926 	/*
1927 	 * Loop through the open streams for this rnode to find
1928 	 * all of the ones created using the delegation state ID.
1929 	 * Each of these needs to be re-opened.
1930 	 */
1931 
1932 	while ((osp = get_next_deleg_stream(rp, claimnull)) != NULL) {
1933 
1934 		if (claimnull) {
1935 			nfs4_reopen(vp, osp, &e, CLAIM_NULL, FALSE, FALSE);
1936 		} else {
1937 			ncg->nfs4_callback_stats.claim_cur.value.ui64++;
1938 
1939 			nfs4_reopen(vp, osp, &e, CLAIM_DELEGATE_CUR, FALSE,
1940 					FALSE);
1941 			if (e.error == 0 && e.stat == NFS4_OK)
1942 				ncg->nfs4_callback_stats.
1943 					claim_cur_ok.value.ui64++;
1944 		}
1945 
1946 		if (e.error == EAGAIN) {
1947 			nfs4_end_op(mi, vp, NULL, &recov_state, TRUE);
1948 			goto retry;
1949 		}
1950 
1951 		/*
1952 		 * if error is EINTR, ETIMEDOUT, or NFS4_FRC_UNMT_ERR, then
1953 		 * recovery has already been started inside of nfs4_reopen.
1954 		 */
1955 		if (e.error == EINTR || e.error == ETIMEDOUT ||
1956 		    NFS4_FRC_UNMT_ERR(e.error, vp->v_vfsp)) {
1957 			open_stream_rele(osp, rp);
1958 			break;
1959 		}
1960 
1961 		needrecov = nfs4_needs_recovery(&e, TRUE, vp->v_vfsp);
1962 
1963 		if (e.error != 0 && !needrecov) {
1964 			/*
1965 			 * Recovery is not possible, but don't give up yet;
1966 			 * we'd still like to do delegreturn after
1967 			 * reopening as many streams as possible.
1968 			 * Continue processing the open streams.
1969 			 */
1970 
1971 			ncg->nfs4_callback_stats.recall_failed.value.ui64++;
1972 
1973 		} else if (needrecov) {
1974 			/*
1975 			 * Start recovery and bail out.  The recovery
1976 			 * thread will take it from here.
1977 			 */
1978 			(void) nfs4_start_recovery(&e, mi, vp, NULL, NULL,
1979 				NULL, OP_OPEN, NULL);
1980 			open_stream_rele(osp, rp);
1981 			*recovp = TRUE;
1982 			break;
1983 		}
1984 
1985 		open_stream_rele(osp, rp);
1986 	}
1987 
1988 	nfs4_end_op(mi, vp, NULL, &recov_state, needrecov);
1989 
1990 	return (e.error);
1991 }
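
/*
 * deleg_reopen() above is an instance of the standard
 * nfs4_start_op()/nfs4_end_op() bracket with an EAGAIN retry.
 * Reduced to its skeleton (illustrative only, other error
 * handling elided):
 *
 *	retry:
 *		if ((e.error = nfs4_start_op(mi, vp, NULL, &recov_state)) != 0)
 *			return (e.error);
 *		(do the over-the-wire work, which may set e.error to EAGAIN)
 *		if (e.error == EAGAIN) {
 *			nfs4_end_op(mi, vp, NULL, &recov_state, TRUE);
 *			goto retry;
 *		}
 *		nfs4_end_op(mi, vp, NULL, &recov_state, needrecov);
 */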
1992 
1993 /*
1994  * get_next_deleg_stream - returns the next open stream which
1995  * represents a delegation for this rnode.  In order to assure
1996  * forward progress, the caller must guarantee that each open
1997  * stream returned is changed so that a future call won't return
1998  * it again.
1999  *
2000  * There are several ways for the open stream to change.  If the open
2001  * stream is !os_delegation, then we aren't interested in it.  Also, if
2002  * either os_failed_reopen or !os_valid, then don't return the osp.
2003  *
2004  * If claimnull is false (doing reopen CLAIM_DELEGATE_CUR) then return
2005  * the osp if it is an os_delegation open stream.  Also, if the rnode still
2006  * has r_deleg_return_pending, then return the os_delegation osp.  Lastly,
2007  * if the rnode's r_deleg_stateid is different from the osp's open_stateid,
2008  * then return the osp.
2009  *
2010  * We have already taken the 'r_deleg_recall_lock' as WRITER, which
2011  * prevents new OPENs from going OTW (as start_fop takes this
2012  * lock in READ mode); thus, no new open streams can be created
2013  * (which inherently means no new delegation open streams are
2014  * being created).
2015  */
2016 
2017 static nfs4_open_stream_t *
2018 get_next_deleg_stream(rnode4_t *rp, int claimnull)
2019 {
2020 	nfs4_open_stream_t	*osp;
2021 
2022 	ASSERT(nfs_rw_lock_held(&rp->r_deleg_recall_lock, RW_WRITER));
2023 
2024 	/*
2025 	 * Search through the list of open streams looking for
2026 	 * one that was created while holding the delegation.
2027 	 */
2028 	mutex_enter(&rp->r_os_lock);
2029 	for (osp = list_head(&rp->r_open_streams); osp != NULL;
2030 	    osp = list_next(&rp->r_open_streams, osp)) {
2031 		mutex_enter(&osp->os_sync_lock);
2032 		if (!osp->os_delegation || osp->os_failed_reopen ||
2033 		    !osp->os_valid) {
2034 			mutex_exit(&osp->os_sync_lock);
2035 			continue;
2036 		}
2037 		if (!claimnull || rp->r_deleg_return_pending ||
2038 		    !stateid4_cmp(&osp->open_stateid, &rp->r_deleg_stateid)) {
2039 			osp->os_ref_count++;
2040 			mutex_exit(&osp->os_sync_lock);
2041 			mutex_exit(&rp->r_os_lock);
2042 			return (osp);
2043 		}
2044 		mutex_exit(&osp->os_sync_lock);
2045 	}
2046 	mutex_exit(&rp->r_os_lock);
2047 
2048 	return (NULL);
2049 }
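
/*
 * The stateid test above depends on stateid4_cmp() returning TRUE
 * only when the two stateids are identical.  A plausible minimal
 * form -- an assumption for illustration only; the real definition
 * lives elsewhere in the client -- is a whole-structure byte
 * comparison:
 *
 *	bool_t
 *	stateid4_cmp(stateid4 *s1, stateid4 *s2)
 *	{
 *		return (bcmp(s1, s2, sizeof (stateid4)) == 0 ?
 *		    TRUE : FALSE);
 *	}
 */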
2050 
2051 static void
2052 nfs4delegreturn_thread(struct cb_recall_pass *args)
2053 {
2054 	rnode4_t *rp;
2055 	vnode_t *vp;
2056 	cred_t *cr;
2057 	int dtype, error, flags;
2058 	bool_t rdirty, rip;
2059 	kmutex_t cpr_lock;
2060 	callb_cpr_t cpr_info;
2061 	struct nfs4_callback_globals *ncg;
2062 
2063 	ncg = zone_getspecific(nfs4_callback_zone_key, curproc->p_zone);
2064 	ASSERT(ncg != NULL);
2065 
2066 	mutex_init(&cpr_lock, NULL, MUTEX_DEFAULT, NULL);
2067 
2068 	CALLB_CPR_INIT(&cpr_info, &cpr_lock, callb_generic_cpr,
2069 			"nfsv4delegRtn");
2070 
2071 	rp = args->rp;
2072 	vp = RTOV4(rp);
2073 
2074 	mutex_enter(&rp->r_statev4_lock);
2075 	if (rp->r_deleg_type == OPEN_DELEGATE_NONE) {
2076 		mutex_exit(&rp->r_statev4_lock);
2077 		goto out;
2078 	}
2079 	mutex_exit(&rp->r_statev4_lock);
2080 
2081 	/*
2082 	 * Take the read-write lock in read mode to prevent other
2083 	 * threads from modifying the data during the recall.  This
2084 	 * doesn't affect mmappers.
2085 	 */
2086 	(void) nfs_rw_enter_sig(&rp->r_rwlock, RW_READER, FALSE);
2087 
2088 	/* Proceed with delegreturn */
2089 
2090 	mutex_enter(&rp->r_statev4_lock);
2091 	if (rp->r_deleg_type == OPEN_DELEGATE_NONE) {
2092 		mutex_exit(&rp->r_statev4_lock);
2093 		nfs_rw_exit(&rp->r_rwlock);
2094 		goto out;
2095 	}
2096 	dtype = rp->r_deleg_type;
2097 	cr = rp->r_deleg_cred;
2098 	ASSERT(cr != NULL);
2099 	crhold(cr);
2100 	mutex_exit(&rp->r_statev4_lock);
2101 
2102 	flags = args->flags;
2103 
2104 	/*
2105 	 * If the file is being truncated at the server, then throw
2106 	 * away all of the pages; it doesn't matter what flavor of
2107 	 * delegation we have.
2108 	 */
2109 
2110 	if (args->truncate) {
2111 		ncg->nfs4_callback_stats.recall_trunc.value.ui64++;
2112 		nfs4_invalidate_pages(vp, 0, cr);
2113 	} else if (dtype == OPEN_DELEGATE_WRITE) {
2114 
2115 		mutex_enter(&rp->r_statelock);
2116 		rdirty = rp->r_flags & R4DIRTY;
2117 		mutex_exit(&rp->r_statelock);
2118 
2119 		if (rdirty) {
2120 			error = VOP_PUTPAGE(vp, 0, 0, 0, cr);
2121 
2122 			if (error)
2123 				CB_WARN1("nfs4delegreturn_thread:"
2124 				" VOP_PUTPAGE: %d\n", error);
2125 		}
2126 		/* turn off NFS4_DR_PUSH because we just did that above. */
2127 		flags &= ~NFS4_DR_PUSH;
2128 	}
2129 
2130 	mutex_enter(&rp->r_statelock);
2131 	rip = rp->r_flags & R4RECOVERRP;
2132 	mutex_exit(&rp->r_statelock);
2133 
2134 	/* If a failed recovery is indicated, discard the pages */
2135 
2136 	if (rip) {
2137 
2138 		error = VOP_PUTPAGE(vp, 0, 0, B_INVAL, cr);
2139 
2140 		if (error)
2141 			CB_WARN1("nfs4delegreturn_thread: VOP_PUTPAGE: %d\n",
2142 				error);
2143 	}
2144 
2145 	/*
2146 	 * Pass the flags to nfs4delegreturn_impl, but be sure not to pass
2147 	 * NFS4_DR_DID_OP, which would just call nfs4delegreturn_async again.
2148 	 */
2149 	flags &= ~NFS4_DR_DID_OP;
2150 
2151 	(void) nfs4delegreturn_impl(rp, flags, ncg);
2152 
2153 	nfs_rw_exit(&rp->r_rwlock);
2154 	crfree(cr);
2155 out:
2156 	kmem_free(args, sizeof (struct cb_recall_pass));
2157 	VN_RELE(vp);
2158 	mutex_enter(&cpr_lock);
2159 	CALLB_CPR_EXIT(&cpr_info);
2160 	mutex_destroy(&cpr_lock);
2161 	zthread_exit();
2162 }
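
/*
 * The CPR (suspend/resume) handshake in nfs4delegreturn_thread()
 * follows the stock pattern for kernel threads: a private mutex
 * guards the callback state, CALLB_CPR_INIT() registers the thread,
 * and CALLB_CPR_EXIT() -- which also drops cpr_lock -- must be the
 * last CPR action before the thread exits.  Skeleton, for
 * illustration:
 *
 *	mutex_init(&cpr_lock, NULL, MUTEX_DEFAULT, NULL);
 *	CALLB_CPR_INIT(&cpr_info, &cpr_lock, callb_generic_cpr, "tag");
 *	(thread body)
 *	mutex_enter(&cpr_lock);
 *	CALLB_CPR_EXIT(&cpr_info);
 *	mutex_destroy(&cpr_lock);
 *	zthread_exit();
 */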
2163 
2164 /*
2165  * This function assumes that its caller is either doing recovery
2166  * (and therefore cannot call nfs4_start_op()) or has already
2167  * called nfs4_start_op().
2168  */
2169 void
2170 nfs4_delegation_accept(rnode4_t *rp, open_claim_type4 claim,  OPEN4res *res,
2171 	nfs4_ga_res_t *garp, cred_t *cr)
2172 {
2173 	open_read_delegation4 *orp;
2174 	open_write_delegation4 *owp;
2175 	nfs4_server_t *np;
2176 	bool_t already = FALSE;
2177 	bool_t recall = FALSE;
2178 	bool_t valid_garp = TRUE;
2179 	long mapcnt;
2180 	uint_t rflag;
2181 	mntinfo4_t *mi;
2182 	bool_t recov;
2183 	struct nfs4_callback_globals *ncg;
2184 
2185 	ncg = zone_getspecific(nfs4_callback_zone_key, curproc->p_zone);
2186 	ASSERT(ncg != NULL);
2187 
2188 	mutex_enter(&rp->r_statev4_lock);
2189 
2190 	if (rp->r_deleg_type == OPEN_DELEGATE_READ ||
2191 	    rp->r_deleg_type == OPEN_DELEGATE_WRITE)
2192 		already = TRUE;
2193 
2194 	if (res->delegation.delegation_type == OPEN_DELEGATE_READ) {
2195 
2196 		rp->r_deleg_type = res->delegation.delegation_type;
2197 		orp = &res->delegation.open_delegation4_u.read;
2198 		rp->r_deleg_stateid = orp->stateid;
2199 		rp->r_deleg_perms = orp->permissions;
2200 		recall = orp->recall;
2201 
2202 		ncg->nfs4_callback_stats.delegations.value.ui64++;
2203 		ncg->nfs4_callback_stats.delegaccept_r.value.ui64++;
2204 
2205 	} else if (res->delegation.delegation_type == OPEN_DELEGATE_WRITE) {
2206 
2207 		rp->r_deleg_type = res->delegation.delegation_type;
2208 		owp = &res->delegation.open_delegation4_u.write;
2209 		rp->r_deleg_stateid = owp->stateid;
2210 		rp->r_deleg_perms = owp->permissions;
2211 		rp->r_deleg_limit = owp->space_limit;
2212 		recall = owp->recall;
2213 
2214 		if (garp == NULL || !garp->n4g_change_valid) {
2215 			valid_garp = FALSE;
2216 			rp->r_deleg_change = 0;
2217 			rp->r_deleg_change_grant = 0;
2218 		} else {
2219 			rp->r_deleg_change = garp->n4g_change;
2220 			rp->r_deleg_change_grant = garp->n4g_change;
2221 		}
2222 
2223 
2224 		mutex_enter(&rp->r_statelock);
2225 		mapcnt = rp->r_mapcnt;
2226 		rflag = rp->r_flags;
2227 		mutex_exit(&rp->r_statelock);
2228 
2229 		/*
2230 		 * Update the delegation change attribute if
2231 		 * there are mappers or the file is dirty.  This
2232 		 * might be the case during recovery after server
2233 		 * reboot.
2234 		 */
2235 
2236 		if (mapcnt > 0 || rflag & R4DIRTY)
2237 			rp->r_deleg_change++;
2238 
2239 		NFS4_DEBUG(nfs4_callback_debug, (CE_NOTE,
2240 			"nfs4_delegation_accept: r_deleg_change: 0x%x\n",
2241 			(int)(rp->r_deleg_change >> 32)));
2242 		NFS4_DEBUG(nfs4_callback_debug, (CE_NOTE,
2243 			"nfs4_delegation_accept: r_deleg_change_grant: 0x%x\n",
2244 			(int)(rp->r_deleg_change_grant >> 32)));
2245 
2246 		ncg->nfs4_callback_stats.delegations.value.ui64++;
2247 		ncg->nfs4_callback_stats.delegaccept_rw.value.ui64++;
2248 
2249 #ifdef	DEBUG
2250 		if (nfs4_use_phony_limit == 1)
2251 			rp->r_deleg_limit = nfs4_deleg_space_phony;
2252 		if (nfs4_use_phony_limit == 2) {
2253 			rp->r_deleg_limit = nfs4_deleg_space_phony2;
2254 			rp->r_deleg_limit.nfs_space_limit4_u.mod_blocks =
2255 				nfs4_deleg_space_phonyl;
2256 		}
2257 #endif
2258 
2259 #ifdef	DEBUG
2260 
2261 	} else if (nfs4_deleg_accept_phony == OPEN_DELEGATE_READ) {
2262 
2263 		rp->r_deleg_type = OPEN_DELEGATE_READ;
2264 		rp->r_deleg_stateid = nfs4_deleg_any;
2265 		rp->r_deleg_perms = nfs4_deleg_ace_phony;
2266 		rp->r_deleg_change = nfs4_deleg_change_phony;
2267 		rp->r_deleg_change_grant = rp->r_deleg_change;
2268 
2269 	} else if (nfs4_deleg_accept_phony == OPEN_DELEGATE_WRITE) {
2270 
2271 		rp->r_deleg_type = OPEN_DELEGATE_WRITE;
2272 		rp->r_deleg_stateid = nfs4_deleg_any;
2273 		rp->r_deleg_perms = nfs4_deleg_ace_phony;
2274 		rp->r_deleg_limit = nfs4_deleg_space_phony;
2275 		rp->r_deleg_change = nfs4_deleg_change_phony;
2276 		rp->r_deleg_change_grant = rp->r_deleg_change;
2277 
2278 #endif
2279 	} else {
2280 		mutex_exit(&rp->r_statev4_lock);
2281 
2282 		if (already) {
2283 			switch (claim) {
2284 
2285 			case CLAIM_NULL:
2286 			case CLAIM_PREVIOUS:
2287 				/*
2288 				 * The file may already have a delegation when
2289 				 * it is reopened during recovery.  In this
2290 				 * case, we consider the delegation to no longer
2291 				 * be valid.  As a courtesy, attempt to return
2292 				 * the delegation.
2293 				 */
2294 				mi = VTOMI4(RTOV4(rp));
2295 				mutex_enter(&mi->mi_lock);
2296 				recov = mi->mi_recovflags & MI4R_REOPEN_FILES;
2297 				mutex_exit(&mi->mi_lock);
2298 
2299 				/*
2300 				 * We need to hold rp->r_statev4_lock while
2301 				 * checking rp->r_deleg_return_pending and
2302 				 * when calling nfs4_dlistadd() if we're in
2303 				 * recovery.
2304 				 */
2305 				mutex_enter(&rp->r_statev4_lock);
2306 				if (rp->r_deleg_return_pending == TRUE) {
2307 					/*
2308 					 * We're already in the throes of
2309 					 * returning a delegation.  Drop
2310 					 * the lock and head for the return.
2311 					 */
2312 					mutex_exit(&rp->r_statev4_lock);
2313 				} else if (recov) {
2314 					/*
2315 					 * Cannot call delegreturn from inside
2316 					 * of recovery or VOP_PUTPAGE will hang
2317 					 * due to nfs4_start_fop call in
2318 					 * nfs4write.  Use dlistadd to add the
2319 					 * rnode to the list of rnodes needing
2320 					 * cleaning.
2321 					 *
2322 					 * NB: We're in recovery, so don't reopen.
2323 					 */
2324 					nfs4_dlistadd(rp, ncg,
2325 						NFS4_DR_PUSH|NFS4_DR_DISCARD);
2326 					mutex_exit(&rp->r_statev4_lock);
2327 				} else {
2328 					mutex_exit(&rp->r_statev4_lock);
2329 					/* XXX - Do we need to reopen? */
2330 					(void) nfs4delegreturn_impl(rp,
2331 						(NFS4_DR_PUSH |
2332 						    NFS4_DR_DID_OP |
2333 						    NFS4_DR_REOPEN),
2334 						ncg);
2335 				}
2336 				break;
2337 
2338 			default:
2339 				/*
2340 				 * CLAIM_DELEGATE_CUR, CLAIM_DELEGATE_PREV
2341 				 * fall through here
2342 				 */
2343 				break;
2344 			}
2345 		}
2346 
2347 		/* No delegation granted, get out. */
2348 		return;
2349 	}
2350 
2351 	rp->r_deleg_return_pending = FALSE;
2352 	rp->r_deleg_needs_recovery = OPEN_DELEGATE_NONE;
2353 	if (claim == CLAIM_PREVIOUS)
2354 		rp->r_deleg_needs_recall = recall;
2355 
2356 #ifdef	DEBUG
2357 	if (nfs4_use_phony_recall)
2358 		rp->r_deleg_needs_recall = nfs4_phony_recall_v;
2359 #endif
2360 
2361 	/*
2362 	 * If the server has requested a recall, then put the
2363 	 * vnode on a list of files which need to be cleaned.
2364 	 * This will be done later by the recovery thread to
2365 	 * avoid a deadlock.  If this were a CLAIM_NULL open
2366 	 * and the server set recall, then the server is just
2367 	 * confused; the delegation will be returned eventually.
2368 	 */
2369 	if (rp->r_deleg_needs_recall)
2370 		nfs4_dlistadd(rp, ncg, NFS4_DR_PUSH|NFS4_DR_REOPEN);
2371 
2372 	if (already == FALSE) {
2373 		rp->r_deleg_cred = cr;
2374 		crhold(cr);
2375 	}
2376 
2377 	mutex_exit(&rp->r_statev4_lock);
2378 
2379 	if (already == FALSE) {
2380 
2381 		/*
2382 		 * Add this rnode to the list of rnodes with delegations
2383 		 * for this nfs4_server.  find_nfs4_server returns with
2384 		 * the mutex locked, so don't forget to mutex exit.
2385 		 */
2386 
2387 		if ((np = find_nfs4_server(VTOMI4(RTOV4(rp)))) == NULL) {
2388 
2389 			mutex_enter(&rp->r_statev4_lock);
2390 			rp->r_deleg_type = OPEN_DELEGATE_NONE;
2391 			mutex_exit(&rp->r_statev4_lock);
2392 			return;
2393 		}
2394 
2395 		list_insert_head(&np->s_deleg_list, rp);
2396 		/* added list node gets a reference */
2397 		np->s_refcnt++;
2398 		nfs4_inc_state_ref_count_nolock(np, VTOMI4(RTOV4(rp)));
2399 		mutex_exit(&np->s_lock);
2400 		nfs4_server_rele(np);
2401 	}
2402 
2403 	/*
2404 	 * This call passes NFS4_DR_DID_OP since nfs4_start_op has already
2405 	 * been called and must not be called again by nfs4delegreturn.
2406 	 */
2407 	if (nfs4_delegreturn_policy == IMMEDIATE || !valid_garp)
2408 		(void) nfs4delegreturn_impl(rp,
2409 			NFS4_DR_PUSH|NFS4_DR_DID_OP|NFS4_DR_REOPEN, ncg);
2410 }
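
/*
 * Grant-side bookkeeping above is symmetric with the release paths:
 * each rnode added to s_deleg_list accounts for one server reference
 * and one per-mntinfo4 state reference, both taken under np->s_lock.
 * The pairing, in miniature:
 *
 *	grant:   list_insert_head(); np->s_refcnt++;
 *	         nfs4_inc_state_ref_count_nolock();
 *	release: list_remove(); sp->s_refcnt--;
 *	         nfs4_dec_state_ref_count_nolock();
 */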
2411 
2412 /*
2413  * nfs4delegabandon - Abandon the delegation on an rnode4.  This code
2414  * is called when the client receives EXPIRED, BAD_STATEID, OLD_STATEID
2415  * or BADSEQID and the recovery code is unable to recover.  Push any
2416  * dirty data back to the server and return the delegation (if any).
2417  */
2418 
2419 void
2420 nfs4delegabandon(rnode4_t *rp)
2421 {
2422 	vnode_t *vp;
2423 	struct cb_recall_pass *pp;
2424 	open_delegation_type4 dt;
2425 
2426 	mutex_enter(&rp->r_statev4_lock);
2427 	dt = rp->r_deleg_type;
2428 	mutex_exit(&rp->r_statev4_lock);
2429 
2430 	if (dt == OPEN_DELEGATE_NONE)
2431 		return;
2432 
2433 	vp = RTOV4(rp);
2434 	VN_HOLD(vp);
2435 
2436 	pp = kmem_alloc(sizeof (struct cb_recall_pass), KM_SLEEP);
2437 	pp->rp = rp;
2438 	/*
2439 	 * Recovery on the file has failed, so we want to return
2440 	 * the delegation.  We don't want to reopen files;
2441 	 * nfs4delegreturn_thread() figures out what to do about
2442 	 * the data.  The only thing left to do is attempt to
2443 	 * return the delegation.
2444 	 */
2445 	pp->flags = 0;
2446 	pp->truncate = FALSE;
2447 
2448 	/*
2449 	 * Fire up a thread to do the delegreturn; this is
2450 	 * necessary because we could be inside a GETPAGE or
2451 	 * PUTPAGE and we cannot do another one.
2452 	 */
2453 
2454 	(void) zthread_create(NULL, 0, nfs4delegreturn_thread, pp, 0,
2455 				minclsyspri);
2456 }
2457 
2458 static int
2459 wait_for_recall1(vnode_t *vp, nfs4_op_hint_t op, nfs4_recov_state_t *rsp,
2460 	int flg)
2461 {
2462 	rnode4_t *rp;
2463 	int error = 0;
2464 
2465 #ifdef lint
2466 	op = op;
2467 #endif
2468 
2469 	if (vp && vp->v_type == VREG) {
2470 		rp = VTOR4(vp);
2471 
2472 		/*
2473 		 * Take r_deleg_recall_lock in read mode to synchronize
2474 		 * with delegreturn.
2475 		 */
2476 		error = nfs_rw_enter_sig(&rp->r_deleg_recall_lock,
2477 			RW_READER, INTR4(vp));
2478 
2479 		if (error == 0)
2480 			rsp->rs_flags |= flg;
2481 
2482 	}
2483 	return (error);
2484 }
2485 
2486 void
2487 nfs4_end_op_recall(vnode_t *vp1, vnode_t *vp2, nfs4_recov_state_t *rsp)
2488 {
2489 	NFS4_DEBUG(nfs4_recall_debug,
2490 		(CE_NOTE, "nfs4_end_op_recall: 0x%p, 0x%p\n",
2491 		(void *)vp1, (void *)vp2));
2492 
2493 	if (vp2 && rsp->rs_flags & NFS4_RS_RECALL_HELD2)
2494 		nfs_rw_exit(&VTOR4(vp2)->r_deleg_recall_lock);
2495 	if (vp1 && rsp->rs_flags & NFS4_RS_RECALL_HELD1)
2496 		nfs_rw_exit(&VTOR4(vp1)->r_deleg_recall_lock);
2497 }
2498 
2499 int
2500 wait_for_recall(vnode_t *vp1, vnode_t *vp2, nfs4_op_hint_t op,
2501 	nfs4_recov_state_t *rsp)
2502 {
2503 	int error;
2504 
2505 	NFS4_DEBUG(nfs4_recall_debug,
2506 		(CE_NOTE, "wait_for_recall:    0x%p, 0x%p\n",
2507 		(void *)vp1, (void *) vp2));
2508 
2509 	rsp->rs_flags &= ~(NFS4_RS_RECALL_HELD1|NFS4_RS_RECALL_HELD2);
2510 
2511 	if ((error = wait_for_recall1(vp1, op, rsp, NFS4_RS_RECALL_HELD1)) != 0)
2512 		return (error);
2513 
2514 	if ((error = wait_for_recall1(vp2, op, rsp, NFS4_RS_RECALL_HELD2))
2515 	    != 0) {
2516 		if (rsp->rs_flags & NFS4_RS_RECALL_HELD1) {
2517 			nfs_rw_exit(&VTOR4(vp1)->r_deleg_recall_lock);
2518 			rsp->rs_flags &= ~NFS4_RS_RECALL_HELD1;
2519 		}
2520 
2521 		return (error);
2522 	}
2523 
2524 	return (0);
2525 }
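
/*
 * Callers pair wait_for_recall() with nfs4_end_op_recall(); the
 * NFS4_RS_RECALL_HELD[12] bits in rs_flags record exactly which
 * recall locks were taken, so only those are dropped.  Illustrative
 * caller sketch (vp1, vp2 and the op hint are placeholders):
 *
 *	nfs4_recov_state_t rs;
 *
 *	rs.rs_flags = 0;
 *	rs.rs_num_retry_despite_err = 0;
 *	if (wait_for_recall(vp1, vp2, OH_OTHER, &rs) == 0) {
 *		(operate on vp1 and vp2)
 *		nfs4_end_op_recall(vp1, vp2, &rs);
 *	}
 */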
2526 
2527 /*
2528  * nfs4_dlistadd - Add this rnode to a list of rnodes to be
2529  * DELEGRETURN'd at the end of recovery.
2530  */
2531 
2532 static void
2533 nfs4_dlistadd(rnode4_t *rp, struct nfs4_callback_globals *ncg, int flags)
2534 {
2535 	struct nfs4_dnode *dp;
2536 
2537 	ASSERT(mutex_owned(&rp->r_statev4_lock));
2538 	/*
2539 	 * Mark the delegation as having a return pending.
2540 	 * This will prevent the use of the delegation stateID
2541 	 * by read, write, setattr and open.
2542 	 */
2543 	rp->r_deleg_return_pending = TRUE;
2544 	dp = kmem_alloc(sizeof (*dp), KM_SLEEP);
2545 	VN_HOLD(RTOV4(rp));
2546 	dp->rnodep = rp;
2547 	dp->flags = flags;
2548 	mutex_enter(&ncg->nfs4_dlist_lock);
2549 	list_insert_head(&ncg->nfs4_dlist, dp);
2550 #ifdef	DEBUG
2551 	ncg->nfs4_dlistadd_c++;
2552 #endif
2553 	mutex_exit(&ncg->nfs4_dlist_lock);
2554 }
2555 
2556 /*
2557  * nfs4_dlistclean_impl - Do DELEGRETURN for each rnode on the list
2558  * of files awaiting cleaning.  If the override_flags are non-zero,
2559  * then use them rather than the flags that were set when the rnode
2560  * was added to the dlist.
2561  */
2562 static void
2563 nfs4_dlistclean_impl(struct nfs4_callback_globals *ncg, int override_flags)
2564 {
2565 	rnode4_t *rp;
2566 	struct nfs4_dnode *dp;
2567 	int flags;
2568 
2569 	ASSERT(override_flags == 0 || override_flags == NFS4_DR_DISCARD);
2570 
2571 	mutex_enter(&ncg->nfs4_dlist_lock);
2572 	while ((dp = list_head(&ncg->nfs4_dlist)) != NULL) {
2573 #ifdef	DEBUG
2574 		ncg->nfs4_dlistclean_c++;
2575 #endif
2576 		list_remove(&ncg->nfs4_dlist, dp);
2577 		mutex_exit(&ncg->nfs4_dlist_lock);
2578 		rp = dp->rnodep;
2579 		flags = (override_flags != 0) ? override_flags : dp->flags;
2580 		kmem_free(dp, sizeof (*dp));
2581 		(void) nfs4delegreturn_impl(rp, flags, ncg);
2582 		VN_RELE(RTOV4(rp));
2583 		mutex_enter(&ncg->nfs4_dlist_lock);
2584 	}
2585 	mutex_exit(&ncg->nfs4_dlist_lock);
2586 }
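
/*
 * The drain loop above has the usual drop-the-lock-while-working
 * shape: nfs4_dlist_lock protects only head removal, never the
 * per-element work, so nfs4_dlistadd() can run concurrently.
 * Generic form (lst, lock and process() are hypothetical):
 *
 *	mutex_enter(&lock);
 *	while ((dp = list_head(&lst)) != NULL) {
 *		list_remove(&lst, dp);
 *		mutex_exit(&lock);
 *		process(dp);
 *		mutex_enter(&lock);
 *	}
 *	mutex_exit(&lock);
 */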
2587 
2588 void
2589 nfs4_dlistclean(void)
2590 {
2591 	struct nfs4_callback_globals *ncg;
2592 
2593 	ncg = zone_getspecific(nfs4_callback_zone_key, curproc->p_zone);
2594 	ASSERT(ncg != NULL);
2595 
2596 	nfs4_dlistclean_impl(ncg, 0);
2597 }
2598