xref: /titanic_50/usr/src/uts/common/fs/nfs/nfs4_recovery.c (revision c77a61a72b5ecdc507d6cf104142edd371a16c84)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 #pragma ident	"%Z%%M%	%I%	%E% SMI"
27 
28 /*
29  * NFS Version 4 state recovery code.
30  */
31 
32 #include <nfs/nfs4_clnt.h>
33 #include <nfs/nfs4.h>
34 #include <nfs/rnode4.h>
35 #include <sys/cmn_err.h>
36 #include <sys/cred.h>
37 #include <sys/systm.h>
38 #include <sys/flock.h>
39 #include <sys/dnlc.h>
40 #include <sys/ddi.h>
41 #include <sys/disp.h>
42 #include <sys/list.h>
43 #include <sys/sdt.h>
44 
45 extern r4hashq_t *rtable4;
46 
47 /*
48  * Information that describes what needs to be done for recovery.  It is
49  * passed to a client recovery thread as well as passed to various recovery
50  * routines.  rc_mi, rc_vp1, and rc_vp2 refer to the filesystem and
51  * vnode(s) affected by recovery.  rc_vp1 and rc_vp2 are references (use
52  * VN_HOLD) or NULL.  rc_lost_rqst contains information about the lost
53  * lock or open/close request, and it holds reference counts for the
54  * various objects (vnode, etc.).  The recovery thread also uses flags set
55  * in the mntinfo4_t or vnode_t to tell it what to do.  rc_error is used
56  * to save the error that originally triggered the recovery event -- will
57  * later be used to set mi_error if recovery doesn't work.  rc_bseqid_rqst
58  * contains information about the request that got NFS4ERR_BAD_SEQID, and
59  * it holds reference count for the various objects (vnode, open owner,
60  * open stream, lock owner).
61  */
62 
typedef struct {
	mntinfo4_t *rc_mi;		/* filesystem affected by recovery */
	vnode_t *rc_vp1;		/* held (VN_HOLD) vnode, or NULL */
	vnode_t *rc_vp2;		/* held (VN_HOLD) vnode, or NULL */
	nfs4_recov_t rc_action;		/* recovery action to carry out */
	stateid4 rc_stateid;
	bool_t rc_srv_reboot;		/* server has rebooted */
	nfs4_lost_rqst_t *rc_lost_rqst;	/* lost lock or open/close request */
	nfs4_error_t rc_orig_errors;	/* original errors causing recovery */
	int rc_error;			/* error that triggered recovery */
	nfs4_bseqid_entry_t *rc_bseqid_rqst; /* rqst that got BAD_SEQID */
} recov_info_t;
75 
/*
 * How long to wait before trying again if there is an error doing
 * recovery, in seconds.
 */

static int recov_err_delay = 1;

/*
 * How long to wait when processing NFS4ERR_GRACE or NFS4ERR_DELAY
 * errors.  Expressed in seconds.  The default is defined as
 * NFS4ERR_DELAY_TIME, and this variable is initialized in
 * nfs4_subr_init().
 */
time_t nfs4err_delay_time = 0;

/*
 * Tuneable to limit how many times "exempt" ops go OTW (over the wire)
 * after a recovery error.  Exempt op hints are OH_CLOSE,
 * OH_LOCKU, OH_DELEGRETURN.  These previously always went
 * OTW even after the rnode was "dead" due to recovery errors.
 *
 * The tuneable below limits the number of times an nfs4_start_fop
 * invocation will retry the exempt hints.  After the limit
 * is reached, nfs4_start_fop will return an error just like
 * it would for non-exempt op hints.
 */
int nfs4_max_recov_error_retry = 3;

/*
 * Number of seconds the recovery thread should pause before retry when the
 * filesystem has been forcibly unmounted.
 */

int nfs4_unmount_delay = 1;

#ifdef DEBUG

/*
 * How long to wait (in seconds) between recovery operations on a given
 * file.  Normally zero, but could be set longer for testing purposes.
 */
static int nfs4_recovdelay = 0;

/*
 * Switch that controls whether to go into the debugger when recovery
 * fails.
 */
static int nfs4_fail_recov_stop = 0;

/*
 * Tuneables to debug client namespace interaction with server
 * mount points:
 *
 *	nfs4_srvmnt_fail_cnt:
 *		number of times EACCES was returned because the client
 *		attempted to cross a server mountpoint
 *
 *	nfs4_srvmnt_debug:
 *		trigger a console printf whenever the client attempts
 *		to cross a server mountpoint
 */
int nfs4_srvmnt_fail_cnt = 0;
int nfs4_srvmnt_debug = 0;
#endif
139 
/* forward references, in roughly alphabetic order */
141 static void close_after_open_resend(vnode_t *, cred_t *, uint32_t,
142 	nfs4_error_t *);
143 static void errs_to_action(recov_info_t *,
144 	nfs4_server_t *, mntinfo4_t *, stateid4 *, nfs4_lost_rqst_t *, int,
145 	nfs_opnum4, nfs4_bseqid_entry_t *);
146 static void flush_reinstate(nfs4_lost_rqst_t *);
147 static void free_milist(mntinfo4_t **, int);
148 static mntinfo4_t **make_milist(nfs4_server_t *, int *);
149 static int nfs4_check_recov_err(vnode_t *, nfs4_op_hint_t,
150 	nfs4_recov_state_t *, int, char *);
151 static int nfs4_check_srvstub(vnode_t *vp, rnode4_t *rp, nfs4_op_hint_t op);
152 static char *nfs4_getsrvnames(mntinfo4_t *, size_t *);
153 static void nfs4_recov_fh_fail(vnode_t *, int, nfsstat4);
154 static void nfs4_recov_thread(recov_info_t *);
155 static void nfs4_remove_lost_rqsts(mntinfo4_t *, nfs4_server_t *);
156 static void nfs4_resend_lost_rqsts(recov_info_t *, nfs4_server_t *);
157 static cred_t *pid_to_cr(pid_t);
158 static void reclaim_one_lock(vnode_t *, flock64_t *, nfs4_error_t *, int *);
159 static void recov_bad_seqid(recov_info_t *);
160 static void recov_badstate(recov_info_t *, vnode_t *, nfsstat4);
161 static void recov_clientid(recov_info_t *, nfs4_server_t *);
162 static void recov_done(mntinfo4_t *, recov_info_t *);
163 static void recov_filehandle(nfs4_recov_t, mntinfo4_t *, vnode_t *);
164 static void recov_newserver(recov_info_t *, nfs4_server_t **, bool_t *);
165 static void recov_openfiles(recov_info_t *, nfs4_server_t *);
166 static void recov_stale(mntinfo4_t *, vnode_t *);
167 static void nfs4_free_lost_rqst(nfs4_lost_rqst_t *, nfs4_server_t *);
168 static void recov_throttle(recov_info_t *, vnode_t *);
169 static void relock_skip_pid(locklist_t *, pid_t);
170 static void resend_lock(nfs4_lost_rqst_t *, nfs4_error_t *);
171 static void resend_one_op(nfs4_lost_rqst_t *, nfs4_error_t *, mntinfo4_t *,
172 	nfs4_server_t *);
173 static void save_bseqid_rqst(nfs4_bseqid_entry_t *, recov_info_t *);
174 static void start_recovery(recov_info_t *, mntinfo4_t *, vnode_t *, vnode_t *,
175 	nfs4_server_t *);
176 static void start_recovery_action(nfs4_recov_t, bool_t, mntinfo4_t *, vnode_t *,
177 	vnode_t *);
178 static int wait_for_recovery(mntinfo4_t *, nfs4_op_hint_t);
179 
180 /*
181  * Return non-zero if the given errno, status, and rpc status codes
182  * in the nfs4_error_t indicate that client recovery is needed.
183  * "stateful" indicates whether the call that got the error establishes or
184  * removes state on the server (open, close, lock, unlock, delegreturn).
185  */
186 
187 int
188 nfs4_needs_recovery(nfs4_error_t *ep, bool_t stateful, vfs_t *vfsp)
189 {
190 	int recov = 0;
191 	mntinfo4_t *mi;
192 
193 	/*
194 	 * Try failover if the error values justify it and if
195 	 * it's a failover mount.  Don't try if the mount is in
196 	 * progress, failures are handled explicitly by nfs4rootvp.
197 	 */
198 	if (nfs4_try_failover(ep)) {
199 		mi = VFTOMI4(vfsp);
200 		mutex_enter(&mi->mi_lock);
201 		recov = FAILOVER_MOUNT4(mi) && !(mi->mi_flags & MI4_MOUNTING);
202 		mutex_exit(&mi->mi_lock);
203 		if (recov)
204 			return (recov);
205 	}
206 
207 	if (ep->error == EINTR || NFS4_FRC_UNMT_ERR(ep->error, vfsp)) {
208 		/*
209 		 * The server may have gotten the request, so for stateful
210 		 * ops we need to resynchronize and possibly back out the
211 		 * op.
212 		 */
213 		return (stateful);
214 	}
215 	if (ep->error != 0)
216 		return (0);
217 
218 	/* stat values are listed alphabetically */
219 	/*
220 	 * There are two lists here: the errors for which we have code, and
221 	 * the errors for which we plan to have code before FCS.  For the
222 	 * second list, print a warning message but don't attempt recovery.
223 	 */
224 	switch (ep->stat) {
225 	case NFS4ERR_BADHANDLE:
226 	case NFS4ERR_BAD_SEQID:
227 	case NFS4ERR_BAD_STATEID:
228 	case NFS4ERR_DELAY:
229 	case NFS4ERR_EXPIRED:
230 	case NFS4ERR_FHEXPIRED:
231 	case NFS4ERR_GRACE:
232 	case NFS4ERR_OLD_STATEID:
233 	case NFS4ERR_RESOURCE:
234 	case NFS4ERR_STALE_CLIENTID:
235 	case NFS4ERR_STALE_STATEID:
236 	case NFS4ERR_WRONGSEC:
237 	case NFS4ERR_STALE:
238 		recov = 1;
239 		break;
240 #ifdef DEBUG
241 	case NFS4ERR_LEASE_MOVED:
242 	case NFS4ERR_MOVED:
243 		zcmn_err(VFTOMI4(vfsp)->mi_zone->zone_id,
244 		    CE_WARN, "!Can't yet recover from NFS status %d",
245 				ep->stat);
246 		break;
247 #endif
248 	}
249 
250 	return (recov);
251 }
252 
253 /*
254  * Some operations such as DELEGRETURN want to avoid invoking
255  * recovery actions that will only mark the file dead.  If
256  * better handlers are invoked for any of these errors, this
257  * routine should be modified.
258  */
259 int
260 nfs4_recov_marks_dead(nfsstat4 status)
261 {
262 	if (status == NFS4ERR_BAD_SEQID ||
263 	    status == NFS4ERR_EXPIRED ||
264 	    status == NFS4ERR_BAD_STATEID ||
265 	    status == NFS4ERR_OLD_STATEID)
266 		return (1);
267 	return (0);
268 }
269 
270 /*
271  * Transfer the state recovery information in recovp to mi's resend queue,
272  * and mark mi as having a lost state request.
273  */
274 static void
275 nfs4_enqueue_lost_rqst(recov_info_t *recovp, mntinfo4_t *mi)
276 {
277 	nfs4_lost_rqst_t *lrp = recovp->rc_lost_rqst;
278 
279 	ASSERT(nfs_rw_lock_held(&mi->mi_recovlock, RW_READER) ||
280 	    nfs_rw_lock_held(&mi->mi_recovlock, RW_WRITER));
281 
282 	ASSERT(lrp != NULL && lrp->lr_op != 0);
283 
284 	NFS4_DEBUG(nfs4_lost_rqst_debug, (CE_NOTE,
285 		"nfs4_enqueue_lost_rqst %p, op %d",
286 		(void *)lrp, lrp->lr_op));
287 
288 	mutex_enter(&mi->mi_lock);
289 	mi->mi_recovflags |= MI4R_LOST_STATE;
290 	if (lrp->lr_putfirst)
291 		list_insert_head(&mi->mi_lost_state, lrp);
292 	else
293 		list_insert_tail(&mi->mi_lost_state, lrp);
294 	recovp->rc_lost_rqst = NULL;
295 	mutex_exit(&mi->mi_lock);
296 
297 	nfs4_queue_event(RE_LOST_STATE, mi, NULL, lrp->lr_op, lrp->lr_vp,
298 		lrp->lr_dvp, 0, NULL, 0, TAG_NONE, TAG_NONE, 0, 0);
299 }
300 
301 /*
302  * Transfer the bad seqid recovery information in recovp to mi's
303  * bad seqid queue, and mark mi as having a bad seqid request.
304  */
305 void
306 enqueue_bseqid_rqst(recov_info_t *recovp, mntinfo4_t *mi)
307 {
308 	ASSERT(nfs_rw_lock_held(&mi->mi_recovlock, RW_READER) ||
309 	    nfs_rw_lock_held(&mi->mi_recovlock, RW_WRITER));
310 	ASSERT(recovp->rc_bseqid_rqst != NULL);
311 
312 	mutex_enter(&mi->mi_lock);
313 	mi->mi_recovflags |= MI4R_BAD_SEQID;
314 	list_insert_tail(&mi->mi_bseqid_list, recovp->rc_bseqid_rqst);
315 	recovp->rc_bseqid_rqst = NULL;
316 	mutex_exit(&mi->mi_lock);
317 }
318 
319 /*
320  * Initiate recovery.
321  *
322  * The nfs4_error_t contains the return codes that triggered a recovery
323  * attempt.  mi, vp1, and vp2 refer to the filesystem and files that were
324  * being operated on.  vp1 and vp2 may be NULL.
325  *
326  * Multiple calls are okay.  If recovery is already underway, the call
327  * updates the information about what state needs recovery but does not
328  * start a new thread.  The caller should hold mi->mi_recovlock as a reader
329  * for proper synchronization with any recovery thread.
330  *
331  * This will return TRUE if recovery was aborted, and FALSE otherwise.
332  */
333 bool_t
334 nfs4_start_recovery(nfs4_error_t *ep, mntinfo4_t *mi, vnode_t *vp1,
335     vnode_t *vp2, stateid4 *sid, nfs4_lost_rqst_t *lost_rqstp, nfs_opnum4 op,
336     nfs4_bseqid_entry_t *bsep)
337 {
338 	recov_info_t *recovp;
339 	nfs4_server_t *sp;
340 	bool_t abort = FALSE;
341 	bool_t gone = FALSE;
342 
343 	ASSERT(nfs_zone() == mi->mi_zone);
344 	mutex_enter(&mi->mi_lock);
345 	/*
346 	 * If there is lost state, we need to kick off recovery even if the
347 	 * filesystem has been unmounted or the zone is shutting down.
348 	 */
349 	gone = FS_OR_ZONE_GONE4(mi->mi_vfsp);
350 	if (gone) {
351 		ASSERT(ep->error != EINTR || lost_rqstp != NULL);
352 		if (ep->error == EIO && lost_rqstp == NULL) {
353 			/* failed due to forced unmount, no new lost state */
354 			abort = TRUE;
355 		}
356 		if ((ep->error == 0 || ep->error == ETIMEDOUT) &&
357 		    !(mi->mi_recovflags & MI4R_LOST_STATE)) {
358 			/* some other failure, no existing lost state */
359 			abort = TRUE;
360 		}
361 		if (abort) {
362 			mutex_exit(&mi->mi_lock);
363 			NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
364 				    "nfs4_start_recovery: fs unmounted"));
365 			return (TRUE);
366 		}
367 	}
368 	mi->mi_in_recovery++;
369 	mutex_exit(&mi->mi_lock);
370 
371 	recovp = kmem_alloc(sizeof (recov_info_t), KM_SLEEP);
372 	recovp->rc_orig_errors = *ep;
373 	sp = find_nfs4_server(mi);
374 	errs_to_action(recovp, sp, mi, sid, lost_rqstp,
375 		gone, op, bsep);
376 	if (sp != NULL)
377 		mutex_exit(&sp->s_lock);
378 	start_recovery(recovp, mi, vp1, vp2, sp);
379 	if (sp != NULL)
380 		nfs4_server_rele(sp);
381 	return (FALSE);
382 }
383 
384 /*
385  * Internal version of nfs4_start_recovery.  The difference is that the
386  * caller specifies the recovery action, rather than the errors leading to
387  * recovery.
388  */
389 static void
390 start_recovery_action(nfs4_recov_t what, bool_t reboot, mntinfo4_t *mi,
391 	vnode_t *vp1, vnode_t *vp2)
392 {
393 	recov_info_t *recovp;
394 
395 	ASSERT(nfs_zone() == mi->mi_zone);
396 	mutex_enter(&mi->mi_lock);
397 	mi->mi_in_recovery++;
398 	mutex_exit(&mi->mi_lock);
399 
400 	recovp = kmem_zalloc(sizeof (recov_info_t), KM_SLEEP);
401 	recovp->rc_action = what;
402 	recovp->rc_srv_reboot = reboot;
403 	recovp->rc_error = EIO;
404 	start_recovery(recovp, mi, vp1, vp2, NULL);
405 }
406 
407 static void
408 start_recovery(recov_info_t *recovp, mntinfo4_t *mi,
409 	vnode_t *vp1, vnode_t *vp2, nfs4_server_t *sp)
410 {
411 	NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
412 		"start_recovery: mi %p, what %s", (void*)mi,
413 		nfs4_recov_action_to_str(recovp->rc_action)));
414 
415 	/*
416 	 * Bump the reference on the vfs so that we can pass it to the
417 	 * recovery thread.
418 	 */
419 	VFS_HOLD(mi->mi_vfsp);
420 	MI4_HOLD(mi);
421 again:
422 	switch (recovp->rc_action) {
423 	case NR_FAILOVER:
424 		ASSERT(nfs_rw_lock_held(&mi->mi_recovlock, RW_READER) ||
425 		    nfs_rw_lock_held(&mi->mi_recovlock, RW_WRITER));
426 		if (mi->mi_servers->sv_next == NULL)
427 			goto out_no_thread;
428 		mutex_enter(&mi->mi_lock);
429 		mi->mi_recovflags |= MI4R_NEED_NEW_SERVER;
430 		mutex_exit(&mi->mi_lock);
431 
432 		if (recovp->rc_lost_rqst != NULL)
433 			nfs4_enqueue_lost_rqst(recovp, mi);
434 		break;
435 
436 	case NR_CLIENTID:
437 		/*
438 		 * If the filesystem has been unmounted, punt.
439 		 */
440 		if (sp == NULL)
441 			goto out_no_thread;
442 
443 		/*
444 		 * If nobody else is working on the clientid, mark the
445 		 * clientid as being no longer set.  Then mark the specific
446 		 * filesystem being worked on.
447 		 */
448 		if (!nfs4_server_in_recovery(sp)) {
449 			mutex_enter(&sp->s_lock);
450 			sp->s_flags &= ~N4S_CLIENTID_SET;
451 			mutex_exit(&sp->s_lock);
452 		}
453 		ASSERT(nfs_rw_lock_held(&mi->mi_recovlock, RW_READER) ||
454 		    nfs_rw_lock_held(&mi->mi_recovlock, RW_WRITER));
455 		mutex_enter(&mi->mi_lock);
456 		mi->mi_recovflags |= MI4R_NEED_CLIENTID;
457 		if (recovp->rc_srv_reboot)
458 			mi->mi_recovflags |= MI4R_SRV_REBOOT;
459 		mutex_exit(&mi->mi_lock);
460 		break;
461 
462 	case NR_OPENFILES:
463 		ASSERT(nfs_rw_lock_held(&mi->mi_recovlock, RW_READER) ||
464 		    nfs_rw_lock_held(&mi->mi_recovlock, RW_WRITER));
465 		mutex_enter(&mi->mi_lock);
466 		mi->mi_recovflags |= MI4R_REOPEN_FILES;
467 		if (recovp->rc_srv_reboot)
468 			mi->mi_recovflags |= MI4R_SRV_REBOOT;
469 		mutex_exit(&mi->mi_lock);
470 		break;
471 
472 	case NR_WRONGSEC:
473 		ASSERT(nfs_rw_lock_held(&mi->mi_recovlock, RW_READER) ||
474 		    nfs_rw_lock_held(&mi->mi_recovlock, RW_WRITER));
475 		mutex_enter(&mi->mi_lock);
476 		mi->mi_recovflags |= MI4R_NEED_SECINFO;
477 		mutex_exit(&mi->mi_lock);
478 		break;
479 
480 	case NR_EXPIRED:
481 		if (vp1 != NULL)
482 			recov_badstate(recovp, vp1, NFS4ERR_EXPIRED);
483 		if (vp2 != NULL)
484 			recov_badstate(recovp, vp2, NFS4ERR_EXPIRED);
485 		goto out_no_thread;	/* no further recovery possible */
486 
487 	case NR_BAD_STATEID:
488 		if (vp1 != NULL)
489 			recov_badstate(recovp, vp1, NFS4ERR_BAD_STATEID);
490 		if (vp2 != NULL)
491 			recov_badstate(recovp, vp2, NFS4ERR_BAD_STATEID);
492 		goto out_no_thread;	/* no further recovery possible */
493 
494 	case NR_FHEXPIRED:
495 	case NR_BADHANDLE:
496 		if (vp1 != NULL)
497 			recov_throttle(recovp, vp1);
498 		if (vp2 != NULL)
499 			recov_throttle(recovp, vp2);
500 		/*
501 		 * Recover the filehandle now, rather than using a
502 		 * separate thread.  We can do this because filehandle
503 		 * recovery is independent of any other state, and because
504 		 * we know that we are not competing with the recovery
505 		 * thread at this time.  recov_filehandle will deal with
506 		 * threads that are competing to recover this filehandle.
507 		 */
508 		ASSERT(nfs_rw_lock_held(&mi->mi_recovlock, RW_READER) ||
509 		    nfs_rw_lock_held(&mi->mi_recovlock, RW_WRITER));
510 		if (vp1 != NULL)
511 			recov_filehandle(recovp->rc_action, mi, vp1);
512 		if (vp2 != NULL)
513 			recov_filehandle(recovp->rc_action, mi, vp2);
514 		goto out_no_thread;	/* no further recovery needed */
515 
516 	case NR_STALE:
517 		/*
518 		 * NFS4ERR_STALE handling
519 		 * recov_stale() could set MI4R_NEED_NEW_SERVER to
520 		 * indicate that we can and should failover.
521 		 */
522 		ASSERT(nfs_rw_lock_held(&mi->mi_recovlock, RW_READER) ||
523 		    nfs_rw_lock_held(&mi->mi_recovlock, RW_WRITER));
524 
525 		if (vp1 != NULL)
526 			recov_stale(mi, vp1);
527 		if (vp2 != NULL)
528 			recov_stale(mi, vp2);
529 		mutex_enter(&mi->mi_lock);
530 		if ((mi->mi_recovflags & MI4R_NEED_NEW_SERVER) == 0) {
531 			mutex_exit(&mi->mi_lock);
532 			goto out_no_thread;
533 		}
534 		mutex_exit(&mi->mi_lock);
535 		recovp->rc_action = NR_FAILOVER;
536 		goto again;
537 
538 	case NR_BAD_SEQID:
539 		if (recovp->rc_bseqid_rqst) {
540 			enqueue_bseqid_rqst(recovp, mi);
541 			break;
542 		}
543 
544 		if (vp1 != NULL)
545 			recov_badstate(recovp, vp1, NFS4ERR_BAD_SEQID);
546 		if (vp2 != NULL)
547 			recov_badstate(recovp, vp2, NFS4ERR_BAD_SEQID);
548 		goto out_no_thread; /* no further recovery possible */
549 
550 	case NR_OLDSTATEID:
551 		if (vp1 != NULL)
552 			recov_badstate(recovp, vp1, NFS4ERR_OLD_STATEID);
553 		if (vp2 != NULL)
554 			recov_badstate(recovp, vp2, NFS4ERR_OLD_STATEID);
555 		goto out_no_thread;	/* no further recovery possible */
556 
557 	case NR_GRACE:
558 		nfs4_set_grace_wait(mi);
559 		goto out_no_thread; /* no further action required for GRACE */
560 
561 	case NR_DELAY:
562 		if (vp1)
563 			nfs4_set_delay_wait(vp1);
564 		goto out_no_thread; /* no further action required for DELAY */
565 
566 	case NR_LOST_STATE_RQST:
567 	case NR_LOST_LOCK:
568 		nfs4_enqueue_lost_rqst(recovp, mi);
569 		break;
570 
571 	default:
572 		nfs4_queue_event(RE_UNEXPECTED_ACTION, mi, NULL,
573 		    recovp->rc_action, NULL, NULL, 0, NULL, 0, TAG_NONE,
574 		    TAG_NONE, 0, 0);
575 		goto out_no_thread;
576 	}
577 
578 	/*
579 	 * If either file recently went through the same recovery, wait
580 	 * awhile.  This is in case there is some sort of bug; we might not
581 	 * be able to recover properly, but at least we won't bombard the
582 	 * server with calls, and we won't tie up the client.
583 	 */
584 	if (vp1 != NULL)
585 		recov_throttle(recovp, vp1);
586 	if (vp2 != NULL)
587 		recov_throttle(recovp, vp2);
588 
589 	/*
590 	 * If there's already a recovery thread, don't start another one.
591 	 */
592 
593 	mutex_enter(&mi->mi_lock);
594 	if (mi->mi_flags & MI4_RECOV_ACTIV) {
595 		mutex_exit(&mi->mi_lock);
596 		goto out_no_thread;
597 	}
598 	mi->mi_flags |= MI4_RECOV_ACTIV;
599 	mutex_exit(&mi->mi_lock);
600 	NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
601 		"start_recovery: starting new thread for mi %p", (void*)mi));
602 
603 	recovp->rc_mi = mi;
604 	recovp->rc_vp1 = vp1;
605 	if (vp1 != NULL) {
606 		ASSERT(VTOMI4(vp1) == mi);
607 		VN_HOLD(recovp->rc_vp1);
608 	}
609 	recovp->rc_vp2 = vp2;
610 	if (vp2 != NULL) {
611 		ASSERT(VTOMI4(vp2) == mi);
612 		VN_HOLD(recovp->rc_vp2);
613 	}
614 
615 	(void) zthread_create(NULL, 0, nfs4_recov_thread, recovp, 0,
616 			    minclsyspri);
617 	return;
618 
619 	/* not reached by thread creating call */
620 out_no_thread:
621 	mutex_enter(&mi->mi_lock);
622 	mi->mi_in_recovery--;
623 	if (mi->mi_in_recovery == 0)
624 		cv_broadcast(&mi->mi_cv_in_recov);
625 	mutex_exit(&mi->mi_lock);
626 
627 	VFS_RELE(mi->mi_vfsp);
628 	MI4_RELE(mi);
629 	/*
630 	 * Free up resources that were allocated for us.
631 	 */
632 	kmem_free(recovp, sizeof (recov_info_t));
633 }
634 
635 static int
636 nfs4_check_srvstub(vnode_t *vp, rnode4_t *rp, nfs4_op_hint_t op)
637 {
638 	int err = 0;
639 
640 	/*
641 	 * If tuneable does not allow client to cross srv mountpoints and
642 	 * object is a stub, then check check op hint and return EACCES for
643 	 * any hint other than access, rddir, getattr, lookup.
644 	 */
645 	if (rp->r_flags & R4SRVSTUB && op != OH_ACCESS && op != OH_GETACL &&
646 	    op != OH_GETATTR && op != OH_READDIR && op != OH_LOOKUP) {
647 		err = EACCES;
648 #ifdef DEBUG
649 		NFS4_DEBUG(nfs4_srvmnt_debug, (CE_NOTE,
650 			"nfs4_check_srvstub: op=%d err=%d rp=%p vp=%p\n"
651 			"va_nod=%llx r_mntd_fid=%llx\n"
652 			"sv_fsid=(%llx:%llx) r_srv_fsid=(%llx:%llx)",
653 			op, err, (void *)rp, (void *)vp,
654 			(u_longlong_t)rp->r_attr.va_nodeid,
655 			(u_longlong_t)rp->r_mntd_fid,
656 			(u_longlong_t)rp->r_server->sv_fsid.major,
657 			(u_longlong_t)rp->r_server->sv_fsid.minor,
658 			(u_longlong_t)rp->r_srv_fsid.major,
659 			(u_longlong_t)rp->r_srv_fsid.minor));
660 #endif
661 	}
662 
663 	return (err);
664 }
665 
666 static int
667 nfs4_check_recov_err(vnode_t *vp, nfs4_op_hint_t op,
668 			nfs4_recov_state_t *rsp, int retry_err_cnt, char *str)
669 {
670 	rnode4_t *rp;
671 	int error = 0;
672 	int exempt;
673 
674 	if (vp == NULL)
675 		return (0);
676 
677 	exempt = (op == OH_CLOSE || op == OH_LOCKU || op == OH_DELEGRETURN);
678 	rp = VTOR4(vp);
679 	mutex_enter(&rp->r_statelock);
680 
681 	/*
682 	 * If there was a recovery error, then allow op hints "exempt" from
683 	 * recov errors to retry (currently 3 times).  Either r_error or
684 	 * EIO is returned for non-exempt op hints.
685 	 *
686 	 *	Error heirarchy:
687 	 *	a) check for R4ERECOVERR
688 	 *	b) check for R4SRVSTUB (only if R4RECOVERR is not set).
689 	 */
690 	if (rp->r_flags & R4RECOVERR) {
691 		if (exempt && rsp->rs_num_retry_despite_err <=
692 				nfs4_max_recov_error_retry) {
693 
694 			/*
695 			 * Check to make sure that we haven't already inc'd
696 			 * rs_num_retry_despite_err for current nfs4_start_fop
697 			 * instance.  We don't want to double inc (if we were
698 			 * called with vp2, then the vp1 call could have
699 			 * already incremented.
700 			 */
701 			if (retry_err_cnt == rsp->rs_num_retry_despite_err)
702 				rsp->rs_num_retry_despite_err++;
703 
704 			NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
705 				"nfs4_start_fop: %s %p DEAD, cnt=%d", str,
706 				(void *)vp, rsp->rs_num_retry_despite_err));
707 		} else {
708 			error = (rp->r_error ? rp->r_error : EIO);
709 			/*
710 			 * An ESTALE error on a non-regular file is not
711 			 * "sticky".  Return the ESTALE error once, but
712 			 * clear the condition to allow future operations
713 			 * to go OTW.  This will allow the client to
714 			 * recover if the server has merely unshared then
715 			 * re-shared the file system.  For regular files,
716 			 * the unshare has destroyed the open state at the
717 			 * server and we aren't willing to do a reopen (yet).
718 			 */
719 			if (error == ESTALE && vp->v_type != VREG) {
720 				rp->r_flags &=
721 					~(R4RECOVERR|R4RECOVERRP|R4STALE);
722 				rp->r_error = 0;
723 				error = ESTALE;
724 			}
725 			NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
726 				"nfs4_start_fop: %s %p DEAD, cnt=%d error=%d",
727 				str, (void *)vp,
728 				rsp->rs_num_retry_despite_err, error));
729 		}
730 	} else {
731 		error = nfs4_check_srvstub(vp, rp, op);
732 		NFS4_DEBUG(nfs4_client_recov_stub_debug, (CE_NOTE,
733 			"nfs4_start_fop: %s %p SRVSTUB, error=%d", str,
734 			(void *)vp, error));
735 	}
736 	mutex_exit(&rp->r_statelock);
737 	return (error);
738 }
739 
740 /*
741  * Initial setup code that every operation should call if it might invoke
742  * client recovery.  Can block waiting for recovery to finish on a
743  * filesystem.  Either vnode ptr can be NULL.
744  *
745  * Returns 0 if there are no outstanding errors.  Can return an
746  * errno value under various circumstances (e.g., failed recovery, or
747  * interrupted while waiting for recovery to finish).
748  *
749  * There must be a corresponding call to nfs4_end_op() to free up any locks
750  * or resources allocated by this call (assuming this call succeeded),
751  * using the same rsp that's passed in here.
752  *
753  * The open and lock seqid synchronization must be stopped before calling this
754  * function, as it could lead to deadlock when trying to reopen a file or
755  * reclaim a lock.  The synchronization is obtained with calls to:
756  *   nfs4_start_open_seqid_sync()
757  *   nfs4_start_lock_seqid_sync()
758  *
759  * *startrecovp is set TRUE if the caller should not bother with the
760  * over-the-wire call, and just initiate recovery for the given request.
761  * This is typically used for state-releasing ops if the filesystem has
762  * been forcibly unmounted.  startrecovp may be NULL for
763  * non-state-releasing ops.
764  */
765 
766 int
767 nfs4_start_fop(mntinfo4_t *mi, vnode_t *vp1, vnode_t *vp2, nfs4_op_hint_t op,
768 		nfs4_recov_state_t *rsp, bool_t *startrecovp)
769 {
770 	int error = 0, rerr_cnt;
771 	nfs4_server_t *sp = NULL;
772 	nfs4_server_t *tsp;
773 	nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };
774 	time_t droplock_time;
775 #ifdef DEBUG
776 	void *fop_caller;
777 #endif
778 
779 	ASSERT(vp1 == NULL || vp1->v_vfsp == mi->mi_vfsp);
780 	ASSERT(vp2 == NULL || vp2->v_vfsp == mi->mi_vfsp);
781 
782 #ifdef	DEBUG
783 	if ((fop_caller = tsd_get(nfs4_tsd_key)) != NULL) {
784 		cmn_err(CE_PANIC, "Missing nfs4_end_fop: last caller %p",
785 			fop_caller);
786 	}
787 	(void) tsd_set(nfs4_tsd_key, caller());
788 #endif
789 
790 	rsp->rs_sp = NULL;
791 	rsp->rs_flags &= ~NFS4_RS_RENAME_HELD;
792 	rerr_cnt = rsp->rs_num_retry_despite_err;
793 
794 	/*
795 	 * Process the items that may delay() based on server response
796 	 */
797 	error = nfs4_wait_for_grace(mi, rsp);
798 	if (error)
799 		goto out;
800 
801 	if (vp1 != NULL) {
802 		error = nfs4_wait_for_delay(vp1, rsp);
803 		if (error)
804 			goto out;
805 	}
806 
807 	/* Wait for a delegation recall to complete. */
808 
809 	error = wait_for_recall(vp1, vp2, op, rsp);
810 	if (error)
811 		goto out;
812 
813 	/*
814 	 * Wait for any current recovery actions to finish.  Note that a
815 	 * recovery thread can still start up after wait_for_recovery()
816 	 * finishes.  We don't block out recovery operations until we
817 	 * acquire s_recovlock and mi_recovlock.
818 	 */
819 	error = wait_for_recovery(mi, op);
820 	if (error)
821 		goto out;
822 
823 	/*
824 	 * Check to see if the rnode is already marked with a
825 	 * recovery error.  If so, return it immediately.  But
826 	 * always pass CLOSE, LOCKU, and DELEGRETURN so we can
827 	 * clean up state on the server.
828 	 */
829 
830 	if (vp1 != NULL) {
831 		if (error = nfs4_check_recov_err(vp1, op, rsp, rerr_cnt, "vp1"))
832 			goto out;
833 		nfs4_check_remap(mi, vp1, NFS4_REMAP_CKATTRS, &e);
834 	}
835 
836 	if (vp2 != NULL) {
837 		if (error = nfs4_check_recov_err(vp2, op, rsp, rerr_cnt, "vp2"))
838 			goto out;
839 		nfs4_check_remap(mi, vp2, NFS4_REMAP_CKATTRS, &e);
840 	}
841 
842 	/*
843 	 * The lock order calls for us to acquire s_recovlock before
844 	 * mi_recovlock, but we have to hold mi_recovlock to look up sp (to
845 	 * prevent races with the failover/migration code).  So acquire
846 	 * mi_recovlock, look up sp, drop mi_recovlock, acquire
847 	 * s_recovlock and mi_recovlock, then verify that sp is still the
848 	 * right object.  XXX Can we find a simpler way to deal with this?
849 	 */
850 	if (nfs_rw_enter_sig(&mi->mi_recovlock, RW_READER,
851 	    mi->mi_flags & MI4_INT)) {
852 		error = EINTR;
853 		goto out;
854 	}
855 get_sp:
856 	sp = find_nfs4_server(mi);
857 	if (sp != NULL) {
858 		sp->s_otw_call_count++;
859 		mutex_exit(&sp->s_lock);
860 		droplock_time = gethrestime_sec();
861 	}
862 	nfs_rw_exit(&mi->mi_recovlock);
863 
864 	if (sp != NULL) {
865 		if (nfs_rw_enter_sig(&sp->s_recovlock, RW_READER,
866 			    mi->mi_flags & MI4_INT)) {
867 			error = EINTR;
868 			goto out;
869 		}
870 	}
871 	if (nfs_rw_enter_sig(&mi->mi_recovlock, RW_READER,
872 			    mi->mi_flags & MI4_INT)) {
873 		if (sp != NULL)
874 			nfs_rw_exit(&sp->s_recovlock);
875 		error = EINTR;
876 		goto out;
877 	}
878 	/*
879 	 * If the mntinfo4_t hasn't changed nfs4_sever_ts then
880 	 * there's no point in double checking to make sure it
881 	 * has switched.
882 	 */
883 	if (sp == NULL || droplock_time < mi->mi_srvsettime) {
884 		tsp = find_nfs4_server(mi);
885 		if (tsp != sp) {
886 			/* try again */
887 			if (tsp != NULL) {
888 				mutex_exit(&tsp->s_lock);
889 				nfs4_server_rele(tsp);
890 				tsp = NULL;
891 			}
892 			if (sp != NULL) {
893 				nfs_rw_exit(&sp->s_recovlock);
894 				mutex_enter(&sp->s_lock);
895 				sp->s_otw_call_count--;
896 				mutex_exit(&sp->s_lock);
897 				nfs4_server_rele(sp);
898 				sp = NULL;
899 			}
900 			goto get_sp;
901 		} else {
902 			if (tsp != NULL) {
903 				mutex_exit(&tsp->s_lock);
904 				nfs4_server_rele(tsp);
905 				tsp = NULL;
906 			}
907 		}
908 	}
909 
910 	if (sp != NULL) {
911 		rsp->rs_sp = sp;
912 	}
913 
914 	/*
915 	 * If the fileystem uses volatile filehandles, obtain a lock so
916 	 * that we synchronize with renames.  Exception: mount operations
917 	 * can change mi_fh_expire_type, which could be a problem, since
918 	 * the end_op code needs to be consistent with the start_op code
919 	 * about mi_rename_lock.  Since mounts don't compete with renames,
920 	 * it's simpler to just not acquire the rename lock for mounts.
921 	 */
922 	if (NFS4_VOLATILE_FH(mi) && op != OH_MOUNT) {
923 		if (nfs_rw_enter_sig(&mi->mi_rename_lock,
924 				    op == OH_VFH_RENAME ? RW_WRITER : RW_READER,
925 				    mi->mi_flags & MI4_INT)) {
926 			nfs_rw_exit(&mi->mi_recovlock);
927 			if (sp != NULL)
928 				nfs_rw_exit(&sp->s_recovlock);
929 			error = EINTR;
930 			goto out;
931 		}
932 		rsp->rs_flags |= NFS4_RS_RENAME_HELD;
933 	}
934 
935 	if (OH_IS_STATE_RELE(op)) {
936 		/*
937 		 * For forced unmount, letting the request proceed will
938 		 * almost always delay response to the user, so hand it off
939 		 * to the recovery thread.  For exiting lwp's, we don't
940 		 * have a good way to tell if the request will hang.  We
941 		 * generally want processes to handle their own requests so
942 		 * that they can be done in parallel, but if there is
943 		 * already a recovery thread, hand the request off to it.
944 		 * This will improve user response at no cost to overall
945 		 * system throughput.  For zone shutdown, we'd prefer
946 		 * the recovery thread to handle this as well.
947 		 */
948 		ASSERT(startrecovp != NULL);
949 		mutex_enter(&mi->mi_lock);
950 		if (FS_OR_ZONE_GONE4(mi->mi_vfsp))
951 			*startrecovp = TRUE;
952 		else if ((curthread->t_proc_flag & TP_LWPEXIT) &&
953 		    (mi->mi_flags & MI4_RECOV_ACTIV))
954 			*startrecovp = TRUE;
955 		else
956 			*startrecovp = FALSE;
957 		mutex_exit(&mi->mi_lock);
958 	} else
959 		if (startrecovp != NULL)
960 			*startrecovp = FALSE;
961 
962 	ASSERT(error == 0);
963 	return (error);
964 
965 out:
966 	ASSERT(error != 0);
967 	if (sp != NULL) {
968 		mutex_enter(&sp->s_lock);
969 		sp->s_otw_call_count--;
970 		mutex_exit(&sp->s_lock);
971 		nfs4_server_rele(sp);
972 		rsp->rs_sp = NULL;
973 	}
974 	nfs4_end_op_recall(vp1, vp2, rsp);
975 
976 #ifdef	DEBUG
977 	(void) tsd_set(nfs4_tsd_key, NULL);
978 #endif
979 	return (error);
980 }
981 
982 /*
983  * It is up to the caller to determine if rsp->rs_sp being NULL
984  * is detrimental or not.
985  */
986 int
987 nfs4_start_op(mntinfo4_t *mi, vnode_t *vp1, vnode_t *vp2,
988 	nfs4_recov_state_t *rsp)
989 {
990 	ASSERT(rsp->rs_num_retry_despite_err == 0);
991 	rsp->rs_num_retry_despite_err = 0;
992 	return (nfs4_start_fop(mi, vp1, vp2, OH_OTHER, rsp, NULL));
993 }
994 
995 /*
996  * Release any resources acquired by nfs4_start_op().
997  * 'sp' should be the nfs4_server pointer returned by nfs4_start_op().
998  *
999  * The operation hint is used to avoid a deadlock by bypassing delegation
1000  * return logic for writes, which are done while returning a delegation.
1001  */
1002 
1003 void
1004 nfs4_end_fop(mntinfo4_t *mi, vnode_t *vp1, vnode_t *vp2, nfs4_op_hint_t op,
1005 		nfs4_recov_state_t *rsp, bool_t needs_recov)
1006 {
1007 	nfs4_server_t *sp = rsp->rs_sp;
1008 	rnode4_t *rp = NULL;
1009 
1010 #ifdef	lint
1011 	/*
1012 	 * The op hint isn't used any more, but might be in
1013 	 * the future.
1014 	 */
1015 	op = op;
1016 #endif
1017 
1018 #ifdef	DEBUG
1019 	ASSERT(tsd_get(nfs4_tsd_key) != NULL);
1020 	(void) tsd_set(nfs4_tsd_key, NULL);
1021 #endif
1022 
1023 	nfs4_end_op_recall(vp1, vp2, rsp);
1024 
1025 	if (rsp->rs_flags & NFS4_RS_RENAME_HELD)
1026 		nfs_rw_exit(&mi->mi_rename_lock);
1027 
1028 	if (!needs_recov) {
1029 		if (rsp->rs_flags & NFS4_RS_DELAY_MSG) {
1030 			/* may need to clear the delay interval */
1031 			if (vp1 != NULL) {
1032 				rp = VTOR4(vp1);
1033 				mutex_enter(&rp->r_statelock);
1034 				rp->r_delay_interval = 0;
1035 				mutex_exit(&rp->r_statelock);
1036 			}
1037 		}
1038 		rsp->rs_flags &= ~(NFS4_RS_GRACE_MSG|NFS4_RS_DELAY_MSG);
1039 	}
1040 
1041 	/*
1042 	 * If the corresponding nfs4_start_op() found a sp,
1043 	 * then there must still be a sp.
1044 	 */
1045 	if (sp != NULL) {
1046 		nfs_rw_exit(&mi->mi_recovlock);
1047 		nfs_rw_exit(&sp->s_recovlock);
1048 		mutex_enter(&sp->s_lock);
1049 		sp->s_otw_call_count--;
1050 		cv_broadcast(&sp->s_cv_otw_count);
1051 		mutex_exit(&sp->s_lock);
1052 		nfs4_server_rele(sp);
1053 	} else {
1054 		nfs_rw_exit(&mi->mi_recovlock);
1055 	}
1056 }
1057 
/*
 * Finish an operation using the generic OH_OTHER hint.  All arguments are
 * forwarded unchanged to nfs4_end_fop(); needrecov is passed as its
 * needs_recov argument.
 */
1058 void
1059 nfs4_end_op(mntinfo4_t *mi, vnode_t *vp1, vnode_t *vp2,
1060 	    nfs4_recov_state_t *rsp, bool_t needrecov)
1061 {
1062 	nfs4_end_fop(mi, vp1, vp2, OH_OTHER, rsp, needrecov);
1063 }
1064 
1065 /*
1066  * If the filesystem is going through client recovery, block until
1067  * finished.
1068  * Exceptions:
1069  * - state-releasing ops (CLOSE, LOCKU, DELEGRETURN) are allowed to proceed
1070  *   if the filesystem has been forcibly unmounted or the lwp is exiting.
1071  *
1072  * Return value:
1073  * - 0 if no errors
1074  * - EINTR if the call was interrupted
1075  * - EIO if the filesystem has been forcibly unmounted (non-state-releasing
1076  *   op)
1077  * - the errno value from the recovery thread, if recovery failed
1078  */
1079 
1080 static int
1081 wait_for_recovery(mntinfo4_t *mi, nfs4_op_hint_t op_hint)
1082 {
1083 	int error = 0;
1084 
1085 	mutex_enter(&mi->mi_lock);
1086 
1087 	while (mi->mi_recovflags != 0) {
1088 		klwp_t *lwp = ttolwp(curthread);
1089 
1090 		if (mi->mi_flags & MI4_RECOV_FAIL)
1091 			break;
1092 		if (mi->mi_vfsp->vfs_flag & VFS_UNMOUNTED)
1093 			break;
1094 		if (OH_IS_STATE_RELE(op_hint) &&
1095 		    (curthread->t_proc_flag & TP_LWPEXIT))
1096 			break;
1097 
1098 		if (lwp != NULL)
1099 			lwp->lwp_nostop++;
1100 		/* XXX - use different cv? */
1101 		if (cv_wait_sig(&mi->mi_failover_cv, &mi->mi_lock) == 0) {
1102 			error = EINTR;
1103 			if (lwp != NULL)
1104 				lwp->lwp_nostop--;
1105 			break;
1106 		}
1107 		if (lwp != NULL)
1108 			lwp->lwp_nostop--;
1109 	}
1110 
1111 	if (mi->mi_flags & MI4_RECOV_FAIL) {
1112 		NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
1113 			"wait_for_recovery: fail since RECOV FAIL"));
1114 		error = mi->mi_error;
1115 	} else if ((mi->mi_vfsp->vfs_flag & VFS_UNMOUNTED) &&
1116 	    !OH_IS_STATE_RELE(op_hint)) {
1117 		NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
1118 			"wait_for_recovery: forced unmount"));
1119 		error = EIO;
1120 	}
1121 
1122 	mutex_exit(&mi->mi_lock);
1123 
1124 	return (error);
1125 }
1126 
1127 /*
1128  * If the client received NFS4ERR_GRACE for this particular mount,
1129  * the client blocks here until it is time to try again.
1130  *
1131  * Return value:
1132  * - 0 if wait was successful
1133  * - EINTR if the call was interrupted
1134  */
1135 
1136 int
1137 nfs4_wait_for_grace(mntinfo4_t *mi, nfs4_recov_state_t *rsp)
1138 {
1139 	int error = 0;
1140 	time_t curtime, time_to_wait;
1141 
1142 	/* do a unprotected check to reduce mi_lock contention */
1143 	if (mi->mi_grace_wait != 0) {
1144 		mutex_enter(&mi->mi_lock);
1145 
1146 		if (mi->mi_grace_wait != 0) {
1147 			if (!(rsp->rs_flags & NFS4_RS_GRACE_MSG))
1148 				rsp->rs_flags |= NFS4_RS_GRACE_MSG;
1149 
1150 			curtime = gethrestime_sec();
1151 
1152 			if (curtime < mi->mi_grace_wait) {
1153 
1154 				time_to_wait = mi->mi_grace_wait - curtime;
1155 
1156 				mutex_exit(&mi->mi_lock);
1157 
1158 				delay(SEC_TO_TICK(time_to_wait));
1159 
1160 				curtime = gethrestime_sec();
1161 
1162 				mutex_enter(&mi->mi_lock);
1163 
1164 				if (curtime >= mi->mi_grace_wait)
1165 					mi->mi_grace_wait = 0;
1166 			} else {
1167 				mi->mi_grace_wait = 0;
1168 			}
1169 		}
1170 		mutex_exit(&mi->mi_lock);
1171 	}
1172 
1173 	return (error);
1174 }
1175 
1176 /*
1177  * If the client received NFS4ERR_DELAY for an operation on a vnode,
1178  * the client blocks here until it is time to try again.
1179  *
1180  * Return value:
1181  * - 0 if wait was successful
1182  * - EINTR if the call was interrupted
1183  */
1184 
1185 int
1186 nfs4_wait_for_delay(vnode_t *vp, nfs4_recov_state_t *rsp)
1187 {
1188 	int error = 0;
1189 	time_t curtime, time_to_wait;
1190 	rnode4_t *rp;
1191 
1192 	ASSERT(vp != NULL);
1193 
1194 	rp = VTOR4(vp);
1195 
1196 	/* do a unprotected check to reduce r_statelock contention */
1197 	if (rp->r_delay_wait != 0) {
1198 		mutex_enter(&rp->r_statelock);
1199 
1200 		if (rp->r_delay_wait != 0) {
1201 
1202 			if (!(rsp->rs_flags & NFS4_RS_DELAY_MSG)) {
1203 				rsp->rs_flags |= NFS4_RS_DELAY_MSG;
1204 				nfs4_mi_kstat_inc_delay(VTOMI4(vp));
1205 			}
1206 
1207 			curtime = gethrestime_sec();
1208 
1209 			if (curtime < rp->r_delay_wait) {
1210 
1211 				time_to_wait = rp->r_delay_wait - curtime;
1212 
1213 				mutex_exit(&rp->r_statelock);
1214 
1215 				delay(SEC_TO_TICK(time_to_wait));
1216 
1217 				curtime = gethrestime_sec();
1218 
1219 				mutex_enter(&rp->r_statelock);
1220 
1221 				if (curtime >= rp->r_delay_wait)
1222 					rp->r_delay_wait = 0;
1223 			} else {
1224 				rp->r_delay_wait = 0;
1225 			}
1226 		}
1227 		mutex_exit(&rp->r_statelock);
1228 	}
1229 
1230 	return (error);
1231 }
1232 
1233 /*
1234  * The recovery thread.
 *
 * Runs as the recovery thread for one filesystem (recovp->rc_mi) and
 * loops until there is nothing left to recover or recovery has failed.
 * Each pass of the loop performs, in this order:
 *   1. forced unmount / zone shutdown checks (FS_OR_ZONE_GONE4)
 *   2. failover to a new server (MI4R_NEED_NEW_SERVER)
 *   3. clientid recovery (server lacks N4S_CLIENTID_SET)
 *   4. security information recovery (MI4R_NEED_SECINFO)
 *   5. bad seqid recovery (MI4R_BAD_SEQID)
 *   6. reopening of files (MI4R_REOPEN_FILES)
 *   7. resending queued lost-state requests (MI4R_LOST_STATE)
 *   8. the "are we done" check, which also discards any remaining
 *      lost requests
 *
 * On the way out the thread drops its nfs4_server reference (if it has
 * one), calls nfs4_dlistclean(), announces completion through
 * recov_done(), releases rc_vp1/rc_vp2, decrements mi_in_recovery
 * (waking waiters on mi_cv_in_recov when it reaches zero), releases the
 * vfs and mntinfo4 references, frees recovp and exits with
 * zthread_exit().  The function does not return to its caller.
1235  */
1236 
1237 static void
1238 nfs4_recov_thread(recov_info_t *recovp)
1239 {
1240 	mntinfo4_t *mi = recovp->rc_mi;
1241 	nfs4_server_t *sp;
1242 	int done = 0, error = 0;
1243 	bool_t recov_fail = FALSE;
1244 	callb_cpr_t cpr_info;
1245 	kmutex_t cpr_lock;
1246 
1247 	nfs4_queue_event(RE_START, mi, NULL, mi->mi_recovflags,
1248 	    recovp->rc_vp1, recovp->rc_vp2, 0, NULL, 0, TAG_NONE, TAG_NONE,
1249 	    0, 0);
1250 
1251 	mutex_init(&cpr_lock, NULL, MUTEX_DEFAULT, NULL);
1252 	CALLB_CPR_INIT(&cpr_info, &cpr_lock, callb_generic_cpr, "nfsv4Recov");
1253 
	/* Record this thread as the mount's recovery thread. */
1254 	mutex_enter(&mi->mi_lock);
1255 	mi->mi_recovthread = curthread;
1256 	mutex_exit(&mi->mi_lock);
1257 
1258 	/*
1259 	 * We don't really need protection here against failover or
1260 	 * migration, since the current thread is the one that would make
1261 	 * any changes, but hold mi_recovlock anyway for completeness (and
1262 	 * to satisfy any ASSERTs).
1263 	 */
1264 	(void) nfs_rw_enter_sig(&mi->mi_recovlock, RW_READER, 0);
1265 	sp = find_nfs4_server(mi);
	/*
	 * Only the reference on sp is kept (it is released after the
	 * loop); the s_lock that find_nfs4_server() returned with is
	 * dropped right away.
	 */
1266 	if (sp != NULL)
1267 		mutex_exit(&sp->s_lock);
1268 	nfs_rw_exit(&mi->mi_recovlock);
1269 
1270 	/*
1271 	 * Do any necessary recovery, based on the information in recovp
1272 	 * and any recovery flags.
1273 	 */
1274 
1275 	do {
1276 		mutex_enter(&mi->mi_lock);
1277 		if (FS_OR_ZONE_GONE4(mi->mi_vfsp)) {
1278 			bool_t activesrv;
1279 
1280 			NFS4_DEBUG(nfs4_client_recov_debug &&
1281 			    mi->mi_vfsp->vfs_flag & VFS_UNMOUNTED, (CE_NOTE,
1282 				"nfs4_recov_thread: file system has been "
1283 				"unmounted"));
1284 			NFS4_DEBUG(nfs4_client_recov_debug &&
1285 			    zone_status_get(curproc->p_zone) >=
1286 			    ZONE_IS_SHUTTING_DOWN, (CE_NOTE,
1287 				"nfs4_recov_thread: zone shutting down"));
1288 			/*
1289 			 * If the server has lost its state for us and
1290 			 * the filesystem is unmounted, then the filesystem
1291 			 * can be tossed, even if there are lost lock or
1292 			 * lost state calls in the recovery queue.
1293 			 */
1294 			if (mi->mi_recovflags &
1295 			    (MI4R_NEED_CLIENTID | MI4R_REOPEN_FILES)) {
1296 				NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
1297 				"nfs4_recov_thread: bailing out"));
1298 				mi->mi_flags |= MI4_RECOV_FAIL;
1299 				mi->mi_error = recovp->rc_error;
1300 				recov_fail = TRUE;
1301 			}
1302 			/*
1303 			 * We don't know if the server has any state for
1304 			 * us, and the filesystem has been unmounted.  If
1305 			 * there are "lost state" recovery items, keep
1306 			 * trying to process them until there are no more
1307 			 * mounted filesystems for the server.  Otherwise,
1308 			 * bail out.  The reason we don't mark the
1309 			 * filesystem as failing recovery is in case we
1310 			 * have to do "lost state" recovery later (e.g., a
1311 			 * user process exits).
1312 			 */
1313 			if (!(mi->mi_recovflags & MI4R_LOST_STATE)) {
1314 				done = 1;
1315 				mutex_exit(&mi->mi_lock);
1316 				break;
1317 			}
1318 			mutex_exit(&mi->mi_lock);
1319 
1320 			if (sp == NULL)
1321 				activesrv = FALSE;
1322 			else {
				/* s_lock stays held until the exit below */
1323 				mutex_enter(&sp->s_lock);
1324 				activesrv = nfs4_fs_active(sp);
1325 			}
1326 			if (!activesrv) {
1327 				NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
1328 					"no active fs for server %p",
1329 					(void *)sp));
1330 				mutex_enter(&mi->mi_lock);
1331 				mi->mi_flags |= MI4_RECOV_FAIL;
1332 				mi->mi_error = recovp->rc_error;
1333 				mutex_exit(&mi->mi_lock);
1334 				recov_fail = TRUE;
1335 				if (sp != NULL) {
1336 					/*
1337 					 * Mark the server instance as
1338 					 * dead, so that nobody will attach
1339 					 * a new filesystem.
1340 					 */
1341 					nfs4_mark_srv_dead(sp);
1342 				}
1343 			}
1344 			if (sp != NULL)
1345 				mutex_exit(&sp->s_lock);
1346 		} else {
1347 			mutex_exit(&mi->mi_lock);
1348 		}
1349 
1350 		/*
1351 		 * Check if we need to select a new server for a
1352 		 * failover.  Choosing a new server will force at
1353 		 * least a check of the clientid.
1354 		 */
1355 		mutex_enter(&mi->mi_lock);
1356 		if (!recov_fail &&
1357 		    (mi->mi_recovflags & MI4R_NEED_NEW_SERVER)) {
1358 			mutex_exit(&mi->mi_lock);
			/* may replace sp and may set recov_fail */
1359 			recov_newserver(recovp, &sp, &recov_fail);
1360 		} else
1361 			mutex_exit(&mi->mi_lock);
1362 
1363 		/*
1364 		 * Check if we need to recover the clientid.  This
1365 		 * must be done before file and lock recovery, and it
1366 		 * potentially affects the recovery threads for other
1367 		 * filesystems, so it gets special treatment.
1368 		 */
1369 		if (sp != NULL && recov_fail == FALSE) {
1370 			mutex_enter(&sp->s_lock);
1371 			if (!(sp->s_flags & N4S_CLIENTID_SET)) {
1372 				mutex_exit(&sp->s_lock);
1373 				recov_clientid(recovp, sp);
1374 			} else {
1375 				/*
1376 				 * Unset this flag in case another recovery
1377 				 * thread successfully recovered the clientid
1378 				 * for us already.
1379 				 */
1380 				mutex_enter(&mi->mi_lock);
1381 				mi->mi_recovflags &= ~MI4R_NEED_CLIENTID;
1382 				mutex_exit(&mi->mi_lock);
1383 				mutex_exit(&sp->s_lock);
1384 			}
1385 		}
1386 
1387 		/*
1388 		 * Check if we need to get the security information.
1389 		 */
1390 		mutex_enter(&mi->mi_lock);
1391 		if ((mi->mi_recovflags & MI4R_NEED_SECINFO) &&
1392 		    !(mi->mi_flags & MI4_RECOV_FAIL)) {
1393 			mutex_exit(&mi->mi_lock);
1394 			(void) nfs_rw_enter_sig(&mi->mi_recovlock,
1395 							RW_WRITER, 0);
1396 			error = nfs4_secinfo_recov(recovp->rc_mi,
1397 					recovp->rc_vp1, recovp->rc_vp2);
1398 			/*
1399 			 * If error, nothing more can be done, stop
1400 			 * the recovery.
1401 			 */
1402 			if (error) {
1403 				mutex_enter(&mi->mi_lock);
1404 				mi->mi_flags |= MI4_RECOV_FAIL;
1405 				mi->mi_error = recovp->rc_error;
1406 				mutex_exit(&mi->mi_lock);
1407 				nfs4_queue_event(RE_WRONGSEC, mi, NULL,
1408 				    error, recovp->rc_vp1, recovp->rc_vp2,
1409 				    0, NULL, 0, TAG_NONE, TAG_NONE, 0, 0);
1410 			}
1411 			nfs_rw_exit(&mi->mi_recovlock);
1412 		} else
1413 			mutex_exit(&mi->mi_lock);
1414 
1415 		/*
1416 		 * Check if there's a bad seqid to recover.
1417 		 */
1418 		mutex_enter(&mi->mi_lock);
1419 		if ((mi->mi_recovflags & MI4R_BAD_SEQID) &&
1420 		    !(mi->mi_flags & MI4_RECOV_FAIL)) {
1421 			mutex_exit(&mi->mi_lock);
1422 			(void) nfs_rw_enter_sig(&mi->mi_recovlock,
1423 					RW_WRITER, 0);
1424 			recov_bad_seqid(recovp);
1425 			nfs_rw_exit(&mi->mi_recovlock);
1426 		} else
1427 			mutex_exit(&mi->mi_lock);
1428 
1429 		/*
1430 		 * Next check for recovery that affects the entire
1431 		 * filesystem.
1432 		 */
1433 		if (sp != NULL) {
1434 			mutex_enter(&mi->mi_lock);
1435 			if ((mi->mi_recovflags & MI4R_REOPEN_FILES) &&
1436 			    !(mi->mi_flags & MI4_RECOV_FAIL)) {
1437 				mutex_exit(&mi->mi_lock);
1438 				recov_openfiles(recovp, sp);
1439 			} else
1440 				mutex_exit(&mi->mi_lock);
1441 		}
1442 
1443 		/*
1444 		 * Send any queued state recovery requests.
1445 		 */
1446 		mutex_enter(&mi->mi_lock);
1447 		if (sp != NULL &&
1448 		    (mi->mi_recovflags & MI4R_LOST_STATE) &&
1449 		    !(mi->mi_flags & MI4_RECOV_FAIL)) {
1450 			mutex_exit(&mi->mi_lock);
1451 			(void) nfs_rw_enter_sig(&mi->mi_recovlock,
1452 				    RW_WRITER, 0);
1453 			nfs4_resend_lost_rqsts(recovp, sp);
1454 			if (list_head(&mi->mi_lost_state) == NULL) {
1455 				/* done */
1456 				mutex_enter(&mi->mi_lock);
1457 				mi->mi_recovflags &= ~MI4R_LOST_STATE;
1458 				mutex_exit(&mi->mi_lock);
1459 			}
1460 			nfs_rw_exit(&mi->mi_recovlock);
1461 		} else {
1462 			mutex_exit(&mi->mi_lock);
1463 		}
1464 
1465 		/*
1466 		 * See if there is anything more to do.  If not, announce
1467 		 * that we are done and exit.
1468 		 *
1469 		 * Need mi_recovlock to keep 'sp' valid.  Must grab
1470 		 * mi_recovlock before mi_lock to preserve lock ordering.
1471 		 */
1472 		(void) nfs_rw_enter_sig(&mi->mi_recovlock, RW_READER, 0);
1473 		mutex_enter(&mi->mi_lock);
		/* MI4R_SRV_REBOOT alone does not keep the loop going. */
1474 		if ((mi->mi_recovflags & ~MI4R_SRV_REBOOT) == 0 ||
1475 		    (mi->mi_flags & MI4_RECOV_FAIL)) {
1476 			list_t local_lost_state;
1477 			nfs4_lost_rqst_t *lrp;
1478 
1479 			/*
1480 			 * We need to remove the lost requests before we
1481 			 * unmark the mi as no longer doing recovery to
1482 			 * avoid a race with a new thread putting new lost
1483 			 * requests on the same mi (and the going away
1484 			 * thread would remove the new lost requests).
1485 			 *
1486 			 * Move the lost requests to a local list since
1487 			 * nfs4_remove_lost_rqst() drops mi_lock, and
1488 			 * dropping the mi_lock would make our check to
1489 			 * see if recovery is done no longer valid.
1490 			 */
1491 			list_create(&local_lost_state,
1492 			    sizeof (nfs4_lost_rqst_t),
1493 			    offsetof(nfs4_lost_rqst_t, lr_node));
1494 			list_move_tail(&local_lost_state, &mi->mi_lost_state);
1495 
1496 			done = 1;
1497 			mutex_exit(&mi->mi_lock);
1498 			/*
1499 			 * Now officially free the "moved"
1500 			 * lost requests.
1501 			 */
1502 			while ((lrp = list_head(&local_lost_state)) != NULL) {
1503 				list_remove(&local_lost_state, lrp);
1504 				nfs4_free_lost_rqst(lrp, sp);
1505 			}
1506 			list_destroy(&local_lost_state);
1507 		} else
1508 			mutex_exit(&mi->mi_lock);
1509 		nfs_rw_exit(&mi->mi_recovlock);
1510 
1511 		/*
1512 		 * If the filesystem has been forcibly unmounted, there is
1513 		 * probably no point in retrying immediately.  Furthermore,
1514 		 * there might be user processes waiting for a chance to
1515 		 * queue up "lost state" requests, so that they can exit.
1516 		 * So pause here for a moment.  Same logic for zone shutdown.
1517 		 */
1518 		if (!done && FS_OR_ZONE_GONE4(mi->mi_vfsp)) {
1519 			mutex_enter(&mi->mi_lock);
1520 			cv_broadcast(&mi->mi_failover_cv);
1521 			mutex_exit(&mi->mi_lock);
1522 			delay(SEC_TO_TICK(nfs4_unmount_delay));
1523 		}
1524 
1525 	} while (!done);
1526 
	/* Drop the server reference held since startup (or failover). */
1527 	if (sp != NULL)
1528 		nfs4_server_rele(sp);
1529 
1530 	/*
1531 	 * Return all recalled delegations
1532 	 */
1533 	nfs4_dlistclean();
1534 
1535 	mutex_enter(&mi->mi_lock);
1536 	recov_done(mi, recovp);
1537 	mutex_exit(&mi->mi_lock);
1538 
1539 	/*
1540 	 * Free up resources that were allocated for us.
1541 	 */
1542 	if (recovp->rc_vp1 != NULL)
1543 		VN_RELE(recovp->rc_vp1);
1544 	if (recovp->rc_vp2 != NULL)
1545 		VN_RELE(recovp->rc_vp2);
1546 
1547 	/* now we are done using the mi struct, signal the waiters */
1548 	mutex_enter(&mi->mi_lock);
1549 	mi->mi_in_recovery--;
1550 	if (mi->mi_in_recovery == 0)
1551 		cv_broadcast(&mi->mi_cv_in_recov);
1552 	mutex_exit(&mi->mi_lock);
1553 
1554 	VFS_RELE(mi->mi_vfsp);
1555 	MI4_RELE(mi);
1556 	kmem_free(recovp, sizeof (recov_info_t));
	/*
	 * NOTE(review): cpr_lock is taken here and never explicitly
	 * released; presumably CALLB_CPR_EXIT() drops it before it is
	 * destroyed -- confirm against the callb interface.
	 */
1557 	mutex_enter(&cpr_lock);
1558 	CALLB_CPR_EXIT(&cpr_info);
1559 	mutex_destroy(&cpr_lock);
1560 	zthread_exit();
1561 }
1562 
1563 /*
1564  * Log the end of recovery and notify any waiting threads.
1565  */
1566 
1567 static void
1568 recov_done(mntinfo4_t *mi, recov_info_t *recovp)
1569 {
1570 
1571 	ASSERT(MUTEX_HELD(&mi->mi_lock));
1572 
1573 	nfs4_queue_event(RE_END, mi, NULL, 0, recovp->rc_vp1,
1574 		recovp->rc_vp2, 0, NULL, 0, TAG_NONE, TAG_NONE, 0, 0);
1575 	mi->mi_recovthread = NULL;
1576 	mi->mi_flags &= ~MI4_RECOV_ACTIV;
1577 	mi->mi_recovflags &= ~MI4R_SRV_REBOOT;
1578 	cv_broadcast(&mi->mi_failover_cv);
1579 }
1580 
1581 /*
1582  * State-specific recovery routines, by state.
1583  */
1584 
1585 /*
1586  * Failover.
1587  *
1588  * Replaces *spp with a reference to the new server, which must
1589  * eventually be freed.
 *
 * mi_recovlock is held as writer for the whole routine.  Each entry on
 * mi->mi_servers that is not flagged SV4_NOTINUSE is pinged with the NFS
 * null procedure (tv is set to 2 seconds) until one answers with
 * RPC_SUCCESS.  On the first pass the current server is skipped; if a
 * full pass finds nobody, RF_SRVS_NOT_RESPOND is queued once and the
 * scan is repeated after a delay(hz) pause, this time including the
 * current server.
 *
 * If the filesystem or zone is gone (FS_OR_ZONE_GONE4) while scanning,
 * MI4_RECOV_FAIL is set, *recov_fail is set to TRUE and the routine
 * returns without touching *spp.
 *
 * When the chosen server differs from mi_curr_serv, the root rnode (if
 * found) is pointed at the new server and has its caches purged, the
 * DNLC is purged for the vfs, MI4R_REOPEN_FILES and MI4R_REMAP_FILES are
 * set, the root and parent filehandles are updated from the new
 * servinfo4, the mount is moved with nfs4_move_mi() and the reference on
 * the old server (if any) is released.  If the chosen server is the
 * current one, only MI4R_NEED_NEW_SERVER is cleared.
1590  */
1591 
1592 static void
1593 recov_newserver(recov_info_t *recovp, nfs4_server_t **spp, bool_t *recov_fail)
1594 {
1595 	mntinfo4_t *mi = recovp->rc_mi;
1596 	servinfo4_t *svp = NULL;
1597 	nfs4_server_t *osp = *spp;
1598 	CLIENT *cl;
1599 	enum clnt_stat status;
1600 	struct timeval tv;
1601 	int error;
1602 	int oncethru = 0;
1603 	rnode4_t *rp;
1604 	int index;
1605 	nfs_fh4 fh;
1606 	char *snames;
1607 	size_t len;
1608 
1609 	(void) nfs_rw_enter_sig(&mi->mi_recovlock, RW_WRITER, 0);
1610 
1611 	tv.tv_sec = 2;
1612 	tv.tv_usec = 0;
1613 
1614 #ifdef lint
1615 	/*
1616 	 * Lint can't follow the logic, so thinks that snames and len
1617 	 * can be used before being set.  They can't, but lint can't
1618 	 * figure it out.  To address the lint warning, initialize
1619 	 * snames and len for lint.
1620 	 */
1621 	snames = NULL;
1622 	len = 0;
1623 #endif
1624 
1625 	/*
1626 	 * Ping the null NFS procedure of every server in
1627 	 * the list until one responds.  We always start
1628 	 * at the head of the list and always skip the one
1629 	 * that is current, since it's caused us a problem.
1630 	 */
1631 	while (svp == NULL) {
1632 		for (svp = mi->mi_servers; svp; svp = svp->sv_next) {
1633 
1634 			mutex_enter(&mi->mi_lock);
1635 			if (FS_OR_ZONE_GONE4(mi->mi_vfsp)) {
1636 				mi->mi_flags |= MI4_RECOV_FAIL;
1637 				mutex_exit(&mi->mi_lock);
1638 				(void) nfs_rw_exit(&mi->mi_recovlock);
1639 				*recov_fail = TRUE;
				/* snames is only allocated once oncethru is set */
1640 				if (oncethru)
1641 					kmem_free(snames, len);
1642 				return;
1643 			}
1644 			mutex_exit(&mi->mi_lock);
1645 
1646 			(void) nfs_rw_enter_sig(&svp->sv_lock, RW_READER, 0);
1647 			if (svp->sv_flags & SV4_NOTINUSE) {
1648 				nfs_rw_exit(&svp->sv_lock);
1649 				continue;
1650 			}
1651 			nfs_rw_exit(&svp->sv_lock);
1652 
			/* the current server is only retried after a failed pass */
1653 			if (!oncethru && svp == mi->mi_curr_serv)
1654 				continue;
1655 
1656 			error = clnt_tli_kcreate(svp->sv_knconf, &svp->sv_addr,
1657 			    NFS_PROGRAM, NFS_V4, 0, 1, CRED(), &cl);
1658 			if (error)
1659 				continue;
1660 
1661 			if (!(mi->mi_flags & MI4_INT))
1662 				cl->cl_nosignal = TRUE;
1663 			status = CLNT_CALL(cl, RFS_NULL, xdr_void, NULL,
1664 			    xdr_void, NULL, tv);
1665 			if (!(mi->mi_flags & MI4_INT))
1666 				cl->cl_nosignal = FALSE;
1667 			AUTH_DESTROY(cl->cl_auth);
1668 			CLNT_DESTROY(cl);
1669 			if (status == RPC_SUCCESS) {
1670 				nfs4_queue_event(RE_FAILOVER, mi,
1671 				    svp == mi->mi_curr_serv ? NULL :
1672 				    svp->sv_hostname, 0, NULL, NULL, 0,
1673 				    NULL, 0, TAG_NONE, TAG_NONE, 0, 0);
1674 				break;
1675 			}
1676 		}
1677 
		/* svp is NULL here only if the whole list was exhausted */
1678 		if (svp == NULL) {
1679 			if (!oncethru) {
1680 				snames = nfs4_getsrvnames(mi, &len);
1681 				nfs4_queue_fact(RF_SRVS_NOT_RESPOND, mi,
1682 				    0, 0, 0, FALSE, snames, 0, NULL);
1683 				oncethru = 1;
1684 			}
1685 			delay(hz);
1686 		}
1687 	}
1688 
1689 	if (oncethru) {
1690 		nfs4_queue_fact(RF_SRVS_OK, mi, 0, 0, 0, FALSE, snames,
1691 		    0, NULL);
1692 		kmem_free(snames, len);
1693 	}
1694 
1695 #if DEBUG
1696 	(void) nfs_rw_enter_sig(&svp->sv_lock, RW_READER, 0);
1697 	ASSERT((svp->sv_flags & SV4_NOTINUSE) == 0);
1698 	nfs_rw_exit(&svp->sv_lock);
1699 #endif
1700 
1701 	mutex_enter(&mi->mi_lock);
1702 	mi->mi_recovflags &= ~MI4R_NEED_NEW_SERVER;
1703 	if (svp != mi->mi_curr_serv) {
1704 		servinfo4_t *osvp = mi->mi_curr_serv;
1705 
1706 		mutex_exit(&mi->mi_lock);
1707 
1708 		/*
1709 		 * Update server-dependent fields in the root vnode.
1710 		 */
1711 		index = rtable4hash(mi->mi_rootfh);
1712 		rw_enter(&rtable4[index].r_lock, RW_WRITER);
1713 
1714 		rp = r4find(&rtable4[index], mi->mi_rootfh, mi->mi_vfsp);
1715 		if (rp != NULL) {
1716 			NFS4_DEBUG(nfs4_client_failover_debug, (CE_NOTE,
1717 			    "recov_newserver: remapping %s", rnode4info(rp)));
1718 			mutex_enter(&rp->r_statelock);
1719 			rp->r_server = svp;
1720 			PURGE_ATTRCACHE4_LOCKED(rp);
1721 			mutex_exit(&rp->r_statelock);
1722 			(void) nfs4_free_data_reclaim(rp);
1723 			nfs4_purge_rddir_cache(RTOV4(rp));
1724 			rw_exit(&rtable4[index].r_lock);
1725 			NFS4_DEBUG(nfs4_client_failover_debug, (CE_NOTE,
1726 			    "recov_newserver: done with %s",
1727 			    rnode4info(rp)));
			/* NOTE(review): presumably drops the hold taken by r4find() */
1728 			VN_RELE(RTOV4(rp));
1729 		} else
1730 			rw_exit(&rtable4[index].r_lock);
1731 		(void) dnlc_purge_vfsp(mi->mi_vfsp, 0);
1732 
1733 		mutex_enter(&mi->mi_lock);
1734 		mi->mi_recovflags |= MI4R_REOPEN_FILES | MI4R_REMAP_FILES;
1735 		if (recovp->rc_srv_reboot)
1736 			mi->mi_recovflags |= MI4R_SRV_REBOOT;
1737 		mi->mi_curr_serv = svp;
1738 		mi->mi_failover++;
1739 		mi->mi_flags &= ~MI4_BADOWNER_DEBUG;
1740 		mutex_exit(&mi->mi_lock);
1741 
		/* Switch the root and parent filehandles to the new server's. */
1742 		(void) nfs_rw_enter_sig(&svp->sv_lock, RW_READER, 0);
1743 		fh.nfs_fh4_len = svp->sv_fhandle.fh_len;
1744 		fh.nfs_fh4_val = svp->sv_fhandle.fh_buf;
1745 		sfh4_update(mi->mi_rootfh, &fh);
1746 		fh.nfs_fh4_len = svp->sv_pfhandle.fh_len;
1747 		fh.nfs_fh4_val = svp->sv_pfhandle.fh_buf;
1748 		sfh4_update(mi->mi_srvparentfh, &fh);
1749 		nfs_rw_exit(&svp->sv_lock);
1750 
1751 		*spp = nfs4_move_mi(mi, osvp, svp);
1752 		if (osp != NULL)
1753 			nfs4_server_rele(osp);
1754 	} else
1755 		mutex_exit(&mi->mi_lock);
1756 	(void) nfs_rw_exit(&mi->mi_recovlock);
1757 }
1758 
1759 /*
1760  * Clientid.
1761  */
1762 
1763 static void
1764 recov_clientid(recov_info_t *recovp, nfs4_server_t *sp)
1765 {
1766 	mntinfo4_t *mi = recovp->rc_mi;
1767 	int error = 0;
1768 	int still_stale;
1769 	int need_new_s;
1770 
1771 	ASSERT(sp != NULL);
1772 
1773 	/*
1774 	 * Acquire the recovery lock and then verify that the clientid
1775 	 * still needs to be recovered.  (Note that s_recovlock is supposed
1776 	 * to be acquired before s_lock.)  Since the thread holds the
1777 	 * recovery lock, no other thread will recover the clientid.
1778 	 */
1779 	(void) nfs_rw_enter_sig(&sp->s_recovlock, RW_WRITER, 0);
1780 	(void) nfs_rw_enter_sig(&mi->mi_recovlock, RW_WRITER, 0);
1781 	mutex_enter(&sp->s_lock);
1782 	still_stale = ((sp->s_flags & N4S_CLIENTID_SET) == 0);
1783 	mutex_exit(&sp->s_lock);
1784 
1785 	if (still_stale) {
1786 		nfs4_error_t n4e;
1787 
1788 		nfs4_error_zinit(&n4e);
1789 		nfs4setclientid(mi, kcred, TRUE, &n4e);
1790 		error = n4e.error;
1791 		if (error != 0) {
1792 
1793 			/*
1794 			 * nfs4setclientid may have set MI4R_NEED_NEW_SERVER,
1795 			 * if so, just return and let recov_thread drive
1796 			 * failover.
1797 			 */
1798 			mutex_enter(&mi->mi_lock);
1799 			need_new_s = mi->mi_recovflags & MI4R_NEED_NEW_SERVER;
1800 			mutex_exit(&mi->mi_lock);
1801 
1802 			if (need_new_s) {
1803 				nfs_rw_exit(&mi->mi_recovlock);
1804 				nfs_rw_exit(&sp->s_recovlock);
1805 				return;
1806 			}
1807 
1808 			nfs4_queue_event(RE_CLIENTID, mi, NULL, n4e.error, NULL,
1809 			    NULL, n4e.stat, NULL, 0, TAG_NONE, TAG_NONE, 0, 0);
1810 			mutex_enter(&mi->mi_lock);
1811 			mi->mi_flags |= MI4_RECOV_FAIL;
1812 			mi->mi_error = recovp->rc_error;
1813 			mutex_exit(&mi->mi_lock);
1814 			/* don't destroy the nfs4_server, let umount do it */
1815 		}
1816 	}
1817 
1818 	if (error == 0) {
1819 		mutex_enter(&mi->mi_lock);
1820 		mi->mi_recovflags &= ~MI4R_NEED_CLIENTID;
1821 		/*
1822 		 * If still_stale isn't true, then another thread already
1823 		 * recovered the clientid.  And that thread that set the
1824 		 * clientid will have initiated reopening files on all the
1825 		 * filesystems for the server, so we should not initiate
1826 		 * reopening for this filesystem here.
1827 		 */
1828 		if (still_stale) {
1829 			mi->mi_recovflags |= MI4R_REOPEN_FILES;
1830 			if (recovp->rc_srv_reboot)
1831 				mi->mi_recovflags |= MI4R_SRV_REBOOT;
1832 		}
1833 		mutex_exit(&mi->mi_lock);
1834 	}
1835 
1836 	nfs_rw_exit(&mi->mi_recovlock);
1837 
1838 	if (error != 0) {
1839 		nfs_rw_exit(&sp->s_recovlock);
1840 		mutex_enter(&mi->mi_lock);
1841 		if ((mi->mi_flags & MI4_RECOV_FAIL) == 0)
1842 			delay(SEC_TO_TICK(recov_err_delay));
1843 		mutex_exit(&mi->mi_lock);
1844 	} else {
1845 		mntinfo4_t **milist;
1846 		mntinfo4_t *tmi;
1847 		int nummi, i;
1848 
1849 		/*
1850 		 * Initiate recovery of open files for other filesystems.
1851 		 * We create an array of filesystems, rather than just
1852 		 * walking the filesystem list, to avoid deadlock issues
1853 		 * with s_lock and mi_recovlock.
1854 		 */
1855 		milist = make_milist(sp, &nummi);
1856 		for (i = 0; i < nummi; i++) {
1857 			tmi = milist[i];
1858 			if (tmi != mi) {
1859 				(void) nfs_rw_enter_sig(&tmi->mi_recovlock,
1860 							RW_READER, 0);
1861 				start_recovery_action(NR_OPENFILES, TRUE, tmi,
1862 					NULL, NULL);
1863 				nfs_rw_exit(&tmi->mi_recovlock);
1864 			}
1865 		}
1866 		free_milist(milist, nummi);
1867 
1868 		nfs_rw_exit(&sp->s_recovlock);
1869 	}
1870 }
1871 
1872 /*
1873  * Return an array of filesystems associated with the given server.  The
1874  * caller should call free_milist() to free the references and memory.
1875  */
1876 
1877 static mntinfo4_t **
1878 make_milist(nfs4_server_t *sp, int *nummip)
1879 {
1880 	int nummi, i;
1881 	mntinfo4_t **milist;
1882 	mntinfo4_t *tmi;
1883 
1884 	mutex_enter(&sp->s_lock);
1885 	nummi = 0;
1886 	for (tmi = sp->mntinfo4_list; tmi != NULL; tmi = tmi->mi_clientid_next)
1887 		nummi++;
1888 
1889 	milist = kmem_alloc(nummi * sizeof (mntinfo4_t *), KM_NOSLEEP);
1890 
1891 	for (i = 0, tmi = sp->mntinfo4_list; tmi != NULL; i++,
1892 	    tmi = tmi->mi_clientid_next) {
1893 		milist[i] = tmi;
1894 		VFS_HOLD(tmi->mi_vfsp);
1895 	}
1896 	mutex_exit(&sp->s_lock);
1897 
1898 	*nummip = nummi;
1899 	return (milist);
1900 }
1901 
1902 /*
1903  * Free the filesystem list created by make_milist().
1904  */
1905 
1906 static void
1907 free_milist(mntinfo4_t **milist, int nummi)
1908 {
1909 	mntinfo4_t *tmi;
1910 	int i;
1911 
1912 	for (i = 0; i < nummi; i++) {
1913 		tmi = milist[i];
1914 		VFS_RELE(tmi->mi_vfsp);
1915 	}
1916 	kmem_free(milist, nummi * sizeof (mntinfo4_t *));
1917 }
1918 
1919 /*
1920  * Filehandle
1921  */
1922 
1923 /*
1924  * Lookup the filehandle for the given vnode and update the rnode if it has
1925  * changed.
1926  *
1927  * Errors:
1928  * - if the filehandle could not be updated because of an error that
1929  *   requires further recovery, initiate that recovery and return.
1930  * - if the filehandle could not be updated because of a signal, pretend we
1931  *   succeeded and let someone else deal with it.
1932  * - if the filehandle could not be updated and the filesystem has been
1933  *   forcibly unmounted, pretend we succeeded, and let the caller deal with
1934  *   the forced unmount (to retry or not to retry, that is the question).
1935  * - if the filehandle could not be updated because of some other error,
1936  *   mark the rnode bad and return.
1937  */
1938 static void
1939 recov_filehandle(nfs4_recov_t action, mntinfo4_t *mi, vnode_t *vp)
1940 {
1941 	rnode4_t *rp = VTOR4(vp);
1942 	nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };
1943 	bool_t needrecov;
1944 
1945 	mutex_enter(&rp->r_statelock);
1946 
1947 	if (rp->r_flags & R4RECOVERR) {
1948 		mutex_exit(&rp->r_statelock);
1949 		return;
1950 	}
1951 
1952 	/*
1953 	 * If someone else is updating the filehandle, wait for them to
1954 	 * finish and then let our caller retry.
1955 	 */
1956 	if (rp->r_flags & R4RECEXPFH) {
1957 		while (rp->r_flags & R4RECEXPFH) {
1958 			cv_wait(&rp->r_cv, &rp->r_statelock);
1959 		}
1960 		mutex_exit(&rp->r_statelock);
1961 		return;
1962 	}
1963 	rp->r_flags |= R4RECEXPFH;
1964 	mutex_exit(&rp->r_statelock);
1965 
1966 	if (action == NR_BADHANDLE) {
1967 		/* shouldn't happen */
1968 		nfs4_queue_event(RE_BADHANDLE, mi, NULL, 0,
1969 		    vp, NULL, 0, NULL, 0, TAG_NONE, TAG_NONE, 0, 0);
1970 	}
1971 
1972 	nfs4_remap_file(mi, vp, 0, &e);
1973 	needrecov = nfs4_needs_recovery(&e, FALSE, mi->mi_vfsp);
1974 
1975 	/*
1976 	 * If we get BADHANDLE or FHEXPIRED in their handler, something is
1977 	 * broken.  Don't try to recover, just mark the file dead.
1978 	 */
1979 	if (needrecov && e.error == 0 &&
1980 	    (e.stat == NFS4ERR_BADHANDLE || e.stat == NFS4ERR_FHEXPIRED))
1981 		needrecov = FALSE;
1982 	if (needrecov) {
1983 		(void) nfs4_start_recovery(&e, mi, vp,
1984 				NULL, NULL, NULL, OP_LOOKUP, NULL);
1985 	} else if (e.error != EINTR &&
1986 	    !NFS4_FRC_UNMT_ERR(e.error, mi->mi_vfsp) &&
1987 	    (e.error != 0 || e.stat != NFS4_OK)) {
1988 		nfs4_recov_fh_fail(vp, e.error, e.stat);
1989 		/*
1990 		 * Don't set r_error to ESTALE.  Higher-level code (e.g.,
1991 		 * cstatat_getvp()) retries on ESTALE, which would cause
1992 		 * an infinite loop.
1993 		 */
1994 	}
1995 
1996 	mutex_enter(&rp->r_statelock);
1997 	rp->r_flags &= ~R4RECEXPFH;
1998 	cv_broadcast(&rp->r_cv);
1999 	mutex_exit(&rp->r_statelock);
2000 }
2001 
2002 /*
2003  * Stale Filehandle
2004  */
2005 
2006 /*
2007  * A stale filehandle can happen when an individual file has
2008  * been removed, or when an entire filesystem has been taken
2009  * offline.  To distinguish these cases, we do this:
2010  * - if a GETATTR with the current filehandle is okay, we do
2011  *   nothing (this can happen with two-filehandle ops)
2012  * - if the GETATTR fails, but a GETATTR of the root filehandle
2013  *   succeeds, mark the rnode with R4STALE, which will stop use
2014  * - if the GETATTR fails, and a GETATTR of the root filehandle
2015  *   also fails, we consider the problem filesystem-wide, so:
2016  *   - if we can failover, we should
2017  *   - if we can't failover, we should mark both the original
2018  *     vnode and the root bad
2019  */
2020 static void
2021 recov_stale(mntinfo4_t *mi, vnode_t *vp)
2022 {
2023 	rnode4_t *rp = VTOR4(vp);
2024 	vnode_t *rootvp = NULL;
2025 	nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };
2026 	nfs4_ga_res_t gar;
2027 	char *fail_msg = "failed to recover from NFS4ERR_STALE";
2028 	bool_t needrecov;
2029 
2030 	mutex_enter(&rp->r_statelock);
2031 
2032 	if (rp->r_flags & R4RECOVERR) {
2033 		mutex_exit(&rp->r_statelock);
2034 		NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
2035 		    "recov_stale: already marked dead, rp %s",
2036 		    rnode4info(rp)));
2037 		return;
2038 	}
2039 
2040 	if (rp->r_flags & R4STALE) {
2041 		mutex_exit(&rp->r_statelock);
2042 		NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
2043 		    "recov_stale: already marked stale, rp %s",
2044 		    rnode4info(rp)));
2045 		return;
2046 	}
2047 
2048 	mutex_exit(&rp->r_statelock);
2049 
2050 	/* Try a GETATTR on this vnode */
2051 	nfs4_getattr_otw_norecovery(vp, &gar, &e, CRED(), 0);
2052 
2053 	/*
2054 	 * Handle non-STALE recoverable errors
2055 	 */
2056 	needrecov = nfs4_needs_recovery(&e, FALSE, vp->v_vfsp);
2057 	if (needrecov && (e.error != 0 || e.stat != NFS4ERR_STALE)) {
2058 		(void) nfs4_start_recovery(&e, mi, vp,
2059 				NULL, NULL, NULL, OP_GETATTR, NULL);
2060 		NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
2061 		    "recov_stale: error=%d, stat=%d seen on rp %s",
2062 		    e.error, e.stat, rnode4info(rp)));
2063 		goto out;
2064 	}
2065 
2066 	/* Are things OK for this vnode? */
2067 	if (!e.error && e.stat == NFS4_OK) {
2068 		NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
2069 		    "recov_stale: file appears fine, rp %s",
2070 		    rnode4info(rp)));
2071 		goto out;
2072 	}
2073 
2074 	/* Did we get an unrelated non-recoverable error? */
2075 	if (e.error || e.stat != NFS4ERR_STALE) {
2076 		nfs4_fail_recov(vp, fail_msg, e.error, e.stat);
2077 		NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
2078 		    "recov_stale: unrelated fatal error, rp %s",
2079 		    rnode4info(rp)));
2080 		goto out;
2081 	}
2082 
2083 	/*
2084 	 * If we don't appear to be dealing with the root node, find it.
2085 	 */
2086 	if ((vp->v_flag & VROOT) == 0) {
2087 		nfs4_error_zinit(&e);
2088 		e.error = VFS_ROOT(vp->v_vfsp, &rootvp);
2089 		if (e.error) {
2090 			nfs4_fail_recov(vp, fail_msg, 0, NFS4ERR_STALE);
2091 			NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
2092 			    "recov_stale: can't find root node for rp %s",
2093 			    rnode4info(rp)));
2094 			goto out;
2095 		}
2096 	}
2097 
2098 	/* Try a GETATTR on the root vnode */
2099 	if (rootvp != NULL) {
2100 		nfs4_error_zinit(&e);
2101 		nfs4_getattr_otw_norecovery(rootvp, &gar, &e, CRED(), 0);
2102 
2103 		/* Try recovery? */
2104 		if (e.error != 0 || e.stat != NFS4ERR_STALE) {
2105 			needrecov = nfs4_needs_recovery(&e, FALSE, vp->v_vfsp);
2106 			if (needrecov) {
2107 				(void) nfs4_start_recovery(&e,
2108 					mi, rootvp, NULL, NULL, NULL,
2109 					OP_GETATTR, NULL);
2110 				NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
2111 				    "recov_stale: error=%d, stat=%d seen "
2112 				    "on rp %s", e.error, e.stat,
2113 				    rnode4info(rp)));
2114 			}
2115 		}
2116 
2117 		/*
2118 		 * Check to see if a failover attempt is warranted
2119 		 * NB: nfs4_try_failover doesn't check for STALE
2120 		 * because recov_stale gets a shot first.  Now that
2121 		 * recov_stale has failed, go ahead and try failover.
2122 		 *
2123 		 * If the getattr on the root filehandle was successful,
2124 		 * then mark recovery as failed for 'vp' and exit.
2125 		 */
2126 		if (nfs4_try_failover(&e) == 0 && e.stat != NFS4ERR_STALE) {
2127 			/*
2128 			 * pass the original error to fail_recov, not
2129 			 * the one from trying the root vnode.
2130 			 */
2131 			nfs4_fail_recov(vp, fail_msg, 0, NFS4ERR_STALE);
2132 			NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
2133 			    "recov_stale: root node OK, marking "
2134 			    "dead rp %s", rnode4info(rp)));
2135 			goto out;
2136 		}
2137 	}
2138 
2139 	/*
2140 	 * Here, we know that both the original file and the
2141 	 * root filehandle (which may be the same) are stale.
2142 	 * We want to fail over if we can, and if we can't, we
2143 	 * want to mark everything in sight bad.
2144 	 */
2145 	if (FAILOVER_MOUNT4(mi)) {
2146 		mutex_enter(&mi->mi_lock);
2147 		mi->mi_recovflags |= MI4R_NEED_NEW_SERVER;
2148 		NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
2149 		    "recov_stale: failing over due to rp %s",
2150 		    rnode4info(rp)));
2151 		mutex_exit(&mi->mi_lock);
2152 	} else {
2153 		rnode4_t *rootrp;
2154 		servinfo4_t *svp;
2155 
2156 		/*
2157 		 * Can't fail over, so mark things dead.
2158 		 *
2159 		 * If rootvp is set, we know we have a distinct
2160 		 * non-root vnode which can be marked dead in
2161 		 * the usual way.
2162 		 *
2163 		 * Then we want to mark the root vnode dead.
2164 		 * Note that if rootvp wasn't set, our vp is
2165 		 * actually the root vnode.
2166 		 */
2167 		if (rootvp != NULL) {
2168 			NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
2169 			    "recov_stale: can't fail over, marking dead rp %s",
2170 			    rnode4info(rp)));
2171 			nfs4_fail_recov(vp, fail_msg, 0, NFS4ERR_STALE);
2172 		} else {
2173 			rootvp = vp;
2174 			VN_HOLD(rootvp);
2175 		}
2176 
2177 		/*
2178 		 * Mark root dead, but quietly - since
2179 		 * the root rnode is frequently recreated,
2180 		 * we can encounter this at every access.
2181 		 * Also mark recovery as failed on this VFS.
2182 		 */
2183 		rootrp = VTOR4(rootvp);
2184 		NFS4_DEBUG(nfs4_client_recov_debug, (CE_CONT,
2185 		    "recov_stale: marking dead root rp %s",
2186 		    rnode4info(rootrp)));
2187 		mutex_enter(&rootrp->r_statelock);
2188 		rootrp->r_flags |= (R4RECOVERR | R4STALE);
2189 		rootrp->r_error = ESTALE;
2190 		mutex_exit(&rootrp->r_statelock);
2191 		mutex_enter(&mi->mi_lock);
2192 		mi->mi_error = ESTALE;
2193 		mutex_exit(&mi->mi_lock);
2194 
2195 		svp = mi->mi_curr_serv;
2196 		(void) nfs_rw_enter_sig(&svp->sv_lock, RW_WRITER, 0);
2197 		svp->sv_flags |= SV4_ROOT_STALE;
2198 		nfs_rw_exit(&svp->sv_lock);
2199 	}
2200 
2201 out:
2202 	if (rootvp)
2203 		VN_RELE(rootvp);
2204 }
2205 
2206 /*
2207  * Locks.
2208  */
2209 
2210 /*
2211  * Reclaim all the active (acquired) locks for the given file.
2212  * If a process lost a lock, the process is sent a SIGLOST.  This is not
2213  * considered an error.
2214  *
2215  * Return values:
2216  * Errors and status are returned via the nfs4_error_t parameter
2217  * If an error indicates that recovery is needed, the caller is responsible
2218  * for dealing with it.
2219  */
2220 
2221 static void
2222 relock_file(vnode_t *vp, mntinfo4_t *mi, nfs4_error_t *ep,
2223     fattr4_change pre_change)
2224 {
2225 	locklist_t *locks, *llp;
2226 	rnode4_t *rp;
2227 
2228 	ASSERT(ep != NULL);
2229 	nfs4_error_zinit(ep);
2230 
2231 	if (VTOMI4(vp)->mi_flags & MI4_LLOCK)
2232 		return;
2233 
2234 	nfs4_flush_lock_owners(VTOR4(vp));
2235 
2236 	/*
2237 	 * If we get an error that requires recovery actions, just bail out
2238 	 * and let the top-level recovery code handle it.
2239 	 *
2240 	 * If we get some other error, kill the process that owned the lock
2241 	 * and mark its remaining locks (if any) as belonging to NOPID, so
2242 	 * that we don't make any more reclaim requests for that process.
2243 	 */
2244 
2245 	rp = VTOR4(vp);
2246 	locks = flk_active_locks_for_vp(vp);
2247 	for (llp = locks; llp != NULL; llp = llp->ll_next) {
2248 		int did_reclaim = 1;
2249 
2250 		ASSERT(llp->ll_vp == vp);
2251 		if (llp->ll_flock.l_pid == NOPID)
2252 			continue;
2253 		reclaim_one_lock(vp, &llp->ll_flock, ep, &did_reclaim);
2254 		/*
2255 		 * If we need to restart recovery, stop processing the
2256 		 * list.  Some errors would be recoverable under other
2257 		 * circumstances, but if they happen here we just give up
2258 		 * on the lock.
2259 		 */
2260 		if (nfs4_needs_recovery(ep, TRUE, vp->v_vfsp)) {
2261 			if (ep->error != 0)
2262 				break;
2263 			if (!nfs4_recov_marks_dead(ep->stat))
2264 				break;
2265 		}
2266 		/*
2267 		 *   In case the server isn't offering us a grace period, or
2268 		 * if we missed it, we might have opened & locked from scratch,
2269 		 * rather than reopened/reclaimed.
2270 		 *   We need to ensure that the object hadn't been otherwise
2271 		 * changed during this time, by comparing the changeinfo.
2272 		 *   We get passed the changeinfo from before the reopen by our
2273 		 * caller, in pre_change.
2274 		 *   The changeinfo from after the reopen is in rp->r_change,
2275 		 * courtesy of the GETATTR in the reopen.
2276 		 *   If they're different, then the file has changed, and we
2277 		 * have to SIGLOST the app.
2278 		 */
2279 		if (ep->error == 0 && ep->stat == NFS4_OK && !did_reclaim) {
2280 			mutex_enter(&rp->r_statelock);
2281 			if (pre_change != rp->r_change)
2282 				ep->stat = NFS4ERR_NO_GRACE;
2283 			mutex_exit(&rp->r_statelock);
2284 		}
2285 		if (ep->error != 0 || ep->stat != NFS4_OK) {
2286 			if (ep->error != 0)
2287 				nfs4_queue_event(RE_FAIL_RELOCK, mi,
2288 				    NULL, ep->error, vp, NULL, 0, NULL,
2289 				    llp->ll_flock.l_pid, TAG_NONE, TAG_NONE,
2290 				    0, 0);
2291 			else
2292 				nfs4_queue_event(RE_FAIL_RELOCK, mi,
2293 				    NULL, 0, vp, NULL, ep->stat, NULL,
2294 				    llp->ll_flock.l_pid, TAG_NONE, TAG_NONE,
2295 				    0, 0);
2296 			nfs4_send_siglost(llp->ll_flock.l_pid, mi, vp, TRUE,
2297 			    ep->error, ep->stat);
2298 			relock_skip_pid(llp, llp->ll_flock.l_pid);
2299 
2300 			/* Reinitialize the nfs4_error and continue */
2301 			nfs4_error_zinit(ep);
2302 		}
2303 	}
2304 
2305 	if (locks != NULL)
2306 		flk_free_locklist(locks);
2307 }
2308 
2309 /*
2310  * Reclaim the given lock.
2311  * If the lock can't be reclaimed, the process is sent SIGLOST, but this is
2312  * not considered an error.
2313  *
2314  * Errors are returned via the nfs4_error_t parameter.
2315  */
2316 static void
2317 reclaim_one_lock(vnode_t *vp, flock64_t *flk, nfs4_error_t *ep,
2318 	int *did_reclaimp)
2319 {
2320 	cred_t *cr;
2321 	rnode4_t *rp = VTOR4(vp);
2322 
2323 	cr = pid_to_cr(flk->l_pid);
2324 	if (cr == NULL) {
2325 		nfs4_error_zinit(ep);
2326 		ep->error = ESRCH;
2327 		return;
2328 	}
2329 
2330 	do {
2331 		mutex_enter(&rp->r_statelock);
2332 		if (rp->r_flags & R4RECOVERR) {
2333 			/*
2334 			 * This shouldn't affect other reclaims, so don't
2335 			 * return an error.
2336 			 */
2337 			mutex_exit(&rp->r_statelock);
2338 			break;
2339 		}
2340 		mutex_exit(&rp->r_statelock);
2341 
2342 		nfs4frlock(NFS4_LCK_CTYPE_RECLAIM, vp, F_SETLK, flk,
2343 				FREAD|FWRITE, 0, cr, ep, NULL, did_reclaimp);
2344 		if (ep->error == 0 && ep->stat == NFS4ERR_FHEXPIRED)
2345 			start_recovery_action(NR_FHEXPIRED, TRUE, VTOMI4(vp),
2346 					    vp, NULL);
2347 	} while (ep->error == 0 && ep->stat == NFS4ERR_FHEXPIRED);
2348 
2349 	crfree(cr);
2350 }
2351 
2352 /*
2353  * Open files.
2354  */
2355 
2356 /*
2357  * Verifies if the nfsstat4 is a valid error for marking this vnode dead.
2358  * Returns 1 if the error is valid; 0 otherwise.
2359  */
2360 static int
2361 nfs4_valid_recov_err_for_vp(vnode_t *vp, nfsstat4 stat)
2362 {
2363 	/*
2364 	 * We should not be marking non-regular files as dead,
2365 	 * except in very rare cases (eg: BADHANDLE or NFS4ERR_BADNAME).
2366 	 */
2367 	if (vp->v_type != VREG && stat != NFS4ERR_BADHANDLE &&
2368 	    stat != NFS4ERR_BADNAME)
2369 		return (0);
2370 
2371 	return (1);
2372 }
2373 
2374 /*
2375  * Failed attempting to recover a filehandle.  If 'stat' is valid for 'vp',
2376  * then mark the object dead.  Since we've had to do a lookup for
2377  * filehandle recovery, we will mark the object dead if we got NOENT.
2378  */
2379 static void
2380 nfs4_recov_fh_fail(vnode_t *vp, int error, nfsstat4 stat)
2381 {
2382 	ASSERT(vp != NULL);
2383 
2384 	if ((error == 0) && (stat != NFS4ERR_NOENT) &&
2385 	    (!nfs4_valid_recov_err_for_vp(vp, stat)))
2386 		return;
2387 
2388 	nfs4_fail_recov(vp, "can't recover filehandle", error, stat);
2389 }
2390 
2391 /*
2392  * Recovery from a "shouldn't happen" error.  In the long term, we'd like
2393  * to mark only the data structure(s) that provided the bad value as being
2394  * bad.  But for now we'll just mark the entire file.
2395  */
2396 
2397 static void
2398 recov_badstate(recov_info_t *recovp, vnode_t *vp, nfsstat4 stat)
2399 {
2400 	ASSERT(vp != NULL);
2401 	recov_throttle(recovp, vp);
2402 
2403 	if (!nfs4_valid_recov_err_for_vp(vp, stat))
2404 		return;
2405 
2406 	nfs4_fail_recov(vp, "", 0, stat);
2407 }
2408 
2409 /*
2410  * Free up the information saved for a lost state request.
2411  */
2412 static void
2413 nfs4_free_lost_rqst(nfs4_lost_rqst_t *lrp, nfs4_server_t *sp)
2414 {
2415 	component4 *filep;
2416 	nfs4_open_stream_t *osp;
2417 	int have_sync_lock;
2418 
2419 	NFS4_DEBUG(nfs4_lost_rqst_debug,
2420 		(CE_NOTE, "nfs4_free_lost_rqst:"));
2421 
2422 	switch (lrp->lr_op) {
2423 	case OP_OPEN:
2424 		filep = &lrp->lr_ofile;
2425 		if (filep->utf8string_val) {
2426 			kmem_free(filep->utf8string_val, filep->utf8string_len);
2427 			filep->utf8string_val = NULL;
2428 		}
2429 		break;
2430 	case OP_DELEGRETURN:
2431 		nfs4delegreturn_cleanup(VTOR4(lrp->lr_vp), sp);
2432 		break;
2433 	case OP_CLOSE:
2434 		osp = lrp->lr_osp;
2435 		ASSERT(osp != NULL);
2436 		mutex_enter(&osp->os_sync_lock);
2437 		have_sync_lock = 1;
2438 		if (osp->os_pending_close) {
2439 			/* clean up the open file state. */
2440 			osp->os_pending_close = 0;
2441 			nfs4close_notw(lrp->lr_vp, osp, &have_sync_lock);
2442 		}
2443 		if (have_sync_lock)
2444 			mutex_exit(&osp->os_sync_lock);
2445 		break;
2446 	}
2447 
2448 	lrp->lr_op = 0;
2449 	if (lrp->lr_oop != NULL) {
2450 		open_owner_rele(lrp->lr_oop);
2451 		lrp->lr_oop = NULL;
2452 	}
2453 	if (lrp->lr_osp != NULL) {
2454 		open_stream_rele(lrp->lr_osp, VTOR4(lrp->lr_vp));
2455 		lrp->lr_osp = NULL;
2456 	}
2457 	if (lrp->lr_lop != NULL) {
2458 		lock_owner_rele(lrp->lr_lop);
2459 		lrp->lr_lop = NULL;
2460 	}
2461 	if (lrp->lr_flk != NULL) {
2462 		kmem_free(lrp->lr_flk, sizeof (flock64_t));
2463 		lrp->lr_flk = NULL;
2464 	}
2465 	if (lrp->lr_vp != NULL) {
2466 		VN_RELE(lrp->lr_vp);
2467 		lrp->lr_vp = NULL;
2468 	}
2469 	if (lrp->lr_dvp != NULL) {
2470 		VN_RELE(lrp->lr_dvp);
2471 		lrp->lr_dvp = NULL;
2472 	}
2473 	if (lrp->lr_cr != NULL) {
2474 		crfree(lrp->lr_cr);
2475 		lrp->lr_cr = NULL;
2476 	}
2477 
2478 	kmem_free(lrp, sizeof (nfs4_lost_rqst_t));
2479 }
2480 
2481 /*
2482  * Remove any lost state requests and free them.
2483  */
2484 static void
2485 nfs4_remove_lost_rqsts(mntinfo4_t *mi, nfs4_server_t *sp)
2486 {
2487 	nfs4_lost_rqst_t *lrp;
2488 
2489 	mutex_enter(&mi->mi_lock);
2490 	while ((lrp = list_head(&mi->mi_lost_state)) != NULL) {
2491 		list_remove(&mi->mi_lost_state, lrp);
2492 		mutex_exit(&mi->mi_lock);
2493 		nfs4_free_lost_rqst(lrp, sp);
2494 		mutex_enter(&mi->mi_lock);
2495 	}
2496 	mutex_exit(&mi->mi_lock);
2497 }
2498 
2499 /*
2500  * Reopen all the files for the given filesystem and reclaim any locks.
2501  */
2502 
2503 static void
2504 recov_openfiles(recov_info_t *recovp, nfs4_server_t *sp)
2505 {
2506 	mntinfo4_t *mi = recovp->rc_mi;
2507 	nfs4_opinst_t *reopenlist = NULL, *rep;
2508 	nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };
2509 	open_claim_type4 claim;
2510 	int remap;
2511 	char *fail_msg = "No such file or directory on replica";
2512 	rnode4_t *rp;
2513 	fattr4_change pre_change;
2514 
2515 	ASSERT(sp != NULL);
2516 
2517 	/*
2518 	 * This check is to allow a 10ms pause before we reopen files
2519 	 * it should allow the server time to have received the CB_NULL
2520 	 * reply and update its internal structures such that (if
2521 	 * applicable) we are granted a delegation on reopened files.
2522 	 */
2523 	mutex_enter(&sp->s_lock);
2524 	if ((sp->s_flags & (N4S_CB_PINGED | N4S_CB_WAITER)) == 0) {
2525 		sp->s_flags |= N4S_CB_WAITER;
2526 		(void) cv_timedwait(&sp->wait_cb_null, &sp->s_lock,
2527 			(lbolt+drv_usectohz(N4S_CB_PAUSE_TIME)));
2528 	}
2529 	mutex_exit(&sp->s_lock);
2530 
2531 	(void) nfs_rw_enter_sig(&sp->s_recovlock, RW_READER, 0);
2532 	(void) nfs_rw_enter_sig(&mi->mi_recovlock, RW_WRITER, 0);
2533 
2534 	if (NFS4_VOLATILE_FH(mi)) {
2535 		nfs4_remap_root(mi, &e, 0);
2536 		if (nfs4_needs_recovery(&e, FALSE, mi->mi_vfsp)) {
2537 			(void) nfs4_start_recovery(&e, mi, NULL,
2538 					NULL, NULL, NULL, OP_LOOKUP, NULL);
2539 		}
2540 	}
2541 
2542 	mutex_enter(&mi->mi_lock);
2543 	if (recovp->rc_srv_reboot || (mi->mi_recovflags & MI4R_SRV_REBOOT))
2544 		claim = CLAIM_PREVIOUS;
2545 	else
2546 		claim = CLAIM_NULL;
2547 	mutex_exit(&mi->mi_lock);
2548 
2549 	if (e.error == 0 && e.stat == NFS4_OK) {
2550 		/*
2551 		 * Get a snapshot of open files in the filesystem.  Note
2552 		 * that new opens will stall until the server's grace
2553 		 * period is done.
2554 		 */
2555 		reopenlist = r4mkopenlist(mi);
2556 
2557 		mutex_enter(&mi->mi_lock);
2558 		remap = mi->mi_recovflags & MI4R_REMAP_FILES;
2559 		mutex_exit(&mi->mi_lock);
2560 		/*
2561 		 * Since we are re-establishing state on the
2562 		 * server, its ok to blow away the saved lost
2563 		 * requests since we don't need to reissue it.
2564 		 */
2565 		nfs4_remove_lost_rqsts(mi, sp);
2566 
2567 		for (rep = reopenlist; rep; rep = rep->re_next) {
2568 
2569 			if (remap) {
2570 				nfs4_remap_file(mi, rep->re_vp,
2571 					NFS4_REMAP_CKATTRS, &e);
2572 			}
2573 			if (e.error == ENOENT || e.stat == NFS4ERR_NOENT) {
2574 				/*
2575 				 * The current server does not have the file
2576 				 * that is to be remapped.  This is most
2577 				 * likely due to an improperly maintained
2578 				 * replica.   The files that are missing from
2579 				 * the server will be marked dead and logged
2580 				 * in order to make sys admins aware of the
2581 				 * problem.
2582 				 */
2583 				nfs4_fail_recov(rep->re_vp,
2584 					fail_msg, e.error, e.stat);
2585 				/*
2586 				 * We've already handled the error so clear it.
2587 				 */
2588 				nfs4_error_zinit(&e);
2589 				continue;
2590 			} else if (e.error == 0 && e.stat == NFS4_OK) {
2591 				int j;
2592 
2593 				rp = VTOR4(rep->re_vp);
2594 				mutex_enter(&rp->r_statelock);
2595 				pre_change = rp->r_change;
2596 				mutex_exit(&rp->r_statelock);
2597 
2598 				for (j = 0; j < rep->re_numosp; j++) {
2599 					nfs4_reopen(rep->re_vp, rep->re_osp[j],
2600 						&e, claim, FALSE, TRUE);
2601 					if (e.error != 0 || e.stat != NFS4_OK)
2602 						break;
2603 				}
2604 				if (nfs4_needs_recovery(&e, TRUE,
2605 				    mi->mi_vfsp)) {
2606 					(void) nfs4_start_recovery(&e, mi,
2607 						rep->re_vp, NULL, NULL, NULL,
2608 						OP_OPEN, NULL);
2609 					break;
2610 				}
2611 			}
2612 #ifdef DEBUG
2613 			if (nfs4_recovdelay > 0)
2614 				delay(MSEC_TO_TICK(nfs4_recovdelay * 1000));
2615 #endif
2616 			if (e.error == 0 && e.stat == NFS4_OK)
2617 				relock_file(rep->re_vp, mi, &e, pre_change);
2618 
2619 			if (nfs4_needs_recovery(&e, TRUE, mi->mi_vfsp))
2620 				(void) nfs4_start_recovery(&e, mi,
2621 					rep->re_vp, NULL, NULL, NULL, OP_LOCK,
2622 					NULL);
2623 			if (e.error != 0 || e.stat != NFS4_OK)
2624 				break;
2625 		}
2626 
2627 		/*
2628 		 * Check to see if we need to remap files passed in
2629 		 * via the recovery arguments; this will have been
2630 		 * done for open files.  A failure here is not fatal.
2631 		 */
2632 		if (remap) {
2633 			nfs4_error_t ignore;
2634 			nfs4_check_remap(mi, recovp->rc_vp1, NFS4_REMAP_CKATTRS,
2635 				&ignore);
2636 			nfs4_check_remap(mi, recovp->rc_vp2, NFS4_REMAP_CKATTRS,
2637 				&ignore);
2638 		}
2639 	}
2640 
2641 	if (e.error == 0 && e.stat == NFS4_OK) {
2642 		mutex_enter(&mi->mi_lock);
2643 		mi->mi_recovflags &= ~(MI4R_REOPEN_FILES | MI4R_REMAP_FILES);
2644 		mutex_exit(&mi->mi_lock);
2645 	}
2646 
2647 	nfs_rw_exit(&mi->mi_recovlock);
2648 	nfs_rw_exit(&sp->s_recovlock);
2649 
2650 	if (reopenlist != NULL)
2651 		r4releopenlist(reopenlist);
2652 }
2653 
2654 /*
2655  * Resend the queued state recovery requests in "rqsts".
2656  */
2657 
2658 static void
2659 nfs4_resend_lost_rqsts(recov_info_t *recovp, nfs4_server_t *sp)
2660 {
2661 	nfs4_lost_rqst_t	*lrp, *tlrp;
2662 	mntinfo4_t		*mi = recovp->rc_mi;
2663 	nfs4_error_t		n4e;
2664 #ifdef NOTYET
2665 	uint32_t		deny_bits = 0;
2666 #endif
2667 
2668 	NFS4_DEBUG(nfs4_lost_rqst_debug, (CE_NOTE, "nfs4_resend_lost_rqsts"));
2669 
2670 	ASSERT(mi != NULL);
2671 	ASSERT(nfs_rw_lock_held(&mi->mi_recovlock, RW_WRITER));
2672 
2673 	mutex_enter(&mi->mi_lock);
2674 	lrp = list_head(&mi->mi_lost_state);
2675 	mutex_exit(&mi->mi_lock);
2676 	while (lrp != NULL) {
2677 		nfs4_error_zinit(&n4e);
2678 		resend_one_op(lrp, &n4e, mi, sp);
2679 		NFS4_DEBUG(nfs4_lost_rqst_debug, (CE_NOTE,
2680 		    "nfs4_resend_lost_rqsts: resend request: for vp %p got "
2681 		    "error %d stat %d", (void *)lrp->lr_vp, n4e.error,
2682 		    n4e.stat));
2683 
2684 		/*
2685 		 * If we get a recovery error that we can actually
2686 		 * recover from (such as ETIMEDOUT, FHEXPIRED), we
2687 		 * return and let the recovery thread redrive the call.
2688 		 * Don't requeue unless the zone is still healthy.
2689 		 */
2690 		if (zone_status_get(curproc->p_zone) < ZONE_IS_SHUTTING_DOWN &&
2691 		    nfs4_needs_recovery(&n4e, TRUE, mi->mi_vfsp) &&
2692 		    (nfs4_try_failover(&n4e) ||
2693 		    NFS4_FRC_UNMT_ERR(n4e.error, mi->mi_vfsp) ||
2694 		    (n4e.error == 0 && n4e.stat != NFS4ERR_BADHANDLE &&
2695 		    !nfs4_recov_marks_dead(n4e.stat)))) {
2696 			/*
2697 			 * For these three errors, we want to delay a bit
2698 			 * instead of pounding the server into submission.
2699 			 * We have to do this manually; the normal
2700 			 * processing for these errors only works for
2701 			 * non-recovery requests.
2702 			 */
2703 			if ((n4e.error == 0 && n4e.stat == NFS4ERR_DELAY) ||
2704 			    (n4e.error == 0 && n4e.stat == NFS4ERR_GRACE) ||
2705 			    (n4e.error == 0 && n4e.stat == NFS4ERR_RESOURCE) ||
2706 			    NFS4_FRC_UNMT_ERR(n4e.error, mi->mi_vfsp)) {
2707 				delay(SEC_TO_TICK(nfs4err_delay_time));
2708 			} else {
2709 				(void) nfs4_start_recovery(&n4e,
2710 					mi, lrp->lr_dvp, lrp->lr_vp, NULL, NULL,
2711 					lrp->lr_op, NULL);
2712 			}
2713 			return;
2714 		}
2715 
2716 		mutex_enter(&mi->mi_lock);
2717 		list_remove(&mi->mi_lost_state, lrp);
2718 		tlrp = lrp;
2719 		lrp = list_head(&mi->mi_lost_state);
2720 		mutex_exit(&mi->mi_lock);
2721 		nfs4_free_lost_rqst(tlrp, sp);
2722 	}
2723 }
2724 
2725 /*
2726  * Resend the given op, and issue any necessary undo call.
2727  * errors are returned via the nfs4_error_t parameter.
2728  */
2729 
2730 static void
2731 resend_one_op(nfs4_lost_rqst_t *lrp, nfs4_error_t *ep,
2732 	mntinfo4_t *mi, nfs4_server_t *sp)
2733 {
2734 	vnode_t *vp;
2735 	nfs4_open_stream_t *osp;
2736 	cred_t *cr;
2737 	uint32_t acc_bits;
2738 
2739 	vp = lrp->lr_vp;
2740 	NFS4_DEBUG(nfs4_lost_rqst_debug, (CE_NOTE, "resend_one_op: "
2741 	    "have a lost open/close request for vp %p", (void *)vp));
2742 
2743 	switch (lrp->lr_op) {
2744 	case OP_OPEN:
2745 		nfs4_resend_open_otw(&vp, lrp, ep);
2746 		break;
2747 	case OP_OPEN_DOWNGRADE:
2748 		ASSERT(lrp->lr_oop != NULL);
2749 		ep->error = nfs4_start_open_seqid_sync(lrp->lr_oop, mi);
2750 		ASSERT(!ep->error);	/* recov thread always succeeds */
2751 		ASSERT(lrp->lr_osp != NULL);
2752 		mutex_enter(&lrp->lr_osp->os_sync_lock);
2753 		nfs4_open_downgrade(lrp->lr_dg_acc, lrp->lr_dg_deny,
2754 			    lrp->lr_oop, lrp->lr_osp, vp, lrp->lr_cr, lrp,
2755 			    ep, NULL, NULL);
2756 		mutex_exit(&lrp->lr_osp->os_sync_lock);
2757 		nfs4_end_open_seqid_sync(lrp->lr_oop);
2758 		break;
2759 	case OP_CLOSE:
2760 		osp = lrp->lr_osp;
2761 		cr = lrp->lr_cr;
2762 		acc_bits = 0;
2763 		mutex_enter(&osp->os_sync_lock);
2764 		if (osp->os_share_acc_read)
2765 			acc_bits |= OPEN4_SHARE_ACCESS_READ;
2766 		if (osp->os_share_acc_write)
2767 			acc_bits |= OPEN4_SHARE_ACCESS_WRITE;
2768 		mutex_exit(&osp->os_sync_lock);
2769 		nfs4close_one(vp, osp, cr, acc_bits, lrp, ep,
2770 				CLOSE_RESEND, 0, 0, 0);
2771 		break;
2772 	case OP_LOCK:
2773 	case OP_LOCKU:
2774 		resend_lock(lrp, ep);
2775 		goto done;
2776 	case OP_DELEGRETURN:
2777 		nfs4_resend_delegreturn(lrp, ep, sp);
2778 		goto done;
2779 	default:
2780 #ifdef DEBUG
2781 		cmn_err(CE_PANIC, "resend_one_op: unexpected op: %d",
2782 			lrp->lr_op);
2783 #endif
2784 		nfs4_queue_event(RE_LOST_STATE_BAD_OP, mi, NULL,
2785 		    lrp->lr_op, lrp->lr_vp, lrp->lr_dvp, NFS4_OK, NULL, 0,
2786 		    TAG_NONE, TAG_NONE, 0, 0);
2787 		nfs4_error_init(ep, EINVAL);
2788 		return;
2789 	}
2790 
2791 	/*
2792 	 * No need to retry nor send an "undo" CLOSE in the
2793 	 * event the server rebooted.
2794 	 */
2795 	if (ep->error == 0 && (ep->stat == NFS4ERR_STALE_CLIENTID ||
2796 	    ep->stat == NFS4ERR_STALE_STATEID || ep->stat == NFS4ERR_EXPIRED))
2797 		goto done;
2798 
2799 	/*
2800 	 * If we resent a CLOSE or OPEN_DOWNGRADE, there's nothing
2801 	 * to undo.  Undoing locking operations was handled by
2802 	 * resend_lock().
2803 	 */
2804 	if (lrp->lr_op == OP_OPEN_DOWNGRADE || lrp->lr_op == OP_CLOSE)
2805 		goto done;
2806 
2807 	/*
2808 	 * If we get any other error for OPEN, then don't attempt
2809 	 * to undo the resend of the open (since it was never
2810 	 * successful!).
2811 	 */
2812 	ASSERT(lrp->lr_op == OP_OPEN);
2813 	if (ep->error || ep->stat != NFS4_OK)
2814 		goto done;
2815 
2816 	/*
2817 	 * Now let's undo our OPEN.
2818 	 */
2819 	nfs4_error_zinit(ep);
2820 	close_after_open_resend(vp, lrp->lr_cr, lrp->lr_oacc, ep);
2821 	NFS4_DEBUG(nfs4_lost_rqst_debug, (CE_NOTE, "resend_one_op: "
2822 	    "nfs4close_one: for vp %p got error %d stat %d",
2823 	    (void *)vp, ep->error, ep->stat));
2824 
2825 done:
2826 	if (vp != lrp->lr_vp)
2827 		VN_RELE(vp);
2828 }
2829 
2830 /*
2831  * Close a file that was opened via a resent OPEN.
2832  * Most errors are passed back to the caller (via the return value and
2833  * *statp), except for FHEXPIRED, which is retried.
2834  *
2835  * It might be conceptually cleaner to push the CLOSE request onto the
2836  * front of the resend queue, rather than sending it here.  That would
2837  * match the way we undo lost lock requests.  On the other
2838  * hand, we've already got something that works, and there's no reason to
2839  * change it at this time.
2840  */
2841 
2842 static void
2843 close_after_open_resend(vnode_t *vp, cred_t *cr, uint32_t acc_bits,
2844 			nfs4_error_t *ep)
2845 {
2846 
2847 	for (;;) {
2848 		nfs4close_one(vp, NULL, cr, acc_bits, NULL, ep,
2849 				CLOSE_AFTER_RESEND, 0, 0, 0);
2850 		if (ep->error == 0 && ep->stat == NFS4_OK)
2851 			break;		/* success; done */
2852 		if (ep->error != 0 || ep->stat != NFS4ERR_FHEXPIRED)
2853 			break;
2854 		/* else retry FHEXPIRED */
2855 	}
2856 
2857 }
2858 
2859 /*
2860  * Resend the given lost lock request.  Return an errno value.  If zero,
2861  * *statp is set to the NFS status code for the call.
2862  *
2863  * Issue a SIGLOST and mark the rnode dead if we get a non-recovery error or
2864  * a recovery error that we don't actually recover from yet (eg: BAD_SEQID).
2865  * Let the recovery thread redrive the call if we get a recovery error that
2866  * we can actually recover from.
2867  */
2868 static void
2869 resend_lock(nfs4_lost_rqst_t *lrp, nfs4_error_t *ep)
2870 {
2871 	bool_t		send_siglost = FALSE;
2872 	vnode_t		*vp = lrp->lr_vp;
2873 
2874 	NFS4_DEBUG(nfs4_lost_rqst_debug, (CE_NOTE, "resend_lock:"));
2875 	ASSERT(lrp->lr_ctype == NFS4_LCK_CTYPE_REINSTATE ||
2876 	    lrp->lr_ctype == NFS4_LCK_CTYPE_RESEND);
2877 
2878 	nfs4frlock(lrp->lr_ctype, vp, F_SETLK,
2879 		    lrp->lr_flk, FREAD|FWRITE, 0, lrp->lr_cr, ep, lrp, NULL);
2880 
2881 	NFS4_DEBUG(nfs4_lost_rqst_debug, (CE_NOTE, "resend_lock: "
2882 	    "nfs4frlock for vp %p returned error %d, stat %d",
2883 	    (void *)vp, ep->error, ep->stat));
2884 
2885 	if (ep->error == 0 && ep->stat == 0)
2886 		goto done;
2887 	if (ep->error == 0 && ep->stat == NFS4ERR_DENIED &&
2888 	    lrp->lr_ctype == NFS4_LCK_CTYPE_RESEND)
2889 		goto done;
2890 
2891 	/*
2892 	 * If we failed with a non-recovery error, send SIGLOST and
2893 	 * mark the file dead.
2894 	 */
2895 	if (!nfs4_needs_recovery(ep, TRUE, vp->v_vfsp))
2896 		send_siglost = TRUE;
2897 	else {
2898 		/*
2899 		 * Done with recovering LOST LOCK in the event the
2900 		 * server rebooted or we've lost the lease.
2901 		 */
2902 		if (ep->error == 0 && (ep->stat == NFS4ERR_STALE_CLIENTID ||
2903 		    ep->stat == NFS4ERR_STALE_STATEID ||
2904 		    ep->stat == NFS4ERR_EXPIRED)) {
2905 			goto done;
2906 		}
2907 
2908 		/*
2909 		 * BAD_STATEID on an unlock indicates that the server has
2910 		 * forgotten about the lock anyway, so act like the call
2911 		 * was successful.
2912 		 */
2913 		if (ep->error == 0 && ep->stat == NFS4ERR_BAD_STATEID &&
2914 		    lrp->lr_op == OP_LOCKU)
2915 			goto done;
2916 
2917 		/*
2918 		 * If we got a recovery error that we don't actually
2919 		 * recover from, send SIGLOST.  If the filesystem was
2920 		 * forcibly unmounted, we skip the SIGLOST because (a) it's
2921 		 * unnecessary noise, and (b) there could be a new process
2922 		 * with the same pid as the one that had generated the lost
2923 		 * state request.
2924 		 */
2925 		if (ep->error == 0 && (ep->stat == NFS4ERR_BADHANDLE ||
2926 		    nfs4_recov_marks_dead(ep->stat))) {
2927 			if (!(vp->v_vfsp->vfs_flag & VFS_UNMOUNTED))
2928 				send_siglost = TRUE;
2929 			goto done;
2930 		}
2931 
2932 		/*
2933 		 * If the filesystem was forcibly unmounted, we
2934 		 * still need to synchronize with the server and
2935 		 * release state.  Try again later.
2936 		 */
2937 		if (NFS4_FRC_UNMT_ERR(ep->error, vp->v_vfsp))
2938 			goto done;
2939 
2940 		/*
2941 		 * If we get a recovery error that we can actually
2942 		 * recover from (such as ETIMEDOUT, FHEXPIRED),
2943 		 * return and let the recovery thread redrive the call.
2944 		 *
2945 		 * For the three errors below, we want to delay a bit
2946 		 * instead of pounding the server into submission.
2947 		 */
2948 		if ((ep->error == 0 && ep->stat == NFS4ERR_DELAY) ||
2949 		    (ep->error == 0 && ep->stat == NFS4ERR_GRACE) ||
2950 		    (ep->error == 0 && ep->stat == NFS4ERR_RESOURCE))
2951 			delay(SEC_TO_TICK(recov_err_delay));
2952 		goto done;
2953 	}
2954 
2955 done:
2956 	if (send_siglost) {
2957 		cred_t *sv_cred;
2958 
2959 		/*
2960 		 * Must be root or the actual thread being issued the
2961 		 * SIGLOST for this to work, so just become root.
2962 		 */
2963 		sv_cred = curthread->t_cred;
2964 		curthread->t_cred = kcred;
2965 		nfs4_send_siglost(lrp->lr_flk->l_pid, VTOMI4(vp), vp, FALSE,
2966 		    ep->error, ep->stat);
2967 		curthread->t_cred = sv_cred;
2968 
2969 		/*
2970 		 * Flush any additional reinstantiation requests for
2971 		 * this operation.  Sending multiple SIGLOSTs to the user
2972 		 * process is unlikely to help and may cause trouble.
2973 		 */
2974 		if (lrp->lr_ctype == NFS4_LCK_CTYPE_REINSTATE)
2975 			flush_reinstate(lrp);
2976 	}
2977 }
2978 
2979 /*
2980  * Remove any lock reinstantiation requests that correspond to the given
2981  * lost request.  We only remove items that follow lrp in the queue,
2982  * assuming that lrp will be removed by the generic lost state code.
2983  */
2984 
2985 static void
2986 flush_reinstate(nfs4_lost_rqst_t *lrp)
2987 {
2988 	vnode_t *vp;
2989 	pid_t pid;
2990 	mntinfo4_t *mi;
2991 	nfs4_lost_rqst_t *nlrp;
2992 
2993 	vp = lrp->lr_vp;
2994 	mi = VTOMI4(vp);
2995 	pid = lrp->lr_flk->l_pid;
2996 
2997 	/*
2998 	 * If there are any more reinstantation requests to get rid of,
2999 	 * they should all be clustered at the front of the lost state
3000 	 * queue.
3001 	 */
3002 	mutex_enter(&mi->mi_lock);
3003 	for (lrp = list_next(&mi->mi_lost_state, lrp); lrp != NULL;
3004 	    lrp = nlrp) {
3005 		nlrp = list_next(&mi->mi_lost_state, lrp);
3006 		if (lrp->lr_op != OP_LOCK && lrp->lr_op != OP_LOCKU)
3007 			break;
3008 		if (lrp->lr_ctype != NFS4_LCK_CTYPE_REINSTATE)
3009 			break;
3010 		ASSERT(lrp->lr_vp == vp);
3011 		ASSERT(lrp->lr_flk->l_pid == pid);
3012 		NFS4_DEBUG(nfs4_lost_rqst_debug, (CE_NOTE,
3013 				"remove reinstantiation %p", (void *)lrp));
3014 		list_remove(&mi->mi_lost_state, lrp);
3015 		nfs4_free_lost_rqst(lrp, NULL);
3016 	}
3017 	mutex_exit(&mi->mi_lock);
3018 }
3019 
3020 /*
3021  * End of state-specific recovery routines.
3022  */
3023 
3024 /*
3025  * Allocate a lost request struct, initialize it from lost_rqstp (including
3026  * bumping the reference counts for the referenced vnode, etc.), and hang
3027  * it off of recovp.
3028  */
3029 
3030 static void
3031 nfs4_save_lost_rqst(nfs4_lost_rqst_t *lost_rqstp, recov_info_t *recovp,
3032 	nfs4_recov_t *action, mntinfo4_t *mi)
3033 {
3034 	nfs4_lost_rqst_t *destp;
3035 
3036 	ASSERT(recovp->rc_lost_rqst == NULL);
3037 
3038 	destp = kmem_alloc(sizeof (nfs4_lost_rqst_t), KM_SLEEP);
3039 	recovp->rc_lost_rqst = destp;
3040 
3041 	if (lost_rqstp->lr_op == OP_LOCK ||
3042 	    lost_rqstp->lr_op == OP_LOCKU) {
3043 		ASSERT(lost_rqstp->lr_lop);
3044 		*action = NR_LOST_LOCK;
3045 		destp->lr_ctype = lost_rqstp->lr_ctype;
3046 		destp->lr_locktype = lost_rqstp->lr_locktype;
3047 	} else if (lost_rqstp->lr_op == OP_OPEN) {
3048 		component4 *srcfp, *destfp;
3049 
3050 		destp->lr_oacc = lost_rqstp->lr_oacc;
3051 		destp->lr_odeny = lost_rqstp->lr_odeny;
3052 		destp->lr_oclaim = lost_rqstp->lr_oclaim;
3053 		if (lost_rqstp->lr_oclaim == CLAIM_DELEGATE_CUR)
3054 			destp->lr_ostateid = lost_rqstp->lr_ostateid;
3055 
3056 		srcfp = &lost_rqstp->lr_ofile;
3057 		destfp = &destp->lr_ofile;
3058 		/*
3059 		 * Consume caller's utf8string
3060 		 */
3061 		destfp->utf8string_len = srcfp->utf8string_len;
3062 		destfp->utf8string_val = srcfp->utf8string_val;
3063 		srcfp->utf8string_len = 0;
3064 		srcfp->utf8string_val = NULL;	/* make sure not reused */
3065 
3066 		*action = NR_LOST_STATE_RQST;
3067 	} else if (lost_rqstp->lr_op == OP_OPEN_DOWNGRADE) {
3068 		destp->lr_dg_acc = lost_rqstp->lr_dg_acc;
3069 		destp->lr_dg_deny = lost_rqstp->lr_dg_deny;
3070 
3071 		*action = NR_LOST_STATE_RQST;
3072 	} else if (lost_rqstp->lr_op == OP_CLOSE) {
3073 		ASSERT(lost_rqstp->lr_oop);
3074 		*action = NR_LOST_STATE_RQST;
3075 	} else if (lost_rqstp->lr_op == OP_DELEGRETURN) {
3076 		*action = NR_LOST_STATE_RQST;
3077 	} else {
3078 #ifdef DEBUG
3079 		cmn_err(CE_PANIC, "nfs4_save_lost_rqst: bad op %d",
3080 			lost_rqstp->lr_op);
3081 #endif
3082 		nfs4_queue_event(RE_LOST_STATE_BAD_OP, mi, NULL,
3083 		    lost_rqstp->lr_op, lost_rqstp->lr_vp, lost_rqstp->lr_dvp,
3084 		    NFS4_OK, NULL, curproc->p_pid, TAG_NONE, TAG_NONE, 0, 0);
3085 		*action = NR_UNUSED;
3086 		recovp->rc_lost_rqst = NULL;
3087 		kmem_free(destp, sizeof (nfs4_lost_rqst_t));
3088 		return;
3089 	}
3090 
3091 	destp->lr_op = lost_rqstp->lr_op;
3092 	destp->lr_vp = lost_rqstp->lr_vp;
3093 	if (destp->lr_vp)
3094 		VN_HOLD(destp->lr_vp);
3095 	destp->lr_dvp = lost_rqstp->lr_dvp;
3096 	if (destp->lr_dvp)
3097 		VN_HOLD(destp->lr_dvp);
3098 	destp->lr_oop = lost_rqstp->lr_oop;
3099 	if (destp->lr_oop)
3100 		open_owner_hold(destp->lr_oop);
3101 	destp->lr_osp = lost_rqstp->lr_osp;
3102 	if (destp->lr_osp)
3103 		open_stream_hold(destp->lr_osp);
3104 	destp->lr_lop = lost_rqstp->lr_lop;
3105 	if (destp->lr_lop)
3106 		lock_owner_hold(destp->lr_lop);
3107 	destp->lr_cr = lost_rqstp->lr_cr;
3108 	if (destp->lr_cr)
3109 		crhold(destp->lr_cr);
3110 	if (lost_rqstp->lr_flk == NULL)
3111 		destp->lr_flk = NULL;
3112 	else {
3113 		destp->lr_flk = kmem_alloc(sizeof (flock64_t), KM_SLEEP);
3114 		*destp->lr_flk = *lost_rqstp->lr_flk;
3115 	}
3116 	destp->lr_putfirst = lost_rqstp->lr_putfirst;
3117 }
3118 
3119 /*
3120  * Map the given return values (errno and nfs4 status code) to a recovery
3121  * action and fill in the following fields of recovp: rc_action,
3122  * rc_srv_reboot, rc_stateid, rc_lost_rqst.
3123  */
3124 
3125 void
3126 errs_to_action(recov_info_t *recovp,
3127 	nfs4_server_t *sp, mntinfo4_t *mi, stateid4 *sidp,
3128 	nfs4_lost_rqst_t *lost_rqstp, int unmounted, nfs_opnum4 op,
3129 	nfs4_bseqid_entry_t *bsep)
3130 {
3131 	nfs4_recov_t action = NR_UNUSED;
3132 	bool_t reboot = FALSE;
3133 	int try_f;
3134 	int error = recovp->rc_orig_errors.error;
3135 	nfsstat4 stat = recovp->rc_orig_errors.stat;
3136 
3137 	bzero(&recovp->rc_stateid, sizeof (stateid4));
3138 	recovp->rc_lost_rqst = NULL;
3139 	recovp->rc_bseqid_rqst = NULL;
3140 
3141 	try_f = nfs4_try_failover(&recovp->rc_orig_errors) &&
3142 			FAILOVER_MOUNT4(mi);
3143 
3144 	/*
3145 	 * We start recovery for EINTR only in the lost lock
3146 	 * or lost open/close case.
3147 	 */
3148 
3149 	if (try_f || error == EINTR || (error == EIO && unmounted)) {
3150 		recovp->rc_error = (error != 0 ? error : geterrno4(stat));
3151 		if (lost_rqstp) {
3152 			ASSERT(lost_rqstp->lr_op != 0);
3153 			nfs4_save_lost_rqst(lost_rqstp, recovp, &action, mi);
3154 		}
3155 		if (try_f)
3156 			action = NR_FAILOVER;
3157 	} else if (error != 0) {
3158 		recovp->rc_error = error;
3159 		nfs4_queue_event(RE_UNEXPECTED_ERRNO, mi, NULL, error, NULL,
3160 		    NULL, 0, NULL, 0, TAG_NONE, TAG_NONE, 0, 0);
3161 		action = NR_CLIENTID;
3162 	} else {
3163 		recovp->rc_error = geterrno4(stat);
3164 		switch (stat) {
3165 #ifdef notyet
3166 		case NFS4ERR_LEASE_MOVED:
3167 			action = xxx;
3168 			break;
3169 		case NFS4ERR_MOVED:
3170 			action = xxx;
3171 			break;
3172 #endif
3173 		case NFS4ERR_BADHANDLE:
3174 			action = NR_BADHANDLE;
3175 			break;
3176 		case NFS4ERR_BAD_SEQID:
3177 			if (bsep)
3178 				save_bseqid_rqst(bsep, recovp);
3179 			action = NR_BAD_SEQID;
3180 			break;
3181 		case NFS4ERR_OLD_STATEID:
3182 			action = NR_OLDSTATEID;
3183 			break;
3184 		case NFS4ERR_WRONGSEC:
3185 			action = NR_WRONGSEC;
3186 			break;
3187 		case NFS4ERR_FHEXPIRED:
3188 			action = NR_FHEXPIRED;
3189 			break;
3190 		case NFS4ERR_BAD_STATEID:
3191 			if (sp == NULL || (sp != NULL && inlease(sp))) {
3192 
3193 				action = NR_BAD_STATEID;
3194 				if (sidp)
3195 					recovp->rc_stateid = *sidp;
3196 			} else
3197 				action = NR_CLIENTID;
3198 			break;
3199 		case NFS4ERR_EXPIRED:
3200 			/*
3201 			 * The client's lease has expired, either due
3202 			 * to a network partition or perhaps a client
3203 			 * error.  In either case, try an NR_CLIENTID
3204 			 * style recovery.  reboot remains false, since
3205 			 * there is no evidence the server has rebooted.
3206 			 * This will cause CLAIM_NULL opens and lock
3207 			 * requests without the reclaim bit.
3208 			 */
3209 			action = NR_CLIENTID;
3210 
3211 			DTRACE_PROBE4(nfs4__expired,
3212 					nfs4_server_t *, sp,
3213 					mntinfo4_t *, mi,
3214 					stateid4 *, sidp, int, op);
3215 
3216 			break;
3217 		case NFS4ERR_STALE_CLIENTID:
3218 		case NFS4ERR_STALE_STATEID:
3219 			action = NR_CLIENTID;
3220 			reboot = TRUE;
3221 			break;
3222 		case NFS4ERR_RESOURCE:
3223 			/*
3224 			 * If this had been a FAILOVER mount, then
3225 			 * we'd have tried failover.  Since it's not,
3226 			 * just delay a while and retry.
3227 			 */
3228 			action = NR_DELAY;
3229 			break;
3230 		case NFS4ERR_GRACE:
3231 			action = NR_GRACE;
3232 			break;
3233 		case NFS4ERR_DELAY:
3234 			action = NR_DELAY;
3235 			break;
3236 		case NFS4ERR_STALE:
3237 			action = NR_STALE;
3238 			break;
3239 		default:
3240 			nfs4_queue_event(RE_UNEXPECTED_STATUS, mi, NULL, 0,
3241 			    NULL, NULL, stat, NULL, 0, TAG_NONE, TAG_NONE,
3242 			    0, 0);
3243 			action = NR_CLIENTID;
3244 			break;
3245 		}
3246 	}
3247 
3248 	/* make sure action got set */
3249 	ASSERT(action != NR_UNUSED);
3250 	recovp->rc_srv_reboot = reboot;
3251 	recovp->rc_action = action;
3252 	nfs4_queue_fact(RF_ERR, mi, stat, action, op, reboot, NULL, error,
3253 		NULL);
3254 }
3255 
3256 /*
3257  * Return the (held) credential for the process with the given pid.
3258  * May return NULL (e.g., process not found).
3259  */
3260 
3261 static cred_t *
3262 pid_to_cr(pid_t pid)
3263 {
3264 	proc_t *p;
3265 	cred_t *cr;
3266 
3267 	mutex_enter(&pidlock);
3268 	if ((p = prfind(pid)) == NULL) {
3269 		mutex_exit(&pidlock);
3270 		return (NULL);
3271 	}
3272 
3273 	mutex_enter(&p->p_crlock);
3274 	crhold(cr = p->p_cred);
3275 	mutex_exit(&p->p_crlock);
3276 	mutex_exit(&pidlock);
3277 
3278 	return (cr);
3279 }
3280 
3281 /*
3282  * Send SIGLOST to the given process and queue the event.
3283  *
3284  * The 'dump' boolean tells us whether this action should dump the
3285  * in-kernel queue of recovery messages or not.
3286  */
3287 
3288 void
3289 nfs4_send_siglost(pid_t pid, mntinfo4_t *mi, vnode_t *vp, bool_t dump,
3290     int error, nfsstat4 stat)
3291 {
3292 	proc_t *p;
3293 
3294 	mutex_enter(&pidlock);
3295 	p = prfind(pid);
3296 	if (p)
3297 		psignal(p, SIGLOST);
3298 	mutex_exit(&pidlock);
3299 	nfs4_queue_event(dump ? RE_SIGLOST : RE_SIGLOST_NO_DUMP, mi,
3300 	    NULL, error, vp, NULL, stat, NULL, pid, TAG_NONE, TAG_NONE, 0, 0);
3301 }
3302 
3303 /*
3304  * Scan the lock list for entries that match the given pid.  Change the
3305  * pid in those that do to NOPID.
3306  */
3307 
3308 static void
3309 relock_skip_pid(locklist_t *llp, pid_t pid)
3310 {
3311 	for (; llp != NULL; llp = llp->ll_next) {
3312 		if (llp->ll_flock.l_pid == pid)
3313 			llp->ll_flock.l_pid = NOPID;
3314 	}
3315 }
3316 
3317 /*
3318  * Mark a file as having failed recovery, after making a last-ditch effort
3319  * to return any delegation.
3320  *
3321  * Sets r_error to EIO or ESTALE for the given vnode.
3322  */
3323 void
3324 nfs4_fail_recov(vnode_t *vp, char *why, int error, nfsstat4 stat)
3325 {
3326 	rnode4_t *rp = VTOR4(vp);
3327 
3328 #ifdef DEBUG
3329 	if (nfs4_fail_recov_stop)
3330 		debug_enter("nfs4_fail_recov");
3331 #endif
3332 
3333 	mutex_enter(&rp->r_statelock);
3334 	if (rp->r_flags & (R4RECOVERR|R4RECOVERRP)) {
3335 		mutex_exit(&rp->r_statelock);
3336 		return;
3337 	}
3338 
3339 	/*
3340 	 * Set R4RECOVERRP to indicate that a recovery error is in
3341 	 * progress.  This will shut down reads and writes at the top
3342 	 * half.  Don't set R4RECOVERR until after we've returned the
3343 	 * delegation, otherwise it will fail.
3344 	 */
3345 
3346 	rp->r_flags |= R4RECOVERRP;
3347 	mutex_exit(&rp->r_statelock);
3348 
3349 	nfs4delegabandon(rp);
3350 
3351 	mutex_enter(&rp->r_statelock);
3352 	rp->r_flags |= (R4RECOVERR | R4STALE);
3353 	rp->r_error = (error == 0 && stat == NFS4ERR_STALE) ? ESTALE : EIO;
3354 	PURGE_ATTRCACHE4_LOCKED(rp);
3355 	if (!(vp->v_vfsp->vfs_flag & VFS_UNMOUNTED))
3356 		nfs4_queue_event(RE_DEAD_FILE, VTOMI4(vp), NULL, error,
3357 		    vp, NULL, stat, why, 0, TAG_NONE, TAG_NONE, 0, 0);
3358 	mutex_exit(&rp->r_statelock);
3359 
3360 	dnlc_purge_vp(vp);
3361 }
3362 
3363 /*
3364  * recov_throttle: if the file had the same recovery action within the
3365  * throttle interval, wait for the throttle interval to finish before
3366  * proceeding.
3367  *
3368  * Side effects: updates the rnode with the current recovery information.
3369  */
3370 
3371 static void
3372 recov_throttle(recov_info_t *recovp, vnode_t *vp)
3373 {
3374 	time_t curtime, time_to_wait;
3375 	rnode4_t *rp = VTOR4(vp);
3376 
3377 	curtime = gethrestime_sec();
3378 
3379 	mutex_enter(&rp->r_statelock);
3380 	NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
3381 		"recov_throttle: now: (%d, %ld), last: (%d, %ld)",
3382 		recovp->rc_action, curtime,
3383 		rp->r_recov_act, rp->r_last_recov));
3384 	if (recovp->rc_action == rp->r_recov_act &&
3385 	    rp->r_last_recov + recov_err_delay > curtime) {
3386 		time_to_wait = rp->r_last_recov + recov_err_delay - curtime;
3387 		mutex_exit(&rp->r_statelock);
3388 		delay(SEC_TO_TICK(time_to_wait));
3389 		curtime = gethrestime_sec();
3390 		mutex_enter(&rp->r_statelock);
3391 	}
3392 
3393 	rp->r_last_recov = curtime;
3394 	rp->r_recov_act = recovp->rc_action;
3395 	mutex_exit(&rp->r_statelock);
3396 }
3397 
3398 /*
3399  * React to NFS4ERR_GRACE by setting the time we'll permit
3400  * the next call to this filesystem.
3401  */
3402 void
3403 nfs4_set_grace_wait(mntinfo4_t *mi)
3404 {
3405 	mutex_enter(&mi->mi_lock);
3406 	/* Mark the time for the future */
3407 	mi->mi_grace_wait = gethrestime_sec() + nfs4err_delay_time;
3408 	mutex_exit(&mi->mi_lock);
3409 }
3410 
3411 /*
3412  * React to MFS4ERR_DELAY by setting the time we'll permit
3413  * the next call to this vnode.
3414  */
3415 void
3416 nfs4_set_delay_wait(vnode_t *vp)
3417 {
3418 	rnode4_t *rp = VTOR4(vp);
3419 
3420 	mutex_enter(&rp->r_statelock);
3421 	/*
3422 	 * Calculate amount we should delay, initial
3423 	 * delay will be short and then we will back off.
3424 	 */
3425 	if (rp->r_delay_interval == 0)
3426 		rp->r_delay_interval = NFS4_INITIAL_DELAY_INTERVAL;
3427 	else
3428 		/* calculate next interval value */
3429 		rp->r_delay_interval =
3430 		    MIN(NFS4_MAX_DELAY_INTERVAL, (rp->r_delay_interval << 1));
3431 	rp->r_delay_wait = gethrestime_sec() + rp->r_delay_interval;
3432 	mutex_exit(&rp->r_statelock);
3433 }
3434 
3435 /*
3436  * The caller is responsible for freeing the returned string.
3437  */
3438 static char *
3439 nfs4_getsrvnames(mntinfo4_t *mi, size_t *len)
3440 {
3441 	servinfo4_t *svp;
3442 	char *srvnames;
3443 	char *namep;
3444 	size_t length;
3445 
3446 	/*
3447 	 * Calculate the length of the string required to hold all
3448 	 * of the server names plus either a comma or a null
3449 	 * character following each individual one.
3450 	 */
3451 	length = 0;
3452 	for (svp = mi->mi_servers; svp != NULL; svp = svp->sv_next) {
3453 		(void) nfs_rw_enter_sig(&svp->sv_lock, RW_READER, 0);
3454 		if (svp->sv_flags & SV4_NOTINUSE) {
3455 			nfs_rw_exit(&svp->sv_lock);
3456 			continue;
3457 		}
3458 		nfs_rw_exit(&svp->sv_lock);
3459 		length += svp->sv_hostnamelen;
3460 	}
3461 
3462 	srvnames = kmem_alloc(length, KM_SLEEP);
3463 
3464 	namep = srvnames;
3465 	for (svp = mi->mi_servers; svp != NULL; svp = svp->sv_next) {
3466 		(void) nfs_rw_enter_sig(&svp->sv_lock, RW_READER, 0);
3467 		if (svp->sv_flags & SV4_NOTINUSE) {
3468 			nfs_rw_exit(&svp->sv_lock);
3469 			continue;
3470 		}
3471 		nfs_rw_exit(&svp->sv_lock);
3472 		(void) strcpy(namep, svp->sv_hostname);
3473 		namep += svp->sv_hostnamelen - 1;
3474 		*namep++ = ',';
3475 	}
3476 	*--namep = '\0';
3477 
3478 	*len = length;
3479 
3480 	return (srvnames);
3481 }
3482 
3483 static void
3484 save_bseqid_rqst(nfs4_bseqid_entry_t *bsep, recov_info_t *recovp)
3485 {
3486 	nfs4_bseqid_entry_t *destp;
3487 
3488 	destp = kmem_alloc(sizeof (nfs4_bseqid_entry_t), KM_SLEEP);
3489 	recovp->rc_bseqid_rqst = destp;
3490 
3491 	if (bsep->bs_oop)
3492 		open_owner_hold(bsep->bs_oop);
3493 	destp->bs_oop = bsep->bs_oop;
3494 	if (bsep->bs_lop)
3495 		lock_owner_hold(bsep->bs_lop);
3496 	destp->bs_lop = bsep->bs_lop;
3497 	if (bsep->bs_vp)
3498 		VN_HOLD(bsep->bs_vp);
3499 	destp->bs_vp = bsep->bs_vp;
3500 	destp->bs_pid = bsep->bs_pid;
3501 	destp->bs_tag = bsep->bs_tag;
3502 	destp->bs_seqid = bsep->bs_seqid;
3503 }
3504 
3505 static void
3506 free_bseqid_rqst(nfs4_bseqid_entry_t *bsep)
3507 {
3508 	if (bsep->bs_oop)
3509 		open_owner_rele(bsep->bs_oop);
3510 	if (bsep->bs_lop)
3511 		lock_owner_rele(bsep->bs_lop);
3512 	if (bsep->bs_vp)
3513 		VN_RELE(bsep->bs_vp);
3514 	kmem_free(bsep, sizeof (nfs4_bseqid_entry_t));
3515 }
3516 
3517 /*
3518  * We don't actually fully recover from NFS4ERR_BAD_SEQID.  We
3519  * simply mark the open owner and open stream (if provided) as "bad".
3520  * Then future uses of these data structures will be limited to basically
3521  * just cleaning up the internal client state (no going OTW).
3522  *
3523  * The result of this is to return errors back to the app/usr when
3524  * we receive NFS4ERR_BAD_SEQID, but also allow future/new calls to
3525  * succeed so progress can be made.
3526  */
3527 void
3528 recov_bad_seqid(recov_info_t *recovp)
3529 {
3530 	mntinfo4_t		*mi = recovp->rc_mi;
3531 	nfs4_open_owner_t	*bad_oop;
3532 	nfs4_lock_owner_t	*bad_lop;
3533 	vnode_t			*vp;
3534 	rnode4_t		*rp = NULL;
3535 	pid_t			pid;
3536 	nfs4_bseqid_entry_t	*bsep, *tbsep;
3537 	int			error;
3538 
3539 	ASSERT(mi != NULL);
3540 	ASSERT(nfs_rw_lock_held(&mi->mi_recovlock, RW_WRITER));
3541 
3542 	mutex_enter(&mi->mi_lock);
3543 	bsep = list_head(&mi->mi_bseqid_list);
3544 	mutex_exit(&mi->mi_lock);
3545 
3546 	/*
3547 	 * Handle all the bad seqid entries on mi's list.
3548 	 */
3549 	while (bsep != NULL) {
3550 		bad_oop = bsep->bs_oop;
3551 		bad_lop = bsep->bs_lop;
3552 		vp = bsep->bs_vp;
3553 		pid = bsep->bs_pid;
3554 
3555 		NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
3556 		    "recov_bad_seqid: mark oop %p lop %p as bad for "
3557 		    "vp %p tag %s pid %d: last good seqid %d for tag %s",
3558 		    (void *)bad_oop, (void *)bad_lop, (void *)vp,
3559 		    nfs4_ctags[bsep->bs_tag].ct_str, pid,
3560 		    bad_oop ?  bad_oop->oo_last_good_seqid : 0,
3561 		    bad_oop ? nfs4_ctags[bad_oop->oo_last_good_op].ct_str :
3562 		    nfs4_ctags[TAG_NONE].ct_str));
3563 
3564 		nfs4_queue_event(RE_BAD_SEQID, mi, NULL,
3565 		    0, vp, NULL, NFS4ERR_BAD_SEQID, NULL, pid, bsep->bs_tag,
3566 		    bad_oop ? bad_oop->oo_last_good_op : TAG_NONE,
3567 		    bsep->bs_seqid, bad_oop ? bad_oop->oo_last_good_seqid : 0);
3568 
3569 		if (bad_oop) {
3570 			/* essentially reset the open owner */
3571 			error = nfs4_start_open_seqid_sync(bad_oop, mi);
3572 			ASSERT(!error);	/* recov thread always succeeds */
3573 			bad_oop->oo_name = nfs4_get_new_oo_name();
3574 			bad_oop->oo_seqid = 0;
3575 			nfs4_end_open_seqid_sync(bad_oop);
3576 		}
3577 
3578 		if (bad_lop) {
3579 			mutex_enter(&bad_lop->lo_lock);
3580 			bad_lop->lo_flags |= NFS4_BAD_SEQID_LOCK;
3581 			mutex_exit(&bad_lop->lo_lock);
3582 
3583 			ASSERT(vp != NULL);
3584 			rp = VTOR4(vp);
3585 			mutex_enter(&rp->r_statelock);
3586 			rp->r_flags |= R4LODANGLERS;
3587 			mutex_exit(&rp->r_statelock);
3588 
3589 			nfs4_send_siglost(pid, mi, vp, TRUE,
3590 			    0, NFS4ERR_BAD_SEQID);
3591 		}
3592 
3593 		mutex_enter(&mi->mi_lock);
3594 		list_remove(&mi->mi_bseqid_list, bsep);
3595 		tbsep = bsep;
3596 		bsep = list_head(&mi->mi_bseqid_list);
3597 		mutex_exit(&mi->mi_lock);
3598 		free_bseqid_rqst(tbsep);
3599 	}
3600 
3601 	mutex_enter(&mi->mi_lock);
3602 	mi->mi_recovflags &= ~MI4R_BAD_SEQID;
3603 	mutex_exit(&mi->mi_lock);
3604 }
3605