xref: /titanic_44/usr/src/uts/common/fs/nfs/nfs4_recovery.c (revision e11f6fbcfd838459080e675d24788eda4783c1d7)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License, Version 1.0 only
6  * (the "License").  You may not use this file except in compliance
7  * with the License.
8  *
9  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10  * or http://www.opensolaris.org/os/licensing.
11  * See the License for the specific language governing permissions
12  * and limitations under the License.
13  *
14  * When distributing Covered Code, include this CDDL HEADER in each
15  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16  * If applicable, add the following below this CDDL HEADER, with the
17  * fields enclosed by brackets "[]" replaced with your own identifying
18  * information: Portions Copyright [yyyy] [name of copyright owner]
19  *
20  * CDDL HEADER END
21  */
22 /*
23  * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 #pragma ident	"%Z%%M%	%I%	%E% SMI"
28 
29 /*
30  * NFS Version 4 state recovery code.
31  */
32 
33 #include <nfs/nfs4_clnt.h>
34 #include <nfs/nfs4.h>
35 #include <nfs/rnode4.h>
36 #include <sys/cmn_err.h>
37 #include <sys/cred.h>
38 #include <sys/systm.h>
39 #include <sys/flock.h>
40 #include <sys/dnlc.h>
41 #include <sys/ddi.h>
42 #include <sys/disp.h>
43 #include <sys/list.h>
44 #include <sys/sdt.h>
45 
46 extern r4hashq_t *rtable4;
47 
48 /*
49  * Information that describes what needs to be done for recovery.  It is
50  * passed to a client recovery thread as well as passed to various recovery
51  * routines.  rc_mi, rc_vp1, and rc_vp2 refer to the filesystem and
52  * vnode(s) affected by recovery.  rc_vp1 and rc_vp2 are references (use
53  * VN_HOLD) or NULL.  rc_lost_rqst contains information about the lost
54  * lock or open/close request, and it holds reference counts for the
55  * various objects (vnode, etc.).  The recovery thread also uses flags set
56  * in the mntinfo4_t or vnode_t to tell it what to do.  rc_error is used
57  * to save the error that originally triggered the recovery event -- will
58  * later be used to set mi_error if recovery doesn't work.  rc_bseqid_rqst
59  * contains information about the request that got NFS4ERR_BAD_SEQID, and
60  * it holds reference count for the various objects (vnode, open owner,
61  * open stream, lock owner).
62  */
63 
64 typedef struct {
65 	mntinfo4_t *rc_mi;
66 	vnode_t *rc_vp1;
67 	vnode_t *rc_vp2;
68 	nfs4_recov_t rc_action;
69 	stateid4 rc_stateid;
70 	bool_t rc_srv_reboot;		/* server has rebooted */
71 	nfs4_lost_rqst_t *rc_lost_rqst;
72 	nfs4_error_t rc_orig_errors;	/* original errors causing recovery */
73 	int rc_error;
74 	nfs4_bseqid_entry_t *rc_bseqid_rqst;
75 } recov_info_t;
76 
77 /*
78  * How long to wait before trying again if there is an error doing
79  * recovery, in seconds.
80  */
81 
82 static int recov_err_delay = 1;
83 
84 /*
85  * How long to wait when processing NFS4ERR_GRACE or NFS4ERR_DELAY
86  * errors.  Expressed in seconds.  Default is defined as
87  * NFS4ERR_DELAY_TIME and this variable is initialized in nfs4_subr_init()
88  */
89 time_t nfs4err_delay_time = 0;
90 
91 /*
92  * Tuneable to limit how many time "exempt" ops go OTW
93  * after a recovery error.  Exempt op hints are OH_CLOSE,
94  * OH_LOCKU, OH_DELEGRETURN.  These previously always went
95  * OTW even after rnode was "dead" due to recovery errors.
96  *
97  * The tuneable below limits the number of times a start_fop
98  * invocation will retry the exempt hints.  After the limit
99  * is reached, nfs4_start_fop will return an error just like
100  * it would for non-exempt op hints.
101  */
102 int nfs4_max_recov_error_retry = 3;
103 
104 /*
105  * Number of seconds the recovery thread should pause before retry when the
106  * filesystem has been forcibly unmounted.
107  */
108 
109 int nfs4_unmount_delay = 1;
110 
111 #ifdef DEBUG
112 
113 /*
114  * How long to wait (in seconds) between recovery operations on a given
115  * file.  Normally zero, but could be set longer for testing purposes.
116  */
117 static int nfs4_recovdelay = 0;
118 
119 /*
120  * Switch that controls whether to go into the debugger when recovery
121  * fails.
122  */
123 static int nfs4_fail_recov_stop = 0;
124 
125 /*
126  * Tuneables to debug client namespace interaction with server
127  * mount points:
128  *
129  *	nfs4_srvmnt_fail_cnt:
130  *		number of times EACCES returned because client
131  *		attempted to cross server mountpoint
132  *
133  *	nfs4_srvmnt_debug:
134  *		trigger console printf whenever client attempts
135  *		to cross server mountpoint
136  */
137 int nfs4_srvmnt_fail_cnt = 0;
138 int nfs4_srvmnt_debug = 0;
139 #endif
140 
141 /* forward references, in alphabetic order */
142 static void close_after_open_resend(vnode_t *, cred_t *, uint32_t,
143 	nfs4_error_t *);
144 static void errs_to_action(recov_info_t *,
145 	nfs4_server_t *, mntinfo4_t *, stateid4 *, nfs4_lost_rqst_t *, int,
146 	nfs_opnum4, nfs4_bseqid_entry_t *);
147 static void flush_reinstate(nfs4_lost_rqst_t *);
148 static void free_milist(mntinfo4_t **, int);
149 static mntinfo4_t **make_milist(nfs4_server_t *, int *);
150 static int nfs4_check_recov_err(vnode_t *, nfs4_op_hint_t,
151 	nfs4_recov_state_t *, int, char *);
152 static int nfs4_check_srvstub(vnode_t *vp, rnode4_t *rp, nfs4_op_hint_t op);
153 static char *nfs4_getsrvnames(mntinfo4_t *, size_t *);
154 static void nfs4_recov_fh_fail(vnode_t *, int, nfsstat4);
155 static void nfs4_recov_thread(recov_info_t *);
156 static void nfs4_remove_lost_rqsts(mntinfo4_t *, nfs4_server_t *);
157 static void nfs4_resend_lost_rqsts(recov_info_t *, nfs4_server_t *);
158 static cred_t *pid_to_cr(pid_t);
159 static void reclaim_one_lock(vnode_t *, flock64_t *, nfs4_error_t *, int *);
160 static void recov_bad_seqid(recov_info_t *);
161 static void recov_badstate(recov_info_t *, vnode_t *, nfsstat4);
162 static void recov_clientid(recov_info_t *, nfs4_server_t *);
163 static void recov_done(mntinfo4_t *, recov_info_t *);
164 static void recov_filehandle(nfs4_recov_t, mntinfo4_t *, vnode_t *);
165 static void recov_newserver(recov_info_t *, nfs4_server_t **, bool_t *);
166 static void recov_openfiles(recov_info_t *, nfs4_server_t *);
167 static void recov_stale(mntinfo4_t *, vnode_t *);
168 static void nfs4_free_lost_rqst(nfs4_lost_rqst_t *, nfs4_server_t *);
169 static void recov_throttle(recov_info_t *, vnode_t *);
170 static void relock_skip_pid(locklist_t *, pid_t);
171 static void resend_lock(nfs4_lost_rqst_t *, nfs4_error_t *);
172 static void resend_one_op(nfs4_lost_rqst_t *, nfs4_error_t *, mntinfo4_t *,
173 	nfs4_server_t *);
174 static void save_bseqid_rqst(nfs4_bseqid_entry_t *, recov_info_t *);
175 static void start_recovery(recov_info_t *, mntinfo4_t *, vnode_t *, vnode_t *,
176 	nfs4_server_t *);
177 static void start_recovery_action(nfs4_recov_t, bool_t, mntinfo4_t *, vnode_t *,
178 	vnode_t *);
179 static int wait_for_recovery(mntinfo4_t *, nfs4_op_hint_t);
180 
181 /*
182  * Return non-zero if the given errno, status, and rpc status codes
183  * in the nfs4_error_t indicate that client recovery is needed.
184  * "stateful" indicates whether the call that got the error establishes or
185  * removes state on the server (open, close, lock, unlock, delegreturn).
186  */
187 
188 int
189 nfs4_needs_recovery(nfs4_error_t *ep, bool_t stateful, vfs_t *vfsp)
190 {
191 	int recov = 0;
192 	mntinfo4_t *mi;
193 
194 	/*
195 	 * Try failover if the error values justify it and if
196 	 * it's a failover mount.  Don't try if the mount is in
197 	 * progress, failures are handled explicitly by nfs4rootvp.
198 	 */
199 	if (nfs4_try_failover(ep)) {
200 		mi = VFTOMI4(vfsp);
201 		mutex_enter(&mi->mi_lock);
202 		recov = FAILOVER_MOUNT4(mi) && !(mi->mi_flags & MI4_MOUNTING);
203 		mutex_exit(&mi->mi_lock);
204 		if (recov)
205 			return (recov);
206 	}
207 
208 	if (ep->error == EINTR || NFS4_FRC_UNMT_ERR(ep->error, vfsp)) {
209 		/*
210 		 * The server may have gotten the request, so for stateful
211 		 * ops we need to resynchronize and possibly back out the
212 		 * op.
213 		 */
214 		return (stateful);
215 	}
216 	if (ep->error != 0)
217 		return (0);
218 
219 	/* stat values are listed alphabetically */
220 	/*
221 	 * There are two lists here: the errors for which we have code, and
222 	 * the errors for which we plan to have code before FCS.  For the
223 	 * second list, print a warning message but don't attempt recovery.
224 	 */
225 	switch (ep->stat) {
226 	case NFS4ERR_BADHANDLE:
227 	case NFS4ERR_BAD_SEQID:
228 	case NFS4ERR_BAD_STATEID:
229 	case NFS4ERR_DELAY:
230 	case NFS4ERR_EXPIRED:
231 	case NFS4ERR_FHEXPIRED:
232 	case NFS4ERR_GRACE:
233 	case NFS4ERR_OLD_STATEID:
234 	case NFS4ERR_RESOURCE:
235 	case NFS4ERR_STALE_CLIENTID:
236 	case NFS4ERR_STALE_STATEID:
237 	case NFS4ERR_WRONGSEC:
238 	case NFS4ERR_STALE:
239 		recov = 1;
240 		break;
241 #ifdef DEBUG
242 	case NFS4ERR_LEASE_MOVED:
243 	case NFS4ERR_MOVED:
244 		zcmn_err(VFTOMI4(vfsp)->mi_zone->zone_id,
245 		    CE_WARN, "!Can't yet recover from NFS status %d",
246 				ep->stat);
247 		break;
248 #endif
249 	}
250 
251 	return (recov);
252 }
253 
254 /*
255  * Some operations such as DELEGRETURN want to avoid invoking
256  * recovery actions that will only mark the file dead.  If
257  * better handlers are invoked for any of these errors, this
258  * routine should be modified.
259  */
260 int
261 nfs4_recov_marks_dead(nfsstat4 status)
262 {
263 	if (status == NFS4ERR_BAD_SEQID ||
264 	    status == NFS4ERR_EXPIRED ||
265 	    status == NFS4ERR_BAD_STATEID ||
266 	    status == NFS4ERR_OLD_STATEID)
267 		return (1);
268 	return (0);
269 }
270 
271 /*
272  * Transfer the state recovery information in recovp to mi's resend queue,
273  * and mark mi as having a lost state request.
274  */
275 static void
276 nfs4_enqueue_lost_rqst(recov_info_t *recovp, mntinfo4_t *mi)
277 {
278 	nfs4_lost_rqst_t *lrp = recovp->rc_lost_rqst;
279 
280 	ASSERT(nfs_rw_lock_held(&mi->mi_recovlock, RW_READER) ||
281 	    nfs_rw_lock_held(&mi->mi_recovlock, RW_WRITER));
282 
283 	ASSERT(lrp != NULL && lrp->lr_op != 0);
284 
285 	NFS4_DEBUG(nfs4_lost_rqst_debug, (CE_NOTE,
286 		"nfs4_enqueue_lost_rqst %p, op %d",
287 		(void *)lrp, lrp->lr_op));
288 
289 	mutex_enter(&mi->mi_lock);
290 	mi->mi_recovflags |= MI4R_LOST_STATE;
291 	if (lrp->lr_putfirst)
292 		list_insert_head(&mi->mi_lost_state, lrp);
293 	else
294 		list_insert_tail(&mi->mi_lost_state, lrp);
295 	recovp->rc_lost_rqst = NULL;
296 	mutex_exit(&mi->mi_lock);
297 
298 	nfs4_queue_event(RE_LOST_STATE, mi, NULL, lrp->lr_op, lrp->lr_vp,
299 		lrp->lr_dvp, 0, NULL, 0, TAG_NONE, TAG_NONE, 0, 0);
300 }
301 
302 /*
303  * Transfer the bad seqid recovery information in recovp to mi's
304  * bad seqid queue, and mark mi as having a bad seqid request.
305  */
306 void
307 enqueue_bseqid_rqst(recov_info_t *recovp, mntinfo4_t *mi)
308 {
309 	ASSERT(nfs_rw_lock_held(&mi->mi_recovlock, RW_READER) ||
310 	    nfs_rw_lock_held(&mi->mi_recovlock, RW_WRITER));
311 	ASSERT(recovp->rc_bseqid_rqst != NULL);
312 
313 	mutex_enter(&mi->mi_lock);
314 	mi->mi_recovflags |= MI4R_BAD_SEQID;
315 	list_insert_tail(&mi->mi_bseqid_list, recovp->rc_bseqid_rqst);
316 	recovp->rc_bseqid_rqst = NULL;
317 	mutex_exit(&mi->mi_lock);
318 }
319 
320 /*
321  * Initiate recovery.
322  *
323  * The nfs4_error_t contains the return codes that triggered a recovery
324  * attempt.  mi, vp1, and vp2 refer to the filesystem and files that were
325  * being operated on.  vp1 and vp2 may be NULL.
326  *
327  * Multiple calls are okay.  If recovery is already underway, the call
328  * updates the information about what state needs recovery but does not
329  * start a new thread.  The caller should hold mi->mi_recovlock as a reader
330  * for proper synchronization with any recovery thread.
331  *
332  * This will return TRUE if recovery was aborted, and FALSE otherwise.
333  */
334 bool_t
335 nfs4_start_recovery(nfs4_error_t *ep, mntinfo4_t *mi, vnode_t *vp1,
336     vnode_t *vp2, stateid4 *sid, nfs4_lost_rqst_t *lost_rqstp, nfs_opnum4 op,
337     nfs4_bseqid_entry_t *bsep)
338 {
339 	recov_info_t *recovp;
340 	nfs4_server_t *sp;
341 	bool_t abort = FALSE;
342 	bool_t gone = FALSE;
343 
344 	ASSERT(curproc->p_zone == mi->mi_zone);
345 	mutex_enter(&mi->mi_lock);
346 	/*
347 	 * If there is lost state, we need to kick off recovery even if the
348 	 * filesystem has been unmounted or the zone is shutting down.
349 	 */
350 	gone = FS_OR_ZONE_GONE4(mi->mi_vfsp);
351 	if (gone) {
352 		ASSERT(ep->error != EINTR || lost_rqstp != NULL);
353 		if (ep->error == EIO && lost_rqstp == NULL) {
354 			/* failed due to forced unmount, no new lost state */
355 			abort = TRUE;
356 		}
357 		if ((ep->error == 0 || ep->error == ETIMEDOUT) &&
358 		    !(mi->mi_recovflags & MI4R_LOST_STATE)) {
359 			/* some other failure, no existing lost state */
360 			abort = TRUE;
361 		}
362 		if (abort) {
363 			mutex_exit(&mi->mi_lock);
364 			NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
365 				    "nfs4_start_recovery: fs unmounted"));
366 			return (TRUE);
367 		}
368 	}
369 	mi->mi_in_recovery++;
370 	mutex_exit(&mi->mi_lock);
371 
372 	recovp = kmem_alloc(sizeof (recov_info_t), KM_SLEEP);
373 	recovp->rc_orig_errors = *ep;
374 	sp = find_nfs4_server(mi);
375 	errs_to_action(recovp, sp, mi, sid, lost_rqstp,
376 		gone, op, bsep);
377 	if (sp != NULL)
378 		mutex_exit(&sp->s_lock);
379 	start_recovery(recovp, mi, vp1, vp2, sp);
380 	if (sp != NULL)
381 		nfs4_server_rele(sp);
382 	return (FALSE);
383 }
384 
385 /*
386  * Internal version of nfs4_start_recovery.  The difference is that the
387  * caller specifies the recovery action, rather than the errors leading to
388  * recovery.
389  */
390 static void
391 start_recovery_action(nfs4_recov_t what, bool_t reboot, mntinfo4_t *mi,
392 	vnode_t *vp1, vnode_t *vp2)
393 {
394 	recov_info_t *recovp;
395 
396 	ASSERT(curproc->p_zone == mi->mi_zone);
397 	mutex_enter(&mi->mi_lock);
398 	mi->mi_in_recovery++;
399 	mutex_exit(&mi->mi_lock);
400 
401 	recovp = kmem_zalloc(sizeof (recov_info_t), KM_SLEEP);
402 	recovp->rc_action = what;
403 	recovp->rc_srv_reboot = reboot;
404 	recovp->rc_error = EIO;
405 	start_recovery(recovp, mi, vp1, vp2, NULL);
406 }
407 
408 static void
409 start_recovery(recov_info_t *recovp, mntinfo4_t *mi,
410 	vnode_t *vp1, vnode_t *vp2, nfs4_server_t *sp)
411 {
412 	NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
413 		"start_recovery: mi %p, what %s", (void*)mi,
414 		nfs4_recov_action_to_str(recovp->rc_action)));
415 
416 	/*
417 	 * Bump the reference on the vfs so that we can pass it to the
418 	 * recovery thread.
419 	 */
420 	VFS_HOLD(mi->mi_vfsp);
421 
422 again:
423 	switch (recovp->rc_action) {
424 	case NR_FAILOVER:
425 		ASSERT(nfs_rw_lock_held(&mi->mi_recovlock, RW_READER) ||
426 		    nfs_rw_lock_held(&mi->mi_recovlock, RW_WRITER));
427 		if (mi->mi_servers->sv_next == NULL)
428 			goto out_no_thread;
429 		mutex_enter(&mi->mi_lock);
430 		mi->mi_recovflags |= MI4R_NEED_NEW_SERVER;
431 		mutex_exit(&mi->mi_lock);
432 
433 		if (recovp->rc_lost_rqst != NULL)
434 			nfs4_enqueue_lost_rqst(recovp, mi);
435 		break;
436 
437 	case NR_CLIENTID:
438 		/*
439 		 * If the filesystem has been unmounted, punt.
440 		 */
441 		if (sp == NULL)
442 			goto out_no_thread;
443 
444 		/*
445 		 * If nobody else is working on the clientid, mark the
446 		 * clientid as being no longer set.  Then mark the specific
447 		 * filesystem being worked on.
448 		 */
449 		if (!nfs4_server_in_recovery(sp)) {
450 			mutex_enter(&sp->s_lock);
451 			sp->s_flags &= ~N4S_CLIENTID_SET;
452 			mutex_exit(&sp->s_lock);
453 		}
454 		ASSERT(nfs_rw_lock_held(&mi->mi_recovlock, RW_READER) ||
455 		    nfs_rw_lock_held(&mi->mi_recovlock, RW_WRITER));
456 		mutex_enter(&mi->mi_lock);
457 		mi->mi_recovflags |= MI4R_NEED_CLIENTID;
458 		if (recovp->rc_srv_reboot)
459 			mi->mi_recovflags |= MI4R_SRV_REBOOT;
460 		mutex_exit(&mi->mi_lock);
461 		break;
462 
463 	case NR_OPENFILES:
464 		ASSERT(nfs_rw_lock_held(&mi->mi_recovlock, RW_READER) ||
465 		    nfs_rw_lock_held(&mi->mi_recovlock, RW_WRITER));
466 		mutex_enter(&mi->mi_lock);
467 		mi->mi_recovflags |= MI4R_REOPEN_FILES;
468 		if (recovp->rc_srv_reboot)
469 			mi->mi_recovflags |= MI4R_SRV_REBOOT;
470 		mutex_exit(&mi->mi_lock);
471 		break;
472 
473 	case NR_WRONGSEC:
474 		ASSERT(nfs_rw_lock_held(&mi->mi_recovlock, RW_READER) ||
475 		    nfs_rw_lock_held(&mi->mi_recovlock, RW_WRITER));
476 		mutex_enter(&mi->mi_lock);
477 		mi->mi_recovflags |= MI4R_NEED_SECINFO;
478 		mutex_exit(&mi->mi_lock);
479 		break;
480 
481 	case NR_EXPIRED:
482 		if (vp1 != NULL)
483 			recov_badstate(recovp, vp1, NFS4ERR_EXPIRED);
484 		if (vp2 != NULL)
485 			recov_badstate(recovp, vp2, NFS4ERR_EXPIRED);
486 		goto out_no_thread;	/* no further recovery possible */
487 
488 	case NR_BAD_STATEID:
489 		if (vp1 != NULL)
490 			recov_badstate(recovp, vp1, NFS4ERR_BAD_STATEID);
491 		if (vp2 != NULL)
492 			recov_badstate(recovp, vp2, NFS4ERR_BAD_STATEID);
493 		goto out_no_thread;	/* no further recovery possible */
494 
495 	case NR_FHEXPIRED:
496 	case NR_BADHANDLE:
497 		if (vp1 != NULL)
498 			recov_throttle(recovp, vp1);
499 		if (vp2 != NULL)
500 			recov_throttle(recovp, vp2);
501 		/*
502 		 * Recover the filehandle now, rather than using a
503 		 * separate thread.  We can do this because filehandle
504 		 * recovery is independent of any other state, and because
505 		 * we know that we are not competing with the recovery
506 		 * thread at this time.  recov_filehandle will deal with
507 		 * threads that are competing to recover this filehandle.
508 		 */
509 		ASSERT(nfs_rw_lock_held(&mi->mi_recovlock, RW_READER) ||
510 		    nfs_rw_lock_held(&mi->mi_recovlock, RW_WRITER));
511 		if (vp1 != NULL)
512 			recov_filehandle(recovp->rc_action, mi, vp1);
513 		if (vp2 != NULL)
514 			recov_filehandle(recovp->rc_action, mi, vp2);
515 		goto out_no_thread;	/* no further recovery needed */
516 
517 	case NR_STALE:
518 		/*
519 		 * NFS4ERR_STALE handling
520 		 * recov_stale() could set MI4R_NEED_NEW_SERVER to
521 		 * indicate that we can and should failover.
522 		 */
523 		ASSERT(nfs_rw_lock_held(&mi->mi_recovlock, RW_READER) ||
524 		    nfs_rw_lock_held(&mi->mi_recovlock, RW_WRITER));
525 
526 		if (vp1 != NULL)
527 			recov_stale(mi, vp1);
528 		if (vp2 != NULL)
529 			recov_stale(mi, vp2);
530 		mutex_enter(&mi->mi_lock);
531 		if ((mi->mi_recovflags & MI4R_NEED_NEW_SERVER) == 0) {
532 			mutex_exit(&mi->mi_lock);
533 			goto out_no_thread;
534 		}
535 		mutex_exit(&mi->mi_lock);
536 		recovp->rc_action = NR_FAILOVER;
537 		goto again;
538 
539 	case NR_BAD_SEQID:
540 		if (recovp->rc_bseqid_rqst) {
541 			enqueue_bseqid_rqst(recovp, mi);
542 			break;
543 		}
544 
545 		if (vp1 != NULL)
546 			recov_badstate(recovp, vp1, NFS4ERR_BAD_SEQID);
547 		if (vp2 != NULL)
548 			recov_badstate(recovp, vp2, NFS4ERR_BAD_SEQID);
549 		goto out_no_thread; /* no further recovery possible */
550 
551 	case NR_OLDSTATEID:
552 		if (vp1 != NULL)
553 			recov_badstate(recovp, vp1, NFS4ERR_OLD_STATEID);
554 		if (vp2 != NULL)
555 			recov_badstate(recovp, vp2, NFS4ERR_OLD_STATEID);
556 		goto out_no_thread;	/* no further recovery possible */
557 
558 	case NR_GRACE:
559 		nfs4_set_grace_wait(mi);
560 		goto out_no_thread; /* no further action required for GRACE */
561 
562 	case NR_DELAY:
563 		if (vp1)
564 			nfs4_set_delay_wait(vp1);
565 		goto out_no_thread; /* no further action required for DELAY */
566 
567 	case NR_LOST_STATE_RQST:
568 	case NR_LOST_LOCK:
569 		nfs4_enqueue_lost_rqst(recovp, mi);
570 		break;
571 
572 	default:
573 		nfs4_queue_event(RE_UNEXPECTED_ACTION, mi, NULL,
574 		    recovp->rc_action, NULL, NULL, 0, NULL, 0, TAG_NONE,
575 		    TAG_NONE, 0, 0);
576 		goto out_no_thread;
577 	}
578 
579 	/*
580 	 * If either file recently went through the same recovery, wait
581 	 * awhile.  This is in case there is some sort of bug; we might not
582 	 * be able to recover properly, but at least we won't bombard the
583 	 * server with calls, and we won't tie up the client.
584 	 */
585 	if (vp1 != NULL)
586 		recov_throttle(recovp, vp1);
587 	if (vp2 != NULL)
588 		recov_throttle(recovp, vp2);
589 
590 	/*
591 	 * If there's already a recovery thread, don't start another one.
592 	 */
593 
594 	mutex_enter(&mi->mi_lock);
595 	if (mi->mi_flags & MI4_RECOV_ACTIV) {
596 		mutex_exit(&mi->mi_lock);
597 		goto out_no_thread;
598 	}
599 	mi->mi_flags |= MI4_RECOV_ACTIV;
600 	mutex_exit(&mi->mi_lock);
601 	NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
602 		"start_recovery: starting new thread for mi %p", (void*)mi));
603 
604 	recovp->rc_mi = mi;
605 	recovp->rc_vp1 = vp1;
606 	if (vp1 != NULL) {
607 		ASSERT(VTOMI4(vp1) == mi);
608 		VN_HOLD(recovp->rc_vp1);
609 	}
610 	recovp->rc_vp2 = vp2;
611 	if (vp2 != NULL) {
612 		ASSERT(VTOMI4(vp2) == mi);
613 		VN_HOLD(recovp->rc_vp2);
614 	}
615 
616 	(void) zthread_create(NULL, 0, nfs4_recov_thread, recovp, 0,
617 			    minclsyspri);
618 	return;
619 
620 	/* not reached by thread creating call */
621 out_no_thread:
622 	mutex_enter(&mi->mi_lock);
623 	mi->mi_in_recovery--;
624 	cv_broadcast(&mi->mi_cv_in_recov);
625 	mutex_exit(&mi->mi_lock);
626 
627 	VFS_RELE(mi->mi_vfsp);
628 	/*
629 	 * Free up resources that were allocated for us.
630 	 */
631 	kmem_free(recovp, sizeof (recov_info_t));
632 }
633 
634 static int
635 nfs4_check_srvstub(vnode_t *vp, rnode4_t *rp, nfs4_op_hint_t op)
636 {
637 	int err = 0;
638 
639 	/*
640 	 * If tuneable does not allow client to cross srv mountpoints and
641 	 * object is a stub, then check check op hint and return EACCES for
642 	 * any hint other than access, rddir, getattr, lookup.
643 	 */
644 	if (rp->r_flags & R4SRVSTUB && op != OH_ACCESS && op != OH_GETACL &&
645 	    op != OH_GETATTR && op != OH_READDIR && op != OH_LOOKUP) {
646 		err = EACCES;
647 #ifdef DEBUG
648 		NFS4_DEBUG(nfs4_srvmnt_debug, (CE_NOTE,
649 			"nfs4_check_srvstub: op=%d err=%d rp=%p vp=%p\n"
650 			"va_nod=%llx r_mntd_fid=%llx\n"
651 			"sv_fsid=(%llx:%llx) r_srv_fsid=(%llx:%llx)",
652 			op, err, (void *)rp, (void *)vp,
653 			(u_longlong_t)rp->r_attr.va_nodeid,
654 			(u_longlong_t)rp->r_mntd_fid,
655 			(u_longlong_t)rp->r_server->sv_fsid.major,
656 			(u_longlong_t)rp->r_server->sv_fsid.minor,
657 			(u_longlong_t)rp->r_srv_fsid.major,
658 			(u_longlong_t)rp->r_srv_fsid.minor));
659 #endif
660 	}
661 
662 	return (err);
663 }
664 
665 static int
666 nfs4_check_recov_err(vnode_t *vp, nfs4_op_hint_t op,
667 			nfs4_recov_state_t *rsp, int retry_err_cnt, char *str)
668 {
669 	rnode4_t *rp;
670 	int error = 0;
671 	int exempt;
672 
673 	if (vp == NULL)
674 		return (0);
675 
676 	exempt = (op == OH_CLOSE || op == OH_LOCKU || op == OH_DELEGRETURN);
677 	rp = VTOR4(vp);
678 	mutex_enter(&rp->r_statelock);
679 
680 	/*
681 	 * If there was a recovery error, then allow op hints "exempt" from
682 	 * recov errors to retry (currently 3 times).  Either r_error or
683 	 * EIO is returned for non-exempt op hints.
684 	 *
685 	 *	Error heirarchy:
686 	 *	a) check for R4ERECOVERR
687 	 *	b) check for R4SRVSTUB (only if R4RECOVERR is not set).
688 	 */
689 	if (rp->r_flags & R4RECOVERR) {
690 		if (exempt && rsp->rs_num_retry_despite_err <=
691 				nfs4_max_recov_error_retry) {
692 
693 			/*
694 			 * Check to make sure that we haven't already inc'd
695 			 * rs_num_retry_despite_err for current nfs4_start_fop
696 			 * instance.  We don't want to double inc (if we were
697 			 * called with vp2, then the vp1 call could have
698 			 * already incremented.
699 			 */
700 			if (retry_err_cnt == rsp->rs_num_retry_despite_err)
701 				rsp->rs_num_retry_despite_err++;
702 
703 			NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
704 				"nfs4_start_fop: %s %p DEAD, cnt=%d", str,
705 				(void *)vp, rsp->rs_num_retry_despite_err));
706 		} else {
707 			error = (rp->r_error ? rp->r_error : EIO);
708 			/*
709 			 * An ESTALE error on a non-regular file is not
710 			 * "sticky".  Return the ESTALE error once, but
711 			 * clear the condition to allow future operations
712 			 * to go OTW.  This will allow the client to
713 			 * recover if the server has merely unshared then
714 			 * re-shared the file system.  For regular files,
715 			 * the unshare has destroyed the open state at the
716 			 * server and we aren't willing to do a reopen (yet).
717 			 */
718 			if (error == ESTALE && vp->v_type != VREG) {
719 				rp->r_flags &=
720 					~(R4RECOVERR|R4RECOVERRP|R4STALE);
721 				rp->r_error = 0;
722 				error = ESTALE;
723 			}
724 			NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
725 				"nfs4_start_fop: %s %p DEAD, cnt=%d error=%d",
726 				str, (void *)vp,
727 				rsp->rs_num_retry_despite_err, error));
728 		}
729 	} else {
730 		error = nfs4_check_srvstub(vp, rp, op);
731 		NFS4_DEBUG(nfs4_client_recov_stub_debug, (CE_NOTE,
732 			"nfs4_start_fop: %s %p SRVSTUB, error=%d", str,
733 			(void *)vp, error));
734 	}
735 	mutex_exit(&rp->r_statelock);
736 	return (error);
737 }
738 
739 /*
740  * Initial setup code that every operation should call if it might invoke
741  * client recovery.  Can block waiting for recovery to finish on a
742  * filesystem.  Either vnode ptr can be NULL.
743  *
744  * Returns 0 if there are no outstanding errors.  Can return an
745  * errno value under various circumstances (e.g., failed recovery, or
746  * interrupted while waiting for recovery to finish).
747  *
748  * There must be a corresponding call to nfs4_end_op() to free up any locks
749  * or resources allocated by this call (assuming this call succeeded),
750  * using the same rsp that's passed in here.
751  *
752  * The open and lock seqid synchronization must be stopped before calling this
753  * function, as it could lead to deadlock when trying to reopen a file or
754  * reclaim a lock.  The synchronization is obtained with calls to:
755  *   nfs4_start_open_seqid_sync()
756  *   nfs4_start_lock_seqid_sync()
757  *
758  * *startrecovp is set TRUE if the caller should not bother with the
759  * over-the-wire call, and just initiate recovery for the given request.
760  * This is typically used for state-releasing ops if the filesystem has
761  * been forcibly unmounted.  startrecovp may be NULL for
762  * non-state-releasing ops.
763  */
764 
765 int
766 nfs4_start_fop(mntinfo4_t *mi, vnode_t *vp1, vnode_t *vp2, nfs4_op_hint_t op,
767 		nfs4_recov_state_t *rsp, bool_t *startrecovp)
768 {
769 	int error = 0, rerr_cnt;
770 	nfs4_server_t *sp = NULL;
771 	nfs4_server_t *tsp;
772 	nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };
773 	time_t droplock_time;
774 #ifdef DEBUG
775 	void *fop_caller;
776 #endif
777 
778 	ASSERT(vp1 == NULL || vp1->v_vfsp == mi->mi_vfsp);
779 	ASSERT(vp2 == NULL || vp2->v_vfsp == mi->mi_vfsp);
780 
781 #ifdef	DEBUG
782 	if ((fop_caller = tsd_get(nfs4_tsd_key)) != NULL) {
783 		cmn_err(CE_PANIC, "Missing nfs4_end_fop: last caller %p",
784 			fop_caller);
785 	}
786 	(void) tsd_set(nfs4_tsd_key, caller());
787 #endif
788 
789 	rsp->rs_sp = NULL;
790 	rsp->rs_flags &= ~NFS4_RS_RENAME_HELD;
791 	rerr_cnt = rsp->rs_num_retry_despite_err;
792 
793 	/*
794 	 * Process the items that may delay() based on server response
795 	 */
796 	error = nfs4_wait_for_grace(mi, rsp);
797 	if (error)
798 		goto out;
799 
800 	if (vp1 != NULL) {
801 		error = nfs4_wait_for_delay(vp1, rsp);
802 		if (error)
803 			goto out;
804 	}
805 
806 	/* Wait for a delegation recall to complete. */
807 
808 	error = wait_for_recall(vp1, vp2, op, rsp);
809 	if (error)
810 		goto out;
811 
812 	/*
813 	 * Wait for any current recovery actions to finish.  Note that a
814 	 * recovery thread can still start up after wait_for_recovery()
815 	 * finishes.  We don't block out recovery operations until we
816 	 * acquire s_recovlock and mi_recovlock.
817 	 */
818 	error = wait_for_recovery(mi, op);
819 	if (error)
820 		goto out;
821 
822 	/*
823 	 * Check to see if the rnode is already marked with a
824 	 * recovery error.  If so, return it immediately.  But
825 	 * always pass CLOSE, LOCKU, and DELEGRETURN so we can
826 	 * clean up state on the server.
827 	 */
828 
829 	if (vp1 != NULL) {
830 		if (error = nfs4_check_recov_err(vp1, op, rsp, rerr_cnt, "vp1"))
831 			goto out;
832 		nfs4_check_remap(mi, vp1, NFS4_REMAP_CKATTRS, &e);
833 	}
834 
835 	if (vp2 != NULL) {
836 		if (error = nfs4_check_recov_err(vp2, op, rsp, rerr_cnt, "vp2"))
837 			goto out;
838 		nfs4_check_remap(mi, vp2, NFS4_REMAP_CKATTRS, &e);
839 	}
840 
841 	/*
842 	 * The lock order calls for us to acquire s_recovlock before
843 	 * mi_recovlock, but we have to hold mi_recovlock to look up sp (to
844 	 * prevent races with the failover/migration code).  So acquire
845 	 * mi_recovlock, look up sp, drop mi_recovlock, acquire
846 	 * s_recovlock and mi_recovlock, then verify that sp is still the
847 	 * right object.  XXX Can we find a simpler way to deal with this?
848 	 */
849 	if (nfs_rw_enter_sig(&mi->mi_recovlock, RW_READER,
850 	    mi->mi_flags & MI4_INT)) {
851 		error = EINTR;
852 		goto out;
853 	}
854 get_sp:
855 	sp = find_nfs4_server(mi);
856 	if (sp != NULL) {
857 		sp->s_otw_call_count++;
858 		mutex_exit(&sp->s_lock);
859 		droplock_time = gethrestime_sec();
860 	}
861 	nfs_rw_exit(&mi->mi_recovlock);
862 
863 	if (sp != NULL) {
864 		if (nfs_rw_enter_sig(&sp->s_recovlock, RW_READER,
865 			    mi->mi_flags & MI4_INT)) {
866 			error = EINTR;
867 			goto out;
868 		}
869 	}
870 	if (nfs_rw_enter_sig(&mi->mi_recovlock, RW_READER,
871 			    mi->mi_flags & MI4_INT)) {
872 		if (sp != NULL)
873 			nfs_rw_exit(&sp->s_recovlock);
874 		error = EINTR;
875 		goto out;
876 	}
877 	/*
878 	 * If the mntinfo4_t hasn't changed nfs4_sever_ts then
879 	 * there's no point in double checking to make sure it
880 	 * has switched.
881 	 */
882 	if (sp == NULL || droplock_time < mi->mi_srvsettime) {
883 		tsp = find_nfs4_server(mi);
884 		if (tsp != sp) {
885 			/* try again */
886 			if (tsp != NULL) {
887 				mutex_exit(&tsp->s_lock);
888 				nfs4_server_rele(tsp);
889 				tsp = NULL;
890 			}
891 			if (sp != NULL) {
892 				nfs_rw_exit(&sp->s_recovlock);
893 				mutex_enter(&sp->s_lock);
894 				sp->s_otw_call_count--;
895 				mutex_exit(&sp->s_lock);
896 				nfs4_server_rele(sp);
897 				sp = NULL;
898 			}
899 			goto get_sp;
900 		} else {
901 			if (tsp != NULL) {
902 				mutex_exit(&tsp->s_lock);
903 				nfs4_server_rele(tsp);
904 				tsp = NULL;
905 			}
906 		}
907 	}
908 
909 	if (sp != NULL) {
910 		rsp->rs_sp = sp;
911 	}
912 
913 	/*
914 	 * If the fileystem uses volatile filehandles, obtain a lock so
915 	 * that we synchronize with renames.  Exception: mount operations
916 	 * can change mi_fh_expire_type, which could be a problem, since
917 	 * the end_op code needs to be consistent with the start_op code
918 	 * about mi_rename_lock.  Since mounts don't compete with renames,
919 	 * it's simpler to just not acquire the rename lock for mounts.
920 	 */
921 	if (NFS4_VOLATILE_FH(mi) && op != OH_MOUNT) {
922 		if (nfs_rw_enter_sig(&mi->mi_rename_lock,
923 				    op == OH_VFH_RENAME ? RW_WRITER : RW_READER,
924 				    mi->mi_flags & MI4_INT)) {
925 			nfs_rw_exit(&mi->mi_recovlock);
926 			if (sp != NULL)
927 				nfs_rw_exit(&sp->s_recovlock);
928 			error = EINTR;
929 			goto out;
930 		}
931 		rsp->rs_flags |= NFS4_RS_RENAME_HELD;
932 	}
933 
934 	if (OH_IS_STATE_RELE(op)) {
935 		/*
936 		 * For forced unmount, letting the request proceed will
937 		 * almost always delay response to the user, so hand it off
938 		 * to the recovery thread.  For exiting lwp's, we don't
939 		 * have a good way to tell if the request will hang.  We
940 		 * generally want processes to handle their own requests so
941 		 * that they can be done in parallel, but if there is
942 		 * already a recovery thread, hand the request off to it.
943 		 * This will improve user response at no cost to overall
944 		 * system throughput.  For zone shutdown, we'd prefer
945 		 * the recovery thread to handle this as well.
946 		 */
947 		ASSERT(startrecovp != NULL);
948 		mutex_enter(&mi->mi_lock);
949 		if (FS_OR_ZONE_GONE4(mi->mi_vfsp))
950 			*startrecovp = TRUE;
951 		else if ((curthread->t_proc_flag & TP_LWPEXIT) &&
952 		    (mi->mi_flags & MI4_RECOV_ACTIV))
953 			*startrecovp = TRUE;
954 		else
955 			*startrecovp = FALSE;
956 		mutex_exit(&mi->mi_lock);
957 	} else
958 		if (startrecovp != NULL)
959 			*startrecovp = FALSE;
960 
961 	ASSERT(error == 0);
962 	return (error);
963 
964 out:
965 	ASSERT(error != 0);
966 	if (sp != NULL) {
967 		mutex_enter(&sp->s_lock);
968 		sp->s_otw_call_count--;
969 		mutex_exit(&sp->s_lock);
970 		nfs4_server_rele(sp);
971 		rsp->rs_sp = NULL;
972 	}
973 	nfs4_end_op_recall(vp1, vp2, rsp);
974 
975 #ifdef	DEBUG
976 	(void) tsd_set(nfs4_tsd_key, NULL);
977 #endif
978 	return (error);
979 }
980 
981 /*
982  * It is up to the caller to determine if rsp->rs_sp being NULL
983  * is detrimental or not.
984  */
985 int
986 nfs4_start_op(mntinfo4_t *mi, vnode_t *vp1, vnode_t *vp2,
987 	nfs4_recov_state_t *rsp)
988 {
989 	ASSERT(rsp->rs_num_retry_despite_err == 0);
990 	rsp->rs_num_retry_despite_err = 0;
991 	return (nfs4_start_fop(mi, vp1, vp2, OH_OTHER, rsp, NULL));
992 }
993 
994 /*
995  * Release any resources acquired by nfs4_start_op().
996  * 'sp' should be the nfs4_server pointer returned by nfs4_start_op().
997  *
998  * The operation hint is used to avoid a deadlock by bypassing delegation
999  * return logic for writes, which are done while returning a delegation.
1000  */
1001 
1002 void
1003 nfs4_end_fop(mntinfo4_t *mi, vnode_t *vp1, vnode_t *vp2, nfs4_op_hint_t op,
1004 		nfs4_recov_state_t *rsp, bool_t needs_recov)
1005 {
1006 	nfs4_server_t *sp = rsp->rs_sp;
1007 	rnode4_t *rp = NULL;
1008 
1009 #ifdef	lint
1010 	/*
1011 	 * The op hint isn't used any more, but might be in
1012 	 * the future.
1013 	 */
1014 	op = op;
1015 #endif
1016 
1017 #ifdef	DEBUG
1018 	ASSERT(tsd_get(nfs4_tsd_key) != NULL);
1019 	(void) tsd_set(nfs4_tsd_key, NULL);
1020 #endif
1021 
1022 	nfs4_end_op_recall(vp1, vp2, rsp);
1023 
1024 	if (rsp->rs_flags & NFS4_RS_RENAME_HELD)
1025 		nfs_rw_exit(&mi->mi_rename_lock);
1026 
1027 	if (!needs_recov) {
1028 		if (rsp->rs_flags & NFS4_RS_DELAY_MSG) {
1029 			/* may need to clear the delay interval */
1030 			if (vp1 != NULL) {
1031 				rp = VTOR4(vp1);
1032 				mutex_enter(&rp->r_statelock);
1033 				rp->r_delay_interval = 0;
1034 				mutex_exit(&rp->r_statelock);
1035 			}
1036 		}
1037 		rsp->rs_flags &= ~(NFS4_RS_GRACE_MSG|NFS4_RS_DELAY_MSG);
1038 	}
1039 
1040 	/*
1041 	 * If the corresponding nfs4_start_op() found a sp,
1042 	 * then there must still be a sp.
1043 	 */
1044 	if (sp != NULL) {
1045 		nfs_rw_exit(&mi->mi_recovlock);
1046 		nfs_rw_exit(&sp->s_recovlock);
1047 		mutex_enter(&sp->s_lock);
1048 		sp->s_otw_call_count--;
1049 		cv_broadcast(&sp->s_cv_otw_count);
1050 		mutex_exit(&sp->s_lock);
1051 		nfs4_server_rele(sp);
1052 	} else {
1053 		nfs_rw_exit(&mi->mi_recovlock);
1054 	}
1055 }
1056 
1057 void
1058 nfs4_end_op(mntinfo4_t *mi, vnode_t *vp1, vnode_t *vp2,
1059 	    nfs4_recov_state_t *rsp, bool_t needrecov)
1060 {
1061 	nfs4_end_fop(mi, vp1, vp2, OH_OTHER, rsp, needrecov);
1062 }
1063 
1064 /*
1065  * If the filesystem is going through client recovery, block until
1066  * finished.
1067  * Exceptions:
1068  * - state-releasing ops (CLOSE, LOCKU, DELEGRETURN) are allowed to proceed
1069  *   if the filesystem has been forcibly unmounted or the lwp is exiting.
1070  *
1071  * Return value:
1072  * - 0 if no errors
1073  * - EINTR if the call was interrupted
1074  * - EIO if the filesystem has been forcibly unmounted (non-state-releasing
1075  *   op)
1076  * - the errno value from the recovery thread, if recovery failed
1077  */
1078 
1079 static int
1080 wait_for_recovery(mntinfo4_t *mi, nfs4_op_hint_t op_hint)
1081 {
1082 	int error = 0;
1083 
1084 	mutex_enter(&mi->mi_lock);
1085 
1086 	while (mi->mi_recovflags != 0) {
1087 		klwp_t *lwp = ttolwp(curthread);
1088 
1089 		if (mi->mi_flags & MI4_RECOV_FAIL)
1090 			break;
1091 		if (mi->mi_vfsp->vfs_flag & VFS_UNMOUNTED)
1092 			break;
1093 		if (OH_IS_STATE_RELE(op_hint) &&
1094 		    (curthread->t_proc_flag & TP_LWPEXIT))
1095 			break;
1096 
1097 		if (lwp != NULL)
1098 			lwp->lwp_nostop++;
1099 		/* XXX - use different cv? */
1100 		if (cv_wait_sig(&mi->mi_failover_cv, &mi->mi_lock) == 0) {
1101 			error = EINTR;
1102 			if (lwp != NULL)
1103 				lwp->lwp_nostop--;
1104 			break;
1105 		}
1106 		if (lwp != NULL)
1107 			lwp->lwp_nostop--;
1108 	}
1109 
1110 	if (mi->mi_flags & MI4_RECOV_FAIL) {
1111 		NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
1112 			"wait_for_recovery: fail since RECOV FAIL"));
1113 		error = mi->mi_error;
1114 	} else if ((mi->mi_vfsp->vfs_flag & VFS_UNMOUNTED) &&
1115 	    !OH_IS_STATE_RELE(op_hint)) {
1116 		NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
1117 			"wait_for_recovery: forced unmount"));
1118 		error = EIO;
1119 	}
1120 
1121 	mutex_exit(&mi->mi_lock);
1122 
1123 	return (error);
1124 }
1125 
1126 /*
1127  * If the client received NFS4ERR_GRACE for this particular mount,
1128  * the client blocks here until it is time to try again.
1129  *
1130  * Return value:
1131  * - 0 if wait was successful
1132  * - EINTR if the call was interrupted
1133  */
1134 
1135 int
1136 nfs4_wait_for_grace(mntinfo4_t *mi, nfs4_recov_state_t *rsp)
1137 {
1138 	int error = 0;
1139 	time_t curtime, time_to_wait;
1140 
1141 	/* do a unprotected check to reduce mi_lock contention */
1142 	if (mi->mi_grace_wait != 0) {
1143 		mutex_enter(&mi->mi_lock);
1144 
1145 		if (mi->mi_grace_wait != 0) {
1146 			if (!(rsp->rs_flags & NFS4_RS_GRACE_MSG))
1147 				rsp->rs_flags |= NFS4_RS_GRACE_MSG;
1148 
1149 			curtime = gethrestime_sec();
1150 
1151 			if (curtime < mi->mi_grace_wait) {
1152 
1153 				time_to_wait = mi->mi_grace_wait - curtime;
1154 
1155 				mutex_exit(&mi->mi_lock);
1156 
1157 				delay(SEC_TO_TICK(time_to_wait));
1158 
1159 				curtime = gethrestime_sec();
1160 
1161 				mutex_enter(&mi->mi_lock);
1162 
1163 				if (curtime >= mi->mi_grace_wait)
1164 					mi->mi_grace_wait = 0;
1165 			} else {
1166 				mi->mi_grace_wait = 0;
1167 			}
1168 		}
1169 		mutex_exit(&mi->mi_lock);
1170 	}
1171 
1172 	return (error);
1173 }
1174 
1175 /*
1176  * If the client received NFS4ERR_DELAY for an operation on a vnode,
1177  * the client blocks here until it is time to try again.
1178  *
1179  * Return value:
1180  * - 0 if wait was successful
1181  * - EINTR if the call was interrupted
1182  */
1183 
1184 int
1185 nfs4_wait_for_delay(vnode_t *vp, nfs4_recov_state_t *rsp)
1186 {
1187 	int error = 0;
1188 	time_t curtime, time_to_wait;
1189 	rnode4_t *rp;
1190 
1191 	ASSERT(vp != NULL);
1192 
1193 	rp = VTOR4(vp);
1194 
1195 	/* do a unprotected check to reduce r_statelock contention */
1196 	if (rp->r_delay_wait != 0) {
1197 		mutex_enter(&rp->r_statelock);
1198 
1199 		if (rp->r_delay_wait != 0) {
1200 
1201 			if (!(rsp->rs_flags & NFS4_RS_DELAY_MSG)) {
1202 				rsp->rs_flags |= NFS4_RS_DELAY_MSG;
1203 				nfs4_mi_kstat_inc_delay(VTOMI4(vp));
1204 			}
1205 
1206 			curtime = gethrestime_sec();
1207 
1208 			if (curtime < rp->r_delay_wait) {
1209 
1210 				time_to_wait = rp->r_delay_wait - curtime;
1211 
1212 				mutex_exit(&rp->r_statelock);
1213 
1214 				delay(SEC_TO_TICK(time_to_wait));
1215 
1216 				curtime = gethrestime_sec();
1217 
1218 				mutex_enter(&rp->r_statelock);
1219 
1220 				if (curtime >= rp->r_delay_wait)
1221 					rp->r_delay_wait = 0;
1222 			} else {
1223 				rp->r_delay_wait = 0;
1224 			}
1225 		}
1226 		mutex_exit(&rp->r_statelock);
1227 	}
1228 
1229 	return (error);
1230 }
1231 
1232 /*
1233  * The recovery thread.
1234  */
1235 
1236 static void
1237 nfs4_recov_thread(recov_info_t *recovp)
1238 {
1239 	mntinfo4_t *mi = recovp->rc_mi;
1240 	nfs4_server_t *sp;
1241 	int done = 0, error = 0;
1242 	bool_t recov_fail = FALSE;
1243 	callb_cpr_t cpr_info;
1244 	kmutex_t cpr_lock;
1245 
1246 	nfs4_queue_event(RE_START, mi, NULL, mi->mi_recovflags,
1247 	    recovp->rc_vp1, recovp->rc_vp2, 0, NULL, 0, TAG_NONE, TAG_NONE,
1248 	    0, 0);
1249 
1250 	mutex_init(&cpr_lock, NULL, MUTEX_DEFAULT, NULL);
1251 	CALLB_CPR_INIT(&cpr_info, &cpr_lock, callb_generic_cpr, "nfsv4Recov");
1252 
1253 	mutex_enter(&mi->mi_lock);
1254 	mi->mi_recovthread = curthread;
1255 	mutex_exit(&mi->mi_lock);
1256 
1257 	/*
1258 	 * We don't really need protection here against failover or
1259 	 * migration, since the current thread is the one that would make
1260 	 * any changes, but hold mi_recovlock anyway for completeness (and
1261 	 * to satisfy any ASSERTs).
1262 	 */
1263 	(void) nfs_rw_enter_sig(&mi->mi_recovlock, RW_READER, 0);
1264 	sp = find_nfs4_server(mi);
1265 	if (sp != NULL)
1266 		mutex_exit(&sp->s_lock);
1267 	nfs_rw_exit(&mi->mi_recovlock);
1268 
1269 	/*
1270 	 * Do any necessary recovery, based on the information in recovp
1271 	 * and any recovery flags.
1272 	 */
1273 
1274 	do {
1275 		mutex_enter(&mi->mi_lock);
1276 		if (FS_OR_ZONE_GONE4(mi->mi_vfsp)) {
1277 			bool_t activesrv;
1278 
1279 			NFS4_DEBUG(nfs4_client_recov_debug &&
1280 			    mi->mi_vfsp->vfs_flag & VFS_UNMOUNTED, (CE_NOTE,
1281 				"nfs4_recov_thread: file system has been "
1282 				"unmounted"));
1283 			NFS4_DEBUG(nfs4_client_recov_debug &&
1284 			    zone_status_get(curproc->p_zone) >=
1285 			    ZONE_IS_SHUTTING_DOWN, (CE_NOTE,
1286 				"nfs4_recov_thread: zone shutting down"));
1287 			/*
1288 			 * If the server has lost its state for us and
1289 			 * the filesystem is unmounted, then the filesystem
1290 			 * can be tossed, even if there are lost lock or
1291 			 * lost state calls in the recovery queue.
1292 			 */
1293 			if (mi->mi_recovflags &
1294 			    (MI4R_NEED_CLIENTID | MI4R_REOPEN_FILES)) {
1295 				NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
1296 				"nfs4_recov_thread: bailing out"));
1297 				mi->mi_flags |= MI4_RECOV_FAIL;
1298 				mi->mi_error = recovp->rc_error;
1299 				recov_fail = TRUE;
1300 			}
1301 			/*
1302 			 * We don't know if the server has any state for
1303 			 * us, and the filesystem has been unmounted.  If
1304 			 * there are "lost state" recovery items, keep
1305 			 * trying to process them until there are no more
1306 			 * mounted filesystems for the server.  Otherwise,
1307 			 * bail out.  The reason we don't mark the
1308 			 * filesystem as failing recovery is in case we
1309 			 * have to do "lost state" recovery later (e.g., a
1310 			 * user process exits).
1311 			 */
1312 			if (!(mi->mi_recovflags & MI4R_LOST_STATE)) {
1313 				done = 1;
1314 				mutex_exit(&mi->mi_lock);
1315 				break;
1316 			}
1317 			mutex_exit(&mi->mi_lock);
1318 
1319 			if (sp == NULL)
1320 				activesrv = FALSE;
1321 			else {
1322 				mutex_enter(&sp->s_lock);
1323 				activesrv = nfs4_fs_active(sp);
1324 			}
1325 			if (!activesrv) {
1326 				NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
1327 					"no active fs for server %p",
1328 					(void *)sp));
1329 				mutex_enter(&mi->mi_lock);
1330 				mi->mi_flags |= MI4_RECOV_FAIL;
1331 				mi->mi_error = recovp->rc_error;
1332 				mutex_exit(&mi->mi_lock);
1333 				recov_fail = TRUE;
1334 				if (sp != NULL) {
1335 					/*
1336 					 * Mark the server instance as
1337 					 * dead, so that nobody will attach
1338 					 * a new filesystem.
1339 					 */
1340 					nfs4_mark_srv_dead(sp);
1341 				}
1342 			}
1343 			if (sp != NULL)
1344 				mutex_exit(&sp->s_lock);
1345 		} else {
1346 			mutex_exit(&mi->mi_lock);
1347 		}
1348 
1349 		/*
1350 		 * Check if we need to select a new server for a
1351 		 * failover.  Choosing a new server will force at
1352 		 * least a check of the clientid.
1353 		 */
1354 		mutex_enter(&mi->mi_lock);
1355 		if (!recov_fail &&
1356 		    (mi->mi_recovflags & MI4R_NEED_NEW_SERVER)) {
1357 			mutex_exit(&mi->mi_lock);
1358 			recov_newserver(recovp, &sp, &recov_fail);
1359 		} else
1360 			mutex_exit(&mi->mi_lock);
1361 
1362 		/*
1363 		 * Check if we need to recover the clientid.  This
1364 		 * must be done before file and lock recovery, and it
1365 		 * potentially affects the recovery threads for other
1366 		 * filesystems, so it gets special treatment.
1367 		 */
1368 		if (sp != NULL && recov_fail == FALSE) {
1369 			mutex_enter(&sp->s_lock);
1370 			if (!(sp->s_flags & N4S_CLIENTID_SET)) {
1371 				mutex_exit(&sp->s_lock);
1372 				recov_clientid(recovp, sp);
1373 			} else {
1374 				/*
1375 				 * Unset this flag in case another recovery
1376 				 * thread successfully recovered the clientid
1377 				 * for us already.
1378 				 */
1379 				mutex_enter(&mi->mi_lock);
1380 				mi->mi_recovflags &= ~MI4R_NEED_CLIENTID;
1381 				mutex_exit(&mi->mi_lock);
1382 				mutex_exit(&sp->s_lock);
1383 			}
1384 		}
1385 
1386 		/*
1387 		 * Check if we need to get the security information.
1388 		 */
1389 		mutex_enter(&mi->mi_lock);
1390 		if ((mi->mi_recovflags & MI4R_NEED_SECINFO) &&
1391 		    !(mi->mi_flags & MI4_RECOV_FAIL)) {
1392 			mutex_exit(&mi->mi_lock);
1393 			(void) nfs_rw_enter_sig(&mi->mi_recovlock,
1394 							RW_WRITER, 0);
1395 			error = nfs4_secinfo_recov(recovp->rc_mi,
1396 					recovp->rc_vp1, recovp->rc_vp2);
1397 			/*
1398 			 * If error, nothing more can be done, stop
1399 			 * the recovery.
1400 			 */
1401 			if (error) {
1402 				mutex_enter(&mi->mi_lock);
1403 				mi->mi_flags |= MI4_RECOV_FAIL;
1404 				mi->mi_error = recovp->rc_error;
1405 				mutex_exit(&mi->mi_lock);
1406 				nfs4_queue_event(RE_WRONGSEC, mi, NULL,
1407 				    error, recovp->rc_vp1, recovp->rc_vp2,
1408 				    0, NULL, 0, TAG_NONE, TAG_NONE, 0, 0);
1409 			}
1410 			nfs_rw_exit(&mi->mi_recovlock);
1411 		} else
1412 			mutex_exit(&mi->mi_lock);
1413 
1414 		/*
1415 		 * Check if there's a bad seqid to recover.
1416 		 */
1417 		mutex_enter(&mi->mi_lock);
1418 		if ((mi->mi_recovflags & MI4R_BAD_SEQID) &&
1419 		    !(mi->mi_flags & MI4_RECOV_FAIL)) {
1420 			mutex_exit(&mi->mi_lock);
1421 			(void) nfs_rw_enter_sig(&mi->mi_recovlock,
1422 					RW_WRITER, 0);
1423 			recov_bad_seqid(recovp);
1424 			nfs_rw_exit(&mi->mi_recovlock);
1425 		} else
1426 			mutex_exit(&mi->mi_lock);
1427 
1428 		/*
1429 		 * Next check for recovery that affects the entire
1430 		 * filesystem.
1431 		 */
1432 		if (sp != NULL) {
1433 			mutex_enter(&mi->mi_lock);
1434 			if ((mi->mi_recovflags & MI4R_REOPEN_FILES) &&
1435 			    !(mi->mi_flags & MI4_RECOV_FAIL)) {
1436 				mutex_exit(&mi->mi_lock);
1437 				recov_openfiles(recovp, sp);
1438 			} else
1439 				mutex_exit(&mi->mi_lock);
1440 		}
1441 
1442 		/*
1443 		 * Send any queued state recovery requests.
1444 		 */
1445 		mutex_enter(&mi->mi_lock);
1446 		if (sp != NULL &&
1447 		    (mi->mi_recovflags & MI4R_LOST_STATE) &&
1448 		    !(mi->mi_flags & MI4_RECOV_FAIL)) {
1449 			mutex_exit(&mi->mi_lock);
1450 			(void) nfs_rw_enter_sig(&mi->mi_recovlock,
1451 				    RW_WRITER, 0);
1452 			nfs4_resend_lost_rqsts(recovp, sp);
1453 			if (list_head(&mi->mi_lost_state) == NULL) {
1454 				/* done */
1455 				mutex_enter(&mi->mi_lock);
1456 				mi->mi_recovflags &= ~MI4R_LOST_STATE;
1457 				mutex_exit(&mi->mi_lock);
1458 			}
1459 			nfs_rw_exit(&mi->mi_recovlock);
1460 		} else {
1461 			mutex_exit(&mi->mi_lock);
1462 		}
1463 
1464 		/*
1465 		 * See if there is anything more to do.  If not, announce
1466 		 * that we are done and exit.
1467 		 *
1468 		 * Need mi_recovlock to keep 'sp' valid.  Must grab
1469 		 * mi_recovlock before mi_lock to preserve lock ordering.
1470 		 */
1471 		(void) nfs_rw_enter_sig(&mi->mi_recovlock, RW_READER, 0);
1472 		mutex_enter(&mi->mi_lock);
1473 		if ((mi->mi_recovflags & ~MI4R_SRV_REBOOT) == 0 ||
1474 		    (mi->mi_flags & MI4_RECOV_FAIL)) {
1475 			list_t local_lost_state;
1476 			nfs4_lost_rqst_t *lrp;
1477 
1478 			/*
1479 			 * We need to remove the lost requests before we
1480 			 * unmark the mi as no longer doing recovery to
1481 			 * avoid a race with a new thread putting new lost
1482 			 * requests on the same mi (and the going away
1483 			 * thread would remove the new lost requests).
1484 			 *
1485 			 * Move the lost requests to a local list since
1486 			 * nfs4_remove_lost_rqst() drops mi_lock, and
1487 			 * dropping the mi_lock would make our check to
1488 			 * see if recovery is done no longer valid.
1489 			 */
1490 			list_create(&local_lost_state,
1491 			    sizeof (nfs4_lost_rqst_t),
1492 			    offsetof(nfs4_lost_rqst_t, lr_node));
1493 			list_move_tail(&local_lost_state, &mi->mi_lost_state);
1494 
1495 			done = 1;
1496 			mutex_exit(&mi->mi_lock);
1497 			/*
1498 			 * Now officially free the "moved"
1499 			 * lost requests.
1500 			 */
1501 			while ((lrp = list_head(&local_lost_state)) != NULL) {
1502 				list_remove(&local_lost_state, lrp);
1503 				nfs4_free_lost_rqst(lrp, sp);
1504 			}
1505 			list_destroy(&local_lost_state);
1506 		} else
1507 			mutex_exit(&mi->mi_lock);
1508 		nfs_rw_exit(&mi->mi_recovlock);
1509 
1510 		/*
1511 		 * If the filesystem has been forcibly unmounted, there is
1512 		 * probably no point in retrying immediately.  Furthermore,
1513 		 * there might be user processes waiting for a chance to
1514 		 * queue up "lost state" requests, so that they can exit.
1515 		 * So pause here for a moment.  Same logic for zone shutdown.
1516 		 */
1517 		if (!done && FS_OR_ZONE_GONE4(mi->mi_vfsp)) {
1518 			mutex_enter(&mi->mi_lock);
1519 			cv_broadcast(&mi->mi_failover_cv);
1520 			mutex_exit(&mi->mi_lock);
1521 			delay(SEC_TO_TICK(nfs4_unmount_delay));
1522 		}
1523 
1524 	} while (!done);
1525 
1526 
1527 	if (sp != NULL)
1528 		nfs4_server_rele(sp);
1529 
1530 	/*
1531 	 * Return all recalled delegations
1532 	 */
1533 	nfs4_dlistclean();
1534 
1535 	mutex_enter(&mi->mi_lock);
1536 	recov_done(mi, recovp);
1537 	mi->mi_in_recovery--;
1538 
1539 	/*
1540 	 * Free up resources that were allocated for us.
1541 	 */
1542 	if (recovp->rc_vp1 != NULL)
1543 		VN_RELE(recovp->rc_vp1);
1544 	if (recovp->rc_vp2 != NULL)
1545 		VN_RELE(recovp->rc_vp2);
1546 	VFS_RELE(mi->mi_vfsp);
1547 	cv_broadcast(&mi->mi_cv_in_recov);
1548 	mutex_exit(&mi->mi_lock);
1549 
1550 	kmem_free(recovp, sizeof (recov_info_t));
1551 	mutex_enter(&cpr_lock);
1552 	CALLB_CPR_EXIT(&cpr_info);
1553 	mutex_destroy(&cpr_lock);
1554 	zthread_exit();
1555 }
1556 
1557 /*
1558  * Log the end of recovery and notify any waiting threads.
1559  */
1560 
1561 static void
1562 recov_done(mntinfo4_t *mi, recov_info_t *recovp)
1563 {
1564 
1565 	ASSERT(MUTEX_HELD(&mi->mi_lock));
1566 
1567 	nfs4_queue_event(RE_END, mi, NULL, 0, recovp->rc_vp1,
1568 		recovp->rc_vp2, 0, NULL, 0, TAG_NONE, TAG_NONE, 0, 0);
1569 	mi->mi_recovthread = NULL;
1570 	mi->mi_flags &= ~MI4_RECOV_ACTIV;
1571 	mi->mi_recovflags &= ~MI4R_SRV_REBOOT;
1572 	cv_broadcast(&mi->mi_failover_cv);
1573 }
1574 
1575 /*
1576  * State-specific recovery routines, by state.
1577  */
1578 
1579 /*
1580  * Failover.
1581  *
1582  * Replaces *spp with a reference to the new server, which must
1583  * eventually be freed.
1584  */
1585 
1586 static void
1587 recov_newserver(recov_info_t *recovp, nfs4_server_t **spp, bool_t *recov_fail)
1588 {
1589 	mntinfo4_t *mi = recovp->rc_mi;
1590 	servinfo4_t *svp = NULL;
1591 	nfs4_server_t *osp = *spp;
1592 	CLIENT *cl;
1593 	enum clnt_stat status;
1594 	struct timeval tv;
1595 	int error;
1596 	int oncethru = 0;
1597 	rnode4_t *rp;
1598 	int index;
1599 	nfs_fh4 fh;
1600 	char *snames;
1601 	size_t len;
1602 
1603 	(void) nfs_rw_enter_sig(&mi->mi_recovlock, RW_WRITER, 0);
1604 
1605 	tv.tv_sec = 2;
1606 	tv.tv_usec = 0;
1607 
1608 #ifdef lint
1609 	/*
1610 	 * Lint can't follow the logic, so thinks that snames and len
1611 	 * can be used before being set.  They can't, but lint can't
1612 	 * figure it out.  To address the lint warning, initialize
1613 	 * snames and len for lint.
1614 	 */
1615 	snames = NULL;
1616 	len = 0;
1617 #endif
1618 
1619 	/*
1620 	 * Ping the null NFS procedure of every server in
1621 	 * the list until one responds.  We always start
1622 	 * at the head of the list and always skip the one
1623 	 * that is current, since it's caused us a problem.
1624 	 */
1625 	while (svp == NULL) {
1626 		for (svp = mi->mi_servers; svp; svp = svp->sv_next) {
1627 
1628 			mutex_enter(&mi->mi_lock);
1629 			if (FS_OR_ZONE_GONE4(mi->mi_vfsp)) {
1630 				mi->mi_flags |= MI4_RECOV_FAIL;
1631 				mutex_exit(&mi->mi_lock);
1632 				(void) nfs_rw_exit(&mi->mi_recovlock);
1633 				*recov_fail = TRUE;
1634 				if (oncethru)
1635 					kmem_free(snames, len);
1636 				return;
1637 			}
1638 			mutex_exit(&mi->mi_lock);
1639 
1640 			(void) nfs_rw_enter_sig(&svp->sv_lock, RW_READER, 0);
1641 			if (svp->sv_flags & SV4_NOTINUSE) {
1642 				nfs_rw_exit(&svp->sv_lock);
1643 				continue;
1644 			}
1645 			nfs_rw_exit(&svp->sv_lock);
1646 
1647 			if (!oncethru && svp == mi->mi_curr_serv)
1648 				continue;
1649 
1650 			error = clnt_tli_kcreate(svp->sv_knconf, &svp->sv_addr,
1651 			    NFS_PROGRAM, NFS_V4, 0, 1, CRED(), &cl);
1652 			if (error)
1653 				continue;
1654 
1655 			if (!(mi->mi_flags & MI4_INT))
1656 				cl->cl_nosignal = TRUE;
1657 			status = CLNT_CALL(cl, RFS_NULL, xdr_void, NULL,
1658 			    xdr_void, NULL, tv);
1659 			if (!(mi->mi_flags & MI4_INT))
1660 				cl->cl_nosignal = FALSE;
1661 			AUTH_DESTROY(cl->cl_auth);
1662 			CLNT_DESTROY(cl);
1663 			if (status == RPC_SUCCESS) {
1664 				nfs4_queue_event(RE_FAILOVER, mi,
1665 				    svp == mi->mi_curr_serv ? NULL :
1666 				    svp->sv_hostname, 0, NULL, NULL, 0,
1667 				    NULL, 0, TAG_NONE, TAG_NONE, 0, 0);
1668 				break;
1669 			}
1670 		}
1671 
1672 		if (svp == NULL) {
1673 			if (!oncethru) {
1674 				snames = nfs4_getsrvnames(mi, &len);
1675 				nfs4_queue_fact(RF_SRVS_NOT_RESPOND, mi,
1676 				    0, 0, 0, FALSE, snames, 0, NULL);
1677 				oncethru = 1;
1678 			}
1679 			delay(hz);
1680 		}
1681 	}
1682 
1683 	if (oncethru) {
1684 		nfs4_queue_fact(RF_SRVS_OK, mi, 0, 0, 0, FALSE, snames,
1685 		    0, NULL);
1686 		kmem_free(snames, len);
1687 	}
1688 
1689 #if DEBUG
1690 	(void) nfs_rw_enter_sig(&svp->sv_lock, RW_READER, 0);
1691 	ASSERT((svp->sv_flags & SV4_NOTINUSE) == 0);
1692 	nfs_rw_exit(&svp->sv_lock);
1693 #endif
1694 
1695 	mutex_enter(&mi->mi_lock);
1696 	mi->mi_recovflags &= ~MI4R_NEED_NEW_SERVER;
1697 	if (svp != mi->mi_curr_serv) {
1698 		servinfo4_t *osvp = mi->mi_curr_serv;
1699 
1700 		mutex_exit(&mi->mi_lock);
1701 
1702 		/*
1703 		 * Update server-dependent fields in the root vnode.
1704 		 */
1705 		index = rtable4hash(mi->mi_rootfh);
1706 		rw_enter(&rtable4[index].r_lock, RW_WRITER);
1707 
1708 		rp = r4find(&rtable4[index], mi->mi_rootfh, mi->mi_vfsp);
1709 		if (rp != NULL) {
1710 			NFS4_DEBUG(nfs4_client_failover_debug, (CE_NOTE,
1711 			    "recov_newserver: remapping %s", rnode4info(rp)));
1712 			mutex_enter(&rp->r_statelock);
1713 			rp->r_server = svp;
1714 			PURGE_ATTRCACHE4_LOCKED(rp);
1715 			mutex_exit(&rp->r_statelock);
1716 			(void) nfs4_free_data_reclaim(rp);
1717 			nfs4_purge_rddir_cache(RTOV4(rp));
1718 			rw_exit(&rtable4[index].r_lock);
1719 			NFS4_DEBUG(nfs4_client_failover_debug, (CE_NOTE,
1720 			    "recov_newserver: done with %s",
1721 			    rnode4info(rp)));
1722 			VN_RELE(RTOV4(rp));
1723 		} else
1724 			rw_exit(&rtable4[index].r_lock);
1725 		(void) dnlc_purge_vfsp(mi->mi_vfsp, 0);
1726 
1727 		mutex_enter(&mi->mi_lock);
1728 		mi->mi_recovflags |= MI4R_REOPEN_FILES | MI4R_REMAP_FILES;
1729 		if (recovp->rc_srv_reboot)
1730 			mi->mi_recovflags |= MI4R_SRV_REBOOT;
1731 		mi->mi_curr_serv = svp;
1732 		mi->mi_failover++;
1733 		mi->mi_flags &= ~MI4_BADOWNER_DEBUG;
1734 		mutex_exit(&mi->mi_lock);
1735 
1736 		(void) nfs_rw_enter_sig(&svp->sv_lock, RW_READER, 0);
1737 		fh.nfs_fh4_len = svp->sv_fhandle.fh_len;
1738 		fh.nfs_fh4_val = svp->sv_fhandle.fh_buf;
1739 		sfh4_update(mi->mi_rootfh, &fh);
1740 		fh.nfs_fh4_len = svp->sv_pfhandle.fh_len;
1741 		fh.nfs_fh4_val = svp->sv_pfhandle.fh_buf;
1742 		sfh4_update(mi->mi_srvparentfh, &fh);
1743 		nfs_rw_exit(&svp->sv_lock);
1744 
1745 		*spp = nfs4_move_mi(mi, osvp, svp);
1746 		if (osp != NULL)
1747 			nfs4_server_rele(osp);
1748 	} else
1749 		mutex_exit(&mi->mi_lock);
1750 	(void) nfs_rw_exit(&mi->mi_recovlock);
1751 }
1752 
1753 /*
1754  * Clientid.
1755  */
1756 
1757 static void
1758 recov_clientid(recov_info_t *recovp, nfs4_server_t *sp)
1759 {
1760 	mntinfo4_t *mi = recovp->rc_mi;
1761 	int error = 0;
1762 	int still_stale;
1763 	int need_new_s;
1764 
1765 	ASSERT(sp != NULL);
1766 
1767 	/*
1768 	 * Acquire the recovery lock and then verify that the clientid
1769 	 * still needs to be recovered.  (Note that s_recovlock is supposed
1770 	 * to be acquired before s_lock.)  Since the thread holds the
1771 	 * recovery lock, no other thread will recover the clientid.
1772 	 */
1773 	(void) nfs_rw_enter_sig(&sp->s_recovlock, RW_WRITER, 0);
1774 	(void) nfs_rw_enter_sig(&mi->mi_recovlock, RW_WRITER, 0);
1775 	mutex_enter(&sp->s_lock);
1776 	still_stale = ((sp->s_flags & N4S_CLIENTID_SET) == 0);
1777 	mutex_exit(&sp->s_lock);
1778 
1779 	if (still_stale) {
1780 		nfs4_error_t n4e;
1781 
1782 		nfs4_error_zinit(&n4e);
1783 		nfs4setclientid(mi, kcred, TRUE, &n4e);
1784 		error = n4e.error;
1785 		if (error != 0) {
1786 
1787 			/*
1788 			 * nfs4setclientid may have set MI4R_NEED_NEW_SERVER,
1789 			 * if so, just return and let recov_thread drive
1790 			 * failover.
1791 			 */
1792 			mutex_enter(&mi->mi_lock);
1793 			need_new_s = mi->mi_recovflags & MI4R_NEED_NEW_SERVER;
1794 			mutex_exit(&mi->mi_lock);
1795 
1796 			if (need_new_s) {
1797 				nfs_rw_exit(&mi->mi_recovlock);
1798 				nfs_rw_exit(&sp->s_recovlock);
1799 				return;
1800 			}
1801 
1802 			nfs4_queue_event(RE_CLIENTID, mi, NULL, n4e.error, NULL,
1803 			    NULL, n4e.stat, NULL, 0, TAG_NONE, TAG_NONE, 0, 0);
1804 			mutex_enter(&mi->mi_lock);
1805 			mi->mi_flags |= MI4_RECOV_FAIL;
1806 			mi->mi_error = recovp->rc_error;
1807 			mutex_exit(&mi->mi_lock);
1808 			/* don't destroy the nfs4_server, let umount do it */
1809 		}
1810 	}
1811 
1812 	if (error == 0) {
1813 		mutex_enter(&mi->mi_lock);
1814 		mi->mi_recovflags &= ~MI4R_NEED_CLIENTID;
1815 		/*
1816 		 * If still_stale isn't true, then another thread already
1817 		 * recovered the clientid.  And that thread that set the
1818 		 * clientid will have initiated reopening files on all the
1819 		 * filesystems for the server, so we should not initiate
1820 		 * reopening for this filesystem here.
1821 		 */
1822 		if (still_stale) {
1823 			mi->mi_recovflags |= MI4R_REOPEN_FILES;
1824 			if (recovp->rc_srv_reboot)
1825 				mi->mi_recovflags |= MI4R_SRV_REBOOT;
1826 		}
1827 		mutex_exit(&mi->mi_lock);
1828 	}
1829 
1830 	nfs_rw_exit(&mi->mi_recovlock);
1831 
1832 	if (error != 0) {
1833 		nfs_rw_exit(&sp->s_recovlock);
1834 		mutex_enter(&mi->mi_lock);
1835 		if ((mi->mi_flags & MI4_RECOV_FAIL) == 0)
1836 			delay(SEC_TO_TICK(recov_err_delay));
1837 		mutex_exit(&mi->mi_lock);
1838 	} else {
1839 		mntinfo4_t **milist;
1840 		mntinfo4_t *tmi;
1841 		int nummi, i;
1842 
1843 		/*
1844 		 * Initiate recovery of open files for other filesystems.
1845 		 * We create an array of filesystems, rather than just
1846 		 * walking the filesystem list, to avoid deadlock issues
1847 		 * with s_lock and mi_recovlock.
1848 		 */
1849 		milist = make_milist(sp, &nummi);
1850 		for (i = 0; i < nummi; i++) {
1851 			tmi = milist[i];
1852 			if (tmi != mi) {
1853 				(void) nfs_rw_enter_sig(&tmi->mi_recovlock,
1854 							RW_READER, 0);
1855 				start_recovery_action(NR_OPENFILES, TRUE, tmi,
1856 					NULL, NULL);
1857 				nfs_rw_exit(&tmi->mi_recovlock);
1858 			}
1859 		}
1860 		free_milist(milist, nummi);
1861 
1862 		nfs_rw_exit(&sp->s_recovlock);
1863 	}
1864 }
1865 
1866 /*
1867  * Return an array of filesystems associated with the given server.  The
1868  * caller should call free_milist() to free the references and memory.
1869  */
1870 
1871 static mntinfo4_t **
1872 make_milist(nfs4_server_t *sp, int *nummip)
1873 {
1874 	int nummi, i;
1875 	mntinfo4_t **milist;
1876 	mntinfo4_t *tmi;
1877 
1878 	mutex_enter(&sp->s_lock);
1879 	nummi = 0;
1880 	for (tmi = sp->mntinfo4_list; tmi != NULL; tmi = tmi->mi_clientid_next)
1881 		nummi++;
1882 
1883 	milist = kmem_alloc(nummi * sizeof (mntinfo4_t *), KM_NOSLEEP);
1884 
1885 	for (i = 0, tmi = sp->mntinfo4_list; tmi != NULL; i++,
1886 	    tmi = tmi->mi_clientid_next) {
1887 		milist[i] = tmi;
1888 		VFS_HOLD(tmi->mi_vfsp);
1889 	}
1890 	mutex_exit(&sp->s_lock);
1891 
1892 	*nummip = nummi;
1893 	return (milist);
1894 }
1895 
1896 /*
1897  * Free the filesystem list created by make_milist().
1898  */
1899 
1900 static void
1901 free_milist(mntinfo4_t **milist, int nummi)
1902 {
1903 	mntinfo4_t *tmi;
1904 	int i;
1905 
1906 	for (i = 0; i < nummi; i++) {
1907 		tmi = milist[i];
1908 		VFS_RELE(tmi->mi_vfsp);
1909 	}
1910 	kmem_free(milist, nummi * sizeof (mntinfo4_t *));
1911 }
1912 
1913 /*
1914  * Filehandle
1915  */
1916 
1917 /*
1918  * Lookup the filehandle for the given vnode and update the rnode if it has
1919  * changed.
1920  *
1921  * Errors:
1922  * - if the filehandle could not be updated because of an error that
1923  *   requires further recovery, initiate that recovery and return.
1924  * - if the filehandle could not be updated because of a signal, pretend we
1925  *   succeeded and let someone else deal with it.
1926  * - if the filehandle could not be updated and the filesystem has been
1927  *   forcibly unmounted, pretend we succeeded, and let the caller deal with
1928  *   the forced unmount (to retry or not to retry, that is the question).
1929  * - if the filehandle could not be updated because of some other error,
1930  *   mark the rnode bad and return.
1931  */
1932 static void
1933 recov_filehandle(nfs4_recov_t action, mntinfo4_t *mi, vnode_t *vp)
1934 {
1935 	rnode4_t *rp = VTOR4(vp);
1936 	nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };
1937 	bool_t needrecov;
1938 
1939 	mutex_enter(&rp->r_statelock);
1940 
1941 	if (rp->r_flags & R4RECOVERR) {
1942 		mutex_exit(&rp->r_statelock);
1943 		return;
1944 	}
1945 
1946 	/*
1947 	 * If someone else is updating the filehandle, wait for them to
1948 	 * finish and then let our caller retry.
1949 	 */
1950 	if (rp->r_flags & R4RECEXPFH) {
1951 		while (rp->r_flags & R4RECEXPFH) {
1952 			cv_wait(&rp->r_cv, &rp->r_statelock);
1953 		}
1954 		mutex_exit(&rp->r_statelock);
1955 		return;
1956 	}
1957 	rp->r_flags |= R4RECEXPFH;
1958 	mutex_exit(&rp->r_statelock);
1959 
1960 	if (action == NR_BADHANDLE) {
1961 		/* shouldn't happen */
1962 		nfs4_queue_event(RE_BADHANDLE, mi, NULL, 0,
1963 		    vp, NULL, 0, NULL, 0, TAG_NONE, TAG_NONE, 0, 0);
1964 	}
1965 
1966 	nfs4_remap_file(mi, vp, 0, &e);
1967 	needrecov = nfs4_needs_recovery(&e, FALSE, mi->mi_vfsp);
1968 
1969 	/*
1970 	 * If we get BADHANDLE or FHEXPIRED in their handler, something is
1971 	 * broken.  Don't try to recover, just mark the file dead.
1972 	 */
1973 	if (needrecov && e.error == 0 &&
1974 	    (e.stat == NFS4ERR_BADHANDLE || e.stat == NFS4ERR_FHEXPIRED))
1975 		needrecov = FALSE;
1976 	if (needrecov) {
1977 		(void) nfs4_start_recovery(&e, mi, vp,
1978 				NULL, NULL, NULL, OP_LOOKUP, NULL);
1979 	} else if (e.error != EINTR &&
1980 	    !NFS4_FRC_UNMT_ERR(e.error, mi->mi_vfsp) &&
1981 	    (e.error != 0 || e.stat != NFS4_OK)) {
1982 		nfs4_recov_fh_fail(vp, e.error, e.stat);
1983 		/*
1984 		 * Don't set r_error to ESTALE.  Higher-level code (e.g.,
1985 		 * cstatat_getvp()) retries on ESTALE, which would cause
1986 		 * an infinite loop.
1987 		 */
1988 	}
1989 
1990 	mutex_enter(&rp->r_statelock);
1991 	rp->r_flags &= ~R4RECEXPFH;
1992 	cv_broadcast(&rp->r_cv);
1993 	mutex_exit(&rp->r_statelock);
1994 }
1995 
1996 /*
1997  * Stale Filehandle
1998  */
1999 
2000 /*
2001  * A stale filehandle can happen when an individual file has
2002  * been removed, or when an entire filesystem has been taken
2003  * offline.  To distinguish these cases, we do this:
2004  * - if a GETATTR with the current filehandle is okay, we do
2005  *   nothing (this can happen with two-filehandle ops)
2006  * - if the GETATTR fails, but a GETATTR of the root filehandle
2007  *   succeeds, mark the rnode with R4STALE, which will stop use
2008  * - if the GETATTR fails, and a GETATTR of the root filehandle
2009  *   also fails, we consider the problem filesystem-wide, so:
2010  *   - if we can failover, we should
2011  *   - if we can't failover, we should mark both the original
2012  *     vnode and the root bad
2013  */
2014 static void
2015 recov_stale(mntinfo4_t *mi, vnode_t *vp)
2016 {
2017 	rnode4_t *rp = VTOR4(vp);
2018 	vnode_t *rootvp = NULL;
2019 	nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };
2020 	nfs4_ga_res_t gar;
2021 	char *fail_msg = "failed to recover from NFS4ERR_STALE";
2022 	bool_t needrecov;
2023 
2024 	mutex_enter(&rp->r_statelock);
2025 
2026 	if (rp->r_flags & R4RECOVERR) {
2027 		mutex_exit(&rp->r_statelock);
2028 		NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
2029 		    "recov_stale: already marked dead, rp %s",
2030 		    rnode4info(rp)));
2031 		return;
2032 	}
2033 
2034 	if (rp->r_flags & R4STALE) {
2035 		mutex_exit(&rp->r_statelock);
2036 		NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
2037 		    "recov_stale: already marked stale, rp %s",
2038 		    rnode4info(rp)));
2039 		return;
2040 	}
2041 
2042 	mutex_exit(&rp->r_statelock);
2043 
2044 	/* Try a GETATTR on this vnode */
2045 	nfs4_getattr_otw_norecovery(vp, &gar, &e, CRED(), 0);
2046 
2047 	/*
2048 	 * Handle non-STALE recoverable errors
2049 	 */
2050 	needrecov = nfs4_needs_recovery(&e, FALSE, vp->v_vfsp);
2051 	if (needrecov && (e.error != 0 || e.stat != NFS4ERR_STALE)) {
2052 		(void) nfs4_start_recovery(&e, mi, vp,
2053 				NULL, NULL, NULL, OP_GETATTR, NULL);
2054 		NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
2055 		    "recov_stale: error=%d, stat=%d seen on rp %s",
2056 		    e.error, e.stat, rnode4info(rp)));
2057 		goto out;
2058 	}
2059 
2060 	/* Are things OK for this vnode? */
2061 	if (!e.error && e.stat == NFS4_OK) {
2062 		NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
2063 		    "recov_stale: file appears fine, rp %s",
2064 		    rnode4info(rp)));
2065 		goto out;
2066 	}
2067 
2068 	/* Did we get an unrelated non-recoverable error? */
2069 	if (e.error || e.stat != NFS4ERR_STALE) {
2070 		nfs4_fail_recov(vp, fail_msg, e.error, e.stat);
2071 		NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
2072 		    "recov_stale: unrelated fatal error, rp %s",
2073 		    rnode4info(rp)));
2074 		goto out;
2075 	}
2076 
2077 	/*
2078 	 * If we don't appear to be dealing with the root node, find it.
2079 	 */
2080 	if ((vp->v_flag & VROOT) == 0) {
2081 		nfs4_error_zinit(&e);
2082 		e.error = VFS_ROOT(vp->v_vfsp, &rootvp);
2083 		if (e.error) {
2084 			nfs4_fail_recov(vp, fail_msg, 0, NFS4ERR_STALE);
2085 			NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
2086 			    "recov_stale: can't find root node for rp %s",
2087 			    rnode4info(rp)));
2088 			goto out;
2089 		}
2090 	}
2091 
2092 	/* Try a GETATTR on the root vnode */
2093 	if (rootvp != NULL) {
2094 		nfs4_error_zinit(&e);
2095 		nfs4_getattr_otw_norecovery(rootvp, &gar, &e, CRED(), 0);
2096 
2097 		/* Try recovery? */
2098 		if (e.error != 0 || e.stat != NFS4ERR_STALE) {
2099 			needrecov = nfs4_needs_recovery(&e, FALSE, vp->v_vfsp);
2100 			if (needrecov) {
2101 				(void) nfs4_start_recovery(&e,
2102 					mi, rootvp, NULL, NULL, NULL,
2103 					OP_GETATTR, NULL);
2104 				NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
2105 				    "recov_stale: error=%d, stat=%d seen "
2106 				    "on rp %s", e.error, e.stat,
2107 				    rnode4info(rp)));
2108 			}
2109 		}
2110 
2111 		/*
2112 		 * Check to see if a failover attempt is warranted
2113 		 * NB: nfs4_try_failover doesn't check for STALE
2114 		 * because recov_stale gets a shot first.  Now that
2115 		 * recov_stale has failed, go ahead and try failover.
2116 		 *
2117 		 * If the getattr on the root filehandle was successful,
2118 		 * then mark recovery as failed for 'vp' and exit.
2119 		 */
2120 		if (nfs4_try_failover(&e) == 0 && e.stat != NFS4ERR_STALE) {
2121 			/*
2122 			 * pass the original error to fail_recov, not
2123 			 * the one from trying the root vnode.
2124 			 */
2125 			nfs4_fail_recov(vp, fail_msg, 0, NFS4ERR_STALE);
2126 			NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
2127 			    "recov_stale: root node OK, marking "
2128 			    "dead rp %s", rnode4info(rp)));
2129 			goto out;
2130 		}
2131 	}
2132 
2133 	/*
2134 	 * Here, we know that both the original file and the
2135 	 * root filehandle (which may be the same) are stale.
2136 	 * We want to fail over if we can, and if we can't, we
2137 	 * want to mark everything in sight bad.
2138 	 */
2139 	if (FAILOVER_MOUNT4(mi)) {
2140 		mutex_enter(&mi->mi_lock);
2141 		mi->mi_recovflags |= MI4R_NEED_NEW_SERVER;
2142 		NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
2143 		    "recov_stale: failing over due to rp %s",
2144 		    rnode4info(rp)));
2145 		mutex_exit(&mi->mi_lock);
2146 	} else {
2147 		rnode4_t *rootrp;
2148 		servinfo4_t *svp;
2149 
2150 		/*
2151 		 * Can't fail over, so mark things dead.
2152 		 *
2153 		 * If rootvp is set, we know we have a distinct
2154 		 * non-root vnode which can be marked dead in
2155 		 * the usual way.
2156 		 *
2157 		 * Then we want to mark the root vnode dead.
2158 		 * Note that if rootvp wasn't set, our vp is
2159 		 * actually the root vnode.
2160 		 */
2161 		if (rootvp != NULL) {
2162 			NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
2163 			    "recov_stale: can't fail over, marking dead rp %s",
2164 			    rnode4info(rp)));
2165 			nfs4_fail_recov(vp, fail_msg, 0, NFS4ERR_STALE);
2166 		} else {
2167 			rootvp = vp;
2168 			VN_HOLD(rootvp);
2169 		}
2170 
2171 		/*
2172 		 * Mark root dead, but quietly - since
2173 		 * the root rnode is frequently recreated,
2174 		 * we can encounter this at every access.
2175 		 * Also mark recovery as failed on this VFS.
2176 		 */
2177 		rootrp = VTOR4(rootvp);
2178 		NFS4_DEBUG(nfs4_client_recov_debug, (CE_CONT,
2179 		    "recov_stale: marking dead root rp %s",
2180 		    rnode4info(rootrp)));
2181 		mutex_enter(&rootrp->r_statelock);
2182 		rootrp->r_flags |= (R4RECOVERR | R4STALE);
2183 		rootrp->r_error = ESTALE;
2184 		mutex_exit(&rootrp->r_statelock);
2185 		mutex_enter(&mi->mi_lock);
2186 		mi->mi_error = ESTALE;
2187 		mutex_exit(&mi->mi_lock);
2188 
2189 		svp = mi->mi_curr_serv;
2190 		(void) nfs_rw_enter_sig(&svp->sv_lock, RW_WRITER, 0);
2191 		svp->sv_flags |= SV4_ROOT_STALE;
2192 		nfs_rw_exit(&svp->sv_lock);
2193 	}
2194 
2195 out:
2196 	if (rootvp)
2197 		VN_RELE(rootvp);
2198 }
2199 
2200 /*
2201  * Locks.
2202  */
2203 
2204 /*
2205  * Reclaim all the active (acquired) locks for the given file.
2206  * If a process lost a lock, the process is sent a SIGLOST.  This is not
2207  * considered an error.
2208  *
2209  * Return values:
2210  * Errors and status are returned via the nfs4_error_t parameter
2211  * If an error indicates that recovery is needed, the caller is responsible
2212  * for dealing with it.
2213  */
2214 
2215 static void
2216 relock_file(vnode_t *vp, mntinfo4_t *mi, nfs4_error_t *ep,
2217     fattr4_change pre_change)
2218 {
2219 	locklist_t *locks, *llp;
2220 	rnode4_t *rp;
2221 
2222 	ASSERT(ep != NULL);
2223 	nfs4_error_zinit(ep);
2224 
2225 	if (VTOMI4(vp)->mi_flags & MI4_LLOCK)
2226 		return;
2227 
2228 	nfs4_flush_lock_owners(VTOR4(vp));
2229 
2230 	/*
2231 	 * If we get an error that requires recovery actions, just bail out
2232 	 * and let the top-level recovery code handle it.
2233 	 *
2234 	 * If we get some other error, kill the process that owned the lock
2235 	 * and mark its remaining locks (if any) as belonging to NOPID, so
2236 	 * that we don't make any more reclaim requests for that process.
2237 	 */
2238 
2239 	rp = VTOR4(vp);
2240 	locks = flk_active_locks_for_vp(vp);
2241 	for (llp = locks; llp != NULL; llp = llp->ll_next) {
2242 		int did_reclaim = 1;
2243 
2244 		ASSERT(llp->ll_vp == vp);
2245 		if (llp->ll_flock.l_pid == NOPID)
2246 			continue;
2247 		reclaim_one_lock(vp, &llp->ll_flock, ep, &did_reclaim);
2248 		/*
2249 		 * If we need to restart recovery, stop processing the
2250 		 * list.  Some errors would be recoverable under other
2251 		 * circumstances, but if they happen here we just give up
2252 		 * on the lock.
2253 		 */
2254 		if (nfs4_needs_recovery(ep, TRUE, vp->v_vfsp)) {
2255 			if (ep->error != 0)
2256 				break;
2257 			if (!nfs4_recov_marks_dead(ep->stat))
2258 				break;
2259 		}
2260 		/*
2261 		 *   In case the server isn't offering us a grace period, or
2262 		 * if we missed it, we might have opened & locked from scratch,
2263 		 * rather than reopened/reclaimed.
2264 		 *   We need to ensure that the object hadn't been otherwise
2265 		 * changed during this time, by comparing the changeinfo.
2266 		 *   We get passed the changeinfo from before the reopen by our
2267 		 * caller, in pre_change.
2268 		 *   The changeinfo from after the reopen is in rp->r_change,
2269 		 * courtesy of the GETATTR in the reopen.
2270 		 *   If they're different, then the file has changed, and we
2271 		 * have to SIGLOST the app.
2272 		 */
2273 		if (ep->error == 0 && ep->stat == NFS4_OK && !did_reclaim) {
2274 			mutex_enter(&rp->r_statelock);
2275 			if (pre_change != rp->r_change)
2276 				ep->stat = NFS4ERR_NO_GRACE;
2277 			mutex_exit(&rp->r_statelock);
2278 		}
2279 		if (ep->error != 0 || ep->stat != NFS4_OK) {
2280 			if (ep->error != 0)
2281 				nfs4_queue_event(RE_FAIL_RELOCK, mi,
2282 				    NULL, ep->error, vp, NULL, 0, NULL,
2283 				    llp->ll_flock.l_pid, TAG_NONE, TAG_NONE,
2284 				    0, 0);
2285 			else
2286 				nfs4_queue_event(RE_FAIL_RELOCK, mi,
2287 				    NULL, 0, vp, NULL, ep->stat, NULL,
2288 				    llp->ll_flock.l_pid, TAG_NONE, TAG_NONE,
2289 				    0, 0);
2290 			nfs4_send_siglost(llp->ll_flock.l_pid, mi, vp, TRUE,
2291 			    ep->error, ep->stat);
2292 			relock_skip_pid(llp, llp->ll_flock.l_pid);
2293 
2294 			/* Reinitialize the nfs4_error and continue */
2295 			nfs4_error_zinit(ep);
2296 		}
2297 	}
2298 
2299 	if (locks != NULL)
2300 		flk_free_locklist(locks);
2301 }
2302 
2303 /*
2304  * Reclaim the given lock.
2305  * If the lock can't be reclaimed, the process is sent SIGLOST, but this is
2306  * not considered an error.
2307  *
2308  * Errors are returned via the nfs4_error_t parameter.
2309  */
2310 static void
2311 reclaim_one_lock(vnode_t *vp, flock64_t *flk, nfs4_error_t *ep,
2312 	int *did_reclaimp)
2313 {
2314 	cred_t *cr;
2315 	rnode4_t *rp = VTOR4(vp);
2316 
2317 	cr = pid_to_cr(flk->l_pid);
2318 	if (cr == NULL) {
2319 		nfs4_error_zinit(ep);
2320 		ep->error = ESRCH;
2321 		return;
2322 	}
2323 
2324 	do {
2325 		mutex_enter(&rp->r_statelock);
2326 		if (rp->r_flags & R4RECOVERR) {
2327 			/*
2328 			 * This shouldn't affect other reclaims, so don't
2329 			 * return an error.
2330 			 */
2331 			mutex_exit(&rp->r_statelock);
2332 			break;
2333 		}
2334 		mutex_exit(&rp->r_statelock);
2335 
2336 		nfs4frlock(NFS4_LCK_CTYPE_RECLAIM, vp, F_SETLK, flk,
2337 				FREAD|FWRITE, 0, cr, ep, NULL, did_reclaimp);
2338 		if (ep->error == 0 && ep->stat == NFS4ERR_FHEXPIRED)
2339 			start_recovery_action(NR_FHEXPIRED, TRUE, VTOMI4(vp),
2340 					    vp, NULL);
2341 	} while (ep->error == 0 && ep->stat == NFS4ERR_FHEXPIRED);
2342 
2343 	crfree(cr);
2344 }
2345 
2346 /*
2347  * Open files.
2348  */
2349 
2350 /*
2351  * Verifies if the nfsstat4 is a valid error for marking this vnode dead.
2352  * Returns 1 if the error is valid; 0 otherwise.
2353  */
2354 static int
2355 nfs4_valid_recov_err_for_vp(vnode_t *vp, nfsstat4 stat)
2356 {
2357 	/*
2358 	 * We should not be marking non-regular files as dead,
2359 	 * except in very rare cases (eg: BADHANDLE or NFS4ERR_BADNAME).
2360 	 */
2361 	if (vp->v_type != VREG && stat != NFS4ERR_BADHANDLE &&
2362 	    stat != NFS4ERR_BADNAME)
2363 		return (0);
2364 
2365 	return (1);
2366 }
2367 
2368 /*
2369  * Failed attempting to recover a filehandle.  If 'stat' is valid for 'vp',
2370  * then mark the object dead.  Since we've had to do a lookup for
2371  * filehandle recovery, we will mark the object dead if we got NOENT.
2372  */
2373 static void
2374 nfs4_recov_fh_fail(vnode_t *vp, int error, nfsstat4 stat)
2375 {
2376 	ASSERT(vp != NULL);
2377 
2378 	if ((error == 0) && (stat != NFS4ERR_NOENT) &&
2379 	    (!nfs4_valid_recov_err_for_vp(vp, stat)))
2380 		return;
2381 
2382 	nfs4_fail_recov(vp, "can't recover filehandle", error, stat);
2383 }
2384 
2385 /*
2386  * Recovery from a "shouldn't happen" error.  In the long term, we'd like
2387  * to mark only the data structure(s) that provided the bad value as being
2388  * bad.  But for now we'll just mark the entire file.
2389  */
2390 
2391 static void
2392 recov_badstate(recov_info_t *recovp, vnode_t *vp, nfsstat4 stat)
2393 {
2394 	ASSERT(vp != NULL);
2395 	recov_throttle(recovp, vp);
2396 
2397 	if (!nfs4_valid_recov_err_for_vp(vp, stat))
2398 		return;
2399 
2400 	nfs4_fail_recov(vp, "", 0, stat);
2401 }
2402 
2403 /*
2404  * Free up the information saved for a lost state request.
2405  */
2406 static void
2407 nfs4_free_lost_rqst(nfs4_lost_rqst_t *lrp, nfs4_server_t *sp)
2408 {
2409 	component4 *filep;
2410 	nfs4_open_stream_t *osp;
2411 	int have_sync_lock;
2412 
2413 	NFS4_DEBUG(nfs4_lost_rqst_debug,
2414 		(CE_NOTE, "nfs4_free_lost_rqst:"));
2415 
2416 	switch (lrp->lr_op) {
2417 	case OP_OPEN:
2418 		filep = &lrp->lr_ofile;
2419 		if (filep->utf8string_val) {
2420 			kmem_free(filep->utf8string_val, filep->utf8string_len);
2421 			filep->utf8string_val = NULL;
2422 		}
2423 		break;
2424 	case OP_DELEGRETURN:
2425 		nfs4delegreturn_cleanup(VTOR4(lrp->lr_vp), sp);
2426 		break;
2427 	case OP_CLOSE:
2428 		osp = lrp->lr_osp;
2429 		ASSERT(osp != NULL);
2430 		mutex_enter(&osp->os_sync_lock);
2431 		have_sync_lock = 1;
2432 		if (osp->os_pending_close) {
2433 			/* clean up the open file state. */
2434 			osp->os_pending_close = 0;
2435 			nfs4close_notw(lrp->lr_vp, osp, &have_sync_lock);
2436 		}
2437 		if (have_sync_lock)
2438 			mutex_exit(&osp->os_sync_lock);
2439 		break;
2440 	}
2441 
2442 	lrp->lr_op = 0;
2443 	if (lrp->lr_oop != NULL) {
2444 		open_owner_rele(lrp->lr_oop);
2445 		lrp->lr_oop = NULL;
2446 	}
2447 	if (lrp->lr_osp != NULL) {
2448 		open_stream_rele(lrp->lr_osp, VTOR4(lrp->lr_vp));
2449 		lrp->lr_osp = NULL;
2450 	}
2451 	if (lrp->lr_lop != NULL) {
2452 		lock_owner_rele(lrp->lr_lop);
2453 		lrp->lr_lop = NULL;
2454 	}
2455 	if (lrp->lr_flk != NULL) {
2456 		kmem_free(lrp->lr_flk, sizeof (flock64_t));
2457 		lrp->lr_flk = NULL;
2458 	}
2459 	if (lrp->lr_vp != NULL) {
2460 		VN_RELE(lrp->lr_vp);
2461 		lrp->lr_vp = NULL;
2462 	}
2463 	if (lrp->lr_dvp != NULL) {
2464 		VN_RELE(lrp->lr_dvp);
2465 		lrp->lr_dvp = NULL;
2466 	}
2467 	if (lrp->lr_cr != NULL) {
2468 		crfree(lrp->lr_cr);
2469 		lrp->lr_cr = NULL;
2470 	}
2471 
2472 	kmem_free(lrp, sizeof (nfs4_lost_rqst_t));
2473 }
2474 
2475 /*
2476  * Remove any lost state requests and free them.
2477  */
2478 static void
2479 nfs4_remove_lost_rqsts(mntinfo4_t *mi, nfs4_server_t *sp)
2480 {
2481 	nfs4_lost_rqst_t *lrp;
2482 
2483 	mutex_enter(&mi->mi_lock);
2484 	while ((lrp = list_head(&mi->mi_lost_state)) != NULL) {
2485 		list_remove(&mi->mi_lost_state, lrp);
2486 		mutex_exit(&mi->mi_lock);
2487 		nfs4_free_lost_rqst(lrp, sp);
2488 		mutex_enter(&mi->mi_lock);
2489 	}
2490 	mutex_exit(&mi->mi_lock);
2491 }
2492 
2493 /*
2494  * Reopen all the files for the given filesystem and reclaim any locks.
2495  */
2496 
2497 static void
2498 recov_openfiles(recov_info_t *recovp, nfs4_server_t *sp)
2499 {
2500 	mntinfo4_t *mi = recovp->rc_mi;
2501 	nfs4_opinst_t *reopenlist = NULL, *rep;
2502 	nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };
2503 	open_claim_type4 claim;
2504 	int remap;
2505 	char *fail_msg = "No such file or directory on replica";
2506 	rnode4_t *rp;
2507 	fattr4_change pre_change;
2508 
2509 	ASSERT(sp != NULL);
2510 
2511 	/*
2512 	 * This check is to allow a 10ms pause before we reopen files
2513 	 * it should allow the server time to have received the CB_NULL
2514 	 * reply and update its internal structures such that (if
2515 	 * applicable) we are granted a delegation on reopened files.
2516 	 */
2517 	mutex_enter(&sp->s_lock);
2518 	if ((sp->s_flags & (N4S_CB_PINGED | N4S_CB_WAITER)) == 0) {
2519 		sp->s_flags |= N4S_CB_WAITER;
2520 		(void) cv_timedwait(&sp->wait_cb_null, &sp->s_lock,
2521 			(lbolt+drv_usectohz(N4S_CB_PAUSE_TIME)));
2522 	}
2523 	mutex_exit(&sp->s_lock);
2524 
2525 	(void) nfs_rw_enter_sig(&sp->s_recovlock, RW_READER, 0);
2526 	(void) nfs_rw_enter_sig(&mi->mi_recovlock, RW_WRITER, 0);
2527 
2528 	if (NFS4_VOLATILE_FH(mi)) {
2529 		nfs4_remap_root(mi, &e, 0);
2530 		if (nfs4_needs_recovery(&e, FALSE, mi->mi_vfsp)) {
2531 			(void) nfs4_start_recovery(&e, mi, NULL,
2532 					NULL, NULL, NULL, OP_LOOKUP, NULL);
2533 		}
2534 	}
2535 
2536 	mutex_enter(&mi->mi_lock);
2537 	if (recovp->rc_srv_reboot || (mi->mi_recovflags & MI4R_SRV_REBOOT))
2538 		claim = CLAIM_PREVIOUS;
2539 	else
2540 		claim = CLAIM_NULL;
2541 	mutex_exit(&mi->mi_lock);
2542 
2543 	if (e.error == 0 && e.stat == NFS4_OK) {
2544 		/*
2545 		 * Get a snapshot of open files in the filesystem.  Note
2546 		 * that new opens will stall until the server's grace
2547 		 * period is done.
2548 		 */
2549 		reopenlist = r4mkopenlist(mi);
2550 
2551 		mutex_enter(&mi->mi_lock);
2552 		remap = mi->mi_recovflags & MI4R_REMAP_FILES;
2553 		mutex_exit(&mi->mi_lock);
2554 		/*
2555 		 * Since we are re-establishing state on the
2556 		 * server, its ok to blow away the saved lost
2557 		 * requests since we don't need to reissue it.
2558 		 */
2559 		nfs4_remove_lost_rqsts(mi, sp);
2560 
2561 		for (rep = reopenlist; rep; rep = rep->re_next) {
2562 
2563 			if (remap) {
2564 				nfs4_remap_file(mi, rep->re_vp,
2565 					NFS4_REMAP_CKATTRS, &e);
2566 			}
2567 			if (e.error == ENOENT || e.stat == NFS4ERR_NOENT) {
2568 				/*
2569 				 * The current server does not have the file
2570 				 * that is to be remapped.  This is most
2571 				 * likely due to an improperly maintained
2572 				 * replica.   The files that are missing from
2573 				 * the server will be marked dead and logged
2574 				 * in order to make sys admins aware of the
2575 				 * problem.
2576 				 */
2577 				nfs4_fail_recov(rep->re_vp,
2578 					fail_msg, e.error, e.stat);
2579 				/*
2580 				 * We've already handled the error so clear it.
2581 				 */
2582 				nfs4_error_zinit(&e);
2583 				continue;
2584 			} else if (e.error == 0 && e.stat == NFS4_OK) {
2585 				int j;
2586 
2587 				rp = VTOR4(rep->re_vp);
2588 				mutex_enter(&rp->r_statelock);
2589 				pre_change = rp->r_change;
2590 				mutex_exit(&rp->r_statelock);
2591 
2592 				for (j = 0; j < rep->re_numosp; j++) {
2593 					nfs4_reopen(rep->re_vp, rep->re_osp[j],
2594 						&e, claim, FALSE, TRUE);
2595 					if (e.error != 0 || e.stat != NFS4_OK)
2596 						break;
2597 				}
2598 				if (nfs4_needs_recovery(&e, TRUE,
2599 				    mi->mi_vfsp)) {
2600 					(void) nfs4_start_recovery(&e, mi,
2601 						rep->re_vp, NULL, NULL, NULL,
2602 						OP_OPEN, NULL);
2603 					break;
2604 				}
2605 			}
2606 #ifdef DEBUG
2607 			if (nfs4_recovdelay > 0)
2608 				delay(MSEC_TO_TICK(nfs4_recovdelay * 1000));
2609 #endif
2610 			if (e.error == 0 && e.stat == NFS4_OK)
2611 				relock_file(rep->re_vp, mi, &e, pre_change);
2612 
2613 			if (nfs4_needs_recovery(&e, TRUE, mi->mi_vfsp))
2614 				(void) nfs4_start_recovery(&e, mi,
2615 					rep->re_vp, NULL, NULL, NULL, OP_LOCK,
2616 					NULL);
2617 			if (e.error != 0 || e.stat != NFS4_OK)
2618 				break;
2619 		}
2620 
2621 		/*
2622 		 * Check to see if we need to remap files passed in
2623 		 * via the recovery arguments; this will have been
2624 		 * done for open files.  A failure here is not fatal.
2625 		 */
2626 		if (remap) {
2627 			nfs4_error_t ignore;
2628 			nfs4_check_remap(mi, recovp->rc_vp1, NFS4_REMAP_CKATTRS,
2629 				&ignore);
2630 			nfs4_check_remap(mi, recovp->rc_vp2, NFS4_REMAP_CKATTRS,
2631 				&ignore);
2632 		}
2633 	}
2634 
2635 	if (e.error == 0 && e.stat == NFS4_OK) {
2636 		mutex_enter(&mi->mi_lock);
2637 		mi->mi_recovflags &= ~(MI4R_REOPEN_FILES | MI4R_REMAP_FILES);
2638 		mutex_exit(&mi->mi_lock);
2639 	}
2640 
2641 	nfs_rw_exit(&mi->mi_recovlock);
2642 	nfs_rw_exit(&sp->s_recovlock);
2643 
2644 	if (reopenlist != NULL)
2645 		r4releopenlist(reopenlist);
2646 }
2647 
2648 /*
2649  * Resend the queued state recovery requests in "rqsts".
2650  */
2651 
2652 static void
2653 nfs4_resend_lost_rqsts(recov_info_t *recovp, nfs4_server_t *sp)
2654 {
2655 	nfs4_lost_rqst_t	*lrp, *tlrp;
2656 	mntinfo4_t		*mi = recovp->rc_mi;
2657 	nfs4_error_t		n4e;
2658 #ifdef NOTYET
2659 	uint32_t		deny_bits = 0;
2660 #endif
2661 
2662 	NFS4_DEBUG(nfs4_lost_rqst_debug, (CE_NOTE, "nfs4_resend_lost_rqsts"));
2663 
2664 	ASSERT(mi != NULL);
2665 	ASSERT(nfs_rw_lock_held(&mi->mi_recovlock, RW_WRITER));
2666 
2667 	mutex_enter(&mi->mi_lock);
2668 	lrp = list_head(&mi->mi_lost_state);
2669 	mutex_exit(&mi->mi_lock);
2670 	while (lrp != NULL) {
2671 		nfs4_error_zinit(&n4e);
2672 		resend_one_op(lrp, &n4e, mi, sp);
2673 		NFS4_DEBUG(nfs4_lost_rqst_debug, (CE_NOTE,
2674 		    "nfs4_resend_lost_rqsts: resend request: for vp %p got "
2675 		    "error %d stat %d", (void *)lrp->lr_vp, n4e.error,
2676 		    n4e.stat));
2677 
2678 		/*
2679 		 * If we get a recovery error that we can actually
2680 		 * recover from (such as ETIMEDOUT, FHEXPIRED), we
2681 		 * return and let the recovery thread redrive the call.
2682 		 * Don't requeue unless the zone is still healthy.
2683 		 */
2684 		if (zone_status_get(curproc->p_zone) < ZONE_IS_SHUTTING_DOWN &&
2685 		    nfs4_needs_recovery(&n4e, TRUE, mi->mi_vfsp) &&
2686 		    (nfs4_try_failover(&n4e) ||
2687 		    NFS4_FRC_UNMT_ERR(n4e.error, mi->mi_vfsp) ||
2688 		    (n4e.error == 0 && n4e.stat != NFS4ERR_BADHANDLE &&
2689 		    !nfs4_recov_marks_dead(n4e.stat)))) {
2690 			/*
2691 			 * For these three errors, we want to delay a bit
2692 			 * instead of pounding the server into submission.
2693 			 * We have to do this manually; the normal
2694 			 * processing for these errors only works for
2695 			 * non-recovery requests.
2696 			 */
2697 			if ((n4e.error == 0 && n4e.stat == NFS4ERR_DELAY) ||
2698 			    (n4e.error == 0 && n4e.stat == NFS4ERR_GRACE) ||
2699 			    (n4e.error == 0 && n4e.stat == NFS4ERR_RESOURCE) ||
2700 			    NFS4_FRC_UNMT_ERR(n4e.error, mi->mi_vfsp)) {
2701 				delay(SEC_TO_TICK(nfs4err_delay_time));
2702 			} else {
2703 				(void) nfs4_start_recovery(&n4e,
2704 					mi, lrp->lr_dvp, lrp->lr_vp, NULL, NULL,
2705 					lrp->lr_op, NULL);
2706 			}
2707 			return;
2708 		}
2709 
2710 		mutex_enter(&mi->mi_lock);
2711 		list_remove(&mi->mi_lost_state, lrp);
2712 		tlrp = lrp;
2713 		lrp = list_head(&mi->mi_lost_state);
2714 		mutex_exit(&mi->mi_lock);
2715 		nfs4_free_lost_rqst(tlrp, sp);
2716 	}
2717 }
2718 
2719 /*
2720  * Resend the given op, and issue any necessary undo call.
2721  * errors are returned via the nfs4_error_t parameter.
2722  */
2723 
2724 static void
2725 resend_one_op(nfs4_lost_rqst_t *lrp, nfs4_error_t *ep,
2726 	mntinfo4_t *mi, nfs4_server_t *sp)
2727 {
2728 	vnode_t *vp;
2729 	nfs4_open_stream_t *osp;
2730 	cred_t *cr;
2731 	uint32_t acc_bits;
2732 
2733 	vp = lrp->lr_vp;
2734 	NFS4_DEBUG(nfs4_lost_rqst_debug, (CE_NOTE, "resend_one_op: "
2735 	    "have a lost open/close request for vp %p", (void *)vp));
2736 
2737 	switch (lrp->lr_op) {
2738 	case OP_OPEN:
2739 		nfs4_resend_open_otw(&vp, lrp, ep);
2740 		break;
2741 	case OP_OPEN_DOWNGRADE:
2742 		ASSERT(lrp->lr_oop != NULL);
2743 		ep->error = nfs4_start_open_seqid_sync(lrp->lr_oop, mi);
2744 		ASSERT(!ep->error);	/* recov thread always succeeds */
2745 		ASSERT(lrp->lr_osp != NULL);
2746 		mutex_enter(&lrp->lr_osp->os_sync_lock);
2747 		nfs4_open_downgrade(lrp->lr_dg_acc, lrp->lr_dg_deny,
2748 			    lrp->lr_oop, lrp->lr_osp, vp, lrp->lr_cr, lrp,
2749 			    ep, NULL, NULL);
2750 		mutex_exit(&lrp->lr_osp->os_sync_lock);
2751 		nfs4_end_open_seqid_sync(lrp->lr_oop);
2752 		break;
2753 	case OP_CLOSE:
2754 		osp = lrp->lr_osp;
2755 		cr = lrp->lr_cr;
2756 		acc_bits = 0;
2757 		mutex_enter(&osp->os_sync_lock);
2758 		if (osp->os_share_acc_read)
2759 			acc_bits |= OPEN4_SHARE_ACCESS_READ;
2760 		if (osp->os_share_acc_write)
2761 			acc_bits |= OPEN4_SHARE_ACCESS_WRITE;
2762 		mutex_exit(&osp->os_sync_lock);
2763 		nfs4close_one(vp, osp, cr, acc_bits, lrp, ep,
2764 				CLOSE_RESEND, 0, 0, 0);
2765 		break;
2766 	case OP_LOCK:
2767 	case OP_LOCKU:
2768 		resend_lock(lrp, ep);
2769 		goto done;
2770 	case OP_DELEGRETURN:
2771 		nfs4_resend_delegreturn(lrp, ep, sp);
2772 		goto done;
2773 	default:
2774 #ifdef DEBUG
2775 		cmn_err(CE_PANIC, "resend_one_op: unexpected op: %d",
2776 			lrp->lr_op);
2777 #endif
2778 		nfs4_queue_event(RE_LOST_STATE_BAD_OP, mi, NULL,
2779 		    lrp->lr_op, lrp->lr_vp, lrp->lr_dvp, NFS4_OK, NULL, 0,
2780 		    TAG_NONE, TAG_NONE, 0, 0);
2781 		nfs4_error_init(ep, EINVAL);
2782 		return;
2783 	}
2784 
2785 	/*
2786 	 * No need to retry nor send an "undo" CLOSE in the
2787 	 * event the server rebooted.
2788 	 */
2789 	if (ep->error == 0 && (ep->stat == NFS4ERR_STALE_CLIENTID ||
2790 	    ep->stat == NFS4ERR_STALE_STATEID || ep->stat == NFS4ERR_EXPIRED))
2791 		goto done;
2792 
2793 	/*
2794 	 * If we resent a CLOSE or OPEN_DOWNGRADE, there's nothing
2795 	 * to undo.  Undoing locking operations was handled by
2796 	 * resend_lock().
2797 	 */
2798 	if (lrp->lr_op == OP_OPEN_DOWNGRADE || lrp->lr_op == OP_CLOSE)
2799 		goto done;
2800 
2801 	/*
2802 	 * If we get any other error for OPEN, then don't attempt
2803 	 * to undo the resend of the open (since it was never
2804 	 * successful!).
2805 	 */
2806 	ASSERT(lrp->lr_op == OP_OPEN);
2807 	if (ep->error || ep->stat != NFS4_OK)
2808 		goto done;
2809 
2810 	/*
2811 	 * Now let's undo our OPEN.
2812 	 */
2813 	nfs4_error_zinit(ep);
2814 	close_after_open_resend(vp, lrp->lr_cr, lrp->lr_oacc, ep);
2815 	NFS4_DEBUG(nfs4_lost_rqst_debug, (CE_NOTE, "resend_one_op: "
2816 	    "nfs4close_one: for vp %p got error %d stat %d",
2817 	    (void *)vp, ep->error, ep->stat));
2818 
2819 done:
2820 	if (vp != lrp->lr_vp)
2821 		VN_RELE(vp);
2822 }
2823 
2824 /*
2825  * Close a file that was opened via a resent OPEN.
2826  * Most errors are passed back to the caller (via the return value and
2827  * *statp), except for FHEXPIRED, which is retried.
2828  *
2829  * It might be conceptually cleaner to push the CLOSE request onto the
2830  * front of the resend queue, rather than sending it here.  That would
2831  * match the way we undo lost lock requests.  On the other
2832  * hand, we've already got something that works, and there's no reason to
2833  * change it at this time.
2834  */
2835 
2836 static void
2837 close_after_open_resend(vnode_t *vp, cred_t *cr, uint32_t acc_bits,
2838 			nfs4_error_t *ep)
2839 {
2840 
2841 	for (;;) {
2842 		nfs4close_one(vp, NULL, cr, acc_bits, NULL, ep,
2843 				CLOSE_AFTER_RESEND, 0, 0, 0);
2844 		if (ep->error == 0 && ep->stat == NFS4_OK)
2845 			break;		/* success; done */
2846 		if (ep->error != 0 || ep->stat != NFS4ERR_FHEXPIRED)
2847 			break;
2848 		/* else retry FHEXPIRED */
2849 	}
2850 
2851 }
2852 
2853 /*
2854  * Resend the given lost lock request.  Return an errno value.  If zero,
2855  * *statp is set to the NFS status code for the call.
2856  *
2857  * Issue a SIGLOST and mark the rnode dead if we get a non-recovery error or
2858  * a recovery error that we don't actually recover from yet (eg: BAD_SEQID).
2859  * Let the recovery thread redrive the call if we get a recovery error that
2860  * we can actually recover from.
2861  */
2862 static void
2863 resend_lock(nfs4_lost_rqst_t *lrp, nfs4_error_t *ep)
2864 {
2865 	bool_t		send_siglost = FALSE;
2866 	vnode_t		*vp = lrp->lr_vp;
2867 
2868 	NFS4_DEBUG(nfs4_lost_rqst_debug, (CE_NOTE, "resend_lock:"));
2869 	ASSERT(lrp->lr_ctype == NFS4_LCK_CTYPE_REINSTATE ||
2870 	    lrp->lr_ctype == NFS4_LCK_CTYPE_RESEND);
2871 
2872 	nfs4frlock(lrp->lr_ctype, vp, F_SETLK,
2873 		    lrp->lr_flk, FREAD|FWRITE, 0, lrp->lr_cr, ep, lrp, NULL);
2874 
2875 	NFS4_DEBUG(nfs4_lost_rqst_debug, (CE_NOTE, "resend_lock: "
2876 	    "nfs4frlock for vp %p returned error %d, stat %d",
2877 	    (void *)vp, ep->error, ep->stat));
2878 
2879 	if (ep->error == 0 && ep->stat == 0)
2880 		goto done;
2881 	if (ep->error == 0 && ep->stat == NFS4ERR_DENIED &&
2882 	    lrp->lr_ctype == NFS4_LCK_CTYPE_RESEND)
2883 		goto done;
2884 
2885 	/*
2886 	 * If we failed with a non-recovery error, send SIGLOST and
2887 	 * mark the file dead.
2888 	 */
2889 	if (!nfs4_needs_recovery(ep, TRUE, vp->v_vfsp))
2890 		send_siglost = TRUE;
2891 	else {
2892 		/*
2893 		 * Done with recovering LOST LOCK in the event the
2894 		 * server rebooted or we've lost the lease.
2895 		 */
2896 		if (ep->error == 0 && (ep->stat == NFS4ERR_STALE_CLIENTID ||
2897 		    ep->stat == NFS4ERR_STALE_STATEID ||
2898 		    ep->stat == NFS4ERR_EXPIRED)) {
2899 			goto done;
2900 		}
2901 
2902 		/*
2903 		 * BAD_STATEID on an unlock indicates that the server has
2904 		 * forgotten about the lock anyway, so act like the call
2905 		 * was successful.
2906 		 */
2907 		if (ep->error == 0 && ep->stat == NFS4ERR_BAD_STATEID &&
2908 		    lrp->lr_op == OP_LOCKU)
2909 			goto done;
2910 
2911 		/*
2912 		 * If we got a recovery error that we don't actually
2913 		 * recover from, send SIGLOST.  If the filesystem was
2914 		 * forcibly unmounted, we skip the SIGLOST because (a) it's
2915 		 * unnecessary noise, and (b) there could be a new process
2916 		 * with the same pid as the one that had generated the lost
2917 		 * state request.
2918 		 */
2919 		if (ep->error == 0 && (ep->stat == NFS4ERR_BADHANDLE ||
2920 		    nfs4_recov_marks_dead(ep->stat))) {
2921 			if (!(vp->v_vfsp->vfs_flag & VFS_UNMOUNTED))
2922 				send_siglost = TRUE;
2923 			goto done;
2924 		}
2925 
2926 		/*
2927 		 * If the filesystem was forcibly unmounted, we
2928 		 * still need to synchronize with the server and
2929 		 * release state.  Try again later.
2930 		 */
2931 		if (NFS4_FRC_UNMT_ERR(ep->error, vp->v_vfsp))
2932 			goto done;
2933 
2934 		/*
2935 		 * If we get a recovery error that we can actually
2936 		 * recover from (such as ETIMEDOUT, FHEXPIRED),
2937 		 * return and let the recovery thread redrive the call.
2938 		 *
2939 		 * For the three errors below, we want to delay a bit
2940 		 * instead of pounding the server into submission.
2941 		 */
2942 		if ((ep->error == 0 && ep->stat == NFS4ERR_DELAY) ||
2943 		    (ep->error == 0 && ep->stat == NFS4ERR_GRACE) ||
2944 		    (ep->error == 0 && ep->stat == NFS4ERR_RESOURCE))
2945 			delay(SEC_TO_TICK(recov_err_delay));
2946 		goto done;
2947 	}
2948 
2949 done:
2950 	if (send_siglost) {
2951 		cred_t *sv_cred;
2952 
2953 		/*
2954 		 * Must be root or the actual thread being issued the
2955 		 * SIGLOST for this to work, so just become root.
2956 		 */
2957 		sv_cred = curthread->t_cred;
2958 		curthread->t_cred = kcred;
2959 		nfs4_send_siglost(lrp->lr_flk->l_pid, VTOMI4(vp), vp, FALSE,
2960 		    ep->error, ep->stat);
2961 		curthread->t_cred = sv_cred;
2962 
2963 		/*
2964 		 * Flush any additional reinstantiation requests for
2965 		 * this operation.  Sending multiple SIGLOSTs to the user
2966 		 * process is unlikely to help and may cause trouble.
2967 		 */
2968 		if (lrp->lr_ctype == NFS4_LCK_CTYPE_REINSTATE)
2969 			flush_reinstate(lrp);
2970 	}
2971 }
2972 
2973 /*
2974  * Remove any lock reinstantiation requests that correspond to the given
2975  * lost request.  We only remove items that follow lrp in the queue,
2976  * assuming that lrp will be removed by the generic lost state code.
2977  */
2978 
2979 static void
2980 flush_reinstate(nfs4_lost_rqst_t *lrp)
2981 {
2982 	vnode_t *vp;
2983 	pid_t pid;
2984 	mntinfo4_t *mi;
2985 	nfs4_lost_rqst_t *nlrp;
2986 
2987 	vp = lrp->lr_vp;
2988 	mi = VTOMI4(vp);
2989 	pid = lrp->lr_flk->l_pid;
2990 
2991 	/*
2992 	 * If there are any more reinstantation requests to get rid of,
2993 	 * they should all be clustered at the front of the lost state
2994 	 * queue.
2995 	 */
2996 	mutex_enter(&mi->mi_lock);
2997 	for (lrp = list_next(&mi->mi_lost_state, lrp); lrp != NULL;
2998 	    lrp = nlrp) {
2999 		nlrp = list_next(&mi->mi_lost_state, lrp);
3000 		if (lrp->lr_op != OP_LOCK && lrp->lr_op != OP_LOCKU)
3001 			break;
3002 		if (lrp->lr_ctype != NFS4_LCK_CTYPE_REINSTATE)
3003 			break;
3004 		ASSERT(lrp->lr_vp == vp);
3005 		ASSERT(lrp->lr_flk->l_pid == pid);
3006 		NFS4_DEBUG(nfs4_lost_rqst_debug, (CE_NOTE,
3007 				"remove reinstantiation %p", (void *)lrp));
3008 		list_remove(&mi->mi_lost_state, lrp);
3009 		nfs4_free_lost_rqst(lrp, NULL);
3010 	}
3011 	mutex_exit(&mi->mi_lock);
3012 }
3013 
3014 /*
3015  * End of state-specific recovery routines.
3016  */
3017 
3018 /*
3019  * Allocate a lost request struct, initialize it from lost_rqstp (including
3020  * bumping the reference counts for the referenced vnode, etc.), and hang
3021  * it off of recovp.
3022  */
3023 
3024 static void
3025 nfs4_save_lost_rqst(nfs4_lost_rqst_t *lost_rqstp, recov_info_t *recovp,
3026 	nfs4_recov_t *action, mntinfo4_t *mi)
3027 {
3028 	nfs4_lost_rqst_t *destp;
3029 
3030 	ASSERT(recovp->rc_lost_rqst == NULL);
3031 
3032 	destp = kmem_alloc(sizeof (nfs4_lost_rqst_t), KM_SLEEP);
3033 	recovp->rc_lost_rqst = destp;
3034 
3035 	if (lost_rqstp->lr_op == OP_LOCK ||
3036 	    lost_rqstp->lr_op == OP_LOCKU) {
3037 		ASSERT(lost_rqstp->lr_lop);
3038 		*action = NR_LOST_LOCK;
3039 		destp->lr_ctype = lost_rqstp->lr_ctype;
3040 		destp->lr_locktype = lost_rqstp->lr_locktype;
3041 	} else if (lost_rqstp->lr_op == OP_OPEN) {
3042 		component4 *srcfp, *destfp;
3043 
3044 		destp->lr_oacc = lost_rqstp->lr_oacc;
3045 		destp->lr_odeny = lost_rqstp->lr_odeny;
3046 		destp->lr_oclaim = lost_rqstp->lr_oclaim;
3047 		if (lost_rqstp->lr_oclaim == CLAIM_DELEGATE_CUR)
3048 			destp->lr_ostateid = lost_rqstp->lr_ostateid;
3049 
3050 		srcfp = &lost_rqstp->lr_ofile;
3051 		destfp = &destp->lr_ofile;
3052 		/*
3053 		 * Consume caller's utf8string
3054 		 */
3055 		destfp->utf8string_len = srcfp->utf8string_len;
3056 		destfp->utf8string_val = srcfp->utf8string_val;
3057 		srcfp->utf8string_len = 0;
3058 		srcfp->utf8string_val = NULL;	/* make sure not reused */
3059 
3060 		*action = NR_LOST_STATE_RQST;
3061 	} else if (lost_rqstp->lr_op == OP_OPEN_DOWNGRADE) {
3062 		destp->lr_dg_acc = lost_rqstp->lr_dg_acc;
3063 		destp->lr_dg_deny = lost_rqstp->lr_dg_deny;
3064 
3065 		*action = NR_LOST_STATE_RQST;
3066 	} else if (lost_rqstp->lr_op == OP_CLOSE) {
3067 		ASSERT(lost_rqstp->lr_oop);
3068 		*action = NR_LOST_STATE_RQST;
3069 	} else if (lost_rqstp->lr_op == OP_DELEGRETURN) {
3070 		*action = NR_LOST_STATE_RQST;
3071 	} else {
3072 #ifdef DEBUG
3073 		cmn_err(CE_PANIC, "nfs4_save_lost_rqst: bad op %d",
3074 			lost_rqstp->lr_op);
3075 #endif
3076 		nfs4_queue_event(RE_LOST_STATE_BAD_OP, mi, NULL,
3077 		    lost_rqstp->lr_op, lost_rqstp->lr_vp, lost_rqstp->lr_dvp,
3078 		    NFS4_OK, NULL, curproc->p_pid, TAG_NONE, TAG_NONE, 0, 0);
3079 		*action = NR_UNUSED;
3080 		recovp->rc_lost_rqst = NULL;
3081 		kmem_free(destp, sizeof (nfs4_lost_rqst_t));
3082 		return;
3083 	}
3084 
3085 	destp->lr_op = lost_rqstp->lr_op;
3086 	destp->lr_vp = lost_rqstp->lr_vp;
3087 	if (destp->lr_vp)
3088 		VN_HOLD(destp->lr_vp);
3089 	destp->lr_dvp = lost_rqstp->lr_dvp;
3090 	if (destp->lr_dvp)
3091 		VN_HOLD(destp->lr_dvp);
3092 	destp->lr_oop = lost_rqstp->lr_oop;
3093 	if (destp->lr_oop)
3094 		open_owner_hold(destp->lr_oop);
3095 	destp->lr_osp = lost_rqstp->lr_osp;
3096 	if (destp->lr_osp)
3097 		open_stream_hold(destp->lr_osp);
3098 	destp->lr_lop = lost_rqstp->lr_lop;
3099 	if (destp->lr_lop)
3100 		lock_owner_hold(destp->lr_lop);
3101 	destp->lr_cr = lost_rqstp->lr_cr;
3102 	if (destp->lr_cr)
3103 		crhold(destp->lr_cr);
3104 	if (lost_rqstp->lr_flk == NULL)
3105 		destp->lr_flk = NULL;
3106 	else {
3107 		destp->lr_flk = kmem_alloc(sizeof (flock64_t), KM_SLEEP);
3108 		*destp->lr_flk = *lost_rqstp->lr_flk;
3109 	}
3110 	destp->lr_putfirst = lost_rqstp->lr_putfirst;
3111 }
3112 
3113 /*
3114  * Map the given return values (errno and nfs4 status code) to a recovery
3115  * action and fill in the following fields of recovp: rc_action,
3116  * rc_srv_reboot, rc_stateid, rc_lost_rqst.
3117  */
3118 
3119 void
3120 errs_to_action(recov_info_t *recovp,
3121 	nfs4_server_t *sp, mntinfo4_t *mi, stateid4 *sidp,
3122 	nfs4_lost_rqst_t *lost_rqstp, int unmounted, nfs_opnum4 op,
3123 	nfs4_bseqid_entry_t *bsep)
3124 {
3125 	nfs4_recov_t action = NR_UNUSED;
3126 	bool_t reboot = FALSE;
3127 	int try_f;
3128 	int error = recovp->rc_orig_errors.error;
3129 	nfsstat4 stat = recovp->rc_orig_errors.stat;
3130 
3131 	bzero(&recovp->rc_stateid, sizeof (stateid4));
3132 	recovp->rc_lost_rqst = NULL;
3133 	recovp->rc_bseqid_rqst = NULL;
3134 
3135 	try_f = nfs4_try_failover(&recovp->rc_orig_errors) &&
3136 			FAILOVER_MOUNT4(mi);
3137 
3138 	/*
3139 	 * We start recovery for EINTR only in the lost lock
3140 	 * or lost open/close case.
3141 	 */
3142 
3143 	if (try_f || error == EINTR || (error == EIO && unmounted)) {
3144 		recovp->rc_error = (error != 0 ? error : geterrno4(stat));
3145 		if (lost_rqstp) {
3146 			ASSERT(lost_rqstp->lr_op != 0);
3147 			nfs4_save_lost_rqst(lost_rqstp, recovp, &action, mi);
3148 		}
3149 		if (try_f)
3150 			action = NR_FAILOVER;
3151 	} else if (error != 0) {
3152 		recovp->rc_error = error;
3153 		nfs4_queue_event(RE_UNEXPECTED_ERRNO, mi, NULL, error, NULL,
3154 		    NULL, 0, NULL, 0, TAG_NONE, TAG_NONE, 0, 0);
3155 		action = NR_CLIENTID;
3156 	} else {
3157 		recovp->rc_error = geterrno4(stat);
3158 		switch (stat) {
3159 #ifdef notyet
3160 		case NFS4ERR_LEASE_MOVED:
3161 			action = xxx;
3162 			break;
3163 		case NFS4ERR_MOVED:
3164 			action = xxx;
3165 			break;
3166 #endif
3167 		case NFS4ERR_BADHANDLE:
3168 			action = NR_BADHANDLE;
3169 			break;
3170 		case NFS4ERR_BAD_SEQID:
3171 			if (bsep)
3172 				save_bseqid_rqst(bsep, recovp);
3173 			action = NR_BAD_SEQID;
3174 			break;
3175 		case NFS4ERR_OLD_STATEID:
3176 			action = NR_OLDSTATEID;
3177 			break;
3178 		case NFS4ERR_WRONGSEC:
3179 			action = NR_WRONGSEC;
3180 			break;
3181 		case NFS4ERR_FHEXPIRED:
3182 			action = NR_FHEXPIRED;
3183 			break;
3184 		case NFS4ERR_BAD_STATEID:
3185 			if (sp == NULL || (sp != NULL && inlease(sp))) {
3186 
3187 				action = NR_BAD_STATEID;
3188 				if (sidp)
3189 					recovp->rc_stateid = *sidp;
3190 			} else
3191 				action = NR_CLIENTID;
3192 			break;
3193 		case NFS4ERR_EXPIRED:
3194 			/*
3195 			 * The client's lease has expired, either due
3196 			 * to a network partition or perhaps a client
3197 			 * error.  In either case, try an NR_CLIENTID
3198 			 * style recovery.  reboot remains false, since
3199 			 * there is no evidence the server has rebooted.
3200 			 * This will cause CLAIM_NULL opens and lock
3201 			 * requests without the reclaim bit.
3202 			 */
3203 			action = NR_CLIENTID;
3204 
3205 			DTRACE_PROBE4(nfs4__expired,
3206 					nfs4_server_t *, sp,
3207 					mntinfo4_t *, mi,
3208 					stateid4 *, sidp, int, op);
3209 
3210 			break;
3211 		case NFS4ERR_STALE_CLIENTID:
3212 		case NFS4ERR_STALE_STATEID:
3213 			action = NR_CLIENTID;
3214 			reboot = TRUE;
3215 			break;
3216 		case NFS4ERR_RESOURCE:
3217 			/*
3218 			 * If this had been a FAILOVER mount, then
3219 			 * we'd have tried failover.  Since it's not,
3220 			 * just delay a while and retry.
3221 			 */
3222 			action = NR_DELAY;
3223 			break;
3224 		case NFS4ERR_GRACE:
3225 			action = NR_GRACE;
3226 			break;
3227 		case NFS4ERR_DELAY:
3228 			action = NR_DELAY;
3229 			break;
3230 		case NFS4ERR_STALE:
3231 			action = NR_STALE;
3232 			break;
3233 		default:
3234 			nfs4_queue_event(RE_UNEXPECTED_STATUS, mi, NULL, 0,
3235 			    NULL, NULL, stat, NULL, 0, TAG_NONE, TAG_NONE,
3236 			    0, 0);
3237 			action = NR_CLIENTID;
3238 			break;
3239 		}
3240 	}
3241 
3242 	/* make sure action got set */
3243 	ASSERT(action != NR_UNUSED);
3244 	recovp->rc_srv_reboot = reboot;
3245 	recovp->rc_action = action;
3246 	nfs4_queue_fact(RF_ERR, mi, stat, action, op, reboot, NULL, error,
3247 		NULL);
3248 }
3249 
3250 /*
3251  * Return the (held) credential for the process with the given pid.
3252  * May return NULL (e.g., process not found).
3253  */
3254 
3255 static cred_t *
3256 pid_to_cr(pid_t pid)
3257 {
3258 	proc_t *p;
3259 	cred_t *cr;
3260 
3261 	mutex_enter(&pidlock);
3262 	if ((p = prfind(pid)) == NULL) {
3263 		mutex_exit(&pidlock);
3264 		return (NULL);
3265 	}
3266 
3267 	mutex_enter(&p->p_crlock);
3268 	crhold(cr = p->p_cred);
3269 	mutex_exit(&p->p_crlock);
3270 	mutex_exit(&pidlock);
3271 
3272 	return (cr);
3273 }
3274 
3275 /*
3276  * Send SIGLOST to the given process and queue the event.
3277  *
3278  * The 'dump' boolean tells us whether this action should dump the
3279  * in-kernel queue of recovery messages or not.
3280  */
3281 
3282 void
3283 nfs4_send_siglost(pid_t pid, mntinfo4_t *mi, vnode_t *vp, bool_t dump,
3284     int error, nfsstat4 stat)
3285 {
3286 	proc_t *p;
3287 
3288 	mutex_enter(&pidlock);
3289 	p = prfind(pid);
3290 	if (p)
3291 		psignal(p, SIGLOST);
3292 	mutex_exit(&pidlock);
3293 	nfs4_queue_event(dump ? RE_SIGLOST : RE_SIGLOST_NO_DUMP, mi,
3294 	    NULL, error, vp, NULL, stat, NULL, pid, TAG_NONE, TAG_NONE, 0, 0);
3295 }
3296 
3297 /*
3298  * Scan the lock list for entries that match the given pid.  Change the
3299  * pid in those that do to NOPID.
3300  */
3301 
3302 static void
3303 relock_skip_pid(locklist_t *llp, pid_t pid)
3304 {
3305 	for (; llp != NULL; llp = llp->ll_next) {
3306 		if (llp->ll_flock.l_pid == pid)
3307 			llp->ll_flock.l_pid = NOPID;
3308 	}
3309 }
3310 
3311 /*
3312  * Mark a file as having failed recovery, after making a last-ditch effort
3313  * to return any delegation.
3314  *
3315  * Sets r_error to EIO or ESTALE for the given vnode.
3316  */
3317 void
3318 nfs4_fail_recov(vnode_t *vp, char *why, int error, nfsstat4 stat)
3319 {
3320 	rnode4_t *rp = VTOR4(vp);
3321 
3322 #ifdef DEBUG
3323 	if (nfs4_fail_recov_stop)
3324 		debug_enter("nfs4_fail_recov");
3325 #endif
3326 
3327 	mutex_enter(&rp->r_statelock);
3328 	if (rp->r_flags & (R4RECOVERR|R4RECOVERRP)) {
3329 		mutex_exit(&rp->r_statelock);
3330 		return;
3331 	}
3332 
3333 	/*
3334 	 * Set R4RECOVERRP to indicate that a recovery error is in
3335 	 * progress.  This will shut down reads and writes at the top
3336 	 * half.  Don't set R4RECOVERR until after we've returned the
3337 	 * delegation, otherwise it will fail.
3338 	 */
3339 
3340 	rp->r_flags |= R4RECOVERRP;
3341 	mutex_exit(&rp->r_statelock);
3342 
3343 	nfs4delegabandon(rp);
3344 
3345 	mutex_enter(&rp->r_statelock);
3346 	rp->r_flags |= (R4RECOVERR | R4STALE);
3347 	rp->r_error = (error == 0 && stat == NFS4ERR_STALE) ? ESTALE : EIO;
3348 	PURGE_ATTRCACHE4_LOCKED(rp);
3349 	if (!(vp->v_vfsp->vfs_flag & VFS_UNMOUNTED))
3350 		nfs4_queue_event(RE_DEAD_FILE, VTOMI4(vp), NULL, error,
3351 		    vp, NULL, stat, why, 0, TAG_NONE, TAG_NONE, 0, 0);
3352 	mutex_exit(&rp->r_statelock);
3353 
3354 	dnlc_purge_vp(vp);
3355 }
3356 
3357 /*
3358  * recov_throttle: if the file had the same recovery action within the
3359  * throttle interval, wait for the throttle interval to finish before
3360  * proceeding.
3361  *
3362  * Side effects: updates the rnode with the current recovery information.
3363  */
3364 
3365 static void
3366 recov_throttle(recov_info_t *recovp, vnode_t *vp)
3367 {
3368 	time_t curtime, time_to_wait;
3369 	rnode4_t *rp = VTOR4(vp);
3370 
3371 	curtime = gethrestime_sec();
3372 
3373 	mutex_enter(&rp->r_statelock);
3374 	NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
3375 		"recov_throttle: now: (%d, %ld), last: (%d, %ld)",
3376 		recovp->rc_action, curtime,
3377 		rp->r_recov_act, rp->r_last_recov));
3378 	if (recovp->rc_action == rp->r_recov_act &&
3379 	    rp->r_last_recov + recov_err_delay > curtime) {
3380 		time_to_wait = rp->r_last_recov + recov_err_delay - curtime;
3381 		mutex_exit(&rp->r_statelock);
3382 		delay(SEC_TO_TICK(time_to_wait));
3383 		curtime = gethrestime_sec();
3384 		mutex_enter(&rp->r_statelock);
3385 	}
3386 
3387 	rp->r_last_recov = curtime;
3388 	rp->r_recov_act = recovp->rc_action;
3389 	mutex_exit(&rp->r_statelock);
3390 }
3391 
3392 /*
3393  * React to NFS4ERR_GRACE by setting the time we'll permit
3394  * the next call to this filesystem.
3395  */
3396 void
3397 nfs4_set_grace_wait(mntinfo4_t *mi)
3398 {
3399 	mutex_enter(&mi->mi_lock);
3400 	/* Mark the time for the future */
3401 	mi->mi_grace_wait = gethrestime_sec() + nfs4err_delay_time;
3402 	mutex_exit(&mi->mi_lock);
3403 }
3404 
3405 /*
3406  * React to MFS4ERR_DELAY by setting the time we'll permit
3407  * the next call to this vnode.
3408  */
3409 void
3410 nfs4_set_delay_wait(vnode_t *vp)
3411 {
3412 	rnode4_t *rp = VTOR4(vp);
3413 
3414 	mutex_enter(&rp->r_statelock);
3415 	/*
3416 	 * Calculate amount we should delay, initial
3417 	 * delay will be short and then we will back off.
3418 	 */
3419 	if (rp->r_delay_interval == 0)
3420 		rp->r_delay_interval = NFS4_INITIAL_DELAY_INTERVAL;
3421 	else
3422 		/* calculate next interval value */
3423 		rp->r_delay_interval =
3424 		    MIN(NFS4_MAX_DELAY_INTERVAL, (rp->r_delay_interval << 1));
3425 	rp->r_delay_wait = gethrestime_sec() + rp->r_delay_interval;
3426 	mutex_exit(&rp->r_statelock);
3427 }
3428 
3429 /*
3430  * The caller is responsible for freeing the returned string.
3431  */
3432 static char *
3433 nfs4_getsrvnames(mntinfo4_t *mi, size_t *len)
3434 {
3435 	servinfo4_t *svp;
3436 	char *srvnames;
3437 	char *namep;
3438 	size_t length;
3439 
3440 	/*
3441 	 * Calculate the length of the string required to hold all
3442 	 * of the server names plus either a comma or a null
3443 	 * character following each individual one.
3444 	 */
3445 	length = 0;
3446 	for (svp = mi->mi_servers; svp != NULL; svp = svp->sv_next) {
3447 		(void) nfs_rw_enter_sig(&svp->sv_lock, RW_READER, 0);
3448 		if (svp->sv_flags & SV4_NOTINUSE) {
3449 			nfs_rw_exit(&svp->sv_lock);
3450 			continue;
3451 		}
3452 		nfs_rw_exit(&svp->sv_lock);
3453 		length += svp->sv_hostnamelen;
3454 	}
3455 
3456 	srvnames = kmem_alloc(length, KM_SLEEP);
3457 
3458 	namep = srvnames;
3459 	for (svp = mi->mi_servers; svp != NULL; svp = svp->sv_next) {
3460 		(void) nfs_rw_enter_sig(&svp->sv_lock, RW_READER, 0);
3461 		if (svp->sv_flags & SV4_NOTINUSE) {
3462 			nfs_rw_exit(&svp->sv_lock);
3463 			continue;
3464 		}
3465 		nfs_rw_exit(&svp->sv_lock);
3466 		(void) strcpy(namep, svp->sv_hostname);
3467 		namep += svp->sv_hostnamelen - 1;
3468 		*namep++ = ',';
3469 	}
3470 	*--namep = '\0';
3471 
3472 	*len = length;
3473 
3474 	return (srvnames);
3475 }
3476 
3477 static void
3478 save_bseqid_rqst(nfs4_bseqid_entry_t *bsep, recov_info_t *recovp)
3479 {
3480 	nfs4_bseqid_entry_t *destp;
3481 
3482 	destp = kmem_alloc(sizeof (nfs4_bseqid_entry_t), KM_SLEEP);
3483 	recovp->rc_bseqid_rqst = destp;
3484 
3485 	if (bsep->bs_oop)
3486 		open_owner_hold(bsep->bs_oop);
3487 	destp->bs_oop = bsep->bs_oop;
3488 	if (bsep->bs_lop)
3489 		lock_owner_hold(bsep->bs_lop);
3490 	destp->bs_lop = bsep->bs_lop;
3491 	if (bsep->bs_vp)
3492 		VN_HOLD(bsep->bs_vp);
3493 	destp->bs_vp = bsep->bs_vp;
3494 	destp->bs_pid = bsep->bs_pid;
3495 	destp->bs_tag = bsep->bs_tag;
3496 	destp->bs_seqid = bsep->bs_seqid;
3497 }
3498 
3499 static void
3500 free_bseqid_rqst(nfs4_bseqid_entry_t *bsep)
3501 {
3502 	if (bsep->bs_oop)
3503 		open_owner_rele(bsep->bs_oop);
3504 	if (bsep->bs_lop)
3505 		lock_owner_rele(bsep->bs_lop);
3506 	if (bsep->bs_vp)
3507 		VN_RELE(bsep->bs_vp);
3508 	kmem_free(bsep, sizeof (nfs4_bseqid_entry_t));
3509 }
3510 
3511 /*
3512  * We don't actually fully recover from NFS4ERR_BAD_SEQID.  We
3513  * simply mark the open owner and open stream (if provided) as "bad".
3514  * Then future uses of these data structures will be limited to basically
3515  * just cleaning up the internal client state (no going OTW).
3516  *
3517  * The result of this is to return errors back to the app/usr when
3518  * we receive NFS4ERR_BAD_SEQID, but also allow future/new calls to
3519  * succeed so progress can be made.
3520  */
3521 void
3522 recov_bad_seqid(recov_info_t *recovp)
3523 {
3524 	mntinfo4_t		*mi = recovp->rc_mi;
3525 	nfs4_open_owner_t	*bad_oop;
3526 	nfs4_lock_owner_t	*bad_lop;
3527 	vnode_t			*vp;
3528 	rnode4_t		*rp = NULL;
3529 	pid_t			pid;
3530 	nfs4_bseqid_entry_t	*bsep, *tbsep;
3531 	int			error;
3532 
3533 	ASSERT(mi != NULL);
3534 	ASSERT(nfs_rw_lock_held(&mi->mi_recovlock, RW_WRITER));
3535 
3536 	mutex_enter(&mi->mi_lock);
3537 	bsep = list_head(&mi->mi_bseqid_list);
3538 	mutex_exit(&mi->mi_lock);
3539 
3540 	/*
3541 	 * Handle all the bad seqid entries on mi's list.
3542 	 */
3543 	while (bsep != NULL) {
3544 		bad_oop = bsep->bs_oop;
3545 		bad_lop = bsep->bs_lop;
3546 		vp = bsep->bs_vp;
3547 		pid = bsep->bs_pid;
3548 
3549 		NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
3550 		    "recov_bad_seqid: mark oop %p lop %p as bad for "
3551 		    "vp %p tag %s pid %d: last good seqid %d for tag %s",
3552 		    (void *)bad_oop, (void *)bad_lop, (void *)vp,
3553 		    nfs4_ctags[bsep->bs_tag].ct_str, pid,
3554 		    bad_oop ?  bad_oop->oo_last_good_seqid : 0,
3555 		    bad_oop ? nfs4_ctags[bad_oop->oo_last_good_op].ct_str :
3556 		    nfs4_ctags[TAG_NONE].ct_str));
3557 
3558 		nfs4_queue_event(RE_BAD_SEQID, mi, NULL,
3559 		    0, vp, NULL, NFS4ERR_BAD_SEQID, NULL, pid, bsep->bs_tag,
3560 		    bad_oop ? bad_oop->oo_last_good_op : TAG_NONE,
3561 		    bsep->bs_seqid, bad_oop ? bad_oop->oo_last_good_seqid : 0);
3562 
3563 		if (bad_oop) {
3564 			/* essentially reset the open owner */
3565 			error = nfs4_start_open_seqid_sync(bad_oop, mi);
3566 			ASSERT(!error);	/* recov thread always succeeds */
3567 			bad_oop->oo_name = nfs4_get_new_oo_name();
3568 			bad_oop->oo_seqid = 0;
3569 			nfs4_end_open_seqid_sync(bad_oop);
3570 		}
3571 
3572 		if (bad_lop) {
3573 			mutex_enter(&bad_lop->lo_lock);
3574 			bad_lop->lo_flags |= NFS4_BAD_SEQID_LOCK;
3575 			mutex_exit(&bad_lop->lo_lock);
3576 
3577 			ASSERT(vp != NULL);
3578 			rp = VTOR4(vp);
3579 			mutex_enter(&rp->r_statelock);
3580 			rp->r_flags |= R4LODANGLERS;
3581 			mutex_exit(&rp->r_statelock);
3582 
3583 			nfs4_send_siglost(pid, mi, vp, TRUE,
3584 			    0, NFS4ERR_BAD_SEQID);
3585 		}
3586 
3587 		mutex_enter(&mi->mi_lock);
3588 		list_remove(&mi->mi_bseqid_list, bsep);
3589 		tbsep = bsep;
3590 		bsep = list_head(&mi->mi_bseqid_list);
3591 		mutex_exit(&mi->mi_lock);
3592 		free_bseqid_rqst(tbsep);
3593 	}
3594 
3595 	mutex_enter(&mi->mi_lock);
3596 	mi->mi_recovflags &= ~MI4R_BAD_SEQID;
3597 	mutex_exit(&mi->mi_lock);
3598 }
3599