xref: /titanic_50/usr/src/uts/common/fs/nfs/nfs4_recovery.c (revision da5577f07f6199b51ea374581248790c288e827b)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 /*
27  * NFS Version 4 state recovery code.
28  */
29 
30 #include <nfs/nfs4_clnt.h>
31 #include <nfs/nfs4.h>
32 #include <nfs/rnode4.h>
33 #include <sys/cmn_err.h>
34 #include <sys/cred.h>
35 #include <sys/systm.h>
36 #include <sys/flock.h>
37 #include <sys/dnlc.h>
38 #include <sys/ddi.h>
39 #include <sys/disp.h>
40 #include <sys/list.h>
41 #include <sys/sdt.h>
42 #include <sys/mount.h>
43 #include <sys/door.h>
44 #include <nfs/nfssys.h>
45 #include <nfs/nfsid_map.h>
46 #include <nfs/nfs4_idmap_impl.h>
47 
48 extern r4hashq_t *rtable4;
49 
50 /*
51  * Information that describes what needs to be done for recovery.  It is
52  * passed to a client recovery thread as well as passed to various recovery
53  * routines.  rc_mi, rc_vp1, and rc_vp2 refer to the filesystem and
54  * vnode(s) affected by recovery.  rc_vp1 and rc_vp2 are references (use
55  * VN_HOLD) or NULL.  rc_lost_rqst contains information about the lost
56  * lock or open/close request, and it holds reference counts for the
57  * various objects (vnode, etc.).  The recovery thread also uses flags set
58  * in the mntinfo4_t or vnode_t to tell it what to do.  rc_error is used
59  * to save the error that originally triggered the recovery event -- will
60  * later be used to set mi_error if recovery doesn't work.  rc_bseqid_rqst
61  * contains information about the request that got NFS4ERR_BAD_SEQID, and
62  * it holds reference count for the various objects (vnode, open owner,
63  * open stream, lock owner).
64  */
65 
typedef struct {
	mntinfo4_t *rc_mi;		/* filesystem affected by recovery */
	vnode_t *rc_vp1;		/* held (VN_HOLD) vnode, or NULL */
	vnode_t *rc_vp2;		/* held (VN_HOLD) vnode, or NULL */
	nfs4_recov_t rc_action;		/* recovery action to perform */
	/* NOTE(review): presumably the stateid of the failed op -- confirm */
	stateid4 rc_stateid;
	bool_t rc_srv_reboot;		/* server has rebooted */
	nfs4_lost_rqst_t *rc_lost_rqst;	/* lost lock or open/close request */
	nfs4_error_t rc_orig_errors;	/* original errors causing recovery */
	int rc_error;			/* error that triggered recovery */
	nfs4_bseqid_entry_t *rc_bseqid_rqst; /* rqst that got BAD_SEQID */
	vnode_t *rc_moved_vp;		/* moved_vp from nfs4_start_recovery */
	char *rc_moved_nm;		/* moved_nm from nfs4_start_recovery */
} recov_info_t;
80 
/*
 * How long to wait before trying again if there is an error doing
 * recovery, in seconds.
 */

static int recov_err_delay = 1;

/*
 * How long to wait when processing NFS4ERR_GRACE or NFS4ERR_DELAY
 * errors.  Expressed in seconds.  Default is defined as
 * NFS4ERR_DELAY_TIME and this variable is initialized in nfs4_subr_init().
 */
time_t nfs4err_delay_time = 0;

/*
 * Tuneable to limit how many times "exempt" ops go OTW
 * after a recovery error.  Exempt op hints are OH_CLOSE,
 * OH_LOCKU, OH_DELEGRETURN.  These previously always went
 * OTW even after rnode was "dead" due to recovery errors.
 *
 * The tuneable below limits the number of times a start_fop
 * invocation will retry the exempt hints.  After the limit
 * is reached, nfs4_start_fop will return an error just like
 * it would for non-exempt op hints.
 *
 * nfs4_check_recov_err() lets an exempt op through for as long as
 * rs_num_retry_despite_err is less than or equal to this value.
 */
int nfs4_max_recov_error_retry = 3;

/*
 * Number of seconds the recovery thread should pause before retry when the
 * filesystem has been forcibly unmounted.
 */

int nfs4_unmount_delay = 1;

#ifdef DEBUG

/*
 * How long to wait (in seconds) between recovery operations on a given
 * file.  Normally zero, but could be set longer for testing purposes.
 */
static int nfs4_recovdelay = 0;

/*
 * Switch that controls whether to go into the debugger when recovery
 * fails.
 */
static int nfs4_fail_recov_stop = 0;

/*
 * Tuneables to debug client namespace interaction with server
 * mount points:
 *
 *	nfs4_srvmnt_fail_cnt:
 *		number of times EACCES returned because client
 *		attempted to cross server mountpoint
 *
 *	nfs4_srvmnt_debug:
 *		trigger console printf whenever client attempts
 *		to cross server mountpoint
 */
int nfs4_srvmnt_fail_cnt = 0;
int nfs4_srvmnt_debug = 0;
#endif
144 
145 extern zone_key_t	nfs4clnt_zone_key;
146 
147 /* forward references, in alphabetic order */
148 static void close_after_open_resend(vnode_t *, cred_t *, uint32_t,
149 	nfs4_error_t *);
150 static void errs_to_action(recov_info_t *,
151 	nfs4_server_t *, mntinfo4_t *, stateid4 *, nfs4_lost_rqst_t *, int,
152 	nfs_opnum4, nfs4_bseqid_entry_t *);
153 static void flush_reinstate(nfs4_lost_rqst_t *);
154 static void free_milist(mntinfo4_t **, int);
155 static mntinfo4_t **make_milist(nfs4_server_t *, int *);
156 static int nfs4_check_recov_err(vnode_t *, nfs4_op_hint_t,
157 	nfs4_recov_state_t *, int, char *);
158 static char *nfs4_getsrvnames(mntinfo4_t *, size_t *);
159 static void nfs4_recov_fh_fail(vnode_t *, int, nfsstat4);
160 static void nfs4_recov_thread(recov_info_t *);
161 static void nfs4_remove_lost_rqsts(mntinfo4_t *, nfs4_server_t *);
162 static void nfs4_resend_lost_rqsts(recov_info_t *, nfs4_server_t *);
163 static cred_t *pid_to_cr(pid_t);
164 static void reclaim_one_lock(vnode_t *, flock64_t *, nfs4_error_t *, int *);
165 static void recov_bad_seqid(recov_info_t *);
166 static void recov_badstate(recov_info_t *, vnode_t *, nfsstat4);
167 static void recov_clientid(recov_info_t *, nfs4_server_t *);
168 static void recov_done(mntinfo4_t *, recov_info_t *);
169 static void recov_filehandle(nfs4_recov_t, mntinfo4_t *, vnode_t *);
170 static void recov_newserver(recov_info_t *, nfs4_server_t **, bool_t *);
171 static void recov_openfiles(recov_info_t *, nfs4_server_t *);
172 static void recov_stale(mntinfo4_t *, vnode_t *);
173 static void nfs4_free_lost_rqst(nfs4_lost_rqst_t *, nfs4_server_t *);
174 static void recov_throttle(recov_info_t *, vnode_t *);
175 static void relock_skip_pid(vnode_t *, locklist_t *, pid_t);
176 static void resend_lock(nfs4_lost_rqst_t *, nfs4_error_t *);
177 static void resend_one_op(nfs4_lost_rqst_t *, nfs4_error_t *, mntinfo4_t *,
178 	nfs4_server_t *);
179 static void save_bseqid_rqst(nfs4_bseqid_entry_t *, recov_info_t *);
180 static void start_recovery(recov_info_t *, mntinfo4_t *, vnode_t *, vnode_t *,
181 	nfs4_server_t *, vnode_t *, char *);
182 static void start_recovery_action(nfs4_recov_t, bool_t, mntinfo4_t *, vnode_t *,
183 	vnode_t *);
184 static int wait_for_recovery(mntinfo4_t *, nfs4_op_hint_t);
185 
186 /*
187  * Return non-zero if the given errno, status, and rpc status codes
188  * in the nfs4_error_t indicate that client recovery is needed.
189  * "stateful" indicates whether the call that got the error establishes or
190  * removes state on the server (open, close, lock, unlock, delegreturn).
191  */
192 
193 int
194 nfs4_needs_recovery(nfs4_error_t *ep, bool_t stateful, vfs_t *vfsp)
195 {
196 	int recov = 0;
197 	mntinfo4_t *mi;
198 
199 	/*
200 	 * Try failover if the error values justify it and if
201 	 * it's a failover mount.  Don't try if the mount is in
202 	 * progress, failures are handled explicitly by nfs4rootvp.
203 	 */
204 	if (nfs4_try_failover(ep)) {
205 		mi = VFTOMI4(vfsp);
206 		mutex_enter(&mi->mi_lock);
207 		recov = FAILOVER_MOUNT4(mi) && !(mi->mi_flags & MI4_MOUNTING);
208 		mutex_exit(&mi->mi_lock);
209 		if (recov)
210 			return (recov);
211 	}
212 
213 	if (ep->error == EINTR || NFS4_FRC_UNMT_ERR(ep->error, vfsp)) {
214 		/*
215 		 * The server may have gotten the request, so for stateful
216 		 * ops we need to resynchronize and possibly back out the
217 		 * op.
218 		 */
219 		return (stateful);
220 	}
221 	if (ep->error != 0)
222 		return (0);
223 
224 	/* stat values are listed alphabetically */
225 	/*
226 	 * There are two lists here: the errors for which we have code, and
227 	 * the errors for which we plan to have code before FCS.  For the
228 	 * second list, print a warning message but don't attempt recovery.
229 	 */
230 	switch (ep->stat) {
231 	case NFS4ERR_BADHANDLE:
232 	case NFS4ERR_BAD_SEQID:
233 	case NFS4ERR_BAD_STATEID:
234 	case NFS4ERR_DELAY:
235 	case NFS4ERR_EXPIRED:
236 	case NFS4ERR_FHEXPIRED:
237 	case NFS4ERR_GRACE:
238 	case NFS4ERR_OLD_STATEID:
239 	case NFS4ERR_RESOURCE:
240 	case NFS4ERR_STALE_CLIENTID:
241 	case NFS4ERR_STALE_STATEID:
242 	case NFS4ERR_WRONGSEC:
243 	case NFS4ERR_STALE:
244 		recov = 1;
245 		break;
246 #ifdef DEBUG
247 	case NFS4ERR_LEASE_MOVED:
248 	case NFS4ERR_MOVED:
249 		zcmn_err(VFTOMI4(vfsp)->mi_zone->zone_id,
250 		    CE_WARN, "!Can't yet recover from NFS status %d",
251 		    ep->stat);
252 		break;
253 #endif
254 	}
255 
256 	return (recov);
257 }
258 
259 /*
260  * Some operations such as DELEGRETURN want to avoid invoking
261  * recovery actions that will only mark the file dead.  If
262  * better handlers are invoked for any of these errors, this
263  * routine should be modified.
264  */
265 int
266 nfs4_recov_marks_dead(nfsstat4 status)
267 {
268 	if (status == NFS4ERR_BAD_SEQID ||
269 	    status == NFS4ERR_EXPIRED ||
270 	    status == NFS4ERR_BAD_STATEID ||
271 	    status == NFS4ERR_OLD_STATEID)
272 		return (1);
273 	return (0);
274 }
275 
276 /*
277  * Transfer the state recovery information in recovp to mi's resend queue,
278  * and mark mi as having a lost state request.
279  */
280 static void
281 nfs4_enqueue_lost_rqst(recov_info_t *recovp, mntinfo4_t *mi)
282 {
283 	nfs4_lost_rqst_t *lrp = recovp->rc_lost_rqst;
284 
285 	ASSERT(nfs_rw_lock_held(&mi->mi_recovlock, RW_READER) ||
286 	    nfs_rw_lock_held(&mi->mi_recovlock, RW_WRITER));
287 
288 	ASSERT(lrp != NULL && lrp->lr_op != 0);
289 
290 	NFS4_DEBUG(nfs4_lost_rqst_debug, (CE_NOTE,
291 	    "nfs4_enqueue_lost_rqst %p, op %d",
292 	    (void *)lrp, lrp->lr_op));
293 
294 	mutex_enter(&mi->mi_lock);
295 	mi->mi_recovflags |= MI4R_LOST_STATE;
296 	if (lrp->lr_putfirst)
297 		list_insert_head(&mi->mi_lost_state, lrp);
298 	else
299 		list_insert_tail(&mi->mi_lost_state, lrp);
300 	recovp->rc_lost_rqst = NULL;
301 	mutex_exit(&mi->mi_lock);
302 
303 	nfs4_queue_event(RE_LOST_STATE, mi, NULL, lrp->lr_op, lrp->lr_vp,
304 	    lrp->lr_dvp, 0, NULL, 0, TAG_NONE, TAG_NONE, 0, 0);
305 }
306 
307 /*
308  * Transfer the bad seqid recovery information in recovp to mi's
309  * bad seqid queue, and mark mi as having a bad seqid request.
310  */
311 void
312 enqueue_bseqid_rqst(recov_info_t *recovp, mntinfo4_t *mi)
313 {
314 	ASSERT(nfs_rw_lock_held(&mi->mi_recovlock, RW_READER) ||
315 	    nfs_rw_lock_held(&mi->mi_recovlock, RW_WRITER));
316 	ASSERT(recovp->rc_bseqid_rqst != NULL);
317 
318 	mutex_enter(&mi->mi_lock);
319 	mi->mi_recovflags |= MI4R_BAD_SEQID;
320 	list_insert_tail(&mi->mi_bseqid_list, recovp->rc_bseqid_rqst);
321 	recovp->rc_bseqid_rqst = NULL;
322 	mutex_exit(&mi->mi_lock);
323 }
324 
325 /*
326  * Initiate recovery.
327  *
328  * The nfs4_error_t contains the return codes that triggered a recovery
329  * attempt.  mi, vp1, and vp2 refer to the filesystem and files that were
330  * being operated on.  vp1 and vp2 may be NULL.
331  *
332  * Multiple calls are okay.  If recovery is already underway, the call
333  * updates the information about what state needs recovery but does not
334  * start a new thread.  The caller should hold mi->mi_recovlock as a reader
335  * for proper synchronization with any recovery thread.
336  *
337  * This will return TRUE if recovery was aborted, and FALSE otherwise.
338  */
339 bool_t
340 nfs4_start_recovery(nfs4_error_t *ep, mntinfo4_t *mi, vnode_t *vp1,
341     vnode_t *vp2, stateid4 *sid, nfs4_lost_rqst_t *lost_rqstp, nfs_opnum4 op,
342     nfs4_bseqid_entry_t *bsep, vnode_t *moved_vp, char *moved_nm)
343 {
344 	recov_info_t *recovp;
345 	nfs4_server_t *sp;
346 	bool_t abort = FALSE;
347 	bool_t gone = FALSE;
348 
349 	ASSERT(nfs_zone() == mi->mi_zone);
350 	mutex_enter(&mi->mi_lock);
351 	/*
352 	 * If there is lost state, we need to kick off recovery even if the
353 	 * filesystem has been unmounted or the zone is shutting down.
354 	 */
355 	gone = FS_OR_ZONE_GONE4(mi->mi_vfsp);
356 	if (gone) {
357 		ASSERT(ep->error != EINTR || lost_rqstp != NULL);
358 		if (ep->error == EIO && lost_rqstp == NULL) {
359 			/* failed due to forced unmount, no new lost state */
360 			abort = TRUE;
361 		}
362 		if ((ep->error == 0 || ep->error == ETIMEDOUT) &&
363 		    !(mi->mi_recovflags & MI4R_LOST_STATE)) {
364 			/* some other failure, no existing lost state */
365 			abort = TRUE;
366 		}
367 		if (abort) {
368 			mutex_exit(&mi->mi_lock);
369 			NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
370 			    "nfs4_start_recovery: fs unmounted"));
371 			return (TRUE);
372 		}
373 	}
374 	mi->mi_in_recovery++;
375 	mutex_exit(&mi->mi_lock);
376 
377 	recovp = kmem_alloc(sizeof (recov_info_t), KM_SLEEP);
378 	recovp->rc_orig_errors = *ep;
379 	sp = find_nfs4_server(mi);
380 	errs_to_action(recovp, sp, mi, sid, lost_rqstp, gone, op, bsep);
381 	if (sp != NULL)
382 		mutex_exit(&sp->s_lock);
383 	start_recovery(recovp, mi, vp1, vp2, sp, moved_vp, moved_nm);
384 	if (sp != NULL)
385 		nfs4_server_rele(sp);
386 	return (FALSE);
387 }
388 
389 /*
390  * Internal version of nfs4_start_recovery.  The difference is that the
391  * caller specifies the recovery action, rather than the errors leading to
392  * recovery.
393  */
394 static void
395 start_recovery_action(nfs4_recov_t what, bool_t reboot, mntinfo4_t *mi,
396     vnode_t *vp1, vnode_t *vp2)
397 {
398 	recov_info_t *recovp;
399 
400 	ASSERT(nfs_zone() == mi->mi_zone);
401 	mutex_enter(&mi->mi_lock);
402 	mi->mi_in_recovery++;
403 	mutex_exit(&mi->mi_lock);
404 
405 	recovp = kmem_zalloc(sizeof (recov_info_t), KM_SLEEP);
406 	recovp->rc_action = what;
407 	recovp->rc_srv_reboot = reboot;
408 	recovp->rc_error = EIO;
409 	start_recovery(recovp, mi, vp1, vp2, NULL, NULL, NULL);
410 }
411 
/*
 * Common back end of nfs4_start_recovery() and start_recovery_action().
 * Both callers have already bumped mi->mi_in_recovery and allocated recovp.
 *
 * The routine dispatches on recovp->rc_action:
 *
 *  - Actions that need the recovery thread (failover, clientid, reopen,
 *    wrongsec, queued bad seqid, lost state) record what is needed in
 *    mi_recovflags or on mi's queues and then "break".  After throttling,
 *    a recovery thread is created unless one is already active
 *    (MI4_RECOV_ACTIV).  The new thread is handed recovp.
 *
 *  - Actions handled right here (expired, bad/old stateid, filehandle,
 *    stale without failover, grace, delay, unknown) jump to out_no_thread.
 *
 * out_no_thread drops the mi_in_recovery count taken by the caller
 * (waking waiters when it reaches zero), releases the vfs and mi holds
 * taken at the top, and frees recovp.  On the thread-creating path none of
 * that is undone here.
 * NOTE(review): presumably nfs4_recov_thread() releases them -- confirm.
 *
 * sp is only consulted for NR_CLIENTID and may be NULL.
 */
static void
start_recovery(recov_info_t *recovp, mntinfo4_t *mi,
    vnode_t *vp1, vnode_t *vp2, nfs4_server_t *sp,
    vnode_t *moved_vp, char *moved_nm)
{
	NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
	    "start_recovery: mi %p, what %s", (void*)mi,
	    nfs4_recov_action_to_str(recovp->rc_action)));

	/*
	 * Bump the reference on the vfs so that we can pass it to the
	 * recovery thread.
	 */
	VFS_HOLD(mi->mi_vfsp);
	MI4_HOLD(mi);
again:
	switch (recovp->rc_action) {
	case NR_FAILOVER:
		ASSERT(nfs_rw_lock_held(&mi->mi_recovlock, RW_READER) ||
		    nfs_rw_lock_held(&mi->mi_recovlock, RW_WRITER));
		/* Only one server in the list: nothing to fail over to. */
		if (mi->mi_servers->sv_next == NULL)
			goto out_no_thread;
		mutex_enter(&mi->mi_lock);
		mi->mi_recovflags |= MI4R_NEED_NEW_SERVER;
		mutex_exit(&mi->mi_lock);

		if (recovp->rc_lost_rqst != NULL)
			nfs4_enqueue_lost_rqst(recovp, mi);
		break;

	case NR_CLIENTID:
		/*
		 * If the filesystem has been unmounted, punt.
		 */
		if (sp == NULL)
			goto out_no_thread;

		/*
		 * If nobody else is working on the clientid, mark the
		 * clientid as being no longer set.  Then mark the specific
		 * filesystem being worked on.
		 */
		if (!nfs4_server_in_recovery(sp)) {
			mutex_enter(&sp->s_lock);
			sp->s_flags &= ~N4S_CLIENTID_SET;
			mutex_exit(&sp->s_lock);
		}
		ASSERT(nfs_rw_lock_held(&mi->mi_recovlock, RW_READER) ||
		    nfs_rw_lock_held(&mi->mi_recovlock, RW_WRITER));
		mutex_enter(&mi->mi_lock);
		mi->mi_recovflags |= MI4R_NEED_CLIENTID;
		if (recovp->rc_srv_reboot)
			mi->mi_recovflags |= MI4R_SRV_REBOOT;
		mutex_exit(&mi->mi_lock);
		break;

	case NR_OPENFILES:
		ASSERT(nfs_rw_lock_held(&mi->mi_recovlock, RW_READER) ||
		    nfs_rw_lock_held(&mi->mi_recovlock, RW_WRITER));
		mutex_enter(&mi->mi_lock);
		mi->mi_recovflags |= MI4R_REOPEN_FILES;
		if (recovp->rc_srv_reboot)
			mi->mi_recovflags |= MI4R_SRV_REBOOT;
		mutex_exit(&mi->mi_lock);
		break;

	case NR_WRONGSEC:
		ASSERT(nfs_rw_lock_held(&mi->mi_recovlock, RW_READER) ||
		    nfs_rw_lock_held(&mi->mi_recovlock, RW_WRITER));
		mutex_enter(&mi->mi_lock);
		mi->mi_recovflags |= MI4R_NEED_SECINFO;
		mutex_exit(&mi->mi_lock);
		break;

	case NR_EXPIRED:
		if (vp1 != NULL)
			recov_badstate(recovp, vp1, NFS4ERR_EXPIRED);
		if (vp2 != NULL)
			recov_badstate(recovp, vp2, NFS4ERR_EXPIRED);
		goto out_no_thread;	/* no further recovery possible */

	case NR_BAD_STATEID:
		if (vp1 != NULL)
			recov_badstate(recovp, vp1, NFS4ERR_BAD_STATEID);
		if (vp2 != NULL)
			recov_badstate(recovp, vp2, NFS4ERR_BAD_STATEID);
		goto out_no_thread;	/* no further recovery possible */

	case NR_FHEXPIRED:
	case NR_BADHANDLE:
		/* Throttle first; this path skips the throttle further down. */
		if (vp1 != NULL)
			recov_throttle(recovp, vp1);
		if (vp2 != NULL)
			recov_throttle(recovp, vp2);
		/*
		 * Recover the filehandle now, rather than using a
		 * separate thread.  We can do this because filehandle
		 * recovery is independent of any other state, and because
		 * we know that we are not competing with the recovery
		 * thread at this time.  recov_filehandle will deal with
		 * threads that are competing to recover this filehandle.
		 */
		ASSERT(nfs_rw_lock_held(&mi->mi_recovlock, RW_READER) ||
		    nfs_rw_lock_held(&mi->mi_recovlock, RW_WRITER));
		if (vp1 != NULL)
			recov_filehandle(recovp->rc_action, mi, vp1);
		if (vp2 != NULL)
			recov_filehandle(recovp->rc_action, mi, vp2);
		goto out_no_thread;	/* no further recovery needed */

	case NR_STALE:
		/*
		 * NFS4ERR_STALE handling
		 * recov_stale() could set MI4R_NEED_NEW_SERVER to
		 * indicate that we can and should failover.
		 */
		ASSERT(nfs_rw_lock_held(&mi->mi_recovlock, RW_READER) ||
		    nfs_rw_lock_held(&mi->mi_recovlock, RW_WRITER));

		if (vp1 != NULL)
			recov_stale(mi, vp1);
		if (vp2 != NULL)
			recov_stale(mi, vp2);
		mutex_enter(&mi->mi_lock);
		if ((mi->mi_recovflags & MI4R_NEED_NEW_SERVER) == 0) {
			mutex_exit(&mi->mi_lock);
			goto out_no_thread;
		}
		mutex_exit(&mi->mi_lock);
		/* Failover was requested: redo the dispatch as NR_FAILOVER. */
		recovp->rc_action = NR_FAILOVER;
		goto again;

	case NR_BAD_SEQID:
		/* A saved request is queued for the recovery thread. */
		if (recovp->rc_bseqid_rqst) {
			enqueue_bseqid_rqst(recovp, mi);
			break;
		}

		if (vp1 != NULL)
			recov_badstate(recovp, vp1, NFS4ERR_BAD_SEQID);
		if (vp2 != NULL)
			recov_badstate(recovp, vp2, NFS4ERR_BAD_SEQID);
		goto out_no_thread; /* no further recovery possible */

	case NR_OLDSTATEID:
		if (vp1 != NULL)
			recov_badstate(recovp, vp1, NFS4ERR_OLD_STATEID);
		if (vp2 != NULL)
			recov_badstate(recovp, vp2, NFS4ERR_OLD_STATEID);
		goto out_no_thread;	/* no further recovery possible */

	case NR_GRACE:
		nfs4_set_grace_wait(mi);
		goto out_no_thread; /* no further action required for GRACE */

	case NR_DELAY:
		/* Only vp1 is given a delay wait; vp2 is not consulted. */
		if (vp1)
			nfs4_set_delay_wait(vp1);
		goto out_no_thread; /* no further action required for DELAY */

	case NR_LOST_STATE_RQST:
	case NR_LOST_LOCK:
		nfs4_enqueue_lost_rqst(recovp, mi);
		break;
	default:
		/* Unknown action: record the event and start nothing. */
		nfs4_queue_event(RE_UNEXPECTED_ACTION, mi, NULL,
		    recovp->rc_action, NULL, NULL, 0, NULL, 0, TAG_NONE,
		    TAG_NONE, 0, 0);
		goto out_no_thread;
	}

	/*
	 * If either file recently went through the same recovery, wait
	 * awhile.  This is in case there is some sort of bug; we might not
	 * be able to recover properly, but at least we won't bombard the
	 * server with calls, and we won't tie up the client.
	 */
	if (vp1 != NULL)
		recov_throttle(recovp, vp1);
	if (vp2 != NULL)
		recov_throttle(recovp, vp2);

	/*
	 * If there's already a recovery thread, don't start another one.
	 * MI4_RECOV_ACTIV is tested and set under mi_lock, so only one
	 * caller goes on to create the thread.
	 */

	mutex_enter(&mi->mi_lock);
	if (mi->mi_flags & MI4_RECOV_ACTIV) {
		mutex_exit(&mi->mi_lock);
		goto out_no_thread;
	}
	mi->mi_flags |= MI4_RECOV_ACTIV;
	mutex_exit(&mi->mi_lock);
	NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
	    "start_recovery: starting new thread for mi %p", (void*)mi));

	/*
	 * Fill in what the thread needs, taking a hold on each vnode that
	 * is passed along.
	 */
	recovp->rc_mi = mi;
	recovp->rc_vp1 = vp1;
	if (vp1 != NULL) {
		ASSERT(VTOMI4(vp1) == mi);
		VN_HOLD(recovp->rc_vp1);
	}
	recovp->rc_vp2 = vp2;
	if (vp2 != NULL) {
		ASSERT(VTOMI4(vp2) == mi);
		VN_HOLD(recovp->rc_vp2);
	}
	/* No hold is taken here on moved_vp; moved_nm is not copied. */
	recovp->rc_moved_vp = moved_vp;
	recovp->rc_moved_nm = moved_nm;

	(void) zthread_create(NULL, 0, nfs4_recov_thread, recovp, 0,
	    minclsyspri);
	return;

	/* not reached by thread creating call */
out_no_thread:
	/* Undo the caller's mi_in_recovery bump; wake waiters at zero. */
	mutex_enter(&mi->mi_lock);
	mi->mi_in_recovery--;
	if (mi->mi_in_recovery == 0)
		cv_broadcast(&mi->mi_cv_in_recov);
	mutex_exit(&mi->mi_lock);

	VFS_RELE(mi->mi_vfsp);
	MI4_RELE(mi);
	/*
	 * Free up resources that were allocated for us.
	 */
	kmem_free(recovp, sizeof (recov_info_t));
}
641 
642 static int
643 nfs4_check_recov_err(vnode_t *vp, nfs4_op_hint_t op,
644     nfs4_recov_state_t *rsp, int retry_err_cnt, char *str)
645 {
646 	rnode4_t *rp;
647 	int error = 0;
648 	int exempt;
649 
650 	if (vp == NULL)
651 		return (0);
652 
653 	exempt = (op == OH_CLOSE || op == OH_LOCKU || op == OH_DELEGRETURN);
654 	rp = VTOR4(vp);
655 	mutex_enter(&rp->r_statelock);
656 
657 	/*
658 	 * If there was a recovery error, then allow op hints "exempt" from
659 	 * recov errors to retry (currently 3 times).  Either r_error or
660 	 * EIO is returned for non-exempt op hints.
661 	 */
662 	if (rp->r_flags & R4RECOVERR) {
663 		if (exempt && rsp->rs_num_retry_despite_err <=
664 		    nfs4_max_recov_error_retry) {
665 
666 			/*
667 			 * Check to make sure that we haven't already inc'd
668 			 * rs_num_retry_despite_err for current nfs4_start_fop
669 			 * instance.  We don't want to double inc (if we were
670 			 * called with vp2, then the vp1 call could have
671 			 * already incremented.
672 			 */
673 			if (retry_err_cnt == rsp->rs_num_retry_despite_err)
674 				rsp->rs_num_retry_despite_err++;
675 
676 			NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
677 			    "nfs4_start_fop: %s %p DEAD, cnt=%d", str,
678 			    (void *)vp, rsp->rs_num_retry_despite_err));
679 		} else {
680 			error = (rp->r_error ? rp->r_error : EIO);
681 			/*
682 			 * An ESTALE error on a non-regular file is not
683 			 * "sticky".  Return the ESTALE error once, but
684 			 * clear the condition to allow future operations
685 			 * to go OTW.  This will allow the client to
686 			 * recover if the server has merely unshared then
687 			 * re-shared the file system.  For regular files,
688 			 * the unshare has destroyed the open state at the
689 			 * server and we aren't willing to do a reopen (yet).
690 			 */
691 			if (error == ESTALE && vp->v_type != VREG) {
692 				rp->r_flags &=
693 				    ~(R4RECOVERR|R4RECOVERRP|R4STALE);
694 				rp->r_error = 0;
695 				error = ESTALE;
696 			}
697 			NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
698 			    "nfs4_start_fop: %s %p DEAD, cnt=%d error=%d",
699 			    str, (void *)vp,
700 			    rsp->rs_num_retry_despite_err, error));
701 		}
702 	}
703 
704 	mutex_exit(&rp->r_statelock);
705 	return (error);
706 }
707 
708 /*
709  * Initial setup code that every operation should call if it might invoke
710  * client recovery.  Can block waiting for recovery to finish on a
711  * filesystem.  Either vnode ptr can be NULL.
712  *
713  * Returns 0 if there are no outstanding errors.  Can return an
714  * errno value under various circumstances (e.g., failed recovery, or
715  * interrupted while waiting for recovery to finish).
716  *
717  * There must be a corresponding call to nfs4_end_op() to free up any locks
718  * or resources allocated by this call (assuming this call succeeded),
719  * using the same rsp that's passed in here.
720  *
721  * The open and lock seqid synchronization must be stopped before calling this
722  * function, as it could lead to deadlock when trying to reopen a file or
723  * reclaim a lock.  The synchronization is obtained with calls to:
724  *   nfs4_start_open_seqid_sync()
725  *   nfs4_start_lock_seqid_sync()
726  *
727  * *startrecovp is set TRUE if the caller should not bother with the
728  * over-the-wire call, and just initiate recovery for the given request.
729  * This is typically used for state-releasing ops if the filesystem has
730  * been forcibly unmounted.  startrecovp may be NULL for
731  * non-state-releasing ops.
732  */
733 
734 int
735 nfs4_start_fop(mntinfo4_t *mi, vnode_t *vp1, vnode_t *vp2, nfs4_op_hint_t op,
736     nfs4_recov_state_t *rsp, bool_t *startrecovp)
737 {
738 	int error = 0, rerr_cnt;
739 	nfs4_server_t *sp = NULL;
740 	nfs4_server_t *tsp;
741 	nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };
742 	uint_t droplock_cnt;
743 #ifdef DEBUG
744 	void *fop_caller;
745 #endif
746 
747 	ASSERT(vp1 == NULL || vp1->v_vfsp == mi->mi_vfsp);
748 	ASSERT(vp2 == NULL || vp2->v_vfsp == mi->mi_vfsp);
749 
750 #ifdef	DEBUG
751 	if ((fop_caller = tsd_get(nfs4_tsd_key)) != NULL) {
752 		cmn_err(CE_PANIC, "Missing nfs4_end_fop: last caller %p",
753 		    fop_caller);
754 	}
755 	(void) tsd_set(nfs4_tsd_key, caller());
756 #endif
757 
758 	rsp->rs_sp = NULL;
759 	rsp->rs_flags &= ~NFS4_RS_RENAME_HELD;
760 	rerr_cnt = rsp->rs_num_retry_despite_err;
761 
762 	/*
763 	 * Process the items that may delay() based on server response
764 	 */
765 	error = nfs4_wait_for_grace(mi, rsp);
766 	if (error)
767 		goto out;
768 
769 	if (vp1 != NULL) {
770 		error = nfs4_wait_for_delay(vp1, rsp);
771 		if (error)
772 			goto out;
773 	}
774 
775 	/* Wait for a delegation recall to complete. */
776 
777 	error = wait_for_recall(vp1, vp2, op, rsp);
778 	if (error)
779 		goto out;
780 
781 	/*
782 	 * Wait for any current recovery actions to finish.  Note that a
783 	 * recovery thread can still start up after wait_for_recovery()
784 	 * finishes.  We don't block out recovery operations until we
785 	 * acquire s_recovlock and mi_recovlock.
786 	 */
787 	error = wait_for_recovery(mi, op);
788 	if (error)
789 		goto out;
790 
791 	/*
792 	 * Check to see if the rnode is already marked with a
793 	 * recovery error.  If so, return it immediately.  But
794 	 * always pass CLOSE, LOCKU, and DELEGRETURN so we can
795 	 * clean up state on the server.
796 	 */
797 
798 	if (vp1 != NULL) {
799 		if (error = nfs4_check_recov_err(vp1, op, rsp, rerr_cnt, "vp1"))
800 			goto out;
801 		nfs4_check_remap(mi, vp1, NFS4_REMAP_CKATTRS, &e);
802 	}
803 
804 	if (vp2 != NULL) {
805 		if (error = nfs4_check_recov_err(vp2, op, rsp, rerr_cnt, "vp2"))
806 			goto out;
807 		nfs4_check_remap(mi, vp2, NFS4_REMAP_CKATTRS, &e);
808 	}
809 
810 	/*
811 	 * The lock order calls for us to acquire s_recovlock before
812 	 * mi_recovlock, but we have to hold mi_recovlock to look up sp (to
813 	 * prevent races with the failover/migration code).  So acquire
814 	 * mi_recovlock, look up sp, drop mi_recovlock, acquire
815 	 * s_recovlock and mi_recovlock, then verify that sp is still the
816 	 * right object.  XXX Can we find a simpler way to deal with this?
817 	 */
818 	if (nfs_rw_enter_sig(&mi->mi_recovlock, RW_READER,
819 	    mi->mi_flags & MI4_INT)) {
820 		error = EINTR;
821 		goto out;
822 	}
823 get_sp:
824 	sp = find_nfs4_server(mi);
825 	if (sp != NULL) {
826 		sp->s_otw_call_count++;
827 		mutex_exit(&sp->s_lock);
828 		droplock_cnt = mi->mi_srvset_cnt;
829 	}
830 	nfs_rw_exit(&mi->mi_recovlock);
831 
832 	if (sp != NULL) {
833 		if (nfs_rw_enter_sig(&sp->s_recovlock, RW_READER,
834 		    mi->mi_flags & MI4_INT)) {
835 			error = EINTR;
836 			goto out;
837 		}
838 	}
839 	if (nfs_rw_enter_sig(&mi->mi_recovlock, RW_READER,
840 	    mi->mi_flags & MI4_INT)) {
841 		if (sp != NULL)
842 			nfs_rw_exit(&sp->s_recovlock);
843 		error = EINTR;
844 		goto out;
845 	}
846 	/*
847 	 * If the mntinfo4_t hasn't changed nfs4_sever_ts then
848 	 * there's no point in double checking to make sure it
849 	 * has switched.
850 	 */
851 	if (sp == NULL || droplock_cnt != mi->mi_srvset_cnt) {
852 		tsp = find_nfs4_server(mi);
853 		if (tsp != sp) {
854 			/* try again */
855 			if (tsp != NULL) {
856 				mutex_exit(&tsp->s_lock);
857 				nfs4_server_rele(tsp);
858 				tsp = NULL;
859 			}
860 			if (sp != NULL) {
861 				nfs_rw_exit(&sp->s_recovlock);
862 				mutex_enter(&sp->s_lock);
863 				sp->s_otw_call_count--;
864 				mutex_exit(&sp->s_lock);
865 				nfs4_server_rele(sp);
866 				sp = NULL;
867 			}
868 			goto get_sp;
869 		} else {
870 			if (tsp != NULL) {
871 				mutex_exit(&tsp->s_lock);
872 				nfs4_server_rele(tsp);
873 				tsp = NULL;
874 			}
875 		}
876 	}
877 
878 	if (sp != NULL) {
879 		rsp->rs_sp = sp;
880 	}
881 
882 	/*
883 	 * If the fileystem uses volatile filehandles, obtain a lock so
884 	 * that we synchronize with renames.  Exception: mount operations
885 	 * can change mi_fh_expire_type, which could be a problem, since
886 	 * the end_op code needs to be consistent with the start_op code
887 	 * about mi_rename_lock.  Since mounts don't compete with renames,
888 	 * it's simpler to just not acquire the rename lock for mounts.
889 	 */
890 	if (NFS4_VOLATILE_FH(mi) && op != OH_MOUNT) {
891 		if (nfs_rw_enter_sig(&mi->mi_rename_lock,
892 		    op == OH_VFH_RENAME ? RW_WRITER : RW_READER,
893 		    mi->mi_flags & MI4_INT)) {
894 			nfs_rw_exit(&mi->mi_recovlock);
895 			if (sp != NULL)
896 				nfs_rw_exit(&sp->s_recovlock);
897 			error = EINTR;
898 			goto out;
899 		}
900 		rsp->rs_flags |= NFS4_RS_RENAME_HELD;
901 	}
902 
903 	if (OH_IS_STATE_RELE(op)) {
904 		/*
905 		 * For forced unmount, letting the request proceed will
906 		 * almost always delay response to the user, so hand it off
907 		 * to the recovery thread.  For exiting lwp's, we don't
908 		 * have a good way to tell if the request will hang.  We
909 		 * generally want processes to handle their own requests so
910 		 * that they can be done in parallel, but if there is
911 		 * already a recovery thread, hand the request off to it.
912 		 * This will improve user response at no cost to overall
913 		 * system throughput.  For zone shutdown, we'd prefer
914 		 * the recovery thread to handle this as well.
915 		 */
916 		ASSERT(startrecovp != NULL);
917 		mutex_enter(&mi->mi_lock);
918 		if (FS_OR_ZONE_GONE4(mi->mi_vfsp))
919 			*startrecovp = TRUE;
920 		else if ((curthread->t_proc_flag & TP_LWPEXIT) &&
921 		    (mi->mi_flags & MI4_RECOV_ACTIV))
922 			*startrecovp = TRUE;
923 		else
924 			*startrecovp = FALSE;
925 		mutex_exit(&mi->mi_lock);
926 	} else
927 		if (startrecovp != NULL)
928 			*startrecovp = FALSE;
929 
930 	ASSERT(error == 0);
931 	return (error);
932 
933 out:
934 	ASSERT(error != 0);
935 	if (sp != NULL) {
936 		mutex_enter(&sp->s_lock);
937 		sp->s_otw_call_count--;
938 		mutex_exit(&sp->s_lock);
939 		nfs4_server_rele(sp);
940 		rsp->rs_sp = NULL;
941 	}
942 	nfs4_end_op_recall(vp1, vp2, rsp);
943 
944 #ifdef	DEBUG
945 	(void) tsd_set(nfs4_tsd_key, NULL);
946 #endif
947 	return (error);
948 }
949 
950 /*
951  * It is up to the caller to determine if rsp->rs_sp being NULL
952  * is detrimental or not.
953  */
954 int
955 nfs4_start_op(mntinfo4_t *mi, vnode_t *vp1, vnode_t *vp2,
956     nfs4_recov_state_t *rsp)
957 {
958 	ASSERT(rsp->rs_num_retry_despite_err == 0);
959 	rsp->rs_num_retry_despite_err = 0;
960 	return (nfs4_start_fop(mi, vp1, vp2, OH_OTHER, rsp, NULL));
961 }
962 
963 /*
964  * Release any resources acquired by nfs4_start_op().
965  * 'sp' should be the nfs4_server pointer returned by nfs4_start_op().
966  *
967  * The operation hint is used to avoid a deadlock by bypassing delegation
968  * return logic for writes, which are done while returning a delegation.
969  */
970 
971 void
972 nfs4_end_fop(mntinfo4_t *mi, vnode_t *vp1, vnode_t *vp2, nfs4_op_hint_t op,
973     nfs4_recov_state_t *rsp, bool_t needs_recov)
974 {
975 	nfs4_server_t *sp = rsp->rs_sp;
976 	rnode4_t *rp = NULL;
977 
978 #ifdef	lint
979 	/*
980 	 * The op hint isn't used any more, but might be in
981 	 * the future.
982 	 */
983 	op = op;
984 #endif
985 
986 #ifdef	DEBUG
987 	ASSERT(tsd_get(nfs4_tsd_key) != NULL);
988 	(void) tsd_set(nfs4_tsd_key, NULL);
989 #endif
990 
991 	nfs4_end_op_recall(vp1, vp2, rsp);
992 
993 	if (rsp->rs_flags & NFS4_RS_RENAME_HELD)
994 		nfs_rw_exit(&mi->mi_rename_lock);
995 
996 	if (!needs_recov) {
997 		if (rsp->rs_flags & NFS4_RS_DELAY_MSG) {
998 			/* may need to clear the delay interval */
999 			if (vp1 != NULL) {
1000 				rp = VTOR4(vp1);
1001 				mutex_enter(&rp->r_statelock);
1002 				rp->r_delay_interval = 0;
1003 				mutex_exit(&rp->r_statelock);
1004 			}
1005 		}
1006 		rsp->rs_flags &= ~(NFS4_RS_GRACE_MSG|NFS4_RS_DELAY_MSG);
1007 	}
1008 
1009 	/*
1010 	 * If the corresponding nfs4_start_op() found a sp,
1011 	 * then there must still be a sp.
1012 	 */
1013 	if (sp != NULL) {
1014 		nfs_rw_exit(&mi->mi_recovlock);
1015 		nfs_rw_exit(&sp->s_recovlock);
1016 		mutex_enter(&sp->s_lock);
1017 		sp->s_otw_call_count--;
1018 		cv_broadcast(&sp->s_cv_otw_count);
1019 		mutex_exit(&sp->s_lock);
1020 		nfs4_server_rele(sp);
1021 	} else {
1022 		nfs_rw_exit(&mi->mi_recovlock);
1023 	}
1024 }
1025 
1026 void
1027 nfs4_end_op(mntinfo4_t *mi, vnode_t *vp1, vnode_t *vp2,
1028     nfs4_recov_state_t *rsp, bool_t needrecov)
1029 {
1030 	nfs4_end_fop(mi, vp1, vp2, OH_OTHER, rsp, needrecov);
1031 }
1032 
1033 /*
1034  * If the filesystem is going through client recovery, block until
1035  * finished.
1036  * Exceptions:
1037  * - state-releasing ops (CLOSE, LOCKU, DELEGRETURN) are allowed to proceed
1038  *   if the filesystem has been forcibly unmounted or the lwp is exiting.
1039  *
1040  * Return value:
1041  * - 0 if no errors
1042  * - EINTR if the call was interrupted
1043  * - EIO if the filesystem has been forcibly unmounted (non-state-releasing
1044  *   op)
1045  * - the errno value from the recovery thread, if recovery failed
1046  */
1047 
1048 static int
1049 wait_for_recovery(mntinfo4_t *mi, nfs4_op_hint_t op_hint)
1050 {
1051 	int error = 0;
1052 
1053 	mutex_enter(&mi->mi_lock);
1054 
1055 	while (mi->mi_recovflags != 0) {
1056 		klwp_t *lwp = ttolwp(curthread);
1057 
1058 		if ((mi->mi_vfsp->vfs_flag & VFS_UNMOUNTED) ||
1059 		    (mi->mi_flags & MI4_RECOV_FAIL))
1060 			break;
1061 		if (OH_IS_STATE_RELE(op_hint) &&
1062 		    (curthread->t_proc_flag & TP_LWPEXIT))
1063 			break;
1064 
1065 		if (lwp != NULL)
1066 			lwp->lwp_nostop++;
1067 		/* XXX - use different cv? */
1068 		if (cv_wait_sig(&mi->mi_failover_cv, &mi->mi_lock) == 0) {
1069 			error = EINTR;
1070 			if (lwp != NULL)
1071 				lwp->lwp_nostop--;
1072 			break;
1073 		}
1074 		if (lwp != NULL)
1075 			lwp->lwp_nostop--;
1076 	}
1077 
1078 	if ((mi->mi_vfsp->vfs_flag & VFS_UNMOUNTED) &&
1079 	    !OH_IS_STATE_RELE(op_hint)) {
1080 		NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
1081 		    "wait_for_recovery: forced unmount"));
1082 		error = EIO;
1083 	} else if (mi->mi_flags & MI4_RECOV_FAIL) {
1084 		NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
1085 		    "wait_for_recovery: fail since RECOV FAIL"));
1086 		error = mi->mi_error;
1087 	}
1088 
1089 	mutex_exit(&mi->mi_lock);
1090 
1091 	return (error);
1092 }
1093 
1094 /*
1095  * If the client received NFS4ERR_GRACE for this particular mount,
1096  * the client blocks here until it is time to try again.
1097  *
1098  * Return value:
1099  * - 0 if wait was successful
1100  * - EINTR if the call was interrupted
1101  */
1102 
1103 int
1104 nfs4_wait_for_grace(mntinfo4_t *mi, nfs4_recov_state_t *rsp)
1105 {
1106 	int error = 0;
1107 	time_t curtime, time_to_wait;
1108 
1109 	/* do a unprotected check to reduce mi_lock contention */
1110 	if (mi->mi_grace_wait != 0) {
1111 		mutex_enter(&mi->mi_lock);
1112 
1113 		if (mi->mi_grace_wait != 0) {
1114 			if (!(rsp->rs_flags & NFS4_RS_GRACE_MSG))
1115 				rsp->rs_flags |= NFS4_RS_GRACE_MSG;
1116 
1117 			curtime = gethrestime_sec();
1118 
1119 			if (curtime < mi->mi_grace_wait) {
1120 
1121 				time_to_wait = mi->mi_grace_wait - curtime;
1122 
1123 				mutex_exit(&mi->mi_lock);
1124 
1125 				delay(SEC_TO_TICK(time_to_wait));
1126 
1127 				curtime = gethrestime_sec();
1128 
1129 				mutex_enter(&mi->mi_lock);
1130 
1131 				if (curtime >= mi->mi_grace_wait)
1132 					mi->mi_grace_wait = 0;
1133 			} else {
1134 				mi->mi_grace_wait = 0;
1135 			}
1136 		}
1137 		mutex_exit(&mi->mi_lock);
1138 	}
1139 
1140 	return (error);
1141 }
1142 
1143 /*
1144  * If the client received NFS4ERR_DELAY for an operation on a vnode,
1145  * the client blocks here until it is time to try again.
1146  *
1147  * Return value:
1148  * - 0 if wait was successful
1149  * - EINTR if the call was interrupted
1150  */
1151 
1152 int
1153 nfs4_wait_for_delay(vnode_t *vp, nfs4_recov_state_t *rsp)
1154 {
1155 	int error = 0;
1156 	time_t curtime, time_to_wait;
1157 	rnode4_t *rp;
1158 
1159 	ASSERT(vp != NULL);
1160 
1161 	rp = VTOR4(vp);
1162 
1163 	/* do a unprotected check to reduce r_statelock contention */
1164 	if (rp->r_delay_wait != 0) {
1165 		mutex_enter(&rp->r_statelock);
1166 
1167 		if (rp->r_delay_wait != 0) {
1168 
1169 			if (!(rsp->rs_flags & NFS4_RS_DELAY_MSG)) {
1170 				rsp->rs_flags |= NFS4_RS_DELAY_MSG;
1171 				nfs4_mi_kstat_inc_delay(VTOMI4(vp));
1172 			}
1173 
1174 			curtime = gethrestime_sec();
1175 
1176 			if (curtime < rp->r_delay_wait) {
1177 
1178 				time_to_wait = rp->r_delay_wait - curtime;
1179 
1180 				mutex_exit(&rp->r_statelock);
1181 
1182 				delay(SEC_TO_TICK(time_to_wait));
1183 
1184 				curtime = gethrestime_sec();
1185 
1186 				mutex_enter(&rp->r_statelock);
1187 
1188 				if (curtime >= rp->r_delay_wait)
1189 					rp->r_delay_wait = 0;
1190 			} else {
1191 				rp->r_delay_wait = 0;
1192 			}
1193 		}
1194 		mutex_exit(&rp->r_statelock);
1195 	}
1196 
1197 	return (error);
1198 }
1199 
1200 /*
1201  * The recovery thread.
1202  */
1203 
1204 static void
1205 nfs4_recov_thread(recov_info_t *recovp)
1206 {
1207 	mntinfo4_t *mi = recovp->rc_mi;
1208 	nfs4_server_t *sp;
1209 	int done = 0, error = 0;
1210 	bool_t recov_fail = FALSE;
1211 	callb_cpr_t cpr_info;
1212 	kmutex_t cpr_lock;
1213 
1214 	nfs4_queue_event(RE_START, mi, NULL, mi->mi_recovflags,
1215 	    recovp->rc_vp1, recovp->rc_vp2, 0, NULL, 0, TAG_NONE, TAG_NONE,
1216 	    0, 0);
1217 
1218 	mutex_init(&cpr_lock, NULL, MUTEX_DEFAULT, NULL);
1219 	CALLB_CPR_INIT(&cpr_info, &cpr_lock, callb_generic_cpr, "nfsv4Recov");
1220 
1221 	mutex_enter(&mi->mi_lock);
1222 	mi->mi_recovthread = curthread;
1223 	mutex_exit(&mi->mi_lock);
1224 
1225 	/*
1226 	 * We don't really need protection here against failover or
1227 	 * migration, since the current thread is the one that would make
1228 	 * any changes, but hold mi_recovlock anyway for completeness (and
1229 	 * to satisfy any ASSERTs).
1230 	 */
1231 	(void) nfs_rw_enter_sig(&mi->mi_recovlock, RW_READER, 0);
1232 	sp = find_nfs4_server(mi);
1233 	if (sp != NULL)
1234 		mutex_exit(&sp->s_lock);
1235 	nfs_rw_exit(&mi->mi_recovlock);
1236 
1237 	/*
1238 	 * Do any necessary recovery, based on the information in recovp
1239 	 * and any recovery flags.
1240 	 */
1241 
1242 	do {
1243 		mutex_enter(&mi->mi_lock);
1244 		if (FS_OR_ZONE_GONE4(mi->mi_vfsp)) {
1245 			bool_t activesrv;
1246 
1247 			NFS4_DEBUG(nfs4_client_recov_debug &&
1248 			    mi->mi_vfsp->vfs_flag & VFS_UNMOUNTED, (CE_NOTE,
1249 			    "nfs4_recov_thread: file system has been "
1250 			    "unmounted"));
1251 			NFS4_DEBUG(nfs4_client_recov_debug &&
1252 			    zone_status_get(curproc->p_zone) >=
1253 			    ZONE_IS_SHUTTING_DOWN, (CE_NOTE,
1254 			    "nfs4_recov_thread: zone shutting down"));
1255 			/*
1256 			 * If the server has lost its state for us and
1257 			 * the filesystem is unmounted, then the filesystem
1258 			 * can be tossed, even if there are lost lock or
1259 			 * lost state calls in the recovery queue.
1260 			 */
1261 			if (mi->mi_recovflags &
1262 			    (MI4R_NEED_CLIENTID | MI4R_REOPEN_FILES)) {
1263 				NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
1264 				"nfs4_recov_thread: bailing out"));
1265 				mi->mi_flags |= MI4_RECOV_FAIL;
1266 				mi->mi_error = recovp->rc_error;
1267 				recov_fail = TRUE;
1268 			}
1269 			/*
1270 			 * We don't know if the server has any state for
1271 			 * us, and the filesystem has been unmounted.  If
1272 			 * there are "lost state" recovery items, keep
1273 			 * trying to process them until there are no more
1274 			 * mounted filesystems for the server.  Otherwise,
1275 			 * bail out.  The reason we don't mark the
1276 			 * filesystem as failing recovery is in case we
1277 			 * have to do "lost state" recovery later (e.g., a
1278 			 * user process exits).
1279 			 */
1280 			if (!(mi->mi_recovflags & MI4R_LOST_STATE)) {
1281 				done = 1;
1282 				mutex_exit(&mi->mi_lock);
1283 				break;
1284 			}
1285 			mutex_exit(&mi->mi_lock);
1286 
1287 			if (sp == NULL)
1288 				activesrv = FALSE;
1289 			else {
1290 				mutex_enter(&sp->s_lock);
1291 				activesrv = nfs4_fs_active(sp);
1292 			}
1293 			if (!activesrv) {
1294 				NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
1295 				    "no active fs for server %p",
1296 				    (void *)sp));
1297 				mutex_enter(&mi->mi_lock);
1298 				mi->mi_flags |= MI4_RECOV_FAIL;
1299 				mi->mi_error = recovp->rc_error;
1300 				mutex_exit(&mi->mi_lock);
1301 				recov_fail = TRUE;
1302 				if (sp != NULL) {
1303 					/*
1304 					 * Mark the server instance as
1305 					 * dead, so that nobody will attach
1306 					 * a new filesystem.
1307 					 */
1308 					nfs4_mark_srv_dead(sp);
1309 				}
1310 			}
1311 			if (sp != NULL)
1312 				mutex_exit(&sp->s_lock);
1313 		} else {
1314 			mutex_exit(&mi->mi_lock);
1315 		}
1316 
1317 		/*
1318 		 * Check if we need to select a new server for a
1319 		 * failover.  Choosing a new server will force at
1320 		 * least a check of the clientid.
1321 		 */
1322 		mutex_enter(&mi->mi_lock);
1323 		if (!recov_fail &&
1324 		    (mi->mi_recovflags & MI4R_NEED_NEW_SERVER)) {
1325 			mutex_exit(&mi->mi_lock);
1326 			recov_newserver(recovp, &sp, &recov_fail);
1327 		} else
1328 			mutex_exit(&mi->mi_lock);
1329 
1330 		/*
1331 		 * Check if we need to recover the clientid.  This
1332 		 * must be done before file and lock recovery, and it
1333 		 * potentially affects the recovery threads for other
1334 		 * filesystems, so it gets special treatment.
1335 		 */
1336 		if (sp != NULL && recov_fail == FALSE) {
1337 			mutex_enter(&sp->s_lock);
1338 			if (!(sp->s_flags & N4S_CLIENTID_SET)) {
1339 				mutex_exit(&sp->s_lock);
1340 				recov_clientid(recovp, sp);
1341 			} else {
1342 				/*
1343 				 * Unset this flag in case another recovery
1344 				 * thread successfully recovered the clientid
1345 				 * for us already.
1346 				 */
1347 				mutex_enter(&mi->mi_lock);
1348 				mi->mi_recovflags &= ~MI4R_NEED_CLIENTID;
1349 				mutex_exit(&mi->mi_lock);
1350 				mutex_exit(&sp->s_lock);
1351 			}
1352 		}
1353 
1354 		/*
1355 		 * Check if we need to get the security information.
1356 		 */
1357 		mutex_enter(&mi->mi_lock);
1358 		if ((mi->mi_recovflags & MI4R_NEED_SECINFO) &&
1359 		    !(mi->mi_flags & MI4_RECOV_FAIL)) {
1360 			mutex_exit(&mi->mi_lock);
1361 			(void) nfs_rw_enter_sig(&mi->mi_recovlock,
1362 			    RW_WRITER, 0);
1363 			error = nfs4_secinfo_recov(recovp->rc_mi,
1364 			    recovp->rc_vp1, recovp->rc_vp2);
1365 			/*
1366 			 * If error, nothing more can be done, stop
1367 			 * the recovery.
1368 			 */
1369 			if (error) {
1370 				mutex_enter(&mi->mi_lock);
1371 				mi->mi_flags |= MI4_RECOV_FAIL;
1372 				mi->mi_error = recovp->rc_error;
1373 				mutex_exit(&mi->mi_lock);
1374 				nfs4_queue_event(RE_WRONGSEC, mi, NULL,
1375 				    error, recovp->rc_vp1, recovp->rc_vp2,
1376 				    0, NULL, 0, TAG_NONE, TAG_NONE, 0, 0);
1377 			}
1378 			nfs_rw_exit(&mi->mi_recovlock);
1379 		} else
1380 			mutex_exit(&mi->mi_lock);
1381 
1382 		/*
1383 		 * Check if there's a bad seqid to recover.
1384 		 */
1385 		mutex_enter(&mi->mi_lock);
1386 		if ((mi->mi_recovflags & MI4R_BAD_SEQID) &&
1387 		    !(mi->mi_flags & MI4_RECOV_FAIL)) {
1388 			mutex_exit(&mi->mi_lock);
1389 			(void) nfs_rw_enter_sig(&mi->mi_recovlock,
1390 			    RW_WRITER, 0);
1391 			recov_bad_seqid(recovp);
1392 			nfs_rw_exit(&mi->mi_recovlock);
1393 		} else
1394 			mutex_exit(&mi->mi_lock);
1395 
1396 		/*
1397 		 * Next check for recovery that affects the entire
1398 		 * filesystem.
1399 		 */
1400 		if (sp != NULL) {
1401 			mutex_enter(&mi->mi_lock);
1402 			if ((mi->mi_recovflags & MI4R_REOPEN_FILES) &&
1403 			    !(mi->mi_flags & MI4_RECOV_FAIL)) {
1404 				mutex_exit(&mi->mi_lock);
1405 				recov_openfiles(recovp, sp);
1406 			} else
1407 				mutex_exit(&mi->mi_lock);
1408 		}
1409 
1410 		/*
1411 		 * Send any queued state recovery requests.
1412 		 */
1413 		mutex_enter(&mi->mi_lock);
1414 		if (sp != NULL &&
1415 		    (mi->mi_recovflags & MI4R_LOST_STATE) &&
1416 		    !(mi->mi_flags & MI4_RECOV_FAIL)) {
1417 			mutex_exit(&mi->mi_lock);
1418 			(void) nfs_rw_enter_sig(&mi->mi_recovlock,
1419 			    RW_WRITER, 0);
1420 			nfs4_resend_lost_rqsts(recovp, sp);
1421 			if (list_head(&mi->mi_lost_state) == NULL) {
1422 				/* done */
1423 				mutex_enter(&mi->mi_lock);
1424 				mi->mi_recovflags &= ~MI4R_LOST_STATE;
1425 				mutex_exit(&mi->mi_lock);
1426 			}
1427 			nfs_rw_exit(&mi->mi_recovlock);
1428 		} else {
1429 			mutex_exit(&mi->mi_lock);
1430 		}
1431 
1432 		/*
1433 		 * See if there is anything more to do.  If not, announce
1434 		 * that we are done and exit.
1435 		 *
1436 		 * Need mi_recovlock to keep 'sp' valid.  Must grab
1437 		 * mi_recovlock before mi_lock to preserve lock ordering.
1438 		 */
1439 		(void) nfs_rw_enter_sig(&mi->mi_recovlock, RW_READER, 0);
1440 		mutex_enter(&mi->mi_lock);
1441 		if ((mi->mi_recovflags & ~MI4R_SRV_REBOOT) == 0 ||
1442 		    (mi->mi_flags & MI4_RECOV_FAIL)) {
1443 			list_t local_lost_state;
1444 			nfs4_lost_rqst_t *lrp;
1445 
1446 			/*
1447 			 * We need to remove the lost requests before we
1448 			 * unmark the mi as no longer doing recovery to
1449 			 * avoid a race with a new thread putting new lost
1450 			 * requests on the same mi (and the going away
1451 			 * thread would remove the new lost requests).
1452 			 *
1453 			 * Move the lost requests to a local list since
1454 			 * nfs4_remove_lost_rqst() drops mi_lock, and
1455 			 * dropping the mi_lock would make our check to
1456 			 * see if recovery is done no longer valid.
1457 			 */
1458 			list_create(&local_lost_state,
1459 			    sizeof (nfs4_lost_rqst_t),
1460 			    offsetof(nfs4_lost_rqst_t, lr_node));
1461 			list_move_tail(&local_lost_state, &mi->mi_lost_state);
1462 
1463 			done = 1;
1464 			mutex_exit(&mi->mi_lock);
1465 			/*
1466 			 * Now officially free the "moved"
1467 			 * lost requests.
1468 			 */
1469 			while ((lrp = list_head(&local_lost_state)) != NULL) {
1470 				list_remove(&local_lost_state, lrp);
1471 				nfs4_free_lost_rqst(lrp, sp);
1472 			}
1473 			list_destroy(&local_lost_state);
1474 		} else
1475 			mutex_exit(&mi->mi_lock);
1476 		nfs_rw_exit(&mi->mi_recovlock);
1477 
1478 		/*
1479 		 * If the filesystem has been forcibly unmounted, there is
1480 		 * probably no point in retrying immediately.  Furthermore,
1481 		 * there might be user processes waiting for a chance to
1482 		 * queue up "lost state" requests, so that they can exit.
1483 		 * So pause here for a moment.  Same logic for zone shutdown.
1484 		 */
1485 		if (!done && FS_OR_ZONE_GONE4(mi->mi_vfsp)) {
1486 			mutex_enter(&mi->mi_lock);
1487 			cv_broadcast(&mi->mi_failover_cv);
1488 			mutex_exit(&mi->mi_lock);
1489 			delay(SEC_TO_TICK(nfs4_unmount_delay));
1490 		}
1491 
1492 	} while (!done);
1493 
1494 	if (sp != NULL)
1495 		nfs4_server_rele(sp);
1496 
1497 	/*
1498 	 * Return all recalled delegations
1499 	 */
1500 	nfs4_dlistclean();
1501 
1502 	mutex_enter(&mi->mi_lock);
1503 	recov_done(mi, recovp);
1504 	mutex_exit(&mi->mi_lock);
1505 
1506 	/*
1507 	 * Free up resources that were allocated for us.
1508 	 */
1509 	if (recovp->rc_vp1 != NULL)
1510 		VN_RELE(recovp->rc_vp1);
1511 	if (recovp->rc_vp2 != NULL)
1512 		VN_RELE(recovp->rc_vp2);
1513 
1514 	/* now we are done using the mi struct, signal the waiters */
1515 	mutex_enter(&mi->mi_lock);
1516 	mi->mi_in_recovery--;
1517 	if (mi->mi_in_recovery == 0)
1518 		cv_broadcast(&mi->mi_cv_in_recov);
1519 	mutex_exit(&mi->mi_lock);
1520 
1521 	VFS_RELE(mi->mi_vfsp);
1522 	MI4_RELE(mi);
1523 	kmem_free(recovp, sizeof (recov_info_t));
1524 	mutex_enter(&cpr_lock);
1525 	CALLB_CPR_EXIT(&cpr_info);
1526 	mutex_destroy(&cpr_lock);
1527 	zthread_exit();
1528 }
1529 
1530 /*
1531  * Log the end of recovery and notify any waiting threads.
1532  */
1533 
1534 static void
1535 recov_done(mntinfo4_t *mi, recov_info_t *recovp)
1536 {
1537 
1538 	ASSERT(MUTEX_HELD(&mi->mi_lock));
1539 
1540 	nfs4_queue_event(RE_END, mi, NULL, 0, recovp->rc_vp1,
1541 	    recovp->rc_vp2, 0, NULL, 0, TAG_NONE, TAG_NONE, 0, 0);
1542 	mi->mi_recovthread = NULL;
1543 	mi->mi_flags &= ~MI4_RECOV_ACTIV;
1544 	mi->mi_recovflags &= ~MI4R_SRV_REBOOT;
1545 	cv_broadcast(&mi->mi_failover_cv);
1546 }
1547 
1548 /*
1549  * State-specific recovery routines, by state.
1550  */
1551 
1552 /*
1553  * Failover.
1554  *
1555  * Replaces *spp with a reference to the new server, which must
1556  * eventually be freed.
1557  */
1558 
1559 static void
1560 recov_newserver(recov_info_t *recovp, nfs4_server_t **spp, bool_t *recov_fail)
1561 {
1562 	mntinfo4_t *mi = recovp->rc_mi;
1563 	servinfo4_t *svp = NULL;
1564 	nfs4_server_t *osp = *spp;
1565 	CLIENT *cl;
1566 	enum clnt_stat status;
1567 	struct timeval tv;
1568 	int error;
1569 	int oncethru = 0;
1570 	rnode4_t *rp;
1571 	int index;
1572 	nfs_fh4 fh;
1573 	char *snames;
1574 	size_t len;
1575 
1576 	(void) nfs_rw_enter_sig(&mi->mi_recovlock, RW_WRITER, 0);
1577 
1578 	tv.tv_sec = 2;
1579 	tv.tv_usec = 0;
1580 
1581 #ifdef lint
1582 	/*
1583 	 * Lint can't follow the logic, so thinks that snames and len
1584 	 * can be used before being set.  They can't, but lint can't
1585 	 * figure it out.  To address the lint warning, initialize
1586 	 * snames and len for lint.
1587 	 */
1588 	snames = NULL;
1589 	len = 0;
1590 #endif
1591 
1592 	/*
1593 	 * Ping the null NFS procedure of every server in
1594 	 * the list until one responds.  We always start
1595 	 * at the head of the list and always skip the one
1596 	 * that is current, since it's caused us a problem.
1597 	 */
1598 	while (svp == NULL) {
1599 		for (svp = mi->mi_servers; svp; svp = svp->sv_next) {
1600 
1601 			mutex_enter(&mi->mi_lock);
1602 			if (FS_OR_ZONE_GONE4(mi->mi_vfsp)) {
1603 				mi->mi_flags |= MI4_RECOV_FAIL;
1604 				mutex_exit(&mi->mi_lock);
1605 				(void) nfs_rw_exit(&mi->mi_recovlock);
1606 				*recov_fail = TRUE;
1607 				if (oncethru)
1608 					kmem_free(snames, len);
1609 				return;
1610 			}
1611 			mutex_exit(&mi->mi_lock);
1612 
1613 			(void) nfs_rw_enter_sig(&svp->sv_lock, RW_READER, 0);
1614 			if (svp->sv_flags & SV4_NOTINUSE) {
1615 				nfs_rw_exit(&svp->sv_lock);
1616 				continue;
1617 			}
1618 			nfs_rw_exit(&svp->sv_lock);
1619 
1620 			if (!oncethru && svp == mi->mi_curr_serv)
1621 				continue;
1622 
1623 			error = clnt_tli_kcreate(svp->sv_knconf, &svp->sv_addr,
1624 			    NFS_PROGRAM, NFS_V4, 0, 1, CRED(), &cl);
1625 			if (error)
1626 				continue;
1627 
1628 			if (!(mi->mi_flags & MI4_INT))
1629 				cl->cl_nosignal = TRUE;
1630 			status = CLNT_CALL(cl, RFS_NULL, xdr_void, NULL,
1631 			    xdr_void, NULL, tv);
1632 			if (!(mi->mi_flags & MI4_INT))
1633 				cl->cl_nosignal = FALSE;
1634 			AUTH_DESTROY(cl->cl_auth);
1635 			CLNT_DESTROY(cl);
1636 			if (status == RPC_SUCCESS) {
1637 				nfs4_queue_event(RE_FAILOVER, mi,
1638 				    svp == mi->mi_curr_serv ? NULL :
1639 				    svp->sv_hostname, 0, NULL, NULL, 0,
1640 				    NULL, 0, TAG_NONE, TAG_NONE, 0, 0);
1641 				break;
1642 			}
1643 		}
1644 
1645 		if (svp == NULL) {
1646 			if (!oncethru) {
1647 				snames = nfs4_getsrvnames(mi, &len);
1648 				nfs4_queue_fact(RF_SRVS_NOT_RESPOND, mi,
1649 				    0, 0, 0, FALSE, snames, 0, NULL);
1650 				oncethru = 1;
1651 			}
1652 			delay(hz);
1653 		}
1654 	}
1655 
1656 	if (oncethru) {
1657 		nfs4_queue_fact(RF_SRVS_OK, mi, 0, 0, 0, FALSE, snames,
1658 		    0, NULL);
1659 		kmem_free(snames, len);
1660 	}
1661 
1662 #if DEBUG
1663 	(void) nfs_rw_enter_sig(&svp->sv_lock, RW_READER, 0);
1664 	ASSERT((svp->sv_flags & SV4_NOTINUSE) == 0);
1665 	nfs_rw_exit(&svp->sv_lock);
1666 #endif
1667 
1668 	mutex_enter(&mi->mi_lock);
1669 	mi->mi_recovflags &= ~MI4R_NEED_NEW_SERVER;
1670 	if (svp != mi->mi_curr_serv) {
1671 		servinfo4_t *osvp = mi->mi_curr_serv;
1672 
1673 		mutex_exit(&mi->mi_lock);
1674 
1675 		/*
1676 		 * Update server-dependent fields in the root vnode.
1677 		 */
1678 		index = rtable4hash(mi->mi_rootfh);
1679 		rw_enter(&rtable4[index].r_lock, RW_WRITER);
1680 
1681 		rp = r4find(&rtable4[index], mi->mi_rootfh, mi->mi_vfsp);
1682 		if (rp != NULL) {
1683 			NFS4_DEBUG(nfs4_client_failover_debug, (CE_NOTE,
1684 			    "recov_newserver: remapping %s", rnode4info(rp)));
1685 			mutex_enter(&rp->r_statelock);
1686 			rp->r_server = svp;
1687 			PURGE_ATTRCACHE4_LOCKED(rp);
1688 			mutex_exit(&rp->r_statelock);
1689 			(void) nfs4_free_data_reclaim(rp);
1690 			nfs4_purge_rddir_cache(RTOV4(rp));
1691 			rw_exit(&rtable4[index].r_lock);
1692 			NFS4_DEBUG(nfs4_client_failover_debug, (CE_NOTE,
1693 			    "recov_newserver: done with %s",
1694 			    rnode4info(rp)));
1695 			VN_RELE(RTOV4(rp));
1696 		} else
1697 			rw_exit(&rtable4[index].r_lock);
1698 		(void) dnlc_purge_vfsp(mi->mi_vfsp, 0);
1699 
1700 		mutex_enter(&mi->mi_lock);
1701 		mi->mi_recovflags |= MI4R_REOPEN_FILES | MI4R_REMAP_FILES;
1702 		if (recovp->rc_srv_reboot)
1703 			mi->mi_recovflags |= MI4R_SRV_REBOOT;
1704 		mi->mi_curr_serv = svp;
1705 		mi->mi_failover++;
1706 		mi->mi_flags &= ~MI4_BADOWNER_DEBUG;
1707 		mutex_exit(&mi->mi_lock);
1708 
1709 		(void) nfs_rw_enter_sig(&svp->sv_lock, RW_READER, 0);
1710 		fh.nfs_fh4_len = svp->sv_fhandle.fh_len;
1711 		fh.nfs_fh4_val = svp->sv_fhandle.fh_buf;
1712 		sfh4_update(mi->mi_rootfh, &fh);
1713 		fh.nfs_fh4_len = svp->sv_pfhandle.fh_len;
1714 		fh.nfs_fh4_val = svp->sv_pfhandle.fh_buf;
1715 		sfh4_update(mi->mi_srvparentfh, &fh);
1716 		nfs_rw_exit(&svp->sv_lock);
1717 
1718 		*spp = nfs4_move_mi(mi, osvp, svp);
1719 		if (osp != NULL)
1720 			nfs4_server_rele(osp);
1721 	} else
1722 		mutex_exit(&mi->mi_lock);
1723 	(void) nfs_rw_exit(&mi->mi_recovlock);
1724 }
1725 
1726 /*
1727  * Clientid.
1728  */
1729 
1730 static void
1731 recov_clientid(recov_info_t *recovp, nfs4_server_t *sp)
1732 {
1733 	mntinfo4_t *mi = recovp->rc_mi;
1734 	int error = 0;
1735 	int still_stale;
1736 	int need_new_s;
1737 
1738 	ASSERT(sp != NULL);
1739 
1740 	/*
1741 	 * Acquire the recovery lock and then verify that the clientid
1742 	 * still needs to be recovered.  (Note that s_recovlock is supposed
1743 	 * to be acquired before s_lock.)  Since the thread holds the
1744 	 * recovery lock, no other thread will recover the clientid.
1745 	 */
1746 	(void) nfs_rw_enter_sig(&sp->s_recovlock, RW_WRITER, 0);
1747 	(void) nfs_rw_enter_sig(&mi->mi_recovlock, RW_WRITER, 0);
1748 	mutex_enter(&sp->s_lock);
1749 	still_stale = ((sp->s_flags & N4S_CLIENTID_SET) == 0);
1750 	mutex_exit(&sp->s_lock);
1751 
1752 	if (still_stale) {
1753 		nfs4_error_t n4e;
1754 
1755 		nfs4_error_zinit(&n4e);
1756 		nfs4setclientid(mi, kcred, TRUE, &n4e);
1757 		error = n4e.error;
1758 		if (error != 0) {
1759 
1760 			/*
1761 			 * nfs4setclientid may have set MI4R_NEED_NEW_SERVER,
1762 			 * if so, just return and let recov_thread drive
1763 			 * failover.
1764 			 */
1765 			mutex_enter(&mi->mi_lock);
1766 			need_new_s = mi->mi_recovflags & MI4R_NEED_NEW_SERVER;
1767 			mutex_exit(&mi->mi_lock);
1768 
1769 			if (need_new_s) {
1770 				nfs_rw_exit(&mi->mi_recovlock);
1771 				nfs_rw_exit(&sp->s_recovlock);
1772 				return;
1773 			}
1774 
1775 			nfs4_queue_event(RE_CLIENTID, mi, NULL, n4e.error, NULL,
1776 			    NULL, n4e.stat, NULL, 0, TAG_NONE, TAG_NONE, 0, 0);
1777 			mutex_enter(&mi->mi_lock);
1778 			mi->mi_flags |= MI4_RECOV_FAIL;
1779 			mi->mi_error = recovp->rc_error;
1780 			mutex_exit(&mi->mi_lock);
1781 			/* don't destroy the nfs4_server, let umount do it */
1782 		}
1783 	}
1784 
1785 	if (error == 0) {
1786 		mutex_enter(&mi->mi_lock);
1787 		mi->mi_recovflags &= ~MI4R_NEED_CLIENTID;
1788 		/*
1789 		 * If still_stale isn't true, then another thread already
1790 		 * recovered the clientid.  And that thread that set the
1791 		 * clientid will have initiated reopening files on all the
1792 		 * filesystems for the server, so we should not initiate
1793 		 * reopening for this filesystem here.
1794 		 */
1795 		if (still_stale) {
1796 			mi->mi_recovflags |= MI4R_REOPEN_FILES;
1797 			if (recovp->rc_srv_reboot)
1798 				mi->mi_recovflags |= MI4R_SRV_REBOOT;
1799 		}
1800 		mutex_exit(&mi->mi_lock);
1801 	}
1802 
1803 	nfs_rw_exit(&mi->mi_recovlock);
1804 
1805 	if (error != 0) {
1806 		nfs_rw_exit(&sp->s_recovlock);
1807 		mutex_enter(&mi->mi_lock);
1808 		if ((mi->mi_flags & MI4_RECOV_FAIL) == 0)
1809 			delay(SEC_TO_TICK(recov_err_delay));
1810 		mutex_exit(&mi->mi_lock);
1811 	} else {
1812 		mntinfo4_t **milist;
1813 		mntinfo4_t *tmi;
1814 		int nummi, i;
1815 
1816 		/*
1817 		 * Initiate recovery of open files for other filesystems.
1818 		 * We create an array of filesystems, rather than just
1819 		 * walking the filesystem list, to avoid deadlock issues
1820 		 * with s_lock and mi_recovlock.
1821 		 */
1822 		milist = make_milist(sp, &nummi);
1823 		for (i = 0; i < nummi; i++) {
1824 			tmi = milist[i];
1825 			if (tmi != mi) {
1826 				(void) nfs_rw_enter_sig(&tmi->mi_recovlock,
1827 				    RW_READER, 0);
1828 				start_recovery_action(NR_OPENFILES, TRUE, tmi,
1829 				    NULL, NULL);
1830 				nfs_rw_exit(&tmi->mi_recovlock);
1831 			}
1832 		}
1833 		free_milist(milist, nummi);
1834 
1835 		nfs_rw_exit(&sp->s_recovlock);
1836 	}
1837 }
1838 
1839 /*
1840  * Return an array of filesystems associated with the given server.  The
1841  * caller should call free_milist() to free the references and memory.
1842  */
1843 
1844 static mntinfo4_t **
1845 make_milist(nfs4_server_t *sp, int *nummip)
1846 {
1847 	int nummi, i;
1848 	mntinfo4_t **milist;
1849 	mntinfo4_t *tmi;
1850 
1851 	mutex_enter(&sp->s_lock);
1852 	nummi = 0;
1853 	for (tmi = sp->mntinfo4_list; tmi != NULL; tmi = tmi->mi_clientid_next)
1854 		nummi++;
1855 
1856 	milist = kmem_alloc(nummi * sizeof (mntinfo4_t *), KM_SLEEP);
1857 
1858 	for (i = 0, tmi = sp->mntinfo4_list; tmi != NULL; i++,
1859 	    tmi = tmi->mi_clientid_next) {
1860 		milist[i] = tmi;
1861 		VFS_HOLD(tmi->mi_vfsp);
1862 	}
1863 	mutex_exit(&sp->s_lock);
1864 
1865 	*nummip = nummi;
1866 	return (milist);
1867 }
1868 
1869 /*
1870  * Free the filesystem list created by make_milist().
1871  */
1872 
1873 static void
1874 free_milist(mntinfo4_t **milist, int nummi)
1875 {
1876 	mntinfo4_t *tmi;
1877 	int i;
1878 
1879 	for (i = 0; i < nummi; i++) {
1880 		tmi = milist[i];
1881 		VFS_RELE(tmi->mi_vfsp);
1882 	}
1883 	kmem_free(milist, nummi * sizeof (mntinfo4_t *));
1884 }
1885 
1886 /*
1887  * Filehandle
1888  */
1889 
1890 /*
1891  * Lookup the filehandle for the given vnode and update the rnode if it has
1892  * changed.
1893  *
1894  * Errors:
1895  * - if the filehandle could not be updated because of an error that
1896  *   requires further recovery, initiate that recovery and return.
1897  * - if the filehandle could not be updated because of a signal, pretend we
1898  *   succeeded and let someone else deal with it.
1899  * - if the filehandle could not be updated and the filesystem has been
1900  *   forcibly unmounted, pretend we succeeded, and let the caller deal with
1901  *   the forced unmount (to retry or not to retry, that is the question).
1902  * - if the filehandle could not be updated because of some other error,
1903  *   mark the rnode bad and return.
1904  */
1905 static void
1906 recov_filehandle(nfs4_recov_t action, mntinfo4_t *mi, vnode_t *vp)
1907 {
1908 	rnode4_t *rp = VTOR4(vp);
1909 	nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };
1910 	bool_t needrecov;
1911 
1912 	mutex_enter(&rp->r_statelock);
1913 
1914 	if (rp->r_flags & R4RECOVERR) {
1915 		mutex_exit(&rp->r_statelock);
1916 		return;
1917 	}
1918 
1919 	/*
1920 	 * If someone else is updating the filehandle, wait for them to
1921 	 * finish and then let our caller retry.
1922 	 */
1923 	if (rp->r_flags & R4RECEXPFH) {
1924 		while (rp->r_flags & R4RECEXPFH) {
1925 			cv_wait(&rp->r_cv, &rp->r_statelock);
1926 		}
1927 		mutex_exit(&rp->r_statelock);
1928 		return;
1929 	}
1930 	rp->r_flags |= R4RECEXPFH;
1931 	mutex_exit(&rp->r_statelock);
1932 
1933 	if (action == NR_BADHANDLE) {
1934 		/* shouldn't happen */
1935 		nfs4_queue_event(RE_BADHANDLE, mi, NULL, 0,
1936 		    vp, NULL, 0, NULL, 0, TAG_NONE, TAG_NONE, 0, 0);
1937 	}
1938 
1939 	nfs4_remap_file(mi, vp, 0, &e);
1940 	needrecov = nfs4_needs_recovery(&e, FALSE, mi->mi_vfsp);
1941 
1942 	/*
1943 	 * If we get BADHANDLE, FHEXPIRED or STALE in their handler,
1944 	 * something is broken. Don't try to recover, just mark the
1945 	 * file dead.
1946 	 */
1947 	DTRACE_PROBE2(recov__filehandle, nfs4_error_t, &e, vnode_t, vp);
1948 	if (needrecov) {
1949 		if (e.error == 0) {
1950 			switch (e.stat) {
1951 			case NFS4ERR_BADHANDLE:
1952 			case NFS4ERR_FHEXPIRED:
1953 			case NFS4ERR_STALE:
1954 				goto norec;	/* Unrecoverable errors */
1955 			default:
1956 				break;
1957 			}
1958 		}
1959 		(void) nfs4_start_recovery(&e, mi, vp, NULL,
1960 		    NULL, NULL, OP_LOOKUP, NULL, NULL, NULL);
1961 
1962 	} else if (e.error != EINTR &&
1963 	    !NFS4_FRC_UNMT_ERR(e.error, mi->mi_vfsp) &&
1964 	    (e.error != 0 || e.stat != NFS4_OK)) {
1965 		nfs4_recov_fh_fail(vp, e.error, e.stat);
1966 		/*
1967 		 * Don't set r_error to ESTALE. Higher-level code (e.g.,
1968 		 * cstatat_getvp()) retries on ESTALE, which would cause
1969 		 * an infinite loop.
1970 		 */
1971 	}
1972 norec:
1973 	mutex_enter(&rp->r_statelock);
1974 	rp->r_flags &= ~R4RECEXPFH;
1975 	cv_broadcast(&rp->r_cv);
1976 	mutex_exit(&rp->r_statelock);
1977 }
1978 
1979 /*
1980  * Stale Filehandle
1981  */
1982 
1983 /*
1984  * A stale filehandle can happen when an individual file has
1985  * been removed, or when an entire filesystem has been taken
1986  * offline.  To distinguish these cases, we do this:
1987  * - if a GETATTR with the current filehandle is okay, we do
1988  *   nothing (this can happen with two-filehandle ops)
1989  * - if the GETATTR fails, but a GETATTR of the root filehandle
1990  *   succeeds, mark the rnode with R4STALE, which will stop use
1991  * - if the GETATTR fails, and a GETATTR of the root filehandle
1992  *   also fails, we consider the problem filesystem-wide, so:
1993  *   - if we can failover, we should
1994  *   - if we can't failover, we should mark both the original
1995  *     vnode and the root bad
1996  */
1997 static void
1998 recov_stale(mntinfo4_t *mi, vnode_t *vp)
1999 {
2000 	rnode4_t *rp = VTOR4(vp);
2001 	vnode_t *rootvp = NULL;
2002 	nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };
2003 	nfs4_ga_res_t gar;
2004 	char *fail_msg = "failed to recover from NFS4ERR_STALE";
2005 	bool_t needrecov;
2006 
2007 	mutex_enter(&rp->r_statelock);
2008 
2009 	if (rp->r_flags & R4RECOVERR) {
2010 		mutex_exit(&rp->r_statelock);
2011 		NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
2012 		    "recov_stale: already marked dead, rp %s",
2013 		    rnode4info(rp)));
2014 		return;
2015 	}
2016 
2017 	if (rp->r_flags & R4STALE) {
2018 		mutex_exit(&rp->r_statelock);
2019 		NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
2020 		    "recov_stale: already marked stale, rp %s",
2021 		    rnode4info(rp)));
2022 		return;
2023 	}
2024 
2025 	mutex_exit(&rp->r_statelock);
2026 
2027 	/* Try a GETATTR on this vnode */
2028 	nfs4_getattr_otw_norecovery(vp, &gar, &e, CRED(), 0);
2029 
2030 	/*
2031 	 * Handle non-STALE recoverable errors
2032 	 */
2033 	needrecov = nfs4_needs_recovery(&e, FALSE, vp->v_vfsp);
2034 	if (needrecov) {
2035 		if (e.error == 0) {
2036 			switch (e.stat) {
2037 			case NFS4ERR_STALE:
2038 			case NFS4ERR_BADHANDLE:
2039 				goto norec;	/* Unrecoverable */
2040 			default:
2041 				break;
2042 			}
2043 		}
2044 		(void) nfs4_start_recovery(&e, mi, vp, NULL,
2045 		    NULL, NULL, OP_GETATTR, NULL, NULL, NULL);
2046 		goto out;
2047 	}
2048 norec:
2049 	/* Are things OK for this vnode? */
2050 	if (!e.error && e.stat == NFS4_OK) {
2051 		NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
2052 		    "recov_stale: file appears fine, rp %s",
2053 		    rnode4info(rp)));
2054 		goto out;
2055 	}
2056 
2057 	/* Did we get an unrelated non-recoverable error? */
2058 	if (e.error || e.stat != NFS4ERR_STALE) {
2059 		nfs4_fail_recov(vp, fail_msg, e.error, e.stat);
2060 		NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
2061 		    "recov_stale: unrelated fatal error, rp %s",
2062 		    rnode4info(rp)));
2063 		goto out;
2064 	}
2065 
2066 	/*
2067 	 * If we don't appear to be dealing with the root node, find it.
2068 	 */
2069 	if ((vp->v_flag & VROOT) == 0) {
2070 		nfs4_error_zinit(&e);
2071 		e.error = VFS_ROOT(vp->v_vfsp, &rootvp);
2072 		if (e.error) {
2073 			nfs4_fail_recov(vp, fail_msg, 0, NFS4ERR_STALE);
2074 			NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
2075 			    "recov_stale: can't find root node for rp %s",
2076 			    rnode4info(rp)));
2077 			goto out;
2078 		}
2079 	}
2080 
2081 	/* Try a GETATTR on the root vnode */
2082 	if (rootvp != NULL) {
2083 		nfs4_error_zinit(&e);
2084 		nfs4_getattr_otw_norecovery(rootvp, &gar, &e, CRED(), 0);
2085 
2086 		needrecov = nfs4_needs_recovery(&e, FALSE, vp->v_vfsp);
2087 		if (needrecov) {
2088 			if (e.error == 0) {
2089 				switch (e.stat) {
2090 				case NFS4ERR_STALE:
2091 				case NFS4ERR_BADHANDLE:
2092 					goto unrec;	/* Unrecoverable */
2093 				default:
2094 					break;
2095 				}
2096 			}
2097 			(void) nfs4_start_recovery(&e, mi, rootvp, NULL,
2098 			    NULL, NULL, OP_GETATTR, NULL, NULL, NULL);
2099 		}
2100 unrec:
2101 		/*
2102 		 * Check to see if a failover attempt is warranted
2103 		 * NB: nfs4_try_failover doesn't check for STALE
2104 		 * because recov_stale gets a shot first.  Now that
2105 		 * recov_stale has failed, go ahead and try failover.
2106 		 *
2107 		 * If the getattr on the root filehandle was successful,
2108 		 * then mark recovery as failed for 'vp' and exit.
2109 		 */
2110 		if (nfs4_try_failover(&e) == 0 && e.stat != NFS4ERR_STALE) {
2111 			/*
2112 			 * pass the original error to fail_recov, not
2113 			 * the one from trying the root vnode.
2114 			 */
2115 			nfs4_fail_recov(vp, fail_msg, 0, NFS4ERR_STALE);
2116 			NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
2117 			    "recov_stale: root node OK, marking "
2118 			    "dead rp %s", rnode4info(rp)));
2119 			goto out;
2120 		}
2121 	}
2122 
2123 	/*
2124 	 * Here, we know that both the original file and the
2125 	 * root filehandle (which may be the same) are stale.
2126 	 * We want to fail over if we can, and if we can't, we
2127 	 * want to mark everything in sight bad.
2128 	 */
2129 	if (FAILOVER_MOUNT4(mi)) {
2130 		mutex_enter(&mi->mi_lock);
2131 		mi->mi_recovflags |= MI4R_NEED_NEW_SERVER;
2132 		NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
2133 		    "recov_stale: failing over due to rp %s",
2134 		    rnode4info(rp)));
2135 		mutex_exit(&mi->mi_lock);
2136 	} else {
2137 		rnode4_t *rootrp;
2138 		servinfo4_t *svp;
2139 
2140 		/*
2141 		 * Can't fail over, so mark things dead.
2142 		 *
2143 		 * If rootvp is set, we know we have a distinct
2144 		 * non-root vnode which can be marked dead in
2145 		 * the usual way.
2146 		 *
2147 		 * Then we want to mark the root vnode dead.
2148 		 * Note that if rootvp wasn't set, our vp is
2149 		 * actually the root vnode.
2150 		 */
2151 		if (rootvp != NULL) {
2152 			NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
2153 			    "recov_stale: can't fail over, marking dead rp %s",
2154 			    rnode4info(rp)));
2155 			nfs4_fail_recov(vp, fail_msg, 0, NFS4ERR_STALE);
2156 		} else {
2157 			rootvp = vp;
2158 			VN_HOLD(rootvp);
2159 		}
2160 
2161 		/*
2162 		 * Mark root dead, but quietly - since
2163 		 * the root rnode is frequently recreated,
2164 		 * we can encounter this at every access.
2165 		 * Also mark recovery as failed on this VFS.
2166 		 */
2167 		rootrp = VTOR4(rootvp);
2168 		NFS4_DEBUG(nfs4_client_recov_debug, (CE_CONT,
2169 		    "recov_stale: marking dead root rp %s",
2170 		    rnode4info(rootrp)));
2171 		mutex_enter(&rootrp->r_statelock);
2172 		rootrp->r_flags |= (R4RECOVERR | R4STALE);
2173 		rootrp->r_error = ESTALE;
2174 		mutex_exit(&rootrp->r_statelock);
2175 		mutex_enter(&mi->mi_lock);
2176 		mi->mi_error = ESTALE;
2177 		mutex_exit(&mi->mi_lock);
2178 
2179 		svp = mi->mi_curr_serv;
2180 		(void) nfs_rw_enter_sig(&svp->sv_lock, RW_WRITER, 0);
2181 		svp->sv_flags |= SV4_ROOT_STALE;
2182 		nfs_rw_exit(&svp->sv_lock);
2183 	}
2184 
2185 out:
2186 	if (rootvp)
2187 		VN_RELE(rootvp);
2188 }
2189 
2190 /*
2191  * Locks.
2192  */
2193 
2194 /*
2195  * Reclaim all the active (acquired) locks for the given file.
2196  * If a process lost a lock, the process is sent a SIGLOST.  This is not
2197  * considered an error.
2198  *
2199  * Return values:
2200  * Errors and status are returned via the nfs4_error_t parameter
2201  * If an error indicates that recovery is needed, the caller is responsible
2202  * for dealing with it.
2203  */
2204 
2205 static void
2206 relock_file(vnode_t *vp, mntinfo4_t *mi, nfs4_error_t *ep,
2207     fattr4_change pre_change)
2208 {
2209 	locklist_t *locks, *llp;
2210 	rnode4_t *rp;
2211 
2212 	ASSERT(ep != NULL);
2213 	nfs4_error_zinit(ep);
2214 
2215 	if (VTOMI4(vp)->mi_flags & MI4_LLOCK)
2216 		return;
2217 
2218 	nfs4_flush_lock_owners(VTOR4(vp));
2219 
2220 	/*
2221 	 * If we get an error that requires recovery actions, just bail out
2222 	 * and let the top-level recovery code handle it.
2223 	 *
2224 	 * If we get some other error, kill the process that owned the lock
2225 	 * and mark its remaining locks (if any) as belonging to NOPID, so
2226 	 * that we don't make any more reclaim requests for that process.
2227 	 */
2228 
2229 	rp = VTOR4(vp);
2230 	locks = flk_active_locks_for_vp(vp);
2231 	for (llp = locks; llp != NULL; llp = llp->ll_next) {
2232 		int did_reclaim = 1;
2233 
2234 		ASSERT(llp->ll_vp == vp);
2235 		if (llp->ll_flock.l_pid == NOPID)
2236 			continue;
2237 		reclaim_one_lock(vp, &llp->ll_flock, ep, &did_reclaim);
2238 		/*
2239 		 * If we need to restart recovery, stop processing the
2240 		 * list.  Some errors would be recoverable under other
2241 		 * circumstances, but if they happen here we just give up
2242 		 * on the lock.
2243 		 */
2244 		if (nfs4_needs_recovery(ep, TRUE, vp->v_vfsp)) {
2245 			if (ep->error != 0)
2246 				break;
2247 			if (!nfs4_recov_marks_dead(ep->stat))
2248 				break;
2249 		}
2250 		/*
2251 		 *   In case the server isn't offering us a grace period, or
2252 		 * if we missed it, we might have opened & locked from scratch,
2253 		 * rather than reopened/reclaimed.
2254 		 *   We need to ensure that the object hadn't been otherwise
2255 		 * changed during this time, by comparing the changeinfo.
2256 		 *   We get passed the changeinfo from before the reopen by our
2257 		 * caller, in pre_change.
2258 		 *   The changeinfo from after the reopen is in rp->r_change,
2259 		 * courtesy of the GETATTR in the reopen.
2260 		 *   If they're different, then the file has changed, and we
2261 		 * have to SIGLOST the app.
2262 		 */
2263 		if (ep->error == 0 && ep->stat == NFS4_OK && !did_reclaim) {
2264 			mutex_enter(&rp->r_statelock);
2265 			if (pre_change != rp->r_change)
2266 				ep->stat = NFS4ERR_NO_GRACE;
2267 			mutex_exit(&rp->r_statelock);
2268 		}
2269 		if (ep->error != 0 || ep->stat != NFS4_OK) {
2270 			if (ep->error != 0)
2271 				nfs4_queue_event(RE_FAIL_RELOCK, mi,
2272 				    NULL, ep->error, vp, NULL, 0, NULL,
2273 				    llp->ll_flock.l_pid, TAG_NONE, TAG_NONE,
2274 				    0, 0);
2275 			else
2276 				nfs4_queue_event(RE_FAIL_RELOCK, mi,
2277 				    NULL, 0, vp, NULL, ep->stat, NULL,
2278 				    llp->ll_flock.l_pid, TAG_NONE, TAG_NONE,
2279 				    0, 0);
2280 			nfs4_send_siglost(llp->ll_flock.l_pid, mi, vp, TRUE,
2281 			    ep->error, ep->stat);
2282 			relock_skip_pid(vp, llp, llp->ll_flock.l_pid);
2283 
2284 			/* Reinitialize the nfs4_error and continue */
2285 			nfs4_error_zinit(ep);
2286 		}
2287 	}
2288 
2289 	if (locks != NULL)
2290 		flk_free_locklist(locks);
2291 }
2292 
2293 /*
2294  * Reclaim the given lock.
2295  *
2296  * Errors are returned via the nfs4_error_t parameter.
2297  */
2298 static void
2299 reclaim_one_lock(vnode_t *vp, flock64_t *flk, nfs4_error_t *ep,
2300     int *did_reclaimp)
2301 {
2302 	cred_t *cr;
2303 	rnode4_t *rp = VTOR4(vp);
2304 
2305 	cr = pid_to_cr(flk->l_pid);
2306 	if (cr == NULL) {
2307 		nfs4_error_init(ep, ESRCH);
2308 		return;
2309 	}
2310 
2311 	do {
2312 		mutex_enter(&rp->r_statelock);
2313 		if (rp->r_flags & R4RECOVERR) {
2314 			mutex_exit(&rp->r_statelock);
2315 			nfs4_error_init(ep, ESTALE);
2316 			break;
2317 		}
2318 		mutex_exit(&rp->r_statelock);
2319 
2320 		nfs4frlock(NFS4_LCK_CTYPE_RECLAIM, vp, F_SETLK, flk, cr, ep,
2321 		    NULL, did_reclaimp);
2322 		if (ep->error == 0 && ep->stat == NFS4ERR_FHEXPIRED)
2323 			start_recovery_action(NR_FHEXPIRED, TRUE, VTOMI4(vp),
2324 			    vp, NULL);
2325 	} while (ep->error == 0 && ep->stat == NFS4ERR_FHEXPIRED);
2326 
2327 	crfree(cr);
2328 }
2329 
2330 /*
2331  * Open files.
2332  */
2333 
2334 /*
2335  * Verifies if the nfsstat4 is a valid error for marking this vnode dead.
2336  * Returns 1 if the error is valid; 0 otherwise.
2337  */
2338 static int
2339 nfs4_valid_recov_err_for_vp(vnode_t *vp, nfsstat4 stat)
2340 {
2341 	/*
2342 	 * We should not be marking non-regular files as dead,
2343 	 * except in very rare cases (eg: BADHANDLE or NFS4ERR_BADNAME).
2344 	 */
2345 	if (vp->v_type != VREG && stat != NFS4ERR_BADHANDLE &&
2346 	    stat != NFS4ERR_BADNAME)
2347 		return (0);
2348 
2349 	return (1);
2350 }
2351 
2352 /*
2353  * Failed attempting to recover a filehandle.  If 'stat' is valid for 'vp',
2354  * then mark the object dead.  Since we've had to do a lookup for
2355  * filehandle recovery, we will mark the object dead if we got NOENT.
2356  */
2357 static void
2358 nfs4_recov_fh_fail(vnode_t *vp, int error, nfsstat4 stat)
2359 {
2360 	ASSERT(vp != NULL);
2361 
2362 	if ((error == 0) && (stat != NFS4ERR_NOENT) &&
2363 	    (!nfs4_valid_recov_err_for_vp(vp, stat)))
2364 		return;
2365 
2366 	nfs4_fail_recov(vp, "can't recover filehandle", error, stat);
2367 }
2368 
2369 /*
2370  * Recovery from a "shouldn't happen" error.  In the long term, we'd like
2371  * to mark only the data structure(s) that provided the bad value as being
2372  * bad.  But for now we'll just mark the entire file.
2373  */
2374 
2375 static void
2376 recov_badstate(recov_info_t *recovp, vnode_t *vp, nfsstat4 stat)
2377 {
2378 	ASSERT(vp != NULL);
2379 	recov_throttle(recovp, vp);
2380 
2381 	if (!nfs4_valid_recov_err_for_vp(vp, stat))
2382 		return;
2383 
2384 	nfs4_fail_recov(vp, "", 0, stat);
2385 }
2386 
2387 /*
2388  * Free up the information saved for a lost state request.
2389  */
2390 static void
2391 nfs4_free_lost_rqst(nfs4_lost_rqst_t *lrp, nfs4_server_t *sp)
2392 {
2393 	component4 *filep;
2394 	nfs4_open_stream_t *osp;
2395 	int have_sync_lock;
2396 
2397 	NFS4_DEBUG(nfs4_lost_rqst_debug,
2398 	    (CE_NOTE, "nfs4_free_lost_rqst:"));
2399 
2400 	switch (lrp->lr_op) {
2401 	case OP_OPEN:
2402 		filep = &lrp->lr_ofile;
2403 		if (filep->utf8string_val) {
2404 			kmem_free(filep->utf8string_val, filep->utf8string_len);
2405 			filep->utf8string_val = NULL;
2406 		}
2407 		break;
2408 	case OP_DELEGRETURN:
2409 		nfs4delegreturn_cleanup(VTOR4(lrp->lr_vp), sp);
2410 		break;
2411 	case OP_CLOSE:
2412 		osp = lrp->lr_osp;
2413 		ASSERT(osp != NULL);
2414 		mutex_enter(&osp->os_sync_lock);
2415 		have_sync_lock = 1;
2416 		if (osp->os_pending_close) {
2417 			/* clean up the open file state. */
2418 			osp->os_pending_close = 0;
2419 			nfs4close_notw(lrp->lr_vp, osp, &have_sync_lock);
2420 		}
2421 		if (have_sync_lock)
2422 			mutex_exit(&osp->os_sync_lock);
2423 		break;
2424 	}
2425 
2426 	lrp->lr_op = 0;
2427 	if (lrp->lr_oop != NULL) {
2428 		open_owner_rele(lrp->lr_oop);
2429 		lrp->lr_oop = NULL;
2430 	}
2431 	if (lrp->lr_osp != NULL) {
2432 		open_stream_rele(lrp->lr_osp, VTOR4(lrp->lr_vp));
2433 		lrp->lr_osp = NULL;
2434 	}
2435 	if (lrp->lr_lop != NULL) {
2436 		lock_owner_rele(lrp->lr_lop);
2437 		lrp->lr_lop = NULL;
2438 	}
2439 	if (lrp->lr_flk != NULL) {
2440 		kmem_free(lrp->lr_flk, sizeof (flock64_t));
2441 		lrp->lr_flk = NULL;
2442 	}
2443 	if (lrp->lr_vp != NULL) {
2444 		VN_RELE(lrp->lr_vp);
2445 		lrp->lr_vp = NULL;
2446 	}
2447 	if (lrp->lr_dvp != NULL) {
2448 		VN_RELE(lrp->lr_dvp);
2449 		lrp->lr_dvp = NULL;
2450 	}
2451 	if (lrp->lr_cr != NULL) {
2452 		crfree(lrp->lr_cr);
2453 		lrp->lr_cr = NULL;
2454 	}
2455 
2456 	kmem_free(lrp, sizeof (nfs4_lost_rqst_t));
2457 }
2458 
2459 /*
2460  * Remove any lost state requests and free them.
2461  */
2462 static void
2463 nfs4_remove_lost_rqsts(mntinfo4_t *mi, nfs4_server_t *sp)
2464 {
2465 	nfs4_lost_rqst_t *lrp;
2466 
2467 	mutex_enter(&mi->mi_lock);
2468 	while ((lrp = list_head(&mi->mi_lost_state)) != NULL) {
2469 		list_remove(&mi->mi_lost_state, lrp);
2470 		mutex_exit(&mi->mi_lock);
2471 		nfs4_free_lost_rqst(lrp, sp);
2472 		mutex_enter(&mi->mi_lock);
2473 	}
2474 	mutex_exit(&mi->mi_lock);
2475 }
2476 
2477 /*
2478  * Reopen all the files for the given filesystem and reclaim any locks.
2479  */
2480 
2481 static void
2482 recov_openfiles(recov_info_t *recovp, nfs4_server_t *sp)
2483 {
2484 	mntinfo4_t *mi = recovp->rc_mi;
2485 	nfs4_opinst_t *reopenlist = NULL, *rep;
2486 	nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };
2487 	open_claim_type4 claim;
2488 	int remap;
2489 	char *fail_msg = "No such file or directory on replica";
2490 	rnode4_t *rp;
2491 	fattr4_change pre_change;
2492 
2493 	ASSERT(sp != NULL);
2494 
2495 	/*
2496 	 * This check is to allow a 10ms pause before we reopen files
2497 	 * it should allow the server time to have received the CB_NULL
2498 	 * reply and update its internal structures such that (if
2499 	 * applicable) we are granted a delegation on reopened files.
2500 	 */
2501 	mutex_enter(&sp->s_lock);
2502 	if ((sp->s_flags & (N4S_CB_PINGED | N4S_CB_WAITER)) == 0) {
2503 		sp->s_flags |= N4S_CB_WAITER;
2504 		(void) cv_reltimedwait(&sp->wait_cb_null, &sp->s_lock,
2505 		    drv_usectohz(N4S_CB_PAUSE_TIME), TR_CLOCK_TICK);
2506 	}
2507 	mutex_exit(&sp->s_lock);
2508 
2509 	(void) nfs_rw_enter_sig(&sp->s_recovlock, RW_READER, 0);
2510 	(void) nfs_rw_enter_sig(&mi->mi_recovlock, RW_WRITER, 0);
2511 
2512 	if (NFS4_VOLATILE_FH(mi)) {
2513 		nfs4_remap_root(mi, &e, 0);
2514 		if (nfs4_needs_recovery(&e, FALSE, mi->mi_vfsp)) {
2515 			(void) nfs4_start_recovery(&e, mi, NULL,
2516 			    NULL, NULL, NULL, OP_LOOKUP, NULL, NULL, NULL);
2517 		}
2518 	}
2519 
2520 	mutex_enter(&mi->mi_lock);
2521 	if (recovp->rc_srv_reboot || (mi->mi_recovflags & MI4R_SRV_REBOOT))
2522 		claim = CLAIM_PREVIOUS;
2523 	else
2524 		claim = CLAIM_NULL;
2525 	mutex_exit(&mi->mi_lock);
2526 
2527 	if (e.error == 0 && e.stat == NFS4_OK) {
2528 		/*
2529 		 * Get a snapshot of open files in the filesystem.  Note
2530 		 * that new opens will stall until the server's grace
2531 		 * period is done.
2532 		 */
2533 		reopenlist = r4mkopenlist(mi);
2534 
2535 		mutex_enter(&mi->mi_lock);
2536 		remap = mi->mi_recovflags & MI4R_REMAP_FILES;
2537 		mutex_exit(&mi->mi_lock);
2538 		/*
2539 		 * Since we are re-establishing state on the
2540 		 * server, its ok to blow away the saved lost
2541 		 * requests since we don't need to reissue it.
2542 		 */
2543 		nfs4_remove_lost_rqsts(mi, sp);
2544 
2545 		for (rep = reopenlist; rep; rep = rep->re_next) {
2546 
2547 			if (remap) {
2548 				nfs4_remap_file(mi, rep->re_vp,
2549 				    NFS4_REMAP_CKATTRS, &e);
2550 			}
2551 			DTRACE_PROBE2(recov__openfiles, nfs4_error_t, &e,
2552 			    vnode_t, rep->re_vp);
2553 			if (e.error == ENOENT || e.stat == NFS4ERR_NOENT) {
2554 				/*
2555 				 * The current server does not have the file
2556 				 * that is to be remapped.  This is most
2557 				 * likely due to an improperly maintained
2558 				 * replica.   The files that are missing from
2559 				 * the server will be marked dead and logged
2560 				 * in order to make sys admins aware of the
2561 				 * problem.
2562 				 */
2563 				nfs4_fail_recov(rep->re_vp,
2564 				    fail_msg, e.error, e.stat);
2565 				/*
2566 				 * We've already handled the error so clear it.
2567 				 */
2568 				nfs4_error_zinit(&e);
2569 				continue;
2570 			} else if (e.error == 0 && e.stat == NFS4_OK) {
2571 				int j;
2572 
2573 				rp = VTOR4(rep->re_vp);
2574 				mutex_enter(&rp->r_statelock);
2575 				pre_change = rp->r_change;
2576 				mutex_exit(&rp->r_statelock);
2577 
2578 				for (j = 0; j < rep->re_numosp; j++) {
2579 					nfs4_reopen(rep->re_vp, rep->re_osp[j],
2580 					    &e, claim, FALSE, TRUE);
2581 					if (e.error != 0 || e.stat != NFS4_OK)
2582 						break;
2583 				}
2584 				if (nfs4_needs_recovery(&e, TRUE,
2585 				    mi->mi_vfsp)) {
2586 					(void) nfs4_start_recovery(&e, mi,
2587 					    rep->re_vp, NULL, NULL, NULL,
2588 					    OP_OPEN, NULL, NULL, NULL);
2589 					break;
2590 				}
2591 			}
2592 #ifdef DEBUG
2593 			if (nfs4_recovdelay > 0)
2594 				delay(MSEC_TO_TICK(nfs4_recovdelay * 1000));
2595 #endif
2596 			if (e.error == 0 && e.stat == NFS4_OK) {
2597 				relock_file(rep->re_vp, mi, &e, pre_change);
2598 
2599 				if (nfs4_needs_recovery(&e, TRUE, mi->mi_vfsp))
2600 					(void) nfs4_start_recovery(&e, mi,
2601 					    rep->re_vp, NULL, NULL, NULL,
2602 					    OP_LOCK, NULL, NULL, NULL);
2603 			}
2604 
2605 			if (e.error != 0 || e.stat != NFS4_OK)
2606 				break;
2607 		}
2608 
2609 		/*
2610 		 * Check to see if we need to remap files passed in
2611 		 * via the recovery arguments; this will have been
2612 		 * done for open files.  A failure here is not fatal.
2613 		 */
2614 		if (remap) {
2615 			nfs4_error_t ignore;
2616 			nfs4_check_remap(mi, recovp->rc_vp1, NFS4_REMAP_CKATTRS,
2617 			    &ignore);
2618 			nfs4_check_remap(mi, recovp->rc_vp2, NFS4_REMAP_CKATTRS,
2619 			    &ignore);
2620 		}
2621 	}
2622 
2623 	if (e.error == 0 && e.stat == NFS4_OK) {
2624 		mutex_enter(&mi->mi_lock);
2625 		mi->mi_recovflags &= ~(MI4R_REOPEN_FILES | MI4R_REMAP_FILES);
2626 		mutex_exit(&mi->mi_lock);
2627 	}
2628 
2629 	nfs_rw_exit(&mi->mi_recovlock);
2630 	nfs_rw_exit(&sp->s_recovlock);
2631 
2632 	if (reopenlist != NULL)
2633 		r4releopenlist(reopenlist);
2634 }
2635 
2636 /*
2637  * Resend the queued state recovery requests in "rqsts".
2638  */
2639 
2640 static void
2641 nfs4_resend_lost_rqsts(recov_info_t *recovp, nfs4_server_t *sp)
2642 {
2643 	nfs4_lost_rqst_t	*lrp, *tlrp;
2644 	mntinfo4_t		*mi = recovp->rc_mi;
2645 	nfs4_error_t		n4e;
2646 #ifdef NOTYET
2647 	uint32_t		deny_bits = 0;
2648 #endif
2649 
2650 	NFS4_DEBUG(nfs4_lost_rqst_debug, (CE_NOTE, "nfs4_resend_lost_rqsts"));
2651 
2652 	ASSERT(mi != NULL);
2653 	ASSERT(nfs_rw_lock_held(&mi->mi_recovlock, RW_WRITER));
2654 
2655 	mutex_enter(&mi->mi_lock);
2656 	lrp = list_head(&mi->mi_lost_state);
2657 	mutex_exit(&mi->mi_lock);
2658 	while (lrp != NULL) {
2659 		nfs4_error_zinit(&n4e);
2660 		resend_one_op(lrp, &n4e, mi, sp);
2661 		NFS4_DEBUG(nfs4_lost_rqst_debug, (CE_NOTE,
2662 		    "nfs4_resend_lost_rqsts: resend request: for vp %p got "
2663 		    "error %d stat %d", (void *)lrp->lr_vp, n4e.error,
2664 		    n4e.stat));
2665 
2666 		/*
2667 		 * If we get a recovery error that we can actually
2668 		 * recover from (such as ETIMEDOUT, FHEXPIRED), we
2669 		 * return and let the recovery thread redrive the call.
2670 		 * Don't requeue unless the zone is still healthy.
2671 		 */
2672 		if (zone_status_get(curproc->p_zone) < ZONE_IS_SHUTTING_DOWN &&
2673 		    nfs4_needs_recovery(&n4e, TRUE, mi->mi_vfsp) &&
2674 		    (nfs4_try_failover(&n4e) ||
2675 		    NFS4_FRC_UNMT_ERR(n4e.error, mi->mi_vfsp) ||
2676 		    (n4e.error == 0 && n4e.stat != NFS4ERR_BADHANDLE &&
2677 		    !nfs4_recov_marks_dead(n4e.stat)))) {
2678 			/*
2679 			 * For these three errors, we want to delay a bit
2680 			 * instead of pounding the server into submission.
2681 			 * We have to do this manually; the normal
2682 			 * processing for these errors only works for
2683 			 * non-recovery requests.
2684 			 */
2685 			if ((n4e.error == 0 && n4e.stat == NFS4ERR_DELAY) ||
2686 			    (n4e.error == 0 && n4e.stat == NFS4ERR_GRACE) ||
2687 			    (n4e.error == 0 && n4e.stat == NFS4ERR_RESOURCE) ||
2688 			    NFS4_FRC_UNMT_ERR(n4e.error, mi->mi_vfsp)) {
2689 				delay(SEC_TO_TICK(nfs4err_delay_time));
2690 			} else {
2691 				(void) nfs4_start_recovery(&n4e,
2692 				    mi, lrp->lr_dvp, lrp->lr_vp, NULL, NULL,
2693 				    lrp->lr_op, NULL, NULL, NULL);
2694 			}
2695 			return;
2696 		}
2697 
2698 		mutex_enter(&mi->mi_lock);
2699 		list_remove(&mi->mi_lost_state, lrp);
2700 		tlrp = lrp;
2701 		lrp = list_head(&mi->mi_lost_state);
2702 		mutex_exit(&mi->mi_lock);
2703 		nfs4_free_lost_rqst(tlrp, sp);
2704 	}
2705 }
2706 
2707 /*
2708  * Resend the given op, and issue any necessary undo call.
2709  * errors are returned via the nfs4_error_t parameter.
2710  */
2711 
2712 static void
2713 resend_one_op(nfs4_lost_rqst_t *lrp, nfs4_error_t *ep,
2714     mntinfo4_t *mi, nfs4_server_t *sp)
2715 {
2716 	vnode_t *vp;
2717 	nfs4_open_stream_t *osp;
2718 	cred_t *cr;
2719 	uint32_t acc_bits;
2720 
2721 	vp = lrp->lr_vp;
2722 	NFS4_DEBUG(nfs4_lost_rqst_debug, (CE_NOTE, "resend_one_op: "
2723 	    "have a lost open/close request for vp %p", (void *)vp));
2724 
2725 	switch (lrp->lr_op) {
2726 	case OP_OPEN:
2727 		nfs4_resend_open_otw(&vp, lrp, ep);
2728 		break;
2729 	case OP_OPEN_DOWNGRADE:
2730 		ASSERT(lrp->lr_oop != NULL);
2731 		ep->error = nfs4_start_open_seqid_sync(lrp->lr_oop, mi);
2732 		ASSERT(!ep->error);	/* recov thread always succeeds */
2733 		ASSERT(lrp->lr_osp != NULL);
2734 		mutex_enter(&lrp->lr_osp->os_sync_lock);
2735 		nfs4_open_downgrade(lrp->lr_dg_acc, lrp->lr_dg_deny,
2736 		    lrp->lr_oop, lrp->lr_osp, vp, lrp->lr_cr, lrp,
2737 		    ep, NULL, NULL);
2738 		mutex_exit(&lrp->lr_osp->os_sync_lock);
2739 		nfs4_end_open_seqid_sync(lrp->lr_oop);
2740 		break;
2741 	case OP_CLOSE:
2742 		osp = lrp->lr_osp;
2743 		cr = lrp->lr_cr;
2744 		acc_bits = 0;
2745 		mutex_enter(&osp->os_sync_lock);
2746 		if (osp->os_share_acc_read)
2747 			acc_bits |= OPEN4_SHARE_ACCESS_READ;
2748 		if (osp->os_share_acc_write)
2749 			acc_bits |= OPEN4_SHARE_ACCESS_WRITE;
2750 		mutex_exit(&osp->os_sync_lock);
2751 		nfs4close_one(vp, osp, cr, acc_bits, lrp, ep,
2752 		    CLOSE_RESEND, 0, 0, 0);
2753 		break;
2754 	case OP_LOCK:
2755 	case OP_LOCKU:
2756 		resend_lock(lrp, ep);
2757 		goto done;
2758 	case OP_DELEGRETURN:
2759 		nfs4_resend_delegreturn(lrp, ep, sp);
2760 		goto done;
2761 	default:
2762 #ifdef DEBUG
2763 		cmn_err(CE_PANIC, "resend_one_op: unexpected op: %d",
2764 		    lrp->lr_op);
2765 #endif
2766 		nfs4_queue_event(RE_LOST_STATE_BAD_OP, mi, NULL,
2767 		    lrp->lr_op, lrp->lr_vp, lrp->lr_dvp, NFS4_OK, NULL, 0,
2768 		    TAG_NONE, TAG_NONE, 0, 0);
2769 		nfs4_error_init(ep, EINVAL);
2770 		return;
2771 	}
2772 
2773 	/*
2774 	 * No need to retry nor send an "undo" CLOSE in the
2775 	 * event the server rebooted.
2776 	 */
2777 	if (ep->error == 0 && (ep->stat == NFS4ERR_STALE_CLIENTID ||
2778 	    ep->stat == NFS4ERR_STALE_STATEID || ep->stat == NFS4ERR_EXPIRED))
2779 		goto done;
2780 
2781 	/*
2782 	 * If we resent a CLOSE or OPEN_DOWNGRADE, there's nothing
2783 	 * to undo.  Undoing locking operations was handled by
2784 	 * resend_lock().
2785 	 */
2786 	if (lrp->lr_op == OP_OPEN_DOWNGRADE || lrp->lr_op == OP_CLOSE)
2787 		goto done;
2788 
2789 	/*
2790 	 * If we get any other error for OPEN, then don't attempt
2791 	 * to undo the resend of the open (since it was never
2792 	 * successful!).
2793 	 */
2794 	ASSERT(lrp->lr_op == OP_OPEN);
2795 	if (ep->error || ep->stat != NFS4_OK)
2796 		goto done;
2797 
2798 	/*
2799 	 * Now let's undo our OPEN.
2800 	 */
2801 	nfs4_error_zinit(ep);
2802 	close_after_open_resend(vp, lrp->lr_cr, lrp->lr_oacc, ep);
2803 	NFS4_DEBUG(nfs4_lost_rqst_debug, (CE_NOTE, "resend_one_op: "
2804 	    "nfs4close_one: for vp %p got error %d stat %d",
2805 	    (void *)vp, ep->error, ep->stat));
2806 
2807 done:
2808 	if (vp != lrp->lr_vp)
2809 		VN_RELE(vp);
2810 }
2811 
2812 /*
2813  * Close a file that was opened via a resent OPEN.
2814  * Most errors are passed back to the caller (via the return value and
2815  * *statp), except for FHEXPIRED, which is retried.
2816  *
2817  * It might be conceptually cleaner to push the CLOSE request onto the
2818  * front of the resend queue, rather than sending it here.  That would
2819  * match the way we undo lost lock requests.  On the other
2820  * hand, we've already got something that works, and there's no reason to
2821  * change it at this time.
2822  */
2823 
2824 static void
2825 close_after_open_resend(vnode_t *vp, cred_t *cr, uint32_t acc_bits,
2826     nfs4_error_t *ep)
2827 {
2828 
2829 	for (;;) {
2830 		nfs4close_one(vp, NULL, cr, acc_bits, NULL, ep,
2831 		    CLOSE_AFTER_RESEND, 0, 0, 0);
2832 		if (ep->error == 0 && ep->stat == NFS4_OK)
2833 			break;		/* success; done */
2834 		if (ep->error != 0 || ep->stat != NFS4ERR_FHEXPIRED)
2835 			break;
2836 		/* else retry FHEXPIRED */
2837 	}
2838 
2839 }
2840 
2841 /*
2842  * Resend the given lost lock request.  Return an errno value.  If zero,
2843  * *statp is set to the NFS status code for the call.
2844  *
2845  * Issue a SIGLOST and mark the rnode dead if we get a non-recovery error or
2846  * a recovery error that we don't actually recover from yet (eg: BAD_SEQID).
2847  * Let the recovery thread redrive the call if we get a recovery error that
2848  * we can actually recover from.
2849  */
2850 static void
2851 resend_lock(nfs4_lost_rqst_t *lrp, nfs4_error_t *ep)
2852 {
2853 	bool_t		send_siglost = FALSE;
2854 	vnode_t		*vp = lrp->lr_vp;
2855 
2856 	NFS4_DEBUG(nfs4_lost_rqst_debug, (CE_NOTE, "resend_lock:"));
2857 	ASSERT(lrp->lr_ctype == NFS4_LCK_CTYPE_REINSTATE ||
2858 	    lrp->lr_ctype == NFS4_LCK_CTYPE_RESEND);
2859 
2860 	nfs4frlock(lrp->lr_ctype, vp, F_SETLK, lrp->lr_flk, lrp->lr_cr, ep,
2861 	    lrp, NULL);
2862 
2863 	NFS4_DEBUG(nfs4_lost_rqst_debug, (CE_NOTE, "resend_lock: "
2864 	    "nfs4frlock for vp %p returned error %d, stat %d",
2865 	    (void *)vp, ep->error, ep->stat));
2866 
2867 	if (ep->error == 0 && ep->stat == 0)
2868 		goto done;
2869 	if (ep->error == 0 && ep->stat == NFS4ERR_DENIED &&
2870 	    lrp->lr_ctype == NFS4_LCK_CTYPE_RESEND)
2871 		goto done;
2872 
2873 	/*
2874 	 * If we failed with a non-recovery error, send SIGLOST and
2875 	 * mark the file dead.
2876 	 */
2877 	if (!nfs4_needs_recovery(ep, TRUE, vp->v_vfsp))
2878 		send_siglost = TRUE;
2879 	else {
2880 		/*
2881 		 * Done with recovering LOST LOCK in the event the
2882 		 * server rebooted or we've lost the lease.
2883 		 */
2884 		if (ep->error == 0 && (ep->stat == NFS4ERR_STALE_CLIENTID ||
2885 		    ep->stat == NFS4ERR_STALE_STATEID ||
2886 		    ep->stat == NFS4ERR_EXPIRED)) {
2887 			goto done;
2888 		}
2889 
2890 		/*
2891 		 * BAD_STATEID on an unlock indicates that the server has
2892 		 * forgotten about the lock anyway, so act like the call
2893 		 * was successful.
2894 		 */
2895 		if (ep->error == 0 && ep->stat == NFS4ERR_BAD_STATEID &&
2896 		    lrp->lr_op == OP_LOCKU)
2897 			goto done;
2898 
2899 		/*
2900 		 * If we got a recovery error that we don't actually
2901 		 * recover from, send SIGLOST.  If the filesystem was
2902 		 * forcibly unmounted, we skip the SIGLOST because (a) it's
2903 		 * unnecessary noise, and (b) there could be a new process
2904 		 * with the same pid as the one that had generated the lost
2905 		 * state request.
2906 		 */
2907 		if (ep->error == 0 && (ep->stat == NFS4ERR_BADHANDLE ||
2908 		    nfs4_recov_marks_dead(ep->stat))) {
2909 			if (!(vp->v_vfsp->vfs_flag & VFS_UNMOUNTED))
2910 				send_siglost = TRUE;
2911 			goto done;
2912 		}
2913 
2914 		/*
2915 		 * If the filesystem was forcibly unmounted, we
2916 		 * still need to synchronize with the server and
2917 		 * release state.  Try again later.
2918 		 */
2919 		if (NFS4_FRC_UNMT_ERR(ep->error, vp->v_vfsp))
2920 			goto done;
2921 
2922 		/*
2923 		 * If we get a recovery error that we can actually
2924 		 * recover from (such as ETIMEDOUT, FHEXPIRED),
2925 		 * return and let the recovery thread redrive the call.
2926 		 *
2927 		 * For the three errors below, we want to delay a bit
2928 		 * instead of pounding the server into submission.
2929 		 */
2930 		if ((ep->error == 0 && ep->stat == NFS4ERR_DELAY) ||
2931 		    (ep->error == 0 && ep->stat == NFS4ERR_GRACE) ||
2932 		    (ep->error == 0 && ep->stat == NFS4ERR_RESOURCE))
2933 			delay(SEC_TO_TICK(recov_err_delay));
2934 		goto done;
2935 	}
2936 
2937 done:
2938 	if (send_siglost) {
2939 		cred_t *sv_cred;
2940 
2941 		/*
2942 		 * Must be root or the actual thread being issued the
2943 		 * SIGLOST for this to work, so just become root.
2944 		 */
2945 		sv_cred = curthread->t_cred;
2946 		curthread->t_cred = kcred;
2947 		nfs4_send_siglost(lrp->lr_flk->l_pid, VTOMI4(vp), vp, FALSE,
2948 		    ep->error, ep->stat);
2949 		curthread->t_cred = sv_cred;
2950 
2951 		/*
2952 		 * Flush any additional reinstantiation requests for
2953 		 * this operation.  Sending multiple SIGLOSTs to the user
2954 		 * process is unlikely to help and may cause trouble.
2955 		 */
2956 		if (lrp->lr_ctype == NFS4_LCK_CTYPE_REINSTATE)
2957 			flush_reinstate(lrp);
2958 	}
2959 }
2960 
2961 /*
2962  * Remove any lock reinstantiation requests that correspond to the given
2963  * lost request.  We only remove items that follow lrp in the queue,
2964  * assuming that lrp will be removed by the generic lost state code.
2965  */
2966 
2967 static void
2968 flush_reinstate(nfs4_lost_rqst_t *lrp)
2969 {
2970 	vnode_t *vp;
2971 	pid_t pid;
2972 	mntinfo4_t *mi;
2973 	nfs4_lost_rqst_t *nlrp;
2974 
2975 	vp = lrp->lr_vp;
2976 	mi = VTOMI4(vp);
2977 	pid = lrp->lr_flk->l_pid;
2978 
2979 	/*
2980 	 * If there are any more reinstantation requests to get rid of,
2981 	 * they should all be clustered at the front of the lost state
2982 	 * queue.
2983 	 */
2984 	mutex_enter(&mi->mi_lock);
2985 	for (lrp = list_next(&mi->mi_lost_state, lrp); lrp != NULL;
2986 	    lrp = nlrp) {
2987 		nlrp = list_next(&mi->mi_lost_state, lrp);
2988 		if (lrp->lr_op != OP_LOCK && lrp->lr_op != OP_LOCKU)
2989 			break;
2990 		if (lrp->lr_ctype != NFS4_LCK_CTYPE_REINSTATE)
2991 			break;
2992 		ASSERT(lrp->lr_vp == vp);
2993 		ASSERT(lrp->lr_flk->l_pid == pid);
2994 		NFS4_DEBUG(nfs4_lost_rqst_debug, (CE_NOTE,
2995 		    "remove reinstantiation %p", (void *)lrp));
2996 		list_remove(&mi->mi_lost_state, lrp);
2997 		nfs4_free_lost_rqst(lrp, NULL);
2998 	}
2999 	mutex_exit(&mi->mi_lock);
3000 }
3001 
3002 /*
3003  * End of state-specific recovery routines.
3004  */
3005 
3006 /*
3007  * Allocate a lost request struct, initialize it from lost_rqstp (including
3008  * bumping the reference counts for the referenced vnode, etc.), and hang
3009  * it off of recovp.
3010  */
3011 
3012 static void
3013 nfs4_save_lost_rqst(nfs4_lost_rqst_t *lost_rqstp, recov_info_t *recovp,
3014     nfs4_recov_t *action, mntinfo4_t *mi)
3015 {
3016 	nfs4_lost_rqst_t *destp;
3017 
3018 	ASSERT(recovp->rc_lost_rqst == NULL);
3019 
3020 	destp = kmem_alloc(sizeof (nfs4_lost_rqst_t), KM_SLEEP);
3021 	recovp->rc_lost_rqst = destp;
3022 
3023 	if (lost_rqstp->lr_op == OP_LOCK ||
3024 	    lost_rqstp->lr_op == OP_LOCKU) {
3025 		ASSERT(lost_rqstp->lr_lop);
3026 		*action = NR_LOST_LOCK;
3027 		destp->lr_ctype = lost_rqstp->lr_ctype;
3028 		destp->lr_locktype = lost_rqstp->lr_locktype;
3029 	} else if (lost_rqstp->lr_op == OP_OPEN) {
3030 		component4 *srcfp, *destfp;
3031 
3032 		destp->lr_oacc = lost_rqstp->lr_oacc;
3033 		destp->lr_odeny = lost_rqstp->lr_odeny;
3034 		destp->lr_oclaim = lost_rqstp->lr_oclaim;
3035 		if (lost_rqstp->lr_oclaim == CLAIM_DELEGATE_CUR)
3036 			destp->lr_ostateid = lost_rqstp->lr_ostateid;
3037 
3038 		srcfp = &lost_rqstp->lr_ofile;
3039 		destfp = &destp->lr_ofile;
3040 		/*
3041 		 * Consume caller's utf8string
3042 		 */
3043 		destfp->utf8string_len = srcfp->utf8string_len;
3044 		destfp->utf8string_val = srcfp->utf8string_val;
3045 		srcfp->utf8string_len = 0;
3046 		srcfp->utf8string_val = NULL;	/* make sure not reused */
3047 
3048 		*action = NR_LOST_STATE_RQST;
3049 	} else if (lost_rqstp->lr_op == OP_OPEN_DOWNGRADE) {
3050 		destp->lr_dg_acc = lost_rqstp->lr_dg_acc;
3051 		destp->lr_dg_deny = lost_rqstp->lr_dg_deny;
3052 
3053 		*action = NR_LOST_STATE_RQST;
3054 	} else if (lost_rqstp->lr_op == OP_CLOSE) {
3055 		ASSERT(lost_rqstp->lr_oop);
3056 		*action = NR_LOST_STATE_RQST;
3057 	} else if (lost_rqstp->lr_op == OP_DELEGRETURN) {
3058 		*action = NR_LOST_STATE_RQST;
3059 	} else {
3060 #ifdef DEBUG
3061 		cmn_err(CE_PANIC, "nfs4_save_lost_rqst: bad op %d",
3062 		    lost_rqstp->lr_op);
3063 #endif
3064 		nfs4_queue_event(RE_LOST_STATE_BAD_OP, mi, NULL,
3065 		    lost_rqstp->lr_op, lost_rqstp->lr_vp, lost_rqstp->lr_dvp,
3066 		    NFS4_OK, NULL, curproc->p_pid, TAG_NONE, TAG_NONE, 0, 0);
3067 		*action = NR_UNUSED;
3068 		recovp->rc_lost_rqst = NULL;
3069 		kmem_free(destp, sizeof (nfs4_lost_rqst_t));
3070 		return;
3071 	}
3072 
3073 	destp->lr_op = lost_rqstp->lr_op;
3074 	destp->lr_vp = lost_rqstp->lr_vp;
3075 	if (destp->lr_vp)
3076 		VN_HOLD(destp->lr_vp);
3077 	destp->lr_dvp = lost_rqstp->lr_dvp;
3078 	if (destp->lr_dvp)
3079 		VN_HOLD(destp->lr_dvp);
3080 	destp->lr_oop = lost_rqstp->lr_oop;
3081 	if (destp->lr_oop)
3082 		open_owner_hold(destp->lr_oop);
3083 	destp->lr_osp = lost_rqstp->lr_osp;
3084 	if (destp->lr_osp)
3085 		open_stream_hold(destp->lr_osp);
3086 	destp->lr_lop = lost_rqstp->lr_lop;
3087 	if (destp->lr_lop)
3088 		lock_owner_hold(destp->lr_lop);
3089 	destp->lr_cr = lost_rqstp->lr_cr;
3090 	if (destp->lr_cr)
3091 		crhold(destp->lr_cr);
3092 	if (lost_rqstp->lr_flk == NULL)
3093 		destp->lr_flk = NULL;
3094 	else {
3095 		destp->lr_flk = kmem_alloc(sizeof (flock64_t), KM_SLEEP);
3096 		*destp->lr_flk = *lost_rqstp->lr_flk;
3097 	}
3098 	destp->lr_putfirst = lost_rqstp->lr_putfirst;
3099 }
3100 
3101 /*
3102  * Map the given return values (errno and nfs4 status code) to a recovery
3103  * action and fill in the following fields of recovp: rc_action,
3104  * rc_srv_reboot, rc_stateid, rc_lost_rqst.
3105  */
3106 
3107 void
3108 errs_to_action(recov_info_t *recovp,
3109     nfs4_server_t *sp, mntinfo4_t *mi, stateid4 *sidp,
3110     nfs4_lost_rqst_t *lost_rqstp, int unmounted, nfs_opnum4 op,
3111     nfs4_bseqid_entry_t *bsep)
3112 {
3113 	nfs4_recov_t action = NR_UNUSED;
3114 	bool_t reboot = FALSE;
3115 	int try_f;
3116 	int error = recovp->rc_orig_errors.error;
3117 	nfsstat4 stat = recovp->rc_orig_errors.stat;
3118 
3119 	bzero(&recovp->rc_stateid, sizeof (stateid4));
3120 	recovp->rc_lost_rqst = NULL;
3121 	recovp->rc_bseqid_rqst = NULL;
3122 
3123 	try_f = nfs4_try_failover(&recovp->rc_orig_errors) &&
3124 	    FAILOVER_MOUNT4(mi);
3125 
3126 	/*
3127 	 * We start recovery for EINTR only in the lost lock
3128 	 * or lost open/close case.
3129 	 */
3130 
3131 	if (try_f || error == EINTR || (error == EIO && unmounted)) {
3132 		recovp->rc_error = (error != 0 ? error : geterrno4(stat));
3133 		if (lost_rqstp) {
3134 			ASSERT(lost_rqstp->lr_op != 0);
3135 			nfs4_save_lost_rqst(lost_rqstp, recovp, &action, mi);
3136 		}
3137 		if (try_f)
3138 			action = NR_FAILOVER;
3139 	} else if (error != 0) {
3140 		recovp->rc_error = error;
3141 		nfs4_queue_event(RE_UNEXPECTED_ERRNO, mi, NULL, error, NULL,
3142 		    NULL, 0, NULL, 0, TAG_NONE, TAG_NONE, 0, 0);
3143 		action = NR_CLIENTID;
3144 	} else {
3145 		recovp->rc_error = geterrno4(stat);
3146 		switch (stat) {
3147 #ifdef notyet
3148 		case NFS4ERR_LEASE_MOVED:
3149 			action = xxx;
3150 			break;
3151 #endif
3152 		case NFS4ERR_MOVED:
3153 			action = NR_MOVED;
3154 			break;
3155 		case NFS4ERR_BADHANDLE:
3156 			action = NR_BADHANDLE;
3157 			break;
3158 		case NFS4ERR_BAD_SEQID:
3159 			if (bsep)
3160 				save_bseqid_rqst(bsep, recovp);
3161 			action = NR_BAD_SEQID;
3162 			break;
3163 		case NFS4ERR_OLD_STATEID:
3164 			action = NR_OLDSTATEID;
3165 			break;
3166 		case NFS4ERR_WRONGSEC:
3167 			action = NR_WRONGSEC;
3168 			break;
3169 		case NFS4ERR_FHEXPIRED:
3170 			action = NR_FHEXPIRED;
3171 			break;
3172 		case NFS4ERR_BAD_STATEID:
3173 			if (sp == NULL || (sp != NULL && inlease(sp))) {
3174 
3175 				action = NR_BAD_STATEID;
3176 				if (sidp)
3177 					recovp->rc_stateid = *sidp;
3178 			} else
3179 				action = NR_CLIENTID;
3180 			break;
3181 		case NFS4ERR_EXPIRED:
3182 			/*
3183 			 * The client's lease has expired, either due
3184 			 * to a network partition or perhaps a client
3185 			 * error.  In either case, try an NR_CLIENTID
3186 			 * style recovery.  reboot remains false, since
3187 			 * there is no evidence the server has rebooted.
3188 			 * This will cause CLAIM_NULL opens and lock
3189 			 * requests without the reclaim bit.
3190 			 */
3191 			action = NR_CLIENTID;
3192 
3193 			DTRACE_PROBE4(nfs4__expired,
3194 			    nfs4_server_t *, sp,
3195 			    mntinfo4_t *, mi,
3196 			    stateid4 *, sidp, int, op);
3197 
3198 			break;
3199 		case NFS4ERR_STALE_CLIENTID:
3200 		case NFS4ERR_STALE_STATEID:
3201 			action = NR_CLIENTID;
3202 			reboot = TRUE;
3203 			break;
3204 		case NFS4ERR_RESOURCE:
3205 			/*
3206 			 * If this had been a FAILOVER mount, then
3207 			 * we'd have tried failover.  Since it's not,
3208 			 * just delay a while and retry.
3209 			 */
3210 			action = NR_DELAY;
3211 			break;
3212 		case NFS4ERR_GRACE:
3213 			action = NR_GRACE;
3214 			break;
3215 		case NFS4ERR_DELAY:
3216 			action = NR_DELAY;
3217 			break;
3218 		case NFS4ERR_STALE:
3219 			action = NR_STALE;
3220 			break;
3221 		default:
3222 			nfs4_queue_event(RE_UNEXPECTED_STATUS, mi, NULL, 0,
3223 			    NULL, NULL, stat, NULL, 0, TAG_NONE, TAG_NONE,
3224 			    0, 0);
3225 			action = NR_CLIENTID;
3226 			break;
3227 		}
3228 	}
3229 
3230 	/* make sure action got set */
3231 	ASSERT(action != NR_UNUSED);
3232 	recovp->rc_srv_reboot = reboot;
3233 	recovp->rc_action = action;
3234 	nfs4_queue_fact(RF_ERR, mi, stat, action, op, reboot, NULL, error,
3235 	    NULL);
3236 }
3237 
3238 /*
3239  * Return the (held) credential for the process with the given pid.
3240  * May return NULL (e.g., process not found).
3241  */
3242 
3243 static cred_t *
3244 pid_to_cr(pid_t pid)
3245 {
3246 	proc_t *p;
3247 	cred_t *cr;
3248 
3249 	mutex_enter(&pidlock);
3250 	if ((p = prfind(pid)) == NULL) {
3251 		mutex_exit(&pidlock);
3252 		return (NULL);
3253 	}
3254 
3255 	mutex_enter(&p->p_crlock);
3256 	crhold(cr = p->p_cred);
3257 	mutex_exit(&p->p_crlock);
3258 	mutex_exit(&pidlock);
3259 
3260 	return (cr);
3261 }
3262 
3263 /*
3264  * Send SIGLOST to the given process and queue the event.
3265  *
3266  * The 'dump' boolean tells us whether this action should dump the
3267  * in-kernel queue of recovery messages or not.
3268  */
3269 
3270 void
3271 nfs4_send_siglost(pid_t pid, mntinfo4_t *mi, vnode_t *vp, bool_t dump,
3272     int error, nfsstat4 stat)
3273 {
3274 	proc_t *p;
3275 
3276 	mutex_enter(&pidlock);
3277 	p = prfind(pid);
3278 	if (p)
3279 		psignal(p, SIGLOST);
3280 	mutex_exit(&pidlock);
3281 	nfs4_queue_event(dump ? RE_SIGLOST : RE_SIGLOST_NO_DUMP, mi,
3282 	    NULL, error, vp, NULL, stat, NULL, pid, TAG_NONE, TAG_NONE, 0, 0);
3283 }
3284 
3285 /*
3286  * Scan the lock list for entries that match the given pid.  Unregister those
3287  * locks that do and change their pid to NOPID.
3288  */
3289 
3290 static void
3291 relock_skip_pid(vnode_t *vp, locklist_t *llp, pid_t pid)
3292 {
3293 	for (; llp != NULL; llp = llp->ll_next) {
3294 		if (llp->ll_flock.l_pid == pid) {
3295 			int r;
3296 
3297 			/*
3298 			 * Unregister the lost lock.
3299 			 */
3300 			llp->ll_flock.l_type = F_UNLCK;
3301 			r = reclock(vp, &llp->ll_flock, SETFLCK, FREAD | FWRITE,
3302 			    0, NULL);
3303 			/* The unlock cannot fail */
3304 			ASSERT(r == 0);
3305 
3306 			llp->ll_flock.l_pid = NOPID;
3307 		}
3308 	}
3309 }
3310 
3311 /*
3312  * Mark a file as having failed recovery, after making a last-ditch effort
3313  * to return any delegation.
3314  *
3315  * Sets r_error to EIO or ESTALE for the given vnode.
3316  */
3317 void
3318 nfs4_fail_recov(vnode_t *vp, char *why, int error, nfsstat4 stat)
3319 {
3320 	rnode4_t *rp = VTOR4(vp);
3321 
3322 #ifdef DEBUG
3323 	if (nfs4_fail_recov_stop)
3324 		debug_enter("nfs4_fail_recov");
3325 #endif
3326 
3327 	mutex_enter(&rp->r_statelock);
3328 	if (rp->r_flags & (R4RECOVERR|R4RECOVERRP)) {
3329 		mutex_exit(&rp->r_statelock);
3330 		return;
3331 	}
3332 
3333 	/*
3334 	 * Set R4RECOVERRP to indicate that a recovery error is in
3335 	 * progress.  This will shut down reads and writes at the top
3336 	 * half.  Don't set R4RECOVERR until after we've returned the
3337 	 * delegation, otherwise it will fail.
3338 	 */
3339 
3340 	rp->r_flags |= R4RECOVERRP;
3341 	mutex_exit(&rp->r_statelock);
3342 
3343 	nfs4delegabandon(rp);
3344 
3345 	mutex_enter(&rp->r_statelock);
3346 	rp->r_flags |= (R4RECOVERR | R4STALE);
3347 	rp->r_error = (error == 0 && stat == NFS4ERR_STALE) ? ESTALE : EIO;
3348 	PURGE_ATTRCACHE4_LOCKED(rp);
3349 	if (!(vp->v_vfsp->vfs_flag & VFS_UNMOUNTED))
3350 		nfs4_queue_event(RE_DEAD_FILE, VTOMI4(vp), NULL, error,
3351 		    vp, NULL, stat, why, 0, TAG_NONE, TAG_NONE, 0, 0);
3352 	mutex_exit(&rp->r_statelock);
3353 
3354 	dnlc_purge_vp(vp);
3355 }
3356 
3357 /*
3358  * recov_throttle: if the file had the same recovery action within the
3359  * throttle interval, wait for the throttle interval to finish before
3360  * proceeding.
3361  *
3362  * Side effects: updates the rnode with the current recovery information.
3363  */
3364 
3365 static void
3366 recov_throttle(recov_info_t *recovp, vnode_t *vp)
3367 {
3368 	time_t curtime, time_to_wait;
3369 	rnode4_t *rp = VTOR4(vp);
3370 
3371 	curtime = gethrestime_sec();
3372 
3373 	mutex_enter(&rp->r_statelock);
3374 	NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
3375 	    "recov_throttle: now: (%d, %ld), last: (%d, %ld)",
3376 	    recovp->rc_action, curtime,
3377 	    rp->r_recov_act, rp->r_last_recov));
3378 	if (recovp->rc_action == rp->r_recov_act &&
3379 	    rp->r_last_recov + recov_err_delay > curtime) {
3380 		time_to_wait = rp->r_last_recov + recov_err_delay - curtime;
3381 		mutex_exit(&rp->r_statelock);
3382 		delay(SEC_TO_TICK(time_to_wait));
3383 		curtime = gethrestime_sec();
3384 		mutex_enter(&rp->r_statelock);
3385 	}
3386 
3387 	rp->r_last_recov = curtime;
3388 	rp->r_recov_act = recovp->rc_action;
3389 	mutex_exit(&rp->r_statelock);
3390 }
3391 
3392 /*
3393  * React to NFS4ERR_GRACE by setting the time we'll permit
3394  * the next call to this filesystem.
3395  */
3396 void
3397 nfs4_set_grace_wait(mntinfo4_t *mi)
3398 {
3399 	mutex_enter(&mi->mi_lock);
3400 	/* Mark the time for the future */
3401 	mi->mi_grace_wait = gethrestime_sec() + nfs4err_delay_time;
3402 	mutex_exit(&mi->mi_lock);
3403 }
3404 
3405 /*
3406  * React to MFS4ERR_DELAY by setting the time we'll permit
3407  * the next call to this vnode.
3408  */
3409 void
3410 nfs4_set_delay_wait(vnode_t *vp)
3411 {
3412 	rnode4_t *rp = VTOR4(vp);
3413 
3414 	mutex_enter(&rp->r_statelock);
3415 	/*
3416 	 * Calculate amount we should delay, initial
3417 	 * delay will be short and then we will back off.
3418 	 */
3419 	if (rp->r_delay_interval == 0)
3420 		rp->r_delay_interval = NFS4_INITIAL_DELAY_INTERVAL;
3421 	else
3422 		/* calculate next interval value */
3423 		rp->r_delay_interval =
3424 		    MIN(NFS4_MAX_DELAY_INTERVAL, (rp->r_delay_interval << 1));
3425 	rp->r_delay_wait = gethrestime_sec() + rp->r_delay_interval;
3426 	mutex_exit(&rp->r_statelock);
3427 }
3428 
3429 /*
3430  * The caller is responsible for freeing the returned string.
3431  */
3432 static char *
3433 nfs4_getsrvnames(mntinfo4_t *mi, size_t *len)
3434 {
3435 	servinfo4_t *svp;
3436 	char *srvnames;
3437 	char *namep;
3438 	size_t length;
3439 
3440 	/*
3441 	 * Calculate the length of the string required to hold all
3442 	 * of the server names plus either a comma or a null
3443 	 * character following each individual one.
3444 	 */
3445 	length = 0;
3446 	for (svp = mi->mi_servers; svp != NULL; svp = svp->sv_next) {
3447 		(void) nfs_rw_enter_sig(&svp->sv_lock, RW_READER, 0);
3448 		if (svp->sv_flags & SV4_NOTINUSE) {
3449 			nfs_rw_exit(&svp->sv_lock);
3450 			continue;
3451 		}
3452 		nfs_rw_exit(&svp->sv_lock);
3453 		length += svp->sv_hostnamelen;
3454 	}
3455 
3456 	srvnames = kmem_alloc(length, KM_SLEEP);
3457 
3458 	namep = srvnames;
3459 	for (svp = mi->mi_servers; svp != NULL; svp = svp->sv_next) {
3460 		(void) nfs_rw_enter_sig(&svp->sv_lock, RW_READER, 0);
3461 		if (svp->sv_flags & SV4_NOTINUSE) {
3462 			nfs_rw_exit(&svp->sv_lock);
3463 			continue;
3464 		}
3465 		nfs_rw_exit(&svp->sv_lock);
3466 		(void) strcpy(namep, svp->sv_hostname);
3467 		namep += svp->sv_hostnamelen - 1;
3468 		*namep++ = ',';
3469 	}
3470 	*--namep = '\0';
3471 
3472 	*len = length;
3473 
3474 	return (srvnames);
3475 }
3476 
3477 static void
3478 save_bseqid_rqst(nfs4_bseqid_entry_t *bsep, recov_info_t *recovp)
3479 {
3480 	nfs4_bseqid_entry_t *destp;
3481 
3482 	destp = kmem_alloc(sizeof (nfs4_bseqid_entry_t), KM_SLEEP);
3483 	recovp->rc_bseqid_rqst = destp;
3484 
3485 	if (bsep->bs_oop)
3486 		open_owner_hold(bsep->bs_oop);
3487 	destp->bs_oop = bsep->bs_oop;
3488 	if (bsep->bs_lop)
3489 		lock_owner_hold(bsep->bs_lop);
3490 	destp->bs_lop = bsep->bs_lop;
3491 	if (bsep->bs_vp)
3492 		VN_HOLD(bsep->bs_vp);
3493 	destp->bs_vp = bsep->bs_vp;
3494 	destp->bs_pid = bsep->bs_pid;
3495 	destp->bs_tag = bsep->bs_tag;
3496 	destp->bs_seqid = bsep->bs_seqid;
3497 }
3498 
3499 static void
3500 free_bseqid_rqst(nfs4_bseqid_entry_t *bsep)
3501 {
3502 	if (bsep->bs_oop)
3503 		open_owner_rele(bsep->bs_oop);
3504 	if (bsep->bs_lop)
3505 		lock_owner_rele(bsep->bs_lop);
3506 	if (bsep->bs_vp)
3507 		VN_RELE(bsep->bs_vp);
3508 	kmem_free(bsep, sizeof (nfs4_bseqid_entry_t));
3509 }
3510 
3511 /*
3512  * We don't actually fully recover from NFS4ERR_BAD_SEQID.  We
3513  * simply mark the open owner and open stream (if provided) as "bad".
3514  * Then future uses of these data structures will be limited to basically
3515  * just cleaning up the internal client state (no going OTW).
3516  *
3517  * The result of this is to return errors back to the app/usr when
3518  * we receive NFS4ERR_BAD_SEQID, but also allow future/new calls to
3519  * succeed so progress can be made.
3520  */
3521 void
3522 recov_bad_seqid(recov_info_t *recovp)
3523 {
3524 	mntinfo4_t		*mi = recovp->rc_mi;
3525 	nfs4_open_owner_t	*bad_oop;
3526 	nfs4_lock_owner_t	*bad_lop;
3527 	vnode_t			*vp;
3528 	rnode4_t		*rp = NULL;
3529 	pid_t			pid;
3530 	nfs4_bseqid_entry_t	*bsep, *tbsep;
3531 	int			error;
3532 
3533 	ASSERT(mi != NULL);
3534 	ASSERT(nfs_rw_lock_held(&mi->mi_recovlock, RW_WRITER));
3535 
3536 	mutex_enter(&mi->mi_lock);
3537 	bsep = list_head(&mi->mi_bseqid_list);
3538 	mutex_exit(&mi->mi_lock);
3539 
3540 	/*
3541 	 * Handle all the bad seqid entries on mi's list.
3542 	 */
3543 	while (bsep != NULL) {
3544 		bad_oop = bsep->bs_oop;
3545 		bad_lop = bsep->bs_lop;
3546 		vp = bsep->bs_vp;
3547 		pid = bsep->bs_pid;
3548 
3549 		NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
3550 		    "recov_bad_seqid: mark oop %p lop %p as bad for "
3551 		    "vp %p tag %s pid %d: last good seqid %d for tag %s",
3552 		    (void *)bad_oop, (void *)bad_lop, (void *)vp,
3553 		    nfs4_ctags[bsep->bs_tag].ct_str, pid,
3554 		    bad_oop ?  bad_oop->oo_last_good_seqid : 0,
3555 		    bad_oop ? nfs4_ctags[bad_oop->oo_last_good_op].ct_str :
3556 		    nfs4_ctags[TAG_NONE].ct_str));
3557 
3558 		nfs4_queue_event(RE_BAD_SEQID, mi, NULL,
3559 		    0, vp, NULL, NFS4ERR_BAD_SEQID, NULL, pid, bsep->bs_tag,
3560 		    bad_oop ? bad_oop->oo_last_good_op : TAG_NONE,
3561 		    bsep->bs_seqid, bad_oop ? bad_oop->oo_last_good_seqid : 0);
3562 
3563 		if (bad_oop) {
3564 			/* essentially reset the open owner */
3565 			error = nfs4_start_open_seqid_sync(bad_oop, mi);
3566 			ASSERT(!error);	/* recov thread always succeeds */
3567 			bad_oop->oo_name = nfs4_get_new_oo_name();
3568 			bad_oop->oo_seqid = 0;
3569 			nfs4_end_open_seqid_sync(bad_oop);
3570 		}
3571 
3572 		if (bad_lop) {
3573 			mutex_enter(&bad_lop->lo_lock);
3574 			bad_lop->lo_flags |= NFS4_BAD_SEQID_LOCK;
3575 			mutex_exit(&bad_lop->lo_lock);
3576 
3577 			ASSERT(vp != NULL);
3578 			rp = VTOR4(vp);
3579 			mutex_enter(&rp->r_statelock);
3580 			rp->r_flags |= R4LODANGLERS;
3581 			mutex_exit(&rp->r_statelock);
3582 
3583 			nfs4_send_siglost(pid, mi, vp, TRUE,
3584 			    0, NFS4ERR_BAD_SEQID);
3585 		}
3586 
3587 		mutex_enter(&mi->mi_lock);
3588 		list_remove(&mi->mi_bseqid_list, bsep);
3589 		tbsep = bsep;
3590 		bsep = list_head(&mi->mi_bseqid_list);
3591 		mutex_exit(&mi->mi_lock);
3592 		free_bseqid_rqst(tbsep);
3593 	}
3594 
3595 	mutex_enter(&mi->mi_lock);
3596 	mi->mi_recovflags &= ~MI4R_BAD_SEQID;
3597 	mutex_exit(&mi->mi_lock);
3598 }
3599