xref: /titanic_41/usr/src/uts/common/fs/nfs/nfs4_recovery.c (revision f33c1cdb6d38eb0715f03cf492f31c3d4d395c98)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 /*
27  * NFS Version 4 state recovery code.
28  */
29 
30 #include <nfs/nfs4_clnt.h>
31 #include <nfs/nfs4.h>
32 #include <nfs/rnode4.h>
33 #include <sys/cmn_err.h>
34 #include <sys/cred.h>
35 #include <sys/systm.h>
36 #include <sys/flock.h>
37 #include <sys/dnlc.h>
38 #include <sys/ddi.h>
39 #include <sys/disp.h>
40 #include <sys/list.h>
41 #include <sys/sdt.h>
42 
43 extern r4hashq_t *rtable4;
44 
45 /*
46  * Information that describes what needs to be done for recovery.  It is
47  * passed to a client recovery thread as well as passed to various recovery
48  * routines.  rc_mi, rc_vp1, and rc_vp2 refer to the filesystem and
49  * vnode(s) affected by recovery.  rc_vp1 and rc_vp2 are references (use
50  * VN_HOLD) or NULL.  rc_lost_rqst contains information about the lost
51  * lock or open/close request, and it holds reference counts for the
52  * various objects (vnode, etc.).  The recovery thread also uses flags set
53  * in the mntinfo4_t or vnode_t to tell it what to do.  rc_error is used
54  * to save the error that originally triggered the recovery event -- will
55  * later be used to set mi_error if recovery doesn't work.  rc_bseqid_rqst
56  * contains information about the request that got NFS4ERR_BAD_SEQID, and
57  * it holds reference count for the various objects (vnode, open owner,
58  * open stream, lock owner).
59  */
60 
typedef struct {
	mntinfo4_t *rc_mi;		/* filesystem affected by recovery */
	vnode_t *rc_vp1;		/* held vnode (VN_HOLD), or NULL */
	vnode_t *rc_vp2;		/* held vnode (VN_HOLD), or NULL */
	nfs4_recov_t rc_action;		/* which recovery action to perform */
	stateid4 rc_stateid;
	bool_t rc_srv_reboot;		/* server has rebooted */
	nfs4_lost_rqst_t *rc_lost_rqst;	/* lost lock or open/close request */
	nfs4_error_t rc_orig_errors;	/* original errors causing recovery */
	int rc_error;			/* error that triggered recovery */
	nfs4_bseqid_entry_t *rc_bseqid_rqst; /* rqst that got BAD_SEQID */
} recov_info_t;
73 
/*
 * How long to wait, in seconds, before trying again if there is an error
 * doing recovery.
 */

static int recov_err_delay = 1;

/*
 * How long to wait when processing NFS4ERR_GRACE or NFS4ERR_DELAY
 * errors.  Expressed in seconds.  Default is defined as
 * NFS4ERR_DELAY_TIME and this variable is initialized in nfs4_subr_init()
 */
time_t nfs4err_delay_time = 0;

/*
 * Tuneable to limit how many times "exempt" ops go OTW (over the wire)
 * after a recovery error.  Exempt op hints are OH_CLOSE,
 * OH_LOCKU, OH_DELEGRETURN.  These previously always went
 * OTW even after rnode was "dead" due to recovery errors.
 *
 * The tuneable below limits the number of times a start_fop
 * invocation will retry the exempt hints.  After the limit
 * is reached, nfs4_start_fop will return an error just like
 * it would for non-exempt op hints.
 */
int nfs4_max_recov_error_retry = 3;

/*
 * Number of seconds the recovery thread should pause before retry when the
 * filesystem has been forcibly unmounted.
 */

int nfs4_unmount_delay = 1;

#ifdef DEBUG

/*
 * How long to wait (in seconds) between recovery operations on a given
 * file.  Normally zero, but could be set longer for testing purposes.
 */
static int nfs4_recovdelay = 0;

/*
 * Switch that controls whether to go into the debugger when recovery
 * fails.
 */
static int nfs4_fail_recov_stop = 0;

/*
 * Tuneables to debug client namespace interaction with server
 * mount points:
 *
 *	nfs4_srvmnt_fail_cnt:
 *		number of times EACCES returned because client
 *		attempted to cross server mountpoint
 *
 *	nfs4_srvmnt_debug:
 *		trigger console printf whenever client attempts
 *		to cross server mountpoint
 */
int nfs4_srvmnt_fail_cnt = 0;
int nfs4_srvmnt_debug = 0;
#endif
137 
138 /* forward references, in alphabetic order */
139 static void close_after_open_resend(vnode_t *, cred_t *, uint32_t,
140 	nfs4_error_t *);
141 static void errs_to_action(recov_info_t *,
142 	nfs4_server_t *, mntinfo4_t *, stateid4 *, nfs4_lost_rqst_t *, int,
143 	nfs_opnum4, nfs4_bseqid_entry_t *);
144 static void flush_reinstate(nfs4_lost_rqst_t *);
145 static void free_milist(mntinfo4_t **, int);
146 static mntinfo4_t **make_milist(nfs4_server_t *, int *);
147 static int nfs4_check_recov_err(vnode_t *, nfs4_op_hint_t,
148 	nfs4_recov_state_t *, int, char *);
149 static char *nfs4_getsrvnames(mntinfo4_t *, size_t *);
150 static void nfs4_recov_fh_fail(vnode_t *, int, nfsstat4);
151 static void nfs4_recov_thread(recov_info_t *);
152 static void nfs4_remove_lost_rqsts(mntinfo4_t *, nfs4_server_t *);
153 static void nfs4_resend_lost_rqsts(recov_info_t *, nfs4_server_t *);
154 static cred_t *pid_to_cr(pid_t);
155 static void reclaim_one_lock(vnode_t *, flock64_t *, nfs4_error_t *, int *);
156 static void recov_bad_seqid(recov_info_t *);
157 static void recov_badstate(recov_info_t *, vnode_t *, nfsstat4);
158 static void recov_clientid(recov_info_t *, nfs4_server_t *);
159 static void recov_done(mntinfo4_t *, recov_info_t *);
160 static void recov_filehandle(nfs4_recov_t, mntinfo4_t *, vnode_t *);
161 static void recov_newserver(recov_info_t *, nfs4_server_t **, bool_t *);
162 static void recov_openfiles(recov_info_t *, nfs4_server_t *);
163 static void recov_stale(mntinfo4_t *, vnode_t *);
164 static void nfs4_free_lost_rqst(nfs4_lost_rqst_t *, nfs4_server_t *);
165 static void recov_throttle(recov_info_t *, vnode_t *);
166 static void relock_skip_pid(locklist_t *, pid_t);
167 static void resend_lock(nfs4_lost_rqst_t *, nfs4_error_t *);
168 static void resend_one_op(nfs4_lost_rqst_t *, nfs4_error_t *, mntinfo4_t *,
169 	nfs4_server_t *);
170 static void save_bseqid_rqst(nfs4_bseqid_entry_t *, recov_info_t *);
171 static void start_recovery(recov_info_t *, mntinfo4_t *, vnode_t *, vnode_t *,
172 	nfs4_server_t *);
173 static void start_recovery_action(nfs4_recov_t, bool_t, mntinfo4_t *, vnode_t *,
174 	vnode_t *);
175 static int wait_for_recovery(mntinfo4_t *, nfs4_op_hint_t);
176 
177 /*
178  * Return non-zero if the given errno, status, and rpc status codes
179  * in the nfs4_error_t indicate that client recovery is needed.
180  * "stateful" indicates whether the call that got the error establishes or
181  * removes state on the server (open, close, lock, unlock, delegreturn).
182  */
183 
184 int
185 nfs4_needs_recovery(nfs4_error_t *ep, bool_t stateful, vfs_t *vfsp)
186 {
187 	int recov = 0;
188 	mntinfo4_t *mi;
189 
190 	/*
191 	 * Try failover if the error values justify it and if
192 	 * it's a failover mount.  Don't try if the mount is in
193 	 * progress, failures are handled explicitly by nfs4rootvp.
194 	 */
195 	if (nfs4_try_failover(ep)) {
196 		mi = VFTOMI4(vfsp);
197 		mutex_enter(&mi->mi_lock);
198 		recov = FAILOVER_MOUNT4(mi) && !(mi->mi_flags & MI4_MOUNTING);
199 		mutex_exit(&mi->mi_lock);
200 		if (recov)
201 			return (recov);
202 	}
203 
204 	if (ep->error == EINTR || NFS4_FRC_UNMT_ERR(ep->error, vfsp)) {
205 		/*
206 		 * The server may have gotten the request, so for stateful
207 		 * ops we need to resynchronize and possibly back out the
208 		 * op.
209 		 */
210 		return (stateful);
211 	}
212 	if (ep->error != 0)
213 		return (0);
214 
215 	/* stat values are listed alphabetically */
216 	/*
217 	 * There are two lists here: the errors for which we have code, and
218 	 * the errors for which we plan to have code before FCS.  For the
219 	 * second list, print a warning message but don't attempt recovery.
220 	 */
221 	switch (ep->stat) {
222 	case NFS4ERR_BADHANDLE:
223 	case NFS4ERR_BAD_SEQID:
224 	case NFS4ERR_BAD_STATEID:
225 	case NFS4ERR_DELAY:
226 	case NFS4ERR_EXPIRED:
227 	case NFS4ERR_FHEXPIRED:
228 	case NFS4ERR_GRACE:
229 	case NFS4ERR_OLD_STATEID:
230 	case NFS4ERR_RESOURCE:
231 	case NFS4ERR_STALE_CLIENTID:
232 	case NFS4ERR_STALE_STATEID:
233 	case NFS4ERR_WRONGSEC:
234 	case NFS4ERR_STALE:
235 		recov = 1;
236 		break;
237 #ifdef DEBUG
238 	case NFS4ERR_LEASE_MOVED:
239 	case NFS4ERR_MOVED:
240 		zcmn_err(VFTOMI4(vfsp)->mi_zone->zone_id,
241 		    CE_WARN, "!Can't yet recover from NFS status %d",
242 		    ep->stat);
243 		break;
244 #endif
245 	}
246 
247 	return (recov);
248 }
249 
250 /*
251  * Some operations such as DELEGRETURN want to avoid invoking
252  * recovery actions that will only mark the file dead.  If
253  * better handlers are invoked for any of these errors, this
254  * routine should be modified.
255  */
256 int
257 nfs4_recov_marks_dead(nfsstat4 status)
258 {
259 	if (status == NFS4ERR_BAD_SEQID ||
260 	    status == NFS4ERR_EXPIRED ||
261 	    status == NFS4ERR_BAD_STATEID ||
262 	    status == NFS4ERR_OLD_STATEID)
263 		return (1);
264 	return (0);
265 }
266 
267 /*
268  * Transfer the state recovery information in recovp to mi's resend queue,
269  * and mark mi as having a lost state request.
270  */
271 static void
272 nfs4_enqueue_lost_rqst(recov_info_t *recovp, mntinfo4_t *mi)
273 {
274 	nfs4_lost_rqst_t *lrp = recovp->rc_lost_rqst;
275 
276 	ASSERT(nfs_rw_lock_held(&mi->mi_recovlock, RW_READER) ||
277 	    nfs_rw_lock_held(&mi->mi_recovlock, RW_WRITER));
278 
279 	ASSERT(lrp != NULL && lrp->lr_op != 0);
280 
281 	NFS4_DEBUG(nfs4_lost_rqst_debug, (CE_NOTE,
282 	    "nfs4_enqueue_lost_rqst %p, op %d",
283 	    (void *)lrp, lrp->lr_op));
284 
285 	mutex_enter(&mi->mi_lock);
286 	mi->mi_recovflags |= MI4R_LOST_STATE;
287 	if (lrp->lr_putfirst)
288 		list_insert_head(&mi->mi_lost_state, lrp);
289 	else
290 		list_insert_tail(&mi->mi_lost_state, lrp);
291 	recovp->rc_lost_rqst = NULL;
292 	mutex_exit(&mi->mi_lock);
293 
294 	nfs4_queue_event(RE_LOST_STATE, mi, NULL, lrp->lr_op, lrp->lr_vp,
295 	    lrp->lr_dvp, 0, NULL, 0, TAG_NONE, TAG_NONE, 0, 0);
296 }
297 
298 /*
299  * Transfer the bad seqid recovery information in recovp to mi's
300  * bad seqid queue, and mark mi as having a bad seqid request.
301  */
302 void
303 enqueue_bseqid_rqst(recov_info_t *recovp, mntinfo4_t *mi)
304 {
305 	ASSERT(nfs_rw_lock_held(&mi->mi_recovlock, RW_READER) ||
306 	    nfs_rw_lock_held(&mi->mi_recovlock, RW_WRITER));
307 	ASSERT(recovp->rc_bseqid_rqst != NULL);
308 
309 	mutex_enter(&mi->mi_lock);
310 	mi->mi_recovflags |= MI4R_BAD_SEQID;
311 	list_insert_tail(&mi->mi_bseqid_list, recovp->rc_bseqid_rqst);
312 	recovp->rc_bseqid_rqst = NULL;
313 	mutex_exit(&mi->mi_lock);
314 }
315 
316 /*
317  * Initiate recovery.
318  *
319  * The nfs4_error_t contains the return codes that triggered a recovery
320  * attempt.  mi, vp1, and vp2 refer to the filesystem and files that were
321  * being operated on.  vp1 and vp2 may be NULL.
322  *
323  * Multiple calls are okay.  If recovery is already underway, the call
324  * updates the information about what state needs recovery but does not
325  * start a new thread.  The caller should hold mi->mi_recovlock as a reader
326  * for proper synchronization with any recovery thread.
327  *
328  * This will return TRUE if recovery was aborted, and FALSE otherwise.
329  */
330 bool_t
331 nfs4_start_recovery(nfs4_error_t *ep, mntinfo4_t *mi, vnode_t *vp1,
332     vnode_t *vp2, stateid4 *sid, nfs4_lost_rqst_t *lost_rqstp, nfs_opnum4 op,
333     nfs4_bseqid_entry_t *bsep)
334 {
335 	recov_info_t *recovp;
336 	nfs4_server_t *sp;
337 	bool_t abort = FALSE;
338 	bool_t gone = FALSE;
339 
340 	ASSERT(nfs_zone() == mi->mi_zone);
341 	mutex_enter(&mi->mi_lock);
342 	/*
343 	 * If there is lost state, we need to kick off recovery even if the
344 	 * filesystem has been unmounted or the zone is shutting down.
345 	 */
346 	gone = FS_OR_ZONE_GONE4(mi->mi_vfsp);
347 	if (gone) {
348 		ASSERT(ep->error != EINTR || lost_rqstp != NULL);
349 		if (ep->error == EIO && lost_rqstp == NULL) {
350 			/* failed due to forced unmount, no new lost state */
351 			abort = TRUE;
352 		}
353 		if ((ep->error == 0 || ep->error == ETIMEDOUT) &&
354 		    !(mi->mi_recovflags & MI4R_LOST_STATE)) {
355 			/* some other failure, no existing lost state */
356 			abort = TRUE;
357 		}
358 		if (abort) {
359 			mutex_exit(&mi->mi_lock);
360 			NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
361 			    "nfs4_start_recovery: fs unmounted"));
362 			return (TRUE);
363 		}
364 	}
365 	mi->mi_in_recovery++;
366 	mutex_exit(&mi->mi_lock);
367 
368 	recovp = kmem_alloc(sizeof (recov_info_t), KM_SLEEP);
369 	recovp->rc_orig_errors = *ep;
370 	sp = find_nfs4_server(mi);
371 	errs_to_action(recovp, sp, mi, sid, lost_rqstp, gone, op, bsep);
372 	if (sp != NULL)
373 		mutex_exit(&sp->s_lock);
374 	start_recovery(recovp, mi, vp1, vp2, sp);
375 	if (sp != NULL)
376 		nfs4_server_rele(sp);
377 	return (FALSE);
378 }
379 
380 /*
381  * Internal version of nfs4_start_recovery.  The difference is that the
382  * caller specifies the recovery action, rather than the errors leading to
383  * recovery.
384  */
385 static void
386 start_recovery_action(nfs4_recov_t what, bool_t reboot, mntinfo4_t *mi,
387     vnode_t *vp1, vnode_t *vp2)
388 {
389 	recov_info_t *recovp;
390 
391 	ASSERT(nfs_zone() == mi->mi_zone);
392 	mutex_enter(&mi->mi_lock);
393 	mi->mi_in_recovery++;
394 	mutex_exit(&mi->mi_lock);
395 
396 	recovp = kmem_zalloc(sizeof (recov_info_t), KM_SLEEP);
397 	recovp->rc_action = what;
398 	recovp->rc_srv_reboot = reboot;
399 	recovp->rc_error = EIO;
400 	start_recovery(recovp, mi, vp1, vp2, NULL);
401 }
402 
/*
 * Common back end for nfs4_start_recovery() and start_recovery_action().
 * Performs the setup selected by recovp->rc_action.  Some actions are
 * handled entirely in the calling thread; the others set recovery flags
 * (or queue a request) on mi and then need a recovery thread.  A new
 * thread is created only if MI4_RECOV_ACTIV is not already set on mi.
 *
 * Both callers bump mi->mi_in_recovery before calling.  Every path that
 * does not hand recovp to a newly created recovery thread undoes that
 * bump, wakes waiters on mi_cv_in_recov when the count reaches zero,
 * drops the vfs and mi holds taken here, and frees recovp.
 *
 * vp1 and vp2 may be NULL.  sp may be NULL (start_recovery_action()
 * always passes NULL); it is only consulted for NR_CLIENTID.
 */
static void
start_recovery(recov_info_t *recovp, mntinfo4_t *mi,
    vnode_t *vp1, vnode_t *vp2, nfs4_server_t *sp)
{
	NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
	    "start_recovery: mi %p, what %s", (void*)mi,
	    nfs4_recov_action_to_str(recovp->rc_action)));

	/*
	 * Bump the reference on the vfs so that we can pass it to the
	 * recovery thread.
	 */
	VFS_HOLD(mi->mi_vfsp);
	MI4_HOLD(mi);
again:
	/*
	 * Cases that "break" fall through to the throttle and
	 * thread-creation code below; cases that "goto out_no_thread" are
	 * finished once their inline work is done.
	 */
	switch (recovp->rc_action) {
	case NR_FAILOVER:
		ASSERT(nfs_rw_lock_held(&mi->mi_recovlock, RW_READER) ||
		    nfs_rw_lock_held(&mi->mi_recovlock, RW_WRITER));
		/* only one server in the list: nothing to fail over to */
		if (mi->mi_servers->sv_next == NULL)
			goto out_no_thread;
		mutex_enter(&mi->mi_lock);
		mi->mi_recovflags |= MI4R_NEED_NEW_SERVER;
		mutex_exit(&mi->mi_lock);

		if (recovp->rc_lost_rqst != NULL)
			nfs4_enqueue_lost_rqst(recovp, mi);
		break;

	case NR_CLIENTID:
		/*
		 * If the filesystem has been unmounted, punt.
		 */
		if (sp == NULL)
			goto out_no_thread;

		/*
		 * If nobody else is working on the clientid, mark the
		 * clientid as being no longer set.  Then mark the specific
		 * filesystem being worked on.
		 */
		if (!nfs4_server_in_recovery(sp)) {
			mutex_enter(&sp->s_lock);
			sp->s_flags &= ~N4S_CLIENTID_SET;
			mutex_exit(&sp->s_lock);
		}
		ASSERT(nfs_rw_lock_held(&mi->mi_recovlock, RW_READER) ||
		    nfs_rw_lock_held(&mi->mi_recovlock, RW_WRITER));
		mutex_enter(&mi->mi_lock);
		mi->mi_recovflags |= MI4R_NEED_CLIENTID;
		if (recovp->rc_srv_reboot)
			mi->mi_recovflags |= MI4R_SRV_REBOOT;
		mutex_exit(&mi->mi_lock);
		break;

	case NR_OPENFILES:
		ASSERT(nfs_rw_lock_held(&mi->mi_recovlock, RW_READER) ||
		    nfs_rw_lock_held(&mi->mi_recovlock, RW_WRITER));
		mutex_enter(&mi->mi_lock);
		mi->mi_recovflags |= MI4R_REOPEN_FILES;
		if (recovp->rc_srv_reboot)
			mi->mi_recovflags |= MI4R_SRV_REBOOT;
		mutex_exit(&mi->mi_lock);
		break;

	case NR_WRONGSEC:
		ASSERT(nfs_rw_lock_held(&mi->mi_recovlock, RW_READER) ||
		    nfs_rw_lock_held(&mi->mi_recovlock, RW_WRITER));
		mutex_enter(&mi->mi_lock);
		mi->mi_recovflags |= MI4R_NEED_SECINFO;
		mutex_exit(&mi->mi_lock);
		break;

	case NR_EXPIRED:
		if (vp1 != NULL)
			recov_badstate(recovp, vp1, NFS4ERR_EXPIRED);
		if (vp2 != NULL)
			recov_badstate(recovp, vp2, NFS4ERR_EXPIRED);
		goto out_no_thread;	/* no further recovery possible */

	case NR_BAD_STATEID:
		if (vp1 != NULL)
			recov_badstate(recovp, vp1, NFS4ERR_BAD_STATEID);
		if (vp2 != NULL)
			recov_badstate(recovp, vp2, NFS4ERR_BAD_STATEID);
		goto out_no_thread;	/* no further recovery possible */

	case NR_FHEXPIRED:
	case NR_BADHANDLE:
		/* throttle here; this case never reaches the code below */
		if (vp1 != NULL)
			recov_throttle(recovp, vp1);
		if (vp2 != NULL)
			recov_throttle(recovp, vp2);
		/*
		 * Recover the filehandle now, rather than using a
		 * separate thread.  We can do this because filehandle
		 * recovery is independent of any other state, and because
		 * we know that we are not competing with the recovery
		 * thread at this time.  recov_filehandle will deal with
		 * threads that are competing to recover this filehandle.
		 */
		ASSERT(nfs_rw_lock_held(&mi->mi_recovlock, RW_READER) ||
		    nfs_rw_lock_held(&mi->mi_recovlock, RW_WRITER));
		if (vp1 != NULL)
			recov_filehandle(recovp->rc_action, mi, vp1);
		if (vp2 != NULL)
			recov_filehandle(recovp->rc_action, mi, vp2);
		goto out_no_thread;	/* no further recovery needed */

	case NR_STALE:
		/*
		 * NFS4ERR_STALE handling
		 * recov_stale() could set MI4R_NEED_NEW_SERVER to
		 * indicate that we can and should failover.
		 */
		ASSERT(nfs_rw_lock_held(&mi->mi_recovlock, RW_READER) ||
		    nfs_rw_lock_held(&mi->mi_recovlock, RW_WRITER));

		if (vp1 != NULL)
			recov_stale(mi, vp1);
		if (vp2 != NULL)
			recov_stale(mi, vp2);
		mutex_enter(&mi->mi_lock);
		if ((mi->mi_recovflags & MI4R_NEED_NEW_SERVER) == 0) {
			mutex_exit(&mi->mi_lock);
			goto out_no_thread;
		}
		mutex_exit(&mi->mi_lock);
		/* failover was requested: redo the switch as NR_FAILOVER */
		recovp->rc_action = NR_FAILOVER;
		goto again;

	case NR_BAD_SEQID:
		/*
		 * With a saved bad seqid request, queue it for the recovery
		 * thread; without one, handle the vnodes inline.
		 */
		if (recovp->rc_bseqid_rqst) {
			enqueue_bseqid_rqst(recovp, mi);
			break;
		}

		if (vp1 != NULL)
			recov_badstate(recovp, vp1, NFS4ERR_BAD_SEQID);
		if (vp2 != NULL)
			recov_badstate(recovp, vp2, NFS4ERR_BAD_SEQID);
		goto out_no_thread; /* no further recovery possible */

	case NR_OLDSTATEID:
		if (vp1 != NULL)
			recov_badstate(recovp, vp1, NFS4ERR_OLD_STATEID);
		if (vp2 != NULL)
			recov_badstate(recovp, vp2, NFS4ERR_OLD_STATEID);
		goto out_no_thread;	/* no further recovery possible */

	case NR_GRACE:
		nfs4_set_grace_wait(mi);
		goto out_no_thread; /* no further action required for GRACE */

	case NR_DELAY:
		/* only vp1 is marked for the delay; vp2 is not consulted */
		if (vp1)
			nfs4_set_delay_wait(vp1);
		goto out_no_thread; /* no further action required for DELAY */

	case NR_LOST_STATE_RQST:
	case NR_LOST_LOCK:
		nfs4_enqueue_lost_rqst(recovp, mi);
		break;

	default:
		/* unknown action: log an event and do nothing else */
		nfs4_queue_event(RE_UNEXPECTED_ACTION, mi, NULL,
		    recovp->rc_action, NULL, NULL, 0, NULL, 0, TAG_NONE,
		    TAG_NONE, 0, 0);
		goto out_no_thread;
	}

	/*
	 * If either file recently went through the same recovery, wait
	 * awhile.  This is in case there is some sort of bug; we might not
	 * be able to recover properly, but at least we won't bombard the
	 * server with calls, and we won't tie up the client.
	 */
	if (vp1 != NULL)
		recov_throttle(recovp, vp1);
	if (vp2 != NULL)
		recov_throttle(recovp, vp2);

	/*
	 * If there's already a recovery thread, don't start another one.
	 * The flags set (or requests queued) above remain on mi.
	 */

	mutex_enter(&mi->mi_lock);
	if (mi->mi_flags & MI4_RECOV_ACTIV) {
		mutex_exit(&mi->mi_lock);
		goto out_no_thread;
	}
	mi->mi_flags |= MI4_RECOV_ACTIV;
	mutex_exit(&mi->mi_lock);
	NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
	    "start_recovery: starting new thread for mi %p", (void*)mi));

	/*
	 * Fill in the fields the thread needs and take a hold on each
	 * vnode passed along in recovp.
	 */
	recovp->rc_mi = mi;
	recovp->rc_vp1 = vp1;
	if (vp1 != NULL) {
		ASSERT(VTOMI4(vp1) == mi);
		VN_HOLD(recovp->rc_vp1);
	}
	recovp->rc_vp2 = vp2;
	if (vp2 != NULL) {
		ASSERT(VTOMI4(vp2) == mi);
		VN_HOLD(recovp->rc_vp2);
	}

	/*
	 * recovp, the vfs/mi holds, and the mi_in_recovery count are not
	 * released on this path.
	 */
	(void) zthread_create(NULL, 0, nfs4_recov_thread, recovp, 0,
	    minclsyspri);
	return;

	/* not reached by thread creating call */
out_no_thread:
	mutex_enter(&mi->mi_lock);
	mi->mi_in_recovery--;
	if (mi->mi_in_recovery == 0)
		cv_broadcast(&mi->mi_cv_in_recov);
	mutex_exit(&mi->mi_lock);

	VFS_RELE(mi->mi_vfsp);
	MI4_RELE(mi);
	/*
	 * Free up resources that were allocated for us.
	 */
	kmem_free(recovp, sizeof (recov_info_t));
}
630 
631 static int
632 nfs4_check_recov_err(vnode_t *vp, nfs4_op_hint_t op,
633     nfs4_recov_state_t *rsp, int retry_err_cnt, char *str)
634 {
635 	rnode4_t *rp;
636 	int error = 0;
637 	int exempt;
638 
639 	if (vp == NULL)
640 		return (0);
641 
642 	exempt = (op == OH_CLOSE || op == OH_LOCKU || op == OH_DELEGRETURN);
643 	rp = VTOR4(vp);
644 	mutex_enter(&rp->r_statelock);
645 
646 	/*
647 	 * If there was a recovery error, then allow op hints "exempt" from
648 	 * recov errors to retry (currently 3 times).  Either r_error or
649 	 * EIO is returned for non-exempt op hints.
650 	 */
651 	if (rp->r_flags & R4RECOVERR) {
652 		if (exempt && rsp->rs_num_retry_despite_err <=
653 		    nfs4_max_recov_error_retry) {
654 
655 			/*
656 			 * Check to make sure that we haven't already inc'd
657 			 * rs_num_retry_despite_err for current nfs4_start_fop
658 			 * instance.  We don't want to double inc (if we were
659 			 * called with vp2, then the vp1 call could have
660 			 * already incremented.
661 			 */
662 			if (retry_err_cnt == rsp->rs_num_retry_despite_err)
663 				rsp->rs_num_retry_despite_err++;
664 
665 			NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
666 			    "nfs4_start_fop: %s %p DEAD, cnt=%d", str,
667 			    (void *)vp, rsp->rs_num_retry_despite_err));
668 		} else {
669 			error = (rp->r_error ? rp->r_error : EIO);
670 			/*
671 			 * An ESTALE error on a non-regular file is not
672 			 * "sticky".  Return the ESTALE error once, but
673 			 * clear the condition to allow future operations
674 			 * to go OTW.  This will allow the client to
675 			 * recover if the server has merely unshared then
676 			 * re-shared the file system.  For regular files,
677 			 * the unshare has destroyed the open state at the
678 			 * server and we aren't willing to do a reopen (yet).
679 			 */
680 			if (error == ESTALE && vp->v_type != VREG) {
681 				rp->r_flags &=
682 				    ~(R4RECOVERR|R4RECOVERRP|R4STALE);
683 				rp->r_error = 0;
684 				error = ESTALE;
685 			}
686 			NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
687 			    "nfs4_start_fop: %s %p DEAD, cnt=%d error=%d",
688 			    str, (void *)vp,
689 			    rsp->rs_num_retry_despite_err, error));
690 		}
691 	}
692 
693 	mutex_exit(&rp->r_statelock);
694 	return (error);
695 }
696 
697 /*
698  * Initial setup code that every operation should call if it might invoke
699  * client recovery.  Can block waiting for recovery to finish on a
700  * filesystem.  Either vnode ptr can be NULL.
701  *
702  * Returns 0 if there are no outstanding errors.  Can return an
703  * errno value under various circumstances (e.g., failed recovery, or
704  * interrupted while waiting for recovery to finish).
705  *
706  * There must be a corresponding call to nfs4_end_op() to free up any locks
707  * or resources allocated by this call (assuming this call succeeded),
708  * using the same rsp that's passed in here.
709  *
710  * The open and lock seqid synchronization must be stopped before calling this
711  * function, as it could lead to deadlock when trying to reopen a file or
712  * reclaim a lock.  The synchronization is obtained with calls to:
713  *   nfs4_start_open_seqid_sync()
714  *   nfs4_start_lock_seqid_sync()
715  *
716  * *startrecovp is set TRUE if the caller should not bother with the
717  * over-the-wire call, and just initiate recovery for the given request.
718  * This is typically used for state-releasing ops if the filesystem has
719  * been forcibly unmounted.  startrecovp may be NULL for
720  * non-state-releasing ops.
721  */
722 
int
nfs4_start_fop(mntinfo4_t *mi, vnode_t *vp1, vnode_t *vp2, nfs4_op_hint_t op,
    nfs4_recov_state_t *rsp, bool_t *startrecovp)
{
	int error = 0, rerr_cnt;
	nfs4_server_t *sp = NULL;
	nfs4_server_t *tsp;
	nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };
	uint_t droplock_cnt;	/* mi_srvset_cnt sampled when sp was found */
#ifdef DEBUG
	void *fop_caller;
#endif

	ASSERT(vp1 == NULL || vp1->v_vfsp == mi->mi_vfsp);
	ASSERT(vp2 == NULL || vp2->v_vfsp == mi->mi_vfsp);

#ifdef	DEBUG
	/*
	 * The thread-specific slot holds the caller of the last start_fop
	 * that has not yet been ended; it must be empty on entry.
	 */
	if ((fop_caller = tsd_get(nfs4_tsd_key)) != NULL) {
		cmn_err(CE_PANIC, "Missing nfs4_end_fop: last caller %p",
		    fop_caller);
	}
	(void) tsd_set(nfs4_tsd_key, caller());
#endif

	rsp->rs_sp = NULL;
	rsp->rs_flags &= ~NFS4_RS_RENAME_HELD;
	/*
	 * Snapshot the retry count so that nfs4_check_recov_err() bumps it
	 * at most once for this call, even when both vp1 and vp2 are
	 * checked.
	 */
	rerr_cnt = rsp->rs_num_retry_despite_err;

	/*
	 * Process the items that may delay() based on server response
	 */
	error = nfs4_wait_for_grace(mi, rsp);
	if (error)
		goto out;

	if (vp1 != NULL) {
		error = nfs4_wait_for_delay(vp1, rsp);
		if (error)
			goto out;
	}

	/* Wait for a delegation recall to complete. */

	error = wait_for_recall(vp1, vp2, op, rsp);
	if (error)
		goto out;

	/*
	 * Wait for any current recovery actions to finish.  Note that a
	 * recovery thread can still start up after wait_for_recovery()
	 * finishes.  We don't block out recovery operations until we
	 * acquire s_recovlock and mi_recovlock.
	 */
	error = wait_for_recovery(mi, op);
	if (error)
		goto out;

	/*
	 * Check to see if the rnode is already marked with a
	 * recovery error.  If so, return it immediately.  But
	 * always pass CLOSE, LOCKU, and DELEGRETURN so we can
	 * clean up state on the server.
	 */

	if (vp1 != NULL) {
		if (error = nfs4_check_recov_err(vp1, op, rsp, rerr_cnt, "vp1"))
			goto out;
		nfs4_check_remap(mi, vp1, NFS4_REMAP_CKATTRS, &e);
	}

	if (vp2 != NULL) {
		if (error = nfs4_check_recov_err(vp2, op, rsp, rerr_cnt, "vp2"))
			goto out;
		nfs4_check_remap(mi, vp2, NFS4_REMAP_CKATTRS, &e);
	}

	/*
	 * The lock order calls for us to acquire s_recovlock before
	 * mi_recovlock, but we have to hold mi_recovlock to look up sp (to
	 * prevent races with the failover/migration code).  So acquire
	 * mi_recovlock, look up sp, drop mi_recovlock, acquire
	 * s_recovlock and mi_recovlock, then verify that sp is still the
	 * right object.  XXX Can we find a simpler way to deal with this?
	 */
	if (nfs_rw_enter_sig(&mi->mi_recovlock, RW_READER,
	    mi->mi_flags & MI4_INT)) {
		error = EINTR;
		goto out;
	}
get_sp:
	/*
	 * NOTE(review): sp->s_lock is released right after the lookup, so
	 * find_nfs4_server() evidently returns with s_lock held; the
	 * reference it returns is dropped with nfs4_server_rele() on the
	 * retry and error paths below.
	 */
	sp = find_nfs4_server(mi);
	if (sp != NULL) {
		sp->s_otw_call_count++;
		mutex_exit(&sp->s_lock);
		droplock_cnt = mi->mi_srvset_cnt;
	}
	nfs_rw_exit(&mi->mi_recovlock);

	if (sp != NULL) {
		if (nfs_rw_enter_sig(&sp->s_recovlock, RW_READER,
		    mi->mi_flags & MI4_INT)) {
			error = EINTR;
			goto out;
		}
	}
	if (nfs_rw_enter_sig(&mi->mi_recovlock, RW_READER,
	    mi->mi_flags & MI4_INT)) {
		if (sp != NULL)
			nfs_rw_exit(&sp->s_recovlock);
		error = EINTR;
		goto out;
	}
	/*
	 * If the mntinfo4_t's server set has not changed (mi_srvset_cnt is
	 * the same as when we dropped mi_recovlock), there's no point in
	 * double checking whether its nfs4_server_t has switched.
	 * droplock_cnt is only read when sp != NULL, which is the only
	 * case in which it was assigned.
	 */
	if (sp == NULL || droplock_cnt != mi->mi_srvset_cnt) {
		tsp = find_nfs4_server(mi);
		if (tsp != sp) {
			/* try again */
			if (tsp != NULL) {
				mutex_exit(&tsp->s_lock);
				nfs4_server_rele(tsp);
				tsp = NULL;
			}
			if (sp != NULL) {
				/* undo everything taken for the old sp */
				nfs_rw_exit(&sp->s_recovlock);
				mutex_enter(&sp->s_lock);
				sp->s_otw_call_count--;
				mutex_exit(&sp->s_lock);
				nfs4_server_rele(sp);
				sp = NULL;
			}
			/* mi_recovlock is still held, as get_sp expects */
			goto get_sp;
		} else {
			/* same server: drop only the second lookup */
			if (tsp != NULL) {
				mutex_exit(&tsp->s_lock);
				nfs4_server_rele(tsp);
				tsp = NULL;
			}
		}
	}

	if (sp != NULL) {
		rsp->rs_sp = sp;
	}

	/*
	 * If the filesystem uses volatile filehandles, obtain a lock so
	 * that we synchronize with renames.  Exception: mount operations
	 * can change mi_fh_expire_type, which could be a problem, since
	 * the end_op code needs to be consistent with the start_op code
	 * about mi_rename_lock.  Since mounts don't compete with renames,
	 * it's simpler to just not acquire the rename lock for mounts.
	 */
	if (NFS4_VOLATILE_FH(mi) && op != OH_MOUNT) {
		if (nfs_rw_enter_sig(&mi->mi_rename_lock,
		    op == OH_VFH_RENAME ? RW_WRITER : RW_READER,
		    mi->mi_flags & MI4_INT)) {
			nfs_rw_exit(&mi->mi_recovlock);
			if (sp != NULL)
				nfs_rw_exit(&sp->s_recovlock);
			error = EINTR;
			goto out;
		}
		/* remembered so nfs4_end_fop() knows to release it */
		rsp->rs_flags |= NFS4_RS_RENAME_HELD;
	}

	if (OH_IS_STATE_RELE(op)) {
		/*
		 * For forced unmount, letting the request proceed will
		 * almost always delay response to the user, so hand it off
		 * to the recovery thread.  For exiting lwp's, we don't
		 * have a good way to tell if the request will hang.  We
		 * generally want processes to handle their own requests so
		 * that they can be done in parallel, but if there is
		 * already a recovery thread, hand the request off to it.
		 * This will improve user response at no cost to overall
		 * system throughput.  For zone shutdown, we'd prefer
		 * the recovery thread to handle this as well.
		 */
		ASSERT(startrecovp != NULL);
		mutex_enter(&mi->mi_lock);
		if (FS_OR_ZONE_GONE4(mi->mi_vfsp))
			*startrecovp = TRUE;
		else if ((curthread->t_proc_flag & TP_LWPEXIT) &&
		    (mi->mi_flags & MI4_RECOV_ACTIV))
			*startrecovp = TRUE;
		else
			*startrecovp = FALSE;
		mutex_exit(&mi->mi_lock);
	} else
		if (startrecovp != NULL)
			*startrecovp = FALSE;

	/*
	 * Success: mi_recovlock (and s_recovlock, when sp != NULL) remain
	 * held as reader until the matching end call.
	 */
	ASSERT(error == 0);
	return (error);

out:
	/*
	 * Error exit.  Every failing path above has already released any
	 * recovlock it acquired, so only the OTW call count and the server
	 * reference taken at get_sp remain to be undone.
	 */
	ASSERT(error != 0);
	if (sp != NULL) {
		mutex_enter(&sp->s_lock);
		sp->s_otw_call_count--;
		mutex_exit(&sp->s_lock);
		nfs4_server_rele(sp);
		rsp->rs_sp = NULL;
	}
	nfs4_end_op_recall(vp1, vp2, rsp);

#ifdef	DEBUG
	/* no end_fop will follow a failed start_fop; clear the marker */
	(void) tsd_set(nfs4_tsd_key, NULL);
#endif
	return (error);
}
938 
939 /*
940  * It is up to the caller to determine if rsp->rs_sp being NULL
941  * is detrimental or not.
942  */
943 int
944 nfs4_start_op(mntinfo4_t *mi, vnode_t *vp1, vnode_t *vp2,
945     nfs4_recov_state_t *rsp)
946 {
947 	ASSERT(rsp->rs_num_retry_despite_err == 0);
948 	rsp->rs_num_retry_despite_err = 0;
949 	return (nfs4_start_fop(mi, vp1, vp2, OH_OTHER, rsp, NULL));
950 }
951 
952 /*
953  * Release any resources acquired by nfs4_start_op().
954  * 'sp' should be the nfs4_server pointer returned by nfs4_start_op().
955  *
956  * The operation hint is used to avoid a deadlock by bypassing delegation
957  * return logic for writes, which are done while returning a delegation.
958  */
959 
960 void
961 nfs4_end_fop(mntinfo4_t *mi, vnode_t *vp1, vnode_t *vp2, nfs4_op_hint_t op,
962     nfs4_recov_state_t *rsp, bool_t needs_recov)
963 {
964 	nfs4_server_t *sp = rsp->rs_sp;
965 	rnode4_t *rp = NULL;
966 
967 #ifdef	lint
968 	/*
969 	 * The op hint isn't used any more, but might be in
970 	 * the future.
971 	 */
972 	op = op;
973 #endif
974 
975 #ifdef	DEBUG
976 	ASSERT(tsd_get(nfs4_tsd_key) != NULL);
977 	(void) tsd_set(nfs4_tsd_key, NULL);
978 #endif
979 
980 	nfs4_end_op_recall(vp1, vp2, rsp);
981 
982 	if (rsp->rs_flags & NFS4_RS_RENAME_HELD)
983 		nfs_rw_exit(&mi->mi_rename_lock);
984 
985 	if (!needs_recov) {
986 		if (rsp->rs_flags & NFS4_RS_DELAY_MSG) {
987 			/* may need to clear the delay interval */
988 			if (vp1 != NULL) {
989 				rp = VTOR4(vp1);
990 				mutex_enter(&rp->r_statelock);
991 				rp->r_delay_interval = 0;
992 				mutex_exit(&rp->r_statelock);
993 			}
994 		}
995 		rsp->rs_flags &= ~(NFS4_RS_GRACE_MSG|NFS4_RS_DELAY_MSG);
996 	}
997 
998 	/*
999 	 * If the corresponding nfs4_start_op() found a sp,
1000 	 * then there must still be a sp.
1001 	 */
1002 	if (sp != NULL) {
1003 		nfs_rw_exit(&mi->mi_recovlock);
1004 		nfs_rw_exit(&sp->s_recovlock);
1005 		mutex_enter(&sp->s_lock);
1006 		sp->s_otw_call_count--;
1007 		cv_broadcast(&sp->s_cv_otw_count);
1008 		mutex_exit(&sp->s_lock);
1009 		nfs4_server_rele(sp);
1010 	} else {
1011 		nfs_rw_exit(&mi->mi_recovlock);
1012 	}
1013 }
1014 
1015 void
1016 nfs4_end_op(mntinfo4_t *mi, vnode_t *vp1, vnode_t *vp2,
1017     nfs4_recov_state_t *rsp, bool_t needrecov)
1018 {
1019 	nfs4_end_fop(mi, vp1, vp2, OH_OTHER, rsp, needrecov);
1020 }
1021 
1022 /*
1023  * If the filesystem is going through client recovery, block until
1024  * finished.
1025  * Exceptions:
1026  * - state-releasing ops (CLOSE, LOCKU, DELEGRETURN) are allowed to proceed
1027  *   if the filesystem has been forcibly unmounted or the lwp is exiting.
1028  *
1029  * Return value:
1030  * - 0 if no errors
1031  * - EINTR if the call was interrupted
1032  * - EIO if the filesystem has been forcibly unmounted (non-state-releasing
1033  *   op)
1034  * - the errno value from the recovery thread, if recovery failed
1035  */
1036 
1037 static int
1038 wait_for_recovery(mntinfo4_t *mi, nfs4_op_hint_t op_hint)
1039 {
1040 	int error = 0;
1041 
1042 	mutex_enter(&mi->mi_lock);
1043 
1044 	while (mi->mi_recovflags != 0) {
1045 		klwp_t *lwp = ttolwp(curthread);
1046 
1047 		if ((mi->mi_vfsp->vfs_flag & VFS_UNMOUNTED) ||
1048 		    (mi->mi_flags & MI4_RECOV_FAIL))
1049 			break;
1050 		if (OH_IS_STATE_RELE(op_hint) &&
1051 		    (curthread->t_proc_flag & TP_LWPEXIT))
1052 			break;
1053 
1054 		if (lwp != NULL)
1055 			lwp->lwp_nostop++;
1056 		/* XXX - use different cv? */
1057 		if (cv_wait_sig(&mi->mi_failover_cv, &mi->mi_lock) == 0) {
1058 			error = EINTR;
1059 			if (lwp != NULL)
1060 				lwp->lwp_nostop--;
1061 			break;
1062 		}
1063 		if (lwp != NULL)
1064 			lwp->lwp_nostop--;
1065 	}
1066 
1067 	if ((mi->mi_vfsp->vfs_flag & VFS_UNMOUNTED) &&
1068 	    !OH_IS_STATE_RELE(op_hint)) {
1069 		NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
1070 		    "wait_for_recovery: forced unmount"));
1071 		error = EIO;
1072 	} else if (mi->mi_flags & MI4_RECOV_FAIL) {
1073 		NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
1074 		    "wait_for_recovery: fail since RECOV FAIL"));
1075 		error = mi->mi_error;
1076 	}
1077 
1078 	mutex_exit(&mi->mi_lock);
1079 
1080 	return (error);
1081 }
1082 
1083 /*
1084  * If the client received NFS4ERR_GRACE for this particular mount,
1085  * the client blocks here until it is time to try again.
1086  *
1087  * Return value:
1088  * - 0 if wait was successful
1089  * - EINTR if the call was interrupted
1090  */
1091 
1092 int
1093 nfs4_wait_for_grace(mntinfo4_t *mi, nfs4_recov_state_t *rsp)
1094 {
1095 	int error = 0;
1096 	time_t curtime, time_to_wait;
1097 
1098 	/* do a unprotected check to reduce mi_lock contention */
1099 	if (mi->mi_grace_wait != 0) {
1100 		mutex_enter(&mi->mi_lock);
1101 
1102 		if (mi->mi_grace_wait != 0) {
1103 			if (!(rsp->rs_flags & NFS4_RS_GRACE_MSG))
1104 				rsp->rs_flags |= NFS4_RS_GRACE_MSG;
1105 
1106 			curtime = gethrestime_sec();
1107 
1108 			if (curtime < mi->mi_grace_wait) {
1109 
1110 				time_to_wait = mi->mi_grace_wait - curtime;
1111 
1112 				mutex_exit(&mi->mi_lock);
1113 
1114 				delay(SEC_TO_TICK(time_to_wait));
1115 
1116 				curtime = gethrestime_sec();
1117 
1118 				mutex_enter(&mi->mi_lock);
1119 
1120 				if (curtime >= mi->mi_grace_wait)
1121 					mi->mi_grace_wait = 0;
1122 			} else {
1123 				mi->mi_grace_wait = 0;
1124 			}
1125 		}
1126 		mutex_exit(&mi->mi_lock);
1127 	}
1128 
1129 	return (error);
1130 }
1131 
1132 /*
1133  * If the client received NFS4ERR_DELAY for an operation on a vnode,
1134  * the client blocks here until it is time to try again.
1135  *
1136  * Return value:
1137  * - 0 if wait was successful
1138  * - EINTR if the call was interrupted
1139  */
1140 
1141 int
1142 nfs4_wait_for_delay(vnode_t *vp, nfs4_recov_state_t *rsp)
1143 {
1144 	int error = 0;
1145 	time_t curtime, time_to_wait;
1146 	rnode4_t *rp;
1147 
1148 	ASSERT(vp != NULL);
1149 
1150 	rp = VTOR4(vp);
1151 
1152 	/* do a unprotected check to reduce r_statelock contention */
1153 	if (rp->r_delay_wait != 0) {
1154 		mutex_enter(&rp->r_statelock);
1155 
1156 		if (rp->r_delay_wait != 0) {
1157 
1158 			if (!(rsp->rs_flags & NFS4_RS_DELAY_MSG)) {
1159 				rsp->rs_flags |= NFS4_RS_DELAY_MSG;
1160 				nfs4_mi_kstat_inc_delay(VTOMI4(vp));
1161 			}
1162 
1163 			curtime = gethrestime_sec();
1164 
1165 			if (curtime < rp->r_delay_wait) {
1166 
1167 				time_to_wait = rp->r_delay_wait - curtime;
1168 
1169 				mutex_exit(&rp->r_statelock);
1170 
1171 				delay(SEC_TO_TICK(time_to_wait));
1172 
1173 				curtime = gethrestime_sec();
1174 
1175 				mutex_enter(&rp->r_statelock);
1176 
1177 				if (curtime >= rp->r_delay_wait)
1178 					rp->r_delay_wait = 0;
1179 			} else {
1180 				rp->r_delay_wait = 0;
1181 			}
1182 		}
1183 		mutex_exit(&rp->r_statelock);
1184 	}
1185 
1186 	return (error);
1187 }
1188 
1189 /*
1190  * The recovery thread.
1191  */
1192 
1193 static void
1194 nfs4_recov_thread(recov_info_t *recovp)
1195 {
1196 	mntinfo4_t *mi = recovp->rc_mi;
1197 	nfs4_server_t *sp;
1198 	int done = 0, error = 0;
1199 	bool_t recov_fail = FALSE;
1200 	callb_cpr_t cpr_info;
1201 	kmutex_t cpr_lock;
1202 
1203 	nfs4_queue_event(RE_START, mi, NULL, mi->mi_recovflags,
1204 	    recovp->rc_vp1, recovp->rc_vp2, 0, NULL, 0, TAG_NONE, TAG_NONE,
1205 	    0, 0);
1206 
1207 	mutex_init(&cpr_lock, NULL, MUTEX_DEFAULT, NULL);
1208 	CALLB_CPR_INIT(&cpr_info, &cpr_lock, callb_generic_cpr, "nfsv4Recov");
1209 
1210 	mutex_enter(&mi->mi_lock);
1211 	mi->mi_recovthread = curthread;
1212 	mutex_exit(&mi->mi_lock);
1213 
1214 	/*
1215 	 * We don't really need protection here against failover or
1216 	 * migration, since the current thread is the one that would make
1217 	 * any changes, but hold mi_recovlock anyway for completeness (and
1218 	 * to satisfy any ASSERTs).
1219 	 */
1220 	(void) nfs_rw_enter_sig(&mi->mi_recovlock, RW_READER, 0);
1221 	sp = find_nfs4_server(mi);
1222 	if (sp != NULL)
1223 		mutex_exit(&sp->s_lock);
1224 	nfs_rw_exit(&mi->mi_recovlock);
1225 
1226 	/*
1227 	 * Do any necessary recovery, based on the information in recovp
1228 	 * and any recovery flags.
1229 	 */
1230 
1231 	do {
1232 		mutex_enter(&mi->mi_lock);
1233 		if (FS_OR_ZONE_GONE4(mi->mi_vfsp)) {
1234 			bool_t activesrv;
1235 
1236 			NFS4_DEBUG(nfs4_client_recov_debug &&
1237 			    mi->mi_vfsp->vfs_flag & VFS_UNMOUNTED, (CE_NOTE,
1238 			    "nfs4_recov_thread: file system has been "
1239 			    "unmounted"));
1240 			NFS4_DEBUG(nfs4_client_recov_debug &&
1241 			    zone_status_get(curproc->p_zone) >=
1242 			    ZONE_IS_SHUTTING_DOWN, (CE_NOTE,
1243 			    "nfs4_recov_thread: zone shutting down"));
1244 			/*
1245 			 * If the server has lost its state for us and
1246 			 * the filesystem is unmounted, then the filesystem
1247 			 * can be tossed, even if there are lost lock or
1248 			 * lost state calls in the recovery queue.
1249 			 */
1250 			if (mi->mi_recovflags &
1251 			    (MI4R_NEED_CLIENTID | MI4R_REOPEN_FILES)) {
1252 				NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
1253 				"nfs4_recov_thread: bailing out"));
1254 				mi->mi_flags |= MI4_RECOV_FAIL;
1255 				mi->mi_error = recovp->rc_error;
1256 				recov_fail = TRUE;
1257 			}
1258 			/*
1259 			 * We don't know if the server has any state for
1260 			 * us, and the filesystem has been unmounted.  If
1261 			 * there are "lost state" recovery items, keep
1262 			 * trying to process them until there are no more
1263 			 * mounted filesystems for the server.  Otherwise,
1264 			 * bail out.  The reason we don't mark the
1265 			 * filesystem as failing recovery is in case we
1266 			 * have to do "lost state" recovery later (e.g., a
1267 			 * user process exits).
1268 			 */
1269 			if (!(mi->mi_recovflags & MI4R_LOST_STATE)) {
1270 				done = 1;
1271 				mutex_exit(&mi->mi_lock);
1272 				break;
1273 			}
1274 			mutex_exit(&mi->mi_lock);
1275 
1276 			if (sp == NULL)
1277 				activesrv = FALSE;
1278 			else {
1279 				mutex_enter(&sp->s_lock);
1280 				activesrv = nfs4_fs_active(sp);
1281 			}
1282 			if (!activesrv) {
1283 				NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
1284 				    "no active fs for server %p",
1285 				    (void *)sp));
1286 				mutex_enter(&mi->mi_lock);
1287 				mi->mi_flags |= MI4_RECOV_FAIL;
1288 				mi->mi_error = recovp->rc_error;
1289 				mutex_exit(&mi->mi_lock);
1290 				recov_fail = TRUE;
1291 				if (sp != NULL) {
1292 					/*
1293 					 * Mark the server instance as
1294 					 * dead, so that nobody will attach
1295 					 * a new filesystem.
1296 					 */
1297 					nfs4_mark_srv_dead(sp);
1298 				}
1299 			}
1300 			if (sp != NULL)
1301 				mutex_exit(&sp->s_lock);
1302 		} else {
1303 			mutex_exit(&mi->mi_lock);
1304 		}
1305 
1306 		/*
1307 		 * Check if we need to select a new server for a
1308 		 * failover.  Choosing a new server will force at
1309 		 * least a check of the clientid.
1310 		 */
1311 		mutex_enter(&mi->mi_lock);
1312 		if (!recov_fail &&
1313 		    (mi->mi_recovflags & MI4R_NEED_NEW_SERVER)) {
1314 			mutex_exit(&mi->mi_lock);
1315 			recov_newserver(recovp, &sp, &recov_fail);
1316 		} else
1317 			mutex_exit(&mi->mi_lock);
1318 
1319 		/*
1320 		 * Check if we need to recover the clientid.  This
1321 		 * must be done before file and lock recovery, and it
1322 		 * potentially affects the recovery threads for other
1323 		 * filesystems, so it gets special treatment.
1324 		 */
1325 		if (sp != NULL && recov_fail == FALSE) {
1326 			mutex_enter(&sp->s_lock);
1327 			if (!(sp->s_flags & N4S_CLIENTID_SET)) {
1328 				mutex_exit(&sp->s_lock);
1329 				recov_clientid(recovp, sp);
1330 			} else {
1331 				/*
1332 				 * Unset this flag in case another recovery
1333 				 * thread successfully recovered the clientid
1334 				 * for us already.
1335 				 */
1336 				mutex_enter(&mi->mi_lock);
1337 				mi->mi_recovflags &= ~MI4R_NEED_CLIENTID;
1338 				mutex_exit(&mi->mi_lock);
1339 				mutex_exit(&sp->s_lock);
1340 			}
1341 		}
1342 
1343 		/*
1344 		 * Check if we need to get the security information.
1345 		 */
1346 		mutex_enter(&mi->mi_lock);
1347 		if ((mi->mi_recovflags & MI4R_NEED_SECINFO) &&
1348 		    !(mi->mi_flags & MI4_RECOV_FAIL)) {
1349 			mutex_exit(&mi->mi_lock);
1350 			(void) nfs_rw_enter_sig(&mi->mi_recovlock,
1351 			    RW_WRITER, 0);
1352 			error = nfs4_secinfo_recov(recovp->rc_mi,
1353 			    recovp->rc_vp1, recovp->rc_vp2);
1354 			/*
1355 			 * If error, nothing more can be done, stop
1356 			 * the recovery.
1357 			 */
1358 			if (error) {
1359 				mutex_enter(&mi->mi_lock);
1360 				mi->mi_flags |= MI4_RECOV_FAIL;
1361 				mi->mi_error = recovp->rc_error;
1362 				mutex_exit(&mi->mi_lock);
1363 				nfs4_queue_event(RE_WRONGSEC, mi, NULL,
1364 				    error, recovp->rc_vp1, recovp->rc_vp2,
1365 				    0, NULL, 0, TAG_NONE, TAG_NONE, 0, 0);
1366 			}
1367 			nfs_rw_exit(&mi->mi_recovlock);
1368 		} else
1369 			mutex_exit(&mi->mi_lock);
1370 
1371 		/*
1372 		 * Check if there's a bad seqid to recover.
1373 		 */
1374 		mutex_enter(&mi->mi_lock);
1375 		if ((mi->mi_recovflags & MI4R_BAD_SEQID) &&
1376 		    !(mi->mi_flags & MI4_RECOV_FAIL)) {
1377 			mutex_exit(&mi->mi_lock);
1378 			(void) nfs_rw_enter_sig(&mi->mi_recovlock,
1379 			    RW_WRITER, 0);
1380 			recov_bad_seqid(recovp);
1381 			nfs_rw_exit(&mi->mi_recovlock);
1382 		} else
1383 			mutex_exit(&mi->mi_lock);
1384 
1385 		/*
1386 		 * Next check for recovery that affects the entire
1387 		 * filesystem.
1388 		 */
1389 		if (sp != NULL) {
1390 			mutex_enter(&mi->mi_lock);
1391 			if ((mi->mi_recovflags & MI4R_REOPEN_FILES) &&
1392 			    !(mi->mi_flags & MI4_RECOV_FAIL)) {
1393 				mutex_exit(&mi->mi_lock);
1394 				recov_openfiles(recovp, sp);
1395 			} else
1396 				mutex_exit(&mi->mi_lock);
1397 		}
1398 
1399 		/*
1400 		 * Send any queued state recovery requests.
1401 		 */
1402 		mutex_enter(&mi->mi_lock);
1403 		if (sp != NULL &&
1404 		    (mi->mi_recovflags & MI4R_LOST_STATE) &&
1405 		    !(mi->mi_flags & MI4_RECOV_FAIL)) {
1406 			mutex_exit(&mi->mi_lock);
1407 			(void) nfs_rw_enter_sig(&mi->mi_recovlock,
1408 			    RW_WRITER, 0);
1409 			nfs4_resend_lost_rqsts(recovp, sp);
1410 			if (list_head(&mi->mi_lost_state) == NULL) {
1411 				/* done */
1412 				mutex_enter(&mi->mi_lock);
1413 				mi->mi_recovflags &= ~MI4R_LOST_STATE;
1414 				mutex_exit(&mi->mi_lock);
1415 			}
1416 			nfs_rw_exit(&mi->mi_recovlock);
1417 		} else {
1418 			mutex_exit(&mi->mi_lock);
1419 		}
1420 
1421 		/*
1422 		 * See if there is anything more to do.  If not, announce
1423 		 * that we are done and exit.
1424 		 *
1425 		 * Need mi_recovlock to keep 'sp' valid.  Must grab
1426 		 * mi_recovlock before mi_lock to preserve lock ordering.
1427 		 */
1428 		(void) nfs_rw_enter_sig(&mi->mi_recovlock, RW_READER, 0);
1429 		mutex_enter(&mi->mi_lock);
1430 		if ((mi->mi_recovflags & ~MI4R_SRV_REBOOT) == 0 ||
1431 		    (mi->mi_flags & MI4_RECOV_FAIL)) {
1432 			list_t local_lost_state;
1433 			nfs4_lost_rqst_t *lrp;
1434 
1435 			/*
1436 			 * We need to remove the lost requests before we
1437 			 * unmark the mi as no longer doing recovery to
1438 			 * avoid a race with a new thread putting new lost
1439 			 * requests on the same mi (and the going away
1440 			 * thread would remove the new lost requests).
1441 			 *
1442 			 * Move the lost requests to a local list since
1443 			 * nfs4_remove_lost_rqst() drops mi_lock, and
1444 			 * dropping the mi_lock would make our check to
1445 			 * see if recovery is done no longer valid.
1446 			 */
1447 			list_create(&local_lost_state,
1448 			    sizeof (nfs4_lost_rqst_t),
1449 			    offsetof(nfs4_lost_rqst_t, lr_node));
1450 			list_move_tail(&local_lost_state, &mi->mi_lost_state);
1451 
1452 			done = 1;
1453 			mutex_exit(&mi->mi_lock);
1454 			/*
1455 			 * Now officially free the "moved"
1456 			 * lost requests.
1457 			 */
1458 			while ((lrp = list_head(&local_lost_state)) != NULL) {
1459 				list_remove(&local_lost_state, lrp);
1460 				nfs4_free_lost_rqst(lrp, sp);
1461 			}
1462 			list_destroy(&local_lost_state);
1463 		} else
1464 			mutex_exit(&mi->mi_lock);
1465 		nfs_rw_exit(&mi->mi_recovlock);
1466 
1467 		/*
1468 		 * If the filesystem has been forcibly unmounted, there is
1469 		 * probably no point in retrying immediately.  Furthermore,
1470 		 * there might be user processes waiting for a chance to
1471 		 * queue up "lost state" requests, so that they can exit.
1472 		 * So pause here for a moment.  Same logic for zone shutdown.
1473 		 */
1474 		if (!done && FS_OR_ZONE_GONE4(mi->mi_vfsp)) {
1475 			mutex_enter(&mi->mi_lock);
1476 			cv_broadcast(&mi->mi_failover_cv);
1477 			mutex_exit(&mi->mi_lock);
1478 			delay(SEC_TO_TICK(nfs4_unmount_delay));
1479 		}
1480 
1481 	} while (!done);
1482 
1483 	if (sp != NULL)
1484 		nfs4_server_rele(sp);
1485 
1486 	/*
1487 	 * Return all recalled delegations
1488 	 */
1489 	nfs4_dlistclean();
1490 
1491 	mutex_enter(&mi->mi_lock);
1492 	recov_done(mi, recovp);
1493 	mutex_exit(&mi->mi_lock);
1494 
1495 	/*
1496 	 * Free up resources that were allocated for us.
1497 	 */
1498 	if (recovp->rc_vp1 != NULL)
1499 		VN_RELE(recovp->rc_vp1);
1500 	if (recovp->rc_vp2 != NULL)
1501 		VN_RELE(recovp->rc_vp2);
1502 
1503 	/* now we are done using the mi struct, signal the waiters */
1504 	mutex_enter(&mi->mi_lock);
1505 	mi->mi_in_recovery--;
1506 	if (mi->mi_in_recovery == 0)
1507 		cv_broadcast(&mi->mi_cv_in_recov);
1508 	mutex_exit(&mi->mi_lock);
1509 
1510 	VFS_RELE(mi->mi_vfsp);
1511 	MI4_RELE(mi);
1512 	kmem_free(recovp, sizeof (recov_info_t));
1513 	mutex_enter(&cpr_lock);
1514 	CALLB_CPR_EXIT(&cpr_info);
1515 	mutex_destroy(&cpr_lock);
1516 	zthread_exit();
1517 }
1518 
1519 /*
1520  * Log the end of recovery and notify any waiting threads.
1521  */
1522 
1523 static void
1524 recov_done(mntinfo4_t *mi, recov_info_t *recovp)
1525 {
1526 
1527 	ASSERT(MUTEX_HELD(&mi->mi_lock));
1528 
1529 	nfs4_queue_event(RE_END, mi, NULL, 0, recovp->rc_vp1,
1530 	    recovp->rc_vp2, 0, NULL, 0, TAG_NONE, TAG_NONE, 0, 0);
1531 	mi->mi_recovthread = NULL;
1532 	mi->mi_flags &= ~MI4_RECOV_ACTIV;
1533 	mi->mi_recovflags &= ~MI4R_SRV_REBOOT;
1534 	cv_broadcast(&mi->mi_failover_cv);
1535 }
1536 
1537 /*
1538  * State-specific recovery routines, by state.
1539  */
1540 
1541 /*
1542  * Failover.
1543  *
1544  * Replaces *spp with a reference to the new server, which must
1545  * eventually be freed.
1546  */
1547 
1548 static void
1549 recov_newserver(recov_info_t *recovp, nfs4_server_t **spp, bool_t *recov_fail)
1550 {
1551 	mntinfo4_t *mi = recovp->rc_mi;
1552 	servinfo4_t *svp = NULL;
1553 	nfs4_server_t *osp = *spp;
1554 	CLIENT *cl;
1555 	enum clnt_stat status;
1556 	struct timeval tv;
1557 	int error;
1558 	int oncethru = 0;
1559 	rnode4_t *rp;
1560 	int index;
1561 	nfs_fh4 fh;
1562 	char *snames;
1563 	size_t len;
1564 
1565 	(void) nfs_rw_enter_sig(&mi->mi_recovlock, RW_WRITER, 0);
1566 
1567 	tv.tv_sec = 2;
1568 	tv.tv_usec = 0;
1569 
1570 #ifdef lint
1571 	/*
1572 	 * Lint can't follow the logic, so thinks that snames and len
1573 	 * can be used before being set.  They can't, but lint can't
1574 	 * figure it out.  To address the lint warning, initialize
1575 	 * snames and len for lint.
1576 	 */
1577 	snames = NULL;
1578 	len = 0;
1579 #endif
1580 
1581 	/*
1582 	 * Ping the null NFS procedure of every server in
1583 	 * the list until one responds.  We always start
1584 	 * at the head of the list and always skip the one
1585 	 * that is current, since it's caused us a problem.
1586 	 */
1587 	while (svp == NULL) {
1588 		for (svp = mi->mi_servers; svp; svp = svp->sv_next) {
1589 
1590 			mutex_enter(&mi->mi_lock);
1591 			if (FS_OR_ZONE_GONE4(mi->mi_vfsp)) {
1592 				mi->mi_flags |= MI4_RECOV_FAIL;
1593 				mutex_exit(&mi->mi_lock);
1594 				(void) nfs_rw_exit(&mi->mi_recovlock);
1595 				*recov_fail = TRUE;
1596 				if (oncethru)
1597 					kmem_free(snames, len);
1598 				return;
1599 			}
1600 			mutex_exit(&mi->mi_lock);
1601 
1602 			(void) nfs_rw_enter_sig(&svp->sv_lock, RW_READER, 0);
1603 			if (svp->sv_flags & SV4_NOTINUSE) {
1604 				nfs_rw_exit(&svp->sv_lock);
1605 				continue;
1606 			}
1607 			nfs_rw_exit(&svp->sv_lock);
1608 
1609 			if (!oncethru && svp == mi->mi_curr_serv)
1610 				continue;
1611 
1612 			error = clnt_tli_kcreate(svp->sv_knconf, &svp->sv_addr,
1613 			    NFS_PROGRAM, NFS_V4, 0, 1, CRED(), &cl);
1614 			if (error)
1615 				continue;
1616 
1617 			if (!(mi->mi_flags & MI4_INT))
1618 				cl->cl_nosignal = TRUE;
1619 			status = CLNT_CALL(cl, RFS_NULL, xdr_void, NULL,
1620 			    xdr_void, NULL, tv);
1621 			if (!(mi->mi_flags & MI4_INT))
1622 				cl->cl_nosignal = FALSE;
1623 			AUTH_DESTROY(cl->cl_auth);
1624 			CLNT_DESTROY(cl);
1625 			if (status == RPC_SUCCESS) {
1626 				nfs4_queue_event(RE_FAILOVER, mi,
1627 				    svp == mi->mi_curr_serv ? NULL :
1628 				    svp->sv_hostname, 0, NULL, NULL, 0,
1629 				    NULL, 0, TAG_NONE, TAG_NONE, 0, 0);
1630 				break;
1631 			}
1632 		}
1633 
1634 		if (svp == NULL) {
1635 			if (!oncethru) {
1636 				snames = nfs4_getsrvnames(mi, &len);
1637 				nfs4_queue_fact(RF_SRVS_NOT_RESPOND, mi,
1638 				    0, 0, 0, FALSE, snames, 0, NULL);
1639 				oncethru = 1;
1640 			}
1641 			delay(hz);
1642 		}
1643 	}
1644 
1645 	if (oncethru) {
1646 		nfs4_queue_fact(RF_SRVS_OK, mi, 0, 0, 0, FALSE, snames,
1647 		    0, NULL);
1648 		kmem_free(snames, len);
1649 	}
1650 
1651 #if DEBUG
1652 	(void) nfs_rw_enter_sig(&svp->sv_lock, RW_READER, 0);
1653 	ASSERT((svp->sv_flags & SV4_NOTINUSE) == 0);
1654 	nfs_rw_exit(&svp->sv_lock);
1655 #endif
1656 
1657 	mutex_enter(&mi->mi_lock);
1658 	mi->mi_recovflags &= ~MI4R_NEED_NEW_SERVER;
1659 	if (svp != mi->mi_curr_serv) {
1660 		servinfo4_t *osvp = mi->mi_curr_serv;
1661 
1662 		mutex_exit(&mi->mi_lock);
1663 
1664 		/*
1665 		 * Update server-dependent fields in the root vnode.
1666 		 */
1667 		index = rtable4hash(mi->mi_rootfh);
1668 		rw_enter(&rtable4[index].r_lock, RW_WRITER);
1669 
1670 		rp = r4find(&rtable4[index], mi->mi_rootfh, mi->mi_vfsp);
1671 		if (rp != NULL) {
1672 			NFS4_DEBUG(nfs4_client_failover_debug, (CE_NOTE,
1673 			    "recov_newserver: remapping %s", rnode4info(rp)));
1674 			mutex_enter(&rp->r_statelock);
1675 			rp->r_server = svp;
1676 			PURGE_ATTRCACHE4_LOCKED(rp);
1677 			mutex_exit(&rp->r_statelock);
1678 			(void) nfs4_free_data_reclaim(rp);
1679 			nfs4_purge_rddir_cache(RTOV4(rp));
1680 			rw_exit(&rtable4[index].r_lock);
1681 			NFS4_DEBUG(nfs4_client_failover_debug, (CE_NOTE,
1682 			    "recov_newserver: done with %s",
1683 			    rnode4info(rp)));
1684 			VN_RELE(RTOV4(rp));
1685 		} else
1686 			rw_exit(&rtable4[index].r_lock);
1687 		(void) dnlc_purge_vfsp(mi->mi_vfsp, 0);
1688 
1689 		mutex_enter(&mi->mi_lock);
1690 		mi->mi_recovflags |= MI4R_REOPEN_FILES | MI4R_REMAP_FILES;
1691 		if (recovp->rc_srv_reboot)
1692 			mi->mi_recovflags |= MI4R_SRV_REBOOT;
1693 		mi->mi_curr_serv = svp;
1694 		mi->mi_failover++;
1695 		mi->mi_flags &= ~MI4_BADOWNER_DEBUG;
1696 		mutex_exit(&mi->mi_lock);
1697 
1698 		(void) nfs_rw_enter_sig(&svp->sv_lock, RW_READER, 0);
1699 		fh.nfs_fh4_len = svp->sv_fhandle.fh_len;
1700 		fh.nfs_fh4_val = svp->sv_fhandle.fh_buf;
1701 		sfh4_update(mi->mi_rootfh, &fh);
1702 		fh.nfs_fh4_len = svp->sv_pfhandle.fh_len;
1703 		fh.nfs_fh4_val = svp->sv_pfhandle.fh_buf;
1704 		sfh4_update(mi->mi_srvparentfh, &fh);
1705 		nfs_rw_exit(&svp->sv_lock);
1706 
1707 		*spp = nfs4_move_mi(mi, osvp, svp);
1708 		if (osp != NULL)
1709 			nfs4_server_rele(osp);
1710 	} else
1711 		mutex_exit(&mi->mi_lock);
1712 	(void) nfs_rw_exit(&mi->mi_recovlock);
1713 }
1714 
1715 /*
1716  * Clientid.
1717  */
1718 
1719 static void
1720 recov_clientid(recov_info_t *recovp, nfs4_server_t *sp)
1721 {
1722 	mntinfo4_t *mi = recovp->rc_mi;
1723 	int error = 0;
1724 	int still_stale;
1725 	int need_new_s;
1726 
1727 	ASSERT(sp != NULL);
1728 
1729 	/*
1730 	 * Acquire the recovery lock and then verify that the clientid
1731 	 * still needs to be recovered.  (Note that s_recovlock is supposed
1732 	 * to be acquired before s_lock.)  Since the thread holds the
1733 	 * recovery lock, no other thread will recover the clientid.
1734 	 */
1735 	(void) nfs_rw_enter_sig(&sp->s_recovlock, RW_WRITER, 0);
1736 	(void) nfs_rw_enter_sig(&mi->mi_recovlock, RW_WRITER, 0);
1737 	mutex_enter(&sp->s_lock);
1738 	still_stale = ((sp->s_flags & N4S_CLIENTID_SET) == 0);
1739 	mutex_exit(&sp->s_lock);
1740 
1741 	if (still_stale) {
1742 		nfs4_error_t n4e;
1743 
1744 		nfs4_error_zinit(&n4e);
1745 		nfs4setclientid(mi, kcred, TRUE, &n4e);
1746 		error = n4e.error;
1747 		if (error != 0) {
1748 
1749 			/*
1750 			 * nfs4setclientid may have set MI4R_NEED_NEW_SERVER,
1751 			 * if so, just return and let recov_thread drive
1752 			 * failover.
1753 			 */
1754 			mutex_enter(&mi->mi_lock);
1755 			need_new_s = mi->mi_recovflags & MI4R_NEED_NEW_SERVER;
1756 			mutex_exit(&mi->mi_lock);
1757 
1758 			if (need_new_s) {
1759 				nfs_rw_exit(&mi->mi_recovlock);
1760 				nfs_rw_exit(&sp->s_recovlock);
1761 				return;
1762 			}
1763 
1764 			nfs4_queue_event(RE_CLIENTID, mi, NULL, n4e.error, NULL,
1765 			    NULL, n4e.stat, NULL, 0, TAG_NONE, TAG_NONE, 0, 0);
1766 			mutex_enter(&mi->mi_lock);
1767 			mi->mi_flags |= MI4_RECOV_FAIL;
1768 			mi->mi_error = recovp->rc_error;
1769 			mutex_exit(&mi->mi_lock);
1770 			/* don't destroy the nfs4_server, let umount do it */
1771 		}
1772 	}
1773 
1774 	if (error == 0) {
1775 		mutex_enter(&mi->mi_lock);
1776 		mi->mi_recovflags &= ~MI4R_NEED_CLIENTID;
1777 		/*
1778 		 * If still_stale isn't true, then another thread already
1779 		 * recovered the clientid.  And that thread that set the
1780 		 * clientid will have initiated reopening files on all the
1781 		 * filesystems for the server, so we should not initiate
1782 		 * reopening for this filesystem here.
1783 		 */
1784 		if (still_stale) {
1785 			mi->mi_recovflags |= MI4R_REOPEN_FILES;
1786 			if (recovp->rc_srv_reboot)
1787 				mi->mi_recovflags |= MI4R_SRV_REBOOT;
1788 		}
1789 		mutex_exit(&mi->mi_lock);
1790 	}
1791 
1792 	nfs_rw_exit(&mi->mi_recovlock);
1793 
1794 	if (error != 0) {
1795 		nfs_rw_exit(&sp->s_recovlock);
1796 		mutex_enter(&mi->mi_lock);
1797 		if ((mi->mi_flags & MI4_RECOV_FAIL) == 0)
1798 			delay(SEC_TO_TICK(recov_err_delay));
1799 		mutex_exit(&mi->mi_lock);
1800 	} else {
1801 		mntinfo4_t **milist;
1802 		mntinfo4_t *tmi;
1803 		int nummi, i;
1804 
1805 		/*
1806 		 * Initiate recovery of open files for other filesystems.
1807 		 * We create an array of filesystems, rather than just
1808 		 * walking the filesystem list, to avoid deadlock issues
1809 		 * with s_lock and mi_recovlock.
1810 		 */
1811 		milist = make_milist(sp, &nummi);
1812 		for (i = 0; i < nummi; i++) {
1813 			tmi = milist[i];
1814 			if (tmi != mi) {
1815 				(void) nfs_rw_enter_sig(&tmi->mi_recovlock,
1816 				    RW_READER, 0);
1817 				start_recovery_action(NR_OPENFILES, TRUE, tmi,
1818 				    NULL, NULL);
1819 				nfs_rw_exit(&tmi->mi_recovlock);
1820 			}
1821 		}
1822 		free_milist(milist, nummi);
1823 
1824 		nfs_rw_exit(&sp->s_recovlock);
1825 	}
1826 }
1827 
1828 /*
1829  * Return an array of filesystems associated with the given server.  The
1830  * caller should call free_milist() to free the references and memory.
1831  */
1832 
1833 static mntinfo4_t **
1834 make_milist(nfs4_server_t *sp, int *nummip)
1835 {
1836 	int nummi, i;
1837 	mntinfo4_t **milist;
1838 	mntinfo4_t *tmi;
1839 
1840 	mutex_enter(&sp->s_lock);
1841 	nummi = 0;
1842 	for (tmi = sp->mntinfo4_list; tmi != NULL; tmi = tmi->mi_clientid_next)
1843 		nummi++;
1844 
1845 	milist = kmem_alloc(nummi * sizeof (mntinfo4_t *), KM_SLEEP);
1846 
1847 	for (i = 0, tmi = sp->mntinfo4_list; tmi != NULL; i++,
1848 	    tmi = tmi->mi_clientid_next) {
1849 		milist[i] = tmi;
1850 		VFS_HOLD(tmi->mi_vfsp);
1851 	}
1852 	mutex_exit(&sp->s_lock);
1853 
1854 	*nummip = nummi;
1855 	return (milist);
1856 }
1857 
1858 /*
1859  * Free the filesystem list created by make_milist().
1860  */
1861 
1862 static void
1863 free_milist(mntinfo4_t **milist, int nummi)
1864 {
1865 	mntinfo4_t *tmi;
1866 	int i;
1867 
1868 	for (i = 0; i < nummi; i++) {
1869 		tmi = milist[i];
1870 		VFS_RELE(tmi->mi_vfsp);
1871 	}
1872 	kmem_free(milist, nummi * sizeof (mntinfo4_t *));
1873 }
1874 
1875 /*
1876  * Filehandle
1877  */
1878 
1879 /*
1880  * Lookup the filehandle for the given vnode and update the rnode if it has
1881  * changed.
1882  *
1883  * Errors:
1884  * - if the filehandle could not be updated because of an error that
1885  *   requires further recovery, initiate that recovery and return.
1886  * - if the filehandle could not be updated because of a signal, pretend we
1887  *   succeeded and let someone else deal with it.
1888  * - if the filehandle could not be updated and the filesystem has been
1889  *   forcibly unmounted, pretend we succeeded, and let the caller deal with
1890  *   the forced unmount (to retry or not to retry, that is the question).
1891  * - if the filehandle could not be updated because of some other error,
1892  *   mark the rnode bad and return.
1893  */
1894 static void
1895 recov_filehandle(nfs4_recov_t action, mntinfo4_t *mi, vnode_t *vp)
1896 {
1897 	rnode4_t *rp = VTOR4(vp);
1898 	nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };
1899 	bool_t needrecov;
1900 
1901 	mutex_enter(&rp->r_statelock);
1902 
1903 	if (rp->r_flags & R4RECOVERR) {
1904 		mutex_exit(&rp->r_statelock);
1905 		return;
1906 	}
1907 
1908 	/*
1909 	 * If someone else is updating the filehandle, wait for them to
1910 	 * finish and then let our caller retry.
1911 	 */
1912 	if (rp->r_flags & R4RECEXPFH) {
1913 		while (rp->r_flags & R4RECEXPFH) {
1914 			cv_wait(&rp->r_cv, &rp->r_statelock);
1915 		}
1916 		mutex_exit(&rp->r_statelock);
1917 		return;
1918 	}
1919 	rp->r_flags |= R4RECEXPFH;
1920 	mutex_exit(&rp->r_statelock);
1921 
1922 	if (action == NR_BADHANDLE) {
1923 		/* shouldn't happen */
1924 		nfs4_queue_event(RE_BADHANDLE, mi, NULL, 0,
1925 		    vp, NULL, 0, NULL, 0, TAG_NONE, TAG_NONE, 0, 0);
1926 	}
1927 
1928 	nfs4_remap_file(mi, vp, 0, &e);
1929 	needrecov = nfs4_needs_recovery(&e, FALSE, mi->mi_vfsp);
1930 
1931 	/*
1932 	 * If we get BADHANDLE or FHEXPIRED in their handler, something is
1933 	 * broken.  Don't try to recover, just mark the file dead.
1934 	 */
1935 	if (needrecov && e.error == 0 &&
1936 	    (e.stat == NFS4ERR_BADHANDLE || e.stat == NFS4ERR_FHEXPIRED))
1937 		needrecov = FALSE;
1938 	if (needrecov) {
1939 		(void) nfs4_start_recovery(&e, mi, vp,
1940 		    NULL, NULL, NULL, OP_LOOKUP, NULL);
1941 	} else if (e.error != EINTR &&
1942 	    !NFS4_FRC_UNMT_ERR(e.error, mi->mi_vfsp) &&
1943 	    (e.error != 0 || e.stat != NFS4_OK)) {
1944 		nfs4_recov_fh_fail(vp, e.error, e.stat);
1945 		/*
1946 		 * Don't set r_error to ESTALE.  Higher-level code (e.g.,
1947 		 * cstatat_getvp()) retries on ESTALE, which would cause
1948 		 * an infinite loop.
1949 		 */
1950 	}
1951 
1952 	mutex_enter(&rp->r_statelock);
1953 	rp->r_flags &= ~R4RECEXPFH;
1954 	cv_broadcast(&rp->r_cv);
1955 	mutex_exit(&rp->r_statelock);
1956 }
1957 
1958 /*
1959  * Stale Filehandle
1960  */
1961 
1962 /*
1963  * A stale filehandle can happen when an individual file has
1964  * been removed, or when an entire filesystem has been taken
1965  * offline.  To distinguish these cases, we do this:
1966  * - if a GETATTR with the current filehandle is okay, we do
1967  *   nothing (this can happen with two-filehandle ops)
1968  * - if the GETATTR fails, but a GETATTR of the root filehandle
1969  *   succeeds, mark the rnode with R4STALE, which will stop use
1970  * - if the GETATTR fails, and a GETATTR of the root filehandle
1971  *   also fails, we consider the problem filesystem-wide, so:
1972  *   - if we can failover, we should
1973  *   - if we can't failover, we should mark both the original
1974  *     vnode and the root bad
1975  */
1976 static void
1977 recov_stale(mntinfo4_t *mi, vnode_t *vp)
1978 {
1979 	rnode4_t *rp = VTOR4(vp);
1980 	vnode_t *rootvp = NULL;
1981 	nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };
1982 	nfs4_ga_res_t gar;
1983 	char *fail_msg = "failed to recover from NFS4ERR_STALE";
1984 	bool_t needrecov;
1985 
1986 	mutex_enter(&rp->r_statelock);
1987 
1988 	if (rp->r_flags & R4RECOVERR) {
1989 		mutex_exit(&rp->r_statelock);
1990 		NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
1991 		    "recov_stale: already marked dead, rp %s",
1992 		    rnode4info(rp)));
1993 		return;
1994 	}
1995 
1996 	if (rp->r_flags & R4STALE) {
1997 		mutex_exit(&rp->r_statelock);
1998 		NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
1999 		    "recov_stale: already marked stale, rp %s",
2000 		    rnode4info(rp)));
2001 		return;
2002 	}
2003 
2004 	mutex_exit(&rp->r_statelock);
2005 
2006 	/* Try a GETATTR on this vnode */
2007 	nfs4_getattr_otw_norecovery(vp, &gar, &e, CRED(), 0);
2008 
2009 	/*
2010 	 * Handle non-STALE recoverable errors
2011 	 */
2012 	needrecov = nfs4_needs_recovery(&e, FALSE, vp->v_vfsp);
2013 	if (needrecov && (e.error != 0 || e.stat != NFS4ERR_STALE)) {
2014 		(void) nfs4_start_recovery(&e, mi, vp,
2015 		    NULL, NULL, NULL, OP_GETATTR, NULL);
2016 		NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
2017 		    "recov_stale: error=%d, stat=%d seen on rp %s",
2018 		    e.error, e.stat, rnode4info(rp)));
2019 		goto out;
2020 	}
2021 
2022 	/* Are things OK for this vnode? */
2023 	if (!e.error && e.stat == NFS4_OK) {
2024 		NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
2025 		    "recov_stale: file appears fine, rp %s",
2026 		    rnode4info(rp)));
2027 		goto out;
2028 	}
2029 
2030 	/* Did we get an unrelated non-recoverable error? */
2031 	if (e.error || e.stat != NFS4ERR_STALE) {
2032 		nfs4_fail_recov(vp, fail_msg, e.error, e.stat);
2033 		NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
2034 		    "recov_stale: unrelated fatal error, rp %s",
2035 		    rnode4info(rp)));
2036 		goto out;
2037 	}
2038 
2039 	/*
2040 	 * If we don't appear to be dealing with the root node, find it.
2041 	 */
2042 	if ((vp->v_flag & VROOT) == 0) {
2043 		nfs4_error_zinit(&e);
2044 		e.error = VFS_ROOT(vp->v_vfsp, &rootvp);
2045 		if (e.error) {
2046 			nfs4_fail_recov(vp, fail_msg, 0, NFS4ERR_STALE);
2047 			NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
2048 			    "recov_stale: can't find root node for rp %s",
2049 			    rnode4info(rp)));
2050 			goto out;
2051 		}
2052 	}
2053 
2054 	/* Try a GETATTR on the root vnode */
2055 	if (rootvp != NULL) {
2056 		nfs4_error_zinit(&e);
2057 		nfs4_getattr_otw_norecovery(rootvp, &gar, &e, CRED(), 0);
2058 
2059 		/* Try recovery? */
2060 		if (e.error != 0 || e.stat != NFS4ERR_STALE) {
2061 			needrecov = nfs4_needs_recovery(&e, FALSE, vp->v_vfsp);
2062 			if (needrecov) {
2063 				(void) nfs4_start_recovery(&e,
2064 				    mi, rootvp, NULL, NULL, NULL,
2065 				    OP_GETATTR, NULL);
2066 				NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
2067 				    "recov_stale: error=%d, stat=%d seen "
2068 				    "on rp %s", e.error, e.stat,
2069 				    rnode4info(rp)));
2070 			}
2071 		}
2072 
2073 		/*
2074 		 * Check to see if a failover attempt is warranted
2075 		 * NB: nfs4_try_failover doesn't check for STALE
2076 		 * because recov_stale gets a shot first.  Now that
2077 		 * recov_stale has failed, go ahead and try failover.
2078 		 *
2079 		 * If the getattr on the root filehandle was successful,
2080 		 * then mark recovery as failed for 'vp' and exit.
2081 		 */
2082 		if (nfs4_try_failover(&e) == 0 && e.stat != NFS4ERR_STALE) {
2083 			/*
2084 			 * pass the original error to fail_recov, not
2085 			 * the one from trying the root vnode.
2086 			 */
2087 			nfs4_fail_recov(vp, fail_msg, 0, NFS4ERR_STALE);
2088 			NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
2089 			    "recov_stale: root node OK, marking "
2090 			    "dead rp %s", rnode4info(rp)));
2091 			goto out;
2092 		}
2093 	}
2094 
2095 	/*
2096 	 * Here, we know that both the original file and the
2097 	 * root filehandle (which may be the same) are stale.
2098 	 * We want to fail over if we can, and if we can't, we
2099 	 * want to mark everything in sight bad.
2100 	 */
2101 	if (FAILOVER_MOUNT4(mi)) {
2102 		mutex_enter(&mi->mi_lock);
2103 		mi->mi_recovflags |= MI4R_NEED_NEW_SERVER;
2104 		NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
2105 		    "recov_stale: failing over due to rp %s",
2106 		    rnode4info(rp)));
2107 		mutex_exit(&mi->mi_lock);
2108 	} else {
2109 		rnode4_t *rootrp;
2110 		servinfo4_t *svp;
2111 
2112 		/*
2113 		 * Can't fail over, so mark things dead.
2114 		 *
2115 		 * If rootvp is set, we know we have a distinct
2116 		 * non-root vnode which can be marked dead in
2117 		 * the usual way.
2118 		 *
2119 		 * Then we want to mark the root vnode dead.
2120 		 * Note that if rootvp wasn't set, our vp is
2121 		 * actually the root vnode.
2122 		 */
2123 		if (rootvp != NULL) {
2124 			NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
2125 			    "recov_stale: can't fail over, marking dead rp %s",
2126 			    rnode4info(rp)));
2127 			nfs4_fail_recov(vp, fail_msg, 0, NFS4ERR_STALE);
2128 		} else {
2129 			rootvp = vp;
2130 			VN_HOLD(rootvp);
2131 		}
2132 
2133 		/*
2134 		 * Mark root dead, but quietly - since
2135 		 * the root rnode is frequently recreated,
2136 		 * we can encounter this at every access.
2137 		 * Also mark recovery as failed on this VFS.
2138 		 */
2139 		rootrp = VTOR4(rootvp);
2140 		NFS4_DEBUG(nfs4_client_recov_debug, (CE_CONT,
2141 		    "recov_stale: marking dead root rp %s",
2142 		    rnode4info(rootrp)));
2143 		mutex_enter(&rootrp->r_statelock);
2144 		rootrp->r_flags |= (R4RECOVERR | R4STALE);
2145 		rootrp->r_error = ESTALE;
2146 		mutex_exit(&rootrp->r_statelock);
2147 		mutex_enter(&mi->mi_lock);
2148 		mi->mi_error = ESTALE;
2149 		mutex_exit(&mi->mi_lock);
2150 
2151 		svp = mi->mi_curr_serv;
2152 		(void) nfs_rw_enter_sig(&svp->sv_lock, RW_WRITER, 0);
2153 		svp->sv_flags |= SV4_ROOT_STALE;
2154 		nfs_rw_exit(&svp->sv_lock);
2155 	}
2156 
2157 out:
2158 	if (rootvp)
2159 		VN_RELE(rootvp);
2160 }
2161 
2162 /*
2163  * Locks.
2164  */
2165 
2166 /*
2167  * Reclaim all the active (acquired) locks for the given file.
2168  * If a process lost a lock, the process is sent a SIGLOST.  This is not
2169  * considered an error.
2170  *
2171  * Return values:
2172  * Errors and status are returned via the nfs4_error_t parameter
2173  * If an error indicates that recovery is needed, the caller is responsible
2174  * for dealing with it.
2175  */
2176 
2177 static void
2178 relock_file(vnode_t *vp, mntinfo4_t *mi, nfs4_error_t *ep,
2179     fattr4_change pre_change)
2180 {
2181 	locklist_t *locks, *llp;
2182 	rnode4_t *rp;
2183 
2184 	ASSERT(ep != NULL);
2185 	nfs4_error_zinit(ep);
2186 
2187 	if (VTOMI4(vp)->mi_flags & MI4_LLOCK)
2188 		return;
2189 
2190 	nfs4_flush_lock_owners(VTOR4(vp));
2191 
2192 	/*
2193 	 * If we get an error that requires recovery actions, just bail out
2194 	 * and let the top-level recovery code handle it.
2195 	 *
2196 	 * If we get some other error, kill the process that owned the lock
2197 	 * and mark its remaining locks (if any) as belonging to NOPID, so
2198 	 * that we don't make any more reclaim requests for that process.
2199 	 */
2200 
2201 	rp = VTOR4(vp);
2202 	locks = flk_active_locks_for_vp(vp);
2203 	for (llp = locks; llp != NULL; llp = llp->ll_next) {
2204 		int did_reclaim = 1;
2205 
2206 		ASSERT(llp->ll_vp == vp);
2207 		if (llp->ll_flock.l_pid == NOPID)
2208 			continue;
2209 		reclaim_one_lock(vp, &llp->ll_flock, ep, &did_reclaim);
2210 		/*
2211 		 * If we need to restart recovery, stop processing the
2212 		 * list.  Some errors would be recoverable under other
2213 		 * circumstances, but if they happen here we just give up
2214 		 * on the lock.
2215 		 */
2216 		if (nfs4_needs_recovery(ep, TRUE, vp->v_vfsp)) {
2217 			if (ep->error != 0)
2218 				break;
2219 			if (!nfs4_recov_marks_dead(ep->stat))
2220 				break;
2221 		}
2222 		/*
2223 		 *   In case the server isn't offering us a grace period, or
2224 		 * if we missed it, we might have opened & locked from scratch,
2225 		 * rather than reopened/reclaimed.
2226 		 *   We need to ensure that the object hadn't been otherwise
2227 		 * changed during this time, by comparing the changeinfo.
2228 		 *   We get passed the changeinfo from before the reopen by our
2229 		 * caller, in pre_change.
2230 		 *   The changeinfo from after the reopen is in rp->r_change,
2231 		 * courtesy of the GETATTR in the reopen.
2232 		 *   If they're different, then the file has changed, and we
2233 		 * have to SIGLOST the app.
2234 		 */
2235 		if (ep->error == 0 && ep->stat == NFS4_OK && !did_reclaim) {
2236 			mutex_enter(&rp->r_statelock);
2237 			if (pre_change != rp->r_change)
2238 				ep->stat = NFS4ERR_NO_GRACE;
2239 			mutex_exit(&rp->r_statelock);
2240 		}
2241 		if (ep->error != 0 || ep->stat != NFS4_OK) {
2242 			if (ep->error != 0)
2243 				nfs4_queue_event(RE_FAIL_RELOCK, mi,
2244 				    NULL, ep->error, vp, NULL, 0, NULL,
2245 				    llp->ll_flock.l_pid, TAG_NONE, TAG_NONE,
2246 				    0, 0);
2247 			else
2248 				nfs4_queue_event(RE_FAIL_RELOCK, mi,
2249 				    NULL, 0, vp, NULL, ep->stat, NULL,
2250 				    llp->ll_flock.l_pid, TAG_NONE, TAG_NONE,
2251 				    0, 0);
2252 			nfs4_send_siglost(llp->ll_flock.l_pid, mi, vp, TRUE,
2253 			    ep->error, ep->stat);
2254 			relock_skip_pid(llp, llp->ll_flock.l_pid);
2255 
2256 			/* Reinitialize the nfs4_error and continue */
2257 			nfs4_error_zinit(ep);
2258 		}
2259 	}
2260 
2261 	if (locks != NULL)
2262 		flk_free_locklist(locks);
2263 }
2264 
2265 /*
2266  * Reclaim the given lock.
2267  * If the lock can't be reclaimed, the process is sent SIGLOST, but this is
2268  * not considered an error.
2269  *
2270  * Errors are returned via the nfs4_error_t parameter.
2271  */
2272 static void
2273 reclaim_one_lock(vnode_t *vp, flock64_t *flk, nfs4_error_t *ep,
2274     int *did_reclaimp)
2275 {
2276 	cred_t *cr;
2277 	rnode4_t *rp = VTOR4(vp);
2278 
2279 	cr = pid_to_cr(flk->l_pid);
2280 	if (cr == NULL) {
2281 		nfs4_error_zinit(ep);
2282 		ep->error = ESRCH;
2283 		return;
2284 	}
2285 
2286 	do {
2287 		mutex_enter(&rp->r_statelock);
2288 		if (rp->r_flags & R4RECOVERR) {
2289 			/*
2290 			 * This shouldn't affect other reclaims, so don't
2291 			 * return an error.
2292 			 */
2293 			mutex_exit(&rp->r_statelock);
2294 			break;
2295 		}
2296 		mutex_exit(&rp->r_statelock);
2297 
2298 		nfs4frlock(NFS4_LCK_CTYPE_RECLAIM, vp, F_SETLK, flk,
2299 		    FREAD|FWRITE, 0, cr, ep, NULL, did_reclaimp);
2300 		if (ep->error == 0 && ep->stat == NFS4ERR_FHEXPIRED)
2301 			start_recovery_action(NR_FHEXPIRED, TRUE, VTOMI4(vp),
2302 			    vp, NULL);
2303 	} while (ep->error == 0 && ep->stat == NFS4ERR_FHEXPIRED);
2304 
2305 	crfree(cr);
2306 }
2307 
2308 /*
2309  * Open files.
2310  */
2311 
2312 /*
2313  * Verifies if the nfsstat4 is a valid error for marking this vnode dead.
2314  * Returns 1 if the error is valid; 0 otherwise.
2315  */
2316 static int
2317 nfs4_valid_recov_err_for_vp(vnode_t *vp, nfsstat4 stat)
2318 {
2319 	/*
2320 	 * We should not be marking non-regular files as dead,
2321 	 * except in very rare cases (eg: BADHANDLE or NFS4ERR_BADNAME).
2322 	 */
2323 	if (vp->v_type != VREG && stat != NFS4ERR_BADHANDLE &&
2324 	    stat != NFS4ERR_BADNAME)
2325 		return (0);
2326 
2327 	return (1);
2328 }
2329 
2330 /*
2331  * Failed attempting to recover a filehandle.  If 'stat' is valid for 'vp',
2332  * then mark the object dead.  Since we've had to do a lookup for
2333  * filehandle recovery, we will mark the object dead if we got NOENT.
2334  */
2335 static void
2336 nfs4_recov_fh_fail(vnode_t *vp, int error, nfsstat4 stat)
2337 {
2338 	ASSERT(vp != NULL);
2339 
2340 	if ((error == 0) && (stat != NFS4ERR_NOENT) &&
2341 	    (!nfs4_valid_recov_err_for_vp(vp, stat)))
2342 		return;
2343 
2344 	nfs4_fail_recov(vp, "can't recover filehandle", error, stat);
2345 }
2346 
2347 /*
2348  * Recovery from a "shouldn't happen" error.  In the long term, we'd like
2349  * to mark only the data structure(s) that provided the bad value as being
2350  * bad.  But for now we'll just mark the entire file.
2351  */
2352 
2353 static void
2354 recov_badstate(recov_info_t *recovp, vnode_t *vp, nfsstat4 stat)
2355 {
2356 	ASSERT(vp != NULL);
2357 	recov_throttle(recovp, vp);
2358 
2359 	if (!nfs4_valid_recov_err_for_vp(vp, stat))
2360 		return;
2361 
2362 	nfs4_fail_recov(vp, "", 0, stat);
2363 }
2364 
2365 /*
2366  * Free up the information saved for a lost state request.
2367  */
2368 static void
2369 nfs4_free_lost_rqst(nfs4_lost_rqst_t *lrp, nfs4_server_t *sp)
2370 {
2371 	component4 *filep;
2372 	nfs4_open_stream_t *osp;
2373 	int have_sync_lock;
2374 
2375 	NFS4_DEBUG(nfs4_lost_rqst_debug,
2376 	    (CE_NOTE, "nfs4_free_lost_rqst:"));
2377 
2378 	switch (lrp->lr_op) {
2379 	case OP_OPEN:
2380 		filep = &lrp->lr_ofile;
2381 		if (filep->utf8string_val) {
2382 			kmem_free(filep->utf8string_val, filep->utf8string_len);
2383 			filep->utf8string_val = NULL;
2384 		}
2385 		break;
2386 	case OP_DELEGRETURN:
2387 		nfs4delegreturn_cleanup(VTOR4(lrp->lr_vp), sp);
2388 		break;
2389 	case OP_CLOSE:
2390 		osp = lrp->lr_osp;
2391 		ASSERT(osp != NULL);
2392 		mutex_enter(&osp->os_sync_lock);
2393 		have_sync_lock = 1;
2394 		if (osp->os_pending_close) {
2395 			/* clean up the open file state. */
2396 			osp->os_pending_close = 0;
2397 			nfs4close_notw(lrp->lr_vp, osp, &have_sync_lock);
2398 		}
2399 		if (have_sync_lock)
2400 			mutex_exit(&osp->os_sync_lock);
2401 		break;
2402 	}
2403 
2404 	lrp->lr_op = 0;
2405 	if (lrp->lr_oop != NULL) {
2406 		open_owner_rele(lrp->lr_oop);
2407 		lrp->lr_oop = NULL;
2408 	}
2409 	if (lrp->lr_osp != NULL) {
2410 		open_stream_rele(lrp->lr_osp, VTOR4(lrp->lr_vp));
2411 		lrp->lr_osp = NULL;
2412 	}
2413 	if (lrp->lr_lop != NULL) {
2414 		lock_owner_rele(lrp->lr_lop);
2415 		lrp->lr_lop = NULL;
2416 	}
2417 	if (lrp->lr_flk != NULL) {
2418 		kmem_free(lrp->lr_flk, sizeof (flock64_t));
2419 		lrp->lr_flk = NULL;
2420 	}
2421 	if (lrp->lr_vp != NULL) {
2422 		VN_RELE(lrp->lr_vp);
2423 		lrp->lr_vp = NULL;
2424 	}
2425 	if (lrp->lr_dvp != NULL) {
2426 		VN_RELE(lrp->lr_dvp);
2427 		lrp->lr_dvp = NULL;
2428 	}
2429 	if (lrp->lr_cr != NULL) {
2430 		crfree(lrp->lr_cr);
2431 		lrp->lr_cr = NULL;
2432 	}
2433 
2434 	kmem_free(lrp, sizeof (nfs4_lost_rqst_t));
2435 }
2436 
2437 /*
2438  * Remove any lost state requests and free them.
2439  */
2440 static void
2441 nfs4_remove_lost_rqsts(mntinfo4_t *mi, nfs4_server_t *sp)
2442 {
2443 	nfs4_lost_rqst_t *lrp;
2444 
2445 	mutex_enter(&mi->mi_lock);
2446 	while ((lrp = list_head(&mi->mi_lost_state)) != NULL) {
2447 		list_remove(&mi->mi_lost_state, lrp);
2448 		mutex_exit(&mi->mi_lock);
2449 		nfs4_free_lost_rqst(lrp, sp);
2450 		mutex_enter(&mi->mi_lock);
2451 	}
2452 	mutex_exit(&mi->mi_lock);
2453 }
2454 
2455 /*
2456  * Reopen all the files for the given filesystem and reclaim any locks.
2457  */
2458 
2459 static void
2460 recov_openfiles(recov_info_t *recovp, nfs4_server_t *sp)
2461 {
2462 	mntinfo4_t *mi = recovp->rc_mi;
2463 	nfs4_opinst_t *reopenlist = NULL, *rep;
2464 	nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };
2465 	open_claim_type4 claim;
2466 	int remap;
2467 	char *fail_msg = "No such file or directory on replica";
2468 	rnode4_t *rp;
2469 	fattr4_change pre_change;
2470 
2471 	ASSERT(sp != NULL);
2472 
2473 	/*
2474 	 * This check is to allow a 10ms pause before we reopen files
2475 	 * it should allow the server time to have received the CB_NULL
2476 	 * reply and update its internal structures such that (if
2477 	 * applicable) we are granted a delegation on reopened files.
2478 	 */
2479 	mutex_enter(&sp->s_lock);
2480 	if ((sp->s_flags & (N4S_CB_PINGED | N4S_CB_WAITER)) == 0) {
2481 		sp->s_flags |= N4S_CB_WAITER;
2482 		(void) cv_timedwait(&sp->wait_cb_null, &sp->s_lock,
2483 		    (lbolt + drv_usectohz(N4S_CB_PAUSE_TIME)));
2484 	}
2485 	mutex_exit(&sp->s_lock);
2486 
2487 	(void) nfs_rw_enter_sig(&sp->s_recovlock, RW_READER, 0);
2488 	(void) nfs_rw_enter_sig(&mi->mi_recovlock, RW_WRITER, 0);
2489 
2490 	if (NFS4_VOLATILE_FH(mi)) {
2491 		nfs4_remap_root(mi, &e, 0);
2492 		if (nfs4_needs_recovery(&e, FALSE, mi->mi_vfsp)) {
2493 			(void) nfs4_start_recovery(&e, mi, NULL,
2494 			    NULL, NULL, NULL, OP_LOOKUP, NULL);
2495 		}
2496 	}
2497 
2498 	mutex_enter(&mi->mi_lock);
2499 	if (recovp->rc_srv_reboot || (mi->mi_recovflags & MI4R_SRV_REBOOT))
2500 		claim = CLAIM_PREVIOUS;
2501 	else
2502 		claim = CLAIM_NULL;
2503 	mutex_exit(&mi->mi_lock);
2504 
2505 	if (e.error == 0 && e.stat == NFS4_OK) {
2506 		/*
2507 		 * Get a snapshot of open files in the filesystem.  Note
2508 		 * that new opens will stall until the server's grace
2509 		 * period is done.
2510 		 */
2511 		reopenlist = r4mkopenlist(mi);
2512 
2513 		mutex_enter(&mi->mi_lock);
2514 		remap = mi->mi_recovflags & MI4R_REMAP_FILES;
2515 		mutex_exit(&mi->mi_lock);
2516 		/*
2517 		 * Since we are re-establishing state on the
2518 		 * server, its ok to blow away the saved lost
2519 		 * requests since we don't need to reissue it.
2520 		 */
2521 		nfs4_remove_lost_rqsts(mi, sp);
2522 
2523 		for (rep = reopenlist; rep; rep = rep->re_next) {
2524 
2525 			if (remap) {
2526 				nfs4_remap_file(mi, rep->re_vp,
2527 				    NFS4_REMAP_CKATTRS, &e);
2528 			}
2529 			if (e.error == ENOENT || e.stat == NFS4ERR_NOENT) {
2530 				/*
2531 				 * The current server does not have the file
2532 				 * that is to be remapped.  This is most
2533 				 * likely due to an improperly maintained
2534 				 * replica.   The files that are missing from
2535 				 * the server will be marked dead and logged
2536 				 * in order to make sys admins aware of the
2537 				 * problem.
2538 				 */
2539 				nfs4_fail_recov(rep->re_vp,
2540 				    fail_msg, e.error, e.stat);
2541 				/*
2542 				 * We've already handled the error so clear it.
2543 				 */
2544 				nfs4_error_zinit(&e);
2545 				continue;
2546 			} else if (e.error == 0 && e.stat == NFS4_OK) {
2547 				int j;
2548 
2549 				rp = VTOR4(rep->re_vp);
2550 				mutex_enter(&rp->r_statelock);
2551 				pre_change = rp->r_change;
2552 				mutex_exit(&rp->r_statelock);
2553 
2554 				for (j = 0; j < rep->re_numosp; j++) {
2555 					nfs4_reopen(rep->re_vp, rep->re_osp[j],
2556 					    &e, claim, FALSE, TRUE);
2557 					if (e.error != 0 || e.stat != NFS4_OK)
2558 						break;
2559 				}
2560 				if (nfs4_needs_recovery(&e, TRUE,
2561 				    mi->mi_vfsp)) {
2562 					(void) nfs4_start_recovery(&e, mi,
2563 					    rep->re_vp, NULL, NULL, NULL,
2564 					    OP_OPEN, NULL);
2565 					break;
2566 				}
2567 			}
2568 #ifdef DEBUG
2569 			if (nfs4_recovdelay > 0)
2570 				delay(MSEC_TO_TICK(nfs4_recovdelay * 1000));
2571 #endif
2572 			if (e.error == 0 && e.stat == NFS4_OK)
2573 				relock_file(rep->re_vp, mi, &e, pre_change);
2574 
2575 			if (nfs4_needs_recovery(&e, TRUE, mi->mi_vfsp))
2576 				(void) nfs4_start_recovery(&e, mi,
2577 				    rep->re_vp, NULL, NULL, NULL, OP_LOCK,
2578 				    NULL);
2579 			if (e.error != 0 || e.stat != NFS4_OK)
2580 				break;
2581 		}
2582 
2583 		/*
2584 		 * Check to see if we need to remap files passed in
2585 		 * via the recovery arguments; this will have been
2586 		 * done for open files.  A failure here is not fatal.
2587 		 */
2588 		if (remap) {
2589 			nfs4_error_t ignore;
2590 			nfs4_check_remap(mi, recovp->rc_vp1, NFS4_REMAP_CKATTRS,
2591 			    &ignore);
2592 			nfs4_check_remap(mi, recovp->rc_vp2, NFS4_REMAP_CKATTRS,
2593 			    &ignore);
2594 		}
2595 	}
2596 
2597 	if (e.error == 0 && e.stat == NFS4_OK) {
2598 		mutex_enter(&mi->mi_lock);
2599 		mi->mi_recovflags &= ~(MI4R_REOPEN_FILES | MI4R_REMAP_FILES);
2600 		mutex_exit(&mi->mi_lock);
2601 	}
2602 
2603 	nfs_rw_exit(&mi->mi_recovlock);
2604 	nfs_rw_exit(&sp->s_recovlock);
2605 
2606 	if (reopenlist != NULL)
2607 		r4releopenlist(reopenlist);
2608 }
2609 
2610 /*
2611  * Resend the queued state recovery requests in "rqsts".
2612  */
2613 
2614 static void
2615 nfs4_resend_lost_rqsts(recov_info_t *recovp, nfs4_server_t *sp)
2616 {
2617 	nfs4_lost_rqst_t	*lrp, *tlrp;
2618 	mntinfo4_t		*mi = recovp->rc_mi;
2619 	nfs4_error_t		n4e;
2620 #ifdef NOTYET
2621 	uint32_t		deny_bits = 0;
2622 #endif
2623 
2624 	NFS4_DEBUG(nfs4_lost_rqst_debug, (CE_NOTE, "nfs4_resend_lost_rqsts"));
2625 
2626 	ASSERT(mi != NULL);
2627 	ASSERT(nfs_rw_lock_held(&mi->mi_recovlock, RW_WRITER));
2628 
2629 	mutex_enter(&mi->mi_lock);
2630 	lrp = list_head(&mi->mi_lost_state);
2631 	mutex_exit(&mi->mi_lock);
2632 	while (lrp != NULL) {
2633 		nfs4_error_zinit(&n4e);
2634 		resend_one_op(lrp, &n4e, mi, sp);
2635 		NFS4_DEBUG(nfs4_lost_rqst_debug, (CE_NOTE,
2636 		    "nfs4_resend_lost_rqsts: resend request: for vp %p got "
2637 		    "error %d stat %d", (void *)lrp->lr_vp, n4e.error,
2638 		    n4e.stat));
2639 
2640 		/*
2641 		 * If we get a recovery error that we can actually
2642 		 * recover from (such as ETIMEDOUT, FHEXPIRED), we
2643 		 * return and let the recovery thread redrive the call.
2644 		 * Don't requeue unless the zone is still healthy.
2645 		 */
2646 		if (zone_status_get(curproc->p_zone) < ZONE_IS_SHUTTING_DOWN &&
2647 		    nfs4_needs_recovery(&n4e, TRUE, mi->mi_vfsp) &&
2648 		    (nfs4_try_failover(&n4e) ||
2649 		    NFS4_FRC_UNMT_ERR(n4e.error, mi->mi_vfsp) ||
2650 		    (n4e.error == 0 && n4e.stat != NFS4ERR_BADHANDLE &&
2651 		    !nfs4_recov_marks_dead(n4e.stat)))) {
2652 			/*
2653 			 * For these three errors, we want to delay a bit
2654 			 * instead of pounding the server into submission.
2655 			 * We have to do this manually; the normal
2656 			 * processing for these errors only works for
2657 			 * non-recovery requests.
2658 			 */
2659 			if ((n4e.error == 0 && n4e.stat == NFS4ERR_DELAY) ||
2660 			    (n4e.error == 0 && n4e.stat == NFS4ERR_GRACE) ||
2661 			    (n4e.error == 0 && n4e.stat == NFS4ERR_RESOURCE) ||
2662 			    NFS4_FRC_UNMT_ERR(n4e.error, mi->mi_vfsp)) {
2663 				delay(SEC_TO_TICK(nfs4err_delay_time));
2664 			} else {
2665 				(void) nfs4_start_recovery(&n4e,
2666 				    mi, lrp->lr_dvp, lrp->lr_vp, NULL, NULL,
2667 				    lrp->lr_op, NULL);
2668 			}
2669 			return;
2670 		}
2671 
2672 		mutex_enter(&mi->mi_lock);
2673 		list_remove(&mi->mi_lost_state, lrp);
2674 		tlrp = lrp;
2675 		lrp = list_head(&mi->mi_lost_state);
2676 		mutex_exit(&mi->mi_lock);
2677 		nfs4_free_lost_rqst(tlrp, sp);
2678 	}
2679 }
2680 
2681 /*
2682  * Resend the given op, and issue any necessary undo call.
2683  * errors are returned via the nfs4_error_t parameter.
2684  */
2685 
2686 static void
2687 resend_one_op(nfs4_lost_rqst_t *lrp, nfs4_error_t *ep,
2688     mntinfo4_t *mi, nfs4_server_t *sp)
2689 {
2690 	vnode_t *vp;
2691 	nfs4_open_stream_t *osp;
2692 	cred_t *cr;
2693 	uint32_t acc_bits;
2694 
2695 	vp = lrp->lr_vp;
2696 	NFS4_DEBUG(nfs4_lost_rqst_debug, (CE_NOTE, "resend_one_op: "
2697 	    "have a lost open/close request for vp %p", (void *)vp));
2698 
2699 	switch (lrp->lr_op) {
2700 	case OP_OPEN:
2701 		nfs4_resend_open_otw(&vp, lrp, ep);
2702 		break;
2703 	case OP_OPEN_DOWNGRADE:
2704 		ASSERT(lrp->lr_oop != NULL);
2705 		ep->error = nfs4_start_open_seqid_sync(lrp->lr_oop, mi);
2706 		ASSERT(!ep->error);	/* recov thread always succeeds */
2707 		ASSERT(lrp->lr_osp != NULL);
2708 		mutex_enter(&lrp->lr_osp->os_sync_lock);
2709 		nfs4_open_downgrade(lrp->lr_dg_acc, lrp->lr_dg_deny,
2710 		    lrp->lr_oop, lrp->lr_osp, vp, lrp->lr_cr, lrp,
2711 		    ep, NULL, NULL);
2712 		mutex_exit(&lrp->lr_osp->os_sync_lock);
2713 		nfs4_end_open_seqid_sync(lrp->lr_oop);
2714 		break;
2715 	case OP_CLOSE:
2716 		osp = lrp->lr_osp;
2717 		cr = lrp->lr_cr;
2718 		acc_bits = 0;
2719 		mutex_enter(&osp->os_sync_lock);
2720 		if (osp->os_share_acc_read)
2721 			acc_bits |= OPEN4_SHARE_ACCESS_READ;
2722 		if (osp->os_share_acc_write)
2723 			acc_bits |= OPEN4_SHARE_ACCESS_WRITE;
2724 		mutex_exit(&osp->os_sync_lock);
2725 		nfs4close_one(vp, osp, cr, acc_bits, lrp, ep,
2726 		    CLOSE_RESEND, 0, 0, 0);
2727 		break;
2728 	case OP_LOCK:
2729 	case OP_LOCKU:
2730 		resend_lock(lrp, ep);
2731 		goto done;
2732 	case OP_DELEGRETURN:
2733 		nfs4_resend_delegreturn(lrp, ep, sp);
2734 		goto done;
2735 	default:
2736 #ifdef DEBUG
2737 		cmn_err(CE_PANIC, "resend_one_op: unexpected op: %d",
2738 		    lrp->lr_op);
2739 #endif
2740 		nfs4_queue_event(RE_LOST_STATE_BAD_OP, mi, NULL,
2741 		    lrp->lr_op, lrp->lr_vp, lrp->lr_dvp, NFS4_OK, NULL, 0,
2742 		    TAG_NONE, TAG_NONE, 0, 0);
2743 		nfs4_error_init(ep, EINVAL);
2744 		return;
2745 	}
2746 
2747 	/*
2748 	 * No need to retry nor send an "undo" CLOSE in the
2749 	 * event the server rebooted.
2750 	 */
2751 	if (ep->error == 0 && (ep->stat == NFS4ERR_STALE_CLIENTID ||
2752 	    ep->stat == NFS4ERR_STALE_STATEID || ep->stat == NFS4ERR_EXPIRED))
2753 		goto done;
2754 
2755 	/*
2756 	 * If we resent a CLOSE or OPEN_DOWNGRADE, there's nothing
2757 	 * to undo.  Undoing locking operations was handled by
2758 	 * resend_lock().
2759 	 */
2760 	if (lrp->lr_op == OP_OPEN_DOWNGRADE || lrp->lr_op == OP_CLOSE)
2761 		goto done;
2762 
2763 	/*
2764 	 * If we get any other error for OPEN, then don't attempt
2765 	 * to undo the resend of the open (since it was never
2766 	 * successful!).
2767 	 */
2768 	ASSERT(lrp->lr_op == OP_OPEN);
2769 	if (ep->error || ep->stat != NFS4_OK)
2770 		goto done;
2771 
2772 	/*
2773 	 * Now let's undo our OPEN.
2774 	 */
2775 	nfs4_error_zinit(ep);
2776 	close_after_open_resend(vp, lrp->lr_cr, lrp->lr_oacc, ep);
2777 	NFS4_DEBUG(nfs4_lost_rqst_debug, (CE_NOTE, "resend_one_op: "
2778 	    "nfs4close_one: for vp %p got error %d stat %d",
2779 	    (void *)vp, ep->error, ep->stat));
2780 
2781 done:
2782 	if (vp != lrp->lr_vp)
2783 		VN_RELE(vp);
2784 }
2785 
2786 /*
2787  * Close a file that was opened via a resent OPEN.
2788  * Most errors are passed back to the caller (via the return value and
2789  * *statp), except for FHEXPIRED, which is retried.
2790  *
2791  * It might be conceptually cleaner to push the CLOSE request onto the
2792  * front of the resend queue, rather than sending it here.  That would
2793  * match the way we undo lost lock requests.  On the other
2794  * hand, we've already got something that works, and there's no reason to
2795  * change it at this time.
2796  */
2797 
2798 static void
2799 close_after_open_resend(vnode_t *vp, cred_t *cr, uint32_t acc_bits,
2800     nfs4_error_t *ep)
2801 {
2802 
2803 	for (;;) {
2804 		nfs4close_one(vp, NULL, cr, acc_bits, NULL, ep,
2805 		    CLOSE_AFTER_RESEND, 0, 0, 0);
2806 		if (ep->error == 0 && ep->stat == NFS4_OK)
2807 			break;		/* success; done */
2808 		if (ep->error != 0 || ep->stat != NFS4ERR_FHEXPIRED)
2809 			break;
2810 		/* else retry FHEXPIRED */
2811 	}
2812 
2813 }
2814 
2815 /*
2816  * Resend the given lost lock request.  Return an errno value.  If zero,
2817  * *statp is set to the NFS status code for the call.
2818  *
2819  * Issue a SIGLOST and mark the rnode dead if we get a non-recovery error or
2820  * a recovery error that we don't actually recover from yet (eg: BAD_SEQID).
2821  * Let the recovery thread redrive the call if we get a recovery error that
2822  * we can actually recover from.
2823  */
2824 static void
2825 resend_lock(nfs4_lost_rqst_t *lrp, nfs4_error_t *ep)
2826 {
2827 	bool_t		send_siglost = FALSE;
2828 	vnode_t		*vp = lrp->lr_vp;
2829 
2830 	NFS4_DEBUG(nfs4_lost_rqst_debug, (CE_NOTE, "resend_lock:"));
2831 	ASSERT(lrp->lr_ctype == NFS4_LCK_CTYPE_REINSTATE ||
2832 	    lrp->lr_ctype == NFS4_LCK_CTYPE_RESEND);
2833 
2834 	nfs4frlock(lrp->lr_ctype, vp, F_SETLK,
2835 	    lrp->lr_flk, FREAD|FWRITE, 0, lrp->lr_cr, ep, lrp, NULL);
2836 
2837 	NFS4_DEBUG(nfs4_lost_rqst_debug, (CE_NOTE, "resend_lock: "
2838 	    "nfs4frlock for vp %p returned error %d, stat %d",
2839 	    (void *)vp, ep->error, ep->stat));
2840 
2841 	if (ep->error == 0 && ep->stat == 0)
2842 		goto done;
2843 	if (ep->error == 0 && ep->stat == NFS4ERR_DENIED &&
2844 	    lrp->lr_ctype == NFS4_LCK_CTYPE_RESEND)
2845 		goto done;
2846 
2847 	/*
2848 	 * If we failed with a non-recovery error, send SIGLOST and
2849 	 * mark the file dead.
2850 	 */
2851 	if (!nfs4_needs_recovery(ep, TRUE, vp->v_vfsp))
2852 		send_siglost = TRUE;
2853 	else {
2854 		/*
2855 		 * Done with recovering LOST LOCK in the event the
2856 		 * server rebooted or we've lost the lease.
2857 		 */
2858 		if (ep->error == 0 && (ep->stat == NFS4ERR_STALE_CLIENTID ||
2859 		    ep->stat == NFS4ERR_STALE_STATEID ||
2860 		    ep->stat == NFS4ERR_EXPIRED)) {
2861 			goto done;
2862 		}
2863 
2864 		/*
2865 		 * BAD_STATEID on an unlock indicates that the server has
2866 		 * forgotten about the lock anyway, so act like the call
2867 		 * was successful.
2868 		 */
2869 		if (ep->error == 0 && ep->stat == NFS4ERR_BAD_STATEID &&
2870 		    lrp->lr_op == OP_LOCKU)
2871 			goto done;
2872 
2873 		/*
2874 		 * If we got a recovery error that we don't actually
2875 		 * recover from, send SIGLOST.  If the filesystem was
2876 		 * forcibly unmounted, we skip the SIGLOST because (a) it's
2877 		 * unnecessary noise, and (b) there could be a new process
2878 		 * with the same pid as the one that had generated the lost
2879 		 * state request.
2880 		 */
2881 		if (ep->error == 0 && (ep->stat == NFS4ERR_BADHANDLE ||
2882 		    nfs4_recov_marks_dead(ep->stat))) {
2883 			if (!(vp->v_vfsp->vfs_flag & VFS_UNMOUNTED))
2884 				send_siglost = TRUE;
2885 			goto done;
2886 		}
2887 
2888 		/*
2889 		 * If the filesystem was forcibly unmounted, we
2890 		 * still need to synchronize with the server and
2891 		 * release state.  Try again later.
2892 		 */
2893 		if (NFS4_FRC_UNMT_ERR(ep->error, vp->v_vfsp))
2894 			goto done;
2895 
2896 		/*
2897 		 * If we get a recovery error that we can actually
2898 		 * recover from (such as ETIMEDOUT, FHEXPIRED),
2899 		 * return and let the recovery thread redrive the call.
2900 		 *
2901 		 * For the three errors below, we want to delay a bit
2902 		 * instead of pounding the server into submission.
2903 		 */
2904 		if ((ep->error == 0 && ep->stat == NFS4ERR_DELAY) ||
2905 		    (ep->error == 0 && ep->stat == NFS4ERR_GRACE) ||
2906 		    (ep->error == 0 && ep->stat == NFS4ERR_RESOURCE))
2907 			delay(SEC_TO_TICK(recov_err_delay));
2908 		goto done;
2909 	}
2910 
2911 done:
2912 	if (send_siglost) {
2913 		cred_t *sv_cred;
2914 
2915 		/*
2916 		 * Must be root or the actual thread being issued the
2917 		 * SIGLOST for this to work, so just become root.
2918 		 */
2919 		sv_cred = curthread->t_cred;
2920 		curthread->t_cred = kcred;
2921 		nfs4_send_siglost(lrp->lr_flk->l_pid, VTOMI4(vp), vp, FALSE,
2922 		    ep->error, ep->stat);
2923 		curthread->t_cred = sv_cred;
2924 
2925 		/*
2926 		 * Flush any additional reinstantiation requests for
2927 		 * this operation.  Sending multiple SIGLOSTs to the user
2928 		 * process is unlikely to help and may cause trouble.
2929 		 */
2930 		if (lrp->lr_ctype == NFS4_LCK_CTYPE_REINSTATE)
2931 			flush_reinstate(lrp);
2932 	}
2933 }
2934 
2935 /*
2936  * Remove any lock reinstantiation requests that correspond to the given
2937  * lost request.  We only remove items that follow lrp in the queue,
2938  * assuming that lrp will be removed by the generic lost state code.
2939  */
2940 
2941 static void
2942 flush_reinstate(nfs4_lost_rqst_t *lrp)
2943 {
2944 	vnode_t *vp;
2945 	pid_t pid;
2946 	mntinfo4_t *mi;
2947 	nfs4_lost_rqst_t *nlrp;
2948 
2949 	vp = lrp->lr_vp;
2950 	mi = VTOMI4(vp);
2951 	pid = lrp->lr_flk->l_pid;
2952 
2953 	/*
2954 	 * If there are any more reinstantation requests to get rid of,
2955 	 * they should all be clustered at the front of the lost state
2956 	 * queue.
2957 	 */
2958 	mutex_enter(&mi->mi_lock);
2959 	for (lrp = list_next(&mi->mi_lost_state, lrp); lrp != NULL;
2960 	    lrp = nlrp) {
2961 		nlrp = list_next(&mi->mi_lost_state, lrp);
2962 		if (lrp->lr_op != OP_LOCK && lrp->lr_op != OP_LOCKU)
2963 			break;
2964 		if (lrp->lr_ctype != NFS4_LCK_CTYPE_REINSTATE)
2965 			break;
2966 		ASSERT(lrp->lr_vp == vp);
2967 		ASSERT(lrp->lr_flk->l_pid == pid);
2968 		NFS4_DEBUG(nfs4_lost_rqst_debug, (CE_NOTE,
2969 		    "remove reinstantiation %p", (void *)lrp));
2970 		list_remove(&mi->mi_lost_state, lrp);
2971 		nfs4_free_lost_rqst(lrp, NULL);
2972 	}
2973 	mutex_exit(&mi->mi_lock);
2974 }
2975 
2976 /*
2977  * End of state-specific recovery routines.
2978  */
2979 
2980 /*
2981  * Allocate a lost request struct, initialize it from lost_rqstp (including
2982  * bumping the reference counts for the referenced vnode, etc.), and hang
2983  * it off of recovp.
2984  */
2985 
2986 static void
2987 nfs4_save_lost_rqst(nfs4_lost_rqst_t *lost_rqstp, recov_info_t *recovp,
2988     nfs4_recov_t *action, mntinfo4_t *mi)
2989 {
2990 	nfs4_lost_rqst_t *destp;
2991 
2992 	ASSERT(recovp->rc_lost_rqst == NULL);
2993 
2994 	destp = kmem_alloc(sizeof (nfs4_lost_rqst_t), KM_SLEEP);
2995 	recovp->rc_lost_rqst = destp;
2996 
2997 	if (lost_rqstp->lr_op == OP_LOCK ||
2998 	    lost_rqstp->lr_op == OP_LOCKU) {
2999 		ASSERT(lost_rqstp->lr_lop);
3000 		*action = NR_LOST_LOCK;
3001 		destp->lr_ctype = lost_rqstp->lr_ctype;
3002 		destp->lr_locktype = lost_rqstp->lr_locktype;
3003 	} else if (lost_rqstp->lr_op == OP_OPEN) {
3004 		component4 *srcfp, *destfp;
3005 
3006 		destp->lr_oacc = lost_rqstp->lr_oacc;
3007 		destp->lr_odeny = lost_rqstp->lr_odeny;
3008 		destp->lr_oclaim = lost_rqstp->lr_oclaim;
3009 		if (lost_rqstp->lr_oclaim == CLAIM_DELEGATE_CUR)
3010 			destp->lr_ostateid = lost_rqstp->lr_ostateid;
3011 
3012 		srcfp = &lost_rqstp->lr_ofile;
3013 		destfp = &destp->lr_ofile;
3014 		/*
3015 		 * Consume caller's utf8string
3016 		 */
3017 		destfp->utf8string_len = srcfp->utf8string_len;
3018 		destfp->utf8string_val = srcfp->utf8string_val;
3019 		srcfp->utf8string_len = 0;
3020 		srcfp->utf8string_val = NULL;	/* make sure not reused */
3021 
3022 		*action = NR_LOST_STATE_RQST;
3023 	} else if (lost_rqstp->lr_op == OP_OPEN_DOWNGRADE) {
3024 		destp->lr_dg_acc = lost_rqstp->lr_dg_acc;
3025 		destp->lr_dg_deny = lost_rqstp->lr_dg_deny;
3026 
3027 		*action = NR_LOST_STATE_RQST;
3028 	} else if (lost_rqstp->lr_op == OP_CLOSE) {
3029 		ASSERT(lost_rqstp->lr_oop);
3030 		*action = NR_LOST_STATE_RQST;
3031 	} else if (lost_rqstp->lr_op == OP_DELEGRETURN) {
3032 		*action = NR_LOST_STATE_RQST;
3033 	} else {
3034 #ifdef DEBUG
3035 		cmn_err(CE_PANIC, "nfs4_save_lost_rqst: bad op %d",
3036 		    lost_rqstp->lr_op);
3037 #endif
3038 		nfs4_queue_event(RE_LOST_STATE_BAD_OP, mi, NULL,
3039 		    lost_rqstp->lr_op, lost_rqstp->lr_vp, lost_rqstp->lr_dvp,
3040 		    NFS4_OK, NULL, curproc->p_pid, TAG_NONE, TAG_NONE, 0, 0);
3041 		*action = NR_UNUSED;
3042 		recovp->rc_lost_rqst = NULL;
3043 		kmem_free(destp, sizeof (nfs4_lost_rqst_t));
3044 		return;
3045 	}
3046 
3047 	destp->lr_op = lost_rqstp->lr_op;
3048 	destp->lr_vp = lost_rqstp->lr_vp;
3049 	if (destp->lr_vp)
3050 		VN_HOLD(destp->lr_vp);
3051 	destp->lr_dvp = lost_rqstp->lr_dvp;
3052 	if (destp->lr_dvp)
3053 		VN_HOLD(destp->lr_dvp);
3054 	destp->lr_oop = lost_rqstp->lr_oop;
3055 	if (destp->lr_oop)
3056 		open_owner_hold(destp->lr_oop);
3057 	destp->lr_osp = lost_rqstp->lr_osp;
3058 	if (destp->lr_osp)
3059 		open_stream_hold(destp->lr_osp);
3060 	destp->lr_lop = lost_rqstp->lr_lop;
3061 	if (destp->lr_lop)
3062 		lock_owner_hold(destp->lr_lop);
3063 	destp->lr_cr = lost_rqstp->lr_cr;
3064 	if (destp->lr_cr)
3065 		crhold(destp->lr_cr);
3066 	if (lost_rqstp->lr_flk == NULL)
3067 		destp->lr_flk = NULL;
3068 	else {
3069 		destp->lr_flk = kmem_alloc(sizeof (flock64_t), KM_SLEEP);
3070 		*destp->lr_flk = *lost_rqstp->lr_flk;
3071 	}
3072 	destp->lr_putfirst = lost_rqstp->lr_putfirst;
3073 }
3074 
3075 /*
3076  * Map the given return values (errno and nfs4 status code) to a recovery
3077  * action and fill in the following fields of recovp: rc_action,
3078  * rc_srv_reboot, rc_stateid, rc_lost_rqst.
3079  */
3080 
3081 void
3082 errs_to_action(recov_info_t *recovp,
3083     nfs4_server_t *sp, mntinfo4_t *mi, stateid4 *sidp,
3084     nfs4_lost_rqst_t *lost_rqstp, int unmounted, nfs_opnum4 op,
3085     nfs4_bseqid_entry_t *bsep)
3086 {
3087 	nfs4_recov_t action = NR_UNUSED;
3088 	bool_t reboot = FALSE;
3089 	int try_f;
3090 	int error = recovp->rc_orig_errors.error;
3091 	nfsstat4 stat = recovp->rc_orig_errors.stat;
3092 
3093 	bzero(&recovp->rc_stateid, sizeof (stateid4));
3094 	recovp->rc_lost_rqst = NULL;
3095 	recovp->rc_bseqid_rqst = NULL;
3096 
3097 	try_f = nfs4_try_failover(&recovp->rc_orig_errors) &&
3098 	    FAILOVER_MOUNT4(mi);
3099 
3100 	/*
3101 	 * We start recovery for EINTR only in the lost lock
3102 	 * or lost open/close case.
3103 	 */
3104 
3105 	if (try_f || error == EINTR || (error == EIO && unmounted)) {
3106 		recovp->rc_error = (error != 0 ? error : geterrno4(stat));
3107 		if (lost_rqstp) {
3108 			ASSERT(lost_rqstp->lr_op != 0);
3109 			nfs4_save_lost_rqst(lost_rqstp, recovp, &action, mi);
3110 		}
3111 		if (try_f)
3112 			action = NR_FAILOVER;
3113 	} else if (error != 0) {
3114 		recovp->rc_error = error;
3115 		nfs4_queue_event(RE_UNEXPECTED_ERRNO, mi, NULL, error, NULL,
3116 		    NULL, 0, NULL, 0, TAG_NONE, TAG_NONE, 0, 0);
3117 		action = NR_CLIENTID;
3118 	} else {
3119 		recovp->rc_error = geterrno4(stat);
3120 		switch (stat) {
3121 #ifdef notyet
3122 		case NFS4ERR_LEASE_MOVED:
3123 			action = xxx;
3124 			break;
3125 		case NFS4ERR_MOVED:
3126 			action = xxx;
3127 			break;
3128 #endif
3129 		case NFS4ERR_BADHANDLE:
3130 			action = NR_BADHANDLE;
3131 			break;
3132 		case NFS4ERR_BAD_SEQID:
3133 			if (bsep)
3134 				save_bseqid_rqst(bsep, recovp);
3135 			action = NR_BAD_SEQID;
3136 			break;
3137 		case NFS4ERR_OLD_STATEID:
3138 			action = NR_OLDSTATEID;
3139 			break;
3140 		case NFS4ERR_WRONGSEC:
3141 			action = NR_WRONGSEC;
3142 			break;
3143 		case NFS4ERR_FHEXPIRED:
3144 			action = NR_FHEXPIRED;
3145 			break;
3146 		case NFS4ERR_BAD_STATEID:
3147 			if (sp == NULL || (sp != NULL && inlease(sp))) {
3148 
3149 				action = NR_BAD_STATEID;
3150 				if (sidp)
3151 					recovp->rc_stateid = *sidp;
3152 			} else
3153 				action = NR_CLIENTID;
3154 			break;
3155 		case NFS4ERR_EXPIRED:
3156 			/*
3157 			 * The client's lease has expired, either due
3158 			 * to a network partition or perhaps a client
3159 			 * error.  In either case, try an NR_CLIENTID
3160 			 * style recovery.  reboot remains false, since
3161 			 * there is no evidence the server has rebooted.
3162 			 * This will cause CLAIM_NULL opens and lock
3163 			 * requests without the reclaim bit.
3164 			 */
3165 			action = NR_CLIENTID;
3166 
3167 			DTRACE_PROBE4(nfs4__expired,
3168 			    nfs4_server_t *, sp,
3169 			    mntinfo4_t *, mi,
3170 			    stateid4 *, sidp, int, op);
3171 
3172 			break;
3173 		case NFS4ERR_STALE_CLIENTID:
3174 		case NFS4ERR_STALE_STATEID:
3175 			action = NR_CLIENTID;
3176 			reboot = TRUE;
3177 			break;
3178 		case NFS4ERR_RESOURCE:
3179 			/*
3180 			 * If this had been a FAILOVER mount, then
3181 			 * we'd have tried failover.  Since it's not,
3182 			 * just delay a while and retry.
3183 			 */
3184 			action = NR_DELAY;
3185 			break;
3186 		case NFS4ERR_GRACE:
3187 			action = NR_GRACE;
3188 			break;
3189 		case NFS4ERR_DELAY:
3190 			action = NR_DELAY;
3191 			break;
3192 		case NFS4ERR_STALE:
3193 			action = NR_STALE;
3194 			break;
3195 		default:
3196 			nfs4_queue_event(RE_UNEXPECTED_STATUS, mi, NULL, 0,
3197 			    NULL, NULL, stat, NULL, 0, TAG_NONE, TAG_NONE,
3198 			    0, 0);
3199 			action = NR_CLIENTID;
3200 			break;
3201 		}
3202 	}
3203 
3204 	/* make sure action got set */
3205 	ASSERT(action != NR_UNUSED);
3206 	recovp->rc_srv_reboot = reboot;
3207 	recovp->rc_action = action;
3208 	nfs4_queue_fact(RF_ERR, mi, stat, action, op, reboot, NULL, error,
3209 	    NULL);
3210 }
3211 
3212 /*
3213  * Return the (held) credential for the process with the given pid.
3214  * May return NULL (e.g., process not found).
3215  */
3216 
3217 static cred_t *
3218 pid_to_cr(pid_t pid)
3219 {
3220 	proc_t *p;
3221 	cred_t *cr;
3222 
3223 	mutex_enter(&pidlock);
3224 	if ((p = prfind(pid)) == NULL) {
3225 		mutex_exit(&pidlock);
3226 		return (NULL);
3227 	}
3228 
3229 	mutex_enter(&p->p_crlock);
3230 	crhold(cr = p->p_cred);
3231 	mutex_exit(&p->p_crlock);
3232 	mutex_exit(&pidlock);
3233 
3234 	return (cr);
3235 }
3236 
3237 /*
3238  * Send SIGLOST to the given process and queue the event.
3239  *
3240  * The 'dump' boolean tells us whether this action should dump the
3241  * in-kernel queue of recovery messages or not.
3242  */
3243 
3244 void
3245 nfs4_send_siglost(pid_t pid, mntinfo4_t *mi, vnode_t *vp, bool_t dump,
3246     int error, nfsstat4 stat)
3247 {
3248 	proc_t *p;
3249 
3250 	mutex_enter(&pidlock);
3251 	p = prfind(pid);
3252 	if (p)
3253 		psignal(p, SIGLOST);
3254 	mutex_exit(&pidlock);
3255 	nfs4_queue_event(dump ? RE_SIGLOST : RE_SIGLOST_NO_DUMP, mi,
3256 	    NULL, error, vp, NULL, stat, NULL, pid, TAG_NONE, TAG_NONE, 0, 0);
3257 }
3258 
3259 /*
3260  * Scan the lock list for entries that match the given pid.  Change the
3261  * pid in those that do to NOPID.
3262  */
3263 
3264 static void
3265 relock_skip_pid(locklist_t *llp, pid_t pid)
3266 {
3267 	for (; llp != NULL; llp = llp->ll_next) {
3268 		if (llp->ll_flock.l_pid == pid)
3269 			llp->ll_flock.l_pid = NOPID;
3270 	}
3271 }
3272 
3273 /*
3274  * Mark a file as having failed recovery, after making a last-ditch effort
3275  * to return any delegation.
3276  *
3277  * Sets r_error to EIO or ESTALE for the given vnode.
3278  */
3279 void
3280 nfs4_fail_recov(vnode_t *vp, char *why, int error, nfsstat4 stat)
3281 {
3282 	rnode4_t *rp = VTOR4(vp);
3283 
3284 #ifdef DEBUG
3285 	if (nfs4_fail_recov_stop)
3286 		debug_enter("nfs4_fail_recov");
3287 #endif
3288 
3289 	mutex_enter(&rp->r_statelock);
3290 	if (rp->r_flags & (R4RECOVERR|R4RECOVERRP)) {
3291 		mutex_exit(&rp->r_statelock);
3292 		return;
3293 	}
3294 
3295 	/*
3296 	 * Set R4RECOVERRP to indicate that a recovery error is in
3297 	 * progress.  This will shut down reads and writes at the top
3298 	 * half.  Don't set R4RECOVERR until after we've returned the
3299 	 * delegation, otherwise it will fail.
3300 	 */
3301 
3302 	rp->r_flags |= R4RECOVERRP;
3303 	mutex_exit(&rp->r_statelock);
3304 
3305 	nfs4delegabandon(rp);
3306 
3307 	mutex_enter(&rp->r_statelock);
3308 	rp->r_flags |= (R4RECOVERR | R4STALE);
3309 	rp->r_error = (error == 0 && stat == NFS4ERR_STALE) ? ESTALE : EIO;
3310 	PURGE_ATTRCACHE4_LOCKED(rp);
3311 	if (!(vp->v_vfsp->vfs_flag & VFS_UNMOUNTED))
3312 		nfs4_queue_event(RE_DEAD_FILE, VTOMI4(vp), NULL, error,
3313 		    vp, NULL, stat, why, 0, TAG_NONE, TAG_NONE, 0, 0);
3314 	mutex_exit(&rp->r_statelock);
3315 
3316 	dnlc_purge_vp(vp);
3317 }
3318 
3319 /*
3320  * recov_throttle: if the file had the same recovery action within the
3321  * throttle interval, wait for the throttle interval to finish before
3322  * proceeding.
3323  *
3324  * Side effects: updates the rnode with the current recovery information.
3325  */
3326 
3327 static void
3328 recov_throttle(recov_info_t *recovp, vnode_t *vp)
3329 {
3330 	time_t curtime, time_to_wait;
3331 	rnode4_t *rp = VTOR4(vp);
3332 
3333 	curtime = gethrestime_sec();
3334 
3335 	mutex_enter(&rp->r_statelock);
3336 	NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
3337 	    "recov_throttle: now: (%d, %ld), last: (%d, %ld)",
3338 	    recovp->rc_action, curtime,
3339 	    rp->r_recov_act, rp->r_last_recov));
3340 	if (recovp->rc_action == rp->r_recov_act &&
3341 	    rp->r_last_recov + recov_err_delay > curtime) {
3342 		time_to_wait = rp->r_last_recov + recov_err_delay - curtime;
3343 		mutex_exit(&rp->r_statelock);
3344 		delay(SEC_TO_TICK(time_to_wait));
3345 		curtime = gethrestime_sec();
3346 		mutex_enter(&rp->r_statelock);
3347 	}
3348 
3349 	rp->r_last_recov = curtime;
3350 	rp->r_recov_act = recovp->rc_action;
3351 	mutex_exit(&rp->r_statelock);
3352 }
3353 
3354 /*
3355  * React to NFS4ERR_GRACE by setting the time we'll permit
3356  * the next call to this filesystem.
3357  */
3358 void
3359 nfs4_set_grace_wait(mntinfo4_t *mi)
3360 {
3361 	mutex_enter(&mi->mi_lock);
3362 	/* Mark the time for the future */
3363 	mi->mi_grace_wait = gethrestime_sec() + nfs4err_delay_time;
3364 	mutex_exit(&mi->mi_lock);
3365 }
3366 
3367 /*
3368  * React to MFS4ERR_DELAY by setting the time we'll permit
3369  * the next call to this vnode.
3370  */
3371 void
3372 nfs4_set_delay_wait(vnode_t *vp)
3373 {
3374 	rnode4_t *rp = VTOR4(vp);
3375 
3376 	mutex_enter(&rp->r_statelock);
3377 	/*
3378 	 * Calculate amount we should delay, initial
3379 	 * delay will be short and then we will back off.
3380 	 */
3381 	if (rp->r_delay_interval == 0)
3382 		rp->r_delay_interval = NFS4_INITIAL_DELAY_INTERVAL;
3383 	else
3384 		/* calculate next interval value */
3385 		rp->r_delay_interval =
3386 		    MIN(NFS4_MAX_DELAY_INTERVAL, (rp->r_delay_interval << 1));
3387 	rp->r_delay_wait = gethrestime_sec() + rp->r_delay_interval;
3388 	mutex_exit(&rp->r_statelock);
3389 }
3390 
3391 /*
3392  * The caller is responsible for freeing the returned string.
3393  */
3394 static char *
3395 nfs4_getsrvnames(mntinfo4_t *mi, size_t *len)
3396 {
3397 	servinfo4_t *svp;
3398 	char *srvnames;
3399 	char *namep;
3400 	size_t length;
3401 
3402 	/*
3403 	 * Calculate the length of the string required to hold all
3404 	 * of the server names plus either a comma or a null
3405 	 * character following each individual one.
3406 	 */
3407 	length = 0;
3408 	for (svp = mi->mi_servers; svp != NULL; svp = svp->sv_next) {
3409 		(void) nfs_rw_enter_sig(&svp->sv_lock, RW_READER, 0);
3410 		if (svp->sv_flags & SV4_NOTINUSE) {
3411 			nfs_rw_exit(&svp->sv_lock);
3412 			continue;
3413 		}
3414 		nfs_rw_exit(&svp->sv_lock);
3415 		length += svp->sv_hostnamelen;
3416 	}
3417 
3418 	srvnames = kmem_alloc(length, KM_SLEEP);
3419 
3420 	namep = srvnames;
3421 	for (svp = mi->mi_servers; svp != NULL; svp = svp->sv_next) {
3422 		(void) nfs_rw_enter_sig(&svp->sv_lock, RW_READER, 0);
3423 		if (svp->sv_flags & SV4_NOTINUSE) {
3424 			nfs_rw_exit(&svp->sv_lock);
3425 			continue;
3426 		}
3427 		nfs_rw_exit(&svp->sv_lock);
3428 		(void) strcpy(namep, svp->sv_hostname);
3429 		namep += svp->sv_hostnamelen - 1;
3430 		*namep++ = ',';
3431 	}
3432 	*--namep = '\0';
3433 
3434 	*len = length;
3435 
3436 	return (srvnames);
3437 }
3438 
3439 static void
3440 save_bseqid_rqst(nfs4_bseqid_entry_t *bsep, recov_info_t *recovp)
3441 {
3442 	nfs4_bseqid_entry_t *destp;
3443 
3444 	destp = kmem_alloc(sizeof (nfs4_bseqid_entry_t), KM_SLEEP);
3445 	recovp->rc_bseqid_rqst = destp;
3446 
3447 	if (bsep->bs_oop)
3448 		open_owner_hold(bsep->bs_oop);
3449 	destp->bs_oop = bsep->bs_oop;
3450 	if (bsep->bs_lop)
3451 		lock_owner_hold(bsep->bs_lop);
3452 	destp->bs_lop = bsep->bs_lop;
3453 	if (bsep->bs_vp)
3454 		VN_HOLD(bsep->bs_vp);
3455 	destp->bs_vp = bsep->bs_vp;
3456 	destp->bs_pid = bsep->bs_pid;
3457 	destp->bs_tag = bsep->bs_tag;
3458 	destp->bs_seqid = bsep->bs_seqid;
3459 }
3460 
3461 static void
3462 free_bseqid_rqst(nfs4_bseqid_entry_t *bsep)
3463 {
3464 	if (bsep->bs_oop)
3465 		open_owner_rele(bsep->bs_oop);
3466 	if (bsep->bs_lop)
3467 		lock_owner_rele(bsep->bs_lop);
3468 	if (bsep->bs_vp)
3469 		VN_RELE(bsep->bs_vp);
3470 	kmem_free(bsep, sizeof (nfs4_bseqid_entry_t));
3471 }
3472 
3473 /*
3474  * We don't actually fully recover from NFS4ERR_BAD_SEQID.  We
3475  * simply mark the open owner and open stream (if provided) as "bad".
3476  * Then future uses of these data structures will be limited to basically
3477  * just cleaning up the internal client state (no going OTW).
3478  *
3479  * The result of this is to return errors back to the app/usr when
3480  * we receive NFS4ERR_BAD_SEQID, but also allow future/new calls to
3481  * succeed so progress can be made.
3482  */
3483 void
3484 recov_bad_seqid(recov_info_t *recovp)
3485 {
3486 	mntinfo4_t		*mi = recovp->rc_mi;
3487 	nfs4_open_owner_t	*bad_oop;
3488 	nfs4_lock_owner_t	*bad_lop;
3489 	vnode_t			*vp;
3490 	rnode4_t		*rp = NULL;
3491 	pid_t			pid;
3492 	nfs4_bseqid_entry_t	*bsep, *tbsep;
3493 	int			error;
3494 
3495 	ASSERT(mi != NULL);
3496 	ASSERT(nfs_rw_lock_held(&mi->mi_recovlock, RW_WRITER));
3497 
3498 	mutex_enter(&mi->mi_lock);
3499 	bsep = list_head(&mi->mi_bseqid_list);
3500 	mutex_exit(&mi->mi_lock);
3501 
3502 	/*
3503 	 * Handle all the bad seqid entries on mi's list.
3504 	 */
3505 	while (bsep != NULL) {
3506 		bad_oop = bsep->bs_oop;
3507 		bad_lop = bsep->bs_lop;
3508 		vp = bsep->bs_vp;
3509 		pid = bsep->bs_pid;
3510 
3511 		NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
3512 		    "recov_bad_seqid: mark oop %p lop %p as bad for "
3513 		    "vp %p tag %s pid %d: last good seqid %d for tag %s",
3514 		    (void *)bad_oop, (void *)bad_lop, (void *)vp,
3515 		    nfs4_ctags[bsep->bs_tag].ct_str, pid,
3516 		    bad_oop ?  bad_oop->oo_last_good_seqid : 0,
3517 		    bad_oop ? nfs4_ctags[bad_oop->oo_last_good_op].ct_str :
3518 		    nfs4_ctags[TAG_NONE].ct_str));
3519 
3520 		nfs4_queue_event(RE_BAD_SEQID, mi, NULL,
3521 		    0, vp, NULL, NFS4ERR_BAD_SEQID, NULL, pid, bsep->bs_tag,
3522 		    bad_oop ? bad_oop->oo_last_good_op : TAG_NONE,
3523 		    bsep->bs_seqid, bad_oop ? bad_oop->oo_last_good_seqid : 0);
3524 
3525 		if (bad_oop) {
3526 			/* essentially reset the open owner */
3527 			error = nfs4_start_open_seqid_sync(bad_oop, mi);
3528 			ASSERT(!error);	/* recov thread always succeeds */
3529 			bad_oop->oo_name = nfs4_get_new_oo_name();
3530 			bad_oop->oo_seqid = 0;
3531 			nfs4_end_open_seqid_sync(bad_oop);
3532 		}
3533 
3534 		if (bad_lop) {
3535 			mutex_enter(&bad_lop->lo_lock);
3536 			bad_lop->lo_flags |= NFS4_BAD_SEQID_LOCK;
3537 			mutex_exit(&bad_lop->lo_lock);
3538 
3539 			ASSERT(vp != NULL);
3540 			rp = VTOR4(vp);
3541 			mutex_enter(&rp->r_statelock);
3542 			rp->r_flags |= R4LODANGLERS;
3543 			mutex_exit(&rp->r_statelock);
3544 
3545 			nfs4_send_siglost(pid, mi, vp, TRUE,
3546 			    0, NFS4ERR_BAD_SEQID);
3547 		}
3548 
3549 		mutex_enter(&mi->mi_lock);
3550 		list_remove(&mi->mi_bseqid_list, bsep);
3551 		tbsep = bsep;
3552 		bsep = list_head(&mi->mi_bseqid_list);
3553 		mutex_exit(&mi->mi_lock);
3554 		free_bseqid_rqst(tbsep);
3555 	}
3556 
3557 	mutex_enter(&mi->mi_lock);
3558 	mi->mi_recovflags &= ~MI4R_BAD_SEQID;
3559 	mutex_exit(&mi->mi_lock);
3560 }
3561