xref: /titanic_50/usr/src/uts/common/fs/nfs/nfs4_recovery.c (revision 03100a6332bd4edc7a53091fcf7c9a7131bcdaa7)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 #pragma ident	"%Z%%M%	%I%	%E% SMI"
27 
28 /*
29  * NFS Version 4 state recovery code.
30  */
31 
32 #include <nfs/nfs4_clnt.h>
33 #include <nfs/nfs4.h>
34 #include <nfs/rnode4.h>
35 #include <sys/cmn_err.h>
36 #include <sys/cred.h>
37 #include <sys/systm.h>
38 #include <sys/flock.h>
39 #include <sys/dnlc.h>
40 #include <sys/ddi.h>
41 #include <sys/disp.h>
42 #include <sys/list.h>
43 #include <sys/sdt.h>
44 
45 extern r4hashq_t *rtable4;
46 
47 /*
48  * Information that describes what needs to be done for recovery.  It is
49  * passed to a client recovery thread as well as passed to various recovery
50  * routines.  rc_mi, rc_vp1, and rc_vp2 refer to the filesystem and
51  * vnode(s) affected by recovery.  rc_vp1 and rc_vp2 are references (use
52  * VN_HOLD) or NULL.  rc_lost_rqst contains information about the lost
53  * lock or open/close request, and it holds reference counts for the
54  * various objects (vnode, etc.).  The recovery thread also uses flags set
55  * in the mntinfo4_t or vnode_t to tell it what to do.  rc_error is used
56  * to save the error that originally triggered the recovery event -- will
57  * later be used to set mi_error if recovery doesn't work.  rc_bseqid_rqst
58  * contains information about the request that got NFS4ERR_BAD_SEQID, and
59  * it holds reference count for the various objects (vnode, open owner,
60  * open stream, lock owner).
61  */
62 
63 typedef struct {
64 	mntinfo4_t *rc_mi;
65 	vnode_t *rc_vp1;
66 	vnode_t *rc_vp2;
67 	nfs4_recov_t rc_action;
68 	stateid4 rc_stateid;
69 	bool_t rc_srv_reboot;		/* server has rebooted */
70 	nfs4_lost_rqst_t *rc_lost_rqst;
71 	nfs4_error_t rc_orig_errors;	/* original errors causing recovery */
72 	int rc_error;
73 	nfs4_bseqid_entry_t *rc_bseqid_rqst;
74 } recov_info_t;
75 
76 /*
77  * How long to wait before trying again if there is an error doing
78  * recovery, in seconds.
79  */
80 
81 static int recov_err_delay = 1;
82 
83 /*
84  * How long to wait when processing NFS4ERR_GRACE or NFS4ERR_DELAY
85  * errors.  Expressed in seconds.  Default is defined as
86  * NFS4ERR_DELAY_TIME and this variable is initialized in nfs4_subr_init()
87  */
88 time_t nfs4err_delay_time = 0;
89 
90 /*
91  * Tuneable to limit how many time "exempt" ops go OTW
92  * after a recovery error.  Exempt op hints are OH_CLOSE,
93  * OH_LOCKU, OH_DELEGRETURN.  These previously always went
94  * OTW even after rnode was "dead" due to recovery errors.
95  *
96  * The tuneable below limits the number of times a start_fop
97  * invocation will retry the exempt hints.  After the limit
98  * is reached, nfs4_start_fop will return an error just like
99  * it would for non-exempt op hints.
100  */
101 int nfs4_max_recov_error_retry = 3;
102 
103 /*
104  * Number of seconds the recovery thread should pause before retry when the
105  * filesystem has been forcibly unmounted.
106  */
107 
108 int nfs4_unmount_delay = 1;
109 
110 #ifdef DEBUG
111 
112 /*
113  * How long to wait (in seconds) between recovery operations on a given
114  * file.  Normally zero, but could be set longer for testing purposes.
115  */
116 static int nfs4_recovdelay = 0;
117 
118 /*
119  * Switch that controls whether to go into the debugger when recovery
120  * fails.
121  */
122 static int nfs4_fail_recov_stop = 0;
123 
124 /*
125  * Tuneables to debug client namespace interaction with server
126  * mount points:
127  *
128  *	nfs4_srvmnt_fail_cnt:
129  *		number of times EACCES returned because client
130  *		attempted to cross server mountpoint
131  *
132  *	nfs4_srvmnt_debug:
133  *		trigger console printf whenever client attempts
134  *		to cross server mountpoint
135  */
136 int nfs4_srvmnt_fail_cnt = 0;
137 int nfs4_srvmnt_debug = 0;
138 #endif
139 
140 /* forward references, in alphabetic order */
141 static void close_after_open_resend(vnode_t *, cred_t *, uint32_t,
142 	nfs4_error_t *);
143 static void errs_to_action(recov_info_t *,
144 	nfs4_server_t *, mntinfo4_t *, stateid4 *, nfs4_lost_rqst_t *, int,
145 	nfs_opnum4, nfs4_bseqid_entry_t *);
146 static void flush_reinstate(nfs4_lost_rqst_t *);
147 static void free_milist(mntinfo4_t **, int);
148 static mntinfo4_t **make_milist(nfs4_server_t *, int *);
149 static int nfs4_check_recov_err(vnode_t *, nfs4_op_hint_t,
150 	nfs4_recov_state_t *, int, char *);
151 static char *nfs4_getsrvnames(mntinfo4_t *, size_t *);
152 static void nfs4_recov_fh_fail(vnode_t *, int, nfsstat4);
153 static void nfs4_recov_thread(recov_info_t *);
154 static void nfs4_remove_lost_rqsts(mntinfo4_t *, nfs4_server_t *);
155 static void nfs4_resend_lost_rqsts(recov_info_t *, nfs4_server_t *);
156 static cred_t *pid_to_cr(pid_t);
157 static void reclaim_one_lock(vnode_t *, flock64_t *, nfs4_error_t *, int *);
158 static void recov_bad_seqid(recov_info_t *);
159 static void recov_badstate(recov_info_t *, vnode_t *, nfsstat4);
160 static void recov_clientid(recov_info_t *, nfs4_server_t *);
161 static void recov_done(mntinfo4_t *, recov_info_t *);
162 static void recov_filehandle(nfs4_recov_t, mntinfo4_t *, vnode_t *);
163 static void recov_newserver(recov_info_t *, nfs4_server_t **, bool_t *);
164 static void recov_openfiles(recov_info_t *, nfs4_server_t *);
165 static void recov_stale(mntinfo4_t *, vnode_t *);
166 static void nfs4_free_lost_rqst(nfs4_lost_rqst_t *, nfs4_server_t *);
167 static void recov_throttle(recov_info_t *, vnode_t *);
168 static void relock_skip_pid(locklist_t *, pid_t);
169 static void resend_lock(nfs4_lost_rqst_t *, nfs4_error_t *);
170 static void resend_one_op(nfs4_lost_rqst_t *, nfs4_error_t *, mntinfo4_t *,
171 	nfs4_server_t *);
172 static void save_bseqid_rqst(nfs4_bseqid_entry_t *, recov_info_t *);
173 static void start_recovery(recov_info_t *, mntinfo4_t *, vnode_t *, vnode_t *,
174 	nfs4_server_t *);
175 static void start_recovery_action(nfs4_recov_t, bool_t, mntinfo4_t *, vnode_t *,
176 	vnode_t *);
177 static int wait_for_recovery(mntinfo4_t *, nfs4_op_hint_t);
178 
179 /*
180  * Return non-zero if the given errno, status, and rpc status codes
181  * in the nfs4_error_t indicate that client recovery is needed.
182  * "stateful" indicates whether the call that got the error establishes or
183  * removes state on the server (open, close, lock, unlock, delegreturn).
184  */
185 
186 int
187 nfs4_needs_recovery(nfs4_error_t *ep, bool_t stateful, vfs_t *vfsp)
188 {
189 	int recov = 0;
190 	mntinfo4_t *mi;
191 
192 	/*
193 	 * Try failover if the error values justify it and if
194 	 * it's a failover mount.  Don't try if the mount is in
195 	 * progress, failures are handled explicitly by nfs4rootvp.
196 	 */
197 	if (nfs4_try_failover(ep)) {
198 		mi = VFTOMI4(vfsp);
199 		mutex_enter(&mi->mi_lock);
200 		recov = FAILOVER_MOUNT4(mi) && !(mi->mi_flags & MI4_MOUNTING);
201 		mutex_exit(&mi->mi_lock);
202 		if (recov)
203 			return (recov);
204 	}
205 
206 	if (ep->error == EINTR || NFS4_FRC_UNMT_ERR(ep->error, vfsp)) {
207 		/*
208 		 * The server may have gotten the request, so for stateful
209 		 * ops we need to resynchronize and possibly back out the
210 		 * op.
211 		 */
212 		return (stateful);
213 	}
214 	if (ep->error != 0)
215 		return (0);
216 
217 	/* stat values are listed alphabetically */
218 	/*
219 	 * There are two lists here: the errors for which we have code, and
220 	 * the errors for which we plan to have code before FCS.  For the
221 	 * second list, print a warning message but don't attempt recovery.
222 	 */
223 	switch (ep->stat) {
224 	case NFS4ERR_BADHANDLE:
225 	case NFS4ERR_BAD_SEQID:
226 	case NFS4ERR_BAD_STATEID:
227 	case NFS4ERR_DELAY:
228 	case NFS4ERR_EXPIRED:
229 	case NFS4ERR_FHEXPIRED:
230 	case NFS4ERR_GRACE:
231 	case NFS4ERR_OLD_STATEID:
232 	case NFS4ERR_RESOURCE:
233 	case NFS4ERR_STALE_CLIENTID:
234 	case NFS4ERR_STALE_STATEID:
235 	case NFS4ERR_WRONGSEC:
236 	case NFS4ERR_STALE:
237 		recov = 1;
238 		break;
239 #ifdef DEBUG
240 	case NFS4ERR_LEASE_MOVED:
241 	case NFS4ERR_MOVED:
242 		zcmn_err(VFTOMI4(vfsp)->mi_zone->zone_id,
243 		    CE_WARN, "!Can't yet recover from NFS status %d",
244 		    ep->stat);
245 		break;
246 #endif
247 	}
248 
249 	return (recov);
250 }
251 
252 /*
253  * Some operations such as DELEGRETURN want to avoid invoking
254  * recovery actions that will only mark the file dead.  If
255  * better handlers are invoked for any of these errors, this
256  * routine should be modified.
257  */
258 int
259 nfs4_recov_marks_dead(nfsstat4 status)
260 {
261 	if (status == NFS4ERR_BAD_SEQID ||
262 	    status == NFS4ERR_EXPIRED ||
263 	    status == NFS4ERR_BAD_STATEID ||
264 	    status == NFS4ERR_OLD_STATEID)
265 		return (1);
266 	return (0);
267 }
268 
269 /*
270  * Transfer the state recovery information in recovp to mi's resend queue,
271  * and mark mi as having a lost state request.
272  */
273 static void
274 nfs4_enqueue_lost_rqst(recov_info_t *recovp, mntinfo4_t *mi)
275 {
276 	nfs4_lost_rqst_t *lrp = recovp->rc_lost_rqst;
277 
278 	ASSERT(nfs_rw_lock_held(&mi->mi_recovlock, RW_READER) ||
279 	    nfs_rw_lock_held(&mi->mi_recovlock, RW_WRITER));
280 
281 	ASSERT(lrp != NULL && lrp->lr_op != 0);
282 
283 	NFS4_DEBUG(nfs4_lost_rqst_debug, (CE_NOTE,
284 	    "nfs4_enqueue_lost_rqst %p, op %d",
285 	    (void *)lrp, lrp->lr_op));
286 
287 	mutex_enter(&mi->mi_lock);
288 	mi->mi_recovflags |= MI4R_LOST_STATE;
289 	if (lrp->lr_putfirst)
290 		list_insert_head(&mi->mi_lost_state, lrp);
291 	else
292 		list_insert_tail(&mi->mi_lost_state, lrp);
293 	recovp->rc_lost_rqst = NULL;
294 	mutex_exit(&mi->mi_lock);
295 
296 	nfs4_queue_event(RE_LOST_STATE, mi, NULL, lrp->lr_op, lrp->lr_vp,
297 	    lrp->lr_dvp, 0, NULL, 0, TAG_NONE, TAG_NONE, 0, 0);
298 }
299 
300 /*
301  * Transfer the bad seqid recovery information in recovp to mi's
302  * bad seqid queue, and mark mi as having a bad seqid request.
303  */
304 void
305 enqueue_bseqid_rqst(recov_info_t *recovp, mntinfo4_t *mi)
306 {
307 	ASSERT(nfs_rw_lock_held(&mi->mi_recovlock, RW_READER) ||
308 	    nfs_rw_lock_held(&mi->mi_recovlock, RW_WRITER));
309 	ASSERT(recovp->rc_bseqid_rqst != NULL);
310 
311 	mutex_enter(&mi->mi_lock);
312 	mi->mi_recovflags |= MI4R_BAD_SEQID;
313 	list_insert_tail(&mi->mi_bseqid_list, recovp->rc_bseqid_rqst);
314 	recovp->rc_bseqid_rqst = NULL;
315 	mutex_exit(&mi->mi_lock);
316 }
317 
318 /*
319  * Initiate recovery.
320  *
321  * The nfs4_error_t contains the return codes that triggered a recovery
322  * attempt.  mi, vp1, and vp2 refer to the filesystem and files that were
323  * being operated on.  vp1 and vp2 may be NULL.
324  *
325  * Multiple calls are okay.  If recovery is already underway, the call
326  * updates the information about what state needs recovery but does not
327  * start a new thread.  The caller should hold mi->mi_recovlock as a reader
328  * for proper synchronization with any recovery thread.
329  *
330  * This will return TRUE if recovery was aborted, and FALSE otherwise.
331  */
332 bool_t
333 nfs4_start_recovery(nfs4_error_t *ep, mntinfo4_t *mi, vnode_t *vp1,
334     vnode_t *vp2, stateid4 *sid, nfs4_lost_rqst_t *lost_rqstp, nfs_opnum4 op,
335     nfs4_bseqid_entry_t *bsep)
336 {
337 	recov_info_t *recovp;
338 	nfs4_server_t *sp;
339 	bool_t abort = FALSE;
340 	bool_t gone = FALSE;
341 
342 	ASSERT(nfs_zone() == mi->mi_zone);
343 	mutex_enter(&mi->mi_lock);
344 	/*
345 	 * If there is lost state, we need to kick off recovery even if the
346 	 * filesystem has been unmounted or the zone is shutting down.
347 	 */
348 	gone = FS_OR_ZONE_GONE4(mi->mi_vfsp);
349 	if (gone) {
350 		ASSERT(ep->error != EINTR || lost_rqstp != NULL);
351 		if (ep->error == EIO && lost_rqstp == NULL) {
352 			/* failed due to forced unmount, no new lost state */
353 			abort = TRUE;
354 		}
355 		if ((ep->error == 0 || ep->error == ETIMEDOUT) &&
356 		    !(mi->mi_recovflags & MI4R_LOST_STATE)) {
357 			/* some other failure, no existing lost state */
358 			abort = TRUE;
359 		}
360 		if (abort) {
361 			mutex_exit(&mi->mi_lock);
362 			NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
363 			    "nfs4_start_recovery: fs unmounted"));
364 			return (TRUE);
365 		}
366 	}
367 	mi->mi_in_recovery++;
368 	mutex_exit(&mi->mi_lock);
369 
370 	recovp = kmem_alloc(sizeof (recov_info_t), KM_SLEEP);
371 	recovp->rc_orig_errors = *ep;
372 	sp = find_nfs4_server(mi);
373 	errs_to_action(recovp, sp, mi, sid, lost_rqstp, gone, op, bsep);
374 	if (sp != NULL)
375 		mutex_exit(&sp->s_lock);
376 	start_recovery(recovp, mi, vp1, vp2, sp);
377 	if (sp != NULL)
378 		nfs4_server_rele(sp);
379 	return (FALSE);
380 }
381 
382 /*
383  * Internal version of nfs4_start_recovery.  The difference is that the
384  * caller specifies the recovery action, rather than the errors leading to
385  * recovery.
386  */
387 static void
388 start_recovery_action(nfs4_recov_t what, bool_t reboot, mntinfo4_t *mi,
389     vnode_t *vp1, vnode_t *vp2)
390 {
391 	recov_info_t *recovp;
392 
393 	ASSERT(nfs_zone() == mi->mi_zone);
394 	mutex_enter(&mi->mi_lock);
395 	mi->mi_in_recovery++;
396 	mutex_exit(&mi->mi_lock);
397 
398 	recovp = kmem_zalloc(sizeof (recov_info_t), KM_SLEEP);
399 	recovp->rc_action = what;
400 	recovp->rc_srv_reboot = reboot;
401 	recovp->rc_error = EIO;
402 	start_recovery(recovp, mi, vp1, vp2, NULL);
403 }
404 
405 static void
406 start_recovery(recov_info_t *recovp, mntinfo4_t *mi,
407     vnode_t *vp1, vnode_t *vp2, nfs4_server_t *sp)
408 {
409 	NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
410 	    "start_recovery: mi %p, what %s", (void*)mi,
411 	    nfs4_recov_action_to_str(recovp->rc_action)));
412 
413 	/*
414 	 * Bump the reference on the vfs so that we can pass it to the
415 	 * recovery thread.
416 	 */
417 	VFS_HOLD(mi->mi_vfsp);
418 	MI4_HOLD(mi);
419 again:
420 	switch (recovp->rc_action) {
421 	case NR_FAILOVER:
422 		ASSERT(nfs_rw_lock_held(&mi->mi_recovlock, RW_READER) ||
423 		    nfs_rw_lock_held(&mi->mi_recovlock, RW_WRITER));
424 		if (mi->mi_servers->sv_next == NULL)
425 			goto out_no_thread;
426 		mutex_enter(&mi->mi_lock);
427 		mi->mi_recovflags |= MI4R_NEED_NEW_SERVER;
428 		mutex_exit(&mi->mi_lock);
429 
430 		if (recovp->rc_lost_rqst != NULL)
431 			nfs4_enqueue_lost_rqst(recovp, mi);
432 		break;
433 
434 	case NR_CLIENTID:
435 		/*
436 		 * If the filesystem has been unmounted, punt.
437 		 */
438 		if (sp == NULL)
439 			goto out_no_thread;
440 
441 		/*
442 		 * If nobody else is working on the clientid, mark the
443 		 * clientid as being no longer set.  Then mark the specific
444 		 * filesystem being worked on.
445 		 */
446 		if (!nfs4_server_in_recovery(sp)) {
447 			mutex_enter(&sp->s_lock);
448 			sp->s_flags &= ~N4S_CLIENTID_SET;
449 			mutex_exit(&sp->s_lock);
450 		}
451 		ASSERT(nfs_rw_lock_held(&mi->mi_recovlock, RW_READER) ||
452 		    nfs_rw_lock_held(&mi->mi_recovlock, RW_WRITER));
453 		mutex_enter(&mi->mi_lock);
454 		mi->mi_recovflags |= MI4R_NEED_CLIENTID;
455 		if (recovp->rc_srv_reboot)
456 			mi->mi_recovflags |= MI4R_SRV_REBOOT;
457 		mutex_exit(&mi->mi_lock);
458 		break;
459 
460 	case NR_OPENFILES:
461 		ASSERT(nfs_rw_lock_held(&mi->mi_recovlock, RW_READER) ||
462 		    nfs_rw_lock_held(&mi->mi_recovlock, RW_WRITER));
463 		mutex_enter(&mi->mi_lock);
464 		mi->mi_recovflags |= MI4R_REOPEN_FILES;
465 		if (recovp->rc_srv_reboot)
466 			mi->mi_recovflags |= MI4R_SRV_REBOOT;
467 		mutex_exit(&mi->mi_lock);
468 		break;
469 
470 	case NR_WRONGSEC:
471 		ASSERT(nfs_rw_lock_held(&mi->mi_recovlock, RW_READER) ||
472 		    nfs_rw_lock_held(&mi->mi_recovlock, RW_WRITER));
473 		mutex_enter(&mi->mi_lock);
474 		mi->mi_recovflags |= MI4R_NEED_SECINFO;
475 		mutex_exit(&mi->mi_lock);
476 		break;
477 
478 	case NR_EXPIRED:
479 		if (vp1 != NULL)
480 			recov_badstate(recovp, vp1, NFS4ERR_EXPIRED);
481 		if (vp2 != NULL)
482 			recov_badstate(recovp, vp2, NFS4ERR_EXPIRED);
483 		goto out_no_thread;	/* no further recovery possible */
484 
485 	case NR_BAD_STATEID:
486 		if (vp1 != NULL)
487 			recov_badstate(recovp, vp1, NFS4ERR_BAD_STATEID);
488 		if (vp2 != NULL)
489 			recov_badstate(recovp, vp2, NFS4ERR_BAD_STATEID);
490 		goto out_no_thread;	/* no further recovery possible */
491 
492 	case NR_FHEXPIRED:
493 	case NR_BADHANDLE:
494 		if (vp1 != NULL)
495 			recov_throttle(recovp, vp1);
496 		if (vp2 != NULL)
497 			recov_throttle(recovp, vp2);
498 		/*
499 		 * Recover the filehandle now, rather than using a
500 		 * separate thread.  We can do this because filehandle
501 		 * recovery is independent of any other state, and because
502 		 * we know that we are not competing with the recovery
503 		 * thread at this time.  recov_filehandle will deal with
504 		 * threads that are competing to recover this filehandle.
505 		 */
506 		ASSERT(nfs_rw_lock_held(&mi->mi_recovlock, RW_READER) ||
507 		    nfs_rw_lock_held(&mi->mi_recovlock, RW_WRITER));
508 		if (vp1 != NULL)
509 			recov_filehandle(recovp->rc_action, mi, vp1);
510 		if (vp2 != NULL)
511 			recov_filehandle(recovp->rc_action, mi, vp2);
512 		goto out_no_thread;	/* no further recovery needed */
513 
514 	case NR_STALE:
515 		/*
516 		 * NFS4ERR_STALE handling
517 		 * recov_stale() could set MI4R_NEED_NEW_SERVER to
518 		 * indicate that we can and should failover.
519 		 */
520 		ASSERT(nfs_rw_lock_held(&mi->mi_recovlock, RW_READER) ||
521 		    nfs_rw_lock_held(&mi->mi_recovlock, RW_WRITER));
522 
523 		if (vp1 != NULL)
524 			recov_stale(mi, vp1);
525 		if (vp2 != NULL)
526 			recov_stale(mi, vp2);
527 		mutex_enter(&mi->mi_lock);
528 		if ((mi->mi_recovflags & MI4R_NEED_NEW_SERVER) == 0) {
529 			mutex_exit(&mi->mi_lock);
530 			goto out_no_thread;
531 		}
532 		mutex_exit(&mi->mi_lock);
533 		recovp->rc_action = NR_FAILOVER;
534 		goto again;
535 
536 	case NR_BAD_SEQID:
537 		if (recovp->rc_bseqid_rqst) {
538 			enqueue_bseqid_rqst(recovp, mi);
539 			break;
540 		}
541 
542 		if (vp1 != NULL)
543 			recov_badstate(recovp, vp1, NFS4ERR_BAD_SEQID);
544 		if (vp2 != NULL)
545 			recov_badstate(recovp, vp2, NFS4ERR_BAD_SEQID);
546 		goto out_no_thread; /* no further recovery possible */
547 
548 	case NR_OLDSTATEID:
549 		if (vp1 != NULL)
550 			recov_badstate(recovp, vp1, NFS4ERR_OLD_STATEID);
551 		if (vp2 != NULL)
552 			recov_badstate(recovp, vp2, NFS4ERR_OLD_STATEID);
553 		goto out_no_thread;	/* no further recovery possible */
554 
555 	case NR_GRACE:
556 		nfs4_set_grace_wait(mi);
557 		goto out_no_thread; /* no further action required for GRACE */
558 
559 	case NR_DELAY:
560 		if (vp1)
561 			nfs4_set_delay_wait(vp1);
562 		goto out_no_thread; /* no further action required for DELAY */
563 
564 	case NR_LOST_STATE_RQST:
565 	case NR_LOST_LOCK:
566 		nfs4_enqueue_lost_rqst(recovp, mi);
567 		break;
568 
569 	default:
570 		nfs4_queue_event(RE_UNEXPECTED_ACTION, mi, NULL,
571 		    recovp->rc_action, NULL, NULL, 0, NULL, 0, TAG_NONE,
572 		    TAG_NONE, 0, 0);
573 		goto out_no_thread;
574 	}
575 
576 	/*
577 	 * If either file recently went through the same recovery, wait
578 	 * awhile.  This is in case there is some sort of bug; we might not
579 	 * be able to recover properly, but at least we won't bombard the
580 	 * server with calls, and we won't tie up the client.
581 	 */
582 	if (vp1 != NULL)
583 		recov_throttle(recovp, vp1);
584 	if (vp2 != NULL)
585 		recov_throttle(recovp, vp2);
586 
587 	/*
588 	 * If there's already a recovery thread, don't start another one.
589 	 */
590 
591 	mutex_enter(&mi->mi_lock);
592 	if (mi->mi_flags & MI4_RECOV_ACTIV) {
593 		mutex_exit(&mi->mi_lock);
594 		goto out_no_thread;
595 	}
596 	mi->mi_flags |= MI4_RECOV_ACTIV;
597 	mutex_exit(&mi->mi_lock);
598 	NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
599 	    "start_recovery: starting new thread for mi %p", (void*)mi));
600 
601 	recovp->rc_mi = mi;
602 	recovp->rc_vp1 = vp1;
603 	if (vp1 != NULL) {
604 		ASSERT(VTOMI4(vp1) == mi);
605 		VN_HOLD(recovp->rc_vp1);
606 	}
607 	recovp->rc_vp2 = vp2;
608 	if (vp2 != NULL) {
609 		ASSERT(VTOMI4(vp2) == mi);
610 		VN_HOLD(recovp->rc_vp2);
611 	}
612 
613 	(void) zthread_create(NULL, 0, nfs4_recov_thread, recovp, 0,
614 	    minclsyspri);
615 	return;
616 
617 	/* not reached by thread creating call */
618 out_no_thread:
619 	mutex_enter(&mi->mi_lock);
620 	mi->mi_in_recovery--;
621 	if (mi->mi_in_recovery == 0)
622 		cv_broadcast(&mi->mi_cv_in_recov);
623 	mutex_exit(&mi->mi_lock);
624 
625 	VFS_RELE(mi->mi_vfsp);
626 	MI4_RELE(mi);
627 	/*
628 	 * Free up resources that were allocated for us.
629 	 */
630 	kmem_free(recovp, sizeof (recov_info_t));
631 }
632 
633 static int
634 nfs4_check_recov_err(vnode_t *vp, nfs4_op_hint_t op,
635     nfs4_recov_state_t *rsp, int retry_err_cnt, char *str)
636 {
637 	rnode4_t *rp;
638 	int error = 0;
639 	int exempt;
640 
641 	if (vp == NULL)
642 		return (0);
643 
644 	exempt = (op == OH_CLOSE || op == OH_LOCKU || op == OH_DELEGRETURN);
645 	rp = VTOR4(vp);
646 	mutex_enter(&rp->r_statelock);
647 
648 	/*
649 	 * If there was a recovery error, then allow op hints "exempt" from
650 	 * recov errors to retry (currently 3 times).  Either r_error or
651 	 * EIO is returned for non-exempt op hints.
652 	 */
653 	if (rp->r_flags & R4RECOVERR) {
654 		if (exempt && rsp->rs_num_retry_despite_err <=
655 		    nfs4_max_recov_error_retry) {
656 
657 			/*
658 			 * Check to make sure that we haven't already inc'd
659 			 * rs_num_retry_despite_err for current nfs4_start_fop
660 			 * instance.  We don't want to double inc (if we were
661 			 * called with vp2, then the vp1 call could have
662 			 * already incremented.
663 			 */
664 			if (retry_err_cnt == rsp->rs_num_retry_despite_err)
665 				rsp->rs_num_retry_despite_err++;
666 
667 			NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
668 			    "nfs4_start_fop: %s %p DEAD, cnt=%d", str,
669 			    (void *)vp, rsp->rs_num_retry_despite_err));
670 		} else {
671 			error = (rp->r_error ? rp->r_error : EIO);
672 			/*
673 			 * An ESTALE error on a non-regular file is not
674 			 * "sticky".  Return the ESTALE error once, but
675 			 * clear the condition to allow future operations
676 			 * to go OTW.  This will allow the client to
677 			 * recover if the server has merely unshared then
678 			 * re-shared the file system.  For regular files,
679 			 * the unshare has destroyed the open state at the
680 			 * server and we aren't willing to do a reopen (yet).
681 			 */
682 			if (error == ESTALE && vp->v_type != VREG) {
683 				rp->r_flags &=
684 				    ~(R4RECOVERR|R4RECOVERRP|R4STALE);
685 				rp->r_error = 0;
686 				error = ESTALE;
687 			}
688 			NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
689 			    "nfs4_start_fop: %s %p DEAD, cnt=%d error=%d",
690 			    str, (void *)vp,
691 			    rsp->rs_num_retry_despite_err, error));
692 		}
693 	}
694 
695 	mutex_exit(&rp->r_statelock);
696 	return (error);
697 }
698 
699 /*
700  * Initial setup code that every operation should call if it might invoke
701  * client recovery.  Can block waiting for recovery to finish on a
702  * filesystem.  Either vnode ptr can be NULL.
703  *
704  * Returns 0 if there are no outstanding errors.  Can return an
705  * errno value under various circumstances (e.g., failed recovery, or
706  * interrupted while waiting for recovery to finish).
707  *
708  * There must be a corresponding call to nfs4_end_op() to free up any locks
709  * or resources allocated by this call (assuming this call succeeded),
710  * using the same rsp that's passed in here.
711  *
712  * The open and lock seqid synchronization must be stopped before calling this
713  * function, as it could lead to deadlock when trying to reopen a file or
714  * reclaim a lock.  The synchronization is obtained with calls to:
715  *   nfs4_start_open_seqid_sync()
716  *   nfs4_start_lock_seqid_sync()
717  *
718  * *startrecovp is set TRUE if the caller should not bother with the
719  * over-the-wire call, and just initiate recovery for the given request.
720  * This is typically used for state-releasing ops if the filesystem has
721  * been forcibly unmounted.  startrecovp may be NULL for
722  * non-state-releasing ops.
723  */
724 
725 int
726 nfs4_start_fop(mntinfo4_t *mi, vnode_t *vp1, vnode_t *vp2, nfs4_op_hint_t op,
727     nfs4_recov_state_t *rsp, bool_t *startrecovp)
728 {
729 	int error = 0, rerr_cnt;
730 	nfs4_server_t *sp = NULL;
731 	nfs4_server_t *tsp;
732 	nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };
733 	time_t droplock_time;
734 #ifdef DEBUG
735 	void *fop_caller;
736 #endif
737 
738 	ASSERT(vp1 == NULL || vp1->v_vfsp == mi->mi_vfsp);
739 	ASSERT(vp2 == NULL || vp2->v_vfsp == mi->mi_vfsp);
740 
741 #ifdef	DEBUG
742 	if ((fop_caller = tsd_get(nfs4_tsd_key)) != NULL) {
743 		cmn_err(CE_PANIC, "Missing nfs4_end_fop: last caller %p",
744 		    fop_caller);
745 	}
746 	(void) tsd_set(nfs4_tsd_key, caller());
747 #endif
748 
749 	rsp->rs_sp = NULL;
750 	rsp->rs_flags &= ~NFS4_RS_RENAME_HELD;
751 	rerr_cnt = rsp->rs_num_retry_despite_err;
752 
753 	/*
754 	 * Process the items that may delay() based on server response
755 	 */
756 	error = nfs4_wait_for_grace(mi, rsp);
757 	if (error)
758 		goto out;
759 
760 	if (vp1 != NULL) {
761 		error = nfs4_wait_for_delay(vp1, rsp);
762 		if (error)
763 			goto out;
764 	}
765 
766 	/* Wait for a delegation recall to complete. */
767 
768 	error = wait_for_recall(vp1, vp2, op, rsp);
769 	if (error)
770 		goto out;
771 
772 	/*
773 	 * Wait for any current recovery actions to finish.  Note that a
774 	 * recovery thread can still start up after wait_for_recovery()
775 	 * finishes.  We don't block out recovery operations until we
776 	 * acquire s_recovlock and mi_recovlock.
777 	 */
778 	error = wait_for_recovery(mi, op);
779 	if (error)
780 		goto out;
781 
782 	/*
783 	 * Check to see if the rnode is already marked with a
784 	 * recovery error.  If so, return it immediately.  But
785 	 * always pass CLOSE, LOCKU, and DELEGRETURN so we can
786 	 * clean up state on the server.
787 	 */
788 
789 	if (vp1 != NULL) {
790 		if (error = nfs4_check_recov_err(vp1, op, rsp, rerr_cnt, "vp1"))
791 			goto out;
792 		nfs4_check_remap(mi, vp1, NFS4_REMAP_CKATTRS, &e);
793 	}
794 
795 	if (vp2 != NULL) {
796 		if (error = nfs4_check_recov_err(vp2, op, rsp, rerr_cnt, "vp2"))
797 			goto out;
798 		nfs4_check_remap(mi, vp2, NFS4_REMAP_CKATTRS, &e);
799 	}
800 
801 	/*
802 	 * The lock order calls for us to acquire s_recovlock before
803 	 * mi_recovlock, but we have to hold mi_recovlock to look up sp (to
804 	 * prevent races with the failover/migration code).  So acquire
805 	 * mi_recovlock, look up sp, drop mi_recovlock, acquire
806 	 * s_recovlock and mi_recovlock, then verify that sp is still the
807 	 * right object.  XXX Can we find a simpler way to deal with this?
808 	 */
809 	if (nfs_rw_enter_sig(&mi->mi_recovlock, RW_READER,
810 	    mi->mi_flags & MI4_INT)) {
811 		error = EINTR;
812 		goto out;
813 	}
814 get_sp:
815 	sp = find_nfs4_server(mi);
816 	if (sp != NULL) {
817 		sp->s_otw_call_count++;
818 		mutex_exit(&sp->s_lock);
819 		droplock_time = gethrestime_sec();
820 	}
821 	nfs_rw_exit(&mi->mi_recovlock);
822 
823 	if (sp != NULL) {
824 		if (nfs_rw_enter_sig(&sp->s_recovlock, RW_READER,
825 		    mi->mi_flags & MI4_INT)) {
826 			error = EINTR;
827 			goto out;
828 		}
829 	}
830 	if (nfs_rw_enter_sig(&mi->mi_recovlock, RW_READER,
831 	    mi->mi_flags & MI4_INT)) {
832 		if (sp != NULL)
833 			nfs_rw_exit(&sp->s_recovlock);
834 		error = EINTR;
835 		goto out;
836 	}
837 	/*
838 	 * If the mntinfo4_t hasn't changed nfs4_sever_ts then
839 	 * there's no point in double checking to make sure it
840 	 * has switched.
841 	 */
842 	if (sp == NULL || droplock_time < mi->mi_srvsettime) {
843 		tsp = find_nfs4_server(mi);
844 		if (tsp != sp) {
845 			/* try again */
846 			if (tsp != NULL) {
847 				mutex_exit(&tsp->s_lock);
848 				nfs4_server_rele(tsp);
849 				tsp = NULL;
850 			}
851 			if (sp != NULL) {
852 				nfs_rw_exit(&sp->s_recovlock);
853 				mutex_enter(&sp->s_lock);
854 				sp->s_otw_call_count--;
855 				mutex_exit(&sp->s_lock);
856 				nfs4_server_rele(sp);
857 				sp = NULL;
858 			}
859 			goto get_sp;
860 		} else {
861 			if (tsp != NULL) {
862 				mutex_exit(&tsp->s_lock);
863 				nfs4_server_rele(tsp);
864 				tsp = NULL;
865 			}
866 		}
867 	}
868 
869 	if (sp != NULL) {
870 		rsp->rs_sp = sp;
871 	}
872 
873 	/*
874 	 * If the fileystem uses volatile filehandles, obtain a lock so
875 	 * that we synchronize with renames.  Exception: mount operations
876 	 * can change mi_fh_expire_type, which could be a problem, since
877 	 * the end_op code needs to be consistent with the start_op code
878 	 * about mi_rename_lock.  Since mounts don't compete with renames,
879 	 * it's simpler to just not acquire the rename lock for mounts.
880 	 */
881 	if (NFS4_VOLATILE_FH(mi) && op != OH_MOUNT) {
882 		if (nfs_rw_enter_sig(&mi->mi_rename_lock,
883 		    op == OH_VFH_RENAME ? RW_WRITER : RW_READER,
884 		    mi->mi_flags & MI4_INT)) {
885 			nfs_rw_exit(&mi->mi_recovlock);
886 			if (sp != NULL)
887 				nfs_rw_exit(&sp->s_recovlock);
888 			error = EINTR;
889 			goto out;
890 		}
891 		rsp->rs_flags |= NFS4_RS_RENAME_HELD;
892 	}
893 
894 	if (OH_IS_STATE_RELE(op)) {
895 		/*
896 		 * For forced unmount, letting the request proceed will
897 		 * almost always delay response to the user, so hand it off
898 		 * to the recovery thread.  For exiting lwp's, we don't
899 		 * have a good way to tell if the request will hang.  We
900 		 * generally want processes to handle their own requests so
901 		 * that they can be done in parallel, but if there is
902 		 * already a recovery thread, hand the request off to it.
903 		 * This will improve user response at no cost to overall
904 		 * system throughput.  For zone shutdown, we'd prefer
905 		 * the recovery thread to handle this as well.
906 		 */
907 		ASSERT(startrecovp != NULL);
908 		mutex_enter(&mi->mi_lock);
909 		if (FS_OR_ZONE_GONE4(mi->mi_vfsp))
910 			*startrecovp = TRUE;
911 		else if ((curthread->t_proc_flag & TP_LWPEXIT) &&
912 		    (mi->mi_flags & MI4_RECOV_ACTIV))
913 			*startrecovp = TRUE;
914 		else
915 			*startrecovp = FALSE;
916 		mutex_exit(&mi->mi_lock);
917 	} else
918 		if (startrecovp != NULL)
919 			*startrecovp = FALSE;
920 
921 	ASSERT(error == 0);
922 	return (error);
923 
924 out:
925 	ASSERT(error != 0);
926 	if (sp != NULL) {
927 		mutex_enter(&sp->s_lock);
928 		sp->s_otw_call_count--;
929 		mutex_exit(&sp->s_lock);
930 		nfs4_server_rele(sp);
931 		rsp->rs_sp = NULL;
932 	}
933 	nfs4_end_op_recall(vp1, vp2, rsp);
934 
935 #ifdef	DEBUG
936 	(void) tsd_set(nfs4_tsd_key, NULL);
937 #endif
938 	return (error);
939 }
940 
941 /*
942  * It is up to the caller to determine if rsp->rs_sp being NULL
943  * is detrimental or not.
944  */
945 int
946 nfs4_start_op(mntinfo4_t *mi, vnode_t *vp1, vnode_t *vp2,
947     nfs4_recov_state_t *rsp)
948 {
949 	ASSERT(rsp->rs_num_retry_despite_err == 0);
950 	rsp->rs_num_retry_despite_err = 0;
951 	return (nfs4_start_fop(mi, vp1, vp2, OH_OTHER, rsp, NULL));
952 }
953 
954 /*
955  * Release any resources acquired by nfs4_start_op().
956  * 'sp' should be the nfs4_server pointer returned by nfs4_start_op().
957  *
958  * The operation hint is used to avoid a deadlock by bypassing delegation
959  * return logic for writes, which are done while returning a delegation.
960  */
961 
962 void
963 nfs4_end_fop(mntinfo4_t *mi, vnode_t *vp1, vnode_t *vp2, nfs4_op_hint_t op,
964     nfs4_recov_state_t *rsp, bool_t needs_recov)
965 {
966 	nfs4_server_t *sp = rsp->rs_sp;
967 	rnode4_t *rp = NULL;
968 
969 #ifdef	lint
970 	/*
971 	 * The op hint isn't used any more, but might be in
972 	 * the future.
973 	 */
974 	op = op;
975 #endif
976 
977 #ifdef	DEBUG
978 	ASSERT(tsd_get(nfs4_tsd_key) != NULL);
979 	(void) tsd_set(nfs4_tsd_key, NULL);
980 #endif
981 
982 	nfs4_end_op_recall(vp1, vp2, rsp);
983 
984 	if (rsp->rs_flags & NFS4_RS_RENAME_HELD)
985 		nfs_rw_exit(&mi->mi_rename_lock);
986 
987 	if (!needs_recov) {
988 		if (rsp->rs_flags & NFS4_RS_DELAY_MSG) {
989 			/* may need to clear the delay interval */
990 			if (vp1 != NULL) {
991 				rp = VTOR4(vp1);
992 				mutex_enter(&rp->r_statelock);
993 				rp->r_delay_interval = 0;
994 				mutex_exit(&rp->r_statelock);
995 			}
996 		}
997 		rsp->rs_flags &= ~(NFS4_RS_GRACE_MSG|NFS4_RS_DELAY_MSG);
998 	}
999 
1000 	/*
1001 	 * If the corresponding nfs4_start_op() found a sp,
1002 	 * then there must still be a sp.
1003 	 */
1004 	if (sp != NULL) {
1005 		nfs_rw_exit(&mi->mi_recovlock);
1006 		nfs_rw_exit(&sp->s_recovlock);
1007 		mutex_enter(&sp->s_lock);
1008 		sp->s_otw_call_count--;
1009 		cv_broadcast(&sp->s_cv_otw_count);
1010 		mutex_exit(&sp->s_lock);
1011 		nfs4_server_rele(sp);
1012 	} else {
1013 		nfs_rw_exit(&mi->mi_recovlock);
1014 	}
1015 }
1016 
1017 void
1018 nfs4_end_op(mntinfo4_t *mi, vnode_t *vp1, vnode_t *vp2,
1019     nfs4_recov_state_t *rsp, bool_t needrecov)
1020 {
1021 	nfs4_end_fop(mi, vp1, vp2, OH_OTHER, rsp, needrecov);
1022 }
1023 
1024 /*
1025  * If the filesystem is going through client recovery, block until
1026  * finished.
1027  * Exceptions:
1028  * - state-releasing ops (CLOSE, LOCKU, DELEGRETURN) are allowed to proceed
1029  *   if the filesystem has been forcibly unmounted or the lwp is exiting.
1030  *
1031  * Return value:
1032  * - 0 if no errors
1033  * - EINTR if the call was interrupted
1034  * - EIO if the filesystem has been forcibly unmounted (non-state-releasing
1035  *   op)
1036  * - the errno value from the recovery thread, if recovery failed
1037  */
1038 
1039 static int
1040 wait_for_recovery(mntinfo4_t *mi, nfs4_op_hint_t op_hint)
1041 {
1042 	int error = 0;
1043 
1044 	mutex_enter(&mi->mi_lock);
1045 
1046 	while (mi->mi_recovflags != 0) {
1047 		klwp_t *lwp = ttolwp(curthread);
1048 
1049 		if (mi->mi_flags & MI4_RECOV_FAIL)
1050 			break;
1051 		if (mi->mi_vfsp->vfs_flag & VFS_UNMOUNTED)
1052 			break;
1053 		if (OH_IS_STATE_RELE(op_hint) &&
1054 		    (curthread->t_proc_flag & TP_LWPEXIT))
1055 			break;
1056 
1057 		if (lwp != NULL)
1058 			lwp->lwp_nostop++;
1059 		/* XXX - use different cv? */
1060 		if (cv_wait_sig(&mi->mi_failover_cv, &mi->mi_lock) == 0) {
1061 			error = EINTR;
1062 			if (lwp != NULL)
1063 				lwp->lwp_nostop--;
1064 			break;
1065 		}
1066 		if (lwp != NULL)
1067 			lwp->lwp_nostop--;
1068 	}
1069 
1070 	if (mi->mi_flags & MI4_RECOV_FAIL) {
1071 		NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
1072 		    "wait_for_recovery: fail since RECOV FAIL"));
1073 		error = mi->mi_error;
1074 	} else if ((mi->mi_vfsp->vfs_flag & VFS_UNMOUNTED) &&
1075 	    !OH_IS_STATE_RELE(op_hint)) {
1076 		NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
1077 		    "wait_for_recovery: forced unmount"));
1078 		error = EIO;
1079 	}
1080 
1081 	mutex_exit(&mi->mi_lock);
1082 
1083 	return (error);
1084 }
1085 
1086 /*
1087  * If the client received NFS4ERR_GRACE for this particular mount,
1088  * the client blocks here until it is time to try again.
1089  *
1090  * Return value:
1091  * - 0 if wait was successful
1092  * - EINTR if the call was interrupted
1093  */
1094 
1095 int
1096 nfs4_wait_for_grace(mntinfo4_t *mi, nfs4_recov_state_t *rsp)
1097 {
1098 	int error = 0;
1099 	time_t curtime, time_to_wait;
1100 
1101 	/* do a unprotected check to reduce mi_lock contention */
1102 	if (mi->mi_grace_wait != 0) {
1103 		mutex_enter(&mi->mi_lock);
1104 
1105 		if (mi->mi_grace_wait != 0) {
1106 			if (!(rsp->rs_flags & NFS4_RS_GRACE_MSG))
1107 				rsp->rs_flags |= NFS4_RS_GRACE_MSG;
1108 
1109 			curtime = gethrestime_sec();
1110 
1111 			if (curtime < mi->mi_grace_wait) {
1112 
1113 				time_to_wait = mi->mi_grace_wait - curtime;
1114 
1115 				mutex_exit(&mi->mi_lock);
1116 
1117 				error = delay_sig(SEC_TO_TICK(time_to_wait));
1118 				if (error)
1119 					return (error);
1120 
1121 				curtime = gethrestime_sec();
1122 
1123 				mutex_enter(&mi->mi_lock);
1124 
1125 				if (curtime >= mi->mi_grace_wait)
1126 					mi->mi_grace_wait = 0;
1127 			} else {
1128 				mi->mi_grace_wait = 0;
1129 			}
1130 		}
1131 		mutex_exit(&mi->mi_lock);
1132 	}
1133 
1134 	return (error);
1135 }
1136 
1137 /*
1138  * If the client received NFS4ERR_DELAY for an operation on a vnode,
1139  * the client blocks here until it is time to try again.
1140  *
1141  * Return value:
1142  * - 0 if wait was successful
1143  * - EINTR if the call was interrupted
1144  */
1145 
1146 int
1147 nfs4_wait_for_delay(vnode_t *vp, nfs4_recov_state_t *rsp)
1148 {
1149 	int error = 0;
1150 	time_t curtime, time_to_wait;
1151 	rnode4_t *rp;
1152 
1153 	ASSERT(vp != NULL);
1154 
1155 	rp = VTOR4(vp);
1156 
1157 	/* do a unprotected check to reduce r_statelock contention */
1158 	if (rp->r_delay_wait != 0) {
1159 		mutex_enter(&rp->r_statelock);
1160 
1161 		if (rp->r_delay_wait != 0) {
1162 
1163 			if (!(rsp->rs_flags & NFS4_RS_DELAY_MSG)) {
1164 				rsp->rs_flags |= NFS4_RS_DELAY_MSG;
1165 				nfs4_mi_kstat_inc_delay(VTOMI4(vp));
1166 			}
1167 
1168 			curtime = gethrestime_sec();
1169 
1170 			if (curtime < rp->r_delay_wait) {
1171 
1172 				time_to_wait = rp->r_delay_wait - curtime;
1173 
1174 				mutex_exit(&rp->r_statelock);
1175 
1176 				error = delay_sig(SEC_TO_TICK(time_to_wait));
1177 				if (error)
1178 					return (error);
1179 
1180 				curtime = gethrestime_sec();
1181 
1182 				mutex_enter(&rp->r_statelock);
1183 
1184 				if (curtime >= rp->r_delay_wait)
1185 					rp->r_delay_wait = 0;
1186 			} else {
1187 				rp->r_delay_wait = 0;
1188 			}
1189 		}
1190 		mutex_exit(&rp->r_statelock);
1191 	}
1192 
1193 	return (error);
1194 }
1195 
1196 /*
1197  * The recovery thread.
1198  */
1199 
1200 static void
1201 nfs4_recov_thread(recov_info_t *recovp)
1202 {
1203 	mntinfo4_t *mi = recovp->rc_mi;
1204 	nfs4_server_t *sp;
1205 	int done = 0, error = 0;
1206 	bool_t recov_fail = FALSE;
1207 	callb_cpr_t cpr_info;
1208 	kmutex_t cpr_lock;
1209 
1210 	nfs4_queue_event(RE_START, mi, NULL, mi->mi_recovflags,
1211 	    recovp->rc_vp1, recovp->rc_vp2, 0, NULL, 0, TAG_NONE, TAG_NONE,
1212 	    0, 0);
1213 
1214 	mutex_init(&cpr_lock, NULL, MUTEX_DEFAULT, NULL);
1215 	CALLB_CPR_INIT(&cpr_info, &cpr_lock, callb_generic_cpr, "nfsv4Recov");
1216 
1217 	mutex_enter(&mi->mi_lock);
1218 	mi->mi_recovthread = curthread;
1219 	mutex_exit(&mi->mi_lock);
1220 
1221 	/*
1222 	 * We don't really need protection here against failover or
1223 	 * migration, since the current thread is the one that would make
1224 	 * any changes, but hold mi_recovlock anyway for completeness (and
1225 	 * to satisfy any ASSERTs).
1226 	 */
1227 	(void) nfs_rw_enter_sig(&mi->mi_recovlock, RW_READER, 0);
1228 	sp = find_nfs4_server(mi);
1229 	if (sp != NULL)
1230 		mutex_exit(&sp->s_lock);
1231 	nfs_rw_exit(&mi->mi_recovlock);
1232 
1233 	/*
1234 	 * Do any necessary recovery, based on the information in recovp
1235 	 * and any recovery flags.
1236 	 */
1237 
1238 	do {
1239 		mutex_enter(&mi->mi_lock);
1240 		if (FS_OR_ZONE_GONE4(mi->mi_vfsp)) {
1241 			bool_t activesrv;
1242 
1243 			NFS4_DEBUG(nfs4_client_recov_debug &&
1244 			    mi->mi_vfsp->vfs_flag & VFS_UNMOUNTED, (CE_NOTE,
1245 			    "nfs4_recov_thread: file system has been "
1246 			    "unmounted"));
1247 			NFS4_DEBUG(nfs4_client_recov_debug &&
1248 			    zone_status_get(curproc->p_zone) >=
1249 			    ZONE_IS_SHUTTING_DOWN, (CE_NOTE,
1250 			    "nfs4_recov_thread: zone shutting down"));
1251 			/*
1252 			 * If the server has lost its state for us and
1253 			 * the filesystem is unmounted, then the filesystem
1254 			 * can be tossed, even if there are lost lock or
1255 			 * lost state calls in the recovery queue.
1256 			 */
1257 			if (mi->mi_recovflags &
1258 			    (MI4R_NEED_CLIENTID | MI4R_REOPEN_FILES)) {
1259 				NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
1260 				"nfs4_recov_thread: bailing out"));
1261 				mi->mi_flags |= MI4_RECOV_FAIL;
1262 				mi->mi_error = recovp->rc_error;
1263 				recov_fail = TRUE;
1264 			}
1265 			/*
1266 			 * We don't know if the server has any state for
1267 			 * us, and the filesystem has been unmounted.  If
1268 			 * there are "lost state" recovery items, keep
1269 			 * trying to process them until there are no more
1270 			 * mounted filesystems for the server.  Otherwise,
1271 			 * bail out.  The reason we don't mark the
1272 			 * filesystem as failing recovery is in case we
1273 			 * have to do "lost state" recovery later (e.g., a
1274 			 * user process exits).
1275 			 */
1276 			if (!(mi->mi_recovflags & MI4R_LOST_STATE)) {
1277 				done = 1;
1278 				mutex_exit(&mi->mi_lock);
1279 				break;
1280 			}
1281 			mutex_exit(&mi->mi_lock);
1282 
1283 			if (sp == NULL)
1284 				activesrv = FALSE;
1285 			else {
1286 				mutex_enter(&sp->s_lock);
1287 				activesrv = nfs4_fs_active(sp);
1288 			}
1289 			if (!activesrv) {
1290 				NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
1291 				    "no active fs for server %p",
1292 				    (void *)sp));
1293 				mutex_enter(&mi->mi_lock);
1294 				mi->mi_flags |= MI4_RECOV_FAIL;
1295 				mi->mi_error = recovp->rc_error;
1296 				mutex_exit(&mi->mi_lock);
1297 				recov_fail = TRUE;
1298 				if (sp != NULL) {
1299 					/*
1300 					 * Mark the server instance as
1301 					 * dead, so that nobody will attach
1302 					 * a new filesystem.
1303 					 */
1304 					nfs4_mark_srv_dead(sp);
1305 				}
1306 			}
1307 			if (sp != NULL)
1308 				mutex_exit(&sp->s_lock);
1309 		} else {
1310 			mutex_exit(&mi->mi_lock);
1311 		}
1312 
1313 		/*
1314 		 * Check if we need to select a new server for a
1315 		 * failover.  Choosing a new server will force at
1316 		 * least a check of the clientid.
1317 		 */
1318 		mutex_enter(&mi->mi_lock);
1319 		if (!recov_fail &&
1320 		    (mi->mi_recovflags & MI4R_NEED_NEW_SERVER)) {
1321 			mutex_exit(&mi->mi_lock);
1322 			recov_newserver(recovp, &sp, &recov_fail);
1323 		} else
1324 			mutex_exit(&mi->mi_lock);
1325 
1326 		/*
1327 		 * Check if we need to recover the clientid.  This
1328 		 * must be done before file and lock recovery, and it
1329 		 * potentially affects the recovery threads for other
1330 		 * filesystems, so it gets special treatment.
1331 		 */
1332 		if (sp != NULL && recov_fail == FALSE) {
1333 			mutex_enter(&sp->s_lock);
1334 			if (!(sp->s_flags & N4S_CLIENTID_SET)) {
1335 				mutex_exit(&sp->s_lock);
1336 				recov_clientid(recovp, sp);
1337 			} else {
1338 				/*
1339 				 * Unset this flag in case another recovery
1340 				 * thread successfully recovered the clientid
1341 				 * for us already.
1342 				 */
1343 				mutex_enter(&mi->mi_lock);
1344 				mi->mi_recovflags &= ~MI4R_NEED_CLIENTID;
1345 				mutex_exit(&mi->mi_lock);
1346 				mutex_exit(&sp->s_lock);
1347 			}
1348 		}
1349 
1350 		/*
1351 		 * Check if we need to get the security information.
1352 		 */
1353 		mutex_enter(&mi->mi_lock);
1354 		if ((mi->mi_recovflags & MI4R_NEED_SECINFO) &&
1355 		    !(mi->mi_flags & MI4_RECOV_FAIL)) {
1356 			mutex_exit(&mi->mi_lock);
1357 			(void) nfs_rw_enter_sig(&mi->mi_recovlock,
1358 			    RW_WRITER, 0);
1359 			error = nfs4_secinfo_recov(recovp->rc_mi,
1360 			    recovp->rc_vp1, recovp->rc_vp2);
1361 			/*
1362 			 * If error, nothing more can be done, stop
1363 			 * the recovery.
1364 			 */
1365 			if (error) {
1366 				mutex_enter(&mi->mi_lock);
1367 				mi->mi_flags |= MI4_RECOV_FAIL;
1368 				mi->mi_error = recovp->rc_error;
1369 				mutex_exit(&mi->mi_lock);
1370 				nfs4_queue_event(RE_WRONGSEC, mi, NULL,
1371 				    error, recovp->rc_vp1, recovp->rc_vp2,
1372 				    0, NULL, 0, TAG_NONE, TAG_NONE, 0, 0);
1373 			}
1374 			nfs_rw_exit(&mi->mi_recovlock);
1375 		} else
1376 			mutex_exit(&mi->mi_lock);
1377 
1378 		/*
1379 		 * Check if there's a bad seqid to recover.
1380 		 */
1381 		mutex_enter(&mi->mi_lock);
1382 		if ((mi->mi_recovflags & MI4R_BAD_SEQID) &&
1383 		    !(mi->mi_flags & MI4_RECOV_FAIL)) {
1384 			mutex_exit(&mi->mi_lock);
1385 			(void) nfs_rw_enter_sig(&mi->mi_recovlock,
1386 			    RW_WRITER, 0);
1387 			recov_bad_seqid(recovp);
1388 			nfs_rw_exit(&mi->mi_recovlock);
1389 		} else
1390 			mutex_exit(&mi->mi_lock);
1391 
1392 		/*
1393 		 * Next check for recovery that affects the entire
1394 		 * filesystem.
1395 		 */
1396 		if (sp != NULL) {
1397 			mutex_enter(&mi->mi_lock);
1398 			if ((mi->mi_recovflags & MI4R_REOPEN_FILES) &&
1399 			    !(mi->mi_flags & MI4_RECOV_FAIL)) {
1400 				mutex_exit(&mi->mi_lock);
1401 				recov_openfiles(recovp, sp);
1402 			} else
1403 				mutex_exit(&mi->mi_lock);
1404 		}
1405 
1406 		/*
1407 		 * Send any queued state recovery requests.
1408 		 */
1409 		mutex_enter(&mi->mi_lock);
1410 		if (sp != NULL &&
1411 		    (mi->mi_recovflags & MI4R_LOST_STATE) &&
1412 		    !(mi->mi_flags & MI4_RECOV_FAIL)) {
1413 			mutex_exit(&mi->mi_lock);
1414 			(void) nfs_rw_enter_sig(&mi->mi_recovlock,
1415 			    RW_WRITER, 0);
1416 			nfs4_resend_lost_rqsts(recovp, sp);
1417 			if (list_head(&mi->mi_lost_state) == NULL) {
1418 				/* done */
1419 				mutex_enter(&mi->mi_lock);
1420 				mi->mi_recovflags &= ~MI4R_LOST_STATE;
1421 				mutex_exit(&mi->mi_lock);
1422 			}
1423 			nfs_rw_exit(&mi->mi_recovlock);
1424 		} else {
1425 			mutex_exit(&mi->mi_lock);
1426 		}
1427 
1428 		/*
1429 		 * See if there is anything more to do.  If not, announce
1430 		 * that we are done and exit.
1431 		 *
1432 		 * Need mi_recovlock to keep 'sp' valid.  Must grab
1433 		 * mi_recovlock before mi_lock to preserve lock ordering.
1434 		 */
1435 		(void) nfs_rw_enter_sig(&mi->mi_recovlock, RW_READER, 0);
1436 		mutex_enter(&mi->mi_lock);
1437 		if ((mi->mi_recovflags & ~MI4R_SRV_REBOOT) == 0 ||
1438 		    (mi->mi_flags & MI4_RECOV_FAIL)) {
1439 			list_t local_lost_state;
1440 			nfs4_lost_rqst_t *lrp;
1441 
1442 			/*
1443 			 * We need to remove the lost requests before we
1444 			 * unmark the mi as no longer doing recovery to
1445 			 * avoid a race with a new thread putting new lost
1446 			 * requests on the same mi (and the going away
1447 			 * thread would remove the new lost requests).
1448 			 *
1449 			 * Move the lost requests to a local list since
1450 			 * nfs4_remove_lost_rqst() drops mi_lock, and
1451 			 * dropping the mi_lock would make our check to
1452 			 * see if recovery is done no longer valid.
1453 			 */
1454 			list_create(&local_lost_state,
1455 			    sizeof (nfs4_lost_rqst_t),
1456 			    offsetof(nfs4_lost_rqst_t, lr_node));
1457 			list_move_tail(&local_lost_state, &mi->mi_lost_state);
1458 
1459 			done = 1;
1460 			mutex_exit(&mi->mi_lock);
1461 			/*
1462 			 * Now officially free the "moved"
1463 			 * lost requests.
1464 			 */
1465 			while ((lrp = list_head(&local_lost_state)) != NULL) {
1466 				list_remove(&local_lost_state, lrp);
1467 				nfs4_free_lost_rqst(lrp, sp);
1468 			}
1469 			list_destroy(&local_lost_state);
1470 		} else
1471 			mutex_exit(&mi->mi_lock);
1472 		nfs_rw_exit(&mi->mi_recovlock);
1473 
1474 		/*
1475 		 * If the filesystem has been forcibly unmounted, there is
1476 		 * probably no point in retrying immediately.  Furthermore,
1477 		 * there might be user processes waiting for a chance to
1478 		 * queue up "lost state" requests, so that they can exit.
1479 		 * So pause here for a moment.  Same logic for zone shutdown.
1480 		 */
1481 		if (!done && FS_OR_ZONE_GONE4(mi->mi_vfsp)) {
1482 			mutex_enter(&mi->mi_lock);
1483 			cv_broadcast(&mi->mi_failover_cv);
1484 			mutex_exit(&mi->mi_lock);
1485 			delay(SEC_TO_TICK(nfs4_unmount_delay));
1486 		}
1487 
1488 	} while (!done);
1489 
1490 	if (sp != NULL)
1491 		nfs4_server_rele(sp);
1492 
1493 	/*
1494 	 * Return all recalled delegations
1495 	 */
1496 	nfs4_dlistclean();
1497 
1498 	mutex_enter(&mi->mi_lock);
1499 	recov_done(mi, recovp);
1500 	mutex_exit(&mi->mi_lock);
1501 
1502 	/*
1503 	 * Free up resources that were allocated for us.
1504 	 */
1505 	if (recovp->rc_vp1 != NULL)
1506 		VN_RELE(recovp->rc_vp1);
1507 	if (recovp->rc_vp2 != NULL)
1508 		VN_RELE(recovp->rc_vp2);
1509 
1510 	/* now we are done using the mi struct, signal the waiters */
1511 	mutex_enter(&mi->mi_lock);
1512 	mi->mi_in_recovery--;
1513 	if (mi->mi_in_recovery == 0)
1514 		cv_broadcast(&mi->mi_cv_in_recov);
1515 	mutex_exit(&mi->mi_lock);
1516 
1517 	VFS_RELE(mi->mi_vfsp);
1518 	MI4_RELE(mi);
1519 	kmem_free(recovp, sizeof (recov_info_t));
1520 	mutex_enter(&cpr_lock);
1521 	CALLB_CPR_EXIT(&cpr_info);
1522 	mutex_destroy(&cpr_lock);
1523 	zthread_exit();
1524 }
1525 
1526 /*
1527  * Log the end of recovery and notify any waiting threads.
1528  */
1529 
1530 static void
1531 recov_done(mntinfo4_t *mi, recov_info_t *recovp)
1532 {
1533 
1534 	ASSERT(MUTEX_HELD(&mi->mi_lock));
1535 
1536 	nfs4_queue_event(RE_END, mi, NULL, 0, recovp->rc_vp1,
1537 	    recovp->rc_vp2, 0, NULL, 0, TAG_NONE, TAG_NONE, 0, 0);
1538 	mi->mi_recovthread = NULL;
1539 	mi->mi_flags &= ~MI4_RECOV_ACTIV;
1540 	mi->mi_recovflags &= ~MI4R_SRV_REBOOT;
1541 	cv_broadcast(&mi->mi_failover_cv);
1542 }
1543 
1544 /*
1545  * State-specific recovery routines, by state.
1546  */
1547 
1548 /*
1549  * Failover.
1550  *
1551  * Replaces *spp with a reference to the new server, which must
1552  * eventually be freed.
1553  */
1554 
1555 static void
1556 recov_newserver(recov_info_t *recovp, nfs4_server_t **spp, bool_t *recov_fail)
1557 {
1558 	mntinfo4_t *mi = recovp->rc_mi;
1559 	servinfo4_t *svp = NULL;
1560 	nfs4_server_t *osp = *spp;
1561 	CLIENT *cl;
1562 	enum clnt_stat status;
1563 	struct timeval tv;
1564 	int error;
1565 	int oncethru = 0;
1566 	rnode4_t *rp;
1567 	int index;
1568 	nfs_fh4 fh;
1569 	char *snames;
1570 	size_t len;
1571 
1572 	(void) nfs_rw_enter_sig(&mi->mi_recovlock, RW_WRITER, 0);
1573 
1574 	tv.tv_sec = 2;
1575 	tv.tv_usec = 0;
1576 
1577 #ifdef lint
1578 	/*
1579 	 * Lint can't follow the logic, so thinks that snames and len
1580 	 * can be used before being set.  They can't, but lint can't
1581 	 * figure it out.  To address the lint warning, initialize
1582 	 * snames and len for lint.
1583 	 */
1584 	snames = NULL;
1585 	len = 0;
1586 #endif
1587 
1588 	/*
1589 	 * Ping the null NFS procedure of every server in
1590 	 * the list until one responds.  We always start
1591 	 * at the head of the list and always skip the one
1592 	 * that is current, since it's caused us a problem.
1593 	 */
1594 	while (svp == NULL) {
1595 		for (svp = mi->mi_servers; svp; svp = svp->sv_next) {
1596 
1597 			mutex_enter(&mi->mi_lock);
1598 			if (FS_OR_ZONE_GONE4(mi->mi_vfsp)) {
1599 				mi->mi_flags |= MI4_RECOV_FAIL;
1600 				mutex_exit(&mi->mi_lock);
1601 				(void) nfs_rw_exit(&mi->mi_recovlock);
1602 				*recov_fail = TRUE;
1603 				if (oncethru)
1604 					kmem_free(snames, len);
1605 				return;
1606 			}
1607 			mutex_exit(&mi->mi_lock);
1608 
1609 			(void) nfs_rw_enter_sig(&svp->sv_lock, RW_READER, 0);
1610 			if (svp->sv_flags & SV4_NOTINUSE) {
1611 				nfs_rw_exit(&svp->sv_lock);
1612 				continue;
1613 			}
1614 			nfs_rw_exit(&svp->sv_lock);
1615 
1616 			if (!oncethru && svp == mi->mi_curr_serv)
1617 				continue;
1618 
1619 			error = clnt_tli_kcreate(svp->sv_knconf, &svp->sv_addr,
1620 			    NFS_PROGRAM, NFS_V4, 0, 1, CRED(), &cl);
1621 			if (error)
1622 				continue;
1623 
1624 			if (!(mi->mi_flags & MI4_INT))
1625 				cl->cl_nosignal = TRUE;
1626 			status = CLNT_CALL(cl, RFS_NULL, xdr_void, NULL,
1627 			    xdr_void, NULL, tv);
1628 			if (!(mi->mi_flags & MI4_INT))
1629 				cl->cl_nosignal = FALSE;
1630 			AUTH_DESTROY(cl->cl_auth);
1631 			CLNT_DESTROY(cl);
1632 			if (status == RPC_SUCCESS) {
1633 				nfs4_queue_event(RE_FAILOVER, mi,
1634 				    svp == mi->mi_curr_serv ? NULL :
1635 				    svp->sv_hostname, 0, NULL, NULL, 0,
1636 				    NULL, 0, TAG_NONE, TAG_NONE, 0, 0);
1637 				break;
1638 			}
1639 		}
1640 
1641 		if (svp == NULL) {
1642 			if (!oncethru) {
1643 				snames = nfs4_getsrvnames(mi, &len);
1644 				nfs4_queue_fact(RF_SRVS_NOT_RESPOND, mi,
1645 				    0, 0, 0, FALSE, snames, 0, NULL);
1646 				oncethru = 1;
1647 			}
1648 			delay(hz);
1649 		}
1650 	}
1651 
1652 	if (oncethru) {
1653 		nfs4_queue_fact(RF_SRVS_OK, mi, 0, 0, 0, FALSE, snames,
1654 		    0, NULL);
1655 		kmem_free(snames, len);
1656 	}
1657 
1658 #if DEBUG
1659 	(void) nfs_rw_enter_sig(&svp->sv_lock, RW_READER, 0);
1660 	ASSERT((svp->sv_flags & SV4_NOTINUSE) == 0);
1661 	nfs_rw_exit(&svp->sv_lock);
1662 #endif
1663 
1664 	mutex_enter(&mi->mi_lock);
1665 	mi->mi_recovflags &= ~MI4R_NEED_NEW_SERVER;
1666 	if (svp != mi->mi_curr_serv) {
1667 		servinfo4_t *osvp = mi->mi_curr_serv;
1668 
1669 		mutex_exit(&mi->mi_lock);
1670 
1671 		/*
1672 		 * Update server-dependent fields in the root vnode.
1673 		 */
1674 		index = rtable4hash(mi->mi_rootfh);
1675 		rw_enter(&rtable4[index].r_lock, RW_WRITER);
1676 
1677 		rp = r4find(&rtable4[index], mi->mi_rootfh, mi->mi_vfsp);
1678 		if (rp != NULL) {
1679 			NFS4_DEBUG(nfs4_client_failover_debug, (CE_NOTE,
1680 			    "recov_newserver: remapping %s", rnode4info(rp)));
1681 			mutex_enter(&rp->r_statelock);
1682 			rp->r_server = svp;
1683 			PURGE_ATTRCACHE4_LOCKED(rp);
1684 			mutex_exit(&rp->r_statelock);
1685 			(void) nfs4_free_data_reclaim(rp);
1686 			nfs4_purge_rddir_cache(RTOV4(rp));
1687 			rw_exit(&rtable4[index].r_lock);
1688 			NFS4_DEBUG(nfs4_client_failover_debug, (CE_NOTE,
1689 			    "recov_newserver: done with %s",
1690 			    rnode4info(rp)));
1691 			VN_RELE(RTOV4(rp));
1692 		} else
1693 			rw_exit(&rtable4[index].r_lock);
1694 		(void) dnlc_purge_vfsp(mi->mi_vfsp, 0);
1695 
1696 		mutex_enter(&mi->mi_lock);
1697 		mi->mi_recovflags |= MI4R_REOPEN_FILES | MI4R_REMAP_FILES;
1698 		if (recovp->rc_srv_reboot)
1699 			mi->mi_recovflags |= MI4R_SRV_REBOOT;
1700 		mi->mi_curr_serv = svp;
1701 		mi->mi_failover++;
1702 		mi->mi_flags &= ~MI4_BADOWNER_DEBUG;
1703 		mutex_exit(&mi->mi_lock);
1704 
1705 		(void) nfs_rw_enter_sig(&svp->sv_lock, RW_READER, 0);
1706 		fh.nfs_fh4_len = svp->sv_fhandle.fh_len;
1707 		fh.nfs_fh4_val = svp->sv_fhandle.fh_buf;
1708 		sfh4_update(mi->mi_rootfh, &fh);
1709 		fh.nfs_fh4_len = svp->sv_pfhandle.fh_len;
1710 		fh.nfs_fh4_val = svp->sv_pfhandle.fh_buf;
1711 		sfh4_update(mi->mi_srvparentfh, &fh);
1712 		nfs_rw_exit(&svp->sv_lock);
1713 
1714 		*spp = nfs4_move_mi(mi, osvp, svp);
1715 		if (osp != NULL)
1716 			nfs4_server_rele(osp);
1717 	} else
1718 		mutex_exit(&mi->mi_lock);
1719 	(void) nfs_rw_exit(&mi->mi_recovlock);
1720 }
1721 
1722 /*
1723  * Clientid.
1724  */
1725 
1726 static void
1727 recov_clientid(recov_info_t *recovp, nfs4_server_t *sp)
1728 {
1729 	mntinfo4_t *mi = recovp->rc_mi;
1730 	int error = 0;
1731 	int still_stale;
1732 	int need_new_s;
1733 
1734 	ASSERT(sp != NULL);
1735 
1736 	/*
1737 	 * Acquire the recovery lock and then verify that the clientid
1738 	 * still needs to be recovered.  (Note that s_recovlock is supposed
1739 	 * to be acquired before s_lock.)  Since the thread holds the
1740 	 * recovery lock, no other thread will recover the clientid.
1741 	 */
1742 	(void) nfs_rw_enter_sig(&sp->s_recovlock, RW_WRITER, 0);
1743 	(void) nfs_rw_enter_sig(&mi->mi_recovlock, RW_WRITER, 0);
1744 	mutex_enter(&sp->s_lock);
1745 	still_stale = ((sp->s_flags & N4S_CLIENTID_SET) == 0);
1746 	mutex_exit(&sp->s_lock);
1747 
1748 	if (still_stale) {
1749 		nfs4_error_t n4e;
1750 
1751 		nfs4_error_zinit(&n4e);
1752 		nfs4setclientid(mi, kcred, TRUE, &n4e);
1753 		error = n4e.error;
1754 		if (error != 0) {
1755 
1756 			/*
1757 			 * nfs4setclientid may have set MI4R_NEED_NEW_SERVER,
1758 			 * if so, just return and let recov_thread drive
1759 			 * failover.
1760 			 */
1761 			mutex_enter(&mi->mi_lock);
1762 			need_new_s = mi->mi_recovflags & MI4R_NEED_NEW_SERVER;
1763 			mutex_exit(&mi->mi_lock);
1764 
1765 			if (need_new_s) {
1766 				nfs_rw_exit(&mi->mi_recovlock);
1767 				nfs_rw_exit(&sp->s_recovlock);
1768 				return;
1769 			}
1770 
1771 			nfs4_queue_event(RE_CLIENTID, mi, NULL, n4e.error, NULL,
1772 			    NULL, n4e.stat, NULL, 0, TAG_NONE, TAG_NONE, 0, 0);
1773 			mutex_enter(&mi->mi_lock);
1774 			mi->mi_flags |= MI4_RECOV_FAIL;
1775 			mi->mi_error = recovp->rc_error;
1776 			mutex_exit(&mi->mi_lock);
1777 			/* don't destroy the nfs4_server, let umount do it */
1778 		}
1779 	}
1780 
1781 	if (error == 0) {
1782 		mutex_enter(&mi->mi_lock);
1783 		mi->mi_recovflags &= ~MI4R_NEED_CLIENTID;
1784 		/*
1785 		 * If still_stale isn't true, then another thread already
1786 		 * recovered the clientid.  And that thread that set the
1787 		 * clientid will have initiated reopening files on all the
1788 		 * filesystems for the server, so we should not initiate
1789 		 * reopening for this filesystem here.
1790 		 */
1791 		if (still_stale) {
1792 			mi->mi_recovflags |= MI4R_REOPEN_FILES;
1793 			if (recovp->rc_srv_reboot)
1794 				mi->mi_recovflags |= MI4R_SRV_REBOOT;
1795 		}
1796 		mutex_exit(&mi->mi_lock);
1797 	}
1798 
1799 	nfs_rw_exit(&mi->mi_recovlock);
1800 
1801 	if (error != 0) {
1802 		nfs_rw_exit(&sp->s_recovlock);
1803 		mutex_enter(&mi->mi_lock);
1804 		if ((mi->mi_flags & MI4_RECOV_FAIL) == 0)
1805 			delay(SEC_TO_TICK(recov_err_delay));
1806 		mutex_exit(&mi->mi_lock);
1807 	} else {
1808 		mntinfo4_t **milist;
1809 		mntinfo4_t *tmi;
1810 		int nummi, i;
1811 
1812 		/*
1813 		 * Initiate recovery of open files for other filesystems.
1814 		 * We create an array of filesystems, rather than just
1815 		 * walking the filesystem list, to avoid deadlock issues
1816 		 * with s_lock and mi_recovlock.
1817 		 */
1818 		milist = make_milist(sp, &nummi);
1819 		for (i = 0; i < nummi; i++) {
1820 			tmi = milist[i];
1821 			if (tmi != mi) {
1822 				(void) nfs_rw_enter_sig(&tmi->mi_recovlock,
1823 				    RW_READER, 0);
1824 				start_recovery_action(NR_OPENFILES, TRUE, tmi,
1825 				    NULL, NULL);
1826 				nfs_rw_exit(&tmi->mi_recovlock);
1827 			}
1828 		}
1829 		free_milist(milist, nummi);
1830 
1831 		nfs_rw_exit(&sp->s_recovlock);
1832 	}
1833 }
1834 
1835 /*
1836  * Return an array of filesystems associated with the given server.  The
1837  * caller should call free_milist() to free the references and memory.
1838  */
1839 
1840 static mntinfo4_t **
1841 make_milist(nfs4_server_t *sp, int *nummip)
1842 {
1843 	int nummi, i;
1844 	mntinfo4_t **milist;
1845 	mntinfo4_t *tmi;
1846 
1847 	mutex_enter(&sp->s_lock);
1848 	nummi = 0;
1849 	for (tmi = sp->mntinfo4_list; tmi != NULL; tmi = tmi->mi_clientid_next)
1850 		nummi++;
1851 
1852 	milist = kmem_alloc(nummi * sizeof (mntinfo4_t *), KM_SLEEP);
1853 
1854 	for (i = 0, tmi = sp->mntinfo4_list; tmi != NULL; i++,
1855 	    tmi = tmi->mi_clientid_next) {
1856 		milist[i] = tmi;
1857 		VFS_HOLD(tmi->mi_vfsp);
1858 	}
1859 	mutex_exit(&sp->s_lock);
1860 
1861 	*nummip = nummi;
1862 	return (milist);
1863 }
1864 
1865 /*
1866  * Free the filesystem list created by make_milist().
1867  */
1868 
1869 static void
1870 free_milist(mntinfo4_t **milist, int nummi)
1871 {
1872 	mntinfo4_t *tmi;
1873 	int i;
1874 
1875 	for (i = 0; i < nummi; i++) {
1876 		tmi = milist[i];
1877 		VFS_RELE(tmi->mi_vfsp);
1878 	}
1879 	kmem_free(milist, nummi * sizeof (mntinfo4_t *));
1880 }
1881 
1882 /*
1883  * Filehandle
1884  */
1885 
1886 /*
1887  * Lookup the filehandle for the given vnode and update the rnode if it has
1888  * changed.
1889  *
1890  * Errors:
1891  * - if the filehandle could not be updated because of an error that
1892  *   requires further recovery, initiate that recovery and return.
1893  * - if the filehandle could not be updated because of a signal, pretend we
1894  *   succeeded and let someone else deal with it.
1895  * - if the filehandle could not be updated and the filesystem has been
1896  *   forcibly unmounted, pretend we succeeded, and let the caller deal with
1897  *   the forced unmount (to retry or not to retry, that is the question).
1898  * - if the filehandle could not be updated because of some other error,
1899  *   mark the rnode bad and return.
1900  */
1901 static void
1902 recov_filehandle(nfs4_recov_t action, mntinfo4_t *mi, vnode_t *vp)
1903 {
1904 	rnode4_t *rp = VTOR4(vp);
1905 	nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };
1906 	bool_t needrecov;
1907 
1908 	mutex_enter(&rp->r_statelock);
1909 
1910 	if (rp->r_flags & R4RECOVERR) {
1911 		mutex_exit(&rp->r_statelock);
1912 		return;
1913 	}
1914 
1915 	/*
1916 	 * If someone else is updating the filehandle, wait for them to
1917 	 * finish and then let our caller retry.
1918 	 */
1919 	if (rp->r_flags & R4RECEXPFH) {
1920 		while (rp->r_flags & R4RECEXPFH) {
1921 			cv_wait(&rp->r_cv, &rp->r_statelock);
1922 		}
1923 		mutex_exit(&rp->r_statelock);
1924 		return;
1925 	}
1926 	rp->r_flags |= R4RECEXPFH;
1927 	mutex_exit(&rp->r_statelock);
1928 
1929 	if (action == NR_BADHANDLE) {
1930 		/* shouldn't happen */
1931 		nfs4_queue_event(RE_BADHANDLE, mi, NULL, 0,
1932 		    vp, NULL, 0, NULL, 0, TAG_NONE, TAG_NONE, 0, 0);
1933 	}
1934 
1935 	nfs4_remap_file(mi, vp, 0, &e);
1936 	needrecov = nfs4_needs_recovery(&e, FALSE, mi->mi_vfsp);
1937 
1938 	/*
1939 	 * If we get BADHANDLE or FHEXPIRED in their handler, something is
1940 	 * broken.  Don't try to recover, just mark the file dead.
1941 	 */
1942 	if (needrecov && e.error == 0 &&
1943 	    (e.stat == NFS4ERR_BADHANDLE || e.stat == NFS4ERR_FHEXPIRED))
1944 		needrecov = FALSE;
1945 	if (needrecov) {
1946 		(void) nfs4_start_recovery(&e, mi, vp,
1947 		    NULL, NULL, NULL, OP_LOOKUP, NULL);
1948 	} else if (e.error != EINTR &&
1949 	    !NFS4_FRC_UNMT_ERR(e.error, mi->mi_vfsp) &&
1950 	    (e.error != 0 || e.stat != NFS4_OK)) {
1951 		nfs4_recov_fh_fail(vp, e.error, e.stat);
1952 		/*
1953 		 * Don't set r_error to ESTALE.  Higher-level code (e.g.,
1954 		 * cstatat_getvp()) retries on ESTALE, which would cause
1955 		 * an infinite loop.
1956 		 */
1957 	}
1958 
1959 	mutex_enter(&rp->r_statelock);
1960 	rp->r_flags &= ~R4RECEXPFH;
1961 	cv_broadcast(&rp->r_cv);
1962 	mutex_exit(&rp->r_statelock);
1963 }
1964 
1965 /*
1966  * Stale Filehandle
1967  */
1968 
/*
 * A stale filehandle can happen when an individual file has
 * been removed, or when an entire filesystem has been taken
 * offline.  To distinguish these cases, we do this:
 * - if a GETATTR with the current filehandle is okay, we do
 *   nothing (this can happen with two-filehandle ops)
 * - if the GETATTR fails, but a GETATTR of the root filehandle
 *   succeeds, mark the rnode with R4STALE, which will stop use
 * - if the GETATTR fails, and a GETATTR of the root filehandle
 *   also fails, we consider the problem filesystem-wide, so:
 *   - if we can failover, we should
 *   - if we can't failover, we should mark both the original
 *     vnode and the root bad
 *
 * Arguments:
 *   mi - mount info used for starting recovery and for the
 *        failover / mark-dead decision.
 *   vp - the vnode on which NFS4ERR_STALE was seen.
 *
 * Nothing is returned.  The outcome is either "do nothing", a call to
 * nfs4_start_recovery()/nfs4_fail_recov(), MI4R_NEED_NEW_SERVER being
 * set on the mount, or the root rnode/mount/server being flagged stale.
 */
static void
recov_stale(mntinfo4_t *mi, vnode_t *vp)
{
	rnode4_t *rp = VTOR4(vp);
	vnode_t *rootvp = NULL;
	nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };
	nfs4_ga_res_t gar;
	char *fail_msg = "failed to recover from NFS4ERR_STALE";
	bool_t needrecov;

	/*
	 * Nothing to do if the rnode was already marked dead or stale.
	 * These early returns happen before rootvp can be set, so there
	 * is no reference to release.
	 */
	mutex_enter(&rp->r_statelock);

	if (rp->r_flags & R4RECOVERR) {
		mutex_exit(&rp->r_statelock);
		NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
		    "recov_stale: already marked dead, rp %s",
		    rnode4info(rp)));
		return;
	}

	if (rp->r_flags & R4STALE) {
		mutex_exit(&rp->r_statelock);
		NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
		    "recov_stale: already marked stale, rp %s",
		    rnode4info(rp)));
		return;
	}

	mutex_exit(&rp->r_statelock);

	/* Try a GETATTR on this vnode */
	nfs4_getattr_otw_norecovery(vp, &gar, &e, CRED(), 0);

	/*
	 * Handle non-STALE recoverable errors
	 */
	needrecov = nfs4_needs_recovery(&e, FALSE, vp->v_vfsp);
	if (needrecov && (e.error != 0 || e.stat != NFS4ERR_STALE)) {
		(void) nfs4_start_recovery(&e, mi, vp,
		    NULL, NULL, NULL, OP_GETATTR, NULL);
		NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
		    "recov_stale: error=%d, stat=%d seen on rp %s",
		    e.error, e.stat, rnode4info(rp)));
		goto out;
	}

	/* Are things OK for this vnode? */
	if (!e.error && e.stat == NFS4_OK) {
		NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
		    "recov_stale: file appears fine, rp %s",
		    rnode4info(rp)));
		goto out;
	}

	/* Did we get an unrelated non-recoverable error? */
	if (e.error || e.stat != NFS4ERR_STALE) {
		nfs4_fail_recov(vp, fail_msg, e.error, e.stat);
		NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
		    "recov_stale: unrelated fatal error, rp %s",
		    rnode4info(rp)));
		goto out;
	}

	/*
	 * From here on the GETATTR on vp returned NFS4ERR_STALE.
	 *
	 * If we don't appear to be dealing with the root node, find it.
	 * rootvp stays NULL when vp itself has VROOT set.
	 */
	if ((vp->v_flag & VROOT) == 0) {
		nfs4_error_zinit(&e);
		e.error = VFS_ROOT(vp->v_vfsp, &rootvp);
		if (e.error) {
			/* Report the original STALE, not the VFS_ROOT error. */
			nfs4_fail_recov(vp, fail_msg, 0, NFS4ERR_STALE);
			NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
			    "recov_stale: can't find root node for rp %s",
			    rnode4info(rp)));
			goto out;
		}
	}

	/* Try a GETATTR on the root vnode */
	if (rootvp != NULL) {
		nfs4_error_zinit(&e);
		nfs4_getattr_otw_norecovery(rootvp, &gar, &e, CRED(), 0);

		/* Try recovery? */
		if (e.error != 0 || e.stat != NFS4ERR_STALE) {
			needrecov = nfs4_needs_recovery(&e, FALSE, vp->v_vfsp);
			if (needrecov) {
				(void) nfs4_start_recovery(&e,
				    mi, rootvp, NULL, NULL, NULL,
				    OP_GETATTR, NULL);
				NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
				    "recov_stale: error=%d, stat=%d seen "
				    "on rp %s", e.error, e.stat,
				    rnode4info(rp)));
			}
		}

		/*
		 * Check to see if a failover attempt is warranted
		 * NB: nfs4_try_failover doesn't check for STALE
		 * because recov_stale gets a shot first.  Now that
		 * recov_stale has failed, go ahead and try failover.
		 *
		 * If the getattr on the root filehandle was successful,
		 * then mark recovery as failed for 'vp' and exit.
		 */
		if (nfs4_try_failover(&e) == 0 && e.stat != NFS4ERR_STALE) {
			/*
			 * pass the original error to fail_recov, not
			 * the one from trying the root vnode.
			 */
			nfs4_fail_recov(vp, fail_msg, 0, NFS4ERR_STALE);
			NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
			    "recov_stale: root node OK, marking "
			    "dead rp %s", rnode4info(rp)));
			goto out;
		}
	}

	/*
	 * Here, we know that both the original file and the
	 * root filehandle (which may be the same) are stale.
	 * We want to fail over if we can, and if we can't, we
	 * want to mark everything in sight bad.
	 */
	if (FAILOVER_MOUNT4(mi)) {
		/* Only flag the need for a new server; nothing is marked dead. */
		mutex_enter(&mi->mi_lock);
		mi->mi_recovflags |= MI4R_NEED_NEW_SERVER;
		NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
		    "recov_stale: failing over due to rp %s",
		    rnode4info(rp)));
		mutex_exit(&mi->mi_lock);
	} else {
		rnode4_t *rootrp;
		servinfo4_t *svp;

		/*
		 * Can't fail over, so mark things dead.
		 *
		 * If rootvp is set, we know we have a distinct
		 * non-root vnode which can be marked dead in
		 * the usual way.
		 *
		 * Then we want to mark the root vnode dead.
		 * Note that if rootvp wasn't set, our vp is
		 * actually the root vnode.
		 */
		if (rootvp != NULL) {
			NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
			    "recov_stale: can't fail over, marking dead rp %s",
			    rnode4info(rp)));
			nfs4_fail_recov(vp, fail_msg, 0, NFS4ERR_STALE);
		} else {
			/*
			 * Take our own hold so the common VN_RELE at 'out'
			 * is balanced in this case too.
			 */
			rootvp = vp;
			VN_HOLD(rootvp);
		}

		/*
		 * Mark root dead, but quietly - since
		 * the root rnode is frequently recreated,
		 * we can encounter this at every access.
		 * Also mark recovery as failed on this VFS.
		 */
		rootrp = VTOR4(rootvp);
		NFS4_DEBUG(nfs4_client_recov_debug, (CE_CONT,
		    "recov_stale: marking dead root rp %s",
		    rnode4info(rootrp)));
		mutex_enter(&rootrp->r_statelock);
		rootrp->r_flags |= (R4RECOVERR | R4STALE);
		rootrp->r_error = ESTALE;
		mutex_exit(&rootrp->r_statelock);
		mutex_enter(&mi->mi_lock);
		mi->mi_error = ESTALE;
		mutex_exit(&mi->mi_lock);

		/*
		 * Record on the current server entry that its root is stale.
		 * The return value of nfs_rw_enter_sig() is deliberately
		 * ignored here.
		 */
		svp = mi->mi_curr_serv;
		(void) nfs_rw_enter_sig(&svp->sv_lock, RW_WRITER, 0);
		svp->sv_flags |= SV4_ROOT_STALE;
		nfs_rw_exit(&svp->sv_lock);
	}

out:
	/*
	 * rootvp is non-NULL only if VFS_ROOT() stored a vnode in it or we
	 * took our own hold on vp above.
	 * NOTE(review): assumes VFS_ROOT() returns a held vnode - confirm.
	 */
	if (rootvp)
		VN_RELE(rootvp);
}
2168 
2169 /*
2170  * Locks.
2171  */
2172 
2173 /*
2174  * Reclaim all the active (acquired) locks for the given file.
2175  * If a process lost a lock, the process is sent a SIGLOST.  This is not
2176  * considered an error.
2177  *
2178  * Return values:
2179  * Errors and status are returned via the nfs4_error_t parameter
2180  * If an error indicates that recovery is needed, the caller is responsible
2181  * for dealing with it.
2182  */
2183 
2184 static void
2185 relock_file(vnode_t *vp, mntinfo4_t *mi, nfs4_error_t *ep,
2186     fattr4_change pre_change)
2187 {
2188 	locklist_t *locks, *llp;
2189 	rnode4_t *rp;
2190 
2191 	ASSERT(ep != NULL);
2192 	nfs4_error_zinit(ep);
2193 
2194 	if (VTOMI4(vp)->mi_flags & MI4_LLOCK)
2195 		return;
2196 
2197 	nfs4_flush_lock_owners(VTOR4(vp));
2198 
2199 	/*
2200 	 * If we get an error that requires recovery actions, just bail out
2201 	 * and let the top-level recovery code handle it.
2202 	 *
2203 	 * If we get some other error, kill the process that owned the lock
2204 	 * and mark its remaining locks (if any) as belonging to NOPID, so
2205 	 * that we don't make any more reclaim requests for that process.
2206 	 */
2207 
2208 	rp = VTOR4(vp);
2209 	locks = flk_active_locks_for_vp(vp);
2210 	for (llp = locks; llp != NULL; llp = llp->ll_next) {
2211 		int did_reclaim = 1;
2212 
2213 		ASSERT(llp->ll_vp == vp);
2214 		if (llp->ll_flock.l_pid == NOPID)
2215 			continue;
2216 		reclaim_one_lock(vp, &llp->ll_flock, ep, &did_reclaim);
2217 		/*
2218 		 * If we need to restart recovery, stop processing the
2219 		 * list.  Some errors would be recoverable under other
2220 		 * circumstances, but if they happen here we just give up
2221 		 * on the lock.
2222 		 */
2223 		if (nfs4_needs_recovery(ep, TRUE, vp->v_vfsp)) {
2224 			if (ep->error != 0)
2225 				break;
2226 			if (!nfs4_recov_marks_dead(ep->stat))
2227 				break;
2228 		}
2229 		/*
2230 		 *   In case the server isn't offering us a grace period, or
2231 		 * if we missed it, we might have opened & locked from scratch,
2232 		 * rather than reopened/reclaimed.
2233 		 *   We need to ensure that the object hadn't been otherwise
2234 		 * changed during this time, by comparing the changeinfo.
2235 		 *   We get passed the changeinfo from before the reopen by our
2236 		 * caller, in pre_change.
2237 		 *   The changeinfo from after the reopen is in rp->r_change,
2238 		 * courtesy of the GETATTR in the reopen.
2239 		 *   If they're different, then the file has changed, and we
2240 		 * have to SIGLOST the app.
2241 		 */
2242 		if (ep->error == 0 && ep->stat == NFS4_OK && !did_reclaim) {
2243 			mutex_enter(&rp->r_statelock);
2244 			if (pre_change != rp->r_change)
2245 				ep->stat = NFS4ERR_NO_GRACE;
2246 			mutex_exit(&rp->r_statelock);
2247 		}
2248 		if (ep->error != 0 || ep->stat != NFS4_OK) {
2249 			if (ep->error != 0)
2250 				nfs4_queue_event(RE_FAIL_RELOCK, mi,
2251 				    NULL, ep->error, vp, NULL, 0, NULL,
2252 				    llp->ll_flock.l_pid, TAG_NONE, TAG_NONE,
2253 				    0, 0);
2254 			else
2255 				nfs4_queue_event(RE_FAIL_RELOCK, mi,
2256 				    NULL, 0, vp, NULL, ep->stat, NULL,
2257 				    llp->ll_flock.l_pid, TAG_NONE, TAG_NONE,
2258 				    0, 0);
2259 			nfs4_send_siglost(llp->ll_flock.l_pid, mi, vp, TRUE,
2260 			    ep->error, ep->stat);
2261 			relock_skip_pid(llp, llp->ll_flock.l_pid);
2262 
2263 			/* Reinitialize the nfs4_error and continue */
2264 			nfs4_error_zinit(ep);
2265 		}
2266 	}
2267 
2268 	if (locks != NULL)
2269 		flk_free_locklist(locks);
2270 }
2271 
2272 /*
2273  * Reclaim the given lock.
2274  * If the lock can't be reclaimed, the process is sent SIGLOST, but this is
2275  * not considered an error.
2276  *
2277  * Errors are returned via the nfs4_error_t parameter.
2278  */
2279 static void
2280 reclaim_one_lock(vnode_t *vp, flock64_t *flk, nfs4_error_t *ep,
2281     int *did_reclaimp)
2282 {
2283 	cred_t *cr;
2284 	rnode4_t *rp = VTOR4(vp);
2285 
2286 	cr = pid_to_cr(flk->l_pid);
2287 	if (cr == NULL) {
2288 		nfs4_error_zinit(ep);
2289 		ep->error = ESRCH;
2290 		return;
2291 	}
2292 
2293 	do {
2294 		mutex_enter(&rp->r_statelock);
2295 		if (rp->r_flags & R4RECOVERR) {
2296 			/*
2297 			 * This shouldn't affect other reclaims, so don't
2298 			 * return an error.
2299 			 */
2300 			mutex_exit(&rp->r_statelock);
2301 			break;
2302 		}
2303 		mutex_exit(&rp->r_statelock);
2304 
2305 		nfs4frlock(NFS4_LCK_CTYPE_RECLAIM, vp, F_SETLK, flk,
2306 		    FREAD|FWRITE, 0, cr, ep, NULL, did_reclaimp);
2307 		if (ep->error == 0 && ep->stat == NFS4ERR_FHEXPIRED)
2308 			start_recovery_action(NR_FHEXPIRED, TRUE, VTOMI4(vp),
2309 			    vp, NULL);
2310 	} while (ep->error == 0 && ep->stat == NFS4ERR_FHEXPIRED);
2311 
2312 	crfree(cr);
2313 }
2314 
2315 /*
2316  * Open files.
2317  */
2318 
2319 /*
2320  * Verifies if the nfsstat4 is a valid error for marking this vnode dead.
2321  * Returns 1 if the error is valid; 0 otherwise.
2322  */
2323 static int
2324 nfs4_valid_recov_err_for_vp(vnode_t *vp, nfsstat4 stat)
2325 {
2326 	/*
2327 	 * We should not be marking non-regular files as dead,
2328 	 * except in very rare cases (eg: BADHANDLE or NFS4ERR_BADNAME).
2329 	 */
2330 	if (vp->v_type != VREG && stat != NFS4ERR_BADHANDLE &&
2331 	    stat != NFS4ERR_BADNAME)
2332 		return (0);
2333 
2334 	return (1);
2335 }
2336 
2337 /*
2338  * Failed attempting to recover a filehandle.  If 'stat' is valid for 'vp',
2339  * then mark the object dead.  Since we've had to do a lookup for
2340  * filehandle recovery, we will mark the object dead if we got NOENT.
2341  */
2342 static void
2343 nfs4_recov_fh_fail(vnode_t *vp, int error, nfsstat4 stat)
2344 {
2345 	ASSERT(vp != NULL);
2346 
2347 	if ((error == 0) && (stat != NFS4ERR_NOENT) &&
2348 	    (!nfs4_valid_recov_err_for_vp(vp, stat)))
2349 		return;
2350 
2351 	nfs4_fail_recov(vp, "can't recover filehandle", error, stat);
2352 }
2353 
2354 /*
2355  * Recovery from a "shouldn't happen" error.  In the long term, we'd like
2356  * to mark only the data structure(s) that provided the bad value as being
2357  * bad.  But for now we'll just mark the entire file.
2358  */
2359 
2360 static void
2361 recov_badstate(recov_info_t *recovp, vnode_t *vp, nfsstat4 stat)
2362 {
2363 	ASSERT(vp != NULL);
2364 	recov_throttle(recovp, vp);
2365 
2366 	if (!nfs4_valid_recov_err_for_vp(vp, stat))
2367 		return;
2368 
2369 	nfs4_fail_recov(vp, "", 0, stat);
2370 }
2371 
2372 /*
2373  * Free up the information saved for a lost state request.
2374  */
2375 static void
2376 nfs4_free_lost_rqst(nfs4_lost_rqst_t *lrp, nfs4_server_t *sp)
2377 {
2378 	component4 *filep;
2379 	nfs4_open_stream_t *osp;
2380 	int have_sync_lock;
2381 
2382 	NFS4_DEBUG(nfs4_lost_rqst_debug,
2383 	    (CE_NOTE, "nfs4_free_lost_rqst:"));
2384 
2385 	switch (lrp->lr_op) {
2386 	case OP_OPEN:
2387 		filep = &lrp->lr_ofile;
2388 		if (filep->utf8string_val) {
2389 			kmem_free(filep->utf8string_val, filep->utf8string_len);
2390 			filep->utf8string_val = NULL;
2391 		}
2392 		break;
2393 	case OP_DELEGRETURN:
2394 		nfs4delegreturn_cleanup(VTOR4(lrp->lr_vp), sp);
2395 		break;
2396 	case OP_CLOSE:
2397 		osp = lrp->lr_osp;
2398 		ASSERT(osp != NULL);
2399 		mutex_enter(&osp->os_sync_lock);
2400 		have_sync_lock = 1;
2401 		if (osp->os_pending_close) {
2402 			/* clean up the open file state. */
2403 			osp->os_pending_close = 0;
2404 			nfs4close_notw(lrp->lr_vp, osp, &have_sync_lock);
2405 		}
2406 		if (have_sync_lock)
2407 			mutex_exit(&osp->os_sync_lock);
2408 		break;
2409 	}
2410 
2411 	lrp->lr_op = 0;
2412 	if (lrp->lr_oop != NULL) {
2413 		open_owner_rele(lrp->lr_oop);
2414 		lrp->lr_oop = NULL;
2415 	}
2416 	if (lrp->lr_osp != NULL) {
2417 		open_stream_rele(lrp->lr_osp, VTOR4(lrp->lr_vp));
2418 		lrp->lr_osp = NULL;
2419 	}
2420 	if (lrp->lr_lop != NULL) {
2421 		lock_owner_rele(lrp->lr_lop);
2422 		lrp->lr_lop = NULL;
2423 	}
2424 	if (lrp->lr_flk != NULL) {
2425 		kmem_free(lrp->lr_flk, sizeof (flock64_t));
2426 		lrp->lr_flk = NULL;
2427 	}
2428 	if (lrp->lr_vp != NULL) {
2429 		VN_RELE(lrp->lr_vp);
2430 		lrp->lr_vp = NULL;
2431 	}
2432 	if (lrp->lr_dvp != NULL) {
2433 		VN_RELE(lrp->lr_dvp);
2434 		lrp->lr_dvp = NULL;
2435 	}
2436 	if (lrp->lr_cr != NULL) {
2437 		crfree(lrp->lr_cr);
2438 		lrp->lr_cr = NULL;
2439 	}
2440 
2441 	kmem_free(lrp, sizeof (nfs4_lost_rqst_t));
2442 }
2443 
2444 /*
2445  * Remove any lost state requests and free them.
2446  */
2447 static void
2448 nfs4_remove_lost_rqsts(mntinfo4_t *mi, nfs4_server_t *sp)
2449 {
2450 	nfs4_lost_rqst_t *lrp;
2451 
2452 	mutex_enter(&mi->mi_lock);
2453 	while ((lrp = list_head(&mi->mi_lost_state)) != NULL) {
2454 		list_remove(&mi->mi_lost_state, lrp);
2455 		mutex_exit(&mi->mi_lock);
2456 		nfs4_free_lost_rqst(lrp, sp);
2457 		mutex_enter(&mi->mi_lock);
2458 	}
2459 	mutex_exit(&mi->mi_lock);
2460 }
2461 
/*
 * Reopen all the files for the given filesystem and reclaim any locks.
 *
 * recovp supplies the mount (rc_mi), the server-reboot hint
 * (rc_srv_reboot) and the vnodes (rc_vp1, rc_vp2) that may need a remap
 * check.  sp is the server whose s_recovlock is held as reader, together
 * with mi_recovlock as writer, for the duration of the work.
 *
 * Nothing is returned.  MI4R_REOPEN_FILES and MI4R_REMAP_FILES are
 * cleared from mi_recovflags only if everything completed with no error
 * left in 'e'; otherwise they stay set.
 */

static void
recov_openfiles(recov_info_t *recovp, nfs4_server_t *sp)
{
	mntinfo4_t *mi = recovp->rc_mi;
	nfs4_opinst_t *reopenlist = NULL, *rep;
	nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };
	open_claim_type4 claim;
	int remap;
	char *fail_msg = "No such file or directory on replica";
	rnode4_t *rp;
	fattr4_change pre_change;

	ASSERT(sp != NULL);

	/*
	 * This check is to allow a 10ms pause before we reopen files
	 * it should allow the server time to have received the CB_NULL
	 * reply and update its internal structures such that (if
	 * applicable) we are granted a delegation on reopened files.
	 */
	mutex_enter(&sp->s_lock);
	if ((sp->s_flags & (N4S_CB_PINGED | N4S_CB_WAITER)) == 0) {
		sp->s_flags |= N4S_CB_WAITER;
		(void) cv_timedwait(&sp->wait_cb_null, &sp->s_lock,
		    (lbolt + drv_usectohz(N4S_CB_PAUSE_TIME)));
	}
	mutex_exit(&sp->s_lock);

	/* Return values of nfs_rw_enter_sig() are deliberately ignored. */
	(void) nfs_rw_enter_sig(&sp->s_recovlock, RW_READER, 0);
	(void) nfs_rw_enter_sig(&mi->mi_recovlock, RW_WRITER, 0);

	/*
	 * With volatile filehandles, remap the root first.  Any error left
	 * in 'e' here makes the whole reopen pass below be skipped.
	 */
	if (NFS4_VOLATILE_FH(mi)) {
		nfs4_remap_root(mi, &e, 0);
		if (nfs4_needs_recovery(&e, FALSE, mi->mi_vfsp)) {
			(void) nfs4_start_recovery(&e, mi, NULL,
			    NULL, NULL, NULL, OP_LOOKUP, NULL);
		}
	}

	/* Use CLAIM_PREVIOUS after a server reboot, CLAIM_NULL otherwise. */
	mutex_enter(&mi->mi_lock);
	if (recovp->rc_srv_reboot || (mi->mi_recovflags & MI4R_SRV_REBOOT))
		claim = CLAIM_PREVIOUS;
	else
		claim = CLAIM_NULL;
	mutex_exit(&mi->mi_lock);

	if (e.error == 0 && e.stat == NFS4_OK) {
		/*
		 * Get a snapshot of open files in the filesystem.  Note
		 * that new opens will stall until the server's grace
		 * period is done.
		 */
		reopenlist = r4mkopenlist(mi);

		mutex_enter(&mi->mi_lock);
		remap = mi->mi_recovflags & MI4R_REMAP_FILES;
		mutex_exit(&mi->mi_lock);
		/*
		 * Since we are re-establishing state on the
		 * server, its ok to blow away the saved lost
		 * requests since we don't need to reissue it.
		 */
		nfs4_remove_lost_rqsts(mi, sp);

		/*
		 * 'e' is clean at the top of every iteration: the loop is
		 * left as soon as an error remains at the bottom, and the
		 * ENOENT case clears it before continuing.
		 */
		for (rep = reopenlist; rep; rep = rep->re_next) {

			if (remap) {
				nfs4_remap_file(mi, rep->re_vp,
				    NFS4_REMAP_CKATTRS, &e);
			}
			if (e.error == ENOENT || e.stat == NFS4ERR_NOENT) {
				/*
				 * The current server does not have the file
				 * that is to be remapped.  This is most
				 * likely due to an improperly maintained
				 * replica.   The files that are missing from
				 * the server will be marked dead and logged
				 * in order to make sys admins aware of the
				 * problem.
				 */
				nfs4_fail_recov(rep->re_vp,
				    fail_msg, e.error, e.stat);
				/*
				 * We've already handled the error so clear it.
				 */
				nfs4_error_zinit(&e);
				continue;
			} else if (e.error == 0 && e.stat == NFS4_OK) {
				int j;

				/*
				 * Sample the change attribute before the
				 * reopen; relock_file() compares it with the
				 * post-reopen value.  pre_change is set only
				 * on this branch, which is also the only path
				 * that can reach relock_file() below.
				 */
				rp = VTOR4(rep->re_vp);
				mutex_enter(&rp->r_statelock);
				pre_change = rp->r_change;
				mutex_exit(&rp->r_statelock);

				/* Reopen each open stream; stop at first failure. */
				for (j = 0; j < rep->re_numosp; j++) {
					nfs4_reopen(rep->re_vp, rep->re_osp[j],
					    &e, claim, FALSE, TRUE);
					if (e.error != 0 || e.stat != NFS4_OK)
						break;
				}
				/* A recoverable error ends the whole pass. */
				if (nfs4_needs_recovery(&e, TRUE,
				    mi->mi_vfsp)) {
					(void) nfs4_start_recovery(&e, mi,
					    rep->re_vp, NULL, NULL, NULL,
					    OP_OPEN, NULL);
					break;
				}
			}
#ifdef DEBUG
			if (nfs4_recovdelay > 0)
				delay(MSEC_TO_TICK(nfs4_recovdelay * 1000));
#endif
			if (e.error == 0 && e.stat == NFS4_OK)
				relock_file(rep->re_vp, mi, &e, pre_change);

			if (nfs4_needs_recovery(&e, TRUE, mi->mi_vfsp))
				(void) nfs4_start_recovery(&e, mi,
				    rep->re_vp, NULL, NULL, NULL, OP_LOCK,
				    NULL);
			/* Any leftover error stops processing further files. */
			if (e.error != 0 || e.stat != NFS4_OK)
				break;
		}

		/*
		 * Check to see if we need to remap files passed in
		 * via the recovery arguments; this will have been
		 * done for open files.  A failure here is not fatal.
		 */
		if (remap) {
			/* Results are discarded; 'e' is not touched here. */
			nfs4_error_t ignore;
			nfs4_check_remap(mi, recovp->rc_vp1, NFS4_REMAP_CKATTRS,
			    &ignore);
			nfs4_check_remap(mi, recovp->rc_vp2, NFS4_REMAP_CKATTRS,
			    &ignore);
		}
	}

	/* Only a fully successful pass clears the reopen/remap requests. */
	if (e.error == 0 && e.stat == NFS4_OK) {
		mutex_enter(&mi->mi_lock);
		mi->mi_recovflags &= ~(MI4R_REOPEN_FILES | MI4R_REMAP_FILES);
		mutex_exit(&mi->mi_lock);
	}

	/* Release the recovery locks in reverse order of acquisition. */
	nfs_rw_exit(&mi->mi_recovlock);
	nfs_rw_exit(&sp->s_recovlock);

	if (reopenlist != NULL)
		r4releopenlist(reopenlist);
}
2616 
2617 /*
2618  * Resend the queued state recovery requests in "rqsts".
2619  */
2620 
2621 static void
2622 nfs4_resend_lost_rqsts(recov_info_t *recovp, nfs4_server_t *sp)
2623 {
2624 	nfs4_lost_rqst_t	*lrp, *tlrp;
2625 	mntinfo4_t		*mi = recovp->rc_mi;
2626 	nfs4_error_t		n4e;
2627 #ifdef NOTYET
2628 	uint32_t		deny_bits = 0;
2629 #endif
2630 
2631 	NFS4_DEBUG(nfs4_lost_rqst_debug, (CE_NOTE, "nfs4_resend_lost_rqsts"));
2632 
2633 	ASSERT(mi != NULL);
2634 	ASSERT(nfs_rw_lock_held(&mi->mi_recovlock, RW_WRITER));
2635 
2636 	mutex_enter(&mi->mi_lock);
2637 	lrp = list_head(&mi->mi_lost_state);
2638 	mutex_exit(&mi->mi_lock);
2639 	while (lrp != NULL) {
2640 		nfs4_error_zinit(&n4e);
2641 		resend_one_op(lrp, &n4e, mi, sp);
2642 		NFS4_DEBUG(nfs4_lost_rqst_debug, (CE_NOTE,
2643 		    "nfs4_resend_lost_rqsts: resend request: for vp %p got "
2644 		    "error %d stat %d", (void *)lrp->lr_vp, n4e.error,
2645 		    n4e.stat));
2646 
2647 		/*
2648 		 * If we get a recovery error that we can actually
2649 		 * recover from (such as ETIMEDOUT, FHEXPIRED), we
2650 		 * return and let the recovery thread redrive the call.
2651 		 * Don't requeue unless the zone is still healthy.
2652 		 */
2653 		if (zone_status_get(curproc->p_zone) < ZONE_IS_SHUTTING_DOWN &&
2654 		    nfs4_needs_recovery(&n4e, TRUE, mi->mi_vfsp) &&
2655 		    (nfs4_try_failover(&n4e) ||
2656 		    NFS4_FRC_UNMT_ERR(n4e.error, mi->mi_vfsp) ||
2657 		    (n4e.error == 0 && n4e.stat != NFS4ERR_BADHANDLE &&
2658 		    !nfs4_recov_marks_dead(n4e.stat)))) {
2659 			/*
2660 			 * For these three errors, we want to delay a bit
2661 			 * instead of pounding the server into submission.
2662 			 * We have to do this manually; the normal
2663 			 * processing for these errors only works for
2664 			 * non-recovery requests.
2665 			 */
2666 			if ((n4e.error == 0 && n4e.stat == NFS4ERR_DELAY) ||
2667 			    (n4e.error == 0 && n4e.stat == NFS4ERR_GRACE) ||
2668 			    (n4e.error == 0 && n4e.stat == NFS4ERR_RESOURCE) ||
2669 			    NFS4_FRC_UNMT_ERR(n4e.error, mi->mi_vfsp)) {
2670 				delay(SEC_TO_TICK(nfs4err_delay_time));
2671 			} else {
2672 				(void) nfs4_start_recovery(&n4e,
2673 				    mi, lrp->lr_dvp, lrp->lr_vp, NULL, NULL,
2674 				    lrp->lr_op, NULL);
2675 			}
2676 			return;
2677 		}
2678 
2679 		mutex_enter(&mi->mi_lock);
2680 		list_remove(&mi->mi_lost_state, lrp);
2681 		tlrp = lrp;
2682 		lrp = list_head(&mi->mi_lost_state);
2683 		mutex_exit(&mi->mi_lock);
2684 		nfs4_free_lost_rqst(tlrp, sp);
2685 	}
2686 }
2687 
/*
 * Resend the given op, and issue any necessary undo call.
 * errors are returned via the nfs4_error_t parameter.
 *
 * lrp is the lost request to resend; its lr_op selects the handler.
 * mi is used for open-seqid synchronization and event logging, and sp is
 * passed through to the DELEGRETURN resend.
 *
 * Only a successfully resent OPEN is undone (with a CLOSE); lock and
 * delegreturn requests are handled entirely by their own resend routines.
 */

static void
resend_one_op(nfs4_lost_rqst_t *lrp, nfs4_error_t *ep,
    mntinfo4_t *mi, nfs4_server_t *sp)
{
	vnode_t *vp;
	nfs4_open_stream_t *osp;
	cred_t *cr;
	uint32_t acc_bits;

	vp = lrp->lr_vp;
	NFS4_DEBUG(nfs4_lost_rqst_debug, (CE_NOTE, "resend_one_op: "
	    "have a lost open/close request for vp %p", (void *)vp));

	switch (lrp->lr_op) {
	case OP_OPEN:
		/*
		 * vp is passed by reference and may come back different
		 * from lrp->lr_vp; see the release at 'done'.
		 */
		nfs4_resend_open_otw(&vp, lrp, ep);
		break;
	case OP_OPEN_DOWNGRADE:
		ASSERT(lrp->lr_oop != NULL);
		ep->error = nfs4_start_open_seqid_sync(lrp->lr_oop, mi);
		ASSERT(!ep->error);	/* recov thread always succeeds */
		ASSERT(lrp->lr_osp != NULL);
		mutex_enter(&lrp->lr_osp->os_sync_lock);
		nfs4_open_downgrade(lrp->lr_dg_acc, lrp->lr_dg_deny,
		    lrp->lr_oop, lrp->lr_osp, vp, lrp->lr_cr, lrp,
		    ep, NULL, NULL);
		mutex_exit(&lrp->lr_osp->os_sync_lock);
		nfs4_end_open_seqid_sync(lrp->lr_oop);
		break;
	case OP_CLOSE:
		osp = lrp->lr_osp;
		cr = lrp->lr_cr;
		acc_bits = 0;
		/* Build the access bits from the stream's share state. */
		mutex_enter(&osp->os_sync_lock);
		if (osp->os_share_acc_read)
			acc_bits |= OPEN4_SHARE_ACCESS_READ;
		if (osp->os_share_acc_write)
			acc_bits |= OPEN4_SHARE_ACCESS_WRITE;
		mutex_exit(&osp->os_sync_lock);
		nfs4close_one(vp, osp, cr, acc_bits, lrp, ep,
		    CLOSE_RESEND, 0, 0, 0);
		break;
	case OP_LOCK:
	case OP_LOCKU:
		/* No undo processing here; skip straight to cleanup. */
		resend_lock(lrp, ep);
		goto done;
	case OP_DELEGRETURN:
		nfs4_resend_delegreturn(lrp, ep, sp);
		goto done;
	default:
#ifdef DEBUG
		cmn_err(CE_PANIC, "resend_one_op: unexpected op: %d",
		    lrp->lr_op);
#endif
		/*
		 * Log the bad op and fail with EINVAL.  Returning without
		 * going through 'done' is fine: vp is still lrp->lr_vp on
		 * this path, so there is nothing to release.
		 */
		nfs4_queue_event(RE_LOST_STATE_BAD_OP, mi, NULL,
		    lrp->lr_op, lrp->lr_vp, lrp->lr_dvp, NFS4_OK, NULL, 0,
		    TAG_NONE, TAG_NONE, 0, 0);
		nfs4_error_init(ep, EINVAL);
		return;
	}

	/*
	 * No need to retry nor send an "undo" CLOSE in the
	 * event the server rebooted.
	 */
	if (ep->error == 0 && (ep->stat == NFS4ERR_STALE_CLIENTID ||
	    ep->stat == NFS4ERR_STALE_STATEID || ep->stat == NFS4ERR_EXPIRED))
		goto done;

	/*
	 * If we resent a CLOSE or OPEN_DOWNGRADE, there's nothing
	 * to undo.  Undoing locking operations was handled by
	 * resend_lock().
	 */
	if (lrp->lr_op == OP_OPEN_DOWNGRADE || lrp->lr_op == OP_CLOSE)
		goto done;

	/*
	 * If we get any other error for OPEN, then don't attempt
	 * to undo the resend of the open (since it was never
	 * successful!).
	 */
	ASSERT(lrp->lr_op == OP_OPEN);
	if (ep->error || ep->stat != NFS4_OK)
		goto done;

	/*
	 * Now let's undo our OPEN.
	 * From here on *ep reports the result of the CLOSE.
	 */
	nfs4_error_zinit(ep);
	close_after_open_resend(vp, lrp->lr_cr, lrp->lr_oacc, ep);
	NFS4_DEBUG(nfs4_lost_rqst_debug, (CE_NOTE, "resend_one_op: "
	    "nfs4close_one: for vp %p got error %d stat %d",
	    (void *)vp, ep->error, ep->stat));

done:
	/*
	 * Release vp only if it was replaced during the OPEN resend.
	 * NOTE(review): assumes nfs4_resend_open_otw() returns a held
	 * vnode when it substitutes one - confirm.
	 */
	if (vp != lrp->lr_vp)
		VN_RELE(vp);
}
2792 
2793 /*
2794  * Close a file that was opened via a resent OPEN.
2795  * Most errors are passed back to the caller (via the return value and
2796  * *statp), except for FHEXPIRED, which is retried.
2797  *
2798  * It might be conceptually cleaner to push the CLOSE request onto the
2799  * front of the resend queue, rather than sending it here.  That would
2800  * match the way we undo lost lock requests.  On the other
2801  * hand, we've already got something that works, and there's no reason to
2802  * change it at this time.
2803  */
2804 
2805 static void
2806 close_after_open_resend(vnode_t *vp, cred_t *cr, uint32_t acc_bits,
2807     nfs4_error_t *ep)
2808 {
2809 
2810 	for (;;) {
2811 		nfs4close_one(vp, NULL, cr, acc_bits, NULL, ep,
2812 		    CLOSE_AFTER_RESEND, 0, 0, 0);
2813 		if (ep->error == 0 && ep->stat == NFS4_OK)
2814 			break;		/* success; done */
2815 		if (ep->error != 0 || ep->stat != NFS4ERR_FHEXPIRED)
2816 			break;
2817 		/* else retry FHEXPIRED */
2818 	}
2819 
2820 }
2821 
/*
 * Resend the given lost lock request.  The errno value and the NFS
 * status code for the call are returned via the nfs4_error_t parameter.
 *
 * Issue a SIGLOST and mark the rnode dead if we get a non-recovery error or
 * a recovery error that we don't actually recover from yet (eg: BAD_SEQID).
 * Let the recovery thread redrive the call if we get a recovery error that
 * we can actually recover from.
 *
 * NOTE(review): only the SIGLOST is visible in this routine; marking the
 * rnode dead presumably happens inside nfs4_send_siglost() - confirm.
 */
static void
resend_lock(nfs4_lost_rqst_t *lrp, nfs4_error_t *ep)
{
	bool_t		send_siglost = FALSE;
	vnode_t		*vp = lrp->lr_vp;

	NFS4_DEBUG(nfs4_lost_rqst_debug, (CE_NOTE, "resend_lock:"));
	ASSERT(lrp->lr_ctype == NFS4_LCK_CTYPE_REINSTATE ||
	    lrp->lr_ctype == NFS4_LCK_CTYPE_RESEND);

	nfs4frlock(lrp->lr_ctype, vp, F_SETLK,
	    lrp->lr_flk, FREAD|FWRITE, 0, lrp->lr_cr, ep, lrp, NULL);

	NFS4_DEBUG(nfs4_lost_rqst_debug, (CE_NOTE, "resend_lock: "
	    "nfs4frlock for vp %p returned error %d, stat %d",
	    (void *)vp, ep->error, ep->stat));

	/* Success: nothing more to do. */
	if (ep->error == 0 && ep->stat == 0)
		goto done;
	/* For a plain resend, DENIED needs no further action either. */
	if (ep->error == 0 && ep->stat == NFS4ERR_DENIED &&
	    lrp->lr_ctype == NFS4_LCK_CTYPE_RESEND)
		goto done;

	/*
	 * If we failed with a non-recovery error, send SIGLOST and
	 * mark the file dead.
	 */
	if (!nfs4_needs_recovery(ep, TRUE, vp->v_vfsp))
		send_siglost = TRUE;
	else {
		/*
		 * Done with recovering LOST LOCK in the event the
		 * server rebooted or we've lost the lease.
		 */
		if (ep->error == 0 && (ep->stat == NFS4ERR_STALE_CLIENTID ||
		    ep->stat == NFS4ERR_STALE_STATEID ||
		    ep->stat == NFS4ERR_EXPIRED)) {
			goto done;
		}

		/*
		 * BAD_STATEID on an unlock indicates that the server has
		 * forgotten about the lock anyway, so act like the call
		 * was successful.
		 */
		if (ep->error == 0 && ep->stat == NFS4ERR_BAD_STATEID &&
		    lrp->lr_op == OP_LOCKU)
			goto done;

		/*
		 * If we got a recovery error that we don't actually
		 * recover from, send SIGLOST.  If the filesystem was
		 * forcibly unmounted, we skip the SIGLOST because (a) it's
		 * unnecessary noise, and (b) there could be a new process
		 * with the same pid as the one that had generated the lost
		 * state request.
		 */
		if (ep->error == 0 && (ep->stat == NFS4ERR_BADHANDLE ||
		    nfs4_recov_marks_dead(ep->stat))) {
			if (!(vp->v_vfsp->vfs_flag & VFS_UNMOUNTED))
				send_siglost = TRUE;
			goto done;
		}

		/*
		 * If the filesystem was forcibly unmounted, we
		 * still need to synchronize with the server and
		 * release state.  Try again later.
		 */
		if (NFS4_FRC_UNMT_ERR(ep->error, vp->v_vfsp))
			goto done;

		/*
		 * If we get a recovery error that we can actually
		 * recover from (such as ETIMEDOUT, FHEXPIRED),
		 * return and let the recovery thread redrive the call.
		 *
		 * For the three errors below, we want to delay a bit
		 * instead of pounding the server into submission.
		 */
		if ((ep->error == 0 && ep->stat == NFS4ERR_DELAY) ||
		    (ep->error == 0 && ep->stat == NFS4ERR_GRACE) ||
		    (ep->error == 0 && ep->stat == NFS4ERR_RESOURCE))
			delay(SEC_TO_TICK(recov_err_delay));
		goto done;
	}

done:
	/* *ep is left as set by nfs4frlock() for the caller to inspect. */
	if (send_siglost) {
		cred_t *sv_cred;

		/*
		 * Must be root or the actual thread being issued the
		 * SIGLOST for this to work, so just become root.
		 * The thread's own credential is restored right after.
		 */
		sv_cred = curthread->t_cred;
		curthread->t_cred = kcred;
		nfs4_send_siglost(lrp->lr_flk->l_pid, VTOMI4(vp), vp, FALSE,
		    ep->error, ep->stat);
		curthread->t_cred = sv_cred;

		/*
		 * Flush any additional reinstantiation requests for
		 * this operation.  Sending multiple SIGLOSTs to the user
		 * process is unlikely to help and may cause trouble.
		 */
		if (lrp->lr_ctype == NFS4_LCK_CTYPE_REINSTATE)
			flush_reinstate(lrp);
	}
}
2941 
2942 /*
2943  * Remove any lock reinstantiation requests that correspond to the given
2944  * lost request.  We only remove items that follow lrp in the queue,
2945  * assuming that lrp will be removed by the generic lost state code.
2946  */
2947 
2948 static void
2949 flush_reinstate(nfs4_lost_rqst_t *lrp)
2950 {
2951 	vnode_t *vp;
2952 	pid_t pid;
2953 	mntinfo4_t *mi;
2954 	nfs4_lost_rqst_t *nlrp;
2955 
2956 	vp = lrp->lr_vp;
2957 	mi = VTOMI4(vp);
2958 	pid = lrp->lr_flk->l_pid;
2959 
2960 	/*
2961 	 * If there are any more reinstantation requests to get rid of,
2962 	 * they should all be clustered at the front of the lost state
2963 	 * queue.
2964 	 */
2965 	mutex_enter(&mi->mi_lock);
2966 	for (lrp = list_next(&mi->mi_lost_state, lrp); lrp != NULL;
2967 	    lrp = nlrp) {
2968 		nlrp = list_next(&mi->mi_lost_state, lrp);
2969 		if (lrp->lr_op != OP_LOCK && lrp->lr_op != OP_LOCKU)
2970 			break;
2971 		if (lrp->lr_ctype != NFS4_LCK_CTYPE_REINSTATE)
2972 			break;
2973 		ASSERT(lrp->lr_vp == vp);
2974 		ASSERT(lrp->lr_flk->l_pid == pid);
2975 		NFS4_DEBUG(nfs4_lost_rqst_debug, (CE_NOTE,
2976 		    "remove reinstantiation %p", (void *)lrp));
2977 		list_remove(&mi->mi_lost_state, lrp);
2978 		nfs4_free_lost_rqst(lrp, NULL);
2979 	}
2980 	mutex_exit(&mi->mi_lock);
2981 }
2982 
2983 /*
2984  * End of state-specific recovery routines.
2985  */
2986 
2987 /*
2988  * Allocate a lost request struct, initialize it from lost_rqstp (including
2989  * bumping the reference counts for the referenced vnode, etc.), and hang
2990  * it off of recovp.
2991  */
2992 
2993 static void
2994 nfs4_save_lost_rqst(nfs4_lost_rqst_t *lost_rqstp, recov_info_t *recovp,
2995     nfs4_recov_t *action, mntinfo4_t *mi)
2996 {
2997 	nfs4_lost_rqst_t *destp;
2998 
2999 	ASSERT(recovp->rc_lost_rqst == NULL);
3000 
3001 	destp = kmem_alloc(sizeof (nfs4_lost_rqst_t), KM_SLEEP);
3002 	recovp->rc_lost_rqst = destp;
3003 
3004 	if (lost_rqstp->lr_op == OP_LOCK ||
3005 	    lost_rqstp->lr_op == OP_LOCKU) {
3006 		ASSERT(lost_rqstp->lr_lop);
3007 		*action = NR_LOST_LOCK;
3008 		destp->lr_ctype = lost_rqstp->lr_ctype;
3009 		destp->lr_locktype = lost_rqstp->lr_locktype;
3010 	} else if (lost_rqstp->lr_op == OP_OPEN) {
3011 		component4 *srcfp, *destfp;
3012 
3013 		destp->lr_oacc = lost_rqstp->lr_oacc;
3014 		destp->lr_odeny = lost_rqstp->lr_odeny;
3015 		destp->lr_oclaim = lost_rqstp->lr_oclaim;
3016 		if (lost_rqstp->lr_oclaim == CLAIM_DELEGATE_CUR)
3017 			destp->lr_ostateid = lost_rqstp->lr_ostateid;
3018 
3019 		srcfp = &lost_rqstp->lr_ofile;
3020 		destfp = &destp->lr_ofile;
3021 		/*
3022 		 * Consume caller's utf8string
3023 		 */
3024 		destfp->utf8string_len = srcfp->utf8string_len;
3025 		destfp->utf8string_val = srcfp->utf8string_val;
3026 		srcfp->utf8string_len = 0;
3027 		srcfp->utf8string_val = NULL;	/* make sure not reused */
3028 
3029 		*action = NR_LOST_STATE_RQST;
3030 	} else if (lost_rqstp->lr_op == OP_OPEN_DOWNGRADE) {
3031 		destp->lr_dg_acc = lost_rqstp->lr_dg_acc;
3032 		destp->lr_dg_deny = lost_rqstp->lr_dg_deny;
3033 
3034 		*action = NR_LOST_STATE_RQST;
3035 	} else if (lost_rqstp->lr_op == OP_CLOSE) {
3036 		ASSERT(lost_rqstp->lr_oop);
3037 		*action = NR_LOST_STATE_RQST;
3038 	} else if (lost_rqstp->lr_op == OP_DELEGRETURN) {
3039 		*action = NR_LOST_STATE_RQST;
3040 	} else {
3041 #ifdef DEBUG
3042 		cmn_err(CE_PANIC, "nfs4_save_lost_rqst: bad op %d",
3043 		    lost_rqstp->lr_op);
3044 #endif
3045 		nfs4_queue_event(RE_LOST_STATE_BAD_OP, mi, NULL,
3046 		    lost_rqstp->lr_op, lost_rqstp->lr_vp, lost_rqstp->lr_dvp,
3047 		    NFS4_OK, NULL, curproc->p_pid, TAG_NONE, TAG_NONE, 0, 0);
3048 		*action = NR_UNUSED;
3049 		recovp->rc_lost_rqst = NULL;
3050 		kmem_free(destp, sizeof (nfs4_lost_rqst_t));
3051 		return;
3052 	}
3053 
3054 	destp->lr_op = lost_rqstp->lr_op;
3055 	destp->lr_vp = lost_rqstp->lr_vp;
3056 	if (destp->lr_vp)
3057 		VN_HOLD(destp->lr_vp);
3058 	destp->lr_dvp = lost_rqstp->lr_dvp;
3059 	if (destp->lr_dvp)
3060 		VN_HOLD(destp->lr_dvp);
3061 	destp->lr_oop = lost_rqstp->lr_oop;
3062 	if (destp->lr_oop)
3063 		open_owner_hold(destp->lr_oop);
3064 	destp->lr_osp = lost_rqstp->lr_osp;
3065 	if (destp->lr_osp)
3066 		open_stream_hold(destp->lr_osp);
3067 	destp->lr_lop = lost_rqstp->lr_lop;
3068 	if (destp->lr_lop)
3069 		lock_owner_hold(destp->lr_lop);
3070 	destp->lr_cr = lost_rqstp->lr_cr;
3071 	if (destp->lr_cr)
3072 		crhold(destp->lr_cr);
3073 	if (lost_rqstp->lr_flk == NULL)
3074 		destp->lr_flk = NULL;
3075 	else {
3076 		destp->lr_flk = kmem_alloc(sizeof (flock64_t), KM_SLEEP);
3077 		*destp->lr_flk = *lost_rqstp->lr_flk;
3078 	}
3079 	destp->lr_putfirst = lost_rqstp->lr_putfirst;
3080 }
3081 
3082 /*
3083  * Map the given return values (errno and nfs4 status code) to a recovery
3084  * action and fill in the following fields of recovp: rc_action,
3085  * rc_srv_reboot, rc_stateid, rc_lost_rqst.
3086  */
3087 
3088 void
3089 errs_to_action(recov_info_t *recovp,
3090     nfs4_server_t *sp, mntinfo4_t *mi, stateid4 *sidp,
3091     nfs4_lost_rqst_t *lost_rqstp, int unmounted, nfs_opnum4 op,
3092     nfs4_bseqid_entry_t *bsep)
3093 {
3094 	nfs4_recov_t action = NR_UNUSED;
3095 	bool_t reboot = FALSE;
3096 	int try_f;
3097 	int error = recovp->rc_orig_errors.error;
3098 	nfsstat4 stat = recovp->rc_orig_errors.stat;
3099 
3100 	bzero(&recovp->rc_stateid, sizeof (stateid4));
3101 	recovp->rc_lost_rqst = NULL;
3102 	recovp->rc_bseqid_rqst = NULL;
3103 
3104 	try_f = nfs4_try_failover(&recovp->rc_orig_errors) &&
3105 	    FAILOVER_MOUNT4(mi);
3106 
3107 	/*
3108 	 * We start recovery for EINTR only in the lost lock
3109 	 * or lost open/close case.
3110 	 */
3111 
3112 	if (try_f || error == EINTR || (error == EIO && unmounted)) {
3113 		recovp->rc_error = (error != 0 ? error : geterrno4(stat));
3114 		if (lost_rqstp) {
3115 			ASSERT(lost_rqstp->lr_op != 0);
3116 			nfs4_save_lost_rqst(lost_rqstp, recovp, &action, mi);
3117 		}
3118 		if (try_f)
3119 			action = NR_FAILOVER;
3120 	} else if (error != 0) {
3121 		recovp->rc_error = error;
3122 		nfs4_queue_event(RE_UNEXPECTED_ERRNO, mi, NULL, error, NULL,
3123 		    NULL, 0, NULL, 0, TAG_NONE, TAG_NONE, 0, 0);
3124 		action = NR_CLIENTID;
3125 	} else {
3126 		recovp->rc_error = geterrno4(stat);
3127 		switch (stat) {
3128 #ifdef notyet
3129 		case NFS4ERR_LEASE_MOVED:
3130 			action = xxx;
3131 			break;
3132 		case NFS4ERR_MOVED:
3133 			action = xxx;
3134 			break;
3135 #endif
3136 		case NFS4ERR_BADHANDLE:
3137 			action = NR_BADHANDLE;
3138 			break;
3139 		case NFS4ERR_BAD_SEQID:
3140 			if (bsep)
3141 				save_bseqid_rqst(bsep, recovp);
3142 			action = NR_BAD_SEQID;
3143 			break;
3144 		case NFS4ERR_OLD_STATEID:
3145 			action = NR_OLDSTATEID;
3146 			break;
3147 		case NFS4ERR_WRONGSEC:
3148 			action = NR_WRONGSEC;
3149 			break;
3150 		case NFS4ERR_FHEXPIRED:
3151 			action = NR_FHEXPIRED;
3152 			break;
3153 		case NFS4ERR_BAD_STATEID:
3154 			if (sp == NULL || (sp != NULL && inlease(sp))) {
3155 
3156 				action = NR_BAD_STATEID;
3157 				if (sidp)
3158 					recovp->rc_stateid = *sidp;
3159 			} else
3160 				action = NR_CLIENTID;
3161 			break;
3162 		case NFS4ERR_EXPIRED:
3163 			/*
3164 			 * The client's lease has expired, either due
3165 			 * to a network partition or perhaps a client
3166 			 * error.  In either case, try an NR_CLIENTID
3167 			 * style recovery.  reboot remains false, since
3168 			 * there is no evidence the server has rebooted.
3169 			 * This will cause CLAIM_NULL opens and lock
3170 			 * requests without the reclaim bit.
3171 			 */
3172 			action = NR_CLIENTID;
3173 
3174 			DTRACE_PROBE4(nfs4__expired,
3175 			    nfs4_server_t *, sp,
3176 			    mntinfo4_t *, mi,
3177 			    stateid4 *, sidp, int, op);
3178 
3179 			break;
3180 		case NFS4ERR_STALE_CLIENTID:
3181 		case NFS4ERR_STALE_STATEID:
3182 			action = NR_CLIENTID;
3183 			reboot = TRUE;
3184 			break;
3185 		case NFS4ERR_RESOURCE:
3186 			/*
3187 			 * If this had been a FAILOVER mount, then
3188 			 * we'd have tried failover.  Since it's not,
3189 			 * just delay a while and retry.
3190 			 */
3191 			action = NR_DELAY;
3192 			break;
3193 		case NFS4ERR_GRACE:
3194 			action = NR_GRACE;
3195 			break;
3196 		case NFS4ERR_DELAY:
3197 			action = NR_DELAY;
3198 			break;
3199 		case NFS4ERR_STALE:
3200 			action = NR_STALE;
3201 			break;
3202 		default:
3203 			nfs4_queue_event(RE_UNEXPECTED_STATUS, mi, NULL, 0,
3204 			    NULL, NULL, stat, NULL, 0, TAG_NONE, TAG_NONE,
3205 			    0, 0);
3206 			action = NR_CLIENTID;
3207 			break;
3208 		}
3209 	}
3210 
3211 	/* make sure action got set */
3212 	ASSERT(action != NR_UNUSED);
3213 	recovp->rc_srv_reboot = reboot;
3214 	recovp->rc_action = action;
3215 	nfs4_queue_fact(RF_ERR, mi, stat, action, op, reboot, NULL, error,
3216 	    NULL);
3217 }
3218 
3219 /*
3220  * Return the (held) credential for the process with the given pid.
3221  * May return NULL (e.g., process not found).
3222  */
3223 
3224 static cred_t *
3225 pid_to_cr(pid_t pid)
3226 {
3227 	proc_t *p;
3228 	cred_t *cr;
3229 
3230 	mutex_enter(&pidlock);
3231 	if ((p = prfind(pid)) == NULL) {
3232 		mutex_exit(&pidlock);
3233 		return (NULL);
3234 	}
3235 
3236 	mutex_enter(&p->p_crlock);
3237 	crhold(cr = p->p_cred);
3238 	mutex_exit(&p->p_crlock);
3239 	mutex_exit(&pidlock);
3240 
3241 	return (cr);
3242 }
3243 
3244 /*
3245  * Send SIGLOST to the given process and queue the event.
3246  *
3247  * The 'dump' boolean tells us whether this action should dump the
3248  * in-kernel queue of recovery messages or not.
3249  */
3250 
3251 void
3252 nfs4_send_siglost(pid_t pid, mntinfo4_t *mi, vnode_t *vp, bool_t dump,
3253     int error, nfsstat4 stat)
3254 {
3255 	proc_t *p;
3256 
3257 	mutex_enter(&pidlock);
3258 	p = prfind(pid);
3259 	if (p)
3260 		psignal(p, SIGLOST);
3261 	mutex_exit(&pidlock);
3262 	nfs4_queue_event(dump ? RE_SIGLOST : RE_SIGLOST_NO_DUMP, mi,
3263 	    NULL, error, vp, NULL, stat, NULL, pid, TAG_NONE, TAG_NONE, 0, 0);
3264 }
3265 
3266 /*
3267  * Scan the lock list for entries that match the given pid.  Change the
3268  * pid in those that do to NOPID.
3269  */
3270 
3271 static void
3272 relock_skip_pid(locklist_t *llp, pid_t pid)
3273 {
3274 	for (; llp != NULL; llp = llp->ll_next) {
3275 		if (llp->ll_flock.l_pid == pid)
3276 			llp->ll_flock.l_pid = NOPID;
3277 	}
3278 }
3279 
3280 /*
3281  * Mark a file as having failed recovery, after making a last-ditch effort
3282  * to return any delegation.
3283  *
3284  * Sets r_error to EIO or ESTALE for the given vnode.
3285  */
3286 void
3287 nfs4_fail_recov(vnode_t *vp, char *why, int error, nfsstat4 stat)
3288 {
3289 	rnode4_t *rp = VTOR4(vp);
3290 
3291 #ifdef DEBUG
3292 	if (nfs4_fail_recov_stop)
3293 		debug_enter("nfs4_fail_recov");
3294 #endif
3295 
3296 	mutex_enter(&rp->r_statelock);
3297 	if (rp->r_flags & (R4RECOVERR|R4RECOVERRP)) {
3298 		mutex_exit(&rp->r_statelock);
3299 		return;
3300 	}
3301 
3302 	/*
3303 	 * Set R4RECOVERRP to indicate that a recovery error is in
3304 	 * progress.  This will shut down reads and writes at the top
3305 	 * half.  Don't set R4RECOVERR until after we've returned the
3306 	 * delegation, otherwise it will fail.
3307 	 */
3308 
3309 	rp->r_flags |= R4RECOVERRP;
3310 	mutex_exit(&rp->r_statelock);
3311 
3312 	nfs4delegabandon(rp);
3313 
3314 	mutex_enter(&rp->r_statelock);
3315 	rp->r_flags |= (R4RECOVERR | R4STALE);
3316 	rp->r_error = (error == 0 && stat == NFS4ERR_STALE) ? ESTALE : EIO;
3317 	PURGE_ATTRCACHE4_LOCKED(rp);
3318 	if (!(vp->v_vfsp->vfs_flag & VFS_UNMOUNTED))
3319 		nfs4_queue_event(RE_DEAD_FILE, VTOMI4(vp), NULL, error,
3320 		    vp, NULL, stat, why, 0, TAG_NONE, TAG_NONE, 0, 0);
3321 	mutex_exit(&rp->r_statelock);
3322 
3323 	dnlc_purge_vp(vp);
3324 }
3325 
3326 /*
3327  * recov_throttle: if the file had the same recovery action within the
3328  * throttle interval, wait for the throttle interval to finish before
3329  * proceeding.
3330  *
3331  * Side effects: updates the rnode with the current recovery information.
3332  */
3333 
3334 static void
3335 recov_throttle(recov_info_t *recovp, vnode_t *vp)
3336 {
3337 	time_t curtime, time_to_wait;
3338 	rnode4_t *rp = VTOR4(vp);
3339 
3340 	curtime = gethrestime_sec();
3341 
3342 	mutex_enter(&rp->r_statelock);
3343 	NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
3344 	    "recov_throttle: now: (%d, %ld), last: (%d, %ld)",
3345 	    recovp->rc_action, curtime,
3346 	    rp->r_recov_act, rp->r_last_recov));
3347 	if (recovp->rc_action == rp->r_recov_act &&
3348 	    rp->r_last_recov + recov_err_delay > curtime) {
3349 		time_to_wait = rp->r_last_recov + recov_err_delay - curtime;
3350 		mutex_exit(&rp->r_statelock);
3351 		delay(SEC_TO_TICK(time_to_wait));
3352 		curtime = gethrestime_sec();
3353 		mutex_enter(&rp->r_statelock);
3354 	}
3355 
3356 	rp->r_last_recov = curtime;
3357 	rp->r_recov_act = recovp->rc_action;
3358 	mutex_exit(&rp->r_statelock);
3359 }
3360 
3361 /*
3362  * React to NFS4ERR_GRACE by setting the time we'll permit
3363  * the next call to this filesystem.
3364  */
3365 void
3366 nfs4_set_grace_wait(mntinfo4_t *mi)
3367 {
3368 	mutex_enter(&mi->mi_lock);
3369 	/* Mark the time for the future */
3370 	mi->mi_grace_wait = gethrestime_sec() + nfs4err_delay_time;
3371 	mutex_exit(&mi->mi_lock);
3372 }
3373 
3374 /*
3375  * React to MFS4ERR_DELAY by setting the time we'll permit
3376  * the next call to this vnode.
3377  */
3378 void
3379 nfs4_set_delay_wait(vnode_t *vp)
3380 {
3381 	rnode4_t *rp = VTOR4(vp);
3382 
3383 	mutex_enter(&rp->r_statelock);
3384 	/*
3385 	 * Calculate amount we should delay, initial
3386 	 * delay will be short and then we will back off.
3387 	 */
3388 	if (rp->r_delay_interval == 0)
3389 		rp->r_delay_interval = NFS4_INITIAL_DELAY_INTERVAL;
3390 	else
3391 		/* calculate next interval value */
3392 		rp->r_delay_interval =
3393 		    MIN(NFS4_MAX_DELAY_INTERVAL, (rp->r_delay_interval << 1));
3394 	rp->r_delay_wait = gethrestime_sec() + rp->r_delay_interval;
3395 	mutex_exit(&rp->r_statelock);
3396 }
3397 
3398 /*
3399  * The caller is responsible for freeing the returned string.
3400  */
3401 static char *
3402 nfs4_getsrvnames(mntinfo4_t *mi, size_t *len)
3403 {
3404 	servinfo4_t *svp;
3405 	char *srvnames;
3406 	char *namep;
3407 	size_t length;
3408 
3409 	/*
3410 	 * Calculate the length of the string required to hold all
3411 	 * of the server names plus either a comma or a null
3412 	 * character following each individual one.
3413 	 */
3414 	length = 0;
3415 	for (svp = mi->mi_servers; svp != NULL; svp = svp->sv_next) {
3416 		(void) nfs_rw_enter_sig(&svp->sv_lock, RW_READER, 0);
3417 		if (svp->sv_flags & SV4_NOTINUSE) {
3418 			nfs_rw_exit(&svp->sv_lock);
3419 			continue;
3420 		}
3421 		nfs_rw_exit(&svp->sv_lock);
3422 		length += svp->sv_hostnamelen;
3423 	}
3424 
3425 	srvnames = kmem_alloc(length, KM_SLEEP);
3426 
3427 	namep = srvnames;
3428 	for (svp = mi->mi_servers; svp != NULL; svp = svp->sv_next) {
3429 		(void) nfs_rw_enter_sig(&svp->sv_lock, RW_READER, 0);
3430 		if (svp->sv_flags & SV4_NOTINUSE) {
3431 			nfs_rw_exit(&svp->sv_lock);
3432 			continue;
3433 		}
3434 		nfs_rw_exit(&svp->sv_lock);
3435 		(void) strcpy(namep, svp->sv_hostname);
3436 		namep += svp->sv_hostnamelen - 1;
3437 		*namep++ = ',';
3438 	}
3439 	*--namep = '\0';
3440 
3441 	*len = length;
3442 
3443 	return (srvnames);
3444 }
3445 
3446 static void
3447 save_bseqid_rqst(nfs4_bseqid_entry_t *bsep, recov_info_t *recovp)
3448 {
3449 	nfs4_bseqid_entry_t *destp;
3450 
3451 	destp = kmem_alloc(sizeof (nfs4_bseqid_entry_t), KM_SLEEP);
3452 	recovp->rc_bseqid_rqst = destp;
3453 
3454 	if (bsep->bs_oop)
3455 		open_owner_hold(bsep->bs_oop);
3456 	destp->bs_oop = bsep->bs_oop;
3457 	if (bsep->bs_lop)
3458 		lock_owner_hold(bsep->bs_lop);
3459 	destp->bs_lop = bsep->bs_lop;
3460 	if (bsep->bs_vp)
3461 		VN_HOLD(bsep->bs_vp);
3462 	destp->bs_vp = bsep->bs_vp;
3463 	destp->bs_pid = bsep->bs_pid;
3464 	destp->bs_tag = bsep->bs_tag;
3465 	destp->bs_seqid = bsep->bs_seqid;
3466 }
3467 
3468 static void
3469 free_bseqid_rqst(nfs4_bseqid_entry_t *bsep)
3470 {
3471 	if (bsep->bs_oop)
3472 		open_owner_rele(bsep->bs_oop);
3473 	if (bsep->bs_lop)
3474 		lock_owner_rele(bsep->bs_lop);
3475 	if (bsep->bs_vp)
3476 		VN_RELE(bsep->bs_vp);
3477 	kmem_free(bsep, sizeof (nfs4_bseqid_entry_t));
3478 }
3479 
3480 /*
3481  * We don't actually fully recover from NFS4ERR_BAD_SEQID.  We
3482  * simply mark the open owner and open stream (if provided) as "bad".
3483  * Then future uses of these data structures will be limited to basically
3484  * just cleaning up the internal client state (no going OTW).
3485  *
3486  * The result of this is to return errors back to the app/usr when
3487  * we receive NFS4ERR_BAD_SEQID, but also allow future/new calls to
3488  * succeed so progress can be made.
3489  */
3490 void
3491 recov_bad_seqid(recov_info_t *recovp)
3492 {
3493 	mntinfo4_t		*mi = recovp->rc_mi;
3494 	nfs4_open_owner_t	*bad_oop;
3495 	nfs4_lock_owner_t	*bad_lop;
3496 	vnode_t			*vp;
3497 	rnode4_t		*rp = NULL;
3498 	pid_t			pid;
3499 	nfs4_bseqid_entry_t	*bsep, *tbsep;
3500 	int			error;
3501 
3502 	ASSERT(mi != NULL);
3503 	ASSERT(nfs_rw_lock_held(&mi->mi_recovlock, RW_WRITER));
3504 
3505 	mutex_enter(&mi->mi_lock);
3506 	bsep = list_head(&mi->mi_bseqid_list);
3507 	mutex_exit(&mi->mi_lock);
3508 
3509 	/*
3510 	 * Handle all the bad seqid entries on mi's list.
3511 	 */
3512 	while (bsep != NULL) {
3513 		bad_oop = bsep->bs_oop;
3514 		bad_lop = bsep->bs_lop;
3515 		vp = bsep->bs_vp;
3516 		pid = bsep->bs_pid;
3517 
3518 		NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
3519 		    "recov_bad_seqid: mark oop %p lop %p as bad for "
3520 		    "vp %p tag %s pid %d: last good seqid %d for tag %s",
3521 		    (void *)bad_oop, (void *)bad_lop, (void *)vp,
3522 		    nfs4_ctags[bsep->bs_tag].ct_str, pid,
3523 		    bad_oop ?  bad_oop->oo_last_good_seqid : 0,
3524 		    bad_oop ? nfs4_ctags[bad_oop->oo_last_good_op].ct_str :
3525 		    nfs4_ctags[TAG_NONE].ct_str));
3526 
3527 		nfs4_queue_event(RE_BAD_SEQID, mi, NULL,
3528 		    0, vp, NULL, NFS4ERR_BAD_SEQID, NULL, pid, bsep->bs_tag,
3529 		    bad_oop ? bad_oop->oo_last_good_op : TAG_NONE,
3530 		    bsep->bs_seqid, bad_oop ? bad_oop->oo_last_good_seqid : 0);
3531 
3532 		if (bad_oop) {
3533 			/* essentially reset the open owner */
3534 			error = nfs4_start_open_seqid_sync(bad_oop, mi);
3535 			ASSERT(!error);	/* recov thread always succeeds */
3536 			bad_oop->oo_name = nfs4_get_new_oo_name();
3537 			bad_oop->oo_seqid = 0;
3538 			nfs4_end_open_seqid_sync(bad_oop);
3539 		}
3540 
3541 		if (bad_lop) {
3542 			mutex_enter(&bad_lop->lo_lock);
3543 			bad_lop->lo_flags |= NFS4_BAD_SEQID_LOCK;
3544 			mutex_exit(&bad_lop->lo_lock);
3545 
3546 			ASSERT(vp != NULL);
3547 			rp = VTOR4(vp);
3548 			mutex_enter(&rp->r_statelock);
3549 			rp->r_flags |= R4LODANGLERS;
3550 			mutex_exit(&rp->r_statelock);
3551 
3552 			nfs4_send_siglost(pid, mi, vp, TRUE,
3553 			    0, NFS4ERR_BAD_SEQID);
3554 		}
3555 
3556 		mutex_enter(&mi->mi_lock);
3557 		list_remove(&mi->mi_bseqid_list, bsep);
3558 		tbsep = bsep;
3559 		bsep = list_head(&mi->mi_bseqid_list);
3560 		mutex_exit(&mi->mi_lock);
3561 		free_bseqid_rqst(tbsep);
3562 	}
3563 
3564 	mutex_enter(&mi->mi_lock);
3565 	mi->mi_recovflags &= ~MI4R_BAD_SEQID;
3566 	mutex_exit(&mi->mi_lock);
3567 }
3568