1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21 /*
22 * Copyright 2010 Sun Microsystems, Inc. All rights reserved.
23 * Use is subject to license terms.
24 */
25
26 /*
27 * NFS Version 4 state recovery code.
28 */
29
30 #include <nfs/nfs4_clnt.h>
31 #include <nfs/nfs4.h>
32 #include <nfs/rnode4.h>
33 #include <sys/cmn_err.h>
34 #include <sys/cred.h>
35 #include <sys/systm.h>
36 #include <sys/flock.h>
37 #include <sys/dnlc.h>
38 #include <sys/ddi.h>
39 #include <sys/disp.h>
40 #include <sys/list.h>
41 #include <sys/sdt.h>
42 #include <sys/mount.h>
43 #include <sys/door.h>
44 #include <nfs/nfssys.h>
45 #include <nfs/nfsid_map.h>
46 #include <nfs/nfs4_idmap_impl.h>
47
48 extern r4hashq_t *rtable4;
49
50 /*
51 * Information that describes what needs to be done for recovery. It is
52 * passed to a client recovery thread as well as passed to various recovery
53 * routines. rc_mi, rc_vp1, and rc_vp2 refer to the filesystem and
54 * vnode(s) affected by recovery. rc_vp1 and rc_vp2 are references (use
55 * VN_HOLD) or NULL. rc_lost_rqst contains information about the lost
56 * lock or open/close request, and it holds reference counts for the
57 * various objects (vnode, etc.). The recovery thread also uses flags set
58 * in the mntinfo4_t or vnode_t to tell it what to do. rc_error is used
59 * to save the error that originally triggered the recovery event -- will
60 * later be used to set mi_error if recovery doesn't work. rc_bseqid_rqst
61 * contains information about the request that got NFS4ERR_BAD_SEQID, and
62 * it holds reference count for the various objects (vnode, open owner,
63 * open stream, lock owner).
64 */
65
66 typedef struct {
67 mntinfo4_t *rc_mi;
68 vnode_t *rc_vp1;
69 vnode_t *rc_vp2;
70 nfs4_recov_t rc_action;
71 stateid4 rc_stateid;
72 bool_t rc_srv_reboot; /* server has rebooted */
73 nfs4_lost_rqst_t *rc_lost_rqst;
74 nfs4_error_t rc_orig_errors; /* original errors causing recovery */
75 int rc_error;
76 nfs4_bseqid_entry_t *rc_bseqid_rqst;
77 vnode_t *rc_moved_vp;
78 char *rc_moved_nm;
79 } recov_info_t;
80
81 /*
82 * How long to wait before trying again if there is an error doing
83 * recovery, in seconds.
84 */
85
86 static int recov_err_delay = 1;
87
88 /*
89 * How long to wait when processing NFS4ERR_GRACE or NFS4ERR_DELAY
90 * errors. Expressed in seconds. Default is defined as
91 * NFS4ERR_DELAY_TIME and this variable is initialized in nfs4_subr_init()
92 */
93 time_t nfs4err_delay_time = 0;
94
95 /*
96 * Tuneable to limit how many time "exempt" ops go OTW
97 * after a recovery error. Exempt op hints are OH_CLOSE,
98 * OH_LOCKU, OH_DELEGRETURN. These previously always went
99 * OTW even after rnode was "dead" due to recovery errors.
100 *
101 * The tuneable below limits the number of times a start_fop
102 * invocation will retry the exempt hints. After the limit
103 * is reached, nfs4_start_fop will return an error just like
104 * it would for non-exempt op hints.
105 */
106 int nfs4_max_recov_error_retry = 3;
107
108 /*
109 * Number of seconds the recovery thread should pause before retry when the
110 * filesystem has been forcibly unmounted.
111 */
112
113 int nfs4_unmount_delay = 1;
114
#ifdef DEBUG

/*
 * Delay, in seconds, between recovery operations on a given file.
 * Normally zero; may be raised for testing purposes.
 */
static int nfs4_recovdelay = 0;

/*
 * When set, go into the debugger when recovery fails.
 */
static int nfs4_fail_recov_stop = 0;

/*
 * Tuneables for debugging how the client namespace interacts with
 * server mount points:
 *
 *	nfs4_srvmnt_fail_cnt:
 *		number of times EACCES was returned because the client
 *		attempted to cross a server mountpoint
 *
 *	nfs4_srvmnt_debug:
 *		trigger a console printf whenever the client attempts
 *		to cross a server mountpoint
 */
int nfs4_srvmnt_fail_cnt = 0;
int nfs4_srvmnt_debug = 0;
#endif
144
145 extern zone_key_t nfs4clnt_zone_key;
146
147 /* forward references, in alphabetic order */
148 static void close_after_open_resend(vnode_t *, cred_t *, uint32_t,
149 nfs4_error_t *);
150 static void errs_to_action(recov_info_t *,
151 nfs4_server_t *, mntinfo4_t *, stateid4 *, nfs4_lost_rqst_t *, int,
152 nfs_opnum4, nfs4_bseqid_entry_t *);
153 static void flush_reinstate(nfs4_lost_rqst_t *);
154 static void free_milist(mntinfo4_t **, int);
155 static mntinfo4_t **make_milist(nfs4_server_t *, int *);
156 static int nfs4_check_recov_err(vnode_t *, nfs4_op_hint_t,
157 nfs4_recov_state_t *, int, char *);
158 static char *nfs4_getsrvnames(mntinfo4_t *, size_t *);
159 static void nfs4_recov_fh_fail(vnode_t *, int, nfsstat4);
160 static void nfs4_recov_thread(recov_info_t *);
161 static void nfs4_remove_lost_rqsts(mntinfo4_t *, nfs4_server_t *);
162 static void nfs4_resend_lost_rqsts(recov_info_t *, nfs4_server_t *);
163 static cred_t *pid_to_cr(pid_t);
164 static void reclaim_one_lock(vnode_t *, flock64_t *, nfs4_error_t *, int *);
165 static void recov_bad_seqid(recov_info_t *);
166 static void recov_badstate(recov_info_t *, vnode_t *, nfsstat4);
167 static void recov_clientid(recov_info_t *, nfs4_server_t *);
168 static void recov_done(mntinfo4_t *, recov_info_t *);
169 static void recov_filehandle(nfs4_recov_t, mntinfo4_t *, vnode_t *);
170 static void recov_newserver(recov_info_t *, nfs4_server_t **, bool_t *);
171 static void recov_openfiles(recov_info_t *, nfs4_server_t *);
172 static void recov_stale(mntinfo4_t *, vnode_t *);
173 static void nfs4_free_lost_rqst(nfs4_lost_rqst_t *, nfs4_server_t *);
174 static void recov_throttle(recov_info_t *, vnode_t *);
175 static void relock_skip_pid(vnode_t *, locklist_t *, pid_t);
176 static void resend_lock(nfs4_lost_rqst_t *, nfs4_error_t *);
177 static void resend_one_op(nfs4_lost_rqst_t *, nfs4_error_t *, mntinfo4_t *,
178 nfs4_server_t *);
179 static void save_bseqid_rqst(nfs4_bseqid_entry_t *, recov_info_t *);
180 static void start_recovery(recov_info_t *, mntinfo4_t *, vnode_t *, vnode_t *,
181 nfs4_server_t *, vnode_t *, char *);
182 static void start_recovery_action(nfs4_recov_t, bool_t, mntinfo4_t *, vnode_t *,
183 vnode_t *);
184 static int wait_for_recovery(mntinfo4_t *, nfs4_op_hint_t);
185
186 /*
187 * Return non-zero if the given errno, status, and rpc status codes
188 * in the nfs4_error_t indicate that client recovery is needed.
189 * "stateful" indicates whether the call that got the error establishes or
190 * removes state on the server (open, close, lock, unlock, delegreturn).
191 */
192
193 int
nfs4_needs_recovery(nfs4_error_t * ep,bool_t stateful,vfs_t * vfsp)194 nfs4_needs_recovery(nfs4_error_t *ep, bool_t stateful, vfs_t *vfsp)
195 {
196 int recov = 0;
197 mntinfo4_t *mi;
198
199 /*
200 * Try failover if the error values justify it and if
201 * it's a failover mount. Don't try if the mount is in
202 * progress, failures are handled explicitly by nfs4rootvp.
203 */
204 if (nfs4_try_failover(ep)) {
205 mi = VFTOMI4(vfsp);
206 mutex_enter(&mi->mi_lock);
207 recov = FAILOVER_MOUNT4(mi) && !(mi->mi_flags & MI4_MOUNTING);
208 mutex_exit(&mi->mi_lock);
209 if (recov)
210 return (recov);
211 }
212
213 if (ep->error == EINTR || NFS4_FRC_UNMT_ERR(ep->error, vfsp)) {
214 /*
215 * The server may have gotten the request, so for stateful
216 * ops we need to resynchronize and possibly back out the
217 * op.
218 */
219 return (stateful);
220 }
221 if (ep->error != 0)
222 return (0);
223
224 /* stat values are listed alphabetically */
225 /*
226 * There are two lists here: the errors for which we have code, and
227 * the errors for which we plan to have code before FCS. For the
228 * second list, print a warning message but don't attempt recovery.
229 */
230 switch (ep->stat) {
231 case NFS4ERR_BADHANDLE:
232 case NFS4ERR_BAD_SEQID:
233 case NFS4ERR_BAD_STATEID:
234 case NFS4ERR_DELAY:
235 case NFS4ERR_EXPIRED:
236 case NFS4ERR_FHEXPIRED:
237 case NFS4ERR_GRACE:
238 case NFS4ERR_OLD_STATEID:
239 case NFS4ERR_RESOURCE:
240 case NFS4ERR_STALE_CLIENTID:
241 case NFS4ERR_STALE_STATEID:
242 case NFS4ERR_WRONGSEC:
243 case NFS4ERR_STALE:
244 recov = 1;
245 break;
246 #ifdef DEBUG
247 case NFS4ERR_LEASE_MOVED:
248 case NFS4ERR_MOVED:
249 zcmn_err(VFTOMI4(vfsp)->mi_zone->zone_id,
250 CE_WARN, "!Can't yet recover from NFS status %d",
251 ep->stat);
252 break;
253 #endif
254 }
255
256 return (recov);
257 }
258
259 /*
260 * Some operations such as DELEGRETURN want to avoid invoking
261 * recovery actions that will only mark the file dead. If
262 * better handlers are invoked for any of these errors, this
263 * routine should be modified.
264 */
265 int
nfs4_recov_marks_dead(nfsstat4 status)266 nfs4_recov_marks_dead(nfsstat4 status)
267 {
268 if (status == NFS4ERR_BAD_SEQID ||
269 status == NFS4ERR_EXPIRED ||
270 status == NFS4ERR_BAD_STATEID ||
271 status == NFS4ERR_OLD_STATEID)
272 return (1);
273 return (0);
274 }
275
276 /*
277 * Transfer the state recovery information in recovp to mi's resend queue,
278 * and mark mi as having a lost state request.
279 */
280 static void
nfs4_enqueue_lost_rqst(recov_info_t * recovp,mntinfo4_t * mi)281 nfs4_enqueue_lost_rqst(recov_info_t *recovp, mntinfo4_t *mi)
282 {
283 nfs4_lost_rqst_t *lrp = recovp->rc_lost_rqst;
284
285 ASSERT(nfs_rw_lock_held(&mi->mi_recovlock, RW_READER) ||
286 nfs_rw_lock_held(&mi->mi_recovlock, RW_WRITER));
287
288 ASSERT(lrp != NULL && lrp->lr_op != 0);
289
290 NFS4_DEBUG(nfs4_lost_rqst_debug, (CE_NOTE,
291 "nfs4_enqueue_lost_rqst %p, op %d",
292 (void *)lrp, lrp->lr_op));
293
294 mutex_enter(&mi->mi_lock);
295 mi->mi_recovflags |= MI4R_LOST_STATE;
296 if (lrp->lr_putfirst)
297 list_insert_head(&mi->mi_lost_state, lrp);
298 else
299 list_insert_tail(&mi->mi_lost_state, lrp);
300 recovp->rc_lost_rqst = NULL;
301 mutex_exit(&mi->mi_lock);
302
303 nfs4_queue_event(RE_LOST_STATE, mi, NULL, lrp->lr_op, lrp->lr_vp,
304 lrp->lr_dvp, 0, NULL, 0, TAG_NONE, TAG_NONE, 0, 0);
305 }
306
307 /*
308 * Transfer the bad seqid recovery information in recovp to mi's
309 * bad seqid queue, and mark mi as having a bad seqid request.
310 */
311 void
enqueue_bseqid_rqst(recov_info_t * recovp,mntinfo4_t * mi)312 enqueue_bseqid_rqst(recov_info_t *recovp, mntinfo4_t *mi)
313 {
314 ASSERT(nfs_rw_lock_held(&mi->mi_recovlock, RW_READER) ||
315 nfs_rw_lock_held(&mi->mi_recovlock, RW_WRITER));
316 ASSERT(recovp->rc_bseqid_rqst != NULL);
317
318 mutex_enter(&mi->mi_lock);
319 mi->mi_recovflags |= MI4R_BAD_SEQID;
320 list_insert_tail(&mi->mi_bseqid_list, recovp->rc_bseqid_rqst);
321 recovp->rc_bseqid_rqst = NULL;
322 mutex_exit(&mi->mi_lock);
323 }
324
325 /*
326 * Initiate recovery.
327 *
328 * The nfs4_error_t contains the return codes that triggered a recovery
329 * attempt. mi, vp1, and vp2 refer to the filesystem and files that were
330 * being operated on. vp1 and vp2 may be NULL.
331 *
332 * Multiple calls are okay. If recovery is already underway, the call
333 * updates the information about what state needs recovery but does not
334 * start a new thread. The caller should hold mi->mi_recovlock as a reader
335 * for proper synchronization with any recovery thread.
336 *
337 * This will return TRUE if recovery was aborted, and FALSE otherwise.
338 */
339 bool_t
nfs4_start_recovery(nfs4_error_t * ep,mntinfo4_t * mi,vnode_t * vp1,vnode_t * vp2,stateid4 * sid,nfs4_lost_rqst_t * lost_rqstp,nfs_opnum4 op,nfs4_bseqid_entry_t * bsep,vnode_t * moved_vp,char * moved_nm)340 nfs4_start_recovery(nfs4_error_t *ep, mntinfo4_t *mi, vnode_t *vp1,
341 vnode_t *vp2, stateid4 *sid, nfs4_lost_rqst_t *lost_rqstp, nfs_opnum4 op,
342 nfs4_bseqid_entry_t *bsep, vnode_t *moved_vp, char *moved_nm)
343 {
344 recov_info_t *recovp;
345 nfs4_server_t *sp;
346 bool_t abort = FALSE;
347 bool_t gone = FALSE;
348
349 ASSERT(nfs_zone() == mi->mi_zone);
350 mutex_enter(&mi->mi_lock);
351 /*
352 * If there is lost state, we need to kick off recovery even if the
353 * filesystem has been unmounted or the zone is shutting down.
354 */
355 gone = FS_OR_ZONE_GONE4(mi->mi_vfsp);
356 if (gone) {
357 ASSERT(ep->error != EINTR || lost_rqstp != NULL);
358 if (ep->error == EIO && lost_rqstp == NULL) {
359 /* failed due to forced unmount, no new lost state */
360 abort = TRUE;
361 }
362 if ((ep->error == 0 || ep->error == ETIMEDOUT) &&
363 !(mi->mi_recovflags & MI4R_LOST_STATE)) {
364 /* some other failure, no existing lost state */
365 abort = TRUE;
366 }
367 if (abort) {
368 mutex_exit(&mi->mi_lock);
369 NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
370 "nfs4_start_recovery: fs unmounted"));
371 return (TRUE);
372 }
373 }
374 mi->mi_in_recovery++;
375 mutex_exit(&mi->mi_lock);
376
377 recovp = kmem_alloc(sizeof (recov_info_t), KM_SLEEP);
378 recovp->rc_orig_errors = *ep;
379 sp = find_nfs4_server(mi);
380 errs_to_action(recovp, sp, mi, sid, lost_rqstp, gone, op, bsep);
381 if (sp != NULL)
382 mutex_exit(&sp->s_lock);
383 start_recovery(recovp, mi, vp1, vp2, sp, moved_vp, moved_nm);
384 if (sp != NULL)
385 nfs4_server_rele(sp);
386 return (FALSE);
387 }
388
389 /*
390 * Internal version of nfs4_start_recovery. The difference is that the
391 * caller specifies the recovery action, rather than the errors leading to
392 * recovery.
393 */
394 static void
start_recovery_action(nfs4_recov_t what,bool_t reboot,mntinfo4_t * mi,vnode_t * vp1,vnode_t * vp2)395 start_recovery_action(nfs4_recov_t what, bool_t reboot, mntinfo4_t *mi,
396 vnode_t *vp1, vnode_t *vp2)
397 {
398 recov_info_t *recovp;
399
400 ASSERT(nfs_zone() == mi->mi_zone);
401 mutex_enter(&mi->mi_lock);
402 mi->mi_in_recovery++;
403 mutex_exit(&mi->mi_lock);
404
405 recovp = kmem_zalloc(sizeof (recov_info_t), KM_SLEEP);
406 recovp->rc_action = what;
407 recovp->rc_srv_reboot = reboot;
408 recovp->rc_error = EIO;
409 start_recovery(recovp, mi, vp1, vp2, NULL, NULL, NULL);
410 }
411
412 static void
start_recovery(recov_info_t * recovp,mntinfo4_t * mi,vnode_t * vp1,vnode_t * vp2,nfs4_server_t * sp,vnode_t * moved_vp,char * moved_nm)413 start_recovery(recov_info_t *recovp, mntinfo4_t *mi,
414 vnode_t *vp1, vnode_t *vp2, nfs4_server_t *sp,
415 vnode_t *moved_vp, char *moved_nm)
416 {
417 NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
418 "start_recovery: mi %p, what %s", (void*)mi,
419 nfs4_recov_action_to_str(recovp->rc_action)));
420
421 /*
422 * Bump the reference on the vfs so that we can pass it to the
423 * recovery thread.
424 */
425 VFS_HOLD(mi->mi_vfsp);
426 MI4_HOLD(mi);
427 again:
428 switch (recovp->rc_action) {
429 case NR_FAILOVER:
430 ASSERT(nfs_rw_lock_held(&mi->mi_recovlock, RW_READER) ||
431 nfs_rw_lock_held(&mi->mi_recovlock, RW_WRITER));
432 if (mi->mi_servers->sv_next == NULL)
433 goto out_no_thread;
434 mutex_enter(&mi->mi_lock);
435 mi->mi_recovflags |= MI4R_NEED_NEW_SERVER;
436 mutex_exit(&mi->mi_lock);
437
438 if (recovp->rc_lost_rqst != NULL)
439 nfs4_enqueue_lost_rqst(recovp, mi);
440 break;
441
442 case NR_CLIENTID:
443 /*
444 * If the filesystem has been unmounted, punt.
445 */
446 if (sp == NULL)
447 goto out_no_thread;
448
449 /*
450 * If nobody else is working on the clientid, mark the
451 * clientid as being no longer set. Then mark the specific
452 * filesystem being worked on.
453 */
454 if (!nfs4_server_in_recovery(sp)) {
455 mutex_enter(&sp->s_lock);
456 sp->s_flags &= ~N4S_CLIENTID_SET;
457 mutex_exit(&sp->s_lock);
458 }
459 ASSERT(nfs_rw_lock_held(&mi->mi_recovlock, RW_READER) ||
460 nfs_rw_lock_held(&mi->mi_recovlock, RW_WRITER));
461 mutex_enter(&mi->mi_lock);
462 mi->mi_recovflags |= MI4R_NEED_CLIENTID;
463 if (recovp->rc_srv_reboot)
464 mi->mi_recovflags |= MI4R_SRV_REBOOT;
465 mutex_exit(&mi->mi_lock);
466 break;
467
468 case NR_OPENFILES:
469 ASSERT(nfs_rw_lock_held(&mi->mi_recovlock, RW_READER) ||
470 nfs_rw_lock_held(&mi->mi_recovlock, RW_WRITER));
471 mutex_enter(&mi->mi_lock);
472 mi->mi_recovflags |= MI4R_REOPEN_FILES;
473 if (recovp->rc_srv_reboot)
474 mi->mi_recovflags |= MI4R_SRV_REBOOT;
475 mutex_exit(&mi->mi_lock);
476 break;
477
478 case NR_WRONGSEC:
479 ASSERT(nfs_rw_lock_held(&mi->mi_recovlock, RW_READER) ||
480 nfs_rw_lock_held(&mi->mi_recovlock, RW_WRITER));
481 mutex_enter(&mi->mi_lock);
482 mi->mi_recovflags |= MI4R_NEED_SECINFO;
483 mutex_exit(&mi->mi_lock);
484 break;
485
486 case NR_EXPIRED:
487 if (vp1 != NULL)
488 recov_badstate(recovp, vp1, NFS4ERR_EXPIRED);
489 if (vp2 != NULL)
490 recov_badstate(recovp, vp2, NFS4ERR_EXPIRED);
491 goto out_no_thread; /* no further recovery possible */
492
493 case NR_BAD_STATEID:
494 if (vp1 != NULL)
495 recov_badstate(recovp, vp1, NFS4ERR_BAD_STATEID);
496 if (vp2 != NULL)
497 recov_badstate(recovp, vp2, NFS4ERR_BAD_STATEID);
498 goto out_no_thread; /* no further recovery possible */
499
500 case NR_FHEXPIRED:
501 case NR_BADHANDLE:
502 if (vp1 != NULL)
503 recov_throttle(recovp, vp1);
504 if (vp2 != NULL)
505 recov_throttle(recovp, vp2);
506 /*
507 * Recover the filehandle now, rather than using a
508 * separate thread. We can do this because filehandle
509 * recovery is independent of any other state, and because
510 * we know that we are not competing with the recovery
511 * thread at this time. recov_filehandle will deal with
512 * threads that are competing to recover this filehandle.
513 */
514 ASSERT(nfs_rw_lock_held(&mi->mi_recovlock, RW_READER) ||
515 nfs_rw_lock_held(&mi->mi_recovlock, RW_WRITER));
516 if (vp1 != NULL)
517 recov_filehandle(recovp->rc_action, mi, vp1);
518 if (vp2 != NULL)
519 recov_filehandle(recovp->rc_action, mi, vp2);
520 goto out_no_thread; /* no further recovery needed */
521
522 case NR_STALE:
523 /*
524 * NFS4ERR_STALE handling
525 * recov_stale() could set MI4R_NEED_NEW_SERVER to
526 * indicate that we can and should failover.
527 */
528 ASSERT(nfs_rw_lock_held(&mi->mi_recovlock, RW_READER) ||
529 nfs_rw_lock_held(&mi->mi_recovlock, RW_WRITER));
530
531 if (vp1 != NULL)
532 recov_stale(mi, vp1);
533 if (vp2 != NULL)
534 recov_stale(mi, vp2);
535 mutex_enter(&mi->mi_lock);
536 if ((mi->mi_recovflags & MI4R_NEED_NEW_SERVER) == 0) {
537 mutex_exit(&mi->mi_lock);
538 goto out_no_thread;
539 }
540 mutex_exit(&mi->mi_lock);
541 recovp->rc_action = NR_FAILOVER;
542 goto again;
543
544 case NR_BAD_SEQID:
545 if (recovp->rc_bseqid_rqst) {
546 enqueue_bseqid_rqst(recovp, mi);
547 break;
548 }
549
550 if (vp1 != NULL)
551 recov_badstate(recovp, vp1, NFS4ERR_BAD_SEQID);
552 if (vp2 != NULL)
553 recov_badstate(recovp, vp2, NFS4ERR_BAD_SEQID);
554 goto out_no_thread; /* no further recovery possible */
555
556 case NR_OLDSTATEID:
557 if (vp1 != NULL)
558 recov_badstate(recovp, vp1, NFS4ERR_OLD_STATEID);
559 if (vp2 != NULL)
560 recov_badstate(recovp, vp2, NFS4ERR_OLD_STATEID);
561 goto out_no_thread; /* no further recovery possible */
562
563 case NR_GRACE:
564 nfs4_set_grace_wait(mi);
565 goto out_no_thread; /* no further action required for GRACE */
566
567 case NR_DELAY:
568 if (vp1)
569 nfs4_set_delay_wait(vp1);
570 goto out_no_thread; /* no further action required for DELAY */
571
572 case NR_LOST_STATE_RQST:
573 case NR_LOST_LOCK:
574 nfs4_enqueue_lost_rqst(recovp, mi);
575 break;
576 default:
577 nfs4_queue_event(RE_UNEXPECTED_ACTION, mi, NULL,
578 recovp->rc_action, NULL, NULL, 0, NULL, 0, TAG_NONE,
579 TAG_NONE, 0, 0);
580 goto out_no_thread;
581 }
582
583 /*
584 * If either file recently went through the same recovery, wait
585 * awhile. This is in case there is some sort of bug; we might not
586 * be able to recover properly, but at least we won't bombard the
587 * server with calls, and we won't tie up the client.
588 */
589 if (vp1 != NULL)
590 recov_throttle(recovp, vp1);
591 if (vp2 != NULL)
592 recov_throttle(recovp, vp2);
593
594 /*
595 * If there's already a recovery thread, don't start another one.
596 */
597
598 mutex_enter(&mi->mi_lock);
599 if (mi->mi_flags & MI4_RECOV_ACTIV) {
600 mutex_exit(&mi->mi_lock);
601 goto out_no_thread;
602 }
603 mi->mi_flags |= MI4_RECOV_ACTIV;
604 mutex_exit(&mi->mi_lock);
605 NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
606 "start_recovery: starting new thread for mi %p", (void*)mi));
607
608 recovp->rc_mi = mi;
609 recovp->rc_vp1 = vp1;
610 if (vp1 != NULL) {
611 ASSERT(VTOMI4(vp1) == mi);
612 VN_HOLD(recovp->rc_vp1);
613 }
614 recovp->rc_vp2 = vp2;
615 if (vp2 != NULL) {
616 ASSERT(VTOMI4(vp2) == mi);
617 VN_HOLD(recovp->rc_vp2);
618 }
619 recovp->rc_moved_vp = moved_vp;
620 recovp->rc_moved_nm = moved_nm;
621
622 (void) zthread_create(NULL, 0, nfs4_recov_thread, recovp, 0,
623 minclsyspri);
624 return;
625
626 /* not reached by thread creating call */
627 out_no_thread:
628 mutex_enter(&mi->mi_lock);
629 mi->mi_in_recovery--;
630 if (mi->mi_in_recovery == 0)
631 cv_broadcast(&mi->mi_cv_in_recov);
632 mutex_exit(&mi->mi_lock);
633
634 VFS_RELE(mi->mi_vfsp);
635 MI4_RELE(mi);
636 /*
637 * Free up resources that were allocated for us.
638 */
639 kmem_free(recovp, sizeof (recov_info_t));
640 }
641
642 static int
nfs4_check_recov_err(vnode_t * vp,nfs4_op_hint_t op,nfs4_recov_state_t * rsp,int retry_err_cnt,char * str)643 nfs4_check_recov_err(vnode_t *vp, nfs4_op_hint_t op,
644 nfs4_recov_state_t *rsp, int retry_err_cnt, char *str)
645 {
646 rnode4_t *rp;
647 int error = 0;
648 int exempt;
649
650 if (vp == NULL)
651 return (0);
652
653 exempt = (op == OH_CLOSE || op == OH_LOCKU || op == OH_DELEGRETURN);
654 rp = VTOR4(vp);
655 mutex_enter(&rp->r_statelock);
656
657 /*
658 * If there was a recovery error, then allow op hints "exempt" from
659 * recov errors to retry (currently 3 times). Either r_error or
660 * EIO is returned for non-exempt op hints.
661 */
662 if (rp->r_flags & R4RECOVERR) {
663 if (exempt && rsp->rs_num_retry_despite_err <=
664 nfs4_max_recov_error_retry) {
665
666 /*
667 * Check to make sure that we haven't already inc'd
668 * rs_num_retry_despite_err for current nfs4_start_fop
669 * instance. We don't want to double inc (if we were
670 * called with vp2, then the vp1 call could have
671 * already incremented.
672 */
673 if (retry_err_cnt == rsp->rs_num_retry_despite_err)
674 rsp->rs_num_retry_despite_err++;
675
676 NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
677 "nfs4_start_fop: %s %p DEAD, cnt=%d", str,
678 (void *)vp, rsp->rs_num_retry_despite_err));
679 } else {
680 error = (rp->r_error ? rp->r_error : EIO);
681 /*
682 * An ESTALE error on a non-regular file is not
683 * "sticky". Return the ESTALE error once, but
684 * clear the condition to allow future operations
685 * to go OTW. This will allow the client to
686 * recover if the server has merely unshared then
687 * re-shared the file system. For regular files,
688 * the unshare has destroyed the open state at the
689 * server and we aren't willing to do a reopen (yet).
690 */
691 if (error == ESTALE && vp->v_type != VREG) {
692 rp->r_flags &=
693 ~(R4RECOVERR|R4RECOVERRP|R4STALE);
694 rp->r_error = 0;
695 error = ESTALE;
696 }
697 NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
698 "nfs4_start_fop: %s %p DEAD, cnt=%d error=%d",
699 str, (void *)vp,
700 rsp->rs_num_retry_despite_err, error));
701 }
702 }
703
704 mutex_exit(&rp->r_statelock);
705 return (error);
706 }
707
708 /*
709 * Initial setup code that every operation should call if it might invoke
710 * client recovery. Can block waiting for recovery to finish on a
711 * filesystem. Either vnode ptr can be NULL.
712 *
713 * Returns 0 if there are no outstanding errors. Can return an
714 * errno value under various circumstances (e.g., failed recovery, or
715 * interrupted while waiting for recovery to finish).
716 *
717 * There must be a corresponding call to nfs4_end_op() to free up any locks
718 * or resources allocated by this call (assuming this call succeeded),
719 * using the same rsp that's passed in here.
720 *
721 * The open and lock seqid synchronization must be stopped before calling this
722 * function, as it could lead to deadlock when trying to reopen a file or
723 * reclaim a lock. The synchronization is obtained with calls to:
724 * nfs4_start_open_seqid_sync()
725 * nfs4_start_lock_seqid_sync()
726 *
727 * *startrecovp is set TRUE if the caller should not bother with the
728 * over-the-wire call, and just initiate recovery for the given request.
729 * This is typically used for state-releasing ops if the filesystem has
730 * been forcibly unmounted. startrecovp may be NULL for
731 * non-state-releasing ops.
732 */
733
734 int
nfs4_start_fop(mntinfo4_t * mi,vnode_t * vp1,vnode_t * vp2,nfs4_op_hint_t op,nfs4_recov_state_t * rsp,bool_t * startrecovp)735 nfs4_start_fop(mntinfo4_t *mi, vnode_t *vp1, vnode_t *vp2, nfs4_op_hint_t op,
736 nfs4_recov_state_t *rsp, bool_t *startrecovp)
737 {
738 int error = 0, rerr_cnt;
739 nfs4_server_t *sp = NULL;
740 nfs4_server_t *tsp;
741 nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };
742 uint_t droplock_cnt;
743 #ifdef DEBUG
744 void *fop_caller;
745 #endif
746
747 ASSERT(vp1 == NULL || vp1->v_vfsp == mi->mi_vfsp);
748 ASSERT(vp2 == NULL || vp2->v_vfsp == mi->mi_vfsp);
749
750 #ifdef DEBUG
751 if ((fop_caller = tsd_get(nfs4_tsd_key)) != NULL) {
752 cmn_err(CE_PANIC, "Missing nfs4_end_fop: last caller %p",
753 fop_caller);
754 }
755 (void) tsd_set(nfs4_tsd_key, caller());
756 #endif
757
758 rsp->rs_sp = NULL;
759 rsp->rs_flags &= ~NFS4_RS_RENAME_HELD;
760 rerr_cnt = rsp->rs_num_retry_despite_err;
761
762 /*
763 * Process the items that may delay() based on server response
764 */
765 error = nfs4_wait_for_grace(mi, rsp);
766 if (error)
767 goto out;
768
769 if (vp1 != NULL) {
770 error = nfs4_wait_for_delay(vp1, rsp);
771 if (error)
772 goto out;
773 }
774
775 /* Wait for a delegation recall to complete. */
776
777 error = wait_for_recall(vp1, vp2, op, rsp);
778 if (error)
779 goto out;
780
781 /*
782 * Wait for any current recovery actions to finish. Note that a
783 * recovery thread can still start up after wait_for_recovery()
784 * finishes. We don't block out recovery operations until we
785 * acquire s_recovlock and mi_recovlock.
786 */
787 error = wait_for_recovery(mi, op);
788 if (error)
789 goto out;
790
791 /*
792 * Check to see if the rnode is already marked with a
793 * recovery error. If so, return it immediately. But
794 * always pass CLOSE, LOCKU, and DELEGRETURN so we can
795 * clean up state on the server.
796 */
797
798 if (vp1 != NULL) {
799 if (error = nfs4_check_recov_err(vp1, op, rsp, rerr_cnt, "vp1"))
800 goto out;
801 nfs4_check_remap(mi, vp1, NFS4_REMAP_CKATTRS, &e);
802 }
803
804 if (vp2 != NULL) {
805 if (error = nfs4_check_recov_err(vp2, op, rsp, rerr_cnt, "vp2"))
806 goto out;
807 nfs4_check_remap(mi, vp2, NFS4_REMAP_CKATTRS, &e);
808 }
809
810 /*
811 * The lock order calls for us to acquire s_recovlock before
812 * mi_recovlock, but we have to hold mi_recovlock to look up sp (to
813 * prevent races with the failover/migration code). So acquire
814 * mi_recovlock, look up sp, drop mi_recovlock, acquire
815 * s_recovlock and mi_recovlock, then verify that sp is still the
816 * right object. XXX Can we find a simpler way to deal with this?
817 */
818 if (nfs_rw_enter_sig(&mi->mi_recovlock, RW_READER,
819 mi->mi_flags & MI4_INT)) {
820 error = EINTR;
821 goto out;
822 }
823 get_sp:
824 sp = find_nfs4_server(mi);
825 if (sp != NULL) {
826 sp->s_otw_call_count++;
827 mutex_exit(&sp->s_lock);
828 droplock_cnt = mi->mi_srvset_cnt;
829 }
830 nfs_rw_exit(&mi->mi_recovlock);
831
832 if (sp != NULL) {
833 if (nfs_rw_enter_sig(&sp->s_recovlock, RW_READER,
834 mi->mi_flags & MI4_INT)) {
835 error = EINTR;
836 goto out;
837 }
838 }
839 if (nfs_rw_enter_sig(&mi->mi_recovlock, RW_READER,
840 mi->mi_flags & MI4_INT)) {
841 if (sp != NULL)
842 nfs_rw_exit(&sp->s_recovlock);
843 error = EINTR;
844 goto out;
845 }
846 /*
847 * If the mntinfo4_t hasn't changed nfs4_sever_ts then
848 * there's no point in double checking to make sure it
849 * has switched.
850 */
851 if (sp == NULL || droplock_cnt != mi->mi_srvset_cnt) {
852 tsp = find_nfs4_server(mi);
853 if (tsp != sp) {
854 /* try again */
855 if (tsp != NULL) {
856 mutex_exit(&tsp->s_lock);
857 nfs4_server_rele(tsp);
858 tsp = NULL;
859 }
860 if (sp != NULL) {
861 nfs_rw_exit(&sp->s_recovlock);
862 mutex_enter(&sp->s_lock);
863 sp->s_otw_call_count--;
864 mutex_exit(&sp->s_lock);
865 nfs4_server_rele(sp);
866 sp = NULL;
867 }
868 goto get_sp;
869 } else {
870 if (tsp != NULL) {
871 mutex_exit(&tsp->s_lock);
872 nfs4_server_rele(tsp);
873 tsp = NULL;
874 }
875 }
876 }
877
878 if (sp != NULL) {
879 rsp->rs_sp = sp;
880 }
881
882 /*
883 * If the fileystem uses volatile filehandles, obtain a lock so
884 * that we synchronize with renames. Exception: mount operations
885 * can change mi_fh_expire_type, which could be a problem, since
886 * the end_op code needs to be consistent with the start_op code
887 * about mi_rename_lock. Since mounts don't compete with renames,
888 * it's simpler to just not acquire the rename lock for mounts.
889 */
890 if (NFS4_VOLATILE_FH(mi) && op != OH_MOUNT) {
891 if (nfs_rw_enter_sig(&mi->mi_rename_lock,
892 op == OH_VFH_RENAME ? RW_WRITER : RW_READER,
893 mi->mi_flags & MI4_INT)) {
894 nfs_rw_exit(&mi->mi_recovlock);
895 if (sp != NULL)
896 nfs_rw_exit(&sp->s_recovlock);
897 error = EINTR;
898 goto out;
899 }
900 rsp->rs_flags |= NFS4_RS_RENAME_HELD;
901 }
902
903 if (OH_IS_STATE_RELE(op)) {
904 /*
905 * For forced unmount, letting the request proceed will
906 * almost always delay response to the user, so hand it off
907 * to the recovery thread. For exiting lwp's, we don't
908 * have a good way to tell if the request will hang. We
909 * generally want processes to handle their own requests so
910 * that they can be done in parallel, but if there is
911 * already a recovery thread, hand the request off to it.
912 * This will improve user response at no cost to overall
913 * system throughput. For zone shutdown, we'd prefer
914 * the recovery thread to handle this as well.
915 */
916 ASSERT(startrecovp != NULL);
917 mutex_enter(&mi->mi_lock);
918 if (FS_OR_ZONE_GONE4(mi->mi_vfsp))
919 *startrecovp = TRUE;
920 else if ((curthread->t_proc_flag & TP_LWPEXIT) &&
921 (mi->mi_flags & MI4_RECOV_ACTIV))
922 *startrecovp = TRUE;
923 else
924 *startrecovp = FALSE;
925 mutex_exit(&mi->mi_lock);
926 } else
927 if (startrecovp != NULL)
928 *startrecovp = FALSE;
929
930 ASSERT(error == 0);
931 return (error);
932
933 out:
934 ASSERT(error != 0);
935 if (sp != NULL) {
936 mutex_enter(&sp->s_lock);
937 sp->s_otw_call_count--;
938 mutex_exit(&sp->s_lock);
939 nfs4_server_rele(sp);
940 rsp->rs_sp = NULL;
941 }
942 nfs4_end_op_recall(vp1, vp2, rsp);
943
944 #ifdef DEBUG
945 (void) tsd_set(nfs4_tsd_key, NULL);
946 #endif
947 return (error);
948 }
949
950 /*
951 * It is up to the caller to determine if rsp->rs_sp being NULL
952 * is detrimental or not.
953 */
954 int
nfs4_start_op(mntinfo4_t * mi,vnode_t * vp1,vnode_t * vp2,nfs4_recov_state_t * rsp)955 nfs4_start_op(mntinfo4_t *mi, vnode_t *vp1, vnode_t *vp2,
956 nfs4_recov_state_t *rsp)
957 {
958 ASSERT(rsp->rs_num_retry_despite_err == 0);
959 rsp->rs_num_retry_despite_err = 0;
960 return (nfs4_start_fop(mi, vp1, vp2, OH_OTHER, rsp, NULL));
961 }
962
963 /*
964 * Release any resources acquired by nfs4_start_op().
965 * 'sp' should be the nfs4_server pointer returned by nfs4_start_op().
966 *
967 * The operation hint is used to avoid a deadlock by bypassing delegation
968 * return logic for writes, which are done while returning a delegation.
969 */
970
971 void
nfs4_end_fop(mntinfo4_t * mi,vnode_t * vp1,vnode_t * vp2,nfs4_op_hint_t op,nfs4_recov_state_t * rsp,bool_t needs_recov)972 nfs4_end_fop(mntinfo4_t *mi, vnode_t *vp1, vnode_t *vp2, nfs4_op_hint_t op,
973 nfs4_recov_state_t *rsp, bool_t needs_recov)
974 {
975 nfs4_server_t *sp = rsp->rs_sp;
976 rnode4_t *rp = NULL;
977
978 #ifdef lint
979 /*
980 * The op hint isn't used any more, but might be in
981 * the future.
982 */
983 op = op;
984 #endif
985
986 #ifdef DEBUG
987 ASSERT(tsd_get(nfs4_tsd_key) != NULL);
988 (void) tsd_set(nfs4_tsd_key, NULL);
989 #endif
990
991 nfs4_end_op_recall(vp1, vp2, rsp);
992
993 if (rsp->rs_flags & NFS4_RS_RENAME_HELD)
994 nfs_rw_exit(&mi->mi_rename_lock);
995
996 if (!needs_recov) {
997 if (rsp->rs_flags & NFS4_RS_DELAY_MSG) {
998 /* may need to clear the delay interval */
999 if (vp1 != NULL) {
1000 rp = VTOR4(vp1);
1001 mutex_enter(&rp->r_statelock);
1002 rp->r_delay_interval = 0;
1003 mutex_exit(&rp->r_statelock);
1004 }
1005 }
1006 rsp->rs_flags &= ~(NFS4_RS_GRACE_MSG|NFS4_RS_DELAY_MSG);
1007 }
1008
1009 /*
1010 * If the corresponding nfs4_start_op() found a sp,
1011 * then there must still be a sp.
1012 */
1013 if (sp != NULL) {
1014 nfs_rw_exit(&mi->mi_recovlock);
1015 nfs_rw_exit(&sp->s_recovlock);
1016 mutex_enter(&sp->s_lock);
1017 sp->s_otw_call_count--;
1018 cv_broadcast(&sp->s_cv_otw_count);
1019 mutex_exit(&sp->s_lock);
1020 nfs4_server_rele(sp);
1021 } else {
1022 nfs_rw_exit(&mi->mi_recovlock);
1023 }
1024 }
1025
1026 void
nfs4_end_op(mntinfo4_t * mi,vnode_t * vp1,vnode_t * vp2,nfs4_recov_state_t * rsp,bool_t needrecov)1027 nfs4_end_op(mntinfo4_t *mi, vnode_t *vp1, vnode_t *vp2,
1028 nfs4_recov_state_t *rsp, bool_t needrecov)
1029 {
1030 nfs4_end_fop(mi, vp1, vp2, OH_OTHER, rsp, needrecov);
1031 }
1032
1033 /*
1034 * If the filesystem is going through client recovery, block until
1035 * finished.
1036 * Exceptions:
1037 * - state-releasing ops (CLOSE, LOCKU, DELEGRETURN) are allowed to proceed
1038 * if the filesystem has been forcibly unmounted or the lwp is exiting.
1039 *
1040 * Return value:
1041 * - 0 if no errors
1042 * - EINTR if the call was interrupted
1043 * - EIO if the filesystem has been forcibly unmounted (non-state-releasing
1044 * op)
1045 * - the errno value from the recovery thread, if recovery failed
1046 */
1047
1048 static int
wait_for_recovery(mntinfo4_t * mi,nfs4_op_hint_t op_hint)1049 wait_for_recovery(mntinfo4_t *mi, nfs4_op_hint_t op_hint)
1050 {
1051 int error = 0;
1052
1053 mutex_enter(&mi->mi_lock);
1054
1055 while (mi->mi_recovflags != 0) {
1056 klwp_t *lwp = ttolwp(curthread);
1057
1058 if ((mi->mi_vfsp->vfs_flag & VFS_UNMOUNTED) ||
1059 (mi->mi_flags & MI4_RECOV_FAIL))
1060 break;
1061 if (OH_IS_STATE_RELE(op_hint) &&
1062 (curthread->t_proc_flag & TP_LWPEXIT))
1063 break;
1064
1065 if (lwp != NULL)
1066 lwp->lwp_nostop++;
1067 /* XXX - use different cv? */
1068 if (cv_wait_sig(&mi->mi_failover_cv, &mi->mi_lock) == 0) {
1069 error = EINTR;
1070 if (lwp != NULL)
1071 lwp->lwp_nostop--;
1072 break;
1073 }
1074 if (lwp != NULL)
1075 lwp->lwp_nostop--;
1076 }
1077
1078 if ((mi->mi_vfsp->vfs_flag & VFS_UNMOUNTED) &&
1079 !OH_IS_STATE_RELE(op_hint)) {
1080 NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
1081 "wait_for_recovery: forced unmount"));
1082 error = EIO;
1083 } else if (mi->mi_flags & MI4_RECOV_FAIL) {
1084 NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
1085 "wait_for_recovery: fail since RECOV FAIL"));
1086 error = mi->mi_error;
1087 }
1088
1089 mutex_exit(&mi->mi_lock);
1090
1091 return (error);
1092 }
1093
1094 /*
1095 * If the client received NFS4ERR_GRACE for this particular mount,
1096 * the client blocks here until it is time to try again.
1097 *
1098 * Return value:
1099 * - 0 if wait was successful
1100 * - EINTR if the call was interrupted
1101 */
1102
1103 int
nfs4_wait_for_grace(mntinfo4_t * mi,nfs4_recov_state_t * rsp)1104 nfs4_wait_for_grace(mntinfo4_t *mi, nfs4_recov_state_t *rsp)
1105 {
1106 int error = 0;
1107 time_t curtime, time_to_wait;
1108
1109 /* do a unprotected check to reduce mi_lock contention */
1110 if (mi->mi_grace_wait != 0) {
1111 mutex_enter(&mi->mi_lock);
1112
1113 if (mi->mi_grace_wait != 0) {
1114 if (!(rsp->rs_flags & NFS4_RS_GRACE_MSG))
1115 rsp->rs_flags |= NFS4_RS_GRACE_MSG;
1116
1117 curtime = gethrestime_sec();
1118
1119 if (curtime < mi->mi_grace_wait) {
1120
1121 time_to_wait = mi->mi_grace_wait - curtime;
1122
1123 mutex_exit(&mi->mi_lock);
1124
1125 delay(SEC_TO_TICK(time_to_wait));
1126
1127 curtime = gethrestime_sec();
1128
1129 mutex_enter(&mi->mi_lock);
1130
1131 if (curtime >= mi->mi_grace_wait)
1132 mi->mi_grace_wait = 0;
1133 } else {
1134 mi->mi_grace_wait = 0;
1135 }
1136 }
1137 mutex_exit(&mi->mi_lock);
1138 }
1139
1140 return (error);
1141 }
1142
1143 /*
1144 * If the client received NFS4ERR_DELAY for an operation on a vnode,
1145 * the client blocks here until it is time to try again.
1146 *
1147 * Return value:
1148 * - 0 if wait was successful
1149 * - EINTR if the call was interrupted
1150 */
1151
1152 int
nfs4_wait_for_delay(vnode_t * vp,nfs4_recov_state_t * rsp)1153 nfs4_wait_for_delay(vnode_t *vp, nfs4_recov_state_t *rsp)
1154 {
1155 int error = 0;
1156 time_t curtime, time_to_wait;
1157 rnode4_t *rp;
1158
1159 ASSERT(vp != NULL);
1160
1161 rp = VTOR4(vp);
1162
1163 /* do a unprotected check to reduce r_statelock contention */
1164 if (rp->r_delay_wait != 0) {
1165 mutex_enter(&rp->r_statelock);
1166
1167 if (rp->r_delay_wait != 0) {
1168
1169 if (!(rsp->rs_flags & NFS4_RS_DELAY_MSG)) {
1170 rsp->rs_flags |= NFS4_RS_DELAY_MSG;
1171 nfs4_mi_kstat_inc_delay(VTOMI4(vp));
1172 }
1173
1174 curtime = gethrestime_sec();
1175
1176 if (curtime < rp->r_delay_wait) {
1177
1178 time_to_wait = rp->r_delay_wait - curtime;
1179
1180 mutex_exit(&rp->r_statelock);
1181
1182 delay(SEC_TO_TICK(time_to_wait));
1183
1184 curtime = gethrestime_sec();
1185
1186 mutex_enter(&rp->r_statelock);
1187
1188 if (curtime >= rp->r_delay_wait)
1189 rp->r_delay_wait = 0;
1190 } else {
1191 rp->r_delay_wait = 0;
1192 }
1193 }
1194 mutex_exit(&rp->r_statelock);
1195 }
1196
1197 return (error);
1198 }
1199
/*
 * The recovery thread.
 *
 * Runs recovery for the filesystem recovp->rc_mi.  Each pass of the main
 * loop checks, in order: forced unmount / zone shutdown, failover to a new
 * server, clientid recovery, security information, bad seqid, reopening
 * files, and resending queued "lost state" requests.  The loop ends when
 * no recovery flags other than MI4R_SRV_REBOOT remain, when recovery has
 * been marked failed (MI4_RECOV_FAIL), or when the filesystem is gone and
 * there is no lost state to process.
 *
 * On the way out the thread drops its nfs4_server reference, calls
 * nfs4_dlistclean(), announces completion via recov_done(), releases
 * rc_vp1/rc_vp2, decrements mi_in_recovery (waking mi_cv_in_recov waiters
 * when it reaches zero), releases the VFS and mntinfo4 holds, frees
 * 'recovp' and exits with zthread_exit().  It does not return.
 */

static void
nfs4_recov_thread(recov_info_t *recovp)
{
	mntinfo4_t *mi = recovp->rc_mi;
	nfs4_server_t *sp;
	int done = 0, error = 0;
	bool_t recov_fail = FALSE;
	callb_cpr_t cpr_info;
	kmutex_t cpr_lock;

	nfs4_queue_event(RE_START, mi, NULL, mi->mi_recovflags,
	    recovp->rc_vp1, recovp->rc_vp2, 0, NULL, 0, TAG_NONE, TAG_NONE,
	    0, 0);

	mutex_init(&cpr_lock, NULL, MUTEX_DEFAULT, NULL);
	CALLB_CPR_INIT(&cpr_info, &cpr_lock, callb_generic_cpr, "nfsv4Recov");

	/* Publish ourselves as this filesystem's recovery thread. */
	mutex_enter(&mi->mi_lock);
	mi->mi_recovthread = curthread;
	mutex_exit(&mi->mi_lock);

	/*
	 * We don't really need protection here against failover or
	 * migration, since the current thread is the one that would make
	 * any changes, but hold mi_recovlock anyway for completeness (and
	 * to satisfy any ASSERTs).
	 */
	(void) nfs_rw_enter_sig(&mi->mi_recovlock, RW_READER, 0);
	sp = find_nfs4_server(mi);
	/* A non-NULL sp comes back with s_lock held; we don't need it yet. */
	if (sp != NULL)
		mutex_exit(&sp->s_lock);
	nfs_rw_exit(&mi->mi_recovlock);

	/*
	 * Do any necessary recovery, based on the information in recovp
	 * and any recovery flags.
	 */

	do {
		mutex_enter(&mi->mi_lock);
		if (FS_OR_ZONE_GONE4(mi->mi_vfsp)) {
			bool_t activesrv;

			NFS4_DEBUG(nfs4_client_recov_debug &&
			    mi->mi_vfsp->vfs_flag & VFS_UNMOUNTED, (CE_NOTE,
			    "nfs4_recov_thread: file system has been "
			    "unmounted"));
			NFS4_DEBUG(nfs4_client_recov_debug &&
			    zone_status_get(curproc->p_zone) >=
			    ZONE_IS_SHUTTING_DOWN, (CE_NOTE,
			    "nfs4_recov_thread: zone shutting down"));
			/*
			 * If the server has lost its state for us and
			 * the filesystem is unmounted, then the filesystem
			 * can be tossed, even if there are lost lock or
			 * lost state calls in the recovery queue.
			 */
			if (mi->mi_recovflags &
			    (MI4R_NEED_CLIENTID | MI4R_REOPEN_FILES)) {
				NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
				    "nfs4_recov_thread: bailing out"));
				mi->mi_flags |= MI4_RECOV_FAIL;
				mi->mi_error = recovp->rc_error;
				recov_fail = TRUE;
			}
			/*
			 * We don't know if the server has any state for
			 * us, and the filesystem has been unmounted.  If
			 * there are "lost state" recovery items, keep
			 * trying to process them until there are no more
			 * mounted filesystems for the server.  Otherwise,
			 * bail out.  The reason we don't mark the
			 * filesystem as failing recovery is in case we
			 * have to do "lost state" recovery later (e.g., a
			 * user process exits).
			 */
			if (!(mi->mi_recovflags & MI4R_LOST_STATE)) {
				done = 1;
				mutex_exit(&mi->mi_lock);
				break;
			}
			mutex_exit(&mi->mi_lock);

			/*
			 * s_lock is taken here (when sp exists) and held
			 * across nfs4_fs_active() and nfs4_mark_srv_dead();
			 * it is dropped at the bottom of this branch.
			 */
			if (sp == NULL)
				activesrv = FALSE;
			else {
				mutex_enter(&sp->s_lock);
				activesrv = nfs4_fs_active(sp);
			}
			if (!activesrv) {
				NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
				    "no active fs for server %p",
				    (void *)sp));
				mutex_enter(&mi->mi_lock);
				mi->mi_flags |= MI4_RECOV_FAIL;
				mi->mi_error = recovp->rc_error;
				mutex_exit(&mi->mi_lock);
				recov_fail = TRUE;
				if (sp != NULL) {
					/*
					 * Mark the server instance as
					 * dead, so that nobody will attach
					 * a new filesystem.
					 */
					nfs4_mark_srv_dead(sp);
				}
			}
			if (sp != NULL)
				mutex_exit(&sp->s_lock);
		} else {
			mutex_exit(&mi->mi_lock);
		}

		/*
		 * Check if we need to select a new server for a
		 * failover.  Choosing a new server will force at
		 * least a check of the clientid.
		 */
		mutex_enter(&mi->mi_lock);
		if (!recov_fail &&
		    (mi->mi_recovflags & MI4R_NEED_NEW_SERVER)) {
			mutex_exit(&mi->mi_lock);
			/* May replace sp and may set recov_fail. */
			recov_newserver(recovp, &sp, &recov_fail);
		} else
			mutex_exit(&mi->mi_lock);

		/*
		 * Check if we need to recover the clientid.  This
		 * must be done before file and lock recovery, and it
		 * potentially affects the recovery threads for other
		 * filesystems, so it gets special treatment.
		 */
		if (sp != NULL && recov_fail == FALSE) {
			mutex_enter(&sp->s_lock);
			if (!(sp->s_flags & N4S_CLIENTID_SET)) {
				mutex_exit(&sp->s_lock);
				recov_clientid(recovp, sp);
			} else {
				/*
				 * Unset this flag in case another recovery
				 * thread successfully recovered the clientid
				 * for us already.
				 */
				mutex_enter(&mi->mi_lock);
				mi->mi_recovflags &= ~MI4R_NEED_CLIENTID;
				mutex_exit(&mi->mi_lock);
				mutex_exit(&sp->s_lock);
			}
		}

		/*
		 * Check if we need to get the security information.
		 */
		mutex_enter(&mi->mi_lock);
		if ((mi->mi_recovflags & MI4R_NEED_SECINFO) &&
		    !(mi->mi_flags & MI4_RECOV_FAIL)) {
			mutex_exit(&mi->mi_lock);
			(void) nfs_rw_enter_sig(&mi->mi_recovlock,
			    RW_WRITER, 0);
			error = nfs4_secinfo_recov(recovp->rc_mi,
			    recovp->rc_vp1, recovp->rc_vp2);
			/*
			 * If error, nothing more can be done, stop
			 * the recovery.
			 */
			if (error) {
				mutex_enter(&mi->mi_lock);
				mi->mi_flags |= MI4_RECOV_FAIL;
				mi->mi_error = recovp->rc_error;
				mutex_exit(&mi->mi_lock);
				nfs4_queue_event(RE_WRONGSEC, mi, NULL,
				    error, recovp->rc_vp1, recovp->rc_vp2,
				    0, NULL, 0, TAG_NONE, TAG_NONE, 0, 0);
			}
			nfs_rw_exit(&mi->mi_recovlock);
		} else
			mutex_exit(&mi->mi_lock);

		/*
		 * Check if there's a bad seqid to recover.
		 */
		mutex_enter(&mi->mi_lock);
		if ((mi->mi_recovflags & MI4R_BAD_SEQID) &&
		    !(mi->mi_flags & MI4_RECOV_FAIL)) {
			mutex_exit(&mi->mi_lock);
			(void) nfs_rw_enter_sig(&mi->mi_recovlock,
			    RW_WRITER, 0);
			recov_bad_seqid(recovp);
			nfs_rw_exit(&mi->mi_recovlock);
		} else
			mutex_exit(&mi->mi_lock);

		/*
		 * Next check for recovery that affects the entire
		 * filesystem.
		 */
		if (sp != NULL) {
			mutex_enter(&mi->mi_lock);
			if ((mi->mi_recovflags & MI4R_REOPEN_FILES) &&
			    !(mi->mi_flags & MI4_RECOV_FAIL)) {
				mutex_exit(&mi->mi_lock);
				recov_openfiles(recovp, sp);
			} else
				mutex_exit(&mi->mi_lock);
		}

		/*
		 * Send any queued state recovery requests.
		 */
		mutex_enter(&mi->mi_lock);
		if (sp != NULL &&
		    (mi->mi_recovflags & MI4R_LOST_STATE) &&
		    !(mi->mi_flags & MI4_RECOV_FAIL)) {
			mutex_exit(&mi->mi_lock);
			(void) nfs_rw_enter_sig(&mi->mi_recovlock,
			    RW_WRITER, 0);
			nfs4_resend_lost_rqsts(recovp, sp);
			/* Queue drained: the LOST_STATE work is finished. */
			if (list_head(&mi->mi_lost_state) == NULL) {
				/* done */
				mutex_enter(&mi->mi_lock);
				mi->mi_recovflags &= ~MI4R_LOST_STATE;
				mutex_exit(&mi->mi_lock);
			}
			nfs_rw_exit(&mi->mi_recovlock);
		} else {
			mutex_exit(&mi->mi_lock);
		}

		/*
		 * See if there is anything more to do.  If not, announce
		 * that we are done and exit.
		 *
		 * Need mi_recovlock to keep 'sp' valid.  Must grab
		 * mi_recovlock before mi_lock to preserve lock ordering.
		 */
		(void) nfs_rw_enter_sig(&mi->mi_recovlock, RW_READER, 0);
		mutex_enter(&mi->mi_lock);
		if ((mi->mi_recovflags & ~MI4R_SRV_REBOOT) == 0 ||
		    (mi->mi_flags & MI4_RECOV_FAIL)) {
			list_t local_lost_state;
			nfs4_lost_rqst_t *lrp;

			/*
			 * We need to remove the lost requests before we
			 * unmark the mi as no longer doing recovery to
			 * avoid a race with a new thread putting new lost
			 * requests on the same mi (and the going away
			 * thread would remove the new lost requests).
			 *
			 * Move the lost requests to a local list since
			 * nfs4_remove_lost_rqst() drops mi_lock, and
			 * dropping the mi_lock would make our check to
			 * see if recovery is done no longer valid.
			 */
			list_create(&local_lost_state,
			    sizeof (nfs4_lost_rqst_t),
			    offsetof(nfs4_lost_rqst_t, lr_node));
			list_move_tail(&local_lost_state, &mi->mi_lost_state);

			done = 1;
			mutex_exit(&mi->mi_lock);
			/*
			 * Now officially free the "moved"
			 * lost requests.
			 */
			while ((lrp = list_head(&local_lost_state)) != NULL) {
				list_remove(&local_lost_state, lrp);
				nfs4_free_lost_rqst(lrp, sp);
			}
			list_destroy(&local_lost_state);
		} else
			mutex_exit(&mi->mi_lock);
		nfs_rw_exit(&mi->mi_recovlock);

		/*
		 * If the filesystem has been forcibly unmounted, there is
		 * probably no point in retrying immediately.  Furthermore,
		 * there might be user processes waiting for a chance to
		 * queue up "lost state" requests, so that they can exit.
		 * So pause here for a moment.  Same logic for zone shutdown.
		 */
		if (!done && FS_OR_ZONE_GONE4(mi->mi_vfsp)) {
			mutex_enter(&mi->mi_lock);
			cv_broadcast(&mi->mi_failover_cv);
			mutex_exit(&mi->mi_lock);
			delay(SEC_TO_TICK(nfs4_unmount_delay));
		}

	} while (!done);

	/* Drop the server reference (original or failover replacement). */
	if (sp != NULL)
		nfs4_server_rele(sp);

	/*
	 * Return all recalled delegations
	 */
	nfs4_dlistclean();

	/* recov_done() asserts that mi_lock is held. */
	mutex_enter(&mi->mi_lock);
	recov_done(mi, recovp);
	mutex_exit(&mi->mi_lock);

	/*
	 * Free up resources that were allocated for us.
	 */
	if (recovp->rc_vp1 != NULL)
		VN_RELE(recovp->rc_vp1);
	if (recovp->rc_vp2 != NULL)
		VN_RELE(recovp->rc_vp2);

	/* now we are done using the mi struct, signal the waiters */
	mutex_enter(&mi->mi_lock);
	mi->mi_in_recovery--;
	if (mi->mi_in_recovery == 0)
		cv_broadcast(&mi->mi_cv_in_recov);
	mutex_exit(&mi->mi_lock);

	VFS_RELE(mi->mi_vfsp);
	MI4_RELE(mi);
	kmem_free(recovp, sizeof (recov_info_t));
	/* cpr_lock is entered for CALLB_CPR_EXIT and then destroyed. */
	mutex_enter(&cpr_lock);
	CALLB_CPR_EXIT(&cpr_info);
	mutex_destroy(&cpr_lock);
	zthread_exit();
}
1529
1530 /*
1531 * Log the end of recovery and notify any waiting threads.
1532 */
1533
1534 static void
recov_done(mntinfo4_t * mi,recov_info_t * recovp)1535 recov_done(mntinfo4_t *mi, recov_info_t *recovp)
1536 {
1537
1538 ASSERT(MUTEX_HELD(&mi->mi_lock));
1539
1540 nfs4_queue_event(RE_END, mi, NULL, 0, recovp->rc_vp1,
1541 recovp->rc_vp2, 0, NULL, 0, TAG_NONE, TAG_NONE, 0, 0);
1542 mi->mi_recovthread = NULL;
1543 mi->mi_flags &= ~MI4_RECOV_ACTIV;
1544 mi->mi_recovflags &= ~MI4R_SRV_REBOOT;
1545 cv_broadcast(&mi->mi_failover_cv);
1546 }
1547
1548 /*
1549 * State-specific recovery routines, by state.
1550 */
1551
1552 /*
1553 * Failover.
1554 *
1555 * Replaces *spp with a reference to the new server, which must
1556 * eventually be freed.
1557 */
1558
1559 static void
recov_newserver(recov_info_t * recovp,nfs4_server_t ** spp,bool_t * recov_fail)1560 recov_newserver(recov_info_t *recovp, nfs4_server_t **spp, bool_t *recov_fail)
1561 {
1562 mntinfo4_t *mi = recovp->rc_mi;
1563 servinfo4_t *svp = NULL;
1564 nfs4_server_t *osp = *spp;
1565 CLIENT *cl;
1566 enum clnt_stat status;
1567 struct timeval tv;
1568 int error;
1569 int oncethru = 0;
1570 rnode4_t *rp;
1571 int index;
1572 nfs_fh4 fh;
1573 char *snames;
1574 size_t len;
1575
1576 (void) nfs_rw_enter_sig(&mi->mi_recovlock, RW_WRITER, 0);
1577
1578 tv.tv_sec = 2;
1579 tv.tv_usec = 0;
1580
1581 #ifdef lint
1582 /*
1583 * Lint can't follow the logic, so thinks that snames and len
1584 * can be used before being set. They can't, but lint can't
1585 * figure it out. To address the lint warning, initialize
1586 * snames and len for lint.
1587 */
1588 snames = NULL;
1589 len = 0;
1590 #endif
1591
1592 /*
1593 * Ping the null NFS procedure of every server in
1594 * the list until one responds. We always start
1595 * at the head of the list and always skip the one
1596 * that is current, since it's caused us a problem.
1597 */
1598 while (svp == NULL) {
1599 for (svp = mi->mi_servers; svp; svp = svp->sv_next) {
1600
1601 mutex_enter(&mi->mi_lock);
1602 if (FS_OR_ZONE_GONE4(mi->mi_vfsp)) {
1603 mi->mi_flags |= MI4_RECOV_FAIL;
1604 mutex_exit(&mi->mi_lock);
1605 (void) nfs_rw_exit(&mi->mi_recovlock);
1606 *recov_fail = TRUE;
1607 if (oncethru)
1608 kmem_free(snames, len);
1609 return;
1610 }
1611 mutex_exit(&mi->mi_lock);
1612
1613 (void) nfs_rw_enter_sig(&svp->sv_lock, RW_READER, 0);
1614 if (svp->sv_flags & SV4_NOTINUSE) {
1615 nfs_rw_exit(&svp->sv_lock);
1616 continue;
1617 }
1618 nfs_rw_exit(&svp->sv_lock);
1619
1620 if (!oncethru && svp == mi->mi_curr_serv)
1621 continue;
1622
1623 error = clnt_tli_kcreate(svp->sv_knconf, &svp->sv_addr,
1624 NFS_PROGRAM, NFS_V4, 0, 1, CRED(), &cl);
1625 if (error)
1626 continue;
1627
1628 if (!(mi->mi_flags & MI4_INT))
1629 cl->cl_nosignal = TRUE;
1630 status = CLNT_CALL(cl, RFS_NULL, xdr_void, NULL,
1631 xdr_void, NULL, tv);
1632 if (!(mi->mi_flags & MI4_INT))
1633 cl->cl_nosignal = FALSE;
1634 AUTH_DESTROY(cl->cl_auth);
1635 CLNT_DESTROY(cl);
1636 if (status == RPC_SUCCESS) {
1637 nfs4_queue_event(RE_FAILOVER, mi,
1638 svp == mi->mi_curr_serv ? NULL :
1639 svp->sv_hostname, 0, NULL, NULL, 0,
1640 NULL, 0, TAG_NONE, TAG_NONE, 0, 0);
1641 break;
1642 }
1643 }
1644
1645 if (svp == NULL) {
1646 if (!oncethru) {
1647 snames = nfs4_getsrvnames(mi, &len);
1648 nfs4_queue_fact(RF_SRVS_NOT_RESPOND, mi,
1649 0, 0, 0, FALSE, snames, 0, NULL);
1650 oncethru = 1;
1651 }
1652 delay(hz);
1653 }
1654 }
1655
1656 if (oncethru) {
1657 nfs4_queue_fact(RF_SRVS_OK, mi, 0, 0, 0, FALSE, snames,
1658 0, NULL);
1659 kmem_free(snames, len);
1660 }
1661
1662 #if DEBUG
1663 (void) nfs_rw_enter_sig(&svp->sv_lock, RW_READER, 0);
1664 ASSERT((svp->sv_flags & SV4_NOTINUSE) == 0);
1665 nfs_rw_exit(&svp->sv_lock);
1666 #endif
1667
1668 mutex_enter(&mi->mi_lock);
1669 mi->mi_recovflags &= ~MI4R_NEED_NEW_SERVER;
1670 if (svp != mi->mi_curr_serv) {
1671 servinfo4_t *osvp = mi->mi_curr_serv;
1672
1673 mutex_exit(&mi->mi_lock);
1674
1675 /*
1676 * Update server-dependent fields in the root vnode.
1677 */
1678 index = rtable4hash(mi->mi_rootfh);
1679 rw_enter(&rtable4[index].r_lock, RW_WRITER);
1680
1681 rp = r4find(&rtable4[index], mi->mi_rootfh, mi->mi_vfsp);
1682 if (rp != NULL) {
1683 NFS4_DEBUG(nfs4_client_failover_debug, (CE_NOTE,
1684 "recov_newserver: remapping %s", rnode4info(rp)));
1685 mutex_enter(&rp->r_statelock);
1686 rp->r_server = svp;
1687 PURGE_ATTRCACHE4_LOCKED(rp);
1688 mutex_exit(&rp->r_statelock);
1689 (void) nfs4_free_data_reclaim(rp);
1690 nfs4_purge_rddir_cache(RTOV4(rp));
1691 rw_exit(&rtable4[index].r_lock);
1692 NFS4_DEBUG(nfs4_client_failover_debug, (CE_NOTE,
1693 "recov_newserver: done with %s",
1694 rnode4info(rp)));
1695 VN_RELE(RTOV4(rp));
1696 } else
1697 rw_exit(&rtable4[index].r_lock);
1698 (void) dnlc_purge_vfsp(mi->mi_vfsp, 0);
1699
1700 mutex_enter(&mi->mi_lock);
1701 mi->mi_recovflags |= MI4R_REOPEN_FILES | MI4R_REMAP_FILES;
1702 if (recovp->rc_srv_reboot)
1703 mi->mi_recovflags |= MI4R_SRV_REBOOT;
1704 mi->mi_curr_serv = svp;
1705 mi->mi_failover++;
1706 mi->mi_flags &= ~MI4_BADOWNER_DEBUG;
1707 mutex_exit(&mi->mi_lock);
1708
1709 (void) nfs_rw_enter_sig(&svp->sv_lock, RW_READER, 0);
1710 fh.nfs_fh4_len = svp->sv_fhandle.fh_len;
1711 fh.nfs_fh4_val = svp->sv_fhandle.fh_buf;
1712 sfh4_update(mi->mi_rootfh, &fh);
1713 fh.nfs_fh4_len = svp->sv_pfhandle.fh_len;
1714 fh.nfs_fh4_val = svp->sv_pfhandle.fh_buf;
1715 sfh4_update(mi->mi_srvparentfh, &fh);
1716 nfs_rw_exit(&svp->sv_lock);
1717
1718 *spp = nfs4_move_mi(mi, osvp, svp);
1719 if (osp != NULL)
1720 nfs4_server_rele(osp);
1721 } else
1722 mutex_exit(&mi->mi_lock);
1723 (void) nfs_rw_exit(&mi->mi_recovlock);
1724 }
1725
1726 /*
1727 * Clientid.
1728 */
1729
1730 static void
recov_clientid(recov_info_t * recovp,nfs4_server_t * sp)1731 recov_clientid(recov_info_t *recovp, nfs4_server_t *sp)
1732 {
1733 mntinfo4_t *mi = recovp->rc_mi;
1734 int error = 0;
1735 int still_stale;
1736 int need_new_s;
1737
1738 ASSERT(sp != NULL);
1739
1740 /*
1741 * Acquire the recovery lock and then verify that the clientid
1742 * still needs to be recovered. (Note that s_recovlock is supposed
1743 * to be acquired before s_lock.) Since the thread holds the
1744 * recovery lock, no other thread will recover the clientid.
1745 */
1746 (void) nfs_rw_enter_sig(&sp->s_recovlock, RW_WRITER, 0);
1747 (void) nfs_rw_enter_sig(&mi->mi_recovlock, RW_WRITER, 0);
1748 mutex_enter(&sp->s_lock);
1749 still_stale = ((sp->s_flags & N4S_CLIENTID_SET) == 0);
1750 mutex_exit(&sp->s_lock);
1751
1752 if (still_stale) {
1753 nfs4_error_t n4e;
1754
1755 nfs4_error_zinit(&n4e);
1756 nfs4setclientid(mi, kcred, TRUE, &n4e);
1757 error = n4e.error;
1758 if (error != 0) {
1759
1760 /*
1761 * nfs4setclientid may have set MI4R_NEED_NEW_SERVER,
1762 * if so, just return and let recov_thread drive
1763 * failover.
1764 */
1765 mutex_enter(&mi->mi_lock);
1766 need_new_s = mi->mi_recovflags & MI4R_NEED_NEW_SERVER;
1767 mutex_exit(&mi->mi_lock);
1768
1769 if (need_new_s) {
1770 nfs_rw_exit(&mi->mi_recovlock);
1771 nfs_rw_exit(&sp->s_recovlock);
1772 return;
1773 }
1774
1775 nfs4_queue_event(RE_CLIENTID, mi, NULL, n4e.error, NULL,
1776 NULL, n4e.stat, NULL, 0, TAG_NONE, TAG_NONE, 0, 0);
1777 mutex_enter(&mi->mi_lock);
1778 mi->mi_flags |= MI4_RECOV_FAIL;
1779 mi->mi_error = recovp->rc_error;
1780 mutex_exit(&mi->mi_lock);
1781 /* don't destroy the nfs4_server, let umount do it */
1782 }
1783 }
1784
1785 if (error == 0) {
1786 mutex_enter(&mi->mi_lock);
1787 mi->mi_recovflags &= ~MI4R_NEED_CLIENTID;
1788 /*
1789 * If still_stale isn't true, then another thread already
1790 * recovered the clientid. And that thread that set the
1791 * clientid will have initiated reopening files on all the
1792 * filesystems for the server, so we should not initiate
1793 * reopening for this filesystem here.
1794 */
1795 if (still_stale) {
1796 mi->mi_recovflags |= MI4R_REOPEN_FILES;
1797 if (recovp->rc_srv_reboot)
1798 mi->mi_recovflags |= MI4R_SRV_REBOOT;
1799 }
1800 mutex_exit(&mi->mi_lock);
1801 }
1802
1803 nfs_rw_exit(&mi->mi_recovlock);
1804
1805 if (error != 0) {
1806 nfs_rw_exit(&sp->s_recovlock);
1807 mutex_enter(&mi->mi_lock);
1808 if ((mi->mi_flags & MI4_RECOV_FAIL) == 0)
1809 delay(SEC_TO_TICK(recov_err_delay));
1810 mutex_exit(&mi->mi_lock);
1811 } else {
1812 mntinfo4_t **milist;
1813 mntinfo4_t *tmi;
1814 int nummi, i;
1815
1816 /*
1817 * Initiate recovery of open files for other filesystems.
1818 * We create an array of filesystems, rather than just
1819 * walking the filesystem list, to avoid deadlock issues
1820 * with s_lock and mi_recovlock.
1821 */
1822 milist = make_milist(sp, &nummi);
1823 for (i = 0; i < nummi; i++) {
1824 tmi = milist[i];
1825 if (tmi != mi) {
1826 (void) nfs_rw_enter_sig(&tmi->mi_recovlock,
1827 RW_READER, 0);
1828 start_recovery_action(NR_OPENFILES, TRUE, tmi,
1829 NULL, NULL);
1830 nfs_rw_exit(&tmi->mi_recovlock);
1831 }
1832 }
1833 free_milist(milist, nummi);
1834
1835 nfs_rw_exit(&sp->s_recovlock);
1836 }
1837 }
1838
1839 /*
1840 * Return an array of filesystems associated with the given server. The
1841 * caller should call free_milist() to free the references and memory.
1842 */
1843
1844 static mntinfo4_t **
make_milist(nfs4_server_t * sp,int * nummip)1845 make_milist(nfs4_server_t *sp, int *nummip)
1846 {
1847 int nummi, i;
1848 mntinfo4_t **milist;
1849 mntinfo4_t *tmi;
1850
1851 mutex_enter(&sp->s_lock);
1852 nummi = 0;
1853 for (tmi = sp->mntinfo4_list; tmi != NULL; tmi = tmi->mi_clientid_next)
1854 nummi++;
1855
1856 milist = kmem_alloc(nummi * sizeof (mntinfo4_t *), KM_SLEEP);
1857
1858 for (i = 0, tmi = sp->mntinfo4_list; tmi != NULL; i++,
1859 tmi = tmi->mi_clientid_next) {
1860 milist[i] = tmi;
1861 VFS_HOLD(tmi->mi_vfsp);
1862 }
1863 mutex_exit(&sp->s_lock);
1864
1865 *nummip = nummi;
1866 return (milist);
1867 }
1868
1869 /*
1870 * Free the filesystem list created by make_milist().
1871 */
1872
1873 static void
free_milist(mntinfo4_t ** milist,int nummi)1874 free_milist(mntinfo4_t **milist, int nummi)
1875 {
1876 mntinfo4_t *tmi;
1877 int i;
1878
1879 for (i = 0; i < nummi; i++) {
1880 tmi = milist[i];
1881 VFS_RELE(tmi->mi_vfsp);
1882 }
1883 kmem_free(milist, nummi * sizeof (mntinfo4_t *));
1884 }
1885
1886 /*
1887 * Filehandle
1888 */
1889
1890 /*
1891 * Lookup the filehandle for the given vnode and update the rnode if it has
1892 * changed.
1893 *
1894 * Errors:
1895 * - if the filehandle could not be updated because of an error that
1896 * requires further recovery, initiate that recovery and return.
1897 * - if the filehandle could not be updated because of a signal, pretend we
1898 * succeeded and let someone else deal with it.
1899 * - if the filehandle could not be updated and the filesystem has been
1900 * forcibly unmounted, pretend we succeeded, and let the caller deal with
1901 * the forced unmount (to retry or not to retry, that is the question).
1902 * - if the filehandle could not be updated because of some other error,
1903 * mark the rnode bad and return.
1904 */
1905 static void
recov_filehandle(nfs4_recov_t action,mntinfo4_t * mi,vnode_t * vp)1906 recov_filehandle(nfs4_recov_t action, mntinfo4_t *mi, vnode_t *vp)
1907 {
1908 rnode4_t *rp = VTOR4(vp);
1909 nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };
1910 bool_t needrecov;
1911
1912 mutex_enter(&rp->r_statelock);
1913
1914 if (rp->r_flags & R4RECOVERR) {
1915 mutex_exit(&rp->r_statelock);
1916 return;
1917 }
1918
1919 /*
1920 * If someone else is updating the filehandle, wait for them to
1921 * finish and then let our caller retry.
1922 */
1923 if (rp->r_flags & R4RECEXPFH) {
1924 while (rp->r_flags & R4RECEXPFH) {
1925 cv_wait(&rp->r_cv, &rp->r_statelock);
1926 }
1927 mutex_exit(&rp->r_statelock);
1928 return;
1929 }
1930 rp->r_flags |= R4RECEXPFH;
1931 mutex_exit(&rp->r_statelock);
1932
1933 if (action == NR_BADHANDLE) {
1934 /* shouldn't happen */
1935 nfs4_queue_event(RE_BADHANDLE, mi, NULL, 0,
1936 vp, NULL, 0, NULL, 0, TAG_NONE, TAG_NONE, 0, 0);
1937 }
1938
1939 nfs4_remap_file(mi, vp, 0, &e);
1940 needrecov = nfs4_needs_recovery(&e, FALSE, mi->mi_vfsp);
1941
1942 /*
1943 * If we get BADHANDLE, FHEXPIRED or STALE in their handler,
1944 * something is broken. Don't try to recover, just mark the
1945 * file dead.
1946 */
1947 DTRACE_PROBE2(recov__filehandle, nfs4_error_t, &e, vnode_t, vp);
1948 if (needrecov) {
1949 if (e.error == 0) {
1950 switch (e.stat) {
1951 case NFS4ERR_BADHANDLE:
1952 case NFS4ERR_FHEXPIRED:
1953 case NFS4ERR_STALE:
1954 goto norec; /* Unrecoverable errors */
1955 default:
1956 break;
1957 }
1958 }
1959 (void) nfs4_start_recovery(&e, mi, vp, NULL,
1960 NULL, NULL, OP_LOOKUP, NULL, NULL, NULL);
1961
1962 } else if (e.error != EINTR &&
1963 !NFS4_FRC_UNMT_ERR(e.error, mi->mi_vfsp) &&
1964 (e.error != 0 || e.stat != NFS4_OK)) {
1965 nfs4_recov_fh_fail(vp, e.error, e.stat);
1966 /*
1967 * Don't set r_error to ESTALE. Higher-level code (e.g.,
1968 * cstatat_getvp()) retries on ESTALE, which would cause
1969 * an infinite loop.
1970 */
1971 }
1972 norec:
1973 mutex_enter(&rp->r_statelock);
1974 rp->r_flags &= ~R4RECEXPFH;
1975 cv_broadcast(&rp->r_cv);
1976 mutex_exit(&rp->r_statelock);
1977 }
1978
1979 /*
1980 * Stale Filehandle
1981 */
1982
1983 /*
1984 * A stale filehandle can happen when an individual file has
1985 * been removed, or when an entire filesystem has been taken
1986 * offline. To distinguish these cases, we do this:
1987 * - if a GETATTR with the current filehandle is okay, we do
1988 * nothing (this can happen with two-filehandle ops)
1989 * - if the GETATTR fails, but a GETATTR of the root filehandle
1990 * succeeds, mark the rnode with R4STALE, which will stop use
1991 * - if the GETATTR fails, and a GETATTR of the root filehandle
1992 * also fails, we consider the problem filesystem-wide, so:
1993 * - if we can failover, we should
1994 * - if we can't failover, we should mark both the original
1995 * vnode and the root bad
1996 */
1997 static void
recov_stale(mntinfo4_t * mi,vnode_t * vp)1998 recov_stale(mntinfo4_t *mi, vnode_t *vp)
1999 {
2000 rnode4_t *rp = VTOR4(vp);
2001 vnode_t *rootvp = NULL;
2002 nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };
2003 nfs4_ga_res_t gar;
2004 char *fail_msg = "failed to recover from NFS4ERR_STALE";
2005 bool_t needrecov;
2006
2007 mutex_enter(&rp->r_statelock);
2008
2009 if (rp->r_flags & R4RECOVERR) {
2010 mutex_exit(&rp->r_statelock);
2011 NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
2012 "recov_stale: already marked dead, rp %s",
2013 rnode4info(rp)));
2014 return;
2015 }
2016
2017 if (rp->r_flags & R4STALE) {
2018 mutex_exit(&rp->r_statelock);
2019 NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
2020 "recov_stale: already marked stale, rp %s",
2021 rnode4info(rp)));
2022 return;
2023 }
2024
2025 mutex_exit(&rp->r_statelock);
2026
2027 /* Try a GETATTR on this vnode */
2028 nfs4_getattr_otw_norecovery(vp, &gar, &e, CRED(), 0);
2029
2030 /*
2031 * Handle non-STALE recoverable errors
2032 */
2033 needrecov = nfs4_needs_recovery(&e, FALSE, vp->v_vfsp);
2034 if (needrecov) {
2035 if (e.error == 0) {
2036 switch (e.stat) {
2037 case NFS4ERR_STALE:
2038 case NFS4ERR_BADHANDLE:
2039 goto norec; /* Unrecoverable */
2040 default:
2041 break;
2042 }
2043 }
2044 (void) nfs4_start_recovery(&e, mi, vp, NULL,
2045 NULL, NULL, OP_GETATTR, NULL, NULL, NULL);
2046 goto out;
2047 }
2048 norec:
2049 /* Are things OK for this vnode? */
2050 if (!e.error && e.stat == NFS4_OK) {
2051 NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
2052 "recov_stale: file appears fine, rp %s",
2053 rnode4info(rp)));
2054 goto out;
2055 }
2056
2057 /* Did we get an unrelated non-recoverable error? */
2058 if (e.error || e.stat != NFS4ERR_STALE) {
2059 nfs4_fail_recov(vp, fail_msg, e.error, e.stat);
2060 NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
2061 "recov_stale: unrelated fatal error, rp %s",
2062 rnode4info(rp)));
2063 goto out;
2064 }
2065
2066 /*
2067 * If we don't appear to be dealing with the root node, find it.
2068 */
2069 if ((vp->v_flag & VROOT) == 0) {
2070 nfs4_error_zinit(&e);
2071 e.error = VFS_ROOT(vp->v_vfsp, &rootvp);
2072 if (e.error) {
2073 nfs4_fail_recov(vp, fail_msg, 0, NFS4ERR_STALE);
2074 NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
2075 "recov_stale: can't find root node for rp %s",
2076 rnode4info(rp)));
2077 goto out;
2078 }
2079 }
2080
2081 /* Try a GETATTR on the root vnode */
2082 if (rootvp != NULL) {
2083 nfs4_error_zinit(&e);
2084 nfs4_getattr_otw_norecovery(rootvp, &gar, &e, CRED(), 0);
2085
2086 needrecov = nfs4_needs_recovery(&e, FALSE, vp->v_vfsp);
2087 if (needrecov) {
2088 if (e.error == 0) {
2089 switch (e.stat) {
2090 case NFS4ERR_STALE:
2091 case NFS4ERR_BADHANDLE:
2092 goto unrec; /* Unrecoverable */
2093 default:
2094 break;
2095 }
2096 }
2097 (void) nfs4_start_recovery(&e, mi, rootvp, NULL,
2098 NULL, NULL, OP_GETATTR, NULL, NULL, NULL);
2099 }
2100 unrec:
2101 /*
2102 * Check to see if a failover attempt is warranted
2103 * NB: nfs4_try_failover doesn't check for STALE
2104 * because recov_stale gets a shot first. Now that
2105 * recov_stale has failed, go ahead and try failover.
2106 *
2107 * If the getattr on the root filehandle was successful,
2108 * then mark recovery as failed for 'vp' and exit.
2109 */
2110 if (nfs4_try_failover(&e) == 0 && e.stat != NFS4ERR_STALE) {
2111 /*
2112 * pass the original error to fail_recov, not
2113 * the one from trying the root vnode.
2114 */
2115 nfs4_fail_recov(vp, fail_msg, 0, NFS4ERR_STALE);
2116 NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
2117 "recov_stale: root node OK, marking "
2118 "dead rp %s", rnode4info(rp)));
2119 goto out;
2120 }
2121 }
2122
2123 /*
2124 * Here, we know that both the original file and the
2125 * root filehandle (which may be the same) are stale.
2126 * We want to fail over if we can, and if we can't, we
2127 * want to mark everything in sight bad.
2128 */
2129 if (FAILOVER_MOUNT4(mi)) {
2130 mutex_enter(&mi->mi_lock);
2131 mi->mi_recovflags |= MI4R_NEED_NEW_SERVER;
2132 NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
2133 "recov_stale: failing over due to rp %s",
2134 rnode4info(rp)));
2135 mutex_exit(&mi->mi_lock);
2136 } else {
2137 rnode4_t *rootrp;
2138 servinfo4_t *svp;
2139
2140 /*
2141 * Can't fail over, so mark things dead.
2142 *
2143 * If rootvp is set, we know we have a distinct
2144 * non-root vnode which can be marked dead in
2145 * the usual way.
2146 *
2147 * Then we want to mark the root vnode dead.
2148 * Note that if rootvp wasn't set, our vp is
2149 * actually the root vnode.
2150 */
2151 if (rootvp != NULL) {
2152 NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
2153 "recov_stale: can't fail over, marking dead rp %s",
2154 rnode4info(rp)));
2155 nfs4_fail_recov(vp, fail_msg, 0, NFS4ERR_STALE);
2156 } else {
2157 rootvp = vp;
2158 VN_HOLD(rootvp);
2159 }
2160
2161 /*
2162 * Mark root dead, but quietly - since
2163 * the root rnode is frequently recreated,
2164 * we can encounter this at every access.
2165 * Also mark recovery as failed on this VFS.
2166 */
2167 rootrp = VTOR4(rootvp);
2168 NFS4_DEBUG(nfs4_client_recov_debug, (CE_CONT,
2169 "recov_stale: marking dead root rp %s",
2170 rnode4info(rootrp)));
2171 mutex_enter(&rootrp->r_statelock);
2172 rootrp->r_flags |= (R4RECOVERR | R4STALE);
2173 rootrp->r_error = ESTALE;
2174 mutex_exit(&rootrp->r_statelock);
2175 mutex_enter(&mi->mi_lock);
2176 mi->mi_error = ESTALE;
2177 mutex_exit(&mi->mi_lock);
2178
2179 svp = mi->mi_curr_serv;
2180 (void) nfs_rw_enter_sig(&svp->sv_lock, RW_WRITER, 0);
2181 svp->sv_flags |= SV4_ROOT_STALE;
2182 nfs_rw_exit(&svp->sv_lock);
2183 }
2184
2185 out:
2186 if (rootvp)
2187 VN_RELE(rootvp);
2188 }
2189
2190 /*
2191 * Locks.
2192 */
2193
2194 /*
2195 * Reclaim all the active (acquired) locks for the given file.
2196 * If a process lost a lock, the process is sent a SIGLOST. This is not
2197 * considered an error.
2198 *
2199 * Return values:
2200 * Errors and status are returned via the nfs4_error_t parameter
2201 * If an error indicates that recovery is needed, the caller is responsible
2202 * for dealing with it.
2203 */
2204
2205 static void
relock_file(vnode_t * vp,mntinfo4_t * mi,nfs4_error_t * ep,fattr4_change pre_change)2206 relock_file(vnode_t *vp, mntinfo4_t *mi, nfs4_error_t *ep,
2207 fattr4_change pre_change)
2208 {
2209 locklist_t *locks, *llp;
2210 rnode4_t *rp;
2211
2212 ASSERT(ep != NULL);
2213 nfs4_error_zinit(ep);
2214
2215 if (VTOMI4(vp)->mi_flags & MI4_LLOCK)
2216 return;
2217
2218 nfs4_flush_lock_owners(VTOR4(vp));
2219
2220 /*
2221 * If we get an error that requires recovery actions, just bail out
2222 * and let the top-level recovery code handle it.
2223 *
2224 * If we get some other error, kill the process that owned the lock
2225 * and mark its remaining locks (if any) as belonging to NOPID, so
2226 * that we don't make any more reclaim requests for that process.
2227 */
2228
2229 rp = VTOR4(vp);
2230 locks = flk_active_locks_for_vp(vp);
2231 for (llp = locks; llp != NULL; llp = llp->ll_next) {
2232 int did_reclaim = 1;
2233
2234 ASSERT(llp->ll_vp == vp);
2235 if (llp->ll_flock.l_pid == NOPID)
2236 continue;
2237 reclaim_one_lock(vp, &llp->ll_flock, ep, &did_reclaim);
2238 /*
2239 * If we need to restart recovery, stop processing the
2240 * list. Some errors would be recoverable under other
2241 * circumstances, but if they happen here we just give up
2242 * on the lock.
2243 */
2244 if (nfs4_needs_recovery(ep, TRUE, vp->v_vfsp)) {
2245 if (ep->error != 0)
2246 break;
2247 if (!nfs4_recov_marks_dead(ep->stat))
2248 break;
2249 }
2250 /*
2251 * In case the server isn't offering us a grace period, or
2252 * if we missed it, we might have opened & locked from scratch,
2253 * rather than reopened/reclaimed.
2254 * We need to ensure that the object hadn't been otherwise
2255 * changed during this time, by comparing the changeinfo.
2256 * We get passed the changeinfo from before the reopen by our
2257 * caller, in pre_change.
2258 * The changeinfo from after the reopen is in rp->r_change,
2259 * courtesy of the GETATTR in the reopen.
2260 * If they're different, then the file has changed, and we
2261 * have to SIGLOST the app.
2262 */
2263 if (ep->error == 0 && ep->stat == NFS4_OK && !did_reclaim) {
2264 mutex_enter(&rp->r_statelock);
2265 if (pre_change != rp->r_change)
2266 ep->stat = NFS4ERR_NO_GRACE;
2267 mutex_exit(&rp->r_statelock);
2268 }
2269 if (ep->error != 0 || ep->stat != NFS4_OK) {
2270 if (ep->error != 0)
2271 nfs4_queue_event(RE_FAIL_RELOCK, mi,
2272 NULL, ep->error, vp, NULL, 0, NULL,
2273 llp->ll_flock.l_pid, TAG_NONE, TAG_NONE,
2274 0, 0);
2275 else
2276 nfs4_queue_event(RE_FAIL_RELOCK, mi,
2277 NULL, 0, vp, NULL, ep->stat, NULL,
2278 llp->ll_flock.l_pid, TAG_NONE, TAG_NONE,
2279 0, 0);
2280 nfs4_send_siglost(llp->ll_flock.l_pid, mi, vp, TRUE,
2281 ep->error, ep->stat);
2282 relock_skip_pid(vp, llp, llp->ll_flock.l_pid);
2283
2284 /* Reinitialize the nfs4_error and continue */
2285 nfs4_error_zinit(ep);
2286 }
2287 }
2288
2289 if (locks != NULL)
2290 flk_free_locklist(locks);
2291 }
2292
2293 /*
2294 * Reclaim the given lock.
2295 *
2296 * Errors are returned via the nfs4_error_t parameter.
2297 */
2298 static void
reclaim_one_lock(vnode_t * vp,flock64_t * flk,nfs4_error_t * ep,int * did_reclaimp)2299 reclaim_one_lock(vnode_t *vp, flock64_t *flk, nfs4_error_t *ep,
2300 int *did_reclaimp)
2301 {
2302 cred_t *cr;
2303 rnode4_t *rp = VTOR4(vp);
2304
2305 cr = pid_to_cr(flk->l_pid);
2306 if (cr == NULL) {
2307 nfs4_error_init(ep, ESRCH);
2308 return;
2309 }
2310
2311 do {
2312 mutex_enter(&rp->r_statelock);
2313 if (rp->r_flags & R4RECOVERR) {
2314 mutex_exit(&rp->r_statelock);
2315 nfs4_error_init(ep, ESTALE);
2316 break;
2317 }
2318 mutex_exit(&rp->r_statelock);
2319
2320 nfs4frlock(NFS4_LCK_CTYPE_RECLAIM, vp, F_SETLK, flk,
2321 FREAD|FWRITE, 0, cr, ep, NULL, did_reclaimp);
2322 if (ep->error == 0 && ep->stat == NFS4ERR_FHEXPIRED)
2323 start_recovery_action(NR_FHEXPIRED, TRUE, VTOMI4(vp),
2324 vp, NULL);
2325 } while (ep->error == 0 && ep->stat == NFS4ERR_FHEXPIRED);
2326
2327 crfree(cr);
2328 }
2329
2330 /*
2331 * Open files.
2332 */
2333
2334 /*
2335 * Verifies if the nfsstat4 is a valid error for marking this vnode dead.
2336 * Returns 1 if the error is valid; 0 otherwise.
2337 */
2338 static int
nfs4_valid_recov_err_for_vp(vnode_t * vp,nfsstat4 stat)2339 nfs4_valid_recov_err_for_vp(vnode_t *vp, nfsstat4 stat)
2340 {
2341 /*
2342 * We should not be marking non-regular files as dead,
2343 * except in very rare cases (eg: BADHANDLE or NFS4ERR_BADNAME).
2344 */
2345 if (vp->v_type != VREG && stat != NFS4ERR_BADHANDLE &&
2346 stat != NFS4ERR_BADNAME)
2347 return (0);
2348
2349 return (1);
2350 }
2351
2352 /*
2353 * Failed attempting to recover a filehandle. If 'stat' is valid for 'vp',
2354 * then mark the object dead. Since we've had to do a lookup for
2355 * filehandle recovery, we will mark the object dead if we got NOENT.
2356 */
2357 static void
nfs4_recov_fh_fail(vnode_t * vp,int error,nfsstat4 stat)2358 nfs4_recov_fh_fail(vnode_t *vp, int error, nfsstat4 stat)
2359 {
2360 ASSERT(vp != NULL);
2361
2362 if ((error == 0) && (stat != NFS4ERR_NOENT) &&
2363 (!nfs4_valid_recov_err_for_vp(vp, stat)))
2364 return;
2365
2366 nfs4_fail_recov(vp, "can't recover filehandle", error, stat);
2367 }
2368
2369 /*
2370 * Recovery from a "shouldn't happen" error. In the long term, we'd like
2371 * to mark only the data structure(s) that provided the bad value as being
2372 * bad. But for now we'll just mark the entire file.
2373 */
2374
2375 static void
recov_badstate(recov_info_t * recovp,vnode_t * vp,nfsstat4 stat)2376 recov_badstate(recov_info_t *recovp, vnode_t *vp, nfsstat4 stat)
2377 {
2378 ASSERT(vp != NULL);
2379 recov_throttle(recovp, vp);
2380
2381 if (!nfs4_valid_recov_err_for_vp(vp, stat))
2382 return;
2383
2384 nfs4_fail_recov(vp, "", 0, stat);
2385 }
2386
2387 /*
2388 * Free up the information saved for a lost state request.
2389 */
2390 static void
nfs4_free_lost_rqst(nfs4_lost_rqst_t * lrp,nfs4_server_t * sp)2391 nfs4_free_lost_rqst(nfs4_lost_rqst_t *lrp, nfs4_server_t *sp)
2392 {
2393 component4 *filep;
2394 nfs4_open_stream_t *osp;
2395 int have_sync_lock;
2396
2397 NFS4_DEBUG(nfs4_lost_rqst_debug,
2398 (CE_NOTE, "nfs4_free_lost_rqst:"));
2399
2400 switch (lrp->lr_op) {
2401 case OP_OPEN:
2402 filep = &lrp->lr_ofile;
2403 if (filep->utf8string_val) {
2404 kmem_free(filep->utf8string_val, filep->utf8string_len);
2405 filep->utf8string_val = NULL;
2406 }
2407 break;
2408 case OP_DELEGRETURN:
2409 nfs4delegreturn_cleanup(VTOR4(lrp->lr_vp), sp);
2410 break;
2411 case OP_CLOSE:
2412 osp = lrp->lr_osp;
2413 ASSERT(osp != NULL);
2414 mutex_enter(&osp->os_sync_lock);
2415 have_sync_lock = 1;
2416 if (osp->os_pending_close) {
2417 /* clean up the open file state. */
2418 osp->os_pending_close = 0;
2419 nfs4close_notw(lrp->lr_vp, osp, &have_sync_lock);
2420 }
2421 if (have_sync_lock)
2422 mutex_exit(&osp->os_sync_lock);
2423 break;
2424 }
2425
2426 lrp->lr_op = 0;
2427 if (lrp->lr_oop != NULL) {
2428 open_owner_rele(lrp->lr_oop);
2429 lrp->lr_oop = NULL;
2430 }
2431 if (lrp->lr_osp != NULL) {
2432 open_stream_rele(lrp->lr_osp, VTOR4(lrp->lr_vp));
2433 lrp->lr_osp = NULL;
2434 }
2435 if (lrp->lr_lop != NULL) {
2436 lock_owner_rele(lrp->lr_lop);
2437 lrp->lr_lop = NULL;
2438 }
2439 if (lrp->lr_flk != NULL) {
2440 kmem_free(lrp->lr_flk, sizeof (flock64_t));
2441 lrp->lr_flk = NULL;
2442 }
2443 if (lrp->lr_vp != NULL) {
2444 VN_RELE(lrp->lr_vp);
2445 lrp->lr_vp = NULL;
2446 }
2447 if (lrp->lr_dvp != NULL) {
2448 VN_RELE(lrp->lr_dvp);
2449 lrp->lr_dvp = NULL;
2450 }
2451 if (lrp->lr_cr != NULL) {
2452 crfree(lrp->lr_cr);
2453 lrp->lr_cr = NULL;
2454 }
2455
2456 kmem_free(lrp, sizeof (nfs4_lost_rqst_t));
2457 }
2458
2459 /*
2460 * Remove any lost state requests and free them.
2461 */
2462 static void
nfs4_remove_lost_rqsts(mntinfo4_t * mi,nfs4_server_t * sp)2463 nfs4_remove_lost_rqsts(mntinfo4_t *mi, nfs4_server_t *sp)
2464 {
2465 nfs4_lost_rqst_t *lrp;
2466
2467 mutex_enter(&mi->mi_lock);
2468 while ((lrp = list_head(&mi->mi_lost_state)) != NULL) {
2469 list_remove(&mi->mi_lost_state, lrp);
2470 mutex_exit(&mi->mi_lock);
2471 nfs4_free_lost_rqst(lrp, sp);
2472 mutex_enter(&mi->mi_lock);
2473 }
2474 mutex_exit(&mi->mi_lock);
2475 }
2476
2477 /*
2478 * Reopen all the files for the given filesystem and reclaim any locks.
2479 */
2480
2481 static void
recov_openfiles(recov_info_t * recovp,nfs4_server_t * sp)2482 recov_openfiles(recov_info_t *recovp, nfs4_server_t *sp)
2483 {
2484 mntinfo4_t *mi = recovp->rc_mi;
2485 nfs4_opinst_t *reopenlist = NULL, *rep;
2486 nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };
2487 open_claim_type4 claim;
2488 int remap;
2489 char *fail_msg = "No such file or directory on replica";
2490 rnode4_t *rp;
2491 fattr4_change pre_change;
2492
2493 ASSERT(sp != NULL);
2494
2495 /*
2496 * This check is to allow a 10ms pause before we reopen files
2497 * it should allow the server time to have received the CB_NULL
2498 * reply and update its internal structures such that (if
2499 * applicable) we are granted a delegation on reopened files.
2500 */
2501 mutex_enter(&sp->s_lock);
2502 if ((sp->s_flags & (N4S_CB_PINGED | N4S_CB_WAITER)) == 0) {
2503 sp->s_flags |= N4S_CB_WAITER;
2504 (void) cv_reltimedwait(&sp->wait_cb_null, &sp->s_lock,
2505 drv_usectohz(N4S_CB_PAUSE_TIME), TR_CLOCK_TICK);
2506 }
2507 mutex_exit(&sp->s_lock);
2508
2509 (void) nfs_rw_enter_sig(&sp->s_recovlock, RW_READER, 0);
2510 (void) nfs_rw_enter_sig(&mi->mi_recovlock, RW_WRITER, 0);
2511
2512 if (NFS4_VOLATILE_FH(mi)) {
2513 nfs4_remap_root(mi, &e, 0);
2514 if (nfs4_needs_recovery(&e, FALSE, mi->mi_vfsp)) {
2515 (void) nfs4_start_recovery(&e, mi, NULL,
2516 NULL, NULL, NULL, OP_LOOKUP, NULL, NULL, NULL);
2517 }
2518 }
2519
2520 mutex_enter(&mi->mi_lock);
2521 if (recovp->rc_srv_reboot || (mi->mi_recovflags & MI4R_SRV_REBOOT))
2522 claim = CLAIM_PREVIOUS;
2523 else
2524 claim = CLAIM_NULL;
2525 mutex_exit(&mi->mi_lock);
2526
2527 if (e.error == 0 && e.stat == NFS4_OK) {
2528 /*
2529 * Get a snapshot of open files in the filesystem. Note
2530 * that new opens will stall until the server's grace
2531 * period is done.
2532 */
2533 reopenlist = r4mkopenlist(mi);
2534
2535 mutex_enter(&mi->mi_lock);
2536 remap = mi->mi_recovflags & MI4R_REMAP_FILES;
2537 mutex_exit(&mi->mi_lock);
2538 /*
2539 * Since we are re-establishing state on the
2540 * server, its ok to blow away the saved lost
2541 * requests since we don't need to reissue it.
2542 */
2543 nfs4_remove_lost_rqsts(mi, sp);
2544
2545 for (rep = reopenlist; rep; rep = rep->re_next) {
2546
2547 if (remap) {
2548 nfs4_remap_file(mi, rep->re_vp,
2549 NFS4_REMAP_CKATTRS, &e);
2550 }
2551 DTRACE_PROBE2(recov__openfiles, nfs4_error_t, &e,
2552 vnode_t, rep->re_vp);
2553 if (e.error == ENOENT || e.stat == NFS4ERR_NOENT) {
2554 /*
2555 * The current server does not have the file
2556 * that is to be remapped. This is most
2557 * likely due to an improperly maintained
2558 * replica. The files that are missing from
2559 * the server will be marked dead and logged
2560 * in order to make sys admins aware of the
2561 * problem.
2562 */
2563 nfs4_fail_recov(rep->re_vp,
2564 fail_msg, e.error, e.stat);
2565 /*
2566 * We've already handled the error so clear it.
2567 */
2568 nfs4_error_zinit(&e);
2569 continue;
2570 } else if (e.error == 0 && e.stat == NFS4_OK) {
2571 int j;
2572
2573 rp = VTOR4(rep->re_vp);
2574 mutex_enter(&rp->r_statelock);
2575 pre_change = rp->r_change;
2576 mutex_exit(&rp->r_statelock);
2577
2578 for (j = 0; j < rep->re_numosp; j++) {
2579 nfs4_reopen(rep->re_vp, rep->re_osp[j],
2580 &e, claim, FALSE, TRUE);
2581 if (e.error != 0 || e.stat != NFS4_OK)
2582 break;
2583 }
2584 if (nfs4_needs_recovery(&e, TRUE,
2585 mi->mi_vfsp)) {
2586 (void) nfs4_start_recovery(&e, mi,
2587 rep->re_vp, NULL, NULL, NULL,
2588 OP_OPEN, NULL, NULL, NULL);
2589 break;
2590 }
2591 }
2592 #ifdef DEBUG
2593 if (nfs4_recovdelay > 0)
2594 delay(MSEC_TO_TICK(nfs4_recovdelay * 1000));
2595 #endif
2596 if (e.error == 0 && e.stat == NFS4_OK) {
2597 relock_file(rep->re_vp, mi, &e, pre_change);
2598
2599 if (nfs4_needs_recovery(&e, TRUE, mi->mi_vfsp))
2600 (void) nfs4_start_recovery(&e, mi,
2601 rep->re_vp, NULL, NULL, NULL,
2602 OP_LOCK, NULL, NULL, NULL);
2603 }
2604
2605 if (e.error != 0 || e.stat != NFS4_OK)
2606 break;
2607 }
2608
2609 /*
2610 * Check to see if we need to remap files passed in
2611 * via the recovery arguments; this will have been
2612 * done for open files. A failure here is not fatal.
2613 */
2614 if (remap) {
2615 nfs4_error_t ignore;
2616 nfs4_check_remap(mi, recovp->rc_vp1, NFS4_REMAP_CKATTRS,
2617 &ignore);
2618 nfs4_check_remap(mi, recovp->rc_vp2, NFS4_REMAP_CKATTRS,
2619 &ignore);
2620 }
2621 }
2622
2623 if (e.error == 0 && e.stat == NFS4_OK) {
2624 mutex_enter(&mi->mi_lock);
2625 mi->mi_recovflags &= ~(MI4R_REOPEN_FILES | MI4R_REMAP_FILES);
2626 mutex_exit(&mi->mi_lock);
2627 }
2628
2629 nfs_rw_exit(&mi->mi_recovlock);
2630 nfs_rw_exit(&sp->s_recovlock);
2631
2632 if (reopenlist != NULL)
2633 r4releopenlist(reopenlist);
2634 }
2635
2636 /*
2637 * Resend the queued state recovery requests in "rqsts".
2638 */
2639
2640 static void
nfs4_resend_lost_rqsts(recov_info_t * recovp,nfs4_server_t * sp)2641 nfs4_resend_lost_rqsts(recov_info_t *recovp, nfs4_server_t *sp)
2642 {
2643 nfs4_lost_rqst_t *lrp, *tlrp;
2644 mntinfo4_t *mi = recovp->rc_mi;
2645 nfs4_error_t n4e;
2646 #ifdef NOTYET
2647 uint32_t deny_bits = 0;
2648 #endif
2649
2650 NFS4_DEBUG(nfs4_lost_rqst_debug, (CE_NOTE, "nfs4_resend_lost_rqsts"));
2651
2652 ASSERT(mi != NULL);
2653 ASSERT(nfs_rw_lock_held(&mi->mi_recovlock, RW_WRITER));
2654
2655 mutex_enter(&mi->mi_lock);
2656 lrp = list_head(&mi->mi_lost_state);
2657 mutex_exit(&mi->mi_lock);
2658 while (lrp != NULL) {
2659 nfs4_error_zinit(&n4e);
2660 resend_one_op(lrp, &n4e, mi, sp);
2661 NFS4_DEBUG(nfs4_lost_rqst_debug, (CE_NOTE,
2662 "nfs4_resend_lost_rqsts: resend request: for vp %p got "
2663 "error %d stat %d", (void *)lrp->lr_vp, n4e.error,
2664 n4e.stat));
2665
2666 /*
2667 * If we get a recovery error that we can actually
2668 * recover from (such as ETIMEDOUT, FHEXPIRED), we
2669 * return and let the recovery thread redrive the call.
2670 * Don't requeue unless the zone is still healthy.
2671 */
2672 if (zone_status_get(curproc->p_zone) < ZONE_IS_SHUTTING_DOWN &&
2673 nfs4_needs_recovery(&n4e, TRUE, mi->mi_vfsp) &&
2674 (nfs4_try_failover(&n4e) ||
2675 NFS4_FRC_UNMT_ERR(n4e.error, mi->mi_vfsp) ||
2676 (n4e.error == 0 && n4e.stat != NFS4ERR_BADHANDLE &&
2677 !nfs4_recov_marks_dead(n4e.stat)))) {
2678 /*
2679 * For these three errors, we want to delay a bit
2680 * instead of pounding the server into submission.
2681 * We have to do this manually; the normal
2682 * processing for these errors only works for
2683 * non-recovery requests.
2684 */
2685 if ((n4e.error == 0 && n4e.stat == NFS4ERR_DELAY) ||
2686 (n4e.error == 0 && n4e.stat == NFS4ERR_GRACE) ||
2687 (n4e.error == 0 && n4e.stat == NFS4ERR_RESOURCE) ||
2688 NFS4_FRC_UNMT_ERR(n4e.error, mi->mi_vfsp)) {
2689 delay(SEC_TO_TICK(nfs4err_delay_time));
2690 } else {
2691 (void) nfs4_start_recovery(&n4e,
2692 mi, lrp->lr_dvp, lrp->lr_vp, NULL, NULL,
2693 lrp->lr_op, NULL, NULL, NULL);
2694 }
2695 return;
2696 }
2697
2698 mutex_enter(&mi->mi_lock);
2699 list_remove(&mi->mi_lost_state, lrp);
2700 tlrp = lrp;
2701 lrp = list_head(&mi->mi_lost_state);
2702 mutex_exit(&mi->mi_lock);
2703 nfs4_free_lost_rqst(tlrp, sp);
2704 }
2705 }
2706
2707 /*
2708 * Resend the given op, and issue any necessary undo call.
2709 * errors are returned via the nfs4_error_t parameter.
2710 */
2711
2712 static void
resend_one_op(nfs4_lost_rqst_t * lrp,nfs4_error_t * ep,mntinfo4_t * mi,nfs4_server_t * sp)2713 resend_one_op(nfs4_lost_rqst_t *lrp, nfs4_error_t *ep,
2714 mntinfo4_t *mi, nfs4_server_t *sp)
2715 {
2716 vnode_t *vp;
2717 nfs4_open_stream_t *osp;
2718 cred_t *cr;
2719 uint32_t acc_bits;
2720
2721 vp = lrp->lr_vp;
2722 NFS4_DEBUG(nfs4_lost_rqst_debug, (CE_NOTE, "resend_one_op: "
2723 "have a lost open/close request for vp %p", (void *)vp));
2724
2725 switch (lrp->lr_op) {
2726 case OP_OPEN:
2727 nfs4_resend_open_otw(&vp, lrp, ep);
2728 break;
2729 case OP_OPEN_DOWNGRADE:
2730 ASSERT(lrp->lr_oop != NULL);
2731 ep->error = nfs4_start_open_seqid_sync(lrp->lr_oop, mi);
2732 ASSERT(!ep->error); /* recov thread always succeeds */
2733 ASSERT(lrp->lr_osp != NULL);
2734 mutex_enter(&lrp->lr_osp->os_sync_lock);
2735 nfs4_open_downgrade(lrp->lr_dg_acc, lrp->lr_dg_deny,
2736 lrp->lr_oop, lrp->lr_osp, vp, lrp->lr_cr, lrp,
2737 ep, NULL, NULL);
2738 mutex_exit(&lrp->lr_osp->os_sync_lock);
2739 nfs4_end_open_seqid_sync(lrp->lr_oop);
2740 break;
2741 case OP_CLOSE:
2742 osp = lrp->lr_osp;
2743 cr = lrp->lr_cr;
2744 acc_bits = 0;
2745 mutex_enter(&osp->os_sync_lock);
2746 if (osp->os_share_acc_read)
2747 acc_bits |= OPEN4_SHARE_ACCESS_READ;
2748 if (osp->os_share_acc_write)
2749 acc_bits |= OPEN4_SHARE_ACCESS_WRITE;
2750 mutex_exit(&osp->os_sync_lock);
2751 nfs4close_one(vp, osp, cr, acc_bits, lrp, ep,
2752 CLOSE_RESEND, 0, 0, 0);
2753 break;
2754 case OP_LOCK:
2755 case OP_LOCKU:
2756 resend_lock(lrp, ep);
2757 goto done;
2758 case OP_DELEGRETURN:
2759 nfs4_resend_delegreturn(lrp, ep, sp);
2760 goto done;
2761 default:
2762 #ifdef DEBUG
2763 cmn_err(CE_PANIC, "resend_one_op: unexpected op: %d",
2764 lrp->lr_op);
2765 #endif
2766 nfs4_queue_event(RE_LOST_STATE_BAD_OP, mi, NULL,
2767 lrp->lr_op, lrp->lr_vp, lrp->lr_dvp, NFS4_OK, NULL, 0,
2768 TAG_NONE, TAG_NONE, 0, 0);
2769 nfs4_error_init(ep, EINVAL);
2770 return;
2771 }
2772
2773 /*
2774 * No need to retry nor send an "undo" CLOSE in the
2775 * event the server rebooted.
2776 */
2777 if (ep->error == 0 && (ep->stat == NFS4ERR_STALE_CLIENTID ||
2778 ep->stat == NFS4ERR_STALE_STATEID || ep->stat == NFS4ERR_EXPIRED))
2779 goto done;
2780
2781 /*
2782 * If we resent a CLOSE or OPEN_DOWNGRADE, there's nothing
2783 * to undo. Undoing locking operations was handled by
2784 * resend_lock().
2785 */
2786 if (lrp->lr_op == OP_OPEN_DOWNGRADE || lrp->lr_op == OP_CLOSE)
2787 goto done;
2788
2789 /*
2790 * If we get any other error for OPEN, then don't attempt
2791 * to undo the resend of the open (since it was never
2792 * successful!).
2793 */
2794 ASSERT(lrp->lr_op == OP_OPEN);
2795 if (ep->error || ep->stat != NFS4_OK)
2796 goto done;
2797
2798 /*
2799 * Now let's undo our OPEN.
2800 */
2801 nfs4_error_zinit(ep);
2802 close_after_open_resend(vp, lrp->lr_cr, lrp->lr_oacc, ep);
2803 NFS4_DEBUG(nfs4_lost_rqst_debug, (CE_NOTE, "resend_one_op: "
2804 "nfs4close_one: for vp %p got error %d stat %d",
2805 (void *)vp, ep->error, ep->stat));
2806
2807 done:
2808 if (vp != lrp->lr_vp)
2809 VN_RELE(vp);
2810 }
2811
2812 /*
2813 * Close a file that was opened via a resent OPEN.
2814 * Most errors are passed back to the caller (via the return value and
2815 * *statp), except for FHEXPIRED, which is retried.
2816 *
2817 * It might be conceptually cleaner to push the CLOSE request onto the
2818 * front of the resend queue, rather than sending it here. That would
2819 * match the way we undo lost lock requests. On the other
2820 * hand, we've already got something that works, and there's no reason to
2821 * change it at this time.
2822 */
2823
2824 static void
close_after_open_resend(vnode_t * vp,cred_t * cr,uint32_t acc_bits,nfs4_error_t * ep)2825 close_after_open_resend(vnode_t *vp, cred_t *cr, uint32_t acc_bits,
2826 nfs4_error_t *ep)
2827 {
2828
2829 for (;;) {
2830 nfs4close_one(vp, NULL, cr, acc_bits, NULL, ep,
2831 CLOSE_AFTER_RESEND, 0, 0, 0);
2832 if (ep->error == 0 && ep->stat == NFS4_OK)
2833 break; /* success; done */
2834 if (ep->error != 0 || ep->stat != NFS4ERR_FHEXPIRED)
2835 break;
2836 /* else retry FHEXPIRED */
2837 }
2838
2839 }
2840
2841 /*
2842 * Resend the given lost lock request. Return an errno value. If zero,
2843 * *statp is set to the NFS status code for the call.
2844 *
2845 * Issue a SIGLOST and mark the rnode dead if we get a non-recovery error or
2846 * a recovery error that we don't actually recover from yet (eg: BAD_SEQID).
2847 * Let the recovery thread redrive the call if we get a recovery error that
2848 * we can actually recover from.
2849 */
2850 static void
resend_lock(nfs4_lost_rqst_t * lrp,nfs4_error_t * ep)2851 resend_lock(nfs4_lost_rqst_t *lrp, nfs4_error_t *ep)
2852 {
2853 bool_t send_siglost = FALSE;
2854 vnode_t *vp = lrp->lr_vp;
2855
2856 NFS4_DEBUG(nfs4_lost_rqst_debug, (CE_NOTE, "resend_lock:"));
2857 ASSERT(lrp->lr_ctype == NFS4_LCK_CTYPE_REINSTATE ||
2858 lrp->lr_ctype == NFS4_LCK_CTYPE_RESEND);
2859
2860 nfs4frlock(lrp->lr_ctype, vp, F_SETLK,
2861 lrp->lr_flk, FREAD|FWRITE, 0, lrp->lr_cr, ep, lrp, NULL);
2862
2863 NFS4_DEBUG(nfs4_lost_rqst_debug, (CE_NOTE, "resend_lock: "
2864 "nfs4frlock for vp %p returned error %d, stat %d",
2865 (void *)vp, ep->error, ep->stat));
2866
2867 if (ep->error == 0 && ep->stat == 0)
2868 goto done;
2869 if (ep->error == 0 && ep->stat == NFS4ERR_DENIED &&
2870 lrp->lr_ctype == NFS4_LCK_CTYPE_RESEND)
2871 goto done;
2872
2873 /*
2874 * If we failed with a non-recovery error, send SIGLOST and
2875 * mark the file dead.
2876 */
2877 if (!nfs4_needs_recovery(ep, TRUE, vp->v_vfsp))
2878 send_siglost = TRUE;
2879 else {
2880 /*
2881 * Done with recovering LOST LOCK in the event the
2882 * server rebooted or we've lost the lease.
2883 */
2884 if (ep->error == 0 && (ep->stat == NFS4ERR_STALE_CLIENTID ||
2885 ep->stat == NFS4ERR_STALE_STATEID ||
2886 ep->stat == NFS4ERR_EXPIRED)) {
2887 goto done;
2888 }
2889
2890 /*
2891 * BAD_STATEID on an unlock indicates that the server has
2892 * forgotten about the lock anyway, so act like the call
2893 * was successful.
2894 */
2895 if (ep->error == 0 && ep->stat == NFS4ERR_BAD_STATEID &&
2896 lrp->lr_op == OP_LOCKU)
2897 goto done;
2898
2899 /*
2900 * If we got a recovery error that we don't actually
2901 * recover from, send SIGLOST. If the filesystem was
2902 * forcibly unmounted, we skip the SIGLOST because (a) it's
2903 * unnecessary noise, and (b) there could be a new process
2904 * with the same pid as the one that had generated the lost
2905 * state request.
2906 */
2907 if (ep->error == 0 && (ep->stat == NFS4ERR_BADHANDLE ||
2908 nfs4_recov_marks_dead(ep->stat))) {
2909 if (!(vp->v_vfsp->vfs_flag & VFS_UNMOUNTED))
2910 send_siglost = TRUE;
2911 goto done;
2912 }
2913
2914 /*
2915 * If the filesystem was forcibly unmounted, we
2916 * still need to synchronize with the server and
2917 * release state. Try again later.
2918 */
2919 if (NFS4_FRC_UNMT_ERR(ep->error, vp->v_vfsp))
2920 goto done;
2921
2922 /*
2923 * If we get a recovery error that we can actually
2924 * recover from (such as ETIMEDOUT, FHEXPIRED),
2925 * return and let the recovery thread redrive the call.
2926 *
2927 * For the three errors below, we want to delay a bit
2928 * instead of pounding the server into submission.
2929 */
2930 if ((ep->error == 0 && ep->stat == NFS4ERR_DELAY) ||
2931 (ep->error == 0 && ep->stat == NFS4ERR_GRACE) ||
2932 (ep->error == 0 && ep->stat == NFS4ERR_RESOURCE))
2933 delay(SEC_TO_TICK(recov_err_delay));
2934 goto done;
2935 }
2936
2937 done:
2938 if (send_siglost) {
2939 cred_t *sv_cred;
2940
2941 /*
2942 * Must be root or the actual thread being issued the
2943 * SIGLOST for this to work, so just become root.
2944 */
2945 sv_cred = curthread->t_cred;
2946 curthread->t_cred = kcred;
2947 nfs4_send_siglost(lrp->lr_flk->l_pid, VTOMI4(vp), vp, FALSE,
2948 ep->error, ep->stat);
2949 curthread->t_cred = sv_cred;
2950
2951 /*
2952 * Flush any additional reinstantiation requests for
2953 * this operation. Sending multiple SIGLOSTs to the user
2954 * process is unlikely to help and may cause trouble.
2955 */
2956 if (lrp->lr_ctype == NFS4_LCK_CTYPE_REINSTATE)
2957 flush_reinstate(lrp);
2958 }
2959 }
2960
2961 /*
2962 * Remove any lock reinstantiation requests that correspond to the given
2963 * lost request. We only remove items that follow lrp in the queue,
2964 * assuming that lrp will be removed by the generic lost state code.
2965 */
2966
2967 static void
flush_reinstate(nfs4_lost_rqst_t * lrp)2968 flush_reinstate(nfs4_lost_rqst_t *lrp)
2969 {
2970 vnode_t *vp;
2971 pid_t pid;
2972 mntinfo4_t *mi;
2973 nfs4_lost_rqst_t *nlrp;
2974
2975 vp = lrp->lr_vp;
2976 mi = VTOMI4(vp);
2977 pid = lrp->lr_flk->l_pid;
2978
2979 /*
2980 * If there are any more reinstantation requests to get rid of,
2981 * they should all be clustered at the front of the lost state
2982 * queue.
2983 */
2984 mutex_enter(&mi->mi_lock);
2985 for (lrp = list_next(&mi->mi_lost_state, lrp); lrp != NULL;
2986 lrp = nlrp) {
2987 nlrp = list_next(&mi->mi_lost_state, lrp);
2988 if (lrp->lr_op != OP_LOCK && lrp->lr_op != OP_LOCKU)
2989 break;
2990 if (lrp->lr_ctype != NFS4_LCK_CTYPE_REINSTATE)
2991 break;
2992 ASSERT(lrp->lr_vp == vp);
2993 ASSERT(lrp->lr_flk->l_pid == pid);
2994 NFS4_DEBUG(nfs4_lost_rqst_debug, (CE_NOTE,
2995 "remove reinstantiation %p", (void *)lrp));
2996 list_remove(&mi->mi_lost_state, lrp);
2997 nfs4_free_lost_rqst(lrp, NULL);
2998 }
2999 mutex_exit(&mi->mi_lock);
3000 }
3001
3002 /*
3003 * End of state-specific recovery routines.
3004 */
3005
3006 /*
3007 * Allocate a lost request struct, initialize it from lost_rqstp (including
3008 * bumping the reference counts for the referenced vnode, etc.), and hang
3009 * it off of recovp.
3010 */
3011
3012 static void
nfs4_save_lost_rqst(nfs4_lost_rqst_t * lost_rqstp,recov_info_t * recovp,nfs4_recov_t * action,mntinfo4_t * mi)3013 nfs4_save_lost_rqst(nfs4_lost_rqst_t *lost_rqstp, recov_info_t *recovp,
3014 nfs4_recov_t *action, mntinfo4_t *mi)
3015 {
3016 nfs4_lost_rqst_t *destp;
3017
3018 ASSERT(recovp->rc_lost_rqst == NULL);
3019
3020 destp = kmem_alloc(sizeof (nfs4_lost_rqst_t), KM_SLEEP);
3021 recovp->rc_lost_rqst = destp;
3022
3023 if (lost_rqstp->lr_op == OP_LOCK ||
3024 lost_rqstp->lr_op == OP_LOCKU) {
3025 ASSERT(lost_rqstp->lr_lop);
3026 *action = NR_LOST_LOCK;
3027 destp->lr_ctype = lost_rqstp->lr_ctype;
3028 destp->lr_locktype = lost_rqstp->lr_locktype;
3029 } else if (lost_rqstp->lr_op == OP_OPEN) {
3030 component4 *srcfp, *destfp;
3031
3032 destp->lr_oacc = lost_rqstp->lr_oacc;
3033 destp->lr_odeny = lost_rqstp->lr_odeny;
3034 destp->lr_oclaim = lost_rqstp->lr_oclaim;
3035 if (lost_rqstp->lr_oclaim == CLAIM_DELEGATE_CUR)
3036 destp->lr_ostateid = lost_rqstp->lr_ostateid;
3037
3038 srcfp = &lost_rqstp->lr_ofile;
3039 destfp = &destp->lr_ofile;
3040 /*
3041 * Consume caller's utf8string
3042 */
3043 destfp->utf8string_len = srcfp->utf8string_len;
3044 destfp->utf8string_val = srcfp->utf8string_val;
3045 srcfp->utf8string_len = 0;
3046 srcfp->utf8string_val = NULL; /* make sure not reused */
3047
3048 *action = NR_LOST_STATE_RQST;
3049 } else if (lost_rqstp->lr_op == OP_OPEN_DOWNGRADE) {
3050 destp->lr_dg_acc = lost_rqstp->lr_dg_acc;
3051 destp->lr_dg_deny = lost_rqstp->lr_dg_deny;
3052
3053 *action = NR_LOST_STATE_RQST;
3054 } else if (lost_rqstp->lr_op == OP_CLOSE) {
3055 ASSERT(lost_rqstp->lr_oop);
3056 *action = NR_LOST_STATE_RQST;
3057 } else if (lost_rqstp->lr_op == OP_DELEGRETURN) {
3058 *action = NR_LOST_STATE_RQST;
3059 } else {
3060 #ifdef DEBUG
3061 cmn_err(CE_PANIC, "nfs4_save_lost_rqst: bad op %d",
3062 lost_rqstp->lr_op);
3063 #endif
3064 nfs4_queue_event(RE_LOST_STATE_BAD_OP, mi, NULL,
3065 lost_rqstp->lr_op, lost_rqstp->lr_vp, lost_rqstp->lr_dvp,
3066 NFS4_OK, NULL, curproc->p_pid, TAG_NONE, TAG_NONE, 0, 0);
3067 *action = NR_UNUSED;
3068 recovp->rc_lost_rqst = NULL;
3069 kmem_free(destp, sizeof (nfs4_lost_rqst_t));
3070 return;
3071 }
3072
3073 destp->lr_op = lost_rqstp->lr_op;
3074 destp->lr_vp = lost_rqstp->lr_vp;
3075 if (destp->lr_vp)
3076 VN_HOLD(destp->lr_vp);
3077 destp->lr_dvp = lost_rqstp->lr_dvp;
3078 if (destp->lr_dvp)
3079 VN_HOLD(destp->lr_dvp);
3080 destp->lr_oop = lost_rqstp->lr_oop;
3081 if (destp->lr_oop)
3082 open_owner_hold(destp->lr_oop);
3083 destp->lr_osp = lost_rqstp->lr_osp;
3084 if (destp->lr_osp)
3085 open_stream_hold(destp->lr_osp);
3086 destp->lr_lop = lost_rqstp->lr_lop;
3087 if (destp->lr_lop)
3088 lock_owner_hold(destp->lr_lop);
3089 destp->lr_cr = lost_rqstp->lr_cr;
3090 if (destp->lr_cr)
3091 crhold(destp->lr_cr);
3092 if (lost_rqstp->lr_flk == NULL)
3093 destp->lr_flk = NULL;
3094 else {
3095 destp->lr_flk = kmem_alloc(sizeof (flock64_t), KM_SLEEP);
3096 *destp->lr_flk = *lost_rqstp->lr_flk;
3097 }
3098 destp->lr_putfirst = lost_rqstp->lr_putfirst;
3099 }
3100
3101 /*
3102 * Map the given return values (errno and nfs4 status code) to a recovery
3103 * action and fill in the following fields of recovp: rc_action,
3104 * rc_srv_reboot, rc_stateid, rc_lost_rqst.
3105 */
3106
3107 void
errs_to_action(recov_info_t * recovp,nfs4_server_t * sp,mntinfo4_t * mi,stateid4 * sidp,nfs4_lost_rqst_t * lost_rqstp,int unmounted,nfs_opnum4 op,nfs4_bseqid_entry_t * bsep)3108 errs_to_action(recov_info_t *recovp,
3109 nfs4_server_t *sp, mntinfo4_t *mi, stateid4 *sidp,
3110 nfs4_lost_rqst_t *lost_rqstp, int unmounted, nfs_opnum4 op,
3111 nfs4_bseqid_entry_t *bsep)
3112 {
3113 nfs4_recov_t action = NR_UNUSED;
3114 bool_t reboot = FALSE;
3115 int try_f;
3116 int error = recovp->rc_orig_errors.error;
3117 nfsstat4 stat = recovp->rc_orig_errors.stat;
3118
3119 bzero(&recovp->rc_stateid, sizeof (stateid4));
3120 recovp->rc_lost_rqst = NULL;
3121 recovp->rc_bseqid_rqst = NULL;
3122
3123 try_f = nfs4_try_failover(&recovp->rc_orig_errors) &&
3124 FAILOVER_MOUNT4(mi);
3125
3126 /*
3127 * We start recovery for EINTR only in the lost lock
3128 * or lost open/close case.
3129 */
3130
3131 if (try_f || error == EINTR || (error == EIO && unmounted)) {
3132 recovp->rc_error = (error != 0 ? error : geterrno4(stat));
3133 if (lost_rqstp) {
3134 ASSERT(lost_rqstp->lr_op != 0);
3135 nfs4_save_lost_rqst(lost_rqstp, recovp, &action, mi);
3136 }
3137 if (try_f)
3138 action = NR_FAILOVER;
3139 } else if (error != 0) {
3140 recovp->rc_error = error;
3141 nfs4_queue_event(RE_UNEXPECTED_ERRNO, mi, NULL, error, NULL,
3142 NULL, 0, NULL, 0, TAG_NONE, TAG_NONE, 0, 0);
3143 action = NR_CLIENTID;
3144 } else {
3145 recovp->rc_error = geterrno4(stat);
3146 switch (stat) {
3147 #ifdef notyet
3148 case NFS4ERR_LEASE_MOVED:
3149 action = xxx;
3150 break;
3151 #endif
3152 case NFS4ERR_MOVED:
3153 action = NR_MOVED;
3154 break;
3155 case NFS4ERR_BADHANDLE:
3156 action = NR_BADHANDLE;
3157 break;
3158 case NFS4ERR_BAD_SEQID:
3159 if (bsep)
3160 save_bseqid_rqst(bsep, recovp);
3161 action = NR_BAD_SEQID;
3162 break;
3163 case NFS4ERR_OLD_STATEID:
3164 action = NR_OLDSTATEID;
3165 break;
3166 case NFS4ERR_WRONGSEC:
3167 action = NR_WRONGSEC;
3168 break;
3169 case NFS4ERR_FHEXPIRED:
3170 action = NR_FHEXPIRED;
3171 break;
3172 case NFS4ERR_BAD_STATEID:
3173 if (sp == NULL || (sp != NULL && inlease(sp))) {
3174
3175 action = NR_BAD_STATEID;
3176 if (sidp)
3177 recovp->rc_stateid = *sidp;
3178 } else
3179 action = NR_CLIENTID;
3180 break;
3181 case NFS4ERR_EXPIRED:
3182 /*
3183 * The client's lease has expired, either due
3184 * to a network partition or perhaps a client
3185 * error. In either case, try an NR_CLIENTID
3186 * style recovery. reboot remains false, since
3187 * there is no evidence the server has rebooted.
3188 * This will cause CLAIM_NULL opens and lock
3189 * requests without the reclaim bit.
3190 */
3191 action = NR_CLIENTID;
3192
3193 DTRACE_PROBE4(nfs4__expired,
3194 nfs4_server_t *, sp,
3195 mntinfo4_t *, mi,
3196 stateid4 *, sidp, int, op);
3197
3198 break;
3199 case NFS4ERR_STALE_CLIENTID:
3200 case NFS4ERR_STALE_STATEID:
3201 action = NR_CLIENTID;
3202 reboot = TRUE;
3203 break;
3204 case NFS4ERR_RESOURCE:
3205 /*
3206 * If this had been a FAILOVER mount, then
3207 * we'd have tried failover. Since it's not,
3208 * just delay a while and retry.
3209 */
3210 action = NR_DELAY;
3211 break;
3212 case NFS4ERR_GRACE:
3213 action = NR_GRACE;
3214 break;
3215 case NFS4ERR_DELAY:
3216 action = NR_DELAY;
3217 break;
3218 case NFS4ERR_STALE:
3219 action = NR_STALE;
3220 break;
3221 default:
3222 nfs4_queue_event(RE_UNEXPECTED_STATUS, mi, NULL, 0,
3223 NULL, NULL, stat, NULL, 0, TAG_NONE, TAG_NONE,
3224 0, 0);
3225 action = NR_CLIENTID;
3226 break;
3227 }
3228 }
3229
3230 /* make sure action got set */
3231 ASSERT(action != NR_UNUSED);
3232 recovp->rc_srv_reboot = reboot;
3233 recovp->rc_action = action;
3234 nfs4_queue_fact(RF_ERR, mi, stat, action, op, reboot, NULL, error,
3235 NULL);
3236 }
3237
3238 /*
3239 * Return the (held) credential for the process with the given pid.
3240 * May return NULL (e.g., process not found).
3241 */
3242
3243 static cred_t *
pid_to_cr(pid_t pid)3244 pid_to_cr(pid_t pid)
3245 {
3246 proc_t *p;
3247 cred_t *cr;
3248
3249 mutex_enter(&pidlock);
3250 if ((p = prfind(pid)) == NULL) {
3251 mutex_exit(&pidlock);
3252 return (NULL);
3253 }
3254
3255 mutex_enter(&p->p_crlock);
3256 crhold(cr = p->p_cred);
3257 mutex_exit(&p->p_crlock);
3258 mutex_exit(&pidlock);
3259
3260 return (cr);
3261 }
3262
3263 /*
3264 * Send SIGLOST to the given process and queue the event.
3265 *
3266 * The 'dump' boolean tells us whether this action should dump the
3267 * in-kernel queue of recovery messages or not.
3268 */
3269
3270 void
nfs4_send_siglost(pid_t pid,mntinfo4_t * mi,vnode_t * vp,bool_t dump,int error,nfsstat4 stat)3271 nfs4_send_siglost(pid_t pid, mntinfo4_t *mi, vnode_t *vp, bool_t dump,
3272 int error, nfsstat4 stat)
3273 {
3274 proc_t *p;
3275
3276 mutex_enter(&pidlock);
3277 p = prfind(pid);
3278 if (p)
3279 psignal(p, SIGLOST);
3280 mutex_exit(&pidlock);
3281 nfs4_queue_event(dump ? RE_SIGLOST : RE_SIGLOST_NO_DUMP, mi,
3282 NULL, error, vp, NULL, stat, NULL, pid, TAG_NONE, TAG_NONE, 0, 0);
3283 }
3284
3285 /*
3286 * Scan the lock list for entries that match the given pid. Unregister those
3287 * locks that do and change their pid to NOPID.
3288 */
3289
3290 static void
relock_skip_pid(vnode_t * vp,locklist_t * llp,pid_t pid)3291 relock_skip_pid(vnode_t *vp, locklist_t *llp, pid_t pid)
3292 {
3293 for (; llp != NULL; llp = llp->ll_next) {
3294 if (llp->ll_flock.l_pid == pid) {
3295 int r;
3296
3297 /*
3298 * Unregister the lost lock.
3299 */
3300 llp->ll_flock.l_type = F_UNLCK;
3301 r = reclock(vp, &llp->ll_flock, SETFLCK, FREAD | FWRITE,
3302 0, NULL);
3303 /* The unlock cannot fail */
3304 ASSERT(r == 0);
3305
3306 llp->ll_flock.l_pid = NOPID;
3307 }
3308 }
3309 }
3310
3311 /*
3312 * Mark a file as having failed recovery, after making a last-ditch effort
3313 * to return any delegation.
3314 *
3315 * Sets r_error to EIO or ESTALE for the given vnode.
3316 */
3317 void
nfs4_fail_recov(vnode_t * vp,char * why,int error,nfsstat4 stat)3318 nfs4_fail_recov(vnode_t *vp, char *why, int error, nfsstat4 stat)
3319 {
3320 rnode4_t *rp = VTOR4(vp);
3321
3322 #ifdef DEBUG
3323 if (nfs4_fail_recov_stop)
3324 debug_enter("nfs4_fail_recov");
3325 #endif
3326
3327 mutex_enter(&rp->r_statelock);
3328 if (rp->r_flags & (R4RECOVERR|R4RECOVERRP)) {
3329 mutex_exit(&rp->r_statelock);
3330 return;
3331 }
3332
3333 /*
3334 * Set R4RECOVERRP to indicate that a recovery error is in
3335 * progress. This will shut down reads and writes at the top
3336 * half. Don't set R4RECOVERR until after we've returned the
3337 * delegation, otherwise it will fail.
3338 */
3339
3340 rp->r_flags |= R4RECOVERRP;
3341 mutex_exit(&rp->r_statelock);
3342
3343 nfs4delegabandon(rp);
3344
3345 mutex_enter(&rp->r_statelock);
3346 rp->r_flags |= (R4RECOVERR | R4STALE);
3347 rp->r_error = (error == 0 && stat == NFS4ERR_STALE) ? ESTALE : EIO;
3348 PURGE_ATTRCACHE4_LOCKED(rp);
3349 if (!(vp->v_vfsp->vfs_flag & VFS_UNMOUNTED))
3350 nfs4_queue_event(RE_DEAD_FILE, VTOMI4(vp), NULL, error,
3351 vp, NULL, stat, why, 0, TAG_NONE, TAG_NONE, 0, 0);
3352 mutex_exit(&rp->r_statelock);
3353
3354 dnlc_purge_vp(vp);
3355 }
3356
3357 /*
3358 * recov_throttle: if the file had the same recovery action within the
3359 * throttle interval, wait for the throttle interval to finish before
3360 * proceeding.
3361 *
3362 * Side effects: updates the rnode with the current recovery information.
3363 */
3364
3365 static void
recov_throttle(recov_info_t * recovp,vnode_t * vp)3366 recov_throttle(recov_info_t *recovp, vnode_t *vp)
3367 {
3368 time_t curtime, time_to_wait;
3369 rnode4_t *rp = VTOR4(vp);
3370
3371 curtime = gethrestime_sec();
3372
3373 mutex_enter(&rp->r_statelock);
3374 NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
3375 "recov_throttle: now: (%d, %ld), last: (%d, %ld)",
3376 recovp->rc_action, curtime,
3377 rp->r_recov_act, rp->r_last_recov));
3378 if (recovp->rc_action == rp->r_recov_act &&
3379 rp->r_last_recov + recov_err_delay > curtime) {
3380 time_to_wait = rp->r_last_recov + recov_err_delay - curtime;
3381 mutex_exit(&rp->r_statelock);
3382 delay(SEC_TO_TICK(time_to_wait));
3383 curtime = gethrestime_sec();
3384 mutex_enter(&rp->r_statelock);
3385 }
3386
3387 rp->r_last_recov = curtime;
3388 rp->r_recov_act = recovp->rc_action;
3389 mutex_exit(&rp->r_statelock);
3390 }
3391
3392 /*
3393 * React to NFS4ERR_GRACE by setting the time we'll permit
3394 * the next call to this filesystem.
3395 */
3396 void
nfs4_set_grace_wait(mntinfo4_t * mi)3397 nfs4_set_grace_wait(mntinfo4_t *mi)
3398 {
3399 mutex_enter(&mi->mi_lock);
3400 /* Mark the time for the future */
3401 mi->mi_grace_wait = gethrestime_sec() + nfs4err_delay_time;
3402 mutex_exit(&mi->mi_lock);
3403 }
3404
3405 /*
3406 * React to MFS4ERR_DELAY by setting the time we'll permit
3407 * the next call to this vnode.
3408 */
3409 void
nfs4_set_delay_wait(vnode_t * vp)3410 nfs4_set_delay_wait(vnode_t *vp)
3411 {
3412 rnode4_t *rp = VTOR4(vp);
3413
3414 mutex_enter(&rp->r_statelock);
3415 /*
3416 * Calculate amount we should delay, initial
3417 * delay will be short and then we will back off.
3418 */
3419 if (rp->r_delay_interval == 0)
3420 rp->r_delay_interval = NFS4_INITIAL_DELAY_INTERVAL;
3421 else
3422 /* calculate next interval value */
3423 rp->r_delay_interval =
3424 MIN(NFS4_MAX_DELAY_INTERVAL, (rp->r_delay_interval << 1));
3425 rp->r_delay_wait = gethrestime_sec() + rp->r_delay_interval;
3426 mutex_exit(&rp->r_statelock);
3427 }
3428
3429 /*
3430 * The caller is responsible for freeing the returned string.
3431 */
3432 static char *
nfs4_getsrvnames(mntinfo4_t * mi,size_t * len)3433 nfs4_getsrvnames(mntinfo4_t *mi, size_t *len)
3434 {
3435 servinfo4_t *svp;
3436 char *srvnames;
3437 char *namep;
3438 size_t length;
3439
3440 /*
3441 * Calculate the length of the string required to hold all
3442 * of the server names plus either a comma or a null
3443 * character following each individual one.
3444 */
3445 length = 0;
3446 for (svp = mi->mi_servers; svp != NULL; svp = svp->sv_next) {
3447 (void) nfs_rw_enter_sig(&svp->sv_lock, RW_READER, 0);
3448 if (svp->sv_flags & SV4_NOTINUSE) {
3449 nfs_rw_exit(&svp->sv_lock);
3450 continue;
3451 }
3452 nfs_rw_exit(&svp->sv_lock);
3453 length += svp->sv_hostnamelen;
3454 }
3455
3456 srvnames = kmem_alloc(length, KM_SLEEP);
3457
3458 namep = srvnames;
3459 for (svp = mi->mi_servers; svp != NULL; svp = svp->sv_next) {
3460 (void) nfs_rw_enter_sig(&svp->sv_lock, RW_READER, 0);
3461 if (svp->sv_flags & SV4_NOTINUSE) {
3462 nfs_rw_exit(&svp->sv_lock);
3463 continue;
3464 }
3465 nfs_rw_exit(&svp->sv_lock);
3466 (void) strcpy(namep, svp->sv_hostname);
3467 namep += svp->sv_hostnamelen - 1;
3468 *namep++ = ',';
3469 }
3470 *--namep = '\0';
3471
3472 *len = length;
3473
3474 return (srvnames);
3475 }
3476
3477 static void
save_bseqid_rqst(nfs4_bseqid_entry_t * bsep,recov_info_t * recovp)3478 save_bseqid_rqst(nfs4_bseqid_entry_t *bsep, recov_info_t *recovp)
3479 {
3480 nfs4_bseqid_entry_t *destp;
3481
3482 destp = kmem_alloc(sizeof (nfs4_bseqid_entry_t), KM_SLEEP);
3483 recovp->rc_bseqid_rqst = destp;
3484
3485 if (bsep->bs_oop)
3486 open_owner_hold(bsep->bs_oop);
3487 destp->bs_oop = bsep->bs_oop;
3488 if (bsep->bs_lop)
3489 lock_owner_hold(bsep->bs_lop);
3490 destp->bs_lop = bsep->bs_lop;
3491 if (bsep->bs_vp)
3492 VN_HOLD(bsep->bs_vp);
3493 destp->bs_vp = bsep->bs_vp;
3494 destp->bs_pid = bsep->bs_pid;
3495 destp->bs_tag = bsep->bs_tag;
3496 destp->bs_seqid = bsep->bs_seqid;
3497 }
3498
3499 static void
free_bseqid_rqst(nfs4_bseqid_entry_t * bsep)3500 free_bseqid_rqst(nfs4_bseqid_entry_t *bsep)
3501 {
3502 if (bsep->bs_oop)
3503 open_owner_rele(bsep->bs_oop);
3504 if (bsep->bs_lop)
3505 lock_owner_rele(bsep->bs_lop);
3506 if (bsep->bs_vp)
3507 VN_RELE(bsep->bs_vp);
3508 kmem_free(bsep, sizeof (nfs4_bseqid_entry_t));
3509 }
3510
3511 /*
3512 * We don't actually fully recover from NFS4ERR_BAD_SEQID. We
3513 * simply mark the open owner and open stream (if provided) as "bad".
3514 * Then future uses of these data structures will be limited to basically
3515 * just cleaning up the internal client state (no going OTW).
3516 *
3517 * The result of this is to return errors back to the app/usr when
3518 * we receive NFS4ERR_BAD_SEQID, but also allow future/new calls to
3519 * succeed so progress can be made.
3520 */
3521 void
recov_bad_seqid(recov_info_t * recovp)3522 recov_bad_seqid(recov_info_t *recovp)
3523 {
3524 mntinfo4_t *mi = recovp->rc_mi;
3525 nfs4_open_owner_t *bad_oop;
3526 nfs4_lock_owner_t *bad_lop;
3527 vnode_t *vp;
3528 rnode4_t *rp = NULL;
3529 pid_t pid;
3530 nfs4_bseqid_entry_t *bsep, *tbsep;
3531 int error;
3532
3533 ASSERT(mi != NULL);
3534 ASSERT(nfs_rw_lock_held(&mi->mi_recovlock, RW_WRITER));
3535
3536 mutex_enter(&mi->mi_lock);
3537 bsep = list_head(&mi->mi_bseqid_list);
3538 mutex_exit(&mi->mi_lock);
3539
3540 /*
3541 * Handle all the bad seqid entries on mi's list.
3542 */
3543 while (bsep != NULL) {
3544 bad_oop = bsep->bs_oop;
3545 bad_lop = bsep->bs_lop;
3546 vp = bsep->bs_vp;
3547 pid = bsep->bs_pid;
3548
3549 NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
3550 "recov_bad_seqid: mark oop %p lop %p as bad for "
3551 "vp %p tag %s pid %d: last good seqid %d for tag %s",
3552 (void *)bad_oop, (void *)bad_lop, (void *)vp,
3553 nfs4_ctags[bsep->bs_tag].ct_str, pid,
3554 bad_oop ? bad_oop->oo_last_good_seqid : 0,
3555 bad_oop ? nfs4_ctags[bad_oop->oo_last_good_op].ct_str :
3556 nfs4_ctags[TAG_NONE].ct_str));
3557
3558 nfs4_queue_event(RE_BAD_SEQID, mi, NULL,
3559 0, vp, NULL, NFS4ERR_BAD_SEQID, NULL, pid, bsep->bs_tag,
3560 bad_oop ? bad_oop->oo_last_good_op : TAG_NONE,
3561 bsep->bs_seqid, bad_oop ? bad_oop->oo_last_good_seqid : 0);
3562
3563 if (bad_oop) {
3564 /* essentially reset the open owner */
3565 error = nfs4_start_open_seqid_sync(bad_oop, mi);
3566 ASSERT(!error); /* recov thread always succeeds */
3567 bad_oop->oo_name = nfs4_get_new_oo_name();
3568 bad_oop->oo_seqid = 0;
3569 nfs4_end_open_seqid_sync(bad_oop);
3570 }
3571
3572 if (bad_lop) {
3573 mutex_enter(&bad_lop->lo_lock);
3574 bad_lop->lo_flags |= NFS4_BAD_SEQID_LOCK;
3575 mutex_exit(&bad_lop->lo_lock);
3576
3577 ASSERT(vp != NULL);
3578 rp = VTOR4(vp);
3579 mutex_enter(&rp->r_statelock);
3580 rp->r_flags |= R4LODANGLERS;
3581 mutex_exit(&rp->r_statelock);
3582
3583 nfs4_send_siglost(pid, mi, vp, TRUE,
3584 0, NFS4ERR_BAD_SEQID);
3585 }
3586
3587 mutex_enter(&mi->mi_lock);
3588 list_remove(&mi->mi_bseqid_list, bsep);
3589 tbsep = bsep;
3590 bsep = list_head(&mi->mi_bseqid_list);
3591 mutex_exit(&mi->mi_lock);
3592 free_bseqid_rqst(tbsep);
3593 }
3594
3595 mutex_enter(&mi->mi_lock);
3596 mi->mi_recovflags &= ~MI4R_BAD_SEQID;
3597 mutex_exit(&mi->mi_lock);
3598 }
3599