/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#pragma ident	"%Z%%M%	%I%	%E% SMI"

/*
 * NFS Version 4 state recovery code.
 */

#include <nfs/nfs4_clnt.h>
#include <nfs/nfs4.h>
#include <nfs/rnode4.h>
#include <sys/cmn_err.h>
#include <sys/cred.h>
#include <sys/systm.h>
#include <sys/flock.h>
#include <sys/dnlc.h>
#include <sys/ddi.h>
#include <sys/disp.h>
#include <sys/list.h>
#include <sys/sdt.h>

extern r4hashq_t *rtable4;

/*
 * Information that describes what needs to be done for recovery.  It is
 * passed to a client recovery thread as well as passed to various recovery
 * routines.  rc_mi, rc_vp1, and rc_vp2 refer to the filesystem and
 * vnode(s) affected by recovery.  rc_vp1 and rc_vp2 are references (use
 * VN_HOLD) or NULL.  rc_lost_rqst contains information about the lost
 * lock or open/close request, and it holds reference counts for the
 * various objects (vnode, etc.).  The recovery thread also uses flags set
 * in the mntinfo4_t or vnode_t to tell it what to do.  rc_error is used
 * to save the error that originally triggered the recovery event -- it
 * will later be used to set mi_error if recovery doesn't work.
 * rc_bseqid_rqst contains information about the request that got
 * NFS4ERR_BAD_SEQID, and it holds reference counts for the various
 * objects (vnode, open owner, open stream, lock owner).
 */

typedef struct {
	mntinfo4_t *rc_mi;
	vnode_t *rc_vp1;
	vnode_t *rc_vp2;
	nfs4_recov_t rc_action;
	stateid4 rc_stateid;
	bool_t rc_srv_reboot;		/* server has rebooted */
	nfs4_lost_rqst_t *rc_lost_rqst;
	nfs4_error_t rc_orig_errors;	/* original errors causing recovery */
	int rc_error;
	nfs4_bseqid_entry_t *rc_bseqid_rqst;
} recov_info_t;

/*
 * How long to wait before trying again if there is an error doing
 * recovery, in seconds.
 */

static int recov_err_delay = 1;

/*
 * How long to wait when processing NFS4ERR_GRACE or NFS4ERR_DELAY
 * errors.  Expressed in seconds.  The default is defined as
 * NFS4ERR_DELAY_TIME, and this variable is initialized in nfs4_subr_init().
 */
time_t nfs4err_delay_time = 0;

/*
 * Tuneable to limit how many times "exempt" ops go OTW
 * after a recovery error.  Exempt op hints are OH_CLOSE,
 * OH_LOCKU, and OH_DELEGRETURN.  These previously always went
 * OTW even after the rnode was "dead" due to recovery errors.
 *
 * The tuneable below limits the number of times a start_fop
 * invocation will retry the exempt hints.  After the limit
 * is reached, nfs4_start_fop will return an error, just as
 * it would for non-exempt op hints.
 */
int nfs4_max_recov_error_retry = 3;

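/*
 * As with the other knobs in this file, this limit can be adjusted at
 * boot time through /etc/system.  A sketch, assuming the standard "nfs"
 * module prefix used for NFSv4 client tunables:
 *
 *	set nfs:nfs4_max_recov_error_retry = 5
 */
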
/*
 * Number of seconds the recovery thread should pause before retrying when
 * the filesystem has been forcibly unmounted.
 */

int nfs4_unmount_delay = 1;

#ifdef DEBUG

/*
 * How long to wait (in seconds) between recovery operations on a given
 * file.  Normally zero, but could be set longer for testing purposes.
 */
static int nfs4_recovdelay = 0;

/*
 * Switch that controls whether to go into the debugger when recovery
 * fails.
 */
static int nfs4_fail_recov_stop = 0;

/*
 * Tuneables to debug client namespace interaction with server
 * mount points:
 *
 * nfs4_srvmnt_fail_cnt:
 *	number of times EACCES was returned because the client
 *	attempted to cross a server mountpoint
 *
 * nfs4_srvmnt_debug:
 *	trigger a console printf whenever the client attempts
 *	to cross a server mountpoint
 */
int nfs4_srvmnt_fail_cnt = 0;
int nfs4_srvmnt_debug = 0;
#endif

/* forward references, in alphabetic order */
static void close_after_open_resend(vnode_t *, cred_t *, uint32_t,
    nfs4_error_t *);
static void errs_to_action(recov_info_t *,
    nfs4_server_t *, mntinfo4_t *, stateid4 *, nfs4_lost_rqst_t *, int,
    nfs_opnum4, nfs4_bseqid_entry_t *);
static void flush_reinstate(nfs4_lost_rqst_t *);
static void free_milist(mntinfo4_t **, int);
static mntinfo4_t **make_milist(nfs4_server_t *, int *);
static int nfs4_check_recov_err(vnode_t *, nfs4_op_hint_t,
    nfs4_recov_state_t *, int, char *);
static char *nfs4_getsrvnames(mntinfo4_t *, size_t *);
static void nfs4_recov_fh_fail(vnode_t *, int, nfsstat4);
static void nfs4_recov_thread(recov_info_t *);
static void nfs4_remove_lost_rqsts(mntinfo4_t *, nfs4_server_t *);
static void nfs4_resend_lost_rqsts(recov_info_t *, nfs4_server_t *);
static cred_t *pid_to_cr(pid_t);
static void reclaim_one_lock(vnode_t *, flock64_t *, nfs4_error_t *, int *);
static void recov_bad_seqid(recov_info_t *);
static void recov_badstate(recov_info_t *, vnode_t *, nfsstat4);
static void recov_clientid(recov_info_t *, nfs4_server_t *);
static void recov_done(mntinfo4_t *, recov_info_t *);
static void recov_filehandle(nfs4_recov_t, mntinfo4_t *, vnode_t *);
static void recov_newserver(recov_info_t *, nfs4_server_t **, bool_t *);
static void recov_openfiles(recov_info_t *, nfs4_server_t *);
static void recov_stale(mntinfo4_t *, vnode_t *);
static void nfs4_free_lost_rqst(nfs4_lost_rqst_t *, nfs4_server_t *);
static void recov_throttle(recov_info_t *, vnode_t *);
static void relock_skip_pid(locklist_t *, pid_t);
static void resend_lock(nfs4_lost_rqst_t *, nfs4_error_t *);
static void resend_one_op(nfs4_lost_rqst_t *, nfs4_error_t *, mntinfo4_t *,
    nfs4_server_t *);
static void save_bseqid_rqst(nfs4_bseqid_entry_t *, recov_info_t *);
static void start_recovery(recov_info_t *, mntinfo4_t *, vnode_t *, vnode_t *,
    nfs4_server_t *);
static void start_recovery_action(nfs4_recov_t, bool_t, mntinfo4_t *,
    vnode_t *, vnode_t *);
static int wait_for_recovery(mntinfo4_t *, nfs4_op_hint_t);

/*
 * Return non-zero if the given errno, status, and rpc status codes
 * in the nfs4_error_t indicate that client recovery is needed.
 * "stateful" indicates whether the call that got the error establishes or
 * removes state on the server (open, close, lock, unlock, delegreturn).
 */

int
nfs4_needs_recovery(nfs4_error_t *ep, bool_t stateful, vfs_t *vfsp)
{
	int recov = 0;
	mntinfo4_t *mi;

	/*
	 * Try failover if the error values justify it and if
	 * it's a failover mount.  Don't try if the mount is in
	 * progress; failures are handled explicitly by nfs4rootvp.
	 */
	if (nfs4_try_failover(ep)) {
		mi = VFTOMI4(vfsp);
		mutex_enter(&mi->mi_lock);
		recov = FAILOVER_MOUNT4(mi) && !(mi->mi_flags & MI4_MOUNTING);
		mutex_exit(&mi->mi_lock);
		if (recov)
			return (recov);
	}

	if (ep->error == EINTR || NFS4_FRC_UNMT_ERR(ep->error, vfsp)) {
		/*
		 * The server may have gotten the request, so for stateful
		 * ops we need to resynchronize and possibly back out the
		 * op.
		 */
		return (stateful);
	}
	if (ep->error != 0)
		return (0);

	/* stat values are listed alphabetically */
	/*
	 * There are two lists here: the errors for which we have code, and
	 * the errors for which we plan to have code before FCS.  For the
	 * second list, print a warning message but don't attempt recovery.
	 */
	switch (ep->stat) {
	case NFS4ERR_BADHANDLE:
	case NFS4ERR_BAD_SEQID:
	case NFS4ERR_BAD_STATEID:
	case NFS4ERR_DELAY:
	case NFS4ERR_EXPIRED:
	case NFS4ERR_FHEXPIRED:
	case NFS4ERR_GRACE:
	case NFS4ERR_OLD_STATEID:
	case NFS4ERR_RESOURCE:
	case NFS4ERR_STALE_CLIENTID:
	case NFS4ERR_STALE_STATEID:
	case NFS4ERR_WRONGSEC:
	case NFS4ERR_STALE:
		recov = 1;
		break;
#ifdef DEBUG
	case NFS4ERR_LEASE_MOVED:
	case NFS4ERR_MOVED:
		zcmn_err(VFTOMI4(vfsp)->mi_zone->zone_id,
		    CE_WARN, "!Can't yet recover from NFS status %d",
		    ep->stat);
		break;
#endif
	}

	return (recov);
}

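/*
 * A sketch of the typical caller pattern after an OTW call fails (the
 * OP_READ opnum and the surrounding names are illustrative only):
 *
 *	needrecov = nfs4_needs_recovery(&e, FALSE, mi->mi_vfsp);
 *	if (needrecov)
 *		(void) nfs4_start_recovery(&e, mi, vp, NULL, NULL,
 *		    NULL, OP_READ, NULL);
 */
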
/*
 * Some operations such as DELEGRETURN want to avoid invoking
 * recovery actions that will only mark the file dead.  If
 * better handlers are invoked for any of these errors, this
 * routine should be modified.
 */
int
nfs4_recov_marks_dead(nfsstat4 status)
{
	if (status == NFS4ERR_BAD_SEQID ||
	    status == NFS4ERR_EXPIRED ||
	    status == NFS4ERR_BAD_STATEID ||
	    status == NFS4ERR_OLD_STATEID)
		return (1);
	return (0);
}

/*
 * Transfer the state recovery information in recovp to mi's resend queue,
 * and mark mi as having a lost state request.
 */
static void
nfs4_enqueue_lost_rqst(recov_info_t *recovp, mntinfo4_t *mi)
{
	nfs4_lost_rqst_t *lrp = recovp->rc_lost_rqst;

	ASSERT(nfs_rw_lock_held(&mi->mi_recovlock, RW_READER) ||
	    nfs_rw_lock_held(&mi->mi_recovlock, RW_WRITER));

	ASSERT(lrp != NULL && lrp->lr_op != 0);

	NFS4_DEBUG(nfs4_lost_rqst_debug, (CE_NOTE,
	    "nfs4_enqueue_lost_rqst %p, op %d",
	    (void *)lrp, lrp->lr_op));

	mutex_enter(&mi->mi_lock);
	mi->mi_recovflags |= MI4R_LOST_STATE;
	if (lrp->lr_putfirst)
		list_insert_head(&mi->mi_lost_state, lrp);
	else
		list_insert_tail(&mi->mi_lost_state, lrp);
	recovp->rc_lost_rqst = NULL;
	mutex_exit(&mi->mi_lock);

	nfs4_queue_event(RE_LOST_STATE, mi, NULL, lrp->lr_op, lrp->lr_vp,
	    lrp->lr_dvp, 0, NULL, 0, TAG_NONE, TAG_NONE, 0, 0);
}

/*
 * Transfer the bad seqid recovery information in recovp to mi's
 * bad seqid queue, and mark mi as having a bad seqid request.
 */
void
enqueue_bseqid_rqst(recov_info_t *recovp, mntinfo4_t *mi)
{
	ASSERT(nfs_rw_lock_held(&mi->mi_recovlock, RW_READER) ||
	    nfs_rw_lock_held(&mi->mi_recovlock, RW_WRITER));
	ASSERT(recovp->rc_bseqid_rqst != NULL);

	mutex_enter(&mi->mi_lock);
	mi->mi_recovflags |= MI4R_BAD_SEQID;
	list_insert_tail(&mi->mi_bseqid_list, recovp->rc_bseqid_rqst);
	recovp->rc_bseqid_rqst = NULL;
	mutex_exit(&mi->mi_lock);
}

/*
 * Initiate recovery.
 *
 * The nfs4_error_t contains the return codes that triggered a recovery
 * attempt.  mi, vp1, and vp2 refer to the filesystem and files that were
 * being operated on.  vp1 and vp2 may be NULL.
 *
 * Multiple calls are okay.  If recovery is already underway, the call
 * updates the information about what state needs recovery but does not
 * start a new thread.  The caller should hold mi->mi_recovlock as a reader
 * for proper synchronization with any recovery thread.
 *
 * This will return TRUE if recovery was aborted, and FALSE otherwise.
 */
bool_t
nfs4_start_recovery(nfs4_error_t *ep, mntinfo4_t *mi, vnode_t *vp1,
    vnode_t *vp2, stateid4 *sid, nfs4_lost_rqst_t *lost_rqstp, nfs_opnum4 op,
    nfs4_bseqid_entry_t *bsep)
{
	recov_info_t *recovp;
	nfs4_server_t *sp;
	bool_t abort = FALSE;
	bool_t gone = FALSE;

	ASSERT(nfs_zone() == mi->mi_zone);
	mutex_enter(&mi->mi_lock);
	/*
	 * If there is lost state, we need to kick off recovery even if the
	 * filesystem has been unmounted or the zone is shutting down.
	 */
	gone = FS_OR_ZONE_GONE4(mi->mi_vfsp);
	if (gone) {
		ASSERT(ep->error != EINTR || lost_rqstp != NULL);
		if (ep->error == EIO && lost_rqstp == NULL) {
			/* failed due to forced unmount, no new lost state */
			abort = TRUE;
		}
		if ((ep->error == 0 || ep->error == ETIMEDOUT) &&
		    !(mi->mi_recovflags & MI4R_LOST_STATE)) {
			/* some other failure, no existing lost state */
			abort = TRUE;
		}
		if (abort) {
			mutex_exit(&mi->mi_lock);
			NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
			    "nfs4_start_recovery: fs unmounted"));
			return (TRUE);
		}
	}
	mi->mi_in_recovery++;
	mutex_exit(&mi->mi_lock);

	recovp = kmem_alloc(sizeof (recov_info_t), KM_SLEEP);
	recovp->rc_orig_errors = *ep;
	sp = find_nfs4_server(mi);
	errs_to_action(recovp, sp, mi, sid, lost_rqstp, gone, op, bsep);
	if (sp != NULL)
		mutex_exit(&sp->s_lock);
	start_recovery(recovp, mi, vp1, vp2, sp);
	if (sp != NULL)
		nfs4_server_rele(sp);
	return (FALSE);
}

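/*
 * Callers that loop and resend an operation typically honor the abort
 * indication along these lines (a sketch; the OP_CLOSE opnum and the
 * recov_retry label are illustrative):
 *
 *	abort = nfs4_start_recovery(&e, mi, vp, NULL, NULL, NULL,
 *	    OP_CLOSE, NULL);
 *	...
 *	if (abort == FALSE)
 *		goto recov_retry;
 */
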
/*
 * Internal version of nfs4_start_recovery.  The difference is that the
 * caller specifies the recovery action, rather than the errors leading to
 * recovery.
 */
static void
start_recovery_action(nfs4_recov_t what, bool_t reboot, mntinfo4_t *mi,
    vnode_t *vp1, vnode_t *vp2)
{
	recov_info_t *recovp;

	ASSERT(nfs_zone() == mi->mi_zone);
	mutex_enter(&mi->mi_lock);
	mi->mi_in_recovery++;
	mutex_exit(&mi->mi_lock);

	recovp = kmem_zalloc(sizeof (recov_info_t), KM_SLEEP);
	recovp->rc_action = what;
	recovp->rc_srv_reboot = reboot;
	recovp->rc_error = EIO;
	start_recovery(recovp, mi, vp1, vp2, NULL);
}

static void
start_recovery(recov_info_t *recovp, mntinfo4_t *mi,
    vnode_t *vp1, vnode_t *vp2, nfs4_server_t *sp)
{
	NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
	    "start_recovery: mi %p, what %s", (void *)mi,
	    nfs4_recov_action_to_str(recovp->rc_action)));

	/*
	 * Bump the reference on the vfs so that we can pass it to the
	 * recovery thread.
	 */
	VFS_HOLD(mi->mi_vfsp);
	MI4_HOLD(mi);
again:
	switch (recovp->rc_action) {
	case NR_FAILOVER:
		ASSERT(nfs_rw_lock_held(&mi->mi_recovlock, RW_READER) ||
		    nfs_rw_lock_held(&mi->mi_recovlock, RW_WRITER));
		if (mi->mi_servers->sv_next == NULL)
			goto out_no_thread;
		mutex_enter(&mi->mi_lock);
		mi->mi_recovflags |= MI4R_NEED_NEW_SERVER;
		mutex_exit(&mi->mi_lock);

		if (recovp->rc_lost_rqst != NULL)
			nfs4_enqueue_lost_rqst(recovp, mi);
		break;

	case NR_CLIENTID:
		/*
		 * If the filesystem has been unmounted, punt.
		 */
		if (sp == NULL)
			goto out_no_thread;

		/*
		 * If nobody else is working on the clientid, mark the
		 * clientid as being no longer set.  Then mark the specific
		 * filesystem being worked on.
		 */
		if (!nfs4_server_in_recovery(sp)) {
			mutex_enter(&sp->s_lock);
			sp->s_flags &= ~N4S_CLIENTID_SET;
			mutex_exit(&sp->s_lock);
		}
		ASSERT(nfs_rw_lock_held(&mi->mi_recovlock, RW_READER) ||
		    nfs_rw_lock_held(&mi->mi_recovlock, RW_WRITER));
		mutex_enter(&mi->mi_lock);
		mi->mi_recovflags |= MI4R_NEED_CLIENTID;
		if (recovp->rc_srv_reboot)
			mi->mi_recovflags |= MI4R_SRV_REBOOT;
		mutex_exit(&mi->mi_lock);
		break;

	case NR_OPENFILES:
		ASSERT(nfs_rw_lock_held(&mi->mi_recovlock, RW_READER) ||
		    nfs_rw_lock_held(&mi->mi_recovlock, RW_WRITER));
		mutex_enter(&mi->mi_lock);
		mi->mi_recovflags |= MI4R_REOPEN_FILES;
		if (recovp->rc_srv_reboot)
			mi->mi_recovflags |= MI4R_SRV_REBOOT;
		mutex_exit(&mi->mi_lock);
		break;

	case NR_WRONGSEC:
		ASSERT(nfs_rw_lock_held(&mi->mi_recovlock, RW_READER) ||
		    nfs_rw_lock_held(&mi->mi_recovlock, RW_WRITER));
		mutex_enter(&mi->mi_lock);
		mi->mi_recovflags |= MI4R_NEED_SECINFO;
		mutex_exit(&mi->mi_lock);
		break;

	case NR_EXPIRED:
		if (vp1 != NULL)
			recov_badstate(recovp, vp1, NFS4ERR_EXPIRED);
		if (vp2 != NULL)
			recov_badstate(recovp, vp2, NFS4ERR_EXPIRED);
		goto out_no_thread;	/* no further recovery possible */

	case NR_BAD_STATEID:
		if (vp1 != NULL)
			recov_badstate(recovp, vp1, NFS4ERR_BAD_STATEID);
		if (vp2 != NULL)
			recov_badstate(recovp, vp2, NFS4ERR_BAD_STATEID);
		goto out_no_thread;	/* no further recovery possible */

	case NR_FHEXPIRED:
	case NR_BADHANDLE:
		if (vp1 != NULL)
			recov_throttle(recovp, vp1);
		if (vp2 != NULL)
			recov_throttle(recovp, vp2);
		/*
		 * Recover the filehandle now, rather than using a
		 * separate thread.  We can do this because filehandle
		 * recovery is independent of any other state, and because
		 * we know that we are not competing with the recovery
		 * thread at this time.  recov_filehandle will deal with
		 * threads that are competing to recover this filehandle.
		 */
		ASSERT(nfs_rw_lock_held(&mi->mi_recovlock, RW_READER) ||
		    nfs_rw_lock_held(&mi->mi_recovlock, RW_WRITER));
		if (vp1 != NULL)
			recov_filehandle(recovp->rc_action, mi, vp1);
		if (vp2 != NULL)
			recov_filehandle(recovp->rc_action, mi, vp2);
		goto out_no_thread;	/* no further recovery needed */

	case NR_STALE:
		/*
		 * NFS4ERR_STALE handling
		 * recov_stale() could set MI4R_NEED_NEW_SERVER to
		 * indicate that we can and should failover.
		 */
		ASSERT(nfs_rw_lock_held(&mi->mi_recovlock, RW_READER) ||
		    nfs_rw_lock_held(&mi->mi_recovlock, RW_WRITER));

		if (vp1 != NULL)
			recov_stale(mi, vp1);
		if (vp2 != NULL)
			recov_stale(mi, vp2);
		mutex_enter(&mi->mi_lock);
		if ((mi->mi_recovflags & MI4R_NEED_NEW_SERVER) == 0) {
			mutex_exit(&mi->mi_lock);
			goto out_no_thread;
		}
		mutex_exit(&mi->mi_lock);
		recovp->rc_action = NR_FAILOVER;
		goto again;

	case NR_BAD_SEQID:
		if (recovp->rc_bseqid_rqst) {
			enqueue_bseqid_rqst(recovp, mi);
			break;
		}

		if (vp1 != NULL)
			recov_badstate(recovp, vp1, NFS4ERR_BAD_SEQID);
		if (vp2 != NULL)
			recov_badstate(recovp, vp2, NFS4ERR_BAD_SEQID);
		goto out_no_thread;	/* no further recovery possible */

	case NR_OLDSTATEID:
		if (vp1 != NULL)
			recov_badstate(recovp, vp1, NFS4ERR_OLD_STATEID);
		if (vp2 != NULL)
			recov_badstate(recovp, vp2, NFS4ERR_OLD_STATEID);
		goto out_no_thread;	/* no further recovery possible */

	case NR_GRACE:
		nfs4_set_grace_wait(mi);
		goto out_no_thread;	/* no further action for GRACE */

	case NR_DELAY:
		if (vp1)
			nfs4_set_delay_wait(vp1);
		goto out_no_thread;	/* no further action for DELAY */

	case NR_LOST_STATE_RQST:
	case NR_LOST_LOCK:
		nfs4_enqueue_lost_rqst(recovp, mi);
		break;

	default:
		nfs4_queue_event(RE_UNEXPECTED_ACTION, mi, NULL,
		    recovp->rc_action, NULL, NULL, 0, NULL, 0, TAG_NONE,
		    TAG_NONE, 0, 0);
		goto out_no_thread;
	}

	/*
	 * If either file recently went through the same recovery, wait
	 * awhile.  This is in case there is some sort of bug; we might not
	 * be able to recover properly, but at least we won't bombard the
	 * server with calls, and we won't tie up the client.
	 */
	if (vp1 != NULL)
		recov_throttle(recovp, vp1);
	if (vp2 != NULL)
		recov_throttle(recovp, vp2);

	/*
	 * If there's already a recovery thread, don't start another one.
	 */

	mutex_enter(&mi->mi_lock);
	if (mi->mi_flags & MI4_RECOV_ACTIV) {
		mutex_exit(&mi->mi_lock);
		goto out_no_thread;
	}
	mi->mi_flags |= MI4_RECOV_ACTIV;
	mutex_exit(&mi->mi_lock);
	NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
	    "start_recovery: starting new thread for mi %p", (void *)mi));

	recovp->rc_mi = mi;
	recovp->rc_vp1 = vp1;
	if (vp1 != NULL) {
		ASSERT(VTOMI4(vp1) == mi);
		VN_HOLD(recovp->rc_vp1);
	}
	recovp->rc_vp2 = vp2;
	if (vp2 != NULL) {
		ASSERT(VTOMI4(vp2) == mi);
		VN_HOLD(recovp->rc_vp2);
	}

	(void) zthread_create(NULL, 0, nfs4_recov_thread, recovp, 0,
	    minclsyspri);
	return;

	/* not reached by thread creating call */
out_no_thread:
	mutex_enter(&mi->mi_lock);
	mi->mi_in_recovery--;
	if (mi->mi_in_recovery == 0)
		cv_broadcast(&mi->mi_cv_in_recov);
	mutex_exit(&mi->mi_lock);

	VFS_RELE(mi->mi_vfsp);
	MI4_RELE(mi);
	/*
	 * Free up resources that were allocated for us.
	 */
	kmem_free(recovp, sizeof (recov_info_t));
}

static int
nfs4_check_recov_err(vnode_t *vp, nfs4_op_hint_t op,
    nfs4_recov_state_t *rsp, int retry_err_cnt, char *str)
{
	rnode4_t *rp;
	int error = 0;
	int exempt;

	if (vp == NULL)
		return (0);

	exempt = (op == OH_CLOSE || op == OH_LOCKU || op == OH_DELEGRETURN);
	rp = VTOR4(vp);
	mutex_enter(&rp->r_statelock);

	/*
	 * If there was a recovery error, then allow op hints "exempt" from
	 * recov errors to retry (currently 3 times).  Either r_error or
	 * EIO is returned for non-exempt op hints.
	 */
	if (rp->r_flags & R4RECOVERR) {
		if (exempt && rsp->rs_num_retry_despite_err <=
		    nfs4_max_recov_error_retry) {

			/*
			 * Check to make sure that we haven't already inc'd
			 * rs_num_retry_despite_err for the current
			 * nfs4_start_fop instance.  We don't want to double
			 * inc (if we were called with vp2, then the vp1
			 * call could have already incremented).
			 */
			if (retry_err_cnt == rsp->rs_num_retry_despite_err)
				rsp->rs_num_retry_despite_err++;

			NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
			    "nfs4_start_fop: %s %p DEAD, cnt=%d", str,
			    (void *)vp, rsp->rs_num_retry_despite_err));
		} else {
			error = (rp->r_error ? rp->r_error : EIO);
			/*
			 * An ESTALE error on a non-regular file is not
			 * "sticky".  Return the ESTALE error once, but
			 * clear the condition to allow future operations
			 * to go OTW.  This will allow the client to
			 * recover if the server has merely unshared then
			 * re-shared the file system.  For regular files,
			 * the unshare has destroyed the open state at the
			 * server and we aren't willing to do a reopen (yet).
			 */
			if (error == ESTALE && vp->v_type != VREG) {
				rp->r_flags &=
				    ~(R4RECOVERR|R4RECOVERRP|R4STALE);
				rp->r_error = 0;
				error = ESTALE;
			}
			NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
			    "nfs4_start_fop: %s %p DEAD, cnt=%d error=%d",
			    str, (void *)vp,
			    rsp->rs_num_retry_despite_err, error));
		}
	}

	mutex_exit(&rp->r_statelock);
	return (error);
}

/*
 * Initial setup code that every operation should call if it might invoke
 * client recovery.  Can block waiting for recovery to finish on a
 * filesystem.  Either vnode ptr can be NULL.
 *
 * Returns 0 if there are no outstanding errors.  Can return an
 * errno value under various circumstances (e.g., failed recovery, or
 * interrupted while waiting for recovery to finish).
 *
 * There must be a corresponding call to nfs4_end_op() to free up any locks
 * or resources allocated by this call (assuming this call succeeded),
 * using the same rsp that's passed in here.
 *
 * The open and lock seqid synchronization must be stopped before calling
 * this function, as it could lead to deadlock when trying to reopen a file
 * or reclaim a lock.  The synchronization is obtained with calls to:
 *	nfs4_start_open_seqid_sync()
 *	nfs4_start_lock_seqid_sync()
 *
 * *startrecovp is set TRUE if the caller should not bother with the
 * over-the-wire call, and just initiate recovery for the given request.
 * This is typically used for state-releasing ops if the filesystem has
 * been forcibly unmounted.  startrecovp may be NULL for
 * non-state-releasing ops.
 */

int
nfs4_start_fop(mntinfo4_t *mi, vnode_t *vp1, vnode_t *vp2, nfs4_op_hint_t op,
    nfs4_recov_state_t *rsp, bool_t *startrecovp)
{
	int error = 0, rerr_cnt;
	nfs4_server_t *sp = NULL;
	nfs4_server_t *tsp;
	nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };
	time_t droplock_time;
#ifdef DEBUG
	void *fop_caller;
#endif

	ASSERT(vp1 == NULL || vp1->v_vfsp == mi->mi_vfsp);
	ASSERT(vp2 == NULL || vp2->v_vfsp == mi->mi_vfsp);

#ifdef DEBUG
	if ((fop_caller = tsd_get(nfs4_tsd_key)) != NULL) {
		cmn_err(CE_PANIC, "Missing nfs4_end_fop: last caller %p",
		    fop_caller);
	}
	(void) tsd_set(nfs4_tsd_key, caller());
#endif

	rsp->rs_sp = NULL;
	rsp->rs_flags &= ~NFS4_RS_RENAME_HELD;
	rerr_cnt = rsp->rs_num_retry_despite_err;

	/*
	 * Process the items that may delay() based on server response
	 */
	error = nfs4_wait_for_grace(mi, rsp);
	if (error)
		goto out;

	if (vp1 != NULL) {
		error = nfs4_wait_for_delay(vp1, rsp);
		if (error)
			goto out;
	}

	/* Wait for a delegation recall to complete. */

	error = wait_for_recall(vp1, vp2, op, rsp);
	if (error)
		goto out;

	/*
	 * Wait for any current recovery actions to finish.  Note that a
	 * recovery thread can still start up after wait_for_recovery()
	 * finishes.  We don't block out recovery operations until we
	 * acquire s_recovlock and mi_recovlock.
	 */
	error = wait_for_recovery(mi, op);
	if (error)
		goto out;

	/*
	 * Check to see if the rnode is already marked with a
	 * recovery error.  If so, return it immediately.  But
	 * always pass CLOSE, LOCKU, and DELEGRETURN so we can
	 * clean up state on the server.
	 */

	if (vp1 != NULL) {
		if (error = nfs4_check_recov_err(vp1, op, rsp, rerr_cnt, "vp1"))
			goto out;
		nfs4_check_remap(mi, vp1, NFS4_REMAP_CKATTRS, &e);
	}

	if (vp2 != NULL) {
		if (error = nfs4_check_recov_err(vp2, op, rsp, rerr_cnt, "vp2"))
			goto out;
		nfs4_check_remap(mi, vp2, NFS4_REMAP_CKATTRS, &e);
	}

	/*
	 * The lock order calls for us to acquire s_recovlock before
	 * mi_recovlock, but we have to hold mi_recovlock to look up sp (to
	 * prevent races with the failover/migration code).  So acquire
	 * mi_recovlock, look up sp, drop mi_recovlock, acquire
	 * s_recovlock and mi_recovlock, then verify that sp is still the
	 * right object.  XXX Can we find a simpler way to deal with this?
	 */
	if (nfs_rw_enter_sig(&mi->mi_recovlock, RW_READER,
	    mi->mi_flags & MI4_INT)) {
		error = EINTR;
		goto out;
	}
get_sp:
	sp = find_nfs4_server(mi);
	if (sp != NULL) {
		sp->s_otw_call_count++;
		mutex_exit(&sp->s_lock);
		droplock_time = gethrestime_sec();
	}
	nfs_rw_exit(&mi->mi_recovlock);

	if (sp != NULL) {
		if (nfs_rw_enter_sig(&sp->s_recovlock, RW_READER,
		    mi->mi_flags & MI4_INT)) {
			error = EINTR;
			goto out;
		}
	}
	if (nfs_rw_enter_sig(&mi->mi_recovlock, RW_READER,
	    mi->mi_flags & MI4_INT)) {
		if (sp != NULL)
			nfs_rw_exit(&sp->s_recovlock);
		error = EINTR;
		goto out;
	}
	/*
	 * If the mntinfo4_t hasn't changed nfs4_server_ts, then
	 * there's no point in double checking to make sure it
	 * has switched.
	 */
	if (sp == NULL || droplock_time < mi->mi_srvsettime) {
		tsp = find_nfs4_server(mi);
		if (tsp != sp) {
			/* try again */
			if (tsp != NULL) {
				mutex_exit(&tsp->s_lock);
				nfs4_server_rele(tsp);
				tsp = NULL;
			}
			if (sp != NULL) {
				nfs_rw_exit(&sp->s_recovlock);
				mutex_enter(&sp->s_lock);
				sp->s_otw_call_count--;
				mutex_exit(&sp->s_lock);
				nfs4_server_rele(sp);
				sp = NULL;
			}
			goto get_sp;
		} else {
			if (tsp != NULL) {
				mutex_exit(&tsp->s_lock);
				nfs4_server_rele(tsp);
				tsp = NULL;
			}
		}
	}

	if (sp != NULL) {
		rsp->rs_sp = sp;
	}

	/*
	 * If the filesystem uses volatile filehandles, obtain a lock so
	 * that we synchronize with renames.  Exception: mount operations
	 * can change mi_fh_expire_type, which could be a problem, since
	 * the end_op code needs to be consistent with the start_op code
	 * about mi_rename_lock.  Since mounts don't compete with renames,
	 * it's simpler to just not acquire the rename lock for mounts.
	 */
	if (NFS4_VOLATILE_FH(mi) && op != OH_MOUNT) {
		if (nfs_rw_enter_sig(&mi->mi_rename_lock,
		    op == OH_VFH_RENAME ? RW_WRITER : RW_READER,
		    mi->mi_flags & MI4_INT)) {
			nfs_rw_exit(&mi->mi_recovlock);
			if (sp != NULL)
				nfs_rw_exit(&sp->s_recovlock);
			error = EINTR;
			goto out;
		}
		rsp->rs_flags |= NFS4_RS_RENAME_HELD;
	}

	if (OH_IS_STATE_RELE(op)) {
		/*
		 * For forced unmount, letting the request proceed will
		 * almost always delay response to the user, so hand it off
		 * to the recovery thread.  For exiting lwp's, we don't
		 * have a good way to tell if the request will hang.  We
		 * generally want processes to handle their own requests so
		 * that they can be done in parallel, but if there is
		 * already a recovery thread, hand the request off to it.
		 * This will improve user response at no cost to overall
		 * system throughput.  For zone shutdown, we'd prefer
		 * the recovery thread to handle this as well.
		 */
		ASSERT(startrecovp != NULL);
		mutex_enter(&mi->mi_lock);
		if (FS_OR_ZONE_GONE4(mi->mi_vfsp))
			*startrecovp = TRUE;
		else if ((curthread->t_proc_flag & TP_LWPEXIT) &&
		    (mi->mi_flags & MI4_RECOV_ACTIV))
			*startrecovp = TRUE;
		else
			*startrecovp = FALSE;
		mutex_exit(&mi->mi_lock);
	} else
		if (startrecovp != NULL)
			*startrecovp = FALSE;

	ASSERT(error == 0);
	return (error);

out:
	ASSERT(error != 0);
	if (sp != NULL) {
		mutex_enter(&sp->s_lock);
		sp->s_otw_call_count--;
		mutex_exit(&sp->s_lock);
		nfs4_server_rele(sp);
		rsp->rs_sp = NULL;
	}
	nfs4_end_op_recall(vp1, vp2, rsp);

#ifdef DEBUG
	(void) tsd_set(nfs4_tsd_key, NULL);
#endif
	return (error);
}

/*
 * It is up to the caller to determine if rsp->rs_sp being NULL
 * is detrimental or not.
 */
int
nfs4_start_op(mntinfo4_t *mi, vnode_t *vp1, vnode_t *vp2,
    nfs4_recov_state_t *rsp)
{
	ASSERT(rsp->rs_num_retry_despite_err == 0);
	rsp->rs_num_retry_despite_err = 0;
	return (nfs4_start_fop(mi, vp1, vp2, OH_OTHER, rsp, NULL));
}

/*
 * Release any resources acquired by nfs4_start_op().
 * 'sp' should be the nfs4_server pointer returned by nfs4_start_op().
 *
 * The operation hint is used to avoid a deadlock by bypassing delegation
 * return logic for writes, which are done while returning a delegation.
 */

void
nfs4_end_fop(mntinfo4_t *mi, vnode_t *vp1, vnode_t *vp2, nfs4_op_hint_t op,
    nfs4_recov_state_t *rsp, bool_t needs_recov)
{
	nfs4_server_t *sp = rsp->rs_sp;
	rnode4_t *rp = NULL;

#ifdef lint
	/*
	 * The op hint isn't used any more, but might be in
	 * the future.
	 */
	op = op;
#endif

#ifdef DEBUG
	ASSERT(tsd_get(nfs4_tsd_key) != NULL);
	(void) tsd_set(nfs4_tsd_key, NULL);
#endif

	nfs4_end_op_recall(vp1, vp2, rsp);

	if (rsp->rs_flags & NFS4_RS_RENAME_HELD)
		nfs_rw_exit(&mi->mi_rename_lock);

	if (!needs_recov) {
		if (rsp->rs_flags & NFS4_RS_DELAY_MSG) {
			/* may need to clear the delay interval */
			if (vp1 != NULL) {
				rp = VTOR4(vp1);
				mutex_enter(&rp->r_statelock);
				rp->r_delay_interval = 0;
				mutex_exit(&rp->r_statelock);
			}
		}
		rsp->rs_flags &= ~(NFS4_RS_GRACE_MSG|NFS4_RS_DELAY_MSG);
	}

	/*
	 * If the corresponding nfs4_start_op() found a sp,
	 * then there must still be a sp.
	 */
	if (sp != NULL) {
		nfs_rw_exit(&mi->mi_recovlock);
		nfs_rw_exit(&sp->s_recovlock);
		mutex_enter(&sp->s_lock);
		sp->s_otw_call_count--;
		cv_broadcast(&sp->s_cv_otw_count);
		mutex_exit(&sp->s_lock);
		nfs4_server_rele(sp);
	} else {
		nfs_rw_exit(&mi->mi_recovlock);
	}
}

void
nfs4_end_op(mntinfo4_t *mi, vnode_t *vp1, vnode_t *vp2,
    nfs4_recov_state_t *rsp, bool_t needrecov)
{
	nfs4_end_fop(mi, vp1, vp2, OH_OTHER, rsp, needrecov);
}

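/*
 * A sketch of how the start/end pairing brackets an OTW call (the opnum,
 * op hint, and label below are illustrative only):
 *
 *	recov_state.rs_flags = 0;
 *	recov_state.rs_num_retry_despite_err = 0;
 * recov_retry:
 *	e.error = nfs4_start_fop(mi, vp, NULL, OH_READ, &recov_state, NULL);
 *	if (e.error)
 *		return (e.error);
 *	... issue the over-the-wire call, filling in 'e' ...
 *	needrecov = nfs4_needs_recovery(&e, FALSE, mi->mi_vfsp);
 *	if (needrecov)
 *		abort = nfs4_start_recovery(&e, mi, vp, NULL, NULL,
 *		    NULL, OP_READ, NULL);
 *	nfs4_end_fop(mi, vp, NULL, OH_READ, &recov_state, needrecov);
 *	if (needrecov && abort == FALSE)
 *		goto recov_retry;
 */
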
/*
 * If the filesystem is going through client recovery, block until
 * finished.
 * Exceptions:
 * - state-releasing ops (CLOSE, LOCKU, DELEGRETURN) are allowed to proceed
 *   if the filesystem has been forcibly unmounted or the lwp is exiting.
 *
 * Return value:
 * - 0 if no errors
 * - EINTR if the call was interrupted
 * - EIO if the filesystem has been forcibly unmounted (non-state-releasing
 *   op)
 * - the errno value from the recovery thread, if recovery failed
 */

static int
wait_for_recovery(mntinfo4_t *mi, nfs4_op_hint_t op_hint)
{
	int error = 0;

	mutex_enter(&mi->mi_lock);

	while (mi->mi_recovflags != 0) {
		klwp_t *lwp = ttolwp(curthread);

		if (mi->mi_flags & MI4_RECOV_FAIL)
			break;
		if (mi->mi_vfsp->vfs_flag & VFS_UNMOUNTED)
			break;
		if (OH_IS_STATE_RELE(op_hint) &&
		    (curthread->t_proc_flag & TP_LWPEXIT))
			break;

		if (lwp != NULL)
			lwp->lwp_nostop++;
		/* XXX - use different cv? */
		if (cv_wait_sig(&mi->mi_failover_cv, &mi->mi_lock) == 0) {
			error = EINTR;
			if (lwp != NULL)
				lwp->lwp_nostop--;
			break;
		}
		if (lwp != NULL)
			lwp->lwp_nostop--;
	}

	if (mi->mi_flags & MI4_RECOV_FAIL) {
		NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
		    "wait_for_recovery: fail since RECOV FAIL"));
		error = mi->mi_error;
	} else if ((mi->mi_vfsp->vfs_flag & VFS_UNMOUNTED) &&
	    !OH_IS_STATE_RELE(op_hint)) {
		NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
		    "wait_for_recovery: forced unmount"));
		error = EIO;
	}

	mutex_exit(&mi->mi_lock);

	return (error);
}

/*
 * If the client received NFS4ERR_GRACE for this particular mount,
 * the client blocks here until it is time to try again.
 *
 * Return value:
 * - 0 if wait was successful
 * - EINTR if the call was interrupted
 */

int
nfs4_wait_for_grace(mntinfo4_t *mi, nfs4_recov_state_t *rsp)
{
	int error = 0;
	time_t curtime, time_to_wait;

	/* do an unprotected check to reduce mi_lock contention */
	if (mi->mi_grace_wait != 0) {
		mutex_enter(&mi->mi_lock);

		if (mi->mi_grace_wait != 0) {
			if (!(rsp->rs_flags & NFS4_RS_GRACE_MSG))
				rsp->rs_flags |= NFS4_RS_GRACE_MSG;

			curtime = gethrestime_sec();

			if (curtime < mi->mi_grace_wait) {

				time_to_wait = mi->mi_grace_wait - curtime;

				mutex_exit(&mi->mi_lock);

				error = delay_sig(SEC_TO_TICK(time_to_wait));
				if (error)
					return (error);

				curtime = gethrestime_sec();

				mutex_enter(&mi->mi_lock);

				if (curtime >= mi->mi_grace_wait)
					mi->mi_grace_wait = 0;
			} else {
				mi->mi_grace_wait = 0;
			}
		}
		mutex_exit(&mi->mi_lock);
	}

	return (error);
}

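/*
 * Note that mi_grace_wait is armed by nfs4_set_grace_wait(), which the
 * NR_GRACE action in start_recovery() invokes when an OTW call draws
 * NFS4ERR_GRACE; the next trip through nfs4_start_fop() then sleeps here
 * instead of immediately retrying against the server.
 */
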
/*
 * If the client received NFS4ERR_DELAY for an operation on a vnode,
 * the client blocks here until it is time to try again.
 *
 * Return value:
 * - 0 if wait was successful
 * - EINTR if the call was interrupted
 */

int
nfs4_wait_for_delay(vnode_t *vp, nfs4_recov_state_t *rsp)
{
	int error = 0;
	time_t curtime, time_to_wait;
	rnode4_t *rp;

	ASSERT(vp != NULL);

	rp = VTOR4(vp);

	/* do an unprotected check to reduce r_statelock contention */
	if (rp->r_delay_wait != 0) {
		mutex_enter(&rp->r_statelock);

		if (rp->r_delay_wait != 0) {

			if (!(rsp->rs_flags & NFS4_RS_DELAY_MSG)) {
				rsp->rs_flags |= NFS4_RS_DELAY_MSG;
				nfs4_mi_kstat_inc_delay(VTOMI4(vp));
			}

			curtime = gethrestime_sec();

			if (curtime < rp->r_delay_wait) {

				time_to_wait = rp->r_delay_wait - curtime;

				mutex_exit(&rp->r_statelock);

				error = delay_sig(SEC_TO_TICK(time_to_wait));
				if (error)
					return (error);

				curtime = gethrestime_sec();

				mutex_enter(&rp->r_statelock);

				if (curtime >= rp->r_delay_wait)
					rp->r_delay_wait = 0;
			} else {
				rp->r_delay_wait = 0;
			}
		}
		mutex_exit(&rp->r_statelock);
	}

	return (error);
}

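/*
 * This is the per-vnode analogue of nfs4_wait_for_grace(): r_delay_wait
 * is armed by nfs4_set_delay_wait() from the NR_DELAY action above, and
 * nfs4_end_fop() clears r_delay_interval once an OTW call finally
 * completes without needing recovery.
 */
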
/*
 * The recovery thread.
 */

static void
nfs4_recov_thread(recov_info_t *recovp)
{
	mntinfo4_t *mi = recovp->rc_mi;
	nfs4_server_t *sp;
	int done = 0, error = 0;
	bool_t recov_fail = FALSE;
	callb_cpr_t cpr_info;
	kmutex_t cpr_lock;

	nfs4_queue_event(RE_START, mi, NULL, mi->mi_recovflags,
	    recovp->rc_vp1, recovp->rc_vp2, 0, NULL, 0, TAG_NONE, TAG_NONE,
	    0, 0);

	mutex_init(&cpr_lock, NULL, MUTEX_DEFAULT, NULL);
	CALLB_CPR_INIT(&cpr_info, &cpr_lock, callb_generic_cpr, "nfsv4Recov");

	mutex_enter(&mi->mi_lock);
	mi->mi_recovthread = curthread;
	mutex_exit(&mi->mi_lock);

	/*
	 * We don't really need protection here against failover or
	 * migration, since the current thread is the one that would make
	 * any changes, but hold mi_recovlock anyway for completeness (and
	 * to satisfy any ASSERTs).
	 */
	(void) nfs_rw_enter_sig(&mi->mi_recovlock, RW_READER, 0);
	sp = find_nfs4_server(mi);
	if (sp != NULL)
		mutex_exit(&sp->s_lock);
	nfs_rw_exit(&mi->mi_recovlock);

	/*
	 * Do any necessary recovery, based on the information in recovp
	 * and any recovery flags.
	 */

	do {
		mutex_enter(&mi->mi_lock);
		if (FS_OR_ZONE_GONE4(mi->mi_vfsp)) {
			bool_t activesrv;

			NFS4_DEBUG(nfs4_client_recov_debug &&
			    mi->mi_vfsp->vfs_flag & VFS_UNMOUNTED, (CE_NOTE,
			    "nfs4_recov_thread: file system has been "
			    "unmounted"));
			NFS4_DEBUG(nfs4_client_recov_debug &&
			    zone_status_get(curproc->p_zone) >=
			    ZONE_IS_SHUTTING_DOWN, (CE_NOTE,
			    "nfs4_recov_thread: zone shutting down"));
			/*
			 * If the server has lost its state for us and
			 * the filesystem is unmounted, then the filesystem
			 * can be tossed, even if there are lost lock or
			 * lost state calls in the recovery queue.
			 */
			if (mi->mi_recovflags &
			    (MI4R_NEED_CLIENTID | MI4R_REOPEN_FILES)) {
				NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
				    "nfs4_recov_thread: bailing out"));
				mi->mi_flags |= MI4_RECOV_FAIL;
				mi->mi_error = recovp->rc_error;
				recov_fail = TRUE;
			}
			/*
			 * We don't know if the server has any state for
			 * us, and the filesystem has been unmounted.  If
			 * there are "lost state" recovery items, keep
			 * trying to process them until there are no more
			 * mounted filesystems for the server.  Otherwise,
			 * bail out.  The reason we don't mark the
			 * filesystem as failing recovery is in case we
			 * have to do "lost state" recovery later (e.g., a
			 * user process exits).
			 */
			if (!(mi->mi_recovflags & MI4R_LOST_STATE)) {
				done = 1;
				mutex_exit(&mi->mi_lock);
				break;
			}
			mutex_exit(&mi->mi_lock);

			if (sp == NULL)
				activesrv = FALSE;
			else {
				mutex_enter(&sp->s_lock);
				activesrv = nfs4_fs_active(sp);
			}
			if (!activesrv) {
				NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
				    "no active fs for server %p",
				    (void *)sp));
				mutex_enter(&mi->mi_lock);
				mi->mi_flags |= MI4_RECOV_FAIL;
				mi->mi_error = recovp->rc_error;
				mutex_exit(&mi->mi_lock);
				recov_fail = TRUE;
				if (sp != NULL) {
					/*
					 * Mark the server instance as
					 * dead, so that nobody will attach
					 * a new filesystem.
					 */
					nfs4_mark_srv_dead(sp);
				}
			}
			if (sp != NULL)
				mutex_exit(&sp->s_lock);
		} else {
			mutex_exit(&mi->mi_lock);
		}

		/*
		 * Check if we need to select a new server for a
		 * failover.  Choosing a new server will force at
		 * least a check of the clientid.
		 */
		mutex_enter(&mi->mi_lock);
		if (!recov_fail &&
		    (mi->mi_recovflags & MI4R_NEED_NEW_SERVER)) {
			mutex_exit(&mi->mi_lock);
			recov_newserver(recovp, &sp, &recov_fail);
		} else
			mutex_exit(&mi->mi_lock);

		/*
		 * Check if we need to recover the clientid.  This
		 * must be done before file and lock recovery, and it
		 * potentially affects the recovery threads for other
		 * filesystems, so it gets special treatment.
		 */
		if (sp != NULL && recov_fail == FALSE) {
			mutex_enter(&sp->s_lock);
			if (!(sp->s_flags & N4S_CLIENTID_SET)) {
				mutex_exit(&sp->s_lock);
				recov_clientid(recovp, sp);
			} else {
				/*
				 * Unset this flag in case another recovery
				 * thread successfully recovered the clientid
				 * for us already.
				 */
				mutex_enter(&mi->mi_lock);
				mi->mi_recovflags &= ~MI4R_NEED_CLIENTID;
				mutex_exit(&mi->mi_lock);
				mutex_exit(&sp->s_lock);
			}
		}

		/*
		 * Check if we need to get the security information.
		 */
		mutex_enter(&mi->mi_lock);
		if ((mi->mi_recovflags & MI4R_NEED_SECINFO) &&
		    !(mi->mi_flags & MI4_RECOV_FAIL)) {
			mutex_exit(&mi->mi_lock);
			(void) nfs_rw_enter_sig(&mi->mi_recovlock,
			    RW_WRITER, 0);
			error = nfs4_secinfo_recov(recovp->rc_mi,
			    recovp->rc_vp1, recovp->rc_vp2);
			/*
			 * If error, nothing more can be done, stop
			 * the recovery.
			 */
			if (error) {
				mutex_enter(&mi->mi_lock);
				mi->mi_flags |= MI4_RECOV_FAIL;
				mi->mi_error = recovp->rc_error;
				mutex_exit(&mi->mi_lock);
				nfs4_queue_event(RE_WRONGSEC, mi, NULL,
				    error, recovp->rc_vp1, recovp->rc_vp2,
				    0, NULL, 0, TAG_NONE, TAG_NONE, 0, 0);
			}
			nfs_rw_exit(&mi->mi_recovlock);
		} else
			mutex_exit(&mi->mi_lock);

		/*
		 * Check if there's a bad seqid to recover.
		 */
		mutex_enter(&mi->mi_lock);
		if ((mi->mi_recovflags & MI4R_BAD_SEQID) &&
		    !(mi->mi_flags & MI4_RECOV_FAIL)) {
			mutex_exit(&mi->mi_lock);
			(void) nfs_rw_enter_sig(&mi->mi_recovlock,
			    RW_WRITER, 0);
			recov_bad_seqid(recovp);
			nfs_rw_exit(&mi->mi_recovlock);
		} else
			mutex_exit(&mi->mi_lock);

		/*
		 * Next check for recovery that affects the entire
		 * filesystem.
		 */
		if (sp != NULL) {
			mutex_enter(&mi->mi_lock);
			if ((mi->mi_recovflags & MI4R_REOPEN_FILES) &&
			    !(mi->mi_flags & MI4_RECOV_FAIL)) {
				mutex_exit(&mi->mi_lock);
				recov_openfiles(recovp, sp);
			} else
				mutex_exit(&mi->mi_lock);
		}

		/*
		 * Send any queued state recovery requests.
		 */
		mutex_enter(&mi->mi_lock);
		if (sp != NULL &&
		    (mi->mi_recovflags & MI4R_LOST_STATE) &&
		    !(mi->mi_flags & MI4_RECOV_FAIL)) {
			mutex_exit(&mi->mi_lock);
			(void) nfs_rw_enter_sig(&mi->mi_recovlock,
			    RW_WRITER, 0);
			nfs4_resend_lost_rqsts(recovp, sp);
			if (list_head(&mi->mi_lost_state) == NULL) {
				/* done */
				mutex_enter(&mi->mi_lock);
				mi->mi_recovflags &= ~MI4R_LOST_STATE;
				mutex_exit(&mi->mi_lock);
			}
			nfs_rw_exit(&mi->mi_recovlock);
		} else {
			mutex_exit(&mi->mi_lock);
		}

		/*
		 * See if there is anything more to do.  If not, announce
		 * that we are done and exit.
		 *
		 * Need mi_recovlock to keep 'sp' valid.  Must grab
		 * mi_recovlock before mi_lock to preserve lock ordering.
		 */
		(void) nfs_rw_enter_sig(&mi->mi_recovlock, RW_READER, 0);
		mutex_enter(&mi->mi_lock);
		if ((mi->mi_recovflags & ~MI4R_SRV_REBOOT) == 0 ||
		    (mi->mi_flags & MI4_RECOV_FAIL)) {
			list_t local_lost_state;
			nfs4_lost_rqst_t *lrp;

			/*
			 * We need to remove the lost requests before we
			 * unmark the mi as no longer doing recovery to
			 * avoid a race with a new thread putting new lost
			 * requests on the same mi (and the going away
			 * thread would remove the new lost requests).
			 *
			 * Move the lost requests to a local list since
			 * nfs4_remove_lost_rqst() drops mi_lock, and
			 * dropping the mi_lock would make our check to
			 * see if recovery is done no longer valid.
			 */
			list_create(&local_lost_state,
			    sizeof (nfs4_lost_rqst_t),
			    offsetof(nfs4_lost_rqst_t, lr_node));
			list_move_tail(&local_lost_state, &mi->mi_lost_state);

			done = 1;
			mutex_exit(&mi->mi_lock);
			/*
			 * Now officially free the "moved"
			 * lost requests.
			 */
			while ((lrp = list_head(&local_lost_state)) != NULL) {
				list_remove(&local_lost_state, lrp);
				nfs4_free_lost_rqst(lrp, sp);
			}
			list_destroy(&local_lost_state);
		} else
			mutex_exit(&mi->mi_lock);
		nfs_rw_exit(&mi->mi_recovlock);

		/*
		 * If the filesystem has been forcibly unmounted, there is
		 * probably no point in retrying immediately.  Furthermore,
		 * there might be user processes waiting for a chance to
		 * queue up "lost state" requests, so that they can exit.
		 * So pause here for a moment.  Same logic for zone shutdown.
		 */
		if (!done && FS_OR_ZONE_GONE4(mi->mi_vfsp)) {
			mutex_enter(&mi->mi_lock);
			cv_broadcast(&mi->mi_failover_cv);
			mutex_exit(&mi->mi_lock);
			delay(SEC_TO_TICK(nfs4_unmount_delay));
		}

	} while (!done);

	if (sp != NULL)
		nfs4_server_rele(sp);

	/*
	 * Return all recalled delegations
	 */
	nfs4_dlistclean();

	mutex_enter(&mi->mi_lock);
	recov_done(mi, recovp);
	mutex_exit(&mi->mi_lock);

	/*
	 * Free up resources that were allocated for us.
	 */
	if (recovp->rc_vp1 != NULL)
		VN_RELE(recovp->rc_vp1);
	if (recovp->rc_vp2 != NULL)
		VN_RELE(recovp->rc_vp2);

	/* now we are done using the mi struct, signal the waiters */
	mutex_enter(&mi->mi_lock);
	mi->mi_in_recovery--;
	if (mi->mi_in_recovery == 0)
		cv_broadcast(&mi->mi_cv_in_recov);
	mutex_exit(&mi->mi_lock);

	VFS_RELE(mi->mi_vfsp);
	MI4_RELE(mi);
	kmem_free(recovp, sizeof (recov_info_t));
	mutex_enter(&cpr_lock);
	CALLB_CPR_EXIT(&cpr_info);
	mutex_destroy(&cpr_lock);
	zthread_exit();
}

/*
 * Log the end of recovery and notify any waiting threads.
 */

static void
recov_done(mntinfo4_t *mi, recov_info_t *recovp)
{

	ASSERT(MUTEX_HELD(&mi->mi_lock));

	nfs4_queue_event(RE_END, mi, NULL, 0, recovp->rc_vp1,
	    recovp->rc_vp2, 0, NULL, 0, TAG_NONE, TAG_NONE, 0, 0);
	mi->mi_recovthread = NULL;
	mi->mi_flags &= ~MI4_RECOV_ACTIV;
	mi->mi_recovflags &= ~MI4R_SRV_REBOOT;
	cv_broadcast(&mi->mi_failover_cv);
}

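/*
 * To recap the loop above: each pass through nfs4_recov_thread() handles,
 * in order, new server selection (MI4R_NEED_NEW_SERVER -> recov_newserver()),
 * clientid recovery (recov_clientid()), security info (MI4R_NEED_SECINFO ->
 * nfs4_secinfo_recov()), bad seqids (MI4R_BAD_SEQID -> recov_bad_seqid()),
 * file reopens (MI4R_REOPEN_FILES -> recov_openfiles()), and lost requests
 * (MI4R_LOST_STATE -> nfs4_resend_lost_rqsts()).  The ordering matters:
 * the later steps depend on state that the earlier steps re-establish.
 */
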
/*
 * State-specific recovery routines, by state.
 */

/*
 * Failover.
 *
 * Replaces *spp with a reference to the new server, which must
 * eventually be freed.
 */

static void
recov_newserver(recov_info_t *recovp, nfs4_server_t **spp, bool_t *recov_fail)
{
	mntinfo4_t *mi = recovp->rc_mi;
	servinfo4_t *svp = NULL;
	nfs4_server_t *osp = *spp;
	CLIENT *cl;
	enum clnt_stat status;
	struct timeval tv;
	int error;
	int oncethru = 0;
	rnode4_t *rp;
	int index;
	nfs_fh4 fh;
	char *snames;
	size_t len;

	(void) nfs_rw_enter_sig(&mi->mi_recovlock, RW_WRITER, 0);

	tv.tv_sec = 2;
	tv.tv_usec = 0;

#ifdef lint
	/*
	 * Lint can't follow the logic, so thinks that snames and len
	 * can be used before being set.  They can't, but lint can't
	 * figure it out.  To address the lint warning, initialize
	 * snames and len for lint.
	 */
	snames = NULL;
	len = 0;
#endif

	/*
	 * Ping the null NFS procedure of every server in
	 * the list until one responds.  We always start
	 * at the head of the list and always skip the one
	 * that is current, since it's caused us a problem.
	 */
	while (svp == NULL) {
		for (svp = mi->mi_servers; svp; svp = svp->sv_next) {

			mutex_enter(&mi->mi_lock);
			if (FS_OR_ZONE_GONE4(mi->mi_vfsp)) {
				mi->mi_flags |= MI4_RECOV_FAIL;
				mutex_exit(&mi->mi_lock);
				(void) nfs_rw_exit(&mi->mi_recovlock);
				*recov_fail = TRUE;
				if (oncethru)
					kmem_free(snames, len);
				return;
			}
			mutex_exit(&mi->mi_lock);

			(void) nfs_rw_enter_sig(&svp->sv_lock, RW_READER, 0);
			if (svp->sv_flags & SV4_NOTINUSE) {
				nfs_rw_exit(&svp->sv_lock);
				continue;
			}
			nfs_rw_exit(&svp->sv_lock);

			if (!oncethru && svp == mi->mi_curr_serv)
				continue;

			error = clnt_tli_kcreate(svp->sv_knconf, &svp->sv_addr,
			    NFS_PROGRAM, NFS_V4, 0, 1, CRED(), &cl);
			if (error)
				continue;

			if (!(mi->mi_flags & MI4_INT))
				cl->cl_nosignal = TRUE;
			status = CLNT_CALL(cl, RFS_NULL, xdr_void, NULL,
			    xdr_void, NULL, tv);
			if (!(mi->mi_flags & MI4_INT))
				cl->cl_nosignal = FALSE;
			AUTH_DESTROY(cl->cl_auth);
			CLNT_DESTROY(cl);
			if (status == RPC_SUCCESS) {
				nfs4_queue_event(RE_FAILOVER, mi,
				    svp == mi->mi_curr_serv ? NULL :
				    svp->sv_hostname, 0, NULL, NULL, 0,
				    NULL, 0, TAG_NONE, TAG_NONE, 0, 0);
				break;
			}
		}

		if (svp == NULL) {
			if (!oncethru) {
				snames = nfs4_getsrvnames(mi, &len);
				nfs4_queue_fact(RF_SRVS_NOT_RESPOND, mi,
				    0, 0, 0, FALSE, snames, 0, NULL);
				oncethru = 1;
			}
			delay(hz);
		}
	}

	if (oncethru) {
		nfs4_queue_fact(RF_SRVS_OK, mi, 0, 0, 0, FALSE, snames,
		    0, NULL);
		kmem_free(snames, len);
	}

#ifdef DEBUG
	(void) nfs_rw_enter_sig(&svp->sv_lock, RW_READER, 0);
	ASSERT((svp->sv_flags & SV4_NOTINUSE) == 0);
	nfs_rw_exit(&svp->sv_lock);
#endif

	mutex_enter(&mi->mi_lock);
	mi->mi_recovflags &= ~MI4R_NEED_NEW_SERVER;
	if (svp != mi->mi_curr_serv) {
		servinfo4_t *osvp = mi->mi_curr_serv;

		mutex_exit(&mi->mi_lock);

		/*
		 * Update server-dependent fields in the root vnode.
		 */
		index = rtable4hash(mi->mi_rootfh);
		rw_enter(&rtable4[index].r_lock, RW_WRITER);

		rp = r4find(&rtable4[index], mi->mi_rootfh, mi->mi_vfsp);
		if (rp != NULL) {
			NFS4_DEBUG(nfs4_client_failover_debug, (CE_NOTE,
			    "recov_newserver: remapping %s", rnode4info(rp)));
			mutex_enter(&rp->r_statelock);
			rp->r_server = svp;
			PURGE_ATTRCACHE4_LOCKED(rp);
			mutex_exit(&rp->r_statelock);
			(void) nfs4_free_data_reclaim(rp);
			nfs4_purge_rddir_cache(RTOV4(rp));
			rw_exit(&rtable4[index].r_lock);
			NFS4_DEBUG(nfs4_client_failover_debug, (CE_NOTE,
			    "recov_newserver: done with %s",
			    rnode4info(rp)));
			VN_RELE(RTOV4(rp));
		} else
			rw_exit(&rtable4[index].r_lock);
		(void) dnlc_purge_vfsp(mi->mi_vfsp, 0);

		mutex_enter(&mi->mi_lock);
		mi->mi_recovflags |= MI4R_REOPEN_FILES | MI4R_REMAP_FILES;
		if (recovp->rc_srv_reboot)
			mi->mi_recovflags |= MI4R_SRV_REBOOT;
		mi->mi_curr_serv = svp;
		mi->mi_failover++;
		mi->mi_flags &= ~MI4_BADOWNER_DEBUG;
		mutex_exit(&mi->mi_lock);

		(void) nfs_rw_enter_sig(&svp->sv_lock, RW_READER, 0);
		fh.nfs_fh4_len = svp->sv_fhandle.fh_len;
		fh.nfs_fh4_val = svp->sv_fhandle.fh_buf;
		sfh4_update(mi->mi_rootfh, &fh);
		fh.nfs_fh4_len = svp->sv_pfhandle.fh_len;
		fh.nfs_fh4_val = svp->sv_pfhandle.fh_buf;
		sfh4_update(mi->mi_srvparentfh, &fh);
		nfs_rw_exit(&svp->sv_lock);

		*spp = nfs4_move_mi(mi, osvp, svp);
		if (osp != NULL)
			nfs4_server_rele(osp);
	} else
		mutex_exit(&mi->mi_lock);
	(void) nfs_rw_exit(&mi->mi_recovlock);
}

/*
 * Clientid.
 */

static void
recov_clientid(recov_info_t *recovp, nfs4_server_t *sp)
{
	mntinfo4_t *mi = recovp->rc_mi;
	int error = 0;
	int still_stale;
	int need_new_s;

	ASSERT(sp != NULL);

	/*
	 * Acquire the recovery lock and then verify that the clientid
	 * still needs to be recovered.  (Note that s_recovlock is supposed
	 * to be acquired before s_lock.)  Since the thread holds the
	 * recovery lock, no other thread will recover the clientid.
	 */
	(void) nfs_rw_enter_sig(&sp->s_recovlock, RW_WRITER, 0);
	(void) nfs_rw_enter_sig(&mi->mi_recovlock, RW_WRITER, 0);
	mutex_enter(&sp->s_lock);
	still_stale = ((sp->s_flags & N4S_CLIENTID_SET) == 0);
	mutex_exit(&sp->s_lock);

	if (still_stale) {
		nfs4_error_t n4e;

		nfs4_error_zinit(&n4e);
		nfs4setclientid(mi, kcred, TRUE, &n4e);
		error = n4e.error;
		if (error != 0) {

			/*
			 * nfs4setclientid may have set MI4R_NEED_NEW_SERVER;
			 * if so, just return and let recov_thread drive
			 * the failover.
			 */
			mutex_enter(&mi->mi_lock);
			need_new_s = mi->mi_recovflags & MI4R_NEED_NEW_SERVER;
			mutex_exit(&mi->mi_lock);

			if (need_new_s) {
				nfs_rw_exit(&mi->mi_recovlock);
				nfs_rw_exit(&sp->s_recovlock);
				return;
			}

			nfs4_queue_event(RE_CLIENTID, mi, NULL, n4e.error, NULL,
			    NULL, n4e.stat, NULL, 0, TAG_NONE, TAG_NONE, 0, 0);
			mutex_enter(&mi->mi_lock);
			mi->mi_flags |= MI4_RECOV_FAIL;
			mi->mi_error = recovp->rc_error;
			mutex_exit(&mi->mi_lock);
			/* don't destroy the nfs4_server, let umount do it */
		}
	}

And that thread that set the 1787 * clientid will have initiated reopening files on all the 1788 * filesystems for the server, so we should not initiate 1789 * reopening for this filesystem here. 1790 */ 1791 if (still_stale) { 1792 mi->mi_recovflags |= MI4R_REOPEN_FILES; 1793 if (recovp->rc_srv_reboot) 1794 mi->mi_recovflags |= MI4R_SRV_REBOOT; 1795 } 1796 mutex_exit(&mi->mi_lock); 1797 } 1798 1799 nfs_rw_exit(&mi->mi_recovlock); 1800 1801 if (error != 0) { 1802 nfs_rw_exit(&sp->s_recovlock); 1803 mutex_enter(&mi->mi_lock); 1804 if ((mi->mi_flags & MI4_RECOV_FAIL) == 0) 1805 delay(SEC_TO_TICK(recov_err_delay)); 1806 mutex_exit(&mi->mi_lock); 1807 } else { 1808 mntinfo4_t **milist; 1809 mntinfo4_t *tmi; 1810 int nummi, i; 1811 1812 /* 1813 * Initiate recovery of open files for other filesystems. 1814 * We create an array of filesystems, rather than just 1815 * walking the filesystem list, to avoid deadlock issues 1816 * with s_lock and mi_recovlock. 1817 */ 1818 milist = make_milist(sp, &nummi); 1819 for (i = 0; i < nummi; i++) { 1820 tmi = milist[i]; 1821 if (tmi != mi) { 1822 (void) nfs_rw_enter_sig(&tmi->mi_recovlock, 1823 RW_READER, 0); 1824 start_recovery_action(NR_OPENFILES, TRUE, tmi, 1825 NULL, NULL); 1826 nfs_rw_exit(&tmi->mi_recovlock); 1827 } 1828 } 1829 free_milist(milist, nummi); 1830 1831 nfs_rw_exit(&sp->s_recovlock); 1832 } 1833 } 1834 1835 /* 1836 * Return an array of filesystems associated with the given server. The 1837 * caller should call free_milist() to free the references and memory. 1838 */ 1839 1840 static mntinfo4_t ** 1841 make_milist(nfs4_server_t *sp, int *nummip) 1842 { 1843 int nummi, i; 1844 mntinfo4_t **milist; 1845 mntinfo4_t *tmi; 1846 1847 mutex_enter(&sp->s_lock); 1848 nummi = 0; 1849 for (tmi = sp->mntinfo4_list; tmi != NULL; tmi = tmi->mi_clientid_next) 1850 nummi++; 1851 1852 milist = kmem_alloc(nummi * sizeof (mntinfo4_t *), KM_SLEEP); 1853 1854 for (i = 0, tmi = sp->mntinfo4_list; tmi != NULL; i++, 1855 tmi = tmi->mi_clientid_next) { 1856 milist[i] = tmi; 1857 VFS_HOLD(tmi->mi_vfsp); 1858 } 1859 mutex_exit(&sp->s_lock); 1860 1861 *nummip = nummi; 1862 return (milist); 1863 } 1864 1865 /* 1866 * Free the filesystem list created by make_milist(). 1867 */ 1868 1869 static void 1870 free_milist(mntinfo4_t **milist, int nummi) 1871 { 1872 mntinfo4_t *tmi; 1873 int i; 1874 1875 for (i = 0; i < nummi; i++) { 1876 tmi = milist[i]; 1877 VFS_RELE(tmi->mi_vfsp); 1878 } 1879 kmem_free(milist, nummi * sizeof (mntinfo4_t *)); 1880 } 1881 1882 /* 1883 * Filehandle 1884 */ 1885 1886 /* 1887 * Lookup the filehandle for the given vnode and update the rnode if it has 1888 * changed. 1889 * 1890 * Errors: 1891 * - if the filehandle could not be updated because of an error that 1892 * requires further recovery, initiate that recovery and return. 1893 * - if the filehandle could not be updated because of a signal, pretend we 1894 * succeeded and let someone else deal with it. 1895 * - if the filehandle could not be updated and the filesystem has been 1896 * forcibly unmounted, pretend we succeeded, and let the caller deal with 1897 * the forced unmount (to retry or not to retry, that is the question). 1898 * - if the filehandle could not be updated because of some other error, 1899 * mark the rnode bad and return. 
1900 */ 1901 static void 1902 recov_filehandle(nfs4_recov_t action, mntinfo4_t *mi, vnode_t *vp) 1903 { 1904 rnode4_t *rp = VTOR4(vp); 1905 nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS }; 1906 bool_t needrecov; 1907 1908 mutex_enter(&rp->r_statelock); 1909 1910 if (rp->r_flags & R4RECOVERR) { 1911 mutex_exit(&rp->r_statelock); 1912 return; 1913 } 1914 1915 /* 1916 * If someone else is updating the filehandle, wait for them to 1917 * finish and then let our caller retry. 1918 */ 1919 if (rp->r_flags & R4RECEXPFH) { 1920 while (rp->r_flags & R4RECEXPFH) { 1921 cv_wait(&rp->r_cv, &rp->r_statelock); 1922 } 1923 mutex_exit(&rp->r_statelock); 1924 return; 1925 } 1926 rp->r_flags |= R4RECEXPFH; 1927 mutex_exit(&rp->r_statelock); 1928 1929 if (action == NR_BADHANDLE) { 1930 /* shouldn't happen */ 1931 nfs4_queue_event(RE_BADHANDLE, mi, NULL, 0, 1932 vp, NULL, 0, NULL, 0, TAG_NONE, TAG_NONE, 0, 0); 1933 } 1934 1935 nfs4_remap_file(mi, vp, 0, &e); 1936 needrecov = nfs4_needs_recovery(&e, FALSE, mi->mi_vfsp); 1937 1938 /* 1939 * If we get BADHANDLE or FHEXPIRED while we're already handling 1940 * one of those errors, something is broken. Don't try to recover; 1941 * just mark the file dead. 1942 */ 1943 if (needrecov && e.error == 0 && (e.stat == NFS4ERR_BADHANDLE || e.stat == NFS4ERR_FHEXPIRED)) 1944 needrecov = FALSE; 1945 if (needrecov) { 1946 (void) nfs4_start_recovery(&e, mi, vp, 1947 NULL, NULL, NULL, OP_LOOKUP, NULL); 1948 } else if (e.error != EINTR && 1949 !NFS4_FRC_UNMT_ERR(e.error, mi->mi_vfsp) && 1950 (e.error != 0 || e.stat != NFS4_OK)) { 1951 nfs4_recov_fh_fail(vp, e.error, e.stat); 1952 /* 1953 * Don't set r_error to ESTALE. Higher-level code (e.g., 1954 * cstatat_getvp()) retries on ESTALE, which would cause 1955 * an infinite loop. 1956 */ 1957 } 1958 1959 mutex_enter(&rp->r_statelock); 1960 rp->r_flags &= ~R4RECEXPFH; 1961 cv_broadcast(&rp->r_cv); 1962 mutex_exit(&rp->r_statelock); 1963 } 1964 1965 /* 1966 * Stale Filehandle 1967 */ 1968 1969 /* 1970 * A stale filehandle can happen when an individual file has 1971 * been removed, or when an entire filesystem has been taken 1972 * offline.
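(Only the filesystem-wide case can be addressed by failing over to a replica; a single stale file just gets marked dead.)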
To distinguish these cases, we do this: 1973 * - if a GETATTR with the current filehandle is okay, we do 1974 * nothing (this can happen with two-filehandle ops) 1975 * - if the GETATTR fails, but a GETATTR of the root filehandle 1976 * succeeds, mark the rnode with R4STALE, which will stop use 1977 * - if the GETATTR fails, and a GETATTR of the root filehandle 1978 * also fails, we consider the problem filesystem-wide, so: 1979 * - if we can failover, we should 1980 * - if we can't failover, we should mark both the original 1981 * vnode and the root bad 1982 */ 1983 static void 1984 recov_stale(mntinfo4_t *mi, vnode_t *vp) 1985 { 1986 rnode4_t *rp = VTOR4(vp); 1987 vnode_t *rootvp = NULL; 1988 nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS }; 1989 nfs4_ga_res_t gar; 1990 char *fail_msg = "failed to recover from NFS4ERR_STALE"; 1991 bool_t needrecov; 1992 1993 mutex_enter(&rp->r_statelock); 1994 1995 if (rp->r_flags & R4RECOVERR) { 1996 mutex_exit(&rp->r_statelock); 1997 NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE, 1998 "recov_stale: already marked dead, rp %s", 1999 rnode4info(rp))); 2000 return; 2001 } 2002 2003 if (rp->r_flags & R4STALE) { 2004 mutex_exit(&rp->r_statelock); 2005 NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE, 2006 "recov_stale: already marked stale, rp %s", 2007 rnode4info(rp))); 2008 return; 2009 } 2010 2011 mutex_exit(&rp->r_statelock); 2012 2013 /* Try a GETATTR on this vnode */ 2014 nfs4_getattr_otw_norecovery(vp, &gar, &e, CRED(), 0); 2015 2016 /* 2017 * Handle non-STALE recoverable errors 2018 */ 2019 needrecov = nfs4_needs_recovery(&e, FALSE, vp->v_vfsp); 2020 if (needrecov && (e.error != 0 || e.stat != NFS4ERR_STALE)) { 2021 (void) nfs4_start_recovery(&e, mi, vp, 2022 NULL, NULL, NULL, OP_GETATTR, NULL); 2023 NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE, 2024 "recov_stale: error=%d, stat=%d seen on rp %s", 2025 e.error, e.stat, rnode4info(rp))); 2026 goto out; 2027 } 2028 2029 /* Are things OK for this vnode? */ 2030 if (!e.error && e.stat == NFS4_OK) { 2031 NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE, 2032 "recov_stale: file appears fine, rp %s", 2033 rnode4info(rp))); 2034 goto out; 2035 } 2036 2037 /* Did we get an unrelated non-recoverable error? */ 2038 if (e.error || e.stat != NFS4ERR_STALE) { 2039 nfs4_fail_recov(vp, fail_msg, e.error, e.stat); 2040 NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE, 2041 "recov_stale: unrelated fatal error, rp %s", 2042 rnode4info(rp))); 2043 goto out; 2044 } 2045 2046 /* 2047 * If we don't appear to be dealing with the root node, find it. 2048 */ 2049 if ((vp->v_flag & VROOT) == 0) { 2050 nfs4_error_zinit(&e); 2051 e.error = VFS_ROOT(vp->v_vfsp, &rootvp); 2052 if (e.error) { 2053 nfs4_fail_recov(vp, fail_msg, 0, NFS4ERR_STALE); 2054 NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE, 2055 "recov_stale: can't find root node for rp %s", 2056 rnode4info(rp))); 2057 goto out; 2058 } 2059 } 2060 2061 /* Try a GETATTR on the root vnode */ 2062 if (rootvp != NULL) { 2063 nfs4_error_zinit(&e); 2064 nfs4_getattr_otw_norecovery(rootvp, &gar, &e, CRED(), 0); 2065 2066 /* Try recovery? 
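Only if the root GETATTR failed with something other than STALE; a stale root filehandle is handled further below.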
*/ 2067 if (e.error != 0 || e.stat != NFS4ERR_STALE) { 2068 needrecov = nfs4_needs_recovery(&e, FALSE, vp->v_vfsp); 2069 if (needrecov) { 2070 (void) nfs4_start_recovery(&e, 2071 mi, rootvp, NULL, NULL, NULL, 2072 OP_GETATTR, NULL); 2073 NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE, 2074 "recov_stale: error=%d, stat=%d seen " 2075 "on rp %s", e.error, e.stat, 2076 rnode4info(rp))); 2077 } 2078 } 2079 2080 /* 2081 * Check to see if a failover attempt is warranted 2082 * NB: nfs4_try_failover doesn't check for STALE 2083 * because recov_stale gets a shot first. Now that 2084 * recov_stale has failed, go ahead and try failover. 2085 * 2086 * If the getattr on the root filehandle was successful, 2087 * then mark recovery as failed for 'vp' and exit. 2088 */ 2089 if (nfs4_try_failover(&e) == 0 && e.stat != NFS4ERR_STALE) { 2090 /* 2091 * pass the original error to fail_recov, not 2092 * the one from trying the root vnode. 2093 */ 2094 nfs4_fail_recov(vp, fail_msg, 0, NFS4ERR_STALE); 2095 NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE, 2096 "recov_stale: root node OK, marking " 2097 "dead rp %s", rnode4info(rp))); 2098 goto out; 2099 } 2100 } 2101 2102 /* 2103 * Here, we know that both the original file and the 2104 * root filehandle (which may be the same) are stale. 2105 * We want to fail over if we can, and if we can't, we 2106 * want to mark everything in sight bad. 2107 */ 2108 if (FAILOVER_MOUNT4(mi)) { 2109 mutex_enter(&mi->mi_lock); 2110 mi->mi_recovflags |= MI4R_NEED_NEW_SERVER; 2111 NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE, 2112 "recov_stale: failing over due to rp %s", 2113 rnode4info(rp))); 2114 mutex_exit(&mi->mi_lock); 2115 } else { 2116 rnode4_t *rootrp; 2117 servinfo4_t *svp; 2118 2119 /* 2120 * Can't fail over, so mark things dead. 2121 * 2122 * If rootvp is set, we know we have a distinct 2123 * non-root vnode which can be marked dead in 2124 * the usual way. 2125 * 2126 * Then we want to mark the root vnode dead. 2127 * Note that if rootvp wasn't set, our vp is 2128 * actually the root vnode. 2129 */ 2130 if (rootvp != NULL) { 2131 NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE, 2132 "recov_stale: can't fail over, marking dead rp %s", 2133 rnode4info(rp))); 2134 nfs4_fail_recov(vp, fail_msg, 0, NFS4ERR_STALE); 2135 } else { 2136 rootvp = vp; 2137 VN_HOLD(rootvp); 2138 } 2139 2140 /* 2141 * Mark root dead, but quietly - since 2142 * the root rnode is frequently recreated, 2143 * we can encounter this at every access. 2144 * Also mark recovery as failed on this VFS. 2145 */ 2146 rootrp = VTOR4(rootvp); 2147 NFS4_DEBUG(nfs4_client_recov_debug, (CE_CONT, 2148 "recov_stale: marking dead root rp %s", 2149 rnode4info(rootrp))); 2150 mutex_enter(&rootrp->r_statelock); 2151 rootrp->r_flags |= (R4RECOVERR | R4STALE); 2152 rootrp->r_error = ESTALE; 2153 mutex_exit(&rootrp->r_statelock); 2154 mutex_enter(&mi->mi_lock); 2155 mi->mi_error = ESTALE; 2156 mutex_exit(&mi->mi_lock); 2157 2158 svp = mi->mi_curr_serv; 2159 (void) nfs_rw_enter_sig(&svp->sv_lock, RW_WRITER, 0); 2160 svp->sv_flags |= SV4_ROOT_STALE; 2161 nfs_rw_exit(&svp->sv_lock); 2162 } 2163 2164 out: 2165 if (rootvp) 2166 VN_RELE(rootvp); 2167 } 2168 2169 /* 2170 * Locks. 2171 */ 2172 2173 /* 2174 * Reclaim all the active (acquired) locks for the given file. 2175 * If a process lost a lock, the process is sent a SIGLOST. This is not 2176 * considered an error. 
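* (SIGLOST delivery is best-effort: nfs4_send_siglost() does nothing if the target process has already exited.)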
2177 * 2178 * Return values: 2179 * Errors and status are returned via the nfs4_error_t parameter 2180 * If an error indicates that recovery is needed, the caller is responsible 2181 * for dealing with it. 2182 */ 2183 2184 static void 2185 relock_file(vnode_t *vp, mntinfo4_t *mi, nfs4_error_t *ep, 2186 fattr4_change pre_change) 2187 { 2188 locklist_t *locks, *llp; 2189 rnode4_t *rp; 2190 2191 ASSERT(ep != NULL); 2192 nfs4_error_zinit(ep); 2193 2194 if (VTOMI4(vp)->mi_flags & MI4_LLOCK) 2195 return; 2196 2197 nfs4_flush_lock_owners(VTOR4(vp)); 2198 2199 /* 2200 * If we get an error that requires recovery actions, just bail out 2201 * and let the top-level recovery code handle it. 2202 * 2203 * If we get some other error, kill the process that owned the lock 2204 * and mark its remaining locks (if any) as belonging to NOPID, so 2205 * that we don't make any more reclaim requests for that process. 2206 */ 2207 2208 rp = VTOR4(vp); 2209 locks = flk_active_locks_for_vp(vp); 2210 for (llp = locks; llp != NULL; llp = llp->ll_next) { 2211 int did_reclaim = 1; 2212 2213 ASSERT(llp->ll_vp == vp); 2214 if (llp->ll_flock.l_pid == NOPID) 2215 continue; 2216 reclaim_one_lock(vp, &llp->ll_flock, ep, &did_reclaim); 2217 /* 2218 * If we need to restart recovery, stop processing the 2219 * list. Some errors would be recoverable under other 2220 * circumstances, but if they happen here we just give up 2221 * on the lock. 2222 */ 2223 if (nfs4_needs_recovery(ep, TRUE, vp->v_vfsp)) { 2224 if (ep->error != 0) 2225 break; 2226 if (!nfs4_recov_marks_dead(ep->stat)) 2227 break; 2228 } 2229 /* 2230 * In case the server isn't offering us a grace period, or 2231 * if we missed it, we might have opened & locked from scratch, 2232 * rather than reopened/reclaimed. 2233 * We need to ensure that the object hadn't been otherwise 2234 * changed during this time, by comparing the changeinfo. 2235 * We get passed the changeinfo from before the reopen by our 2236 * caller, in pre_change. 2237 * The changeinfo from after the reopen is in rp->r_change, 2238 * courtesy of the GETATTR in the reopen. 2239 * If they're different, then the file has changed, and we 2240 * have to SIGLOST the app. 2241 */ 2242 if (ep->error == 0 && ep->stat == NFS4_OK && !did_reclaim) { 2243 mutex_enter(&rp->r_statelock); 2244 if (pre_change != rp->r_change) 2245 ep->stat = NFS4ERR_NO_GRACE; 2246 mutex_exit(&rp->r_statelock); 2247 } 2248 if (ep->error != 0 || ep->stat != NFS4_OK) { 2249 if (ep->error != 0) 2250 nfs4_queue_event(RE_FAIL_RELOCK, mi, 2251 NULL, ep->error, vp, NULL, 0, NULL, 2252 llp->ll_flock.l_pid, TAG_NONE, TAG_NONE, 2253 0, 0); 2254 else 2255 nfs4_queue_event(RE_FAIL_RELOCK, mi, 2256 NULL, 0, vp, NULL, ep->stat, NULL, 2257 llp->ll_flock.l_pid, TAG_NONE, TAG_NONE, 2258 0, 0); 2259 nfs4_send_siglost(llp->ll_flock.l_pid, mi, vp, TRUE, 2260 ep->error, ep->stat); 2261 relock_skip_pid(llp, llp->ll_flock.l_pid); 2262 2263 /* Reinitialize the nfs4_error and continue */ 2264 nfs4_error_zinit(ep); 2265 } 2266 } 2267 2268 if (locks != NULL) 2269 flk_free_locklist(locks); 2270 } 2271 2272 /* 2273 * Reclaim the given lock. 2274 * If the lock can't be reclaimed, the process is sent SIGLOST, but this is 2275 * not considered an error. 2276 * 2277 * Errors are returned via the nfs4_error_t parameter. 
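* If the filehandle has expired, filehandle recovery is started and the reclaim is retried; see the FHEXPIRED loop in reclaim_one_lock() below.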
2278 */ 2279 static void 2280 reclaim_one_lock(vnode_t *vp, flock64_t *flk, nfs4_error_t *ep, 2281 int *did_reclaimp) 2282 { 2283 cred_t *cr; 2284 rnode4_t *rp = VTOR4(vp); 2285 2286 cr = pid_to_cr(flk->l_pid); 2287 if (cr == NULL) { 2288 nfs4_error_zinit(ep); 2289 ep->error = ESRCH; 2290 return; 2291 } 2292 2293 do { 2294 mutex_enter(&rp->r_statelock); 2295 if (rp->r_flags & R4RECOVERR) { 2296 /* 2297 * This shouldn't affect other reclaims, so don't 2298 * return an error. 2299 */ 2300 mutex_exit(&rp->r_statelock); 2301 break; 2302 } 2303 mutex_exit(&rp->r_statelock); 2304 2305 nfs4frlock(NFS4_LCK_CTYPE_RECLAIM, vp, F_SETLK, flk, 2306 FREAD|FWRITE, 0, cr, ep, NULL, did_reclaimp); 2307 if (ep->error == 0 && ep->stat == NFS4ERR_FHEXPIRED) 2308 start_recovery_action(NR_FHEXPIRED, TRUE, VTOMI4(vp), 2309 vp, NULL); 2310 } while (ep->error == 0 && ep->stat == NFS4ERR_FHEXPIRED); 2311 2312 crfree(cr); 2313 } 2314 2315 /* 2316 * Open files. 2317 */ 2318 2319 /* 2320 * Verifies if the nfsstat4 is a valid error for marking this vnode dead. 2321 * Returns 1 if the error is valid; 0 otherwise. 2322 */ 2323 static int 2324 nfs4_valid_recov_err_for_vp(vnode_t *vp, nfsstat4 stat) 2325 { 2326 /* 2327 * We should not be marking non-regular files as dead, 2328 * except in very rare cases (eg: BADHANDLE or NFS4ERR_BADNAME). 2329 */ 2330 if (vp->v_type != VREG && stat != NFS4ERR_BADHANDLE && 2331 stat != NFS4ERR_BADNAME) 2332 return (0); 2333 2334 return (1); 2335 } 2336 2337 /* 2338 * Failed attempting to recover a filehandle. If 'stat' is valid for 'vp', 2339 * then mark the object dead. Since we've had to do a lookup for 2340 * filehandle recovery, we will mark the object dead if we got NOENT. 2341 */ 2342 static void 2343 nfs4_recov_fh_fail(vnode_t *vp, int error, nfsstat4 stat) 2344 { 2345 ASSERT(vp != NULL); 2346 2347 if ((error == 0) && (stat != NFS4ERR_NOENT) && 2348 (!nfs4_valid_recov_err_for_vp(vp, stat))) 2349 return; 2350 2351 nfs4_fail_recov(vp, "can't recover filehandle", error, stat); 2352 } 2353 2354 /* 2355 * Recovery from a "shouldn't happen" error. In the long term, we'd like 2356 * to mark only the data structure(s) that provided the bad value as being 2357 * bad. But for now we'll just mark the entire file. 2358 */ 2359 2360 static void 2361 recov_badstate(recov_info_t *recovp, vnode_t *vp, nfsstat4 stat) 2362 { 2363 ASSERT(vp != NULL); 2364 recov_throttle(recovp, vp); 2365 2366 if (!nfs4_valid_recov_err_for_vp(vp, stat)) 2367 return; 2368 2369 nfs4_fail_recov(vp, "", 0, stat); 2370 } 2371 2372 /* 2373 * Free up the information saved for a lost state request. 2374 */ 2375 static void 2376 nfs4_free_lost_rqst(nfs4_lost_rqst_t *lrp, nfs4_server_t *sp) 2377 { 2378 component4 *filep; 2379 nfs4_open_stream_t *osp; 2380 int have_sync_lock; 2381 2382 NFS4_DEBUG(nfs4_lost_rqst_debug, 2383 (CE_NOTE, "nfs4_free_lost_rqst:")); 2384 2385 switch (lrp->lr_op) { 2386 case OP_OPEN: 2387 filep = &lrp->lr_ofile; 2388 if (filep->utf8string_val) { 2389 kmem_free(filep->utf8string_val, filep->utf8string_len); 2390 filep->utf8string_val = NULL; 2391 } 2392 break; 2393 case OP_DELEGRETURN: 2394 nfs4delegreturn_cleanup(VTOR4(lrp->lr_vp), sp); 2395 break; 2396 case OP_CLOSE: 2397 osp = lrp->lr_osp; 2398 ASSERT(osp != NULL); 2399 mutex_enter(&osp->os_sync_lock); 2400 have_sync_lock = 1; 2401 if (osp->os_pending_close) { 2402 /* clean up the open file state. 
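A lost CLOSE may have left os_pending_close set; nfs4close_notw() finishes that close before the open stream reference is released below.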
*/ 2403 osp->os_pending_close = 0; 2404 nfs4close_notw(lrp->lr_vp, osp, &have_sync_lock); 2405 } 2406 if (have_sync_lock) 2407 mutex_exit(&osp->os_sync_lock); 2408 break; 2409 } 2410 2411 lrp->lr_op = 0; 2412 if (lrp->lr_oop != NULL) { 2413 open_owner_rele(lrp->lr_oop); 2414 lrp->lr_oop = NULL; 2415 } 2416 if (lrp->lr_osp != NULL) { 2417 open_stream_rele(lrp->lr_osp, VTOR4(lrp->lr_vp)); 2418 lrp->lr_osp = NULL; 2419 } 2420 if (lrp->lr_lop != NULL) { 2421 lock_owner_rele(lrp->lr_lop); 2422 lrp->lr_lop = NULL; 2423 } 2424 if (lrp->lr_flk != NULL) { 2425 kmem_free(lrp->lr_flk, sizeof (flock64_t)); 2426 lrp->lr_flk = NULL; 2427 } 2428 if (lrp->lr_vp != NULL) { 2429 VN_RELE(lrp->lr_vp); 2430 lrp->lr_vp = NULL; 2431 } 2432 if (lrp->lr_dvp != NULL) { 2433 VN_RELE(lrp->lr_dvp); 2434 lrp->lr_dvp = NULL; 2435 } 2436 if (lrp->lr_cr != NULL) { 2437 crfree(lrp->lr_cr); 2438 lrp->lr_cr = NULL; 2439 } 2440 2441 kmem_free(lrp, sizeof (nfs4_lost_rqst_t)); 2442 } 2443 2444 /* 2445 * Remove any lost state requests and free them. 2446 */ 2447 static void 2448 nfs4_remove_lost_rqsts(mntinfo4_t *mi, nfs4_server_t *sp) 2449 { 2450 nfs4_lost_rqst_t *lrp; 2451 2452 mutex_enter(&mi->mi_lock); 2453 while ((lrp = list_head(&mi->mi_lost_state)) != NULL) { 2454 list_remove(&mi->mi_lost_state, lrp); 2455 mutex_exit(&mi->mi_lock); 2456 nfs4_free_lost_rqst(lrp, sp); 2457 mutex_enter(&mi->mi_lock); 2458 } 2459 mutex_exit(&mi->mi_lock); 2460 } 2461 2462 /* 2463 * Reopen all the files for the given filesystem and reclaim any locks. 2464 */ 2465 2466 static void 2467 recov_openfiles(recov_info_t *recovp, nfs4_server_t *sp) 2468 { 2469 mntinfo4_t *mi = recovp->rc_mi; 2470 nfs4_opinst_t *reopenlist = NULL, *rep; 2471 nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS }; 2472 open_claim_type4 claim; 2473 int remap; 2474 char *fail_msg = "No such file or directory on replica"; 2475 rnode4_t *rp; 2476 fattr4_change pre_change; 2477 2478 ASSERT(sp != NULL); 2479 2480 /* 2481 * This check is to allow a 10ms pause before we reopen files; 2482 * it should allow the server time to have received the CB_NULL 2483 * reply and update its internal structures such that (if 2484 * applicable) we are granted a delegation on reopened files. 2485 */ 2486 mutex_enter(&sp->s_lock); 2487 if ((sp->s_flags & (N4S_CB_PINGED | N4S_CB_WAITER)) == 0) { 2488 sp->s_flags |= N4S_CB_WAITER; 2489 (void) cv_timedwait(&sp->wait_cb_null, &sp->s_lock, 2490 (lbolt + drv_usectohz(N4S_CB_PAUSE_TIME))); 2491 } 2492 mutex_exit(&sp->s_lock); 2493 2494 (void) nfs_rw_enter_sig(&sp->s_recovlock, RW_READER, 0); 2495 (void) nfs_rw_enter_sig(&mi->mi_recovlock, RW_WRITER, 0); 2496 2497 if (NFS4_VOLATILE_FH(mi)) { 2498 nfs4_remap_root(mi, &e, 0); 2499 if (nfs4_needs_recovery(&e, FALSE, mi->mi_vfsp)) { 2500 (void) nfs4_start_recovery(&e, mi, NULL, 2501 NULL, NULL, NULL, OP_LOOKUP, NULL); 2502 } 2503 } 2504 2505 mutex_enter(&mi->mi_lock); 2506 if (recovp->rc_srv_reboot || (mi->mi_recovflags & MI4R_SRV_REBOOT)) 2507 claim = CLAIM_PREVIOUS; 2508 else 2509 claim = CLAIM_NULL; 2510 mutex_exit(&mi->mi_lock); 2511 2512 if (e.error == 0 && e.stat == NFS4_OK) { 2513 /* 2514 * Get a snapshot of open files in the filesystem. Note 2515 * that new opens will stall until the server's grace 2516 * period is done.
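* r4releopenlist() below releases the snapshot and the references it holds.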
2517 */ 2518 reopenlist = r4mkopenlist(mi); 2519 2520 mutex_enter(&mi->mi_lock); 2521 remap = mi->mi_recovflags & MI4R_REMAP_FILES; 2522 mutex_exit(&mi->mi_lock); 2523 /* 2524 * Since we are re-establishing state on the 2525 * server, it's OK to blow away the saved lost 2526 * requests, since we don't need to reissue them. 2527 */ 2528 nfs4_remove_lost_rqsts(mi, sp); 2529 2530 for (rep = reopenlist; rep; rep = rep->re_next) { 2531 2532 if (remap) { 2533 nfs4_remap_file(mi, rep->re_vp, 2534 NFS4_REMAP_CKATTRS, &e); 2535 } 2536 if (e.error == ENOENT || e.stat == NFS4ERR_NOENT) { 2537 /* 2538 * The current server does not have the file 2539 * that is to be remapped. This is most 2540 * likely due to an improperly maintained 2541 * replica. The files that are missing from 2542 * the server will be marked dead and logged 2543 * in order to make sys admins aware of the 2544 * problem. 2545 */ 2546 nfs4_fail_recov(rep->re_vp, 2547 fail_msg, e.error, e.stat); 2548 /* 2549 * We've already handled the error so clear it. 2550 */ 2551 nfs4_error_zinit(&e); 2552 continue; 2553 } else if (e.error == 0 && e.stat == NFS4_OK) { 2554 int j; 2555 2556 rp = VTOR4(rep->re_vp); 2557 mutex_enter(&rp->r_statelock); 2558 pre_change = rp->r_change; 2559 mutex_exit(&rp->r_statelock); 2560 2561 for (j = 0; j < rep->re_numosp; j++) { 2562 nfs4_reopen(rep->re_vp, rep->re_osp[j], 2563 &e, claim, FALSE, TRUE); 2564 if (e.error != 0 || e.stat != NFS4_OK) 2565 break; 2566 } 2567 if (nfs4_needs_recovery(&e, TRUE, 2568 mi->mi_vfsp)) { 2569 (void) nfs4_start_recovery(&e, mi, 2570 rep->re_vp, NULL, NULL, NULL, 2571 OP_OPEN, NULL); 2572 break; 2573 } 2574 } 2575 #ifdef DEBUG 2576 if (nfs4_recovdelay > 0) 2577 delay(MSEC_TO_TICK(nfs4_recovdelay * 1000)); 2578 #endif 2579 if (e.error == 0 && e.stat == NFS4_OK) 2580 relock_file(rep->re_vp, mi, &e, pre_change); 2581 2582 if (nfs4_needs_recovery(&e, TRUE, mi->mi_vfsp)) 2583 (void) nfs4_start_recovery(&e, mi, 2584 rep->re_vp, NULL, NULL, NULL, OP_LOCK, 2585 NULL); 2586 if (e.error != 0 || e.stat != NFS4_OK) 2587 break; 2588 } 2589 2590 /* 2591 * Check to see if we need to remap files passed in 2592 * via the recovery arguments; this will have been 2593 * done for open files. A failure here is not fatal. 2594 */ 2595 if (remap) { 2596 nfs4_error_t ignore; 2597 nfs4_check_remap(mi, recovp->rc_vp1, NFS4_REMAP_CKATTRS, 2598 &ignore); 2599 nfs4_check_remap(mi, recovp->rc_vp2, NFS4_REMAP_CKATTRS, 2600 &ignore); 2601 } 2602 } 2603 2604 if (e.error == 0 && e.stat == NFS4_OK) { 2605 mutex_enter(&mi->mi_lock); 2606 mi->mi_recovflags &= ~(MI4R_REOPEN_FILES | MI4R_REMAP_FILES); 2607 mutex_exit(&mi->mi_lock); 2608 } 2609 2610 nfs_rw_exit(&mi->mi_recovlock); 2611 nfs_rw_exit(&sp->s_recovlock); 2612 2613 if (reopenlist != NULL) 2614 r4releopenlist(reopenlist); 2615 } 2616 2617 /* 2618 * Resend the queued lost state recovery requests for this filesystem.
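* The requests live on mi->mi_lost_state; they are resent in order and freed as each one completes. If a resend fails with an error the recovery thread can handle, we return early and let it redrive the queue.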
2619 */ 2620 2621 static void 2622 nfs4_resend_lost_rqsts(recov_info_t *recovp, nfs4_server_t *sp) 2623 { 2624 nfs4_lost_rqst_t *lrp, *tlrp; 2625 mntinfo4_t *mi = recovp->rc_mi; 2626 nfs4_error_t n4e; 2627 #ifdef NOTYET 2628 uint32_t deny_bits = 0; 2629 #endif 2630 2631 NFS4_DEBUG(nfs4_lost_rqst_debug, (CE_NOTE, "nfs4_resend_lost_rqsts")); 2632 2633 ASSERT(mi != NULL); 2634 ASSERT(nfs_rw_lock_held(&mi->mi_recovlock, RW_WRITER)); 2635 2636 mutex_enter(&mi->mi_lock); 2637 lrp = list_head(&mi->mi_lost_state); 2638 mutex_exit(&mi->mi_lock); 2639 while (lrp != NULL) { 2640 nfs4_error_zinit(&n4e); 2641 resend_one_op(lrp, &n4e, mi, sp); 2642 NFS4_DEBUG(nfs4_lost_rqst_debug, (CE_NOTE, 2643 "nfs4_resend_lost_rqsts: resend request: for vp %p got " 2644 "error %d stat %d", (void *)lrp->lr_vp, n4e.error, 2645 n4e.stat)); 2646 2647 /* 2648 * If we get a recovery error that we can actually 2649 * recover from (such as ETIMEDOUT, FHEXPIRED), we 2650 * return and let the recovery thread redrive the call. 2651 * Don't requeue unless the zone is still healthy. 2652 */ 2653 if (zone_status_get(curproc->p_zone) < ZONE_IS_SHUTTING_DOWN && 2654 nfs4_needs_recovery(&n4e, TRUE, mi->mi_vfsp) && 2655 (nfs4_try_failover(&n4e) || 2656 NFS4_FRC_UNMT_ERR(n4e.error, mi->mi_vfsp) || 2657 (n4e.error == 0 && n4e.stat != NFS4ERR_BADHANDLE && 2658 !nfs4_recov_marks_dead(n4e.stat)))) { 2659 /* 2660 * For these three errors, we want to delay a bit 2661 * instead of pounding the server into submission. 2662 * We have to do this manually; the normal 2663 * processing for these errors only works for 2664 * non-recovery requests. 2665 */ 2666 if ((n4e.error == 0 && n4e.stat == NFS4ERR_DELAY) || 2667 (n4e.error == 0 && n4e.stat == NFS4ERR_GRACE) || 2668 (n4e.error == 0 && n4e.stat == NFS4ERR_RESOURCE) || 2669 NFS4_FRC_UNMT_ERR(n4e.error, mi->mi_vfsp)) { 2670 delay(SEC_TO_TICK(nfs4err_delay_time)); 2671 } else { 2672 (void) nfs4_start_recovery(&n4e, 2673 mi, lrp->lr_dvp, lrp->lr_vp, NULL, NULL, 2674 lrp->lr_op, NULL); 2675 } 2676 return; 2677 } 2678 2679 mutex_enter(&mi->mi_lock); 2680 list_remove(&mi->mi_lost_state, lrp); 2681 tlrp = lrp; 2682 lrp = list_head(&mi->mi_lost_state); 2683 mutex_exit(&mi->mi_lock); 2684 nfs4_free_lost_rqst(tlrp, sp); 2685 } 2686 } 2687 2688 /* 2689 * Resend the given op, and issue any necessary undo call. 2690 * errors are returned via the nfs4_error_t parameter. 
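* Only OPEN needs an undo: a successfully resent OPEN is closed again via close_after_open_resend(). CLOSE and OPEN_DOWNGRADE leave nothing to undo, and lock undo is handled within resend_lock().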
2691 */ 2692 2693 static void 2694 resend_one_op(nfs4_lost_rqst_t *lrp, nfs4_error_t *ep, 2695 mntinfo4_t *mi, nfs4_server_t *sp) 2696 { 2697 vnode_t *vp; 2698 nfs4_open_stream_t *osp; 2699 cred_t *cr; 2700 uint32_t acc_bits; 2701 2702 vp = lrp->lr_vp; 2703 NFS4_DEBUG(nfs4_lost_rqst_debug, (CE_NOTE, "resend_one_op: " 2704 "have a lost open/close request for vp %p", (void *)vp)); 2705 2706 switch (lrp->lr_op) { 2707 case OP_OPEN: 2708 nfs4_resend_open_otw(&vp, lrp, ep); 2709 break; 2710 case OP_OPEN_DOWNGRADE: 2711 ASSERT(lrp->lr_oop != NULL); 2712 ep->error = nfs4_start_open_seqid_sync(lrp->lr_oop, mi); 2713 ASSERT(!ep->error); /* recov thread always succeeds */ 2714 ASSERT(lrp->lr_osp != NULL); 2715 mutex_enter(&lrp->lr_osp->os_sync_lock); 2716 nfs4_open_downgrade(lrp->lr_dg_acc, lrp->lr_dg_deny, 2717 lrp->lr_oop, lrp->lr_osp, vp, lrp->lr_cr, lrp, 2718 ep, NULL, NULL); 2719 mutex_exit(&lrp->lr_osp->os_sync_lock); 2720 nfs4_end_open_seqid_sync(lrp->lr_oop); 2721 break; 2722 case OP_CLOSE: 2723 osp = lrp->lr_osp; 2724 cr = lrp->lr_cr; 2725 acc_bits = 0; 2726 mutex_enter(&osp->os_sync_lock); 2727 if (osp->os_share_acc_read) 2728 acc_bits |= OPEN4_SHARE_ACCESS_READ; 2729 if (osp->os_share_acc_write) 2730 acc_bits |= OPEN4_SHARE_ACCESS_WRITE; 2731 mutex_exit(&osp->os_sync_lock); 2732 nfs4close_one(vp, osp, cr, acc_bits, lrp, ep, 2733 CLOSE_RESEND, 0, 0, 0); 2734 break; 2735 case OP_LOCK: 2736 case OP_LOCKU: 2737 resend_lock(lrp, ep); 2738 goto done; 2739 case OP_DELEGRETURN: 2740 nfs4_resend_delegreturn(lrp, ep, sp); 2741 goto done; 2742 default: 2743 #ifdef DEBUG 2744 cmn_err(CE_PANIC, "resend_one_op: unexpected op: %d", 2745 lrp->lr_op); 2746 #endif 2747 nfs4_queue_event(RE_LOST_STATE_BAD_OP, mi, NULL, 2748 lrp->lr_op, lrp->lr_vp, lrp->lr_dvp, NFS4_OK, NULL, 0, 2749 TAG_NONE, TAG_NONE, 0, 0); 2750 nfs4_error_init(ep, EINVAL); 2751 return; 2752 } 2753 2754 /* 2755 * No need to retry nor send an "undo" CLOSE in the 2756 * event the server rebooted. 2757 */ 2758 if (ep->error == 0 && (ep->stat == NFS4ERR_STALE_CLIENTID || 2759 ep->stat == NFS4ERR_STALE_STATEID || ep->stat == NFS4ERR_EXPIRED)) 2760 goto done; 2761 2762 /* 2763 * If we resent a CLOSE or OPEN_DOWNGRADE, there's nothing 2764 * to undo. Undoing locking operations was handled by 2765 * resend_lock(). 2766 */ 2767 if (lrp->lr_op == OP_OPEN_DOWNGRADE || lrp->lr_op == OP_CLOSE) 2768 goto done; 2769 2770 /* 2771 * If we get any other error for OPEN, then don't attempt 2772 * to undo the resend of the open (since it was never 2773 * successful!). 2774 */ 2775 ASSERT(lrp->lr_op == OP_OPEN); 2776 if (ep->error || ep->stat != NFS4_OK) 2777 goto done; 2778 2779 /* 2780 * Now let's undo our OPEN. 2781 */ 2782 nfs4_error_zinit(ep); 2783 close_after_open_resend(vp, lrp->lr_cr, lrp->lr_oacc, ep); 2784 NFS4_DEBUG(nfs4_lost_rqst_debug, (CE_NOTE, "resend_one_op: " 2785 "nfs4close_one: for vp %p got error %d stat %d", 2786 (void *)vp, ep->error, ep->stat)); 2787 2788 done: 2789 if (vp != lrp->lr_vp) 2790 VN_RELE(vp); 2791 } 2792 2793 /* 2794 * Close a file that was opened via a resent OPEN. 2795 * Most errors are passed back to the caller via the nfs4_error_t 2796 * parameter, except for FHEXPIRED, which is retried. 2797 * 2798 * It might be conceptually cleaner to push the CLOSE request onto the 2799 * front of the resend queue, rather than sending it here. That would 2800 * match the way we undo lost lock requests. On the other 2801 * hand, we've already got something that works, and there's no reason to 2802 * change it at this time.
2803 */ 2804 2805 static void 2806 close_after_open_resend(vnode_t *vp, cred_t *cr, uint32_t acc_bits, 2807 nfs4_error_t *ep) 2808 { 2809 2810 for (;;) { 2811 nfs4close_one(vp, NULL, cr, acc_bits, NULL, ep, 2812 CLOSE_AFTER_RESEND, 0, 0, 0); 2813 if (ep->error == 0 && ep->stat == NFS4_OK) 2814 break; /* success; done */ 2815 if (ep->error != 0 || ep->stat != NFS4ERR_FHEXPIRED) 2816 break; 2817 /* else retry FHEXPIRED */ 2818 } 2819 2820 } 2821 2822 /* 2823 * Resend the given lost lock request. Errors and status are returned 2824 * via the nfs4_error_t parameter. 2825 * 2826 * Issue a SIGLOST and mark the rnode dead if we get a non-recovery error or 2827 * a recovery error that we don't actually recover from yet (eg: BAD_SEQID). 2828 * Let the recovery thread redrive the call if we get a recovery error that 2829 * we can actually recover from. 2830 */ 2831 static void 2832 resend_lock(nfs4_lost_rqst_t *lrp, nfs4_error_t *ep) 2833 { 2834 bool_t send_siglost = FALSE; 2835 vnode_t *vp = lrp->lr_vp; 2836 2837 NFS4_DEBUG(nfs4_lost_rqst_debug, (CE_NOTE, "resend_lock:")); 2838 ASSERT(lrp->lr_ctype == NFS4_LCK_CTYPE_REINSTATE || 2839 lrp->lr_ctype == NFS4_LCK_CTYPE_RESEND); 2840 2841 nfs4frlock(lrp->lr_ctype, vp, F_SETLK, 2842 lrp->lr_flk, FREAD|FWRITE, 0, lrp->lr_cr, ep, lrp, NULL); 2843 2844 NFS4_DEBUG(nfs4_lost_rqst_debug, (CE_NOTE, "resend_lock: " 2845 "nfs4frlock for vp %p returned error %d, stat %d", 2846 (void *)vp, ep->error, ep->stat)); 2847 2848 if (ep->error == 0 && ep->stat == 0) 2849 goto done; 2850 if (ep->error == 0 && ep->stat == NFS4ERR_DENIED && 2851 lrp->lr_ctype == NFS4_LCK_CTYPE_RESEND) 2852 goto done; 2853 2854 /* 2855 * If we failed with a non-recovery error, send SIGLOST and 2856 * mark the file dead. 2857 */ 2858 if (!nfs4_needs_recovery(ep, TRUE, vp->v_vfsp)) 2859 send_siglost = TRUE; 2860 else { 2861 /* 2862 * Done with recovering LOST LOCK in the event the 2863 * server rebooted or we've lost the lease. 2864 */ 2865 if (ep->error == 0 && (ep->stat == NFS4ERR_STALE_CLIENTID || 2866 ep->stat == NFS4ERR_STALE_STATEID || 2867 ep->stat == NFS4ERR_EXPIRED)) { 2868 goto done; 2869 } 2870 2871 /* 2872 * BAD_STATEID on an unlock indicates that the server has 2873 * forgotten about the lock anyway, so act like the call 2874 * was successful. 2875 */ 2876 if (ep->error == 0 && ep->stat == NFS4ERR_BAD_STATEID && 2877 lrp->lr_op == OP_LOCKU) 2878 goto done; 2879 2880 /* 2881 * If we got a recovery error that we don't actually 2882 * recover from, send SIGLOST. If the filesystem was 2883 * forcibly unmounted, we skip the SIGLOST because (a) it's 2884 * unnecessary noise, and (b) there could be a new process 2885 * with the same pid as the one that had generated the lost 2886 * state request. 2887 */ 2888 if (ep->error == 0 && (ep->stat == NFS4ERR_BADHANDLE || 2889 nfs4_recov_marks_dead(ep->stat))) { 2890 if (!(vp->v_vfsp->vfs_flag & VFS_UNMOUNTED)) 2891 send_siglost = TRUE; 2892 goto done; 2893 } 2894 2895 /* 2896 * If the filesystem was forcibly unmounted, we 2897 * still need to synchronize with the server and 2898 * release state. Try again later. 2899 */ 2900 if (NFS4_FRC_UNMT_ERR(ep->error, vp->v_vfsp)) 2901 goto done; 2902 2903 /* 2904 * If we get a recovery error that we can actually 2905 * recover from (such as ETIMEDOUT, FHEXPIRED), 2906 * return and let the recovery thread redrive the call. 2907 * 2908 * For the three errors below, we want to delay a bit 2909 * instead of pounding the server into submission.
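* (The three are DELAY, GRACE, and RESOURCE. The normal backoff handling for these applies only to non-recovery requests, so it is done by hand here.)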
2910 */ 2911 if ((ep->error == 0 && ep->stat == NFS4ERR_DELAY) || 2912 (ep->error == 0 && ep->stat == NFS4ERR_GRACE) || 2913 (ep->error == 0 && ep->stat == NFS4ERR_RESOURCE)) 2914 delay(SEC_TO_TICK(recov_err_delay)); 2915 goto done; 2916 } 2917 2918 done: 2919 if (send_siglost) { 2920 cred_t *sv_cred; 2921 2922 /* 2923 * Must be root or the actual thread being issued the 2924 * SIGLOST for this to work, so just become root. 2925 */ 2926 sv_cred = curthread->t_cred; 2927 curthread->t_cred = kcred; 2928 nfs4_send_siglost(lrp->lr_flk->l_pid, VTOMI4(vp), vp, FALSE, 2929 ep->error, ep->stat); 2930 curthread->t_cred = sv_cred; 2931 2932 /* 2933 * Flush any additional reinstantiation requests for 2934 * this operation. Sending multiple SIGLOSTs to the user 2935 * process is unlikely to help and may cause trouble. 2936 */ 2937 if (lrp->lr_ctype == NFS4_LCK_CTYPE_REINSTATE) 2938 flush_reinstate(lrp); 2939 } 2940 } 2941 2942 /* 2943 * Remove any lock reinstantiation requests that correspond to the given 2944 * lost request. We only remove items that follow lrp in the queue, 2945 * assuming that lrp will be removed by the generic lost state code. 2946 */ 2947 2948 static void 2949 flush_reinstate(nfs4_lost_rqst_t *lrp) 2950 { 2951 vnode_t *vp; 2952 pid_t pid; 2953 mntinfo4_t *mi; 2954 nfs4_lost_rqst_t *nlrp; 2955 2956 vp = lrp->lr_vp; 2957 mi = VTOMI4(vp); 2958 pid = lrp->lr_flk->l_pid; 2959 2960 /* 2961 * If there are any more reinstantiation requests to get rid of, 2962 * they should all be clustered at the front of the lost state 2963 * queue. 2964 */ 2965 mutex_enter(&mi->mi_lock); 2966 for (lrp = list_next(&mi->mi_lost_state, lrp); lrp != NULL; 2967 lrp = nlrp) { 2968 nlrp = list_next(&mi->mi_lost_state, lrp); 2969 if (lrp->lr_op != OP_LOCK && lrp->lr_op != OP_LOCKU) 2970 break; 2971 if (lrp->lr_ctype != NFS4_LCK_CTYPE_REINSTATE) 2972 break; 2973 ASSERT(lrp->lr_vp == vp); 2974 ASSERT(lrp->lr_flk->l_pid == pid); 2975 NFS4_DEBUG(nfs4_lost_rqst_debug, (CE_NOTE, 2976 "remove reinstantiation %p", (void *)lrp)); 2977 list_remove(&mi->mi_lost_state, lrp); 2978 nfs4_free_lost_rqst(lrp, NULL); 2979 } 2980 mutex_exit(&mi->mi_lock); 2981 } 2982 2983 /* 2984 * End of state-specific recovery routines. 2985 */ 2986 2987 /* 2988 * Allocate a lost request struct, initialize it from lost_rqstp (including 2989 * bumping the reference counts for the referenced vnode, etc.), and hang 2990 * it off of recovp.
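* The open-file name (lr_ofile) is consumed from the caller rather than copied, and the flock64_t, if any, is duplicated so that the copy owns its own buffer.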
2991 */ 2992 2993 static void 2994 nfs4_save_lost_rqst(nfs4_lost_rqst_t *lost_rqstp, recov_info_t *recovp, 2995 nfs4_recov_t *action, mntinfo4_t *mi) 2996 { 2997 nfs4_lost_rqst_t *destp; 2998 2999 ASSERT(recovp->rc_lost_rqst == NULL); 3000 3001 destp = kmem_alloc(sizeof (nfs4_lost_rqst_t), KM_SLEEP); 3002 recovp->rc_lost_rqst = destp; 3003 3004 if (lost_rqstp->lr_op == OP_LOCK || 3005 lost_rqstp->lr_op == OP_LOCKU) { 3006 ASSERT(lost_rqstp->lr_lop); 3007 *action = NR_LOST_LOCK; 3008 destp->lr_ctype = lost_rqstp->lr_ctype; 3009 destp->lr_locktype = lost_rqstp->lr_locktype; 3010 } else if (lost_rqstp->lr_op == OP_OPEN) { 3011 component4 *srcfp, *destfp; 3012 3013 destp->lr_oacc = lost_rqstp->lr_oacc; 3014 destp->lr_odeny = lost_rqstp->lr_odeny; 3015 destp->lr_oclaim = lost_rqstp->lr_oclaim; 3016 if (lost_rqstp->lr_oclaim == CLAIM_DELEGATE_CUR) 3017 destp->lr_ostateid = lost_rqstp->lr_ostateid; 3018 3019 srcfp = &lost_rqstp->lr_ofile; 3020 destfp = &destp->lr_ofile; 3021 /* 3022 * Consume caller's utf8string 3023 */ 3024 destfp->utf8string_len = srcfp->utf8string_len; 3025 destfp->utf8string_val = srcfp->utf8string_val; 3026 srcfp->utf8string_len = 0; 3027 srcfp->utf8string_val = NULL; /* make sure not reused */ 3028 3029 *action = NR_LOST_STATE_RQST; 3030 } else if (lost_rqstp->lr_op == OP_OPEN_DOWNGRADE) { 3031 destp->lr_dg_acc = lost_rqstp->lr_dg_acc; 3032 destp->lr_dg_deny = lost_rqstp->lr_dg_deny; 3033 3034 *action = NR_LOST_STATE_RQST; 3035 } else if (lost_rqstp->lr_op == OP_CLOSE) { 3036 ASSERT(lost_rqstp->lr_oop); 3037 *action = NR_LOST_STATE_RQST; 3038 } else if (lost_rqstp->lr_op == OP_DELEGRETURN) { 3039 *action = NR_LOST_STATE_RQST; 3040 } else { 3041 #ifdef DEBUG 3042 cmn_err(CE_PANIC, "nfs4_save_lost_rqst: bad op %d", 3043 lost_rqstp->lr_op); 3044 #endif 3045 nfs4_queue_event(RE_LOST_STATE_BAD_OP, mi, NULL, 3046 lost_rqstp->lr_op, lost_rqstp->lr_vp, lost_rqstp->lr_dvp, 3047 NFS4_OK, NULL, curproc->p_pid, TAG_NONE, TAG_NONE, 0, 0); 3048 *action = NR_UNUSED; 3049 recovp->rc_lost_rqst = NULL; 3050 kmem_free(destp, sizeof (nfs4_lost_rqst_t)); 3051 return; 3052 } 3053 3054 destp->lr_op = lost_rqstp->lr_op; 3055 destp->lr_vp = lost_rqstp->lr_vp; 3056 if (destp->lr_vp) 3057 VN_HOLD(destp->lr_vp); 3058 destp->lr_dvp = lost_rqstp->lr_dvp; 3059 if (destp->lr_dvp) 3060 VN_HOLD(destp->lr_dvp); 3061 destp->lr_oop = lost_rqstp->lr_oop; 3062 if (destp->lr_oop) 3063 open_owner_hold(destp->lr_oop); 3064 destp->lr_osp = lost_rqstp->lr_osp; 3065 if (destp->lr_osp) 3066 open_stream_hold(destp->lr_osp); 3067 destp->lr_lop = lost_rqstp->lr_lop; 3068 if (destp->lr_lop) 3069 lock_owner_hold(destp->lr_lop); 3070 destp->lr_cr = lost_rqstp->lr_cr; 3071 if (destp->lr_cr) 3072 crhold(destp->lr_cr); 3073 if (lost_rqstp->lr_flk == NULL) 3074 destp->lr_flk = NULL; 3075 else { 3076 destp->lr_flk = kmem_alloc(sizeof (flock64_t), KM_SLEEP); 3077 *destp->lr_flk = *lost_rqstp->lr_flk; 3078 } 3079 destp->lr_putfirst = lost_rqstp->lr_putfirst; 3080 } 3081 3082 /* 3083 * Map the given return values (errno and nfs4 status code) to a recovery 3084 * action and fill in the following fields of recovp: rc_action, 3085 * rc_srv_reboot, rc_stateid, rc_lost_rqst. 
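* In outline: failover and lost-request cases are handled first, any other errno maps to NR_CLIENTID, and the NFS4 status codes map onto NR_* actions in the switch below (rc_srv_reboot is set when the error implies a server reboot, i.e. STALE_CLIENTID or STALE_STATEID).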
3086 */ 3087 3088 void 3089 errs_to_action(recov_info_t *recovp, 3090 nfs4_server_t *sp, mntinfo4_t *mi, stateid4 *sidp, 3091 nfs4_lost_rqst_t *lost_rqstp, int unmounted, nfs_opnum4 op, 3092 nfs4_bseqid_entry_t *bsep) 3093 { 3094 nfs4_recov_t action = NR_UNUSED; 3095 bool_t reboot = FALSE; 3096 int try_f; 3097 int error = recovp->rc_orig_errors.error; 3098 nfsstat4 stat = recovp->rc_orig_errors.stat; 3099 3100 bzero(&recovp->rc_stateid, sizeof (stateid4)); 3101 recovp->rc_lost_rqst = NULL; 3102 recovp->rc_bseqid_rqst = NULL; 3103 3104 try_f = nfs4_try_failover(&recovp->rc_orig_errors) && 3105 FAILOVER_MOUNT4(mi); 3106 3107 /* 3108 * We start recovery for EINTR only in the lost lock 3109 * or lost open/close case. 3110 */ 3111 3112 if (try_f || error == EINTR || (error == EIO && unmounted)) { 3113 recovp->rc_error = (error != 0 ? error : geterrno4(stat)); 3114 if (lost_rqstp) { 3115 ASSERT(lost_rqstp->lr_op != 0); 3116 nfs4_save_lost_rqst(lost_rqstp, recovp, &action, mi); 3117 } 3118 if (try_f) 3119 action = NR_FAILOVER; 3120 } else if (error != 0) { 3121 recovp->rc_error = error; 3122 nfs4_queue_event(RE_UNEXPECTED_ERRNO, mi, NULL, error, NULL, 3123 NULL, 0, NULL, 0, TAG_NONE, TAG_NONE, 0, 0); 3124 action = NR_CLIENTID; 3125 } else { 3126 recovp->rc_error = geterrno4(stat); 3127 switch (stat) { 3128 #ifdef notyet 3129 case NFS4ERR_LEASE_MOVED: 3130 action = xxx; 3131 break; 3132 case NFS4ERR_MOVED: 3133 action = xxx; 3134 break; 3135 #endif 3136 case NFS4ERR_BADHANDLE: 3137 action = NR_BADHANDLE; 3138 break; 3139 case NFS4ERR_BAD_SEQID: 3140 if (bsep) 3141 save_bseqid_rqst(bsep, recovp); 3142 action = NR_BAD_SEQID; 3143 break; 3144 case NFS4ERR_OLD_STATEID: 3145 action = NR_OLDSTATEID; 3146 break; 3147 case NFS4ERR_WRONGSEC: 3148 action = NR_WRONGSEC; 3149 break; 3150 case NFS4ERR_FHEXPIRED: 3151 action = NR_FHEXPIRED; 3152 break; 3153 case NFS4ERR_BAD_STATEID: 3154 if (sp == NULL || (sp != NULL && inlease(sp))) { 3155 3156 action = NR_BAD_STATEID; 3157 if (sidp) 3158 recovp->rc_stateid = *sidp; 3159 } else 3160 action = NR_CLIENTID; 3161 break; 3162 case NFS4ERR_EXPIRED: 3163 /* 3164 * The client's lease has expired, either due 3165 * to a network partition or perhaps a client 3166 * error. In either case, try an NR_CLIENTID 3167 * style recovery. reboot remains false, since 3168 * there is no evidence the server has rebooted. 3169 * This will cause CLAIM_NULL opens and lock 3170 * requests without the reclaim bit. 3171 */ 3172 action = NR_CLIENTID; 3173 3174 DTRACE_PROBE4(nfs4__expired, 3175 nfs4_server_t *, sp, 3176 mntinfo4_t *, mi, 3177 stateid4 *, sidp, int, op); 3178 3179 break; 3180 case NFS4ERR_STALE_CLIENTID: 3181 case NFS4ERR_STALE_STATEID: 3182 action = NR_CLIENTID; 3183 reboot = TRUE; 3184 break; 3185 case NFS4ERR_RESOURCE: 3186 /* 3187 * If this had been a FAILOVER mount, then 3188 * we'd have tried failover. Since it's not, 3189 * just delay a while and retry. 
3190 */ 3191 action = NR_DELAY; 3192 break; 3193 case NFS4ERR_GRACE: 3194 action = NR_GRACE; 3195 break; 3196 case NFS4ERR_DELAY: 3197 action = NR_DELAY; 3198 break; 3199 case NFS4ERR_STALE: 3200 action = NR_STALE; 3201 break; 3202 default: 3203 nfs4_queue_event(RE_UNEXPECTED_STATUS, mi, NULL, 0, 3204 NULL, NULL, stat, NULL, 0, TAG_NONE, TAG_NONE, 3205 0, 0); 3206 action = NR_CLIENTID; 3207 break; 3208 } 3209 } 3210 3211 /* make sure action got set */ 3212 ASSERT(action != NR_UNUSED); 3213 recovp->rc_srv_reboot = reboot; 3214 recovp->rc_action = action; 3215 nfs4_queue_fact(RF_ERR, mi, stat, action, op, reboot, NULL, error, 3216 NULL); 3217 } 3218 3219 /* 3220 * Return the (held) credential for the process with the given pid. 3221 * May return NULL (e.g., process not found). 3222 */ 3223 3224 static cred_t * 3225 pid_to_cr(pid_t pid) 3226 { 3227 proc_t *p; 3228 cred_t *cr; 3229 3230 mutex_enter(&pidlock); 3231 if ((p = prfind(pid)) == NULL) { 3232 mutex_exit(&pidlock); 3233 return (NULL); 3234 } 3235 3236 mutex_enter(&p->p_crlock); 3237 crhold(cr = p->p_cred); 3238 mutex_exit(&p->p_crlock); 3239 mutex_exit(&pidlock); 3240 3241 return (cr); 3242 } 3243 3244 /* 3245 * Send SIGLOST to the given process and queue the event. 3246 * 3247 * The 'dump' boolean tells us whether this action should dump the 3248 * in-kernel queue of recovery messages or not. 3249 */ 3250 3251 void 3252 nfs4_send_siglost(pid_t pid, mntinfo4_t *mi, vnode_t *vp, bool_t dump, 3253 int error, nfsstat4 stat) 3254 { 3255 proc_t *p; 3256 3257 mutex_enter(&pidlock); 3258 p = prfind(pid); 3259 if (p) 3260 psignal(p, SIGLOST); 3261 mutex_exit(&pidlock); 3262 nfs4_queue_event(dump ? RE_SIGLOST : RE_SIGLOST_NO_DUMP, mi, 3263 NULL, error, vp, NULL, stat, NULL, pid, TAG_NONE, TAG_NONE, 0, 0); 3264 } 3265 3266 /* 3267 * Scan the lock list for entries that match the given pid. Change the 3268 * pid in those that do to NOPID. 3269 */ 3270 3271 static void 3272 relock_skip_pid(locklist_t *llp, pid_t pid) 3273 { 3274 for (; llp != NULL; llp = llp->ll_next) { 3275 if (llp->ll_flock.l_pid == pid) 3276 llp->ll_flock.l_pid = NOPID; 3277 } 3278 } 3279 3280 /* 3281 * Mark a file as having failed recovery, after making a last-ditch effort 3282 * to return any delegation. 3283 * 3284 * Sets r_error to EIO or ESTALE for the given vnode. 3285 */ 3286 void 3287 nfs4_fail_recov(vnode_t *vp, char *why, int error, nfsstat4 stat) 3288 { 3289 rnode4_t *rp = VTOR4(vp); 3290 3291 #ifdef DEBUG 3292 if (nfs4_fail_recov_stop) 3293 debug_enter("nfs4_fail_recov"); 3294 #endif 3295 3296 mutex_enter(&rp->r_statelock); 3297 if (rp->r_flags & (R4RECOVERR|R4RECOVERRP)) { 3298 mutex_exit(&rp->r_statelock); 3299 return; 3300 } 3301 3302 /* 3303 * Set R4RECOVERRP to indicate that a recovery error is in 3304 * progress. This will shut down reads and writes at the top 3305 * half. Don't set R4RECOVERR until after we've returned the 3306 * delegation, otherwise it will fail. 3307 */ 3308 3309 rp->r_flags |= R4RECOVERRP; 3310 mutex_exit(&rp->r_statelock); 3311 3312 nfs4delegabandon(rp); 3313 3314 mutex_enter(&rp->r_statelock); 3315 rp->r_flags |= (R4RECOVERR | R4STALE); 3316 rp->r_error = (error == 0 && stat == NFS4ERR_STALE) ? 
ESTALE : EIO; 3317 PURGE_ATTRCACHE4_LOCKED(rp); 3318 if (!(vp->v_vfsp->vfs_flag & VFS_UNMOUNTED)) 3319 nfs4_queue_event(RE_DEAD_FILE, VTOMI4(vp), NULL, error, 3320 vp, NULL, stat, why, 0, TAG_NONE, TAG_NONE, 0, 0); 3321 mutex_exit(&rp->r_statelock); 3322 3323 dnlc_purge_vp(vp); 3324 } 3325 3326 /* 3327 * recov_throttle: if the file had the same recovery action within the 3328 * throttle interval, wait for the throttle interval to finish before 3329 * proceeding. 3330 * 3331 * Side effects: updates the rnode with the current recovery information. 3332 */ 3333 3334 static void 3335 recov_throttle(recov_info_t *recovp, vnode_t *vp) 3336 { 3337 time_t curtime, time_to_wait; 3338 rnode4_t *rp = VTOR4(vp); 3339 3340 curtime = gethrestime_sec(); 3341 3342 mutex_enter(&rp->r_statelock); 3343 NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE, 3344 "recov_throttle: now: (%d, %ld), last: (%d, %ld)", 3345 recovp->rc_action, curtime, 3346 rp->r_recov_act, rp->r_last_recov)); 3347 if (recovp->rc_action == rp->r_recov_act && 3348 rp->r_last_recov + recov_err_delay > curtime) { 3349 time_to_wait = rp->r_last_recov + recov_err_delay - curtime; 3350 mutex_exit(&rp->r_statelock); 3351 delay(SEC_TO_TICK(time_to_wait)); 3352 curtime = gethrestime_sec(); 3353 mutex_enter(&rp->r_statelock); 3354 } 3355 3356 rp->r_last_recov = curtime; 3357 rp->r_recov_act = recovp->rc_action; 3358 mutex_exit(&rp->r_statelock); 3359 } 3360 3361 /* 3362 * React to NFS4ERR_GRACE by setting the time we'll permit 3363 * the next call to this filesystem. 3364 */ 3365 void 3366 nfs4_set_grace_wait(mntinfo4_t *mi) 3367 { 3368 mutex_enter(&mi->mi_lock); 3369 /* Mark the time for the future */ 3370 mi->mi_grace_wait = gethrestime_sec() + nfs4err_delay_time; 3371 mutex_exit(&mi->mi_lock); 3372 } 3373 3374 /* 3375 * React to NFS4ERR_DELAY by setting the time we'll permit 3376 * the next call to this vnode. 3377 */ 3378 void 3379 nfs4_set_delay_wait(vnode_t *vp) 3380 { 3381 rnode4_t *rp = VTOR4(vp); 3382 3383 mutex_enter(&rp->r_statelock); 3384 /* 3385 * Calculate the amount we should delay; the initial 3386 * delay will be short, and then we will back off. 3387 */ 3388 if (rp->r_delay_interval == 0) 3389 rp->r_delay_interval = NFS4_INITIAL_DELAY_INTERVAL; 3390 else 3391 /* calculate next interval value */ 3392 rp->r_delay_interval = 3393 MIN(NFS4_MAX_DELAY_INTERVAL, (rp->r_delay_interval << 1)); 3394 rp->r_delay_wait = gethrestime_sec() + rp->r_delay_interval; 3395 mutex_exit(&rp->r_statelock); 3396 } 3397 3398 /* 3399 * The caller is responsible for freeing the returned string. 3400 */ 3401 static char * 3402 nfs4_getsrvnames(mntinfo4_t *mi, size_t *len) 3403 { 3404 servinfo4_t *svp; 3405 char *srvnames; 3406 char *namep; 3407 size_t length; 3408 3409 /* 3410 * Calculate the length of the string required to hold all 3411 * of the server names plus either a comma or a null 3412 * character following each individual one.
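* (sv_hostnamelen counts the terminating null byte, which is exactly the byte that each comma, or the final null, replaces.)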
3413 */ 3414 length = 0; 3415 for (svp = mi->mi_servers; svp != NULL; svp = svp->sv_next) { 3416 (void) nfs_rw_enter_sig(&svp->sv_lock, RW_READER, 0); 3417 if (svp->sv_flags & SV4_NOTINUSE) { 3418 nfs_rw_exit(&svp->sv_lock); 3419 continue; 3420 } 3421 nfs_rw_exit(&svp->sv_lock); 3422 length += svp->sv_hostnamelen; 3423 } 3424 3425 srvnames = kmem_alloc(length, KM_SLEEP); 3426 3427 namep = srvnames; 3428 for (svp = mi->mi_servers; svp != NULL; svp = svp->sv_next) { 3429 (void) nfs_rw_enter_sig(&svp->sv_lock, RW_READER, 0); 3430 if (svp->sv_flags & SV4_NOTINUSE) { 3431 nfs_rw_exit(&svp->sv_lock); 3432 continue; 3433 } 3434 nfs_rw_exit(&svp->sv_lock); 3435 (void) strcpy(namep, svp->sv_hostname); 3436 namep += svp->sv_hostnamelen - 1; 3437 *namep++ = ','; 3438 } 3439 *--namep = '\0'; 3440 3441 *len = length; 3442 3443 return (srvnames); 3444 } 3445 3446 static void 3447 save_bseqid_rqst(nfs4_bseqid_entry_t *bsep, recov_info_t *recovp) 3448 { 3449 nfs4_bseqid_entry_t *destp; 3450 3451 destp = kmem_alloc(sizeof (nfs4_bseqid_entry_t), KM_SLEEP); 3452 recovp->rc_bseqid_rqst = destp; 3453 3454 if (bsep->bs_oop) 3455 open_owner_hold(bsep->bs_oop); 3456 destp->bs_oop = bsep->bs_oop; 3457 if (bsep->bs_lop) 3458 lock_owner_hold(bsep->bs_lop); 3459 destp->bs_lop = bsep->bs_lop; 3460 if (bsep->bs_vp) 3461 VN_HOLD(bsep->bs_vp); 3462 destp->bs_vp = bsep->bs_vp; 3463 destp->bs_pid = bsep->bs_pid; 3464 destp->bs_tag = bsep->bs_tag; 3465 destp->bs_seqid = bsep->bs_seqid; 3466 } 3467 3468 static void 3469 free_bseqid_rqst(nfs4_bseqid_entry_t *bsep) 3470 { 3471 if (bsep->bs_oop) 3472 open_owner_rele(bsep->bs_oop); 3473 if (bsep->bs_lop) 3474 lock_owner_rele(bsep->bs_lop); 3475 if (bsep->bs_vp) 3476 VN_RELE(bsep->bs_vp); 3477 kmem_free(bsep, sizeof (nfs4_bseqid_entry_t)); 3478 } 3479 3480 /* 3481 * We don't actually fully recover from NFS4ERR_BAD_SEQID. We 3482 * simply mark the open owner and open stream (if provided) as "bad". 3483 * Then future uses of these data structures will be limited to basically 3484 * just cleaning up the internal client state (no going OTW). 3485 * 3486 * The result of this is to return errors back to the app/usr when 3487 * we receive NFS4ERR_BAD_SEQID, but also allow future/new calls to 3488 * succeed so progress can be made. 3489 */ 3490 void 3491 recov_bad_seqid(recov_info_t *recovp) 3492 { 3493 mntinfo4_t *mi = recovp->rc_mi; 3494 nfs4_open_owner_t *bad_oop; 3495 nfs4_lock_owner_t *bad_lop; 3496 vnode_t *vp; 3497 rnode4_t *rp = NULL; 3498 pid_t pid; 3499 nfs4_bseqid_entry_t *bsep, *tbsep; 3500 int error; 3501 3502 ASSERT(mi != NULL); 3503 ASSERT(nfs_rw_lock_held(&mi->mi_recovlock, RW_WRITER)); 3504 3505 mutex_enter(&mi->mi_lock); 3506 bsep = list_head(&mi->mi_bseqid_list); 3507 mutex_exit(&mi->mi_lock); 3508 3509 /* 3510 * Handle all the bad seqid entries on mi's list. 3511 */ 3512 while (bsep != NULL) { 3513 bad_oop = bsep->bs_oop; 3514 bad_lop = bsep->bs_lop; 3515 vp = bsep->bs_vp; 3516 pid = bsep->bs_pid; 3517 3518 NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE, 3519 "recov_bad_seqid: mark oop %p lop %p as bad for " 3520 "vp %p tag %s pid %d: last good seqid %d for tag %s", 3521 (void *)bad_oop, (void *)bad_lop, (void *)vp, 3522 nfs4_ctags[bsep->bs_tag].ct_str, pid, 3523 bad_oop ? bad_oop->oo_last_good_seqid : 0, 3524 bad_oop ? nfs4_ctags[bad_oop->oo_last_good_op].ct_str : 3525 nfs4_ctags[TAG_NONE].ct_str)); 3526 3527 nfs4_queue_event(RE_BAD_SEQID, mi, NULL, 3528 0, vp, NULL, NFS4ERR_BAD_SEQID, NULL, pid, bsep->bs_tag, 3529 bad_oop ? 
bad_oop->oo_last_good_op : TAG_NONE, 3530 bsep->bs_seqid, bad_oop ? bad_oop->oo_last_good_seqid : 0); 3531 3532 if (bad_oop) { 3533 /* essentially reset the open owner */ 3534 error = nfs4_start_open_seqid_sync(bad_oop, mi); 3535 ASSERT(!error); /* recov thread always succeeds */ 3536 bad_oop->oo_name = nfs4_get_new_oo_name(); 3537 bad_oop->oo_seqid = 0; 3538 nfs4_end_open_seqid_sync(bad_oop); 3539 } 3540 3541 if (bad_lop) { 3542 mutex_enter(&bad_lop->lo_lock); 3543 bad_lop->lo_flags |= NFS4_BAD_SEQID_LOCK; 3544 mutex_exit(&bad_lop->lo_lock); 3545 3546 ASSERT(vp != NULL); 3547 rp = VTOR4(vp); 3548 mutex_enter(&rp->r_statelock); 3549 rp->r_flags |= R4LODANGLERS; 3550 mutex_exit(&rp->r_statelock); 3551 3552 nfs4_send_siglost(pid, mi, vp, TRUE, 3553 0, NFS4ERR_BAD_SEQID); 3554 } 3555 3556 mutex_enter(&mi->mi_lock); 3557 list_remove(&mi->mi_bseqid_list, bsep); 3558 tbsep = bsep; 3559 bsep = list_head(&mi->mi_bseqid_list); 3560 mutex_exit(&mi->mi_lock); 3561 free_bseqid_rqst(tbsep); 3562 } 3563 3564 mutex_enter(&mi->mi_lock); 3565 mi->mi_recovflags &= ~MI4R_BAD_SEQID; 3566 mutex_exit(&mi->mi_lock); 3567 } 3568