/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#pragma ident	"%Z%%M%	%I%	%E% SMI"

/*
 * NFS Version 4 state recovery code.
 */

#include <nfs/nfs4_clnt.h>
#include <nfs/nfs4.h>
#include <nfs/rnode4.h>
#include <sys/cmn_err.h>
#include <sys/cred.h>
#include <sys/systm.h>
#include <sys/flock.h>
#include <sys/dnlc.h>
#include <sys/ddi.h>
#include <sys/disp.h>
#include <sys/list.h>
#include <sys/sdt.h>

extern r4hashq_t *rtable4;

/*
 * Information that describes what needs to be done for recovery.  It is
 * passed to a client recovery thread as well as passed to various recovery
 * routines.  rc_mi, rc_vp1, and rc_vp2 refer to the filesystem and
 * vnode(s) affected by recovery.  rc_vp1 and rc_vp2 are held references
 * (via VN_HOLD) or NULL.  rc_lost_rqst contains information about the
 * lost lock or open/close request, and it holds reference counts for the
 * various objects (vnode, etc.).  The recovery thread also uses flags set
 * in the mntinfo4_t or vnode_t to tell it what to do.  rc_error is used
 * to save the error that originally triggered the recovery event -- it
 * will later be used to set mi_error if recovery doesn't work.
 * rc_bseqid_rqst contains information about the request that got
 * NFS4ERR_BAD_SEQID, and it holds reference counts for the various
 * objects (vnode, open owner, open stream, lock owner).
 */

typedef struct {
	mntinfo4_t *rc_mi;
	vnode_t *rc_vp1;
	vnode_t *rc_vp2;
	nfs4_recov_t rc_action;
	stateid4 rc_stateid;
	bool_t rc_srv_reboot;		/* server has rebooted */
	nfs4_lost_rqst_t *rc_lost_rqst;
	nfs4_error_t rc_orig_errors;	/* original errors causing recovery */
	int rc_error;
	nfs4_bseqid_entry_t *rc_bseqid_rqst;
} recov_info_t;
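
/*
 * Lifecycle note (summarizing the code below): a recov_info_t is
 * allocated (kmem_alloc()/kmem_zalloc()) in nfs4_start_recovery() or
 * start_recovery_action(), filled in (errs_to_action() maps the
 * triggering errors to an rc_action), and then either handed to a new
 * recovery thread via zthread_create() or freed in start_recovery()
 * if no thread turns out to be needed.  The recovery thread frees it
 * when it exits.
 */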

/*
 * How long to wait before trying again if there is an error doing
 * recovery, in seconds.
 */
static int recov_err_delay = 1;

/*
 * How long to wait when processing NFS4ERR_GRACE or NFS4ERR_DELAY
 * errors.  Expressed in seconds.  Default is defined as
 * NFS4ERR_DELAY_TIME and this variable is initialized in nfs4_subr_init().
 */
time_t nfs4err_delay_time = 0;

/*
 * Tuneable to limit how many times "exempt" ops go OTW
 * after a recovery error.  Exempt op hints are OH_CLOSE,
 * OH_LOCKU, OH_DELEGRETURN.  These previously always went
 * OTW even after the rnode was "dead" due to recovery errors.
 *
 * The tuneable below limits the number of times a start_fop
 * invocation will retry the exempt hints.  After the limit
 * is reached, nfs4_start_fop will return an error just like
 * it would for non-exempt op hints.
 */
int nfs4_max_recov_error_retry = 3;

/*
 * Number of seconds the recovery thread should pause before retry when the
 * filesystem has been forcibly unmounted.
 */
int nfs4_unmount_delay = 1;

#ifdef DEBUG

/*
 * How long to wait (in seconds) between recovery operations on a given
 * file.  Normally zero, but could be set longer for testing purposes.
 */
static int nfs4_recovdelay = 0;

/*
 * Switch that controls whether to go into the debugger when recovery
 * fails.
 */
static int nfs4_fail_recov_stop = 0;

/*
 * Tuneables to debug client namespace interaction with server
 * mount points:
 *
 * nfs4_srvmnt_fail_cnt:
 *	number of times EACCES was returned because the client
 *	attempted to cross a server mountpoint
 *
 * nfs4_srvmnt_debug:
 *	trigger a console printf whenever the client attempts
 *	to cross a server mountpoint
 */
int nfs4_srvmnt_fail_cnt = 0;
int nfs4_srvmnt_debug = 0;
#endif

/* forward references, in alphabetic order */
static void close_after_open_resend(vnode_t *, cred_t *, uint32_t,
	nfs4_error_t *);
static void errs_to_action(recov_info_t *,
	nfs4_server_t *, mntinfo4_t *, stateid4 *, nfs4_lost_rqst_t *, int,
	nfs_opnum4, nfs4_bseqid_entry_t *);
static void flush_reinstate(nfs4_lost_rqst_t *);
static void free_milist(mntinfo4_t **, int);
static mntinfo4_t **make_milist(nfs4_server_t *, int *);
static int nfs4_check_recov_err(vnode_t *, nfs4_op_hint_t,
	nfs4_recov_state_t *, int, char *);
static char *nfs4_getsrvnames(mntinfo4_t *, size_t *);
static void nfs4_recov_fh_fail(vnode_t *, int, nfsstat4);
static void nfs4_recov_thread(recov_info_t *);
static void nfs4_remove_lost_rqsts(mntinfo4_t *, nfs4_server_t *);
static void nfs4_resend_lost_rqsts(recov_info_t *, nfs4_server_t *);
static cred_t *pid_to_cr(pid_t);
static void reclaim_one_lock(vnode_t *, flock64_t *, nfs4_error_t *, int *);
static void recov_bad_seqid(recov_info_t *);
static void recov_badstate(recov_info_t *, vnode_t *, nfsstat4);
static void recov_clientid(recov_info_t *, nfs4_server_t *);
static void recov_done(mntinfo4_t *, recov_info_t *);
static void recov_filehandle(nfs4_recov_t, mntinfo4_t *, vnode_t *);
static void recov_newserver(recov_info_t *, nfs4_server_t **, bool_t *);
static void recov_openfiles(recov_info_t *, nfs4_server_t *);
static void recov_stale(mntinfo4_t *, vnode_t *);
static void nfs4_free_lost_rqst(nfs4_lost_rqst_t *, nfs4_server_t *);
static void recov_throttle(recov_info_t *, vnode_t *);
static void relock_skip_pid(locklist_t *, pid_t);
static void resend_lock(nfs4_lost_rqst_t *, nfs4_error_t *);
static void resend_one_op(nfs4_lost_rqst_t *, nfs4_error_t *, mntinfo4_t *,
	nfs4_server_t *);
static void save_bseqid_rqst(nfs4_bseqid_entry_t *, recov_info_t *);
static void start_recovery(recov_info_t *, mntinfo4_t *, vnode_t *, vnode_t *,
	nfs4_server_t *);
static void start_recovery_action(nfs4_recov_t, bool_t, mntinfo4_t *,
	vnode_t *, vnode_t *);
static int wait_for_recovery(mntinfo4_t *, nfs4_op_hint_t);
182 * "stateful" indicates whether the call that got the error establishes or 183 * removes state on the server (open, close, lock, unlock, delegreturn). 184 */ 185 186 int 187 nfs4_needs_recovery(nfs4_error_t *ep, bool_t stateful, vfs_t *vfsp) 188 { 189 int recov = 0; 190 mntinfo4_t *mi; 191 192 /* 193 * Try failover if the error values justify it and if 194 * it's a failover mount. Don't try if the mount is in 195 * progress, failures are handled explicitly by nfs4rootvp. 196 */ 197 if (nfs4_try_failover(ep)) { 198 mi = VFTOMI4(vfsp); 199 mutex_enter(&mi->mi_lock); 200 recov = FAILOVER_MOUNT4(mi) && !(mi->mi_flags & MI4_MOUNTING); 201 mutex_exit(&mi->mi_lock); 202 if (recov) 203 return (recov); 204 } 205 206 if (ep->error == EINTR || NFS4_FRC_UNMT_ERR(ep->error, vfsp)) { 207 /* 208 * The server may have gotten the request, so for stateful 209 * ops we need to resynchronize and possibly back out the 210 * op. 211 */ 212 return (stateful); 213 } 214 if (ep->error != 0) 215 return (0); 216 217 /* stat values are listed alphabetically */ 218 /* 219 * There are two lists here: the errors for which we have code, and 220 * the errors for which we plan to have code before FCS. For the 221 * second list, print a warning message but don't attempt recovery. 222 */ 223 switch (ep->stat) { 224 case NFS4ERR_BADHANDLE: 225 case NFS4ERR_BAD_SEQID: 226 case NFS4ERR_BAD_STATEID: 227 case NFS4ERR_DELAY: 228 case NFS4ERR_EXPIRED: 229 case NFS4ERR_FHEXPIRED: 230 case NFS4ERR_GRACE: 231 case NFS4ERR_OLD_STATEID: 232 case NFS4ERR_RESOURCE: 233 case NFS4ERR_STALE_CLIENTID: 234 case NFS4ERR_STALE_STATEID: 235 case NFS4ERR_WRONGSEC: 236 case NFS4ERR_STALE: 237 recov = 1; 238 break; 239 #ifdef DEBUG 240 case NFS4ERR_LEASE_MOVED: 241 case NFS4ERR_MOVED: 242 zcmn_err(VFTOMI4(vfsp)->mi_zone->zone_id, 243 CE_WARN, "!Can't yet recover from NFS status %d", 244 ep->stat); 245 break; 246 #endif 247 } 248 249 return (recov); 250 } 251 252 /* 253 * Some operations such as DELEGRETURN want to avoid invoking 254 * recovery actions that will only mark the file dead. If 255 * better handlers are invoked for any of these errors, this 256 * routine should be modified. 257 */ 258 int 259 nfs4_recov_marks_dead(nfsstat4 status) 260 { 261 if (status == NFS4ERR_BAD_SEQID || 262 status == NFS4ERR_EXPIRED || 263 status == NFS4ERR_BAD_STATEID || 264 status == NFS4ERR_OLD_STATEID) 265 return (1); 266 return (0); 267 } 268 269 /* 270 * Transfer the state recovery information in recovp to mi's resend queue, 271 * and mark mi as having a lost state request. 272 */ 273 static void 274 nfs4_enqueue_lost_rqst(recov_info_t *recovp, mntinfo4_t *mi) 275 { 276 nfs4_lost_rqst_t *lrp = recovp->rc_lost_rqst; 277 278 ASSERT(nfs_rw_lock_held(&mi->mi_recovlock, RW_READER) || 279 nfs_rw_lock_held(&mi->mi_recovlock, RW_WRITER)); 280 281 ASSERT(lrp != NULL && lrp->lr_op != 0); 282 283 NFS4_DEBUG(nfs4_lost_rqst_debug, (CE_NOTE, 284 "nfs4_enqueue_lost_rqst %p, op %d", 285 (void *)lrp, lrp->lr_op)); 286 287 mutex_enter(&mi->mi_lock); 288 mi->mi_recovflags |= MI4R_LOST_STATE; 289 if (lrp->lr_putfirst) 290 list_insert_head(&mi->mi_lost_state, lrp); 291 else 292 list_insert_tail(&mi->mi_lost_state, lrp); 293 recovp->rc_lost_rqst = NULL; 294 mutex_exit(&mi->mi_lock); 295 296 nfs4_queue_event(RE_LOST_STATE, mi, NULL, lrp->lr_op, lrp->lr_vp, 297 lrp->lr_dvp, 0, NULL, 0, TAG_NONE, TAG_NONE, 0, 0); 298 } 299 300 /* 301 * Transfer the bad seqid recovery information in recovp to mi's 302 * bad seqid queue, and mark mi as having a bad seqid request. 

/*
 * Transfer the bad seqid recovery information in recovp to mi's
 * bad seqid queue, and mark mi as having a bad seqid request.
 */
void
enqueue_bseqid_rqst(recov_info_t *recovp, mntinfo4_t *mi)
{
	ASSERT(nfs_rw_lock_held(&mi->mi_recovlock, RW_READER) ||
	    nfs_rw_lock_held(&mi->mi_recovlock, RW_WRITER));
	ASSERT(recovp->rc_bseqid_rqst != NULL);

	mutex_enter(&mi->mi_lock);
	mi->mi_recovflags |= MI4R_BAD_SEQID;
	list_insert_tail(&mi->mi_bseqid_list, recovp->rc_bseqid_rqst);
	recovp->rc_bseqid_rqst = NULL;
	mutex_exit(&mi->mi_lock);
}

/*
 * Initiate recovery.
 *
 * The nfs4_error_t contains the return codes that triggered a recovery
 * attempt.  mi, vp1, and vp2 refer to the filesystem and files that were
 * being operated on.  vp1 and vp2 may be NULL.
 *
 * Multiple calls are okay.  If recovery is already underway, the call
 * updates the information about what state needs recovery but does not
 * start a new thread.  The caller should hold mi->mi_recovlock as a reader
 * for proper synchronization with any recovery thread.
 *
 * This will return TRUE if recovery was aborted, and FALSE otherwise.
 */
bool_t
nfs4_start_recovery(nfs4_error_t *ep, mntinfo4_t *mi, vnode_t *vp1,
    vnode_t *vp2, stateid4 *sid, nfs4_lost_rqst_t *lost_rqstp, nfs_opnum4 op,
    nfs4_bseqid_entry_t *bsep)
{
	recov_info_t *recovp;
	nfs4_server_t *sp;
	bool_t abort = FALSE;
	bool_t gone = FALSE;

	ASSERT(nfs_zone() == mi->mi_zone);
	mutex_enter(&mi->mi_lock);
	/*
	 * If there is lost state, we need to kick off recovery even if the
	 * filesystem has been unmounted or the zone is shutting down.
	 */
	gone = FS_OR_ZONE_GONE4(mi->mi_vfsp);
	if (gone) {
		ASSERT(ep->error != EINTR || lost_rqstp != NULL);
		if (ep->error == EIO && lost_rqstp == NULL) {
			/* failed due to forced unmount, no new lost state */
			abort = TRUE;
		}
		if ((ep->error == 0 || ep->error == ETIMEDOUT) &&
		    !(mi->mi_recovflags & MI4R_LOST_STATE)) {
			/* some other failure, no existing lost state */
			abort = TRUE;
		}
		if (abort) {
			mutex_exit(&mi->mi_lock);
			NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
			    "nfs4_start_recovery: fs unmounted"));
			return (TRUE);
		}
	}
	mi->mi_in_recovery++;
	mutex_exit(&mi->mi_lock);

	recovp = kmem_alloc(sizeof (recov_info_t), KM_SLEEP);
	recovp->rc_orig_errors = *ep;
	sp = find_nfs4_server(mi);
	errs_to_action(recovp, sp, mi, sid, lost_rqstp, gone, op, bsep);
	if (sp != NULL)
		mutex_exit(&sp->s_lock);
	start_recovery(recovp, mi, vp1, vp2, sp);
	if (sp != NULL)
		nfs4_server_rele(sp);
	return (FALSE);
}
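
/*
 * Illustrative calling pattern (a sketch only; compare the real callers
 * of nfs4_start_recovery() later in this file, e.g. recov_filehandle()
 * and recov_stale()).  An over-the-wire routine typically does:
 *
 *	nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };
 *
 *	... issue the OTW call, filling in 'e' ...
 *	if (nfs4_needs_recovery(&e, FALSE, mi->mi_vfsp))
 *		(void) nfs4_start_recovery(&e, mi, vp, NULL, NULL,
 *		    NULL, OP_GETATTR, NULL);
 *
 * OP_GETATTR and the vnode arguments above are placeholders; real
 * callers pass the operation and vnode(s) that took the error, plus a
 * stateid, lost request, or bad-seqid entry where one applies.
 */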

/*
 * Internal version of nfs4_start_recovery.  The difference is that the
 * caller specifies the recovery action, rather than the errors leading to
 * recovery.
 */
static void
start_recovery_action(nfs4_recov_t what, bool_t reboot, mntinfo4_t *mi,
    vnode_t *vp1, vnode_t *vp2)
{
	recov_info_t *recovp;

	ASSERT(nfs_zone() == mi->mi_zone);
	mutex_enter(&mi->mi_lock);
	mi->mi_in_recovery++;
	mutex_exit(&mi->mi_lock);

	recovp = kmem_zalloc(sizeof (recov_info_t), KM_SLEEP);
	recovp->rc_action = what;
	recovp->rc_srv_reboot = reboot;
	recovp->rc_error = EIO;
	start_recovery(recovp, mi, vp1, vp2, NULL);
}

static void
start_recovery(recov_info_t *recovp, mntinfo4_t *mi,
    vnode_t *vp1, vnode_t *vp2, nfs4_server_t *sp)
{
	NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
	    "start_recovery: mi %p, what %s", (void *)mi,
	    nfs4_recov_action_to_str(recovp->rc_action)));

	/*
	 * Bump the reference on the vfs so that we can pass it to the
	 * recovery thread.
	 */
	VFS_HOLD(mi->mi_vfsp);
	MI4_HOLD(mi);
again:
	switch (recovp->rc_action) {
	case NR_FAILOVER:
		ASSERT(nfs_rw_lock_held(&mi->mi_recovlock, RW_READER) ||
		    nfs_rw_lock_held(&mi->mi_recovlock, RW_WRITER));
		if (mi->mi_servers->sv_next == NULL)
			goto out_no_thread;
		mutex_enter(&mi->mi_lock);
		mi->mi_recovflags |= MI4R_NEED_NEW_SERVER;
		mutex_exit(&mi->mi_lock);

		if (recovp->rc_lost_rqst != NULL)
			nfs4_enqueue_lost_rqst(recovp, mi);
		break;

	case NR_CLIENTID:
		/*
		 * If the filesystem has been unmounted, punt.
		 */
		if (sp == NULL)
			goto out_no_thread;

		/*
		 * If nobody else is working on the clientid, mark the
		 * clientid as being no longer set.  Then mark the specific
		 * filesystem being worked on.
		 */
		if (!nfs4_server_in_recovery(sp)) {
			mutex_enter(&sp->s_lock);
			sp->s_flags &= ~N4S_CLIENTID_SET;
			mutex_exit(&sp->s_lock);
		}
		ASSERT(nfs_rw_lock_held(&mi->mi_recovlock, RW_READER) ||
		    nfs_rw_lock_held(&mi->mi_recovlock, RW_WRITER));
		mutex_enter(&mi->mi_lock);
		mi->mi_recovflags |= MI4R_NEED_CLIENTID;
		if (recovp->rc_srv_reboot)
			mi->mi_recovflags |= MI4R_SRV_REBOOT;
		mutex_exit(&mi->mi_lock);
		break;

	case NR_OPENFILES:
		ASSERT(nfs_rw_lock_held(&mi->mi_recovlock, RW_READER) ||
		    nfs_rw_lock_held(&mi->mi_recovlock, RW_WRITER));
		mutex_enter(&mi->mi_lock);
		mi->mi_recovflags |= MI4R_REOPEN_FILES;
		if (recovp->rc_srv_reboot)
			mi->mi_recovflags |= MI4R_SRV_REBOOT;
		mutex_exit(&mi->mi_lock);
		break;

	case NR_WRONGSEC:
		ASSERT(nfs_rw_lock_held(&mi->mi_recovlock, RW_READER) ||
		    nfs_rw_lock_held(&mi->mi_recovlock, RW_WRITER));
		mutex_enter(&mi->mi_lock);
		mi->mi_recovflags |= MI4R_NEED_SECINFO;
		mutex_exit(&mi->mi_lock);
		break;

	case NR_EXPIRED:
		if (vp1 != NULL)
			recov_badstate(recovp, vp1, NFS4ERR_EXPIRED);
		if (vp2 != NULL)
			recov_badstate(recovp, vp2, NFS4ERR_EXPIRED);
		goto out_no_thread;	/* no further recovery possible */

	case NR_BAD_STATEID:
		if (vp1 != NULL)
			recov_badstate(recovp, vp1, NFS4ERR_BAD_STATEID);
		if (vp2 != NULL)
			recov_badstate(recovp, vp2, NFS4ERR_BAD_STATEID);
		goto out_no_thread;	/* no further recovery possible */

	case NR_FHEXPIRED:
	case NR_BADHANDLE:
		if (vp1 != NULL)
			recov_throttle(recovp, vp1);
		if (vp2 != NULL)
			recov_throttle(recovp, vp2);
		/*
		 * Recover the filehandle now, rather than using a
		 * separate thread.  We can do this because filehandle
		 * recovery is independent of any other state, and because
		 * we know that we are not competing with the recovery
		 * thread at this time.  recov_filehandle will deal with
		 * threads that are competing to recover this filehandle.
		 */
		ASSERT(nfs_rw_lock_held(&mi->mi_recovlock, RW_READER) ||
		    nfs_rw_lock_held(&mi->mi_recovlock, RW_WRITER));
		if (vp1 != NULL)
			recov_filehandle(recovp->rc_action, mi, vp1);
		if (vp2 != NULL)
			recov_filehandle(recovp->rc_action, mi, vp2);
		goto out_no_thread;	/* no further recovery needed */

	case NR_STALE:
		/*
		 * NFS4ERR_STALE handling
		 * recov_stale() could set MI4R_NEED_NEW_SERVER to
		 * indicate that we can and should failover.
		 */
		ASSERT(nfs_rw_lock_held(&mi->mi_recovlock, RW_READER) ||
		    nfs_rw_lock_held(&mi->mi_recovlock, RW_WRITER));

		if (vp1 != NULL)
			recov_stale(mi, vp1);
		if (vp2 != NULL)
			recov_stale(mi, vp2);
		mutex_enter(&mi->mi_lock);
		if ((mi->mi_recovflags & MI4R_NEED_NEW_SERVER) == 0) {
			mutex_exit(&mi->mi_lock);
			goto out_no_thread;
		}
		mutex_exit(&mi->mi_lock);
		recovp->rc_action = NR_FAILOVER;
		goto again;

	case NR_BAD_SEQID:
		if (recovp->rc_bseqid_rqst) {
			enqueue_bseqid_rqst(recovp, mi);
			break;
		}

		if (vp1 != NULL)
			recov_badstate(recovp, vp1, NFS4ERR_BAD_SEQID);
		if (vp2 != NULL)
			recov_badstate(recovp, vp2, NFS4ERR_BAD_SEQID);
		goto out_no_thread;	/* no further recovery possible */

	case NR_OLDSTATEID:
		if (vp1 != NULL)
			recov_badstate(recovp, vp1, NFS4ERR_OLD_STATEID);
		if (vp2 != NULL)
			recov_badstate(recovp, vp2, NFS4ERR_OLD_STATEID);
		goto out_no_thread;	/* no further recovery possible */

	case NR_GRACE:
		nfs4_set_grace_wait(mi);
		goto out_no_thread;	/* no further action for GRACE */

	case NR_DELAY:
		if (vp1)
			nfs4_set_delay_wait(vp1);
		goto out_no_thread;	/* no further action for DELAY */

	case NR_LOST_STATE_RQST:
	case NR_LOST_LOCK:
		nfs4_enqueue_lost_rqst(recovp, mi);
		break;

	default:
		nfs4_queue_event(RE_UNEXPECTED_ACTION, mi, NULL,
		    recovp->rc_action, NULL, NULL, 0, NULL, 0, TAG_NONE,
		    TAG_NONE, 0, 0);
		goto out_no_thread;
	}

	/*
	 * If either file recently went through the same recovery, wait
	 * awhile.  This is in case there is some sort of bug; we might not
	 * be able to recover properly, but at least we won't bombard the
	 * server with calls, and we won't tie up the client.
	 */
	if (vp1 != NULL)
		recov_throttle(recovp, vp1);
	if (vp2 != NULL)
		recov_throttle(recovp, vp2);

	/*
	 * If there's already a recovery thread, don't start another one.
	 */
	mutex_enter(&mi->mi_lock);
	if (mi->mi_flags & MI4_RECOV_ACTIV) {
		mutex_exit(&mi->mi_lock);
		goto out_no_thread;
	}
	mi->mi_flags |= MI4_RECOV_ACTIV;
	mutex_exit(&mi->mi_lock);
	NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
	    "start_recovery: starting new thread for mi %p", (void *)mi));

	recovp->rc_mi = mi;
	recovp->rc_vp1 = vp1;
	if (vp1 != NULL) {
		ASSERT(VTOMI4(vp1) == mi);
		VN_HOLD(recovp->rc_vp1);
	}
	recovp->rc_vp2 = vp2;
	if (vp2 != NULL) {
		ASSERT(VTOMI4(vp2) == mi);
		VN_HOLD(recovp->rc_vp2);
	}

	(void) zthread_create(NULL, 0, nfs4_recov_thread, recovp, 0,
	    minclsyspri);
	return;

	/* not reached by thread creating call */
out_no_thread:
	mutex_enter(&mi->mi_lock);
	mi->mi_in_recovery--;
	if (mi->mi_in_recovery == 0)
		cv_broadcast(&mi->mi_cv_in_recov);
	mutex_exit(&mi->mi_lock);

	VFS_RELE(mi->mi_vfsp);
	MI4_RELE(mi);
	/*
	 * Free up resources that were allocated for us.
	 */
	kmem_free(recovp, sizeof (recov_info_t));
}

static int
nfs4_check_recov_err(vnode_t *vp, nfs4_op_hint_t op,
    nfs4_recov_state_t *rsp, int retry_err_cnt, char *str)
{
	rnode4_t *rp;
	int error = 0;
	int exempt;

	if (vp == NULL)
		return (0);

	exempt = (op == OH_CLOSE || op == OH_LOCKU || op == OH_DELEGRETURN);
	rp = VTOR4(vp);
	mutex_enter(&rp->r_statelock);

	/*
	 * If there was a recovery error, then allow op hints "exempt" from
	 * recov errors to retry (currently 3 times).  Either r_error or
	 * EIO is returned for non-exempt op hints.
	 */
	if (rp->r_flags & R4RECOVERR) {
		if (exempt && rsp->rs_num_retry_despite_err <=
		    nfs4_max_recov_error_retry) {

			/*
			 * Check to make sure that we haven't already inc'd
			 * rs_num_retry_despite_err for the current
			 * nfs4_start_fop instance.  We don't want to double
			 * inc (if we were called with vp2, then the vp1
			 * call could have already incremented).
			 */
			if (retry_err_cnt == rsp->rs_num_retry_despite_err)
				rsp->rs_num_retry_despite_err++;

			NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
			    "nfs4_start_fop: %s %p DEAD, cnt=%d", str,
			    (void *)vp, rsp->rs_num_retry_despite_err));
		} else {
			error = (rp->r_error ? rp->r_error : EIO);
			/*
			 * An ESTALE error on a non-regular file is not
			 * "sticky".  Return the ESTALE error once, but
			 * clear the condition to allow future operations
			 * to go OTW.  This will allow the client to
			 * recover if the server has merely unshared then
			 * re-shared the file system.  For regular files,
			 * the unshare has destroyed the open state at the
			 * server and we aren't willing to do a reopen (yet).
			 */
			if (error == ESTALE && vp->v_type != VREG) {
				rp->r_flags &=
				    ~(R4RECOVERR|R4RECOVERRP|R4STALE);
				rp->r_error = 0;
				error = ESTALE;
			}
			NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
			    "nfs4_start_fop: %s %p DEAD, cnt=%d error=%d",
			    str, (void *)vp,
			    rsp->rs_num_retry_despite_err, error));
		}
	}

	mutex_exit(&rp->r_statelock);
	return (error);
}

/*
 * Initial setup code that every operation should call if it might invoke
 * client recovery.  Can block waiting for recovery to finish on a
 * filesystem.  Either vnode ptr can be NULL.
 *
 * Returns 0 if there are no outstanding errors.  Can return an
 * errno value under various circumstances (e.g., failed recovery, or
 * interrupted while waiting for recovery to finish).
 *
 * There must be a corresponding call to nfs4_end_op() to free up any locks
 * or resources allocated by this call (assuming this call succeeded),
 * using the same rsp that's passed in here.
 *
 * The open and lock seqid synchronization must be stopped before calling this
 * function, as it could lead to deadlock when trying to reopen a file or
 * reclaim a lock.  The synchronization is obtained with calls to:
 *	nfs4_start_open_seqid_sync()
 *	nfs4_start_lock_seqid_sync()
 *
 * *startrecovp is set TRUE if the caller should not bother with the
 * over-the-wire call, and just initiate recovery for the given request.
 * This is typically used for state-releasing ops if the filesystem has
 * been forcibly unmounted.  startrecovp may be NULL for
 * non-state-releasing ops.
 */

int
nfs4_start_fop(mntinfo4_t *mi, vnode_t *vp1, vnode_t *vp2, nfs4_op_hint_t op,
    nfs4_recov_state_t *rsp, bool_t *startrecovp)
{
	int error = 0, rerr_cnt;
	nfs4_server_t *sp = NULL;
	nfs4_server_t *tsp;
	nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };
	time_t droplock_time;
#ifdef DEBUG
	void *fop_caller;
#endif

	ASSERT(vp1 == NULL || vp1->v_vfsp == mi->mi_vfsp);
	ASSERT(vp2 == NULL || vp2->v_vfsp == mi->mi_vfsp);

#ifdef DEBUG
	if ((fop_caller = tsd_get(nfs4_tsd_key)) != NULL) {
		cmn_err(CE_PANIC, "Missing nfs4_end_fop: last caller %p",
		    fop_caller);
	}
	(void) tsd_set(nfs4_tsd_key, caller());
#endif

	rsp->rs_sp = NULL;
	rsp->rs_flags &= ~NFS4_RS_RENAME_HELD;
	rerr_cnt = rsp->rs_num_retry_despite_err;

	/*
	 * Process the items that may delay() based on server response
	 */
	error = nfs4_wait_for_grace(mi, rsp);
	if (error)
		goto out;

	if (vp1 != NULL) {
		error = nfs4_wait_for_delay(vp1, rsp);
		if (error)
			goto out;
	}

	/* Wait for a delegation recall to complete. */
	error = wait_for_recall(vp1, vp2, op, rsp);
	if (error)
		goto out;

	/*
	 * Wait for any current recovery actions to finish.  Note that a
	 * recovery thread can still start up after wait_for_recovery()
	 * finishes.  We don't block out recovery operations until we
	 * acquire s_recovlock and mi_recovlock.
	 */
	error = wait_for_recovery(mi, op);
	if (error)
		goto out;

	/*
	 * Check to see if the rnode is already marked with a
	 * recovery error.  If so, return it immediately.  But
	 * always pass CLOSE, LOCKU, and DELEGRETURN so we can
	 * clean up state on the server.
	 */
	if (vp1 != NULL) {
		if (error = nfs4_check_recov_err(vp1, op, rsp, rerr_cnt, "vp1"))
			goto out;
		nfs4_check_remap(mi, vp1, NFS4_REMAP_CKATTRS, &e);
	}

	if (vp2 != NULL) {
		if (error = nfs4_check_recov_err(vp2, op, rsp, rerr_cnt, "vp2"))
			goto out;
		nfs4_check_remap(mi, vp2, NFS4_REMAP_CKATTRS, &e);
	}

	/*
	 * The lock order calls for us to acquire s_recovlock before
	 * mi_recovlock, but we have to hold mi_recovlock to look up sp (to
	 * prevent races with the failover/migration code).  So acquire
	 * mi_recovlock, look up sp, drop mi_recovlock, acquire
	 * s_recovlock and mi_recovlock, then verify that sp is still the
	 * right object.  XXX Can we find a simpler way to deal with this?
	 */
	if (nfs_rw_enter_sig(&mi->mi_recovlock, RW_READER,
	    mi->mi_flags & MI4_INT)) {
		error = EINTR;
		goto out;
	}
get_sp:
	sp = find_nfs4_server(mi);
	if (sp != NULL) {
		sp->s_otw_call_count++;
		mutex_exit(&sp->s_lock);
		droplock_time = gethrestime_sec();
	}
	nfs_rw_exit(&mi->mi_recovlock);

	if (sp != NULL) {
		if (nfs_rw_enter_sig(&sp->s_recovlock, RW_READER,
		    mi->mi_flags & MI4_INT)) {
			error = EINTR;
			goto out;
		}
	}
	if (nfs_rw_enter_sig(&mi->mi_recovlock, RW_READER,
	    mi->mi_flags & MI4_INT)) {
		if (sp != NULL)
			nfs_rw_exit(&sp->s_recovlock);
		error = EINTR;
		goto out;
	}
	/*
	 * If the mntinfo4_t hasn't changed nfs4_server_t's since we
	 * dropped the locks, there's no point in double checking to
	 * make sure it has switched.
	 */
	if (sp == NULL || droplock_time < mi->mi_srvsettime) {
		tsp = find_nfs4_server(mi);
		if (tsp != sp) {
			/* try again */
			if (tsp != NULL) {
				mutex_exit(&tsp->s_lock);
				nfs4_server_rele(tsp);
				tsp = NULL;
			}
			if (sp != NULL) {
				nfs_rw_exit(&sp->s_recovlock);
				mutex_enter(&sp->s_lock);
				sp->s_otw_call_count--;
				mutex_exit(&sp->s_lock);
				nfs4_server_rele(sp);
				sp = NULL;
			}
			goto get_sp;
		} else {
			if (tsp != NULL) {
				mutex_exit(&tsp->s_lock);
				nfs4_server_rele(tsp);
				tsp = NULL;
			}
		}
	}

	if (sp != NULL) {
		rsp->rs_sp = sp;
	}

	/*
	 * If the filesystem uses volatile filehandles, obtain a lock so
	 * that we synchronize with renames.  Exception: mount operations
	 * can change mi_fh_expire_type, which could be a problem, since
	 * the end_op code needs to be consistent with the start_op code
	 * about mi_rename_lock.  Since mounts don't compete with renames,
	 * it's simpler to just not acquire the rename lock for mounts.
	 */
	if (NFS4_VOLATILE_FH(mi) && op != OH_MOUNT) {
		if (nfs_rw_enter_sig(&mi->mi_rename_lock,
		    op == OH_VFH_RENAME ? RW_WRITER : RW_READER,
		    mi->mi_flags & MI4_INT)) {
			nfs_rw_exit(&mi->mi_recovlock);
			if (sp != NULL)
				nfs_rw_exit(&sp->s_recovlock);
			error = EINTR;
			goto out;
		}
		rsp->rs_flags |= NFS4_RS_RENAME_HELD;
	}
	if (OH_IS_STATE_RELE(op)) {
		/*
		 * For forced unmount, letting the request proceed will
		 * almost always delay response to the user, so hand it off
		 * to the recovery thread.  For exiting lwp's, we don't
		 * have a good way to tell if the request will hang.  We
		 * generally want processes to handle their own requests so
		 * that they can be done in parallel, but if there is
		 * already a recovery thread, hand the request off to it.
		 * This will improve user response at no cost to overall
		 * system throughput.  For zone shutdown, we'd prefer
		 * the recovery thread to handle this as well.
		 */
		ASSERT(startrecovp != NULL);
		mutex_enter(&mi->mi_lock);
		if (FS_OR_ZONE_GONE4(mi->mi_vfsp))
			*startrecovp = TRUE;
		else if ((curthread->t_proc_flag & TP_LWPEXIT) &&
		    (mi->mi_flags & MI4_RECOV_ACTIV))
			*startrecovp = TRUE;
		else
			*startrecovp = FALSE;
		mutex_exit(&mi->mi_lock);
	} else if (startrecovp != NULL)
		*startrecovp = FALSE;

	ASSERT(error == 0);
	return (error);

out:
	ASSERT(error != 0);
	if (sp != NULL) {
		mutex_enter(&sp->s_lock);
		sp->s_otw_call_count--;
		mutex_exit(&sp->s_lock);
		nfs4_server_rele(sp);
		rsp->rs_sp = NULL;
	}
	nfs4_end_op_recall(vp1, vp2, rsp);

#ifdef DEBUG
	(void) tsd_set(nfs4_tsd_key, NULL);
#endif
	return (error);
}

/*
 * It is up to the caller to determine if rsp->rs_sp being NULL
 * is detrimental or not.
 */
int
nfs4_start_op(mntinfo4_t *mi, vnode_t *vp1, vnode_t *vp2,
    nfs4_recov_state_t *rsp)
{
	ASSERT(rsp->rs_num_retry_despite_err == 0);
	rsp->rs_num_retry_despite_err = 0;
	return (nfs4_start_fop(mi, vp1, vp2, OH_OTHER, rsp, NULL));
}

/*
 * Release any resources acquired by nfs4_start_op().
 * 'rsp' should be the same nfs4_recov_state_t that was filled in by
 * nfs4_start_op(); any nfs4_server reference it holds is released here.
 *
 * The operation hint is used to avoid a deadlock by bypassing delegation
 * return logic for writes, which are done while returning a delegation.
 */
void
nfs4_end_fop(mntinfo4_t *mi, vnode_t *vp1, vnode_t *vp2, nfs4_op_hint_t op,
    nfs4_recov_state_t *rsp, bool_t needs_recov)
{
	nfs4_server_t *sp = rsp->rs_sp;
	rnode4_t *rp = NULL;

#ifdef lint
	/*
	 * The op hint isn't used any more, but might be in
	 * the future.
	 */
	op = op;
#endif

#ifdef DEBUG
	ASSERT(tsd_get(nfs4_tsd_key) != NULL);
	(void) tsd_set(nfs4_tsd_key, NULL);
#endif

	nfs4_end_op_recall(vp1, vp2, rsp);

	if (rsp->rs_flags & NFS4_RS_RENAME_HELD)
		nfs_rw_exit(&mi->mi_rename_lock);

	if (!needs_recov) {
		if (rsp->rs_flags & NFS4_RS_DELAY_MSG) {
			/* may need to clear the delay interval */
			if (vp1 != NULL) {
				rp = VTOR4(vp1);
				mutex_enter(&rp->r_statelock);
				rp->r_delay_interval = 0;
				mutex_exit(&rp->r_statelock);
			}
		}
		rsp->rs_flags &= ~(NFS4_RS_GRACE_MSG|NFS4_RS_DELAY_MSG);
	}

	/*
	 * If the corresponding nfs4_start_op() found a sp,
	 * then there must still be a sp.
	 */
	if (sp != NULL) {
		nfs_rw_exit(&mi->mi_recovlock);
		nfs_rw_exit(&sp->s_recovlock);
		mutex_enter(&sp->s_lock);
		sp->s_otw_call_count--;
		cv_broadcast(&sp->s_cv_otw_count);
		mutex_exit(&sp->s_lock);
		nfs4_server_rele(sp);
	} else {
		nfs_rw_exit(&mi->mi_recovlock);
	}
}

void
nfs4_end_op(mntinfo4_t *mi, vnode_t *vp1, vnode_t *vp2,
    nfs4_recov_state_t *rsp, bool_t needrecov)
{
	nfs4_end_fop(mi, vp1, vp2, OH_OTHER, rsp, needrecov);
}
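
/*
 * Illustrative bracket pattern for nfs4_start_op()/nfs4_end_op() (a
 * sketch only, simplified from OTW callers elsewhere in the client;
 * the surrounding routine and its operation are hypothetical):
 *
 *	nfs4_recov_state_t rs;
 *	nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };
 *	bool_t needrecov, abort;
 *
 *	rs.rs_flags = 0;
 *	rs.rs_num_retry_despite_err = 0;
 * recov_retry:
 *	if ((e.error = nfs4_start_op(mi, vp, NULL, &rs)) != 0)
 *		return (e.error);
 *
 *	... the OTW call proper, filling in 'e' ...
 *
 *	needrecov = nfs4_needs_recovery(&e, FALSE, mi->mi_vfsp);
 *	if (needrecov) {
 *		abort = nfs4_start_recovery(&e, mi, vp, NULL, NULL,
 *		    NULL, OP_GETATTR, NULL);
 *		nfs4_end_op(mi, vp, NULL, &rs, needrecov);
 *		if (abort == FALSE)
 *			goto recov_retry;
 *		return (e.error);
 *	}
 *	nfs4_end_op(mi, vp, NULL, &rs, needrecov);
 */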

/*
 * If the filesystem is going through client recovery, block until
 * finished.
 * Exceptions:
 * - state-releasing ops (CLOSE, LOCKU, DELEGRETURN) are allowed to proceed
 *   if the filesystem has been forcibly unmounted or the lwp is exiting.
 *
 * Return value:
 * - 0 if no errors
 * - EINTR if the call was interrupted
 * - EIO if the filesystem has been forcibly unmounted (non-state-releasing
 *   op)
 * - the errno value from the recovery thread, if recovery failed
 */
static int
wait_for_recovery(mntinfo4_t *mi, nfs4_op_hint_t op_hint)
{
	int error = 0;

	mutex_enter(&mi->mi_lock);

	while (mi->mi_recovflags != 0) {
		klwp_t *lwp = ttolwp(curthread);

		if (mi->mi_flags & MI4_RECOV_FAIL)
			break;
		if (mi->mi_vfsp->vfs_flag & VFS_UNMOUNTED)
			break;
		if (OH_IS_STATE_RELE(op_hint) &&
		    (curthread->t_proc_flag & TP_LWPEXIT))
			break;

		if (lwp != NULL)
			lwp->lwp_nostop++;
		/* XXX - use different cv? */
		if (cv_wait_sig(&mi->mi_failover_cv, &mi->mi_lock) == 0) {
			error = EINTR;
			if (lwp != NULL)
				lwp->lwp_nostop--;
			break;
		}
		if (lwp != NULL)
			lwp->lwp_nostop--;
	}

	if (mi->mi_flags & MI4_RECOV_FAIL) {
		NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
		    "wait_for_recovery: fail since RECOV FAIL"));
		error = mi->mi_error;
	} else if ((mi->mi_vfsp->vfs_flag & VFS_UNMOUNTED) &&
	    !OH_IS_STATE_RELE(op_hint)) {
		NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
		    "wait_for_recovery: forced unmount"));
		error = EIO;
	}

	mutex_exit(&mi->mi_lock);

	return (error);
}

/*
 * If the client received NFS4ERR_GRACE for this particular mount,
 * the client blocks here until it is time to try again.
 *
 * Return value:
 * - 0 if wait was successful
 * - EINTR if the call was interrupted
 */
int
nfs4_wait_for_grace(mntinfo4_t *mi, nfs4_recov_state_t *rsp)
{
	int error = 0;
	time_t curtime, time_to_wait;

	/* do an unprotected check to reduce mi_lock contention */
	if (mi->mi_grace_wait != 0) {
		mutex_enter(&mi->mi_lock);

		if (mi->mi_grace_wait != 0) {
			if (!(rsp->rs_flags & NFS4_RS_GRACE_MSG))
				rsp->rs_flags |= NFS4_RS_GRACE_MSG;

			curtime = gethrestime_sec();

			if (curtime < mi->mi_grace_wait) {
				time_to_wait = mi->mi_grace_wait - curtime;

				mutex_exit(&mi->mi_lock);

				delay(SEC_TO_TICK(time_to_wait));

				curtime = gethrestime_sec();

				mutex_enter(&mi->mi_lock);

				if (curtime >= mi->mi_grace_wait)
					mi->mi_grace_wait = 0;
			} else {
				mi->mi_grace_wait = 0;
			}
		}
		mutex_exit(&mi->mi_lock);
	}

	return (error);
}
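
/*
 * Example: if mi_grace_wait is 20 seconds in the future when a thread
 * arrives here, the thread sleeps via delay(SEC_TO_TICK(20)), then
 * clears mi_grace_wait unless another NFS4ERR_GRACE has pushed it
 * further out.  mi_grace_wait itself is set by nfs4_set_grace_wait()
 * (called from start_recovery() for NR_GRACE); the wait interval is
 * governed by the nfs4err_delay_time tuneable above.
 */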

/*
 * If the client received NFS4ERR_DELAY for an operation on a vnode,
 * the client blocks here until it is time to try again.
 *
 * Return value:
 * - 0 if wait was successful
 * - EINTR if the call was interrupted
 */
int
nfs4_wait_for_delay(vnode_t *vp, nfs4_recov_state_t *rsp)
{
	int error = 0;
	time_t curtime, time_to_wait;
	rnode4_t *rp;

	ASSERT(vp != NULL);

	rp = VTOR4(vp);

	/* do an unprotected check to reduce r_statelock contention */
	if (rp->r_delay_wait != 0) {
		mutex_enter(&rp->r_statelock);

		if (rp->r_delay_wait != 0) {

			if (!(rsp->rs_flags & NFS4_RS_DELAY_MSG)) {
				rsp->rs_flags |= NFS4_RS_DELAY_MSG;
				nfs4_mi_kstat_inc_delay(VTOMI4(vp));
			}

			curtime = gethrestime_sec();

			if (curtime < rp->r_delay_wait) {
				time_to_wait = rp->r_delay_wait - curtime;

				mutex_exit(&rp->r_statelock);

				delay(SEC_TO_TICK(time_to_wait));

				curtime = gethrestime_sec();

				mutex_enter(&rp->r_statelock);

				if (curtime >= rp->r_delay_wait)
					rp->r_delay_wait = 0;
			} else {
				rp->r_delay_wait = 0;
			}
		}
		mutex_exit(&rp->r_statelock);
	}

	return (error);
}

/*
 * The recovery thread.
 */
static void
nfs4_recov_thread(recov_info_t *recovp)
{
	mntinfo4_t *mi = recovp->rc_mi;
	nfs4_server_t *sp;
	int done = 0, error = 0;
	bool_t recov_fail = FALSE;
	callb_cpr_t cpr_info;
	kmutex_t cpr_lock;

	nfs4_queue_event(RE_START, mi, NULL, mi->mi_recovflags,
	    recovp->rc_vp1, recovp->rc_vp2, 0, NULL, 0, TAG_NONE, TAG_NONE,
	    0, 0);

	mutex_init(&cpr_lock, NULL, MUTEX_DEFAULT, NULL);
	CALLB_CPR_INIT(&cpr_info, &cpr_lock, callb_generic_cpr, "nfsv4Recov");

	mutex_enter(&mi->mi_lock);
	mi->mi_recovthread = curthread;
	mutex_exit(&mi->mi_lock);

	/*
	 * We don't really need protection here against failover or
	 * migration, since the current thread is the one that would make
	 * any changes, but hold mi_recovlock anyway for completeness (and
	 * to satisfy any ASSERTs).
	 */
	(void) nfs_rw_enter_sig(&mi->mi_recovlock, RW_READER, 0);
	sp = find_nfs4_server(mi);
	if (sp != NULL)
		mutex_exit(&sp->s_lock);
	nfs_rw_exit(&mi->mi_recovlock);

	/*
	 * Do any necessary recovery, based on the information in recovp
	 * and any recovery flags.
	 */
	do {
		mutex_enter(&mi->mi_lock);
		if (FS_OR_ZONE_GONE4(mi->mi_vfsp)) {
			bool_t activesrv;

			NFS4_DEBUG(nfs4_client_recov_debug &&
			    mi->mi_vfsp->vfs_flag & VFS_UNMOUNTED, (CE_NOTE,
			    "nfs4_recov_thread: file system has been "
			    "unmounted"));
			NFS4_DEBUG(nfs4_client_recov_debug &&
			    zone_status_get(curproc->p_zone) >=
			    ZONE_IS_SHUTTING_DOWN, (CE_NOTE,
			    "nfs4_recov_thread: zone shutting down"));
			/*
			 * If the server has lost its state for us and
			 * the filesystem is unmounted, then the filesystem
			 * can be tossed, even if there are lost lock or
			 * lost state calls in the recovery queue.
			 */
			if (mi->mi_recovflags &
			    (MI4R_NEED_CLIENTID | MI4R_REOPEN_FILES)) {
				NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
				    "nfs4_recov_thread: bailing out"));
				mi->mi_flags |= MI4_RECOV_FAIL;
				mi->mi_error = recovp->rc_error;
				recov_fail = TRUE;
			}
			/*
			 * We don't know if the server has any state for
			 * us, and the filesystem has been unmounted.  If
			 * there are "lost state" recovery items, keep
			 * trying to process them until there are no more
			 * mounted filesystems for the server.  Otherwise,
			 * bail out.  The reason we don't mark the
			 * filesystem as failing recovery is in case we
			 * have to do "lost state" recovery later (e.g., a
			 * user process exits).
			 */
			if (!(mi->mi_recovflags & MI4R_LOST_STATE)) {
				done = 1;
				mutex_exit(&mi->mi_lock);
				break;
			}
			mutex_exit(&mi->mi_lock);

			if (sp == NULL)
				activesrv = FALSE;
			else {
				mutex_enter(&sp->s_lock);
				activesrv = nfs4_fs_active(sp);
			}
			if (!activesrv) {
				NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
				    "no active fs for server %p",
				    (void *)sp));
				mutex_enter(&mi->mi_lock);
				mi->mi_flags |= MI4_RECOV_FAIL;
				mi->mi_error = recovp->rc_error;
				mutex_exit(&mi->mi_lock);
				recov_fail = TRUE;
				if (sp != NULL) {
					/*
					 * Mark the server instance as
					 * dead, so that nobody will attach
					 * a new filesystem.
					 */
					nfs4_mark_srv_dead(sp);
				}
			}
			if (sp != NULL)
				mutex_exit(&sp->s_lock);
		} else {
			mutex_exit(&mi->mi_lock);
		}

		/*
		 * Check if we need to select a new server for a
		 * failover.  Choosing a new server will force at
		 * least a check of the clientid.
		 */
		mutex_enter(&mi->mi_lock);
		if (!recov_fail &&
		    (mi->mi_recovflags & MI4R_NEED_NEW_SERVER)) {
			mutex_exit(&mi->mi_lock);
			recov_newserver(recovp, &sp, &recov_fail);
		} else
			mutex_exit(&mi->mi_lock);

		/*
		 * Check if we need to recover the clientid.  This
		 * must be done before file and lock recovery, and it
		 * potentially affects the recovery threads for other
		 * filesystems, so it gets special treatment.
		 */
		if (sp != NULL && recov_fail == FALSE) {
			mutex_enter(&sp->s_lock);
			if (!(sp->s_flags & N4S_CLIENTID_SET)) {
				mutex_exit(&sp->s_lock);
				recov_clientid(recovp, sp);
			} else {
				/*
				 * Unset this flag in case another recovery
				 * thread successfully recovered the clientid
				 * for us already.
				 */
				mutex_enter(&mi->mi_lock);
				mi->mi_recovflags &= ~MI4R_NEED_CLIENTID;
				mutex_exit(&mi->mi_lock);
				mutex_exit(&sp->s_lock);
			}
		}

		/*
		 * Check if we need to get the security information.
		 */
		mutex_enter(&mi->mi_lock);
		if ((mi->mi_recovflags & MI4R_NEED_SECINFO) &&
		    !(mi->mi_flags & MI4_RECOV_FAIL)) {
			mutex_exit(&mi->mi_lock);
			(void) nfs_rw_enter_sig(&mi->mi_recovlock,
			    RW_WRITER, 0);
			error = nfs4_secinfo_recov(recovp->rc_mi,
			    recovp->rc_vp1, recovp->rc_vp2);
			/*
			 * If error, nothing more can be done, stop
			 * the recovery.
			 */
			if (error) {
				mutex_enter(&mi->mi_lock);
				mi->mi_flags |= MI4_RECOV_FAIL;
				mi->mi_error = recovp->rc_error;
				mutex_exit(&mi->mi_lock);
				nfs4_queue_event(RE_WRONGSEC, mi, NULL,
				    error, recovp->rc_vp1, recovp->rc_vp2,
				    0, NULL, 0, TAG_NONE, TAG_NONE, 0, 0);
			}
			nfs_rw_exit(&mi->mi_recovlock);
		} else
			mutex_exit(&mi->mi_lock);

		/*
		 * Check if there's a bad seqid to recover.
		 */
		mutex_enter(&mi->mi_lock);
		if ((mi->mi_recovflags & MI4R_BAD_SEQID) &&
		    !(mi->mi_flags & MI4_RECOV_FAIL)) {
			mutex_exit(&mi->mi_lock);
			(void) nfs_rw_enter_sig(&mi->mi_recovlock,
			    RW_WRITER, 0);
			recov_bad_seqid(recovp);
			nfs_rw_exit(&mi->mi_recovlock);
		} else
			mutex_exit(&mi->mi_lock);

		/*
		 * Next check for recovery that affects the entire
		 * filesystem.
		 */
		if (sp != NULL) {
			mutex_enter(&mi->mi_lock);
			if ((mi->mi_recovflags & MI4R_REOPEN_FILES) &&
			    !(mi->mi_flags & MI4_RECOV_FAIL)) {
				mutex_exit(&mi->mi_lock);
				recov_openfiles(recovp, sp);
			} else
				mutex_exit(&mi->mi_lock);
		}

		/*
		 * Send any queued state recovery requests.
		 */
		mutex_enter(&mi->mi_lock);
		if (sp != NULL &&
		    (mi->mi_recovflags & MI4R_LOST_STATE) &&
		    !(mi->mi_flags & MI4_RECOV_FAIL)) {
			mutex_exit(&mi->mi_lock);
			(void) nfs_rw_enter_sig(&mi->mi_recovlock,
			    RW_WRITER, 0);
			nfs4_resend_lost_rqsts(recovp, sp);
			if (list_head(&mi->mi_lost_state) == NULL) {
				/* done */
				mutex_enter(&mi->mi_lock);
				mi->mi_recovflags &= ~MI4R_LOST_STATE;
				mutex_exit(&mi->mi_lock);
			}
			nfs_rw_exit(&mi->mi_recovlock);
		} else {
			mutex_exit(&mi->mi_lock);
		}

		/*
		 * See if there is anything more to do.  If not, announce
		 * that we are done and exit.
		 *
		 * Need mi_recovlock to keep 'sp' valid.  Must grab
		 * mi_recovlock before mi_lock to preserve lock ordering.
		 */
		(void) nfs_rw_enter_sig(&mi->mi_recovlock, RW_READER, 0);
		mutex_enter(&mi->mi_lock);
		if ((mi->mi_recovflags & ~MI4R_SRV_REBOOT) == 0 ||
		    (mi->mi_flags & MI4_RECOV_FAIL)) {
			list_t local_lost_state;
			nfs4_lost_rqst_t *lrp;

			/*
			 * We need to remove the lost requests before we
			 * unmark the mi as no longer doing recovery to
			 * avoid a race with a new thread putting new lost
			 * requests on the same mi (and the going away
			 * thread would remove the new lost requests).
			 *
			 * Move the lost requests to a local list since
			 * nfs4_remove_lost_rqst() drops mi_lock, and
			 * dropping the mi_lock would make our check to
			 * see if recovery is done no longer valid.
			 */
			list_create(&local_lost_state,
			    sizeof (nfs4_lost_rqst_t),
			    offsetof(nfs4_lost_rqst_t, lr_node));
			list_move_tail(&local_lost_state, &mi->mi_lost_state);

			done = 1;
			mutex_exit(&mi->mi_lock);
			/*
			 * Now officially free the "moved"
			 * lost requests.
			 */
			while ((lrp = list_head(&local_lost_state)) != NULL) {
				list_remove(&local_lost_state, lrp);
				nfs4_free_lost_rqst(lrp, sp);
			}
			list_destroy(&local_lost_state);
		} else
			mutex_exit(&mi->mi_lock);
		nfs_rw_exit(&mi->mi_recovlock);

		/*
		 * If the filesystem has been forcibly unmounted, there is
		 * probably no point in retrying immediately.  Furthermore,
		 * there might be user processes waiting for a chance to
		 * queue up "lost state" requests, so that they can exit.
		 * So pause here for a moment.  Same logic for zone shutdown.
		 */
		if (!done && FS_OR_ZONE_GONE4(mi->mi_vfsp)) {
			mutex_enter(&mi->mi_lock);
			cv_broadcast(&mi->mi_failover_cv);
			mutex_exit(&mi->mi_lock);
			delay(SEC_TO_TICK(nfs4_unmount_delay));
		}

	} while (!done);

	if (sp != NULL)
		nfs4_server_rele(sp);

	/*
	 * Return all recalled delegations
	 */
	nfs4_dlistclean();

	mutex_enter(&mi->mi_lock);
	recov_done(mi, recovp);
	mutex_exit(&mi->mi_lock);

	/*
	 * Free up resources that were allocated for us.
	 */
	if (recovp->rc_vp1 != NULL)
		VN_RELE(recovp->rc_vp1);
	if (recovp->rc_vp2 != NULL)
		VN_RELE(recovp->rc_vp2);

	/* now we are done using the mi struct, signal the waiters */
	mutex_enter(&mi->mi_lock);
	mi->mi_in_recovery--;
	if (mi->mi_in_recovery == 0)
		cv_broadcast(&mi->mi_cv_in_recov);
	mutex_exit(&mi->mi_lock);

	VFS_RELE(mi->mi_vfsp);
	MI4_RELE(mi);
	kmem_free(recovp, sizeof (recov_info_t));
	mutex_enter(&cpr_lock);
	CALLB_CPR_EXIT(&cpr_info);
	mutex_destroy(&cpr_lock);
	zthread_exit();
}

/*
 * Log the end of recovery and notify any waiting threads.
 */
static void
recov_done(mntinfo4_t *mi, recov_info_t *recovp)
{
	ASSERT(MUTEX_HELD(&mi->mi_lock));

	nfs4_queue_event(RE_END, mi, NULL, 0, recovp->rc_vp1,
	    recovp->rc_vp2, 0, NULL, 0, TAG_NONE, TAG_NONE, 0, 0);
	mi->mi_recovthread = NULL;
	mi->mi_flags &= ~MI4_RECOV_ACTIV;
	mi->mi_recovflags &= ~MI4R_SRV_REBOOT;
	cv_broadcast(&mi->mi_failover_cv);
}

/*
 * State-specific recovery routines, by state.
 */

/*
 * Failover.
 *
 * Replaces *spp with a reference to the new server, which must
 * eventually be freed.
 */
static void
recov_newserver(recov_info_t *recovp, nfs4_server_t **spp, bool_t *recov_fail)
{
	mntinfo4_t *mi = recovp->rc_mi;
	servinfo4_t *svp = NULL;
	nfs4_server_t *osp = *spp;
	CLIENT *cl;
	enum clnt_stat status;
	struct timeval tv;
	int error;
	int oncethru = 0;
	rnode4_t *rp;
	int index;
	nfs_fh4 fh;
	char *snames;
	size_t len;

	(void) nfs_rw_enter_sig(&mi->mi_recovlock, RW_WRITER, 0);

	tv.tv_sec = 2;
	tv.tv_usec = 0;

#ifdef lint
	/*
	 * Lint can't follow the logic, so thinks that snames and len
	 * can be used before being set.  They can't, but lint can't
	 * figure it out.  To address the lint warning, initialize
	 * snames and len for lint.
	 */
	snames = NULL;
	len = 0;
#endif

	/*
	 * Ping the null NFS procedure of every server in
	 * the list until one responds.  We always start
	 * at the head of the list and always skip the one
	 * that is current, since it's caused us a problem.
	 */
	while (svp == NULL) {
		for (svp = mi->mi_servers; svp; svp = svp->sv_next) {

			mutex_enter(&mi->mi_lock);
			if (FS_OR_ZONE_GONE4(mi->mi_vfsp)) {
				mi->mi_flags |= MI4_RECOV_FAIL;
				mutex_exit(&mi->mi_lock);
				(void) nfs_rw_exit(&mi->mi_recovlock);
				*recov_fail = TRUE;
				if (oncethru)
					kmem_free(snames, len);
				return;
			}
			mutex_exit(&mi->mi_lock);

			(void) nfs_rw_enter_sig(&svp->sv_lock, RW_READER, 0);
			if (svp->sv_flags & SV4_NOTINUSE) {
				nfs_rw_exit(&svp->sv_lock);
				continue;
			}
			nfs_rw_exit(&svp->sv_lock);

			if (!oncethru && svp == mi->mi_curr_serv)
				continue;

			error = clnt_tli_kcreate(svp->sv_knconf, &svp->sv_addr,
			    NFS_PROGRAM, NFS_V4, 0, 1, CRED(), &cl);
			if (error)
				continue;

			if (!(mi->mi_flags & MI4_INT))
				cl->cl_nosignal = TRUE;
			status = CLNT_CALL(cl, RFS_NULL, xdr_void, NULL,
			    xdr_void, NULL, tv);
			if (!(mi->mi_flags & MI4_INT))
				cl->cl_nosignal = FALSE;
			AUTH_DESTROY(cl->cl_auth);
			CLNT_DESTROY(cl);
			if (status == RPC_SUCCESS) {
				nfs4_queue_event(RE_FAILOVER, mi,
				    svp == mi->mi_curr_serv ? NULL :
				    svp->sv_hostname, 0, NULL, NULL, 0,
				    NULL, 0, TAG_NONE, TAG_NONE, 0, 0);
				break;
			}
		}

		if (svp == NULL) {
			if (!oncethru) {
				snames = nfs4_getsrvnames(mi, &len);
				nfs4_queue_fact(RF_SRVS_NOT_RESPOND, mi,
				    0, 0, 0, FALSE, snames, 0, NULL);
				oncethru = 1;
			}
			delay(hz);
		}
	}

	if (oncethru) {
		nfs4_queue_fact(RF_SRVS_OK, mi, 0, 0, 0, FALSE, snames,
		    0, NULL);
		kmem_free(snames, len);
	}

#ifdef DEBUG
	(void) nfs_rw_enter_sig(&svp->sv_lock, RW_READER, 0);
	ASSERT((svp->sv_flags & SV4_NOTINUSE) == 0);
	nfs_rw_exit(&svp->sv_lock);
#endif

	mutex_enter(&mi->mi_lock);
	mi->mi_recovflags &= ~MI4R_NEED_NEW_SERVER;
	if (svp != mi->mi_curr_serv) {
		servinfo4_t *osvp = mi->mi_curr_serv;

		mutex_exit(&mi->mi_lock);

		/*
		 * Update server-dependent fields in the root vnode.
		 */
		index = rtable4hash(mi->mi_rootfh);
		rw_enter(&rtable4[index].r_lock, RW_WRITER);

		rp = r4find(&rtable4[index], mi->mi_rootfh, mi->mi_vfsp);
		if (rp != NULL) {
			NFS4_DEBUG(nfs4_client_failover_debug, (CE_NOTE,
			    "recov_newserver: remapping %s", rnode4info(rp)));
			mutex_enter(&rp->r_statelock);
			rp->r_server = svp;
			PURGE_ATTRCACHE4_LOCKED(rp);
			mutex_exit(&rp->r_statelock);
			(void) nfs4_free_data_reclaim(rp);
			nfs4_purge_rddir_cache(RTOV4(rp));
			rw_exit(&rtable4[index].r_lock);
			NFS4_DEBUG(nfs4_client_failover_debug, (CE_NOTE,
			    "recov_newserver: done with %s",
			    rnode4info(rp)));
			VN_RELE(RTOV4(rp));
		} else
			rw_exit(&rtable4[index].r_lock);
		(void) dnlc_purge_vfsp(mi->mi_vfsp, 0);

		mutex_enter(&mi->mi_lock);
		mi->mi_recovflags |= MI4R_REOPEN_FILES | MI4R_REMAP_FILES;
		if (recovp->rc_srv_reboot)
			mi->mi_recovflags |= MI4R_SRV_REBOOT;
		mi->mi_curr_serv = svp;
		mi->mi_failover++;
		mi->mi_flags &= ~MI4_BADOWNER_DEBUG;
		mutex_exit(&mi->mi_lock);

		(void) nfs_rw_enter_sig(&svp->sv_lock, RW_READER, 0);
		fh.nfs_fh4_len = svp->sv_fhandle.fh_len;
		fh.nfs_fh4_val = svp->sv_fhandle.fh_buf;
		sfh4_update(mi->mi_rootfh, &fh);
		fh.nfs_fh4_len = svp->sv_pfhandle.fh_len;
		fh.nfs_fh4_val = svp->sv_pfhandle.fh_buf;
		sfh4_update(mi->mi_srvparentfh, &fh);
		nfs_rw_exit(&svp->sv_lock);

		*spp = nfs4_move_mi(mi, osvp, svp);
		if (osp != NULL)
			nfs4_server_rele(osp);
	} else
		mutex_exit(&mi->mi_lock);
	(void) nfs_rw_exit(&mi->mi_recovlock);
}

/*
 * Clientid.
 */
static void
recov_clientid(recov_info_t *recovp, nfs4_server_t *sp)
{
	mntinfo4_t *mi = recovp->rc_mi;
	int error = 0;
	int still_stale;
	int need_new_s;

	ASSERT(sp != NULL);

	/*
	 * Acquire the recovery lock and then verify that the clientid
	 * still needs to be recovered.  (Note that s_recovlock is supposed
	 * to be acquired before s_lock.)  Since the thread holds the
	 * recovery lock, no other thread will recover the clientid.
	 */
	(void) nfs_rw_enter_sig(&sp->s_recovlock, RW_WRITER, 0);
	(void) nfs_rw_enter_sig(&mi->mi_recovlock, RW_WRITER, 0);
	mutex_enter(&sp->s_lock);
	still_stale = ((sp->s_flags & N4S_CLIENTID_SET) == 0);
	mutex_exit(&sp->s_lock);

	if (still_stale) {
		nfs4_error_t n4e;

		nfs4_error_zinit(&n4e);
		nfs4setclientid(mi, kcred, TRUE, &n4e);
		error = n4e.error;
		if (error != 0) {
			/*
			 * nfs4setclientid may have set MI4R_NEED_NEW_SERVER;
			 * if so, just return and let recov_thread drive
			 * failover.
			 */
			mutex_enter(&mi->mi_lock);
			need_new_s = mi->mi_recovflags & MI4R_NEED_NEW_SERVER;
			mutex_exit(&mi->mi_lock);

			if (need_new_s) {
				nfs_rw_exit(&mi->mi_recovlock);
				nfs_rw_exit(&sp->s_recovlock);
				return;
			}

			nfs4_queue_event(RE_CLIENTID, mi, NULL, n4e.error, NULL,
			    NULL, n4e.stat, NULL, 0, TAG_NONE, TAG_NONE, 0, 0);
			mutex_enter(&mi->mi_lock);
			mi->mi_flags |= MI4_RECOV_FAIL;
			mi->mi_error = recovp->rc_error;
			mutex_exit(&mi->mi_lock);
			/* don't destroy the nfs4_server, let umount do it */
		}
	}

	if (error == 0) {
		mutex_enter(&mi->mi_lock);
		mi->mi_recovflags &= ~MI4R_NEED_CLIENTID;
And that thread that set the 1783 * clientid will have initiated reopening files on all the 1784 * filesystems for the server, so we should not initiate 1785 * reopening for this filesystem here. 1786 */ 1787 if (still_stale) { 1788 mi->mi_recovflags |= MI4R_REOPEN_FILES; 1789 if (recovp->rc_srv_reboot) 1790 mi->mi_recovflags |= MI4R_SRV_REBOOT; 1791 } 1792 mutex_exit(&mi->mi_lock); 1793 } 1794 1795 nfs_rw_exit(&mi->mi_recovlock); 1796 1797 if (error != 0) { 1798 nfs_rw_exit(&sp->s_recovlock); 1799 mutex_enter(&mi->mi_lock); 1800 if ((mi->mi_flags & MI4_RECOV_FAIL) == 0) 1801 delay(SEC_TO_TICK(recov_err_delay)); 1802 mutex_exit(&mi->mi_lock); 1803 } else { 1804 mntinfo4_t **milist; 1805 mntinfo4_t *tmi; 1806 int nummi, i; 1807 1808 /* 1809 * Initiate recovery of open files for other filesystems. 1810 * We create an array of filesystems, rather than just 1811 * walking the filesystem list, to avoid deadlock issues 1812 * with s_lock and mi_recovlock. 1813 */ 1814 milist = make_milist(sp, &nummi); 1815 for (i = 0; i < nummi; i++) { 1816 tmi = milist[i]; 1817 if (tmi != mi) { 1818 (void) nfs_rw_enter_sig(&tmi->mi_recovlock, 1819 RW_READER, 0); 1820 start_recovery_action(NR_OPENFILES, TRUE, tmi, 1821 NULL, NULL); 1822 nfs_rw_exit(&tmi->mi_recovlock); 1823 } 1824 } 1825 free_milist(milist, nummi); 1826 1827 nfs_rw_exit(&sp->s_recovlock); 1828 } 1829 } 1830 1831 /* 1832 * Return an array of filesystems associated with the given server. The 1833 * caller should call free_milist() to free the references and memory. 1834 */ 1835 1836 static mntinfo4_t ** 1837 make_milist(nfs4_server_t *sp, int *nummip) 1838 { 1839 int nummi, i; 1840 mntinfo4_t **milist; 1841 mntinfo4_t *tmi; 1842 1843 mutex_enter(&sp->s_lock); 1844 nummi = 0; 1845 for (tmi = sp->mntinfo4_list; tmi != NULL; tmi = tmi->mi_clientid_next) 1846 nummi++; 1847 1848 milist = kmem_alloc(nummi * sizeof (mntinfo4_t *), KM_SLEEP); 1849 1850 for (i = 0, tmi = sp->mntinfo4_list; tmi != NULL; i++, 1851 tmi = tmi->mi_clientid_next) { 1852 milist[i] = tmi; 1853 VFS_HOLD(tmi->mi_vfsp); 1854 } 1855 mutex_exit(&sp->s_lock); 1856 1857 *nummip = nummi; 1858 return (milist); 1859 } 1860 1861 /* 1862 * Free the filesystem list created by make_milist(). 1863 */ 1864 1865 static void 1866 free_milist(mntinfo4_t **milist, int nummi) 1867 { 1868 mntinfo4_t *tmi; 1869 int i; 1870 1871 for (i = 0; i < nummi; i++) { 1872 tmi = milist[i]; 1873 VFS_RELE(tmi->mi_vfsp); 1874 } 1875 kmem_free(milist, nummi * sizeof (mntinfo4_t *)); 1876 } 1877 1878 /* 1879 * Filehandle 1880 */ 1881 1882 /* 1883 * Lookup the filehandle for the given vnode and update the rnode if it has 1884 * changed. 1885 * 1886 * Errors: 1887 * - if the filehandle could not be updated because of an error that 1888 * requires further recovery, initiate that recovery and return. 1889 * - if the filehandle could not be updated because of a signal, pretend we 1890 * succeeded and let someone else deal with it. 1891 * - if the filehandle could not be updated and the filesystem has been 1892 * forcibly unmounted, pretend we succeeded, and let the caller deal with 1893 * the forced unmount (to retry or not to retry, that is the question). 1894 * - if the filehandle could not be updated because of some other error, 1895 * mark the rnode bad and return. 

/*
 * Filehandle
 */

/*
 * Lookup the filehandle for the given vnode and update the rnode if it has
 * changed.
 *
 * Errors:
 * - if the filehandle could not be updated because of an error that
 *   requires further recovery, initiate that recovery and return.
 * - if the filehandle could not be updated because of a signal, pretend we
 *   succeeded and let someone else deal with it.
 * - if the filehandle could not be updated and the filesystem has been
 *   forcibly unmounted, pretend we succeeded, and let the caller deal with
 *   the forced unmount (to retry or not to retry, that is the question).
 * - if the filehandle could not be updated because of some other error,
 *   mark the rnode bad and return.
 */
static void
recov_filehandle(nfs4_recov_t action, mntinfo4_t *mi, vnode_t *vp)
{
	rnode4_t *rp = VTOR4(vp);
	nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };
	bool_t needrecov;

	mutex_enter(&rp->r_statelock);

	if (rp->r_flags & R4RECOVERR) {
		mutex_exit(&rp->r_statelock);
		return;
	}

	/*
	 * If someone else is updating the filehandle, wait for them to
	 * finish and then let our caller retry.
	 */
	if (rp->r_flags & R4RECEXPFH) {
		while (rp->r_flags & R4RECEXPFH) {
			cv_wait(&rp->r_cv, &rp->r_statelock);
		}
		mutex_exit(&rp->r_statelock);
		return;
	}
	rp->r_flags |= R4RECEXPFH;
	mutex_exit(&rp->r_statelock);

	if (action == NR_BADHANDLE) {
		/* shouldn't happen */
		nfs4_queue_event(RE_BADHANDLE, mi, NULL, 0,
		    vp, NULL, 0, NULL, 0, TAG_NONE, TAG_NONE, 0, 0);
	}

	nfs4_remap_file(mi, vp, 0, &e);
	needrecov = nfs4_needs_recovery(&e, FALSE, mi->mi_vfsp);

	/*
	 * If we get BADHANDLE or FHEXPIRED while recovering from
	 * BADHANDLE or FHEXPIRED, something is broken.  Don't try to
	 * recover again; just mark the file dead.
	 */
	if (needrecov && e.error == 0 &&
	    (e.stat == NFS4ERR_BADHANDLE || e.stat == NFS4ERR_FHEXPIRED))
		needrecov = FALSE;
	if (needrecov) {
		(void) nfs4_start_recovery(&e, mi, vp,
		    NULL, NULL, NULL, OP_LOOKUP, NULL);
	} else if (e.error != EINTR &&
	    !NFS4_FRC_UNMT_ERR(e.error, mi->mi_vfsp) &&
	    (e.error != 0 || e.stat != NFS4_OK)) {
		nfs4_recov_fh_fail(vp, e.error, e.stat);
		/*
		 * Don't set r_error to ESTALE.  Higher-level code (e.g.,
		 * cstatat_getvp()) retries on ESTALE, which would cause
		 * an infinite loop.
		 */
	}

	mutex_enter(&rp->r_statelock);
	rp->r_flags &= ~R4RECEXPFH;
	cv_broadcast(&rp->r_cv);
	mutex_exit(&rp->r_statelock);
}
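
/*
 * The R4RECEXPFH flag gives a simple single-updater protocol for the
 * filehandle (informational sketch of the handshake above):
 *
 *	thread A			thread B
 *	--------			--------
 *	sets R4RECEXPFH
 *	remaps the filehandle		sees R4RECEXPFH, cv_wait()s
 *	clears R4RECEXPFH,
 *	cv_broadcast()s			wakes up, returns; caller retries
 */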

/*
 * Stale Filehandle
 */

/*
 * A stale filehandle can happen when an individual file has
 * been removed, or when an entire filesystem has been taken
 * offline.  To distinguish these cases, we do this:
 * - if a GETATTR with the current filehandle is okay, we do
 *   nothing (this can happen with two-filehandle ops)
 * - if the GETATTR fails, but a GETATTR of the root filehandle
 *   succeeds, mark the rnode with R4STALE, which will stop
 *   further use of the file
 * - if the GETATTR fails, and a GETATTR of the root filehandle
 *   also fails, we consider the problem filesystem-wide, so:
 *   - if we can failover, we should
 *   - if we can't failover, we should mark both the original
 *     vnode and the root bad
 */
static void
recov_stale(mntinfo4_t *mi, vnode_t *vp)
{
	rnode4_t *rp = VTOR4(vp);
	vnode_t *rootvp = NULL;
	nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };
	nfs4_ga_res_t gar;
	char *fail_msg = "failed to recover from NFS4ERR_STALE";
	bool_t needrecov;

	mutex_enter(&rp->r_statelock);

	if (rp->r_flags & R4RECOVERR) {
		mutex_exit(&rp->r_statelock);
		NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
		    "recov_stale: already marked dead, rp %s",
		    rnode4info(rp)));
		return;
	}

	if (rp->r_flags & R4STALE) {
		mutex_exit(&rp->r_statelock);
		NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
		    "recov_stale: already marked stale, rp %s",
		    rnode4info(rp)));
		return;
	}

	mutex_exit(&rp->r_statelock);

	/* Try a GETATTR on this vnode */
	nfs4_getattr_otw_norecovery(vp, &gar, &e, CRED(), 0);

	/*
	 * Handle non-STALE recoverable errors
	 */
	needrecov = nfs4_needs_recovery(&e, FALSE, vp->v_vfsp);
	if (needrecov && (e.error != 0 || e.stat != NFS4ERR_STALE)) {
		(void) nfs4_start_recovery(&e, mi, vp,
		    NULL, NULL, NULL, OP_GETATTR, NULL);
		NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
		    "recov_stale: error=%d, stat=%d seen on rp %s",
		    e.error, e.stat, rnode4info(rp)));
		goto out;
	}

	/* Are things OK for this vnode? */
	if (!e.error && e.stat == NFS4_OK) {
		NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
		    "recov_stale: file appears fine, rp %s",
		    rnode4info(rp)));
		goto out;
	}

	/* Did we get an unrelated non-recoverable error? */
	if (e.error || e.stat != NFS4ERR_STALE) {
		nfs4_fail_recov(vp, fail_msg, e.error, e.stat);
		NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
		    "recov_stale: unrelated fatal error, rp %s",
		    rnode4info(rp)));
		goto out;
	}

	/*
	 * If we don't appear to be dealing with the root node, find it.
	 */
	if ((vp->v_flag & VROOT) == 0) {
		nfs4_error_zinit(&e);
		e.error = VFS_ROOT(vp->v_vfsp, &rootvp);
		if (e.error) {
			nfs4_fail_recov(vp, fail_msg, 0, NFS4ERR_STALE);
			NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
			    "recov_stale: can't find root node for rp %s",
			    rnode4info(rp)));
			goto out;
		}
	}

	/* Try a GETATTR on the root vnode */
	if (rootvp != NULL) {
		nfs4_error_zinit(&e);
		nfs4_getattr_otw_norecovery(rootvp, &gar, &e, CRED(), 0);

		/* Try recovery? */
		if (e.error != 0 || e.stat != NFS4ERR_STALE) {
			needrecov = nfs4_needs_recovery(&e, FALSE, vp->v_vfsp);
			if (needrecov) {
				(void) nfs4_start_recovery(&e,
				    mi, rootvp, NULL, NULL, NULL,
				    OP_GETATTR, NULL);
				NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
				    "recov_stale: error=%d, stat=%d seen "
				    "on rp %s", e.error, e.stat,
				    rnode4info(rp)));
			}
		}

		/*
		 * Check to see if a failover attempt is warranted.
		 * NB: nfs4_try_failover doesn't check for STALE
		 * because recov_stale gets a shot first.  Now that
		 * recov_stale has failed, go ahead and try failover.
		 *
		 * If the getattr on the root filehandle was successful,
		 * then mark recovery as failed for 'vp' and exit.
		 */
		if (nfs4_try_failover(&e) == 0 && e.stat != NFS4ERR_STALE) {
			/*
			 * pass the original error to fail_recov, not
			 * the one from trying the root vnode.
			 */
			nfs4_fail_recov(vp, fail_msg, 0, NFS4ERR_STALE);
			NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
			    "recov_stale: root node OK, marking "
			    "dead rp %s", rnode4info(rp)));
			goto out;
		}
	}

	/*
	 * Here, we know that both the original file and the
	 * root filehandle (which may be the same) are stale.
	 * We want to fail over if we can, and if we can't, we
	 * want to mark everything in sight bad.
	 */
	if (FAILOVER_MOUNT4(mi)) {
		mutex_enter(&mi->mi_lock);
		mi->mi_recovflags |= MI4R_NEED_NEW_SERVER;
		NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
		    "recov_stale: failing over due to rp %s",
		    rnode4info(rp)));
		mutex_exit(&mi->mi_lock);
	} else {
		rnode4_t *rootrp;
		servinfo4_t *svp;

		/*
		 * Can't fail over, so mark things dead.
		 *
		 * If rootvp is set, we know we have a distinct
		 * non-root vnode which can be marked dead in
		 * the usual way.
		 *
		 * Then we want to mark the root vnode dead.
		 * Note that if rootvp wasn't set, our vp is
		 * actually the root vnode.
		 */
		if (rootvp != NULL) {
			NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
			    "recov_stale: can't fail over, marking dead rp %s",
			    rnode4info(rp)));
			nfs4_fail_recov(vp, fail_msg, 0, NFS4ERR_STALE);
		} else {
			rootvp = vp;
			VN_HOLD(rootvp);
		}

		/*
		 * Mark root dead, but quietly - since
		 * the root rnode is frequently recreated,
		 * we can encounter this at every access.
		 * Also mark recovery as failed on this VFS.
		 */
		rootrp = VTOR4(rootvp);
		NFS4_DEBUG(nfs4_client_recov_debug, (CE_CONT,
		    "recov_stale: marking dead root rp %s",
		    rnode4info(rootrp)));
		mutex_enter(&rootrp->r_statelock);
		rootrp->r_flags |= (R4RECOVERR | R4STALE);
		rootrp->r_error = ESTALE;
		mutex_exit(&rootrp->r_statelock);
		mutex_enter(&mi->mi_lock);
		mi->mi_error = ESTALE;
		mutex_exit(&mi->mi_lock);

		svp = mi->mi_curr_serv;
		(void) nfs_rw_enter_sig(&svp->sv_lock, RW_WRITER, 0);
		svp->sv_flags |= SV4_ROOT_STALE;
		nfs_rw_exit(&svp->sv_lock);
	}

out:
	if (rootvp)
		VN_RELE(rootvp);
}
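
/*
 * Summary of the recov_stale() decision tree (informational):
 *
 *	GETATTR(file)	GETATTR(root)	action
 *	-------------	-------------	------
 *	OK		(not sent)	nothing to do
 *	STALE		OK		mark the file dead
 *	STALE		STALE		fail over if possible, else mark
 *					both file and root dead (R4STALE)
 *	other error	-		recover, or mark the file dead
 */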

/*
 * Locks.
 */

/*
 * Reclaim all the active (acquired) locks for the given file.
 * If a process lost a lock, the process is sent a SIGLOST.  This is not
 * considered an error.
 *
 * Return values:
 * Errors and status are returned via the nfs4_error_t parameter.
 * If an error indicates that recovery is needed, the caller is responsible
 * for dealing with it.
 */

static void
relock_file(vnode_t *vp, mntinfo4_t *mi, nfs4_error_t *ep,
    fattr4_change pre_change)
{
	locklist_t *locks, *llp;
	rnode4_t *rp;

	ASSERT(ep != NULL);
	nfs4_error_zinit(ep);

	if (VTOMI4(vp)->mi_flags & MI4_LLOCK)
		return;

	nfs4_flush_lock_owners(VTOR4(vp));

	/*
	 * If we get an error that requires recovery actions, just bail out
	 * and let the top-level recovery code handle it.
	 *
	 * If we get some other error, kill the process that owned the lock
	 * and mark its remaining locks (if any) as belonging to NOPID, so
	 * that we don't make any more reclaim requests for that process.
	 */

	rp = VTOR4(vp);
	locks = flk_active_locks_for_vp(vp);
	for (llp = locks; llp != NULL; llp = llp->ll_next) {
		int did_reclaim = 1;

		ASSERT(llp->ll_vp == vp);
		if (llp->ll_flock.l_pid == NOPID)
			continue;
		reclaim_one_lock(vp, &llp->ll_flock, ep, &did_reclaim);
		/*
		 * If we need to restart recovery, stop processing the
		 * list.  Some errors would be recoverable under other
		 * circumstances, but if they happen here we just give up
		 * on the lock.
		 */
		if (nfs4_needs_recovery(ep, TRUE, vp->v_vfsp)) {
			if (ep->error != 0)
				break;
			if (!nfs4_recov_marks_dead(ep->stat))
				break;
		}
		/*
		 * In case the server isn't offering us a grace period, or
		 * if we missed it, we might have opened & locked from
		 * scratch, rather than reopened/reclaimed.
		 * We need to ensure that the object hadn't been otherwise
		 * changed during this time, by comparing the changeinfo.
		 * We get passed the changeinfo from before the reopen by
		 * our caller, in pre_change.
		 * The changeinfo from after the reopen is in rp->r_change,
		 * courtesy of the GETATTR in the reopen.
		 * If they're different, then the file has changed, and we
		 * have to SIGLOST the app.
		 */
		if (ep->error == 0 && ep->stat == NFS4_OK && !did_reclaim) {
			mutex_enter(&rp->r_statelock);
			if (pre_change != rp->r_change)
				ep->stat = NFS4ERR_NO_GRACE;
			mutex_exit(&rp->r_statelock);
		}
		if (ep->error != 0 || ep->stat != NFS4_OK) {
			if (ep->error != 0)
				nfs4_queue_event(RE_FAIL_RELOCK, mi,
				    NULL, ep->error, vp, NULL, 0, NULL,
				    llp->ll_flock.l_pid, TAG_NONE, TAG_NONE,
				    0, 0);
			else
				nfs4_queue_event(RE_FAIL_RELOCK, mi,
				    NULL, 0, vp, NULL, ep->stat, NULL,
				    llp->ll_flock.l_pid, TAG_NONE, TAG_NONE,
				    0, 0);
			nfs4_send_siglost(llp->ll_flock.l_pid, mi, vp, TRUE,
			    ep->error, ep->stat);
			relock_skip_pid(llp, llp->ll_flock.l_pid);

			/* Reinitialize the nfs4_error and continue */
			nfs4_error_zinit(ep);
		}
	}

	if (locks != NULL)
		flk_free_locklist(locks);
}
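
/*
 * Example of the pre_change check above (informational): if the file's
 * change attribute was, say, 100 before the server was lost, and the
 * GETATTR done by the reopen reports r_change == 100, the lock was
 * re-acquired against unmodified data and the reclaim is treated as
 * successful.  If the reopen reports 101 instead, some other client
 * modified the file while we held no lock, so the application is sent
 * SIGLOST via the NFS4ERR_NO_GRACE path.
 */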

/*
 * Reclaim the given lock.
 * If the lock can't be reclaimed, the process is sent SIGLOST, but this is
 * not considered an error.
 *
 * Errors are returned via the nfs4_error_t parameter.
 */
static void
reclaim_one_lock(vnode_t *vp, flock64_t *flk, nfs4_error_t *ep,
    int *did_reclaimp)
{
	cred_t *cr;
	rnode4_t *rp = VTOR4(vp);

	cr = pid_to_cr(flk->l_pid);
	if (cr == NULL) {
		nfs4_error_zinit(ep);
		ep->error = ESRCH;
		return;
	}

	do {
		mutex_enter(&rp->r_statelock);
		if (rp->r_flags & R4RECOVERR) {
			/*
			 * This shouldn't affect other reclaims, so don't
			 * return an error.
			 */
			mutex_exit(&rp->r_statelock);
			break;
		}
		mutex_exit(&rp->r_statelock);

		nfs4frlock(NFS4_LCK_CTYPE_RECLAIM, vp, F_SETLK, flk,
		    FREAD|FWRITE, 0, cr, ep, NULL, did_reclaimp);
		if (ep->error == 0 && ep->stat == NFS4ERR_FHEXPIRED)
			start_recovery_action(NR_FHEXPIRED, TRUE, VTOMI4(vp),
			    vp, NULL);
	} while (ep->error == 0 && ep->stat == NFS4ERR_FHEXPIRED);

	crfree(cr);
}

/*
 * Open files.
 */

/*
 * Verifies if the nfsstat4 is a valid error for marking this vnode dead.
 * Returns 1 if the error is valid; 0 otherwise.
 */
static int
nfs4_valid_recov_err_for_vp(vnode_t *vp, nfsstat4 stat)
{
	/*
	 * We should not be marking non-regular files as dead,
	 * except in very rare cases (e.g., NFS4ERR_BADHANDLE or
	 * NFS4ERR_BADNAME).
	 */
	if (vp->v_type != VREG && stat != NFS4ERR_BADHANDLE &&
	    stat != NFS4ERR_BADNAME)
		return (0);

	return (1);
}

/*
 * Failed attempting to recover a filehandle.  If 'stat' is valid for 'vp',
 * then mark the object dead.  Since we've had to do a lookup for
 * filehandle recovery, we will mark the object dead if we got NOENT.
 */
static void
nfs4_recov_fh_fail(vnode_t *vp, int error, nfsstat4 stat)
{
	ASSERT(vp != NULL);

	if ((error == 0) && (stat != NFS4ERR_NOENT) &&
	    (!nfs4_valid_recov_err_for_vp(vp, stat)))
		return;

	nfs4_fail_recov(vp, "can't recover filehandle", error, stat);
}

/*
 * Recovery from a "shouldn't happen" error.  In the long term, we'd like
 * to mark only the data structure(s) that provided the bad value as being
 * bad.  But for now we'll just mark the entire file.
 */

static void
recov_badstate(recov_info_t *recovp, vnode_t *vp, nfsstat4 stat)
{
	ASSERT(vp != NULL);
	recov_throttle(recovp, vp);

	if (!nfs4_valid_recov_err_for_vp(vp, stat))
		return;

	nfs4_fail_recov(vp, "", 0, stat);
}

/*
 * Free up the information saved for a lost state request.
 */
static void
nfs4_free_lost_rqst(nfs4_lost_rqst_t *lrp, nfs4_server_t *sp)
{
	component4 *filep;
	nfs4_open_stream_t *osp;
	int have_sync_lock;

	NFS4_DEBUG(nfs4_lost_rqst_debug,
	    (CE_NOTE, "nfs4_free_lost_rqst:"));

	switch (lrp->lr_op) {
	case OP_OPEN:
		filep = &lrp->lr_ofile;
		if (filep->utf8string_val) {
			kmem_free(filep->utf8string_val, filep->utf8string_len);
			filep->utf8string_val = NULL;
		}
		break;
	case OP_DELEGRETURN:
		nfs4delegreturn_cleanup(VTOR4(lrp->lr_vp), sp);
		break;
	case OP_CLOSE:
		osp = lrp->lr_osp;
		ASSERT(osp != NULL);
		mutex_enter(&osp->os_sync_lock);
		have_sync_lock = 1;
		if (osp->os_pending_close) {
			/* clean up the open file state. */
			osp->os_pending_close = 0;
			nfs4close_notw(lrp->lr_vp, osp, &have_sync_lock);
		}
		if (have_sync_lock)
			mutex_exit(&osp->os_sync_lock);
		break;
	}

	lrp->lr_op = 0;
	if (lrp->lr_oop != NULL) {
		open_owner_rele(lrp->lr_oop);
		lrp->lr_oop = NULL;
	}
	if (lrp->lr_osp != NULL) {
		open_stream_rele(lrp->lr_osp, VTOR4(lrp->lr_vp));
		lrp->lr_osp = NULL;
	}
	if (lrp->lr_lop != NULL) {
		lock_owner_rele(lrp->lr_lop);
		lrp->lr_lop = NULL;
	}
	if (lrp->lr_flk != NULL) {
		kmem_free(lrp->lr_flk, sizeof (flock64_t));
		lrp->lr_flk = NULL;
	}
	if (lrp->lr_vp != NULL) {
		VN_RELE(lrp->lr_vp);
		lrp->lr_vp = NULL;
	}
	if (lrp->lr_dvp != NULL) {
		VN_RELE(lrp->lr_dvp);
		lrp->lr_dvp = NULL;
	}
	if (lrp->lr_cr != NULL) {
		crfree(lrp->lr_cr);
		lrp->lr_cr = NULL;
	}

	kmem_free(lrp, sizeof (nfs4_lost_rqst_t));
}

/*
 * Remove any lost state requests and free them.
 */
static void
nfs4_remove_lost_rqsts(mntinfo4_t *mi, nfs4_server_t *sp)
{
	nfs4_lost_rqst_t *lrp;

	mutex_enter(&mi->mi_lock);
	while ((lrp = list_head(&mi->mi_lost_state)) != NULL) {
		list_remove(&mi->mi_lost_state, lrp);
		mutex_exit(&mi->mi_lock);
		nfs4_free_lost_rqst(lrp, sp);
		mutex_enter(&mi->mi_lock);
	}
	mutex_exit(&mi->mi_lock);
}
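
/*
 * Life cycle of a lost state request (informational): a failed OTW call
 * queues an nfs4_lost_rqst_t on mi_lost_state via nfs4_save_lost_rqst()
 * below; the recovery thread later replays it via
 * nfs4_resend_lost_rqsts()/resend_one_op(); and every entry eventually
 * passes through nfs4_free_lost_rqst() above, which drops the holds
 * (vnode, open owner, open stream, lock owner, cred) taken when the
 * request was saved.
 */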

/*
 * Reopen all the files for the given filesystem and reclaim any locks.
 */

static void
recov_openfiles(recov_info_t *recovp, nfs4_server_t *sp)
{
	mntinfo4_t *mi = recovp->rc_mi;
	nfs4_opinst_t *reopenlist = NULL, *rep;
	nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };
	open_claim_type4 claim;
	int remap;
	char *fail_msg = "No such file or directory on replica";
	rnode4_t *rp;
	fattr4_change pre_change;

	ASSERT(sp != NULL);

	/*
	 * This check is to allow a 10ms pause before we reopen files;
	 * it should allow the server time to have received the CB_NULL
	 * reply and update its internal structures such that (if
	 * applicable) we are granted a delegation on reopened files.
	 */
	mutex_enter(&sp->s_lock);
	if ((sp->s_flags & (N4S_CB_PINGED | N4S_CB_WAITER)) == 0) {
		sp->s_flags |= N4S_CB_WAITER;
		(void) cv_timedwait(&sp->wait_cb_null, &sp->s_lock,
		    (lbolt + drv_usectohz(N4S_CB_PAUSE_TIME)));
	}
	mutex_exit(&sp->s_lock);

	(void) nfs_rw_enter_sig(&sp->s_recovlock, RW_READER, 0);
	(void) nfs_rw_enter_sig(&mi->mi_recovlock, RW_WRITER, 0);

	if (NFS4_VOLATILE_FH(mi)) {
		nfs4_remap_root(mi, &e, 0);
		if (nfs4_needs_recovery(&e, FALSE, mi->mi_vfsp)) {
			(void) nfs4_start_recovery(&e, mi, NULL,
			    NULL, NULL, NULL, OP_LOOKUP, NULL);
		}
	}

	mutex_enter(&mi->mi_lock);
	if (recovp->rc_srv_reboot || (mi->mi_recovflags & MI4R_SRV_REBOOT))
		claim = CLAIM_PREVIOUS;
	else
		claim = CLAIM_NULL;
	mutex_exit(&mi->mi_lock);

	if (e.error == 0 && e.stat == NFS4_OK) {
		/*
		 * Get a snapshot of open files in the filesystem.  Note
		 * that new opens will stall until the server's grace
		 * period is done.
		 */
		reopenlist = r4mkopenlist(mi);

		mutex_enter(&mi->mi_lock);
		remap = mi->mi_recovflags & MI4R_REMAP_FILES;
		mutex_exit(&mi->mi_lock);
		/*
		 * Since we are re-establishing state on the
		 * server, it's OK to blow away the saved lost
		 * requests, since we don't need to reissue them.
		 */
		nfs4_remove_lost_rqsts(mi, sp);

		for (rep = reopenlist; rep; rep = rep->re_next) {

			if (remap) {
				nfs4_remap_file(mi, rep->re_vp,
				    NFS4_REMAP_CKATTRS, &e);
			}
			if (e.error == ENOENT || e.stat == NFS4ERR_NOENT) {
				/*
				 * The current server does not have the file
				 * that is to be remapped.  This is most
				 * likely due to an improperly maintained
				 * replica.  The files that are missing from
				 * the server will be marked dead and logged
				 * in order to make sys admins aware of the
				 * problem.
				 */
				nfs4_fail_recov(rep->re_vp,
				    fail_msg, e.error, e.stat);
				/*
				 * We've already handled the error so clear it.
				 */
				nfs4_error_zinit(&e);
				continue;
			} else if (e.error == 0 && e.stat == NFS4_OK) {
				int j;

				rp = VTOR4(rep->re_vp);
				mutex_enter(&rp->r_statelock);
				pre_change = rp->r_change;
				mutex_exit(&rp->r_statelock);

				for (j = 0; j < rep->re_numosp; j++) {
					nfs4_reopen(rep->re_vp, rep->re_osp[j],
					    &e, claim, FALSE, TRUE);
					if (e.error != 0 || e.stat != NFS4_OK)
						break;
				}
				if (nfs4_needs_recovery(&e, TRUE,
				    mi->mi_vfsp)) {
					(void) nfs4_start_recovery(&e, mi,
					    rep->re_vp, NULL, NULL, NULL,
					    OP_OPEN, NULL);
					break;
				}
			}
#ifdef DEBUG
			if (nfs4_recovdelay > 0)
				delay(MSEC_TO_TICK(nfs4_recovdelay * 1000));
#endif
			if (e.error == 0 && e.stat == NFS4_OK)
				relock_file(rep->re_vp, mi, &e, pre_change);

			if (nfs4_needs_recovery(&e, TRUE, mi->mi_vfsp))
				(void) nfs4_start_recovery(&e, mi,
				    rep->re_vp, NULL, NULL, NULL, OP_LOCK,
				    NULL);
			if (e.error != 0 || e.stat != NFS4_OK)
				break;
		}

		/*
		 * Check to see if we need to remap files passed in
		 * via the recovery arguments; this will have been
		 * done for open files.  A failure here is not fatal.
		 */
		if (remap) {
			nfs4_error_t ignore;
			nfs4_check_remap(mi, recovp->rc_vp1, NFS4_REMAP_CKATTRS,
			    &ignore);
			nfs4_check_remap(mi, recovp->rc_vp2, NFS4_REMAP_CKATTRS,
			    &ignore);
		}
	}

	if (e.error == 0 && e.stat == NFS4_OK) {
		mutex_enter(&mi->mi_lock);
		mi->mi_recovflags &= ~(MI4R_REOPEN_FILES | MI4R_REMAP_FILES);
		mutex_exit(&mi->mi_lock);
	}

	nfs_rw_exit(&mi->mi_recovlock);
	nfs_rw_exit(&sp->s_recovlock);

	if (reopenlist != NULL)
		r4releopenlist(reopenlist);
}
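
/*
 * Per-file order of operations in the reopen loop above (informational):
 *
 *	1. remap the filehandle if MI4R_REMAP_FILES is set (failover);
 *	2. nfs4_reopen() each open stream, with CLAIM_PREVIOUS after a
 *	   server reboot or CLAIM_NULL otherwise;
 *	3. relock_file() to reclaim posix locks, using the change
 *	   attribute snapshot (pre_change) to detect modification.
 *
 * Any error that itself needs recovery re-queues recovery and stops
 * the loop.
 */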

/*
 * Resend the queued lost state recovery requests for the filesystem
 * in "recovp".
 */

static void
nfs4_resend_lost_rqsts(recov_info_t *recovp, nfs4_server_t *sp)
{
	nfs4_lost_rqst_t *lrp, *tlrp;
	mntinfo4_t *mi = recovp->rc_mi;
	nfs4_error_t n4e;
#ifdef NOTYET
	uint32_t deny_bits = 0;
#endif

	NFS4_DEBUG(nfs4_lost_rqst_debug, (CE_NOTE, "nfs4_resend_lost_rqsts"));

	ASSERT(mi != NULL);
	ASSERT(nfs_rw_lock_held(&mi->mi_recovlock, RW_WRITER));

	mutex_enter(&mi->mi_lock);
	lrp = list_head(&mi->mi_lost_state);
	mutex_exit(&mi->mi_lock);
	while (lrp != NULL) {
		nfs4_error_zinit(&n4e);
		resend_one_op(lrp, &n4e, mi, sp);
		NFS4_DEBUG(nfs4_lost_rqst_debug, (CE_NOTE,
		    "nfs4_resend_lost_rqsts: resend request: for vp %p got "
		    "error %d stat %d", (void *)lrp->lr_vp, n4e.error,
		    n4e.stat));

		/*
		 * If we get a recovery error that we can actually
		 * recover from (such as ETIMEDOUT, FHEXPIRED), we
		 * return and let the recovery thread redrive the call.
		 * Don't requeue unless the zone is still healthy.
		 */
		if (zone_status_get(curproc->p_zone) < ZONE_IS_SHUTTING_DOWN &&
		    nfs4_needs_recovery(&n4e, TRUE, mi->mi_vfsp) &&
		    (nfs4_try_failover(&n4e) ||
		    NFS4_FRC_UNMT_ERR(n4e.error, mi->mi_vfsp) ||
		    (n4e.error == 0 && n4e.stat != NFS4ERR_BADHANDLE &&
		    !nfs4_recov_marks_dead(n4e.stat)))) {
			/*
			 * For these three errors, we want to delay a bit
			 * instead of pounding the server into submission.
			 * We have to do this manually; the normal
			 * processing for these errors only works for
			 * non-recovery requests.
			 */
			if ((n4e.error == 0 && n4e.stat == NFS4ERR_DELAY) ||
			    (n4e.error == 0 && n4e.stat == NFS4ERR_GRACE) ||
			    (n4e.error == 0 && n4e.stat == NFS4ERR_RESOURCE) ||
			    NFS4_FRC_UNMT_ERR(n4e.error, mi->mi_vfsp)) {
				delay(SEC_TO_TICK(nfs4err_delay_time));
			} else {
				(void) nfs4_start_recovery(&n4e,
				    mi, lrp->lr_dvp, lrp->lr_vp, NULL, NULL,
				    lrp->lr_op, NULL);
			}
			return;
		}

		mutex_enter(&mi->mi_lock);
		list_remove(&mi->mi_lost_state, lrp);
		tlrp = lrp;
		lrp = list_head(&mi->mi_lost_state);
		mutex_exit(&mi->mi_lock);
		nfs4_free_lost_rqst(tlrp, sp);
	}
}

/*
 * Resend the given op, and issue any necessary undo call.
 * Errors are returned via the nfs4_error_t parameter.
 */

static void
resend_one_op(nfs4_lost_rqst_t *lrp, nfs4_error_t *ep,
    mntinfo4_t *mi, nfs4_server_t *sp)
{
	vnode_t *vp;
	nfs4_open_stream_t *osp;
	cred_t *cr;
	uint32_t acc_bits;

	vp = lrp->lr_vp;
	NFS4_DEBUG(nfs4_lost_rqst_debug, (CE_NOTE, "resend_one_op: "
	    "have a lost open/close request for vp %p", (void *)vp));

	switch (lrp->lr_op) {
	case OP_OPEN:
		nfs4_resend_open_otw(&vp, lrp, ep);
		break;
	case OP_OPEN_DOWNGRADE:
		ASSERT(lrp->lr_oop != NULL);
		ep->error = nfs4_start_open_seqid_sync(lrp->lr_oop, mi);
		ASSERT(!ep->error);	/* recov thread always succeeds */
		ASSERT(lrp->lr_osp != NULL);
		mutex_enter(&lrp->lr_osp->os_sync_lock);
		nfs4_open_downgrade(lrp->lr_dg_acc, lrp->lr_dg_deny,
		    lrp->lr_oop, lrp->lr_osp, vp, lrp->lr_cr, lrp,
		    ep, NULL, NULL);
		mutex_exit(&lrp->lr_osp->os_sync_lock);
		nfs4_end_open_seqid_sync(lrp->lr_oop);
		break;
	case OP_CLOSE:
		osp = lrp->lr_osp;
		cr = lrp->lr_cr;
		acc_bits = 0;
		mutex_enter(&osp->os_sync_lock);
		if (osp->os_share_acc_read)
			acc_bits |= OPEN4_SHARE_ACCESS_READ;
		if (osp->os_share_acc_write)
			acc_bits |= OPEN4_SHARE_ACCESS_WRITE;
		mutex_exit(&osp->os_sync_lock);
		nfs4close_one(vp, osp, cr, acc_bits, lrp, ep,
		    CLOSE_RESEND, 0, 0, 0);
		break;
	case OP_LOCK:
	case OP_LOCKU:
		resend_lock(lrp, ep);
		goto done;
	case OP_DELEGRETURN:
		nfs4_resend_delegreturn(lrp, ep, sp);
		goto done;
	default:
#ifdef DEBUG
		cmn_err(CE_PANIC, "resend_one_op: unexpected op: %d",
		    lrp->lr_op);
#endif
		nfs4_queue_event(RE_LOST_STATE_BAD_OP, mi, NULL,
		    lrp->lr_op, lrp->lr_vp, lrp->lr_dvp, NFS4_OK, NULL, 0,
		    TAG_NONE, TAG_NONE, 0, 0);
		nfs4_error_init(ep, EINVAL);
		return;
	}

	/*
	 * No need to retry nor send an "undo" CLOSE in the
	 * event the server rebooted.
	 */
	if (ep->error == 0 && (ep->stat == NFS4ERR_STALE_CLIENTID ||
	    ep->stat == NFS4ERR_STALE_STATEID || ep->stat == NFS4ERR_EXPIRED))
		goto done;

	/*
	 * If we resent a CLOSE or OPEN_DOWNGRADE, there's nothing
	 * to undo.  Undoing locking operations was handled by
	 * resend_lock().
	 */
	if (lrp->lr_op == OP_OPEN_DOWNGRADE || lrp->lr_op == OP_CLOSE)
		goto done;

	/*
	 * If we get any other error for OPEN, then don't attempt
	 * to undo the resend of the open (since it was never
	 * successful!).
	 */
	ASSERT(lrp->lr_op == OP_OPEN);
	if (ep->error || ep->stat != NFS4_OK)
		goto done;

	/*
	 * Now let's undo our OPEN.
	 */
	nfs4_error_zinit(ep);
	close_after_open_resend(vp, lrp->lr_cr, lrp->lr_oacc, ep);
	NFS4_DEBUG(nfs4_lost_rqst_debug, (CE_NOTE, "resend_one_op: "
	    "nfs4close_one: for vp %p got error %d stat %d",
	    (void *)vp, ep->error, ep->stat));

done:
	if (vp != lrp->lr_vp)
		VN_RELE(vp);
}
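
/*
 * Note on the OPEN undo above (informational): a lost OPEN is typically
 * the result of an interrupted system call, so the application never
 * received an open file.  The resend exists only to bring the server's
 * state (seqids, open owner state) back in sync with the client's;
 * once that is established, the open is released again via
 * close_after_open_resend().
 */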

/*
 * Close a file that was opened via a resent OPEN.
 * Most errors are passed back to the caller (via the nfs4_error_t
 * parameter), except for FHEXPIRED, which is retried.
 *
 * It might be conceptually cleaner to push the CLOSE request onto the
 * front of the resend queue, rather than sending it here.  That would
 * match the way we undo lost lock requests.  On the other
 * hand, we've already got something that works, and there's no reason to
 * change it at this time.
 */

static void
close_after_open_resend(vnode_t *vp, cred_t *cr, uint32_t acc_bits,
    nfs4_error_t *ep)
{

	for (;;) {
		nfs4close_one(vp, NULL, cr, acc_bits, NULL, ep,
		    CLOSE_AFTER_RESEND, 0, 0, 0);
		if (ep->error == 0 && ep->stat == NFS4_OK)
			break;	/* success; done */
		if (ep->error != 0 || ep->stat != NFS4ERR_FHEXPIRED)
			break;
		/* else retry FHEXPIRED */
	}

}

/*
 * Resend the given lost lock request.  Errors are returned via the
 * nfs4_error_t parameter.
 *
 * Issue a SIGLOST and mark the rnode dead if we get a non-recovery error or
 * a recovery error that we don't actually recover from yet (e.g.,
 * BAD_SEQID).
 * Let the recovery thread redrive the call if we get a recovery error that
 * we can actually recover from.
 */
static void
resend_lock(nfs4_lost_rqst_t *lrp, nfs4_error_t *ep)
{
	bool_t send_siglost = FALSE;
	vnode_t *vp = lrp->lr_vp;

	NFS4_DEBUG(nfs4_lost_rqst_debug, (CE_NOTE, "resend_lock:"));
	ASSERT(lrp->lr_ctype == NFS4_LCK_CTYPE_REINSTATE ||
	    lrp->lr_ctype == NFS4_LCK_CTYPE_RESEND);

	nfs4frlock(lrp->lr_ctype, vp, F_SETLK,
	    lrp->lr_flk, FREAD|FWRITE, 0, lrp->lr_cr, ep, lrp, NULL);

	NFS4_DEBUG(nfs4_lost_rqst_debug, (CE_NOTE, "resend_lock: "
	    "nfs4frlock for vp %p returned error %d, stat %d",
	    (void *)vp, ep->error, ep->stat));

	if (ep->error == 0 && ep->stat == 0)
		goto done;
	if (ep->error == 0 && ep->stat == NFS4ERR_DENIED &&
	    lrp->lr_ctype == NFS4_LCK_CTYPE_RESEND)
		goto done;

	/*
	 * If we failed with a non-recovery error, send SIGLOST and
	 * mark the file dead.
	 */
	if (!nfs4_needs_recovery(ep, TRUE, vp->v_vfsp))
		send_siglost = TRUE;
	else {
		/*
		 * Done with recovering LOST LOCK in the event the
		 * server rebooted or we've lost the lease.
		 */
		if (ep->error == 0 && (ep->stat == NFS4ERR_STALE_CLIENTID ||
		    ep->stat == NFS4ERR_STALE_STATEID ||
		    ep->stat == NFS4ERR_EXPIRED)) {
			goto done;
		}

		/*
		 * BAD_STATEID on an unlock indicates that the server has
		 * forgotten about the lock anyway, so act like the call
		 * was successful.
		 */
		if (ep->error == 0 && ep->stat == NFS4ERR_BAD_STATEID &&
		    lrp->lr_op == OP_LOCKU)
			goto done;

		/*
		 * If we got a recovery error that we don't actually
		 * recover from, send SIGLOST.  If the filesystem was
		 * forcibly unmounted, we skip the SIGLOST because (a) it's
		 * unnecessary noise, and (b) there could be a new process
		 * with the same pid as the one that had generated the lost
		 * state request.
		 */
		if (ep->error == 0 && (ep->stat == NFS4ERR_BADHANDLE ||
		    nfs4_recov_marks_dead(ep->stat))) {
			if (!(vp->v_vfsp->vfs_flag & VFS_UNMOUNTED))
				send_siglost = TRUE;
			goto done;
		}

		/*
		 * If the filesystem was forcibly unmounted, we
		 * still need to synchronize with the server and
		 * release state.  Try again later.
		 */
		if (NFS4_FRC_UNMT_ERR(ep->error, vp->v_vfsp))
			goto done;

		/*
		 * If we get a recovery error that we can actually
		 * recover from (such as ETIMEDOUT, FHEXPIRED),
		 * return and let the recovery thread redrive the call.
		 *
		 * For the three errors below, we want to delay a bit
		 * instead of pounding the server into submission.
		 */
		if ((ep->error == 0 && ep->stat == NFS4ERR_DELAY) ||
		    (ep->error == 0 && ep->stat == NFS4ERR_GRACE) ||
		    (ep->error == 0 && ep->stat == NFS4ERR_RESOURCE))
			delay(SEC_TO_TICK(recov_err_delay));
		goto done;
	}

done:
	if (send_siglost) {
		cred_t *sv_cred;

		/*
		 * Must be root or the actual thread being issued the
		 * SIGLOST for this to work, so just become root.
		 */
		sv_cred = curthread->t_cred;
		curthread->t_cred = kcred;
		nfs4_send_siglost(lrp->lr_flk->l_pid, VTOMI4(vp), vp, FALSE,
		    ep->error, ep->stat);
		curthread->t_cred = sv_cred;

		/*
		 * Flush any additional reinstantiation requests for
		 * this operation.  Sending multiple SIGLOSTs to the user
		 * process is unlikely to help and may cause trouble.
		 */
		if (lrp->lr_ctype == NFS4_LCK_CTYPE_REINSTATE)
			flush_reinstate(lrp);
	}
}

/*
 * Remove any lock reinstantiation requests that correspond to the given
 * lost request.  We only remove items that follow lrp in the queue,
 * assuming that lrp will be removed by the generic lost state code.
 */

static void
flush_reinstate(nfs4_lost_rqst_t *lrp)
{
	vnode_t *vp;
	pid_t pid;
	mntinfo4_t *mi;
	nfs4_lost_rqst_t *nlrp;

	vp = lrp->lr_vp;
	mi = VTOMI4(vp);
	pid = lrp->lr_flk->l_pid;

	/*
	 * If there are any more reinstantiation requests to get rid of,
	 * they should all be clustered at the front of the lost state
	 * queue.
	 */
	mutex_enter(&mi->mi_lock);
	for (lrp = list_next(&mi->mi_lost_state, lrp); lrp != NULL;
	    lrp = nlrp) {
		nlrp = list_next(&mi->mi_lost_state, lrp);
		if (lrp->lr_op != OP_LOCK && lrp->lr_op != OP_LOCKU)
			break;
		if (lrp->lr_ctype != NFS4_LCK_CTYPE_REINSTATE)
			break;
		ASSERT(lrp->lr_vp == vp);
		ASSERT(lrp->lr_flk->l_pid == pid);
		NFS4_DEBUG(nfs4_lost_rqst_debug, (CE_NOTE,
		    "remove reinstantiation %p", (void *)lrp));
		list_remove(&mi->mi_lost_state, lrp);
		nfs4_free_lost_rqst(lrp, NULL);
	}
	mutex_exit(&mi->mi_lock);
}
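
/*
 * Summary of resend_lock() outcomes (informational):
 *
 *	result					disposition
 *	------					-----------
 *	success, or DENIED on a resend		done, no action
 *	STALE_CLIENTID/STALE_STATEID/EXPIRED	done; reboot recovery
 *						handles the rest
 *	BAD_STATEID on LOCKU			treated as success
 *	non-recoverable error			SIGLOST (unless forced
 *						unmount)
 *	recoverable error			recovery thread redrives
 */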

/*
 * End of state-specific recovery routines.
 */

/*
 * Allocate a lost request struct, initialize it from lost_rqstp (including
 * bumping the reference counts for the referenced vnode, etc.), and hang
 * it off of recovp.
 */

static void
nfs4_save_lost_rqst(nfs4_lost_rqst_t *lost_rqstp, recov_info_t *recovp,
    nfs4_recov_t *action, mntinfo4_t *mi)
{
	nfs4_lost_rqst_t *destp;

	ASSERT(recovp->rc_lost_rqst == NULL);

	destp = kmem_alloc(sizeof (nfs4_lost_rqst_t), KM_SLEEP);
	recovp->rc_lost_rqst = destp;

	if (lost_rqstp->lr_op == OP_LOCK ||
	    lost_rqstp->lr_op == OP_LOCKU) {
		ASSERT(lost_rqstp->lr_lop);
		*action = NR_LOST_LOCK;
		destp->lr_ctype = lost_rqstp->lr_ctype;
		destp->lr_locktype = lost_rqstp->lr_locktype;
	} else if (lost_rqstp->lr_op == OP_OPEN) {
		component4 *srcfp, *destfp;

		destp->lr_oacc = lost_rqstp->lr_oacc;
		destp->lr_odeny = lost_rqstp->lr_odeny;
		destp->lr_oclaim = lost_rqstp->lr_oclaim;
		if (lost_rqstp->lr_oclaim == CLAIM_DELEGATE_CUR)
			destp->lr_ostateid = lost_rqstp->lr_ostateid;

		srcfp = &lost_rqstp->lr_ofile;
		destfp = &destp->lr_ofile;
		/*
		 * Consume caller's utf8string
		 */
		destfp->utf8string_len = srcfp->utf8string_len;
		destfp->utf8string_val = srcfp->utf8string_val;
		srcfp->utf8string_len = 0;
		srcfp->utf8string_val = NULL;	/* make sure not reused */

		*action = NR_LOST_STATE_RQST;
	} else if (lost_rqstp->lr_op == OP_OPEN_DOWNGRADE) {
		destp->lr_dg_acc = lost_rqstp->lr_dg_acc;
		destp->lr_dg_deny = lost_rqstp->lr_dg_deny;

		*action = NR_LOST_STATE_RQST;
	} else if (lost_rqstp->lr_op == OP_CLOSE) {
		ASSERT(lost_rqstp->lr_oop);
		*action = NR_LOST_STATE_RQST;
	} else if (lost_rqstp->lr_op == OP_DELEGRETURN) {
		*action = NR_LOST_STATE_RQST;
	} else {
#ifdef DEBUG
		cmn_err(CE_PANIC, "nfs4_save_lost_rqst: bad op %d",
		    lost_rqstp->lr_op);
#endif
		nfs4_queue_event(RE_LOST_STATE_BAD_OP, mi, NULL,
		    lost_rqstp->lr_op, lost_rqstp->lr_vp, lost_rqstp->lr_dvp,
		    NFS4_OK, NULL, curproc->p_pid, TAG_NONE, TAG_NONE, 0, 0);
		*action = NR_UNUSED;
		recovp->rc_lost_rqst = NULL;
		kmem_free(destp, sizeof (nfs4_lost_rqst_t));
		return;
	}

	destp->lr_op = lost_rqstp->lr_op;
	destp->lr_vp = lost_rqstp->lr_vp;
	if (destp->lr_vp)
		VN_HOLD(destp->lr_vp);
	destp->lr_dvp = lost_rqstp->lr_dvp;
	if (destp->lr_dvp)
		VN_HOLD(destp->lr_dvp);
	destp->lr_oop = lost_rqstp->lr_oop;
	if (destp->lr_oop)
		open_owner_hold(destp->lr_oop);
	destp->lr_osp = lost_rqstp->lr_osp;
	if (destp->lr_osp)
		open_stream_hold(destp->lr_osp);
	destp->lr_lop = lost_rqstp->lr_lop;
	if (destp->lr_lop)
		lock_owner_hold(destp->lr_lop);
	destp->lr_cr = lost_rqstp->lr_cr;
	if (destp->lr_cr)
		crhold(destp->lr_cr);
	if (lost_rqstp->lr_flk == NULL)
		destp->lr_flk = NULL;
	else {
		destp->lr_flk = kmem_alloc(sizeof (flock64_t), KM_SLEEP);
		*destp->lr_flk = *lost_rqstp->lr_flk;
	}
	destp->lr_putfirst = lost_rqstp->lr_putfirst;
}

/*
 * Map the given return values (errno and nfs4 status code) to a recovery
 * action and fill in the following fields of recovp: rc_action,
 * rc_srv_reboot, rc_stateid, rc_lost_rqst.
 */

static void
errs_to_action(recov_info_t *recovp,
    nfs4_server_t *sp, mntinfo4_t *mi, stateid4 *sidp,
    nfs4_lost_rqst_t *lost_rqstp, int unmounted, nfs_opnum4 op,
    nfs4_bseqid_entry_t *bsep)
{
	nfs4_recov_t action = NR_UNUSED;
	bool_t reboot = FALSE;
	int try_f;
	int error = recovp->rc_orig_errors.error;
	nfsstat4 stat = recovp->rc_orig_errors.stat;

	bzero(&recovp->rc_stateid, sizeof (stateid4));
	recovp->rc_lost_rqst = NULL;
	recovp->rc_bseqid_rqst = NULL;

	try_f = nfs4_try_failover(&recovp->rc_orig_errors) &&
	    FAILOVER_MOUNT4(mi);

	/*
	 * We start recovery for EINTR only in the lost lock
	 * or lost open/close case.
	 */

	if (try_f || error == EINTR || (error == EIO && unmounted)) {
		recovp->rc_error = (error != 0 ? error : geterrno4(stat));
		if (lost_rqstp) {
			ASSERT(lost_rqstp->lr_op != 0);
			nfs4_save_lost_rqst(lost_rqstp, recovp, &action, mi);
		}
		if (try_f)
			action = NR_FAILOVER;
	} else if (error != 0) {
		recovp->rc_error = error;
		nfs4_queue_event(RE_UNEXPECTED_ERRNO, mi, NULL, error, NULL,
		    NULL, 0, NULL, 0, TAG_NONE, TAG_NONE, 0, 0);
		action = NR_CLIENTID;
	} else {
		recovp->rc_error = geterrno4(stat);
		switch (stat) {
#ifdef notyet
		case NFS4ERR_LEASE_MOVED:
			action = xxx;
			break;
		case NFS4ERR_MOVED:
			action = xxx;
			break;
#endif
		case NFS4ERR_BADHANDLE:
			action = NR_BADHANDLE;
			break;
		case NFS4ERR_BAD_SEQID:
			if (bsep)
				save_bseqid_rqst(bsep, recovp);
			action = NR_BAD_SEQID;
			break;
		case NFS4ERR_OLD_STATEID:
			action = NR_OLDSTATEID;
			break;
		case NFS4ERR_WRONGSEC:
			action = NR_WRONGSEC;
			break;
		case NFS4ERR_FHEXPIRED:
			action = NR_FHEXPIRED;
			break;
		case NFS4ERR_BAD_STATEID:
			if (sp == NULL || (sp != NULL && inlease(sp))) {

				action = NR_BAD_STATEID;
				if (sidp)
					recovp->rc_stateid = *sidp;
			} else
				action = NR_CLIENTID;
			break;
		case NFS4ERR_EXPIRED:
			/*
			 * The client's lease has expired, either due
			 * to a network partition or perhaps a client
			 * error.  In either case, try an NR_CLIENTID
			 * style recovery.  reboot remains false, since
			 * there is no evidence the server has rebooted.
			 * This will cause CLAIM_NULL opens and lock
			 * requests without the reclaim bit.
			 */
			action = NR_CLIENTID;

			DTRACE_PROBE4(nfs4__expired,
			    nfs4_server_t *, sp,
			    mntinfo4_t *, mi,
			    stateid4 *, sidp, int, op);

			break;
		case NFS4ERR_STALE_CLIENTID:
		case NFS4ERR_STALE_STATEID:
			action = NR_CLIENTID;
			reboot = TRUE;
			break;
		case NFS4ERR_RESOURCE:
			/*
			 * If this had been a FAILOVER mount, then
			 * we'd have tried failover.  Since it's not,
			 * just delay a while and retry.
			 */
			action = NR_DELAY;
			break;
		case NFS4ERR_GRACE:
			action = NR_GRACE;
			break;
		case NFS4ERR_DELAY:
			action = NR_DELAY;
			break;
		case NFS4ERR_STALE:
			action = NR_STALE;
			break;
		default:
			nfs4_queue_event(RE_UNEXPECTED_STATUS, mi, NULL, 0,
			    NULL, NULL, stat, NULL, 0, TAG_NONE, TAG_NONE,
			    0, 0);
			action = NR_CLIENTID;
			break;
		}
	}

	/* make sure action got set */
	ASSERT(action != NR_UNUSED);
	recovp->rc_srv_reboot = reboot;
	recovp->rc_action = action;
	nfs4_queue_fact(RF_ERR, mi, stat, action, op, reboot, NULL, error,
	    NULL);
}
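
/*
 * Quick reference for the errs_to_action() status mapping above
 * (informational):
 *
 *	NFS4ERR_BADHANDLE		-> NR_BADHANDLE
 *	NFS4ERR_BAD_SEQID		-> NR_BAD_SEQID
 *	NFS4ERR_OLD_STATEID		-> NR_OLDSTATEID
 *	NFS4ERR_WRONGSEC		-> NR_WRONGSEC
 *	NFS4ERR_FHEXPIRED		-> NR_FHEXPIRED
 *	NFS4ERR_BAD_STATEID		-> NR_BAD_STATEID (in lease),
 *					   else NR_CLIENTID
 *	NFS4ERR_EXPIRED			-> NR_CLIENTID (no reboot)
 *	NFS4ERR_STALE_{CLIENT,STATE}ID	-> NR_CLIENTID (reboot)
 *	NFS4ERR_RESOURCE/DELAY		-> NR_DELAY
 *	NFS4ERR_GRACE			-> NR_GRACE
 *	NFS4ERR_STALE			-> NR_STALE
 *	anything unexpected		-> NR_CLIENTID
 */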

/*
 * Return the (held) credential for the process with the given pid.
 * May return NULL (e.g., process not found).
 */

static cred_t *
pid_to_cr(pid_t pid)
{
	proc_t *p;
	cred_t *cr;

	mutex_enter(&pidlock);
	if ((p = prfind(pid)) == NULL) {
		mutex_exit(&pidlock);
		return (NULL);
	}

	mutex_enter(&p->p_crlock);
	crhold(cr = p->p_cred);
	mutex_exit(&p->p_crlock);
	mutex_exit(&pidlock);

	return (cr);
}

/*
 * Send SIGLOST to the given process and queue the event.
 *
 * The 'dump' boolean tells us whether this action should dump the
 * in-kernel queue of recovery messages or not.
 */

void
nfs4_send_siglost(pid_t pid, mntinfo4_t *mi, vnode_t *vp, bool_t dump,
    int error, nfsstat4 stat)
{
	proc_t *p;

	mutex_enter(&pidlock);
	p = prfind(pid);
	if (p)
		psignal(p, SIGLOST);
	mutex_exit(&pidlock);
	nfs4_queue_event(dump ? RE_SIGLOST : RE_SIGLOST_NO_DUMP, mi,
	    NULL, error, vp, NULL, stat, NULL, pid, TAG_NONE, TAG_NONE, 0, 0);
}

/*
 * Scan the lock list for entries that match the given pid.  Change the
 * pid in those that do to NOPID.
 */

static void
relock_skip_pid(locklist_t *llp, pid_t pid)
{
	for (; llp != NULL; llp = llp->ll_next) {
		if (llp->ll_flock.l_pid == pid)
			llp->ll_flock.l_pid = NOPID;
	}
}

/*
 * Mark a file as having failed recovery, after making a last-ditch effort
 * to return any delegation.
 *
 * Sets r_error to EIO or ESTALE for the given vnode.
 */
void
nfs4_fail_recov(vnode_t *vp, char *why, int error, nfsstat4 stat)
{
	rnode4_t *rp = VTOR4(vp);

#ifdef DEBUG
	if (nfs4_fail_recov_stop)
		debug_enter("nfs4_fail_recov");
#endif

	mutex_enter(&rp->r_statelock);
	if (rp->r_flags & (R4RECOVERR|R4RECOVERRP)) {
		mutex_exit(&rp->r_statelock);
		return;
	}

	/*
	 * Set R4RECOVERRP to indicate that a recovery error is in
	 * progress.  This will shut down reads and writes at the top
	 * half.  Don't set R4RECOVERR until after we've returned the
	 * delegation, otherwise it will fail.
	 */

	rp->r_flags |= R4RECOVERRP;
	mutex_exit(&rp->r_statelock);

	nfs4delegabandon(rp);

	mutex_enter(&rp->r_statelock);
	rp->r_flags |= (R4RECOVERR | R4STALE);
	rp->r_error = (error == 0 && stat == NFS4ERR_STALE) ? ESTALE : EIO;
	PURGE_ATTRCACHE4_LOCKED(rp);
	if (!(vp->v_vfsp->vfs_flag & VFS_UNMOUNTED))
		nfs4_queue_event(RE_DEAD_FILE, VTOMI4(vp), NULL, error,
		    vp, NULL, stat, why, 0, TAG_NONE, TAG_NONE, 0, 0);
	mutex_exit(&rp->r_statelock);

	dnlc_purge_vp(vp);
}

/*
 * recov_throttle: if the file had the same recovery action within the
 * throttle interval, wait for the throttle interval to finish before
 * proceeding.
 *
 * Side effects: updates the rnode with the current recovery information.
 */

static void
recov_throttle(recov_info_t *recovp, vnode_t *vp)
{
	time_t curtime, time_to_wait;
	rnode4_t *rp = VTOR4(vp);

	curtime = gethrestime_sec();

	mutex_enter(&rp->r_statelock);
	NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
	    "recov_throttle: now: (%d, %ld), last: (%d, %ld)",
	    recovp->rc_action, curtime,
	    rp->r_recov_act, rp->r_last_recov));
	if (recovp->rc_action == rp->r_recov_act &&
	    rp->r_last_recov + recov_err_delay > curtime) {
		time_to_wait = rp->r_last_recov + recov_err_delay - curtime;
		mutex_exit(&rp->r_statelock);
		delay(SEC_TO_TICK(time_to_wait));
		curtime = gethrestime_sec();
		mutex_enter(&rp->r_statelock);
	}

	rp->r_last_recov = curtime;
	rp->r_recov_act = recovp->rc_action;
	mutex_exit(&rp->r_statelock);
}

/*
 * React to NFS4ERR_GRACE by setting the time we'll permit
 * the next call to this filesystem.
 */
void
nfs4_set_grace_wait(mntinfo4_t *mi)
{
	mutex_enter(&mi->mi_lock);
	/* Mark the time for the future */
	mi->mi_grace_wait = gethrestime_sec() + nfs4err_delay_time;
	mutex_exit(&mi->mi_lock);
}

/*
 * React to NFS4ERR_DELAY by setting the time we'll permit
 * the next call to this vnode.
 */
void
nfs4_set_delay_wait(vnode_t *vp)
{
	rnode4_t *rp = VTOR4(vp);

	mutex_enter(&rp->r_statelock);
	/*
	 * Calculate the amount we should delay; the initial
	 * delay will be short and then we will back off.
	 */
	if (rp->r_delay_interval == 0)
		rp->r_delay_interval = NFS4_INITIAL_DELAY_INTERVAL;
	else
		/* calculate next interval value */
		rp->r_delay_interval =
		    MIN(NFS4_MAX_DELAY_INTERVAL, (rp->r_delay_interval << 1));
	rp->r_delay_wait = gethrestime_sec() + rp->r_delay_interval;
	mutex_exit(&rp->r_statelock);
}
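
/*
 * Example of the NFS4ERR_DELAY backoff above (informational): starting
 * from NFS4_INITIAL_DELAY_INTERVAL, each successive NFS4ERR_DELAY reply
 * doubles the wait (interval << 1) until it is clamped at
 * NFS4_MAX_DELAY_INTERVAL; e.g., for an initial interval of 1 second
 * the sequence is 1, 2, 4, 8, ... seconds.
 */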

/*
 * The caller is responsible for freeing the returned string.
 */
static char *
nfs4_getsrvnames(mntinfo4_t *mi, size_t *len)
{
	servinfo4_t *svp;
	char *srvnames;
	char *namep;
	size_t length;

	/*
	 * Calculate the length of the string required to hold all
	 * of the server names plus either a comma or a null
	 * character following each individual one.
	 */
	length = 0;
	for (svp = mi->mi_servers; svp != NULL; svp = svp->sv_next) {
		(void) nfs_rw_enter_sig(&svp->sv_lock, RW_READER, 0);
		if (svp->sv_flags & SV4_NOTINUSE) {
			nfs_rw_exit(&svp->sv_lock);
			continue;
		}
		nfs_rw_exit(&svp->sv_lock);
		length += svp->sv_hostnamelen;
	}

	srvnames = kmem_alloc(length, KM_SLEEP);

	namep = srvnames;
	for (svp = mi->mi_servers; svp != NULL; svp = svp->sv_next) {
		(void) nfs_rw_enter_sig(&svp->sv_lock, RW_READER, 0);
		if (svp->sv_flags & SV4_NOTINUSE) {
			nfs_rw_exit(&svp->sv_lock);
			continue;
		}
		nfs_rw_exit(&svp->sv_lock);
		(void) strcpy(namep, svp->sv_hostname);
		namep += svp->sv_hostnamelen - 1;
		*namep++ = ',';
	}
	*--namep = '\0';

	*len = length;

	return (srvnames);
}

static void
save_bseqid_rqst(nfs4_bseqid_entry_t *bsep, recov_info_t *recovp)
{
	nfs4_bseqid_entry_t *destp;

	destp = kmem_alloc(sizeof (nfs4_bseqid_entry_t), KM_SLEEP);
	recovp->rc_bseqid_rqst = destp;

	if (bsep->bs_oop)
		open_owner_hold(bsep->bs_oop);
	destp->bs_oop = bsep->bs_oop;
	if (bsep->bs_lop)
		lock_owner_hold(bsep->bs_lop);
	destp->bs_lop = bsep->bs_lop;
	if (bsep->bs_vp)
		VN_HOLD(bsep->bs_vp);
	destp->bs_vp = bsep->bs_vp;
	destp->bs_pid = bsep->bs_pid;
	destp->bs_tag = bsep->bs_tag;
	destp->bs_seqid = bsep->bs_seqid;
}
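
/*
 * Example of nfs4_getsrvnames() output (informational): for a mount
 * with in-use servers "alpha" and "beta", the returned buffer contains
 * "alpha,beta", with the trailing comma replaced by the terminating
 * NUL.  sv_hostnamelen counts the hostname's own NUL, which is why
 * each name contributes exactly enough room for its separator.
 */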

static void
free_bseqid_rqst(nfs4_bseqid_entry_t *bsep)
{
	if (bsep->bs_oop)
		open_owner_rele(bsep->bs_oop);
	if (bsep->bs_lop)
		lock_owner_rele(bsep->bs_lop);
	if (bsep->bs_vp)
		VN_RELE(bsep->bs_vp);
	kmem_free(bsep, sizeof (nfs4_bseqid_entry_t));
}

/*
 * We don't actually fully recover from NFS4ERR_BAD_SEQID.  We
 * simply mark the open owner and open stream (if provided) as "bad".
 * Then future uses of these data structures will be limited to basically
 * just cleaning up the internal client state (no going OTW).
 *
 * The result of this is to return errors back to the app/usr when
 * we receive NFS4ERR_BAD_SEQID, but also allow future/new calls to
 * succeed so progress can be made.
 */
static void
recov_bad_seqid(recov_info_t *recovp)
{
	mntinfo4_t *mi = recovp->rc_mi;
	nfs4_open_owner_t *bad_oop;
	nfs4_lock_owner_t *bad_lop;
	vnode_t *vp;
	rnode4_t *rp = NULL;
	pid_t pid;
	nfs4_bseqid_entry_t *bsep, *tbsep;
	int error;

	ASSERT(mi != NULL);
	ASSERT(nfs_rw_lock_held(&mi->mi_recovlock, RW_WRITER));

	mutex_enter(&mi->mi_lock);
	bsep = list_head(&mi->mi_bseqid_list);
	mutex_exit(&mi->mi_lock);

	/*
	 * Handle all the bad seqid entries on mi's list.
	 */
	while (bsep != NULL) {
		bad_oop = bsep->bs_oop;
		bad_lop = bsep->bs_lop;
		vp = bsep->bs_vp;
		pid = bsep->bs_pid;

		NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
		    "recov_bad_seqid: mark oop %p lop %p as bad for "
		    "vp %p tag %s pid %d: last good seqid %d for tag %s",
		    (void *)bad_oop, (void *)bad_lop, (void *)vp,
		    nfs4_ctags[bsep->bs_tag].ct_str, pid,
		    bad_oop ? bad_oop->oo_last_good_seqid : 0,
		    bad_oop ? nfs4_ctags[bad_oop->oo_last_good_op].ct_str :
		    nfs4_ctags[TAG_NONE].ct_str));

		nfs4_queue_event(RE_BAD_SEQID, mi, NULL,
		    0, vp, NULL, NFS4ERR_BAD_SEQID, NULL, pid, bsep->bs_tag,
		    bad_oop ? bad_oop->oo_last_good_op : TAG_NONE,
		    bsep->bs_seqid, bad_oop ? bad_oop->oo_last_good_seqid : 0);

		if (bad_oop) {
			/* essentially reset the open owner */
			error = nfs4_start_open_seqid_sync(bad_oop, mi);
			ASSERT(!error);	/* recov thread always succeeds */
			bad_oop->oo_name = nfs4_get_new_oo_name();
			bad_oop->oo_seqid = 0;
			nfs4_end_open_seqid_sync(bad_oop);
		}

		if (bad_lop) {
			mutex_enter(&bad_lop->lo_lock);
			bad_lop->lo_flags |= NFS4_BAD_SEQID_LOCK;
			mutex_exit(&bad_lop->lo_lock);

			ASSERT(vp != NULL);
			rp = VTOR4(vp);
			mutex_enter(&rp->r_statelock);
			rp->r_flags |= R4LODANGLERS;
			mutex_exit(&rp->r_statelock);

			nfs4_send_siglost(pid, mi, vp, TRUE,
			    0, NFS4ERR_BAD_SEQID);
		}

		mutex_enter(&mi->mi_lock);
		list_remove(&mi->mi_bseqid_list, bsep);
		tbsep = bsep;
		bsep = list_head(&mi->mi_bseqid_list);
		mutex_exit(&mi->mi_lock);
		free_bseqid_rqst(tbsep);
	}

	mutex_enter(&mi->mi_lock);
	mi->mi_recovflags &= ~MI4R_BAD_SEQID;
	mutex_exit(&mi->mi_lock);
}