1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright 2009 Sun Microsystems, Inc. All rights reserved. 23 * Use is subject to license terms. 24 */ 25 26 /* 27 * NFS Version 4 state recovery code. 28 */ 29 30 #include <nfs/nfs4_clnt.h> 31 #include <nfs/nfs4.h> 32 #include <nfs/rnode4.h> 33 #include <sys/cmn_err.h> 34 #include <sys/cred.h> 35 #include <sys/systm.h> 36 #include <sys/flock.h> 37 #include <sys/dnlc.h> 38 #include <sys/ddi.h> 39 #include <sys/disp.h> 40 #include <sys/list.h> 41 #include <sys/sdt.h> 42 43 extern r4hashq_t *rtable4; 44 45 /* 46 * Information that describes what needs to be done for recovery. It is 47 * passed to a client recovery thread as well as passed to various recovery 48 * routines. rc_mi, rc_vp1, and rc_vp2 refer to the filesystem and 49 * vnode(s) affected by recovery. rc_vp1 and rc_vp2 are references (use 50 * VN_HOLD) or NULL. rc_lost_rqst contains information about the lost 51 * lock or open/close request, and it holds reference counts for the 52 * various objects (vnode, etc.). The recovery thread also uses flags set 53 * in the mntinfo4_t or vnode_t to tell it what to do. rc_error is used 54 * to save the error that originally triggered the recovery event -- will 55 * later be used to set mi_error if recovery doesn't work. rc_bseqid_rqst 56 * contains information about the request that got NFS4ERR_BAD_SEQID, and 57 * it holds reference count for the various objects (vnode, open owner, 58 * open stream, lock owner). 59 */ 60 61 typedef struct { 62 mntinfo4_t *rc_mi; 63 vnode_t *rc_vp1; 64 vnode_t *rc_vp2; 65 nfs4_recov_t rc_action; 66 stateid4 rc_stateid; 67 bool_t rc_srv_reboot; /* server has rebooted */ 68 nfs4_lost_rqst_t *rc_lost_rqst; 69 nfs4_error_t rc_orig_errors; /* original errors causing recovery */ 70 int rc_error; 71 nfs4_bseqid_entry_t *rc_bseqid_rqst; 72 } recov_info_t; 73 74 /* 75 * How long to wait before trying again if there is an error doing 76 * recovery, in seconds. 77 */ 78 79 static int recov_err_delay = 1; 80 81 /* 82 * How long to wait when processing NFS4ERR_GRACE or NFS4ERR_DELAY 83 * errors. Expressed in seconds. Default is defined as 84 * NFS4ERR_DELAY_TIME and this variable is initialized in nfs4_subr_init() 85 */ 86 time_t nfs4err_delay_time = 0; 87 88 /* 89 * Tuneable to limit how many time "exempt" ops go OTW 90 * after a recovery error. Exempt op hints are OH_CLOSE, 91 * OH_LOCKU, OH_DELEGRETURN. These previously always went 92 * OTW even after rnode was "dead" due to recovery errors. 93 * 94 * The tuneable below limits the number of times a start_fop 95 * invocation will retry the exempt hints. After the limit 96 * is reached, nfs4_start_fop will return an error just like 97 * it would for non-exempt op hints. 98 */ 99 int nfs4_max_recov_error_retry = 3; 100 101 /* 102 * Number of seconds the recovery thread should pause before retry when the 103 * filesystem has been forcibly unmounted. 104 */ 105 106 int nfs4_unmount_delay = 1; 107 108 #ifdef DEBUG 109 110 /* 111 * How long to wait (in seconds) between recovery operations on a given 112 * file. Normally zero, but could be set longer for testing purposes. 113 */ 114 static int nfs4_recovdelay = 0; 115 116 /* 117 * Switch that controls whether to go into the debugger when recovery 118 * fails. 119 */ 120 static int nfs4_fail_recov_stop = 0; 121 122 /* 123 * Tuneables to debug client namespace interaction with server 124 * mount points: 125 * 126 * nfs4_srvmnt_fail_cnt: 127 * number of times EACCES returned because client 128 * attempted to cross server mountpoint 129 * 130 * nfs4_srvmnt_debug: 131 * trigger console printf whenever client attempts 132 * to cross server mountpoint 133 */ 134 int nfs4_srvmnt_fail_cnt = 0; 135 int nfs4_srvmnt_debug = 0; 136 #endif 137 138 /* forward references, in alphabetic order */ 139 static void close_after_open_resend(vnode_t *, cred_t *, uint32_t, 140 nfs4_error_t *); 141 static void errs_to_action(recov_info_t *, 142 nfs4_server_t *, mntinfo4_t *, stateid4 *, nfs4_lost_rqst_t *, int, 143 nfs_opnum4, nfs4_bseqid_entry_t *); 144 static void flush_reinstate(nfs4_lost_rqst_t *); 145 static void free_milist(mntinfo4_t **, int); 146 static mntinfo4_t **make_milist(nfs4_server_t *, int *); 147 static int nfs4_check_recov_err(vnode_t *, nfs4_op_hint_t, 148 nfs4_recov_state_t *, int, char *); 149 static char *nfs4_getsrvnames(mntinfo4_t *, size_t *); 150 static void nfs4_recov_fh_fail(vnode_t *, int, nfsstat4); 151 static void nfs4_recov_thread(recov_info_t *); 152 static void nfs4_remove_lost_rqsts(mntinfo4_t *, nfs4_server_t *); 153 static void nfs4_resend_lost_rqsts(recov_info_t *, nfs4_server_t *); 154 static cred_t *pid_to_cr(pid_t); 155 static void reclaim_one_lock(vnode_t *, flock64_t *, nfs4_error_t *, int *); 156 static void recov_bad_seqid(recov_info_t *); 157 static void recov_badstate(recov_info_t *, vnode_t *, nfsstat4); 158 static void recov_clientid(recov_info_t *, nfs4_server_t *); 159 static void recov_done(mntinfo4_t *, recov_info_t *); 160 static void recov_filehandle(nfs4_recov_t, mntinfo4_t *, vnode_t *); 161 static void recov_newserver(recov_info_t *, nfs4_server_t **, bool_t *); 162 static void recov_openfiles(recov_info_t *, nfs4_server_t *); 163 static void recov_stale(mntinfo4_t *, vnode_t *); 164 static void nfs4_free_lost_rqst(nfs4_lost_rqst_t *, nfs4_server_t *); 165 static void recov_throttle(recov_info_t *, vnode_t *); 166 static void relock_skip_pid(locklist_t *, pid_t); 167 static void resend_lock(nfs4_lost_rqst_t *, nfs4_error_t *); 168 static void resend_one_op(nfs4_lost_rqst_t *, nfs4_error_t *, mntinfo4_t *, 169 nfs4_server_t *); 170 static void save_bseqid_rqst(nfs4_bseqid_entry_t *, recov_info_t *); 171 static void start_recovery(recov_info_t *, mntinfo4_t *, vnode_t *, vnode_t *, 172 nfs4_server_t *); 173 static void start_recovery_action(nfs4_recov_t, bool_t, mntinfo4_t *, vnode_t *, 174 vnode_t *); 175 static int wait_for_recovery(mntinfo4_t *, nfs4_op_hint_t); 176 177 /* 178 * Return non-zero if the given errno, status, and rpc status codes 179 * in the nfs4_error_t indicate that client recovery is needed. 180 * "stateful" indicates whether the call that got the error establishes or 181 * removes state on the server (open, close, lock, unlock, delegreturn). 182 */ 183 184 int 185 nfs4_needs_recovery(nfs4_error_t *ep, bool_t stateful, vfs_t *vfsp) 186 { 187 int recov = 0; 188 mntinfo4_t *mi; 189 190 /* 191 * Try failover if the error values justify it and if 192 * it's a failover mount. Don't try if the mount is in 193 * progress, failures are handled explicitly by nfs4rootvp. 194 */ 195 if (nfs4_try_failover(ep)) { 196 mi = VFTOMI4(vfsp); 197 mutex_enter(&mi->mi_lock); 198 recov = FAILOVER_MOUNT4(mi) && !(mi->mi_flags & MI4_MOUNTING); 199 mutex_exit(&mi->mi_lock); 200 if (recov) 201 return (recov); 202 } 203 204 if (ep->error == EINTR || NFS4_FRC_UNMT_ERR(ep->error, vfsp)) { 205 /* 206 * The server may have gotten the request, so for stateful 207 * ops we need to resynchronize and possibly back out the 208 * op. 209 */ 210 return (stateful); 211 } 212 if (ep->error != 0) 213 return (0); 214 215 /* stat values are listed alphabetically */ 216 /* 217 * There are two lists here: the errors for which we have code, and 218 * the errors for which we plan to have code before FCS. For the 219 * second list, print a warning message but don't attempt recovery. 220 */ 221 switch (ep->stat) { 222 case NFS4ERR_BADHANDLE: 223 case NFS4ERR_BAD_SEQID: 224 case NFS4ERR_BAD_STATEID: 225 case NFS4ERR_DELAY: 226 case NFS4ERR_EXPIRED: 227 case NFS4ERR_FHEXPIRED: 228 case NFS4ERR_GRACE: 229 case NFS4ERR_OLD_STATEID: 230 case NFS4ERR_RESOURCE: 231 case NFS4ERR_STALE_CLIENTID: 232 case NFS4ERR_STALE_STATEID: 233 case NFS4ERR_WRONGSEC: 234 case NFS4ERR_STALE: 235 recov = 1; 236 break; 237 #ifdef DEBUG 238 case NFS4ERR_LEASE_MOVED: 239 case NFS4ERR_MOVED: 240 zcmn_err(VFTOMI4(vfsp)->mi_zone->zone_id, 241 CE_WARN, "!Can't yet recover from NFS status %d", 242 ep->stat); 243 break; 244 #endif 245 } 246 247 return (recov); 248 } 249 250 /* 251 * Some operations such as DELEGRETURN want to avoid invoking 252 * recovery actions that will only mark the file dead. If 253 * better handlers are invoked for any of these errors, this 254 * routine should be modified. 255 */ 256 int 257 nfs4_recov_marks_dead(nfsstat4 status) 258 { 259 if (status == NFS4ERR_BAD_SEQID || 260 status == NFS4ERR_EXPIRED || 261 status == NFS4ERR_BAD_STATEID || 262 status == NFS4ERR_OLD_STATEID) 263 return (1); 264 return (0); 265 } 266 267 /* 268 * Transfer the state recovery information in recovp to mi's resend queue, 269 * and mark mi as having a lost state request. 270 */ 271 static void 272 nfs4_enqueue_lost_rqst(recov_info_t *recovp, mntinfo4_t *mi) 273 { 274 nfs4_lost_rqst_t *lrp = recovp->rc_lost_rqst; 275 276 ASSERT(nfs_rw_lock_held(&mi->mi_recovlock, RW_READER) || 277 nfs_rw_lock_held(&mi->mi_recovlock, RW_WRITER)); 278 279 ASSERT(lrp != NULL && lrp->lr_op != 0); 280 281 NFS4_DEBUG(nfs4_lost_rqst_debug, (CE_NOTE, 282 "nfs4_enqueue_lost_rqst %p, op %d", 283 (void *)lrp, lrp->lr_op)); 284 285 mutex_enter(&mi->mi_lock); 286 mi->mi_recovflags |= MI4R_LOST_STATE; 287 if (lrp->lr_putfirst) 288 list_insert_head(&mi->mi_lost_state, lrp); 289 else 290 list_insert_tail(&mi->mi_lost_state, lrp); 291 recovp->rc_lost_rqst = NULL; 292 mutex_exit(&mi->mi_lock); 293 294 nfs4_queue_event(RE_LOST_STATE, mi, NULL, lrp->lr_op, lrp->lr_vp, 295 lrp->lr_dvp, 0, NULL, 0, TAG_NONE, TAG_NONE, 0, 0); 296 } 297 298 /* 299 * Transfer the bad seqid recovery information in recovp to mi's 300 * bad seqid queue, and mark mi as having a bad seqid request. 301 */ 302 void 303 enqueue_bseqid_rqst(recov_info_t *recovp, mntinfo4_t *mi) 304 { 305 ASSERT(nfs_rw_lock_held(&mi->mi_recovlock, RW_READER) || 306 nfs_rw_lock_held(&mi->mi_recovlock, RW_WRITER)); 307 ASSERT(recovp->rc_bseqid_rqst != NULL); 308 309 mutex_enter(&mi->mi_lock); 310 mi->mi_recovflags |= MI4R_BAD_SEQID; 311 list_insert_tail(&mi->mi_bseqid_list, recovp->rc_bseqid_rqst); 312 recovp->rc_bseqid_rqst = NULL; 313 mutex_exit(&mi->mi_lock); 314 } 315 316 /* 317 * Initiate recovery. 318 * 319 * The nfs4_error_t contains the return codes that triggered a recovery 320 * attempt. mi, vp1, and vp2 refer to the filesystem and files that were 321 * being operated on. vp1 and vp2 may be NULL. 322 * 323 * Multiple calls are okay. If recovery is already underway, the call 324 * updates the information about what state needs recovery but does not 325 * start a new thread. The caller should hold mi->mi_recovlock as a reader 326 * for proper synchronization with any recovery thread. 327 * 328 * This will return TRUE if recovery was aborted, and FALSE otherwise. 329 */ 330 bool_t 331 nfs4_start_recovery(nfs4_error_t *ep, mntinfo4_t *mi, vnode_t *vp1, 332 vnode_t *vp2, stateid4 *sid, nfs4_lost_rqst_t *lost_rqstp, nfs_opnum4 op, 333 nfs4_bseqid_entry_t *bsep) 334 { 335 recov_info_t *recovp; 336 nfs4_server_t *sp; 337 bool_t abort = FALSE; 338 bool_t gone = FALSE; 339 340 ASSERT(nfs_zone() == mi->mi_zone); 341 mutex_enter(&mi->mi_lock); 342 /* 343 * If there is lost state, we need to kick off recovery even if the 344 * filesystem has been unmounted or the zone is shutting down. 345 */ 346 gone = FS_OR_ZONE_GONE4(mi->mi_vfsp); 347 if (gone) { 348 ASSERT(ep->error != EINTR || lost_rqstp != NULL); 349 if (ep->error == EIO && lost_rqstp == NULL) { 350 /* failed due to forced unmount, no new lost state */ 351 abort = TRUE; 352 } 353 if ((ep->error == 0 || ep->error == ETIMEDOUT) && 354 !(mi->mi_recovflags & MI4R_LOST_STATE)) { 355 /* some other failure, no existing lost state */ 356 abort = TRUE; 357 } 358 if (abort) { 359 mutex_exit(&mi->mi_lock); 360 NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE, 361 "nfs4_start_recovery: fs unmounted")); 362 return (TRUE); 363 } 364 } 365 mi->mi_in_recovery++; 366 mutex_exit(&mi->mi_lock); 367 368 recovp = kmem_alloc(sizeof (recov_info_t), KM_SLEEP); 369 recovp->rc_orig_errors = *ep; 370 sp = find_nfs4_server(mi); 371 errs_to_action(recovp, sp, mi, sid, lost_rqstp, gone, op, bsep); 372 if (sp != NULL) 373 mutex_exit(&sp->s_lock); 374 start_recovery(recovp, mi, vp1, vp2, sp); 375 if (sp != NULL) 376 nfs4_server_rele(sp); 377 return (FALSE); 378 } 379 380 /* 381 * Internal version of nfs4_start_recovery. The difference is that the 382 * caller specifies the recovery action, rather than the errors leading to 383 * recovery. 384 */ 385 static void 386 start_recovery_action(nfs4_recov_t what, bool_t reboot, mntinfo4_t *mi, 387 vnode_t *vp1, vnode_t *vp2) 388 { 389 recov_info_t *recovp; 390 391 ASSERT(nfs_zone() == mi->mi_zone); 392 mutex_enter(&mi->mi_lock); 393 mi->mi_in_recovery++; 394 mutex_exit(&mi->mi_lock); 395 396 recovp = kmem_zalloc(sizeof (recov_info_t), KM_SLEEP); 397 recovp->rc_action = what; 398 recovp->rc_srv_reboot = reboot; 399 recovp->rc_error = EIO; 400 start_recovery(recovp, mi, vp1, vp2, NULL); 401 } 402 403 static void 404 start_recovery(recov_info_t *recovp, mntinfo4_t *mi, 405 vnode_t *vp1, vnode_t *vp2, nfs4_server_t *sp) 406 { 407 NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE, 408 "start_recovery: mi %p, what %s", (void*)mi, 409 nfs4_recov_action_to_str(recovp->rc_action))); 410 411 /* 412 * Bump the reference on the vfs so that we can pass it to the 413 * recovery thread. 414 */ 415 VFS_HOLD(mi->mi_vfsp); 416 MI4_HOLD(mi); 417 again: 418 switch (recovp->rc_action) { 419 case NR_FAILOVER: 420 ASSERT(nfs_rw_lock_held(&mi->mi_recovlock, RW_READER) || 421 nfs_rw_lock_held(&mi->mi_recovlock, RW_WRITER)); 422 if (mi->mi_servers->sv_next == NULL) 423 goto out_no_thread; 424 mutex_enter(&mi->mi_lock); 425 mi->mi_recovflags |= MI4R_NEED_NEW_SERVER; 426 mutex_exit(&mi->mi_lock); 427 428 if (recovp->rc_lost_rqst != NULL) 429 nfs4_enqueue_lost_rqst(recovp, mi); 430 break; 431 432 case NR_CLIENTID: 433 /* 434 * If the filesystem has been unmounted, punt. 435 */ 436 if (sp == NULL) 437 goto out_no_thread; 438 439 /* 440 * If nobody else is working on the clientid, mark the 441 * clientid as being no longer set. Then mark the specific 442 * filesystem being worked on. 443 */ 444 if (!nfs4_server_in_recovery(sp)) { 445 mutex_enter(&sp->s_lock); 446 sp->s_flags &= ~N4S_CLIENTID_SET; 447 mutex_exit(&sp->s_lock); 448 } 449 ASSERT(nfs_rw_lock_held(&mi->mi_recovlock, RW_READER) || 450 nfs_rw_lock_held(&mi->mi_recovlock, RW_WRITER)); 451 mutex_enter(&mi->mi_lock); 452 mi->mi_recovflags |= MI4R_NEED_CLIENTID; 453 if (recovp->rc_srv_reboot) 454 mi->mi_recovflags |= MI4R_SRV_REBOOT; 455 mutex_exit(&mi->mi_lock); 456 break; 457 458 case NR_OPENFILES: 459 ASSERT(nfs_rw_lock_held(&mi->mi_recovlock, RW_READER) || 460 nfs_rw_lock_held(&mi->mi_recovlock, RW_WRITER)); 461 mutex_enter(&mi->mi_lock); 462 mi->mi_recovflags |= MI4R_REOPEN_FILES; 463 if (recovp->rc_srv_reboot) 464 mi->mi_recovflags |= MI4R_SRV_REBOOT; 465 mutex_exit(&mi->mi_lock); 466 break; 467 468 case NR_WRONGSEC: 469 ASSERT(nfs_rw_lock_held(&mi->mi_recovlock, RW_READER) || 470 nfs_rw_lock_held(&mi->mi_recovlock, RW_WRITER)); 471 mutex_enter(&mi->mi_lock); 472 mi->mi_recovflags |= MI4R_NEED_SECINFO; 473 mutex_exit(&mi->mi_lock); 474 break; 475 476 case NR_EXPIRED: 477 if (vp1 != NULL) 478 recov_badstate(recovp, vp1, NFS4ERR_EXPIRED); 479 if (vp2 != NULL) 480 recov_badstate(recovp, vp2, NFS4ERR_EXPIRED); 481 goto out_no_thread; /* no further recovery possible */ 482 483 case NR_BAD_STATEID: 484 if (vp1 != NULL) 485 recov_badstate(recovp, vp1, NFS4ERR_BAD_STATEID); 486 if (vp2 != NULL) 487 recov_badstate(recovp, vp2, NFS4ERR_BAD_STATEID); 488 goto out_no_thread; /* no further recovery possible */ 489 490 case NR_FHEXPIRED: 491 case NR_BADHANDLE: 492 if (vp1 != NULL) 493 recov_throttle(recovp, vp1); 494 if (vp2 != NULL) 495 recov_throttle(recovp, vp2); 496 /* 497 * Recover the filehandle now, rather than using a 498 * separate thread. We can do this because filehandle 499 * recovery is independent of any other state, and because 500 * we know that we are not competing with the recovery 501 * thread at this time. recov_filehandle will deal with 502 * threads that are competing to recover this filehandle. 503 */ 504 ASSERT(nfs_rw_lock_held(&mi->mi_recovlock, RW_READER) || 505 nfs_rw_lock_held(&mi->mi_recovlock, RW_WRITER)); 506 if (vp1 != NULL) 507 recov_filehandle(recovp->rc_action, mi, vp1); 508 if (vp2 != NULL) 509 recov_filehandle(recovp->rc_action, mi, vp2); 510 goto out_no_thread; /* no further recovery needed */ 511 512 case NR_STALE: 513 /* 514 * NFS4ERR_STALE handling 515 * recov_stale() could set MI4R_NEED_NEW_SERVER to 516 * indicate that we can and should failover. 517 */ 518 ASSERT(nfs_rw_lock_held(&mi->mi_recovlock, RW_READER) || 519 nfs_rw_lock_held(&mi->mi_recovlock, RW_WRITER)); 520 521 if (vp1 != NULL) 522 recov_stale(mi, vp1); 523 if (vp2 != NULL) 524 recov_stale(mi, vp2); 525 mutex_enter(&mi->mi_lock); 526 if ((mi->mi_recovflags & MI4R_NEED_NEW_SERVER) == 0) { 527 mutex_exit(&mi->mi_lock); 528 goto out_no_thread; 529 } 530 mutex_exit(&mi->mi_lock); 531 recovp->rc_action = NR_FAILOVER; 532 goto again; 533 534 case NR_BAD_SEQID: 535 if (recovp->rc_bseqid_rqst) { 536 enqueue_bseqid_rqst(recovp, mi); 537 break; 538 } 539 540 if (vp1 != NULL) 541 recov_badstate(recovp, vp1, NFS4ERR_BAD_SEQID); 542 if (vp2 != NULL) 543 recov_badstate(recovp, vp2, NFS4ERR_BAD_SEQID); 544 goto out_no_thread; /* no further recovery possible */ 545 546 case NR_OLDSTATEID: 547 if (vp1 != NULL) 548 recov_badstate(recovp, vp1, NFS4ERR_OLD_STATEID); 549 if (vp2 != NULL) 550 recov_badstate(recovp, vp2, NFS4ERR_OLD_STATEID); 551 goto out_no_thread; /* no further recovery possible */ 552 553 case NR_GRACE: 554 nfs4_set_grace_wait(mi); 555 goto out_no_thread; /* no further action required for GRACE */ 556 557 case NR_DELAY: 558 if (vp1) 559 nfs4_set_delay_wait(vp1); 560 goto out_no_thread; /* no further action required for DELAY */ 561 562 case NR_LOST_STATE_RQST: 563 case NR_LOST_LOCK: 564 nfs4_enqueue_lost_rqst(recovp, mi); 565 break; 566 567 default: 568 nfs4_queue_event(RE_UNEXPECTED_ACTION, mi, NULL, 569 recovp->rc_action, NULL, NULL, 0, NULL, 0, TAG_NONE, 570 TAG_NONE, 0, 0); 571 goto out_no_thread; 572 } 573 574 /* 575 * If either file recently went through the same recovery, wait 576 * awhile. This is in case there is some sort of bug; we might not 577 * be able to recover properly, but at least we won't bombard the 578 * server with calls, and we won't tie up the client. 579 */ 580 if (vp1 != NULL) 581 recov_throttle(recovp, vp1); 582 if (vp2 != NULL) 583 recov_throttle(recovp, vp2); 584 585 /* 586 * If there's already a recovery thread, don't start another one. 587 */ 588 589 mutex_enter(&mi->mi_lock); 590 if (mi->mi_flags & MI4_RECOV_ACTIV) { 591 mutex_exit(&mi->mi_lock); 592 goto out_no_thread; 593 } 594 mi->mi_flags |= MI4_RECOV_ACTIV; 595 mutex_exit(&mi->mi_lock); 596 NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE, 597 "start_recovery: starting new thread for mi %p", (void*)mi)); 598 599 recovp->rc_mi = mi; 600 recovp->rc_vp1 = vp1; 601 if (vp1 != NULL) { 602 ASSERT(VTOMI4(vp1) == mi); 603 VN_HOLD(recovp->rc_vp1); 604 } 605 recovp->rc_vp2 = vp2; 606 if (vp2 != NULL) { 607 ASSERT(VTOMI4(vp2) == mi); 608 VN_HOLD(recovp->rc_vp2); 609 } 610 611 (void) zthread_create(NULL, 0, nfs4_recov_thread, recovp, 0, 612 minclsyspri); 613 return; 614 615 /* not reached by thread creating call */ 616 out_no_thread: 617 mutex_enter(&mi->mi_lock); 618 mi->mi_in_recovery--; 619 if (mi->mi_in_recovery == 0) 620 cv_broadcast(&mi->mi_cv_in_recov); 621 mutex_exit(&mi->mi_lock); 622 623 VFS_RELE(mi->mi_vfsp); 624 MI4_RELE(mi); 625 /* 626 * Free up resources that were allocated for us. 627 */ 628 kmem_free(recovp, sizeof (recov_info_t)); 629 } 630 631 static int 632 nfs4_check_recov_err(vnode_t *vp, nfs4_op_hint_t op, 633 nfs4_recov_state_t *rsp, int retry_err_cnt, char *str) 634 { 635 rnode4_t *rp; 636 int error = 0; 637 int exempt; 638 639 if (vp == NULL) 640 return (0); 641 642 exempt = (op == OH_CLOSE || op == OH_LOCKU || op == OH_DELEGRETURN); 643 rp = VTOR4(vp); 644 mutex_enter(&rp->r_statelock); 645 646 /* 647 * If there was a recovery error, then allow op hints "exempt" from 648 * recov errors to retry (currently 3 times). Either r_error or 649 * EIO is returned for non-exempt op hints. 650 */ 651 if (rp->r_flags & R4RECOVERR) { 652 if (exempt && rsp->rs_num_retry_despite_err <= 653 nfs4_max_recov_error_retry) { 654 655 /* 656 * Check to make sure that we haven't already inc'd 657 * rs_num_retry_despite_err for current nfs4_start_fop 658 * instance. We don't want to double inc (if we were 659 * called with vp2, then the vp1 call could have 660 * already incremented. 661 */ 662 if (retry_err_cnt == rsp->rs_num_retry_despite_err) 663 rsp->rs_num_retry_despite_err++; 664 665 NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE, 666 "nfs4_start_fop: %s %p DEAD, cnt=%d", str, 667 (void *)vp, rsp->rs_num_retry_despite_err)); 668 } else { 669 error = (rp->r_error ? rp->r_error : EIO); 670 /* 671 * An ESTALE error on a non-regular file is not 672 * "sticky". Return the ESTALE error once, but 673 * clear the condition to allow future operations 674 * to go OTW. This will allow the client to 675 * recover if the server has merely unshared then 676 * re-shared the file system. For regular files, 677 * the unshare has destroyed the open state at the 678 * server and we aren't willing to do a reopen (yet). 679 */ 680 if (error == ESTALE && vp->v_type != VREG) { 681 rp->r_flags &= 682 ~(R4RECOVERR|R4RECOVERRP|R4STALE); 683 rp->r_error = 0; 684 error = ESTALE; 685 } 686 NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE, 687 "nfs4_start_fop: %s %p DEAD, cnt=%d error=%d", 688 str, (void *)vp, 689 rsp->rs_num_retry_despite_err, error)); 690 } 691 } 692 693 mutex_exit(&rp->r_statelock); 694 return (error); 695 } 696 697 /* 698 * Initial setup code that every operation should call if it might invoke 699 * client recovery. Can block waiting for recovery to finish on a 700 * filesystem. Either vnode ptr can be NULL. 701 * 702 * Returns 0 if there are no outstanding errors. Can return an 703 * errno value under various circumstances (e.g., failed recovery, or 704 * interrupted while waiting for recovery to finish). 705 * 706 * There must be a corresponding call to nfs4_end_op() to free up any locks 707 * or resources allocated by this call (assuming this call succeeded), 708 * using the same rsp that's passed in here. 709 * 710 * The open and lock seqid synchronization must be stopped before calling this 711 * function, as it could lead to deadlock when trying to reopen a file or 712 * reclaim a lock. The synchronization is obtained with calls to: 713 * nfs4_start_open_seqid_sync() 714 * nfs4_start_lock_seqid_sync() 715 * 716 * *startrecovp is set TRUE if the caller should not bother with the 717 * over-the-wire call, and just initiate recovery for the given request. 718 * This is typically used for state-releasing ops if the filesystem has 719 * been forcibly unmounted. startrecovp may be NULL for 720 * non-state-releasing ops. 721 */ 722 723 int 724 nfs4_start_fop(mntinfo4_t *mi, vnode_t *vp1, vnode_t *vp2, nfs4_op_hint_t op, 725 nfs4_recov_state_t *rsp, bool_t *startrecovp) 726 { 727 int error = 0, rerr_cnt; 728 nfs4_server_t *sp = NULL; 729 nfs4_server_t *tsp; 730 nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS }; 731 uint_t droplock_cnt; 732 #ifdef DEBUG 733 void *fop_caller; 734 #endif 735 736 ASSERT(vp1 == NULL || vp1->v_vfsp == mi->mi_vfsp); 737 ASSERT(vp2 == NULL || vp2->v_vfsp == mi->mi_vfsp); 738 739 #ifdef DEBUG 740 if ((fop_caller = tsd_get(nfs4_tsd_key)) != NULL) { 741 cmn_err(CE_PANIC, "Missing nfs4_end_fop: last caller %p", 742 fop_caller); 743 } 744 (void) tsd_set(nfs4_tsd_key, caller()); 745 #endif 746 747 rsp->rs_sp = NULL; 748 rsp->rs_flags &= ~NFS4_RS_RENAME_HELD; 749 rerr_cnt = rsp->rs_num_retry_despite_err; 750 751 /* 752 * Process the items that may delay() based on server response 753 */ 754 error = nfs4_wait_for_grace(mi, rsp); 755 if (error) 756 goto out; 757 758 if (vp1 != NULL) { 759 error = nfs4_wait_for_delay(vp1, rsp); 760 if (error) 761 goto out; 762 } 763 764 /* Wait for a delegation recall to complete. */ 765 766 error = wait_for_recall(vp1, vp2, op, rsp); 767 if (error) 768 goto out; 769 770 /* 771 * Wait for any current recovery actions to finish. Note that a 772 * recovery thread can still start up after wait_for_recovery() 773 * finishes. We don't block out recovery operations until we 774 * acquire s_recovlock and mi_recovlock. 775 */ 776 error = wait_for_recovery(mi, op); 777 if (error) 778 goto out; 779 780 /* 781 * Check to see if the rnode is already marked with a 782 * recovery error. If so, return it immediately. But 783 * always pass CLOSE, LOCKU, and DELEGRETURN so we can 784 * clean up state on the server. 785 */ 786 787 if (vp1 != NULL) { 788 if (error = nfs4_check_recov_err(vp1, op, rsp, rerr_cnt, "vp1")) 789 goto out; 790 nfs4_check_remap(mi, vp1, NFS4_REMAP_CKATTRS, &e); 791 } 792 793 if (vp2 != NULL) { 794 if (error = nfs4_check_recov_err(vp2, op, rsp, rerr_cnt, "vp2")) 795 goto out; 796 nfs4_check_remap(mi, vp2, NFS4_REMAP_CKATTRS, &e); 797 } 798 799 /* 800 * The lock order calls for us to acquire s_recovlock before 801 * mi_recovlock, but we have to hold mi_recovlock to look up sp (to 802 * prevent races with the failover/migration code). So acquire 803 * mi_recovlock, look up sp, drop mi_recovlock, acquire 804 * s_recovlock and mi_recovlock, then verify that sp is still the 805 * right object. XXX Can we find a simpler way to deal with this? 806 */ 807 if (nfs_rw_enter_sig(&mi->mi_recovlock, RW_READER, 808 mi->mi_flags & MI4_INT)) { 809 error = EINTR; 810 goto out; 811 } 812 get_sp: 813 sp = find_nfs4_server(mi); 814 if (sp != NULL) { 815 sp->s_otw_call_count++; 816 mutex_exit(&sp->s_lock); 817 droplock_cnt = mi->mi_srvset_cnt; 818 } 819 nfs_rw_exit(&mi->mi_recovlock); 820 821 if (sp != NULL) { 822 if (nfs_rw_enter_sig(&sp->s_recovlock, RW_READER, 823 mi->mi_flags & MI4_INT)) { 824 error = EINTR; 825 goto out; 826 } 827 } 828 if (nfs_rw_enter_sig(&mi->mi_recovlock, RW_READER, 829 mi->mi_flags & MI4_INT)) { 830 if (sp != NULL) 831 nfs_rw_exit(&sp->s_recovlock); 832 error = EINTR; 833 goto out; 834 } 835 /* 836 * If the mntinfo4_t hasn't changed nfs4_sever_ts then 837 * there's no point in double checking to make sure it 838 * has switched. 839 */ 840 if (sp == NULL || droplock_cnt != mi->mi_srvset_cnt) { 841 tsp = find_nfs4_server(mi); 842 if (tsp != sp) { 843 /* try again */ 844 if (tsp != NULL) { 845 mutex_exit(&tsp->s_lock); 846 nfs4_server_rele(tsp); 847 tsp = NULL; 848 } 849 if (sp != NULL) { 850 nfs_rw_exit(&sp->s_recovlock); 851 mutex_enter(&sp->s_lock); 852 sp->s_otw_call_count--; 853 mutex_exit(&sp->s_lock); 854 nfs4_server_rele(sp); 855 sp = NULL; 856 } 857 goto get_sp; 858 } else { 859 if (tsp != NULL) { 860 mutex_exit(&tsp->s_lock); 861 nfs4_server_rele(tsp); 862 tsp = NULL; 863 } 864 } 865 } 866 867 if (sp != NULL) { 868 rsp->rs_sp = sp; 869 } 870 871 /* 872 * If the fileystem uses volatile filehandles, obtain a lock so 873 * that we synchronize with renames. Exception: mount operations 874 * can change mi_fh_expire_type, which could be a problem, since 875 * the end_op code needs to be consistent with the start_op code 876 * about mi_rename_lock. Since mounts don't compete with renames, 877 * it's simpler to just not acquire the rename lock for mounts. 878 */ 879 if (NFS4_VOLATILE_FH(mi) && op != OH_MOUNT) { 880 if (nfs_rw_enter_sig(&mi->mi_rename_lock, 881 op == OH_VFH_RENAME ? RW_WRITER : RW_READER, 882 mi->mi_flags & MI4_INT)) { 883 nfs_rw_exit(&mi->mi_recovlock); 884 if (sp != NULL) 885 nfs_rw_exit(&sp->s_recovlock); 886 error = EINTR; 887 goto out; 888 } 889 rsp->rs_flags |= NFS4_RS_RENAME_HELD; 890 } 891 892 if (OH_IS_STATE_RELE(op)) { 893 /* 894 * For forced unmount, letting the request proceed will 895 * almost always delay response to the user, so hand it off 896 * to the recovery thread. For exiting lwp's, we don't 897 * have a good way to tell if the request will hang. We 898 * generally want processes to handle their own requests so 899 * that they can be done in parallel, but if there is 900 * already a recovery thread, hand the request off to it. 901 * This will improve user response at no cost to overall 902 * system throughput. For zone shutdown, we'd prefer 903 * the recovery thread to handle this as well. 904 */ 905 ASSERT(startrecovp != NULL); 906 mutex_enter(&mi->mi_lock); 907 if (FS_OR_ZONE_GONE4(mi->mi_vfsp)) 908 *startrecovp = TRUE; 909 else if ((curthread->t_proc_flag & TP_LWPEXIT) && 910 (mi->mi_flags & MI4_RECOV_ACTIV)) 911 *startrecovp = TRUE; 912 else 913 *startrecovp = FALSE; 914 mutex_exit(&mi->mi_lock); 915 } else 916 if (startrecovp != NULL) 917 *startrecovp = FALSE; 918 919 ASSERT(error == 0); 920 return (error); 921 922 out: 923 ASSERT(error != 0); 924 if (sp != NULL) { 925 mutex_enter(&sp->s_lock); 926 sp->s_otw_call_count--; 927 mutex_exit(&sp->s_lock); 928 nfs4_server_rele(sp); 929 rsp->rs_sp = NULL; 930 } 931 nfs4_end_op_recall(vp1, vp2, rsp); 932 933 #ifdef DEBUG 934 (void) tsd_set(nfs4_tsd_key, NULL); 935 #endif 936 return (error); 937 } 938 939 /* 940 * It is up to the caller to determine if rsp->rs_sp being NULL 941 * is detrimental or not. 942 */ 943 int 944 nfs4_start_op(mntinfo4_t *mi, vnode_t *vp1, vnode_t *vp2, 945 nfs4_recov_state_t *rsp) 946 { 947 ASSERT(rsp->rs_num_retry_despite_err == 0); 948 rsp->rs_num_retry_despite_err = 0; 949 return (nfs4_start_fop(mi, vp1, vp2, OH_OTHER, rsp, NULL)); 950 } 951 952 /* 953 * Release any resources acquired by nfs4_start_op(). 954 * 'sp' should be the nfs4_server pointer returned by nfs4_start_op(). 955 * 956 * The operation hint is used to avoid a deadlock by bypassing delegation 957 * return logic for writes, which are done while returning a delegation. 958 */ 959 960 void 961 nfs4_end_fop(mntinfo4_t *mi, vnode_t *vp1, vnode_t *vp2, nfs4_op_hint_t op, 962 nfs4_recov_state_t *rsp, bool_t needs_recov) 963 { 964 nfs4_server_t *sp = rsp->rs_sp; 965 rnode4_t *rp = NULL; 966 967 #ifdef lint 968 /* 969 * The op hint isn't used any more, but might be in 970 * the future. 971 */ 972 op = op; 973 #endif 974 975 #ifdef DEBUG 976 ASSERT(tsd_get(nfs4_tsd_key) != NULL); 977 (void) tsd_set(nfs4_tsd_key, NULL); 978 #endif 979 980 nfs4_end_op_recall(vp1, vp2, rsp); 981 982 if (rsp->rs_flags & NFS4_RS_RENAME_HELD) 983 nfs_rw_exit(&mi->mi_rename_lock); 984 985 if (!needs_recov) { 986 if (rsp->rs_flags & NFS4_RS_DELAY_MSG) { 987 /* may need to clear the delay interval */ 988 if (vp1 != NULL) { 989 rp = VTOR4(vp1); 990 mutex_enter(&rp->r_statelock); 991 rp->r_delay_interval = 0; 992 mutex_exit(&rp->r_statelock); 993 } 994 } 995 rsp->rs_flags &= ~(NFS4_RS_GRACE_MSG|NFS4_RS_DELAY_MSG); 996 } 997 998 /* 999 * If the corresponding nfs4_start_op() found a sp, 1000 * then there must still be a sp. 1001 */ 1002 if (sp != NULL) { 1003 nfs_rw_exit(&mi->mi_recovlock); 1004 nfs_rw_exit(&sp->s_recovlock); 1005 mutex_enter(&sp->s_lock); 1006 sp->s_otw_call_count--; 1007 cv_broadcast(&sp->s_cv_otw_count); 1008 mutex_exit(&sp->s_lock); 1009 nfs4_server_rele(sp); 1010 } else { 1011 nfs_rw_exit(&mi->mi_recovlock); 1012 } 1013 } 1014 1015 void 1016 nfs4_end_op(mntinfo4_t *mi, vnode_t *vp1, vnode_t *vp2, 1017 nfs4_recov_state_t *rsp, bool_t needrecov) 1018 { 1019 nfs4_end_fop(mi, vp1, vp2, OH_OTHER, rsp, needrecov); 1020 } 1021 1022 /* 1023 * If the filesystem is going through client recovery, block until 1024 * finished. 1025 * Exceptions: 1026 * - state-releasing ops (CLOSE, LOCKU, DELEGRETURN) are allowed to proceed 1027 * if the filesystem has been forcibly unmounted or the lwp is exiting. 1028 * 1029 * Return value: 1030 * - 0 if no errors 1031 * - EINTR if the call was interrupted 1032 * - EIO if the filesystem has been forcibly unmounted (non-state-releasing 1033 * op) 1034 * - the errno value from the recovery thread, if recovery failed 1035 */ 1036 1037 static int 1038 wait_for_recovery(mntinfo4_t *mi, nfs4_op_hint_t op_hint) 1039 { 1040 int error = 0; 1041 1042 mutex_enter(&mi->mi_lock); 1043 1044 while (mi->mi_recovflags != 0) { 1045 klwp_t *lwp = ttolwp(curthread); 1046 1047 if ((mi->mi_vfsp->vfs_flag & VFS_UNMOUNTED) || 1048 (mi->mi_flags & MI4_RECOV_FAIL)) 1049 break; 1050 if (OH_IS_STATE_RELE(op_hint) && 1051 (curthread->t_proc_flag & TP_LWPEXIT)) 1052 break; 1053 1054 if (lwp != NULL) 1055 lwp->lwp_nostop++; 1056 /* XXX - use different cv? */ 1057 if (cv_wait_sig(&mi->mi_failover_cv, &mi->mi_lock) == 0) { 1058 error = EINTR; 1059 if (lwp != NULL) 1060 lwp->lwp_nostop--; 1061 break; 1062 } 1063 if (lwp != NULL) 1064 lwp->lwp_nostop--; 1065 } 1066 1067 if ((mi->mi_vfsp->vfs_flag & VFS_UNMOUNTED) && 1068 !OH_IS_STATE_RELE(op_hint)) { 1069 NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE, 1070 "wait_for_recovery: forced unmount")); 1071 error = EIO; 1072 } else if (mi->mi_flags & MI4_RECOV_FAIL) { 1073 NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE, 1074 "wait_for_recovery: fail since RECOV FAIL")); 1075 error = mi->mi_error; 1076 } 1077 1078 mutex_exit(&mi->mi_lock); 1079 1080 return (error); 1081 } 1082 1083 /* 1084 * If the client received NFS4ERR_GRACE for this particular mount, 1085 * the client blocks here until it is time to try again. 1086 * 1087 * Return value: 1088 * - 0 if wait was successful 1089 * - EINTR if the call was interrupted 1090 */ 1091 1092 int 1093 nfs4_wait_for_grace(mntinfo4_t *mi, nfs4_recov_state_t *rsp) 1094 { 1095 int error = 0; 1096 time_t curtime, time_to_wait; 1097 1098 /* do a unprotected check to reduce mi_lock contention */ 1099 if (mi->mi_grace_wait != 0) { 1100 mutex_enter(&mi->mi_lock); 1101 1102 if (mi->mi_grace_wait != 0) { 1103 if (!(rsp->rs_flags & NFS4_RS_GRACE_MSG)) 1104 rsp->rs_flags |= NFS4_RS_GRACE_MSG; 1105 1106 curtime = gethrestime_sec(); 1107 1108 if (curtime < mi->mi_grace_wait) { 1109 1110 time_to_wait = mi->mi_grace_wait - curtime; 1111 1112 mutex_exit(&mi->mi_lock); 1113 1114 delay(SEC_TO_TICK(time_to_wait)); 1115 1116 curtime = gethrestime_sec(); 1117 1118 mutex_enter(&mi->mi_lock); 1119 1120 if (curtime >= mi->mi_grace_wait) 1121 mi->mi_grace_wait = 0; 1122 } else { 1123 mi->mi_grace_wait = 0; 1124 } 1125 } 1126 mutex_exit(&mi->mi_lock); 1127 } 1128 1129 return (error); 1130 } 1131 1132 /* 1133 * If the client received NFS4ERR_DELAY for an operation on a vnode, 1134 * the client blocks here until it is time to try again. 1135 * 1136 * Return value: 1137 * - 0 if wait was successful 1138 * - EINTR if the call was interrupted 1139 */ 1140 1141 int 1142 nfs4_wait_for_delay(vnode_t *vp, nfs4_recov_state_t *rsp) 1143 { 1144 int error = 0; 1145 time_t curtime, time_to_wait; 1146 rnode4_t *rp; 1147 1148 ASSERT(vp != NULL); 1149 1150 rp = VTOR4(vp); 1151 1152 /* do a unprotected check to reduce r_statelock contention */ 1153 if (rp->r_delay_wait != 0) { 1154 mutex_enter(&rp->r_statelock); 1155 1156 if (rp->r_delay_wait != 0) { 1157 1158 if (!(rsp->rs_flags & NFS4_RS_DELAY_MSG)) { 1159 rsp->rs_flags |= NFS4_RS_DELAY_MSG; 1160 nfs4_mi_kstat_inc_delay(VTOMI4(vp)); 1161 } 1162 1163 curtime = gethrestime_sec(); 1164 1165 if (curtime < rp->r_delay_wait) { 1166 1167 time_to_wait = rp->r_delay_wait - curtime; 1168 1169 mutex_exit(&rp->r_statelock); 1170 1171 delay(SEC_TO_TICK(time_to_wait)); 1172 1173 curtime = gethrestime_sec(); 1174 1175 mutex_enter(&rp->r_statelock); 1176 1177 if (curtime >= rp->r_delay_wait) 1178 rp->r_delay_wait = 0; 1179 } else { 1180 rp->r_delay_wait = 0; 1181 } 1182 } 1183 mutex_exit(&rp->r_statelock); 1184 } 1185 1186 return (error); 1187 } 1188 1189 /* 1190 * The recovery thread. 1191 */ 1192 1193 static void 1194 nfs4_recov_thread(recov_info_t *recovp) 1195 { 1196 mntinfo4_t *mi = recovp->rc_mi; 1197 nfs4_server_t *sp; 1198 int done = 0, error = 0; 1199 bool_t recov_fail = FALSE; 1200 callb_cpr_t cpr_info; 1201 kmutex_t cpr_lock; 1202 1203 nfs4_queue_event(RE_START, mi, NULL, mi->mi_recovflags, 1204 recovp->rc_vp1, recovp->rc_vp2, 0, NULL, 0, TAG_NONE, TAG_NONE, 1205 0, 0); 1206 1207 mutex_init(&cpr_lock, NULL, MUTEX_DEFAULT, NULL); 1208 CALLB_CPR_INIT(&cpr_info, &cpr_lock, callb_generic_cpr, "nfsv4Recov"); 1209 1210 mutex_enter(&mi->mi_lock); 1211 mi->mi_recovthread = curthread; 1212 mutex_exit(&mi->mi_lock); 1213 1214 /* 1215 * We don't really need protection here against failover or 1216 * migration, since the current thread is the one that would make 1217 * any changes, but hold mi_recovlock anyway for completeness (and 1218 * to satisfy any ASSERTs). 1219 */ 1220 (void) nfs_rw_enter_sig(&mi->mi_recovlock, RW_READER, 0); 1221 sp = find_nfs4_server(mi); 1222 if (sp != NULL) 1223 mutex_exit(&sp->s_lock); 1224 nfs_rw_exit(&mi->mi_recovlock); 1225 1226 /* 1227 * Do any necessary recovery, based on the information in recovp 1228 * and any recovery flags. 1229 */ 1230 1231 do { 1232 mutex_enter(&mi->mi_lock); 1233 if (FS_OR_ZONE_GONE4(mi->mi_vfsp)) { 1234 bool_t activesrv; 1235 1236 NFS4_DEBUG(nfs4_client_recov_debug && 1237 mi->mi_vfsp->vfs_flag & VFS_UNMOUNTED, (CE_NOTE, 1238 "nfs4_recov_thread: file system has been " 1239 "unmounted")); 1240 NFS4_DEBUG(nfs4_client_recov_debug && 1241 zone_status_get(curproc->p_zone) >= 1242 ZONE_IS_SHUTTING_DOWN, (CE_NOTE, 1243 "nfs4_recov_thread: zone shutting down")); 1244 /* 1245 * If the server has lost its state for us and 1246 * the filesystem is unmounted, then the filesystem 1247 * can be tossed, even if there are lost lock or 1248 * lost state calls in the recovery queue. 1249 */ 1250 if (mi->mi_recovflags & 1251 (MI4R_NEED_CLIENTID | MI4R_REOPEN_FILES)) { 1252 NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE, 1253 "nfs4_recov_thread: bailing out")); 1254 mi->mi_flags |= MI4_RECOV_FAIL; 1255 mi->mi_error = recovp->rc_error; 1256 recov_fail = TRUE; 1257 } 1258 /* 1259 * We don't know if the server has any state for 1260 * us, and the filesystem has been unmounted. If 1261 * there are "lost state" recovery items, keep 1262 * trying to process them until there are no more 1263 * mounted filesystems for the server. Otherwise, 1264 * bail out. The reason we don't mark the 1265 * filesystem as failing recovery is in case we 1266 * have to do "lost state" recovery later (e.g., a 1267 * user process exits). 1268 */ 1269 if (!(mi->mi_recovflags & MI4R_LOST_STATE)) { 1270 done = 1; 1271 mutex_exit(&mi->mi_lock); 1272 break; 1273 } 1274 mutex_exit(&mi->mi_lock); 1275 1276 if (sp == NULL) 1277 activesrv = FALSE; 1278 else { 1279 mutex_enter(&sp->s_lock); 1280 activesrv = nfs4_fs_active(sp); 1281 } 1282 if (!activesrv) { 1283 NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE, 1284 "no active fs for server %p", 1285 (void *)sp)); 1286 mutex_enter(&mi->mi_lock); 1287 mi->mi_flags |= MI4_RECOV_FAIL; 1288 mi->mi_error = recovp->rc_error; 1289 mutex_exit(&mi->mi_lock); 1290 recov_fail = TRUE; 1291 if (sp != NULL) { 1292 /* 1293 * Mark the server instance as 1294 * dead, so that nobody will attach 1295 * a new filesystem. 1296 */ 1297 nfs4_mark_srv_dead(sp); 1298 } 1299 } 1300 if (sp != NULL) 1301 mutex_exit(&sp->s_lock); 1302 } else { 1303 mutex_exit(&mi->mi_lock); 1304 } 1305 1306 /* 1307 * Check if we need to select a new server for a 1308 * failover. Choosing a new server will force at 1309 * least a check of the clientid. 1310 */ 1311 mutex_enter(&mi->mi_lock); 1312 if (!recov_fail && 1313 (mi->mi_recovflags & MI4R_NEED_NEW_SERVER)) { 1314 mutex_exit(&mi->mi_lock); 1315 recov_newserver(recovp, &sp, &recov_fail); 1316 } else 1317 mutex_exit(&mi->mi_lock); 1318 1319 /* 1320 * Check if we need to recover the clientid. This 1321 * must be done before file and lock recovery, and it 1322 * potentially affects the recovery threads for other 1323 * filesystems, so it gets special treatment. 1324 */ 1325 if (sp != NULL && recov_fail == FALSE) { 1326 mutex_enter(&sp->s_lock); 1327 if (!(sp->s_flags & N4S_CLIENTID_SET)) { 1328 mutex_exit(&sp->s_lock); 1329 recov_clientid(recovp, sp); 1330 } else { 1331 /* 1332 * Unset this flag in case another recovery 1333 * thread successfully recovered the clientid 1334 * for us already. 1335 */ 1336 mutex_enter(&mi->mi_lock); 1337 mi->mi_recovflags &= ~MI4R_NEED_CLIENTID; 1338 mutex_exit(&mi->mi_lock); 1339 mutex_exit(&sp->s_lock); 1340 } 1341 } 1342 1343 /* 1344 * Check if we need to get the security information. 1345 */ 1346 mutex_enter(&mi->mi_lock); 1347 if ((mi->mi_recovflags & MI4R_NEED_SECINFO) && 1348 !(mi->mi_flags & MI4_RECOV_FAIL)) { 1349 mutex_exit(&mi->mi_lock); 1350 (void) nfs_rw_enter_sig(&mi->mi_recovlock, 1351 RW_WRITER, 0); 1352 error = nfs4_secinfo_recov(recovp->rc_mi, 1353 recovp->rc_vp1, recovp->rc_vp2); 1354 /* 1355 * If error, nothing more can be done, stop 1356 * the recovery. 1357 */ 1358 if (error) { 1359 mutex_enter(&mi->mi_lock); 1360 mi->mi_flags |= MI4_RECOV_FAIL; 1361 mi->mi_error = recovp->rc_error; 1362 mutex_exit(&mi->mi_lock); 1363 nfs4_queue_event(RE_WRONGSEC, mi, NULL, 1364 error, recovp->rc_vp1, recovp->rc_vp2, 1365 0, NULL, 0, TAG_NONE, TAG_NONE, 0, 0); 1366 } 1367 nfs_rw_exit(&mi->mi_recovlock); 1368 } else 1369 mutex_exit(&mi->mi_lock); 1370 1371 /* 1372 * Check if there's a bad seqid to recover. 1373 */ 1374 mutex_enter(&mi->mi_lock); 1375 if ((mi->mi_recovflags & MI4R_BAD_SEQID) && 1376 !(mi->mi_flags & MI4_RECOV_FAIL)) { 1377 mutex_exit(&mi->mi_lock); 1378 (void) nfs_rw_enter_sig(&mi->mi_recovlock, 1379 RW_WRITER, 0); 1380 recov_bad_seqid(recovp); 1381 nfs_rw_exit(&mi->mi_recovlock); 1382 } else 1383 mutex_exit(&mi->mi_lock); 1384 1385 /* 1386 * Next check for recovery that affects the entire 1387 * filesystem. 1388 */ 1389 if (sp != NULL) { 1390 mutex_enter(&mi->mi_lock); 1391 if ((mi->mi_recovflags & MI4R_REOPEN_FILES) && 1392 !(mi->mi_flags & MI4_RECOV_FAIL)) { 1393 mutex_exit(&mi->mi_lock); 1394 recov_openfiles(recovp, sp); 1395 } else 1396 mutex_exit(&mi->mi_lock); 1397 } 1398 1399 /* 1400 * Send any queued state recovery requests. 1401 */ 1402 mutex_enter(&mi->mi_lock); 1403 if (sp != NULL && 1404 (mi->mi_recovflags & MI4R_LOST_STATE) && 1405 !(mi->mi_flags & MI4_RECOV_FAIL)) { 1406 mutex_exit(&mi->mi_lock); 1407 (void) nfs_rw_enter_sig(&mi->mi_recovlock, 1408 RW_WRITER, 0); 1409 nfs4_resend_lost_rqsts(recovp, sp); 1410 if (list_head(&mi->mi_lost_state) == NULL) { 1411 /* done */ 1412 mutex_enter(&mi->mi_lock); 1413 mi->mi_recovflags &= ~MI4R_LOST_STATE; 1414 mutex_exit(&mi->mi_lock); 1415 } 1416 nfs_rw_exit(&mi->mi_recovlock); 1417 } else { 1418 mutex_exit(&mi->mi_lock); 1419 } 1420 1421 /* 1422 * See if there is anything more to do. If not, announce 1423 * that we are done and exit. 1424 * 1425 * Need mi_recovlock to keep 'sp' valid. Must grab 1426 * mi_recovlock before mi_lock to preserve lock ordering. 1427 */ 1428 (void) nfs_rw_enter_sig(&mi->mi_recovlock, RW_READER, 0); 1429 mutex_enter(&mi->mi_lock); 1430 if ((mi->mi_recovflags & ~MI4R_SRV_REBOOT) == 0 || 1431 (mi->mi_flags & MI4_RECOV_FAIL)) { 1432 list_t local_lost_state; 1433 nfs4_lost_rqst_t *lrp; 1434 1435 /* 1436 * We need to remove the lost requests before we 1437 * unmark the mi as no longer doing recovery to 1438 * avoid a race with a new thread putting new lost 1439 * requests on the same mi (and the going away 1440 * thread would remove the new lost requests). 1441 * 1442 * Move the lost requests to a local list since 1443 * nfs4_remove_lost_rqst() drops mi_lock, and 1444 * dropping the mi_lock would make our check to 1445 * see if recovery is done no longer valid. 1446 */ 1447 list_create(&local_lost_state, 1448 sizeof (nfs4_lost_rqst_t), 1449 offsetof(nfs4_lost_rqst_t, lr_node)); 1450 list_move_tail(&local_lost_state, &mi->mi_lost_state); 1451 1452 done = 1; 1453 mutex_exit(&mi->mi_lock); 1454 /* 1455 * Now officially free the "moved" 1456 * lost requests. 1457 */ 1458 while ((lrp = list_head(&local_lost_state)) != NULL) { 1459 list_remove(&local_lost_state, lrp); 1460 nfs4_free_lost_rqst(lrp, sp); 1461 } 1462 list_destroy(&local_lost_state); 1463 } else 1464 mutex_exit(&mi->mi_lock); 1465 nfs_rw_exit(&mi->mi_recovlock); 1466 1467 /* 1468 * If the filesystem has been forcibly unmounted, there is 1469 * probably no point in retrying immediately. Furthermore, 1470 * there might be user processes waiting for a chance to 1471 * queue up "lost state" requests, so that they can exit. 1472 * So pause here for a moment. Same logic for zone shutdown. 1473 */ 1474 if (!done && FS_OR_ZONE_GONE4(mi->mi_vfsp)) { 1475 mutex_enter(&mi->mi_lock); 1476 cv_broadcast(&mi->mi_failover_cv); 1477 mutex_exit(&mi->mi_lock); 1478 delay(SEC_TO_TICK(nfs4_unmount_delay)); 1479 } 1480 1481 } while (!done); 1482 1483 if (sp != NULL) 1484 nfs4_server_rele(sp); 1485 1486 /* 1487 * Return all recalled delegations 1488 */ 1489 nfs4_dlistclean(); 1490 1491 mutex_enter(&mi->mi_lock); 1492 recov_done(mi, recovp); 1493 mutex_exit(&mi->mi_lock); 1494 1495 /* 1496 * Free up resources that were allocated for us. 1497 */ 1498 if (recovp->rc_vp1 != NULL) 1499 VN_RELE(recovp->rc_vp1); 1500 if (recovp->rc_vp2 != NULL) 1501 VN_RELE(recovp->rc_vp2); 1502 1503 /* now we are done using the mi struct, signal the waiters */ 1504 mutex_enter(&mi->mi_lock); 1505 mi->mi_in_recovery--; 1506 if (mi->mi_in_recovery == 0) 1507 cv_broadcast(&mi->mi_cv_in_recov); 1508 mutex_exit(&mi->mi_lock); 1509 1510 VFS_RELE(mi->mi_vfsp); 1511 MI4_RELE(mi); 1512 kmem_free(recovp, sizeof (recov_info_t)); 1513 mutex_enter(&cpr_lock); 1514 CALLB_CPR_EXIT(&cpr_info); 1515 mutex_destroy(&cpr_lock); 1516 zthread_exit(); 1517 } 1518 1519 /* 1520 * Log the end of recovery and notify any waiting threads. 1521 */ 1522 1523 static void 1524 recov_done(mntinfo4_t *mi, recov_info_t *recovp) 1525 { 1526 1527 ASSERT(MUTEX_HELD(&mi->mi_lock)); 1528 1529 nfs4_queue_event(RE_END, mi, NULL, 0, recovp->rc_vp1, 1530 recovp->rc_vp2, 0, NULL, 0, TAG_NONE, TAG_NONE, 0, 0); 1531 mi->mi_recovthread = NULL; 1532 mi->mi_flags &= ~MI4_RECOV_ACTIV; 1533 mi->mi_recovflags &= ~MI4R_SRV_REBOOT; 1534 cv_broadcast(&mi->mi_failover_cv); 1535 } 1536 1537 /* 1538 * State-specific recovery routines, by state. 1539 */ 1540 1541 /* 1542 * Failover. 1543 * 1544 * Replaces *spp with a reference to the new server, which must 1545 * eventually be freed. 1546 */ 1547 1548 static void 1549 recov_newserver(recov_info_t *recovp, nfs4_server_t **spp, bool_t *recov_fail) 1550 { 1551 mntinfo4_t *mi = recovp->rc_mi; 1552 servinfo4_t *svp = NULL; 1553 nfs4_server_t *osp = *spp; 1554 CLIENT *cl; 1555 enum clnt_stat status; 1556 struct timeval tv; 1557 int error; 1558 int oncethru = 0; 1559 rnode4_t *rp; 1560 int index; 1561 nfs_fh4 fh; 1562 char *snames; 1563 size_t len; 1564 1565 (void) nfs_rw_enter_sig(&mi->mi_recovlock, RW_WRITER, 0); 1566 1567 tv.tv_sec = 2; 1568 tv.tv_usec = 0; 1569 1570 #ifdef lint 1571 /* 1572 * Lint can't follow the logic, so thinks that snames and len 1573 * can be used before being set. They can't, but lint can't 1574 * figure it out. To address the lint warning, initialize 1575 * snames and len for lint. 1576 */ 1577 snames = NULL; 1578 len = 0; 1579 #endif 1580 1581 /* 1582 * Ping the null NFS procedure of every server in 1583 * the list until one responds. We always start 1584 * at the head of the list and always skip the one 1585 * that is current, since it's caused us a problem. 1586 */ 1587 while (svp == NULL) { 1588 for (svp = mi->mi_servers; svp; svp = svp->sv_next) { 1589 1590 mutex_enter(&mi->mi_lock); 1591 if (FS_OR_ZONE_GONE4(mi->mi_vfsp)) { 1592 mi->mi_flags |= MI4_RECOV_FAIL; 1593 mutex_exit(&mi->mi_lock); 1594 (void) nfs_rw_exit(&mi->mi_recovlock); 1595 *recov_fail = TRUE; 1596 if (oncethru) 1597 kmem_free(snames, len); 1598 return; 1599 } 1600 mutex_exit(&mi->mi_lock); 1601 1602 (void) nfs_rw_enter_sig(&svp->sv_lock, RW_READER, 0); 1603 if (svp->sv_flags & SV4_NOTINUSE) { 1604 nfs_rw_exit(&svp->sv_lock); 1605 continue; 1606 } 1607 nfs_rw_exit(&svp->sv_lock); 1608 1609 if (!oncethru && svp == mi->mi_curr_serv) 1610 continue; 1611 1612 error = clnt_tli_kcreate(svp->sv_knconf, &svp->sv_addr, 1613 NFS_PROGRAM, NFS_V4, 0, 1, CRED(), &cl); 1614 if (error) 1615 continue; 1616 1617 if (!(mi->mi_flags & MI4_INT)) 1618 cl->cl_nosignal = TRUE; 1619 status = CLNT_CALL(cl, RFS_NULL, xdr_void, NULL, 1620 xdr_void, NULL, tv); 1621 if (!(mi->mi_flags & MI4_INT)) 1622 cl->cl_nosignal = FALSE; 1623 AUTH_DESTROY(cl->cl_auth); 1624 CLNT_DESTROY(cl); 1625 if (status == RPC_SUCCESS) { 1626 nfs4_queue_event(RE_FAILOVER, mi, 1627 svp == mi->mi_curr_serv ? NULL : 1628 svp->sv_hostname, 0, NULL, NULL, 0, 1629 NULL, 0, TAG_NONE, TAG_NONE, 0, 0); 1630 break; 1631 } 1632 } 1633 1634 if (svp == NULL) { 1635 if (!oncethru) { 1636 snames = nfs4_getsrvnames(mi, &len); 1637 nfs4_queue_fact(RF_SRVS_NOT_RESPOND, mi, 1638 0, 0, 0, FALSE, snames, 0, NULL); 1639 oncethru = 1; 1640 } 1641 delay(hz); 1642 } 1643 } 1644 1645 if (oncethru) { 1646 nfs4_queue_fact(RF_SRVS_OK, mi, 0, 0, 0, FALSE, snames, 1647 0, NULL); 1648 kmem_free(snames, len); 1649 } 1650 1651 #if DEBUG 1652 (void) nfs_rw_enter_sig(&svp->sv_lock, RW_READER, 0); 1653 ASSERT((svp->sv_flags & SV4_NOTINUSE) == 0); 1654 nfs_rw_exit(&svp->sv_lock); 1655 #endif 1656 1657 mutex_enter(&mi->mi_lock); 1658 mi->mi_recovflags &= ~MI4R_NEED_NEW_SERVER; 1659 if (svp != mi->mi_curr_serv) { 1660 servinfo4_t *osvp = mi->mi_curr_serv; 1661 1662 mutex_exit(&mi->mi_lock); 1663 1664 /* 1665 * Update server-dependent fields in the root vnode. 1666 */ 1667 index = rtable4hash(mi->mi_rootfh); 1668 rw_enter(&rtable4[index].r_lock, RW_WRITER); 1669 1670 rp = r4find(&rtable4[index], mi->mi_rootfh, mi->mi_vfsp); 1671 if (rp != NULL) { 1672 NFS4_DEBUG(nfs4_client_failover_debug, (CE_NOTE, 1673 "recov_newserver: remapping %s", rnode4info(rp))); 1674 mutex_enter(&rp->r_statelock); 1675 rp->r_server = svp; 1676 PURGE_ATTRCACHE4_LOCKED(rp); 1677 mutex_exit(&rp->r_statelock); 1678 (void) nfs4_free_data_reclaim(rp); 1679 nfs4_purge_rddir_cache(RTOV4(rp)); 1680 rw_exit(&rtable4[index].r_lock); 1681 NFS4_DEBUG(nfs4_client_failover_debug, (CE_NOTE, 1682 "recov_newserver: done with %s", 1683 rnode4info(rp))); 1684 VN_RELE(RTOV4(rp)); 1685 } else 1686 rw_exit(&rtable4[index].r_lock); 1687 (void) dnlc_purge_vfsp(mi->mi_vfsp, 0); 1688 1689 mutex_enter(&mi->mi_lock); 1690 mi->mi_recovflags |= MI4R_REOPEN_FILES | MI4R_REMAP_FILES; 1691 if (recovp->rc_srv_reboot) 1692 mi->mi_recovflags |= MI4R_SRV_REBOOT; 1693 mi->mi_curr_serv = svp; 1694 mi->mi_failover++; 1695 mi->mi_flags &= ~MI4_BADOWNER_DEBUG; 1696 mutex_exit(&mi->mi_lock); 1697 1698 (void) nfs_rw_enter_sig(&svp->sv_lock, RW_READER, 0); 1699 fh.nfs_fh4_len = svp->sv_fhandle.fh_len; 1700 fh.nfs_fh4_val = svp->sv_fhandle.fh_buf; 1701 sfh4_update(mi->mi_rootfh, &fh); 1702 fh.nfs_fh4_len = svp->sv_pfhandle.fh_len; 1703 fh.nfs_fh4_val = svp->sv_pfhandle.fh_buf; 1704 sfh4_update(mi->mi_srvparentfh, &fh); 1705 nfs_rw_exit(&svp->sv_lock); 1706 1707 *spp = nfs4_move_mi(mi, osvp, svp); 1708 if (osp != NULL) 1709 nfs4_server_rele(osp); 1710 } else 1711 mutex_exit(&mi->mi_lock); 1712 (void) nfs_rw_exit(&mi->mi_recovlock); 1713 } 1714 1715 /* 1716 * Clientid. 1717 */ 1718 1719 static void 1720 recov_clientid(recov_info_t *recovp, nfs4_server_t *sp) 1721 { 1722 mntinfo4_t *mi = recovp->rc_mi; 1723 int error = 0; 1724 int still_stale; 1725 int need_new_s; 1726 1727 ASSERT(sp != NULL); 1728 1729 /* 1730 * Acquire the recovery lock and then verify that the clientid 1731 * still needs to be recovered. (Note that s_recovlock is supposed 1732 * to be acquired before s_lock.) Since the thread holds the 1733 * recovery lock, no other thread will recover the clientid. 1734 */ 1735 (void) nfs_rw_enter_sig(&sp->s_recovlock, RW_WRITER, 0); 1736 (void) nfs_rw_enter_sig(&mi->mi_recovlock, RW_WRITER, 0); 1737 mutex_enter(&sp->s_lock); 1738 still_stale = ((sp->s_flags & N4S_CLIENTID_SET) == 0); 1739 mutex_exit(&sp->s_lock); 1740 1741 if (still_stale) { 1742 nfs4_error_t n4e; 1743 1744 nfs4_error_zinit(&n4e); 1745 nfs4setclientid(mi, kcred, TRUE, &n4e); 1746 error = n4e.error; 1747 if (error != 0) { 1748 1749 /* 1750 * nfs4setclientid may have set MI4R_NEED_NEW_SERVER, 1751 * if so, just return and let recov_thread drive 1752 * failover. 1753 */ 1754 mutex_enter(&mi->mi_lock); 1755 need_new_s = mi->mi_recovflags & MI4R_NEED_NEW_SERVER; 1756 mutex_exit(&mi->mi_lock); 1757 1758 if (need_new_s) { 1759 nfs_rw_exit(&mi->mi_recovlock); 1760 nfs_rw_exit(&sp->s_recovlock); 1761 return; 1762 } 1763 1764 nfs4_queue_event(RE_CLIENTID, mi, NULL, n4e.error, NULL, 1765 NULL, n4e.stat, NULL, 0, TAG_NONE, TAG_NONE, 0, 0); 1766 mutex_enter(&mi->mi_lock); 1767 mi->mi_flags |= MI4_RECOV_FAIL; 1768 mi->mi_error = recovp->rc_error; 1769 mutex_exit(&mi->mi_lock); 1770 /* don't destroy the nfs4_server, let umount do it */ 1771 } 1772 } 1773 1774 if (error == 0) { 1775 mutex_enter(&mi->mi_lock); 1776 mi->mi_recovflags &= ~MI4R_NEED_CLIENTID; 1777 /* 1778 * If still_stale isn't true, then another thread already 1779 * recovered the clientid. And that thread that set the 1780 * clientid will have initiated reopening files on all the 1781 * filesystems for the server, so we should not initiate 1782 * reopening for this filesystem here. 1783 */ 1784 if (still_stale) { 1785 mi->mi_recovflags |= MI4R_REOPEN_FILES; 1786 if (recovp->rc_srv_reboot) 1787 mi->mi_recovflags |= MI4R_SRV_REBOOT; 1788 } 1789 mutex_exit(&mi->mi_lock); 1790 } 1791 1792 nfs_rw_exit(&mi->mi_recovlock); 1793 1794 if (error != 0) { 1795 nfs_rw_exit(&sp->s_recovlock); 1796 mutex_enter(&mi->mi_lock); 1797 if ((mi->mi_flags & MI4_RECOV_FAIL) == 0) 1798 delay(SEC_TO_TICK(recov_err_delay)); 1799 mutex_exit(&mi->mi_lock); 1800 } else { 1801 mntinfo4_t **milist; 1802 mntinfo4_t *tmi; 1803 int nummi, i; 1804 1805 /* 1806 * Initiate recovery of open files for other filesystems. 1807 * We create an array of filesystems, rather than just 1808 * walking the filesystem list, to avoid deadlock issues 1809 * with s_lock and mi_recovlock. 1810 */ 1811 milist = make_milist(sp, &nummi); 1812 for (i = 0; i < nummi; i++) { 1813 tmi = milist[i]; 1814 if (tmi != mi) { 1815 (void) nfs_rw_enter_sig(&tmi->mi_recovlock, 1816 RW_READER, 0); 1817 start_recovery_action(NR_OPENFILES, TRUE, tmi, 1818 NULL, NULL); 1819 nfs_rw_exit(&tmi->mi_recovlock); 1820 } 1821 } 1822 free_milist(milist, nummi); 1823 1824 nfs_rw_exit(&sp->s_recovlock); 1825 } 1826 } 1827 1828 /* 1829 * Return an array of filesystems associated with the given server. The 1830 * caller should call free_milist() to free the references and memory. 1831 */ 1832 1833 static mntinfo4_t ** 1834 make_milist(nfs4_server_t *sp, int *nummip) 1835 { 1836 int nummi, i; 1837 mntinfo4_t **milist; 1838 mntinfo4_t *tmi; 1839 1840 mutex_enter(&sp->s_lock); 1841 nummi = 0; 1842 for (tmi = sp->mntinfo4_list; tmi != NULL; tmi = tmi->mi_clientid_next) 1843 nummi++; 1844 1845 milist = kmem_alloc(nummi * sizeof (mntinfo4_t *), KM_SLEEP); 1846 1847 for (i = 0, tmi = sp->mntinfo4_list; tmi != NULL; i++, 1848 tmi = tmi->mi_clientid_next) { 1849 milist[i] = tmi; 1850 VFS_HOLD(tmi->mi_vfsp); 1851 } 1852 mutex_exit(&sp->s_lock); 1853 1854 *nummip = nummi; 1855 return (milist); 1856 } 1857 1858 /* 1859 * Free the filesystem list created by make_milist(). 1860 */ 1861 1862 static void 1863 free_milist(mntinfo4_t **milist, int nummi) 1864 { 1865 mntinfo4_t *tmi; 1866 int i; 1867 1868 for (i = 0; i < nummi; i++) { 1869 tmi = milist[i]; 1870 VFS_RELE(tmi->mi_vfsp); 1871 } 1872 kmem_free(milist, nummi * sizeof (mntinfo4_t *)); 1873 } 1874 1875 /* 1876 * Filehandle 1877 */ 1878 1879 /* 1880 * Lookup the filehandle for the given vnode and update the rnode if it has 1881 * changed. 1882 * 1883 * Errors: 1884 * - if the filehandle could not be updated because of an error that 1885 * requires further recovery, initiate that recovery and return. 1886 * - if the filehandle could not be updated because of a signal, pretend we 1887 * succeeded and let someone else deal with it. 1888 * - if the filehandle could not be updated and the filesystem has been 1889 * forcibly unmounted, pretend we succeeded, and let the caller deal with 1890 * the forced unmount (to retry or not to retry, that is the question). 1891 * - if the filehandle could not be updated because of some other error, 1892 * mark the rnode bad and return. 1893 */ 1894 static void 1895 recov_filehandle(nfs4_recov_t action, mntinfo4_t *mi, vnode_t *vp) 1896 { 1897 rnode4_t *rp = VTOR4(vp); 1898 nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS }; 1899 bool_t needrecov; 1900 1901 mutex_enter(&rp->r_statelock); 1902 1903 if (rp->r_flags & R4RECOVERR) { 1904 mutex_exit(&rp->r_statelock); 1905 return; 1906 } 1907 1908 /* 1909 * If someone else is updating the filehandle, wait for them to 1910 * finish and then let our caller retry. 1911 */ 1912 if (rp->r_flags & R4RECEXPFH) { 1913 while (rp->r_flags & R4RECEXPFH) { 1914 cv_wait(&rp->r_cv, &rp->r_statelock); 1915 } 1916 mutex_exit(&rp->r_statelock); 1917 return; 1918 } 1919 rp->r_flags |= R4RECEXPFH; 1920 mutex_exit(&rp->r_statelock); 1921 1922 if (action == NR_BADHANDLE) { 1923 /* shouldn't happen */ 1924 nfs4_queue_event(RE_BADHANDLE, mi, NULL, 0, 1925 vp, NULL, 0, NULL, 0, TAG_NONE, TAG_NONE, 0, 0); 1926 } 1927 1928 nfs4_remap_file(mi, vp, 0, &e); 1929 needrecov = nfs4_needs_recovery(&e, FALSE, mi->mi_vfsp); 1930 1931 /* 1932 * If we get BADHANDLE or FHEXPIRED in their handler, something is 1933 * broken. Don't try to recover, just mark the file dead. 1934 */ 1935 if (needrecov && e.error == 0 && 1936 (e.stat == NFS4ERR_BADHANDLE || e.stat == NFS4ERR_FHEXPIRED)) 1937 needrecov = FALSE; 1938 if (needrecov) { 1939 (void) nfs4_start_recovery(&e, mi, vp, 1940 NULL, NULL, NULL, OP_LOOKUP, NULL); 1941 } else if (e.error != EINTR && 1942 !NFS4_FRC_UNMT_ERR(e.error, mi->mi_vfsp) && 1943 (e.error != 0 || e.stat != NFS4_OK)) { 1944 nfs4_recov_fh_fail(vp, e.error, e.stat); 1945 /* 1946 * Don't set r_error to ESTALE. Higher-level code (e.g., 1947 * cstatat_getvp()) retries on ESTALE, which would cause 1948 * an infinite loop. 1949 */ 1950 } 1951 1952 mutex_enter(&rp->r_statelock); 1953 rp->r_flags &= ~R4RECEXPFH; 1954 cv_broadcast(&rp->r_cv); 1955 mutex_exit(&rp->r_statelock); 1956 } 1957 1958 /* 1959 * Stale Filehandle 1960 */ 1961 1962 /* 1963 * A stale filehandle can happen when an individual file has 1964 * been removed, or when an entire filesystem has been taken 1965 * offline. To distinguish these cases, we do this: 1966 * - if a GETATTR with the current filehandle is okay, we do 1967 * nothing (this can happen with two-filehandle ops) 1968 * - if the GETATTR fails, but a GETATTR of the root filehandle 1969 * succeeds, mark the rnode with R4STALE, which will stop use 1970 * - if the GETATTR fails, and a GETATTR of the root filehandle 1971 * also fails, we consider the problem filesystem-wide, so: 1972 * - if we can failover, we should 1973 * - if we can't failover, we should mark both the original 1974 * vnode and the root bad 1975 */ 1976 static void 1977 recov_stale(mntinfo4_t *mi, vnode_t *vp) 1978 { 1979 rnode4_t *rp = VTOR4(vp); 1980 vnode_t *rootvp = NULL; 1981 nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS }; 1982 nfs4_ga_res_t gar; 1983 char *fail_msg = "failed to recover from NFS4ERR_STALE"; 1984 bool_t needrecov; 1985 1986 mutex_enter(&rp->r_statelock); 1987 1988 if (rp->r_flags & R4RECOVERR) { 1989 mutex_exit(&rp->r_statelock); 1990 NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE, 1991 "recov_stale: already marked dead, rp %s", 1992 rnode4info(rp))); 1993 return; 1994 } 1995 1996 if (rp->r_flags & R4STALE) { 1997 mutex_exit(&rp->r_statelock); 1998 NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE, 1999 "recov_stale: already marked stale, rp %s", 2000 rnode4info(rp))); 2001 return; 2002 } 2003 2004 mutex_exit(&rp->r_statelock); 2005 2006 /* Try a GETATTR on this vnode */ 2007 nfs4_getattr_otw_norecovery(vp, &gar, &e, CRED(), 0); 2008 2009 /* 2010 * Handle non-STALE recoverable errors 2011 */ 2012 needrecov = nfs4_needs_recovery(&e, FALSE, vp->v_vfsp); 2013 if (needrecov && (e.error != 0 || e.stat != NFS4ERR_STALE)) { 2014 (void) nfs4_start_recovery(&e, mi, vp, 2015 NULL, NULL, NULL, OP_GETATTR, NULL); 2016 NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE, 2017 "recov_stale: error=%d, stat=%d seen on rp %s", 2018 e.error, e.stat, rnode4info(rp))); 2019 goto out; 2020 } 2021 2022 /* Are things OK for this vnode? */ 2023 if (!e.error && e.stat == NFS4_OK) { 2024 NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE, 2025 "recov_stale: file appears fine, rp %s", 2026 rnode4info(rp))); 2027 goto out; 2028 } 2029 2030 /* Did we get an unrelated non-recoverable error? */ 2031 if (e.error || e.stat != NFS4ERR_STALE) { 2032 nfs4_fail_recov(vp, fail_msg, e.error, e.stat); 2033 NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE, 2034 "recov_stale: unrelated fatal error, rp %s", 2035 rnode4info(rp))); 2036 goto out; 2037 } 2038 2039 /* 2040 * If we don't appear to be dealing with the root node, find it. 2041 */ 2042 if ((vp->v_flag & VROOT) == 0) { 2043 nfs4_error_zinit(&e); 2044 e.error = VFS_ROOT(vp->v_vfsp, &rootvp); 2045 if (e.error) { 2046 nfs4_fail_recov(vp, fail_msg, 0, NFS4ERR_STALE); 2047 NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE, 2048 "recov_stale: can't find root node for rp %s", 2049 rnode4info(rp))); 2050 goto out; 2051 } 2052 } 2053 2054 /* Try a GETATTR on the root vnode */ 2055 if (rootvp != NULL) { 2056 nfs4_error_zinit(&e); 2057 nfs4_getattr_otw_norecovery(rootvp, &gar, &e, CRED(), 0); 2058 2059 /* Try recovery? */ 2060 if (e.error != 0 || e.stat != NFS4ERR_STALE) { 2061 needrecov = nfs4_needs_recovery(&e, FALSE, vp->v_vfsp); 2062 if (needrecov) { 2063 (void) nfs4_start_recovery(&e, 2064 mi, rootvp, NULL, NULL, NULL, 2065 OP_GETATTR, NULL); 2066 NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE, 2067 "recov_stale: error=%d, stat=%d seen " 2068 "on rp %s", e.error, e.stat, 2069 rnode4info(rp))); 2070 } 2071 } 2072 2073 /* 2074 * Check to see if a failover attempt is warranted 2075 * NB: nfs4_try_failover doesn't check for STALE 2076 * because recov_stale gets a shot first. Now that 2077 * recov_stale has failed, go ahead and try failover. 2078 * 2079 * If the getattr on the root filehandle was successful, 2080 * then mark recovery as failed for 'vp' and exit. 2081 */ 2082 if (nfs4_try_failover(&e) == 0 && e.stat != NFS4ERR_STALE) { 2083 /* 2084 * pass the original error to fail_recov, not 2085 * the one from trying the root vnode. 2086 */ 2087 nfs4_fail_recov(vp, fail_msg, 0, NFS4ERR_STALE); 2088 NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE, 2089 "recov_stale: root node OK, marking " 2090 "dead rp %s", rnode4info(rp))); 2091 goto out; 2092 } 2093 } 2094 2095 /* 2096 * Here, we know that both the original file and the 2097 * root filehandle (which may be the same) are stale. 2098 * We want to fail over if we can, and if we can't, we 2099 * want to mark everything in sight bad. 2100 */ 2101 if (FAILOVER_MOUNT4(mi)) { 2102 mutex_enter(&mi->mi_lock); 2103 mi->mi_recovflags |= MI4R_NEED_NEW_SERVER; 2104 NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE, 2105 "recov_stale: failing over due to rp %s", 2106 rnode4info(rp))); 2107 mutex_exit(&mi->mi_lock); 2108 } else { 2109 rnode4_t *rootrp; 2110 servinfo4_t *svp; 2111 2112 /* 2113 * Can't fail over, so mark things dead. 2114 * 2115 * If rootvp is set, we know we have a distinct 2116 * non-root vnode which can be marked dead in 2117 * the usual way. 2118 * 2119 * Then we want to mark the root vnode dead. 2120 * Note that if rootvp wasn't set, our vp is 2121 * actually the root vnode. 2122 */ 2123 if (rootvp != NULL) { 2124 NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE, 2125 "recov_stale: can't fail over, marking dead rp %s", 2126 rnode4info(rp))); 2127 nfs4_fail_recov(vp, fail_msg, 0, NFS4ERR_STALE); 2128 } else { 2129 rootvp = vp; 2130 VN_HOLD(rootvp); 2131 } 2132 2133 /* 2134 * Mark root dead, but quietly - since 2135 * the root rnode is frequently recreated, 2136 * we can encounter this at every access. 2137 * Also mark recovery as failed on this VFS. 2138 */ 2139 rootrp = VTOR4(rootvp); 2140 NFS4_DEBUG(nfs4_client_recov_debug, (CE_CONT, 2141 "recov_stale: marking dead root rp %s", 2142 rnode4info(rootrp))); 2143 mutex_enter(&rootrp->r_statelock); 2144 rootrp->r_flags |= (R4RECOVERR | R4STALE); 2145 rootrp->r_error = ESTALE; 2146 mutex_exit(&rootrp->r_statelock); 2147 mutex_enter(&mi->mi_lock); 2148 mi->mi_error = ESTALE; 2149 mutex_exit(&mi->mi_lock); 2150 2151 svp = mi->mi_curr_serv; 2152 (void) nfs_rw_enter_sig(&svp->sv_lock, RW_WRITER, 0); 2153 svp->sv_flags |= SV4_ROOT_STALE; 2154 nfs_rw_exit(&svp->sv_lock); 2155 } 2156 2157 out: 2158 if (rootvp) 2159 VN_RELE(rootvp); 2160 } 2161 2162 /* 2163 * Locks. 2164 */ 2165 2166 /* 2167 * Reclaim all the active (acquired) locks for the given file. 2168 * If a process lost a lock, the process is sent a SIGLOST. This is not 2169 * considered an error. 2170 * 2171 * Return values: 2172 * Errors and status are returned via the nfs4_error_t parameter 2173 * If an error indicates that recovery is needed, the caller is responsible 2174 * for dealing with it. 2175 */ 2176 2177 static void 2178 relock_file(vnode_t *vp, mntinfo4_t *mi, nfs4_error_t *ep, 2179 fattr4_change pre_change) 2180 { 2181 locklist_t *locks, *llp; 2182 rnode4_t *rp; 2183 2184 ASSERT(ep != NULL); 2185 nfs4_error_zinit(ep); 2186 2187 if (VTOMI4(vp)->mi_flags & MI4_LLOCK) 2188 return; 2189 2190 nfs4_flush_lock_owners(VTOR4(vp)); 2191 2192 /* 2193 * If we get an error that requires recovery actions, just bail out 2194 * and let the top-level recovery code handle it. 2195 * 2196 * If we get some other error, kill the process that owned the lock 2197 * and mark its remaining locks (if any) as belonging to NOPID, so 2198 * that we don't make any more reclaim requests for that process. 2199 */ 2200 2201 rp = VTOR4(vp); 2202 locks = flk_active_locks_for_vp(vp); 2203 for (llp = locks; llp != NULL; llp = llp->ll_next) { 2204 int did_reclaim = 1; 2205 2206 ASSERT(llp->ll_vp == vp); 2207 if (llp->ll_flock.l_pid == NOPID) 2208 continue; 2209 reclaim_one_lock(vp, &llp->ll_flock, ep, &did_reclaim); 2210 /* 2211 * If we need to restart recovery, stop processing the 2212 * list. Some errors would be recoverable under other 2213 * circumstances, but if they happen here we just give up 2214 * on the lock. 2215 */ 2216 if (nfs4_needs_recovery(ep, TRUE, vp->v_vfsp)) { 2217 if (ep->error != 0) 2218 break; 2219 if (!nfs4_recov_marks_dead(ep->stat)) 2220 break; 2221 } 2222 /* 2223 * In case the server isn't offering us a grace period, or 2224 * if we missed it, we might have opened & locked from scratch, 2225 * rather than reopened/reclaimed. 2226 * We need to ensure that the object hadn't been otherwise 2227 * changed during this time, by comparing the changeinfo. 2228 * We get passed the changeinfo from before the reopen by our 2229 * caller, in pre_change. 2230 * The changeinfo from after the reopen is in rp->r_change, 2231 * courtesy of the GETATTR in the reopen. 2232 * If they're different, then the file has changed, and we 2233 * have to SIGLOST the app. 2234 */ 2235 if (ep->error == 0 && ep->stat == NFS4_OK && !did_reclaim) { 2236 mutex_enter(&rp->r_statelock); 2237 if (pre_change != rp->r_change) 2238 ep->stat = NFS4ERR_NO_GRACE; 2239 mutex_exit(&rp->r_statelock); 2240 } 2241 if (ep->error != 0 || ep->stat != NFS4_OK) { 2242 if (ep->error != 0) 2243 nfs4_queue_event(RE_FAIL_RELOCK, mi, 2244 NULL, ep->error, vp, NULL, 0, NULL, 2245 llp->ll_flock.l_pid, TAG_NONE, TAG_NONE, 2246 0, 0); 2247 else 2248 nfs4_queue_event(RE_FAIL_RELOCK, mi, 2249 NULL, 0, vp, NULL, ep->stat, NULL, 2250 llp->ll_flock.l_pid, TAG_NONE, TAG_NONE, 2251 0, 0); 2252 nfs4_send_siglost(llp->ll_flock.l_pid, mi, vp, TRUE, 2253 ep->error, ep->stat); 2254 relock_skip_pid(llp, llp->ll_flock.l_pid); 2255 2256 /* Reinitialize the nfs4_error and continue */ 2257 nfs4_error_zinit(ep); 2258 } 2259 } 2260 2261 if (locks != NULL) 2262 flk_free_locklist(locks); 2263 } 2264 2265 /* 2266 * Reclaim the given lock. 2267 * If the lock can't be reclaimed, the process is sent SIGLOST, but this is 2268 * not considered an error. 2269 * 2270 * Errors are returned via the nfs4_error_t parameter. 2271 */ 2272 static void 2273 reclaim_one_lock(vnode_t *vp, flock64_t *flk, nfs4_error_t *ep, 2274 int *did_reclaimp) 2275 { 2276 cred_t *cr; 2277 rnode4_t *rp = VTOR4(vp); 2278 2279 cr = pid_to_cr(flk->l_pid); 2280 if (cr == NULL) { 2281 nfs4_error_zinit(ep); 2282 ep->error = ESRCH; 2283 return; 2284 } 2285 2286 do { 2287 mutex_enter(&rp->r_statelock); 2288 if (rp->r_flags & R4RECOVERR) { 2289 /* 2290 * This shouldn't affect other reclaims, so don't 2291 * return an error. 2292 */ 2293 mutex_exit(&rp->r_statelock); 2294 break; 2295 } 2296 mutex_exit(&rp->r_statelock); 2297 2298 nfs4frlock(NFS4_LCK_CTYPE_RECLAIM, vp, F_SETLK, flk, 2299 FREAD|FWRITE, 0, cr, ep, NULL, did_reclaimp); 2300 if (ep->error == 0 && ep->stat == NFS4ERR_FHEXPIRED) 2301 start_recovery_action(NR_FHEXPIRED, TRUE, VTOMI4(vp), 2302 vp, NULL); 2303 } while (ep->error == 0 && ep->stat == NFS4ERR_FHEXPIRED); 2304 2305 crfree(cr); 2306 } 2307 2308 /* 2309 * Open files. 2310 */ 2311 2312 /* 2313 * Verifies if the nfsstat4 is a valid error for marking this vnode dead. 2314 * Returns 1 if the error is valid; 0 otherwise. 2315 */ 2316 static int 2317 nfs4_valid_recov_err_for_vp(vnode_t *vp, nfsstat4 stat) 2318 { 2319 /* 2320 * We should not be marking non-regular files as dead, 2321 * except in very rare cases (eg: BADHANDLE or NFS4ERR_BADNAME). 2322 */ 2323 if (vp->v_type != VREG && stat != NFS4ERR_BADHANDLE && 2324 stat != NFS4ERR_BADNAME) 2325 return (0); 2326 2327 return (1); 2328 } 2329 2330 /* 2331 * Failed attempting to recover a filehandle. If 'stat' is valid for 'vp', 2332 * then mark the object dead. Since we've had to do a lookup for 2333 * filehandle recovery, we will mark the object dead if we got NOENT. 2334 */ 2335 static void 2336 nfs4_recov_fh_fail(vnode_t *vp, int error, nfsstat4 stat) 2337 { 2338 ASSERT(vp != NULL); 2339 2340 if ((error == 0) && (stat != NFS4ERR_NOENT) && 2341 (!nfs4_valid_recov_err_for_vp(vp, stat))) 2342 return; 2343 2344 nfs4_fail_recov(vp, "can't recover filehandle", error, stat); 2345 } 2346 2347 /* 2348 * Recovery from a "shouldn't happen" error. In the long term, we'd like 2349 * to mark only the data structure(s) that provided the bad value as being 2350 * bad. But for now we'll just mark the entire file. 2351 */ 2352 2353 static void 2354 recov_badstate(recov_info_t *recovp, vnode_t *vp, nfsstat4 stat) 2355 { 2356 ASSERT(vp != NULL); 2357 recov_throttle(recovp, vp); 2358 2359 if (!nfs4_valid_recov_err_for_vp(vp, stat)) 2360 return; 2361 2362 nfs4_fail_recov(vp, "", 0, stat); 2363 } 2364 2365 /* 2366 * Free up the information saved for a lost state request. 2367 */ 2368 static void 2369 nfs4_free_lost_rqst(nfs4_lost_rqst_t *lrp, nfs4_server_t *sp) 2370 { 2371 component4 *filep; 2372 nfs4_open_stream_t *osp; 2373 int have_sync_lock; 2374 2375 NFS4_DEBUG(nfs4_lost_rqst_debug, 2376 (CE_NOTE, "nfs4_free_lost_rqst:")); 2377 2378 switch (lrp->lr_op) { 2379 case OP_OPEN: 2380 filep = &lrp->lr_ofile; 2381 if (filep->utf8string_val) { 2382 kmem_free(filep->utf8string_val, filep->utf8string_len); 2383 filep->utf8string_val = NULL; 2384 } 2385 break; 2386 case OP_DELEGRETURN: 2387 nfs4delegreturn_cleanup(VTOR4(lrp->lr_vp), sp); 2388 break; 2389 case OP_CLOSE: 2390 osp = lrp->lr_osp; 2391 ASSERT(osp != NULL); 2392 mutex_enter(&osp->os_sync_lock); 2393 have_sync_lock = 1; 2394 if (osp->os_pending_close) { 2395 /* clean up the open file state. */ 2396 osp->os_pending_close = 0; 2397 nfs4close_notw(lrp->lr_vp, osp, &have_sync_lock); 2398 } 2399 if (have_sync_lock) 2400 mutex_exit(&osp->os_sync_lock); 2401 break; 2402 } 2403 2404 lrp->lr_op = 0; 2405 if (lrp->lr_oop != NULL) { 2406 open_owner_rele(lrp->lr_oop); 2407 lrp->lr_oop = NULL; 2408 } 2409 if (lrp->lr_osp != NULL) { 2410 open_stream_rele(lrp->lr_osp, VTOR4(lrp->lr_vp)); 2411 lrp->lr_osp = NULL; 2412 } 2413 if (lrp->lr_lop != NULL) { 2414 lock_owner_rele(lrp->lr_lop); 2415 lrp->lr_lop = NULL; 2416 } 2417 if (lrp->lr_flk != NULL) { 2418 kmem_free(lrp->lr_flk, sizeof (flock64_t)); 2419 lrp->lr_flk = NULL; 2420 } 2421 if (lrp->lr_vp != NULL) { 2422 VN_RELE(lrp->lr_vp); 2423 lrp->lr_vp = NULL; 2424 } 2425 if (lrp->lr_dvp != NULL) { 2426 VN_RELE(lrp->lr_dvp); 2427 lrp->lr_dvp = NULL; 2428 } 2429 if (lrp->lr_cr != NULL) { 2430 crfree(lrp->lr_cr); 2431 lrp->lr_cr = NULL; 2432 } 2433 2434 kmem_free(lrp, sizeof (nfs4_lost_rqst_t)); 2435 } 2436 2437 /* 2438 * Remove any lost state requests and free them. 2439 */ 2440 static void 2441 nfs4_remove_lost_rqsts(mntinfo4_t *mi, nfs4_server_t *sp) 2442 { 2443 nfs4_lost_rqst_t *lrp; 2444 2445 mutex_enter(&mi->mi_lock); 2446 while ((lrp = list_head(&mi->mi_lost_state)) != NULL) { 2447 list_remove(&mi->mi_lost_state, lrp); 2448 mutex_exit(&mi->mi_lock); 2449 nfs4_free_lost_rqst(lrp, sp); 2450 mutex_enter(&mi->mi_lock); 2451 } 2452 mutex_exit(&mi->mi_lock); 2453 } 2454 2455 /* 2456 * Reopen all the files for the given filesystem and reclaim any locks. 2457 */ 2458 2459 static void 2460 recov_openfiles(recov_info_t *recovp, nfs4_server_t *sp) 2461 { 2462 mntinfo4_t *mi = recovp->rc_mi; 2463 nfs4_opinst_t *reopenlist = NULL, *rep; 2464 nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS }; 2465 open_claim_type4 claim; 2466 int remap; 2467 char *fail_msg = "No such file or directory on replica"; 2468 rnode4_t *rp; 2469 fattr4_change pre_change; 2470 2471 ASSERT(sp != NULL); 2472 2473 /* 2474 * This check is to allow a 10ms pause before we reopen files 2475 * it should allow the server time to have received the CB_NULL 2476 * reply and update its internal structures such that (if 2477 * applicable) we are granted a delegation on reopened files. 2478 */ 2479 mutex_enter(&sp->s_lock); 2480 if ((sp->s_flags & (N4S_CB_PINGED | N4S_CB_WAITER)) == 0) { 2481 sp->s_flags |= N4S_CB_WAITER; 2482 (void) cv_reltimedwait(&sp->wait_cb_null, &sp->s_lock, 2483 drv_usectohz(N4S_CB_PAUSE_TIME), TR_CLOCK_TICK); 2484 } 2485 mutex_exit(&sp->s_lock); 2486 2487 (void) nfs_rw_enter_sig(&sp->s_recovlock, RW_READER, 0); 2488 (void) nfs_rw_enter_sig(&mi->mi_recovlock, RW_WRITER, 0); 2489 2490 if (NFS4_VOLATILE_FH(mi)) { 2491 nfs4_remap_root(mi, &e, 0); 2492 if (nfs4_needs_recovery(&e, FALSE, mi->mi_vfsp)) { 2493 (void) nfs4_start_recovery(&e, mi, NULL, 2494 NULL, NULL, NULL, OP_LOOKUP, NULL); 2495 } 2496 } 2497 2498 mutex_enter(&mi->mi_lock); 2499 if (recovp->rc_srv_reboot || (mi->mi_recovflags & MI4R_SRV_REBOOT)) 2500 claim = CLAIM_PREVIOUS; 2501 else 2502 claim = CLAIM_NULL; 2503 mutex_exit(&mi->mi_lock); 2504 2505 if (e.error == 0 && e.stat == NFS4_OK) { 2506 /* 2507 * Get a snapshot of open files in the filesystem. Note 2508 * that new opens will stall until the server's grace 2509 * period is done. 2510 */ 2511 reopenlist = r4mkopenlist(mi); 2512 2513 mutex_enter(&mi->mi_lock); 2514 remap = mi->mi_recovflags & MI4R_REMAP_FILES; 2515 mutex_exit(&mi->mi_lock); 2516 /* 2517 * Since we are re-establishing state on the 2518 * server, its ok to blow away the saved lost 2519 * requests since we don't need to reissue it. 2520 */ 2521 nfs4_remove_lost_rqsts(mi, sp); 2522 2523 for (rep = reopenlist; rep; rep = rep->re_next) { 2524 2525 if (remap) { 2526 nfs4_remap_file(mi, rep->re_vp, 2527 NFS4_REMAP_CKATTRS, &e); 2528 } 2529 if (e.error == ENOENT || e.stat == NFS4ERR_NOENT) { 2530 /* 2531 * The current server does not have the file 2532 * that is to be remapped. This is most 2533 * likely due to an improperly maintained 2534 * replica. The files that are missing from 2535 * the server will be marked dead and logged 2536 * in order to make sys admins aware of the 2537 * problem. 2538 */ 2539 nfs4_fail_recov(rep->re_vp, 2540 fail_msg, e.error, e.stat); 2541 /* 2542 * We've already handled the error so clear it. 2543 */ 2544 nfs4_error_zinit(&e); 2545 continue; 2546 } else if (e.error == 0 && e.stat == NFS4_OK) { 2547 int j; 2548 2549 rp = VTOR4(rep->re_vp); 2550 mutex_enter(&rp->r_statelock); 2551 pre_change = rp->r_change; 2552 mutex_exit(&rp->r_statelock); 2553 2554 for (j = 0; j < rep->re_numosp; j++) { 2555 nfs4_reopen(rep->re_vp, rep->re_osp[j], 2556 &e, claim, FALSE, TRUE); 2557 if (e.error != 0 || e.stat != NFS4_OK) 2558 break; 2559 } 2560 if (nfs4_needs_recovery(&e, TRUE, 2561 mi->mi_vfsp)) { 2562 (void) nfs4_start_recovery(&e, mi, 2563 rep->re_vp, NULL, NULL, NULL, 2564 OP_OPEN, NULL); 2565 break; 2566 } 2567 } 2568 #ifdef DEBUG 2569 if (nfs4_recovdelay > 0) 2570 delay(MSEC_TO_TICK(nfs4_recovdelay * 1000)); 2571 #endif 2572 if (e.error == 0 && e.stat == NFS4_OK) 2573 relock_file(rep->re_vp, mi, &e, pre_change); 2574 2575 if (nfs4_needs_recovery(&e, TRUE, mi->mi_vfsp)) 2576 (void) nfs4_start_recovery(&e, mi, 2577 rep->re_vp, NULL, NULL, NULL, OP_LOCK, 2578 NULL); 2579 if (e.error != 0 || e.stat != NFS4_OK) 2580 break; 2581 } 2582 2583 /* 2584 * Check to see if we need to remap files passed in 2585 * via the recovery arguments; this will have been 2586 * done for open files. A failure here is not fatal. 2587 */ 2588 if (remap) { 2589 nfs4_error_t ignore; 2590 nfs4_check_remap(mi, recovp->rc_vp1, NFS4_REMAP_CKATTRS, 2591 &ignore); 2592 nfs4_check_remap(mi, recovp->rc_vp2, NFS4_REMAP_CKATTRS, 2593 &ignore); 2594 } 2595 } 2596 2597 if (e.error == 0 && e.stat == NFS4_OK) { 2598 mutex_enter(&mi->mi_lock); 2599 mi->mi_recovflags &= ~(MI4R_REOPEN_FILES | MI4R_REMAP_FILES); 2600 mutex_exit(&mi->mi_lock); 2601 } 2602 2603 nfs_rw_exit(&mi->mi_recovlock); 2604 nfs_rw_exit(&sp->s_recovlock); 2605 2606 if (reopenlist != NULL) 2607 r4releopenlist(reopenlist); 2608 } 2609 2610 /* 2611 * Resend the queued state recovery requests in "rqsts". 2612 */ 2613 2614 static void 2615 nfs4_resend_lost_rqsts(recov_info_t *recovp, nfs4_server_t *sp) 2616 { 2617 nfs4_lost_rqst_t *lrp, *tlrp; 2618 mntinfo4_t *mi = recovp->rc_mi; 2619 nfs4_error_t n4e; 2620 #ifdef NOTYET 2621 uint32_t deny_bits = 0; 2622 #endif 2623 2624 NFS4_DEBUG(nfs4_lost_rqst_debug, (CE_NOTE, "nfs4_resend_lost_rqsts")); 2625 2626 ASSERT(mi != NULL); 2627 ASSERT(nfs_rw_lock_held(&mi->mi_recovlock, RW_WRITER)); 2628 2629 mutex_enter(&mi->mi_lock); 2630 lrp = list_head(&mi->mi_lost_state); 2631 mutex_exit(&mi->mi_lock); 2632 while (lrp != NULL) { 2633 nfs4_error_zinit(&n4e); 2634 resend_one_op(lrp, &n4e, mi, sp); 2635 NFS4_DEBUG(nfs4_lost_rqst_debug, (CE_NOTE, 2636 "nfs4_resend_lost_rqsts: resend request: for vp %p got " 2637 "error %d stat %d", (void *)lrp->lr_vp, n4e.error, 2638 n4e.stat)); 2639 2640 /* 2641 * If we get a recovery error that we can actually 2642 * recover from (such as ETIMEDOUT, FHEXPIRED), we 2643 * return and let the recovery thread redrive the call. 2644 * Don't requeue unless the zone is still healthy. 2645 */ 2646 if (zone_status_get(curproc->p_zone) < ZONE_IS_SHUTTING_DOWN && 2647 nfs4_needs_recovery(&n4e, TRUE, mi->mi_vfsp) && 2648 (nfs4_try_failover(&n4e) || 2649 NFS4_FRC_UNMT_ERR(n4e.error, mi->mi_vfsp) || 2650 (n4e.error == 0 && n4e.stat != NFS4ERR_BADHANDLE && 2651 !nfs4_recov_marks_dead(n4e.stat)))) { 2652 /* 2653 * For these three errors, we want to delay a bit 2654 * instead of pounding the server into submission. 2655 * We have to do this manually; the normal 2656 * processing for these errors only works for 2657 * non-recovery requests. 2658 */ 2659 if ((n4e.error == 0 && n4e.stat == NFS4ERR_DELAY) || 2660 (n4e.error == 0 && n4e.stat == NFS4ERR_GRACE) || 2661 (n4e.error == 0 && n4e.stat == NFS4ERR_RESOURCE) || 2662 NFS4_FRC_UNMT_ERR(n4e.error, mi->mi_vfsp)) { 2663 delay(SEC_TO_TICK(nfs4err_delay_time)); 2664 } else { 2665 (void) nfs4_start_recovery(&n4e, 2666 mi, lrp->lr_dvp, lrp->lr_vp, NULL, NULL, 2667 lrp->lr_op, NULL); 2668 } 2669 return; 2670 } 2671 2672 mutex_enter(&mi->mi_lock); 2673 list_remove(&mi->mi_lost_state, lrp); 2674 tlrp = lrp; 2675 lrp = list_head(&mi->mi_lost_state); 2676 mutex_exit(&mi->mi_lock); 2677 nfs4_free_lost_rqst(tlrp, sp); 2678 } 2679 } 2680 2681 /* 2682 * Resend the given op, and issue any necessary undo call. 2683 * errors are returned via the nfs4_error_t parameter. 2684 */ 2685 2686 static void 2687 resend_one_op(nfs4_lost_rqst_t *lrp, nfs4_error_t *ep, 2688 mntinfo4_t *mi, nfs4_server_t *sp) 2689 { 2690 vnode_t *vp; 2691 nfs4_open_stream_t *osp; 2692 cred_t *cr; 2693 uint32_t acc_bits; 2694 2695 vp = lrp->lr_vp; 2696 NFS4_DEBUG(nfs4_lost_rqst_debug, (CE_NOTE, "resend_one_op: " 2697 "have a lost open/close request for vp %p", (void *)vp)); 2698 2699 switch (lrp->lr_op) { 2700 case OP_OPEN: 2701 nfs4_resend_open_otw(&vp, lrp, ep); 2702 break; 2703 case OP_OPEN_DOWNGRADE: 2704 ASSERT(lrp->lr_oop != NULL); 2705 ep->error = nfs4_start_open_seqid_sync(lrp->lr_oop, mi); 2706 ASSERT(!ep->error); /* recov thread always succeeds */ 2707 ASSERT(lrp->lr_osp != NULL); 2708 mutex_enter(&lrp->lr_osp->os_sync_lock); 2709 nfs4_open_downgrade(lrp->lr_dg_acc, lrp->lr_dg_deny, 2710 lrp->lr_oop, lrp->lr_osp, vp, lrp->lr_cr, lrp, 2711 ep, NULL, NULL); 2712 mutex_exit(&lrp->lr_osp->os_sync_lock); 2713 nfs4_end_open_seqid_sync(lrp->lr_oop); 2714 break; 2715 case OP_CLOSE: 2716 osp = lrp->lr_osp; 2717 cr = lrp->lr_cr; 2718 acc_bits = 0; 2719 mutex_enter(&osp->os_sync_lock); 2720 if (osp->os_share_acc_read) 2721 acc_bits |= OPEN4_SHARE_ACCESS_READ; 2722 if (osp->os_share_acc_write) 2723 acc_bits |= OPEN4_SHARE_ACCESS_WRITE; 2724 mutex_exit(&osp->os_sync_lock); 2725 nfs4close_one(vp, osp, cr, acc_bits, lrp, ep, 2726 CLOSE_RESEND, 0, 0, 0); 2727 break; 2728 case OP_LOCK: 2729 case OP_LOCKU: 2730 resend_lock(lrp, ep); 2731 goto done; 2732 case OP_DELEGRETURN: 2733 nfs4_resend_delegreturn(lrp, ep, sp); 2734 goto done; 2735 default: 2736 #ifdef DEBUG 2737 cmn_err(CE_PANIC, "resend_one_op: unexpected op: %d", 2738 lrp->lr_op); 2739 #endif 2740 nfs4_queue_event(RE_LOST_STATE_BAD_OP, mi, NULL, 2741 lrp->lr_op, lrp->lr_vp, lrp->lr_dvp, NFS4_OK, NULL, 0, 2742 TAG_NONE, TAG_NONE, 0, 0); 2743 nfs4_error_init(ep, EINVAL); 2744 return; 2745 } 2746 2747 /* 2748 * No need to retry nor send an "undo" CLOSE in the 2749 * event the server rebooted. 2750 */ 2751 if (ep->error == 0 && (ep->stat == NFS4ERR_STALE_CLIENTID || 2752 ep->stat == NFS4ERR_STALE_STATEID || ep->stat == NFS4ERR_EXPIRED)) 2753 goto done; 2754 2755 /* 2756 * If we resent a CLOSE or OPEN_DOWNGRADE, there's nothing 2757 * to undo. Undoing locking operations was handled by 2758 * resend_lock(). 2759 */ 2760 if (lrp->lr_op == OP_OPEN_DOWNGRADE || lrp->lr_op == OP_CLOSE) 2761 goto done; 2762 2763 /* 2764 * If we get any other error for OPEN, then don't attempt 2765 * to undo the resend of the open (since it was never 2766 * successful!). 2767 */ 2768 ASSERT(lrp->lr_op == OP_OPEN); 2769 if (ep->error || ep->stat != NFS4_OK) 2770 goto done; 2771 2772 /* 2773 * Now let's undo our OPEN. 2774 */ 2775 nfs4_error_zinit(ep); 2776 close_after_open_resend(vp, lrp->lr_cr, lrp->lr_oacc, ep); 2777 NFS4_DEBUG(nfs4_lost_rqst_debug, (CE_NOTE, "resend_one_op: " 2778 "nfs4close_one: for vp %p got error %d stat %d", 2779 (void *)vp, ep->error, ep->stat)); 2780 2781 done: 2782 if (vp != lrp->lr_vp) 2783 VN_RELE(vp); 2784 } 2785 2786 /* 2787 * Close a file that was opened via a resent OPEN. 2788 * Most errors are passed back to the caller (via the return value and 2789 * *statp), except for FHEXPIRED, which is retried. 2790 * 2791 * It might be conceptually cleaner to push the CLOSE request onto the 2792 * front of the resend queue, rather than sending it here. That would 2793 * match the way we undo lost lock requests. On the other 2794 * hand, we've already got something that works, and there's no reason to 2795 * change it at this time. 2796 */ 2797 2798 static void 2799 close_after_open_resend(vnode_t *vp, cred_t *cr, uint32_t acc_bits, 2800 nfs4_error_t *ep) 2801 { 2802 2803 for (;;) { 2804 nfs4close_one(vp, NULL, cr, acc_bits, NULL, ep, 2805 CLOSE_AFTER_RESEND, 0, 0, 0); 2806 if (ep->error == 0 && ep->stat == NFS4_OK) 2807 break; /* success; done */ 2808 if (ep->error != 0 || ep->stat != NFS4ERR_FHEXPIRED) 2809 break; 2810 /* else retry FHEXPIRED */ 2811 } 2812 2813 } 2814 2815 /* 2816 * Resend the given lost lock request. Return an errno value. If zero, 2817 * *statp is set to the NFS status code for the call. 2818 * 2819 * Issue a SIGLOST and mark the rnode dead if we get a non-recovery error or 2820 * a recovery error that we don't actually recover from yet (eg: BAD_SEQID). 2821 * Let the recovery thread redrive the call if we get a recovery error that 2822 * we can actually recover from. 2823 */ 2824 static void 2825 resend_lock(nfs4_lost_rqst_t *lrp, nfs4_error_t *ep) 2826 { 2827 bool_t send_siglost = FALSE; 2828 vnode_t *vp = lrp->lr_vp; 2829 2830 NFS4_DEBUG(nfs4_lost_rqst_debug, (CE_NOTE, "resend_lock:")); 2831 ASSERT(lrp->lr_ctype == NFS4_LCK_CTYPE_REINSTATE || 2832 lrp->lr_ctype == NFS4_LCK_CTYPE_RESEND); 2833 2834 nfs4frlock(lrp->lr_ctype, vp, F_SETLK, 2835 lrp->lr_flk, FREAD|FWRITE, 0, lrp->lr_cr, ep, lrp, NULL); 2836 2837 NFS4_DEBUG(nfs4_lost_rqst_debug, (CE_NOTE, "resend_lock: " 2838 "nfs4frlock for vp %p returned error %d, stat %d", 2839 (void *)vp, ep->error, ep->stat)); 2840 2841 if (ep->error == 0 && ep->stat == 0) 2842 goto done; 2843 if (ep->error == 0 && ep->stat == NFS4ERR_DENIED && 2844 lrp->lr_ctype == NFS4_LCK_CTYPE_RESEND) 2845 goto done; 2846 2847 /* 2848 * If we failed with a non-recovery error, send SIGLOST and 2849 * mark the file dead. 2850 */ 2851 if (!nfs4_needs_recovery(ep, TRUE, vp->v_vfsp)) 2852 send_siglost = TRUE; 2853 else { 2854 /* 2855 * Done with recovering LOST LOCK in the event the 2856 * server rebooted or we've lost the lease. 2857 */ 2858 if (ep->error == 0 && (ep->stat == NFS4ERR_STALE_CLIENTID || 2859 ep->stat == NFS4ERR_STALE_STATEID || 2860 ep->stat == NFS4ERR_EXPIRED)) { 2861 goto done; 2862 } 2863 2864 /* 2865 * BAD_STATEID on an unlock indicates that the server has 2866 * forgotten about the lock anyway, so act like the call 2867 * was successful. 2868 */ 2869 if (ep->error == 0 && ep->stat == NFS4ERR_BAD_STATEID && 2870 lrp->lr_op == OP_LOCKU) 2871 goto done; 2872 2873 /* 2874 * If we got a recovery error that we don't actually 2875 * recover from, send SIGLOST. If the filesystem was 2876 * forcibly unmounted, we skip the SIGLOST because (a) it's 2877 * unnecessary noise, and (b) there could be a new process 2878 * with the same pid as the one that had generated the lost 2879 * state request. 2880 */ 2881 if (ep->error == 0 && (ep->stat == NFS4ERR_BADHANDLE || 2882 nfs4_recov_marks_dead(ep->stat))) { 2883 if (!(vp->v_vfsp->vfs_flag & VFS_UNMOUNTED)) 2884 send_siglost = TRUE; 2885 goto done; 2886 } 2887 2888 /* 2889 * If the filesystem was forcibly unmounted, we 2890 * still need to synchronize with the server and 2891 * release state. Try again later. 2892 */ 2893 if (NFS4_FRC_UNMT_ERR(ep->error, vp->v_vfsp)) 2894 goto done; 2895 2896 /* 2897 * If we get a recovery error that we can actually 2898 * recover from (such as ETIMEDOUT, FHEXPIRED), 2899 * return and let the recovery thread redrive the call. 2900 * 2901 * For the three errors below, we want to delay a bit 2902 * instead of pounding the server into submission. 2903 */ 2904 if ((ep->error == 0 && ep->stat == NFS4ERR_DELAY) || 2905 (ep->error == 0 && ep->stat == NFS4ERR_GRACE) || 2906 (ep->error == 0 && ep->stat == NFS4ERR_RESOURCE)) 2907 delay(SEC_TO_TICK(recov_err_delay)); 2908 goto done; 2909 } 2910 2911 done: 2912 if (send_siglost) { 2913 cred_t *sv_cred; 2914 2915 /* 2916 * Must be root or the actual thread being issued the 2917 * SIGLOST for this to work, so just become root. 2918 */ 2919 sv_cred = curthread->t_cred; 2920 curthread->t_cred = kcred; 2921 nfs4_send_siglost(lrp->lr_flk->l_pid, VTOMI4(vp), vp, FALSE, 2922 ep->error, ep->stat); 2923 curthread->t_cred = sv_cred; 2924 2925 /* 2926 * Flush any additional reinstantiation requests for 2927 * this operation. Sending multiple SIGLOSTs to the user 2928 * process is unlikely to help and may cause trouble. 2929 */ 2930 if (lrp->lr_ctype == NFS4_LCK_CTYPE_REINSTATE) 2931 flush_reinstate(lrp); 2932 } 2933 } 2934 2935 /* 2936 * Remove any lock reinstantiation requests that correspond to the given 2937 * lost request. We only remove items that follow lrp in the queue, 2938 * assuming that lrp will be removed by the generic lost state code. 2939 */ 2940 2941 static void 2942 flush_reinstate(nfs4_lost_rqst_t *lrp) 2943 { 2944 vnode_t *vp; 2945 pid_t pid; 2946 mntinfo4_t *mi; 2947 nfs4_lost_rqst_t *nlrp; 2948 2949 vp = lrp->lr_vp; 2950 mi = VTOMI4(vp); 2951 pid = lrp->lr_flk->l_pid; 2952 2953 /* 2954 * If there are any more reinstantation requests to get rid of, 2955 * they should all be clustered at the front of the lost state 2956 * queue. 2957 */ 2958 mutex_enter(&mi->mi_lock); 2959 for (lrp = list_next(&mi->mi_lost_state, lrp); lrp != NULL; 2960 lrp = nlrp) { 2961 nlrp = list_next(&mi->mi_lost_state, lrp); 2962 if (lrp->lr_op != OP_LOCK && lrp->lr_op != OP_LOCKU) 2963 break; 2964 if (lrp->lr_ctype != NFS4_LCK_CTYPE_REINSTATE) 2965 break; 2966 ASSERT(lrp->lr_vp == vp); 2967 ASSERT(lrp->lr_flk->l_pid == pid); 2968 NFS4_DEBUG(nfs4_lost_rqst_debug, (CE_NOTE, 2969 "remove reinstantiation %p", (void *)lrp)); 2970 list_remove(&mi->mi_lost_state, lrp); 2971 nfs4_free_lost_rqst(lrp, NULL); 2972 } 2973 mutex_exit(&mi->mi_lock); 2974 } 2975 2976 /* 2977 * End of state-specific recovery routines. 2978 */ 2979 2980 /* 2981 * Allocate a lost request struct, initialize it from lost_rqstp (including 2982 * bumping the reference counts for the referenced vnode, etc.), and hang 2983 * it off of recovp. 2984 */ 2985 2986 static void 2987 nfs4_save_lost_rqst(nfs4_lost_rqst_t *lost_rqstp, recov_info_t *recovp, 2988 nfs4_recov_t *action, mntinfo4_t *mi) 2989 { 2990 nfs4_lost_rqst_t *destp; 2991 2992 ASSERT(recovp->rc_lost_rqst == NULL); 2993 2994 destp = kmem_alloc(sizeof (nfs4_lost_rqst_t), KM_SLEEP); 2995 recovp->rc_lost_rqst = destp; 2996 2997 if (lost_rqstp->lr_op == OP_LOCK || 2998 lost_rqstp->lr_op == OP_LOCKU) { 2999 ASSERT(lost_rqstp->lr_lop); 3000 *action = NR_LOST_LOCK; 3001 destp->lr_ctype = lost_rqstp->lr_ctype; 3002 destp->lr_locktype = lost_rqstp->lr_locktype; 3003 } else if (lost_rqstp->lr_op == OP_OPEN) { 3004 component4 *srcfp, *destfp; 3005 3006 destp->lr_oacc = lost_rqstp->lr_oacc; 3007 destp->lr_odeny = lost_rqstp->lr_odeny; 3008 destp->lr_oclaim = lost_rqstp->lr_oclaim; 3009 if (lost_rqstp->lr_oclaim == CLAIM_DELEGATE_CUR) 3010 destp->lr_ostateid = lost_rqstp->lr_ostateid; 3011 3012 srcfp = &lost_rqstp->lr_ofile; 3013 destfp = &destp->lr_ofile; 3014 /* 3015 * Consume caller's utf8string 3016 */ 3017 destfp->utf8string_len = srcfp->utf8string_len; 3018 destfp->utf8string_val = srcfp->utf8string_val; 3019 srcfp->utf8string_len = 0; 3020 srcfp->utf8string_val = NULL; /* make sure not reused */ 3021 3022 *action = NR_LOST_STATE_RQST; 3023 } else if (lost_rqstp->lr_op == OP_OPEN_DOWNGRADE) { 3024 destp->lr_dg_acc = lost_rqstp->lr_dg_acc; 3025 destp->lr_dg_deny = lost_rqstp->lr_dg_deny; 3026 3027 *action = NR_LOST_STATE_RQST; 3028 } else if (lost_rqstp->lr_op == OP_CLOSE) { 3029 ASSERT(lost_rqstp->lr_oop); 3030 *action = NR_LOST_STATE_RQST; 3031 } else if (lost_rqstp->lr_op == OP_DELEGRETURN) { 3032 *action = NR_LOST_STATE_RQST; 3033 } else { 3034 #ifdef DEBUG 3035 cmn_err(CE_PANIC, "nfs4_save_lost_rqst: bad op %d", 3036 lost_rqstp->lr_op); 3037 #endif 3038 nfs4_queue_event(RE_LOST_STATE_BAD_OP, mi, NULL, 3039 lost_rqstp->lr_op, lost_rqstp->lr_vp, lost_rqstp->lr_dvp, 3040 NFS4_OK, NULL, curproc->p_pid, TAG_NONE, TAG_NONE, 0, 0); 3041 *action = NR_UNUSED; 3042 recovp->rc_lost_rqst = NULL; 3043 kmem_free(destp, sizeof (nfs4_lost_rqst_t)); 3044 return; 3045 } 3046 3047 destp->lr_op = lost_rqstp->lr_op; 3048 destp->lr_vp = lost_rqstp->lr_vp; 3049 if (destp->lr_vp) 3050 VN_HOLD(destp->lr_vp); 3051 destp->lr_dvp = lost_rqstp->lr_dvp; 3052 if (destp->lr_dvp) 3053 VN_HOLD(destp->lr_dvp); 3054 destp->lr_oop = lost_rqstp->lr_oop; 3055 if (destp->lr_oop) 3056 open_owner_hold(destp->lr_oop); 3057 destp->lr_osp = lost_rqstp->lr_osp; 3058 if (destp->lr_osp) 3059 open_stream_hold(destp->lr_osp); 3060 destp->lr_lop = lost_rqstp->lr_lop; 3061 if (destp->lr_lop) 3062 lock_owner_hold(destp->lr_lop); 3063 destp->lr_cr = lost_rqstp->lr_cr; 3064 if (destp->lr_cr) 3065 crhold(destp->lr_cr); 3066 if (lost_rqstp->lr_flk == NULL) 3067 destp->lr_flk = NULL; 3068 else { 3069 destp->lr_flk = kmem_alloc(sizeof (flock64_t), KM_SLEEP); 3070 *destp->lr_flk = *lost_rqstp->lr_flk; 3071 } 3072 destp->lr_putfirst = lost_rqstp->lr_putfirst; 3073 } 3074 3075 /* 3076 * Map the given return values (errno and nfs4 status code) to a recovery 3077 * action and fill in the following fields of recovp: rc_action, 3078 * rc_srv_reboot, rc_stateid, rc_lost_rqst. 3079 */ 3080 3081 void 3082 errs_to_action(recov_info_t *recovp, 3083 nfs4_server_t *sp, mntinfo4_t *mi, stateid4 *sidp, 3084 nfs4_lost_rqst_t *lost_rqstp, int unmounted, nfs_opnum4 op, 3085 nfs4_bseqid_entry_t *bsep) 3086 { 3087 nfs4_recov_t action = NR_UNUSED; 3088 bool_t reboot = FALSE; 3089 int try_f; 3090 int error = recovp->rc_orig_errors.error; 3091 nfsstat4 stat = recovp->rc_orig_errors.stat; 3092 3093 bzero(&recovp->rc_stateid, sizeof (stateid4)); 3094 recovp->rc_lost_rqst = NULL; 3095 recovp->rc_bseqid_rqst = NULL; 3096 3097 try_f = nfs4_try_failover(&recovp->rc_orig_errors) && 3098 FAILOVER_MOUNT4(mi); 3099 3100 /* 3101 * We start recovery for EINTR only in the lost lock 3102 * or lost open/close case. 3103 */ 3104 3105 if (try_f || error == EINTR || (error == EIO && unmounted)) { 3106 recovp->rc_error = (error != 0 ? error : geterrno4(stat)); 3107 if (lost_rqstp) { 3108 ASSERT(lost_rqstp->lr_op != 0); 3109 nfs4_save_lost_rqst(lost_rqstp, recovp, &action, mi); 3110 } 3111 if (try_f) 3112 action = NR_FAILOVER; 3113 } else if (error != 0) { 3114 recovp->rc_error = error; 3115 nfs4_queue_event(RE_UNEXPECTED_ERRNO, mi, NULL, error, NULL, 3116 NULL, 0, NULL, 0, TAG_NONE, TAG_NONE, 0, 0); 3117 action = NR_CLIENTID; 3118 } else { 3119 recovp->rc_error = geterrno4(stat); 3120 switch (stat) { 3121 #ifdef notyet 3122 case NFS4ERR_LEASE_MOVED: 3123 action = xxx; 3124 break; 3125 case NFS4ERR_MOVED: 3126 action = xxx; 3127 break; 3128 #endif 3129 case NFS4ERR_BADHANDLE: 3130 action = NR_BADHANDLE; 3131 break; 3132 case NFS4ERR_BAD_SEQID: 3133 if (bsep) 3134 save_bseqid_rqst(bsep, recovp); 3135 action = NR_BAD_SEQID; 3136 break; 3137 case NFS4ERR_OLD_STATEID: 3138 action = NR_OLDSTATEID; 3139 break; 3140 case NFS4ERR_WRONGSEC: 3141 action = NR_WRONGSEC; 3142 break; 3143 case NFS4ERR_FHEXPIRED: 3144 action = NR_FHEXPIRED; 3145 break; 3146 case NFS4ERR_BAD_STATEID: 3147 if (sp == NULL || (sp != NULL && inlease(sp))) { 3148 3149 action = NR_BAD_STATEID; 3150 if (sidp) 3151 recovp->rc_stateid = *sidp; 3152 } else 3153 action = NR_CLIENTID; 3154 break; 3155 case NFS4ERR_EXPIRED: 3156 /* 3157 * The client's lease has expired, either due 3158 * to a network partition or perhaps a client 3159 * error. In either case, try an NR_CLIENTID 3160 * style recovery. reboot remains false, since 3161 * there is no evidence the server has rebooted. 3162 * This will cause CLAIM_NULL opens and lock 3163 * requests without the reclaim bit. 3164 */ 3165 action = NR_CLIENTID; 3166 3167 DTRACE_PROBE4(nfs4__expired, 3168 nfs4_server_t *, sp, 3169 mntinfo4_t *, mi, 3170 stateid4 *, sidp, int, op); 3171 3172 break; 3173 case NFS4ERR_STALE_CLIENTID: 3174 case NFS4ERR_STALE_STATEID: 3175 action = NR_CLIENTID; 3176 reboot = TRUE; 3177 break; 3178 case NFS4ERR_RESOURCE: 3179 /* 3180 * If this had been a FAILOVER mount, then 3181 * we'd have tried failover. Since it's not, 3182 * just delay a while and retry. 3183 */ 3184 action = NR_DELAY; 3185 break; 3186 case NFS4ERR_GRACE: 3187 action = NR_GRACE; 3188 break; 3189 case NFS4ERR_DELAY: 3190 action = NR_DELAY; 3191 break; 3192 case NFS4ERR_STALE: 3193 action = NR_STALE; 3194 break; 3195 default: 3196 nfs4_queue_event(RE_UNEXPECTED_STATUS, mi, NULL, 0, 3197 NULL, NULL, stat, NULL, 0, TAG_NONE, TAG_NONE, 3198 0, 0); 3199 action = NR_CLIENTID; 3200 break; 3201 } 3202 } 3203 3204 /* make sure action got set */ 3205 ASSERT(action != NR_UNUSED); 3206 recovp->rc_srv_reboot = reboot; 3207 recovp->rc_action = action; 3208 nfs4_queue_fact(RF_ERR, mi, stat, action, op, reboot, NULL, error, 3209 NULL); 3210 } 3211 3212 /* 3213 * Return the (held) credential for the process with the given pid. 3214 * May return NULL (e.g., process not found). 3215 */ 3216 3217 static cred_t * 3218 pid_to_cr(pid_t pid) 3219 { 3220 proc_t *p; 3221 cred_t *cr; 3222 3223 mutex_enter(&pidlock); 3224 if ((p = prfind(pid)) == NULL) { 3225 mutex_exit(&pidlock); 3226 return (NULL); 3227 } 3228 3229 mutex_enter(&p->p_crlock); 3230 crhold(cr = p->p_cred); 3231 mutex_exit(&p->p_crlock); 3232 mutex_exit(&pidlock); 3233 3234 return (cr); 3235 } 3236 3237 /* 3238 * Send SIGLOST to the given process and queue the event. 3239 * 3240 * The 'dump' boolean tells us whether this action should dump the 3241 * in-kernel queue of recovery messages or not. 3242 */ 3243 3244 void 3245 nfs4_send_siglost(pid_t pid, mntinfo4_t *mi, vnode_t *vp, bool_t dump, 3246 int error, nfsstat4 stat) 3247 { 3248 proc_t *p; 3249 3250 mutex_enter(&pidlock); 3251 p = prfind(pid); 3252 if (p) 3253 psignal(p, SIGLOST); 3254 mutex_exit(&pidlock); 3255 nfs4_queue_event(dump ? RE_SIGLOST : RE_SIGLOST_NO_DUMP, mi, 3256 NULL, error, vp, NULL, stat, NULL, pid, TAG_NONE, TAG_NONE, 0, 0); 3257 } 3258 3259 /* 3260 * Scan the lock list for entries that match the given pid. Change the 3261 * pid in those that do to NOPID. 3262 */ 3263 3264 static void 3265 relock_skip_pid(locklist_t *llp, pid_t pid) 3266 { 3267 for (; llp != NULL; llp = llp->ll_next) { 3268 if (llp->ll_flock.l_pid == pid) 3269 llp->ll_flock.l_pid = NOPID; 3270 } 3271 } 3272 3273 /* 3274 * Mark a file as having failed recovery, after making a last-ditch effort 3275 * to return any delegation. 3276 * 3277 * Sets r_error to EIO or ESTALE for the given vnode. 3278 */ 3279 void 3280 nfs4_fail_recov(vnode_t *vp, char *why, int error, nfsstat4 stat) 3281 { 3282 rnode4_t *rp = VTOR4(vp); 3283 3284 #ifdef DEBUG 3285 if (nfs4_fail_recov_stop) 3286 debug_enter("nfs4_fail_recov"); 3287 #endif 3288 3289 mutex_enter(&rp->r_statelock); 3290 if (rp->r_flags & (R4RECOVERR|R4RECOVERRP)) { 3291 mutex_exit(&rp->r_statelock); 3292 return; 3293 } 3294 3295 /* 3296 * Set R4RECOVERRP to indicate that a recovery error is in 3297 * progress. This will shut down reads and writes at the top 3298 * half. Don't set R4RECOVERR until after we've returned the 3299 * delegation, otherwise it will fail. 3300 */ 3301 3302 rp->r_flags |= R4RECOVERRP; 3303 mutex_exit(&rp->r_statelock); 3304 3305 nfs4delegabandon(rp); 3306 3307 mutex_enter(&rp->r_statelock); 3308 rp->r_flags |= (R4RECOVERR | R4STALE); 3309 rp->r_error = (error == 0 && stat == NFS4ERR_STALE) ? ESTALE : EIO; 3310 PURGE_ATTRCACHE4_LOCKED(rp); 3311 if (!(vp->v_vfsp->vfs_flag & VFS_UNMOUNTED)) 3312 nfs4_queue_event(RE_DEAD_FILE, VTOMI4(vp), NULL, error, 3313 vp, NULL, stat, why, 0, TAG_NONE, TAG_NONE, 0, 0); 3314 mutex_exit(&rp->r_statelock); 3315 3316 dnlc_purge_vp(vp); 3317 } 3318 3319 /* 3320 * recov_throttle: if the file had the same recovery action within the 3321 * throttle interval, wait for the throttle interval to finish before 3322 * proceeding. 3323 * 3324 * Side effects: updates the rnode with the current recovery information. 3325 */ 3326 3327 static void 3328 recov_throttle(recov_info_t *recovp, vnode_t *vp) 3329 { 3330 time_t curtime, time_to_wait; 3331 rnode4_t *rp = VTOR4(vp); 3332 3333 curtime = gethrestime_sec(); 3334 3335 mutex_enter(&rp->r_statelock); 3336 NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE, 3337 "recov_throttle: now: (%d, %ld), last: (%d, %ld)", 3338 recovp->rc_action, curtime, 3339 rp->r_recov_act, rp->r_last_recov)); 3340 if (recovp->rc_action == rp->r_recov_act && 3341 rp->r_last_recov + recov_err_delay > curtime) { 3342 time_to_wait = rp->r_last_recov + recov_err_delay - curtime; 3343 mutex_exit(&rp->r_statelock); 3344 delay(SEC_TO_TICK(time_to_wait)); 3345 curtime = gethrestime_sec(); 3346 mutex_enter(&rp->r_statelock); 3347 } 3348 3349 rp->r_last_recov = curtime; 3350 rp->r_recov_act = recovp->rc_action; 3351 mutex_exit(&rp->r_statelock); 3352 } 3353 3354 /* 3355 * React to NFS4ERR_GRACE by setting the time we'll permit 3356 * the next call to this filesystem. 3357 */ 3358 void 3359 nfs4_set_grace_wait(mntinfo4_t *mi) 3360 { 3361 mutex_enter(&mi->mi_lock); 3362 /* Mark the time for the future */ 3363 mi->mi_grace_wait = gethrestime_sec() + nfs4err_delay_time; 3364 mutex_exit(&mi->mi_lock); 3365 } 3366 3367 /* 3368 * React to MFS4ERR_DELAY by setting the time we'll permit 3369 * the next call to this vnode. 3370 */ 3371 void 3372 nfs4_set_delay_wait(vnode_t *vp) 3373 { 3374 rnode4_t *rp = VTOR4(vp); 3375 3376 mutex_enter(&rp->r_statelock); 3377 /* 3378 * Calculate amount we should delay, initial 3379 * delay will be short and then we will back off. 3380 */ 3381 if (rp->r_delay_interval == 0) 3382 rp->r_delay_interval = NFS4_INITIAL_DELAY_INTERVAL; 3383 else 3384 /* calculate next interval value */ 3385 rp->r_delay_interval = 3386 MIN(NFS4_MAX_DELAY_INTERVAL, (rp->r_delay_interval << 1)); 3387 rp->r_delay_wait = gethrestime_sec() + rp->r_delay_interval; 3388 mutex_exit(&rp->r_statelock); 3389 } 3390 3391 /* 3392 * The caller is responsible for freeing the returned string. 3393 */ 3394 static char * 3395 nfs4_getsrvnames(mntinfo4_t *mi, size_t *len) 3396 { 3397 servinfo4_t *svp; 3398 char *srvnames; 3399 char *namep; 3400 size_t length; 3401 3402 /* 3403 * Calculate the length of the string required to hold all 3404 * of the server names plus either a comma or a null 3405 * character following each individual one. 3406 */ 3407 length = 0; 3408 for (svp = mi->mi_servers; svp != NULL; svp = svp->sv_next) { 3409 (void) nfs_rw_enter_sig(&svp->sv_lock, RW_READER, 0); 3410 if (svp->sv_flags & SV4_NOTINUSE) { 3411 nfs_rw_exit(&svp->sv_lock); 3412 continue; 3413 } 3414 nfs_rw_exit(&svp->sv_lock); 3415 length += svp->sv_hostnamelen; 3416 } 3417 3418 srvnames = kmem_alloc(length, KM_SLEEP); 3419 3420 namep = srvnames; 3421 for (svp = mi->mi_servers; svp != NULL; svp = svp->sv_next) { 3422 (void) nfs_rw_enter_sig(&svp->sv_lock, RW_READER, 0); 3423 if (svp->sv_flags & SV4_NOTINUSE) { 3424 nfs_rw_exit(&svp->sv_lock); 3425 continue; 3426 } 3427 nfs_rw_exit(&svp->sv_lock); 3428 (void) strcpy(namep, svp->sv_hostname); 3429 namep += svp->sv_hostnamelen - 1; 3430 *namep++ = ','; 3431 } 3432 *--namep = '\0'; 3433 3434 *len = length; 3435 3436 return (srvnames); 3437 } 3438 3439 static void 3440 save_bseqid_rqst(nfs4_bseqid_entry_t *bsep, recov_info_t *recovp) 3441 { 3442 nfs4_bseqid_entry_t *destp; 3443 3444 destp = kmem_alloc(sizeof (nfs4_bseqid_entry_t), KM_SLEEP); 3445 recovp->rc_bseqid_rqst = destp; 3446 3447 if (bsep->bs_oop) 3448 open_owner_hold(bsep->bs_oop); 3449 destp->bs_oop = bsep->bs_oop; 3450 if (bsep->bs_lop) 3451 lock_owner_hold(bsep->bs_lop); 3452 destp->bs_lop = bsep->bs_lop; 3453 if (bsep->bs_vp) 3454 VN_HOLD(bsep->bs_vp); 3455 destp->bs_vp = bsep->bs_vp; 3456 destp->bs_pid = bsep->bs_pid; 3457 destp->bs_tag = bsep->bs_tag; 3458 destp->bs_seqid = bsep->bs_seqid; 3459 } 3460 3461 static void 3462 free_bseqid_rqst(nfs4_bseqid_entry_t *bsep) 3463 { 3464 if (bsep->bs_oop) 3465 open_owner_rele(bsep->bs_oop); 3466 if (bsep->bs_lop) 3467 lock_owner_rele(bsep->bs_lop); 3468 if (bsep->bs_vp) 3469 VN_RELE(bsep->bs_vp); 3470 kmem_free(bsep, sizeof (nfs4_bseqid_entry_t)); 3471 } 3472 3473 /* 3474 * We don't actually fully recover from NFS4ERR_BAD_SEQID. We 3475 * simply mark the open owner and open stream (if provided) as "bad". 3476 * Then future uses of these data structures will be limited to basically 3477 * just cleaning up the internal client state (no going OTW). 3478 * 3479 * The result of this is to return errors back to the app/usr when 3480 * we receive NFS4ERR_BAD_SEQID, but also allow future/new calls to 3481 * succeed so progress can be made. 3482 */ 3483 void 3484 recov_bad_seqid(recov_info_t *recovp) 3485 { 3486 mntinfo4_t *mi = recovp->rc_mi; 3487 nfs4_open_owner_t *bad_oop; 3488 nfs4_lock_owner_t *bad_lop; 3489 vnode_t *vp; 3490 rnode4_t *rp = NULL; 3491 pid_t pid; 3492 nfs4_bseqid_entry_t *bsep, *tbsep; 3493 int error; 3494 3495 ASSERT(mi != NULL); 3496 ASSERT(nfs_rw_lock_held(&mi->mi_recovlock, RW_WRITER)); 3497 3498 mutex_enter(&mi->mi_lock); 3499 bsep = list_head(&mi->mi_bseqid_list); 3500 mutex_exit(&mi->mi_lock); 3501 3502 /* 3503 * Handle all the bad seqid entries on mi's list. 3504 */ 3505 while (bsep != NULL) { 3506 bad_oop = bsep->bs_oop; 3507 bad_lop = bsep->bs_lop; 3508 vp = bsep->bs_vp; 3509 pid = bsep->bs_pid; 3510 3511 NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE, 3512 "recov_bad_seqid: mark oop %p lop %p as bad for " 3513 "vp %p tag %s pid %d: last good seqid %d for tag %s", 3514 (void *)bad_oop, (void *)bad_lop, (void *)vp, 3515 nfs4_ctags[bsep->bs_tag].ct_str, pid, 3516 bad_oop ? bad_oop->oo_last_good_seqid : 0, 3517 bad_oop ? nfs4_ctags[bad_oop->oo_last_good_op].ct_str : 3518 nfs4_ctags[TAG_NONE].ct_str)); 3519 3520 nfs4_queue_event(RE_BAD_SEQID, mi, NULL, 3521 0, vp, NULL, NFS4ERR_BAD_SEQID, NULL, pid, bsep->bs_tag, 3522 bad_oop ? bad_oop->oo_last_good_op : TAG_NONE, 3523 bsep->bs_seqid, bad_oop ? bad_oop->oo_last_good_seqid : 0); 3524 3525 if (bad_oop) { 3526 /* essentially reset the open owner */ 3527 error = nfs4_start_open_seqid_sync(bad_oop, mi); 3528 ASSERT(!error); /* recov thread always succeeds */ 3529 bad_oop->oo_name = nfs4_get_new_oo_name(); 3530 bad_oop->oo_seqid = 0; 3531 nfs4_end_open_seqid_sync(bad_oop); 3532 } 3533 3534 if (bad_lop) { 3535 mutex_enter(&bad_lop->lo_lock); 3536 bad_lop->lo_flags |= NFS4_BAD_SEQID_LOCK; 3537 mutex_exit(&bad_lop->lo_lock); 3538 3539 ASSERT(vp != NULL); 3540 rp = VTOR4(vp); 3541 mutex_enter(&rp->r_statelock); 3542 rp->r_flags |= R4LODANGLERS; 3543 mutex_exit(&rp->r_statelock); 3544 3545 nfs4_send_siglost(pid, mi, vp, TRUE, 3546 0, NFS4ERR_BAD_SEQID); 3547 } 3548 3549 mutex_enter(&mi->mi_lock); 3550 list_remove(&mi->mi_bseqid_list, bsep); 3551 tbsep = bsep; 3552 bsep = list_head(&mi->mi_bseqid_list); 3553 mutex_exit(&mi->mi_lock); 3554 free_bseqid_rqst(tbsep); 3555 } 3556 3557 mutex_enter(&mi->mi_lock); 3558 mi->mi_recovflags &= ~MI4R_BAD_SEQID; 3559 mutex_exit(&mi->mi_lock); 3560 } 3561