/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#pragma ident	"%Z%%M%	%I%	%E% SMI"

/*
 * NFS Version 4 state recovery code.
 */

#include <nfs/nfs4_clnt.h>
#include <nfs/nfs4.h>
#include <nfs/rnode4.h>
#include <sys/cmn_err.h>
#include <sys/cred.h>
#include <sys/systm.h>
#include <sys/flock.h>
#include <sys/dnlc.h>
#include <sys/ddi.h>
#include <sys/disp.h>
#include <sys/list.h>
#include <sys/sdt.h>

extern r4hashq_t *rtable4;

/*
 * Information that describes what needs to be done for recovery.  It is
 * passed to a client recovery thread as well as passed to various recovery
 * routines.  rc_mi, rc_vp1, and rc_vp2 refer to the filesystem and
 * vnode(s) affected by recovery.  rc_vp1 and rc_vp2 are references (use
 * VN_HOLD) or NULL.  rc_lost_rqst contains information about the lost
 * lock or open/close request, and it holds reference counts for the
 * various objects (vnode, etc.).  The recovery thread also uses flags set
 * in the mntinfo4_t or vnode_t to tell it what to do.  rc_error is used
 * to save the error that originally triggered the recovery event -- will
 * later be used to set mi_error if recovery doesn't work.  rc_bseqid_rqst
 * contains information about the request that got NFS4ERR_BAD_SEQID, and
 * it holds reference count for the various objects (vnode, open owner,
 * open stream, lock owner).
 */

typedef struct {
	mntinfo4_t *rc_mi;
	vnode_t *rc_vp1;
	vnode_t *rc_vp2;
	nfs4_recov_t rc_action;
	stateid4 rc_stateid;
	bool_t rc_srv_reboot;		/* server has rebooted */
	nfs4_lost_rqst_t *rc_lost_rqst;
	nfs4_error_t rc_orig_errors;	/* original errors causing recovery */
	int rc_error;
	nfs4_bseqid_entry_t *rc_bseqid_rqst;
} recov_info_t;

/*
 * How long to wait before trying again if there is an error doing
 * recovery, in seconds.
 */

static int recov_err_delay = 1;

/*
 * How long to wait when processing NFS4ERR_GRACE or NFS4ERR_DELAY
 * errors.  Expressed in seconds.  Default is defined as
 * NFS4ERR_DELAY_TIME and this variable is initialized in nfs4_subr_init()
 */
time_t nfs4err_delay_time = 0;

/*
 * Tuneable to limit how many times "exempt" ops go OTW
 * after a recovery error.  Exempt op hints are OH_CLOSE,
 * OH_LOCKU, OH_DELEGRETURN.  These previously always went
 * OTW even after rnode was "dead" due to recovery errors.
 *
 * The tuneable below limits the number of times a start_fop
 * invocation will retry the exempt hints.  After the limit
 * is reached, nfs4_start_fop will return an error just like
 * it would for non-exempt op hints.
 */
int nfs4_max_recov_error_retry = 3;
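/*
 * This tuneable is a plain global, so it can be adjusted without a
 * rebuild; a sketch (assuming the symbol is resolved from the nfs
 * module) is an /etc/system entry such as:
 *
 *	set nfs:nfs4_max_recov_error_retry = 5
 */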
/*
 * Number of seconds the recovery thread should pause before retry when the
 * filesystem has been forcibly unmounted.
 */

int nfs4_unmount_delay = 1;

#ifdef DEBUG

/*
 * How long to wait (in seconds) between recovery operations on a given
 * file.  Normally zero, but could be set longer for testing purposes.
 */
static int nfs4_recovdelay = 0;

/*
 * Switch that controls whether to go into the debugger when recovery
 * fails.
 */
static int nfs4_fail_recov_stop = 0;

/*
 * Tuneables to debug client namespace interaction with server
 * mount points:
 *
 * nfs4_srvmnt_fail_cnt:
 *	number of times EACCES returned because client
 *	attempted to cross server mountpoint
 *
 * nfs4_srvmnt_debug:
 *	trigger console printf whenever client attempts
 *	to cross server mountpoint
 */
int nfs4_srvmnt_fail_cnt = 0;
int nfs4_srvmnt_debug = 0;
#endif

/* forward references, in alphabetic order */
static void close_after_open_resend(vnode_t *, cred_t *, uint32_t,
	nfs4_error_t *);
static void errs_to_action(recov_info_t *,
	nfs4_server_t *, mntinfo4_t *, stateid4 *, nfs4_lost_rqst_t *, int,
	nfs_opnum4, nfs4_bseqid_entry_t *);
static void flush_reinstate(nfs4_lost_rqst_t *);
static void free_milist(mntinfo4_t **, int);
static mntinfo4_t **make_milist(nfs4_server_t *, int *);
static int nfs4_check_recov_err(vnode_t *, nfs4_op_hint_t,
	nfs4_recov_state_t *, int, char *);
static char *nfs4_getsrvnames(mntinfo4_t *, size_t *);
static void nfs4_recov_fh_fail(vnode_t *, int, nfsstat4);
static void nfs4_recov_thread(recov_info_t *);
static void nfs4_remove_lost_rqsts(mntinfo4_t *, nfs4_server_t *);
static void nfs4_resend_lost_rqsts(recov_info_t *, nfs4_server_t *);
static cred_t *pid_to_cr(pid_t);
static void reclaim_one_lock(vnode_t *, flock64_t *, nfs4_error_t *, int *);
static void recov_bad_seqid(recov_info_t *);
static void recov_badstate(recov_info_t *, vnode_t *, nfsstat4);
static void recov_clientid(recov_info_t *, nfs4_server_t *);
static void recov_done(mntinfo4_t *, recov_info_t *);
static void recov_filehandle(nfs4_recov_t, mntinfo4_t *, vnode_t *);
static void recov_newserver(recov_info_t *, nfs4_server_t **, bool_t *);
static void recov_openfiles(recov_info_t *, nfs4_server_t *);
static void recov_stale(mntinfo4_t *, vnode_t *);
static void nfs4_free_lost_rqst(nfs4_lost_rqst_t *, nfs4_server_t *);
static void recov_throttle(recov_info_t *, vnode_t *);
static void relock_skip_pid(locklist_t *, pid_t);
static void resend_lock(nfs4_lost_rqst_t *, nfs4_error_t *);
static void resend_one_op(nfs4_lost_rqst_t *, nfs4_error_t *, mntinfo4_t *,
	nfs4_server_t *);
static void save_bseqid_rqst(nfs4_bseqid_entry_t *, recov_info_t *);
static void start_recovery(recov_info_t *, mntinfo4_t *, vnode_t *, vnode_t *,
	nfs4_server_t *);
static void start_recovery_action(nfs4_recov_t, bool_t, mntinfo4_t *,
	vnode_t *, vnode_t *);
static int wait_for_recovery(mntinfo4_t *, nfs4_op_hint_t);
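/*
 * A sketch of how an over-the-wire operation typically strings these
 * routines together (illustrative only; the real callers in the vnode
 * ops differ in the op hint, the arguments to nfs4_start_recovery(),
 * and the error handling):
 *
 *	nfs4_recov_state_t recov_state;
 *	nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };
 *	bool_t needrecov;
 *
 *	recov_state.rs_flags = 0;
 *	recov_state.rs_num_retry_despite_err = 0;
 * recov_retry:
 *	e.error = nfs4_start_fop(mi, vp, NULL, OH_READ, &recov_state, NULL);
 *	if (e.error)
 *		return (e.error);
 *
 *	... issue the OTW call, filling in e ...
 *
 *	needrecov = nfs4_needs_recovery(&e, FALSE, mi->mi_vfsp);
 *	if (needrecov) {
 *		(void) nfs4_start_recovery(&e, mi, vp, NULL, NULL, NULL,
 *		    OP_READ, NULL);
 *		nfs4_end_fop(mi, vp, NULL, OH_READ, &recov_state, needrecov);
 *		goto recov_retry;
 *	}
 *	nfs4_end_fop(mi, vp, NULL, OH_READ, &recov_state, needrecov);
 */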
/*
 * Return non-zero if the given errno, status, and rpc status codes
 * in the nfs4_error_t indicate that client recovery is needed.
 * "stateful" indicates whether the call that got the error establishes or
 * removes state on the server (open, close, lock, unlock, delegreturn).
 */

int
nfs4_needs_recovery(nfs4_error_t *ep, bool_t stateful, vfs_t *vfsp)
{
	int recov = 0;
	mntinfo4_t *mi;

	/*
	 * Try failover if the error values justify it and if
	 * it's a failover mount.  Don't try if the mount is in
	 * progress, failures are handled explicitly by nfs4rootvp.
	 */
	if (nfs4_try_failover(ep)) {
		mi = VFTOMI4(vfsp);
		mutex_enter(&mi->mi_lock);
		recov = FAILOVER_MOUNT4(mi) && !(mi->mi_flags & MI4_MOUNTING);
		mutex_exit(&mi->mi_lock);
		if (recov)
			return (recov);
	}

	if (ep->error == EINTR || NFS4_FRC_UNMT_ERR(ep->error, vfsp)) {
		/*
		 * The server may have gotten the request, so for stateful
		 * ops we need to resynchronize and possibly back out the
		 * op.
		 */
		return (stateful);
	}
	if (ep->error != 0)
		return (0);

	/* stat values are listed alphabetically */
	/*
	 * There are two lists here: the errors for which we have code, and
	 * the errors for which we plan to have code before FCS.  For the
	 * second list, print a warning message but don't attempt recovery.
	 */
	switch (ep->stat) {
	case NFS4ERR_BADHANDLE:
	case NFS4ERR_BAD_SEQID:
	case NFS4ERR_BAD_STATEID:
	case NFS4ERR_DELAY:
	case NFS4ERR_EXPIRED:
	case NFS4ERR_FHEXPIRED:
	case NFS4ERR_GRACE:
	case NFS4ERR_OLD_STATEID:
	case NFS4ERR_RESOURCE:
	case NFS4ERR_STALE_CLIENTID:
	case NFS4ERR_STALE_STATEID:
	case NFS4ERR_WRONGSEC:
	case NFS4ERR_STALE:
		recov = 1;
		break;
#ifdef DEBUG
	case NFS4ERR_LEASE_MOVED:
	case NFS4ERR_MOVED:
		zcmn_err(VFTOMI4(vfsp)->mi_zone->zone_id,
		    CE_WARN, "!Can't yet recover from NFS status %d",
		    ep->stat);
		break;
#endif
	}

	return (recov);
}

/*
 * Some operations such as DELEGRETURN want to avoid invoking
 * recovery actions that will only mark the file dead.  If
 * better handlers are invoked for any of these errors, this
 * routine should be modified.
 */
int
nfs4_recov_marks_dead(nfsstat4 status)
{
	if (status == NFS4ERR_BAD_SEQID ||
	    status == NFS4ERR_EXPIRED ||
	    status == NFS4ERR_BAD_STATEID ||
	    status == NFS4ERR_OLD_STATEID)
		return (1);
	return (0);
}

/*
 * Transfer the state recovery information in recovp to mi's resend queue,
 * and mark mi as having a lost state request.
 */
static void
nfs4_enqueue_lost_rqst(recov_info_t *recovp, mntinfo4_t *mi)
{
	nfs4_lost_rqst_t *lrp = recovp->rc_lost_rqst;

	ASSERT(nfs_rw_lock_held(&mi->mi_recovlock, RW_READER) ||
	    nfs_rw_lock_held(&mi->mi_recovlock, RW_WRITER));

	ASSERT(lrp != NULL && lrp->lr_op != 0);

	NFS4_DEBUG(nfs4_lost_rqst_debug, (CE_NOTE,
	    "nfs4_enqueue_lost_rqst %p, op %d",
	    (void *)lrp, lrp->lr_op));

	mutex_enter(&mi->mi_lock);
	mi->mi_recovflags |= MI4R_LOST_STATE;
	if (lrp->lr_putfirst)
		list_insert_head(&mi->mi_lost_state, lrp);
	else
		list_insert_tail(&mi->mi_lost_state, lrp);
	recovp->rc_lost_rqst = NULL;
	mutex_exit(&mi->mi_lock);

	nfs4_queue_event(RE_LOST_STATE, mi, NULL, lrp->lr_op, lrp->lr_vp,
	    lrp->lr_dvp, 0, NULL, 0, TAG_NONE, TAG_NONE, 0, 0);
}
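/*
 * The queues filled in by this routine and by enqueue_bseqid_rqst()
 * below are drained by the recovery thread: the lost-state list by
 * nfs4_resend_lost_rqsts() and the bad seqid list by recov_bad_seqid()
 * (see nfs4_recov_thread()).
 */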
/*
 * Transfer the bad seqid recovery information in recovp to mi's
 * bad seqid queue, and mark mi as having a bad seqid request.
 */
void
enqueue_bseqid_rqst(recov_info_t *recovp, mntinfo4_t *mi)
{
	ASSERT(nfs_rw_lock_held(&mi->mi_recovlock, RW_READER) ||
	    nfs_rw_lock_held(&mi->mi_recovlock, RW_WRITER));
	ASSERT(recovp->rc_bseqid_rqst != NULL);

	mutex_enter(&mi->mi_lock);
	mi->mi_recovflags |= MI4R_BAD_SEQID;
	list_insert_tail(&mi->mi_bseqid_list, recovp->rc_bseqid_rqst);
	recovp->rc_bseqid_rqst = NULL;
	mutex_exit(&mi->mi_lock);
}

/*
 * Initiate recovery.
 *
 * The nfs4_error_t contains the return codes that triggered a recovery
 * attempt.  mi, vp1, and vp2 refer to the filesystem and files that were
 * being operated on.  vp1 and vp2 may be NULL.
 *
 * Multiple calls are okay.  If recovery is already underway, the call
 * updates the information about what state needs recovery but does not
 * start a new thread.  The caller should hold mi->mi_recovlock as a reader
 * for proper synchronization with any recovery thread.
 *
 * This will return TRUE if recovery was aborted, and FALSE otherwise.
 */
bool_t
nfs4_start_recovery(nfs4_error_t *ep, mntinfo4_t *mi, vnode_t *vp1,
    vnode_t *vp2, stateid4 *sid, nfs4_lost_rqst_t *lost_rqstp, nfs_opnum4 op,
    nfs4_bseqid_entry_t *bsep)
{
	recov_info_t *recovp;
	nfs4_server_t *sp;
	bool_t abort = FALSE;
	bool_t gone = FALSE;

	ASSERT(nfs_zone() == mi->mi_zone);
	mutex_enter(&mi->mi_lock);
	/*
	 * If there is lost state, we need to kick off recovery even if the
	 * filesystem has been unmounted or the zone is shutting down.
	 */
	gone = FS_OR_ZONE_GONE4(mi->mi_vfsp);
	if (gone) {
		ASSERT(ep->error != EINTR || lost_rqstp != NULL);
		if (ep->error == EIO && lost_rqstp == NULL) {
			/* failed due to forced unmount, no new lost state */
			abort = TRUE;
		}
		if ((ep->error == 0 || ep->error == ETIMEDOUT) &&
		    !(mi->mi_recovflags & MI4R_LOST_STATE)) {
			/* some other failure, no existing lost state */
			abort = TRUE;
		}
		if (abort) {
			mutex_exit(&mi->mi_lock);
			NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
			    "nfs4_start_recovery: fs unmounted"));
			return (TRUE);
		}
	}
	mi->mi_in_recovery++;
	mutex_exit(&mi->mi_lock);

	recovp = kmem_alloc(sizeof (recov_info_t), KM_SLEEP);
	recovp->rc_orig_errors = *ep;
	sp = find_nfs4_server(mi);
	errs_to_action(recovp, sp, mi, sid, lost_rqstp, gone, op, bsep);
	if (sp != NULL)
		mutex_exit(&sp->s_lock);
	start_recovery(recovp, mi, vp1, vp2, sp);
	if (sp != NULL)
		nfs4_server_rele(sp);
	return (FALSE);
}
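/*
 * For example, recov_clientid() below uses the internal entry point as
 *
 *	start_recovery_action(NR_OPENFILES, TRUE, tmi, NULL, NULL);
 *
 * to kick off reopening of files on the other filesystems that share a
 * freshly reestablished clientid.
 */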
/*
 * Internal version of nfs4_start_recovery.  The difference is that the
 * caller specifies the recovery action, rather than the errors leading to
 * recovery.
 */
static void
start_recovery_action(nfs4_recov_t what, bool_t reboot, mntinfo4_t *mi,
    vnode_t *vp1, vnode_t *vp2)
{
	recov_info_t *recovp;

	ASSERT(nfs_zone() == mi->mi_zone);
	mutex_enter(&mi->mi_lock);
	mi->mi_in_recovery++;
	mutex_exit(&mi->mi_lock);

	recovp = kmem_zalloc(sizeof (recov_info_t), KM_SLEEP);
	recovp->rc_action = what;
	recovp->rc_srv_reboot = reboot;
	recovp->rc_error = EIO;
	start_recovery(recovp, mi, vp1, vp2, NULL);
}

static void
start_recovery(recov_info_t *recovp, mntinfo4_t *mi,
    vnode_t *vp1, vnode_t *vp2, nfs4_server_t *sp)
{
	NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
	    "start_recovery: mi %p, what %s", (void*)mi,
	    nfs4_recov_action_to_str(recovp->rc_action)));

	/*
	 * Bump the reference on the vfs so that we can pass it to the
	 * recovery thread.
	 */
	VFS_HOLD(mi->mi_vfsp);
	MI4_HOLD(mi);
again:
	switch (recovp->rc_action) {
	case NR_FAILOVER:
		ASSERT(nfs_rw_lock_held(&mi->mi_recovlock, RW_READER) ||
		    nfs_rw_lock_held(&mi->mi_recovlock, RW_WRITER));
		if (mi->mi_servers->sv_next == NULL)
			goto out_no_thread;
		mutex_enter(&mi->mi_lock);
		mi->mi_recovflags |= MI4R_NEED_NEW_SERVER;
		mutex_exit(&mi->mi_lock);

		if (recovp->rc_lost_rqst != NULL)
			nfs4_enqueue_lost_rqst(recovp, mi);
		break;

	case NR_CLIENTID:
		/*
		 * If the filesystem has been unmounted, punt.
		 */
		if (sp == NULL)
			goto out_no_thread;

		/*
		 * If nobody else is working on the clientid, mark the
		 * clientid as being no longer set.  Then mark the specific
		 * filesystem being worked on.
		 */
		if (!nfs4_server_in_recovery(sp)) {
			mutex_enter(&sp->s_lock);
			sp->s_flags &= ~N4S_CLIENTID_SET;
			mutex_exit(&sp->s_lock);
		}
		ASSERT(nfs_rw_lock_held(&mi->mi_recovlock, RW_READER) ||
		    nfs_rw_lock_held(&mi->mi_recovlock, RW_WRITER));
		mutex_enter(&mi->mi_lock);
		mi->mi_recovflags |= MI4R_NEED_CLIENTID;
		if (recovp->rc_srv_reboot)
			mi->mi_recovflags |= MI4R_SRV_REBOOT;
		mutex_exit(&mi->mi_lock);
		break;

	case NR_OPENFILES:
		ASSERT(nfs_rw_lock_held(&mi->mi_recovlock, RW_READER) ||
		    nfs_rw_lock_held(&mi->mi_recovlock, RW_WRITER));
		mutex_enter(&mi->mi_lock);
		mi->mi_recovflags |= MI4R_REOPEN_FILES;
		if (recovp->rc_srv_reboot)
			mi->mi_recovflags |= MI4R_SRV_REBOOT;
		mutex_exit(&mi->mi_lock);
		break;

	case NR_WRONGSEC:
		ASSERT(nfs_rw_lock_held(&mi->mi_recovlock, RW_READER) ||
		    nfs_rw_lock_held(&mi->mi_recovlock, RW_WRITER));
		mutex_enter(&mi->mi_lock);
		mi->mi_recovflags |= MI4R_NEED_SECINFO;
		mutex_exit(&mi->mi_lock);
		break;

	case NR_EXPIRED:
		if (vp1 != NULL)
			recov_badstate(recovp, vp1, NFS4ERR_EXPIRED);
		if (vp2 != NULL)
			recov_badstate(recovp, vp2, NFS4ERR_EXPIRED);
		goto out_no_thread;	/* no further recovery possible */

	case NR_BAD_STATEID:
		if (vp1 != NULL)
			recov_badstate(recovp, vp1, NFS4ERR_BAD_STATEID);
		if (vp2 != NULL)
			recov_badstate(recovp, vp2, NFS4ERR_BAD_STATEID);
		goto out_no_thread;	/* no further recovery possible */

	case NR_FHEXPIRED:
	case NR_BADHANDLE:
		if (vp1 != NULL)
			recov_throttle(recovp, vp1);
		if (vp2 != NULL)
			recov_throttle(recovp, vp2);
		/*
		 * Recover the filehandle now, rather than using a
		 * separate thread.
		 * We can do this because filehandle recovery is
		 * independent of any other state, and because we know that
		 * we are not competing with the recovery thread at this
		 * time.  recov_filehandle will deal with threads that are
		 * competing to recover this filehandle.
		 */
		ASSERT(nfs_rw_lock_held(&mi->mi_recovlock, RW_READER) ||
		    nfs_rw_lock_held(&mi->mi_recovlock, RW_WRITER));
		if (vp1 != NULL)
			recov_filehandle(recovp->rc_action, mi, vp1);
		if (vp2 != NULL)
			recov_filehandle(recovp->rc_action, mi, vp2);
		goto out_no_thread;	/* no further recovery needed */

	case NR_STALE:
		/*
		 * NFS4ERR_STALE handling
		 * recov_stale() could set MI4R_NEED_NEW_SERVER to
		 * indicate that we can and should failover.
		 */
		ASSERT(nfs_rw_lock_held(&mi->mi_recovlock, RW_READER) ||
		    nfs_rw_lock_held(&mi->mi_recovlock, RW_WRITER));

		if (vp1 != NULL)
			recov_stale(mi, vp1);
		if (vp2 != NULL)
			recov_stale(mi, vp2);
		mutex_enter(&mi->mi_lock);
		if ((mi->mi_recovflags & MI4R_NEED_NEW_SERVER) == 0) {
			mutex_exit(&mi->mi_lock);
			goto out_no_thread;
		}
		mutex_exit(&mi->mi_lock);
		recovp->rc_action = NR_FAILOVER;
		goto again;

	case NR_BAD_SEQID:
		if (recovp->rc_bseqid_rqst) {
			enqueue_bseqid_rqst(recovp, mi);
			break;
		}

		if (vp1 != NULL)
			recov_badstate(recovp, vp1, NFS4ERR_BAD_SEQID);
		if (vp2 != NULL)
			recov_badstate(recovp, vp2, NFS4ERR_BAD_SEQID);
		goto out_no_thread;	/* no further recovery possible */

	case NR_OLDSTATEID:
		if (vp1 != NULL)
			recov_badstate(recovp, vp1, NFS4ERR_OLD_STATEID);
		if (vp2 != NULL)
			recov_badstate(recovp, vp2, NFS4ERR_OLD_STATEID);
		goto out_no_thread;	/* no further recovery possible */

	case NR_GRACE:
		nfs4_set_grace_wait(mi);
		goto out_no_thread;	/* no further action required for GRACE */

	case NR_DELAY:
		if (vp1)
			nfs4_set_delay_wait(vp1);
		goto out_no_thread;	/* no further action required for DELAY */

	case NR_LOST_STATE_RQST:
	case NR_LOST_LOCK:
		nfs4_enqueue_lost_rqst(recovp, mi);
		break;

	default:
		nfs4_queue_event(RE_UNEXPECTED_ACTION, mi, NULL,
		    recovp->rc_action, NULL, NULL, 0, NULL, 0, TAG_NONE,
		    TAG_NONE, 0, 0);
		goto out_no_thread;
	}

	/*
	 * If either file recently went through the same recovery, wait
	 * awhile.  This is in case there is some sort of bug; we might not
	 * be able to recover properly, but at least we won't bombard the
	 * server with calls, and we won't tie up the client.
	 */
	if (vp1 != NULL)
		recov_throttle(recovp, vp1);
	if (vp2 != NULL)
		recov_throttle(recovp, vp2);

	/*
	 * If there's already a recovery thread, don't start another one.
	 */

	mutex_enter(&mi->mi_lock);
	if (mi->mi_flags & MI4_RECOV_ACTIV) {
		mutex_exit(&mi->mi_lock);
		goto out_no_thread;
	}
	mi->mi_flags |= MI4_RECOV_ACTIV;
	mutex_exit(&mi->mi_lock);
	NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
	    "start_recovery: starting new thread for mi %p", (void*)mi));

	recovp->rc_mi = mi;
	recovp->rc_vp1 = vp1;
	if (vp1 != NULL) {
		ASSERT(VTOMI4(vp1) == mi);
		VN_HOLD(recovp->rc_vp1);
	}
	recovp->rc_vp2 = vp2;
	if (vp2 != NULL) {
		ASSERT(VTOMI4(vp2) == mi);
		VN_HOLD(recovp->rc_vp2);
	}

	(void) zthread_create(NULL, 0, nfs4_recov_thread, recovp, 0,
	    minclsyspri);
	return;

	/* not reached by thread creating call */
out_no_thread:
	mutex_enter(&mi->mi_lock);
	mi->mi_in_recovery--;
	if (mi->mi_in_recovery == 0)
		cv_broadcast(&mi->mi_cv_in_recov);
	mutex_exit(&mi->mi_lock);

	VFS_RELE(mi->mi_vfsp);
	MI4_RELE(mi);
	/*
	 * Free up resources that were allocated for us.
	 */
	kmem_free(recovp, sizeof (recov_info_t));
}

static int
nfs4_check_recov_err(vnode_t *vp, nfs4_op_hint_t op,
    nfs4_recov_state_t *rsp, int retry_err_cnt, char *str)
{
	rnode4_t *rp;
	int error = 0;
	int exempt;

	if (vp == NULL)
		return (0);

	exempt = (op == OH_CLOSE || op == OH_LOCKU || op == OH_DELEGRETURN);
	rp = VTOR4(vp);
	mutex_enter(&rp->r_statelock);

	/*
	 * If there was a recovery error, then allow op hints "exempt" from
	 * recov errors to retry (currently 3 times).  Either r_error or
	 * EIO is returned for non-exempt op hints.
	 */
	if (rp->r_flags & R4RECOVERR) {
		if (exempt && rsp->rs_num_retry_despite_err <=
		    nfs4_max_recov_error_retry) {

			/*
			 * Check to make sure that we haven't already inc'd
			 * rs_num_retry_despite_err for current nfs4_start_fop
			 * instance.  We don't want to double inc (if we were
			 * called with vp2, then the vp1 call could have
			 * already incremented).
			 */
			if (retry_err_cnt == rsp->rs_num_retry_despite_err)
				rsp->rs_num_retry_despite_err++;

			NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
			    "nfs4_start_fop: %s %p DEAD, cnt=%d", str,
			    (void *)vp, rsp->rs_num_retry_despite_err));
		} else {
			error = (rp->r_error ? rp->r_error : EIO);
			/*
			 * An ESTALE error on a non-regular file is not
			 * "sticky".  Return the ESTALE error once, but
			 * clear the condition to allow future operations
			 * to go OTW.  This will allow the client to
			 * recover if the server has merely unshared then
			 * re-shared the file system.  For regular files,
			 * the unshare has destroyed the open state at the
			 * server and we aren't willing to do a reopen (yet).
			 */
			if (error == ESTALE && vp->v_type != VREG) {
				rp->r_flags &=
				    ~(R4RECOVERR|R4RECOVERRP|R4STALE);
				rp->r_error = 0;
				error = ESTALE;
			}
			NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
			    "nfs4_start_fop: %s %p DEAD, cnt=%d error=%d",
			    str, (void *)vp,
			    rsp->rs_num_retry_despite_err, error));
		}
	}

	mutex_exit(&rp->r_statelock);
	return (error);
}
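/*
 * Note that rs_num_retry_despite_err is bumped at most once per
 * nfs4_start_fop() call: the retry_err_cnt snapshot taken by
 * nfs4_start_fop() below is passed to both the vp1 and vp2 checks, and
 * the counter is only incremented while it still equals the snapshot.
 */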
/*
 * Initial setup code that every operation should call if it might invoke
 * client recovery.  Can block waiting for recovery to finish on a
 * filesystem.  Either vnode ptr can be NULL.
 *
 * Returns 0 if there are no outstanding errors.  Can return an
 * errno value under various circumstances (e.g., failed recovery, or
 * interrupted while waiting for recovery to finish).
 *
 * There must be a corresponding call to nfs4_end_op() to free up any locks
 * or resources allocated by this call (assuming this call succeeded),
 * using the same rsp that's passed in here.
 *
 * The open and lock seqid synchronization must be stopped before calling this
 * function, as it could lead to deadlock when trying to reopen a file or
 * reclaim a lock.  The synchronization is obtained with calls to:
 *	nfs4_start_open_seqid_sync()
 *	nfs4_start_lock_seqid_sync()
 *
 * *startrecovp is set TRUE if the caller should not bother with the
 * over-the-wire call, and just initiate recovery for the given request.
 * This is typically used for state-releasing ops if the filesystem has
 * been forcibly unmounted.  startrecovp may be NULL for
 * non-state-releasing ops.
 */

int
nfs4_start_fop(mntinfo4_t *mi, vnode_t *vp1, vnode_t *vp2, nfs4_op_hint_t op,
    nfs4_recov_state_t *rsp, bool_t *startrecovp)
{
	int error = 0, rerr_cnt;
	nfs4_server_t *sp = NULL;
	nfs4_server_t *tsp;
	nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };
	time_t droplock_time;
#ifdef DEBUG
	void *fop_caller;
#endif

	ASSERT(vp1 == NULL || vp1->v_vfsp == mi->mi_vfsp);
	ASSERT(vp2 == NULL || vp2->v_vfsp == mi->mi_vfsp);

#ifdef DEBUG
	if ((fop_caller = tsd_get(nfs4_tsd_key)) != NULL) {
		cmn_err(CE_PANIC, "Missing nfs4_end_fop: last caller %p",
		    fop_caller);
	}
	(void) tsd_set(nfs4_tsd_key, caller());
#endif

	rsp->rs_sp = NULL;
	rsp->rs_flags &= ~NFS4_RS_RENAME_HELD;
	rerr_cnt = rsp->rs_num_retry_despite_err;

	/*
	 * Process the items that may delay() based on server response
	 */
	error = nfs4_wait_for_grace(mi, rsp);
	if (error)
		goto out;

	if (vp1 != NULL) {
		error = nfs4_wait_for_delay(vp1, rsp);
		if (error)
			goto out;
	}

	/* Wait for a delegation recall to complete. */

	error = wait_for_recall(vp1, vp2, op, rsp);
	if (error)
		goto out;

	/*
	 * Wait for any current recovery actions to finish.  Note that a
	 * recovery thread can still start up after wait_for_recovery()
	 * finishes.  We don't block out recovery operations until we
	 * acquire s_recovlock and mi_recovlock.
	 */
	error = wait_for_recovery(mi, op);
	if (error)
		goto out;

	/*
	 * Check to see if the rnode is already marked with a
	 * recovery error.  If so, return it immediately.  But
	 * always pass CLOSE, LOCKU, and DELEGRETURN so we can
	 * clean up state on the server.
	 */

	if (vp1 != NULL) {
		if (error = nfs4_check_recov_err(vp1, op, rsp, rerr_cnt, "vp1"))
			goto out;
		nfs4_check_remap(mi, vp1, NFS4_REMAP_CKATTRS, &e);
	}

	if (vp2 != NULL) {
		if (error = nfs4_check_recov_err(vp2, op, rsp, rerr_cnt, "vp2"))
			goto out;
		nfs4_check_remap(mi, vp2, NFS4_REMAP_CKATTRS, &e);
	}

	/*
	 * The lock order calls for us to acquire s_recovlock before
	 * mi_recovlock, but we have to hold mi_recovlock to look up sp (to
	 * prevent races with the failover/migration code).  So acquire
	 * mi_recovlock, look up sp, drop mi_recovlock, acquire
	 * s_recovlock and mi_recovlock, then verify that sp is still the
	 * right object.  XXX Can we find a simpler way to deal with this?
	 */
	if (nfs_rw_enter_sig(&mi->mi_recovlock, RW_READER,
	    mi->mi_flags & MI4_INT)) {
		error = EINTR;
		goto out;
	}
get_sp:
	sp = find_nfs4_server(mi);
	if (sp != NULL) {
		sp->s_otw_call_count++;
		mutex_exit(&sp->s_lock);
		droplock_time = gethrestime_sec();
	}
	nfs_rw_exit(&mi->mi_recovlock);

	if (sp != NULL) {
		if (nfs_rw_enter_sig(&sp->s_recovlock, RW_READER,
		    mi->mi_flags & MI4_INT)) {
			error = EINTR;
			goto out;
		}
	}
	if (nfs_rw_enter_sig(&mi->mi_recovlock, RW_READER,
	    mi->mi_flags & MI4_INT)) {
		if (sp != NULL)
			nfs_rw_exit(&sp->s_recovlock);
		error = EINTR;
		goto out;
	}
	/*
	 * If the mntinfo4_t's nfs4_server_t hasn't changed, then
	 * there's no point in double checking to make sure it
	 * has switched.
	 */
	if (sp == NULL || droplock_time < mi->mi_srvsettime) {
		tsp = find_nfs4_server(mi);
		if (tsp != sp) {
			/* try again */
			if (tsp != NULL) {
				mutex_exit(&tsp->s_lock);
				nfs4_server_rele(tsp);
				tsp = NULL;
			}
			if (sp != NULL) {
				nfs_rw_exit(&sp->s_recovlock);
				mutex_enter(&sp->s_lock);
				sp->s_otw_call_count--;
				mutex_exit(&sp->s_lock);
				nfs4_server_rele(sp);
				sp = NULL;
			}
			goto get_sp;
		} else {
			if (tsp != NULL) {
				mutex_exit(&tsp->s_lock);
				nfs4_server_rele(tsp);
				tsp = NULL;
			}
		}
	}

	if (sp != NULL) {
		rsp->rs_sp = sp;
	}

	/*
	 * If the filesystem uses volatile filehandles, obtain a lock so
	 * that we synchronize with renames.  Exception: mount operations
	 * can change mi_fh_expire_type, which could be a problem, since
	 * the end_op code needs to be consistent with the start_op code
	 * about mi_rename_lock.  Since mounts don't compete with renames,
	 * it's simpler to just not acquire the rename lock for mounts.
	 */
	if (NFS4_VOLATILE_FH(mi) && op != OH_MOUNT) {
		if (nfs_rw_enter_sig(&mi->mi_rename_lock,
		    op == OH_VFH_RENAME ? RW_WRITER : RW_READER,
		    mi->mi_flags & MI4_INT)) {
			nfs_rw_exit(&mi->mi_recovlock);
			if (sp != NULL)
				nfs_rw_exit(&sp->s_recovlock);
			error = EINTR;
			goto out;
		}
		rsp->rs_flags |= NFS4_RS_RENAME_HELD;
	}

	if (OH_IS_STATE_RELE(op)) {
		/*
		 * For forced unmount, letting the request proceed will
		 * almost always delay response to the user, so hand it off
		 * to the recovery thread.  For exiting lwp's, we don't
		 * have a good way to tell if the request will hang.  We
		 * generally want processes to handle their own requests so
		 * that they can be done in parallel, but if there is
		 * already a recovery thread, hand the request off to it.
		 * This will improve user response at no cost to overall
		 * system throughput.  For zone shutdown, we'd prefer
		 * the recovery thread to handle this as well.
		 */
		ASSERT(startrecovp != NULL);
		mutex_enter(&mi->mi_lock);
		if (FS_OR_ZONE_GONE4(mi->mi_vfsp))
			*startrecovp = TRUE;
		else if ((curthread->t_proc_flag & TP_LWPEXIT) &&
		    (mi->mi_flags & MI4_RECOV_ACTIV))
			*startrecovp = TRUE;
		else
			*startrecovp = FALSE;
		mutex_exit(&mi->mi_lock);
	} else
		if (startrecovp != NULL)
			*startrecovp = FALSE;

	ASSERT(error == 0);
	return (error);

out:
	ASSERT(error != 0);
	if (sp != NULL) {
		mutex_enter(&sp->s_lock);
		sp->s_otw_call_count--;
		mutex_exit(&sp->s_lock);
		nfs4_server_rele(sp);
		rsp->rs_sp = NULL;
	}
	nfs4_end_op_recall(vp1, vp2, rsp);

#ifdef DEBUG
	(void) tsd_set(nfs4_tsd_key, NULL);
#endif
	return (error);
}

/*
 * It is up to the caller to determine if rsp->rs_sp being NULL
 * is detrimental or not.
 */
int
nfs4_start_op(mntinfo4_t *mi, vnode_t *vp1, vnode_t *vp2,
    nfs4_recov_state_t *rsp)
{
	ASSERT(rsp->rs_num_retry_despite_err == 0);
	rsp->rs_num_retry_despite_err = 0;
	return (nfs4_start_fop(mi, vp1, vp2, OH_OTHER, rsp, NULL));
}

/*
 * Release any resources acquired by nfs4_start_op().
 * 'sp' should be the nfs4_server pointer returned by nfs4_start_op().
 *
 * The operation hint is used to avoid a deadlock by bypassing delegation
 * return logic for writes, which are done while returning a delegation.
 */

void
nfs4_end_fop(mntinfo4_t *mi, vnode_t *vp1, vnode_t *vp2, nfs4_op_hint_t op,
    nfs4_recov_state_t *rsp, bool_t needs_recov)
{
	nfs4_server_t *sp = rsp->rs_sp;
	rnode4_t *rp = NULL;

#ifdef lint
	/*
	 * The op hint isn't used any more, but might be in
	 * the future.
	 */
	op = op;
#endif

#ifdef DEBUG
	ASSERT(tsd_get(nfs4_tsd_key) != NULL);
	(void) tsd_set(nfs4_tsd_key, NULL);
#endif

	nfs4_end_op_recall(vp1, vp2, rsp);

	if (rsp->rs_flags & NFS4_RS_RENAME_HELD)
		nfs_rw_exit(&mi->mi_rename_lock);

	if (!needs_recov) {
		if (rsp->rs_flags & NFS4_RS_DELAY_MSG) {
			/* may need to clear the delay interval */
			if (vp1 != NULL) {
				rp = VTOR4(vp1);
				mutex_enter(&rp->r_statelock);
				rp->r_delay_interval = 0;
				mutex_exit(&rp->r_statelock);
			}
		}
		rsp->rs_flags &= ~(NFS4_RS_GRACE_MSG|NFS4_RS_DELAY_MSG);
	}

	/*
	 * If the corresponding nfs4_start_op() found a sp,
	 * then there must still be a sp.
	 */
	if (sp != NULL) {
		nfs_rw_exit(&mi->mi_recovlock);
		nfs_rw_exit(&sp->s_recovlock);
		mutex_enter(&sp->s_lock);
		sp->s_otw_call_count--;
		cv_broadcast(&sp->s_cv_otw_count);
		mutex_exit(&sp->s_lock);
		nfs4_server_rele(sp);
	} else {
		nfs_rw_exit(&mi->mi_recovlock);
	}
}

void
nfs4_end_op(mntinfo4_t *mi, vnode_t *vp1, vnode_t *vp2,
    nfs4_recov_state_t *rsp, bool_t needrecov)
{
	nfs4_end_fop(mi, vp1, vp2, OH_OTHER, rsp, needrecov);
}
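/*
 * Under DEBUG, nfs4_start_fop() records its caller in the nfs4_tsd_key
 * thread-specific data and nfs4_end_fop() clears it, so a missing
 * nfs4_end_fop() is caught (with a "Missing nfs4_end_fop" panic) the
 * next time the same thread calls nfs4_start_fop().
 */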
/*
 * If the filesystem is going through client recovery, block until
 * finished.
 * Exceptions:
 * - state-releasing ops (CLOSE, LOCKU, DELEGRETURN) are allowed to proceed
 *   if the filesystem has been forcibly unmounted or the lwp is exiting.
 *
 * Return value:
 * - 0 if no errors
 * - EINTR if the call was interrupted
 * - EIO if the filesystem has been forcibly unmounted (non-state-releasing
 *   op)
 * - the errno value from the recovery thread, if recovery failed
 */

static int
wait_for_recovery(mntinfo4_t *mi, nfs4_op_hint_t op_hint)
{
	int error = 0;

	mutex_enter(&mi->mi_lock);

	while (mi->mi_recovflags != 0) {
		klwp_t *lwp = ttolwp(curthread);

		if ((mi->mi_vfsp->vfs_flag & VFS_UNMOUNTED) ||
		    (mi->mi_flags & MI4_RECOV_FAIL))
			break;
		if (OH_IS_STATE_RELE(op_hint) &&
		    (curthread->t_proc_flag & TP_LWPEXIT))
			break;

		if (lwp != NULL)
			lwp->lwp_nostop++;
		/* XXX - use different cv? */
		if (cv_wait_sig(&mi->mi_failover_cv, &mi->mi_lock) == 0) {
			error = EINTR;
			if (lwp != NULL)
				lwp->lwp_nostop--;
			break;
		}
		if (lwp != NULL)
			lwp->lwp_nostop--;
	}

	if ((mi->mi_vfsp->vfs_flag & VFS_UNMOUNTED) &&
	    !OH_IS_STATE_RELE(op_hint)) {
		NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
		    "wait_for_recovery: forced unmount"));
		error = EIO;
	} else if (mi->mi_flags & MI4_RECOV_FAIL) {
		NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
		    "wait_for_recovery: fail since RECOV FAIL"));
		error = mi->mi_error;
	}

	mutex_exit(&mi->mi_lock);

	return (error);
}

/*
 * If the client received NFS4ERR_GRACE for this particular mount,
 * the client blocks here until it is time to try again.
 *
 * Return value:
 * - 0 if wait was successful
 * - EINTR if the call was interrupted
 */

int
nfs4_wait_for_grace(mntinfo4_t *mi, nfs4_recov_state_t *rsp)
{
	int error = 0;
	time_t curtime, time_to_wait;

	/* do an unprotected check to reduce mi_lock contention */
	if (mi->mi_grace_wait != 0) {
		mutex_enter(&mi->mi_lock);

		if (mi->mi_grace_wait != 0) {
			if (!(rsp->rs_flags & NFS4_RS_GRACE_MSG))
				rsp->rs_flags |= NFS4_RS_GRACE_MSG;

			curtime = gethrestime_sec();

			if (curtime < mi->mi_grace_wait) {

				time_to_wait = mi->mi_grace_wait - curtime;

				mutex_exit(&mi->mi_lock);

				delay(SEC_TO_TICK(time_to_wait));

				curtime = gethrestime_sec();

				mutex_enter(&mi->mi_lock);

				if (curtime >= mi->mi_grace_wait)
					mi->mi_grace_wait = 0;
			} else {
				mi->mi_grace_wait = 0;
			}
		}
		mutex_exit(&mi->mi_lock);
	}

	return (error);
}
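/*
 * mi_grace_wait (and r_delay_wait below) hold an absolute time, in
 * seconds, before which it is not worth retrying; they are stamped via
 * nfs4_set_grace_wait() and nfs4_set_delay_wait() from the NR_GRACE and
 * NR_DELAY cases in start_recovery().  A waiter sleeps for the remaining
 * interval and then clears the field.
 */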
/*
 * If the client received NFS4ERR_DELAY for an operation on a vnode,
 * the client blocks here until it is time to try again.
 *
 * Return value:
 * - 0 if wait was successful
 * - EINTR if the call was interrupted
 */

int
nfs4_wait_for_delay(vnode_t *vp, nfs4_recov_state_t *rsp)
{
	int error = 0;
	time_t curtime, time_to_wait;
	rnode4_t *rp;

	ASSERT(vp != NULL);

	rp = VTOR4(vp);

	/* do an unprotected check to reduce r_statelock contention */
	if (rp->r_delay_wait != 0) {
		mutex_enter(&rp->r_statelock);

		if (rp->r_delay_wait != 0) {

			if (!(rsp->rs_flags & NFS4_RS_DELAY_MSG)) {
				rsp->rs_flags |= NFS4_RS_DELAY_MSG;
				nfs4_mi_kstat_inc_delay(VTOMI4(vp));
			}

			curtime = gethrestime_sec();

			if (curtime < rp->r_delay_wait) {

				time_to_wait = rp->r_delay_wait - curtime;

				mutex_exit(&rp->r_statelock);

				delay(SEC_TO_TICK(time_to_wait));

				curtime = gethrestime_sec();

				mutex_enter(&rp->r_statelock);

				if (curtime >= rp->r_delay_wait)
					rp->r_delay_wait = 0;
			} else {
				rp->r_delay_wait = 0;
			}
		}
		mutex_exit(&rp->r_statelock);
	}

	return (error);
}

/*
 * The recovery thread.
 */

static void
nfs4_recov_thread(recov_info_t *recovp)
{
	mntinfo4_t *mi = recovp->rc_mi;
	nfs4_server_t *sp;
	int done = 0, error = 0;
	bool_t recov_fail = FALSE;
	callb_cpr_t cpr_info;
	kmutex_t cpr_lock;

	nfs4_queue_event(RE_START, mi, NULL, mi->mi_recovflags,
	    recovp->rc_vp1, recovp->rc_vp2, 0, NULL, 0, TAG_NONE, TAG_NONE,
	    0, 0);

	mutex_init(&cpr_lock, NULL, MUTEX_DEFAULT, NULL);
	CALLB_CPR_INIT(&cpr_info, &cpr_lock, callb_generic_cpr, "nfsv4Recov");

	mutex_enter(&mi->mi_lock);
	mi->mi_recovthread = curthread;
	mutex_exit(&mi->mi_lock);

	/*
	 * We don't really need protection here against failover or
	 * migration, since the current thread is the one that would make
	 * any changes, but hold mi_recovlock anyway for completeness (and
	 * to satisfy any ASSERTs).
	 */
	(void) nfs_rw_enter_sig(&mi->mi_recovlock, RW_READER, 0);
	sp = find_nfs4_server(mi);
	if (sp != NULL)
		mutex_exit(&sp->s_lock);
	nfs_rw_exit(&mi->mi_recovlock);

	/*
	 * Do any necessary recovery, based on the information in recovp
	 * and any recovery flags.
	 */

	do {
		mutex_enter(&mi->mi_lock);
		if (FS_OR_ZONE_GONE4(mi->mi_vfsp)) {
			bool_t activesrv;

			NFS4_DEBUG(nfs4_client_recov_debug &&
			    mi->mi_vfsp->vfs_flag & VFS_UNMOUNTED, (CE_NOTE,
			    "nfs4_recov_thread: file system has been "
			    "unmounted"));
			NFS4_DEBUG(nfs4_client_recov_debug &&
			    zone_status_get(curproc->p_zone) >=
			    ZONE_IS_SHUTTING_DOWN, (CE_NOTE,
			    "nfs4_recov_thread: zone shutting down"));
			/*
			 * If the server has lost its state for us and
			 * the filesystem is unmounted, then the filesystem
			 * can be tossed, even if there are lost lock or
			 * lost state calls in the recovery queue.
			 */
			if (mi->mi_recovflags &
			    (MI4R_NEED_CLIENTID | MI4R_REOPEN_FILES)) {
				NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
				    "nfs4_recov_thread: bailing out"));
				mi->mi_flags |= MI4_RECOV_FAIL;
				mi->mi_error = recovp->rc_error;
				recov_fail = TRUE;
			}
			/*
			 * We don't know if the server has any state for
			 * us, and the filesystem has been unmounted.
			 * If there are "lost state" recovery items, keep
			 * trying to process them until there are no more
			 * mounted filesystems for the server.  Otherwise,
			 * bail out.  The reason we don't mark the
			 * filesystem as failing recovery is in case we
			 * have to do "lost state" recovery later (e.g., a
			 * user process exits).
			 */
			if (!(mi->mi_recovflags & MI4R_LOST_STATE)) {
				done = 1;
				mutex_exit(&mi->mi_lock);
				break;
			}
			mutex_exit(&mi->mi_lock);

			if (sp == NULL)
				activesrv = FALSE;
			else {
				mutex_enter(&sp->s_lock);
				activesrv = nfs4_fs_active(sp);
			}
			if (!activesrv) {
				NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
				    "no active fs for server %p",
				    (void *)sp));
				mutex_enter(&mi->mi_lock);
				mi->mi_flags |= MI4_RECOV_FAIL;
				mi->mi_error = recovp->rc_error;
				mutex_exit(&mi->mi_lock);
				recov_fail = TRUE;
				if (sp != NULL) {
					/*
					 * Mark the server instance as
					 * dead, so that nobody will attach
					 * a new filesystem.
					 */
					nfs4_mark_srv_dead(sp);
				}
			}
			if (sp != NULL)
				mutex_exit(&sp->s_lock);
		} else {
			mutex_exit(&mi->mi_lock);
		}

		/*
		 * Check if we need to select a new server for a
		 * failover.  Choosing a new server will force at
		 * least a check of the clientid.
		 */
		mutex_enter(&mi->mi_lock);
		if (!recov_fail &&
		    (mi->mi_recovflags & MI4R_NEED_NEW_SERVER)) {
			mutex_exit(&mi->mi_lock);
			recov_newserver(recovp, &sp, &recov_fail);
		} else
			mutex_exit(&mi->mi_lock);

		/*
		 * Check if we need to recover the clientid.  This
		 * must be done before file and lock recovery, and it
		 * potentially affects the recovery threads for other
		 * filesystems, so it gets special treatment.
		 */
		if (sp != NULL && recov_fail == FALSE) {
			mutex_enter(&sp->s_lock);
			if (!(sp->s_flags & N4S_CLIENTID_SET)) {
				mutex_exit(&sp->s_lock);
				recov_clientid(recovp, sp);
			} else {
				/*
				 * Unset this flag in case another recovery
				 * thread successfully recovered the clientid
				 * for us already.
				 */
				mutex_enter(&mi->mi_lock);
				mi->mi_recovflags &= ~MI4R_NEED_CLIENTID;
				mutex_exit(&mi->mi_lock);
				mutex_exit(&sp->s_lock);
			}
		}

		/*
		 * Check if we need to get the security information.
		 */
		mutex_enter(&mi->mi_lock);
		if ((mi->mi_recovflags & MI4R_NEED_SECINFO) &&
		    !(mi->mi_flags & MI4_RECOV_FAIL)) {
			mutex_exit(&mi->mi_lock);
			(void) nfs_rw_enter_sig(&mi->mi_recovlock,
			    RW_WRITER, 0);
			error = nfs4_secinfo_recov(recovp->rc_mi,
			    recovp->rc_vp1, recovp->rc_vp2);
			/*
			 * If error, nothing more can be done, stop
			 * the recovery.
			 */
			if (error) {
				mutex_enter(&mi->mi_lock);
				mi->mi_flags |= MI4_RECOV_FAIL;
				mi->mi_error = recovp->rc_error;
				mutex_exit(&mi->mi_lock);
				nfs4_queue_event(RE_WRONGSEC, mi, NULL,
				    error, recovp->rc_vp1, recovp->rc_vp2,
				    0, NULL, 0, TAG_NONE, TAG_NONE, 0, 0);
			}
			nfs_rw_exit(&mi->mi_recovlock);
		} else
			mutex_exit(&mi->mi_lock);

		/*
		 * Check if there's a bad seqid to recover.
		 */
		mutex_enter(&mi->mi_lock);
		if ((mi->mi_recovflags & MI4R_BAD_SEQID) &&
		    !(mi->mi_flags & MI4_RECOV_FAIL)) {
			mutex_exit(&mi->mi_lock);
			(void) nfs_rw_enter_sig(&mi->mi_recovlock,
			    RW_WRITER, 0);
			recov_bad_seqid(recovp);
			nfs_rw_exit(&mi->mi_recovlock);
		} else
			mutex_exit(&mi->mi_lock);

		/*
		 * Next check for recovery that affects the entire
		 * filesystem.
		 */
		if (sp != NULL) {
			mutex_enter(&mi->mi_lock);
			if ((mi->mi_recovflags & MI4R_REOPEN_FILES) &&
			    !(mi->mi_flags & MI4_RECOV_FAIL)) {
				mutex_exit(&mi->mi_lock);
				recov_openfiles(recovp, sp);
			} else
				mutex_exit(&mi->mi_lock);
		}

		/*
		 * Send any queued state recovery requests.
		 */
		mutex_enter(&mi->mi_lock);
		if (sp != NULL &&
		    (mi->mi_recovflags & MI4R_LOST_STATE) &&
		    !(mi->mi_flags & MI4_RECOV_FAIL)) {
			mutex_exit(&mi->mi_lock);
			(void) nfs_rw_enter_sig(&mi->mi_recovlock,
			    RW_WRITER, 0);
			nfs4_resend_lost_rqsts(recovp, sp);
			if (list_head(&mi->mi_lost_state) == NULL) {
				/* done */
				mutex_enter(&mi->mi_lock);
				mi->mi_recovflags &= ~MI4R_LOST_STATE;
				mutex_exit(&mi->mi_lock);
			}
			nfs_rw_exit(&mi->mi_recovlock);
		} else {
			mutex_exit(&mi->mi_lock);
		}

		/*
		 * See if there is anything more to do.  If not, announce
		 * that we are done and exit.
		 *
		 * Need mi_recovlock to keep 'sp' valid.  Must grab
		 * mi_recovlock before mi_lock to preserve lock ordering.
		 */
		(void) nfs_rw_enter_sig(&mi->mi_recovlock, RW_READER, 0);
		mutex_enter(&mi->mi_lock);
		if ((mi->mi_recovflags & ~MI4R_SRV_REBOOT) == 0 ||
		    (mi->mi_flags & MI4_RECOV_FAIL)) {
			list_t local_lost_state;
			nfs4_lost_rqst_t *lrp;

			/*
			 * We need to remove the lost requests before we
			 * unmark the mi as no longer doing recovery to
			 * avoid a race with a new thread putting new lost
			 * requests on the same mi (and the going away
			 * thread would remove the new lost requests).
			 *
			 * Move the lost requests to a local list since
			 * nfs4_remove_lost_rqst() drops mi_lock, and
			 * dropping the mi_lock would make our check to
			 * see if recovery is done no longer valid.
			 */
			list_create(&local_lost_state,
			    sizeof (nfs4_lost_rqst_t),
			    offsetof(nfs4_lost_rqst_t, lr_node));
			list_move_tail(&local_lost_state, &mi->mi_lost_state);

			done = 1;
			mutex_exit(&mi->mi_lock);
			/*
			 * Now officially free the "moved"
			 * lost requests.
			 */
			while ((lrp = list_head(&local_lost_state)) != NULL) {
				list_remove(&local_lost_state, lrp);
				nfs4_free_lost_rqst(lrp, sp);
			}
			list_destroy(&local_lost_state);
		} else
			mutex_exit(&mi->mi_lock);
		nfs_rw_exit(&mi->mi_recovlock);

		/*
		 * If the filesystem has been forcibly unmounted, there is
		 * probably no point in retrying immediately.  Furthermore,
		 * there might be user processes waiting for a chance to
		 * queue up "lost state" requests, so that they can exit.
		 * So pause here for a moment.  Same logic for zone shutdown.
		 */
		if (!done && FS_OR_ZONE_GONE4(mi->mi_vfsp)) {
			mutex_enter(&mi->mi_lock);
			cv_broadcast(&mi->mi_failover_cv);
			mutex_exit(&mi->mi_lock);
			delay(SEC_TO_TICK(nfs4_unmount_delay));
		}

	} while (!done);

	if (sp != NULL)
		nfs4_server_rele(sp);

	/*
	 * Return all recalled delegations
	 */
	nfs4_dlistclean();

	mutex_enter(&mi->mi_lock);
	recov_done(mi, recovp);
	mutex_exit(&mi->mi_lock);

	/*
	 * Free up resources that were allocated for us.
	 */
	if (recovp->rc_vp1 != NULL)
		VN_RELE(recovp->rc_vp1);
	if (recovp->rc_vp2 != NULL)
		VN_RELE(recovp->rc_vp2);

	/* now we are done using the mi struct, signal the waiters */
	mutex_enter(&mi->mi_lock);
	mi->mi_in_recovery--;
	if (mi->mi_in_recovery == 0)
		cv_broadcast(&mi->mi_cv_in_recov);
	mutex_exit(&mi->mi_lock);

	VFS_RELE(mi->mi_vfsp);
	MI4_RELE(mi);
	kmem_free(recovp, sizeof (recov_info_t));
	mutex_enter(&cpr_lock);
	CALLB_CPR_EXIT(&cpr_info);
	mutex_destroy(&cpr_lock);
	zthread_exit();
}

/*
 * Log the end of recovery and notify any waiting threads.
 */

static void
recov_done(mntinfo4_t *mi, recov_info_t *recovp)
{

	ASSERT(MUTEX_HELD(&mi->mi_lock));

	nfs4_queue_event(RE_END, mi, NULL, 0, recovp->rc_vp1,
	    recovp->rc_vp2, 0, NULL, 0, TAG_NONE, TAG_NONE, 0, 0);
	mi->mi_recovthread = NULL;
	mi->mi_flags &= ~MI4_RECOV_ACTIV;
	mi->mi_recovflags &= ~MI4R_SRV_REBOOT;
	cv_broadcast(&mi->mi_failover_cv);
}

/*
 * State-specific recovery routines, by state.
 */

/*
 * Failover.
 *
 * Replaces *spp with a reference to the new server, which must
 * eventually be freed.
 */

static void
recov_newserver(recov_info_t *recovp, nfs4_server_t **spp, bool_t *recov_fail)
{
	mntinfo4_t *mi = recovp->rc_mi;
	servinfo4_t *svp = NULL;
	nfs4_server_t *osp = *spp;
	CLIENT *cl;
	enum clnt_stat status;
	struct timeval tv;
	int error;
	int oncethru = 0;
	rnode4_t *rp;
	int index;
	nfs_fh4 fh;
	char *snames;
	size_t len;

	(void) nfs_rw_enter_sig(&mi->mi_recovlock, RW_WRITER, 0);

	tv.tv_sec = 2;
	tv.tv_usec = 0;

#ifdef lint
	/*
	 * Lint can't follow the logic, so thinks that snames and len
	 * can be used before being set.  They can't, but lint can't
	 * figure it out.  To address the lint warning, initialize
	 * snames and len for lint.
	 */
	snames = NULL;
	len = 0;
#endif

	/*
	 * Ping the null NFS procedure of every server in
	 * the list until one responds.  We always start
	 * at the head of the list and always skip the one
	 * that is current, since it's caused us a problem.
	 */
	while (svp == NULL) {
		for (svp = mi->mi_servers; svp; svp = svp->sv_next) {

			mutex_enter(&mi->mi_lock);
			if (FS_OR_ZONE_GONE4(mi->mi_vfsp)) {
				mi->mi_flags |= MI4_RECOV_FAIL;
				mutex_exit(&mi->mi_lock);
				(void) nfs_rw_exit(&mi->mi_recovlock);
				*recov_fail = TRUE;
				if (oncethru)
					kmem_free(snames, len);
				return;
			}
			mutex_exit(&mi->mi_lock);

			(void) nfs_rw_enter_sig(&svp->sv_lock, RW_READER, 0);
			if (svp->sv_flags & SV4_NOTINUSE) {
				nfs_rw_exit(&svp->sv_lock);
				continue;
			}
			nfs_rw_exit(&svp->sv_lock);

			if (!oncethru && svp == mi->mi_curr_serv)
				continue;

			error = clnt_tli_kcreate(svp->sv_knconf, &svp->sv_addr,
			    NFS_PROGRAM, NFS_V4, 0, 1, CRED(), &cl);
			if (error)
				continue;

			if (!(mi->mi_flags & MI4_INT))
				cl->cl_nosignal = TRUE;
			status = CLNT_CALL(cl, RFS_NULL, xdr_void, NULL,
			    xdr_void, NULL, tv);
			if (!(mi->mi_flags & MI4_INT))
				cl->cl_nosignal = FALSE;
			AUTH_DESTROY(cl->cl_auth);
			CLNT_DESTROY(cl);
			if (status == RPC_SUCCESS) {
				nfs4_queue_event(RE_FAILOVER, mi,
				    svp == mi->mi_curr_serv ? NULL :
				    svp->sv_hostname, 0, NULL, NULL, 0,
				    NULL, 0, TAG_NONE, TAG_NONE, 0, 0);
				break;
			}
		}

		if (svp == NULL) {
			if (!oncethru) {
				snames = nfs4_getsrvnames(mi, &len);
				nfs4_queue_fact(RF_SRVS_NOT_RESPOND, mi,
				    0, 0, 0, FALSE, snames, 0, NULL);
				oncethru = 1;
			}
			delay(hz);
		}
	}

	if (oncethru) {
		nfs4_queue_fact(RF_SRVS_OK, mi, 0, 0, 0, FALSE, snames,
		    0, NULL);
		kmem_free(snames, len);
	}

#if DEBUG
	(void) nfs_rw_enter_sig(&svp->sv_lock, RW_READER, 0);
	ASSERT((svp->sv_flags & SV4_NOTINUSE) == 0);
	nfs_rw_exit(&svp->sv_lock);
#endif

	mutex_enter(&mi->mi_lock);
	mi->mi_recovflags &= ~MI4R_NEED_NEW_SERVER;
	if (svp != mi->mi_curr_serv) {
		servinfo4_t *osvp = mi->mi_curr_serv;

		mutex_exit(&mi->mi_lock);

		/*
		 * Update server-dependent fields in the root vnode.
		 */
		index = rtable4hash(mi->mi_rootfh);
		rw_enter(&rtable4[index].r_lock, RW_WRITER);

		rp = r4find(&rtable4[index], mi->mi_rootfh, mi->mi_vfsp);
		if (rp != NULL) {
			NFS4_DEBUG(nfs4_client_failover_debug, (CE_NOTE,
			    "recov_newserver: remapping %s", rnode4info(rp)));
			mutex_enter(&rp->r_statelock);
			rp->r_server = svp;
			PURGE_ATTRCACHE4_LOCKED(rp);
			mutex_exit(&rp->r_statelock);
			(void) nfs4_free_data_reclaim(rp);
			nfs4_purge_rddir_cache(RTOV4(rp));
			rw_exit(&rtable4[index].r_lock);
			NFS4_DEBUG(nfs4_client_failover_debug, (CE_NOTE,
			    "recov_newserver: done with %s",
			    rnode4info(rp)));
			VN_RELE(RTOV4(rp));
		} else
			rw_exit(&rtable4[index].r_lock);
		(void) dnlc_purge_vfsp(mi->mi_vfsp, 0);

		mutex_enter(&mi->mi_lock);
		mi->mi_recovflags |= MI4R_REOPEN_FILES | MI4R_REMAP_FILES;
		if (recovp->rc_srv_reboot)
			mi->mi_recovflags |= MI4R_SRV_REBOOT;
		mi->mi_curr_serv = svp;
		mi->mi_failover++;
		mi->mi_flags &= ~MI4_BADOWNER_DEBUG;
		mutex_exit(&mi->mi_lock);

		(void) nfs_rw_enter_sig(&svp->sv_lock, RW_READER, 0);
		fh.nfs_fh4_len = svp->sv_fhandle.fh_len;
		fh.nfs_fh4_val = svp->sv_fhandle.fh_buf;
		sfh4_update(mi->mi_rootfh, &fh);
		fh.nfs_fh4_len = svp->sv_pfhandle.fh_len;
		fh.nfs_fh4_val = svp->sv_pfhandle.fh_buf;
		sfh4_update(mi->mi_srvparentfh, &fh);
		nfs_rw_exit(&svp->sv_lock);

		*spp = nfs4_move_mi(mi, osvp, svp);
		if (osp != NULL)
			nfs4_server_rele(osp);
	} else
		mutex_exit(&mi->mi_lock);
	(void) nfs_rw_exit(&mi->mi_recovlock);
}

/*
 * Clientid.
 */

static void
recov_clientid(recov_info_t *recovp, nfs4_server_t *sp)
{
	mntinfo4_t *mi = recovp->rc_mi;
	int error = 0;
	int still_stale;
	int need_new_s;

	ASSERT(sp != NULL);

	/*
	 * Acquire the recovery lock and then verify that the clientid
	 * still needs to be recovered.  (Note that s_recovlock is supposed
	 * to be acquired before s_lock.)  Since the thread holds the
	 * recovery lock, no other thread will recover the clientid.
	 */
	(void) nfs_rw_enter_sig(&sp->s_recovlock, RW_WRITER, 0);
	(void) nfs_rw_enter_sig(&mi->mi_recovlock, RW_WRITER, 0);
	mutex_enter(&sp->s_lock);
	still_stale = ((sp->s_flags & N4S_CLIENTID_SET) == 0);
	mutex_exit(&sp->s_lock);

	if (still_stale) {
		nfs4_error_t n4e;

		nfs4_error_zinit(&n4e);
		nfs4setclientid(mi, kcred, TRUE, &n4e);
		error = n4e.error;
		if (error != 0) {

			/*
			 * nfs4setclientid may have set MI4R_NEED_NEW_SERVER,
			 * if so, just return and let recov_thread drive
			 * failover.
			 */
			mutex_enter(&mi->mi_lock);
			need_new_s = mi->mi_recovflags & MI4R_NEED_NEW_SERVER;
			mutex_exit(&mi->mi_lock);

			if (need_new_s) {
				nfs_rw_exit(&mi->mi_recovlock);
				nfs_rw_exit(&sp->s_recovlock);
				return;
			}

			nfs4_queue_event(RE_CLIENTID, mi, NULL, n4e.error, NULL,
			    NULL, n4e.stat, NULL, 0, TAG_NONE, TAG_NONE, 0, 0);
			mutex_enter(&mi->mi_lock);
			mi->mi_flags |= MI4_RECOV_FAIL;
			mi->mi_error = recovp->rc_error;
			mutex_exit(&mi->mi_lock);
			/* don't destroy the nfs4_server, let umount do it */
		}
	}

	if (error == 0) {
		mutex_enter(&mi->mi_lock);
		mi->mi_recovflags &= ~MI4R_NEED_CLIENTID;
		/*
		 * If still_stale isn't true, then another thread already
		 * recovered the clientid.
		 * And that thread that set the clientid will have
		 * initiated reopening files on all the filesystems for
		 * the server, so we should not initiate reopening for
		 * this filesystem here.
		 */
		if (still_stale) {
			mi->mi_recovflags |= MI4R_REOPEN_FILES;
			if (recovp->rc_srv_reboot)
				mi->mi_recovflags |= MI4R_SRV_REBOOT;
		}
		mutex_exit(&mi->mi_lock);
	}

	nfs_rw_exit(&mi->mi_recovlock);

	if (error != 0) {
		nfs_rw_exit(&sp->s_recovlock);
		mutex_enter(&mi->mi_lock);
		if ((mi->mi_flags & MI4_RECOV_FAIL) == 0)
			delay(SEC_TO_TICK(recov_err_delay));
		mutex_exit(&mi->mi_lock);
	} else {
		mntinfo4_t **milist;
		mntinfo4_t *tmi;
		int nummi, i;

		/*
		 * Initiate recovery of open files for other filesystems.
		 * We create an array of filesystems, rather than just
		 * walking the filesystem list, to avoid deadlock issues
		 * with s_lock and mi_recovlock.
		 */
		milist = make_milist(sp, &nummi);
		for (i = 0; i < nummi; i++) {
			tmi = milist[i];
			if (tmi != mi) {
				(void) nfs_rw_enter_sig(&tmi->mi_recovlock,
				    RW_READER, 0);
				start_recovery_action(NR_OPENFILES, TRUE, tmi,
				    NULL, NULL);
				nfs_rw_exit(&tmi->mi_recovlock);
			}
		}
		free_milist(milist, nummi);

		nfs_rw_exit(&sp->s_recovlock);
	}
}

/*
 * Return an array of filesystems associated with the given server.  The
 * caller should call free_milist() to free the references and memory.
 */

static mntinfo4_t **
make_milist(nfs4_server_t *sp, int *nummip)
{
	int nummi, i;
	mntinfo4_t **milist;
	mntinfo4_t *tmi;

	mutex_enter(&sp->s_lock);
	nummi = 0;
	for (tmi = sp->mntinfo4_list; tmi != NULL; tmi = tmi->mi_clientid_next)
		nummi++;

	milist = kmem_alloc(nummi * sizeof (mntinfo4_t *), KM_SLEEP);

	for (i = 0, tmi = sp->mntinfo4_list; tmi != NULL; i++,
	    tmi = tmi->mi_clientid_next) {
		milist[i] = tmi;
		VFS_HOLD(tmi->mi_vfsp);
	}
	mutex_exit(&sp->s_lock);

	*nummip = nummi;
	return (milist);
}

/*
 * Free the filesystem list created by make_milist().
 */

static void
free_milist(mntinfo4_t **milist, int nummi)
{
	mntinfo4_t *tmi;
	int i;

	for (i = 0; i < nummi; i++) {
		tmi = milist[i];
		VFS_RELE(tmi->mi_vfsp);
	}
	kmem_free(milist, nummi * sizeof (mntinfo4_t *));
}
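/*
 * Typical usage of the two routines above, as in recov_clientid():
 *
 *	milist = make_milist(sp, &nummi);
 *	for (i = 0; i < nummi; i++) {
 *		... use milist[i] without holding sp->s_lock ...
 *	}
 *	free_milist(milist, nummi);
 */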
1895 */
1896 static void
1897 recov_filehandle(nfs4_recov_t action, mntinfo4_t *mi, vnode_t *vp)
1898 {
1899 rnode4_t *rp = VTOR4(vp);
1900 nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };
1901 bool_t needrecov;
1902
1903 mutex_enter(&rp->r_statelock);
1904
1905 if (rp->r_flags & R4RECOVERR) {
1906 mutex_exit(&rp->r_statelock);
1907 return;
1908 }
1909
1910 /*
1911 * If someone else is updating the filehandle, wait for them to
1912 * finish and then let our caller retry.
1913 */
1914 if (rp->r_flags & R4RECEXPFH) {
1915 while (rp->r_flags & R4RECEXPFH) {
1916 cv_wait(&rp->r_cv, &rp->r_statelock);
1917 }
1918 mutex_exit(&rp->r_statelock);
1919 return;
1920 }
1921 rp->r_flags |= R4RECEXPFH;
1922 mutex_exit(&rp->r_statelock);
1923
1924 if (action == NR_BADHANDLE) {
1925 /* shouldn't happen */
1926 nfs4_queue_event(RE_BADHANDLE, mi, NULL, 0,
1927 vp, NULL, 0, NULL, 0, TAG_NONE, TAG_NONE, 0, 0);
1928 }
1929
1930 nfs4_remap_file(mi, vp, 0, &e);
1931 needrecov = nfs4_needs_recovery(&e, FALSE, mi->mi_vfsp);
1932
1933 /*
1934 * If the remap attempt itself draws BADHANDLE or FHEXPIRED, something
1935 * is broken. Don't try to recover, just mark the file dead.
1936 */
1937 if (needrecov && e.error == 0 &&
1938 (e.stat == NFS4ERR_BADHANDLE || e.stat == NFS4ERR_FHEXPIRED))
1939 needrecov = FALSE;
1940 if (needrecov) {
1941 (void) nfs4_start_recovery(&e, mi, vp,
1942 NULL, NULL, NULL, OP_LOOKUP, NULL);
1943 } else if (e.error != EINTR &&
1944 !NFS4_FRC_UNMT_ERR(e.error, mi->mi_vfsp) &&
1945 (e.error != 0 || e.stat != NFS4_OK)) {
1946 nfs4_recov_fh_fail(vp, e.error, e.stat);
1947 /*
1948 * Don't set r_error to ESTALE. Higher-level code (e.g.,
1949 * cstatat_getvp()) retries on ESTALE, which would cause
1950 * an infinite loop.
1951 */
1952 }
1953
1954 mutex_enter(&rp->r_statelock);
1955 rp->r_flags &= ~R4RECEXPFH;
1956 cv_broadcast(&rp->r_cv);
1957 mutex_exit(&rp->r_statelock);
1958 }
1959
1960 /*
1961 * Stale Filehandle
1962 */
1963
1964 /*
1965 * A stale filehandle can happen when an individual file has
1966 * been removed, or when an entire filesystem has been taken
1967 * offline.
To distinguish these cases, we do this: 1968 * - if a GETATTR with the current filehandle is okay, we do 1969 * nothing (this can happen with two-filehandle ops) 1970 * - if the GETATTR fails, but a GETATTR of the root filehandle 1971 * succeeds, mark the rnode with R4STALE, which will stop use 1972 * - if the GETATTR fails, and a GETATTR of the root filehandle 1973 * also fails, we consider the problem filesystem-wide, so: 1974 * - if we can failover, we should 1975 * - if we can't failover, we should mark both the original 1976 * vnode and the root bad 1977 */ 1978 static void 1979 recov_stale(mntinfo4_t *mi, vnode_t *vp) 1980 { 1981 rnode4_t *rp = VTOR4(vp); 1982 vnode_t *rootvp = NULL; 1983 nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS }; 1984 nfs4_ga_res_t gar; 1985 char *fail_msg = "failed to recover from NFS4ERR_STALE"; 1986 bool_t needrecov; 1987 1988 mutex_enter(&rp->r_statelock); 1989 1990 if (rp->r_flags & R4RECOVERR) { 1991 mutex_exit(&rp->r_statelock); 1992 NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE, 1993 "recov_stale: already marked dead, rp %s", 1994 rnode4info(rp))); 1995 return; 1996 } 1997 1998 if (rp->r_flags & R4STALE) { 1999 mutex_exit(&rp->r_statelock); 2000 NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE, 2001 "recov_stale: already marked stale, rp %s", 2002 rnode4info(rp))); 2003 return; 2004 } 2005 2006 mutex_exit(&rp->r_statelock); 2007 2008 /* Try a GETATTR on this vnode */ 2009 nfs4_getattr_otw_norecovery(vp, &gar, &e, CRED(), 0); 2010 2011 /* 2012 * Handle non-STALE recoverable errors 2013 */ 2014 needrecov = nfs4_needs_recovery(&e, FALSE, vp->v_vfsp); 2015 if (needrecov && (e.error != 0 || e.stat != NFS4ERR_STALE)) { 2016 (void) nfs4_start_recovery(&e, mi, vp, 2017 NULL, NULL, NULL, OP_GETATTR, NULL); 2018 NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE, 2019 "recov_stale: error=%d, stat=%d seen on rp %s", 2020 e.error, e.stat, rnode4info(rp))); 2021 goto out; 2022 } 2023 2024 /* Are things OK for this vnode? */ 2025 if (!e.error && e.stat == NFS4_OK) { 2026 NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE, 2027 "recov_stale: file appears fine, rp %s", 2028 rnode4info(rp))); 2029 goto out; 2030 } 2031 2032 /* Did we get an unrelated non-recoverable error? */ 2033 if (e.error || e.stat != NFS4ERR_STALE) { 2034 nfs4_fail_recov(vp, fail_msg, e.error, e.stat); 2035 NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE, 2036 "recov_stale: unrelated fatal error, rp %s", 2037 rnode4info(rp))); 2038 goto out; 2039 } 2040 2041 /* 2042 * If we don't appear to be dealing with the root node, find it. 2043 */ 2044 if ((vp->v_flag & VROOT) == 0) { 2045 nfs4_error_zinit(&e); 2046 e.error = VFS_ROOT(vp->v_vfsp, &rootvp); 2047 if (e.error) { 2048 nfs4_fail_recov(vp, fail_msg, 0, NFS4ERR_STALE); 2049 NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE, 2050 "recov_stale: can't find root node for rp %s", 2051 rnode4info(rp))); 2052 goto out; 2053 } 2054 } 2055 2056 /* Try a GETATTR on the root vnode */ 2057 if (rootvp != NULL) { 2058 nfs4_error_zinit(&e); 2059 nfs4_getattr_otw_norecovery(rootvp, &gar, &e, CRED(), 0); 2060 2061 /* Try recovery? 
*/ 2062 if (e.error != 0 || e.stat != NFS4ERR_STALE) { 2063 needrecov = nfs4_needs_recovery(&e, FALSE, vp->v_vfsp); 2064 if (needrecov) { 2065 (void) nfs4_start_recovery(&e, 2066 mi, rootvp, NULL, NULL, NULL, 2067 OP_GETATTR, NULL); 2068 NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE, 2069 "recov_stale: error=%d, stat=%d seen " 2070 "on rp %s", e.error, e.stat, 2071 rnode4info(rp))); 2072 } 2073 } 2074 2075 /* 2076 * Check to see if a failover attempt is warranted 2077 * NB: nfs4_try_failover doesn't check for STALE 2078 * because recov_stale gets a shot first. Now that 2079 * recov_stale has failed, go ahead and try failover. 2080 * 2081 * If the getattr on the root filehandle was successful, 2082 * then mark recovery as failed for 'vp' and exit. 2083 */ 2084 if (nfs4_try_failover(&e) == 0 && e.stat != NFS4ERR_STALE) { 2085 /* 2086 * pass the original error to fail_recov, not 2087 * the one from trying the root vnode. 2088 */ 2089 nfs4_fail_recov(vp, fail_msg, 0, NFS4ERR_STALE); 2090 NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE, 2091 "recov_stale: root node OK, marking " 2092 "dead rp %s", rnode4info(rp))); 2093 goto out; 2094 } 2095 } 2096 2097 /* 2098 * Here, we know that both the original file and the 2099 * root filehandle (which may be the same) are stale. 2100 * We want to fail over if we can, and if we can't, we 2101 * want to mark everything in sight bad. 2102 */ 2103 if (FAILOVER_MOUNT4(mi)) { 2104 mutex_enter(&mi->mi_lock); 2105 mi->mi_recovflags |= MI4R_NEED_NEW_SERVER; 2106 NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE, 2107 "recov_stale: failing over due to rp %s", 2108 rnode4info(rp))); 2109 mutex_exit(&mi->mi_lock); 2110 } else { 2111 rnode4_t *rootrp; 2112 servinfo4_t *svp; 2113 2114 /* 2115 * Can't fail over, so mark things dead. 2116 * 2117 * If rootvp is set, we know we have a distinct 2118 * non-root vnode which can be marked dead in 2119 * the usual way. 2120 * 2121 * Then we want to mark the root vnode dead. 2122 * Note that if rootvp wasn't set, our vp is 2123 * actually the root vnode. 2124 */ 2125 if (rootvp != NULL) { 2126 NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE, 2127 "recov_stale: can't fail over, marking dead rp %s", 2128 rnode4info(rp))); 2129 nfs4_fail_recov(vp, fail_msg, 0, NFS4ERR_STALE); 2130 } else { 2131 rootvp = vp; 2132 VN_HOLD(rootvp); 2133 } 2134 2135 /* 2136 * Mark root dead, but quietly - since 2137 * the root rnode is frequently recreated, 2138 * we can encounter this at every access. 2139 * Also mark recovery as failed on this VFS. 2140 */ 2141 rootrp = VTOR4(rootvp); 2142 NFS4_DEBUG(nfs4_client_recov_debug, (CE_CONT, 2143 "recov_stale: marking dead root rp %s", 2144 rnode4info(rootrp))); 2145 mutex_enter(&rootrp->r_statelock); 2146 rootrp->r_flags |= (R4RECOVERR | R4STALE); 2147 rootrp->r_error = ESTALE; 2148 mutex_exit(&rootrp->r_statelock); 2149 mutex_enter(&mi->mi_lock); 2150 mi->mi_error = ESTALE; 2151 mutex_exit(&mi->mi_lock); 2152 2153 svp = mi->mi_curr_serv; 2154 (void) nfs_rw_enter_sig(&svp->sv_lock, RW_WRITER, 0); 2155 svp->sv_flags |= SV4_ROOT_STALE; 2156 nfs_rw_exit(&svp->sv_lock); 2157 } 2158 2159 out: 2160 if (rootvp) 2161 VN_RELE(rootvp); 2162 } 2163 2164 /* 2165 * Locks. 2166 */ 2167 2168 /* 2169 * Reclaim all the active (acquired) locks for the given file. 2170 * If a process lost a lock, the process is sent a SIGLOST. This is not 2171 * considered an error. 
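 *
 * For example (illustrative only): if a reclaim for a process's write
 * lock fails, or the file changed while the lock was not held, that
 * process is sent SIGLOST, its remaining locks on the file are retagged
 * to NOPID (so no further reclaims are attempted on its behalf), and
 * processing continues with the next lock in the list.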
2172 * 2173 * Return values: 2174 * Errors and status are returned via the nfs4_error_t parameter 2175 * If an error indicates that recovery is needed, the caller is responsible 2176 * for dealing with it. 2177 */ 2178 2179 static void 2180 relock_file(vnode_t *vp, mntinfo4_t *mi, nfs4_error_t *ep, 2181 fattr4_change pre_change) 2182 { 2183 locklist_t *locks, *llp; 2184 rnode4_t *rp; 2185 2186 ASSERT(ep != NULL); 2187 nfs4_error_zinit(ep); 2188 2189 if (VTOMI4(vp)->mi_flags & MI4_LLOCK) 2190 return; 2191 2192 nfs4_flush_lock_owners(VTOR4(vp)); 2193 2194 /* 2195 * If we get an error that requires recovery actions, just bail out 2196 * and let the top-level recovery code handle it. 2197 * 2198 * If we get some other error, kill the process that owned the lock 2199 * and mark its remaining locks (if any) as belonging to NOPID, so 2200 * that we don't make any more reclaim requests for that process. 2201 */ 2202 2203 rp = VTOR4(vp); 2204 locks = flk_active_locks_for_vp(vp); 2205 for (llp = locks; llp != NULL; llp = llp->ll_next) { 2206 int did_reclaim = 1; 2207 2208 ASSERT(llp->ll_vp == vp); 2209 if (llp->ll_flock.l_pid == NOPID) 2210 continue; 2211 reclaim_one_lock(vp, &llp->ll_flock, ep, &did_reclaim); 2212 /* 2213 * If we need to restart recovery, stop processing the 2214 * list. Some errors would be recoverable under other 2215 * circumstances, but if they happen here we just give up 2216 * on the lock. 2217 */ 2218 if (nfs4_needs_recovery(ep, TRUE, vp->v_vfsp)) { 2219 if (ep->error != 0) 2220 break; 2221 if (!nfs4_recov_marks_dead(ep->stat)) 2222 break; 2223 } 2224 /* 2225 * In case the server isn't offering us a grace period, or 2226 * if we missed it, we might have opened & locked from scratch, 2227 * rather than reopened/reclaimed. 2228 * We need to ensure that the object hadn't been otherwise 2229 * changed during this time, by comparing the changeinfo. 2230 * We get passed the changeinfo from before the reopen by our 2231 * caller, in pre_change. 2232 * The changeinfo from after the reopen is in rp->r_change, 2233 * courtesy of the GETATTR in the reopen. 2234 * If they're different, then the file has changed, and we 2235 * have to SIGLOST the app. 2236 */ 2237 if (ep->error == 0 && ep->stat == NFS4_OK && !did_reclaim) { 2238 mutex_enter(&rp->r_statelock); 2239 if (pre_change != rp->r_change) 2240 ep->stat = NFS4ERR_NO_GRACE; 2241 mutex_exit(&rp->r_statelock); 2242 } 2243 if (ep->error != 0 || ep->stat != NFS4_OK) { 2244 if (ep->error != 0) 2245 nfs4_queue_event(RE_FAIL_RELOCK, mi, 2246 NULL, ep->error, vp, NULL, 0, NULL, 2247 llp->ll_flock.l_pid, TAG_NONE, TAG_NONE, 2248 0, 0); 2249 else 2250 nfs4_queue_event(RE_FAIL_RELOCK, mi, 2251 NULL, 0, vp, NULL, ep->stat, NULL, 2252 llp->ll_flock.l_pid, TAG_NONE, TAG_NONE, 2253 0, 0); 2254 nfs4_send_siglost(llp->ll_flock.l_pid, mi, vp, TRUE, 2255 ep->error, ep->stat); 2256 relock_skip_pid(llp, llp->ll_flock.l_pid); 2257 2258 /* Reinitialize the nfs4_error and continue */ 2259 nfs4_error_zinit(ep); 2260 } 2261 } 2262 2263 if (locks != NULL) 2264 flk_free_locklist(locks); 2265 } 2266 2267 /* 2268 * Reclaim the given lock. 2269 * If the lock can't be reclaimed, the process is sent SIGLOST, but this is 2270 * not considered an error. 2271 * 2272 * Errors are returned via the nfs4_error_t parameter. 
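 *
 * A minimal sketch of the retry policy implemented below (assuming
 * ep carries the nfs4frlock() results):
 *
 *	do {
 *		nfs4frlock(NFS4_LCK_CTYPE_RECLAIM, vp, F_SETLK, flk, ...);
 *		if (ep->error == 0 && ep->stat == NFS4ERR_FHEXPIRED)
 *			start_recovery_action(NR_FHEXPIRED, ...);
 *	} while (ep->error == 0 && ep->stat == NFS4ERR_FHEXPIRED);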
2273 */ 2274 static void 2275 reclaim_one_lock(vnode_t *vp, flock64_t *flk, nfs4_error_t *ep, 2276 int *did_reclaimp) 2277 { 2278 cred_t *cr; 2279 rnode4_t *rp = VTOR4(vp); 2280 2281 cr = pid_to_cr(flk->l_pid); 2282 if (cr == NULL) { 2283 nfs4_error_zinit(ep); 2284 ep->error = ESRCH; 2285 return; 2286 } 2287 2288 do { 2289 mutex_enter(&rp->r_statelock); 2290 if (rp->r_flags & R4RECOVERR) { 2291 /* 2292 * This shouldn't affect other reclaims, so don't 2293 * return an error. 2294 */ 2295 mutex_exit(&rp->r_statelock); 2296 break; 2297 } 2298 mutex_exit(&rp->r_statelock); 2299 2300 nfs4frlock(NFS4_LCK_CTYPE_RECLAIM, vp, F_SETLK, flk, 2301 FREAD|FWRITE, 0, cr, ep, NULL, did_reclaimp); 2302 if (ep->error == 0 && ep->stat == NFS4ERR_FHEXPIRED) 2303 start_recovery_action(NR_FHEXPIRED, TRUE, VTOMI4(vp), 2304 vp, NULL); 2305 } while (ep->error == 0 && ep->stat == NFS4ERR_FHEXPIRED); 2306 2307 crfree(cr); 2308 } 2309 2310 /* 2311 * Open files. 2312 */ 2313 2314 /* 2315 * Verifies if the nfsstat4 is a valid error for marking this vnode dead. 2316 * Returns 1 if the error is valid; 0 otherwise. 2317 */ 2318 static int 2319 nfs4_valid_recov_err_for_vp(vnode_t *vp, nfsstat4 stat) 2320 { 2321 /* 2322 * We should not be marking non-regular files as dead, 2323 * except in very rare cases (eg: BADHANDLE or NFS4ERR_BADNAME). 2324 */ 2325 if (vp->v_type != VREG && stat != NFS4ERR_BADHANDLE && 2326 stat != NFS4ERR_BADNAME) 2327 return (0); 2328 2329 return (1); 2330 } 2331 2332 /* 2333 * Failed attempting to recover a filehandle. If 'stat' is valid for 'vp', 2334 * then mark the object dead. Since we've had to do a lookup for 2335 * filehandle recovery, we will mark the object dead if we got NOENT. 2336 */ 2337 static void 2338 nfs4_recov_fh_fail(vnode_t *vp, int error, nfsstat4 stat) 2339 { 2340 ASSERT(vp != NULL); 2341 2342 if ((error == 0) && (stat != NFS4ERR_NOENT) && 2343 (!nfs4_valid_recov_err_for_vp(vp, stat))) 2344 return; 2345 2346 nfs4_fail_recov(vp, "can't recover filehandle", error, stat); 2347 } 2348 2349 /* 2350 * Recovery from a "shouldn't happen" error. In the long term, we'd like 2351 * to mark only the data structure(s) that provided the bad value as being 2352 * bad. But for now we'll just mark the entire file. 2353 */ 2354 2355 static void 2356 recov_badstate(recov_info_t *recovp, vnode_t *vp, nfsstat4 stat) 2357 { 2358 ASSERT(vp != NULL); 2359 recov_throttle(recovp, vp); 2360 2361 if (!nfs4_valid_recov_err_for_vp(vp, stat)) 2362 return; 2363 2364 nfs4_fail_recov(vp, "", 0, stat); 2365 } 2366 2367 /* 2368 * Free up the information saved for a lost state request. 2369 */ 2370 static void 2371 nfs4_free_lost_rqst(nfs4_lost_rqst_t *lrp, nfs4_server_t *sp) 2372 { 2373 component4 *filep; 2374 nfs4_open_stream_t *osp; 2375 int have_sync_lock; 2376 2377 NFS4_DEBUG(nfs4_lost_rqst_debug, 2378 (CE_NOTE, "nfs4_free_lost_rqst:")); 2379 2380 switch (lrp->lr_op) { 2381 case OP_OPEN: 2382 filep = &lrp->lr_ofile; 2383 if (filep->utf8string_val) { 2384 kmem_free(filep->utf8string_val, filep->utf8string_len); 2385 filep->utf8string_val = NULL; 2386 } 2387 break; 2388 case OP_DELEGRETURN: 2389 nfs4delegreturn_cleanup(VTOR4(lrp->lr_vp), sp); 2390 break; 2391 case OP_CLOSE: 2392 osp = lrp->lr_osp; 2393 ASSERT(osp != NULL); 2394 mutex_enter(&osp->os_sync_lock); 2395 have_sync_lock = 1; 2396 if (osp->os_pending_close) { 2397 /* clean up the open file state. 
*/
2398 osp->os_pending_close = 0;
2399 nfs4close_notw(lrp->lr_vp, osp, &have_sync_lock);
2400 }
2401 if (have_sync_lock)
2402 mutex_exit(&osp->os_sync_lock);
2403 break;
2404 }
2405
2406 lrp->lr_op = 0;
2407 if (lrp->lr_oop != NULL) {
2408 open_owner_rele(lrp->lr_oop);
2409 lrp->lr_oop = NULL;
2410 }
2411 if (lrp->lr_osp != NULL) {
2412 open_stream_rele(lrp->lr_osp, VTOR4(lrp->lr_vp));
2413 lrp->lr_osp = NULL;
2414 }
2415 if (lrp->lr_lop != NULL) {
2416 lock_owner_rele(lrp->lr_lop);
2417 lrp->lr_lop = NULL;
2418 }
2419 if (lrp->lr_flk != NULL) {
2420 kmem_free(lrp->lr_flk, sizeof (flock64_t));
2421 lrp->lr_flk = NULL;
2422 }
2423 if (lrp->lr_vp != NULL) {
2424 VN_RELE(lrp->lr_vp);
2425 lrp->lr_vp = NULL;
2426 }
2427 if (lrp->lr_dvp != NULL) {
2428 VN_RELE(lrp->lr_dvp);
2429 lrp->lr_dvp = NULL;
2430 }
2431 if (lrp->lr_cr != NULL) {
2432 crfree(lrp->lr_cr);
2433 lrp->lr_cr = NULL;
2434 }
2435
2436 kmem_free(lrp, sizeof (nfs4_lost_rqst_t));
2437 }
2438
2439 /*
2440 * Remove any lost state requests and free them.
2441 */
2442 static void
2443 nfs4_remove_lost_rqsts(mntinfo4_t *mi, nfs4_server_t *sp)
2444 {
2445 nfs4_lost_rqst_t *lrp;
2446
2447 mutex_enter(&mi->mi_lock);
2448 while ((lrp = list_head(&mi->mi_lost_state)) != NULL) {
2449 list_remove(&mi->mi_lost_state, lrp);
2450 mutex_exit(&mi->mi_lock);
2451 nfs4_free_lost_rqst(lrp, sp);
2452 mutex_enter(&mi->mi_lock);
2453 }
2454 mutex_exit(&mi->mi_lock);
2455 }
2456
2457 /*
2458 * Reopen all the files for the given filesystem and reclaim any locks.
2459 */
2460
2461 static void
2462 recov_openfiles(recov_info_t *recovp, nfs4_server_t *sp)
2463 {
2464 mntinfo4_t *mi = recovp->rc_mi;
2465 nfs4_opinst_t *reopenlist = NULL, *rep;
2466 nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };
2467 open_claim_type4 claim;
2468 int remap;
2469 char *fail_msg = "No such file or directory on replica";
2470 rnode4_t *rp;
2471 fattr4_change pre_change;
2472
2473 ASSERT(sp != NULL);
2474
2475 /*
2476 * This check allows a 10ms pause before we reopen files. It
2477 * should give the server time to have received the CB_NULL
2478 * reply and updated its internal structures, such that (if
2479 * applicable) we are granted a delegation on reopened files.
2480 */
2481 mutex_enter(&sp->s_lock);
2482 if ((sp->s_flags & (N4S_CB_PINGED | N4S_CB_WAITER)) == 0) {
2483 sp->s_flags |= N4S_CB_WAITER;
2484 (void) cv_timedwait(&sp->wait_cb_null, &sp->s_lock,
2485 (lbolt + drv_usectohz(N4S_CB_PAUSE_TIME)));
2486 }
2487 mutex_exit(&sp->s_lock);
2488
2489 (void) nfs_rw_enter_sig(&sp->s_recovlock, RW_READER, 0);
2490 (void) nfs_rw_enter_sig(&mi->mi_recovlock, RW_WRITER, 0);
2491
2492 if (NFS4_VOLATILE_FH(mi)) {
2493 nfs4_remap_root(mi, &e, 0);
2494 if (nfs4_needs_recovery(&e, FALSE, mi->mi_vfsp)) {
2495 (void) nfs4_start_recovery(&e, mi, NULL,
2496 NULL, NULL, NULL, OP_LOOKUP, NULL);
2497 }
2498 }
2499
2500 mutex_enter(&mi->mi_lock);
2501 if (recovp->rc_srv_reboot || (mi->mi_recovflags & MI4R_SRV_REBOOT))
2502 claim = CLAIM_PREVIOUS;
2503 else
2504 claim = CLAIM_NULL;
2505 mutex_exit(&mi->mi_lock);
2506
2507 if (e.error == 0 && e.stat == NFS4_OK) {
2508 /*
2509 * Get a snapshot of open files in the filesystem. Note
2510 * that new opens will stall until the server's grace
2511 * period is done.
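 *
 * The loop below then walks that snapshot, roughly as follows
 * (sketch only):
 *
 *	for each open instance rep on reopenlist:
 *		remap the file if MI4R_REMAP_FILES is set;
 *		nfs4_reopen() each open stream, using CLAIM_PREVIOUS
 *		    if the server rebooted, else CLAIM_NULL;
 *		relock_file() to reclaim byte-range locks, passing the
 *		    change attribute sampled before the reopen.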
2512 */
2513 reopenlist = r4mkopenlist(mi);
2514
2515 mutex_enter(&mi->mi_lock);
2516 remap = mi->mi_recovflags & MI4R_REMAP_FILES;
2517 mutex_exit(&mi->mi_lock);
2518 /*
2519 * Since we are re-establishing state on the
2520 * server, it's OK to blow away the saved lost
2521 * requests, since we don't need to reissue them.
2522 */
2523 nfs4_remove_lost_rqsts(mi, sp);
2524
2525 for (rep = reopenlist; rep; rep = rep->re_next) {
2526
2527 if (remap) {
2528 nfs4_remap_file(mi, rep->re_vp,
2529 NFS4_REMAP_CKATTRS, &e);
2530 }
2531 if (e.error == ENOENT || e.stat == NFS4ERR_NOENT) {
2532 /*
2533 * The current server does not have the file
2534 * that is to be remapped. This is most
2535 * likely due to an improperly maintained
2536 * replica. The files that are missing from
2537 * the server will be marked dead and logged
2538 * in order to make sys admins aware of the
2539 * problem.
2540 */
2541 nfs4_fail_recov(rep->re_vp,
2542 fail_msg, e.error, e.stat);
2543 /*
2544 * We've already handled the error so clear it.
2545 */
2546 nfs4_error_zinit(&e);
2547 continue;
2548 } else if (e.error == 0 && e.stat == NFS4_OK) {
2549 int j;
2550
2551 rp = VTOR4(rep->re_vp);
2552 mutex_enter(&rp->r_statelock);
2553 pre_change = rp->r_change;
2554 mutex_exit(&rp->r_statelock);
2555
2556 for (j = 0; j < rep->re_numosp; j++) {
2557 nfs4_reopen(rep->re_vp, rep->re_osp[j],
2558 &e, claim, FALSE, TRUE);
2559 if (e.error != 0 || e.stat != NFS4_OK)
2560 break;
2561 }
2562 if (nfs4_needs_recovery(&e, TRUE,
2563 mi->mi_vfsp)) {
2564 (void) nfs4_start_recovery(&e, mi,
2565 rep->re_vp, NULL, NULL, NULL,
2566 OP_OPEN, NULL);
2567 break;
2568 }
2569 }
2570 #ifdef DEBUG
2571 if (nfs4_recovdelay > 0)
2572 delay(MSEC_TO_TICK(nfs4_recovdelay * 1000));
2573 #endif
2574 if (e.error == 0 && e.stat == NFS4_OK)
2575 relock_file(rep->re_vp, mi, &e, pre_change);
2576
2577 if (nfs4_needs_recovery(&e, TRUE, mi->mi_vfsp))
2578 (void) nfs4_start_recovery(&e, mi,
2579 rep->re_vp, NULL, NULL, NULL, OP_LOCK,
2580 NULL);
2581 if (e.error != 0 || e.stat != NFS4_OK)
2582 break;
2583 }
2584
2585 /*
2586 * Check to see if we need to remap files passed in
2587 * via the recovery arguments; this will have been
2588 * done for open files. A failure here is not fatal.
2589 */
2590 if (remap) {
2591 nfs4_error_t ignore;
2592 nfs4_check_remap(mi, recovp->rc_vp1, NFS4_REMAP_CKATTRS,
2593 &ignore);
2594 nfs4_check_remap(mi, recovp->rc_vp2, NFS4_REMAP_CKATTRS,
2595 &ignore);
2596 }
2597 }
2598
2599 if (e.error == 0 && e.stat == NFS4_OK) {
2600 mutex_enter(&mi->mi_lock);
2601 mi->mi_recovflags &= ~(MI4R_REOPEN_FILES | MI4R_REMAP_FILES);
2602 mutex_exit(&mi->mi_lock);
2603 }
2604
2605 nfs_rw_exit(&mi->mi_recovlock);
2606 nfs_rw_exit(&sp->s_recovlock);
2607
2608 if (reopenlist != NULL)
2609 r4releopenlist(reopenlist);
2610 }
2611
2612 /*
2613 * Resend the state recovery requests queued on the filesystem's
mi_lost_state list.
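 *
 * The list is drained head-first (sketch): take the head under
 * mi_lock, drop the lock, resend via resend_one_op(), and remove and
 * free the entry only if the resend is not worth retrying; a
 * recoverable error returns early and leaves the entry queued for
 * the recovery thread to redrive.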
2614 */
2615
2616 static void
2617 nfs4_resend_lost_rqsts(recov_info_t *recovp, nfs4_server_t *sp)
2618 {
2619 nfs4_lost_rqst_t *lrp, *tlrp;
2620 mntinfo4_t *mi = recovp->rc_mi;
2621 nfs4_error_t n4e;
2622 #ifdef NOTYET
2623 uint32_t deny_bits = 0;
2624 #endif
2625
2626 NFS4_DEBUG(nfs4_lost_rqst_debug, (CE_NOTE, "nfs4_resend_lost_rqsts"));
2627
2628 ASSERT(mi != NULL);
2629 ASSERT(nfs_rw_lock_held(&mi->mi_recovlock, RW_WRITER));
2630
2631 mutex_enter(&mi->mi_lock);
2632 lrp = list_head(&mi->mi_lost_state);
2633 mutex_exit(&mi->mi_lock);
2634 while (lrp != NULL) {
2635 nfs4_error_zinit(&n4e);
2636 resend_one_op(lrp, &n4e, mi, sp);
2637 NFS4_DEBUG(nfs4_lost_rqst_debug, (CE_NOTE,
2638 "nfs4_resend_lost_rqsts: resend request: for vp %p got "
2639 "error %d stat %d", (void *)lrp->lr_vp, n4e.error,
2640 n4e.stat));
2641
2642 /*
2643 * If we get a recovery error that we can actually
2644 * recover from (such as ETIMEDOUT, FHEXPIRED), we
2645 * return and let the recovery thread redrive the call.
2646 * Don't requeue unless the zone is still healthy.
2647 */
2648 if (zone_status_get(curproc->p_zone) < ZONE_IS_SHUTTING_DOWN &&
2649 nfs4_needs_recovery(&n4e, TRUE, mi->mi_vfsp) &&
2650 (nfs4_try_failover(&n4e) ||
2651 NFS4_FRC_UNMT_ERR(n4e.error, mi->mi_vfsp) ||
2652 (n4e.error == 0 && n4e.stat != NFS4ERR_BADHANDLE &&
2653 !nfs4_recov_marks_dead(n4e.stat)))) {
2654 /*
2655 * For the errors checked below, we want to delay a bit
2656 * instead of pounding the server into submission.
2657 * We have to do this manually; the normal
2658 * processing for these errors only works for
2659 * non-recovery requests.
2660 */
2661 if ((n4e.error == 0 && n4e.stat == NFS4ERR_DELAY) ||
2662 (n4e.error == 0 && n4e.stat == NFS4ERR_GRACE) ||
2663 (n4e.error == 0 && n4e.stat == NFS4ERR_RESOURCE) ||
2664 NFS4_FRC_UNMT_ERR(n4e.error, mi->mi_vfsp)) {
2665 delay(SEC_TO_TICK(nfs4err_delay_time));
2666 } else {
2667 (void) nfs4_start_recovery(&n4e,
2668 mi, lrp->lr_dvp, lrp->lr_vp, NULL, NULL,
2669 lrp->lr_op, NULL);
2670 }
2671 return;
2672 }
2673
2674 mutex_enter(&mi->mi_lock);
2675 list_remove(&mi->mi_lost_state, lrp);
2676 tlrp = lrp;
2677 lrp = list_head(&mi->mi_lost_state);
2678 mutex_exit(&mi->mi_lock);
2679 nfs4_free_lost_rqst(tlrp, sp);
2680 }
2681 }
2682
2683 /*
2684 * Resend the given op, and issue any necessary undo call.
2685 * Errors are returned via the nfs4_error_t parameter.
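 *
 * Dispatch is by lr_op (see the switch below): OP_OPEN resends via
 * nfs4_resend_open_otw(), OP_OPEN_DOWNGRADE via nfs4_open_downgrade(),
 * OP_CLOSE via nfs4close_one(CLOSE_RESEND), OP_LOCK/OP_LOCKU via
 * resend_lock(), and OP_DELEGRETURN via nfs4_resend_delegreturn().
 * A successfully resent OPEN is then undone with
 * close_after_open_resend().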
2686 */
2687
2688 static void
2689 resend_one_op(nfs4_lost_rqst_t *lrp, nfs4_error_t *ep,
2690 mntinfo4_t *mi, nfs4_server_t *sp)
2691 {
2692 vnode_t *vp;
2693 nfs4_open_stream_t *osp;
2694 cred_t *cr;
2695 uint32_t acc_bits;
2696
2697 vp = lrp->lr_vp;
2698 NFS4_DEBUG(nfs4_lost_rqst_debug, (CE_NOTE, "resend_one_op: "
2699 "have a lost open/close request for vp %p", (void *)vp));
2700
2701 switch (lrp->lr_op) {
2702 case OP_OPEN:
2703 nfs4_resend_open_otw(&vp, lrp, ep);
2704 break;
2705 case OP_OPEN_DOWNGRADE:
2706 ASSERT(lrp->lr_oop != NULL);
2707 ep->error = nfs4_start_open_seqid_sync(lrp->lr_oop, mi);
2708 ASSERT(!ep->error); /* recov thread always succeeds */
2709 ASSERT(lrp->lr_osp != NULL);
2710 mutex_enter(&lrp->lr_osp->os_sync_lock);
2711 nfs4_open_downgrade(lrp->lr_dg_acc, lrp->lr_dg_deny,
2712 lrp->lr_oop, lrp->lr_osp, vp, lrp->lr_cr, lrp,
2713 ep, NULL, NULL);
2714 mutex_exit(&lrp->lr_osp->os_sync_lock);
2715 nfs4_end_open_seqid_sync(lrp->lr_oop);
2716 break;
2717 case OP_CLOSE:
2718 osp = lrp->lr_osp;
2719 cr = lrp->lr_cr;
2720 acc_bits = 0;
2721 mutex_enter(&osp->os_sync_lock);
2722 if (osp->os_share_acc_read)
2723 acc_bits |= OPEN4_SHARE_ACCESS_READ;
2724 if (osp->os_share_acc_write)
2725 acc_bits |= OPEN4_SHARE_ACCESS_WRITE;
2726 mutex_exit(&osp->os_sync_lock);
2727 nfs4close_one(vp, osp, cr, acc_bits, lrp, ep,
2728 CLOSE_RESEND, 0, 0, 0);
2729 break;
2730 case OP_LOCK:
2731 case OP_LOCKU:
2732 resend_lock(lrp, ep);
2733 goto done;
2734 case OP_DELEGRETURN:
2735 nfs4_resend_delegreturn(lrp, ep, sp);
2736 goto done;
2737 default:
2738 #ifdef DEBUG
2739 cmn_err(CE_PANIC, "resend_one_op: unexpected op: %d",
2740 lrp->lr_op);
2741 #endif
2742 nfs4_queue_event(RE_LOST_STATE_BAD_OP, mi, NULL,
2743 lrp->lr_op, lrp->lr_vp, lrp->lr_dvp, NFS4_OK, NULL, 0,
2744 TAG_NONE, TAG_NONE, 0, 0);
2745 nfs4_error_init(ep, EINVAL);
2746 return;
2747 }
2748
2749 /*
2750 * No need to retry or send an "undo" CLOSE in the
2751 * event the server rebooted.
2752 */
2753 if (ep->error == 0 && (ep->stat == NFS4ERR_STALE_CLIENTID ||
2754 ep->stat == NFS4ERR_STALE_STATEID || ep->stat == NFS4ERR_EXPIRED))
2755 goto done;
2756
2757 /*
2758 * If we resent a CLOSE or OPEN_DOWNGRADE, there's nothing
2759 * to undo. Undoing locking operations was handled by
2760 * resend_lock().
2761 */
2762 if (lrp->lr_op == OP_OPEN_DOWNGRADE || lrp->lr_op == OP_CLOSE)
2763 goto done;
2764
2765 /*
2766 * If we get any other error for OPEN, then don't attempt
2767 * to undo the resend of the open (since it was never
2768 * successful!).
2769 */
2770 ASSERT(lrp->lr_op == OP_OPEN);
2771 if (ep->error || ep->stat != NFS4_OK)
2772 goto done;
2773
2774 /*
2775 * Now let's undo our OPEN.
2776 */
2777 nfs4_error_zinit(ep);
2778 close_after_open_resend(vp, lrp->lr_cr, lrp->lr_oacc, ep);
2779 NFS4_DEBUG(nfs4_lost_rqst_debug, (CE_NOTE, "resend_one_op: "
2780 "nfs4close_one: for vp %p got error %d stat %d",
2781 (void *)vp, ep->error, ep->stat));
2782
2783 done:
2784 if (vp != lrp->lr_vp)
2785 VN_RELE(vp);
2786 }
2787
2788 /*
2789 * Close a file that was opened via a resent OPEN.
2790 * Most errors are passed back to the caller via the nfs4_error_t
2791 * parameter, except for FHEXPIRED, which is retried.
2792 *
2793 * It might be conceptually cleaner to push the CLOSE request onto the
2794 * front of the resend queue, rather than sending it here. That would
2795 * match the way we undo lost lock requests. On the other
2796 * hand, we've already got something that works, and there's no reason to
2797 * change it at this time.
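 *
 * The call site in resend_one_op() above looks like this:
 *
 *	nfs4_error_zinit(ep);
 *	close_after_open_resend(vp, lrp->lr_cr, lrp->lr_oacc, ep);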
2798 */
2799
2800 static void
2801 close_after_open_resend(vnode_t *vp, cred_t *cr, uint32_t acc_bits,
2802 nfs4_error_t *ep)
2803 {
2804
2805 for (;;) {
2806 nfs4close_one(vp, NULL, cr, acc_bits, NULL, ep,
2807 CLOSE_AFTER_RESEND, 0, 0, 0);
2808 if (ep->error == 0 && ep->stat == NFS4_OK)
2809 break; /* success; done */
2810 if (ep->error != 0 || ep->stat != NFS4ERR_FHEXPIRED)
2811 break;
2812 /* else retry FHEXPIRED */
2813 }
2814
2815 }
2816
2817 /*
2818 * Resend the given lost lock request. Errors and status are returned
2819 * via the nfs4_error_t parameter.
2820 *
2821 * Issue a SIGLOST and mark the rnode dead if we get a non-recovery error or
2822 * a recovery error that we don't actually recover from yet (eg: BAD_SEQID).
2823 * Let the recovery thread redrive the call if we get a recovery error that
2824 * we can actually recover from.
2825 */
2826 static void
2827 resend_lock(nfs4_lost_rqst_t *lrp, nfs4_error_t *ep)
2828 {
2829 bool_t send_siglost = FALSE;
2830 vnode_t *vp = lrp->lr_vp;
2831
2832 NFS4_DEBUG(nfs4_lost_rqst_debug, (CE_NOTE, "resend_lock:"));
2833 ASSERT(lrp->lr_ctype == NFS4_LCK_CTYPE_REINSTATE ||
2834 lrp->lr_ctype == NFS4_LCK_CTYPE_RESEND);
2835
2836 nfs4frlock(lrp->lr_ctype, vp, F_SETLK,
2837 lrp->lr_flk, FREAD|FWRITE, 0, lrp->lr_cr, ep, lrp, NULL);
2838
2839 NFS4_DEBUG(nfs4_lost_rqst_debug, (CE_NOTE, "resend_lock: "
2840 "nfs4frlock for vp %p returned error %d, stat %d",
2841 (void *)vp, ep->error, ep->stat));
2842
2843 if (ep->error == 0 && ep->stat == 0)
2844 goto done;
2845 if (ep->error == 0 && ep->stat == NFS4ERR_DENIED &&
2846 lrp->lr_ctype == NFS4_LCK_CTYPE_RESEND)
2847 goto done;
2848
2849 /*
2850 * If we failed with a non-recovery error, send SIGLOST and
2851 * mark the file dead.
2852 */
2853 if (!nfs4_needs_recovery(ep, TRUE, vp->v_vfsp))
2854 send_siglost = TRUE;
2855 else {
2856 /*
2857 * Done with recovering LOST LOCK in the event the
2858 * server rebooted or we've lost the lease.
2859 */
2860 if (ep->error == 0 && (ep->stat == NFS4ERR_STALE_CLIENTID ||
2861 ep->stat == NFS4ERR_STALE_STATEID ||
2862 ep->stat == NFS4ERR_EXPIRED)) {
2863 goto done;
2864 }
2865
2866 /*
2867 * BAD_STATEID on an unlock indicates that the server has
2868 * forgotten about the lock anyway, so act like the call
2869 * was successful.
2870 */
2871 if (ep->error == 0 && ep->stat == NFS4ERR_BAD_STATEID &&
2872 lrp->lr_op == OP_LOCKU)
2873 goto done;
2874
2875 /*
2876 * If we got a recovery error that we don't actually
2877 * recover from, send SIGLOST. If the filesystem was
2878 * forcibly unmounted, we skip the SIGLOST because (a) it's
2879 * unnecessary noise, and (b) there could be a new process
2880 * with the same pid as the one that had generated the lost
2881 * state request.
2882 */
2883 if (ep->error == 0 && (ep->stat == NFS4ERR_BADHANDLE ||
2884 nfs4_recov_marks_dead(ep->stat))) {
2885 if (!(vp->v_vfsp->vfs_flag & VFS_UNMOUNTED))
2886 send_siglost = TRUE;
2887 goto done;
2888 }
2889
2890 /*
2891 * If the filesystem was forcibly unmounted, we
2892 * still need to synchronize with the server and
2893 * release state. Try again later.
2894 */
2895 if (NFS4_FRC_UNMT_ERR(ep->error, vp->v_vfsp))
2896 goto done;
2897
2898 /*
2899 * If we get a recovery error that we can actually
2900 * recover from (such as ETIMEDOUT, FHEXPIRED),
2901 * return and let the recovery thread redrive the call.
2902 *
2903 * For the three errors below, we want to delay a bit
2904 * instead of pounding the server into submission.
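 * (The three are NFS4ERR_DELAY, NFS4ERR_GRACE and NFS4ERR_RESOURCE,
 * tested just below; recov_err_delay is the pause, in seconds.)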
2905 */
2906 if ((ep->error == 0 && ep->stat == NFS4ERR_DELAY) ||
2907 (ep->error == 0 && ep->stat == NFS4ERR_GRACE) ||
2908 (ep->error == 0 && ep->stat == NFS4ERR_RESOURCE))
2909 delay(SEC_TO_TICK(recov_err_delay));
2910 goto done;
2911 }
2912
2913 done:
2914 if (send_siglost) {
2915 cred_t *sv_cred;
2916
2917 /*
2918 * Must be root or the actual thread being issued the
2919 * SIGLOST for this to work, so just become root.
2920 */
2921 sv_cred = curthread->t_cred;
2922 curthread->t_cred = kcred;
2923 nfs4_send_siglost(lrp->lr_flk->l_pid, VTOMI4(vp), vp, FALSE,
2924 ep->error, ep->stat);
2925 curthread->t_cred = sv_cred;
2926
2927 /*
2928 * Flush any additional reinstantiation requests for
2929 * this operation. Sending multiple SIGLOSTs to the user
2930 * process is unlikely to help and may cause trouble.
2931 */
2932 if (lrp->lr_ctype == NFS4_LCK_CTYPE_REINSTATE)
2933 flush_reinstate(lrp);
2934 }
2935 }
2936
2937 /*
2938 * Remove any lock reinstantiation requests that correspond to the given
2939 * lost request. We only remove items that follow lrp in the queue,
2940 * assuming that lrp will be removed by the generic lost state code.
2941 */
2942
2943 static void
2944 flush_reinstate(nfs4_lost_rqst_t *lrp)
2945 {
2946 vnode_t *vp;
2947 pid_t pid;
2948 mntinfo4_t *mi;
2949 nfs4_lost_rqst_t *nlrp;
2950
2951 vp = lrp->lr_vp;
2952 mi = VTOMI4(vp);
2953 pid = lrp->lr_flk->l_pid;
2954
2955 /*
2956 * If there are any more reinstantiation requests to get rid of,
2957 * they should all be clustered at the front of the lost state
2958 * queue.
2959 */
2960 mutex_enter(&mi->mi_lock);
2961 for (lrp = list_next(&mi->mi_lost_state, lrp); lrp != NULL;
2962 lrp = nlrp) {
2963 nlrp = list_next(&mi->mi_lost_state, lrp);
2964 if (lrp->lr_op != OP_LOCK && lrp->lr_op != OP_LOCKU)
2965 break;
2966 if (lrp->lr_ctype != NFS4_LCK_CTYPE_REINSTATE)
2967 break;
2968 ASSERT(lrp->lr_vp == vp);
2969 ASSERT(lrp->lr_flk->l_pid == pid);
2970 NFS4_DEBUG(nfs4_lost_rqst_debug, (CE_NOTE,
2971 "remove reinstantiation %p", (void *)lrp));
2972 list_remove(&mi->mi_lost_state, lrp);
2973 nfs4_free_lost_rqst(lrp, NULL);
2974 }
2975 mutex_exit(&mi->mi_lock);
2976 }
2977
2978 /*
2979 * End of state-specific recovery routines.
2980 */
2981
2982 /*
2983 * Allocate a lost request struct, initialize it from lost_rqstp (including
2984 * bumping the reference counts for the referenced vnode, etc.), and hang
2985 * it off of recovp.
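 *
 * Sketch of the reference counting done by the copy below: the vnodes
 * get VN_HOLD(), the open owner, open stream and lock owner get their
 * respective _hold() routines, the credential gets crhold(), and the
 * flock64_t is deep-copied so the caller's copy can go away.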
2986 */ 2987 2988 static void 2989 nfs4_save_lost_rqst(nfs4_lost_rqst_t *lost_rqstp, recov_info_t *recovp, 2990 nfs4_recov_t *action, mntinfo4_t *mi) 2991 { 2992 nfs4_lost_rqst_t *destp; 2993 2994 ASSERT(recovp->rc_lost_rqst == NULL); 2995 2996 destp = kmem_alloc(sizeof (nfs4_lost_rqst_t), KM_SLEEP); 2997 recovp->rc_lost_rqst = destp; 2998 2999 if (lost_rqstp->lr_op == OP_LOCK || 3000 lost_rqstp->lr_op == OP_LOCKU) { 3001 ASSERT(lost_rqstp->lr_lop); 3002 *action = NR_LOST_LOCK; 3003 destp->lr_ctype = lost_rqstp->lr_ctype; 3004 destp->lr_locktype = lost_rqstp->lr_locktype; 3005 } else if (lost_rqstp->lr_op == OP_OPEN) { 3006 component4 *srcfp, *destfp; 3007 3008 destp->lr_oacc = lost_rqstp->lr_oacc; 3009 destp->lr_odeny = lost_rqstp->lr_odeny; 3010 destp->lr_oclaim = lost_rqstp->lr_oclaim; 3011 if (lost_rqstp->lr_oclaim == CLAIM_DELEGATE_CUR) 3012 destp->lr_ostateid = lost_rqstp->lr_ostateid; 3013 3014 srcfp = &lost_rqstp->lr_ofile; 3015 destfp = &destp->lr_ofile; 3016 /* 3017 * Consume caller's utf8string 3018 */ 3019 destfp->utf8string_len = srcfp->utf8string_len; 3020 destfp->utf8string_val = srcfp->utf8string_val; 3021 srcfp->utf8string_len = 0; 3022 srcfp->utf8string_val = NULL; /* make sure not reused */ 3023 3024 *action = NR_LOST_STATE_RQST; 3025 } else if (lost_rqstp->lr_op == OP_OPEN_DOWNGRADE) { 3026 destp->lr_dg_acc = lost_rqstp->lr_dg_acc; 3027 destp->lr_dg_deny = lost_rqstp->lr_dg_deny; 3028 3029 *action = NR_LOST_STATE_RQST; 3030 } else if (lost_rqstp->lr_op == OP_CLOSE) { 3031 ASSERT(lost_rqstp->lr_oop); 3032 *action = NR_LOST_STATE_RQST; 3033 } else if (lost_rqstp->lr_op == OP_DELEGRETURN) { 3034 *action = NR_LOST_STATE_RQST; 3035 } else { 3036 #ifdef DEBUG 3037 cmn_err(CE_PANIC, "nfs4_save_lost_rqst: bad op %d", 3038 lost_rqstp->lr_op); 3039 #endif 3040 nfs4_queue_event(RE_LOST_STATE_BAD_OP, mi, NULL, 3041 lost_rqstp->lr_op, lost_rqstp->lr_vp, lost_rqstp->lr_dvp, 3042 NFS4_OK, NULL, curproc->p_pid, TAG_NONE, TAG_NONE, 0, 0); 3043 *action = NR_UNUSED; 3044 recovp->rc_lost_rqst = NULL; 3045 kmem_free(destp, sizeof (nfs4_lost_rqst_t)); 3046 return; 3047 } 3048 3049 destp->lr_op = lost_rqstp->lr_op; 3050 destp->lr_vp = lost_rqstp->lr_vp; 3051 if (destp->lr_vp) 3052 VN_HOLD(destp->lr_vp); 3053 destp->lr_dvp = lost_rqstp->lr_dvp; 3054 if (destp->lr_dvp) 3055 VN_HOLD(destp->lr_dvp); 3056 destp->lr_oop = lost_rqstp->lr_oop; 3057 if (destp->lr_oop) 3058 open_owner_hold(destp->lr_oop); 3059 destp->lr_osp = lost_rqstp->lr_osp; 3060 if (destp->lr_osp) 3061 open_stream_hold(destp->lr_osp); 3062 destp->lr_lop = lost_rqstp->lr_lop; 3063 if (destp->lr_lop) 3064 lock_owner_hold(destp->lr_lop); 3065 destp->lr_cr = lost_rqstp->lr_cr; 3066 if (destp->lr_cr) 3067 crhold(destp->lr_cr); 3068 if (lost_rqstp->lr_flk == NULL) 3069 destp->lr_flk = NULL; 3070 else { 3071 destp->lr_flk = kmem_alloc(sizeof (flock64_t), KM_SLEEP); 3072 *destp->lr_flk = *lost_rqstp->lr_flk; 3073 } 3074 destp->lr_putfirst = lost_rqstp->lr_putfirst; 3075 } 3076 3077 /* 3078 * Map the given return values (errno and nfs4 status code) to a recovery 3079 * action and fill in the following fields of recovp: rc_action, 3080 * rc_srv_reboot, rc_stateid, rc_lost_rqst. 
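 *
 * Rough mapping implemented by the switch below (sketch, not
 * exhaustive):
 *
 *	NFS4ERR_BADHANDLE			-> NR_BADHANDLE
 *	NFS4ERR_BAD_SEQID			-> NR_BAD_SEQID
 *	NFS4ERR_OLD_STATEID			-> NR_OLDSTATEID
 *	NFS4ERR_FHEXPIRED			-> NR_FHEXPIRED
 *	NFS4ERR_BAD_STATEID			-> NR_BAD_STATEID or NR_CLIENTID
 *	NFS4ERR_EXPIRED				-> NR_CLIENTID
 *	NFS4ERR_STALE_CLIENTID/_STATEID		-> NR_CLIENTID, reboot = TRUE
 *	NFS4ERR_GRACE / NFS4ERR_DELAY		-> NR_GRACE / NR_DELAY
 *	NFS4ERR_STALE				-> NR_STALE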
3081 */ 3082 3083 void 3084 errs_to_action(recov_info_t *recovp, 3085 nfs4_server_t *sp, mntinfo4_t *mi, stateid4 *sidp, 3086 nfs4_lost_rqst_t *lost_rqstp, int unmounted, nfs_opnum4 op, 3087 nfs4_bseqid_entry_t *bsep) 3088 { 3089 nfs4_recov_t action = NR_UNUSED; 3090 bool_t reboot = FALSE; 3091 int try_f; 3092 int error = recovp->rc_orig_errors.error; 3093 nfsstat4 stat = recovp->rc_orig_errors.stat; 3094 3095 bzero(&recovp->rc_stateid, sizeof (stateid4)); 3096 recovp->rc_lost_rqst = NULL; 3097 recovp->rc_bseqid_rqst = NULL; 3098 3099 try_f = nfs4_try_failover(&recovp->rc_orig_errors) && 3100 FAILOVER_MOUNT4(mi); 3101 3102 /* 3103 * We start recovery for EINTR only in the lost lock 3104 * or lost open/close case. 3105 */ 3106 3107 if (try_f || error == EINTR || (error == EIO && unmounted)) { 3108 recovp->rc_error = (error != 0 ? error : geterrno4(stat)); 3109 if (lost_rqstp) { 3110 ASSERT(lost_rqstp->lr_op != 0); 3111 nfs4_save_lost_rqst(lost_rqstp, recovp, &action, mi); 3112 } 3113 if (try_f) 3114 action = NR_FAILOVER; 3115 } else if (error != 0) { 3116 recovp->rc_error = error; 3117 nfs4_queue_event(RE_UNEXPECTED_ERRNO, mi, NULL, error, NULL, 3118 NULL, 0, NULL, 0, TAG_NONE, TAG_NONE, 0, 0); 3119 action = NR_CLIENTID; 3120 } else { 3121 recovp->rc_error = geterrno4(stat); 3122 switch (stat) { 3123 #ifdef notyet 3124 case NFS4ERR_LEASE_MOVED: 3125 action = xxx; 3126 break; 3127 case NFS4ERR_MOVED: 3128 action = xxx; 3129 break; 3130 #endif 3131 case NFS4ERR_BADHANDLE: 3132 action = NR_BADHANDLE; 3133 break; 3134 case NFS4ERR_BAD_SEQID: 3135 if (bsep) 3136 save_bseqid_rqst(bsep, recovp); 3137 action = NR_BAD_SEQID; 3138 break; 3139 case NFS4ERR_OLD_STATEID: 3140 action = NR_OLDSTATEID; 3141 break; 3142 case NFS4ERR_WRONGSEC: 3143 action = NR_WRONGSEC; 3144 break; 3145 case NFS4ERR_FHEXPIRED: 3146 action = NR_FHEXPIRED; 3147 break; 3148 case NFS4ERR_BAD_STATEID: 3149 if (sp == NULL || (sp != NULL && inlease(sp))) { 3150 3151 action = NR_BAD_STATEID; 3152 if (sidp) 3153 recovp->rc_stateid = *sidp; 3154 } else 3155 action = NR_CLIENTID; 3156 break; 3157 case NFS4ERR_EXPIRED: 3158 /* 3159 * The client's lease has expired, either due 3160 * to a network partition or perhaps a client 3161 * error. In either case, try an NR_CLIENTID 3162 * style recovery. reboot remains false, since 3163 * there is no evidence the server has rebooted. 3164 * This will cause CLAIM_NULL opens and lock 3165 * requests without the reclaim bit. 3166 */ 3167 action = NR_CLIENTID; 3168 3169 DTRACE_PROBE4(nfs4__expired, 3170 nfs4_server_t *, sp, 3171 mntinfo4_t *, mi, 3172 stateid4 *, sidp, int, op); 3173 3174 break; 3175 case NFS4ERR_STALE_CLIENTID: 3176 case NFS4ERR_STALE_STATEID: 3177 action = NR_CLIENTID; 3178 reboot = TRUE; 3179 break; 3180 case NFS4ERR_RESOURCE: 3181 /* 3182 * If this had been a FAILOVER mount, then 3183 * we'd have tried failover. Since it's not, 3184 * just delay a while and retry. 
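 * (Specifically NR_DELAY, the same action used for NFS4ERR_DELAY
 * below.)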
3185 */ 3186 action = NR_DELAY; 3187 break; 3188 case NFS4ERR_GRACE: 3189 action = NR_GRACE; 3190 break; 3191 case NFS4ERR_DELAY: 3192 action = NR_DELAY; 3193 break; 3194 case NFS4ERR_STALE: 3195 action = NR_STALE; 3196 break; 3197 default: 3198 nfs4_queue_event(RE_UNEXPECTED_STATUS, mi, NULL, 0, 3199 NULL, NULL, stat, NULL, 0, TAG_NONE, TAG_NONE, 3200 0, 0); 3201 action = NR_CLIENTID; 3202 break; 3203 } 3204 } 3205 3206 /* make sure action got set */ 3207 ASSERT(action != NR_UNUSED); 3208 recovp->rc_srv_reboot = reboot; 3209 recovp->rc_action = action; 3210 nfs4_queue_fact(RF_ERR, mi, stat, action, op, reboot, NULL, error, 3211 NULL); 3212 } 3213 3214 /* 3215 * Return the (held) credential for the process with the given pid. 3216 * May return NULL (e.g., process not found). 3217 */ 3218 3219 static cred_t * 3220 pid_to_cr(pid_t pid) 3221 { 3222 proc_t *p; 3223 cred_t *cr; 3224 3225 mutex_enter(&pidlock); 3226 if ((p = prfind(pid)) == NULL) { 3227 mutex_exit(&pidlock); 3228 return (NULL); 3229 } 3230 3231 mutex_enter(&p->p_crlock); 3232 crhold(cr = p->p_cred); 3233 mutex_exit(&p->p_crlock); 3234 mutex_exit(&pidlock); 3235 3236 return (cr); 3237 } 3238 3239 /* 3240 * Send SIGLOST to the given process and queue the event. 3241 * 3242 * The 'dump' boolean tells us whether this action should dump the 3243 * in-kernel queue of recovery messages or not. 3244 */ 3245 3246 void 3247 nfs4_send_siglost(pid_t pid, mntinfo4_t *mi, vnode_t *vp, bool_t dump, 3248 int error, nfsstat4 stat) 3249 { 3250 proc_t *p; 3251 3252 mutex_enter(&pidlock); 3253 p = prfind(pid); 3254 if (p) 3255 psignal(p, SIGLOST); 3256 mutex_exit(&pidlock); 3257 nfs4_queue_event(dump ? RE_SIGLOST : RE_SIGLOST_NO_DUMP, mi, 3258 NULL, error, vp, NULL, stat, NULL, pid, TAG_NONE, TAG_NONE, 0, 0); 3259 } 3260 3261 /* 3262 * Scan the lock list for entries that match the given pid. Change the 3263 * pid in those that do to NOPID. 3264 */ 3265 3266 static void 3267 relock_skip_pid(locklist_t *llp, pid_t pid) 3268 { 3269 for (; llp != NULL; llp = llp->ll_next) { 3270 if (llp->ll_flock.l_pid == pid) 3271 llp->ll_flock.l_pid = NOPID; 3272 } 3273 } 3274 3275 /* 3276 * Mark a file as having failed recovery, after making a last-ditch effort 3277 * to return any delegation. 3278 * 3279 * Sets r_error to EIO or ESTALE for the given vnode. 3280 */ 3281 void 3282 nfs4_fail_recov(vnode_t *vp, char *why, int error, nfsstat4 stat) 3283 { 3284 rnode4_t *rp = VTOR4(vp); 3285 3286 #ifdef DEBUG 3287 if (nfs4_fail_recov_stop) 3288 debug_enter("nfs4_fail_recov"); 3289 #endif 3290 3291 mutex_enter(&rp->r_statelock); 3292 if (rp->r_flags & (R4RECOVERR|R4RECOVERRP)) { 3293 mutex_exit(&rp->r_statelock); 3294 return; 3295 } 3296 3297 /* 3298 * Set R4RECOVERRP to indicate that a recovery error is in 3299 * progress. This will shut down reads and writes at the top 3300 * half. Don't set R4RECOVERR until after we've returned the 3301 * delegation, otherwise it will fail. 3302 */ 3303 3304 rp->r_flags |= R4RECOVERRP; 3305 mutex_exit(&rp->r_statelock); 3306 3307 nfs4delegabandon(rp); 3308 3309 mutex_enter(&rp->r_statelock); 3310 rp->r_flags |= (R4RECOVERR | R4STALE); 3311 rp->r_error = (error == 0 && stat == NFS4ERR_STALE) ? 
ESTALE : EIO;
3312 PURGE_ATTRCACHE4_LOCKED(rp);
3313 if (!(vp->v_vfsp->vfs_flag & VFS_UNMOUNTED))
3314 nfs4_queue_event(RE_DEAD_FILE, VTOMI4(vp), NULL, error,
3315 vp, NULL, stat, why, 0, TAG_NONE, TAG_NONE, 0, 0);
3316 mutex_exit(&rp->r_statelock);
3317
3318 dnlc_purge_vp(vp);
3319 }
3320
3321 /*
3322 * recov_throttle: if the file had the same recovery action within the
3323 * throttle interval, wait for the throttle interval to finish before
3324 * proceeding.
3325 *
3326 * Side effects: updates the rnode with the current recovery information.
3327 */
3328
3329 static void
3330 recov_throttle(recov_info_t *recovp, vnode_t *vp)
3331 {
3332 time_t curtime, time_to_wait;
3333 rnode4_t *rp = VTOR4(vp);
3334
3335 curtime = gethrestime_sec();
3336
3337 mutex_enter(&rp->r_statelock);
3338 NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
3339 "recov_throttle: now: (%d, %ld), last: (%d, %ld)",
3340 recovp->rc_action, curtime,
3341 rp->r_recov_act, rp->r_last_recov));
3342 if (recovp->rc_action == rp->r_recov_act &&
3343 rp->r_last_recov + recov_err_delay > curtime) {
3344 time_to_wait = rp->r_last_recov + recov_err_delay - curtime;
3345 mutex_exit(&rp->r_statelock);
3346 delay(SEC_TO_TICK(time_to_wait));
3347 curtime = gethrestime_sec();
3348 mutex_enter(&rp->r_statelock);
3349 }
3350
3351 rp->r_last_recov = curtime;
3352 rp->r_recov_act = recovp->rc_action;
3353 mutex_exit(&rp->r_statelock);
3354 }
3355
3356 /*
3357 * React to NFS4ERR_GRACE by setting the time we'll permit
3358 * the next call to this filesystem.
3359 */
3360 void
3361 nfs4_set_grace_wait(mntinfo4_t *mi)
3362 {
3363 mutex_enter(&mi->mi_lock);
3364 /* Mark the time for the future */
3365 mi->mi_grace_wait = gethrestime_sec() + nfs4err_delay_time;
3366 mutex_exit(&mi->mi_lock);
3367 }
3368
3369 /*
3370 * React to NFS4ERR_DELAY by setting the time we'll permit
3371 * the next call to this vnode.
3372 */
3373 void
3374 nfs4_set_delay_wait(vnode_t *vp)
3375 {
3376 rnode4_t *rp = VTOR4(vp);
3377
3378 mutex_enter(&rp->r_statelock);
3379 /*
3380 * Calculate the amount we should delay; the initial
3381 * delay will be short and then we will back off.
3382 */
3383 if (rp->r_delay_interval == 0)
3384 rp->r_delay_interval = NFS4_INITIAL_DELAY_INTERVAL;
3385 else
3386 /* calculate next interval value */
3387 rp->r_delay_interval =
3388 MIN(NFS4_MAX_DELAY_INTERVAL, (rp->r_delay_interval << 1));
3389 rp->r_delay_wait = gethrestime_sec() + rp->r_delay_interval;
3390 mutex_exit(&rp->r_statelock);
3391 }
3392
3393 /*
3394 * The caller is responsible for freeing the returned string.
3395 */
3396 static char *
3397 nfs4_getsrvnames(mntinfo4_t *mi, size_t *len)
3398 {
3399 servinfo4_t *svp;
3400 char *srvnames;
3401 char *namep;
3402 size_t length;
3403
3404 /*
3405 * Calculate the length of the string required to hold all
3406 * of the server names plus either a comma or a null
3407 * character following each individual one.
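 *
 * Worked example (hypothetical hostnames): for servers "alpha" and
 * "beta", sv_hostnamelen counts the trailing NUL (6 and 5), so
 * length = 11.  Each copied name's NUL is overwritten with a comma,
 * and the final comma is turned back into a NUL, yielding
 * "alpha,beta".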
3408 */
3409 length = 0;
3410 for (svp = mi->mi_servers; svp != NULL; svp = svp->sv_next) {
3411 (void) nfs_rw_enter_sig(&svp->sv_lock, RW_READER, 0);
3412 if (svp->sv_flags & SV4_NOTINUSE) {
3413 nfs_rw_exit(&svp->sv_lock);
3414 continue;
3415 }
3416 nfs_rw_exit(&svp->sv_lock);
3417 length += svp->sv_hostnamelen;
3418 }
3419
3420 srvnames = kmem_alloc(length, KM_SLEEP);
3421
3422 namep = srvnames;
3423 for (svp = mi->mi_servers; svp != NULL; svp = svp->sv_next) {
3424 (void) nfs_rw_enter_sig(&svp->sv_lock, RW_READER, 0);
3425 if (svp->sv_flags & SV4_NOTINUSE) {
3426 nfs_rw_exit(&svp->sv_lock);
3427 continue;
3428 }
3429 nfs_rw_exit(&svp->sv_lock);
3430 (void) strcpy(namep, svp->sv_hostname);
3431 namep += svp->sv_hostnamelen - 1;
3432 *namep++ = ',';
3433 }
3434 *--namep = '\0';
3435
3436 *len = length;
3437
3438 return (srvnames);
3439 }
3440
3441 static void
3442 save_bseqid_rqst(nfs4_bseqid_entry_t *bsep, recov_info_t *recovp)
3443 {
3444 nfs4_bseqid_entry_t *destp;
3445
3446 destp = kmem_alloc(sizeof (nfs4_bseqid_entry_t), KM_SLEEP);
3447 recovp->rc_bseqid_rqst = destp;
3448
3449 if (bsep->bs_oop)
3450 open_owner_hold(bsep->bs_oop);
3451 destp->bs_oop = bsep->bs_oop;
3452 if (bsep->bs_lop)
3453 lock_owner_hold(bsep->bs_lop);
3454 destp->bs_lop = bsep->bs_lop;
3455 if (bsep->bs_vp)
3456 VN_HOLD(bsep->bs_vp);
3457 destp->bs_vp = bsep->bs_vp;
3458 destp->bs_pid = bsep->bs_pid;
3459 destp->bs_tag = bsep->bs_tag;
3460 destp->bs_seqid = bsep->bs_seqid;
3461 }
3462
3463 static void
3464 free_bseqid_rqst(nfs4_bseqid_entry_t *bsep)
3465 {
3466 if (bsep->bs_oop)
3467 open_owner_rele(bsep->bs_oop);
3468 if (bsep->bs_lop)
3469 lock_owner_rele(bsep->bs_lop);
3470 if (bsep->bs_vp)
3471 VN_RELE(bsep->bs_vp);
3472 kmem_free(bsep, sizeof (nfs4_bseqid_entry_t));
3473 }
3474
3475 /*
3476 * We don't actually fully recover from NFS4ERR_BAD_SEQID. We
3477 * simply mark the open owner and lock owner (if provided) as "bad".
3478 * Then future uses of these data structures will be limited to basically
3479 * just cleaning up the internal client state (no going OTW).
3480 *
3481 * The result of this is to return errors back to the app/usr when
3482 * we receive NFS4ERR_BAD_SEQID, but also allow future/new calls to
3483 * succeed so progress can be made.
3484 */
3485 void
3486 recov_bad_seqid(recov_info_t *recovp)
3487 {
3488 mntinfo4_t *mi = recovp->rc_mi;
3489 nfs4_open_owner_t *bad_oop;
3490 nfs4_lock_owner_t *bad_lop;
3491 vnode_t *vp;
3492 rnode4_t *rp = NULL;
3493 pid_t pid;
3494 nfs4_bseqid_entry_t *bsep, *tbsep;
3495 int error;
3496
3497 ASSERT(mi != NULL);
3498 ASSERT(nfs_rw_lock_held(&mi->mi_recovlock, RW_WRITER));
3499
3500 mutex_enter(&mi->mi_lock);
3501 bsep = list_head(&mi->mi_bseqid_list);
3502 mutex_exit(&mi->mi_lock);
3503
3504 /*
3505 * Handle all the bad seqid entries on mi's list.
3506 */
3507 while (bsep != NULL) {
3508 bad_oop = bsep->bs_oop;
3509 bad_lop = bsep->bs_lop;
3510 vp = bsep->bs_vp;
3511 pid = bsep->bs_pid;
3512
3513 NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
3514 "recov_bad_seqid: mark oop %p lop %p as bad for "
3515 "vp %p tag %s pid %d: last good seqid %d for tag %s",
3516 (void *)bad_oop, (void *)bad_lop, (void *)vp,
3517 nfs4_ctags[bsep->bs_tag].ct_str, pid,
3518 bad_oop ? bad_oop->oo_last_good_seqid : 0,
3519 bad_oop ? nfs4_ctags[bad_oop->oo_last_good_op].ct_str :
3520 nfs4_ctags[TAG_NONE].ct_str));
3521
3522 nfs4_queue_event(RE_BAD_SEQID, mi, NULL,
3523 0, vp, NULL, NFS4ERR_BAD_SEQID, NULL, pid, bsep->bs_tag,
3524 bad_oop ?
bad_oop->oo_last_good_op : TAG_NONE, 3525 bsep->bs_seqid, bad_oop ? bad_oop->oo_last_good_seqid : 0); 3526 3527 if (bad_oop) { 3528 /* essentially reset the open owner */ 3529 error = nfs4_start_open_seqid_sync(bad_oop, mi); 3530 ASSERT(!error); /* recov thread always succeeds */ 3531 bad_oop->oo_name = nfs4_get_new_oo_name(); 3532 bad_oop->oo_seqid = 0; 3533 nfs4_end_open_seqid_sync(bad_oop); 3534 } 3535 3536 if (bad_lop) { 3537 mutex_enter(&bad_lop->lo_lock); 3538 bad_lop->lo_flags |= NFS4_BAD_SEQID_LOCK; 3539 mutex_exit(&bad_lop->lo_lock); 3540 3541 ASSERT(vp != NULL); 3542 rp = VTOR4(vp); 3543 mutex_enter(&rp->r_statelock); 3544 rp->r_flags |= R4LODANGLERS; 3545 mutex_exit(&rp->r_statelock); 3546 3547 nfs4_send_siglost(pid, mi, vp, TRUE, 3548 0, NFS4ERR_BAD_SEQID); 3549 } 3550 3551 mutex_enter(&mi->mi_lock); 3552 list_remove(&mi->mi_bseqid_list, bsep); 3553 tbsep = bsep; 3554 bsep = list_head(&mi->mi_bseqid_list); 3555 mutex_exit(&mi->mi_lock); 3556 free_bseqid_rqst(tbsep); 3557 } 3558 3559 mutex_enter(&mi->mi_lock); 3560 mi->mi_recovflags &= ~MI4R_BAD_SEQID; 3561 mutex_exit(&mi->mi_lock); 3562 } 3563
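/*
 * Illustrative summary of the bad-seqid path (a sketch, not new
 * behavior): when an OTW call draws NFS4ERR_BAD_SEQID, errs_to_action()
 * selects NR_BAD_SEQID and save_bseqid_rqst() copies the request,
 * taking holds on the open owner, lock owner, and vnode.  When the
 * recovery thread runs recov_bad_seqid(), it drains mi_bseqid_list:
 * each open owner is reset with a fresh name and a zero seqid, each
 * lock owner is flagged NFS4_BAD_SEQID_LOCK and its process is sent
 * SIGLOST, and MI4R_BAD_SEQID is finally cleared.
 */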