/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#pragma ident	"%Z%%M%	%I%	%E% SMI"

/*
 * NFS Version 4 state recovery code.
 */

#include <nfs/nfs4_clnt.h>
#include <nfs/nfs4.h>
#include <nfs/rnode4.h>
#include <sys/cmn_err.h>
#include <sys/cred.h>
#include <sys/systm.h>
#include <sys/flock.h>
#include <sys/dnlc.h>
#include <sys/ddi.h>
#include <sys/disp.h>
#include <sys/list.h>
#include <sys/sdt.h>

extern r4hashq_t *rtable4;

/*
 * Information that describes what needs to be done for recovery.  It is
 * passed to a client recovery thread as well as passed to various recovery
 * routines.  rc_mi, rc_vp1, and rc_vp2 refer to the filesystem and
 * vnode(s) affected by recovery.  rc_vp1 and rc_vp2 are references (use
 * VN_HOLD) or NULL.  rc_lost_rqst contains information about the lost
 * lock or open/close request, and it holds reference counts for the
 * various objects (vnode, etc.).  The recovery thread also uses flags set
 * in the mntinfo4_t or vnode_t to tell it what to do.  rc_error is used
 * to save the error that originally triggered the recovery event -- it will
 * later be used to set mi_error if recovery doesn't work.  rc_bseqid_rqst
 * contains information about the request that got NFS4ERR_BAD_SEQID, and
 * it holds reference counts for the various objects (vnode, open owner,
 * open stream, lock owner).
 */

typedef struct {
	mntinfo4_t *rc_mi;
	vnode_t *rc_vp1;
	vnode_t *rc_vp2;
	nfs4_recov_t rc_action;
	stateid4 rc_stateid;
	bool_t rc_srv_reboot;		/* server has rebooted */
	nfs4_lost_rqst_t *rc_lost_rqst;
	nfs4_error_t rc_orig_errors;	/* original errors causing recovery */
	int rc_error;
	nfs4_bseqid_entry_t *rc_bseqid_rqst;
} recov_info_t;

/*
 * How long to wait before trying again if there is an error doing
 * recovery, in seconds.
 */

static int recov_err_delay = 1;

/*
 * How long to wait when processing NFS4ERR_GRACE or NFS4ERR_DELAY
 * errors.  Expressed in seconds.  Default is defined as
 * NFS4ERR_DELAY_TIME and this variable is initialized in nfs4_subr_init().
 */
time_t nfs4err_delay_time = 0;

/*
 * Tuneable to limit how many times "exempt" ops go OTW
 * after a recovery error.  Exempt op hints are OH_CLOSE,
 * OH_LOCKU, OH_DELEGRETURN.  These previously always went
 * OTW even after the rnode was "dead" due to recovery errors.
 *
 * The tuneable below limits the number of times a start_fop
 * invocation will retry the exempt hints.  After the limit
 * is reached, nfs4_start_fop will return an error just like
 * it would for non-exempt op hints.
 */
int nfs4_max_recov_error_retry = 3;
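
/*
 * Illustrative sketch (not taken verbatim from any one caller): the
 * typical retry loop by which a vnode operation drives this module.
 * The argument lists are abbreviated and "e" stands for the caller's
 * nfs4_error_t.
 *
 *	recov_state.rs_flags = 0;
 *	recov_state.rs_num_retry_despite_err = 0;
 * recov_retry:
 *	if (error = nfs4_start_fop(mi, vp, NULL, OH_OTHER, &recov_state,
 *	    NULL))
 *		return (error);
 *	... issue the over-the-wire call, filling in "e" ...
 *	if (nfs4_needs_recovery(&e, FALSE, mi->mi_vfsp)) {
 *		(void) nfs4_start_recovery(&e, mi, vp, NULL, NULL, NULL,
 *		    OP_GETATTR, NULL);
 *		nfs4_end_fop(mi, vp, NULL, OH_OTHER, &recov_state, TRUE);
 *		goto recov_retry;
 *	}
 *	nfs4_end_fop(mi, vp, NULL, OH_OTHER, &recov_state, FALSE);
 */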

/*
 * Number of seconds the recovery thread should pause before retry when the
 * filesystem has been forcibly unmounted.
 */

int nfs4_unmount_delay = 1;

#ifdef DEBUG

/*
 * How long to wait (in seconds) between recovery operations on a given
 * file.  Normally zero, but could be set longer for testing purposes.
 */
static int nfs4_recovdelay = 0;

/*
 * Switch that controls whether to go into the debugger when recovery
 * fails.
 */
static int nfs4_fail_recov_stop = 0;

/*
 * Tuneables to debug client namespace interaction with server
 * mount points:
 *
 * nfs4_srvmnt_fail_cnt:
 *	number of times EACCES was returned because the client
 *	attempted to cross a server mountpoint
 *
 * nfs4_srvmnt_debug:
 *	trigger a console printf whenever the client attempts
 *	to cross a server mountpoint
 */
int nfs4_srvmnt_fail_cnt = 0;
int nfs4_srvmnt_debug = 0;
#endif

/* forward references, in alphabetic order */
static void close_after_open_resend(vnode_t *, cred_t *, uint32_t,
	nfs4_error_t *);
static void errs_to_action(recov_info_t *,
	nfs4_server_t *, mntinfo4_t *, stateid4 *, nfs4_lost_rqst_t *, int,
	nfs_opnum4, nfs4_bseqid_entry_t *);
static void flush_reinstate(nfs4_lost_rqst_t *);
static void free_milist(mntinfo4_t **, int);
static mntinfo4_t **make_milist(nfs4_server_t *, int *);
static int nfs4_check_recov_err(vnode_t *, nfs4_op_hint_t,
	nfs4_recov_state_t *, int, char *);
static int nfs4_check_srvstub(vnode_t *vp, rnode4_t *rp, nfs4_op_hint_t op);
static char *nfs4_getsrvnames(mntinfo4_t *, size_t *);
static void nfs4_recov_fh_fail(vnode_t *, int, nfsstat4);
static void nfs4_recov_thread(recov_info_t *);
static void nfs4_remove_lost_rqsts(mntinfo4_t *, nfs4_server_t *);
static void nfs4_resend_lost_rqsts(recov_info_t *, nfs4_server_t *);
static cred_t *pid_to_cr(pid_t);
static void reclaim_one_lock(vnode_t *, flock64_t *, nfs4_error_t *, int *);
static void recov_bad_seqid(recov_info_t *);
static void recov_badstate(recov_info_t *, vnode_t *, nfsstat4);
static void recov_clientid(recov_info_t *, nfs4_server_t *);
static void recov_done(mntinfo4_t *, recov_info_t *);
static void recov_filehandle(nfs4_recov_t, mntinfo4_t *, vnode_t *);
static void recov_newserver(recov_info_t *, nfs4_server_t **, bool_t *);
static void recov_openfiles(recov_info_t *, nfs4_server_t *);
static void recov_stale(mntinfo4_t *, vnode_t *);
static void nfs4_free_lost_rqst(nfs4_lost_rqst_t *, nfs4_server_t *);
static void recov_throttle(recov_info_t *, vnode_t *);
static void relock_skip_pid(locklist_t *, pid_t);
static void resend_lock(nfs4_lost_rqst_t *, nfs4_error_t *);
static void resend_one_op(nfs4_lost_rqst_t *, nfs4_error_t *, mntinfo4_t *,
	nfs4_server_t *);
static void save_bseqid_rqst(nfs4_bseqid_entry_t *, recov_info_t *);
static void start_recovery(recov_info_t *, mntinfo4_t *, vnode_t *, vnode_t *,
	nfs4_server_t *);
static void start_recovery_action(nfs4_recov_t, bool_t, mntinfo4_t *,
	vnode_t *, vnode_t *);
static int wait_for_recovery(mntinfo4_t *, nfs4_op_hint_t);

/*
 * Return non-zero if the given errno, status, and rpc status codes
 * in the nfs4_error_t indicate that client recovery is needed.
 * "stateful" indicates whether the call that got the error establishes or
 * removes state on the server (open, close, lock, unlock, delegreturn).
 */

int
nfs4_needs_recovery(nfs4_error_t *ep, bool_t stateful, vfs_t *vfsp)
{
	int recov = 0;
	mntinfo4_t *mi;

	/*
	 * Try failover if the error values justify it and if
	 * it's a failover mount.  Don't try if the mount is in
	 * progress; failures there are handled explicitly by nfs4rootvp.
	 */
	if (nfs4_try_failover(ep)) {
		mi = VFTOMI4(vfsp);
		mutex_enter(&mi->mi_lock);
		recov = FAILOVER_MOUNT4(mi) && !(mi->mi_flags & MI4_MOUNTING);
		mutex_exit(&mi->mi_lock);
		if (recov)
			return (recov);
	}

	if (ep->error == EINTR || NFS4_FRC_UNMT_ERR(ep->error, vfsp)) {
		/*
		 * The server may have gotten the request, so for stateful
		 * ops we need to resynchronize and possibly back out the
		 * op.
		 */
		return (stateful);
	}
	if (ep->error != 0)
		return (0);

	/* stat values are listed alphabetically */
	/*
	 * There are two lists here: the errors for which we have code, and
	 * the errors for which we plan to have code before FCS.  For the
	 * second list, print a warning message but don't attempt recovery.
	 */
	switch (ep->stat) {
	case NFS4ERR_BADHANDLE:
	case NFS4ERR_BAD_SEQID:
	case NFS4ERR_BAD_STATEID:
	case NFS4ERR_DELAY:
	case NFS4ERR_EXPIRED:
	case NFS4ERR_FHEXPIRED:
	case NFS4ERR_GRACE:
	case NFS4ERR_OLD_STATEID:
	case NFS4ERR_RESOURCE:
	case NFS4ERR_STALE_CLIENTID:
	case NFS4ERR_STALE_STATEID:
	case NFS4ERR_WRONGSEC:
	case NFS4ERR_STALE:
		recov = 1;
		break;
#ifdef DEBUG
	case NFS4ERR_LEASE_MOVED:
	case NFS4ERR_MOVED:
		zcmn_err(VFTOMI4(vfsp)->mi_zone->zone_id,
		    CE_WARN, "!Can't yet recover from NFS status %d",
		    ep->stat);
		break;
#endif
	}

	return (recov);
}

/*
 * Some operations such as DELEGRETURN want to avoid invoking
 * recovery actions that will only mark the file dead.  If
 * better handlers are invoked for any of these errors, this
 * routine should be modified.
 */
int
nfs4_recov_marks_dead(nfsstat4 status)
{
	if (status == NFS4ERR_BAD_SEQID ||
	    status == NFS4ERR_EXPIRED ||
	    status == NFS4ERR_BAD_STATEID ||
	    status == NFS4ERR_OLD_STATEID)
		return (1);
	return (0);
}
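
/*
 * Illustrative sketch (hypothetical caller, arguments abbreviated):
 * how a DELEGRETURN-style path might consult nfs4_recov_marks_dead()
 * to avoid kicking off recovery that would only mark the file dead.
 *
 *	if (nfs4_needs_recovery(&e, TRUE, vp->v_vfsp)) {
 *		if (e.error == 0 && nfs4_recov_marks_dead(e.stat)) {
 *			... give up on the delegation quietly ...
 *		} else {
 *			(void) nfs4_start_recovery(&e, mi, vp, NULL,
 *			    NULL, NULL, OP_DELEGRETURN, NULL);
 *		}
 *	}
 */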

/*
 * Transfer the state recovery information in recovp to mi's resend queue,
 * and mark mi as having a lost state request.
 */
static void
nfs4_enqueue_lost_rqst(recov_info_t *recovp, mntinfo4_t *mi)
{
	nfs4_lost_rqst_t *lrp = recovp->rc_lost_rqst;

	ASSERT(nfs_rw_lock_held(&mi->mi_recovlock, RW_READER) ||
	    nfs_rw_lock_held(&mi->mi_recovlock, RW_WRITER));

	ASSERT(lrp != NULL && lrp->lr_op != 0);

	NFS4_DEBUG(nfs4_lost_rqst_debug, (CE_NOTE,
	    "nfs4_enqueue_lost_rqst %p, op %d",
	    (void *)lrp, lrp->lr_op));

	mutex_enter(&mi->mi_lock);
	mi->mi_recovflags |= MI4R_LOST_STATE;
	if (lrp->lr_putfirst)
		list_insert_head(&mi->mi_lost_state, lrp);
	else
		list_insert_tail(&mi->mi_lost_state, lrp);
	recovp->rc_lost_rqst = NULL;
	mutex_exit(&mi->mi_lock);

	nfs4_queue_event(RE_LOST_STATE, mi, NULL, lrp->lr_op, lrp->lr_vp,
	    lrp->lr_dvp, 0, NULL, 0, TAG_NONE, TAG_NONE, 0, 0);
}

/*
 * Transfer the bad seqid recovery information in recovp to mi's
 * bad seqid queue, and mark mi as having a bad seqid request.
 */
void
enqueue_bseqid_rqst(recov_info_t *recovp, mntinfo4_t *mi)
{
	ASSERT(nfs_rw_lock_held(&mi->mi_recovlock, RW_READER) ||
	    nfs_rw_lock_held(&mi->mi_recovlock, RW_WRITER));
	ASSERT(recovp->rc_bseqid_rqst != NULL);

	mutex_enter(&mi->mi_lock);
	mi->mi_recovflags |= MI4R_BAD_SEQID;
	list_insert_tail(&mi->mi_bseqid_list, recovp->rc_bseqid_rqst);
	recovp->rc_bseqid_rqst = NULL;
	mutex_exit(&mi->mi_lock);
}
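
/*
 * Both queues above are drained by the recovery thread: the lost-state
 * list via nfs4_resend_lost_rqsts() and the bad seqid list via
 * recov_bad_seqid().  Enqueueing only records the work; the enqueue
 * routines themselves do not wake or create the thread.
 */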

/*
 * Initiate recovery.
 *
 * The nfs4_error_t contains the return codes that triggered a recovery
 * attempt.  mi, vp1, and vp2 refer to the filesystem and files that were
 * being operated on.  vp1 and vp2 may be NULL.
 *
 * Multiple calls are okay.  If recovery is already underway, the call
 * updates the information about what state needs recovery but does not
 * start a new thread.  The caller should hold mi->mi_recovlock as a reader
 * for proper synchronization with any recovery thread.
 *
 * This will return TRUE if recovery was aborted, and FALSE otherwise.
 */
bool_t
nfs4_start_recovery(nfs4_error_t *ep, mntinfo4_t *mi, vnode_t *vp1,
    vnode_t *vp2, stateid4 *sid, nfs4_lost_rqst_t *lost_rqstp, nfs_opnum4 op,
    nfs4_bseqid_entry_t *bsep)
{
	recov_info_t *recovp;
	nfs4_server_t *sp;
	bool_t abort = FALSE;
	bool_t gone = FALSE;

	ASSERT(nfs_zone() == mi->mi_zone);
	mutex_enter(&mi->mi_lock);
	/*
	 * If there is lost state, we need to kick off recovery even if the
	 * filesystem has been unmounted or the zone is shutting down.
	 */
	gone = FS_OR_ZONE_GONE4(mi->mi_vfsp);
	if (gone) {
		ASSERT(ep->error != EINTR || lost_rqstp != NULL);
		if (ep->error == EIO && lost_rqstp == NULL) {
			/* failed due to forced unmount, no new lost state */
			abort = TRUE;
		}
		if ((ep->error == 0 || ep->error == ETIMEDOUT) &&
		    !(mi->mi_recovflags & MI4R_LOST_STATE)) {
			/* some other failure, no existing lost state */
			abort = TRUE;
		}
		if (abort) {
			mutex_exit(&mi->mi_lock);
			NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
			    "nfs4_start_recovery: fs unmounted"));
			return (TRUE);
		}
	}
	mi->mi_in_recovery++;
	mutex_exit(&mi->mi_lock);

	recovp = kmem_alloc(sizeof (recov_info_t), KM_SLEEP);
	recovp->rc_orig_errors = *ep;
	sp = find_nfs4_server(mi);
	errs_to_action(recovp, sp, mi, sid, lost_rqstp, gone, op, bsep);
	if (sp != NULL)
		mutex_exit(&sp->s_lock);
	start_recovery(recovp, mi, vp1, vp2, sp);
	if (sp != NULL)
		nfs4_server_rele(sp);
	return (FALSE);
}

/*
 * Internal version of nfs4_start_recovery.  The difference is that the
 * caller specifies the recovery action, rather than the errors leading to
 * recovery.
 */
static void
start_recovery_action(nfs4_recov_t what, bool_t reboot, mntinfo4_t *mi,
    vnode_t *vp1, vnode_t *vp2)
{
	recov_info_t *recovp;

	ASSERT(nfs_zone() == mi->mi_zone);
	mutex_enter(&mi->mi_lock);
	mi->mi_in_recovery++;
	mutex_exit(&mi->mi_lock);

	recovp = kmem_zalloc(sizeof (recov_info_t), KM_SLEEP);
	recovp->rc_action = what;
	recovp->rc_srv_reboot = reboot;
	recovp->rc_error = EIO;
	start_recovery(recovp, mi, vp1, vp2, NULL);
}
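
/*
 * start_recovery() translates the requested action into recovery flags
 * on the mntinfo4_t and, when necessary, hands recovp to a new recovery
 * thread.  Some actions (filehandle recovery, bad-state marking,
 * GRACE/DELAY waits) are handled inline and never create a thread;
 * those paths leave through the out_no_thread label, which undoes the
 * mi_in_recovery increment made by the callers above.
 */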

static void
start_recovery(recov_info_t *recovp, mntinfo4_t *mi,
    vnode_t *vp1, vnode_t *vp2, nfs4_server_t *sp)
{
	NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
	    "start_recovery: mi %p, what %s", (void *)mi,
	    nfs4_recov_action_to_str(recovp->rc_action)));

	/*
	 * Bump the reference on the vfs so that we can pass it to the
	 * recovery thread.
	 */
	VFS_HOLD(mi->mi_vfsp);
	MI4_HOLD(mi);
again:
	switch (recovp->rc_action) {
	case NR_FAILOVER:
		ASSERT(nfs_rw_lock_held(&mi->mi_recovlock, RW_READER) ||
		    nfs_rw_lock_held(&mi->mi_recovlock, RW_WRITER));
		if (mi->mi_servers->sv_next == NULL)
			goto out_no_thread;
		mutex_enter(&mi->mi_lock);
		mi->mi_recovflags |= MI4R_NEED_NEW_SERVER;
		mutex_exit(&mi->mi_lock);

		if (recovp->rc_lost_rqst != NULL)
			nfs4_enqueue_lost_rqst(recovp, mi);
		break;

	case NR_CLIENTID:
		/*
		 * If the filesystem has been unmounted, punt.
		 */
		if (sp == NULL)
			goto out_no_thread;

		/*
		 * If nobody else is working on the clientid, mark the
		 * clientid as being no longer set.  Then mark the specific
		 * filesystem being worked on.
		 */
		if (!nfs4_server_in_recovery(sp)) {
			mutex_enter(&sp->s_lock);
			sp->s_flags &= ~N4S_CLIENTID_SET;
			mutex_exit(&sp->s_lock);
		}
		ASSERT(nfs_rw_lock_held(&mi->mi_recovlock, RW_READER) ||
		    nfs_rw_lock_held(&mi->mi_recovlock, RW_WRITER));
		mutex_enter(&mi->mi_lock);
		mi->mi_recovflags |= MI4R_NEED_CLIENTID;
		if (recovp->rc_srv_reboot)
			mi->mi_recovflags |= MI4R_SRV_REBOOT;
		mutex_exit(&mi->mi_lock);
		break;

	case NR_OPENFILES:
		ASSERT(nfs_rw_lock_held(&mi->mi_recovlock, RW_READER) ||
		    nfs_rw_lock_held(&mi->mi_recovlock, RW_WRITER));
		mutex_enter(&mi->mi_lock);
		mi->mi_recovflags |= MI4R_REOPEN_FILES;
		if (recovp->rc_srv_reboot)
			mi->mi_recovflags |= MI4R_SRV_REBOOT;
		mutex_exit(&mi->mi_lock);
		break;

	case NR_WRONGSEC:
		ASSERT(nfs_rw_lock_held(&mi->mi_recovlock, RW_READER) ||
		    nfs_rw_lock_held(&mi->mi_recovlock, RW_WRITER));
		mutex_enter(&mi->mi_lock);
		mi->mi_recovflags |= MI4R_NEED_SECINFO;
		mutex_exit(&mi->mi_lock);
		break;

	case NR_EXPIRED:
		if (vp1 != NULL)
			recov_badstate(recovp, vp1, NFS4ERR_EXPIRED);
		if (vp2 != NULL)
			recov_badstate(recovp, vp2, NFS4ERR_EXPIRED);
		goto out_no_thread;	/* no further recovery possible */

	case NR_BAD_STATEID:
		if (vp1 != NULL)
			recov_badstate(recovp, vp1, NFS4ERR_BAD_STATEID);
		if (vp2 != NULL)
			recov_badstate(recovp, vp2, NFS4ERR_BAD_STATEID);
		goto out_no_thread;	/* no further recovery possible */

	case NR_FHEXPIRED:
	case NR_BADHANDLE:
		if (vp1 != NULL)
			recov_throttle(recovp, vp1);
		if (vp2 != NULL)
			recov_throttle(recovp, vp2);
		/*
		 * Recover the filehandle now, rather than using a
		 * separate thread.  We can do this because filehandle
		 * recovery is independent of any other state, and because
		 * we know that we are not competing with the recovery
		 * thread at this time.  recov_filehandle will deal with
		 * threads that are competing to recover this filehandle.
		 */
		ASSERT(nfs_rw_lock_held(&mi->mi_recovlock, RW_READER) ||
		    nfs_rw_lock_held(&mi->mi_recovlock, RW_WRITER));
		if (vp1 != NULL)
			recov_filehandle(recovp->rc_action, mi, vp1);
		if (vp2 != NULL)
			recov_filehandle(recovp->rc_action, mi, vp2);
		goto out_no_thread;	/* no further recovery needed */

	case NR_STALE:
		/*
		 * NFS4ERR_STALE handling
		 * recov_stale() could set MI4R_NEED_NEW_SERVER to
		 * indicate that we can and should failover.
		 */
		ASSERT(nfs_rw_lock_held(&mi->mi_recovlock, RW_READER) ||
		    nfs_rw_lock_held(&mi->mi_recovlock, RW_WRITER));

		if (vp1 != NULL)
			recov_stale(mi, vp1);
		if (vp2 != NULL)
			recov_stale(mi, vp2);
		mutex_enter(&mi->mi_lock);
		if ((mi->mi_recovflags & MI4R_NEED_NEW_SERVER) == 0) {
			mutex_exit(&mi->mi_lock);
			goto out_no_thread;
		}
		mutex_exit(&mi->mi_lock);
		recovp->rc_action = NR_FAILOVER;
		goto again;

	case NR_BAD_SEQID:
		if (recovp->rc_bseqid_rqst) {
			enqueue_bseqid_rqst(recovp, mi);
			break;
		}

		if (vp1 != NULL)
			recov_badstate(recovp, vp1, NFS4ERR_BAD_SEQID);
		if (vp2 != NULL)
			recov_badstate(recovp, vp2, NFS4ERR_BAD_SEQID);
		goto out_no_thread;	/* no further recovery possible */

	case NR_OLDSTATEID:
		if (vp1 != NULL)
			recov_badstate(recovp, vp1, NFS4ERR_OLD_STATEID);
		if (vp2 != NULL)
			recov_badstate(recovp, vp2, NFS4ERR_OLD_STATEID);
		goto out_no_thread;	/* no further recovery possible */

	case NR_GRACE:
		nfs4_set_grace_wait(mi);
		goto out_no_thread;	/* no further action for GRACE */

	case NR_DELAY:
		if (vp1)
			nfs4_set_delay_wait(vp1);
		goto out_no_thread;	/* no further action for DELAY */

	case NR_LOST_STATE_RQST:
	case NR_LOST_LOCK:
		nfs4_enqueue_lost_rqst(recovp, mi);
		break;

	default:
		nfs4_queue_event(RE_UNEXPECTED_ACTION, mi, NULL,
		    recovp->rc_action, NULL, NULL, 0, NULL, 0, TAG_NONE,
		    TAG_NONE, 0, 0);
		goto out_no_thread;
	}

	/*
	 * If either file recently went through the same recovery, wait
	 * awhile.  This is in case there is some sort of bug; we might not
	 * be able to recover properly, but at least we won't bombard the
	 * server with calls, and we won't tie up the client.
	 */
	if (vp1 != NULL)
		recov_throttle(recovp, vp1);
	if (vp2 != NULL)
		recov_throttle(recovp, vp2);

	/*
	 * If there's already a recovery thread, don't start another one.
	 */

	mutex_enter(&mi->mi_lock);
	if (mi->mi_flags & MI4_RECOV_ACTIV) {
		mutex_exit(&mi->mi_lock);
		goto out_no_thread;
	}
	mi->mi_flags |= MI4_RECOV_ACTIV;
	mutex_exit(&mi->mi_lock);
	NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
	    "start_recovery: starting new thread for mi %p", (void *)mi));

	recovp->rc_mi = mi;
	recovp->rc_vp1 = vp1;
	if (vp1 != NULL) {
		ASSERT(VTOMI4(vp1) == mi);
		VN_HOLD(recovp->rc_vp1);
	}
	recovp->rc_vp2 = vp2;
	if (vp2 != NULL) {
		ASSERT(VTOMI4(vp2) == mi);
		VN_HOLD(recovp->rc_vp2);
	}

	(void) zthread_create(NULL, 0, nfs4_recov_thread, recovp, 0,
	    minclsyspri);
	return;

	/* not reached by thread creating call */
out_no_thread:
	mutex_enter(&mi->mi_lock);
	mi->mi_in_recovery--;
	if (mi->mi_in_recovery == 0)
		cv_broadcast(&mi->mi_cv_in_recov);
	mutex_exit(&mi->mi_lock);

	VFS_RELE(mi->mi_vfsp);
	MI4_RELE(mi);
	/*
	 * Free up resources that were allocated for us.
	 */
	kmem_free(recovp, sizeof (recov_info_t));
}

static int
nfs4_check_srvstub(vnode_t *vp, rnode4_t *rp, nfs4_op_hint_t op)
{
	int err = 0;

	/*
	 * If the tuneable does not allow the client to cross server
	 * mountpoints and the object is a stub, then check the op hint
	 * and return EACCES for any hint other than access, rddir,
	 * getattr, lookup.
	 */
	if (rp->r_flags & R4SRVSTUB && op != OH_ACCESS && op != OH_GETACL &&
	    op != OH_GETATTR && op != OH_READDIR && op != OH_LOOKUP) {
		err = EACCES;
#ifdef DEBUG
		NFS4_DEBUG(nfs4_srvmnt_debug, (CE_NOTE,
		    "nfs4_check_srvstub: op=%d err=%d rp=%p vp=%p\n"
		    "va_nodeid=%llx r_mntd_fid=%llx\n"
		    "sv_fsid=(%llx:%llx) r_srv_fsid=(%llx:%llx)",
		    op, err, (void *)rp, (void *)vp,
		    (u_longlong_t)rp->r_attr.va_nodeid,
		    (u_longlong_t)rp->r_mntd_fid,
		    (u_longlong_t)rp->r_server->sv_fsid.major,
		    (u_longlong_t)rp->r_server->sv_fsid.minor,
		    (u_longlong_t)rp->r_srv_fsid.major,
		    (u_longlong_t)rp->r_srv_fsid.minor));
#endif
	}

	return (err);
}

static int
nfs4_check_recov_err(vnode_t *vp, nfs4_op_hint_t op,
    nfs4_recov_state_t *rsp, int retry_err_cnt, char *str)
{
	rnode4_t *rp;
	int error = 0;
	int exempt;

	if (vp == NULL)
		return (0);

	exempt = (op == OH_CLOSE || op == OH_LOCKU || op == OH_DELEGRETURN);
	rp = VTOR4(vp);
	mutex_enter(&rp->r_statelock);

	/*
	 * If there was a recovery error, then allow op hints "exempt" from
	 * recovery errors to retry (currently 3 times).  Either r_error or
	 * EIO is returned for non-exempt op hints.
	 *
	 * Error hierarchy:
	 * a) check for R4RECOVERR
	 * b) check for R4SRVSTUB (only if R4RECOVERR is not set).
	 */
	if (rp->r_flags & R4RECOVERR) {
		if (exempt && rsp->rs_num_retry_despite_err <=
		    nfs4_max_recov_error_retry) {

			/*
			 * Check to make sure that we haven't already inc'd
			 * rs_num_retry_despite_err for the current
			 * nfs4_start_fop instance.  We don't want to double
			 * increment (if we were called with vp2, then the
			 * vp1 call could have already incremented it).
			 */
			if (retry_err_cnt == rsp->rs_num_retry_despite_err)
				rsp->rs_num_retry_despite_err++;

			NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
			    "nfs4_start_fop: %s %p DEAD, cnt=%d", str,
			    (void *)vp, rsp->rs_num_retry_despite_err));
		} else {
			error = (rp->r_error ? rp->r_error : EIO);
			/*
			 * An ESTALE error on a non-regular file is not
			 * "sticky".  Return the ESTALE error once, but
			 * clear the condition to allow future operations
			 * to go OTW.  This will allow the client to
			 * recover if the server has merely unshared then
			 * re-shared the file system.  For regular files,
			 * the unshare has destroyed the open state at the
			 * server and we aren't willing to do a reopen (yet).
			 */
			if (error == ESTALE && vp->v_type != VREG) {
				rp->r_flags &=
				    ~(R4RECOVERR|R4RECOVERRP|R4STALE);
				rp->r_error = 0;
				error = ESTALE;
			}
			NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
			    "nfs4_start_fop: %s %p DEAD, cnt=%d error=%d",
			    str, (void *)vp,
			    rsp->rs_num_retry_despite_err, error));
		}
	} else {
		error = nfs4_check_srvstub(vp, rp, op);
		NFS4_DEBUG(nfs4_client_recov_stub_debug, (CE_NOTE,
		    "nfs4_start_fop: %s %p SRVSTUB, error=%d", str,
		    (void *)vp, error));
	}
	mutex_exit(&rp->r_statelock);
	return (error);
}
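
/*
 * Worked example (illustrative): a CLOSE against an rnode already
 * marked R4RECOVERR.  With nfs4_max_recov_error_retry at its default
 * of 3, the first few nfs4_start_fop(..., OH_CLOSE, ...) calls still
 * return 0, so the close can go OTW and release server state.  Once
 * rs_num_retry_despite_err exceeds the limit, nfs4_check_recov_err()
 * returns r_error (or EIO), just as it would for a non-exempt hint.
 */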

/*
 * Initial setup code that every operation should call if it might invoke
 * client recovery.  Can block waiting for recovery to finish on a
 * filesystem.  Either vnode ptr can be NULL.
 *
 * Returns 0 if there are no outstanding errors.  Can return an
 * errno value under various circumstances (e.g., failed recovery, or
 * interrupted while waiting for recovery to finish).
 *
 * There must be a corresponding call to nfs4_end_op() to free up any locks
 * or resources allocated by this call (assuming this call succeeded),
 * using the same rsp that's passed in here.
 *
 * The open and lock seqid synchronization must be stopped before calling
 * this function, as it could lead to deadlock when trying to reopen a file
 * or reclaim a lock.  The synchronization is obtained with calls to:
 *	nfs4_start_open_seqid_sync()
 *	nfs4_start_lock_seqid_sync()
 *
 * *startrecovp is set TRUE if the caller should not bother with the
 * over-the-wire call, and just initiate recovery for the given request.
 * This is typically used for state-releasing ops if the filesystem has
 * been forcibly unmounted.  startrecovp may be NULL for
 * non-state-releasing ops.
 */

int
nfs4_start_fop(mntinfo4_t *mi, vnode_t *vp1, vnode_t *vp2, nfs4_op_hint_t op,
    nfs4_recov_state_t *rsp, bool_t *startrecovp)
{
	int error = 0, rerr_cnt;
	nfs4_server_t *sp = NULL;
	nfs4_server_t *tsp;
	nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };
	time_t droplock_time;
#ifdef DEBUG
	void *fop_caller;
#endif

	ASSERT(vp1 == NULL || vp1->v_vfsp == mi->mi_vfsp);
	ASSERT(vp2 == NULL || vp2->v_vfsp == mi->mi_vfsp);

#ifdef DEBUG
	if ((fop_caller = tsd_get(nfs4_tsd_key)) != NULL) {
		cmn_err(CE_PANIC, "Missing nfs4_end_fop: last caller %p",
		    fop_caller);
	}
	(void) tsd_set(nfs4_tsd_key, caller());
#endif

	rsp->rs_sp = NULL;
	rsp->rs_flags &= ~NFS4_RS_RENAME_HELD;
	rerr_cnt = rsp->rs_num_retry_despite_err;

	/*
	 * Process the items that may delay() based on server response
	 */
	error = nfs4_wait_for_grace(mi, rsp);
	if (error)
		goto out;

	if (vp1 != NULL) {
		error = nfs4_wait_for_delay(vp1, rsp);
		if (error)
			goto out;
	}

	/* Wait for a delegation recall to complete. */

	error = wait_for_recall(vp1, vp2, op, rsp);
	if (error)
		goto out;

	/*
	 * Wait for any current recovery actions to finish.  Note that a
	 * recovery thread can still start up after wait_for_recovery()
	 * finishes.  We don't block out recovery operations until we
	 * acquire s_recovlock and mi_recovlock.
	 */
	error = wait_for_recovery(mi, op);
	if (error)
		goto out;

	/*
	 * Check to see if the rnode is already marked with a
	 * recovery error.  If so, return it immediately.  But
	 * always pass CLOSE, LOCKU, and DELEGRETURN so we can
	 * clean up state on the server.
	 */

	if (vp1 != NULL) {
		if (error = nfs4_check_recov_err(vp1, op, rsp, rerr_cnt, "vp1"))
			goto out;
		nfs4_check_remap(mi, vp1, NFS4_REMAP_CKATTRS, &e);
	}

	if (vp2 != NULL) {
		if (error = nfs4_check_recov_err(vp2, op, rsp, rerr_cnt, "vp2"))
			goto out;
		nfs4_check_remap(mi, vp2, NFS4_REMAP_CKATTRS, &e);
	}

	/*
	 * The lock order calls for us to acquire s_recovlock before
	 * mi_recovlock, but we have to hold mi_recovlock to look up sp (to
	 * prevent races with the failover/migration code).  So acquire
	 * mi_recovlock, look up sp, drop mi_recovlock, acquire
	 * s_recovlock and mi_recovlock, then verify that sp is still the
	 * right object.  XXX Can we find a simpler way to deal with this?
	 */
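	/*
	 * The race being guarded against, step by step: after we drop
	 * mi_recovlock below, failover can move this mntinfo4_t to a
	 * different nfs4_server_t before the locks are reacquired.  The
	 * re-lookup under both locks, together with the droplock_time /
	 * mi_srvsettime comparison, detects the switch and retries from
	 * get_sp.
	 */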
	if (nfs_rw_enter_sig(&mi->mi_recovlock, RW_READER,
	    mi->mi_flags & MI4_INT)) {
		error = EINTR;
		goto out;
	}
get_sp:
	sp = find_nfs4_server(mi);
	if (sp != NULL) {
		sp->s_otw_call_count++;
		mutex_exit(&sp->s_lock);
		droplock_time = gethrestime_sec();
	}
	nfs_rw_exit(&mi->mi_recovlock);

	if (sp != NULL) {
		if (nfs_rw_enter_sig(&sp->s_recovlock, RW_READER,
		    mi->mi_flags & MI4_INT)) {
			error = EINTR;
			goto out;
		}
	}
	if (nfs_rw_enter_sig(&mi->mi_recovlock, RW_READER,
	    mi->mi_flags & MI4_INT)) {
		if (sp != NULL)
			nfs_rw_exit(&sp->s_recovlock);
		error = EINTR;
		goto out;
	}
	/*
	 * If the mntinfo4_t hasn't changed nfs4_server_t's since we
	 * dropped the lock, there's no point in double-checking to make
	 * sure it has switched.
	 */
	if (sp == NULL || droplock_time < mi->mi_srvsettime) {
		tsp = find_nfs4_server(mi);
		if (tsp != sp) {
			/* try again */
			if (tsp != NULL) {
				mutex_exit(&tsp->s_lock);
				nfs4_server_rele(tsp);
				tsp = NULL;
			}
			if (sp != NULL) {
				nfs_rw_exit(&sp->s_recovlock);
				mutex_enter(&sp->s_lock);
				sp->s_otw_call_count--;
				mutex_exit(&sp->s_lock);
				nfs4_server_rele(sp);
				sp = NULL;
			}
			goto get_sp;
		} else {
			if (tsp != NULL) {
				mutex_exit(&tsp->s_lock);
				nfs4_server_rele(tsp);
				tsp = NULL;
			}
		}
	}

	if (sp != NULL) {
		rsp->rs_sp = sp;
	}

	/*
	 * If the filesystem uses volatile filehandles, obtain a lock so
	 * that we synchronize with renames.  Exception: mount operations
	 * can change mi_fh_expire_type, which could be a problem, since
	 * the end_op code needs to be consistent with the start_op code
	 * about mi_rename_lock.  Since mounts don't compete with renames,
	 * it's simpler to just not acquire the rename lock for mounts.
	 */
	if (NFS4_VOLATILE_FH(mi) && op != OH_MOUNT) {
		if (nfs_rw_enter_sig(&mi->mi_rename_lock,
		    op == OH_VFH_RENAME ? RW_WRITER : RW_READER,
		    mi->mi_flags & MI4_INT)) {
			nfs_rw_exit(&mi->mi_recovlock);
			if (sp != NULL)
				nfs_rw_exit(&sp->s_recovlock);
			error = EINTR;
			goto out;
		}
		rsp->rs_flags |= NFS4_RS_RENAME_HELD;
	}

	if (OH_IS_STATE_RELE(op)) {
		/*
		 * For forced unmount, letting the request proceed will
		 * almost always delay response to the user, so hand it off
		 * to the recovery thread.  For exiting lwp's, we don't
		 * have a good way to tell if the request will hang.  We
		 * generally want processes to handle their own requests so
		 * that they can be done in parallel, but if there is
		 * already a recovery thread, hand the request off to it.
		 * This will improve user response at no cost to overall
		 * system throughput.  For zone shutdown, we'd prefer
		 * the recovery thread to handle this as well.
		 */
		ASSERT(startrecovp != NULL);
		mutex_enter(&mi->mi_lock);
		if (FS_OR_ZONE_GONE4(mi->mi_vfsp))
			*startrecovp = TRUE;
		else if ((curthread->t_proc_flag & TP_LWPEXIT) &&
		    (mi->mi_flags & MI4_RECOV_ACTIV))
			*startrecovp = TRUE;
		else
			*startrecovp = FALSE;
		mutex_exit(&mi->mi_lock);
	} else if (startrecovp != NULL)
		*startrecovp = FALSE;

	ASSERT(error == 0);
	return (error);

out:
	ASSERT(error != 0);
	if (sp != NULL) {
		mutex_enter(&sp->s_lock);
		sp->s_otw_call_count--;
		mutex_exit(&sp->s_lock);
		nfs4_server_rele(sp);
		rsp->rs_sp = NULL;
	}
	nfs4_end_op_recall(vp1, vp2, rsp);

#ifdef DEBUG
	(void) tsd_set(nfs4_tsd_key, NULL);
#endif
	return (error);
}

/*
 * It is up to the caller to determine if rsp->rs_sp being NULL
 * is detrimental or not.
 */
int
nfs4_start_op(mntinfo4_t *mi, vnode_t *vp1, vnode_t *vp2,
    nfs4_recov_state_t *rsp)
{
	ASSERT(rsp->rs_num_retry_despite_err == 0);
	rsp->rs_num_retry_despite_err = 0;
	return (nfs4_start_fop(mi, vp1, vp2, OH_OTHER, rsp, NULL));
}

/*
 * Release any resources acquired by nfs4_start_op().
 * 'sp' should be the nfs4_server pointer returned by nfs4_start_op().
 *
 * The operation hint is used to avoid a deadlock by bypassing delegation
 * return logic for writes, which are done while returning a delegation.
 */

void
nfs4_end_fop(mntinfo4_t *mi, vnode_t *vp1, vnode_t *vp2, nfs4_op_hint_t op,
    nfs4_recov_state_t *rsp, bool_t needs_recov)
{
	nfs4_server_t *sp = rsp->rs_sp;
	rnode4_t *rp = NULL;

#ifdef lint
	/*
	 * The op hint isn't used any more, but might be in
	 * the future.
	 */
	op = op;
#endif

#ifdef DEBUG
	ASSERT(tsd_get(nfs4_tsd_key) != NULL);
	(void) tsd_set(nfs4_tsd_key, NULL);
#endif

	nfs4_end_op_recall(vp1, vp2, rsp);

	if (rsp->rs_flags & NFS4_RS_RENAME_HELD)
		nfs_rw_exit(&mi->mi_rename_lock);

	if (!needs_recov) {
		if (rsp->rs_flags & NFS4_RS_DELAY_MSG) {
			/* may need to clear the delay interval */
			if (vp1 != NULL) {
				rp = VTOR4(vp1);
				mutex_enter(&rp->r_statelock);
				rp->r_delay_interval = 0;
				mutex_exit(&rp->r_statelock);
			}
		}
		rsp->rs_flags &= ~(NFS4_RS_GRACE_MSG|NFS4_RS_DELAY_MSG);
	}

	/*
	 * If the corresponding nfs4_start_op() found a sp,
	 * then there must still be a sp.
	 */
	if (sp != NULL) {
		nfs_rw_exit(&mi->mi_recovlock);
		nfs_rw_exit(&sp->s_recovlock);
		mutex_enter(&sp->s_lock);
		sp->s_otw_call_count--;
		cv_broadcast(&sp->s_cv_otw_count);
		mutex_exit(&sp->s_lock);
		nfs4_server_rele(sp);
	} else {
		nfs_rw_exit(&mi->mi_recovlock);
	}
}

void
nfs4_end_op(mntinfo4_t *mi, vnode_t *vp1, vnode_t *vp2,
    nfs4_recov_state_t *rsp, bool_t needrecov)
{
	nfs4_end_fop(mi, vp1, vp2, OH_OTHER, rsp, needrecov);
}
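
/*
 * Note the bookkeeping that pairs nfs4_start_fop() with nfs4_end_fop():
 * start_fop bumps sp->s_otw_call_count and leaves mi_recovlock (and
 * s_recovlock, if an nfs4_server_t was found) held as reader; end_fop
 * drops the locks, decrements the count, and broadcasts s_cv_otw_count
 * so that threads waiting for outstanding OTW calls to drain can
 * proceed.
 */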

/*
 * If the filesystem is going through client recovery, block until
 * finished.
 * Exceptions:
 * - state-releasing ops (CLOSE, LOCKU, DELEGRETURN) are allowed to proceed
 *   if the filesystem has been forcibly unmounted or the lwp is exiting.
 *
 * Return value:
 * - 0 if no errors
 * - EINTR if the call was interrupted
 * - EIO if the filesystem has been forcibly unmounted (non-state-releasing
 *   op)
 * - the errno value from the recovery thread, if recovery failed
 */

static int
wait_for_recovery(mntinfo4_t *mi, nfs4_op_hint_t op_hint)
{
	int error = 0;

	mutex_enter(&mi->mi_lock);

	while (mi->mi_recovflags != 0) {
		klwp_t *lwp = ttolwp(curthread);

		if (mi->mi_flags & MI4_RECOV_FAIL)
			break;
		if (mi->mi_vfsp->vfs_flag & VFS_UNMOUNTED)
			break;
		if (OH_IS_STATE_RELE(op_hint) &&
		    (curthread->t_proc_flag & TP_LWPEXIT))
			break;

		if (lwp != NULL)
			lwp->lwp_nostop++;
		/* XXX - use different cv? */
		if (cv_wait_sig(&mi->mi_failover_cv, &mi->mi_lock) == 0) {
			error = EINTR;
			if (lwp != NULL)
				lwp->lwp_nostop--;
			break;
		}
		if (lwp != NULL)
			lwp->lwp_nostop--;
	}

	if (mi->mi_flags & MI4_RECOV_FAIL) {
		NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
		    "wait_for_recovery: fail since RECOV FAIL"));
		error = mi->mi_error;
	} else if ((mi->mi_vfsp->vfs_flag & VFS_UNMOUNTED) &&
	    !OH_IS_STATE_RELE(op_hint)) {
		NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
		    "wait_for_recovery: forced unmount"));
		error = EIO;
	}

	mutex_exit(&mi->mi_lock);

	return (error);
}

/*
 * If the client received NFS4ERR_GRACE for this particular mount,
 * the client blocks here until it is time to try again.
 *
 * Return value:
 * - 0 if wait was successful
 * - EINTR if the call was interrupted
 */

int
nfs4_wait_for_grace(mntinfo4_t *mi, nfs4_recov_state_t *rsp)
{
	int error = 0;
	time_t curtime, time_to_wait;

	/* do an unprotected check to reduce mi_lock contention */
	if (mi->mi_grace_wait != 0) {
		mutex_enter(&mi->mi_lock);

		if (mi->mi_grace_wait != 0) {
			if (!(rsp->rs_flags & NFS4_RS_GRACE_MSG))
				rsp->rs_flags |= NFS4_RS_GRACE_MSG;

			curtime = gethrestime_sec();

			if (curtime < mi->mi_grace_wait) {

				time_to_wait = mi->mi_grace_wait - curtime;

				mutex_exit(&mi->mi_lock);

				delay(SEC_TO_TICK(time_to_wait));

				curtime = gethrestime_sec();

				mutex_enter(&mi->mi_lock);

				if (curtime >= mi->mi_grace_wait)
					mi->mi_grace_wait = 0;
			} else {
				mi->mi_grace_wait = 0;
			}
		}
		mutex_exit(&mi->mi_lock);
	}

	return (error);
}
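
/*
 * nfs4_wait_for_grace() above and nfs4_wait_for_delay() below share the
 * same idiom: an unlocked read of the wait deadline keeps the common
 * (no-wait) path cheap, the deadline is rechecked under the lock, and
 * the delay() itself runs with the lock dropped, followed by a final
 * recheck before the deadline is cleared.
 */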

/*
 * If the client received NFS4ERR_DELAY for an operation on a vnode,
 * the client blocks here until it is time to try again.
 *
 * Return value:
 * - 0 if wait was successful
 * - EINTR if the call was interrupted
 */

int
nfs4_wait_for_delay(vnode_t *vp, nfs4_recov_state_t *rsp)
{
	int error = 0;
	time_t curtime, time_to_wait;
	rnode4_t *rp;

	ASSERT(vp != NULL);

	rp = VTOR4(vp);

	/* do an unprotected check to reduce r_statelock contention */
	if (rp->r_delay_wait != 0) {
		mutex_enter(&rp->r_statelock);

		if (rp->r_delay_wait != 0) {

			if (!(rsp->rs_flags & NFS4_RS_DELAY_MSG)) {
				rsp->rs_flags |= NFS4_RS_DELAY_MSG;
				nfs4_mi_kstat_inc_delay(VTOMI4(vp));
			}

			curtime = gethrestime_sec();

			if (curtime < rp->r_delay_wait) {

				time_to_wait = rp->r_delay_wait - curtime;

				mutex_exit(&rp->r_statelock);

				delay(SEC_TO_TICK(time_to_wait));

				curtime = gethrestime_sec();

				mutex_enter(&rp->r_statelock);

				if (curtime >= rp->r_delay_wait)
					rp->r_delay_wait = 0;
			} else {
				rp->r_delay_wait = 0;
			}
		}
		mutex_exit(&rp->r_statelock);
	}

	return (error);
}

/*
 * The recovery thread.
 */

static void
nfs4_recov_thread(recov_info_t *recovp)
{
	mntinfo4_t *mi = recovp->rc_mi;
	nfs4_server_t *sp;
	int done = 0, error = 0;
	bool_t recov_fail = FALSE;
	callb_cpr_t cpr_info;
	kmutex_t cpr_lock;

	nfs4_queue_event(RE_START, mi, NULL, mi->mi_recovflags,
	    recovp->rc_vp1, recovp->rc_vp2, 0, NULL, 0, TAG_NONE, TAG_NONE,
	    0, 0);

	mutex_init(&cpr_lock, NULL, MUTEX_DEFAULT, NULL);
	CALLB_CPR_INIT(&cpr_info, &cpr_lock, callb_generic_cpr, "nfsv4Recov");

	mutex_enter(&mi->mi_lock);
	mi->mi_recovthread = curthread;
	mutex_exit(&mi->mi_lock);

	/*
	 * We don't really need protection here against failover or
	 * migration, since the current thread is the one that would make
	 * any changes, but hold mi_recovlock anyway for completeness (and
	 * to satisfy any ASSERTs).
	 */
	(void) nfs_rw_enter_sig(&mi->mi_recovlock, RW_READER, 0);
	sp = find_nfs4_server(mi);
	if (sp != NULL)
		mutex_exit(&sp->s_lock);
	nfs_rw_exit(&mi->mi_recovlock);

	/*
	 * Do any necessary recovery, based on the information in recovp
	 * and any recovery flags.
	 */

	do {
		mutex_enter(&mi->mi_lock);
		if (FS_OR_ZONE_GONE4(mi->mi_vfsp)) {
			bool_t activesrv;

			NFS4_DEBUG(nfs4_client_recov_debug &&
			    mi->mi_vfsp->vfs_flag & VFS_UNMOUNTED, (CE_NOTE,
			    "nfs4_recov_thread: file system has been "
			    "unmounted"));
			NFS4_DEBUG(nfs4_client_recov_debug &&
			    zone_status_get(curproc->p_zone) >=
			    ZONE_IS_SHUTTING_DOWN, (CE_NOTE,
			    "nfs4_recov_thread: zone shutting down"));
			/*
			 * If the server has lost its state for us and
			 * the filesystem is unmounted, then the filesystem
			 * can be tossed, even if there are lost lock or
			 * lost state calls in the recovery queue.
			 */
			if (mi->mi_recovflags &
			    (MI4R_NEED_CLIENTID | MI4R_REOPEN_FILES)) {
				NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
				    "nfs4_recov_thread: bailing out"));
				mi->mi_flags |= MI4_RECOV_FAIL;
				mi->mi_error = recovp->rc_error;
				recov_fail = TRUE;
			}
			/*
			 * We don't know if the server has any state for
			 * us, and the filesystem has been unmounted.  If
			 * there are "lost state" recovery items, keep
			 * trying to process them until there are no more
			 * mounted filesystems for the server.  Otherwise,
			 * bail out.  The reason we don't mark the
			 * filesystem as failing recovery is in case we
			 * have to do "lost state" recovery later (e.g., a
			 * user process exits).
			 */
			if (!(mi->mi_recovflags & MI4R_LOST_STATE)) {
				done = 1;
				mutex_exit(&mi->mi_lock);
				break;
			}
			mutex_exit(&mi->mi_lock);

			if (sp == NULL)
				activesrv = FALSE;
			else {
				mutex_enter(&sp->s_lock);
				activesrv = nfs4_fs_active(sp);
			}
			if (!activesrv) {
				NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
				    "no active fs for server %p",
				    (void *)sp));
				mutex_enter(&mi->mi_lock);
				mi->mi_flags |= MI4_RECOV_FAIL;
				mi->mi_error = recovp->rc_error;
				mutex_exit(&mi->mi_lock);
				recov_fail = TRUE;
				if (sp != NULL) {
					/*
					 * Mark the server instance as
					 * dead, so that nobody will attach
					 * a new filesystem.
					 */
					nfs4_mark_srv_dead(sp);
				}
			}
			if (sp != NULL)
				mutex_exit(&sp->s_lock);
		} else {
			mutex_exit(&mi->mi_lock);
		}
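
		/*
		 * From here the thread works from the most global
		 * recovery step to the most local: pick a new server if
		 * failover is needed, then recover the clientid, then
		 * the security information, then bad seqids, then reopen
		 * files, and finally resend any queued lost-state
		 * requests.
		 */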

		/*
		 * Check if we need to select a new server for a
		 * failover.  Choosing a new server will force at
		 * least a check of the clientid.
		 */
		mutex_enter(&mi->mi_lock);
		if (!recov_fail &&
		    (mi->mi_recovflags & MI4R_NEED_NEW_SERVER)) {
			mutex_exit(&mi->mi_lock);
			recov_newserver(recovp, &sp, &recov_fail);
		} else
			mutex_exit(&mi->mi_lock);

		/*
		 * Check if we need to recover the clientid.  This
		 * must be done before file and lock recovery, and it
		 * potentially affects the recovery threads for other
		 * filesystems, so it gets special treatment.
		 */
		if (sp != NULL && recov_fail == FALSE) {
			mutex_enter(&sp->s_lock);
			if (!(sp->s_flags & N4S_CLIENTID_SET)) {
				mutex_exit(&sp->s_lock);
				recov_clientid(recovp, sp);
			} else {
				/*
				 * Unset this flag in case another recovery
				 * thread successfully recovered the clientid
				 * for us already.
				 */
				mutex_enter(&mi->mi_lock);
				mi->mi_recovflags &= ~MI4R_NEED_CLIENTID;
				mutex_exit(&mi->mi_lock);
				mutex_exit(&sp->s_lock);
			}
		}

		/*
		 * Check if we need to get the security information.
		 */
		mutex_enter(&mi->mi_lock);
		if ((mi->mi_recovflags & MI4R_NEED_SECINFO) &&
		    !(mi->mi_flags & MI4_RECOV_FAIL)) {
			mutex_exit(&mi->mi_lock);
			(void) nfs_rw_enter_sig(&mi->mi_recovlock,
			    RW_WRITER, 0);
			error = nfs4_secinfo_recov(recovp->rc_mi,
			    recovp->rc_vp1, recovp->rc_vp2);
			/*
			 * If error, nothing more can be done, stop
			 * the recovery.
			 */
			if (error) {
				mutex_enter(&mi->mi_lock);
				mi->mi_flags |= MI4_RECOV_FAIL;
				mi->mi_error = recovp->rc_error;
				mutex_exit(&mi->mi_lock);
				nfs4_queue_event(RE_WRONGSEC, mi, NULL,
				    error, recovp->rc_vp1, recovp->rc_vp2,
				    0, NULL, 0, TAG_NONE, TAG_NONE, 0, 0);
			}
			nfs_rw_exit(&mi->mi_recovlock);
		} else
			mutex_exit(&mi->mi_lock);

		/*
		 * Check if there's a bad seqid to recover.
		 */
		mutex_enter(&mi->mi_lock);
		if ((mi->mi_recovflags & MI4R_BAD_SEQID) &&
		    !(mi->mi_flags & MI4_RECOV_FAIL)) {
			mutex_exit(&mi->mi_lock);
			(void) nfs_rw_enter_sig(&mi->mi_recovlock,
			    RW_WRITER, 0);
			recov_bad_seqid(recovp);
			nfs_rw_exit(&mi->mi_recovlock);
		} else
			mutex_exit(&mi->mi_lock);

		/*
		 * Next check for recovery that affects the entire
		 * filesystem.
		 */
		if (sp != NULL) {
			mutex_enter(&mi->mi_lock);
			if ((mi->mi_recovflags & MI4R_REOPEN_FILES) &&
			    !(mi->mi_flags & MI4_RECOV_FAIL)) {
				mutex_exit(&mi->mi_lock);
				recov_openfiles(recovp, sp);
			} else
				mutex_exit(&mi->mi_lock);
		}

		/*
		 * Send any queued state recovery requests.
		 */
		mutex_enter(&mi->mi_lock);
		if (sp != NULL &&
		    (mi->mi_recovflags & MI4R_LOST_STATE) &&
		    !(mi->mi_flags & MI4_RECOV_FAIL)) {
			mutex_exit(&mi->mi_lock);
			(void) nfs_rw_enter_sig(&mi->mi_recovlock,
			    RW_WRITER, 0);
			nfs4_resend_lost_rqsts(recovp, sp);
			if (list_head(&mi->mi_lost_state) == NULL) {
				/* done */
				mutex_enter(&mi->mi_lock);
				mi->mi_recovflags &= ~MI4R_LOST_STATE;
				mutex_exit(&mi->mi_lock);
			}
			nfs_rw_exit(&mi->mi_recovlock);
		} else {
			mutex_exit(&mi->mi_lock);
		}

		/*
		 * See if there is anything more to do.  If not, announce
		 * that we are done and exit.
		 *
		 * Need mi_recovlock to keep 'sp' valid.  Must grab
		 * mi_recovlock before mi_lock to preserve lock ordering.
		 */
		(void) nfs_rw_enter_sig(&mi->mi_recovlock, RW_READER, 0);
		mutex_enter(&mi->mi_lock);
		if ((mi->mi_recovflags & ~MI4R_SRV_REBOOT) == 0 ||
		    (mi->mi_flags & MI4_RECOV_FAIL)) {
			list_t local_lost_state;
			nfs4_lost_rqst_t *lrp;

			/*
			 * We need to remove the lost requests before we
			 * unmark the mi as no longer doing recovery to
			 * avoid a race with a new thread putting new lost
			 * requests on the same mi (and the going away
			 * thread would remove the new lost requests).
			 *
			 * Move the lost requests to a local list since
			 * nfs4_remove_lost_rqst() drops mi_lock, and
			 * dropping the mi_lock would make our check to
			 * see if recovery is done no longer valid.
			 */
			list_create(&local_lost_state,
			    sizeof (nfs4_lost_rqst_t),
			    offsetof(nfs4_lost_rqst_t, lr_node));
			list_move_tail(&local_lost_state, &mi->mi_lost_state);

			done = 1;
			mutex_exit(&mi->mi_lock);
			/*
			 * Now officially free the "moved"
			 * lost requests.
			 */
			while ((lrp = list_head(&local_lost_state)) != NULL) {
				list_remove(&local_lost_state, lrp);
				nfs4_free_lost_rqst(lrp, sp);
			}
			list_destroy(&local_lost_state);
		} else
			mutex_exit(&mi->mi_lock);
		nfs_rw_exit(&mi->mi_recovlock);

		/*
		 * If the filesystem has been forcibly unmounted, there is
		 * probably no point in retrying immediately.  Furthermore,
		 * there might be user processes waiting for a chance to
		 * queue up "lost state" requests, so that they can exit.
		 * So pause here for a moment.  Same logic for zone shutdown.
		 */
		if (!done && FS_OR_ZONE_GONE4(mi->mi_vfsp)) {
			mutex_enter(&mi->mi_lock);
			cv_broadcast(&mi->mi_failover_cv);
			mutex_exit(&mi->mi_lock);
			delay(SEC_TO_TICK(nfs4_unmount_delay));
		}

	} while (!done);

	if (sp != NULL)
		nfs4_server_rele(sp);

	/*
	 * Return all recalled delegations
	 */
	nfs4_dlistclean();

	mutex_enter(&mi->mi_lock);
	recov_done(mi, recovp);
	mutex_exit(&mi->mi_lock);

	/*
	 * Free up resources that were allocated for us.
	 */
	if (recovp->rc_vp1 != NULL)
		VN_RELE(recovp->rc_vp1);
	if (recovp->rc_vp2 != NULL)
		VN_RELE(recovp->rc_vp2);

	/* now we are done using the mi struct, signal the waiters */
	mutex_enter(&mi->mi_lock);
	mi->mi_in_recovery--;
	if (mi->mi_in_recovery == 0)
		cv_broadcast(&mi->mi_cv_in_recov);
	mutex_exit(&mi->mi_lock);

	VFS_RELE(mi->mi_vfsp);
	MI4_RELE(mi);
	kmem_free(recovp, sizeof (recov_info_t));
	mutex_enter(&cpr_lock);
	CALLB_CPR_EXIT(&cpr_info);
	mutex_destroy(&cpr_lock);
	zthread_exit();
}

/*
 * Log the end of recovery and notify any waiting threads.
 */

static void
recov_done(mntinfo4_t *mi, recov_info_t *recovp)
{

	ASSERT(MUTEX_HELD(&mi->mi_lock));

	nfs4_queue_event(RE_END, mi, NULL, 0, recovp->rc_vp1,
	    recovp->rc_vp2, 0, NULL, 0, TAG_NONE, TAG_NONE, 0, 0);
	mi->mi_recovthread = NULL;
	mi->mi_flags &= ~MI4_RECOV_ACTIV;
	mi->mi_recovflags &= ~MI4R_SRV_REBOOT;
	cv_broadcast(&mi->mi_failover_cv);
}

/*
 * State-specific recovery routines, by state.
 */

/*
 * Failover.
 *
 * Replaces *spp with a reference to the new server, which must
 * eventually be freed.
 */

static void
recov_newserver(recov_info_t *recovp, nfs4_server_t **spp, bool_t *recov_fail)
{
	mntinfo4_t *mi = recovp->rc_mi;
	servinfo4_t *svp = NULL;
	nfs4_server_t *osp = *spp;
	CLIENT *cl;
	enum clnt_stat status;
	struct timeval tv;
	int error;
	int oncethru = 0;
	rnode4_t *rp;
	int index;
	nfs_fh4 fh;
	char *snames;
	size_t len;

	(void) nfs_rw_enter_sig(&mi->mi_recovlock, RW_WRITER, 0);

	tv.tv_sec = 2;
	tv.tv_usec = 0;

#ifdef lint
	/*
	 * Lint can't follow the logic, so thinks that snames and len
	 * can be used before being set.  They can't, but lint can't
	 * figure it out.  To address the lint warning, initialize
	 * snames and len for lint.
	 */
	snames = NULL;
	len = 0;
#endif

	/*
	 * Ping the null NFS procedure of every server in
	 * the list until one responds.  We always start
	 * at the head of the list and always skip the one
	 * that is current, since it's caused us a problem.
	 */
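	/*
	 * Each pass over the server list pings candidates with a NULL RPC
	 * using a short (two second) timeout.  If an entire pass finds no
	 * responsive server, the RF_SRVS_NOT_RESPOND fact is queued once
	 * (oncethru) and the thread sleeps for a second before sweeping
	 * the list again.
	 */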
	while (svp == NULL) {
		for (svp = mi->mi_servers; svp; svp = svp->sv_next) {

			mutex_enter(&mi->mi_lock);
			if (FS_OR_ZONE_GONE4(mi->mi_vfsp)) {
				mi->mi_flags |= MI4_RECOV_FAIL;
				mutex_exit(&mi->mi_lock);
				(void) nfs_rw_exit(&mi->mi_recovlock);
				*recov_fail = TRUE;
				if (oncethru)
					kmem_free(snames, len);
				return;
			}
			mutex_exit(&mi->mi_lock);

			(void) nfs_rw_enter_sig(&svp->sv_lock, RW_READER, 0);
			if (svp->sv_flags & SV4_NOTINUSE) {
				nfs_rw_exit(&svp->sv_lock);
				continue;
			}
			nfs_rw_exit(&svp->sv_lock);

			if (!oncethru && svp == mi->mi_curr_serv)
				continue;

			error = clnt_tli_kcreate(svp->sv_knconf, &svp->sv_addr,
			    NFS_PROGRAM, NFS_V4, 0, 1, CRED(), &cl);
			if (error)
				continue;

			if (!(mi->mi_flags & MI4_INT))
				cl->cl_nosignal = TRUE;
			status = CLNT_CALL(cl, RFS_NULL, xdr_void, NULL,
			    xdr_void, NULL, tv);
			if (!(mi->mi_flags & MI4_INT))
				cl->cl_nosignal = FALSE;
			AUTH_DESTROY(cl->cl_auth);
			CLNT_DESTROY(cl);
			if (status == RPC_SUCCESS) {
				nfs4_queue_event(RE_FAILOVER, mi,
				    svp == mi->mi_curr_serv ? NULL :
				    svp->sv_hostname, 0, NULL, NULL, 0,
				    NULL, 0, TAG_NONE, TAG_NONE, 0, 0);
				break;
			}
		}

		if (svp == NULL) {
			if (!oncethru) {
				snames = nfs4_getsrvnames(mi, &len);
				nfs4_queue_fact(RF_SRVS_NOT_RESPOND, mi,
				    0, 0, 0, FALSE, snames, 0, NULL);
				oncethru = 1;
			}
			delay(hz);
		}
	}

	if (oncethru) {
		nfs4_queue_fact(RF_SRVS_OK, mi, 0, 0, 0, FALSE, snames,
		    0, NULL);
		kmem_free(snames, len);
	}

#if DEBUG
	(void) nfs_rw_enter_sig(&svp->sv_lock, RW_READER, 0);
	ASSERT((svp->sv_flags & SV4_NOTINUSE) == 0);
	nfs_rw_exit(&svp->sv_lock);
#endif

	mutex_enter(&mi->mi_lock);
	mi->mi_recovflags &= ~MI4R_NEED_NEW_SERVER;
	if (svp != mi->mi_curr_serv) {
		servinfo4_t *osvp = mi->mi_curr_serv;

		mutex_exit(&mi->mi_lock);

		/*
		 * Update server-dependent fields in the root vnode.
		 */
		index = rtable4hash(mi->mi_rootfh);
		rw_enter(&rtable4[index].r_lock, RW_WRITER);

		rp = r4find(&rtable4[index], mi->mi_rootfh, mi->mi_vfsp);
		if (rp != NULL) {
			NFS4_DEBUG(nfs4_client_failover_debug, (CE_NOTE,
			    "recov_newserver: remapping %s", rnode4info(rp)));
			mutex_enter(&rp->r_statelock);
			rp->r_server = svp;
			PURGE_ATTRCACHE4_LOCKED(rp);
			mutex_exit(&rp->r_statelock);
			(void) nfs4_free_data_reclaim(rp);
			nfs4_purge_rddir_cache(RTOV4(rp));
			rw_exit(&rtable4[index].r_lock);
			NFS4_DEBUG(nfs4_client_failover_debug, (CE_NOTE,
			    "recov_newserver: done with %s",
			    rnode4info(rp)));
			VN_RELE(RTOV4(rp));
		} else
			rw_exit(&rtable4[index].r_lock);
		(void) dnlc_purge_vfsp(mi->mi_vfsp, 0);

		mutex_enter(&mi->mi_lock);
		mi->mi_recovflags |= MI4R_REOPEN_FILES | MI4R_REMAP_FILES;
		if (recovp->rc_srv_reboot)
			mi->mi_recovflags |= MI4R_SRV_REBOOT;
		mi->mi_curr_serv = svp;
		mi->mi_failover++;
		mi->mi_flags &= ~MI4_BADOWNER_DEBUG;
		mutex_exit(&mi->mi_lock);

		(void) nfs_rw_enter_sig(&svp->sv_lock, RW_READER, 0);
		fh.nfs_fh4_len = svp->sv_fhandle.fh_len;
		fh.nfs_fh4_val = svp->sv_fhandle.fh_buf;
		sfh4_update(mi->mi_rootfh, &fh);
		fh.nfs_fh4_len = svp->sv_pfhandle.fh_len;
		fh.nfs_fh4_val = svp->sv_pfhandle.fh_buf;
		sfh4_update(mi->mi_srvparentfh, &fh);
		nfs_rw_exit(&svp->sv_lock);

		*spp = nfs4_move_mi(mi, osvp, svp);
		if (osp != NULL)
			nfs4_server_rele(osp);
	} else
		mutex_exit(&mi->mi_lock);
	(void) nfs_rw_exit(&mi->mi_recovlock);
}

/*
 * Clientid.
 */

static void
recov_clientid(recov_info_t *recovp, nfs4_server_t *sp)
{
	mntinfo4_t *mi = recovp->rc_mi;
	int error = 0;
	int still_stale;
	int need_new_s;

	ASSERT(sp != NULL);

	/*
	 * Acquire the recovery lock and then verify that the clientid
	 * still needs to be recovered.  (Note that s_recovlock is supposed
	 * to be acquired before s_lock.)  Since the thread holds the
	 * recovery lock, no other thread will recover the clientid.
	 */
	(void) nfs_rw_enter_sig(&sp->s_recovlock, RW_WRITER, 0);
	(void) nfs_rw_enter_sig(&mi->mi_recovlock, RW_WRITER, 0);
	mutex_enter(&sp->s_lock);
	still_stale = ((sp->s_flags & N4S_CLIENTID_SET) == 0);
	mutex_exit(&sp->s_lock);

	if (still_stale) {
		nfs4_error_t n4e;

		nfs4_error_zinit(&n4e);
		nfs4setclientid(mi, kcred, TRUE, &n4e);
		error = n4e.error;
		if (error != 0) {

			/*
			 * nfs4setclientid may have set MI4R_NEED_NEW_SERVER;
			 * if so, just return and let recov_thread drive
			 * failover.
			 */
			mutex_enter(&mi->mi_lock);
			need_new_s = mi->mi_recovflags & MI4R_NEED_NEW_SERVER;
			mutex_exit(&mi->mi_lock);

			if (need_new_s) {
				nfs_rw_exit(&mi->mi_recovlock);
				nfs_rw_exit(&sp->s_recovlock);
				return;
			}

			nfs4_queue_event(RE_CLIENTID, mi, NULL, n4e.error, NULL,
			    NULL, n4e.stat, NULL, 0, TAG_NONE, TAG_NONE, 0, 0);
			mutex_enter(&mi->mi_lock);
			mi->mi_flags |= MI4_RECOV_FAIL;
			mi->mi_error = recovp->rc_error;
			mutex_exit(&mi->mi_lock);
			/* don't destroy the nfs4_server, let umount do it */
		}
	}

	if (error == 0) {
		mutex_enter(&mi->mi_lock);
		mi->mi_recovflags &= ~MI4R_NEED_CLIENTID;
		/*
		 * If still_stale isn't true, then another thread already
		 * recovered the clientid.  And that thread that set the
		 * clientid will have initiated reopening files on all the
		 * filesystems for the server, so we should not initiate
		 * reopening for this filesystem here.
		 */
		if (still_stale) {
			mi->mi_recovflags |= MI4R_REOPEN_FILES;
			if (recovp->rc_srv_reboot)
				mi->mi_recovflags |= MI4R_SRV_REBOOT;
		}
		mutex_exit(&mi->mi_lock);
	}

	nfs_rw_exit(&mi->mi_recovlock);

	if (error != 0) {
		nfs_rw_exit(&sp->s_recovlock);
		mutex_enter(&mi->mi_lock);
		if ((mi->mi_flags & MI4_RECOV_FAIL) == 0)
			delay(SEC_TO_TICK(recov_err_delay));
		mutex_exit(&mi->mi_lock);
	} else {
		mntinfo4_t **milist;
		mntinfo4_t *tmi;
		int nummi, i;

		/*
		 * Initiate recovery of open files for other filesystems.
		 * We create an array of filesystems, rather than just
		 * walking the filesystem list, to avoid deadlock issues
		 * with s_lock and mi_recovlock.
		 */
		milist = make_milist(sp, &nummi);
		for (i = 0; i < nummi; i++) {
			tmi = milist[i];
			if (tmi != mi) {
				(void) nfs_rw_enter_sig(&tmi->mi_recovlock,
				    RW_READER, 0);
				start_recovery_action(NR_OPENFILES, TRUE, tmi,
				    NULL, NULL);
				nfs_rw_exit(&tmi->mi_recovlock);
			}
		}
		free_milist(milist, nummi);

		nfs_rw_exit(&sp->s_recovlock);
	}
}

/*
 * Return an array of filesystems associated with the given server.  The
 * caller should call free_milist() to free the references and memory.
 */

static mntinfo4_t **
make_milist(nfs4_server_t *sp, int *nummip)
{
	int nummi, i;
	mntinfo4_t **milist;
	mntinfo4_t *tmi;

	mutex_enter(&sp->s_lock);
	nummi = 0;
	for (tmi = sp->mntinfo4_list; tmi != NULL; tmi = tmi->mi_clientid_next)
		nummi++;

	milist = kmem_alloc(nummi * sizeof (mntinfo4_t *), KM_NOSLEEP);

	for (i = 0, tmi = sp->mntinfo4_list; tmi != NULL; i++,
	    tmi = tmi->mi_clientid_next) {
		milist[i] = tmi;
		VFS_HOLD(tmi->mi_vfsp);
	}
	mutex_exit(&sp->s_lock);

	*nummip = nummi;
	return (milist);
}

/*
 * Free the filesystem list created by make_milist().
 */

static void
free_milist(mntinfo4_t **milist, int nummi)
{
	mntinfo4_t *tmi;
	int i;

	for (i = 0; i < nummi; i++) {
		tmi = milist[i];
		VFS_RELE(tmi->mi_vfsp);
	}
	kmem_free(milist, nummi * sizeof (mntinfo4_t *));
}

/*
 * Filehandle
 */

/*
 * Lookup the filehandle for the given vnode and update the rnode if it has
 * changed.
 *
 * Errors:
 * - if the filehandle could not be updated because of an error that
 *   requires further recovery, initiate that recovery and return.
 * - if the filehandle could not be updated because of a signal, pretend we
 *   succeeded and let someone else deal with it.
 * - if the filehandle could not be updated and the filesystem has been
 *   forcibly unmounted, pretend we succeeded, and let the caller deal with
 *   the forced unmount (to retry or not to retry, that is the question).
 * - if the filehandle could not be updated because of some other error,
 *   mark the rnode bad and return.
 */
/*
 * Filehandle
 */

/*
 * Lookup the filehandle for the given vnode and update the rnode if it has
 * changed.
 *
 * Errors:
 * - if the filehandle could not be updated because of an error that
 *   requires further recovery, initiate that recovery and return.
 * - if the filehandle could not be updated because of a signal, pretend we
 *   succeeded and let someone else deal with it.
 * - if the filehandle could not be updated and the filesystem has been
 *   forcibly unmounted, pretend we succeeded, and let the caller deal with
 *   the forced unmount (to retry or not to retry, that is the question).
 * - if the filehandle could not be updated because of some other error,
 *   mark the rnode bad and return.
 */
static void
recov_filehandle(nfs4_recov_t action, mntinfo4_t *mi, vnode_t *vp)
{
	rnode4_t *rp = VTOR4(vp);
	nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };
	bool_t needrecov;

	mutex_enter(&rp->r_statelock);

	if (rp->r_flags & R4RECOVERR) {
		mutex_exit(&rp->r_statelock);
		return;
	}

	/*
	 * If someone else is updating the filehandle, wait for them to
	 * finish and then let our caller retry.
	 */
	if (rp->r_flags & R4RECEXPFH) {
		while (rp->r_flags & R4RECEXPFH) {
			cv_wait(&rp->r_cv, &rp->r_statelock);
		}
		mutex_exit(&rp->r_statelock);
		return;
	}
	rp->r_flags |= R4RECEXPFH;
	mutex_exit(&rp->r_statelock);

	if (action == NR_BADHANDLE) {
		/* shouldn't happen */
		nfs4_queue_event(RE_BADHANDLE, mi, NULL, 0,
		    vp, NULL, 0, NULL, 0, TAG_NONE, TAG_NONE, 0, 0);
	}

	nfs4_remap_file(mi, vp, 0, &e);
	needrecov = nfs4_needs_recovery(&e, FALSE, mi->mi_vfsp);

	/*
	 * If we get BADHANDLE or FHEXPIRED while we are trying to recover
	 * from BADHANDLE or FHEXPIRED, something is broken.  Don't try to
	 * recover, just mark the file dead.
	 */
	if (needrecov && e.error == 0 &&
	    (e.stat == NFS4ERR_BADHANDLE || e.stat == NFS4ERR_FHEXPIRED))
		needrecov = FALSE;
	if (needrecov) {
		(void) nfs4_start_recovery(&e, mi, vp,
		    NULL, NULL, NULL, OP_LOOKUP, NULL);
	} else if (e.error != EINTR &&
	    !NFS4_FRC_UNMT_ERR(e.error, mi->mi_vfsp) &&
	    (e.error != 0 || e.stat != NFS4_OK)) {
		nfs4_recov_fh_fail(vp, e.error, e.stat);
		/*
		 * Don't set r_error to ESTALE.  Higher-level code (e.g.,
		 * cstatat_getvp()) retries on ESTALE, which would cause
		 * an infinite loop.
		 */
	}

	mutex_enter(&rp->r_statelock);
	rp->r_flags &= ~R4RECEXPFH;
	cv_broadcast(&rp->r_cv);
	mutex_exit(&rp->r_statelock);
}

/*
 * Stale Filehandle
 */

/*
 * A stale filehandle can happen when an individual file has
 * been removed, or when an entire filesystem has been taken
 * offline.
 * To distinguish these cases, we do this:
 * - if a GETATTR with the current filehandle is okay, we do
 *   nothing (this can happen with two-filehandle ops)
 * - if the GETATTR fails, but a GETATTR of the root filehandle
 *   succeeds, mark the rnode with R4STALE, which will stop further
 *   use of the file
 * - if the GETATTR fails, and a GETATTR of the root filehandle
 *   also fails, we consider the problem filesystem-wide, so:
 *   - if we can failover, we should
 *   - if we can't failover, we should mark both the original
 *     vnode and the root bad
 */
static void
recov_stale(mntinfo4_t *mi, vnode_t *vp)
{
	rnode4_t *rp = VTOR4(vp);
	vnode_t *rootvp = NULL;
	nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };
	nfs4_ga_res_t gar;
	char *fail_msg = "failed to recover from NFS4ERR_STALE";
	bool_t needrecov;

	mutex_enter(&rp->r_statelock);

	if (rp->r_flags & R4RECOVERR) {
		mutex_exit(&rp->r_statelock);
		NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
		    "recov_stale: already marked dead, rp %s",
		    rnode4info(rp)));
		return;
	}

	if (rp->r_flags & R4STALE) {
		mutex_exit(&rp->r_statelock);
		NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
		    "recov_stale: already marked stale, rp %s",
		    rnode4info(rp)));
		return;
	}

	mutex_exit(&rp->r_statelock);

	/* Try a GETATTR on this vnode */
	nfs4_getattr_otw_norecovery(vp, &gar, &e, CRED(), 0);

	/*
	 * Handle non-STALE recoverable errors
	 */
	needrecov = nfs4_needs_recovery(&e, FALSE, vp->v_vfsp);
	if (needrecov && (e.error != 0 || e.stat != NFS4ERR_STALE)) {
		(void) nfs4_start_recovery(&e, mi, vp,
		    NULL, NULL, NULL, OP_GETATTR, NULL);
		NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
		    "recov_stale: error=%d, stat=%d seen on rp %s",
		    e.error, e.stat, rnode4info(rp)));
		goto out;
	}

	/* Are things OK for this vnode? */
	if (!e.error && e.stat == NFS4_OK) {
		NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
		    "recov_stale: file appears fine, rp %s",
		    rnode4info(rp)));
		goto out;
	}

	/* Did we get an unrelated non-recoverable error? */
	if (e.error || e.stat != NFS4ERR_STALE) {
		nfs4_fail_recov(vp, fail_msg, e.error, e.stat);
		NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
		    "recov_stale: unrelated fatal error, rp %s",
		    rnode4info(rp)));
		goto out;
	}

	/*
	 * If we don't appear to be dealing with the root node, find it.
	 */
	if ((vp->v_flag & VROOT) == 0) {
		nfs4_error_zinit(&e);
		e.error = VFS_ROOT(vp->v_vfsp, &rootvp);
		if (e.error) {
			nfs4_fail_recov(vp, fail_msg, 0, NFS4ERR_STALE);
			NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
			    "recov_stale: can't find root node for rp %s",
			    rnode4info(rp)));
			goto out;
		}
	}

	/* Try a GETATTR on the root vnode */
	if (rootvp != NULL) {
		nfs4_error_zinit(&e);
		nfs4_getattr_otw_norecovery(rootvp, &gar, &e, CRED(), 0);

		/* Try recovery? */
		if (e.error != 0 || e.stat != NFS4ERR_STALE) {
			needrecov = nfs4_needs_recovery(&e, FALSE, vp->v_vfsp);
			if (needrecov) {
				(void) nfs4_start_recovery(&e,
				    mi, rootvp, NULL, NULL, NULL,
				    OP_GETATTR, NULL);
				NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
				    "recov_stale: error=%d, stat=%d seen "
				    "on rp %s", e.error, e.stat,
				    rnode4info(rp)));
			}
		}

		/*
		 * Check to see if a failover attempt is warranted
		 * NB: nfs4_try_failover doesn't check for STALE
		 * because recov_stale gets a shot first.  Now that
		 * recov_stale has failed, go ahead and try failover.
		 *
		 * If the getattr on the root filehandle was successful,
		 * then mark recovery as failed for 'vp' and exit.
		 */
		if (nfs4_try_failover(&e) == 0 && e.stat != NFS4ERR_STALE) {
			/*
			 * pass the original error to fail_recov, not
			 * the one from trying the root vnode.
			 */
			nfs4_fail_recov(vp, fail_msg, 0, NFS4ERR_STALE);
			NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
			    "recov_stale: root node OK, marking "
			    "dead rp %s", rnode4info(rp)));
			goto out;
		}
	}

	/*
	 * Here, we know that both the original file and the
	 * root filehandle (which may be the same) are stale.
	 * We want to fail over if we can, and if we can't, we
	 * want to mark everything in sight bad.
	 */
	if (FAILOVER_MOUNT4(mi)) {
		mutex_enter(&mi->mi_lock);
		mi->mi_recovflags |= MI4R_NEED_NEW_SERVER;
		NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
		    "recov_stale: failing over due to rp %s",
		    rnode4info(rp)));
		mutex_exit(&mi->mi_lock);
	} else {
		rnode4_t *rootrp;
		servinfo4_t *svp;

		/*
		 * Can't fail over, so mark things dead.
		 *
		 * If rootvp is set, we know we have a distinct
		 * non-root vnode which can be marked dead in
		 * the usual way.
		 *
		 * Then we want to mark the root vnode dead.
		 * Note that if rootvp wasn't set, our vp is
		 * actually the root vnode.
		 */
		if (rootvp != NULL) {
			NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
			    "recov_stale: can't fail over, marking dead rp %s",
			    rnode4info(rp)));
			nfs4_fail_recov(vp, fail_msg, 0, NFS4ERR_STALE);
		} else {
			rootvp = vp;
			VN_HOLD(rootvp);
		}

		/*
		 * Mark root dead, but quietly - since
		 * the root rnode is frequently recreated,
		 * we can encounter this at every access.
		 * Also mark recovery as failed on this VFS.
		 */
		rootrp = VTOR4(rootvp);
		NFS4_DEBUG(nfs4_client_recov_debug, (CE_CONT,
		    "recov_stale: marking dead root rp %s",
		    rnode4info(rootrp)));
		mutex_enter(&rootrp->r_statelock);
		rootrp->r_flags |= (R4RECOVERR | R4STALE);
		rootrp->r_error = ESTALE;
		mutex_exit(&rootrp->r_statelock);
		mutex_enter(&mi->mi_lock);
		mi->mi_error = ESTALE;
		mutex_exit(&mi->mi_lock);

		svp = mi->mi_curr_serv;
		(void) nfs_rw_enter_sig(&svp->sv_lock, RW_WRITER, 0);
		svp->sv_flags |= SV4_ROOT_STALE;
		nfs_rw_exit(&svp->sv_lock);
	}

out:
	if (rootvp)
		VN_RELE(rootvp);
}
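/*
 * Editorial summary (not part of the original source): outcomes of the
 * two GETATTRs performed by recov_stale() above.
 *
 *	GETATTR(vp)	GETATTR(root)	action
 *	OK		not sent	nothing to do
 *	STALE		OK		mark vp dead
 *	STALE		STALE		fail over if FAILOVER_MOUNT4(mi);
 *					otherwise mark vp and the root
 *					dead and set SV4_ROOT_STALE
 */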
/*
 * Locks.
 */

/*
 * Reclaim all the active (acquired) locks for the given file.
 * If a process lost a lock, the process is sent a SIGLOST.  This is not
 * considered an error.
 *
 * Return values:
 * Errors and status are returned via the nfs4_error_t parameter.
 * If an error indicates that recovery is needed, the caller is responsible
 * for dealing with it.
 */

static void
relock_file(vnode_t *vp, mntinfo4_t *mi, nfs4_error_t *ep,
    fattr4_change pre_change)
{
	locklist_t *locks, *llp;
	rnode4_t *rp;

	ASSERT(ep != NULL);
	nfs4_error_zinit(ep);

	if (VTOMI4(vp)->mi_flags & MI4_LLOCK)
		return;

	nfs4_flush_lock_owners(VTOR4(vp));

	/*
	 * If we get an error that requires recovery actions, just bail out
	 * and let the top-level recovery code handle it.
	 *
	 * If we get some other error, kill the process that owned the lock
	 * and mark its remaining locks (if any) as belonging to NOPID, so
	 * that we don't make any more reclaim requests for that process.
	 */

	rp = VTOR4(vp);
	locks = flk_active_locks_for_vp(vp);
	for (llp = locks; llp != NULL; llp = llp->ll_next) {
		int did_reclaim = 1;

		ASSERT(llp->ll_vp == vp);
		if (llp->ll_flock.l_pid == NOPID)
			continue;
		reclaim_one_lock(vp, &llp->ll_flock, ep, &did_reclaim);
		/*
		 * If we need to restart recovery, stop processing the
		 * list.  Some errors would be recoverable under other
		 * circumstances, but if they happen here we just give up
		 * on the lock.
		 */
		if (nfs4_needs_recovery(ep, TRUE, vp->v_vfsp)) {
			if (ep->error != 0)
				break;
			if (!nfs4_recov_marks_dead(ep->stat))
				break;
		}
		/*
		 * In case the server isn't offering us a grace period, or
		 * if we missed it, we might have opened & locked from
		 * scratch, rather than reopened/reclaimed.
		 * We need to ensure that the object hadn't been otherwise
		 * changed during this time, by comparing the changeinfo.
		 * We get passed the changeinfo from before the reopen by
		 * our caller, in pre_change.
		 * The changeinfo from after the reopen is in rp->r_change,
		 * courtesy of the GETATTR in the reopen.
		 * If they're different, then the file has changed, and we
		 * have to SIGLOST the app.  For example, if pre_change was
		 * 0x42 and the reopen's GETATTR shows r_change == 0x43,
		 * another client modified the file while our lock was not
		 * in force.
		 */
		if (ep->error == 0 && ep->stat == NFS4_OK && !did_reclaim) {
			mutex_enter(&rp->r_statelock);
			if (pre_change != rp->r_change)
				ep->stat = NFS4ERR_NO_GRACE;
			mutex_exit(&rp->r_statelock);
		}
		if (ep->error != 0 || ep->stat != NFS4_OK) {
			if (ep->error != 0)
				nfs4_queue_event(RE_FAIL_RELOCK, mi,
				    NULL, ep->error, vp, NULL, 0, NULL,
				    llp->ll_flock.l_pid, TAG_NONE, TAG_NONE,
				    0, 0);
			else
				nfs4_queue_event(RE_FAIL_RELOCK, mi,
				    NULL, 0, vp, NULL, ep->stat, NULL,
				    llp->ll_flock.l_pid, TAG_NONE, TAG_NONE,
				    0, 0);
			nfs4_send_siglost(llp->ll_flock.l_pid, mi, vp, TRUE,
			    ep->error, ep->stat);
			relock_skip_pid(llp, llp->ll_flock.l_pid);

			/* Reinitialize the nfs4_error and continue */
			nfs4_error_zinit(ep);
		}
	}

	if (locks != NULL)
		flk_free_locklist(locks);
}

/*
 * Reclaim the given lock.
 * If the lock can't be reclaimed, the process is sent SIGLOST, but this is
 * not considered an error.
 *
 * Errors are returned via the nfs4_error_t parameter.
 */
static void
reclaim_one_lock(vnode_t *vp, flock64_t *flk, nfs4_error_t *ep,
    int *did_reclaimp)
{
	cred_t *cr;
	rnode4_t *rp = VTOR4(vp);

	cr = pid_to_cr(flk->l_pid);
	if (cr == NULL) {
		nfs4_error_zinit(ep);
		ep->error = ESRCH;
		return;
	}

	do {
		mutex_enter(&rp->r_statelock);
		if (rp->r_flags & R4RECOVERR) {
			/*
			 * This shouldn't affect other reclaims, so don't
			 * return an error.
			 */
			mutex_exit(&rp->r_statelock);
			break;
		}
		mutex_exit(&rp->r_statelock);

		nfs4frlock(NFS4_LCK_CTYPE_RECLAIM, vp, F_SETLK, flk,
		    FREAD|FWRITE, 0, cr, ep, NULL, did_reclaimp);
		if (ep->error == 0 && ep->stat == NFS4ERR_FHEXPIRED)
			start_recovery_action(NR_FHEXPIRED, TRUE, VTOMI4(vp),
			    vp, NULL);
	} while (ep->error == 0 && ep->stat == NFS4ERR_FHEXPIRED);

	crfree(cr);
}

/*
 * Open files.
 */

/*
 * Verifies if the nfsstat4 is a valid error for marking this vnode dead.
 * Returns 1 if the error is valid; 0 otherwise.
 */
static int
nfs4_valid_recov_err_for_vp(vnode_t *vp, nfsstat4 stat)
{
	/*
	 * We should not be marking non-regular files as dead,
	 * except in very rare cases (eg: BADHANDLE or NFS4ERR_BADNAME).
	 */
	if (vp->v_type != VREG && stat != NFS4ERR_BADHANDLE &&
	    stat != NFS4ERR_BADNAME)
		return (0);

	return (1);
}

/*
 * Failed attempting to recover a filehandle.  If 'stat' is valid for 'vp',
 * then mark the object dead.  Since we've had to do a lookup for
 * filehandle recovery, we will mark the object dead if we got NOENT.
 */
static void
nfs4_recov_fh_fail(vnode_t *vp, int error, nfsstat4 stat)
{
	ASSERT(vp != NULL);

	if ((error == 0) && (stat != NFS4ERR_NOENT) &&
	    (!nfs4_valid_recov_err_for_vp(vp, stat)))
		return;

	nfs4_fail_recov(vp, "can't recover filehandle", error, stat);
}

/*
 * Recovery from a "shouldn't happen" error.  In the long term, we'd like
 * to mark only the data structure(s) that provided the bad value as being
 * bad.  But for now we'll just mark the entire file.
 */

static void
recov_badstate(recov_info_t *recovp, vnode_t *vp, nfsstat4 stat)
{
	ASSERT(vp != NULL);
	recov_throttle(recovp, vp);

	if (!nfs4_valid_recov_err_for_vp(vp, stat))
		return;

	nfs4_fail_recov(vp, "", 0, stat);
}
/*
 * Free up the information saved for a lost state request.
 */
static void
nfs4_free_lost_rqst(nfs4_lost_rqst_t *lrp, nfs4_server_t *sp)
{
	component4 *filep;
	nfs4_open_stream_t *osp;
	int have_sync_lock;

	NFS4_DEBUG(nfs4_lost_rqst_debug,
	    (CE_NOTE, "nfs4_free_lost_rqst:"));

	switch (lrp->lr_op) {
	case OP_OPEN:
		filep = &lrp->lr_ofile;
		if (filep->utf8string_val) {
			kmem_free(filep->utf8string_val, filep->utf8string_len);
			filep->utf8string_val = NULL;
		}
		break;
	case OP_DELEGRETURN:
		nfs4delegreturn_cleanup(VTOR4(lrp->lr_vp), sp);
		break;
	case OP_CLOSE:
		osp = lrp->lr_osp;
		ASSERT(osp != NULL);
		mutex_enter(&osp->os_sync_lock);
		have_sync_lock = 1;
		if (osp->os_pending_close) {
			/* clean up the open file state. */
			osp->os_pending_close = 0;
			nfs4close_notw(lrp->lr_vp, osp, &have_sync_lock);
		}
		if (have_sync_lock)
			mutex_exit(&osp->os_sync_lock);
		break;
	}

	lrp->lr_op = 0;
	if (lrp->lr_oop != NULL) {
		open_owner_rele(lrp->lr_oop);
		lrp->lr_oop = NULL;
	}
	if (lrp->lr_osp != NULL) {
		open_stream_rele(lrp->lr_osp, VTOR4(lrp->lr_vp));
		lrp->lr_osp = NULL;
	}
	if (lrp->lr_lop != NULL) {
		lock_owner_rele(lrp->lr_lop);
		lrp->lr_lop = NULL;
	}
	if (lrp->lr_flk != NULL) {
		kmem_free(lrp->lr_flk, sizeof (flock64_t));
		lrp->lr_flk = NULL;
	}
	if (lrp->lr_vp != NULL) {
		VN_RELE(lrp->lr_vp);
		lrp->lr_vp = NULL;
	}
	if (lrp->lr_dvp != NULL) {
		VN_RELE(lrp->lr_dvp);
		lrp->lr_dvp = NULL;
	}
	if (lrp->lr_cr != NULL) {
		crfree(lrp->lr_cr);
		lrp->lr_cr = NULL;
	}

	kmem_free(lrp, sizeof (nfs4_lost_rqst_t));
}

/*
 * Remove any lost state requests and free them.
 */
static void
nfs4_remove_lost_rqsts(mntinfo4_t *mi, nfs4_server_t *sp)
{
	nfs4_lost_rqst_t *lrp;

	mutex_enter(&mi->mi_lock);
	while ((lrp = list_head(&mi->mi_lost_state)) != NULL) {
		list_remove(&mi->mi_lost_state, lrp);
		mutex_exit(&mi->mi_lock);
		nfs4_free_lost_rqst(lrp, sp);
		mutex_enter(&mi->mi_lock);
	}
	mutex_exit(&mi->mi_lock);
}

/*
 * Reopen all the files for the given filesystem and reclaim any locks.
 */

static void
recov_openfiles(recov_info_t *recovp, nfs4_server_t *sp)
{
	mntinfo4_t *mi = recovp->rc_mi;
	nfs4_opinst_t *reopenlist = NULL, *rep;
	nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };
	open_claim_type4 claim;
	int remap;
	char *fail_msg = "No such file or directory on replica";
	rnode4_t *rp;
	fattr4_change pre_change;

	ASSERT(sp != NULL);

	/*
	 * This check is to allow a 10ms pause before we reopen files; it
	 * should allow the server time to have received the CB_NULL
	 * reply and update its internal structures such that (if
	 * applicable) we are granted a delegation on reopened files.
	 */
	mutex_enter(&sp->s_lock);
	if ((sp->s_flags & (N4S_CB_PINGED | N4S_CB_WAITER)) == 0) {
		sp->s_flags |= N4S_CB_WAITER;
		(void) cv_timedwait(&sp->wait_cb_null, &sp->s_lock,
		    (lbolt + drv_usectohz(N4S_CB_PAUSE_TIME)));
	}
	mutex_exit(&sp->s_lock);

	(void) nfs_rw_enter_sig(&sp->s_recovlock, RW_READER, 0);
	(void) nfs_rw_enter_sig(&mi->mi_recovlock, RW_WRITER, 0);

	if (NFS4_VOLATILE_FH(mi)) {
		nfs4_remap_root(mi, &e, 0);
		if (nfs4_needs_recovery(&e, FALSE, mi->mi_vfsp)) {
			(void) nfs4_start_recovery(&e, mi, NULL,
			    NULL, NULL, NULL, OP_LOOKUP, NULL);
		}
	}

	mutex_enter(&mi->mi_lock);
	if (recovp->rc_srv_reboot || (mi->mi_recovflags & MI4R_SRV_REBOOT))
		claim = CLAIM_PREVIOUS;
	else
		claim = CLAIM_NULL;
	mutex_exit(&mi->mi_lock);

	if (e.error == 0 && e.stat == NFS4_OK) {
		/*
		 * Get a snapshot of open files in the filesystem.  Note
		 * that new opens will stall until the server's grace
		 * period is done.
2554 */ 2555 reopenlist = r4mkopenlist(mi); 2556 2557 mutex_enter(&mi->mi_lock); 2558 remap = mi->mi_recovflags & MI4R_REMAP_FILES; 2559 mutex_exit(&mi->mi_lock); 2560 /* 2561 * Since we are re-establishing state on the 2562 * server, its ok to blow away the saved lost 2563 * requests since we don't need to reissue it. 2564 */ 2565 nfs4_remove_lost_rqsts(mi, sp); 2566 2567 for (rep = reopenlist; rep; rep = rep->re_next) { 2568 2569 if (remap) { 2570 nfs4_remap_file(mi, rep->re_vp, 2571 NFS4_REMAP_CKATTRS, &e); 2572 } 2573 if (e.error == ENOENT || e.stat == NFS4ERR_NOENT) { 2574 /* 2575 * The current server does not have the file 2576 * that is to be remapped. This is most 2577 * likely due to an improperly maintained 2578 * replica. The files that are missing from 2579 * the server will be marked dead and logged 2580 * in order to make sys admins aware of the 2581 * problem. 2582 */ 2583 nfs4_fail_recov(rep->re_vp, 2584 fail_msg, e.error, e.stat); 2585 /* 2586 * We've already handled the error so clear it. 2587 */ 2588 nfs4_error_zinit(&e); 2589 continue; 2590 } else if (e.error == 0 && e.stat == NFS4_OK) { 2591 int j; 2592 2593 rp = VTOR4(rep->re_vp); 2594 mutex_enter(&rp->r_statelock); 2595 pre_change = rp->r_change; 2596 mutex_exit(&rp->r_statelock); 2597 2598 for (j = 0; j < rep->re_numosp; j++) { 2599 nfs4_reopen(rep->re_vp, rep->re_osp[j], 2600 &e, claim, FALSE, TRUE); 2601 if (e.error != 0 || e.stat != NFS4_OK) 2602 break; 2603 } 2604 if (nfs4_needs_recovery(&e, TRUE, 2605 mi->mi_vfsp)) { 2606 (void) nfs4_start_recovery(&e, mi, 2607 rep->re_vp, NULL, NULL, NULL, 2608 OP_OPEN, NULL); 2609 break; 2610 } 2611 } 2612 #ifdef DEBUG 2613 if (nfs4_recovdelay > 0) 2614 delay(MSEC_TO_TICK(nfs4_recovdelay * 1000)); 2615 #endif 2616 if (e.error == 0 && e.stat == NFS4_OK) 2617 relock_file(rep->re_vp, mi, &e, pre_change); 2618 2619 if (nfs4_needs_recovery(&e, TRUE, mi->mi_vfsp)) 2620 (void) nfs4_start_recovery(&e, mi, 2621 rep->re_vp, NULL, NULL, NULL, OP_LOCK, 2622 NULL); 2623 if (e.error != 0 || e.stat != NFS4_OK) 2624 break; 2625 } 2626 2627 /* 2628 * Check to see if we need to remap files passed in 2629 * via the recovery arguments; this will have been 2630 * done for open files. A failure here is not fatal. 2631 */ 2632 if (remap) { 2633 nfs4_error_t ignore; 2634 nfs4_check_remap(mi, recovp->rc_vp1, NFS4_REMAP_CKATTRS, 2635 &ignore); 2636 nfs4_check_remap(mi, recovp->rc_vp2, NFS4_REMAP_CKATTRS, 2637 &ignore); 2638 } 2639 } 2640 2641 if (e.error == 0 && e.stat == NFS4_OK) { 2642 mutex_enter(&mi->mi_lock); 2643 mi->mi_recovflags &= ~(MI4R_REOPEN_FILES | MI4R_REMAP_FILES); 2644 mutex_exit(&mi->mi_lock); 2645 } 2646 2647 nfs_rw_exit(&mi->mi_recovlock); 2648 nfs_rw_exit(&sp->s_recovlock); 2649 2650 if (reopenlist != NULL) 2651 r4releopenlist(reopenlist); 2652 } 2653 2654 /* 2655 * Resend the queued state recovery requests in "rqsts". 
2656 */ 2657 2658 static void 2659 nfs4_resend_lost_rqsts(recov_info_t *recovp, nfs4_server_t *sp) 2660 { 2661 nfs4_lost_rqst_t *lrp, *tlrp; 2662 mntinfo4_t *mi = recovp->rc_mi; 2663 nfs4_error_t n4e; 2664 #ifdef NOTYET 2665 uint32_t deny_bits = 0; 2666 #endif 2667 2668 NFS4_DEBUG(nfs4_lost_rqst_debug, (CE_NOTE, "nfs4_resend_lost_rqsts")); 2669 2670 ASSERT(mi != NULL); 2671 ASSERT(nfs_rw_lock_held(&mi->mi_recovlock, RW_WRITER)); 2672 2673 mutex_enter(&mi->mi_lock); 2674 lrp = list_head(&mi->mi_lost_state); 2675 mutex_exit(&mi->mi_lock); 2676 while (lrp != NULL) { 2677 nfs4_error_zinit(&n4e); 2678 resend_one_op(lrp, &n4e, mi, sp); 2679 NFS4_DEBUG(nfs4_lost_rqst_debug, (CE_NOTE, 2680 "nfs4_resend_lost_rqsts: resend request: for vp %p got " 2681 "error %d stat %d", (void *)lrp->lr_vp, n4e.error, 2682 n4e.stat)); 2683 2684 /* 2685 * If we get a recovery error that we can actually 2686 * recover from (such as ETIMEDOUT, FHEXPIRED), we 2687 * return and let the recovery thread redrive the call. 2688 * Don't requeue unless the zone is still healthy. 2689 */ 2690 if (zone_status_get(curproc->p_zone) < ZONE_IS_SHUTTING_DOWN && 2691 nfs4_needs_recovery(&n4e, TRUE, mi->mi_vfsp) && 2692 (nfs4_try_failover(&n4e) || 2693 NFS4_FRC_UNMT_ERR(n4e.error, mi->mi_vfsp) || 2694 (n4e.error == 0 && n4e.stat != NFS4ERR_BADHANDLE && 2695 !nfs4_recov_marks_dead(n4e.stat)))) { 2696 /* 2697 * For these three errors, we want to delay a bit 2698 * instead of pounding the server into submission. 2699 * We have to do this manually; the normal 2700 * processing for these errors only works for 2701 * non-recovery requests. 2702 */ 2703 if ((n4e.error == 0 && n4e.stat == NFS4ERR_DELAY) || 2704 (n4e.error == 0 && n4e.stat == NFS4ERR_GRACE) || 2705 (n4e.error == 0 && n4e.stat == NFS4ERR_RESOURCE) || 2706 NFS4_FRC_UNMT_ERR(n4e.error, mi->mi_vfsp)) { 2707 delay(SEC_TO_TICK(nfs4err_delay_time)); 2708 } else { 2709 (void) nfs4_start_recovery(&n4e, 2710 mi, lrp->lr_dvp, lrp->lr_vp, NULL, NULL, 2711 lrp->lr_op, NULL); 2712 } 2713 return; 2714 } 2715 2716 mutex_enter(&mi->mi_lock); 2717 list_remove(&mi->mi_lost_state, lrp); 2718 tlrp = lrp; 2719 lrp = list_head(&mi->mi_lost_state); 2720 mutex_exit(&mi->mi_lock); 2721 nfs4_free_lost_rqst(tlrp, sp); 2722 } 2723 } 2724 2725 /* 2726 * Resend the given op, and issue any necessary undo call. 2727 * errors are returned via the nfs4_error_t parameter. 
2728 */ 2729 2730 static void 2731 resend_one_op(nfs4_lost_rqst_t *lrp, nfs4_error_t *ep, 2732 mntinfo4_t *mi, nfs4_server_t *sp) 2733 { 2734 vnode_t *vp; 2735 nfs4_open_stream_t *osp; 2736 cred_t *cr; 2737 uint32_t acc_bits; 2738 2739 vp = lrp->lr_vp; 2740 NFS4_DEBUG(nfs4_lost_rqst_debug, (CE_NOTE, "resend_one_op: " 2741 "have a lost open/close request for vp %p", (void *)vp)); 2742 2743 switch (lrp->lr_op) { 2744 case OP_OPEN: 2745 nfs4_resend_open_otw(&vp, lrp, ep); 2746 break; 2747 case OP_OPEN_DOWNGRADE: 2748 ASSERT(lrp->lr_oop != NULL); 2749 ep->error = nfs4_start_open_seqid_sync(lrp->lr_oop, mi); 2750 ASSERT(!ep->error); /* recov thread always succeeds */ 2751 ASSERT(lrp->lr_osp != NULL); 2752 mutex_enter(&lrp->lr_osp->os_sync_lock); 2753 nfs4_open_downgrade(lrp->lr_dg_acc, lrp->lr_dg_deny, 2754 lrp->lr_oop, lrp->lr_osp, vp, lrp->lr_cr, lrp, 2755 ep, NULL, NULL); 2756 mutex_exit(&lrp->lr_osp->os_sync_lock); 2757 nfs4_end_open_seqid_sync(lrp->lr_oop); 2758 break; 2759 case OP_CLOSE: 2760 osp = lrp->lr_osp; 2761 cr = lrp->lr_cr; 2762 acc_bits = 0; 2763 mutex_enter(&osp->os_sync_lock); 2764 if (osp->os_share_acc_read) 2765 acc_bits |= OPEN4_SHARE_ACCESS_READ; 2766 if (osp->os_share_acc_write) 2767 acc_bits |= OPEN4_SHARE_ACCESS_WRITE; 2768 mutex_exit(&osp->os_sync_lock); 2769 nfs4close_one(vp, osp, cr, acc_bits, lrp, ep, 2770 CLOSE_RESEND, 0, 0, 0); 2771 break; 2772 case OP_LOCK: 2773 case OP_LOCKU: 2774 resend_lock(lrp, ep); 2775 goto done; 2776 case OP_DELEGRETURN: 2777 nfs4_resend_delegreturn(lrp, ep, sp); 2778 goto done; 2779 default: 2780 #ifdef DEBUG 2781 cmn_err(CE_PANIC, "resend_one_op: unexpected op: %d", 2782 lrp->lr_op); 2783 #endif 2784 nfs4_queue_event(RE_LOST_STATE_BAD_OP, mi, NULL, 2785 lrp->lr_op, lrp->lr_vp, lrp->lr_dvp, NFS4_OK, NULL, 0, 2786 TAG_NONE, TAG_NONE, 0, 0); 2787 nfs4_error_init(ep, EINVAL); 2788 return; 2789 } 2790 2791 /* 2792 * No need to retry nor send an "undo" CLOSE in the 2793 * event the server rebooted. 2794 */ 2795 if (ep->error == 0 && (ep->stat == NFS4ERR_STALE_CLIENTID || 2796 ep->stat == NFS4ERR_STALE_STATEID || ep->stat == NFS4ERR_EXPIRED)) 2797 goto done; 2798 2799 /* 2800 * If we resent a CLOSE or OPEN_DOWNGRADE, there's nothing 2801 * to undo. Undoing locking operations was handled by 2802 * resend_lock(). 2803 */ 2804 if (lrp->lr_op == OP_OPEN_DOWNGRADE || lrp->lr_op == OP_CLOSE) 2805 goto done; 2806 2807 /* 2808 * If we get any other error for OPEN, then don't attempt 2809 * to undo the resend of the open (since it was never 2810 * successful!). 2811 */ 2812 ASSERT(lrp->lr_op == OP_OPEN); 2813 if (ep->error || ep->stat != NFS4_OK) 2814 goto done; 2815 2816 /* 2817 * Now let's undo our OPEN. 2818 */ 2819 nfs4_error_zinit(ep); 2820 close_after_open_resend(vp, lrp->lr_cr, lrp->lr_oacc, ep); 2821 NFS4_DEBUG(nfs4_lost_rqst_debug, (CE_NOTE, "resend_one_op: " 2822 "nfs4close_one: for vp %p got error %d stat %d", 2823 (void *)vp, ep->error, ep->stat)); 2824 2825 done: 2826 if (vp != lrp->lr_vp) 2827 VN_RELE(vp); 2828 } 2829 2830 /* 2831 * Close a file that was opened via a resent OPEN. 2832 * Most errors are passed back to the caller (via the return value and 2833 * *statp), except for FHEXPIRED, which is retried. 2834 * 2835 * It might be conceptually cleaner to push the CLOSE request onto the 2836 * front of the resend queue, rather than sending it here. That would 2837 * match the way we undo lost lock requests. On the other 2838 * hand, we've already got something that works, and there's no reason to 2839 * change it at this time. 
2840 */ 2841 2842 static void 2843 close_after_open_resend(vnode_t *vp, cred_t *cr, uint32_t acc_bits, 2844 nfs4_error_t *ep) 2845 { 2846 2847 for (;;) { 2848 nfs4close_one(vp, NULL, cr, acc_bits, NULL, ep, 2849 CLOSE_AFTER_RESEND, 0, 0, 0); 2850 if (ep->error == 0 && ep->stat == NFS4_OK) 2851 break; /* success; done */ 2852 if (ep->error != 0 || ep->stat != NFS4ERR_FHEXPIRED) 2853 break; 2854 /* else retry FHEXPIRED */ 2855 } 2856 2857 } 2858 2859 /* 2860 * Resend the given lost lock request. Return an errno value. If zero, 2861 * *statp is set to the NFS status code for the call. 2862 * 2863 * Issue a SIGLOST and mark the rnode dead if we get a non-recovery error or 2864 * a recovery error that we don't actually recover from yet (eg: BAD_SEQID). 2865 * Let the recovery thread redrive the call if we get a recovery error that 2866 * we can actually recover from. 2867 */ 2868 static void 2869 resend_lock(nfs4_lost_rqst_t *lrp, nfs4_error_t *ep) 2870 { 2871 bool_t send_siglost = FALSE; 2872 vnode_t *vp = lrp->lr_vp; 2873 2874 NFS4_DEBUG(nfs4_lost_rqst_debug, (CE_NOTE, "resend_lock:")); 2875 ASSERT(lrp->lr_ctype == NFS4_LCK_CTYPE_REINSTATE || 2876 lrp->lr_ctype == NFS4_LCK_CTYPE_RESEND); 2877 2878 nfs4frlock(lrp->lr_ctype, vp, F_SETLK, 2879 lrp->lr_flk, FREAD|FWRITE, 0, lrp->lr_cr, ep, lrp, NULL); 2880 2881 NFS4_DEBUG(nfs4_lost_rqst_debug, (CE_NOTE, "resend_lock: " 2882 "nfs4frlock for vp %p returned error %d, stat %d", 2883 (void *)vp, ep->error, ep->stat)); 2884 2885 if (ep->error == 0 && ep->stat == 0) 2886 goto done; 2887 if (ep->error == 0 && ep->stat == NFS4ERR_DENIED && 2888 lrp->lr_ctype == NFS4_LCK_CTYPE_RESEND) 2889 goto done; 2890 2891 /* 2892 * If we failed with a non-recovery error, send SIGLOST and 2893 * mark the file dead. 2894 */ 2895 if (!nfs4_needs_recovery(ep, TRUE, vp->v_vfsp)) 2896 send_siglost = TRUE; 2897 else { 2898 /* 2899 * Done with recovering LOST LOCK in the event the 2900 * server rebooted or we've lost the lease. 2901 */ 2902 if (ep->error == 0 && (ep->stat == NFS4ERR_STALE_CLIENTID || 2903 ep->stat == NFS4ERR_STALE_STATEID || 2904 ep->stat == NFS4ERR_EXPIRED)) { 2905 goto done; 2906 } 2907 2908 /* 2909 * BAD_STATEID on an unlock indicates that the server has 2910 * forgotten about the lock anyway, so act like the call 2911 * was successful. 2912 */ 2913 if (ep->error == 0 && ep->stat == NFS4ERR_BAD_STATEID && 2914 lrp->lr_op == OP_LOCKU) 2915 goto done; 2916 2917 /* 2918 * If we got a recovery error that we don't actually 2919 * recover from, send SIGLOST. If the filesystem was 2920 * forcibly unmounted, we skip the SIGLOST because (a) it's 2921 * unnecessary noise, and (b) there could be a new process 2922 * with the same pid as the one that had generated the lost 2923 * state request. 2924 */ 2925 if (ep->error == 0 && (ep->stat == NFS4ERR_BADHANDLE || 2926 nfs4_recov_marks_dead(ep->stat))) { 2927 if (!(vp->v_vfsp->vfs_flag & VFS_UNMOUNTED)) 2928 send_siglost = TRUE; 2929 goto done; 2930 } 2931 2932 /* 2933 * If the filesystem was forcibly unmounted, we 2934 * still need to synchronize with the server and 2935 * release state. Try again later. 2936 */ 2937 if (NFS4_FRC_UNMT_ERR(ep->error, vp->v_vfsp)) 2938 goto done; 2939 2940 /* 2941 * If we get a recovery error that we can actually 2942 * recover from (such as ETIMEDOUT, FHEXPIRED), 2943 * return and let the recovery thread redrive the call. 2944 * 2945 * For the three errors below, we want to delay a bit 2946 * instead of pounding the server into submission. 
2947 */ 2948 if ((ep->error == 0 && ep->stat == NFS4ERR_DELAY) || 2949 (ep->error == 0 && ep->stat == NFS4ERR_GRACE) || 2950 (ep->error == 0 && ep->stat == NFS4ERR_RESOURCE)) 2951 delay(SEC_TO_TICK(recov_err_delay)); 2952 goto done; 2953 } 2954 2955 done: 2956 if (send_siglost) { 2957 cred_t *sv_cred; 2958 2959 /* 2960 * Must be root or the actual thread being issued the 2961 * SIGLOST for this to work, so just become root. 2962 */ 2963 sv_cred = curthread->t_cred; 2964 curthread->t_cred = kcred; 2965 nfs4_send_siglost(lrp->lr_flk->l_pid, VTOMI4(vp), vp, FALSE, 2966 ep->error, ep->stat); 2967 curthread->t_cred = sv_cred; 2968 2969 /* 2970 * Flush any additional reinstantiation requests for 2971 * this operation. Sending multiple SIGLOSTs to the user 2972 * process is unlikely to help and may cause trouble. 2973 */ 2974 if (lrp->lr_ctype == NFS4_LCK_CTYPE_REINSTATE) 2975 flush_reinstate(lrp); 2976 } 2977 } 2978 2979 /* 2980 * Remove any lock reinstantiation requests that correspond to the given 2981 * lost request. We only remove items that follow lrp in the queue, 2982 * assuming that lrp will be removed by the generic lost state code. 2983 */ 2984 2985 static void 2986 flush_reinstate(nfs4_lost_rqst_t *lrp) 2987 { 2988 vnode_t *vp; 2989 pid_t pid; 2990 mntinfo4_t *mi; 2991 nfs4_lost_rqst_t *nlrp; 2992 2993 vp = lrp->lr_vp; 2994 mi = VTOMI4(vp); 2995 pid = lrp->lr_flk->l_pid; 2996 2997 /* 2998 * If there are any more reinstantation requests to get rid of, 2999 * they should all be clustered at the front of the lost state 3000 * queue. 3001 */ 3002 mutex_enter(&mi->mi_lock); 3003 for (lrp = list_next(&mi->mi_lost_state, lrp); lrp != NULL; 3004 lrp = nlrp) { 3005 nlrp = list_next(&mi->mi_lost_state, lrp); 3006 if (lrp->lr_op != OP_LOCK && lrp->lr_op != OP_LOCKU) 3007 break; 3008 if (lrp->lr_ctype != NFS4_LCK_CTYPE_REINSTATE) 3009 break; 3010 ASSERT(lrp->lr_vp == vp); 3011 ASSERT(lrp->lr_flk->l_pid == pid); 3012 NFS4_DEBUG(nfs4_lost_rqst_debug, (CE_NOTE, 3013 "remove reinstantiation %p", (void *)lrp)); 3014 list_remove(&mi->mi_lost_state, lrp); 3015 nfs4_free_lost_rqst(lrp, NULL); 3016 } 3017 mutex_exit(&mi->mi_lock); 3018 } 3019 3020 /* 3021 * End of state-specific recovery routines. 3022 */ 3023 3024 /* 3025 * Allocate a lost request struct, initialize it from lost_rqstp (including 3026 * bumping the reference counts for the referenced vnode, etc.), and hang 3027 * it off of recovp. 
3028 */ 3029 3030 static void 3031 nfs4_save_lost_rqst(nfs4_lost_rqst_t *lost_rqstp, recov_info_t *recovp, 3032 nfs4_recov_t *action, mntinfo4_t *mi) 3033 { 3034 nfs4_lost_rqst_t *destp; 3035 3036 ASSERT(recovp->rc_lost_rqst == NULL); 3037 3038 destp = kmem_alloc(sizeof (nfs4_lost_rqst_t), KM_SLEEP); 3039 recovp->rc_lost_rqst = destp; 3040 3041 if (lost_rqstp->lr_op == OP_LOCK || 3042 lost_rqstp->lr_op == OP_LOCKU) { 3043 ASSERT(lost_rqstp->lr_lop); 3044 *action = NR_LOST_LOCK; 3045 destp->lr_ctype = lost_rqstp->lr_ctype; 3046 destp->lr_locktype = lost_rqstp->lr_locktype; 3047 } else if (lost_rqstp->lr_op == OP_OPEN) { 3048 component4 *srcfp, *destfp; 3049 3050 destp->lr_oacc = lost_rqstp->lr_oacc; 3051 destp->lr_odeny = lost_rqstp->lr_odeny; 3052 destp->lr_oclaim = lost_rqstp->lr_oclaim; 3053 if (lost_rqstp->lr_oclaim == CLAIM_DELEGATE_CUR) 3054 destp->lr_ostateid = lost_rqstp->lr_ostateid; 3055 3056 srcfp = &lost_rqstp->lr_ofile; 3057 destfp = &destp->lr_ofile; 3058 /* 3059 * Consume caller's utf8string 3060 */ 3061 destfp->utf8string_len = srcfp->utf8string_len; 3062 destfp->utf8string_val = srcfp->utf8string_val; 3063 srcfp->utf8string_len = 0; 3064 srcfp->utf8string_val = NULL; /* make sure not reused */ 3065 3066 *action = NR_LOST_STATE_RQST; 3067 } else if (lost_rqstp->lr_op == OP_OPEN_DOWNGRADE) { 3068 destp->lr_dg_acc = lost_rqstp->lr_dg_acc; 3069 destp->lr_dg_deny = lost_rqstp->lr_dg_deny; 3070 3071 *action = NR_LOST_STATE_RQST; 3072 } else if (lost_rqstp->lr_op == OP_CLOSE) { 3073 ASSERT(lost_rqstp->lr_oop); 3074 *action = NR_LOST_STATE_RQST; 3075 } else if (lost_rqstp->lr_op == OP_DELEGRETURN) { 3076 *action = NR_LOST_STATE_RQST; 3077 } else { 3078 #ifdef DEBUG 3079 cmn_err(CE_PANIC, "nfs4_save_lost_rqst: bad op %d", 3080 lost_rqstp->lr_op); 3081 #endif 3082 nfs4_queue_event(RE_LOST_STATE_BAD_OP, mi, NULL, 3083 lost_rqstp->lr_op, lost_rqstp->lr_vp, lost_rqstp->lr_dvp, 3084 NFS4_OK, NULL, curproc->p_pid, TAG_NONE, TAG_NONE, 0, 0); 3085 *action = NR_UNUSED; 3086 recovp->rc_lost_rqst = NULL; 3087 kmem_free(destp, sizeof (nfs4_lost_rqst_t)); 3088 return; 3089 } 3090 3091 destp->lr_op = lost_rqstp->lr_op; 3092 destp->lr_vp = lost_rqstp->lr_vp; 3093 if (destp->lr_vp) 3094 VN_HOLD(destp->lr_vp); 3095 destp->lr_dvp = lost_rqstp->lr_dvp; 3096 if (destp->lr_dvp) 3097 VN_HOLD(destp->lr_dvp); 3098 destp->lr_oop = lost_rqstp->lr_oop; 3099 if (destp->lr_oop) 3100 open_owner_hold(destp->lr_oop); 3101 destp->lr_osp = lost_rqstp->lr_osp; 3102 if (destp->lr_osp) 3103 open_stream_hold(destp->lr_osp); 3104 destp->lr_lop = lost_rqstp->lr_lop; 3105 if (destp->lr_lop) 3106 lock_owner_hold(destp->lr_lop); 3107 destp->lr_cr = lost_rqstp->lr_cr; 3108 if (destp->lr_cr) 3109 crhold(destp->lr_cr); 3110 if (lost_rqstp->lr_flk == NULL) 3111 destp->lr_flk = NULL; 3112 else { 3113 destp->lr_flk = kmem_alloc(sizeof (flock64_t), KM_SLEEP); 3114 *destp->lr_flk = *lost_rqstp->lr_flk; 3115 } 3116 destp->lr_putfirst = lost_rqstp->lr_putfirst; 3117 } 3118 3119 /* 3120 * Map the given return values (errno and nfs4 status code) to a recovery 3121 * action and fill in the following fields of recovp: rc_action, 3122 * rc_srv_reboot, rc_stateid, rc_lost_rqst. 
3123 */ 3124 3125 void 3126 errs_to_action(recov_info_t *recovp, 3127 nfs4_server_t *sp, mntinfo4_t *mi, stateid4 *sidp, 3128 nfs4_lost_rqst_t *lost_rqstp, int unmounted, nfs_opnum4 op, 3129 nfs4_bseqid_entry_t *bsep) 3130 { 3131 nfs4_recov_t action = NR_UNUSED; 3132 bool_t reboot = FALSE; 3133 int try_f; 3134 int error = recovp->rc_orig_errors.error; 3135 nfsstat4 stat = recovp->rc_orig_errors.stat; 3136 3137 bzero(&recovp->rc_stateid, sizeof (stateid4)); 3138 recovp->rc_lost_rqst = NULL; 3139 recovp->rc_bseqid_rqst = NULL; 3140 3141 try_f = nfs4_try_failover(&recovp->rc_orig_errors) && 3142 FAILOVER_MOUNT4(mi); 3143 3144 /* 3145 * We start recovery for EINTR only in the lost lock 3146 * or lost open/close case. 3147 */ 3148 3149 if (try_f || error == EINTR || (error == EIO && unmounted)) { 3150 recovp->rc_error = (error != 0 ? error : geterrno4(stat)); 3151 if (lost_rqstp) { 3152 ASSERT(lost_rqstp->lr_op != 0); 3153 nfs4_save_lost_rqst(lost_rqstp, recovp, &action, mi); 3154 } 3155 if (try_f) 3156 action = NR_FAILOVER; 3157 } else if (error != 0) { 3158 recovp->rc_error = error; 3159 nfs4_queue_event(RE_UNEXPECTED_ERRNO, mi, NULL, error, NULL, 3160 NULL, 0, NULL, 0, TAG_NONE, TAG_NONE, 0, 0); 3161 action = NR_CLIENTID; 3162 } else { 3163 recovp->rc_error = geterrno4(stat); 3164 switch (stat) { 3165 #ifdef notyet 3166 case NFS4ERR_LEASE_MOVED: 3167 action = xxx; 3168 break; 3169 case NFS4ERR_MOVED: 3170 action = xxx; 3171 break; 3172 #endif 3173 case NFS4ERR_BADHANDLE: 3174 action = NR_BADHANDLE; 3175 break; 3176 case NFS4ERR_BAD_SEQID: 3177 if (bsep) 3178 save_bseqid_rqst(bsep, recovp); 3179 action = NR_BAD_SEQID; 3180 break; 3181 case NFS4ERR_OLD_STATEID: 3182 action = NR_OLDSTATEID; 3183 break; 3184 case NFS4ERR_WRONGSEC: 3185 action = NR_WRONGSEC; 3186 break; 3187 case NFS4ERR_FHEXPIRED: 3188 action = NR_FHEXPIRED; 3189 break; 3190 case NFS4ERR_BAD_STATEID: 3191 if (sp == NULL || (sp != NULL && inlease(sp))) { 3192 3193 action = NR_BAD_STATEID; 3194 if (sidp) 3195 recovp->rc_stateid = *sidp; 3196 } else 3197 action = NR_CLIENTID; 3198 break; 3199 case NFS4ERR_EXPIRED: 3200 /* 3201 * The client's lease has expired, either due 3202 * to a network partition or perhaps a client 3203 * error. In either case, try an NR_CLIENTID 3204 * style recovery. reboot remains false, since 3205 * there is no evidence the server has rebooted. 3206 * This will cause CLAIM_NULL opens and lock 3207 * requests without the reclaim bit. 3208 */ 3209 action = NR_CLIENTID; 3210 3211 DTRACE_PROBE4(nfs4__expired, 3212 nfs4_server_t *, sp, 3213 mntinfo4_t *, mi, 3214 stateid4 *, sidp, int, op); 3215 3216 break; 3217 case NFS4ERR_STALE_CLIENTID: 3218 case NFS4ERR_STALE_STATEID: 3219 action = NR_CLIENTID; 3220 reboot = TRUE; 3221 break; 3222 case NFS4ERR_RESOURCE: 3223 /* 3224 * If this had been a FAILOVER mount, then 3225 * we'd have tried failover. Since it's not, 3226 * just delay a while and retry. 
3227 */ 3228 action = NR_DELAY; 3229 break; 3230 case NFS4ERR_GRACE: 3231 action = NR_GRACE; 3232 break; 3233 case NFS4ERR_DELAY: 3234 action = NR_DELAY; 3235 break; 3236 case NFS4ERR_STALE: 3237 action = NR_STALE; 3238 break; 3239 default: 3240 nfs4_queue_event(RE_UNEXPECTED_STATUS, mi, NULL, 0, 3241 NULL, NULL, stat, NULL, 0, TAG_NONE, TAG_NONE, 3242 0, 0); 3243 action = NR_CLIENTID; 3244 break; 3245 } 3246 } 3247 3248 /* make sure action got set */ 3249 ASSERT(action != NR_UNUSED); 3250 recovp->rc_srv_reboot = reboot; 3251 recovp->rc_action = action; 3252 nfs4_queue_fact(RF_ERR, mi, stat, action, op, reboot, NULL, error, 3253 NULL); 3254 } 3255 3256 /* 3257 * Return the (held) credential for the process with the given pid. 3258 * May return NULL (e.g., process not found). 3259 */ 3260 3261 static cred_t * 3262 pid_to_cr(pid_t pid) 3263 { 3264 proc_t *p; 3265 cred_t *cr; 3266 3267 mutex_enter(&pidlock); 3268 if ((p = prfind(pid)) == NULL) { 3269 mutex_exit(&pidlock); 3270 return (NULL); 3271 } 3272 3273 mutex_enter(&p->p_crlock); 3274 crhold(cr = p->p_cred); 3275 mutex_exit(&p->p_crlock); 3276 mutex_exit(&pidlock); 3277 3278 return (cr); 3279 } 3280 3281 /* 3282 * Send SIGLOST to the given process and queue the event. 3283 * 3284 * The 'dump' boolean tells us whether this action should dump the 3285 * in-kernel queue of recovery messages or not. 3286 */ 3287 3288 void 3289 nfs4_send_siglost(pid_t pid, mntinfo4_t *mi, vnode_t *vp, bool_t dump, 3290 int error, nfsstat4 stat) 3291 { 3292 proc_t *p; 3293 3294 mutex_enter(&pidlock); 3295 p = prfind(pid); 3296 if (p) 3297 psignal(p, SIGLOST); 3298 mutex_exit(&pidlock); 3299 nfs4_queue_event(dump ? RE_SIGLOST : RE_SIGLOST_NO_DUMP, mi, 3300 NULL, error, vp, NULL, stat, NULL, pid, TAG_NONE, TAG_NONE, 0, 0); 3301 } 3302 3303 /* 3304 * Scan the lock list for entries that match the given pid. Change the 3305 * pid in those that do to NOPID. 3306 */ 3307 3308 static void 3309 relock_skip_pid(locklist_t *llp, pid_t pid) 3310 { 3311 for (; llp != NULL; llp = llp->ll_next) { 3312 if (llp->ll_flock.l_pid == pid) 3313 llp->ll_flock.l_pid = NOPID; 3314 } 3315 } 3316 3317 /* 3318 * Mark a file as having failed recovery, after making a last-ditch effort 3319 * to return any delegation. 3320 * 3321 * Sets r_error to EIO or ESTALE for the given vnode. 3322 */ 3323 void 3324 nfs4_fail_recov(vnode_t *vp, char *why, int error, nfsstat4 stat) 3325 { 3326 rnode4_t *rp = VTOR4(vp); 3327 3328 #ifdef DEBUG 3329 if (nfs4_fail_recov_stop) 3330 debug_enter("nfs4_fail_recov"); 3331 #endif 3332 3333 mutex_enter(&rp->r_statelock); 3334 if (rp->r_flags & (R4RECOVERR|R4RECOVERRP)) { 3335 mutex_exit(&rp->r_statelock); 3336 return; 3337 } 3338 3339 /* 3340 * Set R4RECOVERRP to indicate that a recovery error is in 3341 * progress. This will shut down reads and writes at the top 3342 * half. Don't set R4RECOVERR until after we've returned the 3343 * delegation, otherwise it will fail. 3344 */ 3345 3346 rp->r_flags |= R4RECOVERRP; 3347 mutex_exit(&rp->r_statelock); 3348 3349 nfs4delegabandon(rp); 3350 3351 mutex_enter(&rp->r_statelock); 3352 rp->r_flags |= (R4RECOVERR | R4STALE); 3353 rp->r_error = (error == 0 && stat == NFS4ERR_STALE) ? 
	    ESTALE : EIO;
	PURGE_ATTRCACHE4_LOCKED(rp);
	if (!(vp->v_vfsp->vfs_flag & VFS_UNMOUNTED))
		nfs4_queue_event(RE_DEAD_FILE, VTOMI4(vp), NULL, error,
		    vp, NULL, stat, why, 0, TAG_NONE, TAG_NONE, 0, 0);
	mutex_exit(&rp->r_statelock);

	dnlc_purge_vp(vp);
}

/*
 * recov_throttle: if the file had the same recovery action within the
 * throttle interval, wait for the throttle interval to finish before
 * proceeding.
 *
 * Side effects: updates the rnode with the current recovery information.
 */

static void
recov_throttle(recov_info_t *recovp, vnode_t *vp)
{
	time_t curtime, time_to_wait;
	rnode4_t *rp = VTOR4(vp);

	curtime = gethrestime_sec();

	mutex_enter(&rp->r_statelock);
	NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
	    "recov_throttle: now: (%d, %ld), last: (%d, %ld)",
	    recovp->rc_action, curtime,
	    rp->r_recov_act, rp->r_last_recov));
	if (recovp->rc_action == rp->r_recov_act &&
	    rp->r_last_recov + recov_err_delay > curtime) {
		time_to_wait = rp->r_last_recov + recov_err_delay - curtime;
		mutex_exit(&rp->r_statelock);
		delay(SEC_TO_TICK(time_to_wait));
		curtime = gethrestime_sec();
		mutex_enter(&rp->r_statelock);
	}

	rp->r_last_recov = curtime;
	rp->r_recov_act = recovp->rc_action;
	mutex_exit(&rp->r_statelock);
}

/*
 * React to NFS4ERR_GRACE by setting the time we'll permit
 * the next call to this filesystem.
 */
void
nfs4_set_grace_wait(mntinfo4_t *mi)
{
	mutex_enter(&mi->mi_lock);
	/* Mark the time for the future */
	mi->mi_grace_wait = gethrestime_sec() + nfs4err_delay_time;
	mutex_exit(&mi->mi_lock);
}

/*
 * React to NFS4ERR_DELAY by setting the time we'll permit
 * the next call to this vnode.
 */
void
nfs4_set_delay_wait(vnode_t *vp)
{
	rnode4_t *rp = VTOR4(vp);

	mutex_enter(&rp->r_statelock);
	/*
	 * Calculate amount we should delay; the initial
	 * delay will be short and then we will back off.
	 */
	if (rp->r_delay_interval == 0)
		rp->r_delay_interval = NFS4_INITIAL_DELAY_INTERVAL;
	else
		/* calculate next interval value */
		rp->r_delay_interval =
		    MIN(NFS4_MAX_DELAY_INTERVAL, (rp->r_delay_interval << 1));
	rp->r_delay_wait = gethrestime_sec() + rp->r_delay_interval;
	mutex_exit(&rp->r_statelock);
}
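/*
 * Editorial note (not part of the original source): nfs4_set_delay_wait()
 * implements a simple exponential backoff.  Purely for illustration, if
 * NFS4_INITIAL_DELAY_INTERVAL were 1 second and NFS4_MAX_DELAY_INTERVAL
 * were 20 seconds, repeated NFS4ERR_DELAY replies on the same rnode would
 * produce waits of 1, 2, 4, 8, 16, 20, 20, ... seconds.  The actual
 * constants come from the NFS client headers, not from this file.
 */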
3450 */ 3451 length = 0; 3452 for (svp = mi->mi_servers; svp != NULL; svp = svp->sv_next) { 3453 (void) nfs_rw_enter_sig(&svp->sv_lock, RW_READER, 0); 3454 if (svp->sv_flags & SV4_NOTINUSE) { 3455 nfs_rw_exit(&svp->sv_lock); 3456 continue; 3457 } 3458 nfs_rw_exit(&svp->sv_lock); 3459 length += svp->sv_hostnamelen; 3460 } 3461 3462 srvnames = kmem_alloc(length, KM_SLEEP); 3463 3464 namep = srvnames; 3465 for (svp = mi->mi_servers; svp != NULL; svp = svp->sv_next) { 3466 (void) nfs_rw_enter_sig(&svp->sv_lock, RW_READER, 0); 3467 if (svp->sv_flags & SV4_NOTINUSE) { 3468 nfs_rw_exit(&svp->sv_lock); 3469 continue; 3470 } 3471 nfs_rw_exit(&svp->sv_lock); 3472 (void) strcpy(namep, svp->sv_hostname); 3473 namep += svp->sv_hostnamelen - 1; 3474 *namep++ = ','; 3475 } 3476 *--namep = '\0'; 3477 3478 *len = length; 3479 3480 return (srvnames); 3481 } 3482 3483 static void 3484 save_bseqid_rqst(nfs4_bseqid_entry_t *bsep, recov_info_t *recovp) 3485 { 3486 nfs4_bseqid_entry_t *destp; 3487 3488 destp = kmem_alloc(sizeof (nfs4_bseqid_entry_t), KM_SLEEP); 3489 recovp->rc_bseqid_rqst = destp; 3490 3491 if (bsep->bs_oop) 3492 open_owner_hold(bsep->bs_oop); 3493 destp->bs_oop = bsep->bs_oop; 3494 if (bsep->bs_lop) 3495 lock_owner_hold(bsep->bs_lop); 3496 destp->bs_lop = bsep->bs_lop; 3497 if (bsep->bs_vp) 3498 VN_HOLD(bsep->bs_vp); 3499 destp->bs_vp = bsep->bs_vp; 3500 destp->bs_pid = bsep->bs_pid; 3501 destp->bs_tag = bsep->bs_tag; 3502 destp->bs_seqid = bsep->bs_seqid; 3503 } 3504 3505 static void 3506 free_bseqid_rqst(nfs4_bseqid_entry_t *bsep) 3507 { 3508 if (bsep->bs_oop) 3509 open_owner_rele(bsep->bs_oop); 3510 if (bsep->bs_lop) 3511 lock_owner_rele(bsep->bs_lop); 3512 if (bsep->bs_vp) 3513 VN_RELE(bsep->bs_vp); 3514 kmem_free(bsep, sizeof (nfs4_bseqid_entry_t)); 3515 } 3516 3517 /* 3518 * We don't actually fully recover from NFS4ERR_BAD_SEQID. We 3519 * simply mark the open owner and open stream (if provided) as "bad". 3520 * Then future uses of these data structures will be limited to basically 3521 * just cleaning up the internal client state (no going OTW). 3522 * 3523 * The result of this is to return errors back to the app/usr when 3524 * we receive NFS4ERR_BAD_SEQID, but also allow future/new calls to 3525 * succeed so progress can be made. 3526 */ 3527 void 3528 recov_bad_seqid(recov_info_t *recovp) 3529 { 3530 mntinfo4_t *mi = recovp->rc_mi; 3531 nfs4_open_owner_t *bad_oop; 3532 nfs4_lock_owner_t *bad_lop; 3533 vnode_t *vp; 3534 rnode4_t *rp = NULL; 3535 pid_t pid; 3536 nfs4_bseqid_entry_t *bsep, *tbsep; 3537 int error; 3538 3539 ASSERT(mi != NULL); 3540 ASSERT(nfs_rw_lock_held(&mi->mi_recovlock, RW_WRITER)); 3541 3542 mutex_enter(&mi->mi_lock); 3543 bsep = list_head(&mi->mi_bseqid_list); 3544 mutex_exit(&mi->mi_lock); 3545 3546 /* 3547 * Handle all the bad seqid entries on mi's list. 3548 */ 3549 while (bsep != NULL) { 3550 bad_oop = bsep->bs_oop; 3551 bad_lop = bsep->bs_lop; 3552 vp = bsep->bs_vp; 3553 pid = bsep->bs_pid; 3554 3555 NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE, 3556 "recov_bad_seqid: mark oop %p lop %p as bad for " 3557 "vp %p tag %s pid %d: last good seqid %d for tag %s", 3558 (void *)bad_oop, (void *)bad_lop, (void *)vp, 3559 nfs4_ctags[bsep->bs_tag].ct_str, pid, 3560 bad_oop ? bad_oop->oo_last_good_seqid : 0, 3561 bad_oop ? nfs4_ctags[bad_oop->oo_last_good_op].ct_str : 3562 nfs4_ctags[TAG_NONE].ct_str)); 3563 3564 nfs4_queue_event(RE_BAD_SEQID, mi, NULL, 3565 0, vp, NULL, NFS4ERR_BAD_SEQID, NULL, pid, bsep->bs_tag, 3566 bad_oop ? 
		    bad_oop->oo_last_good_op : TAG_NONE,
		    bsep->bs_seqid, bad_oop ? bad_oop->oo_last_good_seqid : 0);

		if (bad_oop) {
			/* essentially reset the open owner */
			error = nfs4_start_open_seqid_sync(bad_oop, mi);
			ASSERT(!error);	/* recov thread always succeeds */
			bad_oop->oo_name = nfs4_get_new_oo_name();
			bad_oop->oo_seqid = 0;
			nfs4_end_open_seqid_sync(bad_oop);
		}

		if (bad_lop) {
			mutex_enter(&bad_lop->lo_lock);
			bad_lop->lo_flags |= NFS4_BAD_SEQID_LOCK;
			mutex_exit(&bad_lop->lo_lock);

			ASSERT(vp != NULL);
			rp = VTOR4(vp);
			mutex_enter(&rp->r_statelock);
			rp->r_flags |= R4LODANGLERS;
			mutex_exit(&rp->r_statelock);

			nfs4_send_siglost(pid, mi, vp, TRUE,
			    0, NFS4ERR_BAD_SEQID);
		}

		mutex_enter(&mi->mi_lock);
		list_remove(&mi->mi_bseqid_list, bsep);
		tbsep = bsep;
		bsep = list_head(&mi->mi_bseqid_list);
		mutex_exit(&mi->mi_lock);
		free_bseqid_rqst(tbsep);
	}

	mutex_enter(&mi->mi_lock);
	mi->mi_recovflags &= ~MI4R_BAD_SEQID;
	mutex_exit(&mi->mi_lock);
}
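/*
 * Editorial note (not part of the original source): lifecycle of a bad
 * seqid entry.  errs_to_action() copies the request into rc_bseqid_rqst
 * via save_bseqid_rqst() when NFS4ERR_BAD_SEQID is seen; the entry is
 * later queued on mi_bseqid_list (outside this excerpt), drained by
 * recov_bad_seqid() above, and released by free_bseqid_rqst().
 */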