1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License, Version 1.0 only 6 * (the "License"). You may not use this file except in compliance 7 * with the License. 8 * 9 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 10 * or http://www.opensolaris.org/os/licensing. 11 * See the License for the specific language governing permissions 12 * and limitations under the License. 13 * 14 * When distributing Covered Code, include this CDDL HEADER in each 15 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 16 * If applicable, add the following below this CDDL HEADER, with the 17 * fields enclosed by brackets "[]" replaced with your own identifying 18 * information: Portions Copyright [yyyy] [name of copyright owner] 19 * 20 * CDDL HEADER END 21 */ 22 /* 23 * Copyright 2005 Sun Microsystems, Inc. All rights reserved. 24 * Use is subject to license terms. 25 */ 26 27 #pragma ident "%Z%%M% %I% %E% SMI" 28 29 /* 30 * NFS Version 4 state recovery code. 31 */ 32 33 #include <nfs/nfs4_clnt.h> 34 #include <nfs/nfs4.h> 35 #include <nfs/rnode4.h> 36 #include <sys/cmn_err.h> 37 #include <sys/cred.h> 38 #include <sys/systm.h> 39 #include <sys/flock.h> 40 #include <sys/dnlc.h> 41 #include <sys/ddi.h> 42 #include <sys/disp.h> 43 #include <sys/list.h> 44 #include <sys/sdt.h> 45 46 extern r4hashq_t *rtable4; 47 48 /* 49 * Information that describes what needs to be done for recovery. It is 50 * passed to a client recovery thread as well as passed to various recovery 51 * routines. rc_mi, rc_vp1, and rc_vp2 refer to the filesystem and 52 * vnode(s) affected by recovery. rc_vp1 and rc_vp2 are references (use 53 * VN_HOLD) or NULL. rc_lost_rqst contains information about the lost 54 * lock or open/close request, and it holds reference counts for the 55 * various objects (vnode, etc.). 
The recovery thread also uses flags set 56 * in the mntinfo4_t or vnode_t to tell it what to do. rc_error is used 57 * to save the error that originally triggered the recovery event -- will 58 * later be used to set mi_error if recovery doesn't work. rc_bseqid_rqst 59 * contains information about the request that got NFS4ERR_BAD_SEQID, and 60 * it holds reference count for the various objects (vnode, open owner, 61 * open stream, lock owner). 62 */ 63 64 typedef struct { 65 mntinfo4_t *rc_mi; 66 vnode_t *rc_vp1; 67 vnode_t *rc_vp2; 68 nfs4_recov_t rc_action; 69 stateid4 rc_stateid; 70 bool_t rc_srv_reboot; /* server has rebooted */ 71 nfs4_lost_rqst_t *rc_lost_rqst; 72 nfs4_error_t rc_orig_errors; /* original errors causing recovery */ 73 int rc_error; 74 nfs4_bseqid_entry_t *rc_bseqid_rqst; 75 } recov_info_t; 76 77 /* 78 * How long to wait before trying again if there is an error doing 79 * recovery, in seconds. 80 */ 81 82 static int recov_err_delay = 1; 83 84 /* 85 * How long to wait when processing NFS4ERR_GRACE or NFS4ERR_DELAY 86 * errors. Expressed in seconds. Default is defined as 87 * NFS4ERR_DELAY_TIME and this variable is initialized in nfs4_subr_init() 88 */ 89 time_t nfs4err_delay_time = 0; 90 91 /* 92 * Tuneable to limit how many time "exempt" ops go OTW 93 * after a recovery error. Exempt op hints are OH_CLOSE, 94 * OH_LOCKU, OH_DELEGRETURN. These previously always went 95 * OTW even after rnode was "dead" due to recovery errors. 96 * 97 * The tuneable below limits the number of times a start_fop 98 * invocation will retry the exempt hints. After the limit 99 * is reached, nfs4_start_fop will return an error just like 100 * it would for non-exempt op hints. 101 */ 102 int nfs4_max_recov_error_retry = 3; 103 104 /* 105 * Number of seconds the recovery thread should pause before retry when the 106 * filesystem has been forcibly unmounted. 
107 */ 108 109 int nfs4_unmount_delay = 1; 110 111 #ifdef DEBUG 112 113 /* 114 * How long to wait (in seconds) between recovery operations on a given 115 * file. Normally zero, but could be set longer for testing purposes. 116 */ 117 static int nfs4_recovdelay = 0; 118 119 /* 120 * Switch that controls whether to go into the debugger when recovery 121 * fails. 122 */ 123 static int nfs4_fail_recov_stop = 0; 124 125 /* 126 * Tuneables to debug client namespace interaction with server 127 * mount points: 128 * 129 * nfs4_srvmnt_fail_cnt: 130 * number of times EACCES returned because client 131 * attempted to cross server mountpoint 132 * 133 * nfs4_srvmnt_debug: 134 * trigger console printf whenever client attempts 135 * to cross server mountpoint 136 */ 137 int nfs4_srvmnt_fail_cnt = 0; 138 int nfs4_srvmnt_debug = 0; 139 #endif 140 141 /* forward references, in alphabetic order */ 142 static void close_after_open_resend(vnode_t *, cred_t *, uint32_t, 143 nfs4_error_t *); 144 static void errs_to_action(recov_info_t *, 145 nfs4_server_t *, mntinfo4_t *, stateid4 *, nfs4_lost_rqst_t *, int, 146 nfs_opnum4, nfs4_bseqid_entry_t *); 147 static void flush_reinstate(nfs4_lost_rqst_t *); 148 static void free_milist(mntinfo4_t **, int); 149 static mntinfo4_t **make_milist(nfs4_server_t *, int *); 150 static int nfs4_check_recov_err(vnode_t *, nfs4_op_hint_t, 151 nfs4_recov_state_t *, int, char *); 152 static int nfs4_check_srvstub(vnode_t *vp, rnode4_t *rp, nfs4_op_hint_t op); 153 static char *nfs4_getsrvnames(mntinfo4_t *, size_t *); 154 static void nfs4_recov_fh_fail(vnode_t *, int, nfsstat4); 155 static void nfs4_recov_thread(recov_info_t *); 156 static void nfs4_remove_lost_rqsts(mntinfo4_t *, nfs4_server_t *); 157 static void nfs4_resend_lost_rqsts(recov_info_t *, nfs4_server_t *); 158 static cred_t *pid_to_cr(pid_t); 159 static void reclaim_one_lock(vnode_t *, flock64_t *, nfs4_error_t *, int *); 160 static void recov_bad_seqid(recov_info_t *); 161 static void 
recov_badstate(recov_info_t *, vnode_t *, nfsstat4); 162 static void recov_clientid(recov_info_t *, nfs4_server_t *); 163 static void recov_done(mntinfo4_t *, recov_info_t *); 164 static void recov_filehandle(nfs4_recov_t, mntinfo4_t *, vnode_t *); 165 static void recov_newserver(recov_info_t *, nfs4_server_t **, bool_t *); 166 static void recov_openfiles(recov_info_t *, nfs4_server_t *); 167 static void recov_stale(mntinfo4_t *, vnode_t *); 168 static void nfs4_free_lost_rqst(nfs4_lost_rqst_t *, nfs4_server_t *); 169 static void recov_throttle(recov_info_t *, vnode_t *); 170 static void relock_skip_pid(locklist_t *, pid_t); 171 static void resend_lock(nfs4_lost_rqst_t *, nfs4_error_t *); 172 static void resend_one_op(nfs4_lost_rqst_t *, nfs4_error_t *, mntinfo4_t *, 173 nfs4_server_t *); 174 static void save_bseqid_rqst(nfs4_bseqid_entry_t *, recov_info_t *); 175 static void start_recovery(recov_info_t *, mntinfo4_t *, vnode_t *, vnode_t *, 176 nfs4_server_t *); 177 static void start_recovery_action(nfs4_recov_t, bool_t, mntinfo4_t *, vnode_t *, 178 vnode_t *); 179 static int wait_for_recovery(mntinfo4_t *, nfs4_op_hint_t); 180 181 /* 182 * Return non-zero if the given errno, status, and rpc status codes 183 * in the nfs4_error_t indicate that client recovery is needed. 184 * "stateful" indicates whether the call that got the error establishes or 185 * removes state on the server (open, close, lock, unlock, delegreturn). 186 */ 187 188 int 189 nfs4_needs_recovery(nfs4_error_t *ep, bool_t stateful, vfs_t *vfsp) 190 { 191 int recov = 0; 192 mntinfo4_t *mi; 193 194 /* 195 * Try failover if the error values justify it and if 196 * it's a failover mount. Don't try if the mount is in 197 * progress, failures are handled explicitly by nfs4rootvp. 
198 */ 199 if (nfs4_try_failover(ep)) { 200 mi = VFTOMI4(vfsp); 201 mutex_enter(&mi->mi_lock); 202 recov = FAILOVER_MOUNT4(mi) && !(mi->mi_flags & MI4_MOUNTING); 203 mutex_exit(&mi->mi_lock); 204 if (recov) 205 return (recov); 206 } 207 208 if (ep->error == EINTR || NFS4_FRC_UNMT_ERR(ep->error, vfsp)) { 209 /* 210 * The server may have gotten the request, so for stateful 211 * ops we need to resynchronize and possibly back out the 212 * op. 213 */ 214 return (stateful); 215 } 216 if (ep->error != 0) 217 return (0); 218 219 /* stat values are listed alphabetically */ 220 /* 221 * There are two lists here: the errors for which we have code, and 222 * the errors for which we plan to have code before FCS. For the 223 * second list, print a warning message but don't attempt recovery. 224 */ 225 switch (ep->stat) { 226 case NFS4ERR_BADHANDLE: 227 case NFS4ERR_BAD_SEQID: 228 case NFS4ERR_BAD_STATEID: 229 case NFS4ERR_DELAY: 230 case NFS4ERR_EXPIRED: 231 case NFS4ERR_FHEXPIRED: 232 case NFS4ERR_GRACE: 233 case NFS4ERR_OLD_STATEID: 234 case NFS4ERR_RESOURCE: 235 case NFS4ERR_STALE_CLIENTID: 236 case NFS4ERR_STALE_STATEID: 237 case NFS4ERR_WRONGSEC: 238 case NFS4ERR_STALE: 239 recov = 1; 240 break; 241 #ifdef DEBUG 242 case NFS4ERR_LEASE_MOVED: 243 case NFS4ERR_MOVED: 244 zcmn_err(VFTOMI4(vfsp)->mi_zone->zone_id, 245 CE_WARN, "!Can't yet recover from NFS status %d", 246 ep->stat); 247 break; 248 #endif 249 } 250 251 return (recov); 252 } 253 254 /* 255 * Some operations such as DELEGRETURN want to avoid invoking 256 * recovery actions that will only mark the file dead. If 257 * better handlers are invoked for any of these errors, this 258 * routine should be modified. 
259 */ 260 int 261 nfs4_recov_marks_dead(nfsstat4 status) 262 { 263 if (status == NFS4ERR_BAD_SEQID || 264 status == NFS4ERR_EXPIRED || 265 status == NFS4ERR_BAD_STATEID || 266 status == NFS4ERR_OLD_STATEID) 267 return (1); 268 return (0); 269 } 270 271 /* 272 * Transfer the state recovery information in recovp to mi's resend queue, 273 * and mark mi as having a lost state request. 274 */ 275 static void 276 nfs4_enqueue_lost_rqst(recov_info_t *recovp, mntinfo4_t *mi) 277 { 278 nfs4_lost_rqst_t *lrp = recovp->rc_lost_rqst; 279 280 ASSERT(nfs_rw_lock_held(&mi->mi_recovlock, RW_READER) || 281 nfs_rw_lock_held(&mi->mi_recovlock, RW_WRITER)); 282 283 ASSERT(lrp != NULL && lrp->lr_op != 0); 284 285 NFS4_DEBUG(nfs4_lost_rqst_debug, (CE_NOTE, 286 "nfs4_enqueue_lost_rqst %p, op %d", 287 (void *)lrp, lrp->lr_op)); 288 289 mutex_enter(&mi->mi_lock); 290 mi->mi_recovflags |= MI4R_LOST_STATE; 291 if (lrp->lr_putfirst) 292 list_insert_head(&mi->mi_lost_state, lrp); 293 else 294 list_insert_tail(&mi->mi_lost_state, lrp); 295 recovp->rc_lost_rqst = NULL; 296 mutex_exit(&mi->mi_lock); 297 298 nfs4_queue_event(RE_LOST_STATE, mi, NULL, lrp->lr_op, lrp->lr_vp, 299 lrp->lr_dvp, 0, NULL, 0, TAG_NONE, TAG_NONE, 0, 0); 300 } 301 302 /* 303 * Transfer the bad seqid recovery information in recovp to mi's 304 * bad seqid queue, and mark mi as having a bad seqid request. 305 */ 306 void 307 enqueue_bseqid_rqst(recov_info_t *recovp, mntinfo4_t *mi) 308 { 309 ASSERT(nfs_rw_lock_held(&mi->mi_recovlock, RW_READER) || 310 nfs_rw_lock_held(&mi->mi_recovlock, RW_WRITER)); 311 ASSERT(recovp->rc_bseqid_rqst != NULL); 312 313 mutex_enter(&mi->mi_lock); 314 mi->mi_recovflags |= MI4R_BAD_SEQID; 315 list_insert_tail(&mi->mi_bseqid_list, recovp->rc_bseqid_rqst); 316 recovp->rc_bseqid_rqst = NULL; 317 mutex_exit(&mi->mi_lock); 318 } 319 320 /* 321 * Initiate recovery. 322 * 323 * The nfs4_error_t contains the return codes that triggered a recovery 324 * attempt. 
mi, vp1, and vp2 refer to the filesystem and files that were 325 * being operated on. vp1 and vp2 may be NULL. 326 * 327 * Multiple calls are okay. If recovery is already underway, the call 328 * updates the information about what state needs recovery but does not 329 * start a new thread. The caller should hold mi->mi_recovlock as a reader 330 * for proper synchronization with any recovery thread. 331 * 332 * This will return TRUE if recovery was aborted, and FALSE otherwise. 333 */ 334 bool_t 335 nfs4_start_recovery(nfs4_error_t *ep, mntinfo4_t *mi, vnode_t *vp1, 336 vnode_t *vp2, stateid4 *sid, nfs4_lost_rqst_t *lost_rqstp, nfs_opnum4 op, 337 nfs4_bseqid_entry_t *bsep) 338 { 339 recov_info_t *recovp; 340 nfs4_server_t *sp; 341 bool_t abort = FALSE; 342 bool_t gone = FALSE; 343 344 ASSERT(nfs_zone() == mi->mi_zone); 345 mutex_enter(&mi->mi_lock); 346 /* 347 * If there is lost state, we need to kick off recovery even if the 348 * filesystem has been unmounted or the zone is shutting down. 
349 */ 350 gone = FS_OR_ZONE_GONE4(mi->mi_vfsp); 351 if (gone) { 352 ASSERT(ep->error != EINTR || lost_rqstp != NULL); 353 if (ep->error == EIO && lost_rqstp == NULL) { 354 /* failed due to forced unmount, no new lost state */ 355 abort = TRUE; 356 } 357 if ((ep->error == 0 || ep->error == ETIMEDOUT) && 358 !(mi->mi_recovflags & MI4R_LOST_STATE)) { 359 /* some other failure, no existing lost state */ 360 abort = TRUE; 361 } 362 if (abort) { 363 mutex_exit(&mi->mi_lock); 364 NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE, 365 "nfs4_start_recovery: fs unmounted")); 366 return (TRUE); 367 } 368 } 369 mi->mi_in_recovery++; 370 mutex_exit(&mi->mi_lock); 371 372 recovp = kmem_alloc(sizeof (recov_info_t), KM_SLEEP); 373 recovp->rc_orig_errors = *ep; 374 sp = find_nfs4_server(mi); 375 errs_to_action(recovp, sp, mi, sid, lost_rqstp, 376 gone, op, bsep); 377 if (sp != NULL) 378 mutex_exit(&sp->s_lock); 379 start_recovery(recovp, mi, vp1, vp2, sp); 380 if (sp != NULL) 381 nfs4_server_rele(sp); 382 return (FALSE); 383 } 384 385 /* 386 * Internal version of nfs4_start_recovery. The difference is that the 387 * caller specifies the recovery action, rather than the errors leading to 388 * recovery. 
389 */ 390 static void 391 start_recovery_action(nfs4_recov_t what, bool_t reboot, mntinfo4_t *mi, 392 vnode_t *vp1, vnode_t *vp2) 393 { 394 recov_info_t *recovp; 395 396 ASSERT(nfs_zone() == mi->mi_zone); 397 mutex_enter(&mi->mi_lock); 398 mi->mi_in_recovery++; 399 mutex_exit(&mi->mi_lock); 400 401 recovp = kmem_zalloc(sizeof (recov_info_t), KM_SLEEP); 402 recovp->rc_action = what; 403 recovp->rc_srv_reboot = reboot; 404 recovp->rc_error = EIO; 405 start_recovery(recovp, mi, vp1, vp2, NULL); 406 } 407 408 static void 409 start_recovery(recov_info_t *recovp, mntinfo4_t *mi, 410 vnode_t *vp1, vnode_t *vp2, nfs4_server_t *sp) 411 { 412 NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE, 413 "start_recovery: mi %p, what %s", (void*)mi, 414 nfs4_recov_action_to_str(recovp->rc_action))); 415 416 /* 417 * Bump the reference on the vfs so that we can pass it to the 418 * recovery thread. 419 */ 420 VFS_HOLD(mi->mi_vfsp); 421 422 again: 423 switch (recovp->rc_action) { 424 case NR_FAILOVER: 425 ASSERT(nfs_rw_lock_held(&mi->mi_recovlock, RW_READER) || 426 nfs_rw_lock_held(&mi->mi_recovlock, RW_WRITER)); 427 if (mi->mi_servers->sv_next == NULL) 428 goto out_no_thread; 429 mutex_enter(&mi->mi_lock); 430 mi->mi_recovflags |= MI4R_NEED_NEW_SERVER; 431 mutex_exit(&mi->mi_lock); 432 433 if (recovp->rc_lost_rqst != NULL) 434 nfs4_enqueue_lost_rqst(recovp, mi); 435 break; 436 437 case NR_CLIENTID: 438 /* 439 * If the filesystem has been unmounted, punt. 440 */ 441 if (sp == NULL) 442 goto out_no_thread; 443 444 /* 445 * If nobody else is working on the clientid, mark the 446 * clientid as being no longer set. Then mark the specific 447 * filesystem being worked on. 
448 */ 449 if (!nfs4_server_in_recovery(sp)) { 450 mutex_enter(&sp->s_lock); 451 sp->s_flags &= ~N4S_CLIENTID_SET; 452 mutex_exit(&sp->s_lock); 453 } 454 ASSERT(nfs_rw_lock_held(&mi->mi_recovlock, RW_READER) || 455 nfs_rw_lock_held(&mi->mi_recovlock, RW_WRITER)); 456 mutex_enter(&mi->mi_lock); 457 mi->mi_recovflags |= MI4R_NEED_CLIENTID; 458 if (recovp->rc_srv_reboot) 459 mi->mi_recovflags |= MI4R_SRV_REBOOT; 460 mutex_exit(&mi->mi_lock); 461 break; 462 463 case NR_OPENFILES: 464 ASSERT(nfs_rw_lock_held(&mi->mi_recovlock, RW_READER) || 465 nfs_rw_lock_held(&mi->mi_recovlock, RW_WRITER)); 466 mutex_enter(&mi->mi_lock); 467 mi->mi_recovflags |= MI4R_REOPEN_FILES; 468 if (recovp->rc_srv_reboot) 469 mi->mi_recovflags |= MI4R_SRV_REBOOT; 470 mutex_exit(&mi->mi_lock); 471 break; 472 473 case NR_WRONGSEC: 474 ASSERT(nfs_rw_lock_held(&mi->mi_recovlock, RW_READER) || 475 nfs_rw_lock_held(&mi->mi_recovlock, RW_WRITER)); 476 mutex_enter(&mi->mi_lock); 477 mi->mi_recovflags |= MI4R_NEED_SECINFO; 478 mutex_exit(&mi->mi_lock); 479 break; 480 481 case NR_EXPIRED: 482 if (vp1 != NULL) 483 recov_badstate(recovp, vp1, NFS4ERR_EXPIRED); 484 if (vp2 != NULL) 485 recov_badstate(recovp, vp2, NFS4ERR_EXPIRED); 486 goto out_no_thread; /* no further recovery possible */ 487 488 case NR_BAD_STATEID: 489 if (vp1 != NULL) 490 recov_badstate(recovp, vp1, NFS4ERR_BAD_STATEID); 491 if (vp2 != NULL) 492 recov_badstate(recovp, vp2, NFS4ERR_BAD_STATEID); 493 goto out_no_thread; /* no further recovery possible */ 494 495 case NR_FHEXPIRED: 496 case NR_BADHANDLE: 497 if (vp1 != NULL) 498 recov_throttle(recovp, vp1); 499 if (vp2 != NULL) 500 recov_throttle(recovp, vp2); 501 /* 502 * Recover the filehandle now, rather than using a 503 * separate thread. We can do this because filehandle 504 * recovery is independent of any other state, and because 505 * we know that we are not competing with the recovery 506 * thread at this time. 
recov_filehandle will deal with 507 * threads that are competing to recover this filehandle. 508 */ 509 ASSERT(nfs_rw_lock_held(&mi->mi_recovlock, RW_READER) || 510 nfs_rw_lock_held(&mi->mi_recovlock, RW_WRITER)); 511 if (vp1 != NULL) 512 recov_filehandle(recovp->rc_action, mi, vp1); 513 if (vp2 != NULL) 514 recov_filehandle(recovp->rc_action, mi, vp2); 515 goto out_no_thread; /* no further recovery needed */ 516 517 case NR_STALE: 518 /* 519 * NFS4ERR_STALE handling 520 * recov_stale() could set MI4R_NEED_NEW_SERVER to 521 * indicate that we can and should failover. 522 */ 523 ASSERT(nfs_rw_lock_held(&mi->mi_recovlock, RW_READER) || 524 nfs_rw_lock_held(&mi->mi_recovlock, RW_WRITER)); 525 526 if (vp1 != NULL) 527 recov_stale(mi, vp1); 528 if (vp2 != NULL) 529 recov_stale(mi, vp2); 530 mutex_enter(&mi->mi_lock); 531 if ((mi->mi_recovflags & MI4R_NEED_NEW_SERVER) == 0) { 532 mutex_exit(&mi->mi_lock); 533 goto out_no_thread; 534 } 535 mutex_exit(&mi->mi_lock); 536 recovp->rc_action = NR_FAILOVER; 537 goto again; 538 539 case NR_BAD_SEQID: 540 if (recovp->rc_bseqid_rqst) { 541 enqueue_bseqid_rqst(recovp, mi); 542 break; 543 } 544 545 if (vp1 != NULL) 546 recov_badstate(recovp, vp1, NFS4ERR_BAD_SEQID); 547 if (vp2 != NULL) 548 recov_badstate(recovp, vp2, NFS4ERR_BAD_SEQID); 549 goto out_no_thread; /* no further recovery possible */ 550 551 case NR_OLDSTATEID: 552 if (vp1 != NULL) 553 recov_badstate(recovp, vp1, NFS4ERR_OLD_STATEID); 554 if (vp2 != NULL) 555 recov_badstate(recovp, vp2, NFS4ERR_OLD_STATEID); 556 goto out_no_thread; /* no further recovery possible */ 557 558 case NR_GRACE: 559 nfs4_set_grace_wait(mi); 560 goto out_no_thread; /* no further action required for GRACE */ 561 562 case NR_DELAY: 563 if (vp1) 564 nfs4_set_delay_wait(vp1); 565 goto out_no_thread; /* no further action required for DELAY */ 566 567 case NR_LOST_STATE_RQST: 568 case NR_LOST_LOCK: 569 nfs4_enqueue_lost_rqst(recovp, mi); 570 break; 571 572 default: 573 
nfs4_queue_event(RE_UNEXPECTED_ACTION, mi, NULL, 574 recovp->rc_action, NULL, NULL, 0, NULL, 0, TAG_NONE, 575 TAG_NONE, 0, 0); 576 goto out_no_thread; 577 } 578 579 /* 580 * If either file recently went through the same recovery, wait 581 * awhile. This is in case there is some sort of bug; we might not 582 * be able to recover properly, but at least we won't bombard the 583 * server with calls, and we won't tie up the client. 584 */ 585 if (vp1 != NULL) 586 recov_throttle(recovp, vp1); 587 if (vp2 != NULL) 588 recov_throttle(recovp, vp2); 589 590 /* 591 * If there's already a recovery thread, don't start another one. 592 */ 593 594 mutex_enter(&mi->mi_lock); 595 if (mi->mi_flags & MI4_RECOV_ACTIV) { 596 mutex_exit(&mi->mi_lock); 597 goto out_no_thread; 598 } 599 mi->mi_flags |= MI4_RECOV_ACTIV; 600 mutex_exit(&mi->mi_lock); 601 NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE, 602 "start_recovery: starting new thread for mi %p", (void*)mi)); 603 604 recovp->rc_mi = mi; 605 recovp->rc_vp1 = vp1; 606 if (vp1 != NULL) { 607 ASSERT(VTOMI4(vp1) == mi); 608 VN_HOLD(recovp->rc_vp1); 609 } 610 recovp->rc_vp2 = vp2; 611 if (vp2 != NULL) { 612 ASSERT(VTOMI4(vp2) == mi); 613 VN_HOLD(recovp->rc_vp2); 614 } 615 616 (void) zthread_create(NULL, 0, nfs4_recov_thread, recovp, 0, 617 minclsyspri); 618 return; 619 620 /* not reached by thread creating call */ 621 out_no_thread: 622 mutex_enter(&mi->mi_lock); 623 mi->mi_in_recovery--; 624 if (mi->mi_in_recovery == 0) 625 cv_broadcast(&mi->mi_cv_in_recov); 626 mutex_exit(&mi->mi_lock); 627 628 VFS_RELE(mi->mi_vfsp); 629 /* 630 * Free up resources that were allocated for us. 
631 */ 632 kmem_free(recovp, sizeof (recov_info_t)); 633 } 634 635 static int 636 nfs4_check_srvstub(vnode_t *vp, rnode4_t *rp, nfs4_op_hint_t op) 637 { 638 int err = 0; 639 640 /* 641 * If tuneable does not allow client to cross srv mountpoints and 642 * object is a stub, then check check op hint and return EACCES for 643 * any hint other than access, rddir, getattr, lookup. 644 */ 645 if (rp->r_flags & R4SRVSTUB && op != OH_ACCESS && op != OH_GETACL && 646 op != OH_GETATTR && op != OH_READDIR && op != OH_LOOKUP) { 647 err = EACCES; 648 #ifdef DEBUG 649 NFS4_DEBUG(nfs4_srvmnt_debug, (CE_NOTE, 650 "nfs4_check_srvstub: op=%d err=%d rp=%p vp=%p\n" 651 "va_nod=%llx r_mntd_fid=%llx\n" 652 "sv_fsid=(%llx:%llx) r_srv_fsid=(%llx:%llx)", 653 op, err, (void *)rp, (void *)vp, 654 (u_longlong_t)rp->r_attr.va_nodeid, 655 (u_longlong_t)rp->r_mntd_fid, 656 (u_longlong_t)rp->r_server->sv_fsid.major, 657 (u_longlong_t)rp->r_server->sv_fsid.minor, 658 (u_longlong_t)rp->r_srv_fsid.major, 659 (u_longlong_t)rp->r_srv_fsid.minor)); 660 #endif 661 } 662 663 return (err); 664 } 665 666 static int 667 nfs4_check_recov_err(vnode_t *vp, nfs4_op_hint_t op, 668 nfs4_recov_state_t *rsp, int retry_err_cnt, char *str) 669 { 670 rnode4_t *rp; 671 int error = 0; 672 int exempt; 673 674 if (vp == NULL) 675 return (0); 676 677 exempt = (op == OH_CLOSE || op == OH_LOCKU || op == OH_DELEGRETURN); 678 rp = VTOR4(vp); 679 mutex_enter(&rp->r_statelock); 680 681 /* 682 * If there was a recovery error, then allow op hints "exempt" from 683 * recov errors to retry (currently 3 times). Either r_error or 684 * EIO is returned for non-exempt op hints. 685 * 686 * Error heirarchy: 687 * a) check for R4ERECOVERR 688 * b) check for R4SRVSTUB (only if R4RECOVERR is not set). 
689 */ 690 if (rp->r_flags & R4RECOVERR) { 691 if (exempt && rsp->rs_num_retry_despite_err <= 692 nfs4_max_recov_error_retry) { 693 694 /* 695 * Check to make sure that we haven't already inc'd 696 * rs_num_retry_despite_err for current nfs4_start_fop 697 * instance. We don't want to double inc (if we were 698 * called with vp2, then the vp1 call could have 699 * already incremented. 700 */ 701 if (retry_err_cnt == rsp->rs_num_retry_despite_err) 702 rsp->rs_num_retry_despite_err++; 703 704 NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE, 705 "nfs4_start_fop: %s %p DEAD, cnt=%d", str, 706 (void *)vp, rsp->rs_num_retry_despite_err)); 707 } else { 708 error = (rp->r_error ? rp->r_error : EIO); 709 /* 710 * An ESTALE error on a non-regular file is not 711 * "sticky". Return the ESTALE error once, but 712 * clear the condition to allow future operations 713 * to go OTW. This will allow the client to 714 * recover if the server has merely unshared then 715 * re-shared the file system. For regular files, 716 * the unshare has destroyed the open state at the 717 * server and we aren't willing to do a reopen (yet). 718 */ 719 if (error == ESTALE && vp->v_type != VREG) { 720 rp->r_flags &= 721 ~(R4RECOVERR|R4RECOVERRP|R4STALE); 722 rp->r_error = 0; 723 error = ESTALE; 724 } 725 NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE, 726 "nfs4_start_fop: %s %p DEAD, cnt=%d error=%d", 727 str, (void *)vp, 728 rsp->rs_num_retry_despite_err, error)); 729 } 730 } else { 731 error = nfs4_check_srvstub(vp, rp, op); 732 NFS4_DEBUG(nfs4_client_recov_stub_debug, (CE_NOTE, 733 "nfs4_start_fop: %s %p SRVSTUB, error=%d", str, 734 (void *)vp, error)); 735 } 736 mutex_exit(&rp->r_statelock); 737 return (error); 738 } 739 740 /* 741 * Initial setup code that every operation should call if it might invoke 742 * client recovery. Can block waiting for recovery to finish on a 743 * filesystem. Either vnode ptr can be NULL. 744 * 745 * Returns 0 if there are no outstanding errors. 
Can return an 746 * errno value under various circumstances (e.g., failed recovery, or 747 * interrupted while waiting for recovery to finish). 748 * 749 * There must be a corresponding call to nfs4_end_op() to free up any locks 750 * or resources allocated by this call (assuming this call succeeded), 751 * using the same rsp that's passed in here. 752 * 753 * The open and lock seqid synchronization must be stopped before calling this 754 * function, as it could lead to deadlock when trying to reopen a file or 755 * reclaim a lock. The synchronization is obtained with calls to: 756 * nfs4_start_open_seqid_sync() 757 * nfs4_start_lock_seqid_sync() 758 * 759 * *startrecovp is set TRUE if the caller should not bother with the 760 * over-the-wire call, and just initiate recovery for the given request. 761 * This is typically used for state-releasing ops if the filesystem has 762 * been forcibly unmounted. startrecovp may be NULL for 763 * non-state-releasing ops. 764 */ 765 766 int 767 nfs4_start_fop(mntinfo4_t *mi, vnode_t *vp1, vnode_t *vp2, nfs4_op_hint_t op, 768 nfs4_recov_state_t *rsp, bool_t *startrecovp) 769 { 770 int error = 0, rerr_cnt; 771 nfs4_server_t *sp = NULL; 772 nfs4_server_t *tsp; 773 nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS }; 774 time_t droplock_time; 775 #ifdef DEBUG 776 void *fop_caller; 777 #endif 778 779 ASSERT(vp1 == NULL || vp1->v_vfsp == mi->mi_vfsp); 780 ASSERT(vp2 == NULL || vp2->v_vfsp == mi->mi_vfsp); 781 782 #ifdef DEBUG 783 if ((fop_caller = tsd_get(nfs4_tsd_key)) != NULL) { 784 cmn_err(CE_PANIC, "Missing nfs4_end_fop: last caller %p", 785 fop_caller); 786 } 787 (void) tsd_set(nfs4_tsd_key, caller()); 788 #endif 789 790 rsp->rs_sp = NULL; 791 rsp->rs_flags &= ~NFS4_RS_RENAME_HELD; 792 rerr_cnt = rsp->rs_num_retry_despite_err; 793 794 /* 795 * Process the items that may delay() based on server response 796 */ 797 error = nfs4_wait_for_grace(mi, rsp); 798 if (error) 799 goto out; 800 801 if (vp1 != NULL) { 802 error = 
nfs4_wait_for_delay(vp1, rsp); 803 if (error) 804 goto out; 805 } 806 807 /* Wait for a delegation recall to complete. */ 808 809 error = wait_for_recall(vp1, vp2, op, rsp); 810 if (error) 811 goto out; 812 813 /* 814 * Wait for any current recovery actions to finish. Note that a 815 * recovery thread can still start up after wait_for_recovery() 816 * finishes. We don't block out recovery operations until we 817 * acquire s_recovlock and mi_recovlock. 818 */ 819 error = wait_for_recovery(mi, op); 820 if (error) 821 goto out; 822 823 /* 824 * Check to see if the rnode is already marked with a 825 * recovery error. If so, return it immediately. But 826 * always pass CLOSE, LOCKU, and DELEGRETURN so we can 827 * clean up state on the server. 828 */ 829 830 if (vp1 != NULL) { 831 if (error = nfs4_check_recov_err(vp1, op, rsp, rerr_cnt, "vp1")) 832 goto out; 833 nfs4_check_remap(mi, vp1, NFS4_REMAP_CKATTRS, &e); 834 } 835 836 if (vp2 != NULL) { 837 if (error = nfs4_check_recov_err(vp2, op, rsp, rerr_cnt, "vp2")) 838 goto out; 839 nfs4_check_remap(mi, vp2, NFS4_REMAP_CKATTRS, &e); 840 } 841 842 /* 843 * The lock order calls for us to acquire s_recovlock before 844 * mi_recovlock, but we have to hold mi_recovlock to look up sp (to 845 * prevent races with the failover/migration code). So acquire 846 * mi_recovlock, look up sp, drop mi_recovlock, acquire 847 * s_recovlock and mi_recovlock, then verify that sp is still the 848 * right object. XXX Can we find a simpler way to deal with this? 
849 */ 850 if (nfs_rw_enter_sig(&mi->mi_recovlock, RW_READER, 851 mi->mi_flags & MI4_INT)) { 852 error = EINTR; 853 goto out; 854 } 855 get_sp: 856 sp = find_nfs4_server(mi); 857 if (sp != NULL) { 858 sp->s_otw_call_count++; 859 mutex_exit(&sp->s_lock); 860 droplock_time = gethrestime_sec(); 861 } 862 nfs_rw_exit(&mi->mi_recovlock); 863 864 if (sp != NULL) { 865 if (nfs_rw_enter_sig(&sp->s_recovlock, RW_READER, 866 mi->mi_flags & MI4_INT)) { 867 error = EINTR; 868 goto out; 869 } 870 } 871 if (nfs_rw_enter_sig(&mi->mi_recovlock, RW_READER, 872 mi->mi_flags & MI4_INT)) { 873 if (sp != NULL) 874 nfs_rw_exit(&sp->s_recovlock); 875 error = EINTR; 876 goto out; 877 } 878 /* 879 * If the mntinfo4_t hasn't changed nfs4_sever_ts then 880 * there's no point in double checking to make sure it 881 * has switched. 882 */ 883 if (sp == NULL || droplock_time < mi->mi_srvsettime) { 884 tsp = find_nfs4_server(mi); 885 if (tsp != sp) { 886 /* try again */ 887 if (tsp != NULL) { 888 mutex_exit(&tsp->s_lock); 889 nfs4_server_rele(tsp); 890 tsp = NULL; 891 } 892 if (sp != NULL) { 893 nfs_rw_exit(&sp->s_recovlock); 894 mutex_enter(&sp->s_lock); 895 sp->s_otw_call_count--; 896 mutex_exit(&sp->s_lock); 897 nfs4_server_rele(sp); 898 sp = NULL; 899 } 900 goto get_sp; 901 } else { 902 if (tsp != NULL) { 903 mutex_exit(&tsp->s_lock); 904 nfs4_server_rele(tsp); 905 tsp = NULL; 906 } 907 } 908 } 909 910 if (sp != NULL) { 911 rsp->rs_sp = sp; 912 } 913 914 /* 915 * If the fileystem uses volatile filehandles, obtain a lock so 916 * that we synchronize with renames. Exception: mount operations 917 * can change mi_fh_expire_type, which could be a problem, since 918 * the end_op code needs to be consistent with the start_op code 919 * about mi_rename_lock. Since mounts don't compete with renames, 920 * it's simpler to just not acquire the rename lock for mounts. 921 */ 922 if (NFS4_VOLATILE_FH(mi) && op != OH_MOUNT) { 923 if (nfs_rw_enter_sig(&mi->mi_rename_lock, 924 op == OH_VFH_RENAME ? 
RW_WRITER : RW_READER, 925 mi->mi_flags & MI4_INT)) { 926 nfs_rw_exit(&mi->mi_recovlock); 927 if (sp != NULL) 928 nfs_rw_exit(&sp->s_recovlock); 929 error = EINTR; 930 goto out; 931 } 932 rsp->rs_flags |= NFS4_RS_RENAME_HELD; 933 } 934 935 if (OH_IS_STATE_RELE(op)) { 936 /* 937 * For forced unmount, letting the request proceed will 938 * almost always delay response to the user, so hand it off 939 * to the recovery thread. For exiting lwp's, we don't 940 * have a good way to tell if the request will hang. We 941 * generally want processes to handle their own requests so 942 * that they can be done in parallel, but if there is 943 * already a recovery thread, hand the request off to it. 944 * This will improve user response at no cost to overall 945 * system throughput. For zone shutdown, we'd prefer 946 * the recovery thread to handle this as well. 947 */ 948 ASSERT(startrecovp != NULL); 949 mutex_enter(&mi->mi_lock); 950 if (FS_OR_ZONE_GONE4(mi->mi_vfsp)) 951 *startrecovp = TRUE; 952 else if ((curthread->t_proc_flag & TP_LWPEXIT) && 953 (mi->mi_flags & MI4_RECOV_ACTIV)) 954 *startrecovp = TRUE; 955 else 956 *startrecovp = FALSE; 957 mutex_exit(&mi->mi_lock); 958 } else 959 if (startrecovp != NULL) 960 *startrecovp = FALSE; 961 962 ASSERT(error == 0); 963 return (error); 964 965 out: 966 ASSERT(error != 0); 967 if (sp != NULL) { 968 mutex_enter(&sp->s_lock); 969 sp->s_otw_call_count--; 970 mutex_exit(&sp->s_lock); 971 nfs4_server_rele(sp); 972 rsp->rs_sp = NULL; 973 } 974 nfs4_end_op_recall(vp1, vp2, rsp); 975 976 #ifdef DEBUG 977 (void) tsd_set(nfs4_tsd_key, NULL); 978 #endif 979 return (error); 980 } 981 982 /* 983 * It is up to the caller to determine if rsp->rs_sp being NULL 984 * is detrimental or not. 
985 */ 986 int 987 nfs4_start_op(mntinfo4_t *mi, vnode_t *vp1, vnode_t *vp2, 988 nfs4_recov_state_t *rsp) 989 { 990 ASSERT(rsp->rs_num_retry_despite_err == 0); 991 rsp->rs_num_retry_despite_err = 0; 992 return (nfs4_start_fop(mi, vp1, vp2, OH_OTHER, rsp, NULL)); 993 } 994 995 /* 996 * Release any resources acquired by nfs4_start_op(). 997 * 'sp' should be the nfs4_server pointer returned by nfs4_start_op(). 998 * 999 * The operation hint is used to avoid a deadlock by bypassing delegation 1000 * return logic for writes, which are done while returning a delegation. 1001 */ 1002 1003 void 1004 nfs4_end_fop(mntinfo4_t *mi, vnode_t *vp1, vnode_t *vp2, nfs4_op_hint_t op, 1005 nfs4_recov_state_t *rsp, bool_t needs_recov) 1006 { 1007 nfs4_server_t *sp = rsp->rs_sp; 1008 rnode4_t *rp = NULL; 1009 1010 #ifdef lint 1011 /* 1012 * The op hint isn't used any more, but might be in 1013 * the future. 1014 */ 1015 op = op; 1016 #endif 1017 1018 #ifdef DEBUG 1019 ASSERT(tsd_get(nfs4_tsd_key) != NULL); 1020 (void) tsd_set(nfs4_tsd_key, NULL); 1021 #endif 1022 1023 nfs4_end_op_recall(vp1, vp2, rsp); 1024 1025 if (rsp->rs_flags & NFS4_RS_RENAME_HELD) 1026 nfs_rw_exit(&mi->mi_rename_lock); 1027 1028 if (!needs_recov) { 1029 if (rsp->rs_flags & NFS4_RS_DELAY_MSG) { 1030 /* may need to clear the delay interval */ 1031 if (vp1 != NULL) { 1032 rp = VTOR4(vp1); 1033 mutex_enter(&rp->r_statelock); 1034 rp->r_delay_interval = 0; 1035 mutex_exit(&rp->r_statelock); 1036 } 1037 } 1038 rsp->rs_flags &= ~(NFS4_RS_GRACE_MSG|NFS4_RS_DELAY_MSG); 1039 } 1040 1041 /* 1042 * If the corresponding nfs4_start_op() found a sp, 1043 * then there must still be a sp. 
1044 */ 1045 if (sp != NULL) { 1046 nfs_rw_exit(&mi->mi_recovlock); 1047 nfs_rw_exit(&sp->s_recovlock); 1048 mutex_enter(&sp->s_lock); 1049 sp->s_otw_call_count--; 1050 cv_broadcast(&sp->s_cv_otw_count); 1051 mutex_exit(&sp->s_lock); 1052 nfs4_server_rele(sp); 1053 } else { 1054 nfs_rw_exit(&mi->mi_recovlock); 1055 } 1056 } 1057 1058 void 1059 nfs4_end_op(mntinfo4_t *mi, vnode_t *vp1, vnode_t *vp2, 1060 nfs4_recov_state_t *rsp, bool_t needrecov) 1061 { 1062 nfs4_end_fop(mi, vp1, vp2, OH_OTHER, rsp, needrecov); 1063 } 1064 1065 /* 1066 * If the filesystem is going through client recovery, block until 1067 * finished. 1068 * Exceptions: 1069 * - state-releasing ops (CLOSE, LOCKU, DELEGRETURN) are allowed to proceed 1070 * if the filesystem has been forcibly unmounted or the lwp is exiting. 1071 * 1072 * Return value: 1073 * - 0 if no errors 1074 * - EINTR if the call was interrupted 1075 * - EIO if the filesystem has been forcibly unmounted (non-state-releasing 1076 * op) 1077 * - the errno value from the recovery thread, if recovery failed 1078 */ 1079 1080 static int 1081 wait_for_recovery(mntinfo4_t *mi, nfs4_op_hint_t op_hint) 1082 { 1083 int error = 0; 1084 1085 mutex_enter(&mi->mi_lock); 1086 1087 while (mi->mi_recovflags != 0) { 1088 klwp_t *lwp = ttolwp(curthread); 1089 1090 if (mi->mi_flags & MI4_RECOV_FAIL) 1091 break; 1092 if (mi->mi_vfsp->vfs_flag & VFS_UNMOUNTED) 1093 break; 1094 if (OH_IS_STATE_RELE(op_hint) && 1095 (curthread->t_proc_flag & TP_LWPEXIT)) 1096 break; 1097 1098 if (lwp != NULL) 1099 lwp->lwp_nostop++; 1100 /* XXX - use different cv? 
*/ 1101 if (cv_wait_sig(&mi->mi_failover_cv, &mi->mi_lock) == 0) { 1102 error = EINTR; 1103 if (lwp != NULL) 1104 lwp->lwp_nostop--; 1105 break; 1106 } 1107 if (lwp != NULL) 1108 lwp->lwp_nostop--; 1109 } 1110 1111 if (mi->mi_flags & MI4_RECOV_FAIL) { 1112 NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE, 1113 "wait_for_recovery: fail since RECOV FAIL")); 1114 error = mi->mi_error; 1115 } else if ((mi->mi_vfsp->vfs_flag & VFS_UNMOUNTED) && 1116 !OH_IS_STATE_RELE(op_hint)) { 1117 NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE, 1118 "wait_for_recovery: forced unmount")); 1119 error = EIO; 1120 } 1121 1122 mutex_exit(&mi->mi_lock); 1123 1124 return (error); 1125 } 1126 1127 /* 1128 * If the client received NFS4ERR_GRACE for this particular mount, 1129 * the client blocks here until it is time to try again. 1130 * 1131 * Return value: 1132 * - 0 if wait was successful 1133 * - EINTR if the call was interrupted 1134 */ 1135 1136 int 1137 nfs4_wait_for_grace(mntinfo4_t *mi, nfs4_recov_state_t *rsp) 1138 { 1139 int error = 0; 1140 time_t curtime, time_to_wait; 1141 1142 /* do a unprotected check to reduce mi_lock contention */ 1143 if (mi->mi_grace_wait != 0) { 1144 mutex_enter(&mi->mi_lock); 1145 1146 if (mi->mi_grace_wait != 0) { 1147 if (!(rsp->rs_flags & NFS4_RS_GRACE_MSG)) 1148 rsp->rs_flags |= NFS4_RS_GRACE_MSG; 1149 1150 curtime = gethrestime_sec(); 1151 1152 if (curtime < mi->mi_grace_wait) { 1153 1154 time_to_wait = mi->mi_grace_wait - curtime; 1155 1156 mutex_exit(&mi->mi_lock); 1157 1158 delay(SEC_TO_TICK(time_to_wait)); 1159 1160 curtime = gethrestime_sec(); 1161 1162 mutex_enter(&mi->mi_lock); 1163 1164 if (curtime >= mi->mi_grace_wait) 1165 mi->mi_grace_wait = 0; 1166 } else { 1167 mi->mi_grace_wait = 0; 1168 } 1169 } 1170 mutex_exit(&mi->mi_lock); 1171 } 1172 1173 return (error); 1174 } 1175 1176 /* 1177 * If the client received NFS4ERR_DELAY for an operation on a vnode, 1178 * the client blocks here until it is time to try again. 
1179 * 1180 * Return value: 1181 * - 0 if wait was successful 1182 * - EINTR if the call was interrupted 1183 */ 1184 1185 int 1186 nfs4_wait_for_delay(vnode_t *vp, nfs4_recov_state_t *rsp) 1187 { 1188 int error = 0; 1189 time_t curtime, time_to_wait; 1190 rnode4_t *rp; 1191 1192 ASSERT(vp != NULL); 1193 1194 rp = VTOR4(vp); 1195 1196 /* do a unprotected check to reduce r_statelock contention */ 1197 if (rp->r_delay_wait != 0) { 1198 mutex_enter(&rp->r_statelock); 1199 1200 if (rp->r_delay_wait != 0) { 1201 1202 if (!(rsp->rs_flags & NFS4_RS_DELAY_MSG)) { 1203 rsp->rs_flags |= NFS4_RS_DELAY_MSG; 1204 nfs4_mi_kstat_inc_delay(VTOMI4(vp)); 1205 } 1206 1207 curtime = gethrestime_sec(); 1208 1209 if (curtime < rp->r_delay_wait) { 1210 1211 time_to_wait = rp->r_delay_wait - curtime; 1212 1213 mutex_exit(&rp->r_statelock); 1214 1215 delay(SEC_TO_TICK(time_to_wait)); 1216 1217 curtime = gethrestime_sec(); 1218 1219 mutex_enter(&rp->r_statelock); 1220 1221 if (curtime >= rp->r_delay_wait) 1222 rp->r_delay_wait = 0; 1223 } else { 1224 rp->r_delay_wait = 0; 1225 } 1226 } 1227 mutex_exit(&rp->r_statelock); 1228 } 1229 1230 return (error); 1231 } 1232 1233 /* 1234 * The recovery thread. 
 */

/*
 * Thread body.  'recovp' describes the recovery to perform; this routine
 * frees it before exiting, releases the rc_vp1/rc_vp2 vnode references it
 * carries, and releases a hold on the filesystem's vfs.
 *
 * Each pass of the main loop runs the following steps in order, each only
 * when its condition holds:
 *   1. forced unmount / zone shutdown handling
 *   2. failover to a new server (MI4R_NEED_NEW_SERVER)
 *   3. clientid recovery (server's N4S_CLIENTID_SET not set)
 *   4. security information recovery (MI4R_NEED_SECINFO)
 *   5. bad seqid recovery (MI4R_BAD_SEQID)
 *   6. reopening files (MI4R_REOPEN_FILES)
 *   7. resending lost state requests (MI4R_LOST_STATE)
 * and then decides whether recovery is finished.  The loop ends when no
 * recovery flags other than MI4R_SRV_REBOOT remain, when MI4_RECOV_FAIL
 * is set, or when the filesystem/zone is gone and there is no lost state
 * to process.
 *
 * This function does not return; it ends with zthread_exit().
 */
static void
nfs4_recov_thread(recov_info_t *recovp)
{
	mntinfo4_t *mi = recovp->rc_mi;
	nfs4_server_t *sp;
	int done = 0, error = 0;
	bool_t recov_fail = FALSE;
	callb_cpr_t cpr_info;
	kmutex_t cpr_lock;
	vfs_t *tvfsp;

	/* Log the start of recovery, including the flags that triggered it. */
	nfs4_queue_event(RE_START, mi, NULL, mi->mi_recovflags,
	    recovp->rc_vp1, recovp->rc_vp2, 0, NULL, 0, TAG_NONE, TAG_NONE,
	    0, 0);

	mutex_init(&cpr_lock, NULL, MUTEX_DEFAULT, NULL);
	CALLB_CPR_INIT(&cpr_info, &cpr_lock, callb_generic_cpr, "nfsv4Recov");

	/* Record ourselves as this filesystem's recovery thread. */
	mutex_enter(&mi->mi_lock);
	mi->mi_recovthread = curthread;
	mutex_exit(&mi->mi_lock);

	/*
	 * We don't really need protection here against failover or
	 * migration, since the current thread is the one that would make
	 * any changes, but hold mi_recovlock anyway for completeness (and
	 * to satisfy any ASSERTs).
	 */
	(void) nfs_rw_enter_sig(&mi->mi_recovlock, RW_READER, 0);
	sp = find_nfs4_server(mi);
	/* A non-NULL sp comes back with s_lock held; drop it here. */
	if (sp != NULL)
		mutex_exit(&sp->s_lock);
	nfs_rw_exit(&mi->mi_recovlock);

	/*
	 * Do any necessary recovery, based on the information in recovp
	 * and any recovery flags.
	 */

	do {
		mutex_enter(&mi->mi_lock);
		if (FS_OR_ZONE_GONE4(mi->mi_vfsp)) {
			bool_t activesrv;

			NFS4_DEBUG(nfs4_client_recov_debug &&
			    mi->mi_vfsp->vfs_flag & VFS_UNMOUNTED, (CE_NOTE,
			    "nfs4_recov_thread: file system has been "
			    "unmounted"));
			NFS4_DEBUG(nfs4_client_recov_debug &&
			    zone_status_get(curproc->p_zone) >=
			    ZONE_IS_SHUTTING_DOWN, (CE_NOTE,
			    "nfs4_recov_thread: zone shutting down"));
			/*
			 * If the server has lost its state for us and
			 * the filesystem is unmounted, then the filesystem
			 * can be tossed, even if there are lost lock or
			 * lost state calls in the recovery queue.
			 */
			if (mi->mi_recovflags &
			    (MI4R_NEED_CLIENTID | MI4R_REOPEN_FILES)) {
				NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
				"nfs4_recov_thread: bailing out"));
				mi->mi_flags |= MI4_RECOV_FAIL;
				mi->mi_error = recovp->rc_error;
				recov_fail = TRUE;
			}
			/*
			 * We don't know if the server has any state for
			 * us, and the filesystem has been unmounted.  If
			 * there are "lost state" recovery items, keep
			 * trying to process them until there are no more
			 * mounted filesystems for the server.  Otherwise,
			 * bail out.  The reason we don't mark the
			 * filesystem as failing recovery is in case we
			 * have to do "lost state" recovery later (e.g., a
			 * user process exits).
			 */
			if (!(mi->mi_recovflags & MI4R_LOST_STATE)) {
				done = 1;
				mutex_exit(&mi->mi_lock);
				break;
			}
			mutex_exit(&mi->mi_lock);

			/*
			 * s_lock is taken here (when sp is non-NULL) and
			 * stays held across the !activesrv handling below;
			 * it is dropped at the end of this branch.
			 */
			if (sp == NULL)
				activesrv = FALSE;
			else {
				mutex_enter(&sp->s_lock);
				activesrv = nfs4_fs_active(sp);
			}
			if (!activesrv) {
				NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
				    "no active fs for server %p",
				    (void *)sp));
				mutex_enter(&mi->mi_lock);
				mi->mi_flags |= MI4_RECOV_FAIL;
				mi->mi_error = recovp->rc_error;
				mutex_exit(&mi->mi_lock);
				recov_fail = TRUE;
				if (sp != NULL) {
					/*
					 * Mark the server instance as
					 * dead, so that nobody will attach
					 * a new filesystem.
					 */
					nfs4_mark_srv_dead(sp);
				}
			}
			if (sp != NULL)
				mutex_exit(&sp->s_lock);
		} else {
			mutex_exit(&mi->mi_lock);
		}

		/*
		 * Check if we need to select a new server for a
		 * failover.  Choosing a new server will force at
		 * least a check of the clientid.
		 */
		mutex_enter(&mi->mi_lock);
		if (!recov_fail &&
		    (mi->mi_recovflags & MI4R_NEED_NEW_SERVER)) {
			mutex_exit(&mi->mi_lock);
			/* May replace sp and may set recov_fail. */
			recov_newserver(recovp, &sp, &recov_fail);
		} else
			mutex_exit(&mi->mi_lock);

		/*
		 * Check if we need to recover the clientid.  This
		 * must be done before file and lock recovery, and it
		 * potentially affects the recovery threads for other
		 * filesystems, so it gets special treatment.
		 */
		if (sp != NULL && recov_fail == FALSE) {
			mutex_enter(&sp->s_lock);
			if (!(sp->s_flags & N4S_CLIENTID_SET)) {
				mutex_exit(&sp->s_lock);
				recov_clientid(recovp, sp);
			} else {
				/*
				 * Unset this flag in case another recovery
				 * thread successfully recovered the clientid
				 * for us already.
				 */
				mutex_enter(&mi->mi_lock);
				mi->mi_recovflags &= ~MI4R_NEED_CLIENTID;
				mutex_exit(&mi->mi_lock);
				mutex_exit(&sp->s_lock);
			}
		}

		/*
		 * Check if we need to get the security information.
		 */
		mutex_enter(&mi->mi_lock);
		if ((mi->mi_recovflags & MI4R_NEED_SECINFO) &&
		    !(mi->mi_flags & MI4_RECOV_FAIL)) {
			mutex_exit(&mi->mi_lock);
			(void) nfs_rw_enter_sig(&mi->mi_recovlock,
			    RW_WRITER, 0);
			error = nfs4_secinfo_recov(recovp->rc_mi,
			    recovp->rc_vp1, recovp->rc_vp2);
			/*
			 * If error, nothing more can be done, stop
			 * the recovery.
			 */
			if (error) {
				mutex_enter(&mi->mi_lock);
				mi->mi_flags |= MI4_RECOV_FAIL;
				/*
				 * Note: mi_error gets the error that first
				 * triggered recovery, not the secinfo error;
				 * the latter is only logged in the event.
				 */
				mi->mi_error = recovp->rc_error;
				mutex_exit(&mi->mi_lock);
				nfs4_queue_event(RE_WRONGSEC, mi, NULL,
				    error, recovp->rc_vp1, recovp->rc_vp2,
				    0, NULL, 0, TAG_NONE, TAG_NONE, 0, 0);
			}
			nfs_rw_exit(&mi->mi_recovlock);
		} else
			mutex_exit(&mi->mi_lock);

		/*
		 * Check if there's a bad seqid to recover.
		 */
		mutex_enter(&mi->mi_lock);
		if ((mi->mi_recovflags & MI4R_BAD_SEQID) &&
		    !(mi->mi_flags & MI4_RECOV_FAIL)) {
			mutex_exit(&mi->mi_lock);
			(void) nfs_rw_enter_sig(&mi->mi_recovlock,
			    RW_WRITER, 0);
			recov_bad_seqid(recovp);
			nfs_rw_exit(&mi->mi_recovlock);
		} else
			mutex_exit(&mi->mi_lock);

		/*
		 * Next check for recovery that affects the entire
		 * filesystem.
		 */
		if (sp != NULL) {
			mutex_enter(&mi->mi_lock);
			if ((mi->mi_recovflags & MI4R_REOPEN_FILES) &&
			    !(mi->mi_flags & MI4_RECOV_FAIL)) {
				mutex_exit(&mi->mi_lock);
				recov_openfiles(recovp, sp);
			} else
				mutex_exit(&mi->mi_lock);
		}

		/*
		 * Send any queued state recovery requests.
		 */
		mutex_enter(&mi->mi_lock);
		if (sp != NULL &&
		    (mi->mi_recovflags & MI4R_LOST_STATE) &&
		    !(mi->mi_flags & MI4_RECOV_FAIL)) {
			mutex_exit(&mi->mi_lock);
			(void) nfs_rw_enter_sig(&mi->mi_recovlock,
			    RW_WRITER, 0);
			nfs4_resend_lost_rqsts(recovp, sp);
			/* Clear the flag only once the queue is empty. */
			if (list_head(&mi->mi_lost_state) == NULL) {
				/* done */
				mutex_enter(&mi->mi_lock);
				mi->mi_recovflags &= ~MI4R_LOST_STATE;
				mutex_exit(&mi->mi_lock);
			}
			nfs_rw_exit(&mi->mi_recovlock);
		} else {
			mutex_exit(&mi->mi_lock);
		}

		/*
		 * See if there is anything more to do.  If not, announce
		 * that we are done and exit.
		 *
		 * Need mi_recovlock to keep 'sp' valid.  Must grab
		 * mi_recovlock before mi_lock to preserve lock ordering.
		 */
		(void) nfs_rw_enter_sig(&mi->mi_recovlock, RW_READER, 0);
		mutex_enter(&mi->mi_lock);
		if ((mi->mi_recovflags & ~MI4R_SRV_REBOOT) == 0 ||
		    (mi->mi_flags & MI4_RECOV_FAIL)) {
			list_t local_lost_state;
			nfs4_lost_rqst_t *lrp;

			/*
			 * We need to remove the lost requests before we
			 * unmark the mi as no longer doing recovery to
			 * avoid a race with a new thread putting new lost
			 * requests on the same mi (and the going away
			 * thread would remove the new lost requests).
			 *
			 * Move the lost requests to a local list since
			 * nfs4_remove_lost_rqst() drops mi_lock, and
			 * dropping the mi_lock would make our check to
			 * see if recovery is done no longer valid.
			 */
			list_create(&local_lost_state,
			    sizeof (nfs4_lost_rqst_t),
			    offsetof(nfs4_lost_rqst_t, lr_node));
			list_move_tail(&local_lost_state, &mi->mi_lost_state);

			done = 1;
			mutex_exit(&mi->mi_lock);
			/*
			 * Now officially free the "moved"
			 * lost requests.
			 */
			while ((lrp = list_head(&local_lost_state)) != NULL) {
				list_remove(&local_lost_state, lrp);
				nfs4_free_lost_rqst(lrp, sp);
			}
			list_destroy(&local_lost_state);
		} else
			mutex_exit(&mi->mi_lock);
		nfs_rw_exit(&mi->mi_recovlock);

		/*
		 * If the filesystem has been forcibly unmounted, there is
		 * probably no point in retrying immediately.  Furthermore,
		 * there might be user processes waiting for a chance to
		 * queue up "lost state" requests, so that they can exit.
		 * So pause here for a moment.  Same logic for zone shutdown.
		 */
		if (!done && FS_OR_ZONE_GONE4(mi->mi_vfsp)) {
			mutex_enter(&mi->mi_lock);
			cv_broadcast(&mi->mi_failover_cv);
			mutex_exit(&mi->mi_lock);
			delay(SEC_TO_TICK(nfs4_unmount_delay));
		}

	} while (!done);

	/* Drop the server reference obtained above (or by failover). */
	if (sp != NULL)
		nfs4_server_rele(sp);

	/*
	 * Return all recalled delegations
	 */
	nfs4_dlistclean();

	mutex_enter(&mi->mi_lock);
	recov_done(mi, recovp);
	mutex_exit(&mi->mi_lock);

	/*
	 * Free up resources that were allocated for us.
	 */
	if (recovp->rc_vp1 != NULL)
		VN_RELE(recovp->rc_vp1);
	if (recovp->rc_vp2 != NULL)
		VN_RELE(recovp->rc_vp2);

	/* Once we broadcast complete, the mi structure could be freed */
	tvfsp = mi->mi_vfsp;

	/* now we are done using the mi struct, signal the waiters */
	mutex_enter(&mi->mi_lock);
	mi->mi_in_recovery--;
	if (mi->mi_in_recovery == 0)
		cv_broadcast(&mi->mi_cv_in_recov);
	mutex_exit(&mi->mi_lock);

	/* Use the saved vfs pointer; 'mi' must not be touched from here on. */
	VFS_RELE(tvfsp);

	kmem_free(recovp, sizeof (recov_info_t));
	/* cpr_lock is entered here and not explicitly exited before destroy. */
	mutex_enter(&cpr_lock);
	CALLB_CPR_EXIT(&cpr_info);
	mutex_destroy(&cpr_lock);
	zthread_exit();
}

/*
 * Log the end of recovery and notify any waiting threads.
 *
 * The caller must hold mi_lock.  Clears the recorded recovery thread, the
 * MI4_RECOV_ACTIV flag and the MI4R_SRV_REBOOT recovery flag, then wakes
 * everyone sleeping on mi_failover_cv (which is what wait_for_recovery()
 * sleeps on).
 */

static void
recov_done(mntinfo4_t *mi, recov_info_t *recovp)
{

	ASSERT(MUTEX_HELD(&mi->mi_lock));

	nfs4_queue_event(RE_END, mi, NULL, 0, recovp->rc_vp1,
	    recovp->rc_vp2, 0, NULL, 0, TAG_NONE, TAG_NONE, 0, 0);
	mi->mi_recovthread = NULL;
	mi->mi_flags &= ~MI4_RECOV_ACTIV;
	mi->mi_recovflags &= ~MI4R_SRV_REBOOT;
	cv_broadcast(&mi->mi_failover_cv);
}

/*
 * State-specific recovery routines, by state.
 */

/*
 * Failover.
 *
 * Replaces *spp with a reference to the new server, which must
 * eventually be freed.
1594 */ 1595 1596 static void 1597 recov_newserver(recov_info_t *recovp, nfs4_server_t **spp, bool_t *recov_fail) 1598 { 1599 mntinfo4_t *mi = recovp->rc_mi; 1600 servinfo4_t *svp = NULL; 1601 nfs4_server_t *osp = *spp; 1602 CLIENT *cl; 1603 enum clnt_stat status; 1604 struct timeval tv; 1605 int error; 1606 int oncethru = 0; 1607 rnode4_t *rp; 1608 int index; 1609 nfs_fh4 fh; 1610 char *snames; 1611 size_t len; 1612 1613 (void) nfs_rw_enter_sig(&mi->mi_recovlock, RW_WRITER, 0); 1614 1615 tv.tv_sec = 2; 1616 tv.tv_usec = 0; 1617 1618 #ifdef lint 1619 /* 1620 * Lint can't follow the logic, so thinks that snames and len 1621 * can be used before being set. They can't, but lint can't 1622 * figure it out. To address the lint warning, initialize 1623 * snames and len for lint. 1624 */ 1625 snames = NULL; 1626 len = 0; 1627 #endif 1628 1629 /* 1630 * Ping the null NFS procedure of every server in 1631 * the list until one responds. We always start 1632 * at the head of the list and always skip the one 1633 * that is current, since it's caused us a problem. 
1634 */ 1635 while (svp == NULL) { 1636 for (svp = mi->mi_servers; svp; svp = svp->sv_next) { 1637 1638 mutex_enter(&mi->mi_lock); 1639 if (FS_OR_ZONE_GONE4(mi->mi_vfsp)) { 1640 mi->mi_flags |= MI4_RECOV_FAIL; 1641 mutex_exit(&mi->mi_lock); 1642 (void) nfs_rw_exit(&mi->mi_recovlock); 1643 *recov_fail = TRUE; 1644 if (oncethru) 1645 kmem_free(snames, len); 1646 return; 1647 } 1648 mutex_exit(&mi->mi_lock); 1649 1650 (void) nfs_rw_enter_sig(&svp->sv_lock, RW_READER, 0); 1651 if (svp->sv_flags & SV4_NOTINUSE) { 1652 nfs_rw_exit(&svp->sv_lock); 1653 continue; 1654 } 1655 nfs_rw_exit(&svp->sv_lock); 1656 1657 if (!oncethru && svp == mi->mi_curr_serv) 1658 continue; 1659 1660 error = clnt_tli_kcreate(svp->sv_knconf, &svp->sv_addr, 1661 NFS_PROGRAM, NFS_V4, 0, 1, CRED(), &cl); 1662 if (error) 1663 continue; 1664 1665 if (!(mi->mi_flags & MI4_INT)) 1666 cl->cl_nosignal = TRUE; 1667 status = CLNT_CALL(cl, RFS_NULL, xdr_void, NULL, 1668 xdr_void, NULL, tv); 1669 if (!(mi->mi_flags & MI4_INT)) 1670 cl->cl_nosignal = FALSE; 1671 AUTH_DESTROY(cl->cl_auth); 1672 CLNT_DESTROY(cl); 1673 if (status == RPC_SUCCESS) { 1674 nfs4_queue_event(RE_FAILOVER, mi, 1675 svp == mi->mi_curr_serv ? 
NULL : 1676 svp->sv_hostname, 0, NULL, NULL, 0, 1677 NULL, 0, TAG_NONE, TAG_NONE, 0, 0); 1678 break; 1679 } 1680 } 1681 1682 if (svp == NULL) { 1683 if (!oncethru) { 1684 snames = nfs4_getsrvnames(mi, &len); 1685 nfs4_queue_fact(RF_SRVS_NOT_RESPOND, mi, 1686 0, 0, 0, FALSE, snames, 0, NULL); 1687 oncethru = 1; 1688 } 1689 delay(hz); 1690 } 1691 } 1692 1693 if (oncethru) { 1694 nfs4_queue_fact(RF_SRVS_OK, mi, 0, 0, 0, FALSE, snames, 1695 0, NULL); 1696 kmem_free(snames, len); 1697 } 1698 1699 #if DEBUG 1700 (void) nfs_rw_enter_sig(&svp->sv_lock, RW_READER, 0); 1701 ASSERT((svp->sv_flags & SV4_NOTINUSE) == 0); 1702 nfs_rw_exit(&svp->sv_lock); 1703 #endif 1704 1705 mutex_enter(&mi->mi_lock); 1706 mi->mi_recovflags &= ~MI4R_NEED_NEW_SERVER; 1707 if (svp != mi->mi_curr_serv) { 1708 servinfo4_t *osvp = mi->mi_curr_serv; 1709 1710 mutex_exit(&mi->mi_lock); 1711 1712 /* 1713 * Update server-dependent fields in the root vnode. 1714 */ 1715 index = rtable4hash(mi->mi_rootfh); 1716 rw_enter(&rtable4[index].r_lock, RW_WRITER); 1717 1718 rp = r4find(&rtable4[index], mi->mi_rootfh, mi->mi_vfsp); 1719 if (rp != NULL) { 1720 NFS4_DEBUG(nfs4_client_failover_debug, (CE_NOTE, 1721 "recov_newserver: remapping %s", rnode4info(rp))); 1722 mutex_enter(&rp->r_statelock); 1723 rp->r_server = svp; 1724 PURGE_ATTRCACHE4_LOCKED(rp); 1725 mutex_exit(&rp->r_statelock); 1726 (void) nfs4_free_data_reclaim(rp); 1727 nfs4_purge_rddir_cache(RTOV4(rp)); 1728 rw_exit(&rtable4[index].r_lock); 1729 NFS4_DEBUG(nfs4_client_failover_debug, (CE_NOTE, 1730 "recov_newserver: done with %s", 1731 rnode4info(rp))); 1732 VN_RELE(RTOV4(rp)); 1733 } else 1734 rw_exit(&rtable4[index].r_lock); 1735 (void) dnlc_purge_vfsp(mi->mi_vfsp, 0); 1736 1737 mutex_enter(&mi->mi_lock); 1738 mi->mi_recovflags |= MI4R_REOPEN_FILES | MI4R_REMAP_FILES; 1739 if (recovp->rc_srv_reboot) 1740 mi->mi_recovflags |= MI4R_SRV_REBOOT; 1741 mi->mi_curr_serv = svp; 1742 mi->mi_failover++; 1743 mi->mi_flags &= ~MI4_BADOWNER_DEBUG; 1744 
mutex_exit(&mi->mi_lock); 1745 1746 (void) nfs_rw_enter_sig(&svp->sv_lock, RW_READER, 0); 1747 fh.nfs_fh4_len = svp->sv_fhandle.fh_len; 1748 fh.nfs_fh4_val = svp->sv_fhandle.fh_buf; 1749 sfh4_update(mi->mi_rootfh, &fh); 1750 fh.nfs_fh4_len = svp->sv_pfhandle.fh_len; 1751 fh.nfs_fh4_val = svp->sv_pfhandle.fh_buf; 1752 sfh4_update(mi->mi_srvparentfh, &fh); 1753 nfs_rw_exit(&svp->sv_lock); 1754 1755 *spp = nfs4_move_mi(mi, osvp, svp); 1756 if (osp != NULL) 1757 nfs4_server_rele(osp); 1758 } else 1759 mutex_exit(&mi->mi_lock); 1760 (void) nfs_rw_exit(&mi->mi_recovlock); 1761 } 1762 1763 /* 1764 * Clientid. 1765 */ 1766 1767 static void 1768 recov_clientid(recov_info_t *recovp, nfs4_server_t *sp) 1769 { 1770 mntinfo4_t *mi = recovp->rc_mi; 1771 int error = 0; 1772 int still_stale; 1773 int need_new_s; 1774 1775 ASSERT(sp != NULL); 1776 1777 /* 1778 * Acquire the recovery lock and then verify that the clientid 1779 * still needs to be recovered. (Note that s_recovlock is supposed 1780 * to be acquired before s_lock.) Since the thread holds the 1781 * recovery lock, no other thread will recover the clientid. 1782 */ 1783 (void) nfs_rw_enter_sig(&sp->s_recovlock, RW_WRITER, 0); 1784 (void) nfs_rw_enter_sig(&mi->mi_recovlock, RW_WRITER, 0); 1785 mutex_enter(&sp->s_lock); 1786 still_stale = ((sp->s_flags & N4S_CLIENTID_SET) == 0); 1787 mutex_exit(&sp->s_lock); 1788 1789 if (still_stale) { 1790 nfs4_error_t n4e; 1791 1792 nfs4_error_zinit(&n4e); 1793 nfs4setclientid(mi, kcred, TRUE, &n4e); 1794 error = n4e.error; 1795 if (error != 0) { 1796 1797 /* 1798 * nfs4setclientid may have set MI4R_NEED_NEW_SERVER, 1799 * if so, just return and let recov_thread drive 1800 * failover. 
1801 */ 1802 mutex_enter(&mi->mi_lock); 1803 need_new_s = mi->mi_recovflags & MI4R_NEED_NEW_SERVER; 1804 mutex_exit(&mi->mi_lock); 1805 1806 if (need_new_s) { 1807 nfs_rw_exit(&mi->mi_recovlock); 1808 nfs_rw_exit(&sp->s_recovlock); 1809 return; 1810 } 1811 1812 nfs4_queue_event(RE_CLIENTID, mi, NULL, n4e.error, NULL, 1813 NULL, n4e.stat, NULL, 0, TAG_NONE, TAG_NONE, 0, 0); 1814 mutex_enter(&mi->mi_lock); 1815 mi->mi_flags |= MI4_RECOV_FAIL; 1816 mi->mi_error = recovp->rc_error; 1817 mutex_exit(&mi->mi_lock); 1818 /* don't destroy the nfs4_server, let umount do it */ 1819 } 1820 } 1821 1822 if (error == 0) { 1823 mutex_enter(&mi->mi_lock); 1824 mi->mi_recovflags &= ~MI4R_NEED_CLIENTID; 1825 /* 1826 * If still_stale isn't true, then another thread already 1827 * recovered the clientid. And that thread that set the 1828 * clientid will have initiated reopening files on all the 1829 * filesystems for the server, so we should not initiate 1830 * reopening for this filesystem here. 1831 */ 1832 if (still_stale) { 1833 mi->mi_recovflags |= MI4R_REOPEN_FILES; 1834 if (recovp->rc_srv_reboot) 1835 mi->mi_recovflags |= MI4R_SRV_REBOOT; 1836 } 1837 mutex_exit(&mi->mi_lock); 1838 } 1839 1840 nfs_rw_exit(&mi->mi_recovlock); 1841 1842 if (error != 0) { 1843 nfs_rw_exit(&sp->s_recovlock); 1844 mutex_enter(&mi->mi_lock); 1845 if ((mi->mi_flags & MI4_RECOV_FAIL) == 0) 1846 delay(SEC_TO_TICK(recov_err_delay)); 1847 mutex_exit(&mi->mi_lock); 1848 } else { 1849 mntinfo4_t **milist; 1850 mntinfo4_t *tmi; 1851 int nummi, i; 1852 1853 /* 1854 * Initiate recovery of open files for other filesystems. 1855 * We create an array of filesystems, rather than just 1856 * walking the filesystem list, to avoid deadlock issues 1857 * with s_lock and mi_recovlock. 
1858 */ 1859 milist = make_milist(sp, &nummi); 1860 for (i = 0; i < nummi; i++) { 1861 tmi = milist[i]; 1862 if (tmi != mi) { 1863 (void) nfs_rw_enter_sig(&tmi->mi_recovlock, 1864 RW_READER, 0); 1865 start_recovery_action(NR_OPENFILES, TRUE, tmi, 1866 NULL, NULL); 1867 nfs_rw_exit(&tmi->mi_recovlock); 1868 } 1869 } 1870 free_milist(milist, nummi); 1871 1872 nfs_rw_exit(&sp->s_recovlock); 1873 } 1874 } 1875 1876 /* 1877 * Return an array of filesystems associated with the given server. The 1878 * caller should call free_milist() to free the references and memory. 1879 */ 1880 1881 static mntinfo4_t ** 1882 make_milist(nfs4_server_t *sp, int *nummip) 1883 { 1884 int nummi, i; 1885 mntinfo4_t **milist; 1886 mntinfo4_t *tmi; 1887 1888 mutex_enter(&sp->s_lock); 1889 nummi = 0; 1890 for (tmi = sp->mntinfo4_list; tmi != NULL; tmi = tmi->mi_clientid_next) 1891 nummi++; 1892 1893 milist = kmem_alloc(nummi * sizeof (mntinfo4_t *), KM_NOSLEEP); 1894 1895 for (i = 0, tmi = sp->mntinfo4_list; tmi != NULL; i++, 1896 tmi = tmi->mi_clientid_next) { 1897 milist[i] = tmi; 1898 VFS_HOLD(tmi->mi_vfsp); 1899 } 1900 mutex_exit(&sp->s_lock); 1901 1902 *nummip = nummi; 1903 return (milist); 1904 } 1905 1906 /* 1907 * Free the filesystem list created by make_milist(). 1908 */ 1909 1910 static void 1911 free_milist(mntinfo4_t **milist, int nummi) 1912 { 1913 mntinfo4_t *tmi; 1914 int i; 1915 1916 for (i = 0; i < nummi; i++) { 1917 tmi = milist[i]; 1918 VFS_RELE(tmi->mi_vfsp); 1919 } 1920 kmem_free(milist, nummi * sizeof (mntinfo4_t *)); 1921 } 1922 1923 /* 1924 * Filehandle 1925 */ 1926 1927 /* 1928 * Lookup the filehandle for the given vnode and update the rnode if it has 1929 * changed. 1930 * 1931 * Errors: 1932 * - if the filehandle could not be updated because of an error that 1933 * requires further recovery, initiate that recovery and return. 1934 * - if the filehandle could not be updated because of a signal, pretend we 1935 * succeeded and let someone else deal with it. 
 * - if the filehandle could not be updated and the filesystem has been
 *   forcibly unmounted, pretend we succeeded, and let the caller deal with
 *   the forced unmount (to retry or not to retry, that is the question).
 * - if the filehandle could not be updated because of some other error,
 *   mark the rnode bad and return.
 *
 * Only one thread at a time remaps a given rnode: the R4RECEXPFH flag in
 * r_flags marks an update in progress, and other callers wait on r_cv
 * until it is cleared and then return without doing the remap themselves.
 * Nothing is done for an rnode that is already marked R4RECOVERR.
 */
static void
recov_filehandle(nfs4_recov_t action, mntinfo4_t *mi, vnode_t *vp)
{
	rnode4_t *rp = VTOR4(vp);
	nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };
	bool_t needrecov;

	mutex_enter(&rp->r_statelock);

	/* Recovery already failed for this file; nothing to do. */
	if (rp->r_flags & R4RECOVERR) {
		mutex_exit(&rp->r_statelock);
		return;
	}

	/*
	 * If someone else is updating the filehandle, wait for them to
	 * finish and then let our caller retry.
	 */
	if (rp->r_flags & R4RECEXPFH) {
		while (rp->r_flags & R4RECEXPFH) {
			cv_wait(&rp->r_cv, &rp->r_statelock);
		}
		mutex_exit(&rp->r_statelock);
		return;
	}
	/* Claim the update; cleared (with a broadcast) at the end. */
	rp->r_flags |= R4RECEXPFH;
	mutex_exit(&rp->r_statelock);

	if (action == NR_BADHANDLE) {
		/* shouldn't happen */
		nfs4_queue_event(RE_BADHANDLE, mi, NULL, 0,
		    vp, NULL, 0, NULL, 0, TAG_NONE, TAG_NONE, 0, 0);
	}

	nfs4_remap_file(mi, vp, 0, &e);
	needrecov = nfs4_needs_recovery(&e, FALSE, mi->mi_vfsp);

	/*
	 * If we get BADHANDLE or FHEXPIRED in their handler, something is
	 * broken.  Don't try to recover, just mark the file dead.
	 */
	if (needrecov && e.error == 0 &&
	    (e.stat == NFS4ERR_BADHANDLE || e.stat == NFS4ERR_FHEXPIRED))
		needrecov = FALSE;
	if (needrecov) {
		(void) nfs4_start_recovery(&e, mi, vp,
		    NULL, NULL, NULL, OP_LOOKUP, NULL);
	} else if (e.error != EINTR &&
	    !NFS4_FRC_UNMT_ERR(e.error, mi->mi_vfsp) &&
	    (e.error != 0 || e.stat != NFS4_OK)) {
		/*
		 * A real failure that is neither a signal nor a forced
		 * unmount: give up on this file.
		 */
		nfs4_recov_fh_fail(vp, e.error, e.stat);
		/*
		 * Don't set r_error to ESTALE.  Higher-level code (e.g.,
		 * cstatat_getvp()) retries on ESTALE, which would cause
		 * an infinite loop.
		 */
	}

	/* Release the claim and wake anyone waiting for us to finish. */
	mutex_enter(&rp->r_statelock);
	rp->r_flags &= ~R4RECEXPFH;
	cv_broadcast(&rp->r_cv);
	mutex_exit(&rp->r_statelock);
}

/*
 * Stale Filehandle
 */

/*
 * A stale filehandle can happen when an individual file has
 * been removed, or when an entire filesystem has been taken
 * offline.  To distinguish these cases, we do this:
 * - if a GETATTR with the current filehandle is okay, we do
 *   nothing (this can happen with two-filehandle ops)
 * - if the GETATTR fails, but a GETATTR of the root filehandle
 *   succeeds, mark the rnode with R4STALE, which will stop use
 * - if the GETATTR fails, and a GETATTR of the root filehandle
 *   also fails, we consider the problem filesystem-wide, so:
 *   - if we can failover, we should
 *   - if we can't failover, we should mark both the original
 *     vnode and the root bad
 *
 * Nothing is done if the rnode is already marked R4RECOVERR or R4STALE.
 * Any reference obtained on the root vnode is released before returning.
 */
static void
recov_stale(mntinfo4_t *mi, vnode_t *vp)
{
	rnode4_t *rp = VTOR4(vp);
	vnode_t *rootvp = NULL;
	nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };
	nfs4_ga_res_t gar;
	char *fail_msg = "failed to recover from NFS4ERR_STALE";
	bool_t needrecov;

	mutex_enter(&rp->r_statelock);

	if (rp->r_flags & R4RECOVERR) {
		mutex_exit(&rp->r_statelock);
		NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
		    "recov_stale: already marked dead, rp %s",
		    rnode4info(rp)));
		return;
	}

	if (rp->r_flags & R4STALE) {
		mutex_exit(&rp->r_statelock);
		NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
		    "recov_stale: already marked stale, rp %s",
		    rnode4info(rp)));
		return;
	}

	mutex_exit(&rp->r_statelock);

	/* Try a GETATTR on this vnode */
	nfs4_getattr_otw_norecovery(vp, &gar, &e, CRED(), 0);

	/*
	 * Handle non-STALE recoverable errors
	 */
	needrecov = nfs4_needs_recovery(&e, FALSE, vp->v_vfsp);
	if (needrecov && (e.error != 0 || e.stat != NFS4ERR_STALE)) {
		(void) nfs4_start_recovery(&e, mi, vp,
		    NULL, NULL, NULL, OP_GETATTR, NULL);
		NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
		    "recov_stale: error=%d, stat=%d seen on rp %s",
		    e.error, e.stat, rnode4info(rp)));
		goto out;
	}

	/* Are things OK for this vnode? */
	if (!e.error && e.stat == NFS4_OK) {
		NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
		    "recov_stale: file appears fine, rp %s",
		    rnode4info(rp)));
		goto out;
	}

	/* Did we get an unrelated non-recoverable error? */
	if (e.error || e.stat != NFS4ERR_STALE) {
		nfs4_fail_recov(vp, fail_msg, e.error, e.stat);
		NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
		    "recov_stale: unrelated fatal error, rp %s",
		    rnode4info(rp)));
		goto out;
	}

	/* From here on, the GETATTR on vp is known to have returned STALE. */

	/*
	 * If we don't appear to be dealing with the root node, find it.
	 */
	if ((vp->v_flag & VROOT) == 0) {
		nfs4_error_zinit(&e);
		e.error = VFS_ROOT(vp->v_vfsp, &rootvp);
		if (e.error) {
			nfs4_fail_recov(vp, fail_msg, 0, NFS4ERR_STALE);
			NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
			    "recov_stale: can't find root node for rp %s",
			    rnode4info(rp)));
			goto out;
		}
	}

	/* Try a GETATTR on the root vnode */
	if (rootvp != NULL) {
		nfs4_error_zinit(&e);
		nfs4_getattr_otw_norecovery(rootvp, &gar, &e, CRED(), 0);

		/* Try recovery? */
		if (e.error != 0 || e.stat != NFS4ERR_STALE) {
			needrecov = nfs4_needs_recovery(&e, FALSE, vp->v_vfsp);
			if (needrecov) {
				(void) nfs4_start_recovery(&e,
				    mi, rootvp, NULL, NULL, NULL,
				    OP_GETATTR, NULL);
				NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
				    "recov_stale: error=%d, stat=%d seen "
				    "on rp %s", e.error, e.stat,
				    rnode4info(rp)));
			}
		}

		/*
		 * Check to see if a failover attempt is warranted
		 * NB: nfs4_try_failover doesn't check for STALE
		 * because recov_stale gets a shot first.  Now that
		 * recov_stale has failed, go ahead and try failover.
		 *
		 * If the getattr on the root filehandle was successful,
		 * then mark recovery as failed for 'vp' and exit.
		 */
		if (nfs4_try_failover(&e) == 0 && e.stat != NFS4ERR_STALE) {
			/*
			 * pass the original error to fail_recov, not
			 * the one from trying the root vnode.
			 */
			nfs4_fail_recov(vp, fail_msg, 0, NFS4ERR_STALE);
			NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
			    "recov_stale: root node OK, marking "
			    "dead rp %s", rnode4info(rp)));
			goto out;
		}
	}

	/*
	 * Here, we know that both the original file and the
	 * root filehandle (which may be the same) are stale.
	 * We want to fail over if we can, and if we can't, we
	 * want to mark everything in sight bad.
	 */
	if (FAILOVER_MOUNT4(mi)) {
		/* Only the flag is set here; no failover is performed. */
		mutex_enter(&mi->mi_lock);
		mi->mi_recovflags |= MI4R_NEED_NEW_SERVER;
		NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
		    "recov_stale: failing over due to rp %s",
		    rnode4info(rp)));
		mutex_exit(&mi->mi_lock);
	} else {
		rnode4_t *rootrp;
		servinfo4_t *svp;

		/*
		 * Can't fail over, so mark things dead.
		 *
		 * If rootvp is set, we know we have a distinct
		 * non-root vnode which can be marked dead in
		 * the usual way.
		 *
		 * Then we want to mark the root vnode dead.
		 * Note that if rootvp wasn't set, our vp is
		 * actually the root vnode.
		 */
		if (rootvp != NULL) {
			NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
			    "recov_stale: can't fail over, marking dead rp %s",
			    rnode4info(rp)));
			nfs4_fail_recov(vp, fail_msg, 0, NFS4ERR_STALE);
		} else {
			/*
			 * Take our own hold so the common VN_RELE at
			 * 'out' is balanced in this case too.
			 */
			rootvp = vp;
			VN_HOLD(rootvp);
		}

		/*
		 * Mark root dead, but quietly - since
		 * the root rnode is frequently recreated,
		 * we can encounter this at every access.
		 * Also mark recovery as failed on this VFS.
		 */
		rootrp = VTOR4(rootvp);
		NFS4_DEBUG(nfs4_client_recov_debug, (CE_CONT,
		    "recov_stale: marking dead root rp %s",
		    rnode4info(rootrp)));
		mutex_enter(&rootrp->r_statelock);
		rootrp->r_flags |= (R4RECOVERR | R4STALE);
		rootrp->r_error = ESTALE;
		mutex_exit(&rootrp->r_statelock);
		mutex_enter(&mi->mi_lock);
		mi->mi_error = ESTALE;
		mutex_exit(&mi->mi_lock);

		/* Flag the current server's root as stale. */
		svp = mi->mi_curr_serv;
		(void) nfs_rw_enter_sig(&svp->sv_lock, RW_WRITER, 0);
		svp->sv_flags |= SV4_ROOT_STALE;
		nfs_rw_exit(&svp->sv_lock);
	}

out:
	if (rootvp)
		VN_RELE(rootvp);
}

/*
 * Locks.
 */

/*
 * Reclaim all the active (acquired) locks for the given file.
 * If a process lost a lock, the process is sent a SIGLOST.  This is not
 * considered an error.
 *
 * Return values:
 * Errors and status are returned via the nfs4_error_t parameter
 * If an error indicates that recovery is needed, the caller is responsible
 * for dealing with it.
2223 */ 2224 2225 static void 2226 relock_file(vnode_t *vp, mntinfo4_t *mi, nfs4_error_t *ep, 2227 fattr4_change pre_change) 2228 { 2229 locklist_t *locks, *llp; 2230 rnode4_t *rp; 2231 2232 ASSERT(ep != NULL); 2233 nfs4_error_zinit(ep); 2234 2235 if (VTOMI4(vp)->mi_flags & MI4_LLOCK) 2236 return; 2237 2238 nfs4_flush_lock_owners(VTOR4(vp)); 2239 2240 /* 2241 * If we get an error that requires recovery actions, just bail out 2242 * and let the top-level recovery code handle it. 2243 * 2244 * If we get some other error, kill the process that owned the lock 2245 * and mark its remaining locks (if any) as belonging to NOPID, so 2246 * that we don't make any more reclaim requests for that process. 2247 */ 2248 2249 rp = VTOR4(vp); 2250 locks = flk_active_locks_for_vp(vp); 2251 for (llp = locks; llp != NULL; llp = llp->ll_next) { 2252 int did_reclaim = 1; 2253 2254 ASSERT(llp->ll_vp == vp); 2255 if (llp->ll_flock.l_pid == NOPID) 2256 continue; 2257 reclaim_one_lock(vp, &llp->ll_flock, ep, &did_reclaim); 2258 /* 2259 * If we need to restart recovery, stop processing the 2260 * list. Some errors would be recoverable under other 2261 * circumstances, but if they happen here we just give up 2262 * on the lock. 2263 */ 2264 if (nfs4_needs_recovery(ep, TRUE, vp->v_vfsp)) { 2265 if (ep->error != 0) 2266 break; 2267 if (!nfs4_recov_marks_dead(ep->stat)) 2268 break; 2269 } 2270 /* 2271 * In case the server isn't offering us a grace period, or 2272 * if we missed it, we might have opened & locked from scratch, 2273 * rather than reopened/reclaimed. 2274 * We need to ensure that the object hadn't been otherwise 2275 * changed during this time, by comparing the changeinfo. 2276 * We get passed the changeinfo from before the reopen by our 2277 * caller, in pre_change. 2278 * The changeinfo from after the reopen is in rp->r_change, 2279 * courtesy of the GETATTR in the reopen. 2280 * If they're different, then the file has changed, and we 2281 * have to SIGLOST the app. 
2282 */ 2283 if (ep->error == 0 && ep->stat == NFS4_OK && !did_reclaim) { 2284 mutex_enter(&rp->r_statelock); 2285 if (pre_change != rp->r_change) 2286 ep->stat = NFS4ERR_NO_GRACE; 2287 mutex_exit(&rp->r_statelock); 2288 } 2289 if (ep->error != 0 || ep->stat != NFS4_OK) { 2290 if (ep->error != 0) 2291 nfs4_queue_event(RE_FAIL_RELOCK, mi, 2292 NULL, ep->error, vp, NULL, 0, NULL, 2293 llp->ll_flock.l_pid, TAG_NONE, TAG_NONE, 2294 0, 0); 2295 else 2296 nfs4_queue_event(RE_FAIL_RELOCK, mi, 2297 NULL, 0, vp, NULL, ep->stat, NULL, 2298 llp->ll_flock.l_pid, TAG_NONE, TAG_NONE, 2299 0, 0); 2300 nfs4_send_siglost(llp->ll_flock.l_pid, mi, vp, TRUE, 2301 ep->error, ep->stat); 2302 relock_skip_pid(llp, llp->ll_flock.l_pid); 2303 2304 /* Reinitialize the nfs4_error and continue */ 2305 nfs4_error_zinit(ep); 2306 } 2307 } 2308 2309 if (locks != NULL) 2310 flk_free_locklist(locks); 2311 } 2312 2313 /* 2314 * Reclaim the given lock. 2315 * If the lock can't be reclaimed, the process is sent SIGLOST, but this is 2316 * not considered an error. 2317 * 2318 * Errors are returned via the nfs4_error_t parameter. 2319 */ 2320 static void 2321 reclaim_one_lock(vnode_t *vp, flock64_t *flk, nfs4_error_t *ep, 2322 int *did_reclaimp) 2323 { 2324 cred_t *cr; 2325 rnode4_t *rp = VTOR4(vp); 2326 2327 cr = pid_to_cr(flk->l_pid); 2328 if (cr == NULL) { 2329 nfs4_error_zinit(ep); 2330 ep->error = ESRCH; 2331 return; 2332 } 2333 2334 do { 2335 mutex_enter(&rp->r_statelock); 2336 if (rp->r_flags & R4RECOVERR) { 2337 /* 2338 * This shouldn't affect other reclaims, so don't 2339 * return an error. 
2340 */ 2341 mutex_exit(&rp->r_statelock); 2342 break; 2343 } 2344 mutex_exit(&rp->r_statelock); 2345 2346 nfs4frlock(NFS4_LCK_CTYPE_RECLAIM, vp, F_SETLK, flk, 2347 FREAD|FWRITE, 0, cr, ep, NULL, did_reclaimp); 2348 if (ep->error == 0 && ep->stat == NFS4ERR_FHEXPIRED) 2349 start_recovery_action(NR_FHEXPIRED, TRUE, VTOMI4(vp), 2350 vp, NULL); 2351 } while (ep->error == 0 && ep->stat == NFS4ERR_FHEXPIRED); 2352 2353 crfree(cr); 2354 } 2355 2356 /* 2357 * Open files. 2358 */ 2359 2360 /* 2361 * Verifies if the nfsstat4 is a valid error for marking this vnode dead. 2362 * Returns 1 if the error is valid; 0 otherwise. 2363 */ 2364 static int 2365 nfs4_valid_recov_err_for_vp(vnode_t *vp, nfsstat4 stat) 2366 { 2367 /* 2368 * We should not be marking non-regular files as dead, 2369 * except in very rare cases (eg: BADHANDLE or NFS4ERR_BADNAME). 2370 */ 2371 if (vp->v_type != VREG && stat != NFS4ERR_BADHANDLE && 2372 stat != NFS4ERR_BADNAME) 2373 return (0); 2374 2375 return (1); 2376 } 2377 2378 /* 2379 * Failed attempting to recover a filehandle. If 'stat' is valid for 'vp', 2380 * then mark the object dead. Since we've had to do a lookup for 2381 * filehandle recovery, we will mark the object dead if we got NOENT. 2382 */ 2383 static void 2384 nfs4_recov_fh_fail(vnode_t *vp, int error, nfsstat4 stat) 2385 { 2386 ASSERT(vp != NULL); 2387 2388 if ((error == 0) && (stat != NFS4ERR_NOENT) && 2389 (!nfs4_valid_recov_err_for_vp(vp, stat))) 2390 return; 2391 2392 nfs4_fail_recov(vp, "can't recover filehandle", error, stat); 2393 } 2394 2395 /* 2396 * Recovery from a "shouldn't happen" error. In the long term, we'd like 2397 * to mark only the data structure(s) that provided the bad value as being 2398 * bad. But for now we'll just mark the entire file. 
2399 */ 2400 2401 static void 2402 recov_badstate(recov_info_t *recovp, vnode_t *vp, nfsstat4 stat) 2403 { 2404 ASSERT(vp != NULL); 2405 recov_throttle(recovp, vp); 2406 2407 if (!nfs4_valid_recov_err_for_vp(vp, stat)) 2408 return; 2409 2410 nfs4_fail_recov(vp, "", 0, stat); 2411 } 2412 2413 /* 2414 * Free up the information saved for a lost state request. 2415 */ 2416 static void 2417 nfs4_free_lost_rqst(nfs4_lost_rqst_t *lrp, nfs4_server_t *sp) 2418 { 2419 component4 *filep; 2420 nfs4_open_stream_t *osp; 2421 int have_sync_lock; 2422 2423 NFS4_DEBUG(nfs4_lost_rqst_debug, 2424 (CE_NOTE, "nfs4_free_lost_rqst:")); 2425 2426 switch (lrp->lr_op) { 2427 case OP_OPEN: 2428 filep = &lrp->lr_ofile; 2429 if (filep->utf8string_val) { 2430 kmem_free(filep->utf8string_val, filep->utf8string_len); 2431 filep->utf8string_val = NULL; 2432 } 2433 break; 2434 case OP_DELEGRETURN: 2435 nfs4delegreturn_cleanup(VTOR4(lrp->lr_vp), sp); 2436 break; 2437 case OP_CLOSE: 2438 osp = lrp->lr_osp; 2439 ASSERT(osp != NULL); 2440 mutex_enter(&osp->os_sync_lock); 2441 have_sync_lock = 1; 2442 if (osp->os_pending_close) { 2443 /* clean up the open file state. 
*/ 2444 osp->os_pending_close = 0; 2445 nfs4close_notw(lrp->lr_vp, osp, &have_sync_lock); 2446 } 2447 if (have_sync_lock) 2448 mutex_exit(&osp->os_sync_lock); 2449 break; 2450 } 2451 2452 lrp->lr_op = 0; 2453 if (lrp->lr_oop != NULL) { 2454 open_owner_rele(lrp->lr_oop); 2455 lrp->lr_oop = NULL; 2456 } 2457 if (lrp->lr_osp != NULL) { 2458 open_stream_rele(lrp->lr_osp, VTOR4(lrp->lr_vp)); 2459 lrp->lr_osp = NULL; 2460 } 2461 if (lrp->lr_lop != NULL) { 2462 lock_owner_rele(lrp->lr_lop); 2463 lrp->lr_lop = NULL; 2464 } 2465 if (lrp->lr_flk != NULL) { 2466 kmem_free(lrp->lr_flk, sizeof (flock64_t)); 2467 lrp->lr_flk = NULL; 2468 } 2469 if (lrp->lr_vp != NULL) { 2470 VN_RELE(lrp->lr_vp); 2471 lrp->lr_vp = NULL; 2472 } 2473 if (lrp->lr_dvp != NULL) { 2474 VN_RELE(lrp->lr_dvp); 2475 lrp->lr_dvp = NULL; 2476 } 2477 if (lrp->lr_cr != NULL) { 2478 crfree(lrp->lr_cr); 2479 lrp->lr_cr = NULL; 2480 } 2481 2482 kmem_free(lrp, sizeof (nfs4_lost_rqst_t)); 2483 } 2484 2485 /* 2486 * Remove any lost state requests and free them. 2487 */ 2488 static void 2489 nfs4_remove_lost_rqsts(mntinfo4_t *mi, nfs4_server_t *sp) 2490 { 2491 nfs4_lost_rqst_t *lrp; 2492 2493 mutex_enter(&mi->mi_lock); 2494 while ((lrp = list_head(&mi->mi_lost_state)) != NULL) { 2495 list_remove(&mi->mi_lost_state, lrp); 2496 mutex_exit(&mi->mi_lock); 2497 nfs4_free_lost_rqst(lrp, sp); 2498 mutex_enter(&mi->mi_lock); 2499 } 2500 mutex_exit(&mi->mi_lock); 2501 } 2502 2503 /* 2504 * Reopen all the files for the given filesystem and reclaim any locks. 
2505 */ 2506 2507 static void 2508 recov_openfiles(recov_info_t *recovp, nfs4_server_t *sp) 2509 { 2510 mntinfo4_t *mi = recovp->rc_mi; 2511 nfs4_opinst_t *reopenlist = NULL, *rep; 2512 nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS }; 2513 open_claim_type4 claim; 2514 int remap; 2515 char *fail_msg = "No such file or directory on replica"; 2516 rnode4_t *rp; 2517 fattr4_change pre_change; 2518 2519 ASSERT(sp != NULL); 2520 2521 /* 2522 * This check is to allow a 10ms pause before we reopen files 2523 * it should allow the server time to have received the CB_NULL 2524 * reply and update its internal structures such that (if 2525 * applicable) we are granted a delegation on reopened files. 2526 */ 2527 mutex_enter(&sp->s_lock); 2528 if ((sp->s_flags & (N4S_CB_PINGED | N4S_CB_WAITER)) == 0) { 2529 sp->s_flags |= N4S_CB_WAITER; 2530 (void) cv_timedwait(&sp->wait_cb_null, &sp->s_lock, 2531 (lbolt+drv_usectohz(N4S_CB_PAUSE_TIME))); 2532 } 2533 mutex_exit(&sp->s_lock); 2534 2535 (void) nfs_rw_enter_sig(&sp->s_recovlock, RW_READER, 0); 2536 (void) nfs_rw_enter_sig(&mi->mi_recovlock, RW_WRITER, 0); 2537 2538 if (NFS4_VOLATILE_FH(mi)) { 2539 nfs4_remap_root(mi, &e, 0); 2540 if (nfs4_needs_recovery(&e, FALSE, mi->mi_vfsp)) { 2541 (void) nfs4_start_recovery(&e, mi, NULL, 2542 NULL, NULL, NULL, OP_LOOKUP, NULL); 2543 } 2544 } 2545 2546 mutex_enter(&mi->mi_lock); 2547 if (recovp->rc_srv_reboot || (mi->mi_recovflags & MI4R_SRV_REBOOT)) 2548 claim = CLAIM_PREVIOUS; 2549 else 2550 claim = CLAIM_NULL; 2551 mutex_exit(&mi->mi_lock); 2552 2553 if (e.error == 0 && e.stat == NFS4_OK) { 2554 /* 2555 * Get a snapshot of open files in the filesystem. Note 2556 * that new opens will stall until the server's grace 2557 * period is done. 
2558 */ 2559 reopenlist = r4mkopenlist(mi); 2560 2561 mutex_enter(&mi->mi_lock); 2562 remap = mi->mi_recovflags & MI4R_REMAP_FILES; 2563 mutex_exit(&mi->mi_lock); 2564 /* 2565 * Since we are re-establishing state on the 2566 * server, its ok to blow away the saved lost 2567 * requests since we don't need to reissue it. 2568 */ 2569 nfs4_remove_lost_rqsts(mi, sp); 2570 2571 for (rep = reopenlist; rep; rep = rep->re_next) { 2572 2573 if (remap) { 2574 nfs4_remap_file(mi, rep->re_vp, 2575 NFS4_REMAP_CKATTRS, &e); 2576 } 2577 if (e.error == ENOENT || e.stat == NFS4ERR_NOENT) { 2578 /* 2579 * The current server does not have the file 2580 * that is to be remapped. This is most 2581 * likely due to an improperly maintained 2582 * replica. The files that are missing from 2583 * the server will be marked dead and logged 2584 * in order to make sys admins aware of the 2585 * problem. 2586 */ 2587 nfs4_fail_recov(rep->re_vp, 2588 fail_msg, e.error, e.stat); 2589 /* 2590 * We've already handled the error so clear it. 
2591 */ 2592 nfs4_error_zinit(&e); 2593 continue; 2594 } else if (e.error == 0 && e.stat == NFS4_OK) { 2595 int j; 2596 2597 rp = VTOR4(rep->re_vp); 2598 mutex_enter(&rp->r_statelock); 2599 pre_change = rp->r_change; 2600 mutex_exit(&rp->r_statelock); 2601 2602 for (j = 0; j < rep->re_numosp; j++) { 2603 nfs4_reopen(rep->re_vp, rep->re_osp[j], 2604 &e, claim, FALSE, TRUE); 2605 if (e.error != 0 || e.stat != NFS4_OK) 2606 break; 2607 } 2608 if (nfs4_needs_recovery(&e, TRUE, 2609 mi->mi_vfsp)) { 2610 (void) nfs4_start_recovery(&e, mi, 2611 rep->re_vp, NULL, NULL, NULL, 2612 OP_OPEN, NULL); 2613 break; 2614 } 2615 } 2616 #ifdef DEBUG 2617 if (nfs4_recovdelay > 0) 2618 delay(MSEC_TO_TICK(nfs4_recovdelay * 1000)); 2619 #endif 2620 if (e.error == 0 && e.stat == NFS4_OK) 2621 relock_file(rep->re_vp, mi, &e, pre_change); 2622 2623 if (nfs4_needs_recovery(&e, TRUE, mi->mi_vfsp)) 2624 (void) nfs4_start_recovery(&e, mi, 2625 rep->re_vp, NULL, NULL, NULL, OP_LOCK, 2626 NULL); 2627 if (e.error != 0 || e.stat != NFS4_OK) 2628 break; 2629 } 2630 2631 /* 2632 * Check to see if we need to remap files passed in 2633 * via the recovery arguments; this will have been 2634 * done for open files. A failure here is not fatal. 2635 */ 2636 if (remap) { 2637 nfs4_error_t ignore; 2638 nfs4_check_remap(mi, recovp->rc_vp1, NFS4_REMAP_CKATTRS, 2639 &ignore); 2640 nfs4_check_remap(mi, recovp->rc_vp2, NFS4_REMAP_CKATTRS, 2641 &ignore); 2642 } 2643 } 2644 2645 if (e.error == 0 && e.stat == NFS4_OK) { 2646 mutex_enter(&mi->mi_lock); 2647 mi->mi_recovflags &= ~(MI4R_REOPEN_FILES | MI4R_REMAP_FILES); 2648 mutex_exit(&mi->mi_lock); 2649 } 2650 2651 nfs_rw_exit(&mi->mi_recovlock); 2652 nfs_rw_exit(&sp->s_recovlock); 2653 2654 if (reopenlist != NULL) 2655 r4releopenlist(reopenlist); 2656 } 2657 2658 /* 2659 * Resend the queued state recovery requests in "rqsts". 
2660 */ 2661 2662 static void 2663 nfs4_resend_lost_rqsts(recov_info_t *recovp, nfs4_server_t *sp) 2664 { 2665 nfs4_lost_rqst_t *lrp, *tlrp; 2666 mntinfo4_t *mi = recovp->rc_mi; 2667 nfs4_error_t n4e; 2668 #ifdef NOTYET 2669 uint32_t deny_bits = 0; 2670 #endif 2671 2672 NFS4_DEBUG(nfs4_lost_rqst_debug, (CE_NOTE, "nfs4_resend_lost_rqsts")); 2673 2674 ASSERT(mi != NULL); 2675 ASSERT(nfs_rw_lock_held(&mi->mi_recovlock, RW_WRITER)); 2676 2677 mutex_enter(&mi->mi_lock); 2678 lrp = list_head(&mi->mi_lost_state); 2679 mutex_exit(&mi->mi_lock); 2680 while (lrp != NULL) { 2681 nfs4_error_zinit(&n4e); 2682 resend_one_op(lrp, &n4e, mi, sp); 2683 NFS4_DEBUG(nfs4_lost_rqst_debug, (CE_NOTE, 2684 "nfs4_resend_lost_rqsts: resend request: for vp %p got " 2685 "error %d stat %d", (void *)lrp->lr_vp, n4e.error, 2686 n4e.stat)); 2687 2688 /* 2689 * If we get a recovery error that we can actually 2690 * recover from (such as ETIMEDOUT, FHEXPIRED), we 2691 * return and let the recovery thread redrive the call. 2692 * Don't requeue unless the zone is still healthy. 2693 */ 2694 if (zone_status_get(curproc->p_zone) < ZONE_IS_SHUTTING_DOWN && 2695 nfs4_needs_recovery(&n4e, TRUE, mi->mi_vfsp) && 2696 (nfs4_try_failover(&n4e) || 2697 NFS4_FRC_UNMT_ERR(n4e.error, mi->mi_vfsp) || 2698 (n4e.error == 0 && n4e.stat != NFS4ERR_BADHANDLE && 2699 !nfs4_recov_marks_dead(n4e.stat)))) { 2700 /* 2701 * For these three errors, we want to delay a bit 2702 * instead of pounding the server into submission. 2703 * We have to do this manually; the normal 2704 * processing for these errors only works for 2705 * non-recovery requests. 
2706 */ 2707 if ((n4e.error == 0 && n4e.stat == NFS4ERR_DELAY) || 2708 (n4e.error == 0 && n4e.stat == NFS4ERR_GRACE) || 2709 (n4e.error == 0 && n4e.stat == NFS4ERR_RESOURCE) || 2710 NFS4_FRC_UNMT_ERR(n4e.error, mi->mi_vfsp)) { 2711 delay(SEC_TO_TICK(nfs4err_delay_time)); 2712 } else { 2713 (void) nfs4_start_recovery(&n4e, 2714 mi, lrp->lr_dvp, lrp->lr_vp, NULL, NULL, 2715 lrp->lr_op, NULL); 2716 } 2717 return; 2718 } 2719 2720 mutex_enter(&mi->mi_lock); 2721 list_remove(&mi->mi_lost_state, lrp); 2722 tlrp = lrp; 2723 lrp = list_head(&mi->mi_lost_state); 2724 mutex_exit(&mi->mi_lock); 2725 nfs4_free_lost_rqst(tlrp, sp); 2726 } 2727 } 2728 2729 /* 2730 * Resend the given op, and issue any necessary undo call. 2731 * errors are returned via the nfs4_error_t parameter. 2732 */ 2733 2734 static void 2735 resend_one_op(nfs4_lost_rqst_t *lrp, nfs4_error_t *ep, 2736 mntinfo4_t *mi, nfs4_server_t *sp) 2737 { 2738 vnode_t *vp; 2739 nfs4_open_stream_t *osp; 2740 cred_t *cr; 2741 uint32_t acc_bits; 2742 2743 vp = lrp->lr_vp; 2744 NFS4_DEBUG(nfs4_lost_rqst_debug, (CE_NOTE, "resend_one_op: " 2745 "have a lost open/close request for vp %p", (void *)vp)); 2746 2747 switch (lrp->lr_op) { 2748 case OP_OPEN: 2749 nfs4_resend_open_otw(&vp, lrp, ep); 2750 break; 2751 case OP_OPEN_DOWNGRADE: 2752 ASSERT(lrp->lr_oop != NULL); 2753 ep->error = nfs4_start_open_seqid_sync(lrp->lr_oop, mi); 2754 ASSERT(!ep->error); /* recov thread always succeeds */ 2755 ASSERT(lrp->lr_osp != NULL); 2756 mutex_enter(&lrp->lr_osp->os_sync_lock); 2757 nfs4_open_downgrade(lrp->lr_dg_acc, lrp->lr_dg_deny, 2758 lrp->lr_oop, lrp->lr_osp, vp, lrp->lr_cr, lrp, 2759 ep, NULL, NULL); 2760 mutex_exit(&lrp->lr_osp->os_sync_lock); 2761 nfs4_end_open_seqid_sync(lrp->lr_oop); 2762 break; 2763 case OP_CLOSE: 2764 osp = lrp->lr_osp; 2765 cr = lrp->lr_cr; 2766 acc_bits = 0; 2767 mutex_enter(&osp->os_sync_lock); 2768 if (osp->os_share_acc_read) 2769 acc_bits |= OPEN4_SHARE_ACCESS_READ; 2770 if (osp->os_share_acc_write) 2771 
acc_bits |= OPEN4_SHARE_ACCESS_WRITE; 2772 mutex_exit(&osp->os_sync_lock); 2773 nfs4close_one(vp, osp, cr, acc_bits, lrp, ep, 2774 CLOSE_RESEND, 0, 0, 0); 2775 break; 2776 case OP_LOCK: 2777 case OP_LOCKU: 2778 resend_lock(lrp, ep); 2779 goto done; 2780 case OP_DELEGRETURN: 2781 nfs4_resend_delegreturn(lrp, ep, sp); 2782 goto done; 2783 default: 2784 #ifdef DEBUG 2785 cmn_err(CE_PANIC, "resend_one_op: unexpected op: %d", 2786 lrp->lr_op); 2787 #endif 2788 nfs4_queue_event(RE_LOST_STATE_BAD_OP, mi, NULL, 2789 lrp->lr_op, lrp->lr_vp, lrp->lr_dvp, NFS4_OK, NULL, 0, 2790 TAG_NONE, TAG_NONE, 0, 0); 2791 nfs4_error_init(ep, EINVAL); 2792 return; 2793 } 2794 2795 /* 2796 * No need to retry nor send an "undo" CLOSE in the 2797 * event the server rebooted. 2798 */ 2799 if (ep->error == 0 && (ep->stat == NFS4ERR_STALE_CLIENTID || 2800 ep->stat == NFS4ERR_STALE_STATEID || ep->stat == NFS4ERR_EXPIRED)) 2801 goto done; 2802 2803 /* 2804 * If we resent a CLOSE or OPEN_DOWNGRADE, there's nothing 2805 * to undo. Undoing locking operations was handled by 2806 * resend_lock(). 2807 */ 2808 if (lrp->lr_op == OP_OPEN_DOWNGRADE || lrp->lr_op == OP_CLOSE) 2809 goto done; 2810 2811 /* 2812 * If we get any other error for OPEN, then don't attempt 2813 * to undo the resend of the open (since it was never 2814 * successful!). 2815 */ 2816 ASSERT(lrp->lr_op == OP_OPEN); 2817 if (ep->error || ep->stat != NFS4_OK) 2818 goto done; 2819 2820 /* 2821 * Now let's undo our OPEN. 2822 */ 2823 nfs4_error_zinit(ep); 2824 close_after_open_resend(vp, lrp->lr_cr, lrp->lr_oacc, ep); 2825 NFS4_DEBUG(nfs4_lost_rqst_debug, (CE_NOTE, "resend_one_op: " 2826 "nfs4close_one: for vp %p got error %d stat %d", 2827 (void *)vp, ep->error, ep->stat)); 2828 2829 done: 2830 if (vp != lrp->lr_vp) 2831 VN_RELE(vp); 2832 } 2833 2834 /* 2835 * Close a file that was opened via a resent OPEN. 2836 * Most errors are passed back to the caller (via the return value and 2837 * *statp), except for FHEXPIRED, which is retried. 
2838 * 2839 * It might be conceptually cleaner to push the CLOSE request onto the 2840 * front of the resend queue, rather than sending it here. That would 2841 * match the way we undo lost lock requests. On the other 2842 * hand, we've already got something that works, and there's no reason to 2843 * change it at this time. 2844 */ 2845 2846 static void 2847 close_after_open_resend(vnode_t *vp, cred_t *cr, uint32_t acc_bits, 2848 nfs4_error_t *ep) 2849 { 2850 2851 for (;;) { 2852 nfs4close_one(vp, NULL, cr, acc_bits, NULL, ep, 2853 CLOSE_AFTER_RESEND, 0, 0, 0); 2854 if (ep->error == 0 && ep->stat == NFS4_OK) 2855 break; /* success; done */ 2856 if (ep->error != 0 || ep->stat != NFS4ERR_FHEXPIRED) 2857 break; 2858 /* else retry FHEXPIRED */ 2859 } 2860 2861 } 2862 2863 /* 2864 * Resend the given lost lock request. Return an errno value. If zero, 2865 * *statp is set to the NFS status code for the call. 2866 * 2867 * Issue a SIGLOST and mark the rnode dead if we get a non-recovery error or 2868 * a recovery error that we don't actually recover from yet (eg: BAD_SEQID). 2869 * Let the recovery thread redrive the call if we get a recovery error that 2870 * we can actually recover from. 
2871 */ 2872 static void 2873 resend_lock(nfs4_lost_rqst_t *lrp, nfs4_error_t *ep) 2874 { 2875 bool_t send_siglost = FALSE; 2876 vnode_t *vp = lrp->lr_vp; 2877 2878 NFS4_DEBUG(nfs4_lost_rqst_debug, (CE_NOTE, "resend_lock:")); 2879 ASSERT(lrp->lr_ctype == NFS4_LCK_CTYPE_REINSTATE || 2880 lrp->lr_ctype == NFS4_LCK_CTYPE_RESEND); 2881 2882 nfs4frlock(lrp->lr_ctype, vp, F_SETLK, 2883 lrp->lr_flk, FREAD|FWRITE, 0, lrp->lr_cr, ep, lrp, NULL); 2884 2885 NFS4_DEBUG(nfs4_lost_rqst_debug, (CE_NOTE, "resend_lock: " 2886 "nfs4frlock for vp %p returned error %d, stat %d", 2887 (void *)vp, ep->error, ep->stat)); 2888 2889 if (ep->error == 0 && ep->stat == 0) 2890 goto done; 2891 if (ep->error == 0 && ep->stat == NFS4ERR_DENIED && 2892 lrp->lr_ctype == NFS4_LCK_CTYPE_RESEND) 2893 goto done; 2894 2895 /* 2896 * If we failed with a non-recovery error, send SIGLOST and 2897 * mark the file dead. 2898 */ 2899 if (!nfs4_needs_recovery(ep, TRUE, vp->v_vfsp)) 2900 send_siglost = TRUE; 2901 else { 2902 /* 2903 * Done with recovering LOST LOCK in the event the 2904 * server rebooted or we've lost the lease. 2905 */ 2906 if (ep->error == 0 && (ep->stat == NFS4ERR_STALE_CLIENTID || 2907 ep->stat == NFS4ERR_STALE_STATEID || 2908 ep->stat == NFS4ERR_EXPIRED)) { 2909 goto done; 2910 } 2911 2912 /* 2913 * BAD_STATEID on an unlock indicates that the server has 2914 * forgotten about the lock anyway, so act like the call 2915 * was successful. 2916 */ 2917 if (ep->error == 0 && ep->stat == NFS4ERR_BAD_STATEID && 2918 lrp->lr_op == OP_LOCKU) 2919 goto done; 2920 2921 /* 2922 * If we got a recovery error that we don't actually 2923 * recover from, send SIGLOST. If the filesystem was 2924 * forcibly unmounted, we skip the SIGLOST because (a) it's 2925 * unnecessary noise, and (b) there could be a new process 2926 * with the same pid as the one that had generated the lost 2927 * state request. 
2928 */ 2929 if (ep->error == 0 && (ep->stat == NFS4ERR_BADHANDLE || 2930 nfs4_recov_marks_dead(ep->stat))) { 2931 if (!(vp->v_vfsp->vfs_flag & VFS_UNMOUNTED)) 2932 send_siglost = TRUE; 2933 goto done; 2934 } 2935 2936 /* 2937 * If the filesystem was forcibly unmounted, we 2938 * still need to synchronize with the server and 2939 * release state. Try again later. 2940 */ 2941 if (NFS4_FRC_UNMT_ERR(ep->error, vp->v_vfsp)) 2942 goto done; 2943 2944 /* 2945 * If we get a recovery error that we can actually 2946 * recover from (such as ETIMEDOUT, FHEXPIRED), 2947 * return and let the recovery thread redrive the call. 2948 * 2949 * For the three errors below, we want to delay a bit 2950 * instead of pounding the server into submission. 2951 */ 2952 if ((ep->error == 0 && ep->stat == NFS4ERR_DELAY) || 2953 (ep->error == 0 && ep->stat == NFS4ERR_GRACE) || 2954 (ep->error == 0 && ep->stat == NFS4ERR_RESOURCE)) 2955 delay(SEC_TO_TICK(recov_err_delay)); 2956 goto done; 2957 } 2958 2959 done: 2960 if (send_siglost) { 2961 cred_t *sv_cred; 2962 2963 /* 2964 * Must be root or the actual thread being issued the 2965 * SIGLOST for this to work, so just become root. 2966 */ 2967 sv_cred = curthread->t_cred; 2968 curthread->t_cred = kcred; 2969 nfs4_send_siglost(lrp->lr_flk->l_pid, VTOMI4(vp), vp, FALSE, 2970 ep->error, ep->stat); 2971 curthread->t_cred = sv_cred; 2972 2973 /* 2974 * Flush any additional reinstantiation requests for 2975 * this operation. Sending multiple SIGLOSTs to the user 2976 * process is unlikely to help and may cause trouble. 2977 */ 2978 if (lrp->lr_ctype == NFS4_LCK_CTYPE_REINSTATE) 2979 flush_reinstate(lrp); 2980 } 2981 } 2982 2983 /* 2984 * Remove any lock reinstantiation requests that correspond to the given 2985 * lost request. We only remove items that follow lrp in the queue, 2986 * assuming that lrp will be removed by the generic lost state code. 
2987 */ 2988 2989 static void 2990 flush_reinstate(nfs4_lost_rqst_t *lrp) 2991 { 2992 vnode_t *vp; 2993 pid_t pid; 2994 mntinfo4_t *mi; 2995 nfs4_lost_rqst_t *nlrp; 2996 2997 vp = lrp->lr_vp; 2998 mi = VTOMI4(vp); 2999 pid = lrp->lr_flk->l_pid; 3000 3001 /* 3002 * If there are any more reinstantation requests to get rid of, 3003 * they should all be clustered at the front of the lost state 3004 * queue. 3005 */ 3006 mutex_enter(&mi->mi_lock); 3007 for (lrp = list_next(&mi->mi_lost_state, lrp); lrp != NULL; 3008 lrp = nlrp) { 3009 nlrp = list_next(&mi->mi_lost_state, lrp); 3010 if (lrp->lr_op != OP_LOCK && lrp->lr_op != OP_LOCKU) 3011 break; 3012 if (lrp->lr_ctype != NFS4_LCK_CTYPE_REINSTATE) 3013 break; 3014 ASSERT(lrp->lr_vp == vp); 3015 ASSERT(lrp->lr_flk->l_pid == pid); 3016 NFS4_DEBUG(nfs4_lost_rqst_debug, (CE_NOTE, 3017 "remove reinstantiation %p", (void *)lrp)); 3018 list_remove(&mi->mi_lost_state, lrp); 3019 nfs4_free_lost_rqst(lrp, NULL); 3020 } 3021 mutex_exit(&mi->mi_lock); 3022 } 3023 3024 /* 3025 * End of state-specific recovery routines. 3026 */ 3027 3028 /* 3029 * Allocate a lost request struct, initialize it from lost_rqstp (including 3030 * bumping the reference counts for the referenced vnode, etc.), and hang 3031 * it off of recovp. 
3032 */ 3033 3034 static void 3035 nfs4_save_lost_rqst(nfs4_lost_rqst_t *lost_rqstp, recov_info_t *recovp, 3036 nfs4_recov_t *action, mntinfo4_t *mi) 3037 { 3038 nfs4_lost_rqst_t *destp; 3039 3040 ASSERT(recovp->rc_lost_rqst == NULL); 3041 3042 destp = kmem_alloc(sizeof (nfs4_lost_rqst_t), KM_SLEEP); 3043 recovp->rc_lost_rqst = destp; 3044 3045 if (lost_rqstp->lr_op == OP_LOCK || 3046 lost_rqstp->lr_op == OP_LOCKU) { 3047 ASSERT(lost_rqstp->lr_lop); 3048 *action = NR_LOST_LOCK; 3049 destp->lr_ctype = lost_rqstp->lr_ctype; 3050 destp->lr_locktype = lost_rqstp->lr_locktype; 3051 } else if (lost_rqstp->lr_op == OP_OPEN) { 3052 component4 *srcfp, *destfp; 3053 3054 destp->lr_oacc = lost_rqstp->lr_oacc; 3055 destp->lr_odeny = lost_rqstp->lr_odeny; 3056 destp->lr_oclaim = lost_rqstp->lr_oclaim; 3057 if (lost_rqstp->lr_oclaim == CLAIM_DELEGATE_CUR) 3058 destp->lr_ostateid = lost_rqstp->lr_ostateid; 3059 3060 srcfp = &lost_rqstp->lr_ofile; 3061 destfp = &destp->lr_ofile; 3062 /* 3063 * Consume caller's utf8string 3064 */ 3065 destfp->utf8string_len = srcfp->utf8string_len; 3066 destfp->utf8string_val = srcfp->utf8string_val; 3067 srcfp->utf8string_len = 0; 3068 srcfp->utf8string_val = NULL; /* make sure not reused */ 3069 3070 *action = NR_LOST_STATE_RQST; 3071 } else if (lost_rqstp->lr_op == OP_OPEN_DOWNGRADE) { 3072 destp->lr_dg_acc = lost_rqstp->lr_dg_acc; 3073 destp->lr_dg_deny = lost_rqstp->lr_dg_deny; 3074 3075 *action = NR_LOST_STATE_RQST; 3076 } else if (lost_rqstp->lr_op == OP_CLOSE) { 3077 ASSERT(lost_rqstp->lr_oop); 3078 *action = NR_LOST_STATE_RQST; 3079 } else if (lost_rqstp->lr_op == OP_DELEGRETURN) { 3080 *action = NR_LOST_STATE_RQST; 3081 } else { 3082 #ifdef DEBUG 3083 cmn_err(CE_PANIC, "nfs4_save_lost_rqst: bad op %d", 3084 lost_rqstp->lr_op); 3085 #endif 3086 nfs4_queue_event(RE_LOST_STATE_BAD_OP, mi, NULL, 3087 lost_rqstp->lr_op, lost_rqstp->lr_vp, lost_rqstp->lr_dvp, 3088 NFS4_OK, NULL, curproc->p_pid, TAG_NONE, TAG_NONE, 0, 0); 3089 *action = 
NR_UNUSED; 3090 recovp->rc_lost_rqst = NULL; 3091 kmem_free(destp, sizeof (nfs4_lost_rqst_t)); 3092 return; 3093 } 3094 3095 destp->lr_op = lost_rqstp->lr_op; 3096 destp->lr_vp = lost_rqstp->lr_vp; 3097 if (destp->lr_vp) 3098 VN_HOLD(destp->lr_vp); 3099 destp->lr_dvp = lost_rqstp->lr_dvp; 3100 if (destp->lr_dvp) 3101 VN_HOLD(destp->lr_dvp); 3102 destp->lr_oop = lost_rqstp->lr_oop; 3103 if (destp->lr_oop) 3104 open_owner_hold(destp->lr_oop); 3105 destp->lr_osp = lost_rqstp->lr_osp; 3106 if (destp->lr_osp) 3107 open_stream_hold(destp->lr_osp); 3108 destp->lr_lop = lost_rqstp->lr_lop; 3109 if (destp->lr_lop) 3110 lock_owner_hold(destp->lr_lop); 3111 destp->lr_cr = lost_rqstp->lr_cr; 3112 if (destp->lr_cr) 3113 crhold(destp->lr_cr); 3114 if (lost_rqstp->lr_flk == NULL) 3115 destp->lr_flk = NULL; 3116 else { 3117 destp->lr_flk = kmem_alloc(sizeof (flock64_t), KM_SLEEP); 3118 *destp->lr_flk = *lost_rqstp->lr_flk; 3119 } 3120 destp->lr_putfirst = lost_rqstp->lr_putfirst; 3121 } 3122 3123 /* 3124 * Map the given return values (errno and nfs4 status code) to a recovery 3125 * action and fill in the following fields of recovp: rc_action, 3126 * rc_srv_reboot, rc_stateid, rc_lost_rqst. 3127 */ 3128 3129 void 3130 errs_to_action(recov_info_t *recovp, 3131 nfs4_server_t *sp, mntinfo4_t *mi, stateid4 *sidp, 3132 nfs4_lost_rqst_t *lost_rqstp, int unmounted, nfs_opnum4 op, 3133 nfs4_bseqid_entry_t *bsep) 3134 { 3135 nfs4_recov_t action = NR_UNUSED; 3136 bool_t reboot = FALSE; 3137 int try_f; 3138 int error = recovp->rc_orig_errors.error; 3139 nfsstat4 stat = recovp->rc_orig_errors.stat; 3140 3141 bzero(&recovp->rc_stateid, sizeof (stateid4)); 3142 recovp->rc_lost_rqst = NULL; 3143 recovp->rc_bseqid_rqst = NULL; 3144 3145 try_f = nfs4_try_failover(&recovp->rc_orig_errors) && 3146 FAILOVER_MOUNT4(mi); 3147 3148 /* 3149 * We start recovery for EINTR only in the lost lock 3150 * or lost open/close case. 
3151 */ 3152 3153 if (try_f || error == EINTR || (error == EIO && unmounted)) { 3154 recovp->rc_error = (error != 0 ? error : geterrno4(stat)); 3155 if (lost_rqstp) { 3156 ASSERT(lost_rqstp->lr_op != 0); 3157 nfs4_save_lost_rqst(lost_rqstp, recovp, &action, mi); 3158 } 3159 if (try_f) 3160 action = NR_FAILOVER; 3161 } else if (error != 0) { 3162 recovp->rc_error = error; 3163 nfs4_queue_event(RE_UNEXPECTED_ERRNO, mi, NULL, error, NULL, 3164 NULL, 0, NULL, 0, TAG_NONE, TAG_NONE, 0, 0); 3165 action = NR_CLIENTID; 3166 } else { 3167 recovp->rc_error = geterrno4(stat); 3168 switch (stat) { 3169 #ifdef notyet 3170 case NFS4ERR_LEASE_MOVED: 3171 action = xxx; 3172 break; 3173 case NFS4ERR_MOVED: 3174 action = xxx; 3175 break; 3176 #endif 3177 case NFS4ERR_BADHANDLE: 3178 action = NR_BADHANDLE; 3179 break; 3180 case NFS4ERR_BAD_SEQID: 3181 if (bsep) 3182 save_bseqid_rqst(bsep, recovp); 3183 action = NR_BAD_SEQID; 3184 break; 3185 case NFS4ERR_OLD_STATEID: 3186 action = NR_OLDSTATEID; 3187 break; 3188 case NFS4ERR_WRONGSEC: 3189 action = NR_WRONGSEC; 3190 break; 3191 case NFS4ERR_FHEXPIRED: 3192 action = NR_FHEXPIRED; 3193 break; 3194 case NFS4ERR_BAD_STATEID: 3195 if (sp == NULL || (sp != NULL && inlease(sp))) { 3196 3197 action = NR_BAD_STATEID; 3198 if (sidp) 3199 recovp->rc_stateid = *sidp; 3200 } else 3201 action = NR_CLIENTID; 3202 break; 3203 case NFS4ERR_EXPIRED: 3204 /* 3205 * The client's lease has expired, either due 3206 * to a network partition or perhaps a client 3207 * error. In either case, try an NR_CLIENTID 3208 * style recovery. reboot remains false, since 3209 * there is no evidence the server has rebooted. 3210 * This will cause CLAIM_NULL opens and lock 3211 * requests without the reclaim bit. 
3212 */ 3213 action = NR_CLIENTID; 3214 3215 DTRACE_PROBE4(nfs4__expired, 3216 nfs4_server_t *, sp, 3217 mntinfo4_t *, mi, 3218 stateid4 *, sidp, int, op); 3219 3220 break; 3221 case NFS4ERR_STALE_CLIENTID: 3222 case NFS4ERR_STALE_STATEID: 3223 action = NR_CLIENTID; 3224 reboot = TRUE; 3225 break; 3226 case NFS4ERR_RESOURCE: 3227 /* 3228 * If this had been a FAILOVER mount, then 3229 * we'd have tried failover. Since it's not, 3230 * just delay a while and retry. 3231 */ 3232 action = NR_DELAY; 3233 break; 3234 case NFS4ERR_GRACE: 3235 action = NR_GRACE; 3236 break; 3237 case NFS4ERR_DELAY: 3238 action = NR_DELAY; 3239 break; 3240 case NFS4ERR_STALE: 3241 action = NR_STALE; 3242 break; 3243 default: 3244 nfs4_queue_event(RE_UNEXPECTED_STATUS, mi, NULL, 0, 3245 NULL, NULL, stat, NULL, 0, TAG_NONE, TAG_NONE, 3246 0, 0); 3247 action = NR_CLIENTID; 3248 break; 3249 } 3250 } 3251 3252 /* make sure action got set */ 3253 ASSERT(action != NR_UNUSED); 3254 recovp->rc_srv_reboot = reboot; 3255 recovp->rc_action = action; 3256 nfs4_queue_fact(RF_ERR, mi, stat, action, op, reboot, NULL, error, 3257 NULL); 3258 } 3259 3260 /* 3261 * Return the (held) credential for the process with the given pid. 3262 * May return NULL (e.g., process not found). 3263 */ 3264 3265 static cred_t * 3266 pid_to_cr(pid_t pid) 3267 { 3268 proc_t *p; 3269 cred_t *cr; 3270 3271 mutex_enter(&pidlock); 3272 if ((p = prfind(pid)) == NULL) { 3273 mutex_exit(&pidlock); 3274 return (NULL); 3275 } 3276 3277 mutex_enter(&p->p_crlock); 3278 crhold(cr = p->p_cred); 3279 mutex_exit(&p->p_crlock); 3280 mutex_exit(&pidlock); 3281 3282 return (cr); 3283 } 3284 3285 /* 3286 * Send SIGLOST to the given process and queue the event. 3287 * 3288 * The 'dump' boolean tells us whether this action should dump the 3289 * in-kernel queue of recovery messages or not. 
3290 */ 3291 3292 void 3293 nfs4_send_siglost(pid_t pid, mntinfo4_t *mi, vnode_t *vp, bool_t dump, 3294 int error, nfsstat4 stat) 3295 { 3296 proc_t *p; 3297 3298 mutex_enter(&pidlock); 3299 p = prfind(pid); 3300 if (p) 3301 psignal(p, SIGLOST); 3302 mutex_exit(&pidlock); 3303 nfs4_queue_event(dump ? RE_SIGLOST : RE_SIGLOST_NO_DUMP, mi, 3304 NULL, error, vp, NULL, stat, NULL, pid, TAG_NONE, TAG_NONE, 0, 0); 3305 } 3306 3307 /* 3308 * Scan the lock list for entries that match the given pid. Change the 3309 * pid in those that do to NOPID. 3310 */ 3311 3312 static void 3313 relock_skip_pid(locklist_t *llp, pid_t pid) 3314 { 3315 for (; llp != NULL; llp = llp->ll_next) { 3316 if (llp->ll_flock.l_pid == pid) 3317 llp->ll_flock.l_pid = NOPID; 3318 } 3319 } 3320 3321 /* 3322 * Mark a file as having failed recovery, after making a last-ditch effort 3323 * to return any delegation. 3324 * 3325 * Sets r_error to EIO or ESTALE for the given vnode. 3326 */ 3327 void 3328 nfs4_fail_recov(vnode_t *vp, char *why, int error, nfsstat4 stat) 3329 { 3330 rnode4_t *rp = VTOR4(vp); 3331 3332 #ifdef DEBUG 3333 if (nfs4_fail_recov_stop) 3334 debug_enter("nfs4_fail_recov"); 3335 #endif 3336 3337 mutex_enter(&rp->r_statelock); 3338 if (rp->r_flags & (R4RECOVERR|R4RECOVERRP)) { 3339 mutex_exit(&rp->r_statelock); 3340 return; 3341 } 3342 3343 /* 3344 * Set R4RECOVERRP to indicate that a recovery error is in 3345 * progress. This will shut down reads and writes at the top 3346 * half. Don't set R4RECOVERR until after we've returned the 3347 * delegation, otherwise it will fail. 3348 */ 3349 3350 rp->r_flags |= R4RECOVERRP; 3351 mutex_exit(&rp->r_statelock); 3352 3353 nfs4delegabandon(rp); 3354 3355 mutex_enter(&rp->r_statelock); 3356 rp->r_flags |= (R4RECOVERR | R4STALE); 3357 rp->r_error = (error == 0 && stat == NFS4ERR_STALE) ? 
ESTALE : EIO; 3358 PURGE_ATTRCACHE4_LOCKED(rp); 3359 if (!(vp->v_vfsp->vfs_flag & VFS_UNMOUNTED)) 3360 nfs4_queue_event(RE_DEAD_FILE, VTOMI4(vp), NULL, error, 3361 vp, NULL, stat, why, 0, TAG_NONE, TAG_NONE, 0, 0); 3362 mutex_exit(&rp->r_statelock); 3363 3364 dnlc_purge_vp(vp); 3365 } 3366 3367 /* 3368 * recov_throttle: if the file had the same recovery action within the 3369 * throttle interval, wait for the throttle interval to finish before 3370 * proceeding. 3371 * 3372 * Side effects: updates the rnode with the current recovery information. 3373 */ 3374 3375 static void 3376 recov_throttle(recov_info_t *recovp, vnode_t *vp) 3377 { 3378 time_t curtime, time_to_wait; 3379 rnode4_t *rp = VTOR4(vp); 3380 3381 curtime = gethrestime_sec(); 3382 3383 mutex_enter(&rp->r_statelock); 3384 NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE, 3385 "recov_throttle: now: (%d, %ld), last: (%d, %ld)", 3386 recovp->rc_action, curtime, 3387 rp->r_recov_act, rp->r_last_recov)); 3388 if (recovp->rc_action == rp->r_recov_act && 3389 rp->r_last_recov + recov_err_delay > curtime) { 3390 time_to_wait = rp->r_last_recov + recov_err_delay - curtime; 3391 mutex_exit(&rp->r_statelock); 3392 delay(SEC_TO_TICK(time_to_wait)); 3393 curtime = gethrestime_sec(); 3394 mutex_enter(&rp->r_statelock); 3395 } 3396 3397 rp->r_last_recov = curtime; 3398 rp->r_recov_act = recovp->rc_action; 3399 mutex_exit(&rp->r_statelock); 3400 } 3401 3402 /* 3403 * React to NFS4ERR_GRACE by setting the time we'll permit 3404 * the next call to this filesystem. 3405 */ 3406 void 3407 nfs4_set_grace_wait(mntinfo4_t *mi) 3408 { 3409 mutex_enter(&mi->mi_lock); 3410 /* Mark the time for the future */ 3411 mi->mi_grace_wait = gethrestime_sec() + nfs4err_delay_time; 3412 mutex_exit(&mi->mi_lock); 3413 } 3414 3415 /* 3416 * React to MFS4ERR_DELAY by setting the time we'll permit 3417 * the next call to this vnode. 
3418 */ 3419 void 3420 nfs4_set_delay_wait(vnode_t *vp) 3421 { 3422 rnode4_t *rp = VTOR4(vp); 3423 3424 mutex_enter(&rp->r_statelock); 3425 /* 3426 * Calculate amount we should delay, initial 3427 * delay will be short and then we will back off. 3428 */ 3429 if (rp->r_delay_interval == 0) 3430 rp->r_delay_interval = NFS4_INITIAL_DELAY_INTERVAL; 3431 else 3432 /* calculate next interval value */ 3433 rp->r_delay_interval = 3434 MIN(NFS4_MAX_DELAY_INTERVAL, (rp->r_delay_interval << 1)); 3435 rp->r_delay_wait = gethrestime_sec() + rp->r_delay_interval; 3436 mutex_exit(&rp->r_statelock); 3437 } 3438 3439 /* 3440 * The caller is responsible for freeing the returned string. 3441 */ 3442 static char * 3443 nfs4_getsrvnames(mntinfo4_t *mi, size_t *len) 3444 { 3445 servinfo4_t *svp; 3446 char *srvnames; 3447 char *namep; 3448 size_t length; 3449 3450 /* 3451 * Calculate the length of the string required to hold all 3452 * of the server names plus either a comma or a null 3453 * character following each individual one. 
3454 */ 3455 length = 0; 3456 for (svp = mi->mi_servers; svp != NULL; svp = svp->sv_next) { 3457 (void) nfs_rw_enter_sig(&svp->sv_lock, RW_READER, 0); 3458 if (svp->sv_flags & SV4_NOTINUSE) { 3459 nfs_rw_exit(&svp->sv_lock); 3460 continue; 3461 } 3462 nfs_rw_exit(&svp->sv_lock); 3463 length += svp->sv_hostnamelen; 3464 } 3465 3466 srvnames = kmem_alloc(length, KM_SLEEP); 3467 3468 namep = srvnames; 3469 for (svp = mi->mi_servers; svp != NULL; svp = svp->sv_next) { 3470 (void) nfs_rw_enter_sig(&svp->sv_lock, RW_READER, 0); 3471 if (svp->sv_flags & SV4_NOTINUSE) { 3472 nfs_rw_exit(&svp->sv_lock); 3473 continue; 3474 } 3475 nfs_rw_exit(&svp->sv_lock); 3476 (void) strcpy(namep, svp->sv_hostname); 3477 namep += svp->sv_hostnamelen - 1; 3478 *namep++ = ','; 3479 } 3480 *--namep = '\0'; 3481 3482 *len = length; 3483 3484 return (srvnames); 3485 } 3486 3487 static void 3488 save_bseqid_rqst(nfs4_bseqid_entry_t *bsep, recov_info_t *recovp) 3489 { 3490 nfs4_bseqid_entry_t *destp; 3491 3492 destp = kmem_alloc(sizeof (nfs4_bseqid_entry_t), KM_SLEEP); 3493 recovp->rc_bseqid_rqst = destp; 3494 3495 if (bsep->bs_oop) 3496 open_owner_hold(bsep->bs_oop); 3497 destp->bs_oop = bsep->bs_oop; 3498 if (bsep->bs_lop) 3499 lock_owner_hold(bsep->bs_lop); 3500 destp->bs_lop = bsep->bs_lop; 3501 if (bsep->bs_vp) 3502 VN_HOLD(bsep->bs_vp); 3503 destp->bs_vp = bsep->bs_vp; 3504 destp->bs_pid = bsep->bs_pid; 3505 destp->bs_tag = bsep->bs_tag; 3506 destp->bs_seqid = bsep->bs_seqid; 3507 } 3508 3509 static void 3510 free_bseqid_rqst(nfs4_bseqid_entry_t *bsep) 3511 { 3512 if (bsep->bs_oop) 3513 open_owner_rele(bsep->bs_oop); 3514 if (bsep->bs_lop) 3515 lock_owner_rele(bsep->bs_lop); 3516 if (bsep->bs_vp) 3517 VN_RELE(bsep->bs_vp); 3518 kmem_free(bsep, sizeof (nfs4_bseqid_entry_t)); 3519 } 3520 3521 /* 3522 * We don't actually fully recover from NFS4ERR_BAD_SEQID. We 3523 * simply mark the open owner and open stream (if provided) as "bad". 
3524 * Then future uses of these data structures will be limited to basically 3525 * just cleaning up the internal client state (no going OTW). 3526 * 3527 * The result of this is to return errors back to the app/usr when 3528 * we receive NFS4ERR_BAD_SEQID, but also allow future/new calls to 3529 * succeed so progress can be made. 3530 */ 3531 void 3532 recov_bad_seqid(recov_info_t *recovp) 3533 { 3534 mntinfo4_t *mi = recovp->rc_mi; 3535 nfs4_open_owner_t *bad_oop; 3536 nfs4_lock_owner_t *bad_lop; 3537 vnode_t *vp; 3538 rnode4_t *rp = NULL; 3539 pid_t pid; 3540 nfs4_bseqid_entry_t *bsep, *tbsep; 3541 int error; 3542 3543 ASSERT(mi != NULL); 3544 ASSERT(nfs_rw_lock_held(&mi->mi_recovlock, RW_WRITER)); 3545 3546 mutex_enter(&mi->mi_lock); 3547 bsep = list_head(&mi->mi_bseqid_list); 3548 mutex_exit(&mi->mi_lock); 3549 3550 /* 3551 * Handle all the bad seqid entries on mi's list. 3552 */ 3553 while (bsep != NULL) { 3554 bad_oop = bsep->bs_oop; 3555 bad_lop = bsep->bs_lop; 3556 vp = bsep->bs_vp; 3557 pid = bsep->bs_pid; 3558 3559 NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE, 3560 "recov_bad_seqid: mark oop %p lop %p as bad for " 3561 "vp %p tag %s pid %d: last good seqid %d for tag %s", 3562 (void *)bad_oop, (void *)bad_lop, (void *)vp, 3563 nfs4_ctags[bsep->bs_tag].ct_str, pid, 3564 bad_oop ? bad_oop->oo_last_good_seqid : 0, 3565 bad_oop ? nfs4_ctags[bad_oop->oo_last_good_op].ct_str : 3566 nfs4_ctags[TAG_NONE].ct_str)); 3567 3568 nfs4_queue_event(RE_BAD_SEQID, mi, NULL, 3569 0, vp, NULL, NFS4ERR_BAD_SEQID, NULL, pid, bsep->bs_tag, 3570 bad_oop ? bad_oop->oo_last_good_op : TAG_NONE, 3571 bsep->bs_seqid, bad_oop ? 
bad_oop->oo_last_good_seqid : 0); 3572 3573 if (bad_oop) { 3574 /* essentially reset the open owner */ 3575 error = nfs4_start_open_seqid_sync(bad_oop, mi); 3576 ASSERT(!error); /* recov thread always succeeds */ 3577 bad_oop->oo_name = nfs4_get_new_oo_name(); 3578 bad_oop->oo_seqid = 0; 3579 nfs4_end_open_seqid_sync(bad_oop); 3580 } 3581 3582 if (bad_lop) { 3583 mutex_enter(&bad_lop->lo_lock); 3584 bad_lop->lo_flags |= NFS4_BAD_SEQID_LOCK; 3585 mutex_exit(&bad_lop->lo_lock); 3586 3587 ASSERT(vp != NULL); 3588 rp = VTOR4(vp); 3589 mutex_enter(&rp->r_statelock); 3590 rp->r_flags |= R4LODANGLERS; 3591 mutex_exit(&rp->r_statelock); 3592 3593 nfs4_send_siglost(pid, mi, vp, TRUE, 3594 0, NFS4ERR_BAD_SEQID); 3595 } 3596 3597 mutex_enter(&mi->mi_lock); 3598 list_remove(&mi->mi_bseqid_list, bsep); 3599 tbsep = bsep; 3600 bsep = list_head(&mi->mi_bseqid_list); 3601 mutex_exit(&mi->mi_lock); 3602 free_bseqid_rqst(tbsep); 3603 } 3604 3605 mutex_enter(&mi->mi_lock); 3606 mi->mi_recovflags &= ~MI4R_BAD_SEQID; 3607 mutex_exit(&mi->mi_lock); 3608 } 3609