1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License, Version 1.0 only 6 * (the "License"). You may not use this file except in compliance 7 * with the License. 8 * 9 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 10 * or http://www.opensolaris.org/os/licensing. 11 * See the License for the specific language governing permissions 12 * and limitations under the License. 13 * 14 * When distributing Covered Code, include this CDDL HEADER in each 15 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 16 * If applicable, add the following below this CDDL HEADER, with the 17 * fields enclosed by brackets "[]" replaced with your own identifying 18 * information: Portions Copyright [yyyy] [name of copyright owner] 19 * 20 * CDDL HEADER END 21 */ 22 /* 23 * Copyright 2005 Sun Microsystems, Inc. All rights reserved. 24 * Use is subject to license terms. 25 */ 26 27 #pragma ident "%Z%%M% %I% %E% SMI" 28 29 /* 30 * NFS Version 4 state recovery code. 31 */ 32 33 #include <nfs/nfs4_clnt.h> 34 #include <nfs/nfs4.h> 35 #include <nfs/rnode4.h> 36 #include <sys/cmn_err.h> 37 #include <sys/cred.h> 38 #include <sys/systm.h> 39 #include <sys/flock.h> 40 #include <sys/dnlc.h> 41 #include <sys/ddi.h> 42 #include <sys/disp.h> 43 #include <sys/list.h> 44 #include <sys/sdt.h> 45 46 extern r4hashq_t *rtable4; 47 48 /* 49 * Information that describes what needs to be done for recovery. It is 50 * passed to a client recovery thread as well as passed to various recovery 51 * routines. rc_mi, rc_vp1, and rc_vp2 refer to the filesystem and 52 * vnode(s) affected by recovery. rc_vp1 and rc_vp2 are references (use 53 * VN_HOLD) or NULL. rc_lost_rqst contains information about the lost 54 * lock or open/close request, and it holds reference counts for the 55 * various objects (vnode, etc.). 
The recovery thread also uses flags set 56 * in the mntinfo4_t or vnode_t to tell it what to do. rc_error is used 57 * to save the error that originally triggered the recovery event -- will 58 * later be used to set mi_error if recovery doesn't work. rc_bseqid_rqst 59 * contains information about the request that got NFS4ERR_BAD_SEQID, and 60 * it holds reference count for the various objects (vnode, open owner, 61 * open stream, lock owner). 62 */ 63 64 typedef struct { 65 mntinfo4_t *rc_mi; 66 vnode_t *rc_vp1; 67 vnode_t *rc_vp2; 68 nfs4_recov_t rc_action; 69 stateid4 rc_stateid; 70 bool_t rc_srv_reboot; /* server has rebooted */ 71 nfs4_lost_rqst_t *rc_lost_rqst; 72 nfs4_error_t rc_orig_errors; /* original errors causing recovery */ 73 int rc_error; 74 nfs4_bseqid_entry_t *rc_bseqid_rqst; 75 } recov_info_t; 76 77 /* 78 * How long to wait before trying again if there is an error doing 79 * recovery, in seconds. 80 */ 81 82 static int recov_err_delay = 1; 83 84 /* 85 * How long to wait when processing NFS4ERR_GRACE or NFS4ERR_DELAY 86 * errors. Expressed in seconds. Default is defined as 87 * NFS4ERR_DELAY_TIME and this variable is initialized in nfs4_subr_init() 88 */ 89 time_t nfs4err_delay_time = 0; 90 91 /* 92 * Tuneable to limit how many time "exempt" ops go OTW 93 * after a recovery error. Exempt op hints are OH_CLOSE, 94 * OH_LOCKU, OH_DELEGRETURN. These previously always went 95 * OTW even after rnode was "dead" due to recovery errors. 96 * 97 * The tuneable below limits the number of times a start_fop 98 * invocation will retry the exempt hints. After the limit 99 * is reached, nfs4_start_fop will return an error just like 100 * it would for non-exempt op hints. 101 */ 102 int nfs4_max_recov_error_retry = 3; 103 104 /* 105 * Number of seconds the recovery thread should pause before retry when the 106 * filesystem has been forcibly unmounted. 
107 */ 108 109 int nfs4_unmount_delay = 1; 110 111 #ifdef DEBUG 112 113 /* 114 * How long to wait (in seconds) between recovery operations on a given 115 * file. Normally zero, but could be set longer for testing purposes. 116 */ 117 static int nfs4_recovdelay = 0; 118 119 /* 120 * Switch that controls whether to go into the debugger when recovery 121 * fails. 122 */ 123 static int nfs4_fail_recov_stop = 0; 124 125 /* 126 * Tuneables to debug client namespace interaction with server 127 * mount points: 128 * 129 * nfs4_srvmnt_fail_cnt: 130 * number of times EACCES returned because client 131 * attempted to cross server mountpoint 132 * 133 * nfs4_srvmnt_debug: 134 * trigger console printf whenever client attempts 135 * to cross server mountpoint 136 */ 137 int nfs4_srvmnt_fail_cnt = 0; 138 int nfs4_srvmnt_debug = 0; 139 #endif 140 141 /* forward references, in alphabetic order */ 142 static void close_after_open_resend(vnode_t *, cred_t *, uint32_t, 143 nfs4_error_t *); 144 static void errs_to_action(recov_info_t *, 145 nfs4_server_t *, mntinfo4_t *, stateid4 *, nfs4_lost_rqst_t *, int, 146 nfs_opnum4, nfs4_bseqid_entry_t *); 147 static void flush_reinstate(nfs4_lost_rqst_t *); 148 static void free_milist(mntinfo4_t **, int); 149 static mntinfo4_t **make_milist(nfs4_server_t *, int *); 150 static int nfs4_check_recov_err(vnode_t *, nfs4_op_hint_t, 151 nfs4_recov_state_t *, int, char *); 152 static int nfs4_check_srvstub(vnode_t *vp, rnode4_t *rp, nfs4_op_hint_t op); 153 static char *nfs4_getsrvnames(mntinfo4_t *, size_t *); 154 static void nfs4_recov_fh_fail(vnode_t *, int, nfsstat4); 155 static void nfs4_recov_thread(recov_info_t *); 156 static void nfs4_remove_lost_rqsts(mntinfo4_t *, nfs4_server_t *); 157 static void nfs4_resend_lost_rqsts(recov_info_t *, nfs4_server_t *); 158 static cred_t *pid_to_cr(pid_t); 159 static void reclaim_one_lock(vnode_t *, flock64_t *, nfs4_error_t *, int *); 160 static void recov_bad_seqid(recov_info_t *); 161 static void 
recov_badstate(recov_info_t *, vnode_t *, nfsstat4); 162 static void recov_clientid(recov_info_t *, nfs4_server_t *); 163 static void recov_done(mntinfo4_t *, recov_info_t *); 164 static void recov_filehandle(nfs4_recov_t, mntinfo4_t *, vnode_t *); 165 static void recov_newserver(recov_info_t *, nfs4_server_t **, bool_t *); 166 static void recov_openfiles(recov_info_t *, nfs4_server_t *); 167 static void recov_stale(mntinfo4_t *, vnode_t *); 168 static void nfs4_free_lost_rqst(nfs4_lost_rqst_t *, nfs4_server_t *); 169 static void recov_throttle(recov_info_t *, vnode_t *); 170 static void relock_skip_pid(locklist_t *, pid_t); 171 static void resend_lock(nfs4_lost_rqst_t *, nfs4_error_t *); 172 static void resend_one_op(nfs4_lost_rqst_t *, nfs4_error_t *, mntinfo4_t *, 173 nfs4_server_t *); 174 static void save_bseqid_rqst(nfs4_bseqid_entry_t *, recov_info_t *); 175 static void start_recovery(recov_info_t *, mntinfo4_t *, vnode_t *, vnode_t *, 176 nfs4_server_t *); 177 static void start_recovery_action(nfs4_recov_t, bool_t, mntinfo4_t *, vnode_t *, 178 vnode_t *); 179 static int wait_for_recovery(mntinfo4_t *, nfs4_op_hint_t); 180 181 /* 182 * Return non-zero if the given errno, status, and rpc status codes 183 * in the nfs4_error_t indicate that client recovery is needed. 184 * "stateful" indicates whether the call that got the error establishes or 185 * removes state on the server (open, close, lock, unlock, delegreturn). 186 */ 187 188 int 189 nfs4_needs_recovery(nfs4_error_t *ep, bool_t stateful, vfs_t *vfsp) 190 { 191 int recov = 0; 192 mntinfo4_t *mi; 193 194 /* 195 * Try failover if the error values justify it and if 196 * it's a failover mount. Don't try if the mount is in 197 * progress, failures are handled explicitly by nfs4rootvp. 
198 */ 199 if (nfs4_try_failover(ep)) { 200 mi = VFTOMI4(vfsp); 201 mutex_enter(&mi->mi_lock); 202 recov = FAILOVER_MOUNT4(mi) && !(mi->mi_flags & MI4_MOUNTING); 203 mutex_exit(&mi->mi_lock); 204 if (recov) 205 return (recov); 206 } 207 208 if (ep->error == EINTR || NFS4_FRC_UNMT_ERR(ep->error, vfsp)) { 209 /* 210 * The server may have gotten the request, so for stateful 211 * ops we need to resynchronize and possibly back out the 212 * op. 213 */ 214 return (stateful); 215 } 216 if (ep->error != 0) 217 return (0); 218 219 /* stat values are listed alphabetically */ 220 /* 221 * There are two lists here: the errors for which we have code, and 222 * the errors for which we plan to have code before FCS. For the 223 * second list, print a warning message but don't attempt recovery. 224 */ 225 switch (ep->stat) { 226 case NFS4ERR_BADHANDLE: 227 case NFS4ERR_BAD_SEQID: 228 case NFS4ERR_BAD_STATEID: 229 case NFS4ERR_DELAY: 230 case NFS4ERR_EXPIRED: 231 case NFS4ERR_FHEXPIRED: 232 case NFS4ERR_GRACE: 233 case NFS4ERR_OLD_STATEID: 234 case NFS4ERR_RESOURCE: 235 case NFS4ERR_STALE_CLIENTID: 236 case NFS4ERR_STALE_STATEID: 237 case NFS4ERR_WRONGSEC: 238 case NFS4ERR_STALE: 239 recov = 1; 240 break; 241 #ifdef DEBUG 242 case NFS4ERR_LEASE_MOVED: 243 case NFS4ERR_MOVED: 244 zcmn_err(VFTOMI4(vfsp)->mi_zone->zone_id, 245 CE_WARN, "!Can't yet recover from NFS status %d", 246 ep->stat); 247 break; 248 #endif 249 } 250 251 return (recov); 252 } 253 254 /* 255 * Some operations such as DELEGRETURN want to avoid invoking 256 * recovery actions that will only mark the file dead. If 257 * better handlers are invoked for any of these errors, this 258 * routine should be modified. 
259 */ 260 int 261 nfs4_recov_marks_dead(nfsstat4 status) 262 { 263 if (status == NFS4ERR_BAD_SEQID || 264 status == NFS4ERR_EXPIRED || 265 status == NFS4ERR_BAD_STATEID || 266 status == NFS4ERR_OLD_STATEID) 267 return (1); 268 return (0); 269 } 270 271 /* 272 * Transfer the state recovery information in recovp to mi's resend queue, 273 * and mark mi as having a lost state request. 274 */ 275 static void 276 nfs4_enqueue_lost_rqst(recov_info_t *recovp, mntinfo4_t *mi) 277 { 278 nfs4_lost_rqst_t *lrp = recovp->rc_lost_rqst; 279 280 ASSERT(nfs_rw_lock_held(&mi->mi_recovlock, RW_READER) || 281 nfs_rw_lock_held(&mi->mi_recovlock, RW_WRITER)); 282 283 ASSERT(lrp != NULL && lrp->lr_op != 0); 284 285 NFS4_DEBUG(nfs4_lost_rqst_debug, (CE_NOTE, 286 "nfs4_enqueue_lost_rqst %p, op %d", 287 (void *)lrp, lrp->lr_op)); 288 289 mutex_enter(&mi->mi_lock); 290 mi->mi_recovflags |= MI4R_LOST_STATE; 291 if (lrp->lr_putfirst) 292 list_insert_head(&mi->mi_lost_state, lrp); 293 else 294 list_insert_tail(&mi->mi_lost_state, lrp); 295 recovp->rc_lost_rqst = NULL; 296 mutex_exit(&mi->mi_lock); 297 298 nfs4_queue_event(RE_LOST_STATE, mi, NULL, lrp->lr_op, lrp->lr_vp, 299 lrp->lr_dvp, 0, NULL, 0, TAG_NONE, TAG_NONE, 0, 0); 300 } 301 302 /* 303 * Transfer the bad seqid recovery information in recovp to mi's 304 * bad seqid queue, and mark mi as having a bad seqid request. 305 */ 306 void 307 enqueue_bseqid_rqst(recov_info_t *recovp, mntinfo4_t *mi) 308 { 309 ASSERT(nfs_rw_lock_held(&mi->mi_recovlock, RW_READER) || 310 nfs_rw_lock_held(&mi->mi_recovlock, RW_WRITER)); 311 ASSERT(recovp->rc_bseqid_rqst != NULL); 312 313 mutex_enter(&mi->mi_lock); 314 mi->mi_recovflags |= MI4R_BAD_SEQID; 315 list_insert_tail(&mi->mi_bseqid_list, recovp->rc_bseqid_rqst); 316 recovp->rc_bseqid_rqst = NULL; 317 mutex_exit(&mi->mi_lock); 318 } 319 320 /* 321 * Initiate recovery. 322 * 323 * The nfs4_error_t contains the return codes that triggered a recovery 324 * attempt. 
mi, vp1, and vp2 refer to the filesystem and files that were
 * being operated on.  vp1 and vp2 may be NULL.
 *
 * Multiple calls are okay.  If recovery is already underway, the call
 * updates the information about what state needs recovery but does not
 * start a new thread.  The caller should hold mi->mi_recovlock as a reader
 * for proper synchronization with any recovery thread.
 *
 * This will return TRUE if recovery was aborted, and FALSE otherwise.
 * Recovery is aborted only when the filesystem or zone is gone and there
 * is no lost state that would still have to be handled.
 */
bool_t
nfs4_start_recovery(nfs4_error_t *ep, mntinfo4_t *mi, vnode_t *vp1,
    vnode_t *vp2, stateid4 *sid, nfs4_lost_rqst_t *lost_rqstp, nfs_opnum4 op,
    nfs4_bseqid_entry_t *bsep)
{
	recov_info_t *recovp;
	nfs4_server_t *sp;
	bool_t abort = FALSE;
	bool_t gone = FALSE;

	ASSERT(nfs_zone() == mi->mi_zone);
	mutex_enter(&mi->mi_lock);
	/*
	 * If there is lost state, we need to kick off recovery even if the
	 * filesystem has been unmounted or the zone is shutting down.
	 */
	gone = FS_OR_ZONE_GONE4(mi->mi_vfsp);
	if (gone) {
		ASSERT(ep->error != EINTR || lost_rqstp != NULL);
		if (ep->error == EIO && lost_rqstp == NULL) {
			/* failed due to forced unmount, no new lost state */
			abort = TRUE;
		}
		if ((ep->error == 0 || ep->error == ETIMEDOUT) &&
		    !(mi->mi_recovflags & MI4R_LOST_STATE)) {
			/* some other failure, no existing lost state */
			abort = TRUE;
		}
		if (abort) {
			mutex_exit(&mi->mi_lock);
			NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
			    "nfs4_start_recovery: fs unmounted"));
			return (TRUE);
		}
	}
	/* start_recovery() drops this count if no thread is created */
	mi->mi_in_recovery++;
	mutex_exit(&mi->mi_lock);

	/*
	 * Zero the structure, as start_recovery_action() does, so that no
	 * field can be read uninitialized further down the recovery path.
	 */
	recovp = kmem_zalloc(sizeof (recov_info_t), KM_SLEEP);
	recovp->rc_orig_errors = *ep;

	/* a non-NULL sp comes back with s_lock held; drop it after use */
	sp = find_nfs4_server(mi);
	errs_to_action(recovp, sp, mi, sid, lost_rqstp,
	    gone, op, bsep);
	if (sp != NULL)
		mutex_exit(&sp->s_lock);

	start_recovery(recovp, mi, vp1, vp2, sp);
	if (sp != NULL)
		nfs4_server_rele(sp);
	return (FALSE);
}

/*
 * Internal version of nfs4_start_recovery.  The difference is that the
 * caller specifies the recovery action, rather than the errors leading to
 * recovery.
389 */ 390 static void 391 start_recovery_action(nfs4_recov_t what, bool_t reboot, mntinfo4_t *mi, 392 vnode_t *vp1, vnode_t *vp2) 393 { 394 recov_info_t *recovp; 395 396 ASSERT(nfs_zone() == mi->mi_zone); 397 mutex_enter(&mi->mi_lock); 398 mi->mi_in_recovery++; 399 mutex_exit(&mi->mi_lock); 400 401 recovp = kmem_zalloc(sizeof (recov_info_t), KM_SLEEP); 402 recovp->rc_action = what; 403 recovp->rc_srv_reboot = reboot; 404 recovp->rc_error = EIO; 405 start_recovery(recovp, mi, vp1, vp2, NULL); 406 } 407 408 static void 409 start_recovery(recov_info_t *recovp, mntinfo4_t *mi, 410 vnode_t *vp1, vnode_t *vp2, nfs4_server_t *sp) 411 { 412 NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE, 413 "start_recovery: mi %p, what %s", (void*)mi, 414 nfs4_recov_action_to_str(recovp->rc_action))); 415 416 /* 417 * Bump the reference on the vfs so that we can pass it to the 418 * recovery thread. 419 */ 420 VFS_HOLD(mi->mi_vfsp); 421 422 again: 423 switch (recovp->rc_action) { 424 case NR_FAILOVER: 425 ASSERT(nfs_rw_lock_held(&mi->mi_recovlock, RW_READER) || 426 nfs_rw_lock_held(&mi->mi_recovlock, RW_WRITER)); 427 if (mi->mi_servers->sv_next == NULL) 428 goto out_no_thread; 429 mutex_enter(&mi->mi_lock); 430 mi->mi_recovflags |= MI4R_NEED_NEW_SERVER; 431 mutex_exit(&mi->mi_lock); 432 433 if (recovp->rc_lost_rqst != NULL) 434 nfs4_enqueue_lost_rqst(recovp, mi); 435 break; 436 437 case NR_CLIENTID: 438 /* 439 * If the filesystem has been unmounted, punt. 440 */ 441 if (sp == NULL) 442 goto out_no_thread; 443 444 /* 445 * If nobody else is working on the clientid, mark the 446 * clientid as being no longer set. Then mark the specific 447 * filesystem being worked on. 
448 */ 449 if (!nfs4_server_in_recovery(sp)) { 450 mutex_enter(&sp->s_lock); 451 sp->s_flags &= ~N4S_CLIENTID_SET; 452 mutex_exit(&sp->s_lock); 453 } 454 ASSERT(nfs_rw_lock_held(&mi->mi_recovlock, RW_READER) || 455 nfs_rw_lock_held(&mi->mi_recovlock, RW_WRITER)); 456 mutex_enter(&mi->mi_lock); 457 mi->mi_recovflags |= MI4R_NEED_CLIENTID; 458 if (recovp->rc_srv_reboot) 459 mi->mi_recovflags |= MI4R_SRV_REBOOT; 460 mutex_exit(&mi->mi_lock); 461 break; 462 463 case NR_OPENFILES: 464 ASSERT(nfs_rw_lock_held(&mi->mi_recovlock, RW_READER) || 465 nfs_rw_lock_held(&mi->mi_recovlock, RW_WRITER)); 466 mutex_enter(&mi->mi_lock); 467 mi->mi_recovflags |= MI4R_REOPEN_FILES; 468 if (recovp->rc_srv_reboot) 469 mi->mi_recovflags |= MI4R_SRV_REBOOT; 470 mutex_exit(&mi->mi_lock); 471 break; 472 473 case NR_WRONGSEC: 474 ASSERT(nfs_rw_lock_held(&mi->mi_recovlock, RW_READER) || 475 nfs_rw_lock_held(&mi->mi_recovlock, RW_WRITER)); 476 mutex_enter(&mi->mi_lock); 477 mi->mi_recovflags |= MI4R_NEED_SECINFO; 478 mutex_exit(&mi->mi_lock); 479 break; 480 481 case NR_EXPIRED: 482 if (vp1 != NULL) 483 recov_badstate(recovp, vp1, NFS4ERR_EXPIRED); 484 if (vp2 != NULL) 485 recov_badstate(recovp, vp2, NFS4ERR_EXPIRED); 486 goto out_no_thread; /* no further recovery possible */ 487 488 case NR_BAD_STATEID: 489 if (vp1 != NULL) 490 recov_badstate(recovp, vp1, NFS4ERR_BAD_STATEID); 491 if (vp2 != NULL) 492 recov_badstate(recovp, vp2, NFS4ERR_BAD_STATEID); 493 goto out_no_thread; /* no further recovery possible */ 494 495 case NR_FHEXPIRED: 496 case NR_BADHANDLE: 497 if (vp1 != NULL) 498 recov_throttle(recovp, vp1); 499 if (vp2 != NULL) 500 recov_throttle(recovp, vp2); 501 /* 502 * Recover the filehandle now, rather than using a 503 * separate thread. We can do this because filehandle 504 * recovery is independent of any other state, and because 505 * we know that we are not competing with the recovery 506 * thread at this time. 
recov_filehandle will deal with 507 * threads that are competing to recover this filehandle. 508 */ 509 ASSERT(nfs_rw_lock_held(&mi->mi_recovlock, RW_READER) || 510 nfs_rw_lock_held(&mi->mi_recovlock, RW_WRITER)); 511 if (vp1 != NULL) 512 recov_filehandle(recovp->rc_action, mi, vp1); 513 if (vp2 != NULL) 514 recov_filehandle(recovp->rc_action, mi, vp2); 515 goto out_no_thread; /* no further recovery needed */ 516 517 case NR_STALE: 518 /* 519 * NFS4ERR_STALE handling 520 * recov_stale() could set MI4R_NEED_NEW_SERVER to 521 * indicate that we can and should failover. 522 */ 523 ASSERT(nfs_rw_lock_held(&mi->mi_recovlock, RW_READER) || 524 nfs_rw_lock_held(&mi->mi_recovlock, RW_WRITER)); 525 526 if (vp1 != NULL) 527 recov_stale(mi, vp1); 528 if (vp2 != NULL) 529 recov_stale(mi, vp2); 530 mutex_enter(&mi->mi_lock); 531 if ((mi->mi_recovflags & MI4R_NEED_NEW_SERVER) == 0) { 532 mutex_exit(&mi->mi_lock); 533 goto out_no_thread; 534 } 535 mutex_exit(&mi->mi_lock); 536 recovp->rc_action = NR_FAILOVER; 537 goto again; 538 539 case NR_BAD_SEQID: 540 if (recovp->rc_bseqid_rqst) { 541 enqueue_bseqid_rqst(recovp, mi); 542 break; 543 } 544 545 if (vp1 != NULL) 546 recov_badstate(recovp, vp1, NFS4ERR_BAD_SEQID); 547 if (vp2 != NULL) 548 recov_badstate(recovp, vp2, NFS4ERR_BAD_SEQID); 549 goto out_no_thread; /* no further recovery possible */ 550 551 case NR_OLDSTATEID: 552 if (vp1 != NULL) 553 recov_badstate(recovp, vp1, NFS4ERR_OLD_STATEID); 554 if (vp2 != NULL) 555 recov_badstate(recovp, vp2, NFS4ERR_OLD_STATEID); 556 goto out_no_thread; /* no further recovery possible */ 557 558 case NR_GRACE: 559 nfs4_set_grace_wait(mi); 560 goto out_no_thread; /* no further action required for GRACE */ 561 562 case NR_DELAY: 563 if (vp1) 564 nfs4_set_delay_wait(vp1); 565 goto out_no_thread; /* no further action required for DELAY */ 566 567 case NR_LOST_STATE_RQST: 568 case NR_LOST_LOCK: 569 nfs4_enqueue_lost_rqst(recovp, mi); 570 break; 571 572 default: 573 
nfs4_queue_event(RE_UNEXPECTED_ACTION, mi, NULL, 574 recovp->rc_action, NULL, NULL, 0, NULL, 0, TAG_NONE, 575 TAG_NONE, 0, 0); 576 goto out_no_thread; 577 } 578 579 /* 580 * If either file recently went through the same recovery, wait 581 * awhile. This is in case there is some sort of bug; we might not 582 * be able to recover properly, but at least we won't bombard the 583 * server with calls, and we won't tie up the client. 584 */ 585 if (vp1 != NULL) 586 recov_throttle(recovp, vp1); 587 if (vp2 != NULL) 588 recov_throttle(recovp, vp2); 589 590 /* 591 * If there's already a recovery thread, don't start another one. 592 */ 593 594 mutex_enter(&mi->mi_lock); 595 if (mi->mi_flags & MI4_RECOV_ACTIV) { 596 mutex_exit(&mi->mi_lock); 597 goto out_no_thread; 598 } 599 mi->mi_flags |= MI4_RECOV_ACTIV; 600 mutex_exit(&mi->mi_lock); 601 NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE, 602 "start_recovery: starting new thread for mi %p", (void*)mi)); 603 604 recovp->rc_mi = mi; 605 recovp->rc_vp1 = vp1; 606 if (vp1 != NULL) { 607 ASSERT(VTOMI4(vp1) == mi); 608 VN_HOLD(recovp->rc_vp1); 609 } 610 recovp->rc_vp2 = vp2; 611 if (vp2 != NULL) { 612 ASSERT(VTOMI4(vp2) == mi); 613 VN_HOLD(recovp->rc_vp2); 614 } 615 616 (void) zthread_create(NULL, 0, nfs4_recov_thread, recovp, 0, 617 minclsyspri); 618 return; 619 620 /* not reached by thread creating call */ 621 out_no_thread: 622 mutex_enter(&mi->mi_lock); 623 mi->mi_in_recovery--; 624 cv_broadcast(&mi->mi_cv_in_recov); 625 mutex_exit(&mi->mi_lock); 626 627 VFS_RELE(mi->mi_vfsp); 628 /* 629 * Free up resources that were allocated for us. 630 */ 631 kmem_free(recovp, sizeof (recov_info_t)); 632 } 633 634 static int 635 nfs4_check_srvstub(vnode_t *vp, rnode4_t *rp, nfs4_op_hint_t op) 636 { 637 int err = 0; 638 639 /* 640 * If tuneable does not allow client to cross srv mountpoints and 641 * object is a stub, then check check op hint and return EACCES for 642 * any hint other than access, rddir, getattr, lookup. 
643 */ 644 if (rp->r_flags & R4SRVSTUB && op != OH_ACCESS && op != OH_GETACL && 645 op != OH_GETATTR && op != OH_READDIR && op != OH_LOOKUP) { 646 err = EACCES; 647 #ifdef DEBUG 648 NFS4_DEBUG(nfs4_srvmnt_debug, (CE_NOTE, 649 "nfs4_check_srvstub: op=%d err=%d rp=%p vp=%p\n" 650 "va_nod=%llx r_mntd_fid=%llx\n" 651 "sv_fsid=(%llx:%llx) r_srv_fsid=(%llx:%llx)", 652 op, err, (void *)rp, (void *)vp, 653 (u_longlong_t)rp->r_attr.va_nodeid, 654 (u_longlong_t)rp->r_mntd_fid, 655 (u_longlong_t)rp->r_server->sv_fsid.major, 656 (u_longlong_t)rp->r_server->sv_fsid.minor, 657 (u_longlong_t)rp->r_srv_fsid.major, 658 (u_longlong_t)rp->r_srv_fsid.minor)); 659 #endif 660 } 661 662 return (err); 663 } 664 665 static int 666 nfs4_check_recov_err(vnode_t *vp, nfs4_op_hint_t op, 667 nfs4_recov_state_t *rsp, int retry_err_cnt, char *str) 668 { 669 rnode4_t *rp; 670 int error = 0; 671 int exempt; 672 673 if (vp == NULL) 674 return (0); 675 676 exempt = (op == OH_CLOSE || op == OH_LOCKU || op == OH_DELEGRETURN); 677 rp = VTOR4(vp); 678 mutex_enter(&rp->r_statelock); 679 680 /* 681 * If there was a recovery error, then allow op hints "exempt" from 682 * recov errors to retry (currently 3 times). Either r_error or 683 * EIO is returned for non-exempt op hints. 684 * 685 * Error heirarchy: 686 * a) check for R4ERECOVERR 687 * b) check for R4SRVSTUB (only if R4RECOVERR is not set). 688 */ 689 if (rp->r_flags & R4RECOVERR) { 690 if (exempt && rsp->rs_num_retry_despite_err <= 691 nfs4_max_recov_error_retry) { 692 693 /* 694 * Check to make sure that we haven't already inc'd 695 * rs_num_retry_despite_err for current nfs4_start_fop 696 * instance. We don't want to double inc (if we were 697 * called with vp2, then the vp1 call could have 698 * already incremented. 
699 */ 700 if (retry_err_cnt == rsp->rs_num_retry_despite_err) 701 rsp->rs_num_retry_despite_err++; 702 703 NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE, 704 "nfs4_start_fop: %s %p DEAD, cnt=%d", str, 705 (void *)vp, rsp->rs_num_retry_despite_err)); 706 } else { 707 error = (rp->r_error ? rp->r_error : EIO); 708 /* 709 * An ESTALE error on a non-regular file is not 710 * "sticky". Return the ESTALE error once, but 711 * clear the condition to allow future operations 712 * to go OTW. This will allow the client to 713 * recover if the server has merely unshared then 714 * re-shared the file system. For regular files, 715 * the unshare has destroyed the open state at the 716 * server and we aren't willing to do a reopen (yet). 717 */ 718 if (error == ESTALE && vp->v_type != VREG) { 719 rp->r_flags &= 720 ~(R4RECOVERR|R4RECOVERRP|R4STALE); 721 rp->r_error = 0; 722 error = ESTALE; 723 } 724 NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE, 725 "nfs4_start_fop: %s %p DEAD, cnt=%d error=%d", 726 str, (void *)vp, 727 rsp->rs_num_retry_despite_err, error)); 728 } 729 } else { 730 error = nfs4_check_srvstub(vp, rp, op); 731 NFS4_DEBUG(nfs4_client_recov_stub_debug, (CE_NOTE, 732 "nfs4_start_fop: %s %p SRVSTUB, error=%d", str, 733 (void *)vp, error)); 734 } 735 mutex_exit(&rp->r_statelock); 736 return (error); 737 } 738 739 /* 740 * Initial setup code that every operation should call if it might invoke 741 * client recovery. Can block waiting for recovery to finish on a 742 * filesystem. Either vnode ptr can be NULL. 743 * 744 * Returns 0 if there are no outstanding errors. Can return an 745 * errno value under various circumstances (e.g., failed recovery, or 746 * interrupted while waiting for recovery to finish). 747 * 748 * There must be a corresponding call to nfs4_end_op() to free up any locks 749 * or resources allocated by this call (assuming this call succeeded), 750 * using the same rsp that's passed in here. 
751 * 752 * The open and lock seqid synchronization must be stopped before calling this 753 * function, as it could lead to deadlock when trying to reopen a file or 754 * reclaim a lock. The synchronization is obtained with calls to: 755 * nfs4_start_open_seqid_sync() 756 * nfs4_start_lock_seqid_sync() 757 * 758 * *startrecovp is set TRUE if the caller should not bother with the 759 * over-the-wire call, and just initiate recovery for the given request. 760 * This is typically used for state-releasing ops if the filesystem has 761 * been forcibly unmounted. startrecovp may be NULL for 762 * non-state-releasing ops. 763 */ 764 765 int 766 nfs4_start_fop(mntinfo4_t *mi, vnode_t *vp1, vnode_t *vp2, nfs4_op_hint_t op, 767 nfs4_recov_state_t *rsp, bool_t *startrecovp) 768 { 769 int error = 0, rerr_cnt; 770 nfs4_server_t *sp = NULL; 771 nfs4_server_t *tsp; 772 nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS }; 773 time_t droplock_time; 774 #ifdef DEBUG 775 void *fop_caller; 776 #endif 777 778 ASSERT(vp1 == NULL || vp1->v_vfsp == mi->mi_vfsp); 779 ASSERT(vp2 == NULL || vp2->v_vfsp == mi->mi_vfsp); 780 781 #ifdef DEBUG 782 if ((fop_caller = tsd_get(nfs4_tsd_key)) != NULL) { 783 cmn_err(CE_PANIC, "Missing nfs4_end_fop: last caller %p", 784 fop_caller); 785 } 786 (void) tsd_set(nfs4_tsd_key, caller()); 787 #endif 788 789 rsp->rs_sp = NULL; 790 rsp->rs_flags &= ~NFS4_RS_RENAME_HELD; 791 rerr_cnt = rsp->rs_num_retry_despite_err; 792 793 /* 794 * Process the items that may delay() based on server response 795 */ 796 error = nfs4_wait_for_grace(mi, rsp); 797 if (error) 798 goto out; 799 800 if (vp1 != NULL) { 801 error = nfs4_wait_for_delay(vp1, rsp); 802 if (error) 803 goto out; 804 } 805 806 /* Wait for a delegation recall to complete. */ 807 808 error = wait_for_recall(vp1, vp2, op, rsp); 809 if (error) 810 goto out; 811 812 /* 813 * Wait for any current recovery actions to finish. Note that a 814 * recovery thread can still start up after wait_for_recovery() 815 * finishes. 
We don't block out recovery operations until we 816 * acquire s_recovlock and mi_recovlock. 817 */ 818 error = wait_for_recovery(mi, op); 819 if (error) 820 goto out; 821 822 /* 823 * Check to see if the rnode is already marked with a 824 * recovery error. If so, return it immediately. But 825 * always pass CLOSE, LOCKU, and DELEGRETURN so we can 826 * clean up state on the server. 827 */ 828 829 if (vp1 != NULL) { 830 if (error = nfs4_check_recov_err(vp1, op, rsp, rerr_cnt, "vp1")) 831 goto out; 832 nfs4_check_remap(mi, vp1, NFS4_REMAP_CKATTRS, &e); 833 } 834 835 if (vp2 != NULL) { 836 if (error = nfs4_check_recov_err(vp2, op, rsp, rerr_cnt, "vp2")) 837 goto out; 838 nfs4_check_remap(mi, vp2, NFS4_REMAP_CKATTRS, &e); 839 } 840 841 /* 842 * The lock order calls for us to acquire s_recovlock before 843 * mi_recovlock, but we have to hold mi_recovlock to look up sp (to 844 * prevent races with the failover/migration code). So acquire 845 * mi_recovlock, look up sp, drop mi_recovlock, acquire 846 * s_recovlock and mi_recovlock, then verify that sp is still the 847 * right object. XXX Can we find a simpler way to deal with this? 848 */ 849 if (nfs_rw_enter_sig(&mi->mi_recovlock, RW_READER, 850 mi->mi_flags & MI4_INT)) { 851 error = EINTR; 852 goto out; 853 } 854 get_sp: 855 sp = find_nfs4_server(mi); 856 if (sp != NULL) { 857 sp->s_otw_call_count++; 858 mutex_exit(&sp->s_lock); 859 droplock_time = gethrestime_sec(); 860 } 861 nfs_rw_exit(&mi->mi_recovlock); 862 863 if (sp != NULL) { 864 if (nfs_rw_enter_sig(&sp->s_recovlock, RW_READER, 865 mi->mi_flags & MI4_INT)) { 866 error = EINTR; 867 goto out; 868 } 869 } 870 if (nfs_rw_enter_sig(&mi->mi_recovlock, RW_READER, 871 mi->mi_flags & MI4_INT)) { 872 if (sp != NULL) 873 nfs_rw_exit(&sp->s_recovlock); 874 error = EINTR; 875 goto out; 876 } 877 /* 878 * If the mntinfo4_t hasn't changed nfs4_sever_ts then 879 * there's no point in double checking to make sure it 880 * has switched. 
881 */ 882 if (sp == NULL || droplock_time < mi->mi_srvsettime) { 883 tsp = find_nfs4_server(mi); 884 if (tsp != sp) { 885 /* try again */ 886 if (tsp != NULL) { 887 mutex_exit(&tsp->s_lock); 888 nfs4_server_rele(tsp); 889 tsp = NULL; 890 } 891 if (sp != NULL) { 892 nfs_rw_exit(&sp->s_recovlock); 893 mutex_enter(&sp->s_lock); 894 sp->s_otw_call_count--; 895 mutex_exit(&sp->s_lock); 896 nfs4_server_rele(sp); 897 sp = NULL; 898 } 899 goto get_sp; 900 } else { 901 if (tsp != NULL) { 902 mutex_exit(&tsp->s_lock); 903 nfs4_server_rele(tsp); 904 tsp = NULL; 905 } 906 } 907 } 908 909 if (sp != NULL) { 910 rsp->rs_sp = sp; 911 } 912 913 /* 914 * If the fileystem uses volatile filehandles, obtain a lock so 915 * that we synchronize with renames. Exception: mount operations 916 * can change mi_fh_expire_type, which could be a problem, since 917 * the end_op code needs to be consistent with the start_op code 918 * about mi_rename_lock. Since mounts don't compete with renames, 919 * it's simpler to just not acquire the rename lock for mounts. 920 */ 921 if (NFS4_VOLATILE_FH(mi) && op != OH_MOUNT) { 922 if (nfs_rw_enter_sig(&mi->mi_rename_lock, 923 op == OH_VFH_RENAME ? RW_WRITER : RW_READER, 924 mi->mi_flags & MI4_INT)) { 925 nfs_rw_exit(&mi->mi_recovlock); 926 if (sp != NULL) 927 nfs_rw_exit(&sp->s_recovlock); 928 error = EINTR; 929 goto out; 930 } 931 rsp->rs_flags |= NFS4_RS_RENAME_HELD; 932 } 933 934 if (OH_IS_STATE_RELE(op)) { 935 /* 936 * For forced unmount, letting the request proceed will 937 * almost always delay response to the user, so hand it off 938 * to the recovery thread. For exiting lwp's, we don't 939 * have a good way to tell if the request will hang. We 940 * generally want processes to handle their own requests so 941 * that they can be done in parallel, but if there is 942 * already a recovery thread, hand the request off to it. 943 * This will improve user response at no cost to overall 944 * system throughput. 
For zone shutdown, we'd prefer 945 * the recovery thread to handle this as well. 946 */ 947 ASSERT(startrecovp != NULL); 948 mutex_enter(&mi->mi_lock); 949 if (FS_OR_ZONE_GONE4(mi->mi_vfsp)) 950 *startrecovp = TRUE; 951 else if ((curthread->t_proc_flag & TP_LWPEXIT) && 952 (mi->mi_flags & MI4_RECOV_ACTIV)) 953 *startrecovp = TRUE; 954 else 955 *startrecovp = FALSE; 956 mutex_exit(&mi->mi_lock); 957 } else 958 if (startrecovp != NULL) 959 *startrecovp = FALSE; 960 961 ASSERT(error == 0); 962 return (error); 963 964 out: 965 ASSERT(error != 0); 966 if (sp != NULL) { 967 mutex_enter(&sp->s_lock); 968 sp->s_otw_call_count--; 969 mutex_exit(&sp->s_lock); 970 nfs4_server_rele(sp); 971 rsp->rs_sp = NULL; 972 } 973 nfs4_end_op_recall(vp1, vp2, rsp); 974 975 #ifdef DEBUG 976 (void) tsd_set(nfs4_tsd_key, NULL); 977 #endif 978 return (error); 979 } 980 981 /* 982 * It is up to the caller to determine if rsp->rs_sp being NULL 983 * is detrimental or not. 984 */ 985 int 986 nfs4_start_op(mntinfo4_t *mi, vnode_t *vp1, vnode_t *vp2, 987 nfs4_recov_state_t *rsp) 988 { 989 ASSERT(rsp->rs_num_retry_despite_err == 0); 990 rsp->rs_num_retry_despite_err = 0; 991 return (nfs4_start_fop(mi, vp1, vp2, OH_OTHER, rsp, NULL)); 992 } 993 994 /* 995 * Release any resources acquired by nfs4_start_op(). 996 * 'sp' should be the nfs4_server pointer returned by nfs4_start_op(). 997 * 998 * The operation hint is used to avoid a deadlock by bypassing delegation 999 * return logic for writes, which are done while returning a delegation. 1000 */ 1001 1002 void 1003 nfs4_end_fop(mntinfo4_t *mi, vnode_t *vp1, vnode_t *vp2, nfs4_op_hint_t op, 1004 nfs4_recov_state_t *rsp, bool_t needs_recov) 1005 { 1006 nfs4_server_t *sp = rsp->rs_sp; 1007 rnode4_t *rp = NULL; 1008 1009 #ifdef lint 1010 /* 1011 * The op hint isn't used any more, but might be in 1012 * the future. 
1013 */ 1014 op = op; 1015 #endif 1016 1017 #ifdef DEBUG 1018 ASSERT(tsd_get(nfs4_tsd_key) != NULL); 1019 (void) tsd_set(nfs4_tsd_key, NULL); 1020 #endif 1021 1022 nfs4_end_op_recall(vp1, vp2, rsp); 1023 1024 if (rsp->rs_flags & NFS4_RS_RENAME_HELD) 1025 nfs_rw_exit(&mi->mi_rename_lock); 1026 1027 if (!needs_recov) { 1028 if (rsp->rs_flags & NFS4_RS_DELAY_MSG) { 1029 /* may need to clear the delay interval */ 1030 if (vp1 != NULL) { 1031 rp = VTOR4(vp1); 1032 mutex_enter(&rp->r_statelock); 1033 rp->r_delay_interval = 0; 1034 mutex_exit(&rp->r_statelock); 1035 } 1036 } 1037 rsp->rs_flags &= ~(NFS4_RS_GRACE_MSG|NFS4_RS_DELAY_MSG); 1038 } 1039 1040 /* 1041 * If the corresponding nfs4_start_op() found a sp, 1042 * then there must still be a sp. 1043 */ 1044 if (sp != NULL) { 1045 nfs_rw_exit(&mi->mi_recovlock); 1046 nfs_rw_exit(&sp->s_recovlock); 1047 mutex_enter(&sp->s_lock); 1048 sp->s_otw_call_count--; 1049 cv_broadcast(&sp->s_cv_otw_count); 1050 mutex_exit(&sp->s_lock); 1051 nfs4_server_rele(sp); 1052 } else { 1053 nfs_rw_exit(&mi->mi_recovlock); 1054 } 1055 } 1056 1057 void 1058 nfs4_end_op(mntinfo4_t *mi, vnode_t *vp1, vnode_t *vp2, 1059 nfs4_recov_state_t *rsp, bool_t needrecov) 1060 { 1061 nfs4_end_fop(mi, vp1, vp2, OH_OTHER, rsp, needrecov); 1062 } 1063 1064 /* 1065 * If the filesystem is going through client recovery, block until 1066 * finished. 1067 * Exceptions: 1068 * - state-releasing ops (CLOSE, LOCKU, DELEGRETURN) are allowed to proceed 1069 * if the filesystem has been forcibly unmounted or the lwp is exiting. 
1070 * 1071 * Return value: 1072 * - 0 if no errors 1073 * - EINTR if the call was interrupted 1074 * - EIO if the filesystem has been forcibly unmounted (non-state-releasing 1075 * op) 1076 * - the errno value from the recovery thread, if recovery failed 1077 */ 1078 1079 static int 1080 wait_for_recovery(mntinfo4_t *mi, nfs4_op_hint_t op_hint) 1081 { 1082 int error = 0; 1083 1084 mutex_enter(&mi->mi_lock); 1085 1086 while (mi->mi_recovflags != 0) { 1087 klwp_t *lwp = ttolwp(curthread); 1088 1089 if (mi->mi_flags & MI4_RECOV_FAIL) 1090 break; 1091 if (mi->mi_vfsp->vfs_flag & VFS_UNMOUNTED) 1092 break; 1093 if (OH_IS_STATE_RELE(op_hint) && 1094 (curthread->t_proc_flag & TP_LWPEXIT)) 1095 break; 1096 1097 if (lwp != NULL) 1098 lwp->lwp_nostop++; 1099 /* XXX - use different cv? */ 1100 if (cv_wait_sig(&mi->mi_failover_cv, &mi->mi_lock) == 0) { 1101 error = EINTR; 1102 if (lwp != NULL) 1103 lwp->lwp_nostop--; 1104 break; 1105 } 1106 if (lwp != NULL) 1107 lwp->lwp_nostop--; 1108 } 1109 1110 if (mi->mi_flags & MI4_RECOV_FAIL) { 1111 NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE, 1112 "wait_for_recovery: fail since RECOV FAIL")); 1113 error = mi->mi_error; 1114 } else if ((mi->mi_vfsp->vfs_flag & VFS_UNMOUNTED) && 1115 !OH_IS_STATE_RELE(op_hint)) { 1116 NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE, 1117 "wait_for_recovery: forced unmount")); 1118 error = EIO; 1119 } 1120 1121 mutex_exit(&mi->mi_lock); 1122 1123 return (error); 1124 } 1125 1126 /* 1127 * If the client received NFS4ERR_GRACE for this particular mount, 1128 * the client blocks here until it is time to try again. 
1129 * 1130 * Return value: 1131 * - 0 if wait was successful 1132 * - EINTR if the call was interrupted 1133 */ 1134 1135 int 1136 nfs4_wait_for_grace(mntinfo4_t *mi, nfs4_recov_state_t *rsp) 1137 { 1138 int error = 0; 1139 time_t curtime, time_to_wait; 1140 1141 /* do a unprotected check to reduce mi_lock contention */ 1142 if (mi->mi_grace_wait != 0) { 1143 mutex_enter(&mi->mi_lock); 1144 1145 if (mi->mi_grace_wait != 0) { 1146 if (!(rsp->rs_flags & NFS4_RS_GRACE_MSG)) 1147 rsp->rs_flags |= NFS4_RS_GRACE_MSG; 1148 1149 curtime = gethrestime_sec(); 1150 1151 if (curtime < mi->mi_grace_wait) { 1152 1153 time_to_wait = mi->mi_grace_wait - curtime; 1154 1155 mutex_exit(&mi->mi_lock); 1156 1157 delay(SEC_TO_TICK(time_to_wait)); 1158 1159 curtime = gethrestime_sec(); 1160 1161 mutex_enter(&mi->mi_lock); 1162 1163 if (curtime >= mi->mi_grace_wait) 1164 mi->mi_grace_wait = 0; 1165 } else { 1166 mi->mi_grace_wait = 0; 1167 } 1168 } 1169 mutex_exit(&mi->mi_lock); 1170 } 1171 1172 return (error); 1173 } 1174 1175 /* 1176 * If the client received NFS4ERR_DELAY for an operation on a vnode, 1177 * the client blocks here until it is time to try again. 
1178 * 1179 * Return value: 1180 * - 0 if wait was successful 1181 * - EINTR if the call was interrupted 1182 */ 1183 1184 int 1185 nfs4_wait_for_delay(vnode_t *vp, nfs4_recov_state_t *rsp) 1186 { 1187 int error = 0; 1188 time_t curtime, time_to_wait; 1189 rnode4_t *rp; 1190 1191 ASSERT(vp != NULL); 1192 1193 rp = VTOR4(vp); 1194 1195 /* do a unprotected check to reduce r_statelock contention */ 1196 if (rp->r_delay_wait != 0) { 1197 mutex_enter(&rp->r_statelock); 1198 1199 if (rp->r_delay_wait != 0) { 1200 1201 if (!(rsp->rs_flags & NFS4_RS_DELAY_MSG)) { 1202 rsp->rs_flags |= NFS4_RS_DELAY_MSG; 1203 nfs4_mi_kstat_inc_delay(VTOMI4(vp)); 1204 } 1205 1206 curtime = gethrestime_sec(); 1207 1208 if (curtime < rp->r_delay_wait) { 1209 1210 time_to_wait = rp->r_delay_wait - curtime; 1211 1212 mutex_exit(&rp->r_statelock); 1213 1214 delay(SEC_TO_TICK(time_to_wait)); 1215 1216 curtime = gethrestime_sec(); 1217 1218 mutex_enter(&rp->r_statelock); 1219 1220 if (curtime >= rp->r_delay_wait) 1221 rp->r_delay_wait = 0; 1222 } else { 1223 rp->r_delay_wait = 0; 1224 } 1225 } 1226 mutex_exit(&rp->r_statelock); 1227 } 1228 1229 return (error); 1230 } 1231 1232 /* 1233 * The recovery thread. 
*
 * recovp describes the filesystem (rc_mi), the vnodes involved (rc_vp1,
 * rc_vp2) and the original error (rc_error).  The thread records itself
 * in mi_recovthread and then loops, performing on each pass whichever of
 * the following steps the current mi_recovflags call for, in this order:
 *   1. forced-unmount / zone-shutdown handling
 *   2. failover to a new server (MI4R_NEED_NEW_SERVER)
 *   3. clientid recovery (server lacks N4S_CLIENTID_SET)
 *   4. security information (MI4R_NEED_SECINFO)
 *   5. bad seqid recovery (MI4R_BAD_SEQID)
 *   6. reopening files (MI4R_REOPEN_FILES)
 *   7. resending lost state requests (MI4R_LOST_STATE)
 * The loop ends when no flag other than MI4R_SRV_REBOOT remains set, when
 * MI4_RECOV_FAIL has been set, or when the filesystem/zone is gone and
 * there is no lost state left to process.
 *
 * On exit the thread releases its nfs4_server reference, the vnode and
 * vfs holds passed in through recovp, frees recovp itself, and terminates
 * via zthread_exit(); it never returns to its caller.
 */

static void
nfs4_recov_thread(recov_info_t *recovp)
{
	mntinfo4_t *mi = recovp->rc_mi;
	nfs4_server_t *sp;
	int done = 0, error = 0;
	bool_t recov_fail = FALSE;
	callb_cpr_t cpr_info;
	kmutex_t cpr_lock;

	nfs4_queue_event(RE_START, mi, NULL, mi->mi_recovflags,
	    recovp->rc_vp1, recovp->rc_vp2, 0, NULL, 0, TAG_NONE, TAG_NONE,
	    0, 0);

	mutex_init(&cpr_lock, NULL, MUTEX_DEFAULT, NULL);
	CALLB_CPR_INIT(&cpr_info, &cpr_lock, callb_generic_cpr, "nfsv4Recov");

	mutex_enter(&mi->mi_lock);
	mi->mi_recovthread = curthread;
	mutex_exit(&mi->mi_lock);

	/*
	 * We don't really need protection here against failover or
	 * migration, since the current thread is the one that would make
	 * any changes, but hold mi_recovlock anyway for completeness (and
	 * to satisfy any ASSERTs).
	 */
	(void) nfs_rw_enter_sig(&mi->mi_recovlock, RW_READER, 0);
	sp = find_nfs4_server(mi);
	/* A non-NULL sp comes back with s_lock held; we only want the ref. */
	if (sp != NULL)
		mutex_exit(&sp->s_lock);
	nfs_rw_exit(&mi->mi_recovlock);

	/*
	 * Do any necessary recovery, based on the information in recovp
	 * and any recovery flags.
	 */

	do {
		mutex_enter(&mi->mi_lock);
		if (FS_OR_ZONE_GONE4(mi->mi_vfsp)) {
			bool_t activesrv;

			NFS4_DEBUG(nfs4_client_recov_debug &&
			    mi->mi_vfsp->vfs_flag & VFS_UNMOUNTED, (CE_NOTE,
			    "nfs4_recov_thread: file system has been "
			    "unmounted"));
			NFS4_DEBUG(nfs4_client_recov_debug &&
			    zone_status_get(curproc->p_zone) >=
			    ZONE_IS_SHUTTING_DOWN, (CE_NOTE,
			    "nfs4_recov_thread: zone shutting down"));
			/*
			 * If the server has lost its state for us and
			 * the filesystem is unmounted, then the filesystem
			 * can be tossed, even if there are lost lock or
			 * lost state calls in the recovery queue.
			 */
			if (mi->mi_recovflags &
			    (MI4R_NEED_CLIENTID | MI4R_REOPEN_FILES)) {
				NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
				"nfs4_recov_thread: bailing out"));
				mi->mi_flags |= MI4_RECOV_FAIL;
				mi->mi_error = recovp->rc_error;
				recov_fail = TRUE;
			}
			/*
			 * We don't know if the server has any state for
			 * us, and the filesystem has been unmounted.  If
			 * there are "lost state" recovery items, keep
			 * trying to process them until there are no more
			 * mounted filesystems for the server.  Otherwise,
			 * bail out.  The reason we don't mark the
			 * filesystem as failing recovery is in case we
			 * have to do "lost state" recovery later (e.g., a
			 * user process exits).
			 */
			if (!(mi->mi_recovflags & MI4R_LOST_STATE)) {
				done = 1;
				mutex_exit(&mi->mi_lock);
				break;
			}
			mutex_exit(&mi->mi_lock);

			/*
			 * s_lock is taken here (when sp exists) and held
			 * until the matching mutex_exit() at the bottom of
			 * this branch, covering nfs4_fs_active() and
			 * nfs4_mark_srv_dead().
			 */
			if (sp == NULL)
				activesrv = FALSE;
			else {
				mutex_enter(&sp->s_lock);
				activesrv = nfs4_fs_active(sp);
			}
			if (!activesrv) {
				NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
				    "no active fs for server %p",
				    (void *)sp));
				mutex_enter(&mi->mi_lock);
				mi->mi_flags |= MI4_RECOV_FAIL;
				mi->mi_error = recovp->rc_error;
				mutex_exit(&mi->mi_lock);
				recov_fail = TRUE;
				if (sp != NULL) {
					/*
					 * Mark the server instance as
					 * dead, so that nobody will attach
					 * a new filesystem.
					 */
					nfs4_mark_srv_dead(sp);
				}
			}
			if (sp != NULL)
				mutex_exit(&sp->s_lock);
		} else {
			mutex_exit(&mi->mi_lock);
		}

		/*
		 * Check if we need to select a new server for a
		 * failover.  Choosing a new server will force at
		 * least a check of the clientid.
		 */
		mutex_enter(&mi->mi_lock);
		if (!recov_fail &&
		    (mi->mi_recovflags & MI4R_NEED_NEW_SERVER)) {
			mutex_exit(&mi->mi_lock);
			/* May replace sp and may set recov_fail. */
			recov_newserver(recovp, &sp, &recov_fail);
		} else
			mutex_exit(&mi->mi_lock);

		/*
		 * Check if we need to recover the clientid.  This
		 * must be done before file and lock recovery, and it
		 * potentially affects the recovery threads for other
		 * filesystems, so it gets special treatment.
		 */
		if (sp != NULL && recov_fail == FALSE) {
			mutex_enter(&sp->s_lock);
			if (!(sp->s_flags & N4S_CLIENTID_SET)) {
				mutex_exit(&sp->s_lock);
				recov_clientid(recovp, sp);
			} else {
				/*
				 * Unset this flag in case another recovery
				 * thread successfully recovered the clientid
				 * for us already.
				 */
				mutex_enter(&mi->mi_lock);
				mi->mi_recovflags &= ~MI4R_NEED_CLIENTID;
				mutex_exit(&mi->mi_lock);
				mutex_exit(&sp->s_lock);
			}
		}

		/*
		 * Check if we need to get the security information.
		 */
		mutex_enter(&mi->mi_lock);
		if ((mi->mi_recovflags & MI4R_NEED_SECINFO) &&
		    !(mi->mi_flags & MI4_RECOV_FAIL)) {
			mutex_exit(&mi->mi_lock);
			(void) nfs_rw_enter_sig(&mi->mi_recovlock,
			    RW_WRITER, 0);
			error = nfs4_secinfo_recov(recovp->rc_mi,
			    recovp->rc_vp1, recovp->rc_vp2);
			/*
			 * If error, nothing more can be done, stop
			 * the recovery.
			 */
			if (error) {
				mutex_enter(&mi->mi_lock);
				mi->mi_flags |= MI4_RECOV_FAIL;
				mi->mi_error = recovp->rc_error;
				mutex_exit(&mi->mi_lock);
				nfs4_queue_event(RE_WRONGSEC, mi, NULL,
				    error, recovp->rc_vp1, recovp->rc_vp2,
				    0, NULL, 0, TAG_NONE, TAG_NONE, 0, 0);
			}
			nfs_rw_exit(&mi->mi_recovlock);
		} else
			mutex_exit(&mi->mi_lock);

		/*
		 * Check if there's a bad seqid to recover.
		 */
		mutex_enter(&mi->mi_lock);
		if ((mi->mi_recovflags & MI4R_BAD_SEQID) &&
		    !(mi->mi_flags & MI4_RECOV_FAIL)) {
			mutex_exit(&mi->mi_lock);
			(void) nfs_rw_enter_sig(&mi->mi_recovlock,
			    RW_WRITER, 0);
			recov_bad_seqid(recovp);
			nfs_rw_exit(&mi->mi_recovlock);
		} else
			mutex_exit(&mi->mi_lock);

		/*
		 * Next check for recovery that affects the entire
		 * filesystem.
		 */
		if (sp != NULL) {
			mutex_enter(&mi->mi_lock);
			if ((mi->mi_recovflags & MI4R_REOPEN_FILES) &&
			    !(mi->mi_flags & MI4_RECOV_FAIL)) {
				mutex_exit(&mi->mi_lock);
				recov_openfiles(recovp, sp);
			} else
				mutex_exit(&mi->mi_lock);
		}

		/*
		 * Send any queued state recovery requests.
		 */
		mutex_enter(&mi->mi_lock);
		if (sp != NULL &&
		    (mi->mi_recovflags & MI4R_LOST_STATE) &&
		    !(mi->mi_flags & MI4_RECOV_FAIL)) {
			mutex_exit(&mi->mi_lock);
			(void) nfs_rw_enter_sig(&mi->mi_recovlock,
			    RW_WRITER, 0);
			nfs4_resend_lost_rqsts(recovp, sp);
			/* Only clear the flag once the queue has drained. */
			if (list_head(&mi->mi_lost_state) == NULL) {
				/* done */
				mutex_enter(&mi->mi_lock);
				mi->mi_recovflags &= ~MI4R_LOST_STATE;
				mutex_exit(&mi->mi_lock);
			}
			nfs_rw_exit(&mi->mi_recovlock);
		} else {
			mutex_exit(&mi->mi_lock);
		}

		/*
		 * See if there is anything more to do.  If not, announce
		 * that we are done and exit.
		 *
		 * Need mi_recovlock to keep 'sp' valid.  Must grab
		 * mi_recovlock before mi_lock to preserve lock ordering.
		 */
		(void) nfs_rw_enter_sig(&mi->mi_recovlock, RW_READER, 0);
		mutex_enter(&mi->mi_lock);
		if ((mi->mi_recovflags & ~MI4R_SRV_REBOOT) == 0 ||
		    (mi->mi_flags & MI4_RECOV_FAIL)) {
			list_t local_lost_state;
			nfs4_lost_rqst_t *lrp;

			/*
			 * We need to remove the lost requests before we
			 * unmark the mi as no longer doing recovery to
			 * avoid a race with a new thread putting new lost
			 * requests on the same mi (and the going away
			 * thread would remove the new lost requests).
			 *
			 * Move the lost requests to a local list since
			 * nfs4_remove_lost_rqst() drops mi_lock, and
			 * dropping the mi_lock would make our check to
			 * see if recovery is done no longer valid.
			 */
			list_create(&local_lost_state,
			    sizeof (nfs4_lost_rqst_t),
			    offsetof(nfs4_lost_rqst_t, lr_node));
			list_move_tail(&local_lost_state, &mi->mi_lost_state);

			done = 1;
			mutex_exit(&mi->mi_lock);
			/*
			 * Now officially free the "moved"
			 * lost requests.
			 */
			while ((lrp = list_head(&local_lost_state)) != NULL) {
				list_remove(&local_lost_state, lrp);
				nfs4_free_lost_rqst(lrp, sp);
			}
			list_destroy(&local_lost_state);
		} else
			mutex_exit(&mi->mi_lock);
		nfs_rw_exit(&mi->mi_recovlock);

		/*
		 * If the filesystem has been forcibly unmounted, there is
		 * probably no point in retrying immediately.  Furthermore,
		 * there might be user processes waiting for a chance to
		 * queue up "lost state" requests, so that they can exit.
		 * So pause here for a moment.  Same logic for zone shutdown.
		 */
		if (!done && FS_OR_ZONE_GONE4(mi->mi_vfsp)) {
			mutex_enter(&mi->mi_lock);
			cv_broadcast(&mi->mi_failover_cv);
			mutex_exit(&mi->mi_lock);
			delay(SEC_TO_TICK(nfs4_unmount_delay));
		}

	} while (!done);


	if (sp != NULL)
		nfs4_server_rele(sp);

	/*
	 * Return all recalled delegations
	 */
	nfs4_dlistclean();

	/* recov_done() requires mi_lock to be held (it ASSERTs so). */
	mutex_enter(&mi->mi_lock);
	recov_done(mi, recovp);
	mi->mi_in_recovery--;

	/*
	 * Free up resources that were allocated for us.
	 */
	if (recovp->rc_vp1 != NULL)
		VN_RELE(recovp->rc_vp1);
	if (recovp->rc_vp2 != NULL)
		VN_RELE(recovp->rc_vp2);
	VFS_RELE(mi->mi_vfsp);
	cv_broadcast(&mi->mi_cv_in_recov);
	mutex_exit(&mi->mi_lock);

	kmem_free(recovp, sizeof (recov_info_t));
	/* CALLB_CPR_EXIT() is invoked with cpr_lock held. */
	mutex_enter(&cpr_lock);
	CALLB_CPR_EXIT(&cpr_info);
	mutex_destroy(&cpr_lock);
	zthread_exit();
}

/*
 * Log the end of recovery and notify any waiting threads.
 *
 * Queues an RE_END event, clears mi_recovthread, MI4_RECOV_ACTIV and
 * MI4R_SRV_REBOOT, and broadcasts on mi_failover_cv (the condition
 * variable wait_for_recovery() sleeps on).
 *
 * The caller must hold mi->mi_lock.
 */

static void
recov_done(mntinfo4_t *mi, recov_info_t *recovp)
{

	ASSERT(MUTEX_HELD(&mi->mi_lock));

	nfs4_queue_event(RE_END, mi, NULL, 0, recovp->rc_vp1,
	    recovp->rc_vp2, 0, NULL, 0, TAG_NONE, TAG_NONE, 0, 0);
	mi->mi_recovthread = NULL;
	mi->mi_flags &= ~MI4_RECOV_ACTIV;
	mi->mi_recovflags &= ~MI4R_SRV_REBOOT;
	cv_broadcast(&mi->mi_failover_cv);
}

/*
 * State-specific recovery routines, by state.
 */

/*
 * Failover.
 *
 * Replaces *spp with a reference to the new server, which must
 * eventually be freed.
1584 */ 1585 1586 static void 1587 recov_newserver(recov_info_t *recovp, nfs4_server_t **spp, bool_t *recov_fail) 1588 { 1589 mntinfo4_t *mi = recovp->rc_mi; 1590 servinfo4_t *svp = NULL; 1591 nfs4_server_t *osp = *spp; 1592 CLIENT *cl; 1593 enum clnt_stat status; 1594 struct timeval tv; 1595 int error; 1596 int oncethru = 0; 1597 rnode4_t *rp; 1598 int index; 1599 nfs_fh4 fh; 1600 char *snames; 1601 size_t len; 1602 1603 (void) nfs_rw_enter_sig(&mi->mi_recovlock, RW_WRITER, 0); 1604 1605 tv.tv_sec = 2; 1606 tv.tv_usec = 0; 1607 1608 #ifdef lint 1609 /* 1610 * Lint can't follow the logic, so thinks that snames and len 1611 * can be used before being set. They can't, but lint can't 1612 * figure it out. To address the lint warning, initialize 1613 * snames and len for lint. 1614 */ 1615 snames = NULL; 1616 len = 0; 1617 #endif 1618 1619 /* 1620 * Ping the null NFS procedure of every server in 1621 * the list until one responds. We always start 1622 * at the head of the list and always skip the one 1623 * that is current, since it's caused us a problem. 
1624 */ 1625 while (svp == NULL) { 1626 for (svp = mi->mi_servers; svp; svp = svp->sv_next) { 1627 1628 mutex_enter(&mi->mi_lock); 1629 if (FS_OR_ZONE_GONE4(mi->mi_vfsp)) { 1630 mi->mi_flags |= MI4_RECOV_FAIL; 1631 mutex_exit(&mi->mi_lock); 1632 (void) nfs_rw_exit(&mi->mi_recovlock); 1633 *recov_fail = TRUE; 1634 if (oncethru) 1635 kmem_free(snames, len); 1636 return; 1637 } 1638 mutex_exit(&mi->mi_lock); 1639 1640 (void) nfs_rw_enter_sig(&svp->sv_lock, RW_READER, 0); 1641 if (svp->sv_flags & SV4_NOTINUSE) { 1642 nfs_rw_exit(&svp->sv_lock); 1643 continue; 1644 } 1645 nfs_rw_exit(&svp->sv_lock); 1646 1647 if (!oncethru && svp == mi->mi_curr_serv) 1648 continue; 1649 1650 error = clnt_tli_kcreate(svp->sv_knconf, &svp->sv_addr, 1651 NFS_PROGRAM, NFS_V4, 0, 1, CRED(), &cl); 1652 if (error) 1653 continue; 1654 1655 if (!(mi->mi_flags & MI4_INT)) 1656 cl->cl_nosignal = TRUE; 1657 status = CLNT_CALL(cl, RFS_NULL, xdr_void, NULL, 1658 xdr_void, NULL, tv); 1659 if (!(mi->mi_flags & MI4_INT)) 1660 cl->cl_nosignal = FALSE; 1661 AUTH_DESTROY(cl->cl_auth); 1662 CLNT_DESTROY(cl); 1663 if (status == RPC_SUCCESS) { 1664 nfs4_queue_event(RE_FAILOVER, mi, 1665 svp == mi->mi_curr_serv ? 
NULL : 1666 svp->sv_hostname, 0, NULL, NULL, 0, 1667 NULL, 0, TAG_NONE, TAG_NONE, 0, 0); 1668 break; 1669 } 1670 } 1671 1672 if (svp == NULL) { 1673 if (!oncethru) { 1674 snames = nfs4_getsrvnames(mi, &len); 1675 nfs4_queue_fact(RF_SRVS_NOT_RESPOND, mi, 1676 0, 0, 0, FALSE, snames, 0, NULL); 1677 oncethru = 1; 1678 } 1679 delay(hz); 1680 } 1681 } 1682 1683 if (oncethru) { 1684 nfs4_queue_fact(RF_SRVS_OK, mi, 0, 0, 0, FALSE, snames, 1685 0, NULL); 1686 kmem_free(snames, len); 1687 } 1688 1689 #if DEBUG 1690 (void) nfs_rw_enter_sig(&svp->sv_lock, RW_READER, 0); 1691 ASSERT((svp->sv_flags & SV4_NOTINUSE) == 0); 1692 nfs_rw_exit(&svp->sv_lock); 1693 #endif 1694 1695 mutex_enter(&mi->mi_lock); 1696 mi->mi_recovflags &= ~MI4R_NEED_NEW_SERVER; 1697 if (svp != mi->mi_curr_serv) { 1698 servinfo4_t *osvp = mi->mi_curr_serv; 1699 1700 mutex_exit(&mi->mi_lock); 1701 1702 /* 1703 * Update server-dependent fields in the root vnode. 1704 */ 1705 index = rtable4hash(mi->mi_rootfh); 1706 rw_enter(&rtable4[index].r_lock, RW_WRITER); 1707 1708 rp = r4find(&rtable4[index], mi->mi_rootfh, mi->mi_vfsp); 1709 if (rp != NULL) { 1710 NFS4_DEBUG(nfs4_client_failover_debug, (CE_NOTE, 1711 "recov_newserver: remapping %s", rnode4info(rp))); 1712 mutex_enter(&rp->r_statelock); 1713 rp->r_server = svp; 1714 PURGE_ATTRCACHE4_LOCKED(rp); 1715 mutex_exit(&rp->r_statelock); 1716 (void) nfs4_free_data_reclaim(rp); 1717 nfs4_purge_rddir_cache(RTOV4(rp)); 1718 rw_exit(&rtable4[index].r_lock); 1719 NFS4_DEBUG(nfs4_client_failover_debug, (CE_NOTE, 1720 "recov_newserver: done with %s", 1721 rnode4info(rp))); 1722 VN_RELE(RTOV4(rp)); 1723 } else 1724 rw_exit(&rtable4[index].r_lock); 1725 (void) dnlc_purge_vfsp(mi->mi_vfsp, 0); 1726 1727 mutex_enter(&mi->mi_lock); 1728 mi->mi_recovflags |= MI4R_REOPEN_FILES | MI4R_REMAP_FILES; 1729 if (recovp->rc_srv_reboot) 1730 mi->mi_recovflags |= MI4R_SRV_REBOOT; 1731 mi->mi_curr_serv = svp; 1732 mi->mi_failover++; 1733 mi->mi_flags &= ~MI4_BADOWNER_DEBUG; 1734 
mutex_exit(&mi->mi_lock); 1735 1736 (void) nfs_rw_enter_sig(&svp->sv_lock, RW_READER, 0); 1737 fh.nfs_fh4_len = svp->sv_fhandle.fh_len; 1738 fh.nfs_fh4_val = svp->sv_fhandle.fh_buf; 1739 sfh4_update(mi->mi_rootfh, &fh); 1740 fh.nfs_fh4_len = svp->sv_pfhandle.fh_len; 1741 fh.nfs_fh4_val = svp->sv_pfhandle.fh_buf; 1742 sfh4_update(mi->mi_srvparentfh, &fh); 1743 nfs_rw_exit(&svp->sv_lock); 1744 1745 *spp = nfs4_move_mi(mi, osvp, svp); 1746 if (osp != NULL) 1747 nfs4_server_rele(osp); 1748 } else 1749 mutex_exit(&mi->mi_lock); 1750 (void) nfs_rw_exit(&mi->mi_recovlock); 1751 } 1752 1753 /* 1754 * Clientid. 1755 */ 1756 1757 static void 1758 recov_clientid(recov_info_t *recovp, nfs4_server_t *sp) 1759 { 1760 mntinfo4_t *mi = recovp->rc_mi; 1761 int error = 0; 1762 int still_stale; 1763 int need_new_s; 1764 1765 ASSERT(sp != NULL); 1766 1767 /* 1768 * Acquire the recovery lock and then verify that the clientid 1769 * still needs to be recovered. (Note that s_recovlock is supposed 1770 * to be acquired before s_lock.) Since the thread holds the 1771 * recovery lock, no other thread will recover the clientid. 1772 */ 1773 (void) nfs_rw_enter_sig(&sp->s_recovlock, RW_WRITER, 0); 1774 (void) nfs_rw_enter_sig(&mi->mi_recovlock, RW_WRITER, 0); 1775 mutex_enter(&sp->s_lock); 1776 still_stale = ((sp->s_flags & N4S_CLIENTID_SET) == 0); 1777 mutex_exit(&sp->s_lock); 1778 1779 if (still_stale) { 1780 nfs4_error_t n4e; 1781 1782 nfs4_error_zinit(&n4e); 1783 nfs4setclientid(mi, kcred, TRUE, &n4e); 1784 error = n4e.error; 1785 if (error != 0) { 1786 1787 /* 1788 * nfs4setclientid may have set MI4R_NEED_NEW_SERVER, 1789 * if so, just return and let recov_thread drive 1790 * failover. 
1791 */ 1792 mutex_enter(&mi->mi_lock); 1793 need_new_s = mi->mi_recovflags & MI4R_NEED_NEW_SERVER; 1794 mutex_exit(&mi->mi_lock); 1795 1796 if (need_new_s) { 1797 nfs_rw_exit(&mi->mi_recovlock); 1798 nfs_rw_exit(&sp->s_recovlock); 1799 return; 1800 } 1801 1802 nfs4_queue_event(RE_CLIENTID, mi, NULL, n4e.error, NULL, 1803 NULL, n4e.stat, NULL, 0, TAG_NONE, TAG_NONE, 0, 0); 1804 mutex_enter(&mi->mi_lock); 1805 mi->mi_flags |= MI4_RECOV_FAIL; 1806 mi->mi_error = recovp->rc_error; 1807 mutex_exit(&mi->mi_lock); 1808 /* don't destroy the nfs4_server, let umount do it */ 1809 } 1810 } 1811 1812 if (error == 0) { 1813 mutex_enter(&mi->mi_lock); 1814 mi->mi_recovflags &= ~MI4R_NEED_CLIENTID; 1815 /* 1816 * If still_stale isn't true, then another thread already 1817 * recovered the clientid. And that thread that set the 1818 * clientid will have initiated reopening files on all the 1819 * filesystems for the server, so we should not initiate 1820 * reopening for this filesystem here. 1821 */ 1822 if (still_stale) { 1823 mi->mi_recovflags |= MI4R_REOPEN_FILES; 1824 if (recovp->rc_srv_reboot) 1825 mi->mi_recovflags |= MI4R_SRV_REBOOT; 1826 } 1827 mutex_exit(&mi->mi_lock); 1828 } 1829 1830 nfs_rw_exit(&mi->mi_recovlock); 1831 1832 if (error != 0) { 1833 nfs_rw_exit(&sp->s_recovlock); 1834 mutex_enter(&mi->mi_lock); 1835 if ((mi->mi_flags & MI4_RECOV_FAIL) == 0) 1836 delay(SEC_TO_TICK(recov_err_delay)); 1837 mutex_exit(&mi->mi_lock); 1838 } else { 1839 mntinfo4_t **milist; 1840 mntinfo4_t *tmi; 1841 int nummi, i; 1842 1843 /* 1844 * Initiate recovery of open files for other filesystems. 1845 * We create an array of filesystems, rather than just 1846 * walking the filesystem list, to avoid deadlock issues 1847 * with s_lock and mi_recovlock. 
1848 */ 1849 milist = make_milist(sp, &nummi); 1850 for (i = 0; i < nummi; i++) { 1851 tmi = milist[i]; 1852 if (tmi != mi) { 1853 (void) nfs_rw_enter_sig(&tmi->mi_recovlock, 1854 RW_READER, 0); 1855 start_recovery_action(NR_OPENFILES, TRUE, tmi, 1856 NULL, NULL); 1857 nfs_rw_exit(&tmi->mi_recovlock); 1858 } 1859 } 1860 free_milist(milist, nummi); 1861 1862 nfs_rw_exit(&sp->s_recovlock); 1863 } 1864 } 1865 1866 /* 1867 * Return an array of filesystems associated with the given server. The 1868 * caller should call free_milist() to free the references and memory. 1869 */ 1870 1871 static mntinfo4_t ** 1872 make_milist(nfs4_server_t *sp, int *nummip) 1873 { 1874 int nummi, i; 1875 mntinfo4_t **milist; 1876 mntinfo4_t *tmi; 1877 1878 mutex_enter(&sp->s_lock); 1879 nummi = 0; 1880 for (tmi = sp->mntinfo4_list; tmi != NULL; tmi = tmi->mi_clientid_next) 1881 nummi++; 1882 1883 milist = kmem_alloc(nummi * sizeof (mntinfo4_t *), KM_NOSLEEP); 1884 1885 for (i = 0, tmi = sp->mntinfo4_list; tmi != NULL; i++, 1886 tmi = tmi->mi_clientid_next) { 1887 milist[i] = tmi; 1888 VFS_HOLD(tmi->mi_vfsp); 1889 } 1890 mutex_exit(&sp->s_lock); 1891 1892 *nummip = nummi; 1893 return (milist); 1894 } 1895 1896 /* 1897 * Free the filesystem list created by make_milist(). 1898 */ 1899 1900 static void 1901 free_milist(mntinfo4_t **milist, int nummi) 1902 { 1903 mntinfo4_t *tmi; 1904 int i; 1905 1906 for (i = 0; i < nummi; i++) { 1907 tmi = milist[i]; 1908 VFS_RELE(tmi->mi_vfsp); 1909 } 1910 kmem_free(milist, nummi * sizeof (mntinfo4_t *)); 1911 } 1912 1913 /* 1914 * Filehandle 1915 */ 1916 1917 /* 1918 * Lookup the filehandle for the given vnode and update the rnode if it has 1919 * changed. 1920 * 1921 * Errors: 1922 * - if the filehandle could not be updated because of an error that 1923 * requires further recovery, initiate that recovery and return. 1924 * - if the filehandle could not be updated because of a signal, pretend we 1925 * succeeded and let someone else deal with it. 
* - if the filehandle could not be updated and the filesystem has been
 *   forcibly unmounted, pretend we succeeded, and let the caller deal with
 *   the forced unmount (to retry or not to retry, that is the question).
 * - if the filehandle could not be updated because of some other error,
 *   mark the rnode bad and return.
 *
 * Only one thread at a time remaps a given rnode: R4RECEXPFH is set in
 * r_flags for the duration, and any other caller that finds it set waits
 * on r_cv until it clears and then returns without doing the remap
 * itself.  Nothing is done if the rnode is already marked R4RECOVERR.
 *
 * 'action' is only consulted to log an RE_BADHANDLE event when it is
 * NR_BADHANDLE.
 */
static void
recov_filehandle(nfs4_recov_t action, mntinfo4_t *mi, vnode_t *vp)
{
	rnode4_t *rp = VTOR4(vp);
	nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };
	bool_t needrecov;

	mutex_enter(&rp->r_statelock);

	if (rp->r_flags & R4RECOVERR) {
		mutex_exit(&rp->r_statelock);
		return;
	}

	/*
	 * If someone else is updating the filehandle, wait for them to
	 * finish and then let our caller retry.
	 */
	if (rp->r_flags & R4RECEXPFH) {
		while (rp->r_flags & R4RECEXPFH) {
			cv_wait(&rp->r_cv, &rp->r_statelock);
		}
		mutex_exit(&rp->r_statelock);
		return;
	}
	/* Claim the remap; cleared (with a broadcast) at the bottom. */
	rp->r_flags |= R4RECEXPFH;
	mutex_exit(&rp->r_statelock);

	if (action == NR_BADHANDLE) {
		/* shouldn't happen */
		nfs4_queue_event(RE_BADHANDLE, mi, NULL, 0,
		    vp, NULL, 0, NULL, 0, TAG_NONE, TAG_NONE, 0, 0);
	}

	nfs4_remap_file(mi, vp, 0, &e);
	needrecov = nfs4_needs_recovery(&e, FALSE, mi->mi_vfsp);

	/*
	 * If we get BADHANDLE or FHEXPIRED in their handler, something is
	 * broken.  Don't try to recover, just mark the file dead.
	 */
	if (needrecov && e.error == 0 &&
	    (e.stat == NFS4ERR_BADHANDLE || e.stat == NFS4ERR_FHEXPIRED))
		needrecov = FALSE;
	if (needrecov) {
		(void) nfs4_start_recovery(&e, mi, vp,
		    NULL, NULL, NULL, OP_LOOKUP, NULL);
	} else if (e.error != EINTR &&
	    !NFS4_FRC_UNMT_ERR(e.error, mi->mi_vfsp) &&
	    (e.error != 0 || e.stat != NFS4_OK)) {
		nfs4_recov_fh_fail(vp, e.error, e.stat);
		/*
		 * Don't set r_error to ESTALE.  Higher-level code (e.g.,
		 * cstatat_getvp()) retries on ESTALE, which would cause
		 * an infinite loop.
		 */
	}

	/* Release the remap claim and wake any waiters. */
	mutex_enter(&rp->r_statelock);
	rp->r_flags &= ~R4RECEXPFH;
	cv_broadcast(&rp->r_cv);
	mutex_exit(&rp->r_statelock);
}

/*
 * Stale Filehandle
 */

/*
 * A stale filehandle can happen when an individual file has
 * been removed, or when an entire filesystem has been taken
 * offline.  To distinguish these cases, we do this:
 * - if a GETATTR with the current filehandle is okay, we do
 *   nothing (this can happen with two-filehandle ops)
 * - if the GETATTR fails, but a GETATTR of the root filehandle
 *   succeeds, mark the rnode with R4STALE, which will stop use
 * - if the GETATTR fails, and a GETATTR of the root filehandle
 *   also fails, we consider the problem filesystem-wide, so:
 *   - if we can failover, we should
 *   - if we can't failover, we should mark both the original
 *     vnode and the root bad
 *
 * Nothing is done if the rnode is already marked R4RECOVERR or R4STALE.
 * Any hold taken on the root vnode here is released before returning.
 */
static void
recov_stale(mntinfo4_t *mi, vnode_t *vp)
{
	rnode4_t *rp = VTOR4(vp);
	vnode_t *rootvp = NULL;
	nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };
	nfs4_ga_res_t gar;
	char *fail_msg = "failed to recover from NFS4ERR_STALE";
	bool_t needrecov;

	mutex_enter(&rp->r_statelock);

	if (rp->r_flags & R4RECOVERR) {
		mutex_exit(&rp->r_statelock);
		NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
		    "recov_stale: already marked dead, rp %s",
		    rnode4info(rp)));
		return;
	}

	if (rp->r_flags & R4STALE) {
		mutex_exit(&rp->r_statelock);
		NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
		    "recov_stale: already marked stale, rp %s",
		    rnode4info(rp)));
		return;
	}

	mutex_exit(&rp->r_statelock);

	/* Try a GETATTR on this vnode */
	nfs4_getattr_otw_norecovery(vp, &gar, &e, CRED(), 0);

	/*
	 * Handle non-STALE recoverable errors
	 */
	needrecov = nfs4_needs_recovery(&e, FALSE, vp->v_vfsp);
	if (needrecov && (e.error != 0 || e.stat != NFS4ERR_STALE)) {
		(void) nfs4_start_recovery(&e, mi, vp,
		    NULL, NULL, NULL, OP_GETATTR, NULL);
		NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
		    "recov_stale: error=%d, stat=%d seen on rp %s",
		    e.error, e.stat, rnode4info(rp)));
		goto out;
	}

	/* Are things OK for this vnode? */
	if (!e.error && e.stat == NFS4_OK) {
		NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
		    "recov_stale: file appears fine, rp %s",
		    rnode4info(rp)));
		goto out;
	}

	/* Did we get an unrelated non-recoverable error? */
	if (e.error || e.stat != NFS4ERR_STALE) {
		nfs4_fail_recov(vp, fail_msg, e.error, e.stat);
		NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
		    "recov_stale: unrelated fatal error, rp %s",
		    rnode4info(rp)));
		goto out;
	}

	/*
	 * If we don't appear to be dealing with the root node, find it.
	 */
	if ((vp->v_flag & VROOT) == 0) {
		nfs4_error_zinit(&e);
		e.error = VFS_ROOT(vp->v_vfsp, &rootvp);
		if (e.error) {
			nfs4_fail_recov(vp, fail_msg, 0, NFS4ERR_STALE);
			NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
			    "recov_stale: can't find root node for rp %s",
			    rnode4info(rp)));
			goto out;
		}
	}

	/* Try a GETATTR on the root vnode */
	if (rootvp != NULL) {
		nfs4_error_zinit(&e);
		nfs4_getattr_otw_norecovery(rootvp, &gar, &e, CRED(), 0);

		/* Try recovery? */
		if (e.error != 0 || e.stat != NFS4ERR_STALE) {
			needrecov = nfs4_needs_recovery(&e, FALSE, vp->v_vfsp);
			if (needrecov) {
				(void) nfs4_start_recovery(&e,
				    mi, rootvp, NULL, NULL, NULL,
				    OP_GETATTR, NULL);
				NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
				    "recov_stale: error=%d, stat=%d seen "
				    "on rp %s", e.error, e.stat,
				    rnode4info(rp)));
			}
		}

		/*
		 * Check to see if a failover attempt is warranted
		 * NB: nfs4_try_failover doesn't check for STALE
		 * because recov_stale gets a shot first.  Now that
		 * recov_stale has failed, go ahead and try failover.
		 *
		 * If the getattr on the root filehandle was successful,
		 * then mark recovery as failed for 'vp' and exit.
		 */
		if (nfs4_try_failover(&e) == 0 && e.stat != NFS4ERR_STALE) {
			/*
			 * pass the original error to fail_recov, not
			 * the one from trying the root vnode.
			 */
			nfs4_fail_recov(vp, fail_msg, 0, NFS4ERR_STALE);
			NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
			    "recov_stale: root node OK, marking "
			    "dead rp %s", rnode4info(rp)));
			goto out;
		}
	}

	/*
	 * Here, we know that both the original file and the
	 * root filehandle (which may be the same) are stale.
	 * We want to fail over if we can, and if we can't, we
	 * want to mark everything in sight bad.
	 */
	if (FAILOVER_MOUNT4(mi)) {
		mutex_enter(&mi->mi_lock);
		mi->mi_recovflags |= MI4R_NEED_NEW_SERVER;
		NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
		    "recov_stale: failing over due to rp %s",
		    rnode4info(rp)));
		mutex_exit(&mi->mi_lock);
	} else {
		rnode4_t *rootrp;
		servinfo4_t *svp;

		/*
		 * Can't fail over, so mark things dead.
		 *
		 * If rootvp is set, we know we have a distinct
		 * non-root vnode which can be marked dead in
		 * the usual way.
		 *
		 * Then we want to mark the root vnode dead.
		 * Note that if rootvp wasn't set, our vp is
		 * actually the root vnode.
		 */
		if (rootvp != NULL) {
			NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
			    "recov_stale: can't fail over, marking dead rp %s",
			    rnode4info(rp)));
			nfs4_fail_recov(vp, fail_msg, 0, NFS4ERR_STALE);
		} else {
			/* Hold taken so the VN_RELE at 'out' balances. */
			rootvp = vp;
			VN_HOLD(rootvp);
		}

		/*
		 * Mark root dead, but quietly - since
		 * the root rnode is frequently recreated,
		 * we can encounter this at every access.
		 * Also mark recovery as failed on this VFS.
		 */
		rootrp = VTOR4(rootvp);
		NFS4_DEBUG(nfs4_client_recov_debug, (CE_CONT,
		    "recov_stale: marking dead root rp %s",
		    rnode4info(rootrp)));
		mutex_enter(&rootrp->r_statelock);
		rootrp->r_flags |= (R4RECOVERR | R4STALE);
		rootrp->r_error = ESTALE;
		mutex_exit(&rootrp->r_statelock);
		mutex_enter(&mi->mi_lock);
		mi->mi_error = ESTALE;
		mutex_exit(&mi->mi_lock);

		/* Flag the current server's root as stale. */
		svp = mi->mi_curr_serv;
		(void) nfs_rw_enter_sig(&svp->sv_lock, RW_WRITER, 0);
		svp->sv_flags |= SV4_ROOT_STALE;
		nfs_rw_exit(&svp->sv_lock);
	}

out:
	if (rootvp)
		VN_RELE(rootvp);
}

/*
 * Locks.
 */

/*
 * Reclaim all the active (acquired) locks for the given file.
 * If a process lost a lock, the process is sent a SIGLOST.  This is not
 * considered an error.
 *
 * Return values:
 * Errors and status are returned via the nfs4_error_t parameter
 * If an error indicates that recovery is needed, the caller is responsible
 * for dealing with it.
2213 */ 2214 2215 static void 2216 relock_file(vnode_t *vp, mntinfo4_t *mi, nfs4_error_t *ep, 2217 fattr4_change pre_change) 2218 { 2219 locklist_t *locks, *llp; 2220 rnode4_t *rp; 2221 2222 ASSERT(ep != NULL); 2223 nfs4_error_zinit(ep); 2224 2225 if (VTOMI4(vp)->mi_flags & MI4_LLOCK) 2226 return; 2227 2228 nfs4_flush_lock_owners(VTOR4(vp)); 2229 2230 /* 2231 * If we get an error that requires recovery actions, just bail out 2232 * and let the top-level recovery code handle it. 2233 * 2234 * If we get some other error, kill the process that owned the lock 2235 * and mark its remaining locks (if any) as belonging to NOPID, so 2236 * that we don't make any more reclaim requests for that process. 2237 */ 2238 2239 rp = VTOR4(vp); 2240 locks = flk_active_locks_for_vp(vp); 2241 for (llp = locks; llp != NULL; llp = llp->ll_next) { 2242 int did_reclaim = 1; 2243 2244 ASSERT(llp->ll_vp == vp); 2245 if (llp->ll_flock.l_pid == NOPID) 2246 continue; 2247 reclaim_one_lock(vp, &llp->ll_flock, ep, &did_reclaim); 2248 /* 2249 * If we need to restart recovery, stop processing the 2250 * list. Some errors would be recoverable under other 2251 * circumstances, but if they happen here we just give up 2252 * on the lock. 2253 */ 2254 if (nfs4_needs_recovery(ep, TRUE, vp->v_vfsp)) { 2255 if (ep->error != 0) 2256 break; 2257 if (!nfs4_recov_marks_dead(ep->stat)) 2258 break; 2259 } 2260 /* 2261 * In case the server isn't offering us a grace period, or 2262 * if we missed it, we might have opened & locked from scratch, 2263 * rather than reopened/reclaimed. 2264 * We need to ensure that the object hadn't been otherwise 2265 * changed during this time, by comparing the changeinfo. 2266 * We get passed the changeinfo from before the reopen by our 2267 * caller, in pre_change. 2268 * The changeinfo from after the reopen is in rp->r_change, 2269 * courtesy of the GETATTR in the reopen. 2270 * If they're different, then the file has changed, and we 2271 * have to SIGLOST the app. 
2272 */ 2273 if (ep->error == 0 && ep->stat == NFS4_OK && !did_reclaim) { 2274 mutex_enter(&rp->r_statelock); 2275 if (pre_change != rp->r_change) 2276 ep->stat = NFS4ERR_NO_GRACE; 2277 mutex_exit(&rp->r_statelock); 2278 } 2279 if (ep->error != 0 || ep->stat != NFS4_OK) { 2280 if (ep->error != 0) 2281 nfs4_queue_event(RE_FAIL_RELOCK, mi, 2282 NULL, ep->error, vp, NULL, 0, NULL, 2283 llp->ll_flock.l_pid, TAG_NONE, TAG_NONE, 2284 0, 0); 2285 else 2286 nfs4_queue_event(RE_FAIL_RELOCK, mi, 2287 NULL, 0, vp, NULL, ep->stat, NULL, 2288 llp->ll_flock.l_pid, TAG_NONE, TAG_NONE, 2289 0, 0); 2290 nfs4_send_siglost(llp->ll_flock.l_pid, mi, vp, TRUE, 2291 ep->error, ep->stat); 2292 relock_skip_pid(llp, llp->ll_flock.l_pid); 2293 2294 /* Reinitialize the nfs4_error and continue */ 2295 nfs4_error_zinit(ep); 2296 } 2297 } 2298 2299 if (locks != NULL) 2300 flk_free_locklist(locks); 2301 } 2302 2303 /* 2304 * Reclaim the given lock. 2305 * If the lock can't be reclaimed, the process is sent SIGLOST, but this is 2306 * not considered an error. 2307 * 2308 * Errors are returned via the nfs4_error_t parameter. 2309 */ 2310 static void 2311 reclaim_one_lock(vnode_t *vp, flock64_t *flk, nfs4_error_t *ep, 2312 int *did_reclaimp) 2313 { 2314 cred_t *cr; 2315 rnode4_t *rp = VTOR4(vp); 2316 2317 cr = pid_to_cr(flk->l_pid); 2318 if (cr == NULL) { 2319 nfs4_error_zinit(ep); 2320 ep->error = ESRCH; 2321 return; 2322 } 2323 2324 do { 2325 mutex_enter(&rp->r_statelock); 2326 if (rp->r_flags & R4RECOVERR) { 2327 /* 2328 * This shouldn't affect other reclaims, so don't 2329 * return an error. 
2330 */ 2331 mutex_exit(&rp->r_statelock); 2332 break; 2333 } 2334 mutex_exit(&rp->r_statelock); 2335 2336 nfs4frlock(NFS4_LCK_CTYPE_RECLAIM, vp, F_SETLK, flk, 2337 FREAD|FWRITE, 0, cr, ep, NULL, did_reclaimp); 2338 if (ep->error == 0 && ep->stat == NFS4ERR_FHEXPIRED) 2339 start_recovery_action(NR_FHEXPIRED, TRUE, VTOMI4(vp), 2340 vp, NULL); 2341 } while (ep->error == 0 && ep->stat == NFS4ERR_FHEXPIRED); 2342 2343 crfree(cr); 2344 } 2345 2346 /* 2347 * Open files. 2348 */ 2349 2350 /* 2351 * Verifies if the nfsstat4 is a valid error for marking this vnode dead. 2352 * Returns 1 if the error is valid; 0 otherwise. 2353 */ 2354 static int 2355 nfs4_valid_recov_err_for_vp(vnode_t *vp, nfsstat4 stat) 2356 { 2357 /* 2358 * We should not be marking non-regular files as dead, 2359 * except in very rare cases (eg: BADHANDLE or NFS4ERR_BADNAME). 2360 */ 2361 if (vp->v_type != VREG && stat != NFS4ERR_BADHANDLE && 2362 stat != NFS4ERR_BADNAME) 2363 return (0); 2364 2365 return (1); 2366 } 2367 2368 /* 2369 * Failed attempting to recover a filehandle. If 'stat' is valid for 'vp', 2370 * then mark the object dead. Since we've had to do a lookup for 2371 * filehandle recovery, we will mark the object dead if we got NOENT. 2372 */ 2373 static void 2374 nfs4_recov_fh_fail(vnode_t *vp, int error, nfsstat4 stat) 2375 { 2376 ASSERT(vp != NULL); 2377 2378 if ((error == 0) && (stat != NFS4ERR_NOENT) && 2379 (!nfs4_valid_recov_err_for_vp(vp, stat))) 2380 return; 2381 2382 nfs4_fail_recov(vp, "can't recover filehandle", error, stat); 2383 } 2384 2385 /* 2386 * Recovery from a "shouldn't happen" error. In the long term, we'd like 2387 * to mark only the data structure(s) that provided the bad value as being 2388 * bad. But for now we'll just mark the entire file. 
2389 */ 2390 2391 static void 2392 recov_badstate(recov_info_t *recovp, vnode_t *vp, nfsstat4 stat) 2393 { 2394 ASSERT(vp != NULL); 2395 recov_throttle(recovp, vp); 2396 2397 if (!nfs4_valid_recov_err_for_vp(vp, stat)) 2398 return; 2399 2400 nfs4_fail_recov(vp, "", 0, stat); 2401 } 2402 2403 /* 2404 * Free up the information saved for a lost state request. 2405 */ 2406 static void 2407 nfs4_free_lost_rqst(nfs4_lost_rqst_t *lrp, nfs4_server_t *sp) 2408 { 2409 component4 *filep; 2410 nfs4_open_stream_t *osp; 2411 int have_sync_lock; 2412 2413 NFS4_DEBUG(nfs4_lost_rqst_debug, 2414 (CE_NOTE, "nfs4_free_lost_rqst:")); 2415 2416 switch (lrp->lr_op) { 2417 case OP_OPEN: 2418 filep = &lrp->lr_ofile; 2419 if (filep->utf8string_val) { 2420 kmem_free(filep->utf8string_val, filep->utf8string_len); 2421 filep->utf8string_val = NULL; 2422 } 2423 break; 2424 case OP_DELEGRETURN: 2425 nfs4delegreturn_cleanup(VTOR4(lrp->lr_vp), sp); 2426 break; 2427 case OP_CLOSE: 2428 osp = lrp->lr_osp; 2429 ASSERT(osp != NULL); 2430 mutex_enter(&osp->os_sync_lock); 2431 have_sync_lock = 1; 2432 if (osp->os_pending_close) { 2433 /* clean up the open file state. 
*/ 2434 osp->os_pending_close = 0; 2435 nfs4close_notw(lrp->lr_vp, osp, &have_sync_lock); 2436 } 2437 if (have_sync_lock) 2438 mutex_exit(&osp->os_sync_lock); 2439 break; 2440 } 2441 2442 lrp->lr_op = 0; 2443 if (lrp->lr_oop != NULL) { 2444 open_owner_rele(lrp->lr_oop); 2445 lrp->lr_oop = NULL; 2446 } 2447 if (lrp->lr_osp != NULL) { 2448 open_stream_rele(lrp->lr_osp, VTOR4(lrp->lr_vp)); 2449 lrp->lr_osp = NULL; 2450 } 2451 if (lrp->lr_lop != NULL) { 2452 lock_owner_rele(lrp->lr_lop); 2453 lrp->lr_lop = NULL; 2454 } 2455 if (lrp->lr_flk != NULL) { 2456 kmem_free(lrp->lr_flk, sizeof (flock64_t)); 2457 lrp->lr_flk = NULL; 2458 } 2459 if (lrp->lr_vp != NULL) { 2460 VN_RELE(lrp->lr_vp); 2461 lrp->lr_vp = NULL; 2462 } 2463 if (lrp->lr_dvp != NULL) { 2464 VN_RELE(lrp->lr_dvp); 2465 lrp->lr_dvp = NULL; 2466 } 2467 if (lrp->lr_cr != NULL) { 2468 crfree(lrp->lr_cr); 2469 lrp->lr_cr = NULL; 2470 } 2471 2472 kmem_free(lrp, sizeof (nfs4_lost_rqst_t)); 2473 } 2474 2475 /* 2476 * Remove any lost state requests and free them. 2477 */ 2478 static void 2479 nfs4_remove_lost_rqsts(mntinfo4_t *mi, nfs4_server_t *sp) 2480 { 2481 nfs4_lost_rqst_t *lrp; 2482 2483 mutex_enter(&mi->mi_lock); 2484 while ((lrp = list_head(&mi->mi_lost_state)) != NULL) { 2485 list_remove(&mi->mi_lost_state, lrp); 2486 mutex_exit(&mi->mi_lock); 2487 nfs4_free_lost_rqst(lrp, sp); 2488 mutex_enter(&mi->mi_lock); 2489 } 2490 mutex_exit(&mi->mi_lock); 2491 } 2492 2493 /* 2494 * Reopen all the files for the given filesystem and reclaim any locks. 
 *
 * sp->s_recovlock is entered as reader and mi->mi_recovlock as writer
 * for the duration of the call.  If every step succeeds,
 * MI4R_REOPEN_FILES and MI4R_REMAP_FILES are cleared from
 * mi->mi_recovflags; otherwise they are left as they were.  When an
 * error that needs recovery is seen, a new recovery is kicked off with
 * nfs4_start_recovery() and the walk over the open files stops.
 */

static void
recov_openfiles(recov_info_t *recovp, nfs4_server_t *sp)
{
	mntinfo4_t *mi = recovp->rc_mi;
	nfs4_opinst_t *reopenlist = NULL, *rep;
	nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };
	open_claim_type4 claim;
	int remap;
	char *fail_msg = "No such file or directory on replica";
	rnode4_t *rp;
	fattr4_change pre_change;

	ASSERT(sp != NULL);

	/*
	 * This check is to allow a 10ms pause before we reopen files
	 * it should allow the server time to have received the CB_NULL
	 * reply and update its internal structures such that (if
	 * applicable) we are granted a delegation on reopened files.
	 */
	mutex_enter(&sp->s_lock);
	if ((sp->s_flags & (N4S_CB_PINGED | N4S_CB_WAITER)) == 0) {
		sp->s_flags |= N4S_CB_WAITER;
		(void) cv_timedwait(&sp->wait_cb_null, &sp->s_lock,
		    (lbolt+drv_usectohz(N4S_CB_PAUSE_TIME)));
	}
	mutex_exit(&sp->s_lock);

	(void) nfs_rw_enter_sig(&sp->s_recovlock, RW_READER, 0);
	(void) nfs_rw_enter_sig(&mi->mi_recovlock, RW_WRITER, 0);

	/*
	 * With volatile filehandles the root is remapped first.  Any
	 * error left in 'e' here causes the reopen pass below to be
	 * skipped.
	 */
	if (NFS4_VOLATILE_FH(mi)) {
		nfs4_remap_root(mi, &e, 0);
		if (nfs4_needs_recovery(&e, FALSE, mi->mi_vfsp)) {
			(void) nfs4_start_recovery(&e, mi, NULL,
			    NULL, NULL, NULL, OP_LOOKUP, NULL);
		}
	}

	/*
	 * Opens are reclaimed (CLAIM_PREVIOUS) only when a server reboot
	 * has been noted, either by our caller or in mi_recovflags.
	 */
	mutex_enter(&mi->mi_lock);
	if (recovp->rc_srv_reboot || (mi->mi_recovflags & MI4R_SRV_REBOOT))
		claim = CLAIM_PREVIOUS;
	else
		claim = CLAIM_NULL;
	mutex_exit(&mi->mi_lock);

	if (e.error == 0 && e.stat == NFS4_OK) {
		/*
		 * Get a snapshot of open files in the filesystem.  Note
		 * that new opens will stall until the server's grace
		 * period is done.
		 */
		reopenlist = r4mkopenlist(mi);

		mutex_enter(&mi->mi_lock);
		remap = mi->mi_recovflags & MI4R_REMAP_FILES;
		mutex_exit(&mi->mi_lock);
		/*
		 * Since we are re-establishing state on the
		 * server, its ok to blow away the saved lost
		 * requests since we don't need to reissue it.
		 */
		nfs4_remove_lost_rqsts(mi, sp);

		for (rep = reopenlist; rep; rep = rep->re_next) {

			if (remap) {
				nfs4_remap_file(mi, rep->re_vp,
				    NFS4_REMAP_CKATTRS, &e);
			}
			if (e.error == ENOENT || e.stat == NFS4ERR_NOENT) {
				/*
				 * The current server does not have the file
				 * that is to be remapped.  This is most
				 * likely due to an improperly maintained
				 * replica.   The files that are missing from
				 * the server will be marked dead and logged
				 * in order to make sys admins aware of the
				 * problem.
				 */
				nfs4_fail_recov(rep->re_vp,
				    fail_msg, e.error, e.stat);
				/*
				 * We've already handled the error so clear it.
				 */
				nfs4_error_zinit(&e);
				continue;
			} else if (e.error == 0 && e.stat == NFS4_OK) {
				int j;

				/*
				 * Sample the change attribute before the
				 * reopen; relock_file() compares it with
				 * the value present afterwards.
				 */
				rp = VTOR4(rep->re_vp);
				mutex_enter(&rp->r_statelock);
				pre_change = rp->r_change;
				mutex_exit(&rp->r_statelock);

				/* Reopen each open stream; stop at first failure */
				for (j = 0; j < rep->re_numosp; j++) {
					nfs4_reopen(rep->re_vp, rep->re_osp[j],
					    &e, claim, FALSE, TRUE);
					if (e.error != 0 || e.stat != NFS4_OK)
						break;
				}
				if (nfs4_needs_recovery(&e, TRUE,
				    mi->mi_vfsp)) {
					(void) nfs4_start_recovery(&e, mi,
					    rep->re_vp, NULL, NULL, NULL,
					    OP_OPEN, NULL);
					break;
				}
			}
#ifdef DEBUG
			if (nfs4_recovdelay > 0)
				delay(MSEC_TO_TICK(nfs4_recovdelay * 1000));
#endif
			/*
			 * pre_change is only read here, i.e. when 'e' is
			 * clean, which implies the branch above set it.
			 */
			if (e.error == 0 && e.stat == NFS4_OK)
				relock_file(rep->re_vp, mi, &e, pre_change);

			if (nfs4_needs_recovery(&e, TRUE, mi->mi_vfsp))
				(void) nfs4_start_recovery(&e, mi,
				    rep->re_vp, NULL, NULL, NULL, OP_LOCK,
				    NULL);
			/* Any remaining error ends the pass over the files */
			if (e.error != 0 || e.stat != NFS4_OK)
				break;
		}

		/*
		 * Check to see if we need to remap files passed in
		 * via the recovery arguments; this will have been
		 * done for open files.  A failure here is not fatal.
		 */
		if (remap) {
			nfs4_error_t ignore;
			nfs4_check_remap(mi, recovp->rc_vp1, NFS4_REMAP_CKATTRS,
			    &ignore);
			nfs4_check_remap(mi, recovp->rc_vp2, NFS4_REMAP_CKATTRS,
			    &ignore);
		}
	}

	/* Only a fully successful pass clears the "to do" flags. */
	if (e.error == 0 && e.stat == NFS4_OK) {
		mutex_enter(&mi->mi_lock);
		mi->mi_recovflags &= ~(MI4R_REOPEN_FILES | MI4R_REMAP_FILES);
		mutex_exit(&mi->mi_lock);
	}

	nfs_rw_exit(&mi->mi_recovlock);
	nfs_rw_exit(&sp->s_recovlock);

	if (reopenlist != NULL)
		r4releopenlist(reopenlist);
}

/*
 * Resend the queued lost state requests for the filesystem.
 *
 * Requests are taken one at a time from the head of mi->mi_lost_state.
 * A request is removed from the list and freed once it has been dealt
 * with; if the resend hits an error that recovery can handle, the
 * request is left queued and we return so it can be redriven later.
 * The caller must hold mi->mi_recovlock as writer (asserted below).
 */

static void
nfs4_resend_lost_rqsts(recov_info_t *recovp, nfs4_server_t *sp)
{
	nfs4_lost_rqst_t *lrp, *tlrp;
	mntinfo4_t *mi = recovp->rc_mi;
	nfs4_error_t n4e;
#ifdef NOTYET
	uint32_t deny_bits = 0;
#endif

	NFS4_DEBUG(nfs4_lost_rqst_debug, (CE_NOTE, "nfs4_resend_lost_rqsts"));

	ASSERT(mi != NULL);
	ASSERT(nfs_rw_lock_held(&mi->mi_recovlock, RW_WRITER));

	/* mi_lock guards the list itself; it is not held across the resend */
	mutex_enter(&mi->mi_lock);
	lrp = list_head(&mi->mi_lost_state);
	mutex_exit(&mi->mi_lock);
	while (lrp != NULL) {
		nfs4_error_zinit(&n4e);
		resend_one_op(lrp, &n4e, mi, sp);
		NFS4_DEBUG(nfs4_lost_rqst_debug, (CE_NOTE,
		    "nfs4_resend_lost_rqsts: resend request: for vp %p got "
		    "error %d stat %d", (void *)lrp->lr_vp, n4e.error,
		    n4e.stat));

		/*
		 * If we get a recovery error that we can actually
		 * recover from (such as ETIMEDOUT, FHEXPIRED), we
		 * return and let the recovery thread redrive the call.
		 * Don't requeue unless the zone is still healthy.
		 */
		if (zone_status_get(curproc->p_zone) < ZONE_IS_SHUTTING_DOWN &&
		    nfs4_needs_recovery(&n4e, TRUE, mi->mi_vfsp) &&
		    (nfs4_try_failover(&n4e) ||
		    NFS4_FRC_UNMT_ERR(n4e.error, mi->mi_vfsp) ||
		    (n4e.error == 0 && n4e.stat != NFS4ERR_BADHANDLE &&
		    !nfs4_recov_marks_dead(n4e.stat)))) {
			/*
			 * For these three errors, we want to delay a bit
			 * instead of pounding the server into submission.
			 * We have to do this manually; the normal
			 * processing for these errors only works for
			 * non-recovery requests.
			 *
			 * (A forced-unmount error gets the same delay.)
			 */
			if ((n4e.error == 0 && n4e.stat == NFS4ERR_DELAY) ||
			    (n4e.error == 0 && n4e.stat == NFS4ERR_GRACE) ||
			    (n4e.error == 0 && n4e.stat == NFS4ERR_RESOURCE) ||
			    NFS4_FRC_UNMT_ERR(n4e.error, mi->mi_vfsp)) {
				delay(SEC_TO_TICK(nfs4err_delay_time));
			} else {
				(void) nfs4_start_recovery(&n4e,
				    mi, lrp->lr_dvp, lrp->lr_vp, NULL, NULL,
				    lrp->lr_op, NULL);
			}
			/* lrp stays on the list for the next attempt */
			return;
		}

		/*
		 * Done with this request (successfully or not): unlink it,
		 * pick up the new head while still holding mi_lock, and
		 * free the old entry after dropping the lock.
		 */
		mutex_enter(&mi->mi_lock);
		list_remove(&mi->mi_lost_state, lrp);
		tlrp = lrp;
		lrp = list_head(&mi->mi_lost_state);
		mutex_exit(&mi->mi_lock);
		nfs4_free_lost_rqst(tlrp, sp);
	}
}

/*
 * Resend the given op, and issue any necessary undo call.
 * errors are returned via the nfs4_error_t parameter.
 *
 * OP_LOCK/OP_LOCKU are handed to resend_lock() and OP_DELEGRETURN to
 * nfs4_resend_delegreturn(); nothing further is done here for those.
 * A successfully resent OPEN is immediately undone with a CLOSE.
 * An unrecognized op is logged and reported as EINVAL (and panics on
 * DEBUG kernels).
 */

static void
resend_one_op(nfs4_lost_rqst_t *lrp, nfs4_error_t *ep,
	mntinfo4_t *mi, nfs4_server_t *sp)
{
	vnode_t *vp;
	nfs4_open_stream_t *osp;
	cred_t *cr;
	uint32_t acc_bits;

	vp = lrp->lr_vp;
	NFS4_DEBUG(nfs4_lost_rqst_debug, (CE_NOTE, "resend_one_op: "
	    "have a lost open/close request for vp %p", (void *)vp));

	switch (lrp->lr_op) {
	case OP_OPEN:
		/*
		 * vp is passed by reference and may come back pointing at
		 * a different vnode; see the VN_RELE at 'done'.
		 */
		nfs4_resend_open_otw(&vp, lrp, ep);
		break;
	case OP_OPEN_DOWNGRADE:
		ASSERT(lrp->lr_oop != NULL);
		ep->error = nfs4_start_open_seqid_sync(lrp->lr_oop, mi);
		ASSERT(!ep->error);	/* recov thread always succeeds */
		ASSERT(lrp->lr_osp != NULL);
		mutex_enter(&lrp->lr_osp->os_sync_lock);
		nfs4_open_downgrade(lrp->lr_dg_acc, lrp->lr_dg_deny,
		    lrp->lr_oop, lrp->lr_osp, vp, lrp->lr_cr, lrp,
		    ep, NULL, NULL);
		mutex_exit(&lrp->lr_osp->os_sync_lock);
		nfs4_end_open_seqid_sync(lrp->lr_oop);
		break;
	case OP_CLOSE:
		osp = lrp->lr_osp;
		cr = lrp->lr_cr;
		/* Build the access bits from the open stream's share state */
		acc_bits = 0;
		mutex_enter(&osp->os_sync_lock);
		if (osp->os_share_acc_read)
			acc_bits |= OPEN4_SHARE_ACCESS_READ;
		if (osp->os_share_acc_write)
			acc_bits |= OPEN4_SHARE_ACCESS_WRITE;
		mutex_exit(&osp->os_sync_lock);
		nfs4close_one(vp, osp, cr, acc_bits, lrp, ep,
		    CLOSE_RESEND, 0, 0, 0);
		break;
	case OP_LOCK:
	case OP_LOCKU:
		resend_lock(lrp, ep);
		goto done;
	case OP_DELEGRETURN:
		nfs4_resend_delegreturn(lrp, ep, sp);
		goto done;
	default:
#ifdef DEBUG
		cmn_err(CE_PANIC, "resend_one_op: unexpected op: %d",
		    lrp->lr_op);
#endif
		nfs4_queue_event(RE_LOST_STATE_BAD_OP, mi, NULL,
		    lrp->lr_op, lrp->lr_vp, lrp->lr_dvp, NFS4_OK, NULL, 0,
		    TAG_NONE, TAG_NONE, 0, 0);
		nfs4_error_init(ep, EINVAL);
		return;
	}

	/*
	 * No need to retry nor send an "undo" CLOSE in the
	 * event the server rebooted.
	 */
	if (ep->error == 0 && (ep->stat == NFS4ERR_STALE_CLIENTID ||
	    ep->stat == NFS4ERR_STALE_STATEID || ep->stat == NFS4ERR_EXPIRED))
		goto done;

	/*
	 * If we resent a CLOSE or OPEN_DOWNGRADE, there's nothing
	 * to undo.  Undoing locking operations was handled by
	 * resend_lock().
	 */
	if (lrp->lr_op == OP_OPEN_DOWNGRADE || lrp->lr_op == OP_CLOSE)
		goto done;

	/*
	 * If we get any other error for OPEN, then don't attempt
	 * to undo the resend of the open (since it was never
	 * successful!).
	 */
	ASSERT(lrp->lr_op == OP_OPEN);
	if (ep->error || ep->stat != NFS4_OK)
		goto done;

	/*
	 * Now let's undo our OPEN.
	 */
	nfs4_error_zinit(ep);
	close_after_open_resend(vp, lrp->lr_cr, lrp->lr_oacc, ep);
	NFS4_DEBUG(nfs4_lost_rqst_debug, (CE_NOTE, "resend_one_op: "
	    "nfs4close_one: for vp %p got error %d stat %d",
	    (void *)vp, ep->error, ep->stat));

done:
	/*
	 * If the resent OPEN substituted a different vnode for vp,
	 * release it here (NOTE(review): presumably that vnode comes
	 * back held from nfs4_resend_open_otw() -- confirm).
	 */
	if (vp != lrp->lr_vp)
		VN_RELE(vp);
}

/*
 * Close a file that was opened via a resent OPEN.
 * Most errors are passed back to the caller via the nfs4_error_t
 * parameter, except for FHEXPIRED, which is retried.
2828 * 2829 * It might be conceptually cleaner to push the CLOSE request onto the 2830 * front of the resend queue, rather than sending it here. That would 2831 * match the way we undo lost lock requests. On the other 2832 * hand, we've already got something that works, and there's no reason to 2833 * change it at this time. 2834 */ 2835 2836 static void 2837 close_after_open_resend(vnode_t *vp, cred_t *cr, uint32_t acc_bits, 2838 nfs4_error_t *ep) 2839 { 2840 2841 for (;;) { 2842 nfs4close_one(vp, NULL, cr, acc_bits, NULL, ep, 2843 CLOSE_AFTER_RESEND, 0, 0, 0); 2844 if (ep->error == 0 && ep->stat == NFS4_OK) 2845 break; /* success; done */ 2846 if (ep->error != 0 || ep->stat != NFS4ERR_FHEXPIRED) 2847 break; 2848 /* else retry FHEXPIRED */ 2849 } 2850 2851 } 2852 2853 /* 2854 * Resend the given lost lock request. Return an errno value. If zero, 2855 * *statp is set to the NFS status code for the call. 2856 * 2857 * Issue a SIGLOST and mark the rnode dead if we get a non-recovery error or 2858 * a recovery error that we don't actually recover from yet (eg: BAD_SEQID). 2859 * Let the recovery thread redrive the call if we get a recovery error that 2860 * we can actually recover from. 
 *
 * NOTE(review): no call in this routine visibly marks the rnode dead;
 * what is visible here is the SIGLOST and, for reinstate requests, the
 * flushing of the remaining reinstantiation requests.
 */
static void
resend_lock(nfs4_lost_rqst_t *lrp, nfs4_error_t *ep)
{
	bool_t send_siglost = FALSE;
	vnode_t *vp = lrp->lr_vp;

	NFS4_DEBUG(nfs4_lost_rqst_debug, (CE_NOTE, "resend_lock:"));
	ASSERT(lrp->lr_ctype == NFS4_LCK_CTYPE_REINSTATE ||
	    lrp->lr_ctype == NFS4_LCK_CTYPE_RESEND);

	nfs4frlock(lrp->lr_ctype, vp, F_SETLK,
	    lrp->lr_flk, FREAD|FWRITE, 0, lrp->lr_cr, ep, lrp, NULL);

	NFS4_DEBUG(nfs4_lost_rqst_debug, (CE_NOTE, "resend_lock: "
	    "nfs4frlock for vp %p returned error %d, stat %d",
	    (void *)vp, ep->error, ep->stat));

	/* Success: nothing more to do. */
	if (ep->error == 0 && ep->stat == 0)
		goto done;
	/* A plain resend that is DENIED is not treated as a lost lock. */
	if (ep->error == 0 && ep->stat == NFS4ERR_DENIED &&
	    lrp->lr_ctype == NFS4_LCK_CTYPE_RESEND)
		goto done;

	/*
	 * If we failed with a non-recovery error, send SIGLOST and
	 * mark the file dead.
	 */
	if (!nfs4_needs_recovery(ep, TRUE, vp->v_vfsp))
		send_siglost = TRUE;
	else {
		/*
		 * Done with recovering LOST LOCK in the event the
		 * server rebooted or we've lost the lease.
		 */
		if (ep->error == 0 && (ep->stat == NFS4ERR_STALE_CLIENTID ||
		    ep->stat == NFS4ERR_STALE_STATEID ||
		    ep->stat == NFS4ERR_EXPIRED)) {
			goto done;
		}

		/*
		 * BAD_STATEID on an unlock indicates that the server has
		 * forgotten about the lock anyway, so act like the call
		 * was successful.
		 */
		if (ep->error == 0 && ep->stat == NFS4ERR_BAD_STATEID &&
		    lrp->lr_op == OP_LOCKU)
			goto done;

		/*
		 * If we got a recovery error that we don't actually
		 * recover from, send SIGLOST.  If the filesystem was
		 * forcibly unmounted, we skip the SIGLOST because (a) it's
		 * unnecessary noise, and (b) there could be a new process
		 * with the same pid as the one that had generated the lost
		 * state request.
		 */
		if (ep->error == 0 && (ep->stat == NFS4ERR_BADHANDLE ||
		    nfs4_recov_marks_dead(ep->stat))) {
			if (!(vp->v_vfsp->vfs_flag & VFS_UNMOUNTED))
				send_siglost = TRUE;
			goto done;
		}

		/*
		 * If the filesystem was forcibly unmounted, we
		 * still need to synchronize with the server and
		 * release state.  Try again later.
		 */
		if (NFS4_FRC_UNMT_ERR(ep->error, vp->v_vfsp))
			goto done;

		/*
		 * If we get a recovery error that we can actually
		 * recover from (such as ETIMEDOUT, FHEXPIRED),
		 * return and let the recovery thread redrive the call.
		 *
		 * For the three errors below, we want to delay a bit
		 * instead of pounding the server into submission.
		 */
		if ((ep->error == 0 && ep->stat == NFS4ERR_DELAY) ||
		    (ep->error == 0 && ep->stat == NFS4ERR_GRACE) ||
		    (ep->error == 0 && ep->stat == NFS4ERR_RESOURCE))
			delay(SEC_TO_TICK(recov_err_delay));
		goto done;
	}

done:
	if (send_siglost) {
		cred_t *sv_cred;

		/*
		 * Must be root or the actual thread being issued the
		 * SIGLOST for this to work, so just become root.
		 * The thread's own credential is restored right after.
		 */
		sv_cred = curthread->t_cred;
		curthread->t_cred = kcred;
		nfs4_send_siglost(lrp->lr_flk->l_pid, VTOMI4(vp), vp, FALSE,
		    ep->error, ep->stat);
		curthread->t_cred = sv_cred;

		/*
		 * Flush any additional reinstantiation requests for
		 * this operation.  Sending multiple SIGLOSTs to the user
		 * process is unlikely to help and may cause trouble.
		 */
		if (lrp->lr_ctype == NFS4_LCK_CTYPE_REINSTATE)
			flush_reinstate(lrp);
	}
}

/*
 * Remove any lock reinstantiation requests that correspond to the given
 * lost request.  We only remove items that follow lrp in the queue,
 * assuming that lrp will be removed by the generic lost state code.
2977 */ 2978 2979 static void 2980 flush_reinstate(nfs4_lost_rqst_t *lrp) 2981 { 2982 vnode_t *vp; 2983 pid_t pid; 2984 mntinfo4_t *mi; 2985 nfs4_lost_rqst_t *nlrp; 2986 2987 vp = lrp->lr_vp; 2988 mi = VTOMI4(vp); 2989 pid = lrp->lr_flk->l_pid; 2990 2991 /* 2992 * If there are any more reinstantation requests to get rid of, 2993 * they should all be clustered at the front of the lost state 2994 * queue. 2995 */ 2996 mutex_enter(&mi->mi_lock); 2997 for (lrp = list_next(&mi->mi_lost_state, lrp); lrp != NULL; 2998 lrp = nlrp) { 2999 nlrp = list_next(&mi->mi_lost_state, lrp); 3000 if (lrp->lr_op != OP_LOCK && lrp->lr_op != OP_LOCKU) 3001 break; 3002 if (lrp->lr_ctype != NFS4_LCK_CTYPE_REINSTATE) 3003 break; 3004 ASSERT(lrp->lr_vp == vp); 3005 ASSERT(lrp->lr_flk->l_pid == pid); 3006 NFS4_DEBUG(nfs4_lost_rqst_debug, (CE_NOTE, 3007 "remove reinstantiation %p", (void *)lrp)); 3008 list_remove(&mi->mi_lost_state, lrp); 3009 nfs4_free_lost_rqst(lrp, NULL); 3010 } 3011 mutex_exit(&mi->mi_lock); 3012 } 3013 3014 /* 3015 * End of state-specific recovery routines. 3016 */ 3017 3018 /* 3019 * Allocate a lost request struct, initialize it from lost_rqstp (including 3020 * bumping the reference counts for the referenced vnode, etc.), and hang 3021 * it off of recovp. 
3022 */ 3023 3024 static void 3025 nfs4_save_lost_rqst(nfs4_lost_rqst_t *lost_rqstp, recov_info_t *recovp, 3026 nfs4_recov_t *action, mntinfo4_t *mi) 3027 { 3028 nfs4_lost_rqst_t *destp; 3029 3030 ASSERT(recovp->rc_lost_rqst == NULL); 3031 3032 destp = kmem_alloc(sizeof (nfs4_lost_rqst_t), KM_SLEEP); 3033 recovp->rc_lost_rqst = destp; 3034 3035 if (lost_rqstp->lr_op == OP_LOCK || 3036 lost_rqstp->lr_op == OP_LOCKU) { 3037 ASSERT(lost_rqstp->lr_lop); 3038 *action = NR_LOST_LOCK; 3039 destp->lr_ctype = lost_rqstp->lr_ctype; 3040 destp->lr_locktype = lost_rqstp->lr_locktype; 3041 } else if (lost_rqstp->lr_op == OP_OPEN) { 3042 component4 *srcfp, *destfp; 3043 3044 destp->lr_oacc = lost_rqstp->lr_oacc; 3045 destp->lr_odeny = lost_rqstp->lr_odeny; 3046 destp->lr_oclaim = lost_rqstp->lr_oclaim; 3047 if (lost_rqstp->lr_oclaim == CLAIM_DELEGATE_CUR) 3048 destp->lr_ostateid = lost_rqstp->lr_ostateid; 3049 3050 srcfp = &lost_rqstp->lr_ofile; 3051 destfp = &destp->lr_ofile; 3052 /* 3053 * Consume caller's utf8string 3054 */ 3055 destfp->utf8string_len = srcfp->utf8string_len; 3056 destfp->utf8string_val = srcfp->utf8string_val; 3057 srcfp->utf8string_len = 0; 3058 srcfp->utf8string_val = NULL; /* make sure not reused */ 3059 3060 *action = NR_LOST_STATE_RQST; 3061 } else if (lost_rqstp->lr_op == OP_OPEN_DOWNGRADE) { 3062 destp->lr_dg_acc = lost_rqstp->lr_dg_acc; 3063 destp->lr_dg_deny = lost_rqstp->lr_dg_deny; 3064 3065 *action = NR_LOST_STATE_RQST; 3066 } else if (lost_rqstp->lr_op == OP_CLOSE) { 3067 ASSERT(lost_rqstp->lr_oop); 3068 *action = NR_LOST_STATE_RQST; 3069 } else if (lost_rqstp->lr_op == OP_DELEGRETURN) { 3070 *action = NR_LOST_STATE_RQST; 3071 } else { 3072 #ifdef DEBUG 3073 cmn_err(CE_PANIC, "nfs4_save_lost_rqst: bad op %d", 3074 lost_rqstp->lr_op); 3075 #endif 3076 nfs4_queue_event(RE_LOST_STATE_BAD_OP, mi, NULL, 3077 lost_rqstp->lr_op, lost_rqstp->lr_vp, lost_rqstp->lr_dvp, 3078 NFS4_OK, NULL, curproc->p_pid, TAG_NONE, TAG_NONE, 0, 0); 3079 *action = 
NR_UNUSED; 3080 recovp->rc_lost_rqst = NULL; 3081 kmem_free(destp, sizeof (nfs4_lost_rqst_t)); 3082 return; 3083 } 3084 3085 destp->lr_op = lost_rqstp->lr_op; 3086 destp->lr_vp = lost_rqstp->lr_vp; 3087 if (destp->lr_vp) 3088 VN_HOLD(destp->lr_vp); 3089 destp->lr_dvp = lost_rqstp->lr_dvp; 3090 if (destp->lr_dvp) 3091 VN_HOLD(destp->lr_dvp); 3092 destp->lr_oop = lost_rqstp->lr_oop; 3093 if (destp->lr_oop) 3094 open_owner_hold(destp->lr_oop); 3095 destp->lr_osp = lost_rqstp->lr_osp; 3096 if (destp->lr_osp) 3097 open_stream_hold(destp->lr_osp); 3098 destp->lr_lop = lost_rqstp->lr_lop; 3099 if (destp->lr_lop) 3100 lock_owner_hold(destp->lr_lop); 3101 destp->lr_cr = lost_rqstp->lr_cr; 3102 if (destp->lr_cr) 3103 crhold(destp->lr_cr); 3104 if (lost_rqstp->lr_flk == NULL) 3105 destp->lr_flk = NULL; 3106 else { 3107 destp->lr_flk = kmem_alloc(sizeof (flock64_t), KM_SLEEP); 3108 *destp->lr_flk = *lost_rqstp->lr_flk; 3109 } 3110 destp->lr_putfirst = lost_rqstp->lr_putfirst; 3111 } 3112 3113 /* 3114 * Map the given return values (errno and nfs4 status code) to a recovery 3115 * action and fill in the following fields of recovp: rc_action, 3116 * rc_srv_reboot, rc_stateid, rc_lost_rqst. 3117 */ 3118 3119 void 3120 errs_to_action(recov_info_t *recovp, 3121 nfs4_server_t *sp, mntinfo4_t *mi, stateid4 *sidp, 3122 nfs4_lost_rqst_t *lost_rqstp, int unmounted, nfs_opnum4 op, 3123 nfs4_bseqid_entry_t *bsep) 3124 { 3125 nfs4_recov_t action = NR_UNUSED; 3126 bool_t reboot = FALSE; 3127 int try_f; 3128 int error = recovp->rc_orig_errors.error; 3129 nfsstat4 stat = recovp->rc_orig_errors.stat; 3130 3131 bzero(&recovp->rc_stateid, sizeof (stateid4)); 3132 recovp->rc_lost_rqst = NULL; 3133 recovp->rc_bseqid_rqst = NULL; 3134 3135 try_f = nfs4_try_failover(&recovp->rc_orig_errors) && 3136 FAILOVER_MOUNT4(mi); 3137 3138 /* 3139 * We start recovery for EINTR only in the lost lock 3140 * or lost open/close case. 
3141 */ 3142 3143 if (try_f || error == EINTR || (error == EIO && unmounted)) { 3144 recovp->rc_error = (error != 0 ? error : geterrno4(stat)); 3145 if (lost_rqstp) { 3146 ASSERT(lost_rqstp->lr_op != 0); 3147 nfs4_save_lost_rqst(lost_rqstp, recovp, &action, mi); 3148 } 3149 if (try_f) 3150 action = NR_FAILOVER; 3151 } else if (error != 0) { 3152 recovp->rc_error = error; 3153 nfs4_queue_event(RE_UNEXPECTED_ERRNO, mi, NULL, error, NULL, 3154 NULL, 0, NULL, 0, TAG_NONE, TAG_NONE, 0, 0); 3155 action = NR_CLIENTID; 3156 } else { 3157 recovp->rc_error = geterrno4(stat); 3158 switch (stat) { 3159 #ifdef notyet 3160 case NFS4ERR_LEASE_MOVED: 3161 action = xxx; 3162 break; 3163 case NFS4ERR_MOVED: 3164 action = xxx; 3165 break; 3166 #endif 3167 case NFS4ERR_BADHANDLE: 3168 action = NR_BADHANDLE; 3169 break; 3170 case NFS4ERR_BAD_SEQID: 3171 if (bsep) 3172 save_bseqid_rqst(bsep, recovp); 3173 action = NR_BAD_SEQID; 3174 break; 3175 case NFS4ERR_OLD_STATEID: 3176 action = NR_OLDSTATEID; 3177 break; 3178 case NFS4ERR_WRONGSEC: 3179 action = NR_WRONGSEC; 3180 break; 3181 case NFS4ERR_FHEXPIRED: 3182 action = NR_FHEXPIRED; 3183 break; 3184 case NFS4ERR_BAD_STATEID: 3185 if (sp == NULL || (sp != NULL && inlease(sp))) { 3186 3187 action = NR_BAD_STATEID; 3188 if (sidp) 3189 recovp->rc_stateid = *sidp; 3190 } else 3191 action = NR_CLIENTID; 3192 break; 3193 case NFS4ERR_EXPIRED: 3194 /* 3195 * The client's lease has expired, either due 3196 * to a network partition or perhaps a client 3197 * error. In either case, try an NR_CLIENTID 3198 * style recovery. reboot remains false, since 3199 * there is no evidence the server has rebooted. 3200 * This will cause CLAIM_NULL opens and lock 3201 * requests without the reclaim bit. 
3202 */ 3203 action = NR_CLIENTID; 3204 3205 DTRACE_PROBE4(nfs4__expired, 3206 nfs4_server_t *, sp, 3207 mntinfo4_t *, mi, 3208 stateid4 *, sidp, int, op); 3209 3210 break; 3211 case NFS4ERR_STALE_CLIENTID: 3212 case NFS4ERR_STALE_STATEID: 3213 action = NR_CLIENTID; 3214 reboot = TRUE; 3215 break; 3216 case NFS4ERR_RESOURCE: 3217 /* 3218 * If this had been a FAILOVER mount, then 3219 * we'd have tried failover. Since it's not, 3220 * just delay a while and retry. 3221 */ 3222 action = NR_DELAY; 3223 break; 3224 case NFS4ERR_GRACE: 3225 action = NR_GRACE; 3226 break; 3227 case NFS4ERR_DELAY: 3228 action = NR_DELAY; 3229 break; 3230 case NFS4ERR_STALE: 3231 action = NR_STALE; 3232 break; 3233 default: 3234 nfs4_queue_event(RE_UNEXPECTED_STATUS, mi, NULL, 0, 3235 NULL, NULL, stat, NULL, 0, TAG_NONE, TAG_NONE, 3236 0, 0); 3237 action = NR_CLIENTID; 3238 break; 3239 } 3240 } 3241 3242 /* make sure action got set */ 3243 ASSERT(action != NR_UNUSED); 3244 recovp->rc_srv_reboot = reboot; 3245 recovp->rc_action = action; 3246 nfs4_queue_fact(RF_ERR, mi, stat, action, op, reboot, NULL, error, 3247 NULL); 3248 } 3249 3250 /* 3251 * Return the (held) credential for the process with the given pid. 3252 * May return NULL (e.g., process not found). 3253 */ 3254 3255 static cred_t * 3256 pid_to_cr(pid_t pid) 3257 { 3258 proc_t *p; 3259 cred_t *cr; 3260 3261 mutex_enter(&pidlock); 3262 if ((p = prfind(pid)) == NULL) { 3263 mutex_exit(&pidlock); 3264 return (NULL); 3265 } 3266 3267 mutex_enter(&p->p_crlock); 3268 crhold(cr = p->p_cred); 3269 mutex_exit(&p->p_crlock); 3270 mutex_exit(&pidlock); 3271 3272 return (cr); 3273 } 3274 3275 /* 3276 * Send SIGLOST to the given process and queue the event. 3277 * 3278 * The 'dump' boolean tells us whether this action should dump the 3279 * in-kernel queue of recovery messages or not. 
3280 */ 3281 3282 void 3283 nfs4_send_siglost(pid_t pid, mntinfo4_t *mi, vnode_t *vp, bool_t dump, 3284 int error, nfsstat4 stat) 3285 { 3286 proc_t *p; 3287 3288 mutex_enter(&pidlock); 3289 p = prfind(pid); 3290 if (p) 3291 psignal(p, SIGLOST); 3292 mutex_exit(&pidlock); 3293 nfs4_queue_event(dump ? RE_SIGLOST : RE_SIGLOST_NO_DUMP, mi, 3294 NULL, error, vp, NULL, stat, NULL, pid, TAG_NONE, TAG_NONE, 0, 0); 3295 } 3296 3297 /* 3298 * Scan the lock list for entries that match the given pid. Change the 3299 * pid in those that do to NOPID. 3300 */ 3301 3302 static void 3303 relock_skip_pid(locklist_t *llp, pid_t pid) 3304 { 3305 for (; llp != NULL; llp = llp->ll_next) { 3306 if (llp->ll_flock.l_pid == pid) 3307 llp->ll_flock.l_pid = NOPID; 3308 } 3309 } 3310 3311 /* 3312 * Mark a file as having failed recovery, after making a last-ditch effort 3313 * to return any delegation. 3314 * 3315 * Sets r_error to EIO or ESTALE for the given vnode. 3316 */ 3317 void 3318 nfs4_fail_recov(vnode_t *vp, char *why, int error, nfsstat4 stat) 3319 { 3320 rnode4_t *rp = VTOR4(vp); 3321 3322 #ifdef DEBUG 3323 if (nfs4_fail_recov_stop) 3324 debug_enter("nfs4_fail_recov"); 3325 #endif 3326 3327 mutex_enter(&rp->r_statelock); 3328 if (rp->r_flags & (R4RECOVERR|R4RECOVERRP)) { 3329 mutex_exit(&rp->r_statelock); 3330 return; 3331 } 3332 3333 /* 3334 * Set R4RECOVERRP to indicate that a recovery error is in 3335 * progress. This will shut down reads and writes at the top 3336 * half. Don't set R4RECOVERR until after we've returned the 3337 * delegation, otherwise it will fail. 3338 */ 3339 3340 rp->r_flags |= R4RECOVERRP; 3341 mutex_exit(&rp->r_statelock); 3342 3343 nfs4delegabandon(rp); 3344 3345 mutex_enter(&rp->r_statelock); 3346 rp->r_flags |= (R4RECOVERR | R4STALE); 3347 rp->r_error = (error == 0 && stat == NFS4ERR_STALE) ? 
ESTALE : EIO; 3348 PURGE_ATTRCACHE4_LOCKED(rp); 3349 if (!(vp->v_vfsp->vfs_flag & VFS_UNMOUNTED)) 3350 nfs4_queue_event(RE_DEAD_FILE, VTOMI4(vp), NULL, error, 3351 vp, NULL, stat, why, 0, TAG_NONE, TAG_NONE, 0, 0); 3352 mutex_exit(&rp->r_statelock); 3353 3354 dnlc_purge_vp(vp); 3355 } 3356 3357 /* 3358 * recov_throttle: if the file had the same recovery action within the 3359 * throttle interval, wait for the throttle interval to finish before 3360 * proceeding. 3361 * 3362 * Side effects: updates the rnode with the current recovery information. 3363 */ 3364 3365 static void 3366 recov_throttle(recov_info_t *recovp, vnode_t *vp) 3367 { 3368 time_t curtime, time_to_wait; 3369 rnode4_t *rp = VTOR4(vp); 3370 3371 curtime = gethrestime_sec(); 3372 3373 mutex_enter(&rp->r_statelock); 3374 NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE, 3375 "recov_throttle: now: (%d, %ld), last: (%d, %ld)", 3376 recovp->rc_action, curtime, 3377 rp->r_recov_act, rp->r_last_recov)); 3378 if (recovp->rc_action == rp->r_recov_act && 3379 rp->r_last_recov + recov_err_delay > curtime) { 3380 time_to_wait = rp->r_last_recov + recov_err_delay - curtime; 3381 mutex_exit(&rp->r_statelock); 3382 delay(SEC_TO_TICK(time_to_wait)); 3383 curtime = gethrestime_sec(); 3384 mutex_enter(&rp->r_statelock); 3385 } 3386 3387 rp->r_last_recov = curtime; 3388 rp->r_recov_act = recovp->rc_action; 3389 mutex_exit(&rp->r_statelock); 3390 } 3391 3392 /* 3393 * React to NFS4ERR_GRACE by setting the time we'll permit 3394 * the next call to this filesystem. 3395 */ 3396 void 3397 nfs4_set_grace_wait(mntinfo4_t *mi) 3398 { 3399 mutex_enter(&mi->mi_lock); 3400 /* Mark the time for the future */ 3401 mi->mi_grace_wait = gethrestime_sec() + nfs4err_delay_time; 3402 mutex_exit(&mi->mi_lock); 3403 } 3404 3405 /* 3406 * React to MFS4ERR_DELAY by setting the time we'll permit 3407 * the next call to this vnode. 
3408 */ 3409 void 3410 nfs4_set_delay_wait(vnode_t *vp) 3411 { 3412 rnode4_t *rp = VTOR4(vp); 3413 3414 mutex_enter(&rp->r_statelock); 3415 /* 3416 * Calculate amount we should delay, initial 3417 * delay will be short and then we will back off. 3418 */ 3419 if (rp->r_delay_interval == 0) 3420 rp->r_delay_interval = NFS4_INITIAL_DELAY_INTERVAL; 3421 else 3422 /* calculate next interval value */ 3423 rp->r_delay_interval = 3424 MIN(NFS4_MAX_DELAY_INTERVAL, (rp->r_delay_interval << 1)); 3425 rp->r_delay_wait = gethrestime_sec() + rp->r_delay_interval; 3426 mutex_exit(&rp->r_statelock); 3427 } 3428 3429 /* 3430 * The caller is responsible for freeing the returned string. 3431 */ 3432 static char * 3433 nfs4_getsrvnames(mntinfo4_t *mi, size_t *len) 3434 { 3435 servinfo4_t *svp; 3436 char *srvnames; 3437 char *namep; 3438 size_t length; 3439 3440 /* 3441 * Calculate the length of the string required to hold all 3442 * of the server names plus either a comma or a null 3443 * character following each individual one. 
3444 */ 3445 length = 0; 3446 for (svp = mi->mi_servers; svp != NULL; svp = svp->sv_next) { 3447 (void) nfs_rw_enter_sig(&svp->sv_lock, RW_READER, 0); 3448 if (svp->sv_flags & SV4_NOTINUSE) { 3449 nfs_rw_exit(&svp->sv_lock); 3450 continue; 3451 } 3452 nfs_rw_exit(&svp->sv_lock); 3453 length += svp->sv_hostnamelen; 3454 } 3455 3456 srvnames = kmem_alloc(length, KM_SLEEP); 3457 3458 namep = srvnames; 3459 for (svp = mi->mi_servers; svp != NULL; svp = svp->sv_next) { 3460 (void) nfs_rw_enter_sig(&svp->sv_lock, RW_READER, 0); 3461 if (svp->sv_flags & SV4_NOTINUSE) { 3462 nfs_rw_exit(&svp->sv_lock); 3463 continue; 3464 } 3465 nfs_rw_exit(&svp->sv_lock); 3466 (void) strcpy(namep, svp->sv_hostname); 3467 namep += svp->sv_hostnamelen - 1; 3468 *namep++ = ','; 3469 } 3470 *--namep = '\0'; 3471 3472 *len = length; 3473 3474 return (srvnames); 3475 } 3476 3477 static void 3478 save_bseqid_rqst(nfs4_bseqid_entry_t *bsep, recov_info_t *recovp) 3479 { 3480 nfs4_bseqid_entry_t *destp; 3481 3482 destp = kmem_alloc(sizeof (nfs4_bseqid_entry_t), KM_SLEEP); 3483 recovp->rc_bseqid_rqst = destp; 3484 3485 if (bsep->bs_oop) 3486 open_owner_hold(bsep->bs_oop); 3487 destp->bs_oop = bsep->bs_oop; 3488 if (bsep->bs_lop) 3489 lock_owner_hold(bsep->bs_lop); 3490 destp->bs_lop = bsep->bs_lop; 3491 if (bsep->bs_vp) 3492 VN_HOLD(bsep->bs_vp); 3493 destp->bs_vp = bsep->bs_vp; 3494 destp->bs_pid = bsep->bs_pid; 3495 destp->bs_tag = bsep->bs_tag; 3496 destp->bs_seqid = bsep->bs_seqid; 3497 } 3498 3499 static void 3500 free_bseqid_rqst(nfs4_bseqid_entry_t *bsep) 3501 { 3502 if (bsep->bs_oop) 3503 open_owner_rele(bsep->bs_oop); 3504 if (bsep->bs_lop) 3505 lock_owner_rele(bsep->bs_lop); 3506 if (bsep->bs_vp) 3507 VN_RELE(bsep->bs_vp); 3508 kmem_free(bsep, sizeof (nfs4_bseqid_entry_t)); 3509 } 3510 3511 /* 3512 * We don't actually fully recover from NFS4ERR_BAD_SEQID. We 3513 * simply mark the open owner and open stream (if provided) as "bad". 
3514 * Then future uses of these data structures will be limited to basically 3515 * just cleaning up the internal client state (no going OTW). 3516 * 3517 * The result of this is to return errors back to the app/usr when 3518 * we receive NFS4ERR_BAD_SEQID, but also allow future/new calls to 3519 * succeed so progress can be made. 3520 */ 3521 void 3522 recov_bad_seqid(recov_info_t *recovp) 3523 { 3524 mntinfo4_t *mi = recovp->rc_mi; 3525 nfs4_open_owner_t *bad_oop; 3526 nfs4_lock_owner_t *bad_lop; 3527 vnode_t *vp; 3528 rnode4_t *rp = NULL; 3529 pid_t pid; 3530 nfs4_bseqid_entry_t *bsep, *tbsep; 3531 int error; 3532 3533 ASSERT(mi != NULL); 3534 ASSERT(nfs_rw_lock_held(&mi->mi_recovlock, RW_WRITER)); 3535 3536 mutex_enter(&mi->mi_lock); 3537 bsep = list_head(&mi->mi_bseqid_list); 3538 mutex_exit(&mi->mi_lock); 3539 3540 /* 3541 * Handle all the bad seqid entries on mi's list. 3542 */ 3543 while (bsep != NULL) { 3544 bad_oop = bsep->bs_oop; 3545 bad_lop = bsep->bs_lop; 3546 vp = bsep->bs_vp; 3547 pid = bsep->bs_pid; 3548 3549 NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE, 3550 "recov_bad_seqid: mark oop %p lop %p as bad for " 3551 "vp %p tag %s pid %d: last good seqid %d for tag %s", 3552 (void *)bad_oop, (void *)bad_lop, (void *)vp, 3553 nfs4_ctags[bsep->bs_tag].ct_str, pid, 3554 bad_oop ? bad_oop->oo_last_good_seqid : 0, 3555 bad_oop ? nfs4_ctags[bad_oop->oo_last_good_op].ct_str : 3556 nfs4_ctags[TAG_NONE].ct_str)); 3557 3558 nfs4_queue_event(RE_BAD_SEQID, mi, NULL, 3559 0, vp, NULL, NFS4ERR_BAD_SEQID, NULL, pid, bsep->bs_tag, 3560 bad_oop ? bad_oop->oo_last_good_op : TAG_NONE, 3561 bsep->bs_seqid, bad_oop ? 
bad_oop->oo_last_good_seqid : 0); 3562 3563 if (bad_oop) { 3564 /* essentially reset the open owner */ 3565 error = nfs4_start_open_seqid_sync(bad_oop, mi); 3566 ASSERT(!error); /* recov thread always succeeds */ 3567 bad_oop->oo_name = nfs4_get_new_oo_name(); 3568 bad_oop->oo_seqid = 0; 3569 nfs4_end_open_seqid_sync(bad_oop); 3570 } 3571 3572 if (bad_lop) { 3573 mutex_enter(&bad_lop->lo_lock); 3574 bad_lop->lo_flags |= NFS4_BAD_SEQID_LOCK; 3575 mutex_exit(&bad_lop->lo_lock); 3576 3577 ASSERT(vp != NULL); 3578 rp = VTOR4(vp); 3579 mutex_enter(&rp->r_statelock); 3580 rp->r_flags |= R4LODANGLERS; 3581 mutex_exit(&rp->r_statelock); 3582 3583 nfs4_send_siglost(pid, mi, vp, TRUE, 3584 0, NFS4ERR_BAD_SEQID); 3585 } 3586 3587 mutex_enter(&mi->mi_lock); 3588 list_remove(&mi->mi_bseqid_list, bsep); 3589 tbsep = bsep; 3590 bsep = list_head(&mi->mi_bseqid_list); 3591 mutex_exit(&mi->mi_lock); 3592 free_bseqid_rqst(tbsep); 3593 } 3594 3595 mutex_enter(&mi->mi_lock); 3596 mi->mi_recovflags &= ~MI4R_BAD_SEQID; 3597 mutex_exit(&mi->mi_lock); 3598 } 3599