/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2010 Sun Microsystems, Inc. All rights reserved.
 * Use is subject to license terms.
 */

/*
 * NFS Version 4 state recovery code.
 */

#include <nfs/nfs4_clnt.h>
#include <nfs/nfs4.h>
#include <nfs/rnode4.h>
#include <sys/cmn_err.h>
#include <sys/cred.h>
#include <sys/systm.h>
#include <sys/flock.h>
#include <sys/dnlc.h>
#include <sys/ddi.h>
#include <sys/disp.h>
#include <sys/list.h>
#include <sys/sdt.h>
#include <sys/mount.h>
#include <sys/door.h>
#include <nfs/nfssys.h>
#include <nfs/nfsid_map.h>
#include <nfs/nfs4_idmap_impl.h>

extern r4hashq_t *rtable4;

/*
 * Information that describes what needs to be done for recovery.  It is
 * passed to a client recovery thread as well as passed to various recovery
 * routines.  rc_mi, rc_vp1, and rc_vp2 refer to the filesystem and
 * vnode(s) affected by recovery.  rc_vp1 and rc_vp2 are references (use
 * VN_HOLD) or NULL.  rc_lost_rqst contains information about the lost
 * lock or open/close request, and it holds reference counts for the
 * various objects (vnode, etc.).  The recovery thread also uses flags set
 * in the mntinfo4_t or vnode_t to tell it what to do.  rc_error is used
 * to save the error that originally triggered the recovery event -- will
 * later be used to set mi_error if recovery doesn't work.  rc_bseqid_rqst
 * contains information about the request that got NFS4ERR_BAD_SEQID, and
 * it holds reference count for the various objects (vnode, open owner,
 * open stream, lock owner).
 */

typedef struct {
	mntinfo4_t *rc_mi;		/* filesystem being recovered */
	vnode_t *rc_vp1;		/* affected vnode (held) or NULL */
	vnode_t *rc_vp2;		/* second affected vnode (held) or NULL */
	nfs4_recov_t rc_action;		/* what recovery to perform */
	stateid4 rc_stateid;		/* stateid associated with the error */
	bool_t rc_srv_reboot;		/* server has rebooted */
	nfs4_lost_rqst_t *rc_lost_rqst;	/* lost lock/open/close request */
	nfs4_error_t rc_orig_errors;	/* original errors causing recovery */
	int rc_error;			/* saved trigger error (-> mi_error) */
	nfs4_bseqid_entry_t *rc_bseqid_rqst; /* NFS4ERR_BAD_SEQID info */
	vnode_t *rc_moved_vp;		/* vnode for NFS4ERR_MOVED handling */
	char *rc_moved_nm;		/* name for NFS4ERR_MOVED handling */
} recov_info_t;

/*
 * How long to wait before trying again if there is an error doing
 * recovery, in seconds.
 */

static int recov_err_delay = 1;

/*
 * How long to wait when processing NFS4ERR_GRACE or NFS4ERR_DELAY
 * errors.  Expressed in seconds.  Default is defined as
 * NFS4ERR_DELAY_TIME and this variable is initialized in nfs4_subr_init()
 */
time_t nfs4err_delay_time = 0;

/*
 * Tuneable to limit how many time "exempt" ops go OTW
 * after a recovery error.  Exempt op hints are OH_CLOSE,
 * OH_LOCKU, OH_DELEGRETURN.  These previously always went
 * OTW even after rnode was "dead" due to recovery errors.
 *
 * The tuneable below limits the number of times a start_fop
 * invocation will retry the exempt hints.  After the limit
 * is reached, nfs4_start_fop will return an error just like
 * it would for non-exempt op hints.
 */
int nfs4_max_recov_error_retry = 3;

/*
 * Number of seconds the recovery thread should pause before retry when the
 * filesystem has been forcibly unmounted.
 */

int nfs4_unmount_delay = 1;

#ifdef DEBUG

/*
 * How long to wait (in seconds) between recovery operations on a given
 * file.  Normally zero, but could be set longer for testing purposes.
 */
static int nfs4_recovdelay = 0;

/*
 * Switch that controls whether to go into the debugger when recovery
 * fails.
 */
static int nfs4_fail_recov_stop = 0;

/*
 * Tuneables to debug client namespace interaction with server
 * mount points:
 *
 *	nfs4_srvmnt_fail_cnt:
 *		number of times EACCES returned because client
 *		attempted to cross server mountpoint
 *
 *	nfs4_srvmnt_debug:
 *		trigger console printf whenever client attempts
 *		to cross server mountpoint
 */
int nfs4_srvmnt_fail_cnt = 0;
int nfs4_srvmnt_debug = 0;
#endif

extern zone_key_t nfs4clnt_zone_key;

/* forward references, in alphabetic order */
static void close_after_open_resend(vnode_t *, cred_t *, uint32_t,
	nfs4_error_t *);
static void errs_to_action(recov_info_t *,
	nfs4_server_t *, mntinfo4_t *, stateid4 *, nfs4_lost_rqst_t *, int,
	nfs_opnum4, nfs4_bseqid_entry_t *);
static void flush_reinstate(nfs4_lost_rqst_t *);
static void free_milist(mntinfo4_t **, int);
static mntinfo4_t **make_milist(nfs4_server_t *, int *);
static int nfs4_check_recov_err(vnode_t *, nfs4_op_hint_t,
	nfs4_recov_state_t *, int, char *);
static char *nfs4_getsrvnames(mntinfo4_t *, size_t *);
static void nfs4_recov_fh_fail(vnode_t *, int, nfsstat4);
static void nfs4_recov_thread(recov_info_t *);
static void nfs4_remove_lost_rqsts(mntinfo4_t *, nfs4_server_t *);
static void nfs4_resend_lost_rqsts(recov_info_t *, nfs4_server_t *);
static cred_t *pid_to_cr(pid_t);
static void reclaim_one_lock(vnode_t *, flock64_t *, nfs4_error_t *, int *);
static void recov_bad_seqid(recov_info_t *);
static void recov_badstate(recov_info_t *, vnode_t *,
	nfsstat4);
static void recov_clientid(recov_info_t *, nfs4_server_t *);
static void recov_done(mntinfo4_t *, recov_info_t *);
static void recov_filehandle(nfs4_recov_t, mntinfo4_t *, vnode_t *);
static void recov_newserver(recov_info_t *, nfs4_server_t **, bool_t *);
static void recov_openfiles(recov_info_t *, nfs4_server_t *);
static void recov_stale(mntinfo4_t *, vnode_t *);
static void nfs4_free_lost_rqst(nfs4_lost_rqst_t *, nfs4_server_t *);
static void recov_throttle(recov_info_t *, vnode_t *);
static void relock_skip_pid(vnode_t *, locklist_t *, pid_t);
static void resend_lock(nfs4_lost_rqst_t *, nfs4_error_t *);
static void resend_one_op(nfs4_lost_rqst_t *, nfs4_error_t *, mntinfo4_t *,
	nfs4_server_t *);
static void save_bseqid_rqst(nfs4_bseqid_entry_t *, recov_info_t *);
static void start_recovery(recov_info_t *, mntinfo4_t *, vnode_t *, vnode_t *,
	nfs4_server_t *, vnode_t *, char *);
static void start_recovery_action(nfs4_recov_t, bool_t, mntinfo4_t *, vnode_t *,
	vnode_t *);
static int wait_for_recovery(mntinfo4_t *, nfs4_op_hint_t);

/*
 * Return non-zero if the given errno, status, and rpc status codes
 * in the nfs4_error_t indicate that client recovery is needed.
 * "stateful" indicates whether the call that got the error establishes or
 * removes state on the server (open, close, lock, unlock, delegreturn).
 */

int
nfs4_needs_recovery(nfs4_error_t *ep, bool_t stateful, vfs_t *vfsp)
{
	int recov = 0;
	mntinfo4_t *mi;

	/*
	 * Try failover if the error values justify it and if
	 * it's a failover mount.  Don't try if the mount is in
	 * progress, failures are handled explicitly by nfs4rootvp.
	 */
	if (nfs4_try_failover(ep)) {
		mi = VFTOMI4(vfsp);
		mutex_enter(&mi->mi_lock);
		recov = FAILOVER_MOUNT4(mi) && !(mi->mi_flags & MI4_MOUNTING);
		mutex_exit(&mi->mi_lock);
		if (recov)
			return (recov);
	}

	if (ep->error == EINTR || NFS4_FRC_UNMT_ERR(ep->error, vfsp)) {
		/*
		 * The server may have gotten the request, so for stateful
		 * ops we need to resynchronize and possibly back out the
		 * op.
		 */
		return (stateful);
	}
	if (ep->error != 0)
		return (0);

	/* stat values are listed alphabetically */
	/*
	 * There are two lists here: the errors for which we have code, and
	 * the errors for which we plan to have code before FCS.  For the
	 * second list, print a warning message but don't attempt recovery.
	 */
	switch (ep->stat) {
	case NFS4ERR_BADHANDLE:
	case NFS4ERR_BAD_SEQID:
	case NFS4ERR_BAD_STATEID:
	case NFS4ERR_DELAY:
	case NFS4ERR_EXPIRED:
	case NFS4ERR_FHEXPIRED:
	case NFS4ERR_GRACE:
	case NFS4ERR_OLD_STATEID:
	case NFS4ERR_RESOURCE:
	case NFS4ERR_STALE_CLIENTID:
	case NFS4ERR_STALE_STATEID:
	case NFS4ERR_WRONGSEC:
	case NFS4ERR_STALE:
		recov = 1;
		break;
#ifdef DEBUG
	/*
	 * Recovery for these is not implemented; in DEBUG kernels log a
	 * warning so the unhandled status is visible.  Non-DEBUG kernels
	 * silently return 0 (no recovery) for them.
	 */
	case NFS4ERR_LEASE_MOVED:
	case NFS4ERR_MOVED:
		zcmn_err(VFTOMI4(vfsp)->mi_zone->zone_id,
		    CE_WARN, "!Can't yet recover from NFS status %d",
		    ep->stat);
		break;
#endif
	}

	return (recov);
}

/*
 * Some operations such as DELEGRETURN want to avoid invoking
 * recovery actions that will only mark the file dead.  If
 * better handlers are invoked for any of these errors, this
 * routine should be modified.
264 */ 265 int 266 nfs4_recov_marks_dead(nfsstat4 status) 267 { 268 if (status == NFS4ERR_BAD_SEQID || 269 status == NFS4ERR_EXPIRED || 270 status == NFS4ERR_BAD_STATEID || 271 status == NFS4ERR_OLD_STATEID) 272 return (1); 273 return (0); 274 } 275 276 /* 277 * Transfer the state recovery information in recovp to mi's resend queue, 278 * and mark mi as having a lost state request. 279 */ 280 static void 281 nfs4_enqueue_lost_rqst(recov_info_t *recovp, mntinfo4_t *mi) 282 { 283 nfs4_lost_rqst_t *lrp = recovp->rc_lost_rqst; 284 285 ASSERT(nfs_rw_lock_held(&mi->mi_recovlock, RW_READER) || 286 nfs_rw_lock_held(&mi->mi_recovlock, RW_WRITER)); 287 288 ASSERT(lrp != NULL && lrp->lr_op != 0); 289 290 NFS4_DEBUG(nfs4_lost_rqst_debug, (CE_NOTE, 291 "nfs4_enqueue_lost_rqst %p, op %d", 292 (void *)lrp, lrp->lr_op)); 293 294 mutex_enter(&mi->mi_lock); 295 mi->mi_recovflags |= MI4R_LOST_STATE; 296 if (lrp->lr_putfirst) 297 list_insert_head(&mi->mi_lost_state, lrp); 298 else 299 list_insert_tail(&mi->mi_lost_state, lrp); 300 recovp->rc_lost_rqst = NULL; 301 mutex_exit(&mi->mi_lock); 302 303 nfs4_queue_event(RE_LOST_STATE, mi, NULL, lrp->lr_op, lrp->lr_vp, 304 lrp->lr_dvp, 0, NULL, 0, TAG_NONE, TAG_NONE, 0, 0); 305 } 306 307 /* 308 * Transfer the bad seqid recovery information in recovp to mi's 309 * bad seqid queue, and mark mi as having a bad seqid request. 310 */ 311 void 312 enqueue_bseqid_rqst(recov_info_t *recovp, mntinfo4_t *mi) 313 { 314 ASSERT(nfs_rw_lock_held(&mi->mi_recovlock, RW_READER) || 315 nfs_rw_lock_held(&mi->mi_recovlock, RW_WRITER)); 316 ASSERT(recovp->rc_bseqid_rqst != NULL); 317 318 mutex_enter(&mi->mi_lock); 319 mi->mi_recovflags |= MI4R_BAD_SEQID; 320 list_insert_tail(&mi->mi_bseqid_list, recovp->rc_bseqid_rqst); 321 recovp->rc_bseqid_rqst = NULL; 322 mutex_exit(&mi->mi_lock); 323 } 324 325 /* 326 * Initiate recovery. 327 * 328 * The nfs4_error_t contains the return codes that triggered a recovery 329 * attempt. 
mi, vp1, and vp2 refer to the filesystem and files that were
 * being operated on.  vp1 and vp2 may be NULL.
 *
 * Multiple calls are okay.  If recovery is already underway, the call
 * updates the information about what state needs recovery but does not
 * start a new thread.  The caller should hold mi->mi_recovlock as a reader
 * for proper synchronization with any recovery thread.
 *
 * This will return TRUE if recovery was aborted, and FALSE otherwise.
 */
bool_t
nfs4_start_recovery(nfs4_error_t *ep, mntinfo4_t *mi, vnode_t *vp1,
    vnode_t *vp2, stateid4 *sid, nfs4_lost_rqst_t *lost_rqstp, nfs_opnum4 op,
    nfs4_bseqid_entry_t *bsep, vnode_t *moved_vp, char *moved_nm)
{
	recov_info_t *recovp;
	nfs4_server_t *sp;
	bool_t abort = FALSE;
	bool_t gone = FALSE;

	ASSERT(nfs_zone() == mi->mi_zone);
	mutex_enter(&mi->mi_lock);
	/*
	 * If there is lost state, we need to kick off recovery even if the
	 * filesystem has been unmounted or the zone is shutting down.
	 */
	gone = FS_OR_ZONE_GONE4(mi->mi_vfsp);
	if (gone) {
		ASSERT(ep->error != EINTR || lost_rqstp != NULL);
		if (ep->error == EIO && lost_rqstp == NULL) {
			/* failed due to forced unmount, no new lost state */
			abort = TRUE;
		}
		if ((ep->error == 0 || ep->error == ETIMEDOUT) &&
		    !(mi->mi_recovflags & MI4R_LOST_STATE)) {
			/* some other failure, no existing lost state */
			abort = TRUE;
		}
		if (abort) {
			mutex_exit(&mi->mi_lock);
			NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
			    "nfs4_start_recovery: fs unmounted"));
			return (TRUE);
		}
	}
	/*
	 * mi_in_recovery is decremented by start_recovery() (on its
	 * no-thread exit path) or by the recovery thread when it is done.
	 */
	mi->mi_in_recovery++;
	mutex_exit(&mi->mi_lock);

	recovp = kmem_alloc(sizeof (recov_info_t), KM_SLEEP);
	recovp->rc_orig_errors = *ep;
	/* find_nfs4_server() returns with sp->s_lock held (or NULL) */
	sp = find_nfs4_server(mi);
	errs_to_action(recovp, sp, mi, sid, lost_rqstp, gone, op, bsep);
	if (sp != NULL)
		mutex_exit(&sp->s_lock);
	start_recovery(recovp, mi, vp1, vp2, sp, moved_vp, moved_nm);
	if (sp != NULL)
		nfs4_server_rele(sp);
	return (FALSE);
}

/*
 * Internal version of nfs4_start_recovery.  The difference is that the
 * caller specifies the recovery action, rather than the errors leading to
 * recovery.
 */
static void
start_recovery_action(nfs4_recov_t what, bool_t reboot, mntinfo4_t *mi,
    vnode_t *vp1, vnode_t *vp2)
{
	recov_info_t *recovp;

	ASSERT(nfs_zone() == mi->mi_zone);
	mutex_enter(&mi->mi_lock);
	mi->mi_in_recovery++;
	mutex_exit(&mi->mi_lock);

	recovp = kmem_zalloc(sizeof (recov_info_t), KM_SLEEP);
	recovp->rc_action = what;
	recovp->rc_srv_reboot = reboot;
	recovp->rc_error = EIO;
	start_recovery(recovp, mi, vp1, vp2, NULL, NULL, NULL);
}

/*
 * Record what needs to be recovered (in mi_recovflags or in the rnodes)
 * and kick off a recovery thread if one is needed and not already
 * running.  Actions that can be completed inline (filehandle recovery,
 * marking bad state, grace/delay waits) are handled here and take the
 * out_no_thread path, which releases the holds and the mi_in_recovery
 * count that the caller took for the thread.
 */
static void
start_recovery(recov_info_t *recovp, mntinfo4_t *mi,
    vnode_t *vp1, vnode_t *vp2, nfs4_server_t *sp,
    vnode_t *moved_vp, char *moved_nm)
{
	NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
	    "start_recovery: mi %p, what %s", (void*)mi,
	    nfs4_recov_action_to_str(recovp->rc_action)));

	/*
	 * Bump the reference on the vfs so that we can pass it to the
	 * recovery thread.
	 */
	VFS_HOLD(mi->mi_vfsp);
	MI4_HOLD(mi);
again:
	switch (recovp->rc_action) {
	case NR_FAILOVER:
		ASSERT(nfs_rw_lock_held(&mi->mi_recovlock, RW_READER) ||
		    nfs_rw_lock_held(&mi->mi_recovlock, RW_WRITER));
		if (mi->mi_servers->sv_next == NULL)
			goto out_no_thread;
		mutex_enter(&mi->mi_lock);
		mi->mi_recovflags |= MI4R_NEED_NEW_SERVER;
		mutex_exit(&mi->mi_lock);

		if (recovp->rc_lost_rqst != NULL)
			nfs4_enqueue_lost_rqst(recovp, mi);
		break;

	case NR_CLIENTID:
		/*
		 * If the filesystem has been unmounted, punt.
		 */
		if (sp == NULL)
			goto out_no_thread;

		/*
		 * If nobody else is working on the clientid, mark the
		 * clientid as being no longer set.  Then mark the specific
		 * filesystem being worked on.
		 */
		if (!nfs4_server_in_recovery(sp)) {
			mutex_enter(&sp->s_lock);
			sp->s_flags &= ~N4S_CLIENTID_SET;
			mutex_exit(&sp->s_lock);
		}
		ASSERT(nfs_rw_lock_held(&mi->mi_recovlock, RW_READER) ||
		    nfs_rw_lock_held(&mi->mi_recovlock, RW_WRITER));
		mutex_enter(&mi->mi_lock);
		mi->mi_recovflags |= MI4R_NEED_CLIENTID;
		if (recovp->rc_srv_reboot)
			mi->mi_recovflags |= MI4R_SRV_REBOOT;
		mutex_exit(&mi->mi_lock);
		break;

	case NR_OPENFILES:
		ASSERT(nfs_rw_lock_held(&mi->mi_recovlock, RW_READER) ||
		    nfs_rw_lock_held(&mi->mi_recovlock, RW_WRITER));
		mutex_enter(&mi->mi_lock);
		mi->mi_recovflags |= MI4R_REOPEN_FILES;
		if (recovp->rc_srv_reboot)
			mi->mi_recovflags |= MI4R_SRV_REBOOT;
		mutex_exit(&mi->mi_lock);
		break;

	case NR_WRONGSEC:
		ASSERT(nfs_rw_lock_held(&mi->mi_recovlock, RW_READER) ||
		    nfs_rw_lock_held(&mi->mi_recovlock, RW_WRITER));
		mutex_enter(&mi->mi_lock);
		mi->mi_recovflags |= MI4R_NEED_SECINFO;
		mutex_exit(&mi->mi_lock);
		break;

	case NR_EXPIRED:
		if (vp1 != NULL)
			recov_badstate(recovp, vp1, NFS4ERR_EXPIRED);
		if (vp2 != NULL)
			recov_badstate(recovp, vp2, NFS4ERR_EXPIRED);
		goto out_no_thread;	/* no further recovery possible */

	case NR_BAD_STATEID:
		if (vp1 != NULL)
			recov_badstate(recovp, vp1, NFS4ERR_BAD_STATEID);
		if (vp2 != NULL)
			recov_badstate(recovp, vp2, NFS4ERR_BAD_STATEID);
		goto out_no_thread;	/* no further recovery possible */

	case NR_FHEXPIRED:
	case NR_BADHANDLE:
		if (vp1 != NULL)
			recov_throttle(recovp, vp1);
		if (vp2 != NULL)
			recov_throttle(recovp, vp2);
		/*
		 * Recover the filehandle now, rather than using a
		 * separate thread.  We can do this because filehandle
		 * recovery is independent of any other state, and because
		 * we know that we are not competing with the recovery
		 * thread at this time.  recov_filehandle will deal with
		 * threads that are competing to recover this filehandle.
		 */
		ASSERT(nfs_rw_lock_held(&mi->mi_recovlock, RW_READER) ||
		    nfs_rw_lock_held(&mi->mi_recovlock, RW_WRITER));
		if (vp1 != NULL)
			recov_filehandle(recovp->rc_action, mi, vp1);
		if (vp2 != NULL)
			recov_filehandle(recovp->rc_action, mi, vp2);
		goto out_no_thread;	/* no further recovery needed */

	case NR_STALE:
		/*
		 * NFS4ERR_STALE handling
		 * recov_stale() could set MI4R_NEED_NEW_SERVER to
		 * indicate that we can and should failover.
		 */
		ASSERT(nfs_rw_lock_held(&mi->mi_recovlock, RW_READER) ||
		    nfs_rw_lock_held(&mi->mi_recovlock, RW_WRITER));

		if (vp1 != NULL)
			recov_stale(mi, vp1);
		if (vp2 != NULL)
			recov_stale(mi, vp2);
		mutex_enter(&mi->mi_lock);
		if ((mi->mi_recovflags & MI4R_NEED_NEW_SERVER) == 0) {
			mutex_exit(&mi->mi_lock);
			goto out_no_thread;
		}
		mutex_exit(&mi->mi_lock);
		/* restart the switch as a failover request */
		recovp->rc_action = NR_FAILOVER;
		goto again;

	case NR_BAD_SEQID:
		if (recovp->rc_bseqid_rqst) {
			enqueue_bseqid_rqst(recovp, mi);
			break;
		}

		if (vp1 != NULL)
			recov_badstate(recovp, vp1, NFS4ERR_BAD_SEQID);
		if (vp2 != NULL)
			recov_badstate(recovp, vp2, NFS4ERR_BAD_SEQID);
		goto out_no_thread;	/* no further recovery possible */

	case NR_OLDSTATEID:
		if (vp1 != NULL)
			recov_badstate(recovp, vp1, NFS4ERR_OLD_STATEID);
		if (vp2 != NULL)
			recov_badstate(recovp, vp2, NFS4ERR_OLD_STATEID);
		goto out_no_thread;	/* no further recovery possible */

	case NR_GRACE:
		nfs4_set_grace_wait(mi);
		goto out_no_thread;	/* no further action required for GRACE */

	case NR_DELAY:
		if (vp1)
			nfs4_set_delay_wait(vp1);
		goto out_no_thread;	/* no further action required for DELAY */

	case NR_LOST_STATE_RQST:
	case NR_LOST_LOCK:
		nfs4_enqueue_lost_rqst(recovp, mi);
		break;
	default:
		nfs4_queue_event(RE_UNEXPECTED_ACTION, mi, NULL,
		    recovp->rc_action, NULL, NULL, 0, NULL, 0, TAG_NONE,
		    TAG_NONE, 0, 0);
		goto out_no_thread;
	}

	/*
	 * If either file recently went through the same recovery, wait
	 * awhile.  This is in case there is some sort of bug; we might not
	 * be able to recover properly, but at least we won't bombard the
	 * server with calls, and we won't tie up the client.
	 */
	if (vp1 != NULL)
		recov_throttle(recovp, vp1);
	if (vp2 != NULL)
		recov_throttle(recovp, vp2);

	/*
	 * If there's already a recovery thread, don't start another one.
	 */

	mutex_enter(&mi->mi_lock);
	if (mi->mi_flags & MI4_RECOV_ACTIV) {
		mutex_exit(&mi->mi_lock);
		goto out_no_thread;
	}
	mi->mi_flags |= MI4_RECOV_ACTIV;
	mutex_exit(&mi->mi_lock);
	NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
	    "start_recovery: starting new thread for mi %p", (void*)mi));

	recovp->rc_mi = mi;
	recovp->rc_vp1 = vp1;
	if (vp1 != NULL) {
		ASSERT(VTOMI4(vp1) == mi);
		VN_HOLD(recovp->rc_vp1);
	}
	recovp->rc_vp2 = vp2;
	if (vp2 != NULL) {
		ASSERT(VTOMI4(vp2) == mi);
		VN_HOLD(recovp->rc_vp2);
	}
	recovp->rc_moved_vp = moved_vp;
	recovp->rc_moved_nm = moved_nm;

	/* the thread inherits recovp, the vfs/mi holds, and the vnode holds */
	(void) zthread_create(NULL, 0, nfs4_recov_thread, recovp, 0,
	    minclsyspri);
	return;

	/* not reached by thread creating call */
out_no_thread:
	mutex_enter(&mi->mi_lock);
	mi->mi_in_recovery--;
	if (mi->mi_in_recovery == 0)
		cv_broadcast(&mi->mi_cv_in_recov);
	mutex_exit(&mi->mi_lock);

	VFS_RELE(mi->mi_vfsp);
	MI4_RELE(mi);
	/*
	 * Free up resources that were allocated for us.
	 */
	kmem_free(recovp, sizeof (recov_info_t));
}

/*
 * Check whether vp's rnode is already marked dead (R4RECOVERR) from a
 * prior recovery failure.  Returns 0 if the op may proceed, otherwise
 * the error to return to the caller.  "Exempt" op hints (CLOSE, LOCKU,
 * DELEGRETURN) are allowed a limited number of retries so state can
 * still be cleaned up on the server.
 */
static int
nfs4_check_recov_err(vnode_t *vp, nfs4_op_hint_t op,
    nfs4_recov_state_t *rsp, int retry_err_cnt, char *str)
{
	rnode4_t *rp;
	int error = 0;
	int exempt;

	if (vp == NULL)
		return (0);

	exempt = (op == OH_CLOSE || op == OH_LOCKU || op == OH_DELEGRETURN);
	rp = VTOR4(vp);
	mutex_enter(&rp->r_statelock);

	/*
	 * If there was a recovery error, then allow op hints "exempt" from
	 * recov errors to retry (currently 3 times).  Either r_error or
	 * EIO is returned for non-exempt op hints.
	 */
	if (rp->r_flags & R4RECOVERR) {
		if (exempt && rsp->rs_num_retry_despite_err <=
		    nfs4_max_recov_error_retry) {

			/*
			 * Check to make sure that we haven't already inc'd
			 * rs_num_retry_despite_err for current nfs4_start_fop
			 * instance.  We don't want to double inc (if we were
			 * called with vp2, then the vp1 call could have
			 * already incremented.
			 */
			if (retry_err_cnt == rsp->rs_num_retry_despite_err)
				rsp->rs_num_retry_despite_err++;

			NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
			    "nfs4_start_fop: %s %p DEAD, cnt=%d", str,
			    (void *)vp, rsp->rs_num_retry_despite_err));
		} else {
			error = (rp->r_error ? rp->r_error : EIO);
			/*
			 * An ESTALE error on a non-regular file is not
			 * "sticky".  Return the ESTALE error once, but
			 * clear the condition to allow future operations
			 * to go OTW.  This will allow the client to
			 * recover if the server has merely unshared then
			 * re-shared the file system.  For regular files,
			 * the unshare has destroyed the open state at the
			 * server and we aren't willing to do a reopen (yet).
			 */
			if (error == ESTALE && vp->v_type != VREG) {
				rp->r_flags &=
				    ~(R4RECOVERR|R4RECOVERRP|R4STALE);
				rp->r_error = 0;
				error = ESTALE;
			}
			NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
			    "nfs4_start_fop: %s %p DEAD, cnt=%d error=%d",
			    str, (void *)vp,
			    rsp->rs_num_retry_despite_err, error));
		}
	}

	mutex_exit(&rp->r_statelock);
	return (error);
}

/*
 * Initial setup code that every operation should call if it might invoke
 * client recovery.  Can block waiting for recovery to finish on a
 * filesystem.  Either vnode ptr can be NULL.
 *
 * Returns 0 if there are no outstanding errors.  Can return an
 * errno value under various circumstances (e.g., failed recovery, or
 * interrupted while waiting for recovery to finish).
 *
 * There must be a corresponding call to nfs4_end_op() to free up any locks
 * or resources allocated by this call (assuming this call succeeded),
 * using the same rsp that's passed in here.
 *
 * The open and lock seqid synchronization must be stopped before calling this
 * function, as it could lead to deadlock when trying to reopen a file or
 * reclaim a lock.  The synchronization is obtained with calls to:
 *   nfs4_start_open_seqid_sync()
 *   nfs4_start_lock_seqid_sync()
 *
 * *startrecovp is set TRUE if the caller should not bother with the
 * over-the-wire call, and just initiate recovery for the given request.
 * This is typically used for state-releasing ops if the filesystem has
 * been forcibly unmounted.  startrecovp may be NULL for
 * non-state-releasing ops.
 */

int
nfs4_start_fop(mntinfo4_t *mi, vnode_t *vp1, vnode_t *vp2, nfs4_op_hint_t op,
    nfs4_recov_state_t *rsp, bool_t *startrecovp)
{
	int error = 0, rerr_cnt;
	nfs4_server_t *sp = NULL;
	nfs4_server_t *tsp;
	nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };
	uint_t droplock_cnt;
#ifdef DEBUG
	void *fop_caller;
#endif

	ASSERT(vp1 == NULL || vp1->v_vfsp == mi->mi_vfsp);
	ASSERT(vp2 == NULL || vp2->v_vfsp == mi->mi_vfsp);

#ifdef DEBUG
	/* catch unbalanced start/end calls in the current thread */
	if ((fop_caller = tsd_get(nfs4_tsd_key)) != NULL) {
		cmn_err(CE_PANIC, "Missing nfs4_end_fop: last caller %p",
		    fop_caller);
	}
	(void) tsd_set(nfs4_tsd_key, caller());
#endif

	rsp->rs_sp = NULL;
	rsp->rs_flags &= ~NFS4_RS_RENAME_HELD;
	rerr_cnt = rsp->rs_num_retry_despite_err;

	/*
	 * Process the items that may delay() based on server response
	 */
	error = nfs4_wait_for_grace(mi, rsp);
	if (error)
		goto out;

	if (vp1 != NULL) {
		error = nfs4_wait_for_delay(vp1, rsp);
		if (error)
			goto out;
	}

	/* Wait for a delegation recall to complete. */

	error = wait_for_recall(vp1, vp2, op, rsp);
	if (error)
		goto out;

	/*
	 * Wait for any current recovery actions to finish.  Note that a
	 * recovery thread can still start up after wait_for_recovery()
	 * finishes.  We don't block out recovery operations until we
	 * acquire s_recovlock and mi_recovlock.
	 */
	error = wait_for_recovery(mi, op);
	if (error)
		goto out;

	/*
	 * Check to see if the rnode is already marked with a
	 * recovery error.  If so, return it immediately.  But
	 * always pass CLOSE, LOCKU, and DELEGRETURN so we can
	 * clean up state on the server.
	 */

	if (vp1 != NULL) {
		if (error = nfs4_check_recov_err(vp1, op, rsp, rerr_cnt, "vp1"))
			goto out;
		nfs4_check_remap(mi, vp1, NFS4_REMAP_CKATTRS, &e);
	}

	if (vp2 != NULL) {
		if (error = nfs4_check_recov_err(vp2, op, rsp, rerr_cnt, "vp2"))
			goto out;
		nfs4_check_remap(mi, vp2, NFS4_REMAP_CKATTRS, &e);
	}

	/*
	 * The lock order calls for us to acquire s_recovlock before
	 * mi_recovlock, but we have to hold mi_recovlock to look up sp (to
	 * prevent races with the failover/migration code).  So acquire
	 * mi_recovlock, look up sp, drop mi_recovlock, acquire
	 * s_recovlock and mi_recovlock, then verify that sp is still the
	 * right object.  XXX Can we find a simpler way to deal with this?
	 */
	if (nfs_rw_enter_sig(&mi->mi_recovlock, RW_READER,
	    mi->mi_flags & MI4_INT)) {
		error = EINTR;
		goto out;
	}
get_sp:
	sp = find_nfs4_server(mi);
	if (sp != NULL) {
		/* count this thread as an in-progress OTW call on sp */
		sp->s_otw_call_count++;
		mutex_exit(&sp->s_lock);
		droplock_cnt = mi->mi_srvset_cnt;
	}
	nfs_rw_exit(&mi->mi_recovlock);

	if (sp != NULL) {
		if (nfs_rw_enter_sig(&sp->s_recovlock, RW_READER,
		    mi->mi_flags & MI4_INT)) {
			error = EINTR;
			goto out;
		}
	}
	if (nfs_rw_enter_sig(&mi->mi_recovlock, RW_READER,
	    mi->mi_flags & MI4_INT)) {
		if (sp != NULL)
			nfs_rw_exit(&sp->s_recovlock);
		error = EINTR;
		goto out;
	}
	/*
	 * If the mntinfo4_t hasn't changed nfs4_server_t's (mi_srvset_cnt
	 * is unchanged since we dropped mi_recovlock), then there's no
	 * point in double checking to make sure it has switched.
	 */
	if (sp == NULL || droplock_cnt != mi->mi_srvset_cnt) {
		tsp = find_nfs4_server(mi);
		if (tsp != sp) {
			/* try again */
			if (tsp != NULL) {
				mutex_exit(&tsp->s_lock);
				nfs4_server_rele(tsp);
				tsp = NULL;
			}
			if (sp != NULL) {
				/* undo the hold and call count on stale sp */
				nfs_rw_exit(&sp->s_recovlock);
				mutex_enter(&sp->s_lock);
				sp->s_otw_call_count--;
				mutex_exit(&sp->s_lock);
				nfs4_server_rele(sp);
				sp = NULL;
			}
			goto get_sp;
		} else {
			if (tsp != NULL) {
				mutex_exit(&tsp->s_lock);
				nfs4_server_rele(tsp);
				tsp = NULL;
			}
		}
	}

	if (sp != NULL) {
		rsp->rs_sp = sp;
	}

	/*
	 * If the fileystem uses volatile filehandles, obtain a lock so
	 * that we synchronize with renames.  Exception: mount operations
	 * can change mi_fh_expire_type, which could be a problem, since
	 * the end_op code needs to be consistent with the start_op code
	 * about mi_rename_lock.  Since mounts don't compete with renames,
	 * it's simpler to just not acquire the rename lock for mounts.
	 */
	if (NFS4_VOLATILE_FH(mi) && op != OH_MOUNT) {
		if (nfs_rw_enter_sig(&mi->mi_rename_lock,
		    op == OH_VFH_RENAME ? RW_WRITER : RW_READER,
		    mi->mi_flags & MI4_INT)) {
			nfs_rw_exit(&mi->mi_recovlock);
			if (sp != NULL)
				nfs_rw_exit(&sp->s_recovlock);
			error = EINTR;
			goto out;
		}
		rsp->rs_flags |= NFS4_RS_RENAME_HELD;
	}

	if (OH_IS_STATE_RELE(op)) {
		/*
		 * For forced unmount, letting the request proceed will
		 * almost always delay response to the user, so hand it off
		 * to the recovery thread.  For exiting lwp's, we don't
		 * have a good way to tell if the request will hang.  We
		 * generally want processes to handle their own requests so
		 * that they can be done in parallel, but if there is
		 * already a recovery thread, hand the request off to it.
		 * This will improve user response at no cost to overall
		 * system throughput.  For zone shutdown, we'd prefer
		 * the recovery thread to handle this as well.
		 */
		ASSERT(startrecovp != NULL);
		mutex_enter(&mi->mi_lock);
		if (FS_OR_ZONE_GONE4(mi->mi_vfsp))
			*startrecovp = TRUE;
		else if ((curthread->t_proc_flag & TP_LWPEXIT) &&
		    (mi->mi_flags & MI4_RECOV_ACTIV))
			*startrecovp = TRUE;
		else
			*startrecovp = FALSE;
		mutex_exit(&mi->mi_lock);
	} else
		if (startrecovp != NULL)
			*startrecovp = FALSE;

	ASSERT(error == 0);
	return (error);

out:
	ASSERT(error != 0);
	if (sp != NULL) {
		mutex_enter(&sp->s_lock);
		sp->s_otw_call_count--;
		mutex_exit(&sp->s_lock);
		nfs4_server_rele(sp);
		rsp->rs_sp = NULL;
	}
	nfs4_end_op_recall(vp1, vp2, rsp);

#ifdef DEBUG
	(void) tsd_set(nfs4_tsd_key, NULL);
#endif
	return (error);
}

/*
 * It is up to the caller to determine if rsp->rs_sp being NULL
 * is detrimental or not.
 */
int
nfs4_start_op(mntinfo4_t *mi, vnode_t *vp1, vnode_t *vp2,
    nfs4_recov_state_t *rsp)
{
	ASSERT(rsp->rs_num_retry_despite_err == 0);
	rsp->rs_num_retry_despite_err = 0;
	return (nfs4_start_fop(mi, vp1, vp2, OH_OTHER, rsp, NULL));
}

/*
 * Release any resources acquired by nfs4_start_op().
 * 'sp' should be the nfs4_server pointer returned by nfs4_start_op().
 *
 * The operation hint is used to avoid a deadlock by bypassing delegation
 * return logic for writes, which are done while returning a delegation.
 */

void
nfs4_end_fop(mntinfo4_t *mi, vnode_t *vp1, vnode_t *vp2, nfs4_op_hint_t op,
    nfs4_recov_state_t *rsp, bool_t needs_recov)
{
	nfs4_server_t *sp = rsp->rs_sp;
	rnode4_t *rp = NULL;

#ifdef lint
	/*
	 * The op hint isn't used any more, but might be in
	 * the future.
	 */
	op = op;
#endif

#ifdef DEBUG
	/* must pair with a preceding nfs4_start_fop() in this thread */
	ASSERT(tsd_get(nfs4_tsd_key) != NULL);
	(void) tsd_set(nfs4_tsd_key, NULL);
#endif

	nfs4_end_op_recall(vp1, vp2, rsp);

	if (rsp->rs_flags & NFS4_RS_RENAME_HELD)
		nfs_rw_exit(&mi->mi_rename_lock);

	if (!needs_recov) {
		if (rsp->rs_flags & NFS4_RS_DELAY_MSG) {
			/* may need to clear the delay interval */
			if (vp1 != NULL) {
				rp = VTOR4(vp1);
				mutex_enter(&rp->r_statelock);
				rp->r_delay_interval = 0;
				mutex_exit(&rp->r_statelock);
			}
		}
		rsp->rs_flags &= ~(NFS4_RS_GRACE_MSG|NFS4_RS_DELAY_MSG);
	}

	/*
	 * If the corresponding nfs4_start_op() found a sp,
	 * then there must still be a sp.
	 */
	if (sp != NULL) {
		/* release in the reverse of the acquisition order */
		nfs_rw_exit(&mi->mi_recovlock);
		nfs_rw_exit(&sp->s_recovlock);
		mutex_enter(&sp->s_lock);
		sp->s_otw_call_count--;
		/* wake any thread waiting for OTW calls to drain */
		cv_broadcast(&sp->s_cv_otw_count);
		mutex_exit(&sp->s_lock);
		nfs4_server_rele(sp);
	} else {
		nfs_rw_exit(&mi->mi_recovlock);
	}
}

void
nfs4_end_op(mntinfo4_t *mi, vnode_t *vp1, vnode_t *vp2,
    nfs4_recov_state_t *rsp, bool_t needrecov)
{
	nfs4_end_fop(mi, vp1, vp2, OH_OTHER, rsp, needrecov);
}

/*
 * If the filesystem is going through client recovery, block until
 * finished.
 * Exceptions:
 *  - state-releasing ops (CLOSE, LOCKU, DELEGRETURN) are allowed to proceed
 *    if the filesystem has been forcibly unmounted or the lwp is exiting.
 *
 * Return value:
 * - 0 if no errors
 * - EINTR if the call was interrupted
 * - EIO if the filesystem has been forcibly unmounted (non-state-releasing
 *   op)
 * - the errno value from the recovery thread, if recovery failed
 */

static int
wait_for_recovery(mntinfo4_t *mi, nfs4_op_hint_t op_hint)
{
	int error = 0;

	mutex_enter(&mi->mi_lock);

	/*
	 * Block while any recovery flags remain set, unless waiting has
	 * become pointless: the filesystem is gone or recovery has
	 * already failed, or this is a state-releasing op in an lwp that
	 * is on its way out.
	 */
	while (mi->mi_recovflags != 0) {
		klwp_t *lwp = ttolwp(curthread);

		if ((mi->mi_vfsp->vfs_flag & VFS_UNMOUNTED) ||
		    (mi->mi_flags & MI4_RECOV_FAIL))
			break;
		if (OH_IS_STATE_RELE(op_hint) &&
		    (curthread->t_proc_flag & TP_LWPEXIT))
			break;

		/*
		 * NOTE(review): lwp_nostop is bumped around the sleep,
		 * presumably to keep the lwp from being stopped (e.g.
		 * via /proc) while blocked here -- confirm intent.
		 */
		if (lwp != NULL)
			lwp->lwp_nostop++;
		/* XXX - use different cv? */
		if (cv_wait_sig(&mi->mi_failover_cv, &mi->mi_lock) == 0) {
			/* interrupted by a signal */
			error = EINTR;
			if (lwp != NULL)
				lwp->lwp_nostop--;
			break;
		}
		if (lwp != NULL)
			lwp->lwp_nostop--;
	}

	/*
	 * Even after a clean wakeup, report failure if the filesystem is
	 * unmounted (non-state-releasing ops only) or recovery failed.
	 */
	if ((mi->mi_vfsp->vfs_flag & VFS_UNMOUNTED) &&
	    !OH_IS_STATE_RELE(op_hint)) {
		NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
		    "wait_for_recovery: forced unmount"));
		error = EIO;
	} else if (mi->mi_flags & MI4_RECOV_FAIL) {
		NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
		    "wait_for_recovery: fail since RECOV FAIL"));
		error = mi->mi_error;
	}

	mutex_exit(&mi->mi_lock);

	return (error);
}

/*
 * If the client received NFS4ERR_GRACE for this particular mount,
 * the client blocks here until it is time to try again.
 *
 * Return value:
 * - 0 if wait was successful
 * - EINTR if the call was interrupted
 */

int
nfs4_wait_for_grace(mntinfo4_t *mi, nfs4_recov_state_t *rsp)
{
	int error = 0;
	time_t curtime, time_to_wait;

	/* do a unprotected check to reduce mi_lock contention */
	if (mi->mi_grace_wait != 0) {
		mutex_enter(&mi->mi_lock);

		/* re-check under the lock */
		if (mi->mi_grace_wait != 0) {
			if (!(rsp->rs_flags & NFS4_RS_GRACE_MSG))
				rsp->rs_flags |= NFS4_RS_GRACE_MSG;

			curtime = gethrestime_sec();

			if (curtime < mi->mi_grace_wait) {

				time_to_wait = mi->mi_grace_wait - curtime;

				/*
				 * Drop mi_lock for the duration of the
				 * sleep so other threads aren't blocked.
				 */
				mutex_exit(&mi->mi_lock);

				delay(SEC_TO_TICK(time_to_wait));

				curtime = gethrestime_sec();

				mutex_enter(&mi->mi_lock);

				if (curtime >= mi->mi_grace_wait)
					mi->mi_grace_wait = 0;
			} else {
				mi->mi_grace_wait = 0;
			}
		}
		mutex_exit(&mi->mi_lock);
	}

	/*
	 * NOTE(review): despite the EINTR case documented above, 'error'
	 * is never set here (delay() is not interruptible), so this
	 * always returns 0 -- confirm whether comment or code is intent.
	 */
	return (error);
}

/*
 * If the client received NFS4ERR_DELAY for an operation on a vnode,
 * the client blocks here until it is time to try again.
 *
 * Return value:
 * - 0 if wait was successful
 * - EINTR if the call was interrupted
 */

int
nfs4_wait_for_delay(vnode_t *vp, nfs4_recov_state_t *rsp)
{
	int error = 0;
	time_t curtime, time_to_wait;
	rnode4_t *rp;

	ASSERT(vp != NULL);

	rp = VTOR4(vp);

	/* do a unprotected check to reduce r_statelock contention */
	if (rp->r_delay_wait != 0) {
		mutex_enter(&rp->r_statelock);

		/* re-check under the lock */
		if (rp->r_delay_wait != 0) {

			if (!(rsp->rs_flags & NFS4_RS_DELAY_MSG)) {
				/* count the delay in the mount's kstats */
				rsp->rs_flags |= NFS4_RS_DELAY_MSG;
				nfs4_mi_kstat_inc_delay(VTOMI4(vp));
			}

			curtime = gethrestime_sec();

			if (curtime < rp->r_delay_wait) {

				time_to_wait = rp->r_delay_wait - curtime;

				/*
				 * Drop r_statelock for the duration of
				 * the sleep.
				 */
				mutex_exit(&rp->r_statelock);

				delay(SEC_TO_TICK(time_to_wait));

				curtime = gethrestime_sec();

				mutex_enter(&rp->r_statelock);

				if (curtime >= rp->r_delay_wait)
					rp->r_delay_wait = 0;
			} else {
				rp->r_delay_wait = 0;
			}
		}
		mutex_exit(&rp->r_statelock);
	}

	/*
	 * NOTE(review): as with nfs4_wait_for_grace(), 'error' is never
	 * set, so the documented EINTR case cannot occur -- confirm.
	 */
	return (error);
}

/*
 * The recovery thread.
 */

/*
 * Body of the per-filesystem recovery thread (runs as a zone thread and
 * exits via zthread_exit()).  Loops until no recovery work remains for
 * 'mi' or recovery fails, then cleans up: returns recalled delegations,
 * logs completion via recov_done(), releases the vnode holds carried in
 * 'recovp', drops mi_in_recovery (waking waiters on mi_cv_in_recov),
 * releases the VFS and mntinfo4 references, and frees 'recovp'.
 */
static void
nfs4_recov_thread(recov_info_t *recovp)
{
	mntinfo4_t *mi = recovp->rc_mi;
	nfs4_server_t *sp;
	int done = 0, error = 0;
	bool_t recov_fail = FALSE;
	callb_cpr_t cpr_info;	/* CPR (suspend/resume) bookkeeping */
	kmutex_t cpr_lock;

	nfs4_queue_event(RE_START, mi, NULL, mi->mi_recovflags,
	    recovp->rc_vp1, recovp->rc_vp2, 0, NULL, 0, TAG_NONE, TAG_NONE,
	    0, 0);

	mutex_init(&cpr_lock, NULL, MUTEX_DEFAULT, NULL);
	CALLB_CPR_INIT(&cpr_info, &cpr_lock, callb_generic_cpr, "nfsv4Recov");

	mutex_enter(&mi->mi_lock);
	mi->mi_recovthread = curthread;
	mutex_exit(&mi->mi_lock);

	/*
	 * We don't really need protection here against failover or
	 * migration, since the current thread is the one that would make
	 * any changes, but hold mi_recovlock anyway for completeness (and
	 * to satisfy any ASSERTs).
	 */
	(void) nfs_rw_enter_sig(&mi->mi_recovlock, RW_READER, 0);
	sp = find_nfs4_server(mi);
	if (sp != NULL)
		mutex_exit(&sp->s_lock);
	nfs_rw_exit(&mi->mi_recovlock);

	/*
	 * Do any necessary recovery, based on the information in recovp
	 * and any recovery flags.
	 */

	do {
		mutex_enter(&mi->mi_lock);
		if (FS_OR_ZONE_GONE4(mi->mi_vfsp)) {
			bool_t activesrv;

			NFS4_DEBUG(nfs4_client_recov_debug &&
			    mi->mi_vfsp->vfs_flag & VFS_UNMOUNTED, (CE_NOTE,
			    "nfs4_recov_thread: file system has been "
			    "unmounted"));
			NFS4_DEBUG(nfs4_client_recov_debug &&
			    zone_status_get(curproc->p_zone) >=
			    ZONE_IS_SHUTTING_DOWN, (CE_NOTE,
			    "nfs4_recov_thread: zone shutting down"));
			/*
			 * If the server has lost its state for us and
			 * the filesystem is unmounted, then the filesystem
			 * can be tossed, even if there are lost lock or
			 * lost state calls in the recovery queue.
			 */
			if (mi->mi_recovflags &
			    (MI4R_NEED_CLIENTID | MI4R_REOPEN_FILES)) {
				NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
				    "nfs4_recov_thread: bailing out"));
				mi->mi_flags |= MI4_RECOV_FAIL;
				mi->mi_error = recovp->rc_error;
				recov_fail = TRUE;
			}
			/*
			 * We don't know if the server has any state for
			 * us, and the filesystem has been unmounted.  If
			 * there are "lost state" recovery items, keep
			 * trying to process them until there are no more
			 * mounted filesystems for the server.  Otherwise,
			 * bail out.  The reason we don't mark the
			 * filesystem as failing recovery is in case we
			 * have to do "lost state" recovery later (e.g., a
			 * user process exits).
			 */
			if (!(mi->mi_recovflags & MI4R_LOST_STATE)) {
				done = 1;
				mutex_exit(&mi->mi_lock);
				break;
			}
			mutex_exit(&mi->mi_lock);

			if (sp == NULL)
				activesrv = FALSE;
			else {
				mutex_enter(&sp->s_lock);
				activesrv = nfs4_fs_active(sp);
			}
			if (!activesrv) {
				NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
				    "no active fs for server %p",
				    (void *)sp));
				mutex_enter(&mi->mi_lock);
				mi->mi_flags |= MI4_RECOV_FAIL;
				mi->mi_error = recovp->rc_error;
				mutex_exit(&mi->mi_lock);
				recov_fail = TRUE;
				if (sp != NULL) {
					/*
					 * Mark the server instance as
					 * dead, so that nobody will attach
					 * a new filesystem.
					 */
					nfs4_mark_srv_dead(sp);
				}
			}
			if (sp != NULL)
				mutex_exit(&sp->s_lock);
		} else {
			mutex_exit(&mi->mi_lock);
		}

		/*
		 * Check if we need to select a new server for a
		 * failover.  Choosing a new server will force at
		 * least a check of the clientid.
		 */
		mutex_enter(&mi->mi_lock);
		if (!recov_fail &&
		    (mi->mi_recovflags & MI4R_NEED_NEW_SERVER)) {
			mutex_exit(&mi->mi_lock);
			recov_newserver(recovp, &sp, &recov_fail);
		} else
			mutex_exit(&mi->mi_lock);

		/*
		 * Check if we need to recover the clientid.  This
		 * must be done before file and lock recovery, and it
		 * potentially affects the recovery threads for other
		 * filesystems, so it gets special treatment.
		 */
		if (sp != NULL && recov_fail == FALSE) {
			mutex_enter(&sp->s_lock);
			if (!(sp->s_flags & N4S_CLIENTID_SET)) {
				mutex_exit(&sp->s_lock);
				recov_clientid(recovp, sp);
			} else {
				/*
				 * Unset this flag in case another recovery
				 * thread successfully recovered the clientid
				 * for us already.
				 */
				mutex_enter(&mi->mi_lock);
				mi->mi_recovflags &= ~MI4R_NEED_CLIENTID;
				mutex_exit(&mi->mi_lock);
				mutex_exit(&sp->s_lock);
			}
		}

		/*
		 * Check if we need to get the security information.
		 */
		mutex_enter(&mi->mi_lock);
		if ((mi->mi_recovflags & MI4R_NEED_SECINFO) &&
		    !(mi->mi_flags & MI4_RECOV_FAIL)) {
			mutex_exit(&mi->mi_lock);
			(void) nfs_rw_enter_sig(&mi->mi_recovlock,
			    RW_WRITER, 0);
			error = nfs4_secinfo_recov(recovp->rc_mi,
			    recovp->rc_vp1, recovp->rc_vp2);
			/*
			 * If error, nothing more can be done, stop
			 * the recovery.
			 */
			if (error) {
				/*
				 * Per the recov_info_t contract, mi_error
				 * records the error that originally
				 * triggered recovery (rc_error), not the
				 * secinfo error.
				 */
				mutex_enter(&mi->mi_lock);
				mi->mi_flags |= MI4_RECOV_FAIL;
				mi->mi_error = recovp->rc_error;
				mutex_exit(&mi->mi_lock);
				nfs4_queue_event(RE_WRONGSEC, mi, NULL,
				    error, recovp->rc_vp1, recovp->rc_vp2,
				    0, NULL, 0, TAG_NONE, TAG_NONE, 0, 0);
			}
			nfs_rw_exit(&mi->mi_recovlock);
		} else
			mutex_exit(&mi->mi_lock);

		/*
		 * Check if there's a bad seqid to recover.
		 */
		mutex_enter(&mi->mi_lock);
		if ((mi->mi_recovflags & MI4R_BAD_SEQID) &&
		    !(mi->mi_flags & MI4_RECOV_FAIL)) {
			mutex_exit(&mi->mi_lock);
			(void) nfs_rw_enter_sig(&mi->mi_recovlock,
			    RW_WRITER, 0);
			recov_bad_seqid(recovp);
			nfs_rw_exit(&mi->mi_recovlock);
		} else
			mutex_exit(&mi->mi_lock);

		/*
		 * Next check for recovery that affects the entire
		 * filesystem.
		 */
		if (sp != NULL) {
			mutex_enter(&mi->mi_lock);
			if ((mi->mi_recovflags & MI4R_REOPEN_FILES) &&
			    !(mi->mi_flags & MI4_RECOV_FAIL)) {
				mutex_exit(&mi->mi_lock);
				recov_openfiles(recovp, sp);
			} else
				mutex_exit(&mi->mi_lock);
		}

		/*
		 * Send any queued state recovery requests.
		 */
		mutex_enter(&mi->mi_lock);
		if (sp != NULL &&
		    (mi->mi_recovflags & MI4R_LOST_STATE) &&
		    !(mi->mi_flags & MI4_RECOV_FAIL)) {
			mutex_exit(&mi->mi_lock);
			(void) nfs_rw_enter_sig(&mi->mi_recovlock,
			    RW_WRITER, 0);
			nfs4_resend_lost_rqsts(recovp, sp);
			if (list_head(&mi->mi_lost_state) == NULL) {
				/* done */
				mutex_enter(&mi->mi_lock);
				mi->mi_recovflags &= ~MI4R_LOST_STATE;
				mutex_exit(&mi->mi_lock);
			}
			nfs_rw_exit(&mi->mi_recovlock);
		} else {
			mutex_exit(&mi->mi_lock);
		}

		/*
		 * See if there is anything more to do.  If not, announce
		 * that we are done and exit.
		 *
		 * Need mi_recovlock to keep 'sp' valid.  Must grab
		 * mi_recovlock before mi_lock to preserve lock ordering.
		 */
		(void) nfs_rw_enter_sig(&mi->mi_recovlock, RW_READER, 0);
		mutex_enter(&mi->mi_lock);
		if ((mi->mi_recovflags & ~MI4R_SRV_REBOOT) == 0 ||
		    (mi->mi_flags & MI4_RECOV_FAIL)) {
			list_t local_lost_state;
			nfs4_lost_rqst_t *lrp;

			/*
			 * We need to remove the lost requests before we
			 * unmark the mi as no longer doing recovery to
			 * avoid a race with a new thread putting new lost
			 * requests on the same mi (and the going away
			 * thread would remove the new lost requests).
			 *
			 * Move the lost requests to a local list since
			 * nfs4_remove_lost_rqst() drops mi_lock, and
			 * dropping the mi_lock would make our check to
			 * see if recovery is done no longer valid.
			 */
			list_create(&local_lost_state,
			    sizeof (nfs4_lost_rqst_t),
			    offsetof(nfs4_lost_rqst_t, lr_node));
			list_move_tail(&local_lost_state, &mi->mi_lost_state);

			done = 1;
			mutex_exit(&mi->mi_lock);
			/*
			 * Now officially free the "moved"
			 * lost requests.
			 */
			while ((lrp = list_head(&local_lost_state)) != NULL) {
				list_remove(&local_lost_state, lrp);
				nfs4_free_lost_rqst(lrp, sp);
			}
			list_destroy(&local_lost_state);
		} else
			mutex_exit(&mi->mi_lock);
		nfs_rw_exit(&mi->mi_recovlock);

		/*
		 * If the filesystem has been forcibly unmounted, there is
		 * probably no point in retrying immediately.  Furthermore,
		 * there might be user processes waiting for a chance to
		 * queue up "lost state" requests, so that they can exit.
		 * So pause here for a moment.  Same logic for zone shutdown.
		 */
		if (!done && FS_OR_ZONE_GONE4(mi->mi_vfsp)) {
			mutex_enter(&mi->mi_lock);
			cv_broadcast(&mi->mi_failover_cv);
			mutex_exit(&mi->mi_lock);
			delay(SEC_TO_TICK(nfs4_unmount_delay));
		}

	} while (!done);

	if (sp != NULL)
		nfs4_server_rele(sp);

	/*
	 * Return all recalled delegations
	 */
	nfs4_dlistclean();

	mutex_enter(&mi->mi_lock);
	recov_done(mi, recovp);
	mutex_exit(&mi->mi_lock);

	/*
	 * Free up resources that were allocated for us.
	 */
	if (recovp->rc_vp1 != NULL)
		VN_RELE(recovp->rc_vp1);
	if (recovp->rc_vp2 != NULL)
		VN_RELE(recovp->rc_vp2);

	/* now we are done using the mi struct, signal the waiters */
	mutex_enter(&mi->mi_lock);
	mi->mi_in_recovery--;
	if (mi->mi_in_recovery == 0)
		cv_broadcast(&mi->mi_cv_in_recov);
	mutex_exit(&mi->mi_lock);

	VFS_RELE(mi->mi_vfsp);
	MI4_RELE(mi);
	kmem_free(recovp, sizeof (recov_info_t));
	mutex_enter(&cpr_lock);
	CALLB_CPR_EXIT(&cpr_info);
	mutex_destroy(&cpr_lock);
	zthread_exit();
}

/*
 * Log the end of recovery and notify any waiting threads.
 */

static void
recov_done(mntinfo4_t *mi, recov_info_t *recovp)
{

	ASSERT(MUTEX_HELD(&mi->mi_lock));

	nfs4_queue_event(RE_END, mi, NULL, 0, recovp->rc_vp1,
	    recovp->rc_vp2, 0, NULL, 0, TAG_NONE, TAG_NONE, 0, 0);
	mi->mi_recovthread = NULL;
	mi->mi_flags &= ~MI4_RECOV_ACTIV;
	mi->mi_recovflags &= ~MI4R_SRV_REBOOT;
	cv_broadcast(&mi->mi_failover_cv);
}

/*
 * State-specific recovery routines, by state.
 */

/*
 * Failover.
 *
 * Replaces *spp with a reference to the new server, which must
 * eventually be freed.
1557 */ 1558 1559 static void 1560 recov_newserver(recov_info_t *recovp, nfs4_server_t **spp, bool_t *recov_fail) 1561 { 1562 mntinfo4_t *mi = recovp->rc_mi; 1563 servinfo4_t *svp = NULL; 1564 nfs4_server_t *osp = *spp; 1565 CLIENT *cl; 1566 enum clnt_stat status; 1567 struct timeval tv; 1568 int error; 1569 int oncethru = 0; 1570 rnode4_t *rp; 1571 int index; 1572 nfs_fh4 fh; 1573 char *snames; 1574 size_t len; 1575 1576 (void) nfs_rw_enter_sig(&mi->mi_recovlock, RW_WRITER, 0); 1577 1578 tv.tv_sec = 2; 1579 tv.tv_usec = 0; 1580 1581 #ifdef lint 1582 /* 1583 * Lint can't follow the logic, so thinks that snames and len 1584 * can be used before being set. They can't, but lint can't 1585 * figure it out. To address the lint warning, initialize 1586 * snames and len for lint. 1587 */ 1588 snames = NULL; 1589 len = 0; 1590 #endif 1591 1592 /* 1593 * Ping the null NFS procedure of every server in 1594 * the list until one responds. We always start 1595 * at the head of the list and always skip the one 1596 * that is current, since it's caused us a problem. 
1597 */ 1598 while (svp == NULL) { 1599 for (svp = mi->mi_servers; svp; svp = svp->sv_next) { 1600 1601 mutex_enter(&mi->mi_lock); 1602 if (FS_OR_ZONE_GONE4(mi->mi_vfsp)) { 1603 mi->mi_flags |= MI4_RECOV_FAIL; 1604 mutex_exit(&mi->mi_lock); 1605 (void) nfs_rw_exit(&mi->mi_recovlock); 1606 *recov_fail = TRUE; 1607 if (oncethru) 1608 kmem_free(snames, len); 1609 return; 1610 } 1611 mutex_exit(&mi->mi_lock); 1612 1613 (void) nfs_rw_enter_sig(&svp->sv_lock, RW_READER, 0); 1614 if (svp->sv_flags & SV4_NOTINUSE) { 1615 nfs_rw_exit(&svp->sv_lock); 1616 continue; 1617 } 1618 nfs_rw_exit(&svp->sv_lock); 1619 1620 if (!oncethru && svp == mi->mi_curr_serv) 1621 continue; 1622 1623 error = clnt_tli_kcreate(svp->sv_knconf, &svp->sv_addr, 1624 NFS_PROGRAM, NFS_V4, 0, 1, CRED(), &cl); 1625 if (error) 1626 continue; 1627 1628 if (!(mi->mi_flags & MI4_INT)) 1629 cl->cl_nosignal = TRUE; 1630 status = CLNT_CALL(cl, RFS_NULL, xdr_void, NULL, 1631 xdr_void, NULL, tv); 1632 if (!(mi->mi_flags & MI4_INT)) 1633 cl->cl_nosignal = FALSE; 1634 AUTH_DESTROY(cl->cl_auth); 1635 CLNT_DESTROY(cl); 1636 if (status == RPC_SUCCESS) { 1637 nfs4_queue_event(RE_FAILOVER, mi, 1638 svp == mi->mi_curr_serv ? 
NULL : 1639 svp->sv_hostname, 0, NULL, NULL, 0, 1640 NULL, 0, TAG_NONE, TAG_NONE, 0, 0); 1641 break; 1642 } 1643 } 1644 1645 if (svp == NULL) { 1646 if (!oncethru) { 1647 snames = nfs4_getsrvnames(mi, &len); 1648 nfs4_queue_fact(RF_SRVS_NOT_RESPOND, mi, 1649 0, 0, 0, FALSE, snames, 0, NULL); 1650 oncethru = 1; 1651 } 1652 delay(hz); 1653 } 1654 } 1655 1656 if (oncethru) { 1657 nfs4_queue_fact(RF_SRVS_OK, mi, 0, 0, 0, FALSE, snames, 1658 0, NULL); 1659 kmem_free(snames, len); 1660 } 1661 1662 #if DEBUG 1663 (void) nfs_rw_enter_sig(&svp->sv_lock, RW_READER, 0); 1664 ASSERT((svp->sv_flags & SV4_NOTINUSE) == 0); 1665 nfs_rw_exit(&svp->sv_lock); 1666 #endif 1667 1668 mutex_enter(&mi->mi_lock); 1669 mi->mi_recovflags &= ~MI4R_NEED_NEW_SERVER; 1670 if (svp != mi->mi_curr_serv) { 1671 servinfo4_t *osvp = mi->mi_curr_serv; 1672 1673 mutex_exit(&mi->mi_lock); 1674 1675 /* 1676 * Update server-dependent fields in the root vnode. 1677 */ 1678 index = rtable4hash(mi->mi_rootfh); 1679 rw_enter(&rtable4[index].r_lock, RW_WRITER); 1680 1681 rp = r4find(&rtable4[index], mi->mi_rootfh, mi->mi_vfsp); 1682 if (rp != NULL) { 1683 NFS4_DEBUG(nfs4_client_failover_debug, (CE_NOTE, 1684 "recov_newserver: remapping %s", rnode4info(rp))); 1685 mutex_enter(&rp->r_statelock); 1686 rp->r_server = svp; 1687 PURGE_ATTRCACHE4_LOCKED(rp); 1688 mutex_exit(&rp->r_statelock); 1689 (void) nfs4_free_data_reclaim(rp); 1690 nfs4_purge_rddir_cache(RTOV4(rp)); 1691 rw_exit(&rtable4[index].r_lock); 1692 NFS4_DEBUG(nfs4_client_failover_debug, (CE_NOTE, 1693 "recov_newserver: done with %s", 1694 rnode4info(rp))); 1695 VN_RELE(RTOV4(rp)); 1696 } else 1697 rw_exit(&rtable4[index].r_lock); 1698 (void) dnlc_purge_vfsp(mi->mi_vfsp, 0); 1699 1700 mutex_enter(&mi->mi_lock); 1701 mi->mi_recovflags |= MI4R_REOPEN_FILES | MI4R_REMAP_FILES; 1702 if (recovp->rc_srv_reboot) 1703 mi->mi_recovflags |= MI4R_SRV_REBOOT; 1704 mi->mi_curr_serv = svp; 1705 mi->mi_failover++; 1706 mi->mi_flags &= ~MI4_BADOWNER_DEBUG; 1707 
mutex_exit(&mi->mi_lock); 1708 1709 (void) nfs_rw_enter_sig(&svp->sv_lock, RW_READER, 0); 1710 fh.nfs_fh4_len = svp->sv_fhandle.fh_len; 1711 fh.nfs_fh4_val = svp->sv_fhandle.fh_buf; 1712 sfh4_update(mi->mi_rootfh, &fh); 1713 fh.nfs_fh4_len = svp->sv_pfhandle.fh_len; 1714 fh.nfs_fh4_val = svp->sv_pfhandle.fh_buf; 1715 sfh4_update(mi->mi_srvparentfh, &fh); 1716 nfs_rw_exit(&svp->sv_lock); 1717 1718 *spp = nfs4_move_mi(mi, osvp, svp); 1719 if (osp != NULL) 1720 nfs4_server_rele(osp); 1721 } else 1722 mutex_exit(&mi->mi_lock); 1723 (void) nfs_rw_exit(&mi->mi_recovlock); 1724 } 1725 1726 /* 1727 * Clientid. 1728 */ 1729 1730 static void 1731 recov_clientid(recov_info_t *recovp, nfs4_server_t *sp) 1732 { 1733 mntinfo4_t *mi = recovp->rc_mi; 1734 int error = 0; 1735 int still_stale; 1736 int need_new_s; 1737 1738 ASSERT(sp != NULL); 1739 1740 /* 1741 * Acquire the recovery lock and then verify that the clientid 1742 * still needs to be recovered. (Note that s_recovlock is supposed 1743 * to be acquired before s_lock.) Since the thread holds the 1744 * recovery lock, no other thread will recover the clientid. 1745 */ 1746 (void) nfs_rw_enter_sig(&sp->s_recovlock, RW_WRITER, 0); 1747 (void) nfs_rw_enter_sig(&mi->mi_recovlock, RW_WRITER, 0); 1748 mutex_enter(&sp->s_lock); 1749 still_stale = ((sp->s_flags & N4S_CLIENTID_SET) == 0); 1750 mutex_exit(&sp->s_lock); 1751 1752 if (still_stale) { 1753 nfs4_error_t n4e; 1754 1755 nfs4_error_zinit(&n4e); 1756 nfs4setclientid(mi, kcred, TRUE, &n4e); 1757 error = n4e.error; 1758 if (error != 0) { 1759 1760 /* 1761 * nfs4setclientid may have set MI4R_NEED_NEW_SERVER, 1762 * if so, just return and let recov_thread drive 1763 * failover. 
1764 */ 1765 mutex_enter(&mi->mi_lock); 1766 need_new_s = mi->mi_recovflags & MI4R_NEED_NEW_SERVER; 1767 mutex_exit(&mi->mi_lock); 1768 1769 if (need_new_s) { 1770 nfs_rw_exit(&mi->mi_recovlock); 1771 nfs_rw_exit(&sp->s_recovlock); 1772 return; 1773 } 1774 1775 nfs4_queue_event(RE_CLIENTID, mi, NULL, n4e.error, NULL, 1776 NULL, n4e.stat, NULL, 0, TAG_NONE, TAG_NONE, 0, 0); 1777 mutex_enter(&mi->mi_lock); 1778 mi->mi_flags |= MI4_RECOV_FAIL; 1779 mi->mi_error = recovp->rc_error; 1780 mutex_exit(&mi->mi_lock); 1781 /* don't destroy the nfs4_server, let umount do it */ 1782 } 1783 } 1784 1785 if (error == 0) { 1786 mutex_enter(&mi->mi_lock); 1787 mi->mi_recovflags &= ~MI4R_NEED_CLIENTID; 1788 /* 1789 * If still_stale isn't true, then another thread already 1790 * recovered the clientid. And that thread that set the 1791 * clientid will have initiated reopening files on all the 1792 * filesystems for the server, so we should not initiate 1793 * reopening for this filesystem here. 1794 */ 1795 if (still_stale) { 1796 mi->mi_recovflags |= MI4R_REOPEN_FILES; 1797 if (recovp->rc_srv_reboot) 1798 mi->mi_recovflags |= MI4R_SRV_REBOOT; 1799 } 1800 mutex_exit(&mi->mi_lock); 1801 } 1802 1803 nfs_rw_exit(&mi->mi_recovlock); 1804 1805 if (error != 0) { 1806 nfs_rw_exit(&sp->s_recovlock); 1807 mutex_enter(&mi->mi_lock); 1808 if ((mi->mi_flags & MI4_RECOV_FAIL) == 0) 1809 delay(SEC_TO_TICK(recov_err_delay)); 1810 mutex_exit(&mi->mi_lock); 1811 } else { 1812 mntinfo4_t **milist; 1813 mntinfo4_t *tmi; 1814 int nummi, i; 1815 1816 /* 1817 * Initiate recovery of open files for other filesystems. 1818 * We create an array of filesystems, rather than just 1819 * walking the filesystem list, to avoid deadlock issues 1820 * with s_lock and mi_recovlock. 
1821 */ 1822 milist = make_milist(sp, &nummi); 1823 for (i = 0; i < nummi; i++) { 1824 tmi = milist[i]; 1825 if (tmi != mi) { 1826 (void) nfs_rw_enter_sig(&tmi->mi_recovlock, 1827 RW_READER, 0); 1828 start_recovery_action(NR_OPENFILES, TRUE, tmi, 1829 NULL, NULL); 1830 nfs_rw_exit(&tmi->mi_recovlock); 1831 } 1832 } 1833 free_milist(milist, nummi); 1834 1835 nfs_rw_exit(&sp->s_recovlock); 1836 } 1837 } 1838 1839 /* 1840 * Return an array of filesystems associated with the given server. The 1841 * caller should call free_milist() to free the references and memory. 1842 */ 1843 1844 static mntinfo4_t ** 1845 make_milist(nfs4_server_t *sp, int *nummip) 1846 { 1847 int nummi, i; 1848 mntinfo4_t **milist; 1849 mntinfo4_t *tmi; 1850 1851 mutex_enter(&sp->s_lock); 1852 nummi = 0; 1853 for (tmi = sp->mntinfo4_list; tmi != NULL; tmi = tmi->mi_clientid_next) 1854 nummi++; 1855 1856 milist = kmem_alloc(nummi * sizeof (mntinfo4_t *), KM_SLEEP); 1857 1858 for (i = 0, tmi = sp->mntinfo4_list; tmi != NULL; i++, 1859 tmi = tmi->mi_clientid_next) { 1860 milist[i] = tmi; 1861 VFS_HOLD(tmi->mi_vfsp); 1862 } 1863 mutex_exit(&sp->s_lock); 1864 1865 *nummip = nummi; 1866 return (milist); 1867 } 1868 1869 /* 1870 * Free the filesystem list created by make_milist(). 1871 */ 1872 1873 static void 1874 free_milist(mntinfo4_t **milist, int nummi) 1875 { 1876 mntinfo4_t *tmi; 1877 int i; 1878 1879 for (i = 0; i < nummi; i++) { 1880 tmi = milist[i]; 1881 VFS_RELE(tmi->mi_vfsp); 1882 } 1883 kmem_free(milist, nummi * sizeof (mntinfo4_t *)); 1884 } 1885 1886 /* 1887 * Filehandle 1888 */ 1889 1890 /* 1891 * Lookup the filehandle for the given vnode and update the rnode if it has 1892 * changed. 1893 * 1894 * Errors: 1895 * - if the filehandle could not be updated because of an error that 1896 * requires further recovery, initiate that recovery and return. 1897 * - if the filehandle could not be updated because of a signal, pretend we 1898 * succeeded and let someone else deal with it. 
 * - if the filehandle could not be updated and the filesystem has been
 *   forcibly unmounted, pretend we succeeded, and let the caller deal with
 *   the forced unmount (to retry or not to retry, that is the question).
 * - if the filehandle could not be updated because of some other error,
 *   mark the rnode bad and return.
 */
static void
recov_filehandle(nfs4_recov_t action, mntinfo4_t *mi, vnode_t *vp)
{
	rnode4_t *rp = VTOR4(vp);
	nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };
	bool_t needrecov;

	mutex_enter(&rp->r_statelock);

	/* nothing to do if the rnode is already dead */
	if (rp->r_flags & R4RECOVERR) {
		mutex_exit(&rp->r_statelock);
		return;
	}

	/*
	 * If someone else is updating the filehandle, wait for them to
	 * finish and then let our caller retry.
	 */
	if (rp->r_flags & R4RECEXPFH) {
		while (rp->r_flags & R4RECEXPFH) {
			cv_wait(&rp->r_cv, &rp->r_statelock);
		}
		mutex_exit(&rp->r_statelock);
		return;
	}
	/* claim the update; cleared (and broadcast) at norec below */
	rp->r_flags |= R4RECEXPFH;
	mutex_exit(&rp->r_statelock);

	if (action == NR_BADHANDLE) {
		/* shouldn't happen */
		nfs4_queue_event(RE_BADHANDLE, mi, NULL, 0,
		    vp, NULL, 0, NULL, 0, TAG_NONE, TAG_NONE, 0, 0);
	}

	nfs4_remap_file(mi, vp, 0, &e);
	needrecov = nfs4_needs_recovery(&e, FALSE, mi->mi_vfsp);

	/*
	 * If we get BADHANDLE, FHEXPIRED or STALE in their handler,
	 * something is broken.  Don't try to recover, just mark the
	 * file dead.
	 */
	DTRACE_PROBE2(recov__filehandle, nfs4_error_t, &e, vnode_t, vp);
	if (needrecov) {
		if (e.error == 0) {
			switch (e.stat) {
			case NFS4ERR_BADHANDLE:
			case NFS4ERR_FHEXPIRED:
			case NFS4ERR_STALE:
				goto norec;	/* Unrecoverable errors */
			default:
				break;
			}
		}
		(void) nfs4_start_recovery(&e, mi, vp, NULL,
		    NULL, NULL, OP_LOOKUP, NULL, NULL, NULL);

	} else if (e.error != EINTR &&
	    !NFS4_FRC_UNMT_ERR(e.error, mi->mi_vfsp) &&
	    (e.error != 0 || e.stat != NFS4_OK)) {
		/* some other failure: mark the rnode bad */
		nfs4_recov_fh_fail(vp, e.error, e.stat);
		/*
		 * Don't set r_error to ESTALE.  Higher-level code (e.g.,
		 * cstatat_getvp()) retries on ESTALE, which would cause
		 * an infinite loop.
		 */
	}
norec:
	/* release the R4RECEXPFH claim and wake anyone waiting on it */
	mutex_enter(&rp->r_statelock);
	rp->r_flags &= ~R4RECEXPFH;
	cv_broadcast(&rp->r_cv);
	mutex_exit(&rp->r_statelock);
}

/*
 * Stale Filehandle
 */

/*
 * A stale filehandle can happen when an individual file has
 * been removed, or when an entire filesystem has been taken
 * offline.
To distinguish these cases, we do this: 1987 * - if a GETATTR with the current filehandle is okay, we do 1988 * nothing (this can happen with two-filehandle ops) 1989 * - if the GETATTR fails, but a GETATTR of the root filehandle 1990 * succeeds, mark the rnode with R4STALE, which will stop use 1991 * - if the GETATTR fails, and a GETATTR of the root filehandle 1992 * also fails, we consider the problem filesystem-wide, so: 1993 * - if we can failover, we should 1994 * - if we can't failover, we should mark both the original 1995 * vnode and the root bad 1996 */ 1997 static void 1998 recov_stale(mntinfo4_t *mi, vnode_t *vp) 1999 { 2000 rnode4_t *rp = VTOR4(vp); 2001 vnode_t *rootvp = NULL; 2002 nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS }; 2003 nfs4_ga_res_t gar; 2004 char *fail_msg = "failed to recover from NFS4ERR_STALE"; 2005 bool_t needrecov; 2006 2007 mutex_enter(&rp->r_statelock); 2008 2009 if (rp->r_flags & R4RECOVERR) { 2010 mutex_exit(&rp->r_statelock); 2011 NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE, 2012 "recov_stale: already marked dead, rp %s", 2013 rnode4info(rp))); 2014 return; 2015 } 2016 2017 if (rp->r_flags & R4STALE) { 2018 mutex_exit(&rp->r_statelock); 2019 NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE, 2020 "recov_stale: already marked stale, rp %s", 2021 rnode4info(rp))); 2022 return; 2023 } 2024 2025 mutex_exit(&rp->r_statelock); 2026 2027 /* Try a GETATTR on this vnode */ 2028 nfs4_getattr_otw_norecovery(vp, &gar, &e, CRED(), 0); 2029 2030 /* 2031 * Handle non-STALE recoverable errors 2032 */ 2033 needrecov = nfs4_needs_recovery(&e, FALSE, vp->v_vfsp); 2034 if (needrecov) { 2035 if (e.error == 0) { 2036 switch (e.stat) { 2037 case NFS4ERR_STALE: 2038 case NFS4ERR_BADHANDLE: 2039 goto norec; /* Unrecoverable */ 2040 default: 2041 break; 2042 } 2043 } 2044 (void) nfs4_start_recovery(&e, mi, vp, NULL, 2045 NULL, NULL, OP_GETATTR, NULL, NULL, NULL); 2046 goto out; 2047 } 2048 norec: 2049 /* Are things OK for this vnode? 
*/ 2050 if (!e.error && e.stat == NFS4_OK) { 2051 NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE, 2052 "recov_stale: file appears fine, rp %s", 2053 rnode4info(rp))); 2054 goto out; 2055 } 2056 2057 /* Did we get an unrelated non-recoverable error? */ 2058 if (e.error || e.stat != NFS4ERR_STALE) { 2059 nfs4_fail_recov(vp, fail_msg, e.error, e.stat); 2060 NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE, 2061 "recov_stale: unrelated fatal error, rp %s", 2062 rnode4info(rp))); 2063 goto out; 2064 } 2065 2066 /* 2067 * If we don't appear to be dealing with the root node, find it. 2068 */ 2069 if ((vp->v_flag & VROOT) == 0) { 2070 nfs4_error_zinit(&e); 2071 e.error = VFS_ROOT(vp->v_vfsp, &rootvp); 2072 if (e.error) { 2073 nfs4_fail_recov(vp, fail_msg, 0, NFS4ERR_STALE); 2074 NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE, 2075 "recov_stale: can't find root node for rp %s", 2076 rnode4info(rp))); 2077 goto out; 2078 } 2079 } 2080 2081 /* Try a GETATTR on the root vnode */ 2082 if (rootvp != NULL) { 2083 nfs4_error_zinit(&e); 2084 nfs4_getattr_otw_norecovery(rootvp, &gar, &e, CRED(), 0); 2085 2086 needrecov = nfs4_needs_recovery(&e, FALSE, vp->v_vfsp); 2087 if (needrecov) { 2088 if (e.error == 0) { 2089 switch (e.stat) { 2090 case NFS4ERR_STALE: 2091 case NFS4ERR_BADHANDLE: 2092 goto unrec; /* Unrecoverable */ 2093 default: 2094 break; 2095 } 2096 } 2097 (void) nfs4_start_recovery(&e, mi, rootvp, NULL, 2098 NULL, NULL, OP_GETATTR, NULL, NULL, NULL); 2099 } 2100 unrec: 2101 /* 2102 * Check to see if a failover attempt is warranted 2103 * NB: nfs4_try_failover doesn't check for STALE 2104 * because recov_stale gets a shot first. Now that 2105 * recov_stale has failed, go ahead and try failover. 2106 * 2107 * If the getattr on the root filehandle was successful, 2108 * then mark recovery as failed for 'vp' and exit. 
2109 */ 2110 if (nfs4_try_failover(&e) == 0 && e.stat != NFS4ERR_STALE) { 2111 /* 2112 * pass the original error to fail_recov, not 2113 * the one from trying the root vnode. 2114 */ 2115 nfs4_fail_recov(vp, fail_msg, 0, NFS4ERR_STALE); 2116 NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE, 2117 "recov_stale: root node OK, marking " 2118 "dead rp %s", rnode4info(rp))); 2119 goto out; 2120 } 2121 } 2122 2123 /* 2124 * Here, we know that both the original file and the 2125 * root filehandle (which may be the same) are stale. 2126 * We want to fail over if we can, and if we can't, we 2127 * want to mark everything in sight bad. 2128 */ 2129 if (FAILOVER_MOUNT4(mi)) { 2130 mutex_enter(&mi->mi_lock); 2131 mi->mi_recovflags |= MI4R_NEED_NEW_SERVER; 2132 NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE, 2133 "recov_stale: failing over due to rp %s", 2134 rnode4info(rp))); 2135 mutex_exit(&mi->mi_lock); 2136 } else { 2137 rnode4_t *rootrp; 2138 servinfo4_t *svp; 2139 2140 /* 2141 * Can't fail over, so mark things dead. 2142 * 2143 * If rootvp is set, we know we have a distinct 2144 * non-root vnode which can be marked dead in 2145 * the usual way. 2146 * 2147 * Then we want to mark the root vnode dead. 2148 * Note that if rootvp wasn't set, our vp is 2149 * actually the root vnode. 2150 */ 2151 if (rootvp != NULL) { 2152 NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE, 2153 "recov_stale: can't fail over, marking dead rp %s", 2154 rnode4info(rp))); 2155 nfs4_fail_recov(vp, fail_msg, 0, NFS4ERR_STALE); 2156 } else { 2157 rootvp = vp; 2158 VN_HOLD(rootvp); 2159 } 2160 2161 /* 2162 * Mark root dead, but quietly - since 2163 * the root rnode is frequently recreated, 2164 * we can encounter this at every access. 2165 * Also mark recovery as failed on this VFS. 
		 */
		rootrp = VTOR4(rootvp);
		NFS4_DEBUG(nfs4_client_recov_debug, (CE_CONT,
		    "recov_stale: marking dead root rp %s",
		    rnode4info(rootrp)));
		mutex_enter(&rootrp->r_statelock);
		rootrp->r_flags |= (R4RECOVERR | R4STALE);
		rootrp->r_error = ESTALE;
		mutex_exit(&rootrp->r_statelock);
		mutex_enter(&mi->mi_lock);
		mi->mi_error = ESTALE;
		mutex_exit(&mi->mi_lock);

		svp = mi->mi_curr_serv;
		(void) nfs_rw_enter_sig(&svp->sv_lock, RW_WRITER, 0);
		svp->sv_flags |= SV4_ROOT_STALE;
		nfs_rw_exit(&svp->sv_lock);
	}

out:
	if (rootvp)
		VN_RELE(rootvp);
}

/*
 * Locks.
 */

/*
 * Reclaim all the active (acquired) locks for the given file.
 * If a process lost a lock, the process is sent a SIGLOST.  This is not
 * considered an error.
 *
 * Return values:
 * Errors and status are returned via the nfs4_error_t parameter
 * If an error indicates that recovery is needed, the caller is responsible
 * for dealing with it.
 */

static void
relock_file(vnode_t *vp, mntinfo4_t *mi, nfs4_error_t *ep,
    fattr4_change pre_change)
{
	locklist_t *locks, *llp;
	rnode4_t *rp;

	ASSERT(ep != NULL);
	nfs4_error_zinit(ep);

	/* With local locking only, there is no server state to reclaim. */
	if (VTOMI4(vp)->mi_flags & MI4_LLOCK)
		return;

	nfs4_flush_lock_owners(VTOR4(vp));

	/*
	 * If we get an error that requires recovery actions, just bail out
	 * and let the top-level recovery code handle it.
	 *
	 * If we get some other error, kill the process that owned the lock
	 * and mark its remaining locks (if any) as belonging to NOPID, so
	 * that we don't make any more reclaim requests for that process.
	 */

	rp = VTOR4(vp);
	locks = flk_active_locks_for_vp(vp);
	for (llp = locks; llp != NULL; llp = llp->ll_next) {
		int did_reclaim = 1;

		ASSERT(llp->ll_vp == vp);
		/*
		 * NOPID entries were already given up on (SIGLOST sent)
		 * by relock_skip_pid(); don't reclaim them again.
		 */
		if (llp->ll_flock.l_pid == NOPID)
			continue;
		reclaim_one_lock(vp, &llp->ll_flock, ep, &did_reclaim);
		/*
		 * If we need to restart recovery, stop processing the
		 * list.  Some errors would be recoverable under other
		 * circumstances, but if they happen here we just give up
		 * on the lock.
		 */
		if (nfs4_needs_recovery(ep, TRUE, vp->v_vfsp)) {
			if (ep->error != 0)
				break;
			if (!nfs4_recov_marks_dead(ep->stat))
				break;
		}
		/*
		 * In case the server isn't offering us a grace period, or
		 * if we missed it, we might have opened & locked from scratch,
		 * rather than reopened/reclaimed.
		 * We need to ensure that the object hadn't been otherwise
		 * changed during this time, by comparing the changeinfo.
		 * We get passed the changeinfo from before the reopen by our
		 * caller, in pre_change.
		 * The changeinfo from after the reopen is in rp->r_change,
		 * courtesy of the GETATTR in the reopen.
		 * If they're different, then the file has changed, and we
		 * have to SIGLOST the app.
		 */
		if (ep->error == 0 && ep->stat == NFS4_OK && !did_reclaim) {
			mutex_enter(&rp->r_statelock);
			if (pre_change != rp->r_change)
				ep->stat = NFS4ERR_NO_GRACE;
			mutex_exit(&rp->r_statelock);
		}
		if (ep->error != 0 || ep->stat != NFS4_OK) {
			/* Log the failure, then notify and skip the owner. */
			if (ep->error != 0)
				nfs4_queue_event(RE_FAIL_RELOCK, mi,
				    NULL, ep->error, vp, NULL, 0, NULL,
				    llp->ll_flock.l_pid, TAG_NONE, TAG_NONE,
				    0, 0);
			else
				nfs4_queue_event(RE_FAIL_RELOCK, mi,
				    NULL, 0, vp, NULL, ep->stat, NULL,
				    llp->ll_flock.l_pid, TAG_NONE, TAG_NONE,
				    0, 0);
			nfs4_send_siglost(llp->ll_flock.l_pid, mi, vp, TRUE,
			    ep->error, ep->stat);
			relock_skip_pid(vp, llp, llp->ll_flock.l_pid);

			/* Reinitialize the nfs4_error and continue */
			nfs4_error_zinit(ep);
		}
	}

	if (locks != NULL)
		flk_free_locklist(locks);
}

/*
 * Reclaim the given lock.
 *
 * Errors are returned via the nfs4_error_t parameter.
 */
static void
reclaim_one_lock(vnode_t *vp, flock64_t *flk, nfs4_error_t *ep,
    int *did_reclaimp)
{
	cred_t *cr;
	rnode4_t *rp = VTOR4(vp);

	cr = pid_to_cr(flk->l_pid);
	if (cr == NULL) {
		/* Owning process is gone; nothing to reclaim on its behalf. */
		nfs4_error_init(ep, ESRCH);
		return;
	}

	/*
	 * Issue the reclaim, retrying as long as the reply is FHEXPIRED;
	 * start_recovery_action() drives filehandle recovery between
	 * attempts.  Bail out with ESTALE once the rnode is marked dead.
	 */
	do {
		mutex_enter(&rp->r_statelock);
		if (rp->r_flags & R4RECOVERR) {
			mutex_exit(&rp->r_statelock);
			nfs4_error_init(ep, ESTALE);
			break;
		}
		mutex_exit(&rp->r_statelock);

		nfs4frlock(NFS4_LCK_CTYPE_RECLAIM, vp, F_SETLK, flk, cr, ep,
		    NULL, did_reclaimp);
		if (ep->error == 0 && ep->stat == NFS4ERR_FHEXPIRED)
			start_recovery_action(NR_FHEXPIRED, TRUE, VTOMI4(vp),
			    vp, NULL);
	} while (ep->error == 0 && ep->stat == NFS4ERR_FHEXPIRED);

	crfree(cr);
}

/*
 * Open files.
 */

/*
 * Verifies if the nfsstat4 is a valid error for marking this vnode dead.
2336 * Returns 1 if the error is valid; 0 otherwise. 2337 */ 2338 static int 2339 nfs4_valid_recov_err_for_vp(vnode_t *vp, nfsstat4 stat) 2340 { 2341 /* 2342 * We should not be marking non-regular files as dead, 2343 * except in very rare cases (eg: BADHANDLE or NFS4ERR_BADNAME). 2344 */ 2345 if (vp->v_type != VREG && stat != NFS4ERR_BADHANDLE && 2346 stat != NFS4ERR_BADNAME) 2347 return (0); 2348 2349 return (1); 2350 } 2351 2352 /* 2353 * Failed attempting to recover a filehandle. If 'stat' is valid for 'vp', 2354 * then mark the object dead. Since we've had to do a lookup for 2355 * filehandle recovery, we will mark the object dead if we got NOENT. 2356 */ 2357 static void 2358 nfs4_recov_fh_fail(vnode_t *vp, int error, nfsstat4 stat) 2359 { 2360 ASSERT(vp != NULL); 2361 2362 if ((error == 0) && (stat != NFS4ERR_NOENT) && 2363 (!nfs4_valid_recov_err_for_vp(vp, stat))) 2364 return; 2365 2366 nfs4_fail_recov(vp, "can't recover filehandle", error, stat); 2367 } 2368 2369 /* 2370 * Recovery from a "shouldn't happen" error. In the long term, we'd like 2371 * to mark only the data structure(s) that provided the bad value as being 2372 * bad. But for now we'll just mark the entire file. 2373 */ 2374 2375 static void 2376 recov_badstate(recov_info_t *recovp, vnode_t *vp, nfsstat4 stat) 2377 { 2378 ASSERT(vp != NULL); 2379 recov_throttle(recovp, vp); 2380 2381 if (!nfs4_valid_recov_err_for_vp(vp, stat)) 2382 return; 2383 2384 nfs4_fail_recov(vp, "", 0, stat); 2385 } 2386 2387 /* 2388 * Free up the information saved for a lost state request. 
 */
static void
nfs4_free_lost_rqst(nfs4_lost_rqst_t *lrp, nfs4_server_t *sp)
{
	component4 *filep;
	nfs4_open_stream_t *osp;
	int have_sync_lock;

	NFS4_DEBUG(nfs4_lost_rqst_debug,
	    (CE_NOTE, "nfs4_free_lost_rqst:"));

	/* Per-operation cleanup of state captured when the request was lost */
	switch (lrp->lr_op) {
	case OP_OPEN:
		/* free the file name that was consumed from the caller */
		filep = &lrp->lr_ofile;
		if (filep->utf8string_val) {
			kmem_free(filep->utf8string_val, filep->utf8string_len);
			filep->utf8string_val = NULL;
		}
		break;
	case OP_DELEGRETURN:
		nfs4delegreturn_cleanup(VTOR4(lrp->lr_vp), sp);
		break;
	case OP_CLOSE:
		osp = lrp->lr_osp;
		ASSERT(osp != NULL);
		mutex_enter(&osp->os_sync_lock);
		have_sync_lock = 1;
		if (osp->os_pending_close) {
			/* clean up the open file state. */
			osp->os_pending_close = 0;
			/*
			 * nfs4close_notw() may drop os_sync_lock;
			 * have_sync_lock reports whether it is still held.
			 */
			nfs4close_notw(lrp->lr_vp, osp, &have_sync_lock);
		}
		if (have_sync_lock)
			mutex_exit(&osp->os_sync_lock);
		break;
	}

	/* Drop the reference counts taken by nfs4_save_lost_rqst(). */
	lrp->lr_op = 0;
	if (lrp->lr_oop != NULL) {
		open_owner_rele(lrp->lr_oop);
		lrp->lr_oop = NULL;
	}
	if (lrp->lr_osp != NULL) {
		open_stream_rele(lrp->lr_osp, VTOR4(lrp->lr_vp));
		lrp->lr_osp = NULL;
	}
	if (lrp->lr_lop != NULL) {
		lock_owner_rele(lrp->lr_lop);
		lrp->lr_lop = NULL;
	}
	if (lrp->lr_flk != NULL) {
		kmem_free(lrp->lr_flk, sizeof (flock64_t));
		lrp->lr_flk = NULL;
	}
	if (lrp->lr_vp != NULL) {
		VN_RELE(lrp->lr_vp);
		lrp->lr_vp = NULL;
	}
	if (lrp->lr_dvp != NULL) {
		VN_RELE(lrp->lr_dvp);
		lrp->lr_dvp = NULL;
	}
	if (lrp->lr_cr != NULL) {
		crfree(lrp->lr_cr);
		lrp->lr_cr = NULL;
	}

	kmem_free(lrp, sizeof (nfs4_lost_rqst_t));
}

/*
 * Remove any lost state requests and free them.
 */
static void
nfs4_remove_lost_rqsts(mntinfo4_t *mi, nfs4_server_t *sp)
{
	nfs4_lost_rqst_t *lrp;

	mutex_enter(&mi->mi_lock);
	while ((lrp = list_head(&mi->mi_lost_state)) != NULL) {
		list_remove(&mi->mi_lost_state, lrp);
		/* drop mi_lock across the free; the free may block */
		mutex_exit(&mi->mi_lock);
		nfs4_free_lost_rqst(lrp, sp);
		mutex_enter(&mi->mi_lock);
	}
	mutex_exit(&mi->mi_lock);
}

/*
 * Reopen all the files for the given filesystem and reclaim any locks.
 */

static void
recov_openfiles(recov_info_t *recovp, nfs4_server_t *sp)
{
	mntinfo4_t *mi = recovp->rc_mi;
	nfs4_opinst_t *reopenlist = NULL, *rep;
	nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };
	open_claim_type4 claim;
	int remap;
	char *fail_msg = "No such file or directory on replica";
	rnode4_t *rp;
	fattr4_change pre_change;

	ASSERT(sp != NULL);

	/*
	 * This check is to allow a 10ms pause before we reopen files
	 * it should allow the server time to have received the CB_NULL
	 * reply and update its internal structures such that (if
	 * applicable) we are granted a delegation on reopened files.
	 */
	mutex_enter(&sp->s_lock);
	if ((sp->s_flags & (N4S_CB_PINGED | N4S_CB_WAITER)) == 0) {
		sp->s_flags |= N4S_CB_WAITER;
		(void) cv_reltimedwait(&sp->wait_cb_null, &sp->s_lock,
		    drv_usectohz(N4S_CB_PAUSE_TIME), TR_CLOCK_TICK);
	}
	mutex_exit(&sp->s_lock);

	/* Lock order: server-wide recovery lock, then per-fs recovery lock */
	(void) nfs_rw_enter_sig(&sp->s_recovlock, RW_READER, 0);
	(void) nfs_rw_enter_sig(&mi->mi_recovlock, RW_WRITER, 0);

	if (NFS4_VOLATILE_FH(mi)) {
		nfs4_remap_root(mi, &e, 0);
		if (nfs4_needs_recovery(&e, FALSE, mi->mi_vfsp)) {
			(void) nfs4_start_recovery(&e, mi, NULL,
			    NULL, NULL, NULL, OP_LOOKUP, NULL, NULL, NULL);
		}
	}

	/* Reclaim existing state after a reboot; start fresh otherwise. */
	mutex_enter(&mi->mi_lock);
	if (recovp->rc_srv_reboot || (mi->mi_recovflags & MI4R_SRV_REBOOT))
		claim = CLAIM_PREVIOUS;
	else
		claim = CLAIM_NULL;
	mutex_exit(&mi->mi_lock);

	if (e.error == 0 && e.stat == NFS4_OK) {
		/*
		 * Get a snapshot of open files in the filesystem.  Note
		 * that new opens will stall until the server's grace
		 * period is done.
		 */
		reopenlist = r4mkopenlist(mi);

		mutex_enter(&mi->mi_lock);
		remap = mi->mi_recovflags & MI4R_REMAP_FILES;
		mutex_exit(&mi->mi_lock);
		/*
		 * Since we are re-establishing state on the
		 * server, its ok to blow away the saved lost
		 * requests since we don't need to reissue it.
		 */
		nfs4_remove_lost_rqsts(mi, sp);

		for (rep = reopenlist; rep; rep = rep->re_next) {

			if (remap) {
				nfs4_remap_file(mi, rep->re_vp,
				    NFS4_REMAP_CKATTRS, &e);
			}
			DTRACE_PROBE2(recov__openfiles, nfs4_error_t, &e,
			    vnode_t, rep->re_vp);
			if (e.error == ENOENT || e.stat == NFS4ERR_NOENT) {
				/*
				 * The current server does not have the file
				 * that is to be remapped.  This is most
				 * likely due to an improperly maintained
				 * replica.  The files that are missing from
				 * the server will be marked dead and logged
				 * in order to make sys admins aware of the
				 * problem.
				 */
				nfs4_fail_recov(rep->re_vp,
				    fail_msg, e.error, e.stat);
				/*
				 * We've already handled the error so clear it.
				 */
				nfs4_error_zinit(&e);
				continue;
			} else if (e.error == 0 && e.stat == NFS4_OK) {
				int j;

				/*
				 * Snapshot the change attribute before the
				 * reopens; relock_file() compares it to
				 * detect modification outside any grace
				 * period.
				 */
				rp = VTOR4(rep->re_vp);
				mutex_enter(&rp->r_statelock);
				pre_change = rp->r_change;
				mutex_exit(&rp->r_statelock);

				for (j = 0; j < rep->re_numosp; j++) {
					nfs4_reopen(rep->re_vp, rep->re_osp[j],
					    &e, claim, FALSE, TRUE);
					if (e.error != 0 || e.stat != NFS4_OK)
						break;
				}
				if (nfs4_needs_recovery(&e, TRUE,
				    mi->mi_vfsp)) {
					(void) nfs4_start_recovery(&e, mi,
					    rep->re_vp, NULL, NULL, NULL,
					    OP_OPEN, NULL, NULL, NULL);
					break;
				}
			}
#ifdef DEBUG
			if (nfs4_recovdelay > 0)
				delay(MSEC_TO_TICK(nfs4_recovdelay * 1000));
#endif
			if (e.error == 0 && e.stat == NFS4_OK) {
				relock_file(rep->re_vp, mi, &e, pre_change);

				if (nfs4_needs_recovery(&e, TRUE, mi->mi_vfsp))
					(void) nfs4_start_recovery(&e, mi,
					    rep->re_vp, NULL, NULL, NULL,
					    OP_LOCK, NULL, NULL, NULL);
			}

			if (e.error != 0 || e.stat != NFS4_OK)
				break;
		}

		/*
		 * Check to see if we need to remap files passed in
		 * via the recovery arguments; this will have been
		 * done for open files.  A failure here is not fatal.
		 */
		if (remap) {
			nfs4_error_t ignore;
			nfs4_check_remap(mi, recovp->rc_vp1, NFS4_REMAP_CKATTRS,
			    &ignore);
			nfs4_check_remap(mi, recovp->rc_vp2, NFS4_REMAP_CKATTRS,
			    &ignore);
		}
	}

	/* Only clear the recovery flags if everything succeeded. */
	if (e.error == 0 && e.stat == NFS4_OK) {
		mutex_enter(&mi->mi_lock);
		mi->mi_recovflags &= ~(MI4R_REOPEN_FILES | MI4R_REMAP_FILES);
		mutex_exit(&mi->mi_lock);
	}

	nfs_rw_exit(&mi->mi_recovlock);
	nfs_rw_exit(&sp->s_recovlock);

	if (reopenlist != NULL)
		r4releopenlist(reopenlist);
}

/*
 * Resend the queued state recovery requests in "rqsts".
 */

static void
nfs4_resend_lost_rqsts(recov_info_t *recovp, nfs4_server_t *sp)
{
	nfs4_lost_rqst_t *lrp, *tlrp;
	mntinfo4_t *mi = recovp->rc_mi;
	nfs4_error_t n4e;
#ifdef NOTYET
	uint32_t deny_bits = 0;
#endif

	NFS4_DEBUG(nfs4_lost_rqst_debug, (CE_NOTE, "nfs4_resend_lost_rqsts"));

	ASSERT(mi != NULL);
	ASSERT(nfs_rw_lock_held(&mi->mi_recovlock, RW_WRITER));

	mutex_enter(&mi->mi_lock);
	lrp = list_head(&mi->mi_lost_state);
	mutex_exit(&mi->mi_lock);
	while (lrp != NULL) {
		nfs4_error_zinit(&n4e);
		resend_one_op(lrp, &n4e, mi, sp);
		NFS4_DEBUG(nfs4_lost_rqst_debug, (CE_NOTE,
		    "nfs4_resend_lost_rqsts: resend request: for vp %p got "
		    "error %d stat %d", (void *)lrp->lr_vp, n4e.error,
		    n4e.stat));

		/*
		 * If we get a recovery error that we can actually
		 * recover from (such as ETIMEDOUT, FHEXPIRED), we
		 * return and let the recovery thread redrive the call.
		 * Don't requeue unless the zone is still healthy.
		 */
		if (zone_status_get(curproc->p_zone) < ZONE_IS_SHUTTING_DOWN &&
		    nfs4_needs_recovery(&n4e, TRUE, mi->mi_vfsp) &&
		    (nfs4_try_failover(&n4e) ||
		    NFS4_FRC_UNMT_ERR(n4e.error, mi->mi_vfsp) ||
		    (n4e.error == 0 && n4e.stat != NFS4ERR_BADHANDLE &&
		    !nfs4_recov_marks_dead(n4e.stat)))) {
			/*
			 * For these three errors, we want to delay a bit
			 * instead of pounding the server into submission.
			 * We have to do this manually; the normal
			 * processing for these errors only works for
			 * non-recovery requests.
			 */
			if ((n4e.error == 0 && n4e.stat == NFS4ERR_DELAY) ||
			    (n4e.error == 0 && n4e.stat == NFS4ERR_GRACE) ||
			    (n4e.error == 0 && n4e.stat == NFS4ERR_RESOURCE) ||
			    NFS4_FRC_UNMT_ERR(n4e.error, mi->mi_vfsp)) {
				delay(SEC_TO_TICK(nfs4err_delay_time));
			} else {
				(void) nfs4_start_recovery(&n4e,
				    mi, lrp->lr_dvp, lrp->lr_vp, NULL, NULL,
				    lrp->lr_op, NULL, NULL, NULL);
			}
			/* leave lrp queued for the redriven recovery pass */
			return;
		}

		/* Done with this request: unlink it, then free off-lock. */
		mutex_enter(&mi->mi_lock);
		list_remove(&mi->mi_lost_state, lrp);
		tlrp = lrp;
		lrp = list_head(&mi->mi_lost_state);
		mutex_exit(&mi->mi_lock);
		nfs4_free_lost_rqst(tlrp, sp);
	}
}

/*
 * Resend the given op, and issue any necessary undo call.
 * errors are returned via the nfs4_error_t parameter.
 */

static void
resend_one_op(nfs4_lost_rqst_t *lrp, nfs4_error_t *ep,
    mntinfo4_t *mi, nfs4_server_t *sp)
{
	vnode_t *vp;
	nfs4_open_stream_t *osp;
	cred_t *cr;
	uint32_t acc_bits;

	vp = lrp->lr_vp;
	NFS4_DEBUG(nfs4_lost_rqst_debug, (CE_NOTE, "resend_one_op: "
	    "have a lost open/close request for vp %p", (void *)vp));

	switch (lrp->lr_op) {
	case OP_OPEN:
		/* note: may replace 'vp' with a new vnode (see 'done:') */
		nfs4_resend_open_otw(&vp, lrp, ep);
		break;
	case OP_OPEN_DOWNGRADE:
		ASSERT(lrp->lr_oop != NULL);
		ep->error = nfs4_start_open_seqid_sync(lrp->lr_oop, mi);
		ASSERT(!ep->error);	/* recov thread always succeeds */
		ASSERT(lrp->lr_osp != NULL);
		mutex_enter(&lrp->lr_osp->os_sync_lock);
		nfs4_open_downgrade(lrp->lr_dg_acc, lrp->lr_dg_deny,
		    lrp->lr_oop, lrp->lr_osp, vp, lrp->lr_cr, lrp,
		    ep, NULL, NULL);
		mutex_exit(&lrp->lr_osp->os_sync_lock);
		nfs4_end_open_seqid_sync(lrp->lr_oop);
		break;
	case OP_CLOSE:
		osp = lrp->lr_osp;
		cr = lrp->lr_cr;
		/* rebuild the access bits from the open stream's shares */
		acc_bits = 0;
		mutex_enter(&osp->os_sync_lock);
		if (osp->os_share_acc_read)
			acc_bits |= OPEN4_SHARE_ACCESS_READ;
		if (osp->os_share_acc_write)
			acc_bits |= OPEN4_SHARE_ACCESS_WRITE;
		mutex_exit(&osp->os_sync_lock);
		nfs4close_one(vp, osp, cr, acc_bits, lrp, ep,
		    CLOSE_RESEND, 0, 0, 0);
		break;
	case OP_LOCK:
	case OP_LOCKU:
		resend_lock(lrp, ep);
		goto done;
	case OP_DELEGRETURN:
		nfs4_resend_delegreturn(lrp, ep, sp);
		goto done;
	default:
#ifdef DEBUG
		cmn_err(CE_PANIC, "resend_one_op: unexpected op: %d",
		    lrp->lr_op);
#endif
		nfs4_queue_event(RE_LOST_STATE_BAD_OP, mi, NULL,
		    lrp->lr_op, lrp->lr_vp, lrp->lr_dvp, NFS4_OK, NULL, 0,
		    TAG_NONE, TAG_NONE, 0, 0);
		nfs4_error_init(ep, EINVAL);
		return;
	}

	/*
	 * No need to retry nor send an "undo" CLOSE in the
	 * event the server rebooted.
	 */
	if (ep->error == 0 && (ep->stat == NFS4ERR_STALE_CLIENTID ||
	    ep->stat == NFS4ERR_STALE_STATEID || ep->stat == NFS4ERR_EXPIRED))
		goto done;

	/*
	 * If we resent a CLOSE or OPEN_DOWNGRADE, there's nothing
	 * to undo.  Undoing locking operations was handled by
	 * resend_lock().
	 */
	if (lrp->lr_op == OP_OPEN_DOWNGRADE || lrp->lr_op == OP_CLOSE)
		goto done;

	/*
	 * If we get any other error for OPEN, then don't attempt
	 * to undo the resend of the open (since it was never
	 * successful!).
	 */
	ASSERT(lrp->lr_op == OP_OPEN);
	if (ep->error || ep->stat != NFS4_OK)
		goto done;

	/*
	 * Now let's undo our OPEN.
	 */
	nfs4_error_zinit(ep);
	close_after_open_resend(vp, lrp->lr_cr, lrp->lr_oacc, ep);
	NFS4_DEBUG(nfs4_lost_rqst_debug, (CE_NOTE, "resend_one_op: "
	    "nfs4close_one: for vp %p got error %d stat %d",
	    (void *)vp, ep->error, ep->stat));

done:
	/* drop the extra hold if the OPEN resend handed back a new vnode */
	if (vp != lrp->lr_vp)
		VN_RELE(vp);
}

/*
 * Close a file that was opened via a resent OPEN.
 * Most errors are passed back to the caller (via the return value and
 * *statp), except for FHEXPIRED, which is retried.
 *
 * It might be conceptually cleaner to push the CLOSE request onto the
 * front of the resend queue, rather than sending it here.  That would
 * match the way we undo lost lock requests.  On the other
 * hand, we've already got something that works, and there's no reason to
 * change it at this time.
 */

static void
close_after_open_resend(vnode_t *vp, cred_t *cr, uint32_t acc_bits,
    nfs4_error_t *ep)
{

	/* Only FHEXPIRED is retried; every other outcome returns via ep. */
	for (;;) {
		nfs4close_one(vp, NULL, cr, acc_bits, NULL, ep,
		    CLOSE_AFTER_RESEND, 0, 0, 0);
		if (ep->error == 0 && ep->stat == NFS4_OK)
			break;	/* success; done */
		if (ep->error != 0 || ep->stat != NFS4ERR_FHEXPIRED)
			break;
		/* else retry FHEXPIRED */
	}

}

/*
 * Resend the given lost lock request.  Return an errno value.  If zero,
 * *statp is set to the NFS status code for the call.
 *
 * Issue a SIGLOST and mark the rnode dead if we get a non-recovery error or
 * a recovery error that we don't actually recover from yet (eg: BAD_SEQID).
 * Let the recovery thread redrive the call if we get a recovery error that
 * we can actually recover from.
 */
static void
resend_lock(nfs4_lost_rqst_t *lrp, nfs4_error_t *ep)
{
	bool_t send_siglost = FALSE;
	vnode_t *vp = lrp->lr_vp;

	NFS4_DEBUG(nfs4_lost_rqst_debug, (CE_NOTE, "resend_lock:"));
	ASSERT(lrp->lr_ctype == NFS4_LCK_CTYPE_REINSTATE ||
	    lrp->lr_ctype == NFS4_LCK_CTYPE_RESEND);

	nfs4frlock(lrp->lr_ctype, vp, F_SETLK, lrp->lr_flk, lrp->lr_cr, ep,
	    lrp, NULL);

	NFS4_DEBUG(nfs4_lost_rqst_debug, (CE_NOTE, "resend_lock: "
	    "nfs4frlock for vp %p returned error %d, stat %d",
	    (void *)vp, ep->error, ep->stat));

	if (ep->error == 0 && ep->stat == 0)
		goto done;
	/* a DENIED answer to a plain resend is a final answer, not a loss */
	if (ep->error == 0 && ep->stat == NFS4ERR_DENIED &&
	    lrp->lr_ctype == NFS4_LCK_CTYPE_RESEND)
		goto done;

	/*
	 * If we failed with a non-recovery error, send SIGLOST and
	 * mark the file dead.
	 */
	if (!nfs4_needs_recovery(ep, TRUE, vp->v_vfsp))
		send_siglost = TRUE;
	else {
		/*
		 * Done with recovering LOST LOCK in the event the
		 * server rebooted or we've lost the lease.
		 */
		if (ep->error == 0 && (ep->stat == NFS4ERR_STALE_CLIENTID ||
		    ep->stat == NFS4ERR_STALE_STATEID ||
		    ep->stat == NFS4ERR_EXPIRED)) {
			goto done;
		}

		/*
		 * BAD_STATEID on an unlock indicates that the server has
		 * forgotten about the lock anyway, so act like the call
		 * was successful.
		 */
		if (ep->error == 0 && ep->stat == NFS4ERR_BAD_STATEID &&
		    lrp->lr_op == OP_LOCKU)
			goto done;

		/*
		 * If we got a recovery error that we don't actually
		 * recover from, send SIGLOST.  If the filesystem was
		 * forcibly unmounted, we skip the SIGLOST because (a) it's
		 * unnecessary noise, and (b) there could be a new process
		 * with the same pid as the one that had generated the lost
		 * state request.
		 */
		if (ep->error == 0 && (ep->stat == NFS4ERR_BADHANDLE ||
		    nfs4_recov_marks_dead(ep->stat))) {
			if (!(vp->v_vfsp->vfs_flag & VFS_UNMOUNTED))
				send_siglost = TRUE;
			goto done;
		}

		/*
		 * If the filesystem was forcibly unmounted, we
		 * still need to synchronize with the server and
		 * release state.  Try again later.
		 */
		if (NFS4_FRC_UNMT_ERR(ep->error, vp->v_vfsp))
			goto done;

		/*
		 * If we get a recovery error that we can actually
		 * recover from (such as ETIMEDOUT, FHEXPIRED),
		 * return and let the recovery thread redrive the call.
		 *
		 * For the three errors below, we want to delay a bit
		 * instead of pounding the server into submission.
		 */
		if ((ep->error == 0 && ep->stat == NFS4ERR_DELAY) ||
		    (ep->error == 0 && ep->stat == NFS4ERR_GRACE) ||
		    (ep->error == 0 && ep->stat == NFS4ERR_RESOURCE))
			delay(SEC_TO_TICK(recov_err_delay));
		goto done;
	}

done:
	if (send_siglost) {
		cred_t *sv_cred;

		/*
		 * Must be root or the actual thread being issued the
		 * SIGLOST for this to work, so just become root.
		 */
		sv_cred = curthread->t_cred;
		curthread->t_cred = kcred;
		nfs4_send_siglost(lrp->lr_flk->l_pid, VTOMI4(vp), vp, FALSE,
		    ep->error, ep->stat);
		curthread->t_cred = sv_cred;

		/*
		 * Flush any additional reinstantiation requests for
		 * this operation.  Sending multiple SIGLOSTs to the user
		 * process is unlikely to help and may cause trouble.
		 */
		if (lrp->lr_ctype == NFS4_LCK_CTYPE_REINSTATE)
			flush_reinstate(lrp);
	}
}

/*
 * Remove any lock reinstantiation requests that correspond to the given
 * lost request.  We only remove items that follow lrp in the queue,
 * assuming that lrp will be removed by the generic lost state code.
 */

static void
flush_reinstate(nfs4_lost_rqst_t *lrp)
{
	vnode_t *vp;
	pid_t pid;
	mntinfo4_t *mi;
	nfs4_lost_rqst_t *nlrp;

	vp = lrp->lr_vp;
	mi = VTOMI4(vp);
	pid = lrp->lr_flk->l_pid;

	/*
	 * If there are any more reinstantation requests to get rid of,
	 * they should all be clustered at the front of the lost state
	 * queue.
	 */
	mutex_enter(&mi->mi_lock);
	for (lrp = list_next(&mi->mi_lost_state, lrp); lrp != NULL;
	    lrp = nlrp) {
		/* capture the successor before lrp is unlinked and freed */
		nlrp = list_next(&mi->mi_lost_state, lrp);
		if (lrp->lr_op != OP_LOCK && lrp->lr_op != OP_LOCKU)
			break;
		if (lrp->lr_ctype != NFS4_LCK_CTYPE_REINSTATE)
			break;
		ASSERT(lrp->lr_vp == vp);
		ASSERT(lrp->lr_flk->l_pid == pid);
		NFS4_DEBUG(nfs4_lost_rqst_debug, (CE_NOTE,
		    "remove reinstantiation %p", (void *)lrp));
		list_remove(&mi->mi_lost_state, lrp);
		nfs4_free_lost_rqst(lrp, NULL);
	}
	mutex_exit(&mi->mi_lock);
}

/*
 * End of state-specific recovery routines.
 */

/*
 * Allocate a lost request struct, initialize it from lost_rqstp (including
 * bumping the reference counts for the referenced vnode, etc.), and hang
 * it off of recovp.
 */

static void
nfs4_save_lost_rqst(nfs4_lost_rqst_t *lost_rqstp, recov_info_t *recovp,
    nfs4_recov_t *action, mntinfo4_t *mi)
{
	nfs4_lost_rqst_t *destp;

	ASSERT(recovp->rc_lost_rqst == NULL);

	destp = kmem_alloc(sizeof (nfs4_lost_rqst_t), KM_SLEEP);
	recovp->rc_lost_rqst = destp;

	if (lost_rqstp->lr_op == OP_LOCK ||
	    lost_rqstp->lr_op == OP_LOCKU) {
		ASSERT(lost_rqstp->lr_lop);
		*action = NR_LOST_LOCK;
		destp->lr_ctype = lost_rqstp->lr_ctype;
		destp->lr_locktype = lost_rqstp->lr_locktype;
	} else if (lost_rqstp->lr_op == OP_OPEN) {
		component4 *srcfp, *destfp;

		destp->lr_oacc = lost_rqstp->lr_oacc;
		destp->lr_odeny = lost_rqstp->lr_odeny;
		destp->lr_oclaim = lost_rqstp->lr_oclaim;
		if (lost_rqstp->lr_oclaim == CLAIM_DELEGATE_CUR)
			destp->lr_ostateid = lost_rqstp->lr_ostateid;

		srcfp = &lost_rqstp->lr_ofile;
		destfp = &destp->lr_ofile;
		/*
		 * Consume caller's utf8string
		 */
		destfp->utf8string_len = srcfp->utf8string_len;
		destfp->utf8string_val = srcfp->utf8string_val;
		srcfp->utf8string_len = 0;
		srcfp->utf8string_val = NULL;	/* make sure not reused */

		*action = NR_LOST_STATE_RQST;
	} else if (lost_rqstp->lr_op == OP_OPEN_DOWNGRADE) {
		destp->lr_dg_acc = lost_rqstp->lr_dg_acc;
		destp->lr_dg_deny = lost_rqstp->lr_dg_deny;

		*action = NR_LOST_STATE_RQST;
	} else if (lost_rqstp->lr_op == OP_CLOSE) {
		ASSERT(lost_rqstp->lr_oop);
		*action = NR_LOST_STATE_RQST;
	} else if (lost_rqstp->lr_op == OP_DELEGRETURN) {
		*action = NR_LOST_STATE_RQST;
	} else {
#ifdef DEBUG
		cmn_err(CE_PANIC, "nfs4_save_lost_rqst: bad op %d",
		    lost_rqstp->lr_op);
#endif
		nfs4_queue_event(RE_LOST_STATE_BAD_OP, mi, NULL,
		    lost_rqstp->lr_op, lost_rqstp->lr_vp, lost_rqstp->lr_dvp,
		    NFS4_OK, NULL, curproc->p_pid, TAG_NONE, TAG_NONE, 0, 0);
		/* unknown op: undo the allocation and report no action */
		*action = NR_UNUSED;
		recovp->rc_lost_rqst = NULL;
		kmem_free(destp, sizeof (nfs4_lost_rqst_t));
		return;
	}

	/*
	 * Take a reference on everything the saved request points to;
	 * nfs4_free_lost_rqst() drops these.
	 */
	destp->lr_op = lost_rqstp->lr_op;
	destp->lr_vp = lost_rqstp->lr_vp;
	if (destp->lr_vp)
		VN_HOLD(destp->lr_vp);
	destp->lr_dvp = lost_rqstp->lr_dvp;
	if (destp->lr_dvp)
		VN_HOLD(destp->lr_dvp);
	destp->lr_oop = lost_rqstp->lr_oop;
	if (destp->lr_oop)
		open_owner_hold(destp->lr_oop);
	destp->lr_osp = lost_rqstp->lr_osp;
	if (destp->lr_osp)
		open_stream_hold(destp->lr_osp);
	destp->lr_lop = lost_rqstp->lr_lop;
	if (destp->lr_lop)
		lock_owner_hold(destp->lr_lop);
	destp->lr_cr = lost_rqstp->lr_cr;
	if (destp->lr_cr)
		crhold(destp->lr_cr);
	if (lost_rqstp->lr_flk == NULL)
		destp->lr_flk = NULL;
	else {
		destp->lr_flk = kmem_alloc(sizeof (flock64_t), KM_SLEEP);
		*destp->lr_flk = *lost_rqstp->lr_flk;
	}
	destp->lr_putfirst = lost_rqstp->lr_putfirst;
}

/*
 * Map the given return values (errno and nfs4 status code) to a recovery
 * action and fill in the following fields of recovp: rc_action,
 * rc_srv_reboot, rc_stateid, rc_lost_rqst.
 */

void
errs_to_action(recov_info_t *recovp,
    nfs4_server_t *sp, mntinfo4_t *mi, stateid4 *sidp,
    nfs4_lost_rqst_t *lost_rqstp, int unmounted, nfs_opnum4 op,
    nfs4_bseqid_entry_t *bsep)
{
	nfs4_recov_t action = NR_UNUSED;
	bool_t reboot = FALSE;
	int try_f;
	int error = recovp->rc_orig_errors.error;
	nfsstat4 stat = recovp->rc_orig_errors.stat;

	bzero(&recovp->rc_stateid, sizeof (stateid4));
	recovp->rc_lost_rqst = NULL;
	recovp->rc_bseqid_rqst = NULL;

	/* failover is only an option on replicated (FAILOVER) mounts */
	try_f = nfs4_try_failover(&recovp->rc_orig_errors) &&
	    FAILOVER_MOUNT4(mi);

	/*
	 * We start recovery for EINTR only in the lost lock
	 * or lost open/close case.
	 */

	if (try_f || error == EINTR || (error == EIO && unmounted)) {
		recovp->rc_error = (error != 0 ? error : geterrno4(stat));
		if (lost_rqstp) {
			ASSERT(lost_rqstp->lr_op != 0);
			nfs4_save_lost_rqst(lost_rqstp, recovp, &action, mi);
		}
		if (try_f)
			action = NR_FAILOVER;
	} else if (error != 0) {
		recovp->rc_error = error;
		nfs4_queue_event(RE_UNEXPECTED_ERRNO, mi, NULL, error, NULL,
		    NULL, 0, NULL, 0, TAG_NONE, TAG_NONE, 0, 0);
		action = NR_CLIENTID;
	} else {
		recovp->rc_error = geterrno4(stat);
		switch (stat) {
#ifdef notyet
		case NFS4ERR_LEASE_MOVED:
			action = xxx;
			break;
#endif
		case NFS4ERR_MOVED:
			action = NR_MOVED;
			break;
		case NFS4ERR_BADHANDLE:
			action = NR_BADHANDLE;
			break;
		case NFS4ERR_BAD_SEQID:
			if (bsep)
				save_bseqid_rqst(bsep, recovp);
			action = NR_BAD_SEQID;
			break;
		case NFS4ERR_OLD_STATEID:
			action = NR_OLDSTATEID;
			break;
		case NFS4ERR_WRONGSEC:
			action = NR_WRONGSEC;
			break;
		case NFS4ERR_FHEXPIRED:
			action = NR_FHEXPIRED;
			break;
		case NFS4ERR_BAD_STATEID:
			/*
			 * Within the lease (or with no server info) the
			 * stateid itself is suspect; outside the lease the
			 * whole clientid needs re-establishing.
			 */
			if (sp == NULL || (sp != NULL && inlease(sp))) {

				action = NR_BAD_STATEID;
				if (sidp)
					recovp->rc_stateid = *sidp;
			} else
				action = NR_CLIENTID;
			break;
		case NFS4ERR_EXPIRED:
			/*
			 * The client's lease has expired, either due
			 * to a network partition or perhaps a client
			 * error.  In either case, try an NR_CLIENTID
			 * style recovery.  reboot remains false, since
			 * there is no evidence the server has rebooted.
			 * This will cause CLAIM_NULL opens and lock
			 * requests without the reclaim bit.
			 */
			action = NR_CLIENTID;

			DTRACE_PROBE4(nfs4__expired,
			    nfs4_server_t *, sp,
			    mntinfo4_t *, mi,
			    stateid4 *, sidp, int, op);

			break;
		case NFS4ERR_STALE_CLIENTID:
		case NFS4ERR_STALE_STATEID:
			action = NR_CLIENTID;
			reboot = TRUE;
			break;
		case NFS4ERR_RESOURCE:
			/*
			 * If this had been a FAILOVER mount, then
			 * we'd have tried failover.  Since it's not,
			 * just delay a while and retry.
			 */
			action = NR_DELAY;
			break;
		case NFS4ERR_GRACE:
			action = NR_GRACE;
			break;
		case NFS4ERR_DELAY:
			action = NR_DELAY;
			break;
		case NFS4ERR_STALE:
			action = NR_STALE;
			break;
		default:
			nfs4_queue_event(RE_UNEXPECTED_STATUS, mi, NULL, 0,
			    NULL, NULL, stat, NULL, 0, TAG_NONE, TAG_NONE,
			    0, 0);
			action = NR_CLIENTID;
			break;
		}
	}

	/* make sure action got set */
	ASSERT(action != NR_UNUSED);
	recovp->rc_srv_reboot = reboot;
	recovp->rc_action = action;
	nfs4_queue_fact(RF_ERR, mi, stat, action, op, reboot, NULL, error,
	    NULL);
}

/*
 * Return the (held) credential for the process with the given pid.
 * May return NULL (e.g., process not found).
 */

static cred_t *
pid_to_cr(pid_t pid)
{
	proc_t *p;
	cred_t *cr;

	mutex_enter(&pidlock);
	if ((p = prfind(pid)) == NULL) {
		mutex_exit(&pidlock);
		return (NULL);
	}

	/* take the hold under p_crlock so the cred can't be switched away */
	mutex_enter(&p->p_crlock);
	crhold(cr = p->p_cred);
	mutex_exit(&p->p_crlock);
	mutex_exit(&pidlock);

	return (cr);
}

/*
 * Send SIGLOST to the given process and queue the event.
 *
 * The 'dump' boolean tells us whether this action should dump the
 * in-kernel queue of recovery messages or not.
3268 */ 3269 3270 void 3271 nfs4_send_siglost(pid_t pid, mntinfo4_t *mi, vnode_t *vp, bool_t dump, 3272 int error, nfsstat4 stat) 3273 { 3274 proc_t *p; 3275 3276 mutex_enter(&pidlock); 3277 p = prfind(pid); 3278 if (p) 3279 psignal(p, SIGLOST); 3280 mutex_exit(&pidlock); 3281 nfs4_queue_event(dump ? RE_SIGLOST : RE_SIGLOST_NO_DUMP, mi, 3282 NULL, error, vp, NULL, stat, NULL, pid, TAG_NONE, TAG_NONE, 0, 0); 3283 } 3284 3285 /* 3286 * Scan the lock list for entries that match the given pid. Unregister those 3287 * locks that do and change their pid to NOPID. 3288 */ 3289 3290 static void 3291 relock_skip_pid(vnode_t *vp, locklist_t *llp, pid_t pid) 3292 { 3293 for (; llp != NULL; llp = llp->ll_next) { 3294 if (llp->ll_flock.l_pid == pid) { 3295 int r; 3296 3297 /* 3298 * Unregister the lost lock. 3299 */ 3300 llp->ll_flock.l_type = F_UNLCK; 3301 r = reclock(vp, &llp->ll_flock, SETFLCK, FREAD | FWRITE, 3302 0, NULL); 3303 /* The unlock cannot fail */ 3304 ASSERT(r == 0); 3305 3306 llp->ll_flock.l_pid = NOPID; 3307 } 3308 } 3309 } 3310 3311 /* 3312 * Mark a file as having failed recovery, after making a last-ditch effort 3313 * to return any delegation. 3314 * 3315 * Sets r_error to EIO or ESTALE for the given vnode. 3316 */ 3317 void 3318 nfs4_fail_recov(vnode_t *vp, char *why, int error, nfsstat4 stat) 3319 { 3320 rnode4_t *rp = VTOR4(vp); 3321 3322 #ifdef DEBUG 3323 if (nfs4_fail_recov_stop) 3324 debug_enter("nfs4_fail_recov"); 3325 #endif 3326 3327 mutex_enter(&rp->r_statelock); 3328 if (rp->r_flags & (R4RECOVERR|R4RECOVERRP)) { 3329 mutex_exit(&rp->r_statelock); 3330 return; 3331 } 3332 3333 /* 3334 * Set R4RECOVERRP to indicate that a recovery error is in 3335 * progress. This will shut down reads and writes at the top 3336 * half. Don't set R4RECOVERR until after we've returned the 3337 * delegation, otherwise it will fail. 
3338 */ 3339 3340 rp->r_flags |= R4RECOVERRP; 3341 mutex_exit(&rp->r_statelock); 3342 3343 nfs4delegabandon(rp); 3344 3345 mutex_enter(&rp->r_statelock); 3346 rp->r_flags |= (R4RECOVERR | R4STALE); 3347 rp->r_error = (error == 0 && stat == NFS4ERR_STALE) ? ESTALE : EIO; 3348 PURGE_ATTRCACHE4_LOCKED(rp); 3349 if (!(vp->v_vfsp->vfs_flag & VFS_UNMOUNTED)) 3350 nfs4_queue_event(RE_DEAD_FILE, VTOMI4(vp), NULL, error, 3351 vp, NULL, stat, why, 0, TAG_NONE, TAG_NONE, 0, 0); 3352 mutex_exit(&rp->r_statelock); 3353 3354 dnlc_purge_vp(vp); 3355 } 3356 3357 /* 3358 * recov_throttle: if the file had the same recovery action within the 3359 * throttle interval, wait for the throttle interval to finish before 3360 * proceeding. 3361 * 3362 * Side effects: updates the rnode with the current recovery information. 3363 */ 3364 3365 static void 3366 recov_throttle(recov_info_t *recovp, vnode_t *vp) 3367 { 3368 time_t curtime, time_to_wait; 3369 rnode4_t *rp = VTOR4(vp); 3370 3371 curtime = gethrestime_sec(); 3372 3373 mutex_enter(&rp->r_statelock); 3374 NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE, 3375 "recov_throttle: now: (%d, %ld), last: (%d, %ld)", 3376 recovp->rc_action, curtime, 3377 rp->r_recov_act, rp->r_last_recov)); 3378 if (recovp->rc_action == rp->r_recov_act && 3379 rp->r_last_recov + recov_err_delay > curtime) { 3380 time_to_wait = rp->r_last_recov + recov_err_delay - curtime; 3381 mutex_exit(&rp->r_statelock); 3382 delay(SEC_TO_TICK(time_to_wait)); 3383 curtime = gethrestime_sec(); 3384 mutex_enter(&rp->r_statelock); 3385 } 3386 3387 rp->r_last_recov = curtime; 3388 rp->r_recov_act = recovp->rc_action; 3389 mutex_exit(&rp->r_statelock); 3390 } 3391 3392 /* 3393 * React to NFS4ERR_GRACE by setting the time we'll permit 3394 * the next call to this filesystem. 
3395 */ 3396 void 3397 nfs4_set_grace_wait(mntinfo4_t *mi) 3398 { 3399 mutex_enter(&mi->mi_lock); 3400 /* Mark the time for the future */ 3401 mi->mi_grace_wait = gethrestime_sec() + nfs4err_delay_time; 3402 mutex_exit(&mi->mi_lock); 3403 } 3404 3405 /* 3406 * React to MFS4ERR_DELAY by setting the time we'll permit 3407 * the next call to this vnode. 3408 */ 3409 void 3410 nfs4_set_delay_wait(vnode_t *vp) 3411 { 3412 rnode4_t *rp = VTOR4(vp); 3413 3414 mutex_enter(&rp->r_statelock); 3415 /* 3416 * Calculate amount we should delay, initial 3417 * delay will be short and then we will back off. 3418 */ 3419 if (rp->r_delay_interval == 0) 3420 rp->r_delay_interval = NFS4_INITIAL_DELAY_INTERVAL; 3421 else 3422 /* calculate next interval value */ 3423 rp->r_delay_interval = 3424 MIN(NFS4_MAX_DELAY_INTERVAL, (rp->r_delay_interval << 1)); 3425 rp->r_delay_wait = gethrestime_sec() + rp->r_delay_interval; 3426 mutex_exit(&rp->r_statelock); 3427 } 3428 3429 /* 3430 * The caller is responsible for freeing the returned string. 3431 */ 3432 static char * 3433 nfs4_getsrvnames(mntinfo4_t *mi, size_t *len) 3434 { 3435 servinfo4_t *svp; 3436 char *srvnames; 3437 char *namep; 3438 size_t length; 3439 3440 /* 3441 * Calculate the length of the string required to hold all 3442 * of the server names plus either a comma or a null 3443 * character following each individual one. 
3444 */ 3445 length = 0; 3446 for (svp = mi->mi_servers; svp != NULL; svp = svp->sv_next) { 3447 (void) nfs_rw_enter_sig(&svp->sv_lock, RW_READER, 0); 3448 if (svp->sv_flags & SV4_NOTINUSE) { 3449 nfs_rw_exit(&svp->sv_lock); 3450 continue; 3451 } 3452 nfs_rw_exit(&svp->sv_lock); 3453 length += svp->sv_hostnamelen; 3454 } 3455 3456 srvnames = kmem_alloc(length, KM_SLEEP); 3457 3458 namep = srvnames; 3459 for (svp = mi->mi_servers; svp != NULL; svp = svp->sv_next) { 3460 (void) nfs_rw_enter_sig(&svp->sv_lock, RW_READER, 0); 3461 if (svp->sv_flags & SV4_NOTINUSE) { 3462 nfs_rw_exit(&svp->sv_lock); 3463 continue; 3464 } 3465 nfs_rw_exit(&svp->sv_lock); 3466 (void) strcpy(namep, svp->sv_hostname); 3467 namep += svp->sv_hostnamelen - 1; 3468 *namep++ = ','; 3469 } 3470 *--namep = '\0'; 3471 3472 *len = length; 3473 3474 return (srvnames); 3475 } 3476 3477 static void 3478 save_bseqid_rqst(nfs4_bseqid_entry_t *bsep, recov_info_t *recovp) 3479 { 3480 nfs4_bseqid_entry_t *destp; 3481 3482 destp = kmem_alloc(sizeof (nfs4_bseqid_entry_t), KM_SLEEP); 3483 recovp->rc_bseqid_rqst = destp; 3484 3485 if (bsep->bs_oop) 3486 open_owner_hold(bsep->bs_oop); 3487 destp->bs_oop = bsep->bs_oop; 3488 if (bsep->bs_lop) 3489 lock_owner_hold(bsep->bs_lop); 3490 destp->bs_lop = bsep->bs_lop; 3491 if (bsep->bs_vp) 3492 VN_HOLD(bsep->bs_vp); 3493 destp->bs_vp = bsep->bs_vp; 3494 destp->bs_pid = bsep->bs_pid; 3495 destp->bs_tag = bsep->bs_tag; 3496 destp->bs_seqid = bsep->bs_seqid; 3497 } 3498 3499 static void 3500 free_bseqid_rqst(nfs4_bseqid_entry_t *bsep) 3501 { 3502 if (bsep->bs_oop) 3503 open_owner_rele(bsep->bs_oop); 3504 if (bsep->bs_lop) 3505 lock_owner_rele(bsep->bs_lop); 3506 if (bsep->bs_vp) 3507 VN_RELE(bsep->bs_vp); 3508 kmem_free(bsep, sizeof (nfs4_bseqid_entry_t)); 3509 } 3510 3511 /* 3512 * We don't actually fully recover from NFS4ERR_BAD_SEQID. We 3513 * simply mark the open owner and open stream (if provided) as "bad". 
 * Then future uses of these data structures will be limited to basically
 * just cleaning up the internal client state (no going OTW).
 *
 * The result of this is to return errors back to the app/usr when
 * we receive NFS4ERR_BAD_SEQID, but also allow future/new calls to
 * succeed so progress can be made.
 *
 * Called from the recovery thread with mi_recovlock held as writer;
 * drains every entry queued on mi->mi_bseqid_list.
 */
void
recov_bad_seqid(recov_info_t *recovp)
{
	mntinfo4_t *mi = recovp->rc_mi;
	nfs4_open_owner_t *bad_oop;
	nfs4_lock_owner_t *bad_lop;
	vnode_t *vp;
	rnode4_t *rp = NULL;
	pid_t pid;
	nfs4_bseqid_entry_t *bsep, *tbsep;
	int error;

	ASSERT(mi != NULL);
	ASSERT(nfs_rw_lock_held(&mi->mi_recovlock, RW_WRITER));

	/* Peek at the first queued entry; the list is mutated under mi_lock. */
	mutex_enter(&mi->mi_lock);
	bsep = list_head(&mi->mi_bseqid_list);
	mutex_exit(&mi->mi_lock);

	/*
	 * Handle all the bad seqid entries on mi's list.  mi_lock is
	 * dropped while each entry is processed (the seqid sync and
	 * SIGLOST delivery below may block), then retaken to dequeue
	 * the entry and fetch the next one.
	 */
	while (bsep != NULL) {
		bad_oop = bsep->bs_oop;
		bad_lop = bsep->bs_lop;
		vp = bsep->bs_vp;
		pid = bsep->bs_pid;

		NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
		    "recov_bad_seqid: mark oop %p lop %p as bad for "
		    "vp %p tag %s pid %d: last good seqid %d for tag %s",
		    (void *)bad_oop, (void *)bad_lop, (void *)vp,
		    nfs4_ctags[bsep->bs_tag].ct_str, pid,
		    bad_oop ? bad_oop->oo_last_good_seqid : 0,
		    bad_oop ? nfs4_ctags[bad_oop->oo_last_good_op].ct_str :
		    nfs4_ctags[TAG_NONE].ct_str));

		nfs4_queue_event(RE_BAD_SEQID, mi, NULL,
		    0, vp, NULL, NFS4ERR_BAD_SEQID, NULL, pid, bsep->bs_tag,
		    bad_oop ? bad_oop->oo_last_good_op : TAG_NONE,
		    bsep->bs_seqid, bad_oop ?
		    bad_oop->oo_last_good_seqid : 0);

		if (bad_oop) {
			/*
			 * Essentially reset the open owner: give it a
			 * fresh name and a zero seqid so new OTW calls
			 * from it can succeed.
			 */
			error = nfs4_start_open_seqid_sync(bad_oop, mi);
			ASSERT(!error);	/* recov thread always succeeds */
			bad_oop->oo_name = nfs4_get_new_oo_name();
			bad_oop->oo_seqid = 0;
			nfs4_end_open_seqid_sync(bad_oop);
		}

		if (bad_lop) {
			/* Mark the lock owner bad; no further OTW use. */
			mutex_enter(&bad_lop->lo_lock);
			bad_lop->lo_flags |= NFS4_BAD_SEQID_LOCK;
			mutex_exit(&bad_lop->lo_lock);

			/* Flag the rnode as having dangling lock owners. */
			ASSERT(vp != NULL);
			rp = VTOR4(vp);
			mutex_enter(&rp->r_statelock);
			rp->r_flags |= R4LODANGLERS;
			mutex_exit(&rp->r_statelock);

			/* The process effectively lost its lock state. */
			nfs4_send_siglost(pid, mi, vp, TRUE,
			    0, NFS4ERR_BAD_SEQID);
		}

		/*
		 * Dequeue the processed entry and advance under mi_lock;
		 * free it after the lock is dropped (free_bseqid_rqst
		 * releases object holds and may block).
		 */
		mutex_enter(&mi->mi_lock);
		list_remove(&mi->mi_bseqid_list, bsep);
		tbsep = bsep;
		bsep = list_head(&mi->mi_bseqid_list);
		mutex_exit(&mi->mi_lock);
		free_bseqid_rqst(tbsep);
	}

	/* All entries handled: clear the recovery flag for BAD_SEQID. */
	mutex_enter(&mi->mi_lock);
	mi->mi_recovflags &= ~MI4R_BAD_SEQID;
	mutex_exit(&mi->mi_lock);
}