1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright 2009 Sun Microsystems, Inc. All rights reserved. 23 * Use is subject to license terms. 24 */ 25 26 #include <sys/types.h> 27 #include <sys/ksynch.h> 28 #include <sys/cmn_err.h> 29 #include <sys/errno.h> 30 #include <sys/kmem.h> 31 #include <sys/cred.h> 32 #include <sys/ddi.h> 33 34 #include <sys/nsc_thread.h> 35 #include <sys/nsctl/nsctl.h> 36 37 #include <sys/sdt.h> /* dtrace is S10 or later */ 38 39 #include "sd_bcache.h" 40 #include "sd_ft.h" 41 #include "sd_trace.h" 42 #include "sd_io.h" 43 #include "sd_misc.h" 44 #include <sys/ncall/ncall.h> 45 46 _sd_ft_info_t _sd_ft_data; 47 48 static volatile int _sd_ft_exit = 0; 49 static kcondvar_t _sd_ft_cv; 50 int _sd_node_recovery; /* node recovery in progress */ 51 /* 52 * _sd_async_recovery: 53 * 0 = flush and wait 54 * 1 = clone and async-write 55 * 2 = quicksort, clone, and async-write 56 * quicksort allows contiguous blocks to be joined, 57 * which may greatly improve recovery time for raid devices. 58 * if kmem_alloc fails, acts as _sd_async_recovery == 1 59 */ 60 static int _sd_async_recovery = 2; 61 static int xmem_inval_hit, xmem_inval_miss, xmem_inval_inuse; 62 63 64 /* 65 * flag to inhibit reset of remote SCSI buses and sending of 66 * nodedown callback if mirror was deconfigured properly. 67 * - prevents trashing any I/O that may be happening on the mirror 68 * node during a normal shutdown and prevents undesired simckd failover. 69 */ 70 static int mirror_clean_shutdown = 0; 71 72 /* 73 * Forward declare all statics that are used before defined to enforce 74 * parameter checking 75 * Some (if not all) of these could be removed if the code were reordered 76 */ 77 78 static void _sd_health_thread(void); 79 static void _sd_cache_recover(void); 80 static int _sd_ft_clone(ss_centry_info_t *, int); 81 static void _sd_remote_enable(void); 82 static void sdbc_setmodeandftdata(); 83 static void _sd_cd_discard_mirror(int cd); 84 static int _sd_failover_file_open(void); 85 static void _sd_failover_done(void); 86 static void _sd_wait_for_dirty(void); 87 static void _sdbc_clear_warm_start(void); 88 static int sdbc_recover_vol(ss_vol_t *, int); 89 void _ncall_poke(int); 90 91 int _sdbc_ft_hold_io; 92 kcondvar_t _sdbc_ft_hold_io_cv; 93 kmutex_t _sdbc_ft_hold_io_lk; 94 extern int sdbc_use_dmchain; 95 extern void sdbc_requeue_head_dm_try(_sd_cctl_t *cc_ent); 96 97 /* 98 * _sdbc_ft_unload - cache is being unloaded (or failed to load). 99 * Deallocate any global lock/sv that we created. 100 */ 101 void 102 _sdbc_ft_unload(void) 103 { 104 cv_destroy(&_sd_ft_cv); 105 mutex_destroy(&_sd_ft_data.fi_lock); 106 cv_destroy(&_sd_ft_data.fi_rem_sv); 107 mutex_destroy(&_sd_ft_data.fi_sleep); 108 bzero(&_sd_ft_data, sizeof (_sd_ft_info_t)); 109 } 110 111 /* 112 * _sdbc_ft_load - cache is being loaded. Allocate all global lock/sv 113 * that we need. Return 0 if we succeed. If we fail return -1 (don't 114 * need to do the unload step as we expect our caller to do that). 115 */ 116 int 117 _sdbc_ft_load(void) 118 { 119 /* _sd_ft_data is sure to be zeroes, don't need to bzero it */ 120 121 mutex_init(&_sd_ft_data.fi_lock, NULL, MUTEX_DRIVER, NULL); 122 cv_init(&_sd_ft_data.fi_rem_sv, NULL, CV_DRIVER, NULL); 123 cv_init(&_sd_ft_cv, NULL, CV_DRIVER, NULL); 124 mutex_init(&_sd_ft_data.fi_sleep, NULL, MUTEX_DRIVER, NULL); 125 return (0); 126 } 127 128 129 int 130 _sdbc_ft_configure(void) 131 { 132 _sd_ft_exit = 1; 133 return (nsc_create_process( 134 (void (*)(void *))_sd_health_thread, 0, TRUE)); 135 } 136 137 138 void 139 _sdbc_ft_deconfigure(void) 140 { 141 _sd_ft_exit = 0; 142 _sd_unblock(&_sd_ft_cv); 143 mutex_enter(&_sd_ft_data.fi_lock); 144 _sd_node_recovery = 0; 145 cv_broadcast(&_sd_ft_data.fi_rem_sv); 146 mutex_exit(&_sd_ft_data.fi_lock); 147 } 148 149 150 /* 151 * _sd_health_thread -- daemon thread on each node watches if mirror 152 * node to has crashed, and it needs to flush the mirrors cache entries. 153 * Note we do *not* detect that the node has come up again, but wait 154 * for the node to inform us that it is up via _sd_cache_reenable(). 155 */ 156 static void 157 _sd_health_thread(void) 158 { 159 int warm_started = 0; 160 161 mutex_enter(&_sd_cache_lock); 162 _sd_cache_dem_cnt++; 163 mutex_exit(&_sd_cache_lock); 164 165 /* clear _sd_ft_data in case this is a cache re-enable w/o unload */ 166 167 bzero(&_sd_ft_data, sizeof (_sd_ft_info_t)); 168 169 sdbc_setmodeandftdata(); 170 171 #ifdef DEBUG 172 cmn_err(CE_NOTE, "!sdbc(_sd_health_thread) safestore " 173 "is %s. Fast writes %s", 174 (_SD_MIRROR_CONFIGD) ? "up" : "down", 175 (_SD_NODE_HINTS & _SD_WRTHRU_MASK) ? 176 "disabled" : "enabled"); 177 #endif 178 179 /* CONSTCOND */ 180 while (1) { 181 _sd_timed_block(HZ/8, &_sd_ft_cv); 182 if (_sd_ft_exit == 0) { 183 mutex_enter(&_sd_cache_lock); 184 _sd_cache_dem_cnt--; 185 mutex_exit(&_sd_cache_lock); 186 return; 187 } 188 189 /* NB evaluation order is important here for nvmem systems */ 190 if (_sd_is_mirror_crashed() || 191 (warm_started = _sdbc_warm_start())) { 192 193 /* 194 * Hash invalidate here. We do not want data from 195 * previous failover incarnation to be cache hits, if 196 * the 2 failover happens within a short time 197 */ 198 _sd_hash_invalidate_cd(-1); 199 200 /* 201 * don't change mirror state when warm starting 202 * nvmem systems. _sd_mirror_down() is called in 203 * in _sd_remote_enable() on nvmem systems if the 204 * media is down. 205 */ 206 if (!warm_started) 207 if (!mirror_clean_shutdown) 208 _sd_mirror_down(); 209 else 210 _sd_mirror_cache_down(); 211 212 (void) _sd_set_node_hint(NSC_FORCED_WRTHRU); 213 if (!warm_started) { 214 /* was FAST */ 215 mutex_enter(&_sd_ft_data.fi_lock); 216 _sd_node_recovery = 0; 217 /* was FAST */ 218 mutex_exit(&_sd_ft_data.fi_lock); 219 /* Assume other side is still up */ 220 cmn_err(CE_WARN, 221 "!sdbc(_sd_health_thread)" 222 "Safestore is down. Fast writes %s", 223 (_SD_NODE_HINTS & _SD_WRTHRU_MASK) ? 224 "disabled" : "enabled"); 225 _sd_unblock(&_sd_flush_cv); 226 227 if (SAFESTORE_LOCAL(sdbc_safestore)) 228 continue; 229 230 /* Wait for cache to drain and panic */ 231 _sd_wait_for_dirty(); 232 cmn_err(CE_WARN, 233 "!sdbc(_sd_health_thread)" 234 " dirty blocks flushed"); 235 continue; 236 } 237 /* was FAST */ 238 mutex_enter(&_sd_ft_data.fi_lock); 239 _sd_node_recovery = 1; 240 /* was FAST */ 241 mutex_exit(&_sd_ft_data.fi_lock); 242 if (!SAFESTORE_LOCAL(sdbc_safestore)) 243 cmn_err(CE_WARN, 244 "!sdbc(_sd_health_thread)" 245 " Cache on node %d is down. " 246 "Fast writes %s", 247 _SD_MIRROR_HOST, 248 (_SD_NODE_HINTS & _SD_WRTHRU_MASK) ? 249 "disabled" : "enabled"); 250 cmn_err(CE_NOTE, 251 "!sdbc(_sd_health_thread)" 252 " Cache recovery in progress"); 253 _sd_cache_recover(); 254 255 mutex_enter(&_sd_ft_data.fi_lock); 256 _sd_node_recovery = 0; 257 _sdbc_clear_warm_start(); /* nvmem systems */ 258 cv_broadcast(&_sd_ft_data.fi_rem_sv); 259 mutex_exit(&_sd_ft_data.fi_lock); 260 cmn_err(CE_NOTE, 261 "!sdbc(_sd_health_thread) %s Cache recovery done", 262 _sd_async_recovery ? 263 "asynchronous" : "synchronous"); 264 /* restore previous state */ 265 if (warm_started && !_sd_is_mirror_down()) { 266 (void) _sd_clear_node_hint(NSC_FORCED_WRTHRU); 267 cmn_err(CE_NOTE, 268 "!sdbc(_sd_health_thread) Fast writes %s", 269 (_SD_NODE_HINTS & _SD_WRTHRU_MASK) ? 270 "disabled" : "enabled"); 271 } 272 warm_started = 0; 273 274 } else if (_sd_is_mirror_node_down()) { 275 _sd_mirror_down(); 276 } 277 } 278 } 279 280 /* 281 * _sdbc_recovery_io_wait - wait for i/o being done directly 282 * out of safe storage to complete. If the i/o does not make any 283 * progress within about 25 seconds we return EIO otherwise return 0. 284 * 285 */ 286 static 287 int 288 _sdbc_recovery_io_wait(void) 289 { 290 int tries = 0; 291 int last_numio = 0; 292 293 /* 294 * Wait for numio to reach 0. 295 * If numio has not changed for 85+ seconds, 296 * break & pin blocks 297 */ 298 while (_sd_ft_data.fi_numio > 0) { 299 if (last_numio == _sd_ft_data.fi_numio) { 300 if (++tries > 512) break; 301 } else { 302 last_numio = _sd_ft_data.fi_numio; 303 tries = 0; 304 } 305 delay(HZ/8); 306 } 307 if (_sd_ft_data.fi_numio != 0) { 308 cmn_err(CE_WARN, "!sdbc(_sdbc_recovery_io_wait) %d " 309 "recovery i/o's not done", _sd_ft_data.fi_numio); 310 return (EIO); 311 } 312 return (0); 313 } 314 315 316 #if defined(_SD_FAULT_RES) 317 /* 318 * _sd_recovery_wait() 319 * while _sd_node_recovery is set, accesses to mirrored devices will block 320 * (_sd_node_recovery-1) is count of blocked threads. 321 */ 322 int 323 _sd_recovery_wait(void) 324 { 325 int blk; 326 327 mutex_enter(&_sd_ft_data.fi_lock); 328 blk = _sd_node_recovery ? _sd_node_recovery++ : 0; 329 330 if (blk) 331 cv_wait(&_sd_ft_data.fi_rem_sv, &_sd_ft_data.fi_lock); 332 mutex_exit(&_sd_ft_data.fi_lock); 333 334 if (!_sd_cache_initialized) 335 return (EINVAL); 336 return (0); 337 } 338 339 /* 340 * _sd_recovery_wblk_wait - wait for recovery i/o to a device 341 * to cease. If the file is closed or the cache is disabled 342 * first return an error otherwise return 0. 343 * 344 * A device is being recovered from our point of view either 345 * during failover or by putting a disk back online after 346 * a disk failure. 347 * 348 * This code is used to delay access to a device while recovery 349 * writes are in progress from either a failover or while flushing 350 * i/o after a failed disk has been repaired. 351 */ 352 int 353 _sd_recovery_wblk_wait(int cd) 354 { 355 _sd_cd_info_t *cdi = &_sd_cache_files[cd]; 356 357 while (_sd_cache_initialized && 358 FILE_OPENED(cd) && cdi->cd_recovering) { 359 /* spawn writer if none */ 360 if (!cdi->cd_writer) (void) cd_writer(cd); 361 delay(HZ/8); 362 } 363 if (!_sd_cache_initialized || !FILE_OPENED(cd)) 364 return (EINVAL); 365 return (0); 366 } 367 368 /* 369 * Recover from a crash of another node: 370 * 371 * 1) Open all remote files 372 * 2) Allocate other node's buffers and new buffer headers 373 * 3) Flush all dirty buffers to disk 374 * 4) Deallocate resources 375 */ 376 static void 377 _sd_cache_recover(void) 378 { 379 int cblocks_processed; 380 381 SDTRACE(ST_ENTER|SDF_RECOVER, SDT_INV_CD, 0, SDT_INV_BL, 0, 0); 382 383 /* was FAST */ 384 mutex_enter(&_sd_ft_data.fi_lock); 385 _sd_ft_data.fi_numio = 0; 386 /* was FAST */ 387 mutex_exit(&_sd_ft_data.fi_lock); 388 389 #ifdef _SD_DRIVE_RESP 390 if (!mirror_clean_shutdown) 391 _raw_reset_other(); 392 #endif 393 mirror_clean_shutdown = 0; 394 395 cblocks_processed = _sd_failover_file_open(); 396 397 /* allow cache config to proceed */ 398 mutex_enter(&_sdbc_ft_hold_io_lk); 399 _sdbc_ft_hold_io = 0; 400 cv_signal(&_sdbc_ft_hold_io_cv); 401 mutex_exit(&_sdbc_ft_hold_io_lk); 402 403 /* wait for sequential recovery to complete */ 404 if (!_sd_async_recovery && cblocks_processed) 405 (void) _sdbc_recovery_io_wait(); 406 407 _sd_failover_done(); 408 409 if (cblocks_processed) 410 cmn_err(CE_NOTE, 411 "!sdbc %ssynchronous recovery complete " 412 "%d cache blocks processed", 413 _sd_async_recovery ? "a" : "", 414 cblocks_processed); 415 416 SDTRACE(ST_EXIT|SDF_RECOVER, SDT_INV_CD, 0, SDT_INV_BL, 0, 0); 417 } 418 419 void 420 _sd_mirror_iodone(void) 421 { 422 /* was FAST */ 423 mutex_enter(&_sd_ft_data.fi_lock); 424 _sd_ft_data.fi_numio--; 425 /* was FAST */ 426 mutex_exit(&_sd_ft_data.fi_lock); 427 } 428 429 430 431 /* 432 * _sd_ft_clone -- clone cache block from ft area, retry write or pin. 433 */ 434 static int 435 _sd_ft_clone(ss_centry_info_t *ft_cent, int async) 436 { 437 _sd_cctl_t *ent; 438 int cd = ft_cent->sc_cd; 439 nsc_off_t cblk = ft_cent->sc_fpos; 440 int dirty = ft_cent->sc_dirty; 441 ss_resource_t *res = ft_cent->sc_res; 442 _sd_cd_info_t *cdi; 443 444 SDTRACE(ST_ENTER|SDF_FT_CLONE, cd, BLK_FBAS, cblk, dirty, _SD_NO_NET); 445 cdi = &(_sd_cache_files[cd]); 446 if ((cdi->cd_info->sh_failed != 2) && !FILE_OPENED(cd)) { 447 cmn_err(CE_WARN, "!sdbc(_sd_ft_clone) recovery " 448 "write failed: cd %x; cblk %" NSC_SZFMT "; dirty %x", 449 cd, cblk, dirty); 450 SDTRACE(ST_EXIT|SDF_FT_CLONE, 451 cd, BLK_FBAS, cblk, dirty, EINTR); 452 return (-1); 453 } 454 455 /* 456 * allocate new cache entry and read data 457 */ 458 ent = sdbc_centry_alloc_blks(cd, cblk, 1, 0); 459 460 if (SSOP_READ_CBLOCK(sdbc_safestore, res, (void *)ent->cc_data, 461 CACHE_BLOCK_SIZE, 0) == SS_ERR) { 462 cmn_err(CE_WARN, "!sdbc(_sd_ft_clone) read of " 463 "pinned data block failed. cannot recover " 464 "0x%p size 0x%x", (void *)res, CACHE_BLOCK_SIZE); 465 466 /* _sd_process_failure ?? */ 467 _sd_centry_release(ent); 468 return (-1); 469 } 470 471 ent->cc_write = ft_cent; 472 ent->cc_dirty = ent->cc_valid = (ushort_t)dirty; 473 ent->cc_flag |= (ft_cent->sc_flag & CC_PINNABLE); 474 475 ent->cc_chain = NULL; 476 477 /* 478 * _sd_process_failure() adds to failed list & does pinned callback 479 * otherwise async flush 480 */ 481 if (cdi->cd_info->sh_failed) { /* raw device open/reserve failed */ 482 mutex_enter(&cdi->cd_lock); 483 (cdi->cd_info->sh_numio)++; 484 mutex_exit(&cdi->cd_lock); 485 (void) _sd_process_failure(ent); 486 } else { 487 488 if (cdi->cd_global->sv_pinned != _SD_NO_HOST) { 489 cdi->cd_global->sv_pinned = _SD_NO_HOST; 490 SSOP_SETVOL(sdbc_safestore, cdi->cd_global); 491 } 492 493 if (async) { 494 _sd_enqueue_dirty(cd, ent, ent, 1); 495 } else { 496 /* 497 * this is sync write with asynchronous callback 498 * (queue to disk and return). 499 */ 500 501 mutex_enter(&(cdi->cd_lock)); 502 (cdi->cd_info->sh_numio)++; 503 mutex_exit(&cdi->cd_lock); 504 _sd_async_flcent(ent, cdi->cd_crdev); 505 } 506 } 507 _sd_centry_release(ent); 508 SDTRACE(ST_EXIT|SDF_FT_CLONE, cd, BLK_FBAS, cblk, dirty, _SD_NO_NET); 509 return (0); 510 } 511 512 513 /* 514 * _sd_repin_cd - scan for dirty blocks held by mirror node. 515 * 516 * sdbc on this node is being attached to cd. If sdbc on other 517 * node had failed writes (pinnable or not) we need to take 518 * responsbility for them now here. 519 */ 520 int 521 _sd_repin_cd(int cd) 522 { 523 ss_voldata_t *cd_gl; 524 _sd_cd_info_t *cdi; 525 526 if (!FILE_OPENED(cd)) 527 return (EINVAL); 528 529 cdi = &_sd_cache_files[cd]; 530 if (cdi->cd_global->sv_pinned == _SD_NO_HOST) 531 return (0); 532 533 cd_gl = _sdbc_gl_file_info + cd; 534 535 if (sdbc_recover_vol(cd_gl->sv_vol, cd)) 536 _sd_cd_discard_mirror(cd); 537 538 return (0); 539 } 540 541 542 static int 543 _sd_cache_mirror_enable(int host) 544 { 545 if (_sd_cache_initialized) { 546 if (host != _SD_MIRROR_HOST) { 547 cmn_err(CE_WARN, "!sdbc(_sd_cache_mirror_enable) " 548 "Configured mirror %x. Got message from %x", 549 _SD_MIRROR_HOST, host); 550 return (-EINVAL); 551 } 552 if (_sd_node_recovery) (void) _sd_recovery_wait(); 553 if (_sd_cache_initialized && _sd_is_mirror_down()) { 554 int i; 555 556 /* make sure any pinned data we have is now refreshed */ 557 for (i = 0; i < sdbc_max_devs; i++) 558 if (FILE_OPENED(i)) 559 (void) _sdbc_remote_store_pinned(i); 560 561 cmn_err(CE_NOTE, 562 "!sdbc(_sd_cache_mirror_enable) Cache on " 563 "mirror node %d is up. Fast writes enabled", 564 host); 565 _sd_mirror_up(); 566 (void) _sd_clear_node_hint(NSC_FORCED_WRTHRU); 567 } 568 } 569 _sd_ft_data.fi_host_state = _SD_HOST_CONFIGURED; 570 return (_sd_cache_initialized); 571 } 572 573 574 /* 575 * two stage mirror disable: 576 * stage 0: set FORCED_WRTHRU hint (cache shutdown started) 577 * stage 1: mirror shutdown completed 578 */ 579 static int 580 _sd_cache_mirror_disable(int host, int stage) 581 { 582 if (_sd_cache_initialized) { 583 584 if (host != _SD_MIRROR_HOST) 585 return (0); 586 if (stage == 0) { 587 (void) _sd_set_node_hint(NSC_FORCED_WRTHRU); 588 return (0); 589 } 590 _sd_ft_data.fi_host_state = _SD_HOST_DECONFIGURED; 591 mirror_clean_shutdown = 1; 592 _sd_unblock(&_sd_ft_cv); 593 } else { 594 _sd_ft_data.fi_host_state = _SD_HOST_NONE; 595 } 596 return (0); 597 } 598 599 /* 600 * set the fault tolerant data to indicate the state 601 * of the safestore host. set mode to writethru if appropriate 602 */ 603 static void 604 sdbc_setmodeandftdata() 605 { 606 /* 607 * if single node local safestore or ram safestore 608 * then mark host state as carashed/_SD_HOST_NONE and set writethru 609 */ 610 if (SAFESTORE_LOCAL(sdbc_safestore)) { 611 if (!SAFESTORE_SAFE(sdbc_safestore)) { 612 _sd_mirror_down(); /* mirror node down */ 613 (void) _sd_set_node_hint(NSC_FORCED_WRTHRU); 614 } else { 615 _sd_ft_data.fi_host_state = _SD_HOST_CONFIGURED; 616 if (_sdbc_warm_start()) 617 (void) _sd_set_node_hint(NSC_FORCED_WRTHRU); 618 } 619 } else 620 _sd_remote_enable(); 621 } 622 623 static void 624 _sd_remote_enable(void) 625 { 626 ncall_t *ncall; 627 long r; 628 629 if (ncall_alloc(_SD_MIRROR_HOST, 0, _SD_NO_NET, &ncall)) { 630 _sd_mirror_down(); /* mirror node down */ 631 (void) _sd_set_node_hint(NSC_FORCED_WRTHRU); 632 return; 633 } 634 635 r = ncall_send(ncall, 0, SD_ENABLE, _SD_SELF_HOST); 636 if (!r) (void) ncall_read_reply(ncall, 1, &r); 637 ncall_free(ncall); 638 639 if (r == 1) { /* _sd_cache_initialized */ 640 if (!_sd_is_mirror_crashed() && 641 _sd_ft_data.fi_host_state == _SD_HOST_NONE) 642 _sd_ft_data.fi_host_state = _SD_HOST_CONFIGURED; 643 return; 644 } 645 if (r == ENOLINK) 646 _sd_mirror_down(); /* mirror node down */ 647 else 648 _sd_mirror_cache_down(); /* mirror up, but no cache */ 649 (void) _sd_set_node_hint(NSC_FORCED_WRTHRU); 650 } 651 652 653 void 654 _sd_remote_disable(int stage) 655 { 656 ncall_t *ncall; 657 658 if (ncall_alloc(_SD_MIRROR_HOST, 0, 0, &ncall) == 0) 659 (void) ncall_send(ncall, NCALL_ASYNC, SD_DISABLE, 660 _SD_SELF_HOST, stage); 661 } 662 663 void 664 r_sd_ifs_cache_enable(ncall_t *ncall, int *ap) 665 { 666 ncall_reply(ncall, _sd_cache_mirror_enable(*ap)); 667 } 668 669 670 671 void 672 r_sd_ifs_cache_disable(ncall_t *ncall, int *ap) 673 { 674 (void) _sd_cache_mirror_disable(ap[0], ap[1]); 675 ncall_done(ncall); 676 } 677 678 #else /* (_SD_FAULT_RES) */ 679 680 void r_sd_ifs_cache_enable() {; } 681 void r_sd_ifs_cache_disable() {; } 682 683 #endif /* (_SD_FAULT_RES) */ 684 685 /* 686 * invalidate cache hash table entries for given device 687 * or (-1) all devices belonging to mirrored node 688 */ 689 void 690 _sd_hash_invalidate_cd(int CD) 691 { 692 int i; 693 _sd_cd_info_t *cdi; 694 _sd_hash_hd_t *hptr; 695 _sd_cctl_t *cc_ent, *ent; 696 _sd_hash_bucket_t *bucket; 697 int cd; 698 nsc_off_t blk; 699 700 for (i = 0; i < (_sd_htable->ht_size); i++) { 701 bucket = (_sd_htable->ht_buckets + i); 702 mutex_enter(bucket->hb_lock); 703 hptr = bucket->hb_head; 704 while (hptr) { 705 cc_ent = (_sd_cctl_t *)hptr; 706 cd = CENTRY_CD(cc_ent); 707 blk = CENTRY_BLK(cc_ent); 708 cdi = &_sd_cache_files[cd]; 709 710 /* 711 * Skip if device doesn't match or pinned. 712 * (-1) skip attached cd's 713 */ 714 if ((CD != -1 && (cd != CD || CENTRY_PINNED(cc_ent))) || 715 (CD == -1 && nsc_held(cdi->cd_rawfd))) { 716 hptr = hptr->hh_next; 717 continue; 718 } 719 mutex_exit(bucket->hb_lock); 720 721 ent = cc_ent; 722 fl1: 723 if (CC_CD_BLK_MATCH(cd, blk, ent) || 724 (ent = (_sd_cctl_t *)_sd_hash_search(cd, blk, 725 _sd_htable))) { 726 if (SET_CENTRY_INUSE(ent)) { 727 xmem_inval_inuse++; 728 _sd_cc_wait(cd, blk, ent, CC_INUSE); 729 goto fl1; /* try again */ 730 } 731 732 /* cc_inuse is set, delete on block match */ 733 if (CC_CD_BLK_MATCH(cd, blk, ent)) { 734 xmem_inval_hit++; 735 (void) _sd_hash_delete( 736 (struct _sd_hash_hd *)ent, 737 _sd_htable); 738 739 if (sdbc_use_dmchain) { 740 741 /* attempt to que head */ 742 if (ent->cc_alloc_size_dm) { 743 sdbc_requeue_head_dm_try 744 (ent); 745 } 746 } else 747 _sd_requeue_head(ent); 748 749 } else 750 xmem_inval_miss++; 751 752 CLEAR_CENTRY_INUSE(ent); 753 } 754 mutex_enter(bucket->hb_lock); 755 hptr = bucket->hb_head; 756 } 757 mutex_exit(bucket->hb_lock); 758 } 759 } 760 761 762 /* 763 * _sd_cd_online(cd,discard) 764 * clear local error state. 765 * if (discard && _attached != _SD_SELF_HOST) then release buffers. 766 * if (!discard && _attached != _SD_MIRROR_HOST) then re-issue I/Os 767 * (add to dirty pending queue). 768 * returns: 769 * 0 success 770 * EINVAL invalid device or not failed 771 * EBUSY attached by this node, or by active mirror 772 */ 773 static int 774 _sd_cd_online(int cd, int discard) 775 { 776 _sd_cd_info_t *cdi = &_sd_cache_files[cd]; 777 int failed, num; 778 _sd_cctl_t *cc_ent, *cc_next, *cc_last, *cc_first, *cc_next_chain; 779 780 /* 781 * in the case where a failed device has been closed and 782 * then re-opened, sh_failed will be zero because it is 783 * cleared in _sd_open_cd(). hence the test for 784 * _pinned != _SD_SELF_HOST which allows the restore to 785 * proceed in this scenario. 786 */ 787 if (cd < 0 || cd >= sdbc_max_devs) 788 return (EINVAL); 789 790 if (!cdi->cd_info || !cdi->cd_global) 791 return (EINVAL); 792 793 if ((cdi->cd_info->sh_failed == 0) && 794 (cdi->cd_global->sv_pinned != _SD_SELF_HOST)) 795 return (0); 796 797 if (_sd_nodes_configured > 1) { 798 799 /* can't discard while attached on multinode systems */ 800 if (discard && (cdi->cd_global->sv_attached == _SD_SELF_HOST)) 801 return (EBUSY); 802 803 if (!discard && /* attached by active mirror! */ 804 (cdi->cd_global->sv_attached == _SD_MIRROR_HOST) && 805 !_sd_is_mirror_down()) 806 return (EBUSY); 807 } 808 809 mutex_enter(&cdi->cd_lock); 810 811 cc_ent = cdi->cd_fail_head; 812 failed = cdi->cd_info->sh_numfail; 813 cdi->cd_fail_head = NULL; 814 cdi->cd_info->sh_numfail = 0; 815 cdi->cd_info->sh_failed = 0; 816 cdi->cd_global->sv_pinned = _SD_NO_HOST; 817 SSOP_SETVOL(sdbc_safestore, cdi->cd_global); 818 819 if (cc_ent == NULL) { 820 mutex_exit(&cdi->cd_lock); 821 return (0); 822 } 823 /* prevent any new i/o from arriving for this cd */ 824 if (!discard) 825 cdi->cd_recovering = 1; 826 827 mutex_exit(&cdi->cd_lock); 828 829 num = 0; 830 cc_first = cc_ent; 831 for (; cc_ent; cc_ent = cc_next_chain) { 832 cc_next_chain = cc_ent->cc_dirty_link; 833 834 for (; cc_ent; cc_ent = cc_next) { 835 cc_next = cc_ent->cc_dirty_next; 836 cc_last = cc_ent; 837 num++; 838 839 if (discard) { 840 ss_centry_info_t *wctl; 841 /* was FAST */ 842 mutex_enter(&cc_ent->cc_lock); 843 cc_ent->cc_valid = cc_ent->cc_dirty = 0; 844 cc_ent->cc_flag &= ~(CC_PEND_DIRTY|CC_PINNED); 845 cc_ent->cc_dirty_next = NULL; 846 cc_ent->cc_dirty_link = NULL; 847 wctl = cc_ent->cc_write; 848 cc_ent->cc_write = NULL; 849 /* was FAST */ 850 mutex_exit(&cc_ent->cc_lock); 851 if (wctl) { 852 wctl->sc_flag = 0; 853 wctl->sc_dirty = 0; 854 855 SSOP_SETCENTRY(sdbc_safestore, wctl); 856 SSOP_DEALLOCRESOURCE(sdbc_safestore, 857 wctl->sc_res); 858 } 859 860 continue; 861 } 862 863 /* Clear PEND_DIRTY, iocount & iostatus */ 864 if (SET_CENTRY_INUSE(cc_ent) == 0) { 865 cc_ent->cc_flag &= ~CC_PEND_DIRTY; 866 cc_ent->cc_iocount = 0; 867 cc_ent->cc_iostatus = 0; /* _SD_IO_NONE */ 868 CLEAR_CENTRY_INUSE(cc_ent); 869 } else { 870 /* was FAST */ 871 mutex_enter(&cc_ent->cc_lock); 872 cc_ent->cc_flag &= ~CC_PEND_DIRTY; 873 cc_ent->cc_iocount = 0; 874 cc_ent->cc_iostatus = 0; /* _SD_IO_NONE */ 875 /* was FAST */ 876 mutex_exit(&cc_ent->cc_lock); 877 } 878 } 879 } 880 if (num != failed) 881 cmn_err(CE_WARN, "!sdbc(_sd_cd_online) count %d vs numfail %d", 882 num, failed); 883 if (discard) { 884 _sd_hash_invalidate_cd(cd); 885 return (0); 886 } 887 888 _sd_enqueue_dirty_chain(cd, cc_first, cc_last, num); 889 /* make sure data gets flushed in case there is no new I/O */ 890 (void) nsc_reserve(cdi->cd_rawfd, NSC_MULTI); 891 (void) _sd_wait_for_flush(cd); 892 cdi->cd_recovering = 0; 893 nsc_release(cdi->cd_rawfd); 894 895 return (0); 896 } 897 898 #if defined(_SD_FAULT_RES) 899 900 /* 901 * This node has disk attached, discard pins held by mirror 902 */ 903 static void 904 _sd_cd_discard_mirror(int cd) 905 { 906 ncall_t *ncall; 907 if (ncall_alloc(_SD_MIRROR_HOST, 0, 0, &ncall)) 908 return; 909 (void) ncall_send(ncall, NCALL_ASYNC, SD_CD_DISCARD, cd); 910 } 911 912 void 913 r_cd_discard(ncall_t *ncall, int *ap) 914 { 915 int r, cd = *ap; 916 if (_sd_cache_initialized) { 917 SDTRACE(ST_ENTER|SDF_ONLINE, cd, 1, SDT_INV_BL, 1, 0); 918 r = _sd_cd_online(cd, 1); 919 SDTRACE(ST_EXIT|SDF_ONLINE, cd, 1, SDT_INV_BL, 1, r); 920 } 921 ncall_done(ncall); 922 } 923 924 /* 925 * _sd_failover_file_open - 926 * on failover, open devices which are not attached by this node. 927 */ 928 static int 929 _sd_failover_file_open(void) 930 { 931 int rc, cd, flag = 0; 932 ss_voldata_t *cd_gl; 933 _sd_cd_info_t *cdi; 934 int cblocks_processed = 0; 935 extern ss_voldata_t *_sdbc_gl_file_info; 936 937 for (cd = 0; cd < sdbc_max_devs; cd++) { 938 cd_gl = _sdbc_gl_file_info + cd; 939 cdi = &(_sd_cache_files[cd]); 940 941 /* 942 * If the cd is open and reserved we certainly don't 943 * need to do it again. However the recovery code 944 * must be racing some other cache usage which could 945 * be bad. We really need to be able to lock out 946 * all cache activity for this cd that is not tied 947 * to the recovery process. This doesn't seem to be 948 * feasible in sdbc since a competing thread could 949 * already be finished doing an alloc_buf. If this 950 * hole is to be closed sd-ctl must be more in 951 * control of the failover process. 952 */ 953 if (FILE_OPENED(cd) && nsc_held(cdi->cd_rawfd)) 954 continue; 955 956 /* 957 * this constuct says that, on non-nvmem systems, 958 * if we are attempting to open a "local" device and 959 * nothing is pinned, then continue. i.e. open only 960 * remote devices or devices that have pinned data. 961 * for recovery on nvmem systems we open all devices. 962 */ 963 if ((!_sdbc_warm_start()) && 964 ((cd_gl->sv_attached != _SD_MIRROR_HOST) && 965 (cd_gl->sv_pinned != _SD_MIRROR_HOST) && 966 (cd_gl->sv_pinned != _SD_SELF_HOST))) 967 continue; 968 if (!cd_gl->sv_volname || !cd_gl->sv_volname[0]) 969 continue; 970 971 if (_sd_open_cd(cd_gl->sv_volname, cd, flag) < 0) { 972 cmn_err(CE_WARN, "!sdbc(_sd_failover_file_open) " 973 "Unable to open disk partition %s", 974 cd_gl->sv_volname); 975 continue; 976 } 977 978 SDTRACE(ST_INFO|SDF_RECOVER, cd, 0, 0, 0, 0); 979 rc = nsc_reserve(cdi->cd_rawfd, NSC_MULTI); 980 if (rc == 0) { 981 cdi->cd_failover = 1; 982 } 983 984 if (rc != 0) cdi->cd_info->sh_failed = 1; 985 986 cblocks_processed += sdbc_recover_vol(cd_gl->sv_vol, cd); 987 } 988 989 return (cblocks_processed); 990 } 991 992 993 static int 994 sdbc_recover_vol(ss_vol_t *vol, int cd) 995 { 996 ss_cdirkey_t key; 997 ss_cdir_t cdir; 998 ss_voldata_t *cd_gl = _sdbc_gl_file_info + cd; 999 ss_centry_info_t *cinfo; 1000 ss_centry_info_t centry; 1001 int cblocks_processed = 0; 1002 int err; 1003 ss_centry_info_t *sdbc_get_cinfo_byres(ss_resource_t *); 1004 1005 /* setup the key to get a volume directory stream of centrys */ 1006 key.ck_type = CDIR_VOL; 1007 key.cdk_u.ck_vol = vol; 1008 1009 if (SSOP_GETCDIR(sdbc_safestore, &key, &cdir)) { 1010 cmn_err(CE_WARN, "!sdbc(sdbc_recover_vol): " 1011 "cannot recover volume %s", 1012 cd_gl->sv_volname); 1013 return (0); 1014 } 1015 1016 /* cycle through the cdir getting resource tokens and reading centrys */ 1017 /*CONSTANTCONDITION*/ 1018 while (1) { 1019 1020 if ((err = SSOP_GETCDIRENT(sdbc_safestore, &cdir, ¢ry)) 1021 == SS_ERR) { 1022 cmn_err(CE_WARN, "!sdbc(sdbc_recover_vol): " 1023 "cache entry read failure %s %p", 1024 cd_gl->sv_volname, (void *)centry.sc_res); 1025 1026 continue; 1027 } 1028 1029 1030 if (err == SS_EOF) 1031 break; /* done */ 1032 1033 1034 /* 1035 * this get into double caching consistency 1036 * need to resolve this jgk 1037 */ 1038 if ((cinfo = sdbc_get_cinfo_byres(centry.sc_res)) == NULL) { 1039 /* should not happen */ 1040 cmn_err(CE_WARN, "!sdbc(sdbc_recover_vol): " 1041 "invalid ss resource %p", (void *)centry.sc_res); 1042 continue; 1043 } 1044 bcopy(¢ry, cinfo, sizeof (ss_centry_info_t)); 1045 1046 /* 1047 * note 1048 * ss should return a stream of dirty blocks ordered 1049 * by block number. if it turns out that ss will not support 1050 * this then sorting for async recovery will have to be 1051 * done here jgk 1052 */ 1053 ASSERT(cinfo->sc_dirty); 1054 1055 if (!cinfo->sc_dirty) /* should not happen */ 1056 continue; 1057 1058 /* 1059 * clone mirror cache entry and do 1060 * async I/O or sync I/O or pin if sh_failed 1061 */ 1062 (void) _sd_ft_clone(cinfo, _sd_async_recovery); 1063 ++cblocks_processed; 1064 } 1065 1066 1067 if (cblocks_processed) 1068 cmn_err(CE_NOTE, 1069 "!sdbc(sdbc_recover_vol) %d cache blocks processed for " 1070 "volume %s", cblocks_processed, cd_gl->sv_volname); 1071 1072 return (cblocks_processed); 1073 } 1074 1075 /* 1076 * _sd_failover_done - 1077 * mark failover open'd devices as requiring nsc_release() 1078 * when all queued I/O's have drained. 1079 */ 1080 static void 1081 _sd_failover_done(void) 1082 { 1083 _sd_cd_info_t *cdi; 1084 int cd; 1085 1086 for (cd = 0; cd < sdbc_max_devs; cd++) { 1087 cdi = &(_sd_cache_files[cd]); 1088 1089 if (FILE_OPENED(cd) && cdi->cd_failover) 1090 cdi->cd_failover = 2; 1091 } 1092 } 1093 1094 #endif /* (_SD_FAULT_RES) */ 1095 1096 /* 1097 * _sd_uncommit - discard local buffer modifications 1098 * clear the valid bits. 1099 */ 1100 int 1101 _sd_uncommit(_sd_buf_handle_t *handle, nsc_off_t fba_pos, nsc_size_t fba_len, 1102 int flag) 1103 { 1104 int cd; 1105 sdbc_cblk_fba_t st_cblk_len; /* FBA len of starting cache block */ 1106 sdbc_cblk_fba_t end_cblk_len; /* FBA len of ending cache block */ 1107 sdbc_cblk_fba_t st_cblk_off; /* FBA offset into starting cblock */ 1108 nsc_size_t cc_len; 1109 int bits; 1110 _sd_cctl_t *cc_ent; 1111 1112 cd = HANDLE_CD(handle); 1113 1114 ASSERT_HANDLE_LIMITS(handle, fba_pos, fba_len); 1115 1116 if ((handle->bh_flag & NSC_WRBUF) == 0) { 1117 DTRACE_PROBE(_sd_uncommit_end_handle_write); 1118 1119 return (EINVAL); 1120 } 1121 1122 if (fba_len == 0) { 1123 DTRACE_PROBE(_sd_uncommit_end_zero_len); 1124 return (NSC_DONE); 1125 } 1126 1127 SDTRACE(ST_ENTER|SDF_UNCOMMIT, cd, fba_len, fba_pos, flag, 0); 1128 1129 cc_ent = handle->bh_centry; 1130 while (CENTRY_BLK(cc_ent) != FBA_TO_BLK_NUM(fba_pos)) 1131 cc_ent = cc_ent->cc_chain; 1132 1133 cc_len = fba_len; /* current length */ 1134 st_cblk_off = BLK_FBA_OFF(fba_pos); 1135 st_cblk_len = (BLK_FBAS - st_cblk_off); 1136 if ((nsc_size_t)st_cblk_len >= fba_len) { 1137 end_cblk_len = 0; 1138 st_cblk_len = (sdbc_cblk_fba_t)fba_len; 1139 } 1140 else 1141 end_cblk_len = BLK_FBA_OFF(fba_pos + fba_len); 1142 1143 /* 1144 * Check if remote write-cache spool is dirty, 1145 * if not, we can just discard local valid bits. 1146 */ 1147 bits = SDBC_GET_BITS(st_cblk_off, st_cblk_len); 1148 cc_ent->cc_valid &= ~bits; 1149 1150 cc_len -= st_cblk_len; 1151 cc_ent = cc_ent->cc_chain; 1152 bits = SDBC_GET_BITS(0, BLK_FBAS); 1153 1154 while (cc_len > (nsc_size_t)end_cblk_len) { 1155 cc_ent->cc_valid = 0; 1156 cc_ent = cc_ent->cc_chain; 1157 cc_len -= BLK_FBAS; 1158 } 1159 1160 #if defined(_SD_DEBUG) 1161 if (cc_len != end_cblk_len) 1162 cmn_err(CE_WARN, "!fba_len %" NSC_SZFMT " end_cblk_len %d in " 1163 "_sd_write", fba_len, end_cblk_len); 1164 #endif 1165 1166 if (cc_len) { 1167 bits = SDBC_GET_BITS(0, end_cblk_len); 1168 cc_ent->cc_valid &= ~bits; 1169 } 1170 SDTRACE(ST_EXIT|SDF_UNCOMMIT, cd, fba_len, fba_pos, flag, 0); 1171 1172 return (NSC_DONE); 1173 } 1174 1175 static void 1176 _sd_wait_for_dirty(void) 1177 { 1178 int cd; 1179 1180 for (cd = 0; cd < sdbc_max_devs; cd++) { 1181 while (_SD_CD_WBLK_USED(cd)) 1182 delay(HZ); 1183 } 1184 } 1185 1186 /* 1187 * _sd_wait_for_flush - wait for all i/o for this cd to cease. 1188 * This function assumes that no further i/o are being issued 1189 * against this device. This assumption is enforced by sd-ctl 1190 * when called from _sd_flush_cd. Recovery also uses this 1191 * wait and it enforces this assumption (somewhat imperfectly) 1192 * by using cd_recovering. 1193 * We must see progress in getting i/o complete within 25 seconds 1194 * or we will return an error. If we complete normally (all i/o done) 1195 * we return 0. 1196 */ 1197 int 1198 _sd_wait_for_flush(int cd) 1199 { 1200 _sd_cd_info_t *cdi = &(_sd_cache_files[cd]); 1201 int tries = 0, used, last_used = 0, inprogress = 0; 1202 1203 if (!(_SD_CD_WBLK_USED(cd))) 1204 return (0); 1205 /* 1206 * Wait for WBLK_USED to reach 0. 1207 * If unchanged for 32+ seconds returns EAGAIN 1208 */ 1209 if (!cdi->cd_writer) 1210 (void) cd_writer(cd); /* spawn writer if not already running */ 1211 1212 while (((used = _SD_CD_WBLK_USED(cd)) != 0) || cdi->cd_writer) { 1213 if (last_used == used && 1214 inprogress == cdi->cd_write_inprogress) { 1215 if (cdi->cd_info->sh_failed) 1216 break; 1217 if (++tries > 128) { 1218 cmn_err(CE_WARN, "!sdbc(_sd_wait_for_flush) " 1219 "%s still has %d blocks pending %d" 1220 " in progress (@ %lx)", 1221 cdi->cd_info->sh_filename, last_used, 1222 inprogress, nsc_lbolt()); 1223 return (EAGAIN); 1224 } 1225 } else { 1226 last_used = used; 1227 inprogress = cdi->cd_write_inprogress; 1228 tries = 0; 1229 } 1230 _sd_unblock(&_sd_flush_cv); 1231 delay(HZ/4); 1232 } 1233 if (cdi->cd_info->sh_failed) 1234 return (EIO); 1235 else 1236 return (0); 1237 } 1238 1239 1240 static 1241 int _sd_ft_warm_start; 1242 1243 int 1244 _sdbc_warm_start(void) 1245 { 1246 return (_sd_ft_warm_start); 1247 } 1248 1249 void 1250 _sdbc_clear_warm_start(void) 1251 { 1252 _sd_ft_warm_start = 0; 1253 } 1254 1255 void 1256 _sdbc_set_warm_start(void) 1257 { 1258 _sd_ft_warm_start = 1; 1259 } 1260 1261 /*ARGSUSED*/ 1262 void 1263 _ncall_poke(int host) 1264 { 1265 cmn_err(CE_PANIC, " NYI - _ncall_poke"); 1266 } 1267