xref: /titanic_51/usr/src/uts/common/avs/ns/sdbc/sd_ft.c (revision 3270659f55e0928d6edec3d26217cc29398a8149)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 #include <sys/types.h>
27 #include <sys/ksynch.h>
28 #include <sys/cmn_err.h>
29 #include <sys/errno.h>
30 #include <sys/kmem.h>
31 #include <sys/cred.h>
32 #include <sys/ddi.h>
33 
34 #include <sys/nsc_thread.h>
35 #include <sys/nsctl/nsctl.h>
36 
37 #include <sys/sdt.h>		/* dtrace is S10 or later */
38 
39 #include "sd_bcache.h"
40 #include "sd_ft.h"
41 #include "sd_trace.h"
42 #include "sd_io.h"
43 #include "sd_misc.h"
44 #include <sys/ncall/ncall.h>
45 
46 _sd_ft_info_t  _sd_ft_data;
47 
48 static volatile int _sd_ft_exit = 0;
49 static kcondvar_t _sd_ft_cv;
50 int _sd_node_recovery;		/* node recovery in progress */
51 /*
52  *  _sd_async_recovery:
53  *	0 = flush and wait
54  *	1 = clone and async-write
55  *	2 = quicksort, clone, and async-write
56  * quicksort allows contiguous blocks to be joined,
57  * which may greatly improve recovery time for raid devices.
58  * if kmem_alloc fails, acts as _sd_async_recovery == 1
59  */
60 static int _sd_async_recovery = 2;
61 static int xmem_inval_hit, xmem_inval_miss, xmem_inval_inuse;
62 
63 
64 /*
65  * flag to inhibit reset of remote SCSI buses and sending of
66  * nodedown callback if mirror was deconfigured properly.
67  * - prevents trashing any I/O that may be happening on the mirror
68  *   node during a normal shutdown and prevents undesired simckd failover.
69  */
70 static int mirror_clean_shutdown = 0;
71 
72 /*
73  * Forward declare all statics that are used before defined to enforce
74  * parameter checking
75  * Some (if not all) of these could be removed if the code were reordered
76  */
77 
78 static void _sd_health_thread(void);
79 static void _sd_cache_recover(void);
80 static int _sd_ft_clone(ss_centry_info_t *, int);
81 static void _sd_remote_enable(void);
82 static void sdbc_setmodeandftdata();
83 static void _sd_cd_discard_mirror(int cd);
84 static int _sd_failover_file_open(void);
85 static void _sd_failover_done(void);
86 static void _sd_wait_for_dirty(void);
87 static void _sdbc_clear_warm_start(void);
88 static int sdbc_recover_vol(ss_vol_t *, int);
89 void _ncall_poke(int);
90 
91 int _sdbc_ft_hold_io;
92 kcondvar_t _sdbc_ft_hold_io_cv;
93 kmutex_t _sdbc_ft_hold_io_lk;
94 extern int sdbc_use_dmchain;
95 extern void sdbc_requeue_head_dm_try(_sd_cctl_t *cc_ent);
96 
97 /*
98  * _sdbc_ft_unload - cache is being unloaded (or failed to load).
99  * Deallocate any global lock/sv that we created.
100  */
101 void
102 _sdbc_ft_unload(void)
103 {
104 	cv_destroy(&_sd_ft_cv);
105 	mutex_destroy(&_sd_ft_data.fi_lock);
106 	cv_destroy(&_sd_ft_data.fi_rem_sv);
107 	mutex_destroy(&_sd_ft_data.fi_sleep);
108 	bzero(&_sd_ft_data, sizeof (_sd_ft_info_t));
109 }
110 
111 /*
112  * _sdbc_ft_load - cache is being loaded. Allocate all global lock/sv
113  * that we need. Return 0 if we succeed. If we fail return -1 (don't
114  * need to do the unload step as we expect our caller to do that).
115  */
116 int
117 _sdbc_ft_load(void)
118 {
119 	/* _sd_ft_data is sure to be zeroes, don't need to bzero it */
120 
121 	mutex_init(&_sd_ft_data.fi_lock, NULL, MUTEX_DRIVER, NULL);
122 	cv_init(&_sd_ft_data.fi_rem_sv, NULL, CV_DRIVER, NULL);
123 	cv_init(&_sd_ft_cv, NULL, CV_DRIVER, NULL);
124 	mutex_init(&_sd_ft_data.fi_sleep, NULL, MUTEX_DRIVER, NULL);
125 	return (0);
126 }
127 
128 
129 int
130 _sdbc_ft_configure(void)
131 {
132 	_sd_ft_exit = 1;
133 	return (nsc_create_process(
134 	    (void (*)(void *))_sd_health_thread, 0, TRUE));
135 }
136 
137 
/*
 * _sdbc_ft_deconfigure - stop the health daemon and release any
 * threads blocked in recovery wait.  Clearing _sd_ft_exit before the
 * wakeup makes the daemon observe the exit request on its next poll.
 */
void
_sdbc_ft_deconfigure(void)
{
	_sd_ft_exit = 0;
	_sd_unblock(&_sd_ft_cv);
	mutex_enter(&_sd_ft_data.fi_lock);
	/* cancel any recovery in progress and wake blocked accessors */
	_sd_node_recovery = 0;
	cv_broadcast(&_sd_ft_data.fi_rem_sv);
	mutex_exit(&_sd_ft_data.fi_lock);
}
148 
149 
/*
 * _sd_health_thread -- daemon thread on each node that watches whether
 * the mirror node has crashed and, if so, flushes the mirror's cache
 * entries.  Note we do *not* detect that the node has come up again,
 * but wait for the node to inform us that it is up via
 * _sd_cache_reenable().
 */
static void
_sd_health_thread(void)
{
	int warm_started = 0;

	/* register this daemon with the cache so deconfig can track it */
	mutex_enter(&_sd_cache_lock);
	_sd_cache_dem_cnt++;
	mutex_exit(&_sd_cache_lock);

	/* clear _sd_ft_data in case this is a cache re-enable w/o unload */

	bzero(&_sd_ft_data, sizeof (_sd_ft_info_t));

	/* record safestore host state; forces writethru when unsafe */
	sdbc_setmodeandftdata();

#ifdef DEBUG
	cmn_err(CE_NOTE, "!sdbc(_sd_health_thread) safestore "
	    "is %s. Fast writes %s",
	    (_SD_MIRROR_CONFIGD) ? "up" : "down",
	    (_SD_NODE_HINTS & _SD_WRTHRU_MASK) ?
	    "disabled" : "enabled");
#endif

	/* CONSTCOND */
	while (1) {
		/* poll ~8 times/sec; _sd_ft_cv is signalled on deconfig */
		_sd_timed_block(HZ/8, &_sd_ft_cv);
		if (_sd_ft_exit == 0) {
			/* cache deconfigured: deregister and exit */
			mutex_enter(&_sd_cache_lock);
			_sd_cache_dem_cnt--;
			mutex_exit(&_sd_cache_lock);
			return;
		}

		/* NB evaluation order is important here for nvmem systems */
		if (_sd_is_mirror_crashed() ||
		    (warm_started = _sdbc_warm_start())) {

			/*
			 * Hash invalidate here. We do not want data from
			 * previous failover incarnation to be cache hits, if
			 * the 2 failover happens within a short time
			 */
			_sd_hash_invalidate_cd(-1);

			/*
			 * don't change mirror state when warm starting
			 * nvmem systems.  _sd_mirror_down() is called in
			 * in _sd_remote_enable() on nvmem systems if the
			 * media is down.
			 */
			if (!warm_started)
				if (!mirror_clean_shutdown)
					_sd_mirror_down();
				else
					_sd_mirror_cache_down();

			(void) _sd_set_node_hint(NSC_FORCED_WRTHRU);
			if (!warm_started) {
				/* mirror crash: no recovery possible here */
				/* was FAST */
				mutex_enter(&_sd_ft_data.fi_lock);
				_sd_node_recovery = 0;
				/* was FAST */
				mutex_exit(&_sd_ft_data.fi_lock);
				/* Assume other side is still up */
				cmn_err(CE_WARN,
				    "!sdbc(_sd_health_thread)"
				    "Safestore is down. Fast writes %s",
				    (_SD_NODE_HINTS & _SD_WRTHRU_MASK) ?
				    "disabled" : "enabled");
				_sd_unblock(&_sd_flush_cv);

				if (SAFESTORE_LOCAL(sdbc_safestore))
					continue;

				/* Wait for cache to drain and panic */
				_sd_wait_for_dirty();
				cmn_err(CE_WARN,
				    "!sdbc(_sd_health_thread)"
				    " dirty blocks flushed");
				continue;
			}
			/* warm start: recover dirty blocks from safestore */
			/* was FAST */
			mutex_enter(&_sd_ft_data.fi_lock);
			_sd_node_recovery = 1;
			/* was FAST */
			mutex_exit(&_sd_ft_data.fi_lock);
			if (!SAFESTORE_LOCAL(sdbc_safestore))
				cmn_err(CE_WARN,
				    "!sdbc(_sd_health_thread)"
				    " Cache on node %d is down. "
				    "Fast writes %s",
				    _SD_MIRROR_HOST,
				    (_SD_NODE_HINTS & _SD_WRTHRU_MASK) ?
				    "disabled" : "enabled");
			cmn_err(CE_NOTE,
			    "!sdbc(_sd_health_thread)"
			    " Cache recovery in progress");
			_sd_cache_recover();

			/* recovery done: wake threads in _sd_recovery_wait() */
			mutex_enter(&_sd_ft_data.fi_lock);
			_sd_node_recovery = 0;
			_sdbc_clear_warm_start(); /* nvmem systems */
			cv_broadcast(&_sd_ft_data.fi_rem_sv);
			mutex_exit(&_sd_ft_data.fi_lock);
			cmn_err(CE_NOTE,
			    "!sdbc(_sd_health_thread) %s Cache recovery done",
			    _sd_async_recovery ?
			    "asynchronous" : "synchronous");
			/* restore previous state */
			if (warm_started && !_sd_is_mirror_down()) {
				(void) _sd_clear_node_hint(NSC_FORCED_WRTHRU);
				cmn_err(CE_NOTE,
				    "!sdbc(_sd_health_thread) Fast writes %s",
				    (_SD_NODE_HINTS & _SD_WRTHRU_MASK) ?
				    "disabled" : "enabled");
			}
			warm_started = 0;

		} else if (_sd_is_mirror_node_down()) {
			_sd_mirror_down();
		}
	}
}
279 
280 /*
281  * _sdbc_recovery_io_wait - wait for i/o being done directly
282  * out of safe storage to complete. If the i/o does not make any
283  * progress within about 25 seconds we return EIO otherwise return 0.
284  *
285  */
286 static
287 int
288 _sdbc_recovery_io_wait(void)
289 {
290 	int tries = 0;
291 	int last_numio = 0;
292 
293 	/*
294 	 * Wait for numio to reach 0.
295 	 * If numio has not changed for 85+ seconds,
296 	 * break & pin blocks
297 	 */
298 	while (_sd_ft_data.fi_numio > 0) {
299 		if (last_numio == _sd_ft_data.fi_numio) {
300 			if (++tries > 512) break;
301 		} else {
302 			last_numio = _sd_ft_data.fi_numio;
303 			tries = 0;
304 		}
305 		delay(HZ/8);
306 	}
307 	if (_sd_ft_data.fi_numio != 0) {
308 		cmn_err(CE_WARN, "!sdbc(_sdbc_recovery_io_wait) %d "
309 		    "recovery i/o's not done", _sd_ft_data.fi_numio);
310 		return (EIO);
311 	}
312 	return (0);
313 }
314 
315 
316 #if defined(_SD_FAULT_RES)
317 /*
318  * _sd_recovery_wait()
319  *   while _sd_node_recovery is set, accesses to mirrored devices will block
320  *   (_sd_node_recovery-1) is count of blocked threads.
321  */
322 int
323 _sd_recovery_wait(void)
324 {
325 	int blk;
326 
327 	mutex_enter(&_sd_ft_data.fi_lock);
328 	blk = _sd_node_recovery ? _sd_node_recovery++ : 0;
329 
330 	if (blk)
331 		cv_wait(&_sd_ft_data.fi_rem_sv, &_sd_ft_data.fi_lock);
332 	mutex_exit(&_sd_ft_data.fi_lock);
333 
334 	if (!_sd_cache_initialized)
335 		return (EINVAL);
336 	return (0);
337 }
338 
339 /*
340  * _sd_recovery_wblk_wait - wait for recovery i/o to a device
341  * to cease. If the file is closed or the cache is disabled
342  * first return an error otherwise return 0.
343  *
344  * A device is being recovered from our point of view either
345  * during failover or by putting a disk back online after
346  * a disk failure.
347  *
348  * This code is used to delay access to a device while recovery
349  * writes are in progress from either a failover or while flushing
350  * i/o after a failed disk has been repaired.
351  */
352 int
353 _sd_recovery_wblk_wait(int cd)
354 {
355 	_sd_cd_info_t *cdi = &_sd_cache_files[cd];
356 
357 	while (_sd_cache_initialized &&
358 	    FILE_OPENED(cd) && cdi->cd_recovering) {
359 		/* spawn writer if none */
360 		if (!cdi->cd_writer) (void) cd_writer(cd);
361 		delay(HZ/8);
362 	}
363 	if (!_sd_cache_initialized || !FILE_OPENED(cd))
364 		return (EINVAL);
365 	return (0);
366 }
367 
/*
 * Recover from a crash of another node:
 *
 * 1) Open all remote files
 * 2) Allocate other node's buffers and new buffer headers
 * 3) Flush all dirty buffers to disk
 * 4) Deallocate resources
 */
static void
_sd_cache_recover(void)
{
	int cblocks_processed;

	SDTRACE(ST_ENTER|SDF_RECOVER, SDT_INV_CD, 0, SDT_INV_BL, 0, 0);

	/* reset the outstanding recovery i/o count */
	/* was FAST */
	mutex_enter(&_sd_ft_data.fi_lock);
	_sd_ft_data.fi_numio = 0;
	/* was FAST */
	mutex_exit(&_sd_ft_data.fi_lock);

#ifdef _SD_DRIVE_RESP
	/* reset remote SCSI buses unless the mirror shut down cleanly */
	if (!mirror_clean_shutdown)
		_raw_reset_other();
#endif
	mirror_clean_shutdown = 0;

	/* open devices and issue/queue recovery writes for dirty blocks */
	cblocks_processed = _sd_failover_file_open();

	/* allow cache config to proceed */
	mutex_enter(&_sdbc_ft_hold_io_lk);
	_sdbc_ft_hold_io = 0;
	cv_signal(&_sdbc_ft_hold_io_cv);
	mutex_exit(&_sdbc_ft_hold_io_lk);

	/* wait for sequential recovery to complete */
	if (!_sd_async_recovery && cblocks_processed)
		(void) _sdbc_recovery_io_wait();

	_sd_failover_done();

	if (cblocks_processed)
		cmn_err(CE_NOTE,
		    "!sdbc %ssynchronous recovery complete "
		    "%d cache blocks processed",
		    _sd_async_recovery ? "a" : "",
		    cblocks_processed);

	SDTRACE(ST_EXIT|SDF_RECOVER, SDT_INV_CD, 0, SDT_INV_BL, 0, 0);
}
418 
419 void
420 _sd_mirror_iodone(void)
421 {
422 	/* was FAST */
423 	mutex_enter(&_sd_ft_data.fi_lock);
424 	_sd_ft_data.fi_numio--;
425 	/* was FAST */
426 	mutex_exit(&_sd_ft_data.fi_lock);
427 }
428 
429 
430 
/*
 * _sd_ft_clone -- clone cache block from ft area, retry write or pin.
 * Reads the saved block out of safestore into a freshly allocated
 * cache entry, then either queues it dirty (async), issues a sync
 * write with async callback, or routes it through the failure path
 * (pin) when the raw device open/reserve failed.
 * Returns 0 on success, -1 if the block could not be recovered.
 */
static int
_sd_ft_clone(ss_centry_info_t *ft_cent, int async)
{
	_sd_cctl_t *ent;
	int cd = ft_cent->sc_cd;
	nsc_off_t cblk = ft_cent->sc_fpos;
	int dirty = ft_cent->sc_dirty;
	ss_resource_t *res = ft_cent->sc_res;
	_sd_cd_info_t *cdi;

	SDTRACE(ST_ENTER|SDF_FT_CLONE, cd, BLK_FBAS, cblk, dirty, _SD_NO_NET);
	cdi = &(_sd_cache_files[cd]);
	/*
	 * sh_failed == 2 appears to mean "open itself failed" — in every
	 * other failure mode the cd must be open to proceed.
	 */
	if ((cdi->cd_info->sh_failed != 2) && !FILE_OPENED(cd)) {
		cmn_err(CE_WARN, "!sdbc(_sd_ft_clone) recovery "
		    "write failed: cd %x; cblk %" NSC_SZFMT "; dirty %x",
		    cd, cblk, dirty);
		SDTRACE(ST_EXIT|SDF_FT_CLONE,
		    cd, BLK_FBAS, cblk, dirty, EINTR);
		return (-1);
	}

	/*
	 * allocate new cache entry and read data
	 */
	ent = sdbc_centry_alloc_blks(cd, cblk, 1, 0);

	if (SSOP_READ_CBLOCK(sdbc_safestore, res, (void *)ent->cc_data,
	    CACHE_BLOCK_SIZE, 0) == SS_ERR) {
		cmn_err(CE_WARN, "!sdbc(_sd_ft_clone) read of "
		    "pinned data block failed. cannot recover "
		    "0x%p size 0x%x", (void *)res, CACHE_BLOCK_SIZE);

		/* _sd_process_failure ?? */
		_sd_centry_release(ent);
		return (-1);
	}

	/* take over the safestore centry; mark the data valid and dirty */
	ent->cc_write = ft_cent;
	ent->cc_dirty = ent->cc_valid = (ushort_t)dirty;
	ent->cc_flag |= (ft_cent->sc_flag & CC_PINNABLE);

	ent->cc_chain = NULL;

	/*
	 * _sd_process_failure() adds to failed list & does pinned callback
	 * otherwise async flush
	 */
	if (cdi->cd_info->sh_failed) { /* raw device open/reserve failed */
		mutex_enter(&cdi->cd_lock);
		(cdi->cd_info->sh_numio)++;
		mutex_exit(&cdi->cd_lock);
		(void) _sd_process_failure(ent);
	} else {

		/* this node now owns any pinned data for this volume */
		if (cdi->cd_global->sv_pinned != _SD_NO_HOST) {
			cdi->cd_global->sv_pinned = _SD_NO_HOST;
			SSOP_SETVOL(sdbc_safestore, cdi->cd_global);
		}

		if (async) {
			_sd_enqueue_dirty(cd, ent, ent, 1);
		} else {
			/*
			 * this is sync write with asynchronous callback
			 * (queue to disk and return).
			 */

			mutex_enter(&(cdi->cd_lock));
			(cdi->cd_info->sh_numio)++;
			mutex_exit(&cdi->cd_lock);
			_sd_async_flcent(ent, cdi->cd_crdev);
		}
	}
	_sd_centry_release(ent);
	SDTRACE(ST_EXIT|SDF_FT_CLONE, cd, BLK_FBAS, cblk, dirty, _SD_NO_NET);
	return (0);
}
511 
512 
513 /*
514  * _sd_repin_cd - scan for dirty blocks held by mirror node.
515  *
516  * sdbc on this node is being attached to cd. If sdbc on other
517  * node had failed writes (pinnable or not) we need to take
518  * responsbility for them now here.
519  */
520 int
521 _sd_repin_cd(int cd)
522 {
523 	ss_voldata_t *cd_gl;
524 	_sd_cd_info_t *cdi;
525 
526 	if (!FILE_OPENED(cd))
527 		return (EINVAL);
528 
529 	cdi = &_sd_cache_files[cd];
530 	if (cdi->cd_global->sv_pinned == _SD_NO_HOST)
531 		return (0);
532 
533 	cd_gl = _sdbc_gl_file_info + cd;
534 
535 	if (sdbc_recover_vol(cd_gl->sv_vol, cd))
536 		_sd_cd_discard_mirror(cd);
537 
538 	return (0);
539 }
540 
541 
542 static int
543 _sd_cache_mirror_enable(int host)
544 {
545 	if (_sd_cache_initialized) {
546 		if (host != _SD_MIRROR_HOST) {
547 			cmn_err(CE_WARN, "!sdbc(_sd_cache_mirror_enable) "
548 			    "Configured mirror %x. Got message from %x",
549 			    _SD_MIRROR_HOST, host);
550 			return (-EINVAL);
551 		}
552 		if (_sd_node_recovery) (void) _sd_recovery_wait();
553 		if (_sd_cache_initialized && _sd_is_mirror_down()) {
554 			int i;
555 
556 			/* make sure any pinned data we have is now refreshed */
557 			for (i = 0; i < sdbc_max_devs; i++)
558 				if (FILE_OPENED(i))
559 					(void) _sdbc_remote_store_pinned(i);
560 
561 			cmn_err(CE_NOTE,
562 			    "!sdbc(_sd_cache_mirror_enable) Cache on "
563 			    "mirror node %d is up. Fast writes enabled",
564 			    host);
565 			_sd_mirror_up();
566 			(void) _sd_clear_node_hint(NSC_FORCED_WRTHRU);
567 		}
568 	}
569 	_sd_ft_data.fi_host_state = _SD_HOST_CONFIGURED;
570 	return (_sd_cache_initialized);
571 }
572 
573 
574 /*
575  * two stage mirror disable:
576  *	stage 0: set FORCED_WRTHRU hint (cache shutdown started)
577  *	stage 1: mirror shutdown completed
578  */
579 static int
580 _sd_cache_mirror_disable(int host, int stage)
581 {
582 	if (_sd_cache_initialized) {
583 
584 		if (host != _SD_MIRROR_HOST)
585 			return (0);
586 		if (stage == 0) {
587 			(void) _sd_set_node_hint(NSC_FORCED_WRTHRU);
588 			return (0);
589 		}
590 		_sd_ft_data.fi_host_state = _SD_HOST_DECONFIGURED;
591 		mirror_clean_shutdown = 1;
592 		_sd_unblock(&_sd_ft_cv);
593 	} else {
594 		_sd_ft_data.fi_host_state = _SD_HOST_NONE;
595 	}
596 	return (0);
597 }
598 
599 /*
600  * set the fault tolerant data to indicate the state
601  * of the safestore host.  set mode to writethru if appropriate
602  */
603 static void
604 sdbc_setmodeandftdata()
605 {
606 	/*
607 	 * if single node local safestore or ram safestore
608 	 * then mark host state as carashed/_SD_HOST_NONE and set writethru
609 	 */
610 	if (SAFESTORE_LOCAL(sdbc_safestore)) {
611 		if (!SAFESTORE_SAFE(sdbc_safestore)) {
612 			_sd_mirror_down();	/* mirror node down */
613 			(void) _sd_set_node_hint(NSC_FORCED_WRTHRU);
614 		} else {
615 			_sd_ft_data.fi_host_state = _SD_HOST_CONFIGURED;
616 			if (_sdbc_warm_start())
617 				(void) _sd_set_node_hint(NSC_FORCED_WRTHRU);
618 		}
619 	} else
620 		_sd_remote_enable();
621 }
622 
623 static void
624 _sd_remote_enable(void)
625 {
626 	ncall_t *ncall;
627 	long r;
628 
629 	if (ncall_alloc(_SD_MIRROR_HOST, 0, _SD_NO_NET, &ncall)) {
630 		_sd_mirror_down();	/* mirror node down */
631 		(void) _sd_set_node_hint(NSC_FORCED_WRTHRU);
632 		return;
633 	}
634 
635 	r = ncall_send(ncall, 0, SD_ENABLE, _SD_SELF_HOST);
636 	if (!r) (void) ncall_read_reply(ncall, 1, &r);
637 	ncall_free(ncall);
638 
639 	if (r == 1) {		/* _sd_cache_initialized */
640 		if (!_sd_is_mirror_crashed() &&
641 		    _sd_ft_data.fi_host_state == _SD_HOST_NONE)
642 			_sd_ft_data.fi_host_state = _SD_HOST_CONFIGURED;
643 		return;
644 	}
645 	if (r == ENOLINK)
646 		_sd_mirror_down();		/* mirror node down */
647 	else
648 		_sd_mirror_cache_down();	/* mirror up, but no cache */
649 	(void) _sd_set_node_hint(NSC_FORCED_WRTHRU);
650 }
651 
652 
653 void
654 _sd_remote_disable(int stage)
655 {
656 	ncall_t *ncall;
657 
658 	if (ncall_alloc(_SD_MIRROR_HOST, 0, 0, &ncall) == 0)
659 		(void) ncall_send(ncall, NCALL_ASYNC, SD_DISABLE,
660 		    _SD_SELF_HOST, stage);
661 }
662 
663 void
664 r_sd_ifs_cache_enable(ncall_t *ncall, int *ap)
665 {
666 	ncall_reply(ncall, _sd_cache_mirror_enable(*ap));
667 }
668 
669 
670 
671 void
672 r_sd_ifs_cache_disable(ncall_t *ncall, int *ap)
673 {
674 	(void) _sd_cache_mirror_disable(ap[0], ap[1]);
675 	ncall_done(ncall);
676 }
677 
678 #else /* (_SD_FAULT_RES) */
679 
/* empty ncall service stubs used when fault resilience is not built in */
void r_sd_ifs_cache_enable()  {; }
void r_sd_ifs_cache_disable() {; }
682 
683 #endif /* (_SD_FAULT_RES) */
684 
/*
 * _sd_hash_invalidate_cd - invalidate cache hash table entries for
 * the given device, or (-1) all devices belonging to the mirrored
 * node.  Pinned entries are skipped; for CD == -1, entries on
 * locally attached (held) devices are also skipped.
 */
void
_sd_hash_invalidate_cd(int CD)
{
	int i;
	_sd_cd_info_t *cdi;
	_sd_hash_hd_t *hptr;
	_sd_cctl_t *cc_ent, *ent;
	_sd_hash_bucket_t *bucket;
	int cd;
	nsc_off_t blk;

	for (i = 0; i < (_sd_htable->ht_size); i++) {
		bucket = (_sd_htable->ht_buckets + i);
		mutex_enter(bucket->hb_lock);
		hptr = bucket->hb_head;
		while (hptr) {
			cc_ent = (_sd_cctl_t *)hptr;
			cd = CENTRY_CD(cc_ent);
			blk = CENTRY_BLK(cc_ent);
			cdi = &_sd_cache_files[cd];

			/*
			 * Skip if device doesn't match or pinned.
			 * (-1) skip attached cd's
			 */
			if ((CD != -1 && (cd != CD || CENTRY_PINNED(cc_ent))) ||
			    (CD == -1 && nsc_held(cdi->cd_rawfd))) {
				hptr = hptr->hh_next;
				continue;
			}
			/*
			 * must drop the bucket lock before locking the
			 * entry; the scan restarts from the bucket head
			 * below since the chain may have changed.
			 */
			mutex_exit(bucket->hb_lock);

			ent = cc_ent;
		fl1:
			/* re-find the entry; it may have moved or gone */
			if (CC_CD_BLK_MATCH(cd, blk, ent) ||
			    (ent = (_sd_cctl_t *)_sd_hash_search(cd, blk,
			    _sd_htable))) {
				if (SET_CENTRY_INUSE(ent)) {
					/* entry busy: wait for it, retry */
					xmem_inval_inuse++;
					_sd_cc_wait(cd, blk, ent, CC_INUSE);
					goto fl1; /* try again */
				}

				/* cc_inuse is set, delete on block match */
				if (CC_CD_BLK_MATCH(cd, blk, ent)) {
					xmem_inval_hit++;
					(void) _sd_hash_delete(
					    (struct _sd_hash_hd *)ent,
					    _sd_htable);

					if (sdbc_use_dmchain) {

						/* attempt to que head */
						if (ent->cc_alloc_size_dm) {
							sdbc_requeue_head_dm_try
							    (ent);
						}
					} else
						_sd_requeue_head(ent);

				} else
					xmem_inval_miss++;

				CLEAR_CENTRY_INUSE(ent);
			}
			/* retake the bucket lock and rescan from the head */
			mutex_enter(bucket->hb_lock);
			hptr = bucket->hb_head;
		}
		mutex_exit(bucket->hb_lock);
	}
}
760 
761 
/*
 * _sd_cd_online(cd,discard)
 *	clear local error state.
 *	if (discard && _attached != _SD_SELF_HOST) then release buffers.
 *	if (!discard && _attached != _SD_MIRROR_HOST) then re-issue I/Os
 *		(add to dirty pending queue).
 * returns:
 *	0	success
 *	EINVAL	invalid device or not failed
 *	EBUSY	attached by this node, or by active mirror
 */
static int
_sd_cd_online(int cd, int discard)
{
	_sd_cd_info_t *cdi = &_sd_cache_files[cd];
	int failed, num;
	_sd_cctl_t *cc_ent, *cc_next, *cc_last, *cc_first, *cc_next_chain;

	/*
	 * in the case where a failed device has been closed and
	 * then re-opened, sh_failed will be zero because it is
	 * cleared in _sd_open_cd().  hence the test for
	 * _pinned != _SD_SELF_HOST which allows the restore to
	 * proceed in this scenario.
	 */
	if (cd < 0 || cd >= sdbc_max_devs)
		return (EINVAL);

	if (!cdi->cd_info || !cdi->cd_global)
		return (EINVAL);

	if ((cdi->cd_info->sh_failed == 0) &&
	    (cdi->cd_global->sv_pinned != _SD_SELF_HOST))
		return (0);

	if (_sd_nodes_configured > 1) {

		/* can't discard while attached on multinode systems */
		if (discard && (cdi->cd_global->sv_attached == _SD_SELF_HOST))
			return (EBUSY);

		if (!discard &&		/* attached by active mirror! */
		    (cdi->cd_global->sv_attached == _SD_MIRROR_HOST) &&
		    !_sd_is_mirror_down())
			return (EBUSY);
	}

	mutex_enter(&cdi->cd_lock);

	/* detach the failed-entry list and clear the error state */
	cc_ent = cdi->cd_fail_head;
	failed = cdi->cd_info->sh_numfail;
	cdi->cd_fail_head = NULL;
	cdi->cd_info->sh_numfail = 0;
	cdi->cd_info->sh_failed = 0;
	cdi->cd_global->sv_pinned = _SD_NO_HOST;
	SSOP_SETVOL(sdbc_safestore, cdi->cd_global);

	if (cc_ent == NULL) {
		mutex_exit(&cdi->cd_lock);
		return (0);
	}
	/* prevent any new i/o from arriving for this cd */
	if (!discard)
		cdi->cd_recovering = 1;

	mutex_exit(&cdi->cd_lock);

	num = 0;
	cc_first = cc_ent;
	/* outer loop: chains via cc_dirty_link; inner: cc_dirty_next */
	for (; cc_ent; cc_ent = cc_next_chain) {
		cc_next_chain = cc_ent->cc_dirty_link;

		for (; cc_ent; cc_ent = cc_next) {
			cc_next = cc_ent->cc_dirty_next;
			cc_last = cc_ent;
			num++;

			if (discard) {
				ss_centry_info_t *wctl;
				/* drop the block and free its safestore */
				/* was FAST */
				mutex_enter(&cc_ent->cc_lock);
				cc_ent->cc_valid = cc_ent->cc_dirty = 0;
				cc_ent->cc_flag &= ~(CC_PEND_DIRTY|CC_PINNED);
				cc_ent->cc_dirty_next = NULL;
				cc_ent->cc_dirty_link = NULL;
				wctl = cc_ent->cc_write;
				cc_ent->cc_write = NULL;
				/* was FAST */
				mutex_exit(&cc_ent->cc_lock);
				if (wctl) {
					wctl->sc_flag = 0;
					wctl->sc_dirty = 0;

					SSOP_SETCENTRY(sdbc_safestore, wctl);
					SSOP_DEALLOCRESOURCE(sdbc_safestore,
					    wctl->sc_res);
				}

				continue;
			}

			/* Clear PEND_DIRTY, iocount & iostatus */
			if (SET_CENTRY_INUSE(cc_ent) == 0) {
				cc_ent->cc_flag &= ~CC_PEND_DIRTY;
				cc_ent->cc_iocount = 0;
				cc_ent->cc_iostatus = 0; /* _SD_IO_NONE */
				CLEAR_CENTRY_INUSE(cc_ent);
			} else {
				/* entry busy: clear fields under its lock */
				/* was FAST */
				mutex_enter(&cc_ent->cc_lock);
				cc_ent->cc_flag &= ~CC_PEND_DIRTY;
				cc_ent->cc_iocount = 0;
				cc_ent->cc_iostatus = 0; /* _SD_IO_NONE */
				/* was FAST */
				mutex_exit(&cc_ent->cc_lock);
			}
		}
	}
	if (num != failed)
		cmn_err(CE_WARN, "!sdbc(_sd_cd_online) count %d vs numfail %d",
		    num, failed);
	if (discard) {
		_sd_hash_invalidate_cd(cd);
		return (0);
	}

	/* re-issue the failed writes and wait for them to flush */
	_sd_enqueue_dirty_chain(cd, cc_first, cc_last, num);
	/* make sure data gets flushed in case there is no new I/O */
	(void) nsc_reserve(cdi->cd_rawfd, NSC_MULTI);
	(void) _sd_wait_for_flush(cd);
	cdi->cd_recovering = 0;
	nsc_release(cdi->cd_rawfd);

	return (0);
}
897 
898 #if defined(_SD_FAULT_RES)
899 
900 /*
901  * This node has disk attached, discard pins held by mirror
902  */
903 static void
904 _sd_cd_discard_mirror(int cd)
905 {
906 	ncall_t *ncall;
907 	if (ncall_alloc(_SD_MIRROR_HOST, 0, 0, &ncall))
908 		return;
909 	(void) ncall_send(ncall, NCALL_ASYNC, SD_CD_DISCARD, cd);
910 }
911 
912 void
913 r_cd_discard(ncall_t *ncall, int *ap)
914 {
915 	int r, cd = *ap;
916 	if (_sd_cache_initialized) {
917 		SDTRACE(ST_ENTER|SDF_ONLINE, cd, 1, SDT_INV_BL, 1, 0);
918 		r = _sd_cd_online(cd, 1);
919 		SDTRACE(ST_EXIT|SDF_ONLINE, cd, 1, SDT_INV_BL, 1, r);
920 	}
921 	ncall_done(ncall);
922 }
923 
924 /*
925  * _sd_failover_file_open -
926  *	on failover, open devices which are not attached by this node.
927  */
928 static int
929 _sd_failover_file_open(void)
930 {
931 	int rc, cd, flag = 0;
932 	ss_voldata_t *cd_gl;
933 	_sd_cd_info_t *cdi;
934 	int cblocks_processed = 0;
935 	extern ss_voldata_t *_sdbc_gl_file_info;
936 
937 	for (cd = 0; cd < sdbc_max_devs; cd++) {
938 		cd_gl = _sdbc_gl_file_info + cd;
939 		cdi = &(_sd_cache_files[cd]);
940 
941 		/*
942 		 * If the cd is open and reserved we certainly don't
943 		 * need to do it again. However the recovery code
944 		 * must be racing some other cache usage which could
945 		 * be bad.  We really need to be able to lock out
946 		 * all cache activity for this cd that is not tied
947 		 * to the recovery process. This doesn't seem to be
948 		 * feasible in sdbc since a competing thread could
949 		 * already be finished doing an alloc_buf. If this
950 		 * hole is to be closed sd-ctl must be more in
951 		 * control of the failover process.
952 		 */
953 		if (FILE_OPENED(cd) && nsc_held(cdi->cd_rawfd))
954 			continue;
955 
956 		/*
957 		 * this constuct says that, on non-nvmem systems,
958 		 * if we are attempting to open a "local" device and
959 		 * nothing is pinned, then continue.  i.e. open only
960 		 * remote devices or devices that have pinned data.
961 		 * for recovery on nvmem systems we open all devices.
962 		 */
963 		if ((!_sdbc_warm_start()) &&
964 		    ((cd_gl->sv_attached != _SD_MIRROR_HOST) &&
965 		    (cd_gl->sv_pinned != _SD_MIRROR_HOST) &&
966 		    (cd_gl->sv_pinned != _SD_SELF_HOST)))
967 			continue;
968 		if (!cd_gl->sv_volname || !cd_gl->sv_volname[0])
969 			continue;
970 
971 		if (_sd_open_cd(cd_gl->sv_volname, cd, flag) < 0) {
972 			cmn_err(CE_WARN, "!sdbc(_sd_failover_file_open) "
973 			    "Unable to open disk partition %s",
974 			    cd_gl->sv_volname);
975 			continue;
976 		}
977 
978 		SDTRACE(ST_INFO|SDF_RECOVER, cd, 0, 0, 0, 0);
979 		rc = nsc_reserve(cdi->cd_rawfd, NSC_MULTI);
980 		if (rc == 0) {
981 			cdi->cd_failover = 1;
982 		}
983 
984 		if (rc != 0) cdi->cd_info->sh_failed = 1;
985 
986 		cblocks_processed += sdbc_recover_vol(cd_gl->sv_vol, cd);
987 	}
988 
989 	return (cblocks_processed);
990 }
991 
992 
/*
 * sdbc_recover_vol - walk the safestore volume directory for vol,
 * cloning each dirty cache entry back into the cache (via
 * _sd_ft_clone).  Returns the number of cache blocks processed.
 */
static int
sdbc_recover_vol(ss_vol_t *vol, int cd)
{
	ss_cdirkey_t key;
	ss_cdir_t cdir;
	ss_voldata_t *cd_gl = _sdbc_gl_file_info + cd;
	ss_centry_info_t *cinfo;
	ss_centry_info_t centry;
	int cblocks_processed = 0;
	int err;
	ss_centry_info_t *sdbc_get_cinfo_byres(ss_resource_t *);

	/* setup the key to get a volume directory stream of centrys */
	key.ck_type  = CDIR_VOL;
	key.cdk_u.ck_vol = vol;

	if (SSOP_GETCDIR(sdbc_safestore, &key, &cdir)) {
		cmn_err(CE_WARN, "!sdbc(sdbc_recover_vol): "
		    "cannot recover volume %s",
		    cd_gl->sv_volname);
		return (0);
	}

	/* cycle through the cdir getting resource tokens and reading centrys */
	/*CONSTANTCONDITION*/
	while (1) {

		if ((err = SSOP_GETCDIRENT(sdbc_safestore, &cdir, &centry))
		    == SS_ERR) {
			/*
			 * NOTE(review): on SS_ERR this continues with the
			 * same stream; a persistently failing entry would
			 * spin here -- presumably the stream advances past
			 * bad entries.  verify against SSOP_GETCDIRENT.
			 */
			cmn_err(CE_WARN, "!sdbc(sdbc_recover_vol): "
			    "cache entry read failure %s %p",
			    cd_gl->sv_volname, (void *)centry.sc_res);

			continue;
		}


		if (err == SS_EOF)
			break; /* done */


		/*
		 * this get into double caching consistency
		 * need to resolve this jgk
		 */
		if ((cinfo = sdbc_get_cinfo_byres(centry.sc_res)) == NULL) {
			/* should not happen */
			cmn_err(CE_WARN, "!sdbc(sdbc_recover_vol): "
			    "invalid ss resource %p", (void *)centry.sc_res);
			continue;
		}
		/* refresh the in-core copy from the safestore image */
		bcopy(&centry, cinfo, sizeof (ss_centry_info_t));

		/*
		 * note
		 * ss should return a stream of dirty blocks ordered
		 * by block number.  if it turns out that ss will not support
		 * this then sorting for async recovery will have to be
		 * done here  jgk
		 */
		ASSERT(cinfo->sc_dirty);

		if (!cinfo->sc_dirty) /* should not happen */
			continue;

		/*
		 * clone mirror cache entry and do
		 * 	async I/O or sync I/O or pin if sh_failed
		 */
		(void) _sd_ft_clone(cinfo, _sd_async_recovery);
		++cblocks_processed;
	}


	if (cblocks_processed)
		cmn_err(CE_NOTE,
		    "!sdbc(sdbc_recover_vol) %d cache blocks processed for "
		    "volume %s", cblocks_processed, cd_gl->sv_volname);

	return (cblocks_processed);
}
1074 
1075 /*
1076  * _sd_failover_done -
1077  *	mark failover open'd devices as requiring nsc_release()
1078  *	when all queued I/O's have drained.
1079  */
1080 static void
1081 _sd_failover_done(void)
1082 {
1083 	_sd_cd_info_t *cdi;
1084 	int cd;
1085 
1086 	for (cd = 0; cd < sdbc_max_devs; cd++) {
1087 		cdi = &(_sd_cache_files[cd]);
1088 
1089 		if (FILE_OPENED(cd) && cdi->cd_failover)
1090 			cdi->cd_failover = 2;
1091 	}
1092 }
1093 
1094 #endif /* (_SD_FAULT_RES) */
1095 
1096 /*
1097  * _sd_uncommit - discard local buffer modifications
1098  *	clear the valid bits.
1099  */
1100 int
1101 _sd_uncommit(_sd_buf_handle_t *handle, nsc_off_t fba_pos, nsc_size_t fba_len,
1102     int flag)
1103 {
1104 	int cd;
1105 	sdbc_cblk_fba_t st_cblk_len;	/* FBA len of starting cache block */
1106 	sdbc_cblk_fba_t end_cblk_len;	/* FBA len of ending cache block */
1107 	sdbc_cblk_fba_t st_cblk_off;	/* FBA offset into starting cblock */
1108 	nsc_size_t cc_len;
1109 	int bits;
1110 	_sd_cctl_t *cc_ent;
1111 
1112 	cd = HANDLE_CD(handle);
1113 
1114 	ASSERT_HANDLE_LIMITS(handle, fba_pos, fba_len);
1115 
1116 	if ((handle->bh_flag & NSC_WRBUF) == 0) {
1117 		DTRACE_PROBE(_sd_uncommit_end_handle_write);
1118 
1119 		return (EINVAL);
1120 	}
1121 
1122 	if (fba_len == 0) {
1123 		DTRACE_PROBE(_sd_uncommit_end_zero_len);
1124 		return (NSC_DONE);
1125 	}
1126 
1127 	SDTRACE(ST_ENTER|SDF_UNCOMMIT, cd, fba_len, fba_pos, flag, 0);
1128 
1129 	cc_ent = handle->bh_centry;
1130 	while (CENTRY_BLK(cc_ent) != FBA_TO_BLK_NUM(fba_pos))
1131 		cc_ent = cc_ent->cc_chain;
1132 
1133 	cc_len = fba_len;	/* current length */
1134 	st_cblk_off = BLK_FBA_OFF(fba_pos);
1135 	st_cblk_len = (BLK_FBAS - st_cblk_off);
1136 	if ((nsc_size_t)st_cblk_len >= fba_len) {
1137 		end_cblk_len = 0;
1138 		st_cblk_len = (sdbc_cblk_fba_t)fba_len;
1139 	}
1140 	else
1141 		end_cblk_len = BLK_FBA_OFF(fba_pos + fba_len);
1142 
1143 	/*
1144 	 * Check if remote write-cache spool is dirty,
1145 	 * if not, we can just discard local valid bits.
1146 	 */
1147 	bits = SDBC_GET_BITS(st_cblk_off, st_cblk_len);
1148 	cc_ent->cc_valid &= ~bits;
1149 
1150 	cc_len -= st_cblk_len;
1151 	cc_ent = cc_ent->cc_chain;
1152 	bits = SDBC_GET_BITS(0, BLK_FBAS);
1153 
1154 	while (cc_len > (nsc_size_t)end_cblk_len) {
1155 		cc_ent->cc_valid = 0;
1156 		cc_ent = cc_ent->cc_chain;
1157 		cc_len -= BLK_FBAS;
1158 	}
1159 
1160 #if defined(_SD_DEBUG)
1161 	if (cc_len != end_cblk_len)
1162 		cmn_err(CE_WARN, "!fba_len %" NSC_SZFMT " end_cblk_len %d in "
1163 		    "_sd_write", fba_len, end_cblk_len);
1164 #endif
1165 
1166 	if (cc_len) {
1167 		bits = SDBC_GET_BITS(0, end_cblk_len);
1168 		cc_ent->cc_valid &= ~bits;
1169 	}
1170 	SDTRACE(ST_EXIT|SDF_UNCOMMIT, cd, fba_len, fba_pos, flag, 0);
1171 
1172 	return (NSC_DONE);
1173 }
1174 
1175 static void
1176 _sd_wait_for_dirty(void)
1177 {
1178 	int cd;
1179 
1180 	for (cd = 0; cd < sdbc_max_devs; cd++) {
1181 		while (_SD_CD_WBLK_USED(cd))
1182 			delay(HZ);
1183 	}
1184 }
1185 
1186 /*
1187  * _sd_wait_for_flush - wait for all i/o for this cd to cease.
1188  * This function assumes that no further i/o are being issued
1189  * against this device. This assumption is enforced by sd-ctl
1190  * when called from _sd_flush_cd. Recovery also uses this
1191  * wait and it enforces this assumption (somewhat imperfectly)
1192  * by using cd_recovering.
1193  * We must see progress in getting i/o complete within 25 seconds
1194  * or we will return an error. If we complete normally (all i/o done)
1195  * we return 0.
1196  */
1197 int
1198 _sd_wait_for_flush(int cd)
1199 {
1200 	_sd_cd_info_t *cdi = &(_sd_cache_files[cd]);
1201 	int tries = 0, used, last_used = 0, inprogress = 0;
1202 
1203 	if (!(_SD_CD_WBLK_USED(cd)))
1204 		return (0);
1205 	/*
1206 	 * Wait for WBLK_USED to reach 0.
1207 	 * If unchanged for 32+ seconds returns EAGAIN
1208 	 */
1209 	if (!cdi->cd_writer)
1210 		(void) cd_writer(cd); /* spawn writer if not already running */
1211 
1212 	while (((used = _SD_CD_WBLK_USED(cd)) != 0) || cdi->cd_writer) {
1213 		if (last_used == used &&
1214 		    inprogress == cdi->cd_write_inprogress) {
1215 			if (cdi->cd_info->sh_failed)
1216 				break;
1217 			if (++tries > 128) {
1218 				cmn_err(CE_WARN, "!sdbc(_sd_wait_for_flush) "
1219 				    "%s still has %d blocks pending %d"
1220 				    " in progress (@ %lx)",
1221 				    cdi->cd_info->sh_filename, last_used,
1222 				    inprogress, nsc_lbolt());
1223 				return (EAGAIN);
1224 			}
1225 		} else {
1226 			last_used = used;
1227 			inprogress = cdi->cd_write_inprogress;
1228 			tries = 0;
1229 		}
1230 		_sd_unblock(&_sd_flush_cv);
1231 		delay(HZ/4);
1232 	}
1233 	if (cdi->cd_info->sh_failed)
1234 		return (EIO);
1235 	else
1236 		return (0);
1237 }
1238 
1239 
static
int _sd_ft_warm_start;	/* warm-start flag; set/cleared via accessors below */

/*
 * _sdbc_warm_start - return the current warm-start flag
 * (nonzero after _sdbc_set_warm_start(), zero after
 * _sdbc_clear_warm_start()).
 */
int
_sdbc_warm_start(void)
{
	return (_sd_ft_warm_start);
}
1248 
/*
 * _sdbc_clear_warm_start - reset the warm-start flag so that
 * _sdbc_warm_start() reports zero.
 */
void
_sdbc_clear_warm_start(void)
{
	_sd_ft_warm_start = 0;
}
1254 
/*
 * _sdbc_set_warm_start - raise the warm-start flag so that
 * _sdbc_warm_start() reports nonzero.
 */
void
_sdbc_set_warm_start(void)
{
	_sd_ft_warm_start = 1;
}
1260 
/*
 * _ncall_poke - unimplemented stub ("NYI" = not yet implemented);
 * deliberately panics if it is ever invoked.
 */
/*ARGSUSED*/
void
_ncall_poke(int host)
{
	cmn_err(CE_PANIC, " NYI - _ncall_poke");
}
1267