/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#include <sys/types.h>
#include <sys/ksynch.h>
#include <sys/cmn_err.h>
#include <sys/kmem.h>
#include <sys/buf.h>
#include <sys/cred.h>
#include <sys/errno.h>
#include <sys/ddi.h>

#include <sys/nsc_thread.h>
#include <sys/nsctl/nsctl.h>

#include <sys/sdt.h>		/* dtrace is S10 or later */

#include "sd_bcache.h"
#include "sd_trace.h"
#include "sd_io.h"
#include "sd_bio.h"
#include "sd_misc.h"
#include "sd_ft.h"
#include "sd_pcu.h"

/*
 * dynamic memory support
 */
_dm_process_vars_t dynmem_processing_dm;
static int  sd_dealloc_flag_dm = NO_THREAD_DM;
static void _sd_dealloc_dm(void);
static int  _sd_entry_availability_dm(_sd_cctl_t *cc_ent, int *nodata);

extern void sdbc_requeue_dmchain(_sd_queue_t *, _sd_cctl_t *, int, int);
extern void sdbc_ins_dmqueue_front(_sd_queue_t *q, _sd_cctl_t *cc_ent);
extern void sdbc_remq_dmchain(_sd_queue_t *q, _sd_cctl_t *cc_ent);
extern void sdbc_requeue_head_dm_try(_sd_cctl_t *);
extern int sdbc_use_dmchain;
extern _sd_queue_t *sdbc_dm_queues;

kcondvar_t   _sd_flush_cv;
static volatile int _sd_flush_exit;

/* secret flush toggle flag for testing */
#ifdef DEBUG
int _sdbc_flush_flag = 1; /* 0 ==> noflushing, 1 ==> flush */
#endif

static int sdbc_flush_pageio;


/*
 * Forward declare all statics that are used before being defined, to
 * enforce parameter checking.
 * Some (if not all) of these could be removed if the code were reordered.
 */

static void _sd_flcent_ea(blind_t xcc_ent, nsc_off_t fba_pos,
    nsc_size_t fba_len, int error);
static void _sd_flclist_ea(blind_t xcc_ent, nsc_off_t fba_pos,
    nsc_size_t fba_len, int error);
static void _sd_process_reflush(_sd_cctl_t *cc_ent);
static void _sd_flush_thread(void);

int
_sdbc_flush_configure(void)
{
	_sd_flush_exit = 1;
	sdbc_flush_pageio = 0;
	return (nsc_create_process(
	    (void (*)(void *))_sd_flush_thread, 0, TRUE));
}


void
_sdbc_flush_deconfigure(void)
{
	_sd_unblock(&_sd_flush_cv);
	_sd_flush_exit = 0;
}

static int
sdbc_alloc_static_cache(int reqblks)
{
	_sd_cctl_t *centry;
	_sd_cctl_t *next_centry;

	if (centry = sdbc_centry_alloc_blks(_CD_NOHASH, 0, reqblks,
	    ALLOC_NOWAIT)) {
		/* release the blocks to the queue */
		while (centry) {
			next_centry = centry->cc_chain;
			_sd_centry_release(centry);
			centry = next_centry;
		}
		return (reqblks);
	}
	return (0);
}

int
_sdbc_dealloc_configure_dm(void)
{
	int rc = 0;
	int reqblks = MEGABYTE/BLK_SIZE(1); /* alloc in mb chunks */
	int i;
	int blk_groups; /* number of ~MB groups */
	int blks_remaining;
	int blks_allocd = 0;

	dynmem_processing_dm.alloc_ct = 0;
	dynmem_processing_dm.dealloc_ct = 0;

	if (sdbc_static_cache) { /* alloc all static cache memory here */
		dynmem_processing_dm.max_dyn_list = reqblks;

		blk_groups = CBLOCKS / reqblks;
		blks_remaining = CBLOCKS % reqblks;

		for (i = 0; i < blk_groups; ++i) {
			if (!sdbc_alloc_static_cache(reqblks))
				break;
			blks_allocd += reqblks;
		}
		DTRACE_PROBE2(_sdbc_dealloc_configure_dm1,
		    int, i, int, blks_allocd);

		/* if successful then allocate any remaining blocks */
		if ((i == blk_groups) && blks_remaining)
			if (sdbc_alloc_static_cache(blks_remaining))
				blks_allocd += blks_remaining;

		DTRACE_PROBE2(_sdbc_dealloc_configure_dm2,
		    int, i, int, blks_allocd);

		sd_dealloc_flag_dm = NO_THREAD_DM;

		if (blks_allocd < CBLOCKS) {
			cmn_err(CE_WARN, "!Failed to allocate sdbc cache "
			    "memory.\n requested mem: %d MB; actual mem: %d MB",
			    CBLOCKS/reqblks, blks_allocd/reqblks);
			rc = ENOMEM;
		}


#ifdef DEBUG
		cmn_err(CE_NOTE, "!sdbc(_sdbc_dealloc_configure_dm) %d bytes "
		    "(%d cache blocks) allocated for static cache, "
		    "block size %d", blks_allocd * BLK_SIZE(1), blks_allocd,
		    BLK_SIZE(1));
#endif /* DEBUG */
	} else {
		sd_dealloc_flag_dm = PROCESS_CACHE_DM;
		rc = nsc_create_process((void (*)(void *))_sd_dealloc_dm, 0,
		    TRUE);
		if (rc != 0)
			sd_dealloc_flag_dm = NO_THREAD_DM;
	}
	return (rc);
}
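
/*
 * Illustrative sizing sketch for _sdbc_dealloc_configure_dm() (the
 * numbers are assumptions for the example, not driver defaults): with
 * an 8KB cache block, reqblks = MEGABYTE/BLK_SIZE(1) = 128, so a static
 * cache of CBLOCKS = 1000 blocks is carved up as
 *
 *	blk_groups     = 1000 / 128 = 7		(seven ~1MB groups)
 *	blks_remaining = 1000 % 128 = 104	(one final partial group)
 *
 * i.e. seven sdbc_alloc_static_cache(128) calls followed by one call
 * for the 104 leftover blocks.
 */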

/*
 * sdbc_dealloc_dm_shutdown - deallocate cache memory.
 *
 * ARGUMENTS: none
 *
 * RETURNS: nothing
 *
 * USAGE:
 *	this function is intended for use after all i/o has stopped and all
 *	other cache threads have terminated.  write cache resources, if any,
 *	are released, except in the case of pinned data.
 */
static void
sdbc_dealloc_dm_shutdown()
{
	_sd_cctl_t *cc_ent;
	ss_centry_info_t *wctl;

	cc_ent = _sd_cctl[0];

	if (!cc_ent)
		return;

	do {
		if (cc_ent->cc_alloc_size_dm) {
			/* HOST or OTHER */

			if (cc_ent->cc_data)
				kmem_free(cc_ent->cc_data,
				    cc_ent->cc_alloc_size_dm);

			cc_ent->cc_alloc_size_dm = 0;

			dynmem_processing_dm.dealloc_ct++;

			DTRACE_PROBE2(sdbc_dealloc_dm_shutdown, char *,
			    cc_ent->cc_data, int, cc_ent->cc_alloc_size_dm);
		}

		/* release safestore resource, if any. preserve pinned data */
		if (!(CENTRY_DIRTY(cc_ent)) && (wctl = cc_ent->cc_write)) {
			wctl->sc_flag = 0;
			wctl->sc_dirty = 0;

			SSOP_SETCENTRY(sdbc_safestore, wctl);
			SSOP_DEALLOCRESOURCE(sdbc_safestore, wctl->sc_res);
		}
		cc_ent = cc_ent->cc_link_list_dm;
	} while (cc_ent != _sd_cctl[0]);
}

void
_sdbc_dealloc_deconfigure_dm(void)
{
	int one_sec;

	if (sdbc_static_cache) {
		sdbc_dealloc_dm_shutdown();
		return;
	}

	if (sd_dealloc_flag_dm == NO_THREAD_DM)
		return;			/* thread never started */
	one_sec = HZ; /* drv_usectohz(1000000); */

	mutex_enter(&dynmem_processing_dm.thread_dm_lock);
	sd_dealloc_flag_dm = CACHE_SHUTDOWN_DM;
	cv_broadcast(&dynmem_processing_dm.thread_dm_cv);
	mutex_exit(&dynmem_processing_dm.thread_dm_lock);

	while (sd_dealloc_flag_dm != CACHE_THREAD_TERMINATED_DM)
		delay(one_sec);

	sd_dealloc_flag_dm = NO_THREAD_DM;
}

/*
 * This complicated - possibly overly complicated - routine works as
 * follows: in general the routine sleeps a specified amount of time then
 * wakes and examines the entire centry list. If an entry is available it
 * ages it by one tick, else it clears the aging flag completely. It then
 * determines if the centry has aged sufficiently to have its memory
 * deallocated and to be placed at the top of the lru.
 *
 * There are two deallocation schemes in place depending on whether the
 * centry is a standalone entry or a member of a host/parasite chain.
 *
 * The behavior for a standalone entry is as follows:
 * If the given centry is selected it will age normally, however at full
 * aging it will only be placed at the head of the lru. Its memory will
 * not be deallocated until a further aging level has been reached. The
 * entries selected for this behavior are governed by counting the number
 * of these holdovers in existence on each wakeup and comparing it to a
 * specified percentage. This comparison is always one cycle out of date
 * and will float in the relative vicinity of the specified number.
 *
 * The behavior for a host/parasite chain is as follows:
 * The chain is examined. If all entries are fully aged the entire chain
 * is removed - i.e. memory is deallocated from the host entry, all memory
 * references are removed from the parasitic entries, and each entry is
 * requeued onto the lru.
 *
 * There are three delay timeouts and two percentage levels specified.
 * Timeout level 1 is honored between 100% free and pcnt level 1. Timeout
 * level 2 is honored between pcnt level 1 and pcnt level 2. Timeout
 * level 3 is honored between pcnt level 2 and 0% free. In addition there
 * exists an accelerated aging flag which mimics hysteresis behavior. If
 * the available centries fall between pcnt1 and pcnt2 an 8 bit counter
 * is switched on. The effect is to keep the timer value at timer level 2
 * for 8 cycles even if the number of available cache entries drifts
 * above pcnt1. If it falls below pcnt2 an additional 8 bit counter is
 * switched on. This causes the sleep timer to remain at timer level 3
 * for at least 8 cycles even if it floats above pcnt2 or even pcnt1. The
 * effect of all this is to accelerate the release of system resources
 * under a heavy load.
 *
 * All of the footwork can be stubbed out by a judicious selection of
 * values for the times, aging counts and pcnts.
 *
 * All of these behavior parameters are adjustable on the fly via the
 * kstat mechanism. In addition there is a thread wakeup msg available
 * through the same mechanism.
 */
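
/*
 * Illustrative walk-through of the tunables above (example values, not
 * defaults): with cache_aging_pcnt1 = 50, cache_aging_pcnt2 = 25 and
 * 1000 cache entries, transition_lvl1 = 500 and transition_lvl2 = 250.
 * While more than 500 entries have no data the thread sleeps
 * cache_aging_sec1 per cycle; between 500 and 250 it switches to
 * cache_aging_sec2 and sets HISTORY_LVL1 (an 8-bit shift register, so
 * the faster rate persists for up to 8 cycles after the count recovers);
 * below 250 it sleeps cache_aging_sec3 and sets HISTORY_LVL2 as well.
 */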

static void
_sd_dealloc_dm(void)
{
	int one_sec_tics, tic_delay;
	int sleep_tics_lvl1, sleep_tics_lvl2, sleep_tics_lvl3;
	int transition_lvl1, transition_lvl2;
	int host_cache_aging_ct, meta_cache_aging_ct, hold_cache_aging_ct;
	int max_holds_ct;
	int cache_aging_ct, hold_candidate, last_holds_ct;
	_sd_cctl_t *cc_ent, *next_ccentry, *cur_ent, *nxt_ent;
	ss_centry_info_t *wctl;
	int current_breakout_count, number_cache_entries;
	int dealloc;
	_dm_process_vars_t *ppvars;

	int write_dealloc; /* remove after debugging */

	ppvars = &dynmem_processing_dm;

	/* setup a one sec time var */
	one_sec_tics = HZ; /* drv_usectohz(1000000); */

	ppvars->history = 0;

	cc_ent = _sd_cctl[0];

	number_cache_entries = _sd_net_config.sn_cpages;

	last_holds_ct = 0;

	/*CONSTANTCONDITION*/
	while (1) {
		if (sd_dealloc_flag_dm == CACHE_SHUTDOWN_DM) {
			/* finished.  shutdown - get out */
			sdbc_dealloc_dm_shutdown(); /* free all memory */
			sd_dealloc_flag_dm = CACHE_THREAD_TERMINATED_DM;
			return;
		}

		/* has the world changed */

		/*
		 * get num cctl entries (%) below which different sleep
		 * rates kick in
		 */
		transition_lvl1 =
		    (ppvars->cache_aging_pcnt1*number_cache_entries) / 100;
		transition_lvl2 =
		    (ppvars->cache_aging_pcnt2*number_cache_entries) / 100;

		/* get sleep rates for each level */
		sleep_tics_lvl1 = ppvars->cache_aging_sec1 * one_sec_tics;
		sleep_tics_lvl2 = ppvars->cache_aging_sec2 * one_sec_tics;
		sleep_tics_lvl3 = ppvars->cache_aging_sec3 * one_sec_tics;

		/* get num of cycles for full normal aging */
		host_cache_aging_ct = ppvars->cache_aging_ct1;

		/* get num of cycles for full meta aging */
		meta_cache_aging_ct = ppvars->cache_aging_ct2;

		/* get num of cycles for full extended holdover aging */
		hold_cache_aging_ct = ppvars->cache_aging_ct3;

		/* get maximum holds count in % */
		max_holds_ct = (ppvars->max_holds_pcnt*number_cache_entries)
		    / 100;

		/* apply the delay */
		tic_delay = sleep_tics_lvl1;
		if (sd_dealloc_flag_dm == TIME_DELAY_LVL1)
			tic_delay = sleep_tics_lvl2;
		else if (sd_dealloc_flag_dm == TIME_DELAY_LVL2)
			tic_delay = sleep_tics_lvl3;

		mutex_enter(&ppvars->thread_dm_lock);
		(void) cv_reltimedwait(&ppvars->thread_dm_cv,
		    &ppvars->thread_dm_lock, tic_delay, TR_CLOCK_TICK);
		mutex_exit(&ppvars->thread_dm_lock);

		/* check for special directives on wakeup */
		if (ppvars->process_directive &
		    MAX_OUT_ACCEL_HIST_FLAG_DM) {
			ppvars->process_directive &=
			    ~MAX_OUT_ACCEL_HIST_FLAG_DM;
			ppvars->history =
			    (HISTORY_LVL1|HISTORY_LVL2);
		}

		/* Start of deallocation loop */
		current_breakout_count = 0;

		ppvars->nodatas = 0;
		write_dealloc = 0;
		ppvars->deallocs = 0;
		ppvars->candidates = 0;
		ppvars->hosts = 0;
		ppvars->pests = 0;
		ppvars->metas = 0;
		ppvars->holds = 0;
		ppvars->others = 0;
		ppvars->notavail = 0;

		while (sd_dealloc_flag_dm != CACHE_SHUTDOWN_DM &&
		    current_breakout_count < number_cache_entries) {

			next_ccentry = cc_ent->cc_link_list_dm;

			if (_sd_entry_availability_dm(cc_ent, &ppvars->nodatas)
			    == FALSE) {
				ppvars->notavail++;
				goto next_dealloc_entry;
			}

			cache_aging_ct = host_cache_aging_ct;
			hold_candidate = FALSE;
			if (cc_ent->cc_aging_dm & HOST_ENTRY_DM)
				ppvars->hosts++;
			else if (cc_ent->cc_aging_dm & PARASITIC_ENTRY_DM)
				ppvars->pests++;
			else if (cc_ent->cc_aging_dm & STICKY_METADATA_DM) {
				cache_aging_ct = meta_cache_aging_ct;
				ppvars->metas++;
			} else {
				if (last_holds_ct < max_holds_ct)
					hold_candidate = TRUE;
				ppvars->others++;
			}

			ppvars->candidates++;

			if ((cc_ent->cc_aging_dm & FINAL_AGING_DM) <
			    cache_aging_ct) {
				cc_ent->cc_aging_dm += FIRST_AGING_DM;
				CLEAR_CENTRY_PAGEIO(cc_ent);
				CLEAR_CENTRY_INUSE(cc_ent);
				goto next_dealloc_entry;
			}

			/* bonafide aged entry - examine its chain */
			dealloc = TRUE;
			cur_ent = cc_ent->cc_head_dm;
			while (cur_ent) {
				if (cur_ent == cc_ent)
					cur_ent->cc_aging_dm |= AVAIL_ENTRY_DM;
				else {
					if (_sd_entry_availability_dm(cur_ent,
					    0) == TRUE) {
						cur_ent->cc_aging_dm |=
						    AVAIL_ENTRY_DM;
						if ((cur_ent->cc_aging_dm &
						    FINAL_AGING_DM) <
						    cache_aging_ct)
							dealloc = FALSE;
					} else
						dealloc = FALSE;
				}

				cur_ent = cur_ent->cc_next_dm;
			}
			cur_ent = cc_ent->cc_head_dm;

			/* chain not fully free - free inuse for all entries */
			if (dealloc == FALSE) {
				while (cur_ent) {
					nxt_ent = cur_ent->cc_next_dm;

					if (cur_ent->cc_aging_dm &
					    AVAIL_ENTRY_DM) {
						cur_ent->cc_aging_dm &=
						    ~AVAIL_ENTRY_DM;
						CLEAR_CENTRY_PAGEIO(cur_ent);
						CLEAR_CENTRY_INUSE(cur_ent);
					}
					cur_ent = nxt_ent;
				}
			} else { /* OK - free memory */
				if (hold_candidate == TRUE &&
				    (cur_ent->cc_aging_dm & FINAL_AGING_DM) <
				    hold_cache_aging_ct) {
					ppvars->holds++;

					ASSERT(cur_ent == cc_ent);

					cc_ent->cc_aging_dm += FIRST_AGING_DM;

					cur_ent->cc_aging_dm &= ~AVAIL_ENTRY_DM;

					wctl = cur_ent->cc_write;

					CLEAR_CENTRY_PAGEIO(cur_ent);
					CLEAR_CENTRY_INUSE(cur_ent);

					if (wctl) {
						write_dealloc++;
						wctl->sc_flag = 0;
						wctl->sc_dirty = 0;
						SSOP_SETCENTRY(sdbc_safestore,
						    wctl);
						SSOP_DEALLOCRESOURCE(
						    sdbc_safestore,
						    wctl->sc_res);
					}
					goto next_dealloc_entry;
				} /* if (hold_candidate == TRUE) */

				while (cur_ent) {

					DTRACE_PROBE4(_sd_dealloc_dm,
					    _sd_cctl_t *, cur_ent,
					    int, CENTRY_CD(cur_ent),
					    int, CENTRY_BLK(cur_ent),
					    uint_t, cur_ent->cc_aging_dm);

					if ((cur_ent->cc_aging_dm
					    & BAD_CHAIN_DM)) {
						(void) _sd_hash_delete(
						    (_sd_hash_hd_t *)cur_ent,
						    _sd_htable);

						nxt_ent = cur_ent->cc_next_dm;
						CLEAR_CENTRY_PAGEIO(cur_ent);
						CLEAR_CENTRY_INUSE(cur_ent);
						cur_ent = nxt_ent;
						continue;
					}

					ppvars->deallocs++;

					if (cur_ent->cc_alloc_size_dm) {
						int qidx;
						_sd_queue_t *q;

						/* HOST or OTHER */

						/* debugging */
						ppvars->dealloc_ct++;
						cur_ent->cc_dealloc_ct_dm++;
						kmem_free(cur_ent->cc_data,
						    cur_ent->cc_alloc_size_dm);

						/*
						 * remove from queue
						 * in preparation for putting
						 * on the 0 queue after
						 * memory is freed
						 */
						if (sdbc_use_dmchain) {

							qidx =
							    cur_ent->cc_cblocks;
							q = &sdbc_dm_queues
							    [qidx];

							sdbc_remq_dmchain(q,
							    cur_ent);
						}
					}

					wctl = cur_ent->cc_write;
					cur_ent->cc_write = 0;
					cur_ent->cc_data = 0;
					cur_ent->cc_alloc_size_dm = 0;
					cur_ent->cc_head_dm = NULL;
					cur_ent->cc_aging_dm &=
					    ~(FINAL_AGING_DM | ENTRY_FIELD_DM |
					    CATAGORY_ENTRY_DM | AVAIL_ENTRY_DM |
					    PREFETCH_BUF_I | PREFETCH_BUF_E);

					(void) _sd_hash_delete(
					    (_sd_hash_hd_t *)cur_ent,
					    _sd_htable);
					cur_ent->cc_valid = 0;

					if (sdbc_use_dmchain) {
						_sd_queue_t *q;

						nxt_ent = cur_ent->cc_next_dm;

						cur_ent->cc_next_dm = NULL;

						CLEAR_CENTRY_PAGEIO(cur_ent);
						CLEAR_CENTRY_INUSE(cur_ent);

						q = &sdbc_dm_queues[0];
						sdbc_ins_dmqueue_front(q,
						    cur_ent);
					} else {
						_sd_requeue_head(cur_ent);

						nxt_ent = cur_ent->cc_next_dm;
						cur_ent->cc_next_dm = NULL;

						CLEAR_CENTRY_PAGEIO(cur_ent);
						CLEAR_CENTRY_INUSE(cur_ent);
					}

					cur_ent = nxt_ent;

					if (wctl) {
						write_dealloc++;
						wctl->sc_flag = 0;
						wctl->sc_dirty = 0;
						SSOP_SETCENTRY(sdbc_safestore,
						    wctl);
						SSOP_DEALLOCRESOURCE(
						    sdbc_safestore,
						    wctl->sc_res);
					}
				} /* while (cur_ent) */
			} /* else OK - free memory */
next_dealloc_entry:
			current_breakout_count++;

			cc_ent = next_ccentry;
		}  /* while (entries) */

		if (ppvars->monitor_dynmem_process & RPT_DEALLOC_STATS1_DM) {
			cmn_err(CE_NOTE,
			    "!notavl=%x, nodat=%x, cand=%x, hosts=%x,"
			    " pests=%x, metas=%x, holds=%x, others=%x,"
			    " deallo=%x",
			    ppvars->notavail, ppvars->nodatas,
			    ppvars->candidates, ppvars->hosts, ppvars->pests,
			    ppvars->metas, ppvars->holds, ppvars->others,
			    ppvars->deallocs);
		}

		if (ppvars->monitor_dynmem_process & RPT_DEALLOC_STATS2_DM) {
			cmn_err(CE_NOTE,
			    "!hist=%x, gross a/d=%x %x", ppvars->history,
			    ppvars->alloc_ct, ppvars->dealloc_ct);
		}

		if (sd_dealloc_flag_dm == CACHE_SHUTDOWN_DM)
			continue;

		last_holds_ct = ppvars->holds;

		/* set the history flag which will govern the sleep rate */
		if (ppvars->nodatas > transition_lvl1) {
			/* upper - lots of virgin cctls */
			if (ppvars->history)
				ppvars->history >>= 1;
		} else {
			if (ppvars->nodatas > transition_lvl2) {
				/* middle - not so many virgin cctls */
				if (ppvars->history & (HISTORY_LVL1-1))
					ppvars->history >>= 1;
				else
					ppvars->history = HISTORY_LVL1;

			} else {
				/*
				 * appear to be running low - accelerate the
				 * aging to free more
				 */
				if (ppvars->history & HISTORY_LVL2)
					ppvars->history >>= 1;
				else
					ppvars->history =
					    (HISTORY_LVL1|HISTORY_LVL2);
			}
		}

		sd_dealloc_flag_dm = TIME_DELAY_LVL0;
		if (ppvars->history & HISTORY_LVL2)
			sd_dealloc_flag_dm = TIME_DELAY_LVL2;
		else if (ppvars->history & HISTORY_LVL1)
			sd_dealloc_flag_dm = TIME_DELAY_LVL1;

	} /* while (TRUE) */
}

int
_sd_entry_availability_dm(_sd_cctl_t *cc_ent, int *nodata)
{
	/*
	 * if using dmchaining return immediately and do not attempt
	 * to acquire the cc_ent if there is no memory associated with
	 * this cc_ent.
	 * this avoids conflicts for centries on the 0 queue.
	 * see sdbc_get_dmchain()
	 */

	if ((sdbc_use_dmchain) && (cc_ent->cc_data == 0)) {

		if (nodata)
			(*nodata)++;

		DTRACE_PROBE(sdbc_availability_dm_end1);
		return (FALSE);
	}

	if ((SET_CENTRY_INUSE(cc_ent))) {

		DTRACE_PROBE(sdbc_availability_dm_end2);

		return (FALSE);
	}


	if ((SET_CENTRY_PAGEIO(cc_ent))) {

		CLEAR_CENTRY_INUSE(cc_ent);

		DTRACE_PROBE(sdbc_availability_dm_end3);

		return (FALSE);
	}

	/*
	 * we allow the QHEAD flag as it does not affect the availability
	 * of memory for aging
	 */
	if ((CENTRY_DIRTY(cc_ent)) || (CENTRY_IO_INPROGRESS(cc_ent)) ||
	    (cc_ent->cc_flag & ~(CC_QHEAD)) ||
	    cc_ent->cc_dirty_next || cc_ent->cc_dirty_link ||
	    cc_ent->cc_data == 0) {

		cc_ent->cc_aging_dm &= ~FINAL_AGING_DM;
		if (nodata)
			if (cc_ent->cc_data == 0)
				(*nodata)++;

		CLEAR_CENTRY_PAGEIO(cc_ent);
		CLEAR_CENTRY_INUSE(cc_ent);

		DTRACE_PROBE(sdbc_availability_dm_end4);

		return (FALSE);
	}

	return (TRUE);
}

/*
 * function below prohibits code movement by the compiler
 * and avoids using spinlocks for synchronization
 */
static void
_sd_cc_iostatus_initiate(_sd_cctl_t *cc_ent)
{
	cc_ent->cc_iostatus = _SD_IO_INITIATE;
	sd_serialize();
}
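
/*
 * Note (documentation assumption, no behavior change): sd_serialize()
 * is relied upon here as a store barrier - the _SD_IO_INITIATE store
 * must be globally visible before any subsequent i/o setup stores, so
 * that _sd_process_pending() never acts on an entry whose i/o is still
 * being built.
 */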

/*
 * Yet another switch!
 * alloc mem and coalesce if at least this number of frags
 */
static int sdbc_coalesce_backend = 1;

/*
 * optimization for _sd_async_flclist()
 * called only if not doing pageio and sdbc_coalesce_backend > 0
 *
 * returns with the pageio bit set in the centries in the list
 */
static unsigned char *
sdbc_alloc_io_mem(_sd_cctl_t *cc_ent, int first_dirty, int last_dirty)
{
	unsigned char *prev_addr = NULL;
	_sd_cctl_t *cc_ent_orig = cc_ent;
	int fba_len;
	int total_len_bytes = 0;
	unsigned char *start_addr = NULL; /* function return value */
	unsigned char *next_addr;
	int num_frags = 0;

	if (first_dirty && (!_SD_BMAP_ISFULL(first_dirty))) {
		WAIT_CENTRY_PAGEIO(cc_ent, sdbc_flush_pageio);

		fba_len = SDBC_LOOKUP_LEN(first_dirty);
		total_len_bytes += FBA_SIZE(fba_len);

		prev_addr = cc_ent->cc_data;
		cc_ent = cc_ent->cc_dirty_next;
	}

	while (cc_ent) {

		WAIT_CENTRY_PAGEIO(cc_ent, sdbc_flush_pageio);
		/* check for contiguity */
		if (prev_addr &&
		    !((prev_addr + CACHE_BLOCK_SIZE) == cc_ent->cc_data))
			++num_frags;

		/* compute length */
		if (FULLY_DIRTY(cc_ent)) {
			total_len_bytes += CACHE_BLOCK_SIZE;
		} else {
			fba_len = SDBC_LOOKUP_LEN(last_dirty);
			total_len_bytes += FBA_SIZE(fba_len);
		}

		prev_addr = cc_ent->cc_data;
		cc_ent = cc_ent->cc_dirty_next;
	}

	if (num_frags >= sdbc_coalesce_backend) {
		/*
		 * TODO - determine metric for deciding
		 * whether to coalesce memory or do separate i/o's
		 */

		DTRACE_PROBE(sdbc_io_mem_kmem_start);

		if (start_addr = kmem_alloc(total_len_bytes, KM_NOSLEEP)) {
			int sblk, offset;

			cc_ent = cc_ent_orig;

			cc_ent->cc_anon_addr.sa_virt = start_addr;
			cc_ent->cc_anon_len = total_len_bytes;

			next_addr = start_addr;

			DTRACE_PROBE2(sdbc_io_mem_bcopy_start,
			    int, num_frags, int, total_len_bytes);

			/* copy the first dirty piece */
			if (first_dirty && (!_SD_BMAP_ISFULL(first_dirty))) {

				fba_len = SDBC_LOOKUP_LEN(first_dirty);
				sblk = SDBC_LOOKUP_STPOS(first_dirty);
				offset = FBA_SIZE(sblk);

				bcopy(cc_ent->cc_data + offset, next_addr,
				    FBA_SIZE(fba_len));
				cc_ent = cc_ent->cc_dirty_next;
				next_addr += FBA_SIZE(fba_len);
			}

			/* copy the rest of data */
			while (cc_ent) {
				if (FULLY_DIRTY(cc_ent)) {
					bcopy(cc_ent->cc_data, next_addr,
					    CACHE_BLOCK_SIZE);
					next_addr += CACHE_BLOCK_SIZE;
				} else {
					fba_len = SDBC_LOOKUP_LEN(last_dirty);
					bcopy(cc_ent->cc_data, next_addr,
					    FBA_SIZE(fba_len));
					next_addr += FBA_SIZE(fba_len);
				}

				cc_ent = cc_ent->cc_dirty_next;
			}

			DTRACE_PROBE(sdbc_io_mem_bcopy_end);
		}

		DTRACE_PROBE(sdbc_io_mem_kmem_end);
	}

	return (start_addr);
}
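
/*
 * Buffer lifetime note: when sdbc_alloc_io_mem() returns non-NULL, the
 * coalesced buffer is recorded in the first centry's cc_anon_addr and
 * cc_anon_len, and is kmem_free()d by _sd_flclist_ea() once the single
 * combined write completes.
 */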

void
_sd_async_flclist(_sd_cctl_t *cclist, dev_t rdev)
{
	int flushed, i, cd;
	uint_t first_dirty, last_dirty;
	_sd_cctl_t *cc_ent, *cc_prev = NULL;
	struct buf *bp;
	int dblk, fba_len;
	int len;
	int toflush;
	int coalesce; /* convenience boolean */
	unsigned char *anon_mem = NULL;
	extern int sdbc_do_page;


	SDTRACE(ST_ENTER|SDF_FLCLIST, CENTRY_CD(cclist),
	    0, BLK_TO_FBA_NUM(CENTRY_BLK(cclist)), 0, 0);

	coalesce = (!sdbc_do_page && sdbc_coalesce_backend);

	cc_ent = cclist;
	_sd_cc_iostatus_initiate(cc_ent);
	first_dirty = CENTRY_DIRTY(cc_ent);
	if (SDBC_IS_FRAGMENTED(first_dirty)) {
		cclist = cc_ent->cc_dirty_next;
		cc_ent->cc_dirty_next = NULL;
		_sd_async_flcent(cc_ent, rdev);
		cc_ent = cclist;
		first_dirty = 0;
	}

	toflush = 0;
	while (cc_ent->cc_dirty_next) {
		if (cc_ent->cc_iocount)
			SDALERT(SDF_FLCLIST, CENTRY_CD(cc_ent), 0,
			    BLK_TO_FBA_NUM(CENTRY_BLK(cc_ent)),
			    cc_ent->cc_iocount, 0);
		cc_prev = cc_ent;
		cc_ent = cc_ent->cc_dirty_next;
		toflush++;
	}
	_sd_cc_iostatus_initiate(cc_ent);
	last_dirty = CENTRY_DIRTY(cc_ent);
	if (SDBC_IS_FRAGMENTED(last_dirty)) {
		if (cc_prev)
			cc_prev->cc_dirty_next = NULL;
		_sd_async_flcent(cc_ent, rdev);
		last_dirty = 0;
	} else
		toflush++;

	if (toflush == 0)
		return;


	dblk = BLK_TO_FBA_NUM(CENTRY_BLK(cclist));
	if (first_dirty && (!_SD_BMAP_ISFULL(first_dirty)))
		dblk += SDBC_LOOKUP_STPOS(first_dirty);

	cd = CENTRY_CD(cclist);
	bp = sd_alloc_iob(rdev, dblk, toflush, B_WRITE);
	cc_ent = cclist;

	if (coalesce && (anon_mem = sdbc_alloc_io_mem(cc_ent, first_dirty,
	    last_dirty)))
		sd_add_fba(bp, &cc_ent->cc_anon_addr, 0,
		    FBA_NUM(cc_ent->cc_anon_len));

	if (first_dirty && (!_SD_BMAP_ISFULL(first_dirty))) {
		cc_ent->cc_iocount = flushed = 1;

		/* pageio bit already set in sdbc_alloc_io_mem() above */
		if (!coalesce)
			WAIT_CENTRY_PAGEIO(cc_ent, sdbc_flush_pageio);

		fba_len = SDBC_LOOKUP_LEN(first_dirty);

		/* build buffer only if it was not done above */
		if (!anon_mem) {
			i = SDBC_LOOKUP_STPOS(first_dirty);
			sd_add_fba(bp, &cc_ent->cc_addr, i, fba_len);
			DATA_LOG(SDF_FLSHLIST, cc_ent, i, fba_len);

			DTRACE_PROBE4(_sd_async_flclist_data1, int,
			    BLK_TO_FBA_NUM(CENTRY_BLK(cc_ent)) + i,
			    int, fba_len, char *,
			    *(int64_t *)(cc_ent->cc_data + FBA_SIZE(i)),
			    char *, *(int64_t *)(cc_ent->cc_data +
			    FBA_SIZE(i + fba_len) - 8));
		}

		len = FBA_SIZE(fba_len);
		cc_ent = cc_ent->cc_dirty_next;
	} else {
		len = 0;
		flushed = 0;
	}
	while (cc_ent) {
		_sd_cc_iostatus_initiate(cc_ent);

		/* pageio bit already set in sdbc_alloc_io_mem() above */
		if (!coalesce)
			WAIT_CENTRY_PAGEIO(cc_ent, sdbc_flush_pageio);

		if (FULLY_DIRTY(cc_ent)) {
			flushed++;
			cc_ent->cc_iocount = 1;

			/* build buffer only if it was not done above */
			if (!anon_mem) {
				sd_add_fba(bp, &cc_ent->cc_addr, 0, BLK_FBAS);
				DATA_LOG(SDF_FLSHLIST, cc_ent, 0, BLK_FBAS);

				DTRACE_PROBE4(_sd_async_flclist_data2,
				    int, BLK_TO_FBA_NUM(CENTRY_BLK(cc_ent)),
				    int, BLK_FBAS, char *,
				    *(int64_t *)(cc_ent->cc_data),
				    char *, *(int64_t *)(cc_ent->cc_data +
				    FBA_SIZE(BLK_FBAS) - 8));
			}

			len += CACHE_BLOCK_SIZE;
		} else {
#if defined(_SD_DEBUG)
			/*
			 * consistency check.
			 */
			if (!last_dirty || cc_ent->cc_dirty_next ||
			    SDBC_IS_FRAGMENTED(last_dirty)) {
				SDALERT(SDF_FLCLIST, cd, 0,
				    BLK_TO_FBA_NUM(CENTRY_BLK(cc_ent)),
				    cc_ent->cc_dirty_next, last_dirty);
				cmn_err(CE_WARN,
				    "!_sd_err: flclist: last_dirty %x next %x",
				    last_dirty, cc_ent->cc_dirty_next);
			}
#endif
			flushed++;
			cc_ent->cc_iocount = 1;

			fba_len = SDBC_LOOKUP_LEN(last_dirty);

			/* build buffer only if it was not done above */
			if (!anon_mem) {
				sd_add_fba(bp, &cc_ent->cc_addr, 0, fba_len);
				DATA_LOG(SDF_FLSHLIST, cc_ent, 0, fba_len);

				DTRACE_PROBE4(_sd_async_flclist_data3, int,
				    BLK_TO_FBA_NUM(CENTRY_BLK(cc_ent)),
				    int, fba_len, char *,
				    *(int64_t *)(cc_ent->cc_data), char *,
				    *(int64_t *)(cc_ent->cc_data +
				    FBA_SIZE(fba_len) - 8));
			}

			len += FBA_SIZE(fba_len);
		}
		cc_ent = cc_ent->cc_dirty_next;
	}

#ifdef DEBUG
	if (anon_mem)
		ASSERT(len == cclist->cc_anon_len);
#endif

	/* SDTRACE(ST_INFO|SDF_FLCLIST, cd, FBA_NUM(len), dblk, flushed, bp); */
	(void) sd_start_io(bp, _sd_cache_files[cd].cd_strategy,
	    _sd_flclist_ea, cclist);

	DISK_FBA_WRITE(cd, FBA_NUM(len));
	/* increment number of bytes destaged to disk */
	WRITE_DESTAGED(cd, FBA_NUM(len));

	_sd_enqueue_io_pending(cd, cclist);

	SDTRACE(ST_EXIT|SDF_FLCLIST, cd, FBA_NUM(len), dblk, flushed, 0);
}


void
_sd_enqueue_io_pending(int cd, _sd_cctl_t *cclist)
{
	_sd_cd_info_t *cdi;

	cdi = &(_sd_cache_files[cd]);
	if (cdi->cd_io_head == NULL)
		cdi->cd_io_head = cdi->cd_io_tail = cclist;
	else {
		cdi->cd_io_tail->cc_dirty_link = cclist;
		cdi->cd_io_tail = cclist;
	}
}



void
_sd_async_flcent(_sd_cctl_t *cc_ent, dev_t rdev)
{
	int dblk, len, sblk;
	int dirty;
	struct buf *bp;
	int cd;

	cd = CENTRY_CD(cc_ent);

	SDTRACE(ST_ENTER|SDF_FLCENT, cd, 0,
	    BLK_TO_FBA_NUM(CENTRY_BLK(cc_ent)), 0, 0);
#if defined(_SD_DEBUG_PATTERN)
	check_write_consistency(cc_ent);
#endif
	if (cc_ent->cc_iocount)
		SDALERT(SDF_FLCENT, cd, 0, BLK_TO_FBA_NUM(CENTRY_BLK(cc_ent)),
		    cc_ent->cc_iocount, 0);
	_sd_cc_iostatus_initiate(cc_ent);
	WAIT_CENTRY_PAGEIO(cc_ent, sdbc_flush_pageio);

	dirty = CENTRY_DIRTY(cc_ent);

	if (_SD_BMAP_ISFULL(dirty)) {
		cc_ent->cc_iocount = 1;
		dblk = BLK_TO_FBA_NUM(CENTRY_BLK(cc_ent));
		bp = sd_alloc_iob(rdev, dblk, 1, B_WRITE);
		sd_add_fba(bp, &cc_ent->cc_addr, 0, BLK_FBAS);
		DATA_LOG(SDF_FLSHENT, cc_ent, 0, BLK_FBAS);

		DTRACE_PROBE4(_sd_async_flcent_data1,
		    int, BLK_TO_FBA_NUM(CENTRY_BLK(cc_ent)),
		    int, BLK_FBAS, char *, *(int64_t *)(cc_ent->cc_data),
		    char *, *(int64_t *)(cc_ent->cc_data +
		    FBA_SIZE(BLK_FBAS) - 8));
		cc_ent->cc_iocount = 1;
		(void) sd_start_io(bp, _sd_cache_files[cd].cd_strategy,
		    _sd_flcent_ea, cc_ent);
		DISK_FBA_WRITE(cd, BLK_FBAS);
		/* increment number of bytes destaged to disk */
		WRITE_DESTAGED(cd, BLK_FBAS);
	} else {
		cc_ent->cc_iocount = SDBC_LOOKUP_DTCOUNT(dirty);

		while (dirty) {
			sblk = SDBC_LOOKUP_STPOS(dirty);
			len = SDBC_LOOKUP_LEN(dirty);
			SDBC_LOOKUP_MODIFY(dirty);

			dblk = BLK_TO_FBA_NUM(CENTRY_BLK(cc_ent)) + sblk;
			bp = sd_alloc_iob(rdev, dblk, 1, B_WRITE);
			sd_add_fba(bp, &cc_ent->cc_addr, sblk, len);
			DATA_LOG(SDF_FLSHENT, cc_ent, sblk, len);

			DTRACE_PROBE4(_sd_async_flcent_data2, int,
			    BLK_TO_FBA_NUM(CENTRY_BLK(cc_ent)) + sblk,
			    int, len, char *,
			    *(int64_t *)(cc_ent->cc_data + FBA_SIZE(sblk)),
			    char *, *(int64_t *)(cc_ent->cc_data +
			    FBA_SIZE(sblk + len) - 8));

			/* SDTRACE(ST_INFO|SDF_FLCENT, cd, len, dblk, 0, bp); */

			(void) sd_start_io(bp, _sd_cache_files[cd].cd_strategy,
			    _sd_flcent_ea, cc_ent);
			DISK_FBA_WRITE(cd, len);
			/* increment number of bytes destaged to disk */
			WRITE_DESTAGED(cd, len);
		}
	}
	_sd_enqueue_io_pending(cd, cc_ent);

	SDTRACE(ST_EXIT|SDF_FLCENT, cd, 0, dblk, 0, 0);
}

static void
_sd_process_pending(int cd)
{
	_sd_cd_info_t *cdi;
	_sd_cctl_t *cc_ent, *cc_next;
	int dirty_enq;
	ss_centry_info_t *wctl;
	_sd_cctl_t *dirty_hd, **dirty_nxt;
	int sts, processed = 0;

	cdi = &(_sd_cache_files[cd]);

	SDTRACE(ST_ENTER|SDF_FLDONE, cd, 0,
	    SDT_INV_BL, cdi->cd_info->sh_numio, 0);
process_loop:
	if (cdi->cd_io_head == NULL) {
		if (processed) {
			mutex_enter(&cdi->cd_lock);
			cdi->cd_info->sh_numio -= processed;
			mutex_exit(&cdi->cd_lock);
		}
		SDTRACE(ST_EXIT|SDF_FLDONE, cd, 0,
		    SDT_INV_BL, cdi->cd_info->sh_numio, processed);
		return;
	}
	cc_ent = cdi->cd_io_head;
	if ((sts = cc_ent->cc_iostatus) == _SD_IO_INITIATE) {
		if (processed)  {
			mutex_enter(&cdi->cd_lock);
			cdi->cd_info->sh_numio -= processed;
			mutex_exit(&cdi->cd_lock);
		}
		SDTRACE(ST_EXIT|SDF_FLDONE, cd, 0,
		    SDT_INV_BL, cdi->cd_info->sh_numio, processed);
		return;
	}
	LINTUSED(sts);
#if defined(_SD_DEBUG)
	if ((sts != _SD_IO_DONE) && (sts != _SD_IO_FAILED))
		SDALERT(SDF_FLDONE, cd, 0,
		    BLK_TO_FBA_NUM(CENTRY_BLK(cc_ent)), 0, sts);
#endif

	if ((cdi->cd_io_head = cc_ent->cc_dirty_link) == NULL)
		cdi->cd_io_tail = NULL;

	cc_ent->cc_dirty_link = NULL;
	if (cc_ent->cc_iostatus == _SD_IO_FAILED &&
	    _sd_process_failure(cc_ent))
		goto process_loop;

	dirty_enq = 0;
	dirty_nxt = &(dirty_hd);

	DTRACE_PROBE1(_sd_process_pending_cd, int, cd);

	for (; cc_ent; cc_ent = cc_next) {

		DTRACE_PROBE1(_sd_process_pending_cc_ent,
		    _sd_cctl_t *, cc_ent);
		processed++;
		cc_next = cc_ent->cc_dirty_next;
		cc_ent->cc_dirty_next = NULL;

		if (CENTRY_PINNED(cc_ent))
			_sd_process_reflush(cc_ent);

		/*
		 * Optimize for common case where block not inuse
		 * Grabbing cc_inuse is faster than cc_lock.
		 */
		if (SET_CENTRY_INUSE(cc_ent))
			goto must_lock;

		cc_ent->cc_iostatus = _SD_IO_NONE;
		if (CENTRY_DIRTY_PENDING(cc_ent)) {
			cc_ent->cc_flag &= ~CC_PEND_DIRTY;

			CLEAR_CENTRY_INUSE(cc_ent);
			if (dirty_enq)
				dirty_nxt = &((*dirty_nxt)->cc_dirty_link);
			(*dirty_nxt) = cc_ent;
			dirty_enq++;
			continue;
		}
		cc_ent->cc_dirty = 0;
		wctl = cc_ent->cc_write;
		cc_ent->cc_write = NULL;
		cc_ent->cc_flag &= ~(CC_PINNABLE);


		wctl->sc_dirty = 0;
		SSOP_SETCENTRY(sdbc_safestore, wctl);
		SSOP_DEALLOCRESOURCE(sdbc_safestore, wctl->sc_res);

		/*
		 * if this was a QHEAD cache block, then
		 * _sd_centry_release() did not requeue it as
		 * it was dirty.  Requeue it now.
		 */

		if (CENTRY_QHEAD(cc_ent))
			if (sdbc_use_dmchain) {

				/* attempt to queue head */
				if (cc_ent->cc_alloc_size_dm) {

					sdbc_requeue_head_dm_try(cc_ent);
				}
			} else
				_sd_requeue_head(cc_ent);

		CLEAR_CENTRY_INUSE(cc_ent);
		continue;

		/*
		 * Block is inuse, must take cc_lock
		 * if DIRTY_PENDING, must re-issue
		 */
	must_lock:
		/* was FAST */
		mutex_enter(&cc_ent->cc_lock);
		cc_ent->cc_iostatus = _SD_IO_NONE;
		if (CENTRY_DIRTY_PENDING(cc_ent)) {
			cc_ent->cc_flag &= ~CC_PEND_DIRTY;
			/* was FAST */
			mutex_exit(&cc_ent->cc_lock);
			if (dirty_enq)
				dirty_nxt = &((*dirty_nxt)->cc_dirty_link);
			(*dirty_nxt) = cc_ent;
			dirty_enq++;
			continue;
		}
		/*
		 * clear dirty bits, if block no longer inuse release cc_write
		 */
		cc_ent->cc_dirty = 0;
		if (SET_CENTRY_INUSE(cc_ent) == 0) {

			wctl = cc_ent->cc_write;
			cc_ent->cc_write = NULL;
			cc_ent->cc_flag &= ~(CC_PINNABLE);
			/* was FAST */
			mutex_exit(&cc_ent->cc_lock);


			wctl->sc_dirty = 0;
			SSOP_SETCENTRY(sdbc_safestore, wctl);
			SSOP_DEALLOCRESOURCE(sdbc_safestore, wctl->sc_res);

			/*
			 * if this was a QHEAD cache block, then
			 * _sd_centry_release() did not requeue it as
			 * it was dirty.  Requeue it now.
			 */

			if (CENTRY_QHEAD(cc_ent))
				if (sdbc_use_dmchain) {

					/* attempt to queue head */
					if (cc_ent->cc_alloc_size_dm) {
						sdbc_requeue_head_dm_try
						    (cc_ent);
					}
				} else
					_sd_requeue_head(cc_ent);
			CLEAR_CENTRY_INUSE(cc_ent);
		} else {
			/* was FAST */
			mutex_exit(&cc_ent->cc_lock);
		}
	}

	if (dirty_enq)
		_sd_enqueue_dirty_chain(cd, dirty_hd, (*dirty_nxt), dirty_enq);

	goto process_loop;
}
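
/*
 * Design note for _sd_process_pending(): the SET_CENTRY_INUSE() fast
 * path above avoids taking cc_lock entirely when a block is not in use;
 * only on contention does it fall through to the must_lock path, where
 * cc_lock serializes against a concurrent writer that may set
 * CC_PEND_DIRTY and force the block to be re-queued as dirty.
 */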


static void
_sd_flcent_ea(blind_t xcc_ent, nsc_off_t fba_pos, nsc_size_t fba_len, int error)
{
	_sd_cctl_t *cc_ent = (_sd_cctl_t *)xcc_ent;
	int cd;
	nsc_off_t dblk;

	_sd_cd_info_t *cdi;

	cd = CENTRY_CD(cc_ent);
	dblk = BLK_TO_FBA_NUM(CENTRY_BLK(cc_ent));
	cdi = &(_sd_cache_files[cd]);

	SDTRACE(ST_ENTER|SDF_FLCENT_EA, cd, 0, dblk, 2, (unsigned long)cc_ent);

	if (error) {
		if (cdi->cd_info->sh_failed == 0) {
			cdi->cd_info->sh_failed = 1;
			cmn_err(CE_WARN, "!sdbc(_sd_flcent_ea) "
			    "Disk write failed cd %d (%s): err %d",
			    cd, cdi->cd_info->sh_filename, error);
		}
	}

	/* was FAST */
	mutex_enter(&cc_ent->cc_lock);
	if (--(cc_ent->cc_iocount) != 0) {
		/* more io's to complete before the cc_ent is done. */

		if (cc_ent->cc_iocount < 0) {
			/* was FAST */
			mutex_exit(&cc_ent->cc_lock);
			SDALERT(SDF_FLCENT_EA, cd, 0,
			    dblk, cc_ent->cc_iocount, 0);
		} else {
			/* was FAST */
			mutex_exit(&cc_ent->cc_lock);
		}
		SDTRACE(ST_EXIT|SDF_FLCENT_EA, cd, 0, dblk, 2,
		    (unsigned long)cc_ent);

		DTRACE_PROBE(_sd_flcent_ea_end);
		return;
	}
	/* was FAST */
	mutex_exit(&cc_ent->cc_lock);

	DATA_LOG(SDF_FLEA, cc_ent, BLK_FBA_OFF(fba_pos), fba_len);

	DTRACE_PROBE4(_sd_flcent_ea_data, uint64_t, ((uint64_t)
	    BLK_TO_FBA_NUM(CENTRY_BLK(cc_ent)) + BLK_FBA_OFF(fba_pos)),
	    uint64_t, (uint64_t)fba_len, char *,
	    *(int64_t *)(cc_ent->cc_data + FBA_SIZE(BLK_FBA_OFF(fba_pos))),
	    char *, *(int64_t *)(cc_ent->cc_data +
	    FBA_SIZE(BLK_FBA_OFF(fba_pos) + fba_len) - 8));

	/*
	 * All io's are done for this cc_ent.
	 * Clear the pagelist io flag.
	 */
	CLEAR_CENTRY_PAGEIO(cc_ent);

	if (error)
		cc_ent->cc_iostatus = _SD_IO_FAILED;
	else
		cc_ent->cc_iostatus = _SD_IO_DONE;

	SDTRACE(ST_EXIT|SDF_FLCENT_EA, cd, 0, dblk, 2, (unsigned long)cc_ent);

}



static void
_sd_flclist_ea(blind_t xcc_ent, nsc_off_t fba_pos, nsc_size_t fba_len,
    int error)
{
	_sd_cctl_t *cc_ent = (_sd_cctl_t *)xcc_ent;
	_sd_cctl_t *first_cc = cc_ent;
	_sd_cd_info_t *cdi;
	int cd;
	nsc_off_t dblk;

	cd = CENTRY_CD(cc_ent);
	dblk = BLK_TO_FBA_NUM(CENTRY_BLK(cc_ent));
	cdi = &(_sd_cache_files[cd]);

	SDTRACE(ST_ENTER|SDF_FLCLIST_EA, cd, 0, dblk, 1, (unsigned long)cc_ent);

	if (error) {
		if (cdi->cd_info->sh_failed == 0) {
			cdi->cd_info->sh_failed = 1;
			cmn_err(CE_WARN, "!sdbc(_sd_flclist_ea) "
			    "Disk write failed cd %d (%s): err %d",
			    cd, cdi->cd_info->sh_filename, error);
		}
	}
	/*
	 * Important: skip the first cc_ent in the list. Marking it done
	 * would make the writer think the whole io is done, even though
	 * the rest of the chain has not been processed here; so mark the
	 * first cc_ent last. Optimization, so as not to use locks.
	 */

	cc_ent = cc_ent->cc_dirty_next;
	while (cc_ent) {
		DTRACE_PROBE2(_sd_flclist_ea, _sd_cctl_t *, cc_ent,
		    int, CENTRY_CD(cc_ent));

		if (cc_ent->cc_iocount != 1)
			SDALERT(SDF_FLCLIST_EA, cd, 0,
			    BLK_TO_FBA_NUM(CENTRY_BLK(cc_ent)),
			    cc_ent->cc_iocount, 0);
		cc_ent->cc_iocount = 0;

		/*
		 * Clear the pagelist io flag.
		 */
		CLEAR_CENTRY_PAGEIO(cc_ent);

		if (error)
			cc_ent->cc_iostatus = _SD_IO_FAILED;
		else
			cc_ent->cc_iostatus = _SD_IO_DONE;
		if (cc_ent->cc_dirty_next) {
			DATA_LOG(SDF_FLSTEA, cc_ent, 0, BLK_FBAS);

			DTRACE_PROBE4(_sd_flclist_ea_data1, uint64_t,
			    BLK_TO_FBA_NUM(CENTRY_BLK(cc_ent)),
			    int, BLK_FBAS, char *,
			    *(int64_t *)(cc_ent->cc_data),
			    char *, *(int64_t *)(cc_ent->cc_data +
			    FBA_SIZE(BLK_FBAS) - 8));
		} else {
			DATA_LOG(SDF_FLSTEA, cc_ent, 0,
			    BLK_FBA_OFF(fba_pos + fba_len));

			DTRACE_PROBE4(_sd_flclist_ea_data2, uint64_t,
			    (uint64_t)BLK_TO_FBA_NUM(CENTRY_BLK(cc_ent)),
			    uint64_t, (uint64_t)BLK_FBA_OFF(fba_pos + fba_len),
			    char *, *(int64_t *)(cc_ent->cc_data),
			    char *, *(int64_t *)(cc_ent->cc_data +
			    FBA_SIZE(BLK_FBA_OFF(fba_pos + fba_len)) - 8));
		}

		cc_ent = cc_ent->cc_dirty_next;
	}

	/*
	 * Now process the first cc_ent in the list.
	 */
	cc_ent = first_cc;
	DATA_LOG(SDF_FLSTEA, cc_ent, BLK_FBA_OFF(fba_pos),
	    BLK_FBAS - BLK_FBA_OFF(fba_pos));

	DTRACE_PROBE4(_sd_flclist_ea_data3, uint64_t,
	    (uint64_t)fba_pos, int, BLK_FBAS - BLK_FBA_OFF(fba_pos),
	    char *, *(int64_t *)(cc_ent->cc_data +
	    FBA_SIZE(BLK_FBA_OFF(fba_pos))), char *,
	    *(int64_t *)(cc_ent->cc_data + FBA_SIZE(BLK_FBA_OFF(fba_pos) +
	    BLK_FBAS - BLK_FBA_OFF(fba_pos)) - 8));

	cc_ent->cc_iocount = 0;

	if (cc_ent->cc_anon_addr.sa_virt) {
		kmem_free(cc_ent->cc_anon_addr.sa_virt, cc_ent->cc_anon_len);
		cc_ent->cc_anon_addr.sa_virt = NULL;
		cc_ent->cc_anon_len = 0;
	}

	/*
	 * Clear the pagelist io flag.
	 */
	CLEAR_CENTRY_PAGEIO(cc_ent);

	if (error)
		cc_ent->cc_iostatus = _SD_IO_FAILED;
	else
		cc_ent->cc_iostatus = _SD_IO_DONE;

	SDTRACE(ST_EXIT|SDF_FLCLIST_EA, cd, 0, dblk, 1, (unsigned long)cc_ent);
}


static void
_sd_mark_failed(_sd_cctl_t *cclist)
{
	_sd_cctl_t *cc_ent;
	int cd;

	cd = CENTRY_CD(cclist);
	cc_ent = cclist;
	while (cc_ent) {
		cc_ent->cc_iostatus = _SD_IO_FAILED;
		cc_ent = cc_ent->cc_dirty_next;
	}
	_sd_enqueue_io_pending(cd, cclist);
}



/*
 * Fail single chain of cache blocks, updating numfail/numio counts.
 * For dual-copy, log & clear PINNED, fall thru to regular processing.
 */
int
_sd_process_failure(_sd_cctl_t *cc_ent)
{
	int cd, num;
	_sd_cctl_t *cc_chain;
	_sd_cd_info_t *cdi;

	cd = CENTRY_CD(cc_ent);
	cdi = &(_sd_cache_files[cd]);

	cc_chain = cc_ent;

	if (!cdi->cd_global->sv_pinned) {
		cdi->cd_global->sv_pinned = _SD_SELF_HOST;
		SSOP_SETVOL(sdbc_safestore, cdi->cd_global);
	}

	for (num = 0; cc_ent; cc_ent = cc_ent->cc_dirty_next) {
		num++;
		/* was FAST */
		mutex_enter(&cc_ent->cc_lock);
		cc_ent->cc_flag |= (CC_PEND_DIRTY |
		    (CENTRY_PINNABLE(cc_ent) ? CC_PINNED : 0));
		if (cc_ent->cc_write) {
			cc_ent->cc_write->sc_flag = cc_ent->cc_flag;
			SSOP_SETCENTRY(sdbc_safestore, cc_ent->cc_write);
		}
		mutex_exit(&cc_ent->cc_lock);
		if (CENTRY_PINNED(cc_ent))
			nsc_pinned_data(cdi->cd_iodev,
			    BLK_TO_FBA_NUM(CENTRY_BLK(cc_ent)), BLK_FBAS);
	}

	/*
	 *  In normal processing we wouldn't need a lock here as all i/o
	 *  is single threaded by cd. However during failover blocks can
	 *  be failing from real i/o and as soon as the disk is marked bad
	 *  the failover code which is furiously cloning safe-store into
	 *  more blocks will short circuit to here (see _sd_ft_clone)
	 *  and two threads can be executing in here simultaneously.
	 */
	mutex_enter(&cdi->cd_lock);
	cc_chain->cc_dirty_link = cdi->cd_fail_head;
	cdi->cd_fail_head = cc_chain;
	cdi->cd_info->sh_numfail += num;
	cdi->cd_info->sh_numio   -= num;
	mutex_exit(&cdi->cd_lock);
	return (1);		/* blocks are failed */
}


static void
_sd_process_reflush(_sd_cctl_t *cc_ent)
{
	int cd;

	if (CENTRY_PINNABLE(cc_ent)) {
		cd = CENTRY_CD(cc_ent);
		nsc_unpinned_data(_sd_cache_files[cd].cd_iodev,
		    BLK_TO_FBA_NUM(CENTRY_BLK(cc_ent)), BLK_FBAS);
	}

	/* was FAST */
	mutex_enter(&cc_ent->cc_lock);
	cc_ent->cc_flag &= ~CC_PINNED;
	/* was FAST */
	mutex_exit(&cc_ent->cc_lock);
}



/*
 * cd_write_thread -- flush dirty buffers.
 *
 * ARGUMENTS:
 *
 *  cd - cache descriptor
 *
 * USAGE:
 *  called by cd's writer thread, returns when no more entries
 *
 * NOTE: if sdbc is being shutdown (for powerfail) then we will
 * process pending i/o's but issue no more new ones.
 */
static int SD_LOOP_DELAY = 32;
#if !defined(m88k) && !defined(sun)
static int SD_WRITE_HIGH = 255;	/* cache blocks */
#endif
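
/*
 * Flush batching note: the newest chain (cd_lastchain_ptr) is normally
 * held back so it can continue to accumulate writes.  When it is the
 * only dirty chain, cd_write_thread() flushes it only after
 * sh_flushloop exceeds SD_LOOP_DELAY wakeups (32 by default), while
 * completed chains ahead of it are flushed on every pass.
 */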

static void
cd_write_thread(int cd)
{
	_sd_cctl_t *cc_list, *dirty_head, *last_chain;
	_sd_cd_info_t *cdi;

	cdi = &(_sd_cache_files[cd]);
	if (!FILE_OPENED(cd)) {
		cdi->cd_writer = _SD_WRITER_NONE;
		return;
	}
	cdi->cd_writer = _SD_WRITER_RUNNING;

	_sd_process_pending(cd);

	if (_sdbc_shutdown_in_progress) {
		cdi->cd_write_inprogress = 0;
		cdi->cd_writer = _SD_WRITER_NONE;
		return;
	}
#if !defined(m88k) && !defined(sun)
	if (cdi->cd_info->sh_numio > SD_WRITE_HIGH) {
		/* let I/Os complete before issuing more */
		cdi->cd_writer = _SD_WRITER_NONE;
		return;
	}
#endif

#ifdef DEBUG
	if (!_sdbc_flush_flag) { /* hang the flusher for testing */
		cdi->cd_write_inprogress = 0;
		cdi->cd_writer = _SD_WRITER_NONE;
		return;
	}
#endif

	dirty_head = cdi->cd_dirty_head;
	if (dirty_head && (dirty_head != cdi->cd_lastchain_ptr ||
	    ++cdi->cd_info->sh_flushloop > SD_LOOP_DELAY)) {
		cdi->cd_info->sh_flushloop = 0;
		/* was FAST */
		mutex_enter(&cdi->cd_lock);
		if (SD_LOOP_DELAY == 0 ||
		    dirty_head == cdi->cd_lastchain_ptr) {
			last_chain = NULL;
			cdi->cd_dirty_head = NULL;
			cdi->cd_dirty_tail = NULL;
			cdi->cd_info->sh_numio += cdi->cd_info->sh_numdirty;
			cdi->cd_info->sh_numdirty = 0;
		} else
#if !defined(m88k) && !defined(sun)
		if (cdi->cd_info->sh_numdirty > SD_WRITE_HIGH) {
			int count = 0;
			for (last_chain = dirty_head; last_chain;
			    last_chain = last_chain->cc_dirty_next)
				count++;
			last_chain = dirty_head->cc_dirty_link;
			cdi->cd_dirty_head = last_chain;
			/* cdi->cd_dirty_tail is unchanged */
			cdi->cd_info->sh_numio += count;
			cdi->cd_info->sh_numdirty -= count;
		} else
#endif
		{
			last_chain = cdi->cd_lastchain_ptr;
			cdi->cd_dirty_head = last_chain;
			cdi->cd_dirty_tail = last_chain;
			cdi->cd_info->sh_numio += cdi->cd_info->sh_numdirty -
			    cdi->cd_lastchain;
			cdi->cd_info->sh_numdirty = cdi->cd_lastchain;
		}
		/* was FAST */
		mutex_exit(&cdi->cd_lock);

		while (((cc_list = dirty_head) != NULL) &&
		    cc_list != last_chain) {
			dirty_head = cc_list->cc_dirty_link;
			cc_list->cc_dirty_link = NULL;
			if (cdi->cd_info->sh_failed)
				_sd_mark_failed(cc_list);
			else if (cc_list->cc_dirty_next == NULL)
				_sd_async_flcent(cc_list, cdi->cd_crdev);
			else
				_sd_async_flclist(cc_list, cdi->cd_crdev);
			cdi->cd_write_inprogress++;
		}
	}
	cdi->cd_write_inprogress = 0;
	cdi->cd_writer = _SD_WRITER_NONE;
}

/*
 * cd_writer -- spawn new writer if not running already
 *	called after enqueuing the dirty blocks
 */
int
cd_writer(int cd)
{
	_sd_cd_info_t *cdi;
	nstset_t *tset = NULL;
	nsthread_t *t;

#if defined(_SD_USE_THREADS)
	tset = _sd_ioset;
#endif	/* _SD_USE_THREADS */

	cdi = &(_sd_cache_files[cd]);

	if (cdi->cd_writer)
		return (0);

	if (tset == NULL) {
		_sd_unblock(&_sd_flush_cv);
		return (0);
	}

	if (cdi->cd_writer || xmem_bu(_SD_WRITER_CREATE, &cdi->cd_writer))
		return (0);

	t = nst_create(tset, cd_write_thread, (blind_t)(unsigned long)cd, 0);
	if (t)
		return (1);

	cmn_err(CE_WARN, "!sdbc(cd_writer) cd %d nst_create error", cd);
	cdi->cd_writer = _SD_WRITER_NONE;
	return (-1);
}

/*
 * _sd_ccent_rd - add appropriate parts of cc_ent to struct buf.
 *	optimized not to read dirty FBAs from disk.
 *
 * ARGUMENTS:
 *
 * cc_ent   - single cache block
 * wanted   - bitlist of FBAs that need to be read
 * bp	- struct buf to extend
 *
 * USAGE:
 *	Called for each dirty cache block in a read I/O.
 *	The bp must be sized to allow for one entry per FBA that needs
 *	to be read (see _sd_doread()).
 */

void
_sd_ccent_rd(_sd_cctl_t *cc_ent, uint_t wanted, struct buf *bp)
{
	int index, offset = 0, size = 0;
	int state, state1 = -3;	/* state1 is previous state */
	sd_addr_t *addr = NULL;
	uint_t dirty;

	dirty  = CENTRY_DIRTY(cc_ent);
	for (index = 0; index < BLK_FBAS; index++) {
		if (!_SD_BIT_ISSET(wanted, index))
			continue;
		state = _SD_BIT_ISSET(dirty, index);
		if (state == state1) /* same state, expand size */
			size++;
		else {
			if (state1 != -3) /* not first FBA */
				sd_add_fba(bp, addr, offset, size);
			state1 = state;	/* new previous state */
			offset = index;
			size  = 1;
			if (state) {		/* dirty, don't overwrite */
				addr = NULL;
			} else {
				addr = &cc_ent->cc_addr;
			}
		}
	}
	if (state1 != -3)
		sd_add_fba(bp, addr, offset, size);
}
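
/*
 * Worked example for _sd_ccent_rd() (illustrative bit patterns only):
 * with BLK_FBAS = 8, wanted = 0xff and dirty = 0x3c, the scan issues
 * three sd_add_fba() calls: FBAs 0-1 with addr = &cc_addr (clean, read
 * from disk into cache), FBAs 2-5 with addr = NULL (dirty in cache, so
 * the disk data must not overwrite them), and FBAs 6-7 from cache again.
 */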



int _SD_WR_THRESHOLD = 1000;
static void
_sd_flush_thread(void)
{
	int cd;
	_sd_cd_info_t *cdi;
	_sd_shared_t *shi;
	int cnt;
	int short_sleep = 0;
	long tics;
	int waiting_for_idle = 0;
	int check_count = 0;
	int pending, last_pending;
	int SD_LONG_SLEEP_TICS, SD_SHORT_SLEEP_TICS;
	nstset_t *tset = NULL;
	nsthread_t *t;

#if defined(_SD_USE_THREADS)
	tset = _sd_ioset;
#endif	/* _SD_USE_THREADS */

	mutex_enter(&_sd_cache_lock);
	_sd_cache_dem_cnt++;
	mutex_exit(&_sd_cache_lock);

	/* .2 seconds */
	SD_LONG_SLEEP_TICS = drv_usectohz(200000);
	/* .02 seconds */
	SD_SHORT_SLEEP_TICS = drv_usectohz(20000);

	/* CONSTCOND */
	while (1) {
		if (_sd_flush_exit == 0) {
			/*
			 * wait until no i/o's pending (on two successive
			 * iterations) or we see no progress after
			 * GIVE_UP_WAITING total sleeps.
			 */
/* at most 5*128 ticks, about 6 seconds, of no progress */
#define	GIVE_UP_WAITING	128
			if (waiting_for_idle) {
				pending = _sd_pending_iobuf();
				/*LINTED*/
				if (pending == last_pending) {
					if (pending != 0)
						check_count++;
				} else
					check_count = 0;
				if ((last_pending == 0 && (pending == 0)) ||
				    (check_count == GIVE_UP_WAITING)) {
					mutex_enter(&_sd_cache_lock);
					_sd_cache_dem_cnt--;
					mutex_exit(&_sd_cache_lock);
					if (check_count == GIVE_UP_WAITING)
						cmn_err(CE_WARN,
						    "!_sd_flush_thread "
						    "exiting with %d IOs "
						    "pending", pending);
					return;
				}
				last_pending = pending;
			} else {
				waiting_for_idle = 1;
				last_pending = _sd_pending_iobuf();
			}
		}

		/*
		 * Normally wakeup every SD_LONG_SLEEP_TICS to flush.
		 */

		if (!short_sleep) {
			ssioc_stats_t ss_stats;
			int rc;

			if ((rc = SSOP_CTL(sdbc_safestore, SSIOC_STATS,
			    (uintptr_t)&ss_stats)) == 0) {

				if (ss_stats.wq_inq < _SD_WR_THRESHOLD)
					short_sleep = 1;
			} else {
				if (rc == SS_ERR)
					cmn_err(CE_WARN,
					    "!sdbc(_sd_flush_thread) "
					    "cannot get safestore inq");
			}
		}

		if (short_sleep)
			tics = SD_SHORT_SLEEP_TICS;
		else
			tics = SD_LONG_SLEEP_TICS;

		_sd_timed_block(tics, &_sd_flush_cv);
		cd = 0;
		cnt = short_sleep = 0;
		for (; (cnt < _sd_cache_stats->st_loc_count) &&
		    (cd < sdbc_max_devs); cd++) {
			cdi = &_sd_cache_files[cd];
			shi = cdi->cd_info;

			if (shi == NULL || (shi->sh_failed == 2))
				continue;

			if (!(shi->sh_alloc & CD_ALLOCATED) ||
			    !(shi->sh_flag & CD_ATTACHED))
				continue;
			cnt++;
			if (cdi->cd_writer)
				continue;
			if (!_SD_CD_WBLK_USED(cd)) {
				if (cdi->cd_failover == 2) {
					nsc_release(cdi->cd_rawfd);
					cdi->cd_failover = 0;
				}
				continue;
			}
			if (cdi->cd_writer ||
			    xmem_bu(_SD_WRITER_CREATE, &cdi->cd_writer))
				continue;

			t = NULL;
			if (tset) {
				t = nst_create(tset,
				    cd_write_thread, (blind_t)(unsigned long)cd,
				    0);
			}
			if (!t)
				cd_write_thread(cd);
		}
	}
}


#if defined(_SD_DEBUG_PATTERN)
check_write_consistency(cc_entry)
	_sd_cctl_t *cc_entry;
{
	int *data;
	nsc_off_t fba_pos;
	int i, dirty_bl;

	while (cc_entry) {
		dirty_bl = CENTRY_DIRTY(cc_entry);
		if (dirty_bl == 0) {
			cmn_err(CE_WARN, "!check: no dirty");
		}
		data = (int *)cc_entry->cc_data;
		fba_pos = BLK_TO_FBA_NUM(CENTRY_BLK(cc_entry));

		for (i = 0; i < 8; i++, data += 128, fba_pos++) {
			if (dirty_bl & 1) {
				if (*((int *)(data + 2)) != fba_pos) {
					cmn_err(CE_WARN, "!wr exp %" NSC_SZFMT
					    " got %x", fba_pos, *(data + 2));
				}
			}
			dirty_bl >>= 1;
		}
		cc_entry = cc_entry->cc_dirty_next;
	}
}

check_buf_consistency(handle, rw)
	_sd_buf_handle_t *handle;
	char *rw;
{
	_sd_bufvec_t *bvec1;
	int *data;
	nsc_off_t fpos;
	nsc_size_t fba_len, i;
	nsc_size_t len = 0;

	bvec1 = handle->bh_bufvec;
	fpos =  handle->bh_fba_pos;

	while (bvec1->bufaddr) {
		fba_len = FBA_NUM(bvec1->buflen);
		data = (int *)bvec1->bufaddr;
		for (i = 0; i < fba_len; i++, data += 128, fpos++) {
			len++;
			if (*(data+2) != fpos) {
				cmn_err(CE_WARN, "!%s exp%" NSC_SZFMT " got%x",
				    rw, fpos, *(data + 2));
			}
		}
		bvec1++;
	}
	if (handle->bh_fba_len != len) {
		cmn_err(CE_WARN, "!len %" NSC_SZFMT " real %" NSC_SZFMT, len,
		    handle->bh_fba_len);
	}
}
#endif

int
_sdbc_wait_pending(void)
{
	int tries, pend, last;

	tries = 0;
	last  = _sd_pending_iobuf();
	while ((pend = _sd_pending_iobuf()) > 0) {
		if (pend == last) {
			if (++tries > 60) {
				return (pend);
			}
		} else {
			/* progress was made; track the new count */
			last = pend;
			tries = 0;
		}
		delay(HZ);
	}
	return (0);
}
2010