xref: /titanic_52/usr/src/uts/common/avs/ns/sdbc/sd_bcache.c (revision 3270659f55e0928d6edec3d26217cc29398a8149)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 #include <sys/types.h>
27 #include <sys/ksynch.h>
28 #include <sys/cmn_err.h>
29 #include <sys/errno.h>
30 #include <sys/kmem.h>
31 #include <sys/cred.h>
32 #include <sys/buf.h>
33 #include <sys/ddi.h>
34 
35 #include <sys/nsc_thread.h>
36 #include <sys/nsctl/nsctl.h>
37 
38 #include <sys/sdt.h>		/* dtrace is S10 or later */
39 
40 #include "sd_bcache.h"
41 #include "sd_trace.h"
42 #include "sd_io.h"
43 #include "sd_bio.h"
44 #include "sd_ft.h"
45 #include "sd_misc.h"
46 #include "sd_pcu.h"
47 
48 #include <sys/unistat/spcs_s.h>
49 #include <sys/unistat/spcs_s_k.h>
50 #include <sys/unistat/spcs_errors.h>
51 #include <sys/nsctl/safestore.h>
52 #ifndef DS_DDICT
53 #include <sys/ddi_impldefs.h>
54 #endif
55 
56 
57 /*
58  * kstat interface
59  */
60 
/*
 * Handle of the module-wide named kstat and its update callback.  The
 * handle is created in _sdbc_stats_configure() and deleted in
 * _sdbc_stats_deconfigure().
 */
static kstat_t *sdbc_global_stats_kstat;
static int sdbc_global_stats_update(kstat_t *ksp, int rw);

/*
 * Layout of the global named kstat: one kstat_named_t per exported
 * counter.  The member order must stay in step with the initializer of
 * sdbc_global_stats, which is positional; the two lru request counters
 * exist only in DEBUG builds.
 */
typedef struct {
	kstat_named_t	ci_sdbc_count;
	kstat_named_t	ci_sdbc_loc_count;
	kstat_named_t	ci_sdbc_rdhits;
	kstat_named_t	ci_sdbc_rdmiss;
	kstat_named_t	ci_sdbc_wrhits;
	kstat_named_t	ci_sdbc_wrmiss;
	kstat_named_t	ci_sdbc_blksize;
	kstat_named_t	ci_sdbc_lru_blocks;
#ifdef DEBUG
	kstat_named_t	ci_sdbc_lru_noreq;
	kstat_named_t	ci_sdbc_lru_req;
#endif
	kstat_named_t	ci_sdbc_wlru_inq;
	kstat_named_t	ci_sdbc_cachesize;
	kstat_named_t	ci_sdbc_numblocks;
	kstat_named_t	ci_sdbc_num_shared;
	kstat_named_t	ci_sdbc_wrcancelns;
	kstat_named_t	ci_sdbc_destaged;
	kstat_named_t	ci_sdbc_nodehints;
} sdbc_global_stats_t;
85 
/*
 * Name and data type of every global statistic, listed positionally in
 * the member order of sdbc_global_stats_t.  _sdbc_stats_configure()
 * installs this object as the ks_data of the global named kstat.
 */
static sdbc_global_stats_t sdbc_global_stats = {
	{SDBC_GKSTAT_COUNT,		KSTAT_DATA_ULONG},
	{SDBC_GKSTAT_LOC_COUNT,		KSTAT_DATA_ULONG},
	{SDBC_GKSTAT_RDHITS,		KSTAT_DATA_ULONG},
	{SDBC_GKSTAT_RDMISS,		KSTAT_DATA_ULONG},
	{SDBC_GKSTAT_WRHITS,		KSTAT_DATA_ULONG},
	{SDBC_GKSTAT_WRMISS,		KSTAT_DATA_ULONG},
	{SDBC_GKSTAT_BLKSIZE,		KSTAT_DATA_ULONG},
	{SDBC_GKSTAT_LRU_BLOCKS,	KSTAT_DATA_ULONG},
#ifdef DEBUG
	/* present only when the matching typedef members are compiled in */
	{SDBC_GKSTAT_LRU_NOREQ,		KSTAT_DATA_ULONG},
	{SDBC_GKSTAT_LRU_REQ,		KSTAT_DATA_ULONG},
#endif
	{SDBC_GKSTAT_WLRU_INQ,		KSTAT_DATA_ULONG},
	{SDBC_GKSTAT_CACHESIZE,		KSTAT_DATA_ULONG},
	{SDBC_GKSTAT_NUMBLOCKS,		KSTAT_DATA_ULONG},
	{SDBC_GKSTAT_NUM_SHARED,	KSTAT_DATA_ULONG},
	{SDBC_GKSTAT_WRCANCELNS,	KSTAT_DATA_ULONG},
	{SDBC_GKSTAT_DESTAGED,		KSTAT_DATA_ULONG},
	{SDBC_GKSTAT_NODEHINTS,		KSTAT_DATA_ULONG},
};
107 
/*
 * Per cache descriptor (cd) kstat bookkeeping.  The three arrays each
 * hold sdbc_max_devs entries and are allocated in
 * _sdbc_stats_configure(); the global I/O kstat uses
 * sdbc_global_io_kstat_mutex as its ks_lock.
 */
static kstat_t **sdbc_cd_kstats;
static kstat_t **sdbc_cd_io_kstats;
static kmutex_t *sdbc_cd_io_kstats_mutexes;
static kstat_t *sdbc_global_io_kstat;
static kmutex_t sdbc_global_io_kstat_mutex;
static int sdbc_cd_stats_update(kstat_t *ksp, int rw);
static int cd_kstat_add(int cd);
static int cd_kstat_remove(int cd);

/*
 * Layout of a per-cd named kstat.  The member order must stay in step
 * with the positional initializer of sdbc_cd_stats.
 */
typedef struct {
	kstat_named_t	ci_sdbc_vol_name;
	kstat_named_t	ci_sdbc_failed;
	kstat_named_t	ci_sdbc_cd;
	kstat_named_t	ci_sdbc_cache_read;
	kstat_named_t	ci_sdbc_cache_write;
	kstat_named_t	ci_sdbc_disk_read;
	kstat_named_t	ci_sdbc_disk_write;
	kstat_named_t	ci_sdbc_filesize;
	kstat_named_t	ci_sdbc_numdirty;
	kstat_named_t	ci_sdbc_numio;
	kstat_named_t	ci_sdbc_numfail;
	kstat_named_t	ci_sdbc_destaged;
	kstat_named_t	ci_sdbc_wrcancelns;
	kstat_named_t	ci_sdbc_cdhints;
} sdbc_cd_stats_t;
133 
/*
 * Name and data type of every per-cd statistic, listed positionally in
 * the member order of sdbc_cd_stats_t.
 */
static sdbc_cd_stats_t sdbc_cd_stats = {
	{SDBC_CDKSTAT_VOL_NAME,		KSTAT_DATA_CHAR},
	{SDBC_CDKSTAT_FAILED,		KSTAT_DATA_ULONG},
	{SDBC_CDKSTAT_CD,		KSTAT_DATA_ULONG},
	{SDBC_CDKSTAT_CACHE_READ,	KSTAT_DATA_ULONG},
	{SDBC_CDKSTAT_CACHE_WRITE,	KSTAT_DATA_ULONG},
	{SDBC_CDKSTAT_DISK_READ,	KSTAT_DATA_ULONG},
	{SDBC_CDKSTAT_DISK_WRITE,	KSTAT_DATA_ULONG},
	/* the file size is exported as 64 bits on multi-terabyte builds */
#ifdef NSC_MULTI_TERABYTE
	{SDBC_CDKSTAT_FILESIZE,		KSTAT_DATA_UINT64},
#else
	{SDBC_CDKSTAT_FILESIZE,		KSTAT_DATA_ULONG},
#endif
	{SDBC_CDKSTAT_NUMDIRTY,		KSTAT_DATA_ULONG},
	{SDBC_CDKSTAT_NUMIO,		KSTAT_DATA_ULONG},
	{SDBC_CDKSTAT_NUMFAIL,		KSTAT_DATA_ULONG},
	{SDBC_CDKSTAT_DESTAGED,		KSTAT_DATA_ULONG},
	{SDBC_CDKSTAT_WRCANCELNS,	KSTAT_DATA_ULONG},
	{SDBC_CDKSTAT_CDHINTS,		KSTAT_DATA_ULONG},
};
154 
#ifdef DEBUG
/*
 * dynmem kstat interface (DEBUG builds only)
 *
 * The kstat is created in _sdbc_stats_configure() with
 * dynmem_processing_dm as its ks_private data and is deleted in
 * _sdbc_stats_deconfigure().
 */
static kstat_t *sdbc_dynmem_kstat_dm;
static int simplect_dm;
static int sdbc_dynmem_kstat_update_dm(kstat_t *ksp, int rw);

/*
 * Layout of the dynmem named kstat.  The member order must stay in step
 * with the positional initializer of sdbc_dynmem_dm.
 */
typedef struct {
	kstat_named_t  ci_sdbc_monitor_dynmem;
	kstat_named_t  ci_sdbc_max_dyn_list;
	kstat_named_t  ci_sdbc_cache_aging_ct1;
	kstat_named_t  ci_sdbc_cache_aging_ct2;
	kstat_named_t  ci_sdbc_cache_aging_ct3;
	kstat_named_t  ci_sdbc_cache_aging_sec1;
	kstat_named_t  ci_sdbc_cache_aging_sec2;
	kstat_named_t  ci_sdbc_cache_aging_sec3;
	kstat_named_t  ci_sdbc_cache_aging_pcnt1;
	kstat_named_t  ci_sdbc_cache_aging_pcnt2;
	kstat_named_t  ci_sdbc_max_holds_pcnt;

	kstat_named_t  ci_sdbc_alloc_ct;
	kstat_named_t  ci_sdbc_dealloc_ct;
	kstat_named_t  ci_sdbc_history;
	kstat_named_t  ci_sdbc_nodatas;
	kstat_named_t  ci_sdbc_candidates;
	kstat_named_t  ci_sdbc_deallocs;
	kstat_named_t  ci_sdbc_hosts;
	kstat_named_t  ci_sdbc_pests;
	kstat_named_t  ci_sdbc_metas;
	kstat_named_t  ci_sdbc_holds;
	kstat_named_t  ci_sdbc_others;
	kstat_named_t  ci_sdbc_notavail;

	kstat_named_t  ci_sdbc_process_directive;

	kstat_named_t  ci_sdbc_simplect;
} sdbc_dynmem_dm_t;

/* kstat names and data types, in sdbc_dynmem_dm_t member order */
static sdbc_dynmem_dm_t sdbc_dynmem_dm = {
	{SDBC_DMKSTAT_MONITOR_DYNMEM,		KSTAT_DATA_ULONG},
	{SDBC_DMKSTAT_MAX_DYN_LIST,		KSTAT_DATA_ULONG},
	{SDBC_DMKSTAT_CACHE_AGING_CT1,		KSTAT_DATA_ULONG},
	{SDBC_DMKSTAT_CACHE_AGING_CT2,		KSTAT_DATA_ULONG},
	{SDBC_DMKSTAT_CACHE_AGING_CT3,		KSTAT_DATA_ULONG},
	{SDBC_DMKSTAT_CACHE_AGING_SEC1,		KSTAT_DATA_ULONG},
	{SDBC_DMKSTAT_CACHE_AGING_SEC2,		KSTAT_DATA_ULONG},
	{SDBC_DMKSTAT_CACHE_AGING_SEC3,		KSTAT_DATA_ULONG},
	{SDBC_DMKSTAT_CACHE_AGING_PCNT1,	KSTAT_DATA_ULONG},
	{SDBC_DMKSTAT_CACHE_AGING_PCNT2,	KSTAT_DATA_ULONG},
	{SDBC_DMKSTAT_MAX_HOLDS_PCNT,		KSTAT_DATA_ULONG},
	{SDBC_DMKSTAT_ALLOC_CNT,		KSTAT_DATA_ULONG},
	{SDBC_DMKSTAT_DEALLOC_CNT,		KSTAT_DATA_ULONG},
	{SDBC_DMKSTAT_HISTORY,			KSTAT_DATA_ULONG},
	{SDBC_DMKSTAT_NODATAS,			KSTAT_DATA_ULONG},
	{SDBC_DMKSTAT_CANDIDATES,		KSTAT_DATA_ULONG},
	{SDBC_DMKSTAT_DEALLOCS,			KSTAT_DATA_ULONG},
	{SDBC_DMKSTAT_HOSTS,			KSTAT_DATA_ULONG},
	{SDBC_DMKSTAT_PESTS,			KSTAT_DATA_ULONG},
	{SDBC_DMKSTAT_METAS,			KSTAT_DATA_ULONG},
	{SDBC_DMKSTAT_HOLDS,			KSTAT_DATA_ULONG},
	{SDBC_DMKSTAT_OTHERS,			KSTAT_DATA_ULONG},
	{SDBC_DMKSTAT_NOTAVAIL,			KSTAT_DATA_ULONG},
	{SDBC_DMKSTAT_PROCESS_DIRECTIVE,	KSTAT_DATA_ULONG},
	{SDBC_DMKSTAT_SIMPLECT,			KSTAT_DATA_ULONG}
};
#endif
222 
223 /* End of dynmem kstats */
224 
#ifdef DEBUG
/*
 * Allocated in sdbc_dmqueues_configure() as a max_dm_queues by
 * max_dm_queues array of ints.
 */
int *dmchainpull_table;  /* dmchain wastage stats */
#endif

/*
 * dynmem process vars
 */
extern _dm_process_vars_t dynmem_processing_dm;

/* metadata for volumes */
ss_voldata_t *_sdbc_gl_file_info;

/* ssc_maxfiles * sizeof (ss_voldata_t), set in _sdbc_gl_file_configure() */
size_t _sdbc_gl_file_info_size;

/* metadata for cache write blocks */
static ss_centry_info_t *_sdbc_gl_centry_info;

/* wblocks * sizeof(ss_centry_info_t) */
static size_t _sdbc_gl_centry_info_size;

static int _SD_DELAY_QUEUE = 1;

/*
 * Event counters.  Every counter from here down to sdbc_ra_none is
 * reset to zero by _sdbc_cache_configure().
 */
static int sdbc_allocb_inuse, sdbc_allocb_lost, sdbc_allocb_hit;
static int sdbc_allocb_pageio1, sdbc_allocb_pageio2;
static int sdbc_centry_hit, sdbc_centry_inuse, sdbc_centry_lost;
static int sdbc_dmchain_not_avail;
static int sdbc_allocb_deallocd;
static int sdbc_centry_deallocd;
static int sdbc_check_cot;
static int sdbc_ra_hash; /* 1-block read-ahead fails due to hash hit */
static int sdbc_ra_none; /* 1-block read-ahead fails due to "would block" */


/*
 * Set the following variable to 1 to enable pagelist io mutual
 * exclusion on all _sd_alloc_buf() operations.
 *
 * This is set to ON to prevent front end / back end races between new
 * NSC_WRTHRU io operations coming in through _sd_alloc_buf(), and
 * previously written data being flushed out to disk by the sdbc
 * flusher at the back end.
 * -- see bugtraq 4287564
 * -- Simon Crosland, Mon Nov  8 16:34:09 GMT 1999
 */
static int sdbc_pageio_always = 1;

/*
 * sdbc_use_dmchain selects between the dm chain queues and the single
 * lru queue in _sdbc_cache_configure() and _sdbc_cache_deconfigure().
 * sdbc_prefetch1 is forced back to 1 by every _sdbc_cache_configure().
 */
int sdbc_use_dmchain = 0; /* start time switch for dm chaining */
int sdbc_prefetch1 = 1;   /* do 1-block read-ahead */
/*
 * if sdbc_static_cache is 1 allocate all cache memory at startup.
 * deallocate only at shutdown.
 */
int sdbc_static_cache = 1;

#ifdef DEBUG
/*
 * Pagelist io mutual exclusion debug facility.
 */
#define	SDBC_PAGEIO_OFF		0	/* no debug */
#define	SDBC_PAGEIO_RDEV	1	/* force NSC_PAGEIO for specified dev */
#define	SDBC_PAGEIO_RAND	2	/* randomly force NSC_PAGEIO */
#define	SDBC_PAGEIO_ALL		3	/* always force NSC_PAGEIO */
static int sdbc_pageio_debug = SDBC_PAGEIO_OFF;
static dev_t sdbc_pageio_rdev = (dev_t)-1;
#endif
289 
290 /*
291  * INF SD cache global data
292  */
293 
/* sdbc_max_devs entries; allocated in _sdbc_cache_configure() */
_sd_cd_info_t	*_sd_cache_files;
/* allocated in _sdbc_stats_configure() */
_sd_stats_t   	*_sd_cache_stats;
kmutex_t	_sd_cache_lock;

/* hash table created by _sdbc_hash_configure() in _sdbc_mem_configure() */
_sd_hash_table_t	*_sd_htable;
/* the lru queue used when sdbc_use_dmchain is zero */
_sd_queue_t	_sd_lru_q;

/*
 * Cache control blocks, allocated in _SD_CCTL_GROUPS groups of
 * _sd_cctl_groupsz entries each by _sdbc_mem_configure().
 */
_sd_cctl_t	*_sd_cctl[_SD_CCTL_GROUPS];
int		_sd_cctl_groupsz;

_sd_net_t  _sd_net_config;

extern krwlock_t sdbc_queue_lock;

unsigned int _sd_node_hint;

#define	_SD_LRU_Q	(&_sd_lru_q)
/*
 * Cache geometry.  All four are set by _sdbc_cache_configure() and
 * cleared again by _sdbc_cache_deconfigure().
 */
int BLK_FBAS;		/* number of FBA's in a cache block */
int CACHE_BLOCK_SIZE;	/* size in bytes of a cache block */
int CBLOCKS;		/* number of cache blocks configured */
_sd_bitmap_t BLK_FBA_BITS;	/* _fba_bits[BLK_FBAS] */
/* prefetch event counters, reset to zero in _sdbc_cache_configure() */
static int sdbc_prefetch_valid_cnt;
static int sdbc_prefetch_busy_cnt;
static int sdbc_prefetch_trailing;
static int sdbc_prefetch_deallocd;
static int sdbc_prefetch_pageio1;
static int sdbc_prefetch_pageio2;
static int sdbc_prefetch_hit;
static int sdbc_prefetch_lost;
static int _sd_prefetch_opt = 1; /* 0 to disable & use _prefetch_sb_vec[] */
static nsc_vec_t _prefetch_sb_vec[_SD_MAX_BLKS + 1];
325 
/*
 * _fba_bits[n] is a mask with the low n bits set.  The table covers
 * n = 0..8, extended to n = 0..16 when _SD_8K_BLKSIZE is defined.
 * _sdbc_cache_configure() indexes it with BLK_FBAS to obtain
 * BLK_FBA_BITS.
 */
_sd_bitmap_t _fba_bits[] = {
	0x0000, 0x0001, 0x0003, 0x0007,
	0x000f,	0x001f, 0x003f, 0x007f,
	0x00ff,
#if defined(_SD_8K_BLKSIZE)
		0x01ff, 0x03ff, 0x07ff,
	0x0fff,	0x1fff, 0x3fff, 0x7fff,
	0xffff,
#endif
};
336 
337 
/*
 * Pool of _sd_ccsync_cnt lock/condvar pairs allocated in
 * _sdbc_mem_configure(); cache entries are assigned a pool slot by
 * block number modulo _sd_ccsync_cnt.
 */
static int _sd_ccsync_cnt = 256;
static _sd_cctl_sync_t *_sd_ccent_sync;

nsc_io_t *sdbc_io;

#ifdef _MULTI_DATAMODEL
/* allocated and freed alongside _sd_cache_stats */
_sd_stats32_t *_sd_cache_stats32 = NULL;
#endif


/* cmn_err() level: CE_PANIC in DEBUG builds, CE_WARN otherwise */
#ifdef DEBUG
int cmn_level = CE_PANIC;
#else
int cmn_level = CE_WARN;
#endif
353 
354 /*
355  * Forward declare all statics that are used before defined to enforce
356  * parameter checking
357  * Some (if not all) of these could be removed if the code were reordered
358  */
359 
360 static void _sdbc_stats_deconfigure(void);
361 static int _sdbc_stats_configure(int cblocks);
362 static int _sdbc_lruq_configure(_sd_queue_t *);
363 static void _sdbc_lruq_deconfigure(void);
364 static int _sdbc_mem_configure(int cblocks, spcs_s_info_t kstatus);
365 static void _sdbc_mem_deconfigure(int cblocks);
366 static void _sd_ins_queue(_sd_queue_t *, _sd_cctl_t *centry);
367 static int _sd_flush_cd(int cd);
368 static int _sd_check_buffer_alloc(int cd, nsc_off_t fba_pos, nsc_size_t fba_len,
369     _sd_buf_handle_t **hp);
370 static int _sd_doread(_sd_buf_handle_t *handle, _sd_cctl_t *cc_ent,
371     nsc_off_t fba_pos, nsc_size_t fba_len, int flag);
372 static void _sd_async_read_ea(blind_t xhandle, nsc_off_t fba_pos,
373     nsc_size_t fba_len, int error);
374 static void _sd_async_write_ea(blind_t xhandle, nsc_off_t fba_pos,
375     nsc_size_t fba_len, int error);
376 static void _sd_queue_write(_sd_buf_handle_t *handle, nsc_off_t fba_pos,
377     nsc_size_t fba_len);
378 static int _sd_remote_store(_sd_cctl_t *cc_ent, nsc_off_t fba_pos,
379     nsc_size_t fba_len);
380 static int _sd_copy_direct(_sd_buf_handle_t *handle1, _sd_buf_handle_t *handle2,
381     nsc_off_t fba_pos1, nsc_off_t fba_pos2, nsc_size_t fba_len);
382 static int _sd_sync_write(_sd_buf_handle_t *handle, nsc_off_t fba_pos,
383     nsc_size_t fba_len, int flag);
384 static int _sd_sync_write2(_sd_buf_handle_t *wr_handle, nsc_off_t wr_st_pos,
385     nsc_size_t fba_len, int flag, _sd_buf_handle_t *rd_handle,
386     nsc_off_t rd_st_pos);
387 static int sdbc_fd_attach_cd(blind_t xcd);
388 static int sdbc_fd_detach_cd(blind_t xcd);
389 static int sdbc_fd_flush_cd(blind_t xcd);
390 static int _sdbc_gl_centry_configure(spcs_s_info_t);
391 static int _sdbc_gl_file_configure(spcs_s_info_t);
392 static void _sdbc_gl_centry_deconfigure(void);
393 static void _sdbc_gl_file_deconfigure(void);
394 static int sdbc_doread_prefetch(_sd_cctl_t *cc_ent, nsc_off_t fba_pos,
395     nsc_size_t fba_len);
396 static _sd_bitmap_t update_dirty(_sd_cctl_t *cc_ent, sdbc_cblk_fba_t st_off,
397     sdbc_cblk_fba_t st_len);
398 static int _sd_prefetch_buf(int cd, nsc_off_t fba_pos, nsc_size_t fba_len,
399     int flag, _sd_buf_handle_t *handle, int locked);
400 
401 /* dynmem support */
402 static int _sd_setup_category_on_type(_sd_cctl_t *header);
403 static int _sd_setup_mem_chaining(_sd_cctl_t *header, int flag);
404 
405 static int sdbc_check_cctl_cot(_sd_cctl_t *);
406 
407 static int sdbc_dmqueues_configure();
408 static void sdbc_dmqueues_deconfigure();
409 static _sd_cctl_t *sdbc_get_dmchain(int, int *, int);
410 static int sdbc_dmchain_avail(_sd_cctl_t *);
411 void sdbc_requeue_dmchain(_sd_queue_t *, _sd_cctl_t *, int, int);
412 static void sdbc_ins_dmqueue_back(_sd_queue_t *, _sd_cctl_t *);
413 void sdbc_ins_dmqueue_front(_sd_queue_t *, _sd_cctl_t *);
414 void sdbc_remq_dmchain(_sd_queue_t *, _sd_cctl_t *);
415 static void sdbc_clear_dmchain(_sd_cctl_t *, _sd_cctl_t *);
416 void sdbc_requeue_head_dm_try(_sd_cctl_t *);
417 static _sd_cctl_t *sdbc_alloc_dmc(int, nsc_off_t, nsc_size_t, int *,
418     sdbc_allocbuf_t *, int);
419 static _sd_cctl_t *sdbc_alloc_lru(int, nsc_off_t, int *, int);
420 static _sd_cctl_t *sdbc_alloc_from_dmchain(int, nsc_off_t, sdbc_allocbuf_t *,
421     int);
422 static void sdbc_centry_init_dm(_sd_cctl_t *);
423 static int sdbc_centry_memalloc_dm(_sd_cctl_t *, int, int);
424 static void sdbc_centry_alloc_end(sdbc_allocbuf_t *);
425 
426 
427 
428 
429 /* _SD_DEBUG */
430 #if defined(_SD_DEBUG) || defined(DEBUG)
431 static int _sd_cctl_valid(_sd_cctl_t *);
432 #endif
433 
/*
 * Definition table naming the Attach, Detach and Flush entry points for
 * a cache descriptor.  Entries are written flat, three initializers per
 * row; the all-zero final row presumably terminates the list (confirm
 * against the nsc_def_t consumers).
 */
static
nsc_def_t _sdbc_fd_def[] = {
	"Attach",	(uintptr_t)sdbc_fd_attach_cd,	0,
	"Detach",	(uintptr_t)sdbc_fd_detach_cd,	0,
	"Flush",	(uintptr_t)sdbc_fd_flush_cd,	0,
	0,		0,				0
};
441 
442 
443 /*
444  * _sdbc_cache_configure - initialize cache blocks, queues etc.
445  *
446  * ARGUMENTS:
447  * 	cblocks  - Number of cache blocks
448  *
449  * RETURNS:
450  *	0 on success.
451  *	SDBC_EENABLEFAIL or SDBC_EMEMCONFIG on failure.
452  *
453  */
454 
455 
456 
457 int
458 _sdbc_cache_configure(int cblocks, spcs_s_info_t kstatus)
459 {
460 	CBLOCKS = cblocks;
461 
462 	_sd_cache_files = (_sd_cd_info_t *)
463 	    kmem_zalloc(sdbc_max_devs * sizeof (_sd_cd_info_t),
464 	    KM_SLEEP);
465 
466 	if (_sdbc_stats_configure(cblocks))
467 		return (SDBC_EENABLEFAIL);
468 
469 	if (sdbc_use_dmchain) {
470 		if (sdbc_dmqueues_configure())
471 			return (SDBC_EENABLEFAIL);
472 	} else {
473 		if (_sdbc_lruq_configure(_SD_LRU_Q))
474 			return (SDBC_EENABLEFAIL);
475 	}
476 
477 
478 	if (_sdbc_mem_configure(cblocks, kstatus))
479 		return (SDBC_EMEMCONFIG);
480 
481 	CACHE_BLOCK_SIZE = BLK_SIZE(1);
482 	BLK_FBAS = FBA_NUM(CACHE_BLOCK_SIZE);
483 	BLK_FBA_BITS = _fba_bits[BLK_FBAS];
484 
485 	sdbc_allocb_pageio1 = 0;
486 	sdbc_allocb_pageio2 = 0;
487 	sdbc_allocb_hit = 0;
488 	sdbc_allocb_inuse = 0;
489 	sdbc_allocb_lost = 0;
490 	sdbc_centry_inuse = 0;
491 	sdbc_centry_lost = 0;
492 	sdbc_centry_hit = 0;
493 	sdbc_centry_deallocd = 0;
494 	sdbc_dmchain_not_avail = 0;
495 	sdbc_allocb_deallocd = 0;
496 
497 	sdbc_prefetch_valid_cnt = 0;
498 	sdbc_prefetch_busy_cnt = 0;
499 	sdbc_prefetch_trailing = 0;
500 	sdbc_prefetch_deallocd = 0;
501 	sdbc_prefetch_pageio1 = 0;
502 	sdbc_prefetch_pageio2 = 0;
503 	sdbc_prefetch_hit = 0;
504 	sdbc_prefetch_lost = 0;
505 
506 	sdbc_check_cot = 0;
507 	sdbc_prefetch1 = 1;
508 	sdbc_ra_hash = 0;
509 	sdbc_ra_none = 0;
510 
511 	return (0);
512 }
513 
514 /*
515  * _sdbc_cache_deconfigure - cache is being deconfigured. Release any
516  * memory that we acquired during the configuration process and return
517  * to the unconfigured state.
518  *
519  *  NOTE: all users of the cache should be inactive at this point,
520  *  i.e. we are unregistered from sd and all cache daemons/threads are
521  *  gone.
522  *
523  */
524 void
525 _sdbc_cache_deconfigure(void)
526 {
527 	/* CCIO shutdown must happen before memory is free'd */
528 
529 	if (_sd_cache_files) {
530 		kmem_free(_sd_cache_files,
531 		    sdbc_max_devs * sizeof (_sd_cd_info_t));
532 		_sd_cache_files = (_sd_cd_info_t *)NULL;
533 	}
534 
535 
536 	BLK_FBA_BITS = 0;
537 	BLK_FBAS = 0;
538 	CACHE_BLOCK_SIZE = 0;
539 	_sdbc_mem_deconfigure(CBLOCKS);
540 	_sdbc_gl_centry_deconfigure();
541 	_sdbc_gl_file_deconfigure();
542 
543 	if (sdbc_use_dmchain)
544 		sdbc_dmqueues_deconfigure();
545 	else
546 		_sdbc_lruq_deconfigure();
547 	_sdbc_stats_deconfigure();
548 
549 	CBLOCKS = 0;
550 }
551 
552 
553 /*
554  * _sdbc_stats_deconfigure - cache is being deconfigured turn off
555  * stats. This could seemingly do more but we leave most of the
556  * data intact until cache is configured again.
557  *
558  */
559 static void
560 _sdbc_stats_deconfigure(void)
561 {
562 	int i;
563 
564 #ifdef DEBUG
565 	if (sdbc_dynmem_kstat_dm) {
566 		kstat_delete(sdbc_dynmem_kstat_dm);
567 		sdbc_dynmem_kstat_dm  = NULL;
568 	}
569 #endif
570 
571 	if (sdbc_global_stats_kstat) {
572 		kstat_delete(sdbc_global_stats_kstat);
573 		sdbc_global_stats_kstat  = NULL;
574 	}
575 
576 	if (sdbc_cd_kstats) {
577 		for (i = 0; i < sdbc_max_devs; i++) {
578 			if (sdbc_cd_kstats[i]) {
579 				kstat_delete(sdbc_cd_kstats[i]);
580 				sdbc_cd_kstats[i] = NULL;
581 			}
582 		}
583 		kmem_free(sdbc_cd_kstats, sizeof (kstat_t *) * sdbc_max_devs);
584 		sdbc_cd_kstats = NULL;
585 	}
586 
587 	if (sdbc_global_io_kstat) {
588 		kstat_delete(sdbc_global_io_kstat);
589 		mutex_destroy(&sdbc_global_io_kstat_mutex);
590 		sdbc_global_io_kstat = NULL;
591 	}
592 
593 	if (sdbc_cd_io_kstats) {
594 		for (i = 0; i < sdbc_max_devs; i++) {
595 			if (sdbc_cd_io_kstats[i]) {
596 				kstat_delete(sdbc_cd_io_kstats[i]);
597 				sdbc_cd_io_kstats[i] = NULL;
598 			}
599 		}
600 		kmem_free(sdbc_cd_io_kstats, sizeof (kstat_t *) *
601 		    sdbc_max_devs);
602 		sdbc_cd_io_kstats = NULL;
603 	}
604 
605 	if (sdbc_cd_io_kstats_mutexes) {
606 	/* mutexes are already destroyed in cd_kstat_remove() */
607 		kmem_free(sdbc_cd_io_kstats_mutexes,
608 		    sizeof (kmutex_t) * sdbc_max_devs);
609 		sdbc_cd_io_kstats_mutexes = NULL;
610 	}
611 
612 
613 	if (_sd_cache_stats) {
614 		kmem_free(_sd_cache_stats,
615 		    sizeof (_sd_stats_t) +
616 		    (sdbc_max_devs - 1) * sizeof (_sd_shared_t));
617 		_sd_cache_stats = NULL;
618 	}
619 #ifdef _MULTI_DATAMODEL
620 	if (_sd_cache_stats32) {
621 		kmem_free(_sd_cache_stats32, sizeof (_sd_stats32_t) +
622 		    (sdbc_max_devs - 1) * sizeof (_sd_shared_t));
623 		_sd_cache_stats32 = NULL;
624 	}
625 #endif
626 }
627 
628 static int
629 _sdbc_stats_configure(int cblocks)
630 {
631 
632 	_sd_cache_stats = kmem_zalloc(sizeof (_sd_stats_t) +
633 	    (sdbc_max_devs - 1) * sizeof (_sd_shared_t), KM_SLEEP);
634 	_sd_cache_stats->st_blksize = (int)BLK_SIZE(1);
635 	_sd_cache_stats->st_cachesize = cblocks * BLK_SIZE(1);
636 	_sd_cache_stats->st_numblocks = cblocks;
637 	_sd_cache_stats->st_wrcancelns = 0;
638 	_sd_cache_stats->st_destaged = 0;
639 #ifdef _MULTI_DATAMODEL
640 	_sd_cache_stats32 = kmem_zalloc(sizeof (_sd_stats32_t) +
641 	    (sdbc_max_devs - 1) * sizeof (_sd_shared_t), KM_SLEEP);
642 #endif
643 
644 	/* kstat implementation - global stats */
645 	sdbc_global_stats_kstat = kstat_create(SDBC_KSTAT_MODULE, 0,
646 	    SDBC_KSTAT_GSTATS, SDBC_KSTAT_CLASS, KSTAT_TYPE_NAMED,
647 	    sizeof (sdbc_global_stats)/sizeof (kstat_named_t),
648 	    KSTAT_FLAG_VIRTUAL|KSTAT_FLAG_WRITABLE);
649 
650 	if (sdbc_global_stats_kstat != NULL) {
651 		sdbc_global_stats_kstat->ks_data = &sdbc_global_stats;
652 		sdbc_global_stats_kstat->ks_update = sdbc_global_stats_update;
653 		sdbc_global_stats_kstat->ks_private = _sd_cache_stats;
654 		kstat_install(sdbc_global_stats_kstat);
655 	} else {
656 		cmn_err(CE_WARN, "!sdbc: gstats kstat failed");
657 	}
658 
659 	/* global I/O kstats */
660 	sdbc_global_io_kstat = kstat_create(SDBC_KSTAT_MODULE, 0,
661 	    SDBC_IOKSTAT_GSTATS, "disk", KSTAT_TYPE_IO, 1, 0);
662 
663 	if (sdbc_global_io_kstat) {
664 		mutex_init(&sdbc_global_io_kstat_mutex, NULL, MUTEX_DRIVER,
665 		    NULL);
666 		sdbc_global_io_kstat->ks_lock =
667 		    &sdbc_global_io_kstat_mutex;
668 		kstat_install(sdbc_global_io_kstat);
669 	}
670 
671 	/*
672 	 * kstat implementation - cd stats
673 	 * NOTE: one kstat instance for each open cache descriptor
674 	 */
675 	sdbc_cd_kstats = kmem_zalloc(sizeof (kstat_t *) * sdbc_max_devs,
676 	    KM_SLEEP);
677 
678 	/*
679 	 * kstat implementation - i/o kstats per cache descriptor
680 	 * NOTE: one I/O kstat instance for each cd
681 	 */
682 	sdbc_cd_io_kstats = kmem_zalloc(sizeof (kstat_t *) * sdbc_max_devs,
683 	    KM_SLEEP);
684 
685 	sdbc_cd_io_kstats_mutexes = kmem_zalloc(sizeof (kmutex_t) *
686 	    sdbc_max_devs, KM_SLEEP);
687 
688 #ifdef DEBUG
689 	/* kstat implementation - dynamic memory stats */
690 	sdbc_dynmem_kstat_dm = kstat_create(SDBC_KSTAT_MODULE, 0,
691 	    SDBC_KSTAT_DYNMEM, SDBC_KSTAT_CLASS, KSTAT_TYPE_NAMED,
692 	    sizeof (sdbc_dynmem_dm)/sizeof (kstat_named_t),
693 	    KSTAT_FLAG_VIRTUAL|KSTAT_FLAG_WRITABLE);
694 
695 	if (sdbc_dynmem_kstat_dm != NULL) {
696 		sdbc_dynmem_kstat_dm->ks_data = &sdbc_dynmem_dm;
697 		sdbc_dynmem_kstat_dm->ks_update = sdbc_dynmem_kstat_update_dm;
698 		sdbc_dynmem_kstat_dm->ks_private = &dynmem_processing_dm;
699 		kstat_install(sdbc_dynmem_kstat_dm);
700 	} else {
701 		cmn_err(CE_WARN, "!sdbc: dynmem kstat failed");
702 	}
703 #endif
704 
705 	return (0);
706 }
707 
708 /*
709  * sdbc_dmqueues_configure()
710  * initialize the queues of dynamic memory chains.
711  */
712 
713 _sd_queue_t *sdbc_dm_queues;
714 static int max_dm_queues;
715 
716 
717 static int
718 sdbc_dmqueues_configure()
719 {
720 	int i;
721 
722 	/*
723 	 * CAUTION! this code depends on max_dyn_list not changing
724 	 * if it does change behavior may be incorrect, as cc_alloc_size_dm
725 	 * depends on max_dyn_list and indexes to dmqueues are derived from
726 	 * cc_alloc_size_dm.
727 	 * see _sd_setup_category_on_type() and _sd_dealloc_dm()
728 	 * TODO: prevent max_dyn_list from on-the-fly modification (easy) or
729 	 * allow for on-the-fly changes to number of dm queues (hard).
730 	 */
731 	max_dm_queues = dynmem_processing_dm.max_dyn_list;
732 
733 	++max_dm_queues; /* need a "0" queue for centrys with no memory */
734 
735 	sdbc_dm_queues = (_sd_queue_t *)
736 	    kmem_zalloc(max_dm_queues * sizeof (_sd_queue_t), KM_SLEEP);
737 
738 #ifdef DEBUG
739 	dmchainpull_table = (int *)kmem_zalloc(max_dm_queues *
740 	    max_dm_queues * sizeof (int), KM_SLEEP);
741 #endif
742 
743 	for (i = 0; i < max_dm_queues; ++i) {
744 		(void) _sdbc_lruq_configure(&sdbc_dm_queues[i]);
745 		sdbc_dm_queues[i].sq_dmchain_cblocks = i;
746 	}
747 
748 	return (0);
749 }
750 
751 static void
752 sdbc_dmqueues_deconfigure()
753 {
754 	/* CAUTION! this code depends on max_dyn_list not changing */
755 
756 	if (sdbc_dm_queues)
757 		kmem_free(sdbc_dm_queues, max_dm_queues * sizeof (_sd_queue_t));
758 	sdbc_dm_queues = NULL;
759 	max_dm_queues = 0;
760 }
761 
/*
 * GOOD_LRUSIZE - true when the entry count of queue q lies in the range
 * 0..CBLOCKS inclusive.  Both bounds must hold, hence the conjunction;
 * with a disjunction the check could never fail.
 */
#define	GOOD_LRUSIZE(q) (((q)->sq_inq >= 0) && ((q)->sq_inq <= CBLOCKS))
763 
764 /*
765  * _sdbc_lruq_configure - initialize the lru queue
766  *
767  * ARGUMENTS: NONE
768  * RETURNS:   0
769  *
770  */
771 
772 static int
773 _sdbc_lruq_configure(_sd_queue_t *_sd_lru)
774 {
775 
776 	_sd_lru->sq_inq = 0;
777 
778 	mutex_init(&_sd_lru->sq_qlock, NULL, MUTEX_DRIVER, NULL);
779 
780 	_sd_lru->sq_qhead.cc_next = _sd_lru->sq_qhead.cc_prev
781 	    = &(_sd_lru->sq_qhead);
782 	return (0);
783 }
784 
785 /*
786  * _sdbc_lruq_deconfigure - deconfigure the lru queue
787  *
788  * ARGUMENTS: NONE
789  *
790  */
791 
792 static void
793 _sdbc_lruq_deconfigure(void)
794 {
795 	_sd_queue_t *_sd_lru;
796 
797 	_sd_lru = _SD_LRU_Q;
798 
799 	mutex_destroy(&_sd_lru->sq_qlock);
800 	bzero(_sd_lru, sizeof (_sd_queue_t));
801 
802 }
803 
804 /*
805  * _sdbc_mem_configure - initialize the cache memory.
806  *		Create and initialize the hash table.
807  *		Create cache control blocks and fill them with relevent
808  *		information and enqueue onto the lru queue.
809  *		Initialize the Write control blocks (blocks that contain
810  *		information as to where the data will be mirrored)
811  *		Initialize the Fault tolerant blocks (blocks that contain
812  *		information about the mirror nodes dirty writes)
813  *
814  * ARGUMENTS:
815  *	cblocks - Number of cache blocks.
816  * RETURNS:   0
817  *
818  */
819 static int
820 _sdbc_mem_configure(int cblocks, spcs_s_info_t kstatus)
821 {
822 	int num_blks, i, blk;
823 	_sd_cctl_t *centry;
824 	_sd_net_t *netc;
825 	_sd_cctl_t *prev_entry_dm, *first_entry_dm;
826 
827 	if ((_sd_htable = _sdbc_hash_configure(cblocks)) == NULL) {
828 		spcs_s_add(kstatus, SDBC_ENOHASH);
829 		return (-1);
830 	}
831 
832 	_sd_cctl_groupsz = (cblocks / _SD_CCTL_GROUPS) +
833 	    ((cblocks % _SD_CCTL_GROUPS) != 0);
834 
835 	for (i = 0; i < _SD_CCTL_GROUPS; i++) {
836 		_sd_cctl[i] = (_sd_cctl_t *)
837 		    nsc_kmem_zalloc(_sd_cctl_groupsz * sizeof (_sd_cctl_t),
838 		    KM_SLEEP, sdbc_cache_mem);
839 
840 		if (_sd_cctl[i] == NULL) {
841 			spcs_s_add(kstatus, SDBC_ENOCB);
842 			return (-1);
843 		}
844 	}
845 
846 	_sd_ccent_sync = (_sd_cctl_sync_t *)
847 	    nsc_kmem_zalloc(_sd_ccsync_cnt * sizeof (_sd_cctl_sync_t),
848 	    KM_SLEEP, sdbc_local_mem);
849 
850 	if (_sd_ccent_sync == NULL) {
851 		spcs_s_add(kstatus, SDBC_ENOCCTL);
852 		return (-1);
853 	}
854 
855 	for (i = 0; i < _sd_ccsync_cnt; i++) {
856 		mutex_init(&_sd_ccent_sync[i]._cc_lock, NULL, MUTEX_DRIVER,
857 		    NULL);
858 		cv_init(&_sd_ccent_sync[i]._cc_blkcv, NULL, CV_DRIVER, NULL);
859 	}
860 
861 	blk = 0;
862 
863 	netc = &_sd_net_config;
864 
865 	num_blks = (netc->sn_cpages * (int)netc->sn_psize)/BLK_SIZE(1);
866 
867 	prev_entry_dm = 0;
868 	first_entry_dm = 0;
869 	for (i = 0; i < num_blks; i++, blk++) {
870 		centry = _sd_cctl[(blk/_sd_cctl_groupsz)] +
871 		    (blk%_sd_cctl_groupsz);
872 		centry->cc_sync = &_sd_ccent_sync[blk % _sd_ccsync_cnt];
873 		centry->cc_next = centry->cc_prev = NULL;
874 		centry->cc_dirty_next = centry->cc_dirty_link = NULL;
875 		centry->cc_await_use = centry->cc_await_page = 0;
876 		centry->cc_inuse = centry->cc_pageio = 0;
877 		centry->cc_flag = 0;
878 		centry->cc_iocount = 0;
879 		centry->cc_valid = 0;
880 
881 		if (!first_entry_dm)
882 			first_entry_dm = centry;
883 		if (prev_entry_dm)
884 			prev_entry_dm->cc_link_list_dm = centry;
885 		prev_entry_dm = centry;
886 		centry->cc_link_list_dm = first_entry_dm;
887 		centry->cc_data = 0;
888 		centry->cc_write = NULL;
889 		centry->cc_dirty = 0;
890 
891 		{
892 		_sd_queue_t *q;
893 			if (sdbc_use_dmchain) {
894 				q = &sdbc_dm_queues[0];
895 				centry->cc_cblocks = 0;
896 			} else
897 				q = _SD_LRU_Q;
898 
899 			_sd_ins_queue(q, centry);
900 		}
901 
902 	}
903 
904 	if (_sdbc_gl_centry_configure(kstatus) != 0)
905 		return (-1);
906 
907 	if (_sdbc_gl_file_configure(kstatus) != 0)
908 		return (-1);
909 
910 	return (0);
911 }
912 
913 /*
914  * _sdbc_gl_file_configure()
915  * 	allocate and initialize space for the global filename data.
916  *
917  */
918 static int
919 _sdbc_gl_file_configure(spcs_s_info_t kstatus)
920 {
921 	ss_voldata_t *fileinfo;
922 	ss_voldata_t tempfinfo;
923 	ss_vdir_t vdir;
924 	ss_vdirkey_t key;
925 	int err = 0;
926 
927 	_sdbc_gl_file_info_size = safestore_config.ssc_maxfiles *
928 	    sizeof (ss_voldata_t);
929 
930 	if ((_sdbc_gl_file_info = kmem_zalloc(_sdbc_gl_file_info_size,
931 	    KM_NOSLEEP)) == NULL) {
932 		spcs_s_add(kstatus, SDBC_ENOSFNV);
933 		return (-1);
934 	}
935 
936 	/* setup the key to get a directory stream of all volumes */
937 	key.vk_type  = CDIR_ALL;
938 
939 	fileinfo = _sdbc_gl_file_info;
940 
941 	/*
942 	 * if coming up after a crash, "refresh" the host
943 	 * memory copy from safestore.
944 	 */
945 	if (_sdbc_warm_start()) {
946 
947 		if (SSOP_GETVDIR(sdbc_safestore, &key, &vdir)) {
948 			cmn_err(CE_WARN, "!sdbc(_sdbc_gl_file_configure): "
949 			    "cannot read safestore");
950 			return (-1);
951 		}
952 
953 
954 		/*
955 		 * cycle through the vdir getting volume data
956 		 * and volume tokens
957 		 */
958 
959 		while ((err = SSOP_GETVDIRENT(sdbc_safestore, &vdir, fileinfo))
960 		    == SS_OK) {
961 			++fileinfo;
962 		}
963 
964 		if (err != SS_EOF) {
965 			/*
966 			 * fail to configure since
967 			 * recovery is not possible.
968 			 */
969 			spcs_s_add(kstatus, SDBC_ENOREFRESH);
970 			return (-1);
971 		}
972 
973 	} else { /* normal initialization, not a warm start */
974 
975 		/*
976 		 * if this fails, continue: cache will start
977 		 * in writethru mode
978 		 */
979 
980 		if (SSOP_GETVDIR(sdbc_safestore, &key, &vdir)) {
981 			cmn_err(CE_WARN, "!sdbc(_sdbc_gl_file_configure): "
982 			    "cannot read safestore");
983 			return (-1);
984 		}
985 
986 		/*
987 		 * cycle through the vdir getting just the volume tokens
988 		 * and initializing volume entries
989 		 */
990 
991 		while ((err = SSOP_GETVDIRENT(sdbc_safestore, &vdir,
992 		    &tempfinfo)) == 0) {
993 			/*
994 			 * initialize the host memory copy of the
995 			 * global file region.  this means setting the
996 			 * _pinned and _attached fields to _SD_NO_HOST
997 			 * because the default of zero conflicts with
998 			 * the min nodeid of zero.
999 			 */
1000 			fileinfo->sv_vol = tempfinfo.sv_vol;
1001 			fileinfo->sv_pinned = _SD_NO_HOST;
1002 			fileinfo->sv_attached = _SD_NO_HOST;
1003 			fileinfo->sv_cd = _SD_NO_CD;
1004 
1005 			/* initialize the directory entry */
1006 			if ((err = SSOP_SETVOL(sdbc_safestore, fileinfo))
1007 			    == SS_ERR) {
1008 				cmn_err(CE_WARN,
1009 				    "!sdbc(_sdbc_gl_file_configure): "
1010 				    "volume entry write failure %p",
1011 				    (void *)fileinfo->sv_vol);
1012 				break;
1013 			}
1014 
1015 			++fileinfo;
1016 		}
1017 
1018 		/* coming up clean, continue in w-t mode */
1019 		if (err != SS_EOF)
1020 			cmn_err(CE_WARN, "!sdbc(_sdbc_gl_file_configure) "
1021 			    "unable to init safe store volinfo");
1022 	}
1023 
1024 	return (0);
1025 }
1026 
1027 static void
1028 _sdbc_gl_centry_deconfigure(void)
1029 {
1030 	if (_sdbc_gl_centry_info)
1031 		kmem_free(_sdbc_gl_centry_info, _sdbc_gl_centry_info_size);
1032 
1033 	_sdbc_gl_centry_info = NULL;
1034 	_sdbc_gl_centry_info_size = 0;
1035 }
1036 
1037 static int
1038 _sdbc_gl_centry_configure(spcs_s_info_t kstatus)
1039 {
1040 
1041 	int wblocks;
1042 	ss_centry_info_t *cinfo;
1043 	ss_cdirkey_t key;
1044 	ss_cdir_t cdir;
1045 	int err = 0;
1046 
1047 
1048 	wblocks = safestore_config.ssc_wsize / BLK_SIZE(1);
1049 	_sdbc_gl_centry_info_size = sizeof (ss_centry_info_t) * wblocks;
1050 
1051 	if ((_sdbc_gl_centry_info = kmem_zalloc(_sdbc_gl_centry_info_size,
1052 	    KM_NOSLEEP)) == NULL) {
1053 		cmn_err(CE_WARN, "!sdbc(_sdbc_gl_centry_configure) "
1054 		    "alloc failed for gl_centry_info region");
1055 
1056 		_sdbc_gl_centry_deconfigure();
1057 		return (-1);
1058 	}
1059 
1060 	/*
1061 	 * synchronize the centry info area with safe store
1062 	 */
1063 
1064 	/* setup the key to get a directory stream of all centrys */
1065 	key.ck_type  = CDIR_ALL;
1066 
1067 	cinfo = _sdbc_gl_centry_info;
1068 
1069 	if (_sdbc_warm_start()) {
1070 
1071 		if (SSOP_GETCDIR(sdbc_safestore, &key, &cdir)) {
1072 			cmn_err(CE_WARN, "!sdbc(_sdbc_gl_centry_configure): "
1073 			    "cannot read safestore");
1074 			return (-1);
1075 		}
1076 
1077 
1078 		/*
1079 		 * cycle through the cdir getting resource
1080 		 * tokens and reading centrys
1081 		 */
1082 
1083 		while ((err = SSOP_GETCDIRENT(sdbc_safestore, &cdir, cinfo))
1084 		    == 0) {
1085 			++cinfo;
1086 		}
1087 
1088 		if (err != SS_EOF) {
1089 			/*
1090 			 * fail to configure since
1091 			 * recovery is not possible.
1092 			 */
1093 			_sdbc_gl_centry_deconfigure();
1094 			spcs_s_add(kstatus, SDBC_EGLDMAFAIL);
1095 			return (-1);
1096 		}
1097 
1098 	} else {
1099 
1100 		if (SSOP_GETCDIR(sdbc_safestore, &key, &cdir)) {
1101 			cmn_err(CE_WARN, "!sdbc(_sdbc_gl_centry_configure): "
1102 			    "cannot read safestore");
1103 			return (-1);
1104 		}
1105 
1106 		/*
1107 		 * cycle through the cdir getting resource
1108 		 * tokens and initializing centrys
1109 		 */
1110 
1111 		while ((err = SSOP_GETCDIRENT(sdbc_safestore, &cdir, cinfo))
1112 		    == 0) {
1113 			cinfo->sc_cd = -1;
1114 			cinfo->sc_fpos = -1;
1115 
1116 			if ((err = SSOP_SETCENTRY(sdbc_safestore, cinfo))
1117 			    == SS_ERR) {
1118 				cmn_err(CE_WARN,
1119 				    "!sdbc(_sdbc_gl_centry_configure): "
1120 				    "cache entry write failure %p",
1121 				    (void *)cinfo->sc_res);
1122 				break;
1123 			}
1124 
1125 			++cinfo;
1126 		}
1127 
1128 		/* coming up clean, continue in w-t mode */
1129 		if (err != SS_EOF) {
1130 			cmn_err(CE_WARN, "!sdbc(sdbc_gl_centry_configure) "
1131 			    "_sdbc_gl_centry_info initialization failed");
1132 		}
1133 	}
1134 
1135 	return (0);
1136 }
1137 
1138 
1139 static void
1140 _sdbc_gl_file_deconfigure(void)
1141 {
1142 
1143 	if (_sdbc_gl_file_info)
1144 		kmem_free(_sdbc_gl_file_info, _sdbc_gl_file_info_size);
1145 
1146 	_sdbc_gl_file_info = NULL;
1147 
1148 	_sdbc_gl_file_info_size = 0;
1149 }
1150 
1151 
1152 /*
1153  * _sdbc_mem_deconfigure - deconfigure the cache memory.
1154  * Release any memory/locks/sv's acquired during _sdbc_mem_configure.
1155  *
1156  * ARGUMENTS:
1157  *	cblocks - Number of cache blocks.
1158  *
1159  */
1160 /* ARGSUSED */
1161 static void
1162 _sdbc_mem_deconfigure(int cblocks)
1163 {
1164 	int i;
1165 
1166 	if (_sd_ccent_sync) {
1167 		for (i = 0; i < _sd_ccsync_cnt; i++) {
1168 			mutex_destroy(&_sd_ccent_sync[i]._cc_lock);
1169 			cv_destroy(&_sd_ccent_sync[i]._cc_blkcv);
1170 		}
1171 		nsc_kmem_free(_sd_ccent_sync,
1172 		    _sd_ccsync_cnt * sizeof (_sd_cctl_sync_t));
1173 	}
1174 	_sd_ccent_sync = NULL;
1175 
1176 	for (i = 0; i < _SD_CCTL_GROUPS; i++) {
1177 		if (_sd_cctl[i] != NULL) {
1178 			nsc_kmem_free(_sd_cctl[i],
1179 			    _sd_cctl_groupsz * sizeof (_sd_cctl_t));
1180 			_sd_cctl[i] = NULL;
1181 		}
1182 	}
1183 	_sd_cctl_groupsz = 0;
1184 
1185 	_sdbc_hash_deconfigure(_sd_htable);
1186 	_sd_htable = NULL;
1187 
1188 }
1189 
1190 
#if defined(_SD_DEBUG) || defined(DEBUG)
/*
 * _sd_cctl_valid - debug helper: report whether addr falls inside
 * one of the _sd_cctl[] groups.
 *
 * RETURNS:
 *	1 if addr is within [group start, group start + _sd_cctl_groupsz)
 *	for some group, 0 otherwise.
 */
static int
_sd_cctl_valid(_sd_cctl_t *addr)
{
	int grp;

	for (grp = 0; grp < _SD_CCTL_GROUPS; grp++) {
		_sd_cctl_t *limit = _sd_cctl[grp] + _sd_cctl_groupsz;

		if (addr >= _sd_cctl[grp] && addr < limit)
			return (1);
	}

	return (0);
}
#endif
1210 
1211 
1212 /*
1213  * _sd_ins_queue - insert centry into LRU queue
1214  * (during initialization, locking not required)
1215  */
1216 static void
1217 _sd_ins_queue(_sd_queue_t *q, _sd_cctl_t *centry)
1218 {
1219 	_sd_cctl_t *q_head;
1220 
1221 	ASSERT(_sd_cctl_valid(centry));
1222 
1223 	q_head = &q->sq_qhead;
1224 	centry->cc_prev = q_head;
1225 	centry->cc_next = q_head->cc_next;
1226 	q_head->cc_next->cc_prev = centry;
1227 	q_head->cc_next = centry;
1228 	q->sq_inq++;
1229 
1230 	ASSERT(GOOD_LRUSIZE(q));
1231 }
1232 
1233 
1234 
1235 void
1236 _sd_requeue(_sd_cctl_t *centry)
1237 {
1238 	_sd_queue_t *q = _SD_LRU_Q;
1239 
1240 	/* was FAST */
1241 	mutex_enter(&q->sq_qlock);
1242 #if defined(_SD_DEBUG)
1243 	if (1) {
1244 		_sd_cctl_t *cp, *cn, *qp;
1245 		cp = centry->cc_prev;
1246 		cn = centry->cc_next;
1247 		qp = (q->sq_qhead).cc_prev;
1248 		if (!_sd_cctl_valid(centry) ||
1249 		    (cp !=  &(q->sq_qhead) && !_sd_cctl_valid(cp)) ||
1250 		    (cn !=  &(q->sq_qhead) && !_sd_cctl_valid(cn)) ||
1251 		    !_sd_cctl_valid(qp))
1252 			cmn_err(CE_PANIC,
1253 			    "_sd_requeue %x prev %x next %x qp %x",
1254 			    centry, cp, cn, qp);
1255 	}
1256 #endif
1257 	centry->cc_prev->cc_next = centry->cc_next;
1258 	centry->cc_next->cc_prev = centry->cc_prev;
1259 	centry->cc_next = &(q->sq_qhead);
1260 	centry->cc_prev = q->sq_qhead.cc_prev;
1261 	q->sq_qhead.cc_prev->cc_next = centry;
1262 	q->sq_qhead.cc_prev = centry;
1263 	centry->cc_seq = q->sq_seq++;
1264 	/* was FAST */
1265 	mutex_exit(&q->sq_qlock);
1266 	(q->sq_req_stat)++;
1267 
1268 }
1269 
1270 void
1271 _sd_requeue_head(_sd_cctl_t *centry)
1272 {
1273 	_sd_queue_t *q = _SD_LRU_Q;
1274 
1275 	/* was FAST */
1276 	mutex_enter(&q->sq_qlock);
1277 #if defined(_SD_DEBUG)
1278 	if (1) {
1279 		_sd_cctl_t *cp, *cn, *qn;
1280 		cp = centry->cc_prev;
1281 		cn = centry->cc_next;
1282 		qn = (q->sq_qhead).cc_prev;
1283 		if (!_sd_cctl_valid(centry) ||
1284 		    (cp != &(q->sq_qhead) && !_sd_cctl_valid(cp)) ||
1285 		    (cn != &(q->sq_qhead) && !_sd_cctl_valid(cn)) ||
1286 		    !_sd_cctl_valid(qn))
1287 			cmn_err(CE_PANIC,
1288 			    "_sd_requeue_head %x prev %x next %x qn %x",
1289 			    centry, cp, cn, qn);
1290 	}
1291 #endif
1292 	centry->cc_prev->cc_next = centry->cc_next;
1293 	centry->cc_next->cc_prev = centry->cc_prev;
1294 	centry->cc_prev = &(q->sq_qhead);
1295 	centry->cc_next = q->sq_qhead.cc_next;
1296 	q->sq_qhead.cc_next->cc_prev = centry;
1297 	q->sq_qhead.cc_next = centry;
1298 	centry->cc_seq = q->sq_seq++;
1299 	centry->cc_flag &= ~CC_QHEAD;
1300 	/* was FAST */
1301 	mutex_exit(&q->sq_qlock);
1302 }
1303 
1304 
1305 
1306 /*
1307  * _sd_open -   Open a file.
1308  *
1309  * ARGUMENTS:
1310  *	filename -  Name of the file to be opened.
1311  *	flag	-  Flag associated with open.
1312  *			(currently used to determine a ckd device)
1313  * RETURNS:
1314  *	cd - the cache descriptor.
1315  */
1316 
1317 int
1318 _sd_open(char *filename, int flag)
1319 {
1320 	int cd;
1321 
1322 	if (!_sd_cache_initialized) {
1323 		cmn_err(CE_WARN, "!sdbc(_sd_open) cache not initialized");
1324 		return (-EINVAL);
1325 	}
1326 	cd = _sd_open_cd(filename, -1, flag);
1327 	SDTRACE(SDF_OPEN, (cd < 0) ? SDT_INV_CD : cd, 0, SDT_INV_BL, 0, cd);
1328 
1329 	return (cd);
1330 }
1331 
1332 
1333 static int
1334 _sd_open_io(char *filename, int flag, blind_t *cdp, nsc_iodev_t *iodev)
1335 {
1336 	_sd_cd_info_t *cdi;
1337 	int cd;
1338 	int rc = 0;
1339 
1340 	if ((cd = _sd_open(filename, flag)) >= 0) {
1341 
1342 		cdi = &(_sd_cache_files[cd]);
1343 		cdi->cd_iodev = iodev;
1344 		nsc_set_owner(cdi->cd_rawfd, cdi->cd_iodev);
1345 
1346 		*cdp = (blind_t)(unsigned long)cd;
1347 	} else
1348 		rc = -cd;
1349 
1350 	return (rc);
1351 }
1352 
1353 
1354 
1355 int
1356 _sd_open_cd(char *filename, const int cd, const int flag)
1357 {
1358 	int new_cd, rc = 0, alloc_cd = -1;
1359 	ss_voldata_t *cdg;
1360 	int preexists = 0;
1361 	_sd_cd_info_t *cdi;
1362 	int failover_open, open_failed;
1363 	major_t devmaj;
1364 	minor_t devmin;
1365 
1366 	if (_sdbc_shutdown_in_progress)
1367 		return (-EIO);
1368 
1369 	if (strlen(filename) > (NSC_MAXPATH-1))
1370 		return (-ENAMETOOLONG);
1371 
1372 	/*
1373 	 * If the cd is >= 0, then this is a open for a specific cd.
1374 	 * This happens when the mirror node crashes, and we attempt to
1375 	 * reopen the files with the same cache descriptors as existed on
1376 	 * the other node
1377 	 */
1378 
1379 retry_open:
1380 	failover_open = 0;
1381 	open_failed   = 0;
1382 	if (cd >= 0) {
1383 		failover_open++;
1384 		cdi = &(_sd_cache_files[cd]);
1385 		mutex_enter(&_sd_cache_lock);
1386 		if (cdi->cd_info == NULL)
1387 			cdi->cd_info = &_sd_cache_stats->st_shared[cd];
1388 		else if (cdi->cd_info->sh_alloc &&
1389 		    strcmp(cdi->cd_info->sh_filename, filename)) {
1390 			cmn_err(CE_WARN, "!sdbc(_sd_open_cd) cd %d mismatch",
1391 			    cd);
1392 			mutex_exit(&_sd_cache_lock);
1393 			return (-EEXIST);
1394 		}
1395 
1396 		if (cdi->cd_info->sh_failed != 2) {
1397 			if (cdi->cd_info->sh_alloc != 0)
1398 				preexists = 1;
1399 			else {
1400 				cdi->cd_info->sh_alloc = CD_ALLOC_IN_PROGRESS;
1401 				(void) strcpy(cdi->cd_info->sh_filename,
1402 				    filename);
1403 				if (_sd_cache_stats->st_count < sdbc_max_devs)
1404 					_sd_cache_stats->st_count++;
1405 			}
1406 		}
1407 
1408 		mutex_exit(&_sd_cache_lock);
1409 		alloc_cd = cd;
1410 
1411 		goto known_cd;
1412 	}
1413 
1414 	new_cd = 0;
1415 	mutex_enter(&_sd_cache_lock);
1416 
1417 	for (cdi = &(_sd_cache_files[new_cd]),
1418 	    cdg = _sdbc_gl_file_info + new_cd;
1419 	    new_cd < (sdbc_max_devs); new_cd++, cdi++, cdg++) {
1420 		if (strlen(cdg->sv_volname) != 0)
1421 			if (strcmp(cdg->sv_volname, filename))
1422 				continue;
1423 
1424 		if (cdi->cd_info == NULL)
1425 			cdi->cd_info = &_sd_cache_stats->st_shared[new_cd];
1426 
1427 		if (cdi->cd_info->sh_failed != 2) {
1428 			if (cdi->cd_info->sh_alloc != 0)
1429 				preexists = 1;
1430 			else {
1431 				if (cd == -2) {
1432 					mutex_exit(&_sd_cache_lock);
1433 					return (-1);
1434 				}
1435 				cdi->cd_info->sh_alloc = CD_ALLOC_IN_PROGRESS;
1436 				(void) strcpy(cdi->cd_info->sh_filename,
1437 				    filename);
1438 				(void) strcpy(cdg->sv_volname, filename);
1439 
1440 				cdg->sv_cd = new_cd;
1441 				/* update safestore */
1442 				SSOP_SETVOL(sdbc_safestore, cdg);
1443 				if (_sd_cache_stats->st_count < sdbc_max_devs)
1444 					_sd_cache_stats->st_count++;
1445 				cdi->cd_flag = 0;
1446 			}
1447 		}
1448 		alloc_cd = new_cd;
1449 		break;
1450 	}
1451 
1452 	mutex_exit(&_sd_cache_lock);
1453 
1454 	if (alloc_cd == -1)
1455 		return (-ENOSPC);
1456 
1457 known_cd:
1458 	/*
1459 	 * If preexists: someone else is attempting to open this file as
1460 	 * well. Do only one open, but block everyone else here till the
1461 	 * open is completed.
1462 	 */
1463 	if (preexists) {
1464 		while (cdi->cd_info->sh_alloc == CD_ALLOC_IN_PROGRESS) {
1465 			delay(drv_usectohz(20000));
1466 		}
1467 		if ((cdi->cd_info->sh_alloc != CD_ALLOCATED))
1468 			goto retry_open;
1469 			return (alloc_cd);
1470 	}
1471 
1472 	if (!(cdi->cd_rawfd =
1473 	    nsc_open(filename, NSC_SDBC_ID|NSC_DEVICE, _sdbc_fd_def,
1474 	    (blind_t)(unsigned long)alloc_cd, &rc)) ||
1475 	    !nsc_getval(cdi->cd_rawfd, "DevMaj", (int *)&devmaj) ||
1476 	    !nsc_getval(cdi->cd_rawfd, "DevMin", (int *)&devmin)) {
1477 		if (cdi->cd_rawfd) {
1478 			(void) nsc_close(cdi->cd_rawfd);
1479 			cdi->cd_rawfd = NULL;
1480 		}
1481 		/*
1482 		 * take into account that there may be pinned data on a
1483 		 * device that can no longer be opened
1484 		 */
1485 		open_failed++;
1486 		if (!(cdi->cd_info->sh_failed) && !failover_open) {
1487 			cdi->cd_info->sh_alloc = 0;
1488 			mutex_enter(&_sd_cache_lock);
1489 			_sd_cache_stats->st_count--;
1490 			mutex_exit(&_sd_cache_lock);
1491 			if (!rc)
1492 				rc = EIO;
1493 			return (-rc);
1494 		}
1495 	}
1496 
1497 	cdi->cd_strategy = nsc_get_strategy(devmaj);
1498 	cdi->cd_crdev	= makedevice(devmaj, devmin);
1499 	cdi->cd_desc	= alloc_cd;
1500 	cdi->cd_dirty_head = cdi->cd_dirty_tail = NULL;
1501 	cdi->cd_io_head	= cdi->cd_io_tail = NULL;
1502 	cdi->cd_hint	= 0;
1503 #ifdef DEBUG
1504 	/* put the dev_t in the ioerr_inject_table */
1505 	_sdbc_ioj_set_dev(alloc_cd, cdi->cd_crdev);
1506 #endif
1507 
1508 	cdi->cd_global = (_sdbc_gl_file_info + alloc_cd);
1509 	if (open_failed) {
1510 		cdi->cd_info->sh_failed = 2;
1511 	} else if (cdi->cd_info->sh_failed != 2)
1512 		if ((cdi->cd_global->sv_pinned == _SD_SELF_HOST) &&
1513 		    !failover_open)
1514 			cdi->cd_info->sh_failed = 1;
1515 		else
1516 			cdi->cd_info->sh_failed = 0;
1517 
1518 	cdi->cd_flag	|= flag;
1519 	mutex_init(&cdi->cd_lock, NULL, MUTEX_DRIVER, NULL);
1520 
1521 #ifndef _SD_NOTRACE
1522 	(void) _sdbc_tr_configure(alloc_cd);
1523 #endif
1524 	cdi->cd_info->sh_alloc = CD_ALLOCATED;
1525 	cdi->cd_global = (_sdbc_gl_file_info + alloc_cd);
1526 	cdi->cd_info->sh_cd = (unsigned short) alloc_cd;
1527 	mutex_enter(&_sd_cache_lock);
1528 	_sd_cache_stats->st_loc_count++;
1529 	mutex_exit(&_sd_cache_lock);
1530 
1531 	if (cd_kstat_add(alloc_cd) < 0) {
1532 		cmn_err(CE_WARN, "!Could not create kstats for cache descriptor"
1533 		    " %d", alloc_cd);
1534 	}
1535 
1536 
1537 	return (open_failed ? -EIO : alloc_cd);
1538 }
1539 
1540 
1541 /*
1542  * _sd_close -   Close a cache descriptor.
1543  *
1544  * ARGUMENTS:
1545  *	cd   -   the cache descriptor to be closed.
1546  * RETURNS:
1547  *	0 on success.
1548  *	Error otherwise.
1549  *
1550  * Note: Under Construction.
1551  */
1552 
1553 int
1554 _sd_close(int cd)
1555 {
1556 	int rc;
1557 	_sd_cd_info_t *cdi = &(_sd_cache_files[cd]);
1558 
1559 	if (!FILE_OPENED(cd)) {
1560 		rc = EINVAL;
1561 		goto out;
1562 	}
1563 
1564 	SDTRACE(ST_ENTER|SDF_CLOSE, cd, 0, SDT_INV_BL, 0, 0);
1565 
1566 	mutex_enter(&_sd_cache_lock);
1567 	if ((cdi->cd_info->sh_alloc == 0) ||
1568 	    (cdi->cd_info->sh_alloc & CD_CLOSE_IN_PROGRESS)) {
1569 		mutex_exit(&_sd_cache_lock);
1570 		SDTRACE(ST_EXIT|SDF_CLOSE, cd, 0, SDT_INV_BL, 0, EINVAL);
1571 		rc = EINVAL;
1572 		goto out;
1573 	}
1574 	cdi->cd_info->sh_alloc |= CD_CLOSE_IN_PROGRESS;
1575 	mutex_exit(&_sd_cache_lock);
1576 
1577 	/*
1578 	 * _sd_flush_cd() will return -1 for the case where pinned
1579 	 * data is present, but has been transfered to the mirror
1580 	 * node.  In this case it is safe to close the device as
1581 	 * though _sd_flush_cd() had returned 0.
1582 	 */
1583 
1584 	rc = _sd_flush_cd(cd);
1585 	if (rc == -1)
1586 		rc = 0;
1587 
1588 	if (rc != 0) {
1589 		mutex_enter(&_sd_cache_lock);
1590 		if ((rc == EAGAIN) &&
1591 		    (cdi->cd_global->sv_pinned == _SD_NO_HOST)) {
1592 			cdi->cd_global->sv_pinned = _SD_SELF_HOST;
1593 			SSOP_SETVOL(sdbc_safestore, cdi->cd_global);
1594 		}
1595 
1596 		cdi->cd_info->sh_alloc &= ~CD_CLOSE_IN_PROGRESS;
1597 		mutex_exit(&_sd_cache_lock);
1598 		SDTRACE(ST_EXIT|SDF_CLOSE, cd, 0, SDT_INV_BL,
1599 		    _SD_CD_WBLK_USED(cd), rc);
1600 		goto out;
1601 	}
1602 
1603 	rc = nsc_close(cdi->cd_rawfd);
1604 	if (rc) {
1605 		mutex_enter(&_sd_cache_lock);
1606 		cdi->cd_info->sh_alloc &= ~CD_CLOSE_IN_PROGRESS;
1607 		mutex_exit(&_sd_cache_lock);
1608 		SDTRACE(ST_EXIT|SDF_CLOSE, cd, 0, SDT_INV_BL, 0, rc);
1609 		goto out;
1610 	}
1611 	mutex_enter(&_sd_cache_lock);
1612 	_sd_cache_stats->st_loc_count--;
1613 	mutex_exit(&_sd_cache_lock);
1614 
1615 	if (cd_kstat_remove(cd) < 0) {
1616 		cmn_err(CE_WARN, "!Could not remove kstat for cache descriptor "
1617 		    "%d", cd);
1618 	}
1619 
1620 	cdi->cd_info->sh_alloc = 0;
1621 	cdi->cd_info->sh_failed = 0;
1622 	/* cdi->cd_info = NULL; */
1623 	cdi->cd_flag = 0;
1624 	SDTRACE(ST_EXIT|SDF_CLOSE, cd, 0, SDT_INV_BL, 0, NSC_DONE);
1625 	rc = NSC_DONE;
1626 	goto out;
1627 
1628 out:
1629 	return (rc);
1630 }
1631 
1632 
1633 static int
1634 _sd_close_io(blind_t xcd)
1635 {
1636 	_sd_cd_info_t *cdi;
1637 	int cd = (int)(unsigned long)xcd;
1638 	int rc = 0;
1639 
1640 	if ((rc = _sd_close((int)cd)) == NSC_DONE) {
1641 		cdi = &(_sd_cache_files[cd]);
1642 		cdi->cd_iodev = NULL;
1643 	}
1644 
1645 	return (rc);
1646 }
1647 
1648 
1649 /*
1650  * _sdbc_remote_store_pinned - reflect pinned/failed blocks for cd
1651  * to our remote mirror. Returns count of blocks reflected or -1 on error.
1652  *
1653  */
1654 int
1655 _sdbc_remote_store_pinned(int cd)
1656 {
1657 	int cnt = 0;
1658 	_sd_cd_info_t *cdi = &(_sd_cache_files[cd]);
1659 	_sd_cctl_t *cc_ent, *cc_list;
1660 
1661 	ASSERT(cd >= 0);
1662 	if (cdi->cd_info->sh_failed) {
1663 
1664 		if (cdi->cd_global->sv_pinned == _SD_NO_HOST) {
1665 			cdi->cd_global->sv_pinned = _SD_SELF_HOST;
1666 			SSOP_SETVOL(sdbc_safestore, cdi->cd_global);
1667 		}
1668 
1669 		mutex_enter(&cdi->cd_lock);
1670 		cc_ent = cc_list = cdi->cd_fail_head;
1671 		while (cc_ent) {
1672 			cnt++;
1673 
1674 			/* is this always necessary? jgk */
1675 
1676 			if (SSOP_WRITE_CBLOCK(sdbc_safestore,
1677 			    cc_ent->cc_write->sc_res, cc_ent->cc_data,
1678 			    CACHE_BLOCK_SIZE, 0)) {
1679 				mutex_exit(&cdi->cd_lock);
1680 				return (-1);
1681 			}
1682 
1683 			/* update the cache block metadata */
1684 			CENTRY_SET_FTPOS(cc_ent);
1685 			cc_ent->cc_write->sc_flag = cc_ent->cc_flag;
1686 
1687 			cc_ent->cc_write->sc_dirty = CENTRY_DIRTY(cc_ent);
1688 
1689 			SSOP_SETCENTRY(sdbc_safestore, cc_ent->cc_write);
1690 
1691 			cc_ent = cc_ent->cc_dirty_next;
1692 			if (!cc_ent)
1693 				cc_ent = cc_list = cc_list->cc_dirty_link;
1694 		}
1695 		mutex_exit(&cdi->cd_lock);
1696 	}
1697 
1698 	return (cnt);
1699 }
1700 
1701 /*
1702  * _sd_flush_cd()
1703  *	reflect pinned blocks to mirrored node
1704  *	wait for dirty blocks to be flushed
1705  * returns:
1706  *	EIO	I/O failure, or pinned blocks and no mirror
1707  *	EAGAIN	Hang: count of outstanding writes isn't decreasing
1708  *	-1	pinned blocks, reflected to mirror
1709  *	0	success
1710  */
1711 static int
1712 _sd_flush_cd(int cd)
1713 {
1714 	int rc;
1715 
1716 	if ((rc = _sd_wait_for_flush(cd)) == 0)
1717 		return (0);
1718 
1719 	/*
1720 	 * if we timed out simply return otherwise
1721 	 * it must be an i/o type of error
1722 	 */
1723 	if (rc == EAGAIN)
1724 		return (rc);
1725 
1726 	if (_sd_is_mirror_down())
1727 		return (EIO); /* already failed, no mirror */
1728 
1729 	/* flush any pinned/failed blocks to mirror */
1730 	if (_sdbc_remote_store_pinned(cd) >= 0)
1731 		/*
1732 		 * At this point it looks like we have blocks on the
1733 		 * failed list and taking up space on this node but
1734 		 * no longer have responsibility for the blocks.
1735 		 * These blocks will in fact be freed from the cache
1736 		 * and the failed list when the mirror picks them up
1737 		 * from safe storage and then calls _sd_cd_discard_mirror
1738 		 * which will issue an rpc telling us to finish up.
1739 		 *
1740 		 * Should the other node die before sending the rpc then
1741 		 * we are safe with these blocks simply waiting on the
1742 		 * failed list.
1743 		 */
1744 		return (-1);
1745 	else
1746 		return (rc);
1747 }
1748 
1749 /*
1750  * _sdbc_io_attach_cd -- set up for client access to device, reserve raw device
1751  *
1752  * ARGUMENTS:
1753  *	cd   -	the cache descriptor to attach.
1754  *
1755  * RETURNS:
1756  *	0 on success.
1757  *	Error otherwise.
1758  */
1759 int
1760 _sdbc_io_attach_cd(blind_t xcd)
1761 {
1762 	int rc = 0;
1763 	_sd_cd_info_t *cdi;
1764 	int cd = (int)(unsigned long)xcd;
1765 
1766 	SDTRACE(ST_ENTER|SDF_ATTACH, cd, 0, SDT_INV_BL, 0, 0);
1767 	if (!_sd_cache_initialized ||
1768 	    _sdbc_shutdown_in_progress ||
1769 	    !FILE_OPENED(cd)) {
1770 		SDTRACE(ST_EXIT|SDF_ATTACH, cd, 0, SDT_INV_BL, 0, EINVAL);
1771 
1772 		DTRACE_PROBE(_sdbc_io_attach_cd_end1);
1773 
1774 		return (EINVAL);
1775 	}
1776 	cdi = &(_sd_cache_files[cd]);
1777 
1778 	/*
1779 	 * check if disk is failed without raw device open.  If it is,
1780 	 * it has to be recovered using _sd_disk_online
1781 	 */
1782 
1783 	if (cdi->cd_global->sv_pinned == _SD_SELF_HOST) {
1784 		_sd_print(3,
1785 		    "_sdbc_io_attach_cd: pinned data. returning EINVAL");
1786 
1787 		DTRACE_PROBE(_sdbc_io_attach_cd_end2);
1788 
1789 		return (EINVAL);
1790 	}
1791 
1792 	if ((cdi->cd_info == NULL) || (cdi->cd_info->sh_failed)) {
1793 		DTRACE_PROBE1(_sdbc_io_attach_cd_end3,
1794 		    struct _sd_shared *, cdi->cd_info);
1795 
1796 		return (EINVAL);
1797 	}
1798 
1799 #if defined(_SD_FAULT_RES)
1800 	/* wait for node recovery to finish */
1801 	if (_sd_node_recovery)
1802 		(void) _sd_recovery_wait();
1803 #endif
1804 
1805 	/* this will provoke a sdbc_fd_attach_cd call .. */
1806 
1807 	rc = nsc_reserve(cdi->cd_rawfd, NSC_MULTI);
1808 	SDTRACE(ST_EXIT|SDF_ATTACH, cd, 0, SDT_INV_BL, 0, rc);
1809 
1810 	return (rc);
1811 }
1812 
1813 /*
1814  * sdbc_fd_attach_cd -- setup cache for access to raw device underlying cd.
1815  * This is provoked by some piece of sdbc doing a reserve on the raw device.
1816  *
1817  * ARGUMENTS:
1818  *	cd   -	the cache descriptor to attach.
1819  *
1820  * RETURNS:
1821  *	0 on success.
1822  *	Error otherwise.
1823  */
1824 static int
1825 sdbc_fd_attach_cd(blind_t xcd)
1826 {
1827 	int rc = 0;
1828 	int cd = (int)(unsigned long)xcd;
1829 	_sd_cd_info_t *cdi;
1830 
1831 	if (!_sd_cache_initialized || !FILE_OPENED(cd)) {
1832 		SDTRACE(ST_INFO|SDF_ATTACH, cd, 0, SDT_INV_BL, 0, EINVAL);
1833 
1834 		DTRACE_PROBE(sdbc_fd_attach_cd_end1);
1835 
1836 		return (EINVAL);
1837 	}
1838 	cdi = &(_sd_cache_files[cd]);
1839 
1840 #if defined(_SD_FAULT_RES)
1841 	/* retrieve pinned/failed data */
1842 	if (!_sd_node_recovery) {
1843 		(void) _sd_repin_cd(cd);
1844 	}
1845 #endif
1846 
1847 	rc = nsc_partsize(cdi->cd_rawfd, &cdi->cd_info->sh_filesize);
1848 	if (rc != 0) {
1849 		SDTRACE(ST_INFO|SDF_ATTACH, cd, 0, SDT_INV_BL, 0, rc);
1850 
1851 		DTRACE_PROBE(sdbc_fd_attach_cd_end3);
1852 
1853 		return (rc);
1854 	}
1855 
1856 	cdi->cd_global->sv_attached = _SD_SELF_HOST;
1857 
1858 	SSOP_SETVOL(sdbc_safestore, cdi->cd_global);
1859 
1860 	mutex_enter(&_sd_cache_lock);
1861 	cdi->cd_info->sh_flag |= CD_ATTACHED;
1862 	mutex_exit(&_sd_cache_lock);
1863 
1864 	return (0);
1865 }
1866 
1867 /*
1868  * _sdbc_io_detach_cd -- release raw device
1869  * Called when a cache client is being detached from this cd.
1870  *
1871  * ARGUMENTS:
1872  *	cd   -   the cache descriptor to detach.
1873  * RETURNS:
1874  *	0 on success.
1875  *	Error otherwise.
1876  */
1877 int
1878 _sdbc_io_detach_cd(blind_t xcd)
1879 {
1880 	int cd = (int)(unsigned long)xcd;
1881 	_sd_cd_info_t *cdi;
1882 
1883 
1884 	SDTRACE(ST_ENTER|SDF_DETACH, cd, 0, SDT_INV_BL, 0, 0);
1885 	if (!_sd_cache_initialized || !FILE_OPENED(cd)) {
1886 		SDTRACE(ST_EXIT|SDF_DETACH, cd, 0, SDT_INV_BL, 0, EINVAL);
1887 
1888 		DTRACE_PROBE(_sdbc_io_detach_cd_end1);
1889 
1890 		return (EINVAL);
1891 	}
1892 
1893 #if defined(_SD_FAULT_RES)
1894 	if (_sd_node_recovery)
1895 		(void) _sd_recovery_wait();
1896 #endif
1897 	/* relinquish responsibility for device */
1898 	cdi = &(_sd_cache_files[cd]);
1899 	if (!(cdi->cd_rawfd) || !nsc_held(cdi->cd_rawfd)) {
1900 		cmn_err(CE_WARN, "!sdbc(_sdbc_detach_cd)(%d) not attached", cd);
1901 		SDTRACE(ST_EXIT|SDF_DETACH, cd, 0, SDT_INV_BL, 0, EPROTO);
1902 		DTRACE_PROBE1(_sdbc_io_detach_cd_end2,
1903 		    nsc_fd_t *, cdi->cd_rawfd);
1904 
1905 		return (EPROTO);
1906 	}
1907 	/* this will provoke/allow a call to sdbc_fd_detach_cd */
1908 	nsc_release(cdi->cd_rawfd);
1909 
1910 	SDTRACE(ST_EXIT|SDF_DETACH, cd, 0, SDT_INV_BL, 0, 0);
1911 
1912 	return (0);
1913 }
1914 
1915 /*
1916  * _sdbc_detach_cd -- flush dirty writes to disk, release raw device
1917  * Called when raw device is being detached from this cd.
1918  *
1919  * ARGUMENTS:
1920  *	cd   -   the cache descriptor to detach.
1921  *	rd_only   -  non-zero if detach is for read access.
1922  * RETURNS:
1923  *	0 on success.
1924  *	Error otherwise.
1925  */
1926 static int
1927 sdbc_detach_cd(blind_t xcd, int rd_only)
1928 {
1929 	int rc;
1930 	int cd = (int)(unsigned long)xcd;
1931 	_sd_cd_info_t *cdi;
1932 
1933 	SDTRACE(ST_INFO|SDF_DETACH, cd, 0, SDT_INV_BL, 0, 0);
1934 
1935 	if (!_sd_cache_initialized || !FILE_OPENED(cd)) {
1936 		SDTRACE(ST_INFO|SDF_DETACH, cd, 0, SDT_INV_BL, 0, EINVAL);
1937 
1938 		DTRACE_PROBE(sdbc_detach_cd_end1);
1939 
1940 		return (EINVAL);
1941 	}
1942 
1943 
1944 	rc = _sd_flush_cd(cd);
1945 	if (rc > 0) {
1946 		SDTRACE(ST_INFO|SDF_DETACH, cd, 0, SDT_INV_BL, 0, rc);
1947 
1948 		DTRACE_PROBE(sdbc_detach_cd_end2);
1949 
1950 		return (rc);
1951 	}
1952 
1953 	if (!rd_only) {
1954 		_sd_hash_invalidate_cd(cd);
1955 		cdi = &(_sd_cache_files[cd]);
1956 
1957 		if (cdi->cd_global->sv_attached == _SD_SELF_HOST) {
1958 			cdi->cd_global->sv_attached = _SD_NO_HOST;
1959 			SSOP_SETVOL(sdbc_safestore, cdi->cd_global);
1960 		} else {
1961 			cmn_err(CE_WARN,
1962 			    "!sdbc(_sdbc_detach_cd) (%d) attached by node %d",
1963 			    cd, cdi->cd_global->sv_attached);
1964 			SDTRACE(SDF_DETACH, cd, 0, SDT_INV_BL, 0, EPROTO);
1965 
1966 			DTRACE_PROBE1(sdbc_detach_cd_end3,
1967 			    int, cdi->cd_global->sv_attached);
1968 
1969 			return (EPROTO);
1970 		}
1971 
1972 		mutex_enter(&_sd_cache_lock);
1973 		cdi->cd_info->sh_flag &= ~CD_ATTACHED;
1974 		mutex_exit(&_sd_cache_lock);
1975 	}
1976 
1977 	SDTRACE(ST_INFO|SDF_DETACH, cd, 0, SDT_INV_BL, 0, 0);
1978 
1979 	return (0);
1980 }
1981 
1982 /*
1983  * _sdbc_fd_detach_cd -- flush dirty writes to disk, release raw device
1984  * Called when raw device is being detached from this cd.
1985  *
1986  * ARGUMENTS:
1987  *	xcd   -   the cache descriptor to detach.
1988  * RETURNS:
1989  *	0 on success.
1990  *	Error otherwise.
1991  */
1992 static int
1993 sdbc_fd_detach_cd(blind_t xcd)
1994 {
1995 	return (sdbc_detach_cd(xcd, 0));
1996 }
1997 
1998 /*
1999  * sdbc_fd_flush_cd - raw device "xcd" is being detached and needs
2000  * flushing.  We only need to flush we don't need to hash invalidate
2001  * this file.
2002  */
2003 static int
2004 sdbc_fd_flush_cd(blind_t xcd)
2005 {
2006 	return (sdbc_detach_cd(xcd, 1));
2007 }
2008 
2009 /*
2010  * _sd_get_pinned - re-issue PINNED callbacks for cache device
2011  *
2012  * ARGUMENTS:
2013  *	cd   -   the cache descriptor to reissue pinned calbacks from.
2014  * RETURNS:
2015  *	0 on success.
2016  *	Error otherwise.
2017  */
2018 int
2019 _sd_get_pinned(blind_t xcd)
2020 {
2021 	_sd_cd_info_t *cdi;
2022 	_sd_cctl_t *cc_list, *cc_ent;
2023 	int cd = (int)(unsigned long)xcd;
2024 
2025 	cdi = &_sd_cache_files[cd];
2026 
2027 	if (cd < 0 || cd >= sdbc_max_devs) {
2028 		DTRACE_PROBE(_sd_get_pinned_end1);
2029 		return (EINVAL);
2030 	}
2031 
2032 	if (!FILE_OPENED(cd)) {
2033 		DTRACE_PROBE(_sd_get_pinned_end2);
2034 		return (0);
2035 	}
2036 
2037 	mutex_enter(&cdi->cd_lock);
2038 
2039 	if (!cdi->cd_info->sh_failed) {
2040 		mutex_exit(&cdi->cd_lock);
2041 
2042 		DTRACE_PROBE(_sd_get_pinned_end3);
2043 		return (0);
2044 	}
2045 
2046 	cc_ent = cc_list = cdi->cd_fail_head;
2047 	while (cc_ent) {
2048 		if (CENTRY_PINNED(cc_ent))
2049 			nsc_pinned_data(cdi->cd_iodev,
2050 			    BLK_TO_FBA_NUM(CENTRY_BLK(cc_ent)), BLK_FBAS);
2051 		cc_ent = cc_ent->cc_dirty_next;
2052 		if (!cc_ent)
2053 			cc_ent = cc_list = cc_list->cc_dirty_link;
2054 	}
2055 
2056 	mutex_exit(&cdi->cd_lock);
2057 
2058 	return (0);
2059 }
2060 
2061 /*
2062  * _sd_allocate_buf - allocate a vector of buffers for io.
2063  * 			*This call has been replaced by _sd_alloc_buf*
2064  */
2065 
2066 _sd_buf_handle_t *
2067 _sd_allocate_buf(int cd, nsc_off_t fba_pos, nsc_size_t fba_len, int flag,
2068     int *sts)
2069 {
2070 	_sd_buf_handle_t *handle = NULL;
2071 
2072 	*sts = _sd_alloc_buf((blind_t)(unsigned long)cd, fba_pos, fba_len,
2073 	    flag, &handle);
2074 	if (*sts == NSC_HIT)
2075 		*sts = NSC_DONE;
2076 	return (handle);
2077 }
2078 
2079 
2080 /*
2081  * _sd_prefetch_buf - _sd_alloc_buf w/flag = NSC_RDAHEAD|NSC_RDBUF
2082  *	no 'bufvec' (data is not read by caller)
2083  *	skip leading valid or busy entries (data available sooner)
2084  *	truncate on busy block (to avoid deadlock)
2085  *	release trailing valid entries, adjust length before starting I/O.
2086  */
2087 static int
2088 _sd_prefetch_buf(int cd, nsc_off_t fba_pos, nsc_size_t fba_len, int flag,
2089     _sd_buf_handle_t *handle, int locked)
2090 {
2091 	_sd_cd_info_t *cdi;
2092 	nsc_off_t cblk; 	/* position of temp cache block */
2093 	sdbc_cblk_fba_t st_cblk_len;	/* FBA len of starting cache block */
2094 	sdbc_cblk_fba_t end_cblk_len;	/* FBA len of ending cache block */
2095 	sdbc_cblk_fba_t st_cblk_off;	/* FBA offset into starting cblock */
2096 	nsc_off_t io_pos;	/* offset in FBA's */
2097 	nsc_size_t fba_orig_len;
2098 	int sts, stall;
2099 	_sd_cctl_t *centry = NULL;
2100 	_sd_cctl_t *lentry = NULL;
2101 	_sd_cctl_t *ioent = NULL;
2102 	_sd_cctl_t *last_ioent = NULL;
2103 	sdbc_allocbuf_t alloc_tok = {0};
2104 	int this_entry_type = 0;
2105 	nsc_size_t request_blocks = 0; /* number of cache blocks required */
2106 	int pageio;
2107 
2108 	handle->bh_flag |= NSC_HACTIVE;
2109 	ASSERT(cd >= 0);
2110 	cdi = &_sd_cache_files[cd];
2111 
2112 	/* prefetch: truncate if req'd */
2113 	if (fba_len > sdbc_max_fbas)
2114 		fba_len = sdbc_max_fbas;
2115 	if ((fba_pos + fba_len) > cdi->cd_info->sh_filesize) {
2116 		if (fba_pos >= cdi->cd_info->sh_filesize) {
2117 			sts = EIO;
2118 			goto done;
2119 		}
2120 		fba_len = cdi->cd_info->sh_filesize - fba_pos;
2121 	}
2122 
2123 	fba_orig_len = fba_len;
2124 
2125 	_SD_SETUP_HANDLE(handle, cd, fba_pos, fba_len, flag);
2126 	handle->bh_centry = NULL;
2127 
2128 	cblk = FBA_TO_BLK_NUM(fba_pos);
2129 	st_cblk_off = BLK_FBA_OFF(fba_pos);
2130 	st_cblk_len = BLK_FBAS - st_cblk_off;
2131 
2132 	/*
2133 	 * count number of blocks on chain that is required
2134 	 */
2135 	if ((nsc_size_t)st_cblk_len >= fba_len) {
2136 		st_cblk_len = (sdbc_cblk_fba_t)fba_len;
2137 		end_cblk_len = 0;
2138 	} else {
2139 		end_cblk_len = BLK_FBA_OFF(fba_pos + fba_len);
2140 	}
2141 
2142 	request_blocks = 1;  /* at least one */
2143 
2144 	/* middle piece */
2145 	request_blocks += (fba_len - (st_cblk_len + end_cblk_len)) >>
2146 	    BLK_FBA_SHFT;
2147 
2148 	if (end_cblk_len)
2149 		++request_blocks;
2150 
2151 	stall = 0;
2152 	do {
2153 		pageio = ((flag & NSC_PAGEIO) != 0 || sdbc_pageio_always != 0);
2154 cget:
2155 		if (centry = (_sd_cctl_t *)
2156 		    _sd_hash_search(cd, cblk, _sd_htable)) {
2157 try:
2158 			/* prefetch: skip leading valid blocks */
2159 			if ((ioent == NULL) &&
2160 			    SDBC_VALID_BITS(st_cblk_off, st_cblk_len, centry)) {
2161 skip:
2162 				sdbc_prefetch_valid_cnt++;
2163 				--request_blocks;
2164 				lentry = centry;
2165 				centry = NULL;
2166 				cblk++;
2167 				fba_len -= st_cblk_len;
2168 				st_cblk_off = 0;
2169 				st_cblk_len = (sdbc_cblk_fba_t)
2170 				    ((fba_len > (nsc_size_t)BLK_FBAS) ?
2171 				    BLK_FBAS : fba_len);
2172 				continue;
2173 			}
2174 
2175 			if (SET_CENTRY_INUSE(centry)) {
2176 				/*
2177 				 * prefetch: skip leading busy
2178 				 * or truncate at busy block
2179 				 */
2180 				if (ioent == NULL)
2181 					goto skip;
2182 				sdbc_prefetch_busy_cnt++;
2183 				fba_orig_len -= fba_len;
2184 				fba_len = 0;
2185 				centry = lentry; /* backup */
2186 				break;
2187 			}
2188 
2189 			/*
2190 			 * bug 4529671
2191 			 * now that we own the centry make sure that
2192 			 * it is still good.  it could have been processed
2193 			 * by _sd_dealloc_dm() in the window between
2194 			 * _sd_hash_search() and SET_CENTRY_INUSE().
2195 			 */
2196 			if ((_sd_cctl_t *)
2197 			    _sd_hash_search(cd, cblk, _sd_htable) != centry) {
2198 				sdbc_prefetch_deallocd++;
2199 #ifdef DEBUG
2200 				cmn_err(CE_WARN,
2201 				    "!prefetch centry %p cd %d cblk %" NSC_SZFMT
2202 				    " fba_len %" NSC_SZFMT " lost to dealloc?! "
2203 				    "cc_data %p",
2204 				    (void *)centry, cd, cblk, fba_orig_len,
2205 				    (void *)centry->cc_data);
2206 #endif
2207 
2208 				CLEAR_CENTRY_INUSE(centry);
2209 				continue;
2210 			}
2211 
2212 			if (CC_CD_BLK_MATCH(cd, cblk, centry)) {
2213 				/*
2214 				 * Do pagelist io mutual exclusion
2215 				 * before messing with the centry.
2216 				 */
2217 				if (pageio && SET_CENTRY_PAGEIO(centry)) {
2218 					/* flusher not done with pageio */
2219 					/*
2220 					 * prefetch: skip leading busy
2221 					 * or truncate at busy block
2222 					 */
2223 					CLEAR_CENTRY_INUSE(centry);
2224 					if (ioent == NULL)
2225 						goto skip;
2226 					sdbc_prefetch_pageio1++;
2227 					fba_orig_len -= fba_len;
2228 					fba_len = 0;
2229 					centry = lentry; /* backup */
2230 					break;
2231 
2232 				}
2233 
2234 				sdbc_prefetch_hit++;
2235 				this_entry_type = HASH_ENTRY_DM;
2236 				pageio = 0;
2237 				centry->cc_toflush = 0;
2238 
2239 				centry->cc_hits++;
2240 
2241 				/* this will reset the age flag */
2242 				sdbc_centry_init_dm(centry);
2243 
2244 				DTRACE_PROBE1(_sd_prefetch_buf,
2245 				    _sd_cctl_t *, centry);
2246 			} else {
2247 				/* block mismatch */
2248 				sdbc_prefetch_lost++;
2249 
2250 				CLEAR_CENTRY_INUSE(centry);
2251 				continue;
2252 			}
2253 		} else {
2254 			centry = sdbc_centry_alloc(cd, cblk, request_blocks,
2255 			    &stall, &alloc_tok, ALLOC_NOWAIT);
2256 
2257 			if (centry == NULL) {
2258 				/*
2259 				 * prefetch: cache is very busy. just do
2260 				 * the i/o for the blocks already acquired,
2261 				 * if any.
2262 				 */
2263 				fba_orig_len -= fba_len;
2264 				fba_len = 0;
2265 				/*
2266 				 * if we have a chain of centry's
2267 				 * then back up (set centry to lentry).
2268 				 * if there is no chain (ioent == NULL)
2269 				 * then centry remains NULL.  this can occur
2270 				 * if all previous centrys were hash hits
2271 				 * on valid blocks that were processed in
2272 				 * the skip logic above.
2273 				 */
2274 				if (ioent)
2275 					centry = lentry; /* backup */
2276 				break;
2277 			}
2278 
2279 			/*
2280 			 * dmchaining adjustment.
2281 			 * if centry was obtained from the dmchain
2282 			 * then clear local pageio variable because the
2283 			 * centry already has cc_pageio set.
2284 			 */
2285 			if (CENTRY_PAGEIO(centry))
2286 				pageio = 0;
2287 
2288 			DTRACE_PROBE1(_sd_alloc_buf, _sd_cctl_t *, centry);
2289 
2290 			this_entry_type = ELIGIBLE_ENTRY_DM;
2291 			if (centry->cc_aging_dm & FOUND_IN_HASH_DM)
2292 				this_entry_type = HASH_ENTRY_DM;
2293 			else {
2294 				if (centry->cc_aging_dm & FOUND_HOLD_OVER_DM)
2295 					this_entry_type = HOLD_ENTRY_DM;
2296 			}
2297 		}
2298 
2299 		centry->cc_chain = NULL;
2300 
2301 		centry->cc_aging_dm &= ~(FOUND_IN_HASH_DM|FOUND_HOLD_OVER_DM);
2302 
2303 		/*
2304 		 * Do pagelist io mutual exclusion now if we did not do
2305 		 * it above.
2306 		 */
2307 
2308 		if (pageio && SET_CENTRY_PAGEIO(centry)) {
2309 			/* flusher not done with pageio */
2310 			sdbc_prefetch_pageio2++;
2311 
2312 			/*
2313 			 * prefetch: skip leading busy
2314 			 * or truncate at busy block
2315 			 */
2316 			CLEAR_CENTRY_INUSE(centry);
2317 			if (ioent == NULL)
2318 				goto skip;
2319 			sdbc_prefetch_busy_cnt++;
2320 			fba_orig_len -= fba_len;
2321 			fba_len = 0;
2322 			centry = lentry; /* backup */
2323 			break;
2324 		}
2325 
2326 		pageio = 0;
2327 
2328 		fba_len -= st_cblk_len;
2329 
2330 		if (ioent == NULL)  {
2331 			if (!SDBC_VALID_BITS(st_cblk_off, st_cblk_len,
2332 			    centry)) {
2333 				io_pos = BLK_TO_FBA_NUM(cblk) + st_cblk_off;
2334 				ioent = last_ioent = centry;
2335 			} else {
2336 				DATA_LOG(SDF_ALLOC, centry, st_cblk_off,
2337 				    st_cblk_len);
2338 				DTRACE_PROBE4(_sd_prefetch_buf_data1,
2339 				    uint64_t, (uint64_t)(BLK_TO_FBA_NUM(cblk) +
2340 				    st_cblk_off), int, st_cblk_len,
2341 				    char *, *(int64_t *)(centry->cc_data +
2342 				    FBA_SIZE(st_cblk_off)), char *,
2343 				    *(int64_t *)(centry->cc_data +
2344 				    FBA_SIZE(st_cblk_off + st_cblk_len) - 8));
2345 			}
2346 
2347 			handle->bh_centry = centry;
2348 			st_cblk_off = 0;
2349 			st_cblk_len = (sdbc_cblk_fba_t)
2350 			    ((fba_len > (nsc_size_t)BLK_FBAS) ?
2351 			    BLK_FBAS : fba_len);
2352 		} else {
2353 			if (!SDBC_VALID_BITS(st_cblk_off, st_cblk_len, centry))
2354 				last_ioent = centry;
2355 			else {
2356 				DTRACE_PROBE4(_sd_prefetch_buf_data2,
2357 				    uint64_t, (uint64_t)(BLK_TO_FBA_NUM(cblk) +
2358 				    st_cblk_off), int, st_cblk_len,
2359 				    char *, *(int64_t *)(centry->cc_data +
2360 				    FBA_SIZE(st_cblk_off)), char *,
2361 				    *(int64_t *)(centry->cc_data +
2362 				    FBA_SIZE(st_cblk_off + st_cblk_len) - 8));
2363 			}
2364 
2365 			lentry->cc_chain = centry;
2366 			if (fba_len < (nsc_size_t)BLK_FBAS)
2367 				st_cblk_len = (sdbc_cblk_fba_t)fba_len;
2368 		}
2369 		lentry = centry;
2370 		cblk++;
2371 
2372 		/* if this block has a new identity clear prefetch history */
2373 		if (this_entry_type != HASH_ENTRY_DM)
2374 			centry->cc_aging_dm &=
2375 			    ~(PREFETCH_BUF_I | PREFETCH_BUF_E);
2376 
2377 		centry->cc_aging_dm &= ~(ENTRY_FIELD_DM);
2378 		centry->cc_aging_dm |= this_entry_type | PREFETCH_BUF_E;
2379 		if (flag & NSC_METADATA)
2380 			centry->cc_aging_dm |= STICKY_METADATA_DM;
2381 
2382 		--request_blocks;
2383 	} while (fba_len > 0);
2384 
2385 
2386 	if (locked) {
2387 		rw_exit(&sdbc_queue_lock);
2388 		locked = 0;
2389 	}
2390 
2391 	sdbc_centry_alloc_end(&alloc_tok);
2392 
2393 	if (centry) {
2394 		centry->cc_chain = NULL;
2395 		if (sts = _sd_setup_category_on_type(handle->bh_centry)) {
2396 			(void) _sd_free_buf(handle);
2397 			goto done;
2398 		}
2399 
2400 		(void) _sd_setup_mem_chaining(handle->bh_centry, 0);
2401 	}
2402 
2403 
2404 	if (ioent) {
2405 		/* prefetch: trailing valid can be released, adjust len */
2406 		if ((centry != last_ioent)) {
2407 			centry = last_ioent->cc_chain;
2408 			last_ioent->cc_chain = NULL;
2409 			while (centry) {
2410 				lentry = centry->cc_chain;
2411 				centry->cc_aging_dm &= ~PREFETCH_BUF_E;
2412 				_sd_centry_release(centry);
2413 				centry = lentry;
2414 				sdbc_prefetch_trailing++;
2415 			}
2416 			fba_len = (CENTRY_BLK(last_ioent) -
2417 			    CENTRY_BLK(ioent) + 1) *  BLK_FBAS -
2418 			    BLK_FBA_OFF(io_pos);
2419 			fba_orig_len = fba_len + (io_pos - fba_pos);
2420 		}
2421 
2422 		_SD_DISCONNECT_CALLBACK(handle);
2423 		sts = _sd_doread(handle,  ioent, io_pos,
2424 		    (fba_pos + fba_orig_len - io_pos), flag);
2425 		if (sts > 0)
2426 			(void) _sd_free_buf(handle);
2427 	} else {
2428 		CACHE_FBA_READ(cd, fba_orig_len);
2429 		CACHE_READ_HIT;
2430 		FBA_READ_IO_KSTATS(cd, FBA_SIZE(fba_orig_len));
2431 
2432 		sts = NSC_HIT;
2433 	}
2434 done:
2435 	if (locked)
2436 		rw_exit(&sdbc_queue_lock);
2437 
2438 	return (sts);
2439 }
2440 
2441 
2442 /*
2443  * _sd_cc_wait - wait for inuse cache block to become available
2444  * Usage:
2445  *	if (SET_CENTRY_INUSE(centry)) {
2446  *		_sd_cc_wait(cd, blk, centry, CC_INUSE);
2447  *		goto try_again;
2448  *	}
2449  * -or-
2450  *	if (SET_CENTRY_PAGEIO(centry)) {
2451  *		_sd_cc_wait(cd, blk, centry, CC_PAGEIO);
2452  *		goto try_again;
2453  *	}
2454  */
2455 void
2456 _sd_cc_wait(int cd, nsc_off_t cblk, _sd_cctl_t *centry, int flag)
2457 {
2458 	volatile ushort_t *waiters;
2459 	volatile uchar_t *uflag;
2460 
2461 	if (flag == CC_INUSE) {
2462 		waiters = &(centry->cc_await_use);
2463 		uflag = &(CENTRY_INUSE(centry));
2464 	} else if (flag == CC_PAGEIO) {
2465 		waiters = &(centry->cc_await_page);
2466 		uflag = &(CENTRY_PAGEIO(centry));
2467 	} else {
2468 		/* Oops! */
2469 #ifdef DEBUG
2470 		cmn_err(CE_WARN, "!_sd_cc_wait: unknown flag value (%x)", flag);
2471 #endif
2472 		return;
2473 	}
2474 
2475 	mutex_enter(&centry->cc_lock);
2476 	if (CC_CD_BLK_MATCH(cd, cblk, centry) && (*uflag) != 0) {
2477 		(*waiters)++;
2478 		sd_serialize();
2479 		if ((*uflag) != 0) {
2480 			unsigned stime = nsc_usec();
2481 			cv_wait(&centry->cc_blkcv, &centry->cc_lock);
2482 			(*waiters)--;
2483 			mutex_exit(&centry->cc_lock);
2484 			SDTRACE(ST_INFO|SDF_ENT_GET,
2485 			    cd, 0, BLK_TO_FBA_NUM(cblk), (nsc_usec()-stime), 0);
2486 		} else {
2487 			(*waiters)--;
2488 			mutex_exit(&centry->cc_lock);
2489 		}
2490 	} else
2491 		mutex_exit(&centry->cc_lock);
2492 
2493 }
2494 
2495 /*
2496  * _sd_alloc_buf  - Allocate a vector of buffers for io.
2497  *
2498  * ARGUMENTS:
2499  *	cd	 - Cache descriptor (from a previous open)
2500  *	fba_pos	 - disk position (512-byte FBAs)
2501  *	fba_len  - length in disk FBAs.
2502  *	flag	 - allocation type. Flag is one or more of
2503  *		   NSC_RDBUF, NSC_WRBUF, NSC_NOBLOCK and hints.
2504  *		   NSC_RDAHEAD - prefetch for future read.
2505  *	handle_p - pointer to a handle pointer.
2506  *		   If the handle pointer is non-null, its used as a
2507  *		   pre-allocated handle. Else a new handle will be allocated
2508  *		   and stored in *handle_p
2509  *
2510  * RETURNS:
2511  * 	errno if return > 0.
2512  *	else NSC_HIT or NSC_DONE on success
2513  *	or   NSC_PENDING on io in progress and NSC_NOBLOCK
2514  *		specified in the flag.
2515  * USAGE:
2516  *	This routine allocates the cache blocks requested and creates a list
2517  *	of entries for this request.
2518  *	If NSC_NOBLOCK was not specified, this call could block on read io.
2519  *	If flag specified NSC_RDBUF and the request is not an entire
2520  *	hit, an io is initiated.
2521  */
2522 int
2523 _sd_alloc_buf(blind_t xcd, nsc_off_t fba_pos, nsc_size_t fba_len, int flag,
2524     _sd_buf_handle_t **handle_p)
2525 {
2526 	int cd = (int)(unsigned long)xcd;
2527 	_sd_cd_info_t *cdi;
2528 	_sd_buf_handle_t *handle;
2529 	int sts;
2530 	nsc_off_t st_cblk, cblk; /* position of start and temp cache block */
2531 	sdbc_cblk_fba_t st_cblk_len;	/* FBA len of starting cache block */
2532 	sdbc_cblk_fba_t end_cblk_len;	/* FBA len of ending cache block */
2533 	sdbc_cblk_fba_t st_cblk_off;	/* FBA offset into starting cblock */
2534 	nsc_off_t io_pos;	/* offset in FBA's */
2535 	_sd_bufvec_t *bufvec;
2536 	_sd_cctl_t *centry, *lentry, *ioent = NULL;
2537 	nsc_size_t fba_orig_len = fba_len;	/* FBA length of orig request */
2538 	int stall, pageio;
2539 	unsigned char cc_flag;
2540 	int this_entry_type;
2541 	int locked = 0;
2542 	nsc_size_t dmchain_request_blocks; /* size of dmchain in cache blocks */
2543 	sdbc_allocbuf_t alloc_tok = {0};
2544 	int min_frag = 0;	/* frag statistics */
2545 	int max_frag = 0;	/* frag statistics */
2546 	int nfrags = 0;		/* frag statistics */
2547 #ifdef DEBUG
2548 	int err = 0;
2549 #endif
2550 
2551 
2552 	ASSERT(*handle_p != NULL);
2553 	handle = *handle_p;
2554 
2555 	if (_sdbc_shutdown_in_progress)
2556 		return (EIO);
2557 
2558 	if (xcd == NSC_ANON_CD)
2559 		cd = _CD_NOHASH;
2560 
2561 	KSTAT_RUNQ_ENTER(cd);
2562 
2563 	/*
2564 	 * Force large writes on nvram systems to be write-through to
2565 	 * avoid the (slow) bcopy into nvram.
2566 	 */
2567 
2568 	if (flag & NSC_WRBUF) {
2569 		if (fba_len > (nsc_size_t)sdbc_wrthru_len) {
2570 			flag |= NSC_WRTHRU;
2571 		}
2572 	}
2573 
2574 #ifdef DEBUG
2575 	if (sdbc_pageio_debug != SDBC_PAGEIO_OFF) {
2576 		switch (sdbc_pageio_debug) {
2577 		case SDBC_PAGEIO_RDEV:
2578 			if (cd != _CD_NOHASH &&
2579 			    sdbc_pageio_rdev != (dev_t)-1 &&
2580 			    _sd_cache_files[cd].cd_crdev == sdbc_pageio_rdev)
2581 				flag |= NSC_PAGEIO;
2582 			break;
2583 
2584 		case SDBC_PAGEIO_RAND:
2585 			if ((nsc_lbolt() % 3) == 0)
2586 				flag |= NSC_PAGEIO;
2587 			break;
2588 
2589 		case SDBC_PAGEIO_ALL:
2590 			flag |= NSC_PAGEIO;
2591 			break;
2592 		}
2593 	}
2594 #endif /* DEBUG */
2595 
2596 	if (fba_len > (nsc_size_t)BLK_FBAS) {
2597 		rw_enter(&sdbc_queue_lock, RW_WRITER);
2598 		locked = 1;
2599 	}
2600 
2601 	/*
2602 	 * _CD_NOHASH: client wants temporary (not hashed) cache memory
2603 	 * not associated with a local disk.  Skip local disk checks.
2604 	 */
2605 	if (cd == _CD_NOHASH) {
2606 		flag &= ~(NSC_RDBUF | NSC_WRBUF | NSC_RDAHEAD);
2607 		handle = *handle_p;
2608 		handle->bh_flag |= NSC_HACTIVE;
2609 		goto setup;
2610 	}
2611 
2612 	SDTRACE(ST_ENTER|SDF_ALLOCBUF, cd, fba_len, fba_pos, flag, 0);
2613 
2614 
2615 	if ((flag & NSC_RDAHEAD) && _sd_prefetch_opt) {
2616 		sts = _sd_prefetch_buf(cd, fba_pos, fba_len, flag, handle,
2617 		    locked);
2618 		goto done;
2619 	}
2620 
2621 #if !defined(_SD_NOCHECKS)
2622 	if (flag & NSC_RDAHEAD) { /* _sd_prefetch_opt == 0 */
2623 		nsc_size_t file_size;	/* file_size in FBA's */
2624 		/* prefetch: truncate if req'd */
2625 		if (fba_len > sdbc_max_fbas)
2626 			fba_len = sdbc_max_fbas;
2627 		file_size = _sd_cache_files[(cd)].cd_info->sh_filesize;
2628 		if ((fba_pos + fba_len) > file_size) {
2629 			fba_len = file_size - fba_pos;
2630 #ifdef NSC_MULTI_TERABYTE
2631 			if ((int64_t)fba_len <= 0) {
2632 #else
2633 			if ((int32_t)fba_len <= 0) {
2634 #endif
2635 				sts = EIO;
2636 				SDTRACE(ST_EXIT|SDF_ALLOCBUF, cd, fba_len,
2637 				    fba_pos, flag, sts);
2638 				goto done;
2639 			}
2640 		}
2641 	} else
2642 	if (sts = _sd_check_buffer_alloc(cd, fba_pos, fba_len, handle_p)) {
2643 		SDTRACE(ST_EXIT|SDF_ALLOCBUF, cd, fba_len, fba_pos, flag, sts);
2644 		goto done;
2645 	}
2646 #endif
2647 	if (fba_len == 0) {
2648 		SDTRACE(ST_EXIT|SDF_ALLOCBUF, cd, fba_len, fba_pos,
2649 		    flag, EINVAL);
2650 		sts = EINVAL;
2651 		goto done;
2652 	}
2653 
2654 	handle->bh_flag |= NSC_HACTIVE;
2655 	cdi = &_sd_cache_files[cd];
2656 
2657 	if (cdi->cd_recovering) {
2658 		/*
2659 		 * If recovering this device, then block all allocates
2660 		 * for reading or writing. If we allow reads then
2661 		 * this path could see old data before we recover.
2662 		 * If we allow writes then new data could be overwritten
2663 		 * by old data.
2664 		 * This is clearly still not a complete solution as
2665 		 * the thread doing this allocate could conceivably be
2666 		 * by this point (and in _sd_write/_sd_read for that matter
2667 		 * which don't even have this protection). But this type
2668 		 * of path seems to only exist in a failover situation
2669 		 * where a device has failed on the other node and works
2670 		 * on this node so the problem is not a huge one but exists
2671 		 * never the less.
2672 		 */
2673 		if (sts = _sd_recovery_wblk_wait(cd)) {
2674 			handle->bh_flag &= ~NSC_HACTIVE;
2675 			SDTRACE(ST_EXIT|SDF_ALLOCBUF, cd, fba_len, fba_pos,
2676 			    flag, sts);
2677 			goto done;
2678 		}
2679 	}
2680 
2681 	/* write & disk failed, return error immediately */
2682 	if ((flag & NSC_WRBUF) && cdi->cd_info->sh_failed) {
2683 		handle->bh_flag &= ~NSC_HACTIVE;
2684 		SDTRACE(ST_EXIT|SDF_ALLOCBUF, cd, fba_len, fba_pos, flag, EIO);
2685 		sts = EIO;
2686 		goto done;
2687 	}
2688 
2689 setup:
2690 
2691 	_SD_SETUP_HANDLE(handle, cd, fba_pos, fba_len, flag);
2692 	handle->bh_centry = NULL;
2693 	bufvec = handle->bh_bufvec;
2694 	if (flag & NSC_RDAHEAD) { /* _sd_prefetch_opt == 0 */
2695 		/* CKD prefetch: bufvec not req'd, use placeholder */
2696 		bufvec->bufaddr = NULL;
2697 		bufvec->bufvmeaddr = NULL;
2698 		bufvec->buflen  = 0;
2699 		bufvec = _prefetch_sb_vec;
2700 	}
2701 	st_cblk = FBA_TO_BLK_NUM(fba_pos);
2702 	st_cblk_off = BLK_FBA_OFF(fba_pos);
2703 	st_cblk_len = BLK_FBAS - st_cblk_off;
2704 	if ((nsc_size_t)st_cblk_len >= fba_len) {
2705 		end_cblk_len = 0;
2706 		st_cblk_len = (sdbc_cblk_fba_t)fba_len;
2707 	} else
2708 		end_cblk_len = BLK_FBA_OFF(fba_pos + fba_len);
2709 	cblk = st_cblk;
2710 
2711 
2712 	/*
2713 	 * count number of blocks on chain that is required
2714 	 */
2715 
2716 	/* middle piece */
2717 	dmchain_request_blocks =
2718 	    (fba_len - (st_cblk_len + end_cblk_len)) >> BLK_FBA_SHFT;
2719 
2720 	/* start piece */
2721 	++dmchain_request_blocks;
2722 
2723 	/* end piece */
2724 	if (end_cblk_len)
2725 		++dmchain_request_blocks;
2726 
2727 
2728 	cc_flag = 0;
2729 	if ((handle->bh_flag & NSC_PINNABLE) && (handle->bh_flag & NSC_WRBUF))
2730 		cc_flag |= CC_PINNABLE;
2731 	if (handle->bh_flag & (NSC_NOCACHE|NSC_SEQ_IO))
2732 		cc_flag |= CC_QHEAD;
2733 	lentry = NULL;
2734 	stall = 0;
2735 
2736 	do {
2737 		pageio = ((flag & NSC_PAGEIO) != 0 || sdbc_pageio_always != 0);
2738 cget:
2739 		if ((centry = (_sd_cctl_t *)
2740 		    _sd_hash_search(cd, cblk, _sd_htable)) != 0) {
2741 
2742 			if (SET_CENTRY_INUSE(centry)) {
2743 				/* already inuse: wait for block, retry */
2744 				sdbc_allocb_inuse++;
2745 				if (locked)
2746 					rw_exit(&sdbc_queue_lock);
2747 				_sd_cc_wait(cd, cblk, centry, CC_INUSE);
2748 				if (locked)
2749 					rw_enter(&sdbc_queue_lock, RW_WRITER);
2750 				goto cget;
2751 			}
2752 
2753 			/*
2754 			 * bug 4529671
2755 			 * now that we own the centry make sure that
2756 			 * it is still good.  it could have been processed
2757 			 * by _sd_dealloc_dm() in the window between
2758 			 * _sd_hash_search() and SET_CENTRY_INUSE().
2759 			 */
2760 			if ((_sd_cctl_t *)
2761 			    _sd_hash_search(cd, cblk, _sd_htable) != centry) {
2762 				sdbc_allocb_deallocd++;
2763 #ifdef DEBUG
2764 				cmn_err(CE_WARN,
2765 				    "!centry %p cd %d cblk %" NSC_SZFMT
2766 				    " fba_len %" NSC_SZFMT " lost to dealloc?! "
2767 				    "cc_data %p", (void *)centry, cd, cblk,
2768 				    fba_orig_len, (void *)centry->cc_data);
2769 #endif
2770 
2771 				CLEAR_CENTRY_INUSE(centry);
2772 				goto cget;
2773 			}
2774 
2775 			if (CC_CD_BLK_MATCH(cd, cblk, centry)) {
2776 				/*
2777 				 * Do pagelist io mutual exclusion
2778 				 * before messing with the centry.
2779 				 */
2780 				if (pageio && SET_CENTRY_PAGEIO(centry)) {
2781 					/* wait for flusher to finish pageio */
2782 					sdbc_allocb_pageio1++;
2783 
2784 					CLEAR_CENTRY_INUSE(centry);
2785 					if (locked)
2786 						rw_exit(&sdbc_queue_lock);
2787 					_sd_cc_wait(cd, cblk, centry,
2788 					    CC_PAGEIO);
2789 					if (locked)
2790 						rw_enter(&sdbc_queue_lock,
2791 						    RW_WRITER);
2792 					goto cget;
2793 				}
2794 
2795 				sdbc_allocb_hit++;
2796 				this_entry_type = HASH_ENTRY_DM;
2797 				pageio = 0;
2798 				centry->cc_toflush = 0;
2799 
2800 				centry->cc_hits++;
2801 
2802 				/* this will reset the age flag */
2803 				sdbc_centry_init_dm(centry);
2804 
2805 				DTRACE_PROBE1(_sd_alloc_buf1,
2806 				    _sd_cctl_t *, centry);
2807 			} else {
2808 				/* block mismatch: release, alloc new block */
2809 				sdbc_allocb_lost++;
2810 
2811 				CLEAR_CENTRY_INUSE(centry);
2812 
2813 				goto cget;
2814 
2815 			}
2816 		} else {
2817 			centry = sdbc_centry_alloc(cd, cblk,
2818 			    dmchain_request_blocks, &stall,
2819 			    &alloc_tok, locked ? ALLOC_LOCKED : 0);
2820 
2821 			/*
2822 			 * dmchaining adjustment.
2823 			 * if centry was obtained from the dmchain
2824 			 * then clear local pageio variable because the
2825 			 * centry already has cc_pageio set.
2826 			 */
2827 			if (CENTRY_PAGEIO(centry))
2828 				pageio = 0;
2829 
2830 			DTRACE_PROBE1(_sd_alloc_buf2, _sd_cctl_t *, centry);
2831 
2832 			this_entry_type = ELIGIBLE_ENTRY_DM;
2833 			if (centry->cc_aging_dm & FOUND_IN_HASH_DM)
2834 				this_entry_type = HASH_ENTRY_DM;
2835 			else {
2836 				if (centry->cc_aging_dm & FOUND_HOLD_OVER_DM)
2837 					this_entry_type = HOLD_ENTRY_DM;
2838 			}
2839 		}
2840 
2841 		centry->cc_aging_dm &= ~(FOUND_IN_HASH_DM|FOUND_HOLD_OVER_DM);
2842 
2843 		/*
2844 		 * Do pagelist io mutual exclusion now if we did not do
2845 		 * it above.
2846 		 */
2847 
2848 		if (pageio && SET_CENTRY_PAGEIO(centry)) {
2849 			/* wait for flusher to finish pageio */
2850 			sdbc_allocb_pageio2++;
2851 
2852 
2853 			CLEAR_CENTRY_INUSE(centry);
2854 			if (locked)
2855 				rw_exit(&sdbc_queue_lock);
2856 			_sd_cc_wait(cd, cblk, centry, CC_PAGEIO);
2857 			if (locked)
2858 				rw_enter(&sdbc_queue_lock, RW_WRITER);
2859 			goto cget;
2860 		}
2861 
2862 		pageio = 0;
2863 
2864 		if (CENTRY_DIRTY(centry)) {
2865 			/*
2866 			 * end action might set PEND_DIRTY flag
2867 			 * must lock if need to change flag bits
2868 			 */
2869 			if (centry->cc_flag != (centry->cc_flag | cc_flag)) {
2870 				/* was FAST */
2871 				mutex_enter(&centry->cc_lock);
2872 				centry->cc_flag |= cc_flag;
2873 				/* was FAST */
2874 				mutex_exit(&centry->cc_lock);
2875 			}
2876 		} else
2877 			centry->cc_flag |= cc_flag;
2878 
2879 		centry->cc_chain = NULL;
2880 
2881 		/*
2882 		 * step 0:check valid bits in each cache ele as
2883 		 * the chain grows - set ioent/io_pos to first
2884 		 * instance of invalid data
2885 		 */
2886 		if (cblk == st_cblk) {
2887 			handle->bh_centry = centry;
2888 			fba_len -= st_cblk_len;
2889 			lentry = centry;
2890 			if (flag & NSC_RDBUF)  {
2891 				if (!SDBC_VALID_BITS(st_cblk_off, st_cblk_len,
2892 				    centry)) {
2893 					io_pos = fba_pos;
2894 					ioent = centry;
2895 				} else {
2896 					DATA_LOG(SDF_ALLOC, centry, st_cblk_off,
2897 					    st_cblk_len);
2898 
2899 					DTRACE_PROBE4(_sd_alloc_data1,
2900 					    uint64_t, (uint64_t)
2901 					    (BLK_TO_FBA_NUM(cblk) +
2902 					    st_cblk_off), int, st_cblk_len,
2903 					    char *, *(int64_t *)
2904 					    (centry->cc_data +
2905 					    FBA_SIZE(st_cblk_off)),
2906 					    char *, *(int64_t *)
2907 					    (centry->cc_data +
2908 					    FBA_SIZE(st_cblk_off + st_cblk_len)
2909 					    - 8));
2910 				}
2911 			}
2912 			cblk++;
2913 		} else if (fba_len == (nsc_size_t)end_cblk_len) {
2914 			lentry->cc_chain = centry;
2915 			fba_len -= end_cblk_len;
2916 			if (flag & NSC_RDBUF) {
2917 				if (ioent == NULL) {
2918 					if (!SDBC_VALID_BITS(0, end_cblk_len,
2919 					    centry)) {
2920 						io_pos = BLK_TO_FBA_NUM(cblk);
2921 						ioent = centry;
2922 					} else {
2923 						DATA_LOG(SDF_ALLOC, centry, 0,
2924 						    end_cblk_len);
2925 
2926 						DTRACE_PROBE4(_sd_alloc_data2,
2927 						    uint64_t,
2928 						    BLK_TO_FBA_NUM(cblk),
2929 						    int, end_cblk_len,
2930 						    char *, *(int64_t *)
2931 						    (centry->cc_data),
2932 						    char *, *(int64_t *)
2933 						    (centry->cc_data +
2934 						    FBA_SIZE(end_cblk_len)
2935 						    - 8));
2936 					}
2937 				}
2938 			}
2939 		} else {
2940 			lentry->cc_chain = centry;
2941 			lentry = centry;
2942 			fba_len -= BLK_FBAS;
2943 			if (flag & NSC_RDBUF) {
2944 				if (ioent == NULL) {
2945 					if (!FULLY_VALID(centry)) {
2946 						io_pos = BLK_TO_FBA_NUM(cblk);
2947 						ioent = centry;
2948 					} else {
2949 						DATA_LOG(SDF_ALLOC, centry, 0,
2950 						    BLK_FBAS);
2951 
2952 						DTRACE_PROBE4(_sd_alloc_data3,
2953 						    uint64_t, (uint64_t)
2954 						    BLK_TO_FBA_NUM(cblk),
2955 						    int, BLK_FBAS,
2956 						    char *, *(int64_t *)
2957 						    (centry->cc_data),
2958 						    char *, *(int64_t *)
2959 						    (centry->cc_data +
2960 						    FBA_SIZE(BLK_FBAS) - 8));
2961 					}
2962 				}
2963 			}
2964 			cblk++;
2965 		}
2966 
2967 		/* if this block has a new identity clear prefetch history */
2968 		if (this_entry_type != HASH_ENTRY_DM)
2969 			centry->cc_aging_dm &=
2970 			    ~(PREFETCH_BUF_I | PREFETCH_BUF_E);
2971 
2972 		centry->cc_aging_dm &= ~(ENTRY_FIELD_DM);
2973 		centry->cc_aging_dm |= this_entry_type;
2974 		if (flag & NSC_METADATA)
2975 			centry->cc_aging_dm |= STICKY_METADATA_DM;
2976 
2977 		--dmchain_request_blocks;
2978 	} while (fba_len);
2979 
2980 	if (locked) {
2981 		rw_exit(&sdbc_queue_lock);
2982 		locked = 0;
2983 	}
2984 
2985 	ASSERT(dmchain_request_blocks == 0);
2986 
2987 	/*
2988 	 * do any necessary cleanup now that all the blocks are allocated.
2989 	 */
2990 	sdbc_centry_alloc_end(&alloc_tok);
2991 
2992 	/* be sure you nul term. the chain */
2993 	centry->cc_chain = NULL;
2994 
2995 	/*
2996 	 * step one: establish HOST/PARASITE/OTHER relationships
2997 	 * between the centry ele in the list and calc the alloc size
2998 	 * (fill in CATAGORY based on TYPE and immediate neighbors)
2999 	 */
3000 	if (sts = _sd_setup_category_on_type(handle->bh_centry)) {
3001 #ifdef DEBUG
3002 		err = _sd_free_buf(handle);
3003 		if (err) {
3004 			cmn_err(CE_WARN, "!sdbc(_sd_alloc_buf): _sd_free_buf "
3005 			    "failed: err:%d handle:%p", err, (void *)handle);
3006 		}
3007 #else
3008 		(void) _sd_free_buf(handle);
3009 #endif
3010 		goto done;
3011 	}
3012 
3013 	/*
3014 	 * step two: alloc the needed mem and fill in the data and chaining
3015 	 * fields (leave bufvec for step three)
3016 	 */
3017 	(void) _sd_setup_mem_chaining(handle->bh_centry, 0);
3018 
3019 	/*
3020 	 * step three: do the bufvec
3021 	 */
3022 	fba_len = fba_orig_len;
3023 	centry = handle->bh_centry;
3024 	bufvec = handle->bh_bufvec;
3025 
3026 	while (centry) {
3027 		DTRACE_PROBE3(_sd_alloc_buf_centrys, _sd_cctl_t *, centry,
3028 		    int, cd, uint64_t,
3029 		    (uint64_t)BLK_TO_FBA_NUM(CENTRY_BLK(centry)));
3030 
3031 		if (fba_len == fba_orig_len) {
3032 			bufvec->bufaddr = (centry->cc_data +
3033 			    FBA_SIZE(st_cblk_off));
3034 			bufvec->bufvmeaddr = 0; /* not used */
3035 			bufvec->buflen  = FBA_SIZE(st_cblk_len);
3036 			bufvec++;
3037 			fba_len -= st_cblk_len;
3038 		} else if (fba_len == (nsc_size_t)end_cblk_len) {
3039 			_sd_bufvec_t *pbufvec = bufvec - 1;
3040 
3041 			if ((pbufvec->bufaddr + pbufvec->buflen) ==
3042 			    centry->cc_data) {
3043 				/* contiguous */
3044 				pbufvec->buflen += FBA_SIZE(end_cblk_len);
3045 			} else {
3046 
3047 				bufvec->bufaddr = centry->cc_data;
3048 				bufvec->bufvmeaddr = 0; /* not used */
3049 				bufvec->buflen = FBA_SIZE(end_cblk_len);
3050 				bufvec++;
3051 			}
3052 
3053 			fba_len -= end_cblk_len;
3054 		} else {
3055 			_sd_bufvec_t *pbufvec = bufvec - 1;
3056 
3057 			if ((pbufvec->bufaddr + pbufvec->buflen) ==
3058 			    centry->cc_data) {
3059 				/* contiguous */
3060 				pbufvec->buflen += CACHE_BLOCK_SIZE;
3061 			} else {
3062 
3063 				bufvec->bufaddr = centry->cc_data;
3064 				bufvec->bufvmeaddr = 0; /* not used */
3065 				bufvec->buflen  = CACHE_BLOCK_SIZE;
3066 				bufvec++;
3067 			}
3068 
3069 			fba_len -= BLK_FBAS;
3070 		}
3071 
3072 		centry = centry->cc_chain;
3073 	}
3074 
3075 	/* be sure you nul term. the chain */
3076 	bufvec->bufaddr = NULL;
3077 	bufvec->bufvmeaddr = 0;
3078 	bufvec->buflen = 0;
3079 
3080 	/* frag statistics */
3081 	{
3082 		_sd_bufvec_t *tbufvec;
3083 
3084 		for (tbufvec = handle->bh_bufvec; tbufvec != bufvec;
3085 		    ++tbufvec) {
3086 			if ((min_frag > tbufvec->buflen) || (min_frag == 0))
3087 				min_frag = tbufvec->buflen;
3088 
3089 			if (max_frag < tbufvec->buflen)
3090 				max_frag = tbufvec->buflen;
3091 		}
3092 
3093 		nfrags = bufvec - handle->bh_bufvec;
3094 		min_frag = FBA_LEN(min_frag);
3095 		max_frag = FBA_LEN(max_frag);
3096 	}
3097 
3098 	/* buffer memory frag stats */
3099 	DTRACE_PROBE4(_sd_alloc_buf_frag, uint64_t, (uint64_t)fba_orig_len,
3100 	    int, nfrags, int, min_frag, int, max_frag);
3101 
3102 
3103 	if (flag & NSC_WRBUF) {
3104 		if (_SD_IS_WRTHRU(handle))
3105 			goto alloc_done;
3106 		if (_sd_alloc_write(handle->bh_centry, &stall)) {
3107 			_sd_unblock(&_sd_flush_cv);
3108 			handle->bh_flag |= NSC_FORCED_WRTHRU;
3109 		} else {
3110 			for (centry = handle->bh_centry;
3111 			    centry; centry = centry->cc_chain) {
3112 
3113 				CENTRY_SET_FTPOS(centry);
3114 				SSOP_SETCENTRY(sdbc_safestore,
3115 				    centry->cc_write);
3116 			}
3117 		}
3118 	}
3119 
3120 alloc_done:
3121 	if (locked) {
3122 		rw_exit(&sdbc_queue_lock);
3123 		locked = 0;
3124 	}
3125 	if (ioent) {
3126 		_SD_DISCONNECT_CALLBACK(handle);
3127 		sts = _sd_doread(handle,  ioent, io_pos,
3128 		    (fba_pos + fba_orig_len - io_pos), flag);
3129 		if (sts > 0)
3130 			(void) _sd_free_buf(handle);
3131 	} else
3132 		if (flag & NSC_RDBUF) {
3133 			CACHE_FBA_READ(cd, fba_orig_len);
3134 			CACHE_READ_HIT;
3135 			FBA_READ_IO_KSTATS(cd, FBA_SIZE(fba_orig_len));
3136 
3137 			sts = NSC_HIT;
3138 	} else
3139 		sts = (stall) ? NSC_DONE : NSC_HIT;
3140 
3141 	SDTRACE(ST_EXIT|SDF_ALLOCBUF, cd, fba_orig_len, fba_pos, flag, sts);
3142 
3143 done:
3144 	if (locked)
3145 		rw_exit(&sdbc_queue_lock);
3146 
3147 	KSTAT_RUNQ_EXIT(cd);
3148 
3149 	return (sts);
3150 }
3151 
/*
 * consistency checking for ccents
 *
 * Predicates over a cc_aging_dm style flag word.  Each one tests the
 * named *_ENTRY_DM bit(s) of its argument.  The argument is fully
 * parenthesized so that compound expressions (e.g. "a | b", "a ^ b" or
 * "c ? x : y") are tested as a whole rather than being split up by
 * operator precedence.
 */

#define	ELIGIBLE(p) ((p) & ELIGIBLE_ENTRY_DM)
#define	HOLD(p) ((p) & HOLD_ENTRY_DM)
#define	HASHE(p) ((p) & HASH_ENTRY_DM)

#define	HOST(p) ((p) & HOST_ENTRY_DM)
#define	PARA(p) ((p) & PARASITIC_ENTRY_DM)
/* "other": neither host, parasite nor eligible */
#define	OTHER(p) \
	(!((p) & (HOST_ENTRY_DM | PARASITIC_ENTRY_DM | ELIGIBLE_ENTRY_DM)))

#define	AVAIL(p) ((p) & AVAIL_ENTRY_DM)
3166 
3167 /*
3168  * sdbc_check_cctl_cot -- consistency check for _sd_setup_category_on_type()
3169  * may only be called on entry to state machine (when ccent is either
3170  * ELIGIBLE_ENTRY_DM, HOLD_ENTRY_DM or HASH_ENTRY_DM).
3171  *
3172  * print message or panic (DEBUG) if inconsistency detected.
3173  */
3174 static int
3175 sdbc_check_cctl_cot(_sd_cctl_t *centry)
3176 {
3177 	uint_t age;
3178 	int size;
3179 	uchar_t *data;
3180 	int host_or_other;
3181 	int para;
3182 	int ccent_ok = 1;
3183 
3184 	age = centry->cc_aging_dm;
3185 	size = centry->cc_alloc_size_dm;
3186 	data = centry->cc_data;
3187 	host_or_other = size && data;
3188 	para = !size && data;
3189 
3190 	/*
3191 	 * on entry to _sd_setup_category_on_type(),
3192 	 * one of three mutually exclusive entry field bits must be set
3193 	 */
3194 
3195 	switch ((age & (ELIGIBLE_ENTRY_DM | HOLD_ENTRY_DM | HASH_ENTRY_DM))) {
3196 		case ELIGIBLE_ENTRY_DM:
3197 		case HOLD_ENTRY_DM:
3198 		case HASH_ENTRY_DM:
3199 			/* ok */
3200 			break;
3201 		default:
3202 			/* zero or multiple flag bits */
3203 			ccent_ok = 0;
3204 			break;
3205 	}
3206 
3207 	/* categories are mutually exclusive */
3208 	if (HOST(age) && PARA(age))
3209 		ccent_ok = 0;
3210 
3211 	/* these bits should be cleared out (STICKY_METADATA_DM not used) */
3212 	if (age & (AVAIL_ENTRY_DM | FOUND_HOLD_OVER_DM | FOUND_IN_HASH_DM |
3213 	    STICKY_METADATA_DM))
3214 		ccent_ok = 0;
3215 
3216 	/* eligible has no data and no size */
3217 	if (ELIGIBLE(age) && (size || data))
3218 		ccent_ok = 0;
3219 
3220 	/* parasite has zero size and non-zero data */
3221 	if (PARA(age) && !para)
3222 		ccent_ok = 0;
3223 
3224 	/* host has non-zero size and non-zero data */
3225 	if (HOST(age) && !host_or_other)
3226 		ccent_ok = 0;
3227 
3228 	/* "other" is just like a host */
3229 	if (OTHER(age) && !host_or_other)
3230 		ccent_ok = 0;
3231 
3232 	/* a HOLD or a HASH must have a size */
3233 	if ((size) && !(age & (HASH_ENTRY_DM | HOLD_ENTRY_DM)))
3234 		ccent_ok = 0;
3235 
3236 	if (!ccent_ok)
3237 		cmn_err(cmn_level,
3238 		    "!sdbc(sdbc_check_cctl_cot): inconsistent ccent %p "
3239 		    "age %x size %d data %p", (void *)centry, age, size,
3240 		    (void *)data);
3241 
3242 	return (ccent_ok);
3243 }
3244 
3245 /*
3246  * sdbc_mark_cctl_cot  -- mark cctls bad and invalidate when
3247  *			  inconsistency found in _sd_setup_category_on_type()
3248  * returns nothing
3249  *
3250  * Note:  this is an error recovery path that is triggered when an
3251  * inconsistency in a cctl is detected.  _sd_centry_release() will take
3252  * these cache entries out of circulation and place them on a separate list
3253  * for debugging purposes.
3254  */
3255 void
3256 sdbc_mark_cctl_cot(_sd_cctl_t *header, _sd_cctl_t *centry)
3257 {
3258 	_sd_cctl_t *cur_ent = header;
3259 
3260 	/* the entire chain is guilty by association */
3261 	while (cur_ent) {
3262 
3263 		(void) _sd_hash_delete((struct _sd_hash_hd *)cur_ent,
3264 		    _sd_htable);
3265 
3266 		cur_ent->cc_aging_dm |= BAD_CHAIN_DM;
3267 
3268 		cur_ent = cur_ent->cc_chain;
3269 	}
3270 
3271 	centry->cc_aging_dm |= BAD_ENTRY_DM; /* this is the problem child */
3272 }
3273 
3274 /*
3275  * _sd_setup_category_on_type(_sd_cctl_t *) - Setup the centry CATEGORY based on
3276  * centry TYPE and immediate neighbors. Identify each eligible (ie not HASH)
3277  * centry as a host/parasite. host actually have memory allocated to
3278  * them and parasites are chained to the host and point to page offsets within
3279  * the host's memory.
3280  *
3281  * RETURNS:
3282  *	0 on success, EINTR if inconsistency detected in centry
3283  *
3284  * Note:
3285  *	none
3286  */
3287 static int
3288 _sd_setup_category_on_type(_sd_cctl_t *header)
3289 {
3290 	_sd_cctl_t *prev_ent, *next_ent, *centry;
3291 	_sd_cctl_t *anchor = NULL;
3292 	int	 current_pest_count, local_max_dyn_list;
3293 	int	 cl;
3294 	int ret = 0;
3295 
3296 	ASSERT(header);
3297 
3298 	if (sdbc_use_dmchain)
3299 		local_max_dyn_list = max_dm_queues - 1;
3300 	else {
3301 		/* pickup a fresh copy - has the world changed */
3302 		local_max_dyn_list = dynmem_processing_dm.max_dyn_list;
3303 	}
3304 
3305 	prev_ent = 0;
3306 	centry = header;
3307 	next_ent = centry->cc_chain;
3308 	current_pest_count = 0;
3309 	cl = 2;
3310 
3311 	/* try to recover from bad cctl */
3312 	if (sdbc_check_cot && !sdbc_check_cctl_cot(centry))
3313 		ret = EINTR;
3314 
3315 	while (cl && (ret == 0)) {
3316 		switch (cl) {
3317 			case (1):  /* chain to next/monitor for completion */
3318 				prev_ent = centry;
3319 				centry = next_ent;
3320 				next_ent = 0;
3321 				cl = 0;
3322 				if (centry) {
3323 
3324 					if (sdbc_check_cot &&
3325 					    !sdbc_check_cctl_cot(centry)) {
3326 						ret = EINTR;
3327 						break;
3328 					}
3329 
3330 					next_ent = centry->cc_chain;
3331 					cl = 2;
3332 				}
3333 			break;
3334 
3335 			case (2): /* vector to appropriate routine */
3336 				if (!(centry->cc_aging_dm & ELIGIBLE_ENTRY_DM))
3337 					cl = 5;
3338 				else if (prev_ent && (prev_ent->cc_aging_dm &
3339 				    ELIGIBLE_ENTRY_DM))
3340 					cl = 15;
3341 				else
3342 					cl = 10;
3343 			break;
3344 
3345 			case (5): /* process NON-ELIGIBLE entries */
3346 				if (!(centry->cc_aging_dm &
3347 				    (HASH_ENTRY_DM|HOLD_ENTRY_DM))) {
3348 					/* no catagory */
3349 
3350 					/* consistency check */
3351 					if (centry->cc_alloc_size_dm ||
3352 					    centry->cc_data) {
3353 						cmn_err(cmn_level,
3354 						    "!sdbc(setup_cot): "
3355 						    "OTHER with data/size %p",
3356 						    (void *)centry);
3357 
3358 						ret = EINTR;
3359 						break;
3360 					}
3361 
3362 					centry->cc_aging_dm &=
3363 					    ~CATAGORY_ENTRY_DM;
3364 					centry->cc_alloc_size_dm = BLK_SIZE(1);
3365 					DTRACE_PROBE1(_sd_setup_category,
3366 					    _sd_cctl_t *, centry);
3367 				}
3368 				cl = 1;
3369 			break;
3370 
3371 			/*
3372 			 * no prev entry (ie top of list) or no prev
3373 			 * ELIGIBLE entry
3374 			 */
3375 			case (10):
3376 				/*
3377 				 * this is an eligible entry, does it start
3378 				 * a list or is it a loner
3379 				 */
3380 				/* consistency check */
3381 				if (centry->cc_alloc_size_dm ||
3382 				    centry->cc_data) {
3383 					cmn_err(cmn_level, "!sdbc(setup_cot): "
3384 					    "HOST with data/size %p",
3385 					    (void *)centry);
3386 					ret = EINTR;
3387 					break;
3388 				}
3389 
3390 				if (next_ent && (next_ent->cc_aging_dm &
3391 				    ELIGIBLE_ENTRY_DM)) {
3392 
3393 
3394 					/* it starts a list */
3395 					/* host catagory */
3396 					centry->cc_aging_dm |= HOST_ENTRY_DM;
3397 					/* start out with one page */
3398 					centry->cc_alloc_size_dm = BLK_SIZE(1);
3399 					anchor = centry;
3400 					DTRACE_PROBE1(_sd_setup_category,
3401 					    _sd_cctl_t *, anchor);
3402 					cl = 1;
3403 				} else {
3404 					/*
3405 					 * it's a loner
3406 					 * drop status to no category and
3407 					 * restart
3408 					 */
3409 					cl = 2;
3410 					centry->cc_aging_dm &=
3411 					    ~ELIGIBLE_ENTRY_DM;
3412 				}
3413 			break;
3414 
3415 			case (15): /* default to parasite catagory */
3416 
3417 				/* consistency check */
3418 				if (centry->cc_alloc_size_dm ||
3419 				    centry->cc_data) {
3420 					cmn_err(cmn_level, "!sdbc(setup_cot): "
3421 					    "PARA with data/size %p",
3422 					    (void *)centry);
3423 
3424 					ret = EINTR;
3425 					break;
3426 				}
3427 
3428 				if (current_pest_count < local_max_dyn_list-1) {
3429 					/* continue to grow the pest list */
3430 					current_pest_count++;
3431 					centry->cc_aging_dm |=
3432 					    PARASITIC_ENTRY_DM;
3433 
3434 					/*
3435 					 * offset of host ent mem this will pt
3436 					 * to
3437 					 */
3438 					centry->cc_alloc_size_dm =
3439 					    anchor->cc_alloc_size_dm;
3440 					/*
3441 					 * up the host mem req by one for
3442 					 * this parasite
3443 					 */
3444 					DTRACE_PROBE1(_sd_setup_category,
3445 					    _sd_cctl_t *, centry);
3446 
3447 					anchor->cc_alloc_size_dm += BLK_SIZE(1);
3448 
3449 					cl = 1;
3450 				} else {
3451 					/*
3452 					 * term this pest list - restart fresh
3453 					 * on this entry
3454 					 */
3455 					current_pest_count = 0;
3456 					prev_ent->cc_aging_dm &=
3457 					    ~(HOST_ENTRY_DM|ELIGIBLE_ENTRY_DM);
3458 					cl = 2;
3459 				}
3460 			break;
3461 			} /* switch(cl) */
3462 	} /* while (cl) */
3463 
3464 	if (ret != 0)
3465 		sdbc_mark_cctl_cot(header, centry);
3466 
3467 	return (ret);
3468 }
3469 
3470 /*
3471  * _sd_setup_mem_chaining(_sd_cctl_t *) - Allocate memory, setup
3472  * mem ptrs an host/pest chaining. Do the actual allocation as described in
3473  * sd_setup_category_on_type().
3474  *
3475  * RETURNS:
3476  *	0 on success
3477  *	non-zero on error
3478  *
3479  * Note:
3480  *	if called with ALLOC_NOWAIT, caller must check for non-zero return
3481  */
3482 static int
3483 _sd_setup_mem_chaining(_sd_cctl_t *header, int flag)
3484 {
3485 	_sd_cctl_t *prev_ent, *next_ent, *centry;
3486 	_sd_cctl_t *anchor = NULL;
3487 	int cl, rc = 0;
3488 
3489 	ASSERT(header);
3490 
3491 	if (!header)
3492 		return (0);
3493 
3494 	prev_ent = 0;
3495 	centry = header;
3496 	next_ent = centry->cc_chain;
3497 	cl = 2;
3498 	while (cl) {
3499 		switch (cl) {
3500 			case (1):  /* chain to next/monitor for completion */
3501 				centry->cc_aging_dm &= ~ELIGIBLE_ENTRY_DM;
3502 				prev_ent = centry;
3503 				centry = next_ent;
3504 				next_ent = 0;
3505 				cl = 0;
3506 				if (centry) {
3507 					next_ent = centry->cc_chain;
3508 					cl = 2;
3509 				}
3510 			break;
3511 
3512 			case (2): /* vector to appropriate routine */
3513 				if (centry->cc_aging_dm & HOST_ENTRY_DM)
3514 					cl = 10;
3515 				else if (centry->cc_aging_dm &
3516 				    PARASITIC_ENTRY_DM)
3517 					cl = 15;
3518 				else
3519 					cl = 5;
3520 			break;
3521 
3522 			case (5): /* OTHER processing - alloc mem */
3523 				if (rc = sdbc_centry_memalloc_dm(centry,
3524 				    centry->cc_alloc_size_dm, flag))
3525 					/* The allocation failed */
3526 					cl = 0;
3527 				else
3528 					cl = 1;
3529 			break;
3530 
3531 				/*
3532 				 * HOST entry processing - save the anchor pt,
3533 				 * alloc the memory,
3534 				 */
3535 			case (10): /* setup head and nxt ptrs */
3536 				anchor = centry;
3537 				if (rc = sdbc_centry_memalloc_dm(centry,
3538 				    centry->cc_alloc_size_dm, flag))
3539 					/* The allocation failed */
3540 					cl = 0;
3541 				else
3542 					cl = 1;
3543 			break;
3544 
3545 				/*
3546 				 * PARASITIC entry processing - setup w/no
3547 				 * memory, setup head/next ptrs,
3548 				 */
3549 			case (15):
3550 				/*
3551 				 * fudge the data mem ptr to an offset from
3552 				 * the anchor alloc
3553 				 */
3554 				if (!(centry->cc_aging_dm &
3555 				    (HASH_ENTRY_DM| HOLD_ENTRY_DM))) {
3556 					centry->cc_head_dm = anchor;
3557 
3558 					/* chain prev to this */
3559 					prev_ent->cc_next_dm = centry;
3560 
3561 					/*
3562 					 * generate the actual data ptr into
3563 					 * host entry memory
3564 					 */
3565 					centry->cc_data = anchor->cc_data +
3566 					    centry->cc_alloc_size_dm;
3567 					centry->cc_alloc_size_dm = 0;
3568 				}
3569 				cl = 1;
3570 			break;
3571 		} /* switch(cl) */
3572 	} /* while (cl) */
3573 
3574 	return (rc);
3575 }
3576 
3577 /*
3578  * _sd_check_buffer_alloc - Check if buffer allocation is invalid.
3579  *
3580  * RETURNS:
3581  *	0 if its ok to continue with allocation.
3582  *	Else errno to be returned to the user.
3583  *
3584  * Note:
3585  *	This routine could block if the device is not local and
3586  *	recovery is in progress.
3587  */
3588 
3589 /* ARGSUSED */
3590 static int
3591 _sd_check_buffer_alloc(int cd, nsc_off_t fba_pos, nsc_size_t fba_len,
3592     _sd_buf_handle_t **hp)
3593 {
3594 	/*
3595 	 * This check exists to ensure that someone will not pass in an
3596 	 * arbitrary pointer and try to pass it off as a handle.
3597 	 */
3598 	if ((*hp)->bh_flag & (~_SD_VALID_FLAGS)) {
3599 		cmn_err(CE_WARN, "!sdbc(_sd_check_buffer_alloc) "
3600 		    "cd %d invalid handle %p flags %x",
3601 		    cd, (void *)*hp, (*hp)->bh_flag);
3602 		return (EINVAL);
3603 	}
3604 
3605 	if ((_sd_cache_initialized == 0) || (FILE_OPENED(cd) == 0)) {
3606 		cmn_err(CE_WARN, "!sdbc(_sd_check_buffer_alloc) "
3607 		    "cd %d not open. Cache init %d",
3608 		    cd, _sd_cache_initialized);
3609 		return (EINVAL);
3610 	}
3611 	ASSERT(cd >= 0);
3612 	if (!(_sd_cache_files[cd].cd_rawfd) ||
3613 	    !nsc_held(_sd_cache_files[cd].cd_rawfd)) {
3614 		cmn_err(CE_WARN,
3615 		    "!sdbc(_sd_check_buffer_alloc) cd %d is not attached", cd);
3616 		return (EINVAL);
3617 	}
3618 
3619 	ASSERT_IO_SIZE(fba_pos, fba_len, cd);
3620 	ASSERT_LEN(fba_len);
3621 
3622 	return (0);
3623 }
3624 
3625 /*
3626  * sdbc_check_handle -- check that handle is valid
3627  * return 1 if ok, 0 otherwise (if debug then panic).
3628  */
3629 static int
3630 sdbc_check_handle(_sd_buf_handle_t *handle)
3631 {
3632 	int ret = 1;
3633 
3634 	if (!_SD_HANDLE_ACTIVE(handle)) {
3635 
3636 		cmn_err(cmn_level, "!sdbc(_sd_free_buf): invalid handle %p"
3637 		    "cd %d fpos %" NSC_SZFMT " flen %" NSC_SZFMT " flag %x",
3638 		    (void *)handle, HANDLE_CD(handle), handle->bh_fba_pos,
3639 		    handle->bh_fba_len, handle->bh_flag);
3640 
3641 		ret = 0;
3642 	}
3643 
3644 	return (ret);
3645 }
3646 
3647 /*
3648  * _sd_free_buf -  Free the buffers allocated in _sd_alloc_buf.
3649  *
3650  * ARGUMENTS:
3651  *	handle	-  The handle allocated in _sd_alloc_buf.
3652  *
3653  * RETURNS:
3654  *	0 on success.
3655  *	Else errno.
3656  *
3657  * NOTE:
3658  *	If handle was allocated through _sd_alloc_buf, the handle allocated
3659  *	flag (NSC_HALLOCATED) will be reset by _sd_alloc_buf. This indicates
3660  *	that _sd_free_buf should free up the handle as well.
3661  *	All other handles directly allocated from _sd_alloc_handle will have
3662  *	that flag set. Any handle with valid blocks will have the handle
3663  *	active flag. It is an error if the active flag is not set.
3664  *	(if free_buf were called without going through alloc_buf)
3665  */
3666 
3667 int
3668 _sd_free_buf(_sd_buf_handle_t *handle)
3669 {
3670 	_sd_cctl_t *centry, *cc_chain;
3671 	int cd = HANDLE_CD(handle);
3672 	int flen = handle->bh_fba_len;
3673 	int fpos = handle->bh_fba_pos;
3674 
3675 	SDTRACE(ST_ENTER|SDF_FREEBUF, HANDLE_CD(handle),
3676 	    handle->bh_fba_len, handle->bh_fba_pos, 0, 0);
3677 
3678 	if (sdbc_check_handle(handle) == 0)
3679 		return (EINVAL);
3680 
3681 	if (handle->bh_flag & NSC_MIXED) {
3682 		/*
3683 		 * Data in this handle will be a mix of data from the
3684 		 * source device and data from another device, so
3685 		 * invalidate all the blocks.
3686 		 */
3687 		handle->bh_flag &= ~NSC_QUEUE;
3688 		centry = handle->bh_centry;
3689 		while (centry) {
3690 			centry->cc_valid = 0;
3691 			centry = centry->cc_chain;
3692 		}
3693 	}
3694 
3695 	if ((handle->bh_flag & NSC_QUEUE)) {
3696 		handle->bh_flag &= ~NSC_QUEUE;
3697 		_sd_queue_write(handle, handle->bh_fba_pos, handle->bh_fba_len);
3698 	}
3699 
3700 	handle->bh_flag &= ~NSC_HACTIVE;
3701 
3702 	centry = handle->bh_centry;
3703 	while (centry) {
3704 		cc_chain = centry->cc_chain;
3705 		_sd_centry_release(centry);
3706 		centry = cc_chain;
3707 	}
3708 
3709 	/*
3710 	 * help prevent dup call to _sd_centry_release if this handle
3711 	 * is erroneously _sd_free_buf'd twice.  (should not happen).
3712 	 */
3713 	handle->bh_centry = NULL;
3714 
3715 	if ((handle->bh_flag & NSC_HALLOCATED) == 0) {
3716 		handle->bh_flag |= NSC_HALLOCATED;
3717 		(void) _sd_free_handle(handle);
3718 	} else {
3719 		handle->bh_flag = NSC_HALLOCATED;
3720 	}
3721 
3722 	SDTRACE(ST_EXIT|SDF_FREEBUF, cd, flen, fpos, 0, 0);
3723 
3724 	return (0);
3725 }
3726 
3727 
3728 static int _sd_lruq_srch = 0x2000;
3729 
3730 /*
3731  * sdbc_get_dmchain -- get a candidate centry chain pointing to
3732  * 			contiguous memory
3733  *	ARGUMENTS:
3734  *	cblocks  - number of cache blocks requested
3735  *	stall	- pointer to stall count (no blocks avail)
3736  *	flag	- ALLOC_NOWAIT flag
3737  *
3738  *	RETURNS:
3739  * 		a cache entry or possible NULL if ALLOC_NOWAIT set
3740  *	USAGE:
3741  *		attempt to satisfy entire request from queue
3742  *		that has no memory allocated.
3743  *		if this fails then attempt a partial allocation
3744  *		with a preallocated block of requested size up to
3745  *		max_dyn_list.
3746  *		then look for largest chain less than max_dyn_list.
3747  */
3748 static _sd_cctl_t *
3749 sdbc_get_dmchain(int cblocks, int *stall, int flag)
3750 {
3751 	_sd_cctl_t *cc_dmchain = NULL;
3752 	_sd_queue_t *q;
3753 	_sd_cctl_t *qhead;
3754 	int num_tries;
3755 	int cblocks_orig = cblocks;
3756 	int nowait = flag & ALLOC_NOWAIT;
3757 	int i;
3758 
3759 	num_tries = _sd_lruq_srch;
3760 
3761 	ASSERT(cblocks != 0);
3762 
3763 	while (!cc_dmchain) {
3764 		/* get it from the os if possible */
3765 		q = &sdbc_dm_queues[0];
3766 		qhead = &(q->sq_qhead);
3767 
3768 		if (q->sq_inq >= cblocks) {
3769 			mutex_enter(&q->sq_qlock);
3770 			if (q->sq_inq >= cblocks) {
3771 				_sd_cctl_t *cc_ent;
3772 
3773 				cc_dmchain = qhead->cc_next;
3774 
3775 				/*
3776 				 * set the inuse and pageio bits
3777 				 * Note: this code expects the cc_ent to
3778 				 * be available.  no other thread may set the
3779 				 * inuse or pageio bit for an entry on the
3780 				 * 0 queue.
3781 				 */
3782 				cc_ent = qhead;
3783 				for (i = 0; i < cblocks; ++i) {
3784 					cc_ent = cc_ent->cc_next;
3785 
3786 					if (SET_CENTRY_INUSE(cc_ent)) {
3787 						cmn_err(CE_PANIC,
3788 						    "centry inuse on 0 q! %p",
3789 						    (void *)cc_ent);
3790 					}
3791 
3792 					if (SET_CENTRY_PAGEIO(cc_ent)) {
3793 						cmn_err(CE_PANIC,
3794 						    "centry pageio on 0 q! %p",
3795 						    (void *)cc_ent);
3796 					}
3797 				}
3798 				/* got a dmchain */
3799 
3800 				/* remove this chain from the 0 queue */
3801 				cc_dmchain->cc_prev->cc_next = cc_ent->cc_next;
3802 				cc_ent->cc_next->cc_prev = cc_dmchain->cc_prev;
3803 				cc_dmchain->cc_prev = NULL;
3804 				cc_ent->cc_next = NULL;
3805 
3806 				q->sq_inq -= cblocks;
3807 
3808 				ASSERT(GOOD_LRUSIZE(q));
3809 
3810 			}
3811 			mutex_exit(&q->sq_qlock);
3812 			if (cc_dmchain)
3813 				continue;
3814 		}
3815 
3816 		/* look for a pre-allocated block of the requested size */
3817 
3818 
3819 		if (cblocks > (max_dm_queues - 1))
3820 			cblocks = max_dm_queues - 1;
3821 
3822 		q = &sdbc_dm_queues[cblocks];
3823 		qhead = &(q->sq_qhead);
3824 
3825 		if (q->sq_inq != 0) {
3826 			_sd_cctl_t *tmp_dmchain;
3827 
3828 			mutex_enter(&q->sq_qlock);
3829 
3830 			for (tmp_dmchain = qhead->cc_next; tmp_dmchain != qhead;
3831 			    tmp_dmchain = tmp_dmchain->cc_next) {
3832 
3833 				/*
3834 				 * get a dmchain
3835 				 * set the inuse and pageio bits
3836 				 */
3837 				if (sdbc_dmchain_avail(tmp_dmchain)) {
3838 					/* put on MRU end of queue */
3839 					sdbc_requeue_dmchain(q, tmp_dmchain,
3840 					    1, 0);
3841 					cc_dmchain = tmp_dmchain;
3842 					break;
3843 				}
3844 				sdbc_dmchain_not_avail++;
3845 			}
3846 
3847 			mutex_exit(&q->sq_qlock);
3848 			if (cc_dmchain)
3849 				continue;
3850 		}
3851 
3852 		/*
3853 		 * spin block
3854 		 * nudge the deallocator,  accelerate ageing
3855 		 */
3856 
3857 		mutex_enter(&dynmem_processing_dm.thread_dm_lock);
3858 		cv_broadcast(&dynmem_processing_dm.thread_dm_cv);
3859 		mutex_exit(&dynmem_processing_dm.thread_dm_lock);
3860 
3861 		if (nowait)
3862 			break;
3863 
3864 		if (!(--num_tries)) {
3865 			delay(drv_usectohz(20000));
3866 			(void) (*stall)++;
3867 			num_tries = _sd_lruq_srch;
3868 			cblocks = cblocks_orig;
3869 		} else { /* see if smaller request size is available */
3870 			if (!(--cblocks))
3871 				cblocks = cblocks_orig;
3872 		}
3873 
3874 	} /* while (!cc_dmchain) */
3875 
3876 	return (cc_dmchain);
3877 }
3878 
3879 static int
3880 sdbc_dmchain_avail(_sd_cctl_t *cc_ent)
3881 {
3882 	int chain_avail = 1;
3883 	_sd_cctl_t *anchor = cc_ent;
3884 
3885 	while (cc_ent) {
3886 
3887 		ASSERT(_sd_cctl_valid(cc_ent));
3888 
3889 		if (cc_ent->cc_aging_dm & BAD_CHAIN_DM) {
3890 			chain_avail = 0;
3891 			break;
3892 		}
3893 
3894 		if (CENTRY_DIRTY(cc_ent)) {
3895 			chain_avail = 0;
3896 			break;
3897 		}
3898 		if (SET_CENTRY_INUSE(cc_ent)) {
3899 			chain_avail = 0;
3900 			break;
3901 		}
3902 
3903 		if ((SET_CENTRY_PAGEIO(cc_ent))) {
3904 
3905 			CLEAR_CENTRY_INUSE(cc_ent);
3906 			chain_avail = 0;
3907 			break;
3908 		}
3909 
3910 		if (CENTRY_DIRTY(cc_ent)) {
3911 
3912 			CLEAR_CENTRY_PAGEIO(cc_ent);
3913 			CLEAR_CENTRY_INUSE(cc_ent);
3914 			chain_avail = 0;
3915 			break;
3916 		}
3917 
3918 		cc_ent->cc_flag = 0;
3919 		cc_ent->cc_toflush = 0;
3920 
3921 		cc_ent = cc_ent->cc_next_dm;
3922 	}
3923 
3924 	if (!chain_avail)
3925 		sdbc_clear_dmchain(anchor, cc_ent);
3926 	else {
3927 		cc_ent = anchor;
3928 
3929 		/*
3930 		 * prevent possible deadlocks in _sd_cc_wait():
3931 		 * remove from hash and wakeup any waiters now that we
3932 		 * have acquired the chain.
3933 		 */
3934 		while (cc_ent) {
3935 			(void) _sd_hash_delete((struct _sd_hash_hd *)cc_ent,
3936 			    _sd_htable);
3937 
3938 			mutex_enter(&cc_ent->cc_lock);
3939 			if (cc_ent->cc_await_use) {
3940 				cv_broadcast(&cc_ent->cc_blkcv);
3941 			}
3942 			mutex_exit(&cc_ent->cc_lock);
3943 
3944 			cc_ent->cc_creat = nsc_lbolt();
3945 			cc_ent->cc_hits = 0;
3946 
3947 			cc_ent = cc_ent->cc_next_dm;
3948 		}
3949 	}
3950 
3951 	return (chain_avail);
3952 }
3953 
3954 static void
3955 sdbc_clear_dmchain(_sd_cctl_t *cc_ent_start, _sd_cctl_t *cc_ent_end)
3956 {
3957 	_sd_cctl_t *cc_ent = cc_ent_start;
3958 	_sd_cctl_t *prev_ent;
3959 
3960 	ASSERT(_sd_cctl_valid(cc_ent));
3961 
3962 	while (cc_ent != cc_ent_end) {
3963 
3964 		ASSERT(_sd_cctl_valid(cc_ent));
3965 
3966 		prev_ent = cc_ent;
3967 		cc_ent = cc_ent->cc_next_dm;
3968 
3969 		CLEAR_CENTRY_PAGEIO(prev_ent);
3970 		CLEAR_CENTRY_INUSE(prev_ent);
3971 	}
3972 
3973 }
3974 
3975 /*
3976  * put a dmchain on the LRU end of a queue
3977  */
3978 void
3979 sdbc_ins_dmqueue_front(_sd_queue_t *q, _sd_cctl_t *cc_ent)
3980 {
3981 	_sd_cctl_t *qhead = &(q->sq_qhead);
3982 
3983 	ASSERT(_sd_cctl_valid(cc_ent));
3984 
3985 	mutex_enter(&q->sq_qlock);
3986 	cc_ent->cc_next = qhead->cc_next;
3987 	cc_ent->cc_prev = qhead;
3988 	qhead->cc_next->cc_prev = cc_ent;
3989 	qhead->cc_next = cc_ent;
3990 	q->sq_inq++;
3991 	cc_ent->cc_cblocks = q->sq_dmchain_cblocks;
3992 
3993 	ASSERT(GOOD_LRUSIZE(q));
3994 
3995 	mutex_exit(&q->sq_qlock);
3996 
3997 }
3998 
3999 /*
4000  * put a dmchain on the MRU end of a queue
4001  */
4002 static void
4003 sdbc_ins_dmqueue_back(_sd_queue_t *q, _sd_cctl_t *cc_ent)
4004 {
4005 	_sd_cctl_t *qhead = &(q->sq_qhead);
4006 
4007 	ASSERT(_sd_cctl_valid(cc_ent));
4008 
4009 	mutex_enter(&q->sq_qlock);
4010 	cc_ent->cc_next = qhead;
4011 	cc_ent->cc_prev = qhead->cc_prev;
4012 	qhead->cc_prev->cc_next = cc_ent;
4013 	qhead->cc_prev = cc_ent;
4014 	cc_ent->cc_seq = q->sq_seq++;
4015 	q->sq_inq++;
4016 	cc_ent->cc_cblocks = q->sq_dmchain_cblocks;
4017 
4018 	ASSERT(GOOD_LRUSIZE(q));
4019 
4020 	mutex_exit(&q->sq_qlock);
4021 
4022 }
4023 
4024 /*
4025  * remove dmchain from a queue
4026  */
4027 void
4028 sdbc_remq_dmchain(_sd_queue_t *q, _sd_cctl_t *cc_ent)
4029 {
4030 
4031 	ASSERT(_sd_cctl_valid(cc_ent));
4032 
4033 	mutex_enter(&q->sq_qlock);
4034 	cc_ent->cc_prev->cc_next = cc_ent->cc_next;
4035 	cc_ent->cc_next->cc_prev = cc_ent->cc_prev;
4036 	cc_ent->cc_next = cc_ent->cc_prev = NULL; /* defensive programming */
4037 	cc_ent->cc_cblocks = -1; /* indicate not on any queue */
4038 
4039 	q->sq_inq--;
4040 
4041 	ASSERT(GOOD_LRUSIZE(q));
4042 
4043 	mutex_exit(&q->sq_qlock);
4044 
4045 }
4046 
4047 /*
4048  * requeue a dmchain to the MRU end of its queue.
4049  * if getlock is 0 on entry the queue lock (sq_qlock) must be held
4050  */
4051 void
4052 sdbc_requeue_dmchain(_sd_queue_t *q, _sd_cctl_t *cc_ent, int mru,
4053 			int getlock)
4054 {
4055 	_sd_cctl_t *qhead = &(q->sq_qhead);
4056 
4057 
4058 	ASSERT(_sd_cctl_valid(cc_ent));
4059 
4060 	if (getlock)
4061 		mutex_enter(&q->sq_qlock);
4062 
4063 	/* inline of sdbc_remq_dmchain() */
4064 	cc_ent->cc_prev->cc_next = cc_ent->cc_next;
4065 	cc_ent->cc_next->cc_prev = cc_ent->cc_prev;
4066 
4067 	if (mru) { /* put on MRU end of queue */
4068 		/* inline of sdbc_ins_dmqueue_back */
4069 		cc_ent->cc_next = qhead;
4070 		cc_ent->cc_prev = qhead->cc_prev;
4071 		qhead->cc_prev->cc_next = cc_ent;
4072 		qhead->cc_prev = cc_ent;
4073 		cc_ent->cc_seq = q->sq_seq++;
4074 		(q->sq_req_stat)++;
4075 	} else { /* put on LRU end of queue i.e. requeue to head */
4076 		/* inline of sdbc_ins_dmqueue_front */
4077 		cc_ent->cc_next = qhead->cc_next;
4078 		cc_ent->cc_prev = qhead;
4079 		qhead->cc_next->cc_prev = cc_ent;
4080 		qhead->cc_next = cc_ent;
4081 		cc_ent->cc_seq = q->sq_seq++;
4082 
4083 		/*
4084 		 * clear the CC_QHEAD bit on all members of the chain
4085 		 */
4086 		{
4087 			_sd_cctl_t *tcent;
4088 
4089 			for (tcent = cc_ent;  tcent; tcent = tcent->cc_next_dm)
4090 				tcent->cc_flag &= ~CC_QHEAD;
4091 		}
4092 	}
4093 
4094 	if (getlock)
4095 		mutex_exit(&q->sq_qlock);
4096 
4097 }
4098 
4099 /*
4100  * sdbc_dmchain_dirty(cc_ent)
4101  * return first dirty cc_ent in dmchain, NULL if chain is not dirty
4102  */
4103 static _sd_cctl_t *
4104 sdbc_dmchain_dirty(_sd_cctl_t *cc_ent)
4105 {
4106 	for (/* CSTYLED */;  cc_ent; cc_ent = cc_ent->cc_next_dm)
4107 		if (CENTRY_DIRTY(cc_ent))
4108 			break;
4109 
4110 	return (cc_ent);
4111 }
4112 
4113 /*
4114  * sdbc_requeue_head_dm_try()
4115  * attempt to requeue a dmchain to the head of the queue
4116  */
4117 void
4118 sdbc_requeue_head_dm_try(_sd_cctl_t *cc_ent)
4119 {
4120 	int qidx;
4121 	_sd_queue_t *q;
4122 
4123 	if (!sdbc_dmchain_dirty(cc_ent)) {
4124 		qidx = cc_ent->cc_cblocks;
4125 		q = &sdbc_dm_queues[qidx];
4126 		sdbc_requeue_dmchain(q, cc_ent, 0, 1); /* requeue head */
4127 	}
4128 }
4129 
4130 /*
4131  * sdbc_centry_alloc_blks -- allocate cache entries with memory
4132  *
4133  * ARGUMENTS:
4134  *	cd	- Cache descriptor (from a previous open)
4135  *	cblk	- cache block number.
4136  *	reqblks	- number of cache blocks to be allocated
4137  *	flag	- can be ALLOC_NOWAIT
4138  * RETURNS:
4139  *	A cache block chain or NULL if ALLOC_NOWAIT and request fails
4140  *
4141  *	Note: caller must check for null return if called with
4142  *	ALLOC_NOWAIT set.
4143  */
4144 _sd_cctl_t *
4145 sdbc_centry_alloc_blks(int cd, nsc_off_t cblk, nsc_size_t reqblks, int flag)
4146 {
4147 	sdbc_allocbuf_t alloc_tok = {0}; /* must be 0 */
4148 	int stall = 0;
4149 	_sd_cctl_t *centry = NULL;
4150 	_sd_cctl_t *lentry = NULL;
4151 	_sd_cctl_t *anchor = NULL;
4152 	_sd_cctl_t *next_centry;
4153 
4154 	ASSERT(reqblks);
4155 
4156 	while (reqblks) {
4157 		centry = sdbc_centry_alloc(cd, cblk, reqblks, &stall,
4158 		    &alloc_tok, flag);
4159 
4160 		if (!centry)
4161 			break;
4162 
4163 		centry->cc_chain = NULL;
4164 
4165 		if (lentry == NULL)
4166 			anchor = centry;
4167 		else
4168 			lentry->cc_chain = centry;
4169 
4170 		lentry = centry;
4171 
4172 		centry->cc_aging_dm &= ~(ENTRY_FIELD_DM);
4173 
4174 		if (centry->cc_aging_dm & FOUND_IN_HASH_DM)
4175 			centry->cc_aging_dm |= HASH_ENTRY_DM;
4176 		else
4177 			if (centry->cc_aging_dm & FOUND_HOLD_OVER_DM)
4178 				centry->cc_aging_dm |= HOLD_ENTRY_DM;
4179 			else
4180 				centry->cc_aging_dm |= ELIGIBLE_ENTRY_DM;
4181 
4182 		centry->cc_aging_dm &= ~(FOUND_IN_HASH_DM|FOUND_HOLD_OVER_DM);
4183 		--reqblks;
4184 	}
4185 
4186 	sdbc_centry_alloc_end(&alloc_tok);
4187 
4188 	if (reqblks || (_sd_setup_category_on_type(anchor))) {
4189 		centry = anchor;
4190 		while (centry) {
4191 			next_centry = centry->cc_chain;
4192 			_sd_centry_release(centry);
4193 			centry = next_centry;
4194 		}
4195 		anchor = NULL;
4196 
4197 	} else
4198 		/* This is where the memory is actually allocated */
4199 		if (_sd_setup_mem_chaining(anchor, flag))
4200 			anchor = NULL;
4201 
4202 	return (anchor);
4203 }
4204 
4205 
4206 /*
4207  * sdbc_centry_alloc - sdbc internal function to allocate a new cache block.
4208  *
4209  * ARGUMENTS:
4210  *	cd	- Cache descriptor (from a previous open)
4211  *	cblk	- cache block number.
4212  *	stall	- pointer to stall count (no blocks avail)
4213  *	req_blocks - number of cache blocks remaining in caller's i/o request
4214  *	alloc_tok - pointer to token initialized to 0 on first call to function
4215  *	flag	- lock status of sdbc_queue_lock or ALLOC_NOWAIT flag
4216  * RETURNS:
4217  *	A cache block, or possibly NULL if ALLOC_NOWAIT set .
4218  *
4219  * USAGE:
4220  *	switch to the appropriate allocation function.
4221  *	this function is used when callers need more than one cache block.
4222  *	it is called repeatedly until the entire request is satisfied,
4223  *	at which time the caller will then do the memory allocation.
4224  *	if only one cache block is needed callers may use
4225  *	sdbc_centry_alloc_blks() which also allocates memory.
4226  *
4227  *	Note: caller must check for null return if called with
4228  *	ALLOC_NOWAIT set.
4229  */
4230 
4231 _sd_cctl_t *
4232 sdbc_centry_alloc(int cd, nsc_off_t cblk, nsc_size_t req_blocks, int *stall,
4233 			sdbc_allocbuf_t *alloc_tok, int flag)
4234 {
4235 	_sd_cctl_t *centry;
4236 
4237 	if (sdbc_use_dmchain)
4238 		centry = sdbc_alloc_dmc(cd, cblk, req_blocks, stall, alloc_tok,
4239 		    flag);
4240 	else
4241 		centry = sdbc_alloc_lru(cd, cblk, stall, flag);
4242 
4243 	return (centry);
4244 }
4245 
4246 /*
4247  * sdbc_alloc_dmc -- allocate a centry from a dmchain
4248  *
4249  * ARGUMENTS:
4250  *	cd	- Cache descriptor (from a previous open)
4251  *	cblk	- cache block number.
4252  *	stall	- pointer to stall count (no blocks avail)
4253  *	req_blocks - number of cache blocks in clients i/o request
4254  *	alloc_tok - pointer to token initialized to 0 on first call to function
4255  *	flag	- lock status of sdbc_queue_lock, or ALLOC_NOWAIT flag
4256  * RETURNS:
4257  *	A cache block or possibly NULL if ALLOC_NOWAIT set
4258  *
4259  * USAGE:
4260  *	if dmchain is empty, allocate one.
4261  */
4262 static _sd_cctl_t *
4263 sdbc_alloc_dmc(int cd, nsc_off_t cblk, nsc_size_t req_blocks, int *stall,
4264 			sdbc_allocbuf_t *alloc_tok, int flag)
4265 {
4266 	sdbc_allocbuf_impl_t *dmc = (sdbc_allocbuf_impl_t *)alloc_tok;
4267 	_sd_cctl_t *centry = NULL;
4268 
4269 	if (!dmc->sab_dmchain) {
4270 		/*
4271 		 * Note - sdbc_get_dmchain() returns
4272 		 * with cc_inuse and cc_pageio set
4273 		 * for all members of dmchain.
4274 		 */
4275 		if (dmc->sab_dmchain =
4276 		    sdbc_get_dmchain(req_blocks, stall, flag)) {
4277 
4278 			/* remember q it came from */
4279 			if (dmc->sab_dmchain->cc_alloc_size_dm)
4280 				dmc->sab_q = dmc->sab_dmchain->cc_cblocks;
4281 		}
4282 	}
4283 
4284 	/*
4285 	 * Note: dmchain pointer is advanced in sdbc_alloc_from_dmchain()
4286 	 */
4287 	if (dmc->sab_dmchain) /* could be NULL if ALLOC_NOWAIT set */
4288 		centry = sdbc_alloc_from_dmchain(cd, cblk, alloc_tok, flag);
4289 
4290 	return (centry);
4291 }
4292 
4293 /*
4294  * sdbc_alloc_from_dmchain -- allocate centry from a dmchain of centrys
4295  *
4296  * ARGUMENTS:
4297  *	cd	- Cache descriptor (from a previous open)
4298  *	cblk	- cache block number.
4299  *	alloc_tok - pointer to token
4300  *	flag	- lock status of sdbc_queue_lock or ALLOC_NOWAIT
4301  *
4302  * RETURNS:
4303  *	A cache block or possibly NULL if ALLOC_NOWAIT set.
4304  *
4305  * USAGE:
4306  *	This routine allocates a new cache block from the supplied dmchain.
4307  *	Assumes that dmchain is non-NULL and that all cache entries in
4308  *	the dmchain have been removed from hash and have their cc_inuse and
4309  *	cc_pageio bits set.
4310  */
4311 static _sd_cctl_t *
4312 sdbc_alloc_from_dmchain(int cd, nsc_off_t cblk, sdbc_allocbuf_t *alloc_tok,
4313     int flag)
4314 {
4315 	_sd_cctl_t *cc_ent, *old_ent;
4316 	int categorize_centry;
4317 	int locked = flag & ALLOC_LOCKED;
4318 	int nowait = flag & ALLOC_NOWAIT;
4319 	sdbc_allocbuf_impl_t *dmc = (sdbc_allocbuf_impl_t *)alloc_tok;
4320 
4321 	SDTRACE(ST_ENTER|SDF_ENT_ALLOC, cd, 0, BLK_TO_FBA_NUM(cblk), 0, 0);
4322 
4323 	ASSERT(dmc->sab_dmchain);
4324 
4325 	cc_ent = dmc->sab_dmchain;
4326 
4327 	ASSERT(_sd_cctl_valid(cc_ent));
4328 
4329 	cc_ent->cc_valid = 0;
4330 	categorize_centry = 0;
4331 	if (cc_ent->cc_data)
4332 		categorize_centry = FOUND_HOLD_OVER_DM;
4333 
4334 alloc_try:
4335 	if (cd == _CD_NOHASH)
4336 		CENTRY_BLK(cc_ent) = cblk;
4337 	else if ((old_ent = (_sd_cctl_t *)
4338 	    _sd_hash_insert(cd, cblk, (struct _sd_hash_hd *)cc_ent,
4339 	    _sd_htable)) != cc_ent) {
4340 
4341 		if (SET_CENTRY_INUSE(old_ent)) {
4342 			sdbc_centry_inuse++;
4343 
4344 			if (nowait) {
4345 				cc_ent = NULL;
4346 				goto out;
4347 			}
4348 
4349 			if (locked)
4350 				rw_exit(&sdbc_queue_lock);
4351 			_sd_cc_wait(cd, cblk, old_ent, CC_INUSE);
4352 			if (locked)
4353 				rw_enter(&sdbc_queue_lock, RW_WRITER);
4354 			goto alloc_try;
4355 		}
4356 
4357 		/*
4358 		 * bug 4529671
4359 		 * now that we own the centry make sure that
4360 		 * it is still good. it could have been processed
4361 		 * by _sd_dealloc_dm() in the window between
4362 		 * _sd_hash_insert() and SET_CENTRY_INUSE().
4363 		 */
4364 		if ((_sd_cctl_t *)_sd_hash_search(cd, cblk, _sd_htable)
4365 		    != old_ent) {
4366 			sdbc_centry_deallocd++;
4367 #ifdef DEBUG
4368 			cmn_err(CE_WARN, "!cc_ent %p cd %d cblk %" NSC_SZFMT
4369 			    " lost to dealloc?! cc_data %p", (void *)old_ent,
4370 			    cd, cblk, (void *)old_ent->cc_data);
4371 #endif
4372 
4373 			CLEAR_CENTRY_INUSE(old_ent);
4374 
4375 			if (nowait) {
4376 				cc_ent = NULL;
4377 				goto out;
4378 			}
4379 
4380 			goto alloc_try;
4381 		}
4382 
4383 		if (CC_CD_BLK_MATCH(cd, cblk, old_ent)) {
4384 			sdbc_centry_hit++;
4385 			old_ent->cc_toflush = 0;
4386 			/* _sd_centry_release(cc_ent); */
4387 			cc_ent = old_ent;
4388 			categorize_centry = FOUND_IN_HASH_DM;
4389 		} else {
4390 			sdbc_centry_lost++;
4391 
4392 			CLEAR_CENTRY_INUSE(old_ent);
4393 
4394 			if (nowait) {
4395 				cc_ent = NULL;
4396 				goto out;
4397 			}
4398 
4399 			goto alloc_try;
4400 		}
4401 	}
4402 
4403 	/*
4404 	 * advance the dmchain pointer, but only if we got the
4405 	 * cc_ent from the dmchain
4406 	 */
4407 	if (categorize_centry != FOUND_IN_HASH_DM) {
4408 		if (cc_ent->cc_data)
4409 			dmc->sab_dmchain = dmc->sab_dmchain->cc_next_dm;
4410 		else
4411 			dmc->sab_dmchain = dmc->sab_dmchain->cc_next;
4412 	}
4413 
4414 
4415 	SDTRACE(ST_EXIT|SDF_ENT_ALLOC, cd, 0, BLK_TO_FBA_NUM(cblk), 0, 0);
4416 
4417 	mutex_enter(&cc_ent->cc_lock);
4418 	if (cc_ent->cc_await_use) {
4419 		cv_broadcast(&cc_ent->cc_blkcv);
4420 	}
4421 	mutex_exit(&cc_ent->cc_lock);
4422 
4423 	sdbc_centry_init_dm(cc_ent);
4424 
4425 	cc_ent->cc_aging_dm |= categorize_centry;
4426 
4427 	out:
4428 
4429 	SDTRACE(ST_INFO|SDF_ENT_ALLOC, cd, 0, BLK_TO_FBA_NUM(cblk), 0, 0);
4430 
4431 	return (cc_ent);
4432 }
4433 
4434 /*
4435  * sdbc_centry_alloc_end -- tidy up after all cache blocks have been
4436  *	allocated for a request
4437  * ARGUMENTS:
4438  *	alloc_tok  - pointer to allocation token
4439  * RETURNS
4440  *	nothing
4441  * USAGE:
4442  *	at this time only useful when sdbc_use_dmchain is true.
4443  *	if there are cache blocks remaining on the chain then the inuse and
4444  *	pageio bits must be cleared (they were set in sdbc_get_dmchain().
4445  *
4446  */
4447 static void
4448 sdbc_centry_alloc_end(sdbc_allocbuf_t *alloc_tok)
4449 {
4450 	_sd_cctl_t *next_centry;
4451 	_sd_cctl_t *prev_centry;
4452 	_sd_queue_t *q;
4453 	sdbc_allocbuf_impl_t *dmc = (sdbc_allocbuf_impl_t *)alloc_tok;
4454 #ifdef DEBUG
4455 	int chainpull = 0;
4456 #endif
4457 
4458 	if (!sdbc_use_dmchain)
4459 		return;
4460 
4461 	next_centry = dmc->sab_dmchain;
4462 
4463 	while (next_centry != NULL) {
4464 		CLEAR_CENTRY_PAGEIO(next_centry);
4465 
4466 		prev_centry = next_centry;
4467 
4468 		if (next_centry->cc_data) {
4469 #ifdef DEBUG
4470 			++chainpull;
4471 #endif
4472 			next_centry = next_centry->cc_next_dm;
4473 
4474 			/* clear bit after final reference */
4475 
4476 			CLEAR_CENTRY_INUSE(prev_centry);
4477 		} else {
4478 			next_centry = next_centry->cc_next;
4479 
4480 			/*
4481 			 * a floater from the 0 queue, insert on q.
4482 			 *
4483 			 * since this centry is not on any queue
4484 			 * the inuse bit can be cleared before
4485 			 * inserting on the q.  this is also required
4486 			 * since sdbc_get_dmchain() does not expect
4487 			 * inuse bits to be set on 0 queue entry's.
4488 			 */
4489 
4490 			CLEAR_CENTRY_INUSE(prev_centry);
4491 			q = &sdbc_dm_queues[0];
4492 			sdbc_ins_dmqueue_front(q, prev_centry);
4493 		}
4494 	}
4495 
4496 #ifdef DEBUG
4497 	/* compute wastage stats */
4498 	ASSERT((chainpull >= 0) && (chainpull < max_dm_queues));
4499 	if (chainpull)
4500 		(*(dmchainpull_table + (dmc->sab_q *
4501 		    max_dm_queues + chainpull)))++;
4502 #endif
4503 
4504 }
4505 
4506 
4507 /*
4508  * sdbc_alloc_lru - allocate a new cache block from the lru queue
4509  *
4510  * ARGUMENTS:
4511  *	cd	- Cache descriptor (from a previous open)
4512  *	cblk	- cache block number.
4513  *	stall	- pointer to stall count (no blocks avail)
4514  *	flag	- lock status of sdbc_queue_lock or ALLOC_NOWAIT
4515  *
4516  * RETURNS:
4517  *	A cache block or NULL if ALLOC_NOWAIT specified
4518  *
4519  * USAGE:
4520  *	This routine allocates a new cache block from the lru.
4521  *	If an allocation cannot be done, we block, unless ALLOC_NOWAIT is set.
4522  */
4523 
4524 static _sd_cctl_t *
4525 sdbc_alloc_lru(int cd, nsc_off_t cblk, int *stall, int flag)
4526 {
4527 	_sd_cctl_t *cc_ent, *old_ent, *ccnext;
4528 	_sd_queue_t *q = _SD_LRU_Q;
4529 	_sd_cctl_t *qhead = &(q->sq_qhead);
4530 	int tries = 0, num_tries;
4531 	int categorize_centry;
4532 	int locked = flag & ALLOC_LOCKED;
4533 	int nowait = flag & ALLOC_NOWAIT;
4534 
4535 	if (nowait) {
4536 		num_tries = q->sq_inq / 100; /* only search 1% of q */
4537 
4538 		if (num_tries <= 0) /* ensure num_tries is non-zero */
4539 			num_tries = q->sq_inq;
4540 	} else
4541 		num_tries = _sd_lruq_srch;
4542 
4543 	SDTRACE(ST_ENTER|SDF_ENT_ALLOC, cd, 0, BLK_TO_FBA_NUM(cblk), 0, 0);
4544 retry_alloc_centry:
4545 
4546 	for (cc_ent = (qhead->cc_next); cc_ent != qhead; cc_ent = ccnext) {
4547 		if (--num_tries <= 0)
4548 			if (nowait) {
4549 				cc_ent = NULL;
4550 				goto out;
4551 			} else
4552 				break;
4553 
4554 		ccnext = cc_ent->cc_next;
4555 
4556 		if (cc_ent->cc_aging_dm & BAD_CHAIN_DM)
4557 			continue;
4558 
4559 		if (CENTRY_DIRTY(cc_ent))
4560 			continue;
4561 		if (SET_CENTRY_INUSE(cc_ent))
4562 			continue;
4563 
4564 		if (CENTRY_DIRTY(cc_ent)) {
4565 			sdbc_centry_lost++;
4566 
4567 			CLEAR_CENTRY_INUSE(cc_ent);
4568 			continue;
4569 		}
4570 		cc_ent->cc_flag = 0; /* CC_INUSE */
4571 		cc_ent->cc_toflush = 0;
4572 
4573 		/*
4574 		 * Inlined requeue of the LRU. (should match _sd_requeue)
4575 		 */
4576 		/* was FAST */
4577 		mutex_enter(&q->sq_qlock);
4578 #if defined(_SD_DEBUG)
4579 	if (1) {
4580 		_sd_cctl_t *cp, *cn, *qp;
4581 		cp = cc_ent->cc_prev;
4582 		cn = cc_ent->cc_next;
4583 		qp = (q->sq_qhead).cc_prev;
4584 		if (!_sd_cctl_valid(cc_ent) ||
4585 		    (cp != &(q->sq_qhead) && !_sd_cctl_valid(cp)) ||
4586 		    (cn != &(q->sq_qhead) && !_sd_cctl_valid(cn)) ||
4587 		    !_sd_cctl_valid(qp))
4588 			cmn_err(CE_PANIC,
4589 			    "_sd_centry_alloc %x prev %x next %x qp %x",
4590 			    cc_ent, cp, cn, qp);
4591 	}
4592 #endif
4593 		cc_ent->cc_prev->cc_next = cc_ent->cc_next;
4594 		cc_ent->cc_next->cc_prev = cc_ent->cc_prev;
4595 		cc_ent->cc_next = qhead;
4596 		cc_ent->cc_prev = qhead->cc_prev;
4597 		qhead->cc_prev->cc_next = cc_ent;
4598 		qhead->cc_prev = cc_ent;
4599 		cc_ent->cc_seq = q->sq_seq++;
4600 		/* was FAST */
4601 		mutex_exit(&q->sq_qlock);
4602 		/*
4603 		 * End inlined requeue.
4604 		 */
4605 
4606 #if defined(_SD_STATS)
4607 		if (_sd_hash_delete(cc_ent, _sd_htable) == 0)
4608 			SDTRACE(SDF_REPLACE,
4609 			    CENTRY_CD(cc_ent), cc_ent->cc_hits,
4610 			    BLK_TO_FBA_NUM(CENTRY_BLK(cc_ent)),
4611 			    nsc_lbolt(), cc_ent->cc_creat);
4612 		cc_ent->cc_creat = nsc_lbolt();
4613 		cc_ent->cc_hits = 0;
4614 #else
4615 #if defined(_SD_DEBUG)
4616 		if (_sd_hash_delete(cc_ent, _sd_htable) == 0) {
4617 			SDTRACE(SDF_REPLACE|ST_DL,
4618 			    CENTRY_CD(cc_ent),
4619 			    cc_ent->cc_valid,
4620 			    BLK_TO_FBA_NUM(CENTRY_BLK(cc_ent)),
4621 			    cd, BLK_TO_FBA_NUM(cblk));
4622 			if (cc_ent->cc_await_use ||
4623 			    ((cd == CENTRY_CD(cc_ent)) &&
4624 			    (cblk == CENTRY_BLK(cc_ent))))
4625 				DATA_LOG(SDF_REPLACE|ST_DL, cc_ent, 0,
4626 				    BLK_FBAS);
4627 		}
4628 #else
4629 		(void) _sd_hash_delete((struct _sd_hash_hd *)cc_ent,
4630 		    _sd_htable);
4631 #endif
4632 #endif
4633 		cc_ent->cc_creat = nsc_lbolt();
4634 		cc_ent->cc_hits = 0;
4635 
4636 		cc_ent->cc_valid = 0;
4637 		categorize_centry = 0;
4638 		if (cc_ent->cc_data)
4639 			categorize_centry = FOUND_HOLD_OVER_DM;
4640 
4641 	alloc_try:
4642 		if (cd == _CD_NOHASH)
4643 			CENTRY_BLK(cc_ent) = cblk;
4644 		else if ((old_ent = (_sd_cctl_t *)
4645 		    _sd_hash_insert(cd, cblk, (struct _sd_hash_hd *)cc_ent,
4646 		    _sd_htable)) != cc_ent) {
4647 
4648 			if (SET_CENTRY_INUSE(old_ent)) {
4649 				sdbc_centry_inuse++;
4650 
4651 				if (nowait) {
4652 					_sd_centry_release(cc_ent);
4653 					cc_ent = NULL;
4654 					goto out;
4655 				}
4656 
4657 				if (locked)
4658 					rw_exit(&sdbc_queue_lock);
4659 				_sd_cc_wait(cd, cblk, old_ent, CC_INUSE);
4660 				if (locked)
4661 					rw_enter(&sdbc_queue_lock, RW_WRITER);
4662 				goto alloc_try;
4663 			}
4664 
4665 			/*
4666 			 * bug 4529671
4667 			 * now that we own the centry make sure that
4668 			 * it is still good. it could have been processed
4669 			 * by _sd_dealloc_dm() in the window between
4670 			 * _sd_hash_insert() and SET_CENTRY_INUSE().
4671 			 */
4672 			if ((_sd_cctl_t *)
4673 			    _sd_hash_search(cd, cblk, _sd_htable) != old_ent) {
4674 				sdbc_centry_deallocd++;
4675 #ifdef DEBUG
4676 				cmn_err(CE_WARN, "!cc_ent %p cd %d cblk %"
4677 				    NSC_SZFMT " lost to dealloc?! cc_data %p",
4678 				    (void *)old_ent, cd, cblk,
4679 				    (void *)old_ent->cc_data);
4680 #endif
4681 
4682 				CLEAR_CENTRY_INUSE(old_ent);
4683 
4684 				if (nowait) {
4685 					_sd_centry_release(cc_ent);
4686 					cc_ent = NULL;
4687 					goto out;
4688 				}
4689 
4690 				goto alloc_try;
4691 			}
4692 
4693 			if (CC_CD_BLK_MATCH(cd, cblk, old_ent)) {
4694 				sdbc_centry_hit++;
4695 				old_ent->cc_toflush = 0;
4696 				_sd_centry_release(cc_ent);
4697 				cc_ent = old_ent;
4698 				categorize_centry = FOUND_IN_HASH_DM;
4699 			} else {
4700 				sdbc_centry_lost++;
4701 
4702 				CLEAR_CENTRY_INUSE(old_ent);
4703 
4704 				if (nowait) {
4705 					_sd_centry_release(cc_ent);
4706 					cc_ent = NULL;
4707 					goto out;
4708 				}
4709 
4710 				goto alloc_try;
4711 			}
4712 		}
4713 
4714 		SDTRACE(ST_EXIT|SDF_ENT_ALLOC, cd, tries,
4715 		    BLK_TO_FBA_NUM(cblk), 0, 0);
4716 
4717 		if (cc_ent->cc_await_use) {
4718 			mutex_enter(&cc_ent->cc_lock);
4719 			cv_broadcast(&cc_ent->cc_blkcv);
4720 			mutex_exit(&cc_ent->cc_lock);
4721 		}
4722 
4723 		sdbc_centry_init_dm(cc_ent);
4724 
4725 		cc_ent->cc_aging_dm |= categorize_centry;
4726 
4727 	out:
4728 		return (cc_ent);
4729 	}
4730 
4731 	SDTRACE(ST_INFO|SDF_ENT_ALLOC, cd, ++tries, BLK_TO_FBA_NUM(cblk), 0, 0);
4732 
4733 	delay(drv_usectohz(20000));
4734 	(void) (*stall)++;
4735 	num_tries = _sd_lruq_srch;
4736 	goto retry_alloc_centry;
4737 }
4738 
4739 /*
4740  * sdbc_centry_init_dm - setup the cache block for dynamic memory allocation
4741  *
4742  * ARGUMENTS:
4743  *	centry	 - Cache block.
4744  *
4745  * RETURNS:
4746  *	NONE
4747  *
4748  * USAGE:
4749  *	This routine is the central point in which cache entry blocks are setup
4750  */
4751 static void
4752 sdbc_centry_init_dm(_sd_cctl_t *centry)
4753 {
4754 
4755 	/* an entry already setup - don't touch simply refresh age */
4756 	if (centry->cc_data) {
4757 		centry->cc_aging_dm &= ~(FINAL_AGING_DM);
4758 
4759 		DTRACE_PROBE1(sdbc_centry_init_dm_end,
4760 		    char *, centry->cc_data);
4761 		return;
4762 	}
4763 
4764 	centry->cc_aging_dm &= ~(FINAL_AGING_DM | CATAGORY_ENTRY_DM);
4765 
4766 	if (centry->cc_head_dm || centry->cc_next_dm)
4767 		cmn_err(cmn_level, "!sdbc(sdbc_centry_init_dm): "
4768 		    "non-zero mem chain in ccent %p", (void *)centry);
4769 
4770 	centry->cc_head_dm = 0;
4771 
4772 	if (!sdbc_use_dmchain)
4773 		centry->cc_next_dm = 0;
4774 
4775 	centry->cc_data = 0;
4776 
4777 }
4778 
4779 /*
4780  * sdbc_centry_memalloc_dm
4781  *
4782  * Actually allocate the cache memory, storing it in the cc_data field for
4783  * the cctl
4784  *
4785  * ARGS:
4786  *	centry: cache control block for which to allocate the memory
4787  *	alloc_request: number of bytes to allocate
4788  *	flag: if called with ALLOC_NOWAIT, caller must check for non-zero return
4789  *
4790  * RETURNS:
4791  *	0 on success
4792  *	non-zero on error
4793  */
4794 static int
4795 sdbc_centry_memalloc_dm(_sd_cctl_t *centry, int alloc_request, int flag)
4796 {
4797 	int cblocks;
4798 	_sd_queue_t *newq;
4799 	int sleep;
4800 	sleep = (flag & ALLOC_NOWAIT) ? KM_NOSLEEP : KM_SLEEP;
4801 
4802 	if (!centry->cc_data && (alloc_request > 0)) {
4803 		/* host or other */
4804 		dynmem_processing_dm.alloc_ct++;
4805 		centry->cc_data = (unsigned char *)
4806 		    kmem_alloc((size_t)centry->cc_alloc_size_dm, sleep);
4807 
4808 
4809 		if (sdbc_use_dmchain) {
4810 			cblocks = centry->cc_alloc_size_dm >> _sd_cblock_shift;
4811 			newq = &sdbc_dm_queues[cblocks];
4812 
4813 			/* set the dmqueue index */
4814 			centry->cc_cblocks = cblocks;
4815 
4816 			/* put on appropriate queue */
4817 			sdbc_ins_dmqueue_back(newq, centry);
4818 		}
4819 
4820 		/*
4821 		 * for KM_NOSLEEP (should never happen with KM_SLEEP)
4822 		 */
4823 		if (!centry->cc_data)
4824 			return (LOW_RESOURCES_DM);
4825 		centry->cc_head_dm = centry;
4826 		centry->cc_alloc_ct_dm++;
4827 	}
4828 
4829 	return (0);
4830 }
4831 
4832 /*
4833  * _sd_centry_release - release a cache block
4834  *
4835  * ARGUMENTS:
4836  *	centry	 - Cache block.
4837  *
4838  * RETURNS:
4839  *	NONE
4840  *
4841  * USAGE:
4842  *	This routine frees up a cache block. It also frees up a write
4843  *	block if allocated and its valid to release it.
4844  */
4845 
4846 void
4847 _sd_centry_release(_sd_cctl_t *centry)
4848 {
4849 	ss_centry_info_t *wctl;
4850 
4851 	SDTRACE(ST_ENTER|SDF_ENT_FREE, CENTRY_CD(centry), 0,
4852 	    BLK_TO_FBA_NUM(CENTRY_BLK(centry)), 0, 0);
4853 
4854 	CLEAR_CENTRY_PAGEIO(centry);
4855 
4856 	if ((wctl = centry->cc_write) != 0) {
4857 		/* was FAST */
4858 		mutex_enter(&centry->cc_lock);
4859 		if (CENTRY_DIRTY(centry))
4860 			wctl = NULL;
4861 		else {
4862 			centry->cc_write = NULL;
4863 			centry->cc_flag &= ~(CC_PINNABLE);
4864 		}
4865 		/* was FAST */
4866 		mutex_exit(&centry->cc_lock);
4867 		if (wctl)  {
4868 			wctl->sc_dirty = 0;
4869 			SSOP_SETCENTRY(sdbc_safestore, wctl);
4870 			SSOP_DEALLOCRESOURCE(sdbc_safestore, wctl->sc_res);
4871 		}
4872 	}
4873 
4874 	if (!(centry->cc_aging_dm & BAD_CHAIN_DM)) {
4875 		if (sdbc_use_dmchain) {
4876 			if (centry->cc_alloc_size_dm) {
4877 
4878 				/* see if this can be queued to head */
4879 				if (CENTRY_QHEAD(centry)) {
4880 					sdbc_requeue_head_dm_try(centry);
4881 				} else {
4882 					int qidx;
4883 					_sd_queue_t *q;
4884 
4885 					qidx = centry->cc_cblocks;
4886 					q = &sdbc_dm_queues[qidx];
4887 
4888 					if (_sd_lru_reinsert(q, centry)) {
4889 						sdbc_requeue_dmchain(q,
4890 						    centry, 1, 1);
4891 					}
4892 				}
4893 			} else {
4894 				/*
4895 				 * Fix for bug 4949134:
4896 				 * If an internal block is marked with CC_QHEAD
4897 				 * but the HOST block is not, the chain will
4898 				 * never age properly, and will never be made
4899 				 * available.  Only the HOST of the dmchain is
4900 				 * checked for CC_QHEAD, so clearing an internal
4901 				 * block indiscriminately (as is being done
4902 				 * here) does no damage.
4903 				 *
4904 				 * The same result could instead be achieved by
4905 				 * not setting the CC_QHEAD flag in the first
4906 				 * place, if the block is an internal dmchain
4907 				 * block, and if it is found in the hash table.
4908 				 * The current solution was chosen since it is
4909 				 * the least intrusive.
4910 				 */
4911 				centry->cc_flag &= ~CC_QHEAD;
4912 			}
4913 		} else {
4914 			if (CENTRY_QHEAD(centry)) {
4915 				if (!CENTRY_DIRTY(centry))
4916 					_sd_requeue_head(centry);
4917 			} else if (_sd_lru_reinsert(_SD_LRU_Q, centry))
4918 				_sd_requeue(centry);
4919 		}
4920 	}
4921 
4922 	SDTRACE(ST_EXIT|SDF_ENT_FREE, CENTRY_CD(centry), 0,
4923 	    BLK_TO_FBA_NUM(CENTRY_BLK(centry)), 0, 0);
4924 
4925 	/* only clear inuse after final reference to centry */
4926 
4927 	CLEAR_CENTRY_INUSE(centry);
4928 }
4929 
4930 
4931 /*
4932  * lookup to centry info associated with safestore resource
4933  * return pointer to the centry info structure
4934  */
4935 ss_centry_info_t *
4936 sdbc_get_cinfo_byres(ss_resource_t *res)
4937 {
4938 	ss_centry_info_t *cinfo;
4939 	ss_centry_info_t *cend;
4940 	int found = 0;
4941 
4942 	ASSERT(res != NULL);
4943 
4944 	if (res == NULL)
4945 		return (NULL);
4946 
4947 	cinfo = _sdbc_gl_centry_info;
4948 	cend = _sdbc_gl_centry_info +
4949 	    (_sdbc_gl_centry_info_size / sizeof (ss_centry_info_t)) - 1;
4950 
4951 	for (; cinfo <= cend; ++cinfo)
4952 		if (cinfo->sc_res == res) {
4953 			++found;
4954 			break;
4955 		}
4956 
4957 	if (!found)
4958 		cinfo = NULL; /* bad */
4959 
4960 	return (cinfo);
4961 }
4962 
4963 /*
4964  * _sd_alloc_write - Allocate a write block (for remote mirroring)
4965  *		   and set centry->cc_write
4966  *
4967  * ARGUMENTS:
4968  *	centry	 - Head of Cache chain
4969  *	stall	 - pointer to stall count (no blocks avail)
4970  *
4971  * RETURNS:
4972  *	0 - and sets  cc_write for all entries when write contl block obtained.
4973  *	-1 - if a write control block could not be obtained.
4974  */
4975 
4976 int
4977 _sd_alloc_write(_sd_cctl_t *centry, int *stall)
4978 {
4979 
4980 	ss_resourcelist_t *reslist;
4981 	ss_resourcelist_t *savereslist;
4982 	ss_resource_t *res;
4983 	_sd_cctl_t *ce;
4984 	int err;
4985 	int need;
4986 
4987 
4988 	need = 0;
4989 
4990 	for (ce = centry; ce; ce = ce->cc_chain) {
4991 		if (!(ce->cc_write))
4992 			need++;
4993 	}
4994 
4995 	if (!need)
4996 		return (0);
4997 
4998 	if ((SSOP_ALLOCRESOURCE(sdbc_safestore, need, stall, &reslist))
4999 	    == SS_OK) {
5000 		savereslist = reslist;
5001 		for (ce = centry; ce; ce = ce->cc_chain) {
5002 			if (ce->cc_write)
5003 				continue;
5004 			err = SSOP_GETRESOURCE(sdbc_safestore, &reslist, &res);
5005 			if (err == SS_OK)
5006 				ce->cc_write = sdbc_get_cinfo_byres(res);
5007 
5008 			ASSERT(err == SS_OK); /* panic if DEBUG on */
5009 			ASSERT(ce->cc_write != NULL);
5010 
5011 			/*
5012 			 * this is bad and should not happen.
5013 			 * we use the saved reslist to cleanup
5014 			 * and return.
5015 			 */
5016 			if ((err != SS_OK) || !ce->cc_write) {
5017 
5018 				cmn_err(CE_WARN, "!_sd_alloc_write: "
5019 				    "bad resource list 0x%p"
5020 				    "changing to forced write thru mode",
5021 				    (void *)savereslist);
5022 
5023 				(void) _sd_set_node_hint(NSC_FORCED_WRTHRU);
5024 
5025 				while (SSOP_GETRESOURCE(sdbc_safestore,
5026 				    &savereslist, &res) == SS_OK) {
5027 
5028 					SSOP_DEALLOCRESOURCE(sdbc_safestore,
5029 					    res);
5030 				}
5031 
5032 				return (-1);
5033 
5034 			}
5035 
5036 		}
5037 		return (0);
5038 	}
5039 
5040 	/* no safestore resources available.  do sync write */
5041 	_sd_unblock(&_sd_flush_cv);
5042 	return (-1);
5043 }
5044 
5045 /*
5046  * _sd_read - Interface call to do read.
5047  *
5048  * ARGUMENTS:
5049  *	handle  - handle allocated earlier on.
5050  *	fba_pos - disk block number to read from.
5051  *	fba_len - length in fbas.
5052  *	flag	- flag: (NSC_NOBLOCK for async io)
5053  *
5054  * RETURNS:
5055  *	errno if return > 0
5056  *	NSC_DONE or NSC_PENDING otherwise.
5057  *
5058  * USAGE:
5059  *	This routine checks if the request is valid and calls the underlying
5060  *	doread routine (also called by alloc_buf)
5061  */
5062 
5063 int
5064 _sd_read(_sd_buf_handle_t *handle, nsc_off_t fba_pos, nsc_size_t fba_len,
5065     int flag)
5066 {
5067 	sdbc_cblk_fba_t st_cblk_len;	/* FBA len of starting cache block */
5068 	sdbc_cblk_fba_t end_cblk_len;	/* FBA len of ending cache block */
5069 	sdbc_cblk_fba_t st_cblk_off;	/* FBA offset into starting cblock */
5070 	_sd_cctl_t *cc_ent = NULL;
5071 	nsc_size_t fba_orig_len = fba_len;
5072 	int ret;
5073 	int cd = HANDLE_CD(handle);
5074 
5075 	if (_sdbc_shutdown_in_progress || (handle->bh_flag & NSC_ABUF)) {
5076 		ret = EIO;
5077 		goto out;
5078 	}
5079 
5080 
5081 #if !defined(_SD_NOCHECKS)
5082 	if (!_SD_HANDLE_ACTIVE(handle)) {
5083 		cmn_err(CE_WARN, "!sdbc(_sd_read) handle %p not active",
5084 		    (void *)handle);
5085 		ret = EINVAL;
5086 		goto out;
5087 	}
5088 	ASSERT_HANDLE_LIMITS(handle, fba_pos, fba_len);
5089 #endif
5090 	if (fba_len == 0) {
5091 		ret = NSC_DONE;
5092 		goto out;
5093 	}
5094 
5095 	KSTAT_RUNQ_ENTER(cd);
5096 
5097 	st_cblk_off = BLK_FBA_OFF(fba_pos);
5098 	st_cblk_len = BLK_FBAS - st_cblk_off;
5099 	if ((nsc_size_t)st_cblk_len >= fba_len) {
5100 		end_cblk_len = 0;
5101 		st_cblk_len = (sdbc_cblk_fba_t)fba_len;
5102 	} else {
5103 		end_cblk_len = BLK_FBA_OFF(fba_pos + fba_len);
5104 	}
5105 
5106 	cc_ent = handle->bh_centry;
5107 	while (CENTRY_BLK(cc_ent) != FBA_TO_BLK_NUM(fba_pos))
5108 		cc_ent = cc_ent->cc_chain;
5109 
5110 	if (!SDBC_VALID_BITS(st_cblk_off, st_cblk_len, cc_ent))
5111 		goto need_io;
5112 	DATA_LOG(SDF_RD, cc_ent, st_cblk_off, st_cblk_len);
5113 
5114 	DTRACE_PROBE4(_sd_read_data1, uint64_t,
5115 	    (uint64_t)(BLK_TO_FBA_NUM(CENTRY_BLK(cc_ent)) + st_cblk_off),
5116 	    uint64_t, (uint64_t)st_cblk_len, char *,
5117 	    *(int64_t *)(cc_ent->cc_data + FBA_SIZE(st_cblk_off)),
5118 	    char *, *(int64_t *)(cc_ent->cc_data +
5119 	    FBA_SIZE(st_cblk_off + st_cblk_len) - 8));
5120 
5121 	fba_pos += st_cblk_len;
5122 	fba_len -= st_cblk_len;
5123 	cc_ent = cc_ent->cc_chain;
5124 
5125 	while (fba_len > (nsc_size_t)end_cblk_len) {
5126 		if (!FULLY_VALID(cc_ent))
5127 			goto need_io;
5128 		DATA_LOG(SDF_RD, cc_ent, 0, BLK_FBAS);
5129 
5130 		DTRACE_PROBE4(_sd_read_data2, uint64_t,
5131 		    (uint64_t)BLK_TO_FBA_NUM(CENTRY_BLK(cc_ent)),
5132 		    uint64_t, (uint64_t)BLK_FBAS,
5133 		    char *, *(int64_t *)(cc_ent->cc_data),
5134 		    char *, *(int64_t *)(cc_ent->cc_data +
5135 		    FBA_SIZE(BLK_FBAS) - 8));
5136 
5137 		fba_pos += BLK_FBAS;
5138 		fba_len -= BLK_FBAS;
5139 		cc_ent = cc_ent->cc_chain;
5140 	}
5141 	if (fba_len) {
5142 		if (!SDBC_VALID_BITS(0, end_cblk_len, cc_ent))
5143 			goto need_io;
5144 		DATA_LOG(SDF_RD, cc_ent, 0, end_cblk_len);
5145 
5146 		DTRACE_PROBE4(_sd_read_data3, uint64_t,
5147 		    (uint64_t)BLK_TO_FBA_NUM(CENTRY_BLK(cc_ent)),
5148 		    uint64_t, (uint64_t)end_cblk_len,
5149 		    char *, *(int64_t *)(cc_ent->cc_data),
5150 		    char *, *(int64_t *)(cc_ent->cc_data +
5151 		    FBA_SIZE(end_cblk_len) - 8));
5152 	}
5153 
5154 	CACHE_FBA_READ(handle->bh_cd, fba_orig_len);
5155 	CACHE_READ_HIT;
5156 
5157 	FBA_READ_IO_KSTATS(handle->bh_cd, FBA_SIZE(fba_orig_len));
5158 
5159 	ret = NSC_HIT;
5160 	goto stats_exit;
5161 need_io:
5162 	_SD_DISCONNECT_CALLBACK(handle);
5163 
5164 	ret = _sd_doread(handle, cc_ent, fba_pos, fba_len, flag);
5165 
5166 stats_exit:
5167 	KSTAT_RUNQ_EXIT(cd);
5168 out:
5169 	return (ret);
5170 }
5171 
5172 
5173 /*
5174  * sdbc_doread_prefetch - read ahead one cache block
5175  *
5176  * ARGUMENTS:
5177  *	cc_ent - cache entry
5178  *	fba_pos - disk block number to read from
5179  *	fba_len - length in fbas.
5180  *
5181  * RETURNS:
5182  *	number of fbas, if any, that are to be read beyond (fba_pos + fba_len)
5183  *
5184  * USAGE:
5185  *	if readahead is to be done allocate a cache block and place
5186  *	on the cc_chain of cc_ent
5187  */
5188 static int
5189 sdbc_doread_prefetch(_sd_cctl_t *cc_ent, nsc_off_t fba_pos, nsc_size_t fba_len)
5190 {
5191 	nsc_off_t st_cblk = FBA_TO_BLK_NUM(fba_pos);
5192 	nsc_off_t next_cblk = FBA_TO_BLK_NUM(fba_pos + BLK_FBAS);
5193 	nsc_size_t filesize;
5194 	int fba_count = 0; /* number of fbas to prefetch */
5195 	_sd_cctl_t *cc_ra; /* the read ahead cache entry */
5196 	int cd = CENTRY_CD(cc_ent);
5197 	nsc_size_t vol_fill;
5198 
5199 	filesize = _sd_cache_files[cd].cd_info->sh_filesize;
5200 	vol_fill = filesize - (fba_pos + fba_len);
5201 
5202 	/* readahead only for small reads */
5203 	if ((fba_len <= FBA_LEN(CACHE_BLOCK_SIZE)) && (fba_pos != 0) &&
5204 	    (vol_fill > 0)) {
5205 
5206 		/*
5207 		 * if prev block is in cache and next block is not,
5208 		 * then read ahead one block
5209 		 */
5210 		if (_sd_hash_search(cd, st_cblk - 1, _sd_htable)) {
5211 			if (!_sd_hash_search(cd, next_cblk, _sd_htable)) {
5212 
5213 				cc_ra = sdbc_centry_alloc_blks
5214 				    (cd, next_cblk, 1, ALLOC_NOWAIT);
5215 				if (cc_ra) {
5216 					/* if in cache don't readahead */
5217 					if (cc_ra->cc_aging_dm &
5218 					    HASH_ENTRY_DM) {
5219 						++sdbc_ra_hash;
5220 						_sd_centry_release(cc_ra);
5221 					} else {
5222 						cc_ent->cc_chain = cc_ra;
5223 						cc_ra->cc_chain = 0;
5224 						fba_count =
5225 						    (vol_fill >
5226 						    (nsc_size_t)BLK_FBAS) ?
5227 						    BLK_FBAS : (int)vol_fill;
5228 						/*
5229 						 * indicate implicit prefetch
5230 						 * and mark for release in
5231 						 * _sd_read_complete()
5232 						 */
5233 						cc_ra->cc_aging_dm |=
5234 						    (PREFETCH_BUF_I |
5235 						    PREFETCH_BUF_IR);
5236 					}
5237 				} else {
5238 					++sdbc_ra_none;
5239 				}
5240 			}
5241 		}
5242 
5243 	}
5244 
5245 	return (fba_count);
5246 }
5247 
5248 /*
5249  * _sd_doread - Check if blocks in cache. If not completely true, do io.
5250  *
5251  * ARGUMENTS:
5252  *	handle  - handle allocated earlier on.
5253  *	fba_pos - disk block number to read from.
5254  *	fba_len - length in fbas.
5255  *	flag	- flag: (NSC_NOBLOCK for async io)
5256  *
5257  * RETURNS:
5258  *	errno if return > 0
5259  *	NSC_DONE(from disk), or NSC_PENDING otherwise.
5260  *
5261  * Comments:
5262  *	It initiates an io and either blocks waiting for the completion
5263  *	or return NSC_PENDING, depending on whether the flag bit
5264  *	NSC_NOBLOCK is reset or set.
5265  *
5266  */
5267 
5268 
5269 static int
5270 _sd_doread(_sd_buf_handle_t *handle, _sd_cctl_t *cc_ent, nsc_off_t fba_pos,
5271     nsc_size_t fba_len, int flag)
5272 {
5273 	int cd, err;
5274 	nsc_size_t fba_orig_len; /* length in FBA's of the original request */
5275 	nsc_size_t file_len;	/* length in bytes of io to be done */
5276 	sdbc_cblk_fba_t st_cblk_len;	/* FBA len of starting cache block */
5277 	sdbc_cblk_fba_t end_cblk_len;	/* FBA len of ending cache block */
5278 	sdbc_cblk_fba_t st_cblk_off;	/* FBA offset into starting cblock */
5279 	int num_bdl;
5280 	_sd_cctl_t *cc_temp;
5281 	struct buf *bp;
5282 	unsigned int want_bits;
5283 	void (*fn)(blind_t, nsc_off_t, nsc_size_t, int);
5284 	sdbc_cblk_fba_t end_cblk_fill;	/* FBA's to fill to end of last block */
5285 	nsc_size_t vol_end_fill; /* # of FBA's to fill to end of the volume */
5286 
5287 	cd = HANDLE_CD(handle);
5288 	SDTRACE(ST_ENTER|SDF_READ, cd, fba_len, fba_pos, flag, 0);
5289 
5290 	ASSERT(cd >= 0);
5291 	if (_sd_cache_files[cd].cd_info->sh_failed) {
5292 		SDTRACE(ST_EXIT|SDF_READ, cd, fba_len, fba_pos, flag, EIO);
5293 		return (EIO);
5294 	}
5295 
5296 	/*
5297 	 * adjust the position and length so that the entire cache
5298 	 * block is read in
5299 	 */
5300 
5301 	/* first, adjust to beginning of cache block */
5302 
5303 	fba_len += BLK_FBA_OFF(fba_pos); /* add start offset to length */
5304 	fba_pos &= ~BLK_FBA_MASK; /* move position back to start of block */
5305 
5306 	/* compute fill to end of cache block */
5307 	end_cblk_fill = (BLK_FBAS - 1) - ((fba_len - 1) % BLK_FBAS);
5308 	vol_end_fill = _sd_cache_files[(cd)].cd_info->sh_filesize -
5309 	    (fba_pos + fba_len);
5310 
5311 	/* fill to lesser of cache block or end of volume */
5312 	fba_len += ((nsc_size_t)end_cblk_fill < vol_end_fill) ? end_cblk_fill :
5313 	    vol_end_fill;
5314 
5315 	DTRACE_PROBE2(_sd_doread_rfill, nsc_off_t, fba_pos,
5316 	    nsc_size_t, fba_len);
5317 
5318 
5319 	/* for small reads do 1-block readahead if previous block is in cache */
5320 	if (sdbc_prefetch1)
5321 		fba_len += sdbc_doread_prefetch(cc_ent, fba_pos, fba_len);
5322 
5323 	fba_orig_len = fba_len;
5324 	st_cblk_off = BLK_FBA_OFF(fba_pos);
5325 	st_cblk_len = BLK_FBAS - st_cblk_off;
5326 	if ((nsc_size_t)st_cblk_len >= fba_len) {
5327 		end_cblk_len = 0;
5328 		st_cblk_len = (sdbc_cblk_fba_t)fba_len;
5329 	} else {
5330 		end_cblk_len = BLK_FBA_OFF(fba_pos + fba_len);
5331 	}
5332 
5333 	cc_temp = cc_ent;
5334 	num_bdl = 0;
5335 	while (cc_temp)	{
5336 		num_bdl += (SDBC_LOOKUP_IOCOUNT(CENTRY_DIRTY(cc_temp)));
5337 		cc_temp = cc_temp->cc_chain;
5338 	}
5339 	bp = sd_alloc_iob(_sd_cache_files[cd].cd_crdev,
5340 	    fba_pos, num_bdl, B_READ);
5341 	if (bp == NULL) {
5342 		SDTRACE(ST_EXIT|SDF_READ, cd, fba_len, fba_pos, flag, E2BIG);
5343 		return (E2BIG);
5344 	}
5345 
5346 	want_bits = SDBC_GET_BITS(st_cblk_off, st_cblk_len);
5347 	if (want_bits & CENTRY_DIRTY(cc_ent))
5348 		_sd_ccent_rd(cc_ent, want_bits, bp);
5349 	else {
5350 		sd_add_fba(bp, &cc_ent->cc_addr, st_cblk_off, st_cblk_len);
5351 	}
5352 	file_len = FBA_SIZE(st_cblk_len);
5353 	cc_ent = cc_ent->cc_chain;
5354 	fba_len -= st_cblk_len;
5355 
5356 	while (fba_len > (nsc_size_t)end_cblk_len) {
5357 		if (CENTRY_DIRTY(cc_ent))
5358 			_sd_ccent_rd(cc_ent, (uint_t)BLK_FBA_BITS, bp);
5359 		else {
5360 			sd_add_fba(bp, &cc_ent->cc_addr, 0, BLK_FBAS);
5361 		}
5362 		file_len += CACHE_BLOCK_SIZE;
5363 		cc_ent = cc_ent->cc_chain;
5364 		fba_len -= BLK_FBAS;
5365 	}
5366 
5367 	if (fba_len) {
5368 		want_bits = SDBC_GET_BITS(0, end_cblk_len);
5369 		if (want_bits & CENTRY_DIRTY(cc_ent))
5370 			_sd_ccent_rd(cc_ent, want_bits, bp);
5371 		else {
5372 			sd_add_fba(bp, &cc_ent->cc_addr, 0, end_cblk_len);
5373 		}
5374 		file_len += FBA_SIZE(end_cblk_len);
5375 	}
5376 
5377 	CACHE_READ_MISS;
5378 	FBA_READ_IO_KSTATS(cd, file_len);
5379 
5380 	DISK_FBA_READ(cd, FBA_NUM(file_len));
5381 
5382 	fn = (handle->bh_flag & NSC_NOBLOCK) ? _sd_async_read_ea : NULL;
5383 	err = sd_start_io(bp, _sd_cache_files[cd].cd_strategy, fn, handle);
5384 
5385 	if (err != NSC_PENDING) {
5386 		_sd_read_complete(handle, fba_pos, fba_orig_len, err);
5387 	}
5388 
5389 	SDTRACE(ST_EXIT|SDF_READ, cd, fba_orig_len, fba_pos, flag, err);
5390 
5391 	return (err);
5392 }
5393 
5394 
5395 
5396 /*
5397  * _sd_read_complete - Do whatever is necessary after a read io is done.
5398  *
5399  * ARGUMENTS:
5400  *	handle  - handle allocated earlier on.
5401  *	fba_pos - disk block number to read from.
5402  *	fba_len - length in fbas.
5403  *	error   - error from io if any.
5404  *
5405  * RETURNS:
5406  *	NONE.
5407  *
5408  * Comments:
5409  *	This routine marks the cache blocks valid if the io completed
5410  *	sucessfully. Called from the async end action as well as after
5411  * 	a synchrnous read completes.
5412  */
5413 
5414 void
5415 _sd_read_complete(_sd_buf_handle_t *handle, nsc_off_t fba_pos,
5416     nsc_size_t fba_len, int error)
5417 {
5418 	sdbc_cblk_fba_t st_cblk_len;	/* FBA len of starting cache block */
5419 	sdbc_cblk_fba_t end_cblk_len;	/* FBA len of ending cache block */
5420 	sdbc_cblk_fba_t st_cblk_off;	/* FBA offset into starting cblock */
5421 	nsc_size_t cur_fba_len; /* length in FBA's */
5422 	_sd_cctl_t *cc_iocent;
5423 	_sd_cctl_t *first_iocent; /* first buffer when processing prefetch */
5424 
5425 	cc_iocent = handle->bh_centry;
5426 
5427 	if ((handle->bh_error = error) == 0) {
5428 		while (CENTRY_BLK(cc_iocent) != FBA_TO_BLK_NUM(fba_pos))
5429 			cc_iocent = cc_iocent->cc_chain;
5430 
5431 		cur_fba_len = fba_len;
5432 		st_cblk_off = BLK_FBA_OFF(fba_pos);
5433 		st_cblk_len = BLK_FBAS - st_cblk_off;
5434 		if ((nsc_size_t)st_cblk_len >= fba_len) {
5435 			end_cblk_len = 0;
5436 			st_cblk_len = (sdbc_cblk_fba_t)fba_len;
5437 		} else {
5438 			end_cblk_len = BLK_FBA_OFF(fba_pos + fba_len);
5439 		}
5440 
5441 		SDBC_SET_VALID_BITS(st_cblk_off, st_cblk_len, cc_iocent);
5442 		DATA_LOG(SDF_RDIO, cc_iocent, st_cblk_off, st_cblk_len);
5443 
5444 		DTRACE_PROBE4(_sd_read_complete_data1, uint64_t, (uint64_t)
5445 		    BLK_TO_FBA_NUM(CENTRY_BLK(cc_iocent)) + st_cblk_off,
5446 		    int, st_cblk_len, char *,
5447 		    *(int64_t *)(cc_iocent->cc_data + FBA_SIZE(st_cblk_off)),
5448 		    char *, *(int64_t *)(cc_iocent->cc_data +
5449 		    FBA_SIZE(st_cblk_off + st_cblk_len) - 8));
5450 
5451 
5452 		first_iocent = cc_iocent;
5453 		cc_iocent = cc_iocent->cc_chain;
5454 		cur_fba_len -= st_cblk_len;
5455 
5456 		while (cur_fba_len > (nsc_size_t)end_cblk_len) {
5457 			SET_FULLY_VALID(cc_iocent);
5458 			DATA_LOG(SDF_RDIO, cc_iocent, 0, BLK_FBAS);
5459 
5460 			DTRACE_PROBE4(_sd_read_complete_data2, uint64_t,
5461 			    (uint64_t)BLK_TO_FBA_NUM(CENTRY_BLK(cc_iocent)),
5462 			    int, BLK_FBAS, char *,
5463 			    *(int64_t *)(cc_iocent->cc_data), char *,
5464 			    *(int64_t *)(cc_iocent->cc_data +
5465 			    FBA_SIZE(BLK_FBAS) - 8));
5466 
5467 			/*
5468 			 * 4755485 release implicit prefetch buffers
5469 			 *
5470 			 * the cc_chain of the first buffer must NULL'd
5471 			 * else _sd_free_buf() will do a double free when
5472 			 * it traverses the chain.
5473 			 *
5474 			 * if a buffer has been marked PREFETCH_BUF_IR then
5475 			 * it is guaranteed that
5476 			 *    1. it is the second in a chain of two.
5477 			 *    2. cur_fba_len is BLK_FBAS.
5478 			 *    3. end_cblk_len is zero.
5479 			 *
5480 			 * because of 1 (and 2) above, we can safely exit the
5481 			 * while loop via the break statement without
5482 			 * executing the last two statements.  the break
5483 			 * statement is necessary because it would be unsafe
5484 			 * to access cc_iocent which could be reallocated
5485 			 * immediately after the _sd_centry_release().
5486 			 */
5487 			if (cc_iocent->cc_aging_dm & PREFETCH_BUF_IR) {
5488 				cc_iocent->cc_aging_dm &= ~(PREFETCH_BUF_IR);
5489 				_sd_centry_release(cc_iocent);
5490 				first_iocent->cc_chain = NULL;
5491 				break;
5492 			}
5493 
5494 			cc_iocent = cc_iocent->cc_chain;
5495 			cur_fba_len -= BLK_FBAS;
5496 		}
5497 		if (end_cblk_len) {
5498 			SDBC_SET_VALID_BITS(0, end_cblk_len, cc_iocent);
5499 			DATA_LOG(SDF_RDIO, cc_iocent, 0, end_cblk_len);
5500 
5501 			DTRACE_PROBE4(_sd_read_complete_data3, uint64_t,
5502 			    (uint64_t)BLK_TO_FBA_NUM(CENTRY_BLK(cc_iocent)),
5503 			    int, end_cblk_len, char *,
5504 			    *(int64_t *)(cc_iocent->cc_data), char *,
5505 			    *(int64_t *)(cc_iocent->cc_data +
5506 			    FBA_SIZE(end_cblk_len) - 8));
5507 		}
5508 	}
5509 
5510 }
5511 
5512 
5513 /*
5514  * _sd_async_read_ea - End action for async reads.
5515  *
5516  * ARGUMENTS:
5517  *	xhandle  - handle allocated earlier on (cast to blind_t).
5518  *	fba_pos - disk block number read from.
5519  *	fba_len - length in fbas.
5520  *	error   - error from io if any.
5521  *
5522  * RETURNS:
5523  *	NONE.
5524  *
5525  * Comments:
5526  *	This routine is called at interrupt level when the io is done.
5527  *	This is called only when read is asynchronous (NSC_NOBLOCK)
5528  */
5529 
5530 static void
5531 _sd_async_read_ea(blind_t xhandle, nsc_off_t fba_pos, nsc_size_t fba_len,
5532     int error)
5533 {
5534 	_sd_buf_handle_t *handle = xhandle;
5535 	int cd;
5536 
5537 	if (error) {
5538 		cd = HANDLE_CD(handle);
5539 		ASSERT(cd >= 0);
5540 		_sd_cache_files[cd].cd_info->sh_failed = 1;
5541 	}
5542 	SDTRACE(ST_ENTER|SDF_READ_EA, HANDLE_CD(handle),
5543 	    handle->bh_fba_len, handle->bh_fba_pos, 0, error);
5544 
5545 	_sd_read_complete(handle, fba_pos, fba_len, error);
5546 
5547 #if defined(_SD_DEBUG_PATTERN)
5548 	check_buf_consistency(handle, "rd");
5549 #endif
5550 
5551 	SDTRACE(ST_EXIT|SDF_READ_EA, HANDLE_CD(handle),
5552 	    handle->bh_fba_len, handle->bh_fba_pos, 0, 0);
5553 	_SD_READ_CALLBACK(handle);
5554 }
5555 
5556 
5557 /*
5558  * _sd_async_write_ea - End action for async writes.
5559  *
5560  * ARGUMENTS:
5561  *	xhandle  - handle allocated earlier on. (cast to blind_t)
5562  *	fba_pos - disk block number written to.
5563  *	fba_len - length in fbas.
5564  *	error   - error from io if any.
5565  *
5566  * RETURNS:
5567  *	NONE.
5568  *
5569  * Comments:
5570  *	This routine is called at interrupt level when the write io is done.
5571  *	This is called only when we are in write-through mode and the write
5572  *	call indicated asynchronous callback. (NSC_NOBLOCK)
5573  */
5574 
5575 /* ARGSUSED */
5576 
5577 static void
5578 _sd_async_write_ea(blind_t xhandle, nsc_off_t fba_pos, nsc_size_t fba_len,
5579     int error)
5580 {
5581 	_sd_buf_handle_t *handle = xhandle;
5582 	handle->bh_error = error;
5583 
5584 	if (error)
5585 		_sd_cache_files[HANDLE_CD(handle)].cd_info->sh_failed = 1;
5586 
5587 	_SD_WRITE_CALLBACK(handle);
5588 }
5589 
5590 /*
5591  * update_dirty - set dirty bits in cache block which is already dirty
5592  *	cc_inuse is held, need cc_lock to avoid race with _sd_process_pending
5593  *	must check for I/O in-progress and set PEND_DIRTY.
5594  *	return previous dirty bits
5595  *	[if set _sd_process_pending will re-issue]
5596  */
5597 static _sd_bitmap_t
5598 update_dirty(_sd_cctl_t *cc_ent, sdbc_cblk_fba_t st_off, sdbc_cblk_fba_t st_len)
5599 {
5600 	_sd_bitmap_t old;
5601 
5602 	/* was FAST */
5603 	mutex_enter(&cc_ent->cc_lock);
5604 	old = CENTRY_DIRTY(cc_ent);
5605 	if (old) {
5606 		/*
5607 		 * If we are writing to an FBA that is still marked dirty,
5608 		 * record a write cancellation.
5609 		 */
5610 		if (old & SDBC_GET_BITS(st_off, st_len)) {
5611 			CACHE_WRITE_CANCELLATION(CENTRY_CD(cc_ent));
5612 		}
5613 
5614 		/* This is a write to a block that was already dirty */
5615 		SDBC_SET_DIRTY(st_off, st_len, cc_ent);
5616 		sd_serialize();
5617 		if (CENTRY_IO_INPROGRESS(cc_ent))
5618 			cc_ent->cc_flag |= CC_PEND_DIRTY;
5619 	}
5620 	/* was FAST */
5621 	mutex_exit(&cc_ent->cc_lock);
5622 	return (old);
5623 }
5624 
5625 /*
5626  * _sd_write - Interface call to commit part of handle.
5627  *
5628  * ARGUMENTS:
5629  *	handle  - handle allocated earlier o.
5630  *	fba_pos - disk block number to write to.
5631  *	fba_len - length in fbas.
5632  *	flag    - (NSC_NOBLOCK | NSC_WRTHRU)
5633  *
5634  * RETURNS:
5635  *	errno if return > 0
5636  *	NSC_HIT (in cache), NSC_DONE (to disk) or NSC_PENDING otherwise.
5637  *
5638  * Comments:
5639  *	This routine checks validity of the handle and then calls the
5640  *	sync-write function if this write is determined to be write-through.
5641  *	Else, it reflects the data to the write blocks on the mirror node,
5642  *	(allocated in alloc_buf). If the cache block is not dirty, it is
5643  *	marked dirty and queued up for io processing later on.
5644  *	If parts are already dirty but io is not in progress yet, it is
5645  *	marked dirty and left alone (it is already in the queue)
5646  *	If parts are already dirty but io is in progress, it is marked
5647  *	dirty and also a flag is set indicating that this buffer should
5648  *	be reprocessed after the io-end-action.
5649  *	Attempt is made to coalesce multiple writes into a single list
5650  *	for io processing later on.
5651  *
5652  *	Issuing of writes may be delayed until the handle is released;
5653  *	_sd_queue_write() sets NSC_QUEUE, indicating that dirty bits
5654  *	and reflection to mirror have already been done, just queue I/O.
5655  */
5656 
5657 
5658 
5659 int
5660 _sd_write(_sd_buf_handle_t *handle, nsc_off_t fba_pos, nsc_size_t fba_len,
5661     int flag)
5662 {
5663 	int cd = HANDLE_CD(handle);
5664 	int num_queued, ret, queue_only, store_only;
5665 	sdbc_cblk_fba_t st_cblk_len;	/* FBA len of starting cache block */
5666 	sdbc_cblk_fba_t end_cblk_len;	/* FBA len of ending cache block */
5667 	sdbc_cblk_fba_t st_cblk_off;	/* FBA offset into starting cblock */
5668 	nsc_size_t cur_fba_len;	/* position in disk blocks */
5669 	_sd_cctl_t *cc_ent = NULL;
5670 	_sd_cctl_t *cur_chain = NULL, *dirty_next = NULL;
5671 
5672 
5673 	if (_sdbc_shutdown_in_progress) {
5674 		ret = EIO;
5675 		goto out;
5676 	}
5677 
5678 
5679 	if (!_SD_HANDLE_ACTIVE(handle)) {
5680 		SDALERT(SDF_WRITE,
5681 		    SDT_INV_CD, 0, SDT_INV_BL, handle->bh_flag, 0);
5682 		ret = EINVAL;
5683 		goto out;
5684 	}
5685 #if !defined(_SD_NOCHECKS)
5686 	ASSERT_HANDLE_LIMITS(handle, fba_pos, fba_len);
5687 	if ((handle->bh_flag & NSC_WRBUF) == 0) {
5688 		ret = EINVAL;
5689 		goto out;
5690 	}
5691 #endif
5692 	if (fba_len == 0) {
5693 		ret = NSC_DONE;
5694 		goto out;
5695 	}
5696 
5697 	/*
5698 	 * store_only: don't queue this I/O yet
5699 	 * queue_only: queue I/O to disk, don't store in mirror node
5700 	 */
5701 	if (flag & NSC_QUEUE)
5702 		queue_only = 1, store_only = 0;
5703 	else
5704 		if (_SD_DELAY_QUEUE && (fba_len != handle->bh_fba_len))
5705 			queue_only = 0, store_only = 1;
5706 	else
5707 		queue_only = store_only = 0;
5708 
5709 	if (!queue_only && _SD_FORCE_DISCONNECT(fba_len))
5710 		_SD_DISCONNECT_CALLBACK(handle);
5711 
5712 	if (_sd_cache_files[cd].cd_info->sh_failed) {
5713 		ret = EIO;
5714 		goto out;
5715 	}
5716 
5717 	KSTAT_RUNQ_ENTER(cd);
5718 
5719 	SDTRACE(ST_ENTER|SDF_WRITE, cd, fba_len, fba_pos, flag, 0);
5720 
5721 #if defined(_SD_DEBUG_PATTERN)
5722 	check_buf_consistency(handle, "wr");
5723 #endif
5724 
5725 	cc_ent = handle->bh_centry;
5726 
5727 	while (CENTRY_BLK(cc_ent) != FBA_TO_BLK_NUM(fba_pos))
5728 		cc_ent = cc_ent->cc_chain;
5729 
5730 	if (((handle->bh_flag | flag) & _SD_WRTHRU_MASK) ||
5731 	    (!queue_only && _sd_remote_store(cc_ent, fba_pos, fba_len))) {
5732 		flag |= NSC_WRTHRU;
5733 
5734 		ret = _sd_sync_write(handle, fba_pos, fba_len, flag);
5735 		goto stats_exit;
5736 	}
5737 
5738 	if (store_only)		/* enqueue in _sd_free_buf() */
5739 		handle->bh_flag |= NSC_QUEUE;
5740 	cur_fba_len = fba_len;
5741 	st_cblk_off = BLK_FBA_OFF(fba_pos);
5742 	st_cblk_len = BLK_FBAS - st_cblk_off;
5743 	if ((nsc_size_t)st_cblk_len >= fba_len) {
5744 		end_cblk_len = 0;
5745 		st_cblk_len = (sdbc_cblk_fba_t)fba_len;
5746 	} else {
5747 		end_cblk_len = BLK_FBA_OFF(fba_pos + fba_len);
5748 	}
5749 
5750 	if (CENTRY_DIRTY(cc_ent) && update_dirty(cc_ent, st_cblk_off,
5751 	    st_cblk_len))
5752 		goto loop1;
5753 	if (store_only) {
5754 		SDBC_SET_TOFLUSH(st_cblk_off, st_cblk_len, cc_ent);
5755 		goto loop1;
5756 	}
5757 	SDBC_SET_DIRTY(st_cblk_off, st_cblk_len, cc_ent);
5758 	cur_chain = dirty_next = cc_ent;
5759 	num_queued = 1;
5760 
5761 loop1:
5762 	DATA_LOG(SDF_WR, cc_ent, st_cblk_off, st_cblk_len);
5763 
5764 	DTRACE_PROBE4(_sd_write_data1, uint64_t, (uint64_t)
5765 	    (BLK_TO_FBA_NUM(CENTRY_BLK(cc_ent)) + st_cblk_off),
5766 	    int, st_cblk_len, char *,
5767 	    *(int64_t *)(cc_ent->cc_data + FBA_SIZE(st_cblk_off)),
5768 	    char *, *(int64_t *)(cc_ent->cc_data +
5769 	    FBA_SIZE(st_cblk_off+ st_cblk_len) - 8));
5770 
5771 	cur_fba_len -= st_cblk_len;
5772 	cc_ent = cc_ent->cc_chain;
5773 
5774 	while (cur_fba_len > (nsc_size_t)end_cblk_len) {
5775 		if (CENTRY_DIRTY(cc_ent) && update_dirty(cc_ent, 0, BLK_FBAS)) {
5776 			if (cur_chain) {
5777 				_sd_enqueue_dirty(cd, cur_chain, dirty_next,
5778 				    num_queued);
5779 				cur_chain = dirty_next = NULL;
5780 			}
5781 			goto loop2;
5782 		}
5783 		if (store_only) {
5784 			SDBC_SET_TOFLUSH(0, BLK_FBAS, cc_ent);
5785 			goto loop2;
5786 		}
5787 		SDBC_SET_DIRTY(0, BLK_FBAS, cc_ent);
5788 		if (dirty_next) {
5789 			dirty_next->cc_dirty_next = cc_ent;
5790 			dirty_next = cc_ent;
5791 			num_queued++;
5792 		} else {
5793 			cur_chain = dirty_next = cc_ent;
5794 			num_queued = 1;
5795 		}
5796 	loop2:
5797 		DATA_LOG(SDF_WR, cc_ent, 0, BLK_FBAS);
5798 
5799 		DTRACE_PROBE4(_sd_write_data2, uint64_t,
5800 		    (uint64_t)(BLK_TO_FBA_NUM(CENTRY_BLK(cc_ent))),
5801 		    int, BLK_FBAS, char *, *(int64_t *)(cc_ent->cc_data),
5802 		    char *, *(int64_t *)(cc_ent->cc_data +
5803 		    FBA_SIZE(BLK_FBAS) - 8));
5804 
5805 		cc_ent = cc_ent->cc_chain;
5806 		cur_fba_len -= BLK_FBAS;
5807 	}
5808 
5809 #if defined(_SD_DEBUG)
5810 	if (cur_fba_len != end_cblk_len)
5811 		cmn_err(CE_WARN, "!fba_len %" NSC_SZFMT " end_cblk_len %d in "
5812 		    "_sd_write", cur_fba_len, end_cblk_len);
5813 #endif
5814 
5815 	if (cur_fba_len) {
5816 		if (CENTRY_DIRTY(cc_ent) && update_dirty(cc_ent, 0,
5817 		    end_cblk_len)) {
5818 			if (cur_chain) {
5819 				_sd_enqueue_dirty(cd, cur_chain, dirty_next,
5820 				    num_queued);
5821 				cur_chain = dirty_next = NULL;
5822 			}
5823 			goto loop3;
5824 		}
5825 		if (store_only) {
5826 			SDBC_SET_TOFLUSH(0, end_cblk_len, cc_ent);
5827 			goto loop3;
5828 		}
5829 		SDBC_SET_DIRTY(0, end_cblk_len, cc_ent);
5830 		if (dirty_next) {
5831 			dirty_next->cc_dirty_next = cc_ent;
5832 			dirty_next = cc_ent;
5833 			num_queued++;
5834 		} else {
5835 			cur_chain = dirty_next = cc_ent;
5836 			num_queued = 1;
5837 		}
5838 	}
5839 loop3:
5840 	if (cur_fba_len) {
5841 		DATA_LOG(SDF_WR, cc_ent, 0, end_cblk_len);
5842 
5843 		DTRACE_PROBE4(_sd_write_data3, uint64_t,
5844 		    (uint64_t)(BLK_TO_FBA_NUM(CENTRY_BLK(cc_ent))),
5845 		    int, end_cblk_len, char *, *(int64_t *)(cc_ent->cc_data),
5846 		    char *, *(int64_t *)(cc_ent->cc_data +
5847 		    FBA_SIZE(end_cblk_len) - 8));
5848 
5849 	}
5850 
5851 	if (!store_only && cur_chain) {
5852 		_sd_enqueue_dirty(cd, cur_chain, dirty_next, num_queued);
5853 	}
5854 
5855 	if (!queue_only) {
5856 		CACHE_FBA_WRITE(cd,  fba_len);
5857 		CACHE_WRITE_HIT;
5858 
5859 		FBA_WRITE_IO_KSTATS(cd, FBA_SIZE(fba_len));
5860 	}
5861 
5862 	ret = NSC_HIT;
5863 
5864 stats_exit:
5865 	SDTRACE(ST_EXIT|SDF_WRITE, cd, fba_len, fba_pos, flag, ret);
5866 	KSTAT_RUNQ_EXIT(cd);
5867 out:
5868 	return (ret);
5869 }
5870 
5871 
5872 /*
5873  * _sd_queue_write(handle, fba_pos, fba_len): Queues delayed writes for
5874  *					    flushing
5875  *
5876  * ARGUMENTS:  handle  - handle allocated with NSC_WRBUF
5877  *	fba_pos - starting fba pos from _sd_alloc_buf()
5878  *	fba_len - fba len from _sd_alloc_buf()
5879  *
5880  * USAGE    :  Called if _SD_DELAY_QUEUE is set. Finds all blocks in the
5881  *	handle marked for flushing and queues them to be written in
5882  *	optimized (i.e. sequential) order
5883  */
5884 static void
5885 _sd_queue_write(_sd_buf_handle_t *handle, nsc_off_t fba_pos, nsc_size_t fba_len)
5886 {
5887 	nsc_off_t fba_end;
5888 	sdbc_cblk_fba_t sblk, len, dirty;
5889 	_sd_cctl_t *cc_ent;
5890 	nsc_off_t flush_pos;
5891 	int flush_pos_valid = 0;
5892 	nsc_size_t flush_len = 0;
5893 
5894 	cc_ent = handle->bh_centry;
5895 	fba_end = fba_pos + fba_len;
5896 	fba_pos = BLK_TO_FBA_NUM(CENTRY_BLK(cc_ent)); /* 1st block */
5897 	while (fba_pos < fba_end) {
5898 		dirty = cc_ent->cc_toflush;
5899 		cc_ent->cc_toflush = 0;
5900 		/*
5901 		 * Full block
5902 		 */
5903 		if (_SD_BMAP_ISFULL(dirty)) {
5904 			if (flush_pos_valid == 0) {
5905 				flush_pos_valid = 1;
5906 				flush_pos = fba_pos;
5907 			}
5908 			flush_len += BLK_FBAS;
5909 		}
5910 		/*
5911 		 * Partial block
5912 		 */
5913 		else while (dirty) {
5914 			sblk = SDBC_LOOKUP_STPOS(dirty);
5915 			len  = SDBC_LOOKUP_LEN(dirty);
5916 			SDBC_LOOKUP_MODIFY(dirty);
5917 
5918 			if (sblk && flush_pos_valid) {
5919 				(void) _sd_write(handle, flush_pos, flush_len,
5920 				    NSC_QUEUE);
5921 				flush_pos_valid = 0;
5922 				flush_len = 0;
5923 			}
5924 			if (flush_pos_valid == 0) {
5925 				flush_pos_valid = 1;
5926 				flush_pos = fba_pos + sblk;
5927 			}
5928 			flush_len += len;
5929 		}
5930 		fba_pos += BLK_FBAS;
5931 		cc_ent = cc_ent->cc_chain;
5932 		/*
5933 		 * If we find a gap, write out what we've got
5934 		 */
5935 		if (flush_pos_valid && (flush_pos + flush_len) != fba_pos) {
5936 			(void) _sd_write(handle, flush_pos, flush_len,
5937 			    NSC_QUEUE);
5938 			flush_pos_valid = 0;
5939 			flush_len = 0;
5940 		}
5941 	}
5942 	if (flush_pos_valid)
5943 		(void) _sd_write(handle, flush_pos, flush_len, NSC_QUEUE);
5944 }
5945 
5946 
5947 static int
5948 _sd_remote_store(_sd_cctl_t *cc_ent, nsc_off_t fba_pos, nsc_size_t fba_len)
5949 {
5950 	sdbc_cblk_fba_t st_cblk_len;	/* FBA len of starting cache block */
5951 	sdbc_cblk_fba_t end_cblk_len;	/* FBA len of ending cache block */
5952 	sdbc_cblk_fba_t st_cblk_off;	/* FBA offset into starting cblock */
5953 	ss_resource_t *ss_res;
5954 
5955 	if (_sd_nodes_configured <= 2 && _sd_is_mirror_down())
5956 		return (0);
5957 	st_cblk_off = BLK_FBA_OFF(fba_pos);
5958 	st_cblk_len = BLK_FBAS - st_cblk_off;
5959 	if ((nsc_size_t)st_cblk_len >= fba_len) {
5960 		end_cblk_len = 0;
5961 		st_cblk_len = (sdbc_cblk_fba_t)fba_len;
5962 	} else {
5963 		end_cblk_len = BLK_FBA_OFF(fba_pos + fba_len);
5964 	}
5965 
5966 	fba_len -= st_cblk_len;
5967 
5968 	ss_res = cc_ent->cc_write->sc_res;
5969 	if (SSOP_WRITE_CBLOCK(sdbc_safestore, ss_res,
5970 	    cc_ent->cc_data + FBA_SIZE(st_cblk_off), FBA_SIZE(st_cblk_len),
5971 	    FBA_SIZE(st_cblk_off))) {
5972 
5973 		cmn_err(CE_WARN,
5974 		    "!sdbc(_sd_write) safe store failed. Going synchronous");
5975 		SDTRACE(SDF_REFLECT, CENTRY_CD(cc_ent), fba_len,
5976 		    fba_pos, 0, -1);
5977 		return (-1);
5978 	}
5979 
5980 	cc_ent = cc_ent->cc_chain;
5981 	while (fba_len > (nsc_size_t)end_cblk_len) {
5982 		fba_len -= BLK_FBAS;
5983 
5984 		if (SSOP_WRITE_CBLOCK(sdbc_safestore, ss_res, cc_ent->cc_data,
5985 		    CACHE_BLOCK_SIZE, 0)) {
5986 
5987 			cmn_err(CE_WARN, "!sdbc(_sd_write) safe store failed. "
5988 			    "Going synchronous");
5989 			SDTRACE(SDF_REFLECT, CENTRY_CD(cc_ent), fba_len,
5990 			    fba_pos, 0, -1);
5991 			return (-1);
5992 		}
5993 
5994 		cc_ent = cc_ent->cc_chain;
5995 	} /* end while */
5996 
5997 	if (fba_len) {
5998 		if (SSOP_WRITE_CBLOCK(sdbc_safestore, ss_res,
5999 		    cc_ent->cc_data, FBA_SIZE(end_cblk_len), 0)) {
6000 
6001 			cmn_err(CE_WARN, "!sdbc(_sd_write) nvmem dma failed. "
6002 			    "Going synchronous");
6003 			SDTRACE(SDF_REFLECT, CENTRY_CD(cc_ent), fba_len,
6004 			    fba_pos, 0, -1);
6005 			return (-1);
6006 		}
6007 	}
6008 	return (0);
6009 }
6010 
6011 
6012 /*
6013  * _sd_sync_write2 - Write-through function.
6014  *
6015  * ARGUMENTS:
6016  *	wr_handle - handle into which to write the data.
6017  *	wr_st_pos - starting FBA position in wr_handle.
6018  *	fba_len   - length in fbas.
6019  *	flag	- NSC_NOBLOCK for async io.
6020  *	rd_handle - handle from which to read the data, or NULL.
6021  *	rd_st_pos - starting FBA position in rd_handle.
6022  *
6023  * RETURNS:
6024  *	errno if return > 0
6025  *	NSC_DONE or NSC_PENDING otherwise.
6026  *
6027  * Comments:
6028  *	This routine initiates io of the indicated portion. It returns
6029  *	synchronously after io is completed if NSC_NOBLOCK is not set.
6030  *	Else NSC_PENDING is returned with a subsequent write callback on
6031  *	io completion.
6032  *
6033  *	See _sd_copy_direct() for usage when
6034  *	    (wr_handle != rd_handle && rd_handle != NULL)
6035  */
6036 
6037 static int
6038 _sd_sync_write2(_sd_buf_handle_t *wr_handle, nsc_off_t wr_st_pos,
6039     nsc_size_t fba_len, int flag, _sd_buf_handle_t *rd_handle,
6040     nsc_off_t rd_st_pos)
6041 {
6042 	void (*fn)(blind_t, nsc_off_t, nsc_size_t, int);
6043 	_sd_cctl_t *wr_ent, *rd_ent;
6044 	nsc_size_t this_len;
6045 	nsc_off_t rd_pos, wr_pos;
6046 	nsc_size_t log_bytes;
6047 	int cd = HANDLE_CD(wr_handle);
6048 	int err;
6049 	uint_t dirty;
6050 	struct buf *bp;
6051 
6052 	LINTUSED(flag);
6053 
6054 	_SD_DISCONNECT_CALLBACK(wr_handle);
6055 
6056 	if (rd_handle == NULL) {
6057 		rd_handle = wr_handle;
6058 		rd_st_pos = wr_st_pos;
6059 	}
6060 
6061 	wr_ent = wr_handle->bh_centry;
6062 	while (CENTRY_BLK(wr_ent) != FBA_TO_BLK_NUM(wr_st_pos))
6063 		wr_ent = wr_ent->cc_chain;
6064 
6065 	rd_ent = rd_handle->bh_centry;
6066 	while (CENTRY_BLK(rd_ent) != FBA_TO_BLK_NUM(rd_st_pos))
6067 		rd_ent = rd_ent->cc_chain;
6068 
6069 	bp = sd_alloc_iob(_sd_cache_files[cd].cd_crdev,
6070 	    wr_st_pos, FBA_TO_BLK_LEN(fba_len) + 2, B_WRITE);
6071 
6072 	if (bp == NULL)
6073 		return (E2BIG);
6074 
6075 	wr_pos = BLK_FBA_OFF(wr_st_pos);
6076 	rd_pos = BLK_FBA_OFF(rd_st_pos);
6077 	log_bytes = 0;
6078 
6079 	do {
6080 		this_len = min((BLK_FBAS - rd_pos), (BLK_FBAS - wr_pos));
6081 
6082 		if (this_len > fba_len)
6083 			this_len = fba_len;
6084 
6085 		/*
6086 		 * clear dirty bits in the write handle.
6087 		 */
6088 
6089 		if (CENTRY_DIRTY(wr_ent)) {
6090 			mutex_enter(&wr_ent->cc_lock);
6091 
6092 			if (CENTRY_DIRTY(wr_ent)) {
6093 				if (this_len == (nsc_size_t)BLK_FBAS ||
6094 				    rd_handle != wr_handle) {
6095 					/*
6096 					 * optimization for when we have a
6097 					 * full cache block, or are doing
6098 					 * copy_direct (see below).
6099 					 */
6100 
6101 					wr_ent->cc_write->sc_dirty = 0;
6102 				} else {
6103 					dirty = wr_ent->cc_write->sc_dirty;
6104 					dirty &= ~(SDBC_GET_BITS(
6105 					    wr_pos, this_len));
6106 					wr_ent->cc_write->sc_dirty = dirty;
6107 				}
6108 
6109 				SSOP_SETCENTRY(sdbc_safestore,
6110 				    wr_ent->cc_write);
6111 			}
6112 
6113 			mutex_exit(&wr_ent->cc_lock);
6114 		}
6115 
6116 		/*
6117 		 * update valid bits in the write handle.
6118 		 */
6119 
6120 		if (rd_handle == wr_handle) {
6121 			if (this_len == (nsc_size_t)BLK_FBAS) {
6122 				SET_FULLY_VALID(wr_ent);
6123 			} else {
6124 				SDBC_SET_VALID_BITS(wr_pos, this_len, wr_ent);
6125 			}
6126 		} else {
6127 			/*
6128 			 * doing copy_direct, so mark the write handle
6129 			 * as invalid since the data is on disk, but not
6130 			 * in cache.
6131 			 */
6132 			wr_ent->cc_valid = 0;
6133 		}
6134 
6135 		DATA_LOG(SDF_WRSYNC, rd_ent, rd_pos, this_len);
6136 
6137 		DTRACE_PROBE4(_sd_sync_write2_data, uint64_t,
6138 		    (uint64_t)BLK_TO_FBA_NUM(CENTRY_BLK(rd_ent)) + rd_pos,
6139 		    uint64_t, (uint64_t)this_len, char *,
6140 		    *(int64_t *)(rd_ent->cc_data + FBA_SIZE(rd_pos)),
6141 		    char *, *(int64_t *)(rd_ent->cc_data +
6142 		    FBA_SIZE(rd_pos + this_len) - 8));
6143 
6144 		sd_add_fba(bp, &rd_ent->cc_addr, rd_pos, this_len);
6145 
6146 		log_bytes += FBA_SIZE(this_len);
6147 		fba_len -= this_len;
6148 
6149 		wr_pos += this_len;
6150 		if (wr_pos >= (nsc_size_t)BLK_FBAS) {
6151 			wr_ent = wr_ent->cc_chain;
6152 			wr_pos = 0;
6153 		}
6154 
6155 		rd_pos += this_len;
6156 		if (rd_pos >= (nsc_size_t)BLK_FBAS) {
6157 			rd_ent = rd_ent->cc_chain;
6158 			rd_pos = 0;
6159 		}
6160 
6161 	} while (fba_len > 0);
6162 
6163 	DISK_FBA_WRITE(cd, FBA_NUM(log_bytes));
6164 	CACHE_WRITE_MISS;
6165 
6166 	FBA_WRITE_IO_KSTATS(cd, log_bytes);
6167 
6168 	fn = (wr_handle->bh_flag & NSC_NOBLOCK) ? _sd_async_write_ea : NULL;
6169 
6170 	err = sd_start_io(bp, _sd_cache_files[cd].cd_strategy, fn, wr_handle);
6171 
6172 	if (err != NSC_PENDING) {
6173 		DATA_LOG_CHAIN(SDF_WRSYEA, wr_handle->bh_centry,
6174 		    wr_st_pos, FBA_NUM(log_bytes));
6175 	}
6176 
6177 	return (err);
6178 }
6179 
6180 
6181 static int
6182 _sd_sync_write(_sd_buf_handle_t *handle, nsc_off_t fba_pos, nsc_size_t fba_len,
6183     int flag)
6184 {
6185 	return (_sd_sync_write2(handle, fba_pos, fba_len, flag, NULL, 0));
6186 }
6187 
6188 
6189 /*
6190  * _sd_zero - Interface call to zero out a portion of cache blocks.
6191  *
6192  * ARGUMENTS:
6193  *	handle  - handle allocated earlier on.
6194  *	fba_pos - disk block number to zero from.
6195  *	fba_len - length in fbas.
6196  *	flag    - NSC_NOBLOCK for async io.
6197  *
6198  * RETURNS:
6199  *	errno if return > 0
6200  *	NSC_DONE or NSC_PENDING otherwise.
6201  *
6202  * Comments:
6203  *	This routine zeroes out the indicated portion of the cache blocks
6204  *	and commits the data to disk.
6205  *	(See write for more details on the commit)
6206  */
6207 
6208 
6209 int
6210 _sd_zero(_sd_buf_handle_t *handle, nsc_off_t fba_pos, nsc_size_t fba_len,
6211     int flag)
6212 {
6213 	int cd;
6214 	sdbc_cblk_fba_t st_cblk_len;	/* FBA len of starting cache block */
6215 	sdbc_cblk_fba_t end_cblk_len;	/* FBA len of ending cache block */
6216 	sdbc_cblk_fba_t st_cblk_off;	/* FBA offset into starting cblock */
6217 	nsc_size_t cur_fba_len;	/* position in disk blocks */
6218 	int ret;
6219 	_sd_cctl_t *cc_ent;
6220 
6221 	if (_sdbc_shutdown_in_progress) {
6222 		DTRACE_PROBE(shutdown);
6223 		return (EIO);
6224 	}
6225 
6226 	if (!_SD_HANDLE_ACTIVE(handle)) {
6227 		cmn_err(CE_WARN, "!sdbc(_sd_zero) handle %p not active",
6228 		    (void *)handle);
6229 
6230 		DTRACE_PROBE1(handle_active, int, handle->bh_flag);
6231 
6232 		return (EINVAL);
6233 	}
6234 	ASSERT_HANDLE_LIMITS(handle, fba_pos, fba_len);
6235 	if ((handle->bh_flag & NSC_WRBUF) == 0) {
6236 		DTRACE_PROBE1(handle_write, int, handle->bh_flag);
6237 		return (EINVAL);
6238 	}
6239 
6240 	if (fba_len == 0) {
6241 		DTRACE_PROBE(zero_len);
6242 		return (NSC_DONE);
6243 	}
6244 
6245 	if (_SD_FORCE_DISCONNECT(fba_len))
6246 		_SD_DISCONNECT_CALLBACK(handle);
6247 
6248 	cd = HANDLE_CD(handle);
6249 	SDTRACE(ST_ENTER|SDF_ZERO, cd, fba_len, fba_pos, flag, 0);
6250 
6251 	cc_ent = handle->bh_centry;
6252 	while (CENTRY_BLK(cc_ent) != FBA_TO_BLK_NUM(fba_pos))
6253 		cc_ent = cc_ent->cc_chain;
6254 	cur_fba_len = fba_len;
6255 	st_cblk_off = BLK_FBA_OFF(fba_pos);
6256 	st_cblk_len = BLK_FBAS - st_cblk_off;
6257 	if ((nsc_size_t)st_cblk_len >= fba_len) {
6258 		end_cblk_len = 0;
6259 		st_cblk_len = (sdbc_cblk_fba_t)fba_len;
6260 	} else {
6261 		end_cblk_len = BLK_FBA_OFF(fba_pos + fba_len);
6262 	}
6263 
6264 	cur_fba_len -= st_cblk_len;
6265 	bzero(cc_ent->cc_data + FBA_SIZE(st_cblk_off), FBA_SIZE(st_cblk_len));
6266 
6267 	cc_ent = cc_ent->cc_chain;
6268 	while (cur_fba_len > (nsc_size_t)end_cblk_len) {
6269 		cur_fba_len -= BLK_FBAS;
6270 		bzero(cc_ent->cc_data, CACHE_BLOCK_SIZE);
6271 		cc_ent = cc_ent->cc_chain;
6272 	}
6273 	if (cur_fba_len) {
6274 		bzero(cc_ent->cc_data, FBA_SIZE(cur_fba_len));
6275 	}
6276 
6277 	ret = _sd_write(handle, fba_pos, fba_len, flag);
6278 	SDTRACE(ST_EXIT|SDF_ZERO, cd, fba_len, fba_pos, flag, ret);
6279 
6280 	return (ret);
6281 }
6282 
6283 
6284 /*
6285  * _sd_copy - Copies portions of 2 handles.
6286  *
6287  * ARGUMENTS:
6288  *	handle1  - handle allocated earlier on.
6289  *	handle2  - handle allocated earlier on.
6290  *	fba_pos1 - disk block number to read from.
6291  *	fba_pos2 - disk block number to write to.
6292  *	fba_len - length in fbas.
6293  *
6294  * RETURNS:
6295  *	errno if return > 0
6296  *	NSC_DONE otherwise.
6297  *
6298  * Comments:
6299  *	This routine copies the 2 handles.
6300  *	WARNING: this could put the cache blocks in the destination handle
6301  *	in an inconsistent state. (the blocks could be valid in cache,
6302  *	but the copy makes the cache different from disk)
6303  *
6304  */
6305 
6306 
6307 int
6308 _sd_copy(_sd_buf_handle_t *handle1, _sd_buf_handle_t *handle2,
6309     nsc_off_t fba_pos1, nsc_off_t fba_pos2, nsc_size_t fba_len)
6310 {
6311 	sdbc_cblk_fba_t st_cblk_len;	/* FBA len of starting cache block */
6312 	sdbc_cblk_fba_t end_cblk_len;	/* FBA len of ending cache block */
6313 	sdbc_cblk_fba_t st_cblk_off;	/* FBA offset into starting cblock */
6314 	nsc_off_t off1, off2;	/* offsets in FBA's into the disk */
6315 	nsc_size_t cur_fba_len;	/* position in disk blocks */
6316 	_sd_cctl_t *cc_ent1, *cc_ent2;
6317 
6318 	if (_sdbc_shutdown_in_progress) {
6319 		DTRACE_PROBE(shutdown);
6320 		return (EIO);
6321 	}
6322 	if (!_SD_HANDLE_ACTIVE(handle1) || !_SD_HANDLE_ACTIVE(handle2)) {
6323 		cmn_err(CE_WARN, "!sdbc(_sd_copy) handle %p or %p not active",
6324 		    (void *)handle1, (void *)handle2);
6325 
6326 		DTRACE_PROBE2(handle_active1, int, handle1->bh_flag,
6327 		    int, handle2->bh_flag);
6328 
6329 		return (EINVAL);
6330 	}
6331 	ASSERT_HANDLE_LIMITS(handle1, fba_pos1, fba_len);
6332 	ASSERT_HANDLE_LIMITS(handle2, fba_pos2, fba_len);
6333 
6334 	cc_ent1 = handle1->bh_centry;
6335 	while (CENTRY_BLK(cc_ent1) != FBA_TO_BLK_NUM(fba_pos1))
6336 		cc_ent1 = cc_ent1->cc_chain;
6337 
6338 	cc_ent2 = handle2->bh_centry;
6339 	while (CENTRY_BLK(cc_ent2) != FBA_TO_BLK_NUM(fba_pos2))
6340 		cc_ent2 = cc_ent2->cc_chain;
6341 
6342 	if (BLK_FBA_OFF(fba_pos1) != BLK_FBA_OFF(fba_pos2)) {
6343 		/* Different offsets, do it slowly (per fba) */
6344 
6345 		while (fba_len) {
6346 			off1 = FBA_SIZE(BLK_FBA_OFF(fba_pos1));
6347 			off2 = FBA_SIZE(BLK_FBA_OFF(fba_pos2));
6348 
6349 			bcopy(cc_ent1->cc_data+off1, cc_ent2->cc_data+off2,
6350 			    FBA_SIZE(1));
6351 
6352 			fba_pos1++;
6353 			fba_pos2++;
6354 			fba_len--;
6355 
6356 			if (FBA_TO_BLK_NUM(fba_pos1) != CENTRY_BLK(cc_ent1))
6357 				cc_ent1 = cc_ent1->cc_chain;
6358 			if (FBA_TO_BLK_NUM(fba_pos2) != CENTRY_BLK(cc_ent2))
6359 				cc_ent2 = cc_ent2->cc_chain;
6360 		}
6361 
6362 		DTRACE_PROBE(_sd_copy_end);
6363 		return (NSC_DONE);
6364 	}
6365 	cur_fba_len = fba_len;
6366 	st_cblk_off = BLK_FBA_OFF(fba_pos1);
6367 	st_cblk_len = BLK_FBAS - st_cblk_off;
6368 	if ((nsc_size_t)st_cblk_len >= fba_len) {
6369 		end_cblk_len = 0;
6370 		st_cblk_len = (sdbc_cblk_fba_t)fba_len;
6371 	} else {
6372 		end_cblk_len = BLK_FBA_OFF(fba_pos1 + fba_len);
6373 	}
6374 
6375 	bcopy(cc_ent1->cc_data + FBA_SIZE(st_cblk_off),
6376 	    cc_ent2->cc_data + FBA_SIZE(st_cblk_off), FBA_SIZE(st_cblk_len));
6377 	cur_fba_len -= st_cblk_len;
6378 	cc_ent1 = cc_ent1->cc_chain;
6379 	cc_ent2 = cc_ent2->cc_chain;
6380 
6381 	while (cur_fba_len > (nsc_size_t)end_cblk_len) {
6382 		bcopy(cc_ent1->cc_data, cc_ent2->cc_data, CACHE_BLOCK_SIZE);
6383 		cc_ent1 = cc_ent1->cc_chain;
6384 		cc_ent2 = cc_ent2->cc_chain;
6385 		cur_fba_len -= BLK_FBAS;
6386 	}
6387 	if (cur_fba_len) {
6388 		bcopy(cc_ent1->cc_data, cc_ent2->cc_data,
6389 		    FBA_SIZE(end_cblk_len));
6390 	}
6391 
6392 	return (NSC_DONE);
6393 }
6394 
6395 
6396 /*
6397  * _sd_copy_direct - Copies data from one handle direct to another disk.
6398  *
6399  * ARGUMENTS:
6400  *	handle1  - handle to read from
6401  *	handle2  - handle to write to
6402  *	fba_pos1 - disk block number to read from.
6403  *	fba_pos2 - disk block number to write to.
6404  *	fba_len - length in fbas.
6405  *
6406  * RETURNS:
6407  *	errno if return > 0
6408  *	NSC_DONE otherwise.
6409  *
6410  * Comments:
6411  *	This routine copies data from handle1 directly (sync write)
6412  *	onto the disk pointed to by handle2. The handle2 is then
6413  *	invalidated since the data it contains is now stale compared to
6414  *	the disk.
6415  */
6416 
6417 static int
6418 _sd_copy_direct(_sd_buf_handle_t *handle1, _sd_buf_handle_t *handle2,
6419     nsc_off_t fba_pos1, nsc_off_t fba_pos2, nsc_size_t fba_len)
6420 {
6421 	int rc;
6422 
6423 	if (_sdbc_shutdown_in_progress) {
6424 		DTRACE_PROBE(shutdown);
6425 		return (EIO);
6426 	}
6427 
6428 	if (!_SD_HANDLE_ACTIVE(handle1) || !_SD_HANDLE_ACTIVE(handle2)) {
6429 		cmn_err(CE_WARN,
6430 		    "!sdbc(_sd_copy_direct) handle %p or %p not active",
6431 		    (void *)handle1, (void *)handle2);
6432 
6433 		DTRACE_PROBE2(handle_active2, int, handle1->bh_flag,
6434 		    int, handle2->bh_flag);
6435 
6436 		return (EINVAL);
6437 	}
6438 
6439 	ASSERT_HANDLE_LIMITS(handle1, fba_pos1, fba_len);
6440 	ASSERT_HANDLE_LIMITS(handle2, fba_pos2, fba_len);
6441 
6442 	if ((handle2->bh_flag & NSC_WRITE) == 0) {
6443 		cmn_err(CE_WARN,
6444 		    "!sdbc(_sd_copy_direct) handle2 %p is not writeable",
6445 		    (void *)handle2);
6446 		DTRACE_PROBE1(handle2_write, int, handle2->bh_flag);
6447 		return (EINVAL);
6448 	}
6449 
6450 	rc = _sd_sync_write2(handle2, fba_pos2, fba_len, 0, handle1, fba_pos1);
6451 
6452 	return (rc);
6453 }
6454 
6455 
6456 /*
6457  * _sd_enqueue_dirty - Enqueue a list of dirty buffers.
6458  *
6459  * ARGUMENTS:
6460  *	cd	- cache descriptor.
6461  *	chain	- pointer to list.
6462  *	cc_last - last entry in the chain.
6463  *	numq    - number of entries in the list.
6464  *
6465  * RETURNS:
6466  *	NONE.
6467  *
6468  * Comments:
6469  *	This routine queues up the dirty blocks for io processing.
6470  *	It uses the cc_last to try to coalesce multiple lists into a
6471  *	single list, if consecutive writes are sequential in nature.
6472  */
6473 
6474 void
6475 _sd_enqueue_dirty(int cd, _sd_cctl_t *chain, _sd_cctl_t *cc_last, int numq)
6476 {
6477 	_sd_cd_info_t *cdi;
6478 	_sd_cctl_t *last_ent;
6479 	int start_write = 0, maxq = SGIO_MAX;
6480 
6481 	ASSERT(cd >= 0);
6482 	cdi = &(_sd_cache_files[cd]);
6483 #if defined(_SD_DEBUG)
6484 	if (chain->cc_dirty_link)
6485 		cmn_err(CE_WARN, "!dirty_link set in enq %x fl %x",
6486 		    chain->cc_dirty_link, chain->cc_flag);
6487 #endif
6488 
6489 	/* was FAST */
6490 	mutex_enter(&(cdi->cd_lock));
6491 	cdi->cd_info->sh_numdirty += numq;
6492 	if (cc_last == NULL)
6493 		numq = 0;
6494 
6495 	if (cdi->cd_dirty_head == NULL)  {
6496 		cdi->cd_dirty_head = cdi->cd_dirty_tail = chain;
6497 		cdi->cd_last_ent = cc_last;
6498 		cdi->cd_lastchain_ptr = chain;
6499 		cdi->cd_lastchain = numq;
6500 	} else {
6501 		if ((cc_last) && (last_ent = cdi->cd_last_ent) &&
6502 		    (CENTRY_BLK(chain) == (CENTRY_BLK(last_ent)+1)) &&
6503 		    (SDBC_DIRTY_NEIGHBORS(last_ent, chain)) &&
6504 		    (cdi->cd_lastchain + numq < maxq)) {
6505 			cdi->cd_last_ent->cc_dirty_next = chain;
6506 			cdi->cd_last_ent = cc_last;
6507 			cdi->cd_lastchain += numq;
6508 		} else {
6509 			cdi->cd_dirty_tail->cc_dirty_link = chain;
6510 			cdi->cd_dirty_tail = chain;
6511 			cdi->cd_last_ent = cc_last;
6512 			cdi->cd_lastchain_ptr = chain;
6513 			cdi->cd_lastchain = numq;
6514 			start_write = 1;
6515 		}
6516 	}
6517 	/* was FAST */
6518 	mutex_exit(&(cdi->cd_lock));
6519 	if (start_write)
6520 		(void) _SD_CD_WRITER(cd);
6521 }
6522 
6523 /*
6524  * _sd_enqueue_dirty_chain  - Enqueue a chain of a list of dirty buffers.
6525  *
6526  * ARGUMENTS:
6527  *	cd	- cache descriptor.
6528  *	chain_first	- first list in  this chain.
6529  *	chain_last 	- last list in this chain.
6530  *	numq    - number of entries being queue (total of all lists)
6531  *
6532  * RETURNS:
6533  *	NONE.
6534  *
6535  * Comments:
6536  *	This routine is called from the processing after io completions.
6537  *	If the buffers are still dirty, they are queued up in one shot.
6538  */
6539 
6540 void
6541 _sd_enqueue_dirty_chain(int cd,
6542 			_sd_cctl_t *chain_first,
6543 			_sd_cctl_t *chain_last,
6544 			int numq)
6545 {
6546 	_sd_cd_info_t *cdi;
6547 
6548 	ASSERT(cd >= 0);
6549 	cdi = &(_sd_cache_files[cd]);
6550 	if (chain_last->cc_dirty_link)
6551 		cmn_err(CE_PANIC,
6552 		    "!_sd_enqueue_dirty_chain: chain_last %p dirty_link %p",
6553 		    (void *)chain_last, (void *)chain_last->cc_dirty_link);
6554 	/* was FAST */
6555 	mutex_enter(&(cdi->cd_lock));
6556 	cdi->cd_last_ent = NULL;
6557 	cdi->cd_lastchain_ptr = NULL;
6558 	cdi->cd_lastchain = 0;
6559 
6560 	cdi->cd_info->sh_numdirty += numq;
6561 	if (cdi->cd_dirty_head == NULL)  {
6562 		cdi->cd_dirty_head = chain_first;
6563 		cdi->cd_dirty_tail = chain_last;
6564 	} else {
6565 		cdi->cd_dirty_tail->cc_dirty_link = chain_first;
6566 		cdi->cd_dirty_tail = chain_last;
6567 	}
6568 	/* was FAST */
6569 	mutex_exit(&(cdi->cd_lock));
6570 }
6571 
6572 
6573 #ifndef _MULTI_DATAMODEL
6574 /* ARGSUSED */
6575 #endif
6576 static int
6577 convert_stats(_sd_stats32_t *uptr)
6578 /*
6579  *	Convert the 64 bit statistic structure to 32bit version.
6580  *	Possibly losing information when cache is > 4gb. Ha!
6581  *
6582  *	NOTE: this code isn't really MT ready since the copied to struct
6583  *	is static. However the race is pretty benign and isn't a whole
6584  *	lot worse than the vanilla version which copies data to user
6585  *	space from kernel structures that can be changing under it too.
6586  *	We can't use a local stack structure since the data size is
6587  *	70k or so and kernel stacks are tiny (8k).
6588  */
6589 {
6590 #ifndef _MULTI_DATAMODEL
6591 	return (SDBC_EMODELCONVERT);
6592 #else
6593 	int rc = 0;
6594 
6595 	/*
6596 	 * This could be done in less code with bcopy type operations
6597 	 * but this is simpler to follow and easier to change if
6598 	 * the structures change.
6599 	 */
6600 
6601 	_sd_cache_stats32->net_dirty = _sd_cache_stats->net_dirty;
6602 	_sd_cache_stats32->net_pending = _sd_cache_stats->net_pending;
6603 	_sd_cache_stats32->net_free = _sd_cache_stats->net_free;
6604 	_sd_cache_stats32->st_count = _sd_cache_stats->st_count;
6605 	_sd_cache_stats32->st_loc_count = _sd_cache_stats->st_loc_count;
6606 	_sd_cache_stats32->st_rdhits = _sd_cache_stats->st_rdhits;
6607 	_sd_cache_stats32->st_rdmiss = _sd_cache_stats->st_rdmiss;
6608 	_sd_cache_stats32->st_wrhits = _sd_cache_stats->st_wrhits;
6609 	_sd_cache_stats32->st_wrmiss = _sd_cache_stats->st_wrmiss;
6610 	_sd_cache_stats32->st_blksize = _sd_cache_stats->st_blksize;
6611 
6612 	_sd_cache_stats32->st_lru_blocks = _sd_cache_stats->st_lru_blocks;
6613 	_sd_cache_stats32->st_lru_noreq = _sd_cache_stats->st_lru_noreq;
6614 	_sd_cache_stats32->st_lru_req = _sd_cache_stats->st_lru_req;
6615 
6616 	_sd_cache_stats32->st_wlru_inq = _sd_cache_stats->st_wlru_inq;
6617 
6618 	_sd_cache_stats32->st_cachesize = _sd_cache_stats->st_cachesize;
6619 	_sd_cache_stats32->st_numblocks = _sd_cache_stats->st_numblocks;
6620 	_sd_cache_stats32->st_wrcancelns = _sd_cache_stats->st_wrcancelns;
6621 	_sd_cache_stats32->st_destaged = _sd_cache_stats->st_destaged;
6622 
6623 	/*
6624 	 * bcopy the shared stats which has nothing that needs conversion
6625 	 * in them
6626 	 */
6627 
6628 	bcopy(_sd_cache_stats->st_shared, _sd_cache_stats32->st_shared,
6629 	    sizeof (_sd_shared_t) * sdbc_max_devs);
6630 
6631 	if (copyout(_sd_cache_stats32, uptr, sizeof (_sd_stats32_t) +
6632 	    (sdbc_max_devs - 1) * sizeof (_sd_shared_t)))
6633 		rc = EFAULT;
6634 
6635 	return (rc);
6636 #endif /* _MULTI_DATAMODEL */
6637 }
6638 
6639 
6640 int
6641 _sd_get_stats(_sd_stats_t *uptr, int convert_32)
6642 {
6643 	int rc = 0;
6644 
6645 	if (_sd_cache_stats == NULL) {
6646 		static _sd_stats_t dummy;
6647 #ifdef _MULTI_DATAMODEL
6648 		static _sd_stats32_t dummy32;
6649 #endif
6650 
6651 		if (convert_32) {
6652 #ifdef _MULTI_DATAMODEL
6653 			if (copyout(&dummy32, uptr, sizeof (_sd_stats32_t)))
6654 				rc = EFAULT;
6655 #else
6656 			rc = SDBC_EMODELCONVERT;
6657 #endif
6658 		} else if (copyout(&dummy, uptr, sizeof (_sd_stats_t)))
6659 			rc = EFAULT;
6660 		return (rc);
6661 	}
6662 
6663 	_sd_cache_stats->st_lru_blocks = _sd_lru_q.sq_inq;
6664 	_sd_cache_stats->st_lru_noreq  = _sd_lru_q.sq_noreq_stat;
6665 	_sd_cache_stats->st_lru_req    = _sd_lru_q.sq_req_stat;
6666 
6667 	if (sdbc_safestore) {
6668 		ssioc_stats_t ss_stats;
6669 
6670 		if (SSOP_CTL(sdbc_safestore, SSIOC_STATS,
6671 		    (uintptr_t)&ss_stats) == 0)
6672 			_sd_cache_stats->st_wlru_inq = ss_stats.wq_inq;
6673 		else
6674 			_sd_cache_stats->st_wlru_inq = 0;
6675 	}
6676 
6677 	if (convert_32)
6678 		rc = convert_stats((_sd_stats32_t *)uptr);
6679 	else if (copyout(_sd_cache_stats, uptr,
6680 	    sizeof (_sd_stats_t) + (sdbc_max_devs - 1) * sizeof (_sd_shared_t)))
6681 		rc = EFAULT;
6682 
6683 	return (rc);
6684 }
6685 
6686 
6687 int
6688 _sd_set_hint(int cd, uint_t hint)
6689 {
6690 	int ret = 0;
6691 	if (FILE_OPENED(cd))  {
6692 		SDTRACE(ST_ENTER|SDF_HINT, cd, 1, SDT_INV_BL, hint, 0);
6693 		_sd_cache_files[cd].cd_hint |= (hint & _SD_HINT_MASK);
6694 		SDTRACE(ST_EXIT|SDF_HINT, cd, 1, SDT_INV_BL, hint, ret);
6695 	} else
6696 		ret = EINVAL;
6697 
6698 	return (ret);
6699 }
6700 
6701 
6702 
6703 int
6704 _sd_clear_hint(int cd, uint_t hint)
6705 {
6706 	int ret = 0;
6707 	if (FILE_OPENED(cd)) {
6708 		SDTRACE(ST_ENTER|SDF_HINT, cd, 2, SDT_INV_BL, hint, 0);
6709 		_sd_cache_files[cd].cd_hint &= ~(hint & _SD_HINT_MASK);
6710 		SDTRACE(ST_EXIT|SDF_HINT, cd, 2, SDT_INV_BL, hint, ret);
6711 	} else
6712 		ret = EINVAL;
6713 
6714 	return (ret);
6715 }
6716 
6717 
6718 int
6719 _sd_get_cd_hint(int cd, uint_t *hint)
6720 {
6721 	*hint = 0;
6722 	if (FILE_OPENED(cd)) {
6723 		*hint = _sd_cache_files[cd].cd_hint;
6724 		return (0);
6725 	} else
6726 		return (EINVAL);
6727 }
6728 
6729 static int
6730 _sd_node_hint_caller(blind_t hint, int  hint_action)
6731 {
6732 	int rc;
6733 
6734 	switch (hint_action) {
6735 		case NSC_GET_NODE_HINT:
6736 			rc = _sd_get_node_hint((uint_t *)hint);
6737 		break;
6738 		case NSC_SET_NODE_HINT:
6739 			rc = _sd_set_node_hint((uint_t)(unsigned long)hint);
6740 		break;
6741 		case NSC_CLEAR_NODE_HINT:
6742 			rc = _sd_clear_node_hint((uint_t)(unsigned long)hint);
6743 		break;
6744 		default:
6745 			rc = EINVAL;
6746 		break;
6747 	}
6748 
6749 	return (rc);
6750 }
6751 
6752 int
6753 _sd_set_node_hint(uint_t hint)
6754 {
6755 	SDTRACE(ST_ENTER|SDF_HINT, SDT_INV_CD, 3, SDT_INV_BL, hint, 0);
6756 	if ((_sd_node_hint & NSC_NO_FORCED_WRTHRU) &&
6757 	    (hint & NSC_FORCED_WRTHRU))
6758 		return (EINVAL);
6759 	_sd_node_hint |= (hint & _SD_HINT_MASK);
6760 	SDTRACE(ST_EXIT|SDF_HINT, SDT_INV_CD, 3, SDT_INV_BL,  hint, 0);
6761 	return (0);
6762 }
6763 
6764 
6765 int
6766 _sd_clear_node_hint(uint_t hint)
6767 {
6768 	SDTRACE(ST_ENTER|SDF_HINT, SDT_INV_CD, 4, SDT_INV_BL, hint, 0);
6769 	_sd_node_hint &= ~(hint & _SD_HINT_MASK);
6770 	SDTRACE(ST_EXIT|SDF_HINT, SDT_INV_CD, 4, SDT_INV_BL, hint, 0);
6771 	return (0);
6772 }
6773 
6774 
6775 int
6776 _sd_get_node_hint(uint_t *hint)
6777 {
6778 	*hint = _sd_node_hint;
6779 	return (0);
6780 }
6781 
6782 
6783 int
6784 _sd_get_partsize(blind_t xcd, nsc_size_t *ptr)
6785 {
6786 	int cd = (int)(unsigned long)xcd;
6787 
6788 	if (FILE_OPENED(cd)) {
6789 		*ptr = _sd_cache_files[cd].cd_info->sh_filesize;
6790 		return (0);
6791 	} else
6792 		return (EINVAL);
6793 }
6794 
6795 
6796 int
6797 _sd_get_maxfbas(blind_t xcd, int flag, nsc_size_t *ptr)
6798 {
6799 	int cd = (int)(unsigned long)xcd;
6800 
6801 	if (!FILE_OPENED(cd))
6802 		return (EINVAL);
6803 
6804 	if (flag & NSC_CACHEBLK)
6805 		*ptr = BLK_FBAS;
6806 	else
6807 		*ptr = sdbc_max_fbas;
6808 
6809 	return (0);
6810 }
6811 
6812 
6813 int
6814 _sd_control(blind_t xcd, int cmd, void *ptr, int len)
6815 {
6816 	_sd_cd_info_t *cdi;
6817 	int cd = (int)(unsigned long)xcd;
6818 
6819 	cdi = &(_sd_cache_files[cd]);
6820 	return (nsc_control(cdi->cd_rawfd, cmd, ptr, len));
6821 }
6822 
6823 
6824 int
6825 _sd_discard_pinned(blind_t xcd, nsc_off_t fba_pos, nsc_size_t fba_len)
6826 {
6827 	int cd = (int)(unsigned long)xcd;
6828 	_sd_cctl_t *cc_ent, **cc_lst, **cc_tmp, *nxt;
6829 	ss_centry_info_t *wctl;
6830 	int found = 0;
6831 	nsc_off_t cblk;
6832 	_sd_cd_info_t *cdi = &_sd_cache_files[cd];
6833 	int rc;
6834 
6835 	if ((!FILE_OPENED(cd)) || (!cdi->cd_info->sh_failed)) {
6836 
6837 		return (EINVAL);
6838 	}
6839 
6840 	for (cblk = FBA_TO_BLK_NUM(fba_pos);
6841 	    cblk < FBA_TO_BLK_LEN(fba_pos + fba_len); cblk++) {
6842 		if (cc_ent =
6843 		    (_sd_cctl_t *)_sd_hash_search(cd, cblk, _sd_htable)) {
6844 			if (!CENTRY_PINNED(cc_ent))
6845 				continue;
6846 
6847 			/*
6848 			 * remove cc_ent from failed links
6849 			 * cc_lst - pointer to "cc_dirty_link" pointer
6850 			 *	    starts at &cd_failed_head.
6851 			 * cc_tmp - pointer to "cc_dirty_next"
6852 			 *	    except when equal to cc_lst.
6853 			 */
6854 			mutex_enter(&cdi->cd_lock);
6855 			cc_tmp = cc_lst = &(cdi->cd_fail_head);
6856 			while (*cc_tmp != cc_ent) {
6857 				cc_tmp = &((*cc_tmp)->cc_dirty_next);
6858 				if (!*cc_tmp)
6859 					cc_lst = &((*cc_lst)->cc_dirty_link),
6860 					    cc_tmp = cc_lst;
6861 			}
6862 			if (*cc_tmp) {
6863 				found++;
6864 				if (cc_lst != cc_tmp) /* break chain */
6865 					*cc_tmp = NULL;
6866 				nxt = cc_ent->cc_dirty_next;
6867 				if (nxt) {
6868 					nxt->cc_dirty_link =
6869 					    (*cc_lst)->cc_dirty_link;
6870 					*cc_lst = nxt;
6871 				} else {
6872 					*cc_lst = (*cc_lst)->cc_dirty_link;
6873 				}
6874 				cdi->cd_info->sh_numfail--;
6875 				nsc_unpinned_data(cdi->cd_iodev,
6876 				    BLK_TO_FBA_NUM(CENTRY_BLK(cc_ent)),
6877 				    BLK_FBAS);
6878 			}
6879 			mutex_exit(&cdi->cd_lock);
6880 
6881 			/* clear dirty bits */
6882 			/* was FAST */
6883 			mutex_enter(&cc_ent->cc_lock);
6884 			cc_ent->cc_valid = cc_ent->cc_dirty = 0;
6885 			cc_ent->cc_flag &= ~(CC_QHEAD|CC_PEND_DIRTY|CC_PINNED);
6886 			cc_ent->cc_dirty_link = NULL;
6887 			wctl = cc_ent->cc_write;
6888 			cc_ent->cc_write = NULL;
6889 			/* was FAST */
6890 			mutex_exit(&cc_ent->cc_lock);
6891 
6892 			/* release cache block to head of LRU */
6893 			if (wctl) {
6894 				wctl->sc_flag = 0;
6895 				wctl->sc_dirty = 0;
6896 				SSOP_SETCENTRY(sdbc_safestore, wctl);
6897 				SSOP_DEALLOCRESOURCE(sdbc_safestore,
6898 				    wctl->sc_res);
6899 			}
6900 
6901 			if (!sdbc_use_dmchain)
6902 				_sd_requeue_head(cc_ent);
6903 		}
6904 	}
6905 
6906 	rc = found ? NSC_DONE : EINVAL;
6907 
6908 	return (rc);
6909 }
6910 
6911 
/*
 * Handle allocation
 *
 * _sd_handle_list anchors a doubly linked list (hl_top) of every buffer
 * handle handed out by _sd_alloc_handle(), together with a count
 * (hl_count) and the lock (hl_lock) that protects both.  The allocator
 * describes the list and count as being kept for debugging.
 */

_sd_buf_hlist_t  _sd_handle_list;
6917 
6918 /*
6919  * _sdbc_handles_unload - cache is being unloaded.
6920  */
6921 void
6922 _sdbc_handles_unload(void)
6923 {
6924 	mutex_destroy(&_sd_handle_list.hl_lock);
6925 
6926 }
6927 
6928 /*
6929  * _sdbc_handles_load - cache is being unloaded.
6930  */
6931 int
6932 _sdbc_handles_load(void)
6933 {
6934 	mutex_init(&_sd_handle_list.hl_lock, NULL, MUTEX_DRIVER, NULL);
6935 
6936 	return (0);
6937 }
6938 
6939 int
6940 _sdbc_handles_configure()
6941 {
6942 	_sd_handle_list.hl_count = 0;
6943 
6944 	_sd_handle_list.hl_top.bh_next = &_sd_handle_list.hl_top;
6945 	_sd_handle_list.hl_top.bh_prev = &_sd_handle_list.hl_top;
6946 
6947 	return (0);
6948 }
6949 
6950 
6951 
6952 /*
6953  * _sdbc_handles_deconfigure - cache is being deconfigured
6954  */
6955 void
6956 _sdbc_handles_deconfigure(void)
6957 {
6958 	_sd_handle_list.hl_count = 0;
6959 }
6960 
6961 
6962 _sd_buf_handle_t *
6963 _sd_alloc_handle(sdbc_callback_fn_t d_cb, sdbc_callback_fn_t r_cb,
6964 		sdbc_callback_fn_t w_cb)
6965 {
6966 	_sd_buf_handle_t *handle;
6967 
6968 	handle = (_sd_buf_handle_t *)kmem_zalloc(sizeof (_sd_buf_handle_t),
6969 	    KM_SLEEP);
6970 	/* maintain list and count for debugging */
6971 	mutex_enter(&_sd_handle_list.hl_lock);
6972 
6973 	handle->bh_prev = &_sd_handle_list.hl_top;
6974 	handle->bh_next = _sd_handle_list.hl_top.bh_next;
6975 	_sd_handle_list.hl_top.bh_next->bh_prev = handle;
6976 	_sd_handle_list.hl_top.bh_next = handle;
6977 
6978 	++_sd_handle_list.hl_count;
6979 	mutex_exit(&_sd_handle_list.hl_lock);
6980 #if !defined(_SD_NOCHECKS)
6981 	ASSERT(!(handle->bh_flag & (NSC_HALLOCATED | NSC_HACTIVE)));
6982 #endif
6983 	handle->bh_disconnect_cb = d_cb;
6984 	handle->bh_read_cb = r_cb;
6985 	handle->bh_write_cb = w_cb;
6986 	handle->bh_flag |= NSC_HALLOCATED;
6987 	handle->bh_alloc_thread = nsc_threadp();
6988 
6989 	return (handle);
6990 }
6991 
6992 int
6993 _sd_free_handle(_sd_buf_handle_t *handle)
6994 {
6995 
6996 	if ((handle->bh_flag & NSC_HALLOCATED) == 0) {
6997 		cmn_err(CE_WARN, "!sdbc(_sd_free_handle) handle %p not valid",
6998 		    (void *)handle);
6999 
7000 		DTRACE_PROBE(_sd_free_handle_end);
7001 
7002 		return (EINVAL);
7003 	}
7004 	if (_SD_HANDLE_ACTIVE(handle)) {
7005 		cmn_err(CE_WARN,
7006 		    "!sdbc(_sd_free_handle) attempt to free active handle %p",
7007 		    (void *)handle);
7008 
7009 		DTRACE_PROBE1(free_handle_active, int, handle->bh_flag);
7010 
7011 		return (EINVAL);
7012 	}
7013 
7014 
7015 	/* remove from queue before free */
7016 	mutex_enter(&_sd_handle_list.hl_lock);
7017 	handle->bh_prev->bh_next = handle->bh_next;
7018 	handle->bh_next->bh_prev = handle->bh_prev;
7019 	--_sd_handle_list.hl_count;
7020 	mutex_exit(&_sd_handle_list.hl_lock);
7021 
7022 	kmem_free(handle, sizeof (_sd_buf_handle_t));
7023 
7024 	return (0);
7025 }
7026 
7027 
7028 
7029 
/*
 * _SD_MAX_MAP is the number of entries in the two bitmap lookup tables
 * below: 0x100 (every 8-bit value) by default, 0x10000 (every 16-bit
 * value) when _SD_8K_BLKSIZE is defined.  Both tables are indexed by a
 * bitmap value in the range [0, _SD_MAX_MAP).
 */
#if !defined  (_SD_8K_BLKSIZE)
#define	_SD_MAX_MAP 0x100
#else 	/* !(_SD_8K_BLKSIZE)    */
#define	_SD_MAX_MAP 0x10000
#endif 	/* !(_SD_8K_BLKSIZE) 	*/

/* non-zero where the index is one contiguous run of set bits */
char _sd_contig_bmap[_SD_MAX_MAP];
/* per-bitmap precomputed data, filled in by _sd_init_lookup_map() */
_sd_map_info_t _sd_lookup_map[_SD_MAX_MAP];
7038 
7039 void
7040 _sd_init_contig_bmap(void)
7041 {
7042 	int i, j;
7043 
7044 	for (i = 1; i < _SD_MAX_MAP; i = ((i << 1) | 1))
7045 		for (j = i; j < _SD_MAX_MAP; j <<= 1)
7046 			_sd_contig_bmap[j] = 1;
7047 }
7048 
7049 
7050 
7051 
7052 void
7053 _sd_init_lookup_map(void)
7054 {
7055 	unsigned int i, j, k;
7056 	int stpos, len;
7057 	_sd_bitmap_t mask;
7058 
7059 	for (i = 0; i < _SD_MAX_MAP; i++) {
7060 		for (j = i, k = 0; j && ((j & 1) == 0); j >>= 1, k++)
7061 		;
7062 		stpos =  k;
7063 		_sd_lookup_map[i].mi_stpos = (unsigned char)k;
7064 
7065 		for (k = 0; j & 1; j >>= 1, k++)
7066 		;
7067 		len = k;
7068 		_sd_lookup_map[i].mi_len = (unsigned char)k;
7069 
7070 		_sd_lookup_map[i].mi_mask = SDBC_GET_BITS(stpos, len);
7071 	}
7072 	for (i = 0; i < _SD_MAX_MAP; i++) {
7073 		mask = (_sd_bitmap_t)i;
7074 		for (j = 0; mask; j++)
7075 			SDBC_LOOKUP_MODIFY(mask);
7076 
7077 		_sd_lookup_map[i].mi_dirty_count = (unsigned char)j;
7078 	}
7079 	for (i = 0; i < _SD_MAX_MAP; i++) {
7080 		_sd_lookup_map[i].mi_io_count = SDBC_LOOKUP_DTCOUNT(i);
7081 		mask = ~i;
7082 		_sd_lookup_map[i].mi_io_count += SDBC_LOOKUP_DTCOUNT(mask);
7083 	}
7084 }
7085 
7086 
/*
 * _sd_sdbc_def - table of named entry points exported by sdbc.
 *
 * Each row is three initialisers: a name string, a value (a function
 * address cast to uintptr_t, or NSC_CACHE for "Provide"), and a third
 * field that is 0 for every row here.  The table ends with an all-zero
 * row.  Presumably this is the definition list handed to nsctl when the
 * cache registers itself -- confirm against the registration call.
 */
nsc_def_t _sd_sdbc_def[] = {
	"Open",		(uintptr_t)_sd_open_io,			0,
	"Close",	(uintptr_t)_sd_close_io,		0,
	"Attach",	(uintptr_t)_sdbc_io_attach_cd,		0,
	"Detach",	(uintptr_t)_sdbc_io_detach_cd,		0,
	"AllocBuf",	(uintptr_t)_sd_alloc_buf,		0,
	"FreeBuf",	(uintptr_t)_sd_free_buf,		0,
	"Read",		(uintptr_t)_sd_read,			0,
	"Write",	(uintptr_t)_sd_write,			0,
	"Zero",		(uintptr_t)_sd_zero,			0,
	"Copy",		(uintptr_t)_sd_copy,			0,
	"CopyDirect",	(uintptr_t)_sd_copy_direct,		0,
	"Uncommit",	(uintptr_t)_sd_uncommit,		0,
	"AllocHandle",	(uintptr_t)_sd_alloc_handle,		0,
	"FreeHandle",	(uintptr_t)_sd_free_handle,		0,
	"Discard",	(uintptr_t)_sd_discard_pinned,		0,
	"Sizes",	(uintptr_t)_sd_cache_sizes,		0,
	"GetPinned",	(uintptr_t)_sd_get_pinned,		0,
	"NodeHints",	(uintptr_t)_sd_node_hint_caller,	0,
	"PartSize",	(uintptr_t)_sd_get_partsize,		0,
	"MaxFbas",	(uintptr_t)_sd_get_maxfbas,		0,
	"Control",	(uintptr_t)_sd_control,			0,
	"Provide",	NSC_CACHE,				0,
	0,		0,					0
};
7112 
/*
 * do the SD_GET_CD_CLUSTER_DATA ioctl (get the global filename data)
 *
 * Not implemented: the argument is ignored and ENOTTY is always returned.
 */
/* ARGSUSED */
int
sd_get_file_info_data(char *uaddrp)
{
	int rc = ENOTTY;

	return (rc);
}
7122 
7123 /*
7124  * do the SD_GET_CD_CLUSTER_SIZE ioctl (get size of global filename area)
7125  */
7126 int
7127 sd_get_file_info_size(void *uaddrp)
7128 {
7129 	if (copyout(&_sdbc_gl_file_info_size, uaddrp,
7130 	    sizeof (_sdbc_gl_file_info_size))) {
7131 		return (EFAULT);
7132 	}
7133 
7134 	return (0);
7135 }
7136 
7137 
/*
 * SD_GET_GLMUL_SIZES ioctl
 * get sizes of the global info regions (for this node only)
 *
 * Not implemented: the argument is ignored and ENOTTY is always returned.
 */
/* ARGSUSED */
int
sd_get_glmul_sizes(int *uaddrp)
{
	int rc = ENOTTY;

	return (rc);
}
7148 
/*
 * SD_GET_GLMUL_INFO ioctl
 * get the global metadata for write blocks (for this node only)
 *
 * Not implemented: the argument is ignored and ENOTTY is always returned.
 */
/* ARGSUSED */
int
sd_get_glmul_info(char *uaddrp)
{
	int rc = ENOTTY;

	return (rc);
}
7160 
7161 int
7162 sdbc_global_stats_update(kstat_t *ksp, int rw)
7163 {
7164 	sdbc_global_stats_t *sdbc_gstats;
7165 	_sd_stats_t *gstats_vars;
7166 	uint_t hint;
7167 
7168 	sdbc_gstats = (sdbc_global_stats_t *)(ksp->ks_data);
7169 
7170 	gstats_vars = _sd_cache_stats;
7171 
7172 	if (rw == KSTAT_WRITE) {
7173 		return (EACCES);
7174 	}
7175 
7176 	/* default to READ */
7177 	sdbc_gstats->ci_sdbc_count.value.ul = gstats_vars->st_count;
7178 	sdbc_gstats->ci_sdbc_loc_count.value.ul = gstats_vars->st_loc_count;
7179 	sdbc_gstats->ci_sdbc_rdhits.value.ul = (ulong_t)gstats_vars->st_rdhits;
7180 	sdbc_gstats->ci_sdbc_rdmiss.value.ul = (ulong_t)gstats_vars->st_rdmiss;
7181 	sdbc_gstats->ci_sdbc_wrhits.value.ul = (ulong_t)gstats_vars->st_wrhits;
7182 	sdbc_gstats->ci_sdbc_wrmiss.value.ul = (ulong_t)gstats_vars->st_wrmiss;
7183 
7184 	sdbc_gstats->ci_sdbc_blksize.value.ul =
7185 	    (ulong_t)gstats_vars->st_blksize;
7186 	sdbc_gstats->ci_sdbc_lru_blocks.value.ul = (ulong_t)_sd_lru_q.sq_inq;
7187 #ifdef DEBUG
7188 	sdbc_gstats->ci_sdbc_lru_noreq.value.ul =
7189 	    (ulong_t)_sd_lru_q.sq_noreq_stat;
7190 	sdbc_gstats->ci_sdbc_lru_req.value.ul = (ulong_t)_sd_lru_q.sq_req_stat;
7191 #endif
7192 	sdbc_gstats->ci_sdbc_wlru_inq.value.ul =
7193 	    (ulong_t)gstats_vars->st_wlru_inq;
7194 	sdbc_gstats->ci_sdbc_cachesize.value.ul =
7195 	    (ulong_t)gstats_vars->st_cachesize;
7196 	sdbc_gstats->ci_sdbc_numblocks.value.ul =
7197 	    (ulong_t)gstats_vars->st_numblocks;
7198 	sdbc_gstats->ci_sdbc_wrcancelns.value.ul =
7199 	    (ulong_t)gstats_vars->st_wrcancelns;
7200 	sdbc_gstats->ci_sdbc_destaged.value.ul =
7201 	    (ulong_t)gstats_vars->st_destaged;
7202 	sdbc_gstats->ci_sdbc_num_shared.value.ul = (ulong_t)sdbc_max_devs;
7203 	(void) _sd_get_node_hint(&hint);
7204 	sdbc_gstats->ci_sdbc_nodehints.value.ul = (ulong_t)hint;
7205 
7206 
7207 	return (0);
7208 }
7209 
7210 int
7211 sdbc_cd_stats_update(kstat_t *ksp, int rw)
7212 {
7213 	sdbc_cd_stats_t *sdbc_shstats;
7214 	_sd_shared_t *shstats_vars;
7215 	int name_len;
7216 	uint_t hint;
7217 
7218 	sdbc_shstats = (sdbc_cd_stats_t *)(ksp->ks_data);
7219 
7220 	shstats_vars = (_sd_shared_t *)(ksp->ks_private);
7221 
7222 	if (rw == KSTAT_WRITE) {
7223 		return (EACCES);
7224 	}
7225 
7226 	/* copy tail of filename to kstat. leave 1 byte for null char */
7227 	if (shstats_vars->sh_filename != NULL) {
7228 		name_len = (int)strlen(shstats_vars->sh_filename);
7229 		name_len -= (KSTAT_DATA_CHAR_LEN - 1);
7230 
7231 		if (name_len < 0) {
7232 			name_len = 0;
7233 		}
7234 
7235 		(void) strlcpy(sdbc_shstats->ci_sdbc_vol_name.value.c,
7236 		    shstats_vars->sh_filename + name_len, KSTAT_DATA_CHAR_LEN);
7237 	} else {
7238 		cmn_err(CE_WARN, "!Kstat error: no volume name associated "
7239 		    "with cache descriptor");
7240 	}
7241 
7242 	sdbc_shstats->ci_sdbc_failed.value.ul =
7243 	    (ulong_t)shstats_vars->sh_failed;
7244 	sdbc_shstats->ci_sdbc_cd.value.ul = (ulong_t)shstats_vars->sh_cd;
7245 	sdbc_shstats->ci_sdbc_cache_read.value.ul =
7246 	    (ulong_t)shstats_vars->sh_cache_read;
7247 	sdbc_shstats->ci_sdbc_cache_write.value.ul =
7248 	    (ulong_t)shstats_vars->sh_cache_write;
7249 	sdbc_shstats->ci_sdbc_disk_read.value.ul =
7250 	    (ulong_t)shstats_vars->sh_disk_read;
7251 	sdbc_shstats->ci_sdbc_disk_write.value.ul =
7252 	    (ulong_t)shstats_vars->sh_disk_write;
7253 #ifdef NSC_MULTI_TERABYTE
7254 	sdbc_shstats->ci_sdbc_filesize.value.ui64 =
7255 	    (uint64_t)shstats_vars->sh_filesize;
7256 #else
7257 	sdbc_shstats->ci_sdbc_filesize.value.ul =
7258 	    (ulong_t)shstats_vars->sh_filesize;
7259 #endif
7260 	sdbc_shstats->ci_sdbc_numdirty.value.ul =
7261 	    (ulong_t)shstats_vars->sh_numdirty;
7262 	sdbc_shstats->ci_sdbc_numio.value.ul = (ulong_t)shstats_vars->sh_numio;
7263 	sdbc_shstats->ci_sdbc_numfail.value.ul =
7264 	    (ulong_t)shstats_vars->sh_numfail;
7265 	sdbc_shstats->ci_sdbc_destaged.value.ul =
7266 	    (ulong_t)shstats_vars->sh_destaged;
7267 	sdbc_shstats->ci_sdbc_wrcancelns.value.ul =
7268 	    (ulong_t)shstats_vars->sh_wrcancelns;
7269 	(void) _sd_get_cd_hint(shstats_vars->sh_cd, &hint);
7270 	sdbc_shstats->ci_sdbc_cdhints.value.ul = (ulong_t)hint;
7271 
7272 
7273 	return (0);
7274 }
7275 
7276 
7277 /*
7278  * cd_kstat_add
7279  *
7280  * Installs all kstats and associated infrastructure (mutex, buffer),
7281  * associated with a particular cache descriptor.  This function is called
7282  * when the cache descriptor is opened in _sd_open().
7283  * "cd" -- cache descriptor number whose kstats we wish to add
7284  * returns: 0 on success, -1 on failure
7285  */
7286 static int
7287 cd_kstat_add(int cd)
7288 {
7289 	char name[KSTAT_STRLEN];
7290 
7291 	if (cd < 0 || cd >= sdbc_max_devs) {
7292 		cmn_err(CE_WARN, "!invalid cache descriptor: %d", cd);
7293 		return (-1);
7294 	}
7295 
7296 	/* create a regular kstat for this cache descriptor */
7297 	if (!sdbc_cd_kstats) {
7298 		cmn_err(CE_WARN, "!sdbc_cd_kstats not allocated");
7299 		return (-1);
7300 	}
7301 
7302 	(void) snprintf(name, KSTAT_STRLEN, "%s%d", SDBC_KSTAT_CDSTATS, cd);
7303 
7304 	sdbc_cd_kstats[cd] = kstat_create(SDBC_KSTAT_MODULE,
7305 	    cd, name, SDBC_KSTAT_CLASS, KSTAT_TYPE_NAMED,
7306 	    sizeof (sdbc_cd_stats)/sizeof (kstat_named_t),
7307 	    KSTAT_FLAG_VIRTUAL|KSTAT_FLAG_WRITABLE);
7308 
7309 	if (sdbc_cd_kstats[cd] != NULL) {
7310 		sdbc_cd_kstats[cd]->ks_data = &sdbc_cd_stats;
7311 		sdbc_cd_kstats[cd]->ks_update = sdbc_cd_stats_update;
7312 		sdbc_cd_kstats[cd]->ks_private =
7313 		    &_sd_cache_stats->st_shared[cd];
7314 		kstat_install(sdbc_cd_kstats[cd]);
7315 	} else {
7316 		cmn_err(CE_WARN, "!cdstats %d kstat allocation failed", cd);
7317 	}
7318 
7319 	/* create an I/O kstat for this cache descriptor */
7320 	if (!sdbc_cd_io_kstats) {
7321 		cmn_err(CE_WARN, "!sdbc_cd_io_kstats not allocated");
7322 		return (-1);
7323 	}
7324 
7325 	(void) snprintf(name, KSTAT_STRLEN, "%s%d", SDBC_IOKSTAT_CDSTATS, cd);
7326 
7327 	sdbc_cd_io_kstats[cd] = kstat_create(
7328 	    SDBC_KSTAT_MODULE, cd, name, "disk", KSTAT_TYPE_IO, 1, 0);
7329 
7330 	if (sdbc_cd_io_kstats[cd]) {
7331 		if (!sdbc_cd_io_kstats_mutexes) {
7332 			cmn_err(CE_WARN, "!sdbc_cd_io_kstats_mutexes not "
7333 			    "allocated");
7334 			return (-1);
7335 		}
7336 
7337 		mutex_init(&sdbc_cd_io_kstats_mutexes[cd], NULL,
7338 		    MUTEX_DRIVER, NULL);
7339 
7340 		sdbc_cd_io_kstats[cd]->ks_lock = &sdbc_cd_io_kstats_mutexes[cd];
7341 
7342 		kstat_install(sdbc_cd_io_kstats[cd]);
7343 
7344 	} else {
7345 		cmn_err(CE_WARN, "!sdbc cd %d io kstat allocation failed", cd);
7346 	}
7347 
7348 	return (0);
7349 }
7350 
7351 /*
7352  * cd_kstat_remove
7353  *
7354  * Uninstalls all kstats and associated infrastructure (mutex, buffer),
7355  * associated with a particular cache descriptor.  This function is called
7356  * when the cache descriptor is closed in _sd_close().
7357  * "cd" -- cache descriptor number whose kstats we wish to remove
7358  * returns: 0 on success, -1 on failure
7359  */
7360 static int
7361 cd_kstat_remove(int cd)
7362 {
7363 	if (cd < 0 || cd >= sdbc_max_devs) {
7364 		cmn_err(CE_WARN, "!invalid cache descriptor: %d", cd);
7365 		return (-1);
7366 	}
7367 
7368 	/* delete the regular kstat corresponding to this cache descriptor */
7369 	if (sdbc_cd_kstats && sdbc_cd_kstats[cd]) {
7370 		kstat_delete(sdbc_cd_kstats[cd]);
7371 		sdbc_cd_kstats[cd] = NULL;
7372 	}
7373 
7374 	/* delete the I/O kstat corresponding to this cache descriptor */
7375 	if (sdbc_cd_io_kstats && sdbc_cd_io_kstats[cd]) {
7376 		kstat_delete(sdbc_cd_io_kstats[cd]);
7377 		sdbc_cd_io_kstats[cd] = NULL;
7378 
7379 		if (sdbc_cd_io_kstats_mutexes) {
7380 			/* destroy the mutex associated with this I/O kstat */
7381 			mutex_destroy(&sdbc_cd_io_kstats_mutexes[cd]);
7382 		}
7383 	}
7384 
7385 	return (0);
7386 }
7387 
#ifdef DEBUG
/*
 * sdbc_dynmem_kstat_update_dm - kstat update routine for the dynamic
 * memory tunables and counters (DEBUG builds only).
 *
 *	ksp	- the kstat; ks_data is the named-value block and
 *		  ks_private the _dm_process_vars_t being exported.
 *	rw	- KSTAT_WRITE applies the tunables found in the value
 *		  block; anything else refreshes the value block.
 *
 * simplect_dm is incremented on every call.  On a write the tunables are
 * gathered into a local structure and handed to
 * sdbc_edit_xfer_process_vars_dm(); if WAKE_DEALLOC_THREAD_DM is then set
 * in process_directive it is cleared and thread_dm_cv is broadcast.
 * Always returns 0.
 */
int
sdbc_dynmem_kstat_update_dm(kstat_t *ksp, int rw)
{
	sdbc_dynmem_dm_t *ks;
	_dm_process_vars_t *pv;
	_dm_process_vars_t edit;

	simplect_dm++;

	ks = (sdbc_dynmem_dm_t *)(ksp->ks_data);

	/* global dynmem_processing_dm */
	pv = (_dm_process_vars_t *)(ksp->ks_private);

	if (rw == KSTAT_WRITE) {
		simplect_dm = ks->ci_sdbc_simplect.value.ul;

		/* only the tunable members of "edit" are filled in */
		edit.monitor_dynmem_process =
		    ks->ci_sdbc_monitor_dynmem.value.ul;
		edit.max_dyn_list = ks->ci_sdbc_max_dyn_list.value.ul;
		edit.cache_aging_ct1 = ks->ci_sdbc_cache_aging_ct1.value.ul;
		edit.cache_aging_ct2 = ks->ci_sdbc_cache_aging_ct2.value.ul;
		edit.cache_aging_ct3 = ks->ci_sdbc_cache_aging_ct3.value.ul;
		edit.cache_aging_sec1 = ks->ci_sdbc_cache_aging_sec1.value.ul;
		edit.cache_aging_sec2 = ks->ci_sdbc_cache_aging_sec2.value.ul;
		edit.cache_aging_sec3 = ks->ci_sdbc_cache_aging_sec3.value.ul;
		edit.cache_aging_pcnt1 =
		    ks->ci_sdbc_cache_aging_pcnt1.value.ul;
		edit.cache_aging_pcnt2 =
		    ks->ci_sdbc_cache_aging_pcnt2.value.ul;
		edit.max_holds_pcnt = ks->ci_sdbc_max_holds_pcnt.value.ul;
		edit.process_directive =
		    ks->ci_sdbc_process_directive.value.ul;
		(void) sdbc_edit_xfer_process_vars_dm(&edit);

		/* one-shot request to wake whoever waits on thread_dm_cv */
		if (pv->process_directive & WAKE_DEALLOC_THREAD_DM) {
			pv->process_directive &= ~WAKE_DEALLOC_THREAD_DM;
			mutex_enter(&dynmem_processing_dm.thread_dm_lock);
			cv_broadcast(&dynmem_processing_dm.thread_dm_cv);
			mutex_exit(&dynmem_processing_dm.thread_dm_lock);
		}

		return (0);
	}

	/* default to READ: tunables first ... */
	ks->ci_sdbc_simplect.value.ul = simplect_dm;
	ks->ci_sdbc_monitor_dynmem.value.ul = pv->monitor_dynmem_process;
	ks->ci_sdbc_max_dyn_list.value.ul = pv->max_dyn_list;
	ks->ci_sdbc_cache_aging_ct1.value.ul = pv->cache_aging_ct1;
	ks->ci_sdbc_cache_aging_ct2.value.ul = pv->cache_aging_ct2;
	ks->ci_sdbc_cache_aging_ct3.value.ul = pv->cache_aging_ct3;
	ks->ci_sdbc_cache_aging_sec1.value.ul = pv->cache_aging_sec1;
	ks->ci_sdbc_cache_aging_sec2.value.ul = pv->cache_aging_sec2;
	ks->ci_sdbc_cache_aging_sec3.value.ul = pv->cache_aging_sec3;
	ks->ci_sdbc_cache_aging_pcnt1.value.ul = pv->cache_aging_pcnt1;
	ks->ci_sdbc_cache_aging_pcnt2.value.ul = pv->cache_aging_pcnt2;
	ks->ci_sdbc_max_holds_pcnt.value.ul = pv->max_holds_pcnt;
	ks->ci_sdbc_process_directive.value.ul = pv->process_directive;

	/* ... then the counters */
	ks->ci_sdbc_alloc_ct.value.ul = pv->alloc_ct;
	ks->ci_sdbc_dealloc_ct.value.ul = pv->dealloc_ct;
	ks->ci_sdbc_history.value.ul = pv->history;
	ks->ci_sdbc_nodatas.value.ul = pv->nodatas;
	ks->ci_sdbc_candidates.value.ul = pv->candidates;
	ks->ci_sdbc_deallocs.value.ul = pv->deallocs;
	ks->ci_sdbc_hosts.value.ul = pv->hosts;
	ks->ci_sdbc_pests.value.ul = pv->pests;
	ks->ci_sdbc_metas.value.ul = pv->metas;
	ks->ci_sdbc_holds.value.ul = pv->holds;
	ks->ci_sdbc_others.value.ul = pv->others;
	ks->ci_sdbc_notavail.value.ul = pv->notavail;

	return (0);
}
#endif
7488