xref: /illumos-gate/usr/src/uts/common/io/blkdev/blkdev.c (revision 247e9a8ed695b16d62e2a0cb581e5c07d949d5ae)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved.
23  * Copyright 2012 Garrett D'Amore <garrett@damore.org>.  All rights reserved.
24  * Copyright 2012 Alexey Zaytsev <alexey.zaytsev@gmail.com> All rights reserved.
25  * Copyright 2016 Nexenta Systems, Inc.  All rights reserved.
26  * Copyright 2017 The MathWorks, Inc.  All rights reserved.
27  * Copyright 2019 Western Digital Corporation.
28  */
29 
30 #include <sys/types.h>
31 #include <sys/ksynch.h>
32 #include <sys/kmem.h>
33 #include <sys/file.h>
34 #include <sys/errno.h>
35 #include <sys/open.h>
36 #include <sys/buf.h>
37 #include <sys/uio.h>
38 #include <sys/aio_req.h>
39 #include <sys/cred.h>
40 #include <sys/modctl.h>
41 #include <sys/cmlb.h>
42 #include <sys/conf.h>
43 #include <sys/devops.h>
44 #include <sys/list.h>
45 #include <sys/sysmacros.h>
46 #include <sys/dkio.h>
47 #include <sys/vtoc.h>
48 #include <sys/scsi/scsi.h>	/* for DTYPE_DIRECT */
49 #include <sys/kstat.h>
50 #include <sys/fs/dv_node.h>
51 #include <sys/ddi.h>
52 #include <sys/sunddi.h>
53 #include <sys/note.h>
54 #include <sys/blkdev.h>
55 #include <sys/scsi/impl/inquiry.h>
56 
57 /*
58  * blkdev is a driver which provides a lot of the common functionality
59  * a block device driver may need and helps by removing code which
60  * is frequently duplicated in block device drivers.
61  *
62  * Within this driver all the struct cb_ops functions required for a
63  * block device driver are written with appropriate call back functions
64  * to be provided by the parent driver.
65  *
66  * To use blkdev, a driver needs to:
67  *	1. Create a bd_ops_t structure which has the call back operations
68  *	   blkdev will use.
69  *	2. Create a handle by calling bd_alloc_handle(). One of the
70  *	   arguments to this function is the bd_ops_t.
71  *	3. Call bd_attach_handle(). This will instantiate a blkdev device
72  *	   as a child device node of the calling driver.
73  *
74  * A parent driver is not restricted to just allocating and attaching a
75  * single instance, it may attach as many as it wishes. For each handle
76  * attached, appropriate entries in /dev/[r]dsk are created.
77  *
78  * The bd_ops_t routines that a parent of blkdev needs to provide are:
79  *
80  * o_drive_info: Provide information to blkdev such as how many I/O queues
81  *		 to create and the size of those queues. Also some device
82  *		 specifics such as EUI, vendor, product, model, serial
83  *		 number ....
84  *
85  * o_media_info: Provide information about the media. Eg size and block size.
86  *
87  * o_devid_init: Creates and initializes the device id. Typically calls
88  *		 ddi_devid_init().
89  *
90  * o_sync_cache: Issues a device appropriate command to flush any write
91  *		 caches.
92  *
93  * o_read:	 Read data as described by bd_xfer_t argument.
94  *
95  * o_write:	 Write data as described by bd_xfer_t argument.
96  *
97  *
98  * Queues
99  * ------
100  * Part of the drive_info data is a queue count. blkdev will create
101  * "queue count" number of waitq/runq pairs. Each waitq/runq pair
102  * operates independently. As an I/O is scheduled up to the parent
103  * driver via o_read or o_write its queue number is given. If the
104  * parent driver supports multiple hardware queues it can then select
105  * where to submit the I/O request.
106  *
107  * Currently blkdev uses a simplistic round-robin queue selection method.
108  * It has the advantage that it is lockless. In the future it will be
109  * worthwhile reviewing this strategy for something which prioritizes queues
110  * depending on how busy they are.
111  *
112  * Each waitq/runq pair is protected by its mutex (q_iomutex). Incoming
113  * I/O requests are initially added to the waitq. They are taken off the
114  * waitq, added to the runq and submitted, providing the runq is less
115  * than the qsize as specified in the drive_info. As an I/O request
116  * completes, the parent driver is required to call bd_xfer_done(), which
117  * will remove the I/O request from the runq and pass I/O completion
118  * status up the stack.
119  *
120  * Locks
121  * -----
122  * There are 4 instance global locks: d_ocmutex, d_ksmutex, d_errmutex and
123  * d_statemutex, as well as a q_iomutex per waitq/runq pair.
124  *
125  * Currently, there is no lock hierarchy. Nowhere do we ever own more than
126  * one lock, any change needs to be documented here with a defined
127  * hierarchy.
128  */
129 
/*
 * Each instance owns BD_MAXPART consecutive minor numbers.  Dividing the
 * minor number by BD_MAXPART yields the instance; the remainder is the
 * partition within that instance.
 */
#define	BD_MAXPART	64
#define	BDINST(dev)	(getminor(dev) / BD_MAXPART)
#define	BDPART(dev)	(getminor(dev) % BD_MAXPART)

/* Short names for the driver-private structures defined below. */
typedef struct bd bd_t;
typedef struct bd_xfer_impl bd_xfer_impl_t;
typedef struct bd_queue bd_queue_t;
137 
138 struct bd {
139 	void		*d_private;
140 	dev_info_t	*d_dip;
141 	kmutex_t	d_ocmutex;
142 	kmutex_t	d_ksmutex;
143 	kmutex_t	d_errmutex;
144 	kmutex_t	d_statemutex;
145 	kcondvar_t	d_statecv;
146 	enum dkio_state	d_state;
147 	cmlb_handle_t	d_cmlbh;
148 	unsigned	d_open_lyr[BD_MAXPART];	/* open count */
149 	uint64_t	d_open_excl;	/* bit mask indexed by partition */
150 	uint64_t	d_open_reg[OTYPCNT];		/* bit mask */
151 	uint64_t	d_io_counter;
152 
153 	uint32_t	d_qcount;
154 	uint32_t	d_qactive;
155 	uint32_t	d_maxxfer;
156 	uint32_t	d_blkshift;
157 	uint32_t	d_pblkshift;
158 	uint64_t	d_numblks;
159 	ddi_devid_t	d_devid;
160 
161 	kmem_cache_t	*d_cache;
162 	bd_queue_t	*d_queues;
163 	kstat_t		*d_ksp;
164 	kstat_io_t	*d_kiop;
165 	kstat_t		*d_errstats;
166 	struct bd_errstats *d_kerr;
167 
168 	boolean_t	d_rdonly;
169 	boolean_t	d_ssd;
170 	boolean_t	d_removable;
171 	boolean_t	d_hotpluggable;
172 	boolean_t	d_use_dma;
173 
174 	ddi_dma_attr_t	d_dma;
175 	bd_ops_t	d_ops;
176 	bd_handle_t	d_handle;
177 };
178 
179 struct bd_handle {
180 	bd_ops_t	h_ops;
181 	ddi_dma_attr_t	*h_dma;
182 	dev_info_t	*h_parent;
183 	dev_info_t	*h_child;
184 	void		*h_private;
185 	bd_t		*h_bd;
186 	char		*h_name;
187 	char		h_addr[30];	/* enough for w%0.16x,%X */
188 };
189 
190 struct bd_xfer_impl {
191 	bd_xfer_t	i_public;
192 	list_node_t	i_linkage;
193 	bd_t		*i_bd;
194 	buf_t		*i_bp;
195 	bd_queue_t	*i_bq;
196 	uint_t		i_num_win;
197 	uint_t		i_cur_win;
198 	off_t		i_offset;
199 	int		(*i_func)(void *, bd_xfer_t *);
200 	uint32_t	i_blkshift;
201 	size_t		i_len;
202 	size_t		i_resid;
203 };
204 
205 struct bd_queue {
206 	kmutex_t	q_iomutex;
207 	uint32_t	q_qsize;
208 	uint32_t	q_qactive;
209 	list_t		q_runq;
210 	list_t		q_waitq;
211 };
212 
/*
 * Shorthand so that members of the embedded public bd_xfer_t can be
 * reached directly from a bd_xfer_impl_t.
 */
#define	i_dmah		i_public.x_dmah
#define	i_dmac		i_public.x_dmac
#define	i_ndmac		i_public.x_ndmac
#define	i_kaddr		i_public.x_kaddr
#define	i_nblks		i_public.x_nblks
#define	i_blkno		i_public.x_blkno
#define	i_flags		i_public.x_flags
#define	i_qnum		i_public.x_qnum
221 
222 
223 /*
224  * Private prototypes.
225  */
226 
227 static void bd_prop_update_inqstring(dev_info_t *, char *, char *, size_t);
228 static void bd_create_inquiry_props(dev_info_t *, bd_drive_t *);
229 static void bd_create_errstats(bd_t *, int, bd_drive_t *);
230 static void bd_errstats_setstr(kstat_named_t *, char *, size_t, char *);
231 static void bd_init_errstats(bd_t *, bd_drive_t *);
232 
233 static int bd_getinfo(dev_info_t *, ddi_info_cmd_t, void *, void **);
234 static int bd_attach(dev_info_t *, ddi_attach_cmd_t);
235 static int bd_detach(dev_info_t *, ddi_detach_cmd_t);
236 
237 static int bd_open(dev_t *, int, int, cred_t *);
238 static int bd_close(dev_t, int, int, cred_t *);
239 static int bd_strategy(struct buf *);
240 static int bd_ioctl(dev_t, int, intptr_t, int, cred_t *, int *);
241 static int bd_dump(dev_t, caddr_t, daddr_t, int);
242 static int bd_read(dev_t, struct uio *, cred_t *);
243 static int bd_write(dev_t, struct uio *, cred_t *);
244 static int bd_aread(dev_t, struct aio_req *, cred_t *);
245 static int bd_awrite(dev_t, struct aio_req *, cred_t *);
246 static int bd_prop_op(dev_t, dev_info_t *, ddi_prop_op_t, int, char *,
247     caddr_t, int *);
248 
249 static int bd_tg_rdwr(dev_info_t *, uchar_t, void *, diskaddr_t, size_t,
250     void *);
251 static int bd_tg_getinfo(dev_info_t *, int, void *, void *);
252 static int bd_xfer_ctor(void *, void *, int);
253 static void bd_xfer_dtor(void *, void *);
254 static void bd_sched(bd_t *, bd_queue_t *);
255 static void bd_submit(bd_t *, bd_xfer_impl_t *);
256 static void bd_runq_exit(bd_xfer_impl_t *, int);
257 static void bd_update_state(bd_t *);
258 static int bd_check_state(bd_t *, enum dkio_state *);
259 static int bd_flush_write_cache(bd_t *, struct dk_callback *);
260 static int bd_check_uio(dev_t, struct uio *);
261 
262 struct cmlb_tg_ops bd_tg_ops = {
263 	TG_DK_OPS_VERSION_1,
264 	bd_tg_rdwr,
265 	bd_tg_getinfo,
266 };
267 
268 static struct cb_ops bd_cb_ops = {
269 	bd_open,		/* open */
270 	bd_close,		/* close */
271 	bd_strategy,		/* strategy */
272 	nodev,			/* print */
273 	bd_dump,		/* dump */
274 	bd_read,		/* read */
275 	bd_write,		/* write */
276 	bd_ioctl,		/* ioctl */
277 	nodev,			/* devmap */
278 	nodev,			/* mmap */
279 	nodev,			/* segmap */
280 	nochpoll,		/* poll */
281 	bd_prop_op,		/* cb_prop_op */
282 	0,			/* streamtab  */
283 	D_64BIT | D_MP,		/* Driver comaptibility flag */
284 	CB_REV,			/* cb_rev */
285 	bd_aread,		/* async read */
286 	bd_awrite		/* async write */
287 };
288 
289 struct dev_ops bd_dev_ops = {
290 	DEVO_REV,		/* devo_rev, */
291 	0,			/* refcnt  */
292 	bd_getinfo,		/* getinfo */
293 	nulldev,		/* identify */
294 	nulldev,		/* probe */
295 	bd_attach,		/* attach */
296 	bd_detach,		/* detach */
297 	nodev,			/* reset */
298 	&bd_cb_ops,		/* driver operations */
299 	NULL,			/* bus operations */
300 	NULL,			/* power */
301 	ddi_quiesce_not_needed,	/* quiesce */
302 };
303 
304 static struct modldrv modldrv = {
305 	&mod_driverops,
306 	"Generic Block Device",
307 	&bd_dev_ops,
308 };
309 
310 static struct modlinkage modlinkage = {
311 	MODREV_1, { &modldrv, NULL }
312 };
313 
314 static void *bd_state;
315 static krwlock_t bd_lock;
316 
317 int
318 _init(void)
319 {
320 	int	rv;
321 
322 	rv = ddi_soft_state_init(&bd_state, sizeof (struct bd), 2);
323 	if (rv != DDI_SUCCESS) {
324 		return (rv);
325 	}
326 	rw_init(&bd_lock, NULL, RW_DRIVER, NULL);
327 	rv = mod_install(&modlinkage);
328 	if (rv != DDI_SUCCESS) {
329 		rw_destroy(&bd_lock);
330 		ddi_soft_state_fini(&bd_state);
331 	}
332 	return (rv);
333 }
334 
335 int
336 _fini(void)
337 {
338 	int	rv;
339 
340 	rv = mod_remove(&modlinkage);
341 	if (rv == DDI_SUCCESS) {
342 		rw_destroy(&bd_lock);
343 		ddi_soft_state_fini(&bd_state);
344 	}
345 	return (rv);
346 }
347 
348 int
349 _info(struct modinfo *modinfop)
350 {
351 	return (mod_info(&modlinkage, modinfop));
352 }
353 
354 static int
355 bd_getinfo(dev_info_t *dip, ddi_info_cmd_t cmd, void *arg, void **resultp)
356 {
357 	bd_t	*bd;
358 	minor_t	inst;
359 
360 	_NOTE(ARGUNUSED(dip));
361 
362 	inst = BDINST((dev_t)arg);
363 
364 	switch (cmd) {
365 	case DDI_INFO_DEVT2DEVINFO:
366 		bd = ddi_get_soft_state(bd_state, inst);
367 		if (bd == NULL) {
368 			return (DDI_FAILURE);
369 		}
370 		*resultp = (void *)bd->d_dip;
371 		break;
372 
373 	case DDI_INFO_DEVT2INSTANCE:
374 		*resultp = (void *)(intptr_t)inst;
375 		break;
376 
377 	default:
378 		return (DDI_FAILURE);
379 	}
380 	return (DDI_SUCCESS);
381 }
382 
383 static void
384 bd_prop_update_inqstring(dev_info_t *dip, char *name, char *data, size_t len)
385 {
386 	int	ilen;
387 	char	*data_string;
388 
389 	ilen = scsi_ascii_inquiry_len(data, len);
390 	ASSERT3U(ilen, <=, len);
391 	if (ilen <= 0)
392 		return;
393 	/* ensure null termination */
394 	data_string = kmem_zalloc(ilen + 1, KM_SLEEP);
395 	bcopy(data, data_string, ilen);
396 	(void) ndi_prop_update_string(DDI_DEV_T_NONE, dip, name, data_string);
397 	kmem_free(data_string, ilen + 1);
398 }
399 
400 static void
401 bd_create_inquiry_props(dev_info_t *dip, bd_drive_t *drive)
402 {
403 	if (drive->d_vendor_len > 0)
404 		bd_prop_update_inqstring(dip, INQUIRY_VENDOR_ID,
405 		    drive->d_vendor, drive->d_vendor_len);
406 
407 	if (drive->d_product_len > 0)
408 		bd_prop_update_inqstring(dip, INQUIRY_PRODUCT_ID,
409 		    drive->d_product, drive->d_product_len);
410 
411 	if (drive->d_serial_len > 0)
412 		bd_prop_update_inqstring(dip, INQUIRY_SERIAL_NO,
413 		    drive->d_serial, drive->d_serial_len);
414 
415 	if (drive->d_revision_len > 0)
416 		bd_prop_update_inqstring(dip, INQUIRY_REVISION_ID,
417 		    drive->d_revision, drive->d_revision_len);
418 }
419 
420 static void
421 bd_create_errstats(bd_t *bd, int inst, bd_drive_t *drive)
422 {
423 	char	ks_module[KSTAT_STRLEN];
424 	char	ks_name[KSTAT_STRLEN];
425 	int	ndata = sizeof (struct bd_errstats) / sizeof (kstat_named_t);
426 
427 	if (bd->d_errstats != NULL)
428 		return;
429 
430 	(void) snprintf(ks_module, sizeof (ks_module), "%serr",
431 	    ddi_driver_name(bd->d_dip));
432 	(void) snprintf(ks_name, sizeof (ks_name), "%s%d,err",
433 	    ddi_driver_name(bd->d_dip), inst);
434 
435 	bd->d_errstats = kstat_create(ks_module, inst, ks_name, "device_error",
436 	    KSTAT_TYPE_NAMED, ndata, KSTAT_FLAG_PERSISTENT);
437 
438 	mutex_init(&bd->d_errmutex, NULL, MUTEX_DRIVER, NULL);
439 	if (bd->d_errstats == NULL) {
440 		/*
441 		 * Even if we cannot create the kstat, we create a
442 		 * scratch kstat.  The reason for this is to ensure
443 		 * that we can update the kstat all of the time,
444 		 * without adding an extra branch instruction.
445 		 */
446 		bd->d_kerr = kmem_zalloc(sizeof (struct bd_errstats),
447 		    KM_SLEEP);
448 	} else {
449 		bd->d_errstats->ks_lock = &bd->d_errmutex;
450 		bd->d_kerr = (struct bd_errstats *)bd->d_errstats->ks_data;
451 	}
452 
453 	kstat_named_init(&bd->d_kerr->bd_softerrs,	"Soft Errors",
454 	    KSTAT_DATA_UINT32);
455 	kstat_named_init(&bd->d_kerr->bd_harderrs,	"Hard Errors",
456 	    KSTAT_DATA_UINT32);
457 	kstat_named_init(&bd->d_kerr->bd_transerrs,	"Transport Errors",
458 	    KSTAT_DATA_UINT32);
459 
460 	if (drive->d_model_len > 0) {
461 		kstat_named_init(&bd->d_kerr->bd_model,	"Model",
462 		    KSTAT_DATA_STRING);
463 	} else {
464 		kstat_named_init(&bd->d_kerr->bd_vid,	"Vendor",
465 		    KSTAT_DATA_STRING);
466 		kstat_named_init(&bd->d_kerr->bd_pid,	"Product",
467 		    KSTAT_DATA_STRING);
468 	}
469 
470 	kstat_named_init(&bd->d_kerr->bd_revision,	"Revision",
471 	    KSTAT_DATA_STRING);
472 	kstat_named_init(&bd->d_kerr->bd_serial,	"Serial No",
473 	    KSTAT_DATA_STRING);
474 	kstat_named_init(&bd->d_kerr->bd_capacity,	"Size",
475 	    KSTAT_DATA_ULONGLONG);
476 	kstat_named_init(&bd->d_kerr->bd_rq_media_err,	"Media Error",
477 	    KSTAT_DATA_UINT32);
478 	kstat_named_init(&bd->d_kerr->bd_rq_ntrdy_err,	"Device Not Ready",
479 	    KSTAT_DATA_UINT32);
480 	kstat_named_init(&bd->d_kerr->bd_rq_nodev_err,	"No Device",
481 	    KSTAT_DATA_UINT32);
482 	kstat_named_init(&bd->d_kerr->bd_rq_recov_err,	"Recoverable",
483 	    KSTAT_DATA_UINT32);
484 	kstat_named_init(&bd->d_kerr->bd_rq_illrq_err,	"Illegal Request",
485 	    KSTAT_DATA_UINT32);
486 	kstat_named_init(&bd->d_kerr->bd_rq_pfa_err,
487 	    "Predictive Failure Analysis", KSTAT_DATA_UINT32);
488 
489 	bd->d_errstats->ks_private = bd;
490 
491 	kstat_install(bd->d_errstats);
492 }
493 
494 static void
495 bd_errstats_setstr(kstat_named_t *k, char *str, size_t len, char *alt)
496 {
497 	char	*tmp;
498 	size_t	km_len;
499 
500 	if (KSTAT_NAMED_STR_PTR(k) == NULL) {
501 		if (len > 0)
502 			km_len = strnlen(str, len);
503 		else if (alt != NULL)
504 			km_len = strlen(alt);
505 		else
506 			return;
507 
508 		tmp = kmem_alloc(km_len + 1, KM_SLEEP);
509 		bcopy(len > 0 ? str : alt, tmp, km_len);
510 		tmp[km_len] = '\0';
511 
512 		kstat_named_setstr(k, tmp);
513 	}
514 }
515 
516 static void
517 bd_errstats_clrstr(kstat_named_t *k)
518 {
519 	if (KSTAT_NAMED_STR_PTR(k) == NULL)
520 		return;
521 
522 	kmem_free(KSTAT_NAMED_STR_PTR(k), KSTAT_NAMED_STR_BUFLEN(k));
523 	kstat_named_setstr(k, NULL);
524 }
525 
526 static void
527 bd_init_errstats(bd_t *bd, bd_drive_t *drive)
528 {
529 	struct bd_errstats	*est = bd->d_kerr;
530 
531 	mutex_enter(&bd->d_errmutex);
532 
533 	if (drive->d_model_len > 0 &&
534 	    KSTAT_NAMED_STR_PTR(&est->bd_model) == NULL) {
535 		bd_errstats_setstr(&est->bd_model, drive->d_model,
536 		    drive->d_model_len, NULL);
537 	} else {
538 		bd_errstats_setstr(&est->bd_vid, drive->d_vendor,
539 		    drive->d_vendor_len, "Unknown ");
540 		bd_errstats_setstr(&est->bd_pid, drive->d_product,
541 		    drive->d_product_len, "Unknown         ");
542 	}
543 
544 	bd_errstats_setstr(&est->bd_revision, drive->d_revision,
545 	    drive->d_revision_len, "0001");
546 	bd_errstats_setstr(&est->bd_serial, drive->d_serial,
547 	    drive->d_serial_len, "0               ");
548 
549 	mutex_exit(&bd->d_errmutex);
550 }
551 
552 static void
553 bd_fini_errstats(bd_t *bd)
554 {
555 	struct bd_errstats	*est = bd->d_kerr;
556 
557 	mutex_enter(&bd->d_errmutex);
558 
559 	bd_errstats_clrstr(&est->bd_model);
560 	bd_errstats_clrstr(&est->bd_vid);
561 	bd_errstats_clrstr(&est->bd_pid);
562 	bd_errstats_clrstr(&est->bd_revision);
563 	bd_errstats_clrstr(&est->bd_serial);
564 
565 	mutex_exit(&bd->d_errmutex);
566 }
567 
568 static void
569 bd_queues_free(bd_t *bd)
570 {
571 	uint32_t i;
572 
573 	for (i = 0; i < bd->d_qcount; i++) {
574 		bd_queue_t *bq = &bd->d_queues[i];
575 
576 		mutex_destroy(&bq->q_iomutex);
577 		list_destroy(&bq->q_waitq);
578 		list_destroy(&bq->q_runq);
579 	}
580 
581 	kmem_free(bd->d_queues, sizeof (*bd->d_queues) * bd->d_qcount);
582 }
583 
584 static int
585 bd_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
586 {
587 	int		inst;
588 	bd_handle_t	hdl;
589 	bd_t		*bd;
590 	bd_drive_t	drive;
591 	uint32_t	i;
592 	int		rv;
593 	char		name[16];
594 	char		kcache[32];
595 
596 	switch (cmd) {
597 	case DDI_ATTACH:
598 		break;
599 	case DDI_RESUME:
600 		/* We don't do anything native for suspend/resume */
601 		return (DDI_SUCCESS);
602 	default:
603 		return (DDI_FAILURE);
604 	}
605 
606 	inst = ddi_get_instance(dip);
607 	hdl = ddi_get_parent_data(dip);
608 
609 	(void) snprintf(name, sizeof (name), "%s%d",
610 	    ddi_driver_name(dip), ddi_get_instance(dip));
611 	(void) snprintf(kcache, sizeof (kcache), "%s_xfer", name);
612 
613 	if (hdl == NULL) {
614 		cmn_err(CE_WARN, "%s: missing parent data!", name);
615 		return (DDI_FAILURE);
616 	}
617 
618 	if (ddi_soft_state_zalloc(bd_state, inst) != DDI_SUCCESS) {
619 		cmn_err(CE_WARN, "%s: unable to zalloc soft state!", name);
620 		return (DDI_FAILURE);
621 	}
622 	bd = ddi_get_soft_state(bd_state, inst);
623 
624 	if (hdl->h_dma) {
625 		bd->d_dma = *(hdl->h_dma);
626 		bd->d_dma.dma_attr_granular =
627 		    max(DEV_BSIZE, bd->d_dma.dma_attr_granular);
628 		bd->d_use_dma = B_TRUE;
629 
630 		if (bd->d_maxxfer &&
631 		    (bd->d_maxxfer != bd->d_dma.dma_attr_maxxfer)) {
632 			cmn_err(CE_WARN,
633 			    "%s: inconsistent maximum transfer size!",
634 			    name);
635 			/* We force it */
636 			bd->d_maxxfer = bd->d_dma.dma_attr_maxxfer;
637 		} else {
638 			bd->d_maxxfer = bd->d_dma.dma_attr_maxxfer;
639 		}
640 	} else {
641 		bd->d_use_dma = B_FALSE;
642 		if (bd->d_maxxfer == 0) {
643 			bd->d_maxxfer = 1024 * 1024;
644 		}
645 	}
646 	bd->d_ops = hdl->h_ops;
647 	bd->d_private = hdl->h_private;
648 	bd->d_blkshift = 9;	/* 512 bytes, to start */
649 
650 	if (bd->d_maxxfer % DEV_BSIZE) {
651 		cmn_err(CE_WARN, "%s: maximum transfer misaligned!", name);
652 		bd->d_maxxfer &= ~(DEV_BSIZE - 1);
653 	}
654 	if (bd->d_maxxfer < DEV_BSIZE) {
655 		cmn_err(CE_WARN, "%s: maximum transfer size too small!", name);
656 		ddi_soft_state_free(bd_state, inst);
657 		return (DDI_FAILURE);
658 	}
659 
660 	bd->d_dip = dip;
661 	bd->d_handle = hdl;
662 	hdl->h_bd = bd;
663 	ddi_set_driver_private(dip, bd);
664 
665 	mutex_init(&bd->d_ksmutex, NULL, MUTEX_DRIVER, NULL);
666 	mutex_init(&bd->d_ocmutex, NULL, MUTEX_DRIVER, NULL);
667 	mutex_init(&bd->d_statemutex, NULL, MUTEX_DRIVER, NULL);
668 	cv_init(&bd->d_statecv, NULL, CV_DRIVER, NULL);
669 
670 	bd->d_cache = kmem_cache_create(kcache, sizeof (bd_xfer_impl_t), 8,
671 	    bd_xfer_ctor, bd_xfer_dtor, NULL, bd, NULL, 0);
672 
673 	bd->d_ksp = kstat_create(ddi_driver_name(dip), inst, NULL, "disk",
674 	    KSTAT_TYPE_IO, 1, KSTAT_FLAG_PERSISTENT);
675 	if (bd->d_ksp != NULL) {
676 		bd->d_ksp->ks_lock = &bd->d_ksmutex;
677 		kstat_install(bd->d_ksp);
678 		bd->d_kiop = bd->d_ksp->ks_data;
679 	} else {
680 		/*
681 		 * Even if we cannot create the kstat, we create a
682 		 * scratch kstat.  The reason for this is to ensure
683 		 * that we can update the kstat all of the time,
684 		 * without adding an extra branch instruction.
685 		 */
686 		bd->d_kiop = kmem_zalloc(sizeof (kstat_io_t), KM_SLEEP);
687 	}
688 
689 	cmlb_alloc_handle(&bd->d_cmlbh);
690 
691 	bd->d_state = DKIO_NONE;
692 
693 	bzero(&drive, sizeof (drive));
694 	/*
695 	 * Default to one queue, parent driver can override.
696 	 */
697 	drive.d_qcount = 1;
698 	bd->d_ops.o_drive_info(bd->d_private, &drive);
699 	bd->d_qcount = drive.d_qcount;
700 	bd->d_removable = drive.d_removable;
701 	bd->d_hotpluggable = drive.d_hotpluggable;
702 
703 	if (drive.d_maxxfer && drive.d_maxxfer < bd->d_maxxfer)
704 		bd->d_maxxfer = drive.d_maxxfer;
705 
706 	bd_create_inquiry_props(dip, &drive);
707 
708 	bd_create_errstats(bd, inst, &drive);
709 	bd_init_errstats(bd, &drive);
710 	bd_update_state(bd);
711 
712 	bd->d_queues = kmem_alloc(sizeof (*bd->d_queues) * bd->d_qcount,
713 	    KM_SLEEP);
714 	for (i = 0; i < bd->d_qcount; i++) {
715 		bd_queue_t *bq = &bd->d_queues[i];
716 
717 		bq->q_qsize = drive.d_qsize;
718 		bq->q_qactive = 0;
719 		mutex_init(&bq->q_iomutex, NULL, MUTEX_DRIVER, NULL);
720 
721 		list_create(&bq->q_waitq, sizeof (bd_xfer_impl_t),
722 		    offsetof(struct bd_xfer_impl, i_linkage));
723 		list_create(&bq->q_runq, sizeof (bd_xfer_impl_t),
724 		    offsetof(struct bd_xfer_impl, i_linkage));
725 	}
726 
727 	rv = cmlb_attach(dip, &bd_tg_ops, DTYPE_DIRECT,
728 	    bd->d_removable, bd->d_hotpluggable,
729 	    /*LINTED: E_BAD_PTR_CAST_ALIGN*/
730 	    *(uint64_t *)drive.d_eui64 != 0 ? DDI_NT_BLOCK_BLKDEV :
731 	    drive.d_lun >= 0 ? DDI_NT_BLOCK_CHAN : DDI_NT_BLOCK,
732 	    CMLB_FAKE_LABEL_ONE_PARTITION, bd->d_cmlbh, 0);
733 	if (rv != 0) {
734 		cmlb_free_handle(&bd->d_cmlbh);
735 		kmem_cache_destroy(bd->d_cache);
736 		mutex_destroy(&bd->d_ksmutex);
737 		mutex_destroy(&bd->d_ocmutex);
738 		mutex_destroy(&bd->d_statemutex);
739 		cv_destroy(&bd->d_statecv);
740 		bd_queues_free(bd);
741 		if (bd->d_ksp != NULL) {
742 			kstat_delete(bd->d_ksp);
743 			bd->d_ksp = NULL;
744 		} else {
745 			kmem_free(bd->d_kiop, sizeof (kstat_io_t));
746 		}
747 		ddi_soft_state_free(bd_state, inst);
748 		return (DDI_FAILURE);
749 	}
750 
751 	if (bd->d_ops.o_devid_init != NULL) {
752 		rv = bd->d_ops.o_devid_init(bd->d_private, dip, &bd->d_devid);
753 		if (rv == DDI_SUCCESS) {
754 			if (ddi_devid_register(dip, bd->d_devid) !=
755 			    DDI_SUCCESS) {
756 				cmn_err(CE_WARN,
757 				    "%s: unable to register devid", name);
758 			}
759 		}
760 	}
761 
762 	/*
763 	 * Add a zero-length attribute to tell the world we support
764 	 * kernel ioctls (for layered drivers).  Also set up properties
765 	 * used by HAL to identify removable media.
766 	 */
767 	(void) ddi_prop_create(DDI_DEV_T_NONE, dip, DDI_PROP_CANSLEEP,
768 	    DDI_KERNEL_IOCTL, NULL, 0);
769 	if (bd->d_removable) {
770 		(void) ddi_prop_create(DDI_DEV_T_NONE, dip, DDI_PROP_CANSLEEP,
771 		    "removable-media", NULL, 0);
772 	}
773 	if (bd->d_hotpluggable) {
774 		(void) ddi_prop_create(DDI_DEV_T_NONE, dip, DDI_PROP_CANSLEEP,
775 		    "hotpluggable", NULL, 0);
776 	}
777 
778 	ddi_report_dev(dip);
779 
780 	return (DDI_SUCCESS);
781 }
782 
783 static int
784 bd_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
785 {
786 	bd_t	*bd;
787 
788 	bd = ddi_get_driver_private(dip);
789 
790 	switch (cmd) {
791 	case DDI_DETACH:
792 		break;
793 	case DDI_SUSPEND:
794 		/* We don't suspend, but our parent does */
795 		return (DDI_SUCCESS);
796 	default:
797 		return (DDI_FAILURE);
798 	}
799 	if (bd->d_ksp != NULL) {
800 		kstat_delete(bd->d_ksp);
801 		bd->d_ksp = NULL;
802 	} else {
803 		kmem_free(bd->d_kiop, sizeof (kstat_io_t));
804 	}
805 
806 	if (bd->d_errstats != NULL) {
807 		bd_fini_errstats(bd);
808 		kstat_delete(bd->d_errstats);
809 		bd->d_errstats = NULL;
810 	} else {
811 		kmem_free(bd->d_kerr, sizeof (struct bd_errstats));
812 		mutex_destroy(&bd->d_errmutex);
813 	}
814 
815 	cmlb_detach(bd->d_cmlbh, 0);
816 	cmlb_free_handle(&bd->d_cmlbh);
817 	if (bd->d_devid)
818 		ddi_devid_free(bd->d_devid);
819 	kmem_cache_destroy(bd->d_cache);
820 	mutex_destroy(&bd->d_ksmutex);
821 	mutex_destroy(&bd->d_ocmutex);
822 	mutex_destroy(&bd->d_statemutex);
823 	cv_destroy(&bd->d_statecv);
824 	bd_queues_free(bd);
825 	ddi_soft_state_free(bd_state, ddi_get_instance(dip));
826 	return (DDI_SUCCESS);
827 }
828 
829 static int
830 bd_xfer_ctor(void *buf, void *arg, int kmflag)
831 {
832 	bd_xfer_impl_t	*xi;
833 	bd_t		*bd = arg;
834 	int		(*dcb)(caddr_t);
835 
836 	if (kmflag == KM_PUSHPAGE || kmflag == KM_SLEEP) {
837 		dcb = DDI_DMA_SLEEP;
838 	} else {
839 		dcb = DDI_DMA_DONTWAIT;
840 	}
841 
842 	xi = buf;
843 	bzero(xi, sizeof (*xi));
844 	xi->i_bd = bd;
845 
846 	if (bd->d_use_dma) {
847 		if (ddi_dma_alloc_handle(bd->d_dip, &bd->d_dma, dcb, NULL,
848 		    &xi->i_dmah) != DDI_SUCCESS) {
849 			return (-1);
850 		}
851 	}
852 
853 	return (0);
854 }
855 
856 static void
857 bd_xfer_dtor(void *buf, void *arg)
858 {
859 	bd_xfer_impl_t	*xi = buf;
860 
861 	_NOTE(ARGUNUSED(arg));
862 
863 	if (xi->i_dmah)
864 		ddi_dma_free_handle(&xi->i_dmah);
865 	xi->i_dmah = NULL;
866 }
867 
868 static bd_xfer_impl_t *
869 bd_xfer_alloc(bd_t *bd, struct buf *bp, int (*func)(void *, bd_xfer_t *),
870     int kmflag)
871 {
872 	bd_xfer_impl_t		*xi;
873 	int			rv = 0;
874 	int			status;
875 	unsigned		dir;
876 	int			(*cb)(caddr_t);
877 	size_t			len;
878 	uint32_t		shift;
879 
880 	if (kmflag == KM_SLEEP) {
881 		cb = DDI_DMA_SLEEP;
882 	} else {
883 		cb = DDI_DMA_DONTWAIT;
884 	}
885 
886 	xi = kmem_cache_alloc(bd->d_cache, kmflag);
887 	if (xi == NULL) {
888 		bioerror(bp, ENOMEM);
889 		return (NULL);
890 	}
891 
892 	ASSERT(bp);
893 
894 	xi->i_bp = bp;
895 	xi->i_func = func;
896 	xi->i_blkno = bp->b_lblkno >> (bd->d_blkshift - DEV_BSHIFT);
897 
898 	if (bp->b_bcount == 0) {
899 		xi->i_len = 0;
900 		xi->i_nblks = 0;
901 		xi->i_kaddr = NULL;
902 		xi->i_resid = 0;
903 		xi->i_num_win = 0;
904 		goto done;
905 	}
906 
907 	if (bp->b_flags & B_READ) {
908 		dir = DDI_DMA_READ;
909 		xi->i_func = bd->d_ops.o_read;
910 	} else {
911 		dir = DDI_DMA_WRITE;
912 		xi->i_func = bd->d_ops.o_write;
913 	}
914 
915 	shift = bd->d_blkshift;
916 	xi->i_blkshift = shift;
917 
918 	if (!bd->d_use_dma) {
919 		bp_mapin(bp);
920 		rv = 0;
921 		xi->i_offset = 0;
922 		xi->i_num_win =
923 		    (bp->b_bcount + (bd->d_maxxfer - 1)) / bd->d_maxxfer;
924 		xi->i_cur_win = 0;
925 		xi->i_len = min(bp->b_bcount, bd->d_maxxfer);
926 		xi->i_nblks = xi->i_len >> shift;
927 		xi->i_kaddr = bp->b_un.b_addr;
928 		xi->i_resid = bp->b_bcount;
929 	} else {
930 
931 		/*
932 		 * We have to use consistent DMA if the address is misaligned.
933 		 */
934 		if (((bp->b_flags & (B_PAGEIO | B_REMAPPED)) != B_PAGEIO) &&
935 		    ((uintptr_t)bp->b_un.b_addr & 0x7)) {
936 			dir |= DDI_DMA_CONSISTENT | DDI_DMA_PARTIAL;
937 		} else {
938 			dir |= DDI_DMA_STREAMING | DDI_DMA_PARTIAL;
939 		}
940 
941 		status = ddi_dma_buf_bind_handle(xi->i_dmah, bp, dir, cb,
942 		    NULL, &xi->i_dmac, &xi->i_ndmac);
943 		switch (status) {
944 		case DDI_DMA_MAPPED:
945 			xi->i_num_win = 1;
946 			xi->i_cur_win = 0;
947 			xi->i_offset = 0;
948 			xi->i_len = bp->b_bcount;
949 			xi->i_nblks = xi->i_len >> shift;
950 			xi->i_resid = bp->b_bcount;
951 			rv = 0;
952 			break;
953 		case DDI_DMA_PARTIAL_MAP:
954 			xi->i_cur_win = 0;
955 
956 			if ((ddi_dma_numwin(xi->i_dmah, &xi->i_num_win) !=
957 			    DDI_SUCCESS) ||
958 			    (ddi_dma_getwin(xi->i_dmah, 0, &xi->i_offset,
959 			    &len, &xi->i_dmac, &xi->i_ndmac) !=
960 			    DDI_SUCCESS) ||
961 			    (P2PHASE(len, (1U << shift)) != 0)) {
962 				(void) ddi_dma_unbind_handle(xi->i_dmah);
963 				rv = EFAULT;
964 				goto done;
965 			}
966 			xi->i_len = len;
967 			xi->i_nblks = xi->i_len >> shift;
968 			xi->i_resid = bp->b_bcount;
969 			rv = 0;
970 			break;
971 		case DDI_DMA_NORESOURCES:
972 			rv = EAGAIN;
973 			goto done;
974 		case DDI_DMA_TOOBIG:
975 			rv = EINVAL;
976 			goto done;
977 		case DDI_DMA_NOMAPPING:
978 		case DDI_DMA_INUSE:
979 		default:
980 			rv = EFAULT;
981 			goto done;
982 		}
983 	}
984 
985 done:
986 	if (rv != 0) {
987 		kmem_cache_free(bd->d_cache, xi);
988 		bioerror(bp, rv);
989 		return (NULL);
990 	}
991 
992 	return (xi);
993 }
994 
995 static void
996 bd_xfer_free(bd_xfer_impl_t *xi)
997 {
998 	if (xi->i_dmah) {
999 		(void) ddi_dma_unbind_handle(xi->i_dmah);
1000 	}
1001 	kmem_cache_free(xi->i_bd->d_cache, xi);
1002 }
1003 
1004 static int
1005 bd_open(dev_t *devp, int flag, int otyp, cred_t *credp)
1006 {
1007 	dev_t		dev = *devp;
1008 	bd_t		*bd;
1009 	minor_t		part;
1010 	minor_t		inst;
1011 	uint64_t	mask;
1012 	boolean_t	ndelay;
1013 	int		rv;
1014 	diskaddr_t	nblks;
1015 	diskaddr_t	lba;
1016 
1017 	_NOTE(ARGUNUSED(credp));
1018 
1019 	part = BDPART(dev);
1020 	inst = BDINST(dev);
1021 
1022 	if (otyp >= OTYPCNT)
1023 		return (EINVAL);
1024 
1025 	ndelay = (flag & (FNDELAY | FNONBLOCK)) ? B_TRUE : B_FALSE;
1026 
1027 	/*
1028 	 * Block any DR events from changing the set of registered
1029 	 * devices while we function.
1030 	 */
1031 	rw_enter(&bd_lock, RW_READER);
1032 	if ((bd = ddi_get_soft_state(bd_state, inst)) == NULL) {
1033 		rw_exit(&bd_lock);
1034 		return (ENXIO);
1035 	}
1036 
1037 	mutex_enter(&bd->d_ocmutex);
1038 
1039 	ASSERT(part < 64);
1040 	mask = (1U << part);
1041 
1042 	bd_update_state(bd);
1043 
1044 	if (cmlb_validate(bd->d_cmlbh, 0, 0) != 0) {
1045 
1046 		/* non-blocking opens are allowed to succeed */
1047 		if (!ndelay) {
1048 			rv = ENXIO;
1049 			goto done;
1050 		}
1051 	} else if (cmlb_partinfo(bd->d_cmlbh, part, &nblks, &lba,
1052 	    NULL, NULL, 0) == 0) {
1053 
1054 		/*
1055 		 * We read the partinfo, verify valid ranges.  If the
1056 		 * partition is invalid, and we aren't blocking or
1057 		 * doing a raw access, then fail. (Non-blocking and
1058 		 * raw accesses can still succeed to allow a disk with
1059 		 * bad partition data to opened by format and fdisk.)
1060 		 */
1061 		if ((!nblks) && ((!ndelay) || (otyp != OTYP_CHR))) {
1062 			rv = ENXIO;
1063 			goto done;
1064 		}
1065 	} else if (!ndelay) {
1066 		/*
1067 		 * cmlb_partinfo failed -- invalid partition or no
1068 		 * disk label.
1069 		 */
1070 		rv = ENXIO;
1071 		goto done;
1072 	}
1073 
1074 	if ((flag & FWRITE) && bd->d_rdonly) {
1075 		rv = EROFS;
1076 		goto done;
1077 	}
1078 
1079 	if ((bd->d_open_excl) & (mask)) {
1080 		rv = EBUSY;
1081 		goto done;
1082 	}
1083 	if (flag & FEXCL) {
1084 		if (bd->d_open_lyr[part]) {
1085 			rv = EBUSY;
1086 			goto done;
1087 		}
1088 		for (int i = 0; i < OTYP_LYR; i++) {
1089 			if (bd->d_open_reg[i] & mask) {
1090 				rv = EBUSY;
1091 				goto done;
1092 			}
1093 		}
1094 	}
1095 
1096 	if (otyp == OTYP_LYR) {
1097 		bd->d_open_lyr[part]++;
1098 	} else {
1099 		bd->d_open_reg[otyp] |= mask;
1100 	}
1101 	if (flag & FEXCL) {
1102 		bd->d_open_excl |= mask;
1103 	}
1104 
1105 	rv = 0;
1106 done:
1107 	mutex_exit(&bd->d_ocmutex);
1108 	rw_exit(&bd_lock);
1109 
1110 	return (rv);
1111 }
1112 
1113 static int
1114 bd_close(dev_t dev, int flag, int otyp, cred_t *credp)
1115 {
1116 	bd_t		*bd;
1117 	minor_t		inst;
1118 	minor_t		part;
1119 	uint64_t	mask;
1120 	boolean_t	last = B_TRUE;
1121 
1122 	_NOTE(ARGUNUSED(flag));
1123 	_NOTE(ARGUNUSED(credp));
1124 
1125 	part = BDPART(dev);
1126 	inst = BDINST(dev);
1127 
1128 	ASSERT(part < 64);
1129 	mask = (1U << part);
1130 
1131 	rw_enter(&bd_lock, RW_READER);
1132 
1133 	if ((bd = ddi_get_soft_state(bd_state, inst)) == NULL) {
1134 		rw_exit(&bd_lock);
1135 		return (ENXIO);
1136 	}
1137 
1138 	mutex_enter(&bd->d_ocmutex);
1139 	if (bd->d_open_excl & mask) {
1140 		bd->d_open_excl &= ~mask;
1141 	}
1142 	if (otyp == OTYP_LYR) {
1143 		bd->d_open_lyr[part]--;
1144 	} else {
1145 		bd->d_open_reg[otyp] &= ~mask;
1146 	}
1147 	for (int i = 0; i < 64; i++) {
1148 		if (bd->d_open_lyr[part]) {
1149 			last = B_FALSE;
1150 		}
1151 	}
1152 	for (int i = 0; last && (i < OTYP_LYR); i++) {
1153 		if (bd->d_open_reg[i]) {
1154 			last = B_FALSE;
1155 		}
1156 	}
1157 	mutex_exit(&bd->d_ocmutex);
1158 
1159 	if (last) {
1160 		cmlb_invalidate(bd->d_cmlbh, 0);
1161 	}
1162 	rw_exit(&bd_lock);
1163 
1164 	return (0);
1165 }
1166 
1167 static int
1168 bd_dump(dev_t dev, caddr_t caddr, daddr_t blkno, int nblk)
1169 {
1170 	minor_t		inst;
1171 	minor_t		part;
1172 	diskaddr_t	pstart;
1173 	diskaddr_t	psize;
1174 	bd_t		*bd;
1175 	bd_xfer_impl_t	*xi;
1176 	buf_t		*bp;
1177 	int		rv;
1178 	uint32_t	shift;
1179 	daddr_t		d_blkno;
1180 	int	d_nblk;
1181 
1182 	rw_enter(&bd_lock, RW_READER);
1183 
1184 	part = BDPART(dev);
1185 	inst = BDINST(dev);
1186 
1187 	if ((bd = ddi_get_soft_state(bd_state, inst)) == NULL) {
1188 		rw_exit(&bd_lock);
1189 		return (ENXIO);
1190 	}
1191 	shift = bd->d_blkshift;
1192 	d_blkno = blkno >> (shift - DEV_BSHIFT);
1193 	d_nblk = nblk >> (shift - DEV_BSHIFT);
1194 	/*
1195 	 * do cmlb, but do it synchronously unless we already have the
1196 	 * partition (which we probably should.)
1197 	 */
1198 	if (cmlb_partinfo(bd->d_cmlbh, part, &psize, &pstart, NULL, NULL,
1199 	    (void *)1)) {
1200 		rw_exit(&bd_lock);
1201 		return (ENXIO);
1202 	}
1203 
1204 	if ((d_blkno + d_nblk) > psize) {
1205 		rw_exit(&bd_lock);
1206 		return (EINVAL);
1207 	}
1208 	bp = getrbuf(KM_NOSLEEP);
1209 	if (bp == NULL) {
1210 		rw_exit(&bd_lock);
1211 		return (ENOMEM);
1212 	}
1213 
1214 	bp->b_bcount = nblk << DEV_BSHIFT;
1215 	bp->b_resid = bp->b_bcount;
1216 	bp->b_lblkno = blkno;
1217 	bp->b_un.b_addr = caddr;
1218 
1219 	xi = bd_xfer_alloc(bd, bp,  bd->d_ops.o_write, KM_NOSLEEP);
1220 	if (xi == NULL) {
1221 		rw_exit(&bd_lock);
1222 		freerbuf(bp);
1223 		return (ENOMEM);
1224 	}
1225 	xi->i_blkno = d_blkno + pstart;
1226 	xi->i_flags = BD_XFER_POLL;
1227 	bd_submit(bd, xi);
1228 	rw_exit(&bd_lock);
1229 
1230 	/*
1231 	 * Generally, we should have run this entirely synchronously
1232 	 * at this point and the biowait call should be a no-op.  If
1233 	 * it didn't happen this way, it's a bug in the underlying
1234 	 * driver not honoring BD_XFER_POLL.
1235 	 */
1236 	(void) biowait(bp);
1237 	rv = geterror(bp);
1238 	freerbuf(bp);
1239 	return (rv);
1240 }
1241 
1242 void
1243 bd_minphys(struct buf *bp)
1244 {
1245 	minor_t inst;
1246 	bd_t	*bd;
1247 	inst = BDINST(bp->b_edev);
1248 
1249 	bd = ddi_get_soft_state(bd_state, inst);
1250 
1251 	/*
1252 	 * In a non-debug kernel, bd_strategy will catch !bd as
1253 	 * well, and will fail nicely.
1254 	 */
1255 	ASSERT(bd);
1256 
1257 	if (bp->b_bcount > bd->d_maxxfer)
1258 		bp->b_bcount = bd->d_maxxfer;
1259 }
1260 
1261 static int
1262 bd_check_uio(dev_t dev, struct uio *uio)
1263 {
1264 	bd_t		*bd;
1265 	uint32_t	shift;
1266 
1267 	if ((bd = ddi_get_soft_state(bd_state, BDINST(dev))) == NULL) {
1268 		return (ENXIO);
1269 	}
1270 
1271 	shift = bd->d_blkshift;
1272 	if ((P2PHASE(uio->uio_loffset, (1U << shift)) != 0) ||
1273 	    (P2PHASE(uio->uio_iov->iov_len, (1U << shift)) != 0)) {
1274 		return (EINVAL);
1275 	}
1276 
1277 	return (0);
1278 }
1279 
1280 static int
1281 bd_read(dev_t dev, struct uio *uio, cred_t *credp)
1282 {
1283 	_NOTE(ARGUNUSED(credp));
1284 	int	ret = bd_check_uio(dev, uio);
1285 	if (ret != 0) {
1286 		return (ret);
1287 	}
1288 	return (physio(bd_strategy, NULL, dev, B_READ, bd_minphys, uio));
1289 }
1290 
1291 static int
1292 bd_write(dev_t dev, struct uio *uio, cred_t *credp)
1293 {
1294 	_NOTE(ARGUNUSED(credp));
1295 	int	ret = bd_check_uio(dev, uio);
1296 	if (ret != 0) {
1297 		return (ret);
1298 	}
1299 	return (physio(bd_strategy, NULL, dev, B_WRITE, bd_minphys, uio));
1300 }
1301 
1302 static int
1303 bd_aread(dev_t dev, struct aio_req *aio, cred_t *credp)
1304 {
1305 	_NOTE(ARGUNUSED(credp));
1306 	int	ret = bd_check_uio(dev, aio->aio_uio);
1307 	if (ret != 0) {
1308 		return (ret);
1309 	}
1310 	return (aphysio(bd_strategy, anocancel, dev, B_READ, bd_minphys, aio));
1311 }
1312 
1313 static int
1314 bd_awrite(dev_t dev, struct aio_req *aio, cred_t *credp)
1315 {
1316 	_NOTE(ARGUNUSED(credp));
1317 	int	ret = bd_check_uio(dev, aio->aio_uio);
1318 	if (ret != 0) {
1319 		return (ret);
1320 	}
1321 	return (aphysio(bd_strategy, anocancel, dev, B_WRITE, bd_minphys, aio));
1322 }
1323 
1324 static int
1325 bd_strategy(struct buf *bp)
1326 {
1327 	minor_t		inst;
1328 	minor_t		part;
1329 	bd_t		*bd;
1330 	diskaddr_t	p_lba;
1331 	diskaddr_t	p_nblks;
1332 	diskaddr_t	b_nblks;
1333 	bd_xfer_impl_t	*xi;
1334 	uint32_t	shift;
1335 	int		(*func)(void *, bd_xfer_t *);
1336 	diskaddr_t	lblkno;
1337 
1338 	part = BDPART(bp->b_edev);
1339 	inst = BDINST(bp->b_edev);
1340 
1341 	ASSERT(bp);
1342 
1343 	bp->b_resid = bp->b_bcount;
1344 
1345 	if ((bd = ddi_get_soft_state(bd_state, inst)) == NULL) {
1346 		bioerror(bp, ENXIO);
1347 		biodone(bp);
1348 		return (0);
1349 	}
1350 
1351 	if (cmlb_partinfo(bd->d_cmlbh, part, &p_nblks, &p_lba,
1352 	    NULL, NULL, 0)) {
1353 		bioerror(bp, ENXIO);
1354 		biodone(bp);
1355 		return (0);
1356 	}
1357 
1358 	shift = bd->d_blkshift;
1359 	lblkno = bp->b_lblkno >> (shift - DEV_BSHIFT);
1360 	if ((P2PHASE(bp->b_lblkno, (1U << (shift - DEV_BSHIFT))) != 0) ||
1361 	    (P2PHASE(bp->b_bcount, (1U << shift)) != 0) ||
1362 	    (lblkno > p_nblks)) {
1363 		bioerror(bp, EINVAL);
1364 		biodone(bp);
1365 		return (0);
1366 	}
1367 	b_nblks = bp->b_bcount >> shift;
1368 	if ((lblkno == p_nblks) || (bp->b_bcount == 0)) {
1369 		biodone(bp);
1370 		return (0);
1371 	}
1372 
1373 	if ((b_nblks + lblkno) > p_nblks) {
1374 		bp->b_resid = ((lblkno + b_nblks - p_nblks) << shift);
1375 		bp->b_bcount -= bp->b_resid;
1376 	} else {
1377 		bp->b_resid = 0;
1378 	}
1379 	func = (bp->b_flags & B_READ) ? bd->d_ops.o_read : bd->d_ops.o_write;
1380 
1381 	xi = bd_xfer_alloc(bd, bp, func, KM_NOSLEEP);
1382 	if (xi == NULL) {
1383 		xi = bd_xfer_alloc(bd, bp, func, KM_PUSHPAGE);
1384 	}
1385 	if (xi == NULL) {
1386 		/* bd_request_alloc will have done bioerror */
1387 		biodone(bp);
1388 		return (0);
1389 	}
1390 	xi->i_blkno = lblkno + p_lba;
1391 
1392 	bd_submit(bd, xi);
1393 
1394 	return (0);
1395 }
1396 
1397 static int
1398 bd_ioctl(dev_t dev, int cmd, intptr_t arg, int flag, cred_t *credp, int *rvalp)
1399 {
1400 	minor_t		inst;
1401 	uint16_t	part;
1402 	bd_t		*bd;
1403 	void		*ptr = (void *)arg;
1404 	int		rv;
1405 
1406 	part = BDPART(dev);
1407 	inst = BDINST(dev);
1408 
1409 	if ((bd = ddi_get_soft_state(bd_state, inst)) == NULL) {
1410 		return (ENXIO);
1411 	}
1412 
1413 	rv = cmlb_ioctl(bd->d_cmlbh, dev, cmd, arg, flag, credp, rvalp, 0);
1414 	if (rv != ENOTTY)
1415 		return (rv);
1416 
1417 	if (rvalp != NULL) {
1418 		/* the return value of the ioctl is 0 by default */
1419 		*rvalp = 0;
1420 	}
1421 
1422 	switch (cmd) {
1423 	case DKIOCGMEDIAINFO: {
1424 		struct dk_minfo minfo;
1425 
1426 		/* make sure our state information is current */
1427 		bd_update_state(bd);
1428 		bzero(&minfo, sizeof (minfo));
1429 		minfo.dki_media_type = DK_FIXED_DISK;
1430 		minfo.dki_lbsize = (1U << bd->d_blkshift);
1431 		minfo.dki_capacity = bd->d_numblks;
1432 		if (ddi_copyout(&minfo, ptr, sizeof (minfo), flag)) {
1433 			return (EFAULT);
1434 		}
1435 		return (0);
1436 	}
1437 	case DKIOCGMEDIAINFOEXT: {
1438 		struct dk_minfo_ext miext;
1439 
1440 		/* make sure our state information is current */
1441 		bd_update_state(bd);
1442 		bzero(&miext, sizeof (miext));
1443 		miext.dki_media_type = DK_FIXED_DISK;
1444 		miext.dki_lbsize = (1U << bd->d_blkshift);
1445 		miext.dki_pbsize = (1U << bd->d_pblkshift);
1446 		miext.dki_capacity = bd->d_numblks;
1447 		if (ddi_copyout(&miext, ptr, sizeof (miext), flag)) {
1448 			return (EFAULT);
1449 		}
1450 		return (0);
1451 	}
1452 	case DKIOCINFO: {
1453 		struct dk_cinfo cinfo;
1454 		bzero(&cinfo, sizeof (cinfo));
1455 		cinfo.dki_ctype = DKC_BLKDEV;
1456 		cinfo.dki_cnum = ddi_get_instance(ddi_get_parent(bd->d_dip));
1457 		(void) snprintf(cinfo.dki_cname, sizeof (cinfo.dki_cname),
1458 		    "%s", ddi_driver_name(ddi_get_parent(bd->d_dip)));
1459 		(void) snprintf(cinfo.dki_dname, sizeof (cinfo.dki_dname),
1460 		    "%s", ddi_driver_name(bd->d_dip));
1461 		cinfo.dki_unit = inst;
1462 		cinfo.dki_flags = DKI_FMTVOL;
1463 		cinfo.dki_partition = part;
1464 		cinfo.dki_maxtransfer = bd->d_maxxfer / DEV_BSIZE;
1465 		cinfo.dki_addr = 0;
1466 		cinfo.dki_slave = 0;
1467 		cinfo.dki_space = 0;
1468 		cinfo.dki_prio = 0;
1469 		cinfo.dki_vec = 0;
1470 		if (ddi_copyout(&cinfo, ptr, sizeof (cinfo), flag)) {
1471 			return (EFAULT);
1472 		}
1473 		return (0);
1474 	}
1475 	case DKIOCREMOVABLE: {
1476 		int i;
1477 		i = bd->d_removable ? 1 : 0;
1478 		if (ddi_copyout(&i, ptr, sizeof (i), flag)) {
1479 			return (EFAULT);
1480 		}
1481 		return (0);
1482 	}
1483 	case DKIOCHOTPLUGGABLE: {
1484 		int i;
1485 		i = bd->d_hotpluggable ? 1 : 0;
1486 		if (ddi_copyout(&i, ptr, sizeof (i), flag)) {
1487 			return (EFAULT);
1488 		}
1489 		return (0);
1490 	}
1491 	case DKIOCREADONLY: {
1492 		int i;
1493 		i = bd->d_rdonly ? 1 : 0;
1494 		if (ddi_copyout(&i, ptr, sizeof (i), flag)) {
1495 			return (EFAULT);
1496 		}
1497 		return (0);
1498 	}
1499 	case DKIOCSOLIDSTATE: {
1500 		int i;
1501 		i = bd->d_ssd ? 1 : 0;
1502 		if (ddi_copyout(&i, ptr, sizeof (i), flag)) {
1503 			return (EFAULT);
1504 		}
1505 		return (0);
1506 	}
1507 	case DKIOCSTATE: {
1508 		enum dkio_state	state;
1509 		if (ddi_copyin(ptr, &state, sizeof (state), flag)) {
1510 			return (EFAULT);
1511 		}
1512 		if ((rv = bd_check_state(bd, &state)) != 0) {
1513 			return (rv);
1514 		}
1515 		if (ddi_copyout(&state, ptr, sizeof (state), flag)) {
1516 			return (EFAULT);
1517 		}
1518 		return (0);
1519 	}
1520 	case DKIOCFLUSHWRITECACHE: {
1521 		struct dk_callback *dkc = NULL;
1522 
1523 		if (flag & FKIOCTL)
1524 			dkc = (void *)arg;
1525 
1526 		rv = bd_flush_write_cache(bd, dkc);
1527 		return (rv);
1528 	}
1529 
1530 	default:
1531 		break;
1532 
1533 	}
1534 	return (ENOTTY);
1535 }
1536 
1537 static int
1538 bd_prop_op(dev_t dev, dev_info_t *dip, ddi_prop_op_t prop_op, int mod_flags,
1539     char *name, caddr_t valuep, int *lengthp)
1540 {
1541 	bd_t	*bd;
1542 
1543 	bd = ddi_get_soft_state(bd_state, ddi_get_instance(dip));
1544 	if (bd == NULL)
1545 		return (ddi_prop_op(dev, dip, prop_op, mod_flags,
1546 		    name, valuep, lengthp));
1547 
1548 	return (cmlb_prop_op(bd->d_cmlbh, dev, dip, prop_op, mod_flags, name,
1549 	    valuep, lengthp, BDPART(dev), 0));
1550 }
1551 
1552 
1553 static int
1554 bd_tg_rdwr(dev_info_t *dip, uchar_t cmd, void *bufaddr, diskaddr_t start,
1555     size_t length, void *tg_cookie)
1556 {
1557 	bd_t		*bd;
1558 	buf_t		*bp;
1559 	bd_xfer_impl_t	*xi;
1560 	int		rv;
1561 	int		(*func)(void *, bd_xfer_t *);
1562 	int		kmflag;
1563 
1564 	/*
1565 	 * If we are running in polled mode (such as during dump(9e)
1566 	 * execution), then we cannot sleep for kernel allocations.
1567 	 */
1568 	kmflag = tg_cookie ? KM_NOSLEEP : KM_SLEEP;
1569 
1570 	bd = ddi_get_soft_state(bd_state, ddi_get_instance(dip));
1571 
1572 	if (P2PHASE(length, (1U << bd->d_blkshift)) != 0) {
1573 		/* We can only transfer whole blocks at a time! */
1574 		return (EINVAL);
1575 	}
1576 
1577 	if ((bp = getrbuf(kmflag)) == NULL) {
1578 		return (ENOMEM);
1579 	}
1580 
1581 	switch (cmd) {
1582 	case TG_READ:
1583 		bp->b_flags = B_READ;
1584 		func = bd->d_ops.o_read;
1585 		break;
1586 	case TG_WRITE:
1587 		bp->b_flags = B_WRITE;
1588 		func = bd->d_ops.o_write;
1589 		break;
1590 	default:
1591 		freerbuf(bp);
1592 		return (EINVAL);
1593 	}
1594 
1595 	bp->b_un.b_addr = bufaddr;
1596 	bp->b_bcount = length;
1597 	xi = bd_xfer_alloc(bd, bp, func, kmflag);
1598 	if (xi == NULL) {
1599 		rv = geterror(bp);
1600 		freerbuf(bp);
1601 		return (rv);
1602 	}
1603 	xi->i_flags = tg_cookie ? BD_XFER_POLL : 0;
1604 	xi->i_blkno = start;
1605 	bd_submit(bd, xi);
1606 	(void) biowait(bp);
1607 	rv = geterror(bp);
1608 	freerbuf(bp);
1609 
1610 	return (rv);
1611 }
1612 
1613 static int
1614 bd_tg_getinfo(dev_info_t *dip, int cmd, void *arg, void *tg_cookie)
1615 {
1616 	bd_t		*bd;
1617 
1618 	_NOTE(ARGUNUSED(tg_cookie));
1619 	bd = ddi_get_soft_state(bd_state, ddi_get_instance(dip));
1620 
1621 	switch (cmd) {
1622 	case TG_GETPHYGEOM:
1623 	case TG_GETVIRTGEOM:
1624 		/*
1625 		 * We don't have any "geometry" as such, let cmlb
1626 		 * fabricate something.
1627 		 */
1628 		return (ENOTTY);
1629 
1630 	case TG_GETCAPACITY:
1631 		bd_update_state(bd);
1632 		*(diskaddr_t *)arg = bd->d_numblks;
1633 		return (0);
1634 
1635 	case TG_GETBLOCKSIZE:
1636 		*(uint32_t *)arg = (1U << bd->d_blkshift);
1637 		return (0);
1638 
1639 	case TG_GETATTR:
1640 		/*
1641 		 * It turns out that cmlb really doesn't do much for
1642 		 * non-writable media, but lets make the information
1643 		 * available for it in case it does more in the
1644 		 * future.  (The value is currently used for
1645 		 * triggering special behavior for CD-ROMs.)
1646 		 */
1647 		bd_update_state(bd);
1648 		((tg_attribute_t *)arg)->media_is_writable =
1649 		    bd->d_rdonly ? B_FALSE : B_TRUE;
1650 		((tg_attribute_t *)arg)->media_is_solid_state = bd->d_ssd;
1651 		((tg_attribute_t *)arg)->media_is_rotational = B_FALSE;
1652 		return (0);
1653 
1654 	default:
1655 		return (EINVAL);
1656 	}
1657 }
1658 
1659 
1660 static void
1661 bd_sched(bd_t *bd, bd_queue_t *bq)
1662 {
1663 	bd_xfer_impl_t	*xi;
1664 	struct buf	*bp;
1665 	int		rv;
1666 
1667 	mutex_enter(&bq->q_iomutex);
1668 
1669 	while ((bq->q_qactive < bq->q_qsize) &&
1670 	    ((xi = list_remove_head(&bq->q_waitq)) != NULL)) {
1671 		bq->q_qactive++;
1672 		list_insert_tail(&bq->q_runq, xi);
1673 
1674 		/*
1675 		 * Submit the job to the driver.  We drop the I/O mutex
1676 		 * so that we can deal with the case where the driver
1677 		 * completion routine calls back into us synchronously.
1678 		 */
1679 
1680 		mutex_exit(&bq->q_iomutex);
1681 
1682 		mutex_enter(&bd->d_ksmutex);
1683 		kstat_waitq_to_runq(bd->d_kiop);
1684 		mutex_exit(&bd->d_ksmutex);
1685 
1686 		rv = xi->i_func(bd->d_private, &xi->i_public);
1687 		if (rv != 0) {
1688 			bp = xi->i_bp;
1689 			bioerror(bp, rv);
1690 			biodone(bp);
1691 
1692 			atomic_inc_32(&bd->d_kerr->bd_transerrs.value.ui32);
1693 			mutex_enter(&bd->d_ksmutex);
1694 			kstat_runq_exit(bd->d_kiop);
1695 			mutex_exit(&bd->d_ksmutex);
1696 
1697 			mutex_enter(&bq->q_iomutex);
1698 			bq->q_qactive--;
1699 			list_remove(&bq->q_runq, xi);
1700 			bd_xfer_free(xi);
1701 		} else {
1702 			mutex_enter(&bq->q_iomutex);
1703 		}
1704 	}
1705 
1706 	mutex_exit(&bq->q_iomutex);
1707 }
1708 
1709 static void
1710 bd_submit(bd_t *bd, bd_xfer_impl_t *xi)
1711 {
1712 	uint64_t	nv = atomic_inc_64_nv(&bd->d_io_counter);
1713 	unsigned	q = nv % bd->d_qcount;
1714 	bd_queue_t	*bq = &bd->d_queues[q];
1715 
1716 	xi->i_bq = bq;
1717 	xi->i_qnum = q;
1718 
1719 	mutex_enter(&bq->q_iomutex);
1720 	list_insert_tail(&bq->q_waitq, xi);
1721 	mutex_exit(&bq->q_iomutex);
1722 
1723 	mutex_enter(&bd->d_ksmutex);
1724 	kstat_waitq_enter(bd->d_kiop);
1725 	mutex_exit(&bd->d_ksmutex);
1726 
1727 	bd_sched(bd, bq);
1728 }
1729 
1730 static void
1731 bd_runq_exit(bd_xfer_impl_t *xi, int err)
1732 {
1733 	bd_t		*bd = xi->i_bd;
1734 	buf_t		*bp = xi->i_bp;
1735 	bd_queue_t	*bq = xi->i_bq;
1736 
1737 	mutex_enter(&bq->q_iomutex);
1738 	bq->q_qactive--;
1739 	list_remove(&bq->q_runq, xi);
1740 	mutex_exit(&bq->q_iomutex);
1741 
1742 	mutex_enter(&bd->d_ksmutex);
1743 	kstat_runq_exit(bd->d_kiop);
1744 	mutex_exit(&bd->d_ksmutex);
1745 
1746 	if (err == 0) {
1747 		if (bp->b_flags & B_READ) {
1748 			atomic_inc_uint(&bd->d_kiop->reads);
1749 			atomic_add_64((uint64_t *)&bd->d_kiop->nread,
1750 			    bp->b_bcount - xi->i_resid);
1751 		} else {
1752 			atomic_inc_uint(&bd->d_kiop->writes);
1753 			atomic_add_64((uint64_t *)&bd->d_kiop->nwritten,
1754 			    bp->b_bcount - xi->i_resid);
1755 		}
1756 	}
1757 	bd_sched(bd, bq);
1758 }
1759 
1760 static void
1761 bd_update_state(bd_t *bd)
1762 {
1763 	enum	dkio_state	state = DKIO_INSERTED;
1764 	boolean_t		docmlb = B_FALSE;
1765 	bd_media_t		media;
1766 
1767 	bzero(&media, sizeof (media));
1768 
1769 	mutex_enter(&bd->d_statemutex);
1770 	if (bd->d_ops.o_media_info(bd->d_private, &media) != 0) {
1771 		bd->d_numblks = 0;
1772 		state = DKIO_EJECTED;
1773 		goto done;
1774 	}
1775 
1776 	if ((media.m_blksize < 512) ||
1777 	    (!ISP2(media.m_blksize)) ||
1778 	    (P2PHASE(bd->d_maxxfer, media.m_blksize))) {
1779 		cmn_err(CE_WARN, "%s%d: Invalid media block size (%d)",
1780 		    ddi_driver_name(bd->d_dip), ddi_get_instance(bd->d_dip),
1781 		    media.m_blksize);
1782 		/*
1783 		 * We can't use the media, treat it as not present.
1784 		 */
1785 		state = DKIO_EJECTED;
1786 		bd->d_numblks = 0;
1787 		goto done;
1788 	}
1789 
1790 	if (((1U << bd->d_blkshift) != media.m_blksize) ||
1791 	    (bd->d_numblks != media.m_nblks)) {
1792 		/* Device size changed */
1793 		docmlb = B_TRUE;
1794 	}
1795 
1796 	bd->d_blkshift = ddi_ffs(media.m_blksize) - 1;
1797 	bd->d_pblkshift = bd->d_blkshift;
1798 	bd->d_numblks = media.m_nblks;
1799 	bd->d_rdonly = media.m_readonly;
1800 	bd->d_ssd = media.m_solidstate;
1801 
1802 	/*
1803 	 * Only use the supplied physical block size if it is non-zero,
1804 	 * greater or equal to the block size, and a power of 2. Ignore it
1805 	 * if not, it's just informational and we can still use the media.
1806 	 */
1807 	if ((media.m_pblksize != 0) &&
1808 	    (media.m_pblksize >= media.m_blksize) &&
1809 	    (ISP2(media.m_pblksize)))
1810 		bd->d_pblkshift = ddi_ffs(media.m_pblksize) - 1;
1811 
1812 done:
1813 	if (state != bd->d_state) {
1814 		bd->d_state = state;
1815 		cv_broadcast(&bd->d_statecv);
1816 		docmlb = B_TRUE;
1817 	}
1818 	mutex_exit(&bd->d_statemutex);
1819 
1820 	bd->d_kerr->bd_capacity.value.ui64 = bd->d_numblks << bd->d_blkshift;
1821 
1822 	if (docmlb) {
1823 		if (state == DKIO_INSERTED) {
1824 			(void) cmlb_validate(bd->d_cmlbh, 0, 0);
1825 		} else {
1826 			cmlb_invalidate(bd->d_cmlbh, 0);
1827 		}
1828 	}
1829 }
1830 
1831 static int
1832 bd_check_state(bd_t *bd, enum dkio_state *state)
1833 {
1834 	clock_t		when;
1835 
1836 	for (;;) {
1837 
1838 		bd_update_state(bd);
1839 
1840 		mutex_enter(&bd->d_statemutex);
1841 
1842 		if (bd->d_state != *state) {
1843 			*state = bd->d_state;
1844 			mutex_exit(&bd->d_statemutex);
1845 			break;
1846 		}
1847 
1848 		when = drv_usectohz(1000000);
1849 		if (cv_reltimedwait_sig(&bd->d_statecv, &bd->d_statemutex,
1850 		    when, TR_CLOCK_TICK) == 0) {
1851 			mutex_exit(&bd->d_statemutex);
1852 			return (EINTR);
1853 		}
1854 
1855 		mutex_exit(&bd->d_statemutex);
1856 	}
1857 
1858 	return (0);
1859 }
1860 
1861 static int
1862 bd_flush_write_cache_done(struct buf *bp)
1863 {
1864 	struct dk_callback *dc = (void *)bp->b_private;
1865 
1866 	(*dc->dkc_callback)(dc->dkc_cookie, geterror(bp));
1867 	kmem_free(dc, sizeof (*dc));
1868 	freerbuf(bp);
1869 	return (0);
1870 }
1871 
1872 static int
1873 bd_flush_write_cache(bd_t *bd, struct dk_callback *dkc)
1874 {
1875 	buf_t			*bp;
1876 	struct dk_callback	*dc;
1877 	bd_xfer_impl_t		*xi;
1878 	int			rv;
1879 
1880 	if (bd->d_ops.o_sync_cache == NULL) {
1881 		return (ENOTSUP);
1882 	}
1883 	if ((bp = getrbuf(KM_SLEEP)) == NULL) {
1884 		return (ENOMEM);
1885 	}
1886 	bp->b_resid = 0;
1887 	bp->b_bcount = 0;
1888 
1889 	xi = bd_xfer_alloc(bd, bp, bd->d_ops.o_sync_cache, KM_SLEEP);
1890 	if (xi == NULL) {
1891 		rv = geterror(bp);
1892 		freerbuf(bp);
1893 		return (rv);
1894 	}
1895 
1896 	/* Make an asynchronous flush, but only if there is a callback */
1897 	if (dkc != NULL && dkc->dkc_callback != NULL) {
1898 		/* Make a private copy of the callback structure */
1899 		dc = kmem_alloc(sizeof (*dc), KM_SLEEP);
1900 		*dc = *dkc;
1901 		bp->b_private = dc;
1902 		bp->b_iodone = bd_flush_write_cache_done;
1903 
1904 		bd_submit(bd, xi);
1905 		return (0);
1906 	}
1907 
1908 	/* In case there is no callback, perform a synchronous flush */
1909 	bd_submit(bd, xi);
1910 	(void) biowait(bp);
1911 	rv = geterror(bp);
1912 	freerbuf(bp);
1913 
1914 	return (rv);
1915 }
1916 
1917 /*
1918  * Nexus support.
1919  */
1920 int
1921 bd_bus_ctl(dev_info_t *dip, dev_info_t *rdip, ddi_ctl_enum_t ctlop,
1922     void *arg, void *result)
1923 {
1924 	bd_handle_t	hdl;
1925 
1926 	switch (ctlop) {
1927 	case DDI_CTLOPS_REPORTDEV:
1928 		cmn_err(CE_CONT, "?Block device: %s@%s, %s%d\n",
1929 		    ddi_node_name(rdip), ddi_get_name_addr(rdip),
1930 		    ddi_driver_name(rdip), ddi_get_instance(rdip));
1931 		return (DDI_SUCCESS);
1932 
1933 	case DDI_CTLOPS_INITCHILD:
1934 		hdl = ddi_get_parent_data((dev_info_t *)arg);
1935 		if (hdl == NULL) {
1936 			return (DDI_NOT_WELL_FORMED);
1937 		}
1938 		ddi_set_name_addr((dev_info_t *)arg, hdl->h_addr);
1939 		return (DDI_SUCCESS);
1940 
1941 	case DDI_CTLOPS_UNINITCHILD:
1942 		ddi_set_name_addr((dev_info_t *)arg, NULL);
1943 		ndi_prop_remove_all((dev_info_t *)arg);
1944 		return (DDI_SUCCESS);
1945 
1946 	default:
1947 		return (ddi_ctlops(dip, rdip, ctlop, arg, result));
1948 	}
1949 }
1950 
1951 /*
1952  * Functions for device drivers.
1953  */
1954 bd_handle_t
1955 bd_alloc_handle(void *private, bd_ops_t *ops, ddi_dma_attr_t *dma, int kmflag)
1956 {
1957 	bd_handle_t	hdl;
1958 
1959 	/*
1960 	 * There is full compatability between the version 0 API and the
1961 	 * current version.
1962 	 */
1963 	switch (ops->o_version) {
1964 	case BD_OPS_VERSION_0:
1965 	case BD_OPS_CURRENT_VERSION:
1966 		break;
1967 
1968 	default:
1969 		return (NULL);
1970 	}
1971 
1972 	hdl = kmem_zalloc(sizeof (*hdl), kmflag);
1973 	if (hdl != NULL) {
1974 		hdl->h_ops = *ops;
1975 		hdl->h_dma = dma;
1976 		hdl->h_private = private;
1977 	}
1978 
1979 	return (hdl);
1980 }
1981 
1982 void
1983 bd_free_handle(bd_handle_t hdl)
1984 {
1985 	kmem_free(hdl, sizeof (*hdl));
1986 }
1987 
1988 int
1989 bd_attach_handle(dev_info_t *dip, bd_handle_t hdl)
1990 {
1991 	dev_info_t	*child;
1992 	bd_drive_t	drive = { 0 };
1993 
1994 	/*
1995 	 * It's not an error if bd_attach_handle() is called on a handle that
1996 	 * already is attached. We just ignore the request to attach and return.
1997 	 * This way drivers using blkdev don't have to keep track about blkdev
1998 	 * state, they can just call this function to make sure it attached.
1999 	 */
2000 	if (hdl->h_child != NULL) {
2001 		return (DDI_SUCCESS);
2002 	}
2003 
2004 	/* if drivers don't override this, make it assume none */
2005 	drive.d_lun = -1;
2006 	hdl->h_ops.o_drive_info(hdl->h_private, &drive);
2007 
2008 	hdl->h_parent = dip;
2009 	hdl->h_name = "blkdev";
2010 
2011 	/*LINTED: E_BAD_PTR_CAST_ALIGN*/
2012 	if (*(uint64_t *)drive.d_eui64 != 0) {
2013 		if (drive.d_lun >= 0) {
2014 			(void) snprintf(hdl->h_addr, sizeof (hdl->h_addr),
2015 			    "w%02X%02X%02X%02X%02X%02X%02X%02X,%X",
2016 			    drive.d_eui64[0], drive.d_eui64[1],
2017 			    drive.d_eui64[2], drive.d_eui64[3],
2018 			    drive.d_eui64[4], drive.d_eui64[5],
2019 			    drive.d_eui64[6], drive.d_eui64[7], drive.d_lun);
2020 		} else {
2021 			(void) snprintf(hdl->h_addr, sizeof (hdl->h_addr),
2022 			    "w%02X%02X%02X%02X%02X%02X%02X%02X",
2023 			    drive.d_eui64[0], drive.d_eui64[1],
2024 			    drive.d_eui64[2], drive.d_eui64[3],
2025 			    drive.d_eui64[4], drive.d_eui64[5],
2026 			    drive.d_eui64[6], drive.d_eui64[7]);
2027 		}
2028 	} else {
2029 		if (drive.d_lun >= 0) {
2030 			(void) snprintf(hdl->h_addr, sizeof (hdl->h_addr),
2031 			    "%X,%X", drive.d_target, drive.d_lun);
2032 		} else {
2033 			(void) snprintf(hdl->h_addr, sizeof (hdl->h_addr),
2034 			    "%X", drive.d_target);
2035 		}
2036 	}
2037 
2038 	if (ndi_devi_alloc(dip, hdl->h_name, (pnode_t)DEVI_SID_NODEID,
2039 	    &child) != NDI_SUCCESS) {
2040 		cmn_err(CE_WARN, "%s%d: unable to allocate node %s@%s",
2041 		    ddi_driver_name(dip), ddi_get_instance(dip),
2042 		    "blkdev", hdl->h_addr);
2043 		return (DDI_FAILURE);
2044 	}
2045 
2046 	ddi_set_parent_data(child, hdl);
2047 	hdl->h_child = child;
2048 
2049 	if (ndi_devi_online(child, 0) == NDI_FAILURE) {
2050 		cmn_err(CE_WARN, "%s%d: failed bringing node %s@%s online",
2051 		    ddi_driver_name(dip), ddi_get_instance(dip),
2052 		    hdl->h_name, hdl->h_addr);
2053 		(void) ndi_devi_free(child);
2054 		return (DDI_FAILURE);
2055 	}
2056 
2057 	return (DDI_SUCCESS);
2058 }
2059 
2060 int
2061 bd_detach_handle(bd_handle_t hdl)
2062 {
2063 	int	circ;
2064 	int	rv;
2065 	char	*devnm;
2066 
2067 	/*
2068 	 * It's not an error if bd_detach_handle() is called on a handle that
2069 	 * already is detached. We just ignore the request to detach and return.
2070 	 * This way drivers using blkdev don't have to keep track about blkdev
2071 	 * state, they can just call this function to make sure it detached.
2072 	 */
2073 	if (hdl->h_child == NULL) {
2074 		return (DDI_SUCCESS);
2075 	}
2076 	ndi_devi_enter(hdl->h_parent, &circ);
2077 	if (i_ddi_node_state(hdl->h_child) < DS_INITIALIZED) {
2078 		rv = ddi_remove_child(hdl->h_child, 0);
2079 	} else {
2080 		devnm = kmem_alloc(MAXNAMELEN + 1, KM_SLEEP);
2081 		(void) ddi_deviname(hdl->h_child, devnm);
2082 		(void) devfs_clean(hdl->h_parent, devnm + 1, DV_CLEAN_FORCE);
2083 		rv = ndi_devi_unconfig_one(hdl->h_parent, devnm + 1, NULL,
2084 		    NDI_DEVI_REMOVE | NDI_UNCONFIG);
2085 		kmem_free(devnm, MAXNAMELEN + 1);
2086 	}
2087 	if (rv == 0) {
2088 		hdl->h_child = NULL;
2089 	}
2090 
2091 	ndi_devi_exit(hdl->h_parent, circ);
2092 	return (rv == NDI_SUCCESS ? DDI_SUCCESS : DDI_FAILURE);
2093 }
2094 
2095 void
2096 bd_xfer_done(bd_xfer_t *xfer, int err)
2097 {
2098 	bd_xfer_impl_t	*xi = (void *)xfer;
2099 	buf_t		*bp = xi->i_bp;
2100 	int		rv = DDI_SUCCESS;
2101 	bd_t		*bd = xi->i_bd;
2102 	size_t		len;
2103 
2104 	if (err != 0) {
2105 		bd_runq_exit(xi, err);
2106 		atomic_inc_32(&bd->d_kerr->bd_harderrs.value.ui32);
2107 
2108 		bp->b_resid += xi->i_resid;
2109 		bd_xfer_free(xi);
2110 		bioerror(bp, err);
2111 		biodone(bp);
2112 		return;
2113 	}
2114 
2115 	xi->i_cur_win++;
2116 	xi->i_resid -= xi->i_len;
2117 
2118 	if (xi->i_resid == 0) {
2119 		/* Job completed succcessfully! */
2120 		bd_runq_exit(xi, 0);
2121 
2122 		bd_xfer_free(xi);
2123 		biodone(bp);
2124 		return;
2125 	}
2126 
2127 	xi->i_blkno += xi->i_nblks;
2128 
2129 	if (bd->d_use_dma) {
2130 		/* More transfer still pending... advance to next DMA window. */
2131 		rv = ddi_dma_getwin(xi->i_dmah, xi->i_cur_win,
2132 		    &xi->i_offset, &len, &xi->i_dmac, &xi->i_ndmac);
2133 	} else {
2134 		/* Advance memory window. */
2135 		xi->i_kaddr += xi->i_len;
2136 		xi->i_offset += xi->i_len;
2137 		len = min(bp->b_bcount - xi->i_offset, bd->d_maxxfer);
2138 	}
2139 
2140 
2141 	if ((rv != DDI_SUCCESS) ||
2142 	    (P2PHASE(len, (1U << xi->i_blkshift)) != 0)) {
2143 		bd_runq_exit(xi, EFAULT);
2144 
2145 		bp->b_resid += xi->i_resid;
2146 		bd_xfer_free(xi);
2147 		bioerror(bp, EFAULT);
2148 		biodone(bp);
2149 		return;
2150 	}
2151 	xi->i_len = len;
2152 	xi->i_nblks = len >> xi->i_blkshift;
2153 
2154 	/* Submit next window to hardware. */
2155 	rv = xi->i_func(bd->d_private, &xi->i_public);
2156 	if (rv != 0) {
2157 		bd_runq_exit(xi, rv);
2158 
2159 		atomic_inc_32(&bd->d_kerr->bd_transerrs.value.ui32);
2160 
2161 		bp->b_resid += xi->i_resid;
2162 		bd_xfer_free(xi);
2163 		bioerror(bp, rv);
2164 		biodone(bp);
2165 	}
2166 }
2167 
2168 void
2169 bd_error(bd_xfer_t *xfer, int error)
2170 {
2171 	bd_xfer_impl_t	*xi = (void *)xfer;
2172 	bd_t		*bd = xi->i_bd;
2173 
2174 	switch (error) {
2175 	case BD_ERR_MEDIA:
2176 		atomic_inc_32(&bd->d_kerr->bd_rq_media_err.value.ui32);
2177 		break;
2178 	case BD_ERR_NTRDY:
2179 		atomic_inc_32(&bd->d_kerr->bd_rq_ntrdy_err.value.ui32);
2180 		break;
2181 	case BD_ERR_NODEV:
2182 		atomic_inc_32(&bd->d_kerr->bd_rq_nodev_err.value.ui32);
2183 		break;
2184 	case BD_ERR_RECOV:
2185 		atomic_inc_32(&bd->d_kerr->bd_rq_recov_err.value.ui32);
2186 		break;
2187 	case BD_ERR_ILLRQ:
2188 		atomic_inc_32(&bd->d_kerr->bd_rq_illrq_err.value.ui32);
2189 		break;
2190 	case BD_ERR_PFA:
2191 		atomic_inc_32(&bd->d_kerr->bd_rq_pfa_err.value.ui32);
2192 		break;
2193 	default:
2194 		cmn_err(CE_PANIC, "bd_error: unknown error type %d", error);
2195 		break;
2196 	}
2197 }
2198 
2199 void
2200 bd_state_change(bd_handle_t hdl)
2201 {
2202 	bd_t		*bd;
2203 
2204 	if ((bd = hdl->h_bd) != NULL) {
2205 		bd_update_state(bd);
2206 	}
2207 }
2208 
2209 void
2210 bd_mod_init(struct dev_ops *devops)
2211 {
2212 	static struct bus_ops bd_bus_ops = {
2213 		BUSO_REV,		/* busops_rev */
2214 		nullbusmap,		/* bus_map */
2215 		NULL,			/* bus_get_intrspec (OBSOLETE) */
2216 		NULL,			/* bus_add_intrspec (OBSOLETE) */
2217 		NULL,			/* bus_remove_intrspec (OBSOLETE) */
2218 		i_ddi_map_fault,	/* bus_map_fault */
2219 		NULL,			/* bus_dma_map (OBSOLETE) */
2220 		ddi_dma_allochdl,	/* bus_dma_allochdl */
2221 		ddi_dma_freehdl,	/* bus_dma_freehdl */
2222 		ddi_dma_bindhdl,	/* bus_dma_bindhdl */
2223 		ddi_dma_unbindhdl,	/* bus_dma_unbindhdl */
2224 		ddi_dma_flush,		/* bus_dma_flush */
2225 		ddi_dma_win,		/* bus_dma_win */
2226 		ddi_dma_mctl,		/* bus_dma_ctl */
2227 		bd_bus_ctl,		/* bus_ctl */
2228 		ddi_bus_prop_op,	/* bus_prop_op */
2229 		NULL,			/* bus_get_eventcookie */
2230 		NULL,			/* bus_add_eventcall */
2231 		NULL,			/* bus_remove_eventcall */
2232 		NULL,			/* bus_post_event */
2233 		NULL,			/* bus_intr_ctl (OBSOLETE) */
2234 		NULL,			/* bus_config */
2235 		NULL,			/* bus_unconfig */
2236 		NULL,			/* bus_fm_init */
2237 		NULL,			/* bus_fm_fini */
2238 		NULL,			/* bus_fm_access_enter */
2239 		NULL,			/* bus_fm_access_exit */
2240 		NULL,			/* bus_power */
2241 		NULL,			/* bus_intr_op */
2242 	};
2243 
2244 	devops->devo_bus_ops = &bd_bus_ops;
2245 
2246 	/*
2247 	 * NB: The device driver is free to supply its own
2248 	 * character entry device support.
2249 	 */
2250 }
2251 
2252 void
2253 bd_mod_fini(struct dev_ops *devops)
2254 {
2255 	devops->devo_bus_ops = NULL;
2256 }
2257