xref: /illumos-gate/usr/src/uts/common/io/blkdev/blkdev.c (revision 33f84ecfada5880d94e9bfc5af7954d41e5664d5)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved.
23  * Copyright 2012 Garrett D'Amore <garrett@damore.org>.  All rights reserved.
24  * Copyright 2012 Alexey Zaytsev <alexey.zaytsev@gmail.com> All rights reserved.
25  * Copyright 2016 Nexenta Systems, Inc.  All rights reserved.
26  * Copyright 2017 The MathWorks, Inc.  All rights reserved.
27  * Copyright 2019 Western Digital Corporation.
28  * Copyright 2020 Joyent, Inc.
29  */
30 
31 #include <sys/types.h>
32 #include <sys/ksynch.h>
33 #include <sys/kmem.h>
34 #include <sys/file.h>
35 #include <sys/errno.h>
36 #include <sys/open.h>
37 #include <sys/buf.h>
38 #include <sys/uio.h>
39 #include <sys/aio_req.h>
40 #include <sys/cred.h>
41 #include <sys/modctl.h>
42 #include <sys/cmlb.h>
43 #include <sys/conf.h>
44 #include <sys/devops.h>
45 #include <sys/list.h>
46 #include <sys/sysmacros.h>
47 #include <sys/dkio.h>
48 #include <sys/vtoc.h>
49 #include <sys/scsi/scsi.h>	/* for DTYPE_DIRECT */
50 #include <sys/kstat.h>
51 #include <sys/fs/dv_node.h>
52 #include <sys/ddi.h>
53 #include <sys/sunddi.h>
54 #include <sys/note.h>
55 #include <sys/blkdev.h>
56 #include <sys/scsi/impl/inquiry.h>
57 
58 /*
59  * blkdev is a driver which provides much of the common functionality
60  * a block device driver may need, removing code which would otherwise
61  * be duplicated across block device drivers.
62  *
63  * Within this driver all the struct cb_ops functions required for a
64  * block device driver are written with appropriate call back functions
65  * to be provided by the parent driver.
66  *
67  * To use blkdev, a driver needs to:
68  *	1. Create a bd_ops_t structure which has the call back operations
69  *	   blkdev will use.
70  *	2. Create a handle by calling bd_alloc_handle(). One of the
71  *	   arguments to this function is the bd_ops_t.
72  *	3. Call bd_attach_handle(). This will instantiate a blkdev device
73  *	   as a child device node of the calling driver.
74  *
75  * A parent driver is not restricted to just allocating and attaching a
76  * single instance, it may attach as many as it wishes. For each handle
77  * attached, appropriate entries in /dev/[r]dsk are created.
78  *
79  * The bd_ops_t routines that a parent of blkdev needs to provide are:
80  *
81  * o_drive_info: Provide information to blkdev such as how many I/O queues
82  *		 to create and the size of those queues. Also some device
83  *		 specifics such as EUI, vendor, product, model, serial
84  *		 number ....
85  *
86  * o_media_info: Provide information about the media, e.g. size and block size.
87  *
88  * o_devid_init: Creates and initializes the device id. Typically calls
89  *		 ddi_devid_init().
90  *
91  * o_sync_cache: Issues a device appropriate command to flush any write
92  *		 caches.
93  *
94  * o_read:	 Read data as described by bd_xfer_t argument.
95  *
96  * o_write:	 Write data as described by bd_xfer_t argument.
97  *
98  *
99  * Queues
100  * ------
101  * Part of the drive_info data is a queue count. blkdev will create
102  * "queue count" number of waitq/runq pairs. Each waitq/runq pair
103  * operates independently. As an I/O is scheduled up to the parent
104  * driver via o_read or o_write its queue number is given. If the
105  * parent driver supports multiple hardware queues it can then select
106  * where to submit the I/O request.
107  *
108  * Currently blkdev uses a simplistic round-robin queue selection method.
109  * It has the advantage that it is lockless. In the future it will be
110  * worthwhile reviewing this strategy for something which prioritizes queues
111  * depending on how busy they are.
112  *
113  * Each waitq/runq pair is protected by its mutex (q_iomutex). Incoming
114  * I/O requests are initially added to the waitq. They are taken off the
115  * waitq, added to the runq and submitted, provided the runq holds fewer
116  * entries than the qsize specified in the drive_info. As an I/O request
117  * completes, the parent driver is required to call bd_xfer_done(), which
118  * will remove the I/O request from the runq and pass I/O completion
119  * status up the stack.
120  *
121  * Locks
122  * -----
123  * There are 4 instance global locks: d_ocmutex, d_ksmutex, d_errmutex and
124  * d_statemutex, as well as a q_iomutex per waitq/runq pair.
125  *
126  * Lock Hierarchy
127  * --------------
128  * The only two locks which may be held simultaneously are q_iomutex and
129  * d_ksmutex. In all cases q_iomutex must be acquired before d_ksmutex.
130  */
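
/*
 * As an illustrative sketch (hypothetical mydrv_* names, not part of
 * this driver), a parent's attach path might use blkdev like this:
 *
 *	static bd_ops_t mydrv_bd_ops = {
 *		.o_version	= BD_OPS_CURRENT_VERSION,
 *		.o_drive_info	= mydrv_drive_info,
 *		.o_media_info	= mydrv_media_info,
 *		.o_devid_init	= mydrv_devid_init,
 *		.o_sync_cache	= mydrv_sync_cache,
 *		.o_read		= mydrv_read,
 *		.o_write	= mydrv_write,
 *	};
 *
 *	static int
 *	mydrv_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
 *	{
 *		mydrv_t *sc = mydrv_soft_state(dip);
 *
 *		sc->sc_bdh = bd_alloc_handle(sc, &mydrv_bd_ops,
 *		    &mydrv_dma_attr, KM_SLEEP);
 *		if (sc->sc_bdh == NULL)
 *			return (DDI_FAILURE);
 *		if (bd_attach_handle(dip, sc->sc_bdh) != DDI_SUCCESS) {
 *			bd_free_handle(sc->sc_bdh);
 *			return (DDI_FAILURE);
 *		}
 *		return (DDI_SUCCESS);
 *	}
 */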
131 
132 #define	BD_MAXPART	64
133 #define	BDINST(dev)	(getminor(dev) / BD_MAXPART)
134 #define	BDPART(dev)	(getminor(dev) % BD_MAXPART)
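
/*
 * For example, a dev_t whose minor number is 130 maps to blkdev
 * instance 2 (130 / 64) and partition 2 (130 % 64).
 */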
135 
136 typedef struct bd bd_t;
137 typedef struct bd_xfer_impl bd_xfer_impl_t;
138 typedef struct bd_queue bd_queue_t;
139 
140 struct bd {
141 	void		*d_private;
142 	dev_info_t	*d_dip;
143 	kmutex_t	d_ocmutex;
144 	kmutex_t	d_ksmutex;
145 	kmutex_t	d_errmutex;
146 	kmutex_t	d_statemutex;
147 	kcondvar_t	d_statecv;
148 	enum dkio_state	d_state;
149 	cmlb_handle_t	d_cmlbh;
150 	unsigned	d_open_lyr[BD_MAXPART];	/* open count */
151 	uint64_t	d_open_excl;	/* bit mask indexed by partition */
152 	uint64_t	d_open_reg[OTYPCNT];		/* bit mask */
153 	uint64_t	d_io_counter;
154 
155 	uint32_t	d_qcount;
156 	uint32_t	d_qactive;
157 	uint32_t	d_maxxfer;
158 	uint32_t	d_blkshift;
159 	uint32_t	d_pblkshift;
160 	uint64_t	d_numblks;
161 	ddi_devid_t	d_devid;
162 
163 	kmem_cache_t	*d_cache;
164 	bd_queue_t	*d_queues;
165 	kstat_t		*d_ksp;
166 	kstat_io_t	*d_kiop;
167 	kstat_t		*d_errstats;
168 	struct bd_errstats *d_kerr;
169 
170 	boolean_t	d_rdonly;
171 	boolean_t	d_ssd;
172 	boolean_t	d_removable;
173 	boolean_t	d_hotpluggable;
174 	boolean_t	d_use_dma;
175 
176 	ddi_dma_attr_t	d_dma;
177 	bd_ops_t	d_ops;
178 	bd_handle_t	d_handle;
179 };
180 
181 struct bd_handle {
182 	bd_ops_t	h_ops;
183 	ddi_dma_attr_t	*h_dma;
184 	dev_info_t	*h_parent;
185 	dev_info_t	*h_child;
186 	void		*h_private;
187 	bd_t		*h_bd;
188 	char		*h_name;
189 	char		h_addr[30];	/* enough for w%0.16x,%X */
190 };
191 
192 struct bd_xfer_impl {
193 	bd_xfer_t	i_public;
194 	list_node_t	i_linkage;
195 	bd_t		*i_bd;
196 	buf_t		*i_bp;
197 	bd_queue_t	*i_bq;
198 	uint_t		i_num_win;
199 	uint_t		i_cur_win;
200 	off_t		i_offset;
201 	int		(*i_func)(void *, bd_xfer_t *);
202 	uint32_t	i_blkshift;
203 	size_t		i_len;
204 	size_t		i_resid;
205 };
206 
207 struct bd_queue {
208 	kmutex_t	q_iomutex;
209 	uint32_t	q_qsize;
210 	uint32_t	q_qactive;
211 	list_t		q_runq;
212 	list_t		q_waitq;
213 };
214 
215 #define	i_dmah		i_public.x_dmah
216 #define	i_dmac		i_public.x_dmac
217 #define	i_ndmac		i_public.x_ndmac
218 #define	i_kaddr		i_public.x_kaddr
219 #define	i_nblks		i_public.x_nblks
220 #define	i_blkno		i_public.x_blkno
221 #define	i_flags		i_public.x_flags
222 #define	i_qnum		i_public.x_qnum
223 
224 
225 /*
226  * Private prototypes.
227  */
228 
229 static void bd_prop_update_inqstring(dev_info_t *, char *, char *, size_t);
230 static void bd_create_inquiry_props(dev_info_t *, bd_drive_t *);
231 static void bd_create_errstats(bd_t *, int, bd_drive_t *);
232 static void bd_destroy_errstats(bd_t *);
233 static void bd_errstats_setstr(kstat_named_t *, char *, size_t, char *);
234 static void bd_init_errstats(bd_t *, bd_drive_t *);
235 static void bd_fini_errstats(bd_t *);
236 
237 static int bd_getinfo(dev_info_t *, ddi_info_cmd_t, void *, void **);
238 static int bd_attach(dev_info_t *, ddi_attach_cmd_t);
239 static int bd_detach(dev_info_t *, ddi_detach_cmd_t);
240 
241 static int bd_open(dev_t *, int, int, cred_t *);
242 static int bd_close(dev_t, int, int, cred_t *);
243 static int bd_strategy(struct buf *);
244 static int bd_ioctl(dev_t, int, intptr_t, int, cred_t *, int *);
245 static int bd_dump(dev_t, caddr_t, daddr_t, int);
246 static int bd_read(dev_t, struct uio *, cred_t *);
247 static int bd_write(dev_t, struct uio *, cred_t *);
248 static int bd_aread(dev_t, struct aio_req *, cred_t *);
249 static int bd_awrite(dev_t, struct aio_req *, cred_t *);
250 static int bd_prop_op(dev_t, dev_info_t *, ddi_prop_op_t, int, char *,
251     caddr_t, int *);
252 
253 static int bd_tg_rdwr(dev_info_t *, uchar_t, void *, diskaddr_t, size_t,
254     void *);
255 static int bd_tg_getinfo(dev_info_t *, int, void *, void *);
256 static int bd_xfer_ctor(void *, void *, int);
257 static void bd_xfer_dtor(void *, void *);
258 static void bd_sched(bd_t *, bd_queue_t *);
259 static void bd_submit(bd_t *, bd_xfer_impl_t *);
260 static void bd_runq_exit(bd_xfer_impl_t *, int);
261 static void bd_update_state(bd_t *);
262 static int bd_check_state(bd_t *, enum dkio_state *);
263 static int bd_flush_write_cache(bd_t *, struct dk_callback *);
264 static int bd_check_uio(dev_t, struct uio *);
265 
266 struct cmlb_tg_ops bd_tg_ops = {
267 	TG_DK_OPS_VERSION_1,
268 	bd_tg_rdwr,
269 	bd_tg_getinfo,
270 };
271 
272 static struct cb_ops bd_cb_ops = {
273 	bd_open,		/* open */
274 	bd_close,		/* close */
275 	bd_strategy,		/* strategy */
276 	nodev,			/* print */
277 	bd_dump,		/* dump */
278 	bd_read,		/* read */
279 	bd_write,		/* write */
280 	bd_ioctl,		/* ioctl */
281 	nodev,			/* devmap */
282 	nodev,			/* mmap */
283 	nodev,			/* segmap */
284 	nochpoll,		/* poll */
285 	bd_prop_op,		/* cb_prop_op */
286 	0,			/* streamtab  */
287 	D_64BIT | D_MP,		/* Driver compatibility flag */
288 	CB_REV,			/* cb_rev */
289 	bd_aread,		/* async read */
290 	bd_awrite		/* async write */
291 };
292 
293 struct dev_ops bd_dev_ops = {
294 	DEVO_REV,		/* devo_rev, */
295 	0,			/* refcnt  */
296 	bd_getinfo,		/* getinfo */
297 	nulldev,		/* identify */
298 	nulldev,		/* probe */
299 	bd_attach,		/* attach */
300 	bd_detach,		/* detach */
301 	nodev,			/* reset */
302 	&bd_cb_ops,		/* driver operations */
303 	NULL,			/* bus operations */
304 	NULL,			/* power */
305 	ddi_quiesce_not_needed,	/* quiesce */
306 };
307 
308 static struct modldrv modldrv = {
309 	&mod_driverops,
310 	"Generic Block Device",
311 	&bd_dev_ops,
312 };
313 
314 static struct modlinkage modlinkage = {
315 	MODREV_1, { &modldrv, NULL }
316 };
317 
318 static void *bd_state;
319 static krwlock_t bd_lock;
320 
321 int
322 _init(void)
323 {
324 	int	rv;
325 
326 	rv = ddi_soft_state_init(&bd_state, sizeof (struct bd), 2);
327 	if (rv != DDI_SUCCESS) {
328 		return (rv);
329 	}
330 	rw_init(&bd_lock, NULL, RW_DRIVER, NULL);
331 	rv = mod_install(&modlinkage);
332 	if (rv != DDI_SUCCESS) {
333 		rw_destroy(&bd_lock);
334 		ddi_soft_state_fini(&bd_state);
335 	}
336 	return (rv);
337 }
338 
339 int
340 _fini(void)
341 {
342 	int	rv;
343 
344 	rv = mod_remove(&modlinkage);
345 	if (rv == DDI_SUCCESS) {
346 		rw_destroy(&bd_lock);
347 		ddi_soft_state_fini(&bd_state);
348 	}
349 	return (rv);
350 }
351 
352 int
353 _info(struct modinfo *modinfop)
354 {
355 	return (mod_info(&modlinkage, modinfop));
356 }
357 
358 static int
359 bd_getinfo(dev_info_t *dip, ddi_info_cmd_t cmd, void *arg, void **resultp)
360 {
361 	bd_t	*bd;
362 	minor_t	inst;
363 
364 	_NOTE(ARGUNUSED(dip));
365 
366 	inst = BDINST((dev_t)arg);
367 
368 	switch (cmd) {
369 	case DDI_INFO_DEVT2DEVINFO:
370 		bd = ddi_get_soft_state(bd_state, inst);
371 		if (bd == NULL) {
372 			return (DDI_FAILURE);
373 		}
374 		*resultp = (void *)bd->d_dip;
375 		break;
376 
377 	case DDI_INFO_DEVT2INSTANCE:
378 		*resultp = (void *)(intptr_t)inst;
379 		break;
380 
381 	default:
382 		return (DDI_FAILURE);
383 	}
384 	return (DDI_SUCCESS);
385 }
386 
387 static void
388 bd_prop_update_inqstring(dev_info_t *dip, char *name, char *data, size_t len)
389 {
390 	int	ilen;
391 	char	*data_string;
392 
393 	ilen = scsi_ascii_inquiry_len(data, len);
394 	ASSERT3U(ilen, <=, len);
395 	if (ilen <= 0)
396 		return;
397 	/* ensure null termination */
398 	data_string = kmem_zalloc(ilen + 1, KM_SLEEP);
399 	bcopy(data, data_string, ilen);
400 	(void) ndi_prop_update_string(DDI_DEV_T_NONE, dip, name, data_string);
401 	kmem_free(data_string, ilen + 1);
402 }
403 
404 static void
405 bd_create_inquiry_props(dev_info_t *dip, bd_drive_t *drive)
406 {
407 	if (drive->d_vendor_len > 0)
408 		bd_prop_update_inqstring(dip, INQUIRY_VENDOR_ID,
409 		    drive->d_vendor, drive->d_vendor_len);
410 
411 	if (drive->d_product_len > 0)
412 		bd_prop_update_inqstring(dip, INQUIRY_PRODUCT_ID,
413 		    drive->d_product, drive->d_product_len);
414 
415 	if (drive->d_serial_len > 0)
416 		bd_prop_update_inqstring(dip, INQUIRY_SERIAL_NO,
417 		    drive->d_serial, drive->d_serial_len);
418 
419 	if (drive->d_revision_len > 0)
420 		bd_prop_update_inqstring(dip, INQUIRY_REVISION_ID,
421 		    drive->d_revision, drive->d_revision_len);
422 }
423 
424 static void
425 bd_create_errstats(bd_t *bd, int inst, bd_drive_t *drive)
426 {
427 	char	ks_module[KSTAT_STRLEN];
428 	char	ks_name[KSTAT_STRLEN];
429 	int	ndata = sizeof (struct bd_errstats) / sizeof (kstat_named_t);
430 
431 	if (bd->d_errstats != NULL)
432 		return;
433 
434 	(void) snprintf(ks_module, sizeof (ks_module), "%serr",
435 	    ddi_driver_name(bd->d_dip));
436 	(void) snprintf(ks_name, sizeof (ks_name), "%s%d,err",
437 	    ddi_driver_name(bd->d_dip), inst);
438 
439 	bd->d_errstats = kstat_create(ks_module, inst, ks_name, "device_error",
440 	    KSTAT_TYPE_NAMED, ndata, KSTAT_FLAG_PERSISTENT);
441 
442 	mutex_init(&bd->d_errmutex, NULL, MUTEX_DRIVER, NULL);
443 	if (bd->d_errstats == NULL) {
444 		/*
445 		 * Even if we cannot create the kstat, we create a
446 		 * scratch kstat.  The reason for this is to ensure
447 		 * that we can update the kstat all of the time,
448 		 * without adding an extra branch instruction.
449 		 */
450 		bd->d_kerr = kmem_zalloc(sizeof (struct bd_errstats),
451 		    KM_SLEEP);
452 	} else {
453 		bd->d_errstats->ks_lock = &bd->d_errmutex;
454 		bd->d_kerr = (struct bd_errstats *)bd->d_errstats->ks_data;
455 	}
456 
457 	kstat_named_init(&bd->d_kerr->bd_softerrs,	"Soft Errors",
458 	    KSTAT_DATA_UINT32);
459 	kstat_named_init(&bd->d_kerr->bd_harderrs,	"Hard Errors",
460 	    KSTAT_DATA_UINT32);
461 	kstat_named_init(&bd->d_kerr->bd_transerrs,	"Transport Errors",
462 	    KSTAT_DATA_UINT32);
463 
464 	if (drive->d_model_len > 0) {
465 		kstat_named_init(&bd->d_kerr->bd_model,	"Model",
466 		    KSTAT_DATA_STRING);
467 	} else {
468 		kstat_named_init(&bd->d_kerr->bd_vid,	"Vendor",
469 		    KSTAT_DATA_STRING);
470 		kstat_named_init(&bd->d_kerr->bd_pid,	"Product",
471 		    KSTAT_DATA_STRING);
472 	}
473 
474 	kstat_named_init(&bd->d_kerr->bd_revision,	"Revision",
475 	    KSTAT_DATA_STRING);
476 	kstat_named_init(&bd->d_kerr->bd_serial,	"Serial No",
477 	    KSTAT_DATA_STRING);
478 	kstat_named_init(&bd->d_kerr->bd_capacity,	"Size",
479 	    KSTAT_DATA_ULONGLONG);
480 	kstat_named_init(&bd->d_kerr->bd_rq_media_err,	"Media Error",
481 	    KSTAT_DATA_UINT32);
482 	kstat_named_init(&bd->d_kerr->bd_rq_ntrdy_err,	"Device Not Ready",
483 	    KSTAT_DATA_UINT32);
484 	kstat_named_init(&bd->d_kerr->bd_rq_nodev_err,	"No Device",
485 	    KSTAT_DATA_UINT32);
486 	kstat_named_init(&bd->d_kerr->bd_rq_recov_err,	"Recoverable",
487 	    KSTAT_DATA_UINT32);
488 	kstat_named_init(&bd->d_kerr->bd_rq_illrq_err,	"Illegal Request",
489 	    KSTAT_DATA_UINT32);
490 	kstat_named_init(&bd->d_kerr->bd_rq_pfa_err,
491 	    "Predictive Failure Analysis", KSTAT_DATA_UINT32);
492 
493 	bd->d_errstats->ks_private = bd;
494 
495 	kstat_install(bd->d_errstats);
496 	bd_init_errstats(bd, drive);
497 }
498 
499 static void
500 bd_destroy_errstats(bd_t *bd)
501 {
502 	if (bd->d_errstats != NULL) {
503 		bd_fini_errstats(bd);
504 		kstat_delete(bd->d_errstats);
505 		bd->d_errstats = NULL;
506 	} else {
507 		kmem_free(bd->d_kerr, sizeof (struct bd_errstats));
508 		bd->d_kerr = NULL;
509 		mutex_destroy(&bd->d_errmutex);
510 	}
511 }
512 
513 static void
514 bd_errstats_setstr(kstat_named_t *k, char *str, size_t len, char *alt)
515 {
516 	char	*tmp;
517 	size_t	km_len;
518 
519 	if (KSTAT_NAMED_STR_PTR(k) == NULL) {
520 		if (len > 0)
521 			km_len = strnlen(str, len);
522 		else if (alt != NULL)
523 			km_len = strlen(alt);
524 		else
525 			return;
526 
527 		tmp = kmem_alloc(km_len + 1, KM_SLEEP);
528 		bcopy(len > 0 ? str : alt, tmp, km_len);
529 		tmp[km_len] = '\0';
530 
531 		kstat_named_setstr(k, tmp);
532 	}
533 }
534 
535 static void
536 bd_errstats_clrstr(kstat_named_t *k)
537 {
538 	if (KSTAT_NAMED_STR_PTR(k) == NULL)
539 		return;
540 
541 	kmem_free(KSTAT_NAMED_STR_PTR(k), KSTAT_NAMED_STR_BUFLEN(k));
542 	kstat_named_setstr(k, NULL);
543 }
544 
545 static void
546 bd_init_errstats(bd_t *bd, bd_drive_t *drive)
547 {
548 	struct bd_errstats	*est = bd->d_kerr;
549 
550 	mutex_enter(&bd->d_errmutex);
551 
552 	if (drive->d_model_len > 0 &&
553 	    KSTAT_NAMED_STR_PTR(&est->bd_model) == NULL) {
554 		bd_errstats_setstr(&est->bd_model, drive->d_model,
555 		    drive->d_model_len, NULL);
556 	} else {
557 		bd_errstats_setstr(&est->bd_vid, drive->d_vendor,
558 		    drive->d_vendor_len, "Unknown ");
559 		bd_errstats_setstr(&est->bd_pid, drive->d_product,
560 		    drive->d_product_len, "Unknown         ");
561 	}
562 
563 	bd_errstats_setstr(&est->bd_revision, drive->d_revision,
564 	    drive->d_revision_len, "0001");
565 	bd_errstats_setstr(&est->bd_serial, drive->d_serial,
566 	    drive->d_serial_len, "0               ");
567 
568 	mutex_exit(&bd->d_errmutex);
569 }
570 
571 static void
572 bd_fini_errstats(bd_t *bd)
573 {
574 	struct bd_errstats	*est = bd->d_kerr;
575 
576 	mutex_enter(&bd->d_errmutex);
577 
578 	bd_errstats_clrstr(&est->bd_model);
579 	bd_errstats_clrstr(&est->bd_vid);
580 	bd_errstats_clrstr(&est->bd_pid);
581 	bd_errstats_clrstr(&est->bd_revision);
582 	bd_errstats_clrstr(&est->bd_serial);
583 
584 	mutex_exit(&bd->d_errmutex);
585 }
586 
587 static void
588 bd_queues_free(bd_t *bd)
589 {
590 	uint32_t i;
591 
592 	for (i = 0; i < bd->d_qcount; i++) {
593 		bd_queue_t *bq = &bd->d_queues[i];
594 
595 		mutex_destroy(&bq->q_iomutex);
596 		list_destroy(&bq->q_waitq);
597 		list_destroy(&bq->q_runq);
598 	}
599 
600 	kmem_free(bd->d_queues, sizeof (*bd->d_queues) * bd->d_qcount);
601 }
602 
603 static int
604 bd_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
605 {
606 	int		inst;
607 	bd_handle_t	hdl;
608 	bd_t		*bd;
609 	bd_drive_t	drive;
610 	uint32_t	i;
611 	int		rv;
612 	char		name[16];
613 	char		kcache[32];
614 
615 	switch (cmd) {
616 	case DDI_ATTACH:
617 		break;
618 	case DDI_RESUME:
619 		/* We don't do anything native for suspend/resume */
620 		return (DDI_SUCCESS);
621 	default:
622 		return (DDI_FAILURE);
623 	}
624 
625 	inst = ddi_get_instance(dip);
626 	hdl = ddi_get_parent_data(dip);
627 
628 	(void) snprintf(name, sizeof (name), "%s%d",
629 	    ddi_driver_name(dip), ddi_get_instance(dip));
630 	(void) snprintf(kcache, sizeof (kcache), "%s_xfer", name);
631 
632 	if (hdl == NULL) {
633 		cmn_err(CE_WARN, "%s: missing parent data!", name);
634 		return (DDI_FAILURE);
635 	}
636 
637 	if (ddi_soft_state_zalloc(bd_state, inst) != DDI_SUCCESS) {
638 		cmn_err(CE_WARN, "%s: unable to zalloc soft state!", name);
639 		return (DDI_FAILURE);
640 	}
641 	bd = ddi_get_soft_state(bd_state, inst);
642 
643 	if (hdl->h_dma) {
644 		bd->d_dma = *(hdl->h_dma);
645 		bd->d_dma.dma_attr_granular =
646 		    max(DEV_BSIZE, bd->d_dma.dma_attr_granular);
647 		bd->d_use_dma = B_TRUE;
648 
649 		if (bd->d_maxxfer &&
650 		    (bd->d_maxxfer != bd->d_dma.dma_attr_maxxfer)) {
651 			cmn_err(CE_WARN,
652 			    "%s: inconsistent maximum transfer size!",
653 			    name);
654 			/* We force it */
655 			bd->d_maxxfer = bd->d_dma.dma_attr_maxxfer;
656 		} else {
657 			bd->d_maxxfer = bd->d_dma.dma_attr_maxxfer;
658 		}
659 	} else {
660 		bd->d_use_dma = B_FALSE;
661 		if (bd->d_maxxfer == 0) {
662 			bd->d_maxxfer = 1024 * 1024;
663 		}
664 	}
665 	bd->d_ops = hdl->h_ops;
666 	bd->d_private = hdl->h_private;
667 	bd->d_blkshift = 9;	/* 512 bytes, to start */
668 
669 	if (bd->d_maxxfer % DEV_BSIZE) {
670 		cmn_err(CE_WARN, "%s: maximum transfer misaligned!", name);
671 		bd->d_maxxfer &= ~(DEV_BSIZE - 1);
672 	}
673 	if (bd->d_maxxfer < DEV_BSIZE) {
674 		cmn_err(CE_WARN, "%s: maximum transfer size too small!", name);
675 		ddi_soft_state_free(bd_state, inst);
676 		return (DDI_FAILURE);
677 	}
678 
679 	bd->d_dip = dip;
680 	bd->d_handle = hdl;
681 	hdl->h_bd = bd;
682 	ddi_set_driver_private(dip, bd);
683 
684 	mutex_init(&bd->d_ksmutex, NULL, MUTEX_DRIVER, NULL);
685 	mutex_init(&bd->d_ocmutex, NULL, MUTEX_DRIVER, NULL);
686 	mutex_init(&bd->d_statemutex, NULL, MUTEX_DRIVER, NULL);
687 	cv_init(&bd->d_statecv, NULL, CV_DRIVER, NULL);
688 
689 	bd->d_cache = kmem_cache_create(kcache, sizeof (bd_xfer_impl_t), 8,
690 	    bd_xfer_ctor, bd_xfer_dtor, NULL, bd, NULL, 0);
691 
692 	bd->d_ksp = kstat_create(ddi_driver_name(dip), inst, NULL, "disk",
693 	    KSTAT_TYPE_IO, 1, KSTAT_FLAG_PERSISTENT);
694 	if (bd->d_ksp != NULL) {
695 		bd->d_ksp->ks_lock = &bd->d_ksmutex;
696 		kstat_install(bd->d_ksp);
697 		bd->d_kiop = bd->d_ksp->ks_data;
698 	} else {
699 		/*
700 		 * Even if we cannot create the kstat, we create a
701 		 * scratch kstat.  The reason for this is to ensure
702 		 * that we can update the kstat all of the time,
703 		 * without adding an extra branch instruction.
704 		 */
705 		bd->d_kiop = kmem_zalloc(sizeof (kstat_io_t), KM_SLEEP);
706 	}
707 
708 	cmlb_alloc_handle(&bd->d_cmlbh);
709 
710 	bd->d_state = DKIO_NONE;
711 
712 	bzero(&drive, sizeof (drive));
713 	/*
714 	 * Default to one queue, parent driver can override.
715 	 */
716 	drive.d_qcount = 1;
717 	bd->d_ops.o_drive_info(bd->d_private, &drive);
718 	bd->d_qcount = drive.d_qcount;
719 	bd->d_removable = drive.d_removable;
720 	bd->d_hotpluggable = drive.d_hotpluggable;
721 
722 	if (drive.d_maxxfer && drive.d_maxxfer < bd->d_maxxfer)
723 		bd->d_maxxfer = drive.d_maxxfer;
724 
725 	bd_create_inquiry_props(dip, &drive);
726 
727 	bd_create_errstats(bd, inst, &drive);
728 	bd_update_state(bd);
729 
730 	bd->d_queues = kmem_alloc(sizeof (*bd->d_queues) * bd->d_qcount,
731 	    KM_SLEEP);
732 	for (i = 0; i < bd->d_qcount; i++) {
733 		bd_queue_t *bq = &bd->d_queues[i];
734 
735 		bq->q_qsize = drive.d_qsize;
736 		bq->q_qactive = 0;
737 		mutex_init(&bq->q_iomutex, NULL, MUTEX_DRIVER, NULL);
738 
739 		list_create(&bq->q_waitq, sizeof (bd_xfer_impl_t),
740 		    offsetof(struct bd_xfer_impl, i_linkage));
741 		list_create(&bq->q_runq, sizeof (bd_xfer_impl_t),
742 		    offsetof(struct bd_xfer_impl, i_linkage));
743 	}
744 
745 	rv = cmlb_attach(dip, &bd_tg_ops, DTYPE_DIRECT,
746 	    bd->d_removable, bd->d_hotpluggable,
747 	    /*LINTED: E_BAD_PTR_CAST_ALIGN*/
748 	    *(uint64_t *)drive.d_eui64 != 0 ? DDI_NT_BLOCK_BLKDEV :
749 	    drive.d_lun >= 0 ? DDI_NT_BLOCK_CHAN : DDI_NT_BLOCK,
750 	    CMLB_FAKE_LABEL_ONE_PARTITION, bd->d_cmlbh, 0);
751 	if (rv != 0) {
752 		bd_queues_free(bd);
753 		bd_destroy_errstats(bd);
754 		cmlb_free_handle(&bd->d_cmlbh);
755 
756 		if (bd->d_ksp != NULL) {
757 			kstat_delete(bd->d_ksp);
758 			bd->d_ksp = NULL;
759 		} else {
760 			kmem_free(bd->d_kiop, sizeof (kstat_io_t));
761 			bd->d_kiop = NULL;
762 		}
763 
764 		kmem_cache_destroy(bd->d_cache);
765 		cv_destroy(&bd->d_statecv);
766 		mutex_destroy(&bd->d_statemutex);
767 		mutex_destroy(&bd->d_ocmutex);
768 		mutex_destroy(&bd->d_ksmutex);
769 		ddi_soft_state_free(bd_state, inst);
770 		return (DDI_FAILURE);
771 	}
772 
773 	if (bd->d_ops.o_devid_init != NULL) {
774 		rv = bd->d_ops.o_devid_init(bd->d_private, dip, &bd->d_devid);
775 		if (rv == DDI_SUCCESS) {
776 			if (ddi_devid_register(dip, bd->d_devid) !=
777 			    DDI_SUCCESS) {
778 				cmn_err(CE_WARN,
779 				    "%s: unable to register devid", name);
780 			}
781 		}
782 	}
783 
784 	/*
785 	 * Add a zero-length attribute to tell the world we support
786 	 * kernel ioctls (for layered drivers).  Also set up properties
787 	 * used by HAL to identify removable media.
788 	 */
789 	(void) ddi_prop_create(DDI_DEV_T_NONE, dip, DDI_PROP_CANSLEEP,
790 	    DDI_KERNEL_IOCTL, NULL, 0);
791 	if (bd->d_removable) {
792 		(void) ddi_prop_create(DDI_DEV_T_NONE, dip, DDI_PROP_CANSLEEP,
793 		    "removable-media", NULL, 0);
794 	}
795 	if (bd->d_hotpluggable) {
796 		(void) ddi_prop_create(DDI_DEV_T_NONE, dip, DDI_PROP_CANSLEEP,
797 		    "hotpluggable", NULL, 0);
798 	}
799 
800 	ddi_report_dev(dip);
801 
802 	return (DDI_SUCCESS);
803 }
804 
805 static int
806 bd_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
807 {
808 	bd_t	*bd;
809 
810 	bd = ddi_get_driver_private(dip);
811 
812 	switch (cmd) {
813 	case DDI_DETACH:
814 		break;
815 	case DDI_SUSPEND:
816 		/* We don't suspend, but our parent does */
817 		return (DDI_SUCCESS);
818 	default:
819 		return (DDI_FAILURE);
820 	}
821 
822 	if (bd->d_ksp != NULL) {
823 		kstat_delete(bd->d_ksp);
824 		bd->d_ksp = NULL;
825 	} else {
826 		kmem_free(bd->d_kiop, sizeof (kstat_io_t));
827 	}
828 
829 	bd_destroy_errstats(bd);
830 	cmlb_detach(bd->d_cmlbh, 0);
831 	cmlb_free_handle(&bd->d_cmlbh);
832 	if (bd->d_devid)
833 		ddi_devid_free(bd->d_devid);
834 	kmem_cache_destroy(bd->d_cache);
835 	mutex_destroy(&bd->d_ksmutex);
836 	mutex_destroy(&bd->d_ocmutex);
837 	mutex_destroy(&bd->d_statemutex);
838 	cv_destroy(&bd->d_statecv);
839 	bd_queues_free(bd);
840 	ddi_soft_state_free(bd_state, ddi_get_instance(dip));
841 	return (DDI_SUCCESS);
842 }
843 
844 static int
845 bd_xfer_ctor(void *buf, void *arg, int kmflag)
846 {
847 	bd_xfer_impl_t	*xi;
848 	bd_t		*bd = arg;
849 	int		(*dcb)(caddr_t);
850 
851 	if (kmflag == KM_PUSHPAGE || kmflag == KM_SLEEP) {
852 		dcb = DDI_DMA_SLEEP;
853 	} else {
854 		dcb = DDI_DMA_DONTWAIT;
855 	}
856 
857 	xi = buf;
858 	bzero(xi, sizeof (*xi));
859 	xi->i_bd = bd;
860 
861 	if (bd->d_use_dma) {
862 		if (ddi_dma_alloc_handle(bd->d_dip, &bd->d_dma, dcb, NULL,
863 		    &xi->i_dmah) != DDI_SUCCESS) {
864 			return (-1);
865 		}
866 	}
867 
868 	return (0);
869 }
870 
871 static void
872 bd_xfer_dtor(void *buf, void *arg)
873 {
874 	bd_xfer_impl_t	*xi = buf;
875 
876 	_NOTE(ARGUNUSED(arg));
877 
878 	if (xi->i_dmah)
879 		ddi_dma_free_handle(&xi->i_dmah);
880 	xi->i_dmah = NULL;
881 }
882 
883 static bd_xfer_impl_t *
884 bd_xfer_alloc(bd_t *bd, struct buf *bp, int (*func)(void *, bd_xfer_t *),
885     int kmflag)
886 {
887 	bd_xfer_impl_t		*xi;
888 	int			rv = 0;
889 	int			status;
890 	unsigned		dir;
891 	int			(*cb)(caddr_t);
892 	size_t			len;
893 	uint32_t		shift;
894 
895 	if (kmflag == KM_SLEEP) {
896 		cb = DDI_DMA_SLEEP;
897 	} else {
898 		cb = DDI_DMA_DONTWAIT;
899 	}
900 
901 	xi = kmem_cache_alloc(bd->d_cache, kmflag);
902 	if (xi == NULL) {
903 		bioerror(bp, ENOMEM);
904 		return (NULL);
905 	}
906 
907 	ASSERT(bp);
908 
909 	xi->i_bp = bp;
910 	xi->i_func = func;
911 	xi->i_blkno = bp->b_lblkno >> (bd->d_blkshift - DEV_BSHIFT);
912 
913 	if (bp->b_bcount == 0) {
914 		xi->i_len = 0;
915 		xi->i_nblks = 0;
916 		xi->i_kaddr = NULL;
917 		xi->i_resid = 0;
918 		xi->i_num_win = 0;
919 		goto done;
920 	}
921 
922 	if (bp->b_flags & B_READ) {
923 		dir = DDI_DMA_READ;
924 		xi->i_func = bd->d_ops.o_read;
925 	} else {
926 		dir = DDI_DMA_WRITE;
927 		xi->i_func = bd->d_ops.o_write;
928 	}
929 
930 	shift = bd->d_blkshift;
931 	xi->i_blkshift = shift;
932 
933 	if (!bd->d_use_dma) {
934 		bp_mapin(bp);
935 		rv = 0;
936 		xi->i_offset = 0;
937 		xi->i_num_win =
938 		    (bp->b_bcount + (bd->d_maxxfer - 1)) / bd->d_maxxfer;
939 		xi->i_cur_win = 0;
940 		xi->i_len = min(bp->b_bcount, bd->d_maxxfer);
941 		xi->i_nblks = xi->i_len >> shift;
942 		xi->i_kaddr = bp->b_un.b_addr;
943 		xi->i_resid = bp->b_bcount;
944 	} else {
945 
946 		/*
947 		 * We have to use consistent DMA if the address is misaligned.
948 		 */
949 		if (((bp->b_flags & (B_PAGEIO | B_REMAPPED)) != B_PAGEIO) &&
950 		    ((uintptr_t)bp->b_un.b_addr & 0x7)) {
951 			dir |= DDI_DMA_CONSISTENT | DDI_DMA_PARTIAL;
952 		} else {
953 			dir |= DDI_DMA_STREAMING | DDI_DMA_PARTIAL;
954 		}
955 
956 		status = ddi_dma_buf_bind_handle(xi->i_dmah, bp, dir, cb,
957 		    NULL, &xi->i_dmac, &xi->i_ndmac);
958 		switch (status) {
959 		case DDI_DMA_MAPPED:
960 			xi->i_num_win = 1;
961 			xi->i_cur_win = 0;
962 			xi->i_offset = 0;
963 			xi->i_len = bp->b_bcount;
964 			xi->i_nblks = xi->i_len >> shift;
965 			xi->i_resid = bp->b_bcount;
966 			rv = 0;
967 			break;
968 		case DDI_DMA_PARTIAL_MAP:
969 			xi->i_cur_win = 0;
970 
971 			if ((ddi_dma_numwin(xi->i_dmah, &xi->i_num_win) !=
972 			    DDI_SUCCESS) ||
973 			    (ddi_dma_getwin(xi->i_dmah, 0, &xi->i_offset,
974 			    &len, &xi->i_dmac, &xi->i_ndmac) !=
975 			    DDI_SUCCESS) ||
976 			    (P2PHASE(len, (1U << shift)) != 0)) {
977 				(void) ddi_dma_unbind_handle(xi->i_dmah);
978 				rv = EFAULT;
979 				goto done;
980 			}
981 			xi->i_len = len;
982 			xi->i_nblks = xi->i_len >> shift;
983 			xi->i_resid = bp->b_bcount;
984 			rv = 0;
985 			break;
986 		case DDI_DMA_NORESOURCES:
987 			rv = EAGAIN;
988 			goto done;
989 		case DDI_DMA_TOOBIG:
990 			rv = EINVAL;
991 			goto done;
992 		case DDI_DMA_NOMAPPING:
993 		case DDI_DMA_INUSE:
994 		default:
995 			rv = EFAULT;
996 			goto done;
997 		}
998 	}
999 
1000 done:
1001 	if (rv != 0) {
1002 		kmem_cache_free(bd->d_cache, xi);
1003 		bioerror(bp, rv);
1004 		return (NULL);
1005 	}
1006 
1007 	return (xi);
1008 }
1009 
1010 static void
1011 bd_xfer_free(bd_xfer_impl_t *xi)
1012 {
1013 	if (xi->i_dmah) {
1014 		(void) ddi_dma_unbind_handle(xi->i_dmah);
1015 	}
1016 	kmem_cache_free(xi->i_bd->d_cache, xi);
1017 }
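
/*
 * For reference, a parent driver's o_read entry point consumes the
 * public bd_xfer_t fields set up above. A minimal sketch (hypothetical
 * mydrv_* names; assumes the parent uses DMA) might look like:
 *
 *	static int
 *	mydrv_read(void *arg, bd_xfer_t *xfer)
 *	{
 *		mydrv_t *sc = arg;
 *		ddi_dma_cookie_t dmac = xfer->x_dmac;
 *		uint_t i;
 *
 *		for (i = 0; ; i++) {
 *			mydrv_sgl_add(sc, dmac.dmac_laddress,
 *			    dmac.dmac_size);
 *			if (i + 1 == xfer->x_ndmac)
 *				break;
 *			ddi_dma_nextcookie(xfer->x_dmah, &dmac);
 *		}
 *		return (mydrv_hw_read(sc, xfer->x_blkno, xfer->x_nblks,
 *		    xfer->x_qnum));
 *	}
 *
 * The parent must later call bd_xfer_done() for this transfer; a
 * non-zero return here instead fails it immediately (see bd_sched()).
 */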
1018 
1019 static int
1020 bd_open(dev_t *devp, int flag, int otyp, cred_t *credp)
1021 {
1022 	dev_t		dev = *devp;
1023 	bd_t		*bd;
1024 	minor_t		part;
1025 	minor_t		inst;
1026 	uint64_t	mask;
1027 	boolean_t	ndelay;
1028 	int		rv;
1029 	diskaddr_t	nblks;
1030 	diskaddr_t	lba;
1031 
1032 	_NOTE(ARGUNUSED(credp));
1033 
1034 	part = BDPART(dev);
1035 	inst = BDINST(dev);
1036 
1037 	if (otyp >= OTYPCNT)
1038 		return (EINVAL);
1039 
1040 	ndelay = (flag & (FNDELAY | FNONBLOCK)) ? B_TRUE : B_FALSE;
1041 
1042 	/*
1043 	 * Block any DR events from changing the set of registered
1044 	 * devices while we are working.
1045 	 */
1046 	rw_enter(&bd_lock, RW_READER);
1047 	if ((bd = ddi_get_soft_state(bd_state, inst)) == NULL) {
1048 		rw_exit(&bd_lock);
1049 		return (ENXIO);
1050 	}
1051 
1052 	mutex_enter(&bd->d_ocmutex);
1053 
1054 	ASSERT(part < 64);
1055 	mask = (1ULL << part);
1056 
1057 	bd_update_state(bd);
1058 
1059 	if (cmlb_validate(bd->d_cmlbh, 0, 0) != 0) {
1060 
1061 		/* non-blocking opens are allowed to succeed */
1062 		if (!ndelay) {
1063 			rv = ENXIO;
1064 			goto done;
1065 		}
1066 	} else if (cmlb_partinfo(bd->d_cmlbh, part, &nblks, &lba,
1067 	    NULL, NULL, 0) == 0) {
1068 
1069 		/*
1070 		 * We read the partinfo, verify valid ranges.  If the
1071 		 * partition is invalid, and we aren't blocking or
1072 		 * doing a raw access, then fail. (Non-blocking and
1073 		 * raw accesses can still succeed to allow a disk with
1074 	 * bad partition data to be opened by format and fdisk.)
1075 		 */
1076 		if ((!nblks) && ((!ndelay) || (otyp != OTYP_CHR))) {
1077 			rv = ENXIO;
1078 			goto done;
1079 		}
1080 	} else if (!ndelay) {
1081 		/*
1082 		 * cmlb_partinfo failed -- invalid partition or no
1083 		 * disk label.
1084 		 */
1085 		rv = ENXIO;
1086 		goto done;
1087 	}
1088 
1089 	if ((flag & FWRITE) && bd->d_rdonly) {
1090 		rv = EROFS;
1091 		goto done;
1092 	}
1093 
1094 	if ((bd->d_open_excl) & (mask)) {
1095 		rv = EBUSY;
1096 		goto done;
1097 	}
1098 	if (flag & FEXCL) {
1099 		if (bd->d_open_lyr[part]) {
1100 			rv = EBUSY;
1101 			goto done;
1102 		}
1103 		for (int i = 0; i < OTYP_LYR; i++) {
1104 			if (bd->d_open_reg[i] & mask) {
1105 				rv = EBUSY;
1106 				goto done;
1107 			}
1108 		}
1109 	}
1110 
1111 	if (otyp == OTYP_LYR) {
1112 		bd->d_open_lyr[part]++;
1113 	} else {
1114 		bd->d_open_reg[otyp] |= mask;
1115 	}
1116 	if (flag & FEXCL) {
1117 		bd->d_open_excl |= mask;
1118 	}
1119 
1120 	rv = 0;
1121 done:
1122 	mutex_exit(&bd->d_ocmutex);
1123 	rw_exit(&bd_lock);
1124 
1125 	return (rv);
1126 }
1127 
1128 static int
1129 bd_close(dev_t dev, int flag, int otyp, cred_t *credp)
1130 {
1131 	bd_t		*bd;
1132 	minor_t		inst;
1133 	minor_t		part;
1134 	uint64_t	mask;
1135 	boolean_t	last = B_TRUE;
1136 
1137 	_NOTE(ARGUNUSED(flag));
1138 	_NOTE(ARGUNUSED(credp));
1139 
1140 	part = BDPART(dev);
1141 	inst = BDINST(dev);
1142 
1143 	ASSERT(part < 64);
1144 	mask = (1ULL << part);
1145 
1146 	rw_enter(&bd_lock, RW_READER);
1147 
1148 	if ((bd = ddi_get_soft_state(bd_state, inst)) == NULL) {
1149 		rw_exit(&bd_lock);
1150 		return (ENXIO);
1151 	}
1152 
1153 	mutex_enter(&bd->d_ocmutex);
1154 	if (bd->d_open_excl & mask) {
1155 		bd->d_open_excl &= ~mask;
1156 	}
1157 	if (otyp == OTYP_LYR) {
1158 		bd->d_open_lyr[part]--;
1159 	} else {
1160 		bd->d_open_reg[otyp] &= ~mask;
1161 	}
1162 	for (int i = 0; i < 64; i++) {
1163 		if (bd->d_open_lyr[i]) {
1164 			last = B_FALSE;
1165 		}
1166 	}
1167 	for (int i = 0; last && (i < OTYP_LYR); i++) {
1168 		if (bd->d_open_reg[i]) {
1169 			last = B_FALSE;
1170 		}
1171 	}
1172 	mutex_exit(&bd->d_ocmutex);
1173 
1174 	if (last) {
1175 		cmlb_invalidate(bd->d_cmlbh, 0);
1176 	}
1177 	rw_exit(&bd_lock);
1178 
1179 	return (0);
1180 }
1181 
1182 static int
1183 bd_dump(dev_t dev, caddr_t caddr, daddr_t blkno, int nblk)
1184 {
1185 	minor_t		inst;
1186 	minor_t		part;
1187 	diskaddr_t	pstart;
1188 	diskaddr_t	psize;
1189 	bd_t		*bd;
1190 	bd_xfer_impl_t	*xi;
1191 	buf_t		*bp;
1192 	int		rv;
1193 	uint32_t	shift;
1194 	daddr_t		d_blkno;
1195 	int		d_nblk;
1196 
1197 	rw_enter(&bd_lock, RW_READER);
1198 
1199 	part = BDPART(dev);
1200 	inst = BDINST(dev);
1201 
1202 	if ((bd = ddi_get_soft_state(bd_state, inst)) == NULL) {
1203 		rw_exit(&bd_lock);
1204 		return (ENXIO);
1205 	}
1206 	shift = bd->d_blkshift;
1207 	d_blkno = blkno >> (shift - DEV_BSHIFT);
1208 	d_nblk = nblk >> (shift - DEV_BSHIFT);
1209 	/*
1210 	 * do cmlb, but do it synchronously unless we already have the
1211 	 * partition (which we probably should already have.)
1212 	 */
1213 	if (cmlb_partinfo(bd->d_cmlbh, part, &psize, &pstart, NULL, NULL,
1214 	    (void *)1)) {
1215 		rw_exit(&bd_lock);
1216 		return (ENXIO);
1217 	}
1218 
1219 	if ((d_blkno + d_nblk) > psize) {
1220 		rw_exit(&bd_lock);
1221 		return (EINVAL);
1222 	}
1223 	bp = getrbuf(KM_NOSLEEP);
1224 	if (bp == NULL) {
1225 		rw_exit(&bd_lock);
1226 		return (ENOMEM);
1227 	}
1228 
1229 	bp->b_bcount = nblk << DEV_BSHIFT;
1230 	bp->b_resid = bp->b_bcount;
1231 	bp->b_lblkno = blkno;
1232 	bp->b_un.b_addr = caddr;
1233 
1234 	xi = bd_xfer_alloc(bd, bp, bd->d_ops.o_write, KM_NOSLEEP);
1235 	if (xi == NULL) {
1236 		rw_exit(&bd_lock);
1237 		freerbuf(bp);
1238 		return (ENOMEM);
1239 	}
1240 	xi->i_blkno = d_blkno + pstart;
1241 	xi->i_flags = BD_XFER_POLL;
1242 	bd_submit(bd, xi);
1243 	rw_exit(&bd_lock);
1244 
1245 	/*
1246 	 * Generally, we should have run this entirely synchronously
1247 	 * at this point and the biowait call should be a no-op.  If
1248 	 * it didn't happen this way, it's a bug in the underlying
1249 	 * driver not honoring BD_XFER_POLL.
1250 	 */
1251 	(void) biowait(bp);
1252 	rv = geterror(bp);
1253 	freerbuf(bp);
1254 	return (rv);
1255 }
1256 
1257 void
1258 bd_minphys(struct buf *bp)
1259 {
1260 	minor_t inst;
1261 	bd_t	*bd;
1262 	inst = BDINST(bp->b_edev);
1263 
1264 	bd = ddi_get_soft_state(bd_state, inst);
1265 
1266 	/*
1267 	 * In a non-debug kernel, bd_strategy will catch !bd as
1268 	 * well, and will fail nicely.
1269 	 */
1270 	ASSERT(bd);
1271 
1272 	if (bp->b_bcount > bd->d_maxxfer)
1273 		bp->b_bcount = bd->d_maxxfer;
1274 }
1275 
1276 static int
1277 bd_check_uio(dev_t dev, struct uio *uio)
1278 {
1279 	bd_t		*bd;
1280 	uint32_t	shift;
1281 
1282 	if ((bd = ddi_get_soft_state(bd_state, BDINST(dev))) == NULL) {
1283 		return (ENXIO);
1284 	}
1285 
1286 	shift = bd->d_blkshift;
1287 	if ((P2PHASE(uio->uio_loffset, (1U << shift)) != 0) ||
1288 	    (P2PHASE(uio->uio_iov->iov_len, (1U << shift)) != 0)) {
1289 		return (EINVAL);
1290 	}
1291 
1292 	return (0);
1293 }
1294 
1295 static int
1296 bd_read(dev_t dev, struct uio *uio, cred_t *credp)
1297 {
1298 	_NOTE(ARGUNUSED(credp));
1299 	int	ret = bd_check_uio(dev, uio);
1300 	if (ret != 0) {
1301 		return (ret);
1302 	}
1303 	return (physio(bd_strategy, NULL, dev, B_READ, bd_minphys, uio));
1304 }
1305 
1306 static int
1307 bd_write(dev_t dev, struct uio *uio, cred_t *credp)
1308 {
1309 	_NOTE(ARGUNUSED(credp));
1310 	int	ret = bd_check_uio(dev, uio);
1311 	if (ret != 0) {
1312 		return (ret);
1313 	}
1314 	return (physio(bd_strategy, NULL, dev, B_WRITE, bd_minphys, uio));
1315 }
1316 
1317 static int
1318 bd_aread(dev_t dev, struct aio_req *aio, cred_t *credp)
1319 {
1320 	_NOTE(ARGUNUSED(credp));
1321 	int	ret = bd_check_uio(dev, aio->aio_uio);
1322 	if (ret != 0) {
1323 		return (ret);
1324 	}
1325 	return (aphysio(bd_strategy, anocancel, dev, B_READ, bd_minphys, aio));
1326 }
1327 
1328 static int
1329 bd_awrite(dev_t dev, struct aio_req *aio, cred_t *credp)
1330 {
1331 	_NOTE(ARGUNUSED(credp));
1332 	int	ret = bd_check_uio(dev, aio->aio_uio);
1333 	if (ret != 0) {
1334 		return (ret);
1335 	}
1336 	return (aphysio(bd_strategy, anocancel, dev, B_WRITE, bd_minphys, aio));
1337 }
1338 
1339 static int
1340 bd_strategy(struct buf *bp)
1341 {
1342 	minor_t		inst;
1343 	minor_t		part;
1344 	bd_t		*bd;
1345 	diskaddr_t	p_lba;
1346 	diskaddr_t	p_nblks;
1347 	diskaddr_t	b_nblks;
1348 	bd_xfer_impl_t	*xi;
1349 	uint32_t	shift;
1350 	int		(*func)(void *, bd_xfer_t *);
1351 	diskaddr_t	lblkno;
1352 
1353 	part = BDPART(bp->b_edev);
1354 	inst = BDINST(bp->b_edev);
1355 
1356 	ASSERT(bp);
1357 
1358 	bp->b_resid = bp->b_bcount;
1359 
1360 	if ((bd = ddi_get_soft_state(bd_state, inst)) == NULL) {
1361 		bioerror(bp, ENXIO);
1362 		biodone(bp);
1363 		return (0);
1364 	}
1365 
1366 	if (cmlb_partinfo(bd->d_cmlbh, part, &p_nblks, &p_lba,
1367 	    NULL, NULL, 0)) {
1368 		bioerror(bp, ENXIO);
1369 		biodone(bp);
1370 		return (0);
1371 	}
1372 
1373 	shift = bd->d_blkshift;
1374 	lblkno = bp->b_lblkno >> (shift - DEV_BSHIFT);
1375 	if ((P2PHASE(bp->b_lblkno, (1U << (shift - DEV_BSHIFT))) != 0) ||
1376 	    (P2PHASE(bp->b_bcount, (1U << shift)) != 0) ||
1377 	    (lblkno > p_nblks)) {
1378 		bioerror(bp, EINVAL);
1379 		biodone(bp);
1380 		return (0);
1381 	}
1382 	b_nblks = bp->b_bcount >> shift;
1383 	if ((lblkno == p_nblks) || (bp->b_bcount == 0)) {
1384 		biodone(bp);
1385 		return (0);
1386 	}
1387 
1388 	if ((b_nblks + lblkno) > p_nblks) {
1389 		bp->b_resid = ((lblkno + b_nblks - p_nblks) << shift);
1390 		bp->b_bcount -= bp->b_resid;
1391 	} else {
1392 		bp->b_resid = 0;
1393 	}
1394 	func = (bp->b_flags & B_READ) ? bd->d_ops.o_read : bd->d_ops.o_write;
1395 
1396 	xi = bd_xfer_alloc(bd, bp, func, KM_NOSLEEP);
1397 	if (xi == NULL) {
1398 		xi = bd_xfer_alloc(bd, bp, func, KM_PUSHPAGE);
1399 	}
1400 	if (xi == NULL) {
1401 		/* bd_xfer_alloc will have done bioerror */
1402 		biodone(bp);
1403 		return (0);
1404 	}
1405 	xi->i_blkno = lblkno + p_lba;
1406 
1407 	bd_submit(bd, xi);
1408 
1409 	return (0);
1410 }
1411 
1412 static int
1413 bd_ioctl(dev_t dev, int cmd, intptr_t arg, int flag, cred_t *credp, int *rvalp)
1414 {
1415 	minor_t		inst;
1416 	uint16_t	part;
1417 	bd_t		*bd;
1418 	void		*ptr = (void *)arg;
1419 	int		rv;
1420 
1421 	part = BDPART(dev);
1422 	inst = BDINST(dev);
1423 
1424 	if ((bd = ddi_get_soft_state(bd_state, inst)) == NULL) {
1425 		return (ENXIO);
1426 	}
1427 
1428 	rv = cmlb_ioctl(bd->d_cmlbh, dev, cmd, arg, flag, credp, rvalp, 0);
1429 	if (rv != ENOTTY)
1430 		return (rv);
1431 
1432 	if (rvalp != NULL) {
1433 		/* the return value of the ioctl is 0 by default */
1434 		*rvalp = 0;
1435 	}
1436 
1437 	switch (cmd) {
1438 	case DKIOCGMEDIAINFO: {
1439 		struct dk_minfo minfo;
1440 
1441 		/* make sure our state information is current */
1442 		bd_update_state(bd);
1443 		bzero(&minfo, sizeof (minfo));
1444 		minfo.dki_media_type = DK_FIXED_DISK;
1445 		minfo.dki_lbsize = (1U << bd->d_blkshift);
1446 		minfo.dki_capacity = bd->d_numblks;
1447 		if (ddi_copyout(&minfo, ptr, sizeof (minfo), flag)) {
1448 			return (EFAULT);
1449 		}
1450 		return (0);
1451 	}
1452 	case DKIOCGMEDIAINFOEXT: {
1453 		struct dk_minfo_ext miext;
1454 
1455 		/* make sure our state information is current */
1456 		bd_update_state(bd);
1457 		bzero(&miext, sizeof (miext));
1458 		miext.dki_media_type = DK_FIXED_DISK;
1459 		miext.dki_lbsize = (1U << bd->d_blkshift);
1460 		miext.dki_pbsize = (1U << bd->d_pblkshift);
1461 		miext.dki_capacity = bd->d_numblks;
1462 		if (ddi_copyout(&miext, ptr, sizeof (miext), flag)) {
1463 			return (EFAULT);
1464 		}
1465 		return (0);
1466 	}
1467 	case DKIOCINFO: {
1468 		struct dk_cinfo cinfo;
1469 		bzero(&cinfo, sizeof (cinfo));
1470 		cinfo.dki_ctype = DKC_BLKDEV;
1471 		cinfo.dki_cnum = ddi_get_instance(ddi_get_parent(bd->d_dip));
1472 		(void) snprintf(cinfo.dki_cname, sizeof (cinfo.dki_cname),
1473 		    "%s", ddi_driver_name(ddi_get_parent(bd->d_dip)));
1474 		(void) snprintf(cinfo.dki_dname, sizeof (cinfo.dki_dname),
1475 		    "%s", ddi_driver_name(bd->d_dip));
1476 		cinfo.dki_unit = inst;
1477 		cinfo.dki_flags = DKI_FMTVOL;
1478 		cinfo.dki_partition = part;
1479 		cinfo.dki_maxtransfer = bd->d_maxxfer / DEV_BSIZE;
1480 		cinfo.dki_addr = 0;
1481 		cinfo.dki_slave = 0;
1482 		cinfo.dki_space = 0;
1483 		cinfo.dki_prio = 0;
1484 		cinfo.dki_vec = 0;
1485 		if (ddi_copyout(&cinfo, ptr, sizeof (cinfo), flag)) {
1486 			return (EFAULT);
1487 		}
1488 		return (0);
1489 	}
1490 	case DKIOCREMOVABLE: {
1491 		int i;
1492 		i = bd->d_removable ? 1 : 0;
1493 		if (ddi_copyout(&i, ptr, sizeof (i), flag)) {
1494 			return (EFAULT);
1495 		}
1496 		return (0);
1497 	}
1498 	case DKIOCHOTPLUGGABLE: {
1499 		int i;
1500 		i = bd->d_hotpluggable ? 1 : 0;
1501 		if (ddi_copyout(&i, ptr, sizeof (i), flag)) {
1502 			return (EFAULT);
1503 		}
1504 		return (0);
1505 	}
1506 	case DKIOCREADONLY: {
1507 		int i;
1508 		i = bd->d_rdonly ? 1 : 0;
1509 		if (ddi_copyout(&i, ptr, sizeof (i), flag)) {
1510 			return (EFAULT);
1511 		}
1512 		return (0);
1513 	}
1514 	case DKIOCSOLIDSTATE: {
1515 		int i;
1516 		i = bd->d_ssd ? 1 : 0;
1517 		if (ddi_copyout(&i, ptr, sizeof (i), flag)) {
1518 			return (EFAULT);
1519 		}
1520 		return (0);
1521 	}
1522 	case DKIOCSTATE: {
1523 		enum dkio_state	state;
1524 		if (ddi_copyin(ptr, &state, sizeof (state), flag)) {
1525 			return (EFAULT);
1526 		}
1527 		if ((rv = bd_check_state(bd, &state)) != 0) {
1528 			return (rv);
1529 		}
1530 		if (ddi_copyout(&state, ptr, sizeof (state), flag)) {
1531 			return (EFAULT);
1532 		}
1533 		return (0);
1534 	}
1535 	case DKIOCFLUSHWRITECACHE: {
1536 		struct dk_callback *dkc = NULL;
1537 
1538 		if (flag & FKIOCTL)
1539 			dkc = (void *)arg;
1540 
1541 		rv = bd_flush_write_cache(bd, dkc);
1542 		return (rv);
1543 	}
1544 
1545 	default:
1546 		break;
1547 
1548 	}
1549 	return (ENOTTY);
1550 }
1551 
1552 static int
1553 bd_prop_op(dev_t dev, dev_info_t *dip, ddi_prop_op_t prop_op, int mod_flags,
1554     char *name, caddr_t valuep, int *lengthp)
1555 {
1556 	bd_t	*bd;
1557 
1558 	bd = ddi_get_soft_state(bd_state, ddi_get_instance(dip));
1559 	if (bd == NULL)
1560 		return (ddi_prop_op(dev, dip, prop_op, mod_flags,
1561 		    name, valuep, lengthp));
1562 
1563 	return (cmlb_prop_op(bd->d_cmlbh, dev, dip, prop_op, mod_flags, name,
1564 	    valuep, lengthp, BDPART(dev), 0));
1565 }
1566 
1567 
1568 static int
1569 bd_tg_rdwr(dev_info_t *dip, uchar_t cmd, void *bufaddr, diskaddr_t start,
1570     size_t length, void *tg_cookie)
1571 {
1572 	bd_t		*bd;
1573 	buf_t		*bp;
1574 	bd_xfer_impl_t	*xi;
1575 	int		rv;
1576 	int		(*func)(void *, bd_xfer_t *);
1577 	int		kmflag;
1578 
1579 	/*
1580 	 * If we are running in polled mode (such as during dump(9e)
1581 	 * execution), then we cannot sleep for kernel allocations.
1582 	 */
1583 	kmflag = tg_cookie ? KM_NOSLEEP : KM_SLEEP;
1584 
1585 	bd = ddi_get_soft_state(bd_state, ddi_get_instance(dip));
1586 
1587 	if (P2PHASE(length, (1U << bd->d_blkshift)) != 0) {
1588 		/* We can only transfer whole blocks at a time! */
1589 		return (EINVAL);
1590 	}
1591 
1592 	if ((bp = getrbuf(kmflag)) == NULL) {
1593 		return (ENOMEM);
1594 	}
1595 
1596 	switch (cmd) {
1597 	case TG_READ:
1598 		bp->b_flags = B_READ;
1599 		func = bd->d_ops.o_read;
1600 		break;
1601 	case TG_WRITE:
1602 		bp->b_flags = B_WRITE;
1603 		func = bd->d_ops.o_write;
1604 		break;
1605 	default:
1606 		freerbuf(bp);
1607 		return (EINVAL);
1608 	}
1609 
1610 	bp->b_un.b_addr = bufaddr;
1611 	bp->b_bcount = length;
1612 	xi = bd_xfer_alloc(bd, bp, func, kmflag);
1613 	if (xi == NULL) {
1614 		rv = geterror(bp);
1615 		freerbuf(bp);
1616 		return (rv);
1617 	}
1618 	xi->i_flags = tg_cookie ? BD_XFER_POLL : 0;
1619 	xi->i_blkno = start;
1620 	bd_submit(bd, xi);
1621 	(void) biowait(bp);
1622 	rv = geterror(bp);
1623 	freerbuf(bp);
1624 
1625 	return (rv);
1626 }
1627 
1628 static int
1629 bd_tg_getinfo(dev_info_t *dip, int cmd, void *arg, void *tg_cookie)
1630 {
1631 	bd_t		*bd;
1632 
1633 	_NOTE(ARGUNUSED(tg_cookie));
1634 	bd = ddi_get_soft_state(bd_state, ddi_get_instance(dip));
1635 
1636 	switch (cmd) {
1637 	case TG_GETPHYGEOM:
1638 	case TG_GETVIRTGEOM:
1639 		/*
1640 		 * We don't have any "geometry" as such, let cmlb
1641 		 * fabricate something.
1642 		 */
1643 		return (ENOTTY);
1644 
1645 	case TG_GETCAPACITY:
1646 		bd_update_state(bd);
1647 		*(diskaddr_t *)arg = bd->d_numblks;
1648 		return (0);
1649 
1650 	case TG_GETBLOCKSIZE:
1651 		*(uint32_t *)arg = (1U << bd->d_blkshift);
1652 		return (0);
1653 
1654 	case TG_GETATTR:
1655 		/*
1656 		 * It turns out that cmlb really doesn't do much for
1657 		 * non-writable media, but let's make the information
1658 		 * available for it in case it does more in the
1659 		 * future.  (The value is currently used for
1660 		 * triggering special behavior for CD-ROMs.)
1661 		 */
1662 		bd_update_state(bd);
1663 		((tg_attribute_t *)arg)->media_is_writable =
1664 		    bd->d_rdonly ? B_FALSE : B_TRUE;
1665 		((tg_attribute_t *)arg)->media_is_solid_state = bd->d_ssd;
1666 		((tg_attribute_t *)arg)->media_is_rotational = B_FALSE;
1667 		return (0);
1668 
1669 	default:
1670 		return (EINVAL);
1671 	}
1672 }
1673 
1674 
1675 static void
1676 bd_sched(bd_t *bd, bd_queue_t *bq)
1677 {
1678 	bd_xfer_impl_t	*xi;
1679 	struct buf	*bp;
1680 	int		rv;
1681 
1682 	mutex_enter(&bq->q_iomutex);
1683 
1684 	while ((bq->q_qactive < bq->q_qsize) &&
1685 	    ((xi = list_remove_head(&bq->q_waitq)) != NULL)) {
1686 		mutex_enter(&bd->d_ksmutex);
1687 		kstat_waitq_to_runq(bd->d_kiop);
1688 		mutex_exit(&bd->d_ksmutex);
1689 
1690 		bq->q_qactive++;
1691 		list_insert_tail(&bq->q_runq, xi);
1692 
1693 		/*
1694 		 * Submit the job to the driver.  We drop the I/O mutex
1695 		 * so that we can deal with the case where the driver
1696 		 * completion routine calls back into us synchronously.
1697 		 */
1698 
1699 		mutex_exit(&bq->q_iomutex);
1700 
1701 		rv = xi->i_func(bd->d_private, &xi->i_public);
1702 		if (rv != 0) {
1703 			bp = xi->i_bp;
1704 			bioerror(bp, rv);
1705 			biodone(bp);
1706 
1707 			atomic_inc_32(&bd->d_kerr->bd_transerrs.value.ui32);
1708 
1709 			mutex_enter(&bq->q_iomutex);
1710 
1711 			mutex_enter(&bd->d_ksmutex);
1712 			kstat_runq_exit(bd->d_kiop);
1713 			mutex_exit(&bd->d_ksmutex);
1714 
1715 			bq->q_qactive--;
1716 			list_remove(&bq->q_runq, xi);
1717 			bd_xfer_free(xi);
1718 		} else {
1719 			mutex_enter(&bq->q_iomutex);
1720 		}
1721 	}
1722 
1723 	mutex_exit(&bq->q_iomutex);
1724 }
1725 
1726 static void
1727 bd_submit(bd_t *bd, bd_xfer_impl_t *xi)
1728 {
1729 	uint64_t	nv = atomic_inc_64_nv(&bd->d_io_counter);
1730 	unsigned	q = nv % bd->d_qcount;
1731 	bd_queue_t	*bq = &bd->d_queues[q];
1732 
1733 	xi->i_bq = bq;
1734 	xi->i_qnum = q;
1735 
1736 	mutex_enter(&bq->q_iomutex);
1737 
1738 	list_insert_tail(&bq->q_waitq, xi);
1739 
1740 	mutex_enter(&bd->d_ksmutex);
1741 	kstat_waitq_enter(bd->d_kiop);
1742 	mutex_exit(&bd->d_ksmutex);
1743 
1744 	mutex_exit(&bq->q_iomutex);
1745 
1746 	bd_sched(bd, bq);
1747 }
1748 
1749 static void
1750 bd_runq_exit(bd_xfer_impl_t *xi, int err)
1751 {
1752 	bd_t		*bd = xi->i_bd;
1753 	buf_t		*bp = xi->i_bp;
1754 	bd_queue_t	*bq = xi->i_bq;
1755 
1756 	mutex_enter(&bq->q_iomutex);
1757 	bq->q_qactive--;
1758 
1759 	mutex_enter(&bd->d_ksmutex);
1760 	kstat_runq_exit(bd->d_kiop);
1761 	mutex_exit(&bd->d_ksmutex);
1762 
1763 	list_remove(&bq->q_runq, xi);
1764 	mutex_exit(&bq->q_iomutex);
1765 
1766 	if (err == 0) {
1767 		if (bp->b_flags & B_READ) {
1768 			atomic_inc_uint(&bd->d_kiop->reads);
1769 			atomic_add_64((uint64_t *)&bd->d_kiop->nread,
1770 			    bp->b_bcount - xi->i_resid);
1771 		} else {
1772 			atomic_inc_uint(&bd->d_kiop->writes);
1773 			atomic_add_64((uint64_t *)&bd->d_kiop->nwritten,
1774 			    bp->b_bcount - xi->i_resid);
1775 		}
1776 	}
1777 	bd_sched(bd, bq);
1778 }
1779 
1780 static void
1781 bd_update_state(bd_t *bd)
1782 {
1783 	enum	dkio_state	state = DKIO_INSERTED;
1784 	boolean_t		docmlb = B_FALSE;
1785 	bd_media_t		media;
1786 
1787 	bzero(&media, sizeof (media));
1788 
1789 	mutex_enter(&bd->d_statemutex);
1790 	if (bd->d_ops.o_media_info(bd->d_private, &media) != 0) {
1791 		bd->d_numblks = 0;
1792 		state = DKIO_EJECTED;
1793 		goto done;
1794 	}
1795 
1796 	if ((media.m_blksize < 512) ||
1797 	    (!ISP2(media.m_blksize)) ||
1798 	    (P2PHASE(bd->d_maxxfer, media.m_blksize))) {
1799 		cmn_err(CE_WARN, "%s%d: Invalid media block size (%d)",
1800 		    ddi_driver_name(bd->d_dip), ddi_get_instance(bd->d_dip),
1801 		    media.m_blksize);
1802 		/*
1803 		 * We can't use the media, treat it as not present.
1804 		 */
1805 		state = DKIO_EJECTED;
1806 		bd->d_numblks = 0;
1807 		goto done;
1808 	}
1809 
1810 	if (((1U << bd->d_blkshift) != media.m_blksize) ||
1811 	    (bd->d_numblks != media.m_nblks)) {
1812 		/* Device size changed */
1813 		docmlb = B_TRUE;
1814 	}
1815 
1816 	bd->d_blkshift = ddi_ffs(media.m_blksize) - 1;
1817 	bd->d_pblkshift = bd->d_blkshift;
1818 	bd->d_numblks = media.m_nblks;
1819 	bd->d_rdonly = media.m_readonly;
1820 	bd->d_ssd = media.m_solidstate;
1821 
1822 	/*
1823 	 * Only use the supplied physical block size if it is non-zero,
1824 	 * greater than or equal to the block size, and a power of 2. If not,
1825 	 * ignore it; it's just informational and we can still use the media.
1826 	 */
1827 	if ((media.m_pblksize != 0) &&
1828 	    (media.m_pblksize >= media.m_blksize) &&
1829 	    (ISP2(media.m_pblksize)))
1830 		bd->d_pblkshift = ddi_ffs(media.m_pblksize) - 1;
1831 
1832 done:
1833 	if (state != bd->d_state) {
1834 		bd->d_state = state;
1835 		cv_broadcast(&bd->d_statecv);
1836 		docmlb = B_TRUE;
1837 	}
1838 	mutex_exit(&bd->d_statemutex);
1839 
1840 	bd->d_kerr->bd_capacity.value.ui64 = bd->d_numblks << bd->d_blkshift;
1841 
1842 	if (docmlb) {
1843 		if (state == DKIO_INSERTED) {
1844 			(void) cmlb_validate(bd->d_cmlbh, 0, 0);
1845 		} else {
1846 			cmlb_invalidate(bd->d_cmlbh, 0);
1847 		}
1848 	}
1849 }
1850 
1851 static int
1852 bd_check_state(bd_t *bd, enum dkio_state *state)
1853 {
1854 	clock_t		when;
1855 
1856 	for (;;) {
1857 
1858 		bd_update_state(bd);
1859 
1860 		mutex_enter(&bd->d_statemutex);
1861 
1862 		if (bd->d_state != *state) {
1863 			*state = bd->d_state;
1864 			mutex_exit(&bd->d_statemutex);
1865 			break;
1866 		}
1867 
1868 		when = drv_usectohz(1000000);
1869 		if (cv_reltimedwait_sig(&bd->d_statecv, &bd->d_statemutex,
1870 		    when, TR_CLOCK_TICK) == 0) {
1871 			mutex_exit(&bd->d_statemutex);
1872 			return (EINTR);
1873 		}
1874 
1875 		mutex_exit(&bd->d_statemutex);
1876 	}
1877 
1878 	return (0);
1879 }
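
/*
 * An illustrative userland consumer of DKIOCSTATE (the usual
 * media-watch loop, analogous to what volume management does): pass in
 * the last state seen, and the ioctl blocks until the state differs.
 *
 *	enum dkio_state state = DKIO_NONE;
 *
 *	for (;;) {
 *		if (ioctl(fd, DKIOCSTATE, &state) != 0)
 *			break;			(e.g. EINTR on a signal)
 *		if (state == DKIO_INSERTED)
 *			media_arrived();
 *	}
 */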
1880 
1881 static int
1882 bd_flush_write_cache_done(struct buf *bp)
1883 {
1884 	struct dk_callback *dc = (void *)bp->b_private;
1885 
1886 	(*dc->dkc_callback)(dc->dkc_cookie, geterror(bp));
1887 	kmem_free(dc, sizeof (*dc));
1888 	freerbuf(bp);
1889 	return (0);
1890 }
1891 
1892 static int
1893 bd_flush_write_cache(bd_t *bd, struct dk_callback *dkc)
1894 {
1895 	buf_t			*bp;
1896 	struct dk_callback	*dc;
1897 	bd_xfer_impl_t		*xi;
1898 	int			rv;
1899 
1900 	if (bd->d_ops.o_sync_cache == NULL) {
1901 		return (ENOTSUP);
1902 	}
1903 	if ((bp = getrbuf(KM_SLEEP)) == NULL) {
1904 		return (ENOMEM);
1905 	}
1906 	bp->b_resid = 0;
1907 	bp->b_bcount = 0;
1908 
1909 	xi = bd_xfer_alloc(bd, bp, bd->d_ops.o_sync_cache, KM_SLEEP);
1910 	if (xi == NULL) {
1911 		rv = geterror(bp);
1912 		freerbuf(bp);
1913 		return (rv);
1914 	}
1915 
1916 	/* Make an asynchronous flush, but only if there is a callback */
1917 	if (dkc != NULL && dkc->dkc_callback != NULL) {
1918 		/* Make a private copy of the callback structure */
1919 		dc = kmem_alloc(sizeof (*dc), KM_SLEEP);
1920 		*dc = *dkc;
1921 		bp->b_private = dc;
1922 		bp->b_iodone = bd_flush_write_cache_done;
1923 
1924 		bd_submit(bd, xi);
1925 		return (0);
1926 	}
1927 
1928 	/* In case there is no callback, perform a synchronous flush */
1929 	bd_submit(bd, xi);
1930 	(void) biowait(bp);
1931 	rv = geterror(bp);
1932 	freerbuf(bp);
1933 
1934 	return (rv);
1935 }
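
/*
 * The parent-side o_sync_cache entry point that pairs with this is
 * typically trivial: queue the device's flush command and report
 * completion through bd_xfer_done(), which in turn fires the caller's
 * dk_callback if one was supplied. An illustrative sketch
 * (hypothetical mydrv_* names):
 *
 *	static int
 *	mydrv_sync_cache(void *arg, bd_xfer_t *xfer)
 *	{
 *		mydrv_t *sc = arg;
 *
 *		return (mydrv_hw_flush(sc, xfer));
 *	}
 */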
1936 
1937 /*
1938  * Nexus support.
1939  */
1940 int
1941 bd_bus_ctl(dev_info_t *dip, dev_info_t *rdip, ddi_ctl_enum_t ctlop,
1942     void *arg, void *result)
1943 {
1944 	bd_handle_t	hdl;
1945 
1946 	switch (ctlop) {
1947 	case DDI_CTLOPS_REPORTDEV:
1948 		cmn_err(CE_CONT, "?Block device: %s@%s, %s%d\n",
1949 		    ddi_node_name(rdip), ddi_get_name_addr(rdip),
1950 		    ddi_driver_name(rdip), ddi_get_instance(rdip));
1951 		return (DDI_SUCCESS);
1952 
1953 	case DDI_CTLOPS_INITCHILD:
1954 		hdl = ddi_get_parent_data((dev_info_t *)arg);
1955 		if (hdl == NULL) {
1956 			return (DDI_NOT_WELL_FORMED);
1957 		}
1958 		ddi_set_name_addr((dev_info_t *)arg, hdl->h_addr);
1959 		return (DDI_SUCCESS);
1960 
1961 	case DDI_CTLOPS_UNINITCHILD:
1962 		ddi_set_name_addr((dev_info_t *)arg, NULL);
1963 		ndi_prop_remove_all((dev_info_t *)arg);
1964 		return (DDI_SUCCESS);
1965 
1966 	default:
1967 		return (ddi_ctlops(dip, rdip, ctlop, arg, result));
1968 	}
1969 }
1970 
1971 /*
1972  * Functions for device drivers.
1973  */
1974 bd_handle_t
1975 bd_alloc_handle(void *private, bd_ops_t *ops, ddi_dma_attr_t *dma, int kmflag)
1976 {
1977 	bd_handle_t	hdl;
1978 
1979 	/*
1980 	 * There is full compatability between the version 0 API and the
1981 	 * current version.
1982 	 */
1983 	switch (ops->o_version) {
1984 	case BD_OPS_VERSION_0:
1985 	case BD_OPS_CURRENT_VERSION:
1986 		break;
1987 
1988 	default:
1989 		return (NULL);
1990 	}
1991 
1992 	hdl = kmem_zalloc(sizeof (*hdl), kmflag);
1993 	if (hdl != NULL) {
1994 		hdl->h_ops = *ops;
1995 		hdl->h_dma = dma;
1996 		hdl->h_private = private;
1997 	}
1998 
1999 	return (hdl);
2000 }
2001 
2002 void
2003 bd_free_handle(bd_handle_t hdl)
2004 {
2005 	kmem_free(hdl, sizeof (*hdl));
2006 }
2007 
2008 int
2009 bd_attach_handle(dev_info_t *dip, bd_handle_t hdl)
2010 {
2011 	dev_info_t	*child;
2012 	bd_drive_t	drive = { 0 };
2013 
2014 	/*
2015 	 * It's not an error if bd_attach_handle() is called on a handle that
2016 	 * is already attached. We just ignore the request to attach and return.
2017 	 * This way drivers using blkdev don't have to keep track of blkdev
2018 	 * state; they can just call this function to make sure it is attached.
2019 	 */
2020 	if (hdl->h_child != NULL) {
2021 		return (DDI_SUCCESS);
2022 	}
2023 
2024 	/* if the driver doesn't override this, assume there is no LUN */
2025 	drive.d_lun = -1;
2026 	hdl->h_ops.o_drive_info(hdl->h_private, &drive);
2027 
2028 	hdl->h_parent = dip;
2029 	hdl->h_name = "blkdev";
2030 
2031 	/*LINTED: E_BAD_PTR_CAST_ALIGN*/
2032 	if (*(uint64_t *)drive.d_eui64 != 0) {
2033 		if (drive.d_lun >= 0) {
2034 			(void) snprintf(hdl->h_addr, sizeof (hdl->h_addr),
2035 			    "w%02X%02X%02X%02X%02X%02X%02X%02X,%X",
2036 			    drive.d_eui64[0], drive.d_eui64[1],
2037 			    drive.d_eui64[2], drive.d_eui64[3],
2038 			    drive.d_eui64[4], drive.d_eui64[5],
2039 			    drive.d_eui64[6], drive.d_eui64[7], drive.d_lun);
2040 		} else {
2041 			(void) snprintf(hdl->h_addr, sizeof (hdl->h_addr),
2042 			    "w%02X%02X%02X%02X%02X%02X%02X%02X",
2043 			    drive.d_eui64[0], drive.d_eui64[1],
2044 			    drive.d_eui64[2], drive.d_eui64[3],
2045 			    drive.d_eui64[4], drive.d_eui64[5],
2046 			    drive.d_eui64[6], drive.d_eui64[7]);
2047 		}
2048 	} else {
2049 		if (drive.d_lun >= 0) {
2050 			(void) snprintf(hdl->h_addr, sizeof (hdl->h_addr),
2051 			    "%X,%X", drive.d_target, drive.d_lun);
2052 		} else {
2053 			(void) snprintf(hdl->h_addr, sizeof (hdl->h_addr),
2054 			    "%X", drive.d_target);
2055 		}
2056 	}
2057 
2058 	if (ndi_devi_alloc(dip, hdl->h_name, (pnode_t)DEVI_SID_NODEID,
2059 	    &child) != NDI_SUCCESS) {
2060 		cmn_err(CE_WARN, "%s%d: unable to allocate node %s@%s",
2061 		    ddi_driver_name(dip), ddi_get_instance(dip),
2062 		    "blkdev", hdl->h_addr);
2063 		return (DDI_FAILURE);
2064 	}
2065 
2066 	ddi_set_parent_data(child, hdl);
2067 	hdl->h_child = child;
2068 
2069 	if (ndi_devi_online(child, 0) != NDI_SUCCESS) {
2070 		cmn_err(CE_WARN, "%s%d: failed bringing node %s@%s online",
2071 		    ddi_driver_name(dip), ddi_get_instance(dip),
2072 		    hdl->h_name, hdl->h_addr);
2073 		(void) ndi_devi_free(child);
2074 		hdl->h_child = NULL;
2075 		return (DDI_FAILURE);
2076 	}
2077 
2078 	return (DDI_SUCCESS);
2079 }
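
/*
 * Example (a hedged sketch; the xx_* names are hypothetical): a parent
 * driver typically allocates and attaches a handle from its attach(9E)
 * entry point:
 *
 *	hdl = bd_alloc_handle(xxp, &xx_bd_ops, &xx_dma_attr, KM_SLEEP);
 *	if (hdl == NULL)
 *		return (DDI_FAILURE);
 *	if (bd_attach_handle(dip, hdl) != DDI_SUCCESS) {
 *		bd_free_handle(hdl);
 *		return (DDI_FAILURE);
 *	}
 */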
2080 
2081 int
2082 bd_detach_handle(bd_handle_t hdl)
2083 {
2084 	int	circ;
2085 	int	rv;
2086 	char	*devnm;
2087 
	/*
	 * It's not an error if bd_detach_handle() is called on a handle
	 * that is already detached. We simply ignore the request and
	 * return. This way drivers using blkdev don't have to keep track
	 * of blkdev state; they can just call this function to make sure
	 * the handle is detached.
	 */
2094 	if (hdl->h_child == NULL) {
2095 		return (DDI_SUCCESS);
2096 	}
2097 	ndi_devi_enter(hdl->h_parent, &circ);
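	/*
	 * A child that never reached DS_INITIALIZED has no /devices or
	 * /dev entries, so it can simply be removed. Otherwise its devfs
	 * names must be cleaned out before the node is unconfigured and
	 * removed.
	 */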
2098 	if (i_ddi_node_state(hdl->h_child) < DS_INITIALIZED) {
2099 		rv = ddi_remove_child(hdl->h_child, 0);
2100 	} else {
2101 		devnm = kmem_alloc(MAXNAMELEN + 1, KM_SLEEP);
2102 		(void) ddi_deviname(hdl->h_child, devnm);
2103 		(void) devfs_clean(hdl->h_parent, devnm + 1, DV_CLEAN_FORCE);
2104 		rv = ndi_devi_unconfig_one(hdl->h_parent, devnm + 1, NULL,
2105 		    NDI_DEVI_REMOVE | NDI_UNCONFIG);
2106 		kmem_free(devnm, MAXNAMELEN + 1);
2107 	}
2108 	if (rv == 0) {
2109 		hdl->h_child = NULL;
2110 	}
2111 
2112 	ndi_devi_exit(hdl->h_parent, circ);
2113 	return (rv == NDI_SUCCESS ? DDI_SUCCESS : DDI_FAILURE);
2114 }
2115 
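/*
 * bd_xfer_done() is called by the parent driver when one window of a
 * transfer completes. If any of the request remains, the next DMA or
 * memory window is set up and resubmitted to the driver; otherwise the
 * buf is completed with biodone().
 */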
2116 void
2117 bd_xfer_done(bd_xfer_t *xfer, int err)
2118 {
2119 	bd_xfer_impl_t	*xi = (void *)xfer;
2120 	buf_t		*bp = xi->i_bp;
2121 	int		rv = DDI_SUCCESS;
2122 	bd_t		*bd = xi->i_bd;
2123 	size_t		len;
2124 
2125 	if (err != 0) {
2126 		bd_runq_exit(xi, err);
2127 		atomic_inc_32(&bd->d_kerr->bd_harderrs.value.ui32);
2128 
2129 		bp->b_resid += xi->i_resid;
2130 		bd_xfer_free(xi);
2131 		bioerror(bp, err);
2132 		biodone(bp);
2133 		return;
2134 	}
2135 
2136 	xi->i_cur_win++;
2137 	xi->i_resid -= xi->i_len;
2138 
2139 	if (xi->i_resid == 0) {
		/* Job completed successfully! */
2141 		bd_runq_exit(xi, 0);
2142 
2143 		bd_xfer_free(xi);
2144 		biodone(bp);
2145 		return;
2146 	}
2147 
2148 	xi->i_blkno += xi->i_nblks;
2149 
2150 	if (bd->d_use_dma) {
2151 		/* More transfer still pending... advance to next DMA window. */
2152 		rv = ddi_dma_getwin(xi->i_dmah, xi->i_cur_win,
2153 		    &xi->i_offset, &len, &xi->i_dmac, &xi->i_ndmac);
2154 	} else {
2155 		/* Advance memory window. */
2156 		xi->i_kaddr += xi->i_len;
2157 		xi->i_offset += xi->i_len;
2158 		len = min(bp->b_bcount - xi->i_offset, bd->d_maxxfer);
2159 	}
2161 
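	/*
	 * Fail the transfer if the window could not be advanced, or if
	 * the new window is not a whole multiple of the block size.
	 */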
2162 	if ((rv != DDI_SUCCESS) ||
2163 	    (P2PHASE(len, (1U << xi->i_blkshift)) != 0)) {
2164 		bd_runq_exit(xi, EFAULT);
2165 
2166 		bp->b_resid += xi->i_resid;
2167 		bd_xfer_free(xi);
2168 		bioerror(bp, EFAULT);
2169 		biodone(bp);
2170 		return;
2171 	}
2172 	xi->i_len = len;
2173 	xi->i_nblks = len >> xi->i_blkshift;
2174 
2175 	/* Submit next window to hardware. */
2176 	rv = xi->i_func(bd->d_private, &xi->i_public);
2177 	if (rv != 0) {
2178 		bd_runq_exit(xi, rv);
2179 
2180 		atomic_inc_32(&bd->d_kerr->bd_transerrs.value.ui32);
2181 
2182 		bp->b_resid += xi->i_resid;
2183 		bd_xfer_free(xi);
2184 		bioerror(bp, rv);
2185 		biodone(bp);
2186 	}
2187 }
2188 
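/*
 * bd_error() lets a parent driver classify why a request failed; it
 * only bumps the matching error kstat. Completion of the transfer is
 * still reported separately through bd_xfer_done().
 */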
2189 void
2190 bd_error(bd_xfer_t *xfer, int error)
2191 {
2192 	bd_xfer_impl_t	*xi = (void *)xfer;
2193 	bd_t		*bd = xi->i_bd;
2194 
2195 	switch (error) {
2196 	case BD_ERR_MEDIA:
2197 		atomic_inc_32(&bd->d_kerr->bd_rq_media_err.value.ui32);
2198 		break;
2199 	case BD_ERR_NTRDY:
2200 		atomic_inc_32(&bd->d_kerr->bd_rq_ntrdy_err.value.ui32);
2201 		break;
2202 	case BD_ERR_NODEV:
2203 		atomic_inc_32(&bd->d_kerr->bd_rq_nodev_err.value.ui32);
2204 		break;
2205 	case BD_ERR_RECOV:
2206 		atomic_inc_32(&bd->d_kerr->bd_rq_recov_err.value.ui32);
2207 		break;
2208 	case BD_ERR_ILLRQ:
2209 		atomic_inc_32(&bd->d_kerr->bd_rq_illrq_err.value.ui32);
2210 		break;
2211 	case BD_ERR_PFA:
2212 		atomic_inc_32(&bd->d_kerr->bd_rq_pfa_err.value.ui32);
2213 		break;
2214 	default:
2215 		cmn_err(CE_PANIC, "bd_error: unknown error type %d", error);
2216 		break;
2217 	}
2218 }
2219 
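/*
 * Parent drivers call bd_state_change() when the device or media state
 * may have changed, so that blkdev can refresh its cached state via
 * bd_update_state().
 */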
2220 void
2221 bd_state_change(bd_handle_t hdl)
2222 {
2223 	bd_t		*bd;
2224 
2225 	if ((bd = hdl->h_bd) != NULL) {
2226 		bd_update_state(bd);
2227 	}
2228 }
2229 
2230 void
2231 bd_mod_init(struct dev_ops *devops)
2232 {
2233 	static struct bus_ops bd_bus_ops = {
2234 		BUSO_REV,		/* busops_rev */
2235 		nullbusmap,		/* bus_map */
2236 		NULL,			/* bus_get_intrspec (OBSOLETE) */
2237 		NULL,			/* bus_add_intrspec (OBSOLETE) */
2238 		NULL,			/* bus_remove_intrspec (OBSOLETE) */
2239 		i_ddi_map_fault,	/* bus_map_fault */
2240 		NULL,			/* bus_dma_map (OBSOLETE) */
2241 		ddi_dma_allochdl,	/* bus_dma_allochdl */
2242 		ddi_dma_freehdl,	/* bus_dma_freehdl */
2243 		ddi_dma_bindhdl,	/* bus_dma_bindhdl */
2244 		ddi_dma_unbindhdl,	/* bus_dma_unbindhdl */
2245 		ddi_dma_flush,		/* bus_dma_flush */
2246 		ddi_dma_win,		/* bus_dma_win */
2247 		ddi_dma_mctl,		/* bus_dma_ctl */
2248 		bd_bus_ctl,		/* bus_ctl */
2249 		ddi_bus_prop_op,	/* bus_prop_op */
2250 		NULL,			/* bus_get_eventcookie */
2251 		NULL,			/* bus_add_eventcall */
2252 		NULL,			/* bus_remove_eventcall */
2253 		NULL,			/* bus_post_event */
2254 		NULL,			/* bus_intr_ctl (OBSOLETE) */
2255 		NULL,			/* bus_config */
2256 		NULL,			/* bus_unconfig */
2257 		NULL,			/* bus_fm_init */
2258 		NULL,			/* bus_fm_fini */
2259 		NULL,			/* bus_fm_access_enter */
2260 		NULL,			/* bus_fm_access_exit */
2261 		NULL,			/* bus_power */
2262 		NULL,			/* bus_intr_op */
2263 	};
2264 
2265 	devops->devo_bus_ops = &bd_bus_ops;
2266 
2267 	/*
2268 	 * NB: The device driver is free to supply its own
2269 	 * character entry device support.
2270 	 */
2271 }
2272 
2273 void
2274 bd_mod_fini(struct dev_ops *devops)
2275 {
2276 	devops->devo_bus_ops = NULL;
2277 }
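
/*
 * Example (a hedged sketch; xx_dev_ops and modlinkage stand for the
 * parent driver's own structures): bd_mod_init() is typically called
 * from the parent's _init(9E) before mod_install(), with bd_mod_fini()
 * undoing it on failure and again from _fini(9E):
 *
 *	int
 *	_init(void)
 *	{
 *		int	rv;
 *
 *		bd_mod_init(&xx_dev_ops);
 *		if ((rv = mod_install(&modlinkage)) != 0)
 *			bd_mod_fini(&xx_dev_ops);
 *		return (rv);
 *	}
 */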
2278