xref: /illumos-gate/usr/src/uts/common/io/blkdev/blkdev.c (revision 915894ef19890baaed00080f85f6b69e225cda98)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved.
23  * Copyright 2012 Garrett D'Amore <garrett@damore.org>.  All rights reserved.
24  * Copyright 2012 Alexey Zaytsev <alexey.zaytsev@gmail.com> All rights reserved.
25  * Copyright 2016 Nexenta Systems, Inc.  All rights reserved.
26  * Copyright 2017 The MathWorks, Inc.  All rights reserved.
27  * Copyright 2019 Western Digital Corporation.
28  * Copyright 2020 Joyent, Inc.
29  */
30 
31 #include <sys/types.h>
32 #include <sys/ksynch.h>
33 #include <sys/kmem.h>
34 #include <sys/file.h>
35 #include <sys/errno.h>
36 #include <sys/open.h>
37 #include <sys/buf.h>
38 #include <sys/uio.h>
39 #include <sys/aio_req.h>
40 #include <sys/cred.h>
41 #include <sys/modctl.h>
42 #include <sys/cmlb.h>
43 #include <sys/conf.h>
44 #include <sys/devops.h>
45 #include <sys/list.h>
46 #include <sys/sysmacros.h>
47 #include <sys/dkio.h>
48 #include <sys/dkioc_free_util.h>
49 #include <sys/vtoc.h>
50 #include <sys/scsi/scsi.h>	/* for DTYPE_DIRECT */
51 #include <sys/kstat.h>
52 #include <sys/fs/dv_node.h>
53 #include <sys/ddi.h>
54 #include <sys/sunddi.h>
55 #include <sys/note.h>
56 #include <sys/blkdev.h>
57 #include <sys/scsi/impl/inquiry.h>
58 
59 /*
60  * blkdev is a driver which provides a lot of the common functionality
61  * a block device driver may need and helps by removing code which
62  * is frequently duplicated in block device drivers.
63  *
64  * Within this driver all the struct cb_ops functions required for a
65  * block device driver are written with appropriate call back functions
66  * to be provided by the parent driver.
67  *
68  * To use blkdev, a driver needs to:
69  *	1. Create a bd_ops_t structure which has the call back operations
70  *	   blkdev will use.
71  *	2. Create a handle by calling bd_alloc_handle(). One of the
72  *	   arguments to this function is the bd_ops_t.
73  *	3. Call bd_attach_handle(). This will instantiate a blkdev device
74  *	   as a child device node of the calling driver.
75  *
76  * A parent driver is not restricted to just allocating and attaching a
77  * single instance, it may attach as many as it wishes. For each handle
78  * attached, appropriate entries in /dev/[r]dsk are created.
79  *
 * The bd_ops_t routines that a parent of blkdev needs to provide are:
81  *
82  * o_drive_info: Provide information to blkdev such as how many I/O queues
83  *		 to create and the size of those queues. Also some device
84  *		 specifics such as EUI, vendor, product, model, serial
85  *		 number ....
86  *
 * o_media_info: Provide information about the media, e.g. size and block size.
88  *
89  * o_devid_init: Creates and initializes the device id. Typically calls
90  *		 ddi_devid_init().
91  *
92  * o_sync_cache: Issues a device appropriate command to flush any write
93  *		 caches.
94  *
95  * o_read:	 Read data as described by bd_xfer_t argument.
96  *
97  * o_write:	 Write data as described by bd_xfer_t argument.
98  *
99  * o_free_space: Free the space described by bd_xfer_t argument (optional).
100  *
101  * Queues
102  * ------
103  * Part of the drive_info data is a queue count. blkdev will create
104  * "queue count" number of waitq/runq pairs. Each waitq/runq pair
105  * operates independently. As an I/O is scheduled up to the parent
106  * driver via o_read or o_write its queue number is given. If the
107  * parent driver supports multiple hardware queues it can then select
108  * where to submit the I/O request.
109  *
110  * Currently blkdev uses a simplistic round-robin queue selection method.
111  * It has the advantage that it is lockless. In the future it will be
112  * worthwhile reviewing this strategy for something which prioritizes queues
113  * depending on how busy they are.
114  *
 * Each waitq/runq pair is protected by its mutex (q_iomutex). Incoming
 * I/O requests are initially added to the waitq. They are taken off the
 * waitq, added to the runq and submitted, provided the number of requests
 * on the runq is less than the qsize specified in the drive_info. As an
 * I/O request completes, the parent driver is required to call
 * bd_xfer_done(), which will remove the I/O request from the runq and pass
 * I/O completion status up the stack.
122  *
123  * Locks
124  * -----
 * There are 4 instance-global locks: d_ocmutex, d_ksmutex, d_errmutex and
 * d_statemutex. In addition, each waitq/runq pair has its own q_iomutex.
127  *
128  * Lock Hierarchy
129  * --------------
130  * The only two locks which may be held simultaneously are q_iomutex and
131  * d_ksmutex. In all cases q_iomutex must be acquired before d_ksmutex.
132  */
133 
/*
 * Minor number layout: every blkdev instance owns BD_MAXPART consecutive
 * minor numbers.  BDINST() yields the instance (the soft state index) and
 * BDPART() the partition index encoded in a dev_t's minor number.
 */
#define	BD_MAXPART	64
#define	BDINST(dev)	(getminor(dev) / BD_MAXPART)
#define	BDPART(dev)	(getminor(dev) % BD_MAXPART)

/* Short names for the driver-private structures defined below. */
typedef struct bd bd_t;
typedef struct bd_xfer_impl bd_xfer_impl_t;
typedef struct bd_queue bd_queue_t;
141 
142 struct bd {
143 	void		*d_private;
144 	dev_info_t	*d_dip;
145 	kmutex_t	d_ocmutex;
146 	kmutex_t	d_ksmutex;
147 	kmutex_t	d_errmutex;
148 	kmutex_t	d_statemutex;
149 	kcondvar_t	d_statecv;
150 	enum dkio_state	d_state;
151 	cmlb_handle_t	d_cmlbh;
152 	unsigned	d_open_lyr[BD_MAXPART];	/* open count */
153 	uint64_t	d_open_excl;	/* bit mask indexed by partition */
154 	uint64_t	d_open_reg[OTYPCNT];		/* bit mask */
155 	uint64_t	d_io_counter;
156 
157 	uint32_t	d_qcount;
158 	uint32_t	d_qactive;
159 	uint32_t	d_maxxfer;
160 	uint32_t	d_blkshift;
161 	uint32_t	d_pblkshift;
162 	uint64_t	d_numblks;
163 	ddi_devid_t	d_devid;
164 
165 	uint64_t	d_max_free_seg;
166 	uint64_t	d_max_free_blks;
167 	uint64_t	d_max_free_seg_blks;
168 	uint64_t	d_free_align;
169 
170 	kmem_cache_t	*d_cache;
171 	bd_queue_t	*d_queues;
172 	kstat_t		*d_ksp;
173 	kstat_io_t	*d_kiop;
174 	kstat_t		*d_errstats;
175 	struct bd_errstats *d_kerr;
176 
177 	boolean_t	d_rdonly;
178 	boolean_t	d_ssd;
179 	boolean_t	d_removable;
180 	boolean_t	d_hotpluggable;
181 	boolean_t	d_use_dma;
182 
183 	ddi_dma_attr_t	d_dma;
184 	bd_ops_t	d_ops;
185 	bd_handle_t	d_handle;
186 };
187 
188 struct bd_handle {
189 	bd_ops_t	h_ops;
190 	ddi_dma_attr_t	*h_dma;
191 	dev_info_t	*h_parent;
192 	dev_info_t	*h_child;
193 	void		*h_private;
194 	bd_t		*h_bd;
195 	char		*h_name;
196 	char		h_addr[30];	/* enough for w%0.16x,%X */
197 };
198 
199 struct bd_xfer_impl {
200 	bd_xfer_t	i_public;
201 	list_node_t	i_linkage;
202 	bd_t		*i_bd;
203 	buf_t		*i_bp;
204 	bd_queue_t	*i_bq;
205 	uint_t		i_num_win;
206 	uint_t		i_cur_win;
207 	off_t		i_offset;
208 	int		(*i_func)(void *, bd_xfer_t *);
209 	uint32_t	i_blkshift;
210 	size_t		i_len;
211 	size_t		i_resid;
212 };
213 
214 struct bd_queue {
215 	kmutex_t	q_iomutex;
216 	uint32_t	q_qsize;
217 	uint32_t	q_qactive;
218 	list_t		q_runq;
219 	list_t		q_waitq;
220 };
221 
/*
 * Shorthand for the members of the public bd_xfer_t that is embedded in
 * a bd_xfer_impl_t as i_public.
 */
#define	i_dmah		i_public.x_dmah
#define	i_dmac		i_public.x_dmac
#define	i_ndmac		i_public.x_ndmac
#define	i_kaddr		i_public.x_kaddr
#define	i_nblks		i_public.x_nblks
#define	i_blkno		i_public.x_blkno
#define	i_flags		i_public.x_flags
#define	i_qnum		i_public.x_qnum
#define	i_dfl		i_public.x_dfl

/* B_TRUE when the parent driver supplied an o_free_space callback. */
#define	CAN_FREESPACE(bd) \
	(((bd)->d_ops.o_free_space == NULL) ? B_FALSE : B_TRUE)
234 
235 /*
236  * Private prototypes.
237  */
238 
239 static void bd_prop_update_inqstring(dev_info_t *, char *, char *, size_t);
240 static void bd_create_inquiry_props(dev_info_t *, bd_drive_t *);
241 static void bd_create_errstats(bd_t *, int, bd_drive_t *);
242 static void bd_destroy_errstats(bd_t *);
243 static void bd_errstats_setstr(kstat_named_t *, char *, size_t, char *);
244 static void bd_init_errstats(bd_t *, bd_drive_t *);
245 static void bd_fini_errstats(bd_t *);
246 
247 static int bd_getinfo(dev_info_t *, ddi_info_cmd_t, void *, void **);
248 static int bd_attach(dev_info_t *, ddi_attach_cmd_t);
249 static int bd_detach(dev_info_t *, ddi_detach_cmd_t);
250 
251 static int bd_open(dev_t *, int, int, cred_t *);
252 static int bd_close(dev_t, int, int, cred_t *);
253 static int bd_strategy(struct buf *);
254 static int bd_ioctl(dev_t, int, intptr_t, int, cred_t *, int *);
255 static int bd_dump(dev_t, caddr_t, daddr_t, int);
256 static int bd_read(dev_t, struct uio *, cred_t *);
257 static int bd_write(dev_t, struct uio *, cred_t *);
258 static int bd_aread(dev_t, struct aio_req *, cred_t *);
259 static int bd_awrite(dev_t, struct aio_req *, cred_t *);
260 static int bd_prop_op(dev_t, dev_info_t *, ddi_prop_op_t, int, char *,
261     caddr_t, int *);
262 
263 static int bd_tg_rdwr(dev_info_t *, uchar_t, void *, diskaddr_t, size_t,
264     void *);
265 static int bd_tg_getinfo(dev_info_t *, int, void *, void *);
266 static int bd_xfer_ctor(void *, void *, int);
267 static void bd_xfer_dtor(void *, void *);
268 static void bd_sched(bd_t *, bd_queue_t *);
269 static void bd_submit(bd_t *, bd_xfer_impl_t *);
270 static void bd_runq_exit(bd_xfer_impl_t *, int);
271 static void bd_update_state(bd_t *);
272 static int bd_check_state(bd_t *, enum dkio_state *);
273 static int bd_flush_write_cache(bd_t *, struct dk_callback *);
274 static int bd_check_uio(dev_t, struct uio *);
275 static int bd_free_space(dev_t, bd_t *, dkioc_free_list_t *);
276 
277 struct cmlb_tg_ops bd_tg_ops = {
278 	TG_DK_OPS_VERSION_1,
279 	bd_tg_rdwr,
280 	bd_tg_getinfo,
281 };
282 
283 static struct cb_ops bd_cb_ops = {
284 	bd_open,		/* open */
285 	bd_close,		/* close */
286 	bd_strategy,		/* strategy */
287 	nodev,			/* print */
288 	bd_dump,		/* dump */
289 	bd_read,		/* read */
290 	bd_write,		/* write */
291 	bd_ioctl,		/* ioctl */
292 	nodev,			/* devmap */
293 	nodev,			/* mmap */
294 	nodev,			/* segmap */
295 	nochpoll,		/* poll */
296 	bd_prop_op,		/* cb_prop_op */
297 	0,			/* streamtab  */
298 	D_64BIT | D_MP,		/* Driver comaptibility flag */
299 	CB_REV,			/* cb_rev */
300 	bd_aread,		/* async read */
301 	bd_awrite		/* async write */
302 };
303 
304 struct dev_ops bd_dev_ops = {
305 	DEVO_REV,		/* devo_rev, */
306 	0,			/* refcnt  */
307 	bd_getinfo,		/* getinfo */
308 	nulldev,		/* identify */
309 	nulldev,		/* probe */
310 	bd_attach,		/* attach */
311 	bd_detach,		/* detach */
312 	nodev,			/* reset */
313 	&bd_cb_ops,		/* driver operations */
314 	NULL,			/* bus operations */
315 	NULL,			/* power */
316 	ddi_quiesce_not_needed,	/* quiesce */
317 };
318 
319 static struct modldrv modldrv = {
320 	&mod_driverops,
321 	"Generic Block Device",
322 	&bd_dev_ops,
323 };
324 
325 static struct modlinkage modlinkage = {
326 	MODREV_1, { &modldrv, NULL }
327 };
328 
329 static void *bd_state;
330 static krwlock_t bd_lock;
331 
332 int
333 _init(void)
334 {
335 	int	rv;
336 
337 	rv = ddi_soft_state_init(&bd_state, sizeof (struct bd), 2);
338 	if (rv != DDI_SUCCESS) {
339 		return (rv);
340 	}
341 	rw_init(&bd_lock, NULL, RW_DRIVER, NULL);
342 	rv = mod_install(&modlinkage);
343 	if (rv != DDI_SUCCESS) {
344 		rw_destroy(&bd_lock);
345 		ddi_soft_state_fini(&bd_state);
346 	}
347 	return (rv);
348 }
349 
350 int
351 _fini(void)
352 {
353 	int	rv;
354 
355 	rv = mod_remove(&modlinkage);
356 	if (rv == DDI_SUCCESS) {
357 		rw_destroy(&bd_lock);
358 		ddi_soft_state_fini(&bd_state);
359 	}
360 	return (rv);
361 }
362 
363 int
364 _info(struct modinfo *modinfop)
365 {
366 	return (mod_info(&modlinkage, modinfop));
367 }
368 
369 static int
370 bd_getinfo(dev_info_t *dip, ddi_info_cmd_t cmd, void *arg, void **resultp)
371 {
372 	bd_t	*bd;
373 	minor_t	inst;
374 
375 	_NOTE(ARGUNUSED(dip));
376 
377 	inst = BDINST((dev_t)arg);
378 
379 	switch (cmd) {
380 	case DDI_INFO_DEVT2DEVINFO:
381 		bd = ddi_get_soft_state(bd_state, inst);
382 		if (bd == NULL) {
383 			return (DDI_FAILURE);
384 		}
385 		*resultp = (void *)bd->d_dip;
386 		break;
387 
388 	case DDI_INFO_DEVT2INSTANCE:
389 		*resultp = (void *)(intptr_t)inst;
390 		break;
391 
392 	default:
393 		return (DDI_FAILURE);
394 	}
395 	return (DDI_SUCCESS);
396 }
397 
398 static void
399 bd_prop_update_inqstring(dev_info_t *dip, char *name, char *data, size_t len)
400 {
401 	int	ilen;
402 	char	*data_string;
403 
404 	ilen = scsi_ascii_inquiry_len(data, len);
405 	ASSERT3U(ilen, <=, len);
406 	if (ilen <= 0)
407 		return;
408 	/* ensure null termination */
409 	data_string = kmem_zalloc(ilen + 1, KM_SLEEP);
410 	bcopy(data, data_string, ilen);
411 	(void) ndi_prop_update_string(DDI_DEV_T_NONE, dip, name, data_string);
412 	kmem_free(data_string, ilen + 1);
413 }
414 
415 static void
416 bd_create_inquiry_props(dev_info_t *dip, bd_drive_t *drive)
417 {
418 	if (drive->d_vendor_len > 0)
419 		bd_prop_update_inqstring(dip, INQUIRY_VENDOR_ID,
420 		    drive->d_vendor, drive->d_vendor_len);
421 
422 	if (drive->d_product_len > 0)
423 		bd_prop_update_inqstring(dip, INQUIRY_PRODUCT_ID,
424 		    drive->d_product, drive->d_product_len);
425 
426 	if (drive->d_serial_len > 0)
427 		bd_prop_update_inqstring(dip, INQUIRY_SERIAL_NO,
428 		    drive->d_serial, drive->d_serial_len);
429 
430 	if (drive->d_revision_len > 0)
431 		bd_prop_update_inqstring(dip, INQUIRY_REVISION_ID,
432 		    drive->d_revision, drive->d_revision_len);
433 }
434 
435 static void
436 bd_create_errstats(bd_t *bd, int inst, bd_drive_t *drive)
437 {
438 	char	ks_module[KSTAT_STRLEN];
439 	char	ks_name[KSTAT_STRLEN];
440 	int	ndata = sizeof (struct bd_errstats) / sizeof (kstat_named_t);
441 
442 	if (bd->d_errstats != NULL)
443 		return;
444 
445 	(void) snprintf(ks_module, sizeof (ks_module), "%serr",
446 	    ddi_driver_name(bd->d_dip));
447 	(void) snprintf(ks_name, sizeof (ks_name), "%s%d,err",
448 	    ddi_driver_name(bd->d_dip), inst);
449 
450 	bd->d_errstats = kstat_create(ks_module, inst, ks_name, "device_error",
451 	    KSTAT_TYPE_NAMED, ndata, KSTAT_FLAG_PERSISTENT);
452 
453 	mutex_init(&bd->d_errmutex, NULL, MUTEX_DRIVER, NULL);
454 	if (bd->d_errstats == NULL) {
455 		/*
456 		 * Even if we cannot create the kstat, we create a
457 		 * scratch kstat.  The reason for this is to ensure
458 		 * that we can update the kstat all of the time,
459 		 * without adding an extra branch instruction.
460 		 */
461 		bd->d_kerr = kmem_zalloc(sizeof (struct bd_errstats),
462 		    KM_SLEEP);
463 	} else {
464 		bd->d_errstats->ks_lock = &bd->d_errmutex;
465 		bd->d_kerr = (struct bd_errstats *)bd->d_errstats->ks_data;
466 	}
467 
468 	kstat_named_init(&bd->d_kerr->bd_softerrs,	"Soft Errors",
469 	    KSTAT_DATA_UINT32);
470 	kstat_named_init(&bd->d_kerr->bd_harderrs,	"Hard Errors",
471 	    KSTAT_DATA_UINT32);
472 	kstat_named_init(&bd->d_kerr->bd_transerrs,	"Transport Errors",
473 	    KSTAT_DATA_UINT32);
474 
475 	if (drive->d_model_len > 0) {
476 		kstat_named_init(&bd->d_kerr->bd_model,	"Model",
477 		    KSTAT_DATA_STRING);
478 	} else {
479 		kstat_named_init(&bd->d_kerr->bd_vid,	"Vendor",
480 		    KSTAT_DATA_STRING);
481 		kstat_named_init(&bd->d_kerr->bd_pid,	"Product",
482 		    KSTAT_DATA_STRING);
483 	}
484 
485 	kstat_named_init(&bd->d_kerr->bd_revision,	"Revision",
486 	    KSTAT_DATA_STRING);
487 	kstat_named_init(&bd->d_kerr->bd_serial,	"Serial No",
488 	    KSTAT_DATA_STRING);
489 	kstat_named_init(&bd->d_kerr->bd_capacity,	"Size",
490 	    KSTAT_DATA_ULONGLONG);
491 	kstat_named_init(&bd->d_kerr->bd_rq_media_err,	"Media Error",
492 	    KSTAT_DATA_UINT32);
493 	kstat_named_init(&bd->d_kerr->bd_rq_ntrdy_err,	"Device Not Ready",
494 	    KSTAT_DATA_UINT32);
495 	kstat_named_init(&bd->d_kerr->bd_rq_nodev_err,	"No Device",
496 	    KSTAT_DATA_UINT32);
497 	kstat_named_init(&bd->d_kerr->bd_rq_recov_err,	"Recoverable",
498 	    KSTAT_DATA_UINT32);
499 	kstat_named_init(&bd->d_kerr->bd_rq_illrq_err,	"Illegal Request",
500 	    KSTAT_DATA_UINT32);
501 	kstat_named_init(&bd->d_kerr->bd_rq_pfa_err,
502 	    "Predictive Failure Analysis", KSTAT_DATA_UINT32);
503 
504 	bd->d_errstats->ks_private = bd;
505 
506 	kstat_install(bd->d_errstats);
507 	bd_init_errstats(bd, drive);
508 }
509 
510 static void
511 bd_destroy_errstats(bd_t *bd)
512 {
513 	if (bd->d_errstats != NULL) {
514 		bd_fini_errstats(bd);
515 		kstat_delete(bd->d_errstats);
516 		bd->d_errstats = NULL;
517 	} else {
518 		kmem_free(bd->d_kerr, sizeof (struct bd_errstats));
519 		bd->d_kerr = NULL;
520 		mutex_destroy(&bd->d_errmutex);
521 	}
522 }
523 
524 static void
525 bd_errstats_setstr(kstat_named_t *k, char *str, size_t len, char *alt)
526 {
527 	char	*tmp;
528 	size_t	km_len;
529 
530 	if (KSTAT_NAMED_STR_PTR(k) == NULL) {
531 		if (len > 0)
532 			km_len = strnlen(str, len);
533 		else if (alt != NULL)
534 			km_len = strlen(alt);
535 		else
536 			return;
537 
538 		tmp = kmem_alloc(km_len + 1, KM_SLEEP);
539 		bcopy(len > 0 ? str : alt, tmp, km_len);
540 		tmp[km_len] = '\0';
541 
542 		kstat_named_setstr(k, tmp);
543 	}
544 }
545 
546 static void
547 bd_errstats_clrstr(kstat_named_t *k)
548 {
549 	if (KSTAT_NAMED_STR_PTR(k) == NULL)
550 		return;
551 
552 	kmem_free(KSTAT_NAMED_STR_PTR(k), KSTAT_NAMED_STR_BUFLEN(k));
553 	kstat_named_setstr(k, NULL);
554 }
555 
556 static void
557 bd_init_errstats(bd_t *bd, bd_drive_t *drive)
558 {
559 	struct bd_errstats	*est = bd->d_kerr;
560 
561 	mutex_enter(&bd->d_errmutex);
562 
563 	if (drive->d_model_len > 0 &&
564 	    KSTAT_NAMED_STR_PTR(&est->bd_model) == NULL) {
565 		bd_errstats_setstr(&est->bd_model, drive->d_model,
566 		    drive->d_model_len, NULL);
567 	} else {
568 		bd_errstats_setstr(&est->bd_vid, drive->d_vendor,
569 		    drive->d_vendor_len, "Unknown ");
570 		bd_errstats_setstr(&est->bd_pid, drive->d_product,
571 		    drive->d_product_len, "Unknown         ");
572 	}
573 
574 	bd_errstats_setstr(&est->bd_revision, drive->d_revision,
575 	    drive->d_revision_len, "0001");
576 	bd_errstats_setstr(&est->bd_serial, drive->d_serial,
577 	    drive->d_serial_len, "0               ");
578 
579 	mutex_exit(&bd->d_errmutex);
580 }
581 
582 static void
583 bd_fini_errstats(bd_t *bd)
584 {
585 	struct bd_errstats	*est = bd->d_kerr;
586 
587 	mutex_enter(&bd->d_errmutex);
588 
589 	bd_errstats_clrstr(&est->bd_model);
590 	bd_errstats_clrstr(&est->bd_vid);
591 	bd_errstats_clrstr(&est->bd_pid);
592 	bd_errstats_clrstr(&est->bd_revision);
593 	bd_errstats_clrstr(&est->bd_serial);
594 
595 	mutex_exit(&bd->d_errmutex);
596 }
597 
598 static void
599 bd_queues_free(bd_t *bd)
600 {
601 	uint32_t i;
602 
603 	for (i = 0; i < bd->d_qcount; i++) {
604 		bd_queue_t *bq = &bd->d_queues[i];
605 
606 		mutex_destroy(&bq->q_iomutex);
607 		list_destroy(&bq->q_waitq);
608 		list_destroy(&bq->q_runq);
609 	}
610 
611 	kmem_free(bd->d_queues, sizeof (*bd->d_queues) * bd->d_qcount);
612 }
613 
614 static int
615 bd_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
616 {
617 	int		inst;
618 	bd_handle_t	hdl;
619 	bd_t		*bd;
620 	bd_drive_t	drive;
621 	uint32_t	i;
622 	int		rv;
623 	char		name[16];
624 	char		kcache[32];
625 
626 	switch (cmd) {
627 	case DDI_ATTACH:
628 		break;
629 	case DDI_RESUME:
630 		/* We don't do anything native for suspend/resume */
631 		return (DDI_SUCCESS);
632 	default:
633 		return (DDI_FAILURE);
634 	}
635 
636 	inst = ddi_get_instance(dip);
637 	hdl = ddi_get_parent_data(dip);
638 
639 	(void) snprintf(name, sizeof (name), "%s%d",
640 	    ddi_driver_name(dip), ddi_get_instance(dip));
641 	(void) snprintf(kcache, sizeof (kcache), "%s_xfer", name);
642 
643 	if (hdl == NULL) {
644 		cmn_err(CE_WARN, "%s: missing parent data!", name);
645 		return (DDI_FAILURE);
646 	}
647 
648 	if (ddi_soft_state_zalloc(bd_state, inst) != DDI_SUCCESS) {
649 		cmn_err(CE_WARN, "%s: unable to zalloc soft state!", name);
650 		return (DDI_FAILURE);
651 	}
652 	bd = ddi_get_soft_state(bd_state, inst);
653 
654 	if (hdl->h_dma) {
655 		bd->d_dma = *(hdl->h_dma);
656 		bd->d_dma.dma_attr_granular =
657 		    max(DEV_BSIZE, bd->d_dma.dma_attr_granular);
658 		bd->d_use_dma = B_TRUE;
659 
660 		if (bd->d_maxxfer &&
661 		    (bd->d_maxxfer != bd->d_dma.dma_attr_maxxfer)) {
662 			cmn_err(CE_WARN,
663 			    "%s: inconsistent maximum transfer size!",
664 			    name);
665 			/* We force it */
666 			bd->d_maxxfer = bd->d_dma.dma_attr_maxxfer;
667 		} else {
668 			bd->d_maxxfer = bd->d_dma.dma_attr_maxxfer;
669 		}
670 	} else {
671 		bd->d_use_dma = B_FALSE;
672 		if (bd->d_maxxfer == 0) {
673 			bd->d_maxxfer = 1024 * 1024;
674 		}
675 	}
676 	bd->d_ops = hdl->h_ops;
677 	bd->d_private = hdl->h_private;
678 	bd->d_blkshift = DEV_BSHIFT;	/* 512 bytes, to start */
679 
680 	if (bd->d_maxxfer % DEV_BSIZE) {
681 		cmn_err(CE_WARN, "%s: maximum transfer misaligned!", name);
682 		bd->d_maxxfer &= ~(DEV_BSIZE - 1);
683 	}
684 	if (bd->d_maxxfer < DEV_BSIZE) {
685 		cmn_err(CE_WARN, "%s: maximum transfer size too small!", name);
686 		ddi_soft_state_free(bd_state, inst);
687 		return (DDI_FAILURE);
688 	}
689 
690 	bd->d_dip = dip;
691 	bd->d_handle = hdl;
692 	hdl->h_bd = bd;
693 	ddi_set_driver_private(dip, bd);
694 
695 	mutex_init(&bd->d_ksmutex, NULL, MUTEX_DRIVER, NULL);
696 	mutex_init(&bd->d_ocmutex, NULL, MUTEX_DRIVER, NULL);
697 	mutex_init(&bd->d_statemutex, NULL, MUTEX_DRIVER, NULL);
698 	cv_init(&bd->d_statecv, NULL, CV_DRIVER, NULL);
699 
700 	bd->d_cache = kmem_cache_create(kcache, sizeof (bd_xfer_impl_t), 8,
701 	    bd_xfer_ctor, bd_xfer_dtor, NULL, bd, NULL, 0);
702 
703 	bd->d_ksp = kstat_create(ddi_driver_name(dip), inst, NULL, "disk",
704 	    KSTAT_TYPE_IO, 1, KSTAT_FLAG_PERSISTENT);
705 	if (bd->d_ksp != NULL) {
706 		bd->d_ksp->ks_lock = &bd->d_ksmutex;
707 		kstat_install(bd->d_ksp);
708 		bd->d_kiop = bd->d_ksp->ks_data;
709 	} else {
710 		/*
711 		 * Even if we cannot create the kstat, we create a
712 		 * scratch kstat.  The reason for this is to ensure
713 		 * that we can update the kstat all of the time,
714 		 * without adding an extra branch instruction.
715 		 */
716 		bd->d_kiop = kmem_zalloc(sizeof (kstat_io_t), KM_SLEEP);
717 	}
718 
719 	cmlb_alloc_handle(&bd->d_cmlbh);
720 
721 	bd->d_state = DKIO_NONE;
722 
723 	bzero(&drive, sizeof (drive));
724 	/*
725 	 * Default to one queue, and no restrictions on free space requests
726 	 * (if driver provides method) parent driver can override.
727 	 */
728 	drive.d_qcount = 1;
729 	drive.d_free_align = 1;
730 	bd->d_ops.o_drive_info(bd->d_private, &drive);
731 
732 	/*
733 	 * Several checks to make sure o_drive_info() didn't return bad
734 	 * values:
735 	 *
736 	 * There must be at least one queue
737 	 */
738 	if (drive.d_qcount == 0)
739 		goto fail_drive_info;
740 
741 	/* FREE/UNMAP/TRIM alignment needs to be at least 1 block */
742 	if (drive.d_free_align == 0)
743 		goto fail_drive_info;
744 
745 	/*
746 	 * If d_max_free_blks is not unlimited (not 0), then we cannot allow
747 	 * an unlimited segment size. It is however permissible to not impose
748 	 * a limit on the total number of blocks freed while limiting the
749 	 * amount allowed in an individual segment.
750 	 */
751 	if ((drive.d_max_free_blks > 0 && drive.d_max_free_seg_blks == 0))
752 		goto fail_drive_info;
753 
754 	/*
755 	 * If a limit is set on d_max_free_blks (by the above check, we know
756 	 * if there's a limit on d_max_free_blks, d_max_free_seg_blks cannot
757 	 * be unlimited), it cannot be smaller than the limit on an individual
758 	 * segment.
759 	 */
760 	if ((drive.d_max_free_blks > 0 &&
761 	    drive.d_max_free_seg_blks > drive.d_max_free_blks)) {
762 		goto fail_drive_info;
763 	}
764 
765 	bd->d_qcount = drive.d_qcount;
766 	bd->d_removable = drive.d_removable;
767 	bd->d_hotpluggable = drive.d_hotpluggable;
768 
769 	if (drive.d_maxxfer && drive.d_maxxfer < bd->d_maxxfer)
770 		bd->d_maxxfer = drive.d_maxxfer;
771 
772 	bd->d_free_align = drive.d_free_align;
773 	bd->d_max_free_seg = drive.d_max_free_seg;
774 	bd->d_max_free_blks = drive.d_max_free_blks;
775 	bd->d_max_free_seg_blks = drive.d_max_free_seg_blks;
776 
777 	bd_create_inquiry_props(dip, &drive);
778 	bd_create_errstats(bd, inst, &drive);
779 	bd_update_state(bd);
780 
781 	bd->d_queues = kmem_alloc(sizeof (*bd->d_queues) * bd->d_qcount,
782 	    KM_SLEEP);
783 	for (i = 0; i < bd->d_qcount; i++) {
784 		bd_queue_t *bq = &bd->d_queues[i];
785 
786 		bq->q_qsize = drive.d_qsize;
787 		bq->q_qactive = 0;
788 		mutex_init(&bq->q_iomutex, NULL, MUTEX_DRIVER, NULL);
789 
790 		list_create(&bq->q_waitq, sizeof (bd_xfer_impl_t),
791 		    offsetof(struct bd_xfer_impl, i_linkage));
792 		list_create(&bq->q_runq, sizeof (bd_xfer_impl_t),
793 		    offsetof(struct bd_xfer_impl, i_linkage));
794 	}
795 
796 	rv = cmlb_attach(dip, &bd_tg_ops, DTYPE_DIRECT,
797 	    bd->d_removable, bd->d_hotpluggable,
798 	    /*LINTED: E_BAD_PTR_CAST_ALIGN*/
799 	    *(uint64_t *)drive.d_eui64 != 0 ? DDI_NT_BLOCK_BLKDEV :
800 	    drive.d_lun >= 0 ? DDI_NT_BLOCK_CHAN : DDI_NT_BLOCK,
801 	    CMLB_FAKE_LABEL_ONE_PARTITION, bd->d_cmlbh, 0);
802 	if (rv != 0) {
803 		goto fail_cmlb_attach;
804 	}
805 
806 	if (bd->d_ops.o_devid_init != NULL) {
807 		rv = bd->d_ops.o_devid_init(bd->d_private, dip, &bd->d_devid);
808 		if (rv == DDI_SUCCESS) {
809 			if (ddi_devid_register(dip, bd->d_devid) !=
810 			    DDI_SUCCESS) {
811 				cmn_err(CE_WARN,
812 				    "%s: unable to register devid", name);
813 			}
814 		}
815 	}
816 
817 	/*
818 	 * Add a zero-length attribute to tell the world we support
819 	 * kernel ioctls (for layered drivers).  Also set up properties
820 	 * used by HAL to identify removable media.
821 	 */
822 	(void) ddi_prop_create(DDI_DEV_T_NONE, dip, DDI_PROP_CANSLEEP,
823 	    DDI_KERNEL_IOCTL, NULL, 0);
824 	if (bd->d_removable) {
825 		(void) ddi_prop_create(DDI_DEV_T_NONE, dip, DDI_PROP_CANSLEEP,
826 		    "removable-media", NULL, 0);
827 	}
828 	if (bd->d_hotpluggable) {
829 		(void) ddi_prop_create(DDI_DEV_T_NONE, dip, DDI_PROP_CANSLEEP,
830 		    "hotpluggable", NULL, 0);
831 	}
832 
833 	ddi_report_dev(dip);
834 
835 	return (DDI_SUCCESS);
836 
837 fail_cmlb_attach:
838 	bd_queues_free(bd);
839 	bd_destroy_errstats(bd);
840 
841 fail_drive_info:
842 	cmlb_free_handle(&bd->d_cmlbh);
843 
844 	if (bd->d_ksp != NULL) {
845 		kstat_delete(bd->d_ksp);
846 		bd->d_ksp = NULL;
847 	} else {
848 		kmem_free(bd->d_kiop, sizeof (kstat_io_t));
849 	}
850 
851 	kmem_cache_destroy(bd->d_cache);
852 	cv_destroy(&bd->d_statecv);
853 	mutex_destroy(&bd->d_statemutex);
854 	mutex_destroy(&bd->d_ocmutex);
855 	mutex_destroy(&bd->d_ksmutex);
856 	ddi_soft_state_free(bd_state, inst);
857 	return (DDI_FAILURE);
858 }
859 
860 static int
861 bd_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
862 {
863 	bd_t	*bd;
864 
865 	bd = ddi_get_driver_private(dip);
866 
867 	switch (cmd) {
868 	case DDI_DETACH:
869 		break;
870 	case DDI_SUSPEND:
871 		/* We don't suspend, but our parent does */
872 		return (DDI_SUCCESS);
873 	default:
874 		return (DDI_FAILURE);
875 	}
876 
877 	if (bd->d_ksp != NULL) {
878 		kstat_delete(bd->d_ksp);
879 		bd->d_ksp = NULL;
880 	} else {
881 		kmem_free(bd->d_kiop, sizeof (kstat_io_t));
882 	}
883 
884 	bd_destroy_errstats(bd);
885 	cmlb_detach(bd->d_cmlbh, 0);
886 	cmlb_free_handle(&bd->d_cmlbh);
887 	if (bd->d_devid)
888 		ddi_devid_free(bd->d_devid);
889 	kmem_cache_destroy(bd->d_cache);
890 	mutex_destroy(&bd->d_ksmutex);
891 	mutex_destroy(&bd->d_ocmutex);
892 	mutex_destroy(&bd->d_statemutex);
893 	cv_destroy(&bd->d_statecv);
894 	bd_queues_free(bd);
895 	ddi_soft_state_free(bd_state, ddi_get_instance(dip));
896 	return (DDI_SUCCESS);
897 }
898 
899 static int
900 bd_xfer_ctor(void *buf, void *arg, int kmflag)
901 {
902 	bd_xfer_impl_t	*xi;
903 	bd_t		*bd = arg;
904 	int		(*dcb)(caddr_t);
905 
906 	if (kmflag == KM_PUSHPAGE || kmflag == KM_SLEEP) {
907 		dcb = DDI_DMA_SLEEP;
908 	} else {
909 		dcb = DDI_DMA_DONTWAIT;
910 	}
911 
912 	xi = buf;
913 	bzero(xi, sizeof (*xi));
914 	xi->i_bd = bd;
915 
916 	if (bd->d_use_dma) {
917 		if (ddi_dma_alloc_handle(bd->d_dip, &bd->d_dma, dcb, NULL,
918 		    &xi->i_dmah) != DDI_SUCCESS) {
919 			return (-1);
920 		}
921 	}
922 
923 	return (0);
924 }
925 
926 static void
927 bd_xfer_dtor(void *buf, void *arg)
928 {
929 	bd_xfer_impl_t	*xi = buf;
930 
931 	_NOTE(ARGUNUSED(arg));
932 
933 	if (xi->i_dmah)
934 		ddi_dma_free_handle(&xi->i_dmah);
935 	xi->i_dmah = NULL;
936 }
937 
938 static bd_xfer_impl_t *
939 bd_xfer_alloc(bd_t *bd, struct buf *bp, int (*func)(void *, bd_xfer_t *),
940     int kmflag)
941 {
942 	bd_xfer_impl_t		*xi;
943 	int			rv = 0;
944 	int			status;
945 	unsigned		dir;
946 	int			(*cb)(caddr_t);
947 	size_t			len;
948 	uint32_t		shift;
949 
950 	if (kmflag == KM_SLEEP) {
951 		cb = DDI_DMA_SLEEP;
952 	} else {
953 		cb = DDI_DMA_DONTWAIT;
954 	}
955 
956 	xi = kmem_cache_alloc(bd->d_cache, kmflag);
957 	if (xi == NULL) {
958 		bioerror(bp, ENOMEM);
959 		return (NULL);
960 	}
961 
962 	ASSERT(bp);
963 
964 	xi->i_bp = bp;
965 	xi->i_func = func;
966 	xi->i_blkno = bp->b_lblkno >> (bd->d_blkshift - DEV_BSHIFT);
967 
968 	if (bp->b_bcount == 0) {
969 		xi->i_len = 0;
970 		xi->i_nblks = 0;
971 		xi->i_kaddr = NULL;
972 		xi->i_resid = 0;
973 		xi->i_num_win = 0;
974 		goto done;
975 	}
976 
977 	if (bp->b_flags & B_READ) {
978 		dir = DDI_DMA_READ;
979 		xi->i_func = bd->d_ops.o_read;
980 	} else {
981 		dir = DDI_DMA_WRITE;
982 		xi->i_func = bd->d_ops.o_write;
983 	}
984 
985 	shift = bd->d_blkshift;
986 	xi->i_blkshift = shift;
987 
988 	if (!bd->d_use_dma) {
989 		bp_mapin(bp);
990 		rv = 0;
991 		xi->i_offset = 0;
992 		xi->i_num_win =
993 		    (bp->b_bcount + (bd->d_maxxfer - 1)) / bd->d_maxxfer;
994 		xi->i_cur_win = 0;
995 		xi->i_len = min(bp->b_bcount, bd->d_maxxfer);
996 		xi->i_nblks = xi->i_len >> shift;
997 		xi->i_kaddr = bp->b_un.b_addr;
998 		xi->i_resid = bp->b_bcount;
999 	} else {
1000 
1001 		/*
1002 		 * We have to use consistent DMA if the address is misaligned.
1003 		 */
1004 		if (((bp->b_flags & (B_PAGEIO | B_REMAPPED)) != B_PAGEIO) &&
1005 		    ((uintptr_t)bp->b_un.b_addr & 0x7)) {
1006 			dir |= DDI_DMA_CONSISTENT | DDI_DMA_PARTIAL;
1007 		} else {
1008 			dir |= DDI_DMA_STREAMING | DDI_DMA_PARTIAL;
1009 		}
1010 
1011 		status = ddi_dma_buf_bind_handle(xi->i_dmah, bp, dir, cb,
1012 		    NULL, &xi->i_dmac, &xi->i_ndmac);
1013 		switch (status) {
1014 		case DDI_DMA_MAPPED:
1015 			xi->i_num_win = 1;
1016 			xi->i_cur_win = 0;
1017 			xi->i_offset = 0;
1018 			xi->i_len = bp->b_bcount;
1019 			xi->i_nblks = xi->i_len >> shift;
1020 			xi->i_resid = bp->b_bcount;
1021 			rv = 0;
1022 			break;
1023 		case DDI_DMA_PARTIAL_MAP:
1024 			xi->i_cur_win = 0;
1025 
1026 			if ((ddi_dma_numwin(xi->i_dmah, &xi->i_num_win) !=
1027 			    DDI_SUCCESS) ||
1028 			    (ddi_dma_getwin(xi->i_dmah, 0, &xi->i_offset,
1029 			    &len, &xi->i_dmac, &xi->i_ndmac) !=
1030 			    DDI_SUCCESS) ||
1031 			    (P2PHASE(len, (1U << shift)) != 0)) {
1032 				(void) ddi_dma_unbind_handle(xi->i_dmah);
1033 				rv = EFAULT;
1034 				goto done;
1035 			}
1036 			xi->i_len = len;
1037 			xi->i_nblks = xi->i_len >> shift;
1038 			xi->i_resid = bp->b_bcount;
1039 			rv = 0;
1040 			break;
1041 		case DDI_DMA_NORESOURCES:
1042 			rv = EAGAIN;
1043 			goto done;
1044 		case DDI_DMA_TOOBIG:
1045 			rv = EINVAL;
1046 			goto done;
1047 		case DDI_DMA_NOMAPPING:
1048 		case DDI_DMA_INUSE:
1049 		default:
1050 			rv = EFAULT;
1051 			goto done;
1052 		}
1053 	}
1054 
1055 done:
1056 	if (rv != 0) {
1057 		kmem_cache_free(bd->d_cache, xi);
1058 		bioerror(bp, rv);
1059 		return (NULL);
1060 	}
1061 
1062 	return (xi);
1063 }
1064 
1065 static void
1066 bd_xfer_free(bd_xfer_impl_t *xi)
1067 {
1068 	if (xi->i_dmah) {
1069 		(void) ddi_dma_unbind_handle(xi->i_dmah);
1070 	}
1071 	if (xi->i_dfl != NULL) {
1072 		dfl_free((dkioc_free_list_t *)xi->i_dfl);
1073 		xi->i_dfl = NULL;
1074 	}
1075 	kmem_cache_free(xi->i_bd->d_cache, xi);
1076 }
1077 
1078 static int
1079 bd_open(dev_t *devp, int flag, int otyp, cred_t *credp)
1080 {
1081 	dev_t		dev = *devp;
1082 	bd_t		*bd;
1083 	minor_t		part;
1084 	minor_t		inst;
1085 	uint64_t	mask;
1086 	boolean_t	ndelay;
1087 	int		rv;
1088 	diskaddr_t	nblks;
1089 	diskaddr_t	lba;
1090 
1091 	_NOTE(ARGUNUSED(credp));
1092 
1093 	part = BDPART(dev);
1094 	inst = BDINST(dev);
1095 
1096 	if (otyp >= OTYPCNT)
1097 		return (EINVAL);
1098 
1099 	ndelay = (flag & (FNDELAY | FNONBLOCK)) ? B_TRUE : B_FALSE;
1100 
1101 	/*
1102 	 * Block any DR events from changing the set of registered
1103 	 * devices while we function.
1104 	 */
1105 	rw_enter(&bd_lock, RW_READER);
1106 	if ((bd = ddi_get_soft_state(bd_state, inst)) == NULL) {
1107 		rw_exit(&bd_lock);
1108 		return (ENXIO);
1109 	}
1110 
1111 	mutex_enter(&bd->d_ocmutex);
1112 
1113 	ASSERT(part < 64);
1114 	mask = (1U << part);
1115 
1116 	bd_update_state(bd);
1117 
1118 	if (cmlb_validate(bd->d_cmlbh, 0, 0) != 0) {
1119 
1120 		/* non-blocking opens are allowed to succeed */
1121 		if (!ndelay) {
1122 			rv = ENXIO;
1123 			goto done;
1124 		}
1125 	} else if (cmlb_partinfo(bd->d_cmlbh, part, &nblks, &lba,
1126 	    NULL, NULL, 0) == 0) {
1127 
1128 		/*
1129 		 * We read the partinfo, verify valid ranges.  If the
1130 		 * partition is invalid, and we aren't blocking or
1131 		 * doing a raw access, then fail. (Non-blocking and
1132 		 * raw accesses can still succeed to allow a disk with
1133 		 * bad partition data to opened by format and fdisk.)
1134 		 */
1135 		if ((!nblks) && ((!ndelay) || (otyp != OTYP_CHR))) {
1136 			rv = ENXIO;
1137 			goto done;
1138 		}
1139 	} else if (!ndelay) {
1140 		/*
1141 		 * cmlb_partinfo failed -- invalid partition or no
1142 		 * disk label.
1143 		 */
1144 		rv = ENXIO;
1145 		goto done;
1146 	}
1147 
1148 	if ((flag & FWRITE) && bd->d_rdonly) {
1149 		rv = EROFS;
1150 		goto done;
1151 	}
1152 
1153 	if ((bd->d_open_excl) & (mask)) {
1154 		rv = EBUSY;
1155 		goto done;
1156 	}
1157 	if (flag & FEXCL) {
1158 		if (bd->d_open_lyr[part]) {
1159 			rv = EBUSY;
1160 			goto done;
1161 		}
1162 		for (int i = 0; i < OTYP_LYR; i++) {
1163 			if (bd->d_open_reg[i] & mask) {
1164 				rv = EBUSY;
1165 				goto done;
1166 			}
1167 		}
1168 	}
1169 
1170 	if (otyp == OTYP_LYR) {
1171 		bd->d_open_lyr[part]++;
1172 	} else {
1173 		bd->d_open_reg[otyp] |= mask;
1174 	}
1175 	if (flag & FEXCL) {
1176 		bd->d_open_excl |= mask;
1177 	}
1178 
1179 	rv = 0;
1180 done:
1181 	mutex_exit(&bd->d_ocmutex);
1182 	rw_exit(&bd_lock);
1183 
1184 	return (rv);
1185 }
1186 
1187 static int
1188 bd_close(dev_t dev, int flag, int otyp, cred_t *credp)
1189 {
1190 	bd_t		*bd;
1191 	minor_t		inst;
1192 	minor_t		part;
1193 	uint64_t	mask;
1194 	boolean_t	last = B_TRUE;
1195 
1196 	_NOTE(ARGUNUSED(flag));
1197 	_NOTE(ARGUNUSED(credp));
1198 
1199 	part = BDPART(dev);
1200 	inst = BDINST(dev);
1201 
1202 	ASSERT(part < 64);
1203 	mask = (1U << part);
1204 
1205 	rw_enter(&bd_lock, RW_READER);
1206 
1207 	if ((bd = ddi_get_soft_state(bd_state, inst)) == NULL) {
1208 		rw_exit(&bd_lock);
1209 		return (ENXIO);
1210 	}
1211 
1212 	mutex_enter(&bd->d_ocmutex);
1213 	if (bd->d_open_excl & mask) {
1214 		bd->d_open_excl &= ~mask;
1215 	}
1216 	if (otyp == OTYP_LYR) {
1217 		bd->d_open_lyr[part]--;
1218 	} else {
1219 		bd->d_open_reg[otyp] &= ~mask;
1220 	}
1221 	for (int i = 0; i < 64; i++) {
1222 		if (bd->d_open_lyr[part]) {
1223 			last = B_FALSE;
1224 		}
1225 	}
1226 	for (int i = 0; last && (i < OTYP_LYR); i++) {
1227 		if (bd->d_open_reg[i]) {
1228 			last = B_FALSE;
1229 		}
1230 	}
1231 	mutex_exit(&bd->d_ocmutex);
1232 
1233 	if (last) {
1234 		cmlb_invalidate(bd->d_cmlbh, 0);
1235 	}
1236 	rw_exit(&bd_lock);
1237 
1238 	return (0);
1239 }
1240 
1241 static int
1242 bd_dump(dev_t dev, caddr_t caddr, daddr_t blkno, int nblk)
1243 {
1244 	minor_t		inst;
1245 	minor_t		part;
1246 	diskaddr_t	pstart;
1247 	diskaddr_t	psize;
1248 	bd_t		*bd;
1249 	bd_xfer_impl_t	*xi;
1250 	buf_t		*bp;
1251 	int		rv;
1252 	uint32_t	shift;
1253 	daddr_t		d_blkno;
1254 	int	d_nblk;
1255 
1256 	rw_enter(&bd_lock, RW_READER);
1257 
1258 	part = BDPART(dev);
1259 	inst = BDINST(dev);
1260 
1261 	if ((bd = ddi_get_soft_state(bd_state, inst)) == NULL) {
1262 		rw_exit(&bd_lock);
1263 		return (ENXIO);
1264 	}
1265 	shift = bd->d_blkshift;
1266 	d_blkno = blkno >> (shift - DEV_BSHIFT);
1267 	d_nblk = nblk >> (shift - DEV_BSHIFT);
1268 	/*
1269 	 * do cmlb, but do it synchronously unless we already have the
1270 	 * partition (which we probably should.)
1271 	 */
1272 	if (cmlb_partinfo(bd->d_cmlbh, part, &psize, &pstart, NULL, NULL,
1273 	    (void *)1)) {
1274 		rw_exit(&bd_lock);
1275 		return (ENXIO);
1276 	}
1277 
1278 	if ((d_blkno + d_nblk) > psize) {
1279 		rw_exit(&bd_lock);
1280 		return (EINVAL);
1281 	}
1282 	bp = getrbuf(KM_NOSLEEP);
1283 	if (bp == NULL) {
1284 		rw_exit(&bd_lock);
1285 		return (ENOMEM);
1286 	}
1287 
1288 	bp->b_bcount = nblk << DEV_BSHIFT;
1289 	bp->b_resid = bp->b_bcount;
1290 	bp->b_lblkno = blkno;
1291 	bp->b_un.b_addr = caddr;
1292 
1293 	xi = bd_xfer_alloc(bd, bp,  bd->d_ops.o_write, KM_NOSLEEP);
1294 	if (xi == NULL) {
1295 		rw_exit(&bd_lock);
1296 		freerbuf(bp);
1297 		return (ENOMEM);
1298 	}
1299 	xi->i_blkno = d_blkno + pstart;
1300 	xi->i_flags = BD_XFER_POLL;
1301 	bd_submit(bd, xi);
1302 	rw_exit(&bd_lock);
1303 
1304 	/*
1305 	 * Generally, we should have run this entirely synchronously
1306 	 * at this point and the biowait call should be a no-op.  If
1307 	 * it didn't happen this way, it's a bug in the underlying
1308 	 * driver not honoring BD_XFER_POLL.
1309 	 */
1310 	(void) biowait(bp);
1311 	rv = geterror(bp);
1312 	freerbuf(bp);
1313 	return (rv);
1314 }
1315 
1316 void
1317 bd_minphys(struct buf *bp)
1318 {
1319 	minor_t inst;
1320 	bd_t	*bd;
1321 	inst = BDINST(bp->b_edev);
1322 
1323 	bd = ddi_get_soft_state(bd_state, inst);
1324 
1325 	/*
1326 	 * In a non-debug kernel, bd_strategy will catch !bd as
1327 	 * well, and will fail nicely.
1328 	 */
1329 	ASSERT(bd);
1330 
1331 	if (bp->b_bcount > bd->d_maxxfer)
1332 		bp->b_bcount = bd->d_maxxfer;
1333 }
1334 
1335 static int
1336 bd_check_uio(dev_t dev, struct uio *uio)
1337 {
1338 	bd_t		*bd;
1339 	uint32_t	shift;
1340 
1341 	if ((bd = ddi_get_soft_state(bd_state, BDINST(dev))) == NULL) {
1342 		return (ENXIO);
1343 	}
1344 
1345 	shift = bd->d_blkshift;
1346 	if ((P2PHASE(uio->uio_loffset, (1U << shift)) != 0) ||
1347 	    (P2PHASE(uio->uio_iov->iov_len, (1U << shift)) != 0)) {
1348 		return (EINVAL);
1349 	}
1350 
1351 	return (0);
1352 }
1353 
1354 static int
1355 bd_read(dev_t dev, struct uio *uio, cred_t *credp)
1356 {
1357 	_NOTE(ARGUNUSED(credp));
1358 	int	ret = bd_check_uio(dev, uio);
1359 	if (ret != 0) {
1360 		return (ret);
1361 	}
1362 	return (physio(bd_strategy, NULL, dev, B_READ, bd_minphys, uio));
1363 }
1364 
1365 static int
1366 bd_write(dev_t dev, struct uio *uio, cred_t *credp)
1367 {
1368 	_NOTE(ARGUNUSED(credp));
1369 	int	ret = bd_check_uio(dev, uio);
1370 	if (ret != 0) {
1371 		return (ret);
1372 	}
1373 	return (physio(bd_strategy, NULL, dev, B_WRITE, bd_minphys, uio));
1374 }
1375 
1376 static int
1377 bd_aread(dev_t dev, struct aio_req *aio, cred_t *credp)
1378 {
1379 	_NOTE(ARGUNUSED(credp));
1380 	int	ret = bd_check_uio(dev, aio->aio_uio);
1381 	if (ret != 0) {
1382 		return (ret);
1383 	}
1384 	return (aphysio(bd_strategy, anocancel, dev, B_READ, bd_minphys, aio));
1385 }
1386 
1387 static int
1388 bd_awrite(dev_t dev, struct aio_req *aio, cred_t *credp)
1389 {
1390 	_NOTE(ARGUNUSED(credp));
1391 	int	ret = bd_check_uio(dev, aio->aio_uio);
1392 	if (ret != 0) {
1393 		return (ret);
1394 	}
1395 	return (aphysio(bd_strategy, anocancel, dev, B_WRITE, bd_minphys, aio));
1396 }
1397 
1398 static int
1399 bd_strategy(struct buf *bp)
1400 {
1401 	minor_t		inst;
1402 	minor_t		part;
1403 	bd_t		*bd;
1404 	diskaddr_t	p_lba;
1405 	diskaddr_t	p_nblks;
1406 	diskaddr_t	b_nblks;
1407 	bd_xfer_impl_t	*xi;
1408 	uint32_t	shift;
1409 	int		(*func)(void *, bd_xfer_t *);
1410 	diskaddr_t	lblkno;
1411 
1412 	part = BDPART(bp->b_edev);
1413 	inst = BDINST(bp->b_edev);
1414 
1415 	ASSERT(bp);
1416 
1417 	bp->b_resid = bp->b_bcount;
1418 
1419 	if ((bd = ddi_get_soft_state(bd_state, inst)) == NULL) {
1420 		bioerror(bp, ENXIO);
1421 		biodone(bp);
1422 		return (0);
1423 	}
1424 
1425 	if (cmlb_partinfo(bd->d_cmlbh, part, &p_nblks, &p_lba,
1426 	    NULL, NULL, 0)) {
1427 		bioerror(bp, ENXIO);
1428 		biodone(bp);
1429 		return (0);
1430 	}
1431 
1432 	shift = bd->d_blkshift;
1433 	lblkno = bp->b_lblkno >> (shift - DEV_BSHIFT);
1434 	if ((P2PHASE(bp->b_lblkno, (1U << (shift - DEV_BSHIFT))) != 0) ||
1435 	    (P2PHASE(bp->b_bcount, (1U << shift)) != 0) ||
1436 	    (lblkno > p_nblks)) {
1437 		bioerror(bp, EINVAL);
1438 		biodone(bp);
1439 		return (0);
1440 	}
1441 	b_nblks = bp->b_bcount >> shift;
1442 	if ((lblkno == p_nblks) || (bp->b_bcount == 0)) {
1443 		biodone(bp);
1444 		return (0);
1445 	}
1446 
1447 	if ((b_nblks + lblkno) > p_nblks) {
1448 		bp->b_resid = ((lblkno + b_nblks - p_nblks) << shift);
1449 		bp->b_bcount -= bp->b_resid;
1450 	} else {
1451 		bp->b_resid = 0;
1452 	}
1453 	func = (bp->b_flags & B_READ) ? bd->d_ops.o_read : bd->d_ops.o_write;
1454 
1455 	xi = bd_xfer_alloc(bd, bp, func, KM_NOSLEEP);
1456 	if (xi == NULL) {
1457 		xi = bd_xfer_alloc(bd, bp, func, KM_PUSHPAGE);
1458 	}
1459 	if (xi == NULL) {
1460 		/* bd_request_alloc will have done bioerror */
1461 		biodone(bp);
1462 		return (0);
1463 	}
1464 	xi->i_blkno = lblkno + p_lba;
1465 
1466 	bd_submit(bd, xi);
1467 
1468 	return (0);
1469 }
1470 
1471 static int
1472 bd_ioctl(dev_t dev, int cmd, intptr_t arg, int flag, cred_t *credp, int *rvalp)
1473 {
1474 	minor_t		inst;
1475 	uint16_t	part;
1476 	bd_t		*bd;
1477 	void		*ptr = (void *)arg;
1478 	int		rv;
1479 
1480 	part = BDPART(dev);
1481 	inst = BDINST(dev);
1482 
1483 	if ((bd = ddi_get_soft_state(bd_state, inst)) == NULL) {
1484 		return (ENXIO);
1485 	}
1486 
1487 	rv = cmlb_ioctl(bd->d_cmlbh, dev, cmd, arg, flag, credp, rvalp, 0);
1488 	if (rv != ENOTTY)
1489 		return (rv);
1490 
1491 	if (rvalp != NULL) {
1492 		/* the return value of the ioctl is 0 by default */
1493 		*rvalp = 0;
1494 	}
1495 
1496 	switch (cmd) {
1497 	case DKIOCGMEDIAINFO: {
1498 		struct dk_minfo minfo;
1499 
1500 		/* make sure our state information is current */
1501 		bd_update_state(bd);
1502 		bzero(&minfo, sizeof (minfo));
1503 		minfo.dki_media_type = DK_FIXED_DISK;
1504 		minfo.dki_lbsize = (1U << bd->d_blkshift);
1505 		minfo.dki_capacity = bd->d_numblks;
1506 		if (ddi_copyout(&minfo, ptr, sizeof (minfo), flag)) {
1507 			return (EFAULT);
1508 		}
1509 		return (0);
1510 	}
1511 	case DKIOCGMEDIAINFOEXT: {
1512 		struct dk_minfo_ext miext;
1513 		size_t len;
1514 
1515 		/* make sure our state information is current */
1516 		bd_update_state(bd);
1517 		bzero(&miext, sizeof (miext));
1518 		miext.dki_media_type = DK_FIXED_DISK;
1519 		miext.dki_lbsize = (1U << bd->d_blkshift);
1520 		miext.dki_pbsize = (1U << bd->d_pblkshift);
1521 		miext.dki_capacity = bd->d_numblks;
1522 
1523 		switch (ddi_model_convert_from(flag & FMODELS)) {
1524 		case DDI_MODEL_ILP32:
1525 			len = sizeof (struct dk_minfo_ext32);
1526 			break;
1527 		default:
1528 			len = sizeof (struct dk_minfo_ext);
1529 			break;
1530 		}
1531 
1532 		if (ddi_copyout(&miext, ptr, len, flag)) {
1533 			return (EFAULT);
1534 		}
1535 		return (0);
1536 	}
1537 	case DKIOCINFO: {
1538 		struct dk_cinfo cinfo;
1539 		bzero(&cinfo, sizeof (cinfo));
1540 		cinfo.dki_ctype = DKC_BLKDEV;
1541 		cinfo.dki_cnum = ddi_get_instance(ddi_get_parent(bd->d_dip));
1542 		(void) snprintf(cinfo.dki_cname, sizeof (cinfo.dki_cname),
1543 		    "%s", ddi_driver_name(ddi_get_parent(bd->d_dip)));
1544 		(void) snprintf(cinfo.dki_dname, sizeof (cinfo.dki_dname),
1545 		    "%s", ddi_driver_name(bd->d_dip));
1546 		cinfo.dki_unit = inst;
1547 		cinfo.dki_flags = DKI_FMTVOL;
1548 		cinfo.dki_partition = part;
1549 		cinfo.dki_maxtransfer = bd->d_maxxfer / DEV_BSIZE;
1550 		cinfo.dki_addr = 0;
1551 		cinfo.dki_slave = 0;
1552 		cinfo.dki_space = 0;
1553 		cinfo.dki_prio = 0;
1554 		cinfo.dki_vec = 0;
1555 		if (ddi_copyout(&cinfo, ptr, sizeof (cinfo), flag)) {
1556 			return (EFAULT);
1557 		}
1558 		return (0);
1559 	}
1560 	case DKIOCREMOVABLE: {
1561 		int i;
1562 		i = bd->d_removable ? 1 : 0;
1563 		if (ddi_copyout(&i, ptr, sizeof (i), flag)) {
1564 			return (EFAULT);
1565 		}
1566 		return (0);
1567 	}
1568 	case DKIOCHOTPLUGGABLE: {
1569 		int i;
1570 		i = bd->d_hotpluggable ? 1 : 0;
1571 		if (ddi_copyout(&i, ptr, sizeof (i), flag)) {
1572 			return (EFAULT);
1573 		}
1574 		return (0);
1575 	}
1576 	case DKIOCREADONLY: {
1577 		int i;
1578 		i = bd->d_rdonly ? 1 : 0;
1579 		if (ddi_copyout(&i, ptr, sizeof (i), flag)) {
1580 			return (EFAULT);
1581 		}
1582 		return (0);
1583 	}
1584 	case DKIOCSOLIDSTATE: {
1585 		int i;
1586 		i = bd->d_ssd ? 1 : 0;
1587 		if (ddi_copyout(&i, ptr, sizeof (i), flag)) {
1588 			return (EFAULT);
1589 		}
1590 		return (0);
1591 	}
1592 	case DKIOCSTATE: {
1593 		enum dkio_state	state;
1594 		if (ddi_copyin(ptr, &state, sizeof (state), flag)) {
1595 			return (EFAULT);
1596 		}
1597 		if ((rv = bd_check_state(bd, &state)) != 0) {
1598 			return (rv);
1599 		}
1600 		if (ddi_copyout(&state, ptr, sizeof (state), flag)) {
1601 			return (EFAULT);
1602 		}
1603 		return (0);
1604 	}
1605 	case DKIOCFLUSHWRITECACHE: {
1606 		struct dk_callback *dkc = NULL;
1607 
1608 		if (flag & FKIOCTL)
1609 			dkc = (void *)arg;
1610 
1611 		rv = bd_flush_write_cache(bd, dkc);
1612 		return (rv);
1613 	}
1614 	case DKIOCFREE: {
1615 		dkioc_free_list_t *dfl = NULL;
1616 
1617 		/*
1618 		 * Check free space support early to avoid copyin/allocation
1619 		 * when unnecessary.
1620 		 */
1621 		if (!CAN_FREESPACE(bd))
1622 			return (ENOTSUP);
1623 
1624 		rv = dfl_copyin(ptr, &dfl, flag, KM_SLEEP);
1625 		if (rv != 0)
1626 			return (rv);
1627 
1628 		/*
1629 		 * bd_free_space() consumes 'dfl'. bd_free_space() will
1630 		 * call dfl_iter() which will normally try to pass dfl through
1631 		 * to bd_free_space_cb() which attaches dfl to the bd_xfer_t
1632 		 * that is then queued for the underlying driver. Once the
1633 		 * driver processes the request, the bd_xfer_t instance is
1634 		 * disposed of, including any attached dkioc_free_list_t.
1635 		 *
1636 		 * If dfl cannot be processed by the underlying driver due to
1637 		 * size or alignment requirements of the driver, dfl_iter()
1638 		 * will replace dfl with one or more new dkioc_free_list_t
1639 		 * instances with the correct alignment and sizes for the driver
1640 		 * (and free the original dkioc_free_list_t).
1641 		 */
1642 		rv = bd_free_space(dev, bd, dfl);
1643 		return (rv);
1644 	}
1645 
1646 	case DKIOC_CANFREE: {
1647 		boolean_t supported = CAN_FREESPACE(bd);
1648 
1649 		if (ddi_copyout(&supported, (void *)arg, sizeof (supported),
1650 		    flag) != 0) {
1651 			return (EFAULT);
1652 		}
1653 
1654 		return (0);
1655 	}
1656 
1657 	default:
1658 		break;
1659 
1660 	}
1661 	return (ENOTTY);
1662 }
1663 
1664 static int
1665 bd_prop_op(dev_t dev, dev_info_t *dip, ddi_prop_op_t prop_op, int mod_flags,
1666     char *name, caddr_t valuep, int *lengthp)
1667 {
1668 	bd_t	*bd;
1669 
1670 	bd = ddi_get_soft_state(bd_state, ddi_get_instance(dip));
1671 	if (bd == NULL)
1672 		return (ddi_prop_op(dev, dip, prop_op, mod_flags,
1673 		    name, valuep, lengthp));
1674 
1675 	return (cmlb_prop_op(bd->d_cmlbh, dev, dip, prop_op, mod_flags, name,
1676 	    valuep, lengthp, BDPART(dev), 0));
1677 }
1678 
1679 
1680 static int
1681 bd_tg_rdwr(dev_info_t *dip, uchar_t cmd, void *bufaddr, diskaddr_t start,
1682     size_t length, void *tg_cookie)
1683 {
1684 	bd_t		*bd;
1685 	buf_t		*bp;
1686 	bd_xfer_impl_t	*xi;
1687 	int		rv;
1688 	int		(*func)(void *, bd_xfer_t *);
1689 	int		kmflag;
1690 
1691 	/*
1692 	 * If we are running in polled mode (such as during dump(9e)
1693 	 * execution), then we cannot sleep for kernel allocations.
1694 	 */
1695 	kmflag = tg_cookie ? KM_NOSLEEP : KM_SLEEP;
1696 
1697 	bd = ddi_get_soft_state(bd_state, ddi_get_instance(dip));
1698 
1699 	if (P2PHASE(length, (1U << bd->d_blkshift)) != 0) {
1700 		/* We can only transfer whole blocks at a time! */
1701 		return (EINVAL);
1702 	}
1703 
1704 	if ((bp = getrbuf(kmflag)) == NULL) {
1705 		return (ENOMEM);
1706 	}
1707 
1708 	switch (cmd) {
1709 	case TG_READ:
1710 		bp->b_flags = B_READ;
1711 		func = bd->d_ops.o_read;
1712 		break;
1713 	case TG_WRITE:
1714 		bp->b_flags = B_WRITE;
1715 		func = bd->d_ops.o_write;
1716 		break;
1717 	default:
1718 		freerbuf(bp);
1719 		return (EINVAL);
1720 	}
1721 
1722 	bp->b_un.b_addr = bufaddr;
1723 	bp->b_bcount = length;
1724 	xi = bd_xfer_alloc(bd, bp, func, kmflag);
1725 	if (xi == NULL) {
1726 		rv = geterror(bp);
1727 		freerbuf(bp);
1728 		return (rv);
1729 	}
1730 	xi->i_flags = tg_cookie ? BD_XFER_POLL : 0;
1731 	xi->i_blkno = start;
1732 	bd_submit(bd, xi);
1733 	(void) biowait(bp);
1734 	rv = geterror(bp);
1735 	freerbuf(bp);
1736 
1737 	return (rv);
1738 }
1739 
1740 static int
1741 bd_tg_getinfo(dev_info_t *dip, int cmd, void *arg, void *tg_cookie)
1742 {
1743 	bd_t		*bd;
1744 
1745 	_NOTE(ARGUNUSED(tg_cookie));
1746 	bd = ddi_get_soft_state(bd_state, ddi_get_instance(dip));
1747 
1748 	switch (cmd) {
1749 	case TG_GETPHYGEOM:
1750 	case TG_GETVIRTGEOM:
1751 		/*
1752 		 * We don't have any "geometry" as such, let cmlb
1753 		 * fabricate something.
1754 		 */
1755 		return (ENOTTY);
1756 
1757 	case TG_GETCAPACITY:
1758 		bd_update_state(bd);
1759 		*(diskaddr_t *)arg = bd->d_numblks;
1760 		return (0);
1761 
1762 	case TG_GETBLOCKSIZE:
1763 		*(uint32_t *)arg = (1U << bd->d_blkshift);
1764 		return (0);
1765 
1766 	case TG_GETATTR:
1767 		/*
1768 		 * It turns out that cmlb really doesn't do much for
1769 		 * non-writable media, but lets make the information
1770 		 * available for it in case it does more in the
1771 		 * future.  (The value is currently used for
1772 		 * triggering special behavior for CD-ROMs.)
1773 		 */
1774 		bd_update_state(bd);
1775 		((tg_attribute_t *)arg)->media_is_writable =
1776 		    bd->d_rdonly ? B_FALSE : B_TRUE;
1777 		((tg_attribute_t *)arg)->media_is_solid_state = bd->d_ssd;
1778 		((tg_attribute_t *)arg)->media_is_rotational = B_FALSE;
1779 		return (0);
1780 
1781 	default:
1782 		return (EINVAL);
1783 	}
1784 }
1785 
1786 
1787 static void
1788 bd_sched(bd_t *bd, bd_queue_t *bq)
1789 {
1790 	bd_xfer_impl_t	*xi;
1791 	struct buf	*bp;
1792 	int		rv;
1793 
1794 	mutex_enter(&bq->q_iomutex);
1795 
1796 	while ((bq->q_qactive < bq->q_qsize) &&
1797 	    ((xi = list_remove_head(&bq->q_waitq)) != NULL)) {
1798 		mutex_enter(&bd->d_ksmutex);
1799 		kstat_waitq_to_runq(bd->d_kiop);
1800 		mutex_exit(&bd->d_ksmutex);
1801 
1802 		bq->q_qactive++;
1803 		list_insert_tail(&bq->q_runq, xi);
1804 
1805 		/*
1806 		 * Submit the job to the driver.  We drop the I/O mutex
1807 		 * so that we can deal with the case where the driver
1808 		 * completion routine calls back into us synchronously.
1809 		 */
1810 
1811 		mutex_exit(&bq->q_iomutex);
1812 
1813 		rv = xi->i_func(bd->d_private, &xi->i_public);
1814 		if (rv != 0) {
1815 			bp = xi->i_bp;
1816 			bioerror(bp, rv);
1817 			biodone(bp);
1818 
1819 			atomic_inc_32(&bd->d_kerr->bd_transerrs.value.ui32);
1820 
1821 			mutex_enter(&bq->q_iomutex);
1822 
1823 			mutex_enter(&bd->d_ksmutex);
1824 			kstat_runq_exit(bd->d_kiop);
1825 			mutex_exit(&bd->d_ksmutex);
1826 
1827 			bq->q_qactive--;
1828 			list_remove(&bq->q_runq, xi);
1829 			bd_xfer_free(xi);
1830 		} else {
1831 			mutex_enter(&bq->q_iomutex);
1832 		}
1833 	}
1834 
1835 	mutex_exit(&bq->q_iomutex);
1836 }
1837 
1838 static void
1839 bd_submit(bd_t *bd, bd_xfer_impl_t *xi)
1840 {
1841 	uint64_t	nv = atomic_inc_64_nv(&bd->d_io_counter);
1842 	unsigned	q = nv % bd->d_qcount;
1843 	bd_queue_t	*bq = &bd->d_queues[q];
1844 
1845 	xi->i_bq = bq;
1846 	xi->i_qnum = q;
1847 
1848 	mutex_enter(&bq->q_iomutex);
1849 
1850 	list_insert_tail(&bq->q_waitq, xi);
1851 
1852 	mutex_enter(&bd->d_ksmutex);
1853 	kstat_waitq_enter(bd->d_kiop);
1854 	mutex_exit(&bd->d_ksmutex);
1855 
1856 	mutex_exit(&bq->q_iomutex);
1857 
1858 	bd_sched(bd, bq);
1859 }
1860 
1861 static void
1862 bd_runq_exit(bd_xfer_impl_t *xi, int err)
1863 {
1864 	bd_t		*bd = xi->i_bd;
1865 	buf_t		*bp = xi->i_bp;
1866 	bd_queue_t	*bq = xi->i_bq;
1867 
1868 	mutex_enter(&bq->q_iomutex);
1869 	bq->q_qactive--;
1870 
1871 	mutex_enter(&bd->d_ksmutex);
1872 	kstat_runq_exit(bd->d_kiop);
1873 	mutex_exit(&bd->d_ksmutex);
1874 
1875 	list_remove(&bq->q_runq, xi);
1876 	mutex_exit(&bq->q_iomutex);
1877 
1878 	if (err == 0) {
1879 		if (bp->b_flags & B_READ) {
1880 			atomic_inc_uint(&bd->d_kiop->reads);
1881 			atomic_add_64((uint64_t *)&bd->d_kiop->nread,
1882 			    bp->b_bcount - xi->i_resid);
1883 		} else {
1884 			atomic_inc_uint(&bd->d_kiop->writes);
1885 			atomic_add_64((uint64_t *)&bd->d_kiop->nwritten,
1886 			    bp->b_bcount - xi->i_resid);
1887 		}
1888 	}
1889 	bd_sched(bd, bq);
1890 }
1891 
1892 static void
1893 bd_update_state(bd_t *bd)
1894 {
1895 	enum	dkio_state	state = DKIO_INSERTED;
1896 	boolean_t		docmlb = B_FALSE;
1897 	bd_media_t		media;
1898 
1899 	bzero(&media, sizeof (media));
1900 
1901 	mutex_enter(&bd->d_statemutex);
1902 	if (bd->d_ops.o_media_info(bd->d_private, &media) != 0) {
1903 		bd->d_numblks = 0;
1904 		state = DKIO_EJECTED;
1905 		goto done;
1906 	}
1907 
1908 	if ((media.m_blksize < 512) ||
1909 	    (!ISP2(media.m_blksize)) ||
1910 	    (P2PHASE(bd->d_maxxfer, media.m_blksize))) {
1911 		cmn_err(CE_WARN, "%s%d: Invalid media block size (%d)",
1912 		    ddi_driver_name(bd->d_dip), ddi_get_instance(bd->d_dip),
1913 		    media.m_blksize);
1914 		/*
1915 		 * We can't use the media, treat it as not present.
1916 		 */
1917 		state = DKIO_EJECTED;
1918 		bd->d_numblks = 0;
1919 		goto done;
1920 	}
1921 
1922 	if (((1U << bd->d_blkshift) != media.m_blksize) ||
1923 	    (bd->d_numblks != media.m_nblks)) {
1924 		/* Device size changed */
1925 		docmlb = B_TRUE;
1926 	}
1927 
1928 	bd->d_blkshift = ddi_ffs(media.m_blksize) - 1;
1929 	bd->d_pblkshift = bd->d_blkshift;
1930 	bd->d_numblks = media.m_nblks;
1931 	bd->d_rdonly = media.m_readonly;
1932 	bd->d_ssd = media.m_solidstate;
1933 
1934 	/*
1935 	 * Only use the supplied physical block size if it is non-zero,
1936 	 * greater or equal to the block size, and a power of 2. Ignore it
1937 	 * if not, it's just informational and we can still use the media.
1938 	 */
1939 	if ((media.m_pblksize != 0) &&
1940 	    (media.m_pblksize >= media.m_blksize) &&
1941 	    (ISP2(media.m_pblksize)))
1942 		bd->d_pblkshift = ddi_ffs(media.m_pblksize) - 1;
1943 
1944 done:
1945 	if (state != bd->d_state) {
1946 		bd->d_state = state;
1947 		cv_broadcast(&bd->d_statecv);
1948 		docmlb = B_TRUE;
1949 	}
1950 	mutex_exit(&bd->d_statemutex);
1951 
1952 	bd->d_kerr->bd_capacity.value.ui64 = bd->d_numblks << bd->d_blkshift;
1953 
1954 	if (docmlb) {
1955 		if (state == DKIO_INSERTED) {
1956 			(void) cmlb_validate(bd->d_cmlbh, 0, 0);
1957 		} else {
1958 			cmlb_invalidate(bd->d_cmlbh, 0);
1959 		}
1960 	}
1961 }
1962 
1963 static int
1964 bd_check_state(bd_t *bd, enum dkio_state *state)
1965 {
1966 	clock_t		when;
1967 
1968 	for (;;) {
1969 
1970 		bd_update_state(bd);
1971 
1972 		mutex_enter(&bd->d_statemutex);
1973 
1974 		if (bd->d_state != *state) {
1975 			*state = bd->d_state;
1976 			mutex_exit(&bd->d_statemutex);
1977 			break;
1978 		}
1979 
1980 		when = drv_usectohz(1000000);
1981 		if (cv_reltimedwait_sig(&bd->d_statecv, &bd->d_statemutex,
1982 		    when, TR_CLOCK_TICK) == 0) {
1983 			mutex_exit(&bd->d_statemutex);
1984 			return (EINTR);
1985 		}
1986 
1987 		mutex_exit(&bd->d_statemutex);
1988 	}
1989 
1990 	return (0);
1991 }
1992 
1993 static int
1994 bd_flush_write_cache_done(struct buf *bp)
1995 {
1996 	struct dk_callback *dc = (void *)bp->b_private;
1997 
1998 	(*dc->dkc_callback)(dc->dkc_cookie, geterror(bp));
1999 	kmem_free(dc, sizeof (*dc));
2000 	freerbuf(bp);
2001 	return (0);
2002 }
2003 
2004 static int
2005 bd_flush_write_cache(bd_t *bd, struct dk_callback *dkc)
2006 {
2007 	buf_t			*bp;
2008 	struct dk_callback	*dc;
2009 	bd_xfer_impl_t		*xi;
2010 	int			rv;
2011 
2012 	if (bd->d_ops.o_sync_cache == NULL) {
2013 		return (ENOTSUP);
2014 	}
2015 	if ((bp = getrbuf(KM_SLEEP)) == NULL) {
2016 		return (ENOMEM);
2017 	}
2018 	bp->b_resid = 0;
2019 	bp->b_bcount = 0;
2020 
2021 	xi = bd_xfer_alloc(bd, bp, bd->d_ops.o_sync_cache, KM_SLEEP);
2022 	if (xi == NULL) {
2023 		rv = geterror(bp);
2024 		freerbuf(bp);
2025 		return (rv);
2026 	}
2027 
2028 	/* Make an asynchronous flush, but only if there is a callback */
2029 	if (dkc != NULL && dkc->dkc_callback != NULL) {
2030 		/* Make a private copy of the callback structure */
2031 		dc = kmem_alloc(sizeof (*dc), KM_SLEEP);
2032 		*dc = *dkc;
2033 		bp->b_private = dc;
2034 		bp->b_iodone = bd_flush_write_cache_done;
2035 
2036 		bd_submit(bd, xi);
2037 		return (0);
2038 	}
2039 
2040 	/* In case there is no callback, perform a synchronous flush */
2041 	bd_submit(bd, xi);
2042 	(void) biowait(bp);
2043 	rv = geterror(bp);
2044 	freerbuf(bp);
2045 
2046 	return (rv);
2047 }
2048 
/*
 * b_iodone handler for asynchronous free-space requests: the buf is no
 * longer needed once the request completes, so release it.
 */
static int
bd_free_space_done(struct buf *bp)
{
	freerbuf(bp);

	return (0);
}
2055 
2056 static int
2057 bd_free_space_cb(dkioc_free_list_t *dfl, void *arg, int kmflag)
2058 {
2059 	bd_t		*bd = arg;
2060 	buf_t		*bp = NULL;
2061 	bd_xfer_impl_t	*xi = NULL;
2062 	boolean_t	sync = DFL_ISSYNC(dfl) ?  B_TRUE : B_FALSE;
2063 	int		rv = 0;
2064 
2065 	bp = getrbuf(KM_SLEEP);
2066 	bp->b_resid = 0;
2067 	bp->b_bcount = 0;
2068 	bp->b_lblkno = 0;
2069 
2070 	xi = bd_xfer_alloc(bd, bp, bd->d_ops.o_free_space, kmflag);
2071 	xi->i_dfl = dfl;
2072 
2073 	if (!sync) {
2074 		bp->b_iodone = bd_free_space_done;
2075 		bd_submit(bd, xi);
2076 		return (0);
2077 	}
2078 
2079 	xi->i_flags |= BD_XFER_POLL;
2080 	bd_submit(bd, xi);
2081 
2082 	(void) biowait(bp);
2083 	rv = geterror(bp);
2084 	freerbuf(bp);
2085 
2086 	return (rv);
2087 }
2088 
2089 static int
2090 bd_free_space(dev_t dev, bd_t *bd, dkioc_free_list_t *dfl)
2091 {
2092 	diskaddr_t p_len, p_offset;
2093 	uint64_t offset_bytes, len_bytes;
2094 	minor_t part = BDPART(dev);
2095 	const uint_t bshift = bd->d_blkshift;
2096 	dkioc_free_info_t dfi = {
2097 		.dfi_bshift = bshift,
2098 		.dfi_align = bd->d_free_align << bshift,
2099 		.dfi_max_bytes = bd->d_max_free_blks << bshift,
2100 		.dfi_max_ext = bd->d_max_free_seg,
2101 		.dfi_max_ext_bytes = bd->d_max_free_seg_blks << bshift,
2102 	};
2103 
2104 	if (cmlb_partinfo(bd->d_cmlbh, part, &p_len, &p_offset, NULL,
2105 	    NULL, 0) != 0) {
2106 		dfl_free(dfl);
2107 		return (ENXIO);
2108 	}
2109 
2110 	/*
2111 	 * bd_ioctl created our own copy of dfl, so we can modify as
2112 	 * necessary
2113 	 */
2114 	offset_bytes = (uint64_t)p_offset << bshift;
2115 	len_bytes = (uint64_t)p_len << bshift;
2116 
2117 	dfl->dfl_offset += offset_bytes;
2118 	if (dfl->dfl_offset < offset_bytes) {
2119 		dfl_free(dfl);
2120 		return (EOVERFLOW);
2121 	}
2122 
2123 	return (dfl_iter(dfl, &dfi, offset_bytes + len_bytes, bd_free_space_cb,
2124 	    bd, KM_SLEEP));
2125 }
2126 
2127 /*
2128  * Nexus support.
2129  */
2130 int
2131 bd_bus_ctl(dev_info_t *dip, dev_info_t *rdip, ddi_ctl_enum_t ctlop,
2132     void *arg, void *result)
2133 {
2134 	bd_handle_t	hdl;
2135 
2136 	switch (ctlop) {
2137 	case DDI_CTLOPS_REPORTDEV:
2138 		cmn_err(CE_CONT, "?Block device: %s@%s, %s%d\n",
2139 		    ddi_node_name(rdip), ddi_get_name_addr(rdip),
2140 		    ddi_driver_name(rdip), ddi_get_instance(rdip));
2141 		return (DDI_SUCCESS);
2142 
2143 	case DDI_CTLOPS_INITCHILD:
2144 		hdl = ddi_get_parent_data((dev_info_t *)arg);
2145 		if (hdl == NULL) {
2146 			return (DDI_NOT_WELL_FORMED);
2147 		}
2148 		ddi_set_name_addr((dev_info_t *)arg, hdl->h_addr);
2149 		return (DDI_SUCCESS);
2150 
2151 	case DDI_CTLOPS_UNINITCHILD:
2152 		ddi_set_name_addr((dev_info_t *)arg, NULL);
2153 		ndi_prop_remove_all((dev_info_t *)arg);
2154 		return (DDI_SUCCESS);
2155 
2156 	default:
2157 		return (ddi_ctlops(dip, rdip, ctlop, arg, result));
2158 	}
2159 }
2160 
2161 /*
2162  * Functions for device drivers.
2163  */
2164 bd_handle_t
2165 bd_alloc_handle(void *private, bd_ops_t *ops, ddi_dma_attr_t *dma, int kmflag)
2166 {
2167 	bd_handle_t	hdl;
2168 
2169 	switch (ops->o_version) {
2170 	case BD_OPS_VERSION_0:
2171 	case BD_OPS_VERSION_1:
2172 	case BD_OPS_VERSION_2:
2173 		break;
2174 
2175 	default:
2176 		/* Unsupported version */
2177 		return (NULL);
2178 	}
2179 
2180 	hdl = kmem_zalloc(sizeof (*hdl), kmflag);
2181 	if (hdl == NULL) {
2182 		return (NULL);
2183 	}
2184 
2185 	switch (ops->o_version) {
2186 	case BD_OPS_VERSION_2:
2187 		hdl->h_ops.o_free_space = ops->o_free_space;
2188 		/*FALLTHRU*/
2189 	case BD_OPS_VERSION_1:
2190 	case BD_OPS_VERSION_0:
2191 		hdl->h_ops.o_drive_info = ops->o_drive_info;
2192 		hdl->h_ops.o_media_info = ops->o_media_info;
2193 		hdl->h_ops.o_devid_init = ops->o_devid_init;
2194 		hdl->h_ops.o_sync_cache = ops->o_sync_cache;
2195 		hdl->h_ops.o_read = ops->o_read;
2196 		hdl->h_ops.o_write = ops->o_write;
2197 		break;
2198 	}
2199 
2200 	hdl->h_dma = dma;
2201 	hdl->h_private = private;
2202 
2203 	return (hdl);
2204 }
2205 
2206 void
2207 bd_free_handle(bd_handle_t hdl)
2208 {
2209 	kmem_free(hdl, sizeof (*hdl));
2210 }
2211 
2212 int
2213 bd_attach_handle(dev_info_t *dip, bd_handle_t hdl)
2214 {
2215 	dev_info_t	*child;
2216 	bd_drive_t	drive = { 0 };
2217 
2218 	/*
2219 	 * It's not an error if bd_attach_handle() is called on a handle that
2220 	 * already is attached. We just ignore the request to attach and return.
2221 	 * This way drivers using blkdev don't have to keep track about blkdev
2222 	 * state, they can just call this function to make sure it attached.
2223 	 */
2224 	if (hdl->h_child != NULL) {
2225 		return (DDI_SUCCESS);
2226 	}
2227 
2228 	/* if drivers don't override this, make it assume none */
2229 	drive.d_lun = -1;
2230 	hdl->h_ops.o_drive_info(hdl->h_private, &drive);
2231 
2232 	hdl->h_parent = dip;
2233 	hdl->h_name = "blkdev";
2234 
2235 	/*LINTED: E_BAD_PTR_CAST_ALIGN*/
2236 	if (*(uint64_t *)drive.d_eui64 != 0) {
2237 		if (drive.d_lun >= 0) {
2238 			(void) snprintf(hdl->h_addr, sizeof (hdl->h_addr),
2239 			    "w%02X%02X%02X%02X%02X%02X%02X%02X,%X",
2240 			    drive.d_eui64[0], drive.d_eui64[1],
2241 			    drive.d_eui64[2], drive.d_eui64[3],
2242 			    drive.d_eui64[4], drive.d_eui64[5],
2243 			    drive.d_eui64[6], drive.d_eui64[7], drive.d_lun);
2244 		} else {
2245 			(void) snprintf(hdl->h_addr, sizeof (hdl->h_addr),
2246 			    "w%02X%02X%02X%02X%02X%02X%02X%02X",
2247 			    drive.d_eui64[0], drive.d_eui64[1],
2248 			    drive.d_eui64[2], drive.d_eui64[3],
2249 			    drive.d_eui64[4], drive.d_eui64[5],
2250 			    drive.d_eui64[6], drive.d_eui64[7]);
2251 		}
2252 	} else {
2253 		if (drive.d_lun >= 0) {
2254 			(void) snprintf(hdl->h_addr, sizeof (hdl->h_addr),
2255 			    "%X,%X", drive.d_target, drive.d_lun);
2256 		} else {
2257 			(void) snprintf(hdl->h_addr, sizeof (hdl->h_addr),
2258 			    "%X", drive.d_target);
2259 		}
2260 	}
2261 
2262 	if (ndi_devi_alloc(dip, hdl->h_name, (pnode_t)DEVI_SID_NODEID,
2263 	    &child) != NDI_SUCCESS) {
2264 		cmn_err(CE_WARN, "%s%d: unable to allocate node %s@%s",
2265 		    ddi_driver_name(dip), ddi_get_instance(dip),
2266 		    "blkdev", hdl->h_addr);
2267 		return (DDI_FAILURE);
2268 	}
2269 
2270 	ddi_set_parent_data(child, hdl);
2271 	hdl->h_child = child;
2272 
2273 	if (ndi_devi_online(child, 0) != NDI_SUCCESS) {
2274 		cmn_err(CE_WARN, "%s%d: failed bringing node %s@%s online",
2275 		    ddi_driver_name(dip), ddi_get_instance(dip),
2276 		    hdl->h_name, hdl->h_addr);
2277 		(void) ndi_devi_free(child);
2278 		hdl->h_child = NULL;
2279 		return (DDI_FAILURE);
2280 	}
2281 
2282 	return (DDI_SUCCESS);
2283 }
2284 
2285 int
2286 bd_detach_handle(bd_handle_t hdl)
2287 {
2288 	int	circ;
2289 	int	rv;
2290 	char	*devnm;
2291 
2292 	/*
2293 	 * It's not an error if bd_detach_handle() is called on a handle that
2294 	 * already is detached. We just ignore the request to detach and return.
2295 	 * This way drivers using blkdev don't have to keep track about blkdev
2296 	 * state, they can just call this function to make sure it detached.
2297 	 */
2298 	if (hdl->h_child == NULL) {
2299 		return (DDI_SUCCESS);
2300 	}
2301 	ndi_devi_enter(hdl->h_parent, &circ);
2302 	if (i_ddi_node_state(hdl->h_child) < DS_INITIALIZED) {
2303 		rv = ddi_remove_child(hdl->h_child, 0);
2304 	} else {
2305 		devnm = kmem_alloc(MAXNAMELEN + 1, KM_SLEEP);
2306 		(void) ddi_deviname(hdl->h_child, devnm);
2307 		(void) devfs_clean(hdl->h_parent, devnm + 1, DV_CLEAN_FORCE);
2308 		rv = ndi_devi_unconfig_one(hdl->h_parent, devnm + 1, NULL,
2309 		    NDI_DEVI_REMOVE | NDI_UNCONFIG);
2310 		kmem_free(devnm, MAXNAMELEN + 1);
2311 	}
2312 	if (rv == 0) {
2313 		hdl->h_child = NULL;
2314 	}
2315 
2316 	ndi_devi_exit(hdl->h_parent, circ);
2317 	return (rv == NDI_SUCCESS ? DDI_SUCCESS : DDI_FAILURE);
2318 }
2319 
2320 void
2321 bd_xfer_done(bd_xfer_t *xfer, int err)
2322 {
2323 	bd_xfer_impl_t	*xi = (void *)xfer;
2324 	buf_t		*bp = xi->i_bp;
2325 	int		rv = DDI_SUCCESS;
2326 	bd_t		*bd = xi->i_bd;
2327 	size_t		len;
2328 
2329 	if (err != 0) {
2330 		bd_runq_exit(xi, err);
2331 		atomic_inc_32(&bd->d_kerr->bd_harderrs.value.ui32);
2332 
2333 		bp->b_resid += xi->i_resid;
2334 		bd_xfer_free(xi);
2335 		bioerror(bp, err);
2336 		biodone(bp);
2337 		return;
2338 	}
2339 
2340 	xi->i_cur_win++;
2341 	xi->i_resid -= xi->i_len;
2342 
2343 	if (xi->i_resid == 0) {
2344 		/* Job completed succcessfully! */
2345 		bd_runq_exit(xi, 0);
2346 
2347 		bd_xfer_free(xi);
2348 		biodone(bp);
2349 		return;
2350 	}
2351 
2352 	xi->i_blkno += xi->i_nblks;
2353 
2354 	if (bd->d_use_dma) {
2355 		/* More transfer still pending... advance to next DMA window. */
2356 		rv = ddi_dma_getwin(xi->i_dmah, xi->i_cur_win,
2357 		    &xi->i_offset, &len, &xi->i_dmac, &xi->i_ndmac);
2358 	} else {
2359 		/* Advance memory window. */
2360 		xi->i_kaddr += xi->i_len;
2361 		xi->i_offset += xi->i_len;
2362 		len = min(bp->b_bcount - xi->i_offset, bd->d_maxxfer);
2363 	}
2364 
2365 
2366 	if ((rv != DDI_SUCCESS) ||
2367 	    (P2PHASE(len, (1U << xi->i_blkshift)) != 0)) {
2368 		bd_runq_exit(xi, EFAULT);
2369 
2370 		bp->b_resid += xi->i_resid;
2371 		bd_xfer_free(xi);
2372 		bioerror(bp, EFAULT);
2373 		biodone(bp);
2374 		return;
2375 	}
2376 	xi->i_len = len;
2377 	xi->i_nblks = len >> xi->i_blkshift;
2378 
2379 	/* Submit next window to hardware. */
2380 	rv = xi->i_func(bd->d_private, &xi->i_public);
2381 	if (rv != 0) {
2382 		bd_runq_exit(xi, rv);
2383 
2384 		atomic_inc_32(&bd->d_kerr->bd_transerrs.value.ui32);
2385 
2386 		bp->b_resid += xi->i_resid;
2387 		bd_xfer_free(xi);
2388 		bioerror(bp, rv);
2389 		biodone(bp);
2390 	}
2391 }
2392 
2393 void
2394 bd_error(bd_xfer_t *xfer, int error)
2395 {
2396 	bd_xfer_impl_t	*xi = (void *)xfer;
2397 	bd_t		*bd = xi->i_bd;
2398 
2399 	switch (error) {
2400 	case BD_ERR_MEDIA:
2401 		atomic_inc_32(&bd->d_kerr->bd_rq_media_err.value.ui32);
2402 		break;
2403 	case BD_ERR_NTRDY:
2404 		atomic_inc_32(&bd->d_kerr->bd_rq_ntrdy_err.value.ui32);
2405 		break;
2406 	case BD_ERR_NODEV:
2407 		atomic_inc_32(&bd->d_kerr->bd_rq_nodev_err.value.ui32);
2408 		break;
2409 	case BD_ERR_RECOV:
2410 		atomic_inc_32(&bd->d_kerr->bd_rq_recov_err.value.ui32);
2411 		break;
2412 	case BD_ERR_ILLRQ:
2413 		atomic_inc_32(&bd->d_kerr->bd_rq_illrq_err.value.ui32);
2414 		break;
2415 	case BD_ERR_PFA:
2416 		atomic_inc_32(&bd->d_kerr->bd_rq_pfa_err.value.ui32);
2417 		break;
2418 	default:
2419 		cmn_err(CE_PANIC, "bd_error: unknown error type %d", error);
2420 		break;
2421 	}
2422 }
2423 
2424 void
2425 bd_state_change(bd_handle_t hdl)
2426 {
2427 	bd_t		*bd;
2428 
2429 	if ((bd = hdl->h_bd) != NULL) {
2430 		bd_update_state(bd);
2431 	}
2432 }
2433 
2434 void
2435 bd_mod_init(struct dev_ops *devops)
2436 {
2437 	static struct bus_ops bd_bus_ops = {
2438 		BUSO_REV,		/* busops_rev */
2439 		nullbusmap,		/* bus_map */
2440 		NULL,			/* bus_get_intrspec (OBSOLETE) */
2441 		NULL,			/* bus_add_intrspec (OBSOLETE) */
2442 		NULL,			/* bus_remove_intrspec (OBSOLETE) */
2443 		i_ddi_map_fault,	/* bus_map_fault */
2444 		NULL,			/* bus_dma_map (OBSOLETE) */
2445 		ddi_dma_allochdl,	/* bus_dma_allochdl */
2446 		ddi_dma_freehdl,	/* bus_dma_freehdl */
2447 		ddi_dma_bindhdl,	/* bus_dma_bindhdl */
2448 		ddi_dma_unbindhdl,	/* bus_dma_unbindhdl */
2449 		ddi_dma_flush,		/* bus_dma_flush */
2450 		ddi_dma_win,		/* bus_dma_win */
2451 		ddi_dma_mctl,		/* bus_dma_ctl */
2452 		bd_bus_ctl,		/* bus_ctl */
2453 		ddi_bus_prop_op,	/* bus_prop_op */
2454 		NULL,			/* bus_get_eventcookie */
2455 		NULL,			/* bus_add_eventcall */
2456 		NULL,			/* bus_remove_eventcall */
2457 		NULL,			/* bus_post_event */
2458 		NULL,			/* bus_intr_ctl (OBSOLETE) */
2459 		NULL,			/* bus_config */
2460 		NULL,			/* bus_unconfig */
2461 		NULL,			/* bus_fm_init */
2462 		NULL,			/* bus_fm_fini */
2463 		NULL,			/* bus_fm_access_enter */
2464 		NULL,			/* bus_fm_access_exit */
2465 		NULL,			/* bus_power */
2466 		NULL,			/* bus_intr_op */
2467 	};
2468 
2469 	devops->devo_bus_ops = &bd_bus_ops;
2470 
2471 	/*
2472 	 * NB: The device driver is free to supply its own
2473 	 * character entry device support.
2474 	 */
2475 }
2476 
2477 void
2478 bd_mod_fini(struct dev_ops *devops)
2479 {
2480 	devops->devo_bus_ops = NULL;
2481 }
2482