xref: /illumos-gate/usr/src/uts/common/io/blkdev/blkdev.c (revision c55633c3b85a97a093b3f79f341aee08eb6bd15b)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved.
23  * Copyright 2012 Garrett D'Amore <garrett@damore.org>.  All rights reserved.
24  * Copyright 2012 Alexey Zaytsev <alexey.zaytsev@gmail.com> All rights reserved.
25  * Copyright 2017 The MathWorks, Inc.  All rights reserved.
26  * Copyright 2020 Joyent, Inc.
27  * Copyright 2022 OmniOS Community Edition (OmniOSce) Association.
28  * Copyright 2022 Tintri by DDN, Inc. All rights reserved.
29  */
30 
31 #include <sys/types.h>
32 #include <sys/ksynch.h>
33 #include <sys/kmem.h>
34 #include <sys/file.h>
35 #include <sys/errno.h>
36 #include <sys/open.h>
37 #include <sys/buf.h>
38 #include <sys/uio.h>
39 #include <sys/aio_req.h>
40 #include <sys/cred.h>
41 #include <sys/modctl.h>
42 #include <sys/cmlb.h>
43 #include <sys/conf.h>
44 #include <sys/devops.h>
45 #include <sys/list.h>
46 #include <sys/sysmacros.h>
47 #include <sys/dkio.h>
48 #include <sys/dkioc_free_util.h>
49 #include <sys/vtoc.h>
50 #include <sys/scsi/scsi.h>	/* for DTYPE_DIRECT */
51 #include <sys/kstat.h>
52 #include <sys/fs/dv_node.h>
53 #include <sys/ddi.h>
54 #include <sys/sunddi.h>
55 #include <sys/note.h>
56 #include <sys/blkdev.h>
57 #include <sys/scsi/impl/inquiry.h>
58 #include <sys/taskq.h>
59 #include <sys/taskq_impl.h>
60 #include <sys/disp.h>
61 #include <sys/sysevent/eventdefs.h>
62 #include <sys/sysevent/dev.h>
63 
64 /*
65  * blkdev is a driver which provides a lot of the common functionality
66  * a block device driver may need and helps by removing code which
67  * is frequently duplicated in block device drivers.
68  *
69  * Within this driver all the struct cb_ops functions required for a
70  * block device driver are written with appropriate call back functions
71  * to be provided by the parent driver.
72  *
73  * To use blkdev, a driver needs to:
74  *	1. Create a bd_ops_t structure which has the call back operations
75  *	   blkdev will use.
76  *	2. Create a handle by calling bd_alloc_handle(). One of the
77  *	   arguments to this function is the bd_ops_t.
78  *	3. Call bd_attach_handle(). This will instantiate a blkdev device
79  *	   as a child device node of the calling driver.
80  *
81  * A parent driver is not restricted to just allocating and attaching a
82  * single instance, it may attach as many as it wishes. For each handle
83  * attached, appropriate entries in /dev/[r]dsk are created.
84  *
85  * The bd_ops_t routines that a parent of blkdev needs to provide are:
86  *
87  * o_drive_info: Provide information to blkdev such as how many I/O queues
88  *		 to create and the size of those queues. Also some device
89  *		 specifics such as EUI, vendor, product, model, serial
90  *		 number ....
91  *
92  * o_media_info: Provide information about the media. Eg size and block size.
93  *
94  * o_devid_init: Creates and initializes the device id. Typically calls
95  *		 ddi_devid_init().
96  *
97  * o_sync_cache: Issues a device appropriate command to flush any write
98  *		 caches.
99  *
100  * o_read:	 Read data as described by bd_xfer_t argument.
101  *
102  * o_write:	 Write data as described by bd_xfer_t argument.
103  *
104  * o_free_space: Free the space described by bd_xfer_t argument (optional).
105  *
106  * Queues
107  * ------
108  * Part of the drive_info data is a queue count. blkdev will create
109  * "queue count" number of waitq/runq pairs. Each waitq/runq pair
110  * operates independently. As an I/O is scheduled up to the parent
111  * driver via o_read or o_write its queue number is given. If the
112  * parent driver supports multiple hardware queues it can then select
113  * where to submit the I/O request.
114  *
115  * Currently blkdev uses a simplistic round-robin queue selection method.
116  * It has the advantage that it is lockless. In the future it will be
117  * worthwhile reviewing this strategy for something which prioritizes queues
118  * depending on how busy they are.
119  *
120  * Each waitq/runq pair is protected by its mutex (q_iomutex). Incoming
121  * I/O requests are initially added to the waitq. They are taken off the
122  * waitq, added to the runq and submitted, providing the runq is less
123  * than the qsize as specified in the drive_info. As an I/O request
124  * completes, the parent driver is required to call bd_xfer_done(), which
125  * will remove the I/O request from the runq and pass I/O completion
126  * status up the stack.
127  *
128  * Locks
129  * -----
130  * There are 5 instance-global locks: d_ocmutex, d_ksmutex, d_errmutex,
131  * d_statemutex and d_dle_mutex, as well as a q_iomutex per waitq/runq pair.
132  *
133  * Lock Hierarchy
134  * --------------
135  * The only two locks which may be held simultaneously are q_iomutex and
136  * d_ksmutex. In all cases q_iomutex must be acquired before d_ksmutex.
137  */
138 
/*
 * A minor number encodes both the instance and the partition: the
 * quotient by BD_MAXPART is the instance, the remainder the partition.
 */
#define	BD_MAXPART	64
#define	BDINST(dev)	(getminor(dev) / BD_MAXPART)
#define	BDPART(dev)	(getminor(dev) % BD_MAXPART)
142 
/* Forward declarations of the driver-private types defined below. */
typedef struct bd bd_t;
typedef struct bd_xfer_impl bd_xfer_impl_t;
typedef struct bd_queue bd_queue_t;

/* Bit flags kept in the d_dle_state member of bd_t. */
typedef enum {
	BD_DLE_PENDING	= 0x1,
	BD_DLE_RUNNING	= 0x2
} bd_dle_state_t;
151 
152 struct bd {
153 	void		*d_private;
154 	dev_info_t	*d_dip;
155 	kmutex_t	d_ocmutex;	/* open/close */
156 	kmutex_t	d_ksmutex;	/* kstat */
157 	kmutex_t	d_errmutex;
158 	kmutex_t	d_statemutex;
159 	kcondvar_t	d_statecv;
160 	enum dkio_state	d_state;
161 	cmlb_handle_t	d_cmlbh;
162 	unsigned	d_open_lyr[BD_MAXPART];	/* open count */
163 	uint64_t	d_open_excl;	/* bit mask indexed by partition */
164 	uint64_t	d_open_reg[OTYPCNT];		/* bit mask */
165 	uint64_t	d_io_counter;
166 
167 	uint32_t	d_qcount;
168 	uint32_t	d_qactive;
169 	uint32_t	d_maxxfer;
170 	uint32_t	d_blkshift;
171 	uint32_t	d_pblkshift;
172 	uint64_t	d_numblks;
173 	ddi_devid_t	d_devid;
174 
175 	uint64_t	d_max_free_seg;
176 	uint64_t	d_max_free_blks;
177 	uint64_t	d_max_free_seg_blks;
178 	uint64_t	d_free_align;
179 
180 	kmem_cache_t	*d_cache;
181 	bd_queue_t	*d_queues;
182 	kstat_t		*d_ksp;
183 	kstat_io_t	*d_kiop;
184 	kstat_t		*d_errstats;
185 	struct bd_errstats *d_kerr;
186 
187 	boolean_t	d_rdonly;
188 	boolean_t	d_ssd;
189 	boolean_t	d_removable;
190 	boolean_t	d_hotpluggable;
191 	boolean_t	d_use_dma;
192 
193 	ddi_dma_attr_t	d_dma;
194 	bd_ops_t	d_ops;
195 	bd_handle_t	d_handle;
196 
197 	kmutex_t	d_dle_mutex;
198 	taskq_ent_t	d_dle_ent;
199 	bd_dle_state_t	d_dle_state;
200 };
201 
202 struct bd_handle {
203 	bd_ops_t	h_ops;
204 	ddi_dma_attr_t	*h_dma;
205 	dev_info_t	*h_parent;
206 	dev_info_t	*h_child;
207 	void		*h_private;
208 	bd_t		*h_bd;
209 	char		*h_name;
210 	char		h_addr[30];	/* enough for w%0.16x,%X */
211 };
212 
213 struct bd_xfer_impl {
214 	bd_xfer_t	i_public;
215 	list_node_t	i_linkage;
216 	bd_t		*i_bd;
217 	buf_t		*i_bp;
218 	bd_queue_t	*i_bq;
219 	uint_t		i_num_win;
220 	uint_t		i_cur_win;
221 	off_t		i_offset;
222 	int		(*i_func)(void *, bd_xfer_t *);
223 	uint32_t	i_blkshift;
224 	size_t		i_len;
225 	size_t		i_resid;
226 };
227 
228 struct bd_queue {
229 	kmutex_t	q_iomutex;
230 	uint32_t	q_qsize;
231 	uint32_t	q_qactive;
232 	list_t		q_runq;
233 	list_t		q_waitq;
234 };
235 
/*
 * Shorthand member names so that the fields of the embedded public
 * bd_xfer_t can be reached directly through a bd_xfer_impl_t pointer.
 */
#define	i_dmah		i_public.x_dmah
#define	i_dmac		i_public.x_dmac
#define	i_ndmac		i_public.x_ndmac
#define	i_kaddr		i_public.x_kaddr
#define	i_nblks		i_public.x_nblks
#define	i_blkno		i_public.x_blkno
#define	i_flags		i_public.x_flags
#define	i_qnum		i_public.x_qnum
#define	i_dfl		i_public.x_dfl
245 
/* B_TRUE when the parent driver supplied an o_free_space routine. */
#define	CAN_FREESPACE(bd) \
	(((bd)->d_ops.o_free_space != NULL) ? B_TRUE : B_FALSE)
248 
249 /*
250  * Private prototypes.
251  */
252 
253 static void bd_prop_update_inqstring(dev_info_t *, char *, char *, size_t);
254 static void bd_create_inquiry_props(dev_info_t *, bd_drive_t *);
255 static void bd_create_errstats(bd_t *, int, bd_drive_t *);
256 static void bd_destroy_errstats(bd_t *);
257 static void bd_errstats_setstr(kstat_named_t *, char *, size_t, char *);
258 static void bd_init_errstats(bd_t *, bd_drive_t *);
259 static void bd_fini_errstats(bd_t *);
260 
261 static int bd_getinfo(dev_info_t *, ddi_info_cmd_t, void *, void **);
262 static int bd_attach(dev_info_t *, ddi_attach_cmd_t);
263 static int bd_detach(dev_info_t *, ddi_detach_cmd_t);
264 
265 static int bd_open(dev_t *, int, int, cred_t *);
266 static int bd_close(dev_t, int, int, cred_t *);
267 static int bd_strategy(struct buf *);
268 static int bd_ioctl(dev_t, int, intptr_t, int, cred_t *, int *);
269 static int bd_dump(dev_t, caddr_t, daddr_t, int);
270 static int bd_read(dev_t, struct uio *, cred_t *);
271 static int bd_write(dev_t, struct uio *, cred_t *);
272 static int bd_aread(dev_t, struct aio_req *, cred_t *);
273 static int bd_awrite(dev_t, struct aio_req *, cred_t *);
274 static int bd_prop_op(dev_t, dev_info_t *, ddi_prop_op_t, int, char *,
275     caddr_t, int *);
276 
277 static int bd_tg_rdwr(dev_info_t *, uchar_t, void *, diskaddr_t, size_t,
278     void *);
279 static int bd_tg_getinfo(dev_info_t *, int, void *, void *);
280 static int bd_xfer_ctor(void *, void *, int);
281 static void bd_xfer_dtor(void *, void *);
282 static void bd_sched(bd_t *, bd_queue_t *);
283 static void bd_submit(bd_t *, bd_xfer_impl_t *);
284 static void bd_runq_exit(bd_xfer_impl_t *, int);
285 static void bd_update_state(bd_t *);
286 static int bd_check_state(bd_t *, enum dkio_state *);
287 static int bd_flush_write_cache(bd_t *, struct dk_callback *);
288 static int bd_check_uio(dev_t, struct uio *);
289 static int bd_free_space(dev_t, bd_t *, dkioc_free_list_t *);
290 
291 struct cmlb_tg_ops bd_tg_ops = {
292 	TG_DK_OPS_VERSION_1,
293 	bd_tg_rdwr,
294 	bd_tg_getinfo,
295 };
296 
297 static struct cb_ops bd_cb_ops = {
298 	bd_open,		/* open */
299 	bd_close,		/* close */
300 	bd_strategy,		/* strategy */
301 	nodev,			/* print */
302 	bd_dump,		/* dump */
303 	bd_read,		/* read */
304 	bd_write,		/* write */
305 	bd_ioctl,		/* ioctl */
306 	nodev,			/* devmap */
307 	nodev,			/* mmap */
308 	nodev,			/* segmap */
309 	nochpoll,		/* poll */
310 	bd_prop_op,		/* cb_prop_op */
311 	0,			/* streamtab  */
312 	D_64BIT | D_MP,		/* Driver comaptibility flag */
313 	CB_REV,			/* cb_rev */
314 	bd_aread,		/* async read */
315 	bd_awrite		/* async write */
316 };
317 
318 struct dev_ops bd_dev_ops = {
319 	DEVO_REV,		/* devo_rev, */
320 	0,			/* refcnt  */
321 	bd_getinfo,		/* getinfo */
322 	nulldev,		/* identify */
323 	nulldev,		/* probe */
324 	bd_attach,		/* attach */
325 	bd_detach,		/* detach */
326 	nodev,			/* reset */
327 	&bd_cb_ops,		/* driver operations */
328 	NULL,			/* bus operations */
329 	NULL,			/* power */
330 	ddi_quiesce_not_needed,	/* quiesce */
331 };
332 
333 static struct modldrv modldrv = {
334 	&mod_driverops,
335 	"Generic Block Device",
336 	&bd_dev_ops,
337 };
338 
339 static struct modlinkage modlinkage = {
340 	MODREV_1, { &modldrv, NULL }
341 };
342 
343 static void *bd_state;
344 static krwlock_t bd_lock;
345 static taskq_t *bd_taskq;
346 
347 int
348 _init(void)
349 {
350 	char taskq_name[TASKQ_NAMELEN];
351 	const char *name;
352 	int rv;
353 
354 	rv = ddi_soft_state_init(&bd_state, sizeof (struct bd), 2);
355 	if (rv != DDI_SUCCESS)
356 		return (rv);
357 
358 	name = mod_modname(&modlinkage);
359 	(void) snprintf(taskq_name, sizeof (taskq_name), "%s_taskq", name);
360 	bd_taskq = taskq_create(taskq_name, 1, minclsyspri, 0, 0, 0);
361 	if (bd_taskq == NULL) {
362 		cmn_err(CE_WARN, "%s: unable to create %s", name, taskq_name);
363 		ddi_soft_state_fini(&bd_state);
364 		return (DDI_FAILURE);
365 	}
366 
367 	rw_init(&bd_lock, NULL, RW_DRIVER, NULL);
368 
369 	rv = mod_install(&modlinkage);
370 	if (rv != DDI_SUCCESS) {
371 		rw_destroy(&bd_lock);
372 		taskq_destroy(bd_taskq);
373 		ddi_soft_state_fini(&bd_state);
374 	}
375 	return (rv);
376 }
377 
378 int
379 _fini(void)
380 {
381 	int	rv;
382 
383 	rv = mod_remove(&modlinkage);
384 	if (rv == DDI_SUCCESS) {
385 		rw_destroy(&bd_lock);
386 		taskq_destroy(bd_taskq);
387 		ddi_soft_state_fini(&bd_state);
388 	}
389 	return (rv);
390 }
391 
392 int
393 _info(struct modinfo *modinfop)
394 {
395 	return (mod_info(&modlinkage, modinfop));
396 }
397 
398 static int
399 bd_getinfo(dev_info_t *dip, ddi_info_cmd_t cmd, void *arg, void **resultp)
400 {
401 	bd_t	*bd;
402 	minor_t	inst;
403 
404 	_NOTE(ARGUNUSED(dip));
405 
406 	inst = BDINST((dev_t)arg);
407 
408 	switch (cmd) {
409 	case DDI_INFO_DEVT2DEVINFO:
410 		bd = ddi_get_soft_state(bd_state, inst);
411 		if (bd == NULL) {
412 			return (DDI_FAILURE);
413 		}
414 		*resultp = (void *)bd->d_dip;
415 		break;
416 
417 	case DDI_INFO_DEVT2INSTANCE:
418 		*resultp = (void *)(intptr_t)inst;
419 		break;
420 
421 	default:
422 		return (DDI_FAILURE);
423 	}
424 	return (DDI_SUCCESS);
425 }
426 
427 static void
428 bd_prop_update_inqstring(dev_info_t *dip, char *name, char *data, size_t len)
429 {
430 	int	ilen;
431 	char	*data_string;
432 
433 	ilen = scsi_ascii_inquiry_len(data, len);
434 	ASSERT3U(ilen, <=, len);
435 	if (ilen <= 0)
436 		return;
437 	/* ensure null termination */
438 	data_string = kmem_zalloc(ilen + 1, KM_SLEEP);
439 	bcopy(data, data_string, ilen);
440 	(void) ndi_prop_update_string(DDI_DEV_T_NONE, dip, name, data_string);
441 	kmem_free(data_string, ilen + 1);
442 }
443 
444 static void
445 bd_create_inquiry_props(dev_info_t *dip, bd_drive_t *drive)
446 {
447 	if (drive->d_vendor_len > 0)
448 		bd_prop_update_inqstring(dip, INQUIRY_VENDOR_ID,
449 		    drive->d_vendor, drive->d_vendor_len);
450 
451 	if (drive->d_product_len > 0)
452 		bd_prop_update_inqstring(dip, INQUIRY_PRODUCT_ID,
453 		    drive->d_product, drive->d_product_len);
454 
455 	if (drive->d_serial_len > 0)
456 		bd_prop_update_inqstring(dip, INQUIRY_SERIAL_NO,
457 		    drive->d_serial, drive->d_serial_len);
458 
459 	if (drive->d_revision_len > 0)
460 		bd_prop_update_inqstring(dip, INQUIRY_REVISION_ID,
461 		    drive->d_revision, drive->d_revision_len);
462 }
463 
464 static void
465 bd_create_errstats(bd_t *bd, int inst, bd_drive_t *drive)
466 {
467 	char	ks_module[KSTAT_STRLEN];
468 	char	ks_name[KSTAT_STRLEN];
469 	int	ndata = sizeof (struct bd_errstats) / sizeof (kstat_named_t);
470 
471 	if (bd->d_errstats != NULL)
472 		return;
473 
474 	(void) snprintf(ks_module, sizeof (ks_module), "%serr",
475 	    ddi_driver_name(bd->d_dip));
476 	(void) snprintf(ks_name, sizeof (ks_name), "%s%d,err",
477 	    ddi_driver_name(bd->d_dip), inst);
478 
479 	bd->d_errstats = kstat_create(ks_module, inst, ks_name, "device_error",
480 	    KSTAT_TYPE_NAMED, ndata, KSTAT_FLAG_PERSISTENT);
481 
482 	mutex_init(&bd->d_errmutex, NULL, MUTEX_DRIVER, NULL);
483 	if (bd->d_errstats == NULL) {
484 		/*
485 		 * Even if we cannot create the kstat, we create a
486 		 * scratch kstat.  The reason for this is to ensure
487 		 * that we can update the kstat all of the time,
488 		 * without adding an extra branch instruction.
489 		 */
490 		bd->d_kerr = kmem_zalloc(sizeof (struct bd_errstats),
491 		    KM_SLEEP);
492 	} else {
493 		bd->d_errstats->ks_lock = &bd->d_errmutex;
494 		bd->d_kerr = (struct bd_errstats *)bd->d_errstats->ks_data;
495 	}
496 
497 	kstat_named_init(&bd->d_kerr->bd_softerrs,	"Soft Errors",
498 	    KSTAT_DATA_UINT32);
499 	kstat_named_init(&bd->d_kerr->bd_harderrs,	"Hard Errors",
500 	    KSTAT_DATA_UINT32);
501 	kstat_named_init(&bd->d_kerr->bd_transerrs,	"Transport Errors",
502 	    KSTAT_DATA_UINT32);
503 
504 	if (drive->d_model_len > 0) {
505 		kstat_named_init(&bd->d_kerr->bd_model,	"Model",
506 		    KSTAT_DATA_STRING);
507 	} else {
508 		kstat_named_init(&bd->d_kerr->bd_vid,	"Vendor",
509 		    KSTAT_DATA_STRING);
510 		kstat_named_init(&bd->d_kerr->bd_pid,	"Product",
511 		    KSTAT_DATA_STRING);
512 	}
513 
514 	kstat_named_init(&bd->d_kerr->bd_revision,	"Revision",
515 	    KSTAT_DATA_STRING);
516 	kstat_named_init(&bd->d_kerr->bd_serial,	"Serial No",
517 	    KSTAT_DATA_STRING);
518 	kstat_named_init(&bd->d_kerr->bd_capacity,	"Size",
519 	    KSTAT_DATA_ULONGLONG);
520 	kstat_named_init(&bd->d_kerr->bd_rq_media_err,	"Media Error",
521 	    KSTAT_DATA_UINT32);
522 	kstat_named_init(&bd->d_kerr->bd_rq_ntrdy_err,	"Device Not Ready",
523 	    KSTAT_DATA_UINT32);
524 	kstat_named_init(&bd->d_kerr->bd_rq_nodev_err,	"No Device",
525 	    KSTAT_DATA_UINT32);
526 	kstat_named_init(&bd->d_kerr->bd_rq_recov_err,	"Recoverable",
527 	    KSTAT_DATA_UINT32);
528 	kstat_named_init(&bd->d_kerr->bd_rq_illrq_err,	"Illegal Request",
529 	    KSTAT_DATA_UINT32);
530 	kstat_named_init(&bd->d_kerr->bd_rq_pfa_err,
531 	    "Predictive Failure Analysis", KSTAT_DATA_UINT32);
532 
533 	bd->d_errstats->ks_private = bd;
534 
535 	kstat_install(bd->d_errstats);
536 	bd_init_errstats(bd, drive);
537 }
538 
539 static void
540 bd_destroy_errstats(bd_t *bd)
541 {
542 	if (bd->d_errstats != NULL) {
543 		bd_fini_errstats(bd);
544 		kstat_delete(bd->d_errstats);
545 		bd->d_errstats = NULL;
546 	} else {
547 		kmem_free(bd->d_kerr, sizeof (struct bd_errstats));
548 		bd->d_kerr = NULL;
549 		mutex_destroy(&bd->d_errmutex);
550 	}
551 }
552 
553 static void
554 bd_errstats_setstr(kstat_named_t *k, char *str, size_t len, char *alt)
555 {
556 	char	*tmp;
557 	size_t	km_len;
558 
559 	if (KSTAT_NAMED_STR_PTR(k) == NULL) {
560 		if (len > 0)
561 			km_len = strnlen(str, len);
562 		else if (alt != NULL)
563 			km_len = strlen(alt);
564 		else
565 			return;
566 
567 		tmp = kmem_alloc(km_len + 1, KM_SLEEP);
568 		bcopy(len > 0 ? str : alt, tmp, km_len);
569 		tmp[km_len] = '\0';
570 
571 		kstat_named_setstr(k, tmp);
572 	}
573 }
574 
575 static void
576 bd_errstats_clrstr(kstat_named_t *k)
577 {
578 	if (KSTAT_NAMED_STR_PTR(k) == NULL)
579 		return;
580 
581 	kmem_free(KSTAT_NAMED_STR_PTR(k), KSTAT_NAMED_STR_BUFLEN(k));
582 	kstat_named_setstr(k, NULL);
583 }
584 
585 static void
586 bd_init_errstats(bd_t *bd, bd_drive_t *drive)
587 {
588 	struct bd_errstats	*est = bd->d_kerr;
589 
590 	mutex_enter(&bd->d_errmutex);
591 
592 	if (drive->d_model_len > 0 &&
593 	    KSTAT_NAMED_STR_PTR(&est->bd_model) == NULL) {
594 		bd_errstats_setstr(&est->bd_model, drive->d_model,
595 		    drive->d_model_len, NULL);
596 	} else {
597 		bd_errstats_setstr(&est->bd_vid, drive->d_vendor,
598 		    drive->d_vendor_len, "Unknown ");
599 		bd_errstats_setstr(&est->bd_pid, drive->d_product,
600 		    drive->d_product_len, "Unknown         ");
601 	}
602 
603 	bd_errstats_setstr(&est->bd_revision, drive->d_revision,
604 	    drive->d_revision_len, "0001");
605 	bd_errstats_setstr(&est->bd_serial, drive->d_serial,
606 	    drive->d_serial_len, "0               ");
607 
608 	mutex_exit(&bd->d_errmutex);
609 }
610 
611 static void
612 bd_fini_errstats(bd_t *bd)
613 {
614 	struct bd_errstats	*est = bd->d_kerr;
615 
616 	mutex_enter(&bd->d_errmutex);
617 
618 	bd_errstats_clrstr(&est->bd_model);
619 	bd_errstats_clrstr(&est->bd_vid);
620 	bd_errstats_clrstr(&est->bd_pid);
621 	bd_errstats_clrstr(&est->bd_revision);
622 	bd_errstats_clrstr(&est->bd_serial);
623 
624 	mutex_exit(&bd->d_errmutex);
625 }
626 
627 static void
628 bd_queues_free(bd_t *bd)
629 {
630 	uint32_t i;
631 
632 	for (i = 0; i < bd->d_qcount; i++) {
633 		bd_queue_t *bq = &bd->d_queues[i];
634 
635 		mutex_destroy(&bq->q_iomutex);
636 		list_destroy(&bq->q_waitq);
637 		list_destroy(&bq->q_runq);
638 	}
639 
640 	kmem_free(bd->d_queues, sizeof (*bd->d_queues) * bd->d_qcount);
641 }
642 
643 static int
644 bd_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
645 {
646 	int		inst;
647 	bd_handle_t	hdl;
648 	bd_t		*bd;
649 	bd_drive_t	drive;
650 	uint32_t	i;
651 	int		rv;
652 	char		name[16];
653 	char		kcache[32];
654 
655 	switch (cmd) {
656 	case DDI_ATTACH:
657 		break;
658 	case DDI_RESUME:
659 		/* We don't do anything native for suspend/resume */
660 		return (DDI_SUCCESS);
661 	default:
662 		return (DDI_FAILURE);
663 	}
664 
665 	inst = ddi_get_instance(dip);
666 	hdl = ddi_get_parent_data(dip);
667 
668 	(void) snprintf(name, sizeof (name), "%s%d",
669 	    ddi_driver_name(dip), ddi_get_instance(dip));
670 	(void) snprintf(kcache, sizeof (kcache), "%s_xfer", name);
671 
672 	if (hdl == NULL) {
673 		cmn_err(CE_WARN, "%s: missing parent data!", name);
674 		return (DDI_FAILURE);
675 	}
676 
677 	if (ddi_soft_state_zalloc(bd_state, inst) != DDI_SUCCESS) {
678 		cmn_err(CE_WARN, "%s: unable to zalloc soft state!", name);
679 		return (DDI_FAILURE);
680 	}
681 	bd = ddi_get_soft_state(bd_state, inst);
682 
683 	if (hdl->h_dma) {
684 		bd->d_dma = *(hdl->h_dma);
685 		bd->d_dma.dma_attr_granular =
686 		    max(DEV_BSIZE, bd->d_dma.dma_attr_granular);
687 		bd->d_use_dma = B_TRUE;
688 
689 		if (bd->d_maxxfer &&
690 		    (bd->d_maxxfer != bd->d_dma.dma_attr_maxxfer)) {
691 			cmn_err(CE_WARN,
692 			    "%s: inconsistent maximum transfer size!",
693 			    name);
694 			/* We force it */
695 			bd->d_maxxfer = bd->d_dma.dma_attr_maxxfer;
696 		} else {
697 			bd->d_maxxfer = bd->d_dma.dma_attr_maxxfer;
698 		}
699 	} else {
700 		bd->d_use_dma = B_FALSE;
701 		if (bd->d_maxxfer == 0) {
702 			bd->d_maxxfer = 1024 * 1024;
703 		}
704 	}
705 	bd->d_ops = hdl->h_ops;
706 	bd->d_private = hdl->h_private;
707 	bd->d_blkshift = DEV_BSHIFT;	/* 512 bytes, to start */
708 
709 	if (bd->d_maxxfer % DEV_BSIZE) {
710 		cmn_err(CE_WARN, "%s: maximum transfer misaligned!", name);
711 		bd->d_maxxfer &= ~(DEV_BSIZE - 1);
712 	}
713 	if (bd->d_maxxfer < DEV_BSIZE) {
714 		cmn_err(CE_WARN, "%s: maximum transfer size too small!", name);
715 		ddi_soft_state_free(bd_state, inst);
716 		return (DDI_FAILURE);
717 	}
718 
719 	bd->d_dip = dip;
720 	bd->d_handle = hdl;
721 	ddi_set_driver_private(dip, bd);
722 
723 	mutex_init(&bd->d_ksmutex, NULL, MUTEX_DRIVER, NULL);
724 	mutex_init(&bd->d_ocmutex, NULL, MUTEX_DRIVER, NULL);
725 	mutex_init(&bd->d_statemutex, NULL, MUTEX_DRIVER, NULL);
726 	cv_init(&bd->d_statecv, NULL, CV_DRIVER, NULL);
727 	mutex_init(&bd->d_dle_mutex, NULL, MUTEX_DRIVER, NULL);
728 	bd->d_dle_state = 0;
729 
730 	bd->d_cache = kmem_cache_create(kcache, sizeof (bd_xfer_impl_t), 8,
731 	    bd_xfer_ctor, bd_xfer_dtor, NULL, bd, NULL, 0);
732 
733 	bd->d_ksp = kstat_create(ddi_driver_name(dip), inst, NULL, "disk",
734 	    KSTAT_TYPE_IO, 1, KSTAT_FLAG_PERSISTENT);
735 	if (bd->d_ksp != NULL) {
736 		bd->d_ksp->ks_lock = &bd->d_ksmutex;
737 		kstat_install(bd->d_ksp);
738 		bd->d_kiop = bd->d_ksp->ks_data;
739 	} else {
740 		/*
741 		 * Even if we cannot create the kstat, we create a
742 		 * scratch kstat.  The reason for this is to ensure
743 		 * that we can update the kstat all of the time,
744 		 * without adding an extra branch instruction.
745 		 */
746 		bd->d_kiop = kmem_zalloc(sizeof (kstat_io_t), KM_SLEEP);
747 	}
748 
749 	cmlb_alloc_handle(&bd->d_cmlbh);
750 
751 	bd->d_state = DKIO_NONE;
752 
753 	bzero(&drive, sizeof (drive));
754 	/*
755 	 * Default to one queue, and no restrictions on free space requests
756 	 * (if driver provides method) parent driver can override.
757 	 */
758 	drive.d_qcount = 1;
759 	drive.d_free_align = 1;
760 	bd->d_ops.o_drive_info(bd->d_private, &drive);
761 
762 	/*
763 	 * Several checks to make sure o_drive_info() didn't return bad
764 	 * values:
765 	 *
766 	 * There must be at least one queue
767 	 */
768 	if (drive.d_qcount == 0)
769 		goto fail_drive_info;
770 
771 	/* FREE/UNMAP/TRIM alignment needs to be at least 1 block */
772 	if (drive.d_free_align == 0)
773 		goto fail_drive_info;
774 
775 	/*
776 	 * If d_max_free_blks is not unlimited (not 0), then we cannot allow
777 	 * an unlimited segment size. It is however permissible to not impose
778 	 * a limit on the total number of blocks freed while limiting the
779 	 * amount allowed in an individual segment.
780 	 */
781 	if ((drive.d_max_free_blks > 0 && drive.d_max_free_seg_blks == 0))
782 		goto fail_drive_info;
783 
784 	/*
785 	 * If a limit is set on d_max_free_blks (by the above check, we know
786 	 * if there's a limit on d_max_free_blks, d_max_free_seg_blks cannot
787 	 * be unlimited), it cannot be smaller than the limit on an individual
788 	 * segment.
789 	 */
790 	if ((drive.d_max_free_blks > 0 &&
791 	    drive.d_max_free_seg_blks > drive.d_max_free_blks)) {
792 		goto fail_drive_info;
793 	}
794 
795 	bd->d_qcount = drive.d_qcount;
796 	bd->d_removable = drive.d_removable;
797 	bd->d_hotpluggable = drive.d_hotpluggable;
798 
799 	if (drive.d_maxxfer && drive.d_maxxfer < bd->d_maxxfer)
800 		bd->d_maxxfer = drive.d_maxxfer;
801 
802 	bd->d_free_align = drive.d_free_align;
803 	bd->d_max_free_seg = drive.d_max_free_seg;
804 	bd->d_max_free_blks = drive.d_max_free_blks;
805 	bd->d_max_free_seg_blks = drive.d_max_free_seg_blks;
806 
807 	bd_create_inquiry_props(dip, &drive);
808 	bd_create_errstats(bd, inst, &drive);
809 	bd_update_state(bd);
810 
811 	bd->d_queues = kmem_alloc(sizeof (*bd->d_queues) * bd->d_qcount,
812 	    KM_SLEEP);
813 	for (i = 0; i < bd->d_qcount; i++) {
814 		bd_queue_t *bq = &bd->d_queues[i];
815 
816 		bq->q_qsize = drive.d_qsize;
817 		bq->q_qactive = 0;
818 		mutex_init(&bq->q_iomutex, NULL, MUTEX_DRIVER, NULL);
819 
820 		list_create(&bq->q_waitq, sizeof (bd_xfer_impl_t),
821 		    offsetof(struct bd_xfer_impl, i_linkage));
822 		list_create(&bq->q_runq, sizeof (bd_xfer_impl_t),
823 		    offsetof(struct bd_xfer_impl, i_linkage));
824 	}
825 
826 	rv = cmlb_attach(dip, &bd_tg_ops, DTYPE_DIRECT,
827 	    bd->d_removable, bd->d_hotpluggable,
828 	    /*LINTED: E_BAD_PTR_CAST_ALIGN*/
829 	    *(uint64_t *)drive.d_eui64 != 0 ? DDI_NT_BLOCK_BLKDEV :
830 	    drive.d_lun >= 0 ? DDI_NT_BLOCK_CHAN : DDI_NT_BLOCK,
831 	    CMLB_FAKE_LABEL_ONE_PARTITION, bd->d_cmlbh, 0);
832 	if (rv != 0) {
833 		goto fail_cmlb_attach;
834 	}
835 
836 	if (bd->d_ops.o_devid_init != NULL) {
837 		rv = bd->d_ops.o_devid_init(bd->d_private, dip, &bd->d_devid);
838 		if (rv == DDI_SUCCESS) {
839 			if (ddi_devid_register(dip, bd->d_devid) !=
840 			    DDI_SUCCESS) {
841 				cmn_err(CE_WARN,
842 				    "%s: unable to register devid", name);
843 			}
844 		}
845 	}
846 
847 	/*
848 	 * Add a zero-length attribute to tell the world we support
849 	 * kernel ioctls (for layered drivers).  Also set up properties
850 	 * used by HAL to identify removable media.
851 	 */
852 	(void) ddi_prop_create(DDI_DEV_T_NONE, dip, DDI_PROP_CANSLEEP,
853 	    DDI_KERNEL_IOCTL, NULL, 0);
854 	if (bd->d_removable) {
855 		(void) ddi_prop_create(DDI_DEV_T_NONE, dip, DDI_PROP_CANSLEEP,
856 		    "removable-media", NULL, 0);
857 	}
858 	if (bd->d_hotpluggable) {
859 		(void) ddi_prop_create(DDI_DEV_T_NONE, dip, DDI_PROP_CANSLEEP,
860 		    "hotpluggable", NULL, 0);
861 	}
862 
863 	hdl->h_bd = bd;
864 	ddi_report_dev(dip);
865 
866 	return (DDI_SUCCESS);
867 
868 fail_cmlb_attach:
869 	bd_queues_free(bd);
870 	bd_destroy_errstats(bd);
871 
872 fail_drive_info:
873 	cmlb_free_handle(&bd->d_cmlbh);
874 
875 	if (bd->d_ksp != NULL) {
876 		kstat_delete(bd->d_ksp);
877 		bd->d_ksp = NULL;
878 	} else {
879 		kmem_free(bd->d_kiop, sizeof (kstat_io_t));
880 	}
881 
882 	kmem_cache_destroy(bd->d_cache);
883 	cv_destroy(&bd->d_statecv);
884 	mutex_destroy(&bd->d_statemutex);
885 	mutex_destroy(&bd->d_ocmutex);
886 	mutex_destroy(&bd->d_ksmutex);
887 	mutex_destroy(&bd->d_dle_mutex);
888 	ddi_soft_state_free(bd_state, inst);
889 	return (DDI_FAILURE);
890 }
891 
892 static int
893 bd_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
894 {
895 	bd_handle_t	hdl;
896 	bd_t		*bd;
897 
898 	bd = ddi_get_driver_private(dip);
899 	hdl = ddi_get_parent_data(dip);
900 
901 	switch (cmd) {
902 	case DDI_DETACH:
903 		break;
904 	case DDI_SUSPEND:
905 		/* We don't suspend, but our parent does */
906 		return (DDI_SUCCESS);
907 	default:
908 		return (DDI_FAILURE);
909 	}
910 
911 	hdl->h_bd = NULL;
912 
913 	if (bd->d_ksp != NULL) {
914 		kstat_delete(bd->d_ksp);
915 		bd->d_ksp = NULL;
916 	} else {
917 		kmem_free(bd->d_kiop, sizeof (kstat_io_t));
918 	}
919 
920 	bd_destroy_errstats(bd);
921 	cmlb_detach(bd->d_cmlbh, 0);
922 	cmlb_free_handle(&bd->d_cmlbh);
923 	if (bd->d_devid)
924 		ddi_devid_free(bd->d_devid);
925 	kmem_cache_destroy(bd->d_cache);
926 	mutex_destroy(&bd->d_ksmutex);
927 	mutex_destroy(&bd->d_ocmutex);
928 	mutex_destroy(&bd->d_statemutex);
929 	cv_destroy(&bd->d_statecv);
930 	mutex_destroy(&bd->d_dle_mutex);
931 	bd_queues_free(bd);
932 	ddi_soft_state_free(bd_state, ddi_get_instance(dip));
933 	return (DDI_SUCCESS);
934 }
935 
936 static int
937 bd_xfer_ctor(void *buf, void *arg, int kmflag)
938 {
939 	bd_xfer_impl_t	*xi;
940 	bd_t		*bd = arg;
941 	int		(*dcb)(caddr_t);
942 
943 	if (kmflag == KM_PUSHPAGE || kmflag == KM_SLEEP) {
944 		dcb = DDI_DMA_SLEEP;
945 	} else {
946 		dcb = DDI_DMA_DONTWAIT;
947 	}
948 
949 	xi = buf;
950 	bzero(xi, sizeof (*xi));
951 	xi->i_bd = bd;
952 
953 	if (bd->d_use_dma) {
954 		if (ddi_dma_alloc_handle(bd->d_dip, &bd->d_dma, dcb, NULL,
955 		    &xi->i_dmah) != DDI_SUCCESS) {
956 			return (-1);
957 		}
958 	}
959 
960 	return (0);
961 }
962 
963 static void
964 bd_xfer_dtor(void *buf, void *arg)
965 {
966 	bd_xfer_impl_t	*xi = buf;
967 
968 	_NOTE(ARGUNUSED(arg));
969 
970 	if (xi->i_dmah)
971 		ddi_dma_free_handle(&xi->i_dmah);
972 	xi->i_dmah = NULL;
973 }
974 
975 static bd_xfer_impl_t *
976 bd_xfer_alloc(bd_t *bd, struct buf *bp, int (*func)(void *, bd_xfer_t *),
977     int kmflag)
978 {
979 	bd_xfer_impl_t		*xi;
980 	int			rv = 0;
981 	int			status;
982 	unsigned		dir;
983 	int			(*cb)(caddr_t);
984 	size_t			len;
985 	uint32_t		shift;
986 
987 	if (kmflag == KM_SLEEP) {
988 		cb = DDI_DMA_SLEEP;
989 	} else {
990 		cb = DDI_DMA_DONTWAIT;
991 	}
992 
993 	xi = kmem_cache_alloc(bd->d_cache, kmflag);
994 	if (xi == NULL) {
995 		bioerror(bp, ENOMEM);
996 		return (NULL);
997 	}
998 
999 	ASSERT(bp);
1000 
1001 	xi->i_bp = bp;
1002 	xi->i_func = func;
1003 	xi->i_blkno = bp->b_lblkno >> (bd->d_blkshift - DEV_BSHIFT);
1004 
1005 	if (bp->b_bcount == 0) {
1006 		xi->i_len = 0;
1007 		xi->i_nblks = 0;
1008 		xi->i_kaddr = NULL;
1009 		xi->i_resid = 0;
1010 		xi->i_num_win = 0;
1011 		goto done;
1012 	}
1013 
1014 	if (bp->b_flags & B_READ) {
1015 		dir = DDI_DMA_READ;
1016 		xi->i_func = bd->d_ops.o_read;
1017 	} else {
1018 		dir = DDI_DMA_WRITE;
1019 		xi->i_func = bd->d_ops.o_write;
1020 	}
1021 
1022 	shift = bd->d_blkshift;
1023 	xi->i_blkshift = shift;
1024 
1025 	if (!bd->d_use_dma) {
1026 		bp_mapin(bp);
1027 		rv = 0;
1028 		xi->i_offset = 0;
1029 		xi->i_num_win =
1030 		    (bp->b_bcount + (bd->d_maxxfer - 1)) / bd->d_maxxfer;
1031 		xi->i_cur_win = 0;
1032 		xi->i_len = min(bp->b_bcount, bd->d_maxxfer);
1033 		xi->i_nblks = xi->i_len >> shift;
1034 		xi->i_kaddr = bp->b_un.b_addr;
1035 		xi->i_resid = bp->b_bcount;
1036 	} else {
1037 
1038 		/*
1039 		 * We have to use consistent DMA if the address is misaligned.
1040 		 */
1041 		if (((bp->b_flags & (B_PAGEIO | B_REMAPPED)) != B_PAGEIO) &&
1042 		    ((uintptr_t)bp->b_un.b_addr & 0x7)) {
1043 			dir |= DDI_DMA_CONSISTENT | DDI_DMA_PARTIAL;
1044 		} else {
1045 			dir |= DDI_DMA_STREAMING | DDI_DMA_PARTIAL;
1046 		}
1047 
1048 		status = ddi_dma_buf_bind_handle(xi->i_dmah, bp, dir, cb,
1049 		    NULL, &xi->i_dmac, &xi->i_ndmac);
1050 		switch (status) {
1051 		case DDI_DMA_MAPPED:
1052 			xi->i_num_win = 1;
1053 			xi->i_cur_win = 0;
1054 			xi->i_offset = 0;
1055 			xi->i_len = bp->b_bcount;
1056 			xi->i_nblks = xi->i_len >> shift;
1057 			xi->i_resid = bp->b_bcount;
1058 			rv = 0;
1059 			break;
1060 		case DDI_DMA_PARTIAL_MAP:
1061 			xi->i_cur_win = 0;
1062 
1063 			if ((ddi_dma_numwin(xi->i_dmah, &xi->i_num_win) !=
1064 			    DDI_SUCCESS) ||
1065 			    (ddi_dma_getwin(xi->i_dmah, 0, &xi->i_offset,
1066 			    &len, &xi->i_dmac, &xi->i_ndmac) !=
1067 			    DDI_SUCCESS) ||
1068 			    (P2PHASE(len, (1U << shift)) != 0)) {
1069 				(void) ddi_dma_unbind_handle(xi->i_dmah);
1070 				rv = EFAULT;
1071 				goto done;
1072 			}
1073 			xi->i_len = len;
1074 			xi->i_nblks = xi->i_len >> shift;
1075 			xi->i_resid = bp->b_bcount;
1076 			rv = 0;
1077 			break;
1078 		case DDI_DMA_NORESOURCES:
1079 			rv = EAGAIN;
1080 			goto done;
1081 		case DDI_DMA_TOOBIG:
1082 			rv = EINVAL;
1083 			goto done;
1084 		case DDI_DMA_NOMAPPING:
1085 		case DDI_DMA_INUSE:
1086 		default:
1087 			rv = EFAULT;
1088 			goto done;
1089 		}
1090 	}
1091 
1092 done:
1093 	if (rv != 0) {
1094 		kmem_cache_free(bd->d_cache, xi);
1095 		bioerror(bp, rv);
1096 		return (NULL);
1097 	}
1098 
1099 	return (xi);
1100 }
1101 
1102 static void
1103 bd_xfer_free(bd_xfer_impl_t *xi)
1104 {
1105 	if (xi->i_dmah) {
1106 		(void) ddi_dma_unbind_handle(xi->i_dmah);
1107 	}
1108 	if (xi->i_dfl != NULL) {
1109 		dfl_free((dkioc_free_list_t *)xi->i_dfl);
1110 		xi->i_dfl = NULL;
1111 	}
1112 	kmem_cache_free(xi->i_bd->d_cache, xi);
1113 }
1114 
1115 static int
1116 bd_open(dev_t *devp, int flag, int otyp, cred_t *credp)
1117 {
1118 	dev_t		dev = *devp;
1119 	bd_t		*bd;
1120 	minor_t		part;
1121 	minor_t		inst;
1122 	uint64_t	mask;
1123 	boolean_t	ndelay;
1124 	int		rv;
1125 	diskaddr_t	nblks;
1126 	diskaddr_t	lba;
1127 
1128 	_NOTE(ARGUNUSED(credp));
1129 
1130 	part = BDPART(dev);
1131 	inst = BDINST(dev);
1132 
1133 	if (otyp >= OTYPCNT)
1134 		return (EINVAL);
1135 
1136 	ndelay = (flag & (FNDELAY | FNONBLOCK)) ? B_TRUE : B_FALSE;
1137 
1138 	/*
1139 	 * Block any DR events from changing the set of registered
1140 	 * devices while we function.
1141 	 */
1142 	rw_enter(&bd_lock, RW_READER);
1143 	if ((bd = ddi_get_soft_state(bd_state, inst)) == NULL) {
1144 		rw_exit(&bd_lock);
1145 		return (ENXIO);
1146 	}
1147 
1148 	mutex_enter(&bd->d_ocmutex);
1149 
1150 	ASSERT(part < 64);
1151 	mask = (1U << part);
1152 
1153 	bd_update_state(bd);
1154 
1155 	if (cmlb_validate(bd->d_cmlbh, 0, 0) != 0) {
1156 
1157 		/* non-blocking opens are allowed to succeed */
1158 		if (!ndelay) {
1159 			rv = ENXIO;
1160 			goto done;
1161 		}
1162 	} else if (cmlb_partinfo(bd->d_cmlbh, part, &nblks, &lba,
1163 	    NULL, NULL, 0) == 0) {
1164 
1165 		/*
1166 		 * We read the partinfo, verify valid ranges.  If the
1167 		 * partition is invalid, and we aren't blocking or
1168 		 * doing a raw access, then fail. (Non-blocking and
1169 		 * raw accesses can still succeed to allow a disk with
1170 		 * bad partition data to opened by format and fdisk.)
1171 		 */
1172 		if ((!nblks) && ((!ndelay) || (otyp != OTYP_CHR))) {
1173 			rv = ENXIO;
1174 			goto done;
1175 		}
1176 	} else if (!ndelay) {
1177 		/*
1178 		 * cmlb_partinfo failed -- invalid partition or no
1179 		 * disk label.
1180 		 */
1181 		rv = ENXIO;
1182 		goto done;
1183 	}
1184 
1185 	if ((flag & FWRITE) && bd->d_rdonly) {
1186 		rv = EROFS;
1187 		goto done;
1188 	}
1189 
1190 	if ((bd->d_open_excl) & (mask)) {
1191 		rv = EBUSY;
1192 		goto done;
1193 	}
1194 	if (flag & FEXCL) {
1195 		if (bd->d_open_lyr[part]) {
1196 			rv = EBUSY;
1197 			goto done;
1198 		}
1199 		for (int i = 0; i < OTYP_LYR; i++) {
1200 			if (bd->d_open_reg[i] & mask) {
1201 				rv = EBUSY;
1202 				goto done;
1203 			}
1204 		}
1205 	}
1206 
1207 	if (otyp == OTYP_LYR) {
1208 		bd->d_open_lyr[part]++;
1209 	} else {
1210 		bd->d_open_reg[otyp] |= mask;
1211 	}
1212 	if (flag & FEXCL) {
1213 		bd->d_open_excl |= mask;
1214 	}
1215 
1216 	rv = 0;
1217 done:
1218 	mutex_exit(&bd->d_ocmutex);
1219 	rw_exit(&bd_lock);
1220 
1221 	return (rv);
1222 }
1223 
1224 static int
1225 bd_close(dev_t dev, int flag, int otyp, cred_t *credp)
1226 {
1227 	bd_t		*bd;
1228 	minor_t		inst;
1229 	minor_t		part;
1230 	uint64_t	mask;
1231 	boolean_t	last = B_TRUE;
1232 
1233 	_NOTE(ARGUNUSED(flag));
1234 	_NOTE(ARGUNUSED(credp));
1235 
1236 	part = BDPART(dev);
1237 	inst = BDINST(dev);
1238 
1239 	ASSERT(part < 64);
1240 	mask = (1U << part);
1241 
1242 	rw_enter(&bd_lock, RW_READER);
1243 
1244 	if ((bd = ddi_get_soft_state(bd_state, inst)) == NULL) {
1245 		rw_exit(&bd_lock);
1246 		return (ENXIO);
1247 	}
1248 
1249 	mutex_enter(&bd->d_ocmutex);
1250 	if (bd->d_open_excl & mask) {
1251 		bd->d_open_excl &= ~mask;
1252 	}
1253 	if (otyp == OTYP_LYR) {
1254 		bd->d_open_lyr[part]--;
1255 	} else {
1256 		bd->d_open_reg[otyp] &= ~mask;
1257 	}
1258 	for (int i = 0; i < 64; i++) {
1259 		if (bd->d_open_lyr[part]) {
1260 			last = B_FALSE;
1261 		}
1262 	}
1263 	for (int i = 0; last && (i < OTYP_LYR); i++) {
1264 		if (bd->d_open_reg[i]) {
1265 			last = B_FALSE;
1266 		}
1267 	}
1268 	mutex_exit(&bd->d_ocmutex);
1269 
1270 	if (last) {
1271 		cmlb_invalidate(bd->d_cmlbh, 0);
1272 	}
1273 	rw_exit(&bd_lock);
1274 
1275 	return (0);
1276 }
1277 
1278 static int
1279 bd_dump(dev_t dev, caddr_t caddr, daddr_t blkno, int nblk)
1280 {
1281 	minor_t		inst;
1282 	minor_t		part;
1283 	diskaddr_t	pstart;
1284 	diskaddr_t	psize;
1285 	bd_t		*bd;
1286 	bd_xfer_impl_t	*xi;
1287 	buf_t		*bp;
1288 	int		rv;
1289 	uint32_t	shift;
1290 	daddr_t		d_blkno;
1291 	int	d_nblk;
1292 
1293 	rw_enter(&bd_lock, RW_READER);
1294 
1295 	part = BDPART(dev);
1296 	inst = BDINST(dev);
1297 
1298 	if ((bd = ddi_get_soft_state(bd_state, inst)) == NULL) {
1299 		rw_exit(&bd_lock);
1300 		return (ENXIO);
1301 	}
1302 	shift = bd->d_blkshift;
1303 	d_blkno = blkno >> (shift - DEV_BSHIFT);
1304 	d_nblk = nblk >> (shift - DEV_BSHIFT);
1305 	/*
1306 	 * do cmlb, but do it synchronously unless we already have the
1307 	 * partition (which we probably should.)
1308 	 */
1309 	if (cmlb_partinfo(bd->d_cmlbh, part, &psize, &pstart, NULL, NULL,
1310 	    (void *)1)) {
1311 		rw_exit(&bd_lock);
1312 		return (ENXIO);
1313 	}
1314 
1315 	if ((d_blkno + d_nblk) > psize) {
1316 		rw_exit(&bd_lock);
1317 		return (EINVAL);
1318 	}
1319 	bp = getrbuf(KM_NOSLEEP);
1320 	if (bp == NULL) {
1321 		rw_exit(&bd_lock);
1322 		return (ENOMEM);
1323 	}
1324 
1325 	bp->b_bcount = nblk << DEV_BSHIFT;
1326 	bp->b_resid = bp->b_bcount;
1327 	bp->b_lblkno = blkno;
1328 	bp->b_un.b_addr = caddr;
1329 
1330 	xi = bd_xfer_alloc(bd, bp,  bd->d_ops.o_write, KM_NOSLEEP);
1331 	if (xi == NULL) {
1332 		rw_exit(&bd_lock);
1333 		freerbuf(bp);
1334 		return (ENOMEM);
1335 	}
1336 	xi->i_blkno = d_blkno + pstart;
1337 	xi->i_flags = BD_XFER_POLL;
1338 	bd_submit(bd, xi);
1339 	rw_exit(&bd_lock);
1340 
1341 	/*
1342 	 * Generally, we should have run this entirely synchronously
1343 	 * at this point and the biowait call should be a no-op.  If
1344 	 * it didn't happen this way, it's a bug in the underlying
1345 	 * driver not honoring BD_XFER_POLL.
1346 	 */
1347 	(void) biowait(bp);
1348 	rv = geterror(bp);
1349 	freerbuf(bp);
1350 	return (rv);
1351 }
1352 
1353 void
1354 bd_minphys(struct buf *bp)
1355 {
1356 	minor_t inst;
1357 	bd_t	*bd;
1358 	inst = BDINST(bp->b_edev);
1359 
1360 	bd = ddi_get_soft_state(bd_state, inst);
1361 
1362 	/*
1363 	 * In a non-debug kernel, bd_strategy will catch !bd as
1364 	 * well, and will fail nicely.
1365 	 */
1366 	ASSERT(bd);
1367 
1368 	if (bp->b_bcount > bd->d_maxxfer)
1369 		bp->b_bcount = bd->d_maxxfer;
1370 }
1371 
1372 static int
1373 bd_check_uio(dev_t dev, struct uio *uio)
1374 {
1375 	bd_t		*bd;
1376 	uint32_t	shift;
1377 
1378 	if ((bd = ddi_get_soft_state(bd_state, BDINST(dev))) == NULL) {
1379 		return (ENXIO);
1380 	}
1381 
1382 	shift = bd->d_blkshift;
1383 	if ((P2PHASE(uio->uio_loffset, (1U << shift)) != 0) ||
1384 	    (P2PHASE(uio->uio_iov->iov_len, (1U << shift)) != 0)) {
1385 		return (EINVAL);
1386 	}
1387 
1388 	return (0);
1389 }
1390 
1391 static int
1392 bd_read(dev_t dev, struct uio *uio, cred_t *credp)
1393 {
1394 	_NOTE(ARGUNUSED(credp));
1395 	int	ret = bd_check_uio(dev, uio);
1396 	if (ret != 0) {
1397 		return (ret);
1398 	}
1399 	return (physio(bd_strategy, NULL, dev, B_READ, bd_minphys, uio));
1400 }
1401 
1402 static int
1403 bd_write(dev_t dev, struct uio *uio, cred_t *credp)
1404 {
1405 	_NOTE(ARGUNUSED(credp));
1406 	int	ret = bd_check_uio(dev, uio);
1407 	if (ret != 0) {
1408 		return (ret);
1409 	}
1410 	return (physio(bd_strategy, NULL, dev, B_WRITE, bd_minphys, uio));
1411 }
1412 
1413 static int
1414 bd_aread(dev_t dev, struct aio_req *aio, cred_t *credp)
1415 {
1416 	_NOTE(ARGUNUSED(credp));
1417 	int	ret = bd_check_uio(dev, aio->aio_uio);
1418 	if (ret != 0) {
1419 		return (ret);
1420 	}
1421 	return (aphysio(bd_strategy, anocancel, dev, B_READ, bd_minphys, aio));
1422 }
1423 
1424 static int
1425 bd_awrite(dev_t dev, struct aio_req *aio, cred_t *credp)
1426 {
1427 	_NOTE(ARGUNUSED(credp));
1428 	int	ret = bd_check_uio(dev, aio->aio_uio);
1429 	if (ret != 0) {
1430 		return (ret);
1431 	}
1432 	return (aphysio(bd_strategy, anocancel, dev, B_WRITE, bd_minphys, aio));
1433 }
1434 
1435 static int
1436 bd_strategy(struct buf *bp)
1437 {
1438 	minor_t		inst;
1439 	minor_t		part;
1440 	bd_t		*bd;
1441 	diskaddr_t	p_lba;
1442 	diskaddr_t	p_nblks;
1443 	diskaddr_t	b_nblks;
1444 	bd_xfer_impl_t	*xi;
1445 	uint32_t	shift;
1446 	int		(*func)(void *, bd_xfer_t *);
1447 	diskaddr_t	lblkno;
1448 
1449 	part = BDPART(bp->b_edev);
1450 	inst = BDINST(bp->b_edev);
1451 
1452 	ASSERT(bp);
1453 
1454 	bp->b_resid = bp->b_bcount;
1455 
1456 	if ((bd = ddi_get_soft_state(bd_state, inst)) == NULL) {
1457 		bioerror(bp, ENXIO);
1458 		biodone(bp);
1459 		return (0);
1460 	}
1461 
1462 	if (cmlb_partinfo(bd->d_cmlbh, part, &p_nblks, &p_lba,
1463 	    NULL, NULL, 0)) {
1464 		bioerror(bp, ENXIO);
1465 		biodone(bp);
1466 		return (0);
1467 	}
1468 
1469 	shift = bd->d_blkshift;
1470 	lblkno = bp->b_lblkno >> (shift - DEV_BSHIFT);
1471 	if ((P2PHASE(bp->b_lblkno, (1U << (shift - DEV_BSHIFT))) != 0) ||
1472 	    (P2PHASE(bp->b_bcount, (1U << shift)) != 0) ||
1473 	    (lblkno > p_nblks)) {
1474 		bioerror(bp, EINVAL);
1475 		biodone(bp);
1476 		return (0);
1477 	}
1478 	b_nblks = bp->b_bcount >> shift;
1479 	if ((lblkno == p_nblks) || (bp->b_bcount == 0)) {
1480 		biodone(bp);
1481 		return (0);
1482 	}
1483 
1484 	if ((b_nblks + lblkno) > p_nblks) {
1485 		bp->b_resid = ((lblkno + b_nblks - p_nblks) << shift);
1486 		bp->b_bcount -= bp->b_resid;
1487 	} else {
1488 		bp->b_resid = 0;
1489 	}
1490 	func = (bp->b_flags & B_READ) ? bd->d_ops.o_read : bd->d_ops.o_write;
1491 
1492 	xi = bd_xfer_alloc(bd, bp, func, KM_NOSLEEP);
1493 	if (xi == NULL) {
1494 		xi = bd_xfer_alloc(bd, bp, func, KM_PUSHPAGE);
1495 	}
1496 	if (xi == NULL) {
1497 		/* bd_request_alloc will have done bioerror */
1498 		biodone(bp);
1499 		return (0);
1500 	}
1501 	xi->i_blkno = lblkno + p_lba;
1502 
1503 	bd_submit(bd, xi);
1504 
1505 	return (0);
1506 }
1507 
1508 static int
1509 bd_ioctl(dev_t dev, int cmd, intptr_t arg, int flag, cred_t *credp, int *rvalp)
1510 {
1511 	minor_t		inst;
1512 	uint16_t	part;
1513 	bd_t		*bd;
1514 	void		*ptr = (void *)arg;
1515 	int		rv;
1516 
1517 	part = BDPART(dev);
1518 	inst = BDINST(dev);
1519 
1520 	if ((bd = ddi_get_soft_state(bd_state, inst)) == NULL) {
1521 		return (ENXIO);
1522 	}
1523 
1524 	rv = cmlb_ioctl(bd->d_cmlbh, dev, cmd, arg, flag, credp, rvalp, 0);
1525 	if (rv != ENOTTY)
1526 		return (rv);
1527 
1528 	if (rvalp != NULL) {
1529 		/* the return value of the ioctl is 0 by default */
1530 		*rvalp = 0;
1531 	}
1532 
1533 	switch (cmd) {
1534 	case DKIOCGMEDIAINFO: {
1535 		struct dk_minfo minfo;
1536 
1537 		/* make sure our state information is current */
1538 		bd_update_state(bd);
1539 		bzero(&minfo, sizeof (minfo));
1540 		minfo.dki_media_type = DK_FIXED_DISK;
1541 		minfo.dki_lbsize = (1U << bd->d_blkshift);
1542 		minfo.dki_capacity = bd->d_numblks;
1543 		if (ddi_copyout(&minfo, ptr, sizeof (minfo), flag)) {
1544 			return (EFAULT);
1545 		}
1546 		return (0);
1547 	}
1548 	case DKIOCGMEDIAINFOEXT: {
1549 		struct dk_minfo_ext miext;
1550 		size_t len;
1551 
1552 		/* make sure our state information is current */
1553 		bd_update_state(bd);
1554 		bzero(&miext, sizeof (miext));
1555 		miext.dki_media_type = DK_FIXED_DISK;
1556 		miext.dki_lbsize = (1U << bd->d_blkshift);
1557 		miext.dki_pbsize = (1U << bd->d_pblkshift);
1558 		miext.dki_capacity = bd->d_numblks;
1559 
1560 		switch (ddi_model_convert_from(flag & FMODELS)) {
1561 		case DDI_MODEL_ILP32:
1562 			len = sizeof (struct dk_minfo_ext32);
1563 			break;
1564 		default:
1565 			len = sizeof (struct dk_minfo_ext);
1566 			break;
1567 		}
1568 
1569 		if (ddi_copyout(&miext, ptr, len, flag)) {
1570 			return (EFAULT);
1571 		}
1572 		return (0);
1573 	}
1574 	case DKIOCINFO: {
1575 		struct dk_cinfo cinfo;
1576 		bzero(&cinfo, sizeof (cinfo));
1577 		cinfo.dki_ctype = DKC_BLKDEV;
1578 		cinfo.dki_cnum = ddi_get_instance(ddi_get_parent(bd->d_dip));
1579 		(void) snprintf(cinfo.dki_cname, sizeof (cinfo.dki_cname),
1580 		    "%s", ddi_driver_name(ddi_get_parent(bd->d_dip)));
1581 		(void) snprintf(cinfo.dki_dname, sizeof (cinfo.dki_dname),
1582 		    "%s", ddi_driver_name(bd->d_dip));
1583 		cinfo.dki_unit = inst;
1584 		cinfo.dki_flags = DKI_FMTVOL;
1585 		cinfo.dki_partition = part;
1586 		cinfo.dki_maxtransfer = bd->d_maxxfer / DEV_BSIZE;
1587 		cinfo.dki_addr = 0;
1588 		cinfo.dki_slave = 0;
1589 		cinfo.dki_space = 0;
1590 		cinfo.dki_prio = 0;
1591 		cinfo.dki_vec = 0;
1592 		if (ddi_copyout(&cinfo, ptr, sizeof (cinfo), flag)) {
1593 			return (EFAULT);
1594 		}
1595 		return (0);
1596 	}
1597 	case DKIOCREMOVABLE: {
1598 		int i;
1599 		i = bd->d_removable ? 1 : 0;
1600 		if (ddi_copyout(&i, ptr, sizeof (i), flag)) {
1601 			return (EFAULT);
1602 		}
1603 		return (0);
1604 	}
1605 	case DKIOCHOTPLUGGABLE: {
1606 		int i;
1607 		i = bd->d_hotpluggable ? 1 : 0;
1608 		if (ddi_copyout(&i, ptr, sizeof (i), flag)) {
1609 			return (EFAULT);
1610 		}
1611 		return (0);
1612 	}
1613 	case DKIOCREADONLY: {
1614 		int i;
1615 		i = bd->d_rdonly ? 1 : 0;
1616 		if (ddi_copyout(&i, ptr, sizeof (i), flag)) {
1617 			return (EFAULT);
1618 		}
1619 		return (0);
1620 	}
1621 	case DKIOCSOLIDSTATE: {
1622 		int i;
1623 		i = bd->d_ssd ? 1 : 0;
1624 		if (ddi_copyout(&i, ptr, sizeof (i), flag)) {
1625 			return (EFAULT);
1626 		}
1627 		return (0);
1628 	}
1629 	case DKIOCSTATE: {
1630 		enum dkio_state	state;
1631 		if (ddi_copyin(ptr, &state, sizeof (state), flag)) {
1632 			return (EFAULT);
1633 		}
1634 		if ((rv = bd_check_state(bd, &state)) != 0) {
1635 			return (rv);
1636 		}
1637 		if (ddi_copyout(&state, ptr, sizeof (state), flag)) {
1638 			return (EFAULT);
1639 		}
1640 		return (0);
1641 	}
1642 	case DKIOCFLUSHWRITECACHE: {
1643 		struct dk_callback *dkc = NULL;
1644 
1645 		if (flag & FKIOCTL)
1646 			dkc = (void *)arg;
1647 
1648 		rv = bd_flush_write_cache(bd, dkc);
1649 		return (rv);
1650 	}
1651 	case DKIOCFREE: {
1652 		dkioc_free_list_t *dfl = NULL;
1653 
1654 		/*
1655 		 * Check free space support early to avoid copyin/allocation
1656 		 * when unnecessary.
1657 		 */
1658 		if (!CAN_FREESPACE(bd))
1659 			return (ENOTSUP);
1660 
1661 		rv = dfl_copyin(ptr, &dfl, flag, KM_SLEEP);
1662 		if (rv != 0)
1663 			return (rv);
1664 
1665 		/*
1666 		 * bd_free_space() consumes 'dfl'. bd_free_space() will
1667 		 * call dfl_iter() which will normally try to pass dfl through
1668 		 * to bd_free_space_cb() which attaches dfl to the bd_xfer_t
1669 		 * that is then queued for the underlying driver. Once the
1670 		 * driver processes the request, the bd_xfer_t instance is
1671 		 * disposed of, including any attached dkioc_free_list_t.
1672 		 *
1673 		 * If dfl cannot be processed by the underlying driver due to
1674 		 * size or alignment requirements of the driver, dfl_iter()
1675 		 * will replace dfl with one or more new dkioc_free_list_t
1676 		 * instances with the correct alignment and sizes for the driver
1677 		 * (and free the original dkioc_free_list_t).
1678 		 */
1679 		rv = bd_free_space(dev, bd, dfl);
1680 		return (rv);
1681 	}
1682 
1683 	case DKIOC_CANFREE: {
1684 		boolean_t supported = CAN_FREESPACE(bd);
1685 
1686 		if (ddi_copyout(&supported, (void *)arg, sizeof (supported),
1687 		    flag) != 0) {
1688 			return (EFAULT);
1689 		}
1690 
1691 		return (0);
1692 	}
1693 
1694 	default:
1695 		break;
1696 
1697 	}
1698 	return (ENOTTY);
1699 }
1700 
1701 static int
1702 bd_prop_op(dev_t dev, dev_info_t *dip, ddi_prop_op_t prop_op, int mod_flags,
1703     char *name, caddr_t valuep, int *lengthp)
1704 {
1705 	bd_t	*bd;
1706 
1707 	bd = ddi_get_soft_state(bd_state, ddi_get_instance(dip));
1708 	if (bd == NULL)
1709 		return (ddi_prop_op(dev, dip, prop_op, mod_flags,
1710 		    name, valuep, lengthp));
1711 
1712 	return (cmlb_prop_op(bd->d_cmlbh, dev, dip, prop_op, mod_flags, name,
1713 	    valuep, lengthp, BDPART(dev), 0));
1714 }
1715 
1716 
1717 static int
1718 bd_tg_rdwr(dev_info_t *dip, uchar_t cmd, void *bufaddr, diskaddr_t start,
1719     size_t length, void *tg_cookie)
1720 {
1721 	bd_t		*bd;
1722 	buf_t		*bp;
1723 	bd_xfer_impl_t	*xi;
1724 	int		rv;
1725 	int		(*func)(void *, bd_xfer_t *);
1726 	int		kmflag;
1727 
1728 	/*
1729 	 * If we are running in polled mode (such as during dump(9e)
1730 	 * execution), then we cannot sleep for kernel allocations.
1731 	 */
1732 	kmflag = tg_cookie ? KM_NOSLEEP : KM_SLEEP;
1733 
1734 	bd = ddi_get_soft_state(bd_state, ddi_get_instance(dip));
1735 
1736 	if (P2PHASE(length, (1U << bd->d_blkshift)) != 0) {
1737 		/* We can only transfer whole blocks at a time! */
1738 		return (EINVAL);
1739 	}
1740 
1741 	if ((bp = getrbuf(kmflag)) == NULL) {
1742 		return (ENOMEM);
1743 	}
1744 
1745 	switch (cmd) {
1746 	case TG_READ:
1747 		bp->b_flags = B_READ;
1748 		func = bd->d_ops.o_read;
1749 		break;
1750 	case TG_WRITE:
1751 		bp->b_flags = B_WRITE;
1752 		func = bd->d_ops.o_write;
1753 		break;
1754 	default:
1755 		freerbuf(bp);
1756 		return (EINVAL);
1757 	}
1758 
1759 	bp->b_un.b_addr = bufaddr;
1760 	bp->b_bcount = length;
1761 	xi = bd_xfer_alloc(bd, bp, func, kmflag);
1762 	if (xi == NULL) {
1763 		rv = geterror(bp);
1764 		freerbuf(bp);
1765 		return (rv);
1766 	}
1767 	xi->i_flags = tg_cookie ? BD_XFER_POLL : 0;
1768 	xi->i_blkno = start;
1769 	bd_submit(bd, xi);
1770 	(void) biowait(bp);
1771 	rv = geterror(bp);
1772 	freerbuf(bp);
1773 
1774 	return (rv);
1775 }
1776 
1777 static int
1778 bd_tg_getinfo(dev_info_t *dip, int cmd, void *arg, void *tg_cookie)
1779 {
1780 	bd_t		*bd;
1781 
1782 	_NOTE(ARGUNUSED(tg_cookie));
1783 	bd = ddi_get_soft_state(bd_state, ddi_get_instance(dip));
1784 
1785 	switch (cmd) {
1786 	case TG_GETPHYGEOM:
1787 	case TG_GETVIRTGEOM:
1788 		/*
1789 		 * We don't have any "geometry" as such, let cmlb
1790 		 * fabricate something.
1791 		 */
1792 		return (ENOTTY);
1793 
1794 	case TG_GETCAPACITY:
1795 		bd_update_state(bd);
1796 		*(diskaddr_t *)arg = bd->d_numblks;
1797 		return (0);
1798 
1799 	case TG_GETBLOCKSIZE:
1800 		*(uint32_t *)arg = (1U << bd->d_blkshift);
1801 		return (0);
1802 
1803 	case TG_GETATTR:
1804 		/*
1805 		 * It turns out that cmlb really doesn't do much for
1806 		 * non-writable media, but lets make the information
1807 		 * available for it in case it does more in the
1808 		 * future.  (The value is currently used for
1809 		 * triggering special behavior for CD-ROMs.)
1810 		 */
1811 		bd_update_state(bd);
1812 		((tg_attribute_t *)arg)->media_is_writable =
1813 		    bd->d_rdonly ? B_FALSE : B_TRUE;
1814 		((tg_attribute_t *)arg)->media_is_solid_state = bd->d_ssd;
1815 		((tg_attribute_t *)arg)->media_is_rotational = B_FALSE;
1816 		return (0);
1817 
1818 	default:
1819 		return (EINVAL);
1820 	}
1821 }
1822 
1823 
1824 static void
1825 bd_sched(bd_t *bd, bd_queue_t *bq)
1826 {
1827 	bd_xfer_impl_t	*xi;
1828 	struct buf	*bp;
1829 	int		rv;
1830 
1831 	mutex_enter(&bq->q_iomutex);
1832 
1833 	while ((bq->q_qactive < bq->q_qsize) &&
1834 	    ((xi = list_remove_head(&bq->q_waitq)) != NULL)) {
1835 		mutex_enter(&bd->d_ksmutex);
1836 		kstat_waitq_to_runq(bd->d_kiop);
1837 		mutex_exit(&bd->d_ksmutex);
1838 
1839 		bq->q_qactive++;
1840 		list_insert_tail(&bq->q_runq, xi);
1841 
1842 		/*
1843 		 * Submit the job to the driver.  We drop the I/O mutex
1844 		 * so that we can deal with the case where the driver
1845 		 * completion routine calls back into us synchronously.
1846 		 */
1847 
1848 		mutex_exit(&bq->q_iomutex);
1849 
1850 		rv = xi->i_func(bd->d_private, &xi->i_public);
1851 		if (rv != 0) {
1852 			bp = xi->i_bp;
1853 			bioerror(bp, rv);
1854 			biodone(bp);
1855 
1856 			atomic_inc_32(&bd->d_kerr->bd_transerrs.value.ui32);
1857 
1858 			mutex_enter(&bq->q_iomutex);
1859 
1860 			mutex_enter(&bd->d_ksmutex);
1861 			kstat_runq_exit(bd->d_kiop);
1862 			mutex_exit(&bd->d_ksmutex);
1863 
1864 			bq->q_qactive--;
1865 			list_remove(&bq->q_runq, xi);
1866 			bd_xfer_free(xi);
1867 		} else {
1868 			mutex_enter(&bq->q_iomutex);
1869 		}
1870 	}
1871 
1872 	mutex_exit(&bq->q_iomutex);
1873 }
1874 
1875 static void
1876 bd_submit(bd_t *bd, bd_xfer_impl_t *xi)
1877 {
1878 	uint64_t	nv = atomic_inc_64_nv(&bd->d_io_counter);
1879 	unsigned	q = nv % bd->d_qcount;
1880 	bd_queue_t	*bq = &bd->d_queues[q];
1881 
1882 	xi->i_bq = bq;
1883 	xi->i_qnum = q;
1884 
1885 	mutex_enter(&bq->q_iomutex);
1886 
1887 	list_insert_tail(&bq->q_waitq, xi);
1888 
1889 	mutex_enter(&bd->d_ksmutex);
1890 	kstat_waitq_enter(bd->d_kiop);
1891 	mutex_exit(&bd->d_ksmutex);
1892 
1893 	mutex_exit(&bq->q_iomutex);
1894 
1895 	bd_sched(bd, bq);
1896 }
1897 
1898 static void
1899 bd_runq_exit(bd_xfer_impl_t *xi, int err)
1900 {
1901 	bd_t		*bd = xi->i_bd;
1902 	buf_t		*bp = xi->i_bp;
1903 	bd_queue_t	*bq = xi->i_bq;
1904 
1905 	mutex_enter(&bq->q_iomutex);
1906 	bq->q_qactive--;
1907 
1908 	mutex_enter(&bd->d_ksmutex);
1909 	kstat_runq_exit(bd->d_kiop);
1910 	mutex_exit(&bd->d_ksmutex);
1911 
1912 	list_remove(&bq->q_runq, xi);
1913 	mutex_exit(&bq->q_iomutex);
1914 
1915 	if (err == 0) {
1916 		if (bp->b_flags & B_READ) {
1917 			atomic_inc_uint(&bd->d_kiop->reads);
1918 			atomic_add_64((uint64_t *)&bd->d_kiop->nread,
1919 			    bp->b_bcount - xi->i_resid);
1920 		} else {
1921 			atomic_inc_uint(&bd->d_kiop->writes);
1922 			atomic_add_64((uint64_t *)&bd->d_kiop->nwritten,
1923 			    bp->b_bcount - xi->i_resid);
1924 		}
1925 	}
1926 	bd_sched(bd, bq);
1927 }
1928 
1929 static void
1930 bd_dle_sysevent_task(void *arg)
1931 {
1932 	nvlist_t *attr = NULL;
1933 	char *path = NULL;
1934 	bd_t *bd = arg;
1935 	dev_info_t *dip = bd->d_dip;
1936 	size_t n;
1937 
1938 	mutex_enter(&bd->d_dle_mutex);
1939 	bd->d_dle_state &= ~BD_DLE_PENDING;
1940 	bd->d_dle_state |= BD_DLE_RUNNING;
1941 	mutex_exit(&bd->d_dle_mutex);
1942 
1943 	dev_err(dip, CE_NOTE, "!dynamic LUN expansion");
1944 
1945 	if (nvlist_alloc(&attr, NV_UNIQUE_NAME_TYPE, KM_SLEEP) != 0) {
1946 		mutex_enter(&bd->d_dle_mutex);
1947 		bd->d_dle_state &= ~(BD_DLE_RUNNING|BD_DLE_PENDING);
1948 		mutex_exit(&bd->d_dle_mutex);
1949 		return;
1950 	}
1951 
1952 	path = kmem_zalloc(MAXPATHLEN, KM_SLEEP);
1953 
1954 	n = snprintf(path, MAXPATHLEN, "/devices");
1955 	(void) ddi_pathname(dip, path + n);
1956 	n = strlen(path);
1957 	n += snprintf(path + n, MAXPATHLEN - n, ":x");
1958 
1959 	for (;;) {
1960 		/*
1961 		 * On receipt of this event, the ZFS sysevent module will scan
1962 		 * active zpools for child vdevs matching this physical path.
1963 		 * In order to catch both whole disk pools and those with an
1964 		 * EFI boot partition, generate separate sysevents for minor
1965 		 * node 'a' and 'b'.
1966 		 */
1967 		for (char c = 'a'; c < 'c'; c++) {
1968 			path[n - 1] = c;
1969 
1970 			if (nvlist_add_string(attr, DEV_PHYS_PATH, path) != 0)
1971 				break;
1972 
1973 			(void) ddi_log_sysevent(dip, DDI_VENDOR_SUNW,
1974 			    EC_DEV_STATUS, ESC_DEV_DLE, attr, NULL, DDI_SLEEP);
1975 		}
1976 
1977 		mutex_enter(&bd->d_dle_mutex);
1978 		if ((bd->d_dle_state & BD_DLE_PENDING) == 0) {
1979 			bd->d_dle_state &= ~BD_DLE_RUNNING;
1980 			mutex_exit(&bd->d_dle_mutex);
1981 			break;
1982 		}
1983 		bd->d_dle_state &= ~BD_DLE_PENDING;
1984 		mutex_exit(&bd->d_dle_mutex);
1985 	}
1986 
1987 	nvlist_free(attr);
1988 	kmem_free(path, MAXPATHLEN);
1989 }
1990 
1991 static void
1992 bd_update_state(bd_t *bd)
1993 {
1994 	enum	dkio_state	state = DKIO_INSERTED;
1995 	boolean_t		docmlb = B_FALSE;
1996 	bd_media_t		media;
1997 
1998 	bzero(&media, sizeof (media));
1999 
2000 	mutex_enter(&bd->d_statemutex);
2001 	if (bd->d_ops.o_media_info(bd->d_private, &media) != 0) {
2002 		bd->d_numblks = 0;
2003 		state = DKIO_EJECTED;
2004 		goto done;
2005 	}
2006 
2007 	if ((media.m_blksize < 512) ||
2008 	    (!ISP2(media.m_blksize)) ||
2009 	    (P2PHASE(bd->d_maxxfer, media.m_blksize))) {
2010 		dev_err(bd->d_dip, CE_WARN, "Invalid media block size (%d)",
2011 		    media.m_blksize);
2012 		/*
2013 		 * We can't use the media, treat it as not present.
2014 		 */
2015 		state = DKIO_EJECTED;
2016 		bd->d_numblks = 0;
2017 		goto done;
2018 	}
2019 
2020 	if (((1U << bd->d_blkshift) != media.m_blksize) ||
2021 	    (bd->d_numblks != media.m_nblks)) {
2022 		/* Device size changed */
2023 		docmlb = B_TRUE;
2024 	}
2025 
2026 	bd->d_blkshift = ddi_ffs(media.m_blksize) - 1;
2027 	bd->d_pblkshift = bd->d_blkshift;
2028 	bd->d_numblks = media.m_nblks;
2029 	bd->d_rdonly = media.m_readonly;
2030 	bd->d_ssd = media.m_solidstate;
2031 
2032 	/*
2033 	 * Only use the supplied physical block size if it is non-zero,
2034 	 * greater or equal to the block size, and a power of 2. Ignore it
2035 	 * if not, it's just informational and we can still use the media.
2036 	 */
2037 	if ((media.m_pblksize != 0) &&
2038 	    (media.m_pblksize >= media.m_blksize) &&
2039 	    (ISP2(media.m_pblksize)))
2040 		bd->d_pblkshift = ddi_ffs(media.m_pblksize) - 1;
2041 
2042 done:
2043 	if (state != bd->d_state) {
2044 		bd->d_state = state;
2045 		cv_broadcast(&bd->d_statecv);
2046 		docmlb = B_TRUE;
2047 	}
2048 	mutex_exit(&bd->d_statemutex);
2049 
2050 	bd->d_kerr->bd_capacity.value.ui64 = bd->d_numblks << bd->d_blkshift;
2051 
2052 	if (docmlb) {
2053 		if (state == DKIO_INSERTED) {
2054 			(void) cmlb_validate(bd->d_cmlbh, 0, 0);
2055 
2056 			mutex_enter(&bd->d_dle_mutex);
2057 			/*
2058 			 * If there is already an event pending, there's
2059 			 * nothing to do; we coalesce multiple events.
2060 			 */
2061 			if ((bd->d_dle_state & BD_DLE_PENDING) == 0) {
2062 				if ((bd->d_dle_state & BD_DLE_RUNNING) == 0) {
2063 					taskq_dispatch_ent(bd_taskq,
2064 					    bd_dle_sysevent_task, bd, 0,
2065 					    &bd->d_dle_ent);
2066 				}
2067 				bd->d_dle_state |= BD_DLE_PENDING;
2068 			}
2069 			mutex_exit(&bd->d_dle_mutex);
2070 		} else {
2071 			cmlb_invalidate(bd->d_cmlbh, 0);
2072 		}
2073 	}
2074 }
2075 
2076 static int
2077 bd_check_state(bd_t *bd, enum dkio_state *state)
2078 {
2079 	clock_t		when;
2080 
2081 	for (;;) {
2082 
2083 		bd_update_state(bd);
2084 
2085 		mutex_enter(&bd->d_statemutex);
2086 
2087 		if (bd->d_state != *state) {
2088 			*state = bd->d_state;
2089 			mutex_exit(&bd->d_statemutex);
2090 			break;
2091 		}
2092 
2093 		when = drv_usectohz(1000000);
2094 		if (cv_reltimedwait_sig(&bd->d_statecv, &bd->d_statemutex,
2095 		    when, TR_CLOCK_TICK) == 0) {
2096 			mutex_exit(&bd->d_statemutex);
2097 			return (EINTR);
2098 		}
2099 
2100 		mutex_exit(&bd->d_statemutex);
2101 	}
2102 
2103 	return (0);
2104 }
2105 
2106 static int
2107 bd_flush_write_cache_done(struct buf *bp)
2108 {
2109 	struct dk_callback *dc = (void *)bp->b_private;
2110 
2111 	(*dc->dkc_callback)(dc->dkc_cookie, geterror(bp));
2112 	kmem_free(dc, sizeof (*dc));
2113 	freerbuf(bp);
2114 	return (0);
2115 }
2116 
2117 static int
2118 bd_flush_write_cache(bd_t *bd, struct dk_callback *dkc)
2119 {
2120 	buf_t			*bp;
2121 	struct dk_callback	*dc;
2122 	bd_xfer_impl_t		*xi;
2123 	int			rv;
2124 
2125 	if (bd->d_ops.o_sync_cache == NULL) {
2126 		return (ENOTSUP);
2127 	}
2128 	if ((bp = getrbuf(KM_SLEEP)) == NULL) {
2129 		return (ENOMEM);
2130 	}
2131 	bp->b_resid = 0;
2132 	bp->b_bcount = 0;
2133 
2134 	xi = bd_xfer_alloc(bd, bp, bd->d_ops.o_sync_cache, KM_SLEEP);
2135 	if (xi == NULL) {
2136 		rv = geterror(bp);
2137 		freerbuf(bp);
2138 		return (rv);
2139 	}
2140 
2141 	/* Make an asynchronous flush, but only if there is a callback */
2142 	if (dkc != NULL && dkc->dkc_callback != NULL) {
2143 		/* Make a private copy of the callback structure */
2144 		dc = kmem_alloc(sizeof (*dc), KM_SLEEP);
2145 		*dc = *dkc;
2146 		bp->b_private = dc;
2147 		bp->b_iodone = bd_flush_write_cache_done;
2148 
2149 		bd_submit(bd, xi);
2150 		return (0);
2151 	}
2152 
2153 	/* In case there is no callback, perform a synchronous flush */
2154 	bd_submit(bd, xi);
2155 	(void) biowait(bp);
2156 	rv = geterror(bp);
2157 	freerbuf(bp);
2158 
2159 	return (rv);
2160 }
2161 
/*
 * b_iodone handler installed by bd_free_space_cb() for asynchronous free
 * space requests: nobody waits on the buf, so it is released here.
 * Always returns 0.
 */
static int
bd_free_space_done(struct buf *bp)
{
	freerbuf(bp);
	return (0);
}
2168 
2169 static int
2170 bd_free_space_cb(dkioc_free_list_t *dfl, void *arg, int kmflag)
2171 {
2172 	bd_t		*bd = arg;
2173 	buf_t		*bp = NULL;
2174 	bd_xfer_impl_t	*xi = NULL;
2175 	boolean_t	sync = DFL_ISSYNC(dfl) ?  B_TRUE : B_FALSE;
2176 	int		rv = 0;
2177 
2178 	bp = getrbuf(KM_SLEEP);
2179 	bp->b_resid = 0;
2180 	bp->b_bcount = 0;
2181 	bp->b_lblkno = 0;
2182 
2183 	xi = bd_xfer_alloc(bd, bp, bd->d_ops.o_free_space, kmflag);
2184 	xi->i_dfl = dfl;
2185 
2186 	if (!sync) {
2187 		bp->b_iodone = bd_free_space_done;
2188 		bd_submit(bd, xi);
2189 		return (0);
2190 	}
2191 
2192 	xi->i_flags |= BD_XFER_POLL;
2193 	bd_submit(bd, xi);
2194 
2195 	(void) biowait(bp);
2196 	rv = geterror(bp);
2197 	freerbuf(bp);
2198 
2199 	return (rv);
2200 }
2201 
2202 static int
2203 bd_free_space(dev_t dev, bd_t *bd, dkioc_free_list_t *dfl)
2204 {
2205 	diskaddr_t p_len, p_offset;
2206 	uint64_t offset_bytes, len_bytes;
2207 	minor_t part = BDPART(dev);
2208 	const uint_t bshift = bd->d_blkshift;
2209 	dkioc_free_info_t dfi = {
2210 		.dfi_bshift = bshift,
2211 		.dfi_align = bd->d_free_align << bshift,
2212 		.dfi_max_bytes = bd->d_max_free_blks << bshift,
2213 		.dfi_max_ext = bd->d_max_free_seg,
2214 		.dfi_max_ext_bytes = bd->d_max_free_seg_blks << bshift,
2215 	};
2216 
2217 	if (cmlb_partinfo(bd->d_cmlbh, part, &p_len, &p_offset, NULL,
2218 	    NULL, 0) != 0) {
2219 		dfl_free(dfl);
2220 		return (ENXIO);
2221 	}
2222 
2223 	/*
2224 	 * bd_ioctl created our own copy of dfl, so we can modify as
2225 	 * necessary
2226 	 */
2227 	offset_bytes = (uint64_t)p_offset << bshift;
2228 	len_bytes = (uint64_t)p_len << bshift;
2229 
2230 	dfl->dfl_offset += offset_bytes;
2231 	if (dfl->dfl_offset < offset_bytes) {
2232 		dfl_free(dfl);
2233 		return (EOVERFLOW);
2234 	}
2235 
2236 	return (dfl_iter(dfl, &dfi, offset_bytes + len_bytes, bd_free_space_cb,
2237 	    bd, KM_SLEEP));
2238 }
2239 
2240 /*
2241  * Nexus support.
2242  */
2243 int
2244 bd_bus_ctl(dev_info_t *dip, dev_info_t *rdip, ddi_ctl_enum_t ctlop,
2245     void *arg, void *result)
2246 {
2247 	bd_handle_t	hdl;
2248 
2249 	switch (ctlop) {
2250 	case DDI_CTLOPS_REPORTDEV:
2251 		cmn_err(CE_CONT, "?Block device: %s@%s, %s%d\n",
2252 		    ddi_node_name(rdip), ddi_get_name_addr(rdip),
2253 		    ddi_driver_name(rdip), ddi_get_instance(rdip));
2254 		return (DDI_SUCCESS);
2255 
2256 	case DDI_CTLOPS_INITCHILD:
2257 		hdl = ddi_get_parent_data((dev_info_t *)arg);
2258 		if (hdl == NULL) {
2259 			return (DDI_NOT_WELL_FORMED);
2260 		}
2261 		ddi_set_name_addr((dev_info_t *)arg, hdl->h_addr);
2262 		return (DDI_SUCCESS);
2263 
2264 	case DDI_CTLOPS_UNINITCHILD:
2265 		ddi_set_name_addr((dev_info_t *)arg, NULL);
2266 		ndi_prop_remove_all((dev_info_t *)arg);
2267 		return (DDI_SUCCESS);
2268 
2269 	default:
2270 		return (ddi_ctlops(dip, rdip, ctlop, arg, result));
2271 	}
2272 }
2273 
2274 /*
2275  * Functions for device drivers.
2276  */
2277 bd_handle_t
2278 bd_alloc_handle(void *private, bd_ops_t *ops, ddi_dma_attr_t *dma, int kmflag)
2279 {
2280 	bd_handle_t	hdl;
2281 
2282 	switch (ops->o_version) {
2283 	case BD_OPS_VERSION_0:
2284 	case BD_OPS_VERSION_1:
2285 	case BD_OPS_VERSION_2:
2286 		break;
2287 
2288 	default:
2289 		/* Unsupported version */
2290 		return (NULL);
2291 	}
2292 
2293 	hdl = kmem_zalloc(sizeof (*hdl), kmflag);
2294 	if (hdl == NULL) {
2295 		return (NULL);
2296 	}
2297 
2298 	switch (ops->o_version) {
2299 	case BD_OPS_VERSION_2:
2300 		hdl->h_ops.o_free_space = ops->o_free_space;
2301 		/*FALLTHRU*/
2302 	case BD_OPS_VERSION_1:
2303 	case BD_OPS_VERSION_0:
2304 		hdl->h_ops.o_drive_info = ops->o_drive_info;
2305 		hdl->h_ops.o_media_info = ops->o_media_info;
2306 		hdl->h_ops.o_devid_init = ops->o_devid_init;
2307 		hdl->h_ops.o_sync_cache = ops->o_sync_cache;
2308 		hdl->h_ops.o_read = ops->o_read;
2309 		hdl->h_ops.o_write = ops->o_write;
2310 		break;
2311 	}
2312 
2313 	hdl->h_dma = dma;
2314 	hdl->h_private = private;
2315 
2316 	return (hdl);
2317 }
2318 
2319 void
2320 bd_free_handle(bd_handle_t hdl)
2321 {
2322 	kmem_free(hdl, sizeof (*hdl));
2323 }
2324 
2325 int
2326 bd_attach_handle(dev_info_t *dip, bd_handle_t hdl)
2327 {
2328 	dev_info_t	*child;
2329 	bd_drive_t	drive = { 0 };
2330 
2331 	/*
2332 	 * It's not an error if bd_attach_handle() is called on a handle that
2333 	 * already is attached. We just ignore the request to attach and return.
2334 	 * This way drivers using blkdev don't have to keep track about blkdev
2335 	 * state, they can just call this function to make sure it attached.
2336 	 */
2337 	if (hdl->h_child != NULL) {
2338 		return (DDI_SUCCESS);
2339 	}
2340 
2341 	/* if drivers don't override this, make it assume none */
2342 	drive.d_lun = -1;
2343 	hdl->h_ops.o_drive_info(hdl->h_private, &drive);
2344 
2345 	hdl->h_parent = dip;
2346 	hdl->h_name = "blkdev";
2347 
2348 	/*LINTED: E_BAD_PTR_CAST_ALIGN*/
2349 	if (*(uint64_t *)drive.d_eui64 != 0) {
2350 		if (drive.d_lun >= 0) {
2351 			(void) snprintf(hdl->h_addr, sizeof (hdl->h_addr),
2352 			    "w%02X%02X%02X%02X%02X%02X%02X%02X,%X",
2353 			    drive.d_eui64[0], drive.d_eui64[1],
2354 			    drive.d_eui64[2], drive.d_eui64[3],
2355 			    drive.d_eui64[4], drive.d_eui64[5],
2356 			    drive.d_eui64[6], drive.d_eui64[7], drive.d_lun);
2357 		} else {
2358 			(void) snprintf(hdl->h_addr, sizeof (hdl->h_addr),
2359 			    "w%02X%02X%02X%02X%02X%02X%02X%02X",
2360 			    drive.d_eui64[0], drive.d_eui64[1],
2361 			    drive.d_eui64[2], drive.d_eui64[3],
2362 			    drive.d_eui64[4], drive.d_eui64[5],
2363 			    drive.d_eui64[6], drive.d_eui64[7]);
2364 		}
2365 	} else {
2366 		if (drive.d_lun >= 0) {
2367 			(void) snprintf(hdl->h_addr, sizeof (hdl->h_addr),
2368 			    "%X,%X", drive.d_target, drive.d_lun);
2369 		} else {
2370 			(void) snprintf(hdl->h_addr, sizeof (hdl->h_addr),
2371 			    "%X", drive.d_target);
2372 		}
2373 	}
2374 
2375 	if (ndi_devi_alloc(dip, hdl->h_name, (pnode_t)DEVI_SID_NODEID,
2376 	    &child) != NDI_SUCCESS) {
2377 		cmn_err(CE_WARN, "%s%d: unable to allocate node %s@%s",
2378 		    ddi_driver_name(dip), ddi_get_instance(dip),
2379 		    "blkdev", hdl->h_addr);
2380 		return (DDI_FAILURE);
2381 	}
2382 
2383 	ddi_set_parent_data(child, hdl);
2384 	hdl->h_child = child;
2385 
2386 	if (ndi_devi_online(child, 0) != NDI_SUCCESS) {
2387 		cmn_err(CE_WARN, "%s%d: failed bringing node %s@%s online",
2388 		    ddi_driver_name(dip), ddi_get_instance(dip),
2389 		    hdl->h_name, hdl->h_addr);
2390 		(void) ndi_devi_free(child);
2391 		hdl->h_child = NULL;
2392 		return (DDI_FAILURE);
2393 	}
2394 
2395 	return (DDI_SUCCESS);
2396 }
2397 
2398 int
2399 bd_detach_handle(bd_handle_t hdl)
2400 {
2401 	int	circ;
2402 	int	rv;
2403 	char	*devnm;
2404 
2405 	/*
2406 	 * It's not an error if bd_detach_handle() is called on a handle that
2407 	 * already is detached. We just ignore the request to detach and return.
2408 	 * This way drivers using blkdev don't have to keep track about blkdev
2409 	 * state, they can just call this function to make sure it detached.
2410 	 */
2411 	if (hdl->h_child == NULL) {
2412 		return (DDI_SUCCESS);
2413 	}
2414 	ndi_devi_enter(hdl->h_parent, &circ);
2415 	if (i_ddi_node_state(hdl->h_child) < DS_INITIALIZED) {
2416 		rv = ddi_remove_child(hdl->h_child, 0);
2417 	} else {
2418 		devnm = kmem_alloc(MAXNAMELEN + 1, KM_SLEEP);
2419 		(void) ddi_deviname(hdl->h_child, devnm);
2420 		(void) devfs_clean(hdl->h_parent, devnm + 1, DV_CLEAN_FORCE);
2421 		rv = ndi_devi_unconfig_one(hdl->h_parent, devnm + 1, NULL,
2422 		    NDI_DEVI_REMOVE | NDI_UNCONFIG);
2423 		kmem_free(devnm, MAXNAMELEN + 1);
2424 	}
2425 	if (rv == 0) {
2426 		hdl->h_child = NULL;
2427 	}
2428 
2429 	ndi_devi_exit(hdl->h_parent, circ);
2430 	return (rv == NDI_SUCCESS ? DDI_SUCCESS : DDI_FAILURE);
2431 }
2432 
2433 void
2434 bd_xfer_done(bd_xfer_t *xfer, int err)
2435 {
2436 	bd_xfer_impl_t	*xi = (void *)xfer;
2437 	buf_t		*bp = xi->i_bp;
2438 	int		rv = DDI_SUCCESS;
2439 	bd_t		*bd = xi->i_bd;
2440 	size_t		len;
2441 
2442 	if (err != 0) {
2443 		bd_runq_exit(xi, err);
2444 		atomic_inc_32(&bd->d_kerr->bd_harderrs.value.ui32);
2445 
2446 		bp->b_resid += xi->i_resid;
2447 		bd_xfer_free(xi);
2448 		bioerror(bp, err);
2449 		biodone(bp);
2450 		return;
2451 	}
2452 
2453 	xi->i_cur_win++;
2454 	xi->i_resid -= xi->i_len;
2455 
2456 	if (xi->i_resid == 0) {
2457 		/* Job completed succcessfully! */
2458 		bd_runq_exit(xi, 0);
2459 
2460 		bd_xfer_free(xi);
2461 		biodone(bp);
2462 		return;
2463 	}
2464 
2465 	xi->i_blkno += xi->i_nblks;
2466 
2467 	if (bd->d_use_dma) {
2468 		/* More transfer still pending... advance to next DMA window. */
2469 		rv = ddi_dma_getwin(xi->i_dmah, xi->i_cur_win,
2470 		    &xi->i_offset, &len, &xi->i_dmac, &xi->i_ndmac);
2471 	} else {
2472 		/* Advance memory window. */
2473 		xi->i_kaddr += xi->i_len;
2474 		xi->i_offset += xi->i_len;
2475 		len = min(bp->b_bcount - xi->i_offset, bd->d_maxxfer);
2476 	}
2477 
2478 
2479 	if ((rv != DDI_SUCCESS) ||
2480 	    (P2PHASE(len, (1U << xi->i_blkshift)) != 0)) {
2481 		bd_runq_exit(xi, EFAULT);
2482 
2483 		bp->b_resid += xi->i_resid;
2484 		bd_xfer_free(xi);
2485 		bioerror(bp, EFAULT);
2486 		biodone(bp);
2487 		return;
2488 	}
2489 	xi->i_len = len;
2490 	xi->i_nblks = len >> xi->i_blkshift;
2491 
2492 	/* Submit next window to hardware. */
2493 	rv = xi->i_func(bd->d_private, &xi->i_public);
2494 	if (rv != 0) {
2495 		bd_runq_exit(xi, rv);
2496 
2497 		atomic_inc_32(&bd->d_kerr->bd_transerrs.value.ui32);
2498 
2499 		bp->b_resid += xi->i_resid;
2500 		bd_xfer_free(xi);
2501 		bioerror(bp, rv);
2502 		biodone(bp);
2503 	}
2504 }
2505 
2506 void
2507 bd_error(bd_xfer_t *xfer, int error)
2508 {
2509 	bd_xfer_impl_t	*xi = (void *)xfer;
2510 	bd_t		*bd = xi->i_bd;
2511 
2512 	switch (error) {
2513 	case BD_ERR_MEDIA:
2514 		atomic_inc_32(&bd->d_kerr->bd_rq_media_err.value.ui32);
2515 		break;
2516 	case BD_ERR_NTRDY:
2517 		atomic_inc_32(&bd->d_kerr->bd_rq_ntrdy_err.value.ui32);
2518 		break;
2519 	case BD_ERR_NODEV:
2520 		atomic_inc_32(&bd->d_kerr->bd_rq_nodev_err.value.ui32);
2521 		break;
2522 	case BD_ERR_RECOV:
2523 		atomic_inc_32(&bd->d_kerr->bd_rq_recov_err.value.ui32);
2524 		break;
2525 	case BD_ERR_ILLRQ:
2526 		atomic_inc_32(&bd->d_kerr->bd_rq_illrq_err.value.ui32);
2527 		break;
2528 	case BD_ERR_PFA:
2529 		atomic_inc_32(&bd->d_kerr->bd_rq_pfa_err.value.ui32);
2530 		break;
2531 	default:
2532 		cmn_err(CE_PANIC, "bd_error: unknown error type %d", error);
2533 		break;
2534 	}
2535 }
2536 
2537 void
2538 bd_state_change(bd_handle_t hdl)
2539 {
2540 	bd_t		*bd;
2541 
2542 	if ((bd = hdl->h_bd) != NULL) {
2543 		bd_update_state(bd);
2544 	}
2545 }
2546 
/*
 * Install blkdev's bus operations vector into the caller's dev_ops.
 *
 * The vector is a single function-static table shared by every caller.
 * The only blkdev-specific entry is bus_ctl, which is routed to
 * bd_bus_ctl(); the other populated slots use the ddi_* / i_ddi_* routines
 * named in the table below, and the remaining slots are left NULL.
 */
void
bd_mod_init(struct dev_ops *devops)
{
	static struct bus_ops bd_bus_ops = {
		BUSO_REV,		/* busops_rev */
		nullbusmap,		/* bus_map */
		NULL,			/* bus_get_intrspec (OBSOLETE) */
		NULL,			/* bus_add_intrspec (OBSOLETE) */
		NULL,			/* bus_remove_intrspec (OBSOLETE) */
		i_ddi_map_fault,	/* bus_map_fault */
		NULL,			/* bus_dma_map (OBSOLETE) */
		ddi_dma_allochdl,	/* bus_dma_allochdl */
		ddi_dma_freehdl,	/* bus_dma_freehdl */
		ddi_dma_bindhdl,	/* bus_dma_bindhdl */
		ddi_dma_unbindhdl,	/* bus_dma_unbindhdl */
		ddi_dma_flush,		/* bus_dma_flush */
		ddi_dma_win,		/* bus_dma_win */
		ddi_dma_mctl,		/* bus_dma_ctl */
		bd_bus_ctl,		/* bus_ctl */
		ddi_bus_prop_op,	/* bus_prop_op */
		NULL,			/* bus_get_eventcookie */
		NULL,			/* bus_add_eventcall */
		NULL,			/* bus_remove_eventcall */
		NULL,			/* bus_post_event */
		NULL,			/* bus_intr_ctl (OBSOLETE) */
		NULL,			/* bus_config */
		NULL,			/* bus_unconfig */
		NULL,			/* bus_fm_init */
		NULL,			/* bus_fm_fini */
		NULL,			/* bus_fm_access_enter */
		NULL,			/* bus_fm_access_exit */
		NULL,			/* bus_power */
		NULL,			/* bus_intr_op */
	};

	/* Point the driver's dev_ops at the shared table above. */
	devops->devo_bus_ops = &bd_bus_ops;

	/*
	 * NB: The device driver is free to supply its own
	 * character entry device support.
	 */
}
2589 
2590 void
2591 bd_mod_fini(struct dev_ops *devops)
2592 {
2593 	devops->devo_bus_ops = NULL;
2594 }
2595