xref: /titanic_50/usr/src/uts/common/io/blkdev/blkdev.c (revision 361f55a51b4e8a8d5a73478602f4afdc6e199da5)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved.
23  * Copyright 2012 Garrett D'Amore <garrett@damore.org>.  All rights reserved.
24  * Copyright 2012 Alexey Zaytsev <alexey.zaytsev@gmail.com> All rights reserved.
25  * Copyright 2013 Nexenta Systems, Inc.  All rights reserved.
26  */
27 
28 #include <sys/types.h>
29 #include <sys/ksynch.h>
30 #include <sys/kmem.h>
31 #include <sys/file.h>
32 #include <sys/errno.h>
33 #include <sys/open.h>
34 #include <sys/buf.h>
35 #include <sys/uio.h>
36 #include <sys/aio_req.h>
37 #include <sys/cred.h>
38 #include <sys/modctl.h>
39 #include <sys/cmlb.h>
40 #include <sys/conf.h>
41 #include <sys/devops.h>
42 #include <sys/list.h>
43 #include <sys/sysmacros.h>
44 #include <sys/dkio.h>
45 #include <sys/vtoc.h>
46 #include <sys/scsi/scsi.h>	/* for DTYPE_DIRECT */
47 #include <sys/kstat.h>
48 #include <sys/fs/dv_node.h>
49 #include <sys/ddi.h>
50 #include <sys/sunddi.h>
51 #include <sys/note.h>
52 #include <sys/blkdev.h>
53 
/*
 * Minor numbers carry both the driver instance and the partition:
 * each instance owns BD_MAXPART consecutive minor numbers.
 */
#define	BD_MAXPART	64
/* Instance number encoded in a dev_t. */
#define	BDINST(dev)	(getminor(dev) / BD_MAXPART)
/* Partition number encoded in a dev_t. */
#define	BDPART(dev)	(getminor(dev) % BD_MAXPART)

/* Short names for the per-instance state and the private transfer record. */
typedef struct bd bd_t;
typedef struct bd_xfer_impl bd_xfer_impl_t;
60 
/*
 * Per-instance soft state.  bd_attach() allocates one (zero-filled) per
 * devinfo instance; the other entry points look it up with
 * ddi_get_soft_state() using the instance encoded in the minor number.
 */
struct bd {
	void		*d_private;	/* copy of the handle's h_private */
	dev_info_t	*d_dip;		/* our devinfo node */
	kmutex_t	d_ocmutex;	/* held by bd_open()/bd_close() */
	kmutex_t	d_iomutex;	/* also installed as the kstat ks_lock */
	kmutex_t	d_statemutex;
	kcondvar_t	d_statecv;
	enum dkio_state	d_state;	/* set to DKIO_NONE at attach */
	cmlb_handle_t	d_cmlbh;	/* from cmlb_alloc_handle() */
	unsigned	d_open_lyr[BD_MAXPART];	/* open count */
	uint64_t	d_open_excl;	/* bit mask indexed by partition */
	uint64_t	d_open_reg[OTYPCNT];		/* bit mask */

	uint32_t	d_qsize;	/* d_qsize reported by o_drive_info */
	uint32_t	d_qactive;
	uint32_t	d_maxxfer;	/* largest single transfer, in bytes */
	uint32_t	d_blkshift;	/* log2 of the block size in bytes */
	uint64_t	d_numblks;	/* capacity given for TG_GETCAPACITY */
	ddi_devid_t	d_devid;	/* from o_devid_init, if provided */

	kmem_cache_t	*d_cache;	/* cache of bd_xfer_impl_t objects */
	list_t		d_runq;		/* list of bd_xfer_impl_t (i_linkage) */
	list_t		d_waitq;	/* list of bd_xfer_impl_t (i_linkage) */
	kstat_t		*d_ksp;		/* I/O kstat; NULL if creation failed */
	kstat_io_t	*d_kiop;	/* kstat data, or a scratch allocation */

	boolean_t	d_rdonly;	/* reported by DKIOCREADONLY */
	boolean_t	d_ssd;		/* reported by DKIOCSOLIDSTATE */
	boolean_t	d_removable;	/* from o_drive_info */
	boolean_t	d_hotpluggable;	/* from o_drive_info */
	boolean_t	d_use_dma;	/* B_TRUE when the handle has h_dma */

	ddi_dma_attr_t	d_dma;		/* DMA attributes copied from h_dma */
	bd_ops_t	d_ops;		/* ops vector copied from h_ops */
	bd_handle_t	d_handle;	/* handle we were attached through */
};
97 
/*
 * Registration handle.  bd_attach() retrieves it with
 * ddi_get_parent_data() on the child node and copies h_ops, h_dma and
 * h_private into the soft state.
 */
struct bd_handle {
	bd_ops_t	h_ops;		/* ops vector, copied into d_ops */
	ddi_dma_attr_t	*h_dma;		/* DMA attributes, or NULL for no DMA */
	dev_info_t	*h_parent;
	dev_info_t	*h_child;
	void		*h_private;	/* opaque argument passed to the ops */
	bd_t		*h_bd;		/* soft state, set by bd_attach() */
	char		*h_name;
	char		h_addr[20];	/* enough for %X,%X */
};
108 
/*
 * Private transfer record, allocated from the per-instance kmem cache
 * (d_cache).  The public bd_xfer_t is embedded as the first member.
 */
struct bd_xfer_impl {
	bd_xfer_t	i_public;
	list_node_t	i_linkage;	/* linkage for d_runq / d_waitq */
	bd_t		*i_bd;		/* owning instance (set by the ctor) */
	buf_t		*i_bp;		/* buf this transfer services */
	uint_t		i_num_win;	/* number of windows in the transfer */
	uint_t		i_cur_win;	/* index of the current window */
	off_t		i_offset;	/* offset of the current window */
	/* transfer routine: the instance's o_read or o_write */
	int		(*i_func)(void *, bd_xfer_t *);
	uint32_t	i_blkshift;	/* d_blkshift when allocated */
	size_t		i_len;		/* bytes in the current window */
	size_t		i_resid;	/* initialised to b_bcount */
};
122 
/*
 * Shorthand so that fields of the embedded public bd_xfer_t can be
 * reached directly through a bd_xfer_impl_t pointer.
 */
#define	i_dmah		i_public.x_dmah
#define	i_dmac		i_public.x_dmac
#define	i_ndmac		i_public.x_ndmac
#define	i_kaddr		i_public.x_kaddr
#define	i_nblks		i_public.x_nblks
#define	i_blkno		i_public.x_blkno
#define	i_flags		i_public.x_flags
130 
131 
132 /*
133  * Private prototypes.
134  */
135 
/* Autoconfiguration entry points, wired into bd_dev_ops. */
static int bd_getinfo(dev_info_t *, ddi_info_cmd_t, void *, void **);
static int bd_attach(dev_info_t *, ddi_attach_cmd_t);
static int bd_detach(dev_info_t *, ddi_detach_cmd_t);

/* Character/block entry points, wired into bd_cb_ops. */
static int bd_open(dev_t *, int, int, cred_t *);
static int bd_close(dev_t, int, int, cred_t *);
static int bd_strategy(struct buf *);
static int bd_ioctl(dev_t, int, intptr_t, int, cred_t *, int *);
static int bd_dump(dev_t, caddr_t, daddr_t, int);
static int bd_read(dev_t, struct uio *, cred_t *);
static int bd_write(dev_t, struct uio *, cred_t *);
static int bd_aread(dev_t, struct aio_req *, cred_t *);
static int bd_awrite(dev_t, struct aio_req *, cred_t *);
static int bd_prop_op(dev_t, dev_info_t *, ddi_prop_op_t, int, char *,
    caddr_t, int *);

/* cmlb target callbacks, wired into bd_tg_ops. */
static int bd_tg_rdwr(dev_info_t *, uchar_t, void *, diskaddr_t, size_t,
    void *);
static int bd_tg_getinfo(dev_info_t *, int, void *, void *);
/* Constructor and destructor for the bd_xfer_impl_t kmem cache. */
static int bd_xfer_ctor(void *, void *, int);
static void bd_xfer_dtor(void *, void *);
/* Internal helpers. */
static void bd_sched(bd_t *);
static void bd_submit(bd_t *, bd_xfer_impl_t *);
static void bd_runq_exit(bd_xfer_impl_t *, int);
static void bd_update_state(bd_t *);
static int bd_check_state(bd_t *, enum dkio_state *);
static int bd_flush_write_cache(bd_t *, struct dk_callback *);
163 
/*
 * Target operations handed to cmlb_attach(): the callbacks cmlb uses to
 * read/write the device and to query its properties.
 */
struct cmlb_tg_ops bd_tg_ops = {
	TG_DK_OPS_VERSION_1,
	bd_tg_rdwr,
	bd_tg_getinfo,
};
169 
/* Character/block entry points for the driver. */
static struct cb_ops bd_cb_ops = {
	bd_open, 		/* open */
	bd_close, 		/* close */
	bd_strategy, 		/* strategy */
	nodev, 			/* print */
	bd_dump,		/* dump */
	bd_read, 		/* read */
	bd_write, 		/* write */
	bd_ioctl, 		/* ioctl */
	nodev, 			/* devmap */
	nodev, 			/* mmap */
	nodev, 			/* segmap */
	nochpoll, 		/* poll */
	bd_prop_op, 		/* cb_prop_op */
	0, 			/* streamtab  */
	D_64BIT | D_MP,		/* Driver compatibility flag */
	CB_REV,			/* cb_rev */
	bd_aread,		/* async read */
	bd_awrite		/* async write */
};
190 
/* Device operations vector; referenced from modldrv below. */
struct dev_ops bd_dev_ops = {
	DEVO_REV, 		/* devo_rev, */
	0, 			/* refcnt  */
	bd_getinfo,		/* getinfo */
	nulldev, 		/* identify */
	nulldev, 		/* probe */
	bd_attach, 		/* attach */
	bd_detach,		/* detach */
	nodev, 			/* reset */
	&bd_cb_ops, 		/* driver operations */
	NULL,			/* bus operations */
	NULL,			/* power */
	ddi_quiesce_not_needed,	/* quiesce */
};
205 
/* Loadable-module linkage: describes this module as a device driver. */
static struct modldrv modldrv = {
	&mod_driverops,
	"Generic Block Device",
	&bd_dev_ops,
};
211 
/* Passed to mod_install(), mod_remove() and mod_info(). */
static struct modlinkage modlinkage = {
	MODREV_1, { &modldrv, NULL }
};
215 
/* Anchor for the per-instance soft state (struct bd), set up in _init(). */
static void *bd_state;
/*
 * Taken as reader by open, close and dump so that the set of registered
 * devices cannot change underneath them.
 */
static krwlock_t bd_lock;
218 
219 int
220 _init(void)
221 {
222 	int	rv;
223 
224 	rv = ddi_soft_state_init(&bd_state, sizeof (struct bd), 2);
225 	if (rv != DDI_SUCCESS) {
226 		return (rv);
227 	}
228 	rw_init(&bd_lock, NULL, RW_DRIVER, NULL);
229 	rv = mod_install(&modlinkage);
230 	if (rv != DDI_SUCCESS) {
231 		rw_destroy(&bd_lock);
232 		ddi_soft_state_fini(&bd_state);
233 	}
234 	return (rv);
235 }
236 
237 int
238 _fini(void)
239 {
240 	int	rv;
241 
242 	rv = mod_remove(&modlinkage);
243 	if (rv == DDI_SUCCESS) {
244 		rw_destroy(&bd_lock);
245 		ddi_soft_state_fini(&bd_state);
246 	}
247 	return (rv);
248 }
249 
250 int
251 _info(struct modinfo *modinfop)
252 {
253 	return (mod_info(&modlinkage, modinfop));
254 }
255 
256 static int
257 bd_getinfo(dev_info_t *dip, ddi_info_cmd_t cmd, void *arg, void **resultp)
258 {
259 	bd_t	*bd;
260 	minor_t	inst;
261 
262 	_NOTE(ARGUNUSED(dip));
263 
264 	inst = BDINST((dev_t)arg);
265 
266 	switch (cmd) {
267 	case DDI_INFO_DEVT2DEVINFO:
268 		bd = ddi_get_soft_state(bd_state, inst);
269 		if (bd == NULL) {
270 			return (DDI_FAILURE);
271 		}
272 		*resultp = (void *)bd->d_dip;
273 		break;
274 
275 	case DDI_INFO_DEVT2INSTANCE:
276 		*resultp = (void *)(intptr_t)inst;
277 		break;
278 
279 	default:
280 		return (DDI_FAILURE);
281 	}
282 	return (DDI_SUCCESS);
283 }
284 
285 static int
286 bd_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
287 {
288 	int		inst;
289 	bd_handle_t	hdl;
290 	bd_t		*bd;
291 	bd_drive_t	drive;
292 	int		rv;
293 	char		name[16];
294 	char		kcache[32];
295 
296 	switch (cmd) {
297 	case DDI_ATTACH:
298 		break;
299 	case DDI_RESUME:
300 		/* We don't do anything native for suspend/resume */
301 		return (DDI_SUCCESS);
302 	default:
303 		return (DDI_FAILURE);
304 	}
305 
306 	inst = ddi_get_instance(dip);
307 	hdl = ddi_get_parent_data(dip);
308 
309 	(void) snprintf(name, sizeof (name), "%s%d",
310 	    ddi_driver_name(dip), ddi_get_instance(dip));
311 	(void) snprintf(kcache, sizeof (kcache), "%s_xfer", name);
312 
313 	if (hdl == NULL) {
314 		cmn_err(CE_WARN, "%s: missing parent data!", name);
315 		return (DDI_FAILURE);
316 	}
317 
318 	if (ddi_soft_state_zalloc(bd_state, inst) != DDI_SUCCESS) {
319 		cmn_err(CE_WARN, "%s: unable to zalloc soft state!", name);
320 		return (DDI_FAILURE);
321 	}
322 	bd = ddi_get_soft_state(bd_state, inst);
323 
324 	if (hdl->h_dma) {
325 		bd->d_dma = *(hdl->h_dma);
326 		bd->d_dma.dma_attr_granular =
327 		    max(DEV_BSIZE, bd->d_dma.dma_attr_granular);
328 		bd->d_use_dma = B_TRUE;
329 
330 		if (bd->d_maxxfer &&
331 		    (bd->d_maxxfer != bd->d_dma.dma_attr_maxxfer)) {
332 			cmn_err(CE_WARN,
333 			    "%s: inconsistent maximum transfer size!",
334 			    name);
335 			/* We force it */
336 			bd->d_maxxfer = bd->d_dma.dma_attr_maxxfer;
337 		} else {
338 			bd->d_maxxfer = bd->d_dma.dma_attr_maxxfer;
339 		}
340 	} else {
341 		bd->d_use_dma = B_FALSE;
342 		if (bd->d_maxxfer == 0) {
343 			bd->d_maxxfer = 1024 * 1024;
344 		}
345 	}
346 	bd->d_ops = hdl->h_ops;
347 	bd->d_private = hdl->h_private;
348 	bd->d_blkshift = 9;	/* 512 bytes, to start */
349 
350 	if (bd->d_maxxfer % DEV_BSIZE) {
351 		cmn_err(CE_WARN, "%s: maximum transfer misaligned!", name);
352 		bd->d_maxxfer &= ~(DEV_BSIZE - 1);
353 	}
354 	if (bd->d_maxxfer < DEV_BSIZE) {
355 		cmn_err(CE_WARN, "%s: maximum transfer size too small!", name);
356 		ddi_soft_state_free(bd_state, inst);
357 		return (DDI_FAILURE);
358 	}
359 
360 	bd->d_dip = dip;
361 	bd->d_handle = hdl;
362 	hdl->h_bd = bd;
363 	ddi_set_driver_private(dip, bd);
364 
365 	mutex_init(&bd->d_iomutex, NULL, MUTEX_DRIVER, NULL);
366 	mutex_init(&bd->d_ocmutex, NULL, MUTEX_DRIVER, NULL);
367 	mutex_init(&bd->d_statemutex, NULL, MUTEX_DRIVER, NULL);
368 	cv_init(&bd->d_statecv, NULL, CV_DRIVER, NULL);
369 
370 	list_create(&bd->d_waitq, sizeof (bd_xfer_impl_t),
371 	    offsetof(struct bd_xfer_impl, i_linkage));
372 	list_create(&bd->d_runq, sizeof (bd_xfer_impl_t),
373 	    offsetof(struct bd_xfer_impl, i_linkage));
374 
375 	bd->d_cache = kmem_cache_create(kcache, sizeof (bd_xfer_impl_t), 8,
376 	    bd_xfer_ctor, bd_xfer_dtor, NULL, bd, NULL, 0);
377 
378 	bd->d_ksp = kstat_create(ddi_driver_name(dip), inst, NULL, "disk",
379 	    KSTAT_TYPE_IO, 1, KSTAT_FLAG_PERSISTENT);
380 	if (bd->d_ksp != NULL) {
381 		bd->d_ksp->ks_lock = &bd->d_iomutex;
382 		kstat_install(bd->d_ksp);
383 		bd->d_kiop = bd->d_ksp->ks_data;
384 	} else {
385 		/*
386 		 * Even if we cannot create the kstat, we create a
387 		 * scratch kstat.  The reason for this is to ensure
388 		 * that we can update the kstat all of the time,
389 		 * without adding an extra branch instruction.
390 		 */
391 		bd->d_kiop = kmem_zalloc(sizeof (kstat_io_t), KM_SLEEP);
392 	}
393 
394 	cmlb_alloc_handle(&bd->d_cmlbh);
395 
396 	bd->d_state = DKIO_NONE;
397 
398 	bzero(&drive, sizeof (drive));
399 	bd->d_ops.o_drive_info(bd->d_private, &drive);
400 	bd->d_qsize = drive.d_qsize;
401 	bd->d_removable = drive.d_removable;
402 	bd->d_hotpluggable = drive.d_hotpluggable;
403 
404 	if (drive.d_maxxfer && drive.d_maxxfer < bd->d_maxxfer)
405 		bd->d_maxxfer = drive.d_maxxfer;
406 
407 
408 	rv = cmlb_attach(dip, &bd_tg_ops, DTYPE_DIRECT,
409 	    bd->d_removable, bd->d_hotpluggable,
410 	    drive.d_lun >= 0 ? DDI_NT_BLOCK_CHAN : DDI_NT_BLOCK,
411 	    CMLB_FAKE_LABEL_ONE_PARTITION, bd->d_cmlbh, 0);
412 	if (rv != 0) {
413 		cmlb_free_handle(&bd->d_cmlbh);
414 		kmem_cache_destroy(bd->d_cache);
415 		mutex_destroy(&bd->d_iomutex);
416 		mutex_destroy(&bd->d_ocmutex);
417 		mutex_destroy(&bd->d_statemutex);
418 		cv_destroy(&bd->d_statecv);
419 		list_destroy(&bd->d_waitq);
420 		list_destroy(&bd->d_runq);
421 		if (bd->d_ksp != NULL) {
422 			kstat_delete(bd->d_ksp);
423 			bd->d_ksp = NULL;
424 		} else {
425 			kmem_free(bd->d_kiop, sizeof (kstat_io_t));
426 		}
427 		ddi_soft_state_free(bd_state, inst);
428 		return (DDI_FAILURE);
429 	}
430 
431 	if (bd->d_ops.o_devid_init != NULL) {
432 		rv = bd->d_ops.o_devid_init(bd->d_private, dip, &bd->d_devid);
433 		if (rv == DDI_SUCCESS) {
434 			if (ddi_devid_register(dip, bd->d_devid) !=
435 			    DDI_SUCCESS) {
436 				cmn_err(CE_WARN,
437 				    "%s: unable to register devid", name);
438 			}
439 		}
440 	}
441 
442 	/*
443 	 * Add a zero-length attribute to tell the world we support
444 	 * kernel ioctls (for layered drivers).  Also set up properties
445 	 * used by HAL to identify removable media.
446 	 */
447 	(void) ddi_prop_create(DDI_DEV_T_NONE, dip, DDI_PROP_CANSLEEP,
448 	    DDI_KERNEL_IOCTL, NULL, 0);
449 	if (bd->d_removable) {
450 		(void) ddi_prop_create(DDI_DEV_T_NONE, dip, DDI_PROP_CANSLEEP,
451 		    "removable-media", NULL, 0);
452 	}
453 	if (bd->d_hotpluggable) {
454 		(void) ddi_prop_create(DDI_DEV_T_NONE, dip, DDI_PROP_CANSLEEP,
455 		    "hotpluggable", NULL, 0);
456 	}
457 
458 	ddi_report_dev(dip);
459 
460 	return (DDI_SUCCESS);
461 }
462 
463 static int
464 bd_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
465 {
466 	bd_t	*bd;
467 
468 	bd = ddi_get_driver_private(dip);
469 
470 	switch (cmd) {
471 	case DDI_DETACH:
472 		break;
473 	case DDI_SUSPEND:
474 		/* We don't suspend, but our parent does */
475 		return (DDI_SUCCESS);
476 	default:
477 		return (DDI_FAILURE);
478 	}
479 	if (bd->d_ksp != NULL) {
480 		kstat_delete(bd->d_ksp);
481 		bd->d_ksp = NULL;
482 	} else {
483 		kmem_free(bd->d_kiop, sizeof (kstat_io_t));
484 	}
485 	cmlb_detach(bd->d_cmlbh, 0);
486 	cmlb_free_handle(&bd->d_cmlbh);
487 	if (bd->d_devid)
488 		ddi_devid_free(bd->d_devid);
489 	kmem_cache_destroy(bd->d_cache);
490 	mutex_destroy(&bd->d_iomutex);
491 	mutex_destroy(&bd->d_ocmutex);
492 	mutex_destroy(&bd->d_statemutex);
493 	cv_destroy(&bd->d_statecv);
494 	list_destroy(&bd->d_waitq);
495 	list_destroy(&bd->d_runq);
496 	ddi_soft_state_free(bd_state, ddi_get_instance(dip));
497 	return (DDI_SUCCESS);
498 }
499 
500 static int
501 bd_xfer_ctor(void *buf, void *arg, int kmflag)
502 {
503 	bd_xfer_impl_t	*xi;
504 	bd_t		*bd = arg;
505 	int		(*dcb)(caddr_t);
506 
507 	if (kmflag == KM_PUSHPAGE || kmflag == KM_SLEEP) {
508 		dcb = DDI_DMA_SLEEP;
509 	} else {
510 		dcb = DDI_DMA_DONTWAIT;
511 	}
512 
513 	xi = buf;
514 	bzero(xi, sizeof (*xi));
515 	xi->i_bd = bd;
516 
517 	if (bd->d_use_dma) {
518 		if (ddi_dma_alloc_handle(bd->d_dip, &bd->d_dma, dcb, NULL,
519 		    &xi->i_dmah) != DDI_SUCCESS) {
520 			return (-1);
521 		}
522 	}
523 
524 	return (0);
525 }
526 
527 static void
528 bd_xfer_dtor(void *buf, void *arg)
529 {
530 	bd_xfer_impl_t	*xi = buf;
531 
532 	_NOTE(ARGUNUSED(arg));
533 
534 	if (xi->i_dmah)
535 		ddi_dma_free_handle(&xi->i_dmah);
536 	xi->i_dmah = NULL;
537 }
538 
539 static bd_xfer_impl_t *
540 bd_xfer_alloc(bd_t *bd, struct buf *bp, int (*func)(void *, bd_xfer_t *),
541     int kmflag)
542 {
543 	bd_xfer_impl_t		*xi;
544 	int			rv;
545 	int			status;
546 	unsigned		dir;
547 	int			(*cb)(caddr_t);
548 	size_t			len;
549 	uint32_t		shift;
550 
551 	if (kmflag == KM_SLEEP) {
552 		cb = DDI_DMA_SLEEP;
553 	} else {
554 		cb = DDI_DMA_DONTWAIT;
555 	}
556 
557 	xi = kmem_cache_alloc(bd->d_cache, kmflag);
558 	if (xi == NULL) {
559 		bioerror(bp, ENOMEM);
560 		return (NULL);
561 	}
562 
563 	ASSERT(bp);
564 
565 	xi->i_bp = bp;
566 	xi->i_func = func;
567 	xi->i_blkno = bp->b_lblkno;
568 
569 	if (bp->b_bcount == 0) {
570 		xi->i_len = 0;
571 		xi->i_nblks = 0;
572 		xi->i_kaddr = NULL;
573 		xi->i_resid = 0;
574 		xi->i_num_win = 0;
575 		goto done;
576 	}
577 
578 	if (bp->b_flags & B_READ) {
579 		dir = DDI_DMA_READ;
580 		xi->i_func = bd->d_ops.o_read;
581 	} else {
582 		dir = DDI_DMA_WRITE;
583 		xi->i_func = bd->d_ops.o_write;
584 	}
585 
586 	shift = bd->d_blkshift;
587 	xi->i_blkshift = shift;
588 
589 	if (!bd->d_use_dma) {
590 		bp_mapin(bp);
591 		rv = 0;
592 		xi->i_offset = 0;
593 		xi->i_num_win =
594 		    (bp->b_bcount + (bd->d_maxxfer - 1)) / bd->d_maxxfer;
595 		xi->i_cur_win = 0;
596 		xi->i_len = min(bp->b_bcount, bd->d_maxxfer);
597 		xi->i_nblks = xi->i_len >> shift;
598 		xi->i_kaddr = bp->b_un.b_addr;
599 		xi->i_resid = bp->b_bcount;
600 	} else {
601 
602 		/*
603 		 * We have to use consistent DMA if the address is misaligned.
604 		 */
605 		if (((bp->b_flags & (B_PAGEIO | B_REMAPPED)) != B_PAGEIO) &&
606 		    ((uintptr_t)bp->b_un.b_addr & 0x7)) {
607 			dir |= DDI_DMA_CONSISTENT | DDI_DMA_PARTIAL;
608 		} else {
609 			dir |= DDI_DMA_STREAMING | DDI_DMA_PARTIAL;
610 		}
611 
612 		status = ddi_dma_buf_bind_handle(xi->i_dmah, bp, dir, cb,
613 		    NULL, &xi->i_dmac, &xi->i_ndmac);
614 		switch (status) {
615 		case DDI_DMA_MAPPED:
616 			xi->i_num_win = 1;
617 			xi->i_cur_win = 0;
618 			xi->i_offset = 0;
619 			xi->i_len = bp->b_bcount;
620 			xi->i_nblks = xi->i_len >> shift;
621 			xi->i_resid = bp->b_bcount;
622 			rv = 0;
623 			break;
624 		case DDI_DMA_PARTIAL_MAP:
625 			xi->i_cur_win = 0;
626 
627 			if ((ddi_dma_numwin(xi->i_dmah, &xi->i_num_win) !=
628 			    DDI_SUCCESS) ||
629 			    (ddi_dma_getwin(xi->i_dmah, 0, &xi->i_offset,
630 			    &len, &xi->i_dmac, &xi->i_ndmac) !=
631 			    DDI_SUCCESS) ||
632 			    (P2PHASE(len, shift) != 0)) {
633 				(void) ddi_dma_unbind_handle(xi->i_dmah);
634 				rv = EFAULT;
635 				goto done;
636 			}
637 			xi->i_len = len;
638 			xi->i_nblks = xi->i_len >> shift;
639 			xi->i_resid = bp->b_bcount;
640 			rv = 0;
641 			break;
642 		case DDI_DMA_NORESOURCES:
643 			rv = EAGAIN;
644 			goto done;
645 		case DDI_DMA_TOOBIG:
646 			rv = EINVAL;
647 			goto done;
648 		case DDI_DMA_NOMAPPING:
649 		case DDI_DMA_INUSE:
650 		default:
651 			rv = EFAULT;
652 			goto done;
653 		}
654 	}
655 
656 done:
657 	if (rv != 0) {
658 		kmem_cache_free(bd->d_cache, xi);
659 		bioerror(bp, rv);
660 		return (NULL);
661 	}
662 
663 	return (xi);
664 }
665 
666 static void
667 bd_xfer_free(bd_xfer_impl_t *xi)
668 {
669 	if (xi->i_dmah) {
670 		(void) ddi_dma_unbind_handle(xi->i_dmah);
671 	}
672 	kmem_cache_free(xi->i_bd->d_cache, xi);
673 }
674 
675 static int
676 bd_open(dev_t *devp, int flag, int otyp, cred_t *credp)
677 {
678 	dev_t		dev = *devp;
679 	bd_t		*bd;
680 	minor_t		part;
681 	minor_t		inst;
682 	uint64_t	mask;
683 	boolean_t	ndelay;
684 	int		rv;
685 	diskaddr_t	nblks;
686 	diskaddr_t	lba;
687 
688 	_NOTE(ARGUNUSED(credp));
689 
690 	part = BDPART(dev);
691 	inst = BDINST(dev);
692 
693 	if (otyp >= OTYPCNT)
694 		return (EINVAL);
695 
696 	ndelay = (flag & (FNDELAY | FNONBLOCK)) ? B_TRUE : B_FALSE;
697 
698 	/*
699 	 * Block any DR events from changing the set of registered
700 	 * devices while we function.
701 	 */
702 	rw_enter(&bd_lock, RW_READER);
703 	if ((bd = ddi_get_soft_state(bd_state, inst)) == NULL) {
704 		rw_exit(&bd_lock);
705 		return (ENXIO);
706 	}
707 
708 	mutex_enter(&bd->d_ocmutex);
709 
710 	ASSERT(part < 64);
711 	mask = (1U << part);
712 
713 	bd_update_state(bd);
714 
715 	if (cmlb_validate(bd->d_cmlbh, 0, 0) != 0) {
716 
717 		/* non-blocking opens are allowed to succeed */
718 		if (!ndelay) {
719 			rv = ENXIO;
720 			goto done;
721 		}
722 	} else if (cmlb_partinfo(bd->d_cmlbh, part, &nblks, &lba,
723 	    NULL, NULL, 0) == 0) {
724 
725 		/*
726 		 * We read the partinfo, verify valid ranges.  If the
727 		 * partition is invalid, and we aren't blocking or
728 		 * doing a raw access, then fail. (Non-blocking and
729 		 * raw accesses can still succeed to allow a disk with
730 		 * bad partition data to opened by format and fdisk.)
731 		 */
732 		if ((!nblks) && ((!ndelay) || (otyp != OTYP_CHR))) {
733 			rv = ENXIO;
734 			goto done;
735 		}
736 	} else if (!ndelay) {
737 		/*
738 		 * cmlb_partinfo failed -- invalid partition or no
739 		 * disk label.
740 		 */
741 		rv = ENXIO;
742 		goto done;
743 	}
744 
745 	if ((flag & FWRITE) && bd->d_rdonly) {
746 		rv = EROFS;
747 		goto done;
748 	}
749 
750 	if ((bd->d_open_excl) & (mask)) {
751 		rv = EBUSY;
752 		goto done;
753 	}
754 	if (flag & FEXCL) {
755 		if (bd->d_open_lyr[part]) {
756 			rv = EBUSY;
757 			goto done;
758 		}
759 		for (int i = 0; i < OTYP_LYR; i++) {
760 			if (bd->d_open_reg[i] & mask) {
761 				rv = EBUSY;
762 				goto done;
763 			}
764 		}
765 	}
766 
767 	if (otyp == OTYP_LYR) {
768 		bd->d_open_lyr[part]++;
769 	} else {
770 		bd->d_open_reg[otyp] |= mask;
771 	}
772 	if (flag & FEXCL) {
773 		bd->d_open_excl |= mask;
774 	}
775 
776 	rv = 0;
777 done:
778 	mutex_exit(&bd->d_ocmutex);
779 	rw_exit(&bd_lock);
780 
781 	return (rv);
782 }
783 
784 static int
785 bd_close(dev_t dev, int flag, int otyp, cred_t *credp)
786 {
787 	bd_t		*bd;
788 	minor_t		inst;
789 	minor_t		part;
790 	uint64_t	mask;
791 	boolean_t	last = B_TRUE;
792 
793 	_NOTE(ARGUNUSED(flag));
794 	_NOTE(ARGUNUSED(credp));
795 
796 	part = BDPART(dev);
797 	inst = BDINST(dev);
798 
799 	ASSERT(part < 64);
800 	mask = (1U << part);
801 
802 	rw_enter(&bd_lock, RW_READER);
803 
804 	if ((bd = ddi_get_soft_state(bd_state, inst)) == NULL) {
805 		rw_exit(&bd_lock);
806 		return (ENXIO);
807 	}
808 
809 	mutex_enter(&bd->d_ocmutex);
810 	if (bd->d_open_excl & mask) {
811 		bd->d_open_excl &= ~mask;
812 	}
813 	if (otyp == OTYP_LYR) {
814 		bd->d_open_lyr[part]--;
815 	} else {
816 		bd->d_open_reg[otyp] &= ~mask;
817 	}
818 	for (int i = 0; i < 64; i++) {
819 		if (bd->d_open_lyr[part]) {
820 			last = B_FALSE;
821 		}
822 	}
823 	for (int i = 0; last && (i < OTYP_LYR); i++) {
824 		if (bd->d_open_reg[i]) {
825 			last = B_FALSE;
826 		}
827 	}
828 	mutex_exit(&bd->d_ocmutex);
829 
830 	if (last) {
831 		cmlb_invalidate(bd->d_cmlbh, 0);
832 	}
833 	rw_exit(&bd_lock);
834 
835 	return (0);
836 }
837 
838 static int
839 bd_dump(dev_t dev, caddr_t caddr, daddr_t blkno, int nblk)
840 {
841 	minor_t		inst;
842 	minor_t		part;
843 	diskaddr_t	pstart;
844 	diskaddr_t	psize;
845 	bd_t		*bd;
846 	bd_xfer_impl_t	*xi;
847 	buf_t		*bp;
848 	int		rv;
849 
850 	rw_enter(&bd_lock, RW_READER);
851 
852 	part = BDPART(dev);
853 	inst = BDINST(dev);
854 
855 	if ((bd = ddi_get_soft_state(bd_state, inst)) == NULL) {
856 		rw_exit(&bd_lock);
857 		return (ENXIO);
858 	}
859 	/*
860 	 * do cmlb, but do it synchronously unless we already have the
861 	 * partition (which we probably should.)
862 	 */
863 	if (cmlb_partinfo(bd->d_cmlbh, part, &psize, &pstart, NULL, NULL,
864 	    (void *)1)) {
865 		rw_exit(&bd_lock);
866 		return (ENXIO);
867 	}
868 
869 	if ((blkno + nblk) > psize) {
870 		rw_exit(&bd_lock);
871 		return (EINVAL);
872 	}
873 	bp = getrbuf(KM_NOSLEEP);
874 	if (bp == NULL) {
875 		rw_exit(&bd_lock);
876 		return (ENOMEM);
877 	}
878 
879 	bp->b_bcount = nblk << bd->d_blkshift;
880 	bp->b_resid = bp->b_bcount;
881 	bp->b_lblkno = blkno;
882 	bp->b_un.b_addr = caddr;
883 
884 	xi = bd_xfer_alloc(bd, bp,  bd->d_ops.o_write, KM_NOSLEEP);
885 	if (xi == NULL) {
886 		rw_exit(&bd_lock);
887 		freerbuf(bp);
888 		return (ENOMEM);
889 	}
890 	xi->i_blkno = blkno + pstart;
891 	xi->i_flags = BD_XFER_POLL;
892 	bd_submit(bd, xi);
893 	rw_exit(&bd_lock);
894 
895 	/*
896 	 * Generally, we should have run this entirely synchronously
897 	 * at this point and the biowait call should be a no-op.  If
898 	 * it didn't happen this way, it's a bug in the underlying
899 	 * driver not honoring BD_XFER_POLL.
900 	 */
901 	(void) biowait(bp);
902 	rv = geterror(bp);
903 	freerbuf(bp);
904 	return (rv);
905 }
906 
907 void
908 bd_minphys(struct buf *bp)
909 {
910 	minor_t inst;
911 	bd_t	*bd;
912 	inst = BDINST(bp->b_edev);
913 
914 	bd = ddi_get_soft_state(bd_state, inst);
915 
916 	/*
917 	 * In a non-debug kernel, bd_strategy will catch !bd as
918 	 * well, and will fail nicely.
919 	 */
920 	ASSERT(bd);
921 
922 	if (bp->b_bcount > bd->d_maxxfer)
923 		bp->b_bcount = bd->d_maxxfer;
924 }
925 
926 static int
927 bd_read(dev_t dev, struct uio *uio, cred_t *credp)
928 {
929 	_NOTE(ARGUNUSED(credp));
930 	return (physio(bd_strategy, NULL, dev, B_READ, bd_minphys, uio));
931 }
932 
933 static int
934 bd_write(dev_t dev, struct uio *uio, cred_t *credp)
935 {
936 	_NOTE(ARGUNUSED(credp));
937 	return (physio(bd_strategy, NULL, dev, B_WRITE, bd_minphys, uio));
938 }
939 
940 static int
941 bd_aread(dev_t dev, struct aio_req *aio, cred_t *credp)
942 {
943 	_NOTE(ARGUNUSED(credp));
944 	return (aphysio(bd_strategy, anocancel, dev, B_READ, bd_minphys, aio));
945 }
946 
947 static int
948 bd_awrite(dev_t dev, struct aio_req *aio, cred_t *credp)
949 {
950 	_NOTE(ARGUNUSED(credp));
951 	return (aphysio(bd_strategy, anocancel, dev, B_WRITE, bd_minphys, aio));
952 }
953 
954 static int
955 bd_strategy(struct buf *bp)
956 {
957 	minor_t		inst;
958 	minor_t		part;
959 	bd_t		*bd;
960 	diskaddr_t	p_lba;
961 	diskaddr_t	p_nblks;
962 	diskaddr_t	b_nblks;
963 	bd_xfer_impl_t	*xi;
964 	uint32_t	shift;
965 	int		(*func)(void *, bd_xfer_t *);
966 
967 	part = BDPART(bp->b_edev);
968 	inst = BDINST(bp->b_edev);
969 
970 	ASSERT(bp);
971 
972 	bp->b_resid = bp->b_bcount;
973 
974 	if ((bd = ddi_get_soft_state(bd_state, inst)) == NULL) {
975 		bioerror(bp, ENXIO);
976 		biodone(bp);
977 		return (0);
978 	}
979 
980 	if (cmlb_partinfo(bd->d_cmlbh, part, &p_nblks, &p_lba,
981 	    NULL, NULL, 0)) {
982 		bioerror(bp, ENXIO);
983 		biodone(bp);
984 		return (0);
985 	}
986 
987 	shift = bd->d_blkshift;
988 
989 	if ((P2PHASE(bp->b_bcount, (1U << shift)) != 0) ||
990 	    (bp->b_lblkno > p_nblks)) {
991 		bioerror(bp, ENXIO);
992 		biodone(bp);
993 		return (0);
994 	}
995 	b_nblks = bp->b_bcount >> shift;
996 	if ((bp->b_lblkno == p_nblks) || (bp->b_bcount == 0)) {
997 		biodone(bp);
998 		return (0);
999 	}
1000 
1001 	if ((b_nblks + bp->b_lblkno) > p_nblks) {
1002 		bp->b_resid = ((bp->b_lblkno + b_nblks - p_nblks) << shift);
1003 		bp->b_bcount -= bp->b_resid;
1004 	} else {
1005 		bp->b_resid = 0;
1006 	}
1007 	func = (bp->b_flags & B_READ) ? bd->d_ops.o_read : bd->d_ops.o_write;
1008 
1009 	xi = bd_xfer_alloc(bd, bp, func, KM_NOSLEEP);
1010 	if (xi == NULL) {
1011 		xi = bd_xfer_alloc(bd, bp, func, KM_PUSHPAGE);
1012 	}
1013 	if (xi == NULL) {
1014 		/* bd_request_alloc will have done bioerror */
1015 		biodone(bp);
1016 		return (0);
1017 	}
1018 	xi->i_blkno = bp->b_lblkno + p_lba;
1019 
1020 	bd_submit(bd, xi);
1021 
1022 	return (0);
1023 }
1024 
1025 static int
1026 bd_ioctl(dev_t dev, int cmd, intptr_t arg, int flag, cred_t *credp, int *rvalp)
1027 {
1028 	minor_t		inst;
1029 	uint16_t	part;
1030 	bd_t		*bd;
1031 	void		*ptr = (void *)arg;
1032 	int		rv;
1033 
1034 	part = BDPART(dev);
1035 	inst = BDINST(dev);
1036 
1037 	if ((bd = ddi_get_soft_state(bd_state, inst)) == NULL) {
1038 		return (ENXIO);
1039 	}
1040 
1041 	rv = cmlb_ioctl(bd->d_cmlbh, dev, cmd, arg, flag, credp, rvalp, 0);
1042 	if (rv != ENOTTY)
1043 		return (rv);
1044 
1045 	switch (cmd) {
1046 	case DKIOCGMEDIAINFO: {
1047 		struct dk_minfo minfo;
1048 
1049 		/* make sure our state information is current */
1050 		bd_update_state(bd);
1051 		bzero(&minfo, sizeof (minfo));
1052 		minfo.dki_media_type = DK_FIXED_DISK;
1053 		minfo.dki_lbsize = (1U << bd->d_blkshift);
1054 		minfo.dki_capacity = bd->d_numblks;
1055 		if (ddi_copyout(&minfo, ptr, sizeof (minfo), flag)) {
1056 			return (EFAULT);
1057 		}
1058 		return (0);
1059 	}
1060 	case DKIOCGMEDIAINFOEXT: {
1061 		struct dk_minfo_ext miext;
1062 
1063 		/* make sure our state information is current */
1064 		bd_update_state(bd);
1065 		bzero(&miext, sizeof (miext));
1066 		miext.dki_media_type = DK_FIXED_DISK;
1067 		miext.dki_lbsize = (1U << bd->d_blkshift);
1068 		miext.dki_pbsize = miext.dki_lbsize;
1069 		miext.dki_capacity = bd->d_numblks;
1070 		if (ddi_copyout(&miext, ptr, sizeof (miext), flag)) {
1071 			return (EFAULT);
1072 		}
1073 		return (0);
1074 	}
1075 	case DKIOCINFO: {
1076 		struct dk_cinfo cinfo;
1077 		bzero(&cinfo, sizeof (cinfo));
1078 		cinfo.dki_ctype = DKC_BLKDEV;
1079 		cinfo.dki_cnum = ddi_get_instance(ddi_get_parent(bd->d_dip));
1080 		(void) snprintf(cinfo.dki_cname, sizeof (cinfo.dki_cname),
1081 		    "%s", ddi_driver_name(ddi_get_parent(bd->d_dip)));
1082 		(void) snprintf(cinfo.dki_dname, sizeof (cinfo.dki_dname),
1083 		    "%s", ddi_driver_name(bd->d_dip));
1084 		cinfo.dki_unit = inst;
1085 		cinfo.dki_flags = DKI_FMTVOL;
1086 		cinfo.dki_partition = part;
1087 		cinfo.dki_maxtransfer = bd->d_maxxfer / DEV_BSIZE;
1088 		cinfo.dki_addr = 0;
1089 		cinfo.dki_slave = 0;
1090 		cinfo.dki_space = 0;
1091 		cinfo.dki_prio = 0;
1092 		cinfo.dki_vec = 0;
1093 		if (ddi_copyout(&cinfo, ptr, sizeof (cinfo), flag)) {
1094 			return (EFAULT);
1095 		}
1096 		return (0);
1097 	}
1098 	case DKIOCREMOVABLE: {
1099 		int i;
1100 		i = bd->d_removable ? 1 : 0;
1101 		if (ddi_copyout(&i, ptr, sizeof (i), flag)) {
1102 			return (EFAULT);
1103 		}
1104 		return (0);
1105 	}
1106 	case DKIOCHOTPLUGGABLE: {
1107 		int i;
1108 		i = bd->d_hotpluggable ? 1 : 0;
1109 		if (ddi_copyout(&i, ptr, sizeof (i), flag)) {
1110 			return (EFAULT);
1111 		}
1112 		return (0);
1113 	}
1114 	case DKIOCREADONLY: {
1115 		int i;
1116 		i = bd->d_rdonly ? 1 : 0;
1117 		if (ddi_copyout(&i, ptr, sizeof (i), flag)) {
1118 			return (EFAULT);
1119 		}
1120 		return (0);
1121 	}
1122 	case DKIOCSOLIDSTATE: {
1123 		int i;
1124 		i = bd->d_ssd ? 1 : 0;
1125 		if (ddi_copyout(&i, ptr, sizeof (i), flag)) {
1126 			return (EFAULT);
1127 		}
1128 		return (0);
1129 	}
1130 	case DKIOCSTATE: {
1131 		enum dkio_state	state;
1132 		if (ddi_copyin(ptr, &state, sizeof (state), flag)) {
1133 			return (EFAULT);
1134 		}
1135 		if ((rv = bd_check_state(bd, &state)) != 0) {
1136 			return (rv);
1137 		}
1138 		if (ddi_copyout(&state, ptr, sizeof (state), flag)) {
1139 			return (EFAULT);
1140 		}
1141 		return (0);
1142 	}
1143 	case DKIOCFLUSHWRITECACHE: {
1144 		struct dk_callback *dkc = NULL;
1145 
1146 		if (flag & FKIOCTL)
1147 			dkc = (void *)arg;
1148 
1149 		rv = bd_flush_write_cache(bd, dkc);
1150 		return (rv);
1151 	}
1152 
1153 	default:
1154 		break;
1155 
1156 	}
1157 	return (ENOTTY);
1158 }
1159 
1160 static int
1161 bd_prop_op(dev_t dev, dev_info_t *dip, ddi_prop_op_t prop_op, int mod_flags,
1162     char *name, caddr_t valuep, int *lengthp)
1163 {
1164 	bd_t	*bd;
1165 
1166 	bd = ddi_get_soft_state(bd_state, ddi_get_instance(dip));
1167 	if (bd == NULL)
1168 		return (ddi_prop_op(dev, dip, prop_op, mod_flags,
1169 		    name, valuep, lengthp));
1170 
1171 	return (cmlb_prop_op(bd->d_cmlbh, dev, dip, prop_op, mod_flags, name,
1172 	    valuep, lengthp, BDPART(dev), 0));
1173 }
1174 
1175 
1176 static int
1177 bd_tg_rdwr(dev_info_t *dip, uchar_t cmd, void *bufaddr, diskaddr_t start,
1178     size_t length, void *tg_cookie)
1179 {
1180 	bd_t		*bd;
1181 	buf_t		*bp;
1182 	bd_xfer_impl_t	*xi;
1183 	int		rv;
1184 	int		(*func)(void *, bd_xfer_t *);
1185 	int		kmflag;
1186 
1187 	/*
1188 	 * If we are running in polled mode (such as during dump(9e)
1189 	 * execution), then we cannot sleep for kernel allocations.
1190 	 */
1191 	kmflag = tg_cookie ? KM_NOSLEEP : KM_SLEEP;
1192 
1193 	bd = ddi_get_soft_state(bd_state, ddi_get_instance(dip));
1194 
1195 	if (P2PHASE(length, (1U << bd->d_blkshift)) != 0) {
1196 		/* We can only transfer whole blocks at a time! */
1197 		return (EINVAL);
1198 	}
1199 
1200 	if ((bp = getrbuf(kmflag)) == NULL) {
1201 		return (ENOMEM);
1202 	}
1203 
1204 	switch (cmd) {
1205 	case TG_READ:
1206 		bp->b_flags = B_READ;
1207 		func = bd->d_ops.o_read;
1208 		break;
1209 	case TG_WRITE:
1210 		bp->b_flags = B_WRITE;
1211 		func = bd->d_ops.o_write;
1212 		break;
1213 	default:
1214 		freerbuf(bp);
1215 		return (EINVAL);
1216 	}
1217 
1218 	bp->b_un.b_addr = bufaddr;
1219 	bp->b_bcount = length;
1220 	xi = bd_xfer_alloc(bd, bp, func, kmflag);
1221 	if (xi == NULL) {
1222 		rv = geterror(bp);
1223 		freerbuf(bp);
1224 		return (rv);
1225 	}
1226 	xi->i_flags = tg_cookie ? BD_XFER_POLL : 0;
1227 	xi->i_blkno = start;
1228 	bd_submit(bd, xi);
1229 	(void) biowait(bp);
1230 	rv = geterror(bp);
1231 	freerbuf(bp);
1232 
1233 	return (rv);
1234 }
1235 
1236 static int
1237 bd_tg_getinfo(dev_info_t *dip, int cmd, void *arg, void *tg_cookie)
1238 {
1239 	bd_t		*bd;
1240 
1241 	_NOTE(ARGUNUSED(tg_cookie));
1242 	bd = ddi_get_soft_state(bd_state, ddi_get_instance(dip));
1243 
1244 	switch (cmd) {
1245 	case TG_GETPHYGEOM:
1246 	case TG_GETVIRTGEOM:
1247 		/*
1248 		 * We don't have any "geometry" as such, let cmlb
1249 		 * fabricate something.
1250 		 */
1251 		return (ENOTTY);
1252 
1253 	case TG_GETCAPACITY:
1254 		bd_update_state(bd);
1255 		*(diskaddr_t *)arg = bd->d_numblks;
1256 		return (0);
1257 
1258 	case TG_GETBLOCKSIZE:
1259 		*(uint32_t *)arg = (1U << bd->d_blkshift);
1260 		return (0);
1261 
1262 	case TG_GETATTR:
1263 		/*
1264 		 * It turns out that cmlb really doesn't do much for
1265 		 * non-writable media, but lets make the information
1266 		 * available for it in case it does more in the
1267 		 * future.  (The value is currently used for
1268 		 * triggering special behavior for CD-ROMs.)
1269 		 */
1270 		bd_update_state(bd);
1271 		((tg_attribute_t *)arg)->media_is_writable =
1272 		    bd->d_rdonly ? B_FALSE : B_TRUE;
1273 		((tg_attribute_t *)arg)->media_is_solid_state = bd->d_ssd;
1274 		return (0);
1275 
1276 	default:
1277 		return (EINVAL);
1278 	}
1279 }
1280 
1281 
1282 static void
1283 bd_sched(bd_t *bd)
1284 {
1285 	bd_xfer_impl_t	*xi;
1286 	struct buf	*bp;
1287 	int		rv;
1288 
1289 	mutex_enter(&bd->d_iomutex);
1290 
1291 	while ((bd->d_qactive < bd->d_qsize) &&
1292 	    ((xi = list_remove_head(&bd->d_waitq)) != NULL)) {
1293 		bd->d_qactive++;
1294 		kstat_waitq_to_runq(bd->d_kiop);
1295 		list_insert_tail(&bd->d_runq, xi);
1296 
1297 		/*
1298 		 * Submit the job to the driver.  We drop the I/O mutex
1299 		 * so that we can deal with the case where the driver
1300 		 * completion routine calls back into us synchronously.
1301 		 */
1302 
1303 		mutex_exit(&bd->d_iomutex);
1304 
1305 		rv = xi->i_func(bd->d_private, &xi->i_public);
1306 		if (rv != 0) {
1307 			bp = xi->i_bp;
1308 			bd_xfer_free(xi);
1309 			bioerror(bp, rv);
1310 			biodone(bp);
1311 
1312 			mutex_enter(&bd->d_iomutex);
1313 			bd->d_qactive--;
1314 			kstat_runq_exit(bd->d_kiop);
1315 			list_remove(&bd->d_runq, xi);
1316 		} else {
1317 			mutex_enter(&bd->d_iomutex);
1318 		}
1319 	}
1320 
1321 	mutex_exit(&bd->d_iomutex);
1322 }
1323 
1324 static void
1325 bd_submit(bd_t *bd, bd_xfer_impl_t *xi)
1326 {
1327 	mutex_enter(&bd->d_iomutex);
1328 	list_insert_tail(&bd->d_waitq, xi);
1329 	kstat_waitq_enter(bd->d_kiop);
1330 	mutex_exit(&bd->d_iomutex);
1331 
1332 	bd_sched(bd);
1333 }
1334 
1335 static void
1336 bd_runq_exit(bd_xfer_impl_t *xi, int err)
1337 {
1338 	bd_t	*bd = xi->i_bd;
1339 	buf_t	*bp = xi->i_bp;
1340 
1341 	mutex_enter(&bd->d_iomutex);
1342 	bd->d_qactive--;
1343 	kstat_runq_exit(bd->d_kiop);
1344 	list_remove(&bd->d_runq, xi);
1345 	mutex_exit(&bd->d_iomutex);
1346 
1347 	if (err == 0) {
1348 		if (bp->b_flags & B_READ) {
1349 			bd->d_kiop->reads++;
1350 			bd->d_kiop->nread += (bp->b_bcount - xi->i_resid);
1351 		} else {
1352 			bd->d_kiop->writes++;
1353 			bd->d_kiop->nwritten += (bp->b_bcount - xi->i_resid);
1354 		}
1355 	}
1356 	bd_sched(bd);
1357 }
1358 
1359 static void
1360 bd_update_state(bd_t *bd)
1361 {
1362 	enum	dkio_state	state;
1363 	bd_media_t		media;
1364 	boolean_t		docmlb = B_FALSE;
1365 
1366 	bzero(&media, sizeof (media));
1367 
1368 	mutex_enter(&bd->d_statemutex);
1369 	if (bd->d_ops.o_media_info(bd->d_private, &media) == 0) {
1370 		if ((1U << bd->d_blkshift) != media.m_blksize) {
1371 			if ((media.m_blksize < 512) ||
1372 			    (!ISP2(media.m_blksize)) ||
1373 			    (P2PHASE(bd->d_maxxfer, media.m_blksize))) {
1374 				cmn_err(CE_WARN,
1375 				    "%s%d: Invalid media block size (%d)",
1376 				    ddi_driver_name(bd->d_dip),
1377 				    ddi_get_instance(bd->d_dip),
1378 				    media.m_blksize);
1379 				/*
1380 				 * We can't use the media, treat it as
1381 				 * not present.
1382 				 */
1383 				state = DKIO_EJECTED;
1384 				bd->d_numblks = 0;
1385 			} else {
1386 				bd->d_blkshift = ddi_ffs(media.m_blksize) - 1;
1387 				bd->d_numblks = media.m_nblks;
1388 				bd->d_rdonly = media.m_readonly;
1389 				bd->d_ssd = media.m_solidstate;
1390 				state = DKIO_INSERTED;
1391 			}
1392 
1393 			/* Device size changed */
1394 			docmlb = B_TRUE;
1395 
1396 		} else {
1397 			if (bd->d_numblks != media.m_nblks) {
1398 				/* Device size changed */
1399 				docmlb = B_TRUE;
1400 			}
1401 			bd->d_numblks = media.m_nblks;
1402 			bd->d_rdonly = media.m_readonly;
1403 			state = DKIO_INSERTED;
1404 		}
1405 
1406 	} else {
1407 		bd->d_numblks = 0;
1408 		state = DKIO_EJECTED;
1409 	}
1410 	if (state != bd->d_state) {
1411 		bd->d_state = state;
1412 		cv_broadcast(&bd->d_statecv);
1413 		docmlb = B_TRUE;
1414 	}
1415 	mutex_exit(&bd->d_statemutex);
1416 
1417 	if (docmlb) {
1418 		if (state == DKIO_INSERTED) {
1419 			(void) cmlb_validate(bd->d_cmlbh, 0, 0);
1420 		} else {
1421 			cmlb_invalidate(bd->d_cmlbh, 0);
1422 		}
1423 	}
1424 }
1425 
1426 static int
1427 bd_check_state(bd_t *bd, enum dkio_state *state)
1428 {
1429 	clock_t		when;
1430 
1431 	for (;;) {
1432 
1433 		bd_update_state(bd);
1434 
1435 		mutex_enter(&bd->d_statemutex);
1436 
1437 		if (bd->d_state != *state) {
1438 			*state = bd->d_state;
1439 			mutex_exit(&bd->d_statemutex);
1440 			break;
1441 		}
1442 
1443 		when = drv_usectohz(1000000);
1444 		if (cv_reltimedwait_sig(&bd->d_statecv, &bd->d_statemutex,
1445 		    when, TR_CLOCK_TICK) == 0) {
1446 			mutex_exit(&bd->d_statemutex);
1447 			return (EINTR);
1448 		}
1449 
1450 		mutex_exit(&bd->d_statemutex);
1451 	}
1452 
1453 	return (0);
1454 }
1455 
1456 static int
1457 bd_flush_write_cache_done(struct buf *bp)
1458 {
1459 	struct dk_callback *dc = (void *)bp->b_private;
1460 
1461 	(*dc->dkc_callback)(dc->dkc_cookie, geterror(bp));
1462 	kmem_free(dc, sizeof (*dc));
1463 	freerbuf(bp);
1464 	return (0);
1465 }
1466 
1467 static int
1468 bd_flush_write_cache(bd_t *bd, struct dk_callback *dkc)
1469 {
1470 	buf_t			*bp;
1471 	struct dk_callback	*dc;
1472 	bd_xfer_impl_t		*xi;
1473 	int			rv;
1474 
1475 	if (bd->d_ops.o_sync_cache == NULL) {
1476 		return (ENOTSUP);
1477 	}
1478 	if ((bp = getrbuf(KM_SLEEP)) == NULL) {
1479 		return (ENOMEM);
1480 	}
1481 	bp->b_resid = 0;
1482 	bp->b_bcount = 0;
1483 
1484 	xi = bd_xfer_alloc(bd, bp, bd->d_ops.o_sync_cache, KM_SLEEP);
1485 	if (xi == NULL) {
1486 		rv = geterror(bp);
1487 		freerbuf(bp);
1488 		return (rv);
1489 	}
1490 
1491 	/* Make an asynchronous flush, but only if there is a callback */
1492 	if (dkc != NULL && dkc->dkc_callback != NULL) {
1493 		/* Make a private copy of the callback structure */
1494 		dc = kmem_alloc(sizeof (*dc), KM_SLEEP);
1495 		*dc = *dkc;
1496 		bp->b_private = dc;
1497 		bp->b_iodone = bd_flush_write_cache_done;
1498 
1499 		bd_submit(bd, xi);
1500 		return (0);
1501 	}
1502 
1503 	/* In case there is no callback, perform a synchronous flush */
1504 	bd_submit(bd, xi);
1505 	(void) biowait(bp);
1506 	rv = geterror(bp);
1507 	freerbuf(bp);
1508 
1509 	return (rv);
1510 }
1511 
1512 /*
1513  * Nexus support.
1514  */
1515 int
1516 bd_bus_ctl(dev_info_t *dip, dev_info_t *rdip, ddi_ctl_enum_t ctlop,
1517     void *arg, void *result)
1518 {
1519 	bd_handle_t	hdl;
1520 
1521 	switch (ctlop) {
1522 	case DDI_CTLOPS_REPORTDEV:
1523 		cmn_err(CE_CONT, "?Block device: %s@%s, %s%d\n",
1524 		    ddi_node_name(rdip), ddi_get_name_addr(rdip),
1525 		    ddi_driver_name(rdip), ddi_get_instance(rdip));
1526 		return (DDI_SUCCESS);
1527 
1528 	case DDI_CTLOPS_INITCHILD:
1529 		hdl = ddi_get_parent_data((dev_info_t *)arg);
1530 		if (hdl == NULL) {
1531 			return (DDI_NOT_WELL_FORMED);
1532 		}
1533 		ddi_set_name_addr((dev_info_t *)arg, hdl->h_addr);
1534 		return (DDI_SUCCESS);
1535 
1536 	case DDI_CTLOPS_UNINITCHILD:
1537 		ddi_set_name_addr((dev_info_t *)arg, NULL);
1538 		ndi_prop_remove_all((dev_info_t *)arg);
1539 		return (DDI_SUCCESS);
1540 
1541 	default:
1542 		return (ddi_ctlops(dip, rdip, ctlop, arg, result));
1543 	}
1544 }
1545 
1546 /*
1547  * Functions for device drivers.
1548  */
1549 bd_handle_t
1550 bd_alloc_handle(void *private, bd_ops_t *ops, ddi_dma_attr_t *dma, int kmflag)
1551 {
1552 	bd_handle_t	hdl;
1553 
1554 	hdl = kmem_zalloc(sizeof (*hdl), kmflag);
1555 	if (hdl != NULL) {
1556 		hdl->h_ops = *ops;
1557 		hdl->h_dma = dma;
1558 		hdl->h_private = private;
1559 	}
1560 
1561 	return (hdl);
1562 }
1563 
1564 void
1565 bd_free_handle(bd_handle_t hdl)
1566 {
1567 	kmem_free(hdl, sizeof (*hdl));
1568 }
1569 
1570 int
1571 bd_attach_handle(dev_info_t *dip, bd_handle_t hdl)
1572 {
1573 	dev_info_t	*child;
1574 	bd_drive_t	drive;
1575 
1576 	/* if drivers don't override this, make it assume none */
1577 	drive.d_lun = -1;
1578 	hdl->h_ops.o_drive_info(hdl->h_private, &drive);
1579 
1580 	hdl->h_parent = dip;
1581 	hdl->h_name = "blkdev";
1582 
1583 	if (drive.d_lun >= 0) {
1584 		(void) snprintf(hdl->h_addr, sizeof (hdl->h_addr), "%X,%X",
1585 		    drive.d_target, drive.d_lun);
1586 	} else {
1587 		(void) snprintf(hdl->h_addr, sizeof (hdl->h_addr), "%X",
1588 		    drive.d_target);
1589 	}
1590 	if (ndi_devi_alloc(dip, hdl->h_name, (pnode_t)DEVI_SID_NODEID,
1591 	    &child) != NDI_SUCCESS) {
1592 		cmn_err(CE_WARN, "%s%d: unable to allocate node %s@%s",
1593 		    ddi_driver_name(dip), ddi_get_instance(dip),
1594 		    "blkdev", hdl->h_addr);
1595 		return (DDI_FAILURE);
1596 	}
1597 
1598 	ddi_set_parent_data(child, hdl);
1599 	hdl->h_child = child;
1600 
1601 	if (ndi_devi_online(child, 0) == NDI_FAILURE) {
1602 		cmn_err(CE_WARN, "%s%d: failed bringing node %s@%s online",
1603 		    ddi_driver_name(dip), ddi_get_instance(dip),
1604 		    hdl->h_name, hdl->h_addr);
1605 		(void) ndi_devi_free(child);
1606 		return (DDI_FAILURE);
1607 	}
1608 
1609 	return (DDI_SUCCESS);
1610 }
1611 
1612 int
1613 bd_detach_handle(bd_handle_t hdl)
1614 {
1615 	int	circ;
1616 	int	rv;
1617 	char	*devnm;
1618 
1619 	if (hdl->h_child == NULL) {
1620 		return (DDI_SUCCESS);
1621 	}
1622 	ndi_devi_enter(hdl->h_parent, &circ);
1623 	if (i_ddi_node_state(hdl->h_child) < DS_INITIALIZED) {
1624 		rv = ddi_remove_child(hdl->h_child, 0);
1625 	} else {
1626 		devnm = kmem_alloc(MAXNAMELEN + 1, KM_SLEEP);
1627 		(void) ddi_deviname(hdl->h_child, devnm);
1628 		(void) devfs_clean(hdl->h_parent, devnm + 1, DV_CLEAN_FORCE);
1629 		rv = ndi_devi_unconfig_one(hdl->h_parent, devnm + 1, NULL,
1630 		    NDI_DEVI_REMOVE | NDI_UNCONFIG);
1631 		kmem_free(devnm, MAXNAMELEN + 1);
1632 	}
1633 	if (rv == 0) {
1634 		hdl->h_child = NULL;
1635 	}
1636 
1637 	ndi_devi_exit(hdl->h_parent, circ);
1638 	return (rv = NDI_SUCCESS ? DDI_SUCCESS : DDI_FAILURE);
1639 }
1640 
1641 void
1642 bd_xfer_done(bd_xfer_t *xfer, int err)
1643 {
1644 	bd_xfer_impl_t	*xi = (void *)xfer;
1645 	buf_t		*bp = xi->i_bp;
1646 	int		rv = DDI_SUCCESS;
1647 	bd_t		*bd = xi->i_bd;
1648 	size_t		len;
1649 
1650 	if (err != 0) {
1651 		bd_runq_exit(xi, err);
1652 
1653 		bp->b_resid += xi->i_resid;
1654 		bd_xfer_free(xi);
1655 		bioerror(bp, err);
1656 		biodone(bp);
1657 		return;
1658 	}
1659 
1660 	xi->i_cur_win++;
1661 	xi->i_resid -= xi->i_len;
1662 
1663 	if (xi->i_resid == 0) {
1664 		/* Job completed succcessfully! */
1665 		bd_runq_exit(xi, 0);
1666 
1667 		bd_xfer_free(xi);
1668 		biodone(bp);
1669 		return;
1670 	}
1671 
1672 	xi->i_blkno += xi->i_nblks;
1673 
1674 	if (bd->d_use_dma) {
1675 		/* More transfer still pending... advance to next DMA window. */
1676 		rv = ddi_dma_getwin(xi->i_dmah, xi->i_cur_win,
1677 		    &xi->i_offset, &len, &xi->i_dmac, &xi->i_ndmac);
1678 	} else {
1679 		/* Advance memory window. */
1680 		xi->i_kaddr += xi->i_len;
1681 		xi->i_offset += xi->i_len;
1682 		len = min(bp->b_bcount - xi->i_offset, bd->d_maxxfer);
1683 	}
1684 
1685 
1686 	if ((rv != DDI_SUCCESS) ||
1687 	    (P2PHASE(len, (1U << xi->i_blkshift) != 0))) {
1688 		bd_runq_exit(xi, EFAULT);
1689 
1690 		bp->b_resid += xi->i_resid;
1691 		bd_xfer_free(xi);
1692 		bioerror(bp, EFAULT);
1693 		biodone(bp);
1694 		return;
1695 	}
1696 	xi->i_len = len;
1697 	xi->i_nblks = len >> xi->i_blkshift;
1698 
1699 	/* Submit next window to hardware. */
1700 	rv = xi->i_func(bd->d_private, &xi->i_public);
1701 	if (rv != 0) {
1702 		bd_runq_exit(xi, rv);
1703 
1704 		bp->b_resid += xi->i_resid;
1705 		bd_xfer_free(xi);
1706 		bioerror(bp, rv);
1707 		biodone(bp);
1708 	}
1709 }
1710 
1711 void
1712 bd_state_change(bd_handle_t hdl)
1713 {
1714 	bd_t		*bd;
1715 
1716 	if ((bd = hdl->h_bd) != NULL) {
1717 		bd_update_state(bd);
1718 	}
1719 }
1720 
1721 void
1722 bd_mod_init(struct dev_ops *devops)
1723 {
1724 	static struct bus_ops bd_bus_ops = {
1725 		BUSO_REV,		/* busops_rev */
1726 		nullbusmap,		/* bus_map */
1727 		NULL,			/* bus_get_intrspec (OBSOLETE) */
1728 		NULL,			/* bus_add_intrspec (OBSOLETE) */
1729 		NULL,			/* bus_remove_intrspec (OBSOLETE) */
1730 		i_ddi_map_fault,	/* bus_map_fault */
1731 		NULL,			/* bus_dma_map (OBSOLETE) */
1732 		ddi_dma_allochdl,	/* bus_dma_allochdl */
1733 		ddi_dma_freehdl,	/* bus_dma_freehdl */
1734 		ddi_dma_bindhdl,	/* bus_dma_bindhdl */
1735 		ddi_dma_unbindhdl,	/* bus_dma_unbindhdl */
1736 		ddi_dma_flush,		/* bus_dma_flush */
1737 		ddi_dma_win,		/* bus_dma_win */
1738 		ddi_dma_mctl,		/* bus_dma_ctl */
1739 		bd_bus_ctl,		/* bus_ctl */
1740 		ddi_bus_prop_op,	/* bus_prop_op */
1741 		NULL,			/* bus_get_eventcookie */
1742 		NULL,			/* bus_add_eventcall */
1743 		NULL,			/* bus_remove_eventcall */
1744 		NULL,			/* bus_post_event */
1745 		NULL,			/* bus_intr_ctl (OBSOLETE) */
1746 		NULL,			/* bus_config */
1747 		NULL,			/* bus_unconfig */
1748 		NULL,			/* bus_fm_init */
1749 		NULL,			/* bus_fm_fini */
1750 		NULL,			/* bus_fm_access_enter */
1751 		NULL,			/* bus_fm_access_exit */
1752 		NULL,			/* bus_power */
1753 		NULL,			/* bus_intr_op */
1754 	};
1755 
1756 	devops->devo_bus_ops = &bd_bus_ops;
1757 
1758 	/*
1759 	 * NB: The device driver is free to supply its own
1760 	 * character entry device support.
1761 	 */
1762 }
1763 
1764 void
1765 bd_mod_fini(struct dev_ops *devops)
1766 {
1767 	devops->devo_bus_ops = NULL;
1768 }
1769