xref: /titanic_50/usr/src/uts/common/io/blkdev/blkdev.c (revision a69cdccdf9a647a09c204a49f998caff672138e5)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved.
23  */
24 
25 #include <sys/types.h>
26 #include <sys/ksynch.h>
27 #include <sys/kmem.h>
28 #include <sys/file.h>
29 #include <sys/errno.h>
30 #include <sys/open.h>
31 #include <sys/buf.h>
32 #include <sys/uio.h>
33 #include <sys/aio_req.h>
34 #include <sys/cred.h>
35 #include <sys/modctl.h>
36 #include <sys/cmlb.h>
37 #include <sys/conf.h>
38 #include <sys/devops.h>
39 #include <sys/list.h>
40 #include <sys/sysmacros.h>
41 #include <sys/dkio.h>
42 #include <sys/vtoc.h>
43 #include <sys/scsi/scsi.h>	/* for DTYPE_DIRECT */
44 #include <sys/kstat.h>
45 #include <sys/fs/dv_node.h>
46 #include <sys/ddi.h>
47 #include <sys/sunddi.h>
48 #include <sys/note.h>
49 #include <sys/blkdev.h>
50 
51 #define	BD_MAXPART	64
52 #define	BDINST(dev)	(getminor(dev) / BD_MAXPART)
53 #define	BDPART(dev)	(getminor(dev) % BD_MAXPART)
54 
55 typedef struct bd bd_t;
56 typedef struct bd_xfer_impl bd_xfer_impl_t;
57 
58 struct bd {
59 	void		*d_private;
60 	dev_info_t	*d_dip;
61 	kmutex_t	d_ocmutex;
62 	kmutex_t	d_iomutex;
63 	kmutex_t	d_statemutex;
64 	kcondvar_t	d_statecv;
65 	enum dkio_state	d_state;
66 	cmlb_handle_t	d_cmlbh;
67 	unsigned	d_open_lyr[BD_MAXPART];	/* open count */
68 	uint64_t	d_open_excl;	/* bit mask indexed by partition */
69 	uint64_t	d_open_reg[OTYPCNT];		/* bit mask */
70 
71 	uint32_t	d_qsize;
72 	uint32_t	d_qactive;
73 	uint32_t	d_maxxfer;
74 	uint32_t	d_blkshift;
75 	uint64_t	d_numblks;
76 	ddi_devid_t	d_devid;
77 
78 	kmem_cache_t	*d_cache;
79 	list_t		d_runq;
80 	list_t		d_waitq;
81 	kstat_t		*d_ksp;
82 	kstat_io_t	*d_kiop;
83 
84 	boolean_t	d_rdonly;
85 	boolean_t	d_removable;
86 	boolean_t	d_hotpluggable;
87 	boolean_t	d_use_dma;
88 
89 	ddi_dma_attr_t	d_dma;
90 	bd_ops_t	d_ops;
91 	bd_handle_t	d_handle;
92 };
93 
94 struct bd_handle {
95 	bd_ops_t	h_ops;
96 	ddi_dma_attr_t	*h_dma;
97 	dev_info_t	*h_parent;
98 	dev_info_t	*h_child;
99 	void		*h_private;
100 	bd_t		*h_bd;
101 	char		*h_name;
102 	char		h_addr[20];	/* enough for %X,%X */
103 };
104 
105 struct bd_xfer_impl {
106 	bd_xfer_t	i_public;
107 	list_node_t	i_linkage;
108 	bd_t		*i_bd;
109 	buf_t		*i_bp;
110 	uint_t		i_num_win;
111 	uint_t		i_cur_win;
112 	off_t		i_offset;
113 	int		(*i_func)(void *, bd_xfer_t *);
114 	uint32_t	i_blkshift;
115 	size_t		i_len;
116 	size_t		i_resid;
117 };
118 
119 #define	i_dmah		i_public.x_dmah
120 #define	i_dmac		i_public.x_dmac
121 #define	i_ndmac		i_public.x_ndmac
122 #define	i_kaddr		i_public.x_kaddr
123 #define	i_nblks		i_public.x_nblks
124 #define	i_blkno		i_public.x_blkno
125 
126 
127 /*
128  * Private prototypes.
129  */
130 
131 static int bd_getinfo(dev_info_t *, ddi_info_cmd_t, void *, void **);
132 static int bd_attach(dev_info_t *, ddi_attach_cmd_t);
133 static int bd_detach(dev_info_t *, ddi_detach_cmd_t);
134 
135 static int bd_open(dev_t *, int, int, cred_t *);
136 static int bd_close(dev_t, int, int, cred_t *);
137 static int bd_strategy(struct buf *);
138 static int bd_ioctl(dev_t, int, intptr_t, int, cred_t *, int *);
139 static int bd_read(dev_t, struct uio *, cred_t *);
140 static int bd_write(dev_t, struct uio *, cred_t *);
141 static int bd_aread(dev_t, struct aio_req *, cred_t *);
142 static int bd_awrite(dev_t, struct aio_req *, cred_t *);
143 static int bd_prop_op(dev_t, dev_info_t *, ddi_prop_op_t, int, char *,
144     caddr_t, int *);
145 
146 static int bd_tg_rdwr(dev_info_t *, uchar_t, void *, diskaddr_t, size_t,
147     void *);
148 static int bd_tg_getinfo(dev_info_t *, int, void *, void *);
149 static int bd_xfer_ctor(void *, void *, int);
150 static void bd_xfer_dtor(void *, void *);
151 static void bd_sched(bd_t *);
152 static void bd_submit(bd_t *, bd_xfer_impl_t *);
153 static void bd_runq_exit(bd_xfer_impl_t *, int);
154 static void bd_update_state(bd_t *);
155 static int bd_check_state(bd_t *, enum dkio_state *);
156 static int bd_flush_write_cache(bd_t *, struct dk_callback *);
157 
158 struct cmlb_tg_ops bd_tg_ops = {
159 	TG_DK_OPS_VERSION_1,
160 	bd_tg_rdwr,
161 	bd_tg_getinfo,
162 };
163 
164 static struct cb_ops bd_cb_ops = {
165 	bd_open, 		/* open */
166 	bd_close, 		/* close */
167 	bd_strategy, 		/* strategy */
168 	nodev, 			/* print */
169 	nodev,			/* dump */
170 	bd_read, 		/* read */
171 	bd_write, 		/* write */
172 	bd_ioctl, 		/* ioctl */
173 	nodev, 			/* devmap */
174 	nodev, 			/* mmap */
175 	nodev, 			/* segmap */
176 	nochpoll, 		/* poll */
177 	bd_prop_op, 		/* cb_prop_op */
178 	0, 			/* streamtab  */
179 	D_64BIT | D_MP,		/* Driver comaptibility flag */
180 	CB_REV,			/* cb_rev */
181 	bd_aread,		/* async read */
182 	bd_awrite		/* async write */
183 };
184 
185 struct dev_ops bd_dev_ops = {
186 	DEVO_REV, 		/* devo_rev, */
187 	0, 			/* refcnt  */
188 	bd_getinfo,		/* getinfo */
189 	nulldev, 		/* identify */
190 	nulldev, 		/* probe */
191 	bd_attach, 		/* attach */
192 	bd_detach,		/* detach */
193 	nodev, 			/* reset */
194 	&bd_cb_ops, 		/* driver operations */
195 	NULL,			/* bus operations */
196 	NULL,			/* power */
197 	ddi_quiesce_not_needed,	/* quiesce */
198 };
199 
200 static struct modldrv modldrv = {
201 	&mod_driverops,
202 	"Generic Block Device",
203 	&bd_dev_ops,
204 };
205 
206 static struct modlinkage modlinkage = {
207 	MODREV_1, { &modldrv, NULL }
208 };
209 
210 static void *bd_state;
211 static krwlock_t bd_lock;
212 
213 int
214 _init(void)
215 {
216 	int	rv;
217 
218 	rv = ddi_soft_state_init(&bd_state, sizeof (struct bd), 2);
219 	if (rv != DDI_SUCCESS) {
220 		return (rv);
221 	}
222 	rw_init(&bd_lock, NULL, RW_DRIVER, NULL);
223 	rv = mod_install(&modlinkage);
224 	if (rv != DDI_SUCCESS) {
225 		rw_destroy(&bd_lock);
226 		ddi_soft_state_fini(&bd_state);
227 	}
228 	return (rv);
229 }
230 
231 int
232 _fini(void)
233 {
234 	int	rv;
235 
236 	rv = mod_remove(&modlinkage);
237 	if (rv == DDI_SUCCESS) {
238 		rw_destroy(&bd_lock);
239 		ddi_soft_state_fini(&bd_state);
240 	}
241 	return (rv);
242 }
243 
244 int
245 _info(struct modinfo *modinfop)
246 {
247 	return (mod_info(&modlinkage, modinfop));
248 }
249 
250 static int
251 bd_getinfo(dev_info_t *dip, ddi_info_cmd_t cmd, void *arg, void **resultp)
252 {
253 	bd_t	*bd;
254 	minor_t	inst;
255 
256 	_NOTE(ARGUNUSED(dip));
257 
258 	inst = BDINST((dev_t)arg);
259 
260 	switch (cmd) {
261 	case DDI_INFO_DEVT2DEVINFO:
262 		bd = ddi_get_soft_state(bd_state, inst);
263 		if (bd == NULL) {
264 			return (DDI_FAILURE);
265 		}
266 		*resultp = (void *)bd->d_dip;
267 		break;
268 
269 	case DDI_INFO_DEVT2INSTANCE:
270 		*resultp = (void *)(intptr_t)inst;
271 		break;
272 
273 	default:
274 		return (DDI_FAILURE);
275 	}
276 	return (DDI_SUCCESS);
277 }
278 
279 static int
280 bd_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
281 {
282 	int		inst;
283 	bd_handle_t	hdl;
284 	bd_t		*bd;
285 	bd_drive_t	drive;
286 	int		rv;
287 	char		name[16];
288 	char		kcache[32];
289 
290 	switch (cmd) {
291 	case DDI_ATTACH:
292 		break;
293 	case DDI_RESUME:
294 		/* We don't do anything native for suspend/resume */
295 		return (DDI_SUCCESS);
296 	default:
297 		return (DDI_FAILURE);
298 	}
299 
300 	inst = ddi_get_instance(dip);
301 	hdl = ddi_get_parent_data(dip);
302 
303 	(void) snprintf(name, sizeof (name), "%s%d",
304 	    ddi_driver_name(dip), ddi_get_instance(dip));
305 	(void) snprintf(kcache, sizeof (kcache), "%s_xfer", name);
306 
307 	if (hdl == NULL) {
308 		cmn_err(CE_WARN, "%s: missing parent data!", name);
309 		return (DDI_FAILURE);
310 	}
311 
312 	if (ddi_soft_state_zalloc(bd_state, inst) != DDI_SUCCESS) {
313 		cmn_err(CE_WARN, "%s: unable to zalloc soft state!", name);
314 		return (DDI_FAILURE);
315 	}
316 	bd = ddi_get_soft_state(bd_state, inst);
317 
318 	if (hdl->h_dma) {
319 		bd->d_dma = *(hdl->h_dma);
320 		bd->d_dma.dma_attr_granular =
321 		    max(DEV_BSIZE, bd->d_dma.dma_attr_granular);
322 		bd->d_use_dma = B_TRUE;
323 
324 		if (bd->d_maxxfer &&
325 		    (bd->d_maxxfer != bd->d_dma.dma_attr_maxxfer)) {
326 			cmn_err(CE_WARN,
327 			    "%s: inconsistent maximum transfer size!",
328 			    name);
329 			/* We force it */
330 			bd->d_maxxfer = bd->d_dma.dma_attr_maxxfer;
331 		} else {
332 			bd->d_maxxfer = bd->d_dma.dma_attr_maxxfer;
333 		}
334 	} else {
335 		bd->d_use_dma = B_FALSE;
336 		if (bd->d_maxxfer == 0) {
337 			bd->d_maxxfer = 1024 * 1024;
338 		}
339 	}
340 	bd->d_ops = hdl->h_ops;
341 	bd->d_private = hdl->h_private;
342 	bd->d_blkshift = 9;	/* 512 bytes, to start */
343 
344 	if (bd->d_maxxfer % DEV_BSIZE) {
345 		cmn_err(CE_WARN, "%s: maximum transfer misaligned!", name);
346 		bd->d_maxxfer &= ~(DEV_BSIZE - 1);
347 	}
348 	if (bd->d_maxxfer < DEV_BSIZE) {
349 		cmn_err(CE_WARN, "%s: maximum transfer size too small!", name);
350 		ddi_soft_state_free(bd_state, inst);
351 		return (DDI_FAILURE);
352 	}
353 
354 	bd->d_dip = dip;
355 	bd->d_handle = hdl;
356 	hdl->h_bd = bd;
357 	ddi_set_driver_private(dip, bd);
358 
359 	mutex_init(&bd->d_iomutex, NULL, MUTEX_DRIVER, NULL);
360 	mutex_init(&bd->d_ocmutex, NULL, MUTEX_DRIVER, NULL);
361 	mutex_init(&bd->d_statemutex, NULL, MUTEX_DRIVER, NULL);
362 	cv_init(&bd->d_statecv, NULL, CV_DRIVER, NULL);
363 
364 	list_create(&bd->d_waitq, sizeof (bd_xfer_impl_t),
365 	    offsetof(struct bd_xfer_impl, i_linkage));
366 	list_create(&bd->d_runq, sizeof (bd_xfer_impl_t),
367 	    offsetof(struct bd_xfer_impl, i_linkage));
368 
369 	bd->d_cache = kmem_cache_create(kcache, sizeof (bd_xfer_impl_t), 8,
370 	    bd_xfer_ctor, bd_xfer_dtor, NULL, bd, NULL, 0);
371 
372 	bd->d_ksp = kstat_create(ddi_driver_name(dip), inst, NULL, "disk",
373 	    KSTAT_TYPE_IO, 1, KSTAT_FLAG_PERSISTENT);
374 	if (bd->d_ksp != NULL) {
375 		bd->d_ksp->ks_lock = &bd->d_iomutex;
376 		kstat_install(bd->d_ksp);
377 		bd->d_kiop = bd->d_ksp->ks_data;
378 	} else {
379 		/*
380 		 * Even if we cannot create the kstat, we create a
381 		 * scratch kstat.  The reason for this is to ensure
382 		 * that we can update the kstat all of the time,
383 		 * without adding an extra branch instruction.
384 		 */
385 		bd->d_kiop = kmem_zalloc(sizeof (kstat_io_t), KM_SLEEP);
386 	}
387 
388 	cmlb_alloc_handle(&bd->d_cmlbh);
389 
390 	bd->d_state = DKIO_NONE;
391 
392 	bzero(&drive, sizeof (drive));
393 	bd->d_ops.o_drive_info(bd->d_private, &drive);
394 	bd->d_qsize = drive.d_qsize;
395 	bd->d_maxxfer = drive.d_maxxfer;
396 	bd->d_removable = drive.d_removable;
397 	bd->d_hotpluggable = drive.d_hotpluggable;
398 
399 	rv = cmlb_attach(dip, &bd_tg_ops, DTYPE_DIRECT,
400 	    bd->d_removable, bd->d_hotpluggable,
401 	    drive.d_lun >= 0 ? DDI_NT_BLOCK_CHAN : DDI_NT_BLOCK,
402 	    CMLB_FAKE_LABEL_ONE_PARTITION, bd->d_cmlbh, bd);
403 	if (rv != 0) {
404 		cmlb_free_handle(&bd->d_cmlbh);
405 		kmem_cache_destroy(bd->d_cache);
406 		mutex_destroy(&bd->d_iomutex);
407 		mutex_destroy(&bd->d_ocmutex);
408 		mutex_destroy(&bd->d_statemutex);
409 		cv_destroy(&bd->d_statecv);
410 		list_destroy(&bd->d_waitq);
411 		list_destroy(&bd->d_runq);
412 		if (bd->d_ksp != NULL) {
413 			kstat_delete(bd->d_ksp);
414 			bd->d_ksp = NULL;
415 		} else {
416 			kmem_free(bd->d_kiop, sizeof (kstat_io_t));
417 		}
418 		ddi_soft_state_free(bd_state, inst);
419 		return (DDI_FAILURE);
420 	}
421 
422 	if (bd->d_ops.o_devid_init != NULL) {
423 		rv = bd->d_ops.o_devid_init(bd->d_private, dip, &bd->d_devid);
424 		if (rv == DDI_SUCCESS) {
425 			if (ddi_devid_register(dip, bd->d_devid) !=
426 			    DDI_SUCCESS) {
427 				cmn_err(CE_WARN,
428 				    "%s: unable to register devid", name);
429 			}
430 		}
431 	}
432 
433 	/*
434 	 * Add a zero-length attribute to tell the world we support
435 	 * kernel ioctls (for layered drivers).  Also set up properties
436 	 * used by HAL to identify removable media.
437 	 */
438 	(void) ddi_prop_create(DDI_DEV_T_NONE, dip, DDI_PROP_CANSLEEP,
439 	    DDI_KERNEL_IOCTL, NULL, 0);
440 	if (bd->d_removable) {
441 		(void) ddi_prop_create(DDI_DEV_T_NONE, dip, DDI_PROP_CANSLEEP,
442 		    "removable-media", NULL, 0);
443 	}
444 	if (bd->d_hotpluggable) {
445 		(void) ddi_prop_create(DDI_DEV_T_NONE, dip, DDI_PROP_CANSLEEP,
446 		    "hotpluggable", NULL, 0);
447 	}
448 
449 	ddi_report_dev(dip);
450 
451 	return (DDI_SUCCESS);
452 }
453 
454 static int
455 bd_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
456 {
457 	bd_t	*bd;
458 
459 	bd = ddi_get_driver_private(dip);
460 
461 	switch (cmd) {
462 	case DDI_DETACH:
463 		break;
464 	case DDI_SUSPEND:
465 		/* We don't suspend, but our parent does */
466 		return (DDI_SUCCESS);
467 	default:
468 		return (DDI_FAILURE);
469 	}
470 	if (bd->d_ksp != NULL) {
471 		kstat_delete(bd->d_ksp);
472 		bd->d_ksp = NULL;
473 	} else {
474 		kmem_free(bd->d_kiop, sizeof (kstat_io_t));
475 	}
476 	cmlb_detach(bd->d_cmlbh, bd);
477 	cmlb_free_handle(&bd->d_cmlbh);
478 	if (bd->d_devid)
479 		ddi_devid_free(bd->d_devid);
480 	kmem_cache_destroy(bd->d_cache);
481 	mutex_destroy(&bd->d_iomutex);
482 	mutex_destroy(&bd->d_ocmutex);
483 	mutex_destroy(&bd->d_statemutex);
484 	cv_destroy(&bd->d_statecv);
485 	list_destroy(&bd->d_waitq);
486 	list_destroy(&bd->d_runq);
487 	ddi_soft_state_free(bd_state, ddi_get_instance(dip));
488 	return (DDI_SUCCESS);
489 }
490 
491 static int
492 bd_xfer_ctor(void *buf, void *arg, int kmflag)
493 {
494 	bd_xfer_impl_t	*xi;
495 	bd_t		*bd = arg;
496 	int		(*dcb)(caddr_t);
497 
498 	if (kmflag == KM_SLEEP) {
499 		dcb = DDI_DMA_SLEEP;
500 	} else {
501 		dcb = DDI_DMA_DONTWAIT;
502 	}
503 
504 	xi = buf;
505 	bzero(xi, sizeof (*xi));
506 	xi->i_bd = bd;
507 
508 	if (bd->d_use_dma) {
509 		if (ddi_dma_alloc_handle(bd->d_dip, &bd->d_dma, dcb, NULL,
510 		    &xi->i_dmah) != DDI_SUCCESS) {
511 			return (-1);
512 		}
513 	}
514 
515 	return (0);
516 }
517 
518 static void
519 bd_xfer_dtor(void *buf, void *arg)
520 {
521 	bd_xfer_impl_t	*xi = buf;
522 
523 	_NOTE(ARGUNUSED(arg));
524 
525 	if (xi->i_dmah)
526 		ddi_dma_free_handle(&xi->i_dmah);
527 	xi->i_dmah = NULL;
528 }
529 
530 static bd_xfer_impl_t *
531 bd_xfer_alloc(bd_t *bd, struct buf *bp, int (*func)(void *, bd_xfer_t *),
532     int kmflag)
533 {
534 	bd_xfer_impl_t		*xi;
535 	int			rv;
536 	int			status;
537 	unsigned		dir;
538 	int			(*cb)(caddr_t);
539 	size_t			len;
540 	uint32_t		shift;
541 
542 	if (kmflag == KM_SLEEP) {
543 		cb = DDI_DMA_SLEEP;
544 	} else {
545 		cb = DDI_DMA_DONTWAIT;
546 	}
547 
548 	xi = kmem_cache_alloc(bd->d_cache, kmflag);
549 	if (xi == NULL) {
550 		bioerror(bp, ENOMEM);
551 		return (NULL);
552 	}
553 
554 	ASSERT(bp);
555 	ASSERT(bp->b_bcount);
556 
557 	xi->i_bp = bp;
558 	xi->i_func = func;
559 	xi->i_blkno = bp->b_lblkno;
560 
561 	if (bp->b_bcount == 0) {
562 		xi->i_len = 0;
563 		xi->i_nblks = 0;
564 		xi->i_kaddr = NULL;
565 		xi->i_resid = 0;
566 		xi->i_num_win = 0;
567 		goto done;
568 	}
569 
570 	if (bp->b_flags & B_READ) {
571 		dir = DDI_DMA_READ;
572 		xi->i_func = bd->d_ops.o_read;
573 	} else {
574 		dir = DDI_DMA_WRITE;
575 		xi->i_func = bd->d_ops.o_write;
576 	}
577 
578 	shift = bd->d_blkshift;
579 	xi->i_blkshift = shift;
580 
581 	if (!bd->d_use_dma) {
582 		bp_mapin(bp);
583 		rv = 0;
584 		xi->i_offset = 0;
585 		xi->i_num_win =
586 		    (bp->b_bcount + (bd->d_maxxfer - 1)) / bd->d_maxxfer;
587 		xi->i_cur_win = 0;
588 		xi->i_len = min(bp->b_bcount, bd->d_maxxfer);
589 		xi->i_nblks = xi->i_len >> shift;
590 		xi->i_kaddr = bp->b_un.b_addr;
591 		xi->i_resid = bp->b_bcount;
592 	} else {
593 
594 		/*
595 		 * We have to use consistent DMA if the address is misaligned.
596 		 */
597 		if (((bp->b_flags & (B_PAGEIO | B_REMAPPED)) != B_PAGEIO) &&
598 		    ((uintptr_t)bp->b_un.b_addr & 0x7)) {
599 			dir |= DDI_DMA_CONSISTENT | DDI_DMA_PARTIAL;
600 		} else {
601 			dir |= DDI_DMA_STREAMING | DDI_DMA_PARTIAL;
602 		}
603 
604 		status = ddi_dma_buf_bind_handle(xi->i_dmah, bp, dir, cb,
605 		    NULL, &xi->i_dmac, &xi->i_ndmac);
606 		switch (status) {
607 		case DDI_DMA_MAPPED:
608 			xi->i_num_win = 1;
609 			xi->i_cur_win = 0;
610 			xi->i_offset = 0;
611 			xi->i_len = bp->b_bcount;
612 			xi->i_nblks = xi->i_len >> shift;
613 			xi->i_resid = bp->b_bcount;
614 			rv = 0;
615 			break;
616 		case DDI_DMA_PARTIAL_MAP:
617 			xi->i_cur_win = 0;
618 
619 			if ((ddi_dma_numwin(xi->i_dmah, &xi->i_num_win) !=
620 			    DDI_SUCCESS) ||
621 			    (ddi_dma_getwin(xi->i_dmah, 0, &xi->i_offset,
622 			    &len, &xi->i_dmac, &xi->i_ndmac) !=
623 			    DDI_SUCCESS) ||
624 			    (P2PHASE(len, shift) != 0)) {
625 				(void) ddi_dma_unbind_handle(xi->i_dmah);
626 				rv = EFAULT;
627 				goto done;
628 			}
629 			xi->i_len = len;
630 			xi->i_nblks = xi->i_len >> shift;
631 			xi->i_resid = bp->b_bcount;
632 			rv = 0;
633 			break;
634 		case DDI_DMA_NORESOURCES:
635 			rv = EAGAIN;
636 			goto done;
637 		case DDI_DMA_TOOBIG:
638 			rv = EINVAL;
639 			goto done;
640 		case DDI_DMA_NOMAPPING:
641 		case DDI_DMA_INUSE:
642 		default:
643 			rv = EFAULT;
644 			goto done;
645 		}
646 	}
647 
648 done:
649 	if (rv != 0) {
650 		kmem_cache_free(bd->d_cache, xi);
651 		bioerror(bp, rv);
652 		return (NULL);
653 	}
654 
655 	return (xi);
656 }
657 
658 static void
659 bd_xfer_free(bd_xfer_impl_t *xi)
660 {
661 	if (xi->i_dmah) {
662 		(void) ddi_dma_unbind_handle(xi->i_dmah);
663 	}
664 	kmem_cache_free(xi->i_bd->d_cache, xi);
665 }
666 
667 static int
668 bd_open(dev_t *devp, int flag, int otyp, cred_t *credp)
669 {
670 	dev_t		dev = *devp;
671 	bd_t		*bd;
672 	minor_t		part;
673 	minor_t		inst;
674 	uint64_t	mask;
675 	boolean_t	ndelay;
676 	int		rv;
677 	diskaddr_t	nblks;
678 	diskaddr_t	lba;
679 
680 	_NOTE(ARGUNUSED(credp));
681 
682 	part = BDPART(dev);
683 	inst = BDINST(dev);
684 
685 	if (otyp >= OTYPCNT)
686 		return (EINVAL);
687 
688 	ndelay = (flag & (FNDELAY | FNONBLOCK)) ? B_TRUE : B_FALSE;
689 
690 	/*
691 	 * Block any DR events from changing the set of registered
692 	 * devices while we function.
693 	 */
694 	rw_enter(&bd_lock, RW_READER);
695 	if ((bd = ddi_get_soft_state(bd_state, inst)) == NULL) {
696 		rw_exit(&bd_lock);
697 		return (ENXIO);
698 	}
699 
700 	mutex_enter(&bd->d_ocmutex);
701 
702 	ASSERT(part < 64);
703 	mask = (1U << part);
704 
705 	bd_update_state(bd);
706 
707 	if (cmlb_validate(bd->d_cmlbh, 0, bd) != 0) {
708 
709 		/* non-blocking opens are allowed to succeed */
710 		if (!ndelay) {
711 			rv = ENXIO;
712 			goto done;
713 		}
714 	} else if (cmlb_partinfo(bd->d_cmlbh, part, &nblks, &lba,
715 	    NULL, NULL, bd) == 0) {
716 
717 		/*
718 		 * We read the partinfo, verify valid ranges.  If the
719 		 * partition is invalid, and we aren't blocking or
720 		 * doing a raw access, then fail. (Non-blocking and
721 		 * raw accesses can still succeed to allow a disk with
722 		 * bad partition data to opened by format and fdisk.)
723 		 */
724 		if ((!nblks) && ((!ndelay) || (otyp != OTYP_CHR))) {
725 			rv = ENXIO;
726 			goto done;
727 		}
728 	} else if (!ndelay) {
729 		/*
730 		 * cmlb_partinfo failed -- invalid partition or no
731 		 * disk label.
732 		 */
733 		rv = ENXIO;
734 		goto done;
735 	}
736 
737 	if ((flag & FWRITE) && bd->d_rdonly) {
738 		rv = EROFS;
739 		goto done;
740 	}
741 
742 	if ((bd->d_open_excl) & (mask)) {
743 		rv = EBUSY;
744 		goto done;
745 	}
746 	if (flag & FEXCL) {
747 		if (bd->d_open_lyr[part]) {
748 			rv = EBUSY;
749 			goto done;
750 		}
751 		for (int i = 0; i < OTYP_LYR; i++) {
752 			if (bd->d_open_reg[i] & mask) {
753 				rv = EBUSY;
754 				goto done;
755 			}
756 		}
757 	}
758 
759 	if (otyp == OTYP_LYR) {
760 		bd->d_open_lyr[part]++;
761 	} else {
762 		bd->d_open_reg[otyp] |= mask;
763 	}
764 	if (flag & FEXCL) {
765 		bd->d_open_excl |= mask;
766 	}
767 
768 	rv = 0;
769 done:
770 	mutex_exit(&bd->d_ocmutex);
771 	rw_exit(&bd_lock);
772 
773 	return (rv);
774 }
775 
776 static int
777 bd_close(dev_t dev, int flag, int otyp, cred_t *credp)
778 {
779 	bd_t		*bd;
780 	minor_t		inst;
781 	minor_t		part;
782 	uint64_t	mask;
783 	boolean_t	last = B_TRUE;
784 
785 	_NOTE(ARGUNUSED(flag));
786 	_NOTE(ARGUNUSED(credp));
787 
788 	part = BDPART(dev);
789 	inst = BDINST(dev);
790 
791 	ASSERT(part < 64);
792 	mask = (1U << part);
793 
794 	rw_enter(&bd_lock, RW_READER);
795 
796 	if ((bd = ddi_get_soft_state(bd_state, inst)) == NULL) {
797 		rw_exit(&bd_lock);
798 		return (ENXIO);
799 	}
800 
801 	mutex_enter(&bd->d_ocmutex);
802 	if (bd->d_open_excl & mask) {
803 		bd->d_open_excl &= ~mask;
804 	}
805 	if (otyp == OTYP_LYR) {
806 		bd->d_open_lyr[part]--;
807 	} else {
808 		bd->d_open_reg[otyp] &= ~mask;
809 	}
810 	for (int i = 0; i < 64; i++) {
811 		if (bd->d_open_lyr[part]) {
812 			last = B_FALSE;
813 		}
814 	}
815 	for (int i = 0; last && (i < OTYP_LYR); i++) {
816 		if (bd->d_open_reg[i]) {
817 			last = B_FALSE;
818 		}
819 	}
820 	mutex_exit(&bd->d_ocmutex);
821 
822 	if (last) {
823 		cmlb_invalidate(bd->d_cmlbh, bd);
824 	}
825 	rw_exit(&bd_lock);
826 
827 	return (0);
828 }
829 
830 static int
831 bd_read(dev_t dev, struct uio *uio, cred_t *credp)
832 {
833 	_NOTE(ARGUNUSED(credp));
834 	return (physio(bd_strategy, NULL, dev, B_READ, minphys, uio));
835 }
836 
837 static int
838 bd_write(dev_t dev, struct uio *uio, cred_t *credp)
839 {
840 	_NOTE(ARGUNUSED(credp));
841 	return (physio(bd_strategy, NULL, dev, B_WRITE, minphys, uio));
842 }
843 
844 static int
845 bd_aread(dev_t dev, struct aio_req *aio, cred_t *credp)
846 {
847 	_NOTE(ARGUNUSED(credp));
848 	return (aphysio(bd_strategy, anocancel, dev, B_READ, minphys, aio));
849 }
850 
851 static int
852 bd_awrite(dev_t dev, struct aio_req *aio, cred_t *credp)
853 {
854 	_NOTE(ARGUNUSED(credp));
855 	return (aphysio(bd_strategy, anocancel, dev, B_WRITE, minphys, aio));
856 }
857 
858 static int
859 bd_strategy(struct buf *bp)
860 {
861 	minor_t		inst;
862 	minor_t		part;
863 	bd_t		*bd;
864 	diskaddr_t	p_lba;
865 	diskaddr_t	p_nblks;
866 	diskaddr_t	b_nblks;
867 	bd_xfer_impl_t	*xi;
868 	uint32_t	shift;
869 	int		(*func)(void *, bd_xfer_t *);
870 
871 	part = BDPART(bp->b_edev);
872 	inst = BDINST(bp->b_edev);
873 
874 	ASSERT(bp);
875 
876 	bp->b_resid = bp->b_bcount;
877 
878 	if ((bd = ddi_get_soft_state(bd_state, inst)) == NULL) {
879 		bioerror(bp, ENXIO);
880 		biodone(bp);
881 		return (0);
882 	}
883 
884 	if (cmlb_partinfo(bd->d_cmlbh, part, &p_nblks, &p_lba,
885 	    NULL, NULL, bd)) {
886 		bioerror(bp, ENXIO);
887 		biodone(bp);
888 		return (0);
889 	}
890 
891 	shift = bd->d_blkshift;
892 
893 	if ((P2PHASE(bp->b_bcount, (1U << shift)) != 0) ||
894 	    (bp->b_lblkno > p_nblks)) {
895 		bioerror(bp, ENXIO);
896 		biodone(bp);
897 		return (0);
898 	}
899 	b_nblks = bp->b_bcount >> shift;
900 	if ((bp->b_lblkno == p_nblks) || (bp->b_bcount == 0)) {
901 		biodone(bp);
902 		return (0);
903 	}
904 
905 	if ((b_nblks + bp->b_lblkno) > p_nblks) {
906 		bp->b_resid = ((bp->b_lblkno + b_nblks - p_nblks) << shift);
907 		bp->b_bcount -= bp->b_resid;
908 	} else {
909 		bp->b_resid = 0;
910 	}
911 	func = (bp->b_flags & B_READ) ? bd->d_ops.o_read : bd->d_ops.o_write;
912 
913 	xi = bd_xfer_alloc(bd, bp, func, KM_NOSLEEP);
914 	if (xi == NULL) {
915 		xi = bd_xfer_alloc(bd, bp, func, KM_PUSHPAGE);
916 	}
917 	if (xi == NULL) {
918 		/* bd_request_alloc will have done bioerror */
919 		biodone(bp);
920 		return (0);
921 	}
922 	xi->i_blkno = bp->b_lblkno + p_lba;
923 
924 	bd_submit(bd, xi);
925 
926 	return (0);
927 }
928 
929 static int
930 bd_ioctl(dev_t dev, int cmd, intptr_t arg, int flag, cred_t *credp, int *rvalp)
931 {
932 	minor_t		inst;
933 	uint16_t	part;
934 	bd_t		*bd;
935 	void		*ptr = (void *)arg;
936 	int		rv;
937 
938 	part = BDPART(dev);
939 	inst = BDINST(dev);
940 
941 	if ((bd = ddi_get_soft_state(bd_state, inst)) == NULL) {
942 		return (ENXIO);
943 	}
944 
945 	rv = cmlb_ioctl(bd->d_cmlbh, dev, cmd, arg, flag, credp, rvalp, bd);
946 	if (rv != ENOTTY)
947 		return (rv);
948 
949 	switch (cmd) {
950 	case DKIOCGMEDIAINFO: {
951 		struct dk_minfo minfo;
952 
953 		/* make sure our state information is current */
954 		bd_update_state(bd);
955 		bzero(&minfo, sizeof (minfo));
956 		minfo.dki_media_type = DK_FIXED_DISK;
957 		minfo.dki_lbsize = (1U << bd->d_blkshift);
958 		minfo.dki_capacity = bd->d_numblks;
959 		if (ddi_copyout(&minfo, ptr, sizeof (minfo), flag))  {
960 			return (EFAULT);
961 		}
962 		return (0);
963 	}
964 	case DKIOCINFO: {
965 		struct dk_cinfo cinfo;
966 		bzero(&cinfo, sizeof (cinfo));
967 		cinfo.dki_ctype = DKC_BLKDEV;
968 		cinfo.dki_cnum = ddi_get_instance(ddi_get_parent(bd->d_dip));
969 		(void) snprintf(cinfo.dki_cname, sizeof (cinfo.dki_cname),
970 		    "%s", ddi_driver_name(ddi_get_parent(bd->d_dip)));
971 		(void) snprintf(cinfo.dki_dname, sizeof (cinfo.dki_dname),
972 		    "%s", ddi_driver_name(bd->d_dip));
973 		cinfo.dki_unit = inst;
974 		cinfo.dki_flags = DKI_FMTVOL;
975 		cinfo.dki_partition = part;
976 		cinfo.dki_maxtransfer = bd->d_maxxfer / DEV_BSIZE;
977 		cinfo.dki_addr = 0;
978 		cinfo.dki_slave = 0;
979 		cinfo.dki_space = 0;
980 		cinfo.dki_prio = 0;
981 		cinfo.dki_vec = 0;
982 		if (ddi_copyout(&cinfo, ptr, sizeof (cinfo), flag))  {
983 			return (EFAULT);
984 		}
985 		return (0);
986 	}
987 	case DKIOCREMOVABLE: {
988 		int i;
989 		i = bd->d_removable ? 1 : 0;
990 		if (ddi_copyout(&i, ptr, sizeof (i), flag)) {
991 			return (EFAULT);
992 		}
993 		return (0);
994 	}
995 	case DKIOCHOTPLUGGABLE: {
996 		int i;
997 		i = bd->d_hotpluggable ? 1 : 0;
998 		if (ddi_copyout(&i, ptr, sizeof (i), flag)) {
999 			return (EFAULT);
1000 		}
1001 		return (0);
1002 	}
1003 	case DKIOCREADONLY: {
1004 		int i;
1005 		i = bd->d_rdonly ? 1 : 0;
1006 		if (ddi_copyout(&i, ptr, sizeof (i), flag)) {
1007 			return (EFAULT);
1008 		}
1009 		return (0);
1010 	}
1011 	case DKIOCSTATE: {
1012 		enum dkio_state	state;
1013 		if (ddi_copyin(ptr, &state, sizeof (state), flag)) {
1014 			return (EFAULT);
1015 		}
1016 		if ((rv = bd_check_state(bd, &state)) != 0) {
1017 			return (rv);
1018 		}
1019 		if (ddi_copyout(&state, ptr, sizeof (state), flag)) {
1020 			return (EFAULT);
1021 		}
1022 		return (0);
1023 	}
1024 	case DKIOCFLUSHWRITECACHE: {
1025 		struct dk_callback *dkc;
1026 
1027 		dkc = flag & FKIOCTL ? (void *)arg : NULL;
1028 		rv = bd_flush_write_cache(bd, dkc);
1029 		return (rv);
1030 	}
1031 
1032 	default:
1033 		break;
1034 
1035 	}
1036 	return (ENOTTY);
1037 }
1038 
1039 static int
1040 bd_prop_op(dev_t dev, dev_info_t *dip, ddi_prop_op_t prop_op, int mod_flags,
1041     char *name, caddr_t valuep, int *lengthp)
1042 {
1043 	bd_t	*bd;
1044 
1045 	bd = ddi_get_soft_state(bd_state, ddi_get_instance(dip));
1046 	if (bd == NULL)
1047 		return (ddi_prop_op(dev, dip, prop_op, mod_flags,
1048 		    name, valuep, lengthp));
1049 
1050 	return (cmlb_prop_op(bd->d_cmlbh, dev, dip, prop_op, mod_flags, name,
1051 	    valuep, lengthp, BDPART(dev), bd));
1052 }
1053 
1054 
1055 static int
1056 bd_tg_rdwr(dev_info_t *dip, uchar_t cmd, void *bufaddr, diskaddr_t start,
1057     size_t length, void *tg_cookie)
1058 {
1059 	bd_t		*bd;
1060 	buf_t		*bp;
1061 	bd_xfer_impl_t	*xi;
1062 	int		rv;
1063 	int		(*func)(void *, bd_xfer_t *);
1064 
1065 	_NOTE(ARGUNUSED(dip));
1066 
1067 
1068 	bd = tg_cookie;
1069 	if (P2PHASE(length, (1U << bd->d_blkshift)) != 0) {
1070 		/* We can only transfer whole blocks at a time! */
1071 		return (EINVAL);
1072 	}
1073 
1074 	bp = getrbuf(KM_SLEEP);
1075 
1076 	switch (cmd) {
1077 	case TG_READ:
1078 		bp->b_flags = B_READ;
1079 		func = bd->d_ops.o_read;
1080 		break;
1081 	case TG_WRITE:
1082 		bp->b_flags = B_WRITE;
1083 		func = bd->d_ops.o_write;
1084 		break;
1085 	default:
1086 		freerbuf(bp);
1087 		return (EINVAL);
1088 	}
1089 
1090 	bp->b_un.b_addr = bufaddr;
1091 	bp->b_bcount = length;
1092 	xi = bd_xfer_alloc(bd, bp, func, KM_SLEEP);
1093 	if (xi == NULL) {
1094 		rv = geterror(bp);
1095 		freerbuf(bp);
1096 		return (rv);
1097 	}
1098 
1099 	xi->i_blkno = start;
1100 	bd_submit(bd, xi);
1101 	(void) biowait(bp);
1102 	rv = geterror(bp);
1103 	freerbuf(bp);
1104 
1105 	return (rv);
1106 }
1107 
1108 static int
1109 bd_tg_getinfo(dev_info_t *dip, int cmd, void *arg, void *tg_cookie)
1110 {
1111 	bd_t		*bd;
1112 
1113 	_NOTE(ARGUNUSED(dip));
1114 	bd = tg_cookie;
1115 
1116 	switch (cmd) {
1117 	case TG_GETPHYGEOM:
1118 	case TG_GETVIRTGEOM:
1119 		/*
1120 		 * We don't have any "geometry" as such, let cmlb
1121 		 * fabricate something.
1122 		 */
1123 		return (ENOTTY);
1124 
1125 	case TG_GETCAPACITY:
1126 		bd_update_state(bd);
1127 		*(diskaddr_t *)arg = bd->d_numblks;
1128 		return (0);
1129 
1130 	case TG_GETBLOCKSIZE:
1131 		*(uint32_t *)arg = (1U << bd->d_blkshift);
1132 		return (0);
1133 
1134 	case TG_GETATTR:
1135 		/*
1136 		 * It turns out that cmlb really doesn't do much for
1137 		 * non-writable media, but lets make the information
1138 		 * available for it in case it does more in the
1139 		 * future.  (The value is currently used for
1140 		 * triggering special behavior for CD-ROMs.)
1141 		 */
1142 		bd_update_state(bd);
1143 		((tg_attribute_t *)arg)->media_is_writable =
1144 		    bd->d_rdonly ? B_FALSE : B_TRUE;
1145 		return (0);
1146 
1147 	default:
1148 		return (EINVAL);
1149 	}
1150 }
1151 
1152 
1153 static void
1154 bd_sched(bd_t *bd)
1155 {
1156 	bd_xfer_impl_t	*xi;
1157 	struct buf	*bp;
1158 	int		rv;
1159 
1160 	ASSERT(mutex_owned(&bd->d_iomutex));
1161 
1162 	while ((bd->d_qactive < bd->d_qsize) &&
1163 	    ((xi = list_remove_head(&bd->d_waitq)) != NULL)) {
1164 		bd->d_qactive++;
1165 		kstat_waitq_to_runq(bd->d_kiop);
1166 		list_insert_tail(&bd->d_runq, xi);
1167 
1168 		/* Submit the job to driver */
1169 		rv = xi->i_func(bd->d_private, &xi->i_public);
1170 		if (rv != 0) {
1171 			bd->d_qactive--;
1172 			kstat_runq_exit(bd->d_kiop);
1173 			list_remove(&bd->d_runq, xi);
1174 
1175 			mutex_exit(&bd->d_iomutex);
1176 			bp = xi->i_bp;
1177 			bd_xfer_free(xi);
1178 			bioerror(bp, rv);
1179 			biodone(bp);
1180 			mutex_enter(&bd->d_iomutex);
1181 		}
1182 	}
1183 }
1184 
1185 static void
1186 bd_submit(bd_t *bd, bd_xfer_impl_t *xi)
1187 {
1188 	mutex_enter(&bd->d_iomutex);
1189 	list_insert_tail(&bd->d_waitq, xi);
1190 	kstat_waitq_enter(bd->d_kiop);
1191 	bd_sched(bd);
1192 	mutex_exit(&bd->d_iomutex);
1193 }
1194 
1195 static void
1196 bd_runq_exit(bd_xfer_impl_t *xi, int err)
1197 {
1198 	bd_t	*bd = xi->i_bd;
1199 	buf_t	*bp = xi->i_bp;
1200 
1201 	ASSERT(mutex_owned(&bd->d_iomutex));
1202 
1203 	bd->d_qactive--;
1204 	kstat_runq_exit(bd->d_kiop);
1205 	if (err == 0) {
1206 		if (bp->b_flags & B_READ) {
1207 			bd->d_kiop->reads++;
1208 			bd->d_kiop->nread += (bp->b_bcount - xi->i_resid);
1209 		} else {
1210 			bd->d_kiop->writes++;
1211 			bd->d_kiop->nwritten += (bp->b_bcount - xi->i_resid);
1212 		}
1213 	}
1214 	list_remove(&bd->d_runq, xi);
1215 	bd_sched(bd);
1216 }
1217 
1218 static void
1219 bd_update_state(bd_t *bd)
1220 {
1221 	enum	dkio_state	state;
1222 	bd_media_t		media;
1223 	boolean_t		docmlb = B_FALSE;
1224 
1225 	bzero(&media, sizeof (media));
1226 
1227 	mutex_enter(&bd->d_statemutex);
1228 	if (bd->d_ops.o_media_info(bd->d_private, &media) == 0) {
1229 		if ((1U << bd->d_blkshift) != media.m_blksize) {
1230 			if ((media.m_blksize < 512) ||
1231 			    (!ISP2(media.m_blksize)) ||
1232 			    (P2PHASE(bd->d_maxxfer, media.m_blksize))) {
1233 				cmn_err(CE_WARN,
1234 				    "%s%d: Invalid media block size (%d)",
1235 				    ddi_driver_name(bd->d_dip),
1236 				    ddi_get_instance(bd->d_dip),
1237 				    media.m_blksize);
1238 				/*
1239 				 * We can't use the media, treat it as
1240 				 * not present.
1241 				 */
1242 				state = DKIO_EJECTED;
1243 				bd->d_numblks = 0;
1244 			} else {
1245 				bd->d_blkshift = ddi_ffs(media.m_blksize) - 1;
1246 				bd->d_numblks = media.m_nblks;
1247 				bd->d_rdonly = media.m_readonly;
1248 				state = DKIO_INSERTED;
1249 			}
1250 
1251 			/* Device size changed */
1252 			docmlb = B_TRUE;
1253 
1254 		} else {
1255 			if (bd->d_numblks != media.m_nblks) {
1256 				/* Device size changed */
1257 				docmlb = B_TRUE;
1258 			}
1259 			bd->d_numblks = media.m_nblks;
1260 			bd->d_rdonly = media.m_readonly;
1261 			state = DKIO_INSERTED;
1262 		}
1263 
1264 	} else {
1265 		bd->d_numblks = 0;
1266 		state = DKIO_EJECTED;
1267 	}
1268 	if (state != bd->d_state) {
1269 		bd->d_state = state;
1270 		cv_broadcast(&bd->d_statecv);
1271 		docmlb = B_TRUE;
1272 	}
1273 	mutex_exit(&bd->d_statemutex);
1274 
1275 	if (docmlb) {
1276 		if (state == DKIO_INSERTED) {
1277 			(void) cmlb_validate(bd->d_cmlbh, 0, bd);
1278 		} else {
1279 			cmlb_invalidate(bd->d_cmlbh, bd);
1280 		}
1281 	}
1282 }
1283 
1284 static int
1285 bd_check_state(bd_t *bd, enum dkio_state *state)
1286 {
1287 	clock_t		when;
1288 
1289 	for (;;) {
1290 
1291 		bd_update_state(bd);
1292 
1293 		mutex_enter(&bd->d_statemutex);
1294 
1295 		if (bd->d_state != *state) {
1296 			*state = bd->d_state;
1297 			mutex_exit(&bd->d_statemutex);
1298 			break;
1299 		}
1300 
1301 		when = drv_usectohz(1000000);
1302 		if (cv_reltimedwait_sig(&bd->d_statecv, &bd->d_statemutex,
1303 		    when, TR_CLOCK_TICK) == 0) {
1304 			mutex_exit(&bd->d_statemutex);
1305 			return (EINTR);
1306 		}
1307 
1308 		mutex_exit(&bd->d_statemutex);
1309 	}
1310 
1311 	return (0);
1312 }
1313 
1314 static int
1315 bd_flush_write_cache_done(struct buf *bp)
1316 {
1317 	struct dk_callback *dc = (void *)bp->b_private;
1318 
1319 	(*dc->dkc_callback)(dc->dkc_cookie, geterror(bp));
1320 	kmem_free(dc, sizeof (*dc));
1321 	freerbuf(bp);
1322 	return (0);
1323 }
1324 
1325 static int
1326 bd_flush_write_cache(bd_t *bd, struct dk_callback *dkc)
1327 {
1328 	buf_t			*bp;
1329 	struct dk_callback	*dc;
1330 	bd_xfer_impl_t		*xi;
1331 	int			rv;
1332 
1333 	if (bd->d_ops.o_sync_cache == NULL) {
1334 		return (ENOTSUP);
1335 	}
1336 	if ((bp = getrbuf(KM_SLEEP)) == NULL) {
1337 		return (ENOMEM);
1338 	}
1339 	bp->b_resid = 0;
1340 	bp->b_bcount = 0;
1341 
1342 	xi = bd_xfer_alloc(bd, bp, bd->d_ops.o_sync_cache, KM_SLEEP);
1343 	if (xi == NULL) {
1344 		rv = geterror(bp);
1345 		freerbuf(bp);
1346 		return (rv);
1347 	}
1348 
1349 	if (dkc != NULL) {
1350 		/* Make a private copy of the callback structure */
1351 		dc = kmem_alloc(sizeof (*dc), KM_SLEEP);
1352 		*dc = *dkc;
1353 		bp->b_private = dc;
1354 		bp->b_iodone = bd_flush_write_cache_done;
1355 	}
1356 
1357 	bd_submit(bd, xi);
1358 	if (dkc == NULL) {
1359 		/* wait synchronously */
1360 		(void) biowait(bp);
1361 		rv = geterror(bp);
1362 		freerbuf(bp);
1363 	} else {
1364 		/* deferred via callback */
1365 		rv = 0;
1366 	}
1367 	return (rv);
1368 }
1369 
1370 /*
1371  * Nexus support.
1372  */
1373 int
1374 bd_bus_ctl(dev_info_t *dip, dev_info_t *rdip, ddi_ctl_enum_t ctlop,
1375     void *arg, void *result)
1376 {
1377 	bd_handle_t	hdl;
1378 
1379 	switch (ctlop) {
1380 	case DDI_CTLOPS_REPORTDEV:
1381 		cmn_err(CE_CONT, "?Block device: %s@%s, %s%d\n",
1382 		    ddi_node_name(rdip), ddi_get_name_addr(rdip),
1383 		    ddi_driver_name(rdip), ddi_get_instance(rdip));
1384 		return (DDI_SUCCESS);
1385 
1386 	case DDI_CTLOPS_INITCHILD:
1387 		hdl = ddi_get_parent_data((dev_info_t *)arg);
1388 		if (hdl == NULL) {
1389 			return (DDI_NOT_WELL_FORMED);
1390 		}
1391 		ddi_set_name_addr((dev_info_t *)arg, hdl->h_addr);
1392 		return (DDI_SUCCESS);
1393 
1394 	case DDI_CTLOPS_UNINITCHILD:
1395 		ddi_set_name_addr((dev_info_t *)arg, NULL);
1396 		ndi_prop_remove_all((dev_info_t *)arg);
1397 		return (DDI_SUCCESS);
1398 
1399 	default:
1400 		return (ddi_ctlops(dip, rdip, ctlop, arg, result));
1401 	}
1402 }
1403 
1404 /*
1405  * Functions for device drivers.
1406  */
1407 bd_handle_t
1408 bd_alloc_handle(void *private, bd_ops_t *ops, ddi_dma_attr_t *dma, int kmflag)
1409 {
1410 	bd_handle_t	hdl;
1411 
1412 	hdl = kmem_zalloc(sizeof (*hdl), kmflag);
1413 	if (hdl != NULL) {
1414 		hdl->h_ops = *ops;
1415 		hdl->h_dma = dma;
1416 		hdl->h_private = private;
1417 	}
1418 
1419 	return (hdl);
1420 }
1421 
1422 void
1423 bd_free_handle(bd_handle_t hdl)
1424 {
1425 	kmem_free(hdl, sizeof (*hdl));
1426 }
1427 
1428 int
1429 bd_attach_handle(dev_info_t *dip, bd_handle_t hdl)
1430 {
1431 	dev_info_t	*child;
1432 	bd_drive_t	drive;
1433 
1434 	/* if drivers don't override this, make it assume none */
1435 	drive.d_lun = -1;
1436 	hdl->h_ops.o_drive_info(hdl->h_private, &drive);
1437 
1438 	hdl->h_parent = dip;
1439 	hdl->h_name = "blkdev";
1440 
1441 	if (drive.d_lun >= 0) {
1442 		(void) snprintf(hdl->h_addr, sizeof (hdl->h_addr), "%X,%X",
1443 		    drive.d_target, drive.d_lun);
1444 	} else {
1445 		(void) snprintf(hdl->h_addr, sizeof (hdl->h_addr), "%X",
1446 		    drive.d_target);
1447 	}
1448 	if (ndi_devi_alloc(dip, hdl->h_name, (pnode_t)DEVI_SID_NODEID,
1449 	    &child) != NDI_SUCCESS) {
1450 		cmn_err(CE_WARN, "%s%d: unable to allocate node %s@%s",
1451 		    ddi_driver_name(dip), ddi_get_instance(dip),
1452 		    "blkdev", hdl->h_addr);
1453 		return (DDI_FAILURE);
1454 	}
1455 
1456 	ddi_set_parent_data(child, hdl);
1457 	hdl->h_child = child;
1458 
1459 	if (ndi_devi_online(child, 0) == NDI_FAILURE) {
1460 		cmn_err(CE_WARN, "%s%d: failed bringing node %s@%s online",
1461 		    ddi_driver_name(dip), ddi_get_instance(dip),
1462 		    hdl->h_name, hdl->h_addr);
1463 		(void) ndi_devi_free(child);
1464 		return (DDI_FAILURE);
1465 	}
1466 
1467 	return (DDI_SUCCESS);
1468 }
1469 
1470 int
1471 bd_detach_handle(bd_handle_t hdl)
1472 {
1473 	int	circ;
1474 	int	rv;
1475 	char	*devnm;
1476 
1477 	if (hdl->h_child == NULL) {
1478 		return (DDI_SUCCESS);
1479 	}
1480 	ndi_devi_enter(hdl->h_parent, &circ);
1481 	if (i_ddi_node_state(hdl->h_child) < DS_INITIALIZED) {
1482 		rv = ddi_remove_child(hdl->h_child, 0);
1483 	} else {
1484 		devnm = kmem_alloc(MAXNAMELEN + 1, KM_SLEEP);
1485 		(void) ddi_deviname(hdl->h_child, devnm);
1486 		(void) devfs_clean(hdl->h_parent, devnm + 1, DV_CLEAN_FORCE);
1487 		rv = ndi_devi_unconfig_one(hdl->h_parent, devnm + 1, NULL,
1488 		    NDI_DEVI_REMOVE | NDI_UNCONFIG);
1489 		kmem_free(devnm, MAXNAMELEN + 1);
1490 	}
1491 	if (rv == 0) {
1492 		hdl->h_child = NULL;
1493 	}
1494 
1495 	ndi_devi_exit(hdl->h_parent, circ);
1496 	return (rv = NDI_SUCCESS ? DDI_SUCCESS : DDI_FAILURE);
1497 }
1498 
1499 void
1500 bd_xfer_done(bd_xfer_t *xfer, int err)
1501 {
1502 	bd_xfer_impl_t	*xi = (void *)xfer;
1503 	buf_t		*bp = xi->i_bp;
1504 	int		rv;
1505 	bd_t		*bd = xi->i_bd;
1506 	size_t		len;
1507 
1508 	mutex_enter(&bd->d_iomutex);
1509 	if (err != 0) {
1510 		bd_runq_exit(xi, err);
1511 		mutex_exit(&bd->d_iomutex);
1512 
1513 		bp->b_resid += xi->i_resid;
1514 		bd_xfer_free(xi);
1515 		bioerror(bp, err);
1516 		biodone(bp);
1517 		return;
1518 	}
1519 
1520 	xi->i_cur_win++;
1521 	xi->i_resid -= xi->i_len;
1522 
1523 	if (xi->i_resid == 0) {
1524 		/* Job completed succcessfully! */
1525 		bd_runq_exit(xi, 0);
1526 		mutex_exit(&bd->d_iomutex);
1527 
1528 		bd_xfer_free(xi);
1529 		biodone(bp);
1530 		return;
1531 	}
1532 
1533 	xi->i_blkno += xi->i_nblks;
1534 
1535 	if (bd->d_use_dma) {
1536 		/* More transfer still pending... advance to next DMA window. */
1537 		rv = ddi_dma_getwin(xi->i_dmah, xi->i_cur_win,
1538 		    &xi->i_offset, &len, &xi->i_dmac, &xi->i_ndmac);
1539 	} else {
1540 		/* Advance memory window. */
1541 		xi->i_kaddr += xi->i_len;
1542 		xi->i_offset += xi->i_len;
1543 		len = min(bp->b_bcount - xi->i_offset, bd->d_maxxfer);
1544 	}
1545 
1546 
1547 	if ((rv != DDI_SUCCESS) ||
1548 	    (P2PHASE(len, (1U << xi->i_blkshift) != 0))) {
1549 		bd_runq_exit(xi, EFAULT);
1550 		mutex_exit(&bd->d_iomutex);
1551 
1552 		bp->b_resid += xi->i_resid;
1553 		bd_xfer_free(xi);
1554 		bioerror(bp, EFAULT);
1555 		biodone(bp);
1556 		return;
1557 	}
1558 	xi->i_len = len;
1559 	xi->i_nblks = len >> xi->i_blkshift;
1560 
1561 	/* Submit next window to hardware. */
1562 	rv = xi->i_func(bd->d_private, &xi->i_public);
1563 	if (rv != 0) {
1564 		bd_runq_exit(xi, rv);
1565 		mutex_exit(&bd->d_iomutex);
1566 
1567 		bp->b_resid += xi->i_resid;
1568 		bd_xfer_free(xi);
1569 		bioerror(bp, rv);
1570 		biodone(bp);
1571 		return;
1572 	}
1573 
1574 	mutex_exit(&bd->d_iomutex);
1575 }
1576 
1577 void
1578 bd_state_change(bd_handle_t hdl)
1579 {
1580 	bd_t		*bd;
1581 
1582 	if ((bd = hdl->h_bd) != NULL) {
1583 		bd_update_state(bd);
1584 	}
1585 }
1586 
1587 void
1588 bd_mod_init(struct dev_ops *devops)
1589 {
1590 	static struct bus_ops bd_bus_ops = {
1591 		BUSO_REV,		/* busops_rev */
1592 		nullbusmap,		/* bus_map */
1593 		NULL,			/* bus_get_intrspec (OBSOLETE) */
1594 		NULL,			/* bus_add_intrspec (OBSOLETE) */
1595 		NULL,			/* bus_remove_intrspec (OBSOLETE) */
1596 		i_ddi_map_fault,	/* bus_map_fault */
1597 		ddi_dma_map,		/* bus_dma_map */
1598 		ddi_dma_allochdl,	/* bus_dma_allochdl */
1599 		ddi_dma_freehdl,	/* bus_dma_freehdl */
1600 		ddi_dma_bindhdl,	/* bus_dma_bindhdl */
1601 		ddi_dma_unbindhdl,	/* bus_dma_unbindhdl */
1602 		ddi_dma_flush,		/* bus_dma_flush */
1603 		ddi_dma_win,		/* bus_dma_win */
1604 		ddi_dma_mctl,		/* bus_dma_ctl */
1605 		bd_bus_ctl,		/* bus_ctl */
1606 		ddi_bus_prop_op,	/* bus_prop_op */
1607 		NULL,			/* bus_get_eventcookie */
1608 		NULL,			/* bus_add_eventcall */
1609 		NULL,			/* bus_remove_eventcall */
1610 		NULL,			/* bus_post_event */
1611 		NULL,			/* bus_intr_ctl (OBSOLETE) */
1612 		NULL,			/* bus_config */
1613 		NULL,			/* bus_unconfig */
1614 		NULL,			/* bus_fm_init */
1615 		NULL,			/* bus_fm_fini */
1616 		NULL,			/* bus_fm_access_enter */
1617 		NULL,			/* bus_fm_access_exit */
1618 		NULL,			/* bus_power */
1619 		NULL,			/* bus_intr_op */
1620 	};
1621 
1622 	devops->devo_bus_ops = &bd_bus_ops;
1623 
1624 	/*
1625 	 * NB: The device driver is free to supply its own
1626 	 * character entry device support.
1627 	 */
1628 }
1629 
1630 void
1631 bd_mod_fini(struct dev_ops *devops)
1632 {
1633 	devops->devo_bus_ops = NULL;
1634 }
1635