xref: /titanic_51/usr/src/uts/i86pc/i86hvm/io/pv_cmdk.c (revision aba6a64cda8fc853e4d61f08c163d0b9be0815b2)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 #pragma ident	"%Z%%M%	%I%	%E% SMI"
27 
28 #include <sys/scsi/scsi_types.h>
29 #include <sys/modctl.h>
30 #include <sys/cmlb.h>
31 #include <sys/types.h>
32 #include <sys/xpv_support.h>
33 #include <sys/xendev.h>
34 #include <sys/gnttab.h>
35 #include <public/xen.h>
36 #include <public/grant_table.h>
37 #include <io/xdf.h>
38 #include <sys/vtoc.h>
39 #include <sys/dkio.h>
40 #include <sys/dktp/dadev.h>
41 #include <sys/dktp/dadkio.h>
42 #include <sys/dktp/tgdk.h>
43 #include <sys/dktp/bbh.h>
44 #include <sys/dktp/cmdk.h>
45 #include <sys/dktp/altsctr.h>
46 
47 /*
48  * General Notes
49  *
50  * We don't support disks with bad block mappins.  We have this
51  * limitation because the underlying xdf driver doesn't support
52  * bad block remapping.  If there is a need to support this feature
53  * it should be added directly to the xdf driver and we should just
54  * pass requests strait on through and let it handle the remapping.
55  * Also, it's probably worth pointing out that most modern disks do bad
56  * block remapping internally in the hardware so there's actually less
57  * of a chance of us ever discovering bad blocks.  Also, in most cases
58  * this driver (and the xdf driver) will only be used with virtualized
59  * devices, so one might wonder why a virtual device would ever actually
60  * experience bad blocks.  To wrap this up, you might be wondering how
61  * these bad block mappings get created and how they are managed.  Well,
62  * there are two tools for managing bad block mappings, format(1M) and
63  * addbadsec(1M).  Format(1M) can be used to do a surface scan of a disk
64  * to attempt to find bad block and create mappings for them.  Format(1M)
65  * and addbadsec(1M) can also be used to edit existing mappings that may
66  * be saved on the disk.
67  *
68  * The underlying PV driver that this driver passes on requests to is the
69  * xdf driver.  Since in most cases the xdf driver doesn't deal with
70  * physical disks it has it's own algorithm for assigning a physical
71  * geometry to a virtual disk (ie, cylinder count, head count, etc.)
72  * The default values chosen by the xdf driver may not match those
73  * assigned to a disk by a hardware disk emulator in an HVM environment.
74  * This is a problem since these physical geometry attributes affect
75  * things like the partition table, backup label location, etc.  So
76  * to emulate disk devices correctly we need to know the physical geometry
77  * that was assigned to a disk at the time of it's initalization.
78  * Normally in an HVM environment this information will passed to
79  * the BIOS and operating system from the hardware emulator that is
80  * emulating the disk devices.  In the case of a solaris dom0+xvm
81  * this would be qemu.  So to work around this issue, this driver will
82  * query the emulated hardware to get the assigned physical geometry
83  * and then pass this geometry onto the xdf driver so that it can use it.
84  * But really, this information is essentially metadata about the disk
85  * that should be kept with the disk image itself.  (Assuming or course
86  * that a disk image is the actual backingstore for this emulated device.)
87  * This metadata should also be made available to PV drivers via a common
88  * mechamisn, probably the xenstore.  The fact that this metadata isn't
89  * available outside of HVM domains means that it's difficult to move
90  * disks between HVM and PV domains, since a fully PV domain will have no
91  * way of knowing what the correct geometry of the target device is.
92  * (Short of reading the disk, looking for things like partition tables
93  * and labels, and taking a best guess at what the geometry was when
94  * the disk was initialized.  Unsuprisingly, qemu actually does this.)
95  *
96  * This driver has to map cmdk device instances into their corresponding
97  * xdf device instances.  We have to do this to ensure that when a user
98  * accesses a emulated cmdk device we map those accesses to the proper
99  * paravirtualized device.  Basically what we need to know is how multiple
100  * 'disk' entries in a domU configuration file get mapped to emulated
101  * cmdk devices and to xdf devices.  The 'disk' entry to xdf instance
102  * mappings we know because those are done within the Solaris xvdi code
103  * and the xpvd nexus driver.  But the config to emulated devices mappings
104  * are handled entirely within the xen management tool chain and the
105  * hardware emulator.  Since all the tools that establish these mappings
106  * live in dom0, dom0 should really supply us with this information,
107  * probably via the xenstore.  Unfortunatly it doesn't so, since there's
108  * no good way to determine this mapping dynamically, this driver uses
109  * a hard coded set of static mappings.  These mappings are hardware
110  * emulator specific because each different hardware emulator could have
111  * a different device tree with different cmdk device paths.  This
112  * means that if we want to continue to use this static mapping approach
113  * to allow Solaris to run on different hardware emulators we'll have
114  * to analyze each of those emulators to determine what paths they
115  * use and hard code those paths into this driver.  yech.  This metadata
116  * really needs to be supplied to us by dom0.
117  *
118  * This driver access underlying xdf nodes.  Unfortunatly, devices
119  * must create minor nodes during attach, and for disk devices to create
120  * minor nodes, they have to look at the label on the disk, so this means
121  * that disk drivers must be able to access a disk contents during
122  * attach.  That means that this disk driver must be able to access
123  * underlying xdf nodes during attach.  Unfortunatly, due to device tree
124  * locking restrictions, we cannot have an attach operation occuring on
125  * this device and then attempt to access another device which may
126  * cause another attach to occur in a different device tree branch
127  * since this could result in deadlock.  Hence, this driver can only
128  * access xdf device nodes that we know are attached, and it can't use
129  * any ddi interfaces to access those nodes if those interfaces could
130  * trigger an attach of the xdf device.  So this driver works around
131  * these restrictions by talking directly to xdf devices via
132  * xdf_hvm_hold().  This interface takes a pathname to an xdf device,
133  * and if that device is already attached then it returns the a held dip
134  * pointer for that device node.  This prevents us from getting into
135  * deadlock situations, but now we need a mechanism to ensure that all
136  * the xdf device nodes this driver might access are attached before
137  * this driver tries to access them.  This is accomplished via the
138  * hvmboot_rootconf() callback which is invoked just before root is
139  * mounted.  hvmboot_rootconf() will attach xpvd and tell it to configure
140  * all xdf device visible to the system.  All these xdf device nodes
141  * will also be marked with the "ddi-no-autodetach" property so that
142  * once they are configured, the will not be automatically unconfigured.
143  * The only way that they could be unconfigured is if the administrator
144  * explicitly attempts to unload required modules via rem_drv(1M)
145  * or modunload(1M).
146  */
147 
148 /*
149  * 16 paritions + fdisk (see xdf.h)
150  */
151 #define	XDF_DEV2UNIT(dev)	XDF_INST((getminor((dev))))
152 #define	XDF_DEV2PART(dev)	XDF_PART((getminor((dev))))
153 
154 #define	OTYP_VALID(otyp)	((otyp == OTYP_BLK) || \
155 					(otyp == OTYP_CHR) || \
156 					(otyp == OTYP_LYR))
157 
158 #define	PV_CMDK_NODES		4
159 
160 typedef struct hvm_to_pv {
161 	char	*h2p_hvm_path;
162 	char	*h2p_pv_path;
163 } hvm_to_pv_t;
164 
165 /*
166  */
167 static hvm_to_pv_t pv_cmdk_h2p_xen_qemu[] = {
168 	/*
169 	 * The paths mapping here are very specific to xen and qemu.  When a
170 	 * domU is booted under xen in HVM mode, qemu is normally used to
171 	 * emulate up to four ide disks.  These disks always have the four
172 	 * path listed below.  To configure an emulated ide device, the
173 	 * xen domain configuration file normally has an entry that looks
174 	 * like this:
175 	 *	disk = [ 'file:/foo.img,hda,w' ]
176 	 *
177 	 * The part we're interested in is the 'hda', which we'll call the
178 	 * xen disk device name here.  The xen management tools (which parse
179 	 * the xen domain configuration file and launch qemu) makes the
180 	 * following assumptions about this value:
181 	 *	hda == emulated ide disk 0 (ide bus 0, master)
182 	 *	hdb == emulated ide disk 1 (ide bus 0, slave)
183 	 *	hdc == emulated ide disk 2 (ide bus 1, master)
184 	 *	hdd == emulated ide disk 3 (ide bus 1, slave)
185 	 *
186 	 * (Uncoincidentally, these xen disk device names actually map to
187 	 * the /dev filesystem names of ide disk devices in Linux.  So in
188 	 * Linux /dev/hda is the first ide disk.)  So for the first part of
189 	 * our mapping we've just hardcoded the cmdk paths that we know
190 	 * qemu will use.
191 	 *
192 	 * To understand the second half of the mapping (ie, the xdf device
193 	 * that each emulated cmdk device should be mapped two) we need to
194 	 * know the solaris device node address that will be assigned to
195 	 * each xdf device.  (The device node address is the hex number that
196 	 * comes after the "xdf@" in the device path.)
197 	 *
198 	 * Normally when a domU is run in non-HVM mode, the xen disk device
199 	 * names in the xen domain configuration file are specified with
200 	 * integers instead of Linux device names.  (for example, '0' would
201 	 * be used instead of 'hda'.)  So in the non-HVM case we simply
202 	 * convert the xen disk device name (which is an interger) into a
203 	 * hex number and use it as the Solaris xdf device node address.
204 	 * But when we're running in HVM mode then we have a string for the
205 	 * xen disk device name, so we can't simply use that as a solaris
206 	 * device node address.  Instead we fall back to using the xenstore
207 	 * device id for the xen disk device as the xdf device node address.
208 	 * The xdf device node address assignment happens in xvdi_init_dev().
209 	 *
210 	 * So the question becomes, how do we know what the xenstore device
211 	 * id for emulated disk will be?  Well, it turns out that since the
212 	 * xen management tools expect the disk device names to be Linux
213 	 * device names, those same management tools assign each disk a
214 	 * device id that matches the dev_t of the corresponding device
215 	 * under Linux.  (Big shocker.)  This xen device name-to-id mapping
216 	 * is currently all hard coded here:
217 	 *	xen.hg/tools/python/xen/util/blkif.py`blkdev_name_to_number()
218 	 *
219 	 * So looking at the code above we can see the following xen disk
220 	 * device name to xenstore device id mappings:
221 	 *	'hda' --> 0x300  == 0t768  == ((3  * 256) + (0 * 64))
222 	 *	'hdb' --> 0x340  == 0t832  == ((3  * 256) + (1 * 64))
223 	 *	'hdc' --> 0x1600 == 0t5632 == ((22 * 256) + (0 * 64))
224 	 *	'hdd' --> 0x1640 == 0t5696 == ((22 * 256) + (1 * 64))
225 	 */
226 	{ "/pci@0,0/pci-ide@1,1/ide@0/cmdk@0,0", "/xpvd/xdf@300" },
227 	{ "/pci@0,0/pci-ide@1,1/ide@0/cmdk@1,0", "/xpvd/xdf@340" },
228 	{ "/pci@0,0/pci-ide@1,1/ide@1/cmdk@0,0", "/xpvd/xdf@1600" },
229 	{ "/pci@0,0/pci-ide@1,1/ide@1/cmdk@1,0", "/xpvd/xdf@1640" },
230 	{ NULL, 0 }
231 };
232 
233 typedef struct pv_cmdk {
234 	dev_info_t	*dk_dip;
235 	cmlb_handle_t	dk_cmlbhandle;
236 	ddi_devid_t	dk_devid;
237 	kmutex_t	dk_mutex;
238 	dev_info_t	*dk_xdf_dip;
239 	dev_t		dk_xdf_dev;
240 	int		dk_xdf_otyp_count[OTYPCNT][XDF_PEXT];
241 	ldi_handle_t	dk_xdf_lh[XDF_PEXT];
242 } pv_cmdk_t;
243 
244 /*
245  * Globals
246  */
247 static void *pv_cmdk_state;
248 static major_t pv_cmdk_major;
249 static hvm_to_pv_t *pv_cmdk_h2p;
250 
251 /*
252  * Function prototypes for xdf callback functions
253  */
254 extern int xdf_lb_getinfo(dev_info_t *, int, void *, void *);
255 extern int xdf_lb_rdwr(dev_info_t *, uchar_t, void *, diskaddr_t, size_t,
256     void *);
257 
258 static boolean_t
259 pv_cmdk_isopen_part(struct pv_cmdk *dkp, int part)
260 {
261 	int otyp;
262 
263 	ASSERT(MUTEX_HELD(&dkp->dk_mutex));
264 
265 	for (otyp = 0; (otyp < OTYPCNT); otyp++) {
266 		if (dkp->dk_xdf_otyp_count[otyp][part] != 0)
267 			return (B_TRUE);
268 	}
269 	return (B_FALSE);
270 }
271 
272 /*
273  * Cmlb ops vectors, allows the cmlb module to directly access the entire
274  * pv_cmdk disk device without going through any partitioning layers.
275  */
276 /*ARGSUSED*/
277 static int
278 pv_cmdk_lb_rdwr(dev_info_t *dip, uchar_t cmd, void *bufaddr,
279     diskaddr_t start, size_t count, void *tg_cookie)
280 {
281 	int		instance = ddi_get_instance(dip);
282 	struct pv_cmdk	*dkp = ddi_get_soft_state(pv_cmdk_state, instance);
283 
284 	if (dkp == NULL)
285 		return (ENXIO);
286 
287 	return (xdf_lb_rdwr(dkp->dk_xdf_dip, cmd, bufaddr, start, count,
288 	    tg_cookie));
289 }
290 
291 /*ARGSUSED*/
292 static int
293 pv_cmdk_lb_getinfo(dev_info_t *dip, int cmd, void *arg, void *tg_cookie)
294 {
295 	int		instance = ddi_get_instance(dip);
296 	struct pv_cmdk	*dkp = ddi_get_soft_state(pv_cmdk_state, instance);
297 	int		err;
298 
299 	if (dkp == NULL)
300 		return (ENXIO);
301 
302 	if (cmd == TG_GETVIRTGEOM) {
303 		cmlb_geom_t	pgeom, *vgeomp;
304 		diskaddr_t	capacity;
305 
306 		/*
307 		 * The native xdf driver doesn't support this ioctl.
308 		 * Intead of passing it on, emulate it here so that the
309 		 * results look the same as what we get for a real cmdk
310 		 * device.
311 		 *
312 		 * Get the real size of the device
313 		 */
314 		if ((err = xdf_lb_getinfo(dkp->dk_xdf_dip,
315 		    TG_GETPHYGEOM, &pgeom, tg_cookie)) != 0)
316 			return (err);
317 		capacity = pgeom.g_capacity;
318 
319 		/*
320 		 * If the controller returned us something that doesn't
321 		 * really fit into an Int 13/function 8 geometry
322 		 * result, just fail the ioctl.  See PSARC 1998/313.
323 		 */
324 		if (capacity >= (63 * 254 * 1024))
325 			return (EINVAL);
326 
327 		vgeomp = (cmlb_geom_t *)arg;
328 		vgeomp->g_capacity	= capacity;
329 		vgeomp->g_nsect		= 63;
330 		vgeomp->g_nhead		= 254;
331 		vgeomp->g_ncyl		= capacity / (63 * 254);
332 		vgeomp->g_acyl		= 0;
333 		vgeomp->g_secsize	= 512;
334 		vgeomp->g_intrlv	= 1;
335 		vgeomp->g_rpm		= 3600;
336 		return (0);
337 	}
338 
339 	return (xdf_lb_getinfo(dkp->dk_xdf_dip, cmd, arg, tg_cookie));
340 }
341 
342 static cmlb_tg_ops_t pv_cmdk_lb_ops = {
343 	TG_DK_OPS_VERSION_1,
344 	pv_cmdk_lb_rdwr,
345 	pv_cmdk_lb_getinfo
346 };
347 
348 /*
349  * devid management functions
350  */
351 
352 /*
353  * pv_cmdk_get_modser() is basically a local copy of
354  * cmdk_get_modser() modified to work without the dadk layer.
355  * (which the non-pv version of the cmdk driver uses.)
356  */
357 static int
358 pv_cmdk_get_modser(struct pv_cmdk *dkp, int ioccmd, char *buf, int len)
359 {
360 	struct scsi_device	*scsi_device;
361 	opaque_t		ctlobjp;
362 	dadk_ioc_string_t	strarg;
363 	char			*s;
364 	char			ch;
365 	boolean_t		ret;
366 	int			i;
367 	int			tb;
368 
369 	strarg.is_buf = buf;
370 	strarg.is_size = len;
371 	scsi_device = ddi_get_driver_private(dkp->dk_dip);
372 	ctlobjp = scsi_device->sd_address.a_hba_tran;
373 	if (CTL_IOCTL(ctlobjp,
374 	    ioccmd, (uintptr_t)&strarg, FNATIVE | FKIOCTL) != 0)
375 		return (0);
376 
377 	/*
378 	 * valid model/serial string must contain a non-zero non-space
379 	 * trim trailing spaces/NULL
380 	 */
381 	ret = B_FALSE;
382 	s = buf;
383 	for (i = 0; i < strarg.is_size; i++) {
384 		ch = *s++;
385 		if (ch != ' ' && ch != '\0')
386 			tb = i + 1;
387 		if (ch != ' ' && ch != '\0' && ch != '0')
388 			ret = B_TRUE;
389 	}
390 
391 	if (ret == B_FALSE)
392 		return (0);
393 
394 	return (tb);
395 }
396 
397 /*
398  * pv_cmdk_devid_modser() is basically a copy of cmdk_devid_modser()
399  * that has been modified to use local pv cmdk driver functions.
400  *
401  * Build a devid from the model and serial number
402  * Return DDI_SUCCESS or DDI_FAILURE.
403  */
404 static int
405 pv_cmdk_devid_modser(struct pv_cmdk *dkp)
406 {
407 	int	rc = DDI_FAILURE;
408 	char	*hwid;
409 	int	modlen;
410 	int	serlen;
411 
412 	/*
413 	 * device ID is a concatenation of model number, '=', serial number.
414 	 */
415 	hwid = kmem_alloc(CMDK_HWIDLEN, KM_SLEEP);
416 	modlen = pv_cmdk_get_modser(dkp, DIOCTL_GETMODEL, hwid, CMDK_HWIDLEN);
417 	if (modlen == 0)
418 		goto err;
419 
420 	hwid[modlen++] = '=';
421 	serlen = pv_cmdk_get_modser(dkp, DIOCTL_GETSERIAL,
422 	    hwid + modlen, CMDK_HWIDLEN - modlen);
423 	if (serlen == 0)
424 		goto err;
425 
426 	hwid[modlen + serlen] = 0;
427 
428 	/* Initialize the device ID, trailing NULL not included */
429 	rc = ddi_devid_init(dkp->dk_dip, DEVID_ATA_SERIAL, modlen + serlen,
430 	    hwid, (ddi_devid_t *)&dkp->dk_devid);
431 	if (rc != DDI_SUCCESS)
432 		goto err;
433 
434 	kmem_free(hwid, CMDK_HWIDLEN);
435 	return (DDI_SUCCESS);
436 
437 err:
438 	kmem_free(hwid, CMDK_HWIDLEN);
439 	return (DDI_FAILURE);
440 }
441 
442 /*
443  * pv_cmdk_devid_read() is basically a local copy of
444  * cmdk_devid_read() modified to work without the dadk layer.
445  * (which the non-pv version of the cmdk driver uses.)
446  *
447  * Read a devid from on the first block of the last track of
448  * the last cylinder.  Make sure what we read is a valid devid.
449  * Return DDI_SUCCESS or DDI_FAILURE.
450  */
451 static int
452 pv_cmdk_devid_read(struct pv_cmdk *dkp)
453 {
454 	diskaddr_t	blk;
455 	struct dk_devid *dkdevidp;
456 	uint_t		*ip, chksum;
457 	int		i;
458 
459 	if (cmlb_get_devid_block(dkp->dk_cmlbhandle, &blk, 0) != 0)
460 		return (DDI_FAILURE);
461 
462 	dkdevidp = kmem_zalloc(NBPSCTR, KM_SLEEP);
463 	if (pv_cmdk_lb_rdwr(dkp->dk_dip,
464 	    TG_READ, dkdevidp, blk, NBPSCTR, NULL) != 0)
465 		goto err;
466 
467 	/* Validate the revision */
468 	if ((dkdevidp->dkd_rev_hi != DK_DEVID_REV_MSB) ||
469 	    (dkdevidp->dkd_rev_lo != DK_DEVID_REV_LSB))
470 		goto err;
471 
472 	/* Calculate the checksum */
473 	chksum = 0;
474 	ip = (uint_t *)dkdevidp;
475 	for (i = 0; i < ((NBPSCTR - sizeof (int))/sizeof (int)); i++)
476 		chksum ^= ip[i];
477 	if (DKD_GETCHKSUM(dkdevidp) != chksum)
478 		goto err;
479 
480 	/* Validate the device id */
481 	if (ddi_devid_valid((ddi_devid_t)dkdevidp->dkd_devid) != DDI_SUCCESS)
482 		goto err;
483 
484 	/* keep a copy of the device id */
485 	i = ddi_devid_sizeof((ddi_devid_t)dkdevidp->dkd_devid);
486 	dkp->dk_devid = kmem_alloc(i, KM_SLEEP);
487 	bcopy(dkdevidp->dkd_devid, dkp->dk_devid, i);
488 	kmem_free(dkdevidp, NBPSCTR);
489 	return (DDI_SUCCESS);
490 
491 err:
492 	kmem_free(dkdevidp, NBPSCTR);
493 	return (DDI_FAILURE);
494 }
495 
496 /*
497  * pv_cmdk_devid_fabricate() is basically a local copy of
498  * cmdk_devid_fabricate() modified to work without the dadk layer.
499  * (which the non-pv version of the cmdk driver uses.)
500  *
501  * Create a devid and write it on the first block of the last track of
502  * the last cylinder.
503  * Return DDI_SUCCESS or DDI_FAILURE.
504  */
505 static int
506 pv_cmdk_devid_fabricate(struct pv_cmdk *dkp)
507 {
508 	ddi_devid_t	devid = NULL; /* devid made by ddi_devid_init  */
509 	struct dk_devid	*dkdevidp = NULL; /* devid struct stored on disk */
510 	diskaddr_t	blk;
511 	uint_t		*ip, chksum;
512 	int		i;
513 
514 	if (cmlb_get_devid_block(dkp->dk_cmlbhandle, &blk, 0) != 0)
515 		return (DDI_FAILURE);
516 
517 	if (ddi_devid_init(dkp->dk_dip, DEVID_FAB, 0, NULL, &devid) !=
518 	    DDI_SUCCESS)
519 		return (DDI_FAILURE);
520 
521 	/* allocate a buffer */
522 	dkdevidp = (struct dk_devid *)kmem_zalloc(NBPSCTR, KM_SLEEP);
523 
524 	/* Fill in the revision */
525 	dkdevidp->dkd_rev_hi = DK_DEVID_REV_MSB;
526 	dkdevidp->dkd_rev_lo = DK_DEVID_REV_LSB;
527 
528 	/* Copy in the device id */
529 	i = ddi_devid_sizeof(devid);
530 	if (i > DK_DEVID_SIZE)
531 		goto err;
532 	bcopy(devid, dkdevidp->dkd_devid, i);
533 
534 	/* Calculate the chksum */
535 	chksum = 0;
536 	ip = (uint_t *)dkdevidp;
537 	for (i = 0; i < ((NBPSCTR - sizeof (int))/sizeof (int)); i++)
538 		chksum ^= ip[i];
539 
540 	/* Fill in the checksum */
541 	DKD_FORMCHKSUM(chksum, dkdevidp);
542 
543 	if (pv_cmdk_lb_rdwr(dkp->dk_dip,
544 	    TG_WRITE, dkdevidp, blk, NBPSCTR, NULL) != 0)
545 		goto err;
546 
547 	kmem_free(dkdevidp, NBPSCTR);
548 
549 	dkp->dk_devid = devid;
550 	return (DDI_SUCCESS);
551 
552 err:
553 	if (dkdevidp != NULL)
554 		kmem_free(dkdevidp, NBPSCTR);
555 	if (devid != NULL)
556 		ddi_devid_free(devid);
557 	return (DDI_FAILURE);
558 }
559 
560 /*
561  * pv_cmdk_devid_setup() is basically a local copy ofcmdk_devid_setup()
562  * that has been modified to use local pv cmdk driver functions.
563  *
564  * Create and register the devid.
565  * There are 4 different ways we can get a device id:
566  *    1. Already have one - nothing to do
567  *    2. Build one from the drive's model and serial numbers
568  *    3. Read one from the disk (first sector of last track)
569  *    4. Fabricate one and write it on the disk.
570  * If any of these succeeds, register the deviceid
571  */
572 static void
573 pv_cmdk_devid_setup(struct pv_cmdk *dkp)
574 {
575 	int	rc;
576 
577 	/* Try options until one succeeds, or all have failed */
578 
579 	/* 1. All done if already registered */
580 
581 	if (dkp->dk_devid != NULL)
582 		return;
583 
584 	/* 2. Build a devid from the model and serial number */
585 	rc = pv_cmdk_devid_modser(dkp);
586 	if (rc != DDI_SUCCESS) {
587 		/* 3. Read devid from the disk, if present */
588 		rc = pv_cmdk_devid_read(dkp);
589 
590 		/* 4. otherwise make one up and write it on the disk */
591 		if (rc != DDI_SUCCESS)
592 			rc = pv_cmdk_devid_fabricate(dkp);
593 	}
594 
595 	/* If we managed to get a devid any of the above ways, register it */
596 	if (rc == DDI_SUCCESS)
597 		(void) ddi_devid_register(dkp->dk_dip, dkp->dk_devid);
598 }
599 
600 /*
601  * Local Functions
602  */
603 static int
604 pv_cmdk_iodone(struct buf *bp)
605 {
606 	struct buf	*bp_orig = bp->b_chain;
607 
608 	/* Propegate back the io results */
609 	bp_orig->b_resid = bp->b_resid;
610 	bioerror(bp_orig, geterror(bp));
611 	biodone(bp_orig);
612 
613 	freerbuf(bp);
614 	return (0);
615 }
616 
617 static int
618 pv_cmdkstrategy(struct buf *bp)
619 {
620 	dev_t		dev = bp->b_edev;
621 	int		instance = XDF_DEV2UNIT(dev);
622 	int		part = XDF_DEV2PART(dev);
623 	struct pv_cmdk	*dkp = ddi_get_soft_state(pv_cmdk_state, instance);
624 	dev_t		xdf_devt;
625 	struct buf	*bp_clone;
626 
627 	/*
628 	 * Sanity checks that the dev_t associated with the buf we were
629 	 * passed actually corresponds us and that the partition we're
630 	 * trying to access is actually open.  On debug kernels we'll
631 	 * panic and on non-debug kernels we'll return failure.
632 	 */
633 	ASSERT(getmajor(dev) == pv_cmdk_major);
634 	if (getmajor(dev) != pv_cmdk_major)
635 		goto err;
636 
637 	mutex_enter(&dkp->dk_mutex);
638 	ASSERT(pv_cmdk_isopen_part(dkp, part));
639 	if (!pv_cmdk_isopen_part(dkp, part)) {
640 		mutex_exit(&dkp->dk_mutex);
641 		goto err;
642 	}
643 	mutex_exit(&dkp->dk_mutex);
644 
645 	/* clone this buffer */
646 	xdf_devt = dkp->dk_xdf_dev | part;
647 	bp_clone = bioclone(bp, 0, bp->b_bcount, xdf_devt, bp->b_blkno,
648 	    pv_cmdk_iodone, NULL, KM_SLEEP);
649 	bp_clone->b_chain = bp;
650 
651 	/*
652 	 * If we're being invoked on behalf of the physio() call in
653 	 * pv_cmdk_dioctl_rwcmd() then b_private will be set to
654 	 * XB_SLICE_NONE and we need to propegate this flag into the
655 	 * cloned buffer so that the xdf driver will see it.
656 	 */
657 	if (bp->b_private == (void *)XB_SLICE_NONE)
658 		bp_clone->b_private = (void *)XB_SLICE_NONE;
659 
660 	/*
661 	 * Pass on the cloned buffer.  Note that we don't bother to check
662 	 * for failure because the xdf strategy routine will have to
663 	 * invoke biodone() if it wants to return an error, which means
664 	 * that the pv_cmdk_iodone() callback will get invoked and it
665 	 * will propegate the error back up the stack and free the cloned
666 	 * buffer.
667 	 */
668 	ASSERT(dkp->dk_xdf_lh[part] != NULL);
669 	return (ldi_strategy(dkp->dk_xdf_lh[part], bp_clone));
670 
671 err:
672 	bioerror(bp, ENXIO);
673 	bp->b_resid = bp->b_bcount;
674 	biodone(bp);
675 	return (0);
676 }
677 
678 /*ARGSUSED*/
679 static int
680 pv_cmdkread(dev_t dev, struct uio *uio, cred_t *credp)
681 {
682 	int		instance = XDF_DEV2UNIT(dev);
683 	int		part = XDF_DEV2PART(dev);
684 	struct pv_cmdk	*dkp = ddi_get_soft_state(pv_cmdk_state, instance);
685 
686 	return (ldi_read(dkp->dk_xdf_lh[part], uio, credp));
687 }
688 
689 /*ARGSUSED*/
690 static int
691 pv_cmdkwrite(dev_t dev, struct uio *uio, cred_t *credp)
692 {
693 	int		instance = XDF_DEV2UNIT(dev);
694 	int		part = XDF_DEV2PART(dev);
695 	struct pv_cmdk	*dkp = ddi_get_soft_state(pv_cmdk_state, instance);
696 
697 	return (ldi_write(dkp->dk_xdf_lh[part], uio, credp));
698 }
699 
700 /*ARGSUSED*/
701 static int
702 pv_cmdkaread(dev_t dev, struct aio_req *aio, cred_t *credp)
703 {
704 	int		instance = XDF_DEV2UNIT(dev);
705 	int		part = XDF_DEV2PART(dev);
706 	struct pv_cmdk	*dkp = ddi_get_soft_state(pv_cmdk_state, instance);
707 	return (ldi_aread(dkp->dk_xdf_lh[part], aio, credp));
708 }
709 
710 /*ARGSUSED*/
711 static int
712 pv_cmdkawrite(dev_t dev, struct aio_req *aio, cred_t *credp)
713 {
714 	int		instance = XDF_DEV2UNIT(dev);
715 	int		part = XDF_DEV2PART(dev);
716 	struct pv_cmdk	*dkp = ddi_get_soft_state(pv_cmdk_state, instance);
717 	return (ldi_awrite(dkp->dk_xdf_lh[part], aio, credp));
718 }
719 
720 static int
721 pv_cmdkdump(dev_t dev, caddr_t addr, daddr_t blkno, int nblk)
722 {
723 	int		instance = XDF_DEV2UNIT(dev);
724 	int		part = XDF_DEV2PART(dev);
725 	struct pv_cmdk	*dkp = ddi_get_soft_state(pv_cmdk_state, instance);
726 
727 	return (ldi_dump(dkp->dk_xdf_lh[part], addr, blkno, nblk));
728 }
729 
730 /*
731  * pv_rwcmd_copyin() is a duplicate of rwcmd_copyin().
732  */
733 static int
734 pv_rwcmd_copyin(struct dadkio_rwcmd *rwcmdp, caddr_t inaddr, int flag)
735 {
736 	switch (ddi_model_convert_from(flag)) {
737 		case DDI_MODEL_ILP32: {
738 			struct dadkio_rwcmd32 cmd32;
739 
740 			if (ddi_copyin(inaddr, &cmd32,
741 			    sizeof (struct dadkio_rwcmd32), flag)) {
742 				return (EFAULT);
743 			}
744 
745 			rwcmdp->cmd = cmd32.cmd;
746 			rwcmdp->flags = cmd32.flags;
747 			rwcmdp->blkaddr = (daddr_t)cmd32.blkaddr;
748 			rwcmdp->buflen = cmd32.buflen;
749 			rwcmdp->bufaddr = (caddr_t)(intptr_t)cmd32.bufaddr;
750 			/*
751 			 * Note: we do not convert the 'status' field,
752 			 * as it should not contain valid data at this
753 			 * point.
754 			 */
755 			bzero(&rwcmdp->status, sizeof (rwcmdp->status));
756 			break;
757 		}
758 		case DDI_MODEL_NONE: {
759 			if (ddi_copyin(inaddr, rwcmdp,
760 			    sizeof (struct dadkio_rwcmd), flag)) {
761 				return (EFAULT);
762 			}
763 		}
764 	}
765 	return (0);
766 }
767 
768 /*
769  * pv_rwcmd_copyout() is a duplicate of rwcmd_copyout().
770  */
771 static int
772 pv_rwcmd_copyout(struct dadkio_rwcmd *rwcmdp, caddr_t outaddr, int flag)
773 {
774 	switch (ddi_model_convert_from(flag)) {
775 		case DDI_MODEL_ILP32: {
776 			struct dadkio_rwcmd32 cmd32;
777 
778 			cmd32.cmd = rwcmdp->cmd;
779 			cmd32.flags = rwcmdp->flags;
780 			cmd32.blkaddr = rwcmdp->blkaddr;
781 			cmd32.buflen = rwcmdp->buflen;
782 			ASSERT64(((uintptr_t)rwcmdp->bufaddr >> 32) == 0);
783 			cmd32.bufaddr = (caddr32_t)(uintptr_t)rwcmdp->bufaddr;
784 
785 			cmd32.status.status = rwcmdp->status.status;
786 			cmd32.status.resid = rwcmdp->status.resid;
787 			cmd32.status.failed_blk_is_valid =
788 			    rwcmdp->status.failed_blk_is_valid;
789 			cmd32.status.failed_blk = rwcmdp->status.failed_blk;
790 			cmd32.status.fru_code_is_valid =
791 			    rwcmdp->status.fru_code_is_valid;
792 			cmd32.status.fru_code = rwcmdp->status.fru_code;
793 
794 			bcopy(rwcmdp->status.add_error_info,
795 			    cmd32.status.add_error_info, DADKIO_ERROR_INFO_LEN);
796 
797 			if (ddi_copyout(&cmd32, outaddr,
798 			    sizeof (struct dadkio_rwcmd32), flag))
799 				return (EFAULT);
800 			break;
801 		}
802 		case DDI_MODEL_NONE: {
803 			if (ddi_copyout(rwcmdp, outaddr,
804 			    sizeof (struct dadkio_rwcmd), flag))
805 			return (EFAULT);
806 		}
807 	}
808 	return (0);
809 }
810 
811 static void
812 pv_cmdkmin(struct buf *bp)
813 {
814 	if (bp->b_bcount > DK_MAXRECSIZE)
815 		bp->b_bcount = DK_MAXRECSIZE;
816 }
817 
818 static int
819 pv_cmdk_dioctl_rwcmd(dev_t dev, intptr_t arg, int flag)
820 {
821 	struct dadkio_rwcmd	*rwcmdp;
822 	struct iovec		aiov;
823 	struct uio		auio;
824 	struct buf		*bp;
825 	int			rw, status;
826 
827 	rwcmdp = kmem_alloc(sizeof (struct dadkio_rwcmd), KM_SLEEP);
828 	status = pv_rwcmd_copyin(rwcmdp, (caddr_t)arg, flag);
829 
830 	if (status != 0)
831 		goto out;
832 
833 	switch (rwcmdp->cmd) {
834 		case DADKIO_RWCMD_READ:
835 		case DADKIO_RWCMD_WRITE:
836 			break;
837 		default:
838 			status = EINVAL;
839 			goto out;
840 	}
841 
842 	bzero((caddr_t)&aiov, sizeof (struct iovec));
843 	aiov.iov_base = rwcmdp->bufaddr;
844 	aiov.iov_len = rwcmdp->buflen;
845 
846 	bzero((caddr_t)&auio, sizeof (struct uio));
847 	auio.uio_iov = &aiov;
848 	auio.uio_iovcnt = 1;
849 	auio.uio_loffset = (offset_t)rwcmdp->blkaddr * (offset_t)XB_BSIZE;
850 	auio.uio_resid = rwcmdp->buflen;
851 	auio.uio_segflg = (flag & FKIOCTL) ? UIO_SYSSPACE : UIO_USERSPACE;
852 
853 	/*
854 	 * Tell the xdf driver that this I/O request is using an absolute
855 	 * offset.
856 	 */
857 	bp = getrbuf(KM_SLEEP);
858 	bp->b_private = (void *)XB_SLICE_NONE;
859 
860 	rw = ((rwcmdp->cmd == DADKIO_RWCMD_WRITE) ? B_WRITE : B_READ);
861 	status = physio(pv_cmdkstrategy, bp, dev, rw, pv_cmdkmin, &auio);
862 
863 	biofini(bp);
864 	kmem_free(bp, sizeof (buf_t));
865 
866 	if (status == 0)
867 		status = pv_rwcmd_copyout(rwcmdp, (caddr_t)arg, flag);
868 
869 out:
870 	kmem_free(rwcmdp, sizeof (struct dadkio_rwcmd));
871 	return (status);
872 }
873 
874 static int
875 pv_cmdkioctl(dev_t dev, int cmd, intptr_t arg, int flag, cred_t *credp,
876     int *rvalp)
877 {
878 	int		instance = XDF_DEV2UNIT(dev);
879 	int		part = XDF_DEV2PART(dev);
880 	struct pv_cmdk	*dkp = ddi_get_soft_state(pv_cmdk_state, instance);
881 	int		err;
882 
883 	switch (cmd) {
884 	default:
885 		return (ldi_ioctl(dkp->dk_xdf_lh[part],
886 		    cmd, arg, flag, credp, rvalp));
887 	case DKIOCGETWCE:
888 	case DKIOCSETWCE:
889 		return (EIO);
890 	case DKIOCADDBAD: {
891 		/*
892 		 * This is for ata/ide bad block handling.  It is supposed
893 		 * to cause the driver to re-read the bad block list and
894 		 * alternate map after it has been updated.  Our driver
895 		 * will refuse to attach to any disk which has a bad blocks
896 		 * list defined, so there really isn't much to do here.
897 		 */
898 		return (0);
899 	}
900 	case DKIOCGETDEF: {
901 		/*
902 		 * I can't actually find any code that utilizes this ioctl,
903 		 * hence we're leaving it explicitly unimplemented.
904 		 */
905 		ASSERT("ioctl cmd unsupported by pv_cmdk: DKIOCGETDEF");
906 		return (EIO);
907 	}
908 	case DIOCTL_RWCMD: {
909 		/*
910 		 * This just seems to just be an alternate interface for
911 		 * reading and writing the disk.  Great, another way to
912 		 * do the same thing...
913 		 */
914 		return (pv_cmdk_dioctl_rwcmd(dev, arg, flag));
915 	}
916 	case DKIOCINFO: {
917 		dev_info_t	*dip = dkp->dk_dip;
918 		struct dk_cinfo	info;
919 
920 		/* Pass on the ioctl request, save the response */
921 		if ((err = ldi_ioctl(dkp->dk_xdf_lh[part],
922 		    cmd, (intptr_t)&info, FKIOCTL, credp, rvalp)) != 0)
923 			return (err);
924 
925 		/* Update controller info */
926 		info.dki_cnum = ddi_get_instance(ddi_get_parent(dip));
927 		(void) strlcpy(info.dki_cname,
928 		    ddi_get_name(ddi_get_parent(dip)), sizeof (info.dki_cname));
929 
930 		/* Update unit info. */
931 		if (info.dki_ctype == DKC_VBD)
932 			info.dki_ctype = DKC_DIRECT;
933 		info.dki_unit = instance;
934 		(void) strlcpy(info.dki_dname,
935 		    ddi_driver_name(dip), sizeof (info.dki_dname));
936 		info.dki_addr = 1;
937 
938 		if (ddi_copyout(&info, (void *)arg, sizeof (info), flag))
939 			return (EFAULT);
940 		return (0);
941 	}
942 	} /* switch (cmd) */
943 	/*NOTREACHED*/
944 }
945 
946 /*ARGSUSED*/
947 static int
948 pv_cmdkopen(dev_t *dev_p, int flag, int otyp, cred_t *credp)
949 {
950 	ldi_ident_t	li;
951 	dev_t		dev = *dev_p;
952 	int		instance = XDF_DEV2UNIT(dev);
953 	int		part = XDF_DEV2PART(dev);
954 	struct pv_cmdk	*dkp = ddi_get_soft_state(pv_cmdk_state, instance);
955 	dev_t		xdf_devt = dkp->dk_xdf_dev | part;
956 	int		err = 0;
957 
958 	if ((otyp < 0) || (otyp >= OTYPCNT))
959 		return (EINVAL);
960 
961 	/* allocate an ldi handle */
962 	VERIFY(ldi_ident_from_dev(*dev_p, &li) == 0);
963 
964 	mutex_enter(&dkp->dk_mutex);
965 
966 	/*
967 	 * We translate all device opens (chr, blk, and lyr) into
968 	 * block device opens.  Why?  Because for all the opens that
969 	 * come through this driver, we only keep around one LDI handle.
970 	 * So that handle can only be of one open type.  The reason
971 	 * that we choose the block interface for this is that to use
972 	 * the block interfaces for a device the system needs to allocatex
973 	 * buf_ts, which are associated with system memory which can act
974 	 * as a cache for device data.  So normally when a block device
975 	 * is closed the system will ensure that all these pages get
976 	 * flushed out of memory.  But if we were to open the device
977 	 * as a character device, then when we went to close the underlying
978 	 * device (even if we had invoked the block interfaces) any data
979 	 * remaining in memory wouldn't necessairly be flushed out
980 	 * before the device was closed.
981 	 */
982 	if (dkp->dk_xdf_lh[part] == NULL) {
983 		ASSERT(!pv_cmdk_isopen_part(dkp, part));
984 
985 		err = ldi_open_by_dev(&xdf_devt, OTYP_BLK, flag, credp,
986 		    &dkp->dk_xdf_lh[part], li);
987 
988 		if (err != 0) {
989 			mutex_exit(&dkp->dk_mutex);
990 			ldi_ident_release(li);
991 			return (err);
992 		}
993 
994 		/* Disk devices really shouldn't clone */
995 		ASSERT(xdf_devt == (dkp->dk_xdf_dev | part));
996 	} else {
997 		ldi_handle_t lh_tmp;
998 
999 		ASSERT(pv_cmdk_isopen_part(dkp, part));
1000 
1001 		/* do ldi open/close to get flags and cred check */
1002 		err = ldi_open_by_dev(&xdf_devt, OTYP_BLK, flag, credp,
1003 		    &lh_tmp, li);
1004 		if (err != 0) {
1005 			mutex_exit(&dkp->dk_mutex);
1006 			ldi_ident_release(li);
1007 			return (err);
1008 		}
1009 
1010 		/* Disk devices really shouldn't clone */
1011 		ASSERT(xdf_devt == (dkp->dk_xdf_dev | part));
1012 		(void) ldi_close(lh_tmp, flag, credp);
1013 	}
1014 	ldi_ident_release(li);
1015 
1016 	dkp->dk_xdf_otyp_count[otyp][part]++;
1017 
1018 	mutex_exit(&dkp->dk_mutex);
1019 	return (0);
1020 }
1021 
1022 /*ARGSUSED*/
1023 static int
1024 pv_cmdkclose(dev_t dev, int flag, int otyp, cred_t *credp)
1025 {
1026 	int		instance = XDF_DEV2UNIT(dev);
1027 	int		part = XDF_DEV2PART(dev);
1028 	struct pv_cmdk	*dkp = ddi_get_soft_state(pv_cmdk_state, instance);
1029 	int		err = 0;
1030 
1031 	ASSERT((otyp >= 0) && otyp < OTYPCNT);
1032 
1033 	/*
1034 	 * Sanity check that that the dev_t specified corresponds to this
1035 	 * driver and that the device is actually open.  On debug kernels we'll
1036 	 * panic and on non-debug kernels we'll return failure.
1037 	 */
1038 	ASSERT(getmajor(dev) == pv_cmdk_major);
1039 	if (getmajor(dev) != pv_cmdk_major)
1040 		return (ENXIO);
1041 
1042 	mutex_enter(&dkp->dk_mutex);
1043 	ASSERT(pv_cmdk_isopen_part(dkp, part));
1044 	if (!pv_cmdk_isopen_part(dkp, part)) {
1045 		mutex_exit(&dkp->dk_mutex);
1046 		return (ENXIO);
1047 	}
1048 
1049 	ASSERT(dkp->dk_xdf_lh[part] != NULL);
1050 	ASSERT(dkp->dk_xdf_otyp_count[otyp][part] > 0);
1051 	if (otyp == OTYP_LYR) {
1052 		dkp->dk_xdf_otyp_count[otyp][part]--;
1053 	} else {
1054 		dkp->dk_xdf_otyp_count[otyp][part] = 0;
1055 	}
1056 
1057 	if (!pv_cmdk_isopen_part(dkp, part)) {
1058 		err = ldi_close(dkp->dk_xdf_lh[part], flag, credp);
1059 		dkp->dk_xdf_lh[part] = NULL;
1060 	}
1061 
1062 	mutex_exit(&dkp->dk_mutex);
1063 
1064 	return (err);
1065 }
1066 
1067 static int
1068 pv_cmdk_getpgeom(dev_info_t *dip, cmlb_geom_t *pgeom)
1069 {
1070 	struct scsi_device	*scsi_device;
1071 	struct tgdk_geom	tgdk_geom;
1072 	opaque_t		ctlobjp;
1073 	int			err;
1074 
1075 	scsi_device = ddi_get_driver_private(dip);
1076 	ctlobjp = scsi_device->sd_address.a_hba_tran;
1077 	if ((err = CTL_IOCTL(ctlobjp,
1078 	    DIOCTL_GETPHYGEOM, (uintptr_t)&tgdk_geom, FKIOCTL)) != 0)
1079 		return (err);
1080 
1081 	/* This driver won't work if this isn't true */
1082 	ASSERT(tgdk_geom.g_secsiz == XB_BSIZE);
1083 
1084 	pgeom->g_ncyl = tgdk_geom.g_cyl;
1085 	pgeom->g_acyl = tgdk_geom.g_acyl;
1086 	pgeom->g_nhead = tgdk_geom.g_head;
1087 	pgeom->g_nsect = tgdk_geom.g_sec;
1088 	pgeom->g_secsize = tgdk_geom.g_secsiz;
1089 	pgeom->g_capacity = tgdk_geom.g_cap;
1090 	pgeom->g_intrlv = 1;
1091 	pgeom->g_rpm = 3600;
1092 	return (0);
1093 }
1094 
1095 /*
1096  * pv_cmdk_bb_check() checks for the existance of bad blocks mappings in
1097  * the alternate partition/slice.  Returns B_FALSE is there are no bad
1098  * block mappins found, and B_TRUE is there are bad block mappins found.
1099  */
1100 static boolean_t
1101 pv_cmdk_bb_check(struct pv_cmdk *dkp)
1102 {
1103 	struct alts_parttbl	*ap;
1104 	diskaddr_t		nblocks, blk;
1105 	uint32_t		altused, altbase, altlast;
1106 	uint16_t		vtoctag;
1107 	int			alts;
1108 
1109 	/* find slice with V_ALTSCTR tag */
1110 	for (alts = 0; alts < NDKMAP; alts++) {
1111 
1112 		if (cmlb_partinfo(dkp->dk_cmlbhandle, alts,
1113 		    &nblocks, &blk, NULL, &vtoctag, 0) != 0) {
1114 			/* no partition table exists */
1115 			return (B_FALSE);
1116 		}
1117 
1118 		if ((vtoctag == V_ALTSCTR) && (nblocks > 1))
1119 			break;
1120 	}
1121 	if (alts >= NDKMAP)
1122 		return (B_FALSE); /* no V_ALTSCTR slice defined */
1123 
1124 	/* read in ALTS label block */
1125 	ap = (struct alts_parttbl *)kmem_zalloc(NBPSCTR, KM_SLEEP);
1126 	if (pv_cmdk_lb_rdwr(dkp->dk_dip,
1127 	    TG_READ, ap, blk, NBPSCTR, NULL) != 0)
1128 		goto err;
1129 
1130 	altused = ap->alts_ent_used;	/* number of BB entries */
1131 	altbase = ap->alts_ent_base;	/* blk offset from begin slice */
1132 	altlast = ap->alts_ent_end;	/* blk offset to last block */
1133 
1134 	if ((altused == 0) || (altbase < 1) ||
1135 	    (altbase > altlast) || (altlast >= nblocks))
1136 		goto err;
1137 
1138 	/* we found bad block mappins */
1139 	kmem_free(ap, NBPSCTR);
1140 	return (B_TRUE);
1141 
1142 err:
1143 	kmem_free(ap, NBPSCTR);
1144 	return (B_FALSE);
1145 }
1146 
1147 /*
1148  * Autoconfiguration Routines
1149  */
1150 static int
1151 pv_cmdkattach(dev_info_t *dip, ddi_attach_cmd_t cmd)
1152 {
1153 	int			instance = ddi_get_instance(dip);
1154 	dev_info_t		*xdf_dip = NULL;
1155 	struct pv_cmdk		*dkp;
1156 	cmlb_geom_t		pgeom;
1157 	char			*path;
1158 	int			i;
1159 
1160 	if (cmd != DDI_ATTACH)
1161 		return (DDI_FAILURE);
1162 
1163 	/*
1164 	 * This cmdk device layers on top of an xdf device.  So the first
1165 	 * thing we need to do is determine which xdf device instance this
1166 	 * cmdk instance should be layered on top of.
1167 	 */
1168 	path = kmem_alloc(MAXPATHLEN, KM_SLEEP);
1169 	(void) ddi_pathname(dip, path);
1170 	for (i = 0; pv_cmdk_h2p[i].h2p_hvm_path != NULL; i++) {
1171 		if (strcmp(pv_cmdk_h2p[i].h2p_hvm_path, path) == 0)
1172 			break;
1173 	}
1174 	kmem_free(path, MAXPATHLEN);
1175 
1176 	if (pv_cmdk_h2p[i].h2p_hvm_path == NULL) {
1177 		/*
1178 		 * UhOh.  We don't know what xdf instance this cmdk device
1179 		 * should be mapped to.
1180 		 */
1181 		return (DDI_FAILURE);
1182 	}
1183 
1184 	/* Check if this device exists */
1185 	xdf_dip = xdf_hvm_hold(pv_cmdk_h2p[i].h2p_pv_path);
1186 	if (xdf_dip == NULL)
1187 		return (DDI_FAILURE);
1188 
1189 	/* allocate and initialize our state structure */
1190 	(void) ddi_soft_state_zalloc(pv_cmdk_state, instance);
1191 	dkp = ddi_get_soft_state(pv_cmdk_state, instance);
1192 	mutex_init(&dkp->dk_mutex, NULL, MUTEX_DRIVER, NULL);
1193 	dkp->dk_dip = dip;
1194 	dkp->dk_xdf_dip = xdf_dip;
1195 	dkp->dk_xdf_dev = makedevice(ddi_driver_major(xdf_dip),
1196 	    XDF_MINOR(ddi_get_instance(xdf_dip), 0));
1197 
1198 	ASSERT((dkp->dk_xdf_dev & XDF_PMASK) == 0);
1199 
1200 	/*
1201 	 * GROSS HACK ALERT!  GROSS HACK ALERT!
1202 	 *
1203 	 * Before we can initialize the cmlb layer, we have to tell the
1204 	 * underlying xdf device what it's physical geometry should be.
1205 	 * See the block comments at the top of this file for more info.
1206 	 */
1207 	if ((pv_cmdk_getpgeom(dip, &pgeom) != 0) ||
1208 	    (xdf_hvm_setpgeom(dkp->dk_xdf_dip, &pgeom) != 0)) {
1209 		ddi_release_devi(dkp->dk_xdf_dip);
1210 		mutex_destroy(&dkp->dk_mutex);
1211 		ddi_soft_state_free(pv_cmdk_state, instance);
1212 		return (DDI_FAILURE);
1213 	}
1214 
1215 	/* create kstat for iostat(1M) */
1216 	if (xdf_kstat_create(dkp->dk_xdf_dip, "cmdk", instance) != 0) {
1217 		ddi_release_devi(dkp->dk_xdf_dip);
1218 		mutex_destroy(&dkp->dk_mutex);
1219 		ddi_soft_state_free(pv_cmdk_state, instance);
1220 		return (DDI_FAILURE);
1221 	}
1222 
1223 	/*
1224 	 * Force the xdf front end driver to connect to the backend.  From
1225 	 * the solaris device tree perspective, the xdf driver devinfo node
1226 	 * is already in the ATTACHED state.  (Otherwise xdf_hvm_hold()
1227 	 * would not have returned a dip.)  But this doesn't mean that the
1228 	 * xdf device has actually established a connection to it's back
1229 	 * end driver.  For us to be able to access the xdf device it needs
1230 	 * to be connected.  There are two ways to force the xdf driver to
1231 	 * connect to the backend device.
1232 	 */
1233 	if (xdf_hvm_connect(dkp->dk_xdf_dip) != 0) {
1234 		cmn_err(CE_WARN,
1235 		    "pv driver failed to connect: %s",
1236 		    pv_cmdk_h2p[i].h2p_pv_path);
1237 		xdf_kstat_delete(dkp->dk_xdf_dip);
1238 		ddi_release_devi(dkp->dk_xdf_dip);
1239 		mutex_destroy(&dkp->dk_mutex);
1240 		ddi_soft_state_free(pv_cmdk_state, instance);
1241 		return (DDI_FAILURE);
1242 	}
1243 
1244 	/*
1245 	 * Initalize cmlb.  Note that for partition information cmlb
1246 	 * will access the underly xdf disk device directly via
1247 	 * pv_cmdk_lb_rdwr() and pv_cmdk_lb_getinfo().  There are no
1248 	 * layered driver handles associated with this access because
1249 	 * it is a direct disk access that doesn't go through
1250 	 * any of the device nodes exported by the xdf device (since
1251 	 * all exported device nodes only reflect the portion of
1252 	 * the device visible via the partition/slice that the node
1253 	 * is associated with.)  So while not observable via the LDI,
1254 	 * this direct disk access is ok since we're actually holding
1255 	 * the target device.
1256 	 */
1257 	cmlb_alloc_handle((cmlb_handle_t *)&dkp->dk_cmlbhandle);
1258 	if (cmlb_attach(dkp->dk_dip, &pv_cmdk_lb_ops,
1259 	    DTYPE_DIRECT,		/* device_type */
1260 	    0,				/* not removable */
1261 	    0,				/* not hot pluggable */
1262 	    DDI_NT_BLOCK,
1263 	    CMLB_CREATE_ALTSLICE_VTOC_16_DTYPE_DIRECT,	/* mimic cmdk */
1264 	    dkp->dk_cmlbhandle, 0) != 0) {
1265 		cmlb_free_handle(&dkp->dk_cmlbhandle);
1266 		xdf_kstat_delete(dkp->dk_xdf_dip);
1267 		ddi_release_devi(dkp->dk_xdf_dip);
1268 		mutex_destroy(&dkp->dk_mutex);
1269 		ddi_soft_state_free(pv_cmdk_state, instance);
1270 		return (DDI_FAILURE);
1271 	}
1272 
1273 	if (pv_cmdk_bb_check(dkp)) {
1274 		cmn_err(CE_WARN,
1275 		    "pv cmdk disks with bad blocks are unsupported: %s",
1276 		    pv_cmdk_h2p[i].h2p_hvm_path);
1277 
1278 		cmlb_detach(dkp->dk_cmlbhandle, 0);
1279 		cmlb_free_handle(&dkp->dk_cmlbhandle);
1280 		xdf_kstat_delete(dkp->dk_xdf_dip);
1281 		ddi_release_devi(dkp->dk_xdf_dip);
1282 		mutex_destroy(&dkp->dk_mutex);
1283 		ddi_soft_state_free(pv_cmdk_state, instance);
1284 		return (DDI_FAILURE);
1285 	}
1286 
1287 	/* setup devid string */
1288 	pv_cmdk_devid_setup(dkp);
1289 
1290 	/* Calling validate will create minor nodes according to disk label */
1291 	(void) cmlb_validate(dkp->dk_cmlbhandle, 0, 0);
1292 
1293 	/*
1294 	 * Add a zero-length attribute to tell the world we support
1295 	 * kernel ioctls (for layered drivers).
1296 	 */
1297 	(void) ddi_prop_create(DDI_DEV_T_NONE, dip, DDI_PROP_CANSLEEP,
1298 	    DDI_KERNEL_IOCTL, NULL, 0);
1299 
1300 	/* Have the system report any newly created device nodes */
1301 	ddi_report_dev(dip);
1302 
1303 	return (DDI_SUCCESS);
1304 }
1305 
1306 static int
1307 pv_cmdkdetach(dev_info_t *dip, ddi_detach_cmd_t cmd)
1308 {
1309 	int		instance = ddi_get_instance(dip);
1310 	struct pv_cmdk	*dkp = ddi_get_soft_state(pv_cmdk_state, instance);
1311 
1312 	if (cmd != DDI_DETACH)
1313 		return (DDI_FAILURE);
1314 
1315 	ASSERT(MUTEX_NOT_HELD(&dkp->dk_mutex));
1316 
1317 	ddi_devid_unregister(dip);
1318 	if (dkp->dk_devid)
1319 		ddi_devid_free(dkp->dk_devid);
1320 	cmlb_detach(dkp->dk_cmlbhandle, 0);
1321 	cmlb_free_handle(&dkp->dk_cmlbhandle);
1322 	mutex_destroy(&dkp->dk_mutex);
1323 	xdf_kstat_delete(dkp->dk_xdf_dip);
1324 	ddi_release_devi(dkp->dk_xdf_dip);
1325 	ddi_soft_state_free(pv_cmdk_state, instance);
1326 	ddi_prop_remove_all(dip);
1327 
1328 	return (DDI_SUCCESS);
1329 }
1330 
1331 /*ARGSUSED*/
1332 static int
1333 pv_cmdk_getinfo(dev_info_t *dip, ddi_info_cmd_t infocmd, void *arg,
1334     void **result)
1335 {
1336 	dev_t		dev = (dev_t)arg;
1337 	int		instance = XDF_DEV2UNIT(dev);
1338 	struct pv_cmdk	*dkp = ddi_get_soft_state(pv_cmdk_state, instance);
1339 
1340 	switch (infocmd) {
1341 		case DDI_INFO_DEVT2DEVINFO:
1342 			if (dkp == NULL)
1343 				return (DDI_FAILURE);
1344 			*result = (void *)dkp->dk_dip;
1345 			break;
1346 		case DDI_INFO_DEVT2INSTANCE:
1347 			*result = (void *)(intptr_t)instance;
1348 			break;
1349 		default:
1350 			return (DDI_FAILURE);
1351 	}
1352 	return (DDI_SUCCESS);
1353 }
1354 
1355 static int
1356 pv_cmdk_prop_op(dev_t dev, dev_info_t *dip, ddi_prop_op_t prop_op,
1357     int flags, char *name, caddr_t valuep, int *lengthp)
1358 {
1359 	int		instance = ddi_get_instance(dip);
1360 	struct pv_cmdk	*dkp = ddi_get_soft_state(pv_cmdk_state, instance);
1361 	dev_info_t	*xdf_dip;
1362 	dev_t		xdf_devt;
1363 	int		err;
1364 
1365 	/*
1366 	 * Sanity check that if a dev_t or dip were specified that they
1367 	 * correspond to this device driver.  On debug kernels we'll
1368 	 * panic and on non-debug kernels we'll return failure.
1369 	 */
1370 	ASSERT(ddi_driver_major(dip) == pv_cmdk_major);
1371 	ASSERT((dev == DDI_DEV_T_ANY) || (getmajor(dev) == pv_cmdk_major));
1372 	if ((ddi_driver_major(dip) != pv_cmdk_major) ||
1373 	    ((dev != DDI_DEV_T_ANY) && (getmajor(dev) != pv_cmdk_major)))
1374 		return (DDI_PROP_NOT_FOUND);
1375 
1376 	/*
1377 	 * This property lookup might be associated with a device node
1378 	 * that is not yet attached, if so pass it onto ddi_prop_op().
1379 	 */
1380 	if (dkp == NULL)
1381 		return (ddi_prop_op(dev, dip, prop_op, flags,
1382 		    name, valuep, lengthp));
1383 
1384 	/*
1385 	 * Make sure we only lookup static properties.
1386 	 *
1387 	 * If there are static properties of the underlying xdf driver
1388 	 * that we want to mirror, then we'll have to explicity look them
1389 	 * up and define them during attach.  There are a few reasons
1390 	 * for this.  Most importantly, most static properties are typed
1391 	 * and all dynamic properties are untyped, ie, for dynamic
1392 	 * properties the caller must know the type of the property and
1393 	 * how to interpret the value of the property.  the prop_op drivedr
1394 	 * entry point is only designed for returning dynamic/untyped
1395 	 * properties, so if we were to attempt to lookup and pass back
1396 	 * static properties of the underlying device here then we would
1397 	 * be losing the type information for those properties.  Another
1398 	 * reason we don't want to pass on static property requests is that
1399 	 * static properties are enumerable in the device tree, where as
1400 	 * dynamic ones are not.
1401 	 */
1402 	flags |= DDI_PROP_DYNAMIC;
1403 
1404 	/*
1405 	 * We can't use the ldi here to access the underlying device because
1406 	 * the ldi actually opens the device, and that open might fail if the
1407 	 * device has already been opened with the FEXCL flag.  If we used
1408 	 * the ldi here, it would also be possible for some other caller
1409 	 * to try open the device with the FEXCL flag and get a failure
1410 	 * back because we have it open to do a property query.
1411 	 *
1412 	 * Instad we'll grab a hold on the target dip and query the
1413 	 * property directly.
1414 	 */
1415 	mutex_enter(&dkp->dk_mutex);
1416 
1417 	if ((xdf_dip = dkp->dk_xdf_dip) == NULL) {
1418 		mutex_exit(&dkp->dk_mutex);
1419 		return (DDI_PROP_NOT_FOUND);
1420 	}
1421 	e_ddi_hold_devi(xdf_dip);
1422 
1423 	/* figure out the dev_t we're going to pass on down */
1424 	if (dev == DDI_DEV_T_ANY) {
1425 		xdf_devt = DDI_DEV_T_ANY;
1426 	} else {
1427 		xdf_devt = dkp->dk_xdf_dev | XDF_DEV2PART(dev);
1428 	}
1429 
1430 	mutex_exit(&dkp->dk_mutex);
1431 
1432 	/*
1433 	 * Cdev_prop_op() is not a public interface, and normally the caller
1434 	 * is required to make sure that the target driver actually implements
1435 	 * this interface before trying to invoke it.  In this case we know
1436 	 * that we're always accessing the xdf driver and it does have this
1437 	 * interface defined, so we can skip the check.
1438 	 */
1439 	err = cdev_prop_op(xdf_devt, xdf_dip,
1440 	    prop_op, flags, name, valuep, lengthp);
1441 	ddi_release_devi(xdf_dip);
1442 	return (err);
1443 }
1444 
1445 /*
1446  * Device driver ops vector
1447  */
1448 static struct cb_ops pv_cmdk_cb_ops = {
1449 	pv_cmdkopen,		/* open */
1450 	pv_cmdkclose,		/* close */
1451 	pv_cmdkstrategy,	/* strategy */
1452 	nodev,			/* print */
1453 	pv_cmdkdump,		/* dump */
1454 	pv_cmdkread,		/* read */
1455 	pv_cmdkwrite,		/* write */
1456 	pv_cmdkioctl,		/* ioctl */
1457 	nodev,			/* devmap */
1458 	nodev,			/* mmap */
1459 	nodev,			/* segmap */
1460 	nochpoll,		/* poll */
1461 	pv_cmdk_prop_op,	/* cb_prop_op */
1462 	0,			/* streamtab  */
1463 	D_64BIT | D_MP | D_NEW,	/* Driver comaptibility flag */
1464 	CB_REV,			/* cb_rev */
1465 	pv_cmdkaread,		/* async read */
1466 	pv_cmdkawrite		/* async write */
1467 };
1468 
1469 struct dev_ops pv_cmdk_ops = {
1470 	DEVO_REV,		/* devo_rev, */
1471 	0,			/* refcnt  */
1472 	pv_cmdk_getinfo,	/* info */
1473 	nulldev,		/* identify */
1474 	nulldev,		/* probe */
1475 	pv_cmdkattach,		/* attach */
1476 	pv_cmdkdetach,		/* detach */
1477 	nodev,			/* reset */
1478 	&pv_cmdk_cb_ops,	/* driver operations */
1479 	(struct bus_ops *)0	/* bus operations */
1480 };
1481 
1482 /*
1483  * Module linkage information for the kernel.
1484  */
1485 static struct modldrv modldrv = {
1486 	&mod_driverops,		/* Type of module. This one is a driver */
1487 	"PV Common Direct Access Disk",
1488 	&pv_cmdk_ops,		/* driver ops		*/
1489 };
1490 
1491 static struct modlinkage modlinkage = {
1492 	MODREV_1, (void *)&modldrv, NULL
1493 };
1494 
1495 int
1496 _init(void)
1497 {
1498 	int rval;
1499 
1500 	if ((pv_cmdk_major = ddi_name_to_major("cmdk")) == (major_t)-1)
1501 		return (EINVAL);
1502 
1503 	/*
1504 	 * In general ide usually supports 4 disk devices, this same
1505 	 * limitation also applies to software emulating ide devices.
1506 	 * so by default we pre-allocate 4 cmdk soft state structures.
1507 	 */
1508 	if ((rval = ddi_soft_state_init(&pv_cmdk_state,
1509 	    sizeof (struct pv_cmdk), PV_CMDK_NODES)) != 0)
1510 		return (rval);
1511 
1512 	/*
1513 	 * Currently we only support qemu as the backing hardware emulator
1514 	 * for cmdk devices.
1515 	 */
1516 	pv_cmdk_h2p = pv_cmdk_h2p_xen_qemu;
1517 
1518 	/* Install our module */
1519 	if ((rval = mod_install(&modlinkage)) != 0) {
1520 		ddi_soft_state_fini(&pv_cmdk_state);
1521 		return (rval);
1522 	}
1523 
1524 	return (0);
1525 }
1526 
1527 int
1528 _info(struct modinfo *modinfop)
1529 {
1530 	return (mod_info(&modlinkage, modinfop));
1531 }
1532 
1533 int
1534 _fini(void)
1535 {
1536 	int	rval;
1537 	if ((rval = mod_remove(&modlinkage)) != 0)
1538 		return (rval);
1539 	ddi_soft_state_fini(&pv_cmdk_state);
1540 	return (0);
1541 }
1542