xref: /illumos-gate/usr/src/uts/i86pc/i86hvm/io/xpv/xpv_support.c (revision ca9327a6de44d69ddab3668cc1e143ce781387a3)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 #pragma ident	"%Z%%M%	%I%	%E% SMI"
27 
28 #include <sys/modctl.h>
29 #include <sys/types.h>
30 #include <sys/archsystm.h>
31 #include <sys/machsystm.h>
32 #include <sys/sunndi.h>
33 #include <sys/sunddi.h>
34 #include <sys/ddi_subrdefs.h>
35 #include <sys/xpv_support.h>
36 #include <sys/xen_errno.h>
37 #include <sys/hypervisor.h>
38 #include <sys/gnttab.h>
39 #include <sys/xenbus_comms.h>
40 #include <sys/xenbus_impl.h>
41 #include <xen/sys/xendev.h>
42 #include <sys/sysmacros.h>
43 #include <sys/x86_archext.h>
44 #include <sys/mman.h>
45 #include <sys/stat.h>
46 #include <sys/conf.h>
47 #include <sys/devops.h>
48 #include <sys/pc_mmu.h>
49 #include <sys/cmn_err.h>
50 #include <sys/cpr.h>
51 #include <sys/ddi.h>
52 #include <vm/seg_kmem.h>
53 #include <vm/as.h>
54 #include <vm/hat_pte.h>
55 #include <vm/hat_i86.h>
56 
/* Minor number of the single /dev/xpv node. */
#define	XPV_MINOR 0
/* Scratch buffer size for ddi_deviname() during suspend/resume. */
#define	XPV_BUFSIZE 128

/*
 * This structure is ordinarily constructed by Xen. In the HVM world, we
 * manually fill in the few fields the PV drivers need.
 */
start_info_t *xen_info = NULL;

/* Xen version number. */
int xen_major, xen_minor;

/* Metadata page shared between domain and Xen */
shared_info_t *HYPERVISOR_shared_info = NULL;

/* Page containing code to issue hypercalls.  */
extern caddr_t hypercall_page;

/* Is the hypervisor 64-bit?  -1 means "not yet determined". */
int xen_is_64bit = -1;

/* virtual addr for the store_mfn page */
caddr_t xb_addr;

/* Our own dev_info, set at attach; also used by xen_alloc_pages(). */
dev_info_t *xpv_dip;
/* The xpvd nexus node, located lazily by check_xpvd(). */
static dev_info_t *xpvd_dip;

/* saved pfn of the shared info page */
static pfn_t shared_info_frame;

#ifdef DEBUG
/* Set non-zero (e.g. via mdb) to trace the suspend/resume path. */
int xen_suspend_debug;

#define	SUSPEND_DEBUG if (xen_suspend_debug) xen_printf
#else
#define	SUSPEND_DEBUG(...)
#endif
94 
/*
 * Forward declarations
 */
static int xpv_getinfo(dev_info_t *, ddi_info_cmd_t, void *, void **);
static int xpv_attach(dev_info_t *, ddi_attach_cmd_t);
static int xpv_detach(dev_info_t *, ddi_detach_cmd_t);
static int xpv_open(dev_t *, int, int, cred_t *);
static int xpv_ioctl(dev_t, int, intptr_t, int, cred_t *, int *);

/*
 * Character device entry points.  Only open and ioctl are provided;
 * the device exists mainly so the framework can be attached.
 */
static struct cb_ops xpv_cb_ops = {
	xpv_open,
	nulldev,	/* close */
	nodev,		/* strategy */
	nodev,		/* print */
	nodev,		/* dump */
	nodev,		/* read */
	nodev,		/* write */
	xpv_ioctl,	/* ioctl */
	nodev,		/* devmap */
	nodev,		/* mmap */
	nodev,		/* segmap */
	nochpoll,	/* poll */
	ddi_prop_op,
	NULL,
	D_MP,
	CB_REV,
	NULL,
	NULL
};
124 
/* Device operations for the xpv pseudo-driver. */
static struct dev_ops xpv_dv_ops = {
	DEVO_REV,
	0,
	xpv_getinfo,
	nulldev,	/* identify */
	nulldev,	/* probe */
	xpv_attach,
	xpv_detach,
	nodev,		/* reset */
	&xpv_cb_ops,
	NULL,		/* struct bus_ops */
	NULL		/* power */
};

/* Module linkage: a single driver module. */
static struct modldrv modldrv = {
	&mod_driverops,
	"xpv driver %I%",
	&xpv_dv_ops
};

static struct modlinkage modl = {
	MODREV_1,
	{
		(void *)&modldrv,
		NULL		/* null termination */
	}
};
152 
/*
 * DMA attributes for pages shared with the hypervisor: any physical
 * address, page-aligned, and physically contiguous (single segment).
 */
static ddi_dma_attr_t xpv_dma_attr = {
	DMA_ATTR_V0,		/* version of this structure */
	0,			/* lowest usable address */
	0xffffffffffffffffULL,	/* highest usable address */
	0x7fffffff,		/* maximum DMAable byte count */
	MMU_PAGESIZE,		/* alignment in bytes */
	0x7ff,			/* bitmap of burst sizes */
	1,			/* minimum transfer */
	0xffffffffU,		/* maximum transfer */
	0x7fffffffULL,		/* maximum segment length */
	1,			/* maximum number of segments */
	1,			/* granularity */
	0,			/* flags (reserved) */
};

/* Strictly ordered, never byte-swapped access for the shared pages. */
static ddi_device_acc_attr_t xpv_accattr = {
	DDI_DEVICE_ATTR_V0,
	DDI_NEVERSWAP_ACC,
	DDI_STRICTORDER_ACC
};

/*
 * Bookkeeping for pages handed out by xen_alloc_pages().  Allocations
 * are never released, so a small fixed-size table suffices.
 */
#define	MAX_ALLOCATIONS 10
static ddi_dma_handle_t xpv_dma_handle[MAX_ALLOCATIONS];
static ddi_acc_handle_t xpv_dma_acchandle[MAX_ALLOCATIONS];
static int xen_alloc_cnt = 0;
178 
179 void *
180 xen_alloc_pages(pgcnt_t cnt)
181 {
182 	size_t len;
183 	int a = xen_alloc_cnt++;
184 	caddr_t addr;
185 
186 	ASSERT(xen_alloc_cnt < MAX_ALLOCATIONS);
187 	if (ddi_dma_alloc_handle(xpv_dip, &xpv_dma_attr, DDI_DMA_SLEEP, 0,
188 	    &xpv_dma_handle[a]) != DDI_SUCCESS)
189 		return (NULL);
190 
191 	if (ddi_dma_mem_alloc(xpv_dma_handle[a], MMU_PAGESIZE * cnt,
192 	    &xpv_accattr, DDI_DMA_CONSISTENT, DDI_DMA_SLEEP, 0,
193 	    &addr, &len, &xpv_dma_acchandle[a]) != DDI_SUCCESS) {
194 		ddi_dma_free_handle(&xpv_dma_handle[a]);
195 		cmn_err(CE_WARN, "Couldn't allocate memory for xpv devices");
196 		return (NULL);
197 	}
198 	return (addr);
199 }
200 
201 /*
202  * This function is invoked twice, first time with reprogram=0 to set up
203  * the xpvd portion of the device tree. The second time it is ignored.
204  */
205 static void
206 xpv_enumerate(int reprogram)
207 {
208 	dev_info_t *dip;
209 
210 	if (reprogram != 0)
211 		return;
212 
213 	ndi_devi_alloc_sleep(ddi_root_node(), "xpvd",
214 	    (pnode_t)DEVI_SID_NODEID, &dip);
215 
216 	(void) ndi_devi_bind_driver(dip, 0);
217 
218 	/*
219 	 * Too early to enumerate split device drivers in domU
220 	 * since we need to create taskq thread during enumeration.
221 	 * So, we only enumerate softdevs and console here.
222 	 */
223 	xendev_enum_all(dip, B_TRUE);
224 }
225 
/*
 * Translate a hypervisor errcode to a Solaris error code.
 * Hypervisor calls return negated X_<err> values; this maps each known
 * one to the corresponding native errno and panics on anything else.
 */
int
xen_xlate_errcode(int error)
{
/* Expand to one case per error: map X_<err> back to native <err>. */
#define	CASE(num)	case X_##num: error = num; break

	switch (-error) {
		CASE(EPERM);    CASE(ENOENT);   CASE(ESRCH);
		CASE(EINTR);	CASE(EIO);	CASE(ENXIO);
		CASE(E2BIG);    CASE(ENOMEM);   CASE(EACCES);
		CASE(EFAULT);   CASE(EBUSY);    CASE(EEXIST);
		CASE(ENODEV);   CASE(EISDIR);   CASE(EINVAL);
		CASE(ENOSPC);   CASE(ESPIPE);   CASE(EROFS);
		CASE(ENOSYS);   CASE(ENOTEMPTY); CASE(EISCONN);
		CASE(ENODATA);
		default:
		panic("xen_xlate_errcode: unknown error %d", error);
	}
	return (error);
#undef CASE
}
249 
/*
 * printf()-style debug output helper (used via SUSPEND_DEBUG).
 */
/*PRINTFLIKE1*/
void
xen_printf(const char *fmt, ...)
{
	va_list adx;

	va_start(adx, fmt);
	/*
	 * A va_list must be consumed with the v-variant; passing it to
	 * printf() as an ordinary argument prints garbage rather than
	 * the formatted arguments.
	 */
	vprintf(fmt, adx);
	va_end(adx);
}
260 
/*
 * Stub functions to get the FE drivers to build, and to catch drivers that
 * misbehave in HVM domains.
 */
/*ARGSUSED*/
void
xen_release_pfn(pfn_t pfn, caddr_t va)
{
	/* Only meaningful in a true PV domain; any call here is a bug. */
	panic("xen_release_pfn() is not supported in HVM domains");
}
271 
/*ARGSUSED*/
void
reassign_pfn(pfn_t pfn, mfn_t mfn)
{
	/* Only meaningful in a true PV domain; any call here is a bug. */
	panic("reassign_pfn() is not supported in HVM domains");
}
278 
/*ARGSUSED*/
long
balloon_free_pages(uint_t page_cnt, mfn_t *mfns, caddr_t kva, pfn_t *pfns)
{
	/* Ballooning is not available to HVM guests. */
	panic("balloon_free_pages() is not supported in HVM domains");
	return (0);	/* not reached; satisfies the return type */
}
286 
/*ARGSUSED*/
void
balloon_drv_added(int64_t delta)
{
	/* Ballooning is not available to HVM guests. */
	panic("balloon_drv_added() is not supported in HVM domains");
}
293 
/*
 * Add a mapping for the machine page at the given virtual address.
 * Only 4K (level 0) mappings are supported.
 */
void
kbm_map_ma(maddr_t ma, uintptr_t va, uint_t level)
{
	ASSERT(level == 0);

	/*
	 * In an HVM guest machine addresses are handled by the
	 * hypervisor's physical mapping, so a normal hat_devload()
	 * of the page frame suffices.
	 */
	hat_devload(kas.a_hat, (caddr_t)va, MMU_PAGESIZE,
	    mmu_btop(ma), PROT_READ | PROT_WRITE, HAT_LOAD);
}
305 
306 static uint64_t
307 hvm_get_param(int param_id)
308 {
309 	struct xen_hvm_param xhp;
310 
311 	xhp.domid = DOMID_SELF;
312 	xhp.index = param_id;
313 	if ((HYPERVISOR_hvm_op(HVMOP_get_param, &xhp) < 0))
314 		return (-1);
315 	return (xhp.value);
316 }
317 
/* Watch on the "control/shutdown" xenstore node; see xen_shutdown_handler(). */
static struct xenbus_watch shutdown_watch;
/* Taskq on which externally requested shutdowns are processed. */
taskq_t *xen_shutdown_tq;

/* Shutdown request codes; order must match cmd_strings[] below. */
#define	SHUTDOWN_INVALID	-1
#define	SHUTDOWN_POWEROFF	0
#define	SHUTDOWN_REBOOT		1
#define	SHUTDOWN_SUSPEND	2
#define	SHUTDOWN_HALT		3
#define	SHUTDOWN_MAX		4

/* How long to give userland before forcing a dirty shutdown. */
#define	SHUTDOWN_TIMEOUT_SECS (60 * 5)

/* Sysevent attribute strings; indexed by the SHUTDOWN_* codes above. */
static const char *cmd_strings[SHUTDOWN_MAX] = {
	"poweroff",
	"reboot",
	"suspend",
	"halt"
};
336 
337 int
338 xen_suspend_devices(dev_info_t *dip)
339 {
340 	int error;
341 	char buf[XPV_BUFSIZE];
342 
343 	SUSPEND_DEBUG("xen_suspend_devices\n");
344 
345 	for (; dip != NULL; dip = ddi_get_next_sibling(dip)) {
346 		if (xen_suspend_devices(ddi_get_child(dip)))
347 			return (ENXIO);
348 		if (ddi_get_driver(dip) == NULL)
349 			continue;
350 		SUSPEND_DEBUG("Suspending device %s\n", ddi_deviname(dip, buf));
351 		ASSERT((DEVI(dip)->devi_cpr_flags & DCF_CPR_SUSPENDED) == 0);
352 
353 
354 		if (!i_ddi_devi_attached(dip)) {
355 			error = DDI_FAILURE;
356 		} else {
357 			error = devi_detach(dip, DDI_SUSPEND);
358 		}
359 
360 		if (error == DDI_SUCCESS) {
361 			DEVI(dip)->devi_cpr_flags |= DCF_CPR_SUSPENDED;
362 		} else {
363 			SUSPEND_DEBUG("WARNING: Unable to suspend device %s\n",
364 			    ddi_deviname(dip, buf));
365 			cmn_err(CE_WARN, "Unable to suspend device %s.",
366 			    ddi_deviname(dip, buf));
367 			cmn_err(CE_WARN, "Device is busy or does not "
368 			    "support suspend/resume.");
369 				return (ENXIO);
370 		}
371 	}
372 	return (0);
373 }
374 
/*
 * Resume (attach with DDI_RESUME) all devices suspended by
 * xen_suspend_devices(), processing each sibling list in the reverse
 * of the order in which it was suspended.  resume_failed carries any
 * error from the caller's level; the first failure stops further
 * resumes but the walk still continues to clear the suspend flags.
 * Returns 0 or ENXIO.
 */
int
xen_resume_devices(dev_info_t *start, int resume_failed)
{
	dev_info_t *dip, *next, *last = NULL;
	int did_suspend;
	int error = resume_failed;
	char buf[XPV_BUFSIZE];

	SUSPEND_DEBUG("xen_resume_devices\n");

	/*
	 * Each pass of the outer loop re-walks the sibling list to find
	 * the node just before 'last' (the previously handled sibling),
	 * so siblings are processed back to front.
	 */
	while (last != start) {
		dip = start;
		next = ddi_get_next_sibling(dip);
		while (next != last) {
			dip = next;
			next = ddi_get_next_sibling(dip);
		}

		/*
		 * cpr is the only one that uses this field and the device
		 * itself hasn't resumed yet, there is no need to use a
		 * lock, even though kernel threads are active by now.
		 */
		did_suspend = DEVI(dip)->devi_cpr_flags & DCF_CPR_SUSPENDED;
		if (did_suspend)
			DEVI(dip)->devi_cpr_flags &= ~DCF_CPR_SUSPENDED;

		/*
		 * There may be background attaches happening on devices
		 * that were not originally suspended by cpr, so resume
		 * only devices that were suspended by cpr. Also, stop
		 * resuming after the first resume failure, but traverse
		 * the entire tree to clear the suspend flag.
		 */
		if (did_suspend && !error) {
			SUSPEND_DEBUG("Resuming device %s\n",
			    ddi_deviname(dip, buf));
			/*
			 * If a device suspended by cpr gets detached during
			 * the resume process (for example, due to hotplugging)
			 * before cpr gets around to issuing it a DDI_RESUME,
			 * we'll have problems.
			 */
			if (!i_ddi_devi_attached(dip)) {
				cmn_err(CE_WARN, "Skipping %s, device "
				    "not ready for resume",
				    ddi_deviname(dip, buf));
			} else {
				if (devi_attach(dip, DDI_RESUME) !=
				    DDI_SUCCESS) {
					error = ENXIO;
				}
			}
		}

		if (error == ENXIO) {
			cmn_err(CE_WARN, "Unable to resume device %s",
			    ddi_deviname(dip, buf));
		}

		/* Recurse to resume this node's children (also in reverse). */
		error = xen_resume_devices(ddi_get_child(dip), error);
		last = dip;
	}

	return (error);
}
441 
442 /*ARGSUSED*/
443 static int
444 check_xpvd(dev_info_t *dip, void *arg)
445 {
446 	char *name;
447 
448 	name = ddi_node_name(dip);
449 	if (name == NULL || strcmp(name, "xpvd")) {
450 		return (DDI_WALK_CONTINUE);
451 	} else {
452 		xpvd_dip = dip;
453 		return (DDI_WALK_TERMINATE);
454 	}
455 }
456 
/*
 * Top level routine to direct suspend/resume of a domain.
 * Called (via xen_shutdown()) when dom0 asks us to suspend for
 * save/migrate.  Quiesces PV devices and xenbus, single-threads onto
 * vcpu 0 with interrupts off, issues the suspend hypercall, then
 * rebuilds hypervisor state and resumes everything in reverse order.
 */
void
xen_suspend_domain(void)
{
	extern void rtcsync(void);
	extern void ec_resume(void);
	extern kmutex_t ec_lock;
	struct xen_add_to_physmap xatp;
	ulong_t flags;
	int err;

	cmn_err(CE_NOTE, "Domain suspending for save/migrate");

	SUSPEND_DEBUG("xen_suspend_domain\n");

	/*
	 * We only want to suspend the PV devices, since the emulated devices
	 * are suspended by saving the emulated device state.  The PV devices
	 * are all children of the xpvd nexus device.  So we search the
	 * device tree for the xpvd node to use as the root of the tree to
	 * be suspended.
	 */
	if (xpvd_dip == NULL)
		ddi_walk_devs(ddi_root_node(), check_xpvd, NULL);

	/*
	 * suspend interrupts and devices
	 */
	if (xpvd_dip != NULL)
		(void) xen_suspend_devices(ddi_get_child(xpvd_dip));
	else
		cmn_err(CE_WARN, "No PV devices found to suspend");
	SUSPEND_DEBUG("xenbus_suspend\n");
	xenbus_suspend();

	mutex_enter(&cpu_lock);

	/*
	 * Suspend on vcpu 0
	 */
	thread_affinity_set(curthread, 0);
	kpreempt_disable();

	if (ncpus > 1)
		pause_cpus(NULL);
	/*
	 * We can grab the ec_lock as it's a spinlock with a high SPL. Hence
	 * any holder would have dropped it to get through pause_cpus().
	 */
	mutex_enter(&ec_lock);

	/*
	 * From here on in, we can't take locks.
	 */

	flags = intr_clear();

	SUSPEND_DEBUG("HYPERVISOR_suspend\n");
	/*
	 * At this point we suspend and sometime later resume.
	 * Note that this call may return with an indication that the
	 * suspend was cancelled; for now, no matter what the return
	 * value is, we do a full resume of all suspended drivers, etc.
	 */
	(void) HYPERVISOR_shutdown(SHUTDOWN_suspend);

	/*
	 * Point HYPERVISOR_shared_info to the proper place.
	 * The mapping does not survive suspend/resume, so re-register
	 * the shared_info pfn saved by xen_pv_init().
	 */
	xatp.domid = DOMID_SELF;
	xatp.idx = 0;
	xatp.space = XENMAPSPACE_shared_info;
	xatp.gpfn = shared_info_frame;
	if ((err = HYPERVISOR_memory_op(XENMEM_add_to_physmap, &xatp)) != 0)
		panic("Could not set shared_info page. error: %d", err);

	SUSPEND_DEBUG("gnttab_resume\n");
	gnttab_resume();

	SUSPEND_DEBUG("ec_resume\n");
	ec_resume();

	intr_restore(flags);

	if (ncpus > 1)
		start_cpus();

	mutex_exit(&ec_lock);
	mutex_exit(&cpu_lock);

	/*
	 * Now we can take locks again.
	 */

	/* The wall clock may have changed across the suspension. */
	rtcsync();

	SUSPEND_DEBUG("xenbus_resume\n");
	xenbus_resume();
	SUSPEND_DEBUG("xen_resume_devices\n");
	if (xpvd_dip != NULL)
		(void) xen_resume_devices(ddi_get_child(xpvd_dip), 0);

	thread_affinity_clear(curthread);
	kpreempt_enable();

	SUSPEND_DEBUG("finished xen_suspend_domain\n");

	cmn_err(CE_NOTE, "domain restore/migrate completed");
}
568 
569 static void
570 xen_dirty_shutdown(void *arg)
571 {
572 	int cmd = (uintptr_t)arg;
573 
574 	cmn_err(CE_WARN, "Externally requested shutdown failed or "
575 	    "timed out.\nShutting down.\n");
576 
577 	switch (cmd) {
578 	case SHUTDOWN_HALT:
579 	case SHUTDOWN_POWEROFF:
580 		(void) kadmin(A_SHUTDOWN, AD_POWEROFF, NULL, kcred);
581 		break;
582 	case SHUTDOWN_REBOOT:
583 		(void) kadmin(A_REBOOT, AD_BOOT, NULL, kcred);
584 		break;
585 	}
586 }
587 
/*
 * Taskq callback that carries out an externally requested shutdown.
 * Suspend requests are handled entirely in the kernel; for the other
 * commands a sysevent is posted so userland can perform an orderly
 * shutdown, with xen_dirty_shutdown() armed as a timeout fallback.
 */
static void
xen_shutdown(void *arg)
{
	nvlist_t *attr_list = NULL;
	sysevent_t *event = NULL;
	sysevent_id_t eid;
	int cmd = (uintptr_t)arg;
	int err;

	ASSERT(cmd > SHUTDOWN_INVALID && cmd < SHUTDOWN_MAX);

	if (cmd == SHUTDOWN_SUSPEND) {
		xen_suspend_domain();
		return;
	}

	err = nvlist_alloc(&attr_list, NV_UNIQUE_NAME, KM_SLEEP);
	if (err != DDI_SUCCESS)
		goto failure;

	err = nvlist_add_string(attr_list, "shutdown", cmd_strings[cmd]);
	if (err != DDI_SUCCESS)
		goto failure;

	if ((event = sysevent_alloc("EC_xpvsys", "control", "SUNW:kern:xpv",
	    SE_SLEEP)) == NULL)
		goto failure;
	(void) sysevent_attach_attributes(event,
	    (sysevent_attr_list_t *)attr_list);

	err = log_sysevent(event, SE_SLEEP, &eid);

	sysevent_detach_attributes(event);
	sysevent_free(event);

	if (err != 0)
		goto failure;

	/* Force the issue if userland does not shut us down in time. */
	(void) timeout(xen_dirty_shutdown, arg,
	    SHUTDOWN_TIMEOUT_SECS * drv_usectohz(MICROSEC));

	nvlist_free(attr_list);
	return;

failure:
	/* Couldn't notify userland; fall back to an immediate shutdown. */
	if (attr_list != NULL)
		nvlist_free(attr_list);
	xen_dirty_shutdown(arg);
}
637 
/*
 * Xenbus watch callback fired when dom0 writes "control/shutdown".
 * Reads the request, acknowledges it by clearing the node (which
 * re-fires this watch with an empty string), and dispatches the real
 * work to xen_shutdown() on the shutdown taskq.
 */
/*ARGSUSED*/
static void
xen_shutdown_handler(struct xenbus_watch *watch, const char **vec,
	unsigned int len)
{
	char *str;
	xenbus_transaction_t xbt;
	int err, shutdown_code = SHUTDOWN_INVALID;
	unsigned int slen;

again:
	err = xenbus_transaction_start(&xbt);
	if (err)
		return;
	if (xenbus_read(xbt, "control", "shutdown", (void *)&str, &slen)) {
		(void) xenbus_transaction_end(xbt, 1);
		return;
	}

	SUSPEND_DEBUG("%d: xen_shutdown_handler: \"%s\"\n", CPU->cpu_id, str);

	/*
	 * If this is a watch fired from our write below, check out early to
	 * avoid an infinite loop.
	 */
	if (strcmp(str, "") == 0) {
		(void) xenbus_transaction_end(xbt, 0);
		kmem_free(str, slen);
		return;
	} else if (strcmp(str, "poweroff") == 0) {
		shutdown_code = SHUTDOWN_POWEROFF;
	} else if (strcmp(str, "reboot") == 0) {
		shutdown_code = SHUTDOWN_REBOOT;
	} else if (strcmp(str, "suspend") == 0) {
		shutdown_code = SHUTDOWN_SUSPEND;
	} else if (strcmp(str, "halt") == 0) {
		shutdown_code = SHUTDOWN_HALT;
	} else {
		printf("Ignoring shutdown request: %s\n", str);
	}

	/* Clear the node to acknowledge receipt of the request. */
	(void) xenbus_write(xbt, "control", "shutdown", "");
	err = xenbus_transaction_end(xbt, 0);
	if (err == EAGAIN) {
		/* Transaction raced with another writer; retry from the top. */
		SUSPEND_DEBUG("%d: trying again\n", CPU->cpu_id);
		kmem_free(str, slen);
		goto again;
	}

	kmem_free(str, slen);
	if (shutdown_code != SHUTDOWN_INVALID) {
		(void) taskq_dispatch(xen_shutdown_tq, xen_shutdown,
		    (void *)(intptr_t)shutdown_code, 0);
	}
}
693 
/*
 * One-time PV framework initialization, called from xpv_attach().
 * Verifies that we are an HVM guest on a supported Xen, maps the
 * hypercall and shared_info pages, and brings up grant tables, event
 * channels, xenbus and the shutdown watch.  Returns 0 on success,
 * -1 on any failure.
 *
 * NOTE(review): the parameter shadows the global xpv_dip.  They refer
 * to the same dev_info_t because xpv_attach() sets the global before
 * calling here -- confirm before relying on either name.
 */
static int
xen_pv_init(dev_info_t *xpv_dip)
{
	struct cpuid_regs cp;
	uint32_t xen_signature[4];
	char *xen_str;
	struct xen_add_to_physmap xatp;
	xen_capabilities_info_t caps;
	pfn_t pfn;
	uint64_t msrval;
	int err;

	/*
	 * Xen's pseudo-cpuid function 0x40000000 returns a string
	 * representing the Xen signature in %ebx, %ecx, and %edx.
	 * %eax contains the maximum supported cpuid function.
	 */
	cp.cp_eax = 0x40000000;
	(void) __cpuid_insn(&cp);
	xen_signature[0] = cp.cp_ebx;
	xen_signature[1] = cp.cp_ecx;
	xen_signature[2] = cp.cp_edx;
	xen_signature[3] = 0;	/* NUL-terminate the 12-byte signature */
	xen_str = (char *)xen_signature;
	if (strcmp("XenVMMXenVMM", xen_str) != 0 ||
	    cp.cp_eax < 0x40000002) {
		cmn_err(CE_WARN,
		    "Attempting to load Xen drivers on non-Xen system");
		return (-1);
	}

	/*
	 * cpuid function 0x40000001 returns the Xen version in %eax.  The
	 * top 16 bits are the major version, the bottom 16 are the minor
	 * version.
	 */
	cp.cp_eax = 0x40000001;
	(void) __cpuid_insn(&cp);
	xen_major = cp.cp_eax >> 16;
	xen_minor = cp.cp_eax & 0xffff;

	/*
	 * The xpv driver is incompatible with xen versions older than 3.1. This
	 * is due to the changes in the vcpu_info and shared_info structs used
	 * to communicate with the hypervisor (the event channels in particular)
	 * that were introduced with 3.1.
	 */
	if (xen_major < 3 || (xen_major == 3 && xen_minor < 1)) {
		cmn_err(CE_WARN, "Xen version %d.%d is not supported",
		    xen_major, xen_minor);
		return (-1);
	}

	/*
	 * cpuid function 0x40000002 returns information about the
	 * hypercall page.  %eax nominally contains the number of pages
	 * with hypercall code, but according to the Xen guys, "I'll
	 * guarantee that remains one forever more, so you can just
	 * allocate a single page and get quite upset if you ever see CPUID
	 * return more than one page."  %ebx contains an MSR we use to ask
	 * Xen to remap each page at a specific pfn.
	 */
	cp.cp_eax = 0x40000002;
	(void) __cpuid_insn(&cp);

	/*
	 * Let Xen know where we want the hypercall page mapped.  We
	 * already have a page allocated in the .text section to simplify
	 * the wrapper code.
	 */
	pfn = hat_getpfnum(kas.a_hat, (caddr_t)&hypercall_page);
	msrval = mmu_ptob(pfn);
	wrmsr(cp.cp_ebx, msrval);

	/* Fill in the xen_info data */
	xen_info = kmem_zalloc(sizeof (start_info_t), KM_SLEEP);
	(void) sprintf(xen_info->magic, "xen-%d.%d", xen_major, xen_minor);
	xen_info->store_mfn = (mfn_t)hvm_get_param(HVM_PARAM_STORE_PFN);
	xen_info->store_evtchn = (int)hvm_get_param(HVM_PARAM_STORE_EVTCHN);

	/* Figure out whether the hypervisor is 32-bit or 64-bit.  */
	if ((HYPERVISOR_xen_version(XENVER_capabilities, &caps) == 0)) {
		/* Force NUL termination before searching the string. */
		((char *)(caps))[sizeof (caps) - 1] = '\0';
		if (strstr(caps, "x86_64") != NULL)
			xen_is_64bit = 1;
		else if (strstr(caps, "x86_32") != NULL)
			xen_is_64bit = 0;
	}
	if (xen_is_64bit < 0) {
		cmn_err(CE_WARN, "Couldn't get capability info from Xen.");
		return (-1);
	}
#ifdef __amd64
	ASSERT(xen_is_64bit == 1);
#endif

	/*
	 * Allocate space for the shared_info page and tell Xen where it
	 * is.
	 */
	HYPERVISOR_shared_info = xen_alloc_pages(1);
	shared_info_frame = hat_getpfnum(kas.a_hat,
	    (caddr_t)HYPERVISOR_shared_info);
	xatp.domid = DOMID_SELF;
	xatp.idx = 0;
	xatp.space = XENMAPSPACE_shared_info;
	xatp.gpfn = shared_info_frame;
	if ((err = HYPERVISOR_memory_op(XENMEM_add_to_physmap, &xatp)) != 0) {
		cmn_err(CE_WARN, "Could not get shared_info page from Xen."
		    "  error: %d", err);
		return (-1);
	}

	/* Set up the grant tables.  */
	gnttab_init();

	/* Set up event channel support */
	if (ec_init(xpv_dip) != 0)
		return (-1);

	/* Set up xenbus */
	xb_addr = vmem_alloc(heap_arena, MMU_PAGESIZE, VM_SLEEP);
	xs_early_init();
	xs_domu_init();

	/* Set up for suspend/resume/migrate */
	xen_shutdown_tq = taskq_create("shutdown_taskq", 1,
	    maxclsyspri - 1, 1, 1, TASKQ_PREPOPULATE);
	shutdown_watch.node = "control/shutdown";
	shutdown_watch.callback = xen_shutdown_handler;
	if (register_xenbus_watch(&shutdown_watch))
		cmn_err(CE_WARN, "Failed to set shutdown watcher");

	return (0);
}
829 
830 static void
831 xen_pv_fini()
832 {
833 	if (xen_info != NULL)
834 		kmem_free(xen_info, sizeof (start_info_t));
835 	ec_fini();
836 }
837 
838 /*ARGSUSED*/
839 static int
840 xpv_getinfo(dev_info_t *dip, ddi_info_cmd_t cmd, void *arg, void **result)
841 {
842 	if (getminor((dev_t)arg) != XPV_MINOR)
843 		return (DDI_FAILURE);
844 
845 	switch (cmd) {
846 	case DDI_INFO_DEVT2DEVINFO:
847 		*result = xpv_dip;
848 		break;
849 	case DDI_INFO_DEVT2INSTANCE:
850 		*result = 0;
851 		break;
852 	default:
853 		return (DDI_FAILURE);
854 	}
855 
856 	return (DDI_SUCCESS);
857 }
858 
859 static int
860 xpv_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
861 {
862 	if (cmd != DDI_ATTACH)
863 		return (DDI_FAILURE);
864 
865 	if (ddi_create_minor_node(dip, ddi_get_name(dip), S_IFCHR,
866 	    ddi_get_instance(dip), DDI_PSEUDO, 0) != DDI_SUCCESS)
867 		return (DDI_FAILURE);
868 
869 	xpv_dip = dip;
870 
871 	if (xen_pv_init(dip) != 0)
872 		return (DDI_FAILURE);
873 
874 	ddi_report_dev(dip);
875 
876 	/*
877 	 * If the memscrubber attempts to scrub the pages we hand to Xen,
878 	 * the domain will panic.
879 	 */
880 	memscrub_disable();
881 
882 	/*
883 	 * Report our version to dom0.
884 	 */
885 	if (xenbus_printf(XBT_NULL, "hvmpv/xpv", "version", "%d",
886 	    HVMPV_XPV_VERS))
887 		cmn_err(CE_WARN, "xpv: couldn't write version\n");
888 
889 	return (DDI_SUCCESS);
890 }
891 
/*
 * Attempts to reload the PV driver plumbing hang on Intel platforms, so
 * we don't want to unload the framework by accident.  Set this tunable
 * non-zero (e.g. via mdb) to allow xpv_detach() to proceed.
 */
int xpv_allow_detach = 0;
897 
898 static int
899 xpv_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
900 {
901 	if (cmd != DDI_DETACH || xpv_allow_detach == 0)
902 		return (DDI_FAILURE);
903 
904 	if (xpv_dip != NULL) {
905 		xen_pv_fini();
906 		ddi_remove_minor_node(dip, NULL);
907 		xpv_dip = NULL;
908 	}
909 
910 	return (DDI_SUCCESS);
911 }
912 
913 /*ARGSUSED1*/
914 static int
915 xpv_open(dev_t *dev, int flag, int otyp, cred_t *cr)
916 {
917 	return (getminor(*dev) == XPV_MINOR ? 0 : ENXIO);
918 }
919 
/*
 * ioctl(9E) entry point: no ioctls are currently supported.
 */
/*ARGSUSED*/
static int
xpv_ioctl(dev_t dev, int cmd, intptr_t arg, int mode, cred_t *cr,
    int *rval_p)
{
	return (EINVAL);
}
927 
928 int
929 _init(void)
930 {
931 	int err;
932 
933 	if ((err = mod_install(&modl)) != 0)
934 		return (err);
935 
936 	impl_bus_add_probe(xpv_enumerate);
937 	return (0);
938 }
939 
940 int
941 _fini(void)
942 {
943 	int err;
944 
945 	if ((err = mod_remove(&modl)) != 0)
946 		return (err);
947 
948 	impl_bus_delete_probe(xpv_enumerate);
949 	return (0);
950 }
951 
/*
 * Module information entry point.
 */
int
_info(struct modinfo *modinfop)
{
	return (mod_info(&modl, modinfop));
}
957