xref: /illumos-gate/usr/src/uts/i86xpv/os/xen_machdep.c (revision e5ba14ff435beeefdaa2e6649e175c74afe02c76)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 #pragma ident	"%Z%%M%	%I%	%E% SMI"
28 
29 /* derived from netbsd's xen_machdep.c 1.1.2.1 */
30 
31 /*
32  *
33  * Copyright (c) 2004 Christian Limpach.
34  * All rights reserved.
35  *
36  * Redistribution and use in source and binary forms, with or without
37  * modification, are permitted provided that the following conditions
38  * are met:
39  * 1. Redistributions of source code must retain the above copyright
40  *    notice, this list of conditions and the following disclaimer.
41  * 2. Redistributions in binary form must reproduce the above copyright
42  *    notice, this list of conditions and the following disclaimer in the
43  *    documentation and/or other materials provided with the distribution.
44  * 3. This section intentionally left blank.
45  * 4. The name of the author may not be used to endorse or promote products
46  *    derived from this software without specific prior written permission.
47  *
48  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
49  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
50  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
51  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
52  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
53  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
54  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
55  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
56  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
57  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
58  */
59 /*
60  * Section 3 of the above license was updated in response to bug 6379571.
61  */
62 
63 #include <sys/ctype.h>
64 #include <sys/types.h>
65 #include <sys/cmn_err.h>
66 #include <sys/trap.h>
67 #include <sys/segments.h>
68 #include <sys/hypervisor.h>
69 #include <sys/xen_mmu.h>
70 #include <sys/machsystm.h>
71 #include <sys/promif.h>
72 #include <sys/bootconf.h>
73 #include <sys/bootinfo.h>
74 #include <sys/cpr.h>
75 #include <sys/taskq.h>
76 #include <sys/uadmin.h>
77 #include <sys/evtchn_impl.h>
78 #include <sys/archsystm.h>
79 #include <xen/sys/xenbus_impl.h>
80 #include <sys/mach_mmu.h>
81 #include <vm/hat_i86.h>
82 #include <sys/gnttab.h>
83 #include <sys/reboot.h>
84 #include <sys/stack.h>
85 #include <sys/clock.h>
86 #include <sys/bitmap.h>
87 #include <sys/processor.h>
88 #include <sys/xen_errno.h>
89 #include <sys/xpv_panic.h>
90 #include <sys/smp_impldefs.h>
91 #include <sys/cpu.h>
92 #include <sys/balloon_impl.h>
93 #include <sys/ddi.h>
94 
/*
 * SUSPEND_DEBUG(fmt, ...) emits a trace message through xen_printf() on
 * DEBUG kernels when xen_suspend_debug is non-zero; on non-DEBUG kernels
 * it expands to nothing.
 *
 * The DEBUG form is wrapped in do { } while (0) so that it always behaves
 * as a single statement and can never capture an "else" that follows it.
 */
#ifdef DEBUG
#define	SUSPEND_DEBUG(...)					\
	do {							\
		if (xen_suspend_debug)				\
			xen_printf(__VA_ARGS__);		\
	} while (0)
#else
#define	SUSPEND_DEBUG(...)
#endif
100 
/*
 * Not referenced elsewhere in this part of the file; presumably a
 * checkpoint/resume debug knob consumed by other code -- TODO confirm.
 */
int cpr_debug;
/*
 * VCPUs found in the P_POWEROFF state by xen_suspend_domain().  VCPUs in
 * this set are not taken down by suspend_cpus() nor brought back up by
 * resume_cpus().
 */
cpuset_t cpu_suspend_lost_set;
/*
 * Non-zero enables SUSPEND_DEBUG output on DEBUG kernels; the value 1 also
 * makes xen_hypervisor_supports_solaris() accept any hypervisor version.
 */
static int xen_suspend_debug;
104 
105 /*
106  * Determine helpful version information.
107  *
108  * (And leave copies in the data segment so we can look at them later
109  * with e.g. kmdb.)
110  */
111 
/* Index into xenver[]: the snapshot taken at boot, or the latest one. */
typedef enum xen_version {
	XENVER_BOOT_IDX,
	XENVER_CURRENT_IDX
} xen_version_t;

/*
 * One snapshot of hypervisor version data, filled in by xen_set_version().
 */
struct xenver {
	ulong_t xv_major;		/* bits 31..16 of XENVER_version */
	ulong_t xv_minor;		/* bits 15..0 of XENVER_version */
	ulong_t xv_revision;		/* digit parsed from xv_ver[1], or 0 */
	xen_extraversion_t xv_ver;	/* XENVER_extraversion result */
	xen_changeset_info_t xv_chgset;	/* XENVER_changeset result */
	xen_compile_info_t xv_build;	/* XENVER_compile_info result */
	xen_capabilities_info_t xv_caps; /* XENVER_capabilities result */
} xenver[2];

/* Access member m of the boot-time / current snapshot respectively. */
#define	XENVER_BOOT(m)	(xenver[XENVER_BOOT_IDX].m)
#define	XENVER_CURRENT(m)	(xenver[XENVER_CURRENT_IDX].m)
129 
130 /*
131  * Update the xenver data. We maintain two copies, boot and
132  * current. If we are setting the boot, then also set current.
133  */
134 static void
135 xen_set_version(xen_version_t idx)
136 {
137 	ulong_t ver;
138 
139 	bzero(&xenver[idx], sizeof (xenver[idx]));
140 
141 	ver = HYPERVISOR_xen_version(XENVER_version, 0);
142 
143 	xenver[idx].xv_major = BITX(ver, 31, 16);
144 	xenver[idx].xv_minor = BITX(ver, 15, 0);
145 
146 	(void) HYPERVISOR_xen_version(XENVER_extraversion, &xenver[idx].xv_ver);
147 
148 	/*
149 	 * The revision is buried in the extraversion information that is
150 	 * maintained by the hypervisor. For our purposes we expect that
151 	 * the revision number is:
152 	 * 	- the second character in the extraversion information
153 	 *	- one character long
154 	 *	- numeric digit
155 	 * If it isn't then we can't extract the revision and we leave it
156 	 * set to 0.
157 	 */
158 	if (strlen(xenver[idx].xv_ver) > 1 && isdigit(xenver[idx].xv_ver[1]))
159 		xenver[idx].xv_revision = xenver[idx].xv_ver[1] - '0';
160 	else
161 		cmn_err(CE_WARN, "Cannot extract revision on this hypervisor "
162 		    "version: v%s, unexpected version format",
163 		    xenver[idx].xv_ver);
164 
165 	(void) HYPERVISOR_xen_version(XENVER_changeset,
166 	    &xenver[idx].xv_chgset);
167 
168 	(void) HYPERVISOR_xen_version(XENVER_compile_info,
169 	    &xenver[idx].xv_build);
170 	/*
171 	 * Capabilities are a set of space separated ascii strings
172 	 * e.g. 'xen-3.1-x86_32p' or 'hvm-3.2-x86_64'
173 	 */
174 	(void) HYPERVISOR_xen_version(XENVER_capabilities,
175 	    &xenver[idx].xv_caps);
176 
177 	cmn_err(CE_CONT, "?v%lu.%lu%s chgset '%s'\n", xenver[idx].xv_major,
178 	    xenver[idx].xv_minor, xenver[idx].xv_ver, xenver[idx].xv_chgset);
179 
180 	if (idx == XENVER_BOOT_IDX)
181 		bcopy(&xenver[XENVER_BOOT_IDX], &xenver[XENVER_CURRENT_IDX],
182 		    sizeof (xenver[XENVER_BOOT_IDX]));
183 }
184 
/*
 * Selects which requirement xen_hypervisor_supports_solaris() evaluates:
 * running on the hypervisor, or suspending/resuming on it.
 */
typedef enum xen_hypervisor_check {
	XEN_RUN_CHECK,
	XEN_SUSPEND_CHECK
} xen_hypervisor_check_t;
189 
190 /*
191  * To run the hypervisor must be 3.0.4 or better. To suspend/resume
192  * we need 3.0.4 or better and if it is 3.0.4. then it must be provided
193  * by the Solaris xVM project.
194  * Checking can be disabled for testing purposes by setting the
195  * xen_suspend_debug variable.
196  */
197 static int
198 xen_hypervisor_supports_solaris(xen_hypervisor_check_t check)
199 {
200 	if (xen_suspend_debug == 1)
201 		return (1);
202 	if (XENVER_CURRENT(xv_major) < 3)
203 		return (0);
204 	if (XENVER_CURRENT(xv_major) > 3)
205 		return (1);
206 	if (XENVER_CURRENT(xv_minor) > 0)
207 		return (1);
208 	if (XENVER_CURRENT(xv_revision) < 4)
209 		return (0);
210 	if (XENVER_CURRENT(xv_revision) == 4 && check == XEN_SUSPEND_CHECK) {
211 		if (strlen(XENVER_CURRENT(xv_ver)) < 4)
212 			return (0);
213 		if (strncmp(XENVER_CURRENT(xv_ver) +
214 		    strlen(XENVER_CURRENT(xv_ver)) - 4, "-xvm", 4))
215 			return (0);
216 	}
217 	return (1);
218 }
219 
220 void
221 xen_set_callback(void (*func)(void), uint_t type, uint_t flags)
222 {
223 	struct callback_register cb;
224 
225 	bzero(&cb, sizeof (cb));
226 #if defined(__amd64)
227 	cb.address = (ulong_t)func;
228 #elif defined(__i386)
229 	cb.address.cs = KCS_SEL;
230 	cb.address.eip = (ulong_t)func;
231 #endif
232 	cb.type = type;
233 	cb.flags = flags;
234 
235 	/*
236 	 * XXPV always ignore return value for NMI
237 	 */
238 	if (HYPERVISOR_callback_op(CALLBACKOP_register, &cb) != 0 &&
239 	    type != CALLBACKTYPE_nmi)
240 		panic("HYPERVISOR_callback_op failed");
241 }
242 
243 void
244 xen_init_callbacks(void)
245 {
246 	/*
247 	 * register event (interrupt) handler.
248 	 */
249 	xen_set_callback(xen_callback, CALLBACKTYPE_event, 0);
250 
251 	/*
252 	 * failsafe handler.
253 	 */
254 	xen_set_callback(xen_failsafe_callback, CALLBACKTYPE_failsafe,
255 	    CALLBACKF_mask_events);
256 
257 	/*
258 	 * NMI handler.
259 	 */
260 	xen_set_callback(nmiint, CALLBACKTYPE_nmi, 0);
261 
262 	/*
263 	 * system call handler
264 	 * XXPV move to init_cpu_syscall?
265 	 */
266 #if defined(__amd64)
267 	xen_set_callback(sys_syscall, CALLBACKTYPE_syscall,
268 	    CALLBACKF_mask_events);
269 #endif	/* __amd64 */
270 }
271 
272 
273 /*
274  * cmn_err() followed by a 1/4 second delay; this gives the
275  * logging service a chance to flush messages and helps avoid
276  * intermixing output from prom_printf().
277  * XXPV: doesn't exactly help us on UP though.
278  */
279 /*PRINTFLIKE2*/
280 void
281 cpr_err(int ce, const char *fmt, ...)
282 {
283 	va_list adx;
284 
285 	va_start(adx, fmt);
286 	vcmn_err(ce, fmt, adx);
287 	va_end(adx);
288 	drv_usecwait(MICROSEC >> 2);
289 }
290 
/*
 * Suspend every device below the root nexus; panics if that fails.
 */
void
xen_suspend_devices(void)
{
	int err;

	SUSPEND_DEBUG("xen_suspend_devices\n");

	err = cpr_suspend_devices(ddi_root_node());
	if (err != 0)
		panic("failed to suspend devices: %d", err);
}
301 
/*
 * Resume every device below the root nexus; panics if that fails.
 */
void
xen_resume_devices(void)
{
	int err;

	SUSPEND_DEBUG("xen_resume_devices\n");

	err = cpr_resume_devices(ddi_root_node(), 0);
	if (err != 0)
		panic("failed to resume devices: %d", err);
}
312 
313 /*
314  * The list of mfn pages is out of date.  Recompute it.
315  */
316 static void
317 rebuild_mfn_list(void)
318 {
319 	int i = 0;
320 	size_t sz;
321 	size_t off;
322 	pfn_t pfn;
323 
324 	SUSPEND_DEBUG("rebuild_mfn_list\n");
325 
326 	sz = ((mfn_count * sizeof (mfn_t)) + MMU_PAGEOFFSET) & MMU_PAGEMASK;
327 
328 	for (off = 0; off < sz; off += MMU_PAGESIZE) {
329 		size_t j = mmu_btop(off);
330 		if (((j * sizeof (mfn_t)) & MMU_PAGEOFFSET) == 0) {
331 			pfn = hat_getpfnum(kas.a_hat,
332 			    (caddr_t)&mfn_list_pages[j]);
333 			mfn_list_pages_page[i++] = pfn_to_mfn(pfn);
334 		}
335 
336 		pfn = hat_getpfnum(kas.a_hat, (caddr_t)mfn_list + off);
337 		mfn_list_pages[j] = pfn_to_mfn(pfn);
338 	}
339 
340 	pfn = hat_getpfnum(kas.a_hat, (caddr_t)mfn_list_pages_page);
341 	HYPERVISOR_shared_info->arch.pfn_to_mfn_frame_list_list
342 	    = pfn_to_mfn(pfn);
343 }
344 
345 static void
346 suspend_cpus(void)
347 {
348 	int i;
349 
350 	SUSPEND_DEBUG("suspend_cpus\n");
351 
352 	mp_enter_barrier();
353 
354 	for (i = 1; i < ncpus; i++) {
355 		if (!CPU_IN_SET(cpu_suspend_lost_set, i)) {
356 			SUSPEND_DEBUG("xen_vcpu_down %d\n", i);
357 			(void) xen_vcpu_down(i);
358 		}
359 
360 		mach_cpucontext_reset(cpu[i]);
361 	}
362 }
363 
364 static void
365 resume_cpus(void)
366 {
367 	int i;
368 
369 	for (i = 1; i < ncpus; i++) {
370 		if (cpu[i] == NULL)
371 			continue;
372 
373 		if (!CPU_IN_SET(cpu_suspend_lost_set, i)) {
374 			SUSPEND_DEBUG("xen_vcpu_up %d\n", i);
375 			mach_cpucontext_restore(cpu[i]);
376 			(void) xen_vcpu_up(i);
377 		}
378 	}
379 
380 	mp_leave_barrier();
381 }
382 
383 /*
384  * Top level routine to direct suspend/resume of a domain.
385  */
386 void
387 xen_suspend_domain(void)
388 {
389 	extern void rtcsync(void);
390 	extern hrtime_t hres_last_tick;
391 	mfn_t start_info_mfn;
392 	ulong_t flags;
393 	pfn_t pfn;
394 	int i;
395 
396 	/*
397 	 * Check that we are happy to suspend on this hypervisor.
398 	 */
399 	if (xen_hypervisor_supports_solaris(XEN_SUSPEND_CHECK) == 0) {
400 		cpr_err(CE_WARN, "Cannot suspend on this hypervisor "
401 		    "version: v%lu.%lu%s, need at least version v3.0.4 or "
402 		    "-xvm based hypervisor", XENVER_CURRENT(xv_major),
403 		    XENVER_CURRENT(xv_minor), XENVER_CURRENT(xv_ver));
404 		return;
405 	}
406 
407 	/*
408 	 * XXPV - Are we definitely OK to suspend by the time we've connected
409 	 * the handler?
410 	 */
411 
412 	cpr_err(CE_NOTE, "Domain suspending for save/migrate");
413 
414 	SUSPEND_DEBUG("xen_suspend_domain\n");
415 
416 	/*
417 	 * suspend interrupts and devices
418 	 * XXPV - we use suspend/resume for both save/restore domains (like sun
419 	 * cpr) and for migration.  Would be nice to know the difference if
420 	 * possible.  For save/restore where down time may be a long time, we
421 	 * may want to do more of the things that cpr does.  (i.e. notify user
422 	 * processes, shrink memory footprint for faster restore, etc.)
423 	 */
424 	xen_suspend_devices();
425 	SUSPEND_DEBUG("xenbus_suspend\n");
426 	xenbus_suspend();
427 
428 	pfn = hat_getpfnum(kas.a_hat, (caddr_t)xen_info);
429 	start_info_mfn = pfn_to_mfn(pfn);
430 
431 	/*
432 	 * XXPV: cpu hotplug can hold this under a xenbus watch. Are we safe
433 	 * wrt xenbus being suspended here?
434 	 */
435 	mutex_enter(&cpu_lock);
436 
437 	/*
438 	 * Suspend must be done on vcpu 0, as no context for other CPUs is
439 	 * saved.
440 	 *
441 	 * XXPV - add to taskq API ?
442 	 */
443 	thread_affinity_set(curthread, 0);
444 	kpreempt_disable();
445 
446 	SUSPEND_DEBUG("xen_start_migrate\n");
447 	xen_start_migrate();
448 	if (ncpus > 1)
449 		suspend_cpus();
450 
451 	/*
452 	 * We can grab the ec_lock as it's a spinlock with a high SPL. Hence
453 	 * any holder would have dropped it to get through suspend_cpus().
454 	 */
455 	mutex_enter(&ec_lock);
456 
457 	/*
458 	 * From here on in, we can't take locks.
459 	 */
460 	SUSPEND_DEBUG("ec_suspend\n");
461 	ec_suspend();
462 	SUSPEND_DEBUG("gnttab_suspend\n");
463 	gnttab_suspend();
464 
465 	flags = intr_clear();
466 
467 	xpv_time_suspend();
468 
469 	/*
470 	 * Currently, the hypervisor incorrectly fails to bring back
471 	 * powered-down VCPUs.  Thus we need to record any powered-down VCPUs
472 	 * to prevent any attempts to operate on them.  But we have to do this
473 	 * *after* the very first time we do ec_suspend().
474 	 */
475 	for (i = 1; i < ncpus; i++) {
476 		if (cpu[i] == NULL)
477 			continue;
478 
479 		if (cpu_get_state(cpu[i]) == P_POWEROFF)
480 			CPUSET_ATOMIC_ADD(cpu_suspend_lost_set, i);
481 	}
482 
483 	/*
484 	 * The dom0 save/migrate code doesn't automatically translate
485 	 * these into PFNs, but expects them to be, so we do it here.
486 	 * We don't use mfn_to_pfn() because so many OS services have
487 	 * been disabled at this point.
488 	 */
489 	xen_info->store_mfn = mfn_to_pfn_mapping[xen_info->store_mfn];
490 	xen_info->console.domU.mfn =
491 	    mfn_to_pfn_mapping[xen_info->console.domU.mfn];
492 
493 	if (CPU->cpu_m.mcpu_vcpu_info->evtchn_upcall_mask == 0) {
494 		prom_printf("xen_suspend_domain(): "
495 		    "CPU->cpu_m.mcpu_vcpu_info->evtchn_upcall_mask not set\n");
496 		(void) HYPERVISOR_shutdown(SHUTDOWN_crash);
497 	}
498 
499 	if (HYPERVISOR_update_va_mapping((uintptr_t)HYPERVISOR_shared_info,
500 	    0, UVMF_INVLPG)) {
501 		prom_printf("xen_suspend_domain(): "
502 		    "HYPERVISOR_update_va_mapping() failed\n");
503 		(void) HYPERVISOR_shutdown(SHUTDOWN_crash);
504 	}
505 
506 	SUSPEND_DEBUG("HYPERVISOR_suspend\n");
507 
508 	/*
509 	 * At this point we suspend and sometime later resume.
510 	 */
511 	if (HYPERVISOR_suspend(start_info_mfn)) {
512 		prom_printf("xen_suspend_domain(): "
513 		    "HYPERVISOR_suspend() failed\n");
514 		(void) HYPERVISOR_shutdown(SHUTDOWN_crash);
515 	}
516 
517 	/*
518 	 * Point HYPERVISOR_shared_info to its new value.
519 	 */
520 	if (HYPERVISOR_update_va_mapping((uintptr_t)HYPERVISOR_shared_info,
521 	    xen_info->shared_info | PT_NOCONSIST | PT_VALID | PT_WRITABLE,
522 	    UVMF_INVLPG))
523 		(void) HYPERVISOR_shutdown(SHUTDOWN_crash);
524 
525 	if (xen_info->nr_pages != mfn_count) {
526 		prom_printf("xen_suspend_domain(): number of pages"
527 		    " changed, was 0x%lx, now 0x%lx\n", mfn_count,
528 		    xen_info->nr_pages);
529 		(void) HYPERVISOR_shutdown(SHUTDOWN_crash);
530 	}
531 
532 	xpv_time_resume();
533 
534 	cached_max_mfn = 0;
535 
536 	SUSPEND_DEBUG("gnttab_resume\n");
537 	gnttab_resume();
538 
539 	/* XXPV: add a note that this must be lockless. */
540 	SUSPEND_DEBUG("ec_resume\n");
541 	ec_resume();
542 
543 	intr_restore(flags);
544 
545 	if (ncpus > 1)
546 		resume_cpus();
547 
548 	mutex_exit(&ec_lock);
549 	xen_end_migrate();
550 	mutex_exit(&cpu_lock);
551 
552 	/*
553 	 * Now we can take locks again.
554 	 */
555 
556 	/*
557 	 * Force the tick value used for tv_nsec in hres_tick() to be up to
558 	 * date. rtcsync() will reset the hrestime value appropriately.
559 	 */
560 	hres_last_tick = xpv_gethrtime();
561 
562 	/*
563 	 * XXPV: we need to have resumed the CPUs since this takes locks, but
564 	 * can remote CPUs see bad state? Presumably yes. Should probably nest
565 	 * taking of todlock inside of cpu_lock, or vice versa, then provide an
566 	 * unlocked version.  Probably need to call clkinitf to reset cpu freq
567 	 * and re-calibrate if we migrated to a different speed cpu.  Also need
568 	 * to make a (re)init_cpu_info call to update processor info structs
569 	 * and device tree info.  That remains to be written at the moment.
570 	 */
571 	rtcsync();
572 
573 	rebuild_mfn_list();
574 
575 	SUSPEND_DEBUG("xenbus_resume\n");
576 	xenbus_resume();
577 	SUSPEND_DEBUG("xenbus_resume_devices\n");
578 	xen_resume_devices();
579 
580 	thread_affinity_clear(curthread);
581 	kpreempt_enable();
582 
583 	SUSPEND_DEBUG("finished xen_suspend_domain\n");
584 
585 	/*
586 	 * We have restarted our suspended domain, update the hypervisor
587 	 * details. NB: This must be done at the end of this function,
588 	 * since we need the domain to be completely resumed before
589 	 * these functions will work correctly.
590 	 */
591 	xen_set_version(XENVER_CURRENT_IDX);
592 
593 	/*
594 	 * We can check and report a warning, but we don't stop the
595 	 * process.
596 	 */
597 	if (xen_hypervisor_supports_solaris(XEN_SUSPEND_CHECK) == 0)
598 		cmn_err(CE_WARN, "Found hypervisor version: v%lu.%lu%s "
599 		    "but need at least version v3.0.4",
600 		    XENVER_CURRENT(xv_major), XENVER_CURRENT(xv_minor),
601 		    XENVER_CURRENT(xv_ver));
602 
603 	cmn_err(CE_NOTE, "domain restore/migrate completed");
604 }
605 
606 /*ARGSUSED*/
607 int
608 xen_debug_handler(void *arg)
609 {
610 	debug_enter("External debug event received");
611 
612 	/*
613 	 * If we've not got KMDB loaded, output some stuff difficult to capture
614 	 * from a domain core.
615 	 */
616 	if (!(boothowto & RB_DEBUG)) {
617 		shared_info_t *si = HYPERVISOR_shared_info;
618 		int i;
619 
620 		prom_printf("evtchn_pending [ ");
621 		for (i = 0; i < 8; i++)
622 			prom_printf("%lx ", si->evtchn_pending[i]);
623 		prom_printf("]\nevtchn_mask [ ");
624 		for (i = 0; i < 8; i++)
625 			prom_printf("%lx ", si->evtchn_mask[i]);
626 		prom_printf("]\n");
627 
628 		for (i = 0; i < ncpus; i++) {
629 			vcpu_info_t *vcpu = &si->vcpu_info[i];
630 			if (cpu[i] == NULL)
631 				continue;
632 			prom_printf("CPU%d pending %d mask %d sel %lx\n",
633 			    i, vcpu->evtchn_upcall_pending,
634 			    vcpu->evtchn_upcall_mask,
635 			    vcpu->evtchn_pending_sel);
636 		}
637 	}
638 
639 	return (0);
640 }
641 
642 /*ARGSUSED*/
643 static void
644 xen_sysrq_handler(struct xenbus_watch *watch, const char **vec,
645     unsigned int len)
646 {
647 	xenbus_transaction_t xbt;
648 	char key = '\0';
649 	int ret;
650 
651 retry:
652 	if (xenbus_transaction_start(&xbt)) {
653 		cmn_err(CE_WARN, "failed to start sysrq transaction");
654 		return;
655 	}
656 
657 	if ((ret = xenbus_scanf(xbt, "control", "sysrq", "%c", &key)) != 0) {
658 		/*
659 		 * ENOENT happens in response to our own xenbus_rm.
660 		 * XXPV - this happens spuriously on boot?
661 		 */
662 		if (ret != ENOENT)
663 			cmn_err(CE_WARN, "failed to read sysrq: %d", ret);
664 		goto out;
665 	}
666 
667 	if ((ret = xenbus_rm(xbt, "control", "sysrq")) != 0) {
668 		cmn_err(CE_WARN, "failed to reset sysrq: %d", ret);
669 		goto out;
670 	}
671 
672 	if (xenbus_transaction_end(xbt, 0) == EAGAIN)
673 		goto retry;
674 
675 	/*
676 	 * Somewhat arbitrary - on Linux this means 'reboot'. We could just
677 	 * accept any key, but this might increase the risk of sending a
678 	 * harmless sysrq to the wrong domain...
679 	 */
680 	if (key == 'b')
681 		(void) xen_debug_handler(NULL);
682 	else
683 		cmn_err(CE_WARN, "Ignored sysrq %c", key);
684 	return;
685 
686 out:
687 	(void) xenbus_transaction_end(xbt, 1);
688 }
689 
/*
 * Task queue that xen_shutdown_handler() dispatches xen_shutdown() onto;
 * created by xen_late_startup() when this is not the initial domain.
 */
taskq_t *xen_shutdown_tq;
691 
/*
 * Shutdown request codes.  SHUTDOWN_INVALID marks "no recognised request";
 * the remaining codes index cmd_strings[] and SHUTDOWN_MAX is the number
 * of valid codes.
 */
#define	SHUTDOWN_INVALID	(-1)
#define	SHUTDOWN_POWEROFF	0
#define	SHUTDOWN_REBOOT		1
#define	SHUTDOWN_SUSPEND	2
#define	SHUTDOWN_HALT		3
#define	SHUTDOWN_MAX		4

/* How long to wait before xen_dirty_shutdown() is forced: five minutes. */
#define	SHUTDOWN_TIMEOUT_SECS (60 * 5)

/*
 * Name of each shutdown code.  Designated initializers keep every string
 * tied to its code even if the codes are renumbered.
 */
static const char *cmd_strings[SHUTDOWN_MAX] = {
	[SHUTDOWN_POWEROFF]	= "poweroff",
	[SHUTDOWN_REBOOT]	= "reboot",
	[SHUTDOWN_SUSPEND]	= "suspend",
	[SHUTDOWN_HALT]		= "halt"
};
707 
708 static void
709 xen_dirty_shutdown(void *arg)
710 {
711 	int cmd = (uintptr_t)arg;
712 
713 	cmn_err(CE_WARN, "Externally requested shutdown failed or "
714 	    "timed out.\nShutting down.\n");
715 
716 	switch (cmd) {
717 	case SHUTDOWN_HALT:
718 	case SHUTDOWN_POWEROFF:
719 		(void) kadmin(A_SHUTDOWN, AD_POWEROFF, NULL, kcred);
720 		break;
721 	case SHUTDOWN_REBOOT:
722 		(void) kadmin(A_REBOOT, AD_BOOT, NULL, kcred);
723 		break;
724 	}
725 }
726 
727 static void
728 xen_shutdown(void *arg)
729 {
730 	nvlist_t *attr_list = NULL;
731 	sysevent_t *event = NULL;
732 	sysevent_id_t eid;
733 	int cmd = (uintptr_t)arg;
734 	int err;
735 
736 	ASSERT(cmd > SHUTDOWN_INVALID && cmd < SHUTDOWN_MAX);
737 
738 	if (cmd == SHUTDOWN_SUSPEND) {
739 		xen_suspend_domain();
740 		return;
741 	}
742 
743 	err = nvlist_alloc(&attr_list, NV_UNIQUE_NAME, KM_SLEEP);
744 	if (err != DDI_SUCCESS)
745 		goto failure;
746 
747 	err = nvlist_add_string(attr_list, "shutdown", cmd_strings[cmd]);
748 	if (err != DDI_SUCCESS)
749 		goto failure;
750 
751 	if ((event = sysevent_alloc("EC_xpvsys", "control", "SUNW:kern:xpv",
752 	    SE_SLEEP)) == NULL)
753 		goto failure;
754 	(void) sysevent_attach_attributes(event,
755 	    (sysevent_attr_list_t *)attr_list);
756 
757 	err = log_sysevent(event, SE_SLEEP, &eid);
758 
759 	sysevent_detach_attributes(event);
760 	sysevent_free(event);
761 
762 	if (err != 0)
763 		goto failure;
764 
765 	(void) timeout(xen_dirty_shutdown, arg,
766 	    SHUTDOWN_TIMEOUT_SECS * drv_usectohz(MICROSEC));
767 
768 	nvlist_free(attr_list);
769 	return;
770 
771 failure:
772 	if (attr_list != NULL)
773 		nvlist_free(attr_list);
774 	xen_dirty_shutdown(arg);
775 }
776 
777 /*ARGSUSED*/
778 static void
779 xen_shutdown_handler(struct xenbus_watch *watch, const char **vec,
780 	unsigned int len)
781 {
782 	char *str;
783 	xenbus_transaction_t xbt;
784 	int err, shutdown_code = SHUTDOWN_INVALID;
785 	unsigned int slen;
786 
787 again:
788 	err = xenbus_transaction_start(&xbt);
789 	if (err)
790 		return;
791 	if (xenbus_read(xbt, "control", "shutdown", (void *)&str, &slen)) {
792 		(void) xenbus_transaction_end(xbt, 1);
793 		return;
794 	}
795 
796 	SUSPEND_DEBUG("%d: xen_shutdown_handler: \"%s\"\n", CPU->cpu_id, str);
797 
798 	/*
799 	 * If this is a watch fired from our write below, check out early to
800 	 * avoid an infinite loop.
801 	 */
802 	if (strcmp(str, "") == 0) {
803 		(void) xenbus_transaction_end(xbt, 0);
804 		kmem_free(str, slen);
805 		return;
806 	} else if (strcmp(str, "poweroff") == 0) {
807 		shutdown_code = SHUTDOWN_POWEROFF;
808 	} else if (strcmp(str, "reboot") == 0) {
809 		shutdown_code = SHUTDOWN_REBOOT;
810 	} else if (strcmp(str, "suspend") == 0) {
811 		shutdown_code = SHUTDOWN_SUSPEND;
812 	} else if (strcmp(str, "halt") == 0) {
813 		shutdown_code = SHUTDOWN_HALT;
814 	} else {
815 		printf("Ignoring shutdown request: %s\n", str);
816 	}
817 
818 	/*
819 	 * XXPV	Should we check the value of xenbus_write() too, or are all
820 	 *	errors automatically folded into xenbus_transaction_end() ??
821 	 */
822 	(void) xenbus_write(xbt, "control", "shutdown", "");
823 	err = xenbus_transaction_end(xbt, 0);
824 	if (err == EAGAIN) {
825 		SUSPEND_DEBUG("%d: trying again\n", CPU->cpu_id);
826 		kmem_free(str, slen);
827 		goto again;
828 	}
829 
830 	kmem_free(str, slen);
831 	if (shutdown_code != SHUTDOWN_INVALID) {
832 		(void) taskq_dispatch(xen_shutdown_tq, xen_shutdown,
833 		    (void *)(intptr_t)shutdown_code, 0);
834 	}
835 }
836 
837 static struct xenbus_watch shutdown_watch;
838 static struct xenbus_watch sysrq_watch;
839 
840 void
841 xen_late_startup(void)
842 {
843 	if (!DOMAIN_IS_INITDOMAIN(xen_info)) {
844 		xen_shutdown_tq = taskq_create("shutdown_taskq", 1,
845 		    maxclsyspri - 1, 1, 1, TASKQ_PREPOPULATE);
846 		shutdown_watch.node = "control/shutdown";
847 		shutdown_watch.callback = xen_shutdown_handler;
848 		if (register_xenbus_watch(&shutdown_watch))
849 			cmn_err(CE_WARN, "Failed to set shutdown watcher");
850 
851 		sysrq_watch.node = "control/sysrq";
852 		sysrq_watch.callback = xen_sysrq_handler;
853 		if (register_xenbus_watch(&sysrq_watch))
854 			cmn_err(CE_WARN, "Failed to set sysrq watcher");
855 	}
856 	balloon_init(xen_info->nr_pages);
857 }
858 
#ifdef DEBUG
#define	XEN_PRINTF_BUFSIZE	1024

/* Single shared formatting buffer used by xen_printf(). */
char xen_printf_buffer[XEN_PRINTF_BUFSIZE];

/*
 * printf-style output sent straight to the hypervisor console.  For DomU
 * it only works when running on a xen hypervisor built with debug on.
 * No I/O ring interaction is needed, so it is usable at any time.
 * Output longer than the buffer is truncated.
 */
/*PRINTFLIKE1*/
void
xen_printf(const char *fmt, ...)
{
	va_list	args;

	va_start(args, fmt);
	(void) vsnprintf(xen_printf_buffer, sizeof (xen_printf_buffer),
	    fmt, args);
	va_end(args);

	(void) HYPERVISOR_console_io(CONSOLEIO_write,
	    strlen(xen_printf_buffer), xen_printf_buffer);
}
#else
/* Non-DEBUG kernels: xen_printf() does nothing. */
void
xen_printf(const char *fmt, ...)
{
}
#endif	/* DEBUG */
888 
889 void
890 xen_version(void)
891 {
892 	xen_set_version(XENVER_BOOT_IDX);
893 	if (xen_hypervisor_supports_solaris(XEN_RUN_CHECK) == 0)
894 		cmn_err(CE_WARN, "Found hypervisor version: v%lu.%lu%s "
895 		    "but need at least version v3.0.4",
896 		    XENVER_CURRENT(xv_major), XENVER_CURRENT(xv_minor),
897 		    XENVER_CURRENT(xv_ver));
898 }
899 
900 /*
901  * Miscellaneous hypercall wrappers with slightly more verbose diagnostics.
902  */
903 
904 void
905 xen_set_gdt(ulong_t *frame_list, int entries)
906 {
907 	int err;
908 	if ((err = HYPERVISOR_set_gdt(frame_list, entries)) != 0) {
909 		/*
910 		 * X_EINVAL:	reserved entry or bad frames
911 		 * X_EFAULT:	bad address
912 		 */
913 		panic("xen_set_gdt(%p, %d): error %d",
914 		    (void *)frame_list, entries, -(int)err);
915 	}
916 }
917 
918 void
919 xen_set_ldt(user_desc_t *ldt, uint_t nsels)
920 {
921 	struct mmuext_op	op;
922 	long			err;
923 
924 	op.cmd = MMUEXT_SET_LDT;
925 	op.arg1.linear_addr = (uintptr_t)ldt;
926 	op.arg2.nr_ents = nsels;
927 
928 	if ((err = HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF)) != 0) {
929 		panic("xen_set_ldt(%p, %d): error %d",
930 		    (void *)ldt, nsels, -(int)err);
931 	}
932 }
933 
934 void
935 xen_stack_switch(ulong_t ss, ulong_t esp)
936 {
937 	long err;
938 
939 	if ((err = HYPERVISOR_stack_switch(ss, esp)) != 0) {
940 		/*
941 		 * X_EPERM:	bad selector
942 		 */
943 		panic("xen_stack_switch(%lx, %lx): error %d", ss, esp,
944 		    -(int)err);
945 	}
946 }
947 
948 long
949 xen_set_trap_table(trap_info_t *table)
950 {
951 	long err;
952 
953 	if ((err = HYPERVISOR_set_trap_table(table)) != 0) {
954 		/*
955 		 * X_EFAULT:	bad address
956 		 * X_EPERM:	bad selector
957 		 */
958 		panic("xen_set_trap_table(%p): error %d", (void *)table,
959 		    -(int)err);
960 	}
961 	return (err);
962 }
963 
964 #if defined(__amd64)
965 void
966 xen_set_segment_base(int reg, ulong_t value)
967 {
968 	long err;
969 
970 	if ((err = HYPERVISOR_set_segment_base(reg, value)) != 0) {
971 		/*
972 		 * X_EFAULT:	bad address
973 		 * X_EINVAL:	bad type
974 		 */
975 		panic("xen_set_segment_base(%d, %lx): error %d",
976 		    reg, value, -(int)err);
977 	}
978 }
979 #endif	/* __amd64 */
980 
981 /*
982  * Translate a hypervisor errcode to a Solaris error code.
983  */
984 int
985 xen_xlate_errcode(int error)
986 {
987 	switch (-error) {
988 
989 	/*
990 	 * Translate hypervisor errno's into native errno's
991 	 */
992 
993 #define	CASE(num)	case X_##num: error = num; break
994 
995 	CASE(EPERM);	CASE(ENOENT);	CASE(ESRCH);
996 	CASE(EINTR);	CASE(EIO);	CASE(ENXIO);
997 	CASE(E2BIG);	CASE(ENOMEM);	CASE(EACCES);
998 	CASE(EFAULT);	CASE(EBUSY);	CASE(EEXIST);
999 	CASE(ENODEV);	CASE(EISDIR);	CASE(EINVAL);
1000 	CASE(ENOSPC);	CASE(ESPIPE);	CASE(EROFS);
1001 	CASE(ENOSYS);	CASE(ENOTEMPTY); CASE(EISCONN);
1002 	CASE(ENODATA);
1003 
1004 #undef CASE
1005 
1006 	default:
1007 		panic("xen_xlate_errcode: unknown error %d", error);
1008 	}
1009 
1010 	return (error);
1011 }
1012 
1013 /*
1014  * Raise PS_IOPL on current vcpu to user level.
1015  * Caller responsible for preventing kernel preemption.
1016  */
1017 void
1018 xen_enable_user_iopl(void)
1019 {
1020 	physdev_set_iopl_t set_iopl;
1021 	set_iopl.iopl = 3;		/* user ring 3 */
1022 	(void) HYPERVISOR_physdev_op(PHYSDEVOP_set_iopl, &set_iopl);
1023 }
1024 
1025 /*
1026  * Drop PS_IOPL on current vcpu to kernel level
1027  */
1028 void
1029 xen_disable_user_iopl(void)
1030 {
1031 	physdev_set_iopl_t set_iopl;
1032 	set_iopl.iopl = 1;		/* kernel pseudo ring 1 */
1033 	(void) HYPERVISOR_physdev_op(PHYSDEVOP_set_iopl, &set_iopl);
1034 }
1035 
1036 int
1037 xen_gdt_setprot(cpu_t *cp, uint_t prot)
1038 {
1039 	int err;
1040 #if defined(__amd64)
1041 	int pt_bits = PT_VALID;
1042 	if (prot & PROT_WRITE)
1043 		pt_bits |= PT_WRITABLE;
1044 #endif
1045 
1046 	if ((err = as_setprot(&kas, (caddr_t)cp->cpu_gdt,
1047 	    MMU_PAGESIZE, prot)) != 0)
1048 		goto done;
1049 
1050 #if defined(__amd64)
1051 	err = xen_kpm_page(mmu_btop(cp->cpu_m.mcpu_gdtpa), pt_bits);
1052 #endif
1053 
1054 done:
1055 	if (err) {
1056 		cmn_err(CE_WARN, "cpu%d: xen_gdt_setprot(%s) failed: error %d",
1057 		    cp->cpu_id, (prot & PROT_WRITE) ? "writable" : "read-only",
1058 		    err);
1059 	}
1060 
1061 	return (err);
1062 }
1063 
1064 int
1065 xen_ldt_setprot(user_desc_t *ldt, size_t lsize, uint_t prot)
1066 {
1067 	int err;
1068 	caddr_t	lva = (caddr_t)ldt;
1069 #if defined(__amd64)
1070 	int pt_bits = PT_VALID;
1071 	pgcnt_t npgs;
1072 	if (prot & PROT_WRITE)
1073 		pt_bits |= PT_WRITABLE;
1074 #endif	/* __amd64 */
1075 
1076 	if ((err = as_setprot(&kas, (caddr_t)ldt, lsize, prot)) != 0)
1077 		goto done;
1078 
1079 #if defined(__amd64)
1080 
1081 	ASSERT(IS_P2ALIGNED(lsize, PAGESIZE));
1082 	npgs = mmu_btop(lsize);
1083 	while (npgs--) {
1084 		if ((err = xen_kpm_page(hat_getpfnum(kas.a_hat, lva),
1085 		    pt_bits)) != 0)
1086 			break;
1087 		lva += PAGESIZE;
1088 	}
1089 #endif	/* __amd64 */
1090 
1091 done:
1092 	if (err) {
1093 		cmn_err(CE_WARN, "xen_ldt_setprot(%p, %s) failed: error %d",
1094 		    (void *)lva,
1095 		    (prot & PROT_WRITE) ? "writable" : "read-only", err);
1096 	}
1097 
1098 	return (err);
1099 }
1100