xref: /titanic_44/usr/src/uts/i86xpv/os/xen_machdep.c (revision ec77975f4066916892ac3a662a2045cca3926268)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 #pragma ident	"%Z%%M%	%I%	%E% SMI"
28 
29 /* derived from netbsd's xen_machdep.c 1.1.2.1 */
30 
31 /*
32  *
33  * Copyright (c) 2004 Christian Limpach.
34  * All rights reserved.
35  *
36  * Redistribution and use in source and binary forms, with or without
37  * modification, are permitted provided that the following conditions
38  * are met:
39  * 1. Redistributions of source code must retain the above copyright
40  *    notice, this list of conditions and the following disclaimer.
41  * 2. Redistributions in binary form must reproduce the above copyright
42  *    notice, this list of conditions and the following disclaimer in the
43  *    documentation and/or other materials provided with the distribution.
44  * 3. This section intentionally left blank.
45  * 4. The name of the author may not be used to endorse or promote products
46  *    derived from this software without specific prior written permission.
47  *
48  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
49  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
50  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
51  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
52  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
53  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
54  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
55  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
56  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
57  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
58  */
59 /*
60  * Section 3 of the above license was updated in response to bug 6379571.
61  */
62 
63 #include <sys/types.h>
64 #include <sys/cmn_err.h>
65 #include <sys/trap.h>
66 #include <sys/segments.h>
67 #include <sys/hypervisor.h>
68 #include <sys/xen_mmu.h>
69 #include <sys/machsystm.h>
70 #include <sys/promif.h>
71 #include <sys/bootconf.h>
72 #include <sys/bootinfo.h>
73 #include <sys/cpr.h>
74 #include <sys/taskq.h>
75 #include <sys/uadmin.h>
76 #include <sys/evtchn_impl.h>
77 #include <sys/archsystm.h>
78 #include <xen/sys/xenbus_impl.h>
79 #include <sys/mach_mmu.h>
80 #include <vm/hat_i86.h>
81 #include <sys/gnttab.h>
82 #include <sys/reboot.h>
83 #include <sys/stack.h>
84 #include <sys/clock.h>
85 #include <sys/bitmap.h>
86 #include <sys/processor.h>
87 #include <sys/xen_errno.h>
88 #include <sys/xpv_panic.h>
89 #include <sys/smp_impldefs.h>
90 #include <sys/cpu.h>
91 #include <sys/balloon_impl.h>
92 #include <sys/ddi.h>
93 
94 #ifdef DEBUG
95 #define	SUSPEND_DEBUG if (xen_suspend_debug) xen_printf
96 #else
97 #define	SUSPEND_DEBUG(...)
98 #endif
99 
100 int cpr_debug;
101 cpuset_t cpu_suspend_lost_set;
102 static int xen_suspend_debug;
103 
104 void
105 xen_set_callback(void (*func)(void), uint_t type, uint_t flags)
106 {
107 	struct callback_register cb;
108 
109 	bzero(&cb, sizeof (cb));
110 #if defined(__amd64)
111 	cb.address = (ulong_t)func;
112 #elif defined(__i386)
113 	cb.address.cs = KCS_SEL;
114 	cb.address.eip = (ulong_t)func;
115 #endif
116 	cb.type = type;
117 	cb.flags = flags;
118 
119 	/*
120 	 * XXPV always ignore return value for NMI
121 	 */
122 	if (HYPERVISOR_callback_op(CALLBACKOP_register, &cb) != 0 &&
123 	    type != CALLBACKTYPE_nmi)
124 		panic("HYPERVISOR_callback_op failed");
125 }
126 
127 void
128 xen_init_callbacks(void)
129 {
130 	/*
131 	 * register event (interrupt) handler.
132 	 */
133 	xen_set_callback(xen_callback, CALLBACKTYPE_event, 0);
134 
135 	/*
136 	 * failsafe handler.
137 	 */
138 	xen_set_callback(xen_failsafe_callback, CALLBACKTYPE_failsafe,
139 	    CALLBACKF_mask_events);
140 
141 	/*
142 	 * NMI handler.
143 	 */
144 	xen_set_callback(nmiint, CALLBACKTYPE_nmi, 0);
145 
146 	/*
147 	 * system call handler
148 	 * XXPV move to init_cpu_syscall?
149 	 */
150 #if defined(__amd64)
151 	xen_set_callback(sys_syscall, CALLBACKTYPE_syscall,
152 	    CALLBACKF_mask_events);
153 #endif	/* __amd64 */
154 }
155 
156 
157 /*
158  * cmn_err() followed by a 1/4 second delay; this gives the
159  * logging service a chance to flush messages and helps avoid
160  * intermixing output from prom_printf().
161  * XXPV: doesn't exactly help us on UP though.
162  */
163 /*PRINTFLIKE2*/
164 void
165 cpr_err(int ce, const char *fmt, ...)
166 {
167 	va_list adx;
168 
169 	va_start(adx, fmt);
170 	vcmn_err(ce, fmt, adx);
171 	va_end(adx);
172 	drv_usecwait(MICROSEC >> 2);
173 }
174 
/*
 * Suspend devices by calling cpr_suspend_devices() on the root devinfo
 * node.  A non-zero result is fatal.
 */
void
xen_suspend_devices(void)
{
	int err;

	SUSPEND_DEBUG("xen_suspend_devices\n");

	err = cpr_suspend_devices(ddi_root_node());
	if (err != 0)
		panic("failed to suspend devices: %d", err);
}
185 
/*
 * Resume devices by calling cpr_resume_devices() on the root devinfo
 * node.  A non-zero result is fatal.
 */
void
xen_resume_devices(void)
{
	int err;

	SUSPEND_DEBUG("xen_resume_devices\n");

	err = cpr_resume_devices(ddi_root_node(), 0);
	if (err != 0)
		panic("failed to resume devices: %d", err);
}
196 
197 /*
198  * The list of mfn pages is out of date.  Recompute it.
199  */
200 static void
201 rebuild_mfn_list(void)
202 {
203 	int i = 0;
204 	size_t sz;
205 	size_t off;
206 	pfn_t pfn;
207 
208 	SUSPEND_DEBUG("rebuild_mfn_list\n");
209 
210 	sz = ((mfn_count * sizeof (mfn_t)) + MMU_PAGEOFFSET) & MMU_PAGEMASK;
211 
212 	for (off = 0; off < sz; off += MMU_PAGESIZE) {
213 		size_t j = mmu_btop(off);
214 		if (((j * sizeof (mfn_t)) & MMU_PAGEOFFSET) == 0) {
215 			pfn = hat_getpfnum(kas.a_hat,
216 			    (caddr_t)&mfn_list_pages[j]);
217 			mfn_list_pages_page[i++] = pfn_to_mfn(pfn);
218 		}
219 
220 		pfn = hat_getpfnum(kas.a_hat, (caddr_t)mfn_list + off);
221 		mfn_list_pages[j] = pfn_to_mfn(pfn);
222 	}
223 
224 	pfn = hat_getpfnum(kas.a_hat, (caddr_t)mfn_list_pages_page);
225 	HYPERVISOR_shared_info->arch.pfn_to_mfn_frame_list_list
226 	    = pfn_to_mfn(pfn);
227 }
228 
229 static void
230 suspend_cpus(void)
231 {
232 	int i;
233 
234 	SUSPEND_DEBUG("suspend_cpus\n");
235 
236 	mp_enter_barrier();
237 
238 	for (i = 1; i < ncpus; i++) {
239 		if (!CPU_IN_SET(cpu_suspend_lost_set, i)) {
240 			SUSPEND_DEBUG("xen_vcpu_down %d\n", i);
241 			(void) xen_vcpu_down(i);
242 		}
243 
244 		mach_cpucontext_reset(cpu[i]);
245 	}
246 }
247 
248 static void
249 resume_cpus(void)
250 {
251 	int i;
252 
253 	for (i = 1; i < ncpus; i++) {
254 		if (cpu[i] == NULL)
255 			continue;
256 
257 		if (!CPU_IN_SET(cpu_suspend_lost_set, i)) {
258 			SUSPEND_DEBUG("xen_vcpu_up %d\n", i);
259 			mach_cpucontext_restore(cpu[i]);
260 			(void) xen_vcpu_up(i);
261 		}
262 	}
263 
264 	mp_leave_barrier();
265 }
266 
267 /*
268  * Top level routine to direct suspend/resume of a domain.
269  */
270 void
271 xen_suspend_domain(void)
272 {
273 	extern void rtcsync(void);
274 	extern hrtime_t hres_last_tick;
275 	mfn_t start_info_mfn;
276 	ulong_t flags;
277 	pfn_t pfn;
278 	int i;
279 
280 	/*
281 	 * XXPV - Are we definitely OK to suspend by the time we've connected
282 	 * the handler?
283 	 */
284 
285 	cpr_err(CE_NOTE, "Domain suspending for save/migrate");
286 
287 	SUSPEND_DEBUG("xen_suspend_domain\n");
288 
289 	/*
290 	 * suspend interrupts and devices
291 	 * XXPV - we use suspend/resume for both save/restore domains (like sun
292 	 * cpr) and for migration.  Would be nice to know the difference if
293 	 * possible.  For save/restore where down time may be a long time, we
294 	 * may want to do more of the things that cpr does.  (i.e. notify user
295 	 * processes, shrink memory footprint for faster restore, etc.)
296 	 */
297 	xen_suspend_devices();
298 	SUSPEND_DEBUG("xenbus_suspend\n");
299 	xenbus_suspend();
300 
301 	pfn = hat_getpfnum(kas.a_hat, (caddr_t)xen_info);
302 	start_info_mfn = pfn_to_mfn(pfn);
303 
304 	/*
305 	 * XXPV: cpu hotplug can hold this under a xenbus watch. Are we safe
306 	 * wrt xenbus being suspended here?
307 	 */
308 	mutex_enter(&cpu_lock);
309 
310 	/*
311 	 * Suspend must be done on vcpu 0, as no context for other CPUs is
312 	 * saved.
313 	 *
314 	 * XXPV - add to taskq API ?
315 	 */
316 	thread_affinity_set(curthread, 0);
317 	kpreempt_disable();
318 
319 	SUSPEND_DEBUG("xen_start_migrate\n");
320 	xen_start_migrate();
321 	if (ncpus > 1)
322 		suspend_cpus();
323 
324 	/*
325 	 * We can grab the ec_lock as it's a spinlock with a high SPL. Hence
326 	 * any holder would have dropped it to get through suspend_cpus().
327 	 */
328 	mutex_enter(&ec_lock);
329 
330 	/*
331 	 * From here on in, we can't take locks.
332 	 */
333 	SUSPEND_DEBUG("ec_suspend\n");
334 	ec_suspend();
335 	SUSPEND_DEBUG("gnttab_suspend\n");
336 	gnttab_suspend();
337 
338 	flags = intr_clear();
339 
340 	xpv_time_suspend();
341 
342 	/*
343 	 * Currently, the hypervisor incorrectly fails to bring back
344 	 * powered-down VCPUs.  Thus we need to record any powered-down VCPUs
345 	 * to prevent any attempts to operate on them.  But we have to do this
346 	 * *after* the very first time we do ec_suspend().
347 	 */
348 	for (i = 1; i < ncpus; i++) {
349 		if (cpu[i] == NULL)
350 			continue;
351 
352 		if (cpu_get_state(cpu[i]) == P_POWEROFF)
353 			CPUSET_ATOMIC_ADD(cpu_suspend_lost_set, i);
354 	}
355 
356 	/*
357 	 * The dom0 save/migrate code doesn't automatically translate
358 	 * these into PFNs, but expects them to be, so we do it here.
359 	 * We don't use mfn_to_pfn() because so many OS services have
360 	 * been disabled at this point.
361 	 */
362 	xen_info->store_mfn = mfn_to_pfn_mapping[xen_info->store_mfn];
363 	xen_info->console.domU.mfn =
364 	    mfn_to_pfn_mapping[xen_info->console.domU.mfn];
365 
366 	if (CPU->cpu_m.mcpu_vcpu_info->evtchn_upcall_mask == 0) {
367 		prom_printf("xen_suspend_domain(): "
368 		    "CPU->cpu_m.mcpu_vcpu_info->evtchn_upcall_mask not set\n");
369 		(void) HYPERVISOR_shutdown(SHUTDOWN_crash);
370 	}
371 
372 	if (HYPERVISOR_update_va_mapping((uintptr_t)HYPERVISOR_shared_info,
373 	    0, UVMF_INVLPG)) {
374 		prom_printf("xen_suspend_domain(): "
375 		    "HYPERVISOR_update_va_mapping() failed\n");
376 		(void) HYPERVISOR_shutdown(SHUTDOWN_crash);
377 	}
378 
379 	SUSPEND_DEBUG("HYPERVISOR_suspend\n");
380 
381 	/*
382 	 * At this point we suspend and sometime later resume.
383 	 */
384 	if (HYPERVISOR_suspend(start_info_mfn)) {
385 		prom_printf("xen_suspend_domain(): "
386 		    "HYPERVISOR_suspend() failed\n");
387 		(void) HYPERVISOR_shutdown(SHUTDOWN_crash);
388 	}
389 
390 	/*
391 	 * Point HYPERVISOR_shared_info to its new value.
392 	 */
393 	if (HYPERVISOR_update_va_mapping((uintptr_t)HYPERVISOR_shared_info,
394 	    xen_info->shared_info | PT_NOCONSIST | PT_VALID | PT_WRITABLE,
395 	    UVMF_INVLPG))
396 		(void) HYPERVISOR_shutdown(SHUTDOWN_crash);
397 
398 	if (xen_info->nr_pages != mfn_count) {
399 		prom_printf("xen_suspend_domain(): number of pages"
400 		    " changed, was 0x%lx, now 0x%lx\n", mfn_count,
401 		    xen_info->nr_pages);
402 		(void) HYPERVISOR_shutdown(SHUTDOWN_crash);
403 	}
404 
405 	xpv_time_resume();
406 
407 	cached_max_mfn = 0;
408 
409 	SUSPEND_DEBUG("gnttab_resume\n");
410 	gnttab_resume();
411 
412 	/* XXPV: add a note that this must be lockless. */
413 	SUSPEND_DEBUG("ec_resume\n");
414 	ec_resume();
415 
416 	intr_restore(flags);
417 
418 	if (ncpus > 1)
419 		resume_cpus();
420 
421 	mutex_exit(&ec_lock);
422 	xen_end_migrate();
423 	mutex_exit(&cpu_lock);
424 
425 	/*
426 	 * Now we can take locks again.
427 	 */
428 
429 	/*
430 	 * Force the tick value used for tv_nsec in hres_tick() to be up to
431 	 * date. rtcsync() will reset the hrestime value appropriately.
432 	 */
433 	hres_last_tick = xpv_gethrtime();
434 
435 	/*
436 	 * XXPV: we need to have resumed the CPUs since this takes locks, but
437 	 * can remote CPUs see bad state? Presumably yes. Should probably nest
438 	 * taking of todlock inside of cpu_lock, or vice versa, then provide an
439 	 * unlocked version.  Probably need to call clkinitf to reset cpu freq
440 	 * and re-calibrate if we migrated to a different speed cpu.  Also need
441 	 * to make a (re)init_cpu_info call to update processor info structs
442 	 * and device tree info.  That remains to be written at the moment.
443 	 */
444 	rtcsync();
445 
446 	rebuild_mfn_list();
447 
448 	SUSPEND_DEBUG("xenbus_resume\n");
449 	xenbus_resume();
450 	SUSPEND_DEBUG("xenbus_resume_devices\n");
451 	xen_resume_devices();
452 
453 	thread_affinity_clear(curthread);
454 	kpreempt_enable();
455 
456 	SUSPEND_DEBUG("finished xen_suspend_domain\n");
457 	cmn_err(CE_NOTE, "domain restore/migrate completed");
458 }
459 
460 /*ARGSUSED*/
461 int
462 xen_debug_handler(void *arg)
463 {
464 	debug_enter("External debug event received");
465 
466 	/*
467 	 * If we've not got KMDB loaded, output some stuff difficult to capture
468 	 * from a domain core.
469 	 */
470 	if (!(boothowto & RB_DEBUG)) {
471 		shared_info_t *si = HYPERVISOR_shared_info;
472 		int i;
473 
474 		prom_printf("evtchn_pending [ ");
475 		for (i = 0; i < 8; i++)
476 			prom_printf("%lx ", si->evtchn_pending[i]);
477 		prom_printf("]\nevtchn_mask [ ");
478 		for (i = 0; i < 8; i++)
479 			prom_printf("%lx ", si->evtchn_mask[i]);
480 		prom_printf("]\n");
481 
482 		for (i = 0; i < ncpus; i++) {
483 			vcpu_info_t *vcpu = &si->vcpu_info[i];
484 			if (cpu[i] == NULL)
485 				continue;
486 			prom_printf("CPU%d pending %d mask %d sel %lx\n",
487 			    i, vcpu->evtchn_upcall_pending,
488 			    vcpu->evtchn_upcall_mask,
489 			    vcpu->evtchn_pending_sel);
490 		}
491 	}
492 
493 	return (0);
494 }
495 
496 /*ARGSUSED*/
497 static void
498 xen_sysrq_handler(struct xenbus_watch *watch, const char **vec,
499     unsigned int len)
500 {
501 	xenbus_transaction_t xbt;
502 	char key = '\0';
503 	int ret;
504 
505 retry:
506 	if (xenbus_transaction_start(&xbt)) {
507 		cmn_err(CE_WARN, "failed to start sysrq transaction");
508 		return;
509 	}
510 
511 	if ((ret = xenbus_scanf(xbt, "control", "sysrq", "%c", &key)) != 0) {
512 		/*
513 		 * ENOENT happens in response to our own xenbus_rm.
514 		 * XXPV - this happens spuriously on boot?
515 		 */
516 		if (ret != ENOENT)
517 			cmn_err(CE_WARN, "failed to read sysrq: %d", ret);
518 		goto out;
519 	}
520 
521 	if ((ret = xenbus_rm(xbt, "control", "sysrq")) != 0) {
522 		cmn_err(CE_WARN, "failed to reset sysrq: %d", ret);
523 		goto out;
524 	}
525 
526 	if (xenbus_transaction_end(xbt, 0) == EAGAIN)
527 		goto retry;
528 
529 	/*
530 	 * Somewhat arbitrary - on Linux this means 'reboot'. We could just
531 	 * accept any key, but this might increase the risk of sending a
532 	 * harmless sysrq to the wrong domain...
533 	 */
534 	if (key == 'b')
535 		(void) xen_debug_handler(NULL);
536 	else
537 		cmn_err(CE_WARN, "Ignored sysrq %c", key);
538 	return;
539 
540 out:
541 	(void) xenbus_transaction_end(xbt, 1);
542 }
543 
544 taskq_t *xen_shutdown_tq;
545 
/*
 * Codes for the requests that can arrive through "control/shutdown".
 * Every code from 0 up to (but excluding) SHUTDOWN_MAX is an index into
 * cmd_strings[]; SHUTDOWN_INVALID means that no request was recognised.
 */
#define	SHUTDOWN_INVALID	(-1)
#define	SHUTDOWN_POWEROFF	0
#define	SHUTDOWN_REBOOT		1
#define	SHUTDOWN_SUSPEND	2
#define	SHUTDOWN_HALT		3
#define	SHUTDOWN_MAX		4

/* Delay, in seconds, before xen_dirty_shutdown() is run. */
#define	SHUTDOWN_TIMEOUT_SECS (60 * 5)

/*
 * Request names, indexed by shutdown code.  The designated initializers
 * keep each string attached to its code regardless of ordering.
 */
static const char *cmd_strings[SHUTDOWN_MAX] = {
	[SHUTDOWN_POWEROFF] = "poweroff",
	[SHUTDOWN_REBOOT] = "reboot",
	[SHUTDOWN_SUSPEND] = "suspend",
	[SHUTDOWN_HALT] = "halt"
};
561 
562 static void
563 xen_dirty_shutdown(void *arg)
564 {
565 	int cmd = (uintptr_t)arg;
566 
567 	cmn_err(CE_WARN, "Externally requested shutdown failed or "
568 	    "timed out.\nShutting down.\n");
569 
570 	switch (cmd) {
571 	case SHUTDOWN_HALT:
572 	case SHUTDOWN_POWEROFF:
573 		(void) kadmin(A_SHUTDOWN, AD_POWEROFF, NULL, kcred);
574 		break;
575 	case SHUTDOWN_REBOOT:
576 		(void) kadmin(A_REBOOT, AD_BOOT, NULL, kcred);
577 		break;
578 	}
579 }
580 
581 static void
582 xen_shutdown(void *arg)
583 {
584 	nvlist_t *attr_list = NULL;
585 	sysevent_t *event = NULL;
586 	sysevent_id_t eid;
587 	int cmd = (uintptr_t)arg;
588 	int err;
589 
590 	ASSERT(cmd > SHUTDOWN_INVALID && cmd < SHUTDOWN_MAX);
591 
592 	if (cmd == SHUTDOWN_SUSPEND) {
593 		xen_suspend_domain();
594 		return;
595 	}
596 
597 	err = nvlist_alloc(&attr_list, NV_UNIQUE_NAME, KM_SLEEP);
598 	if (err != DDI_SUCCESS)
599 		goto failure;
600 
601 	err = nvlist_add_string(attr_list, "shutdown", cmd_strings[cmd]);
602 	if (err != DDI_SUCCESS)
603 		goto failure;
604 
605 	if ((event = sysevent_alloc("EC_xpvsys", "control", "SUNW:kern:xpv",
606 	    SE_SLEEP)) == NULL)
607 		goto failure;
608 	(void) sysevent_attach_attributes(event,
609 	    (sysevent_attr_list_t *)attr_list);
610 
611 	err = log_sysevent(event, SE_SLEEP, &eid);
612 
613 	sysevent_detach_attributes(event);
614 	sysevent_free(event);
615 
616 	if (err != 0)
617 		goto failure;
618 
619 	(void) timeout(xen_dirty_shutdown, arg,
620 	    SHUTDOWN_TIMEOUT_SECS * drv_usectohz(MICROSEC));
621 
622 	nvlist_free(attr_list);
623 	return;
624 
625 failure:
626 	if (attr_list != NULL)
627 		nvlist_free(attr_list);
628 	xen_dirty_shutdown(arg);
629 }
630 
631 /*ARGSUSED*/
632 static void
633 xen_shutdown_handler(struct xenbus_watch *watch, const char **vec,
634 	unsigned int len)
635 {
636 	char *str;
637 	xenbus_transaction_t xbt;
638 	int err, shutdown_code = SHUTDOWN_INVALID;
639 	unsigned int slen;
640 
641 again:
642 	err = xenbus_transaction_start(&xbt);
643 	if (err)
644 		return;
645 	if (xenbus_read(xbt, "control", "shutdown", (void *)&str, &slen)) {
646 		(void) xenbus_transaction_end(xbt, 1);
647 		return;
648 	}
649 
650 	SUSPEND_DEBUG("%d: xen_shutdown_handler: \"%s\"\n", CPU->cpu_id, str);
651 
652 	/*
653 	 * If this is a watch fired from our write below, check out early to
654 	 * avoid an infinite loop.
655 	 */
656 	if (strcmp(str, "") == 0) {
657 		(void) xenbus_transaction_end(xbt, 0);
658 		kmem_free(str, slen);
659 		return;
660 	} else if (strcmp(str, "poweroff") == 0) {
661 		shutdown_code = SHUTDOWN_POWEROFF;
662 	} else if (strcmp(str, "reboot") == 0) {
663 		shutdown_code = SHUTDOWN_REBOOT;
664 	} else if (strcmp(str, "suspend") == 0) {
665 		shutdown_code = SHUTDOWN_SUSPEND;
666 	} else if (strcmp(str, "halt") == 0) {
667 		shutdown_code = SHUTDOWN_HALT;
668 	} else {
669 		printf("Ignoring shutdown request: %s\n", str);
670 	}
671 
672 	/*
673 	 * XXPV	Should we check the value of xenbus_write() too, or are all
674 	 *	errors automatically folded into xenbus_transaction_end() ??
675 	 */
676 	(void) xenbus_write(xbt, "control", "shutdown", "");
677 	err = xenbus_transaction_end(xbt, 0);
678 	if (err == EAGAIN) {
679 		SUSPEND_DEBUG("%d: trying again\n", CPU->cpu_id);
680 		kmem_free(str, slen);
681 		goto again;
682 	}
683 
684 	kmem_free(str, slen);
685 	if (shutdown_code != SHUTDOWN_INVALID) {
686 		(void) taskq_dispatch(xen_shutdown_tq, xen_shutdown,
687 		    (void *)(intptr_t)shutdown_code, 0);
688 	}
689 }
690 
691 static struct xenbus_watch shutdown_watch;
692 static struct xenbus_watch sysrq_watch;
693 
694 void
695 xen_late_startup(void)
696 {
697 	if (!DOMAIN_IS_INITDOMAIN(xen_info)) {
698 		xen_shutdown_tq = taskq_create("shutdown_taskq", 1,
699 		    maxclsyspri - 1, 1, 1, TASKQ_PREPOPULATE);
700 		shutdown_watch.node = "control/shutdown";
701 		shutdown_watch.callback = xen_shutdown_handler;
702 		if (register_xenbus_watch(&shutdown_watch))
703 			cmn_err(CE_WARN, "Failed to set shutdown watcher");
704 
705 		sysrq_watch.node = "control/sysrq";
706 		sysrq_watch.callback = xen_sysrq_handler;
707 		if (register_xenbus_watch(&sysrq_watch))
708 			cmn_err(CE_WARN, "Failed to set sysrq watcher");
709 	}
710 	balloon_init(xen_info->nr_pages);
711 }
712 
#ifdef DEBUG
#define	XEN_PRINTF_BUFSIZE	1024

/* Formatting buffer shared by all callers of xen_printf(). */
char xen_printf_buffer[XEN_PRINTF_BUFSIZE];

/*
 * printf-style output written straight to the hypervisor console with
 * HYPERVISOR_console_io().  For DomU it only works when running on a xen
 * hypervisor built with debug on.  No I/O ring interaction is needed, so
 * it can be used at any time.
 *
 * The message is formatted into xen_printf_buffer and is truncated to fit
 * in it.
 */
/*PRINTFLIKE1*/
void
xen_printf(const char *fmt, ...)
{
	va_list	args;

	va_start(args, fmt);
	(void) vsnprintf(xen_printf_buffer, sizeof (xen_printf_buffer),
	    fmt, args);
	va_end(args);

	(void) HYPERVISOR_console_io(CONSOLEIO_write,
	    strlen(xen_printf_buffer), xen_printf_buffer);
}
#else
/*
 * Non-DEBUG kernels: xen_printf() does nothing.
 */
/*ARGSUSED*/
void
xen_printf(const char *fmt, ...)
{
}
#endif	/* DEBUG */
742 
/*
 * Determine helpful version information.
 *
 * (And leave a copy around in the data segment so we can look
 * at them later with e.g. kmdb.)
 *
 * Every member is a string allocated by xen_version(), or NULL when it
 * has not been filled in.
 */
struct xenver {
	char *xv_ver;			/* "major.minor" + extraversion */
	char *xv_chgset;		/* XENVER_changeset */
	char *xv_compiler;		/* XENVER_compile_info: compiler */
	char *xv_compile_date;		/* XENVER_compile_info: date */
	char *xv_compile_by;		/* XENVER_compile_info: user */
	char *xv_compile_domain;	/* XENVER_compile_info: domain */
	char *xv_caps;			/* XENVER_capabilities */
};

struct xenver xenver;
758 
759 static char *
760 sprintf_alloc(const char *fmt, ...)
761 {
762 	va_list ap;
763 	size_t len;
764 	char *p;
765 
766 	va_start(ap, fmt);
767 	len = 1 + vsnprintf(NULL, 0, fmt, ap);
768 	p = kmem_alloc(len, KM_SLEEP);
769 	(void) vsnprintf(p, len, fmt, ap);
770 	va_end(ap);
771 	return (p);
772 }
773 
774 void
775 xen_version(void)
776 {
777 	static const char strfmt[] = "%s";
778 	static const char xenver_sun[] = "3.0.4-1-xvm";  /* XXPV */
779 	union {
780 		xen_extraversion_t xver;
781 		xen_changeset_info_t chgset;
782 		xen_compile_info_t build;
783 		xen_capabilities_info_t caps;
784 	} data, *src = &data;
785 
786 	ulong_t ver = HYPERVISOR_xen_version(XENVER_version, 0);
787 
788 	if (HYPERVISOR_xen_version(XENVER_extraversion, src) == 0) {
789 		((char *)(src->xver))[sizeof (src->xver) - 1] = '\0';
790 	} else
791 		((char *)(src->xver))[0] = '\0';
792 
793 	xenver.xv_ver = sprintf_alloc("%lu.%lu%s",
794 	    BITX(ver, 31, 16), BITX(ver, 15, 0), src->xver);
795 
796 	if (HYPERVISOR_xen_version(XENVER_changeset, src) == 0) {
797 		((char *)(src->chgset))[sizeof (src->chgset) - 1] = '\0';
798 		xenver.xv_chgset = sprintf_alloc(strfmt, src->chgset);
799 	}
800 
801 	cmn_err(CE_CONT, "?xen v%s chgset '%s'\n",
802 	    xenver.xv_ver, xenver.xv_chgset);
803 
804 	/*
805 	 * XXPV - Solaris guests currently require special version of
806 	 * the hypervisor from Sun to function properly called "3.0.4-1-xvm".
807 	 * This version is based on "3.0.4-1" plus changes from
808 	 * Sun that are a work-in-progress.
809 	 *
810 	 * This version check will disappear after appropriate fixes
811 	 * are accepted upstream.
812 	 */
813 	if (strcmp(xenver.xv_ver, xenver_sun) != 0) {
814 		cmn_err(CE_WARN, "Found xen v%s but need xen v%s",
815 		    xenver.xv_ver, xenver_sun);
816 		cmn_err(CE_WARN, "The kernel may not function correctly");
817 	}
818 
819 	if (HYPERVISOR_xen_version(XENVER_compile_info, src) == 0) {
820 		xenver.xv_compiler = sprintf_alloc(strfmt,
821 		    data.build.compiler);
822 		xenver.xv_compile_date = sprintf_alloc(strfmt,
823 		    data.build.compile_date);
824 		xenver.xv_compile_by = sprintf_alloc(strfmt,
825 		    data.build.compile_by);
826 		xenver.xv_compile_domain = sprintf_alloc(strfmt,
827 		    data.build.compile_domain);
828 	}
829 
830 	/*
831 	 * Capabilities are a set of space separated ascii strings
832 	 * e.g. 'xen-3.1-x86_32p' or 'hvm-3.2-x86_64'
833 	 */
834 	if (HYPERVISOR_xen_version(XENVER_capabilities, src) == 0) {
835 		((char *)(src->caps))[sizeof (src->caps) - 1] = '\0';
836 		xenver.xv_caps = sprintf_alloc(strfmt, src->caps);
837 	}
838 }
839 
840 /*
841  * Miscellaneous hypercall wrappers with slightly more verbose diagnostics.
842  */
843 
844 void
845 xen_set_gdt(ulong_t *frame_list, int entries)
846 {
847 	int err;
848 	if ((err = HYPERVISOR_set_gdt(frame_list, entries)) != 0) {
849 		/*
850 		 * X_EINVAL:	reserved entry or bad frames
851 		 * X_EFAULT:	bad address
852 		 */
853 		panic("xen_set_gdt(%p, %d): error %d",
854 		    (void *)frame_list, entries, -(int)err);
855 	}
856 }
857 
858 void
859 xen_set_ldt(user_desc_t *ldt, uint_t nsels)
860 {
861 	struct mmuext_op	op;
862 	long			err;
863 
864 	op.cmd = MMUEXT_SET_LDT;
865 	op.arg1.linear_addr = (uintptr_t)ldt;
866 	op.arg2.nr_ents = nsels;
867 
868 	if ((err = HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF)) != 0) {
869 		panic("xen_set_ldt(%p, %d): error %d",
870 		    (void *)ldt, nsels, -(int)err);
871 	}
872 }
873 
874 void
875 xen_stack_switch(ulong_t ss, ulong_t esp)
876 {
877 	long err;
878 
879 	if ((err = HYPERVISOR_stack_switch(ss, esp)) != 0) {
880 		/*
881 		 * X_EPERM:	bad selector
882 		 */
883 		panic("xen_stack_switch(%lx, %lx): error %d", ss, esp,
884 		    -(int)err);
885 	}
886 }
887 
888 long
889 xen_set_trap_table(trap_info_t *table)
890 {
891 	long err;
892 
893 	if ((err = HYPERVISOR_set_trap_table(table)) != 0) {
894 		/*
895 		 * X_EFAULT:	bad address
896 		 * X_EPERM:	bad selector
897 		 */
898 		panic("xen_set_trap_table(%p): error %d", (void *)table,
899 		    -(int)err);
900 	}
901 	return (err);
902 }
903 
904 #if defined(__amd64)
905 void
906 xen_set_segment_base(int reg, ulong_t value)
907 {
908 	long err;
909 
910 	if ((err = HYPERVISOR_set_segment_base(reg, value)) != 0) {
911 		/*
912 		 * X_EFAULT:	bad address
913 		 * X_EINVAL:	bad type
914 		 */
915 		panic("xen_set_segment_base(%d, %lx): error %d",
916 		    reg, value, -(int)err);
917 	}
918 }
919 #endif	/* __amd64 */
920 
921 /*
922  * Translate a hypervisor errcode to a Solaris error code.
923  */
924 int
925 xen_xlate_errcode(int error)
926 {
927 	switch (-error) {
928 
929 	/*
930 	 * Translate hypervisor errno's into native errno's
931 	 */
932 
933 #define	CASE(num)	case X_##num: error = num; break
934 
935 	CASE(EPERM);	CASE(ENOENT);	CASE(ESRCH);
936 	CASE(EINTR);	CASE(EIO);	CASE(ENXIO);
937 	CASE(E2BIG);	CASE(ENOMEM);	CASE(EACCES);
938 	CASE(EFAULT);	CASE(EBUSY);	CASE(EEXIST);
939 	CASE(ENODEV);	CASE(EISDIR);	CASE(EINVAL);
940 	CASE(ENOSPC);	CASE(ESPIPE);	CASE(EROFS);
941 	CASE(ENOSYS);	CASE(ENOTEMPTY); CASE(EISCONN);
942 	CASE(ENODATA);
943 
944 #undef CASE
945 
946 	default:
947 		panic("xen_xlate_errcode: unknown error %d", error);
948 	}
949 
950 	return (error);
951 }
952 
953 /*
954  * Raise PS_IOPL on current vcpu to user level.
955  * Caller responsible for preventing kernel preemption.
956  */
957 void
958 xen_enable_user_iopl(void)
959 {
960 	physdev_set_iopl_t set_iopl;
961 	set_iopl.iopl = 3;		/* user ring 3 */
962 	(void) HYPERVISOR_physdev_op(PHYSDEVOP_set_iopl, &set_iopl);
963 }
964 
965 /*
966  * Drop PS_IOPL on current vcpu to kernel level
967  */
968 void
969 xen_disable_user_iopl(void)
970 {
971 	physdev_set_iopl_t set_iopl;
972 	set_iopl.iopl = 1;		/* kernel pseudo ring 1 */
973 	(void) HYPERVISOR_physdev_op(PHYSDEVOP_set_iopl, &set_iopl);
974 }
975 
976 int
977 xen_gdt_setprot(cpu_t *cp, uint_t prot)
978 {
979 	int err;
980 #if defined(__amd64)
981 	int pt_bits = PT_VALID;
982 	if (prot & PROT_WRITE)
983 		pt_bits |= PT_WRITABLE;
984 #endif
985 
986 	if ((err = as_setprot(&kas, (caddr_t)cp->cpu_gdt,
987 	    MMU_PAGESIZE, prot)) != 0)
988 		goto done;
989 
990 #if defined(__amd64)
991 	err = xen_kpm_page(mmu_btop(cp->cpu_m.mcpu_gdtpa), pt_bits);
992 #endif
993 
994 done:
995 	if (err) {
996 		cmn_err(CE_WARN, "cpu%d: xen_gdt_setprot(%s) failed: error %d",
997 		    cp->cpu_id, (prot & PROT_WRITE) ? "writable" : "read-only",
998 		    err);
999 	}
1000 
1001 	return (err);
1002 }
1003 
1004 int
1005 xen_ldt_setprot(user_desc_t *ldt, size_t lsize, uint_t prot)
1006 {
1007 	int err;
1008 	caddr_t	lva = (caddr_t)ldt;
1009 #if defined(__amd64)
1010 	int pt_bits = PT_VALID;
1011 	pgcnt_t npgs;
1012 	if (prot & PROT_WRITE)
1013 		pt_bits |= PT_WRITABLE;
1014 #endif	/* __amd64 */
1015 
1016 	if ((err = as_setprot(&kas, (caddr_t)ldt, lsize, prot)) != 0)
1017 		goto done;
1018 
1019 #if defined(__amd64)
1020 
1021 	ASSERT(IS_P2ALIGNED(lsize, PAGESIZE));
1022 	npgs = mmu_btop(lsize);
1023 	while (npgs--) {
1024 		if ((err = xen_kpm_page(hat_getpfnum(kas.a_hat, lva),
1025 		    pt_bits)) != 0)
1026 			break;
1027 		lva += PAGESIZE;
1028 	}
1029 #endif	/* __amd64 */
1030 
1031 done:
1032 	if (err) {
1033 		cmn_err(CE_WARN, "xen_ldt_setprot(%p, %s) failed: error %d",
1034 		    (void *)lva,
1035 		    (prot & PROT_WRITE) ? "writable" : "read-only", err);
1036 	}
1037 
1038 	return (err);
1039 }
1040