xref: /titanic_51/usr/src/uts/i86xpv/os/xen_machdep.c (revision e79c98e6c943cb3032f272714ff4ce6137d40394)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 #pragma ident	"%Z%%M%	%I%	%E% SMI"
28 
29 /* derived from netbsd's xen_machdep.c 1.1.2.1 */
30 
31 /*
32  *
33  * Copyright (c) 2004 Christian Limpach.
34  * All rights reserved.
35  *
36  * Redistribution and use in source and binary forms, with or without
37  * modification, are permitted provided that the following conditions
38  * are met:
39  * 1. Redistributions of source code must retain the above copyright
40  *    notice, this list of conditions and the following disclaimer.
41  * 2. Redistributions in binary form must reproduce the above copyright
42  *    notice, this list of conditions and the following disclaimer in the
43  *    documentation and/or other materials provided with the distribution.
44  * 3. This section intentionally left blank.
45  * 4. The name of the author may not be used to endorse or promote products
46  *    derived from this software without specific prior written permission.
47  *
48  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
49  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
50  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
51  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
52  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
53  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
54  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
55  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
56  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
57  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
58  */
59 /*
60  * Section 3 of the above license was updated in response to bug 6379571.
61  */
62 
63 #include <sys/ctype.h>
64 #include <sys/types.h>
65 #include <sys/cmn_err.h>
66 #include <sys/trap.h>
67 #include <sys/segments.h>
68 #include <sys/hypervisor.h>
69 #include <sys/xen_mmu.h>
70 #include <sys/machsystm.h>
71 #include <sys/promif.h>
72 #include <sys/bootconf.h>
73 #include <sys/bootinfo.h>
74 #include <sys/cpr.h>
75 #include <sys/taskq.h>
76 #include <sys/uadmin.h>
77 #include <sys/evtchn_impl.h>
78 #include <sys/archsystm.h>
79 #include <xen/sys/xenbus_impl.h>
80 #include <sys/mach_mmu.h>
81 #include <vm/hat_i86.h>
82 #include <sys/gnttab.h>
83 #include <sys/reboot.h>
84 #include <sys/stack.h>
85 #include <sys/clock.h>
86 #include <sys/bitmap.h>
87 #include <sys/processor.h>
88 #include <sys/xen_errno.h>
89 #include <sys/xpv_panic.h>
90 #include <sys/smp_impldefs.h>
91 #include <sys/cpu.h>
92 #include <sys/balloon_impl.h>
93 #include <sys/ddi.h>
94 
95 #ifdef DEBUG
96 #define	SUSPEND_DEBUG if (xen_suspend_debug) xen_printf
97 #else
98 #define	SUSPEND_DEBUG(...)
99 #endif
100 
101 int cpr_debug;
102 cpuset_t cpu_suspend_lost_set;
103 static int xen_suspend_debug;
104 
/*
 * Hypervisor version bookkeeping.
 *
 * The version details are kept in the data segment so that they can be
 * examined after the fact (for instance with kmdb).  Two snapshots are
 * maintained; the values below select between them.
 */

typedef enum xen_version {
	XENVER_BOOT_IDX = 0,	/* snapshot taken at startup */
	XENVER_CURRENT_IDX = 1	/* snapshot refreshed after a resume */
} xen_version_t;
116 
117 struct xenver {
118 	ulong_t xv_major;
119 	ulong_t xv_minor;
120 	ulong_t xv_revision;
121 	xen_extraversion_t xv_ver;
122 	ulong_t xv_is_xvm;
123 	xen_changeset_info_t xv_chgset;
124 	xen_compile_info_t xv_build;
125 	xen_capabilities_info_t xv_caps;
126 } xenver[2];
127 
128 #define	XENVER_BOOT(m)	(xenver[XENVER_BOOT_IDX].m)
129 #define	XENVER_CURRENT(m)	(xenver[XENVER_CURRENT_IDX].m)
130 
131 /*
132  * Update the xenver data. We maintain two copies, boot and
133  * current. If we are setting the boot, then also set current.
134  */
135 static void
136 xen_set_version(xen_version_t idx)
137 {
138 	ulong_t ver;
139 
140 	bzero(&xenver[idx], sizeof (xenver[idx]));
141 
142 	ver = HYPERVISOR_xen_version(XENVER_version, 0);
143 
144 	xenver[idx].xv_major = BITX(ver, 31, 16);
145 	xenver[idx].xv_minor = BITX(ver, 15, 0);
146 
147 	(void) HYPERVISOR_xen_version(XENVER_extraversion, &xenver[idx].xv_ver);
148 
149 	/*
150 	 * The revision is buried in the extraversion information that is
151 	 * maintained by the hypervisor. For our purposes we expect that
152 	 * the revision number is:
153 	 * 	- the second character in the extraversion information
154 	 *	- one character long
155 	 *	- numeric digit
156 	 * If it isn't then we can't extract the revision and we leave it
157 	 * set to 0.
158 	 */
159 	if (strlen(xenver[idx].xv_ver) > 1 && isdigit(xenver[idx].xv_ver[1]))
160 		xenver[idx].xv_revision = xenver[idx].xv_ver[1] - '0';
161 	else
162 		cmn_err(CE_WARN, "Cannot extract revision on this hypervisor "
163 		    "version: v%s, unexpected version format",
164 		    xenver[idx].xv_ver);
165 
166 	xenver[idx].xv_is_xvm = 0;
167 
168 	if (strlen(xenver[idx].xv_ver) >= 4 &&
169 	    strncmp(xenver[idx].xv_ver + strlen(xenver[idx].xv_ver) - 4,
170 	    "-xvm", 4) == 0)
171 		xenver[idx].xv_is_xvm = 1;
172 
173 	(void) HYPERVISOR_xen_version(XENVER_changeset,
174 	    &xenver[idx].xv_chgset);
175 
176 	(void) HYPERVISOR_xen_version(XENVER_compile_info,
177 	    &xenver[idx].xv_build);
178 	/*
179 	 * Capabilities are a set of space separated ascii strings
180 	 * e.g. 'xen-3.1-x86_32p' or 'hvm-3.2-x86_64'
181 	 */
182 	(void) HYPERVISOR_xen_version(XENVER_capabilities,
183 	    &xenver[idx].xv_caps);
184 
185 	cmn_err(CE_CONT, "?v%lu.%lu%s chgset '%s'\n", xenver[idx].xv_major,
186 	    xenver[idx].xv_minor, xenver[idx].xv_ver, xenver[idx].xv_chgset);
187 
188 	if (idx == XENVER_BOOT_IDX)
189 		bcopy(&xenver[XENVER_BOOT_IDX], &xenver[XENVER_CURRENT_IDX],
190 		    sizeof (xenver[XENVER_BOOT_IDX]));
191 }
192 
/* Which requirement xen_hypervisor_supports_solaris() should verify. */
typedef enum xen_hypervisor_check {
	XEN_RUN_CHECK = 0,	/* can we run on this hypervisor? */
	XEN_SUSPEND_CHECK = 1	/* can we suspend/resume on it? */
} xen_hypervisor_check_t;
197 
198 /*
199  * To run the hypervisor must be 3.0.4 or better. To suspend/resume
200  * we need 3.0.4 or better and if it is 3.0.4. then it must be provided
201  * by the Solaris xVM project.
202  * Checking can be disabled for testing purposes by setting the
203  * xen_suspend_debug variable.
204  */
205 static int
206 xen_hypervisor_supports_solaris(xen_hypervisor_check_t check)
207 {
208 	if (xen_suspend_debug == 1)
209 		return (1);
210 	if (XENVER_CURRENT(xv_major) < 3)
211 		return (0);
212 	if (XENVER_CURRENT(xv_major) > 3)
213 		return (1);
214 	if (XENVER_CURRENT(xv_minor) > 0)
215 		return (1);
216 	if (XENVER_CURRENT(xv_revision) < 4)
217 		return (0);
218 	if (check == XEN_SUSPEND_CHECK && XENVER_CURRENT(xv_revision) == 4 &&
219 	    !XENVER_CURRENT(xv_is_xvm))
220 		return (0);
221 
222 	return (1);
223 }
224 
225 /*
226  * If the hypervisor is -xvm, or 3.1.2 or higher, we don't need the
227  * workaround.
228  */
229 static void
230 xen_pte_workaround(void)
231 {
232 #if defined(__amd64)
233 	extern int pt_kern;
234 
235 	if (XENVER_CURRENT(xv_major) != 3)
236 		return;
237 	if (XENVER_CURRENT(xv_minor) > 1)
238 		return;
239 	if (XENVER_CURRENT(xv_minor) == 1 &&
240 	    XENVER_CURRENT(xv_revision) > 1)
241 		return;
242 	if (XENVER_CURRENT(xv_is_xvm))
243 		return;
244 
245 	pt_kern = PT_USER;
246 #endif
247 }
248 
249 void
250 xen_set_callback(void (*func)(void), uint_t type, uint_t flags)
251 {
252 	struct callback_register cb;
253 
254 	bzero(&cb, sizeof (cb));
255 #if defined(__amd64)
256 	cb.address = (ulong_t)func;
257 #elif defined(__i386)
258 	cb.address.cs = KCS_SEL;
259 	cb.address.eip = (ulong_t)func;
260 #endif
261 	cb.type = type;
262 	cb.flags = flags;
263 
264 	/*
265 	 * XXPV always ignore return value for NMI
266 	 */
267 	if (HYPERVISOR_callback_op(CALLBACKOP_register, &cb) != 0 &&
268 	    type != CALLBACKTYPE_nmi)
269 		panic("HYPERVISOR_callback_op failed");
270 }
271 
272 void
273 xen_init_callbacks(void)
274 {
275 	/*
276 	 * register event (interrupt) handler.
277 	 */
278 	xen_set_callback(xen_callback, CALLBACKTYPE_event, 0);
279 
280 	/*
281 	 * failsafe handler.
282 	 */
283 	xen_set_callback(xen_failsafe_callback, CALLBACKTYPE_failsafe,
284 	    CALLBACKF_mask_events);
285 
286 	/*
287 	 * NMI handler.
288 	 */
289 	xen_set_callback(nmiint, CALLBACKTYPE_nmi, 0);
290 
291 	/*
292 	 * system call handler
293 	 * XXPV move to init_cpu_syscall?
294 	 */
295 #if defined(__amd64)
296 	xen_set_callback(sys_syscall, CALLBACKTYPE_syscall,
297 	    CALLBACKF_mask_events);
298 #endif	/* __amd64 */
299 }
300 
301 
302 /*
303  * cmn_err() followed by a 1/4 second delay; this gives the
304  * logging service a chance to flush messages and helps avoid
305  * intermixing output from prom_printf().
306  * XXPV: doesn't exactly help us on UP though.
307  */
308 /*PRINTFLIKE2*/
309 void
310 cpr_err(int ce, const char *fmt, ...)
311 {
312 	va_list adx;
313 
314 	va_start(adx, fmt);
315 	vcmn_err(ce, fmt, adx);
316 	va_end(adx);
317 	drv_usecwait(MICROSEC >> 2);
318 }
319 
/*
 * Suspend every device below the root node; any failure is fatal.
 */
void
xen_suspend_devices(void)
{
	int err;

	SUSPEND_DEBUG("xen_suspend_devices\n");

	err = cpr_suspend_devices(ddi_root_node());
	if (err != 0)
		panic("failed to suspend devices: %d", err);
}
330 
/*
 * Resume every device below the root node; any failure is fatal.
 */
void
xen_resume_devices(void)
{
	int err;

	SUSPEND_DEBUG("xen_resume_devices\n");

	err = cpr_resume_devices(ddi_root_node(), 0);
	if (err != 0)
		panic("failed to resume devices: %d", err);
}
341 
342 /*
343  * The list of mfn pages is out of date.  Recompute it.
344  */
345 static void
346 rebuild_mfn_list(void)
347 {
348 	int i = 0;
349 	size_t sz;
350 	size_t off;
351 	pfn_t pfn;
352 
353 	SUSPEND_DEBUG("rebuild_mfn_list\n");
354 
355 	sz = ((mfn_count * sizeof (mfn_t)) + MMU_PAGEOFFSET) & MMU_PAGEMASK;
356 
357 	for (off = 0; off < sz; off += MMU_PAGESIZE) {
358 		size_t j = mmu_btop(off);
359 		if (((j * sizeof (mfn_t)) & MMU_PAGEOFFSET) == 0) {
360 			pfn = hat_getpfnum(kas.a_hat,
361 			    (caddr_t)&mfn_list_pages[j]);
362 			mfn_list_pages_page[i++] = pfn_to_mfn(pfn);
363 		}
364 
365 		pfn = hat_getpfnum(kas.a_hat, (caddr_t)mfn_list + off);
366 		mfn_list_pages[j] = pfn_to_mfn(pfn);
367 	}
368 
369 	pfn = hat_getpfnum(kas.a_hat, (caddr_t)mfn_list_pages_page);
370 	HYPERVISOR_shared_info->arch.pfn_to_mfn_frame_list_list
371 	    = pfn_to_mfn(pfn);
372 }
373 
374 static void
375 suspend_cpus(void)
376 {
377 	int i;
378 
379 	SUSPEND_DEBUG("suspend_cpus\n");
380 
381 	mp_enter_barrier();
382 
383 	for (i = 1; i < ncpus; i++) {
384 		if (!CPU_IN_SET(cpu_suspend_lost_set, i)) {
385 			SUSPEND_DEBUG("xen_vcpu_down %d\n", i);
386 			(void) xen_vcpu_down(i);
387 		}
388 
389 		mach_cpucontext_reset(cpu[i]);
390 	}
391 }
392 
393 static void
394 resume_cpus(void)
395 {
396 	int i;
397 
398 	for (i = 1; i < ncpus; i++) {
399 		if (cpu[i] == NULL)
400 			continue;
401 
402 		if (!CPU_IN_SET(cpu_suspend_lost_set, i)) {
403 			SUSPEND_DEBUG("xen_vcpu_up %d\n", i);
404 			mach_cpucontext_restore(cpu[i]);
405 			(void) xen_vcpu_up(i);
406 		}
407 	}
408 
409 	mp_leave_barrier();
410 }
411 
412 /*
413  * Top level routine to direct suspend/resume of a domain.
414  */
415 void
416 xen_suspend_domain(void)
417 {
418 	extern void rtcsync(void);
419 	extern hrtime_t hres_last_tick;
420 	mfn_t start_info_mfn;
421 	ulong_t flags;
422 	pfn_t pfn;
423 	int i;
424 
425 	/*
426 	 * Check that we are happy to suspend on this hypervisor.
427 	 */
428 	if (xen_hypervisor_supports_solaris(XEN_SUSPEND_CHECK) == 0) {
429 		cpr_err(CE_WARN, "Cannot suspend on this hypervisor "
430 		    "version: v%lu.%lu%s, need at least version v3.0.4 or "
431 		    "-xvm based hypervisor", XENVER_CURRENT(xv_major),
432 		    XENVER_CURRENT(xv_minor), XENVER_CURRENT(xv_ver));
433 		return;
434 	}
435 
436 	/*
437 	 * XXPV - Are we definitely OK to suspend by the time we've connected
438 	 * the handler?
439 	 */
440 
441 	cpr_err(CE_NOTE, "Domain suspending for save/migrate");
442 
443 	SUSPEND_DEBUG("xen_suspend_domain\n");
444 
445 	/*
446 	 * suspend interrupts and devices
447 	 * XXPV - we use suspend/resume for both save/restore domains (like sun
448 	 * cpr) and for migration.  Would be nice to know the difference if
449 	 * possible.  For save/restore where down time may be a long time, we
450 	 * may want to do more of the things that cpr does.  (i.e. notify user
451 	 * processes, shrink memory footprint for faster restore, etc.)
452 	 */
453 	xen_suspend_devices();
454 	SUSPEND_DEBUG("xenbus_suspend\n");
455 	xenbus_suspend();
456 
457 	pfn = hat_getpfnum(kas.a_hat, (caddr_t)xen_info);
458 	start_info_mfn = pfn_to_mfn(pfn);
459 
460 	/*
461 	 * XXPV: cpu hotplug can hold this under a xenbus watch. Are we safe
462 	 * wrt xenbus being suspended here?
463 	 */
464 	mutex_enter(&cpu_lock);
465 
466 	/*
467 	 * Suspend must be done on vcpu 0, as no context for other CPUs is
468 	 * saved.
469 	 *
470 	 * XXPV - add to taskq API ?
471 	 */
472 	thread_affinity_set(curthread, 0);
473 	kpreempt_disable();
474 
475 	SUSPEND_DEBUG("xen_start_migrate\n");
476 	xen_start_migrate();
477 	if (ncpus > 1)
478 		suspend_cpus();
479 
480 	/*
481 	 * We can grab the ec_lock as it's a spinlock with a high SPL. Hence
482 	 * any holder would have dropped it to get through suspend_cpus().
483 	 */
484 	mutex_enter(&ec_lock);
485 
486 	/*
487 	 * From here on in, we can't take locks.
488 	 */
489 	SUSPEND_DEBUG("ec_suspend\n");
490 	ec_suspend();
491 	SUSPEND_DEBUG("gnttab_suspend\n");
492 	gnttab_suspend();
493 
494 	flags = intr_clear();
495 
496 	xpv_time_suspend();
497 
498 	/*
499 	 * Currently, the hypervisor incorrectly fails to bring back
500 	 * powered-down VCPUs.  Thus we need to record any powered-down VCPUs
501 	 * to prevent any attempts to operate on them.  But we have to do this
502 	 * *after* the very first time we do ec_suspend().
503 	 */
504 	for (i = 1; i < ncpus; i++) {
505 		if (cpu[i] == NULL)
506 			continue;
507 
508 		if (cpu_get_state(cpu[i]) == P_POWEROFF)
509 			CPUSET_ATOMIC_ADD(cpu_suspend_lost_set, i);
510 	}
511 
512 	/*
513 	 * The dom0 save/migrate code doesn't automatically translate
514 	 * these into PFNs, but expects them to be, so we do it here.
515 	 * We don't use mfn_to_pfn() because so many OS services have
516 	 * been disabled at this point.
517 	 */
518 	xen_info->store_mfn = mfn_to_pfn_mapping[xen_info->store_mfn];
519 	xen_info->console.domU.mfn =
520 	    mfn_to_pfn_mapping[xen_info->console.domU.mfn];
521 
522 	if (CPU->cpu_m.mcpu_vcpu_info->evtchn_upcall_mask == 0) {
523 		prom_printf("xen_suspend_domain(): "
524 		    "CPU->cpu_m.mcpu_vcpu_info->evtchn_upcall_mask not set\n");
525 		(void) HYPERVISOR_shutdown(SHUTDOWN_crash);
526 	}
527 
528 	if (HYPERVISOR_update_va_mapping((uintptr_t)HYPERVISOR_shared_info,
529 	    0, UVMF_INVLPG)) {
530 		prom_printf("xen_suspend_domain(): "
531 		    "HYPERVISOR_update_va_mapping() failed\n");
532 		(void) HYPERVISOR_shutdown(SHUTDOWN_crash);
533 	}
534 
535 	SUSPEND_DEBUG("HYPERVISOR_suspend\n");
536 
537 	/*
538 	 * At this point we suspend and sometime later resume.
539 	 */
540 	if (HYPERVISOR_suspend(start_info_mfn)) {
541 		prom_printf("xen_suspend_domain(): "
542 		    "HYPERVISOR_suspend() failed\n");
543 		(void) HYPERVISOR_shutdown(SHUTDOWN_crash);
544 	}
545 
546 	/*
547 	 * Point HYPERVISOR_shared_info to its new value.
548 	 */
549 	if (HYPERVISOR_update_va_mapping((uintptr_t)HYPERVISOR_shared_info,
550 	    xen_info->shared_info | PT_NOCONSIST | PT_VALID | PT_WRITABLE,
551 	    UVMF_INVLPG))
552 		(void) HYPERVISOR_shutdown(SHUTDOWN_crash);
553 
554 	if (xen_info->nr_pages != mfn_count) {
555 		prom_printf("xen_suspend_domain(): number of pages"
556 		    " changed, was 0x%lx, now 0x%lx\n", mfn_count,
557 		    xen_info->nr_pages);
558 		(void) HYPERVISOR_shutdown(SHUTDOWN_crash);
559 	}
560 
561 	xpv_time_resume();
562 
563 	cached_max_mfn = 0;
564 
565 	SUSPEND_DEBUG("gnttab_resume\n");
566 	gnttab_resume();
567 
568 	/* XXPV: add a note that this must be lockless. */
569 	SUSPEND_DEBUG("ec_resume\n");
570 	ec_resume();
571 
572 	intr_restore(flags);
573 
574 	if (ncpus > 1)
575 		resume_cpus();
576 
577 	mutex_exit(&ec_lock);
578 	xen_end_migrate();
579 	mutex_exit(&cpu_lock);
580 
581 	/*
582 	 * Now we can take locks again.
583 	 */
584 
585 	/*
586 	 * Force the tick value used for tv_nsec in hres_tick() to be up to
587 	 * date. rtcsync() will reset the hrestime value appropriately.
588 	 */
589 	hres_last_tick = xpv_gethrtime();
590 
591 	/*
592 	 * XXPV: we need to have resumed the CPUs since this takes locks, but
593 	 * can remote CPUs see bad state? Presumably yes. Should probably nest
594 	 * taking of todlock inside of cpu_lock, or vice versa, then provide an
595 	 * unlocked version.  Probably need to call clkinitf to reset cpu freq
596 	 * and re-calibrate if we migrated to a different speed cpu.  Also need
597 	 * to make a (re)init_cpu_info call to update processor info structs
598 	 * and device tree info.  That remains to be written at the moment.
599 	 */
600 	rtcsync();
601 
602 	rebuild_mfn_list();
603 
604 	SUSPEND_DEBUG("xenbus_resume\n");
605 	xenbus_resume();
606 	SUSPEND_DEBUG("xenbus_resume_devices\n");
607 	xen_resume_devices();
608 
609 	thread_affinity_clear(curthread);
610 	kpreempt_enable();
611 
612 	SUSPEND_DEBUG("finished xen_suspend_domain\n");
613 
614 	/*
615 	 * We have restarted our suspended domain, update the hypervisor
616 	 * details. NB: This must be done at the end of this function,
617 	 * since we need the domain to be completely resumed before
618 	 * these functions will work correctly.
619 	 */
620 	xen_set_version(XENVER_CURRENT_IDX);
621 
622 	/*
623 	 * We can check and report a warning, but we don't stop the
624 	 * process.
625 	 */
626 	if (xen_hypervisor_supports_solaris(XEN_SUSPEND_CHECK) == 0)
627 		cmn_err(CE_WARN, "Found hypervisor version: v%lu.%lu%s "
628 		    "but need at least version v3.0.4",
629 		    XENVER_CURRENT(xv_major), XENVER_CURRENT(xv_minor),
630 		    XENVER_CURRENT(xv_ver));
631 
632 	cmn_err(CE_NOTE, "domain restore/migrate completed");
633 }
634 
635 /*ARGSUSED*/
636 int
637 xen_debug_handler(void *arg)
638 {
639 	debug_enter("External debug event received");
640 
641 	/*
642 	 * If we've not got KMDB loaded, output some stuff difficult to capture
643 	 * from a domain core.
644 	 */
645 	if (!(boothowto & RB_DEBUG)) {
646 		shared_info_t *si = HYPERVISOR_shared_info;
647 		int i;
648 
649 		prom_printf("evtchn_pending [ ");
650 		for (i = 0; i < 8; i++)
651 			prom_printf("%lx ", si->evtchn_pending[i]);
652 		prom_printf("]\nevtchn_mask [ ");
653 		for (i = 0; i < 8; i++)
654 			prom_printf("%lx ", si->evtchn_mask[i]);
655 		prom_printf("]\n");
656 
657 		for (i = 0; i < ncpus; i++) {
658 			vcpu_info_t *vcpu = &si->vcpu_info[i];
659 			if (cpu[i] == NULL)
660 				continue;
661 			prom_printf("CPU%d pending %d mask %d sel %lx\n",
662 			    i, vcpu->evtchn_upcall_pending,
663 			    vcpu->evtchn_upcall_mask,
664 			    vcpu->evtchn_pending_sel);
665 		}
666 	}
667 
668 	return (0);
669 }
670 
671 /*ARGSUSED*/
672 static void
673 xen_sysrq_handler(struct xenbus_watch *watch, const char **vec,
674     unsigned int len)
675 {
676 	xenbus_transaction_t xbt;
677 	char key = '\0';
678 	int ret;
679 
680 retry:
681 	if (xenbus_transaction_start(&xbt)) {
682 		cmn_err(CE_WARN, "failed to start sysrq transaction");
683 		return;
684 	}
685 
686 	if ((ret = xenbus_scanf(xbt, "control", "sysrq", "%c", &key)) != 0) {
687 		/*
688 		 * ENOENT happens in response to our own xenbus_rm.
689 		 * XXPV - this happens spuriously on boot?
690 		 */
691 		if (ret != ENOENT)
692 			cmn_err(CE_WARN, "failed to read sysrq: %d", ret);
693 		goto out;
694 	}
695 
696 	if ((ret = xenbus_rm(xbt, "control", "sysrq")) != 0) {
697 		cmn_err(CE_WARN, "failed to reset sysrq: %d", ret);
698 		goto out;
699 	}
700 
701 	if (xenbus_transaction_end(xbt, 0) == EAGAIN)
702 		goto retry;
703 
704 	/*
705 	 * Somewhat arbitrary - on Linux this means 'reboot'. We could just
706 	 * accept any key, but this might increase the risk of sending a
707 	 * harmless sysrq to the wrong domain...
708 	 */
709 	if (key == 'b')
710 		(void) xen_debug_handler(NULL);
711 	else
712 		cmn_err(CE_WARN, "Ignored sysrq %c", key);
713 	return;
714 
715 out:
716 	(void) xenbus_transaction_end(xbt, 1);
717 }
718 
719 taskq_t *xen_shutdown_tq;
720 
721 #define	SHUTDOWN_INVALID	-1
722 #define	SHUTDOWN_POWEROFF	0
723 #define	SHUTDOWN_REBOOT		1
724 #define	SHUTDOWN_SUSPEND	2
725 #define	SHUTDOWN_HALT		3
726 #define	SHUTDOWN_MAX		4
727 
728 #define	SHUTDOWN_TIMEOUT_SECS (60 * 5)
729 
730 static const char *cmd_strings[SHUTDOWN_MAX] = {
731 	"poweroff",
732 	"reboot",
733 	"suspend",
734 	"halt"
735 };
736 
737 static void
738 xen_dirty_shutdown(void *arg)
739 {
740 	int cmd = (uintptr_t)arg;
741 
742 	cmn_err(CE_WARN, "Externally requested shutdown failed or "
743 	    "timed out.\nShutting down.\n");
744 
745 	switch (cmd) {
746 	case SHUTDOWN_HALT:
747 	case SHUTDOWN_POWEROFF:
748 		(void) kadmin(A_SHUTDOWN, AD_POWEROFF, NULL, kcred);
749 		break;
750 	case SHUTDOWN_REBOOT:
751 		(void) kadmin(A_REBOOT, AD_BOOT, NULL, kcred);
752 		break;
753 	}
754 }
755 
756 static void
757 xen_shutdown(void *arg)
758 {
759 	nvlist_t *attr_list = NULL;
760 	sysevent_t *event = NULL;
761 	sysevent_id_t eid;
762 	int cmd = (uintptr_t)arg;
763 	int err;
764 
765 	ASSERT(cmd > SHUTDOWN_INVALID && cmd < SHUTDOWN_MAX);
766 
767 	if (cmd == SHUTDOWN_SUSPEND) {
768 		xen_suspend_domain();
769 		return;
770 	}
771 
772 	err = nvlist_alloc(&attr_list, NV_UNIQUE_NAME, KM_SLEEP);
773 	if (err != DDI_SUCCESS)
774 		goto failure;
775 
776 	err = nvlist_add_string(attr_list, "shutdown", cmd_strings[cmd]);
777 	if (err != DDI_SUCCESS)
778 		goto failure;
779 
780 	if ((event = sysevent_alloc("EC_xpvsys", "control", "SUNW:kern:xpv",
781 	    SE_SLEEP)) == NULL)
782 		goto failure;
783 	(void) sysevent_attach_attributes(event,
784 	    (sysevent_attr_list_t *)attr_list);
785 
786 	err = log_sysevent(event, SE_SLEEP, &eid);
787 
788 	sysevent_detach_attributes(event);
789 	sysevent_free(event);
790 
791 	if (err != 0)
792 		goto failure;
793 
794 	(void) timeout(xen_dirty_shutdown, arg,
795 	    SHUTDOWN_TIMEOUT_SECS * drv_usectohz(MICROSEC));
796 
797 	nvlist_free(attr_list);
798 	return;
799 
800 failure:
801 	if (attr_list != NULL)
802 		nvlist_free(attr_list);
803 	xen_dirty_shutdown(arg);
804 }
805 
806 /*ARGSUSED*/
807 static void
808 xen_shutdown_handler(struct xenbus_watch *watch, const char **vec,
809 	unsigned int len)
810 {
811 	char *str;
812 	xenbus_transaction_t xbt;
813 	int err, shutdown_code = SHUTDOWN_INVALID;
814 	unsigned int slen;
815 
816 again:
817 	err = xenbus_transaction_start(&xbt);
818 	if (err)
819 		return;
820 	if (xenbus_read(xbt, "control", "shutdown", (void *)&str, &slen)) {
821 		(void) xenbus_transaction_end(xbt, 1);
822 		return;
823 	}
824 
825 	SUSPEND_DEBUG("%d: xen_shutdown_handler: \"%s\"\n", CPU->cpu_id, str);
826 
827 	/*
828 	 * If this is a watch fired from our write below, check out early to
829 	 * avoid an infinite loop.
830 	 */
831 	if (strcmp(str, "") == 0) {
832 		(void) xenbus_transaction_end(xbt, 0);
833 		kmem_free(str, slen);
834 		return;
835 	} else if (strcmp(str, "poweroff") == 0) {
836 		shutdown_code = SHUTDOWN_POWEROFF;
837 	} else if (strcmp(str, "reboot") == 0) {
838 		shutdown_code = SHUTDOWN_REBOOT;
839 	} else if (strcmp(str, "suspend") == 0) {
840 		shutdown_code = SHUTDOWN_SUSPEND;
841 	} else if (strcmp(str, "halt") == 0) {
842 		shutdown_code = SHUTDOWN_HALT;
843 	} else {
844 		printf("Ignoring shutdown request: %s\n", str);
845 	}
846 
847 	/*
848 	 * XXPV	Should we check the value of xenbus_write() too, or are all
849 	 *	errors automatically folded into xenbus_transaction_end() ??
850 	 */
851 	(void) xenbus_write(xbt, "control", "shutdown", "");
852 	err = xenbus_transaction_end(xbt, 0);
853 	if (err == EAGAIN) {
854 		SUSPEND_DEBUG("%d: trying again\n", CPU->cpu_id);
855 		kmem_free(str, slen);
856 		goto again;
857 	}
858 
859 	kmem_free(str, slen);
860 	if (shutdown_code != SHUTDOWN_INVALID) {
861 		(void) taskq_dispatch(xen_shutdown_tq, xen_shutdown,
862 		    (void *)(intptr_t)shutdown_code, 0);
863 	}
864 }
865 
866 static struct xenbus_watch shutdown_watch;
867 static struct xenbus_watch sysrq_watch;
868 
869 void
870 xen_late_startup(void)
871 {
872 	if (!DOMAIN_IS_INITDOMAIN(xen_info)) {
873 		xen_shutdown_tq = taskq_create("shutdown_taskq", 1,
874 		    maxclsyspri - 1, 1, 1, TASKQ_PREPOPULATE);
875 		shutdown_watch.node = "control/shutdown";
876 		shutdown_watch.callback = xen_shutdown_handler;
877 		if (register_xenbus_watch(&shutdown_watch))
878 			cmn_err(CE_WARN, "Failed to set shutdown watcher");
879 
880 		sysrq_watch.node = "control/sysrq";
881 		sysrq_watch.callback = xen_sysrq_handler;
882 		if (register_xenbus_watch(&sysrq_watch))
883 			cmn_err(CE_WARN, "Failed to set sysrq watcher");
884 	}
885 	balloon_init(xen_info->nr_pages);
886 }
887 
#ifdef DEBUG
#define	XEN_PRINTF_BUFSIZE	1024

/* Formatting buffer shared by all callers of xen_printf(). */
char xen_printf_buffer[XEN_PRINTF_BUFSIZE];

/*
 * printf-style output sent straight to the hypervisor console with
 * HYPERVISOR_console_io().  For DomU it only works when running on a
 * xen hypervisor built with debug on.  Because no I/O ring interaction
 * is needed it can be used at any time.  Output longer than the buffer
 * is truncated.
 */
/*PRINTFLIKE1*/
void
xen_printf(const char *fmt, ...)
{
	char *buf = xen_printf_buffer;
	va_list	args;

	va_start(args, fmt);
	(void) vsnprintf(buf, sizeof (xen_printf_buffer), fmt, args);
	va_end(args);

	(void) HYPERVISOR_console_io(CONSOLEIO_write, strlen(buf), buf);
}
#else
/* Non-DEBUG kernels: accept the call and do nothing. */
void
xen_printf(const char *fmt, ...)
{
}
#endif	/* DEBUG */
917 
918 void
919 startup_xen_version(void)
920 {
921 	xen_set_version(XENVER_BOOT_IDX);
922 	if (xen_hypervisor_supports_solaris(XEN_RUN_CHECK) == 0)
923 		cmn_err(CE_WARN, "Found hypervisor version: v%lu.%lu%s "
924 		    "but need at least version v3.0.4",
925 		    XENVER_CURRENT(xv_major), XENVER_CURRENT(xv_minor),
926 		    XENVER_CURRENT(xv_ver));
927 	xen_pte_workaround();
928 }
929 
930 /*
931  * Miscellaneous hypercall wrappers with slightly more verbose diagnostics.
932  */
933 
934 void
935 xen_set_gdt(ulong_t *frame_list, int entries)
936 {
937 	int err;
938 	if ((err = HYPERVISOR_set_gdt(frame_list, entries)) != 0) {
939 		/*
940 		 * X_EINVAL:	reserved entry or bad frames
941 		 * X_EFAULT:	bad address
942 		 */
943 		panic("xen_set_gdt(%p, %d): error %d",
944 		    (void *)frame_list, entries, -(int)err);
945 	}
946 }
947 
948 void
949 xen_set_ldt(user_desc_t *ldt, uint_t nsels)
950 {
951 	struct mmuext_op	op;
952 	long			err;
953 
954 	op.cmd = MMUEXT_SET_LDT;
955 	op.arg1.linear_addr = (uintptr_t)ldt;
956 	op.arg2.nr_ents = nsels;
957 
958 	if ((err = HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF)) != 0) {
959 		panic("xen_set_ldt(%p, %d): error %d",
960 		    (void *)ldt, nsels, -(int)err);
961 	}
962 }
963 
964 void
965 xen_stack_switch(ulong_t ss, ulong_t esp)
966 {
967 	long err;
968 
969 	if ((err = HYPERVISOR_stack_switch(ss, esp)) != 0) {
970 		/*
971 		 * X_EPERM:	bad selector
972 		 */
973 		panic("xen_stack_switch(%lx, %lx): error %d", ss, esp,
974 		    -(int)err);
975 	}
976 }
977 
978 long
979 xen_set_trap_table(trap_info_t *table)
980 {
981 	long err;
982 
983 	if ((err = HYPERVISOR_set_trap_table(table)) != 0) {
984 		/*
985 		 * X_EFAULT:	bad address
986 		 * X_EPERM:	bad selector
987 		 */
988 		panic("xen_set_trap_table(%p): error %d", (void *)table,
989 		    -(int)err);
990 	}
991 	return (err);
992 }
993 
994 #if defined(__amd64)
995 void
996 xen_set_segment_base(int reg, ulong_t value)
997 {
998 	long err;
999 
1000 	if ((err = HYPERVISOR_set_segment_base(reg, value)) != 0) {
1001 		/*
1002 		 * X_EFAULT:	bad address
1003 		 * X_EINVAL:	bad type
1004 		 */
1005 		panic("xen_set_segment_base(%d, %lx): error %d",
1006 		    reg, value, -(int)err);
1007 	}
1008 }
1009 #endif	/* __amd64 */
1010 
1011 /*
1012  * Translate a hypervisor errcode to a Solaris error code.
1013  */
1014 int
1015 xen_xlate_errcode(int error)
1016 {
1017 	switch (-error) {
1018 
1019 	/*
1020 	 * Translate hypervisor errno's into native errno's
1021 	 */
1022 
1023 #define	CASE(num)	case X_##num: error = num; break
1024 
1025 	CASE(EPERM);	CASE(ENOENT);	CASE(ESRCH);
1026 	CASE(EINTR);	CASE(EIO);	CASE(ENXIO);
1027 	CASE(E2BIG);	CASE(ENOMEM);	CASE(EACCES);
1028 	CASE(EFAULT);	CASE(EBUSY);	CASE(EEXIST);
1029 	CASE(ENODEV);	CASE(EISDIR);	CASE(EINVAL);
1030 	CASE(ENOSPC);	CASE(ESPIPE);	CASE(EROFS);
1031 	CASE(ENOSYS);	CASE(ENOTEMPTY); CASE(EISCONN);
1032 	CASE(ENODATA);
1033 
1034 #undef CASE
1035 
1036 	default:
1037 		panic("xen_xlate_errcode: unknown error %d", error);
1038 	}
1039 
1040 	return (error);
1041 }
1042 
1043 /*
1044  * Raise PS_IOPL on current vcpu to user level.
1045  * Caller responsible for preventing kernel preemption.
1046  */
1047 void
1048 xen_enable_user_iopl(void)
1049 {
1050 	physdev_set_iopl_t set_iopl;
1051 	set_iopl.iopl = 3;		/* user ring 3 */
1052 	(void) HYPERVISOR_physdev_op(PHYSDEVOP_set_iopl, &set_iopl);
1053 }
1054 
1055 /*
1056  * Drop PS_IOPL on current vcpu to kernel level
1057  */
1058 void
1059 xen_disable_user_iopl(void)
1060 {
1061 	physdev_set_iopl_t set_iopl;
1062 	set_iopl.iopl = 1;		/* kernel pseudo ring 1 */
1063 	(void) HYPERVISOR_physdev_op(PHYSDEVOP_set_iopl, &set_iopl);
1064 }
1065 
1066 int
1067 xen_gdt_setprot(cpu_t *cp, uint_t prot)
1068 {
1069 	int err;
1070 #if defined(__amd64)
1071 	int pt_bits = PT_VALID;
1072 	if (prot & PROT_WRITE)
1073 		pt_bits |= PT_WRITABLE;
1074 #endif
1075 
1076 	if ((err = as_setprot(&kas, (caddr_t)cp->cpu_gdt,
1077 	    MMU_PAGESIZE, prot)) != 0)
1078 		goto done;
1079 
1080 #if defined(__amd64)
1081 	err = xen_kpm_page(mmu_btop(cp->cpu_m.mcpu_gdtpa), pt_bits);
1082 #endif
1083 
1084 done:
1085 	if (err) {
1086 		cmn_err(CE_WARN, "cpu%d: xen_gdt_setprot(%s) failed: error %d",
1087 		    cp->cpu_id, (prot & PROT_WRITE) ? "writable" : "read-only",
1088 		    err);
1089 	}
1090 
1091 	return (err);
1092 }
1093 
1094 int
1095 xen_ldt_setprot(user_desc_t *ldt, size_t lsize, uint_t prot)
1096 {
1097 	int err;
1098 	caddr_t	lva = (caddr_t)ldt;
1099 #if defined(__amd64)
1100 	int pt_bits = PT_VALID;
1101 	pgcnt_t npgs;
1102 	if (prot & PROT_WRITE)
1103 		pt_bits |= PT_WRITABLE;
1104 #endif	/* __amd64 */
1105 
1106 	if ((err = as_setprot(&kas, (caddr_t)ldt, lsize, prot)) != 0)
1107 		goto done;
1108 
1109 #if defined(__amd64)
1110 
1111 	ASSERT(IS_P2ALIGNED(lsize, PAGESIZE));
1112 	npgs = mmu_btop(lsize);
1113 	while (npgs--) {
1114 		if ((err = xen_kpm_page(hat_getpfnum(kas.a_hat, lva),
1115 		    pt_bits)) != 0)
1116 			break;
1117 		lva += PAGESIZE;
1118 	}
1119 #endif	/* __amd64 */
1120 
1121 done:
1122 	if (err) {
1123 		cmn_err(CE_WARN, "xen_ldt_setprot(%p, %s) failed: error %d",
1124 		    (void *)lva,
1125 		    (prot & PROT_WRITE) ? "writable" : "read-only", err);
1126 	}
1127 
1128 	return (err);
1129 }
1130