xref: /illumos-gate/usr/src/uts/i86xpv/os/xen_machdep.c (revision e4b86885570d77af552e9cf94f142f4d744fb8c8)
1843e1988Sjohnlev /*
2843e1988Sjohnlev  * CDDL HEADER START
3843e1988Sjohnlev  *
4843e1988Sjohnlev  * The contents of this file are subject to the terms of the
5843e1988Sjohnlev  * Common Development and Distribution License (the "License").
6843e1988Sjohnlev  * You may not use this file except in compliance with the License.
7843e1988Sjohnlev  *
8843e1988Sjohnlev  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9843e1988Sjohnlev  * or http://www.opensolaris.org/os/licensing.
10843e1988Sjohnlev  * See the License for the specific language governing permissions
11843e1988Sjohnlev  * and limitations under the License.
12843e1988Sjohnlev  *
13843e1988Sjohnlev  * When distributing Covered Code, include this CDDL HEADER in each
14843e1988Sjohnlev  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15843e1988Sjohnlev  * If applicable, add the following below this CDDL HEADER, with the
16843e1988Sjohnlev  * fields enclosed by brackets "[]" replaced with your own identifying
17843e1988Sjohnlev  * information: Portions Copyright [yyyy] [name of copyright owner]
18843e1988Sjohnlev  *
19843e1988Sjohnlev  * CDDL HEADER END
20843e1988Sjohnlev  */
21843e1988Sjohnlev 
22843e1988Sjohnlev /*
23c48ac12eSjohnlev  * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
24843e1988Sjohnlev  * Use is subject to license terms.
25843e1988Sjohnlev  */
26843e1988Sjohnlev 
27843e1988Sjohnlev /* derived from netbsd's xen_machdep.c 1.1.2.1 */
28843e1988Sjohnlev 
29843e1988Sjohnlev /*
30843e1988Sjohnlev  *
31843e1988Sjohnlev  * Copyright (c) 2004 Christian Limpach.
32843e1988Sjohnlev  * All rights reserved.
33843e1988Sjohnlev  *
34843e1988Sjohnlev  * Redistribution and use in source and binary forms, with or without
35843e1988Sjohnlev  * modification, are permitted provided that the following conditions
36843e1988Sjohnlev  * are met:
37843e1988Sjohnlev  * 1. Redistributions of source code must retain the above copyright
38843e1988Sjohnlev  *    notice, this list of conditions and the following disclaimer.
39843e1988Sjohnlev  * 2. Redistributions in binary form must reproduce the above copyright
40843e1988Sjohnlev  *    notice, this list of conditions and the following disclaimer in the
41843e1988Sjohnlev  *    documentation and/or other materials provided with the distribution.
42843e1988Sjohnlev  * 3. This section intentionally left blank.
43843e1988Sjohnlev  * 4. The name of the author may not be used to endorse or promote products
44843e1988Sjohnlev  *    derived from this software without specific prior written permission.
45843e1988Sjohnlev  *
46843e1988Sjohnlev  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
47843e1988Sjohnlev  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
48843e1988Sjohnlev  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
49843e1988Sjohnlev  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
50843e1988Sjohnlev  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
51843e1988Sjohnlev  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
52843e1988Sjohnlev  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
53843e1988Sjohnlev  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
54843e1988Sjohnlev  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
55843e1988Sjohnlev  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
56843e1988Sjohnlev  */
57843e1988Sjohnlev /*
58843e1988Sjohnlev  * Section 3 of the above license was updated in response to bug 6379571.
59843e1988Sjohnlev  */
60843e1988Sjohnlev 
619e839ce9Sgarypen #include <sys/ctype.h>
62843e1988Sjohnlev #include <sys/types.h>
63843e1988Sjohnlev #include <sys/cmn_err.h>
64843e1988Sjohnlev #include <sys/trap.h>
65843e1988Sjohnlev #include <sys/segments.h>
66843e1988Sjohnlev #include <sys/hypervisor.h>
67843e1988Sjohnlev #include <sys/xen_mmu.h>
68843e1988Sjohnlev #include <sys/machsystm.h>
69843e1988Sjohnlev #include <sys/promif.h>
70843e1988Sjohnlev #include <sys/bootconf.h>
71843e1988Sjohnlev #include <sys/bootinfo.h>
72843e1988Sjohnlev #include <sys/cpr.h>
73843e1988Sjohnlev #include <sys/taskq.h>
74843e1988Sjohnlev #include <sys/uadmin.h>
75843e1988Sjohnlev #include <sys/evtchn_impl.h>
76843e1988Sjohnlev #include <sys/archsystm.h>
77843e1988Sjohnlev #include <xen/sys/xenbus_impl.h>
78843e1988Sjohnlev #include <sys/mach_mmu.h>
79843e1988Sjohnlev #include <vm/hat_i86.h>
80843e1988Sjohnlev #include <sys/gnttab.h>
81843e1988Sjohnlev #include <sys/reboot.h>
82843e1988Sjohnlev #include <sys/stack.h>
83843e1988Sjohnlev #include <sys/clock.h>
84843e1988Sjohnlev #include <sys/bitmap.h>
85843e1988Sjohnlev #include <sys/processor.h>
86843e1988Sjohnlev #include <sys/xen_errno.h>
87843e1988Sjohnlev #include <sys/xpv_panic.h>
88843e1988Sjohnlev #include <sys/smp_impldefs.h>
89843e1988Sjohnlev #include <sys/cpu.h>
90843e1988Sjohnlev #include <sys/balloon_impl.h>
91843e1988Sjohnlev #include <sys/ddi.h>
92843e1988Sjohnlev 
/*
 * SUSPEND_DEBUG(fmt, ...) - trace the suspend/resume path.
 *
 * On DEBUG kernels the arguments are handed to xen_printf() when
 * xen_suspend_debug is non-zero; on non-DEBUG kernels the macro expands to
 * nothing.
 *
 * Both variants are function-like, and the DEBUG variant is wrapped in
 * do { } while (0) so that it behaves as a single statement: a bare
 * "if (...) xen_printf" expansion would capture the "else" of any enclosing
 * if/else that used the macro without braces.
 */
#ifdef DEBUG
#define	SUSPEND_DEBUG(...)				\
	do {						\
		if (xen_suspend_debug)			\
			xen_printf(__VA_ARGS__);	\
	} while (0)
#else
#define	SUSPEND_DEBUG(...)
#endif
98843e1988Sjohnlev 
99843e1988Sjohnlev int cpr_debug;
100843e1988Sjohnlev cpuset_t cpu_suspend_lost_set;
101843e1988Sjohnlev static int xen_suspend_debug;
102843e1988Sjohnlev 
103*e4b86885SCheng Sean Ye uint_t xen_phys_ncpus;
104*e4b86885SCheng Sean Ye xen_mc_logical_cpu_t *xen_phys_cpus;
105*e4b86885SCheng Sean Ye int xen_physinfo_debug = 0;
106*e4b86885SCheng Sean Ye 
1079e839ce9Sgarypen /*
1089e839ce9Sgarypen  * Determine helpful version information.
1099e839ce9Sgarypen  *
1109e839ce9Sgarypen  * (And leave copies in the data segment so we can look at them later
1119e839ce9Sgarypen  * with e.g. kmdb.)
1129e839ce9Sgarypen  */
1139e839ce9Sgarypen 
1149e839ce9Sgarypen typedef enum xen_version {
1159e839ce9Sgarypen 	XENVER_BOOT_IDX,
1169e839ce9Sgarypen 	XENVER_CURRENT_IDX
1179e839ce9Sgarypen } xen_version_t;
1189e839ce9Sgarypen 
1199e839ce9Sgarypen struct xenver {
1209e839ce9Sgarypen 	ulong_t xv_major;
1219e839ce9Sgarypen 	ulong_t xv_minor;
1229e839ce9Sgarypen 	ulong_t xv_revision;
1239e839ce9Sgarypen 	xen_extraversion_t xv_ver;
124ab4a9bebSjohnlev 	ulong_t xv_is_xvm;
1259e839ce9Sgarypen 	xen_changeset_info_t xv_chgset;
1269e839ce9Sgarypen 	xen_compile_info_t xv_build;
1279e839ce9Sgarypen 	xen_capabilities_info_t xv_caps;
1289e839ce9Sgarypen } xenver[2];
1299e839ce9Sgarypen 
1309e839ce9Sgarypen #define	XENVER_BOOT(m)	(xenver[XENVER_BOOT_IDX].m)
1319e839ce9Sgarypen #define	XENVER_CURRENT(m)	(xenver[XENVER_CURRENT_IDX].m)
1329e839ce9Sgarypen 
1339e839ce9Sgarypen /*
1349e839ce9Sgarypen  * Update the xenver data. We maintain two copies, boot and
1359e839ce9Sgarypen  * current. If we are setting the boot, then also set current.
1369e839ce9Sgarypen  */
1379e839ce9Sgarypen static void
1389e839ce9Sgarypen xen_set_version(xen_version_t idx)
1399e839ce9Sgarypen {
1409e839ce9Sgarypen 	ulong_t ver;
1419e839ce9Sgarypen 
1429e839ce9Sgarypen 	bzero(&xenver[idx], sizeof (xenver[idx]));
1439e839ce9Sgarypen 
1449e839ce9Sgarypen 	ver = HYPERVISOR_xen_version(XENVER_version, 0);
1459e839ce9Sgarypen 
1469e839ce9Sgarypen 	xenver[idx].xv_major = BITX(ver, 31, 16);
1479e839ce9Sgarypen 	xenver[idx].xv_minor = BITX(ver, 15, 0);
1489e839ce9Sgarypen 
1499e839ce9Sgarypen 	(void) HYPERVISOR_xen_version(XENVER_extraversion, &xenver[idx].xv_ver);
1509e839ce9Sgarypen 
1519e839ce9Sgarypen 	/*
1529e839ce9Sgarypen 	 * The revision is buried in the extraversion information that is
1539e839ce9Sgarypen 	 * maintained by the hypervisor. For our purposes we expect that
1549e839ce9Sgarypen 	 * the revision number is:
1559e839ce9Sgarypen 	 * 	- the second character in the extraversion information
1569e839ce9Sgarypen 	 *	- one character long
1579e839ce9Sgarypen 	 *	- numeric digit
1589e839ce9Sgarypen 	 * If it isn't then we can't extract the revision and we leave it
1599e839ce9Sgarypen 	 * set to 0.
1609e839ce9Sgarypen 	 */
1619e839ce9Sgarypen 	if (strlen(xenver[idx].xv_ver) > 1 && isdigit(xenver[idx].xv_ver[1]))
1629e839ce9Sgarypen 		xenver[idx].xv_revision = xenver[idx].xv_ver[1] - '0';
1639e839ce9Sgarypen 	else
1649e839ce9Sgarypen 		cmn_err(CE_WARN, "Cannot extract revision on this hypervisor "
1659e839ce9Sgarypen 		    "version: v%s, unexpected version format",
1669e839ce9Sgarypen 		    xenver[idx].xv_ver);
1679e839ce9Sgarypen 
168ab4a9bebSjohnlev 	xenver[idx].xv_is_xvm = 0;
169ab4a9bebSjohnlev 
170ab4a9bebSjohnlev 	if (strlen(xenver[idx].xv_ver) >= 4 &&
171ab4a9bebSjohnlev 	    strncmp(xenver[idx].xv_ver + strlen(xenver[idx].xv_ver) - 4,
172ab4a9bebSjohnlev 	    "-xvm", 4) == 0)
173ab4a9bebSjohnlev 		xenver[idx].xv_is_xvm = 1;
174ab4a9bebSjohnlev 
1759e839ce9Sgarypen 	(void) HYPERVISOR_xen_version(XENVER_changeset,
1769e839ce9Sgarypen 	    &xenver[idx].xv_chgset);
1779e839ce9Sgarypen 
1789e839ce9Sgarypen 	(void) HYPERVISOR_xen_version(XENVER_compile_info,
1799e839ce9Sgarypen 	    &xenver[idx].xv_build);
1809e839ce9Sgarypen 	/*
1819e839ce9Sgarypen 	 * Capabilities are a set of space separated ascii strings
1829e839ce9Sgarypen 	 * e.g. 'xen-3.1-x86_32p' or 'hvm-3.2-x86_64'
1839e839ce9Sgarypen 	 */
1849e839ce9Sgarypen 	(void) HYPERVISOR_xen_version(XENVER_capabilities,
1859e839ce9Sgarypen 	    &xenver[idx].xv_caps);
1869e839ce9Sgarypen 
1879e839ce9Sgarypen 	cmn_err(CE_CONT, "?v%lu.%lu%s chgset '%s'\n", xenver[idx].xv_major,
1889e839ce9Sgarypen 	    xenver[idx].xv_minor, xenver[idx].xv_ver, xenver[idx].xv_chgset);
1899e839ce9Sgarypen 
1909e839ce9Sgarypen 	if (idx == XENVER_BOOT_IDX)
1919e839ce9Sgarypen 		bcopy(&xenver[XENVER_BOOT_IDX], &xenver[XENVER_CURRENT_IDX],
1929e839ce9Sgarypen 		    sizeof (xenver[XENVER_BOOT_IDX]));
1939e839ce9Sgarypen }
1949e839ce9Sgarypen 
/*
 * Which question xen_hypervisor_supports_solaris() is being asked: can we
 * run on this hypervisor, or can we suspend/resume on it.
 */
typedef enum xen_hypervisor_check {
	XEN_RUN_CHECK,
	XEN_SUSPEND_CHECK
} xen_hypervisor_check_t;
1999e839ce9Sgarypen 
2009e839ce9Sgarypen /*
2019e839ce9Sgarypen  * To run the hypervisor must be 3.0.4 or better. To suspend/resume
2029e839ce9Sgarypen  * we need 3.0.4 or better and if it is 3.0.4. then it must be provided
2039e839ce9Sgarypen  * by the Solaris xVM project.
2049e839ce9Sgarypen  * Checking can be disabled for testing purposes by setting the
2059e839ce9Sgarypen  * xen_suspend_debug variable.
2069e839ce9Sgarypen  */
2079e839ce9Sgarypen static int
2089e839ce9Sgarypen xen_hypervisor_supports_solaris(xen_hypervisor_check_t check)
2099e839ce9Sgarypen {
2109e839ce9Sgarypen 	if (xen_suspend_debug == 1)
2119e839ce9Sgarypen 		return (1);
2129e839ce9Sgarypen 	if (XENVER_CURRENT(xv_major) < 3)
2139e839ce9Sgarypen 		return (0);
2149e839ce9Sgarypen 	if (XENVER_CURRENT(xv_major) > 3)
2159e839ce9Sgarypen 		return (1);
2169e839ce9Sgarypen 	if (XENVER_CURRENT(xv_minor) > 0)
2179e839ce9Sgarypen 		return (1);
2189e839ce9Sgarypen 	if (XENVER_CURRENT(xv_revision) < 4)
2199e839ce9Sgarypen 		return (0);
220ab4a9bebSjohnlev 	if (check == XEN_SUSPEND_CHECK && XENVER_CURRENT(xv_revision) == 4 &&
221ab4a9bebSjohnlev 	    !XENVER_CURRENT(xv_is_xvm))
2229e839ce9Sgarypen 		return (0);
223ab4a9bebSjohnlev 
2249e839ce9Sgarypen 	return (1);
2259e839ce9Sgarypen }
2269e839ce9Sgarypen 
227ab4a9bebSjohnlev /*
228ab4a9bebSjohnlev  * If the hypervisor is -xvm, or 3.1.2 or higher, we don't need the
229ab4a9bebSjohnlev  * workaround.
230ab4a9bebSjohnlev  */
231ab4a9bebSjohnlev static void
232ab4a9bebSjohnlev xen_pte_workaround(void)
233ab4a9bebSjohnlev {
234ab4a9bebSjohnlev #if defined(__amd64)
235ab4a9bebSjohnlev 	extern int pt_kern;
236ab4a9bebSjohnlev 
237ab4a9bebSjohnlev 	if (XENVER_CURRENT(xv_major) != 3)
238ab4a9bebSjohnlev 		return;
239ab4a9bebSjohnlev 	if (XENVER_CURRENT(xv_minor) > 1)
240ab4a9bebSjohnlev 		return;
241ab4a9bebSjohnlev 	if (XENVER_CURRENT(xv_minor) == 1 &&
242ab4a9bebSjohnlev 	    XENVER_CURRENT(xv_revision) > 1)
243ab4a9bebSjohnlev 		return;
244ab4a9bebSjohnlev 	if (XENVER_CURRENT(xv_is_xvm))
245ab4a9bebSjohnlev 		return;
246ab4a9bebSjohnlev 
247ab4a9bebSjohnlev 	pt_kern = PT_USER;
248ab4a9bebSjohnlev #endif
249ab4a9bebSjohnlev }
250ab4a9bebSjohnlev 
251843e1988Sjohnlev void
252843e1988Sjohnlev xen_set_callback(void (*func)(void), uint_t type, uint_t flags)
253843e1988Sjohnlev {
254843e1988Sjohnlev 	struct callback_register cb;
255843e1988Sjohnlev 
256843e1988Sjohnlev 	bzero(&cb, sizeof (cb));
257843e1988Sjohnlev #if defined(__amd64)
258843e1988Sjohnlev 	cb.address = (ulong_t)func;
259843e1988Sjohnlev #elif defined(__i386)
260843e1988Sjohnlev 	cb.address.cs = KCS_SEL;
261843e1988Sjohnlev 	cb.address.eip = (ulong_t)func;
262843e1988Sjohnlev #endif
263843e1988Sjohnlev 	cb.type = type;
264843e1988Sjohnlev 	cb.flags = flags;
265843e1988Sjohnlev 
266843e1988Sjohnlev 	/*
267843e1988Sjohnlev 	 * XXPV always ignore return value for NMI
268843e1988Sjohnlev 	 */
269843e1988Sjohnlev 	if (HYPERVISOR_callback_op(CALLBACKOP_register, &cb) != 0 &&
270843e1988Sjohnlev 	    type != CALLBACKTYPE_nmi)
271843e1988Sjohnlev 		panic("HYPERVISOR_callback_op failed");
272843e1988Sjohnlev }
273843e1988Sjohnlev 
274843e1988Sjohnlev void
275843e1988Sjohnlev xen_init_callbacks(void)
276843e1988Sjohnlev {
277843e1988Sjohnlev 	/*
278843e1988Sjohnlev 	 * register event (interrupt) handler.
279843e1988Sjohnlev 	 */
280843e1988Sjohnlev 	xen_set_callback(xen_callback, CALLBACKTYPE_event, 0);
281843e1988Sjohnlev 
282843e1988Sjohnlev 	/*
283843e1988Sjohnlev 	 * failsafe handler.
284843e1988Sjohnlev 	 */
285843e1988Sjohnlev 	xen_set_callback(xen_failsafe_callback, CALLBACKTYPE_failsafe,
286843e1988Sjohnlev 	    CALLBACKF_mask_events);
287843e1988Sjohnlev 
288843e1988Sjohnlev 	/*
289843e1988Sjohnlev 	 * NMI handler.
290843e1988Sjohnlev 	 */
291843e1988Sjohnlev 	xen_set_callback(nmiint, CALLBACKTYPE_nmi, 0);
292843e1988Sjohnlev 
293843e1988Sjohnlev 	/*
294843e1988Sjohnlev 	 * system call handler
295843e1988Sjohnlev 	 * XXPV move to init_cpu_syscall?
296843e1988Sjohnlev 	 */
297843e1988Sjohnlev #if defined(__amd64)
298843e1988Sjohnlev 	xen_set_callback(sys_syscall, CALLBACKTYPE_syscall,
299843e1988Sjohnlev 	    CALLBACKF_mask_events);
300843e1988Sjohnlev #endif	/* __amd64 */
301843e1988Sjohnlev }
302843e1988Sjohnlev 
303843e1988Sjohnlev 
304843e1988Sjohnlev /*
305843e1988Sjohnlev  * cmn_err() followed by a 1/4 second delay; this gives the
306843e1988Sjohnlev  * logging service a chance to flush messages and helps avoid
307843e1988Sjohnlev  * intermixing output from prom_printf().
308843e1988Sjohnlev  * XXPV: doesn't exactly help us on UP though.
309843e1988Sjohnlev  */
310843e1988Sjohnlev /*PRINTFLIKE2*/
311843e1988Sjohnlev void
312843e1988Sjohnlev cpr_err(int ce, const char *fmt, ...)
313843e1988Sjohnlev {
314843e1988Sjohnlev 	va_list adx;
315843e1988Sjohnlev 
316843e1988Sjohnlev 	va_start(adx, fmt);
317843e1988Sjohnlev 	vcmn_err(ce, fmt, adx);
318843e1988Sjohnlev 	va_end(adx);
319843e1988Sjohnlev 	drv_usecwait(MICROSEC >> 2);
320843e1988Sjohnlev }
321843e1988Sjohnlev 
/*
 * Suspend every device below the root of the device tree.  A failure is
 * fatal: we panic with the error code.
 */
void
xen_suspend_devices(void)
{
	int err;

	SUSPEND_DEBUG("xen_suspend_devices\n");

	err = cpr_suspend_devices(ddi_root_node());
	if (err != 0)
		panic("failed to suspend devices: %d", err);
}
332843e1988Sjohnlev 
/*
 * Resume every device below the root of the device tree.  A failure is
 * fatal: we panic with the error code.
 */
void
xen_resume_devices(void)
{
	int err;

	SUSPEND_DEBUG("xen_resume_devices\n");

	err = cpr_resume_devices(ddi_root_node(), 0);
	if (err != 0)
		panic("failed to resume devices: %d", err);
}
343843e1988Sjohnlev 
344843e1988Sjohnlev /*
345843e1988Sjohnlev  * The list of mfn pages is out of date.  Recompute it.
346843e1988Sjohnlev  */
347843e1988Sjohnlev static void
348843e1988Sjohnlev rebuild_mfn_list(void)
349843e1988Sjohnlev {
350843e1988Sjohnlev 	int i = 0;
351843e1988Sjohnlev 	size_t sz;
352843e1988Sjohnlev 	size_t off;
353843e1988Sjohnlev 	pfn_t pfn;
354843e1988Sjohnlev 
355843e1988Sjohnlev 	SUSPEND_DEBUG("rebuild_mfn_list\n");
356843e1988Sjohnlev 
357843e1988Sjohnlev 	sz = ((mfn_count * sizeof (mfn_t)) + MMU_PAGEOFFSET) & MMU_PAGEMASK;
358843e1988Sjohnlev 
359843e1988Sjohnlev 	for (off = 0; off < sz; off += MMU_PAGESIZE) {
360843e1988Sjohnlev 		size_t j = mmu_btop(off);
361843e1988Sjohnlev 		if (((j * sizeof (mfn_t)) & MMU_PAGEOFFSET) == 0) {
362843e1988Sjohnlev 			pfn = hat_getpfnum(kas.a_hat,
363843e1988Sjohnlev 			    (caddr_t)&mfn_list_pages[j]);
364843e1988Sjohnlev 			mfn_list_pages_page[i++] = pfn_to_mfn(pfn);
365843e1988Sjohnlev 		}
366843e1988Sjohnlev 
367843e1988Sjohnlev 		pfn = hat_getpfnum(kas.a_hat, (caddr_t)mfn_list + off);
368843e1988Sjohnlev 		mfn_list_pages[j] = pfn_to_mfn(pfn);
369843e1988Sjohnlev 	}
370843e1988Sjohnlev 
371843e1988Sjohnlev 	pfn = hat_getpfnum(kas.a_hat, (caddr_t)mfn_list_pages_page);
372843e1988Sjohnlev 	HYPERVISOR_shared_info->arch.pfn_to_mfn_frame_list_list
373843e1988Sjohnlev 	    = pfn_to_mfn(pfn);
374843e1988Sjohnlev }
375843e1988Sjohnlev 
376843e1988Sjohnlev static void
377843e1988Sjohnlev suspend_cpus(void)
378843e1988Sjohnlev {
379843e1988Sjohnlev 	int i;
380843e1988Sjohnlev 
381843e1988Sjohnlev 	SUSPEND_DEBUG("suspend_cpus\n");
382843e1988Sjohnlev 
3831d03c31eSjohnlev 	mp_enter_barrier();
384843e1988Sjohnlev 
385843e1988Sjohnlev 	for (i = 1; i < ncpus; i++) {
386843e1988Sjohnlev 		if (!CPU_IN_SET(cpu_suspend_lost_set, i)) {
387843e1988Sjohnlev 			SUSPEND_DEBUG("xen_vcpu_down %d\n", i);
388843e1988Sjohnlev 			(void) xen_vcpu_down(i);
389843e1988Sjohnlev 		}
390843e1988Sjohnlev 
391843e1988Sjohnlev 		mach_cpucontext_reset(cpu[i]);
392843e1988Sjohnlev 	}
393843e1988Sjohnlev }
394843e1988Sjohnlev 
395843e1988Sjohnlev static void
396843e1988Sjohnlev resume_cpus(void)
397843e1988Sjohnlev {
398843e1988Sjohnlev 	int i;
399843e1988Sjohnlev 
400843e1988Sjohnlev 	for (i = 1; i < ncpus; i++) {
401843e1988Sjohnlev 		if (cpu[i] == NULL)
402843e1988Sjohnlev 			continue;
403843e1988Sjohnlev 
404843e1988Sjohnlev 		if (!CPU_IN_SET(cpu_suspend_lost_set, i)) {
405843e1988Sjohnlev 			SUSPEND_DEBUG("xen_vcpu_up %d\n", i);
406843e1988Sjohnlev 			mach_cpucontext_restore(cpu[i]);
407843e1988Sjohnlev 			(void) xen_vcpu_up(i);
408843e1988Sjohnlev 		}
409843e1988Sjohnlev 	}
410843e1988Sjohnlev 
4111d03c31eSjohnlev 	mp_leave_barrier();
412843e1988Sjohnlev }
413843e1988Sjohnlev 
414843e1988Sjohnlev /*
415843e1988Sjohnlev  * Top level routine to direct suspend/resume of a domain.
416843e1988Sjohnlev  */
417843e1988Sjohnlev void
418843e1988Sjohnlev xen_suspend_domain(void)
419843e1988Sjohnlev {
420843e1988Sjohnlev 	extern void rtcsync(void);
421843e1988Sjohnlev 	extern hrtime_t hres_last_tick;
422843e1988Sjohnlev 	mfn_t start_info_mfn;
423843e1988Sjohnlev 	ulong_t flags;
424843e1988Sjohnlev 	pfn_t pfn;
425843e1988Sjohnlev 	int i;
426843e1988Sjohnlev 
427843e1988Sjohnlev 	/*
4289e839ce9Sgarypen 	 * Check that we are happy to suspend on this hypervisor.
4299e839ce9Sgarypen 	 */
4309e839ce9Sgarypen 	if (xen_hypervisor_supports_solaris(XEN_SUSPEND_CHECK) == 0) {
4319e839ce9Sgarypen 		cpr_err(CE_WARN, "Cannot suspend on this hypervisor "
4329e839ce9Sgarypen 		    "version: v%lu.%lu%s, need at least version v3.0.4 or "
4339e839ce9Sgarypen 		    "-xvm based hypervisor", XENVER_CURRENT(xv_major),
4349e839ce9Sgarypen 		    XENVER_CURRENT(xv_minor), XENVER_CURRENT(xv_ver));
4359e839ce9Sgarypen 		return;
4369e839ce9Sgarypen 	}
4379e839ce9Sgarypen 
4389e839ce9Sgarypen 	/*
439843e1988Sjohnlev 	 * XXPV - Are we definitely OK to suspend by the time we've connected
440843e1988Sjohnlev 	 * the handler?
441843e1988Sjohnlev 	 */
442843e1988Sjohnlev 
443843e1988Sjohnlev 	cpr_err(CE_NOTE, "Domain suspending for save/migrate");
444843e1988Sjohnlev 
445843e1988Sjohnlev 	SUSPEND_DEBUG("xen_suspend_domain\n");
446843e1988Sjohnlev 
447843e1988Sjohnlev 	/*
448843e1988Sjohnlev 	 * suspend interrupts and devices
449843e1988Sjohnlev 	 * XXPV - we use suspend/resume for both save/restore domains (like sun
450843e1988Sjohnlev 	 * cpr) and for migration.  Would be nice to know the difference if
451843e1988Sjohnlev 	 * possible.  For save/restore where down time may be a long time, we
452843e1988Sjohnlev 	 * may want to do more of the things that cpr does.  (i.e. notify user
453843e1988Sjohnlev 	 * processes, shrink memory footprint for faster restore, etc.)
454843e1988Sjohnlev 	 */
455843e1988Sjohnlev 	xen_suspend_devices();
456843e1988Sjohnlev 	SUSPEND_DEBUG("xenbus_suspend\n");
457843e1988Sjohnlev 	xenbus_suspend();
458843e1988Sjohnlev 
459843e1988Sjohnlev 	pfn = hat_getpfnum(kas.a_hat, (caddr_t)xen_info);
460843e1988Sjohnlev 	start_info_mfn = pfn_to_mfn(pfn);
461843e1988Sjohnlev 
462843e1988Sjohnlev 	/*
463843e1988Sjohnlev 	 * XXPV: cpu hotplug can hold this under a xenbus watch. Are we safe
464843e1988Sjohnlev 	 * wrt xenbus being suspended here?
465843e1988Sjohnlev 	 */
466843e1988Sjohnlev 	mutex_enter(&cpu_lock);
467843e1988Sjohnlev 
468843e1988Sjohnlev 	/*
469843e1988Sjohnlev 	 * Suspend must be done on vcpu 0, as no context for other CPUs is
470843e1988Sjohnlev 	 * saved.
471843e1988Sjohnlev 	 *
472843e1988Sjohnlev 	 * XXPV - add to taskq API ?
473843e1988Sjohnlev 	 */
474843e1988Sjohnlev 	thread_affinity_set(curthread, 0);
475843e1988Sjohnlev 	kpreempt_disable();
476843e1988Sjohnlev 
477843e1988Sjohnlev 	SUSPEND_DEBUG("xen_start_migrate\n");
478843e1988Sjohnlev 	xen_start_migrate();
479843e1988Sjohnlev 	if (ncpus > 1)
480843e1988Sjohnlev 		suspend_cpus();
481843e1988Sjohnlev 
482843e1988Sjohnlev 	/*
483843e1988Sjohnlev 	 * We can grab the ec_lock as it's a spinlock with a high SPL. Hence
484843e1988Sjohnlev 	 * any holder would have dropped it to get through suspend_cpus().
485843e1988Sjohnlev 	 */
486843e1988Sjohnlev 	mutex_enter(&ec_lock);
487843e1988Sjohnlev 
488843e1988Sjohnlev 	/*
489843e1988Sjohnlev 	 * From here on in, we can't take locks.
490843e1988Sjohnlev 	 */
491843e1988Sjohnlev 	SUSPEND_DEBUG("ec_suspend\n");
492843e1988Sjohnlev 	ec_suspend();
493843e1988Sjohnlev 	SUSPEND_DEBUG("gnttab_suspend\n");
494843e1988Sjohnlev 	gnttab_suspend();
495843e1988Sjohnlev 
496843e1988Sjohnlev 	flags = intr_clear();
497843e1988Sjohnlev 
498843e1988Sjohnlev 	xpv_time_suspend();
499843e1988Sjohnlev 
500843e1988Sjohnlev 	/*
501843e1988Sjohnlev 	 * Currently, the hypervisor incorrectly fails to bring back
502843e1988Sjohnlev 	 * powered-down VCPUs.  Thus we need to record any powered-down VCPUs
503843e1988Sjohnlev 	 * to prevent any attempts to operate on them.  But we have to do this
504843e1988Sjohnlev 	 * *after* the very first time we do ec_suspend().
505843e1988Sjohnlev 	 */
506843e1988Sjohnlev 	for (i = 1; i < ncpus; i++) {
507843e1988Sjohnlev 		if (cpu[i] == NULL)
508843e1988Sjohnlev 			continue;
509843e1988Sjohnlev 
510843e1988Sjohnlev 		if (cpu_get_state(cpu[i]) == P_POWEROFF)
511843e1988Sjohnlev 			CPUSET_ATOMIC_ADD(cpu_suspend_lost_set, i);
512843e1988Sjohnlev 	}
513843e1988Sjohnlev 
514843e1988Sjohnlev 	/*
515843e1988Sjohnlev 	 * The dom0 save/migrate code doesn't automatically translate
516843e1988Sjohnlev 	 * these into PFNs, but expects them to be, so we do it here.
517843e1988Sjohnlev 	 * We don't use mfn_to_pfn() because so many OS services have
518843e1988Sjohnlev 	 * been disabled at this point.
519843e1988Sjohnlev 	 */
520843e1988Sjohnlev 	xen_info->store_mfn = mfn_to_pfn_mapping[xen_info->store_mfn];
521843e1988Sjohnlev 	xen_info->console.domU.mfn =
522843e1988Sjohnlev 	    mfn_to_pfn_mapping[xen_info->console.domU.mfn];
523843e1988Sjohnlev 
524843e1988Sjohnlev 	if (CPU->cpu_m.mcpu_vcpu_info->evtchn_upcall_mask == 0) {
525843e1988Sjohnlev 		prom_printf("xen_suspend_domain(): "
526843e1988Sjohnlev 		    "CPU->cpu_m.mcpu_vcpu_info->evtchn_upcall_mask not set\n");
527843e1988Sjohnlev 		(void) HYPERVISOR_shutdown(SHUTDOWN_crash);
528843e1988Sjohnlev 	}
529843e1988Sjohnlev 
530843e1988Sjohnlev 	if (HYPERVISOR_update_va_mapping((uintptr_t)HYPERVISOR_shared_info,
531843e1988Sjohnlev 	    0, UVMF_INVLPG)) {
532843e1988Sjohnlev 		prom_printf("xen_suspend_domain(): "
533843e1988Sjohnlev 		    "HYPERVISOR_update_va_mapping() failed\n");
534843e1988Sjohnlev 		(void) HYPERVISOR_shutdown(SHUTDOWN_crash);
535843e1988Sjohnlev 	}
536843e1988Sjohnlev 
537843e1988Sjohnlev 	SUSPEND_DEBUG("HYPERVISOR_suspend\n");
538843e1988Sjohnlev 
539843e1988Sjohnlev 	/*
540843e1988Sjohnlev 	 * At this point we suspend and sometime later resume.
541843e1988Sjohnlev 	 */
542843e1988Sjohnlev 	if (HYPERVISOR_suspend(start_info_mfn)) {
543843e1988Sjohnlev 		prom_printf("xen_suspend_domain(): "
544843e1988Sjohnlev 		    "HYPERVISOR_suspend() failed\n");
545843e1988Sjohnlev 		(void) HYPERVISOR_shutdown(SHUTDOWN_crash);
546843e1988Sjohnlev 	}
547843e1988Sjohnlev 
548843e1988Sjohnlev 	/*
549843e1988Sjohnlev 	 * Point HYPERVISOR_shared_info to its new value.
550843e1988Sjohnlev 	 */
551843e1988Sjohnlev 	if (HYPERVISOR_update_va_mapping((uintptr_t)HYPERVISOR_shared_info,
552843e1988Sjohnlev 	    xen_info->shared_info | PT_NOCONSIST | PT_VALID | PT_WRITABLE,
553843e1988Sjohnlev 	    UVMF_INVLPG))
554843e1988Sjohnlev 		(void) HYPERVISOR_shutdown(SHUTDOWN_crash);
555843e1988Sjohnlev 
556843e1988Sjohnlev 	if (xen_info->nr_pages != mfn_count) {
557843e1988Sjohnlev 		prom_printf("xen_suspend_domain(): number of pages"
558843e1988Sjohnlev 		    " changed, was 0x%lx, now 0x%lx\n", mfn_count,
559843e1988Sjohnlev 		    xen_info->nr_pages);
560843e1988Sjohnlev 		(void) HYPERVISOR_shutdown(SHUTDOWN_crash);
561843e1988Sjohnlev 	}
562843e1988Sjohnlev 
563843e1988Sjohnlev 	xpv_time_resume();
564843e1988Sjohnlev 
565843e1988Sjohnlev 	cached_max_mfn = 0;
566843e1988Sjohnlev 
567843e1988Sjohnlev 	SUSPEND_DEBUG("gnttab_resume\n");
568843e1988Sjohnlev 	gnttab_resume();
569843e1988Sjohnlev 
570843e1988Sjohnlev 	/* XXPV: add a note that this must be lockless. */
571843e1988Sjohnlev 	SUSPEND_DEBUG("ec_resume\n");
572843e1988Sjohnlev 	ec_resume();
573843e1988Sjohnlev 
574843e1988Sjohnlev 	intr_restore(flags);
575843e1988Sjohnlev 
576843e1988Sjohnlev 	if (ncpus > 1)
577843e1988Sjohnlev 		resume_cpus();
578843e1988Sjohnlev 
579843e1988Sjohnlev 	mutex_exit(&ec_lock);
580843e1988Sjohnlev 	xen_end_migrate();
581843e1988Sjohnlev 	mutex_exit(&cpu_lock);
582843e1988Sjohnlev 
583843e1988Sjohnlev 	/*
584843e1988Sjohnlev 	 * Now we can take locks again.
585843e1988Sjohnlev 	 */
586843e1988Sjohnlev 
587843e1988Sjohnlev 	/*
588843e1988Sjohnlev 	 * Force the tick value used for tv_nsec in hres_tick() to be up to
589843e1988Sjohnlev 	 * date. rtcsync() will reset the hrestime value appropriately.
590843e1988Sjohnlev 	 */
591843e1988Sjohnlev 	hres_last_tick = xpv_gethrtime();
592843e1988Sjohnlev 
593843e1988Sjohnlev 	/*
594843e1988Sjohnlev 	 * XXPV: we need to have resumed the CPUs since this takes locks, but
595843e1988Sjohnlev 	 * can remote CPUs see bad state? Presumably yes. Should probably nest
596843e1988Sjohnlev 	 * taking of todlock inside of cpu_lock, or vice versa, then provide an
597843e1988Sjohnlev 	 * unlocked version.  Probably need to call clkinitf to reset cpu freq
598843e1988Sjohnlev 	 * and re-calibrate if we migrated to a different speed cpu.  Also need
599843e1988Sjohnlev 	 * to make a (re)init_cpu_info call to update processor info structs
600843e1988Sjohnlev 	 * and device tree info.  That remains to be written at the moment.
601843e1988Sjohnlev 	 */
602843e1988Sjohnlev 	rtcsync();
603843e1988Sjohnlev 
604843e1988Sjohnlev 	rebuild_mfn_list();
605843e1988Sjohnlev 
606843e1988Sjohnlev 	SUSPEND_DEBUG("xenbus_resume\n");
607843e1988Sjohnlev 	xenbus_resume();
608843e1988Sjohnlev 	SUSPEND_DEBUG("xenbus_resume_devices\n");
609843e1988Sjohnlev 	xen_resume_devices();
610843e1988Sjohnlev 
611843e1988Sjohnlev 	thread_affinity_clear(curthread);
612843e1988Sjohnlev 	kpreempt_enable();
613843e1988Sjohnlev 
614843e1988Sjohnlev 	SUSPEND_DEBUG("finished xen_suspend_domain\n");
6159e839ce9Sgarypen 
6169e839ce9Sgarypen 	/*
6179e839ce9Sgarypen 	 * We have restarted our suspended domain, update the hypervisor
6189e839ce9Sgarypen 	 * details. NB: This must be done at the end of this function,
6199e839ce9Sgarypen 	 * since we need the domain to be completely resumed before
6209e839ce9Sgarypen 	 * these functions will work correctly.
6219e839ce9Sgarypen 	 */
6229e839ce9Sgarypen 	xen_set_version(XENVER_CURRENT_IDX);
6239e839ce9Sgarypen 
6249e839ce9Sgarypen 	/*
6259e839ce9Sgarypen 	 * We can check and report a warning, but we don't stop the
6269e839ce9Sgarypen 	 * process.
6279e839ce9Sgarypen 	 */
6289e839ce9Sgarypen 	if (xen_hypervisor_supports_solaris(XEN_SUSPEND_CHECK) == 0)
6299e839ce9Sgarypen 		cmn_err(CE_WARN, "Found hypervisor version: v%lu.%lu%s "
6309e839ce9Sgarypen 		    "but need at least version v3.0.4",
6319e839ce9Sgarypen 		    XENVER_CURRENT(xv_major), XENVER_CURRENT(xv_minor),
6329e839ce9Sgarypen 		    XENVER_CURRENT(xv_ver));
6339e839ce9Sgarypen 
634843e1988Sjohnlev 	cmn_err(CE_NOTE, "domain restore/migrate completed");
635843e1988Sjohnlev }
636843e1988Sjohnlev 
637843e1988Sjohnlev /*ARGSUSED*/
638843e1988Sjohnlev int
639843e1988Sjohnlev xen_debug_handler(void *arg)
640843e1988Sjohnlev {
641843e1988Sjohnlev 	debug_enter("External debug event received");
642843e1988Sjohnlev 
643843e1988Sjohnlev 	/*
644843e1988Sjohnlev 	 * If we've not got KMDB loaded, output some stuff difficult to capture
645843e1988Sjohnlev 	 * from a domain core.
646843e1988Sjohnlev 	 */
647843e1988Sjohnlev 	if (!(boothowto & RB_DEBUG)) {
648843e1988Sjohnlev 		shared_info_t *si = HYPERVISOR_shared_info;
649843e1988Sjohnlev 		int i;
650843e1988Sjohnlev 
651843e1988Sjohnlev 		prom_printf("evtchn_pending [ ");
652843e1988Sjohnlev 		for (i = 0; i < 8; i++)
653843e1988Sjohnlev 			prom_printf("%lx ", si->evtchn_pending[i]);
654843e1988Sjohnlev 		prom_printf("]\nevtchn_mask [ ");
655843e1988Sjohnlev 		for (i = 0; i < 8; i++)
656843e1988Sjohnlev 			prom_printf("%lx ", si->evtchn_mask[i]);
657843e1988Sjohnlev 		prom_printf("]\n");
658843e1988Sjohnlev 
659843e1988Sjohnlev 		for (i = 0; i < ncpus; i++) {
660843e1988Sjohnlev 			vcpu_info_t *vcpu = &si->vcpu_info[i];
661843e1988Sjohnlev 			if (cpu[i] == NULL)
662843e1988Sjohnlev 				continue;
663843e1988Sjohnlev 			prom_printf("CPU%d pending %d mask %d sel %lx\n",
664843e1988Sjohnlev 			    i, vcpu->evtchn_upcall_pending,
665843e1988Sjohnlev 			    vcpu->evtchn_upcall_mask,
666843e1988Sjohnlev 			    vcpu->evtchn_pending_sel);
667843e1988Sjohnlev 		}
668843e1988Sjohnlev 	}
669843e1988Sjohnlev 
670843e1988Sjohnlev 	return (0);
671843e1988Sjohnlev }
672843e1988Sjohnlev 
673843e1988Sjohnlev /*ARGSUSED*/
674843e1988Sjohnlev static void
675843e1988Sjohnlev xen_sysrq_handler(struct xenbus_watch *watch, const char **vec,
676843e1988Sjohnlev     unsigned int len)
677843e1988Sjohnlev {
678843e1988Sjohnlev 	xenbus_transaction_t xbt;
679843e1988Sjohnlev 	char key = '\0';
680843e1988Sjohnlev 	int ret;
681843e1988Sjohnlev 
682843e1988Sjohnlev retry:
683843e1988Sjohnlev 	if (xenbus_transaction_start(&xbt)) {
684843e1988Sjohnlev 		cmn_err(CE_WARN, "failed to start sysrq transaction");
685843e1988Sjohnlev 		return;
686843e1988Sjohnlev 	}
687843e1988Sjohnlev 
688843e1988Sjohnlev 	if ((ret = xenbus_scanf(xbt, "control", "sysrq", "%c", &key)) != 0) {
689843e1988Sjohnlev 		/*
690843e1988Sjohnlev 		 * ENOENT happens in response to our own xenbus_rm.
691843e1988Sjohnlev 		 * XXPV - this happens spuriously on boot?
692843e1988Sjohnlev 		 */
693843e1988Sjohnlev 		if (ret != ENOENT)
694843e1988Sjohnlev 			cmn_err(CE_WARN, "failed to read sysrq: %d", ret);
695843e1988Sjohnlev 		goto out;
696843e1988Sjohnlev 	}
697843e1988Sjohnlev 
698843e1988Sjohnlev 	if ((ret = xenbus_rm(xbt, "control", "sysrq")) != 0) {
699843e1988Sjohnlev 		cmn_err(CE_WARN, "failed to reset sysrq: %d", ret);
700843e1988Sjohnlev 		goto out;
701843e1988Sjohnlev 	}
702843e1988Sjohnlev 
703843e1988Sjohnlev 	if (xenbus_transaction_end(xbt, 0) == EAGAIN)
704843e1988Sjohnlev 		goto retry;
705843e1988Sjohnlev 
706843e1988Sjohnlev 	/*
707843e1988Sjohnlev 	 * Somewhat arbitrary - on Linux this means 'reboot'. We could just
708843e1988Sjohnlev 	 * accept any key, but this might increase the risk of sending a
709843e1988Sjohnlev 	 * harmless sysrq to the wrong domain...
710843e1988Sjohnlev 	 */
711843e1988Sjohnlev 	if (key == 'b')
712843e1988Sjohnlev 		(void) xen_debug_handler(NULL);
713843e1988Sjohnlev 	else
714843e1988Sjohnlev 		cmn_err(CE_WARN, "Ignored sysrq %c", key);
715843e1988Sjohnlev 	return;
716843e1988Sjohnlev 
717843e1988Sjohnlev out:
718843e1988Sjohnlev 	(void) xenbus_transaction_end(xbt, 1);
719843e1988Sjohnlev }
720843e1988Sjohnlev 
/*
 * Task queue created in xen_late_startup(); xen_shutdown_handler()
 * dispatches xen_shutdown() onto it.
 */
721843e1988Sjohnlev taskq_t *xen_shutdown_tq;
722843e1988Sjohnlev 
/*
 * Shutdown request codes.  The non-negative codes double as indices into
 * cmd_strings[]; SHUTDOWN_MAX is the number of valid codes.  The negative
 * constant is parenthesized so that it expands safely inside any
 * expression.
 */
#define	SHUTDOWN_INVALID	(-1)
#define	SHUTDOWN_POWEROFF	0
#define	SHUTDOWN_REBOOT		1
#define	SHUTDOWN_SUSPEND	2
#define	SHUTDOWN_HALT		3
#define	SHUTDOWN_MAX		4

/* Timeout, in seconds, passed to timeout() for xen_dirty_shutdown(). */
#define	SHUTDOWN_TIMEOUT_SECS (60 * 5)

/* Command name for each shutdown code, indexed by SHUTDOWN_* value. */
static const char *cmd_strings[SHUTDOWN_MAX] = {
	"poweroff",
	"reboot",
	"suspend",
	"halt"
};
738843e1988Sjohnlev 
739843e1988Sjohnlev static void
740843e1988Sjohnlev xen_dirty_shutdown(void *arg)
741843e1988Sjohnlev {
742843e1988Sjohnlev 	int cmd = (uintptr_t)arg;
743843e1988Sjohnlev 
744843e1988Sjohnlev 	cmn_err(CE_WARN, "Externally requested shutdown failed or "
745843e1988Sjohnlev 	    "timed out.\nShutting down.\n");
746843e1988Sjohnlev 
747843e1988Sjohnlev 	switch (cmd) {
748843e1988Sjohnlev 	case SHUTDOWN_HALT:
749843e1988Sjohnlev 	case SHUTDOWN_POWEROFF:
750843e1988Sjohnlev 		(void) kadmin(A_SHUTDOWN, AD_POWEROFF, NULL, kcred);
751843e1988Sjohnlev 		break;
752843e1988Sjohnlev 	case SHUTDOWN_REBOOT:
753843e1988Sjohnlev 		(void) kadmin(A_REBOOT, AD_BOOT, NULL, kcred);
754843e1988Sjohnlev 		break;
755843e1988Sjohnlev 	}
756843e1988Sjohnlev }
757843e1988Sjohnlev 
758843e1988Sjohnlev static void
759843e1988Sjohnlev xen_shutdown(void *arg)
760843e1988Sjohnlev {
761843e1988Sjohnlev 	int cmd = (uintptr_t)arg;
762c48ac12eSjohnlev 	proc_t *initpp;
763843e1988Sjohnlev 
764843e1988Sjohnlev 	ASSERT(cmd > SHUTDOWN_INVALID && cmd < SHUTDOWN_MAX);
765843e1988Sjohnlev 
766843e1988Sjohnlev 	if (cmd == SHUTDOWN_SUSPEND) {
767843e1988Sjohnlev 		xen_suspend_domain();
768843e1988Sjohnlev 		return;
769843e1988Sjohnlev 	}
770843e1988Sjohnlev 
771c48ac12eSjohnlev 	switch (cmd) {
772c48ac12eSjohnlev 	case SHUTDOWN_POWEROFF:
773c48ac12eSjohnlev 		force_shutdown_method = AD_POWEROFF;
774c48ac12eSjohnlev 		break;
775c48ac12eSjohnlev 	case SHUTDOWN_HALT:
776c48ac12eSjohnlev 		force_shutdown_method = AD_HALT;
777c48ac12eSjohnlev 		break;
778c48ac12eSjohnlev 	case SHUTDOWN_REBOOT:
779c48ac12eSjohnlev 		force_shutdown_method = AD_BOOT;
780c48ac12eSjohnlev 		break;
781c48ac12eSjohnlev 	}
782843e1988Sjohnlev 
783c48ac12eSjohnlev 	/*
784c48ac12eSjohnlev 	 * If we're still booting and init(1) isn't set up yet, simply halt.
785c48ac12eSjohnlev 	 */
786c48ac12eSjohnlev 	mutex_enter(&pidlock);
787c48ac12eSjohnlev 	initpp = prfind(P_INITPID);
788c48ac12eSjohnlev 	mutex_exit(&pidlock);
789c48ac12eSjohnlev 	if (initpp == NULL) {
790c48ac12eSjohnlev 		extern void halt(char *);
791c48ac12eSjohnlev 		halt("Power off the System");   /* just in case */
792c48ac12eSjohnlev 	}
793843e1988Sjohnlev 
794c48ac12eSjohnlev 	/*
795c48ac12eSjohnlev 	 * else, graceful shutdown with inittab and all getting involved
796c48ac12eSjohnlev 	 */
797c48ac12eSjohnlev 	psignal(initpp, SIGPWR);
798843e1988Sjohnlev 
799843e1988Sjohnlev 	(void) timeout(xen_dirty_shutdown, arg,
800843e1988Sjohnlev 	    SHUTDOWN_TIMEOUT_SECS * drv_usectohz(MICROSEC));
801843e1988Sjohnlev }
802843e1988Sjohnlev 
803843e1988Sjohnlev /*ARGSUSED*/
804843e1988Sjohnlev static void
805843e1988Sjohnlev xen_shutdown_handler(struct xenbus_watch *watch, const char **vec,
806843e1988Sjohnlev 	unsigned int len)
807843e1988Sjohnlev {
808843e1988Sjohnlev 	char *str;
809843e1988Sjohnlev 	xenbus_transaction_t xbt;
810843e1988Sjohnlev 	int err, shutdown_code = SHUTDOWN_INVALID;
811843e1988Sjohnlev 	unsigned int slen;
812843e1988Sjohnlev 
813843e1988Sjohnlev again:
814843e1988Sjohnlev 	err = xenbus_transaction_start(&xbt);
815843e1988Sjohnlev 	if (err)
816843e1988Sjohnlev 		return;
817843e1988Sjohnlev 	if (xenbus_read(xbt, "control", "shutdown", (void *)&str, &slen)) {
818843e1988Sjohnlev 		(void) xenbus_transaction_end(xbt, 1);
819843e1988Sjohnlev 		return;
820843e1988Sjohnlev 	}
821843e1988Sjohnlev 
822843e1988Sjohnlev 	SUSPEND_DEBUG("%d: xen_shutdown_handler: \"%s\"\n", CPU->cpu_id, str);
823843e1988Sjohnlev 
824843e1988Sjohnlev 	/*
825843e1988Sjohnlev 	 * If this is a watch fired from our write below, check out early to
826843e1988Sjohnlev 	 * avoid an infinite loop.
827843e1988Sjohnlev 	 */
828843e1988Sjohnlev 	if (strcmp(str, "") == 0) {
829843e1988Sjohnlev 		(void) xenbus_transaction_end(xbt, 0);
830843e1988Sjohnlev 		kmem_free(str, slen);
831843e1988Sjohnlev 		return;
832843e1988Sjohnlev 	} else if (strcmp(str, "poweroff") == 0) {
833843e1988Sjohnlev 		shutdown_code = SHUTDOWN_POWEROFF;
834843e1988Sjohnlev 	} else if (strcmp(str, "reboot") == 0) {
835843e1988Sjohnlev 		shutdown_code = SHUTDOWN_REBOOT;
836843e1988Sjohnlev 	} else if (strcmp(str, "suspend") == 0) {
837843e1988Sjohnlev 		shutdown_code = SHUTDOWN_SUSPEND;
838843e1988Sjohnlev 	} else if (strcmp(str, "halt") == 0) {
839843e1988Sjohnlev 		shutdown_code = SHUTDOWN_HALT;
840843e1988Sjohnlev 	} else {
841843e1988Sjohnlev 		printf("Ignoring shutdown request: %s\n", str);
842843e1988Sjohnlev 	}
843843e1988Sjohnlev 
844843e1988Sjohnlev 	/*
845843e1988Sjohnlev 	 * XXPV	Should we check the value of xenbus_write() too, or are all
846843e1988Sjohnlev 	 *	errors automatically folded into xenbus_transaction_end() ??
847843e1988Sjohnlev 	 */
848843e1988Sjohnlev 	(void) xenbus_write(xbt, "control", "shutdown", "");
849843e1988Sjohnlev 	err = xenbus_transaction_end(xbt, 0);
850843e1988Sjohnlev 	if (err == EAGAIN) {
851843e1988Sjohnlev 		SUSPEND_DEBUG("%d: trying again\n", CPU->cpu_id);
852843e1988Sjohnlev 		kmem_free(str, slen);
853843e1988Sjohnlev 		goto again;
854843e1988Sjohnlev 	}
855843e1988Sjohnlev 
856843e1988Sjohnlev 	kmem_free(str, slen);
857843e1988Sjohnlev 	if (shutdown_code != SHUTDOWN_INVALID) {
858843e1988Sjohnlev 		(void) taskq_dispatch(xen_shutdown_tq, xen_shutdown,
859843e1988Sjohnlev 		    (void *)(intptr_t)shutdown_code, 0);
860843e1988Sjohnlev 	}
861843e1988Sjohnlev }
862843e1988Sjohnlev 
863843e1988Sjohnlev static struct xenbus_watch shutdown_watch;
864843e1988Sjohnlev static struct xenbus_watch sysrq_watch;
865843e1988Sjohnlev 
866843e1988Sjohnlev void
867843e1988Sjohnlev xen_late_startup(void)
868843e1988Sjohnlev {
869843e1988Sjohnlev 	if (!DOMAIN_IS_INITDOMAIN(xen_info)) {
870843e1988Sjohnlev 		xen_shutdown_tq = taskq_create("shutdown_taskq", 1,
871843e1988Sjohnlev 		    maxclsyspri - 1, 1, 1, TASKQ_PREPOPULATE);
872843e1988Sjohnlev 		shutdown_watch.node = "control/shutdown";
873843e1988Sjohnlev 		shutdown_watch.callback = xen_shutdown_handler;
874843e1988Sjohnlev 		if (register_xenbus_watch(&shutdown_watch))
875843e1988Sjohnlev 			cmn_err(CE_WARN, "Failed to set shutdown watcher");
876843e1988Sjohnlev 
877843e1988Sjohnlev 		sysrq_watch.node = "control/sysrq";
878843e1988Sjohnlev 		sysrq_watch.callback = xen_sysrq_handler;
879843e1988Sjohnlev 		if (register_xenbus_watch(&sysrq_watch))
880843e1988Sjohnlev 			cmn_err(CE_WARN, "Failed to set sysrq watcher");
881843e1988Sjohnlev 	}
882843e1988Sjohnlev 	balloon_init(xen_info->nr_pages);
883843e1988Sjohnlev }
884843e1988Sjohnlev 
#ifdef DEBUG
#define	XEN_PRINTF_BUFSIZE	1024

/* Single shared formatting buffer used by xen_printf(). */
char xen_printf_buffer[XEN_PRINTF_BUFSIZE];

/*
 * Printf function that calls the hypervisor directly: the message is
 * formatted into xen_printf_buffer and written with the CONSOLEIO_write
 * console hypercall.  For DomU it only works when running on a xen
 * hypervisor built with debug on.  No I/O ring interaction is needed, so
 * it is usable at any time.
 */
/*PRINTFLIKE1*/
void
xen_printf(const char *fmt, ...)
{
	va_list	args;
	size_t	msglen;

	va_start(args, fmt);
	(void) vsnprintf(xen_printf_buffer, sizeof (xen_printf_buffer),
	    fmt, args);
	va_end(args);

	msglen = strlen(xen_printf_buffer);
	(void) HYPERVISOR_console_io(CONSOLEIO_write, msglen,
	    xen_printf_buffer);
}
#else
/*
 * Non-DEBUG builds: xen_printf() does nothing.
 */
void
xen_printf(const char *fmt, ...)
{
}
#endif	/* DEBUG */
914843e1988Sjohnlev 
915843e1988Sjohnlev void
916ab4a9bebSjohnlev startup_xen_version(void)
917843e1988Sjohnlev {
9189e839ce9Sgarypen 	xen_set_version(XENVER_BOOT_IDX);
9199e839ce9Sgarypen 	if (xen_hypervisor_supports_solaris(XEN_RUN_CHECK) == 0)
9209e839ce9Sgarypen 		cmn_err(CE_WARN, "Found hypervisor version: v%lu.%lu%s "
9219e839ce9Sgarypen 		    "but need at least version v3.0.4",
9229e839ce9Sgarypen 		    XENVER_CURRENT(xv_major), XENVER_CURRENT(xv_minor),
9239e839ce9Sgarypen 		    XENVER_CURRENT(xv_ver));
924ab4a9bebSjohnlev 	xen_pte_workaround();
925843e1988Sjohnlev }
926843e1988Sjohnlev 
927*e4b86885SCheng Sean Ye int xen_mca_simulate_mc_physinfo_failure = 0;
928*e4b86885SCheng Sean Ye 
929*e4b86885SCheng Sean Ye void
930*e4b86885SCheng Sean Ye startup_xen_mca(void)
931*e4b86885SCheng Sean Ye {
932*e4b86885SCheng Sean Ye 	if (!DOMAIN_IS_INITDOMAIN(xen_info))
933*e4b86885SCheng Sean Ye 		return;
934*e4b86885SCheng Sean Ye 
935*e4b86885SCheng Sean Ye 	xen_phys_ncpus = 0;
936*e4b86885SCheng Sean Ye 	xen_phys_cpus = NULL;
937*e4b86885SCheng Sean Ye 
938*e4b86885SCheng Sean Ye 	if (xen_mca_simulate_mc_physinfo_failure ||
939*e4b86885SCheng Sean Ye 	    xen_get_mc_physcpuinfo(NULL, &xen_phys_ncpus) != 0) {
940*e4b86885SCheng Sean Ye 		cmn_err(CE_WARN,
941*e4b86885SCheng Sean Ye 		    "%sxen_get_mc_physinfo failure during xen MCA startup: "
942*e4b86885SCheng Sean Ye 		    "there will be no machine check support",
943*e4b86885SCheng Sean Ye 		    xen_mca_simulate_mc_physinfo_failure ? "(simulated) " : "");
944*e4b86885SCheng Sean Ye 		return;
945*e4b86885SCheng Sean Ye 	}
946*e4b86885SCheng Sean Ye 
947*e4b86885SCheng Sean Ye 	xen_phys_cpus = kmem_alloc(xen_phys_ncpus *
948*e4b86885SCheng Sean Ye 	    sizeof (xen_mc_logical_cpu_t), KM_NOSLEEP);
949*e4b86885SCheng Sean Ye 
950*e4b86885SCheng Sean Ye 	if (xen_phys_cpus == NULL) {
951*e4b86885SCheng Sean Ye 		cmn_err(CE_WARN,
952*e4b86885SCheng Sean Ye 		    "xen_get_physinfo failure: can't allocate CPU array");
953*e4b86885SCheng Sean Ye 		return;
954*e4b86885SCheng Sean Ye 	}
955*e4b86885SCheng Sean Ye 
956*e4b86885SCheng Sean Ye 	if (xen_get_mc_physcpuinfo(xen_phys_cpus, &xen_phys_ncpus) != 0) {
957*e4b86885SCheng Sean Ye 		cmn_err(CE_WARN, "xen_get_mc_physinfo failure: no "
958*e4b86885SCheng Sean Ye 		    "physical CPU info");
959*e4b86885SCheng Sean Ye 		kmem_free(xen_phys_cpus,
960*e4b86885SCheng Sean Ye 		    xen_phys_ncpus * sizeof (xen_mc_logical_cpu_t));
961*e4b86885SCheng Sean Ye 		xen_phys_ncpus = 0;
962*e4b86885SCheng Sean Ye 		xen_phys_cpus = NULL;
963*e4b86885SCheng Sean Ye 	}
964*e4b86885SCheng Sean Ye 
965*e4b86885SCheng Sean Ye 	if (xen_physinfo_debug) {
966*e4b86885SCheng Sean Ye 		xen_mc_logical_cpu_t *xcp;
967*e4b86885SCheng Sean Ye 		unsigned i;
968*e4b86885SCheng Sean Ye 
969*e4b86885SCheng Sean Ye 		cmn_err(CE_NOTE, "xvm mca: %u physical cpus:\n",
970*e4b86885SCheng Sean Ye 		    xen_phys_ncpus);
971*e4b86885SCheng Sean Ye 		for (i = 0; i < xen_phys_ncpus; i++) {
972*e4b86885SCheng Sean Ye 			xcp = &xen_phys_cpus[i];
973*e4b86885SCheng Sean Ye 			cmn_err(CE_NOTE, "cpu%u: (%u, %u, %u) apid %u",
974*e4b86885SCheng Sean Ye 			    xcp->mc_cpunr, xcp->mc_chipid, xcp->mc_coreid,
975*e4b86885SCheng Sean Ye 			    xcp->mc_threadid, xcp->mc_apicid);
976*e4b86885SCheng Sean Ye 		}
977*e4b86885SCheng Sean Ye 	}
978*e4b86885SCheng Sean Ye }
979*e4b86885SCheng Sean Ye 
980843e1988Sjohnlev /*
981843e1988Sjohnlev  * Miscellaneous hypercall wrappers with slightly more verbose diagnostics.
982843e1988Sjohnlev  */
983843e1988Sjohnlev 
984843e1988Sjohnlev void
985843e1988Sjohnlev xen_set_gdt(ulong_t *frame_list, int entries)
986843e1988Sjohnlev {
987843e1988Sjohnlev 	int err;
988843e1988Sjohnlev 	if ((err = HYPERVISOR_set_gdt(frame_list, entries)) != 0) {
989843e1988Sjohnlev 		/*
990843e1988Sjohnlev 		 * X_EINVAL:	reserved entry or bad frames
991843e1988Sjohnlev 		 * X_EFAULT:	bad address
992843e1988Sjohnlev 		 */
993843e1988Sjohnlev 		panic("xen_set_gdt(%p, %d): error %d",
994843e1988Sjohnlev 		    (void *)frame_list, entries, -(int)err);
995843e1988Sjohnlev 	}
996843e1988Sjohnlev }
997843e1988Sjohnlev 
998843e1988Sjohnlev void
999843e1988Sjohnlev xen_set_ldt(user_desc_t *ldt, uint_t nsels)
1000843e1988Sjohnlev {
1001843e1988Sjohnlev 	struct mmuext_op	op;
1002843e1988Sjohnlev 	long			err;
1003843e1988Sjohnlev 
1004843e1988Sjohnlev 	op.cmd = MMUEXT_SET_LDT;
1005843e1988Sjohnlev 	op.arg1.linear_addr = (uintptr_t)ldt;
1006843e1988Sjohnlev 	op.arg2.nr_ents = nsels;
1007843e1988Sjohnlev 
1008843e1988Sjohnlev 	if ((err = HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF)) != 0) {
1009843e1988Sjohnlev 		panic("xen_set_ldt(%p, %d): error %d",
1010843e1988Sjohnlev 		    (void *)ldt, nsels, -(int)err);
1011843e1988Sjohnlev 	}
1012843e1988Sjohnlev }
1013843e1988Sjohnlev 
1014843e1988Sjohnlev void
1015843e1988Sjohnlev xen_stack_switch(ulong_t ss, ulong_t esp)
1016843e1988Sjohnlev {
1017843e1988Sjohnlev 	long err;
1018843e1988Sjohnlev 
1019843e1988Sjohnlev 	if ((err = HYPERVISOR_stack_switch(ss, esp)) != 0) {
1020843e1988Sjohnlev 		/*
1021843e1988Sjohnlev 		 * X_EPERM:	bad selector
1022843e1988Sjohnlev 		 */
1023843e1988Sjohnlev 		panic("xen_stack_switch(%lx, %lx): error %d", ss, esp,
1024843e1988Sjohnlev 		    -(int)err);
1025843e1988Sjohnlev 	}
1026843e1988Sjohnlev }
1027843e1988Sjohnlev 
1028843e1988Sjohnlev long
1029843e1988Sjohnlev xen_set_trap_table(trap_info_t *table)
1030843e1988Sjohnlev {
1031843e1988Sjohnlev 	long err;
1032843e1988Sjohnlev 
1033843e1988Sjohnlev 	if ((err = HYPERVISOR_set_trap_table(table)) != 0) {
1034843e1988Sjohnlev 		/*
1035843e1988Sjohnlev 		 * X_EFAULT:	bad address
1036843e1988Sjohnlev 		 * X_EPERM:	bad selector
1037843e1988Sjohnlev 		 */
1038843e1988Sjohnlev 		panic("xen_set_trap_table(%p): error %d", (void *)table,
1039843e1988Sjohnlev 		    -(int)err);
1040843e1988Sjohnlev 	}
1041843e1988Sjohnlev 	return (err);
1042843e1988Sjohnlev }
1043843e1988Sjohnlev 
1044843e1988Sjohnlev #if defined(__amd64)
1045843e1988Sjohnlev void
1046843e1988Sjohnlev xen_set_segment_base(int reg, ulong_t value)
1047843e1988Sjohnlev {
1048843e1988Sjohnlev 	long err;
1049843e1988Sjohnlev 
1050843e1988Sjohnlev 	if ((err = HYPERVISOR_set_segment_base(reg, value)) != 0) {
1051843e1988Sjohnlev 		/*
1052843e1988Sjohnlev 		 * X_EFAULT:	bad address
1053843e1988Sjohnlev 		 * X_EINVAL:	bad type
1054843e1988Sjohnlev 		 */
1055843e1988Sjohnlev 		panic("xen_set_segment_base(%d, %lx): error %d",
1056843e1988Sjohnlev 		    reg, value, -(int)err);
1057843e1988Sjohnlev 	}
1058843e1988Sjohnlev }
1059843e1988Sjohnlev #endif	/* __amd64 */
1060843e1988Sjohnlev 
1061843e1988Sjohnlev /*
1062843e1988Sjohnlev  * Translate a hypervisor errcode to a Solaris error code.
1063843e1988Sjohnlev  */
1064843e1988Sjohnlev int
1065843e1988Sjohnlev xen_xlate_errcode(int error)
1066843e1988Sjohnlev {
1067843e1988Sjohnlev 	switch (-error) {
1068843e1988Sjohnlev 
1069843e1988Sjohnlev 	/*
1070843e1988Sjohnlev 	 * Translate hypervisor errno's into native errno's
1071843e1988Sjohnlev 	 */
1072843e1988Sjohnlev 
1073843e1988Sjohnlev #define	CASE(num)	case X_##num: error = num; break
1074843e1988Sjohnlev 
1075843e1988Sjohnlev 	CASE(EPERM);	CASE(ENOENT);	CASE(ESRCH);
1076843e1988Sjohnlev 	CASE(EINTR);	CASE(EIO);	CASE(ENXIO);
1077843e1988Sjohnlev 	CASE(E2BIG);	CASE(ENOMEM);	CASE(EACCES);
1078843e1988Sjohnlev 	CASE(EFAULT);	CASE(EBUSY);	CASE(EEXIST);
1079843e1988Sjohnlev 	CASE(ENODEV);	CASE(EISDIR);	CASE(EINVAL);
1080843e1988Sjohnlev 	CASE(ENOSPC);	CASE(ESPIPE);	CASE(EROFS);
1081843e1988Sjohnlev 	CASE(ENOSYS);	CASE(ENOTEMPTY); CASE(EISCONN);
1082843e1988Sjohnlev 	CASE(ENODATA);
1083843e1988Sjohnlev 
1084843e1988Sjohnlev #undef CASE
1085843e1988Sjohnlev 
1086843e1988Sjohnlev 	default:
1087843e1988Sjohnlev 		panic("xen_xlate_errcode: unknown error %d", error);
1088843e1988Sjohnlev 	}
1089843e1988Sjohnlev 
1090843e1988Sjohnlev 	return (error);
1091843e1988Sjohnlev }
1092843e1988Sjohnlev 
1093843e1988Sjohnlev /*
1094843e1988Sjohnlev  * Raise PS_IOPL on current vcpu to user level.
1095843e1988Sjohnlev  * Caller responsible for preventing kernel preemption.
1096843e1988Sjohnlev  */
1097843e1988Sjohnlev void
1098843e1988Sjohnlev xen_enable_user_iopl(void)
1099843e1988Sjohnlev {
1100843e1988Sjohnlev 	physdev_set_iopl_t set_iopl;
1101843e1988Sjohnlev 	set_iopl.iopl = 3;		/* user ring 3 */
1102843e1988Sjohnlev 	(void) HYPERVISOR_physdev_op(PHYSDEVOP_set_iopl, &set_iopl);
1103843e1988Sjohnlev }
1104843e1988Sjohnlev 
1105843e1988Sjohnlev /*
1106843e1988Sjohnlev  * Drop PS_IOPL on current vcpu to kernel level
1107843e1988Sjohnlev  */
1108843e1988Sjohnlev void
1109843e1988Sjohnlev xen_disable_user_iopl(void)
1110843e1988Sjohnlev {
1111843e1988Sjohnlev 	physdev_set_iopl_t set_iopl;
1112843e1988Sjohnlev 	set_iopl.iopl = 1;		/* kernel pseudo ring 1 */
1113843e1988Sjohnlev 	(void) HYPERVISOR_physdev_op(PHYSDEVOP_set_iopl, &set_iopl);
1114843e1988Sjohnlev }
1115843e1988Sjohnlev 
1116843e1988Sjohnlev int
1117843e1988Sjohnlev xen_gdt_setprot(cpu_t *cp, uint_t prot)
1118843e1988Sjohnlev {
1119843e1988Sjohnlev 	int err;
1120843e1988Sjohnlev #if defined(__amd64)
1121843e1988Sjohnlev 	int pt_bits = PT_VALID;
1122843e1988Sjohnlev 	if (prot & PROT_WRITE)
1123843e1988Sjohnlev 		pt_bits |= PT_WRITABLE;
1124843e1988Sjohnlev #endif
1125843e1988Sjohnlev 
1126843e1988Sjohnlev 	if ((err = as_setprot(&kas, (caddr_t)cp->cpu_gdt,
1127843e1988Sjohnlev 	    MMU_PAGESIZE, prot)) != 0)
1128843e1988Sjohnlev 		goto done;
1129843e1988Sjohnlev 
1130843e1988Sjohnlev #if defined(__amd64)
1131843e1988Sjohnlev 	err = xen_kpm_page(mmu_btop(cp->cpu_m.mcpu_gdtpa), pt_bits);
1132843e1988Sjohnlev #endif
1133843e1988Sjohnlev 
1134843e1988Sjohnlev done:
1135843e1988Sjohnlev 	if (err) {
1136843e1988Sjohnlev 		cmn_err(CE_WARN, "cpu%d: xen_gdt_setprot(%s) failed: error %d",
1137843e1988Sjohnlev 		    cp->cpu_id, (prot & PROT_WRITE) ? "writable" : "read-only",
1138843e1988Sjohnlev 		    err);
1139843e1988Sjohnlev 	}
1140843e1988Sjohnlev 
1141843e1988Sjohnlev 	return (err);
1142843e1988Sjohnlev }
1143843e1988Sjohnlev 
1144843e1988Sjohnlev int
1145843e1988Sjohnlev xen_ldt_setprot(user_desc_t *ldt, size_t lsize, uint_t prot)
1146843e1988Sjohnlev {
1147843e1988Sjohnlev 	int err;
1148843e1988Sjohnlev 	caddr_t	lva = (caddr_t)ldt;
1149843e1988Sjohnlev #if defined(__amd64)
1150843e1988Sjohnlev 	int pt_bits = PT_VALID;
1151843e1988Sjohnlev 	pgcnt_t npgs;
1152843e1988Sjohnlev 	if (prot & PROT_WRITE)
1153843e1988Sjohnlev 		pt_bits |= PT_WRITABLE;
1154843e1988Sjohnlev #endif	/* __amd64 */
1155843e1988Sjohnlev 
1156843e1988Sjohnlev 	if ((err = as_setprot(&kas, (caddr_t)ldt, lsize, prot)) != 0)
1157843e1988Sjohnlev 		goto done;
1158843e1988Sjohnlev 
1159843e1988Sjohnlev #if defined(__amd64)
1160843e1988Sjohnlev 
1161843e1988Sjohnlev 	ASSERT(IS_P2ALIGNED(lsize, PAGESIZE));
1162843e1988Sjohnlev 	npgs = mmu_btop(lsize);
1163843e1988Sjohnlev 	while (npgs--) {
1164843e1988Sjohnlev 		if ((err = xen_kpm_page(hat_getpfnum(kas.a_hat, lva),
1165843e1988Sjohnlev 		    pt_bits)) != 0)
1166843e1988Sjohnlev 			break;
1167843e1988Sjohnlev 		lva += PAGESIZE;
1168843e1988Sjohnlev 	}
1169843e1988Sjohnlev #endif	/* __amd64 */
1170843e1988Sjohnlev 
1171843e1988Sjohnlev done:
1172843e1988Sjohnlev 	if (err) {
1173843e1988Sjohnlev 		cmn_err(CE_WARN, "xen_ldt_setprot(%p, %s) failed: error %d",
1174843e1988Sjohnlev 		    (void *)lva,
1175843e1988Sjohnlev 		    (prot & PROT_WRITE) ? "writable" : "read-only", err);
1176843e1988Sjohnlev 	}
1177843e1988Sjohnlev 
1178843e1988Sjohnlev 	return (err);
1179843e1988Sjohnlev }
1180*e4b86885SCheng Sean Ye 
1181*e4b86885SCheng Sean Ye int
1182*e4b86885SCheng Sean Ye xen_get_physinfo(xen_sysctl_physinfo_t *pi)
1183*e4b86885SCheng Sean Ye {
1184*e4b86885SCheng Sean Ye 	xen_sysctl_t op;
1185*e4b86885SCheng Sean Ye 	int ret;
1186*e4b86885SCheng Sean Ye 
1187*e4b86885SCheng Sean Ye 	bzero(&op, sizeof (op));
1188*e4b86885SCheng Sean Ye 	op.cmd = XEN_SYSCTL_physinfo;
1189*e4b86885SCheng Sean Ye 	op.interface_version = XEN_SYSCTL_INTERFACE_VERSION;
1190*e4b86885SCheng Sean Ye 
1191*e4b86885SCheng Sean Ye 	ret = HYPERVISOR_sysctl(&op);
1192*e4b86885SCheng Sean Ye 
1193*e4b86885SCheng Sean Ye 	if (ret != 0)
1194*e4b86885SCheng Sean Ye 		return (ret);
1195*e4b86885SCheng Sean Ye 
1196*e4b86885SCheng Sean Ye 	bcopy(&op.u.physinfo, pi, sizeof (op.u.physinfo));
1197*e4b86885SCheng Sean Ye 	return (0);
1198*e4b86885SCheng Sean Ye }
1199*e4b86885SCheng Sean Ye 
1200*e4b86885SCheng Sean Ye int
1201*e4b86885SCheng Sean Ye xen_get_mc_physcpuinfo(xen_mc_logical_cpu_t *log_cpus, uint_t *ncpus)
1202*e4b86885SCheng Sean Ye {
1203*e4b86885SCheng Sean Ye 	struct xen_mc_physcpuinfo cpi;
1204*e4b86885SCheng Sean Ye 
1205*e4b86885SCheng Sean Ye 	cpi.ncpus = *ncpus;
1206*e4b86885SCheng Sean Ye 	/*LINTED: constant in conditional context*/
1207*e4b86885SCheng Sean Ye 	set_xen_guest_handle(cpi.info, log_cpus);
1208*e4b86885SCheng Sean Ye 
1209*e4b86885SCheng Sean Ye 	if (HYPERVISOR_mca(XEN_MC_CMD_physcpuinfo, (xen_mc_arg_t *)&cpi) !=
1210*e4b86885SCheng Sean Ye 	    XEN_MC_HCALL_SUCCESS)
1211*e4b86885SCheng Sean Ye 		return (-1);
1212*e4b86885SCheng Sean Ye 
1213*e4b86885SCheng Sean Ye 	*ncpus = cpi.ncpus;
1214*e4b86885SCheng Sean Ye 	return (0);
1215*e4b86885SCheng Sean Ye }
1216*e4b86885SCheng Sean Ye 
/*
 * Emit str through xen_printf().
 *
 * str is passed as an argument to a fixed "%s" format rather than as the
 * format itself, so any '%' characters it contains are printed literally
 * instead of being interpreted as conversion specifications.
 */
void
print_panic(const char *str)
{
	xen_printf("%s", str);
}
1222*e4b86885SCheng Sean Ye 
1223*e4b86885SCheng Sean Ye /*
1224*e4b86885SCheng Sean Ye  * Interfaces to iterate over real cpu information, but only that info
1225*e4b86885SCheng Sean Ye  * which we choose to expose here.  These are of interest to dom0
1226*e4b86885SCheng Sean Ye  * only (and the backing hypercall should not work for domu).
1227*e4b86885SCheng Sean Ye  */
1228*e4b86885SCheng Sean Ye 
1229*e4b86885SCheng Sean Ye xen_mc_lcpu_cookie_t
1230*e4b86885SCheng Sean Ye xen_physcpu_next(xen_mc_lcpu_cookie_t cookie)
1231*e4b86885SCheng Sean Ye {
1232*e4b86885SCheng Sean Ye 	xen_mc_logical_cpu_t *xcp = (xen_mc_logical_cpu_t *)cookie;
1233*e4b86885SCheng Sean Ye 
1234*e4b86885SCheng Sean Ye 	if (!DOMAIN_IS_INITDOMAIN(xen_info))
1235*e4b86885SCheng Sean Ye 		return (NULL);
1236*e4b86885SCheng Sean Ye 
1237*e4b86885SCheng Sean Ye 	if (cookie == NULL)
1238*e4b86885SCheng Sean Ye 		return ((xen_mc_lcpu_cookie_t)xen_phys_cpus);
1239*e4b86885SCheng Sean Ye 
1240*e4b86885SCheng Sean Ye 	if (xcp == xen_phys_cpus + xen_phys_ncpus - 1)
1241*e4b86885SCheng Sean Ye 		return (NULL);
1242*e4b86885SCheng Sean Ye 	else
1243*e4b86885SCheng Sean Ye 		return ((xen_mc_lcpu_cookie_t)++xcp);
1244*e4b86885SCheng Sean Ye }
1245*e4b86885SCheng Sean Ye 
1246*e4b86885SCheng Sean Ye #define	COOKIE2XCP(c) ((xen_mc_logical_cpu_t *)(c))
1247*e4b86885SCheng Sean Ye 
1248*e4b86885SCheng Sean Ye const char *
1249*e4b86885SCheng Sean Ye xen_physcpu_vendorstr(xen_mc_lcpu_cookie_t cookie)
1250*e4b86885SCheng Sean Ye {
1251*e4b86885SCheng Sean Ye 	xen_mc_logical_cpu_t *xcp = COOKIE2XCP(cookie);
1252*e4b86885SCheng Sean Ye 
1253*e4b86885SCheng Sean Ye 	return ((const char *)&xcp->mc_vendorid[0]);
1254*e4b86885SCheng Sean Ye }
1255*e4b86885SCheng Sean Ye 
1256*e4b86885SCheng Sean Ye int
1257*e4b86885SCheng Sean Ye xen_physcpu_family(xen_mc_lcpu_cookie_t cookie)
1258*e4b86885SCheng Sean Ye {
1259*e4b86885SCheng Sean Ye 	return (COOKIE2XCP(cookie)->mc_family);
1260*e4b86885SCheng Sean Ye }
1261*e4b86885SCheng Sean Ye 
1262*e4b86885SCheng Sean Ye int
1263*e4b86885SCheng Sean Ye xen_physcpu_model(xen_mc_lcpu_cookie_t cookie)
1264*e4b86885SCheng Sean Ye {
1265*e4b86885SCheng Sean Ye 	return (COOKIE2XCP(cookie)->mc_model);
1266*e4b86885SCheng Sean Ye }
1267*e4b86885SCheng Sean Ye 
1268*e4b86885SCheng Sean Ye int
1269*e4b86885SCheng Sean Ye xen_physcpu_stepping(xen_mc_lcpu_cookie_t cookie)
1270*e4b86885SCheng Sean Ye {
1271*e4b86885SCheng Sean Ye 	return (COOKIE2XCP(cookie)->mc_step);
1272*e4b86885SCheng Sean Ye }
1273*e4b86885SCheng Sean Ye 
1274*e4b86885SCheng Sean Ye id_t
1275*e4b86885SCheng Sean Ye xen_physcpu_chipid(xen_mc_lcpu_cookie_t cookie)
1276*e4b86885SCheng Sean Ye {
1277*e4b86885SCheng Sean Ye 	return (COOKIE2XCP(cookie)->mc_chipid);
1278*e4b86885SCheng Sean Ye }
1279*e4b86885SCheng Sean Ye 
1280*e4b86885SCheng Sean Ye id_t
1281*e4b86885SCheng Sean Ye xen_physcpu_coreid(xen_mc_lcpu_cookie_t cookie)
1282*e4b86885SCheng Sean Ye {
1283*e4b86885SCheng Sean Ye 	return (COOKIE2XCP(cookie)->mc_coreid);
1284*e4b86885SCheng Sean Ye }
1285*e4b86885SCheng Sean Ye 
1286*e4b86885SCheng Sean Ye id_t
1287*e4b86885SCheng Sean Ye xen_physcpu_strandid(xen_mc_lcpu_cookie_t cookie)
1288*e4b86885SCheng Sean Ye {
1289*e4b86885SCheng Sean Ye 	return (COOKIE2XCP(cookie)->mc_threadid);
1290*e4b86885SCheng Sean Ye }
1291*e4b86885SCheng Sean Ye 
1292*e4b86885SCheng Sean Ye id_t
1293*e4b86885SCheng Sean Ye xen_physcpu_logical_id(xen_mc_lcpu_cookie_t cookie)
1294*e4b86885SCheng Sean Ye {
1295*e4b86885SCheng Sean Ye 	return (COOKIE2XCP(cookie)->mc_cpunr);
1296*e4b86885SCheng Sean Ye }
1297*e4b86885SCheng Sean Ye 
1298*e4b86885SCheng Sean Ye boolean_t
1299*e4b86885SCheng Sean Ye xen_physcpu_is_cmt(xen_mc_lcpu_cookie_t cookie)
1300*e4b86885SCheng Sean Ye {
1301*e4b86885SCheng Sean Ye 	return (COOKIE2XCP(cookie)->mc_nthreads > 1);
1302*e4b86885SCheng Sean Ye }
1303*e4b86885SCheng Sean Ye 
1304*e4b86885SCheng Sean Ye uint64_t
1305*e4b86885SCheng Sean Ye xen_physcpu_mcg_cap(xen_mc_lcpu_cookie_t cookie)
1306*e4b86885SCheng Sean Ye {
1307*e4b86885SCheng Sean Ye 	xen_mc_logical_cpu_t *xcp = COOKIE2XCP(cookie);
1308*e4b86885SCheng Sean Ye 
1309*e4b86885SCheng Sean Ye 	/*
1310*e4b86885SCheng Sean Ye 	 * Need to #define the indices, or search through the array.
1311*e4b86885SCheng Sean Ye 	 */
1312*e4b86885SCheng Sean Ye 	return (xcp->mc_msrvalues[0].value);
1313*e4b86885SCheng Sean Ye }
1314