xref: /titanic_52/usr/src/uts/i86xpv/os/mp_xen.c (revision 0ed5c46e82c989cfa9726d9dae452e3d24ef83be)
1843e1988Sjohnlev /*
2843e1988Sjohnlev  * CDDL HEADER START
3843e1988Sjohnlev  *
4843e1988Sjohnlev  * The contents of this file are subject to the terms of the
5843e1988Sjohnlev  * Common Development and Distribution License (the "License").
6843e1988Sjohnlev  * You may not use this file except in compliance with the License.
7843e1988Sjohnlev  *
8843e1988Sjohnlev  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9843e1988Sjohnlev  * or http://www.opensolaris.org/os/licensing.
10843e1988Sjohnlev  * See the License for the specific language governing permissions
11843e1988Sjohnlev  * and limitations under the License.
12843e1988Sjohnlev  *
13843e1988Sjohnlev  * When distributing Covered Code, include this CDDL HEADER in each
14843e1988Sjohnlev  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15843e1988Sjohnlev  * If applicable, add the following below this CDDL HEADER, with the
16843e1988Sjohnlev  * fields enclosed by brackets "[]" replaced with your own identifying
17843e1988Sjohnlev  * information: Portions Copyright [yyyy] [name of copyright owner]
18843e1988Sjohnlev  *
19843e1988Sjohnlev  * CDDL HEADER END
20843e1988Sjohnlev  */
21843e1988Sjohnlev 
22843e1988Sjohnlev /*
23f34a7178SJoe Bonasera  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
24843e1988Sjohnlev  * Use is subject to license terms.
25843e1988Sjohnlev  */
26843e1988Sjohnlev 
271d03c31eSjohnlev /*
281d03c31eSjohnlev  * Virtual CPU management.
291d03c31eSjohnlev  *
301d03c31eSjohnlev  * VCPUs can be controlled in one of two ways: through the domain itself
311d03c31eSjohnlev  * (psradm, p_online(), etc.), or via changes in xenstore (vcpu_config()).
321d03c31eSjohnlev  * Unfortunately, the terminology is used in different ways; they work out as
331d03c31eSjohnlev  * follows:
341d03c31eSjohnlev  *
351d03c31eSjohnlev  * P_ONLINE: the VCPU is up and running, taking interrupts and running threads
361d03c31eSjohnlev  *
371d03c31eSjohnlev  * P_OFFLINE: the VCPU is up and running, but quiesced (i.e. blocked in the
381d03c31eSjohnlev  * hypervisor on the idle thread).  It must be up since a downed VCPU cannot
391d03c31eSjohnlev  * receive interrupts, and we require this for offline CPUs in Solaris.
401d03c31eSjohnlev  *
411d03c31eSjohnlev  * P_POWEROFF: the VCPU is down (we never called xen_vcpu_up(), or called
421d03c31eSjohnlev  * xen_vcpu_down() for it).  It can't take interrupts or run anything, though
431d03c31eSjohnlev  * if it has run previously, its software state (cpu_t, machcpu structures, IPI
441d03c31eSjohnlev  * event channels, etc.) will still exist.
451d03c31eSjohnlev  *
461d03c31eSjohnlev  * The hypervisor has two notions of CPU states as represented in the store:
471d03c31eSjohnlev  *
481d03c31eSjohnlev  * "offline": the VCPU is down.  Corresponds to P_POWEROFF.
491d03c31eSjohnlev  *
501d03c31eSjohnlev  * "online": the VCPU is running.  Corresponds to a CPU state other than
511d03c31eSjohnlev  * P_POWEROFF.
521d03c31eSjohnlev  *
531d03c31eSjohnlev  * Currently, only a notification via xenstore can bring a CPU into a
541d03c31eSjohnlev  * P_POWEROFF state, and only the domain can change between P_ONLINE, P_NOINTR,
551d03c31eSjohnlev  * P_OFFLINE, etc.  We need to be careful to treat xenstore notifications
561d03c31eSjohnlev  * idempotently, as we'll get 'duplicate' entries when we resume a domain.
571d03c31eSjohnlev  *
581d03c31eSjohnlev  * Note that the xenstore configuration is strictly advisory, in that a domain
591d03c31eSjohnlev  * could choose to ignore it and still power up a VCPU that the store marks
601d03c31eSjohnlev  * offline. To play nice, we don't allow that. Thus, any attempt to power a
611d03c31eSjohnlev  * CPU on or off from within Solaris fails with ENOTSUP.
621d03c31eSjohnlev  *
631d03c31eSjohnlev  * Powering off a VCPU and suspending the domain use similar code. The
641d03c31eSjohnlev  * difficulty here is that we must ensure that each VCPU is in a stable
651d03c31eSjohnlev  * state: it must have a saved PCB, and not be responding to interrupts
661d03c31eSjohnlev  * (since we are just about to remove its ability to run on a real CPU,
671d03c31eSjohnlev  * possibly forever).  However, an offline CPU in Solaris can take
681d03c31eSjohnlev  * cross-call interrupts, as mentioned, so we must go through a
691d03c31eSjohnlev  * two-stage process.  First, we use the standard Solaris pause_cpus().
701d03c31eSjohnlev  * This ensures that all CPUs are either in mach_cpu_pause() or
711d03c31eSjohnlev  * mach_cpu_idle(), and nothing will cross-call them.
721d03c31eSjohnlev  *
731d03c31eSjohnlev  * Powered-off CPUs are already safe, as we own the cpu_lock needed to
741d03c31eSjohnlev  * bring them back up, and they are in state CPU_PHASE_POWERED_OFF.
751d03c31eSjohnlev  *
761d03c31eSjohnlev  * Running CPUs are spinning in mach_cpu_pause() waiting for either
771d03c31eSjohnlev  * PAUSE_IDLE or CPU_PHASE_WAIT_SAFE.
781d03c31eSjohnlev  *
791d03c31eSjohnlev  * Offline CPUs are either running the idle thread and periodically
801d03c31eSjohnlev  * checking for CPU_PHASE_WAIT_SAFE, or blocked in the hypervisor.
811d03c31eSjohnlev  *
821d03c31eSjohnlev  * Thus, we set CPU_PHASE_WAIT_SAFE for every powered-on CPU, and poke
831d03c31eSjohnlev  * them to make sure they're not blocked[1]. When every CPU has
841d03c31eSjohnlev  * responded by reaching a safe state and setting CPU_PHASE_SAFE, we
851d03c31eSjohnlev  * know we can suspend or power off a CPU without problems.
861d03c31eSjohnlev  *
871d03c31eSjohnlev  * [1] note that we have to repeatedly poke offline CPUs: it's the only
881d03c31eSjohnlev  * way to ensure that the CPU doesn't miss the state change before
891d03c31eSjohnlev  * dropping into HYPERVISOR_block().
901d03c31eSjohnlev  */
911d03c31eSjohnlev 
92843e1988Sjohnlev #include <sys/types.h>
93843e1988Sjohnlev #include <sys/systm.h>
94843e1988Sjohnlev #include <sys/param.h>
95843e1988Sjohnlev #include <sys/taskq.h>
96843e1988Sjohnlev #include <sys/cmn_err.h>
97843e1988Sjohnlev #include <sys/archsystm.h>
98843e1988Sjohnlev #include <sys/machsystm.h>
99843e1988Sjohnlev #include <sys/segments.h>
100843e1988Sjohnlev #include <sys/cpuvar.h>
101843e1988Sjohnlev #include <sys/x86_archext.h>
102843e1988Sjohnlev #include <sys/controlregs.h>
103843e1988Sjohnlev #include <sys/hypervisor.h>
104843e1988Sjohnlev #include <sys/xpv_panic.h>
1051d03c31eSjohnlev #include <sys/mman.h>
1061d03c31eSjohnlev #include <sys/psw.h>
107843e1988Sjohnlev #include <sys/cpu.h>
1081d03c31eSjohnlev #include <sys/sunddi.h>
1091d03c31eSjohnlev #include <util/sscanf.h>
1101d03c31eSjohnlev #include <vm/hat_i86.h>
1111d03c31eSjohnlev #include <vm/hat.h>
1121d03c31eSjohnlev #include <vm/as.h>
113843e1988Sjohnlev 
114843e1988Sjohnlev #include <xen/public/io/xs_wire.h>
1151d03c31eSjohnlev #include <xen/sys/xenbus_impl.h>
1161d03c31eSjohnlev #include <xen/public/vcpu.h>
117843e1988Sjohnlev 
118f34a7178SJoe Bonasera extern cpuset_t cpu_ready_set;
119f34a7178SJoe Bonasera 
1201d03c31eSjohnlev #define	CPU_PHASE_NONE 0
1211d03c31eSjohnlev #define	CPU_PHASE_WAIT_SAFE 1
1221d03c31eSjohnlev #define	CPU_PHASE_SAFE 2
1231d03c31eSjohnlev #define	CPU_PHASE_POWERED_OFF 3
1241d03c31eSjohnlev 
1251d03c31eSjohnlev /*
1261d03c31eSjohnlev  * During barrier entry, we may poke CPUs at most 256 times a
1271d03c31eSjohnlev  * second.
1281d03c31eSjohnlev  */
1291d03c31eSjohnlev #define	POKE_TIMEOUT (NANOSEC / 256)
130843e1988Sjohnlev 
131843e1988Sjohnlev static taskq_t *cpu_config_tq;
1321d03c31eSjohnlev static int cpu_phase[NCPU];
1331d03c31eSjohnlev 
134843e1988Sjohnlev static void vcpu_config_event(struct xenbus_watch *, const char **, uint_t);
135843e1988Sjohnlev static int xen_vcpu_initialize(processorid_t, vcpu_guest_context_t *);
136843e1988Sjohnlev 
137843e1988Sjohnlev /*
138b9bc7f78Ssmaybe  * Return whether or not the vcpu is actually running on a pcpu
139b9bc7f78Ssmaybe  */
140b9bc7f78Ssmaybe int
141b9bc7f78Ssmaybe vcpu_on_pcpu(processorid_t cpu)
142b9bc7f78Ssmaybe {
143b9bc7f78Ssmaybe 	struct vcpu_runstate_info runstate;
144b9bc7f78Ssmaybe 	int	ret = VCPU_STATE_UNKNOWN;
145b9bc7f78Ssmaybe 
146b9bc7f78Ssmaybe 	ASSERT(cpu < NCPU);
147b9bc7f78Ssmaybe 	/*
148b9bc7f78Ssmaybe 	 * Don't bother with the hypercall if we are asking about ourselves
149b9bc7f78Ssmaybe 	 */
150b9bc7f78Ssmaybe 	if (cpu == CPU->cpu_id)
151b9bc7f78Ssmaybe 		return (VCPU_ON_PCPU);
152b9bc7f78Ssmaybe 	if (HYPERVISOR_vcpu_op(VCPUOP_get_runstate_info, cpu, &runstate) != 0)
153b9bc7f78Ssmaybe 		goto out;
154b9bc7f78Ssmaybe 
155b9bc7f78Ssmaybe 	switch (runstate.state) {
156b9bc7f78Ssmaybe 	case RUNSTATE_running:
157b9bc7f78Ssmaybe 		ret = VCPU_ON_PCPU;
158b9bc7f78Ssmaybe 		break;
159b9bc7f78Ssmaybe 
160b9bc7f78Ssmaybe 	case RUNSTATE_runnable:
161b9bc7f78Ssmaybe 	case RUNSTATE_offline:
162b9bc7f78Ssmaybe 	case RUNSTATE_blocked:
163b9bc7f78Ssmaybe 		ret = VCPU_NOT_ON_PCPU;
164b9bc7f78Ssmaybe 		break;
165b9bc7f78Ssmaybe 
166b9bc7f78Ssmaybe 	default:
167b9bc7f78Ssmaybe 		break;
168b9bc7f78Ssmaybe 	}
169b9bc7f78Ssmaybe 
170b9bc7f78Ssmaybe out:
171b9bc7f78Ssmaybe 	return (ret);
172b9bc7f78Ssmaybe }
173b9bc7f78Ssmaybe 
174b9bc7f78Ssmaybe /*
175843e1988Sjohnlev  * These routines allocate any global state that might be needed
176843e1988Sjohnlev  * while starting cpus.  For virtual cpus, there is no such state.
177843e1988Sjohnlev  */
178843e1988Sjohnlev int
179843e1988Sjohnlev mach_cpucontext_init(void)
180843e1988Sjohnlev {
181843e1988Sjohnlev 	return (0);
182843e1988Sjohnlev }
183843e1988Sjohnlev 
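/*
 * Callback invoked when the xenstore connection comes up: register a
 * watch on the "cpu" subtree so that vcpu_config_event() runs whenever
 * a virtual CPU's configuration changes.
 */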
184843e1988Sjohnlev void
185843e1988Sjohnlev do_cpu_config_watch(int state)
186843e1988Sjohnlev {
187843e1988Sjohnlev 	static struct xenbus_watch cpu_config_watch;
188843e1988Sjohnlev 
189843e1988Sjohnlev 	if (state != XENSTORE_UP)
190843e1988Sjohnlev 		return;
191843e1988Sjohnlev 	cpu_config_watch.node = "cpu";
192843e1988Sjohnlev 	cpu_config_watch.callback = vcpu_config_event;
193843e1988Sjohnlev 	if (register_xenbus_watch(&cpu_config_watch)) {
194843e1988Sjohnlev 		taskq_destroy(cpu_config_tq);
195843e1988Sjohnlev 		cmn_err(CE_WARN, "do_cpu_config_watch: "
196843e1988Sjohnlev 		    "failed to set vcpu config watch");
197843e1988Sjohnlev 	}
198843e1988Sjohnlev 
199843e1988Sjohnlev }
200843e1988Sjohnlev 
201843e1988Sjohnlev /*
202843e1988Sjohnlev  * This routine is called after all the "normal" MP startup has
203843e1988Sjohnlev  * been done; it is a good place to start watching xenstore for
204843e1988Sjohnlev  * virtual cpu hot-plug events.
205843e1988Sjohnlev  */
206843e1988Sjohnlev void
207843e1988Sjohnlev mach_cpucontext_fini(void)
208843e1988Sjohnlev {
209843e1988Sjohnlev 
210843e1988Sjohnlev 	cpu_config_tq = taskq_create("vcpu config taskq", 1,
211843e1988Sjohnlev 	    maxclsyspri - 1, 1, 1, TASKQ_PREPOPULATE);
212843e1988Sjohnlev 
213843e1988Sjohnlev 	(void) xs_register_xenbus_callback(do_cpu_config_watch);
214843e1988Sjohnlev }
215843e1988Sjohnlev 
216843e1988Sjohnlev /*
217843e1988Sjohnlev  * Fill in the remaining CPU context and initialize it.
218843e1988Sjohnlev  */
219843e1988Sjohnlev static int
220843e1988Sjohnlev mp_set_cpu_context(vcpu_guest_context_t *vgc, cpu_t *cp)
221843e1988Sjohnlev {
222843e1988Sjohnlev 	uint_t vec, iopl;
223843e1988Sjohnlev 
224843e1988Sjohnlev 	vgc->flags = VGCF_IN_KERNEL;
225843e1988Sjohnlev 
226843e1988Sjohnlev 	/*
227843e1988Sjohnlev 	 * fpu_ctx we leave as zero; on first fault we'll store
228843e1988Sjohnlev 	 * sse_initial into it anyway.
229843e1988Sjohnlev 	 */
230843e1988Sjohnlev 
231843e1988Sjohnlev #if defined(__amd64)
232843e1988Sjohnlev 	vgc->user_regs.cs = KCS_SEL | SEL_KPL;	/* force to ring 3 */
233843e1988Sjohnlev #else
234843e1988Sjohnlev 	vgc->user_regs.cs = KCS_SEL;
235843e1988Sjohnlev #endif
236843e1988Sjohnlev 	vgc->user_regs.ds = KDS_SEL;
237843e1988Sjohnlev 	vgc->user_regs.es = KDS_SEL;
238843e1988Sjohnlev 	vgc->user_regs.ss = KDS_SEL;
239843e1988Sjohnlev 	vgc->kernel_ss = KDS_SEL;
240843e1988Sjohnlev 
241843e1988Sjohnlev 	/*
242843e1988Sjohnlev 	 * Allow I/O privilege level for Dom0 kernel.
243843e1988Sjohnlev 	 */
244843e1988Sjohnlev 	if (DOMAIN_IS_INITDOMAIN(xen_info))
245843e1988Sjohnlev 		iopl = (PS_IOPL & 0x1000); /* ring 1 */
246843e1988Sjohnlev 	else
247843e1988Sjohnlev 		iopl = 0;
248843e1988Sjohnlev 
249843e1988Sjohnlev #if defined(__amd64)
250843e1988Sjohnlev 	vgc->user_regs.fs = 0;
251843e1988Sjohnlev 	vgc->user_regs.gs = 0;
252843e1988Sjohnlev 	vgc->user_regs.rflags = F_OFF | iopl;
253843e1988Sjohnlev #elif defined(__i386)
254843e1988Sjohnlev 	vgc->user_regs.fs = KFS_SEL;
255843e1988Sjohnlev 	vgc->user_regs.gs = KGS_SEL;
256843e1988Sjohnlev 	vgc->user_regs.eflags = F_OFF | iopl;
257843e1988Sjohnlev 	vgc->event_callback_cs = vgc->user_regs.cs;
258843e1988Sjohnlev 	vgc->failsafe_callback_cs = vgc->user_regs.cs;
259843e1988Sjohnlev #endif
260843e1988Sjohnlev 
261843e1988Sjohnlev 	/*
262843e1988Sjohnlev 	 * Initialize the trap_info_t from the IDT
263843e1988Sjohnlev 	 */
264843e1988Sjohnlev #if !defined(__lint)
265843e1988Sjohnlev 	ASSERT(NIDT == sizeof (vgc->trap_ctxt) / sizeof (vgc->trap_ctxt[0]));
266843e1988Sjohnlev #endif
267843e1988Sjohnlev 	for (vec = 0; vec < NIDT; vec++) {
268843e1988Sjohnlev 		trap_info_t *ti = &vgc->trap_ctxt[vec];
269843e1988Sjohnlev 
270843e1988Sjohnlev 		if (xen_idt_to_trap_info(vec,
271843e1988Sjohnlev 		    &cp->cpu_m.mcpu_idt[vec], ti) == 0) {
272843e1988Sjohnlev 			ti->cs = KCS_SEL;
273843e1988Sjohnlev 			ti->vector = vec;
274843e1988Sjohnlev 		}
275843e1988Sjohnlev 	}
276843e1988Sjohnlev 
277843e1988Sjohnlev 	/*
278843e1988Sjohnlev 	 * No LDT
279843e1988Sjohnlev 	 */
280843e1988Sjohnlev 
281843e1988Sjohnlev 	/*
282843e1988Sjohnlev 	 * (We assert in various places that the GDT is (a) aligned on a
283843e1988Sjohnlev 	 * page boundary and (b) one page long, so this really should fit.)
284843e1988Sjohnlev 	 */
285843e1988Sjohnlev #ifdef CRASH_XEN
286843e1988Sjohnlev 	vgc->gdt_frames[0] = pa_to_ma(mmu_btop(cp->cpu_m.mcpu_gdtpa));
287843e1988Sjohnlev #else
288843e1988Sjohnlev 	vgc->gdt_frames[0] = pfn_to_mfn(mmu_btop(cp->cpu_m.mcpu_gdtpa));
289843e1988Sjohnlev #endif
290843e1988Sjohnlev 	vgc->gdt_ents = NGDT;
291843e1988Sjohnlev 
292843e1988Sjohnlev 	vgc->ctrlreg[0] = CR0_ENABLE_FPU_FLAGS(getcr0());
293843e1988Sjohnlev 
294843e1988Sjohnlev #if defined(__i386)
295843e1988Sjohnlev 	if (mmu.pae_hat)
296843e1988Sjohnlev 		vgc->ctrlreg[3] =
297843e1988Sjohnlev 		    xen_pfn_to_cr3(pfn_to_mfn(kas.a_hat->hat_htable->ht_pfn));
298843e1988Sjohnlev 	else
299843e1988Sjohnlev #endif
300843e1988Sjohnlev 		vgc->ctrlreg[3] =
301843e1988Sjohnlev 		    pa_to_ma(mmu_ptob(kas.a_hat->hat_htable->ht_pfn));
302843e1988Sjohnlev 
303843e1988Sjohnlev 	vgc->ctrlreg[4] = getcr4();
304843e1988Sjohnlev 
305843e1988Sjohnlev 	vgc->event_callback_eip = (uintptr_t)xen_callback;
306843e1988Sjohnlev 	vgc->failsafe_callback_eip = (uintptr_t)xen_failsafe_callback;
307843e1988Sjohnlev 	vgc->flags |= VGCF_failsafe_disables_events;
308843e1988Sjohnlev 
309843e1988Sjohnlev #if defined(__amd64)
310843e1988Sjohnlev 	/*
311843e1988Sjohnlev 	 * XXPV should this be moved to init_cpu_syscall?
312843e1988Sjohnlev 	 */
313843e1988Sjohnlev 	vgc->syscall_callback_eip = (uintptr_t)sys_syscall;
314843e1988Sjohnlev 	vgc->flags |= VGCF_syscall_disables_events;
315843e1988Sjohnlev 
316843e1988Sjohnlev 	ASSERT(vgc->user_regs.gs == 0);
317843e1988Sjohnlev 	vgc->gs_base_kernel = (uintptr_t)cp;
318843e1988Sjohnlev #endif
319843e1988Sjohnlev 
320843e1988Sjohnlev 	return (xen_vcpu_initialize(cp->cpu_id, vgc));
321843e1988Sjohnlev }
322843e1988Sjohnlev 
323843e1988Sjohnlev /*
324843e1988Sjohnlev  * Create a guest virtual cpu context so that the virtual cpu
325843e1988Sjohnlev  * springs into life in the domain just about to call mp_startup()
326843e1988Sjohnlev  *
327843e1988Sjohnlev  * Virtual CPUs must be initialized once in the lifetime of the domain;
328843e1988Sjohnlev  * after that subsequent attempts to start them will fail with X_EEXIST.
329843e1988Sjohnlev  *
330843e1988Sjohnlev  * Thus 'alloc' -really- creates and initializes the virtual
331843e1988Sjohnlev  * CPU context just once. Once the initialization succeeds, we never
332843e1988Sjohnlev  * free it, nor the regular cpu_t to which it refers.
333843e1988Sjohnlev  */
334843e1988Sjohnlev void *
335843e1988Sjohnlev mach_cpucontext_alloc(struct cpu *cp)
336843e1988Sjohnlev {
337843e1988Sjohnlev 	kthread_t *tp = cp->cpu_thread;
338843e1988Sjohnlev 	vcpu_guest_context_t vgc;
339843e1988Sjohnlev 
340843e1988Sjohnlev 	int err = 1;
341843e1988Sjohnlev 
342843e1988Sjohnlev 	/*
343843e1988Sjohnlev 	 * First, augment the incoming cpu structure
344843e1988Sjohnlev 	 * - vcpu pointer reference
345843e1988Sjohnlev 	 * - pending event storage area
346843e1988Sjohnlev 	 * - physical address of GDT
347843e1988Sjohnlev 	 */
348843e1988Sjohnlev 	cp->cpu_m.mcpu_vcpu_info =
349843e1988Sjohnlev 	    &HYPERVISOR_shared_info->vcpu_info[cp->cpu_id];
350843e1988Sjohnlev 	cp->cpu_m.mcpu_evt_pend = kmem_zalloc(
351843e1988Sjohnlev 	    sizeof (struct xen_evt_data), KM_SLEEP);
352843e1988Sjohnlev 	cp->cpu_m.mcpu_gdtpa =
353843e1988Sjohnlev 	    mmu_ptob(hat_getpfnum(kas.a_hat, (caddr_t)cp->cpu_gdt));
354843e1988Sjohnlev 
355843e1988Sjohnlev 	if ((err = xen_gdt_setprot(cp, PROT_READ)) != 0)
356843e1988Sjohnlev 		goto done;
357843e1988Sjohnlev 
358843e1988Sjohnlev 	/*
359843e1988Sjohnlev 	 * Now set up the vcpu context so that we can start this vcpu
360843e1988Sjohnlev 	 * in the kernel at tp->t_pc (mp_startup).  Note that the
361843e1988Sjohnlev 	 * thread will thread_exit() shortly after performing the
362843e1988Sjohnlev 	 * initialization; in particular, we will *never* take a
363843e1988Sjohnlev 	 * privilege transition on this thread.
364843e1988Sjohnlev 	 */
365843e1988Sjohnlev 
366843e1988Sjohnlev 	bzero(&vgc, sizeof (vgc));
367843e1988Sjohnlev 
368843e1988Sjohnlev #ifdef __amd64
369843e1988Sjohnlev 	vgc.user_regs.rip = tp->t_pc;
370843e1988Sjohnlev 	vgc.user_regs.rsp = tp->t_sp;
371843e1988Sjohnlev 	vgc.user_regs.rbp = tp->t_sp - 2 * sizeof (greg_t);
372843e1988Sjohnlev #else
373843e1988Sjohnlev 	vgc.user_regs.eip = tp->t_pc;
374843e1988Sjohnlev 	vgc.user_regs.esp = tp->t_sp;
375843e1988Sjohnlev 	vgc.user_regs.ebp = tp->t_sp - 2 * sizeof (greg_t);
376843e1988Sjohnlev #endif
377843e1988Sjohnlev 	/*
378843e1988Sjohnlev 	 * XXPV	Fix resume, if Russ didn't already fix it.
379843e1988Sjohnlev 	 *
380843e1988Sjohnlev 	 * Note that resume unconditionally puts t->t_stk + sizeof (regs)
381843e1988Sjohnlev 	 * into kernel_sp via HYPERVISOR_stack_switch. This anticipates
382843e1988Sjohnlev 	 * that only lwps take traps that switch to the kernel stack;
383843e1988Sjohnlev 	 * part of creating an lwp adjusts the stack by subtracting
384843e1988Sjohnlev 	 * sizeof (struct regs) off t_stk.
385843e1988Sjohnlev 	 *
386843e1988Sjohnlev 	 * The more interesting question is, why do we do all the work
387843e1988Sjohnlev 	 * of a fully fledged lwp for a plain thread?  In particular
388843e1988Sjohnlev 	 * we don't have to call HYPERVISOR_stack_switch for lwp-less threads
389843e1988Sjohnlev 	 * or futz with the LDT.  This should probably all be done with
390843e1988Sjohnlev 	 * an lwp context operator to keep pure thread context switch fast.
391843e1988Sjohnlev 	 */
392843e1988Sjohnlev 	vgc.kernel_sp = (ulong_t)tp->t_stk;
393843e1988Sjohnlev 
394843e1988Sjohnlev 	err = mp_set_cpu_context(&vgc, cp);
395843e1988Sjohnlev 
396843e1988Sjohnlev done:
397843e1988Sjohnlev 	if (err) {
398843e1988Sjohnlev 		mach_cpucontext_free(cp, NULL, err);
399843e1988Sjohnlev 		return (NULL);
400843e1988Sjohnlev 	}
401843e1988Sjohnlev 	return (cp);
402843e1988Sjohnlev }
403843e1988Sjohnlev 
404843e1988Sjohnlev /*
405843e1988Sjohnlev  * By the time we are called, either we have successfully started
406843e1988Sjohnlev  * the cpu or our attempt to start it has failed.
407843e1988Sjohnlev  */
408843e1988Sjohnlev 
409843e1988Sjohnlev /*ARGSUSED*/
410843e1988Sjohnlev void
411843e1988Sjohnlev mach_cpucontext_free(struct cpu *cp, void *arg, int err)
412843e1988Sjohnlev {
413843e1988Sjohnlev 	switch (err) {
414843e1988Sjohnlev 	case 0:
415843e1988Sjohnlev 		break;
416843e1988Sjohnlev 	case ETIMEDOUT:
417843e1988Sjohnlev 		/*
418843e1988Sjohnlev 		 * The vcpu context is loaded into the hypervisor, and
419843e1988Sjohnlev 		 * we've tried to start it, but the vcpu has not been set
420843e1988Sjohnlev 		 * running yet, for whatever reason.  We arrange to -not-
421843e1988Sjohnlev 		 * free any data structures it may be referencing.  In
422843e1988Sjohnlev 		 * particular, we've already told the hypervisor about
423843e1988Sjohnlev 		 * the GDT, and so we can't map it read-write again.
424843e1988Sjohnlev 		 */
425843e1988Sjohnlev 		break;
426843e1988Sjohnlev 	default:
427843e1988Sjohnlev 		(void) xen_gdt_setprot(cp, PROT_READ | PROT_WRITE);
428843e1988Sjohnlev 		kmem_free(cp->cpu_m.mcpu_evt_pend,
429843e1988Sjohnlev 		    sizeof (struct xen_evt_data));
430843e1988Sjohnlev 		break;
431843e1988Sjohnlev 	}
432843e1988Sjohnlev }
433843e1988Sjohnlev 
434843e1988Sjohnlev /*
435843e1988Sjohnlev  * Reset this CPU's context.  Clear out any pending evtchn data, since event
436843e1988Sjohnlev  * channel numbers will all change when we resume.
437843e1988Sjohnlev  */
438843e1988Sjohnlev void
439843e1988Sjohnlev mach_cpucontext_reset(cpu_t *cp)
440843e1988Sjohnlev {
441843e1988Sjohnlev 	bzero(cp->cpu_m.mcpu_evt_pend, sizeof (struct xen_evt_data));
442843e1988Sjohnlev 	/* mcpu_intr_pending ? */
443843e1988Sjohnlev }
444843e1988Sjohnlev 
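/*
 * Copy the resume context that setjmp() saved in a thread's PCB into
 * the user_regs portion of a vcpu_guest_context_t.
 */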
445843e1988Sjohnlev static void
446843e1988Sjohnlev pcb_to_user_regs(label_t *pcb, vcpu_guest_context_t *vgc)
447843e1988Sjohnlev {
448843e1988Sjohnlev #ifdef __amd64
449843e1988Sjohnlev 	vgc->user_regs.rip = pcb->val[REG_LABEL_PC];
450843e1988Sjohnlev 	vgc->user_regs.rsp = pcb->val[REG_LABEL_SP];
451843e1988Sjohnlev 	vgc->user_regs.rbp = pcb->val[REG_LABEL_BP];
452843e1988Sjohnlev 	vgc->user_regs.rbx = pcb->val[REG_LABEL_RBX];
453843e1988Sjohnlev 	vgc->user_regs.r12 = pcb->val[REG_LABEL_R12];
454843e1988Sjohnlev 	vgc->user_regs.r13 = pcb->val[REG_LABEL_R13];
455843e1988Sjohnlev 	vgc->user_regs.r14 = pcb->val[REG_LABEL_R14];
456843e1988Sjohnlev 	vgc->user_regs.r15 = pcb->val[REG_LABEL_R15];
457843e1988Sjohnlev #else /* __amd64 */
458843e1988Sjohnlev 	vgc->user_regs.eip = pcb->val[REG_LABEL_PC];
459843e1988Sjohnlev 	vgc->user_regs.esp = pcb->val[REG_LABEL_SP];
460843e1988Sjohnlev 	vgc->user_regs.ebp = pcb->val[REG_LABEL_BP];
461843e1988Sjohnlev 	vgc->user_regs.ebx = pcb->val[REG_LABEL_EBX];
462843e1988Sjohnlev 	vgc->user_regs.esi = pcb->val[REG_LABEL_ESI];
463843e1988Sjohnlev 	vgc->user_regs.edi = pcb->val[REG_LABEL_EDI];
464843e1988Sjohnlev #endif /* __amd64 */
465843e1988Sjohnlev }
466843e1988Sjohnlev 
467843e1988Sjohnlev /*
4681d03c31eSjohnlev  * Restore the context of a CPU during resume.  This context is always
4691d03c31eSjohnlev  * inside enter_safe_phase(), below.
470843e1988Sjohnlev  */
471843e1988Sjohnlev void
472843e1988Sjohnlev mach_cpucontext_restore(cpu_t *cp)
473843e1988Sjohnlev {
474843e1988Sjohnlev 	vcpu_guest_context_t vgc;
475843e1988Sjohnlev 	int err;
476843e1988Sjohnlev 
477843e1988Sjohnlev 	ASSERT(cp->cpu_thread == cp->cpu_pause_thread ||
478843e1988Sjohnlev 	    cp->cpu_thread == cp->cpu_idle_thread);
479843e1988Sjohnlev 
480843e1988Sjohnlev 	bzero(&vgc, sizeof (vgc));
481843e1988Sjohnlev 
482843e1988Sjohnlev 	pcb_to_user_regs(&cp->cpu_thread->t_pcb, &vgc);
483843e1988Sjohnlev 
484843e1988Sjohnlev 	/*
485843e1988Sjohnlev 	 * We're emulating a longjmp() here: in particular, we need to bump the
486843e1988Sjohnlev 	 * stack pointer to account for the pop of xIP that returning from
487843e1988Sjohnlev 	 * longjmp() normally would do, and set the return value in xAX to 1.
488843e1988Sjohnlev 	 */
489843e1988Sjohnlev #ifdef __amd64
490843e1988Sjohnlev 	vgc.user_regs.rax = 1;
491843e1988Sjohnlev 	vgc.user_regs.rsp += sizeof (ulong_t);
492843e1988Sjohnlev #else
493843e1988Sjohnlev 	vgc.user_regs.eax = 1;
494843e1988Sjohnlev 	vgc.user_regs.esp += sizeof (ulong_t);
495843e1988Sjohnlev #endif
496843e1988Sjohnlev 
497843e1988Sjohnlev 	vgc.kernel_sp = cp->cpu_thread->t_sp;
498843e1988Sjohnlev 
499843e1988Sjohnlev 	err = mp_set_cpu_context(&vgc, cp);
500843e1988Sjohnlev 
501843e1988Sjohnlev 	ASSERT(err == 0);
502843e1988Sjohnlev }
503843e1988Sjohnlev 
5041d03c31eSjohnlev /*
5051d03c31eSjohnlev  * Reach a point at which the CPU can be safely powered-off or
5061d03c31eSjohnlev  * suspended.  Nothing can wake this CPU out of the loop.
5071d03c31eSjohnlev  */
5081d03c31eSjohnlev static void
5091d03c31eSjohnlev enter_safe_phase(void)
5101d03c31eSjohnlev {
5111d03c31eSjohnlev 	ulong_t flags = intr_clear();
5121d03c31eSjohnlev 
5131d03c31eSjohnlev 	if (setjmp(&curthread->t_pcb) == 0) {
5141d03c31eSjohnlev 		cpu_phase[CPU->cpu_id] = CPU_PHASE_SAFE;
5151d03c31eSjohnlev 		while (cpu_phase[CPU->cpu_id] == CPU_PHASE_SAFE)
5161d03c31eSjohnlev 			SMT_PAUSE();
5171d03c31eSjohnlev 	}
5181d03c31eSjohnlev 
5191d03c31eSjohnlev 	ASSERT(!interrupts_enabled());
5201d03c31eSjohnlev 
5211d03c31eSjohnlev 	intr_restore(flags);
5221d03c31eSjohnlev }
5231d03c31eSjohnlev 
5241d03c31eSjohnlev /*
5251d03c31eSjohnlev  * Offline CPUs run this code even under a pause_cpus(), so we must
5261d03c31eSjohnlev  * check if we need to enter the safe phase.
5271d03c31eSjohnlev  */
528843e1988Sjohnlev void
529843e1988Sjohnlev mach_cpu_idle(void)
530843e1988Sjohnlev {
531843e1988Sjohnlev 	if (IN_XPV_PANIC()) {
532843e1988Sjohnlev 		xpv_panic_halt();
533843e1988Sjohnlev 	} else  {
534843e1988Sjohnlev 		(void) HYPERVISOR_block();
5351d03c31eSjohnlev 		if (cpu_phase[CPU->cpu_id] == CPU_PHASE_WAIT_SAFE)
5361d03c31eSjohnlev 			enter_safe_phase();
5371d03c31eSjohnlev 	}
5381d03c31eSjohnlev }
5391d03c31eSjohnlev 
5401d03c31eSjohnlev /*
5411d03c31eSjohnlev  * Spin until either start_cpus() wakes us up, or we get a request to
5421d03c31eSjohnlev  * enter the safe phase (followed by a later start_cpus()).
5431d03c31eSjohnlev  */
5441d03c31eSjohnlev void
5451d03c31eSjohnlev mach_cpu_pause(volatile char *safe)
5461d03c31eSjohnlev {
5471d03c31eSjohnlev 	*safe = PAUSE_WAIT;
5481d03c31eSjohnlev 	membar_enter();
5491d03c31eSjohnlev 
5501d03c31eSjohnlev 	while (*safe != PAUSE_IDLE) {
5511d03c31eSjohnlev 		if (cpu_phase[CPU->cpu_id] == CPU_PHASE_WAIT_SAFE)
5521d03c31eSjohnlev 			enter_safe_phase();
5531d03c31eSjohnlev 		SMT_PAUSE();
554843e1988Sjohnlev 	}
555843e1988Sjohnlev }
556843e1988Sjohnlev 
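/*
 * Halt this CPU: print the optional message, then take our VCPU down
 * in the hypervisor.
 */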
557843e1988Sjohnlev void
558843e1988Sjohnlev mach_cpu_halt(char *msg)
559843e1988Sjohnlev {
560843e1988Sjohnlev 	if (msg)
561843e1988Sjohnlev 		prom_printf("%s\n", msg);
562843e1988Sjohnlev 	(void) xen_vcpu_down(CPU->cpu_id);
563843e1988Sjohnlev }
564843e1988Sjohnlev 
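/*
 * As described in the comment at the top of this file, powering CPUs
 * on or off from within Solaris is not supported, so both of these
 * entry points simply fail with ENOTSUP.
 */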
565843e1988Sjohnlev /*ARGSUSED*/
566843e1988Sjohnlev int
567843e1988Sjohnlev mp_cpu_poweron(struct cpu *cp)
568843e1988Sjohnlev {
569843e1988Sjohnlev 	return (ENOTSUP);
570843e1988Sjohnlev }
571843e1988Sjohnlev 
572843e1988Sjohnlev /*ARGSUSED*/
573843e1988Sjohnlev int
574843e1988Sjohnlev mp_cpu_poweroff(struct cpu *cp)
575843e1988Sjohnlev {
576843e1988Sjohnlev 	return (ENOTSUP);
577843e1988Sjohnlev }
578843e1988Sjohnlev 
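/*
 * Pause all other CPUs, then drive each powered-on CPU into the safe
 * phase (CPU_PHASE_SAFE) described in the comment at the top of this
 * file, poking it as needed so it notices the state change.
 */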
5791d03c31eSjohnlev void
5801d03c31eSjohnlev mp_enter_barrier(void)
581843e1988Sjohnlev {
5821d03c31eSjohnlev 	hrtime_t last_poke_time = 0;
5831d03c31eSjohnlev 	int poke_allowed = 0;
5841d03c31eSjohnlev 	int done = 0;
5851d03c31eSjohnlev 	int i;
586843e1988Sjohnlev 
587843e1988Sjohnlev 	ASSERT(MUTEX_HELD(&cpu_lock));
588843e1988Sjohnlev 
589*0ed5c46eSJosef 'Jeff' Sipek 	pause_cpus(NULL, NULL);
5901d03c31eSjohnlev 
5911d03c31eSjohnlev 	while (!done) {
5921d03c31eSjohnlev 		done = 1;
5931d03c31eSjohnlev 		poke_allowed = 0;
5941d03c31eSjohnlev 
5951d03c31eSjohnlev 		if (xpv_gethrtime() - last_poke_time > POKE_TIMEOUT) {
5961d03c31eSjohnlev 			last_poke_time = xpv_gethrtime();
5971d03c31eSjohnlev 			poke_allowed = 1;
598843e1988Sjohnlev 		}
599843e1988Sjohnlev 
6001d03c31eSjohnlev 		for (i = 0; i < NCPU; i++) {
6011d03c31eSjohnlev 			cpu_t *cp = cpu_get(i);
602843e1988Sjohnlev 
6031d03c31eSjohnlev 			if (cp == NULL || cp == CPU)
6041d03c31eSjohnlev 				continue;
6051d03c31eSjohnlev 
6061d03c31eSjohnlev 			switch (cpu_phase[i]) {
6071d03c31eSjohnlev 			case CPU_PHASE_NONE:
6081d03c31eSjohnlev 				cpu_phase[i] = CPU_PHASE_WAIT_SAFE;
6091d03c31eSjohnlev 				poke_cpu(i);
6101d03c31eSjohnlev 				done = 0;
6111d03c31eSjohnlev 				break;
6121d03c31eSjohnlev 
6131d03c31eSjohnlev 			case CPU_PHASE_WAIT_SAFE:
6141d03c31eSjohnlev 				if (poke_allowed)
6151d03c31eSjohnlev 					poke_cpu(i);
6161d03c31eSjohnlev 				done = 0;
6171d03c31eSjohnlev 				break;
6181d03c31eSjohnlev 
6191d03c31eSjohnlev 			case CPU_PHASE_SAFE:
6201d03c31eSjohnlev 			case CPU_PHASE_POWERED_OFF:
6211d03c31eSjohnlev 				break;
622843e1988Sjohnlev 			}
623843e1988Sjohnlev 		}
624843e1988Sjohnlev 
6251d03c31eSjohnlev 		SMT_PAUSE();
6261d03c31eSjohnlev 	}
6271d03c31eSjohnlev }
6281d03c31eSjohnlev 
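/*
 * Release the CPUs gathered by mp_enter_barrier(): verify each one is
 * still in a safe (or powered-off) phase, reset its phase, and restart
 * the paused CPUs.
 */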
6291d03c31eSjohnlev void
6301d03c31eSjohnlev mp_leave_barrier(void)
631843e1988Sjohnlev {
6321d03c31eSjohnlev 	int i;
6331d03c31eSjohnlev 
6341d03c31eSjohnlev 	ASSERT(MUTEX_HELD(&cpu_lock));
6351d03c31eSjohnlev 
6361d03c31eSjohnlev 	for (i = 0; i < NCPU; i++) {
6371d03c31eSjohnlev 		cpu_t *cp = cpu_get(i);
6381d03c31eSjohnlev 
6391d03c31eSjohnlev 		if (cp == NULL || cp == CPU)
6401d03c31eSjohnlev 			continue;
6411d03c31eSjohnlev 
6421d03c31eSjohnlev 		switch (cpu_phase[i]) {
6431d03c31eSjohnlev 		/*
6441d03c31eSjohnlev 		 * If we see a CPU in one of these phases, something has
6451d03c31eSjohnlev 		 * gone badly wrong with the guarantees
6461d03c31eSjohnlev 		 * mp_enter_barrier() is supposed to provide.  Rather
6471d03c31eSjohnlev 		 * than attempt to stumble along (and since we can't
6481d03c31eSjohnlev 		 * panic properly in this context), we tell the
6491d03c31eSjohnlev 		 * hypervisor we've crashed.
6501d03c31eSjohnlev 		 */
6511d03c31eSjohnlev 		case CPU_PHASE_NONE:
6521d03c31eSjohnlev 		case CPU_PHASE_WAIT_SAFE:
6531d03c31eSjohnlev 			(void) HYPERVISOR_shutdown(SHUTDOWN_crash);
6541d03c31eSjohnlev 			break;
6551d03c31eSjohnlev 
6561d03c31eSjohnlev 		case CPU_PHASE_POWERED_OFF:
6571d03c31eSjohnlev 			break;
6581d03c31eSjohnlev 
6591d03c31eSjohnlev 		case CPU_PHASE_SAFE:
6601d03c31eSjohnlev 			cpu_phase[i] = CPU_PHASE_NONE;
6611d03c31eSjohnlev 		}
662843e1988Sjohnlev 	}
663843e1988Sjohnlev 
6641d03c31eSjohnlev 	start_cpus();
6651d03c31eSjohnlev }
6661d03c31eSjohnlev 
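/*
 * Power off an already-quiesced CPU: take its VCPU down in the
 * hypervisor inside the barrier, remove it from cpu_ready_set, and
 * update its cpu_flags and phase to reflect the powered-off state.
 */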
667843e1988Sjohnlev static int
668843e1988Sjohnlev poweroff_vcpu(struct cpu *cp)
669843e1988Sjohnlev {
670843e1988Sjohnlev 	int error;
671843e1988Sjohnlev 
672843e1988Sjohnlev 	ASSERT(MUTEX_HELD(&cpu_lock));
673843e1988Sjohnlev 
674843e1988Sjohnlev 	ASSERT(CPU->cpu_id != cp->cpu_id);
675843e1988Sjohnlev 	ASSERT(cp->cpu_flags & CPU_QUIESCED);
676843e1988Sjohnlev 
6771d03c31eSjohnlev 	mp_enter_barrier();
678843e1988Sjohnlev 
679843e1988Sjohnlev 	if ((error = xen_vcpu_down(cp->cpu_id)) == 0) {
6801d03c31eSjohnlev 		ASSERT(cpu_phase[cp->cpu_id] == CPU_PHASE_SAFE);
6811d03c31eSjohnlev 
682843e1988Sjohnlev 		CPUSET_DEL(cpu_ready_set, cp->cpu_id);
6831d03c31eSjohnlev 
684843e1988Sjohnlev 		cp->cpu_flags |= CPU_POWEROFF | CPU_OFFLINE;
685843e1988Sjohnlev 		cp->cpu_flags &=
686843e1988Sjohnlev 		    ~(CPU_RUNNING | CPU_READY | CPU_EXISTS | CPU_ENABLE);
687843e1988Sjohnlev 
6881d03c31eSjohnlev 		cpu_phase[cp->cpu_id] = CPU_PHASE_POWERED_OFF;
6891d03c31eSjohnlev 
690843e1988Sjohnlev 		cpu_set_state(cp);
691843e1988Sjohnlev 	}
6921d03c31eSjohnlev 
6931d03c31eSjohnlev 	mp_leave_barrier();
6941d03c31eSjohnlev 
695843e1988Sjohnlev 	return (error);
696843e1988Sjohnlev }
697843e1988Sjohnlev 
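/*
 * Power off a CPU in response to a xenstore request: first take it
 * offline, then power off its VCPU, retrying if another thread
 * un-quiesces it in the window where cpu_lock is dropped.
 */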
698843e1988Sjohnlev static int
699843e1988Sjohnlev vcpu_config_poweroff(processorid_t id)
700843e1988Sjohnlev {
701843e1988Sjohnlev 	int oldstate;
702843e1988Sjohnlev 	int error;
703843e1988Sjohnlev 	cpu_t *cp;
704843e1988Sjohnlev 
705843e1988Sjohnlev 	mutex_enter(&cpu_lock);
706843e1988Sjohnlev 
707843e1988Sjohnlev 	if ((cp = cpu_get(id)) == NULL) {
708843e1988Sjohnlev 		mutex_exit(&cpu_lock);
709843e1988Sjohnlev 		return (ESRCH);
710843e1988Sjohnlev 	}
711843e1988Sjohnlev 
712843e1988Sjohnlev 	if (cpu_get_state(cp) == P_POWEROFF) {
713843e1988Sjohnlev 		mutex_exit(&cpu_lock);
714843e1988Sjohnlev 		return (0);
715843e1988Sjohnlev 	}
716843e1988Sjohnlev 
717843e1988Sjohnlev 	mutex_exit(&cpu_lock);
718843e1988Sjohnlev 
719843e1988Sjohnlev 	do {
720843e1988Sjohnlev 		error = p_online_internal(id, P_OFFLINE,
721843e1988Sjohnlev 		    &oldstate);
722843e1988Sjohnlev 
723843e1988Sjohnlev 		if (error != 0)
724843e1988Sjohnlev 			break;
725843e1988Sjohnlev 
726843e1988Sjohnlev 		/*
727843e1988Sjohnlev 		 * So we just changed it to P_OFFLINE.  But then we dropped
728843e1988Sjohnlev 		 * cpu_lock, so now it is possible for another thread to change
729843e1988Sjohnlev 		 * the cpu back to a different, non-quiesced state e.g.
730843e1988Sjohnlev 		 * P_ONLINE.
731843e1988Sjohnlev 		 */
732843e1988Sjohnlev 		mutex_enter(&cpu_lock);
733843e1988Sjohnlev 		if ((cp = cpu_get(id)) == NULL)
734843e1988Sjohnlev 			error = ESRCH;
735843e1988Sjohnlev 		else {
736843e1988Sjohnlev 			if (cp->cpu_flags & CPU_QUIESCED)
737843e1988Sjohnlev 				error = poweroff_vcpu(cp);
738843e1988Sjohnlev 			else
739843e1988Sjohnlev 				error = EBUSY;
740843e1988Sjohnlev 		}
741843e1988Sjohnlev 		mutex_exit(&cpu_lock);
742843e1988Sjohnlev 	} while (error == EBUSY);
743843e1988Sjohnlev 
744843e1988Sjohnlev 	return (error);
745843e1988Sjohnlev }
746843e1988Sjohnlev 
747843e1988Sjohnlev /*
748843e1988Sjohnlev  * Add a new virtual cpu to the domain.
749843e1988Sjohnlev  */
750843e1988Sjohnlev static int
751843e1988Sjohnlev vcpu_config_new(processorid_t id)
752843e1988Sjohnlev {
753843e1988Sjohnlev 	extern int start_cpu(processorid_t);
754843e1988Sjohnlev 	int error;
755843e1988Sjohnlev 
756843e1988Sjohnlev 	if (ncpus == 1) {
757843e1988Sjohnlev 		printf("cannot (yet) add cpus to a single-cpu domain\n");
758843e1988Sjohnlev 		return (ENOTSUP);
759843e1988Sjohnlev 	}
760843e1988Sjohnlev 
761843e1988Sjohnlev 	affinity_set(CPU_CURRENT);
762843e1988Sjohnlev 	error = start_cpu(id);
763843e1988Sjohnlev 	affinity_clear();
764843e1988Sjohnlev 	return (error);
765843e1988Sjohnlev }
766843e1988Sjohnlev 
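/*
 * Power a previously-initialized VCPU back on: bring it up in the
 * hypervisor, add it back to cpu_ready_set, and clear CPU_POWEROFF.
 */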
767843e1988Sjohnlev static int
7681d03c31eSjohnlev poweron_vcpu(struct cpu *cp)
7691d03c31eSjohnlev {
7701d03c31eSjohnlev 	int error;
7711d03c31eSjohnlev 
7721d03c31eSjohnlev 	ASSERT(MUTEX_HELD(&cpu_lock));
7731d03c31eSjohnlev 
7741d03c31eSjohnlev 	if (HYPERVISOR_vcpu_op(VCPUOP_is_up, cp->cpu_id, NULL) != 0) {
7751d03c31eSjohnlev 		printf("poweron_vcpu: vcpu%d is not available!\n",
7761d03c31eSjohnlev 		    cp->cpu_id);
7771d03c31eSjohnlev 		return (ENXIO);
7781d03c31eSjohnlev 	}
7791d03c31eSjohnlev 
7801d03c31eSjohnlev 	if ((error = xen_vcpu_up(cp->cpu_id)) == 0) {
7811d03c31eSjohnlev 		CPUSET_ADD(cpu_ready_set, cp->cpu_id);
7821d03c31eSjohnlev 		cp->cpu_flags |= CPU_EXISTS | CPU_READY | CPU_RUNNING;
7831d03c31eSjohnlev 		cp->cpu_flags &= ~CPU_POWEROFF;
7841d03c31eSjohnlev 		/*
7851d03c31eSjohnlev 		 * There are some nasty races possible here.
7861d03c31eSjohnlev 		 * Tell the vcpu it's up one more time.
7871d03c31eSjohnlev 		 * XXPV	Is this enough?  Is this safe?
7881d03c31eSjohnlev 		 */
7891d03c31eSjohnlev 		(void) xen_vcpu_up(cp->cpu_id);
7901d03c31eSjohnlev 
7911d03c31eSjohnlev 		cpu_phase[cp->cpu_id] = CPU_PHASE_NONE;
7921d03c31eSjohnlev 
7931d03c31eSjohnlev 		cpu_set_state(cp);
7941d03c31eSjohnlev 	}
7951d03c31eSjohnlev 	return (error);
7961d03c31eSjohnlev }
7971d03c31eSjohnlev 
7981d03c31eSjohnlev static int
799843e1988Sjohnlev vcpu_config_poweron(processorid_t id)
800843e1988Sjohnlev {
801843e1988Sjohnlev 	cpu_t *cp;
802843e1988Sjohnlev 	int oldstate;
803843e1988Sjohnlev 	int error;
804843e1988Sjohnlev 
805843e1988Sjohnlev 	if (id >= ncpus)
806843e1988Sjohnlev 		return (vcpu_config_new(id));
807843e1988Sjohnlev 
808843e1988Sjohnlev 	mutex_enter(&cpu_lock);
809843e1988Sjohnlev 
810843e1988Sjohnlev 	if ((cp = cpu_get(id)) == NULL) {
811843e1988Sjohnlev 		mutex_exit(&cpu_lock);
812843e1988Sjohnlev 		return (ESRCH);
813843e1988Sjohnlev 	}
814843e1988Sjohnlev 
815843e1988Sjohnlev 	if (cpu_get_state(cp) != P_POWEROFF) {
816843e1988Sjohnlev 		mutex_exit(&cpu_lock);
817843e1988Sjohnlev 		return (0);
818843e1988Sjohnlev 	}
819843e1988Sjohnlev 
820843e1988Sjohnlev 	if ((error = poweron_vcpu(cp)) != 0) {
821843e1988Sjohnlev 		mutex_exit(&cpu_lock);
822843e1988Sjohnlev 		return (error);
823843e1988Sjohnlev 	}
824843e1988Sjohnlev 
825843e1988Sjohnlev 	mutex_exit(&cpu_lock);
826843e1988Sjohnlev 
827843e1988Sjohnlev 	return (p_online_internal(id, P_ONLINE, &oldstate));
828843e1988Sjohnlev }
829843e1988Sjohnlev 
830843e1988Sjohnlev #define	REPORT_LEN	128
831843e1988Sjohnlev 
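/*
 * Log the outcome of an externally initiated power-on/power-off
 * request, decoding the more common error codes into readable text.
 */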
832843e1988Sjohnlev static void
833843e1988Sjohnlev vcpu_config_report(processorid_t id, uint_t newstate, int error)
834843e1988Sjohnlev {
835843e1988Sjohnlev 	char *report = kmem_alloc(REPORT_LEN, KM_SLEEP);
836843e1988Sjohnlev 	size_t len;
837843e1988Sjohnlev 	char *ps;
838843e1988Sjohnlev 
839843e1988Sjohnlev 	switch (newstate) {
840843e1988Sjohnlev 	case P_ONLINE:
841843e1988Sjohnlev 		ps = PS_ONLINE;
842843e1988Sjohnlev 		break;
843843e1988Sjohnlev 	case P_POWEROFF:
844843e1988Sjohnlev 		ps = PS_POWEROFF;
845843e1988Sjohnlev 		break;
846843e1988Sjohnlev 	default:
847843e1988Sjohnlev 		cmn_err(CE_PANIC, "unknown state %u\n", newstate);
848843e1988Sjohnlev 		break;
849843e1988Sjohnlev 	}
850843e1988Sjohnlev 
851843e1988Sjohnlev 	len = snprintf(report, REPORT_LEN,
852843e1988Sjohnlev 	    "cpu%d: externally initiated %s", id, ps);
853843e1988Sjohnlev 
854843e1988Sjohnlev 	if (!error) {
855843e1988Sjohnlev 		cmn_err(CE_CONT, "!%s\n", report);
856843e1988Sjohnlev 		kmem_free(report, REPORT_LEN);
857843e1988Sjohnlev 		return;
858843e1988Sjohnlev 	}
859843e1988Sjohnlev 
860843e1988Sjohnlev 	len += snprintf(report + len, REPORT_LEN - len,
861843e1988Sjohnlev 	    " failed, error %d: ", error);
862843e1988Sjohnlev 	switch (error) {
863843e1988Sjohnlev 	case EEXIST:
864843e1988Sjohnlev 		len += snprintf(report + len, REPORT_LEN - len,
865843e1988Sjohnlev 		    "cpu already %s", ps ? ps : "?");
866843e1988Sjohnlev 		break;
867843e1988Sjohnlev 	case ESRCH:
868843e1988Sjohnlev 		len += snprintf(report + len, REPORT_LEN - len,
869843e1988Sjohnlev 		    "cpu not found");
870843e1988Sjohnlev 		break;
871843e1988Sjohnlev 	case EINVAL:
872843e1988Sjohnlev 	case EALREADY:
873843e1988Sjohnlev 		break;
874843e1988Sjohnlev 	case EPERM:
875843e1988Sjohnlev 		len += snprintf(report + len, REPORT_LEN - len,
876843e1988Sjohnlev 		    "insufficient privilege (0x%x)", id);
877843e1988Sjohnlev 		break;
878843e1988Sjohnlev 	case EBUSY:
879843e1988Sjohnlev 		switch (newstate) {
880843e1988Sjohnlev 		case P_ONLINE:
881843e1988Sjohnlev 			/*
882843e1988Sjohnlev 			 * This return comes from mp_cpu_start -
883843e1988Sjohnlev 			 * we cannot 'start' the boot CPU.
884843e1988Sjohnlev 			 */
885843e1988Sjohnlev 			len += snprintf(report + len, REPORT_LEN - len,
886843e1988Sjohnlev 			    "already running");
887843e1988Sjohnlev 			break;
888843e1988Sjohnlev 		case P_POWEROFF:
889843e1988Sjohnlev 			len += snprintf(report + len, REPORT_LEN - len,
890843e1988Sjohnlev 			    "bound lwps?");
891843e1988Sjohnlev 			break;
892843e1988Sjohnlev 		default:
893843e1988Sjohnlev 			break;
894843e1988Sjohnlev 		}
895843e1988Sjohnlev 	default:
896843e1988Sjohnlev 		break;
897843e1988Sjohnlev 	}
898843e1988Sjohnlev 
899843e1988Sjohnlev 	cmn_err(CE_CONT, "%s\n", report);
900843e1988Sjohnlev 	kmem_free(report, REPORT_LEN);
901843e1988Sjohnlev }
902843e1988Sjohnlev 
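/*
 * Taskq callback: read the target "availability" state for this CPU
 * from xenstore, power it on or off accordingly, and report the result.
 */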
903843e1988Sjohnlev static void
904843e1988Sjohnlev vcpu_config(void *arg)
905843e1988Sjohnlev {
906843e1988Sjohnlev 	int id = (int)(uintptr_t)arg;
907843e1988Sjohnlev 	int error;
908843e1988Sjohnlev 	char dir[16];
909843e1988Sjohnlev 	char *state;
910843e1988Sjohnlev 
911843e1988Sjohnlev 	if ((uint_t)id >= max_ncpus) {
912843e1988Sjohnlev 		cmn_err(CE_WARN,
913843e1988Sjohnlev 		    "vcpu_config: cpu%d does not fit in this domain", id);
914843e1988Sjohnlev 		return;
915843e1988Sjohnlev 	}
916843e1988Sjohnlev 
917843e1988Sjohnlev 	(void) snprintf(dir, sizeof (dir), "cpu/%d", id);
918843e1988Sjohnlev 	state = kmem_alloc(MAXPATHLEN, KM_SLEEP);
919843e1988Sjohnlev 	if (xenbus_scanf(XBT_NULL, dir, "availability", "%s", state) == 0) {
920843e1988Sjohnlev 		if (strcmp(state, "online") == 0) {
921843e1988Sjohnlev 			error = vcpu_config_poweron(id);
922843e1988Sjohnlev 			vcpu_config_report(id, P_ONLINE, error);
923843e1988Sjohnlev 		} else if (strcmp(state, "offline") == 0) {
924843e1988Sjohnlev 			error = vcpu_config_poweroff(id);
925843e1988Sjohnlev 			vcpu_config_report(id, P_POWEROFF, error);
926843e1988Sjohnlev 		} else {
927843e1988Sjohnlev 			cmn_err(CE_WARN,
928843e1988Sjohnlev 			    "cpu%d: unknown target state '%s'", id, state);
929843e1988Sjohnlev 		}
930843e1988Sjohnlev 	} else
931843e1988Sjohnlev 		cmn_err(CE_WARN,
932843e1988Sjohnlev 		    "cpu%d: unable to read target state from xenstore", id);
933843e1988Sjohnlev 
934843e1988Sjohnlev 	kmem_free(state, MAXPATHLEN);
935843e1988Sjohnlev }
936843e1988Sjohnlev 
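/*
 * xenbus watch callback: a node under "cpu/" changed.  Extract the CPU
 * id from the path and hand the actual work off to the config taskq.
 */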
937843e1988Sjohnlev /*ARGSUSED*/
938843e1988Sjohnlev static void
939843e1988Sjohnlev vcpu_config_event(struct xenbus_watch *watch, const char **vec, uint_t len)
940843e1988Sjohnlev {
941843e1988Sjohnlev 	const char *path = vec[XS_WATCH_PATH];
942843e1988Sjohnlev 	processorid_t id;
943843e1988Sjohnlev 	char *s;
944843e1988Sjohnlev 
945843e1988Sjohnlev 	if ((s = strstr(path, "cpu/")) != NULL &&
946843e1988Sjohnlev 	    sscanf(s, "cpu/%d", &id) == 1) {
947843e1988Sjohnlev 		/*
948843e1988Sjohnlev 		 * Run the virtual CPU configuration on a separate thread to
949843e1988Sjohnlev 		 * avoid blocking on this event for too long (and for now,
950843e1988Sjohnlev 		 * to ensure configuration requests are serialized.)
951843e1988Sjohnlev 		 */
952843e1988Sjohnlev 		(void) taskq_dispatch(cpu_config_tq,
953843e1988Sjohnlev 		    vcpu_config, (void *)(uintptr_t)id, 0);
954843e1988Sjohnlev 	}
955843e1988Sjohnlev }
956843e1988Sjohnlev 
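/*
 * Ask the hypervisor to initialize a VCPU with the given guest context,
 * translating the more common failure codes into useful diagnostics.
 */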
957843e1988Sjohnlev static int
958843e1988Sjohnlev xen_vcpu_initialize(processorid_t id, vcpu_guest_context_t *vgc)
959843e1988Sjohnlev {
960843e1988Sjohnlev 	int err;
961843e1988Sjohnlev 
962843e1988Sjohnlev 	if ((err = HYPERVISOR_vcpu_op(VCPUOP_initialise, id, vgc)) != 0) {
963843e1988Sjohnlev 		char *str;
964843e1988Sjohnlev 		int level = CE_WARN;
965843e1988Sjohnlev 
966843e1988Sjohnlev 		switch (err) {
967843e1988Sjohnlev 		case -X_EINVAL:
968843e1988Sjohnlev 			/*
969843e1988Sjohnlev 			 * This interface squashes multiple error sources
970843e1988Sjohnlev 			 * to one error code.  In particular, an X_EINVAL
971843e1988Sjohnlev 			 * code can mean:
972843e1988Sjohnlev 			 *
973843e1988Sjohnlev 			 * -	the vcpu id is out of range
974843e1988Sjohnlev 			 * -	cs or ss are in ring 0
975843e1988Sjohnlev 			 * -	cr3 is wrong
976843e1988Sjohnlev 			 * -	an entry in the new gdt is above the
977843e1988Sjohnlev 			 *	reserved entry
978843e1988Sjohnlev 			 * -	a frame underneath the new gdt is bad
979843e1988Sjohnlev 			 */
980843e1988Sjohnlev 			str = "something is wrong :(";
981843e1988Sjohnlev 			break;
982843e1988Sjohnlev 		case -X_ENOENT:
983843e1988Sjohnlev 			str = "no such cpu";
984843e1988Sjohnlev 			break;
985843e1988Sjohnlev 		case -X_ENOMEM:
986843e1988Sjohnlev 			str = "no mem to copy ctxt";
987843e1988Sjohnlev 			break;
988843e1988Sjohnlev 		case -X_EFAULT:
989843e1988Sjohnlev 			str = "bad address";
990843e1988Sjohnlev 			break;
991843e1988Sjohnlev 		case -X_EEXIST:
992843e1988Sjohnlev 			/*
993843e1988Sjohnlev 			 * Hmm.  This error is returned if the vcpu has already
994843e1988Sjohnlev 			 * been initialized once before in the lifetime of this
995843e1988Sjohnlev 			 * domain.  This is a logic error in the kernel.
996843e1988Sjohnlev 			 */
997843e1988Sjohnlev 			level = CE_PANIC;
998843e1988Sjohnlev 			str = "already initialized";
999843e1988Sjohnlev 			break;
1000843e1988Sjohnlev 		default:
1001843e1988Sjohnlev 			level = CE_PANIC;
1002843e1988Sjohnlev 			str = "<unexpected>";
1003843e1988Sjohnlev 			break;
1004843e1988Sjohnlev 		}
1005843e1988Sjohnlev 
1006843e1988Sjohnlev 		cmn_err(level, "vcpu%d: failed to init: error %d: %s",
1007843e1988Sjohnlev 		    id, -err, str);
1008843e1988Sjohnlev 	}
1009843e1988Sjohnlev 	return (err);
1010843e1988Sjohnlev }
1011843e1988Sjohnlev 
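/*
 * Bring a VCPU up via VCPUOP_up.  On failure, print a diagnostic and
 * return a deliberately distinctive error code.
 */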
1012843e1988Sjohnlev long
1013843e1988Sjohnlev xen_vcpu_up(processorid_t id)
1014843e1988Sjohnlev {
1015843e1988Sjohnlev 	long err;
1016843e1988Sjohnlev 
1017843e1988Sjohnlev 	if ((err = HYPERVISOR_vcpu_op(VCPUOP_up, id, NULL)) != 0) {
1018843e1988Sjohnlev 		char *str;
1019843e1988Sjohnlev 
1020843e1988Sjohnlev 		switch (err) {
1021843e1988Sjohnlev 		case -X_ENOENT:
1022843e1988Sjohnlev 			str = "no such cpu";
1023843e1988Sjohnlev 			break;
1024843e1988Sjohnlev 		case -X_EINVAL:
1025843e1988Sjohnlev 			/*
1026843e1988Sjohnlev 			 * Perhaps this is diagnostic overkill.
1027843e1988Sjohnlev 			 */
1028843e1988Sjohnlev 			if (HYPERVISOR_vcpu_op(VCPUOP_is_up, id, NULL) < 0)
1029843e1988Sjohnlev 				str = "bad cpuid";
1030843e1988Sjohnlev 			else
1031843e1988Sjohnlev 				str = "not initialized";
1032843e1988Sjohnlev 			break;
1033843e1988Sjohnlev 		default:
1034843e1988Sjohnlev 			str = "<unexpected>";
1035843e1988Sjohnlev 			break;
1036843e1988Sjohnlev 		}
1037843e1988Sjohnlev 
1038843e1988Sjohnlev 		printf("vcpu%d: failed to start: error %d: %s\n",
1039843e1988Sjohnlev 		    id, -(int)err, str);
1040843e1988Sjohnlev 		return (EBFONT);	/* deliberately silly */
1041843e1988Sjohnlev 	}
1042843e1988Sjohnlev 	return (err);
1043843e1988Sjohnlev }
1044843e1988Sjohnlev 
1045843e1988Sjohnlev long
1046843e1988Sjohnlev xen_vcpu_down(processorid_t id)
1047843e1988Sjohnlev {
1048843e1988Sjohnlev 	long err;
1049843e1988Sjohnlev 
1050843e1988Sjohnlev 	if ((err = HYPERVISOR_vcpu_op(VCPUOP_down, id, NULL)) != 0) {
1051843e1988Sjohnlev 		/*
1052843e1988Sjohnlev 		 * X_ENOENT:	no such cpu
1053843e1988Sjohnlev 		 * X_EINVAL:	bad cpuid
1054843e1988Sjohnlev 		 */
1055843e1988Sjohnlev 		panic("vcpu%d: failed to stop: error %d", id, -(int)err);
1056843e1988Sjohnlev 	}
1057843e1988Sjohnlev 
1058843e1988Sjohnlev 	return (err);
1059843e1988Sjohnlev }