/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
 * Use is subject to license terms.
 */

/*
 * Virtual CPU management.
 *
 * VCPUs can be controlled in one of two ways: through the domain itself
 * (psradm, p_online(), etc.), or via changes in xenstore (vcpu_config()).
 * Unfortunately, the terminology is used in different ways; they work out as
 * follows:
 *
 * P_ONLINE: the VCPU is up and running, taking interrupts and running threads
 *
 * P_OFFLINE: the VCPU is up and running, but quiesced (i.e. blocked in the
 * hypervisor on the idle thread).  It must be up since a downed VCPU cannot
 * receive interrupts, and we require this for offline CPUs in Solaris.
 *
 * P_POWEROFF: the VCPU is down (we never called xen_vcpu_up(), or called
 * xen_vcpu_down() for it).  It can't take interrupts or run anything, though
 * if it has run previously, its software state (cpu_t, machcpu structures, IPI
 * event channels, etc.) will still exist.
 *
 * The hypervisor has two notions of CPU states as represented in the store:
 *
 * "offline": the VCPU is down.  Corresponds to P_POWEROFF.
 *
 * "online": the VCPU is running.  Corresponds to a CPU state other than
 * P_POWEROFF.
 *
 * Currently, only a notification via xenstore can bring a CPU into a
 * P_POWEROFF state, and only the domain can change between P_ONLINE, P_NOINTR,
 * P_OFFLINE, etc.  We need to be careful to treat xenstore notifications
 * idempotently, as we'll get 'duplicate' entries when we resume a domain.
 *
 * Note that the xenstore configuration is strictly advisory, in that a domain
 * can choose to ignore it and still power up a VCPU in the offline state.  To
 * play nice, we don't allow it.  Thus, any attempt to power a CPU on or off
 * from within Solaris returns ENOTSUP.
 *
 * Powering off a VCPU and suspending the domain use similar code.  The
 * difficulty here is that we must ensure that each VCPU is in a stable
 * state: it must have a saved PCB, and not be responding to interrupts
 * (since we are just about to remove its ability to run on a real CPU,
 * possibly forever).  However, an offline CPU in Solaris can take
 * cross-call interrupts, as mentioned, so we must go through a
 * two-stage process.  First, we use the standard Solaris pause_cpus().
 * This ensures that all CPUs are either in mach_cpu_pause() or
 * mach_cpu_idle(), and nothing will cross-call them.
 *
 * Powered-off CPUs are already safe, as we own the cpu_lock needed to
 * bring them back up, and in state CPU_PHASE_POWERED_OFF.
 *
 * Running CPUs are spinning in mach_cpu_pause() waiting for either
 * PAUSE_IDLE or CPU_PHASE_WAIT_SAFE.
 *
 * Offline CPUs are either running the idle thread and periodically
 * checking for CPU_PHASE_WAIT_SAFE, or blocked in the hypervisor.
 *
 * Thus, we set CPU_PHASE_WAIT_SAFE for every powered-on CPU, as well as
 * poking them to make sure they're not blocked[1].  When every CPU has
 * responded by reaching a safe state and setting CPU_PHASE_SAFE, we
 * know we can suspend, or power off a CPU, without problems.
 *
 * [1] note that we have to repeatedly poke offline CPUs: it's the only
 * way to ensure that the CPU doesn't miss the state change before
 * dropping into HYPERVISOR_block().
 */

#include <sys/types.h>
#include <sys/systm.h>
#include <sys/param.h>
#include <sys/taskq.h>
#include <sys/cmn_err.h>
#include <sys/archsystm.h>
#include <sys/machsystm.h>
#include <sys/segments.h>
#include <sys/cpuvar.h>
#include <sys/x86_archext.h>
#include <sys/controlregs.h>
#include <sys/hypervisor.h>
#include <sys/xpv_panic.h>
#include <sys/mman.h>
#include <sys/psw.h>
#include <sys/cpu.h>
#include <sys/sunddi.h>
#include <util/sscanf.h>
#include <vm/hat_i86.h>
#include <vm/hat.h>
#include <vm/as.h>

#include <xen/public/io/xs_wire.h>
#include <xen/sys/xenbus_impl.h>
#include <xen/public/vcpu.h>

extern cpuset_t cpu_ready_set;

#define	CPU_PHASE_NONE		0
#define	CPU_PHASE_WAIT_SAFE	1
#define	CPU_PHASE_SAFE		2
#define	CPU_PHASE_POWERED_OFF	3

/*
 * While entering the barrier, we may poke CPUs at most 256 times a
 * second.
 */
#define	POKE_TIMEOUT	(NANOSEC / 256)

static taskq_t *cpu_config_tq;
static int cpu_phase[NCPU];

static void vcpu_config_event(struct xenbus_watch *, const char **, uint_t);
static int xen_vcpu_initialize(processorid_t, vcpu_guest_context_t *);

/*
 * Return whether or not the vcpu is actually running on a pcpu
 */
int
vcpu_on_pcpu(processorid_t cpu)
{
	struct vcpu_runstate_info runstate;
	int	ret = VCPU_STATE_UNKNOWN;

	ASSERT(cpu < NCPU);
	/*
	 * Don't bother with the hypercall if we are asking about ourselves
	 */
	if (cpu == CPU->cpu_id)
		return (VCPU_ON_PCPU);
	if (HYPERVISOR_vcpu_op(VCPUOP_get_runstate_info, cpu, &runstate) != 0)
		goto out;

	switch (runstate.state) {
	case RUNSTATE_running:
		ret = VCPU_ON_PCPU;
		break;

	case RUNSTATE_runnable:
	case RUNSTATE_offline:
	case RUNSTATE_blocked:
		ret = VCPU_NOT_ON_PCPU;
		break;

	default:
		break;
	}

out:
	return (ret);
}

/*
 * These routines allocate any global state that might be needed
 * while starting cpus.  For virtual cpus, there is no such state.
 */
int
mach_cpucontext_init(void)
{
	return (0);
}

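/*
 * Callback invoked once the xenstore connection is up: register the watch
 * on the "cpu" subtree so that vcpu_config_event() fires for virtual CPU
 * configuration changes.
 */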
void
do_cpu_config_watch(int state)
{
	static struct xenbus_watch cpu_config_watch;

	if (state != XENSTORE_UP)
		return;
	cpu_config_watch.node = "cpu";
	cpu_config_watch.callback = vcpu_config_event;
	if (register_xenbus_watch(&cpu_config_watch)) {
		taskq_destroy(cpu_config_tq);
		cmn_err(CE_WARN, "do_cpu_config_watch: "
		    "failed to set vcpu config watch");
	}

}

/*
 * This routine is called after all the "normal" MP startup has
 * been done; a good place to start watching xen store for virtual
 * cpu hot plug events.
 */
void
mach_cpucontext_fini(void)
{

	cpu_config_tq = taskq_create("vcpu config taskq", 1,
	    maxclsyspri - 1, 1, 1, TASKQ_PREPOPULATE);

	(void) xs_register_xenbus_callback(do_cpu_config_watch);
}

/*
 * Fill in the remaining CPU context and initialize it.
 */
static int
mp_set_cpu_context(vcpu_guest_context_t *vgc, cpu_t *cp)
{
	uint_t vec, iopl;

	vgc->flags = VGCF_IN_KERNEL;

	/*
	 * fpu_ctx we leave as zero; on first fault we'll store
	 * sse_initial into it anyway.
	 */

#if defined(__amd64)
	vgc->user_regs.cs = KCS_SEL | SEL_KPL;	/* force to ring 3 */
#else
	vgc->user_regs.cs = KCS_SEL;
#endif
	vgc->user_regs.ds = KDS_SEL;
	vgc->user_regs.es = KDS_SEL;
	vgc->user_regs.ss = KDS_SEL;
	vgc->kernel_ss = KDS_SEL;

	/*
	 * Allow I/O privilege level for Dom0 kernel.
	 */
	if (DOMAIN_IS_INITDOMAIN(xen_info))
		iopl = (PS_IOPL & 0x1000);	/* ring 1 */
	else
		iopl = 0;

#if defined(__amd64)
	vgc->user_regs.fs = 0;
	vgc->user_regs.gs = 0;
	vgc->user_regs.rflags = F_OFF | iopl;
#elif defined(__i386)
	vgc->user_regs.fs = KFS_SEL;
	vgc->user_regs.gs = KGS_SEL;
	vgc->user_regs.eflags = F_OFF | iopl;
	vgc->event_callback_cs = vgc->user_regs.cs;
	vgc->failsafe_callback_cs = vgc->user_regs.cs;
#endif

	/*
	 * Initialize the trap_info_t from the IDT
	 */
#if !defined(__lint)
	ASSERT(NIDT == sizeof (vgc->trap_ctxt) / sizeof (vgc->trap_ctxt[0]));
#endif
	for (vec = 0; vec < NIDT; vec++) {
		trap_info_t *ti = &vgc->trap_ctxt[vec];

		if (xen_idt_to_trap_info(vec,
		    &cp->cpu_m.mcpu_idt[vec], ti) == 0) {
			ti->cs = KCS_SEL;
			ti->vector = vec;
		}
	}

	/*
	 * No LDT
	 */

	/*
	 * (We assert in various places that the GDT is (a) aligned on a
	 * page boundary and (b) one page long, so this really should fit..)
	 */
#ifdef CRASH_XEN
	vgc->gdt_frames[0] = pa_to_ma(mmu_btop(cp->cpu_m.mcpu_gdtpa));
#else
	vgc->gdt_frames[0] = pfn_to_mfn(mmu_btop(cp->cpu_m.mcpu_gdtpa));
#endif
	vgc->gdt_ents = NGDT;

	vgc->ctrlreg[0] = CR0_ENABLE_FPU_FLAGS(getcr0());

#if defined(__i386)
	if (mmu.pae_hat)
		vgc->ctrlreg[3] =
		    xen_pfn_to_cr3(pfn_to_mfn(kas.a_hat->hat_htable->ht_pfn));
	else
#endif
		vgc->ctrlreg[3] =
		    pa_to_ma(mmu_ptob(kas.a_hat->hat_htable->ht_pfn));

	vgc->ctrlreg[4] = getcr4();

	vgc->event_callback_eip = (uintptr_t)xen_callback;
	vgc->failsafe_callback_eip = (uintptr_t)xen_failsafe_callback;
	vgc->flags |= VGCF_failsafe_disables_events;

#if defined(__amd64)
	/*
	 * XXPV should this be moved to init_cpu_syscall?
	 */
	vgc->syscall_callback_eip = (uintptr_t)sys_syscall;
	vgc->flags |= VGCF_syscall_disables_events;

	ASSERT(vgc->user_regs.gs == 0);
	vgc->gs_base_kernel = (uintptr_t)cp;
#endif

	return (xen_vcpu_initialize(cp->cpu_id, vgc));
}

/*
 * Create a guest virtual cpu context so that the virtual cpu
 * springs into life in the domain just about to call mp_startup()
 *
 * Virtual CPUs must be initialized once in the lifetime of the domain;
 * after that subsequent attempts to start them will fail with X_EEXIST.
 *
 * Thus 'alloc' -really- creates and initializes the virtual
 * CPU context just once.  Once the initialisation succeeds, we never
 * free it, nor the regular cpu_t to which it refers.
 */
void *
mach_cpucontext_alloc(struct cpu *cp)
{
	kthread_t *tp = cp->cpu_thread;
	vcpu_guest_context_t vgc;

	int err = 1;

	/*
	 * First, augment the incoming cpu structure
	 * - vcpu pointer reference
	 * - pending event storage area
	 * - physical address of GDT
	 */
	cp->cpu_m.mcpu_vcpu_info =
	    &HYPERVISOR_shared_info->vcpu_info[cp->cpu_id];
	cp->cpu_m.mcpu_evt_pend = kmem_zalloc(
	    sizeof (struct xen_evt_data), KM_SLEEP);
	cp->cpu_m.mcpu_gdtpa =
	    mmu_ptob(hat_getpfnum(kas.a_hat, (caddr_t)cp->cpu_gdt));

	if ((err = xen_gdt_setprot(cp, PROT_READ)) != 0)
		goto done;

	/*
	 * Now set up the vcpu context so that we can start this vcpu
	 * in the kernel at tp->t_pc (mp_startup).  Note that the
	 * thread will thread_exit() shortly after performing the
	 * initialization; in particular, we will *never* take a
	 * privilege transition on this thread.
	 */

	bzero(&vgc, sizeof (vgc));

#ifdef __amd64
	vgc.user_regs.rip = tp->t_pc;
	vgc.user_regs.rsp = tp->t_sp;
	vgc.user_regs.rbp = tp->t_sp - 2 * sizeof (greg_t);
#else
	vgc.user_regs.eip = tp->t_pc;
	vgc.user_regs.esp = tp->t_sp;
	vgc.user_regs.ebp = tp->t_sp - 2 * sizeof (greg_t);
#endif
	/*
	 * XXPV	Fix resume, if Russ didn't already fix it.
	 *
	 * Note that resume unconditionally puts t->t_stk + sizeof (regs)
	 * into kernel_sp via HYPERVISOR_stack_switch.  This anticipates
	 * that only lwps take traps that switch to the kernel stack;
	 * part of creating an lwp adjusts the stack by subtracting
	 * sizeof (struct regs) off t_stk.
	 *
	 * The more interesting question is, why do we do all the work
	 * of a fully fledged lwp for a plain thread?  In particular
	 * we don't have to call HYPERVISOR_stack_switch for lwp-less threads
	 * or futz with the LDT.  This should probably all be done with
	 * an lwp context operator to keep pure thread context switch fast.
	 */
	vgc.kernel_sp = (ulong_t)tp->t_stk;

	err = mp_set_cpu_context(&vgc, cp);

done:
	if (err) {
		mach_cpucontext_free(cp, NULL, err);
		return (NULL);
	}
	return (cp);
}

/*
 * By the time we are called either we have successfully started
 * the cpu, or our attempt to start it has failed.
 */

/*ARGSUSED*/
void
mach_cpucontext_free(struct cpu *cp, void *arg, int err)
{
	switch (err) {
	case 0:
		break;
	case ETIMEDOUT:
		/*
		 * The vcpu context is loaded into the hypervisor, and
		 * we've tried to start it, but the vcpu has not been set
		 * running yet, for whatever reason.  We arrange to -not-
		 * free any data structures it may be referencing.  In
		 * particular, we've already told the hypervisor about
		 * the GDT, and so we can't map it read-write again.
		 */
		break;
	default:
		(void) xen_gdt_setprot(cp, PROT_READ | PROT_WRITE);
		kmem_free(cp->cpu_m.mcpu_evt_pend,
		    sizeof (struct xen_evt_data));
		break;
	}
}

/*
 * Reset this CPU's context.  Clear out any pending evtchn data, since event
 * channel numbers will all change when we resume.
 */
void
mach_cpucontext_reset(cpu_t *cp)
{
	bzero(cp->cpu_m.mcpu_evt_pend, sizeof (struct xen_evt_data));
	/* mcpu_intr_pending ? */
}

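/*
 * Copy the register state saved in a thread's PCB (by setjmp()) into the
 * user_regs portion of a hypervisor guest context.
 */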
static void
pcb_to_user_regs(label_t *pcb, vcpu_guest_context_t *vgc)
{
#ifdef __amd64
	vgc->user_regs.rip = pcb->val[REG_LABEL_PC];
	vgc->user_regs.rsp = pcb->val[REG_LABEL_SP];
	vgc->user_regs.rbp = pcb->val[REG_LABEL_BP];
	vgc->user_regs.rbx = pcb->val[REG_LABEL_RBX];
	vgc->user_regs.r12 = pcb->val[REG_LABEL_R12];
	vgc->user_regs.r13 = pcb->val[REG_LABEL_R13];
	vgc->user_regs.r14 = pcb->val[REG_LABEL_R14];
	vgc->user_regs.r15 = pcb->val[REG_LABEL_R15];
#else /* __amd64 */
	vgc->user_regs.eip = pcb->val[REG_LABEL_PC];
	vgc->user_regs.esp = pcb->val[REG_LABEL_SP];
	vgc->user_regs.ebp = pcb->val[REG_LABEL_BP];
	vgc->user_regs.ebx = pcb->val[REG_LABEL_EBX];
	vgc->user_regs.esi = pcb->val[REG_LABEL_ESI];
	vgc->user_regs.edi = pcb->val[REG_LABEL_EDI];
#endif /* __amd64 */
}

/*
 * Restore the context of a CPU during resume.  This context is always
 * inside enter_safe_phase(), below.
 */
void
mach_cpucontext_restore(cpu_t *cp)
{
	vcpu_guest_context_t vgc;
	int err;

	ASSERT(cp->cpu_thread == cp->cpu_pause_thread ||
	    cp->cpu_thread == cp->cpu_idle_thread);

	bzero(&vgc, sizeof (vgc));

	pcb_to_user_regs(&cp->cpu_thread->t_pcb, &vgc);

	/*
	 * We're emulating a longjmp() here: in particular, we need to bump the
	 * stack pointer to account for the pop of xIP that returning from
	 * longjmp() normally would do, and set the return value in xAX to 1.
	 */
#ifdef __amd64
	vgc.user_regs.rax = 1;
	vgc.user_regs.rsp += sizeof (ulong_t);
#else
	vgc.user_regs.eax = 1;
	vgc.user_regs.esp += sizeof (ulong_t);
#endif

	vgc.kernel_sp = cp->cpu_thread->t_sp;

	err = mp_set_cpu_context(&vgc, cp);

	ASSERT(err == 0);
}

/*
 * Reach a point at which the CPU can be safely powered-off or
 * suspended.  Nothing can wake this CPU out of the loop.
 */
static void
enter_safe_phase(void)
{
	ulong_t flags = intr_clear();

	if (setjmp(&curthread->t_pcb) == 0) {
		cpu_phase[CPU->cpu_id] = CPU_PHASE_SAFE;
		while (cpu_phase[CPU->cpu_id] == CPU_PHASE_SAFE)
			SMT_PAUSE();
	}

	ASSERT(!interrupts_enabled());

	intr_restore(flags);
}

/*
 * Offline CPUs run this code even under a pause_cpus(), so we must
 * check if we need to enter the safe phase.
 */
void
mach_cpu_idle(void)
{
	if (IN_XPV_PANIC()) {
		xpv_panic_halt();
	} else {
		(void) HYPERVISOR_block();
		if (cpu_phase[CPU->cpu_id] == CPU_PHASE_WAIT_SAFE)
			enter_safe_phase();
	}
}

/*
 * Spin until either start_cpus() wakes us up, or we get a request to
 * enter the safe phase (followed by a later start_cpus()).
 */
void
mach_cpu_pause(volatile char *safe)
{
	*safe = PAUSE_WAIT;
	membar_enter();

	while (*safe != PAUSE_IDLE) {
		if (cpu_phase[CPU->cpu_id] == CPU_PHASE_WAIT_SAFE)
			enter_safe_phase();
		SMT_PAUSE();
	}
}

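/*
 * Halt this CPU: print any message supplied, then take the VCPU down via
 * the hypervisor.
 */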
void
mach_cpu_halt(char *msg)
{
	if (msg)
		prom_printf("%s\n", msg);
	(void) xen_vcpu_down(CPU->cpu_id);
}

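/*
 * As described in the block comment at the top of this file, powering
 * virtual CPUs on and off is driven only via xenstore, so the p_online()
 * power interfaces are not supported.
 */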
/*ARGSUSED*/
int
mp_cpu_poweron(struct cpu *cp)
{
	return (ENOTSUP);
}

/*ARGSUSED*/
int
mp_cpu_poweroff(struct cpu *cp)
{
	return (ENOTSUP);
}

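/*
 * Bring every other CPU into a safe phase: pause them, then mark each
 * powered-on CPU CPU_PHASE_WAIT_SAFE and poke it (re-poking at most once
 * per POKE_TIMEOUT) until it reports CPU_PHASE_SAFE.  On return, all
 * other CPUs are either powered off or spinning with interrupts disabled
 * in enter_safe_phase().
 */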
void
mp_enter_barrier(void)
{
	hrtime_t last_poke_time = 0;
	int poke_allowed = 0;
	int done = 0;
	int i;

	ASSERT(MUTEX_HELD(&cpu_lock));

	pause_cpus(NULL, NULL);

	while (!done) {
		done = 1;
		poke_allowed = 0;

		if (xpv_gethrtime() - last_poke_time > POKE_TIMEOUT) {
			last_poke_time = xpv_gethrtime();
			poke_allowed = 1;
		}

		for (i = 0; i < NCPU; i++) {
			cpu_t *cp = cpu_get(i);

			if (cp == NULL || cp == CPU)
				continue;

			switch (cpu_phase[i]) {
			case CPU_PHASE_NONE:
				cpu_phase[i] = CPU_PHASE_WAIT_SAFE;
				poke_cpu(i);
				done = 0;
				break;

			case CPU_PHASE_WAIT_SAFE:
				if (poke_allowed)
					poke_cpu(i);
				done = 0;
				break;

			case CPU_PHASE_SAFE:
			case CPU_PHASE_POWERED_OFF:
				break;
			}
		}

		SMT_PAUSE();
	}
}

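/*
 * Undo mp_enter_barrier(): verify that every other CPU is in a phase we
 * expect (safe or powered off), clear its phase, and restart the paused
 * CPUs.
 */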
void
mp_leave_barrier(void)
{
	int i;

	ASSERT(MUTEX_HELD(&cpu_lock));

	for (i = 0; i < NCPU; i++) {
		cpu_t *cp = cpu_get(i);

		if (cp == NULL || cp == CPU)
			continue;

		switch (cpu_phase[i]) {
		/*
		 * If we see a CPU in one of these phases, something has
		 * gone badly wrong with the guarantees
		 * mp_enter_barrier() is supposed to provide.  Rather
		 * than attempt to stumble along (and since we can't
		 * panic properly in this context), we tell the
		 * hypervisor we've crashed.
		 */
		case CPU_PHASE_NONE:
		case CPU_PHASE_WAIT_SAFE:
			(void) HYPERVISOR_shutdown(SHUTDOWN_crash);
			break;

		case CPU_PHASE_POWERED_OFF:
			break;

		case CPU_PHASE_SAFE:
			cpu_phase[i] = CPU_PHASE_NONE;
		}
	}

	start_cpus();
}

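/*
 * Power off one quiesced VCPU: drive all CPUs into the barrier, take the
 * VCPU down in the hypervisor, and update its cpu_t flags and phase to
 * reflect the powered-off state.  Called with cpu_lock held.
 */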
static int
poweroff_vcpu(struct cpu *cp)
{
	int error;

	ASSERT(MUTEX_HELD(&cpu_lock));

	ASSERT(CPU->cpu_id != cp->cpu_id);
	ASSERT(cp->cpu_flags & CPU_QUIESCED);

	mp_enter_barrier();

	if ((error = xen_vcpu_down(cp->cpu_id)) == 0) {
		ASSERT(cpu_phase[cp->cpu_id] == CPU_PHASE_SAFE);

		CPUSET_DEL(cpu_ready_set, cp->cpu_id);

		cp->cpu_flags |= CPU_POWEROFF | CPU_OFFLINE;
		cp->cpu_flags &=
		    ~(CPU_RUNNING | CPU_READY | CPU_EXISTS | CPU_ENABLE);

		cpu_phase[cp->cpu_id] = CPU_PHASE_POWERED_OFF;

		cpu_set_state(cp);
	}

	mp_leave_barrier();

	return (error);
}

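/*
 * Handle an "offline" request from xenstore: take the CPU to P_OFFLINE,
 * then, if it is still quiesced once we re-take cpu_lock, power it off.
 * Retry if another thread un-quiesces the CPU underneath us.
 */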
static int
vcpu_config_poweroff(processorid_t id)
{
	int oldstate;
	int error;
	cpu_t *cp;

	mutex_enter(&cpu_lock);

	if ((cp = cpu_get(id)) == NULL) {
		mutex_exit(&cpu_lock);
		return (ESRCH);
	}

	if (cpu_get_state(cp) == P_POWEROFF) {
		mutex_exit(&cpu_lock);
		return (0);
	}

	mutex_exit(&cpu_lock);

	do {
		error = p_online_internal(id, P_OFFLINE,
		    &oldstate);

		if (error != 0)
			break;

		/*
		 * So we just changed it to P_OFFLINE.  But then we dropped
		 * cpu_lock, so now it is possible for another thread to change
		 * the cpu back to a different, non-quiesced state e.g.
		 * P_ONLINE.
		 */
		mutex_enter(&cpu_lock);
		if ((cp = cpu_get(id)) == NULL)
			error = ESRCH;
		else {
			if (cp->cpu_flags & CPU_QUIESCED)
				error = poweroff_vcpu(cp);
			else
				error = EBUSY;
		}
		mutex_exit(&cpu_lock);
	} while (error == EBUSY);

	return (error);
}

/*
 * Add a new virtual cpu to the domain.
 */
static int
vcpu_config_new(processorid_t id)
{
	extern int start_cpu(processorid_t);
	int error;

	if (ncpus == 1) {
		printf("cannot (yet) add cpus to a single-cpu domain\n");
		return (ENOTSUP);
	}

	affinity_set(CPU_CURRENT);
	error = start_cpu(id);
	affinity_clear();
	return (error);
}

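/*
 * Power a previously powered-off VCPU back on: verify the hypervisor
 * still has it in the down state, bring it up, and mark its cpu_t ready
 * and running again.  Called with cpu_lock held.
 */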
static int
poweron_vcpu(struct cpu *cp)
{
	int error;

	ASSERT(MUTEX_HELD(&cpu_lock));

	if (HYPERVISOR_vcpu_op(VCPUOP_is_up, cp->cpu_id, NULL) != 0) {
		printf("poweron_vcpu: vcpu%d is not available!\n",
		    cp->cpu_id);
		return (ENXIO);
	}

	if ((error = xen_vcpu_up(cp->cpu_id)) == 0) {
		CPUSET_ADD(cpu_ready_set, cp->cpu_id);
		cp->cpu_flags |= CPU_EXISTS | CPU_READY | CPU_RUNNING;
		cp->cpu_flags &= ~CPU_POWEROFF;
		/*
		 * There are some nasty races possible here.
		 * Tell the vcpu it's up one more time.
		 * XXPV	Is this enough?  Is this safe?
		 */
		(void) xen_vcpu_up(cp->cpu_id);

		cpu_phase[cp->cpu_id] = CPU_PHASE_NONE;

		cpu_set_state(cp);
	}
	return (error);
}

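/*
 * Handle an "online" request from xenstore: either create a brand new
 * VCPU (if the id is beyond the current ncpus) or power an existing one
 * back on and bring it to P_ONLINE.
 */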
static int
vcpu_config_poweron(processorid_t id)
{
	cpu_t *cp;
	int oldstate;
	int error;

	if (id >= ncpus)
		return (vcpu_config_new(id));

	mutex_enter(&cpu_lock);

	if ((cp = cpu_get(id)) == NULL) {
		mutex_exit(&cpu_lock);
		return (ESRCH);
	}

	if (cpu_get_state(cp) != P_POWEROFF) {
		mutex_exit(&cpu_lock);
		return (0);
	}

	if ((error = poweron_vcpu(cp)) != 0) {
		mutex_exit(&cpu_lock);
		return (error);
	}

	mutex_exit(&cpu_lock);

	return (p_online_internal(id, P_ONLINE, &oldstate));
}

#define	REPORT_LEN	128

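/*
 * Log the outcome of an externally initiated state change, appending a
 * short explanation of the error code where we can.
 */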
static void
vcpu_config_report(processorid_t id, uint_t newstate, int error)
{
	char *report = kmem_alloc(REPORT_LEN, KM_SLEEP);
	size_t len;
	char *ps;

	switch (newstate) {
	case P_ONLINE:
		ps = PS_ONLINE;
		break;
	case P_POWEROFF:
		ps = PS_POWEROFF;
		break;
	default:
		cmn_err(CE_PANIC, "unknown state %u\n", newstate);
		break;
	}

	len = snprintf(report, REPORT_LEN,
	    "cpu%d: externally initiated %s", id, ps);

	if (!error) {
		cmn_err(CE_CONT, "!%s\n", report);
		kmem_free(report, REPORT_LEN);
		return;
	}

	len += snprintf(report + len, REPORT_LEN - len,
	    " failed, error %d: ", error);
	switch (error) {
	case EEXIST:
		len += snprintf(report + len, REPORT_LEN - len,
		    "cpu already %s", ps ? ps : "?");
		break;
	case ESRCH:
		len += snprintf(report + len, REPORT_LEN - len,
		    "cpu not found");
		break;
	case EINVAL:
	case EALREADY:
		break;
	case EPERM:
		len += snprintf(report + len, REPORT_LEN - len,
		    "insufficient privilege (0x%x)", id);
		break;
	case EBUSY:
		switch (newstate) {
		case P_ONLINE:
			/*
			 * This return comes from mp_cpu_start -
			 * we cannot 'start' the boot CPU.
			 */
			len += snprintf(report + len, REPORT_LEN - len,
			    "already running");
			break;
		case P_POWEROFF:
			len += snprintf(report + len, REPORT_LEN - len,
			    "bound lwps?");
			break;
		default:
			break;
		}
	default:
		break;
	}

	cmn_err(CE_CONT, "%s\n", report);
	kmem_free(report, REPORT_LEN);
}

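/*
 * Taskq callback: read the target "availability" state for a virtual CPU
 * from xenstore and power the CPU on or off to match it.
 */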
static void
vcpu_config(void *arg)
{
	int id = (int)(uintptr_t)arg;
	int error;
	char dir[16];
	char *state;

	if ((uint_t)id >= max_ncpus) {
		cmn_err(CE_WARN,
		    "vcpu_config: cpu%d does not fit in this domain", id);
		return;
	}

	(void) snprintf(dir, sizeof (dir), "cpu/%d", id);
	state = kmem_alloc(MAXPATHLEN, KM_SLEEP);
	if (xenbus_scanf(XBT_NULL, dir, "availability", "%s", state) == 0) {
		if (strcmp(state, "online") == 0) {
			error = vcpu_config_poweron(id);
			vcpu_config_report(id, P_ONLINE, error);
		} else if (strcmp(state, "offline") == 0) {
			error = vcpu_config_poweroff(id);
			vcpu_config_report(id, P_POWEROFF, error);
		} else {
			cmn_err(CE_WARN,
			    "cpu%d: unknown target state '%s'", id, state);
		}
	} else
		cmn_err(CE_WARN,
		    "cpu%d: unable to read target state from xenstore", id);

	kmem_free(state, MAXPATHLEN);
}

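/*
 * xenbus watch callback, fired whenever something changes under the "cpu"
 * node.  Parse the CPU id out of the path and hand the work off to the
 * config taskq.
 */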
/*ARGSUSED*/
static void
vcpu_config_event(struct xenbus_watch *watch, const char **vec, uint_t len)
{
	const char *path = vec[XS_WATCH_PATH];
	processorid_t id;
	char *s;

	if ((s = strstr(path, "cpu/")) != NULL &&
	    sscanf(s, "cpu/%d", &id) == 1) {
		/*
		 * Run the virtual CPU configuration on a separate thread to
		 * avoid blocking on this event for too long (and for now,
		 * to ensure configuration requests are serialized.)
		 */
		(void) taskq_dispatch(cpu_config_tq,
		    vcpu_config, (void *)(uintptr_t)id, 0);
	}
}

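/*
 * Hand a freshly constructed guest context to the hypervisor via
 * VCPUOP_initialise, translating the more cryptic failures into a
 * human-readable diagnostic.
 */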
static int
xen_vcpu_initialize(processorid_t id, vcpu_guest_context_t *vgc)
{
	int err;

	if ((err = HYPERVISOR_vcpu_op(VCPUOP_initialise, id, vgc)) != 0) {
		char *str;
		int level = CE_WARN;

		switch (err) {
		case -X_EINVAL:
			/*
			 * This interface squashes multiple error sources
			 * to one error code.  In particular, an X_EINVAL
			 * code can mean:
			 *
			 * -	the vcpu id is out of range
			 * -	cs or ss are in ring 0
			 * -	cr3 is wrong
			 * -	an entry in the new gdt is above the
			 *	reserved entry
			 * -	a frame underneath the new gdt is bad
			 */
			str = "something is wrong :(";
			break;
		case -X_ENOENT:
			str = "no such cpu";
			break;
		case -X_ENOMEM:
			str = "no mem to copy ctxt";
			break;
		case -X_EFAULT:
			str = "bad address";
			break;
		case -X_EEXIST:
			/*
			 * Hmm.  This error is returned if the vcpu has already
			 * been initialized once before in the lifetime of this
			 * domain.  This is a logic error in the kernel.
			 */
			level = CE_PANIC;
			str = "already initialized";
			break;
		default:
			level = CE_PANIC;
			str = "<unexpected>";
			break;
		}

		cmn_err(level, "vcpu%d: failed to init: error %d: %s",
		    id, -err, str);
	}
	return (err);
}

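/*
 * Ask the hypervisor to start running an initialized VCPU.  Returns 0 on
 * success; on failure we log a diagnostic and return a distinctive errno.
 */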
long
xen_vcpu_up(processorid_t id)
{
	long err;

	if ((err = HYPERVISOR_vcpu_op(VCPUOP_up, id, NULL)) != 0) {
		char *str;

		switch (err) {
		case -X_ENOENT:
			str = "no such cpu";
			break;
		case -X_EINVAL:
			/*
			 * Perhaps this is diagnostic overkill.
			 */
			if (HYPERVISOR_vcpu_op(VCPUOP_is_up, id, NULL) < 0)
				str = "bad cpuid";
			else
				str = "not initialized";
			break;
		default:
			str = "<unexpected>";
			break;
		}

		printf("vcpu%d: failed to start: error %d: %s\n",
		    id, -(int)err, str);
		return (EBFONT);	/* deliberately silly */
	}
	return (err);
}

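/*
 * Ask the hypervisor to stop running a VCPU.  Any failure here indicates
 * a serious inconsistency, so we panic rather than try to continue.
 */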
long
xen_vcpu_down(processorid_t id)
{
	long err;

	if ((err = HYPERVISOR_vcpu_op(VCPUOP_down, id, NULL)) != 0) {
		/*
		 * X_ENOENT:	no such cpu
		 * X_EINVAL:	bad cpuid
		 */
		panic("vcpu%d: failed to stop: error %d", id, -(int)err);
	}

	return (err);
}