xref: /illumos-gate/usr/src/uts/i86xpv/os/mp_xen.c (revision 7b34a9a5df26271af0da06974fc361c468cd48d3)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 /*
28  * Copyright 2019 Joyent, Inc.
29  */
30 
31 /*
32  * Virtual CPU management.
33  *
34  * VCPUs can be controlled in one of two ways; through the domain itself
35  * (psradm, p_online(), etc.), and via changes in xenstore (vcpu_config()).
36  * Unfortunately, the terminology is used in different ways; they work out as
37  * follows:
38  *
39  * P_ONLINE: the VCPU is up and running, taking interrupts and running threads
40  *
41  * P_OFFLINE: the VCPU is up and running, but quiesced (i.e. blocked in the
42  * hypervisor on the idle thread).  It must be up since a downed VCPU cannot
43  * receive interrupts, and we require this for offline CPUs in Solaris.
44  *
45  * P_POWEROFF: the VCPU is down (we never called xen_vcpu_up(), or called
46  * xen_vcpu_down() for it).  It can't take interrupts or run anything, though
47  * if it has run previously, its software state (cpu_t, machcpu structures, IPI
48  * event channels, etc.) will still exist.
49  *
50  * The hypervisor has two notions of CPU states as represented in the store:
51  *
52  * "offline": the VCPU is down.  Corresponds to P_POWEROFF.
53  *
54  * "online": the VCPU is running.  Corresponds to a CPU state other than
55  * P_POWEROFF.
56  *
57  * Currently, only a notification via xenstore can bring a CPU into a
58  * P_POWEROFF state, and only the domain can change between P_ONLINE, P_NOINTR,
59  * P_OFFLINE, etc.  We need to be careful to treat xenstore notifications
60  * idempotently, as we'll get 'duplicate' entries when we resume a domain.
61  *
62  * Note that the xenstore configuration is strictly advisory, in that a domain
63  * can choose to ignore it and still power up a VCPU in the offline state. To
64  * play nice, we don't allow it. Thus, any attempt to power on/off a CPU is
65  * ENOTSUP from within Solaris.
66  *
67  * Powering off a VCPU and suspending the domain use similar code. The
68  * difficulty here is that we must ensure that each VCPU is in a stable
69  * state: it must have a saved PCB, and not be responding to interrupts
70  * (since we are just about to remove its ability to run on a real CPU,
71  * possibly forever).  However, an offline CPU in Solaris can take
72  * cross-call interrupts, as mentioned, so we must go through a
73  * two-stage process.  First, we use the standard Solaris pause_cpus().
74  * This ensures that all CPUs are either in mach_cpu_pause() or
75  * mach_cpu_idle(), and nothing will cross-call them.
76  *
77  * Powered-off CPUs are already safe: we own the cpu_lock needed to
78  * bring them back up, and they are in state CPU_PHASE_POWERED_OFF.
79  *
80  * Running CPUs are spinning in mach_cpu_pause() waiting for either
81  * PAUSE_IDLE or CPU_PHASE_WAIT_SAFE.
82  *
83  * Offline CPUs are either running the idle thread and periodically
84  * checking for CPU_PHASE_WAIT_SAFE, or blocked in the hypervisor.
85  *
86  * Thus, we set CPU_PHASE_WAIT_SAFE for every powered-on CPU, as well as
87  * poking them to make sure they're not blocked[1]. When every CPU has
88  * responded by reaching a safe state and setting CPU_PHASE_SAFE, we
89  * know we can suspend, or power-off a CPU, without problems.
90  *
91  * [1] note that we have to repeatedly poke offline CPUs: it's the only
92  * way to ensure that the CPU doesn't miss the state change before
93  * dropping into HYPERVISOR_block().
94  */
95 
96 #include <sys/types.h>
97 #include <sys/systm.h>
98 #include <sys/param.h>
99 #include <sys/taskq.h>
100 #include <sys/cmn_err.h>
101 #include <sys/archsystm.h>
102 #include <sys/machsystm.h>
103 #include <sys/segments.h>
104 #include <sys/cpuvar.h>
105 #include <sys/x86_archext.h>
106 #include <sys/controlregs.h>
107 #include <sys/hypervisor.h>
108 #include <sys/xpv_panic.h>
109 #include <sys/mman.h>
110 #include <sys/psw.h>
111 #include <sys/cpu.h>
112 #include <sys/sunddi.h>
113 #include <util/sscanf.h>
114 #include <vm/hat_i86.h>
115 #include <vm/hat.h>
116 #include <vm/as.h>
117 
118 #include <xen/public/io/xs_wire.h>
119 #include <xen/sys/xenbus_impl.h>
120 #include <xen/public/vcpu.h>
121 
122 extern cpuset_t cpu_ready_set;
123 
124 #define	CPU_PHASE_NONE 0
125 #define	CPU_PHASE_WAIT_SAFE 1
126 #define	CPU_PHASE_SAFE 2
127 #define	CPU_PHASE_POWERED_OFF 3
128 
129 /*
130  * We can only poke CPUs during barrier enter 256 times a second at
131  * most.
132  */
133 #define	POKE_TIMEOUT (NANOSEC / 256)
134 
135 static taskq_t *cpu_config_tq;
136 static int cpu_phase[NCPU];
137 
138 static void vcpu_config_event(struct xenbus_watch *, const char **, uint_t);
139 static int xen_vcpu_initialize(processorid_t, vcpu_guest_context_t *);
140 
141 /*
142  * Return whether or not the vcpu is actually running on a pcpu
143  */
144 int
145 vcpu_on_pcpu(processorid_t cpu)
146 {
147 	struct vcpu_runstate_info runstate;
148 	int	ret = VCPU_STATE_UNKNOWN;
149 
150 	ASSERT(cpu < NCPU);
151 	/*
152 	 * Don't bother with hypercall if we are asking about ourself
153 	 */
154 	if (cpu == CPU->cpu_id)
155 		return (VCPU_ON_PCPU);
156 	if (HYPERVISOR_vcpu_op(VCPUOP_get_runstate_info, cpu, &runstate) != 0)
157 		goto out;
158 
159 	switch (runstate.state) {
160 	case RUNSTATE_running:
161 		ret = VCPU_ON_PCPU;
162 		break;
163 
164 	case RUNSTATE_runnable:
165 	case RUNSTATE_offline:
166 	case RUNSTATE_blocked:
167 		ret = VCPU_NOT_ON_PCPU;
168 		break;
169 
170 	default:
171 		break;
172 	}
173 
174 out:
175 	return (ret);
176 }
177 
/*
 * Hook for allocating any global state needed while starting cpus.
 * Virtual cpus need no such state, so this simply reports success (0).
 */
int
mach_cpucontext_init(void)
{
	return (0);
}
187 
188 void
189 do_cpu_config_watch(int state)
190 {
191 	static struct xenbus_watch cpu_config_watch;
192 
193 	if (state != XENSTORE_UP)
194 		return;
195 	cpu_config_watch.node = "cpu";
196 	cpu_config_watch.callback = vcpu_config_event;
197 	if (register_xenbus_watch(&cpu_config_watch)) {
198 		taskq_destroy(cpu_config_tq);
199 		cmn_err(CE_WARN, "do_cpu_config_watch: "
200 		    "failed to set vcpu config watch");
201 	}
202 
203 }
204 
205 /*
206  * This routine is called after all the "normal" MP startup has
207  * been done; a good place to start watching xen store for virtual
208  * cpu hot plug events.
209  */
210 void
211 mach_cpucontext_fini(void)
212 {
213 
214 	cpu_config_tq = taskq_create("vcpu config taskq", 1,
215 	    maxclsyspri - 1, 1, 1, TASKQ_PREPOPULATE);
216 
217 	(void) xs_register_xenbus_callback(do_cpu_config_watch);
218 }
219 
220 /*
221  * Fill in the remaining CPU context and initialize it.
222  */
223 static int
224 mp_set_cpu_context(vcpu_guest_context_t *vgc, cpu_t *cp)
225 {
226 	uint_t vec, iopl;
227 
228 	vgc->flags = VGCF_IN_KERNEL;
229 
230 	/*
231 	 * fpu_ctx we leave as zero; on first fault we'll store
232 	 * sse_initial into it anyway.
233 	 */
234 
235 #if defined(__amd64)
236 	vgc->user_regs.cs = KCS_SEL | SEL_KPL;	/* force to ring 3 */
237 #else
238 	vgc->user_regs.cs = KCS_SEL;
239 #endif
240 	vgc->user_regs.ds = KDS_SEL;
241 	vgc->user_regs.es = KDS_SEL;
242 	vgc->user_regs.ss = KDS_SEL;
243 	vgc->kernel_ss = KDS_SEL;
244 
245 	/*
246 	 * Allow I/O privilege level for Dom0 kernel.
247 	 */
248 	if (DOMAIN_IS_INITDOMAIN(xen_info))
249 		iopl = (PS_IOPL & 0x1000); /* ring 1 */
250 	else
251 		iopl = 0;
252 
253 #if defined(__amd64)
254 	vgc->user_regs.fs = 0;
255 	vgc->user_regs.gs = 0;
256 	vgc->user_regs.rflags = F_OFF | iopl;
257 #elif defined(__i386)
258 	vgc->user_regs.fs = KFS_SEL;
259 	vgc->user_regs.gs = KGS_SEL;
260 	vgc->user_regs.eflags = F_OFF | iopl;
261 	vgc->event_callback_cs = vgc->user_regs.cs;
262 	vgc->failsafe_callback_cs = vgc->user_regs.cs;
263 #endif
264 
265 	/*
266 	 * Initialize the trap_info_t from the IDT
267 	 */
268 #if !defined(__lint)
269 	ASSERT(NIDT == sizeof (vgc->trap_ctxt) / sizeof (vgc->trap_ctxt[0]));
270 #endif
271 	for (vec = 0; vec < NIDT; vec++) {
272 		trap_info_t *ti = &vgc->trap_ctxt[vec];
273 
274 		if (xen_idt_to_trap_info(vec,
275 		    &cp->cpu_m.mcpu_idt[vec], ti) == 0) {
276 			ti->cs = KCS_SEL;
277 			ti->vector = vec;
278 		}
279 	}
280 
281 	/*
282 	 * No LDT
283 	 */
284 
285 	/*
286 	 * (We assert in various places that the GDT is (a) aligned on a
287 	 * page boundary and (b) one page long, so this really should fit..)
288 	 */
289 #ifdef CRASH_XEN
290 	vgc->gdt_frames[0] = pa_to_ma(mmu_btop(cp->cpu_m.mcpu_gdtpa));
291 #else
292 	vgc->gdt_frames[0] = pfn_to_mfn(mmu_btop(cp->cpu_m.mcpu_gdtpa));
293 #endif
294 	vgc->gdt_ents = NGDT;
295 
296 	vgc->ctrlreg[0] = CR0_ENABLE_FPU_FLAGS(getcr0());
297 
298 #if defined(__i386)
299 	if (mmu.pae_hat)
300 		vgc->ctrlreg[3] =
301 		    xen_pfn_to_cr3(pfn_to_mfn(kas.a_hat->hat_htable->ht_pfn));
302 	else
303 #endif
304 		vgc->ctrlreg[3] =
305 		    pa_to_ma(mmu_ptob(kas.a_hat->hat_htable->ht_pfn));
306 
307 	vgc->ctrlreg[4] = getcr4();
308 
309 	vgc->event_callback_eip = (uintptr_t)xen_callback;
310 	vgc->failsafe_callback_eip = (uintptr_t)xen_failsafe_callback;
311 	vgc->flags |= VGCF_failsafe_disables_events;
312 
313 #if defined(__amd64)
314 	/*
315 	 * XXPV should this be moved to init_cpu_syscall?
316 	 */
317 	vgc->syscall_callback_eip = (uintptr_t)sys_syscall;
318 	vgc->flags |= VGCF_syscall_disables_events;
319 
320 	ASSERT(vgc->user_regs.gs == 0);
321 	vgc->gs_base_kernel = (uintptr_t)cp;
322 #endif
323 
324 	return (xen_vcpu_initialize(cp->cpu_id, vgc));
325 }
326 
327 /*
328  * Create a guest virtual cpu context so that the virtual cpu
329  * springs into life in the domain just about to call mp_startup()
330  *
331  * Virtual CPUs must be initialized once in the lifetime of the domain;
332  * after that subsequent attempts to start them will fail with X_EEXIST.
333  *
334  * Thus 'alloc' -really- creates and initializes the virtual
335  * CPU context just once. Once the initialisation succeeds, we never
336  * free it, nor the regular cpu_t to which it refers.
337  */
338 void *
339 mach_cpucontext_alloc(struct cpu *cp)
340 {
341 	kthread_t *tp = cp->cpu_thread;
342 	vcpu_guest_context_t vgc;
343 
344 	int err = 1;
345 
346 	/*
347 	 * First, augment the incoming cpu structure
348 	 * - vcpu pointer reference
349 	 * - pending event storage area
350 	 * - physical address of GDT
351 	 */
352 	cp->cpu_m.mcpu_vcpu_info =
353 	    &HYPERVISOR_shared_info->vcpu_info[cp->cpu_id];
354 	cp->cpu_m.mcpu_evt_pend = kmem_zalloc(
355 	    sizeof (struct xen_evt_data), KM_SLEEP);
356 	cp->cpu_m.mcpu_gdtpa =
357 	    mmu_ptob(hat_getpfnum(kas.a_hat, (caddr_t)cp->cpu_gdt));
358 
359 	if ((err = xen_gdt_setprot(cp, PROT_READ)) != 0)
360 		goto done;
361 
362 	/*
363 	 * Now set up the vcpu context so that we can start this vcpu
364 	 * in the kernel at tp->t_pc (mp_startup).  Note that the
365 	 * thread will thread_exit() shortly after performing the
366 	 * initialization; in particular, we will *never* take a
367 	 * privilege transition on this thread.
368 	 */
369 
370 	bzero(&vgc, sizeof (vgc));
371 
372 #ifdef __amd64
373 	vgc.user_regs.rip = tp->t_pc;
374 	vgc.user_regs.rsp = tp->t_sp;
375 	vgc.user_regs.rbp = tp->t_sp - 2 * sizeof (greg_t);
376 #else
377 	vgc.user_regs.eip = tp->t_pc;
378 	vgc.user_regs.esp = tp->t_sp;
379 	vgc.user_regs.ebp = tp->t_sp - 2 * sizeof (greg_t);
380 #endif
381 	/*
382 	 * XXPV	Fix resume, if Russ didn't already fix it.
383 	 *
384 	 * Note that resume unconditionally puts t->t_stk + sizeof (regs)
385 	 * into kernel_sp via HYPERVISOR_stack_switch. This anticipates
386 	 * that only lwps take traps that switch to the kernel stack;
387 	 * part of creating an lwp adjusts the stack by subtracting
388 	 * sizeof (struct regs) off t_stk.
389 	 *
390 	 * The more interesting question is, why do we do all the work
391 	 * of a fully fledged lwp for a plain thread?  In particular
392 	 * we don't have to call HYPERVISOR_stack_switch for lwp-less threads
393 	 * or futz with the LDT.  This should probably all be done with
394 	 * an lwp context operator to keep pure thread context switch fast.
395 	 */
396 	vgc.kernel_sp = (ulong_t)tp->t_stk;
397 
398 	err = mp_set_cpu_context(&vgc, cp);
399 
400 done:
401 	if (err) {
402 		mach_cpucontext_free(cp, NULL, err);
403 		return (NULL);
404 	}
405 	return (cp);
406 }
407 
408 /*
409  * By the time we are called either we have successfully started
410  * the cpu, or our attempt to start it has failed.
411  */
412 
413 /*ARGSUSED*/
414 void
415 mach_cpucontext_free(struct cpu *cp, void *arg, int err)
416 {
417 	switch (err) {
418 	case 0:
419 		break;
420 	case ETIMEDOUT:
421 		/*
422 		 * The vcpu context is loaded into the hypervisor, and
423 		 * we've tried to start it, but the vcpu has not been set
424 		 * running yet, for whatever reason.  We arrange to -not-
425 		 * free any data structures it may be referencing.  In
426 		 * particular, we've already told the hypervisor about
427 		 * the GDT, and so we can't map it read-write again.
428 		 */
429 		break;
430 	default:
431 		(void) xen_gdt_setprot(cp, PROT_READ | PROT_WRITE);
432 		kmem_free(cp->cpu_m.mcpu_evt_pend,
433 		    sizeof (struct xen_evt_data));
434 		break;
435 	}
436 }
437 
438 /*
439  * Reset this CPU's context.  Clear out any pending evtchn data, since event
440  * channel numbers will all change when we resume.
441  */
442 void
443 mach_cpucontext_reset(cpu_t *cp)
444 {
445 	bzero(cp->cpu_m.mcpu_evt_pend, sizeof (struct xen_evt_data));
446 	/* mcpu_intr_pending ? */
447 }
448 
449 static void
450 pcb_to_user_regs(label_t *pcb, vcpu_guest_context_t *vgc)
451 {
452 #ifdef __amd64
453 	vgc->user_regs.rip = pcb->val[REG_LABEL_PC];
454 	vgc->user_regs.rsp = pcb->val[REG_LABEL_SP];
455 	vgc->user_regs.rbp = pcb->val[REG_LABEL_BP];
456 	vgc->user_regs.rbx = pcb->val[REG_LABEL_RBX];
457 	vgc->user_regs.r12 = pcb->val[REG_LABEL_R12];
458 	vgc->user_regs.r13 = pcb->val[REG_LABEL_R13];
459 	vgc->user_regs.r14 = pcb->val[REG_LABEL_R14];
460 	vgc->user_regs.r15 = pcb->val[REG_LABEL_R15];
461 #else /* __amd64 */
462 	vgc->user_regs.eip = pcb->val[REG_LABEL_PC];
463 	vgc->user_regs.esp = pcb->val[REG_LABEL_SP];
464 	vgc->user_regs.ebp = pcb->val[REG_LABEL_BP];
465 	vgc->user_regs.ebx = pcb->val[REG_LABEL_EBX];
466 	vgc->user_regs.esi = pcb->val[REG_LABEL_ESI];
467 	vgc->user_regs.edi = pcb->val[REG_LABEL_EDI];
468 #endif /* __amd64 */
469 }
470 
471 /*
472  * Restore the context of a CPU during resume.  This context is always
473  * inside enter_safe_phase(), below.
474  */
475 void
476 mach_cpucontext_restore(cpu_t *cp)
477 {
478 	vcpu_guest_context_t vgc;
479 	int err;
480 
481 	ASSERT(cp->cpu_thread == cp->cpu_pause_thread ||
482 	    cp->cpu_thread == cp->cpu_idle_thread);
483 
484 	bzero(&vgc, sizeof (vgc));
485 
486 	pcb_to_user_regs(&cp->cpu_thread->t_pcb, &vgc);
487 
488 	/*
489 	 * We're emulating a longjmp() here: in particular, we need to bump the
490 	 * stack pointer to account for the pop of xIP that returning from
491 	 * longjmp() normally would do, and set the return value in xAX to 1.
492 	 */
493 #ifdef __amd64
494 	vgc.user_regs.rax = 1;
495 	vgc.user_regs.rsp += sizeof (ulong_t);
496 #else
497 	vgc.user_regs.eax = 1;
498 	vgc.user_regs.esp += sizeof (ulong_t);
499 #endif
500 
501 	vgc.kernel_sp = cp->cpu_thread->t_sp;
502 
503 	err = mp_set_cpu_context(&vgc, cp);
504 
505 	ASSERT(err == 0);
506 }
507 
508 /*
509  * Reach a point at which the CPU can be safely powered-off or
510  * suspended.  Nothing can wake this CPU out of the loop.
511  */
512 static void
513 enter_safe_phase(void)
514 {
515 	ulong_t flags = intr_clear();
516 
517 	if (setjmp(&curthread->t_pcb) == 0) {
518 		cpu_phase[CPU->cpu_id] = CPU_PHASE_SAFE;
519 		while (cpu_phase[CPU->cpu_id] == CPU_PHASE_SAFE)
520 			SMT_PAUSE();
521 	}
522 
523 	ASSERT(!interrupts_enabled());
524 
525 	intr_restore(flags);
526 }
527 
528 /*
529  * Offline CPUs run this code even under a pause_cpus(), so we must
530  * check if we need to enter the safe phase.
531  */
532 void
533 mach_cpu_idle(void)
534 {
535 	if (IN_XPV_PANIC()) {
536 		xpv_panic_halt();
537 	} else  {
538 		(void) HYPERVISOR_block();
539 		if (cpu_phase[CPU->cpu_id] == CPU_PHASE_WAIT_SAFE)
540 			enter_safe_phase();
541 	}
542 }
543 
544 /*
545  * Spin until either start_cpus() wakes us up, or we get a request to
546  * enter the safe phase (followed by a later start_cpus()).
547  */
548 void
549 mach_cpu_pause(volatile char *safe)
550 {
551 	*safe = PAUSE_WAIT;
552 	membar_enter();
553 
554 	while (*safe != PAUSE_IDLE) {
555 		if (cpu_phase[CPU->cpu_id] == CPU_PHASE_WAIT_SAFE)
556 			enter_safe_phase();
557 		SMT_PAUSE();
558 	}
559 }
560 
561 int
562 mach_cpu_halt(xc_arg_t arg1, xc_arg_t arg2 __unused, xc_arg_t arg3 __unused)
563 {
564 	char *msg = (char *)arg1;
565 
566 	if (msg)
567 		prom_printf("%s\n", msg);
568 	(void) xen_vcpu_down(CPU->cpu_id);
569 	return (0);
570 }
571 
/*
 * Powering a VCPU on from within the domain is not supported: power state
 * is only changed in response to xenstore.  Always returns ENOTSUP.
 */
/*ARGSUSED*/
int
mp_cpu_poweron(struct cpu *cp)
{
	return (ENOTSUP);
}
578 
/*
 * Powering a VCPU off from within the domain is not supported: power state
 * is only changed in response to xenstore.  Always returns ENOTSUP.
 */
/*ARGSUSED*/
int
mp_cpu_poweroff(struct cpu *cp)
{
	return (ENOTSUP);
}
585 
586 void
587 mp_enter_barrier(void)
588 {
589 	hrtime_t last_poke_time = 0;
590 	int poke_allowed = 0;
591 	int done = 0;
592 	int i;
593 
594 	ASSERT(MUTEX_HELD(&cpu_lock));
595 
596 	pause_cpus(NULL, NULL);
597 
598 	while (!done) {
599 		done = 1;
600 		poke_allowed = 0;
601 
602 		if (xpv_gethrtime() - last_poke_time > POKE_TIMEOUT) {
603 			last_poke_time = xpv_gethrtime();
604 			poke_allowed = 1;
605 		}
606 
607 		for (i = 0; i < NCPU; i++) {
608 			cpu_t *cp = cpu_get(i);
609 
610 			if (cp == NULL || cp == CPU)
611 				continue;
612 
613 			switch (cpu_phase[i]) {
614 			case CPU_PHASE_NONE:
615 				cpu_phase[i] = CPU_PHASE_WAIT_SAFE;
616 				poke_cpu(i);
617 				done = 0;
618 				break;
619 
620 			case CPU_PHASE_WAIT_SAFE:
621 				if (poke_allowed)
622 					poke_cpu(i);
623 				done = 0;
624 				break;
625 
626 			case CPU_PHASE_SAFE:
627 			case CPU_PHASE_POWERED_OFF:
628 				break;
629 			}
630 		}
631 
632 		SMT_PAUSE();
633 	}
634 }
635 
636 void
637 mp_leave_barrier(void)
638 {
639 	int i;
640 
641 	ASSERT(MUTEX_HELD(&cpu_lock));
642 
643 	for (i = 0; i < NCPU; i++) {
644 		cpu_t *cp = cpu_get(i);
645 
646 		if (cp == NULL || cp == CPU)
647 			continue;
648 
649 		switch (cpu_phase[i]) {
650 		/*
651 		 * If we see a CPU in one of these phases, something has
652 		 * gone badly wrong with the guarantees
653 		 * mp_enter_barrier() is supposed to provide.  Rather
654 		 * than attempt to stumble along (and since we can't
655 		 * panic properly in this context), we tell the
656 		 * hypervisor we've crashed.
657 		 */
658 		case CPU_PHASE_NONE:
659 		case CPU_PHASE_WAIT_SAFE:
660 			(void) HYPERVISOR_shutdown(SHUTDOWN_crash);
661 			break;
662 
663 		case CPU_PHASE_POWERED_OFF:
664 			break;
665 
666 		case CPU_PHASE_SAFE:
667 			cpu_phase[i] = CPU_PHASE_NONE;
668 		}
669 	}
670 
671 	start_cpus();
672 }
673 
674 static int
675 poweroff_vcpu(struct cpu *cp)
676 {
677 	int error;
678 
679 	ASSERT(MUTEX_HELD(&cpu_lock));
680 
681 	ASSERT(CPU->cpu_id != cp->cpu_id);
682 	ASSERT(cp->cpu_flags & CPU_QUIESCED);
683 
684 	mp_enter_barrier();
685 
686 	if ((error = xen_vcpu_down(cp->cpu_id)) == 0) {
687 		ASSERT(cpu_phase[cp->cpu_id] == CPU_PHASE_SAFE);
688 
689 		CPUSET_DEL(cpu_ready_set, cp->cpu_id);
690 
691 		if (cp->cpu_flags & CPU_ENABLE)
692 			ncpus_intr_enabled--;
693 
694 		cp->cpu_flags |= CPU_POWEROFF | CPU_OFFLINE;
695 		cp->cpu_flags &=
696 		    ~(CPU_RUNNING | CPU_READY | CPU_EXISTS | CPU_ENABLE);
697 
698 		cpu_phase[cp->cpu_id] = CPU_PHASE_POWERED_OFF;
699 
700 		cpu_set_state(cp);
701 	}
702 
703 	mp_leave_barrier();
704 
705 	return (error);
706 }
707 
708 static int
709 vcpu_config_poweroff(processorid_t id)
710 {
711 	int oldstate;
712 	int error;
713 	cpu_t *cp;
714 
715 	mutex_enter(&cpu_lock);
716 
717 	if ((cp = cpu_get(id)) == NULL) {
718 		mutex_exit(&cpu_lock);
719 		return (ESRCH);
720 	}
721 
722 	if (cpu_get_state(cp) == P_POWEROFF) {
723 		mutex_exit(&cpu_lock);
724 		return (0);
725 	}
726 
727 	mutex_exit(&cpu_lock);
728 
729 	do {
730 		error = p_online_internal(id, P_OFFLINE,
731 		    &oldstate);
732 
733 		if (error != 0)
734 			break;
735 
736 		/*
737 		 * So we just changed it to P_OFFLINE.  But then we dropped
738 		 * cpu_lock, so now it is possible for another thread to change
739 		 * the cpu back to a different, non-quiesced state e.g.
740 		 * P_ONLINE.
741 		 */
742 		mutex_enter(&cpu_lock);
743 		if ((cp = cpu_get(id)) == NULL)
744 			error = ESRCH;
745 		else {
746 			if (cp->cpu_flags & CPU_QUIESCED)
747 				error = poweroff_vcpu(cp);
748 			else
749 				error = EBUSY;
750 		}
751 		mutex_exit(&cpu_lock);
752 	} while (error == EBUSY);
753 
754 	return (error);
755 }
756 
757 /*
758  * Add a new virtual cpu to the domain.
759  */
760 static int
761 vcpu_config_new(processorid_t id)
762 {
763 	extern int start_cpu(processorid_t);
764 	int error;
765 
766 	if (ncpus == 1) {
767 		printf("cannot (yet) add cpus to a single-cpu domain\n");
768 		return (ENOTSUP);
769 	}
770 
771 	affinity_set(CPU_CURRENT);
772 	error = start_cpu(id);
773 	affinity_clear();
774 	return (error);
775 }
776 
777 static int
778 poweron_vcpu(struct cpu *cp)
779 {
780 	int error;
781 
782 	ASSERT(MUTEX_HELD(&cpu_lock));
783 
784 	if (HYPERVISOR_vcpu_op(VCPUOP_is_up, cp->cpu_id, NULL) != 0) {
785 		printf("poweron_vcpu: vcpu%d is not available!\n",
786 		    cp->cpu_id);
787 		return (ENXIO);
788 	}
789 
790 	if ((error = xen_vcpu_up(cp->cpu_id)) == 0) {
791 		CPUSET_ADD(cpu_ready_set, cp->cpu_id);
792 		cp->cpu_flags |= CPU_EXISTS | CPU_READY | CPU_RUNNING;
793 		cp->cpu_flags &= ~CPU_POWEROFF;
794 		/*
795 		 * There are some nasty races possible here.
796 		 * Tell the vcpu it's up one more time.
797 		 * XXPV	Is this enough?  Is this safe?
798 		 */
799 		(void) xen_vcpu_up(cp->cpu_id);
800 
801 		cpu_phase[cp->cpu_id] = CPU_PHASE_NONE;
802 
803 		cpu_set_state(cp);
804 	}
805 	return (error);
806 }
807 
808 static int
809 vcpu_config_poweron(processorid_t id)
810 {
811 	cpu_t *cp;
812 	int oldstate;
813 	int error;
814 
815 	if (id >= ncpus)
816 		return (vcpu_config_new(id));
817 
818 	mutex_enter(&cpu_lock);
819 
820 	if ((cp = cpu_get(id)) == NULL) {
821 		mutex_exit(&cpu_lock);
822 		return (ESRCH);
823 	}
824 
825 	if (cpu_get_state(cp) != P_POWEROFF) {
826 		mutex_exit(&cpu_lock);
827 		return (0);
828 	}
829 
830 	if ((error = poweron_vcpu(cp)) != 0) {
831 		mutex_exit(&cpu_lock);
832 		return (error);
833 	}
834 
835 	mutex_exit(&cpu_lock);
836 
837 	return (p_online_internal(id, P_ONLINE, &oldstate));
838 }
839 
840 #define	REPORT_LEN	128
841 
842 static void
843 vcpu_config_report(processorid_t id, uint_t newstate, int error)
844 {
845 	char *report = kmem_alloc(REPORT_LEN, KM_SLEEP);
846 	size_t len;
847 	char *ps;
848 
849 	ps = NULL;
850 	switch (newstate) {
851 	case P_ONLINE:
852 		ps = PS_ONLINE;
853 		break;
854 	case P_POWEROFF:
855 		ps = PS_POWEROFF;
856 		break;
857 	default:
858 		cmn_err(CE_PANIC, "unknown state %u\n", newstate);
859 		break;
860 	}
861 
862 	len = snprintf(report, REPORT_LEN,
863 	    "cpu%d: externally initiated %s", id, ps);
864 
865 	if (!error) {
866 		cmn_err(CE_CONT, "!%s\n", report);
867 		kmem_free(report, REPORT_LEN);
868 		return;
869 	}
870 
871 	len += snprintf(report + len, REPORT_LEN - len,
872 	    " failed, error %d: ", error);
873 	switch (error) {
874 	case EEXIST:
875 		len += snprintf(report + len, REPORT_LEN - len,
876 		    "cpu already %s", ps ? ps : "?");
877 		break;
878 	case ESRCH:
879 		len += snprintf(report + len, REPORT_LEN - len,
880 		    "cpu not found");
881 		break;
882 	case EINVAL:
883 	case EALREADY:
884 		break;
885 	case EPERM:
886 		len += snprintf(report + len, REPORT_LEN - len,
887 		    "insufficient privilege (0x%x)", id);
888 		break;
889 	case EBUSY:
890 		switch (newstate) {
891 		case P_ONLINE:
892 			/*
893 			 * This return comes from mp_cpu_start -
894 			 * we cannot 'start' the boot CPU.
895 			 */
896 			len += snprintf(report + len, REPORT_LEN - len,
897 			    "already running");
898 			break;
899 		case P_POWEROFF:
900 			len += snprintf(report + len, REPORT_LEN - len,
901 			    "bound lwps?");
902 			break;
903 		default:
904 			break;
905 		}
906 	default:
907 		break;
908 	}
909 
910 	cmn_err(CE_CONT, "%s\n", report);
911 	kmem_free(report, REPORT_LEN);
912 }
913 
914 static void
915 vcpu_config(void *arg)
916 {
917 	int id = (int)(uintptr_t)arg;
918 	int error;
919 	char dir[16];
920 	char *state;
921 
922 	if ((uint_t)id >= max_ncpus) {
923 		cmn_err(CE_WARN,
924 		    "vcpu_config: cpu%d does not fit in this domain", id);
925 		return;
926 	}
927 
928 	(void) snprintf(dir, sizeof (dir), "cpu/%d", id);
929 	state = kmem_alloc(MAXPATHLEN, KM_SLEEP);
930 	if (xenbus_scanf(XBT_NULL, dir, "availability", "%s", state) == 0) {
931 		if (strcmp(state, "online") == 0) {
932 			error = vcpu_config_poweron(id);
933 			vcpu_config_report(id, P_ONLINE, error);
934 		} else if (strcmp(state, "offline") == 0) {
935 			error = vcpu_config_poweroff(id);
936 			vcpu_config_report(id, P_POWEROFF, error);
937 		} else {
938 			cmn_err(CE_WARN,
939 			    "cpu%d: unknown target state '%s'", id, state);
940 		}
941 	} else
942 		cmn_err(CE_WARN,
943 		    "cpu%d: unable to read target state from xenstore", id);
944 
945 	kmem_free(state, MAXPATHLEN);
946 }
947 
948 /*ARGSUSED*/
949 static void
950 vcpu_config_event(struct xenbus_watch *watch, const char **vec, uint_t len)
951 {
952 	const char *path = vec[XS_WATCH_PATH];
953 	processorid_t id;
954 	char *s;
955 
956 	if ((s = strstr(path, "cpu/")) != NULL &&
957 	    sscanf(s, "cpu/%d", &id) == 1) {
958 		/*
959 		 * Run the virtual CPU configuration on a separate thread to
960 		 * avoid blocking on this event for too long (and for now,
961 		 * to ensure configuration requests are serialized.)
962 		 */
963 		(void) taskq_dispatch(cpu_config_tq,
964 		    vcpu_config, (void *)(uintptr_t)id, 0);
965 	}
966 }
967 
968 static int
969 xen_vcpu_initialize(processorid_t id, vcpu_guest_context_t *vgc)
970 {
971 	int err;
972 
973 	if ((err = HYPERVISOR_vcpu_op(VCPUOP_initialise, id, vgc)) != 0) {
974 		char *str;
975 		int level = CE_WARN;
976 
977 		switch (err) {
978 		case -X_EINVAL:
979 			/*
980 			 * This interface squashes multiple error sources
981 			 * to one error code.  In particular, an X_EINVAL
982 			 * code can mean:
983 			 *
984 			 * -	the vcpu id is out of range
985 			 * -	cs or ss are in ring 0
986 			 * -	cr3 is wrong
987 			 * -	an entry in the new gdt is above the
988 			 *	reserved entry
989 			 * -	a frame underneath the new gdt is bad
990 			 */
991 			str = "something is wrong :(";
992 			break;
993 		case -X_ENOENT:
994 			str = "no such cpu";
995 			break;
996 		case -X_ENOMEM:
997 			str = "no mem to copy ctxt";
998 			break;
999 		case -X_EFAULT:
1000 			str = "bad address";
1001 			break;
1002 		case -X_EEXIST:
1003 			/*
1004 			 * Hmm.  This error is returned if the vcpu has already
1005 			 * been initialized once before in the lifetime of this
1006 			 * domain.  This is a logic error in the kernel.
1007 			 */
1008 			level = CE_PANIC;
1009 			str = "already initialized";
1010 			break;
1011 		default:
1012 			level = CE_PANIC;
1013 			str = "<unexpected>";
1014 			break;
1015 		}
1016 
1017 		cmn_err(level, "vcpu%d: failed to init: error %d: %s",
1018 		    id, -err, str);
1019 	}
1020 	return (err);
1021 }
1022 
1023 long
1024 xen_vcpu_up(processorid_t id)
1025 {
1026 	long err;
1027 
1028 	if ((err = HYPERVISOR_vcpu_op(VCPUOP_up, id, NULL)) != 0) {
1029 		char *str;
1030 
1031 		switch (err) {
1032 		case -X_ENOENT:
1033 			str = "no such cpu";
1034 			break;
1035 		case -X_EINVAL:
1036 			/*
1037 			 * Perhaps this is diagnostic overkill.
1038 			 */
1039 			if (HYPERVISOR_vcpu_op(VCPUOP_is_up, id, NULL) < 0)
1040 				str = "bad cpuid";
1041 			else
1042 				str = "not initialized";
1043 			break;
1044 		default:
1045 			str = "<unexpected>";
1046 			break;
1047 		}
1048 
1049 		printf("vcpu%d: failed to start: error %d: %s\n",
1050 		    id, -(int)err, str);
1051 		return (EBFONT);	/* deliberately silly */
1052 	}
1053 	return (err);
1054 }
1055 
1056 long
1057 xen_vcpu_down(processorid_t id)
1058 {
1059 	long err;
1060 
1061 	if ((err = HYPERVISOR_vcpu_op(VCPUOP_down, id, NULL)) != 0) {
1062 		/*
1063 		 * X_ENOENT:	no such cpu
1064 		 * X_EINVAL:	bad cpuid
1065 		 */
1066 		panic("vcpu%d: failed to stop: error %d", id, -(int)err);
1067 	}
1068 
1069 	return (err);
1070 }
1071