xref: /illumos-gate/usr/src/uts/i86xpv/os/mp_xen.c (revision 9d1587b49e4692b8d6652e6c0b113a53b1af5313)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 /*
28  * Virtual CPU management.
29  *
30  * VCPUs can be controlled in one of two ways; through the domain itself
31  * (psradm, p_online(), etc.), and via changes in xenstore (vcpu_config()).
32  * Unfortunately, the terminology is used in different ways; they work out as
33  * follows:
34  *
35  * P_ONLINE: the VCPU is up and running, taking interrupts and running threads
36  *
37  * P_OFFLINE: the VCPU is up and running, but quiesced (i.e. blocked in the
38  * hypervisor on the idle thread).  It must be up since a downed VCPU cannot
39  * receive interrupts, and we require this for offline CPUs in Solaris.
40  *
41  * P_POWEROFF: the VCPU is down (we never called xen_vcpu_up(), or called
42  * xen_vcpu_down() for it).  It can't take interrupts or run anything, though
43  * if it has run previously, its software state (cpu_t, machcpu structures, IPI
44  * event channels, etc.) will still exist.
45  *
46  * The hypervisor has two notions of CPU states as represented in the store:
47  *
48  * "offline": the VCPU is down.  Corresponds to P_POWEROFF.
49  *
50  * "online": the VCPU is running.  Corresponds to a CPU state other than
51  * P_POWEROFF.
52  *
53  * Currently, only a notification via xenstore can bring a CPU into a
54  * P_POWEROFF state, and only the domain can change between P_ONLINE, P_NOINTR,
55  * P_OFFLINE, etc.  We need to be careful to treat xenstore notifications
56  * idempotently, as we'll get 'duplicate' entries when we resume a domain.
57  *
58  * Note that the xenstore configuration is strictly advisory, in that a domain
59  * can choose to ignore it and still power up a VCPU in the offline state. To
60  * play nice, we don't allow it. Thus, any attempt to power on/off a CPU is
61  * ENOTSUP from within Solaris.
62  *
63  * Powering off a VCPU and suspending the domain use similar code. The
64  * difficulty here is that we must ensure that each VCPU is in a stable
65  * state: it must have a saved PCB, and not be responding to interrupts
66  * (since we are just about to remove its ability to run on a real CPU,
67  * possibly forever).  However, an offline CPU in Solaris can take
68  * cross-call interrupts, as mentioned, so we must go through a
69  * two-stage process.  First, we use the standard Solaris pause_cpus().
70  * This ensures that all CPUs are either in mach_cpu_pause() or
71  * mach_cpu_idle(), and nothing will cross-call them.
72  *
73  * Powered-off-CPUs are already safe, as we own the cpu_lock needed to
74  * bring them back up, and in state CPU_PHASE_POWERED_OFF.
75  *
76  * Running CPUs are spinning in mach_cpu_pause() waiting for either
77  * PAUSE_IDLE or CPU_PHASE_WAIT_SAFE.
78  *
79  * Offline CPUs are either running the idle thread and periodically
80  * checking for CPU_PHASE_WAIT_SAFE, or blocked in the hypervisor.
81  *
82  * Thus, we set CPU_PHASE_WAIT_SAFE for every powered-on CPU, as well as
83  * poking them to make sure they're not blocked[1]. When every CPU has
84  * responded by reaching a safe state and setting CPU_PHASE_SAFE, we
85  * know we can suspend, or power-off a CPU, without problems.
86  *
87  * [1] note that we have to repeatedly poke offline CPUs: it's the only
88  * way to ensure that the CPU doesn't miss the state change before
89  * dropping into HYPERVISOR_block().
90  */
91 
92 #include <sys/types.h>
93 #include <sys/systm.h>
94 #include <sys/param.h>
95 #include <sys/taskq.h>
96 #include <sys/cmn_err.h>
97 #include <sys/archsystm.h>
98 #include <sys/machsystm.h>
99 #include <sys/segments.h>
100 #include <sys/cpuvar.h>
101 #include <sys/x86_archext.h>
102 #include <sys/controlregs.h>
103 #include <sys/hypervisor.h>
104 #include <sys/xpv_panic.h>
105 #include <sys/mman.h>
106 #include <sys/psw.h>
107 #include <sys/cpu.h>
108 #include <sys/sunddi.h>
109 #include <util/sscanf.h>
110 #include <vm/hat_i86.h>
111 #include <vm/hat.h>
112 #include <vm/as.h>
113 
114 #include <xen/public/io/xs_wire.h>
115 #include <xen/sys/xenbus_impl.h>
116 #include <xen/public/vcpu.h>
117 
/*
 * Set of ready CPUs, defined outside this file.  VCPUs are added to it in
 * poweron_vcpu() and removed from it in poweroff_vcpu().
 */
extern cpuset_t cpu_ready_set;

/*
 * Per-VCPU barrier phases, stored in cpu_phase[]:
 *
 * CPU_PHASE_NONE:		no barrier request outstanding; set by
 *				mp_leave_barrier() and poweron_vcpu().
 * CPU_PHASE_WAIT_SAFE:		mp_enter_barrier() has asked the VCPU to
 *				enter the safe phase.
 * CPU_PHASE_SAFE:		set by the VCPU itself in enter_safe_phase()
 *				once its context has been saved.
 * CPU_PHASE_POWERED_OFF:	set by poweroff_vcpu() after the VCPU has
 *				been taken down.
 */
#define	CPU_PHASE_NONE 0
#define	CPU_PHASE_WAIT_SAFE 1
#define	CPU_PHASE_SAFE 2
#define	CPU_PHASE_POWERED_OFF 3

/*
 * We can only poke CPUs during barrier enter 256 times a second at
 * most.  The value is the minimum interval between repeat pokes,
 * compared against xpv_gethrtime() deltas in mp_enter_barrier().
 */
#define	POKE_TIMEOUT (NANOSEC / 256)

/* Task queue on which vcpu_config() requests are dispatched. */
static taskq_t *cpu_config_tq;
/* Current barrier phase of each VCPU, indexed by cpu id. */
static int cpu_phase[NCPU];

/* Forward declarations for routines defined later in this file. */
static void vcpu_config_event(struct xenbus_watch *, const char **, uint_t);
static int xen_vcpu_initialize(processorid_t, vcpu_guest_context_t *);
136 
137 /*
138  * Return whether or not the vcpu is actually running on a pcpu
139  */
140 int
141 vcpu_on_pcpu(processorid_t cpu)
142 {
143 	struct vcpu_runstate_info runstate;
144 	int	ret = VCPU_STATE_UNKNOWN;
145 
146 	ASSERT(cpu < NCPU);
147 	/*
148 	 * Don't bother with hypercall if we are asking about ourself
149 	 */
150 	if (cpu == CPU->cpu_id)
151 		return (VCPU_ON_PCPU);
152 	if (HYPERVISOR_vcpu_op(VCPUOP_get_runstate_info, cpu, &runstate) != 0)
153 		goto out;
154 
155 	switch (runstate.state) {
156 	case RUNSTATE_running:
157 		ret = VCPU_ON_PCPU;
158 		break;
159 
160 	case RUNSTATE_runnable:
161 	case RUNSTATE_offline:
162 	case RUNSTATE_blocked:
163 		ret = VCPU_NOT_ON_PCPU;
164 		break;
165 
166 	default:
167 		break;
168 	}
169 
170 out:
171 	return (ret);
172 }
173 
/*
 * Hook for allocating any global state needed while starting cpus.
 * Virtual cpus need no such state, so this always reports success (0).
 */
int
mach_cpucontext_init(void)
{
	/* Nothing to set up for virtual cpus. */
	return (0);
}
183 
184 void
185 do_cpu_config_watch(int state)
186 {
187 	static struct xenbus_watch cpu_config_watch;
188 
189 	if (state != XENSTORE_UP)
190 		return;
191 	cpu_config_watch.node = "cpu";
192 	cpu_config_watch.callback = vcpu_config_event;
193 	if (register_xenbus_watch(&cpu_config_watch)) {
194 		taskq_destroy(cpu_config_tq);
195 		cmn_err(CE_WARN, "do_cpu_config_watch: "
196 		    "failed to set vcpu config watch");
197 	}
198 
199 }
200 
201 /*
202  * This routine is called after all the "normal" MP startup has
203  * been done; a good place to start watching xen store for virtual
204  * cpu hot plug events.
205  */
206 void
207 mach_cpucontext_fini(void)
208 {
209 
210 	cpu_config_tq = taskq_create("vcpu config taskq", 1,
211 	    maxclsyspri - 1, 1, 1, TASKQ_PREPOPULATE);
212 
213 	(void) xs_register_xenbus_callback(do_cpu_config_watch);
214 }
215 
/*
 * Fill in the remaining CPU context and initialize it.
 *
 * Both callers in this file (mach_cpucontext_alloc() and
 * mach_cpucontext_restore()) pass a zeroed context in which only the
 * instruction/stack/frame pointers and kernel_sp have been set.  This
 * routine fills in the segment selectors, flags, trap table, GDT frame,
 * control registers and callback entry points, and then hands the
 * context to the hypervisor.
 *
 * Returns whatever xen_vcpu_initialize() returns for cp->cpu_id.
 */
static int
mp_set_cpu_context(vcpu_guest_context_t *vgc, cpu_t *cp)
{
	uint_t vec, iopl;

	vgc->flags = VGCF_IN_KERNEL;

	/*
	 * fpu_ctx we leave as zero; on first fault we'll store
	 * sse_initial into it anyway.
	 */

	/* Kernel code/data selectors for the new vcpu. */
#if defined(__amd64)
	vgc->user_regs.cs = KCS_SEL | SEL_KPL;	/* force to ring 3 */
#else
	vgc->user_regs.cs = KCS_SEL;
#endif
	vgc->user_regs.ds = KDS_SEL;
	vgc->user_regs.es = KDS_SEL;
	vgc->user_regs.ss = KDS_SEL;
	vgc->kernel_ss = KDS_SEL;

	/*
	 * Allow I/O privilege level for Dom0 kernel.
	 */
	if (DOMAIN_IS_INITDOMAIN(xen_info))
		iopl = (PS_IOPL & 0x1000); /* ring 1 */
	else
		iopl = 0;

	/* Initial flags register: F_OFF plus the IOPL bits chosen above. */
#if defined(__amd64)
	vgc->user_regs.fs = 0;
	vgc->user_regs.gs = 0;
	vgc->user_regs.rflags = F_OFF | iopl;
#elif defined(__i386)
	vgc->user_regs.fs = KFS_SEL;
	vgc->user_regs.gs = KGS_SEL;
	vgc->user_regs.eflags = F_OFF | iopl;
	vgc->event_callback_cs = vgc->user_regs.cs;
	vgc->failsafe_callback_cs = vgc->user_regs.cs;
#endif

	/*
	 * Initialize the trap_info_t from the IDT
	 */
#if !defined(__lint)
	ASSERT(NIDT == sizeof (vgc->trap_ctxt) / sizeof (vgc->trap_ctxt[0]));
#endif
	for (vec = 0; vec < NIDT; vec++) {
		trap_info_t *ti = &vgc->trap_ctxt[vec];

		/*
		 * Only entries for which the conversion returns 0 have
		 * their selector and vector number filled in here.
		 */
		if (xen_idt_to_trap_info(vec,
		    &cp->cpu_m.mcpu_idt[vec], ti) == 0) {
			ti->cs = KCS_SEL;
			ti->vector = vec;
		}
	}

	/*
	 * No LDT
	 */

	/*
	 * (We assert in various places that the GDT is (a) aligned on a
	 * page boundary and (b) one page long, so this really should fit..)
	 */
#ifdef CRASH_XEN
	vgc->gdt_frames[0] = pa_to_ma(mmu_btop(cp->cpu_m.mcpu_gdtpa));
#else
	vgc->gdt_frames[0] = pfn_to_mfn(mmu_btop(cp->cpu_m.mcpu_gdtpa));
#endif
	vgc->gdt_ents = NGDT;

	/* cr0 is derived from the calling CPU's current cr0. */
	vgc->ctrlreg[0] = CR0_ENABLE_FPU_FLAGS(getcr0());

	/*
	 * cr3 is computed from the kernel address space's top-level
	 * hat table (kas.a_hat->hat_htable->ht_pfn).
	 */
#if defined(__i386)
	if (mmu.pae_hat)
		vgc->ctrlreg[3] =
		    xen_pfn_to_cr3(pfn_to_mfn(kas.a_hat->hat_htable->ht_pfn));
	else
#endif
		vgc->ctrlreg[3] =
		    pa_to_ma(mmu_ptob(kas.a_hat->hat_htable->ht_pfn));

	/* cr4 is copied from the calling CPU. */
	vgc->ctrlreg[4] = getcr4();

	/* Entry points the hypervisor uses for event and failsafe upcalls. */
	vgc->event_callback_eip = (uintptr_t)xen_callback;
	vgc->failsafe_callback_eip = (uintptr_t)xen_failsafe_callback;
	vgc->flags |= VGCF_failsafe_disables_events;

#if defined(__amd64)
	/*
	 * XXPV should this be moved to init_cpu_syscall?
	 */
	vgc->syscall_callback_eip = (uintptr_t)sys_syscall;
	vgc->flags |= VGCF_syscall_disables_events;

	/* The kernel gs base is set to this CPU's cpu_t. */
	ASSERT(vgc->user_regs.gs == 0);
	vgc->gs_base_kernel = (uintptr_t)cp;
#endif

	return (xen_vcpu_initialize(cp->cpu_id, vgc));
}
322 
323 /*
324  * Create a guest virtual cpu context so that the virtual cpu
325  * springs into life in the domain just about to call mp_startup()
326  *
327  * Virtual CPUs must be initialized once in the lifetime of the domain;
328  * after that subsequent attempts to start them will fail with X_EEXIST.
329  *
330  * Thus 'alloc' -really- creates and initializes the virtual
331  * CPU context just once. Once the initialisation succeeds, we never
332  * free it, nor the regular cpu_t to which it refers.
333  */
334 void *
335 mach_cpucontext_alloc(struct cpu *cp)
336 {
337 	kthread_t *tp = cp->cpu_thread;
338 	vcpu_guest_context_t vgc;
339 
340 	int err = 1;
341 
342 	/*
343 	 * First, augment the incoming cpu structure
344 	 * - vcpu pointer reference
345 	 * - pending event storage area
346 	 * - physical address of GDT
347 	 */
348 	cp->cpu_m.mcpu_vcpu_info =
349 	    &HYPERVISOR_shared_info->vcpu_info[cp->cpu_id];
350 	cp->cpu_m.mcpu_evt_pend = kmem_zalloc(
351 	    sizeof (struct xen_evt_data), KM_SLEEP);
352 	cp->cpu_m.mcpu_gdtpa =
353 	    mmu_ptob(hat_getpfnum(kas.a_hat, (caddr_t)cp->cpu_gdt));
354 
355 	if ((err = xen_gdt_setprot(cp, PROT_READ)) != 0)
356 		goto done;
357 
358 	/*
359 	 * Now set up the vcpu context so that we can start this vcpu
360 	 * in the kernel at tp->t_pc (mp_startup).  Note that the
361 	 * thread will thread_exit() shortly after performing the
362 	 * initialization; in particular, we will *never* take a
363 	 * privilege transition on this thread.
364 	 */
365 
366 	bzero(&vgc, sizeof (vgc));
367 
368 #ifdef __amd64
369 	vgc.user_regs.rip = tp->t_pc;
370 	vgc.user_regs.rsp = tp->t_sp;
371 	vgc.user_regs.rbp = tp->t_sp - 2 * sizeof (greg_t);
372 #else
373 	vgc.user_regs.eip = tp->t_pc;
374 	vgc.user_regs.esp = tp->t_sp;
375 	vgc.user_regs.ebp = tp->t_sp - 2 * sizeof (greg_t);
376 #endif
377 	/*
378 	 * XXPV	Fix resume, if Russ didn't already fix it.
379 	 *
380 	 * Note that resume unconditionally puts t->t_stk + sizeof (regs)
381 	 * into kernel_sp via HYPERVISOR_stack_switch. This anticipates
382 	 * that only lwps take traps that switch to the kernel stack;
383 	 * part of creating an lwp adjusts the stack by subtracting
384 	 * sizeof (struct regs) off t_stk.
385 	 *
386 	 * The more interesting question is, why do we do all the work
387 	 * of a fully fledged lwp for a plain thread?  In particular
388 	 * we don't have to call HYPERVISOR_stack_switch for lwp-less threads
389 	 * or futz with the LDT.  This should probably all be done with
390 	 * an lwp context operator to keep pure thread context switch fast.
391 	 */
392 	vgc.kernel_sp = (ulong_t)tp->t_stk;
393 
394 	err = mp_set_cpu_context(&vgc, cp);
395 
396 done:
397 	if (err) {
398 		mach_cpucontext_free(cp, NULL, err);
399 		return (NULL);
400 	}
401 	return (cp);
402 }
403 
404 /*
405  * By the time we are called either we have successfully started
406  * the cpu, or our attempt to start it has failed.
407  */
408 
409 /*ARGSUSED*/
410 void
411 mach_cpucontext_free(struct cpu *cp, void *arg, int err)
412 {
413 	switch (err) {
414 	case 0:
415 		break;
416 	case ETIMEDOUT:
417 		/*
418 		 * The vcpu context is loaded into the hypervisor, and
419 		 * we've tried to start it, but the vcpu has not been set
420 		 * running yet, for whatever reason.  We arrange to -not-
421 		 * free any data structures it may be referencing.  In
422 		 * particular, we've already told the hypervisor about
423 		 * the GDT, and so we can't map it read-write again.
424 		 */
425 		break;
426 	default:
427 		(void) xen_gdt_setprot(cp, PROT_READ | PROT_WRITE);
428 		kmem_free(cp->cpu_m.mcpu_evt_pend,
429 		    sizeof (struct xen_evt_data));
430 		break;
431 	}
432 }
433 
434 /*
435  * Reset this CPU's context.  Clear out any pending evtchn data, since event
436  * channel numbers will all change when we resume.
437  */
438 void
439 mach_cpucontext_reset(cpu_t *cp)
440 {
441 	bzero(cp->cpu_m.mcpu_evt_pend, sizeof (struct xen_evt_data));
442 	/* mcpu_intr_pending ? */
443 }
444 
445 static void
446 pcb_to_user_regs(label_t *pcb, vcpu_guest_context_t *vgc)
447 {
448 #ifdef __amd64
449 	vgc->user_regs.rip = pcb->val[REG_LABEL_PC];
450 	vgc->user_regs.rsp = pcb->val[REG_LABEL_SP];
451 	vgc->user_regs.rbp = pcb->val[REG_LABEL_BP];
452 	vgc->user_regs.rbx = pcb->val[REG_LABEL_RBX];
453 	vgc->user_regs.r12 = pcb->val[REG_LABEL_R12];
454 	vgc->user_regs.r13 = pcb->val[REG_LABEL_R13];
455 	vgc->user_regs.r14 = pcb->val[REG_LABEL_R14];
456 	vgc->user_regs.r15 = pcb->val[REG_LABEL_R15];
457 #else /* __amd64 */
458 	vgc->user_regs.eip = pcb->val[REG_LABEL_PC];
459 	vgc->user_regs.esp = pcb->val[REG_LABEL_SP];
460 	vgc->user_regs.ebp = pcb->val[REG_LABEL_BP];
461 	vgc->user_regs.ebx = pcb->val[REG_LABEL_EBX];
462 	vgc->user_regs.esi = pcb->val[REG_LABEL_ESI];
463 	vgc->user_regs.edi = pcb->val[REG_LABEL_EDI];
464 #endif /* __amd64 */
465 }
466 
467 /*
468  * Restore the context of a CPU during resume.  This context is always
469  * inside enter_safe_phase(), below.
470  */
471 void
472 mach_cpucontext_restore(cpu_t *cp)
473 {
474 	vcpu_guest_context_t vgc;
475 	int err;
476 
477 	ASSERT(cp->cpu_thread == cp->cpu_pause_thread ||
478 	    cp->cpu_thread == cp->cpu_idle_thread);
479 
480 	bzero(&vgc, sizeof (vgc));
481 
482 	pcb_to_user_regs(&cp->cpu_thread->t_pcb, &vgc);
483 
484 	/*
485 	 * We're emulating a longjmp() here: in particular, we need to bump the
486 	 * stack pointer to account for the pop of xIP that returning from
487 	 * longjmp() normally would do, and set the return value in xAX to 1.
488 	 */
489 #ifdef __amd64
490 	vgc.user_regs.rax = 1;
491 	vgc.user_regs.rsp += sizeof (ulong_t);
492 #else
493 	vgc.user_regs.eax = 1;
494 	vgc.user_regs.esp += sizeof (ulong_t);
495 #endif
496 
497 	vgc.kernel_sp = cp->cpu_thread->t_sp;
498 
499 	err = mp_set_cpu_context(&vgc, cp);
500 
501 	ASSERT(err == 0);
502 }
503 
/*
 * Reach a point at which the CPU can be safely powered-off or
 * suspended.  Nothing can wake this CPU out of the loop.
 *
 * With interrupts disabled, the current register state is saved into
 * curthread->t_pcb via setjmp(); the CPU then marks itself
 * CPU_PHASE_SAFE and spins until some other CPU changes its cpu_phase[]
 * entry.  mach_cpucontext_restore() rebuilds a VCPU context from that
 * saved t_pcb with the return register set to 1, so a restored VCPU
 * continues here as though setjmp() had returned non-zero and skips the
 * spin loop.
 */
static void
enter_safe_phase(void)
{
	/* Interrupts stay off until the saved state is restored below. */
	ulong_t flags = intr_clear();

	if (setjmp(&curthread->t_pcb) == 0) {
		/* First return: advertise that we're safe, then wait. */
		cpu_phase[CPU->cpu_id] = CPU_PHASE_SAFE;
		while (cpu_phase[CPU->cpu_id] == CPU_PHASE_SAFE)
			SMT_PAUSE();
	}

	ASSERT(!interrupts_enabled());

	intr_restore(flags);
}
523 
524 /*
525  * Offline CPUs run this code even under a pause_cpus(), so we must
526  * check if we need to enter the safe phase.
527  */
528 void
529 mach_cpu_idle(void)
530 {
531 	if (IN_XPV_PANIC()) {
532 		xpv_panic_halt();
533 	} else  {
534 		(void) HYPERVISOR_block();
535 		if (cpu_phase[CPU->cpu_id] == CPU_PHASE_WAIT_SAFE)
536 			enter_safe_phase();
537 	}
538 }
539 
540 /*
541  * Spin until either start_cpus() wakes us up, or we get a request to
542  * enter the safe phase (followed by a later start_cpus()).
543  */
544 void
545 mach_cpu_pause(volatile char *safe)
546 {
547 	*safe = PAUSE_WAIT;
548 	membar_enter();
549 
550 	while (*safe != PAUSE_IDLE) {
551 		if (cpu_phase[CPU->cpu_id] == CPU_PHASE_WAIT_SAFE)
552 			enter_safe_phase();
553 		SMT_PAUSE();
554 	}
555 }
556 
557 void
558 mach_cpu_halt(char *msg)
559 {
560 	if (msg)
561 		prom_printf("%s\n", msg);
562 	(void) xen_vcpu_down(CPU->cpu_id);
563 }
564 
/*
 * Powering a CPU on from within the domain is not supported; only a
 * xenstore notification may do so.  Always returns ENOTSUP.
 */
/*ARGSUSED*/
int
mp_cpu_poweron(struct cpu *cp)
{
	/* Deliberately refused: see the block comment at the top of file. */
	return (ENOTSUP);
}
571 
/*
 * Powering a CPU off from within the domain is not supported; only a
 * xenstore notification may do so.  Always returns ENOTSUP.
 */
/*ARGSUSED*/
int
mp_cpu_poweroff(struct cpu *cp)
{
	/* Deliberately refused: see the block comment at the top of file. */
	return (ENOTSUP);
}
578 
579 void
580 mp_enter_barrier(void)
581 {
582 	hrtime_t last_poke_time = 0;
583 	int poke_allowed = 0;
584 	int done = 0;
585 	int i;
586 
587 	ASSERT(MUTEX_HELD(&cpu_lock));
588 
589 	pause_cpus(NULL, NULL);
590 
591 	while (!done) {
592 		done = 1;
593 		poke_allowed = 0;
594 
595 		if (xpv_gethrtime() - last_poke_time > POKE_TIMEOUT) {
596 			last_poke_time = xpv_gethrtime();
597 			poke_allowed = 1;
598 		}
599 
600 		for (i = 0; i < NCPU; i++) {
601 			cpu_t *cp = cpu_get(i);
602 
603 			if (cp == NULL || cp == CPU)
604 				continue;
605 
606 			switch (cpu_phase[i]) {
607 			case CPU_PHASE_NONE:
608 				cpu_phase[i] = CPU_PHASE_WAIT_SAFE;
609 				poke_cpu(i);
610 				done = 0;
611 				break;
612 
613 			case CPU_PHASE_WAIT_SAFE:
614 				if (poke_allowed)
615 					poke_cpu(i);
616 				done = 0;
617 				break;
618 
619 			case CPU_PHASE_SAFE:
620 			case CPU_PHASE_POWERED_OFF:
621 				break;
622 			}
623 		}
624 
625 		SMT_PAUSE();
626 	}
627 }
628 
629 void
630 mp_leave_barrier(void)
631 {
632 	int i;
633 
634 	ASSERT(MUTEX_HELD(&cpu_lock));
635 
636 	for (i = 0; i < NCPU; i++) {
637 		cpu_t *cp = cpu_get(i);
638 
639 		if (cp == NULL || cp == CPU)
640 			continue;
641 
642 		switch (cpu_phase[i]) {
643 		/*
644 		 * If we see a CPU in one of these phases, something has
645 		 * gone badly wrong with the guarantees
646 		 * mp_enter_barrier() is supposed to provide.  Rather
647 		 * than attempt to stumble along (and since we can't
648 		 * panic properly in this context), we tell the
649 		 * hypervisor we've crashed.
650 		 */
651 		case CPU_PHASE_NONE:
652 		case CPU_PHASE_WAIT_SAFE:
653 			(void) HYPERVISOR_shutdown(SHUTDOWN_crash);
654 			break;
655 
656 		case CPU_PHASE_POWERED_OFF:
657 			break;
658 
659 		case CPU_PHASE_SAFE:
660 			cpu_phase[i] = CPU_PHASE_NONE;
661 		}
662 	}
663 
664 	start_cpus();
665 }
666 
667 static int
668 poweroff_vcpu(struct cpu *cp)
669 {
670 	int error;
671 
672 	ASSERT(MUTEX_HELD(&cpu_lock));
673 
674 	ASSERT(CPU->cpu_id != cp->cpu_id);
675 	ASSERT(cp->cpu_flags & CPU_QUIESCED);
676 
677 	mp_enter_barrier();
678 
679 	if ((error = xen_vcpu_down(cp->cpu_id)) == 0) {
680 		ASSERT(cpu_phase[cp->cpu_id] == CPU_PHASE_SAFE);
681 
682 		CPUSET_DEL(cpu_ready_set, cp->cpu_id);
683 
684 		cp->cpu_flags |= CPU_POWEROFF | CPU_OFFLINE;
685 		cp->cpu_flags &=
686 		    ~(CPU_RUNNING | CPU_READY | CPU_EXISTS | CPU_ENABLE);
687 
688 		cpu_phase[cp->cpu_id] = CPU_PHASE_POWERED_OFF;
689 
690 		cpu_set_state(cp);
691 	}
692 
693 	mp_leave_barrier();
694 
695 	return (error);
696 }
697 
698 static int
699 vcpu_config_poweroff(processorid_t id)
700 {
701 	int oldstate;
702 	int error;
703 	cpu_t *cp;
704 
705 	mutex_enter(&cpu_lock);
706 
707 	if ((cp = cpu_get(id)) == NULL) {
708 		mutex_exit(&cpu_lock);
709 		return (ESRCH);
710 	}
711 
712 	if (cpu_get_state(cp) == P_POWEROFF) {
713 		mutex_exit(&cpu_lock);
714 		return (0);
715 	}
716 
717 	mutex_exit(&cpu_lock);
718 
719 	do {
720 		error = p_online_internal(id, P_OFFLINE,
721 		    &oldstate);
722 
723 		if (error != 0)
724 			break;
725 
726 		/*
727 		 * So we just changed it to P_OFFLINE.  But then we dropped
728 		 * cpu_lock, so now it is possible for another thread to change
729 		 * the cpu back to a different, non-quiesced state e.g.
730 		 * P_ONLINE.
731 		 */
732 		mutex_enter(&cpu_lock);
733 		if ((cp = cpu_get(id)) == NULL)
734 			error = ESRCH;
735 		else {
736 			if (cp->cpu_flags & CPU_QUIESCED)
737 				error = poweroff_vcpu(cp);
738 			else
739 				error = EBUSY;
740 		}
741 		mutex_exit(&cpu_lock);
742 	} while (error == EBUSY);
743 
744 	return (error);
745 }
746 
747 /*
748  * Add a new virtual cpu to the domain.
749  */
750 static int
751 vcpu_config_new(processorid_t id)
752 {
753 	extern int start_cpu(processorid_t);
754 	int error;
755 
756 	if (ncpus == 1) {
757 		printf("cannot (yet) add cpus to a single-cpu domain\n");
758 		return (ENOTSUP);
759 	}
760 
761 	affinity_set(CPU_CURRENT);
762 	error = start_cpu(id);
763 	affinity_clear();
764 	return (error);
765 }
766 
767 static int
768 poweron_vcpu(struct cpu *cp)
769 {
770 	int error;
771 
772 	ASSERT(MUTEX_HELD(&cpu_lock));
773 
774 	if (HYPERVISOR_vcpu_op(VCPUOP_is_up, cp->cpu_id, NULL) != 0) {
775 		printf("poweron_vcpu: vcpu%d is not available!\n",
776 		    cp->cpu_id);
777 		return (ENXIO);
778 	}
779 
780 	if ((error = xen_vcpu_up(cp->cpu_id)) == 0) {
781 		CPUSET_ADD(cpu_ready_set, cp->cpu_id);
782 		cp->cpu_flags |= CPU_EXISTS | CPU_READY | CPU_RUNNING;
783 		cp->cpu_flags &= ~CPU_POWEROFF;
784 		/*
785 		 * There are some nasty races possible here.
786 		 * Tell the vcpu it's up one more time.
787 		 * XXPV	Is this enough?  Is this safe?
788 		 */
789 		(void) xen_vcpu_up(cp->cpu_id);
790 
791 		cpu_phase[cp->cpu_id] = CPU_PHASE_NONE;
792 
793 		cpu_set_state(cp);
794 	}
795 	return (error);
796 }
797 
798 static int
799 vcpu_config_poweron(processorid_t id)
800 {
801 	cpu_t *cp;
802 	int oldstate;
803 	int error;
804 
805 	if (id >= ncpus)
806 		return (vcpu_config_new(id));
807 
808 	mutex_enter(&cpu_lock);
809 
810 	if ((cp = cpu_get(id)) == NULL) {
811 		mutex_exit(&cpu_lock);
812 		return (ESRCH);
813 	}
814 
815 	if (cpu_get_state(cp) != P_POWEROFF) {
816 		mutex_exit(&cpu_lock);
817 		return (0);
818 	}
819 
820 	if ((error = poweron_vcpu(cp)) != 0) {
821 		mutex_exit(&cpu_lock);
822 		return (error);
823 	}
824 
825 	mutex_exit(&cpu_lock);
826 
827 	return (p_online_internal(id, P_ONLINE, &oldstate));
828 }
829 
830 #define	REPORT_LEN	128
831 
832 static void
833 vcpu_config_report(processorid_t id, uint_t newstate, int error)
834 {
835 	char *report = kmem_alloc(REPORT_LEN, KM_SLEEP);
836 	size_t len;
837 	char *ps;
838 
839 	switch (newstate) {
840 	case P_ONLINE:
841 		ps = PS_ONLINE;
842 		break;
843 	case P_POWEROFF:
844 		ps = PS_POWEROFF;
845 		break;
846 	default:
847 		cmn_err(CE_PANIC, "unknown state %u\n", newstate);
848 		break;
849 	}
850 
851 	len = snprintf(report, REPORT_LEN,
852 	    "cpu%d: externally initiated %s", id, ps);
853 
854 	if (!error) {
855 		cmn_err(CE_CONT, "!%s\n", report);
856 		kmem_free(report, REPORT_LEN);
857 		return;
858 	}
859 
860 	len += snprintf(report + len, REPORT_LEN - len,
861 	    " failed, error %d: ", error);
862 	switch (error) {
863 	case EEXIST:
864 		len += snprintf(report + len, REPORT_LEN - len,
865 		    "cpu already %s", ps ? ps : "?");
866 		break;
867 	case ESRCH:
868 		len += snprintf(report + len, REPORT_LEN - len,
869 		    "cpu not found");
870 		break;
871 	case EINVAL:
872 	case EALREADY:
873 		break;
874 	case EPERM:
875 		len += snprintf(report + len, REPORT_LEN - len,
876 		    "insufficient privilege (0x%x)", id);
877 		break;
878 	case EBUSY:
879 		switch (newstate) {
880 		case P_ONLINE:
881 			/*
882 			 * This return comes from mp_cpu_start -
883 			 * we cannot 'start' the boot CPU.
884 			 */
885 			len += snprintf(report + len, REPORT_LEN - len,
886 			    "already running");
887 			break;
888 		case P_POWEROFF:
889 			len += snprintf(report + len, REPORT_LEN - len,
890 			    "bound lwps?");
891 			break;
892 		default:
893 			break;
894 		}
895 	default:
896 		break;
897 	}
898 
899 	cmn_err(CE_CONT, "%s\n", report);
900 	kmem_free(report, REPORT_LEN);
901 }
902 
903 static void
904 vcpu_config(void *arg)
905 {
906 	int id = (int)(uintptr_t)arg;
907 	int error;
908 	char dir[16];
909 	char *state;
910 
911 	if ((uint_t)id >= max_ncpus) {
912 		cmn_err(CE_WARN,
913 		    "vcpu_config: cpu%d does not fit in this domain", id);
914 		return;
915 	}
916 
917 	(void) snprintf(dir, sizeof (dir), "cpu/%d", id);
918 	state = kmem_alloc(MAXPATHLEN, KM_SLEEP);
919 	if (xenbus_scanf(XBT_NULL, dir, "availability", "%s", state) == 0) {
920 		if (strcmp(state, "online") == 0) {
921 			error = vcpu_config_poweron(id);
922 			vcpu_config_report(id, P_ONLINE, error);
923 		} else if (strcmp(state, "offline") == 0) {
924 			error = vcpu_config_poweroff(id);
925 			vcpu_config_report(id, P_POWEROFF, error);
926 		} else {
927 			cmn_err(CE_WARN,
928 			    "cpu%d: unknown target state '%s'", id, state);
929 		}
930 	} else
931 		cmn_err(CE_WARN,
932 		    "cpu%d: unable to read target state from xenstore", id);
933 
934 	kmem_free(state, MAXPATHLEN);
935 }
936 
937 /*ARGSUSED*/
938 static void
939 vcpu_config_event(struct xenbus_watch *watch, const char **vec, uint_t len)
940 {
941 	const char *path = vec[XS_WATCH_PATH];
942 	processorid_t id;
943 	char *s;
944 
945 	if ((s = strstr(path, "cpu/")) != NULL &&
946 	    sscanf(s, "cpu/%d", &id) == 1) {
947 		/*
948 		 * Run the virtual CPU configuration on a separate thread to
949 		 * avoid blocking on this event for too long (and for now,
950 		 * to ensure configuration requests are serialized.)
951 		 */
952 		(void) taskq_dispatch(cpu_config_tq,
953 		    vcpu_config, (void *)(uintptr_t)id, 0);
954 	}
955 }
956 
957 static int
958 xen_vcpu_initialize(processorid_t id, vcpu_guest_context_t *vgc)
959 {
960 	int err;
961 
962 	if ((err = HYPERVISOR_vcpu_op(VCPUOP_initialise, id, vgc)) != 0) {
963 		char *str;
964 		int level = CE_WARN;
965 
966 		switch (err) {
967 		case -X_EINVAL:
968 			/*
969 			 * This interface squashes multiple error sources
970 			 * to one error code.  In particular, an X_EINVAL
971 			 * code can mean:
972 			 *
973 			 * -	the vcpu id is out of range
974 			 * -	cs or ss are in ring 0
975 			 * -	cr3 is wrong
976 			 * -	an entry in the new gdt is above the
977 			 *	reserved entry
978 			 * -	a frame underneath the new gdt is bad
979 			 */
980 			str = "something is wrong :(";
981 			break;
982 		case -X_ENOENT:
983 			str = "no such cpu";
984 			break;
985 		case -X_ENOMEM:
986 			str = "no mem to copy ctxt";
987 			break;
988 		case -X_EFAULT:
989 			str = "bad address";
990 			break;
991 		case -X_EEXIST:
992 			/*
993 			 * Hmm.  This error is returned if the vcpu has already
994 			 * been initialized once before in the lifetime of this
995 			 * domain.  This is a logic error in the kernel.
996 			 */
997 			level = CE_PANIC;
998 			str = "already initialized";
999 			break;
1000 		default:
1001 			level = CE_PANIC;
1002 			str = "<unexpected>";
1003 			break;
1004 		}
1005 
1006 		cmn_err(level, "vcpu%d: failed to init: error %d: %s",
1007 		    id, -err, str);
1008 	}
1009 	return (err);
1010 }
1011 
1012 long
1013 xen_vcpu_up(processorid_t id)
1014 {
1015 	long err;
1016 
1017 	if ((err = HYPERVISOR_vcpu_op(VCPUOP_up, id, NULL)) != 0) {
1018 		char *str;
1019 
1020 		switch (err) {
1021 		case -X_ENOENT:
1022 			str = "no such cpu";
1023 			break;
1024 		case -X_EINVAL:
1025 			/*
1026 			 * Perhaps this is diagnostic overkill.
1027 			 */
1028 			if (HYPERVISOR_vcpu_op(VCPUOP_is_up, id, NULL) < 0)
1029 				str = "bad cpuid";
1030 			else
1031 				str = "not initialized";
1032 			break;
1033 		default:
1034 			str = "<unexpected>";
1035 			break;
1036 		}
1037 
1038 		printf("vcpu%d: failed to start: error %d: %s\n",
1039 		    id, -(int)err, str);
1040 		return (EBFONT);	/* deliberately silly */
1041 	}
1042 	return (err);
1043 }
1044 
1045 long
1046 xen_vcpu_down(processorid_t id)
1047 {
1048 	long err;
1049 
1050 	if ((err = HYPERVISOR_vcpu_op(VCPUOP_down, id, NULL)) != 0) {
1051 		/*
1052 		 * X_ENOENT:	no such cpu
1053 		 * X_EINVAL:	bad cpuid
1054 		 */
1055 		panic("vcpu%d: failed to stop: error %d", id, -(int)err);
1056 	}
1057 
1058 	return (err);
1059 }
1060