xref: /illumos-gate/usr/src/uts/i86xpv/os/mp_xen.c (revision da6c28aaf62fa55f0fdb8004aa40f88f23bf53f0)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 /*
28  * Virtual CPU management.
29  *
30  * VCPUs can be controlled in one of two ways; through the domain itself
31  * (psradm, p_online(), etc.), and via changes in xenstore (vcpu_config()).
32  * Unfortunately, the terminology is used in different ways; they work out as
33  * follows:
34  *
35  * P_ONLINE: the VCPU is up and running, taking interrupts and running threads
36  *
37  * P_OFFLINE: the VCPU is up and running, but quiesced (i.e. blocked in the
38  * hypervisor on the idle thread).  It must be up since a downed VCPU cannot
39  * receive interrupts, and we require this for offline CPUs in Solaris.
40  *
41  * P_POWEROFF: the VCPU is down (we never called xen_vcpu_up(), or called
42  * xen_vcpu_down() for it).  It can't take interrupts or run anything, though
43  * if it has run previously, its software state (cpu_t, machcpu structures, IPI
44  * event channels, etc.) will still exist.
45  *
46  * The hypervisor has two notions of CPU states as represented in the store:
47  *
48  * "offline": the VCPU is down.  Corresponds to P_POWEROFF.
49  *
50  * "online": the VCPU is running.  Corresponds to a CPU state other than
51  * P_POWEROFF.
52  *
53  * Currently, only a notification via xenstore can bring a CPU into a
54  * P_POWEROFF state, and only the domain can change between P_ONLINE, P_NOINTR,
55  * P_OFFLINE, etc.  We need to be careful to treat xenstore notifications
56  * idempotently, as we'll get 'duplicate' entries when we resume a domain.
57  *
58  * Note that the xenstore configuration is strictly advisory, in that a domain
59  * can choose to ignore it and still power up a VCPU in the offline state. To
60  * play nice, we don't allow it. Thus, any attempt to power on/off a CPU is
61  * ENOTSUP from within Solaris.
62  *
63  * Powering off a VCPU and suspending the domain use similar code. The
64  * difficulty here is that we must ensure that each VCPU is in a stable
65  * state: it must have a saved PCB, and not be responding to interrupts
66  * (since we are just about to remove its ability to run on a real CPU,
67  * possibly forever).  However, an offline CPU in Solaris can take
68  * cross-call interrupts, as mentioned, so we must go through a
69  * two-stage process.  First, we use the standard Solaris pause_cpus().
70  * This ensures that all CPUs are either in mach_cpu_pause() or
71  * mach_cpu_idle(), and nothing will cross-call them.
72  *
 * Powered-off CPUs are already safe: we hold the cpu_lock needed to
 * bring them back up, and they are in state CPU_PHASE_POWERED_OFF.
75  *
76  * Running CPUs are spinning in mach_cpu_pause() waiting for either
77  * PAUSE_IDLE or CPU_PHASE_WAIT_SAFE.
78  *
79  * Offline CPUs are either running the idle thread and periodically
80  * checking for CPU_PHASE_WAIT_SAFE, or blocked in the hypervisor.
81  *
82  * Thus, we set CPU_PHASE_WAIT_SAFE for every powered-on CPU, as well as
83  * poking them to make sure they're not blocked[1]. When every CPU has
84  * responded by reaching a safe state and setting CPU_PHASE_SAFE, we
85  * know we can suspend, or power-off a CPU, without problems.
86  *
87  * [1] note that we have to repeatedly poke offline CPUs: it's the only
88  * way to ensure that the CPU doesn't miss the state change before
89  * dropping into HYPERVISOR_block().
90  */
91 
92 #pragma ident	"%Z%%M%	%I%	%E% SMI"
93 
94 #include <sys/types.h>
95 #include <sys/systm.h>
96 #include <sys/param.h>
97 #include <sys/taskq.h>
98 #include <sys/cmn_err.h>
99 #include <sys/archsystm.h>
100 #include <sys/machsystm.h>
101 #include <sys/segments.h>
102 #include <sys/cpuvar.h>
103 #include <sys/x86_archext.h>
104 #include <sys/controlregs.h>
105 #include <sys/hypervisor.h>
106 #include <sys/xpv_panic.h>
107 #include <sys/mman.h>
108 #include <sys/psw.h>
109 #include <sys/cpu.h>
110 #include <sys/sunddi.h>
111 #include <util/sscanf.h>
112 #include <vm/hat_i86.h>
113 #include <vm/hat.h>
114 #include <vm/as.h>
115 
116 #include <xen/public/io/xs_wire.h>
117 #include <xen/sys/xenbus_impl.h>
118 #include <xen/public/vcpu.h>
119 
/*
 * Per-VCPU barrier phases, stored in cpu_phase[] and indexed by CPU id:
 *
 * CPU_PHASE_NONE: not taking part in a barrier; set by mp_leave_barrier()
 * and poweron_vcpu().
 *
 * CPU_PHASE_WAIT_SAFE: mp_enter_barrier() has asked the CPU to reach a
 * safe point.
 *
 * CPU_PHASE_SAFE: the CPU is spinning inside enter_safe_phase().
 *
 * CPU_PHASE_POWERED_OFF: poweroff_vcpu() has taken the VCPU down.
 */
#define	CPU_PHASE_NONE 0
#define	CPU_PHASE_WAIT_SAFE 1
#define	CPU_PHASE_SAFE 2
#define	CPU_PHASE_POWERED_OFF 3

/*
 * We can only poke CPUs during barrier enter 256 times a second at
 * most.  The value is an interval (1/256th of NANOSEC) that
 * mp_enter_barrier() compares against xpv_gethrtime() deltas.
 */
#define	POKE_TIMEOUT (NANOSEC / 256)

/* taskq on which vcpu_config() runs; created by mach_cpucontext_fini() */
static taskq_t *cpu_config_tq;
/* current barrier phase (CPU_PHASE_*) of each CPU, indexed by CPU id */
static int cpu_phase[NCPU];

/* forward declarations for routines defined later in this file */
static void vcpu_config_event(struct xenbus_watch *, const char **, uint_t);
static int xen_vcpu_initialize(processorid_t, vcpu_guest_context_t *);
136 
/*
 * Allocate whatever global state is needed while starting cpus.
 * Virtual cpus need no such state, so this always succeeds.
 *
 * Returns 0.
 */
int
mach_cpucontext_init(void)
{
	return (0);
}
146 
147 void
148 do_cpu_config_watch(int state)
149 {
150 	static struct xenbus_watch cpu_config_watch;
151 
152 	if (state != XENSTORE_UP)
153 		return;
154 	cpu_config_watch.node = "cpu";
155 	cpu_config_watch.callback = vcpu_config_event;
156 	if (register_xenbus_watch(&cpu_config_watch)) {
157 		taskq_destroy(cpu_config_tq);
158 		cmn_err(CE_WARN, "do_cpu_config_watch: "
159 		    "failed to set vcpu config watch");
160 	}
161 
162 }
163 
164 /*
165  * This routine is called after all the "normal" MP startup has
166  * been done; a good place to start watching xen store for virtual
167  * cpu hot plug events.
168  */
169 void
170 mach_cpucontext_fini(void)
171 {
172 
173 	cpu_config_tq = taskq_create("vcpu config taskq", 1,
174 	    maxclsyspri - 1, 1, 1, TASKQ_PREPOPULATE);
175 
176 	(void) xs_register_xenbus_callback(do_cpu_config_watch);
177 }
178 
/*
 * Fill in the remaining CPU context and initialize it.
 *
 * The caller supplies a vcpu_guest_context_t that already holds the
 * instruction/stack/frame pointers and kernel_sp.  This routine adds the
 * segment selectors, flags, trap table, GDT frame, control registers and
 * callback entry points, then hands the result to xen_vcpu_initialize().
 *
 * vgc:	partially filled guest context to complete
 * cp:	the cpu the context is being built for
 *
 * Returns the value of xen_vcpu_initialize() for cp->cpu_id.
 */
static int
mp_set_cpu_context(vcpu_guest_context_t *vgc, cpu_t *cp)
{
	uint_t vec, iopl;

	/* note: this overwrites any flags the caller may have set */
	vgc->flags = VGCF_IN_KERNEL;

	/*
	 * fpu_ctx we leave as zero; on first fault we'll store
	 * sse_initial into it anyway.
	 */

#if defined(__amd64)
	vgc->user_regs.cs = KCS_SEL | SEL_KPL;	/* force to ring 3 */
#else
	vgc->user_regs.cs = KCS_SEL;
#endif
	vgc->user_regs.ds = KDS_SEL;
	vgc->user_regs.es = KDS_SEL;
	vgc->user_regs.ss = KDS_SEL;
	vgc->kernel_ss = KDS_SEL;

	/*
	 * Allow I/O privilege level for Dom0 kernel.
	 */
	if (DOMAIN_IS_INITDOMAIN(xen_info))
		iopl = (PS_IOPL & 0x1000); /* ring 1 */
	else
		iopl = 0;

	/* the flags register gets F_OFF plus the iopl bits chosen above */
#if defined(__amd64)
	vgc->user_regs.fs = 0;
	vgc->user_regs.gs = 0;
	vgc->user_regs.rflags = F_OFF | iopl;
#elif defined(__i386)
	vgc->user_regs.fs = KFS_SEL;
	vgc->user_regs.gs = KGS_SEL;
	vgc->user_regs.eflags = F_OFF | iopl;
	/* both callbacks use the code selector set for user_regs above */
	vgc->event_callback_cs = vgc->user_regs.cs;
	vgc->failsafe_callback_cs = vgc->user_regs.cs;
#endif

	/*
	 * Initialize the trap_info_t from the IDT
	 */
#if !defined(__lint)
	ASSERT(NIDT == sizeof (vgc->trap_ctxt) / sizeof (vgc->trap_ctxt[0]));
#endif
	for (vec = 0; vec < NIDT; vec++) {
		trap_info_t *ti = &vgc->trap_ctxt[vec];

		/*
		 * cs and vector are only filled in here for entries
		 * where xen_idt_to_trap_info() returns 0.
		 */
		if (xen_idt_to_trap_info(vec,
		    &cp->cpu_m.mcpu_idt[vec], ti) == 0) {
			ti->cs = KCS_SEL;
			ti->vector = vec;
		}
	}

	/*
	 * No LDT
	 */

	/*
	 * (We assert in various places that the GDT is (a) aligned on a
	 * page boundary and (b) one page long, so this really should fit..)
	 */
#ifdef CRASH_XEN
	vgc->gdt_frames[0] = pa_to_ma(mmu_btop(cp->cpu_m.mcpu_gdtpa));
#else
	vgc->gdt_frames[0] = pfn_to_mfn(mmu_btop(cp->cpu_m.mcpu_gdtpa));
#endif
	vgc->gdt_ents = NGDT;

	/* cr0 is derived from the current cpu's cr0 */
	vgc->ctrlreg[0] = CR0_ENABLE_FPU_FLAGS(getcr0());

	/*
	 * cr3 is taken from the kernel hat's top-level table; only the
	 * 32-bit PAE case goes through xen_pfn_to_cr3().
	 */
#if defined(__i386)
	if (mmu.pae_hat)
		vgc->ctrlreg[3] =
		    xen_pfn_to_cr3(pfn_to_mfn(kas.a_hat->hat_htable->ht_pfn));
	else
#endif
		vgc->ctrlreg[3] =
		    pa_to_ma(mmu_ptob(kas.a_hat->hat_htable->ht_pfn));

	/* cr4 is copied from the current cpu */
	vgc->ctrlreg[4] = getcr4();

	/* entry points for event and failsafe callbacks */
	vgc->event_callback_eip = (uintptr_t)xen_callback;
	vgc->failsafe_callback_eip = (uintptr_t)xen_failsafe_callback;
	vgc->flags |= VGCF_failsafe_disables_events;

#if defined(__amd64)
	/*
	 * XXPV should this be moved to init_cpu_syscall?
	 */
	vgc->syscall_callback_eip = (uintptr_t)sys_syscall;
	vgc->flags |= VGCF_syscall_disables_events;

	/* the kernel gs base is the address of this cpu's cpu_t */
	ASSERT(vgc->user_regs.gs == 0);
	vgc->gs_base_kernel = (uintptr_t)cp;
#endif

	return (xen_vcpu_initialize(cp->cpu_id, vgc));
}
285 
/*
 * Create a guest virtual cpu context so that the virtual cpu
 * springs into life in the domain just about to call mp_startup()
 *
 * Virtual CPUs must be initialized once in the lifetime of the domain;
 * after that subsequent attempts to start them will fail with X_EEXIST.
 *
 * Thus 'alloc' -really- creates and initializes the virtual
 * CPU context just once. Once the initialisation succeeds, we never
 * free it, nor the regular cpu_t to which it refers.
 *
 * cp:	the cpu to create a context for; cp->cpu_thread supplies the
 *	initial pc, sp and stack base.
 *
 * Returns cp on success.  On failure, mach_cpucontext_free() is called
 * with the error and NULL is returned.
 */
void *
mach_cpucontext_alloc(struct cpu *cp)
{
	kthread_t *tp = cp->cpu_thread;
	vcpu_guest_context_t vgc;

	/* this initial value is always overwritten before it is read */
	int err = 1;

	/*
	 * First, augment the incoming cpu structure
	 * - vcpu pointer reference
	 * - pending event storage area
	 * - physical address of GDT
	 */
	cp->cpu_m.mcpu_vcpu_info =
	    &HYPERVISOR_shared_info->vcpu_info[cp->cpu_id];
	cp->cpu_m.mcpu_evt_pend = kmem_zalloc(
	    sizeof (struct xen_evt_data), KM_SLEEP);
	cp->cpu_m.mcpu_gdtpa =
	    mmu_ptob(hat_getpfnum(kas.a_hat, (caddr_t)cp->cpu_gdt));

	/* the GDT is made read-only before it is handed to the hypervisor */
	if ((err = xen_gdt_setprot(cp, PROT_READ)) != 0)
		goto done;

	/*
	 * Now set up the vcpu context so that we can start this vcpu
	 * in the kernel at tp->t_pc (mp_startup).  Note that the
	 * thread will thread_exit() shortly after performing the
	 * initialization; in particular, we will *never* take a
	 * privilege transition on this thread.
	 */

	bzero(&vgc, sizeof (vgc));

#ifdef __amd64
	vgc.user_regs.rip = tp->t_pc;
	vgc.user_regs.rsp = tp->t_sp;
	vgc.user_regs.rbp = tp->t_sp - 2 * sizeof (greg_t);
#else
	vgc.user_regs.eip = tp->t_pc;
	vgc.user_regs.esp = tp->t_sp;
	vgc.user_regs.ebp = tp->t_sp - 2 * sizeof (greg_t);
#endif
	/*
	 * XXPV	Fix resume, if Russ didn't already fix it.
	 *
	 * Note that resume unconditionally puts t->t_stk + sizeof (regs)
	 * into kernel_sp via HYPERVISOR_stack_switch. This anticipates
	 * that only lwps take traps that switch to the kernel stack;
	 * part of creating an lwp adjusts the stack by subtracting
	 * sizeof (struct regs) off t_stk.
	 *
	 * The more interesting question is, why do we do all the work
	 * of a fully fledged lwp for a plain thread?  In particular
	 * we don't have to call HYPERVISOR_stack_switch for lwp-less threads
	 * or futz with the LDT.  This should probably all be done with
	 * an lwp context operator to keep pure thread context switch fast.
	 */
	vgc.kernel_sp = (ulong_t)tp->t_stk;

	/* fill in the rest of the context and hand it to the hypervisor */
	err = mp_set_cpu_context(&vgc, cp);

done:
	if (err) {
		/* let mach_cpucontext_free() decide what can be undone */
		mach_cpucontext_free(cp, NULL, err);
		return (NULL);
	}
	return (cp);
}
366 
367 /*
368  * By the time we are called either we have successfully started
369  * the cpu, or our attempt to start it has failed.
370  */
371 
372 /*ARGSUSED*/
373 void
374 mach_cpucontext_free(struct cpu *cp, void *arg, int err)
375 {
376 	switch (err) {
377 	case 0:
378 		break;
379 	case ETIMEDOUT:
380 		/*
381 		 * The vcpu context is loaded into the hypervisor, and
382 		 * we've tried to start it, but the vcpu has not been set
383 		 * running yet, for whatever reason.  We arrange to -not-
384 		 * free any data structures it may be referencing.  In
385 		 * particular, we've already told the hypervisor about
386 		 * the GDT, and so we can't map it read-write again.
387 		 */
388 		break;
389 	default:
390 		(void) xen_gdt_setprot(cp, PROT_READ | PROT_WRITE);
391 		kmem_free(cp->cpu_m.mcpu_evt_pend,
392 		    sizeof (struct xen_evt_data));
393 		break;
394 	}
395 }
396 
/*
 * Reset this CPU's context.  Clear out any pending evtchn data, since event
 * channel numbers will all change when we resume.
 *
 * cp:	the cpu whose mcpu_evt_pend area is zeroed; the area must already
 *	have been allocated (see mach_cpucontext_alloc()).
 */
void
mach_cpucontext_reset(cpu_t *cp)
{
	bzero(cp->cpu_m.mcpu_evt_pend, sizeof (struct xen_evt_data));
	/* XXX open question: should mcpu_intr_pending be reset here too? */
}
407 
408 static void
409 pcb_to_user_regs(label_t *pcb, vcpu_guest_context_t *vgc)
410 {
411 #ifdef __amd64
412 	vgc->user_regs.rip = pcb->val[REG_LABEL_PC];
413 	vgc->user_regs.rsp = pcb->val[REG_LABEL_SP];
414 	vgc->user_regs.rbp = pcb->val[REG_LABEL_BP];
415 	vgc->user_regs.rbx = pcb->val[REG_LABEL_RBX];
416 	vgc->user_regs.r12 = pcb->val[REG_LABEL_R12];
417 	vgc->user_regs.r13 = pcb->val[REG_LABEL_R13];
418 	vgc->user_regs.r14 = pcb->val[REG_LABEL_R14];
419 	vgc->user_regs.r15 = pcb->val[REG_LABEL_R15];
420 #else /* __amd64 */
421 	vgc->user_regs.eip = pcb->val[REG_LABEL_PC];
422 	vgc->user_regs.esp = pcb->val[REG_LABEL_SP];
423 	vgc->user_regs.ebp = pcb->val[REG_LABEL_BP];
424 	vgc->user_regs.ebx = pcb->val[REG_LABEL_EBX];
425 	vgc->user_regs.esi = pcb->val[REG_LABEL_ESI];
426 	vgc->user_regs.edi = pcb->val[REG_LABEL_EDI];
427 #endif /* __amd64 */
428 }
429 
/*
 * Restore the context of a CPU during resume.  This context is always
 * inside enter_safe_phase(), below.
 *
 * A fresh guest context is built from the pcb that enter_safe_phase()
 * saved with setjmp(), adjusted so the cpu appears to return from that
 * setjmp() with a non-zero value, and passed to mp_set_cpu_context().
 *
 * cp:	the cpu to restore; its current thread must be its pause thread
 *	or its idle thread.
 */
void
mach_cpucontext_restore(cpu_t *cp)
{
	vcpu_guest_context_t vgc;
	int err;

	ASSERT(cp->cpu_thread == cp->cpu_pause_thread ||
	    cp->cpu_thread == cp->cpu_idle_thread);

	bzero(&vgc, sizeof (vgc));

	/* start from the registers saved in the thread's pcb */
	pcb_to_user_regs(&cp->cpu_thread->t_pcb, &vgc);

	/*
	 * We're emulating a longjmp() here: in particular, we need to bump the
	 * stack pointer to account for the pop of xIP that returning from
	 * longjmp() normally would do, and set the return value in xAX to 1.
	 */
#ifdef __amd64
	vgc.user_regs.rax = 1;
	vgc.user_regs.rsp += sizeof (ulong_t);
#else
	vgc.user_regs.eax = 1;
	vgc.user_regs.esp += sizeof (ulong_t);
#endif

	vgc.kernel_sp = cp->cpu_thread->t_sp;

	err = mp_set_cpu_context(&vgc, cp);

	/* a failure is only checked where ASSERT() is compiled in */
	ASSERT(err == 0);
}
466 
/*
 * Reach a point at which the CPU can be safely powered-off or
 * suspended.  Nothing can wake this CPU out of the loop.
 *
 * The current thread's pcb is saved with setjmp() and the cpu then
 * advertises CPU_PHASE_SAFE and spins until another cpu changes its
 * phase (see mp_leave_barrier()).  mach_cpucontext_restore() rebuilds a
 * cpu's context from that same pcb with a return value of 1, in which
 * case the spin loop is skipped.
 */
static void
enter_safe_phase(void)
{
	/* remember the interrupt state so it can be put back on the way out */
	ulong_t flags = intr_clear();

	if (setjmp(&curthread->t_pcb) == 0) {
		cpu_phase[CPU->cpu_id] = CPU_PHASE_SAFE;
		/* wait for our phase to be changed by another cpu */
		while (cpu_phase[CPU->cpu_id] == CPU_PHASE_SAFE)
			SMT_PAUSE();
	}

	ASSERT(!interrupts_enabled());

	intr_restore(flags);
}
486 
487 /*
488  * Offline CPUs run this code even under a pause_cpus(), so we must
489  * check if we need to enter the safe phase.
490  */
491 void
492 mach_cpu_idle(void)
493 {
494 	if (IN_XPV_PANIC()) {
495 		xpv_panic_halt();
496 	} else  {
497 		(void) HYPERVISOR_block();
498 		if (cpu_phase[CPU->cpu_id] == CPU_PHASE_WAIT_SAFE)
499 			enter_safe_phase();
500 	}
501 }
502 
503 /*
504  * Spin until either start_cpus() wakes us up, or we get a request to
505  * enter the safe phase (followed by a later start_cpus()).
506  */
507 void
508 mach_cpu_pause(volatile char *safe)
509 {
510 	*safe = PAUSE_WAIT;
511 	membar_enter();
512 
513 	while (*safe != PAUSE_IDLE) {
514 		if (cpu_phase[CPU->cpu_id] == CPU_PHASE_WAIT_SAFE)
515 			enter_safe_phase();
516 		SMT_PAUSE();
517 	}
518 }
519 
520 void
521 mach_cpu_halt(char *msg)
522 {
523 	if (msg)
524 		prom_printf("%s\n", msg);
525 	(void) xen_vcpu_down(CPU->cpu_id);
526 }
527 
/*
 * Powering a VCPU on from within the domain is not supported; power
 * state is only changed in response to xenstore (see vcpu_config()).
 *
 * Always returns ENOTSUP.
 */
/*ARGSUSED*/
int
mp_cpu_poweron(struct cpu *cp)
{
	return (ENOTSUP);
}
534 
/*
 * Powering a VCPU off from within the domain is not supported; power
 * state is only changed in response to xenstore (see vcpu_config()).
 *
 * Always returns ENOTSUP.
 */
/*ARGSUSED*/
int
mp_cpu_poweroff(struct cpu *cp)
{
	return (ENOTSUP);
}
541 
542 void
543 mp_enter_barrier(void)
544 {
545 	hrtime_t last_poke_time = 0;
546 	int poke_allowed = 0;
547 	int done = 0;
548 	int i;
549 
550 	ASSERT(MUTEX_HELD(&cpu_lock));
551 
552 	pause_cpus(NULL);
553 
554 	while (!done) {
555 		done = 1;
556 		poke_allowed = 0;
557 
558 		if (xpv_gethrtime() - last_poke_time > POKE_TIMEOUT) {
559 			last_poke_time = xpv_gethrtime();
560 			poke_allowed = 1;
561 		}
562 
563 		for (i = 0; i < NCPU; i++) {
564 			cpu_t *cp = cpu_get(i);
565 
566 			if (cp == NULL || cp == CPU)
567 				continue;
568 
569 			switch (cpu_phase[i]) {
570 			case CPU_PHASE_NONE:
571 				cpu_phase[i] = CPU_PHASE_WAIT_SAFE;
572 				poke_cpu(i);
573 				done = 0;
574 				break;
575 
576 			case CPU_PHASE_WAIT_SAFE:
577 				if (poke_allowed)
578 					poke_cpu(i);
579 				done = 0;
580 				break;
581 
582 			case CPU_PHASE_SAFE:
583 			case CPU_PHASE_POWERED_OFF:
584 				break;
585 			}
586 		}
587 
588 		SMT_PAUSE();
589 	}
590 }
591 
592 void
593 mp_leave_barrier(void)
594 {
595 	int i;
596 
597 	ASSERT(MUTEX_HELD(&cpu_lock));
598 
599 	for (i = 0; i < NCPU; i++) {
600 		cpu_t *cp = cpu_get(i);
601 
602 		if (cp == NULL || cp == CPU)
603 			continue;
604 
605 		switch (cpu_phase[i]) {
606 		/*
607 		 * If we see a CPU in one of these phases, something has
608 		 * gone badly wrong with the guarantees
609 		 * mp_enter_barrier() is supposed to provide.  Rather
610 		 * than attempt to stumble along (and since we can't
611 		 * panic properly in this context), we tell the
612 		 * hypervisor we've crashed.
613 		 */
614 		case CPU_PHASE_NONE:
615 		case CPU_PHASE_WAIT_SAFE:
616 			(void) HYPERVISOR_shutdown(SHUTDOWN_crash);
617 			break;
618 
619 		case CPU_PHASE_POWERED_OFF:
620 			break;
621 
622 		case CPU_PHASE_SAFE:
623 			cpu_phase[i] = CPU_PHASE_NONE;
624 		}
625 	}
626 
627 	start_cpus();
628 }
629 
630 static int
631 poweroff_vcpu(struct cpu *cp)
632 {
633 	int error;
634 
635 	ASSERT(MUTEX_HELD(&cpu_lock));
636 
637 	ASSERT(CPU->cpu_id != cp->cpu_id);
638 	ASSERT(cp->cpu_flags & CPU_QUIESCED);
639 
640 	mp_enter_barrier();
641 
642 	if ((error = xen_vcpu_down(cp->cpu_id)) == 0) {
643 		ASSERT(cpu_phase[cp->cpu_id] == CPU_PHASE_SAFE);
644 
645 		CPUSET_DEL(cpu_ready_set, cp->cpu_id);
646 
647 		cp->cpu_flags |= CPU_POWEROFF | CPU_OFFLINE;
648 		cp->cpu_flags &=
649 		    ~(CPU_RUNNING | CPU_READY | CPU_EXISTS | CPU_ENABLE);
650 
651 		cpu_phase[cp->cpu_id] = CPU_PHASE_POWERED_OFF;
652 
653 		cpu_set_state(cp);
654 	}
655 
656 	mp_leave_barrier();
657 
658 	return (error);
659 }
660 
661 static int
662 vcpu_config_poweroff(processorid_t id)
663 {
664 	int oldstate;
665 	int error;
666 	cpu_t *cp;
667 
668 	mutex_enter(&cpu_lock);
669 
670 	if ((cp = cpu_get(id)) == NULL) {
671 		mutex_exit(&cpu_lock);
672 		return (ESRCH);
673 	}
674 
675 	if (cpu_get_state(cp) == P_POWEROFF) {
676 		mutex_exit(&cpu_lock);
677 		return (0);
678 	}
679 
680 	mutex_exit(&cpu_lock);
681 
682 	do {
683 		error = p_online_internal(id, P_OFFLINE,
684 		    &oldstate);
685 
686 		if (error != 0)
687 			break;
688 
689 		/*
690 		 * So we just changed it to P_OFFLINE.  But then we dropped
691 		 * cpu_lock, so now it is possible for another thread to change
692 		 * the cpu back to a different, non-quiesced state e.g.
693 		 * P_ONLINE.
694 		 */
695 		mutex_enter(&cpu_lock);
696 		if ((cp = cpu_get(id)) == NULL)
697 			error = ESRCH;
698 		else {
699 			if (cp->cpu_flags & CPU_QUIESCED)
700 				error = poweroff_vcpu(cp);
701 			else
702 				error = EBUSY;
703 		}
704 		mutex_exit(&cpu_lock);
705 	} while (error == EBUSY);
706 
707 	return (error);
708 }
709 
710 /*
711  * Add a new virtual cpu to the domain.
712  */
713 static int
714 vcpu_config_new(processorid_t id)
715 {
716 	extern int start_cpu(processorid_t);
717 	int error;
718 
719 	if (ncpus == 1) {
720 		printf("cannot (yet) add cpus to a single-cpu domain\n");
721 		return (ENOTSUP);
722 	}
723 
724 	affinity_set(CPU_CURRENT);
725 	error = start_cpu(id);
726 	affinity_clear();
727 	return (error);
728 }
729 
730 static int
731 poweron_vcpu(struct cpu *cp)
732 {
733 	int error;
734 
735 	ASSERT(MUTEX_HELD(&cpu_lock));
736 
737 	if (HYPERVISOR_vcpu_op(VCPUOP_is_up, cp->cpu_id, NULL) != 0) {
738 		printf("poweron_vcpu: vcpu%d is not available!\n",
739 		    cp->cpu_id);
740 		return (ENXIO);
741 	}
742 
743 	if ((error = xen_vcpu_up(cp->cpu_id)) == 0) {
744 		CPUSET_ADD(cpu_ready_set, cp->cpu_id);
745 		cp->cpu_flags |= CPU_EXISTS | CPU_READY | CPU_RUNNING;
746 		cp->cpu_flags &= ~CPU_POWEROFF;
747 		/*
748 		 * There are some nasty races possible here.
749 		 * Tell the vcpu it's up one more time.
750 		 * XXPV	Is this enough?  Is this safe?
751 		 */
752 		(void) xen_vcpu_up(cp->cpu_id);
753 
754 		cpu_phase[cp->cpu_id] = CPU_PHASE_NONE;
755 
756 		cpu_set_state(cp);
757 	}
758 	return (error);
759 }
760 
761 static int
762 vcpu_config_poweron(processorid_t id)
763 {
764 	cpu_t *cp;
765 	int oldstate;
766 	int error;
767 
768 	if (id >= ncpus)
769 		return (vcpu_config_new(id));
770 
771 	mutex_enter(&cpu_lock);
772 
773 	if ((cp = cpu_get(id)) == NULL) {
774 		mutex_exit(&cpu_lock);
775 		return (ESRCH);
776 	}
777 
778 	if (cpu_get_state(cp) != P_POWEROFF) {
779 		mutex_exit(&cpu_lock);
780 		return (0);
781 	}
782 
783 	if ((error = poweron_vcpu(cp)) != 0) {
784 		mutex_exit(&cpu_lock);
785 		return (error);
786 	}
787 
788 	mutex_exit(&cpu_lock);
789 
790 	return (p_online_internal(id, P_ONLINE, &oldstate));
791 }
792 
793 #define	REPORT_LEN	128
794 
795 static void
796 vcpu_config_report(processorid_t id, uint_t newstate, int error)
797 {
798 	char *report = kmem_alloc(REPORT_LEN, KM_SLEEP);
799 	size_t len;
800 	char *ps;
801 
802 	switch (newstate) {
803 	case P_ONLINE:
804 		ps = PS_ONLINE;
805 		break;
806 	case P_POWEROFF:
807 		ps = PS_POWEROFF;
808 		break;
809 	default:
810 		cmn_err(CE_PANIC, "unknown state %u\n", newstate);
811 		break;
812 	}
813 
814 	len = snprintf(report, REPORT_LEN,
815 	    "cpu%d: externally initiated %s", id, ps);
816 
817 	if (!error) {
818 		cmn_err(CE_CONT, "!%s\n", report);
819 		kmem_free(report, REPORT_LEN);
820 		return;
821 	}
822 
823 	len += snprintf(report + len, REPORT_LEN - len,
824 	    " failed, error %d: ", error);
825 	switch (error) {
826 	case EEXIST:
827 		len += snprintf(report + len, REPORT_LEN - len,
828 		    "cpu already %s", ps ? ps : "?");
829 		break;
830 	case ESRCH:
831 		len += snprintf(report + len, REPORT_LEN - len,
832 		    "cpu not found");
833 		break;
834 	case EINVAL:
835 	case EALREADY:
836 		break;
837 	case EPERM:
838 		len += snprintf(report + len, REPORT_LEN - len,
839 		    "insufficient privilege (0x%x)", id);
840 		break;
841 	case EBUSY:
842 		switch (newstate) {
843 		case P_ONLINE:
844 			/*
845 			 * This return comes from mp_cpu_start -
846 			 * we cannot 'start' the boot CPU.
847 			 */
848 			len += snprintf(report + len, REPORT_LEN - len,
849 			    "already running");
850 			break;
851 		case P_POWEROFF:
852 			len += snprintf(report + len, REPORT_LEN - len,
853 			    "bound lwps?");
854 			break;
855 		default:
856 			break;
857 		}
858 	default:
859 		break;
860 	}
861 
862 	cmn_err(CE_CONT, "%s\n", report);
863 	kmem_free(report, REPORT_LEN);
864 }
865 
866 static void
867 vcpu_config(void *arg)
868 {
869 	int id = (int)(uintptr_t)arg;
870 	int error;
871 	char dir[16];
872 	char *state;
873 
874 	if ((uint_t)id >= max_ncpus) {
875 		cmn_err(CE_WARN,
876 		    "vcpu_config: cpu%d does not fit in this domain", id);
877 		return;
878 	}
879 
880 	(void) snprintf(dir, sizeof (dir), "cpu/%d", id);
881 	state = kmem_alloc(MAXPATHLEN, KM_SLEEP);
882 	if (xenbus_scanf(XBT_NULL, dir, "availability", "%s", state) == 0) {
883 		if (strcmp(state, "online") == 0) {
884 			error = vcpu_config_poweron(id);
885 			vcpu_config_report(id, P_ONLINE, error);
886 		} else if (strcmp(state, "offline") == 0) {
887 			error = vcpu_config_poweroff(id);
888 			vcpu_config_report(id, P_POWEROFF, error);
889 		} else {
890 			cmn_err(CE_WARN,
891 			    "cpu%d: unknown target state '%s'", id, state);
892 		}
893 	} else
894 		cmn_err(CE_WARN,
895 		    "cpu%d: unable to read target state from xenstore", id);
896 
897 	kmem_free(state, MAXPATHLEN);
898 }
899 
900 /*ARGSUSED*/
901 static void
902 vcpu_config_event(struct xenbus_watch *watch, const char **vec, uint_t len)
903 {
904 	const char *path = vec[XS_WATCH_PATH];
905 	processorid_t id;
906 	char *s;
907 
908 	if ((s = strstr(path, "cpu/")) != NULL &&
909 	    sscanf(s, "cpu/%d", &id) == 1) {
910 		/*
911 		 * Run the virtual CPU configuration on a separate thread to
912 		 * avoid blocking on this event for too long (and for now,
913 		 * to ensure configuration requests are serialized.)
914 		 */
915 		(void) taskq_dispatch(cpu_config_tq,
916 		    vcpu_config, (void *)(uintptr_t)id, 0);
917 	}
918 }
919 
920 static int
921 xen_vcpu_initialize(processorid_t id, vcpu_guest_context_t *vgc)
922 {
923 	int err;
924 
925 	if ((err = HYPERVISOR_vcpu_op(VCPUOP_initialise, id, vgc)) != 0) {
926 		char *str;
927 		int level = CE_WARN;
928 
929 		switch (err) {
930 		case -X_EINVAL:
931 			/*
932 			 * This interface squashes multiple error sources
933 			 * to one error code.  In particular, an X_EINVAL
934 			 * code can mean:
935 			 *
936 			 * -	the vcpu id is out of range
937 			 * -	cs or ss are in ring 0
938 			 * -	cr3 is wrong
939 			 * -	an entry in the new gdt is above the
940 			 *	reserved entry
941 			 * -	a frame underneath the new gdt is bad
942 			 */
943 			str = "something is wrong :(";
944 			break;
945 		case -X_ENOENT:
946 			str = "no such cpu";
947 			break;
948 		case -X_ENOMEM:
949 			str = "no mem to copy ctxt";
950 			break;
951 		case -X_EFAULT:
952 			str = "bad address";
953 			break;
954 		case -X_EEXIST:
955 			/*
956 			 * Hmm.  This error is returned if the vcpu has already
957 			 * been initialized once before in the lifetime of this
958 			 * domain.  This is a logic error in the kernel.
959 			 */
960 			level = CE_PANIC;
961 			str = "already initialized";
962 			break;
963 		default:
964 			level = CE_PANIC;
965 			str = "<unexpected>";
966 			break;
967 		}
968 
969 		cmn_err(level, "vcpu%d: failed to init: error %d: %s",
970 		    id, -err, str);
971 	}
972 	return (err);
973 }
974 
975 long
976 xen_vcpu_up(processorid_t id)
977 {
978 	long err;
979 
980 	if ((err = HYPERVISOR_vcpu_op(VCPUOP_up, id, NULL)) != 0) {
981 		char *str;
982 
983 		switch (err) {
984 		case -X_ENOENT:
985 			str = "no such cpu";
986 			break;
987 		case -X_EINVAL:
988 			/*
989 			 * Perhaps this is diagnostic overkill.
990 			 */
991 			if (HYPERVISOR_vcpu_op(VCPUOP_is_up, id, NULL) < 0)
992 				str = "bad cpuid";
993 			else
994 				str = "not initialized";
995 			break;
996 		default:
997 			str = "<unexpected>";
998 			break;
999 		}
1000 
1001 		printf("vcpu%d: failed to start: error %d: %s\n",
1002 		    id, -(int)err, str);
1003 		return (EBFONT);	/* deliberately silly */
1004 	}
1005 	return (err);
1006 }
1007 
1008 long
1009 xen_vcpu_down(processorid_t id)
1010 {
1011 	long err;
1012 
1013 	if ((err = HYPERVISOR_vcpu_op(VCPUOP_down, id, NULL)) != 0) {
1014 		/*
1015 		 * X_ENOENT:	no such cpu
1016 		 * X_EINVAL:	bad cpuid
1017 		 */
1018 		panic("vcpu%d: failed to stop: error %d", id, -(int)err);
1019 	}
1020 
1021 	return (err);
1022 }
1023