xref: /illumos-gate/usr/src/uts/i86xpv/os/mp_xen.c (revision ed093b41a93e8563e6e1e5dae0768dda2a7bcc27)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 /*
28  * Copyright 2019 Joyent, Inc.
29  */
30 
31 /*
32  * Virtual CPU management.
33  *
34  * VCPUs can be controlled in one of two ways; through the domain itself
35  * (psradm, p_online(), etc.), and via changes in xenstore (vcpu_config()).
36  * Unfortunately, the terminology is used in different ways; they work out as
37  * follows:
38  *
39  * P_ONLINE: the VCPU is up and running, taking interrupts and running threads
40  *
41  * P_OFFLINE: the VCPU is up and running, but quiesced (i.e. blocked in the
42  * hypervisor on the idle thread).  It must be up since a downed VCPU cannot
43  * receive interrupts, and we require this for offline CPUs in Solaris.
44  *
45  * P_POWEROFF: the VCPU is down (we never called xen_vcpu_up(), or called
46  * xen_vcpu_down() for it).  It can't take interrupts or run anything, though
47  * if it has run previously, its software state (cpu_t, machcpu structures, IPI
48  * event channels, etc.) will still exist.
49  *
50  * The hypervisor has two notions of CPU states as represented in the store:
51  *
52  * "offline": the VCPU is down.  Corresponds to P_POWEROFF.
53  *
54  * "online": the VCPU is running.  Corresponds to a CPU state other than
55  * P_POWEROFF.
56  *
57  * Currently, only a notification via xenstore can bring a CPU into a
58  * P_POWEROFF state, and only the domain can change between P_ONLINE, P_NOINTR,
59  * P_OFFLINE, etc.  We need to be careful to treat xenstore notifications
60  * idempotently, as we'll get 'duplicate' entries when we resume a domain.
61  *
62  * Note that the xenstore configuration is strictly advisory, in that a domain
63  * can choose to ignore it and still power up a VCPU in the offline state. To
64  * play nice, we don't allow it. Thus, any attempt to power on/off a CPU is
65  * ENOTSUP from within Solaris.
66  *
67  * Powering off a VCPU and suspending the domain use similar code. The
68  * difficulty here is that we must ensure that each VCPU is in a stable
69  * state: it must have a saved PCB, and not be responding to interrupts
70  * (since we are just about to remove its ability to run on a real CPU,
71  * possibly forever).  However, an offline CPU in Solaris can take
72  * cross-call interrupts, as mentioned, so we must go through a
73  * two-stage process.  First, we use the standard Solaris pause_cpus().
74  * This ensures that all CPUs are either in mach_cpu_pause() or
75  * mach_cpu_idle(), and nothing will cross-call them.
76  *
 * Powered-off CPUs are already safe: we own the cpu_lock needed to
 * bring them back up, and they are in state CPU_PHASE_POWERED_OFF.
79  *
80  * Running CPUs are spinning in mach_cpu_pause() waiting for either
81  * PAUSE_IDLE or CPU_PHASE_WAIT_SAFE.
82  *
83  * Offline CPUs are either running the idle thread and periodically
84  * checking for CPU_PHASE_WAIT_SAFE, or blocked in the hypervisor.
85  *
86  * Thus, we set CPU_PHASE_WAIT_SAFE for every powered-on CPU, as well as
87  * poking them to make sure they're not blocked[1]. When every CPU has
88  * responded by reaching a safe state and setting CPU_PHASE_SAFE, we
89  * know we can suspend, or power-off a CPU, without problems.
90  *
91  * [1] note that we have to repeatedly poke offline CPUs: it's the only
92  * way to ensure that the CPU doesn't miss the state change before
93  * dropping into HYPERVISOR_block().
94  */
95 
96 #include <sys/types.h>
97 #include <sys/systm.h>
98 #include <sys/param.h>
99 #include <sys/taskq.h>
100 #include <sys/cmn_err.h>
101 #include <sys/archsystm.h>
102 #include <sys/machsystm.h>
103 #include <sys/segments.h>
104 #include <sys/cpuvar.h>
105 #include <sys/x86_archext.h>
106 #include <sys/controlregs.h>
107 #include <sys/hypervisor.h>
108 #include <sys/xpv_panic.h>
109 #include <sys/mman.h>
110 #include <sys/psw.h>
111 #include <sys/cpu.h>
112 #include <sys/sunddi.h>
113 #include <util/sscanf.h>
114 #include <vm/hat_i86.h>
115 #include <vm/hat.h>
116 #include <vm/as.h>
117 
118 #include <xen/public/io/xs_wire.h>
119 #include <xen/sys/xenbus_impl.h>
120 #include <xen/public/vcpu.h>
121 
extern cpuset_t cpu_ready_set;

/*
 * Per-CPU barrier phases, stored in cpu_phase[]:
 *
 * CPU_PHASE_NONE: normal operation; set by mp_leave_barrier() and
 * poweron_vcpu().
 * CPU_PHASE_WAIT_SAFE: mp_enter_barrier() has asked the CPU to enter
 * the safe phase.
 * CPU_PHASE_SAFE: the CPU has saved its context and is spinning in
 * enter_safe_phase().
 * CPU_PHASE_POWERED_OFF: the vcpu was taken down by poweroff_vcpu().
 */
#define	CPU_PHASE_NONE 0
#define	CPU_PHASE_WAIT_SAFE 1
#define	CPU_PHASE_SAFE 2
#define	CPU_PHASE_POWERED_OFF 3

/*
 * We can only poke CPUs during barrier enter 256 times a second at
 * most.
 */
#define	POKE_TIMEOUT (NANOSEC / 256)

/* Task queue on which vcpu_config() requests are run. */
static taskq_t *cpu_config_tq;
/* Current barrier phase of each CPU, indexed by CPU id. */
static int cpu_phase[NCPU];

static void vcpu_config_event(struct xenbus_watch *, const char **, uint_t);
static int xen_vcpu_initialize(processorid_t, vcpu_guest_context_t *);
140 
141 /*
142  * Return whether or not the vcpu is actually running on a pcpu
143  */
144 int
145 vcpu_on_pcpu(processorid_t cpu)
146 {
147 	struct vcpu_runstate_info runstate;
148 	int	ret = VCPU_STATE_UNKNOWN;
149 
150 	ASSERT(cpu < NCPU);
151 	/*
152 	 * Don't bother with hypercall if we are asking about ourself
153 	 */
154 	if (cpu == CPU->cpu_id)
155 		return (VCPU_ON_PCPU);
156 	if (HYPERVISOR_vcpu_op(VCPUOP_get_runstate_info, cpu, &runstate) != 0)
157 		goto out;
158 
159 	switch (runstate.state) {
160 	case RUNSTATE_running:
161 		ret = VCPU_ON_PCPU;
162 		break;
163 
164 	case RUNSTATE_runnable:
165 	case RUNSTATE_offline:
166 	case RUNSTATE_blocked:
167 		ret = VCPU_NOT_ON_PCPU;
168 		break;
169 
170 	default:
171 		break;
172 	}
173 
174 out:
175 	return (ret);
176 }
177 
/*
 * Hook for allocating global state needed while starting cpus.
 * Virtual cpus need none, so this always succeeds.
 *
 * Returns 0.
 */
int
mach_cpucontext_init(void)
{
	/* Nothing to set up. */
	return (0);
}
187 
188 void
189 do_cpu_config_watch(int state)
190 {
191 	static struct xenbus_watch cpu_config_watch;
192 
193 	if (state != XENSTORE_UP)
194 		return;
195 	cpu_config_watch.node = "cpu";
196 	cpu_config_watch.callback = vcpu_config_event;
197 	if (register_xenbus_watch(&cpu_config_watch)) {
198 		taskq_destroy(cpu_config_tq);
199 		cmn_err(CE_WARN, "do_cpu_config_watch: "
200 		    "failed to set vcpu config watch");
201 	}
202 
203 }
204 
205 /*
206  * This routine is called after all the "normal" MP startup has
207  * been done; a good place to start watching xen store for virtual
208  * cpu hot plug events.
209  */
210 void
211 mach_cpucontext_fini(void)
212 {
213 
214 	cpu_config_tq = taskq_create("vcpu config taskq", 1,
215 	    maxclsyspri - 1, 1, 1, TASKQ_PREPOPULATE);
216 
217 	(void) xs_register_xenbus_callback(do_cpu_config_watch);
218 }
219 
220 /*
221  * Fill in the remaining CPU context and initialize it.
222  */
223 static int
224 mp_set_cpu_context(vcpu_guest_context_t *vgc, cpu_t *cp)
225 {
226 	uint_t vec, iopl;
227 
228 	vgc->flags = VGCF_IN_KERNEL;
229 
230 	/*
231 	 * fpu_ctx we leave as zero; on first fault we'll store
232 	 * sse_initial into it anyway.
233 	 */
234 
235 	vgc->user_regs.cs = KCS_SEL | SEL_KPL;	/* force to ring 3 */
236 	vgc->user_regs.ds = KDS_SEL;
237 	vgc->user_regs.es = KDS_SEL;
238 	vgc->user_regs.ss = KDS_SEL;
239 	vgc->kernel_ss = KDS_SEL;
240 
241 	/*
242 	 * Allow I/O privilege level for Dom0 kernel.
243 	 */
244 	if (DOMAIN_IS_INITDOMAIN(xen_info))
245 		iopl = (PS_IOPL & 0x1000); /* ring 1 */
246 	else
247 		iopl = 0;
248 
249 	vgc->user_regs.fs = 0;
250 	vgc->user_regs.gs = 0;
251 	vgc->user_regs.rflags = F_OFF | iopl;
252 
253 	/*
254 	 * Initialize the trap_info_t from the IDT
255 	 */
256 #if !defined(__lint)
257 	ASSERT(NIDT == sizeof (vgc->trap_ctxt) / sizeof (vgc->trap_ctxt[0]));
258 #endif
259 	for (vec = 0; vec < NIDT; vec++) {
260 		trap_info_t *ti = &vgc->trap_ctxt[vec];
261 
262 		if (xen_idt_to_trap_info(vec,
263 		    &cp->cpu_m.mcpu_idt[vec], ti) == 0) {
264 			ti->cs = KCS_SEL;
265 			ti->vector = vec;
266 		}
267 	}
268 
269 	/*
270 	 * No LDT
271 	 */
272 
273 	/*
274 	 * (We assert in various places that the GDT is (a) aligned on a
275 	 * page boundary and (b) one page long, so this really should fit..)
276 	 */
277 #ifdef CRASH_XEN
278 	vgc->gdt_frames[0] = pa_to_ma(mmu_btop(cp->cpu_m.mcpu_gdtpa));
279 #else
280 	vgc->gdt_frames[0] = pfn_to_mfn(mmu_btop(cp->cpu_m.mcpu_gdtpa));
281 #endif
282 	vgc->gdt_ents = NGDT;
283 
284 	vgc->ctrlreg[0] = CR0_ENABLE_FPU_FLAGS(getcr0());
285 
286 	vgc->ctrlreg[3] =
287 	    pa_to_ma(mmu_ptob(kas.a_hat->hat_htable->ht_pfn));
288 
289 	vgc->ctrlreg[4] = getcr4();
290 
291 	vgc->event_callback_eip = (uintptr_t)xen_callback;
292 	vgc->failsafe_callback_eip = (uintptr_t)xen_failsafe_callback;
293 	vgc->flags |= VGCF_failsafe_disables_events;
294 
295 	/*
296 	 * XXPV should this be moved to init_cpu_syscall?
297 	 */
298 	vgc->syscall_callback_eip = (uintptr_t)sys_syscall;
299 	vgc->flags |= VGCF_syscall_disables_events;
300 
301 	ASSERT(vgc->user_regs.gs == 0);
302 	vgc->gs_base_kernel = (uintptr_t)cp;
303 
304 	return (xen_vcpu_initialize(cp->cpu_id, vgc));
305 }
306 
307 /*
308  * Create a guest virtual cpu context so that the virtual cpu
309  * springs into life in the domain just about to call mp_startup()
310  *
311  * Virtual CPUs must be initialized once in the lifetime of the domain;
312  * after that subsequent attempts to start them will fail with X_EEXIST.
313  *
314  * Thus 'alloc' -really- creates and initializes the virtual
315  * CPU context just once. Once the initialisation succeeds, we never
316  * free it, nor the regular cpu_t to which it refers.
317  */
318 void *
319 mach_cpucontext_alloc(struct cpu *cp)
320 {
321 	kthread_t *tp = cp->cpu_thread;
322 	vcpu_guest_context_t vgc;
323 
324 	int err = 1;
325 
326 	/*
327 	 * First, augment the incoming cpu structure
328 	 * - vcpu pointer reference
329 	 * - pending event storage area
330 	 * - physical address of GDT
331 	 */
332 	cp->cpu_m.mcpu_vcpu_info =
333 	    &HYPERVISOR_shared_info->vcpu_info[cp->cpu_id];
334 	cp->cpu_m.mcpu_evt_pend = kmem_zalloc(
335 	    sizeof (struct xen_evt_data), KM_SLEEP);
336 	cp->cpu_m.mcpu_gdtpa =
337 	    mmu_ptob(hat_getpfnum(kas.a_hat, (caddr_t)cp->cpu_gdt));
338 
339 	if ((err = xen_gdt_setprot(cp, PROT_READ)) != 0)
340 		goto done;
341 
342 	/*
343 	 * Now set up the vcpu context so that we can start this vcpu
344 	 * in the kernel at tp->t_pc (mp_startup).  Note that the
345 	 * thread will thread_exit() shortly after performing the
346 	 * initialization; in particular, we will *never* take a
347 	 * privilege transition on this thread.
348 	 */
349 
350 	bzero(&vgc, sizeof (vgc));
351 
352 	vgc.user_regs.rip = tp->t_pc;
353 	vgc.user_regs.rsp = tp->t_sp;
354 	vgc.user_regs.rbp = tp->t_sp - 2 * sizeof (greg_t);
355 	/*
356 	 * XXPV	Fix resume, if Russ didn't already fix it.
357 	 *
358 	 * Note that resume unconditionally puts t->t_stk + sizeof (regs)
359 	 * into kernel_sp via HYPERVISOR_stack_switch. This anticipates
360 	 * that only lwps take traps that switch to the kernel stack;
361 	 * part of creating an lwp adjusts the stack by subtracting
362 	 * sizeof (struct regs) off t_stk.
363 	 *
364 	 * The more interesting question is, why do we do all the work
365 	 * of a fully fledged lwp for a plain thread?  In particular
366 	 * we don't have to call HYPERVISOR_stack_switch for lwp-less threads
367 	 * or futz with the LDT.  This should probably all be done with
368 	 * an lwp context operator to keep pure thread context switch fast.
369 	 */
370 	vgc.kernel_sp = (ulong_t)tp->t_stk;
371 
372 	err = mp_set_cpu_context(&vgc, cp);
373 
374 done:
375 	if (err) {
376 		mach_cpucontext_free(cp, NULL, err);
377 		return (NULL);
378 	}
379 	return (cp);
380 }
381 
382 /*
383  * By the time we are called either we have successfully started
384  * the cpu, or our attempt to start it has failed.
385  */
386 
387 /*ARGSUSED*/
388 void
389 mach_cpucontext_free(struct cpu *cp, void *arg, int err)
390 {
391 	switch (err) {
392 	case 0:
393 		break;
394 	case ETIMEDOUT:
395 		/*
396 		 * The vcpu context is loaded into the hypervisor, and
397 		 * we've tried to start it, but the vcpu has not been set
398 		 * running yet, for whatever reason.  We arrange to -not-
399 		 * free any data structures it may be referencing.  In
400 		 * particular, we've already told the hypervisor about
401 		 * the GDT, and so we can't map it read-write again.
402 		 */
403 		break;
404 	default:
405 		(void) xen_gdt_setprot(cp, PROT_READ | PROT_WRITE);
406 		kmem_free(cp->cpu_m.mcpu_evt_pend,
407 		    sizeof (struct xen_evt_data));
408 		break;
409 	}
410 }
411 
412 /*
413  * Reset this CPU's context.  Clear out any pending evtchn data, since event
414  * channel numbers will all change when we resume.
415  */
416 void
417 mach_cpucontext_reset(cpu_t *cp)
418 {
419 	bzero(cp->cpu_m.mcpu_evt_pend, sizeof (struct xen_evt_data));
420 	/* mcpu_intr_pending ? */
421 }
422 
423 static void
424 pcb_to_user_regs(label_t *pcb, vcpu_guest_context_t *vgc)
425 {
426 	vgc->user_regs.rip = pcb->val[REG_LABEL_PC];
427 	vgc->user_regs.rsp = pcb->val[REG_LABEL_SP];
428 	vgc->user_regs.rbp = pcb->val[REG_LABEL_BP];
429 	vgc->user_regs.rbx = pcb->val[REG_LABEL_RBX];
430 	vgc->user_regs.r12 = pcb->val[REG_LABEL_R12];
431 	vgc->user_regs.r13 = pcb->val[REG_LABEL_R13];
432 	vgc->user_regs.r14 = pcb->val[REG_LABEL_R14];
433 	vgc->user_regs.r15 = pcb->val[REG_LABEL_R15];
434 }
435 
436 /*
437  * Restore the context of a CPU during resume.  This context is always
438  * inside enter_safe_phase(), below.
439  */
440 void
441 mach_cpucontext_restore(cpu_t *cp)
442 {
443 	vcpu_guest_context_t vgc;
444 	int err;
445 
446 	ASSERT(cp->cpu_thread == cp->cpu_pause_thread ||
447 	    cp->cpu_thread == cp->cpu_idle_thread);
448 
449 	bzero(&vgc, sizeof (vgc));
450 
451 	pcb_to_user_regs(&cp->cpu_thread->t_pcb, &vgc);
452 
453 	/*
454 	 * We're emulating a longjmp() here: in particular, we need to bump the
455 	 * stack pointer to account for the pop of xIP that returning from
456 	 * longjmp() normally would do, and set the return value in xAX to 1.
457 	 */
458 	vgc.user_regs.rax = 1;
459 	vgc.user_regs.rsp += sizeof (ulong_t);
460 
461 	vgc.kernel_sp = cp->cpu_thread->t_sp;
462 
463 	err = mp_set_cpu_context(&vgc, cp);
464 
465 	ASSERT(err == 0);
466 }
467 
/*
 * Reach a point at which the CPU can be safely powered-off or
 * suspended.  Nothing can wake this CPU out of the loop.
 *
 * The CPU records its context in curthread->t_pcb, advertises
 * CPU_PHASE_SAFE and spins until its phase is changed to something else
 * (mp_leave_barrier() resets it to CPU_PHASE_NONE).  After a resume,
 * mach_cpucontext_restore() instead rebuilds the vcpu context from that
 * saved t_pcb with a return value of 1, so execution continues as if
 * setjmp() had returned non-zero and the spin loop is skipped.
 */
static void
enter_safe_phase(void)
{
	/*
	 * NOTE(review): presumably disables interrupts and returns the
	 * previous state for intr_restore() -- confirm.
	 */
	ulong_t flags = intr_clear();

	if (setjmp(&curthread->t_pcb) == 0) {
		/* Tell mp_enter_barrier() that this CPU is now safe. */
		cpu_phase[CPU->cpu_id] = CPU_PHASE_SAFE;
		while (cpu_phase[CPU->cpu_id] == CPU_PHASE_SAFE)
			SMT_PAUSE();
	}

	/* Both exits from the block above must leave interrupts disabled. */
	ASSERT(!interrupts_enabled());

	intr_restore(flags);
}
487 
488 /*
489  * Offline CPUs run this code even under a pause_cpus(), so we must
490  * check if we need to enter the safe phase.
491  */
492 void
493 mach_cpu_idle(void)
494 {
495 	if (IN_XPV_PANIC()) {
496 		xpv_panic_halt();
497 	} else  {
498 		(void) HYPERVISOR_block();
499 		if (cpu_phase[CPU->cpu_id] == CPU_PHASE_WAIT_SAFE)
500 			enter_safe_phase();
501 	}
502 }
503 
504 /*
505  * Spin until either start_cpus() wakes us up, or we get a request to
506  * enter the safe phase (followed by a later start_cpus()).
507  */
508 void
509 mach_cpu_pause(volatile char *safe)
510 {
511 	*safe = PAUSE_WAIT;
512 	membar_enter();
513 
514 	while (*safe != PAUSE_IDLE) {
515 		if (cpu_phase[CPU->cpu_id] == CPU_PHASE_WAIT_SAFE)
516 			enter_safe_phase();
517 		SMT_PAUSE();
518 	}
519 }
520 
521 int
522 mach_cpu_halt(xc_arg_t arg1, xc_arg_t arg2 __unused, xc_arg_t arg3 __unused)
523 {
524 	char *msg = (char *)arg1;
525 
526 	if (msg)
527 		prom_printf("%s\n", msg);
528 	(void) xen_vcpu_down(CPU->cpu_id);
529 	return (0);
530 }
531 
/*
 * Powering a CPU on from within the domain is not supported; power
 * state changes are only honoured when they arrive via xenstore.
 * Always returns ENOTSUP.
 */
/*ARGSUSED*/
int
mp_cpu_poweron(struct cpu *cp)
{
	return (ENOTSUP);
}
538 
/*
 * Powering a CPU off from within the domain is not supported; power
 * state changes are only honoured when they arrive via xenstore.
 * Always returns ENOTSUP.
 */
/*ARGSUSED*/
int
mp_cpu_poweroff(struct cpu *cp)
{
	return (ENOTSUP);
}
545 
546 void
547 mp_enter_barrier(void)
548 {
549 	hrtime_t last_poke_time = 0;
550 	int poke_allowed = 0;
551 	int done = 0;
552 	int i;
553 
554 	ASSERT(MUTEX_HELD(&cpu_lock));
555 
556 	pause_cpus(NULL, NULL);
557 
558 	while (!done) {
559 		done = 1;
560 		poke_allowed = 0;
561 
562 		if (xpv_gethrtime() - last_poke_time > POKE_TIMEOUT) {
563 			last_poke_time = xpv_gethrtime();
564 			poke_allowed = 1;
565 		}
566 
567 		for (i = 0; i < NCPU; i++) {
568 			cpu_t *cp = cpu_get(i);
569 
570 			if (cp == NULL || cp == CPU)
571 				continue;
572 
573 			switch (cpu_phase[i]) {
574 			case CPU_PHASE_NONE:
575 				cpu_phase[i] = CPU_PHASE_WAIT_SAFE;
576 				poke_cpu(i);
577 				done = 0;
578 				break;
579 
580 			case CPU_PHASE_WAIT_SAFE:
581 				if (poke_allowed)
582 					poke_cpu(i);
583 				done = 0;
584 				break;
585 
586 			case CPU_PHASE_SAFE:
587 			case CPU_PHASE_POWERED_OFF:
588 				break;
589 			}
590 		}
591 
592 		SMT_PAUSE();
593 	}
594 }
595 
596 void
597 mp_leave_barrier(void)
598 {
599 	int i;
600 
601 	ASSERT(MUTEX_HELD(&cpu_lock));
602 
603 	for (i = 0; i < NCPU; i++) {
604 		cpu_t *cp = cpu_get(i);
605 
606 		if (cp == NULL || cp == CPU)
607 			continue;
608 
609 		switch (cpu_phase[i]) {
610 		/*
611 		 * If we see a CPU in one of these phases, something has
612 		 * gone badly wrong with the guarantees
613 		 * mp_enter_barrier() is supposed to provide.  Rather
614 		 * than attempt to stumble along (and since we can't
615 		 * panic properly in this context), we tell the
616 		 * hypervisor we've crashed.
617 		 */
618 		case CPU_PHASE_NONE:
619 		case CPU_PHASE_WAIT_SAFE:
620 			(void) HYPERVISOR_shutdown(SHUTDOWN_crash);
621 			break;
622 
623 		case CPU_PHASE_POWERED_OFF:
624 			break;
625 
626 		case CPU_PHASE_SAFE:
627 			cpu_phase[i] = CPU_PHASE_NONE;
628 		}
629 	}
630 
631 	start_cpus();
632 }
633 
634 static int
635 poweroff_vcpu(struct cpu *cp)
636 {
637 	int error;
638 
639 	ASSERT(MUTEX_HELD(&cpu_lock));
640 
641 	ASSERT(CPU->cpu_id != cp->cpu_id);
642 	ASSERT(cp->cpu_flags & CPU_QUIESCED);
643 
644 	mp_enter_barrier();
645 
646 	if ((error = xen_vcpu_down(cp->cpu_id)) == 0) {
647 		ASSERT(cpu_phase[cp->cpu_id] == CPU_PHASE_SAFE);
648 
649 		CPUSET_DEL(cpu_ready_set, cp->cpu_id);
650 
651 		if (cp->cpu_flags & CPU_ENABLE)
652 			ncpus_intr_enabled--;
653 
654 		cp->cpu_flags |= CPU_POWEROFF | CPU_OFFLINE;
655 		cp->cpu_flags &=
656 		    ~(CPU_RUNNING | CPU_READY | CPU_EXISTS | CPU_ENABLE);
657 
658 		cpu_phase[cp->cpu_id] = CPU_PHASE_POWERED_OFF;
659 
660 		cpu_set_state(cp);
661 	}
662 
663 	mp_leave_barrier();
664 
665 	return (error);
666 }
667 
668 static int
669 vcpu_config_poweroff(processorid_t id)
670 {
671 	int oldstate;
672 	int error;
673 	cpu_t *cp;
674 
675 	mutex_enter(&cpu_lock);
676 
677 	if ((cp = cpu_get(id)) == NULL) {
678 		mutex_exit(&cpu_lock);
679 		return (ESRCH);
680 	}
681 
682 	if (cpu_get_state(cp) == P_POWEROFF) {
683 		mutex_exit(&cpu_lock);
684 		return (0);
685 	}
686 
687 	mutex_exit(&cpu_lock);
688 
689 	do {
690 		error = p_online_internal(id, P_OFFLINE,
691 		    &oldstate);
692 
693 		if (error != 0)
694 			break;
695 
696 		/*
697 		 * So we just changed it to P_OFFLINE.  But then we dropped
698 		 * cpu_lock, so now it is possible for another thread to change
699 		 * the cpu back to a different, non-quiesced state e.g.
700 		 * P_ONLINE.
701 		 */
702 		mutex_enter(&cpu_lock);
703 		if ((cp = cpu_get(id)) == NULL)
704 			error = ESRCH;
705 		else {
706 			if (cp->cpu_flags & CPU_QUIESCED)
707 				error = poweroff_vcpu(cp);
708 			else
709 				error = EBUSY;
710 		}
711 		mutex_exit(&cpu_lock);
712 	} while (error == EBUSY);
713 
714 	return (error);
715 }
716 
717 /*
718  * Add a new virtual cpu to the domain.
719  */
720 static int
721 vcpu_config_new(processorid_t id)
722 {
723 	extern int start_cpu(processorid_t);
724 	int error;
725 
726 	if (ncpus == 1) {
727 		printf("cannot (yet) add cpus to a single-cpu domain\n");
728 		return (ENOTSUP);
729 	}
730 
731 	affinity_set(CPU_CURRENT);
732 	error = start_cpu(id);
733 	affinity_clear();
734 	return (error);
735 }
736 
737 static int
738 poweron_vcpu(struct cpu *cp)
739 {
740 	int error;
741 
742 	ASSERT(MUTEX_HELD(&cpu_lock));
743 
744 	if (HYPERVISOR_vcpu_op(VCPUOP_is_up, cp->cpu_id, NULL) != 0) {
745 		printf("poweron_vcpu: vcpu%d is not available!\n",
746 		    cp->cpu_id);
747 		return (ENXIO);
748 	}
749 
750 	if ((error = xen_vcpu_up(cp->cpu_id)) == 0) {
751 		CPUSET_ADD(cpu_ready_set, cp->cpu_id);
752 		cp->cpu_flags |= CPU_EXISTS | CPU_READY | CPU_RUNNING;
753 		cp->cpu_flags &= ~CPU_POWEROFF;
754 		/*
755 		 * There are some nasty races possible here.
756 		 * Tell the vcpu it's up one more time.
757 		 * XXPV	Is this enough?  Is this safe?
758 		 */
759 		(void) xen_vcpu_up(cp->cpu_id);
760 
761 		cpu_phase[cp->cpu_id] = CPU_PHASE_NONE;
762 
763 		cpu_set_state(cp);
764 	}
765 	return (error);
766 }
767 
768 static int
769 vcpu_config_poweron(processorid_t id)
770 {
771 	cpu_t *cp;
772 	int oldstate;
773 	int error;
774 
775 	if (id >= ncpus)
776 		return (vcpu_config_new(id));
777 
778 	mutex_enter(&cpu_lock);
779 
780 	if ((cp = cpu_get(id)) == NULL) {
781 		mutex_exit(&cpu_lock);
782 		return (ESRCH);
783 	}
784 
785 	if (cpu_get_state(cp) != P_POWEROFF) {
786 		mutex_exit(&cpu_lock);
787 		return (0);
788 	}
789 
790 	if ((error = poweron_vcpu(cp)) != 0) {
791 		mutex_exit(&cpu_lock);
792 		return (error);
793 	}
794 
795 	mutex_exit(&cpu_lock);
796 
797 	return (p_online_internal(id, P_ONLINE, &oldstate));
798 }
799 
800 #define	REPORT_LEN	128
801 
802 static void
803 vcpu_config_report(processorid_t id, uint_t newstate, int error)
804 {
805 	char *report = kmem_alloc(REPORT_LEN, KM_SLEEP);
806 	size_t len;
807 	char *ps;
808 
809 	ps = NULL;
810 	switch (newstate) {
811 	case P_ONLINE:
812 		ps = PS_ONLINE;
813 		break;
814 	case P_POWEROFF:
815 		ps = PS_POWEROFF;
816 		break;
817 	default:
818 		cmn_err(CE_PANIC, "unknown state %u\n", newstate);
819 		break;
820 	}
821 
822 	len = snprintf(report, REPORT_LEN,
823 	    "cpu%d: externally initiated %s", id, ps);
824 
825 	if (!error) {
826 		cmn_err(CE_CONT, "!%s\n", report);
827 		kmem_free(report, REPORT_LEN);
828 		return;
829 	}
830 
831 	len += snprintf(report + len, REPORT_LEN - len,
832 	    " failed, error %d: ", error);
833 	switch (error) {
834 	case EEXIST:
835 		len += snprintf(report + len, REPORT_LEN - len,
836 		    "cpu already %s", ps ? ps : "?");
837 		break;
838 	case ESRCH:
839 		len += snprintf(report + len, REPORT_LEN - len,
840 		    "cpu not found");
841 		break;
842 	case EINVAL:
843 	case EALREADY:
844 		break;
845 	case EPERM:
846 		len += snprintf(report + len, REPORT_LEN - len,
847 		    "insufficient privilege (0x%x)", id);
848 		break;
849 	case EBUSY:
850 		switch (newstate) {
851 		case P_ONLINE:
852 			/*
853 			 * This return comes from mp_cpu_start -
854 			 * we cannot 'start' the boot CPU.
855 			 */
856 			len += snprintf(report + len, REPORT_LEN - len,
857 			    "already running");
858 			break;
859 		case P_POWEROFF:
860 			len += snprintf(report + len, REPORT_LEN - len,
861 			    "bound lwps?");
862 			break;
863 		default:
864 			break;
865 		}
866 	default:
867 		break;
868 	}
869 
870 	cmn_err(CE_CONT, "%s\n", report);
871 	kmem_free(report, REPORT_LEN);
872 }
873 
874 static void
875 vcpu_config(void *arg)
876 {
877 	int id = (int)(uintptr_t)arg;
878 	int error;
879 	char dir[16];
880 	char *state;
881 
882 	if ((uint_t)id >= max_ncpus) {
883 		cmn_err(CE_WARN,
884 		    "vcpu_config: cpu%d does not fit in this domain", id);
885 		return;
886 	}
887 
888 	(void) snprintf(dir, sizeof (dir), "cpu/%d", id);
889 	state = kmem_alloc(MAXPATHLEN, KM_SLEEP);
890 	if (xenbus_scanf(XBT_NULL, dir, "availability", "%s", state) == 0) {
891 		if (strcmp(state, "online") == 0) {
892 			error = vcpu_config_poweron(id);
893 			vcpu_config_report(id, P_ONLINE, error);
894 		} else if (strcmp(state, "offline") == 0) {
895 			error = vcpu_config_poweroff(id);
896 			vcpu_config_report(id, P_POWEROFF, error);
897 		} else {
898 			cmn_err(CE_WARN,
899 			    "cpu%d: unknown target state '%s'", id, state);
900 		}
901 	} else
902 		cmn_err(CE_WARN,
903 		    "cpu%d: unable to read target state from xenstore", id);
904 
905 	kmem_free(state, MAXPATHLEN);
906 }
907 
908 /*ARGSUSED*/
909 static void
910 vcpu_config_event(struct xenbus_watch *watch, const char **vec, uint_t len)
911 {
912 	const char *path = vec[XS_WATCH_PATH];
913 	processorid_t id;
914 	char *s;
915 
916 	if ((s = strstr(path, "cpu/")) != NULL &&
917 	    sscanf(s, "cpu/%d", &id) == 1) {
918 		/*
919 		 * Run the virtual CPU configuration on a separate thread to
920 		 * avoid blocking on this event for too long (and for now,
921 		 * to ensure configuration requests are serialized.)
922 		 */
923 		(void) taskq_dispatch(cpu_config_tq,
924 		    vcpu_config, (void *)(uintptr_t)id, 0);
925 	}
926 }
927 
928 static int
929 xen_vcpu_initialize(processorid_t id, vcpu_guest_context_t *vgc)
930 {
931 	int err;
932 
933 	if ((err = HYPERVISOR_vcpu_op(VCPUOP_initialise, id, vgc)) != 0) {
934 		char *str;
935 		int level = CE_WARN;
936 
937 		switch (err) {
938 		case -X_EINVAL:
939 			/*
940 			 * This interface squashes multiple error sources
941 			 * to one error code.  In particular, an X_EINVAL
942 			 * code can mean:
943 			 *
944 			 * -	the vcpu id is out of range
945 			 * -	cs or ss are in ring 0
946 			 * -	cr3 is wrong
947 			 * -	an entry in the new gdt is above the
948 			 *	reserved entry
949 			 * -	a frame underneath the new gdt is bad
950 			 */
951 			str = "something is wrong :(";
952 			break;
953 		case -X_ENOENT:
954 			str = "no such cpu";
955 			break;
956 		case -X_ENOMEM:
957 			str = "no mem to copy ctxt";
958 			break;
959 		case -X_EFAULT:
960 			str = "bad address";
961 			break;
962 		case -X_EEXIST:
963 			/*
964 			 * Hmm.  This error is returned if the vcpu has already
965 			 * been initialized once before in the lifetime of this
966 			 * domain.  This is a logic error in the kernel.
967 			 */
968 			level = CE_PANIC;
969 			str = "already initialized";
970 			break;
971 		default:
972 			level = CE_PANIC;
973 			str = "<unexpected>";
974 			break;
975 		}
976 
977 		cmn_err(level, "vcpu%d: failed to init: error %d: %s",
978 		    id, -err, str);
979 	}
980 	return (err);
981 }
982 
983 long
984 xen_vcpu_up(processorid_t id)
985 {
986 	long err;
987 
988 	if ((err = HYPERVISOR_vcpu_op(VCPUOP_up, id, NULL)) != 0) {
989 		char *str;
990 
991 		switch (err) {
992 		case -X_ENOENT:
993 			str = "no such cpu";
994 			break;
995 		case -X_EINVAL:
996 			/*
997 			 * Perhaps this is diagnostic overkill.
998 			 */
999 			if (HYPERVISOR_vcpu_op(VCPUOP_is_up, id, NULL) < 0)
1000 				str = "bad cpuid";
1001 			else
1002 				str = "not initialized";
1003 			break;
1004 		default:
1005 			str = "<unexpected>";
1006 			break;
1007 		}
1008 
1009 		printf("vcpu%d: failed to start: error %d: %s\n",
1010 		    id, -(int)err, str);
1011 		return (EBFONT);	/* deliberately silly */
1012 	}
1013 	return (err);
1014 }
1015 
1016 long
1017 xen_vcpu_down(processorid_t id)
1018 {
1019 	long err;
1020 
1021 	if ((err = HYPERVISOR_vcpu_op(VCPUOP_down, id, NULL)) != 0) {
1022 		/*
1023 		 * X_ENOENT:	no such cpu
1024 		 * X_EINVAL:	bad cpuid
1025 		 */
1026 		panic("vcpu%d: failed to stop: error %d", id, -(int)err);
1027 	}
1028 
1029 	return (err);
1030 }
1031