/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
 * Use is subject to license terms.
 */

/*
 * Copyright 2019 Joyent, Inc.
 */

/*
 * Virtual CPU management.
 *
 * VCPUs can be controlled in one of two ways: through the domain itself
 * (psradm, p_online(), etc.), or via changes in xenstore (vcpu_config()).
 * Unfortunately, the two interfaces use the terminology differently; the
 * states work out as follows:
 *
 * P_ONLINE: the VCPU is up and running, taking interrupts and running
 * threads.
 *
 * P_OFFLINE: the VCPU is up and running, but quiesced (i.e. blocked in the
 * hypervisor on the idle thread). It must be up, since a downed VCPU cannot
 * receive interrupts, and we require that offline CPUs in Solaris still be
 * able to receive them (for cross-calls).
 *
 * P_POWEROFF: the VCPU is down (we never called xen_vcpu_up(), or called
 * xen_vcpu_down() for it). It can't take interrupts or run anything, though
 * if it has run previously, its software state (cpu_t, machcpu structures, IPI
 * event channels, etc.) will still exist.
 *
 * The hypervisor has two notions of CPU states as represented in the store:
 *
 * "offline": the VCPU is down. Corresponds to P_POWEROFF.
 *
 * "online": the VCPU is running. Corresponds to a CPU state other than
 * P_POWEROFF.
 *
 * Currently, only a notification via xenstore can bring a CPU into a
 * P_POWEROFF state, and only the domain can change between P_ONLINE, P_NOINTR,
 * P_OFFLINE, etc. We need to be careful to treat xenstore notifications
 * idempotently, as we'll get 'duplicate' entries when we resume a domain.
 *
 * Note that the xenstore configuration is strictly advisory, in that a domain
 * can choose to ignore it and still power up a VCPU in the offline state. To
 * play nice, we don't allow it. Thus, any attempt to power a CPU on or off
 * from within Solaris returns ENOTSUP.
 *
 * Powering off a VCPU and suspending the domain use similar code. The
 * difficulty here is that we must ensure that each VCPU is in a stable
 * state: it must have a saved PCB, and not be responding to interrupts
 * (since we are just about to remove its ability to run on a real CPU,
 * possibly forever). However, an offline CPU in Solaris can take
 * cross-call interrupts, as mentioned, so we must go through a
 * two-stage process. First, we use the standard Solaris pause_cpus().
 * This ensures that all CPUs are either in mach_cpu_pause() or
 * mach_cpu_idle(), and nothing will cross-call them.
 *
 * Powered-off CPUs are already safe, as we own the cpu_lock needed to
 * bring them back up, and they are in state CPU_PHASE_POWERED_OFF.
 *
 * Running CPUs are spinning in mach_cpu_pause() waiting for either
 * PAUSE_IDLE or CPU_PHASE_WAIT_SAFE.
 *
 * Offline CPUs are either running the idle thread and periodically
 * checking for CPU_PHASE_WAIT_SAFE, or blocked in the hypervisor.
 *
 * Thus, we set CPU_PHASE_WAIT_SAFE for every powered-on CPU, as well as
 * poking them to make sure they're not blocked[1]. When every CPU has
 * responded by reaching a safe state and setting CPU_PHASE_SAFE, we
 * know we can suspend, or power off a CPU, without problems.
 *
 * [1] Note that we have to repeatedly poke offline CPUs: it's the only
 * way to ensure that the CPU doesn't miss the state change before
 * dropping into HYPERVISOR_block().
 */
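
/*
 * For illustration only (not part of this file's interfaces): the control
 * domain requests these transitions by writing the per-VCPU "availability"
 * node that vcpu_config() reads below.  Assuming the standard Xen
 * xenstore-write utility and a typical store layout, powering off virtual
 * CPU 1 of domain <domid> looks something like:
 *
 *	xenstore-write /local/domain/<domid>/cpu/1/availability offline
 *
 * which fires the "cpu" watch registered in do_cpu_config_watch() and is
 * eventually handled by vcpu_config().
 */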

#include <sys/types.h>
#include <sys/systm.h>
#include <sys/param.h>
#include <sys/taskq.h>
#include <sys/cmn_err.h>
#include <sys/archsystm.h>
#include <sys/machsystm.h>
#include <sys/segments.h>
#include <sys/cpuvar.h>
#include <sys/x86_archext.h>
#include <sys/controlregs.h>
#include <sys/hypervisor.h>
#include <sys/xpv_panic.h>
#include <sys/mman.h>
#include <sys/psw.h>
#include <sys/cpu.h>
#include <sys/sunddi.h>
#include <util/sscanf.h>
#include <vm/hat_i86.h>
#include <vm/hat.h>
#include <vm/as.h>

#include <xen/public/io/xs_wire.h>
#include <xen/sys/xenbus_impl.h>
#include <xen/public/vcpu.h>

extern cpuset_t cpu_ready_set;

#define	CPU_PHASE_NONE		0
#define	CPU_PHASE_WAIT_SAFE	1
#define	CPU_PHASE_SAFE		2
#define	CPU_PHASE_POWERED_OFF	3
/*
 * While entering the barrier, we may poke CPUs at most 256 times a second.
 */
#define	POKE_TIMEOUT	(NANOSEC / 256)

static taskq_t *cpu_config_tq;
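/*
 * Per-CPU barrier phase (one of the CPU_PHASE_* values above), indexed by
 * cpu_id.  Driven by mp_enter_barrier()/mp_leave_barrier() and observed by
 * mach_cpu_pause()/mach_cpu_idle(); see the block comment at the top of
 * this file.
 */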
static int cpu_phase[NCPU];

static void vcpu_config_event(struct xenbus_watch *, const char **, uint_t);
static int xen_vcpu_initialize(processorid_t, vcpu_guest_context_t *);

/*
 * Return whether or not the vcpu is actually running on a pcpu
 */
int
vcpu_on_pcpu(processorid_t cpu)
{
	struct vcpu_runstate_info runstate;
	int ret = VCPU_STATE_UNKNOWN;

	ASSERT(cpu < NCPU);
	/*
	 * Don't bother with hypercall if we are asking about ourself
	 */
	if (cpu == CPU->cpu_id)
		return (VCPU_ON_PCPU);
	if (HYPERVISOR_vcpu_op(VCPUOP_get_runstate_info, cpu, &runstate) != 0)
		goto out;

	switch (runstate.state) {
	case RUNSTATE_running:
		ret = VCPU_ON_PCPU;
		break;

	case RUNSTATE_runnable:
	case RUNSTATE_offline:
	case RUNSTATE_blocked:
		ret = VCPU_NOT_ON_PCPU;
		break;

	default:
		break;
	}

out:
	return (ret);
}

/*
 * These routines allocate any global state that might be needed
 * while starting cpus.  For virtual cpus, there is no such state.
 */
int
mach_cpucontext_init(void)
{
	return (0);
}

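/*
 * Xenbus is up: register a watch on the "cpu" subtree of our xenstore
 * directory so that availability/hot-plug changes are delivered to
 * vcpu_config_event().
 */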
void
do_cpu_config_watch(int state)
{
	static struct xenbus_watch cpu_config_watch;

	if (state != XENSTORE_UP)
		return;
	cpu_config_watch.node = "cpu";
	cpu_config_watch.callback = vcpu_config_event;
	if (register_xenbus_watch(&cpu_config_watch)) {
		taskq_destroy(cpu_config_tq);
		cmn_err(CE_WARN, "do_cpu_config_watch: "
		    "failed to set vcpu config watch");
	}

}

/*
 * This routine is called after all the "normal" MP startup has
 * been done; a good place to start watching xen store for virtual
 * cpu hot plug events.
 */
void
mach_cpucontext_fini(void)
{

	cpu_config_tq = taskq_create("vcpu config taskq", 1,
	    maxclsyspri - 1, 1, 1, TASKQ_PREPOPULATE);

	(void) xs_register_xenbus_callback(do_cpu_config_watch);
}

/*
 * Fill in the remaining CPU context and initialize it.
 */
static int
mp_set_cpu_context(vcpu_guest_context_t *vgc, cpu_t *cp)
{
	uint_t vec, iopl;

	vgc->flags = VGCF_IN_KERNEL;

	/*
	 * fpu_ctx we leave as zero; on first fault we'll store
	 * sse_initial into it anyway.
	 */

	vgc->user_regs.cs = KCS_SEL | SEL_KPL;	/* force to ring 3 */
	vgc->user_regs.ds = KDS_SEL;
	vgc->user_regs.es = KDS_SEL;
	vgc->user_regs.ss = KDS_SEL;
	vgc->kernel_ss = KDS_SEL;

	/*
	 * Allow I/O privilege level for Dom0 kernel.
	 */
	if (DOMAIN_IS_INITDOMAIN(xen_info))
		iopl = (PS_IOPL & 0x1000);	/* ring 1 */
	else
		iopl = 0;

	vgc->user_regs.fs = 0;
	vgc->user_regs.gs = 0;
	vgc->user_regs.rflags = F_OFF | iopl;

	/*
	 * Initialize the trap_info_t from the IDT
	 */
#if !defined(__lint)
	ASSERT(NIDT == sizeof (vgc->trap_ctxt) / sizeof (vgc->trap_ctxt[0]));
#endif
	for (vec = 0; vec < NIDT; vec++) {
		trap_info_t *ti = &vgc->trap_ctxt[vec];

		if (xen_idt_to_trap_info(vec,
		    &cp->cpu_m.mcpu_idt[vec], ti) == 0) {
			ti->cs = KCS_SEL;
			ti->vector = vec;
		}
	}

	/*
	 * No LDT
	 */

	/*
	 * (We assert in various places that the GDT is (a) aligned on a
	 * page boundary and (b) one page long, so this really should fit..)
	 */
#ifdef CRASH_XEN
	vgc->gdt_frames[0] = pa_to_ma(mmu_btop(cp->cpu_m.mcpu_gdtpa));
#else
	vgc->gdt_frames[0] = pfn_to_mfn(mmu_btop(cp->cpu_m.mcpu_gdtpa));
#endif
	vgc->gdt_ents = NGDT;

	vgc->ctrlreg[0] = CR0_ENABLE_FPU_FLAGS(getcr0());

	vgc->ctrlreg[3] =
	    pa_to_ma(mmu_ptob(kas.a_hat->hat_htable->ht_pfn));

	vgc->ctrlreg[4] = getcr4();

	vgc->event_callback_eip = (uintptr_t)xen_callback;
	vgc->failsafe_callback_eip = (uintptr_t)xen_failsafe_callback;
	vgc->flags |= VGCF_failsafe_disables_events;

	/*
	 * XXPV should this be moved to init_cpu_syscall?
	 */
	vgc->syscall_callback_eip = (uintptr_t)sys_syscall;
	vgc->flags |= VGCF_syscall_disables_events;

	ASSERT(vgc->user_regs.gs == 0);
	vgc->gs_base_kernel = (uintptr_t)cp;

	return (xen_vcpu_initialize(cp->cpu_id, vgc));
}

/*
 * Create a guest virtual cpu context so that the virtual cpu
 * springs into life in the domain just about to call mp_startup()
 *
 * Virtual CPUs must be initialized once in the lifetime of the domain;
 * after that subsequent attempts to start them will fail with X_EEXIST.
 *
 * Thus 'alloc' -really- creates and initializes the virtual
 * CPU context just once. Once the initialisation succeeds, we never
 * free it, nor the regular cpu_t to which it refers.
 */
void *
mach_cpucontext_alloc(struct cpu *cp)
{
	kthread_t *tp = cp->cpu_thread;
	vcpu_guest_context_t vgc;

	int err = 1;

	/*
	 * First, augment the incoming cpu structure
	 * - vcpu pointer reference
	 * - pending event storage area
	 * - physical address of GDT
	 */
	cp->cpu_m.mcpu_vcpu_info =
	    &HYPERVISOR_shared_info->vcpu_info[cp->cpu_id];
	cp->cpu_m.mcpu_evt_pend = kmem_zalloc(
	    sizeof (struct xen_evt_data), KM_SLEEP);
	cp->cpu_m.mcpu_gdtpa =
	    mmu_ptob(hat_getpfnum(kas.a_hat, (caddr_t)cp->cpu_gdt));

	if ((err = xen_gdt_setprot(cp, PROT_READ)) != 0)
		goto done;

	/*
	 * Now set up the vcpu context so that we can start this vcpu
	 * in the kernel at tp->t_pc (mp_startup).  Note that the
	 * thread will thread_exit() shortly after performing the
	 * initialization; in particular, we will *never* take a
	 * privilege transition on this thread.
	 */

	bzero(&vgc, sizeof (vgc));

	vgc.user_regs.rip = tp->t_pc;
	vgc.user_regs.rsp = tp->t_sp;
	vgc.user_regs.rbp = tp->t_sp - 2 * sizeof (greg_t);
	/*
	 * XXPV	Fix resume, if Russ didn't already fix it.
	 *
	 * Note that resume unconditionally puts t->t_stk + sizeof (regs)
	 * into kernel_sp via HYPERVISOR_stack_switch. This anticipates
	 * that only lwps take traps that switch to the kernel stack;
	 * part of creating an lwp adjusts the stack by subtracting
	 * sizeof (struct regs) off t_stk.
	 *
	 * The more interesting question is, why do we do all the work
	 * of a fully fledged lwp for a plain thread?  In particular
	 * we don't have to call HYPERVISOR_stack_switch for lwp-less threads
	 * or futz with the LDT.  This should probably all be done with
	 * an lwp context operator to keep pure thread context switch fast.
	 */
	vgc.kernel_sp = (ulong_t)tp->t_stk;

	err = mp_set_cpu_context(&vgc, cp);

done:
	if (err) {
		mach_cpucontext_free(cp, NULL, err);
		return (NULL);
	}
	return (cp);
}

/*
 * By the time we are called either we have successfully started
 * the cpu, or our attempt to start it has failed.
 */

/*ARGSUSED*/
void
mach_cpucontext_free(struct cpu *cp, void *arg, int err)
{
	switch (err) {
	case 0:
		break;
	case ETIMEDOUT:
		/*
		 * The vcpu context is loaded into the hypervisor, and
		 * we've tried to start it, but the vcpu has not been set
		 * running yet, for whatever reason.  We arrange to -not-
		 * free any data structures it may be referencing.  In
		 * particular, we've already told the hypervisor about
		 * the GDT, and so we can't map it read-write again.
		 */
		break;
	default:
		(void) xen_gdt_setprot(cp, PROT_READ | PROT_WRITE);
		kmem_free(cp->cpu_m.mcpu_evt_pend,
		    sizeof (struct xen_evt_data));
		break;
	}
}

/*
 * Reset this CPU's context.  Clear out any pending evtchn data, since event
 * channel numbers will all change when we resume.
 */
void
mach_cpucontext_reset(cpu_t *cp)
{
	bzero(cp->cpu_m.mcpu_evt_pend, sizeof (struct xen_evt_data));
	/* mcpu_intr_pending ? */
}

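/*
 * Copy the non-volatile registers saved in a setjmp()-style label_t (the
 * PCB recorded by enter_safe_phase()) into the hypervisor's
 * vcpu_guest_context_t register layout.
 */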
static void
pcb_to_user_regs(label_t *pcb, vcpu_guest_context_t *vgc)
{
	vgc->user_regs.rip = pcb->val[REG_LABEL_PC];
	vgc->user_regs.rsp = pcb->val[REG_LABEL_SP];
	vgc->user_regs.rbp = pcb->val[REG_LABEL_BP];
	vgc->user_regs.rbx = pcb->val[REG_LABEL_RBX];
	vgc->user_regs.r12 = pcb->val[REG_LABEL_R12];
	vgc->user_regs.r13 = pcb->val[REG_LABEL_R13];
	vgc->user_regs.r14 = pcb->val[REG_LABEL_R14];
	vgc->user_regs.r15 = pcb->val[REG_LABEL_R15];
}

/*
 * Restore the context of a CPU during resume.  This context is always
 * inside enter_safe_phase(), below.
 */
void
mach_cpucontext_restore(cpu_t *cp)
{
	vcpu_guest_context_t vgc;
	int err;

	ASSERT(cp->cpu_thread == cp->cpu_pause_thread ||
	    cp->cpu_thread == cp->cpu_idle_thread);

	bzero(&vgc, sizeof (vgc));

	pcb_to_user_regs(&cp->cpu_thread->t_pcb, &vgc);

	/*
	 * We're emulating a longjmp() here: in particular, we need to bump the
	 * stack pointer to account for the pop of xIP that returning from
	 * longjmp() normally would do, and set the return value in xAX to 1.
	 */
	vgc.user_regs.rax = 1;
	vgc.user_regs.rsp += sizeof (ulong_t);

	vgc.kernel_sp = cp->cpu_thread->t_sp;

	err = mp_set_cpu_context(&vgc, cp);

	ASSERT(err == 0);
}

/*
 * Reach a point at which the CPU can be safely powered-off or
 * suspended.  Nothing can wake this CPU out of the loop.
 */
static void
enter_safe_phase(void)
{
	ulong_t flags = intr_clear();

	if (setjmp(&curthread->t_pcb) == 0) {
		cpu_phase[CPU->cpu_id] = CPU_PHASE_SAFE;
		while (cpu_phase[CPU->cpu_id] == CPU_PHASE_SAFE)
			SMT_PAUSE();
	}

	ASSERT(!interrupts_enabled());

	intr_restore(flags);
}

/*
 * Offline CPUs run this code even under a pause_cpus(), so we must
 * check if we need to enter the safe phase.
 */
void
mach_cpu_idle(void)
{
	if (IN_XPV_PANIC()) {
		xpv_panic_halt();
	} else {
		(void) HYPERVISOR_block();
		if (cpu_phase[CPU->cpu_id] == CPU_PHASE_WAIT_SAFE)
			enter_safe_phase();
	}
}

/*
 * Spin until either start_cpus() wakes us up, or we get a request to
 * enter the safe phase (followed by a later start_cpus()).
 */
void
mach_cpu_pause(volatile char *safe)
{
	*safe = PAUSE_WAIT;
	membar_enter();

	while (*safe != PAUSE_IDLE) {
		if (cpu_phase[CPU->cpu_id] == CPU_PHASE_WAIT_SAFE)
			enter_safe_phase();
		SMT_PAUSE();
	}
}

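/*
 * Cross-call handler: print an optional message, then take the current CPU
 * down in the hypervisor via xen_vcpu_down().
 */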
int
mach_cpu_halt(xc_arg_t arg1, xc_arg_t arg2 __unused, xc_arg_t arg3 __unused)
{
	char *msg = (char *)arg1;

	if (msg)
		prom_printf("%s\n", msg);
	(void) xen_vcpu_down(CPU->cpu_id);
	return (0);
}

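/*
 * As described in the block comment at the top of this file, powering a
 * VCPU on or off from within the domain is not supported; such changes
 * must come through xenstore (vcpu_config()).
 */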
/*ARGSUSED*/
int
mp_cpu_poweron(struct cpu *cp)
{
	return (ENOTSUP);
}

/*ARGSUSED*/
int
mp_cpu_poweroff(struct cpu *cp)
{
	return (ENOTSUP);
}

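/*
 * Bring every other powered-on CPU to CPU_PHASE_SAFE: pause the world with
 * pause_cpus(), then set CPU_PHASE_WAIT_SAFE for each CPU not already in a
 * terminal phase, poking it (at most once per POKE_TIMEOUT) until it
 * responds.  Called with cpu_lock held.
 */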
void
mp_enter_barrier(void)
{
	hrtime_t last_poke_time = 0;
	int poke_allowed = 0;
	int done = 0;
	int i;

	ASSERT(MUTEX_HELD(&cpu_lock));

	pause_cpus(NULL, NULL);

	while (!done) {
		done = 1;
		poke_allowed = 0;

		if (xpv_gethrtime() - last_poke_time > POKE_TIMEOUT) {
			last_poke_time = xpv_gethrtime();
			poke_allowed = 1;
		}

		for (i = 0; i < NCPU; i++) {
			cpu_t *cp = cpu_get(i);

			if (cp == NULL || cp == CPU)
				continue;

			switch (cpu_phase[i]) {
			case CPU_PHASE_NONE:
				cpu_phase[i] = CPU_PHASE_WAIT_SAFE;
				poke_cpu(i);
				done = 0;
				break;

			case CPU_PHASE_WAIT_SAFE:
				if (poke_allowed)
					poke_cpu(i);
				done = 0;
				break;

			case CPU_PHASE_SAFE:
			case CPU_PHASE_POWERED_OFF:
				break;
			}
		}

		SMT_PAUSE();
	}
}

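/*
 * Release the CPUs parked in enter_safe_phase() and restart the world.  By
 * this point every other CPU should be either safe or powered off; anything
 * else means mp_enter_barrier()'s guarantees were violated.
 */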
void
mp_leave_barrier(void)
{
	int i;

	ASSERT(MUTEX_HELD(&cpu_lock));

	for (i = 0; i < NCPU; i++) {
		cpu_t *cp = cpu_get(i);

		if (cp == NULL || cp == CPU)
			continue;

		switch (cpu_phase[i]) {
		/*
		 * If we see a CPU in one of these phases, something has
		 * gone badly wrong with the guarantees
		 * mp_enter_barrier() is supposed to provide.  Rather
		 * than attempt to stumble along (and since we can't
		 * panic properly in this context), we tell the
		 * hypervisor we've crashed.
		 */
		case CPU_PHASE_NONE:
		case CPU_PHASE_WAIT_SAFE:
			(void) HYPERVISOR_shutdown(SHUTDOWN_crash);
			break;

		case CPU_PHASE_POWERED_OFF:
			break;

		case CPU_PHASE_SAFE:
			cpu_phase[i] = CPU_PHASE_NONE;
		}
	}

	start_cpus();
}

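/*
 * Power off a quiesced CPU: take it down in the hypervisor and update its
 * cpu_t flags and phase accordingly.  Called with cpu_lock held; the
 * operation itself is bracketed by mp_enter_barrier()/mp_leave_barrier().
 */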
static int
poweroff_vcpu(struct cpu *cp)
{
	int error;

	ASSERT(MUTEX_HELD(&cpu_lock));

	ASSERT(CPU->cpu_id != cp->cpu_id);
	ASSERT(cp->cpu_flags & CPU_QUIESCED);

	mp_enter_barrier();

	if ((error = xen_vcpu_down(cp->cpu_id)) == 0) {
		ASSERT(cpu_phase[cp->cpu_id] == CPU_PHASE_SAFE);

		CPUSET_DEL(cpu_ready_set, cp->cpu_id);

		if (cp->cpu_flags & CPU_ENABLE)
			ncpus_intr_enabled--;

		cp->cpu_flags |= CPU_POWEROFF | CPU_OFFLINE;
		cp->cpu_flags &=
		    ~(CPU_RUNNING | CPU_READY | CPU_EXISTS | CPU_ENABLE);

		cpu_phase[cp->cpu_id] = CPU_PHASE_POWERED_OFF;

		cpu_set_state(cp);
	}

	mp_leave_barrier();

	return (error);
}

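/*
 * Handle an externally requested power-off: quiesce the CPU with
 * p_online_internal(P_OFFLINE), then power it off, retrying if another
 * thread un-quiesces it in the window where cpu_lock is dropped.
 */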
static int
vcpu_config_poweroff(processorid_t id)
{
	int oldstate;
	int error;
	cpu_t *cp;

	mutex_enter(&cpu_lock);

	if ((cp = cpu_get(id)) == NULL) {
		mutex_exit(&cpu_lock);
		return (ESRCH);
	}

	if (cpu_get_state(cp) == P_POWEROFF) {
		mutex_exit(&cpu_lock);
		return (0);
	}

	mutex_exit(&cpu_lock);

	do {
		error = p_online_internal(id, P_OFFLINE,
		    &oldstate);

		if (error != 0)
			break;

		/*
		 * So we just changed it to P_OFFLINE.  But then we dropped
		 * cpu_lock, so now it is possible for another thread to change
		 * the cpu back to a different, non-quiesced state e.g.
		 * P_ONLINE.
		 */
		mutex_enter(&cpu_lock);
		if ((cp = cpu_get(id)) == NULL)
			error = ESRCH;
		else {
			if (cp->cpu_flags & CPU_QUIESCED)
				error = poweroff_vcpu(cp);
			else
				error = EBUSY;
		}
		mutex_exit(&cpu_lock);
	} while (error == EBUSY);

	return (error);
}

/*
 * Add a new virtual cpu to the domain.
 */
static int
vcpu_config_new(processorid_t id)
{
	extern int start_cpu(processorid_t);
	int error;

	if (ncpus == 1) {
		printf("cannot (yet) add cpus to a single-cpu domain\n");
		return (ENOTSUP);
	}

	affinity_set(CPU_CURRENT);
	error = start_cpu(id);
	affinity_clear();
	return (error);
}

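/*
 * Bring a previously powered-off (but already initialized) VCPU back up in
 * the hypervisor and mark it ready to run.  Called with cpu_lock held.
 */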
static int
poweron_vcpu(struct cpu *cp)
{
	int error;

	ASSERT(MUTEX_HELD(&cpu_lock));

	if (HYPERVISOR_vcpu_op(VCPUOP_is_up, cp->cpu_id, NULL) != 0) {
		printf("poweron_vcpu: vcpu%d is not available!\n",
		    cp->cpu_id);
		return (ENXIO);
	}

	if ((error = xen_vcpu_up(cp->cpu_id)) == 0) {
		CPUSET_ADD(cpu_ready_set, cp->cpu_id);
		cp->cpu_flags |= CPU_EXISTS | CPU_READY | CPU_RUNNING;
		cp->cpu_flags &= ~CPU_POWEROFF;
		/*
		 * There are some nasty races possible here.
		 * Tell the vcpu it's up one more time.
		 * XXPV	Is this enough?  Is this safe?
		 */
		(void) xen_vcpu_up(cp->cpu_id);

		cpu_phase[cp->cpu_id] = CPU_PHASE_NONE;

		cpu_set_state(cp);
	}
	return (error);
}

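/*
 * Handle an externally requested power-on: a VCPU we have never seen is
 * created and started via vcpu_config_new(); a powered-off one is brought
 * back up and then onlined.
 */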
static int
vcpu_config_poweron(processorid_t id)
{
	cpu_t *cp;
	int oldstate;
	int error;

	if (id >= ncpus)
		return (vcpu_config_new(id));

	mutex_enter(&cpu_lock);

	if ((cp = cpu_get(id)) == NULL) {
		mutex_exit(&cpu_lock);
		return (ESRCH);
	}

	if (cpu_get_state(cp) != P_POWEROFF) {
		mutex_exit(&cpu_lock);
		return (0);
	}

	if ((error = poweron_vcpu(cp)) != 0) {
		mutex_exit(&cpu_lock);
		return (error);
	}

	mutex_exit(&cpu_lock);

	return (p_online_internal(id, P_ONLINE, &oldstate));
}

#define	REPORT_LEN	128

static void
vcpu_config_report(processorid_t id, uint_t newstate, int error)
{
	char *report = kmem_alloc(REPORT_LEN, KM_SLEEP);
	size_t len;
	char *ps;

	ps = NULL;
	switch (newstate) {
	case P_ONLINE:
		ps = PS_ONLINE;
		break;
	case P_POWEROFF:
		ps = PS_POWEROFF;
		break;
	default:
		cmn_err(CE_PANIC, "unknown state %u\n", newstate);
		break;
	}

	len = snprintf(report, REPORT_LEN,
	    "cpu%d: externally initiated %s", id, ps);

	if (!error) {
		cmn_err(CE_CONT, "!%s\n", report);
		kmem_free(report, REPORT_LEN);
		return;
	}

	len += snprintf(report + len, REPORT_LEN - len,
	    " failed, error %d: ", error);
	switch (error) {
	case EEXIST:
		len += snprintf(report + len, REPORT_LEN - len,
		    "cpu already %s", ps ? ps : "?");
		break;
	case ESRCH:
		len += snprintf(report + len, REPORT_LEN - len,
		    "cpu not found");
		break;
	case EINVAL:
	case EALREADY:
		break;
	case EPERM:
		len += snprintf(report + len, REPORT_LEN - len,
		    "insufficient privilege (0x%x)", id);
		break;
	case EBUSY:
		switch (newstate) {
		case P_ONLINE:
			/*
			 * This return comes from mp_cpu_start -
			 * we cannot 'start' the boot CPU.
			 */
			len += snprintf(report + len, REPORT_LEN - len,
			    "already running");
			break;
		case P_POWEROFF:
			len += snprintf(report + len, REPORT_LEN - len,
			    "bound lwps?");
			break;
		default:
			break;
		}
	default:
		break;
	}

	cmn_err(CE_CONT, "%s\n", report);
	kmem_free(report, REPORT_LEN);
}

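/*
 * taskq callback: read this CPU's "availability" node from xenstore and
 * drive the requested transition ("online" or "offline").
 */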
static void
vcpu_config(void *arg)
{
	int id = (int)(uintptr_t)arg;
	int error;
	char dir[16];
	char *state;

	if ((uint_t)id >= max_ncpus) {
		cmn_err(CE_WARN,
		    "vcpu_config: cpu%d does not fit in this domain", id);
		return;
	}

	(void) snprintf(dir, sizeof (dir), "cpu/%d", id);
	state = kmem_alloc(MAXPATHLEN, KM_SLEEP);
	if (xenbus_scanf(XBT_NULL, dir, "availability", "%s", state) == 0) {
		if (strcmp(state, "online") == 0) {
			error = vcpu_config_poweron(id);
			vcpu_config_report(id, P_ONLINE, error);
		} else if (strcmp(state, "offline") == 0) {
			error = vcpu_config_poweroff(id);
			vcpu_config_report(id, P_POWEROFF, error);
		} else {
			cmn_err(CE_WARN,
			    "cpu%d: unknown target state '%s'", id, state);
		}
	} else
		cmn_err(CE_WARN,
		    "cpu%d: unable to read target state from xenstore", id);

	kmem_free(state, MAXPATHLEN);
}

/*ARGSUSED*/
static void
vcpu_config_event(struct xenbus_watch *watch, const char **vec, uint_t len)
{
	const char *path = vec[XS_WATCH_PATH];
	processorid_t id;
	char *s;

	if ((s = strstr(path, "cpu/")) != NULL &&
	    sscanf(s, "cpu/%d", &id) == 1) {
		/*
		 * Run the virtual CPU configuration on a separate thread to
		 * avoid blocking on this event for too long (and for now,
		 * to ensure configuration requests are serialized.)
		 */
		(void) taskq_dispatch(cpu_config_tq,
		    vcpu_config, (void *)(uintptr_t)id, 0);
	}
}

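/*
 * Load the initial register context for a VCPU into the hypervisor
 * (VCPUOP_initialise), decoding the hypervisor's error code into something
 * more readable for the console.
 */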
static int
xen_vcpu_initialize(processorid_t id, vcpu_guest_context_t *vgc)
{
	int err;

	if ((err = HYPERVISOR_vcpu_op(VCPUOP_initialise, id, vgc)) != 0) {
		char *str;
		int level = CE_WARN;

		switch (err) {
		case -X_EINVAL:
			/*
			 * This interface squashes multiple error sources
			 * to one error code.  In particular, an X_EINVAL
			 * code can mean:
			 *
			 * -	the vcpu id is out of range
			 * -	cs or ss are in ring 0
			 * -	cr3 is wrong
			 * -	an entry in the new gdt is above the
			 *	reserved entry
			 * -	a frame underneath the new gdt is bad
			 */
			str = "something is wrong :(";
			break;
		case -X_ENOENT:
			str = "no such cpu";
			break;
		case -X_ENOMEM:
			str = "no mem to copy ctxt";
			break;
		case -X_EFAULT:
			str = "bad address";
			break;
		case -X_EEXIST:
			/*
			 * Hmm.  This error is returned if the vcpu has already
			 * been initialized once before in the lifetime of this
			 * domain.  This is a logic error in the kernel.
			 */
			level = CE_PANIC;
			str = "already initialized";
			break;
		default:
			level = CE_PANIC;
			str = "<unexpected>";
			break;
		}

		cmn_err(level, "vcpu%d: failed to init: error %d: %s",
		    id, -err, str);
	}
	return (err);
}

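/*
 * Ask the hypervisor to start running a VCPU (VCPUOP_up).  On failure, log
 * a diagnostic and return a (deliberately odd) non-zero errno.
 */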
long
xen_vcpu_up(processorid_t id)
{
	long err;

	if ((err = HYPERVISOR_vcpu_op(VCPUOP_up, id, NULL)) != 0) {
		char *str;

		switch (err) {
		case -X_ENOENT:
			str = "no such cpu";
			break;
		case -X_EINVAL:
			/*
			 * Perhaps this is diagnostic overkill.
			 */
			if (HYPERVISOR_vcpu_op(VCPUOP_is_up, id, NULL) < 0)
				str = "bad cpuid";
			else
				str = "not initialized";
			break;
		default:
			str = "<unexpected>";
			break;
		}

		printf("vcpu%d: failed to start: error %d: %s\n",
		    id, -(int)err, str);
		return (EBFONT);	/* deliberately silly */
	}
	return (err);
}

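/*
 * Ask the hypervisor to stop running a VCPU (VCPUOP_down).  Failure means
 * we asked about a nonexistent or invalid VCPU, which is fatal.
 */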
long
xen_vcpu_down(processorid_t id)
{
	long err;

	if ((err = HYPERVISOR_vcpu_op(VCPUOP_down, id, NULL)) != 0) {
		/*
		 * X_ENOENT:	no such cpu
		 * X_EINVAL:	bad cpuid
		 */
		panic("vcpu%d: failed to stop: error %d", id, -(int)err);
	}

	return (err);
}