/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

/*
 * Virtual CPU management.
 *
 * VCPUs can be controlled in one of two ways: through the domain itself
 * (psradm, p_online(), etc.), and via changes in xenstore (vcpu_config()).
 * Unfortunately, the terminology is used in different ways; they work out as
 * follows:
 *
 * P_ONLINE: the VCPU is up and running, taking interrupts and running threads
 *
 * P_OFFLINE: the VCPU is up and running, but quiesced (i.e. blocked in the
 * hypervisor on the idle thread).  It must be up since a downed VCPU cannot
 * receive interrupts, and we require this for offline CPUs in Solaris.
 *
 * P_POWEROFF: the VCPU is down (we never called xen_vcpu_up(), or called
 * xen_vcpu_down() for it).  It can't take interrupts or run anything, though
 * if it has run previously, its software state (cpu_t, machcpu structures, IPI
 * event channels, etc.) will still exist.
 *
 * The hypervisor has two notions of CPU states as represented in the store:
 *
 * "offline": the VCPU is down.  Corresponds to P_POWEROFF.
 *
 * "online": the VCPU is running.  Corresponds to a CPU state other than
 * P_POWEROFF.
 *
 * Currently, only a notification via xenstore can bring a CPU into a
 * P_POWEROFF state, and only the domain can change between P_ONLINE, P_NOINTR,
 * P_OFFLINE, etc.  We need to be careful to treat xenstore notifications
 * idempotently, as we'll get 'duplicate' entries when we resume a domain.
 *
 * Note that the xenstore configuration is strictly advisory, in that a domain
 * can choose to ignore it and still power up a VCPU in the offline state.
 * To play nice, we don't allow it.  Thus, any attempt to power on/off a CPU
 * is ENOTSUP from within Solaris.
 *
 * Powering off a VCPU and suspending the domain use similar code.  The
 * difficulty here is that we must ensure that each VCPU is in a stable
 * state: it must have a saved PCB, and not be responding to interrupts
 * (since we are just about to remove its ability to run on a real CPU,
 * possibly forever).  However, an offline CPU in Solaris can take
 * cross-call interrupts, as mentioned, so we must go through a
 * two-stage process.  First, we use the standard Solaris pause_cpus().
 * This ensures that all CPUs are either in mach_cpu_pause() or
 * mach_cpu_idle(), and nothing will cross-call them.
 *
 * Powered-off CPUs are already safe, as we own the cpu_lock needed to
 * bring them back up, and they are in state CPU_PHASE_POWERED_OFF.
 *
 * Running CPUs are spinning in mach_cpu_pause() waiting for either
 * PAUSE_IDLE or CPU_PHASE_WAIT_SAFE.
 *
 * Offline CPUs are either running the idle thread and periodically
 * checking for CPU_PHASE_WAIT_SAFE, or blocked in the hypervisor.
 *
 * Thus, we set CPU_PHASE_WAIT_SAFE for every powered-on CPU, as well as
 * poking them to make sure they're not blocked[1].  When every CPU has
 * responded by reaching a safe state and setting CPU_PHASE_SAFE, we
 * know we can suspend, or power off a CPU, without problems.
 *
 * [1] note that we have to repeatedly poke offline CPUs: it's the only
 * way to ensure that the CPU doesn't miss the state change before
 * dropping into HYPERVISOR_block().
 */

#include <sys/types.h>
#include <sys/systm.h>
#include <sys/param.h>
#include <sys/taskq.h>
#include <sys/cmn_err.h>
#include <sys/archsystm.h>
#include <sys/machsystm.h>
#include <sys/segments.h>
#include <sys/cpuvar.h>
#include <sys/x86_archext.h>
#include <sys/controlregs.h>
#include <sys/hypervisor.h>
#include <sys/xpv_panic.h>
#include <sys/mman.h>
#include <sys/psw.h>
#include <sys/cpu.h>
#include <sys/sunddi.h>
#include <util/sscanf.h>
#include <vm/hat_i86.h>
#include <vm/hat.h>
#include <vm/as.h>

#include <xen/public/io/xs_wire.h>
#include <xen/sys/xenbus_impl.h>
#include <xen/public/vcpu.h>

extern cpuset_t cpu_ready_set;

#define	CPU_PHASE_NONE		0
#define	CPU_PHASE_WAIT_SAFE	1
#define	CPU_PHASE_SAFE		2
#define	CPU_PHASE_POWERED_OFF	3

/*
 * We can only poke CPUs during barrier enter 256 times a second at
 * most.
 */
#define	POKE_TIMEOUT	(NANOSEC / 256)

static taskq_t *cpu_config_tq;
static int cpu_phase[NCPU];

static void vcpu_config_event(struct xenbus_watch *, const char **, uint_t);
static int xen_vcpu_initialize(processorid_t, vcpu_guest_context_t *);

/*
 * Return whether or not the vcpu is actually running on a pcpu
 */
int
vcpu_on_pcpu(processorid_t cpu)
{
	struct vcpu_runstate_info runstate;
	int	ret = VCPU_STATE_UNKNOWN;

	ASSERT(cpu < NCPU);
	/*
	 * Don't bother with hypercall if we are asking about ourself
	 */
	if (cpu == CPU->cpu_id)
		return (VCPU_ON_PCPU);
	if (HYPERVISOR_vcpu_op(VCPUOP_get_runstate_info, cpu, &runstate) != 0)
		goto out;

	switch (runstate.state) {
	case RUNSTATE_running:
		ret = VCPU_ON_PCPU;
		break;

	case RUNSTATE_runnable:
	case RUNSTATE_offline:
	case RUNSTATE_blocked:
		ret = VCPU_NOT_ON_PCPU;
		break;

	default:
		break;
	}

out:
	return (ret);
}

/*
 * These routines allocate any global state that might be needed
 * while starting cpus.  For virtual cpus, there is no such state.
 */
int
mach_cpucontext_init(void)
{
	return (0);
}

void
do_cpu_config_watch(int state)
{
	static struct xenbus_watch cpu_config_watch;

	if (state != XENSTORE_UP)
		return;
	cpu_config_watch.node = "cpu";
	cpu_config_watch.callback = vcpu_config_event;
	if (register_xenbus_watch(&cpu_config_watch)) {
		taskq_destroy(cpu_config_tq);
		cmn_err(CE_WARN, "do_cpu_config_watch: "
		    "failed to set vcpu config watch");
	}
}

/*
 * This routine is called after all the "normal" MP startup has
 * been done; a good place to start watching xen store for virtual
 * cpu hot plug events.
 */
void
mach_cpucontext_fini(void)
{
	cpu_config_tq = taskq_create("vcpu config taskq", 1,
	    maxclsyspri - 1, 1, 1, TASKQ_PREPOPULATE);

	(void) xs_register_xenbus_callback(do_cpu_config_watch);
}

/*
 * Fill in the remaining CPU context and initialize it.
 */
static int
mp_set_cpu_context(vcpu_guest_context_t *vgc, cpu_t *cp)
{
	uint_t vec, iopl;

	vgc->flags = VGCF_IN_KERNEL;

	/*
	 * fpu_ctx we leave as zero; on first fault we'll store
	 * sse_initial into it anyway.
	 */

#if defined(__amd64)
	vgc->user_regs.cs = KCS_SEL | SEL_KPL;	/* force to ring 3 */
#else
	vgc->user_regs.cs = KCS_SEL;
#endif
	vgc->user_regs.ds = KDS_SEL;
	vgc->user_regs.es = KDS_SEL;
	vgc->user_regs.ss = KDS_SEL;
	vgc->kernel_ss = KDS_SEL;

	/*
	 * Allow I/O privilege level for Dom0 kernel.
	 */
	if (DOMAIN_IS_INITDOMAIN(xen_info))
		iopl = (PS_IOPL & 0x1000);	/* ring 1 */
	else
		iopl = 0;

#if defined(__amd64)
	vgc->user_regs.fs = 0;
	vgc->user_regs.gs = 0;
	vgc->user_regs.rflags = F_OFF | iopl;
#elif defined(__i386)
	vgc->user_regs.fs = KFS_SEL;
	vgc->user_regs.gs = KGS_SEL;
	vgc->user_regs.eflags = F_OFF | iopl;
	vgc->event_callback_cs = vgc->user_regs.cs;
	vgc->failsafe_callback_cs = vgc->user_regs.cs;
#endif

	/*
	 * Initialize the trap_info_t from the IDT
	 */
#if !defined(__lint)
	ASSERT(NIDT == sizeof (vgc->trap_ctxt) / sizeof (vgc->trap_ctxt[0]));
#endif
	for (vec = 0; vec < NIDT; vec++) {
		trap_info_t *ti = &vgc->trap_ctxt[vec];

		if (xen_idt_to_trap_info(vec,
		    &cp->cpu_m.mcpu_idt[vec], ti) == 0) {
			ti->cs = KCS_SEL;
			ti->vector = vec;
		}
	}

	/*
	 * No LDT
	 */

	/*
	 * (We assert in various places that the GDT is (a) aligned on a
	 * page boundary and (b) one page long, so this really should fit..)
	 */
#ifdef CRASH_XEN
	vgc->gdt_frames[0] = pa_to_ma(mmu_btop(cp->cpu_m.mcpu_gdtpa));
#else
	vgc->gdt_frames[0] = pfn_to_mfn(mmu_btop(cp->cpu_m.mcpu_gdtpa));
#endif
	vgc->gdt_ents = NGDT;

	vgc->ctrlreg[0] = CR0_ENABLE_FPU_FLAGS(getcr0());

#if defined(__i386)
	if (mmu.pae_hat)
		vgc->ctrlreg[3] =
		    xen_pfn_to_cr3(pfn_to_mfn(kas.a_hat->hat_htable->ht_pfn));
	else
#endif
		vgc->ctrlreg[3] =
		    pa_to_ma(mmu_ptob(kas.a_hat->hat_htable->ht_pfn));

	vgc->ctrlreg[4] = getcr4();

	vgc->event_callback_eip = (uintptr_t)xen_callback;
	vgc->failsafe_callback_eip = (uintptr_t)xen_failsafe_callback;
	vgc->flags |= VGCF_failsafe_disables_events;

#if defined(__amd64)
	/*
	 * XXPV should this be moved to init_cpu_syscall?
	 */
	vgc->syscall_callback_eip = (uintptr_t)sys_syscall;
	vgc->flags |= VGCF_syscall_disables_events;

	ASSERT(vgc->user_regs.gs == 0);
	vgc->gs_base_kernel = (uintptr_t)cp;
#endif

	return (xen_vcpu_initialize(cp->cpu_id, vgc));
}

/*
 * Create a guest virtual cpu context so that the virtual cpu
 * springs into life in the domain just about to call mp_startup()
 *
 * Virtual CPUs must be initialized once in the lifetime of the domain;
 * after that subsequent attempts to start them will fail with X_EEXIST.
 *
 * Thus 'alloc' -really- creates and initializes the virtual
 * CPU context just once.  Once the initialisation succeeds, we never
 * free it, nor the regular cpu_t to which it refers.
 */
void *
mach_cpucontext_alloc(struct cpu *cp)
{
	kthread_t *tp = cp->cpu_thread;
	vcpu_guest_context_t vgc;

	int err = 1;

	/*
	 * First, augment the incoming cpu structure
	 * - vcpu pointer reference
	 * - pending event storage area
	 * - physical address of GDT
	 */
	cp->cpu_m.mcpu_vcpu_info =
	    &HYPERVISOR_shared_info->vcpu_info[cp->cpu_id];
	cp->cpu_m.mcpu_evt_pend = kmem_zalloc(
	    sizeof (struct xen_evt_data), KM_SLEEP);
	cp->cpu_m.mcpu_gdtpa =
	    mmu_ptob(hat_getpfnum(kas.a_hat, (caddr_t)cp->cpu_gdt));

	if ((err = xen_gdt_setprot(cp, PROT_READ)) != 0)
		goto done;

	/*
	 * Now set up the vcpu context so that we can start this vcpu
	 * in the kernel at tp->t_pc (mp_startup).  Note that the
	 * thread will thread_exit() shortly after performing the
	 * initialization; in particular, we will *never* take a
	 * privilege transition on this thread.
	 */

	bzero(&vgc, sizeof (vgc));

#ifdef __amd64
	vgc.user_regs.rip = tp->t_pc;
	vgc.user_regs.rsp = tp->t_sp;
	vgc.user_regs.rbp = tp->t_sp - 2 * sizeof (greg_t);
#else
	vgc.user_regs.eip = tp->t_pc;
	vgc.user_regs.esp = tp->t_sp;
	vgc.user_regs.ebp = tp->t_sp - 2 * sizeof (greg_t);
#endif
	/*
	 * XXPV	Fix resume, if Russ didn't already fix it.
	 *
	 * Note that resume unconditionally puts t->t_stk + sizeof (regs)
	 * into kernel_sp via HYPERVISOR_stack_switch.
	 * This anticipates
	 * that only lwps take traps that switch to the kernel stack;
	 * part of creating an lwp adjusts the stack by subtracting
	 * sizeof (struct regs) off t_stk.
	 *
	 * The more interesting question is, why do we do all the work
	 * of a fully fledged lwp for a plain thread?  In particular
	 * we don't have to call HYPERVISOR_stack_switch for lwp-less threads
	 * or futz with the LDT.  This should probably all be done with
	 * an lwp context operator to keep pure thread context switch fast.
	 */
	vgc.kernel_sp = (ulong_t)tp->t_stk;

	err = mp_set_cpu_context(&vgc, cp);

done:
	if (err) {
		mach_cpucontext_free(cp, NULL, err);
		return (NULL);
	}
	return (cp);
}

/*
 * By the time we are called either we have successfully started
 * the cpu, or our attempt to start it has failed.
 */

/*ARGSUSED*/
void
mach_cpucontext_free(struct cpu *cp, void *arg, int err)
{
	switch (err) {
	case 0:
		break;
	case ETIMEDOUT:
		/*
		 * The vcpu context is loaded into the hypervisor, and
		 * we've tried to start it, but the vcpu has not been set
		 * running yet, for whatever reason.  We arrange to -not-
		 * free any data structures it may be referencing.  In
		 * particular, we've already told the hypervisor about
		 * the GDT, and so we can't map it read-write again.
		 */
		break;
	default:
		(void) xen_gdt_setprot(cp, PROT_READ | PROT_WRITE);
		kmem_free(cp->cpu_m.mcpu_evt_pend,
		    sizeof (struct xen_evt_data));
		break;
	}
}

/*
 * Reset this CPU's context.  Clear out any pending evtchn data, since event
 * channel numbers will all change when we resume.
 */
void
mach_cpucontext_reset(cpu_t *cp)
{
	bzero(cp->cpu_m.mcpu_evt_pend, sizeof (struct xen_evt_data));
	/* mcpu_intr_pending ? */
}

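/*
 * Copy a saved kernel PCB (the setjmp()-style label_t register save area)
 * into the user_regs portion of a vcpu guest context: PC, stack pointer,
 * frame pointer and the callee-saved registers.
 */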
static void
pcb_to_user_regs(label_t *pcb, vcpu_guest_context_t *vgc)
{
#ifdef __amd64
	vgc->user_regs.rip = pcb->val[REG_LABEL_PC];
	vgc->user_regs.rsp = pcb->val[REG_LABEL_SP];
	vgc->user_regs.rbp = pcb->val[REG_LABEL_BP];
	vgc->user_regs.rbx = pcb->val[REG_LABEL_RBX];
	vgc->user_regs.r12 = pcb->val[REG_LABEL_R12];
	vgc->user_regs.r13 = pcb->val[REG_LABEL_R13];
	vgc->user_regs.r14 = pcb->val[REG_LABEL_R14];
	vgc->user_regs.r15 = pcb->val[REG_LABEL_R15];
#else /* __amd64 */
	vgc->user_regs.eip = pcb->val[REG_LABEL_PC];
	vgc->user_regs.esp = pcb->val[REG_LABEL_SP];
	vgc->user_regs.ebp = pcb->val[REG_LABEL_BP];
	vgc->user_regs.ebx = pcb->val[REG_LABEL_EBX];
	vgc->user_regs.esi = pcb->val[REG_LABEL_ESI];
	vgc->user_regs.edi = pcb->val[REG_LABEL_EDI];
#endif /* __amd64 */
}

/*
 * Restore the context of a CPU during resume.  This context is always
 * inside enter_safe_phase(), below.
 */
void
mach_cpucontext_restore(cpu_t *cp)
{
	vcpu_guest_context_t vgc;
	int err;

	ASSERT(cp->cpu_thread == cp->cpu_pause_thread ||
	    cp->cpu_thread == cp->cpu_idle_thread);

	bzero(&vgc, sizeof (vgc));

	pcb_to_user_regs(&cp->cpu_thread->t_pcb, &vgc);

	/*
	 * We're emulating a longjmp() here: in particular, we need to bump the
	 * stack pointer to account for the pop of xIP that returning from
	 * longjmp() normally would do, and set the return value in xAX to 1.
	 */
#ifdef __amd64
	vgc.user_regs.rax = 1;
	vgc.user_regs.rsp += sizeof (ulong_t);
#else
	vgc.user_regs.eax = 1;
	vgc.user_regs.esp += sizeof (ulong_t);
#endif

	vgc.kernel_sp = cp->cpu_thread->t_sp;

	err = mp_set_cpu_context(&vgc, cp);

	ASSERT(err == 0);
}

/*
 * Reach a point at which the CPU can be safely powered-off or
 * suspended.  Nothing can wake this CPU out of the loop.
 */
static void
enter_safe_phase(void)
{
	ulong_t flags = intr_clear();

	if (setjmp(&curthread->t_pcb) == 0) {
		cpu_phase[CPU->cpu_id] = CPU_PHASE_SAFE;
		while (cpu_phase[CPU->cpu_id] == CPU_PHASE_SAFE)
			SMT_PAUSE();
	}

	ASSERT(!interrupts_enabled());

	intr_restore(flags);
}

/*
 * Offline CPUs run this code even under a pause_cpus(), so we must
 * check if we need to enter the safe phase.
 */
void
mach_cpu_idle(void)
{
	if (IN_XPV_PANIC()) {
		xpv_panic_halt();
	} else {
		(void) HYPERVISOR_block();
		if (cpu_phase[CPU->cpu_id] == CPU_PHASE_WAIT_SAFE)
			enter_safe_phase();
	}
}

/*
 * Spin until either start_cpus() wakes us up, or we get a request to
 * enter the safe phase (followed by a later start_cpus()).
 */
void
mach_cpu_pause(volatile char *safe)
{
	*safe = PAUSE_WAIT;
	membar_enter();

	while (*safe != PAUSE_IDLE) {
		if (cpu_phase[CPU->cpu_id] == CPU_PHASE_WAIT_SAFE)
			enter_safe_phase();
		SMT_PAUSE();
	}
}

void
mach_cpu_halt(char *msg)
{
	if (msg)
		prom_printf("%s\n", msg);
	(void) xen_vcpu_down(CPU->cpu_id);
}

/*ARGSUSED*/
int
mp_cpu_poweron(struct cpu *cp)
{
	return (ENOTSUP);
}

/*ARGSUSED*/
int
mp_cpu_poweroff(struct cpu *cp)
{
	return (ENOTSUP);
}

void
mp_enter_barrier(void)
{
	hrtime_t last_poke_time = 0;
	int poke_allowed = 0;
	int done = 0;
	int i;

	ASSERT(MUTEX_HELD(&cpu_lock));

	pause_cpus(NULL, NULL);

	while (!done) {
		done = 1;
		poke_allowed = 0;

		if (xpv_gethrtime() - last_poke_time > POKE_TIMEOUT) {
			last_poke_time = xpv_gethrtime();
			poke_allowed = 1;
		}

		for (i = 0; i < NCPU; i++) {
			cpu_t *cp = cpu_get(i);

			if (cp == NULL || cp == CPU)
				continue;

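			/*
			 * Drive this CPU towards CPU_PHASE_SAFE.  A CPU we
			 * haven't asked yet is moved to CPU_PHASE_WAIT_SAFE
			 * and poked; a CPU still waiting is re-poked (at
			 * most once per POKE_TIMEOUT) in case it blocked in
			 * the hypervisor before noticing the request.
			 */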
			switch (cpu_phase[i]) {
			case CPU_PHASE_NONE:
				cpu_phase[i] = CPU_PHASE_WAIT_SAFE;
				poke_cpu(i);
				done = 0;
				break;

			case CPU_PHASE_WAIT_SAFE:
				if (poke_allowed)
					poke_cpu(i);
				done = 0;
				break;

			case CPU_PHASE_SAFE:
			case CPU_PHASE_POWERED_OFF:
				break;
			}
		}

		SMT_PAUSE();
	}
}

void
mp_leave_barrier(void)
{
	int i;

	ASSERT(MUTEX_HELD(&cpu_lock));

	for (i = 0; i < NCPU; i++) {
		cpu_t *cp = cpu_get(i);

		if (cp == NULL || cp == CPU)
			continue;

		switch (cpu_phase[i]) {
		/*
		 * If we see a CPU in one of these phases, something has
		 * gone badly wrong with the guarantees
		 * mp_enter_barrier() is supposed to provide.  Rather
		 * than attempt to stumble along (and since we can't
		 * panic properly in this context), we tell the
		 * hypervisor we've crashed.
		 */
		case CPU_PHASE_NONE:
		case CPU_PHASE_WAIT_SAFE:
			(void) HYPERVISOR_shutdown(SHUTDOWN_crash);
			break;

		case CPU_PHASE_POWERED_OFF:
			break;

		case CPU_PHASE_SAFE:
			cpu_phase[i] = CPU_PHASE_NONE;
		}
	}

	start_cpus();
}

static int
poweroff_vcpu(struct cpu *cp)
{
	int error;

	ASSERT(MUTEX_HELD(&cpu_lock));

	ASSERT(CPU->cpu_id != cp->cpu_id);
	ASSERT(cp->cpu_flags & CPU_QUIESCED);

	mp_enter_barrier();

	if ((error = xen_vcpu_down(cp->cpu_id)) == 0) {
		ASSERT(cpu_phase[cp->cpu_id] == CPU_PHASE_SAFE);

		CPUSET_DEL(cpu_ready_set, cp->cpu_id);

		cp->cpu_flags |= CPU_POWEROFF | CPU_OFFLINE;
		cp->cpu_flags &=
		    ~(CPU_RUNNING | CPU_READY | CPU_EXISTS | CPU_ENABLE);

		cpu_phase[cp->cpu_id] = CPU_PHASE_POWERED_OFF;

		cpu_set_state(cp);
	}

	mp_leave_barrier();

	return (error);
}

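/*
 * Handle an externally requested power-off (an "offline" notification from
 * xenstore): take the CPU to P_OFFLINE via p_online_internal(), then, once
 * it has quiesced, take the VCPU down with poweroff_vcpu(), retrying while
 * another thread has moved it back to a non-quiesced state.
 */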
static int
vcpu_config_poweroff(processorid_t id)
{
	int oldstate;
	int error;
	cpu_t *cp;

	mutex_enter(&cpu_lock);

	if ((cp = cpu_get(id)) == NULL) {
		mutex_exit(&cpu_lock);
		return (ESRCH);
	}

	if (cpu_get_state(cp) == P_POWEROFF) {
		mutex_exit(&cpu_lock);
		return (0);
	}

	mutex_exit(&cpu_lock);

	do {
		error = p_online_internal(id, P_OFFLINE,
		    &oldstate);

		if (error != 0)
			break;

		/*
		 * So we just changed it to P_OFFLINE.  But then we dropped
		 * cpu_lock, so now it is possible for another thread to change
		 * the cpu back to a different, non-quiesced state e.g.
		 * P_ONLINE.
		 */
		mutex_enter(&cpu_lock);
		if ((cp = cpu_get(id)) == NULL)
			error = ESRCH;
		else {
			if (cp->cpu_flags & CPU_QUIESCED)
				error = poweroff_vcpu(cp);
			else
				error = EBUSY;
		}
		mutex_exit(&cpu_lock);
	} while (error == EBUSY);

	return (error);
}

/*
 * Add a new virtual cpu to the domain.
 */
static int
vcpu_config_new(processorid_t id)
{
	extern int start_cpu(processorid_t);
	int error;

	if (ncpus == 1) {
		printf("cannot (yet) add cpus to a single-cpu domain\n");
		return (ENOTSUP);
	}

	affinity_set(CPU_CURRENT);
	error = start_cpu(id);
	affinity_clear();
	return (error);
}

static int
poweron_vcpu(struct cpu *cp)
{
	int error;

	ASSERT(MUTEX_HELD(&cpu_lock));

	if (HYPERVISOR_vcpu_op(VCPUOP_is_up, cp->cpu_id, NULL) != 0) {
		printf("poweron_vcpu: vcpu%d is not available!\n",
		    cp->cpu_id);
		return (ENXIO);
	}

	if ((error = xen_vcpu_up(cp->cpu_id)) == 0) {
		CPUSET_ADD(cpu_ready_set, cp->cpu_id);
		cp->cpu_flags |= CPU_EXISTS | CPU_READY | CPU_RUNNING;
		cp->cpu_flags &= ~CPU_POWEROFF;
		/*
		 * There are some nasty races possible here.
		 * Tell the vcpu it's up one more time.
		 * XXPV	Is this enough?  Is this safe?
		 */
		(void) xen_vcpu_up(cp->cpu_id);

		cpu_phase[cp->cpu_id] = CPU_PHASE_NONE;

		cpu_set_state(cp);
	}
	return (error);
}

static int
vcpu_config_poweron(processorid_t id)
{
	cpu_t *cp;
	int oldstate;
	int error;

	if (id >= ncpus)
		return (vcpu_config_new(id));

	mutex_enter(&cpu_lock);

	if ((cp = cpu_get(id)) == NULL) {
		mutex_exit(&cpu_lock);
		return (ESRCH);
	}

	if (cpu_get_state(cp) != P_POWEROFF) {
		mutex_exit(&cpu_lock);
		return (0);
	}

	if ((error = poweron_vcpu(cp)) != 0) {
		mutex_exit(&cpu_lock);
		return (error);
	}

	mutex_exit(&cpu_lock);

	return (p_online_internal(id, P_ONLINE, &oldstate));
}

#define	REPORT_LEN	128

static void
vcpu_config_report(processorid_t id, uint_t newstate, int error)
{
	char *report = kmem_alloc(REPORT_LEN, KM_SLEEP);
	size_t len;
	char *ps;

	switch (newstate) {
	case P_ONLINE:
		ps = PS_ONLINE;
		break;
	case P_POWEROFF:
		ps = PS_POWEROFF;
		break;
	default:
		cmn_err(CE_PANIC, "unknown state %u\n", newstate);
		break;
	}

	len = snprintf(report, REPORT_LEN,
	    "cpu%d: externally initiated %s", id, ps);

	if (!error) {
		cmn_err(CE_CONT, "!%s\n", report);
		kmem_free(report, REPORT_LEN);
		return;
	}

	len += snprintf(report + len, REPORT_LEN - len,
	    " failed, error %d: ", error);
	switch (error) {
	case EEXIST:
		len += snprintf(report + len, REPORT_LEN - len,
		    "cpu already %s", ps ? ps : "?");
		break;
	case ESRCH:
		len += snprintf(report + len, REPORT_LEN - len,
		    "cpu not found");
		break;
	case EINVAL:
	case EALREADY:
		break;
	case EPERM:
		len += snprintf(report + len, REPORT_LEN - len,
		    "insufficient privilege (0x%x)", id);
		break;
	case EBUSY:
		switch (newstate) {
		case P_ONLINE:
			/*
			 * This return comes from mp_cpu_start -
			 * we cannot 'start' the boot CPU.
			 */
			len += snprintf(report + len, REPORT_LEN - len,
			    "already running");
			break;
		case P_POWEROFF:
			len += snprintf(report + len, REPORT_LEN - len,
			    "bound lwps?");
			break;
		default:
			break;
		}
	default:
		break;
	}

	cmn_err(CE_CONT, "%s\n", report);
	kmem_free(report, REPORT_LEN);
}

static void
vcpu_config(void *arg)
{
	int id = (int)(uintptr_t)arg;
	int error;
	char dir[16];
	char *state;

	if ((uint_t)id >= max_ncpus) {
		cmn_err(CE_WARN,
		    "vcpu_config: cpu%d does not fit in this domain", id);
		return;
	}

	(void) snprintf(dir, sizeof (dir), "cpu/%d", id);
	state = kmem_alloc(MAXPATHLEN, KM_SLEEP);
	if (xenbus_scanf(XBT_NULL, dir, "availability", "%s", state) == 0) {
		if (strcmp(state, "online") == 0) {
			error = vcpu_config_poweron(id);
			vcpu_config_report(id, P_ONLINE, error);
		} else if (strcmp(state, "offline") == 0) {
			error = vcpu_config_poweroff(id);
			vcpu_config_report(id, P_POWEROFF, error);
		} else {
			cmn_err(CE_WARN,
			    "cpu%d: unknown target state '%s'", id, state);
		}
	} else
		cmn_err(CE_WARN,
		    "cpu%d: unable to read target state from xenstore", id);

	kmem_free(state, MAXPATHLEN);
}

/*ARGSUSED*/
static void
vcpu_config_event(struct xenbus_watch *watch, const char **vec, uint_t len)
{
	const char *path = vec[XS_WATCH_PATH];
	processorid_t id;
	char *s;

	if ((s = strstr(path, "cpu/")) != NULL &&
	    sscanf(s, "cpu/%d", &id) == 1) {
		/*
		 * Run the virtual CPU configuration on a separate thread to
		 * avoid blocking on this event for too long (and for now,
		 * to ensure configuration requests are serialized.)
		 */
		(void) taskq_dispatch(cpu_config_tq,
		    vcpu_config, (void *)(uintptr_t)id, 0);
	}
}

static int
xen_vcpu_initialize(processorid_t id, vcpu_guest_context_t *vgc)
{
	int err;

	if ((err = HYPERVISOR_vcpu_op(VCPUOP_initialise, id, vgc)) != 0) {
		char *str;
		int level = CE_WARN;

		switch (err) {
		case -X_EINVAL:
			/*
			 * This interface squashes multiple error sources
			 * to one error code.  In particular, an X_EINVAL
			 * code can mean:
			 *
			 * -	the vcpu id is out of range
			 * -	cs or ss are in ring 0
			 * -	cr3 is wrong
			 * -	an entry in the new gdt is above the
			 *	reserved entry
			 * -	a frame underneath the new gdt is bad
			 */
			str = "something is wrong :(";
			break;
		case -X_ENOENT:
			str = "no such cpu";
			break;
		case -X_ENOMEM:
			str = "no mem to copy ctxt";
			break;
		case -X_EFAULT:
			str = "bad address";
			break;
		case -X_EEXIST:
			/*
			 * Hmm.  This error is returned if the vcpu has already
			 * been initialized once before in the lifetime of this
			 * domain.  This is a logic error in the kernel.
			 */
			level = CE_PANIC;
			str = "already initialized";
			break;
		default:
			level = CE_PANIC;
			str = "<unexpected>";
			break;
		}

		cmn_err(level, "vcpu%d: failed to init: error %d: %s",
		    id, -err, str);
	}
	return (err);
}

long
xen_vcpu_up(processorid_t id)
{
	long err;

	if ((err = HYPERVISOR_vcpu_op(VCPUOP_up, id, NULL)) != 0) {
		char *str;

		switch (err) {
		case -X_ENOENT:
			str = "no such cpu";
			break;
		case -X_EINVAL:
			/*
			 * Perhaps this is diagnostic overkill.
			 */
			if (HYPERVISOR_vcpu_op(VCPUOP_is_up, id, NULL) < 0)
				str = "bad cpuid";
			else
				str = "not initialized";
			break;
		default:
			str = "<unexpected>";
			break;
		}

		printf("vcpu%d: failed to start: error %d: %s\n",
		    id, -(int)err, str);
		return (EBFONT);	/* deliberately silly */
	}
	return (err);
}

long
xen_vcpu_down(processorid_t id)
{
	long err;

	if ((err = HYPERVISOR_vcpu_op(VCPUOP_down, id, NULL)) != 0) {
		/*
		 * X_ENOENT:	no such cpu
		 * X_EINVAL:	bad cpuid
		 */
		panic("vcpu%d: failed to stop: error %d", id, -(int)err);
	}

	return (err);
}