/* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright 2007 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ #pragma ident "%Z%%M% %I% %E% SMI" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include struct xen_evt_data cpu0_evt_data; /* cpu0's pending event data */ static taskq_t *cpu_config_tq; static void vcpu_config_event(struct xenbus_watch *, const char **, uint_t); static int xen_vcpu_initialize(processorid_t, vcpu_guest_context_t *); /* * These routines allocate any global state that might be needed * while starting cpus. For virtual cpus, there is no such state. 
 */
int
mach_cpucontext_init(void)
{
	return (0);
}

/*
 * Callback registered with xs_register_xenbus_callback() (see
 * mach_cpucontext_fini()).  When 'state' is XENSTORE_UP, register a
 * xenbus watch on the "cpu" node with vcpu_config_event() as its
 * callback.  Any other state is ignored.  If the watch cannot be
 * registered, the config taskq is destroyed and a warning is logged.
 */
void
do_cpu_config_watch(int state)
{
	/* static: the watch structure is handed to register_xenbus_watch() */
	static struct xenbus_watch cpu_config_watch;

	if (state != XENSTORE_UP)
		return;
	cpu_config_watch.node = "cpu";
	cpu_config_watch.callback = vcpu_config_event;
	if (register_xenbus_watch(&cpu_config_watch)) {
		taskq_destroy(cpu_config_tq);
		cmn_err(CE_WARN, "do_cpu_config_watch: "
		    "failed to set vcpu config watch");
	}
}

/*
 * This routine is called after all the "normal" MP startup has
 * been done; a good place to start watching xen store for virtual
 * cpu hot plug events.
 *
 * Creates the taskq on which vcpu_config() requests are run and
 * registers do_cpu_config_watch() as a xenbus callback.
 */
void
mach_cpucontext_fini(void)
{
	cpu_config_tq = taskq_create("vcpu config taskq", 1,
	    maxclsyspri - 1, 1, 1, TASKQ_PREPOPULATE);

	(void) xs_register_xenbus_callback(do_cpu_config_watch);
}

/*
 * Fill in the remaining CPU context and initialize it.
 *
 * The caller has already set the instruction/stack pointers and
 * kernel_sp in 'vgc'; this routine fills in the segment selectors,
 * flags, trap table, GDT frame, control registers and callback
 * addresses, then hands the context to xen_vcpu_initialize().
 *
 * Returns whatever xen_vcpu_initialize() returns for cp->cpu_id.
 */
static int
mp_set_cpu_context(vcpu_guest_context_t *vgc, cpu_t *cp)
{
	uint_t vec, iopl;

	vgc->flags = VGCF_IN_KERNEL;

	/*
	 * fpu_ctx we leave as zero; on first fault we'll store
	 * sse_initial into it anyway.
	 */

#if defined(__amd64)
	vgc->user_regs.cs = KCS_SEL | SEL_KPL;	/* force to ring 3 */
#else
	vgc->user_regs.cs = KCS_SEL;
#endif
	vgc->user_regs.ds = KDS_SEL;
	vgc->user_regs.es = KDS_SEL;
	vgc->user_regs.ss = KDS_SEL;
	vgc->kernel_ss = KDS_SEL;

	/*
	 * Allow I/O privilege level for Dom0 kernel.
	 */
	if (DOMAIN_IS_INITDOMAIN(xen_info))
		iopl = (PS_IOPL & 0x1000); /* ring 1 */
	else
		iopl = 0;

#if defined(__amd64)
	vgc->user_regs.fs = 0;
	vgc->user_regs.gs = 0;
	vgc->user_regs.rflags = F_OFF | iopl;
#elif defined(__i386)
	vgc->user_regs.fs = KFS_SEL;
	vgc->user_regs.gs = KGS_SEL;
	vgc->user_regs.eflags = F_OFF | iopl;
	vgc->event_callback_cs = vgc->user_regs.cs;
	vgc->failsafe_callback_cs = vgc->user_regs.cs;
#endif

	/*
	 * Initialize the trap_info_t from the IDT
	 */
#if !defined(__lint)
	ASSERT(NIDT == sizeof (vgc->trap_ctxt) / sizeof (vgc->trap_ctxt[0]));
#endif
	for (vec = 0; vec < NIDT; vec++) {
		trap_info_t *ti = &vgc->trap_ctxt[vec];

		/* only entries the conversion accepts (returns 0) are filled */
		if (xen_idt_to_trap_info(vec,
		    &cp->cpu_m.mcpu_idt[vec], ti) == 0) {
			ti->cs = KCS_SEL;
			ti->vector = vec;
		}
	}

	/*
	 * No LDT
	 */

	/*
	 * (We assert in various places that the GDT is (a) aligned on a
	 * page boundary and (b) one page long, so this really should fit..)
	 */
#ifdef CRASH_XEN
	vgc->gdt_frames[0] = pa_to_ma(mmu_btop(cp->cpu_m.mcpu_gdtpa));
#else
	vgc->gdt_frames[0] = pfn_to_mfn(mmu_btop(cp->cpu_m.mcpu_gdtpa));
#endif
	vgc->gdt_ents = NGDT;

	vgc->ctrlreg[0] = CR0_ENABLE_FPU_FLAGS(getcr0());

	/*
	 * Note the dangling 'else' inside the #if: on i386 the second
	 * assignment is the non-PAE arm; elsewhere it is unconditional.
	 */
#if defined(__i386)
	if (mmu.pae_hat)
		vgc->ctrlreg[3] =
		    xen_pfn_to_cr3(pfn_to_mfn(kas.a_hat->hat_htable->ht_pfn));
	else
#endif
		vgc->ctrlreg[3] =
		    pa_to_ma(mmu_ptob(kas.a_hat->hat_htable->ht_pfn));

	vgc->ctrlreg[4] = getcr4();

	vgc->event_callback_eip = (uintptr_t)xen_callback;
	vgc->failsafe_callback_eip = (uintptr_t)xen_failsafe_callback;
	vgc->flags |= VGCF_failsafe_disables_events;

#if defined(__amd64)
	/*
	 * XXPV should this be moved to init_cpu_syscall?
	 */
	vgc->syscall_callback_eip = (uintptr_t)sys_syscall;
	vgc->flags |= VGCF_syscall_disables_events;

	ASSERT(vgc->user_regs.gs == 0);
	vgc->gs_base_kernel = (uintptr_t)cp;
#endif

	return (xen_vcpu_initialize(cp->cpu_id, vgc));
}

/*
 * Create a guest virtual cpu context so that the virtual cpu
 * springs into life in the domain just about to call mp_startup()
 *
 * Virtual CPUs must be initialized once in the lifetime of the domain;
 * after that subsequent attempts to start them will fail with X_EEXIST.
 *
 * Thus 'alloc' -really- creates and initializes the virtual
 * CPU context just once. Once the initialisation succeeds, we never
 * free it, nor the regular cpu_t to which it refers.
 *
 * Returns 'cp' on success.  On failure, mach_cpucontext_free() is
 * called with the error and NULL is returned.
 */
void *
mach_cpucontext_alloc(struct cpu *cp)
{
	kthread_t *tp = cp->cpu_thread;
	vcpu_guest_context_t vgc;

	int err = 1;

	/*
	 * First, augment the incoming cpu structure
	 * - vcpu pointer reference
	 * - pending event storage area
	 * - physical address of GDT
	 */
	cp->cpu_m.mcpu_vcpu_info =
	    &HYPERVISOR_shared_info->vcpu_info[cp->cpu_id];
	cp->cpu_m.mcpu_evt_pend = kmem_zalloc(
	    sizeof (struct xen_evt_data), KM_SLEEP);
	cp->cpu_m.mcpu_gdtpa =
	    mmu_ptob(hat_getpfnum(kas.a_hat, (caddr_t)cp->cpu_gdt));

	if ((err = xen_gdt_setprot(cp, PROT_READ)) != 0)
		goto done;

	/*
	 * Now set up the vcpu context so that we can start this vcpu
	 * in the kernel at tp->t_pc (mp_startup).  Note that the
	 * thread will thread_exit() shortly after performing the
	 * initialization; in particular, we will *never* take a
	 * privilege transition on this thread.
	 */

	bzero(&vgc, sizeof (vgc));

#ifdef __amd64
	vgc.user_regs.rip = tp->t_pc;
	vgc.user_regs.rsp = tp->t_sp;
	vgc.user_regs.rbp = tp->t_sp - 2 * sizeof (greg_t);
#else
	vgc.user_regs.eip = tp->t_pc;
	vgc.user_regs.esp = tp->t_sp;
	vgc.user_regs.ebp = tp->t_sp - 2 * sizeof (greg_t);
#endif
	/*
	 * XXPV	Fix resume, if Russ didn't already fix it.
	 *
	 * Note that resume unconditionally puts t->t_stk + sizeof (regs)
	 * into kernel_sp via HYPERVISOR_stack_switch. This anticipates
	 * that only lwps take traps that switch to the kernel stack;
	 * part of creating an lwp adjusts the stack by subtracting
	 * sizeof (struct regs) off t_stk.
	 *
	 * The more interesting question is, why do we do all the work
	 * of a fully fledged lwp for a plain thread?  In particular
	 * we don't have to call HYPERVISOR_stack_switch for lwp-less threads
	 * or futz with the LDT.  This should probably all be done with
	 * an lwp context operator to keep pure thread context switch fast.
	 */
	vgc.kernel_sp = (ulong_t)tp->t_stk;

	err = mp_set_cpu_context(&vgc, cp);

done:
	if (err) {
		mach_cpucontext_free(cp, NULL, err);
		return (NULL);
	}
	return (cp);
}

/*
 * By the time we are called either we have successfully started
 * the cpu, or our attempt to start it has failed.
 *
 * 'err' selects the action: 0 and ETIMEDOUT release nothing; any other
 * value restores read-write protection on the GDT and frees the
 * pending-event area.  'arg' is unused.
 */

/*ARGSUSED*/
void
mach_cpucontext_free(struct cpu *cp, void *arg, int err)
{
	switch (err) {
	case 0:
		break;
	case ETIMEDOUT:
		/*
		 * The vcpu context is loaded into the hypervisor, and
		 * we've tried to start it, but the vcpu has not been set
		 * running yet, for whatever reason.  We arrange to -not-
		 * free any data structures it may be referencing.  In
		 * particular, we've already told the hypervisor about
		 * the GDT, and so we can't map it read-write again.
		 */
		break;
	default:
		(void) xen_gdt_setprot(cp, PROT_READ | PROT_WRITE);
		kmem_free(cp->cpu_m.mcpu_evt_pend,
		    sizeof (struct xen_evt_data));
		break;
	}
}

/*
 * Reset this CPU's context.  Clear out any pending evtchn data, since event
 * channel numbers will all change when we resume.
 */
void
mach_cpucontext_reset(cpu_t *cp)
{
	bzero(cp->cpu_m.mcpu_evt_pend, sizeof (struct xen_evt_data));
	/* mcpu_intr_pending ? */
}

/*
 * Copy the registers saved in a label_t (program counter, stack pointer,
 * frame pointer and the other registers recorded in pcb->val[]) into the
 * user_regs of a vcpu guest context.
 */
static void
pcb_to_user_regs(label_t *pcb, vcpu_guest_context_t *vgc)
{
#ifdef __amd64
	vgc->user_regs.rip = pcb->val[REG_LABEL_PC];
	vgc->user_regs.rsp = pcb->val[REG_LABEL_SP];
	vgc->user_regs.rbp = pcb->val[REG_LABEL_BP];
	vgc->user_regs.rbx = pcb->val[REG_LABEL_RBX];
	vgc->user_regs.r12 = pcb->val[REG_LABEL_R12];
	vgc->user_regs.r13 = pcb->val[REG_LABEL_R13];
	vgc->user_regs.r14 = pcb->val[REG_LABEL_R14];
	vgc->user_regs.r15 = pcb->val[REG_LABEL_R15];
#else /* __amd64 */
	vgc->user_regs.eip = pcb->val[REG_LABEL_PC];
	vgc->user_regs.esp = pcb->val[REG_LABEL_SP];
	vgc->user_regs.ebp = pcb->val[REG_LABEL_BP];
	vgc->user_regs.ebx = pcb->val[REG_LABEL_EBX];
	vgc->user_regs.esi = pcb->val[REG_LABEL_ESI];
	vgc->user_regs.edi = pcb->val[REG_LABEL_EDI];
#endif /* __amd64 */
}

/*
 * Restore the context of a CPU during resume.  The CPU must either
 * have been blocked in cpu_idle() (running the idle thread), if it was
 * offline, or inside cpu_pause_thread().  Either way we can restore safely
 * from the t_pcb.
 */
void
mach_cpucontext_restore(cpu_t *cp)
{
	vcpu_guest_context_t vgc;
	int err;

	ASSERT(cp->cpu_thread == cp->cpu_pause_thread ||
	    cp->cpu_thread == cp->cpu_idle_thread);

	bzero(&vgc, sizeof (vgc));

	pcb_to_user_regs(&cp->cpu_thread->t_pcb, &vgc);

	/*
	 * We're emulating a longjmp() here: in particular, we need to bump the
	 * stack pointer to account for the pop of xIP that returning from
	 * longjmp() normally would do, and set the return value in xAX to 1.
	 */
#ifdef __amd64
	vgc.user_regs.rax = 1;
	vgc.user_regs.rsp += sizeof (ulong_t);
#else
	vgc.user_regs.eax = 1;
	vgc.user_regs.esp += sizeof (ulong_t);
#endif

	vgc.kernel_sp = cp->cpu_thread->t_sp;

	err = mp_set_cpu_context(&vgc, cp);

	ASSERT(err == 0);
}

/*
 * Idle the current vcpu.  In the xpv panic context this simply calls
 * xpv_panic_halt().  Otherwise the thread's context is saved in t_pcb
 * with setjmp(), the CPU is added to cpu_suspend_set, the vcpu blocks in
 * the hypervisor, and the CPU is removed from the set once
 * HYPERVISOR_block() returns.
 */
void
mach_cpu_idle(void)
{
	if (IN_XPV_PANIC()) {
		xpv_panic_halt();
	} else  {
		/* saved context is what mach_cpucontext_restore() reloads */
		(void) setjmp(&curthread->t_pcb);
		CPUSET_ATOMIC_ADD(cpu_suspend_set, CPU->cpu_id);
		(void) HYPERVISOR_block();
		CPUSET_ATOMIC_DEL(cpu_suspend_set, CPU->cpu_id);
	}
}

/*
 * Print 'msg' (when non-NULL) with prom_printf() and take the current
 * vcpu down via xen_vcpu_down().
 */
void
mach_cpu_halt(char *msg)
{
	if (msg)
		prom_printf("%s\n", msg);
	(void) xen_vcpu_down(CPU->cpu_id);
}

/*
 * Pause the current vcpu with interrupts cleared.  On the direct return
 * from setjmp() the CPU is added to cpu_suspend_set and *safe is set to
 * PAUSE_WAIT; the CPU then spins until *safe becomes PAUSE_IDLE, leaves
 * the suspend set and restores the saved interrupt state.
 *
 * NOTE(review): the non-zero setjmp() return presumably corresponds to
 * the context being reloaded by mach_cpucontext_restore(), which sets
 * the return register to 1 -- confirm.
 */
void
mach_cpu_pause(volatile char *safe)
{
	ulong_t flags;

	flags = intr_clear();

	if (setjmp(&curthread->t_pcb) == 0) {
		CPUSET_ATOMIC_ADD(cpu_suspend_set, CPU->cpu_id);
		/*
		 * This cpu is now safe.
		 */
		*safe = PAUSE_WAIT;
		membar_enter();
	}

	while (*safe != PAUSE_IDLE)
		SMT_PAUSE();

	CPUSET_ATOMIC_DEL(cpu_suspend_set, CPU->cpu_id);

	intr_restore(flags);
}

/*
 * Virtual CPU management.
 *
 * VCPUs can be controlled in one of two ways; through the domain itself
 * (psradm, p_online(), etc.), and via changes in xenstore (vcpu_config()).
 * Unfortunately, the terminology is used in different ways; they work out as
 * follows:
 *
 * P_ONLINE: the VCPU is up and running, taking interrupts and running threads
 *
 * P_OFFLINE: the VCPU is up and running, but quiesced (i.e. blocked in the
 * hypervisor on the idle thread).  It must be up since a downed VCPU cannot
 * receive interrupts, and we require this for offline CPUs in Solaris.
 *
 * P_POWEROFF: the VCPU is down (we never called xen_vcpu_up(), or called
 * xen_vcpu_down() for it).  It can't take interrupts or run anything, though
 * if it has run previously, its software state (cpu_t, machcpu structures, IPI
 * event channels, etc.) will still exist.
 *
 * The hypervisor has two notions of CPU states as represented in the store:
 *
 * "offline": the VCPU is down.  Corresponds to P_POWEROFF.
 *
 * "online": the VCPU is running.
Corresponds to a CPU state other than * P_POWEROFF. * * Currently, only a notification via xenstore can bring a CPU into a * P_POWEROFF state, and only the domain can change between P_ONLINE, P_NOINTR, * P_OFFLINE, etc. We need to be careful to treat xenstore notifications * idempotently, as we'll get 'duplicate' entries when we resume a domain. * * Note that the xenstore configuration is strictly advisory, in that a domain * can choose to ignore it and still power up a VCPU in the offline state. To * play nice, we don't allow it. Thus, any attempt to power on/off a CPU is * ENOTSUP from within Solaris. */ /*ARGSUSED*/ int mp_cpu_poweron(struct cpu *cp) { return (ENOTSUP); } /*ARGSUSED*/ int mp_cpu_poweroff(struct cpu *cp) { return (ENOTSUP); } static int poweron_vcpu(struct cpu *cp) { int error; ASSERT(MUTEX_HELD(&cpu_lock)); if (HYPERVISOR_vcpu_op(VCPUOP_is_up, cp->cpu_id, NULL) != 0) { printf("poweron_vcpu: vcpu%d is not available!\n", cp->cpu_id); return (ENXIO); } if ((error = xen_vcpu_up(cp->cpu_id)) == 0) { CPUSET_ADD(cpu_ready_set, cp->cpu_id); cp->cpu_flags |= CPU_EXISTS | CPU_READY | CPU_RUNNING; cp->cpu_flags &= ~CPU_POWEROFF; /* * There are some nasty races possible here. * Tell the vcpu it's up one more time. * XXPV Is this enough? Is this safe? */ (void) xen_vcpu_up(cp->cpu_id); cpu_set_state(cp); } return (error); } static int poweroff_poke(void) { CPUSET_ATOMIC_DEL(cpu_suspend_set, CPU->cpu_id); return (0); } /* * We must ensure that the VCPU reaches a safe state (in the suspend set, and * thus is not going to change) before we can power it off. The VCPU could * still be in mach_cpu_pause() and about to head back out; so just checking * cpu_suspend_set() isn't sufficient to make sure the VCPU has stopped moving. * Instead, we xcall it to delete itself from the set; whichever way it comes * back from that xcall, it won't mark itself in the set until it's safely back * in mach_cpu_idle(). 
*/ static int poweroff_vcpu(struct cpu *cp) { int error; cpuset_t set; ASSERT(MUTEX_HELD(&cpu_lock)); ASSERT(CPU->cpu_id != cp->cpu_id); ASSERT(cp->cpu_flags & CPU_QUIESCED); CPUSET_ONLY(set, cp->cpu_id); xc_sync(0, 0, 0, X_CALL_HIPRI, set, (xc_func_t)poweroff_poke); while (!CPU_IN_SET(cpu_suspend_set, cp->cpu_id)) SMT_PAUSE(); if ((error = xen_vcpu_down(cp->cpu_id)) == 0) { ASSERT(CPU_IN_SET(cpu_suspend_set, cp->cpu_id)); CPUSET_DEL(cpu_ready_set, cp->cpu_id); cp->cpu_flags |= CPU_POWEROFF | CPU_OFFLINE; cp->cpu_flags &= ~(CPU_RUNNING | CPU_READY | CPU_EXISTS | CPU_ENABLE); cpu_set_state(cp); } return (error); } static int vcpu_config_poweroff(processorid_t id) { int oldstate; int error; cpu_t *cp; mutex_enter(&cpu_lock); if ((cp = cpu_get(id)) == NULL) { mutex_exit(&cpu_lock); return (ESRCH); } if (cpu_get_state(cp) == P_POWEROFF) { mutex_exit(&cpu_lock); return (0); } mutex_exit(&cpu_lock); do { error = p_online_internal(id, P_OFFLINE, &oldstate); if (error != 0) break; /* * So we just changed it to P_OFFLINE. But then we dropped * cpu_lock, so now it is possible for another thread to change * the cpu back to a different, non-quiesced state e.g. * P_ONLINE. */ mutex_enter(&cpu_lock); if ((cp = cpu_get(id)) == NULL) error = ESRCH; else { if (cp->cpu_flags & CPU_QUIESCED) error = poweroff_vcpu(cp); else error = EBUSY; } mutex_exit(&cpu_lock); } while (error == EBUSY); return (error); } /* * Add a new virtual cpu to the domain. 
*/ static int vcpu_config_new(processorid_t id) { extern int start_cpu(processorid_t); int error; if (ncpus == 1) { printf("cannot (yet) add cpus to a single-cpu domain\n"); return (ENOTSUP); } affinity_set(CPU_CURRENT); error = start_cpu(id); affinity_clear(); return (error); } static int vcpu_config_poweron(processorid_t id) { cpu_t *cp; int oldstate; int error; if (id >= ncpus) return (vcpu_config_new(id)); mutex_enter(&cpu_lock); if ((cp = cpu_get(id)) == NULL) { mutex_exit(&cpu_lock); return (ESRCH); } if (cpu_get_state(cp) != P_POWEROFF) { mutex_exit(&cpu_lock); return (0); } if ((error = poweron_vcpu(cp)) != 0) { mutex_exit(&cpu_lock); return (error); } mutex_exit(&cpu_lock); return (p_online_internal(id, P_ONLINE, &oldstate)); } #define REPORT_LEN 128 static void vcpu_config_report(processorid_t id, uint_t newstate, int error) { char *report = kmem_alloc(REPORT_LEN, KM_SLEEP); size_t len; char *ps; switch (newstate) { case P_ONLINE: ps = PS_ONLINE; break; case P_POWEROFF: ps = PS_POWEROFF; break; default: cmn_err(CE_PANIC, "unknown state %u\n", newstate); break; } len = snprintf(report, REPORT_LEN, "cpu%d: externally initiated %s", id, ps); if (!error) { cmn_err(CE_CONT, "!%s\n", report); kmem_free(report, REPORT_LEN); return; } len += snprintf(report + len, REPORT_LEN - len, " failed, error %d: ", error); switch (error) { case EEXIST: len += snprintf(report + len, REPORT_LEN - len, "cpu already %s", ps ? ps : "?"); break; case ESRCH: len += snprintf(report + len, REPORT_LEN - len, "cpu not found"); break; case EINVAL: case EALREADY: break; case EPERM: len += snprintf(report + len, REPORT_LEN - len, "insufficient privilege (0x%x)", id); break; case EBUSY: switch (newstate) { case P_ONLINE: /* * This return comes from mp_cpu_start - * we cannot 'start' the boot CPU. 
*/ len += snprintf(report + len, REPORT_LEN - len, "already running"); break; case P_POWEROFF: len += snprintf(report + len, REPORT_LEN - len, "bound lwps?"); break; default: break; } default: break; } cmn_err(CE_CONT, "%s\n", report); kmem_free(report, REPORT_LEN); } static void vcpu_config(void *arg) { int id = (int)(uintptr_t)arg; int error; char dir[16]; char *state; if ((uint_t)id >= max_ncpus) { cmn_err(CE_WARN, "vcpu_config: cpu%d does not fit in this domain", id); return; } (void) snprintf(dir, sizeof (dir), "cpu/%d", id); state = kmem_alloc(MAXPATHLEN, KM_SLEEP); if (xenbus_scanf(XBT_NULL, dir, "availability", "%s", state) == 0) { if (strcmp(state, "online") == 0) { error = vcpu_config_poweron(id); vcpu_config_report(id, P_ONLINE, error); } else if (strcmp(state, "offline") == 0) { error = vcpu_config_poweroff(id); vcpu_config_report(id, P_POWEROFF, error); } else { cmn_err(CE_WARN, "cpu%d: unknown target state '%s'", id, state); } } else cmn_err(CE_WARN, "cpu%d: unable to read target state from xenstore", id); kmem_free(state, MAXPATHLEN); } /*ARGSUSED*/ static void vcpu_config_event(struct xenbus_watch *watch, const char **vec, uint_t len) { const char *path = vec[XS_WATCH_PATH]; processorid_t id; char *s; if ((s = strstr(path, "cpu/")) != NULL && sscanf(s, "cpu/%d", &id) == 1) { /* * Run the virtual CPU configuration on a separate thread to * avoid blocking on this event for too long (and for now, * to ensure configuration requests are serialized.) */ (void) taskq_dispatch(cpu_config_tq, vcpu_config, (void *)(uintptr_t)id, 0); } } static int xen_vcpu_initialize(processorid_t id, vcpu_guest_context_t *vgc) { int err; if ((err = HYPERVISOR_vcpu_op(VCPUOP_initialise, id, vgc)) != 0) { char *str; int level = CE_WARN; switch (err) { case -X_EINVAL: /* * This interface squashes multiple error sources * to one error code. 
In particular, an X_EINVAL * code can mean: * * - the vcpu id is out of range * - cs or ss are in ring 0 * - cr3 is wrong * - an entry in the new gdt is above the * reserved entry * - a frame underneath the new gdt is bad */ str = "something is wrong :("; break; case -X_ENOENT: str = "no such cpu"; break; case -X_ENOMEM: str = "no mem to copy ctxt"; break; case -X_EFAULT: str = "bad address"; break; case -X_EEXIST: /* * Hmm. This error is returned if the vcpu has already * been initialized once before in the lifetime of this * domain. This is a logic error in the kernel. */ level = CE_PANIC; str = "already initialized"; break; default: level = CE_PANIC; str = ""; break; } cmn_err(level, "vcpu%d: failed to init: error %d: %s", id, -err, str); } return (err); } long xen_vcpu_up(processorid_t id) { long err; if ((err = HYPERVISOR_vcpu_op(VCPUOP_up, id, NULL)) != 0) { char *str; switch (err) { case -X_ENOENT: str = "no such cpu"; break; case -X_EINVAL: /* * Perhaps this is diagnostic overkill. */ if (HYPERVISOR_vcpu_op(VCPUOP_is_up, id, NULL) < 0) str = "bad cpuid"; else str = "not initialized"; break; default: str = ""; break; } printf("vcpu%d: failed to start: error %d: %s\n", id, -(int)err, str); return (EBFONT); /* deliberately silly */ } return (err); } long xen_vcpu_down(processorid_t id) { long err; if ((err = HYPERVISOR_vcpu_op(VCPUOP_down, id, NULL)) != 0) { /* * X_ENOENT: no such cpu * X_EINVAL: bad cpuid */ panic("vcpu%d: failed to stop: error %d", id, -(int)err); } return (err); }