/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
 * Use is subject to license terms.
 */

/*
 * Virtual CPU management.
 *
 * VCPUs can be controlled in one of two ways: through the domain itself
 * (psradm, p_online(), etc.), or via changes in xenstore (vcpu_config()).
 * Unfortunately, the terminology is used in different ways; the states work
 * out as follows:
 *
 * P_ONLINE: the VCPU is up and running, taking interrupts and running
 * threads.
 *
 * P_OFFLINE: the VCPU is up and running, but quiesced (i.e. blocked in the
 * hypervisor on the idle thread). It must be up, since a downed VCPU cannot
 * receive interrupts, and we require this for offline CPUs in Solaris.
 *
 * P_POWEROFF: the VCPU is down (we never called xen_vcpu_up(), or we called
 * xen_vcpu_down() for it). It can't take interrupts or run anything, though
 * if it has run previously, its software state (cpu_t, machcpu structures,
 * IPI event channels, etc.) will still exist.
 *
 * The hypervisor has two notions of CPU states as represented in the store:
 *
 * "offline": the VCPU is down. Corresponds to P_POWEROFF.
 *
 * "online": the VCPU is running. Corresponds to any CPU state other than
 * P_POWEROFF.
 *
 * Currently, only a notification via xenstore can bring a CPU into the
 * P_POWEROFF state, and only the domain can change between P_ONLINE,
 * P_NOINTR, P_OFFLINE, etc. We need to be careful to treat xenstore
 * notifications idempotently, as we'll get 'duplicate' entries when we
 * resume a domain.
 *
 * Note that the xenstore configuration is strictly advisory, in that a domain
 * can choose to ignore it and still power up a VCPU in the offline state. To
 * play nice, we don't allow it. Thus, any attempt to power a CPU on or off
 * from within Solaris returns ENOTSUP.
 *
 * Powering off a VCPU and suspending the domain use similar code. The
 * difficulty here is that we must ensure that each VCPU is in a stable
 * state: it must have a saved PCB, and not be responding to interrupts
 * (since we are just about to remove its ability to run on a real CPU,
 * possibly forever). However, an offline CPU in Solaris can take
 * cross-call interrupts, as mentioned, so we must go through a
 * two-stage process. First, we use the standard Solaris pause_cpus().
 * This ensures that all CPUs are either in mach_cpu_pause() or
 * mach_cpu_idle(), and nothing will cross-call them.
 *
 * Powered-off CPUs are already safe, as we own the cpu_lock needed to
 * bring them back up, and they are in state CPU_PHASE_POWERED_OFF.
 *
 * Running CPUs are spinning in mach_cpu_pause() waiting for either
 * PAUSE_IDLE or CPU_PHASE_WAIT_SAFE.
 *
 * Offline CPUs are either running the idle thread and periodically
 * checking for CPU_PHASE_WAIT_SAFE, or blocked in the hypervisor.
 *
 * Thus, we set CPU_PHASE_WAIT_SAFE for every powered-on CPU, as well as
 * poking them to make sure they're not blocked[1]. When every CPU has
 * responded by reaching a safe state and setting CPU_PHASE_SAFE, we
 * know we can suspend, or power off a CPU, without problems.
 *
 * [1] Note that we have to repeatedly poke offline CPUs: it's the only
 * way to ensure that the CPU doesn't miss the state change before
 * dropping into HYPERVISOR_block().
 */
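
/*
 * Informal summary (not normative; the description above is authoritative)
 * of the cpu_phase[] transitions implemented below:
 *
 *	CPU_PHASE_NONE -> CPU_PHASE_WAIT_SAFE	mp_enter_barrier() asks the
 *						CPU to park itself
 *	CPU_PHASE_WAIT_SAFE -> CPU_PHASE_SAFE	the CPU reaches
 *						enter_safe_phase()
 *	CPU_PHASE_SAFE -> CPU_PHASE_NONE	mp_leave_barrier() releases
 *						the CPU
 *	CPU_PHASE_POWERED_OFF			set by poweroff_vcpu();
 *						cleared by poweron_vcpu()
 */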

#pragma ident "%Z%%M% %I% %E% SMI"

#include <sys/types.h>
#include <sys/systm.h>
#include <sys/param.h>
#include <sys/taskq.h>
#include <sys/cmn_err.h>
#include <sys/archsystm.h>
#include <sys/machsystm.h>
#include <sys/segments.h>
#include <sys/cpuvar.h>
#include <sys/x86_archext.h>
#include <sys/controlregs.h>
#include <sys/hypervisor.h>
#include <sys/xpv_panic.h>
#include <sys/mman.h>
#include <sys/psw.h>
#include <sys/cpu.h>
#include <sys/sunddi.h>
#include <util/sscanf.h>
#include <vm/hat_i86.h>
#include <vm/hat.h>
#include <vm/as.h>

#include <xen/public/io/xs_wire.h>
#include <xen/sys/xenbus_impl.h>
#include <xen/public/vcpu.h>

#define	CPU_PHASE_NONE		0
#define	CPU_PHASE_WAIT_SAFE	1
#define	CPU_PHASE_SAFE		2
#define	CPU_PHASE_POWERED_OFF	3

/*
 * During barrier enter, we can poke CPUs at most 256 times a second.
 */
#define	POKE_TIMEOUT	(NANOSEC / 256)

static taskq_t *cpu_config_tq;
static int cpu_phase[NCPU];

static void vcpu_config_event(struct xenbus_watch *, const char **, uint_t);
static int xen_vcpu_initialize(processorid_t, vcpu_guest_context_t *);

/*
 * Return whether or not the vcpu is actually running on a pcpu
 */
int
vcpu_on_pcpu(processorid_t cpu)
{
        struct vcpu_runstate_info runstate;
        int ret = VCPU_STATE_UNKNOWN;

        ASSERT(cpu < NCPU);
        /*
         * Don't bother with the hypercall if we are asking about ourselves
         */
        if (cpu == CPU->cpu_id)
                return (VCPU_ON_PCPU);
        if (HYPERVISOR_vcpu_op(VCPUOP_get_runstate_info, cpu, &runstate) != 0)
                goto out;

        switch (runstate.state) {
        case RUNSTATE_running:
                ret = VCPU_ON_PCPU;
                break;

        case RUNSTATE_runnable:
        case RUNSTATE_offline:
        case RUNSTATE_blocked:
                ret = VCPU_NOT_ON_PCPU;
                break;

        default:
                break;
        }

out:
        return (ret);
}

/*
 * These routines allocate any global state that might be needed
 * while starting cpus. For virtual cpus, there is no such state.
 */
int
mach_cpucontext_init(void)
{
        return (0);
}

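/*
 * Callback run when the xenstore connection comes up: register a watch
 * on the "cpu" subtree so that vcpu_config_event() is invoked whenever
 * the virtual CPU configuration changes.
 */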
void
do_cpu_config_watch(int state)
{
        static struct xenbus_watch cpu_config_watch;

        if (state != XENSTORE_UP)
                return;
        cpu_config_watch.node = "cpu";
        cpu_config_watch.callback = vcpu_config_event;
        if (register_xenbus_watch(&cpu_config_watch)) {
                taskq_destroy(cpu_config_tq);
                cmn_err(CE_WARN, "do_cpu_config_watch: "
                    "failed to set vcpu config watch");
        }
}

/*
 * This routine is called after all the "normal" MP startup has
 * been done; a good place to start watching xen store for virtual
 * cpu hot plug events.
 */
void
mach_cpucontext_fini(void)
{
        cpu_config_tq = taskq_create("vcpu config taskq", 1,
            maxclsyspri - 1, 1, 1, TASKQ_PREPOPULATE);

        (void) xs_register_xenbus_callback(do_cpu_config_watch);
}

/*
 * Fill in the remaining CPU context and initialize it.
 */
static int
mp_set_cpu_context(vcpu_guest_context_t *vgc, cpu_t *cp)
{
        uint_t vec, iopl;

        vgc->flags = VGCF_IN_KERNEL;

        /*
         * fpu_ctx we leave as zero; on first fault we'll store
         * sse_initial into it anyway.
         */

#if defined(__amd64)
        vgc->user_regs.cs = KCS_SEL | SEL_KPL;  /* force to ring 3 */
#else
        vgc->user_regs.cs = KCS_SEL;
#endif
        vgc->user_regs.ds = KDS_SEL;
        vgc->user_regs.es = KDS_SEL;
        vgc->user_regs.ss = KDS_SEL;
        vgc->kernel_ss = KDS_SEL;

        /*
         * Allow I/O privilege level for Dom0 kernel.
         */
        if (DOMAIN_IS_INITDOMAIN(xen_info))
                iopl = (PS_IOPL & 0x1000);      /* ring 1 */
        else
                iopl = 0;

#if defined(__amd64)
        vgc->user_regs.fs = 0;
        vgc->user_regs.gs = 0;
        vgc->user_regs.rflags = F_OFF | iopl;
#elif defined(__i386)
        vgc->user_regs.fs = KFS_SEL;
        vgc->user_regs.gs = KGS_SEL;
        vgc->user_regs.eflags = F_OFF | iopl;
        vgc->event_callback_cs = vgc->user_regs.cs;
        vgc->failsafe_callback_cs = vgc->user_regs.cs;
#endif

        /*
         * Initialize the trap_info_t from the IDT
         */
#if !defined(__lint)
        ASSERT(NIDT == sizeof (vgc->trap_ctxt) / sizeof (vgc->trap_ctxt[0]));
#endif
        for (vec = 0; vec < NIDT; vec++) {
                trap_info_t *ti = &vgc->trap_ctxt[vec];

                if (xen_idt_to_trap_info(vec,
                    &cp->cpu_m.mcpu_idt[vec], ti) == 0) {
                        ti->cs = KCS_SEL;
                        ti->vector = vec;
                }
        }

        /*
         * No LDT
         */

        /*
         * (We assert in various places that the GDT is (a) aligned on a
         * page boundary and (b) one page long, so this really should fit..)
         */
#ifdef CRASH_XEN
        vgc->gdt_frames[0] = pa_to_ma(mmu_btop(cp->cpu_m.mcpu_gdtpa));
#else
        vgc->gdt_frames[0] = pfn_to_mfn(mmu_btop(cp->cpu_m.mcpu_gdtpa));
#endif
        vgc->gdt_ents = NGDT;

        vgc->ctrlreg[0] = CR0_ENABLE_FPU_FLAGS(getcr0());

#if defined(__i386)
        if (mmu.pae_hat)
                vgc->ctrlreg[3] =
                    xen_pfn_to_cr3(pfn_to_mfn(kas.a_hat->hat_htable->ht_pfn));
        else
#endif
                vgc->ctrlreg[3] =
                    pa_to_ma(mmu_ptob(kas.a_hat->hat_htable->ht_pfn));

        vgc->ctrlreg[4] = getcr4();

        vgc->event_callback_eip = (uintptr_t)xen_callback;
        vgc->failsafe_callback_eip = (uintptr_t)xen_failsafe_callback;
        vgc->flags |= VGCF_failsafe_disables_events;

#if defined(__amd64)
        /*
         * XXPV should this be moved to init_cpu_syscall?
         */
        vgc->syscall_callback_eip = (uintptr_t)sys_syscall;
        vgc->flags |= VGCF_syscall_disables_events;

        ASSERT(vgc->user_regs.gs == 0);
        vgc->gs_base_kernel = (uintptr_t)cp;
#endif

        return (xen_vcpu_initialize(cp->cpu_id, vgc));
}

/*
 * Create a guest virtual cpu context so that the virtual cpu
 * springs into life in the domain just about to call mp_startup().
 *
 * Virtual CPUs must be initialized once in the lifetime of the domain;
 * after that, subsequent attempts to start them will fail with X_EEXIST.
 *
 * Thus 'alloc' -really- creates and initializes the virtual
 * CPU context just once. Once the initialisation succeeds, we never
 * free it, nor the regular cpu_t to which it refers.
 */
void *
mach_cpucontext_alloc(struct cpu *cp)
{
        kthread_t *tp = cp->cpu_thread;
        vcpu_guest_context_t vgc;

        int err = 1;

        /*
         * First, augment the incoming cpu structure
         * - vcpu pointer reference
         * - pending event storage area
         * - physical address of GDT
         */
        cp->cpu_m.mcpu_vcpu_info =
            &HYPERVISOR_shared_info->vcpu_info[cp->cpu_id];
        cp->cpu_m.mcpu_evt_pend = kmem_zalloc(
            sizeof (struct xen_evt_data), KM_SLEEP);
        cp->cpu_m.mcpu_gdtpa =
            mmu_ptob(hat_getpfnum(kas.a_hat, (caddr_t)cp->cpu_gdt));

        if ((err = xen_gdt_setprot(cp, PROT_READ)) != 0)
                goto done;

        /*
         * Now set up the vcpu context so that we can start this vcpu
         * in the kernel at tp->t_pc (mp_startup). Note that the
         * thread will thread_exit() shortly after performing the
         * initialization; in particular, we will *never* take a
         * privilege transition on this thread.
         */

        bzero(&vgc, sizeof (vgc));

#ifdef __amd64
        vgc.user_regs.rip = tp->t_pc;
        vgc.user_regs.rsp = tp->t_sp;
        vgc.user_regs.rbp = tp->t_sp - 2 * sizeof (greg_t);
#else
        vgc.user_regs.eip = tp->t_pc;
        vgc.user_regs.esp = tp->t_sp;
        vgc.user_regs.ebp = tp->t_sp - 2 * sizeof (greg_t);
#endif
        /*
         * XXPV Fix resume, if Russ didn't already fix it.
         *
         * Note that resume unconditionally puts t->t_stk + sizeof (regs)
         * into kernel_sp via HYPERVISOR_stack_switch. This anticipates
         * that only lwps take traps that switch to the kernel stack;
         * part of creating an lwp adjusts the stack by subtracting
         * sizeof (struct regs) off t_stk.
         *
         * The more interesting question is, why do we do all the work
         * of a fully fledged lwp for a plain thread? In particular,
         * we don't have to call HYPERVISOR_stack_switch for lwp-less threads
         * or futz with the LDT. This should probably all be done with
         * an lwp context operator to keep pure thread context switch fast.
         */
        vgc.kernel_sp = (ulong_t)tp->t_stk;

        err = mp_set_cpu_context(&vgc, cp);

done:
        if (err) {
                mach_cpucontext_free(cp, NULL, err);
                return (NULL);
        }
        return (cp);
}

/*
 * By the time we are called, either we have successfully started
 * the cpu, or our attempt to start it has failed.
 */
/*ARGSUSED*/
void
mach_cpucontext_free(struct cpu *cp, void *arg, int err)
{
        switch (err) {
        case 0:
                break;
        case ETIMEDOUT:
                /*
                 * The vcpu context is loaded into the hypervisor, and
                 * we've tried to start it, but the vcpu has not been set
                 * running yet, for whatever reason. We arrange to -not-
                 * free any data structures it may be referencing. In
                 * particular, we've already told the hypervisor about
                 * the GDT, and so we can't map it read-write again.
                 */
                break;
        default:
                (void) xen_gdt_setprot(cp, PROT_READ | PROT_WRITE);
                kmem_free(cp->cpu_m.mcpu_evt_pend,
                    sizeof (struct xen_evt_data));
                break;
        }
}

/*
 * Reset this CPU's context. Clear out any pending evtchn data, since event
 * channel numbers will all change when we resume.
 */
void
mach_cpucontext_reset(cpu_t *cp)
{
        bzero(cp->cpu_m.mcpu_evt_pend, sizeof (struct xen_evt_data));
        /* mcpu_intr_pending ? */
}

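/*
 * Transcribe the saved program counter, stack pointer and callee-saved
 * registers from a thread's PCB (label_t) into the user_regs area of a
 * vcpu_guest_context_t.
 */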
static void
pcb_to_user_regs(label_t *pcb, vcpu_guest_context_t *vgc)
{
#ifdef __amd64
        vgc->user_regs.rip = pcb->val[REG_LABEL_PC];
        vgc->user_regs.rsp = pcb->val[REG_LABEL_SP];
        vgc->user_regs.rbp = pcb->val[REG_LABEL_BP];
        vgc->user_regs.rbx = pcb->val[REG_LABEL_RBX];
        vgc->user_regs.r12 = pcb->val[REG_LABEL_R12];
        vgc->user_regs.r13 = pcb->val[REG_LABEL_R13];
        vgc->user_regs.r14 = pcb->val[REG_LABEL_R14];
        vgc->user_regs.r15 = pcb->val[REG_LABEL_R15];
#else /* __amd64 */
        vgc->user_regs.eip = pcb->val[REG_LABEL_PC];
        vgc->user_regs.esp = pcb->val[REG_LABEL_SP];
        vgc->user_regs.ebp = pcb->val[REG_LABEL_BP];
        vgc->user_regs.ebx = pcb->val[REG_LABEL_EBX];
        vgc->user_regs.esi = pcb->val[REG_LABEL_ESI];
        vgc->user_regs.edi = pcb->val[REG_LABEL_EDI];
#endif /* __amd64 */
}

/*
 * Restore the context of a CPU during resume. This context is always
 * inside enter_safe_phase(), below.
 */
void
mach_cpucontext_restore(cpu_t *cp)
{
        vcpu_guest_context_t vgc;
        int err;

        ASSERT(cp->cpu_thread == cp->cpu_pause_thread ||
            cp->cpu_thread == cp->cpu_idle_thread);

        bzero(&vgc, sizeof (vgc));

        pcb_to_user_regs(&cp->cpu_thread->t_pcb, &vgc);

        /*
         * We're emulating a longjmp() here: in particular, we need to bump the
         * stack pointer to account for the pop of xIP that returning from
         * longjmp() normally would do, and set the return value in xAX to 1.
         */
#ifdef __amd64
        vgc.user_regs.rax = 1;
        vgc.user_regs.rsp += sizeof (ulong_t);
#else
        vgc.user_regs.eax = 1;
        vgc.user_regs.esp += sizeof (ulong_t);
#endif

        vgc.kernel_sp = cp->cpu_thread->t_sp;

        err = mp_set_cpu_context(&vgc, cp);

        ASSERT(err == 0);
}

/*
 * Reach a point at which the CPU can be safely powered-off or
 * suspended. Nothing can wake this CPU out of the loop.
 */
static void
enter_safe_phase(void)
{
        ulong_t flags = intr_clear();

        if (setjmp(&curthread->t_pcb) == 0) {
                cpu_phase[CPU->cpu_id] = CPU_PHASE_SAFE;
                while (cpu_phase[CPU->cpu_id] == CPU_PHASE_SAFE)
                        SMT_PAUSE();
        }

        ASSERT(!interrupts_enabled());

        intr_restore(flags);
}

/*
 * Offline CPUs run this code even under a pause_cpus(), so we must
 * check if we need to enter the safe phase.
 */
void
mach_cpu_idle(void)
{
        if (IN_XPV_PANIC()) {
                xpv_panic_halt();
        } else {
                (void) HYPERVISOR_block();
                if (cpu_phase[CPU->cpu_id] == CPU_PHASE_WAIT_SAFE)
                        enter_safe_phase();
        }
}

/*
 * Spin until either start_cpus() wakes us up, or we get a request to
 * enter the safe phase (followed by a later start_cpus()).
 */
void
mach_cpu_pause(volatile char *safe)
{
        *safe = PAUSE_WAIT;
        membar_enter();

        while (*safe != PAUSE_IDLE) {
                if (cpu_phase[CPU->cpu_id] == CPU_PHASE_WAIT_SAFE)
                        enter_safe_phase();
                SMT_PAUSE();
        }
}

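/*
 * Halt this CPU: print the message, if any, and then take the underlying
 * virtual CPU down in the hypervisor.
 */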
void
mach_cpu_halt(char *msg)
{
        if (msg)
                prom_printf("%s\n", msg);
        (void) xen_vcpu_down(CPU->cpu_id);
}

/*
 * Power-on and power-off requests from within the domain are not
 * supported: as described in the comment at the top of this file, VCPUs
 * are powered on and off only via the xenstore configuration.
 */
/*ARGSUSED*/
int
mp_cpu_poweron(struct cpu *cp)
{
        return (ENOTSUP);
}

/*ARGSUSED*/
int
mp_cpu_poweroff(struct cpu *cp)
{
        return (ENOTSUP);
}

/*
 * Bring every other CPU to a safe phase (see the comment at the top of
 * this file): pause them, then ask each powered-on CPU to park itself in
 * enter_safe_phase(), poking it periodically in case it is blocked in the
 * hypervisor.
 */
void
mp_enter_barrier(void)
{
        hrtime_t last_poke_time = 0;
        int poke_allowed = 0;
        int done = 0;
        int i;

        ASSERT(MUTEX_HELD(&cpu_lock));

        pause_cpus(NULL);

        while (!done) {
                done = 1;
                poke_allowed = 0;

                if (xpv_gethrtime() - last_poke_time > POKE_TIMEOUT) {
                        last_poke_time = xpv_gethrtime();
                        poke_allowed = 1;
                }

                for (i = 0; i < NCPU; i++) {
                        cpu_t *cp = cpu_get(i);

                        if (cp == NULL || cp == CPU)
                                continue;

                        switch (cpu_phase[i]) {
                        case CPU_PHASE_NONE:
                                cpu_phase[i] = CPU_PHASE_WAIT_SAFE;
                                poke_cpu(i);
                                done = 0;
                                break;

                        case CPU_PHASE_WAIT_SAFE:
                                if (poke_allowed)
                                        poke_cpu(i);
                                done = 0;
                                break;

                        case CPU_PHASE_SAFE:
                        case CPU_PHASE_POWERED_OFF:
                                break;
                        }
                }

                SMT_PAUSE();
        }
}

/*
 * Release the CPUs parked by mp_enter_barrier() and restart them.
 */
void
mp_leave_barrier(void)
{
        int i;

        ASSERT(MUTEX_HELD(&cpu_lock));

        for (i = 0; i < NCPU; i++) {
                cpu_t *cp = cpu_get(i);

                if (cp == NULL || cp == CPU)
                        continue;

                switch (cpu_phase[i]) {
                /*
                 * If we see a CPU in one of these phases, something has
                 * gone badly wrong with the guarantees
                 * mp_enter_barrier() is supposed to provide. Rather
                 * than attempt to stumble along (and since we can't
                 * panic properly in this context), we tell the
                 * hypervisor we've crashed.
                 */
                case CPU_PHASE_NONE:
                case CPU_PHASE_WAIT_SAFE:
                        (void) HYPERVISOR_shutdown(SHUTDOWN_crash);
                        break;

                case CPU_PHASE_POWERED_OFF:
                        break;

                case CPU_PHASE_SAFE:
                        cpu_phase[i] = CPU_PHASE_NONE;
                }
        }

        start_cpus();
}

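/*
 * Take a quiesced CPU down in the hypervisor. The caller must hold
 * cpu_lock; we pass through the barrier so that the target VCPU is in a
 * safe state before xen_vcpu_down() removes its ability to run.
 */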
static int
poweroff_vcpu(struct cpu *cp)
{
        int error;

        ASSERT(MUTEX_HELD(&cpu_lock));

        ASSERT(CPU->cpu_id != cp->cpu_id);
        ASSERT(cp->cpu_flags & CPU_QUIESCED);

        mp_enter_barrier();

        if ((error = xen_vcpu_down(cp->cpu_id)) == 0) {
                ASSERT(cpu_phase[cp->cpu_id] == CPU_PHASE_SAFE);

                CPUSET_DEL(cpu_ready_set, cp->cpu_id);

                cp->cpu_flags |= CPU_POWEROFF | CPU_OFFLINE;
                cp->cpu_flags &=
                    ~(CPU_RUNNING | CPU_READY | CPU_EXISTS | CPU_ENABLE);

                cpu_phase[cp->cpu_id] = CPU_PHASE_POWERED_OFF;

                cpu_set_state(cp);
        }

        mp_leave_barrier();

        return (error);
}

/*
 * Handle an externally initiated ("offline" in xenstore) power-off
 * request: take the CPU offline, then, provided it is still quiesced,
 * power it off.
 */
static int
vcpu_config_poweroff(processorid_t id)
{
        int oldstate;
        int error;
        cpu_t *cp;

        mutex_enter(&cpu_lock);

        if ((cp = cpu_get(id)) == NULL) {
                mutex_exit(&cpu_lock);
                return (ESRCH);
        }

        if (cpu_get_state(cp) == P_POWEROFF) {
                mutex_exit(&cpu_lock);
                return (0);
        }

        mutex_exit(&cpu_lock);

        do {
                error = p_online_internal(id, P_OFFLINE,
                    &oldstate);

                if (error != 0)
                        break;

                /*
                 * So we just changed it to P_OFFLINE. But then we dropped
                 * cpu_lock, so now it is possible for another thread to change
                 * the cpu back to a different, non-quiesced state e.g.
                 * P_ONLINE.
                 */
                mutex_enter(&cpu_lock);
                if ((cp = cpu_get(id)) == NULL)
                        error = ESRCH;
                else {
                        if (cp->cpu_flags & CPU_QUIESCED)
                                error = poweroff_vcpu(cp);
                        else
                                error = EBUSY;
                }
                mutex_exit(&cpu_lock);
        } while (error == EBUSY);

        return (error);
}

/*
 * Add a new virtual cpu to the domain.
 */
static int
vcpu_config_new(processorid_t id)
{
        extern int start_cpu(processorid_t);
        int error;

        if (ncpus == 1) {
                printf("cannot (yet) add cpus to a single-cpu domain\n");
                return (ENOTSUP);
        }

        affinity_set(CPU_CURRENT);
        error = start_cpu(id);
        affinity_clear();
        return (error);
}

/*
 * Bring a powered-off VCPU back up in the hypervisor and mark it ready.
 */
static int
poweron_vcpu(struct cpu *cp)
{
        int error;

        ASSERT(MUTEX_HELD(&cpu_lock));

        if (HYPERVISOR_vcpu_op(VCPUOP_is_up, cp->cpu_id, NULL) != 0) {
                printf("poweron_vcpu: vcpu%d is not available!\n",
                    cp->cpu_id);
                return (ENXIO);
        }

        if ((error = xen_vcpu_up(cp->cpu_id)) == 0) {
                CPUSET_ADD(cpu_ready_set, cp->cpu_id);
                cp->cpu_flags |= CPU_EXISTS | CPU_READY | CPU_RUNNING;
                cp->cpu_flags &= ~CPU_POWEROFF;
                /*
                 * There are some nasty races possible here.
                 * Tell the vcpu it's up one more time.
                 * XXPV Is this enough? Is this safe?
                 */
                (void) xen_vcpu_up(cp->cpu_id);

                cpu_phase[cp->cpu_id] = CPU_PHASE_NONE;

                cpu_set_state(cp);
        }
        return (error);
}

/*
 * Handle an externally initiated ("online" in xenstore) power-on request:
 * power the VCPU back on (or create it, if it has never existed) and
 * bring it online.
 */
static int
vcpu_config_poweron(processorid_t id)
{
        cpu_t *cp;
        int oldstate;
        int error;

        if (id >= ncpus)
                return (vcpu_config_new(id));

        mutex_enter(&cpu_lock);

        if ((cp = cpu_get(id)) == NULL) {
                mutex_exit(&cpu_lock);
                return (ESRCH);
        }

        if (cpu_get_state(cp) != P_POWEROFF) {
                mutex_exit(&cpu_lock);
                return (0);
        }

        if ((error = poweron_vcpu(cp)) != 0) {
                mutex_exit(&cpu_lock);
                return (error);
        }

        mutex_exit(&cpu_lock);

        return (p_online_internal(id, P_ONLINE, &oldstate));
}

#define	REPORT_LEN	128

/*
 * Log the outcome of an externally initiated state change, along with a
 * short explanation of any error.
 */
static void
vcpu_config_report(processorid_t id, uint_t newstate, int error)
{
        char *report = kmem_alloc(REPORT_LEN, KM_SLEEP);
        size_t len;
        char *ps;

        switch (newstate) {
        case P_ONLINE:
                ps = PS_ONLINE;
                break;
        case P_POWEROFF:
                ps = PS_POWEROFF;
                break;
        default:
                cmn_err(CE_PANIC, "unknown state %u\n", newstate);
                break;
        }

        len = snprintf(report, REPORT_LEN,
            "cpu%d: externally initiated %s", id, ps);

        if (!error) {
                cmn_err(CE_CONT, "!%s\n", report);
                kmem_free(report, REPORT_LEN);
                return;
        }

        len += snprintf(report + len, REPORT_LEN - len,
            " failed, error %d: ", error);
        switch (error) {
        case EEXIST:
                len += snprintf(report + len, REPORT_LEN - len,
                    "cpu already %s", ps ? ps : "?");
                break;
        case ESRCH:
                len += snprintf(report + len, REPORT_LEN - len,
                    "cpu not found");
                break;
        case EINVAL:
        case EALREADY:
                break;
        case EPERM:
                len += snprintf(report + len, REPORT_LEN - len,
                    "insufficient privilege (0x%x)", id);
                break;
        case EBUSY:
                switch (newstate) {
                case P_ONLINE:
                        /*
                         * This return comes from mp_cpu_start -
                         * we cannot 'start' the boot CPU.
                         */
                        len += snprintf(report + len, REPORT_LEN - len,
                            "already running");
                        break;
                case P_POWEROFF:
                        len += snprintf(report + len, REPORT_LEN - len,
                            "bound lwps?");
                        break;
                default:
                        break;
                }
                /*FALLTHROUGH*/
        default:
                break;
        }

        cmn_err(CE_CONT, "%s\n", report);
        kmem_free(report, REPORT_LEN);
}

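/*
 * Taskq callback: read the target availability ("online" or "offline")
 * for a virtual CPU from xenstore and apply it.
 */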
static void
vcpu_config(void *arg)
{
        int id = (int)(uintptr_t)arg;
        int error;
        char dir[16];
        char *state;

        if ((uint_t)id >= max_ncpus) {
                cmn_err(CE_WARN,
                    "vcpu_config: cpu%d does not fit in this domain", id);
                return;
        }

        (void) snprintf(dir, sizeof (dir), "cpu/%d", id);
        state = kmem_alloc(MAXPATHLEN, KM_SLEEP);
        if (xenbus_scanf(XBT_NULL, dir, "availability", "%s", state) == 0) {
                if (strcmp(state, "online") == 0) {
                        error = vcpu_config_poweron(id);
                        vcpu_config_report(id, P_ONLINE, error);
                } else if (strcmp(state, "offline") == 0) {
                        error = vcpu_config_poweroff(id);
                        vcpu_config_report(id, P_POWEROFF, error);
                } else {
                        cmn_err(CE_WARN,
                            "cpu%d: unknown target state '%s'", id, state);
                }
        } else
                cmn_err(CE_WARN,
                    "cpu%d: unable to read target state from xenstore", id);

        kmem_free(state, MAXPATHLEN);
}

/*
 * Watch callback for changes under the "cpu" subtree in xenstore.
 */
/*ARGSUSED*/
static void
vcpu_config_event(struct xenbus_watch *watch, const char **vec, uint_t len)
{
        const char *path = vec[XS_WATCH_PATH];
        processorid_t id;
        char *s;

        if ((s = strstr(path, "cpu/")) != NULL &&
            sscanf(s, "cpu/%d", &id) == 1) {
                /*
                 * Run the virtual CPU configuration on a separate thread to
                 * avoid blocking on this event for too long (and for now,
                 * to ensure configuration requests are serialized.)
                 */
                (void) taskq_dispatch(cpu_config_tq,
                    vcpu_config, (void *)(uintptr_t)id, 0);
        }
}

/*
 * Initialize a VCPU's context in the hypervisor (VCPUOP_initialise),
 * translating the more interesting error returns into diagnostics.
 */
static int
xen_vcpu_initialize(processorid_t id, vcpu_guest_context_t *vgc)
{
        int err;

        if ((err = HYPERVISOR_vcpu_op(VCPUOP_initialise, id, vgc)) != 0) {
                char *str;
                int level = CE_WARN;

                switch (err) {
                case -X_EINVAL:
                        /*
                         * This interface squashes multiple error sources
                         * to one error code. In particular, an X_EINVAL
                         * code can mean:
                         *
                         * - the vcpu id is out of range
                         * - cs or ss are in ring 0
                         * - cr3 is wrong
                         * - an entry in the new gdt is above the
                         *   reserved entry
                         * - a frame underneath the new gdt is bad
                         */
                        str = "something is wrong :(";
                        break;
                case -X_ENOENT:
                        str = "no such cpu";
                        break;
                case -X_ENOMEM:
                        str = "no mem to copy ctxt";
                        break;
                case -X_EFAULT:
                        str = "bad address";
                        break;
                case -X_EEXIST:
                        /*
                         * Hmm. This error is returned if the vcpu has already
                         * been initialized once before in the lifetime of this
                         * domain. This is a logic error in the kernel.
                         */
                        level = CE_PANIC;
                        str = "already initialized";
                        break;
                default:
                        level = CE_PANIC;
                        str = "<unexpected>";
                        break;
                }

                cmn_err(level, "vcpu%d: failed to init: error %d: %s",
                    id, -err, str);
        }
        return (err);
}

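/*
 * Ask the hypervisor to bring the given VCPU up (VCPUOP_up). On failure
 * we print a diagnostic and return a distinctive (deliberately silly)
 * error code.
 */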
long
xen_vcpu_up(processorid_t id)
{
        long err;

        if ((err = HYPERVISOR_vcpu_op(VCPUOP_up, id, NULL)) != 0) {
                char *str;

                switch (err) {
                case -X_ENOENT:
                        str = "no such cpu";
                        break;
                case -X_EINVAL:
                        /*
                         * Perhaps this is diagnostic overkill.
                         */
                        if (HYPERVISOR_vcpu_op(VCPUOP_is_up, id, NULL) < 0)
                                str = "bad cpuid";
                        else
                                str = "not initialized";
                        break;
                default:
                        str = "<unexpected>";
                        break;
                }

                printf("vcpu%d: failed to start: error %d: %s\n",
                    id, -(int)err, str);
                return (EBFONT);        /* deliberately silly */
        }
        return (err);
}

/*
 * Take the given VCPU down in the hypervisor (VCPUOP_down); any failure
 * here indicates a serious bug, so we panic.
 */
long
xen_vcpu_down(processorid_t id)
{
        long err;

        if ((err = HYPERVISOR_vcpu_op(VCPUOP_down, id, NULL)) != 0) {
                /*
                 * X_ENOENT: no such cpu
                 * X_EINVAL: bad cpuid
                 */
                panic("vcpu%d: failed to stop: error %d", id, -(int)err);
        }

        return (err);
}