1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 22 /* 23 * Copyright 2009 Sun Microsystems, Inc. All rights reserved. 24 * Use is subject to license terms. 25 */ 26 27 /* 28 * Copyright 2019 Joyent, Inc. 29 */ 30 31 /* 32 * Virtual CPU management. 33 * 34 * VCPUs can be controlled in one of two ways; through the domain itself 35 * (psradm, p_online(), etc.), and via changes in xenstore (vcpu_config()). 36 * Unfortunately, the terminology is used in different ways; they work out as 37 * follows: 38 * 39 * P_ONLINE: the VCPU is up and running, taking interrupts and running threads 40 * 41 * P_OFFLINE: the VCPU is up and running, but quiesced (i.e. blocked in the 42 * hypervisor on the idle thread). It must be up since a downed VCPU cannot 43 * receive interrupts, and we require this for offline CPUs in Solaris. 44 * 45 * P_POWEROFF: the VCPU is down (we never called xen_vcpu_up(), or called 46 * xen_vcpu_down() for it). It can't take interrupts or run anything, though 47 * if it has run previously, its software state (cpu_t, machcpu structures, IPI 48 * event channels, etc.) will still exist. 49 * 50 * The hypervisor has two notions of CPU states as represented in the store: 51 * 52 * "offline": the VCPU is down. Corresponds to P_POWEROFF. 53 * 54 * "online": the VCPU is running. Corresponds to a CPU state other than 55 * P_POWEROFF. 56 * 57 * Currently, only a notification via xenstore can bring a CPU into a 58 * P_POWEROFF state, and only the domain can change between P_ONLINE, P_NOINTR, 59 * P_OFFLINE, etc. We need to be careful to treat xenstore notifications 60 * idempotently, as we'll get 'duplicate' entries when we resume a domain. 61 * 62 * Note that the xenstore configuration is strictly advisory, in that a domain 63 * can choose to ignore it and still power up a VCPU in the offline state. To 64 * play nice, we don't allow it. Thus, any attempt to power on/off a CPU is 65 * ENOTSUP from within Solaris. 66 * 67 * Powering off a VCPU and suspending the domain use similar code. The 68 * difficulty here is that we must ensure that each VCPU is in a stable 69 * state: it must have a saved PCB, and not be responding to interrupts 70 * (since we are just about to remove its ability to run on a real CPU, 71 * possibly forever). However, an offline CPU in Solaris can take 72 * cross-call interrupts, as mentioned, so we must go through a 73 * two-stage process. First, we use the standard Solaris pause_cpus(). 74 * This ensures that all CPUs are either in mach_cpu_pause() or 75 * mach_cpu_idle(), and nothing will cross-call them. 76 * 77 * Powered-off-CPUs are already safe, as we own the cpu_lock needed to 78 * bring them back up, and in state CPU_PHASE_POWERED_OFF. 79 * 80 * Running CPUs are spinning in mach_cpu_pause() waiting for either 81 * PAUSE_IDLE or CPU_PHASE_WAIT_SAFE. 82 * 83 * Offline CPUs are either running the idle thread and periodically 84 * checking for CPU_PHASE_WAIT_SAFE, or blocked in the hypervisor. 85 * 86 * Thus, we set CPU_PHASE_WAIT_SAFE for every powered-on CPU, as well as 87 * poking them to make sure they're not blocked[1]. When every CPU has 88 * responded by reaching a safe state and setting CPU_PHASE_SAFE, we 89 * know we can suspend, or power-off a CPU, without problems. 90 * 91 * [1] note that we have to repeatedly poke offline CPUs: it's the only 92 * way to ensure that the CPU doesn't miss the state change before 93 * dropping into HYPERVISOR_block(). 94 */ 95 96 #include <sys/types.h> 97 #include <sys/systm.h> 98 #include <sys/param.h> 99 #include <sys/taskq.h> 100 #include <sys/cmn_err.h> 101 #include <sys/archsystm.h> 102 #include <sys/machsystm.h> 103 #include <sys/segments.h> 104 #include <sys/cpuvar.h> 105 #include <sys/x86_archext.h> 106 #include <sys/controlregs.h> 107 #include <sys/hypervisor.h> 108 #include <sys/xpv_panic.h> 109 #include <sys/mman.h> 110 #include <sys/psw.h> 111 #include <sys/cpu.h> 112 #include <sys/sunddi.h> 113 #include <util/sscanf.h> 114 #include <vm/hat_i86.h> 115 #include <vm/hat.h> 116 #include <vm/as.h> 117 118 #include <xen/public/io/xs_wire.h> 119 #include <xen/sys/xenbus_impl.h> 120 #include <xen/public/vcpu.h> 121 122 extern cpuset_t cpu_ready_set; 123 124 #define CPU_PHASE_NONE 0 125 #define CPU_PHASE_WAIT_SAFE 1 126 #define CPU_PHASE_SAFE 2 127 #define CPU_PHASE_POWERED_OFF 3 128 129 /* 130 * We can only poke CPUs during barrier enter 256 times a second at 131 * most. 132 */ 133 #define POKE_TIMEOUT (NANOSEC / 256) 134 135 static taskq_t *cpu_config_tq; 136 static int cpu_phase[NCPU]; 137 138 static void vcpu_config_event(struct xenbus_watch *, const char **, uint_t); 139 static int xen_vcpu_initialize(processorid_t, vcpu_guest_context_t *); 140 141 /* 142 * Return whether or not the vcpu is actually running on a pcpu 143 */ 144 int 145 vcpu_on_pcpu(processorid_t cpu) 146 { 147 struct vcpu_runstate_info runstate; 148 int ret = VCPU_STATE_UNKNOWN; 149 150 ASSERT(cpu < NCPU); 151 /* 152 * Don't bother with hypercall if we are asking about ourself 153 */ 154 if (cpu == CPU->cpu_id) 155 return (VCPU_ON_PCPU); 156 if (HYPERVISOR_vcpu_op(VCPUOP_get_runstate_info, cpu, &runstate) != 0) 157 goto out; 158 159 switch (runstate.state) { 160 case RUNSTATE_running: 161 ret = VCPU_ON_PCPU; 162 break; 163 164 case RUNSTATE_runnable: 165 case RUNSTATE_offline: 166 case RUNSTATE_blocked: 167 ret = VCPU_NOT_ON_PCPU; 168 break; 169 170 default: 171 break; 172 } 173 174 out: 175 return (ret); 176 } 177 178 /* 179 * These routines allocate any global state that might be needed 180 * while starting cpus. For virtual cpus, there is no such state. 181 */ 182 int 183 mach_cpucontext_init(void) 184 { 185 return (0); 186 } 187 188 void 189 do_cpu_config_watch(int state) 190 { 191 static struct xenbus_watch cpu_config_watch; 192 193 if (state != XENSTORE_UP) 194 return; 195 cpu_config_watch.node = "cpu"; 196 cpu_config_watch.callback = vcpu_config_event; 197 if (register_xenbus_watch(&cpu_config_watch)) { 198 taskq_destroy(cpu_config_tq); 199 cmn_err(CE_WARN, "do_cpu_config_watch: " 200 "failed to set vcpu config watch"); 201 } 202 203 } 204 205 /* 206 * This routine is called after all the "normal" MP startup has 207 * been done; a good place to start watching xen store for virtual 208 * cpu hot plug events. 209 */ 210 void 211 mach_cpucontext_fini(void) 212 { 213 214 cpu_config_tq = taskq_create("vcpu config taskq", 1, 215 maxclsyspri - 1, 1, 1, TASKQ_PREPOPULATE); 216 217 (void) xs_register_xenbus_callback(do_cpu_config_watch); 218 } 219 220 /* 221 * Fill in the remaining CPU context and initialize it. 222 */ 223 static int 224 mp_set_cpu_context(vcpu_guest_context_t *vgc, cpu_t *cp) 225 { 226 uint_t vec, iopl; 227 228 vgc->flags = VGCF_IN_KERNEL; 229 230 /* 231 * fpu_ctx we leave as zero; on first fault we'll store 232 * sse_initial into it anyway. 233 */ 234 235 #if defined(__amd64) 236 vgc->user_regs.cs = KCS_SEL | SEL_KPL; /* force to ring 3 */ 237 #else 238 vgc->user_regs.cs = KCS_SEL; 239 #endif 240 vgc->user_regs.ds = KDS_SEL; 241 vgc->user_regs.es = KDS_SEL; 242 vgc->user_regs.ss = KDS_SEL; 243 vgc->kernel_ss = KDS_SEL; 244 245 /* 246 * Allow I/O privilege level for Dom0 kernel. 247 */ 248 if (DOMAIN_IS_INITDOMAIN(xen_info)) 249 iopl = (PS_IOPL & 0x1000); /* ring 1 */ 250 else 251 iopl = 0; 252 253 #if defined(__amd64) 254 vgc->user_regs.fs = 0; 255 vgc->user_regs.gs = 0; 256 vgc->user_regs.rflags = F_OFF | iopl; 257 #elif defined(__i386) 258 vgc->user_regs.fs = KFS_SEL; 259 vgc->user_regs.gs = KGS_SEL; 260 vgc->user_regs.eflags = F_OFF | iopl; 261 vgc->event_callback_cs = vgc->user_regs.cs; 262 vgc->failsafe_callback_cs = vgc->user_regs.cs; 263 #endif 264 265 /* 266 * Initialize the trap_info_t from the IDT 267 */ 268 #if !defined(__lint) 269 ASSERT(NIDT == sizeof (vgc->trap_ctxt) / sizeof (vgc->trap_ctxt[0])); 270 #endif 271 for (vec = 0; vec < NIDT; vec++) { 272 trap_info_t *ti = &vgc->trap_ctxt[vec]; 273 274 if (xen_idt_to_trap_info(vec, 275 &cp->cpu_m.mcpu_idt[vec], ti) == 0) { 276 ti->cs = KCS_SEL; 277 ti->vector = vec; 278 } 279 } 280 281 /* 282 * No LDT 283 */ 284 285 /* 286 * (We assert in various places that the GDT is (a) aligned on a 287 * page boundary and (b) one page long, so this really should fit..) 288 */ 289 #ifdef CRASH_XEN 290 vgc->gdt_frames[0] = pa_to_ma(mmu_btop(cp->cpu_m.mcpu_gdtpa)); 291 #else 292 vgc->gdt_frames[0] = pfn_to_mfn(mmu_btop(cp->cpu_m.mcpu_gdtpa)); 293 #endif 294 vgc->gdt_ents = NGDT; 295 296 vgc->ctrlreg[0] = CR0_ENABLE_FPU_FLAGS(getcr0()); 297 298 #if defined(__i386) 299 if (mmu.pae_hat) 300 vgc->ctrlreg[3] = 301 xen_pfn_to_cr3(pfn_to_mfn(kas.a_hat->hat_htable->ht_pfn)); 302 else 303 #endif 304 vgc->ctrlreg[3] = 305 pa_to_ma(mmu_ptob(kas.a_hat->hat_htable->ht_pfn)); 306 307 vgc->ctrlreg[4] = getcr4(); 308 309 vgc->event_callback_eip = (uintptr_t)xen_callback; 310 vgc->failsafe_callback_eip = (uintptr_t)xen_failsafe_callback; 311 vgc->flags |= VGCF_failsafe_disables_events; 312 313 #if defined(__amd64) 314 /* 315 * XXPV should this be moved to init_cpu_syscall? 316 */ 317 vgc->syscall_callback_eip = (uintptr_t)sys_syscall; 318 vgc->flags |= VGCF_syscall_disables_events; 319 320 ASSERT(vgc->user_regs.gs == 0); 321 vgc->gs_base_kernel = (uintptr_t)cp; 322 #endif 323 324 return (xen_vcpu_initialize(cp->cpu_id, vgc)); 325 } 326 327 /* 328 * Create a guest virtual cpu context so that the virtual cpu 329 * springs into life in the domain just about to call mp_startup() 330 * 331 * Virtual CPUs must be initialized once in the lifetime of the domain; 332 * after that subsequent attempts to start them will fail with X_EEXIST. 333 * 334 * Thus 'alloc' -really- creates and initializes the virtual 335 * CPU context just once. Once the initialisation succeeds, we never 336 * free it, nor the regular cpu_t to which it refers. 337 */ 338 void * 339 mach_cpucontext_alloc(struct cpu *cp) 340 { 341 kthread_t *tp = cp->cpu_thread; 342 vcpu_guest_context_t vgc; 343 344 int err = 1; 345 346 /* 347 * First, augment the incoming cpu structure 348 * - vcpu pointer reference 349 * - pending event storage area 350 * - physical address of GDT 351 */ 352 cp->cpu_m.mcpu_vcpu_info = 353 &HYPERVISOR_shared_info->vcpu_info[cp->cpu_id]; 354 cp->cpu_m.mcpu_evt_pend = kmem_zalloc( 355 sizeof (struct xen_evt_data), KM_SLEEP); 356 cp->cpu_m.mcpu_gdtpa = 357 mmu_ptob(hat_getpfnum(kas.a_hat, (caddr_t)cp->cpu_gdt)); 358 359 if ((err = xen_gdt_setprot(cp, PROT_READ)) != 0) 360 goto done; 361 362 /* 363 * Now set up the vcpu context so that we can start this vcpu 364 * in the kernel at tp->t_pc (mp_startup). Note that the 365 * thread will thread_exit() shortly after performing the 366 * initialization; in particular, we will *never* take a 367 * privilege transition on this thread. 368 */ 369 370 bzero(&vgc, sizeof (vgc)); 371 372 #ifdef __amd64 373 vgc.user_regs.rip = tp->t_pc; 374 vgc.user_regs.rsp = tp->t_sp; 375 vgc.user_regs.rbp = tp->t_sp - 2 * sizeof (greg_t); 376 #else 377 vgc.user_regs.eip = tp->t_pc; 378 vgc.user_regs.esp = tp->t_sp; 379 vgc.user_regs.ebp = tp->t_sp - 2 * sizeof (greg_t); 380 #endif 381 /* 382 * XXPV Fix resume, if Russ didn't already fix it. 383 * 384 * Note that resume unconditionally puts t->t_stk + sizeof (regs) 385 * into kernel_sp via HYPERVISOR_stack_switch. This anticipates 386 * that only lwps take traps that switch to the kernel stack; 387 * part of creating an lwp adjusts the stack by subtracting 388 * sizeof (struct regs) off t_stk. 389 * 390 * The more interesting question is, why do we do all the work 391 * of a fully fledged lwp for a plain thread? In particular 392 * we don't have to call HYPERVISOR_stack_switch for lwp-less threads 393 * or futz with the LDT. This should probably all be done with 394 * an lwp context operator to keep pure thread context switch fast. 395 */ 396 vgc.kernel_sp = (ulong_t)tp->t_stk; 397 398 err = mp_set_cpu_context(&vgc, cp); 399 400 done: 401 if (err) { 402 mach_cpucontext_free(cp, NULL, err); 403 return (NULL); 404 } 405 return (cp); 406 } 407 408 /* 409 * By the time we are called either we have successfully started 410 * the cpu, or our attempt to start it has failed. 411 */ 412 413 /*ARGSUSED*/ 414 void 415 mach_cpucontext_free(struct cpu *cp, void *arg, int err) 416 { 417 switch (err) { 418 case 0: 419 break; 420 case ETIMEDOUT: 421 /* 422 * The vcpu context is loaded into the hypervisor, and 423 * we've tried to start it, but the vcpu has not been set 424 * running yet, for whatever reason. We arrange to -not- 425 * free any data structures it may be referencing. In 426 * particular, we've already told the hypervisor about 427 * the GDT, and so we can't map it read-write again. 428 */ 429 break; 430 default: 431 (void) xen_gdt_setprot(cp, PROT_READ | PROT_WRITE); 432 kmem_free(cp->cpu_m.mcpu_evt_pend, 433 sizeof (struct xen_evt_data)); 434 break; 435 } 436 } 437 438 /* 439 * Reset this CPU's context. Clear out any pending evtchn data, since event 440 * channel numbers will all change when we resume. 441 */ 442 void 443 mach_cpucontext_reset(cpu_t *cp) 444 { 445 bzero(cp->cpu_m.mcpu_evt_pend, sizeof (struct xen_evt_data)); 446 /* mcpu_intr_pending ? */ 447 } 448 449 static void 450 pcb_to_user_regs(label_t *pcb, vcpu_guest_context_t *vgc) 451 { 452 #ifdef __amd64 453 vgc->user_regs.rip = pcb->val[REG_LABEL_PC]; 454 vgc->user_regs.rsp = pcb->val[REG_LABEL_SP]; 455 vgc->user_regs.rbp = pcb->val[REG_LABEL_BP]; 456 vgc->user_regs.rbx = pcb->val[REG_LABEL_RBX]; 457 vgc->user_regs.r12 = pcb->val[REG_LABEL_R12]; 458 vgc->user_regs.r13 = pcb->val[REG_LABEL_R13]; 459 vgc->user_regs.r14 = pcb->val[REG_LABEL_R14]; 460 vgc->user_regs.r15 = pcb->val[REG_LABEL_R15]; 461 #else /* __amd64 */ 462 vgc->user_regs.eip = pcb->val[REG_LABEL_PC]; 463 vgc->user_regs.esp = pcb->val[REG_LABEL_SP]; 464 vgc->user_regs.ebp = pcb->val[REG_LABEL_BP]; 465 vgc->user_regs.ebx = pcb->val[REG_LABEL_EBX]; 466 vgc->user_regs.esi = pcb->val[REG_LABEL_ESI]; 467 vgc->user_regs.edi = pcb->val[REG_LABEL_EDI]; 468 #endif /* __amd64 */ 469 } 470 471 /* 472 * Restore the context of a CPU during resume. This context is always 473 * inside enter_safe_phase(), below. 474 */ 475 void 476 mach_cpucontext_restore(cpu_t *cp) 477 { 478 vcpu_guest_context_t vgc; 479 int err; 480 481 ASSERT(cp->cpu_thread == cp->cpu_pause_thread || 482 cp->cpu_thread == cp->cpu_idle_thread); 483 484 bzero(&vgc, sizeof (vgc)); 485 486 pcb_to_user_regs(&cp->cpu_thread->t_pcb, &vgc); 487 488 /* 489 * We're emulating a longjmp() here: in particular, we need to bump the 490 * stack pointer to account for the pop of xIP that returning from 491 * longjmp() normally would do, and set the return value in xAX to 1. 492 */ 493 #ifdef __amd64 494 vgc.user_regs.rax = 1; 495 vgc.user_regs.rsp += sizeof (ulong_t); 496 #else 497 vgc.user_regs.eax = 1; 498 vgc.user_regs.esp += sizeof (ulong_t); 499 #endif 500 501 vgc.kernel_sp = cp->cpu_thread->t_sp; 502 503 err = mp_set_cpu_context(&vgc, cp); 504 505 ASSERT(err == 0); 506 } 507 508 /* 509 * Reach a point at which the CPU can be safely powered-off or 510 * suspended. Nothing can wake this CPU out of the loop. 511 */ 512 static void 513 enter_safe_phase(void) 514 { 515 ulong_t flags = intr_clear(); 516 517 if (setjmp(&curthread->t_pcb) == 0) { 518 cpu_phase[CPU->cpu_id] = CPU_PHASE_SAFE; 519 while (cpu_phase[CPU->cpu_id] == CPU_PHASE_SAFE) 520 SMT_PAUSE(); 521 } 522 523 ASSERT(!interrupts_enabled()); 524 525 intr_restore(flags); 526 } 527 528 /* 529 * Offline CPUs run this code even under a pause_cpus(), so we must 530 * check if we need to enter the safe phase. 531 */ 532 void 533 mach_cpu_idle(void) 534 { 535 if (IN_XPV_PANIC()) { 536 xpv_panic_halt(); 537 } else { 538 (void) HYPERVISOR_block(); 539 if (cpu_phase[CPU->cpu_id] == CPU_PHASE_WAIT_SAFE) 540 enter_safe_phase(); 541 } 542 } 543 544 /* 545 * Spin until either start_cpus() wakes us up, or we get a request to 546 * enter the safe phase (followed by a later start_cpus()). 547 */ 548 void 549 mach_cpu_pause(volatile char *safe) 550 { 551 *safe = PAUSE_WAIT; 552 membar_enter(); 553 554 while (*safe != PAUSE_IDLE) { 555 if (cpu_phase[CPU->cpu_id] == CPU_PHASE_WAIT_SAFE) 556 enter_safe_phase(); 557 SMT_PAUSE(); 558 } 559 } 560 561 int 562 mach_cpu_halt(xc_arg_t arg1, xc_arg_t arg2 __unused, xc_arg_t arg3 __unused) 563 { 564 char *msg = (char *)arg1; 565 566 if (msg) 567 prom_printf("%s\n", msg); 568 (void) xen_vcpu_down(CPU->cpu_id); 569 return (0); 570 } 571 572 /*ARGSUSED*/ 573 int 574 mp_cpu_poweron(struct cpu *cp) 575 { 576 return (ENOTSUP); 577 } 578 579 /*ARGSUSED*/ 580 int 581 mp_cpu_poweroff(struct cpu *cp) 582 { 583 return (ENOTSUP); 584 } 585 586 void 587 mp_enter_barrier(void) 588 { 589 hrtime_t last_poke_time = 0; 590 int poke_allowed = 0; 591 int done = 0; 592 int i; 593 594 ASSERT(MUTEX_HELD(&cpu_lock)); 595 596 pause_cpus(NULL, NULL); 597 598 while (!done) { 599 done = 1; 600 poke_allowed = 0; 601 602 if (xpv_gethrtime() - last_poke_time > POKE_TIMEOUT) { 603 last_poke_time = xpv_gethrtime(); 604 poke_allowed = 1; 605 } 606 607 for (i = 0; i < NCPU; i++) { 608 cpu_t *cp = cpu_get(i); 609 610 if (cp == NULL || cp == CPU) 611 continue; 612 613 switch (cpu_phase[i]) { 614 case CPU_PHASE_NONE: 615 cpu_phase[i] = CPU_PHASE_WAIT_SAFE; 616 poke_cpu(i); 617 done = 0; 618 break; 619 620 case CPU_PHASE_WAIT_SAFE: 621 if (poke_allowed) 622 poke_cpu(i); 623 done = 0; 624 break; 625 626 case CPU_PHASE_SAFE: 627 case CPU_PHASE_POWERED_OFF: 628 break; 629 } 630 } 631 632 SMT_PAUSE(); 633 } 634 } 635 636 void 637 mp_leave_barrier(void) 638 { 639 int i; 640 641 ASSERT(MUTEX_HELD(&cpu_lock)); 642 643 for (i = 0; i < NCPU; i++) { 644 cpu_t *cp = cpu_get(i); 645 646 if (cp == NULL || cp == CPU) 647 continue; 648 649 switch (cpu_phase[i]) { 650 /* 651 * If we see a CPU in one of these phases, something has 652 * gone badly wrong with the guarantees 653 * mp_enter_barrier() is supposed to provide. Rather 654 * than attempt to stumble along (and since we can't 655 * panic properly in this context), we tell the 656 * hypervisor we've crashed. 657 */ 658 case CPU_PHASE_NONE: 659 case CPU_PHASE_WAIT_SAFE: 660 (void) HYPERVISOR_shutdown(SHUTDOWN_crash); 661 break; 662 663 case CPU_PHASE_POWERED_OFF: 664 break; 665 666 case CPU_PHASE_SAFE: 667 cpu_phase[i] = CPU_PHASE_NONE; 668 } 669 } 670 671 start_cpus(); 672 } 673 674 static int 675 poweroff_vcpu(struct cpu *cp) 676 { 677 int error; 678 679 ASSERT(MUTEX_HELD(&cpu_lock)); 680 681 ASSERT(CPU->cpu_id != cp->cpu_id); 682 ASSERT(cp->cpu_flags & CPU_QUIESCED); 683 684 mp_enter_barrier(); 685 686 if ((error = xen_vcpu_down(cp->cpu_id)) == 0) { 687 ASSERT(cpu_phase[cp->cpu_id] == CPU_PHASE_SAFE); 688 689 CPUSET_DEL(cpu_ready_set, cp->cpu_id); 690 691 if (cp->cpu_flags & CPU_ENABLE) 692 ncpus_intr_enabled--; 693 694 cp->cpu_flags |= CPU_POWEROFF | CPU_OFFLINE; 695 cp->cpu_flags &= 696 ~(CPU_RUNNING | CPU_READY | CPU_EXISTS | CPU_ENABLE); 697 698 cpu_phase[cp->cpu_id] = CPU_PHASE_POWERED_OFF; 699 700 cpu_set_state(cp); 701 } 702 703 mp_leave_barrier(); 704 705 return (error); 706 } 707 708 static int 709 vcpu_config_poweroff(processorid_t id) 710 { 711 int oldstate; 712 int error; 713 cpu_t *cp; 714 715 mutex_enter(&cpu_lock); 716 717 if ((cp = cpu_get(id)) == NULL) { 718 mutex_exit(&cpu_lock); 719 return (ESRCH); 720 } 721 722 if (cpu_get_state(cp) == P_POWEROFF) { 723 mutex_exit(&cpu_lock); 724 return (0); 725 } 726 727 mutex_exit(&cpu_lock); 728 729 do { 730 error = p_online_internal(id, P_OFFLINE, 731 &oldstate); 732 733 if (error != 0) 734 break; 735 736 /* 737 * So we just changed it to P_OFFLINE. But then we dropped 738 * cpu_lock, so now it is possible for another thread to change 739 * the cpu back to a different, non-quiesced state e.g. 740 * P_ONLINE. 741 */ 742 mutex_enter(&cpu_lock); 743 if ((cp = cpu_get(id)) == NULL) 744 error = ESRCH; 745 else { 746 if (cp->cpu_flags & CPU_QUIESCED) 747 error = poweroff_vcpu(cp); 748 else 749 error = EBUSY; 750 } 751 mutex_exit(&cpu_lock); 752 } while (error == EBUSY); 753 754 return (error); 755 } 756 757 /* 758 * Add a new virtual cpu to the domain. 759 */ 760 static int 761 vcpu_config_new(processorid_t id) 762 { 763 extern int start_cpu(processorid_t); 764 int error; 765 766 if (ncpus == 1) { 767 printf("cannot (yet) add cpus to a single-cpu domain\n"); 768 return (ENOTSUP); 769 } 770 771 affinity_set(CPU_CURRENT); 772 error = start_cpu(id); 773 affinity_clear(); 774 return (error); 775 } 776 777 static int 778 poweron_vcpu(struct cpu *cp) 779 { 780 int error; 781 782 ASSERT(MUTEX_HELD(&cpu_lock)); 783 784 if (HYPERVISOR_vcpu_op(VCPUOP_is_up, cp->cpu_id, NULL) != 0) { 785 printf("poweron_vcpu: vcpu%d is not available!\n", 786 cp->cpu_id); 787 return (ENXIO); 788 } 789 790 if ((error = xen_vcpu_up(cp->cpu_id)) == 0) { 791 CPUSET_ADD(cpu_ready_set, cp->cpu_id); 792 cp->cpu_flags |= CPU_EXISTS | CPU_READY | CPU_RUNNING; 793 cp->cpu_flags &= ~CPU_POWEROFF; 794 /* 795 * There are some nasty races possible here. 796 * Tell the vcpu it's up one more time. 797 * XXPV Is this enough? Is this safe? 798 */ 799 (void) xen_vcpu_up(cp->cpu_id); 800 801 cpu_phase[cp->cpu_id] = CPU_PHASE_NONE; 802 803 cpu_set_state(cp); 804 } 805 return (error); 806 } 807 808 static int 809 vcpu_config_poweron(processorid_t id) 810 { 811 cpu_t *cp; 812 int oldstate; 813 int error; 814 815 if (id >= ncpus) 816 return (vcpu_config_new(id)); 817 818 mutex_enter(&cpu_lock); 819 820 if ((cp = cpu_get(id)) == NULL) { 821 mutex_exit(&cpu_lock); 822 return (ESRCH); 823 } 824 825 if (cpu_get_state(cp) != P_POWEROFF) { 826 mutex_exit(&cpu_lock); 827 return (0); 828 } 829 830 if ((error = poweron_vcpu(cp)) != 0) { 831 mutex_exit(&cpu_lock); 832 return (error); 833 } 834 835 mutex_exit(&cpu_lock); 836 837 return (p_online_internal(id, P_ONLINE, &oldstate)); 838 } 839 840 #define REPORT_LEN 128 841 842 static void 843 vcpu_config_report(processorid_t id, uint_t newstate, int error) 844 { 845 char *report = kmem_alloc(REPORT_LEN, KM_SLEEP); 846 size_t len; 847 char *ps; 848 849 switch (newstate) { 850 case P_ONLINE: 851 ps = PS_ONLINE; 852 break; 853 case P_POWEROFF: 854 ps = PS_POWEROFF; 855 break; 856 default: 857 cmn_err(CE_PANIC, "unknown state %u\n", newstate); 858 break; 859 } 860 861 len = snprintf(report, REPORT_LEN, 862 "cpu%d: externally initiated %s", id, ps); 863 864 if (!error) { 865 cmn_err(CE_CONT, "!%s\n", report); 866 kmem_free(report, REPORT_LEN); 867 return; 868 } 869 870 len += snprintf(report + len, REPORT_LEN - len, 871 " failed, error %d: ", error); 872 switch (error) { 873 case EEXIST: 874 len += snprintf(report + len, REPORT_LEN - len, 875 "cpu already %s", ps ? ps : "?"); 876 break; 877 case ESRCH: 878 len += snprintf(report + len, REPORT_LEN - len, 879 "cpu not found"); 880 break; 881 case EINVAL: 882 case EALREADY: 883 break; 884 case EPERM: 885 len += snprintf(report + len, REPORT_LEN - len, 886 "insufficient privilege (0x%x)", id); 887 break; 888 case EBUSY: 889 switch (newstate) { 890 case P_ONLINE: 891 /* 892 * This return comes from mp_cpu_start - 893 * we cannot 'start' the boot CPU. 894 */ 895 len += snprintf(report + len, REPORT_LEN - len, 896 "already running"); 897 break; 898 case P_POWEROFF: 899 len += snprintf(report + len, REPORT_LEN - len, 900 "bound lwps?"); 901 break; 902 default: 903 break; 904 } 905 default: 906 break; 907 } 908 909 cmn_err(CE_CONT, "%s\n", report); 910 kmem_free(report, REPORT_LEN); 911 } 912 913 static void 914 vcpu_config(void *arg) 915 { 916 int id = (int)(uintptr_t)arg; 917 int error; 918 char dir[16]; 919 char *state; 920 921 if ((uint_t)id >= max_ncpus) { 922 cmn_err(CE_WARN, 923 "vcpu_config: cpu%d does not fit in this domain", id); 924 return; 925 } 926 927 (void) snprintf(dir, sizeof (dir), "cpu/%d", id); 928 state = kmem_alloc(MAXPATHLEN, KM_SLEEP); 929 if (xenbus_scanf(XBT_NULL, dir, "availability", "%s", state) == 0) { 930 if (strcmp(state, "online") == 0) { 931 error = vcpu_config_poweron(id); 932 vcpu_config_report(id, P_ONLINE, error); 933 } else if (strcmp(state, "offline") == 0) { 934 error = vcpu_config_poweroff(id); 935 vcpu_config_report(id, P_POWEROFF, error); 936 } else { 937 cmn_err(CE_WARN, 938 "cpu%d: unknown target state '%s'", id, state); 939 } 940 } else 941 cmn_err(CE_WARN, 942 "cpu%d: unable to read target state from xenstore", id); 943 944 kmem_free(state, MAXPATHLEN); 945 } 946 947 /*ARGSUSED*/ 948 static void 949 vcpu_config_event(struct xenbus_watch *watch, const char **vec, uint_t len) 950 { 951 const char *path = vec[XS_WATCH_PATH]; 952 processorid_t id; 953 char *s; 954 955 if ((s = strstr(path, "cpu/")) != NULL && 956 sscanf(s, "cpu/%d", &id) == 1) { 957 /* 958 * Run the virtual CPU configuration on a separate thread to 959 * avoid blocking on this event for too long (and for now, 960 * to ensure configuration requests are serialized.) 961 */ 962 (void) taskq_dispatch(cpu_config_tq, 963 vcpu_config, (void *)(uintptr_t)id, 0); 964 } 965 } 966 967 static int 968 xen_vcpu_initialize(processorid_t id, vcpu_guest_context_t *vgc) 969 { 970 int err; 971 972 if ((err = HYPERVISOR_vcpu_op(VCPUOP_initialise, id, vgc)) != 0) { 973 char *str; 974 int level = CE_WARN; 975 976 switch (err) { 977 case -X_EINVAL: 978 /* 979 * This interface squashes multiple error sources 980 * to one error code. In particular, an X_EINVAL 981 * code can mean: 982 * 983 * - the vcpu id is out of range 984 * - cs or ss are in ring 0 985 * - cr3 is wrong 986 * - an entry in the new gdt is above the 987 * reserved entry 988 * - a frame underneath the new gdt is bad 989 */ 990 str = "something is wrong :("; 991 break; 992 case -X_ENOENT: 993 str = "no such cpu"; 994 break; 995 case -X_ENOMEM: 996 str = "no mem to copy ctxt"; 997 break; 998 case -X_EFAULT: 999 str = "bad address"; 1000 break; 1001 case -X_EEXIST: 1002 /* 1003 * Hmm. This error is returned if the vcpu has already 1004 * been initialized once before in the lifetime of this 1005 * domain. This is a logic error in the kernel. 1006 */ 1007 level = CE_PANIC; 1008 str = "already initialized"; 1009 break; 1010 default: 1011 level = CE_PANIC; 1012 str = "<unexpected>"; 1013 break; 1014 } 1015 1016 cmn_err(level, "vcpu%d: failed to init: error %d: %s", 1017 id, -err, str); 1018 } 1019 return (err); 1020 } 1021 1022 long 1023 xen_vcpu_up(processorid_t id) 1024 { 1025 long err; 1026 1027 if ((err = HYPERVISOR_vcpu_op(VCPUOP_up, id, NULL)) != 0) { 1028 char *str; 1029 1030 switch (err) { 1031 case -X_ENOENT: 1032 str = "no such cpu"; 1033 break; 1034 case -X_EINVAL: 1035 /* 1036 * Perhaps this is diagnostic overkill. 1037 */ 1038 if (HYPERVISOR_vcpu_op(VCPUOP_is_up, id, NULL) < 0) 1039 str = "bad cpuid"; 1040 else 1041 str = "not initialized"; 1042 break; 1043 default: 1044 str = "<unexpected>"; 1045 break; 1046 } 1047 1048 printf("vcpu%d: failed to start: error %d: %s\n", 1049 id, -(int)err, str); 1050 return (EBFONT); /* deliberately silly */ 1051 } 1052 return (err); 1053 } 1054 1055 long 1056 xen_vcpu_down(processorid_t id) 1057 { 1058 long err; 1059 1060 if ((err = HYPERVISOR_vcpu_op(VCPUOP_down, id, NULL)) != 0) { 1061 /* 1062 * X_ENOENT: no such cpu 1063 * X_EINVAL: bad cpuid 1064 */ 1065 panic("vcpu%d: failed to stop: error %d", id, -(int)err); 1066 } 1067 1068 return (err); 1069 } 1070