/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

/*
 * Virtual CPU management.
 *
 * VCPUs can be controlled in one of two ways: through the domain itself
 * (psradm, p_online(), etc.), or via changes in xenstore (vcpu_config()).
 * Unfortunately, the terminology is used in different ways; the states work
 * out as follows:
 *
 * P_ONLINE: the VCPU is up and running, taking interrupts and running threads.
 *
 * P_OFFLINE: the VCPU is up and running, but quiesced (i.e. blocked in the
 * hypervisor on the idle thread).  It must be up, since a downed VCPU cannot
 * receive interrupts, and Solaris requires that its offline CPUs can still
 * take interrupts.
 *
 * P_POWEROFF: the VCPU is down (we never called xen_vcpu_up(), or called
 * xen_vcpu_down() for it).  It can't take interrupts or run anything, though
 * if it has run previously, its software state (cpu_t, machcpu structures, IPI
 * event channels, etc.) will still exist.
 *
 * The hypervisor has two notions of CPU states as represented in the store:
 *
 * "offline": the VCPU is down.  Corresponds to P_POWEROFF.
 *
 * "online": the VCPU is running.  Corresponds to a CPU state other than
 * P_POWEROFF.
 *
 * Currently, only a notification via xenstore can bring a CPU into a
 * P_POWEROFF state, and only the domain can change between P_ONLINE, P_NOINTR,
 * P_OFFLINE, etc.  We need to be careful to treat xenstore notifications
 * idempotently, as we'll get 'duplicate' entries when we resume a domain.
 *
 * Note that the xenstore configuration is strictly advisory, in that a domain
 * can choose to ignore it and still power up a VCPU in the offline state.  To
 * play nice, we don't allow it.  Thus, any attempt to power a CPU on or off
 * from within Solaris fails with ENOTSUP.
 *
 * Powering off a VCPU and suspending the domain use similar code.  The
 * difficulty here is that we must ensure that each VCPU is in a stable
 * state: it must have a saved PCB, and not be responding to interrupts
 * (since we are just about to remove its ability to run on a real CPU,
 * possibly forever).  However, an offline CPU in Solaris can take
 * cross-call interrupts, as mentioned, so we must go through a
 * two-stage process.  First, we use the standard Solaris pause_cpus().
 * This ensures that all CPUs are either in mach_cpu_pause() or
 * mach_cpu_idle(), and nothing will cross-call them.
 *
 * Powered-off CPUs are already safe, as we own the cpu_lock needed to
 * bring them back up, and they are in state CPU_PHASE_POWERED_OFF.
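 * (A CPU only enters CPU_PHASE_POWERED_OFF via poweroff_vcpu(), which runs
 * under cpu_lock, and only poweron_vcpu() moves it back out of that phase.)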
 *
 * Running CPUs are spinning in mach_cpu_pause() waiting for either
 * PAUSE_IDLE or CPU_PHASE_WAIT_SAFE.
 *
 * Offline CPUs are either running the idle thread and periodically
 * checking for CPU_PHASE_WAIT_SAFE, or blocked in the hypervisor.
 *
 * Thus, we set CPU_PHASE_WAIT_SAFE for every powered-on CPU, as well as
 * poking them to make sure they're not blocked[1].  When every CPU has
 * responded by reaching a safe state and setting CPU_PHASE_SAFE, we
 * know we can suspend, or power off a CPU, without problems.
 *
 * [1] Note that we have to repeatedly poke offline CPUs: it's the only
 * way to ensure that the CPU doesn't miss the state change before
 * dropping into HYPERVISOR_block().
 */

#pragma ident	"%Z%%M%	%I%	%E% SMI"

#include <sys/types.h>
#include <sys/systm.h>
#include <sys/param.h>
#include <sys/taskq.h>
#include <sys/cmn_err.h>
#include <sys/archsystm.h>
#include <sys/machsystm.h>
#include <sys/segments.h>
#include <sys/cpuvar.h>
#include <sys/x86_archext.h>
#include <sys/controlregs.h>
#include <sys/hypervisor.h>
#include <sys/xpv_panic.h>
#include <sys/mman.h>
#include <sys/psw.h>
#include <sys/cpu.h>
#include <sys/sunddi.h>
#include <util/sscanf.h>
#include <vm/hat_i86.h>
#include <vm/hat.h>
#include <vm/as.h>

#include <xen/public/io/xs_wire.h>
#include <xen/sys/xenbus_impl.h>
#include <xen/public/vcpu.h>

#define	CPU_PHASE_NONE		0
#define	CPU_PHASE_WAIT_SAFE	1
#define	CPU_PHASE_SAFE		2
#define	CPU_PHASE_POWERED_OFF	3

/*
 * During barrier entry, we poke CPUs at most 256 times a second.
 */
#define	POKE_TIMEOUT	(NANOSEC / 256)

static taskq_t *cpu_config_tq;
static int cpu_phase[NCPU];

static void vcpu_config_event(struct xenbus_watch *, const char **, uint_t);
static int xen_vcpu_initialize(processorid_t, vcpu_guest_context_t *);

/*
 * These routines allocate any global state that might be needed
 * while starting cpus.  For virtual cpus, there is no such state.
 */
int
mach_cpucontext_init(void)
{
	return (0);
}

void
do_cpu_config_watch(int state)
{
	static struct xenbus_watch cpu_config_watch;

	if (state != XENSTORE_UP)
		return;
	cpu_config_watch.node = "cpu";
	cpu_config_watch.callback = vcpu_config_event;
	if (register_xenbus_watch(&cpu_config_watch)) {
		taskq_destroy(cpu_config_tq);
		cmn_err(CE_WARN, "do_cpu_config_watch: "
		    "failed to set vcpu config watch");
	}

}

/*
 * This routine is called after all the "normal" MP startup has
 * been done; it's a good place to start watching the xen store for
 * virtual cpu hotplug events.
 */
void
mach_cpucontext_fini(void)
{

	cpu_config_tq = taskq_create("vcpu config taskq", 1,
	    maxclsyspri - 1, 1, 1, TASKQ_PREPOPULATE);

	(void) xs_register_xenbus_callback(do_cpu_config_watch);
}

/*
 * Fill in the remaining CPU context and initialize it.
 */
static int
mp_set_cpu_context(vcpu_guest_context_t *vgc, cpu_t *cp)
{
	uint_t vec, iopl;

	vgc->flags = VGCF_IN_KERNEL;

	/*
	 * We leave fpu_ctx as zero; on the first fault we'll store
	 * sse_initial into it anyway.
	 */

#if defined(__amd64)
	vgc->user_regs.cs = KCS_SEL | SEL_KPL;	/* force to ring 3 */
#else
	vgc->user_regs.cs = KCS_SEL;
#endif
	vgc->user_regs.ds = KDS_SEL;
	vgc->user_regs.es = KDS_SEL;
	vgc->user_regs.ss = KDS_SEL;
	vgc->kernel_ss = KDS_SEL;

	/*
	 * Allow I/O privilege level for Dom0 kernel.
	 */
	if (DOMAIN_IS_INITDOMAIN(xen_info))
		iopl = (PS_IOPL & 0x1000);	/* ring 1 */
	else
		iopl = 0;

#if defined(__amd64)
	vgc->user_regs.fs = 0;
	vgc->user_regs.gs = 0;
	vgc->user_regs.rflags = F_OFF | iopl;
#elif defined(__i386)
	vgc->user_regs.fs = KFS_SEL;
	vgc->user_regs.gs = KGS_SEL;
	vgc->user_regs.eflags = F_OFF | iopl;
	vgc->event_callback_cs = vgc->user_regs.cs;
	vgc->failsafe_callback_cs = vgc->user_regs.cs;
#endif

	/*
	 * Initialize the trap_info_t from the IDT
	 */
#if !defined(__lint)
	ASSERT(NIDT == sizeof (vgc->trap_ctxt) / sizeof (vgc->trap_ctxt[0]));
#endif
	for (vec = 0; vec < NIDT; vec++) {
		trap_info_t *ti = &vgc->trap_ctxt[vec];

		if (xen_idt_to_trap_info(vec,
		    &cp->cpu_m.mcpu_idt[vec], ti) == 0) {
			ti->cs = KCS_SEL;
			ti->vector = vec;
		}
	}

	/*
	 * No LDT
	 */

	/*
	 * (We assert in various places that the GDT is (a) aligned on a
	 * page boundary and (b) one page long, so this really should fit..)
	 */
#ifdef CRASH_XEN
	vgc->gdt_frames[0] = pa_to_ma(mmu_btop(cp->cpu_m.mcpu_gdtpa));
#else
	vgc->gdt_frames[0] = pfn_to_mfn(mmu_btop(cp->cpu_m.mcpu_gdtpa));
#endif
	vgc->gdt_ents = NGDT;

	vgc->ctrlreg[0] = CR0_ENABLE_FPU_FLAGS(getcr0());

#if defined(__i386)
	if (mmu.pae_hat)
		vgc->ctrlreg[3] =
		    xen_pfn_to_cr3(pfn_to_mfn(kas.a_hat->hat_htable->ht_pfn));
	else
#endif
		vgc->ctrlreg[3] =
		    pa_to_ma(mmu_ptob(kas.a_hat->hat_htable->ht_pfn));

	vgc->ctrlreg[4] = getcr4();

	vgc->event_callback_eip = (uintptr_t)xen_callback;
	vgc->failsafe_callback_eip = (uintptr_t)xen_failsafe_callback;
	vgc->flags |= VGCF_failsafe_disables_events;

#if defined(__amd64)
	/*
	 * XXPV should this be moved to init_cpu_syscall?
	 */
	vgc->syscall_callback_eip = (uintptr_t)sys_syscall;
	vgc->flags |= VGCF_syscall_disables_events;

	ASSERT(vgc->user_regs.gs == 0);
	vgc->gs_base_kernel = (uintptr_t)cp;
#endif

	return (xen_vcpu_initialize(cp->cpu_id, vgc));
}

/*
 * Create a guest virtual cpu context so that the virtual cpu
 * springs into life in the domain just about to call mp_startup().
 *
 * Virtual CPUs must be initialized once in the lifetime of the domain;
 * after that subsequent attempts to start them will fail with X_EEXIST.
 *
 * Thus 'alloc' -really- creates and initializes the virtual
 * CPU context just once.  Once the initialisation succeeds, we never
 * free it, nor the regular cpu_t to which it refers.
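 *
 * (After that first initialization, powering the VCPU down and back up is
 * done via xen_vcpu_down()/xen_vcpu_up(); we never issue VCPUOP_initialise
 * for the same VCPU twice, since it would fail with X_EEXIST.)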
 */
void *
mach_cpucontext_alloc(struct cpu *cp)
{
	kthread_t *tp = cp->cpu_thread;
	vcpu_guest_context_t vgc;

	int err = 1;

	/*
	 * First, augment the incoming cpu structure
	 * - vcpu pointer reference
	 * - pending event storage area
	 * - physical address of GDT
	 */
	cp->cpu_m.mcpu_vcpu_info =
	    &HYPERVISOR_shared_info->vcpu_info[cp->cpu_id];
	cp->cpu_m.mcpu_evt_pend = kmem_zalloc(
	    sizeof (struct xen_evt_data), KM_SLEEP);
	cp->cpu_m.mcpu_gdtpa =
	    mmu_ptob(hat_getpfnum(kas.a_hat, (caddr_t)cp->cpu_gdt));

	if ((err = xen_gdt_setprot(cp, PROT_READ)) != 0)
		goto done;

	/*
	 * Now set up the vcpu context so that we can start this vcpu
	 * in the kernel at tp->t_pc (mp_startup).  Note that the
	 * thread will thread_exit() shortly after performing the
	 * initialization; in particular, we will *never* take a
	 * privilege transition on this thread.
	 */

	bzero(&vgc, sizeof (vgc));

#ifdef __amd64
	vgc.user_regs.rip = tp->t_pc;
	vgc.user_regs.rsp = tp->t_sp;
	vgc.user_regs.rbp = tp->t_sp - 2 * sizeof (greg_t);
#else
	vgc.user_regs.eip = tp->t_pc;
	vgc.user_regs.esp = tp->t_sp;
	vgc.user_regs.ebp = tp->t_sp - 2 * sizeof (greg_t);
#endif
	/*
	 * XXPV	Fix resume, if Russ didn't already fix it.
	 *
	 * Note that resume unconditionally puts t->t_stk + sizeof (regs)
	 * into kernel_sp via HYPERVISOR_stack_switch. This anticipates
	 * that only lwps take traps that switch to the kernel stack;
	 * part of creating an lwp adjusts the stack by subtracting
	 * sizeof (struct regs) off t_stk.
	 *
	 * The more interesting question is, why do we do all the work
	 * of a fully fledged lwp for a plain thread?  In particular
	 * we don't have to call HYPERVISOR_stack_switch for lwp-less threads
	 * or futz with the LDT.  This should probably all be done with
	 * an lwp context operator to keep pure thread context switch fast.
	 */
	vgc.kernel_sp = (ulong_t)tp->t_stk;

	err = mp_set_cpu_context(&vgc, cp);

done:
	if (err) {
		mach_cpucontext_free(cp, NULL, err);
		return (NULL);
	}
	return (cp);
}

/*
 * By the time we are called either we have successfully started
 * the cpu, or our attempt to start it has failed.
 */

/*ARGSUSED*/
void
mach_cpucontext_free(struct cpu *cp, void *arg, int err)
{
	switch (err) {
	case 0:
		break;
	case ETIMEDOUT:
		/*
		 * The vcpu context is loaded into the hypervisor, and
		 * we've tried to start it, but the vcpu has not been set
		 * running yet, for whatever reason.  We arrange to -not-
		 * free any data structures it may be referencing.  In
		 * particular, we've already told the hypervisor about
		 * the GDT, and so we can't map it read-write again.
		 */
		break;
	default:
		(void) xen_gdt_setprot(cp, PROT_READ | PROT_WRITE);
		kmem_free(cp->cpu_m.mcpu_evt_pend,
		    sizeof (struct xen_evt_data));
		break;
	}
}

/*
 * Reset this CPU's context.  Clear out any pending evtchn data, since event
 * channel numbers will all change when we resume.
 */
void
mach_cpucontext_reset(cpu_t *cp)
{
	bzero(cp->cpu_m.mcpu_evt_pend, sizeof (struct xen_evt_data));
	/* mcpu_intr_pending ? */
}

static void
pcb_to_user_regs(label_t *pcb, vcpu_guest_context_t *vgc)
{
#ifdef __amd64
	vgc->user_regs.rip = pcb->val[REG_LABEL_PC];
	vgc->user_regs.rsp = pcb->val[REG_LABEL_SP];
	vgc->user_regs.rbp = pcb->val[REG_LABEL_BP];
	vgc->user_regs.rbx = pcb->val[REG_LABEL_RBX];
	vgc->user_regs.r12 = pcb->val[REG_LABEL_R12];
	vgc->user_regs.r13 = pcb->val[REG_LABEL_R13];
	vgc->user_regs.r14 = pcb->val[REG_LABEL_R14];
	vgc->user_regs.r15 = pcb->val[REG_LABEL_R15];
#else /* __amd64 */
	vgc->user_regs.eip = pcb->val[REG_LABEL_PC];
	vgc->user_regs.esp = pcb->val[REG_LABEL_SP];
	vgc->user_regs.ebp = pcb->val[REG_LABEL_BP];
	vgc->user_regs.ebx = pcb->val[REG_LABEL_EBX];
	vgc->user_regs.esi = pcb->val[REG_LABEL_ESI];
	vgc->user_regs.edi = pcb->val[REG_LABEL_EDI];
#endif /* __amd64 */
}

/*
 * Restore the context of a CPU during resume.  This context is always
 * inside enter_safe_phase(), below.
 */
void
mach_cpucontext_restore(cpu_t *cp)
{
	vcpu_guest_context_t vgc;
	int err;

	ASSERT(cp->cpu_thread == cp->cpu_pause_thread ||
	    cp->cpu_thread == cp->cpu_idle_thread);

	bzero(&vgc, sizeof (vgc));

	pcb_to_user_regs(&cp->cpu_thread->t_pcb, &vgc);

	/*
	 * We're emulating a longjmp() here: in particular, we need to bump the
	 * stack pointer to account for the pop of xIP that returning from
	 * longjmp() normally would do, and set the return value in xAX to 1.
	 */
#ifdef __amd64
	vgc.user_regs.rax = 1;
	vgc.user_regs.rsp += sizeof (ulong_t);
#else
	vgc.user_regs.eax = 1;
	vgc.user_regs.esp += sizeof (ulong_t);
#endif

	vgc.kernel_sp = cp->cpu_thread->t_sp;

	err = mp_set_cpu_context(&vgc, cp);

	ASSERT(err == 0);
}

/*
 * Reach a point at which the CPU can be safely powered-off or
 * suspended.  Nothing can wake this CPU out of the loop.
 */
static void
enter_safe_phase(void)
{
	ulong_t flags = intr_clear();

	if (setjmp(&curthread->t_pcb) == 0) {
		cpu_phase[CPU->cpu_id] = CPU_PHASE_SAFE;
		while (cpu_phase[CPU->cpu_id] == CPU_PHASE_SAFE)
			SMT_PAUSE();
	}

	ASSERT(!interrupts_enabled());

	intr_restore(flags);
}

/*
 * Offline CPUs run this code even under a pause_cpus(), so we must
 * check if we need to enter the safe phase.
 */
void
mach_cpu_idle(void)
{
	if (IN_XPV_PANIC()) {
		xpv_panic_halt();
	} else {
		(void) HYPERVISOR_block();
		if (cpu_phase[CPU->cpu_id] == CPU_PHASE_WAIT_SAFE)
			enter_safe_phase();
	}
}

/*
 * Spin until either start_cpus() wakes us up, or we get a request to
 * enter the safe phase (followed by a later start_cpus()).
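 *
 * (This is the paused-CPU half of the barrier described at the top of this
 * file: mp_enter_barrier() sets CPU_PHASE_WAIT_SAFE, we park in
 * enter_safe_phase(), and mp_leave_barrier()/start_cpus() release us.)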
 */
void
mach_cpu_pause(volatile char *safe)
{
	*safe = PAUSE_WAIT;
	membar_enter();

	while (*safe != PAUSE_IDLE) {
		if (cpu_phase[CPU->cpu_id] == CPU_PHASE_WAIT_SAFE)
			enter_safe_phase();
		SMT_PAUSE();
	}
}

void
mach_cpu_halt(char *msg)
{
	if (msg)
		prom_printf("%s\n", msg);
	(void) xen_vcpu_down(CPU->cpu_id);
}

/*ARGSUSED*/
int
mp_cpu_poweron(struct cpu *cp)
{
	return (ENOTSUP);
}

/*ARGSUSED*/
int
mp_cpu_poweroff(struct cpu *cp)
{
	return (ENOTSUP);
}

void
mp_enter_barrier(void)
{
	hrtime_t last_poke_time = 0;
	int poke_allowed = 0;
	int done = 0;
	int i;

	ASSERT(MUTEX_HELD(&cpu_lock));

	pause_cpus(NULL);

	while (!done) {
		done = 1;
		poke_allowed = 0;

		if (xpv_gethrtime() - last_poke_time > POKE_TIMEOUT) {
			last_poke_time = xpv_gethrtime();
			poke_allowed = 1;
		}

		for (i = 0; i < NCPU; i++) {
			cpu_t *cp = cpu_get(i);

			if (cp == NULL || cp == CPU)
				continue;

			switch (cpu_phase[i]) {
			case CPU_PHASE_NONE:
				cpu_phase[i] = CPU_PHASE_WAIT_SAFE;
				poke_cpu(i);
				done = 0;
				break;

			case CPU_PHASE_WAIT_SAFE:
				if (poke_allowed)
					poke_cpu(i);
				done = 0;
				break;

			case CPU_PHASE_SAFE:
			case CPU_PHASE_POWERED_OFF:
				break;
			}
		}

		SMT_PAUSE();
	}
}

void
mp_leave_barrier(void)
{
	int i;

	ASSERT(MUTEX_HELD(&cpu_lock));

	for (i = 0; i < NCPU; i++) {
		cpu_t *cp = cpu_get(i);

		if (cp == NULL || cp == CPU)
			continue;

		switch (cpu_phase[i]) {
		/*
		 * If we see a CPU in one of these phases, something has
		 * gone badly wrong with the guarantees
		 * mp_enter_barrier() is supposed to provide.  Rather
		 * than attempt to stumble along (and since we can't
		 * panic properly in this context), we tell the
		 * hypervisor we've crashed.
		 */
		case CPU_PHASE_NONE:
		case CPU_PHASE_WAIT_SAFE:
			(void) HYPERVISOR_shutdown(SHUTDOWN_crash);
			break;

		case CPU_PHASE_POWERED_OFF:
			break;

		case CPU_PHASE_SAFE:
			cpu_phase[i] = CPU_PHASE_NONE;
		}
	}

	start_cpus();
}

static int
poweroff_vcpu(struct cpu *cp)
{
	int error;

	ASSERT(MUTEX_HELD(&cpu_lock));

	ASSERT(CPU->cpu_id != cp->cpu_id);
	ASSERT(cp->cpu_flags & CPU_QUIESCED);

	mp_enter_barrier();

	if ((error = xen_vcpu_down(cp->cpu_id)) == 0) {
		ASSERT(cpu_phase[cp->cpu_id] == CPU_PHASE_SAFE);

		CPUSET_DEL(cpu_ready_set, cp->cpu_id);

		cp->cpu_flags |= CPU_POWEROFF | CPU_OFFLINE;
		cp->cpu_flags &=
		    ~(CPU_RUNNING | CPU_READY | CPU_EXISTS | CPU_ENABLE);

		cpu_phase[cp->cpu_id] = CPU_PHASE_POWERED_OFF;

		cpu_set_state(cp);
	}

	mp_leave_barrier();

	return (error);
}

static int
vcpu_config_poweroff(processorid_t id)
{
	int oldstate;
	int error;
	cpu_t *cp;

	mutex_enter(&cpu_lock);

	if ((cp = cpu_get(id)) == NULL) {
		mutex_exit(&cpu_lock);
		return (ESRCH);
	}

	if (cpu_get_state(cp) == P_POWEROFF) {
		mutex_exit(&cpu_lock);
		return (0);
	}

	mutex_exit(&cpu_lock);

	do {
		error = p_online_internal(id, P_OFFLINE,
		    &oldstate);

		if (error != 0)
			break;

		/*
		 * So we just changed it to P_OFFLINE.  But then we dropped
		 * cpu_lock, so now it is possible for another thread to change
		 * the cpu back to a different, non-quiesced state e.g.
		 * P_ONLINE.
		 */
		mutex_enter(&cpu_lock);
		if ((cp = cpu_get(id)) == NULL)
			error = ESRCH;
		else {
			if (cp->cpu_flags & CPU_QUIESCED)
				error = poweroff_vcpu(cp);
			else
				error = EBUSY;
		}
		mutex_exit(&cpu_lock);
	} while (error == EBUSY);

	return (error);
}

/*
 * Add a new virtual cpu to the domain.
 */
static int
vcpu_config_new(processorid_t id)
{
	extern int start_cpu(processorid_t);
	int error;

	if (ncpus == 1) {
		printf("cannot (yet) add cpus to a single-cpu domain\n");
		return (ENOTSUP);
	}

	affinity_set(CPU_CURRENT);
	error = start_cpu(id);
	affinity_clear();
	return (error);
}

static int
poweron_vcpu(struct cpu *cp)
{
	int error;

	ASSERT(MUTEX_HELD(&cpu_lock));

	if (HYPERVISOR_vcpu_op(VCPUOP_is_up, cp->cpu_id, NULL) != 0) {
		printf("poweron_vcpu: vcpu%d is not available!\n",
		    cp->cpu_id);
		return (ENXIO);
	}

	if ((error = xen_vcpu_up(cp->cpu_id)) == 0) {
		CPUSET_ADD(cpu_ready_set, cp->cpu_id);
		cp->cpu_flags |= CPU_EXISTS | CPU_READY | CPU_RUNNING;
		cp->cpu_flags &= ~CPU_POWEROFF;
		/*
		 * There are some nasty races possible here.
		 * Tell the vcpu it's up one more time.
		 * XXPV	Is this enough?  Is this safe?
		 */
		(void) xen_vcpu_up(cp->cpu_id);

		cpu_phase[cp->cpu_id] = CPU_PHASE_NONE;

		cpu_set_state(cp);
	}
	return (error);
}

static int
vcpu_config_poweron(processorid_t id)
{
	cpu_t *cp;
	int oldstate;
	int error;

	if (id >= ncpus)
		return (vcpu_config_new(id));

	mutex_enter(&cpu_lock);

	if ((cp = cpu_get(id)) == NULL) {
		mutex_exit(&cpu_lock);
		return (ESRCH);
	}

	if (cpu_get_state(cp) != P_POWEROFF) {
		mutex_exit(&cpu_lock);
		return (0);
	}

	if ((error = poweron_vcpu(cp)) != 0) {
		mutex_exit(&cpu_lock);
		return (error);
	}

	mutex_exit(&cpu_lock);

	return (p_online_internal(id, P_ONLINE, &oldstate));
}

#define	REPORT_LEN	128

static void
vcpu_config_report(processorid_t id, uint_t newstate, int error)
{
	char *report = kmem_alloc(REPORT_LEN, KM_SLEEP);
	size_t len;
	char *ps;

	switch (newstate) {
	case P_ONLINE:
		ps = PS_ONLINE;
		break;
	case P_POWEROFF:
		ps = PS_POWEROFF;
		break;
	default:
		cmn_err(CE_PANIC, "unknown state %u\n", newstate);
		break;
	}

	len = snprintf(report, REPORT_LEN,
	    "cpu%d: externally initiated %s", id, ps);

	if (!error) {
		cmn_err(CE_CONT, "!%s\n", report);
		kmem_free(report, REPORT_LEN);
		return;
	}

	len += snprintf(report + len, REPORT_LEN - len,
	    " failed, error %d: ", error);
	switch (error) {
	case EEXIST:
		len += snprintf(report + len, REPORT_LEN - len,
		    "cpu already %s", ps ? ps : "?");
		break;
	case ESRCH:
		len += snprintf(report + len, REPORT_LEN - len,
		    "cpu not found");
		break;
	case EINVAL:
	case EALREADY:
		break;
	case EPERM:
		len += snprintf(report + len, REPORT_LEN - len,
		    "insufficient privilege (0x%x)", id);
		break;
	case EBUSY:
		switch (newstate) {
		case P_ONLINE:
			/*
			 * This return comes from mp_cpu_start -
			 * we cannot 'start' the boot CPU.
			 */
			len += snprintf(report + len, REPORT_LEN - len,
			    "already running");
			break;
		case P_POWEROFF:
			len += snprintf(report + len, REPORT_LEN - len,
			    "bound lwps?");
			break;
		default:
			break;
		}
	default:
		break;
	}

	cmn_err(CE_CONT, "%s\n", report);
	kmem_free(report, REPORT_LEN);
}

static void
vcpu_config(void *arg)
{
	int id = (int)(uintptr_t)arg;
	int error;
	char dir[16];
	char *state;

	if ((uint_t)id >= max_ncpus) {
		cmn_err(CE_WARN,
		    "vcpu_config: cpu%d does not fit in this domain", id);
		return;
	}

	(void) snprintf(dir, sizeof (dir), "cpu/%d", id);
	state = kmem_alloc(MAXPATHLEN, KM_SLEEP);
	if (xenbus_scanf(XBT_NULL, dir, "availability", "%s", state) == 0) {
		if (strcmp(state, "online") == 0) {
			error = vcpu_config_poweron(id);
			vcpu_config_report(id, P_ONLINE, error);
		} else if (strcmp(state, "offline") == 0) {
			error = vcpu_config_poweroff(id);
			vcpu_config_report(id, P_POWEROFF, error);
		} else {
			cmn_err(CE_WARN,
			    "cpu%d: unknown target state '%s'", id, state);
		}
	} else
		cmn_err(CE_WARN,
		    "cpu%d: unable to read target state from xenstore", id);

	kmem_free(state, MAXPATHLEN);
}

/*ARGSUSED*/
static void
vcpu_config_event(struct xenbus_watch *watch, const char **vec, uint_t len)
{
	const char *path = vec[XS_WATCH_PATH];
	processorid_t id;
	char *s;

	if ((s = strstr(path, "cpu/")) != NULL &&
	    sscanf(s, "cpu/%d", &id) == 1) {
		/*
		 * Run the virtual CPU configuration on a separate thread to
		 * avoid blocking on this event for too long (and for now,
		 * to ensure configuration requests are serialized.)
		 */
		(void) taskq_dispatch(cpu_config_tq,
		    vcpu_config, (void *)(uintptr_t)id, 0);
	}
}

static int
xen_vcpu_initialize(processorid_t id, vcpu_guest_context_t *vgc)
{
	int err;

	if ((err = HYPERVISOR_vcpu_op(VCPUOP_initialise, id, vgc)) != 0) {
		char *str;
		int level = CE_WARN;

		switch (err) {
		case -X_EINVAL:
			/*
			 * This interface squashes multiple error sources
			 * to one error code.  In particular, an X_EINVAL
			 * code can mean:
			 *
			 * -	the vcpu id is out of range
			 * -	cs or ss are in ring 0
			 * -	cr3 is wrong
			 * -	an entry in the new gdt is above the
			 *	reserved entry
			 * -	a frame underneath the new gdt is bad
			 */
			str = "something is wrong :(";
			break;
		case -X_ENOENT:
			str = "no such cpu";
			break;
		case -X_ENOMEM:
			str = "no mem to copy ctxt";
			break;
		case -X_EFAULT:
			str = "bad address";
			break;
		case -X_EEXIST:
			/*
			 * Hmm.  This error is returned if the vcpu has already
			 * been initialized once before in the lifetime of this
			 * domain.  This is a logic error in the kernel.
			 */
			level = CE_PANIC;
			str = "already initialized";
			break;
		default:
			level = CE_PANIC;
			str = "<unexpected>";
			break;
		}

		cmn_err(level, "vcpu%d: failed to init: error %d: %s",
		    id, -err, str);
	}
	return (err);
}

long
xen_vcpu_up(processorid_t id)
{
	long err;

	if ((err = HYPERVISOR_vcpu_op(VCPUOP_up, id, NULL)) != 0) {
		char *str;

		switch (err) {
		case -X_ENOENT:
			str = "no such cpu";
			break;
		case -X_EINVAL:
			/*
			 * Perhaps this is diagnostic overkill.
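			 * If VCPUOP_is_up also fails, the vcpu id itself is
			 * bad; otherwise the vcpu exists but was never
			 * initialized via VCPUOP_initialise.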
			 */
			if (HYPERVISOR_vcpu_op(VCPUOP_is_up, id, NULL) < 0)
				str = "bad cpuid";
			else
				str = "not initialized";
			break;
		default:
			str = "<unexpected>";
			break;
		}

		printf("vcpu%d: failed to start: error %d: %s\n",
		    id, -(int)err, str);
		return (EBFONT);	/* deliberately silly */
	}
	return (err);
}

long
xen_vcpu_down(processorid_t id)
{
	long err;

	if ((err = HYPERVISOR_vcpu_op(VCPUOP_down, id, NULL)) != 0) {
		/*
		 * X_ENOENT:	no such cpu
		 * X_EINVAL:	bad cpuid
		 */
		panic("vcpu%d: failed to stop: error %d", id, -(int)err);
	}

	return (err);
}