/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
 */
/*
 * Copyright (c) 2010, Intel Corporation.
 * All rights reserved.
 */

/*
 * Welcome to the world of the "real mode platter".
 * See also startup.c, mpcore.s and apic.c for related routines.
 */

#include <sys/types.h>
#include <sys/systm.h>
#include <sys/cpuvar.h>
#include <sys/cpu_module.h>
#include <sys/kmem.h>
#include <sys/archsystm.h>
#include <sys/machsystm.h>
#include <sys/controlregs.h>
#include <sys/x86_archext.h>
#include <sys/smp_impldefs.h>
#include <sys/sysmacros.h>
#include <sys/mach_mmu.h>
#include <sys/promif.h>
#include <sys/cpu.h>
#include <sys/cpu_event.h>
#include <sys/sunndi.h>
#include <sys/fs/dv_node.h>
#include <vm/hat_i86.h>
#include <vm/as.h>

extern cpuset_t cpu_ready_set;

extern int mp_start_cpu_common(cpu_t *cp, boolean_t boot);
extern void real_mode_start_cpu(void);
extern void real_mode_start_cpu_end(void);
extern void real_mode_stop_cpu_stage1(void);
extern void real_mode_stop_cpu_stage1_end(void);
extern void real_mode_stop_cpu_stage2(void);
extern void real_mode_stop_cpu_stage2_end(void);
extern void *(*cpu_pause_func)(void *);

void rmp_gdt_init(rm_platter_t *);

/*
 * Fill up the real mode platter to make it easy for the real mode code to
 * kick it off. This area should really be one passed by the boot loader to
 * the kernel, guaranteed to be below 1MB and aligned to 16 bytes, with
 * identical physical and virtual addresses in paged mode.
 */
static ushort_t *warm_reset_vector = NULL;

int
mach_cpucontext_init(void)
{
        ushort_t *vec;
        ulong_t addr;
        struct rm_platter *rm = (struct rm_platter *)rm_platter_va;

        if (!(vec = (ushort_t *)psm_map_phys(WARM_RESET_VECTOR,
            sizeof (vec), PROT_READ | PROT_WRITE)))
                return (-1);

        /*
         * Set up the secondary cpu BIOS boot up vector: write the page
         * offset of the startup code to 0x467 and its real-mode segment
         * to 0x469.
         */
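        /*
         * For reference: WARM_RESET_VECTOR is the real-mode far pointer at
         * 40:67h (physical 0x467) that the BIOS can dispatch through after
         * a warm reset, so vec[0] receives the IP and vec[1] the CS of the
         * startup trampoline.  As a purely illustrative example, a platter
         * at physical 0x70000 with rm_code at offset 0 would be encoded as
         * 7000:0000, which resolves back to physical 0x70000 in real mode.
         */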
        addr = (ulong_t)((caddr_t)rm->rm_code - (caddr_t)rm) + rm_platter_pa;
        vec[0] = (ushort_t)(addr & PAGEOFFSET);
        vec[1] = (ushort_t)((addr & (0xfffff & PAGEMASK)) >> 4);
        warm_reset_vector = vec;

        /* Map real mode platter into kas so kernel can access it. */
        hat_devload(kas.a_hat,
            (caddr_t)(uintptr_t)rm_platter_pa, MMU_PAGESIZE,
            btop(rm_platter_pa), PROT_READ | PROT_WRITE | PROT_EXEC,
            HAT_LOAD_NOCONSIST);

        /* Copy CPU startup code to rm_platter if we are still booting. */
        if (!plat_dr_enabled()) {
                ASSERT((size_t)real_mode_start_cpu_end -
                    (size_t)real_mode_start_cpu <= RM_PLATTER_CODE_SIZE);
                bcopy((caddr_t)real_mode_start_cpu, (caddr_t)rm->rm_code,
                    (size_t)real_mode_start_cpu_end -
                    (size_t)real_mode_start_cpu);
        }

        return (0);
}

void
mach_cpucontext_fini(void)
{
        if (warm_reset_vector)
                psm_unmap_phys((caddr_t)warm_reset_vector,
                    sizeof (warm_reset_vector));
        hat_unload(kas.a_hat, (caddr_t)(uintptr_t)rm_platter_pa, MMU_PAGESIZE,
            HAT_UNLOAD);
}

#if defined(__amd64)
extern void *long_mode_64(void);
#endif  /* __amd64 */

/*ARGSUSED*/
void
rmp_gdt_init(rm_platter_t *rm)
{

#if defined(__amd64)
        /* Use the kas address space for the CPU startup thread. */
        if (MAKECR3(kas.a_hat->hat_htable->ht_pfn) > 0xffffffffUL)
                panic("Cannot initialize CPUs; kernel's 64-bit page tables\n"
                    "located above 4G in physical memory (@ 0x%lx)",
                    MAKECR3(kas.a_hat->hat_htable->ht_pfn));

        /*
         * Setup pseudo-descriptors for temporary GDT and IDT for use ONLY
         * by code in real_mode_start_cpu():
         *
         * GDT[0]:  NULL selector
         * GDT[1]:  64-bit CS: Long = 1, Present = 1, bits 12, 11 = 1
         *
         * Clear the IDT as interrupts will be off and a limit of 0 will cause
         * the CPU to triple fault and reset on an NMI, seemingly as reasonable
         * a course of action as any other, though it may cause the entire
         * platform to reset in some cases...
         */
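        /*
         * For reference, 0x20980000000000 decodes as follows (base and limit
         * are ignored for a long-mode code segment): access byte 0x98 =
         * Present, DPL 0, S = 1, type = execute-only code; flags nibble 0x2
         * sets L (64-bit) with D/B and G clear.
         */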
        rm->rm_temp_gdt[0] = 0;
        rm->rm_temp_gdt[TEMPGDT_KCODE64] = 0x20980000000000ULL;

        rm->rm_temp_gdt_lim = (ushort_t)(sizeof (rm->rm_temp_gdt) - 1);
        rm->rm_temp_gdt_base = rm_platter_pa +
            (uint32_t)offsetof(rm_platter_t, rm_temp_gdt);
        rm->rm_temp_idt_lim = 0;
        rm->rm_temp_idt_base = 0;

        /*
         * Since the CPU needs to jump to protected mode using an identity
         * mapped address, we need to calculate it here.
         */
        rm->rm_longmode64_addr = rm_platter_pa +
            ((uint32_t)long_mode_64 - (uint32_t)real_mode_start_cpu);
#endif  /* __amd64 */
}

static void *
mach_cpucontext_alloc_tables(struct cpu *cp)
{
        struct tss *ntss;
        struct cpu_tables *ct;

        /*
         * Allocate space for stack, tss, gdt and idt. We round the size
         * allotted for cpu_tables up, so that the TSS is on a unique page.
         * This is more efficient when running in virtual machines.
         */
        ct = kmem_zalloc(P2ROUNDUP(sizeof (*ct), PAGESIZE), KM_SLEEP);
        if ((uintptr_t)ct & PAGEOFFSET)
                panic("mach_cpucontext_alloc_tables: cpu%d misaligned tables",
                    cp->cpu_id);

        ntss = cp->cpu_tss = &ct->ct_tss;

#if defined(__amd64)

        /*
         * #DF (double fault).
         */
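        /*
         * Note: tss_ist1 points at the top of this CPU's ct_stack; the #DF
         * gate is expected to use IST entry 1 (see the IDT setup), so a
         * double fault taken on a corrupt or unmapped stack still gets a
         * known-good stack to run on.
         */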
        ntss->tss_ist1 = (uint64_t)&ct->ct_stack[sizeof (ct->ct_stack)];

#elif defined(__i386)

        ntss->tss_esp0 = ntss->tss_esp1 = ntss->tss_esp2 = ntss->tss_esp =
            (uint32_t)&ct->ct_stack[sizeof (ct->ct_stack)];

        ntss->tss_ss0 = ntss->tss_ss1 = ntss->tss_ss2 = ntss->tss_ss = KDS_SEL;

        ntss->tss_eip = (uint32_t)cp->cpu_thread->t_pc;

        ntss->tss_cs = KCS_SEL;
        ntss->tss_ds = ntss->tss_es = KDS_SEL;
        ntss->tss_fs = KFS_SEL;
        ntss->tss_gs = KGS_SEL;

#endif  /* __i386 */

        /*
         * Set I/O bit map offset equal to size of TSS segment limit
         * for no I/O permission map. This will cause all user I/O
         * instructions to generate a #gp fault.
         */
        ntss->tss_bitmapbase = sizeof (*ntss);

        /*
         * Setup kernel tss.
         */
        set_syssegd((system_desc_t *)&cp->cpu_gdt[GDT_KTSS], cp->cpu_tss,
            sizeof (*cp->cpu_tss) - 1, SDT_SYSTSS, SEL_KPL);

        return (ct);
}

void *
mach_cpucontext_xalloc(struct cpu *cp, int optype)
{
        size_t len;
        struct cpu_tables *ct;
        rm_platter_t *rm = (rm_platter_t *)rm_platter_va;
        static int cpu_halt_code_ready;

        if (optype == MACH_CPUCONTEXT_OP_STOP) {
                ASSERT(plat_dr_enabled());

                /*
                 * The WARM_RESET_VECTOR has a limitation that the physical
                 * address written to it must be page-aligned. To work around
                 * this limitation, the CPU stop code has been split into
                 * two stages.
                 * The stage 2 code, which implements the real logic to halt
                 * CPUs, is copied to the rm_cpu_halt_code field in the real
                 * mode platter. The stage 1 code, which simply jumps to the
                 * stage 2 code in the rm_cpu_halt_code field, is copied to
                 * the rm_code field in the real mode platter and may be
                 * overwritten after the CPU has been stopped.
                 */
                if (!cpu_halt_code_ready) {
                        /*
                         * The rm_cpu_halt_code field in the real mode platter
                         * is used by the CPU stop code only. So only copy the
                         * CPU stop stage 2 code into the rm_cpu_halt_code
                         * field on the first call.
                         */
                        len = (size_t)real_mode_stop_cpu_stage2_end -
                            (size_t)real_mode_stop_cpu_stage2;
                        ASSERT(len <= RM_PLATTER_CPU_HALT_CODE_SIZE);
                        bcopy((caddr_t)real_mode_stop_cpu_stage2,
                            (caddr_t)rm->rm_cpu_halt_code, len);
                        cpu_halt_code_ready = 1;
                }

                /*
                 * The rm_code field in the real mode platter is shared by
                 * the CPU start, CPU stop, CPR and fast reboot code. So copy
                 * the CPU stop stage 1 code into the rm_code field every time.
                 */
                len = (size_t)real_mode_stop_cpu_stage1_end -
                    (size_t)real_mode_stop_cpu_stage1;
                ASSERT(len <= RM_PLATTER_CODE_SIZE);
                bcopy((caddr_t)real_mode_stop_cpu_stage1,
                    (caddr_t)rm->rm_code, len);
                rm->rm_cpu_halted = 0;

                return (cp->cpu_m.mcpu_mach_ctx_ptr);
        } else if (optype != MACH_CPUCONTEXT_OP_START) {
                return (NULL);
        }

        /*
         * Only need to allocate tables when starting CPU.
         * Tables allocated when starting CPU will be reused when stopping CPU.
         */
        ct = mach_cpucontext_alloc_tables(cp);
        if (ct == NULL) {
                return (NULL);
        }

        /* Copy CPU startup code to rm_platter for CPU hot-add operations. */
        if (plat_dr_enabled()) {
                bcopy((caddr_t)real_mode_start_cpu, (caddr_t)rm->rm_code,
                    (size_t)real_mode_start_cpu_end -
                    (size_t)real_mode_start_cpu);
        }

        /*
         * Now copy all that we've set up onto the real mode platter
         * for the real mode code to digest as part of starting the cpu.
         */
        rm->rm_idt_base = cp->cpu_idt;
        rm->rm_idt_lim = sizeof (*cp->cpu_idt) * NIDT - 1;
        rm->rm_gdt_base = cp->cpu_gdt;
        rm->rm_gdt_lim = sizeof (*cp->cpu_gdt) * NGDT - 1;

        /*
         * The CPU needs to access the kernel address space after powering on.
         * When hot-adding a CPU at runtime, use the top level page table of
         * kas directly rather than the return value of getcr3(): getcr3()
         * returns the current process's top level page table, which may
         * differ from that of kas.
         */
        rm->rm_pdbr = MAKECR3(kas.a_hat->hat_htable->ht_pfn);
        rm->rm_cpu = cp->cpu_id;

        /*
         * For hot-adding a CPU at runtime, Machine Check and Performance
         * Counter should be disabled. They will be enabled on demand after
         * the CPU powers on successfully.
         */
        rm->rm_cr4 = getcr4();
        rm->rm_cr4 &= ~(CR4_MCE | CR4_PCE);

        rmp_gdt_init(rm);

        return (ct);
}

void
mach_cpucontext_xfree(struct cpu *cp, void *arg, int err, int optype)
{
        struct cpu_tables *ct = arg;

        ASSERT(&ct->ct_tss == cp->cpu_tss);
        if (optype == MACH_CPUCONTEXT_OP_START) {
                switch (err) {
                case 0:
                        /*
                         * Save pointer for reuse when stopping CPU.
                         */
                        cp->cpu_m.mcpu_mach_ctx_ptr = arg;
                        break;
                case ETIMEDOUT:
                        /*
                         * The processor was poked, but failed to start before
                         * we gave up waiting for it. In case it starts later,
                         * don't free anything.
                         */
                        cp->cpu_m.mcpu_mach_ctx_ptr = arg;
                        break;
                default:
                        /*
                         * Some other, passive, error occurred.
                         */
                        kmem_free(ct, P2ROUNDUP(sizeof (*ct), PAGESIZE));
                        cp->cpu_tss = NULL;
                        break;
                }
        } else if (optype == MACH_CPUCONTEXT_OP_STOP) {
                switch (err) {
                case 0:
                        /*
                         * Free resources allocated when starting CPU.
                         */
                        kmem_free(ct, P2ROUNDUP(sizeof (*ct), PAGESIZE));
                        cp->cpu_tss = NULL;
                        cp->cpu_m.mcpu_mach_ctx_ptr = NULL;
                        break;
                default:
                        /*
                         * Don't touch table pointer in case of failure.
                         */
                        break;
                }
        } else {
                ASSERT(0);
        }
}

void *
mach_cpucontext_alloc(struct cpu *cp)
{
        return (mach_cpucontext_xalloc(cp, MACH_CPUCONTEXT_OP_START));
}

void
mach_cpucontext_free(struct cpu *cp, void *arg, int err)
{
        mach_cpucontext_xfree(cp, arg, err, MACH_CPUCONTEXT_OP_START);
}
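
/*
 * For orientation, the expected start-side lifecycle of this context is
 * roughly the following (a sketch only; the MP startup code is
 * authoritative):
 *
 *      ctx = mach_cpucontext_alloc(cp);        -- platter + per-CPU tables
 *      <the PSM sends INIT/SIPI; the AP enters real_mode_start_cpu via
 *       rm_code and eventually shows up in cpu_ready_set>
 *      mach_cpucontext_free(cp, ctx, err);     -- ctx is kept on success
 *                                                 and on ETIMEDOUT
 */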

/*
 * "Enter monitor."  Called via cross-call from stop_other_cpus().
 */
void
mach_cpu_halt(char *msg)
{
        if (msg)
                prom_printf("%s\n", msg);

        /*CONSTANTCONDITION*/
        while (1)
                ;
}

void
mach_cpu_idle(void)
{
        i86_halt();
}

void
mach_cpu_pause(volatile char *safe)
{
        /*
         * This cpu is now safe.
         */
        *safe = PAUSE_WAIT;
        membar_enter(); /* make sure stores are flushed */

        /*
         * Now we wait. When we are allowed to continue, safe
         * will be set to PAUSE_IDLE.
         */
        while (*safe != PAUSE_IDLE)
                SMT_PAUSE();
}

/*
 * Power on the target CPU.
 */
int
mp_cpu_poweron(struct cpu *cp)
{
        int error;
        cpuset_t tempset;
        processorid_t cpuid;

        ASSERT(cp != NULL);
        cpuid = cp->cpu_id;
        if (use_mp == 0 || plat_dr_support_cpu() == 0) {
                return (ENOTSUP);
        } else if (cpuid < 0 || cpuid >= max_ncpus) {
                return (EINVAL);
        }

        /*
         * The current x86 implementation of mp_cpu_configure() and
         * mp_cpu_poweron() has a limitation that mp_cpu_poweron() can only
         * be called once after calling mp_cpu_configure() for a specific CPU.
         * This is because mp_cpu_poweron() destroys the data structures
         * created by mp_cpu_configure(). So reject the request if the CPU
         * has already been powered on once after calling mp_cpu_configure().
         * This limitation only affects the p_online syscall; the DR driver
         * is not affected because it always invokes the public CPU management
         * interfaces in the predefined order:
         * cpu_configure()->cpu_poweron()...->cpu_poweroff()->cpu_unconfigure()
         */
        if (cpuid_checkpass(cp, 4) || cp->cpu_thread == cp->cpu_idle_thread) {
                return (ENOTSUP);
        }

        /*
         * Check if there's at least a Mbyte of kmem available
         * before attempting to start the cpu.
         */
        if (kmem_avail() < 1024 * 1024) {
                /*
                 * Kick off a reap in case that helps us with
                 * later attempts.
                 */
                kmem_reap();
                return (ENOMEM);
        }

        affinity_set(CPU->cpu_id);

        /*
         * Start the target CPU. No need to call mach_cpucontext_fini()
         * if mach_cpucontext_init() fails.
         */
        if ((error = mach_cpucontext_init()) == 0) {
                error = mp_start_cpu_common(cp, B_FALSE);
                mach_cpucontext_fini();
        }
        if (error != 0) {
                affinity_clear();
                return (error);
        }

        /* Wait for the target cpu to reach READY state. */
        tempset = cpu_ready_set;
        while (!CPU_IN_SET(tempset, cpuid)) {
                delay(1);
                tempset = *((volatile cpuset_t *)&cpu_ready_set);
        }

        /* Mark the target CPU as available for mp operation. */
        CPUSET_ATOMIC_ADD(mp_cpus, cpuid);

        /* Free the space allocated to hold the microcode file */
        ucode_cleanup();

        affinity_clear();

        return (0);
}

#define MP_CPU_DETACH_MAX_TRIES         5
#define MP_CPU_DETACH_DELAY             100

static int
mp_cpu_detach_driver(dev_info_t *dip)
{
        int i;
        int rv = EBUSY;
        dev_info_t *pdip;

        pdip = ddi_get_parent(dip);
        ASSERT(pdip != NULL);
        /*
         * Check if caller holds pdip busy - can cause deadlocks in
         * e_ddi_branch_unconfigure(), which calls devfs_clean().
         */
        if (DEVI_BUSY_OWNED(pdip)) {
                return (EDEADLOCK);
        }

        for (i = 0; i < MP_CPU_DETACH_MAX_TRIES; i++) {
                if (e_ddi_branch_unconfigure(dip, NULL, 0) == 0) {
                        rv = 0;
                        break;
                }
                DELAY(MP_CPU_DETACH_DELAY);
        }

        return (rv);
}

/*
 * Power off the target CPU.
 * Note: cpu_lock will be released and then reacquired.
 */
int
mp_cpu_poweroff(struct cpu *cp)
{
        int rv = 0;
        void *ctx;
        dev_info_t *dip = NULL;
        rm_platter_t *rm = (rm_platter_t *)rm_platter_va;
        extern void cpupm_start(cpu_t *);
        extern void cpupm_stop(cpu_t *);

        ASSERT(cp != NULL);
        ASSERT((cp->cpu_flags & CPU_OFFLINE) != 0);
        ASSERT((cp->cpu_flags & CPU_QUIESCED) != 0);

        if (use_mp == 0 || plat_dr_support_cpu() == 0) {
                return (ENOTSUP);
        }
        /*
         * There is no support for powering off cpu0 yet.
         * There are many pieces of code which have a hard dependency on cpu0.
         */
        if (cp->cpu_id == 0) {
                return (ENOTSUP);
        }

        if (mach_cpu_get_device_node(cp, &dip) != PSM_SUCCESS) {
                return (ENXIO);
        }
        ASSERT(dip != NULL);
        if (mp_cpu_detach_driver(dip) != 0) {
                rv = EBUSY;
                goto out_online;
        }

        /* Allocate CPU context for stopping */
        if (mach_cpucontext_init() != 0) {
                rv = ENXIO;
                goto out_online;
        }
        ctx = mach_cpucontext_xalloc(cp, MACH_CPUCONTEXT_OP_STOP);
        if (ctx == NULL) {
                rv = ENXIO;
                goto out_context_fini;
        }

        cpupm_stop(cp);
        cpu_event_fini_cpu(cp);

        if (cp->cpu_m.mcpu_cmi_hdl != NULL) {
                cmi_fini(cp->cpu_m.mcpu_cmi_hdl);
                cp->cpu_m.mcpu_cmi_hdl = NULL;
        }

        rv = mach_cpu_stop(cp, ctx);
        if (rv != 0) {
                goto out_enable_cmi;
        }

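        /*
         * The stage 2 halt code running on the target CPU writes 0xdead
         * into rm_cpu_halted once it reaches its final halt loop (see
         * real_mode_stop_cpu_stage2); rm_cpu_halted was cleared when the
         * stop context was built above.  Once the handshake completes,
         * rm_cpu_halted is set to 0xffff, presumably so a stale 0xdead
         * cannot satisfy a later poweroff request.
         */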
        /* Wait until the target CPU has been halted. */
        while (*(volatile ushort_t *)&(rm->rm_cpu_halted) != 0xdead) {
                delay(1);
        }
        rm->rm_cpu_halted = 0xffff;

        /* CPU_READY has been cleared by mach_cpu_stop. */
        ASSERT((cp->cpu_flags & CPU_READY) == 0);
        ASSERT((cp->cpu_flags & CPU_RUNNING) == 0);
        cp->cpu_flags = CPU_OFFLINE | CPU_QUIESCED | CPU_POWEROFF;
        CPUSET_ATOMIC_DEL(mp_cpus, cp->cpu_id);

        mach_cpucontext_xfree(cp, ctx, 0, MACH_CPUCONTEXT_OP_STOP);
        mach_cpucontext_fini();

        return (0);

out_enable_cmi:
        {
                cmi_hdl_t hdl;

                if ((hdl = cmi_init(CMI_HDL_NATIVE, cmi_ntv_hwchipid(cp),
                    cmi_ntv_hwcoreid(cp), cmi_ntv_hwstrandid(cp))) != NULL) {
                        if (is_x86_feature(x86_featureset, X86FSET_MCA))
                                cmi_mca_init(hdl);
                        cp->cpu_m.mcpu_cmi_hdl = hdl;
                }
        }
        cpu_event_init_cpu(cp);
        cpupm_start(cp);
        mach_cpucontext_xfree(cp, ctx, rv, MACH_CPUCONTEXT_OP_STOP);

out_context_fini:
        mach_cpucontext_fini();

out_online:
        (void) e_ddi_branch_configure(dip, NULL, 0);

        if (rv != EAGAIN && rv != ETIME) {
                rv = ENXIO;
        }

        return (rv);
}

/*
 * Return vcpu state; since this could be a virtual environment that we
 * are unaware of, return "unknown".
 */
/* ARGSUSED */
int
vcpu_on_pcpu(processorid_t cpu)
{
        return (VCPU_STATE_UNKNOWN);
}