/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
 * Use is subject to license terms.
 */
/*
 * Copyright (c) 2010, Intel Corporation.
 * All rights reserved.
 */

/*
 * Welcome to the world of the "real mode platter".
 * See also startup.c, mpcore.s and apic.c for related routines.
 */

#include <sys/types.h>
#include <sys/systm.h>
#include <sys/cpuvar.h>
#include <sys/cpu_module.h>
#include <sys/kmem.h>
#include <sys/archsystm.h>
#include <sys/machsystm.h>
#include <sys/controlregs.h>
#include <sys/x86_archext.h>
#include <sys/smp_impldefs.h>
#include <sys/sysmacros.h>
#include <sys/mach_mmu.h>
#include <sys/promif.h>
#include <sys/cpu.h>
#include <sys/cpu_event.h>
#include <sys/sunndi.h>
#include <sys/fs/dv_node.h>
#include <vm/hat_i86.h>
#include <vm/as.h>

extern cpuset_t cpu_ready_set;

extern int mp_start_cpu_common(cpu_t *cp, boolean_t boot);
extern void real_mode_start_cpu(void);
extern void real_mode_start_cpu_end(void);
extern void real_mode_stop_cpu_stage1(void);
extern void real_mode_stop_cpu_stage1_end(void);
extern void real_mode_stop_cpu_stage2(void);
extern void real_mode_stop_cpu_stage2_end(void);
extern void *(*cpu_pause_func)(void *);

void rmp_gdt_init(rm_platter_t *);

/*
 * Fill up the real mode platter to make it easy for real mode code to
 * kick it off. This area should really be one passed by boot to kernel
 * and guaranteed to be below 1MB and aligned to 16 bytes. Should also
 * have identical physical and virtual addresses in paged mode.
 */
static ushort_t *warm_reset_vector = NULL;

int
mach_cpucontext_init(void)
{
	ushort_t *vec;
	ulong_t addr;
	struct rm_platter *rm = (struct rm_platter *)rm_platter_va;

	if (!(vec = (ushort_t *)psm_map_phys(WARM_RESET_VECTOR,
	    sizeof (vec), PROT_READ | PROT_WRITE)))
		return (-1);

	/*
	 * Set up the secondary CPU BIOS boot-up vector: write the offset
	 * within the page to 0x467 and the real-mode segment of the page
	 * to 0x469.
	 */
	addr = (ulong_t)((caddr_t)rm->rm_code - (caddr_t)rm) + rm_platter_pa;
	vec[0] = (ushort_t)(addr & PAGEOFFSET);
	vec[1] = (ushort_t)((addr & (0xfffff & PAGEMASK)) >> 4);
	warm_reset_vector = vec;
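
	/*
	 * The warm reset vector is a real-mode far pointer: the 16-bit IP
	 * lives at physical 0x467 and the 16-bit CS segment at 0x469, so
	 * the pair written above points back at the platter code. For a
	 * hypothetical rm_code physical address of 0x14000, vec[0] becomes
	 * 0x0000 and vec[1] becomes 0x1400, and 0x1400:0x0000 resolves to
	 * 0x14000 again.
	 */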

	/* Map real mode platter into kas so kernel can access it. */
	hat_devload(kas.a_hat,
	    (caddr_t)(uintptr_t)rm_platter_pa, MMU_PAGESIZE,
	    btop(rm_platter_pa), PROT_READ | PROT_WRITE | PROT_EXEC,
	    HAT_LOAD_NOCONSIST);

	/* Copy CPU startup code to rm_platter if it's still during boot. */
	if (!plat_dr_enabled()) {
		ASSERT((size_t)real_mode_start_cpu_end -
		    (size_t)real_mode_start_cpu <= RM_PLATTER_CODE_SIZE);
		bcopy((caddr_t)real_mode_start_cpu, (caddr_t)rm->rm_code,
		    (size_t)real_mode_start_cpu_end -
		    (size_t)real_mode_start_cpu);
	}

	return (0);
}

void
mach_cpucontext_fini(void)
{
	if (warm_reset_vector)
		psm_unmap_phys((caddr_t)warm_reset_vector,
		    sizeof (warm_reset_vector));
	hat_unload(kas.a_hat, (caddr_t)(uintptr_t)rm_platter_pa, MMU_PAGESIZE,
	    HAT_UNLOAD);
}

#if defined(__amd64)
extern void *long_mode_64(void);
#endif /* __amd64 */

/*ARGSUSED*/
void
rmp_gdt_init(rm_platter_t *rm)
{

#if defined(__amd64)
	/* Use the kas address space for the CPU startup thread. */
	if (MAKECR3(kas.a_hat->hat_htable->ht_pfn) > 0xffffffffUL)
		panic("Cannot initialize CPUs; kernel's 64-bit page tables\n"
		    "located above 4G in physical memory (@ 0x%lx)",
		    MAKECR3(kas.a_hat->hat_htable->ht_pfn));

	/*
	 * Setup pseudo-descriptors for temporary GDT and IDT for use ONLY
	 * by code in real_mode_start_cpu():
	 *
	 *	GDT[0]: NULL selector
	 *	GDT[1]: 64-bit CS: Long = 1, Present = 1, bits 12, 11 = 1
	 *
	 * Clear the IDT as interrupts will be off and a limit of 0 will cause
	 * the CPU to triple fault and reset on an NMI, seemingly as reasonable
	 * a course of action as any other, though it may cause the entire
	 * platform to reset in some cases...
	 */
	rm->rm_temp_gdt[0] = 0;
	rm->rm_temp_gdt[TEMPGDT_KCODE64] = 0x20980000000000ULL;
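
	/*
	 * 0x20980000000000 packs access byte 0x98 (Present, DPL 0, code
	 * segment) with the L (long mode) flag set and D/B clear; base and
	 * limit are ignored for a 64-bit code segment.
	 */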

	rm->rm_temp_gdt_lim = (ushort_t)(sizeof (rm->rm_temp_gdt) - 1);
	rm->rm_temp_gdt_base = rm_platter_pa +
	    (uint32_t)offsetof(rm_platter_t, rm_temp_gdt);
	rm->rm_temp_idt_lim = 0;
	rm->rm_temp_idt_base = 0;

	/*
	 * Since the CPU needs to jump to protected mode using an identity
	 * mapped address, we need to calculate it here.
	 */
	rm->rm_longmode64_addr = rm_platter_pa +
	    ((uint32_t)long_mode_64 - (uint32_t)real_mode_start_cpu);
#endif /* __amd64 */
}

static void *
mach_cpucontext_alloc_tables(struct cpu *cp)
{
	struct tss *ntss;
	struct cpu_tables *ct;

	/*
	 * Allocate space for stack, tss, gdt and idt. We round the size
	 * allotted for cpu_tables up, so that the TSS is on a unique page.
	 * This is more efficient when running in virtual machines.
	 */
	ct = kmem_zalloc(P2ROUNDUP(sizeof (*ct), PAGESIZE), KM_SLEEP);
	if ((uintptr_t)ct & PAGEOFFSET)
		panic("mach_cpucontext_alloc_tables: cpu%d misaligned tables",
		    cp->cpu_id);

	ntss = cp->cpu_tss = &ct->ct_tss;

#if defined(__amd64)

	/*
	 * #DF (double fault).
	 */
	ntss->tss_ist1 = (uint64_t)&ct->ct_stack[sizeof (ct->ct_stack)];

#elif defined(__i386)

	ntss->tss_esp0 = ntss->tss_esp1 = ntss->tss_esp2 = ntss->tss_esp =
	    (uint32_t)&ct->ct_stack[sizeof (ct->ct_stack)];

	ntss->tss_ss0 = ntss->tss_ss1 = ntss->tss_ss2 = ntss->tss_ss = KDS_SEL;

	ntss->tss_eip = (uint32_t)cp->cpu_thread->t_pc;

	ntss->tss_cs = KCS_SEL;
	ntss->tss_ds = ntss->tss_es = KDS_SEL;
	ntss->tss_fs = KFS_SEL;
	ntss->tss_gs = KGS_SEL;

#endif /* __i386 */

	/*
	 * Set the I/O bit map offset equal to the size of the TSS segment
	 * limit for no I/O permission map. This will cause all user I/O
	 * instructions to generate a #GP fault.
	 */
	ntss->tss_bitmapbase = sizeof (*ntss);

	/*
	 * Setup kernel tss.
	 */
	set_syssegd((system_desc_t *)&cp->cpu_gdt[GDT_KTSS], cp->cpu_tss,
	    sizeof (*cp->cpu_tss) - 1, SDT_SYSTSS, SEL_KPL);

	return (ct);
}

void *
mach_cpucontext_xalloc(struct cpu *cp, int optype)
{
	size_t len;
	struct cpu_tables *ct;
	rm_platter_t *rm = (rm_platter_t *)rm_platter_va;
	static int cpu_halt_code_ready;

	if (optype == MACH_CPUCONTEXT_OP_STOP) {
		ASSERT(plat_dr_enabled());

		/*
		 * The WARM_RESET_VECTOR has a limitation that the physical
		 * address written to it must be page-aligned. To work around
		 * this limitation, the CPU stop code has been split into
		 * two stages.
		 * The stage 2 code, which implements the real logic to halt
		 * CPUs, is copied to the rm_cpu_halt_code field in the real
		 * mode platter. The stage 1 code, which simply jumps to the
		 * stage 2 code in the rm_cpu_halt_code field, is copied to
		 * the rm_code field in the real mode platter and may be
		 * overwritten after the CPU has been stopped.
		 */
		if (!cpu_halt_code_ready) {
			/*
			 * The rm_cpu_halt_code field in the real mode platter
			 * is used by the CPU stop code only. So only copy the
			 * CPU stop stage 2 code into the rm_cpu_halt_code
			 * field on the first call.
			 */
			len = (size_t)real_mode_stop_cpu_stage2_end -
			    (size_t)real_mode_stop_cpu_stage2;
			ASSERT(len <= RM_PLATTER_CPU_HALT_CODE_SIZE);
			bcopy((caddr_t)real_mode_stop_cpu_stage2,
			    (caddr_t)rm->rm_cpu_halt_code, len);
			cpu_halt_code_ready = 1;
		}

		/*
		 * The rm_code field in the real mode platter is shared by
		 * the CPU start, CPU stop, CPR and fast reboot code. So copy
		 * the CPU stop stage 1 code into the rm_code field every time.
		 */
		len = (size_t)real_mode_stop_cpu_stage1_end -
		    (size_t)real_mode_stop_cpu_stage1;
		ASSERT(len <= RM_PLATTER_CODE_SIZE);
		bcopy((caddr_t)real_mode_stop_cpu_stage1,
		    (caddr_t)rm->rm_code, len);
		rm->rm_cpu_halted = 0;

		return (cp->cpu_m.mcpu_mach_ctx_ptr);
	} else if (optype != MACH_CPUCONTEXT_OP_START) {
		return (NULL);
	}

	/*
	 * Only need to allocate tables when starting CPU.
	 * Tables allocated when starting CPU will be reused when stopping CPU.
	 */
	ct = mach_cpucontext_alloc_tables(cp);
	if (ct == NULL) {
		return (NULL);
	}

	/* Copy CPU startup code to rm_platter for CPU hot-add operations. */
	if (plat_dr_enabled()) {
		bcopy((caddr_t)real_mode_start_cpu, (caddr_t)rm->rm_code,
		    (size_t)real_mode_start_cpu_end -
		    (size_t)real_mode_start_cpu);
	}

	/*
	 * Now copy all that we've set up onto the real mode platter
	 * for the real mode code to digest as part of starting the cpu.
	 */
	rm->rm_idt_base = cp->cpu_idt;
	rm->rm_idt_lim = sizeof (*cp->cpu_idt) * NIDT - 1;
	rm->rm_gdt_base = cp->cpu_gdt;
	rm->rm_gdt_lim = sizeof (*cp->cpu_gdt) * NGDT - 1;
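
	/*
	 * Descriptor table limits are inclusive: the value loaded by
	 * lgdt/lidt is the table size in bytes minus one, hence the "- 1"
	 * here and in the temporary GDT set up by rmp_gdt_init().
	 */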

	/*
	 * CPU needs to access kernel address space after powering on.
	 * When hot-adding a CPU at runtime, directly use the top level page
	 * table of kas rather than the return value of getcr3(). getcr3()
	 * returns the current process's top level page table, which may
	 * differ from the one of kas.
	 */
	rm->rm_pdbr = MAKECR3(kas.a_hat->hat_htable->ht_pfn);
	rm->rm_cpu = cp->cpu_id;
	rm->rm_x86feature = x86_feature;

	/*
	 * When hot-adding a CPU at runtime, Machine Check and Performance
	 * Counter should be disabled. They will be enabled on demand after
	 * the CPU powers on successfully.
	 */
	rm->rm_cr4 = getcr4();
	rm->rm_cr4 &= ~(CR4_MCE | CR4_PCE);
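
	/*
	 * CR4_MCE controls #MC (machine check) delivery and CR4_PCE controls
	 * user-mode RDPMC, so clearing them leaves both features off until
	 * they are re-enabled as described above.
	 */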

	rmp_gdt_init(rm);

	return (ct);
}

void
mach_cpucontext_xfree(struct cpu *cp, void *arg, int err, int optype)
{
	struct cpu_tables *ct = arg;

	ASSERT(&ct->ct_tss == cp->cpu_tss);
	if (optype == MACH_CPUCONTEXT_OP_START) {
		switch (err) {
		case 0:
			/*
			 * Save pointer for reuse when stopping CPU.
			 */
			cp->cpu_m.mcpu_mach_ctx_ptr = arg;
			break;
		case ETIMEDOUT:
			/*
			 * The processor was poked, but failed to start before
			 * we gave up waiting for it. In case it starts later,
			 * don't free anything.
			 */
			cp->cpu_m.mcpu_mach_ctx_ptr = arg;
			break;
		default:
			/*
			 * Some other, passive, error occurred.
			 */
			kmem_free(ct, P2ROUNDUP(sizeof (*ct), PAGESIZE));
			cp->cpu_tss = NULL;
			break;
		}
	} else if (optype == MACH_CPUCONTEXT_OP_STOP) {
		switch (err) {
		case 0:
			/*
			 * Free resources allocated when starting CPU.
			 */
			kmem_free(ct, P2ROUNDUP(sizeof (*ct), PAGESIZE));
			cp->cpu_tss = NULL;
			cp->cpu_m.mcpu_mach_ctx_ptr = NULL;
			break;
		default:
			/*
			 * Don't touch table pointer in case of failure.
			 */
			break;
		}
	} else {
		ASSERT(0);
	}
}

void *
mach_cpucontext_alloc(struct cpu *cp)
{
	return (mach_cpucontext_xalloc(cp, MACH_CPUCONTEXT_OP_START));
}

void
mach_cpucontext_free(struct cpu *cp, void *arg, int err)
{
	mach_cpucontext_xfree(cp, arg, err, MACH_CPUCONTEXT_OP_START);
}

/*
 * "Enter monitor." Called via cross-call from stop_other_cpus().
 */
void
mach_cpu_halt(char *msg)
{
	if (msg)
		prom_printf("%s\n", msg);

	/*CONSTANTCONDITION*/
	while (1)
		;
}

void
mach_cpu_idle(void)
{
	i86_halt();
}

void
mach_cpu_pause(volatile char *safe)
{
	/*
	 * This cpu is now safe.
	 */
	*safe = PAUSE_WAIT;
	membar_enter(); /* make sure stores are flushed */

	/*
	 * Now we wait. When we are allowed to continue, safe
	 * will be set to PAUSE_IDLE.
	 */
	while (*safe != PAUSE_IDLE)
		SMT_PAUSE();
}

/*
 * Power on the target CPU.
 */
int
mp_cpu_poweron(struct cpu *cp)
{
	int error;
	cpuset_t tempset;
	processorid_t cpuid;

	ASSERT(cp != NULL);
	cpuid = cp->cpu_id;
	if (use_mp == 0 || plat_dr_support_cpu() == 0) {
		return (ENOTSUP);
	} else if (cpuid < 0 || cpuid >= max_ncpus) {
		return (EINVAL);
	}

	/*
	 * The current x86 implementation of mp_cpu_configure() and
	 * mp_cpu_poweron() has a limitation that mp_cpu_poweron() may only
	 * be called once after calling mp_cpu_configure() for a specific CPU.
	 * This is because mp_cpu_poweron() destroys the data structures
	 * created by mp_cpu_configure(). So reject the request if the CPU
	 * has already been powered on once after calling mp_cpu_configure().
	 * This limitation only affects the p_online syscall; the DR driver
	 * won't be affected because it always invokes the public CPU
	 * management interfaces in the predefined order:
	 * cpu_configure()->cpu_poweron()...->cpu_poweroff()->cpu_unconfigure()
	 */
	if (cpuid_checkpass(cp, 4) || cp->cpu_thread == cp->cpu_idle_thread) {
		return (ENOTSUP);
	}

	/*
	 * Check if there's at least a megabyte of kmem available
	 * before attempting to start the cpu.
	 */
	if (kmem_avail() < 1024 * 1024) {
		/*
		 * Kick off a reap in case that helps us with
		 * later attempts ..
		 */
		kmem_reap();
		return (ENOMEM);
	}

	affinity_set(CPU->cpu_id);

	/*
	 * Start the target CPU. No need to call mach_cpucontext_fini()
	 * if mach_cpucontext_init() fails.
	 */
	if ((error = mach_cpucontext_init()) == 0) {
		error = mp_start_cpu_common(cp, B_FALSE);
		mach_cpucontext_fini();
	}
	if (error != 0) {
		affinity_clear();
		return (error);
	}

	/* Wait for the target cpu to reach READY state. */
	tempset = cpu_ready_set;
	while (!CPU_IN_SET(tempset, cpuid)) {
		delay(1);
		tempset = *((volatile cpuset_t *)&cpu_ready_set);
	}

	/* Mark the target CPU as available for mp operation. */
	CPUSET_ATOMIC_ADD(mp_cpus, cpuid);

	/* Free the space allocated to hold the microcode file */
	ucode_cleanup();

	affinity_clear();

	return (0);
}

#define	MP_CPU_DETACH_MAX_TRIES		5
#define	MP_CPU_DETACH_DELAY		100

static int
mp_cpu_detach_driver(dev_info_t *dip)
{
	int i;
	int rv = EBUSY;
	dev_info_t *pdip;

	pdip = ddi_get_parent(dip);
	ASSERT(pdip != NULL);
	/*
	 * Check if the caller holds pdip busy - that can cause deadlocks in
	 * e_ddi_branch_unconfigure(), which calls devfs_clean().
	 */
	if (DEVI_BUSY_OWNED(pdip)) {
		return (EDEADLOCK);
	}

	for (i = 0; i < MP_CPU_DETACH_MAX_TRIES; i++) {
		if (e_ddi_branch_unconfigure(dip, NULL, 0) == 0) {
			rv = 0;
			break;
		}
		DELAY(MP_CPU_DETACH_DELAY);
	}

	return (rv);
}

/*
 * Power off the target CPU.
 * Note: cpu_lock will be released and then reacquired.
 */
int
mp_cpu_poweroff(struct cpu *cp)
{
	int rv = 0;
	void *ctx;
	dev_info_t *dip = NULL;
	rm_platter_t *rm = (rm_platter_t *)rm_platter_va;
	extern void cpupm_start(cpu_t *);
	extern void cpupm_stop(cpu_t *);

	ASSERT(cp != NULL);
	ASSERT((cp->cpu_flags & CPU_OFFLINE) != 0);
	ASSERT((cp->cpu_flags & CPU_QUIESCED) != 0);

	if (use_mp == 0 || plat_dr_support_cpu() == 0) {
		return (ENOTSUP);
	}
	/*
	 * There is no support for powering off cpu0 yet.
	 * There are many pieces of code which have a hard dependency on cpu0.
	 */
	if (cp->cpu_id == 0) {
		return (ENOTSUP);
	}

	if (mach_cpu_get_device_node(cp, &dip) != PSM_SUCCESS) {
		return (ENXIO);
	}
	ASSERT(dip != NULL);
	if (mp_cpu_detach_driver(dip) != 0) {
		rv = EBUSY;
		goto out_online;
	}

	/* Allocate CPU context for stopping */
	if (mach_cpucontext_init() != 0) {
		rv = ENXIO;
		goto out_online;
	}
	ctx = mach_cpucontext_xalloc(cp, MACH_CPUCONTEXT_OP_STOP);
	if (ctx == NULL) {
		rv = ENXIO;
		goto out_context_fini;
	}

	cpupm_stop(cp);
	cpu_event_fini_cpu(cp);

	if (cp->cpu_m.mcpu_cmi_hdl != NULL) {
		cmi_fini(cp->cpu_m.mcpu_cmi_hdl);
		cp->cpu_m.mcpu_cmi_hdl = NULL;
	}

	rv = mach_cpu_stop(cp, ctx);
	if (rv != 0) {
		goto out_enable_cmi;
	}

	/*
	 * Wait until the target CPU has been halted; the stage 2 halt code
	 * signals this by storing 0xdead in rm_cpu_halted.
	 */
	while (*(volatile ushort_t *)&(rm->rm_cpu_halted) != 0xdead) {
		delay(1);
	}
	rm->rm_cpu_halted = 0xffff;

	/* CPU_READY has been cleared by mach_cpu_stop. */
	ASSERT((cp->cpu_flags & CPU_READY) == 0);
	ASSERT((cp->cpu_flags & CPU_RUNNING) == 0);
	cp->cpu_flags = CPU_OFFLINE | CPU_QUIESCED | CPU_POWEROFF;
	CPUSET_ATOMIC_DEL(mp_cpus, cp->cpu_id);

	mach_cpucontext_xfree(cp, ctx, 0, MACH_CPUCONTEXT_OP_STOP);
	mach_cpucontext_fini();

	return (0);

out_enable_cmi:
	{
		cmi_hdl_t hdl;

		if ((hdl = cmi_init(CMI_HDL_NATIVE, cmi_ntv_hwchipid(cp),
		    cmi_ntv_hwcoreid(cp), cmi_ntv_hwstrandid(cp))) != NULL) {
			if (x86_feature & X86_MCA)
				cmi_mca_init(hdl);
			cp->cpu_m.mcpu_cmi_hdl = hdl;
		}
	}
	cpu_event_init_cpu(cp);
	cpupm_start(cp);
	mach_cpucontext_xfree(cp, ctx, rv, MACH_CPUCONTEXT_OP_STOP);

out_context_fini:
	mach_cpucontext_fini();

out_online:
	(void) e_ddi_branch_configure(dip, NULL, 0);

	if (rv != EAGAIN && rv != ETIME) {
		rv = ENXIO;
	}

	return (rv);
}

/*
 * Return vcpu state; since this could be a virtual environment that we
 * are unaware of, return "unknown".
 */
/* ARGSUSED */
int
vcpu_on_pcpu(processorid_t cpu)
{
	return (VCPU_STATE_UNKNOWN);
}