/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
 */
/*
 * Copyright (c) 2010, Intel Corporation.
 * All rights reserved.
 */
/*
 * Copyright 2011 Joyent, Inc. All rights reserved.
 */

/*
 * Welcome to the world of the "real mode platter".
 * See also startup.c, mpcore.s and apic.c for related routines.
 */

#include <sys/types.h>
#include <sys/systm.h>
#include <sys/cpuvar.h>
#include <sys/cpu_module.h>
#include <sys/kmem.h>
#include <sys/archsystm.h>
#include <sys/machsystm.h>
#include <sys/controlregs.h>
#include <sys/x86_archext.h>
#include <sys/smp_impldefs.h>
#include <sys/sysmacros.h>
#include <sys/mach_mmu.h>
#include <sys/promif.h>
#include <sys/cpu.h>
#include <sys/cpu_event.h>
#include <sys/sunndi.h>
#include <sys/fs/dv_node.h>
#include <vm/hat_i86.h>
#include <vm/as.h>

extern cpuset_t cpu_ready_set;

extern int mp_start_cpu_common(cpu_t *cp, boolean_t boot);
extern void real_mode_start_cpu(void);
extern void real_mode_start_cpu_end(void);
extern void real_mode_stop_cpu_stage1(void);
extern void real_mode_stop_cpu_stage1_end(void);
extern void real_mode_stop_cpu_stage2(void);
extern void real_mode_stop_cpu_stage2_end(void);
extern void *(*cpu_pause_func)(void *);

void rmp_gdt_init(rm_platter_t *);

/*
 * Fill up the real mode platter to make it easy for real mode code to
 * kick it off. This area should really be one passed by boot to kernel
 * and guaranteed to be below 1MB and aligned to 16 bytes. Should also
 * have identical physical and virtual address in paged mode.
 */
static ushort_t *warm_reset_vector = NULL;

int
mach_cpucontext_init(void)
{
	ushort_t *vec;
	ulong_t addr;
	struct rm_platter *rm = (struct rm_platter *)rm_platter_va;

	if (!(vec = (ushort_t *)psm_map_phys(WARM_RESET_VECTOR,
	    sizeof (vec), PROT_READ | PROT_WRITE)))
		return (-1);

	/*
	 * setup secondary cpu bios boot up vector
	 * Write page offset to 0x467 and page frame number to 0x469.
	 */
	addr = (ulong_t)((caddr_t)rm->rm_code - (caddr_t)rm) + rm_platter_pa;
	vec[0] = (ushort_t)(addr & PAGEOFFSET);
	vec[1] = (ushort_t)((addr & (0xfffff & PAGEMASK)) >> 4);
	warm_reset_vector = vec;

	/* Map real mode platter into kas so kernel can access it. */
	hat_devload(kas.a_hat,
	    (caddr_t)(uintptr_t)rm_platter_pa, MMU_PAGESIZE,
	    btop(rm_platter_pa), PROT_READ | PROT_WRITE | PROT_EXEC,
	    HAT_LOAD_NOCONSIST);

	/* Copy CPU startup code to rm_platter if it's still during boot. */
	if (!plat_dr_enabled()) {
		ASSERT((size_t)real_mode_start_cpu_end -
		    (size_t)real_mode_start_cpu <= RM_PLATTER_CODE_SIZE);
		bcopy((caddr_t)real_mode_start_cpu, (caddr_t)rm->rm_code,
		    (size_t)real_mode_start_cpu_end -
		    (size_t)real_mode_start_cpu);
	}

	return (0);
}

void
mach_cpucontext_fini(void)
{
	if (warm_reset_vector)
		psm_unmap_phys((caddr_t)warm_reset_vector,
		    sizeof (warm_reset_vector));
	hat_unload(kas.a_hat, (caddr_t)(uintptr_t)rm_platter_pa, MMU_PAGESIZE,
	    HAT_UNLOAD);
}

#if defined(__amd64)
extern void *long_mode_64(void);
#endif	/* __amd64 */

/*ARGSUSED*/
void
rmp_gdt_init(rm_platter_t *rm)
{

#if defined(__amd64)
	/* Use the kas address space for the CPU startup thread. */
	if (MAKECR3(kas.a_hat->hat_htable->ht_pfn) > 0xffffffffUL)
		panic("Cannot initialize CPUs; kernel's 64-bit page tables\n"
		    "located above 4G in physical memory (@ 0x%lx)",
		    MAKECR3(kas.a_hat->hat_htable->ht_pfn));

	/*
	 * Setup pseudo-descriptors for temporary GDT and IDT for use ONLY
	 * by code in real_mode_start_cpu():
	 *
	 * GDT[0]:  NULL selector
	 * GDT[1]:  64-bit CS: Long = 1, Present = 1, bits 12, 11 = 1
	 *
	 * Clear the IDT as interrupts will be off and a limit of 0 will cause
	 * the CPU to triple fault and reset on an NMI, seemingly as reasonable
	 * a course of action as any other, though it may cause the entire
	 * platform to reset in some cases...
	 */
	rm->rm_temp_gdt[0] = 0;
	rm->rm_temp_gdt[TEMPGDT_KCODE64] = 0x20980000000000ULL;

	rm->rm_temp_gdt_lim = (ushort_t)(sizeof (rm->rm_temp_gdt) - 1);
	rm->rm_temp_gdt_base = rm_platter_pa +
	    (uint32_t)offsetof(rm_platter_t, rm_temp_gdt);
	rm->rm_temp_idt_lim = 0;
	rm->rm_temp_idt_base = 0;

	/*
	 * Since the CPU needs to jump to protected mode using an identity
	 * mapped address, we need to calculate it here.
	 */
	rm->rm_longmode64_addr = rm_platter_pa +
	    (uint32_t)((uintptr_t)long_mode_64 -
	    (uintptr_t)real_mode_start_cpu);
#endif	/* __amd64 */
}

static void *
mach_cpucontext_alloc_tables(struct cpu *cp)
{
	tss_t *ntss;
	struct cpu_tables *ct;

	/*
	 * Allocate space for stack, tss, gdt and idt. We round the size
	 * allotted for cpu_tables up, so that the TSS is on a unique page.
	 * This is more efficient when running in virtual machines.
	 */
	ct = kmem_zalloc(P2ROUNDUP(sizeof (*ct), PAGESIZE), KM_SLEEP);
	if ((uintptr_t)ct & PAGEOFFSET)
		panic("mach_cpucontext_alloc_tables: cpu%d misaligned tables",
		    cp->cpu_id);

	ntss = cp->cpu_tss = &ct->ct_tss;

#if defined(__amd64)

	/*
	 * #DF (double fault).
	 */
	ntss->tss_ist1 = (uint64_t)&ct->ct_stack[sizeof (ct->ct_stack)];

#elif defined(__i386)

	ntss->tss_esp0 = ntss->tss_esp1 = ntss->tss_esp2 = ntss->tss_esp =
	    (uint32_t)&ct->ct_stack[sizeof (ct->ct_stack)];

	ntss->tss_ss0 = ntss->tss_ss1 = ntss->tss_ss2 = ntss->tss_ss = KDS_SEL;

	ntss->tss_eip = (uint32_t)cp->cpu_thread->t_pc;

	ntss->tss_cs = KCS_SEL;
	ntss->tss_ds = ntss->tss_es = KDS_SEL;
	ntss->tss_fs = KFS_SEL;
	ntss->tss_gs = KGS_SEL;

#endif	/* __i386 */

	/*
	 * Set I/O bit map offset equal to size of TSS segment limit
	 * for no I/O permission map. This will cause all user I/O
	 * instructions to generate #gp fault.
	 */
	ntss->tss_bitmapbase = sizeof (*ntss);

	/*
	 * Setup kernel tss.
	 */
	set_syssegd((system_desc_t *)&cp->cpu_gdt[GDT_KTSS], cp->cpu_tss,
	    sizeof (*cp->cpu_tss) - 1, SDT_SYSTSS, SEL_KPL);

	return (ct);
}

void *
mach_cpucontext_xalloc(struct cpu *cp, int optype)
{
	size_t len;
	struct cpu_tables *ct;
	rm_platter_t *rm = (rm_platter_t *)rm_platter_va;
	static int cpu_halt_code_ready;

	if (optype == MACH_CPUCONTEXT_OP_STOP) {
		ASSERT(plat_dr_enabled());

		/*
		 * The WARM_RESET_VECTOR has a limitation that the physical
		 * address written to it must be page-aligned. To work around
		 * this limitation, the CPU stop code has been split into
		 * two stages.
		 * The stage 2 code, which implements the real logic to halt
		 * CPUs, is copied to the rm_cpu_halt_code field in the real
		 * mode platter. The stage 1 code, which simply jumps to the
		 * stage 2 code in the rm_cpu_halt_code field, is copied to
		 * the rm_code field in the real mode platter and may be
		 * overwritten after the CPU has been stopped.
		 */
		if (!cpu_halt_code_ready) {
			/*
			 * The rm_cpu_halt_code field in the real mode platter
			 * is used by the CPU stop code only. So only copy the
			 * CPU stop stage 2 code into the rm_cpu_halt_code
			 * field on the first call.
			 */
			len = (size_t)real_mode_stop_cpu_stage2_end -
			    (size_t)real_mode_stop_cpu_stage2;
			ASSERT(len <= RM_PLATTER_CPU_HALT_CODE_SIZE);
			bcopy((caddr_t)real_mode_stop_cpu_stage2,
			    (caddr_t)rm->rm_cpu_halt_code, len);
			cpu_halt_code_ready = 1;
		}

		/*
		 * The rm_code field in the real mode platter is shared by
		 * the CPU start, CPU stop, CPR and fast reboot code. So copy
		 * the CPU stop stage 1 code into the rm_code field every time.
		 */
		len = (size_t)real_mode_stop_cpu_stage1_end -
		    (size_t)real_mode_stop_cpu_stage1;
		ASSERT(len <= RM_PLATTER_CODE_SIZE);
		bcopy((caddr_t)real_mode_stop_cpu_stage1,
		    (caddr_t)rm->rm_code, len);
		rm->rm_cpu_halted = 0;

		return (cp->cpu_m.mcpu_mach_ctx_ptr);
	} else if (optype != MACH_CPUCONTEXT_OP_START) {
		return (NULL);
	}

	/*
	 * Only need to allocate tables when starting CPU.
	 * Tables allocated when starting CPU will be reused when stopping CPU.
	 */
	ct = mach_cpucontext_alloc_tables(cp);
	if (ct == NULL) {
		return (NULL);
	}

	/* Copy CPU startup code to rm_platter for CPU hot-add operations. */
	if (plat_dr_enabled()) {
		bcopy((caddr_t)real_mode_start_cpu, (caddr_t)rm->rm_code,
		    (size_t)real_mode_start_cpu_end -
		    (size_t)real_mode_start_cpu);
	}

	/*
	 * Now copy all that we've set up onto the real mode platter
	 * for the real mode code to digest as part of starting the cpu.
	 */
	rm->rm_idt_base = cp->cpu_idt;
	rm->rm_idt_lim = sizeof (*cp->cpu_idt) * NIDT - 1;
	rm->rm_gdt_base = cp->cpu_gdt;
	rm->rm_gdt_lim = sizeof (*cp->cpu_gdt) * NGDT - 1;

	/*
	 * The CPU needs to access the kernel address space after powering on.
	 * When hot-adding a CPU at runtime, directly use the top level page
	 * table of kas rather than the return value of getcr3(). getcr3()
	 * returns the current process's top level page table, which may be
	 * different from the one of kas.
	 */
	rm->rm_pdbr = MAKECR3(kas.a_hat->hat_htable->ht_pfn);
	rm->rm_cpu = cp->cpu_id;

	/*
	 * For hot-adding a CPU at runtime, Machine Check and Performance
	 * Counter should be disabled.
	 * They will be enabled on demand after the CPU powers on
	 * successfully.
	 */
	rm->rm_cr4 = getcr4();
	rm->rm_cr4 &= ~(CR4_MCE | CR4_PCE);

	rmp_gdt_init(rm);

	return (ct);
}

void
mach_cpucontext_xfree(struct cpu *cp, void *arg, int err, int optype)
{
	struct cpu_tables *ct = arg;

	ASSERT(&ct->ct_tss == cp->cpu_tss);
	if (optype == MACH_CPUCONTEXT_OP_START) {
		switch (err) {
		case 0:
			/*
			 * Save pointer for reuse when stopping CPU.
			 */
			cp->cpu_m.mcpu_mach_ctx_ptr = arg;
			break;
		case ETIMEDOUT:
			/*
			 * The processor was poked, but failed to start before
			 * we gave up waiting for it. In case it starts later,
			 * don't free anything.
			 */
			cp->cpu_m.mcpu_mach_ctx_ptr = arg;
			break;
		default:
			/*
			 * Some other, passive, error occurred.
			 */
			kmem_free(ct, P2ROUNDUP(sizeof (*ct), PAGESIZE));
			cp->cpu_tss = NULL;
			break;
		}
	} else if (optype == MACH_CPUCONTEXT_OP_STOP) {
		switch (err) {
		case 0:
			/*
			 * Free resources allocated when starting CPU.
			 */
			kmem_free(ct, P2ROUNDUP(sizeof (*ct), PAGESIZE));
			cp->cpu_tss = NULL;
			cp->cpu_m.mcpu_mach_ctx_ptr = NULL;
			break;
		default:
			/*
			 * Don't touch table pointer in case of failure.
			 */
			break;
		}
	} else {
		ASSERT(0);
	}
}

void *
mach_cpucontext_alloc(struct cpu *cp)
{
	return (mach_cpucontext_xalloc(cp, MACH_CPUCONTEXT_OP_START));
}

void
mach_cpucontext_free(struct cpu *cp, void *arg, int err)
{
	mach_cpucontext_xfree(cp, arg, err, MACH_CPUCONTEXT_OP_START);
}

/*
 * "Enter monitor."  Called via cross-call from stop_other_cpus().
 */
void
mach_cpu_halt(char *msg)
{
	if (msg)
		prom_printf("%s\n", msg);

	/*CONSTANTCONDITION*/
	while (1)
		;
}

void
mach_cpu_idle(void)
{
	i86_halt();
}

void
mach_cpu_pause(volatile char *safe)
{
	/*
	 * This cpu is now safe.
	 */
	*safe = PAUSE_WAIT;
	membar_enter(); /* make sure stores are flushed */

	/*
	 * Now we wait. When we are allowed to continue, safe
	 * will be set to PAUSE_IDLE.
	 */
	while (*safe != PAUSE_IDLE)
		SMT_PAUSE();
}

/*
 * Power on the target CPU.
 */
int
mp_cpu_poweron(struct cpu *cp)
{
	int error;
	cpuset_t tempset;
	processorid_t cpuid;

	ASSERT(cp != NULL);
	cpuid = cp->cpu_id;
	if (use_mp == 0 || plat_dr_support_cpu() == 0) {
		return (ENOTSUP);
	} else if (cpuid < 0 || cpuid >= max_ncpus) {
		return (EINVAL);
	}

	/*
	 * The current x86 implementation of mp_cpu_configure() and
	 * mp_cpu_poweron() has a limitation that mp_cpu_poweron() can only
	 * be called once after calling mp_cpu_configure() for a specific CPU.
	 * This is because mp_cpu_poweron() destroys the data structures
	 * created by mp_cpu_configure(). So reject the request if the CPU has
	 * already been powered on once after calling mp_cpu_configure().
	 * This limitation only affects the p_online syscall; the DR driver
	 * won't be affected because the DR driver always invokes the public
	 * CPU management interfaces in the predefined order:
	 * cpu_configure()->cpu_poweron()...->cpu_poweroff()->cpu_unconfigure()
	 */
	if (cpuid_checkpass(cp, 4) || cp->cpu_thread == cp->cpu_idle_thread) {
		return (ENOTSUP);
	}

	/*
	 * Check if there's at least a Mbyte of kmem available
	 * before attempting to start the cpu.
	 */
	if (kmem_avail() < 1024 * 1024) {
		/*
		 * Kick off a reap in case that helps us with
		 * later attempts ..
		 */
		kmem_reap();
		return (ENOMEM);
	}

	affinity_set(CPU->cpu_id);

	/*
	 * Start the target CPU. No need to call mach_cpucontext_fini()
	 * if mach_cpucontext_init() fails.
	 */
	if ((error = mach_cpucontext_init()) == 0) {
		error = mp_start_cpu_common(cp, B_FALSE);
		mach_cpucontext_fini();
	}
	if (error != 0) {
		affinity_clear();
		return (error);
	}

	/* Wait for the target cpu to reach READY state. */
	tempset = cpu_ready_set;
	while (!CPU_IN_SET(tempset, cpuid)) {
		delay(1);
		tempset = *((volatile cpuset_t *)&cpu_ready_set);
	}

	/* Mark the target CPU as available for mp operation. */
	CPUSET_ATOMIC_ADD(mp_cpus, cpuid);

	/* Free the space allocated to hold the microcode file */
	ucode_cleanup();

	affinity_clear();

	return (0);
}

#define	MP_CPU_DETACH_MAX_TRIES		5
#define	MP_CPU_DETACH_DELAY		100

static int
mp_cpu_detach_driver(dev_info_t *dip)
{
	int i;
	int rv = EBUSY;
	dev_info_t *pdip;

	pdip = ddi_get_parent(dip);
	ASSERT(pdip != NULL);
	/*
	 * Check if caller holds pdip busy - can cause deadlocks in
	 * e_ddi_branch_unconfigure(), which calls devfs_clean().
	 */
	if (DEVI_BUSY_OWNED(pdip)) {
		return (EDEADLOCK);
	}

	for (i = 0; i < MP_CPU_DETACH_MAX_TRIES; i++) {
		if (e_ddi_branch_unconfigure(dip, NULL, 0) == 0) {
			rv = 0;
			break;
		}
		DELAY(MP_CPU_DETACH_DELAY);
	}

	return (rv);
}

/*
 * Power off the target CPU.
 * Note: cpu_lock will be released and then reacquired.
 */
int
mp_cpu_poweroff(struct cpu *cp)
{
	int rv = 0;
	void *ctx;
	dev_info_t *dip = NULL;
	rm_platter_t *rm = (rm_platter_t *)rm_platter_va;
	extern void cpupm_start(cpu_t *);
	extern void cpupm_stop(cpu_t *);

	ASSERT(cp != NULL);
	ASSERT((cp->cpu_flags & CPU_OFFLINE) != 0);
	ASSERT((cp->cpu_flags & CPU_QUIESCED) != 0);

	if (use_mp == 0 || plat_dr_support_cpu() == 0) {
		return (ENOTSUP);
	}
	/*
	 * There is no support for powering off cpu0 yet.
	 * There are many pieces of code which have a hard dependency on cpu0.
	 */
	if (cp->cpu_id == 0) {
		return (ENOTSUP);
	}

	if (mach_cpu_get_device_node(cp, &dip) != PSM_SUCCESS) {
		return (ENXIO);
	}
	ASSERT(dip != NULL);
	if (mp_cpu_detach_driver(dip) != 0) {
		rv = EBUSY;
		goto out_online;
	}

	/* Allocate CPU context for stopping */
	if (mach_cpucontext_init() != 0) {
		rv = ENXIO;
		goto out_online;
	}
	ctx = mach_cpucontext_xalloc(cp, MACH_CPUCONTEXT_OP_STOP);
	if (ctx == NULL) {
		rv = ENXIO;
		goto out_context_fini;
	}

	cpupm_stop(cp);
	cpu_event_fini_cpu(cp);

	if (cp->cpu_m.mcpu_cmi_hdl != NULL) {
		cmi_fini(cp->cpu_m.mcpu_cmi_hdl);
		cp->cpu_m.mcpu_cmi_hdl = NULL;
	}

	rv = mach_cpu_stop(cp, ctx);
	if (rv != 0) {
		goto out_enable_cmi;
	}

	/* Wait until the target CPU has been halted. */
	while (*(volatile ushort_t *)&(rm->rm_cpu_halted) != 0xdead) {
		delay(1);
	}
	rm->rm_cpu_halted = 0xffff;

	/* CPU_READY has been cleared by mach_cpu_stop. */
	ASSERT((cp->cpu_flags & CPU_READY) == 0);
	ASSERT((cp->cpu_flags & CPU_RUNNING) == 0);
	cp->cpu_flags = CPU_OFFLINE | CPU_QUIESCED | CPU_POWEROFF;
	CPUSET_ATOMIC_DEL(mp_cpus, cp->cpu_id);

	mach_cpucontext_xfree(cp, ctx, 0, MACH_CPUCONTEXT_OP_STOP);
	mach_cpucontext_fini();

	return (0);

out_enable_cmi:
	{
		cmi_hdl_t hdl;

		if ((hdl = cmi_init(CMI_HDL_NATIVE, cmi_ntv_hwchipid(cp),
		    cmi_ntv_hwcoreid(cp), cmi_ntv_hwstrandid(cp))) != NULL) {
			if (is_x86_feature(x86_featureset, X86FSET_MCA))
				cmi_mca_init(hdl);
			cp->cpu_m.mcpu_cmi_hdl = hdl;
		}
	}
	cpu_event_init_cpu(cp);
	cpupm_start(cp);
	mach_cpucontext_xfree(cp, ctx, rv, MACH_CPUCONTEXT_OP_STOP);

out_context_fini:
	mach_cpucontext_fini();

out_online:
	(void) e_ddi_branch_configure(dip, NULL, 0);

	if (rv != EAGAIN && rv != ETIME) {
		rv = ENXIO;
	}

	return (rv);
}

/*
 * Return vcpu state. Since this could be a virtual environment that we
 * are unaware of, return "unknown".
 */
/* ARGSUSED */
int
vcpu_on_pcpu(processorid_t cpu)
{
	return (VCPU_STATE_UNKNOWN);
}