1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 22 /* 23 * Copyright (c) 1992, 2010, Oracle and/or its affiliates. All rights reserved. 24 */ 25 /* 26 * Copyright (c) 2010, Intel Corporation. 27 * All rights reserved. 
28 */ 29 30 #include <sys/types.h> 31 #include <sys/thread.h> 32 #include <sys/cpuvar.h> 33 #include <sys/cpu.h> 34 #include <sys/t_lock.h> 35 #include <sys/param.h> 36 #include <sys/proc.h> 37 #include <sys/disp.h> 38 #include <sys/class.h> 39 #include <sys/cmn_err.h> 40 #include <sys/debug.h> 41 #include <sys/note.h> 42 #include <sys/asm_linkage.h> 43 #include <sys/x_call.h> 44 #include <sys/systm.h> 45 #include <sys/var.h> 46 #include <sys/vtrace.h> 47 #include <vm/hat.h> 48 #include <vm/as.h> 49 #include <vm/seg_kmem.h> 50 #include <vm/seg_kp.h> 51 #include <sys/segments.h> 52 #include <sys/kmem.h> 53 #include <sys/stack.h> 54 #include <sys/smp_impldefs.h> 55 #include <sys/x86_archext.h> 56 #include <sys/machsystm.h> 57 #include <sys/traptrace.h> 58 #include <sys/clock.h> 59 #include <sys/cpc_impl.h> 60 #include <sys/pg.h> 61 #include <sys/cmt.h> 62 #include <sys/dtrace.h> 63 #include <sys/archsystm.h> 64 #include <sys/fp.h> 65 #include <sys/reboot.h> 66 #include <sys/kdi_machimpl.h> 67 #include <vm/hat_i86.h> 68 #include <vm/vm_dep.h> 69 #include <sys/memnode.h> 70 #include <sys/pci_cfgspace.h> 71 #include <sys/mach_mmu.h> 72 #include <sys/sysmacros.h> 73 #if defined(__xpv) 74 #include <sys/hypervisor.h> 75 #endif 76 #include <sys/cpu_module.h> 77 78 struct cpu cpus[1]; /* CPU data */ 79 struct cpu *cpu[NCPU] = {&cpus[0]}; /* pointers to all CPUs */ 80 struct cpu *cpu_free_list; /* list for released CPUs */ 81 cpu_core_t cpu_core[NCPU]; /* cpu_core structures */ 82 83 #define cpu_next_free cpu_prev 84 85 /* 86 * Useful for disabling MP bring-up on a MP capable system. 87 */ 88 int use_mp = 1; 89 90 /* 91 * to be set by a PSM to indicate what cpus 92 * are sitting around on the system. 93 */ 94 cpuset_t mp_cpus; 95 96 /* 97 * This variable is used by the hat layer to decide whether or not 98 * critical sections are needed to prevent race conditions. 
For sun4m, 99 * this variable is set once enough MP initialization has been done in 100 * order to allow cross calls. 101 */ 102 int flushes_require_xcalls; 103 104 cpuset_t cpu_ready_set; /* initialized in startup() */ 105 106 static void mp_startup_boot(void); 107 static void mp_startup_hotplug(void); 108 109 static void cpu_sep_enable(void); 110 static void cpu_sep_disable(void); 111 static void cpu_asysc_enable(void); 112 static void cpu_asysc_disable(void); 113 114 /* 115 * Init CPU info - get CPU type info for processor_info system call. 116 */ 117 void 118 init_cpu_info(struct cpu *cp) 119 { 120 processor_info_t *pi = &cp->cpu_type_info; 121 122 /* 123 * Get clock-frequency property for the CPU. 124 */ 125 pi->pi_clock = cpu_freq; 126 127 /* 128 * Current frequency in Hz. 129 */ 130 cp->cpu_curr_clock = cpu_freq_hz; 131 132 /* 133 * Supported frequencies. 134 */ 135 if (cp->cpu_supp_freqs == NULL) { 136 cpu_set_supp_freqs(cp, NULL); 137 } 138 139 (void) strcpy(pi->pi_processor_type, "i386"); 140 if (fpu_exists) 141 (void) strcpy(pi->pi_fputypes, "i387 compatible"); 142 143 cp->cpu_idstr = kmem_zalloc(CPU_IDSTRLEN, KM_SLEEP); 144 cp->cpu_brandstr = kmem_zalloc(CPU_IDSTRLEN, KM_SLEEP); 145 146 /* 147 * If called for the BSP, cp is equal to current CPU. 148 * For non-BSPs, cpuid info of cp is not ready yet, so use cpuid info 149 * of current CPU as default values for cpu_idstr and cpu_brandstr. 150 * They will be corrected in mp_startup_common() after cpuid_pass1() 151 * has been invoked on target CPU. 152 */ 153 (void) cpuid_getidstr(CPU, cp->cpu_idstr, CPU_IDSTRLEN); 154 (void) cpuid_getbrandstr(CPU, cp->cpu_brandstr, CPU_IDSTRLEN); 155 } 156 157 /* 158 * Configure syscall support on this CPU. 
159 */ 160 /*ARGSUSED*/ 161 void 162 init_cpu_syscall(struct cpu *cp) 163 { 164 kpreempt_disable(); 165 166 #if defined(__amd64) 167 if (is_x86_feature(x86_featureset, X86FSET_MSR) && 168 is_x86_feature(x86_featureset, X86FSET_ASYSC)) { 169 170 #if !defined(__lint) 171 /* 172 * The syscall instruction imposes a certain ordering on 173 * segment selectors, so we double-check that ordering 174 * here. 175 */ 176 ASSERT(KDS_SEL == KCS_SEL + 8); 177 ASSERT(UDS_SEL == U32CS_SEL + 8); 178 ASSERT(UCS_SEL == U32CS_SEL + 16); 179 #endif 180 /* 181 * Turn syscall/sysret extensions on. 182 */ 183 cpu_asysc_enable(); 184 185 /* 186 * Program the magic registers .. 187 */ 188 wrmsr(MSR_AMD_STAR, 189 ((uint64_t)(U32CS_SEL << 16 | KCS_SEL)) << 32); 190 wrmsr(MSR_AMD_LSTAR, (uint64_t)(uintptr_t)sys_syscall); 191 wrmsr(MSR_AMD_CSTAR, (uint64_t)(uintptr_t)sys_syscall32); 192 193 /* 194 * This list of flags is masked off the incoming 195 * %rfl when we enter the kernel. 196 */ 197 wrmsr(MSR_AMD_SFMASK, (uint64_t)(uintptr_t)(PS_IE | PS_T)); 198 } 199 #endif 200 201 /* 202 * On 32-bit kernels, we use sysenter/sysexit because it's too 203 * hard to use syscall/sysret, and it is more portable anyway. 204 * 205 * On 64-bit kernels on Nocona machines, the 32-bit syscall 206 * variant isn't available to 32-bit applications, but sysenter is. 207 */ 208 if (is_x86_feature(x86_featureset, X86FSET_MSR) && 209 is_x86_feature(x86_featureset, X86FSET_SEP)) { 210 211 #if !defined(__lint) 212 /* 213 * The sysenter instruction imposes a certain ordering on 214 * segment selectors, so we double-check that ordering 215 * here. 
See "sysenter" in Intel document 245471-012, "IA-32 216 * Intel Architecture Software Developer's Manual Volume 2: 217 * Instruction Set Reference" 218 */ 219 ASSERT(KDS_SEL == KCS_SEL + 8); 220 221 ASSERT32(UCS_SEL == ((KCS_SEL + 16) | 3)); 222 ASSERT32(UDS_SEL == UCS_SEL + 8); 223 224 ASSERT64(U32CS_SEL == ((KCS_SEL + 16) | 3)); 225 ASSERT64(UDS_SEL == U32CS_SEL + 8); 226 #endif 227 228 cpu_sep_enable(); 229 230 /* 231 * resume() sets this value to the base of the threads stack 232 * via a context handler. 233 */ 234 wrmsr(MSR_INTC_SEP_ESP, 0); 235 wrmsr(MSR_INTC_SEP_EIP, (uint64_t)(uintptr_t)sys_sysenter); 236 } 237 238 kpreempt_enable(); 239 } 240 241 /* 242 * Multiprocessor initialization. 243 * 244 * Allocate and initialize the cpu structure, TRAPTRACE buffer, and the 245 * startup and idle threads for the specified CPU. 246 * Parameter boot is true for boot time operations and is false for CPU 247 * DR operations. 248 */ 249 static struct cpu * 250 mp_cpu_configure_common(int cpun, boolean_t boot) 251 { 252 struct cpu *cp; 253 kthread_id_t tp; 254 caddr_t sp; 255 proc_t *procp; 256 #if !defined(__xpv) 257 extern int idle_cpu_prefer_mwait; 258 extern void cpu_idle_mwait(); 259 #endif 260 extern void idle(); 261 extern void cpu_idle(); 262 263 #ifdef TRAPTRACE 264 trap_trace_ctl_t *ttc = &trap_trace_ctl[cpun]; 265 #endif 266 267 ASSERT(MUTEX_HELD(&cpu_lock)); 268 ASSERT(cpun < NCPU && cpu[cpun] == NULL); 269 270 if (cpu_free_list == NULL) { 271 cp = kmem_zalloc(sizeof (*cp), KM_SLEEP); 272 } else { 273 cp = cpu_free_list; 274 cpu_free_list = cp->cpu_next_free; 275 } 276 277 cp->cpu_m.mcpu_istamp = cpun << 16; 278 279 /* Create per CPU specific threads in the process p0. */ 280 procp = &p0; 281 282 /* 283 * Initialize the dispatcher first. 284 */ 285 disp_cpu_init(cp); 286 287 cpu_vm_data_init(cp); 288 289 /* 290 * Allocate and initialize the startup thread for this CPU. 
291 * Interrupt and process switch stacks get allocated later 292 * when the CPU starts running. 293 */ 294 tp = thread_create(NULL, 0, NULL, NULL, 0, procp, 295 TS_STOPPED, maxclsyspri); 296 297 /* 298 * Set state to TS_ONPROC since this thread will start running 299 * as soon as the CPU comes online. 300 * 301 * All the other fields of the thread structure are setup by 302 * thread_create(). 303 */ 304 THREAD_ONPROC(tp, cp); 305 tp->t_preempt = 1; 306 tp->t_bound_cpu = cp; 307 tp->t_affinitycnt = 1; 308 tp->t_cpu = cp; 309 tp->t_disp_queue = cp->cpu_disp; 310 311 /* 312 * Setup thread to start in mp_startup_common. 313 */ 314 sp = tp->t_stk; 315 tp->t_sp = (uintptr_t)(sp - MINFRAME); 316 #if defined(__amd64) 317 tp->t_sp -= STACK_ENTRY_ALIGN; /* fake a call */ 318 #endif 319 /* 320 * Setup thread start entry point for boot or hotplug. 321 */ 322 if (boot) { 323 tp->t_pc = (uintptr_t)mp_startup_boot; 324 } else { 325 tp->t_pc = (uintptr_t)mp_startup_hotplug; 326 } 327 328 cp->cpu_id = cpun; 329 cp->cpu_self = cp; 330 cp->cpu_thread = tp; 331 cp->cpu_lwp = NULL; 332 cp->cpu_dispthread = tp; 333 cp->cpu_dispatch_pri = DISP_PRIO(tp); 334 335 /* 336 * cpu_base_spl must be set explicitly here to prevent any blocking 337 * operations in mp_startup_common from causing the spl of the cpu 338 * to drop to 0 (allowing device interrupts before we're ready) in 339 * resume(). 340 * cpu_base_spl MUST remain at LOCK_LEVEL until the cpu is CPU_READY. 341 * As an extra bit of security on DEBUG kernels, this is enforced with 342 * an assertion in mp_startup_common() -- before cpu_base_spl is set 343 * to its proper value. 344 */ 345 cp->cpu_base_spl = ipltospl(LOCK_LEVEL); 346 347 /* 348 * Now, initialize per-CPU idle thread for this CPU. 
349 */ 350 tp = thread_create(NULL, PAGESIZE, idle, NULL, 0, procp, TS_ONPROC, -1); 351 352 cp->cpu_idle_thread = tp; 353 354 tp->t_preempt = 1; 355 tp->t_bound_cpu = cp; 356 tp->t_affinitycnt = 1; 357 tp->t_cpu = cp; 358 tp->t_disp_queue = cp->cpu_disp; 359 360 /* 361 * Bootstrap the CPU's PG data 362 */ 363 pg_cpu_bootstrap(cp); 364 365 /* 366 * Perform CPC initialization on the new CPU. 367 */ 368 kcpc_hw_init(cp); 369 370 /* 371 * Allocate virtual addresses for cpu_caddr1 and cpu_caddr2 372 * for each CPU. 373 */ 374 setup_vaddr_for_ppcopy(cp); 375 376 /* 377 * Allocate page for new GDT and initialize from current GDT. 378 */ 379 #if !defined(__lint) 380 ASSERT((sizeof (*cp->cpu_gdt) * NGDT) <= PAGESIZE); 381 #endif 382 cp->cpu_gdt = kmem_zalloc(PAGESIZE, KM_SLEEP); 383 bcopy(CPU->cpu_gdt, cp->cpu_gdt, (sizeof (*cp->cpu_gdt) * NGDT)); 384 385 #if defined(__i386) 386 /* 387 * setup kernel %gs. 388 */ 389 set_usegd(&cp->cpu_gdt[GDT_GS], cp, sizeof (struct cpu) -1, SDT_MEMRWA, 390 SEL_KPL, 0, 1); 391 #endif 392 393 /* 394 * If we have more than one node, each cpu gets a copy of IDT 395 * local to its node. If this is a Pentium box, we use cpu 0's 396 * IDT. cpu 0's IDT has been made read-only to workaround the 397 * cmpxchgl register bug 398 */ 399 if (system_hardware.hd_nodes && x86_type != X86_TYPE_P5) { 400 #if !defined(__lint) 401 ASSERT((sizeof (*CPU->cpu_idt) * NIDT) <= PAGESIZE); 402 #endif 403 cp->cpu_idt = kmem_zalloc(PAGESIZE, KM_SLEEP); 404 bcopy(CPU->cpu_idt, cp->cpu_idt, PAGESIZE); 405 } else { 406 cp->cpu_idt = CPU->cpu_idt; 407 } 408 409 /* 410 * Get interrupt priority data from cpu 0. 
411 */ 412 cp->cpu_pri_data = CPU->cpu_pri_data; 413 414 /* 415 * alloc space for cpuid info 416 */ 417 cpuid_alloc_space(cp); 418 #if !defined(__xpv) 419 if (is_x86_feature(x86_featureset, X86FSET_MWAIT) && 420 idle_cpu_prefer_mwait) { 421 cp->cpu_m.mcpu_mwait = cpuid_mwait_alloc(cp); 422 cp->cpu_m.mcpu_idle_cpu = cpu_idle_mwait; 423 } else 424 #endif 425 cp->cpu_m.mcpu_idle_cpu = cpu_idle; 426 427 init_cpu_info(cp); 428 429 /* 430 * alloc space for ucode_info 431 */ 432 ucode_alloc_space(cp); 433 xc_init_cpu(cp); 434 hat_cpu_online(cp); 435 436 #ifdef TRAPTRACE 437 /* 438 * If this is a TRAPTRACE kernel, allocate TRAPTRACE buffers 439 */ 440 ttc->ttc_first = (uintptr_t)kmem_zalloc(trap_trace_bufsize, KM_SLEEP); 441 ttc->ttc_next = ttc->ttc_first; 442 ttc->ttc_limit = ttc->ttc_first + trap_trace_bufsize; 443 #endif 444 445 /* 446 * Record that we have another CPU. 447 */ 448 /* 449 * Initialize the interrupt threads for this CPU 450 */ 451 cpu_intr_alloc(cp, NINTR_THREADS); 452 453 cp->cpu_flags = CPU_OFFLINE | CPU_QUIESCED | CPU_POWEROFF; 454 cpu_set_state(cp); 455 456 /* 457 * Add CPU to list of available CPUs. It'll be on the active list 458 * after mp_startup_common(). 459 */ 460 cpu_add_unit(cp); 461 462 return (cp); 463 } 464 465 /* 466 * Undo what was done in mp_cpu_configure_common 467 */ 468 static void 469 mp_cpu_unconfigure_common(struct cpu *cp, int error) 470 { 471 ASSERT(MUTEX_HELD(&cpu_lock)); 472 473 /* 474 * Remove the CPU from the list of available CPUs. 475 */ 476 cpu_del_unit(cp->cpu_id); 477 478 if (error == ETIMEDOUT) { 479 /* 480 * The cpu was started, but never *seemed* to run any 481 * code in the kernel; it's probably off spinning in its 482 * own private world, though with potential references to 483 * our kmem-allocated IDTs and GDTs (for example). 484 * 485 * Worse still, it may actually wake up some time later, 486 * so rather than guess what it might or might not do, we 487 * leave the fundamental data structures intact. 
488 */ 489 cp->cpu_flags = 0; 490 return; 491 } 492 493 /* 494 * At this point, the only threads bound to this CPU should 495 * special per-cpu threads: it's idle thread, it's pause threads, 496 * and it's interrupt threads. Clean these up. 497 */ 498 cpu_destroy_bound_threads(cp); 499 cp->cpu_idle_thread = NULL; 500 501 /* 502 * Free the interrupt stack. 503 */ 504 segkp_release(segkp, 505 cp->cpu_intr_stack - (INTR_STACK_SIZE - SA(MINFRAME))); 506 cp->cpu_intr_stack = NULL; 507 508 #ifdef TRAPTRACE 509 /* 510 * Discard the trap trace buffer 511 */ 512 { 513 trap_trace_ctl_t *ttc = &trap_trace_ctl[cp->cpu_id]; 514 515 kmem_free((void *)ttc->ttc_first, trap_trace_bufsize); 516 ttc->ttc_first = NULL; 517 } 518 #endif 519 520 hat_cpu_offline(cp); 521 522 ucode_free_space(cp); 523 524 /* Free CPU ID string and brand string. */ 525 if (cp->cpu_idstr) { 526 kmem_free(cp->cpu_idstr, CPU_IDSTRLEN); 527 cp->cpu_idstr = NULL; 528 } 529 if (cp->cpu_brandstr) { 530 kmem_free(cp->cpu_brandstr, CPU_IDSTRLEN); 531 cp->cpu_brandstr = NULL; 532 } 533 534 #if !defined(__xpv) 535 if (cp->cpu_m.mcpu_mwait != NULL) { 536 cpuid_mwait_free(cp); 537 cp->cpu_m.mcpu_mwait = NULL; 538 } 539 #endif 540 cpuid_free_space(cp); 541 542 if (cp->cpu_idt != CPU->cpu_idt) 543 kmem_free(cp->cpu_idt, PAGESIZE); 544 cp->cpu_idt = NULL; 545 546 kmem_free(cp->cpu_gdt, PAGESIZE); 547 cp->cpu_gdt = NULL; 548 549 if (cp->cpu_supp_freqs != NULL) { 550 size_t len = strlen(cp->cpu_supp_freqs) + 1; 551 kmem_free(cp->cpu_supp_freqs, len); 552 cp->cpu_supp_freqs = NULL; 553 } 554 555 teardown_vaddr_for_ppcopy(cp); 556 557 kcpc_hw_fini(cp); 558 559 cp->cpu_dispthread = NULL; 560 cp->cpu_thread = NULL; /* discarded by cpu_destroy_bound_threads() */ 561 562 cpu_vm_data_destroy(cp); 563 564 xc_fini_cpu(cp); 565 disp_cpu_fini(cp); 566 567 ASSERT(cp != CPU0); 568 bzero(cp, sizeof (*cp)); 569 cp->cpu_next_free = cpu_free_list; 570 cpu_free_list = cp; 571 } 572 573 /* 574 * Apply workarounds for known errata, and warn 
about those that are absent.
 *
 * System vendors occasionally create configurations which contain different
 * revisions of the CPUs that are almost but not exactly the same.  At the
 * time of writing, this meant that their clock rates were the same, their
 * feature sets were the same, but the required workarounds were -not-
 * necessarily the same.  So, this routine is invoked on -every- CPU soon
 * after starting to make sure that the resulting system contains the most
 * pessimal set of workarounds needed to cope with *any* of the CPUs in the
 * system.
 *
 * workaround_errata is invoked early in mlsetup() for CPU 0, and in
 * mp_startup_common() for all slave CPUs. Slaves process workaround_errata
 * prior to acknowledging their readiness to the master, so this routine will
 * never be executed by multiple CPUs in parallel, thus making updates to
 * global data safe.
 *
 * These workarounds are based on Rev 3.57 of the Revision Guide for
 * AMD Athlon(tm) 64 and AMD Opteron(tm) Processors, August 2005.
593 */ 594 595 #if defined(OPTERON_ERRATUM_88) 596 int opteron_erratum_88; /* if non-zero -> at least one cpu has it */ 597 #endif 598 599 #if defined(OPTERON_ERRATUM_91) 600 int opteron_erratum_91; /* if non-zero -> at least one cpu has it */ 601 #endif 602 603 #if defined(OPTERON_ERRATUM_93) 604 int opteron_erratum_93; /* if non-zero -> at least one cpu has it */ 605 #endif 606 607 #if defined(OPTERON_ERRATUM_95) 608 int opteron_erratum_95; /* if non-zero -> at least one cpu has it */ 609 #endif 610 611 #if defined(OPTERON_ERRATUM_100) 612 int opteron_erratum_100; /* if non-zero -> at least one cpu has it */ 613 #endif 614 615 #if defined(OPTERON_ERRATUM_108) 616 int opteron_erratum_108; /* if non-zero -> at least one cpu has it */ 617 #endif 618 619 #if defined(OPTERON_ERRATUM_109) 620 int opteron_erratum_109; /* if non-zero -> at least one cpu has it */ 621 #endif 622 623 #if defined(OPTERON_ERRATUM_121) 624 int opteron_erratum_121; /* if non-zero -> at least one cpu has it */ 625 #endif 626 627 #if defined(OPTERON_ERRATUM_122) 628 int opteron_erratum_122; /* if non-zero -> at least one cpu has it */ 629 #endif 630 631 #if defined(OPTERON_ERRATUM_123) 632 int opteron_erratum_123; /* if non-zero -> at least one cpu has it */ 633 #endif 634 635 #if defined(OPTERON_ERRATUM_131) 636 int opteron_erratum_131; /* if non-zero -> at least one cpu has it */ 637 #endif 638 639 #if defined(OPTERON_WORKAROUND_6336786) 640 int opteron_workaround_6336786; /* non-zero -> WA relevant and applied */ 641 int opteron_workaround_6336786_UP = 0; /* Not needed for UP */ 642 #endif 643 644 #if defined(OPTERON_WORKAROUND_6323525) 645 int opteron_workaround_6323525; /* if non-zero -> at least one cpu has it */ 646 #endif 647 648 #if defined(OPTERON_ERRATUM_298) 649 int opteron_erratum_298; 650 #endif 651 652 static void 653 workaround_warning(cpu_t *cp, uint_t erratum) 654 { 655 cmn_err(CE_WARN, "cpu%d: no workaround for erratum %u", 656 cp->cpu_id, erratum); 657 } 658 659 static void 
660 workaround_applied(uint_t erratum) 661 { 662 if (erratum > 1000000) 663 cmn_err(CE_CONT, "?workaround applied for cpu issue #%d\n", 664 erratum); 665 else 666 cmn_err(CE_CONT, "?workaround applied for cpu erratum #%d\n", 667 erratum); 668 } 669 670 static void 671 msr_warning(cpu_t *cp, const char *rw, uint_t msr, int error) 672 { 673 cmn_err(CE_WARN, "cpu%d: couldn't %smsr 0x%x, error %d", 674 cp->cpu_id, rw, msr, error); 675 } 676 677 /* 678 * Determine the number of nodes in a Hammer / Greyhound / Griffin family 679 * system. 680 */ 681 static uint_t 682 opteron_get_nnodes(void) 683 { 684 static uint_t nnodes = 0; 685 686 if (nnodes == 0) { 687 #ifdef DEBUG 688 uint_t family; 689 690 /* 691 * This routine uses a PCI config space based mechanism 692 * for retrieving the number of nodes in the system. 693 * Device 24, function 0, offset 0x60 as used here is not 694 * AMD processor architectural, and may not work on processor 695 * families other than those listed below. 696 * 697 * Callers of this routine must ensure that we're running on 698 * a processor which supports this mechanism. 699 * The assertion below is meant to catch calls on unsupported 700 * processors. 701 */ 702 family = cpuid_getfamily(CPU); 703 ASSERT(family == 0xf || family == 0x10 || family == 0x11); 704 #endif /* DEBUG */ 705 706 /* 707 * Obtain the number of nodes in the system from 708 * bits [6:4] of the Node ID register on node 0. 709 * 710 * The actual node count is NodeID[6:4] + 1 711 * 712 * The Node ID register is accessed via function 0, 713 * offset 0x60. Node 0 is device 24. 714 */ 715 nnodes = ((pci_getl_func(0, 24, 0, 0x60) & 0x70) >> 4) + 1; 716 } 717 return (nnodes); 718 } 719 720 uint_t 721 do_erratum_298(struct cpu *cpu) 722 { 723 static int osvwrc = -3; 724 extern int osvw_opteron_erratum(cpu_t *, uint_t); 725 726 /* 727 * L2 Eviction May Occur During Processor Operation To Set 728 * Accessed or Dirty Bit. 
729 */ 730 if (osvwrc == -3) { 731 osvwrc = osvw_opteron_erratum(cpu, 298); 732 } else { 733 /* osvw return codes should be consistent for all cpus */ 734 ASSERT(osvwrc == osvw_opteron_erratum(cpu, 298)); 735 } 736 737 switch (osvwrc) { 738 case 0: /* erratum is not present: do nothing */ 739 break; 740 case 1: /* erratum is present: BIOS workaround applied */ 741 /* 742 * check if workaround is actually in place and issue warning 743 * if not. 744 */ 745 if (((rdmsr(MSR_AMD_HWCR) & AMD_HWCR_TLBCACHEDIS) == 0) || 746 ((rdmsr(MSR_AMD_BU_CFG) & AMD_BU_CFG_E298) == 0)) { 747 #if defined(OPTERON_ERRATUM_298) 748 opteron_erratum_298++; 749 #else 750 workaround_warning(cpu, 298); 751 return (1); 752 #endif 753 } 754 break; 755 case -1: /* cannot determine via osvw: check cpuid */ 756 if ((cpuid_opteron_erratum(cpu, 298) > 0) && 757 (((rdmsr(MSR_AMD_HWCR) & AMD_HWCR_TLBCACHEDIS) == 0) || 758 ((rdmsr(MSR_AMD_BU_CFG) & AMD_BU_CFG_E298) == 0))) { 759 #if defined(OPTERON_ERRATUM_298) 760 opteron_erratum_298++; 761 #else 762 workaround_warning(cpu, 298); 763 return (1); 764 #endif 765 } 766 break; 767 } 768 return (0); 769 } 770 771 uint_t 772 workaround_errata(struct cpu *cpu) 773 { 774 uint_t missing = 0; 775 776 ASSERT(cpu == CPU); 777 778 /*LINTED*/ 779 if (cpuid_opteron_erratum(cpu, 88) > 0) { 780 /* 781 * SWAPGS May Fail To Read Correct GS Base 782 */ 783 #if defined(OPTERON_ERRATUM_88) 784 /* 785 * The workaround is an mfence in the relevant assembler code 786 */ 787 opteron_erratum_88++; 788 #else 789 workaround_warning(cpu, 88); 790 missing++; 791 #endif 792 } 793 794 if (cpuid_opteron_erratum(cpu, 91) > 0) { 795 /* 796 * Software Prefetches May Report A Page Fault 797 */ 798 #if defined(OPTERON_ERRATUM_91) 799 /* 800 * fix is in trap.c 801 */ 802 opteron_erratum_91++; 803 #else 804 workaround_warning(cpu, 91); 805 missing++; 806 #endif 807 } 808 809 if (cpuid_opteron_erratum(cpu, 93) > 0) { 810 /* 811 * RSM Auto-Halt Restart Returns to Incorrect RIP 812 */ 813 #if 
defined(OPTERON_ERRATUM_93) 814 /* 815 * fix is in trap.c 816 */ 817 opteron_erratum_93++; 818 #else 819 workaround_warning(cpu, 93); 820 missing++; 821 #endif 822 } 823 824 /*LINTED*/ 825 if (cpuid_opteron_erratum(cpu, 95) > 0) { 826 /* 827 * RET Instruction May Return to Incorrect EIP 828 */ 829 #if defined(OPTERON_ERRATUM_95) 830 #if defined(_LP64) 831 /* 832 * Workaround this by ensuring that 32-bit user code and 833 * 64-bit kernel code never occupy the same address 834 * range mod 4G. 835 */ 836 if (_userlimit32 > 0xc0000000ul) 837 *(uintptr_t *)&_userlimit32 = 0xc0000000ul; 838 839 /*LINTED*/ 840 ASSERT((uint32_t)COREHEAP_BASE == 0xc0000000u); 841 opteron_erratum_95++; 842 #endif /* _LP64 */ 843 #else 844 workaround_warning(cpu, 95); 845 missing++; 846 #endif 847 } 848 849 if (cpuid_opteron_erratum(cpu, 100) > 0) { 850 /* 851 * Compatibility Mode Branches Transfer to Illegal Address 852 */ 853 #if defined(OPTERON_ERRATUM_100) 854 /* 855 * fix is in trap.c 856 */ 857 opteron_erratum_100++; 858 #else 859 workaround_warning(cpu, 100); 860 missing++; 861 #endif 862 } 863 864 /*LINTED*/ 865 if (cpuid_opteron_erratum(cpu, 108) > 0) { 866 /* 867 * CPUID Instruction May Return Incorrect Model Number In 868 * Some Processors 869 */ 870 #if defined(OPTERON_ERRATUM_108) 871 /* 872 * (Our cpuid-handling code corrects the model number on 873 * those processors) 874 */ 875 #else 876 workaround_warning(cpu, 108); 877 missing++; 878 #endif 879 } 880 881 /*LINTED*/ 882 if (cpuid_opteron_erratum(cpu, 109) > 0) do { 883 /* 884 * Certain Reverse REP MOVS May Produce Unpredictable Behavior 885 */ 886 #if defined(OPTERON_ERRATUM_109) 887 /* 888 * The "workaround" is to print a warning to upgrade the BIOS 889 */ 890 uint64_t value; 891 const uint_t msr = MSR_AMD_PATCHLEVEL; 892 int err; 893 894 if ((err = checked_rdmsr(msr, &value)) != 0) { 895 msr_warning(cpu, "rd", msr, err); 896 workaround_warning(cpu, 109); 897 missing++; 898 } 899 if (value == 0) 900 opteron_erratum_109++; 
901 #else 902 workaround_warning(cpu, 109); 903 missing++; 904 #endif 905 /*CONSTANTCONDITION*/ 906 } while (0); 907 908 /*LINTED*/ 909 if (cpuid_opteron_erratum(cpu, 121) > 0) { 910 /* 911 * Sequential Execution Across Non_Canonical Boundary Caused 912 * Processor Hang 913 */ 914 #if defined(OPTERON_ERRATUM_121) 915 #if defined(_LP64) 916 /* 917 * Erratum 121 is only present in long (64 bit) mode. 918 * Workaround is to include the page immediately before the 919 * va hole to eliminate the possibility of system hangs due to 920 * sequential execution across the va hole boundary. 921 */ 922 if (opteron_erratum_121) 923 opteron_erratum_121++; 924 else { 925 if (hole_start) { 926 hole_start -= PAGESIZE; 927 } else { 928 /* 929 * hole_start not yet initialized by 930 * mmu_init. Initialize hole_start 931 * with value to be subtracted. 932 */ 933 hole_start = PAGESIZE; 934 } 935 opteron_erratum_121++; 936 } 937 #endif /* _LP64 */ 938 #else 939 workaround_warning(cpu, 121); 940 missing++; 941 #endif 942 } 943 944 /*LINTED*/ 945 if (cpuid_opteron_erratum(cpu, 122) > 0) do { 946 /* 947 * TLB Flush Filter May Cause Coherency Problem in 948 * Multiprocessor Systems 949 */ 950 #if defined(OPTERON_ERRATUM_122) 951 uint64_t value; 952 const uint_t msr = MSR_AMD_HWCR; 953 int error; 954 955 /* 956 * Erratum 122 is only present in MP configurations (multi-core 957 * or multi-processor). 
958 */ 959 #if defined(__xpv) 960 if (!DOMAIN_IS_INITDOMAIN(xen_info)) 961 break; 962 if (!opteron_erratum_122 && xpv_nr_phys_cpus() == 1) 963 break; 964 #else 965 if (!opteron_erratum_122 && opteron_get_nnodes() == 1 && 966 cpuid_get_ncpu_per_chip(cpu) == 1) 967 break; 968 #endif 969 /* disable TLB Flush Filter */ 970 971 if ((error = checked_rdmsr(msr, &value)) != 0) { 972 msr_warning(cpu, "rd", msr, error); 973 workaround_warning(cpu, 122); 974 missing++; 975 } else { 976 value |= (uint64_t)AMD_HWCR_FFDIS; 977 if ((error = checked_wrmsr(msr, value)) != 0) { 978 msr_warning(cpu, "wr", msr, error); 979 workaround_warning(cpu, 122); 980 missing++; 981 } 982 } 983 opteron_erratum_122++; 984 #else 985 workaround_warning(cpu, 122); 986 missing++; 987 #endif 988 /*CONSTANTCONDITION*/ 989 } while (0); 990 991 /*LINTED*/ 992 if (cpuid_opteron_erratum(cpu, 123) > 0) do { 993 /* 994 * Bypassed Reads May Cause Data Corruption of System Hang in 995 * Dual Core Processors 996 */ 997 #if defined(OPTERON_ERRATUM_123) 998 uint64_t value; 999 const uint_t msr = MSR_AMD_PATCHLEVEL; 1000 int err; 1001 1002 /* 1003 * Erratum 123 applies only to multi-core cpus. 
1004 */ 1005 if (cpuid_get_ncpu_per_chip(cpu) < 2) 1006 break; 1007 #if defined(__xpv) 1008 if (!DOMAIN_IS_INITDOMAIN(xen_info)) 1009 break; 1010 #endif 1011 /* 1012 * The "workaround" is to print a warning to upgrade the BIOS 1013 */ 1014 if ((err = checked_rdmsr(msr, &value)) != 0) { 1015 msr_warning(cpu, "rd", msr, err); 1016 workaround_warning(cpu, 123); 1017 missing++; 1018 } 1019 if (value == 0) 1020 opteron_erratum_123++; 1021 #else 1022 workaround_warning(cpu, 123); 1023 missing++; 1024 1025 #endif 1026 /*CONSTANTCONDITION*/ 1027 } while (0); 1028 1029 /*LINTED*/ 1030 if (cpuid_opteron_erratum(cpu, 131) > 0) do { 1031 /* 1032 * Multiprocessor Systems with Four or More Cores May Deadlock 1033 * Waiting for a Probe Response 1034 */ 1035 #if defined(OPTERON_ERRATUM_131) 1036 uint64_t nbcfg; 1037 const uint_t msr = MSR_AMD_NB_CFG; 1038 const uint64_t wabits = 1039 AMD_NB_CFG_SRQ_HEARTBEAT | AMD_NB_CFG_SRQ_SPR; 1040 int error; 1041 1042 /* 1043 * Erratum 131 applies to any system with four or more cores. 1044 */ 1045 if (opteron_erratum_131) 1046 break; 1047 #if defined(__xpv) 1048 if (!DOMAIN_IS_INITDOMAIN(xen_info)) 1049 break; 1050 if (xpv_nr_phys_cpus() < 4) 1051 break; 1052 #else 1053 if (opteron_get_nnodes() * cpuid_get_ncpu_per_chip(cpu) < 4) 1054 break; 1055 #endif 1056 /* 1057 * Print a warning if neither of the workarounds for 1058 * erratum 131 is present. 1059 */ 1060 if ((error = checked_rdmsr(msr, &nbcfg)) != 0) { 1061 msr_warning(cpu, "rd", msr, error); 1062 workaround_warning(cpu, 131); 1063 missing++; 1064 } else if ((nbcfg & wabits) == 0) { 1065 opteron_erratum_131++; 1066 } else { 1067 /* cannot have both workarounds set */ 1068 ASSERT((nbcfg & wabits) != wabits); 1069 } 1070 #else 1071 workaround_warning(cpu, 131); 1072 missing++; 1073 #endif 1074 /*CONSTANTCONDITION*/ 1075 } while (0); 1076 1077 /* 1078 * This isn't really an erratum, but for convenience the 1079 * detection/workaround code lives here and in cpuid_opteron_erratum. 
1080 */ 1081 if (cpuid_opteron_erratum(cpu, 6336786) > 0) { 1082 #if defined(OPTERON_WORKAROUND_6336786) 1083 /* 1084 * Disable C1-Clock ramping on multi-core/multi-processor 1085 * K8 platforms to guard against TSC drift. 1086 */ 1087 if (opteron_workaround_6336786) { 1088 opteron_workaround_6336786++; 1089 #if defined(__xpv) 1090 } else if ((DOMAIN_IS_INITDOMAIN(xen_info) && 1091 xpv_nr_phys_cpus() > 1) || 1092 opteron_workaround_6336786_UP) { 1093 /* 1094 * XXPV Hmm. We can't walk the Northbridges on 1095 * the hypervisor; so just complain and drive 1096 * on. This probably needs to be fixed in 1097 * the hypervisor itself. 1098 */ 1099 opteron_workaround_6336786++; 1100 workaround_warning(cpu, 6336786); 1101 #else /* __xpv */ 1102 } else if ((opteron_get_nnodes() * 1103 cpuid_get_ncpu_per_chip(cpu) > 1) || 1104 opteron_workaround_6336786_UP) { 1105 1106 uint_t node, nnodes; 1107 uint8_t data; 1108 1109 nnodes = opteron_get_nnodes(); 1110 for (node = 0; node < nnodes; node++) { 1111 /* 1112 * Clear PMM7[1:0] (function 3, offset 0x87) 1113 * Northbridge device is the node id + 24. 1114 */ 1115 data = pci_getb_func(0, node + 24, 3, 0x87); 1116 data &= 0xFC; 1117 pci_putb_func(0, node + 24, 3, 0x87, data); 1118 } 1119 opteron_workaround_6336786++; 1120 #endif /* __xpv */ 1121 } 1122 #else 1123 workaround_warning(cpu, 6336786); 1124 missing++; 1125 #endif 1126 } 1127 1128 /*LINTED*/ 1129 /* 1130 * Mutex primitives don't work as expected. 1131 */ 1132 if (cpuid_opteron_erratum(cpu, 6323525) > 0) { 1133 #if defined(OPTERON_WORKAROUND_6323525) 1134 /* 1135 * This problem only occurs with 2 or more cores. If bit in 1136 * MSR_AMD_BU_CFG set, then not applicable. The workaround 1137 * is to patch the semaphone routines with the lfence 1138 * instruction to provide necessary load memory barrier with 1139 * possible subsequent read-modify-write ops. 1140 * 1141 * It is too early in boot to call the patch routine so 1142 * set erratum variable to be done in startup_end(). 
1143 */ 1144 if (opteron_workaround_6323525) { 1145 opteron_workaround_6323525++; 1146 #if defined(__xpv) 1147 } else if (is_x86_feature(x86_featureset, X86FSET_SSE2)) { 1148 if (DOMAIN_IS_INITDOMAIN(xen_info)) { 1149 /* 1150 * XXPV Use dom0_msr here when extended 1151 * operations are supported? 1152 */ 1153 if (xpv_nr_phys_cpus() > 1) 1154 opteron_workaround_6323525++; 1155 } else { 1156 /* 1157 * We have no way to tell how many physical 1158 * cpus there are, or even if this processor 1159 * has the problem, so enable the workaround 1160 * unconditionally (at some performance cost). 1161 */ 1162 opteron_workaround_6323525++; 1163 } 1164 #else /* __xpv */ 1165 } else if (is_x86_feature(x86_featureset, X86FSET_SSE2) && 1166 ((opteron_get_nnodes() * 1167 cpuid_get_ncpu_per_chip(cpu)) > 1)) { 1168 if ((xrdmsr(MSR_AMD_BU_CFG) & (UINT64_C(1) << 33)) == 0) 1169 opteron_workaround_6323525++; 1170 #endif /* __xpv */ 1171 } 1172 #else 1173 workaround_warning(cpu, 6323525); 1174 missing++; 1175 #endif 1176 } 1177 1178 missing += do_erratum_298(cpu); 1179 1180 #ifdef __xpv 1181 return (0); 1182 #else 1183 return (missing); 1184 #endif 1185 } 1186 1187 void 1188 workaround_errata_end() 1189 { 1190 #if defined(OPTERON_ERRATUM_88) 1191 if (opteron_erratum_88) 1192 workaround_applied(88); 1193 #endif 1194 #if defined(OPTERON_ERRATUM_91) 1195 if (opteron_erratum_91) 1196 workaround_applied(91); 1197 #endif 1198 #if defined(OPTERON_ERRATUM_93) 1199 if (opteron_erratum_93) 1200 workaround_applied(93); 1201 #endif 1202 #if defined(OPTERON_ERRATUM_95) 1203 if (opteron_erratum_95) 1204 workaround_applied(95); 1205 #endif 1206 #if defined(OPTERON_ERRATUM_100) 1207 if (opteron_erratum_100) 1208 workaround_applied(100); 1209 #endif 1210 #if defined(OPTERON_ERRATUM_108) 1211 if (opteron_erratum_108) 1212 workaround_applied(108); 1213 #endif 1214 #if defined(OPTERON_ERRATUM_109) 1215 if (opteron_erratum_109) { 1216 cmn_err(CE_WARN, 1217 "BIOS microcode patch for AMD Athlon(tm) 
64/Opteron(tm)" 1218 " processor\nerratum 109 was not detected; updating your" 1219 " system's BIOS to a version\ncontaining this" 1220 " microcode patch is HIGHLY recommended or erroneous" 1221 " system\noperation may occur.\n"); 1222 } 1223 #endif 1224 #if defined(OPTERON_ERRATUM_121) 1225 if (opteron_erratum_121) 1226 workaround_applied(121); 1227 #endif 1228 #if defined(OPTERON_ERRATUM_122) 1229 if (opteron_erratum_122) 1230 workaround_applied(122); 1231 #endif 1232 #if defined(OPTERON_ERRATUM_123) 1233 if (opteron_erratum_123) { 1234 cmn_err(CE_WARN, 1235 "BIOS microcode patch for AMD Athlon(tm) 64/Opteron(tm)" 1236 " processor\nerratum 123 was not detected; updating your" 1237 " system's BIOS to a version\ncontaining this" 1238 " microcode patch is HIGHLY recommended or erroneous" 1239 " system\noperation may occur.\n"); 1240 } 1241 #endif 1242 #if defined(OPTERON_ERRATUM_131) 1243 if (opteron_erratum_131) { 1244 cmn_err(CE_WARN, 1245 "BIOS microcode patch for AMD Athlon(tm) 64/Opteron(tm)" 1246 " processor\nerratum 131 was not detected; updating your" 1247 " system's BIOS to a version\ncontaining this" 1248 " microcode patch is HIGHLY recommended or erroneous" 1249 " system\noperation may occur.\n"); 1250 } 1251 #endif 1252 #if defined(OPTERON_WORKAROUND_6336786) 1253 if (opteron_workaround_6336786) 1254 workaround_applied(6336786); 1255 #endif 1256 #if defined(OPTERON_WORKAROUND_6323525) 1257 if (opteron_workaround_6323525) 1258 workaround_applied(6323525); 1259 #endif 1260 #if defined(OPTERON_ERRATUM_298) 1261 if (opteron_erratum_298) { 1262 cmn_err(CE_WARN, 1263 "BIOS microcode patch for AMD 64/Opteron(tm)" 1264 " processor\nerratum 298 was not detected; updating your" 1265 " system's BIOS to a version\ncontaining this" 1266 " microcode patch is HIGHLY recommended or erroneous" 1267 " system\noperation may occur.\n"); 1268 } 1269 #endif 1270 } 1271 1272 /* 1273 * The procset_slave and procset_master are used to synchronize 1274 * between the control CPU 
and the target CPU when starting CPUs. 1275 */ 1276 static cpuset_t procset_slave, procset_master; 1277 1278 static void 1279 mp_startup_wait(cpuset_t *sp, processorid_t cpuid) 1280 { 1281 cpuset_t tempset; 1282 1283 for (tempset = *sp; !CPU_IN_SET(tempset, cpuid); 1284 tempset = *(volatile cpuset_t *)sp) { 1285 SMT_PAUSE(); 1286 } 1287 CPUSET_ATOMIC_DEL(*(cpuset_t *)sp, cpuid); 1288 } 1289 1290 static void 1291 mp_startup_signal(cpuset_t *sp, processorid_t cpuid) 1292 { 1293 cpuset_t tempset; 1294 1295 CPUSET_ATOMIC_ADD(*(cpuset_t *)sp, cpuid); 1296 for (tempset = *sp; CPU_IN_SET(tempset, cpuid); 1297 tempset = *(volatile cpuset_t *)sp) { 1298 SMT_PAUSE(); 1299 } 1300 } 1301 1302 int 1303 mp_start_cpu_common(cpu_t *cp, boolean_t boot) 1304 { 1305 _NOTE(ARGUNUSED(boot)); 1306 1307 void *ctx; 1308 int delays; 1309 int error = 0; 1310 cpuset_t tempset; 1311 processorid_t cpuid; 1312 #ifndef __xpv 1313 extern void cpupm_init(cpu_t *); 1314 #endif 1315 1316 ASSERT(cp != NULL); 1317 cpuid = cp->cpu_id; 1318 ctx = mach_cpucontext_alloc(cp); 1319 if (ctx == NULL) { 1320 cmn_err(CE_WARN, 1321 "cpu%d: failed to allocate context", cp->cpu_id); 1322 return (EAGAIN); 1323 } 1324 error = mach_cpu_start(cp, ctx); 1325 if (error != 0) { 1326 cmn_err(CE_WARN, 1327 "cpu%d: failed to start, error %d", cp->cpu_id, error); 1328 mach_cpucontext_free(cp, ctx, error); 1329 return (error); 1330 } 1331 1332 for (delays = 0, tempset = procset_slave; !CPU_IN_SET(tempset, cpuid); 1333 delays++) { 1334 if (delays == 500) { 1335 /* 1336 * After five seconds, things are probably looking 1337 * a bit bleak - explain the hang. 1338 */ 1339 cmn_err(CE_NOTE, "cpu%d: started, " 1340 "but not running in the kernel yet", cpuid); 1341 } else if (delays > 2000) { 1342 /* 1343 * We waited at least 20 seconds, bail .. 
1344 */ 1345 error = ETIMEDOUT; 1346 cmn_err(CE_WARN, "cpu%d: timed out", cpuid); 1347 mach_cpucontext_free(cp, ctx, error); 1348 return (error); 1349 } 1350 1351 /* 1352 * wait at least 10ms, then check again.. 1353 */ 1354 delay(USEC_TO_TICK_ROUNDUP(10000)); 1355 tempset = *((volatile cpuset_t *)&procset_slave); 1356 } 1357 CPUSET_ATOMIC_DEL(procset_slave, cpuid); 1358 1359 mach_cpucontext_free(cp, ctx, 0); 1360 1361 #ifndef __xpv 1362 if (tsc_gethrtime_enable) 1363 tsc_sync_master(cpuid); 1364 #endif 1365 1366 if (dtrace_cpu_init != NULL) { 1367 (*dtrace_cpu_init)(cpuid); 1368 } 1369 1370 /* 1371 * During CPU DR operations, the cpu_lock is held by current 1372 * (the control) thread. We can't release the cpu_lock here 1373 * because that will break the CPU DR logic. 1374 * On the other hand, CPUPM and processor group initialization 1375 * routines need to access the cpu_lock. So we invoke those 1376 * routines here on behalf of mp_startup_common(). 1377 * 1378 * CPUPM and processor group initialization routines depend 1379 * on the cpuid probing results. Wait for mp_startup_common() 1380 * to signal that cpuid probing is done. 1381 */ 1382 mp_startup_wait(&procset_slave, cpuid); 1383 #ifndef __xpv 1384 cpupm_init(cp); 1385 #endif 1386 (void) pg_cpu_init(cp, B_FALSE); 1387 cpu_set_state(cp); 1388 mp_startup_signal(&procset_master, cpuid); 1389 1390 return (0); 1391 } 1392 1393 /* 1394 * Start a single cpu, assuming that the kernel context is available 1395 * to successfully start another cpu. 1396 * 1397 * (For example, real mode code is mapped into the right place 1398 * in memory and is ready to be run.) 1399 */ 1400 int 1401 start_cpu(processorid_t who) 1402 { 1403 cpu_t *cp; 1404 int error = 0; 1405 cpuset_t tempset; 1406 1407 ASSERT(who != 0); 1408 1409 /* 1410 * Check if there's at least a Mbyte of kmem available 1411 * before attempting to start the cpu. 
1412 */ 1413 if (kmem_avail() < 1024 * 1024) { 1414 /* 1415 * Kick off a reap in case that helps us with 1416 * later attempts .. 1417 */ 1418 kmem_reap(); 1419 return (ENOMEM); 1420 } 1421 1422 /* 1423 * First configure cpu. 1424 */ 1425 cp = mp_cpu_configure_common(who, B_TRUE); 1426 ASSERT(cp != NULL); 1427 1428 /* 1429 * Then start cpu. 1430 */ 1431 error = mp_start_cpu_common(cp, B_TRUE); 1432 if (error != 0) { 1433 mp_cpu_unconfigure_common(cp, error); 1434 return (error); 1435 } 1436 1437 mutex_exit(&cpu_lock); 1438 tempset = cpu_ready_set; 1439 while (!CPU_IN_SET(tempset, who)) { 1440 drv_usecwait(1); 1441 tempset = *((volatile cpuset_t *)&cpu_ready_set); 1442 } 1443 mutex_enter(&cpu_lock); 1444 1445 return (0); 1446 } 1447 1448 void 1449 start_other_cpus(int cprboot) 1450 { 1451 _NOTE(ARGUNUSED(cprboot)); 1452 1453 uint_t who; 1454 uint_t bootcpuid = 0; 1455 1456 /* 1457 * Initialize our own cpu_info. 1458 */ 1459 init_cpu_info(CPU); 1460 1461 cmn_err(CE_CONT, "?cpu%d: %s\n", CPU->cpu_id, CPU->cpu_idstr); 1462 cmn_err(CE_CONT, "?cpu%d: %s\n", CPU->cpu_id, CPU->cpu_brandstr); 1463 1464 /* 1465 * Initialize our syscall handlers 1466 */ 1467 init_cpu_syscall(CPU); 1468 1469 /* 1470 * Take the boot cpu out of the mp_cpus set because we know 1471 * it's already running. Add it to the cpu_ready_set for 1472 * precisely the same reason. 1473 */ 1474 CPUSET_DEL(mp_cpus, bootcpuid); 1475 CPUSET_ADD(cpu_ready_set, bootcpuid); 1476 1477 /* 1478 * skip the rest of this if 1479 * . only 1 cpu dectected and system isn't hotplug-capable 1480 * . not using MP 1481 */ 1482 if ((CPUSET_ISNULL(mp_cpus) && plat_dr_support_cpu() == 0) || 1483 use_mp == 0) { 1484 if (use_mp == 0) 1485 cmn_err(CE_CONT, "?***** Not in MP mode\n"); 1486 goto done; 1487 } 1488 1489 /* 1490 * perform such initialization as is needed 1491 * to be able to take CPUs on- and off-line. 
1492 */ 1493 cpu_pause_init(); 1494 1495 xc_init_cpu(CPU); /* initialize processor crosscalls */ 1496 1497 if (mach_cpucontext_init() != 0) 1498 goto done; 1499 1500 flushes_require_xcalls = 1; 1501 1502 /* 1503 * We lock our affinity to the master CPU to ensure that all slave CPUs 1504 * do their TSC syncs with the same CPU. 1505 */ 1506 affinity_set(CPU_CURRENT); 1507 1508 for (who = 0; who < NCPU; who++) { 1509 if (!CPU_IN_SET(mp_cpus, who)) 1510 continue; 1511 ASSERT(who != bootcpuid); 1512 1513 mutex_enter(&cpu_lock); 1514 if (start_cpu(who) != 0) 1515 CPUSET_DEL(mp_cpus, who); 1516 cpu_state_change_notify(who, CPU_SETUP); 1517 mutex_exit(&cpu_lock); 1518 } 1519 1520 /* Free the space allocated to hold the microcode file */ 1521 ucode_cleanup(); 1522 1523 affinity_clear(); 1524 1525 mach_cpucontext_fini(); 1526 1527 done: 1528 if (get_hwenv() == HW_NATIVE) 1529 workaround_errata_end(); 1530 cmi_post_mpstartup(); 1531 1532 if (use_mp && ncpus != boot_max_ncpus) { 1533 cmn_err(CE_NOTE, 1534 "System detected %d cpus, but " 1535 "only %d cpu(s) were enabled during boot.", 1536 boot_max_ncpus, ncpus); 1537 cmn_err(CE_NOTE, 1538 "Use \"boot-ncpus\" parameter to enable more CPU(s). " 1539 "See eeprom(1M)."); 1540 } 1541 } 1542 1543 int 1544 mp_cpu_configure(int cpuid) 1545 { 1546 cpu_t *cp; 1547 1548 if (use_mp == 0 || plat_dr_support_cpu() == 0) { 1549 return (ENOTSUP); 1550 } 1551 1552 cp = cpu_get(cpuid); 1553 if (cp != NULL) { 1554 return (EALREADY); 1555 } 1556 1557 /* 1558 * Check if there's at least a Mbyte of kmem available 1559 * before attempting to start the cpu. 1560 */ 1561 if (kmem_avail() < 1024 * 1024) { 1562 /* 1563 * Kick off a reap in case that helps us with 1564 * later attempts .. 1565 */ 1566 kmem_reap(); 1567 return (ENOMEM); 1568 } 1569 1570 cp = mp_cpu_configure_common(cpuid, B_FALSE); 1571 ASSERT(cp != NULL && cpu_get(cpuid) == cp); 1572 1573 return (cp != NULL ? 
0 : EAGAIN); 1574 } 1575 1576 int 1577 mp_cpu_unconfigure(int cpuid) 1578 { 1579 cpu_t *cp; 1580 1581 if (use_mp == 0 || plat_dr_support_cpu() == 0) { 1582 return (ENOTSUP); 1583 } else if (cpuid < 0 || cpuid >= max_ncpus) { 1584 return (EINVAL); 1585 } 1586 1587 cp = cpu_get(cpuid); 1588 if (cp == NULL) { 1589 return (ENODEV); 1590 } 1591 mp_cpu_unconfigure_common(cp, 0); 1592 1593 return (0); 1594 } 1595 1596 /* 1597 * Startup function for 'other' CPUs (besides boot cpu). 1598 * Called from real_mode_start. 1599 * 1600 * WARNING: until CPU_READY is set, mp_startup_common and routines called by 1601 * mp_startup_common should not call routines (e.g. kmem_free) that could call 1602 * hat_unload which requires CPU_READY to be set. 1603 */ 1604 static void 1605 mp_startup_common(boolean_t boot) 1606 { 1607 cpu_t *cp = CPU; 1608 void *new_x86_featureset; 1609 extern void cpu_event_init_cpu(cpu_t *); 1610 1611 /* 1612 * We need to get TSC on this proc synced (i.e., any delta 1613 * from cpu0 accounted for) as soon as we can, because many 1614 * many things use gethrtime/pc_gethrestime, including 1615 * interrupts, cmn_err, etc. 1616 */ 1617 1618 /* Let the control CPU continue into tsc_sync_master() */ 1619 mp_startup_signal(&procset_slave, cp->cpu_id); 1620 1621 #ifndef __xpv 1622 if (tsc_gethrtime_enable) 1623 tsc_sync_slave(); 1624 #endif 1625 1626 /* 1627 * Once this was done from assembly, but it's safer here; if 1628 * it blocks, we need to be able to swtch() to and from, and 1629 * since we get here by calling t_pc, we need to do that call 1630 * before swtch() overwrites it. 1631 */ 1632 (void) (*ap_mlsetup)(); 1633 1634 new_x86_featureset = cpuid_pass1(cp); 1635 1636 #ifndef __xpv 1637 /* 1638 * Program this cpu's PAT 1639 */ 1640 if (is_x86_feature(x86_featureset, X86FSET_PAT)) 1641 pat_sync(); 1642 #endif 1643 1644 /* 1645 * Set up TSC_AUX to contain the cpuid for this processor 1646 * for the rdtscp instruction. 
1647 */ 1648 if (is_x86_feature(x86_featureset, X86FSET_TSCP)) 1649 (void) wrmsr(MSR_AMD_TSCAUX, cp->cpu_id); 1650 1651 /* 1652 * Initialize this CPU's syscall handlers 1653 */ 1654 init_cpu_syscall(cp); 1655 1656 /* 1657 * Enable interrupts with spl set to LOCK_LEVEL. LOCK_LEVEL is the 1658 * highest level at which a routine is permitted to block on 1659 * an adaptive mutex (allows for cpu poke interrupt in case 1660 * the cpu is blocked on a mutex and halts). Setting LOCK_LEVEL blocks 1661 * device interrupts that may end up in the hat layer issuing cross 1662 * calls before CPU_READY is set. 1663 */ 1664 splx(ipltospl(LOCK_LEVEL)); 1665 sti(); 1666 1667 /* 1668 * Do a sanity check to make sure this new CPU is a sane thing 1669 * to add to the collection of processors running this system. 1670 * 1671 * XXX Clearly this needs to get more sophisticated, if x86 1672 * systems start to get built out of heterogenous CPUs; as is 1673 * likely to happen once the number of processors in a configuration 1674 * gets large enough. 1675 */ 1676 if (compare_x86_featureset(x86_featureset, new_x86_featureset) == 1677 B_FALSE) { 1678 cmn_err(CE_CONT, "cpu%d: featureset\n", cp->cpu_id); 1679 print_x86_featureset(new_x86_featureset); 1680 cmn_err(CE_WARN, "cpu%d feature mismatch", cp->cpu_id); 1681 } 1682 1683 /* 1684 * We do not support cpus with mixed monitor/mwait support if the 1685 * boot cpu supports monitor/mwait. 1686 */ 1687 if (is_x86_feature(x86_featureset, X86FSET_MWAIT) != 1688 is_x86_feature(new_x86_featureset, X86FSET_MWAIT)) 1689 panic("unsupported mixed cpu monitor/mwait support detected"); 1690 1691 free_x86_featureset(new_x86_featureset); 1692 1693 /* 1694 * We could be more sophisticated here, and just mark the CPU 1695 * as "faulted" but at this point we'll opt for the easier 1696 * answer of dying horribly. Provided the boot cpu is ok, 1697 * the system can be recovered by booting with use_mp set to zero. 
1698 */ 1699 if (workaround_errata(cp) != 0) 1700 panic("critical workaround(s) missing for cpu%d", cp->cpu_id); 1701 1702 /* 1703 * We can touch cpu_flags here without acquiring the cpu_lock here 1704 * because the cpu_lock is held by the control CPU which is running 1705 * mp_start_cpu_common(). 1706 * Need to clear CPU_QUIESCED flag before calling any function which 1707 * may cause thread context switching, such as kmem_alloc() etc. 1708 * The idle thread checks for CPU_QUIESCED flag and loops for ever if 1709 * it's set. So the startup thread may have no chance to switch back 1710 * again if it's switched away with CPU_QUIESCED set. 1711 */ 1712 cp->cpu_flags &= ~(CPU_POWEROFF | CPU_QUIESCED); 1713 1714 cpuid_pass2(cp); 1715 cpuid_pass3(cp); 1716 (void) cpuid_pass4(cp); 1717 1718 /* 1719 * Correct cpu_idstr and cpu_brandstr on target CPU after 1720 * cpuid_pass1() is done. 1721 */ 1722 (void) cpuid_getidstr(cp, cp->cpu_idstr, CPU_IDSTRLEN); 1723 (void) cpuid_getbrandstr(cp, cp->cpu_brandstr, CPU_IDSTRLEN); 1724 1725 cp->cpu_flags |= CPU_RUNNING | CPU_READY | CPU_EXISTS; 1726 1727 post_startup_cpu_fixups(); 1728 1729 cpu_event_init_cpu(cp); 1730 1731 /* 1732 * Enable preemption here so that contention for any locks acquired 1733 * later in mp_startup_common may be preempted if the thread owning 1734 * those locks is continuously executing on other CPUs (for example, 1735 * this CPU must be preemptible to allow other CPUs to pause it during 1736 * their startup phases). It's safe to enable preemption here because 1737 * the CPU state is pretty-much fully constructed. 
1738 */ 1739 curthread->t_preempt = 0; 1740 1741 /* The base spl should still be at LOCK LEVEL here */ 1742 ASSERT(cp->cpu_base_spl == ipltospl(LOCK_LEVEL)); 1743 set_base_spl(); /* Restore the spl to its proper value */ 1744 1745 pghw_physid_create(cp); 1746 /* 1747 * Delegate initialization tasks, which need to access the cpu_lock, 1748 * to mp_start_cpu_common() because we can't acquire the cpu_lock here 1749 * during CPU DR operations. 1750 */ 1751 mp_startup_signal(&procset_slave, cp->cpu_id); 1752 mp_startup_wait(&procset_master, cp->cpu_id); 1753 pg_cmt_cpu_startup(cp); 1754 1755 if (boot) { 1756 mutex_enter(&cpu_lock); 1757 cp->cpu_flags &= ~CPU_OFFLINE; 1758 cpu_enable_intr(cp); 1759 cpu_add_active(cp); 1760 mutex_exit(&cpu_lock); 1761 } 1762 1763 /* Enable interrupts */ 1764 (void) spl0(); 1765 1766 /* 1767 * Fill out cpu_ucode_info. Update microcode if necessary. 1768 */ 1769 ucode_check(cp); 1770 1771 #ifndef __xpv 1772 { 1773 /* 1774 * Set up the CPU module for this CPU. This can't be done 1775 * before this CPU is made CPU_READY, because we may (in 1776 * heterogeneous systems) need to go load another CPU module. 1777 * The act of attempting to load a module may trigger a 1778 * cross-call, which will ASSERT unless this cpu is CPU_READY. 1779 */ 1780 cmi_hdl_t hdl; 1781 1782 if ((hdl = cmi_init(CMI_HDL_NATIVE, cmi_ntv_hwchipid(CPU), 1783 cmi_ntv_hwcoreid(CPU), cmi_ntv_hwstrandid(CPU))) != NULL) { 1784 if (is_x86_feature(x86_featureset, X86FSET_MCA)) 1785 cmi_mca_init(hdl); 1786 cp->cpu_m.mcpu_cmi_hdl = hdl; 1787 } 1788 } 1789 #endif /* __xpv */ 1790 1791 if (boothowto & RB_DEBUG) 1792 kdi_cpu_init(); 1793 1794 /* 1795 * Setting the bit in cpu_ready_set must be the last operation in 1796 * processor initialization; the boot CPU will continue to boot once 1797 * it sees this bit set for all active CPUs. 
1798 */ 1799 CPUSET_ATOMIC_ADD(cpu_ready_set, cp->cpu_id); 1800 1801 (void) mach_cpu_create_device_node(cp, NULL); 1802 1803 cmn_err(CE_CONT, "?cpu%d: %s\n", cp->cpu_id, cp->cpu_idstr); 1804 cmn_err(CE_CONT, "?cpu%d: %s\n", cp->cpu_id, cp->cpu_brandstr); 1805 cmn_err(CE_CONT, "?cpu%d initialization complete - online\n", 1806 cp->cpu_id); 1807 1808 /* 1809 * Now we are done with the startup thread, so free it up. 1810 */ 1811 thread_exit(); 1812 panic("mp_startup: cannot return"); 1813 /*NOTREACHED*/ 1814 } 1815 1816 /* 1817 * Startup function for 'other' CPUs at boot time (besides boot cpu). 1818 */ 1819 static void 1820 mp_startup_boot(void) 1821 { 1822 mp_startup_common(B_TRUE); 1823 } 1824 1825 /* 1826 * Startup function for hotplug CPUs at runtime. 1827 */ 1828 void 1829 mp_startup_hotplug(void) 1830 { 1831 mp_startup_common(B_FALSE); 1832 } 1833 1834 /* 1835 * Start CPU on user request. 1836 */ 1837 /* ARGSUSED */ 1838 int 1839 mp_cpu_start(struct cpu *cp) 1840 { 1841 ASSERT(MUTEX_HELD(&cpu_lock)); 1842 return (0); 1843 } 1844 1845 /* 1846 * Stop CPU on user request. 1847 */ 1848 int 1849 mp_cpu_stop(struct cpu *cp) 1850 { 1851 extern int cbe_psm_timer_mode; 1852 ASSERT(MUTEX_HELD(&cpu_lock)); 1853 1854 #ifdef __xpv 1855 /* 1856 * We can't offline vcpu0. 1857 */ 1858 if (cp->cpu_id == 0) 1859 return (EBUSY); 1860 #endif 1861 1862 /* 1863 * If TIMER_PERIODIC mode is used, CPU0 is the one running it; 1864 * can't stop it. (This is true only for machines with no TSC.) 1865 */ 1866 1867 if ((cbe_psm_timer_mode == TIMER_PERIODIC) && (cp->cpu_id == 0)) 1868 return (EBUSY); 1869 1870 return (0); 1871 } 1872 1873 /* 1874 * Take the specified CPU out of participation in interrupts. 1875 */ 1876 int 1877 cpu_disable_intr(struct cpu *cp) 1878 { 1879 if (psm_disable_intr(cp->cpu_id) != DDI_SUCCESS) 1880 return (EBUSY); 1881 1882 cp->cpu_flags &= ~CPU_ENABLE; 1883 return (0); 1884 } 1885 1886 /* 1887 * Allow the specified CPU to participate in interrupts. 
1888 */ 1889 void 1890 cpu_enable_intr(struct cpu *cp) 1891 { 1892 ASSERT(MUTEX_HELD(&cpu_lock)); 1893 cp->cpu_flags |= CPU_ENABLE; 1894 psm_enable_intr(cp->cpu_id); 1895 } 1896 1897 void 1898 mp_cpu_faulted_enter(struct cpu *cp) 1899 { 1900 #ifdef __xpv 1901 _NOTE(ARGUNUSED(cp)); 1902 #else 1903 cmi_hdl_t hdl = cp->cpu_m.mcpu_cmi_hdl; 1904 1905 if (hdl != NULL) { 1906 cmi_hdl_hold(hdl); 1907 } else { 1908 hdl = cmi_hdl_lookup(CMI_HDL_NATIVE, cmi_ntv_hwchipid(cp), 1909 cmi_ntv_hwcoreid(cp), cmi_ntv_hwstrandid(cp)); 1910 } 1911 if (hdl != NULL) { 1912 cmi_faulted_enter(hdl); 1913 cmi_hdl_rele(hdl); 1914 } 1915 #endif 1916 } 1917 1918 void 1919 mp_cpu_faulted_exit(struct cpu *cp) 1920 { 1921 #ifdef __xpv 1922 _NOTE(ARGUNUSED(cp)); 1923 #else 1924 cmi_hdl_t hdl = cp->cpu_m.mcpu_cmi_hdl; 1925 1926 if (hdl != NULL) { 1927 cmi_hdl_hold(hdl); 1928 } else { 1929 hdl = cmi_hdl_lookup(CMI_HDL_NATIVE, cmi_ntv_hwchipid(cp), 1930 cmi_ntv_hwcoreid(cp), cmi_ntv_hwstrandid(cp)); 1931 } 1932 if (hdl != NULL) { 1933 cmi_faulted_exit(hdl); 1934 cmi_hdl_rele(hdl); 1935 } 1936 #endif 1937 } 1938 1939 /* 1940 * The following two routines are used as context operators on threads belonging 1941 * to processes with a private LDT (see sysi86). Due to the rarity of such 1942 * processes, these routines are currently written for best code readability and 1943 * organization rather than speed. We could avoid checking x86_featureset at 1944 * every context switch by installing different context ops, depending on 1945 * x86_featureset, at LDT creation time -- one for each combination of fast 1946 * syscall features. 
1947 */ 1948 1949 /*ARGSUSED*/ 1950 void 1951 cpu_fast_syscall_disable(void *arg) 1952 { 1953 if (is_x86_feature(x86_featureset, X86FSET_MSR) && 1954 is_x86_feature(x86_featureset, X86FSET_SEP)) 1955 cpu_sep_disable(); 1956 if (is_x86_feature(x86_featureset, X86FSET_MSR) && 1957 is_x86_feature(x86_featureset, X86FSET_ASYSC)) 1958 cpu_asysc_disable(); 1959 } 1960 1961 /*ARGSUSED*/ 1962 void 1963 cpu_fast_syscall_enable(void *arg) 1964 { 1965 if (is_x86_feature(x86_featureset, X86FSET_MSR) && 1966 is_x86_feature(x86_featureset, X86FSET_SEP)) 1967 cpu_sep_enable(); 1968 if (is_x86_feature(x86_featureset, X86FSET_MSR) && 1969 is_x86_feature(x86_featureset, X86FSET_ASYSC)) 1970 cpu_asysc_enable(); 1971 } 1972 1973 static void 1974 cpu_sep_enable(void) 1975 { 1976 ASSERT(is_x86_feature(x86_featureset, X86FSET_SEP)); 1977 ASSERT(curthread->t_preempt || getpil() >= LOCK_LEVEL); 1978 1979 wrmsr(MSR_INTC_SEP_CS, (uint64_t)(uintptr_t)KCS_SEL); 1980 } 1981 1982 static void 1983 cpu_sep_disable(void) 1984 { 1985 ASSERT(is_x86_feature(x86_featureset, X86FSET_SEP)); 1986 ASSERT(curthread->t_preempt || getpil() >= LOCK_LEVEL); 1987 1988 /* 1989 * Setting the SYSENTER_CS_MSR register to 0 causes software executing 1990 * the sysenter or sysexit instruction to trigger a #gp fault. 1991 */ 1992 wrmsr(MSR_INTC_SEP_CS, 0); 1993 } 1994 1995 static void 1996 cpu_asysc_enable(void) 1997 { 1998 ASSERT(is_x86_feature(x86_featureset, X86FSET_ASYSC)); 1999 ASSERT(curthread->t_preempt || getpil() >= LOCK_LEVEL); 2000 2001 wrmsr(MSR_AMD_EFER, rdmsr(MSR_AMD_EFER) | 2002 (uint64_t)(uintptr_t)AMD_EFER_SCE); 2003 } 2004 2005 static void 2006 cpu_asysc_disable(void) 2007 { 2008 ASSERT(is_x86_feature(x86_featureset, X86FSET_ASYSC)); 2009 ASSERT(curthread->t_preempt || getpil() >= LOCK_LEVEL); 2010 2011 /* 2012 * Turn off the SCE (syscall enable) bit in the EFER register. Software 2013 * executing syscall or sysret with this bit off will incur a #ud trap. 
2014 */ 2015 wrmsr(MSR_AMD_EFER, rdmsr(MSR_AMD_EFER) & 2016 ~((uint64_t)(uintptr_t)AMD_EFER_SCE)); 2017 } 2018