/*
 * This file and its contents are supplied under the terms of the
 * Common Development and Distribution License ("CDDL"), version 1.0.
 * You may only use this file in accordance with the terms of version
 * 1.0 of the CDDL.
 *
 * A full copy of the text of the CDDL should have accompanied this
 * source. A copy of the CDDL is also available via the Internet at
 * http://www.illumos.org/license/CDDL.
 */

/*
 * Copyright 2019 Joyent, Inc.
 * Copyright 2020 OmniOS Community Edition (OmniOSce) Association.
 * Copyright 2024 Oxide Computer Company
 */

#include <sys/cpuvar.h>
#include <sys/types.h>
#include <sys/errno.h>
#include <sys/machsystm.h>
#include <sys/archsystm.h>
#include <sys/controlregs.h>
#include <sys/x86_archext.h>
#include <sys/id_space.h>
#include <sys/hma.h>
#include <sys/cmn_err.h>
#include <vm/hat.h>
#include <vm/as.h>

struct hma_reg {
	const char	*hr_name;
	list_node_t	hr_node;
};

static kmutex_t hma_lock;
static list_t hma_registrations;
static boolean_t hma_exclusive = B_FALSE;
int hma_disable = 0;

typedef enum hma_cpu_status {
	HCS_UNINITIALIZED = 0,
	HCS_READY,
	HCS_ERROR
} hma_cpu_status_t;

/*
 * When both host and guest want simultaneous use of the CPU performance
 * counters, which should take priority?
 *
 * Defer to the guest by default, making its activity invisible to
 * host-configured CPC measurements.  This is necessary since the Capacity &
 * Utilization system keeps the CPCs active at all times when not in use by
 * libcpc or dtrace users.
 */
typedef enum hma_cpc_priority {
	HCP_HOST_WINS = 0,
	HCP_GUEST_WINS = 1,
} hma_cpc_priority_t;
static hma_cpc_priority_t hma_cpc_priority = HCP_GUEST_WINS;

/*
 * VMX-specific per-CPU data
 */
typedef struct hma_vmx_cpu {
	void		*hvc_vmxon_page;
	uintptr_t	hvc_vmxon_pa;
} hma_vmx_cpu_t;

/*
 * SVM-specific per-CPU data
 */
typedef struct hma_svm_cpu {
	void		*hsc_hsave_page;
	uintptr_t	hsc_hsave_pa;
	hma_svm_asid_t	hsc_asid;
	uint_t		hsc_gif_disabled;
	/*
	 * hsc_cpc_saved_flags stores the state of guest performance counters
	 * while inside the hma_svm_cpc_enter/hma_svm_cpc_exit critical
	 * section.
	 *
	 * If, due to the state of host counters, requested guest counters,
	 * and hma_cpc_priority, the guest counters are _not_ loaded during
	 * hma_svm_cpc_enter(), then this field will hold HCF_DISABLED,
	 * indicating that no state restoration is required during
	 * hma_svm_cpc_exit().
	 *
	 * When hsc_cpc_saved_flags is not HCF_DISABLED, then
	 * hsc_cpc_host_regs will hold the saved host CPC state while the
	 * guest state occupies those registers in the CPU.
	 */
	hma_cpc_flags_t	hsc_cpc_saved_flags;
	hma_cpc_t	hsc_cpc_host_regs[6];
} hma_svm_cpu_t;

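/*
 * Only one of the two structures above will ever be in use on a given system
 * (depending on whether the CPU vendor provides VMX or SVM), so they are
 * overlaid in a union within the combined per-CPU state below.
 */
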
/*
 * Combined per-CPU state data
 *
 * The bulk of HMA state (VMX & SVM) is protected by cpu_lock, rather than a
 * mutex specific to the module.  It (cpu_lock) is already required for the
 * state needed to perform setup on all CPUs, so it was a natural fit to
 * protect this data too.
 */
struct hma_cpu {
	union {
		struct hma_vmx_cpu vmx;
		struct hma_svm_cpu svm;
	} hc_u;
	hma_cpu_status_t hc_status;
	uintptr_t _hc_padding[6];
} hma_cpu[NCPU];

/* Keep per-CPU state aligned to cache line size to avoid false sharing */
CTASSERT(sizeof (struct hma_cpu) % _CACHE_LINE_SIZE == 0);


static boolean_t hma_vmx_ready = B_FALSE;
static const char *hma_vmx_error = NULL;
static id_space_t *hma_vmx_vpid;

/* HMA-internal tracking of optional VMX capabilities */
typedef enum {
	HVC_EPT		= (1 << 0),
	HVC_VPID	= (1 << 1),
	HVC_INVEPT_ONE	= (1 << 2),
	HVC_INVEPT_ALL	= (1 << 3),
} hma_vmx_capab_t;

static uint32_t hma_vmx_revision;
static hma_vmx_capab_t hma_vmx_capabs = 0;

static boolean_t hma_svm_ready = B_FALSE;
static const char *hma_svm_error = NULL;
static uint32_t hma_svm_features;
static uint32_t hma_svm_max_asid;
static hma_cpc_flags_t hma_svm_cpc_allowed = HCF_DISABLED;

static int hma_vmx_init(void);
static int hma_svm_init(void);

/* Helpers from ml/hma_asm.s */
int hma_vmx_do_invept(int, uintptr_t);
int hma_vmx_vmxon(uintptr_t);

void
hma_init(void)
{
	mutex_init(&hma_lock, NULL, MUTEX_DEFAULT, NULL);
	list_create(&hma_registrations, sizeof (struct hma_reg),
	    offsetof(struct hma_reg, hr_node));

	if (hma_disable != 0) {
		cmn_err(CE_CONT, "?hma_init: disabled");
		return;
	}

	switch (cpuid_getvendor(CPU)) {
	case X86_VENDOR_Intel:
		(void) hma_vmx_init();
		break;
	case X86_VENDOR_AMD:
	case X86_VENDOR_HYGON:
		(void) hma_svm_init();
		break;
	default:
		break;
	}
}

static hma_reg_t *
hma_register_backend(const char *name)
{
	struct hma_reg *reg;
	boolean_t is_ready;

	ASSERT(MUTEX_HELD(&hma_lock));

	switch (cpuid_getvendor(CPU)) {
	case X86_VENDOR_Intel:
		is_ready = hma_vmx_ready;
		break;
	case X86_VENDOR_AMD:
	case X86_VENDOR_HYGON:
		is_ready = hma_svm_ready;
		break;
	default:
		is_ready = B_FALSE;
		break;
	}

	if (!is_ready)
		return (NULL);

	reg = kmem_zalloc(sizeof (*reg), KM_SLEEP);
	reg->hr_name = name;
	list_insert_tail(&hma_registrations, reg);

	return (reg);
}

hma_reg_t *
hma_register(const char *name)
{
	struct hma_reg *reg = NULL;

	VERIFY(name != NULL);

	mutex_enter(&hma_lock);

	if (!hma_exclusive)
		reg = hma_register_backend(name);

	mutex_exit(&hma_lock);

	return (reg);
}

hma_reg_t *
hma_register_exclusive(const char *name)
{
	struct hma_reg *reg = NULL;

	VERIFY(name != NULL);

	mutex_enter(&hma_lock);

	if (list_is_empty(&hma_registrations)) {
		reg = hma_register_backend(name);
		if (reg != NULL)
			hma_exclusive = B_TRUE;
	}

	mutex_exit(&hma_lock);

	return (reg);
}

void
hma_unregister(hma_reg_t *reg)
{
	VERIFY(reg != NULL);
	VERIFY(!list_is_empty(&hma_registrations));

	mutex_enter(&hma_lock);
	list_remove(&hma_registrations, reg);
	if (hma_exclusive && list_is_empty(&hma_registrations))
		hma_exclusive = B_FALSE;
	mutex_exit(&hma_lock);
	kmem_free(reg, sizeof (*reg));
}

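/*
 * Convenience accessors for the vendor-specific portion of a CPU's HMA
 * state, indexed by sequential CPU id.
 */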
static __inline hma_vmx_cpu_t *
hma_vmx_cpu(processorid_t id)
{
	return (&hma_cpu[id].hc_u.vmx);
}

static __inline hma_svm_cpu_t *
hma_svm_cpu(processorid_t id)
{
	return (&hma_cpu[id].hc_u.svm);
}

/*
 * VPID 0 is reserved for instances where VPID is disabled.  Some hypervisors
 * (read: bhyve) reserve lower-order VPIDs for use in fallback behavior if
 * unique VPIDs could not be allocated for all the vCPUs belonging to a VM.
 */
#define	HMA_VPID_RESERVED	NCPU

uint16_t
hma_vmx_vpid_alloc(void)
{
	id_t res;

	/* Do not bother if the CPU lacks support */
	if ((hma_vmx_capabs & HVC_VPID) == 0) {
		return (0);
	}

	res = id_alloc_nosleep(hma_vmx_vpid);
	if (res == -1) {
		return (0);
	} else {
		ASSERT(res > HMA_VPID_RESERVED && res <= UINT16_MAX);
		return (res);
	}
}

void
hma_vmx_vpid_free(uint16_t vpid)
{
	VERIFY(vpid > HMA_VPID_RESERVED);
	id_free(hma_vmx_vpid, (id_t)vpid);
}

#define	INVEPT_SINGLE_CONTEXT	1
#define	INVEPT_ALL_CONTEXTS	2

static int
hma_vmx_invept_xcall(xc_arg_t arg1, xc_arg_t arg2, xc_arg_t arg3 __unused)
{
	int flag = (int)arg1;
	uintptr_t eptp = (uintptr_t)arg2;

	ASSERT(flag == INVEPT_SINGLE_CONTEXT || flag == INVEPT_ALL_CONTEXTS);

	VERIFY0(hma_vmx_do_invept(flag, eptp));
	return (0);
}

void
hma_vmx_invept_allcpus(uintptr_t eptp)
{
	int flag = -1;
	cpuset_t set;

	if ((hma_vmx_capabs & HVC_INVEPT_ONE) != 0) {
		flag = INVEPT_SINGLE_CONTEXT;
	} else if ((hma_vmx_capabs & HVC_INVEPT_ALL) != 0) {
		flag = INVEPT_ALL_CONTEXTS;
		eptp = 0;
	} else {
		return;
	}

	cpuset_zero(&set);
	mutex_enter(&cpu_lock);

	cpuset_or(&set, &cpu_active_set);
	xc_call((xc_arg_t)flag, (xc_arg_t)eptp, 0, CPUSET2BV(set),
	    hma_vmx_invept_xcall);

	mutex_exit(&cpu_lock);
}

static int
hma_vmx_cpu_vmxon(xc_arg_t arg1 __unused, xc_arg_t arg2 __unused,
    xc_arg_t arg3 __unused)
{
	uint64_t fctrl;
	const processorid_t id = CPU->cpu_seqid;
	hma_vmx_cpu_t *vmx_cpu = hma_vmx_cpu(id);

	VERIFY(vmx_cpu->hvc_vmxon_page != NULL);
	VERIFY(vmx_cpu->hvc_vmxon_pa != 0);

	/*
	 * Ensure that the VMX support and lock bits are enabled in the
	 * feature-control MSR.
	 */
	fctrl = rdmsr(MSR_IA32_FEAT_CTRL);
	if ((fctrl & IA32_FEAT_CTRL_LOCK) == 0 ||
	    (fctrl & IA32_FEAT_CTRL_VMX_EN) == 0) {
		fctrl = fctrl | IA32_FEAT_CTRL_VMX_EN | IA32_FEAT_CTRL_LOCK;
		wrmsr(MSR_IA32_FEAT_CTRL, fctrl);
	}

	setcr4(getcr4() | CR4_VMXE);

	if (hma_vmx_vmxon(vmx_cpu->hvc_vmxon_pa) == 0) {
		hma_cpu[id].hc_status = HCS_READY;
	} else {
		hma_cpu[id].hc_status = HCS_ERROR;

		/*
		 * If VMX has already been marked active and available for
		 * the system, then failure to perform VMXON on a
		 * newly-onlined CPU represents a fatal problem.  Continuing
		 * on would mean failure for any hypervisor thread which
		 * landed here.
		 */
		if (hma_vmx_ready) {
			panic("VMXON failure after VMX marked ready");
		}
	}
	return (0);
}

static int
hma_vmx_cpu_setup(cpu_setup_t what, int id, void *arg __unused)
{
	hma_vmx_cpu_t *vmx_cpu = hma_vmx_cpu(id);

	ASSERT(MUTEX_HELD(&cpu_lock));
	ASSERT(id >= 0 && id < NCPU);

	if (what != CPU_ON) {
		/*
		 * For the purposes of VMX setup, only the CPU_ON event is of
		 * interest.  Letting VMX state linger on an offline CPU
		 * should not cause any harm.
		 *
		 * This logic assumes that any offlining activity is strictly
		 * administrative in nature and will not alter any existing
		 * configuration (such as %cr4 bits previously set).
		 */
		return (0);
	}

	const hma_cpu_status_t status = hma_cpu[id].hc_status;
	if (status == HCS_ERROR) {
		return (-1);
	}

	/* Allocate the VMXON page for this CPU, if not already done */
	if (vmx_cpu->hvc_vmxon_page == NULL) {
		caddr_t va;
		pfn_t pfn;

		va = kmem_alloc(PAGESIZE, KM_SLEEP);
		VERIFY0((uintptr_t)va & PAGEOFFSET);
		vmx_cpu->hvc_vmxon_page = va;

		/* Initialize the VMX revision field as expected */
		bcopy(&hma_vmx_revision, va, sizeof (hma_vmx_revision));

		/*
		 * Cache the physical address of the VMXON page rather than
		 * looking it up later when the potential blocking of
		 * hat_getpfnum would be less acceptable.
		 */
		pfn = hat_getpfnum(kas.a_hat, va);
		vmx_cpu->hvc_vmxon_pa = (pfn << PAGESHIFT);
	} else {
		VERIFY(vmx_cpu->hvc_vmxon_pa != 0);
	}

	if (status == HCS_UNINITIALIZED) {
		cpuset_t set;

		/* Activate VMX on this CPU */
		cpuset_zero(&set);
		cpuset_add(&set, id);
		xc_call(0, 0, 0, CPUSET2BV(set), hma_vmx_cpu_vmxon);
	} else {
		VERIFY3U(status, ==, HCS_READY);

		/*
		 * If an already-initialized CPU is going back online, perform
		 * an all-contexts invept to eliminate the possibility of
		 * cached EPT state causing issues.
		 */
		if ((hma_vmx_capabs & HVC_INVEPT_ALL) != 0) {
			cpuset_t set;

			cpuset_zero(&set);
			cpuset_add(&set, id);
			xc_call((xc_arg_t)INVEPT_ALL_CONTEXTS, 0, 0,
			    CPUSET2BV(set), hma_vmx_invept_xcall);
		}
	}

	return (hma_cpu[id].hc_status != HCS_READY);
}

/*
 * Determining the availability of VM execution controls is somewhat
 * different from conventional means, where one simply checks for asserted
 * bits in the MSR value.  Instead, these execution control MSRs are split
 * into two halves: the lower 32-bits indicating capabilities which can be
 * zeroed in the VMCS field and the upper 32-bits indicating capabilities
 * which can be set to one.
 *
 * It is described in detail in Appendix A.3 of SDM volume 3.
 */
#define	VMX_CTL_ONE_SETTING(val, flag)	\
	(((val) & ((uint64_t)(flag) << 32)) != 0)

static const char *
hma_vmx_query_details(void)
{
	boolean_t query_true_ctl = B_FALSE;
	uint64_t msr;

	/* The basic INS/OUTS functionality is cited as a necessary prereq */
	msr = rdmsr(MSR_IA32_VMX_BASIC);
	if ((msr & IA32_VMX_BASIC_INS_OUTS) == 0) {
		return ("VMX does not support INS/OUTS");
	}

	/* Record the VMX revision for later VMXON usage */
	hma_vmx_revision = (uint32_t)msr;

	/*
	 * Bit 55 in the VMX_BASIC MSR determines how VMX control information
	 * can be queried.
	 */
	query_true_ctl = (msr & IA32_VMX_BASIC_TRUE_CTRLS) != 0;

	/* Check for EPT and VPID support */
	msr = rdmsr(query_true_ctl ?
	    MSR_IA32_VMX_TRUE_PROCBASED_CTLS : MSR_IA32_VMX_PROCBASED_CTLS);
	if (VMX_CTL_ONE_SETTING(msr, IA32_VMX_PROCBASED_2ND_CTLS)) {
		msr = rdmsr(MSR_IA32_VMX_PROCBASED2_CTLS);
		if (VMX_CTL_ONE_SETTING(msr, IA32_VMX_PROCBASED2_EPT)) {
			hma_vmx_capabs |= HVC_EPT;
		}
		if (VMX_CTL_ONE_SETTING(msr, IA32_VMX_PROCBASED2_VPID)) {
			hma_vmx_capabs |= HVC_VPID;
		}
	}

	/* Check for INVEPT support */
	if ((hma_vmx_capabs & HVC_EPT) != 0) {
		msr = rdmsr(MSR_IA32_VMX_EPT_VPID_CAP);
		if ((msr & IA32_VMX_EPT_VPID_INVEPT) != 0) {
			if ((msr & IA32_VMX_EPT_VPID_INVEPT_SINGLE) != 0) {
				hma_vmx_capabs |= HVC_INVEPT_ONE;
			}
			if ((msr & IA32_VMX_EPT_VPID_INVEPT_ALL) != 0) {
				hma_vmx_capabs |= HVC_INVEPT_ALL;
			}
		}
	}

	return (NULL);
}

static int
hma_vmx_init(void)
{
	cpu_t *cp;
	uint64_t msr;
	int err = 0;
	const char *msg = NULL;

	if (!is_x86_feature(x86_featureset, X86FSET_VMX)) {
		msg = "CPU does not support VMX";
		goto bail;
	}

	/* Has the BIOS set the feature-control lock bit without VMX enabled? */
	msr = rdmsr(MSR_IA32_FEAT_CTRL);
	if ((msr & IA32_FEAT_CTRL_LOCK) != 0 &&
	    (msr & IA32_FEAT_CTRL_VMX_EN) == 0) {
		msg = "VMX support disabled by BIOS";
		goto bail;
	}

	msg = hma_vmx_query_details();
	if (msg != NULL) {
		goto bail;
	}

	mutex_enter(&cpu_lock);
	/* Perform VMX configuration for already-online CPUs. */
	cp = cpu_active;
	do {
		err = hma_vmx_cpu_setup(CPU_ON, cp->cpu_seqid, NULL);
		if (err != 0) {
			msg = "failure during VMXON setup";
			mutex_exit(&cpu_lock);
			goto bail;
		}
	} while ((cp = cp->cpu_next_onln) != cpu_active);

	/*
	 * Register callback for later-onlined CPUs and perform other
	 * remaining resource allocation.
	 */
	register_cpu_setup_func(hma_vmx_cpu_setup, NULL);
	mutex_exit(&cpu_lock);

	hma_vmx_vpid = id_space_create("hma_vmx_vpid", HMA_VPID_RESERVED + 1,
	    UINT16_MAX);
	hma_vmx_ready = B_TRUE;

	return (0);

bail:
	hma_vmx_error = msg;
	cmn_err(CE_NOTE, "!hma_vmx_init: %s", msg);
	return (-1);
}

#define	VMCB_FLUSH_NOTHING	0x0
#define	VMCB_FLUSH_ALL		0x1
#define	VMCB_FLUSH_ASID		0x3

void
hma_svm_asid_init(hma_svm_asid_t *vcp)
{
	/*
	 * Initialize the generation to 0, forcing an ASID allocation on
	 * first entry.  Leave the ASID at 0, so if the host forgoes the call
	 * to hma_svm_asid_update(), SVM will bail on the invalid vcpu state.
	 */
	vcp->hsa_gen = 0;
	vcp->hsa_asid = 0;
}

uint8_t
hma_svm_asid_update(hma_svm_asid_t *vcp, boolean_t flush_by_asid,
    boolean_t npt_flush)
{
	/*
	 * Most ASID resource updates are expected to be performed as part of
	 * VMM entry into guest context, where interrupts would be disabled
	 * for the sake of state consistency.
	 *
	 * We demand this be the case, even though other situations which
	 * might incur an ASID update, such as userspace manipulation of
	 * guest vCPU state, may not require such consistency.
	 */
	ASSERT(!interrupts_enabled());

	/*
	 * If NPT changes dictate a TLB flush and by-ASID flushing is not
	 * supported/used, force a fresh ASID allocation.
	 */
	if (npt_flush && !flush_by_asid) {
		vcp->hsa_gen = 0;
	}

	hma_svm_asid_t *hcp = &(hma_svm_cpu(CPU->cpu_seqid)->hsc_asid);
	if (vcp->hsa_gen != hcp->hsa_gen) {
		hcp->hsa_asid++;

		if (hcp->hsa_asid >= hma_svm_max_asid) {
			/* Keep the ASID properly constrained */
			hcp->hsa_asid = 1;
			hcp->hsa_gen++;
			if (hcp->hsa_gen == 0) {
				/*
				 * Stay clear of the '0' sentinel value for
				 * generation, if wrapping around.
				 */
				hcp->hsa_gen = 1;
			}
		}
		vcp->hsa_gen = hcp->hsa_gen;
		vcp->hsa_asid = hcp->hsa_asid;

		ASSERT(vcp->hsa_asid != 0);
		ASSERT3U(vcp->hsa_asid, <, hma_svm_max_asid);

		if (flush_by_asid) {
			return (VMCB_FLUSH_ASID);
		} else {
			return (VMCB_FLUSH_ALL);
		}
	} else if (npt_flush) {
		ASSERT(flush_by_asid);
		return (VMCB_FLUSH_ASID);
	}

	return (VMCB_FLUSH_NOTHING);
}

void
hma_svm_gif_disable(void)
{
	/*
	 * Clear the GIF (masking interrupts) first, so the subsequent
	 * housekeeping can be done under its protection.
	 */
	__asm__ __volatile__("clgi");

	hma_svm_cpu_t *svm_cpu = hma_svm_cpu(CPU->cpu_seqid);
	const uint_t old_gif = atomic_swap_uint(&svm_cpu->hsc_gif_disabled, 1);

	if (old_gif != 0) {
		panic("GIF disable is set when expected to be clear");
	}
}

void
hma_svm_gif_enable(void)
{
	hma_svm_cpu_t *svm_cpu = hma_svm_cpu(CPU->cpu_seqid);
	const uint_t old_gif = atomic_swap_uint(&svm_cpu->hsc_gif_disabled, 0);

	if (old_gif == 0) {
		panic("GIF disable is clear when expected to be set");
	}

	/*
	 * Set the GIF (un-masking interrupts) last, so the housekeeping will
	 * have been completed under its protection.
	 */
	__asm__ __volatile__("stgi");
}

boolean_t
hma_svm_gif_is_disabled(void)
{
	hma_svm_cpu_t *svm_cpu = hma_svm_cpu(CPU->cpu_seqid);

	/*
	 * At the time of this writing, there exists no mechanism by which
	 * the state of the GIF on a CPU can be directly queried.  Rather
	 * than attempting an indirect means of checking its state, we track
	 * it manually through the HMA disable/enable functions.
	 */
	return (svm_cpu->hsc_gif_disabled != 0);
}

#define	EVTSEL_EN(evt)	(((evt) & AMD_PERF_EVTSEL_CTR_EN) != 0)
#define	CPC_BASE_REGS	4
#define	CPC_EXTD_REGS	6
#define	MSR_CPC_EXTD_EVTSEL(idx)	(MSR_AMD_F15H_PERF_EVTSEL0 + (idx * 2))
#define	MSR_CPC_EXTD_CTR(idx)		(MSR_AMD_F15H_PERF_CTR0 + (idx * 2))

/*
 * AMD CPU Performance Counter Support
 *
 * This provides a means of safely saving/loading host CPC state, along with
 * loading/saving guest CPC state upon guest entry/exit (respectively).
 * Currently, this only supports the 6 "extended" performance counters
 * (in MSRs C0010200h - C001020bh).  It pays no heed to any other CPC state
 * such as the Northbridge counters or PerfMonV2 registers.
 */

hma_svm_cpc_res_t
hma_svm_cpc_enter(struct hma_svm_cpc_state *cpc_state)
{
	hma_svm_cpu_t *svm_cpu = hma_svm_cpu(CPU->cpu_seqid);

	ASSERT(!interrupts_enabled());

	svm_cpu->hsc_cpc_saved_flags = HCF_DISABLED;

	const hma_cpc_flags_t req_flags =
	    cpc_state->hscs_flags & hma_svm_cpc_allowed;
	if (req_flags == HCF_DISABLED) {
		return (HSCR_EMPTY);
	}

	/* Extended regs should not be enabled without base */
	IMPLY((req_flags & HCF_EN_EXTD) != 0, (req_flags & HCF_EN_BASE) != 0);

	const uint_t max_guest_reg =
	    (req_flags & HCF_EN_EXTD) != 0 ? CPC_EXTD_REGS : CPC_BASE_REGS;
	uint_t guest_active = 0;
	for (uint_t i = 0; i < max_guest_reg; i++) {
		if (EVTSEL_EN(cpc_state->hscs_regs[i].hc_evtsel)) {
			guest_active++;
		}
	}

	/*
	 * Guest is not currently measuring with any of the CPCs, so leave
	 * any host counters in place.
	 */
	if (guest_active == 0) {
		return (HSCR_EMPTY);
	}

	/*
	 * Read (and save) the host evtsel values, counting the number of
	 * registers in active use.
	 */
	uint_t host_active = 0;
	for (uint_t i = 0; i < CPC_EXTD_REGS; i++) {
		const uint64_t evtsel = rdmsr(MSR_CPC_EXTD_EVTSEL(i));

		svm_cpu->hsc_cpc_host_regs[i].hc_evtsel = evtsel;
		if (EVTSEL_EN(evtsel)) {
			host_active++;
		}
	}

	if (host_active != 0) {
		if (hma_cpc_priority == HCP_HOST_WINS) {
			/*
			 * Host has priority access to the perf counters over
			 * the guest, so just leave everything in place.
			 */
			DTRACE_PROBE2(hma_svm__guest_deferred,
			    processorid_t, CPU->cpu_seqid,
			    uint_t, guest_active);
			return (HSCR_EMPTY);
		}

		DTRACE_PROBE2(hma_svm__host_deferred,
		    processorid_t, CPU->cpu_seqid, uint_t, host_active);

		/*
		 * Disable any active host counters, trying to do so in as
		 * consistent a manner as possible.
		 */
		for (uint_t i = 0; i < CPC_EXTD_REGS; i++) {
			const uint64_t evtsel =
			    svm_cpu->hsc_cpc_host_regs[i].hc_evtsel;
			wrmsr(MSR_CPC_EXTD_EVTSEL(i),
			    evtsel & ~AMD_PERF_EVTSEL_CTR_EN);
		}
	}

	/*
	 * With any active host counters stopped from collecting new events,
	 * save the counter values themselves before loading guest state.
	 */
	for (uint_t i = 0; i < CPC_EXTD_REGS; i++) {
		svm_cpu->hsc_cpc_host_regs[i].hc_ctr =
		    rdmsr(MSR_CPC_EXTD_CTR(i));
	}

	/*
	 * Now load the guest state, fixing it up with the flag necessary to
	 * collect events only while in guest context.
	 */
	for (uint_t i = 0; i < max_guest_reg; i++) {
		uint64_t evtsel = cpc_state->hscs_regs[i].hc_evtsel;

		/*
		 * Clear any existing HG flags, as well as any request for
		 * interrupt enable.  (Trapping the interrupt from guest
		 * counters is not presently supported.)
		 */
		evtsel &= ~(AMD_PERF_EVTSEL_HG_MASK | AMD_PERF_EVTSEL_INT_EN);
		/* And indicate guest-only event tracking */
		evtsel |= AMD_PERF_EVTSEL_HG_GUEST;

		wrmsr(MSR_CPC_EXTD_EVTSEL(i), evtsel);
		wrmsr(MSR_CPC_EXTD_CTR(i), cpc_state->hscs_regs[i].hc_ctr);
	}

	svm_cpu->hsc_cpc_saved_flags = req_flags;
	return (HSCR_ACCESS_RDPMC | HSCR_ACCESS_CTR_MSR);
}

void
hma_svm_cpc_exit(struct hma_svm_cpc_state *cpc_state)
{
	ASSERT(!interrupts_enabled());

	hma_svm_cpu_t *svm_cpu = hma_svm_cpu(CPU->cpu_seqid);

	const hma_cpc_flags_t saved_flags = svm_cpu->hsc_cpc_saved_flags;
	if (saved_flags == HCF_DISABLED) {
		return;
	}

	/* Save the guest counter values. */
	const uint_t max_guest_reg =
	    (saved_flags & HCF_EN_EXTD) != 0 ? CPC_EXTD_REGS : CPC_BASE_REGS;
	for (uint_t i = 0; i < max_guest_reg; i++) {
		cpc_state->hscs_regs[i].hc_ctr = rdmsr(MSR_CPC_EXTD_CTR(i));
	}

	/*
	 * Load the host values back, once again taking care to toggle the
	 * counter enable state as a separate step in an attempt to keep
	 * readings as consistent as possible.
	 */
	uint_t host_active = 0;
	for (uint_t i = 0; i < CPC_EXTD_REGS; i++) {
		const uint64_t evtsel = svm_cpu->hsc_cpc_host_regs[i].hc_evtsel;

		if (EVTSEL_EN(evtsel)) {
			host_active++;
		}
		wrmsr(MSR_CPC_EXTD_EVTSEL(i), evtsel & ~AMD_PERF_EVTSEL_CTR_EN);
		wrmsr(MSR_CPC_EXTD_CTR(i),
		    svm_cpu->hsc_cpc_host_regs[i].hc_ctr);
	}

	/*
	 * Allow any enabled host counters to collect events, now that all of
	 * the other state is loaded.
	 */
	if (host_active != 0) {
		for (uint_t i = 0; i < CPC_EXTD_REGS; i++) {
			wrmsr(MSR_CPC_EXTD_EVTSEL(i),
			    svm_cpu->hsc_cpc_host_regs[i].hc_evtsel);
		}
	}
}

static int
hma_svm_cpu_activate(xc_arg_t arg1 __unused, xc_arg_t arg2 __unused,
    xc_arg_t arg3 __unused)
{
	const processorid_t id = CPU->cpu_seqid;
	const uintptr_t hsave_pa = hma_svm_cpu(id)->hsc_hsave_pa;
	uint64_t efer;

	VERIFY(hsave_pa != 0);

	/* Enable SVM via EFER */
	efer = rdmsr(MSR_AMD_EFER);
	efer |= AMD_EFER_SVME;
	wrmsr(MSR_AMD_EFER, efer);

	/* Setup hsave area */
	wrmsr(MSR_AMD_VM_HSAVE_PA, hsave_pa);

	hma_cpu[id].hc_status = HCS_READY;
	return (0);
}

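/*
 * Prepare a CPU for SVM operation.  This is invoked for CPUs which are
 * already online when hma_svm_init() runs and, via
 * register_cpu_setup_func(), for CPUs which are configured or onlined later.
 */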
static int
hma_svm_cpu_setup(cpu_setup_t what, int id, void *arg __unused)
{
	hma_svm_cpu_t *svm_cpu = hma_svm_cpu(id);

	ASSERT(MUTEX_HELD(&cpu_lock));
	ASSERT(id >= 0 && id < NCPU);

	switch (what) {
	case CPU_CONFIG:
	case CPU_ON:
	case CPU_INIT:
		break;
	default:
		/*
		 * Other events, such as CPU offlining, are of no interest.
		 * Letting the SVM state linger should not cause any harm.
		 *
		 * This logic assumes that any offlining activity is strictly
		 * administrative in nature and will not alter any existing
		 * configuration (such as EFER bits previously set).
		 */
		return (0);
	}

	/* Perform initialization if it has not been previously attempted. */
	if (hma_cpu[id].hc_status != HCS_UNINITIALIZED) {
		return ((hma_cpu[id].hc_status == HCS_READY) ? 0 : -1);
	}

	/* Allocate the hsave page for this CPU */
	if (svm_cpu->hsc_hsave_page == NULL) {
		caddr_t va;
		pfn_t pfn;

		va = kmem_alloc(PAGESIZE, KM_SLEEP);
		VERIFY0((uintptr_t)va & PAGEOFFSET);
		svm_cpu->hsc_hsave_page = va;

		/*
		 * Cache the physical address of the hsave page rather than
		 * looking it up later when the potential blocking of
		 * hat_getpfnum would be less acceptable.
		 */
		pfn = hat_getpfnum(kas.a_hat, va);
		svm_cpu->hsc_hsave_pa = (pfn << PAGESHIFT);
	} else {
		VERIFY(svm_cpu->hsc_hsave_pa != 0);
	}

	kpreempt_disable();
	if (CPU->cpu_seqid == id) {
		/* Perform svm setup directly if this CPU is the target */
		(void) hma_svm_cpu_activate(0, 0, 0);
		kpreempt_enable();
	} else {
		cpuset_t set;

		/* Use a cross-call if a remote CPU is the target */
		kpreempt_enable();
		cpuset_zero(&set);
		cpuset_add(&set, id);
		xc_call(0, 0, 0, CPUSET2BV(set), hma_svm_cpu_activate);
	}

	return (hma_cpu[id].hc_status != HCS_READY);
}

static int
hma_svm_init(void)
{
	uint64_t msr;
	const char *msg = NULL;
	struct cpuid_regs regs;
	cpu_t *cp;

	if (!is_x86_feature(x86_featureset, X86FSET_SVM)) {
		msg = "CPU does not support SVM";
		goto bail;
	}

	msr = rdmsr(MSR_AMD_VM_CR);
	if ((msr & AMD_VM_CR_SVMDIS) != 0) {
		msg = "SVM disabled by BIOS";
		goto bail;
	}

	regs.cp_eax = 0x8000000a;
	(void) cpuid_insn(NULL, &regs);
	const uint32_t nasid = regs.cp_ebx;
	const uint32_t feat = regs.cp_edx;

	if (nasid == 0) {
		msg = "Not enough ASIDs for guests";
		goto bail;
	}
	if ((feat & CPUID_AMD_EDX_NESTED_PAGING) == 0) {
		msg = "CPU does not support nested paging";
		goto bail;
	}
	if ((feat & CPUID_AMD_EDX_NRIPS) == 0) {
		msg = "CPU does not support NRIP save";
		goto bail;
	}

	hma_svm_features = feat;
	hma_svm_max_asid = nasid;

	mutex_enter(&cpu_lock);
	/* Perform SVM configuration for already-online CPUs. */
	cp = cpu_active;
	do {
		int err = hma_svm_cpu_setup(CPU_ON, cp->cpu_seqid, NULL);
		if (err != 0) {
			msg = "failure during SVM setup";
			mutex_exit(&cpu_lock);
			goto bail;
		}
	} while ((cp = cp->cpu_next_onln) != cpu_active);

	/*
	 * Register callback for later-onlined CPUs and perform other
	 * remaining resource allocation.
	 */
	register_cpu_setup_func(hma_svm_cpu_setup, NULL);
	mutex_exit(&cpu_lock);

	/* Initialize per-CPU ASID state. */
	for (uint_t i = 0; i < NCPU; i++) {
		/*
		 * Skip past sentinel 0 value for generation.  Doing so for
		 * ASID is unneeded, since it will be incremented during the
		 * first allocation.
		 */
		hma_svm_asid_t *cpu_asid = &hma_svm_cpu(i)->hsc_asid;
		cpu_asid->hsa_gen = 1;
		cpu_asid->hsa_asid = 0;
	}

	/*
	 * For now, only expose performance counter support if the host
	 * supports "extended" counters.  This makes MSR access more
	 * consistent for logic handling that state.
	 */
	if (is_x86_feature(x86_featureset, X86FSET_AMD_PCEC)) {
		hma_svm_cpc_allowed = HCF_EN_BASE | HCF_EN_EXTD;
	}

	hma_svm_ready = B_TRUE;
	return (0);

bail:
	hma_svm_error = msg;
	cmn_err(CE_NOTE, "!hma_svm_init: %s", msg);
	return (-1);
}