/*-
 * SPDX-License-Identifier: BSD-2-Clause
 *
 * Copyright (c) 2011 NetApp, Inc.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */
/*
 * This file and its contents are supplied under the terms of the
 * Common Development and Distribution License ("CDDL"), version 1.0.
 * You may only use this file in accordance with the terms of version
 * 1.0 of the CDDL.
 *
 * A full copy of the text of the CDDL should have accompanied this
 * source.  A copy of the CDDL is also available via the Internet at
 * http://www.illumos.org/license/CDDL.
 */
/* This file is dual-licensed; see usr/src/contrib/bhyve/LICENSE */

/*
 * Copyright 2014 Pluribus Networks Inc.
 * Copyright 2018 Joyent, Inc.
 * Copyright 2025 Oxide Computer Company
 */

#include <sys/types.h>
#include <sys/stdbool.h>
#include <sys/errno.h>

#include <machine/md_var.h>
#include <machine/specialreg.h>

#include <machine/vmm.h>
#include <sys/vmm_kernel.h>

#include "vmm_host.h"
#include "vmm_util.h"
#include "vlapic.h"

/*
 * CPUID Emulation
 *
 * All CPUID instruction exits are handled by the in-kernel emulation.
 *
 * ----------------
 * Legacy Emulation
 * ----------------
 *
 * Originally, the kernel vmm portion of bhyve relied on fixed logic to filter
 * and/or generate CPUID results based on what was reported by the host CPU, as
 * well as attributes of the VM (such as CPU topology and enabled features).
 * This is largely adequate to expose CPU capabilities to the guest in a manner
 * which allows it to operate properly.
 *
 * ------------------------------
 * Userspace-Controlled Emulation
 * ------------------------------
 *
 * In certain situations, more control over the CPUID emulation results
 * presented to the guest is desired.  Live migration between physical hosts is
 * one such example, where the underlying CPUs, or at least their microcode,
 * may differ between the source and destination.  In such cases, where changes
 * to the CPUID results cannot be tolerated, the userspace portion of the VMM
 * can be in complete control over the leaves which are presented to the guest.
 * It may still consult the "legacy" CPUID data for guidance about which CPU
 * features are safe to expose (due to hypervisor limitations, etc).  This leaf
 * information is configured on a per-vCPU basis.
 *
 * The emulation entries provided by userspace are expected to be in sorted
 * order, running from lowest function and index to highest.
 *
 * For example:
 *	(func: 00h idx: 00h) ->
 *	    (flags: 0, eax: highest std leaf, ebx-edx: vendor id)
 *	(func: 0Dh idx: 00h) ->
 *	    (flags: VCE_FLAG_MATCH_INDEX, eax - edx: XCR0/XSAVE info)
 *	(func: 0Dh idx: 01h) ->
 *	    (flags: VCE_FLAG_MATCH_INDEX, eax - edx: XSAVE/XSAVEOPT details)
 *	...
 *	(func: 0Dh idx: 07h) ->
 *	    (flags: VCE_FLAG_MATCH_INDEX, eax - edx: AVX-512 details)
 *	(func: 80000000h idx: 0h) ->
 *	    (flags: 0, eax: highest extd leaf ...)
 *	...
 */

#define	CPUID_TYPE_MASK		0xf0000000
#define	CPUID_TYPE_STD		0x00000000
#define	CPUID_TYPE_EXTD		0x80000000

#define	CPUID_0000_0000		(0x0)
#define	CPUID_0000_0001		(0x1)
#define	CPUID_0000_0002		(0x2)
#define	CPUID_0000_0003		(0x3)
#define	CPUID_0000_0004		(0x4)
#define	CPUID_0000_0006		(0x6)
#define	CPUID_0000_0007		(0x7)
#define	CPUID_0000_000A		(0xA)
#define	CPUID_0000_000B		(0xB)
#define	CPUID_0000_000D		(0xD)
#define	CPUID_0000_000F		(0xF)
#define	CPUID_0000_0010		(0x10)
#define	CPUID_0000_0015		(0x15)
#define	CPUID_8000_0000		(0x80000000)
#define	CPUID_8000_0001		(0x80000001)
#define	CPUID_8000_0002		(0x80000002)
#define	CPUID_8000_0003		(0x80000003)
#define	CPUID_8000_0004		(0x80000004)
#define	CPUID_8000_0006		(0x80000006)
#define	CPUID_8000_0007		(0x80000007)
#define	CPUID_8000_0008		(0x80000008)
#define	CPUID_8000_001D		(0x8000001D)
#define	CPUID_8000_001E		(0x8000001E)

#define	CPUID_VM_HIGH		0x40000000

static const struct vcpu_cpuid_entry cpuid_empty_entry = { 0 };

/*
 * Given the CPUID configuration for a vCPU, locate the entry which matches the
 * provided function/index tuple.  The entries list is walked in order, and the
 * first valid match based on the function/index and flags will be emitted.
 *
 * If no match is found, but Intel-style fallback is configured, then the
 * highest standard leaf encountered will be emitted.
 */
static const struct vcpu_cpuid_entry *
cpuid_find_entry(const vcpu_cpuid_config_t *cfg, uint32_t func, uint32_t idx)
{
	const struct vcpu_cpuid_entry *last_std = NULL;
	const bool intel_fallback =
	    (cfg->vcc_flags & VCC_FLAG_INTEL_FALLBACK) != 0;
	bool matched_leaf = false;

	ASSERT0(cfg->vcc_flags & VCC_FLAG_LEGACY_HANDLING);

	for (uint_t i = 0; i < cfg->vcc_nent; i++) {
		const struct vcpu_cpuid_entry *ent = &cfg->vcc_entries[i];
		const bool ent_is_std =
		    (ent->vce_function & CPUID_TYPE_MASK) == CPUID_TYPE_STD;
		const bool ent_must_match_idx =
		    (ent->vce_flags & VCE_FLAG_MATCH_INDEX) != 0;

		if (ent_is_std) {
			/*
			 * Keep track of the last "standard" leaf for
			 * Intel-style fallback behavior.
			 *
			 * This does not currently account for the sub-leaf
			 * index matching behavior for fallback described in
			 * the SDM.  It is not clear if any consumers rely on
			 * such matching when encountering fallback.
			 */
			last_std = ent;
		}
		if (ent->vce_function == func) {
			if (ent->vce_index == idx || !ent_must_match_idx) {
				return (ent);
			}
			/*
			 * Make note of when the top-level leaf matches, even
			 * when the index does not.
			 */
			matched_leaf = true;
		} else if (ent->vce_function > func) {
			if ((ent->vce_function & CPUID_TYPE_MASK) ==
			    (func & CPUID_TYPE_MASK)) {
				/*
				 * We are beyond a valid leaf to match, but have
				 * not exceeded the maximum leaf for this "type"
				 * (standard, extended, hvm, etc), so return an
				 * empty entry.
				 */
				return (&cpuid_empty_entry);
			} else {
				/*
				 * Otherwise, we can stop now, having gone
				 * beyond the last entry which could match the
				 * target function in a sorted list.
				 */
				break;
			}
		}
	}

	if (matched_leaf || !intel_fallback) {
		return (&cpuid_empty_entry);
	} else {
		return (last_std);
	}
}

/*
 * Updates a previously-populated set of CPUID return values to account for the
 * runtime state of the executing vCPU, i.e., the values in its control
 * registers and MSRs that influence the values returned by the CPUID
 * instruction.
 *
 * This function does not account for "static" properties of the vCPU or VM,
 * such as the enablement of VM-wide features and capabilities (like x2APIC or
 * INVPCID support) or settings that vary only with the vCPU's ID (like the
 * values returned from its topology leaves).
 *
 * This function assumes that it is called from within VMRUN(), which
 * guarantees that the guest's FPU state is loaded.  This is required to obtain
 * the correct values for leaves whose values depend on the guest values of
 * %xcr0 and the IA32_XSS MSR.
 */
static void
cpuid_apply_runtime_reg_state(struct vm *vm, int vcpuid, uint32_t func,
    uint32_t index, uint32_t *eax, uint32_t *ebx, uint32_t *ecx, uint32_t *edx)
{
	uint64_t cr4;
	int error;
	unsigned int regs[4];

	switch (func) {
	case CPUID_0000_0001:
		/*
		 * If CPUID2_XSAVE is being advertised and the
		 * guest has set CR4_XSAVE, set CPUID2_OSXSAVE.
		 */
		*ecx &= ~CPUID2_OSXSAVE;
		if ((*ecx & CPUID2_XSAVE) != 0) {
			error = vm_get_register(vm, vcpuid,
			    VM_REG_GUEST_CR4, &cr4);
			VERIFY0(error);
			if ((cr4 & CR4_XSAVE) != 0) {
				*ecx |= CPUID2_OSXSAVE;
			}
		}

		/*
		 * AMD APM vol. 3 rev. 3.36 section E.3.2 notes that this bit
		 * is set only if the "APIC exists and is enabled."  Vol. 3 of
		 * the June 2024 Intel SDM notes in section 11.4.3 that "[t]he
		 * CPUID feature flag for the APIC ... is also set to 0" when
		 * the APIC enable bit is cleared.
		 */
		if (vlapic_hw_disabled(vm_lapic(vm, vcpuid))) {
			*edx &= ~CPUID_APIC;
		}
		break;

	case CPUID_0000_000D:
		/*
		 * Leaf D reports XSAVE area sizes that vary with the current
		 * value of %xcr0.  Since this function is called with %xcr0
		 * still set to its guest value, the easiest way to get the
		 * correct output is to execute CPUID on the host and copy out
		 * the relevant values.
		 */
		cpuid_count(func, index, regs);
		switch (index) {
		case 0:
			/*
			 * %eax, %ecx, and %edx return information about the
			 * complete set of features the processor supports,
			 * not just the ones that are enabled.  The caller is
			 * presumed to have set these already, so just update
			 * %ebx.
			 */
			*ebx = regs[1];
			break;
		case 1:
			/*
			 * Subleaf 1 reports the XSAVE area size required for
			 * features enabled in %xcr0 and the IA32_XSS MSR via
			 * %ebx.  As with subleaf 0, the caller is presumed to
			 * have set the other three output register values
			 * already.
			 *
			 * AMD APM vol. 3 rev. 3.36 and the June 2024 edition
			 * of volume 2 of the Intel SDM specify slightly
			 * different behavior here: the SDM says that the
			 * value returned in %ebx depends in part on whether
			 * %eax advertises XSAVEC and IA32_XSS support, but
			 * the APM does not.  To handle these cases:
			 *
			 * 1. If the guest isn't a VMX guest, just copy the
			 *    current reported save area size.
			 * 2. If both the XSAVEC and XSAVES bits are clear in
			 *    %eax, return a save area size of 0 in %ebx to
			 *    match the SDM description.
			 * 3. Otherwise, copy the host's reported save area
			 *    size.
			 *
			 * Note that, because XSAVES saves a superset of the
			 * state saved by XSAVEC, it's OK to report the host's
			 * save area size even if the host and guest report
			 * different feature bits in %eax:
			 *
			 * - If the host supports XSAVES and the guest
			 *   doesn't, the reported save area size will be too
			 *   large, but the guest can still use XSAVEC safely.
			 * - If the VM's explicit CPUID values advertise
			 *   XSAVES support, but the host doesn't support
			 *   XSAVES, the host's reported save area size will
			 *   still be large enough for the xcr0-controlled
			 *   state saved by XSAVEC.  The area will be
			 *   undersized for XSAVES, but this is OK because the
			 *   guest can't execute XSAVES anyway (it will #UD).
			 */
			if (!vmm_is_intel()) {
				*ebx = regs[1];
			} else {
				if ((*eax & (CPUID_EXTSTATE_XSAVEC |
				    CPUID_EXTSTATE_XSAVES)) == 0) {
					*ebx = 0;
				} else {
					*ebx = regs[1];
				}
			}
			break;
		default:
			/*
			 * Other subleaves of leaf D report the relative sizes
			 * and offsets of the state required for specific
			 * features in the relevant offset masks.  These don't
			 * depend on the current enabled features (only the
			 * supported ones), so no enabled-feature
			 * specialization is required.
			 */
			break;
		}
		break;
	}
}

/*
 * Emulates the CPUID instruction on the specified vCPU and returns its outputs
 * in the rax/rbx/rcx/rdx variables.
 *
 * This function assumes it is called from within VMRUN(), which guarantees
 * that certain guest state (e.g. FPU state) remains loaded.
 */
void
vcpu_emulate_cpuid(struct vm *vm, int vcpuid, uint64_t *rax, uint64_t *rbx,
    uint64_t *rcx, uint64_t *rdx)
{
	const vcpu_cpuid_config_t *cfg = vm_cpuid_config(vm, vcpuid);
	uint32_t func, index;

	ASSERT3P(rax, !=, NULL);
	ASSERT3P(rbx, !=, NULL);
	ASSERT3P(rcx, !=, NULL);
	ASSERT3P(rdx, !=, NULL);

	uint32_t regs[4] = { *rax, 0, *rcx, 0 };
	func = (uint32_t)*rax;
	index = (uint32_t)*rcx;

	/* Fall back to legacy handling if specified */
	if ((cfg->vcc_flags & VCC_FLAG_LEGACY_HANDLING) != 0) {
		legacy_emulate_cpuid(vm, vcpuid, &regs[0], &regs[1], &regs[2],
		    &regs[3]);
	} else {
		const struct vcpu_cpuid_entry *ent = cpuid_find_entry(cfg,
		    func, index);
		ASSERT(ent != NULL);

		/*
		 * The function and index in the found entry may differ from
		 * what the guest requested (if the entry was chosen via the
		 * "highest leaf" fallback described above).
		 * Use the values from the entry to ensure that the correct
		 * vCPU state fixups get applied below.
		 *
		 * The found entry may also be an all-zero empty entry (if the
		 * requested leaf is invalid but is less than the maximum
		 * valid leaf).  It's OK to fall through in this case because
		 * leaf 0 never has any CPU state-based fixups to apply.
		 */
		func = ent->vce_function;
		index = ent->vce_index;
		regs[0] = ent->vce_eax;
		regs[1] = ent->vce_ebx;
		regs[2] = ent->vce_ecx;
		regs[3] = ent->vce_edx;
	}

	/* Fix up any returned values that vary with guest register state. */
	cpuid_apply_runtime_reg_state(vm, vcpuid, func, index, &regs[0],
	    &regs[1], &regs[2], &regs[3]);

	/* CPUID clears the upper 32-bits of the long-mode registers. */
	*rax = regs[0];
	*rbx = regs[1];
	*rcx = regs[2];
	*rdx = regs[3];
}

/*
 * Get the current CPUID emulation configuration for this vCPU.
 *
 * Only the existing flags will be emitted if the vCPU is configured for
 * legacy operation via the VCC_FLAG_LEGACY_HANDLING flag.  If in
 * userspace-controlled mode, then we will attempt to copy the existing
 * entries into vcc_entries, whose size is specified by vcc_nent.
 *
 * Regardless of whether vcc_entries is adequately sized (or even present),
 * vcc_nent will be set to the number of existing entries.
 */
int
vm_get_cpuid(struct vm *vm, int vcpuid, vcpu_cpuid_config_t *res)
{
	if (vcpuid < 0 || vcpuid > VM_MAXCPU) {
		return (EINVAL);
	}

	const vcpu_cpuid_config_t *src = vm_cpuid_config(vm, vcpuid);
	if (src->vcc_nent > res->vcc_nent) {
		res->vcc_nent = src->vcc_nent;
		return (E2BIG);
	} else if (src->vcc_nent != 0) {
		bcopy(src->vcc_entries, res->vcc_entries,
		    src->vcc_nent * sizeof (struct vcpu_cpuid_entry));
	}
	res->vcc_flags = src->vcc_flags;
	res->vcc_nent = src->vcc_nent;
	return (0);
}

/*
 * Set the CPUID emulation configuration for this vCPU.
 *
 * If VCC_FLAG_LEGACY_HANDLING is set in vcc_flags, then vcc_nent is expected
 * to be set to 0, as configuring a list of entries would be useless when
 * using the legacy handling.
 *
 * Any existing entries which are configured are freed, and the newly provided
 * ones will be copied into their place.
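 *
 * The supplied configuration is validated in full before any existing
 * entries are freed or replaced.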
 */
int
vm_set_cpuid(struct vm *vm, int vcpuid, const vcpu_cpuid_config_t *src)
{
	if (vcpuid < 0 || vcpuid > VM_MAXCPU) {
		return (EINVAL);
	}
	if (src->vcc_nent > VMM_MAX_CPUID_ENTRIES) {
		return (EINVAL);
	}
	if ((src->vcc_flags & ~VCC_FLAGS_VALID) != 0) {
		return (EINVAL);
	}
	if ((src->vcc_flags & VCC_FLAG_LEGACY_HANDLING) != 0 &&
	    src->vcc_nent != 0) {
		/* No entries should be provided if using legacy handling */
		return (EINVAL);
	}
	for (uint_t i = 0; i < src->vcc_nent; i++) {
		/* Ensure all entries carry valid flags */
		if ((src->vcc_entries[i].vce_flags & ~VCE_FLAGS_VALID) != 0) {
			return (EINVAL);
		}
	}

	vcpu_cpuid_config_t *cfg = vm_cpuid_config(vm, vcpuid);

	/* Free any existing entries first */
	vcpu_cpuid_cleanup(cfg);

	/* Copy supplied entries into freshly allocated space */
	if (src->vcc_nent != 0) {
		const size_t entries_sz =
		    src->vcc_nent * sizeof (struct vcpu_cpuid_entry);

		cfg->vcc_nent = src->vcc_nent;
		cfg->vcc_entries = kmem_alloc(entries_sz, KM_SLEEP);
		bcopy(src->vcc_entries, cfg->vcc_entries, entries_sz);
	}
	cfg->vcc_flags = src->vcc_flags;

	return (0);
}

void
vcpu_cpuid_init(vcpu_cpuid_config_t *cfg)
{
	/* Default to legacy-style handling */
	cfg->vcc_flags = VCC_FLAG_LEGACY_HANDLING;
	cfg->vcc_nent = 0;
	cfg->vcc_entries = NULL;
}

void
vcpu_cpuid_cleanup(vcpu_cpuid_config_t *cfg)
{
	if (cfg->vcc_nent != 0) {
		ASSERT3P(cfg->vcc_entries, !=, NULL);

		kmem_free(cfg->vcc_entries,
		    cfg->vcc_nent * sizeof (struct vcpu_cpuid_entry));

		cfg->vcc_nent = 0;
		cfg->vcc_entries = NULL;
	}
}

static const char bhyve_id[12] = "bhyve bhyve ";

/*
 * Force exposition of the invariant TSC capability, regardless of whether the
 * host CPU reports having it.
 */
static int vmm_force_invariant_tsc = 0;

/*
 * CPUID instruction Fn0000_0001:
 */
#define	CPUID_0000_0001_APICID_SHIFT	24

/*
 * Compute ceil(log2(x)).  Returns -1 if x is zero.
 */
static __inline int
log2(uint_t x)
{
	return (x == 0 ? -1 : fls(x - 1));
}

/*
 * The "legacy" bhyve cpuid emulation, which largely applies statically defined
 * masks to the data provided by the host CPU.
 */
void
legacy_emulate_cpuid(struct vm *vm, int vcpu_id, uint32_t *eax, uint32_t *ebx,
    uint32_t *ecx, uint32_t *edx)
{
	const struct xsave_limits *limits;
	int error, enable_invpcid, level, width = 0, x2apic_id = 0;
	unsigned int func, regs[4], logical_cpus = 0, param;
	enum x2apic_state x2apic_state;
	uint16_t cores, maxcpus, sockets, threads;

	/*
	 * The function of CPUID is controlled through the provided value of
	 * %eax (and secondarily %ecx, for certain leaf data).
	 */
	func = (uint32_t)*eax;
	param = (uint32_t)*ecx;

	/*
	 * Requests for invalid CPUID levels should map to the highest
	 * available level instead.
	 */
	if (cpu_exthigh != 0 && func >= 0x80000000) {
		if (func > cpu_exthigh)
			func = cpu_exthigh;
	} else if (func >= 0x40000000) {
		if (func > CPUID_VM_HIGH)
			func = CPUID_VM_HIGH;
	} else if (func > cpu_high) {
		func = cpu_high;
	}

	/*
	 * In general the approach used for CPU topology is to
	 * advertise a flat topology where all CPUs are packages with
	 * no multi-core or SMT.
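	 *
	 * Several of the leaves handled below consult the configured VM
	 * topology (via vm_get_topology()) when describing cores and
	 * threads to the guest.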
	 */
	switch (func) {
		/*
		 * Pass these through to the guest
		 */
	case CPUID_0000_0000:
	case CPUID_0000_0002:
	case CPUID_0000_0003:
	case CPUID_8000_0000:
	case CPUID_8000_0002:
	case CPUID_8000_0003:
	case CPUID_8000_0004:
	case CPUID_8000_0006:
		cpuid_count(func, param, regs);
		break;
	case CPUID_8000_0008:
		cpuid_count(func, param, regs);
		if (vmm_is_svm()) {
			/*
			 * As on Intel (0000_0007:0, EDX), mask out
			 * unsupported or unsafe AMD extended features
			 * (8000_0008 EBX).
			 */
			regs[1] &= (AMDFEID_CLZERO | AMDFEID_IRPERF |
			    AMDFEID_XSAVEERPTR);

			vm_get_topology(vm, &sockets, &cores, &threads,
			    &maxcpus);
			/*
			 * Here, width is ApicIdCoreIdSize, present on
			 * at least Family 15h and newer.  It
			 * represents the "number of bits in the
			 * initial apicid that indicate thread id
			 * within a package."
			 *
			 * Our topo_probe_amd() uses it for
			 * pkg_id_shift and other OSes may rely on it.
			 */
			width = MIN(0xF, log2(threads * cores));
			if (width < 0x4)
				width = 0;
			logical_cpus = MIN(0xFF, threads * cores - 1);
			regs[2] = (width << AMDID_COREID_SIZE_SHIFT) |
			    logical_cpus;
		}
		break;

	case CPUID_8000_0001:
		cpuid_count(func, param, regs);

		/*
		 * Hide SVM from guest.
		 */
		regs[2] &= ~AMDID2_SVM;

		/*
		 * Don't advertise extended performance counter MSRs
		 * to the guest.
		 */
		regs[2] &= ~AMDID2_PCXC;
		regs[2] &= ~AMDID2_PNXC;
		regs[2] &= ~AMDID2_PTSCEL2I;

		/*
		 * Don't advertise Instruction Based Sampling feature.
		 */
		regs[2] &= ~AMDID2_IBS;

		/* NodeID MSR not available */
		regs[2] &= ~AMDID2_NODE_ID;

		/* Don't advertise the OS visible workaround feature */
		regs[2] &= ~AMDID2_OSVW;

		/* Hide mwaitx/monitorx capability from the guest */
		regs[2] &= ~AMDID2_MWAITX;

#ifndef __FreeBSD__
		/*
		 * Detection routines for TCE and FFXSR are missing
		 * from our vm_cpuid_capability() detection logic
		 * today.  Mask them out until that is remedied.
		 * They do not appear to be in common usage, so their
		 * absence should not cause undue trouble.
		 */
		regs[2] &= ~AMDID2_TCE;
		regs[3] &= ~AMDID_FFXSR;
#endif

		/*
		 * Hide rdtscp/ia32_tsc_aux until we know how
		 * to deal with them.
		 */
		regs[3] &= ~AMDID_RDTSCP;
		break;

	case CPUID_8000_0007:
		cpuid_count(func, param, regs);
		/*
		 * AMD uses this leaf to advertise the processor's
		 * power monitoring and RAS capabilities.  These
		 * features are hardware-specific and exposing
		 * them to a guest doesn't make a lot of sense.
		 *
		 * Intel uses this leaf only to advertise the
		 * "Invariant TSC" feature with all other bits
		 * being reserved (set to zero).
		 */
		regs[0] = 0;
		regs[1] = 0;
		regs[2] = 0;

		/*
		 * If the host system possesses an invariant TSC, then
		 * it is safe to expose to the guest.
		 *
		 * If there is measured skew between host TSCs, it will
		 * be properly offset so guests do not observe any
		 * change between CPU migrations.
		 */
		regs[3] &= AMDPM_TSC_INVARIANT;

		/*
		 * Since illumos avoids deep C-states on CPUs which do
		 * not support an invariant TSC, it may be safe (and
		 * desired) to unconditionally expose that capability to
		 * the guest.
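		 *
		 * That behavior is controlled by the
		 * vmm_force_invariant_tsc tunable defined above.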
		 */
		if (vmm_force_invariant_tsc != 0) {
			regs[3] |= AMDPM_TSC_INVARIANT;
		}
		break;

	case CPUID_8000_001D:
		/* AMD Cache topology, like 0000_0004 for Intel. */
		if (!vmm_is_svm())
			goto default_leaf;

		/*
		 * Similar to Intel, generate a fictitious cache
		 * topology for the guest with L3 shared by the
		 * package, and L1 and L2 local to a core.
		 */
		vm_get_topology(vm, &sockets, &cores, &threads,
		    &maxcpus);
		switch (param) {
		case 0:
			logical_cpus = threads;
			level = 1;
			func = 1;	/* data cache */
			break;
		case 1:
			logical_cpus = threads;
			level = 2;
			func = 3;	/* unified cache */
			break;
		case 2:
			logical_cpus = threads * cores;
			level = 3;
			func = 3;	/* unified cache */
			break;
		default:
			logical_cpus = 0;
			level = 0;
			func = 0;
			break;
		}

		if (level == 0) {
			regs[0] = 0;
			regs[1] = 0;
		} else {
			logical_cpus = MIN(0xfff, logical_cpus - 1);
			regs[0] = (logical_cpus << 14) | (1 << 8) |
			    (level << 5) | func;
			regs[1] = func > 0 ? _CACHE_LINE_SIZE - 1 : 0;
		}
		regs[2] = 0;
		regs[3] = 0;
		break;

	case CPUID_8000_001E:
		/*
		 * AMD Family 16h+ and Hygon Family 18h additional
		 * identifiers.
		 */
		if (!vmm_is_svm() || CPUID_TO_FAMILY(cpu_id) < 0x16)
			goto default_leaf;

		vm_get_topology(vm, &sockets, &cores, &threads,
		    &maxcpus);
		regs[0] = vcpu_id;
		threads = MIN(0xFF, threads - 1);
		regs[1] = (threads << 8) |
		    (vcpu_id >> log2(threads + 1));
		/*
		 * XXX Bhyve topology cannot yet represent >1 node per
		 * processor.
		 */
		regs[2] = 0;
		regs[3] = 0;
		break;

	case CPUID_0000_0001:
		do_cpuid(1, regs);

		error = vm_get_x2apic_state(vm, vcpu_id, &x2apic_state);
		VERIFY0(error);

		/*
		 * Override the APIC ID only in ebx
		 */
		regs[1] &= ~(CPUID_LOCAL_APIC_ID);
		regs[1] |= (vcpu_id << CPUID_0000_0001_APICID_SHIFT);

		/*
		 * Don't expose VMX, SpeedStep, TME or SMX capability.
		 * Advertise x2APIC capability and Hypervisor guest.
		 */
		regs[2] &= ~(CPUID2_VMX | CPUID2_EST | CPUID2_TM2);
		regs[2] &= ~(CPUID2_SMX);

		regs[2] |= CPUID2_HV;

		if (x2apic_state != X2APIC_DISABLED)
			regs[2] |= CPUID2_X2APIC;
		else
			regs[2] &= ~CPUID2_X2APIC;

		/*
		 * Only advertise CPUID2_XSAVE in the guest if
		 * the host is using XSAVE.
		 */
		if (!(regs[2] & CPUID2_OSXSAVE))
			regs[2] &= ~CPUID2_XSAVE;

		/*
		 * Hide monitor/mwait until we know how to deal with
		 * these instructions.
		 */
		regs[2] &= ~CPUID2_MON;

		/*
		 * Hide the performance and debug features.
		 */
		regs[2] &= ~CPUID2_PDCM;

		/*
		 * No TSC deadline support in the APIC yet
		 */
		regs[2] &= ~CPUID2_TSCDLT;

		/*
		 * Hide thermal monitoring
		 */
		regs[3] &= ~(CPUID_ACPI | CPUID_TM);

		/*
		 * Hide the debug store capability.
		 */
		regs[3] &= ~CPUID_DS;

		/*
		 * Advertise the Machine Check and MTRR capability.
		 *
		 * Some guest OSes (e.g. Windows) will not boot if
		 * these features are absent.
		 */
		regs[3] |= (CPUID_MCA | CPUID_MCE | CPUID_MTRR);

		vm_get_topology(vm, &sockets, &cores, &threads,
		    &maxcpus);
		logical_cpus = threads * cores;
		regs[1] &= ~CPUID_HTT_CORES;
		regs[1] |= (logical_cpus & 0xff) << 16;
		regs[3] |= CPUID_HTT;
		break;

	case CPUID_0000_0004:
		cpuid_count(func, param, regs);

		if (regs[0] || regs[1] || regs[2] || regs[3]) {
			vm_get_topology(vm, &sockets, &cores, &threads,
			    &maxcpus);
			regs[0] &= 0x3ff;
			regs[0] |= (cores - 1) << 26;
			/*
			 * Cache topology:
			 * - L1 and L2 are shared only by the logical
			 *   processors in a single core.
			 * - L3 and above are shared by all logical
			 *   processors in the package.
			 */
			logical_cpus = threads;
			level = (regs[0] >> 5) & 0x7;
			if (level >= 3)
				logical_cpus *= cores;
			regs[0] |= (logical_cpus - 1) << 14;
		}
		break;

	case CPUID_0000_0007:
		regs[0] = 0;
		regs[1] = 0;
		regs[2] = 0;
		regs[3] = 0;

		/* leaf 0 */
		if (param == 0) {
			cpuid_count(func, param, regs);

			/* Only leaf 0 is supported */
			regs[0] = 0;

			/*
			 * Expose known-safe features.
			 */
			regs[1] &= CPUID_STDEXT_FSGSBASE |
			    CPUID_STDEXT_BMI1 | CPUID_STDEXT_HLE |
			    CPUID_STDEXT_AVX2 | CPUID_STDEXT_SMEP |
			    CPUID_STDEXT_BMI2 |
			    CPUID_STDEXT_ERMS | CPUID_STDEXT_RTM |
			    CPUID_STDEXT_AVX512F |
			    CPUID_STDEXT_AVX512DQ |
			    CPUID_STDEXT_RDSEED |
			    CPUID_STDEXT_SMAP |
			    CPUID_STDEXT_AVX512PF |
			    CPUID_STDEXT_AVX512ER |
			    CPUID_STDEXT_AVX512CD | CPUID_STDEXT_SHA |
			    CPUID_STDEXT_AVX512BW |
			    CPUID_STDEXT_AVX512VL;
			regs[2] &= CPUID_STDEXT2_VAES |
			    CPUID_STDEXT2_VPCLMULQDQ;
			regs[3] &= CPUID_STDEXT3_MD_CLEAR;

			/* Advertise INVPCID if it is enabled. */
			error = vm_get_capability(vm, vcpu_id,
			    VM_CAP_ENABLE_INVPCID, &enable_invpcid);
			if (error == 0 && enable_invpcid)
				regs[1] |= CPUID_STDEXT_INVPCID;
		}
		break;

	case CPUID_0000_0006:
		regs[0] = CPUTPM1_ARAT;
		regs[1] = 0;
		regs[2] = 0;
		regs[3] = 0;
		break;

	case CPUID_0000_000A:
		/*
		 * Handle the access, but report 0 for
		 * all options
		 */
		regs[0] = 0;
		regs[1] = 0;
		regs[2] = 0;
		regs[3] = 0;
		break;

	case CPUID_0000_000B:
		/*
		 * Intel processor topology enumeration
		 */
		if (vmm_is_intel()) {
			vm_get_topology(vm, &sockets, &cores, &threads,
			    &maxcpus);
			if (param == 0) {
				logical_cpus = threads;
				width = log2(logical_cpus);
				level = CPUID_TYPE_SMT;
				x2apic_id = vcpu_id;
			}

			if (param == 1) {
				logical_cpus = threads * cores;
				width = log2(logical_cpus);
				level = CPUID_TYPE_CORE;
				x2apic_id = vcpu_id;
			}

			if (param >= 2) {
				width = 0;
				logical_cpus = 0;
				level = 0;
				x2apic_id = 0;
			}

			regs[0] = width & 0x1f;
			regs[1] = logical_cpus & 0xffff;
			regs[2] = (level << 8) | (param & 0xff);
			regs[3] = x2apic_id;
		} else {
			regs[0] = 0;
			regs[1] = 0;
			regs[2] = 0;
			regs[3] = 0;
		}
		break;

	case CPUID_0000_000D:
		limits = vmm_get_xsave_limits();
		if (!limits->xsave_enabled) {
			regs[0] = 0;
			regs[1] = 0;
			regs[2] = 0;
			regs[3] = 0;
			break;
		}

		cpuid_count(func, param, regs);
		switch (param) {
		case 0:
			/*
			 * Only permit the guest to use bits that are
			 * active in the host in %xcr0.
			 * Also, claim that the maximum save area size
			 * is equivalent to the host's current save
			 * area size.  Since this runs "inside" of
			 * vmrun(), it runs with the guest's xcr0, so
			 * the current save area size is correct as-is.
			 */
			regs[0] &= limits->xcr0_allowed;
			regs[2] = limits->xsave_max_size;
			regs[3] &= (limits->xcr0_allowed >> 32);
			break;
		case 1:
			/* Only permit XSAVEOPT. */
			regs[0] &= CPUID_EXTSTATE_XSAVEOPT;
			regs[1] = 0;
			regs[2] = 0;
			regs[3] = 0;
			break;
		default:
			/*
			 * If the leaf is for a permitted feature,
			 * pass through as-is, otherwise return
			 * all zeroes.
			 */
			if (!(limits->xcr0_allowed & (1ul << param))) {
				regs[0] = 0;
				regs[1] = 0;
				regs[2] = 0;
				regs[3] = 0;
			}
			break;
		}
		break;

	case CPUID_0000_000F:
	case CPUID_0000_0010:
		/*
		 * Do not report any Resource Director Technology
		 * capabilities.  Exposing control of cache or memory
		 * controller resource partitioning to the guest is not
		 * at all sensible.
		 *
		 * This is already hidden at a high level by masking of
		 * leaf 0x7.  Even still, a guest may look here for
		 * detailed capability information.
		 */
		regs[0] = 0;
		regs[1] = 0;
		regs[2] = 0;
		regs[3] = 0;
		break;

	case CPUID_0000_0015:
		/*
		 * Don't report CPU TSC/Crystal ratio and clock
		 * values since guests may use these to derive the
		 * local APIC frequency.
		 */
		regs[0] = 0;
		regs[1] = 0;
		regs[2] = 0;
		regs[3] = 0;
		break;

	case 0x40000000:
		regs[0] = CPUID_VM_HIGH;
		bcopy(bhyve_id, &regs[1], 4);
		bcopy(bhyve_id + 4, &regs[2], 4);
		bcopy(bhyve_id + 8, &regs[3], 4);
		break;

	default:
default_leaf:
		/*
		 * The leaf value has already been clamped so
		 * simply pass this through.
		 */
		cpuid_count(func, param, regs);
		break;
	}

	*eax = regs[0];
	*ebx = regs[1];
	*ecx = regs[2];
	*edx = regs[3];
}