/*-
 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
 *
 * Copyright (c) 2011 NetApp, Inc.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * $FreeBSD$
 */
/*
 * This file and its contents are supplied under the terms of the
 * Common Development and Distribution License ("CDDL"), version 1.0.
 * You may only use this file in accordance with the terms of version
 * 1.0 of the CDDL.
 *
 * A full copy of the text of the CDDL should have accompanied this
 * source.  A copy of the CDDL is also available via the Internet at
 * http://www.illumos.org/license/CDDL.
 *
 * Copyright 2014 Pluribus Networks Inc.
 * Copyright 2018 Joyent, Inc.
 * Copyright 2022 Oxide Computer Company
 */

#include <sys/types.h>
#include <sys/stdbool.h>
#include <sys/errno.h>

#include <machine/md_var.h>
#include <machine/specialreg.h>

#include <machine/vmm.h>
#include <sys/vmm_kernel.h>

#include "vmm_host.h"
#include "vmm_util.h"

/*
 * CPUID Emulation
 *
 * All CPUID instruction exits are handled by the in-kernel emulation.
 *
 * ----------------
 * Legacy Emulation
 * ----------------
 *
 * Originally, the kernel vmm portion of bhyve relied on fixed logic to filter
 * and/or generate CPUID results based on what was reported by the host CPU, as
 * well as attributes of the VM (such as CPU topology, and enabled features).
 * This is largely adequate to expose CPU capabilities to the guest in a manner
 * which allows it to operate properly.
 *
 * ------------------------------
 * Userspace-Controlled Emulation
 * ------------------------------
 *
 * In certain situations, more control over the CPUID emulation results
 * presented to the guest is desired.  Live migration between physical hosts is
 * one such example, where the underlying CPUs, or at least their microcode,
 * may differ between the source and destination.  In such cases, where changes
 * to the CPUID results cannot be tolerated, the userspace portion of the VMM
 * can be in complete control over the leaves which are presented to the guest.
 * It may still consult the "legacy" CPUID data for guidance about which CPU
 * features are safe to expose (due to hypervisor limitations, etc).  This leaf
 * information is configured on a per-vCPU basis.
 *
 * The emulation entries provided by userspace are expected to be in sorted
 * order, running from lowest function and index to highest.
 *
 * For example:
 *	(func: 00h idx: 00h) ->
 *		(flags: 0, eax: highest std leaf, ebx-edx: vendor id)
 *	(func: 0Dh idx: 00h) ->
 *		(flags: VCE_FLAG_MATCH_INDEX, eax - edx: XCR0/XSAVE info)
 *	(func: 0Dh idx: 01h) ->
 *		(flags: VCE_FLAG_MATCH_INDEX, eax - edx: XSAVE/XSAVEOPT details)
 *		...
 *	(func: 0Dh idx: 07h) ->
 *		(flags: VCE_FLAG_MATCH_INDEX, eax - edx: AVX-512 details)
 *	(func: 80000000h idx: 0h) ->
 *		(flags: 0, eax: highest extd leaf ...)
 *	...
 */
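
/*
 * For illustration only: a userspace-supplied table matching the ordering
 * described above might look roughly like the following.  This is a
 * hypothetical sketch (not compiled into this file); the register values are
 * whatever the userspace VMM chooses to present, with a "GenuineIntel" vendor
 * string and a highest standard leaf of 0xD used here as example data:
 *
 *	struct vcpu_cpuid_entry entries[] = {
 *		{ .vce_function = 0x0, .vce_eax = 0xd,
 *		    .vce_ebx = 0x756e6547, .vce_edx = 0x49656e69,
 *		    .vce_ecx = 0x6c65746e },
 *		{ .vce_function = 0xd, .vce_index = 0x0,
 *		    .vce_flags = VCE_FLAG_MATCH_INDEX, ... },
 *		{ .vce_function = 0x80000000, .vce_eax = 0x80000008, ... },
 *	};
 *
 * Such a table is installed through vm_set_cpuid() (below) with
 * VCC_FLAG_LEGACY_HANDLING clear in vcc_flags.
 */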

#define	CPUID_TYPE_MASK		0xf0000000
#define	CPUID_TYPE_STD		0x00000000
#define	CPUID_TYPE_EXTD		0x80000000

static const struct vcpu_cpuid_entry cpuid_empty_entry = { 0 };

/*
 * Given the CPUID configuration for a vCPU, locate the entry which matches the
 * provided function/index tuple.  The entries list is walked in order, and the
 * first valid match based on the function/index and flags will be emitted.
 *
 * If no match is found, but Intel-style fallback is configured, then the
 * highest standard leaf encountered will be emitted.
 */
static const struct vcpu_cpuid_entry *
cpuid_find_entry(const vcpu_cpuid_config_t *cfg, uint32_t func, uint32_t idx)
{
	const struct vcpu_cpuid_entry *last_std = NULL;
	const bool intel_fallback =
	    (cfg->vcc_flags & VCC_FLAG_INTEL_FALLBACK) != 0;
	bool matched_leaf = false;

	ASSERT0(cfg->vcc_flags & VCC_FLAG_LEGACY_HANDLING);

	for (uint_t i = 0; i < cfg->vcc_nent; i++) {
		const struct vcpu_cpuid_entry *ent = &cfg->vcc_entries[i];
		const bool ent_is_std =
		    (ent->vce_function & CPUID_TYPE_MASK) == CPUID_TYPE_STD;
		const bool ent_must_match_idx =
		    (ent->vce_flags & VCE_FLAG_MATCH_INDEX) != 0;

		if (ent_is_std) {
			/*
			 * Keep track of the last "standard" leaf for
			 * Intel-style fallback behavior.
			 *
			 * This does not currently account for the sub-leaf
			 * index matching behavior for fallback described in
			 * the SDM.  It is not clear if any consumers rely on
			 * such matching when encountering fallback.
			 */
			last_std = ent;
		}
		if (ent->vce_function == func) {
			if (ent->vce_index == idx || !ent_must_match_idx) {
				return (ent);
			}
			/*
			 * Make note of when the top-level leaf matches, even
			 * when the index does not.
			 */
			matched_leaf = true;
		} else if (ent->vce_function > func) {
			if ((ent->vce_function & CPUID_TYPE_MASK) ==
			    (func & CPUID_TYPE_MASK)) {
				/*
				 * We are beyond a valid leaf to match, but have
				 * not exceeded the maximum leaf for this "type"
				 * (standard, extended, hvm, etc), so return an
				 * empty entry.
				 */
				return (&cpuid_empty_entry);
			} else {
				/*
				 * Otherwise, we can stop now, having gone
				 * beyond the last entry which could match the
				 * target function in a sorted list.
				 */
				break;
			}
		}
	}

	if (matched_leaf || !intel_fallback) {
		return (&cpuid_empty_entry);
	} else {
		return (last_std);
	}
}
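
/*
 * A brief worked example of the lookup above, using the example leaf layout
 * from the file header comment: a guest CPUID of func 0xD, idx 0x9 walks past
 * the 0xD sub-leaf entries (the leaf matches but no index does, so
 * matched_leaf is set), stops upon reaching function 80000000h, and returns
 * the empty entry.  A request for an unlisted standard leaf such as 0x16
 * instead falls off the end of the standard entries without matching and,
 * with VCC_FLAG_INTEL_FALLBACK set, returns the last standard entry seen.
 */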

void
vcpu_emulate_cpuid(struct vm *vm, int vcpuid, uint64_t *rax, uint64_t *rbx,
    uint64_t *rcx, uint64_t *rdx)
{
	const vcpu_cpuid_config_t *cfg = vm_cpuid_config(vm, vcpuid);

	ASSERT3P(rax, !=, NULL);
	ASSERT3P(rbx, !=, NULL);
	ASSERT3P(rcx, !=, NULL);
	ASSERT3P(rdx, !=, NULL);

	/* Fall back to legacy handling if specified */
	if ((cfg->vcc_flags & VCC_FLAG_LEGACY_HANDLING) != 0) {
		uint32_t regs[4] = { *rax, 0, *rcx, 0 };

		legacy_emulate_cpuid(vm, vcpuid, &regs[0], &regs[1], &regs[2],
		    &regs[3]);
		/* CPUID clears the upper 32-bits of the long-mode registers. */
		*rax = regs[0];
		*rbx = regs[1];
		*rcx = regs[2];
		*rdx = regs[3];
		return;
	}

	const struct vcpu_cpuid_entry *ent = cpuid_find_entry(cfg, *rax, *rcx);
	ASSERT(ent != NULL);
	/* CPUID clears the upper 32-bits of the long-mode registers. */
	*rax = ent->vce_eax;
	*rbx = ent->vce_ebx;
	*rcx = ent->vce_ecx;
	*rdx = ent->vce_edx;
}

/*
 * Get the current CPUID emulation configuration for this vCPU.
 *
 * Only the existing flags will be emitted if the vCPU is configured for legacy
 * operation via the VCC_FLAG_LEGACY_HANDLING flag.  If in userspace-controlled
 * mode, then we will attempt to copy the existing entries into vcc_entries,
 * its size specified by vcc_nent.
 *
 * Regardless of whether vcc_entries is adequately sized (or even present),
 * vcc_nent will be set to the number of existing entries.
 */
int
vm_get_cpuid(struct vm *vm, int vcpuid, vcpu_cpuid_config_t *res)
{
	if (vcpuid < 0 || vcpuid > VM_MAXCPU) {
		return (EINVAL);
	}

	const vcpu_cpuid_config_t *src = vm_cpuid_config(vm, vcpuid);
	if (src->vcc_nent > res->vcc_nent) {
		res->vcc_nent = src->vcc_nent;
		return (E2BIG);
	} else if (src->vcc_nent != 0) {
		bcopy(src->vcc_entries, res->vcc_entries,
		    src->vcc_nent * sizeof (struct vcpu_cpuid_entry));
	}
	res->vcc_flags = src->vcc_flags;
	res->vcc_nent = src->vcc_nent;
	return (0);
}
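
/*
 * A hypothetical sketch (not part of this file) of how a caller is expected to
 * drive the sizing protocol above: call once with its current buffer, and if
 * E2BIG comes back, allocate for the vcc_nent that was written out and retry.
 *
 *	vcpu_cpuid_config_t cfg = {
 *		.vcc_nent = 0,
 *		.vcc_entries = NULL,
 *	};
 *	if (vm_get_cpuid(vm, vcpuid, &cfg) == E2BIG) {
 *		cfg.vcc_entries = kmem_alloc(cfg.vcc_nent *
 *		    sizeof (struct vcpu_cpuid_entry), KM_SLEEP);
 *		VERIFY0(vm_get_cpuid(vm, vcpuid, &cfg));
 *	}
 */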

/*
 * Set the CPUID emulation configuration for this vCPU.
 *
 * If VCC_FLAG_LEGACY_HANDLING is set in vcc_flags, then vcc_nent is expected
 * to be set to 0, as configuring a list of entries would be useless when using
 * the legacy handling.
 *
 * Any existing entries which are configured are freed, and the newly provided
 * ones will be copied into their place.
 */
int
vm_set_cpuid(struct vm *vm, int vcpuid, const vcpu_cpuid_config_t *src)
{
	if (vcpuid < 0 || vcpuid > VM_MAXCPU) {
		return (EINVAL);
	}
	if (src->vcc_nent > VMM_MAX_CPUID_ENTRIES) {
		return (EINVAL);
	}
	if ((src->vcc_flags & ~VCC_FLAGS_VALID) != 0) {
		return (EINVAL);
	}
	if ((src->vcc_flags & VCC_FLAG_LEGACY_HANDLING) != 0 &&
	    src->vcc_nent != 0) {
		/* No entries should be provided if using legacy handling */
		return (EINVAL);
	}
	for (uint_t i = 0; i < src->vcc_nent; i++) {
		/* Ensure all entries carry valid flags */
		if ((src->vcc_entries[i].vce_flags & ~VCE_FLAGS_VALID) != 0) {
			return (EINVAL);
		}
	}

	vcpu_cpuid_config_t *cfg = vm_cpuid_config(vm, vcpuid);

	/* Free any existing entries first */
	vcpu_cpuid_cleanup(cfg);

	/* Copy supplied entries into freshly allocated space */
	if (src->vcc_nent != 0) {
		const size_t entries_sz =
		    src->vcc_nent * sizeof (struct vcpu_cpuid_entry);

		cfg->vcc_nent = src->vcc_nent;
		cfg->vcc_entries = kmem_alloc(entries_sz, KM_SLEEP);
		bcopy(src->vcc_entries, cfg->vcc_entries, entries_sz);
	}
	cfg->vcc_flags = src->vcc_flags;

	return (0);
}

void
vcpu_cpuid_init(vcpu_cpuid_config_t *cfg)
{
	/* Default to legacy-style handling */
	cfg->vcc_flags = VCC_FLAG_LEGACY_HANDLING;
	cfg->vcc_nent = 0;
	cfg->vcc_entries = NULL;
}

void
vcpu_cpuid_cleanup(vcpu_cpuid_config_t *cfg)
{
	if (cfg->vcc_nent != 0) {
		ASSERT3P(cfg->vcc_entries, !=, NULL);

		kmem_free(cfg->vcc_entries,
		    cfg->vcc_nent * sizeof (struct vcpu_cpuid_entry));

		cfg->vcc_nent = 0;
		cfg->vcc_entries = NULL;
	}
}

static const char bhyve_id[12] = "bhyve bhyve ";

/*
 * Force exposition of the invariant TSC capability, regardless of whether the
 * host CPU reports having it.
 */
static int vmm_force_invariant_tsc = 0;

#define	CPUID_0000_0000		(0x0)
#define	CPUID_0000_0001		(0x1)
#define	CPUID_0000_0002		(0x2)
#define	CPUID_0000_0003		(0x3)
#define	CPUID_0000_0004		(0x4)
#define	CPUID_0000_0006		(0x6)
#define	CPUID_0000_0007		(0x7)
#define	CPUID_0000_000A		(0xA)
#define	CPUID_0000_000B		(0xB)
#define	CPUID_0000_000D		(0xD)
#define	CPUID_0000_000F		(0xF)
#define	CPUID_0000_0010		(0x10)
#define	CPUID_0000_0015		(0x15)
#define	CPUID_8000_0000		(0x80000000)
#define	CPUID_8000_0001		(0x80000001)
#define	CPUID_8000_0002		(0x80000002)
#define	CPUID_8000_0003		(0x80000003)
#define	CPUID_8000_0004		(0x80000004)
#define	CPUID_8000_0006		(0x80000006)
#define	CPUID_8000_0007		(0x80000007)
#define	CPUID_8000_0008		(0x80000008)
#define	CPUID_8000_001D		(0x8000001D)
#define	CPUID_8000_001E		(0x8000001E)

#define	CPUID_VM_HIGH		0x40000000

/*
 * CPUID instruction Fn0000_0001:
 */
#define	CPUID_0000_0001_APICID_SHIFT	24

/*
 * Round up to the next power of two, if necessary, and then take log2.
 * Returns -1 if argument is zero.
 */
static __inline int
log2(uint_t x)
{
	return (fls(x << (1 - powerof2(x))) - 1);
}
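
/*
 * A few example values, which follow from the expression above: log2(1) == 0,
 * log2(4) == 2, log2(6) == 3 (6 rounds up to 8), and log2(0) == -1.  These are
 * used below when packing CPU-topology bit-widths into CPUID leaves.
 */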

/*
 * The "legacy" bhyve cpuid emulation, which largely applies statically defined
 * masks to the data provided by the host CPU.
 */
void
legacy_emulate_cpuid(struct vm *vm, int vcpu_id, uint32_t *eax, uint32_t *ebx,
    uint32_t *ecx, uint32_t *edx)
{
	const struct xsave_limits *limits;
	uint64_t cr4;
	int error, enable_invpcid, level, width = 0, x2apic_id = 0;
	unsigned int func, regs[4], logical_cpus = 0, param;
	enum x2apic_state x2apic_state;
	uint16_t cores, maxcpus, sockets, threads;

	/*
	 * The function of CPUID is controlled through the provided value of
	 * %eax (and secondarily %ecx, for certain leaf data).
	 */
	func = (uint32_t)*eax;
	param = (uint32_t)*ecx;

	/*
	 * Requests for invalid CPUID levels should map to the highest
	 * available level instead.
	 */
	if (cpu_exthigh != 0 && func >= 0x80000000) {
		if (func > cpu_exthigh)
			func = cpu_exthigh;
	} else if (func >= 0x40000000) {
		if (func > CPUID_VM_HIGH)
			func = CPUID_VM_HIGH;
	} else if (func > cpu_high) {
		func = cpu_high;
	}

	/*
	 * In general the approach used for CPU topology is to
	 * advertise a flat topology where all CPUs are packages with
	 * no multi-core or SMT.
	 */
	switch (func) {
	/*
	 * Pass these through to the guest
	 */
	case CPUID_0000_0000:
	case CPUID_0000_0002:
	case CPUID_0000_0003:
	case CPUID_8000_0000:
	case CPUID_8000_0002:
	case CPUID_8000_0003:
	case CPUID_8000_0004:
	case CPUID_8000_0006:
		cpuid_count(func, param, regs);
		break;
	case CPUID_8000_0008:
		cpuid_count(func, param, regs);
		if (vmm_is_svm()) {
			/*
			 * As on Intel (0000_0007:0, EDX), mask out
			 * unsupported or unsafe AMD extended features
			 * (8000_0008 EBX).
			 */
			regs[1] &= (AMDFEID_CLZERO | AMDFEID_IRPERF |
			    AMDFEID_XSAVEERPTR);

			vm_get_topology(vm, &sockets, &cores, &threads,
			    &maxcpus);
			/*
			 * Here, width is ApicIdCoreIdSize, present on
			 * at least Family 15h and newer.  It
			 * represents the "number of bits in the
			 * initial apicid that indicate thread id
			 * within a package."
			 *
			 * Our topo_probe_amd() uses it for
			 * pkg_id_shift and other OSes may rely on it.
			 */
			width = MIN(0xF, log2(threads * cores));
			if (width < 0x4)
				width = 0;
			logical_cpus = MIN(0xFF, threads * cores - 1);
			regs[2] = (width << AMDID_COREID_SIZE_SHIFT) |
			    logical_cpus;
		}
		break;
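
	/*
	 * A worked example of the CPUID_8000_0008 ECX packing above: a guest
	 * topology of 8 cores with 2 threads each yields threads * cores = 16,
	 * so width = log2(16) = 4 and logical_cpus = 15.  Per AMD's layout for
	 * this leaf, width lands in ApicIdCoreIdSize (ECX[15:12]) and
	 * logical_cpus in NC (ECX[7:0]).
	 */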

	case CPUID_8000_0001:
		cpuid_count(func, param, regs);

		/*
		 * Hide SVM from guest.
		 */
		regs[2] &= ~AMDID2_SVM;

		/*
		 * Don't advertise extended performance counter MSRs
		 * to the guest.
		 */
		regs[2] &= ~AMDID2_PCXC;
		regs[2] &= ~AMDID2_PNXC;
		regs[2] &= ~AMDID2_PTSCEL2I;

		/*
		 * Don't advertise Instruction Based Sampling feature.
		 */
		regs[2] &= ~AMDID2_IBS;

		/* NodeID MSR not available */
		regs[2] &= ~AMDID2_NODE_ID;

		/* Don't advertise the OS visible workaround feature */
		regs[2] &= ~AMDID2_OSVW;

		/* Hide mwaitx/monitorx capability from the guest */
		regs[2] &= ~AMDID2_MWAITX;

#ifndef __FreeBSD__
		/*
		 * Detection routines for TCE and FFXSR are missing
		 * from our vm_cpuid_capability() detection logic
		 * today.  Mask them out until that is remedied.
		 * They do not appear to be in common usage, so their
		 * absence should not cause undue trouble.
		 */
		regs[2] &= ~AMDID2_TCE;
		regs[3] &= ~AMDID_FFXSR;
#endif

		/*
		 * Hide rdtscp/ia32_tsc_aux until we know how
		 * to deal with them.
		 */
		regs[3] &= ~AMDID_RDTSCP;
		break;

	case CPUID_8000_0007:
		cpuid_count(func, param, regs);
		/*
		 * AMD uses this leaf to advertise the processor's
		 * power monitoring and RAS capabilities.  These
		 * features are hardware-specific and exposing
		 * them to a guest doesn't make a lot of sense.
		 *
		 * Intel uses this leaf only to advertise the
		 * "Invariant TSC" feature with all other bits
		 * being reserved (set to zero).
		 */
		regs[0] = 0;
		regs[1] = 0;
		regs[2] = 0;

		/*
		 * If the host system possesses an invariant TSC, then
		 * it is safe to expose to the guest.
		 *
		 * If there is measured skew between host TSCs, it will
		 * be properly offset so guests do not observe any
		 * change between CPU migrations.
		 */
		regs[3] &= AMDPM_TSC_INVARIANT;

		/*
		 * Since illumos avoids deep C-states on CPUs which do
		 * not support an invariant TSC, it may be safe (and
		 * desired) to unconditionally expose that capability to
		 * the guest.
		 */
		if (vmm_force_invariant_tsc != 0) {
			regs[3] |= AMDPM_TSC_INVARIANT;
		}
		break;

	case CPUID_8000_001D:
		/* AMD Cache topology, like 0000_0004 for Intel. */
		if (!vmm_is_svm())
			goto default_leaf;

		/*
		 * Similar to Intel, generate a fictitious cache
		 * topology for the guest with L3 shared by the
		 * package, and L1 and L2 local to a core.
		 */
		vm_get_topology(vm, &sockets, &cores, &threads,
		    &maxcpus);
		switch (param) {
		case 0:
			logical_cpus = threads;
			level = 1;
			func = 1;	/* data cache */
			break;
		case 1:
			logical_cpus = threads;
			level = 2;
			func = 3;	/* unified cache */
			break;
		case 2:
			logical_cpus = threads * cores;
			level = 3;
			func = 3;	/* unified cache */
			break;
		default:
			logical_cpus = 0;
			level = 0;
			func = 0;
			break;
		}

		logical_cpus = MIN(0xfff, logical_cpus - 1);
		regs[0] = (logical_cpus << 14) | (1 << 8) |
		    (level << 5) | func;
		regs[1] = (func > 0) ? (CACHE_LINE_SIZE - 1) : 0;
		regs[2] = 0;
		regs[3] = 0;
		break;

	case CPUID_8000_001E:
		/*
		 * AMD Family 16h+ and Hygon Family 18h additional
		 * identifiers.
		 */
		if (!vmm_is_svm() || CPUID_TO_FAMILY(cpu_id) < 0x16)
			goto default_leaf;

		vm_get_topology(vm, &sockets, &cores, &threads,
		    &maxcpus);
		regs[0] = vcpu_id;
		threads = MIN(0xFF, threads - 1);
		regs[1] = (threads << 8) |
		    (vcpu_id >> log2(threads + 1));
		/*
		 * XXX Bhyve topology cannot yet represent >1 node per
		 * processor.
		 */
		regs[2] = 0;
		regs[3] = 0;
		break;
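
	/*
	 * A worked example of the CPUID_8000_001D encoding above: with a guest
	 * topology of 4 cores and 2 threads per core, sub-leaf 2 (the shared
	 * L3) sets logical_cpus = 4 * 2 - 1 = 7, level = 3, and func = 3, so
	 * EAX = (7 << 14) | (1 << 8) | (3 << 5) | 3.
	 */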

	case CPUID_0000_0001:
		do_cpuid(1, regs);

		error = vm_get_x2apic_state(vm, vcpu_id, &x2apic_state);
		VERIFY0(error);

		/*
		 * Override the APIC ID only in ebx
		 */
		regs[1] &= ~(CPUID_LOCAL_APIC_ID);
		regs[1] |= (vcpu_id << CPUID_0000_0001_APICID_SHIFT);

		/*
		 * Don't expose VMX, SpeedStep, TME or SMX capability.
		 * Advertise x2APIC capability and Hypervisor guest.
		 */
		regs[2] &= ~(CPUID2_VMX | CPUID2_EST | CPUID2_TM2);
		regs[2] &= ~(CPUID2_SMX);

		regs[2] |= CPUID2_HV;

		if (x2apic_state != X2APIC_DISABLED)
			regs[2] |= CPUID2_X2APIC;
		else
			regs[2] &= ~CPUID2_X2APIC;

		/*
		 * Only advertise CPUID2_XSAVE in the guest if
		 * the host is using XSAVE.
		 */
		if (!(regs[2] & CPUID2_OSXSAVE))
			regs[2] &= ~CPUID2_XSAVE;

		/*
		 * If CPUID2_XSAVE is being advertised and the
		 * guest has set CR4_XSAVE, set
		 * CPUID2_OSXSAVE.
		 */
		regs[2] &= ~CPUID2_OSXSAVE;
		if (regs[2] & CPUID2_XSAVE) {
			error = vm_get_register(vm, vcpu_id,
			    VM_REG_GUEST_CR4, &cr4);
			VERIFY0(error);
			if (cr4 & CR4_XSAVE)
				regs[2] |= CPUID2_OSXSAVE;
		}

		/*
		 * Hide monitor/mwait until we know how to deal with
		 * these instructions.
		 */
		regs[2] &= ~CPUID2_MON;

		/*
		 * Hide the performance and debug features.
		 */
		regs[2] &= ~CPUID2_PDCM;

		/*
		 * No TSC deadline support in the APIC yet
		 */
		regs[2] &= ~CPUID2_TSCDLT;

		/*
		 * Hide thermal monitoring
		 */
		regs[3] &= ~(CPUID_ACPI | CPUID_TM);

		/*
		 * Hide the debug store capability.
		 */
		regs[3] &= ~CPUID_DS;

		/*
		 * Advertise the Machine Check and MTRR capability.
		 *
		 * Some guest OSes (e.g. Windows) will not boot if
		 * these features are absent.
		 */
		regs[3] |= (CPUID_MCA | CPUID_MCE | CPUID_MTRR);

		vm_get_topology(vm, &sockets, &cores, &threads,
		    &maxcpus);
		logical_cpus = threads * cores;
		regs[1] &= ~CPUID_HTT_CORES;
		regs[1] |= (logical_cpus & 0xff) << 16;
		regs[3] |= CPUID_HTT;
		break;

	case CPUID_0000_0004:
		cpuid_count(func, param, regs);

		if (regs[0] || regs[1] || regs[2] || regs[3]) {
			vm_get_topology(vm, &sockets, &cores, &threads,
			    &maxcpus);
			regs[0] &= 0x3ff;
			regs[0] |= (cores - 1) << 26;
			/*
			 * Cache topology:
			 * - L1 and L2 are shared only by the logical
			 *   processors in a single core.
			 * - L3 and above are shared by all logical
			 *   processors in the package.
			 */
			logical_cpus = threads;
			level = (regs[0] >> 5) & 0x7;
			if (level >= 3)
				logical_cpus *= cores;
			regs[0] |= (logical_cpus - 1) << 14;
		}
		break;

	case CPUID_0000_0007:
		regs[0] = 0;
		regs[1] = 0;
		regs[2] = 0;
		regs[3] = 0;

		/* leaf 0 */
		if (param == 0) {
			cpuid_count(func, param, regs);

			/* Only leaf 0 is supported */
			regs[0] = 0;

			/*
			 * Expose known-safe features.
			 */
			regs[1] &= (CPUID_STDEXT_FSGSBASE |
			    CPUID_STDEXT_BMI1 | CPUID_STDEXT_HLE |
			    CPUID_STDEXT_AVX2 | CPUID_STDEXT_SMEP |
			    CPUID_STDEXT_BMI2 |
			    CPUID_STDEXT_ERMS | CPUID_STDEXT_RTM |
			    CPUID_STDEXT_AVX512F |
			    CPUID_STDEXT_RDSEED |
			    CPUID_STDEXT_SMAP |
			    CPUID_STDEXT_AVX512PF |
			    CPUID_STDEXT_AVX512ER |
			    CPUID_STDEXT_AVX512CD | CPUID_STDEXT_SHA);
			regs[2] = 0;
			regs[3] &= CPUID_STDEXT3_MD_CLEAR;

			/* Advertise INVPCID if it is enabled. */
			error = vm_get_capability(vm, vcpu_id,
			    VM_CAP_ENABLE_INVPCID, &enable_invpcid);
			if (error == 0 && enable_invpcid)
				regs[1] |= CPUID_STDEXT_INVPCID;
		}
		break;

	case CPUID_0000_0006:
		regs[0] = CPUTPM1_ARAT;
		regs[1] = 0;
		regs[2] = 0;
		regs[3] = 0;
		break;

	case CPUID_0000_000A:
		/*
		 * Handle the access, but report 0 for
		 * all options
		 */
		regs[0] = 0;
		regs[1] = 0;
		regs[2] = 0;
		regs[3] = 0;
		break;

	case CPUID_0000_000B:
		/*
		 * Intel processor topology enumeration
		 */
		if (vmm_is_intel()) {
			vm_get_topology(vm, &sockets, &cores, &threads,
			    &maxcpus);
			if (param == 0) {
				logical_cpus = threads;
				width = log2(logical_cpus);
				level = CPUID_TYPE_SMT;
				x2apic_id = vcpu_id;
			}

			if (param == 1) {
				logical_cpus = threads * cores;
				width = log2(logical_cpus);
				level = CPUID_TYPE_CORE;
				x2apic_id = vcpu_id;
			}

			if (param >= 2) {
				width = 0;
				logical_cpus = 0;
				level = 0;
				x2apic_id = 0;
			}

			regs[0] = width & 0x1f;
			regs[1] = logical_cpus & 0xffff;
			regs[2] = (level << 8) | (param & 0xff);
			regs[3] = x2apic_id;
		} else {
			regs[0] = 0;
			regs[1] = 0;
			regs[2] = 0;
			regs[3] = 0;
		}
		break;

	case CPUID_0000_000D:
		limits = vmm_get_xsave_limits();
		if (!limits->xsave_enabled) {
			regs[0] = 0;
			regs[1] = 0;
			regs[2] = 0;
			regs[3] = 0;
			break;
		}

		cpuid_count(func, param, regs);
		switch (param) {
		case 0:
			/*
			 * Only permit the guest to use bits
			 * that are active in the host in
			 * %xcr0.  Also, claim that the
			 * maximum save area size is
			 * equivalent to the host's current
			 * save area size.  Since this runs
			 * "inside" of vmrun(), it runs with
			 * the guest's xcr0, so the current
			 * save area size is correct as-is.
			 */
			regs[0] &= limits->xcr0_allowed;
			regs[2] = limits->xsave_max_size;
			regs[3] &= (limits->xcr0_allowed >> 32);
			break;
		case 1:
			/* Only permit XSAVEOPT. */
			regs[0] &= CPUID_EXTSTATE_XSAVEOPT;
			regs[1] = 0;
			regs[2] = 0;
			regs[3] = 0;
			break;
		default:
			/*
			 * If the leaf is for a permitted feature,
			 * pass through as-is, otherwise return
			 * all zeroes.
			 */
			if (!(limits->xcr0_allowed & (1ul << param))) {
				regs[0] = 0;
				regs[1] = 0;
				regs[2] = 0;
				regs[3] = 0;
			}
			break;
		}
		break;

	case CPUID_0000_000F:
	case CPUID_0000_0010:
		/*
		 * Do not report any Resource Director Technology
		 * capabilities.  Exposing control of cache or memory
		 * controller resource partitioning to the guest is not
		 * at all sensible.
		 *
		 * This is already hidden at a high level by masking of
		 * leaf 0x7.  Even still, a guest may look here for
		 * detailed capability information.
		 */
		regs[0] = 0;
		regs[1] = 0;
		regs[2] = 0;
		regs[3] = 0;
		break;

	case CPUID_0000_0015:
		/*
		 * Don't report CPU TSC/Crystal ratio and clock
		 * values since guests may use these to derive the
		 * local APIC frequency.
		 */
		regs[0] = 0;
		regs[1] = 0;
		regs[2] = 0;
		regs[3] = 0;
		break;

	case 0x40000000:
		regs[0] = CPUID_VM_HIGH;
		bcopy(bhyve_id, &regs[1], 4);
		bcopy(bhyve_id + 4, &regs[2], 4);
		bcopy(bhyve_id + 8, &regs[3], 4);
		break;
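
	/*
	 * The bcopy() calls above place the 12-byte hypervisor ID string
	 * across EBX, ECX, and EDX in order.  On a little-endian host,
	 * "bhyve bhyve " is therefore seen by the guest as EBX = 0x76796862,
	 * ECX = 0x68622065, EDX = 0x20657679.
	 */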

	default:
default_leaf:
		/*
		 * The leaf value has already been clamped so
		 * simply pass this through.
		 */
		cpuid_count(func, param, regs);
		break;
	}

	*eax = regs[0];
	*ebx = regs[1];
	*ecx = regs[2];
	*edx = regs[3];
}
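
/*
 * For context, a rough sketch (hypothetical; not the actual exit-handler code)
 * of how a VM-exit handler is expected to use vcpu_emulate_cpuid(): load the
 * guest's %rax/%rbx/%rcx/%rdx, let the emulation overwrite them, and then
 * advance %rip past the 2-byte CPUID instruction.
 *
 *	uint64_t rax = ctx->guest_rax, rbx = ctx->guest_rbx;
 *	uint64_t rcx = ctx->guest_rcx, rdx = ctx->guest_rdx;
 *	vcpu_emulate_cpuid(vm, vcpuid, &rax, &rbx, &rcx, &rdx);
 *	ctx->guest_rax = rax;
 *	...
 *	ctx->guest_rip += 2;
 */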