/*-
 * SPDX-License-Identifier: BSD-2-Clause
 *
 * Copyright (c) 2011 NetApp, Inc.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */
/*
 * This file and its contents are supplied under the terms of the
 * Common Development and Distribution License ("CDDL"), version 1.0.
 * You may only use this file in accordance with the terms of version
 * 1.0 of the CDDL.
 *
 * A full copy of the text of the CDDL should have accompanied this
 * source. A copy of the CDDL is also available via the Internet at
 * http://www.illumos.org/license/CDDL.
 */
/* This file is dual-licensed; see usr/src/contrib/bhyve/LICENSE */

/*
 * Copyright 2014 Pluribus Networks Inc.
 * Copyright 2018 Joyent, Inc.
 * Copyright 2025 Oxide Computer Company
 */

#include <sys/types.h>
#include <sys/stdbool.h>
#include <sys/errno.h>

#include <machine/md_var.h>
#include <machine/specialreg.h>

#include <machine/vmm.h>
#include <sys/vmm_kernel.h>

#include "vmm_host.h"
#include "vmm_util.h"
#include "vlapic.h"
/*
 * CPUID Emulation
 *
 * All CPUID instruction exits are handled by the in-kernel emulation.
 *
 * ----------------
 * Legacy Emulation
 * ----------------
 *
 * Originally, the kernel vmm portion of bhyve relied on fixed logic to filter
 * and/or generate CPUID results based on what was reported by the host CPU, as
 * well as attributes of the VM (such as CPU topology and enabled features).
 * This is largely adequate to expose CPU capabilities to the guest in a manner
 * which allows it to operate properly.
 *
 * ------------------------------
 * Userspace-Controlled Emulation
 * ------------------------------
 *
 * In certain situations, more control over the CPUID emulation results
 * presented to the guest is desired. Live migration between physical hosts is
 * one such example, where the underlying CPUs, or at least their microcode,
 * may differ between the source and destination. In such cases, where changes
 * to the CPUID results cannot be tolerated, the userspace portion of the VMM
 * can be in complete control over the leaves which are presented to the guest.
 * It may still consult the "legacy" CPUID data for guidance about which CPU
 * features are safe to expose (due to hypervisor limitations, etc). This leaf
 * information is configured on a per-vCPU basis.
 *
 * The emulation entries provided by userspace are expected to be in sorted
 * order, running from lowest function and index to highest.
 *
 * For example:
 *        (func: 00h idx: 00h) ->
 *                (flags: 0, eax: highest std leaf, ebx-edx: vendor id)
 *        (func: 0Dh idx: 00h) ->
 *                (flags: VCE_FLAG_MATCH_INDEX, eax - edx: XCR0/XSAVE info)
 *        (func: 0Dh idx: 01h) ->
 *                (flags: VCE_FLAG_MATCH_INDEX, eax - edx: XSAVE/XSAVEOPT details)
 *        ...
 *        (func: 0Dh idx: 07h) ->
 *                (flags: VCE_FLAG_MATCH_INDEX, eax - edx: AVX-512 details)
 *        (func: 80000000h idx: 0h) ->
 *                (flags: 0, eax: highest extd leaf ...)
 *        ...
 */
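
/*
 * As a purely illustrative sketch (the entry values below are placeholders; a
 * real consumer derives them from host CPUID data and its own policy), an
 * in-kernel caller acting on a userspace request might assemble such a
 * configuration and hand it to vm_set_cpuid() roughly like this:
 *
 *        struct vcpu_cpuid_entry entries[] = {
 *                { .vce_function = 0x0, .vce_eax = 0xd },
 *                { .vce_function = 0xd, .vce_index = 0x0,
 *                    .vce_flags = VCE_FLAG_MATCH_INDEX },
 *                { .vce_function = 0xd, .vce_index = 0x1,
 *                    .vce_flags = VCE_FLAG_MATCH_INDEX },
 *                { .vce_function = 0x80000000, .vce_eax = 0x80000008 },
 *        };
 *        vcpu_cpuid_config_t cfg = {
 *                .vcc_flags = VCC_FLAG_INTEL_FALLBACK,
 *                .vcc_nent = 4,
 *                .vcc_entries = entries,
 *        };
 *        (void) vm_set_cpuid(vm, vcpuid, &cfg);
 *
 * Note that the entries are already in the required sorted order, and that
 * leaves with meaningful sub-leaves (such as 0Dh) carry VCE_FLAG_MATCH_INDEX.
 */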


#define CPUID_TYPE_MASK         0xf0000000
#define CPUID_TYPE_STD          0x00000000
#define CPUID_TYPE_EXTD         0x80000000

#define CPUID_0000_0000         (0x0)
#define CPUID_0000_0001         (0x1)
#define CPUID_0000_0002         (0x2)
#define CPUID_0000_0003         (0x3)
#define CPUID_0000_0004         (0x4)
#define CPUID_0000_0006         (0x6)
#define CPUID_0000_0007         (0x7)
#define CPUID_0000_000A         (0xA)
#define CPUID_0000_000B         (0xB)
#define CPUID_0000_000D         (0xD)
#define CPUID_0000_000F         (0xF)
#define CPUID_0000_0010         (0x10)
#define CPUID_0000_0015         (0x15)
#define CPUID_8000_0000         (0x80000000)
#define CPUID_8000_0001         (0x80000001)
#define CPUID_8000_0002         (0x80000002)
#define CPUID_8000_0003         (0x80000003)
#define CPUID_8000_0004         (0x80000004)
#define CPUID_8000_0006         (0x80000006)
#define CPUID_8000_0007         (0x80000007)
#define CPUID_8000_0008         (0x80000008)
#define CPUID_8000_001D         (0x8000001D)
#define CPUID_8000_001E         (0x8000001E)

#define CPUID_VM_HIGH           0x40000000

static const struct vcpu_cpuid_entry cpuid_empty_entry = { 0 };

/*
 * Given the CPUID configuration for a vCPU, locate the entry which matches the
 * provided function/index tuple. The entries list is walked in order, and the
 * first valid match based on the function/index and flags will be emitted.
 *
 * If no match is found, but Intel-style fallback is configured, then the
 * highest standard leaf encountered will be emitted.
 */
static const struct vcpu_cpuid_entry *
cpuid_find_entry(const vcpu_cpuid_config_t *cfg, uint32_t func, uint32_t idx)
{
        const struct vcpu_cpuid_entry *last_std = NULL;
        const bool intel_fallback =
            (cfg->vcc_flags & VCC_FLAG_INTEL_FALLBACK) != 0;
        bool matched_leaf = false;

        ASSERT0(cfg->vcc_flags & VCC_FLAG_LEGACY_HANDLING);

        for (uint_t i = 0; i < cfg->vcc_nent; i++) {
                const struct vcpu_cpuid_entry *ent = &cfg->vcc_entries[i];
                const bool ent_is_std =
                    (ent->vce_function & CPUID_TYPE_MASK) == CPUID_TYPE_STD;
                const bool ent_must_match_idx =
                    (ent->vce_flags & VCE_FLAG_MATCH_INDEX) != 0;

                if (ent_is_std) {
                        /*
                         * Keep track of the last "standard" leaf for
                         * Intel-style fallback behavior.
                         *
                         * This currently does not account for the sub-leaf
                         * index matching behavior for fallback described in
                         * the SDM. It is not clear if any consumers rely on
                         * such matching when encountering fallback.
                         */
                        last_std = ent;
                }
                if (ent->vce_function == func) {
                        if (ent->vce_index == idx || !ent_must_match_idx) {
                                return (ent);
                        }
                        /*
                         * Make note of when the top-level leaf matches, even
                         * when the index does not.
                         */
                        matched_leaf = true;
                } else if (ent->vce_function > func) {
                        if ((ent->vce_function & CPUID_TYPE_MASK) ==
                            (func & CPUID_TYPE_MASK)) {
                                /*
                                 * We are beyond a valid leaf to match, but have
                                 * not exceeded the maximum leaf for this "type"
                                 * (standard, extended, hvm, etc), so return an
                                 * empty entry.
                                 */
                                return (&cpuid_empty_entry);
                        } else {
                                /*
                                 * Otherwise, we can stop now, having gone
                                 * beyond the last entry which could match the
                                 * target function in a sorted list.
                                 */
                                break;
                        }
                }
        }

        if (matched_leaf || !intel_fallback) {
                return (&cpuid_empty_entry);
        } else {
                return (last_std);
        }
}
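
/*
 * To make the matching rules above concrete, consider a hypothetical
 * configuration laid out like the example in the file header comment
 * (leaves 0h, 0Dh/00h-07h with VCE_FLAG_MATCH_INDEX, and 80000000h), with
 * VCC_FLAG_INTEL_FALLBACK set:
 *
 *        - (func 0Dh, idx 01h) matches the (0Dh, 01h) entry exactly.
 *        - (func 0Dh, idx 09h) matches the leaf but no index, so the all-zero
 *          empty entry is emitted.
 *        - (func 05h, idx 00h) is below the highest configured standard leaf
 *          but has no entry of its own, so the empty entry is emitted.
 *        - (func 12h, idx 00h) is beyond the last standard entry, so the
 *          Intel-style fallback emits the highest standard entry, (0Dh, 07h).
 */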

/*
 * Updates a previously-populated set of CPUID return values to account for the
 * runtime state of the executing vCPU, i.e., the values in its control
 * registers and MSRs that influence the values returned by the CPUID
 * instruction.
 *
 * This function does not account for "static" properties of the vCPU or VM,
 * such as the enablement of VM-wide features and capabilities (like x2APIC or
 * INVPCID support) or settings that vary only with the vCPU's ID (like the
 * values returned from its topology leaves).
 *
 * This function assumes that it is called from within VMRUN(), which guarantees
 * that the guest's FPU state is loaded. This is required to obtain the correct
 * values for leaves whose values depend on the guest values of %xcr0 and the
 * IA32_XSS MSR.
 */
static void
cpuid_apply_runtime_reg_state(struct vm *vm, int vcpuid, uint32_t func,
    uint32_t index, uint32_t *eax, uint32_t *ebx, uint32_t *ecx, uint32_t *edx)
{
        uint64_t cr4;
        int error;
        unsigned int regs[4];

        switch (func) {
        case CPUID_0000_0001:
                /*
                 * If CPUID2_XSAVE is being advertised and the
                 * guest has set CR4_XSAVE, set CPUID2_OSXSAVE.
                 */
                *ecx &= ~CPUID2_OSXSAVE;
                if ((*ecx & CPUID2_XSAVE) != 0) {
                        error = vm_get_register(vm, vcpuid,
                            VM_REG_GUEST_CR4, &cr4);
                        VERIFY0(error);
                        if ((cr4 & CR4_XSAVE) != 0) {
                                *ecx |= CPUID2_OSXSAVE;
                        }
                }

                /*
                 * AMD APM vol. 3 rev. 3.36 section E.3.2 notes that this bit is
                 * set only if the "APIC exists and is enabled." Vol. 3 of the
                 * June 2024 Intel SDM notes in section 11.4.3 that "[t]he CPUID
                 * feature flag for the APIC ... is also set to 0" when the APIC
                 * enable bit is cleared.
                 */
                if (vlapic_hw_disabled(vm_lapic(vm, vcpuid))) {
                        *edx &= ~CPUID_APIC;
                }
                break;

        case CPUID_0000_000D:
                /*
                 * Leaf D reports XSAVE area sizes that vary with the current
                 * value of %xcr0. Since this function is called with %xcr0
                 * still set to its guest value, the easiest way to get the
                 * correct output is to execute CPUID on the host and copy out
                 * the relevant values.
                 */
                cpuid_count(func, index, regs);
                switch (index) {
                case 0:
                        /*
                         * %eax, %ecx, and %edx return information about the
                         * complete set of features the processor supports, not
                         * just the ones that are enabled. The caller is
                         * presumed to have set these already, so just update
                         * %ebx.
                         */
                        *ebx = regs[1];
                        break;
                case 1:
                        /*
                         * Subleaf 1 reports the XSAVE area size required for
                         * features enabled in %xcr0 and the IA32_XSS MSR via
                         * %ebx. As with subleaf 0, the caller is presumed to
                         * have set the other three output register values
                         * already.
                         *
                         * AMD APM vol. 3 rev. 3.36 and the June 2024 edition of
                         * volume 2 of the Intel SDM specify slightly different
                         * behavior here: the SDM says that the value returned
                         * in %ebx depends in part on whether %eax advertises
                         * XSAVEC and IA32_XSS support, but the APM does not. To
                         * handle these cases:
                         *
                         * 1. If the guest isn't a VMX guest, just copy the
                         *    current reported save area size.
                         * 2. If both the XSAVEC and XSAVES bits are clear in
                         *    %eax, return a save area size of 0 in %ebx to
                         *    match the SDM description.
                         * 3. Otherwise, copy the host's reported save area
                         *    size.
                         *
                         * Note that, because XSAVES saves a superset of the
                         * state saved by XSAVEC, it's OK to report the host's
                         * save area size even if the host and guest report
                         * different feature bits in %eax:
                         *
                         * - If the host supports XSAVES and the guest doesn't,
                         *   the reported save area size will be too large, but
                         *   the guest can still use XSAVEC safely.
                         * - If the VM's explicit CPUID values advertise XSAVES
                         *   support, but the host doesn't support XSAVES, the
                         *   host's reported save area size will still be large
                         *   enough for the xcr0-controlled state saved by
                         *   XSAVEC. The area will be undersized for XSAVES,
                         *   but this is OK because the guest can't execute
                         *   XSAVES anyway (it will #UD).
                         */
                        if (!vmm_is_intel()) {
                                *ebx = regs[1];
                        } else {
                                if ((*eax & (CPUID_EXTSTATE_XSAVEC |
                                    CPUID_EXTSTATE_XSAVES)) == 0) {
                                        *ebx = 0;
                                } else {
                                        *ebx = regs[1];
                                }
                        }
                        break;
                default:
                        /*
                         * Other subleaves of leaf D report the relative sizes
                         * and offsets of the state required for specific
                         * features in the relevant offset masks. These don't
                         * depend on the current enabled features (only the
                         * supported ones), so no enabled-feature specialization
                         * is required.
                         */
                        break;
                }
                break;
        }
}

/*
 * Emulates the CPUID instruction on the specified vCPU and returns its outputs
 * in the rax/rbx/rcx/rdx variables.
 *
 * This function assumes it is called from within VMRUN(), which guarantees that
 * certain guest state (e.g. FPU state) remains loaded.
 */
void
vcpu_emulate_cpuid(struct vm *vm, int vcpuid, uint64_t *rax, uint64_t *rbx,
    uint64_t *rcx, uint64_t *rdx)
{
        const vcpu_cpuid_config_t *cfg = vm_cpuid_config(vm, vcpuid);
        uint32_t func, index;

        ASSERT3P(rax, !=, NULL);
        ASSERT3P(rbx, !=, NULL);
        ASSERT3P(rcx, !=, NULL);
        ASSERT3P(rdx, !=, NULL);

        uint32_t regs[4] = { *rax, 0, *rcx, 0 };
        func = (uint32_t)*rax;
        index = (uint32_t)*rcx;

        /* Fall back to legacy handling if specified */
        if ((cfg->vcc_flags & VCC_FLAG_LEGACY_HANDLING) != 0) {
                legacy_emulate_cpuid(vm, vcpuid, &regs[0], &regs[1], &regs[2],
                    &regs[3]);
        } else {
                const struct vcpu_cpuid_entry *ent = cpuid_find_entry(cfg, func,
                    index);
                ASSERT(ent != NULL);

                /*
                 * The function and index in the found entry may differ from
                 * what the guest requested (if the entry was chosen via the
                 * "highest leaf" fallback described above). Use the values
                 * from the entry to ensure that the correct vCPU state fixups
                 * get applied below.
                 *
                 * The found entry may also be an all-zero empty entry (if the
                 * requested leaf is invalid but is less than the maximum valid
                 * leaf). It's OK to fall through in this case because leaf 0
                 * never has any CPU state-based fixups to apply.
                 */
                func = ent->vce_function;
                index = ent->vce_index;
                regs[0] = ent->vce_eax;
                regs[1] = ent->vce_ebx;
                regs[2] = ent->vce_ecx;
                regs[3] = ent->vce_edx;
        }

        /* Fix up any returned values that vary with guest register state. */
        cpuid_apply_runtime_reg_state(vm, vcpuid, func, index, &regs[0],
            &regs[1], &regs[2], &regs[3]);

        /* CPUID clears the upper 32-bits of the long-mode registers. */
        *rax = regs[0];
        *rbx = regs[1];
        *rcx = regs[2];
        *rdx = regs[3];
}

/*
 * Get the current CPUID emulation configuration for this vCPU.
 *
 * Only the existing flags will be emitted if the vCPU is configured for legacy
 * operation via the VCC_FLAG_LEGACY_HANDLING flag. If in userspace-controlled
 * mode, then we will attempt to copy the existing entries into vcc_entries,
 * with its size specified by vcc_nent.
 *
 * Regardless of whether vcc_entries is adequately sized (or even present),
 * vcc_nent will be set to the number of existing entries.
 */
int
vm_get_cpuid(struct vm *vm, int vcpuid, vcpu_cpuid_config_t *res)
{
        if (vcpuid < 0 || vcpuid >= VM_MAXCPU) {
                return (EINVAL);
        }

        const vcpu_cpuid_config_t *src = vm_cpuid_config(vm, vcpuid);
        if (src->vcc_nent > res->vcc_nent) {
                res->vcc_nent = src->vcc_nent;
                return (E2BIG);
        } else if (src->vcc_nent != 0) {
                bcopy(src->vcc_entries, res->vcc_entries,
                    src->vcc_nent * sizeof (struct vcpu_cpuid_entry));
        }
        res->vcc_flags = src->vcc_flags;
        res->vcc_nent = src->vcc_nent;
        return (0);
}
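
/*
 * A minimal sketch of the intended two-call pattern for a caller which does
 * not know the entry count in advance (error handling and the surrounding
 * ioctl plumbing are omitted, and the local names are hypothetical):
 *
 *        vcpu_cpuid_config_t cfg = { 0 };
 *
 *        if (vm_get_cpuid(vm, vcpuid, &cfg) == E2BIG) {
 *                cfg.vcc_entries = kmem_alloc(cfg.vcc_nent *
 *                    sizeof (struct vcpu_cpuid_entry), KM_SLEEP);
 *                (void) vm_get_cpuid(vm, vcpuid, &cfg);
 *        }
 *
 * After the E2BIG return, vcc_nent holds the required entry count, so the
 * second call can populate the freshly allocated vcc_entries array.
 */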

/*
 * Set the CPUID emulation configuration for this vCPU.
 *
 * If VCC_FLAG_LEGACY_HANDLING is set in vcc_flags, then vcc_nent is expected to
 * be set to 0, as configuring a list of entries would be useless when using the
 * legacy handling.
 *
 * Any existing entries which are configured are freed, and the newly provided
 * ones will be copied into their place.
 */
int
vm_set_cpuid(struct vm *vm, int vcpuid, const vcpu_cpuid_config_t *src)
{
        if (vcpuid < 0 || vcpuid >= VM_MAXCPU) {
                return (EINVAL);
        }
        if (src->vcc_nent > VMM_MAX_CPUID_ENTRIES) {
                return (EINVAL);
        }
        if ((src->vcc_flags & ~VCC_FLAGS_VALID) != 0) {
                return (EINVAL);
        }
        if ((src->vcc_flags & VCC_FLAG_LEGACY_HANDLING) != 0 &&
            src->vcc_nent != 0) {
                /* No entries should be provided if using legacy handling */
                return (EINVAL);
        }
        for (uint_t i = 0; i < src->vcc_nent; i++) {
                /* Ensure all entries carry valid flags */
                if ((src->vcc_entries[i].vce_flags & ~VCE_FLAGS_VALID) != 0) {
                        return (EINVAL);
                }
        }

        vcpu_cpuid_config_t *cfg = vm_cpuid_config(vm, vcpuid);

        /* Free any existing entries first */
        vcpu_cpuid_cleanup(cfg);

        /* Copy supplied entries into freshly allocated space */
        if (src->vcc_nent != 0) {
                const size_t entries_sz =
                    src->vcc_nent * sizeof (struct vcpu_cpuid_entry);

                cfg->vcc_nent = src->vcc_nent;
                cfg->vcc_entries = kmem_alloc(entries_sz, KM_SLEEP);
                bcopy(src->vcc_entries, cfg->vcc_entries, entries_sz);
        }
        cfg->vcc_flags = src->vcc_flags;

        return (0);
}

void
vcpu_cpuid_init(vcpu_cpuid_config_t *cfg)
{
        /* Default to legacy-style handling */
        cfg->vcc_flags = VCC_FLAG_LEGACY_HANDLING;
        cfg->vcc_nent = 0;
        cfg->vcc_entries = NULL;
}

void
vcpu_cpuid_cleanup(vcpu_cpuid_config_t *cfg)
{
        if (cfg->vcc_nent != 0) {
                ASSERT3P(cfg->vcc_entries, !=, NULL);

                kmem_free(cfg->vcc_entries,
                    cfg->vcc_nent * sizeof (struct vcpu_cpuid_entry));

                cfg->vcc_nent = 0;
                cfg->vcc_entries = NULL;
        }
}

static const char bhyve_id[12] = "bhyve bhyve ";

/*
 * Force exposure of the invariant TSC capability, regardless of whether the
 * host CPU reports having it.
 */
static int vmm_force_invariant_tsc = 0;
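
/*
 * As a static module-global, this tunable should be settable at boot time via
 * /etc/system in the usual way, e.g.:
 *
 *        set vmm:vmm_force_invariant_tsc = 1
 */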

/*
 * CPUID instruction Fn0000_0001:
 */
#define CPUID_0000_0001_APICID_SHIFT    24


/*
 * Compute ceil(log2(x)). Returns -1 if x is zero.
 */
static __inline int
log2(uint_t x)
{
        return (x == 0 ? -1 : fls(x - 1));
}
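
/*
 * For example, under the definition above: log2(1) == fls(0) == 0,
 * log2(4) == fls(3) == 2, and log2(6) == fls(5) == 3, matching ceil(log2(x))
 * in each case.
 */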

/*
 * The "legacy" bhyve cpuid emulation, which largely applies statically defined
 * masks to the data provided by the host CPU.
 */
void
legacy_emulate_cpuid(struct vm *vm, int vcpu_id, uint32_t *eax, uint32_t *ebx,
    uint32_t *ecx, uint32_t *edx)
{
        const struct xsave_limits *limits;
        int error, enable_invpcid, level, width = 0, x2apic_id = 0;
        unsigned int func, regs[4], logical_cpus = 0, param;
        enum x2apic_state x2apic_state;
        uint16_t cores, maxcpus, sockets, threads;

        /*
         * The function of CPUID is controlled through the provided value of
         * %eax (and secondarily %ecx, for certain leaf data).
         */
        func = (uint32_t)*eax;
        param = (uint32_t)*ecx;

        /*
         * Requests for invalid CPUID levels should map to the highest
         * available level instead.
         */
        if (cpu_exthigh != 0 && func >= 0x80000000) {
                if (func > cpu_exthigh)
                        func = cpu_exthigh;
        } else if (func >= 0x40000000) {
                if (func > CPUID_VM_HIGH)
                        func = CPUID_VM_HIGH;
        } else if (func > cpu_high) {
                func = cpu_high;
        }

        /*
         * In general the approach used for CPU topology is to
         * advertise a flat topology where all CPUs are packages with
         * no multi-core or SMT.
         */
        switch (func) {
                /*
                 * Pass these through to the guest
                 */
                case CPUID_0000_0000:
                case CPUID_0000_0002:
                case CPUID_0000_0003:
                case CPUID_8000_0000:
                case CPUID_8000_0002:
                case CPUID_8000_0003:
                case CPUID_8000_0004:
                case CPUID_8000_0006:
                        cpuid_count(func, param, regs);
                        break;
                case CPUID_8000_0008:
                        cpuid_count(func, param, regs);
                        if (vmm_is_svm()) {
                                /*
                                 * As on Intel (0000_0007:0, EDX), mask out
                                 * unsupported or unsafe AMD extended features
                                 * (8000_0008 EBX).
                                 */
                                regs[1] &= (AMDFEID_CLZERO | AMDFEID_IRPERF |
                                    AMDFEID_XSAVEERPTR);

                                vm_get_topology(vm, &sockets, &cores, &threads,
                                    &maxcpus);
                                /*
                                 * Here, width is ApicIdCoreIdSize, present on
                                 * at least Family 15h and newer. It
                                 * represents the "number of bits in the
                                 * initial apicid that indicate thread id
                                 * within a package."
                                 *
                                 * Our topo_probe_amd() uses it for
                                 * pkg_id_shift and other OSes may rely on it.
                                 */
                                width = MIN(0xF, log2(threads * cores));
                                if (width < 0x4)
                                        width = 0;
                                logical_cpus = MIN(0xFF, threads * cores - 1);
                                regs[2] = (width << AMDID_COREID_SIZE_SHIFT) |
                                    logical_cpus;
                        }
                        break;

                case CPUID_8000_0001:
                        cpuid_count(func, param, regs);

                        /*
                         * Hide SVM from guest.
                         */
                        regs[2] &= ~AMDID2_SVM;

                        /*
                         * Don't advertise extended performance counter MSRs
                         * to the guest.
                         */
                        regs[2] &= ~AMDID2_PCXC;
                        regs[2] &= ~AMDID2_PNXC;
                        regs[2] &= ~AMDID2_PTSCEL2I;

                        /*
                         * Don't advertise Instruction Based Sampling feature.
                         */
                        regs[2] &= ~AMDID2_IBS;

                        /* NodeID MSR not available */
                        regs[2] &= ~AMDID2_NODE_ID;

                        /* Don't advertise the OS visible workaround feature */
                        regs[2] &= ~AMDID2_OSVW;

                        /* Hide mwaitx/monitorx capability from the guest */
                        regs[2] &= ~AMDID2_MWAITX;

#ifndef __FreeBSD__
                        /*
                         * Detection routines for TCE and FFXSR are missing
                         * from our vm_cpuid_capability() detection logic
                         * today. Mask them out until that is remedied.
                         * They do not appear to be in common usage, so their
                         * absence should not cause undue trouble.
                         */
                        regs[2] &= ~AMDID2_TCE;
                        regs[3] &= ~AMDID_FFXSR;
#endif

                        /*
                         * Hide rdtscp/ia32_tsc_aux until we know how
                         * to deal with them.
                         */
                        regs[3] &= ~AMDID_RDTSCP;
                        break;

                case CPUID_8000_0007:
                        cpuid_count(func, param, regs);
                        /*
                         * AMD uses this leaf to advertise the processor's
                         * power monitoring and RAS capabilities. These
                         * features are hardware-specific and exposing
                         * them to a guest doesn't make a lot of sense.
                         *
                         * Intel uses this leaf only to advertise the
                         * "Invariant TSC" feature with all other bits
                         * being reserved (set to zero).
                         */
                        regs[0] = 0;
                        regs[1] = 0;
                        regs[2] = 0;

                        /*
                         * If the host system possesses an invariant TSC, then
                         * it is safe to expose to the guest.
                         *
                         * If there is measured skew between host TSCs, it will
                         * be properly offset so guests do not observe any
                         * change between CPU migrations.
                         */
                        regs[3] &= AMDPM_TSC_INVARIANT;

                        /*
                         * Since illumos avoids deep C-states on CPUs which do
                         * not support an invariant TSC, it may be safe (and
                         * desired) to unconditionally expose that capability to
                         * the guest.
                         */
                        if (vmm_force_invariant_tsc != 0) {
                                regs[3] |= AMDPM_TSC_INVARIANT;
                        }
                        break;

                case CPUID_8000_001D:
                        /* AMD Cache topology, like 0000_0004 for Intel. */
                        if (!vmm_is_svm())
                                goto default_leaf;

                        /*
                         * Similar to Intel, generate a fictitious cache
                         * topology for the guest with L3 shared by the
                         * package, and L1 and L2 local to a core.
                         */
                        vm_get_topology(vm, &sockets, &cores, &threads,
                            &maxcpus);
                        switch (param) {
                        case 0:
                                logical_cpus = threads;
                                level = 1;
                                func = 1;       /* data cache */
                                break;
                        case 1:
                                logical_cpus = threads;
                                level = 2;
                                func = 3;       /* unified cache */
                                break;
                        case 2:
                                logical_cpus = threads * cores;
                                level = 3;
                                func = 3;       /* unified cache */
                                break;
                        default:
                                logical_cpus = 0;
                                level = 0;
                                func = 0;
                                break;
                        }

                        if (level == 0) {
                                regs[0] = 0;
                                regs[1] = 0;
                        } else {
                                logical_cpus = MIN(0xfff, logical_cpus - 1);
                                regs[0] = (logical_cpus << 14) | (1 << 8) |
                                    (level << 5) | func;
                                regs[1] = func > 0 ? _CACHE_LINE_SIZE - 1 : 0;
                        }
                        regs[2] = 0;
                        regs[3] = 0;
                        break;

                case CPUID_8000_001E:
                        /*
                         * AMD Family 16h+ and Hygon Family 18h additional
                         * identifiers.
                         */
                        if (!vmm_is_svm() || CPUID_TO_FAMILY(cpu_id) < 0x16)
                                goto default_leaf;

                        vm_get_topology(vm, &sockets, &cores, &threads,
                            &maxcpus);
                        regs[0] = vcpu_id;
                        threads = MIN(0xFF, threads - 1);
                        regs[1] = (threads << 8) |
                            (vcpu_id >> log2(threads + 1));
                        /*
                         * XXX Bhyve topology cannot yet represent >1 node per
                         * processor.
                         */
                        regs[2] = 0;
                        regs[3] = 0;
                        break;

                case CPUID_0000_0001:
                        do_cpuid(1, regs);

                        error = vm_get_x2apic_state(vm, vcpu_id, &x2apic_state);
                        VERIFY0(error);

                        /*
                         * Override the APIC ID only in ebx
                         */
                        regs[1] &= ~(CPUID_LOCAL_APIC_ID);
                        regs[1] |= (vcpu_id << CPUID_0000_0001_APICID_SHIFT);

                        /*
                         * Don't expose VMX, SpeedStep, TM2 or SMX capability.
                         * Advertise x2APIC capability and Hypervisor guest.
                         */
                        regs[2] &= ~(CPUID2_VMX | CPUID2_EST | CPUID2_TM2);
                        regs[2] &= ~(CPUID2_SMX);

                        regs[2] |= CPUID2_HV;

                        if (x2apic_state != X2APIC_DISABLED)
                                regs[2] |= CPUID2_X2APIC;
                        else
                                regs[2] &= ~CPUID2_X2APIC;

                        /*
                         * Only advertise CPUID2_XSAVE in the guest if
                         * the host is using XSAVE.
                         */
                        if (!(regs[2] & CPUID2_OSXSAVE))
                                regs[2] &= ~CPUID2_XSAVE;

                        /*
                         * Hide monitor/mwait until we know how to deal with
                         * these instructions.
                         */
                        regs[2] &= ~CPUID2_MON;

                        /*
                         * Hide the performance and debug features.
                         */
                        regs[2] &= ~CPUID2_PDCM;

                        /*
                         * No TSC deadline support in the APIC yet
                         */
                        regs[2] &= ~CPUID2_TSCDLT;

                        /*
                         * Hide thermal monitoring
                         */
                        regs[3] &= ~(CPUID_ACPI | CPUID_TM);

                        /*
                         * Hide the debug store capability.
                         */
                        regs[3] &= ~CPUID_DS;

                        /*
                         * Advertise the Machine Check and MTRR capability.
                         *
                         * Some guest OSes (e.g. Windows) will not boot if
                         * these features are absent.
                         */
                        regs[3] |= (CPUID_MCA | CPUID_MCE | CPUID_MTRR);

                        vm_get_topology(vm, &sockets, &cores, &threads,
                            &maxcpus);
                        logical_cpus = threads * cores;
                        regs[1] &= ~CPUID_HTT_CORES;
                        regs[1] |= (logical_cpus & 0xff) << 16;
                        regs[3] |= CPUID_HTT;
                        break;

                case CPUID_0000_0004:
                        cpuid_count(func, param, regs);

                        if (regs[0] || regs[1] || regs[2] || regs[3]) {
                                vm_get_topology(vm, &sockets, &cores, &threads,
                                    &maxcpus);
                                regs[0] &= 0x3ff;
                                regs[0] |= (cores - 1) << 26;
                                /*
                                 * Cache topology:
                                 * - L1 and L2 are shared only by the logical
                                 *   processors in a single core.
                                 * - L3 and above are shared by all logical
                                 *   processors in the package.
                                 */
                                logical_cpus = threads;
                                level = (regs[0] >> 5) & 0x7;
                                if (level >= 3)
                                        logical_cpus *= cores;
                                regs[0] |= (logical_cpus - 1) << 14;
                        }
                        break;

                case CPUID_0000_0007:
                        regs[0] = 0;
                        regs[1] = 0;
                        regs[2] = 0;
                        regs[3] = 0;

                        /* leaf 0 */
                        if (param == 0) {
                                cpuid_count(func, param, regs);

                                /* Only leaf 0 is supported */
                                regs[0] = 0;

                                /*
                                 * Expose known-safe features.
                                 */
                                regs[1] &= CPUID_STDEXT_FSGSBASE |
                                    CPUID_STDEXT_BMI1 | CPUID_STDEXT_HLE |
                                    CPUID_STDEXT_AVX2 | CPUID_STDEXT_SMEP |
                                    CPUID_STDEXT_BMI2 |
                                    CPUID_STDEXT_ERMS | CPUID_STDEXT_RTM |
                                    CPUID_STDEXT_AVX512F |
                                    CPUID_STDEXT_AVX512DQ |
                                    CPUID_STDEXT_RDSEED |
                                    CPUID_STDEXT_SMAP |
                                    CPUID_STDEXT_AVX512PF |
                                    CPUID_STDEXT_AVX512ER |
                                    CPUID_STDEXT_AVX512CD | CPUID_STDEXT_SHA |
                                    CPUID_STDEXT_AVX512BW |
                                    CPUID_STDEXT_AVX512VL;
                                regs[2] &= CPUID_STDEXT2_VAES |
                                    CPUID_STDEXT2_VPCLMULQDQ;
                                regs[3] &= CPUID_STDEXT3_MD_CLEAR;

                                /* Advertise INVPCID if it is enabled. */
                                error = vm_get_capability(vm, vcpu_id,
                                    VM_CAP_ENABLE_INVPCID, &enable_invpcid);
                                if (error == 0 && enable_invpcid)
                                        regs[1] |= CPUID_STDEXT_INVPCID;
                        }
                        break;

                case CPUID_0000_0006:
                        regs[0] = CPUTPM1_ARAT;
                        regs[1] = 0;
                        regs[2] = 0;
                        regs[3] = 0;
                        break;

                case CPUID_0000_000A:
                        /*
                         * Handle the access, but report 0 for
                         * all options
                         */
                        regs[0] = 0;
                        regs[1] = 0;
                        regs[2] = 0;
                        regs[3] = 0;
                        break;

                case CPUID_0000_000B:
                        /*
                         * Intel processor topology enumeration
                         */
                        if (vmm_is_intel()) {
                                vm_get_topology(vm, &sockets, &cores, &threads,
                                    &maxcpus);
                                if (param == 0) {
                                        logical_cpus = threads;
                                        width = log2(logical_cpus);
                                        level = CPUID_TYPE_SMT;
                                        x2apic_id = vcpu_id;
                                }

                                if (param == 1) {
                                        logical_cpus = threads * cores;
                                        width = log2(logical_cpus);
                                        level = CPUID_TYPE_CORE;
                                        x2apic_id = vcpu_id;
                                }

                                if (param >= 2) {
                                        width = 0;
                                        logical_cpus = 0;
                                        level = 0;
                                        x2apic_id = 0;
                                }

                                regs[0] = width & 0x1f;
                                regs[1] = logical_cpus & 0xffff;
                                regs[2] = (level << 8) | (param & 0xff);
                                regs[3] = x2apic_id;
                        } else {
                                regs[0] = 0;
                                regs[1] = 0;
                                regs[2] = 0;
                                regs[3] = 0;
                        }
                        break;

                case CPUID_0000_000D:
                        limits = vmm_get_xsave_limits();
                        if (!limits->xsave_enabled) {
                                regs[0] = 0;
                                regs[1] = 0;
                                regs[2] = 0;
                                regs[3] = 0;
                                break;
                        }

                        cpuid_count(func, param, regs);
                        switch (param) {
                        case 0:
                                /*
                                 * Only permit the guest to use bits
                                 * that are active in the host in
                                 * %xcr0. Also, claim that the
                                 * maximum save area size is
                                 * equivalent to the host's current
                                 * save area size. Since this runs
                                 * "inside" of vmrun(), it runs with
                                 * the guest's xcr0, so the current
                                 * save area size is correct as-is.
                                 */
                                regs[0] &= limits->xcr0_allowed;
                                regs[2] = limits->xsave_max_size;
                                regs[3] &= (limits->xcr0_allowed >> 32);
                                break;
                        case 1:
                                /* Only permit XSAVEOPT. */
                                regs[0] &= CPUID_EXTSTATE_XSAVEOPT;
                                regs[1] = 0;
                                regs[2] = 0;
                                regs[3] = 0;
                                break;
                        default:
                                /*
                                 * If the leaf is for a permitted feature,
                                 * pass through as-is, otherwise return
                                 * all zeroes.
                                 */
                                if (!(limits->xcr0_allowed & (1ul << param))) {
                                        regs[0] = 0;
                                        regs[1] = 0;
                                        regs[2] = 0;
                                        regs[3] = 0;
                                }
                                break;
                        }
                        break;

                case CPUID_0000_000F:
                case CPUID_0000_0010:
                        /*
                         * Do not report any Resource Director Technology
                         * capabilities. Exposing control of cache or memory
                         * controller resource partitioning to the guest is not
                         * at all sensible.
                         *
                         * This is already hidden at a high level by masking of
                         * leaf 0x7. Even still, a guest may look here for
                         * detailed capability information.
                         */
                        regs[0] = 0;
                        regs[1] = 0;
                        regs[2] = 0;
                        regs[3] = 0;
                        break;

                case CPUID_0000_0015:
                        /*
                         * Don't report CPU TSC/Crystal ratio and clock
                         * values, since guests may use these to derive the
                         * local APIC frequency.
                         */
                        regs[0] = 0;
                        regs[1] = 0;
                        regs[2] = 0;
                        regs[3] = 0;
                        break;

                case 0x40000000:
                        regs[0] = CPUID_VM_HIGH;
                        bcopy(bhyve_id, &regs[1], 4);
                        bcopy(bhyve_id + 4, &regs[2], 4);
                        bcopy(bhyve_id + 8, &regs[3], 4);
                        break;

                default:
default_leaf:
                        /*
                         * The leaf value has already been clamped so
                         * simply pass this through.
                         */
                        cpuid_count(func, param, regs);
                        break;
        }

        *eax = regs[0];
        *ebx = regs[1];
        *ecx = regs[2];
        *edx = regs[3];
}