/*-
 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
 *
 * Copyright (c) 2011 NetApp, Inc.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * $FreeBSD$
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include <sys/param.h>
#include <sys/pcpu.h>
#include <sys/systm.h>
#include <sys/sysctl.h>

#include <machine/clock.h>
#include <machine/cpufunc.h>
#include <machine/md_var.h>
#include <machine/segments.h>
#include <machine/specialreg.h>

#include <machine/vmm.h>

#include "vmm_host.h"
#include "vmm_ktr.h"
#include "vmm_util.h"
#include "x86.h"

SYSCTL_DECL(_hw_vmm);
static SYSCTL_NODE(_hw_vmm, OID_AUTO, topology, CTLFLAG_RD, 0, NULL);

#define	CPUID_VM_HIGH		0x40000000

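/*
 * The 12-byte hypervisor signature returned by leaf 0x40000000 below; it
 * is copied verbatim into %ebx, %ecx and %edx so guests can identify the
 * hypervisor they are running on.
 */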
static const char bhyve_id[12] = "bhyve bhyve ";

static uint64_t bhyve_xcpuids;
SYSCTL_ULONG(_hw_vmm, OID_AUTO, bhyve_xcpuids, CTLFLAG_RW, &bhyve_xcpuids, 0,
    "Number of times an unknown cpuid leaf was accessed");

#if __FreeBSD_version < 1200060	/* Keep to ease MFCs; remove after 11 EOL */
extern u_int threads_per_core;
SYSCTL_UINT(_hw_vmm_topology, OID_AUTO, threads_per_core, CTLFLAG_RDTUN,
    &threads_per_core, 0, NULL);

extern u_int cores_per_package;
SYSCTL_UINT(_hw_vmm_topology, OID_AUTO, cores_per_package, CTLFLAG_RDTUN,
    &cores_per_package, 0, NULL);
#endif

static int cpuid_leaf_b = 1;
SYSCTL_INT(_hw_vmm_topology, OID_AUTO, cpuid_leaf_b, CTLFLAG_RDTUN,
    &cpuid_leaf_b, 0, NULL);

/*
 * Round up to the next power of two, if necessary, and then take log2.
 * Returns -1 if argument is zero.
 */
static __inline int
log2(u_int x)
{

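	/*
	 * powerof2() from <sys/param.h> is true for powers of two (and for
	 * zero), so a non-power-of-two is first shifted left by one bit and
	 * fls() then rounds up; fls(0) is 0, giving the -1 result for zero.
	 */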
	return (fls(x << (1 - powerof2(x))) - 1);
}

int
x86_emulate_cpuid(struct vm *vm, int vcpu_id,
		  uint32_t *eax, uint32_t *ebx, uint32_t *ecx, uint32_t *edx)
{
	const struct xsave_limits *limits;
	uint64_t cr4;
	int error, enable_invpcid, level, width, x2apic_id;
	unsigned int func, regs[4], logical_cpus;
	enum x2apic_state x2apic_state;
	uint16_t cores, maxcpus, sockets, threads;

	VCPU_CTR2(vm, vcpu_id, "cpuid %#x,%#x", *eax, *ecx);

	/*
	 * Requests for invalid CPUID levels should map to the highest
	 * available level instead.
	 */
	if (cpu_exthigh != 0 && *eax >= 0x80000000) {
		if (*eax > cpu_exthigh)
			*eax = cpu_exthigh;
	} else if (*eax >= 0x40000000) {
		if (*eax > CPUID_VM_HIGH)
			*eax = CPUID_VM_HIGH;
	} else if (*eax > cpu_high) {
		*eax = cpu_high;
	}

	func = *eax;

	/*
	 * In general the approach used for CPU topology is to
	 * advertise a flat topology where all CPUs are packages with
	 * no multi-core or SMT.
	 */
	switch (func) {
		/*
		 * Pass these through to the guest
		 */
		case CPUID_0000_0000:
		case CPUID_0000_0002:
		case CPUID_0000_0003:
		case CPUID_8000_0000:
		case CPUID_8000_0002:
		case CPUID_8000_0003:
		case CPUID_8000_0004:
		case CPUID_8000_0006:
			cpuid_count(*eax, *ecx, regs);
			break;
		case CPUID_8000_0008:
			cpuid_count(*eax, *ecx, regs);
			if (vmm_is_svm()) {
				/*
				 * As on Intel (0000_0007:0, EDX), mask out
				 * unsupported or unsafe AMD extended features
				 * (8000_0008 EBX).
				 */
				regs[1] &= (AMDFEID_CLZERO | AMDFEID_IRPERF |
				    AMDFEID_XSAVEERPTR);

				vm_get_topology(vm, &sockets, &cores, &threads,
				    &maxcpus);
				/*
				 * Here, width is ApicIdCoreIdSize, present on
				 * at least Family 15h and newer.  It
				 * represents the "number of bits in the
				 * initial apicid that indicate thread id
				 * within a package."
				 *
				 * Our topo_probe_amd() uses it for
				 * pkg_id_shift and other OSes may rely on it.
				 */
				width = MIN(0xF, log2(threads * cores));
				if (width < 0x4)
					width = 0;
				logical_cpus = MIN(0xFF, threads * cores - 1);
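				/*
				 * Pack ApicIdCoreIdSize into %ecx bits 15:12
				 * and NC, the number of threads in the
				 * package minus one, into bits 7:0.
				 */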
				regs[2] = (width << AMDID_COREID_SIZE_SHIFT) | logical_cpus;
			}
			break;

		case CPUID_8000_0001:
			cpuid_count(*eax, *ecx, regs);

			/*
			 * Hide SVM from guest.
			 */
			regs[2] &= ~AMDID2_SVM;

			/*
			 * Don't advertise extended performance counter MSRs
			 * to the guest.
			 */
			regs[2] &= ~AMDID2_PCXC;
			regs[2] &= ~AMDID2_PNXC;
			regs[2] &= ~AMDID2_PTSCEL2I;

			/*
			 * Don't advertise Instruction Based Sampling feature.
			 */
			regs[2] &= ~AMDID2_IBS;

			/* NodeID MSR not available */
			regs[2] &= ~AMDID2_NODE_ID;

			/* Don't advertise the OS visible workaround feature */
			regs[2] &= ~AMDID2_OSVW;

			/* Hide mwaitx/monitorx capability from the guest */
			regs[2] &= ~AMDID2_MWAITX;

			/*
			 * Hide rdtscp/ia32_tsc_aux until we know how
			 * to deal with them.
			 */
			regs[3] &= ~AMDID_RDTSCP;
			break;

		case CPUID_8000_0007:
			/*
			 * AMD uses this leaf to advertise the processor's
			 * power monitoring and RAS capabilities. These
			 * features are hardware-specific and exposing
			 * them to a guest doesn't make a lot of sense.
			 *
			 * Intel uses this leaf only to advertise the
			 * "Invariant TSC" feature with all other bits
			 * being reserved (set to zero).
			 */
			regs[0] = 0;
			regs[1] = 0;
			regs[2] = 0;
			regs[3] = 0;

			/*
			 * "Invariant TSC" can be advertised to the guest if:
			 * - host TSC frequency is invariant
			 * - host TSCs are synchronized across physical cpus
			 *
			 * XXX This still falls short because the vcpu
			 * can observe the TSC moving backwards as it
			 * migrates across physical cpus. But at least
			 * it should discourage the guest from using the
			 * TSC to keep track of time.
			 */
			if (tsc_is_invariant && smp_tsc)
				regs[3] |= AMDPM_TSC_INVARIANT;
			break;

		case CPUID_8000_001D:
			/* AMD Cache topology, like 0000_0004 for Intel. */
			if (!vmm_is_svm())
				goto default_leaf;

			/*
			 * Similar to Intel, generate a fictitious cache
			 * topology for the guest with L3 shared by the
			 * package, and L1 and L2 local to a core.
			 */
			vm_get_topology(vm, &sockets, &cores, &threads,
			    &maxcpus);
			switch (*ecx) {
			case 0:
				logical_cpus = threads;
				level = 1;
				func = 1;	/* data cache */
				break;
			case 1:
				logical_cpus = threads;
				level = 2;
				func = 3;	/* unified cache */
				break;
			case 2:
				logical_cpus = threads * cores;
				level = 3;
				func = 3;	/* unified cache */
				break;
			default:
				logical_cpus = 0;
				level = 0;
				func = 0;
				break;
			}

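			/*
			 * %eax layout matches Intel leaf 4: bits 25:14 hold
			 * the number of logical processors sharing the cache
			 * minus one, bit 8 marks it self-initializing, bits
			 * 7:5 give the cache level and bits 4:0 the type.
			 */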
			logical_cpus = MIN(0xfff, logical_cpus - 1);
			regs[0] = (logical_cpus << 14) | (1 << 8) |
			    (level << 5) | func;
			regs[1] = (func > 0) ? (CACHE_LINE_SIZE - 1) : 0;
			regs[2] = 0;
			regs[3] = 0;
			break;

		case CPUID_8000_001E:
			/*
			 * AMD Family 16h+ and Hygon Family 18h additional
			 * identifiers.
			 */
			if (!vmm_is_svm() || CPUID_TO_FAMILY(cpu_id) < 0x16)
				goto default_leaf;

			vm_get_topology(vm, &sockets, &cores, &threads,
			    &maxcpus);
			regs[0] = vcpu_id;
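			/*
			 * %ebx[15:8] is ThreadsPerCore, the number of
			 * threads per core minus one, and %ebx[7:0] the
			 * core ID, derived here from the vcpu ID.
			 */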
			threads = MIN(0xFF, threads - 1);
			regs[1] = (threads << 8) |
			    (vcpu_id >> log2(threads + 1));
			/*
			 * XXX Bhyve topology cannot yet represent >1 node per
			 * processor.
			 */
			regs[2] = 0;
			regs[3] = 0;
			break;

		case CPUID_0000_0001:
			do_cpuid(1, regs);

			error = vm_get_x2apic_state(vm, vcpu_id, &x2apic_state);
			if (error) {
				panic("x86_emulate_cpuid: error %d "
				      "fetching x2apic state", error);
			}

			/*
			 * Override the APIC ID only in ebx
			 */
			regs[1] &= ~(CPUID_LOCAL_APIC_ID);
			regs[1] |= (vcpu_id << CPUID_0000_0001_APICID_SHIFT);

			/*
			 * Don't expose VMX, SpeedStep, TM2 or SMX capability.
			 * Advertise x2APIC capability and Hypervisor guest.
			 */
			regs[2] &= ~(CPUID2_VMX | CPUID2_EST | CPUID2_TM2);
			regs[2] &= ~(CPUID2_SMX);

			regs[2] |= CPUID2_HV;

			if (x2apic_state != X2APIC_DISABLED)
				regs[2] |= CPUID2_X2APIC;
			else
				regs[2] &= ~CPUID2_X2APIC;

			/*
			 * Only advertise CPUID2_XSAVE in the guest if
			 * the host is using XSAVE.
			 */
			if (!(regs[2] & CPUID2_OSXSAVE))
				regs[2] &= ~CPUID2_XSAVE;

			/*
			 * If CPUID2_XSAVE is being advertised and the
			 * guest has set CR4_XSAVE, set
			 * CPUID2_OSXSAVE.
			 */
			regs[2] &= ~CPUID2_OSXSAVE;
			if (regs[2] & CPUID2_XSAVE) {
				error = vm_get_register(vm, vcpu_id,
				    VM_REG_GUEST_CR4, &cr4);
				if (error)
					panic("x86_emulate_cpuid: error %d "
					      "fetching %%cr4", error);
				if (cr4 & CR4_XSAVE)
					regs[2] |= CPUID2_OSXSAVE;
			}

			/*
			 * Hide monitor/mwait until we know how to deal with
			 * these instructions.
			 */
			regs[2] &= ~CPUID2_MON;

			/*
			 * Hide the performance and debug features.
			 */
			regs[2] &= ~CPUID2_PDCM;

			/*
			 * No TSC deadline support in the APIC yet
			 */
			regs[2] &= ~CPUID2_TSCDLT;

			/*
			 * Hide thermal monitoring
			 */
			regs[3] &= ~(CPUID_ACPI | CPUID_TM);

			/*
			 * Hide the debug store capability.
			 */
			regs[3] &= ~CPUID_DS;

			/*
			 * Advertise the Machine Check and MTRR capability.
			 *
			 * Some guest OSes (e.g. Windows) will not boot if
			 * these features are absent.
			 */
			regs[3] |= (CPUID_MCA | CPUID_MCE | CPUID_MTRR);

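			/*
			 * %ebx bits 23:16 report the maximum number of
			 * addressable logical processor IDs per package;
			 * the field is only meaningful when CPUID_HTT is
			 * set in %edx, so the two are set together.
			 */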
			vm_get_topology(vm, &sockets, &cores, &threads,
			    &maxcpus);
			logical_cpus = threads * cores;
			regs[1] &= ~CPUID_HTT_CORES;
			regs[1] |= (logical_cpus & 0xff) << 16;
			regs[3] |= CPUID_HTT;
			break;

		case CPUID_0000_0004:
			cpuid_count(*eax, *ecx, regs);

			if (regs[0] || regs[1] || regs[2] || regs[3]) {
				vm_get_topology(vm, &sockets, &cores, &threads,
				    &maxcpus);
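				/*
				 * Keep only the low cache type, level and
				 * self-initializing bits of %eax, then
				 * encode the number of cores per package
				 * minus one in bits 31:26.
				 */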
				regs[0] &= 0x3ff;
				regs[0] |= (cores - 1) << 26;
				/*
				 * Cache topology:
				 * - L1 and L2 are shared only by the logical
				 *   processors in a single core.
				 * - L3 and above are shared by all logical
				 *   processors in the package.
				 */
				logical_cpus = threads;
				level = (regs[0] >> 5) & 0x7;
				if (level >= 3)
					logical_cpus *= cores;
				regs[0] |= (logical_cpus - 1) << 14;
			}
			break;

		case CPUID_0000_0007:
			regs[0] = 0;
			regs[1] = 0;
			regs[2] = 0;
			regs[3] = 0;

			/* leaf 0 */
			if (*ecx == 0) {
				cpuid_count(*eax, *ecx, regs);

				/* Only leaf 0 is supported */
				regs[0] = 0;

				/*
				 * Expose known-safe features.
				 */
				regs[1] &= (CPUID_STDEXT_FSGSBASE |
				    CPUID_STDEXT_BMI1 | CPUID_STDEXT_HLE |
				    CPUID_STDEXT_AVX2 | CPUID_STDEXT_BMI2 |
				    CPUID_STDEXT_ERMS | CPUID_STDEXT_RTM |
				    CPUID_STDEXT_AVX512F |
				    CPUID_STDEXT_RDSEED |
				    CPUID_STDEXT_AVX512PF |
				    CPUID_STDEXT_AVX512ER |
				    CPUID_STDEXT_AVX512CD | CPUID_STDEXT_SHA);
				regs[2] = 0;
				regs[3] &= CPUID_STDEXT3_MD_CLEAR;

				/* Advertise INVPCID if it is enabled. */
				error = vm_get_capability(vm, vcpu_id,
				    VM_CAP_ENABLE_INVPCID, &enable_invpcid);
				if (error == 0 && enable_invpcid)
					regs[1] |= CPUID_STDEXT_INVPCID;
			}
			break;

		case CPUID_0000_0006:
			regs[0] = CPUTPM1_ARAT;
			regs[1] = 0;
			regs[2] = 0;
			regs[3] = 0;
			break;

		case CPUID_0000_000A:
			/*
			 * Handle the access, but report 0 for
			 * all options
			 */
			regs[0] = 0;
			regs[1] = 0;
			regs[2] = 0;
			regs[3] = 0;
			break;

		case CPUID_0000_000B:
			/*
			 * Intel processor topology enumeration
			 */
			if (vmm_is_intel()) {
				vm_get_topology(vm, &sockets, &cores, &threads,
				    &maxcpus);
				if (*ecx == 0) {
					logical_cpus = threads;
					width = log2(logical_cpus);
					level = CPUID_TYPE_SMT;
					x2apic_id = vcpu_id;
				}

				if (*ecx == 1) {
					logical_cpus = threads * cores;
					width = log2(logical_cpus);
					level = CPUID_TYPE_CORE;
					x2apic_id = vcpu_id;
				}

				if (!cpuid_leaf_b || *ecx >= 2) {
					width = 0;
					logical_cpus = 0;
					level = 0;
					x2apic_id = 0;
				}

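				/*
				 * %eax[4:0] is the x2APIC ID shift to the
				 * next topology level, %ebx[15:0] the number
				 * of logical processors at this level,
				 * %ecx[15:8] the level type (bits 7:0 echo
				 * the input sub-leaf) and %edx the x2APIC ID
				 * of the current logical processor.
				 */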
				regs[0] = width & 0x1f;
				regs[1] = logical_cpus & 0xffff;
				regs[2] = (level << 8) | (*ecx & 0xff);
				regs[3] = x2apic_id;
			} else {
				regs[0] = 0;
				regs[1] = 0;
				regs[2] = 0;
				regs[3] = 0;
			}
			break;

		case CPUID_0000_000D:
			limits = vmm_get_xsave_limits();
			if (!limits->xsave_enabled) {
				regs[0] = 0;
				regs[1] = 0;
				regs[2] = 0;
				regs[3] = 0;
				break;
			}

			cpuid_count(*eax, *ecx, regs);
			switch (*ecx) {
			case 0:
				/*
				 * Only permit the guest to use bits
				 * that are active in the host in
				 * %xcr0.  Also, claim that the
				 * maximum save area size is
				 * equivalent to the host's current
				 * save area size.  Since this runs
				 * "inside" of vmrun(), it runs with
				 * the guest's xcr0, so the current
				 * save area size is correct as-is.
				 */
				regs[0] &= limits->xcr0_allowed;
				regs[2] = limits->xsave_max_size;
				regs[3] &= (limits->xcr0_allowed >> 32);
				break;
			case 1:
				/* Only permit XSAVEOPT. */
				regs[0] &= CPUID_EXTSTATE_XSAVEOPT;
				regs[1] = 0;
				regs[2] = 0;
				regs[3] = 0;
				break;
			default:
				/*
				 * If the leaf is for a permitted feature,
				 * pass through as-is, otherwise return
				 * all zeroes.
				 */
				if (!(limits->xcr0_allowed & (1ul << *ecx))) {
					regs[0] = 0;
					regs[1] = 0;
					regs[2] = 0;
					regs[3] = 0;
				}
				break;
			}
			break;

		case 0x40000000:
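			/*
			 * Hypervisor vendor leaf: %eax reports the highest
			 * hypervisor leaf implemented and the 12-byte
			 * signature is split across %ebx, %ecx and %edx.
			 */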
			regs[0] = CPUID_VM_HIGH;
			bcopy(bhyve_id, &regs[1], 4);
			bcopy(bhyve_id + 4, &regs[2], 4);
			bcopy(bhyve_id + 8, &regs[3], 4);
			break;

		default:
default_leaf:
			/*
			 * The leaf value has already been clamped so
			 * simply pass this through, keeping count of
			 * how many unhandled leaf values have been seen.
			 */
			atomic_add_long(&bhyve_xcpuids, 1);
			cpuid_count(*eax, *ecx, regs);
			break;
	}

	*eax = regs[0];
	*ebx = regs[1];
	*ecx = regs[2];
	*edx = regs[3];

	return (1);
}

bool
vm_cpuid_capability(struct vm *vm, int vcpuid, enum vm_cpuid_capability cap)
{
	bool rv;

	KASSERT(cap > 0 && cap < VCC_LAST, ("%s: invalid vm_cpu_capability %d",
	    __func__, cap));

	/*
	 * Simply pass through the capabilities of the host cpu for now.
	 */
	rv = false;
	switch (cap) {
	case VCC_NO_EXECUTE:
		if (amd_feature & AMDID_NX)
			rv = true;
		break;
	case VCC_FFXSR:
		if (amd_feature & AMDID_FFXSR)
			rv = true;
		break;
	case VCC_TCE:
		if (amd_feature2 & AMDID2_TCE)
			rv = true;
		break;
	default:
		panic("%s: unknown vm_cpu_capability %d", __func__, cap);
	}
	return (rv);
}
619