/*-
 * Copyright (c) 2011 NetApp, Inc.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * $FreeBSD$
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include <sys/param.h>
#include <sys/pcpu.h>
#include <sys/systm.h>
#include <sys/cpuset.h>
#include <sys/sysctl.h>

#include <machine/clock.h>
#include <machine/cpufunc.h>
#include <machine/md_var.h>
#include <machine/segments.h>
#include <machine/specialreg.h>

#include <machine/vmm.h>

#include "vmm_host.h"
#include "x86.h"

SYSCTL_DECL(_hw_vmm);
static SYSCTL_NODE(_hw_vmm, OID_AUTO, topology, CTLFLAG_RD, 0, NULL);

#define	CPUID_VM_HIGH		0x40000000

static const char bhyve_id[12] = "bhyve bhyve ";

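/* Number of unhandled CPUID leaves seen; bumped in the default case below. */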
static uint64_t bhyve_xcpuids;

/*
 * The default CPU topology is a single thread per package.
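 * It can be overridden at boot time via the loader tunables below
 * (e.g. hw.vmm.topology.cores_per_package=2 in loader.conf); the
 * corresponding sysctls are read-only at runtime (CTLFLAG_RDTUN).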
 */
static u_int threads_per_core = 1;
SYSCTL_UINT(_hw_vmm_topology, OID_AUTO, threads_per_core, CTLFLAG_RDTUN,
    &threads_per_core, 0, NULL);

static u_int cores_per_package = 1;
SYSCTL_UINT(_hw_vmm_topology, OID_AUTO, cores_per_package, CTLFLAG_RDTUN,
    &cores_per_package, 0, NULL);

static int cpuid_leaf_b = 1;
SYSCTL_INT(_hw_vmm_topology, OID_AUTO, cpuid_leaf_b, CTLFLAG_RDTUN,
    &cpuid_leaf_b, 0, NULL);

/*
 * Round up to the next power of two, if necessary, and then take log2.
 * Returns -1 if argument is zero.
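 *
 * For example, log2(6) rounds up to 8 and returns 3, while
 * log2(4) returns 2 directly.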
 */
static __inline int
log2(u_int x)
{

	return (fls(x << (1 - powerof2(x))) - 1);
}

int
x86_emulate_cpuid(struct vm *vm, int vcpu_id,
		  uint32_t *eax, uint32_t *ebx, uint32_t *ecx, uint32_t *edx)
{
	const struct xsave_limits *limits;
	uint64_t cr4;
	int error, enable_invpcid, level, width, x2apic_id;
	unsigned int func, regs[4], logical_cpus;
	enum x2apic_state x2apic_state;

	/*
	 * Requests for invalid CPUID levels should map to the highest
	 * available level instead.
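	 * For example, a request in the 0x40000000 hypervisor range
	 * that exceeds CPUID_VM_HIGH is clamped to CPUID_VM_HIGH.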
	 */
	if (cpu_exthigh != 0 && *eax >= 0x80000000) {
		if (*eax > cpu_exthigh)
			*eax = cpu_exthigh;
	} else if (*eax >= 0x40000000) {
		if (*eax > CPUID_VM_HIGH)
			*eax = CPUID_VM_HIGH;
	} else if (*eax > cpu_high) {
		*eax = cpu_high;
	}

	func = *eax;

	/*
	 * In general the approach used for CPU topology is to
	 * advertise a flat topology where each vCPU is its own
	 * package, with no multi-core or SMT.
	 */
	switch (func) {
		/*
		 * Pass these through to the guest
		 */
		case CPUID_0000_0000:
		case CPUID_0000_0002:
		case CPUID_0000_0003:
		case CPUID_8000_0000:
		case CPUID_8000_0002:
		case CPUID_8000_0003:
		case CPUID_8000_0004:
		case CPUID_8000_0006:
		case CPUID_8000_0008:
			cpuid_count(*eax, *ecx, regs);
			break;

		case CPUID_8000_0001:
			/*
			 * Hide rdtscp/ia32_tsc_aux until we know how
			 * to deal with them.
			 */
			cpuid_count(*eax, *ecx, regs);
			regs[3] &= ~AMDID_RDTSCP;
			break;

		case CPUID_8000_0007:
			cpuid_count(*eax, *ecx, regs);
			/*
			 * If the host TSCs are not synchronized across
			 * physical cpus then we cannot advertise an
			 * invariant tsc to a vcpu.
			 *
			 * XXX This still falls short because the vcpu
			 * can observe the TSC moving backwards as it
			 * migrates across physical cpus. But at least
			 * it should discourage the guest from using the
			 * TSC to keep track of time.
			 */
			if (!smp_tsc)
				regs[3] &= ~AMDPM_TSC_INVARIANT;
			break;

		case CPUID_0000_0001:
			do_cpuid(1, regs);

			error = vm_get_x2apic_state(vm, vcpu_id, &x2apic_state);
			if (error) {
				panic("x86_emulate_cpuid: error %d "
				      "fetching x2apic state", error);
			}

			/*
			 * Override the APIC ID only in ebx
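			 * (the initial APIC ID field occupies bits 31:24,
			 * which is what CPUID_0000_0001_APICID_SHIFT selects)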
			 */
			regs[1] &= ~(CPUID_LOCAL_APIC_ID);
			regs[1] |= (vcpu_id << CPUID_0000_0001_APICID_SHIFT);

			/*
			 * Don't expose VMX, SpeedStep or TM2 capability.
			 * Advertise x2APIC capability and Hypervisor guest.
			 */
			regs[2] &= ~(CPUID2_VMX | CPUID2_EST | CPUID2_TM2);

			regs[2] |= CPUID2_HV;

			if (x2apic_state != X2APIC_DISABLED)
				regs[2] |= CPUID2_X2APIC;
			else
				regs[2] &= ~CPUID2_X2APIC;

			/*
			 * Only advertise CPUID2_XSAVE in the guest if
			 * the host is using XSAVE.
			 */
			if (!(regs[2] & CPUID2_OSXSAVE))
				regs[2] &= ~CPUID2_XSAVE;

			/*
			 * If CPUID2_XSAVE is being advertised and the
			 * guest has set CR4_XSAVE, set
			 * CPUID2_OSXSAVE.
			 */
			regs[2] &= ~CPUID2_OSXSAVE;
			if (regs[2] & CPUID2_XSAVE) {
				error = vm_get_register(vm, vcpu_id,
				    VM_REG_GUEST_CR4, &cr4);
				if (error)
					panic("x86_emulate_cpuid: error %d "
					      "fetching %%cr4", error);
				if (cr4 & CR4_XSAVE)
					regs[2] |= CPUID2_OSXSAVE;
			}

			/*
			 * Hide monitor/mwait until we know how to deal with
			 * these instructions.
			 */
			regs[2] &= ~CPUID2_MON;

			/*
			 * Hide the performance and debug features.
			 */
			regs[2] &= ~CPUID2_PDCM;

			/*
			 * No TSC deadline support in the APIC yet
			 */
			regs[2] &= ~CPUID2_TSCDLT;

			/*
			 * Hide thermal monitoring
			 */
			regs[3] &= ~(CPUID_ACPI | CPUID_TM);

			/*
			 * Machine check handling is done in the host.
			 * Hide MTRR capability.
			 */
			regs[3] &= ~(CPUID_MCA | CPUID_MCE | CPUID_MTRR);

			/*
			 * Hide the debug store capability.
			 */
			regs[3] &= ~CPUID_DS;

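			/*
			 * Report the number of logical processors in the
			 * package in %ebx[23:16]; the guest only treats
			 * that field as valid when CPUID_HTT is set.
			 */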
			logical_cpus = threads_per_core * cores_per_package;
			regs[1] &= ~CPUID_HTT_CORES;
			regs[1] |= (logical_cpus & 0xff) << 16;
			regs[3] |= CPUID_HTT;
			break;

		case CPUID_0000_0004:
			cpuid_count(*eax, *ecx, regs);

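			/*
			 * Leaf 4 encodes "cores per package - 1" in
			 * %eax[31:26] and "logical processors sharing
			 * this cache - 1" in %eax[25:14]; %eax[7:5] is
			 * the cache level.
			 */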
			if (regs[0] || regs[1] || regs[2] || regs[3]) {
				regs[0] &= 0x3ff;
				regs[0] |= (cores_per_package - 1) << 26;
				/*
				 * Cache topology:
				 * - L1 and L2 are shared only by the logical
				 *   processors in a single core.
				 * - L3 and above are shared by all logical
				 *   processors in the package.
				 */
				logical_cpus = threads_per_core;
				level = (regs[0] >> 5) & 0x7;
				if (level >= 3)
					logical_cpus *= cores_per_package;
				regs[0] |= (logical_cpus - 1) << 14;
			}
			break;

		case CPUID_0000_0007:
			regs[0] = 0;
			regs[1] = 0;
			regs[2] = 0;
			regs[3] = 0;

			/* sub-leaf 0 */
			if (*ecx == 0) {
				cpuid_count(*eax, *ecx, regs);

				/* Only sub-leaf 0 is supported */
				regs[0] = 0;

				/*
				 * Expose known-safe features.
				 */
				regs[1] &= (CPUID_STDEXT_FSGSBASE |
				    CPUID_STDEXT_BMI1 | CPUID_STDEXT_HLE |
				    CPUID_STDEXT_AVX2 | CPUID_STDEXT_BMI2 |
				    CPUID_STDEXT_ERMS | CPUID_STDEXT_RTM |
				    CPUID_STDEXT_AVX512F |
				    CPUID_STDEXT_AVX512PF |
				    CPUID_STDEXT_AVX512ER |
				    CPUID_STDEXT_AVX512CD);
				regs[2] = 0;
				regs[3] = 0;

				/* Advertise INVPCID if it is enabled. */
				error = vm_get_capability(vm, vcpu_id,
				    VM_CAP_ENABLE_INVPCID, &enable_invpcid);
				if (error == 0 && enable_invpcid)
					regs[1] |= CPUID_STDEXT_INVPCID;
			}
			break;

		case CPUID_0000_0006:
		case CPUID_0000_000A:
			/*
			 * Handle the access, but report 0 for
			 * all options
			 */
			regs[0] = 0;
			regs[1] = 0;
			regs[2] = 0;
			regs[3] = 0;
			break;

		case CPUID_0000_000B:
			/*
			 * Processor topology enumeration
			 */
			if (*ecx == 0) {
				logical_cpus = threads_per_core;
				width = log2(logical_cpus);
				level = CPUID_TYPE_SMT;
				x2apic_id = vcpu_id;
			}

			if (*ecx == 1) {
				logical_cpus = threads_per_core *
				    cores_per_package;
				width = log2(logical_cpus);
				level = CPUID_TYPE_CORE;
				x2apic_id = vcpu_id;
			}

			if (!cpuid_leaf_b || *ecx >= 2) {
				width = 0;
				logical_cpus = 0;
				level = 0;
				x2apic_id = 0;
			}

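			/*
			 * %eax[4:0] is the number of x2APIC ID bits to
			 * shift out to reach the next topology level,
			 * %ebx[15:0] the logical processor count at this
			 * level, %ecx[15:8] the level type, %ecx[7:0] the
			 * echoed sub-leaf and %edx the x2APIC ID.
			 */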
			regs[0] = width & 0x1f;
			regs[1] = logical_cpus & 0xffff;
			regs[2] = (level << 8) | (*ecx & 0xff);
			regs[3] = x2apic_id;
			break;

		case CPUID_0000_000D:
			limits = vmm_get_xsave_limits();
			if (!limits->xsave_enabled) {
				regs[0] = 0;
				regs[1] = 0;
				regs[2] = 0;
				regs[3] = 0;
				break;
			}

			cpuid_count(*eax, *ecx, regs);
			switch (*ecx) {
			case 0:
				/*
				 * Only permit the guest to use bits
				 * that are active in the host in
				 * %xcr0.  Also, claim that the
				 * maximum save area size is
				 * equivalent to the host's current
				 * save area size.  Since this runs
				 * "inside" of vmrun(), it runs with
				 * the guest's xcr0, so the current
				 * save area size is correct as-is.
				 */
				regs[0] &= limits->xcr0_allowed;
				regs[2] = limits->xsave_max_size;
				regs[3] &= (limits->xcr0_allowed >> 32);
				break;
			case 1:
				/* Only permit XSAVEOPT. */
				regs[0] &= CPUID_EXTSTATE_XSAVEOPT;
				regs[1] = 0;
				regs[2] = 0;
				regs[3] = 0;
				break;
			default:
				/*
				 * If the sub-leaf is for a permitted feature,
				 * pass through as-is, otherwise return
				 * all zeroes.
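				 *
				 * Sub-leaf n describes the state component
				 * enabled by bit n in %xcr0.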
				 */
				if (!(limits->xcr0_allowed & (1ul << *ecx))) {
					regs[0] = 0;
					regs[1] = 0;
					regs[2] = 0;
					regs[3] = 0;
				}
				break;
			}
			break;

		case 0x40000000:
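			/*
			 * Hypervisor identification leaf: report the
			 * highest hypervisor leaf in %eax and the 12-byte
			 * vendor signature across %ebx:%ecx:%edx.
			 */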
			regs[0] = CPUID_VM_HIGH;
			bcopy(bhyve_id, &regs[1], 4);
			bcopy(bhyve_id + 4, &regs[2], 4);
			bcopy(bhyve_id + 8, &regs[3], 4);
			break;

		default:
			/*
			 * The leaf value has already been clamped so
			 * simply pass this through, keeping count of
			 * how many unhandled leaf values have been seen.
			 */
			atomic_add_long(&bhyve_xcpuids, 1);
			cpuid_count(*eax, *ecx, regs);
			break;
	}

	*eax = regs[0];
	*ebx = regs[1];
	*ecx = regs[2];
	*edx = regs[3];

	return (1);
}
422