/* xref: /freebsd/sys/amd64/vmm/x86.c (revision 95d45410b5100e07f6f98450bcd841a8945d4726) */
/*-
 * Copyright (c) 2011 NetApp, Inc.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * $FreeBSD$
 */
28 
29 #include <sys/cdefs.h>
30 __FBSDID("$FreeBSD$");
31 
32 #include <sys/param.h>
33 #include <sys/pcpu.h>
34 #include <sys/systm.h>
35 #include <sys/cpuset.h>
36 
37 #include <machine/clock.h>
38 #include <machine/cpufunc.h>
39 #include <machine/md_var.h>
40 #include <machine/segments.h>
41 #include <machine/specialreg.h>
42 
43 #include <machine/vmm.h>
44 
45 #include "vmm_host.h"
46 #include "x86.h"
47 
48 #define	CPUID_VM_HIGH		0x40000000
49 
50 static const char bhyve_id[12] = "bhyve bhyve ";
51 
52 static uint64_t bhyve_xcpuids;
53 
54 int
55 x86_emulate_cpuid(struct vm *vm, int vcpu_id,
56 		  uint32_t *eax, uint32_t *ebx, uint32_t *ecx, uint32_t *edx)
57 {
58 	const struct xsave_limits *limits;
59 	uint64_t cr4;
60 	int error, enable_invpcid;
61 	unsigned int 	func, regs[4];
62 	enum x2apic_state x2apic_state;
63 
64 	/*
65 	 * Requests for invalid CPUID levels should map to the highest
66 	 * available level instead.
67 	 */
68 	if (cpu_exthigh != 0 && *eax >= 0x80000000) {
69 		if (*eax > cpu_exthigh)
70 			*eax = cpu_exthigh;
71 	} else if (*eax >= 0x40000000) {
72 		if (*eax > CPUID_VM_HIGH)
73 			*eax = CPUID_VM_HIGH;
74 	} else if (*eax > cpu_high) {
75 		*eax = cpu_high;
76 	}
77 
78 	func = *eax;
79 
80 	/*
81 	 * In general the approach used for CPU topology is to
82 	 * advertise a flat topology where all CPUs are packages with
83 	 * no multi-core or SMT.
84 	 */
85 	switch (func) {
86 		/*
87 		 * Pass these through to the guest
88 		 */
89 		case CPUID_0000_0000:
90 		case CPUID_0000_0002:
91 		case CPUID_0000_0003:
92 		case CPUID_8000_0000:
93 		case CPUID_8000_0002:
94 		case CPUID_8000_0003:
95 		case CPUID_8000_0004:
96 		case CPUID_8000_0006:
97 		case CPUID_8000_0008:
98 			cpuid_count(*eax, *ecx, regs);
99 			break;
100 
101 		case CPUID_8000_0001:
102 			/*
103 			 * Hide rdtscp/ia32_tsc_aux until we know how
104 			 * to deal with them.
105 			 */
106 			cpuid_count(*eax, *ecx, regs);
107 			regs[3] &= ~AMDID_RDTSCP;
108 			break;
109 
110 		case CPUID_8000_0007:
111 			cpuid_count(*eax, *ecx, regs);
112 			/*
113 			 * If the host TSCs are not synchronized across
114 			 * physical cpus then we cannot advertise an
115 			 * invariant tsc to a vcpu.
116 			 *
117 			 * XXX This still falls short because the vcpu
118 			 * can observe the TSC moving backwards as it
119 			 * migrates across physical cpus. But at least
120 			 * it should discourage the guest from using the
121 			 * TSC to keep track of time.
122 			 */
123 			if (!smp_tsc)
124 				regs[3] &= ~AMDPM_TSC_INVARIANT;
125 			break;
126 
127 		case CPUID_0000_0001:
128 			do_cpuid(1, regs);
129 
130 			error = vm_get_x2apic_state(vm, vcpu_id, &x2apic_state);
131 			if (error) {
132 				panic("x86_emulate_cpuid: error %d "
133 				      "fetching x2apic state", error);
134 			}
135 
136 			/*
137 			 * Override the APIC ID only in ebx
138 			 */
139 			regs[1] &= ~(CPUID_LOCAL_APIC_ID);
140 			regs[1] |= (vcpu_id << CPUID_0000_0001_APICID_SHIFT);
141 
142 			/*
143 			 * Don't expose VMX, SpeedStep or TME capability.
144 			 * Advertise x2APIC capability and Hypervisor guest.
145 			 */
146 			regs[2] &= ~(CPUID2_VMX | CPUID2_EST | CPUID2_TM2);
147 
148 			regs[2] |= CPUID2_HV;
149 
150 			if (x2apic_state != X2APIC_DISABLED)
151 				regs[2] |= CPUID2_X2APIC;
152 			else
153 				regs[2] &= ~CPUID2_X2APIC;
154 
155 			/*
156 			 * Only advertise CPUID2_XSAVE in the guest if
157 			 * the host is using XSAVE.
158 			 */
159 			if (!(regs[2] & CPUID2_OSXSAVE))
160 				regs[2] &= ~CPUID2_XSAVE;
161 
162 			/*
163 			 * If CPUID2_XSAVE is being advertised and the
164 			 * guest has set CR4_XSAVE, set
165 			 * CPUID2_OSXSAVE.
166 			 */
167 			regs[2] &= ~CPUID2_OSXSAVE;
168 			if (regs[2] & CPUID2_XSAVE) {
169 				error = vm_get_register(vm, vcpu_id,
170 				    VM_REG_GUEST_CR4, &cr4);
171 				if (error)
172 					panic("x86_emulate_cpuid: error %d "
173 					      "fetching %%cr4", error);
174 				if (cr4 & CR4_XSAVE)
175 					regs[2] |= CPUID2_OSXSAVE;
176 			}
177 
178 			/*
179 			 * Hide monitor/mwait until we know how to deal with
180 			 * these instructions.
181 			 */
182 			regs[2] &= ~CPUID2_MON;
183 
184                         /*
185 			 * Hide the performance and debug features.
186 			 */
187 			regs[2] &= ~CPUID2_PDCM;
188 
189 			/*
190 			 * No TSC deadline support in the APIC yet
191 			 */
192 			regs[2] &= ~CPUID2_TSCDLT;
193 
194 			/*
195 			 * Hide thermal monitoring
196 			 */
197 			regs[3] &= ~(CPUID_ACPI | CPUID_TM);
198 
199 			/*
200 			 * Machine check handling is done in the host.
201 			 * Hide MTRR capability.
202 			 */
203 			regs[3] &= ~(CPUID_MCA | CPUID_MCE | CPUID_MTRR);
204 
205                         /*
206                         * Hide the debug store capability.
207                         */
208 			regs[3] &= ~CPUID_DS;
209 
210 			/*
211 			 * Disable multi-core.
212 			 */
213 			regs[1] &= ~CPUID_HTT_CORES;
214 			regs[3] &= ~CPUID_HTT;
215 			break;
216 
217 		case CPUID_0000_0004:
218 			do_cpuid(4, regs);
219 
220 			/*
221 			 * Do not expose topology.
222 			 *
223 			 * The maximum number of processor cores in
224 			 * this physical processor package and the
225 			 * maximum number of threads sharing this
226 			 * cache are encoded with "plus 1" encoding.
227 			 * Adding one to the value in this register
228 			 * field to obtains the actual value.
229 			 *
230 			 * Therefore 0 for both indicates 1 core per
231 			 * package and no cache sharing.
232 			 */
233 			regs[0] &= 0xffff8000;
234 			break;
235 
236 		case CPUID_0000_0007:
237 			regs[0] = 0;
238 			regs[1] = 0;
239 			regs[2] = 0;
240 			regs[3] = 0;
241 
242 			/* leaf 0 */
243 			if (*ecx == 0) {
244 				cpuid_count(*eax, *ecx, regs);
245 
246 				/* Only leaf 0 is supported */
247 				regs[0] = 0;
248 
249 				/*
250 				 * Expose known-safe features.
251 				 */
252 				regs[1] &= (CPUID_STDEXT_FSGSBASE |
253 				    CPUID_STDEXT_BMI1 | CPUID_STDEXT_HLE |
254 				    CPUID_STDEXT_AVX2 | CPUID_STDEXT_BMI2 |
255 				    CPUID_STDEXT_ERMS | CPUID_STDEXT_RTM |
256 				    CPUID_STDEXT_AVX512F |
257 				    CPUID_STDEXT_AVX512PF |
258 				    CPUID_STDEXT_AVX512ER |
259 				    CPUID_STDEXT_AVX512CD);
260 				regs[2] = 0;
261 				regs[3] = 0;
262 
263 				/* Advertise INVPCID if it is enabled. */
264 				error = vm_get_capability(vm, vcpu_id,
265 				    VM_CAP_ENABLE_INVPCID, &enable_invpcid);
266 				if (error == 0 && enable_invpcid)
267 					regs[1] |= CPUID_STDEXT_INVPCID;
268 			}
269 			break;
270 
271 		case CPUID_0000_0006:
272 		case CPUID_0000_000A:
273 			/*
274 			 * Handle the access, but report 0 for
275 			 * all options
276 			 */
277 			regs[0] = 0;
278 			regs[1] = 0;
279 			regs[2] = 0;
280 			regs[3] = 0;
281 			break;
282 
283 		case CPUID_0000_000B:
284 			/*
285 			 * Processor topology enumeration
286 			 */
287 			regs[0] = 0;
288 			regs[1] = 0;
289 			regs[2] = *ecx & 0xff;
290 			regs[3] = vcpu_id;
291 			break;
292 
293 		case CPUID_0000_000D:
294 			limits = vmm_get_xsave_limits();
295 			if (!limits->xsave_enabled) {
296 				regs[0] = 0;
297 				regs[1] = 0;
298 				regs[2] = 0;
299 				regs[3] = 0;
300 				break;
301 			}
302 
303 			cpuid_count(*eax, *ecx, regs);
304 			switch (*ecx) {
305 			case 0:
306 				/*
307 				 * Only permit the guest to use bits
308 				 * that are active in the host in
309 				 * %xcr0.  Also, claim that the
310 				 * maximum save area size is
311 				 * equivalent to the host's current
312 				 * save area size.  Since this runs
313 				 * "inside" of vmrun(), it runs with
314 				 * the guest's xcr0, so the current
315 				 * save area size is correct as-is.
316 				 */
317 				regs[0] &= limits->xcr0_allowed;
318 				regs[2] = limits->xsave_max_size;
319 				regs[3] &= (limits->xcr0_allowed >> 32);
320 				break;
321 			case 1:
322 				/* Only permit XSAVEOPT. */
323 				regs[0] &= CPUID_EXTSTATE_XSAVEOPT;
324 				regs[1] = 0;
325 				regs[2] = 0;
326 				regs[3] = 0;
327 				break;
328 			default:
329 				/*
330 				 * If the leaf is for a permitted feature,
331 				 * pass through as-is, otherwise return
332 				 * all zeroes.
333 				 */
334 				if (!(limits->xcr0_allowed & (1ul << *ecx))) {
335 					regs[0] = 0;
336 					regs[1] = 0;
337 					regs[2] = 0;
338 					regs[3] = 0;
339 				}
340 				break;
341 			}
342 			break;
343 
344 		case 0x40000000:
345 			regs[0] = CPUID_VM_HIGH;
346 			bcopy(bhyve_id, &regs[1], 4);
347 			bcopy(bhyve_id + 4, &regs[2], 4);
348 			bcopy(bhyve_id + 8, &regs[3], 4);
349 			break;
350 
351 		default:
352 			/*
353 			 * The leaf value has already been clamped so
354 			 * simply pass this through, keeping count of
355 			 * how many unhandled leaf values have been seen.
356 			 */
357 			atomic_add_long(&bhyve_xcpuids, 1);
358 			cpuid_count(*eax, *ecx, regs);
359 			break;
360 	}
361 
362 	*eax = regs[0];
363 	*ebx = regs[1];
364 	*ecx = regs[2];
365 	*edx = regs[3];
366 
367 	return (1);
368 }
369