xref: /freebsd/sys/amd64/vmm/x86.c (revision 1a61beb0549e05b33df31380e427d90f6e46ff7e)
1 /*-
2  * Copyright (c) 2011 NetApp, Inc.
3  * All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions
7  * are met:
8  * 1. Redistributions of source code must retain the above copyright
9  *    notice, this list of conditions and the following disclaimer.
10  * 2. Redistributions in binary form must reproduce the above copyright
11  *    notice, this list of conditions and the following disclaimer in the
12  *    documentation and/or other materials provided with the distribution.
13  *
14  * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
15  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
17  * ARE DISCLAIMED.  IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
18  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
19  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
20  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
21  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
22  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
23  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
24  * SUCH DAMAGE.
25  *
26  * $FreeBSD$
27  */
28 
29 #include <sys/cdefs.h>
30 __FBSDID("$FreeBSD$");
31 
32 #include <sys/param.h>
33 #include <sys/pcpu.h>
34 #include <sys/systm.h>
35 #include <sys/cpuset.h>
36 
37 #include <machine/clock.h>
38 #include <machine/cpufunc.h>
39 #include <machine/md_var.h>
40 #include <machine/segments.h>
41 #include <machine/specialreg.h>
42 
43 #include <machine/vmm.h>
44 
45 #include "vmm_host.h"
46 #include "x86.h"
47 
48 #define	CPUID_VM_HIGH		0x40000000	/* highest hypervisor-range CPUID leaf; larger requests in that range are clamped to it */
49 
50 static const char bhyve_id[12] = "bhyve bhyve ";	/* 12-byte signature copied into ebx/ecx/edx for leaf 0x40000000; fills the array exactly, no NUL */
51 
52 static uint64_t bhyve_xcpuids;	/* atomically incremented each time a CPUID request reaches the unhandled (default) case */
53 
54 int
55 x86_emulate_cpuid(struct vm *vm, int vcpu_id,
56 		  uint32_t *eax, uint32_t *ebx, uint32_t *ecx, uint32_t *edx)
57 {
58 	const struct xsave_limits *limits;
59 	uint64_t cr4;
60 	int error, enable_invpcid;
61 	unsigned int 	func, regs[4];
62 	enum x2apic_state x2apic_state;
63 
64 	/*
65 	 * Requests for invalid CPUID levels should map to the highest
66 	 * available level instead.
67 	 */
68 	if (cpu_exthigh != 0 && *eax >= 0x80000000) {
69 		if (*eax > cpu_exthigh)
70 			*eax = cpu_exthigh;
71 	} else if (*eax >= 0x40000000) {
72 		if (*eax > CPUID_VM_HIGH)
73 			*eax = CPUID_VM_HIGH;
74 	} else if (*eax > cpu_high) {
75 		*eax = cpu_high;
76 	}
77 
78 	func = *eax;
79 
80 	/*
81 	 * In general the approach used for CPU topology is to
82 	 * advertise a flat topology where all CPUs are packages with
83 	 * no multi-core or SMT.
84 	 */
85 	switch (func) {
86 		/*
87 		 * Pass these through to the guest
88 		 */
89 		case CPUID_0000_0000:
90 		case CPUID_0000_0002:
91 		case CPUID_0000_0003:
92 		case CPUID_8000_0000:
93 		case CPUID_8000_0002:
94 		case CPUID_8000_0003:
95 		case CPUID_8000_0004:
96 		case CPUID_8000_0006:
97 		case CPUID_8000_0008:
98 			cpuid_count(*eax, *ecx, regs);
99 			break;
100 
101 		case CPUID_8000_0001:
102 			/*
103 			 * Hide rdtscp/ia32_tsc_aux until we know how
104 			 * to deal with them.
105 			 */
106 			cpuid_count(*eax, *ecx, regs);
107 			regs[3] &= ~AMDID_RDTSCP;
108 			break;
109 
110 		case CPUID_8000_0007:
111 			cpuid_count(*eax, *ecx, regs);
112 			/*
113 			 * If the host TSCs are not synchronized across
114 			 * physical cpus then we cannot advertise an
115 			 * invariant tsc to a vcpu.
116 			 *
117 			 * XXX This still falls short because the vcpu
118 			 * can observe the TSC moving backwards as it
119 			 * migrates across physical cpus. But at least
120 			 * it should discourage the guest from using the
121 			 * TSC to keep track of time.
122 			 */
123 			if (!smp_tsc)
124 				regs[3] &= ~AMDPM_TSC_INVARIANT;
125 			break;
126 
127 		case CPUID_0000_0001:
128 			do_cpuid(1, regs);
129 
130 			error = vm_get_x2apic_state(vm, vcpu_id, &x2apic_state);
131 			if (error) {
132 				panic("x86_emulate_cpuid: error %d "
133 				      "fetching x2apic state", error);
134 			}
135 
136 			/*
137 			 * Override the APIC ID only in ebx
138 			 */
139 			regs[1] &= ~(CPUID_LOCAL_APIC_ID);
140 			regs[1] |= (vcpu_id << CPUID_0000_0001_APICID_SHIFT);
141 
142 			/*
143 			 * Don't expose VMX, SpeedStep or TME capability.
144 			 * Advertise x2APIC capability and Hypervisor guest.
145 			 */
146 			regs[2] &= ~(CPUID2_VMX | CPUID2_EST | CPUID2_TM2);
147 
148 			regs[2] |= CPUID2_HV;
149 
150 			if (x2apic_state != X2APIC_DISABLED)
151 				regs[2] |= CPUID2_X2APIC;
152 
153 			/*
154 			 * Only advertise CPUID2_XSAVE in the guest if
155 			 * the host is using XSAVE.
156 			 */
157 			if (!(regs[2] & CPUID2_OSXSAVE))
158 				regs[2] &= ~CPUID2_XSAVE;
159 
160 			/*
161 			 * If CPUID2_XSAVE is being advertised and the
162 			 * guest has set CR4_XSAVE, set
163 			 * CPUID2_OSXSAVE.
164 			 */
165 			regs[2] &= ~CPUID2_OSXSAVE;
166 			if (regs[2] & CPUID2_XSAVE) {
167 				error = vm_get_register(vm, vcpu_id,
168 				    VM_REG_GUEST_CR4, &cr4);
169 				if (error)
170 					panic("x86_emulate_cpuid: error %d "
171 					      "fetching %%cr4", error);
172 				if (cr4 & CR4_XSAVE)
173 					regs[2] |= CPUID2_OSXSAVE;
174 			}
175 
176 			/*
177 			 * Hide monitor/mwait until we know how to deal with
178 			 * these instructions.
179 			 */
180 			regs[2] &= ~CPUID2_MON;
181 
182                         /*
183 			 * Hide the performance and debug features.
184 			 */
185 			regs[2] &= ~CPUID2_PDCM;
186 
187 			/*
188 			 * No TSC deadline support in the APIC yet
189 			 */
190 			regs[2] &= ~CPUID2_TSCDLT;
191 
192 			/*
193 			 * Hide thermal monitoring
194 			 */
195 			regs[3] &= ~(CPUID_ACPI | CPUID_TM);
196 
197 			/*
198 			 * Machine check handling is done in the host.
199 			 * Hide MTRR capability.
200 			 */
201 			regs[3] &= ~(CPUID_MCA | CPUID_MCE | CPUID_MTRR);
202 
203                         /*
204                         * Hide the debug store capability.
205                         */
206 			regs[3] &= ~CPUID_DS;
207 
208 			/*
209 			 * Disable multi-core.
210 			 */
211 			regs[1] &= ~CPUID_HTT_CORES;
212 			regs[3] &= ~CPUID_HTT;
213 			break;
214 
215 		case CPUID_0000_0004:
216 			do_cpuid(4, regs);
217 
218 			/*
219 			 * Do not expose topology.
220 			 */
221 			regs[0] &= 0xffff8000;
222 			regs[0] |= 0x04008000;
223 			break;
224 
225 		case CPUID_0000_0007:
226 			regs[0] = 0;
227 			regs[1] = 0;
228 			regs[2] = 0;
229 			regs[3] = 0;
230 
231 			/* leaf 0 */
232 			if (*ecx == 0) {
233 				error = vm_get_capability(vm, vcpu_id,
234 				    VM_CAP_ENABLE_INVPCID, &enable_invpcid);
235 				if (error == 0 && enable_invpcid)
236 					regs[1] |= CPUID_STDEXT_INVPCID;
237 			}
238 			break;
239 
240 		case CPUID_0000_0006:
241 		case CPUID_0000_000A:
242 			/*
243 			 * Handle the access, but report 0 for
244 			 * all options
245 			 */
246 			regs[0] = 0;
247 			regs[1] = 0;
248 			regs[2] = 0;
249 			regs[3] = 0;
250 			break;
251 
252 		case CPUID_0000_000B:
253 			/*
254 			 * Processor topology enumeration
255 			 */
256 			regs[0] = 0;
257 			regs[1] = 0;
258 			regs[2] = *ecx & 0xff;
259 			regs[3] = vcpu_id;
260 			break;
261 
262 		case CPUID_0000_000D:
263 			limits = vmm_get_xsave_limits();
264 			if (!limits->xsave_enabled) {
265 				regs[0] = 0;
266 				regs[1] = 0;
267 				regs[2] = 0;
268 				regs[3] = 0;
269 				break;
270 			}
271 
272 			cpuid_count(*eax, *ecx, regs);
273 			switch (*ecx) {
274 			case 0:
275 				/*
276 				 * Only permit the guest to use bits
277 				 * that are active in the host in
278 				 * %xcr0.  Also, claim that the
279 				 * maximum save area size is
280 				 * equivalent to the host's current
281 				 * save area size.  Since this runs
282 				 * "inside" of vmrun(), it runs with
283 				 * the guest's xcr0, so the current
284 				 * save area size is correct as-is.
285 				 */
286 				regs[0] &= limits->xcr0_allowed;
287 				regs[2] = limits->xsave_max_size;
288 				regs[3] &= (limits->xcr0_allowed >> 32);
289 				break;
290 			case 1:
291 				/* Only permit XSAVEOPT. */
292 				regs[0] &= CPUID_EXTSTATE_XSAVEOPT;
293 				regs[1] = 0;
294 				regs[2] = 0;
295 				regs[3] = 0;
296 				break;
297 			default:
298 				/*
299 				 * If the leaf is for a permitted feature,
300 				 * pass through as-is, otherwise return
301 				 * all zeroes.
302 				 */
303 				if (!(limits->xcr0_allowed & (1ul << *ecx))) {
304 					regs[0] = 0;
305 					regs[1] = 0;
306 					regs[2] = 0;
307 					regs[3] = 0;
308 				}
309 				break;
310 			}
311 			break;
312 
313 		case 0x40000000:
314 			regs[0] = CPUID_VM_HIGH;
315 			bcopy(bhyve_id, &regs[1], 4);
316 			bcopy(bhyve_id + 4, &regs[2], 4);
317 			bcopy(bhyve_id + 8, &regs[3], 4);
318 			break;
319 
320 		default:
321 			/*
322 			 * The leaf value has already been clamped so
323 			 * simply pass this through, keeping count of
324 			 * how many unhandled leaf values have been seen.
325 			 */
326 			atomic_add_long(&bhyve_xcpuids, 1);
327 			cpuid_count(*eax, *ecx, regs);
328 			break;
329 	}
330 
331 	*eax = regs[0];
332 	*ebx = regs[1];
333 	*ecx = regs[2];
334 	*edx = regs[3];
335 
336 	return (1);
337 }
338