xref: /freebsd/sys/amd64/vmm/x86.c (revision 4313cc83440a39bdf976f955b1d4d3f3c4d1552f)
1 /*-
2  * Copyright (c) 2011 NetApp, Inc.
3  * All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions
7  * are met:
8  * 1. Redistributions of source code must retain the above copyright
9  *    notice, this list of conditions and the following disclaimer.
10  * 2. Redistributions in binary form must reproduce the above copyright
11  *    notice, this list of conditions and the following disclaimer in the
12  *    documentation and/or other materials provided with the distribution.
13  *
14  * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
15  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
17  * ARE DISCLAIMED.  IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
18  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
19  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
20  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
21  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
22  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
23  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
24  * SUCH DAMAGE.
25  *
26  * $FreeBSD$
27  */
28 
29 #include <sys/cdefs.h>
30 __FBSDID("$FreeBSD$");
31 
32 #include <sys/param.h>
33 #include <sys/pcpu.h>
34 #include <sys/systm.h>
35 #include <sys/cpuset.h>
36 
37 #include <machine/clock.h>
38 #include <machine/cpufunc.h>
39 #include <machine/md_var.h>
40 #include <machine/segments.h>
41 #include <machine/specialreg.h>
42 
43 #include <machine/vmm.h>
44 
45 #include "vmm_host.h"
46 #include "x86.h"
47 
/*
 * Highest CPUID leaf reported in the hypervisor range.  Requests at or
 * above 0x40000000 (and below the extended range) are clamped to this
 * value, and it is also what leaf 0x40000000 returns in %eax.
 */
#define	CPUID_VM_HIGH		0x40000000

/*
 * Running count of CPUID requests that were not handled by a dedicated
 * case in x86_emulate_cpuid() and were passed straight through.
 */
static uint64_t bhyve_xcpuids;

/*
 * Signature copied, four bytes at a time, into %ebx, %ecx and %edx for
 * leaf 0x40000000.  The string is exactly twelve characters long, so the
 * array holds no terminating NUL.
 */
static const char bhyve_id[12] = "bhyve bhyve ";
53 
54 int
55 x86_emulate_cpuid(struct vm *vm, int vcpu_id,
56 		  uint32_t *eax, uint32_t *ebx, uint32_t *ecx, uint32_t *edx)
57 {
58 	const struct xsave_limits *limits;
59 	uint64_t cr4;
60 	int error, enable_invpcid;
61 	unsigned int 	func, regs[4];
62 	enum x2apic_state x2apic_state;
63 
64 	/*
65 	 * Requests for invalid CPUID levels should map to the highest
66 	 * available level instead.
67 	 */
68 	if (cpu_exthigh != 0 && *eax >= 0x80000000) {
69 		if (*eax > cpu_exthigh)
70 			*eax = cpu_exthigh;
71 	} else if (*eax >= 0x40000000) {
72 		if (*eax > CPUID_VM_HIGH)
73 			*eax = CPUID_VM_HIGH;
74 	} else if (*eax > cpu_high) {
75 		*eax = cpu_high;
76 	}
77 
78 	func = *eax;
79 
80 	/*
81 	 * In general the approach used for CPU topology is to
82 	 * advertise a flat topology where all CPUs are packages with
83 	 * no multi-core or SMT.
84 	 */
85 	switch (func) {
86 		/*
87 		 * Pass these through to the guest
88 		 */
89 		case CPUID_0000_0000:
90 		case CPUID_0000_0002:
91 		case CPUID_0000_0003:
92 		case CPUID_8000_0000:
93 		case CPUID_8000_0002:
94 		case CPUID_8000_0003:
95 		case CPUID_8000_0004:
96 		case CPUID_8000_0006:
97 		case CPUID_8000_0008:
98 			cpuid_count(*eax, *ecx, regs);
99 			break;
100 
101 		case CPUID_8000_0001:
102 			/*
103 			 * Hide rdtscp/ia32_tsc_aux until we know how
104 			 * to deal with them.
105 			 */
106 			cpuid_count(*eax, *ecx, regs);
107 			regs[3] &= ~AMDID_RDTSCP;
108 			break;
109 
110 		case CPUID_8000_0007:
111 			cpuid_count(*eax, *ecx, regs);
112 			/*
113 			 * If the host TSCs are not synchronized across
114 			 * physical cpus then we cannot advertise an
115 			 * invariant tsc to a vcpu.
116 			 *
117 			 * XXX This still falls short because the vcpu
118 			 * can observe the TSC moving backwards as it
119 			 * migrates across physical cpus. But at least
120 			 * it should discourage the guest from using the
121 			 * TSC to keep track of time.
122 			 */
123 			if (!smp_tsc)
124 				regs[3] &= ~AMDPM_TSC_INVARIANT;
125 			break;
126 
127 		case CPUID_0000_0001:
128 			do_cpuid(1, regs);
129 
130 			error = vm_get_x2apic_state(vm, vcpu_id, &x2apic_state);
131 			if (error) {
132 				panic("x86_emulate_cpuid: error %d "
133 				      "fetching x2apic state", error);
134 			}
135 
136 			/*
137 			 * Override the APIC ID only in ebx
138 			 */
139 			regs[1] &= ~(CPUID_LOCAL_APIC_ID);
140 			regs[1] |= (vcpu_id << CPUID_0000_0001_APICID_SHIFT);
141 
142 			/*
143 			 * Don't expose VMX, SpeedStep or TME capability.
144 			 * Advertise x2APIC capability and Hypervisor guest.
145 			 */
146 			regs[2] &= ~(CPUID2_VMX | CPUID2_EST | CPUID2_TM2);
147 
148 			regs[2] |= CPUID2_HV;
149 
150 			if (x2apic_state != X2APIC_DISABLED)
151 				regs[2] |= CPUID2_X2APIC;
152 			else
153 				regs[2] &= ~CPUID2_X2APIC;
154 
155 			/*
156 			 * Only advertise CPUID2_XSAVE in the guest if
157 			 * the host is using XSAVE.
158 			 */
159 			if (!(regs[2] & CPUID2_OSXSAVE))
160 				regs[2] &= ~CPUID2_XSAVE;
161 
162 			/*
163 			 * If CPUID2_XSAVE is being advertised and the
164 			 * guest has set CR4_XSAVE, set
165 			 * CPUID2_OSXSAVE.
166 			 */
167 			regs[2] &= ~CPUID2_OSXSAVE;
168 			if (regs[2] & CPUID2_XSAVE) {
169 				error = vm_get_register(vm, vcpu_id,
170 				    VM_REG_GUEST_CR4, &cr4);
171 				if (error)
172 					panic("x86_emulate_cpuid: error %d "
173 					      "fetching %%cr4", error);
174 				if (cr4 & CR4_XSAVE)
175 					regs[2] |= CPUID2_OSXSAVE;
176 			}
177 
178 			/*
179 			 * Hide monitor/mwait until we know how to deal with
180 			 * these instructions.
181 			 */
182 			regs[2] &= ~CPUID2_MON;
183 
184                         /*
185 			 * Hide the performance and debug features.
186 			 */
187 			regs[2] &= ~CPUID2_PDCM;
188 
189 			/*
190 			 * No TSC deadline support in the APIC yet
191 			 */
192 			regs[2] &= ~CPUID2_TSCDLT;
193 
194 			/*
195 			 * Hide thermal monitoring
196 			 */
197 			regs[3] &= ~(CPUID_ACPI | CPUID_TM);
198 
199 			/*
200 			 * Machine check handling is done in the host.
201 			 * Hide MTRR capability.
202 			 */
203 			regs[3] &= ~(CPUID_MCA | CPUID_MCE | CPUID_MTRR);
204 
205                         /*
206                         * Hide the debug store capability.
207                         */
208 			regs[3] &= ~CPUID_DS;
209 
210 			/*
211 			 * Disable multi-core.
212 			 */
213 			regs[1] &= ~CPUID_HTT_CORES;
214 			regs[3] &= ~CPUID_HTT;
215 			break;
216 
217 		case CPUID_0000_0004:
218 			do_cpuid(4, regs);
219 
220 			/*
221 			 * Do not expose topology.
222 			 *
223 			 * The maximum number of processor cores in
224 			 * this physical processor package and the
225 			 * maximum number of threads sharing this
226 			 * cache are encoded with "plus 1" encoding.
227 			 * Adding one to the value in this register
228 			 * field to obtains the actual value.
229 			 *
230 			 * Therefore 0 for both indicates 1 core per
231 			 * package and no cache sharing.
232 			 */
233 			regs[0] &= 0xffff8000;
234 			break;
235 
236 		case CPUID_0000_0007:
237 			regs[0] = 0;
238 			regs[1] = 0;
239 			regs[2] = 0;
240 			regs[3] = 0;
241 
242 			/* leaf 0 */
243 			if (*ecx == 0) {
244 				error = vm_get_capability(vm, vcpu_id,
245 				    VM_CAP_ENABLE_INVPCID, &enable_invpcid);
246 				if (error == 0 && enable_invpcid)
247 					regs[1] |= CPUID_STDEXT_INVPCID;
248 			}
249 			break;
250 
251 		case CPUID_0000_0006:
252 		case CPUID_0000_000A:
253 			/*
254 			 * Handle the access, but report 0 for
255 			 * all options
256 			 */
257 			regs[0] = 0;
258 			regs[1] = 0;
259 			regs[2] = 0;
260 			regs[3] = 0;
261 			break;
262 
263 		case CPUID_0000_000B:
264 			/*
265 			 * Processor topology enumeration
266 			 */
267 			regs[0] = 0;
268 			regs[1] = 0;
269 			regs[2] = *ecx & 0xff;
270 			regs[3] = vcpu_id;
271 			break;
272 
273 		case CPUID_0000_000D:
274 			limits = vmm_get_xsave_limits();
275 			if (!limits->xsave_enabled) {
276 				regs[0] = 0;
277 				regs[1] = 0;
278 				regs[2] = 0;
279 				regs[3] = 0;
280 				break;
281 			}
282 
283 			cpuid_count(*eax, *ecx, regs);
284 			switch (*ecx) {
285 			case 0:
286 				/*
287 				 * Only permit the guest to use bits
288 				 * that are active in the host in
289 				 * %xcr0.  Also, claim that the
290 				 * maximum save area size is
291 				 * equivalent to the host's current
292 				 * save area size.  Since this runs
293 				 * "inside" of vmrun(), it runs with
294 				 * the guest's xcr0, so the current
295 				 * save area size is correct as-is.
296 				 */
297 				regs[0] &= limits->xcr0_allowed;
298 				regs[2] = limits->xsave_max_size;
299 				regs[3] &= (limits->xcr0_allowed >> 32);
300 				break;
301 			case 1:
302 				/* Only permit XSAVEOPT. */
303 				regs[0] &= CPUID_EXTSTATE_XSAVEOPT;
304 				regs[1] = 0;
305 				regs[2] = 0;
306 				regs[3] = 0;
307 				break;
308 			default:
309 				/*
310 				 * If the leaf is for a permitted feature,
311 				 * pass through as-is, otherwise return
312 				 * all zeroes.
313 				 */
314 				if (!(limits->xcr0_allowed & (1ul << *ecx))) {
315 					regs[0] = 0;
316 					regs[1] = 0;
317 					regs[2] = 0;
318 					regs[3] = 0;
319 				}
320 				break;
321 			}
322 			break;
323 
324 		case 0x40000000:
325 			regs[0] = CPUID_VM_HIGH;
326 			bcopy(bhyve_id, &regs[1], 4);
327 			bcopy(bhyve_id + 4, &regs[2], 4);
328 			bcopy(bhyve_id + 8, &regs[3], 4);
329 			break;
330 
331 		default:
332 			/*
333 			 * The leaf value has already been clamped so
334 			 * simply pass this through, keeping count of
335 			 * how many unhandled leaf values have been seen.
336 			 */
337 			atomic_add_long(&bhyve_xcpuids, 1);
338 			cpuid_count(*eax, *ecx, regs);
339 			break;
340 	}
341 
342 	*eax = regs[0];
343 	*ebx = regs[1];
344 	*ecx = regs[2];
345 	*edx = regs[3];
346 
347 	return (1);
348 }
349