xref: /freebsd/sys/amd64/vmm/x86.c (revision f5fd950e35c962bad0aa31fdc4b4052e13207893)
1 /*-
2  * Copyright (c) 2011 NetApp, Inc.
3  * All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions
7  * are met:
8  * 1. Redistributions of source code must retain the above copyright
9  *    notice, this list of conditions and the following disclaimer.
10  * 2. Redistributions in binary form must reproduce the above copyright
11  *    notice, this list of conditions and the following disclaimer in the
12  *    documentation and/or other materials provided with the distribution.
13  *
14  * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
15  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
17  * ARE DISCLAIMED.  IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
18  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
19  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
20  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
21  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
22  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
23  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
24  * SUCH DAMAGE.
25  *
26  * $FreeBSD$
27  */
28 
29 #include <sys/cdefs.h>
30 __FBSDID("$FreeBSD$");
31 
32 #include <sys/param.h>
33 #include <sys/pcpu.h>
34 #include <sys/systm.h>
35 #include <sys/cpuset.h>
36 
37 #include <machine/clock.h>
38 #include <machine/cpufunc.h>
39 #include <machine/md_var.h>
40 #include <machine/segments.h>
41 #include <machine/specialreg.h>
42 
43 #include <machine/vmm.h>
44 
45 #include "vmm_host.h"
46 #include "x86.h"
47 
/* Highest hypervisor CPUID leaf advertised to the guest. */
#define	CPUID_VM_HIGH		0x40000000

/*
 * Hypervisor vendor signature returned in ebx/ecx/edx of CPUID leaf
 * 0x40000000.  Deliberately exactly 12 bytes with no NUL terminator:
 * it is never used as a C string, only bcopy'd out in three 4-byte
 * chunks into the guest's registers.
 */
static const char bhyve_id[12] = "bhyve bhyve ";

/*
 * Count of CPUID leaves that were not explicitly handled and fell
 * through to the default pass-through case (diagnostic counter).
 */
static uint64_t bhyve_xcpuids;
53 
/*
 * Emulate the CPUID instruction on behalf of guest vcpu 'vcpu_id'.
 *
 * On entry *eax/*ecx hold the requested leaf/sub-leaf; on return all
 * four register pointers are overwritten with the values the guest
 * should observe.  Host CPUID results are filtered so that features
 * the hypervisor cannot (or does not want to) virtualize are hidden,
 * and a flat CPU topology (all packages, no SMT/multi-core) is
 * advertised.
 *
 * Always returns 1, i.e. the leaf was handled.
 */
int
x86_emulate_cpuid(struct vm *vm, int vcpu_id,
		  uint32_t *eax, uint32_t *ebx, uint32_t *ecx, uint32_t *edx)
{
	const struct xsave_limits *limits;
	uint64_t cr4;
	int error, enable_invpcid;
	unsigned int 	func, regs[4];
	enum x2apic_state x2apic_state;

	/*
	 * Requests for invalid CPUID levels should map to the highest
	 * available level instead, mirroring real hardware behavior.
	 * The three ranges are clamped independently: extended leaves
	 * (0x80000000+), hypervisor leaves (0x40000000+) and standard
	 * leaves.
	 */
	if (cpu_exthigh != 0 && *eax >= 0x80000000) {
		if (*eax > cpu_exthigh)
			*eax = cpu_exthigh;
	} else if (*eax >= 0x40000000) {
		if (*eax > CPUID_VM_HIGH)
			*eax = CPUID_VM_HIGH;
	} else if (*eax > cpu_high) {
		*eax = cpu_high;
	}

	func = *eax;

	/*
	 * In general the approach used for CPU topology is to
	 * advertise a flat topology where all CPUs are packages with
	 * no multi-core or SMT.
	 */
	switch (func) {
		/*
		 * Pass these leaves through to the guest unmodified;
		 * they carry no information that needs to be hidden.
		 */
		case CPUID_0000_0000:
		case CPUID_0000_0002:
		case CPUID_0000_0003:
		case CPUID_8000_0000:
		case CPUID_8000_0002:
		case CPUID_8000_0003:
		case CPUID_8000_0004:
		case CPUID_8000_0006:
		case CPUID_8000_0008:
			cpuid_count(*eax, *ecx, regs);
			break;

		case CPUID_8000_0001:
			/*
			 * Hide rdtscp/ia32_tsc_aux until we know how
			 * to deal with them.
			 */
			cpuid_count(*eax, *ecx, regs);
			regs[3] &= ~AMDID_RDTSCP;
			break;

		case CPUID_8000_0007:
			cpuid_count(*eax, *ecx, regs);
			/*
			 * If the host TSCs are not synchronized across
			 * physical cpus then we cannot advertise an
			 * invariant tsc to a vcpu.
			 *
			 * XXX This still falls short because the vcpu
			 * can observe the TSC moving backwards as it
			 * migrates across physical cpus. But at least
			 * it should discourage the guest from using the
			 * TSC to keep track of time.
			 */
			if (!smp_tsc)
				regs[3] &= ~AMDPM_TSC_INVARIANT;
			break;

		case CPUID_0000_0001:
			do_cpuid(1, regs);

			error = vm_get_x2apic_state(vm, vcpu_id, &x2apic_state);
			if (error) {
				panic("x86_emulate_cpuid: error %d "
				      "fetching x2apic state", error);
			}

			/*
			 * Override the APIC ID only in ebx
			 */
			regs[1] &= ~(CPUID_LOCAL_APIC_ID);
			regs[1] |= (vcpu_id << CPUID_0000_0001_APICID_SHIFT);

			/*
			 * Don't expose VMX, SpeedStep or TME capability.
			 * Advertise x2APIC capability and Hypervisor guest.
			 */
			regs[2] &= ~(CPUID2_VMX | CPUID2_EST | CPUID2_TM2);

			regs[2] |= CPUID2_HV;

			/*
			 * Reflect the vcpu's configured x2APIC mode so
			 * the guest only sees x2APIC when it is usable.
			 */
			if (x2apic_state != X2APIC_DISABLED)
				regs[2] |= CPUID2_X2APIC;
			else
				regs[2] &= ~CPUID2_X2APIC;

			/*
			 * Only advertise CPUID2_XSAVE in the guest if
			 * the host is using XSAVE.  (The host's OSXSAVE
			 * bit is used here as a proxy for host XSAVE
			 * usage since this leaf was read on the host.)
			 */
			if (!(regs[2] & CPUID2_OSXSAVE))
				regs[2] &= ~CPUID2_XSAVE;

			/*
			 * If CPUID2_XSAVE is being advertised and the
			 * guest has set CR4_XSAVE, set
			 * CPUID2_OSXSAVE.  OSXSAVE reflects the guest's
			 * own CR4, not the host's.
			 */
			regs[2] &= ~CPUID2_OSXSAVE;
			if (regs[2] & CPUID2_XSAVE) {
				error = vm_get_register(vm, vcpu_id,
				    VM_REG_GUEST_CR4, &cr4);
				if (error)
					panic("x86_emulate_cpuid: error %d "
					      "fetching %%cr4", error);
				if (cr4 & CR4_XSAVE)
					regs[2] |= CPUID2_OSXSAVE;
			}

			/*
			 * Hide monitor/mwait until we know how to deal with
			 * these instructions.
			 */
			regs[2] &= ~CPUID2_MON;

			/*
			 * Hide the performance and debug features.
			 */
			regs[2] &= ~CPUID2_PDCM;

			/*
			 * No TSC deadline support in the APIC yet
			 */
			regs[2] &= ~CPUID2_TSCDLT;

			/*
			 * Hide thermal monitoring
			 */
			regs[3] &= ~(CPUID_ACPI | CPUID_TM);

			/*
			 * Machine check handling is done in the host.
			 * Hide MTRR capability.
			 */
			regs[3] &= ~(CPUID_MCA | CPUID_MCE | CPUID_MTRR);

			/*
			 * Hide the debug store capability.
			 */
			regs[3] &= ~CPUID_DS;

			/*
			 * Disable multi-core.
			 */
			regs[1] &= ~CPUID_HTT_CORES;
			regs[3] &= ~CPUID_HTT;
			break;

		case CPUID_0000_0004:
			do_cpuid(4, regs);

			/*
			 * Do not expose topology.  Keep the cache
			 * description bits (eax[4:0] etc. pass through
			 * below bit 15) but force the core/thread
			 * counts in eax[31:14] to single-core values.
			 */
			regs[0] &= 0xffff8000;
			regs[0] |= 0x04008000;
			break;

		case CPUID_0000_0007:
			/* Structured extended features: default to none. */
			regs[0] = 0;
			regs[1] = 0;
			regs[2] = 0;
			regs[3] = 0;

			/* leaf 0: advertise INVPCID only if the capability
			 * is enabled for this vcpu. */
			if (*ecx == 0) {
				error = vm_get_capability(vm, vcpu_id,
				    VM_CAP_ENABLE_INVPCID, &enable_invpcid);
				if (error == 0 && enable_invpcid)
					regs[1] |= CPUID_STDEXT_INVPCID;
			}
			break;

		case CPUID_0000_0006:
		case CPUID_0000_000A:
			/*
			 * Handle the access, but report 0 for
			 * all options (no thermal/power management or
			 * architectural PMU features exposed).
			 */
			regs[0] = 0;
			regs[1] = 0;
			regs[2] = 0;
			regs[3] = 0;
			break;

		case CPUID_0000_000B:
			/*
			 * Processor topology enumeration: report an
			 * empty level with this vcpu's x2APIC ID in edx
			 * and echo the requested level in ecx[7:0].
			 */
			regs[0] = 0;
			regs[1] = 0;
			regs[2] = *ecx & 0xff;
			regs[3] = vcpu_id;
			break;

		case CPUID_0000_000D:
			/* XSAVE enumeration: all zeroes if the host is
			 * not using XSAVE at all. */
			limits = vmm_get_xsave_limits();
			if (!limits->xsave_enabled) {
				regs[0] = 0;
				regs[1] = 0;
				regs[2] = 0;
				regs[3] = 0;
				break;
			}

			cpuid_count(*eax, *ecx, regs);
			switch (*ecx) {
			case 0:
				/*
				 * Only permit the guest to use bits
				 * that are active in the host in
				 * %xcr0.  Also, claim that the
				 * maximum save area size is
				 * equivalent to the host's current
				 * save area size.  Since this runs
				 * "inside" of vmrun(), it runs with
				 * the guest's xcr0, so the current
				 * save area size is correct as-is.
				 */
				regs[0] &= limits->xcr0_allowed;
				regs[2] = limits->xsave_max_size;
				regs[3] &= (limits->xcr0_allowed >> 32);
				break;
			case 1:
				/* Only permit XSAVEOPT. */
				regs[0] &= CPUID_EXTSTATE_XSAVEOPT;
				regs[1] = 0;
				regs[2] = 0;
				regs[3] = 0;
				break;
			default:
				/*
				 * If the leaf is for a permitted feature,
				 * pass through as-is, otherwise return
				 * all zeroes.
				 */
				if (!(limits->xcr0_allowed & (1ul << *ecx))) {
					regs[0] = 0;
					regs[1] = 0;
					regs[2] = 0;
					regs[3] = 0;
				}
				break;
			}
			break;

		case 0x40000000:
			/*
			 * Hypervisor identification leaf: highest
			 * hypervisor leaf in eax, 12-byte vendor
			 * signature split across ebx/ecx/edx.
			 */
			regs[0] = CPUID_VM_HIGH;
			bcopy(bhyve_id, &regs[1], 4);
			bcopy(bhyve_id + 4, &regs[2], 4);
			bcopy(bhyve_id + 8, &regs[3], 4);
			break;

		default:
			/*
			 * The leaf value has already been clamped so
			 * simply pass this through, keeping count of
			 * how many unhandled leaf values have been seen.
			 */
			atomic_add_long(&bhyve_xcpuids, 1);
			cpuid_count(*eax, *ecx, regs);
			break;
	}

	/* Publish the filtered results to the guest's registers. */
	*eax = regs[0];
	*ebx = regs[1];
	*ecx = regs[2];
	*edx = regs[3];

	/* The emulation always handles the leaf. */
	return (1);
}
340