xref: /freebsd/sys/amd64/vmm/intel/vmx_msr.c (revision 1da7f3f6f72b2245e458fc7195733268ae4a1136)
1 /*-
2  * SPDX-License-Identifier: BSD-2-Clause
3  *
4  * Copyright (c) 2011 NetApp, Inc.
5  * All rights reserved.
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  * 1. Redistributions of source code must retain the above copyright
11  *    notice, this list of conditions and the following disclaimer.
12  * 2. Redistributions in binary form must reproduce the above copyright
13  *    notice, this list of conditions and the following disclaimer in the
14  *    documentation and/or other materials provided with the distribution.
15  *
16  * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
17  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19  * ARE DISCLAIMED.  IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
20  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
22  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
23  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
24  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
25  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
26  * SUCH DAMAGE.
27  */
28 
29 #include <sys/param.h>
30 #include <sys/systm.h>
31 #include <sys/proc.h>
32 
33 #include <machine/clock.h>
34 #include <machine/cpufunc.h>
35 #include <machine/md_var.h>
36 #include <machine/pcb.h>
37 #include <machine/specialreg.h>
38 #include <machine/vmm.h>
39 
40 #include "vmx.h"
41 #include "vmx_msr.h"
42 #include "x86.h"
43 
/*
 * Test the "allowed-1" flag for control bit 'bitpos' in a VMX capability
 * MSR value.  The flag lives in the upper half of 'msr_val', i.e. at bit
 * position 'bitpos' + 32.  'bitpos' is expected to be in the range 0..31.
 */
static bool
vmx_ctl_allows_one_setting(uint64_t msr_val, int bitpos)
{
	uint64_t flag;

	flag = (msr_val >> (bitpos + 32)) & 1;
	return (flag != 0);
}
50 
/*
 * Test whether control bit 'bitpos' may be cleared according to a VMX
 * capability MSR value.  The bit at position 'bitpos' in the lower half of
 * 'msr_val' must be 0 for the zero setting to be allowed.
 */
static bool
vmx_ctl_allows_zero_setting(uint64_t msr_val, int bitpos)
{
	uint64_t flag;

	flag = (msr_val >> bitpos) & 1;
	return (flag == 0);
}
57 
58 uint32_t
59 vmx_revision(void)
60 {
61 
62 	return (rdmsr(MSR_VMX_BASIC) & 0xffffffff);
63 }
64 
65 /*
66  * Generate a bitmask to be used for the VMCS execution control fields.
67  *
68  * The caller specifies what bits should be set to one in 'ones_mask'
69  * and what bits should be set to zero in 'zeros_mask'. The don't-care
70  * bits are set to the default value. The default values are obtained
71  * based on "Algorithm 3" in Section 27.5.1 "Algorithms for Determining
72  * VMX Capabilities".
73  *
74  * Returns zero on success and non-zero on error.
75  */
76 int
77 vmx_set_ctlreg(int ctl_reg, int true_ctl_reg, uint32_t ones_mask,
78 	       uint32_t zeros_mask, uint32_t *retval)
79 {
80 	int i;
81 	uint64_t val, trueval;
82 	bool true_ctls_avail, one_allowed, zero_allowed;
83 
84 	/* We cannot ask the same bit to be set to both '1' and '0' */
85 	if ((ones_mask ^ zeros_mask) != (ones_mask | zeros_mask))
86 		return (EINVAL);
87 
88 	true_ctls_avail = (rdmsr(MSR_VMX_BASIC) & (1UL << 55)) != 0;
89 
90 	val = rdmsr(ctl_reg);
91 	if (true_ctls_avail)
92 		trueval = rdmsr(true_ctl_reg);		/* step c */
93 	else
94 		trueval = val;				/* step a */
95 
96 	for (i = 0; i < 32; i++) {
97 		one_allowed = vmx_ctl_allows_one_setting(trueval, i);
98 		zero_allowed = vmx_ctl_allows_zero_setting(trueval, i);
99 
100 		KASSERT(one_allowed || zero_allowed,
101 			("invalid zero/one setting for bit %d of ctl 0x%0x, "
102 			 "truectl 0x%0x\n", i, ctl_reg, true_ctl_reg));
103 
104 		if (zero_allowed && !one_allowed) {		/* b(i),c(i) */
105 			if (ones_mask & (1 << i))
106 				return (EINVAL);
107 			*retval &= ~(1 << i);
108 		} else if (one_allowed && !zero_allowed) {	/* b(i),c(i) */
109 			if (zeros_mask & (1 << i))
110 				return (EINVAL);
111 			*retval |= 1 << i;
112 		} else {
113 			if (zeros_mask & (1 << i))	/* b(ii),c(ii) */
114 				*retval &= ~(1 << i);
115 			else if (ones_mask & (1 << i)) /* b(ii), c(ii) */
116 				*retval |= 1 << i;
117 			else if (!true_ctls_avail)
118 				*retval &= ~(1 << i);	/* b(iii) */
119 			else if (vmx_ctl_allows_zero_setting(val, i))/* c(iii)*/
120 				*retval &= ~(1 << i);
121 			else if (vmx_ctl_allows_one_setting(val, i)) /* c(iv) */
122 				*retval |= 1 << i;
123 			else {
124 				panic("vmx_set_ctlreg: unable to determine "
125 				      "correct value of ctl bit %d for msr "
126 				      "0x%0x and true msr 0x%0x", i, ctl_reg,
127 				      true_ctl_reg);
128 			}
129 		}
130 	}
131 
132 	return (0);
133 }
134 
135 void
136 msr_bitmap_initialize(char *bitmap)
137 {
138 
139 	memset(bitmap, 0xff, PAGE_SIZE);
140 }
141 
142 int
143 msr_bitmap_change_access(char *bitmap, u_int msr, int access)
144 {
145 	int byte, bit;
146 
147 	if (msr <= 0x00001FFF)
148 		byte = msr / 8;
149 	else if (msr >= 0xC0000000 && msr <= 0xC0001FFF)
150 		byte = 1024 + (msr - 0xC0000000) / 8;
151 	else
152 		return (EINVAL);
153 
154 	bit = msr & 0x7;
155 
156 	if (access & MSR_BITMAP_ACCESS_READ)
157 		bitmap[byte] &= ~(1 << bit);
158 	else
159 		bitmap[byte] |= 1 << bit;
160 
161 	byte += 2048;
162 	if (access & MSR_BITMAP_ACCESS_WRITE)
163 		bitmap[byte] &= ~(1 << bit);
164 	else
165 		bitmap[byte] |= 1 << bit;
166 
167 	return (0);
168 }
169 
170 static uint64_t misc_enable;
171 static uint64_t platform_info;
172 static uint64_t turbo_ratio_limit;
173 static uint64_t host_msrs[GUEST_MSR_NUM];
174 
175 static bool
176 nehalem_cpu(void)
177 {
178 	u_int family, model;
179 
180 	/*
181 	 * The family:model numbers belonging to the Nehalem microarchitecture
182 	 * are documented in Section 35.5, Intel SDM dated Feb 2014.
183 	 */
184 	family = CPUID_TO_FAMILY(cpu_id);
185 	model = CPUID_TO_MODEL(cpu_id);
186 	if (family == 0x6) {
187 		switch (model) {
188 		case 0x1A:
189 		case 0x1E:
190 		case 0x1F:
191 		case 0x2E:
192 			return (true);
193 		default:
194 			break;
195 		}
196 	}
197 	return (false);
198 }
199 
200 static bool
201 westmere_cpu(void)
202 {
203 	u_int family, model;
204 
205 	/*
206 	 * The family:model numbers belonging to the Westmere microarchitecture
207 	 * are documented in Section 35.6, Intel SDM dated Feb 2014.
208 	 */
209 	family = CPUID_TO_FAMILY(cpu_id);
210 	model = CPUID_TO_MODEL(cpu_id);
211 	if (family == 0x6) {
212 		switch (model) {
213 		case 0x25:
214 		case 0x2C:
215 			return (true);
216 		default:
217 			break;
218 		}
219 	}
220 	return (false);
221 }
222 
/*
 * Validate a value destined for the IA32_PAT MSR.
 *
 * From Intel SDM: Table "Memory Types That Can Be Encoded With PAT"
 *
 * Each of the eight byte-wide fields PA0 through PA7 must hold one of the
 * encodings 0, 1, 4, 5, 6 or 7.  Any other byte value makes the whole MSR
 * value invalid.
 */
static bool
pat_valid(uint64_t val)
{
	uint64_t rest;
	int field;

	/* Walk the fields from PA0 upwards, consuming one byte per pass. */
	for (field = 0, rest = val; field < 8; field++, rest >>= 8) {
		switch (rest & 0xff) {
		case 0:
		case 1:
		case 4:
		case 5:
		case 6:
		case 7:
			break;
		default:
			return (false);
		}
	}
	return (true);
}
241 
242 void
243 vmx_msr_init(void)
244 {
245 	uint64_t bus_freq, ratio;
246 	int i;
247 
248 	/*
249 	 * It is safe to cache the values of the following MSRs because
250 	 * they don't change based on curcpu, curproc or curthread.
251 	 */
252 	host_msrs[IDX_MSR_LSTAR] = rdmsr(MSR_LSTAR);
253 	host_msrs[IDX_MSR_CSTAR] = rdmsr(MSR_CSTAR);
254 	host_msrs[IDX_MSR_STAR] = rdmsr(MSR_STAR);
255 	host_msrs[IDX_MSR_SF_MASK] = rdmsr(MSR_SF_MASK);
256 
257 	/*
258 	 * Initialize emulated MSRs
259 	 */
260 	misc_enable = rdmsr(MSR_IA32_MISC_ENABLE);
261 	/*
262 	 * Set mandatory bits
263 	 *  11:   branch trace disabled
264 	 *  12:   PEBS unavailable
265 	 * Clear unsupported features
266 	 *  16:   SpeedStep enable
267 	 *  18:   enable MONITOR FSM
268 	 */
269 	misc_enable |= (1 << 12) | (1 << 11);
270 	misc_enable &= ~((1 << 18) | (1 << 16));
271 
272 	if (nehalem_cpu() || westmere_cpu())
273 		bus_freq = 133330000;		/* 133Mhz */
274 	else
275 		bus_freq = 100000000;		/* 100Mhz */
276 
277 	/*
278 	 * XXXtime
279 	 * The ratio should really be based on the virtual TSC frequency as
280 	 * opposed to the host TSC.
281 	 */
282 	ratio = (tsc_freq / bus_freq) & 0xff;
283 
284 	/*
285 	 * The register definition is based on the micro-architecture
286 	 * but the following bits are always the same:
287 	 * [15:8]  Maximum Non-Turbo Ratio
288 	 * [28]    Programmable Ratio Limit for Turbo Mode
289 	 * [29]    Programmable TDC-TDP Limit for Turbo Mode
290 	 * [47:40] Maximum Efficiency Ratio
291 	 *
292 	 * The other bits can be safely set to 0 on all
293 	 * micro-architectures up to Haswell.
294 	 */
295 	platform_info = (ratio << 8) | (ratio << 40);
296 
297 	/*
298 	 * The number of valid bits in the MSR_TURBO_RATIO_LIMITx register is
299 	 * dependent on the maximum cores per package supported by the micro-
300 	 * architecture. For e.g., Westmere supports 6 cores per package and
301 	 * uses the low 48 bits. Sandybridge support 8 cores per package and
302 	 * uses up all 64 bits.
303 	 *
304 	 * However, the unused bits are reserved so we pretend that all bits
305 	 * in this MSR are valid.
306 	 */
307 	for (i = 0; i < 8; i++)
308 		turbo_ratio_limit = (turbo_ratio_limit << 8) | ratio;
309 }
310 
311 void
312 vmx_msr_guest_init(struct vmx *vmx, struct vmx_vcpu *vcpu)
313 {
314 	/*
315 	 * The permissions bitmap is shared between all vcpus so initialize it
316 	 * once when initializing the vBSP.
317 	 */
318 	if (vcpu->vcpuid == 0) {
319 		guest_msr_rw(vmx, MSR_LSTAR);
320 		guest_msr_rw(vmx, MSR_CSTAR);
321 		guest_msr_rw(vmx, MSR_STAR);
322 		guest_msr_rw(vmx, MSR_SF_MASK);
323 		guest_msr_rw(vmx, MSR_KGSBASE);
324 	}
325 
326 	/*
327 	 * Initialize guest IA32_PAT MSR with default value after reset.
328 	 */
329 	vcpu->guest_msrs[IDX_MSR_PAT] = PAT_VALUE(0, PAT_WRITE_BACK) |
330 	    PAT_VALUE(1, PAT_WRITE_THROUGH)	|
331 	    PAT_VALUE(2, PAT_UNCACHED)		|
332 	    PAT_VALUE(3, PAT_UNCACHEABLE)	|
333 	    PAT_VALUE(4, PAT_WRITE_BACK)	|
334 	    PAT_VALUE(5, PAT_WRITE_THROUGH)	|
335 	    PAT_VALUE(6, PAT_UNCACHED)		|
336 	    PAT_VALUE(7, PAT_UNCACHEABLE);
337 
338 	return;
339 }
340 
341 void
342 vmx_msr_guest_enter(struct vmx_vcpu *vcpu)
343 {
344 
345 	/* Save host MSRs (in particular, KGSBASE) and restore guest MSRs */
346 	update_pcb_bases(curpcb);
347 	wrmsr(MSR_LSTAR, vcpu->guest_msrs[IDX_MSR_LSTAR]);
348 	wrmsr(MSR_CSTAR, vcpu->guest_msrs[IDX_MSR_CSTAR]);
349 	wrmsr(MSR_STAR, vcpu->guest_msrs[IDX_MSR_STAR]);
350 	wrmsr(MSR_SF_MASK, vcpu->guest_msrs[IDX_MSR_SF_MASK]);
351 	wrmsr(MSR_KGSBASE, vcpu->guest_msrs[IDX_MSR_KGSBASE]);
352 }
353 
354 void
355 vmx_msr_guest_enter_tsc_aux(struct vmx *vmx, struct vmx_vcpu *vcpu)
356 {
357 	uint64_t guest_tsc_aux = vcpu->guest_msrs[IDX_MSR_TSC_AUX];
358 	uint32_t host_aux = cpu_auxmsr();
359 
360 	if (vmx_have_msr_tsc_aux && guest_tsc_aux != host_aux)
361 		wrmsr(MSR_TSC_AUX, guest_tsc_aux);
362 }
363 
364 void
365 vmx_msr_guest_exit(struct vmx_vcpu *vcpu)
366 {
367 
368 	/* Save guest MSRs */
369 	vcpu->guest_msrs[IDX_MSR_LSTAR] = rdmsr(MSR_LSTAR);
370 	vcpu->guest_msrs[IDX_MSR_CSTAR] = rdmsr(MSR_CSTAR);
371 	vcpu->guest_msrs[IDX_MSR_STAR] = rdmsr(MSR_STAR);
372 	vcpu->guest_msrs[IDX_MSR_SF_MASK] = rdmsr(MSR_SF_MASK);
373 	vcpu->guest_msrs[IDX_MSR_KGSBASE] = rdmsr(MSR_KGSBASE);
374 
375 	/* Restore host MSRs */
376 	wrmsr(MSR_LSTAR, host_msrs[IDX_MSR_LSTAR]);
377 	wrmsr(MSR_CSTAR, host_msrs[IDX_MSR_CSTAR]);
378 	wrmsr(MSR_STAR, host_msrs[IDX_MSR_STAR]);
379 	wrmsr(MSR_SF_MASK, host_msrs[IDX_MSR_SF_MASK]);
380 
381 	/* MSR_KGSBASE will be restored on the way back to userspace */
382 }
383 
384 void
385 vmx_msr_guest_exit_tsc_aux(struct vmx *vmx, struct vmx_vcpu *vcpu)
386 {
387 	uint64_t guest_tsc_aux = vcpu->guest_msrs[IDX_MSR_TSC_AUX];
388 	uint32_t host_aux = cpu_auxmsr();
389 
390 	if (vmx_have_msr_tsc_aux && guest_tsc_aux != host_aux)
391 		/*
392 		 * Note that it is not necessary to save the guest value
393 		 * here; vcpu->guest_msrs[IDX_MSR_TSC_AUX] always
394 		 * contains the current value since it is updated whenever
395 		 * the guest writes to it (which is expected to be very
396 		 * rare).
397 		 */
398 		wrmsr(MSR_TSC_AUX, host_aux);
399 }
400 
401 int
402 vmx_rdmsr(struct vmx_vcpu *vcpu, u_int num, uint64_t *val, bool *retu)
403 {
404 	int error;
405 
406 	error = 0;
407 
408 	switch (num) {
409 	case MSR_MCG_CAP:
410 	case MSR_MCG_STATUS:
411 		*val = 0;
412 		break;
413 	case MSR_MTRRcap:
414 	case MSR_MTRRdefType:
415 	case MSR_MTRR4kBase ... MSR_MTRR4kBase + 7:
416 	case MSR_MTRR16kBase ... MSR_MTRR16kBase + 1:
417 	case MSR_MTRR64kBase:
418 	case MSR_MTRRVarBase ... MSR_MTRRVarBase + (VMM_MTRR_VAR_MAX * 2) - 1:
419 		if (vm_rdmtrr(&vcpu->mtrr, num, val) != 0) {
420 			vm_inject_gp(vcpu->vcpu);
421 		}
422 		break;
423 	case MSR_IA32_MISC_ENABLE:
424 		*val = misc_enable;
425 		break;
426 	case MSR_PLATFORM_INFO:
427 		*val = platform_info;
428 		break;
429 	case MSR_TURBO_RATIO_LIMIT:
430 	case MSR_TURBO_RATIO_LIMIT1:
431 		*val = turbo_ratio_limit;
432 		break;
433 	case MSR_PAT:
434 		*val = vcpu->guest_msrs[IDX_MSR_PAT];
435 		break;
436 	default:
437 		error = EINVAL;
438 		break;
439 	}
440 	return (error);
441 }
442 
443 int
444 vmx_wrmsr(struct vmx_vcpu *vcpu, u_int num, uint64_t val, bool *retu)
445 {
446 	uint64_t changed;
447 	int error;
448 
449 	error = 0;
450 
451 	switch (num) {
452 	case MSR_MCG_CAP:
453 	case MSR_MCG_STATUS:
454 		break;		/* ignore writes */
455 	case MSR_MTRRcap:
456 	case MSR_MTRRdefType:
457 	case MSR_MTRR4kBase ... MSR_MTRR4kBase + 7:
458 	case MSR_MTRR16kBase ... MSR_MTRR16kBase + 1:
459 	case MSR_MTRR64kBase:
460 	case MSR_MTRRVarBase ... MSR_MTRRVarBase + (VMM_MTRR_VAR_MAX * 2) - 1:
461 		if (vm_wrmtrr(&vcpu->mtrr, num, val) != 0) {
462 			vm_inject_gp(vcpu->vcpu);
463 		}
464 		break;
465 	case MSR_IA32_MISC_ENABLE:
466 		changed = val ^ misc_enable;
467 		/*
468 		 * If the host has disabled the NX feature then the guest
469 		 * also cannot use it. However, a Linux guest will try to
470 		 * enable the NX feature by writing to the MISC_ENABLE MSR.
471 		 *
472 		 * This can be safely ignored because the memory management
473 		 * code looks at CPUID.80000001H:EDX.NX to check if the
474 		 * functionality is actually enabled.
475 		 */
476 		changed &= ~(1UL << 34);
477 
478 		/*
479 		 * Punt to userspace if any other bits are being modified.
480 		 */
481 		if (changed)
482 			error = EINVAL;
483 
484 		break;
485 	case MSR_PAT:
486 		if (pat_valid(val))
487 			vcpu->guest_msrs[IDX_MSR_PAT] = val;
488 		else
489 			vm_inject_gp(vcpu->vcpu);
490 		break;
491 	case MSR_TSC:
492 		error = vmx_set_tsc_offset(vcpu, val - rdtsc());
493 		break;
494 	case MSR_TSC_AUX:
495 		if (vmx_have_msr_tsc_aux)
496 			/*
497 			 * vmx_msr_guest_enter_tsc_aux() will apply this
498 			 * value when it is called immediately before guest
499 			 * entry.
500 			 */
501 			vcpu->guest_msrs[IDX_MSR_TSC_AUX] = val;
502 		else
503 			vm_inject_gp(vcpu->vcpu);
504 		break;
505 	default:
506 		error = EINVAL;
507 		break;
508 	}
509 
510 	return (error);
511 }
512