xref: /illumos-gate/usr/src/uts/intel/io/vmm/intel/vmx_msr.c (revision fdad6fbf87b201fdb96a704fc41fa8be1e4efbc8)
1 /*-
2  * SPDX-License-Identifier: BSD-2-Clause
3  *
4  * Copyright (c) 2011 NetApp, Inc.
5  * All rights reserved.
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  * 1. Redistributions of source code must retain the above copyright
11  *    notice, this list of conditions and the following disclaimer.
12  * 2. Redistributions in binary form must reproduce the above copyright
13  *    notice, this list of conditions and the following disclaimer in the
14  *    documentation and/or other materials provided with the distribution.
15  *
16  * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
17  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19  * ARE DISCLAIMED.  IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
20  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
22  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
23  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
24  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
25  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
26  * SUCH DAMAGE.
27  */
28 /*
29  * This file and its contents are supplied under the terms of the
30  * Common Development and Distribution License ("CDDL"), version 1.0.
31  * You may only use this file in accordance with the terms of version
32  * 1.0 of the CDDL.
33  *
34  * A full copy of the text of the CDDL should have accompanied this
35  * source.  A copy of the CDDL is also available via the Internet at
36  * http://www.illumos.org/license/CDDL.
37  */
38 /* This file is dual-licensed; see usr/src/contrib/bhyve/LICENSE */
39 
40 /*
41  * Copyright 2020 Joyent, Inc.
42  * Copyright 2021 Oxide Computer Company
43  */
44 
45 #include <sys/cdefs.h>
46 
47 #include <sys/param.h>
48 #include <sys/systm.h>
49 #include <sys/proc.h>
50 
51 #include <machine/clock.h>
52 #include <machine/cpufunc.h>
53 #include <machine/md_var.h>
54 #include <machine/specialreg.h>
55 #include <machine/vmm.h>
56 #include <sys/vmm_kernel.h>
57 
58 #include "vmx.h"
59 #include "vmx_msr.h"
60 
/*
 * Test whether the capability MSR value 'msr_val' permits control bit
 * 'bitpos' to be set to 1.  That is the case when bit (bitpos + 32) of
 * 'msr_val' is set.  'bitpos' is expected to be in the range 0..31.
 */
static bool
vmx_ctl_allows_one_setting(uint64_t msr_val, int bitpos)
{
	const uint64_t allowed1 = (msr_val >> (bitpos + 32)) & 1;

	return (allowed1 != 0);
}
67 
/*
 * Test whether the capability MSR value 'msr_val' permits control bit
 * 'bitpos' to be cleared to 0.  That is the case when bit 'bitpos' of
 * 'msr_val' is clear.  'bitpos' is expected to be in the range 0..31.
 */
static bool
vmx_ctl_allows_zero_setting(uint64_t msr_val, int bitpos)
{
	const uint64_t must_be_one = (msr_val >> bitpos) & 1;

	return (must_be_one == 0);
}
74 
75 /*
76  * Generate a bitmask to be used for the VMCS execution control fields.
77  *
78  * The caller specifies what bits should be set to one in 'ones_mask'
79  * and what bits should be set to zero in 'zeros_mask'. The don't-care
80  * bits are set to the default value. The default values are obtained
81  * based on "Algorithm 3" in Section 27.5.1 "Algorithms for Determining
82  * VMX Capabilities".
83  *
84  * Returns zero on success and non-zero on error.
85  */
86 int
vmx_set_ctlreg(int ctl_reg,int true_ctl_reg,uint32_t ones_mask,uint32_t zeros_mask,uint32_t * retval)87 vmx_set_ctlreg(int ctl_reg, int true_ctl_reg, uint32_t ones_mask,
88     uint32_t zeros_mask, uint32_t *retval)
89 {
90 	int i;
91 	uint64_t val, trueval;
92 	bool true_ctls_avail, one_allowed, zero_allowed;
93 
94 	/* We cannot ask the same bit to be set to both '1' and '0' */
95 	if ((ones_mask ^ zeros_mask) != (ones_mask | zeros_mask))
96 		return (EINVAL);
97 
98 	true_ctls_avail = (rdmsr(MSR_VMX_BASIC) & (1UL << 55)) != 0;
99 
100 	val = rdmsr(ctl_reg);
101 	if (true_ctls_avail)
102 		trueval = rdmsr(true_ctl_reg);		/* step c */
103 	else
104 		trueval = val;				/* step a */
105 
106 	for (i = 0; i < 32; i++) {
107 		one_allowed = vmx_ctl_allows_one_setting(trueval, i);
108 		zero_allowed = vmx_ctl_allows_zero_setting(trueval, i);
109 
110 		KASSERT(one_allowed || zero_allowed,
111 		    ("invalid zero/one setting for bit %d of ctl 0x%0x, "
112 		    "truectl 0x%0x\n", i, ctl_reg, true_ctl_reg));
113 
114 		if (zero_allowed && !one_allowed) {		/* b(i),c(i) */
115 			if (ones_mask & (1 << i))
116 				return (EINVAL);
117 			*retval &= ~(1 << i);
118 		} else if (one_allowed && !zero_allowed) {	/* b(i),c(i) */
119 			if (zeros_mask & (1 << i))
120 				return (EINVAL);
121 			*retval |= 1 << i;
122 		} else {
123 			if (zeros_mask & (1 << i)) {
124 				/* b(ii),c(ii) */
125 				*retval &= ~(1 << i);
126 			} else if (ones_mask & (1 << i)) {
127 				/* b(ii), c(ii) */
128 				*retval |= 1 << i;
129 			} else if (!true_ctls_avail) {
130 				/* b(iii) */
131 				*retval &= ~(1 << i);
132 			} else if (vmx_ctl_allows_zero_setting(val, i)) {
133 				/* c(iii) */
134 				*retval &= ~(1 << i);
135 			} else if (vmx_ctl_allows_one_setting(val, i)) {
136 				/* c(iv) */
137 				*retval |= 1 << i;
138 			} else {
139 				panic("vmx_set_ctlreg: unable to determine "
140 				    "correct value of ctl bit %d for msr "
141 				    "0x%0x and true msr 0x%0x", i, ctl_reg,
142 				    true_ctl_reg);
143 			}
144 		}
145 	}
146 
147 	return (0);
148 }
149 
150 void
vmx_msr_bitmap_initialize(struct vmx * vmx)151 vmx_msr_bitmap_initialize(struct vmx *vmx)
152 {
153 	for (uint_t i = 0; i < VM_MAXCPU; i++) {
154 		uint8_t *bitmap;
155 
156 		bitmap = kmem_alloc(PAGESIZE, KM_SLEEP);
157 		VERIFY3U((uintptr_t)bitmap & PAGEOFFSET, ==, 0);
158 		memset(bitmap, 0xff, PAGESIZE);
159 
160 		vmx->msr_bitmap[i] = bitmap;
161 	}
162 }
163 
164 void
vmx_msr_bitmap_destroy(struct vmx * vmx)165 vmx_msr_bitmap_destroy(struct vmx *vmx)
166 {
167 	for (uint_t i = 0; i < VM_MAXCPU; i++) {
168 		VERIFY3P(vmx->msr_bitmap[i], !=, NULL);
169 		kmem_free(vmx->msr_bitmap[i], PAGESIZE);
170 		vmx->msr_bitmap[i] = NULL;
171 	}
172 }
173 
174 void
vmx_msr_bitmap_change_access(struct vmx * vmx,int vcpuid,uint_t msr,int acc)175 vmx_msr_bitmap_change_access(struct vmx *vmx, int vcpuid, uint_t msr, int acc)
176 {
177 	uint8_t *bitmap = vmx->msr_bitmap[vcpuid];
178 	int byte, bit;
179 
180 	if (msr <= 0x00001FFF) {
181 		byte = msr / 8;
182 	} else if (msr >= 0xC0000000 && msr <= 0xC0001FFF) {
183 		byte = 1024 + (msr - 0xC0000000) / 8;
184 	} else {
185 		panic("Invalid MSR for bitmap: %x", msr);
186 	}
187 
188 	bit = msr & 0x7;
189 
190 	if (acc & MSR_BITMAP_ACCESS_READ) {
191 		bitmap[byte] &= ~(1 << bit);
192 	} else {
193 		bitmap[byte] |= 1 << bit;
194 	}
195 
196 	byte += 2048;
197 	if (acc & MSR_BITMAP_ACCESS_WRITE) {
198 		bitmap[byte] &= ~(1 << bit);
199 	} else {
200 		bitmap[byte] |= 1 << bit;
201 	}
202 }
203 
/*
 * Values reported to guests for emulated MSRs.  All three are computed
 * once by vmx_msr_init() and returned by vmx_rdmsr():
 *   misc_enable       - MSR_IA32_MISC_ENABLE, derived from the host value
 *   platform_info     - MSR_PLATFORM_INFO
 *   turbo_ratio_limit - MSR_TURBO_RATIO_LIMIT and MSR_TURBO_RATIO_LIMIT1
 */
static uint64_t misc_enable;
static uint64_t platform_info;
static uint64_t turbo_ratio_limit;
207 
208 static bool
nehalem_cpu(void)209 nehalem_cpu(void)
210 {
211 	uint_t family, model;
212 
213 	/*
214 	 * The family:model numbers belonging to the Nehalem microarchitecture
215 	 * are documented in Section 35.5, Intel SDM dated Feb 2014.
216 	 */
217 	family = CPUID_TO_FAMILY(cpu_id);
218 	model = CPUID_TO_MODEL(cpu_id);
219 	if (family == 0x6) {
220 		switch (model) {
221 		case 0x1A:
222 		case 0x1E:
223 		case 0x1F:
224 		case 0x2E:
225 			return (true);
226 		default:
227 			break;
228 		}
229 	}
230 	return (false);
231 }
232 
233 static bool
westmere_cpu(void)234 westmere_cpu(void)
235 {
236 	uint_t family, model;
237 
238 	/*
239 	 * The family:model numbers belonging to the Westmere microarchitecture
240 	 * are documented in Section 35.6, Intel SDM dated Feb 2014.
241 	 */
242 	family = CPUID_TO_FAMILY(cpu_id);
243 	model = CPUID_TO_MODEL(cpu_id);
244 	if (family == 0x6) {
245 		switch (model) {
246 		case 0x25:
247 		case 0x2C:
248 			return (true);
249 		default:
250 			break;
251 		}
252 	}
253 	return (false);
254 }
255 
/*
 * Validate a value destined for the PAT MSR.
 *
 * From Intel SDM: Table "Memory Types That Can Be Encoded With PAT"
 *
 * The value holds eight one-byte entries, PA0 (lowest byte) through PA7.
 * An entry is accepted only if it is 0, 1, 4, 5, 6 or 7; the encodings 2
 * and 3 and anything from 8 upwards are rejected.
 *
 * Returns true when all eight entries are acceptable.
 */
static bool
pat_valid(uint64_t val)
{
	for (int entry = 0; entry < 8; entry++, val >>= 8) {
		switch (val & 0xff) {
		case 0:
		case 1:
		case 4:
		case 5:
		case 6:
		case 7:
			break;
		default:
			return (false);
		}
	}
	return (true);
}
274 
275 void
vmx_msr_init(void)276 vmx_msr_init(void)
277 {
278 	uint64_t bus_freq, ratio;
279 	int i;
280 
281 	/*
282 	 * Initialize emulated MSRs
283 	 */
284 	misc_enable = rdmsr(MSR_IA32_MISC_ENABLE);
285 	/*
286 	 * Set mandatory bits
287 	 *  11:   branch trace disabled
288 	 *  12:   PEBS unavailable
289 	 * Clear unsupported features
290 	 *  16:   SpeedStep enable
291 	 *  18:   enable MONITOR FSM
292 	 */
293 	misc_enable |= (1 << 12) | (1 << 11);
294 	misc_enable &= ~((1 << 18) | (1 << 16));
295 
296 	if (nehalem_cpu() || westmere_cpu())
297 		bus_freq = 133330000;		/* 133Mhz */
298 	else
299 		bus_freq = 100000000;		/* 100Mhz */
300 
301 	/*
302 	 * XXXtime
303 	 * The ratio should really be based on the virtual TSC frequency as
304 	 * opposed to the host TSC.
305 	 */
306 	ratio = (tsc_freq / bus_freq) & 0xff;
307 
308 	/*
309 	 * The register definition is based on the micro-architecture
310 	 * but the following bits are always the same:
311 	 * [15:8]  Maximum Non-Turbo Ratio
312 	 * [28]    Programmable Ratio Limit for Turbo Mode
313 	 * [29]    Programmable TDC-TDP Limit for Turbo Mode
314 	 * [47:40] Maximum Efficiency Ratio
315 	 *
316 	 * The other bits can be safely set to 0 on all
317 	 * micro-architectures up to Haswell.
318 	 */
319 	platform_info = (ratio << 8) | (ratio << 40);
320 
321 	/*
322 	 * The number of valid bits in the MSR_TURBO_RATIO_LIMITx register is
323 	 * dependent on the maximum cores per package supported by the micro-
324 	 * architecture. For e.g., Westmere supports 6 cores per package and
325 	 * uses the low 48 bits. Sandybridge support 8 cores per package and
326 	 * uses up all 64 bits.
327 	 *
328 	 * However, the unused bits are reserved so we pretend that all bits
329 	 * in this MSR are valid.
330 	 */
331 	for (i = 0; i < 8; i++)
332 		turbo_ratio_limit = (turbo_ratio_limit << 8) | ratio;
333 }
334 
335 void
vmx_msr_guest_init(struct vmx * vmx,int vcpuid)336 vmx_msr_guest_init(struct vmx *vmx, int vcpuid)
337 {
338 	uint64_t *guest_msrs = vmx->guest_msrs[vcpuid];
339 
340 	/*
341 	 * It is safe to allow direct access to MSR_GSBASE and
342 	 * MSR_FSBASE.  The guest FSBASE and GSBASE are saved and
343 	 * restored during vm-exit and vm-entry respectively. The host
344 	 * FSBASE and GSBASE are always restored from the vmcs host
345 	 * state area on vm-exit.
346 	 *
347 	 * The SYSENTER_CS/ESP/EIP MSRs are identical to FS/GSBASE in
348 	 * how they are saved/restored so can be directly accessed by
349 	 * the guest.
350 	 *
351 	 * MSR_EFER is saved and restored in the guest VMCS area on a VM
352 	 * exit and entry respectively. It is also restored from the
353 	 * host VMCS area on a VM exit.
354 	 *
355 	 * The TSC MSR is exposed read-only. Writes are disallowed as
356 	 * that will impact the host TSC.  If the guest does a write the
357 	 * "use TSC offsetting" execution control is enabled and the
358 	 * difference between the host TSC and the guest TSC is written
359 	 * into the TSC offset in the VMCS.
360 	 */
361 	guest_msr_rw(vmx, vcpuid, MSR_GSBASE);
362 	guest_msr_rw(vmx, vcpuid, MSR_FSBASE);
363 	guest_msr_rw(vmx, vcpuid, MSR_SYSENTER_CS_MSR);
364 	guest_msr_rw(vmx, vcpuid, MSR_SYSENTER_ESP_MSR);
365 	guest_msr_rw(vmx, vcpuid, MSR_SYSENTER_EIP_MSR);
366 	guest_msr_rw(vmx, vcpuid, MSR_EFER);
367 	guest_msr_ro(vmx, vcpuid, MSR_TSC);
368 
369 	/*
370 	 * The guest may have direct access to these MSRs as they are
371 	 * saved/restored in vmx_msr_guest_enter() and vmx_msr_guest_exit().
372 	 */
373 	guest_msr_rw(vmx, vcpuid, MSR_LSTAR);
374 	guest_msr_rw(vmx, vcpuid, MSR_CSTAR);
375 	guest_msr_rw(vmx, vcpuid, MSR_STAR);
376 	guest_msr_rw(vmx, vcpuid, MSR_SF_MASK);
377 	guest_msr_rw(vmx, vcpuid, MSR_KGSBASE);
378 
379 	/*
380 	 * Initialize guest IA32_PAT MSR with default value after reset.
381 	 */
382 	guest_msrs[IDX_MSR_PAT] = PAT_VALUE(0, PAT_WRITE_BACK) |
383 	    PAT_VALUE(1, PAT_WRITE_THROUGH)	|
384 	    PAT_VALUE(2, PAT_UNCACHED)		|
385 	    PAT_VALUE(3, PAT_UNCACHEABLE)	|
386 	    PAT_VALUE(4, PAT_WRITE_BACK)	|
387 	    PAT_VALUE(5, PAT_WRITE_THROUGH)	|
388 	    PAT_VALUE(6, PAT_UNCACHED)		|
389 	    PAT_VALUE(7, PAT_UNCACHEABLE);
390 }
391 
392 void
vmx_msr_guest_enter(struct vmx * vmx,int vcpuid)393 vmx_msr_guest_enter(struct vmx *vmx, int vcpuid)
394 {
395 	uint64_t *guest_msrs = vmx->guest_msrs[vcpuid];
396 	uint64_t *host_msrs = vmx->host_msrs[vcpuid];
397 
398 	/* Save host MSRs */
399 	host_msrs[IDX_MSR_LSTAR] = rdmsr(MSR_LSTAR);
400 	host_msrs[IDX_MSR_CSTAR] = rdmsr(MSR_CSTAR);
401 	host_msrs[IDX_MSR_STAR] = rdmsr(MSR_STAR);
402 	host_msrs[IDX_MSR_SF_MASK] = rdmsr(MSR_SF_MASK);
403 
404 	/* Save host MSRs (in particular, KGSBASE) and restore guest MSRs */
405 	wrmsr(MSR_LSTAR, guest_msrs[IDX_MSR_LSTAR]);
406 	wrmsr(MSR_CSTAR, guest_msrs[IDX_MSR_CSTAR]);
407 	wrmsr(MSR_STAR, guest_msrs[IDX_MSR_STAR]);
408 	wrmsr(MSR_SF_MASK, guest_msrs[IDX_MSR_SF_MASK]);
409 	wrmsr(MSR_KGSBASE, guest_msrs[IDX_MSR_KGSBASE]);
410 }
411 
412 void
vmx_msr_guest_exit(struct vmx * vmx,int vcpuid)413 vmx_msr_guest_exit(struct vmx *vmx, int vcpuid)
414 {
415 	uint64_t *guest_msrs = vmx->guest_msrs[vcpuid];
416 	uint64_t *host_msrs = vmx->host_msrs[vcpuid];
417 
418 	/* Save guest MSRs */
419 	guest_msrs[IDX_MSR_LSTAR] = rdmsr(MSR_LSTAR);
420 	guest_msrs[IDX_MSR_CSTAR] = rdmsr(MSR_CSTAR);
421 	guest_msrs[IDX_MSR_STAR] = rdmsr(MSR_STAR);
422 	guest_msrs[IDX_MSR_SF_MASK] = rdmsr(MSR_SF_MASK);
423 	guest_msrs[IDX_MSR_KGSBASE] = rdmsr(MSR_KGSBASE);
424 
425 	/* Restore host MSRs */
426 	wrmsr(MSR_LSTAR, host_msrs[IDX_MSR_LSTAR]);
427 	wrmsr(MSR_CSTAR, host_msrs[IDX_MSR_CSTAR]);
428 	wrmsr(MSR_STAR, host_msrs[IDX_MSR_STAR]);
429 	wrmsr(MSR_SF_MASK, host_msrs[IDX_MSR_SF_MASK]);
430 
431 	/* MSR_KGSBASE will be restored on the way back to userspace */
432 }
433 
434 vm_msr_result_t
vmx_rdmsr(struct vmx * vmx,int vcpuid,uint32_t num,uint64_t * val)435 vmx_rdmsr(struct vmx *vmx, int vcpuid, uint32_t num, uint64_t *val)
436 {
437 	const uint64_t *guest_msrs = vmx->guest_msrs[vcpuid];
438 
439 	switch (num) {
440 	case MSR_IA32_FEATURE_CONTROL:
441 		/*
442 		 * We currently don't support SGX support in guests, so
443 		 * always report those features as disabled with the MSR
444 		 * locked so the guest won't attempt to write to it.
445 		 */
446 		*val = IA32_FEATURE_CONTROL_LOCK;
447 		break;
448 	case MSR_IA32_MISC_ENABLE:
449 		*val = misc_enable;
450 		break;
451 	case MSR_PLATFORM_INFO:
452 		*val = platform_info;
453 		break;
454 	case MSR_TURBO_RATIO_LIMIT:
455 	case MSR_TURBO_RATIO_LIMIT1:
456 		*val = turbo_ratio_limit;
457 		break;
458 	case MSR_PAT:
459 		*val = guest_msrs[IDX_MSR_PAT];
460 		break;
461 	default:
462 		return (VMR_UNHANLDED);
463 	}
464 	return (VMR_OK);
465 }
466 
467 vm_msr_result_t
vmx_wrmsr(struct vmx * vmx,int vcpuid,uint32_t num,uint64_t val)468 vmx_wrmsr(struct vmx *vmx, int vcpuid, uint32_t num, uint64_t val)
469 {
470 	uint64_t *guest_msrs = vmx->guest_msrs[vcpuid];
471 	uint64_t changed;
472 
473 	switch (num) {
474 	case MSR_IA32_MISC_ENABLE:
475 		changed = val ^ misc_enable;
476 		/*
477 		 * If the host has disabled the NX feature then the guest
478 		 * also cannot use it. However, a Linux guest will try to
479 		 * enable the NX feature by writing to the MISC_ENABLE MSR.
480 		 *
481 		 * This can be safely ignored because the memory management
482 		 * code looks at CPUID.80000001H:EDX.NX to check if the
483 		 * functionality is actually enabled.
484 		 */
485 		changed &= ~(1UL << 34);
486 
487 		/*
488 		 * Punt to userspace if any other bits are being modified.
489 		 */
490 		if (changed) {
491 			return (VMR_UNHANLDED);
492 		}
493 		break;
494 	case MSR_PAT:
495 		if (!pat_valid(val)) {
496 			return (VMR_GP);
497 		}
498 		guest_msrs[IDX_MSR_PAT] = val;
499 		break;
500 	default:
501 		return (VMR_UNHANLDED);
502 	}
503 
504 	return (VMR_OK);
505 }
506