xref: /illumos-gate/usr/src/uts/intel/io/vmm/intel/vmx_msr.c (revision 32640292339b07090f10ce34d455f98711077343)
/*-
 * SPDX-License-Identifier: BSD-2-Clause
 *
 * Copyright (c) 2011 NetApp, Inc.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */
/*
 * Copyright 2020 Joyent, Inc.
 * Copyright 2021 Oxide Computer Company
 */

#include <sys/cdefs.h>

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/proc.h>

#include <machine/clock.h>
#include <machine/cpufunc.h>
#include <machine/md_var.h>
#include <machine/specialreg.h>
#include <machine/vmm.h>
#include <sys/vmm_kernel.h>

#include "vmx.h"
#include "vmx_msr.h"

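/*
 * Each VMX capability MSR reports its allowed 0-settings in bits 31:0 and
 * its allowed 1-settings in bits 63:32 (Intel SDM, Appendix A): a control
 * bit may be set to '1' only if the corresponding allowed-1 bit is set, and
 * may be cleared to '0' only if the corresponding allowed-0 bit is clear.
 */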
static bool
vmx_ctl_allows_one_setting(uint64_t msr_val, int bitpos)
{

	return ((msr_val & (1UL << (bitpos + 32))) != 0);
}

static bool
vmx_ctl_allows_zero_setting(uint64_t msr_val, int bitpos)
{

	return ((msr_val & (1UL << bitpos)) == 0);
}

/*
 * Generate a bitmask to be used for the VMCS execution control fields.
 *
 * The caller specifies what bits should be set to one in 'ones_mask'
 * and what bits should be set to zero in 'zeros_mask'. The don't-care
 * bits are set to the default value. The default values are obtained
 * based on "Algorithm 3" in Section 27.5.1 "Algorithms for Determining
 * VMX Capabilities".
 *
 * Returns zero on success and non-zero on error.
 */
int
vmx_set_ctlreg(int ctl_reg, int true_ctl_reg, uint32_t ones_mask,
    uint32_t zeros_mask, uint32_t *retval)
{
	int i;
	uint64_t val, trueval;
	bool true_ctls_avail, one_allowed, zero_allowed;

	/* We cannot ask the same bit to be set to both '1' and '0' */
	if ((ones_mask ^ zeros_mask) != (ones_mask | zeros_mask))
		return (EINVAL);

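	/*
	 * Bit 55 of IA32_VMX_BASIC reports whether the "true" capability
	 * MSRs (e.g. IA32_VMX_TRUE_PINBASED_CTLS) are supported; those MSRs
	 * additionally indicate which default1-class controls may be
	 * cleared.
	 */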
	true_ctls_avail = (rdmsr(MSR_VMX_BASIC) & (1UL << 55)) != 0;

	val = rdmsr(ctl_reg);
	if (true_ctls_avail)
		trueval = rdmsr(true_ctl_reg);		/* step c */
	else
		trueval = val;				/* step a */

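	/*
	 * The step labels below ('a', 'b(i)', 'c(ii)', ...) refer to the
	 * cases of the SDM algorithm cited above: the 'b' steps apply when
	 * the true capability MSRs are absent and the 'c' steps when they
	 * are present.
	 */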
	for (i = 0; i < 32; i++) {
		one_allowed = vmx_ctl_allows_one_setting(trueval, i);
		zero_allowed = vmx_ctl_allows_zero_setting(trueval, i);

		KASSERT(one_allowed || zero_allowed,
		    ("invalid zero/one setting for bit %d of ctl 0x%0x, "
		    "truectl 0x%0x\n", i, ctl_reg, true_ctl_reg));

		if (zero_allowed && !one_allowed) {		/* b(i),c(i) */
			if (ones_mask & (1 << i))
				return (EINVAL);
			*retval &= ~(1 << i);
		} else if (one_allowed && !zero_allowed) {	/* b(i),c(i) */
			if (zeros_mask & (1 << i))
				return (EINVAL);
			*retval |= 1 << i;
		} else {
			if (zeros_mask & (1 << i)) {
				/* b(ii),c(ii) */
				*retval &= ~(1 << i);
			} else if (ones_mask & (1 << i)) {
				/* b(ii), c(ii) */
				*retval |= 1 << i;
			} else if (!true_ctls_avail) {
				/* b(iii) */
				*retval &= ~(1 << i);
			} else if (vmx_ctl_allows_zero_setting(val, i)) {
				/* c(iii) */
				*retval &= ~(1 << i);
			} else if (vmx_ctl_allows_one_setting(val, i)) {
				/* c(iv) */
				*retval |= 1 << i;
			} else {
				panic("vmx_set_ctlreg: unable to determine "
				    "correct value of ctl bit %d for msr "
				    "0x%0x and true msr 0x%0x", i, ctl_reg,
				    true_ctl_reg);
			}
		}
	}

	return (0);
}

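/*
 * For illustration only: a caller (vmx_init() in vmx.c, for instance) is
 * expected to compute each set of VMCS controls along these lines, passing
 * the bits it requires to be set and to be clear; the don't-care bits take
 * their defaults:
 *
 *	error = vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS,
 *	    MSR_VMX_TRUE_PROCBASED_CTLS,
 *	    PROCBASED_CTLS_ONE_SETTING, PROCBASED_CTLS_ZERO_SETTING,
 *	    &procbased_ctls);
 */

/*
 * Each vCPU gets its own 4KB MSR bitmap page.  A set bit causes the
 * corresponding guest MSR access to trigger a VM exit, so filling the page
 * with 0xff intercepts every MSR until access is explicitly granted via
 * vmx_msr_bitmap_change_access().
 */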
void
vmx_msr_bitmap_initialize(struct vmx *vmx)
{
	for (uint_t i = 0; i < VM_MAXCPU; i++) {
		uint8_t *bitmap;

		bitmap = kmem_alloc(PAGESIZE, KM_SLEEP);
		VERIFY3U((uintptr_t)bitmap & PAGEOFFSET, ==, 0);
		memset(bitmap, 0xff, PAGESIZE);

		vmx->msr_bitmap[i] = bitmap;
	}
}

void
vmx_msr_bitmap_destroy(struct vmx *vmx)
{
	for (uint_t i = 0; i < VM_MAXCPU; i++) {
		VERIFY3P(vmx->msr_bitmap[i], !=, NULL);
		kmem_free(vmx->msr_bitmap[i], PAGESIZE);
		vmx->msr_bitmap[i] = NULL;
	}
}

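/*
 * The 4KB MSR bitmap page is divided into four 1KB regions (Intel SDM):
 *	0x000-0x3ff	read bitmap for MSRs 0x00000000-0x00001fff
 *	0x400-0x7ff	read bitmap for MSRs 0xc0000000-0xc0001fff
 *	0x800-0xbff	write bitmap for MSRs 0x00000000-0x00001fff
 *	0xc00-0xfff	write bitmap for MSRs 0xc0000000-0xc0001fff
 * Clearing a bit permits the access; a set bit forces a VM exit.
 */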
void
vmx_msr_bitmap_change_access(struct vmx *vmx, int vcpuid, uint_t msr, int acc)
{
	uint8_t *bitmap = vmx->msr_bitmap[vcpuid];
	int byte, bit;

	if (msr <= 0x00001FFF) {
		byte = msr / 8;
	} else if (msr >= 0xC0000000 && msr <= 0xC0001FFF) {
		byte = 1024 + (msr - 0xC0000000) / 8;
	} else {
		panic("Invalid MSR for bitmap: %x", msr);
	}

	bit = msr & 0x7;

	if (acc & MSR_BITMAP_ACCESS_READ) {
		bitmap[byte] &= ~(1 << bit);
	} else {
		bitmap[byte] |= 1 << bit;
	}

	byte += 2048;
	if (acc & MSR_BITMAP_ACCESS_WRITE) {
		bitmap[byte] &= ~(1 << bit);
	} else {
		bitmap[byte] |= 1 << bit;
	}
}

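/*
 * Host-derived values used to emulate the corresponding MSRs for guests.
 * They are computed once, in vmx_msr_init(), and shared by all VMs.
 */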
static uint64_t misc_enable;
static uint64_t platform_info;
static uint64_t turbo_ratio_limit;

static bool
nehalem_cpu(void)
{
	uint_t family, model;

	/*
	 * The family:model numbers belonging to the Nehalem microarchitecture
	 * are documented in Section 35.5, Intel SDM dated Feb 2014.
	 */
	family = CPUID_TO_FAMILY(cpu_id);
	model = CPUID_TO_MODEL(cpu_id);
	if (family == 0x6) {
		switch (model) {
		case 0x1A:
		case 0x1E:
		case 0x1F:
		case 0x2E:
			return (true);
		default:
			break;
		}
	}
	return (false);
}

static bool
westmere_cpu(void)
{
	uint_t family, model;

	/*
	 * The family:model numbers belonging to the Westmere microarchitecture
	 * are documented in Section 35.6, Intel SDM dated Feb 2014.
	 */
	family = CPUID_TO_FAMILY(cpu_id);
	model = CPUID_TO_MODEL(cpu_id);
	if (family == 0x6) {
		switch (model) {
		case 0x25:
		case 0x2C:
			return (true);
		default:
			break;
		}
	}
	return (false);
}

static bool
pat_valid(uint64_t val)
{
	int i, pa;

	/*
	 * From Intel SDM: Table "Memory Types That Can Be Encoded With PAT"
	 *
	 * Extract PA0 through PA7 and validate that each one encodes a
	 * valid memory type.
	 */
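	/*
	 * Valid PAT entry encodings are UC (0), WC (1), WT (4), WP (5),
	 * WB (6) and UC- (7); values 2, 3 and 8-255 are reserved.
	 */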
	for (i = 0; i < 8; i++) {
		pa = (val >> (i * 8)) & 0xff;
		if (pa == 2 || pa == 3 || pa >= 8)
			return (false);
	}
	return (true);
}

void
vmx_msr_init(void)
{
	uint64_t bus_freq, ratio;
	int i;

	/*
	 * Initialize emulated MSRs
	 */
	misc_enable = rdmsr(MSR_IA32_MISC_ENABLE);
	/*
	 * Set mandatory bits
	 *  11:   branch trace disabled
	 *  12:   PEBS unavailable
	 * Clear unsupported features
	 *  16:   SpeedStep enable
	 *  18:   enable MONITOR FSM
	 */
	misc_enable |= (1 << 12) | (1 << 11);
	misc_enable &= ~((1 << 18) | (1 << 16));

	if (nehalem_cpu() || westmere_cpu())
		bus_freq = 133330000;		/* 133.33 MHz */
	else
		bus_freq = 100000000;		/* 100 MHz */

	/*
	 * XXXtime
	 * The ratio should really be based on the virtual TSC frequency as
	 * opposed to the host TSC.
	 */
	ratio = (tsc_freq / bus_freq) & 0xff;

	/*
	 * The register definition is based on the micro-architecture
	 * but the following bits are always the same:
	 * [15:8]  Maximum Non-Turbo Ratio
	 * [28]    Programmable Ratio Limit for Turbo Mode
	 * [29]    Programmable TDC-TDP Limit for Turbo Mode
	 * [47:40] Maximum Efficiency Ratio
	 *
	 * The other bits can be safely set to 0 on all
	 * micro-architectures up to Haswell.
	 */
	platform_info = (ratio << 8) | (ratio << 40);
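
	/*
	 * Worked example (illustrative): with a 2.4 GHz host TSC and a
	 * 100 MHz bus clock, ratio = 24 (0x18), so platform_info becomes
	 * 0x0000180000001800 and the turbo_ratio_limit computed below
	 * becomes 0x1818181818181818.
	 */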

	/*
	 * The number of valid bits in the MSR_TURBO_RATIO_LIMITx register is
	 * dependent on the maximum cores per package supported by the micro-
	 * architecture. For example, Westmere supports 6 cores per package and
	 * uses the low 48 bits. Sandy Bridge supports 8 cores per package and
	 * uses all 64 bits.
	 *
	 * However, the unused bits are reserved so we pretend that all bits
	 * in this MSR are valid.
	 */
	for (i = 0; i < 8; i++)
		turbo_ratio_limit = (turbo_ratio_limit << 8) | ratio;
}

void
vmx_msr_guest_init(struct vmx *vmx, int vcpuid)
{
	uint64_t *guest_msrs = vmx->guest_msrs[vcpuid];

	/*
	 * It is safe to allow direct access to MSR_GSBASE and
	 * MSR_FSBASE.  The guest FSBASE and GSBASE are saved and
	 * restored during vm-exit and vm-entry respectively. The host
	 * FSBASE and GSBASE are always restored from the vmcs host
	 * state area on vm-exit.
	 *
	 * The SYSENTER_CS/ESP/EIP MSRs are identical to FS/GSBASE in
	 * how they are saved/restored so they can be directly accessed
	 * by the guest.
	 *
	 * MSR_EFER is saved and restored in the guest VMCS area on a VM
	 * exit and entry respectively. It is also restored from the
	 * host VMCS area on a VM exit.
	 *
	 * The TSC MSR is exposed read-only. Writes are disallowed as
	 * they would impact the host TSC.  If the guest does a write,
	 * the "use TSC offsetting" execution control is enabled and the
	 * difference between the host TSC and the guest TSC is written
	 * into the TSC offset in the VMCS.
	 */
	guest_msr_rw(vmx, vcpuid, MSR_GSBASE);
	guest_msr_rw(vmx, vcpuid, MSR_FSBASE);
	guest_msr_rw(vmx, vcpuid, MSR_SYSENTER_CS_MSR);
	guest_msr_rw(vmx, vcpuid, MSR_SYSENTER_ESP_MSR);
	guest_msr_rw(vmx, vcpuid, MSR_SYSENTER_EIP_MSR);
	guest_msr_rw(vmx, vcpuid, MSR_EFER);
	guest_msr_ro(vmx, vcpuid, MSR_TSC);

	/*
	 * The guest may have direct access to these MSRs as they are
	 * saved/restored in vmx_msr_guest_enter() and vmx_msr_guest_exit().
	 */
	guest_msr_rw(vmx, vcpuid, MSR_LSTAR);
	guest_msr_rw(vmx, vcpuid, MSR_CSTAR);
	guest_msr_rw(vmx, vcpuid, MSR_STAR);
	guest_msr_rw(vmx, vcpuid, MSR_SF_MASK);
	guest_msr_rw(vmx, vcpuid, MSR_KGSBASE);

	/*
	 * Initialize guest IA32_PAT MSR with the default value after reset.
	 */
	guest_msrs[IDX_MSR_PAT] = PAT_VALUE(0, PAT_WRITE_BACK) |
	    PAT_VALUE(1, PAT_WRITE_THROUGH)	|
	    PAT_VALUE(2, PAT_UNCACHED)		|
	    PAT_VALUE(3, PAT_UNCACHEABLE)	|
	    PAT_VALUE(4, PAT_WRITE_BACK)	|
	    PAT_VALUE(5, PAT_WRITE_THROUGH)	|
	    PAT_VALUE(6, PAT_UNCACHED)		|
	    PAT_VALUE(7, PAT_UNCACHEABLE);
}

void
vmx_msr_guest_enter(struct vmx *vmx, int vcpuid)
{
	uint64_t *guest_msrs = vmx->guest_msrs[vcpuid];
	uint64_t *host_msrs = vmx->host_msrs[vcpuid];

	/* Save host MSRs */
	host_msrs[IDX_MSR_LSTAR] = rdmsr(MSR_LSTAR);
	host_msrs[IDX_MSR_CSTAR] = rdmsr(MSR_CSTAR);
	host_msrs[IDX_MSR_STAR] = rdmsr(MSR_STAR);
	host_msrs[IDX_MSR_SF_MASK] = rdmsr(MSR_SF_MASK);

	/* Restore guest MSRs */
	wrmsr(MSR_LSTAR, guest_msrs[IDX_MSR_LSTAR]);
	wrmsr(MSR_CSTAR, guest_msrs[IDX_MSR_CSTAR]);
	wrmsr(MSR_STAR, guest_msrs[IDX_MSR_STAR]);
	wrmsr(MSR_SF_MASK, guest_msrs[IDX_MSR_SF_MASK]);
	wrmsr(MSR_KGSBASE, guest_msrs[IDX_MSR_KGSBASE]);
}

void
vmx_msr_guest_exit(struct vmx *vmx, int vcpuid)
{
	uint64_t *guest_msrs = vmx->guest_msrs[vcpuid];
	uint64_t *host_msrs = vmx->host_msrs[vcpuid];

	/* Save guest MSRs */
	guest_msrs[IDX_MSR_LSTAR] = rdmsr(MSR_LSTAR);
	guest_msrs[IDX_MSR_CSTAR] = rdmsr(MSR_CSTAR);
	guest_msrs[IDX_MSR_STAR] = rdmsr(MSR_STAR);
	guest_msrs[IDX_MSR_SF_MASK] = rdmsr(MSR_SF_MASK);
	guest_msrs[IDX_MSR_KGSBASE] = rdmsr(MSR_KGSBASE);

	/* Restore host MSRs */
	wrmsr(MSR_LSTAR, host_msrs[IDX_MSR_LSTAR]);
	wrmsr(MSR_CSTAR, host_msrs[IDX_MSR_CSTAR]);
	wrmsr(MSR_STAR, host_msrs[IDX_MSR_STAR]);
	wrmsr(MSR_SF_MASK, host_msrs[IDX_MSR_SF_MASK]);

	/* MSR_KGSBASE will be restored on the way back to userspace */
}

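/*
 * Emulate guest reads of intercepted MSRs.  MSRs not recognized here are
 * reported back as VMR_UNHANLDED so the caller can decide how to complete
 * the access (for example, by punting it to userspace).
 */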
vm_msr_result_t
vmx_rdmsr(struct vmx *vmx, int vcpuid, uint32_t num, uint64_t *val)
{
	const uint64_t *guest_msrs = vmx->guest_msrs[vcpuid];

	switch (num) {
	case MSR_IA32_FEATURE_CONTROL:
		/*
		 * We currently don't support SGX in guests, so always
		 * report those features as disabled with the MSR locked
		 * so the guest won't attempt to write to it.
		 */
		*val = IA32_FEATURE_CONTROL_LOCK;
		break;
	case MSR_IA32_MISC_ENABLE:
		*val = misc_enable;
		break;
	case MSR_PLATFORM_INFO:
		*val = platform_info;
		break;
	case MSR_TURBO_RATIO_LIMIT:
	case MSR_TURBO_RATIO_LIMIT1:
		*val = turbo_ratio_limit;
		break;
	case MSR_PAT:
		*val = guest_msrs[IDX_MSR_PAT];
		break;
	default:
		return (VMR_UNHANLDED);
	}
	return (VMR_OK);
}

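/*
 * Emulate guest writes to intercepted MSRs.  Returning VMR_GP indicates
 * that the write is invalid and should raise #GP in the guest, while
 * VMR_UNHANLDED leaves the disposition of the access to the caller.
 */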
vm_msr_result_t
vmx_wrmsr(struct vmx *vmx, int vcpuid, uint32_t num, uint64_t val)
{
	uint64_t *guest_msrs = vmx->guest_msrs[vcpuid];
	uint64_t changed;

	switch (num) {
	case MSR_IA32_MISC_ENABLE:
		changed = val ^ misc_enable;
		/*
		 * If the host has disabled the NX feature then the guest
		 * also cannot use it. However, a Linux guest will try to
		 * enable the NX feature by clearing the "XD Bit Disable"
		 * bit (bit 34) in the MISC_ENABLE MSR.
		 *
		 * This can be safely ignored because the memory management
		 * code looks at CPUID.80000001H:EDX.NX to check if the
		 * functionality is actually enabled.
		 */
		changed &= ~(1UL << 34);

		/*
		 * Punt to userspace if any other bits are being modified.
		 */
		if (changed) {
			return (VMR_UNHANLDED);
		}
		break;
	case MSR_PAT:
		if (!pat_valid(val)) {
			return (VMR_GP);
		}
		guest_msrs[IDX_MSR_PAT] = val;
		break;
	default:
		return (VMR_UNHANLDED);
	}

	return (VMR_OK);
}
494