/*-
 * SPDX-License-Identifier: BSD-2-Clause
 *
 * Copyright (c) 2011 NetApp, Inc.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/proc.h>

#include <machine/clock.h>
#include <machine/cpufunc.h>
#include <machine/md_var.h>
#include <machine/pcb.h>
#include <machine/specialreg.h>
#include <machine/vmm.h>

#include "vmx.h"
#include "vmx_msr.h"
#include "x86.h"

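/*
 * Each VMX capability MSR describes one 32-bit control field: bits 31:0
 * report the allowed 0-settings (a control bit may be 0 only if its bit in
 * the low dword is 0) and bits 63:32 report the allowed 1-settings (a
 * control bit may be 1 only if its bit in the high dword is 1). See the
 * Intel SDM, Appendix "VMX Capability Reporting Facility".
 */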
static bool
vmx_ctl_allows_one_setting(uint64_t msr_val, int bitpos)
{

	return ((msr_val & (1UL << (bitpos + 32))) != 0);
}

static bool
vmx_ctl_allows_zero_setting(uint64_t msr_val, int bitpos)
{

	return ((msr_val & (1UL << bitpos)) == 0);
}

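/*
 * The VMCS revision identifier is reported in the low 32 bits of
 * IA32_VMX_BASIC (bit 31 of which always reads as zero).
 */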
uint32_t
vmx_revision(void)
{

	return (rdmsr(MSR_VMX_BASIC) & 0xffffffff);
}

/*
 * Generate a bitmask to be used for the VMCS execution control fields.
 *
 * The caller specifies what bits should be set to one in 'ones_mask'
 * and what bits should be set to zero in 'zeros_mask'. The don't-care
 * bits are set to the default value. The default values are obtained
 * based on "Algorithm 3" in Section 27.5.1 "Algorithms for Determining
 * VMX Capabilities".
 *
 * Returns zero on success and non-zero on error.
 */
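/*
 * For example, a hypothetical caller that must have MSR bitmaps enabled
 * and CR3-load exiting disabled in the primary processor-based controls
 * could use:
 *
 *	error = vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS,
 *	    MSR_VMX_TRUE_PROCBASED_CTLS, PROCBASED_MSR_BITMAPS,
 *	    PROCBASED_CR3_LOAD_EXITING, &procbased_ctls);
 */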
int
vmx_set_ctlreg(int ctl_reg, int true_ctl_reg, uint32_t ones_mask,
    uint32_t zeros_mask, uint32_t *retval)
{
	int i;
	uint64_t val, trueval;
	bool true_ctls_avail, one_allowed, zero_allowed;

	/* We cannot ask the same bit to be set to both '1' and '0' */
	if ((ones_mask ^ zeros_mask) != (ones_mask | zeros_mask))
		return (EINVAL);

	true_ctls_avail = (rdmsr(MSR_VMX_BASIC) & (1UL << 55)) != 0;

	val = rdmsr(ctl_reg);
	if (true_ctls_avail)
		trueval = rdmsr(true_ctl_reg);	/* step c */
	else
		trueval = val;			/* step a */

	for (i = 0; i < 32; i++) {
		one_allowed = vmx_ctl_allows_one_setting(trueval, i);
		zero_allowed = vmx_ctl_allows_zero_setting(trueval, i);

		KASSERT(one_allowed || zero_allowed,
		    ("invalid zero/one setting for bit %d of ctl 0x%0x, "
		    "truectl 0x%0x\n", i, ctl_reg, true_ctl_reg));

		if (zero_allowed && !one_allowed) {		/* b(i),c(i) */
			if (ones_mask & (1 << i))
				return (EINVAL);
			*retval &= ~(1 << i);
		} else if (one_allowed && !zero_allowed) {	/* b(i),c(i) */
			if (zeros_mask & (1 << i))
				return (EINVAL);
			*retval |= 1 << i;
		} else {
			if (zeros_mask & (1 << i))	/* b(ii),c(ii) */
				*retval &= ~(1 << i);
			else if (ones_mask & (1 << i))	/* b(ii), c(ii) */
				*retval |= 1 << i;
			else if (!true_ctls_avail)
				*retval &= ~(1 << i);	/* b(iii) */
			else if (vmx_ctl_allows_zero_setting(val, i))/* c(iii)*/
				*retval &= ~(1 << i);
			else if (vmx_ctl_allows_one_setting(val, i)) /* c(iv) */
				*retval |= 1 << i;
			else {
				panic("vmx_set_ctlreg: unable to determine "
				    "correct value of ctl bit %d for msr "
				    "0x%0x and true msr 0x%0x", i, ctl_reg,
				    true_ctl_reg);
			}
		}
	}

	return (0);
}

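/*
 * Start out by trapping every MSR access; individual MSRs are opened up
 * for direct guest access later via msr_bitmap_change_access().
 */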
void
msr_bitmap_initialize(char *bitmap)
{

	memset(bitmap, 0xff, PAGE_SIZE);
}

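/*
 * The 4KB bitmap is laid out as specified by the Intel SDM: the first 1KB
 * covers reads of MSRs 0x00000000-0x00001FFF, the second 1KB covers reads
 * of MSRs 0xC0000000-0xC0001FFF, and the third and fourth 1KB regions hold
 * the corresponding write bitmaps. A set bit forces a VM-exit on the
 * access; a clear bit allows it.
 */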
int
msr_bitmap_change_access(char *bitmap, u_int msr, int access)
{
	int byte, bit;

	if (msr <= 0x00001FFF)
		byte = msr / 8;
	else if (msr >= 0xC0000000 && msr <= 0xC0001FFF)
		byte = 1024 + (msr - 0xC0000000) / 8;
	else
		return (EINVAL);

	bit = msr & 0x7;

	if (access & MSR_BITMAP_ACCESS_READ)
		bitmap[byte] &= ~(1 << bit);
	else
		bitmap[byte] |= 1 << bit;

	byte += 2048;
	if (access & MSR_BITMAP_ACCESS_WRITE)
		bitmap[byte] &= ~(1 << bit);
	else
		bitmap[byte] |= 1 << bit;

	return (0);
}

static uint64_t misc_enable;
static uint64_t platform_info;
static uint64_t turbo_ratio_limit;
static uint64_t host_msrs[GUEST_MSR_NUM];

static bool
nehalem_cpu(void)
{
	u_int family, model;

	/*
	 * The family:model numbers belonging to the Nehalem microarchitecture
	 * are documented in Section 35.5, Intel SDM dated Feb 2014.
	 */
	family = CPUID_TO_FAMILY(cpu_id);
	model = CPUID_TO_MODEL(cpu_id);
	if (family == 0x6) {
		switch (model) {
		case 0x1A:
		case 0x1E:
		case 0x1F:
		case 0x2E:
			return (true);
		default:
			break;
		}
	}
	return (false);
}

static bool
westmere_cpu(void)
{
	u_int family, model;

	/*
	 * The family:model numbers belonging to the Westmere microarchitecture
	 * are documented in Section 35.6, Intel SDM dated Feb 2014.
	 */
	family = CPUID_TO_FAMILY(cpu_id);
	model = CPUID_TO_MODEL(cpu_id);
	if (family == 0x6) {
		switch (model) {
		case 0x25:
		case 0x2C:
			return (true);
		default:
			break;
		}
	}
	return (false);
}

static bool
pat_valid(uint64_t val)
{
	int i, pa;

	/*
	 * From Intel SDM: Table "Memory Types That Can Be Encoded With PAT"
	 *
	 * Extract PA0 through PA7 and validate that each one encodes a
	 * valid memory type.
	 */
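	/*
	 * The valid encodings are 0 (UC), 1 (WC), 4 (WT), 5 (WP), 6 (WB)
	 * and 7 (UC-); 2, 3 and anything above 7 are reserved.
	 */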
	for (i = 0; i < 8; i++) {
		pa = (val >> (i * 8)) & 0xff;
		if (pa == 2 || pa == 3 || pa >= 8)
			return (false);
	}
	return (true);
}

void
vmx_msr_init(void)
{
	uint64_t bus_freq, ratio;
	int i;

	/*
	 * It is safe to cache the values of the following MSRs because
	 * they don't change based on curcpu, curproc or curthread.
	 */
	host_msrs[IDX_MSR_LSTAR] = rdmsr(MSR_LSTAR);
	host_msrs[IDX_MSR_CSTAR] = rdmsr(MSR_CSTAR);
	host_msrs[IDX_MSR_STAR] = rdmsr(MSR_STAR);
	host_msrs[IDX_MSR_SF_MASK] = rdmsr(MSR_SF_MASK);

	/*
	 * Initialize emulated MSRs.
	 */
	misc_enable = rdmsr(MSR_IA32_MISC_ENABLE);
	/*
	 * Set mandatory bits:
	 *  11:	branch trace disabled
	 *  12:	PEBS unavailable
	 * Clear unsupported features:
	 *  16:	SpeedStep enable
	 *  18:	enable MONITOR FSM
	 */
	misc_enable |= (1 << 12) | (1 << 11);
	misc_enable &= ~((1 << 18) | (1 << 16));

	if (nehalem_cpu() || westmere_cpu())
		bus_freq = 133330000;		/* 133 MHz */
	else
		bus_freq = 100000000;		/* 100 MHz */

	/*
	 * XXXtime
	 * The ratio should really be based on the virtual TSC frequency as
	 * opposed to the host TSC.
	 */
	ratio = (tsc_freq / bus_freq) & 0xff;

	/*
	 * The register definition is based on the micro-architecture
	 * but the following bits are always the same:
	 * [15:8]  Maximum Non-Turbo Ratio
	 * [28]    Programmable Ratio Limit for Turbo Mode
	 * [29]    Programmable TDC-TDP Limit for Turbo Mode
	 * [47:40] Maximum Efficiency Ratio
	 *
	 * The other bits can be safely set to 0 on all
	 * micro-architectures up to Haswell.
	 */
	platform_info = (ratio << 8) | (ratio << 40);

	/*
	 * The number of valid bits in the MSR_TURBO_RATIO_LIMITx register is
	 * dependent on the maximum cores per package supported by the micro-
	 * architecture. For example, Westmere supports 6 cores per package
	 * and uses the low 48 bits, while Sandy Bridge supports 8 cores per
	 * package and uses all 64 bits.
	 *
	 * However, the unused bits are reserved so we pretend that all bits
	 * in this MSR are valid.
	 */
	for (i = 0; i < 8; i++)
		turbo_ratio_limit = (turbo_ratio_limit << 8) | ratio;
}

void
vmx_msr_guest_init(struct vmx *vmx, struct vmx_vcpu *vcpu)
{
	/*
	 * The permissions bitmap is shared between all vcpus so initialize it
	 * once when initializing the vBSP.
	 */
	if (vcpu->vcpuid == 0) {
		guest_msr_rw(vmx, MSR_LSTAR);
		guest_msr_rw(vmx, MSR_CSTAR);
		guest_msr_rw(vmx, MSR_STAR);
		guest_msr_rw(vmx, MSR_SF_MASK);
		guest_msr_rw(vmx, MSR_KGSBASE);
	}

	/*
	 * Initialize the guest IA32_PAT MSR with the default value after
	 * reset.
	 */
	vcpu->guest_msrs[IDX_MSR_PAT] = PAT_VALUE(0, PAT_WRITE_BACK) |
	    PAT_VALUE(1, PAT_WRITE_THROUGH) |
	    PAT_VALUE(2, PAT_UNCACHED) |
	    PAT_VALUE(3, PAT_UNCACHEABLE) |
	    PAT_VALUE(4, PAT_WRITE_BACK) |
	    PAT_VALUE(5, PAT_WRITE_THROUGH) |
	    PAT_VALUE(6, PAT_UNCACHED) |
	    PAT_VALUE(7, PAT_UNCACHEABLE);
}

void
vmx_msr_guest_enter(struct vmx_vcpu *vcpu)
{

	/* Save host MSRs (in particular, KGSBASE) and restore guest MSRs */
	update_pcb_bases(curpcb);
	wrmsr(MSR_LSTAR, vcpu->guest_msrs[IDX_MSR_LSTAR]);
	wrmsr(MSR_CSTAR, vcpu->guest_msrs[IDX_MSR_CSTAR]);
	wrmsr(MSR_STAR, vcpu->guest_msrs[IDX_MSR_STAR]);
	wrmsr(MSR_SF_MASK, vcpu->guest_msrs[IDX_MSR_SF_MASK]);
	wrmsr(MSR_KGSBASE, vcpu->guest_msrs[IDX_MSR_KGSBASE]);
}

void
vmx_msr_guest_enter_tsc_aux(struct vmx *vmx, struct vmx_vcpu *vcpu)
{
	uint64_t guest_tsc_aux = vcpu->guest_msrs[IDX_MSR_TSC_AUX];
	uint32_t host_aux = cpu_auxmsr();

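	/*
	 * MSR_TSC_AUX holds the value returned by RDTSCP (and RDPID).
	 * Skip the WRMSR when the guest's value already matches the
	 * host's, which is the common case.
	 */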
	if (vmx_have_msr_tsc_aux && guest_tsc_aux != host_aux)
		wrmsr(MSR_TSC_AUX, guest_tsc_aux);
}

void
vmx_msr_guest_exit(struct vmx_vcpu *vcpu)
{

	/* Save guest MSRs */
	vcpu->guest_msrs[IDX_MSR_LSTAR] = rdmsr(MSR_LSTAR);
	vcpu->guest_msrs[IDX_MSR_CSTAR] = rdmsr(MSR_CSTAR);
	vcpu->guest_msrs[IDX_MSR_STAR] = rdmsr(MSR_STAR);
	vcpu->guest_msrs[IDX_MSR_SF_MASK] = rdmsr(MSR_SF_MASK);
	vcpu->guest_msrs[IDX_MSR_KGSBASE] = rdmsr(MSR_KGSBASE);

	/* Restore host MSRs */
	wrmsr(MSR_LSTAR, host_msrs[IDX_MSR_LSTAR]);
	wrmsr(MSR_CSTAR, host_msrs[IDX_MSR_CSTAR]);
	wrmsr(MSR_STAR, host_msrs[IDX_MSR_STAR]);
	wrmsr(MSR_SF_MASK, host_msrs[IDX_MSR_SF_MASK]);

	/* MSR_KGSBASE will be restored on the way back to userspace */
}

void
vmx_msr_guest_exit_tsc_aux(struct vmx *vmx, struct vmx_vcpu *vcpu)
{
	uint64_t guest_tsc_aux = vcpu->guest_msrs[IDX_MSR_TSC_AUX];
	uint32_t host_aux = cpu_auxmsr();

	if (vmx_have_msr_tsc_aux && guest_tsc_aux != host_aux)
		/*
		 * Note that it is not necessary to save the guest value
		 * here; vcpu->guest_msrs[IDX_MSR_TSC_AUX] always
		 * contains the current value since it is updated whenever
		 * the guest writes to it (which is expected to be very
		 * rare).
		 */
		wrmsr(MSR_TSC_AUX, host_aux);
}

int
vmx_rdmsr(struct vmx_vcpu *vcpu, u_int num, uint64_t *val, bool *retu)
{
	int error;

	error = 0;

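	/*
	 * MSRs not handled below return EINVAL, which the caller typically
	 * reports to userspace for further handling.
	 */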
	switch (num) {
	case MSR_MCG_CAP:
	case MSR_MCG_STATUS:
		*val = 0;
		break;
	case MSR_MTRRcap:
	case MSR_MTRRdefType:
	case MSR_MTRR4kBase ... MSR_MTRR4kBase + 7:
	case MSR_MTRR16kBase ... MSR_MTRR16kBase + 1:
	case MSR_MTRR64kBase:
	case MSR_MTRRVarBase ... MSR_MTRRVarBase + (VMM_MTRR_VAR_MAX * 2) - 1:
		if (vm_rdmtrr(&vcpu->mtrr, num, val) != 0) {
			vm_inject_gp(vcpu->vcpu);
		}
		break;
	case MSR_IA32_MISC_ENABLE:
		*val = misc_enable;
		break;
	case MSR_PLATFORM_INFO:
		*val = platform_info;
		break;
	case MSR_TURBO_RATIO_LIMIT:
	case MSR_TURBO_RATIO_LIMIT1:
		*val = turbo_ratio_limit;
		break;
	case MSR_PAT:
		*val = vcpu->guest_msrs[IDX_MSR_PAT];
		break;
	default:
		error = EINVAL;
		break;
	}
	return (error);
}

int
vmx_wrmsr(struct vmx_vcpu *vcpu, u_int num, uint64_t val, bool *retu)
{
	uint64_t changed;
	int error;

	error = 0;

	switch (num) {
	case MSR_MCG_CAP:
	case MSR_MCG_STATUS:
		break;		/* ignore writes */
	case MSR_MTRRcap:
	case MSR_MTRRdefType:
	case MSR_MTRR4kBase ... MSR_MTRR4kBase + 7:
	case MSR_MTRR16kBase ... MSR_MTRR16kBase + 1:
	case MSR_MTRR64kBase:
	case MSR_MTRRVarBase ... MSR_MTRRVarBase + (VMM_MTRR_VAR_MAX * 2) - 1:
		if (vm_wrmtrr(&vcpu->mtrr, num, val) != 0) {
			vm_inject_gp(vcpu->vcpu);
		}
		break;
	case MSR_IA32_MISC_ENABLE:
		changed = val ^ misc_enable;
		/*
		 * If the host has disabled the NX feature then the guest
		 * also cannot use it. However, a Linux guest will try to
		 * enable the NX feature by writing to the MISC_ENABLE MSR.
		 *
		 * This can be safely ignored because the memory management
		 * code looks at CPUID.80000001H:EDX.NX to check if the
		 * functionality is actually enabled.
		 */
		changed &= ~(1UL << 34);

		/*
		 * Punt to userspace if any other bits are being modified.
		 */
		if (changed)
			error = EINVAL;

		break;
	case MSR_PAT:
		if (pat_valid(val))
			vcpu->guest_msrs[IDX_MSR_PAT] = val;
		else
			vm_inject_gp(vcpu->vcpu);
		break;
	case MSR_TSC:
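		/*
		 * Writes to the TSC are emulated by adjusting the VMCS
		 * TSC offset so that the guest reads back the value it
		 * wrote.
		 */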
		error = vmx_set_tsc_offset(vcpu, val - rdtsc());
		break;
	case MSR_TSC_AUX:
		if (vmx_have_msr_tsc_aux)
			/*
			 * vmx_msr_guest_enter_tsc_aux() will apply this
			 * value when it is called immediately before guest
			 * entry.
			 */
			vcpu->guest_msrs[IDX_MSR_TSC_AUX] = val;
		else
			vm_inject_gp(vcpu->vcpu);
		break;
	default:
		error = EINVAL;
		break;
	}

	return (error);
}