/*-
 * SPDX-License-Identifier: BSD-2-Clause
 *
 * Copyright (c) 2011 NetApp, Inc.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/cdefs.h>
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/proc.h>

#include <machine/clock.h>
#include <machine/cpufunc.h>
#include <machine/md_var.h>
#include <machine/pcb.h>
#include <machine/specialreg.h>
#include <machine/vmm.h>

#include "vmx.h"
#include "vmx_msr.h"
#include "x86.h"
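
/*
 * Each VMX capability MSR encodes a 32-bit VMCS control field twice
 * (Intel SDM, Appendix A.3): bits 31:0 report the allowed 0-settings
 * (control bit X may be cleared only if bit X here is 0) and bits 63:32
 * report the allowed 1-settings (control bit X may be set only if bit
 * X + 32 here is 1). The two helpers below encapsulate those tests.
 */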
static bool
vmx_ctl_allows_one_setting(uint64_t msr_val, int bitpos)
{

	return ((msr_val & (1UL << (bitpos + 32))) != 0);
}

static bool
vmx_ctl_allows_zero_setting(uint64_t msr_val, int bitpos)
{

	return ((msr_val & (1UL << bitpos)) == 0);
}

uint32_t
vmx_revision(void)
{

	return (rdmsr(MSR_VMX_BASIC) & 0xffffffff);
}

/*
 * Generate a bitmask to be used for the VMCS execution control fields.
 *
 * The caller specifies what bits should be set to one in 'ones_mask'
 * and what bits should be set to zero in 'zeros_mask'. The don't-care
 * bits are set to the default value. The default values are obtained
 * based on "Algorithm 3" in Section 27.5.1 "Algorithms for Determining
 * VMX Capabilities".
 *
 * Returns zero on success and non-zero on error.
 */
int
vmx_set_ctlreg(int ctl_reg, int true_ctl_reg, uint32_t ones_mask,
    uint32_t zeros_mask, uint32_t *retval)
{
	int i;
	uint64_t val, trueval;
	bool true_ctls_avail, one_allowed, zero_allowed;

	/* We cannot ask the same bit to be set to both '1' and '0' */
	if ((ones_mask ^ zeros_mask) != (ones_mask | zeros_mask))
		return (EINVAL);

	true_ctls_avail = (rdmsr(MSR_VMX_BASIC) & (1UL << 55)) != 0;

	val = rdmsr(ctl_reg);
	if (true_ctls_avail)
		trueval = rdmsr(true_ctl_reg);		/* step c */
	else
		trueval = val;				/* step a */

	for (i = 0; i < 32; i++) {
		one_allowed = vmx_ctl_allows_one_setting(trueval, i);
		zero_allowed = vmx_ctl_allows_zero_setting(trueval, i);

		KASSERT(one_allowed || zero_allowed,
		    ("invalid zero/one setting for bit %d of ctl 0x%0x, "
		    "truectl 0x%0x\n", i, ctl_reg, true_ctl_reg));

		if (zero_allowed && !one_allowed) {		/* b(i),c(i) */
			if (ones_mask & (1 << i))
				return (EINVAL);
			*retval &= ~(1 << i);
		} else if (one_allowed && !zero_allowed) {	/* b(i),c(i) */
			if (zeros_mask & (1 << i))
				return (EINVAL);
			*retval |= 1 << i;
		} else {
			if (zeros_mask & (1 << i))	/* b(ii),c(ii) */
				*retval &= ~(1 << i);
			else if (ones_mask & (1 << i))	/* b(ii), c(ii) */
				*retval |= 1 << i;
			else if (!true_ctls_avail)
				*retval &= ~(1 << i);	/* b(iii) */
			else if (vmx_ctl_allows_zero_setting(val, i))/* c(iii)*/
				*retval &= ~(1 << i);
			else if (vmx_ctl_allows_one_setting(val, i)) /* c(iv) */
				*retval |= 1 << i;
			else {
				panic("vmx_set_ctlreg: unable to determine "
				    "correct value of ctl bit %d for msr "
				    "0x%0x and true msr 0x%0x", i, ctl_reg,
				    true_ctl_reg);
			}
		}
	}

	return (0);
}
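
/*
 * An illustrative sketch (not compiled here) of how a caller might
 * compute the processor-based execution controls with this function;
 * the ONE_SETTING/ZERO_SETTING masks are private to the caller and the
 * names below are assumptions for the example:
 *
 *	uint32_t procbased_ctls;
 *
 *	error = vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS,
 *	    MSR_VMX_TRUE_PROCBASED_CTLS, PROCBASED_CTLS_ONE_SETTING,
 *	    PROCBASED_CTLS_ZERO_SETTING, &procbased_ctls);
 */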
void
msr_bitmap_initialize(char *bitmap)
{

	/* Start with every read and write causing a VM-exit. */
	memset(bitmap, 0xff, PAGE_SIZE);
}

int
msr_bitmap_change_access(char *bitmap, u_int msr, int access)
{
	int byte, bit;

	/*
	 * The 4KB MSR bitmap is split into four 1KB quadrants: read
	 * bitmaps for the low (0x0 - 0x1FFF) and high (0xC0000000 -
	 * 0xC0001FFF) MSR ranges, followed by the corresponding write
	 * bitmaps. A clear bit permits the access; a set bit forces a
	 * VM-exit.
	 */
	if (msr <= 0x00001FFF)
		byte = msr / 8;
	else if (msr >= 0xC0000000 && msr <= 0xC0001FFF)
		byte = 1024 + (msr - 0xC0000000) / 8;
	else
		return (EINVAL);

	bit = msr & 0x7;

	if (access & MSR_BITMAP_ACCESS_READ)
		bitmap[byte] &= ~(1 << bit);
	else
		bitmap[byte] |= 1 << bit;

	/* The write bitmaps start 2KB into the page. */
	byte += 2048;
	if (access & MSR_BITMAP_ACCESS_WRITE)
		bitmap[byte] &= ~(1 << bit);
	else
		bitmap[byte] |= 1 << bit;

	return (0);
}

static uint64_t misc_enable;
static uint64_t platform_info;
static uint64_t turbo_ratio_limit;
static uint64_t host_msrs[GUEST_MSR_NUM];

static bool
nehalem_cpu(void)
{
	u_int family, model;

	/*
	 * The family:model numbers belonging to the Nehalem
	 * microarchitecture are documented in Section 35.5, Intel SDM
	 * dated Feb 2014.
	 */
	family = CPUID_TO_FAMILY(cpu_id);
	model = CPUID_TO_MODEL(cpu_id);
	if (family == 0x6) {
		switch (model) {
		case 0x1A:
		case 0x1E:
		case 0x1F:
		case 0x2E:
			return (true);
		default:
			break;
		}
	}
	return (false);
}

static bool
westmere_cpu(void)
{
	u_int family, model;

	/*
	 * The family:model numbers belonging to the Westmere
	 * microarchitecture are documented in Section 35.6, Intel SDM
	 * dated Feb 2014.
	 */
	family = CPUID_TO_FAMILY(cpu_id);
	model = CPUID_TO_MODEL(cpu_id);
	if (family == 0x6) {
		switch (model) {
		case 0x25:
		case 0x2C:
			return (true);
		default:
			break;
		}
	}
	return (false);
}
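
/*
 * Valid PAT memory-type encodings, from the Intel SDM table "Memory
 * Types That Can Be Encoded With PAT":
 *
 *	0x00	UC	(uncacheable)
 *	0x01	WC	(write combining)
 *	0x04	WT	(write through)
 *	0x05	WP	(write protected)
 *	0x06	WB	(write back)
 *	0x07	UC-	(uncached; may be overridden by WC in the MTRRs)
 *
 * Encodings 0x02, 0x03 and 0x08 through 0xFF are reserved, which is
 * exactly what pat_valid() below rejects.
 */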
static bool
pat_valid(uint64_t val)
{
	int i, pa;

	/*
	 * From Intel SDM: Table "Memory Types That Can Be Encoded With PAT"
	 *
	 * Extract PA0 through PA7 and validate that each one encodes a
	 * valid memory type.
	 */
	for (i = 0; i < 8; i++) {
		pa = (val >> (i * 8)) & 0xff;
		if (pa == 2 || pa == 3 || pa >= 8)
			return (false);
	}
	return (true);
}

void
vmx_msr_init(void)
{
	uint64_t bus_freq, ratio;
	int i;

	/*
	 * It is safe to cache the values of the following MSRs because
	 * they don't change based on curcpu, curproc or curthread.
	 */
	host_msrs[IDX_MSR_LSTAR] = rdmsr(MSR_LSTAR);
	host_msrs[IDX_MSR_CSTAR] = rdmsr(MSR_CSTAR);
	host_msrs[IDX_MSR_STAR] = rdmsr(MSR_STAR);
	host_msrs[IDX_MSR_SF_MASK] = rdmsr(MSR_SF_MASK);

	/*
	 * Initialize emulated MSRs
	 */
	misc_enable = rdmsr(MSR_IA32_MISC_ENABLE);
	/*
	 * Set mandatory bits
	 *  11:	branch trace disabled
	 *  12:	PEBS unavailable
	 * Clear unsupported features
	 *  16:	SpeedStep enable
	 *  18:	enable MONITOR FSM
	 */
	misc_enable |= (1 << 12) | (1 << 11);
	misc_enable &= ~((1 << 18) | (1 << 16));

	if (nehalem_cpu() || westmere_cpu())
		bus_freq = 133330000;		/* 133.33 MHz */
	else
		bus_freq = 100000000;		/* 100 MHz */

	/*
	 * XXXtime
	 * The ratio should really be based on the virtual TSC frequency as
	 * opposed to the host TSC.
	 *
	 * For example, a host TSC of 2.4 GHz with a 100 MHz bus yields a
	 * ratio of 24.
	 */
	ratio = (tsc_freq / bus_freq) & 0xff;

	/*
	 * The register definition is based on the micro-architecture
	 * but the following bits are always the same:
	 * [15:8]  Maximum Non-Turbo Ratio
	 * [28]    Programmable Ratio Limit for Turbo Mode
	 * [29]    Programmable TDC-TDP Limit for Turbo Mode
	 * [47:40] Maximum Efficiency Ratio
	 *
	 * The other bits can be safely set to 0 on all
	 * micro-architectures up to Haswell.
	 */
	platform_info = (ratio << 8) | (ratio << 40);

	/*
	 * The number of valid bits in the MSR_TURBO_RATIO_LIMITx register is
	 * dependent on the maximum cores per package supported by the micro-
	 * architecture. For example, Westmere supports 6 cores per package
	 * and uses the low 48 bits, while Sandy Bridge supports 8 cores per
	 * package and uses all 64 bits.
	 *
	 * However, the unused bits are reserved so we pretend that all bits
	 * in this MSR are valid.
	 */
	for (i = 0; i < 8; i++)
		turbo_ratio_limit = (turbo_ratio_limit << 8) | ratio;
}

void
vmx_msr_guest_init(struct vmx *vmx, struct vmx_vcpu *vcpu)
{
	/*
	 * The permissions bitmap is shared between all vcpus so initialize it
	 * once when initializing the vBSP.
	 */
	if (vcpu->vcpuid == 0) {
		guest_msr_rw(vmx, MSR_LSTAR);
		guest_msr_rw(vmx, MSR_CSTAR);
		guest_msr_rw(vmx, MSR_STAR);
		guest_msr_rw(vmx, MSR_SF_MASK);
		guest_msr_rw(vmx, MSR_KGSBASE);
	}

	/*
	 * Initialize guest IA32_PAT MSR with default value after reset.
	 */
	vcpu->guest_msrs[IDX_MSR_PAT] = PAT_VALUE(0, PAT_WRITE_BACK) |
	    PAT_VALUE(1, PAT_WRITE_THROUGH)	|
	    PAT_VALUE(2, PAT_UNCACHED)		|
	    PAT_VALUE(3, PAT_UNCACHEABLE)	|
	    PAT_VALUE(4, PAT_WRITE_BACK)	|
	    PAT_VALUE(5, PAT_WRITE_THROUGH)	|
	    PAT_VALUE(6, PAT_UNCACHED)		|
	    PAT_VALUE(7, PAT_UNCACHEABLE);

	return;
}

void
vmx_msr_guest_enter(struct vmx_vcpu *vcpu)
{

	/* Save host MSRs (in particular, KGSBASE) and restore guest MSRs */
	update_pcb_bases(curpcb);
	wrmsr(MSR_LSTAR, vcpu->guest_msrs[IDX_MSR_LSTAR]);
	wrmsr(MSR_CSTAR, vcpu->guest_msrs[IDX_MSR_CSTAR]);
	wrmsr(MSR_STAR, vcpu->guest_msrs[IDX_MSR_STAR]);
	wrmsr(MSR_SF_MASK, vcpu->guest_msrs[IDX_MSR_SF_MASK]);
	wrmsr(MSR_KGSBASE, vcpu->guest_msrs[IDX_MSR_KGSBASE]);
}

void
vmx_msr_guest_enter_tsc_aux(struct vmx *vmx, struct vmx_vcpu *vcpu)
{
	uint64_t guest_tsc_aux = vcpu->guest_msrs[IDX_MSR_TSC_AUX];
	uint32_t host_aux = cpu_auxmsr();

	if (vmx_have_msr_tsc_aux && guest_tsc_aux != host_aux)
		wrmsr(MSR_TSC_AUX, guest_tsc_aux);
}

void
vmx_msr_guest_exit(struct vmx_vcpu *vcpu)
{

	/* Save guest MSRs */
	vcpu->guest_msrs[IDX_MSR_LSTAR] = rdmsr(MSR_LSTAR);
	vcpu->guest_msrs[IDX_MSR_CSTAR] = rdmsr(MSR_CSTAR);
	vcpu->guest_msrs[IDX_MSR_STAR] = rdmsr(MSR_STAR);
	vcpu->guest_msrs[IDX_MSR_SF_MASK] = rdmsr(MSR_SF_MASK);
	vcpu->guest_msrs[IDX_MSR_KGSBASE] = rdmsr(MSR_KGSBASE);

	/* Restore host MSRs */
	wrmsr(MSR_LSTAR, host_msrs[IDX_MSR_LSTAR]);
	wrmsr(MSR_CSTAR, host_msrs[IDX_MSR_CSTAR]);
	wrmsr(MSR_STAR, host_msrs[IDX_MSR_STAR]);
	wrmsr(MSR_SF_MASK, host_msrs[IDX_MSR_SF_MASK]);

	/* MSR_KGSBASE will be restored on the way back to userspace */
}

void
vmx_msr_guest_exit_tsc_aux(struct vmx *vmx, struct vmx_vcpu *vcpu)
{
	uint64_t guest_tsc_aux = vcpu->guest_msrs[IDX_MSR_TSC_AUX];
	uint32_t host_aux = cpu_auxmsr();

	if (vmx_have_msr_tsc_aux && guest_tsc_aux != host_aux)
		/*
		 * Note that it is not necessary to save the guest value
		 * here; vcpu->guest_msrs[IDX_MSR_TSC_AUX] always
		 * contains the current value since it is updated whenever
		 * the guest writes to it (which is expected to be very
		 * rare).
		 */
		wrmsr(MSR_TSC_AUX, host_aux);
}
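
/*
 * RDMSR and WRMSR exits for MSRs that are not passed through via the
 * permission bitmap are emulated by vmx_rdmsr() and vmx_wrmsr() below.
 * Returning an error from either is expected to punt the exit to
 * userspace for further handling.
 */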
int
vmx_rdmsr(struct vmx_vcpu *vcpu, u_int num, uint64_t *val, bool *retu)
{
	int error;

	error = 0;

	switch (num) {
	case MSR_MCG_CAP:
	case MSR_MCG_STATUS:
		*val = 0;
		break;
	case MSR_MTRRcap:
	case MSR_MTRRdefType:
	case MSR_MTRR4kBase ... MSR_MTRR4kBase + 7:
	case MSR_MTRR16kBase ... MSR_MTRR16kBase + 1:
	case MSR_MTRR64kBase:
	case MSR_MTRRVarBase ... MSR_MTRRVarBase + (VMM_MTRR_VAR_MAX * 2) - 1:
		if (vm_rdmtrr(&vcpu->mtrr, num, val) != 0) {
			vm_inject_gp(vcpu->vcpu);
		}
		break;
	case MSR_IA32_MISC_ENABLE:
		*val = misc_enable;
		break;
	case MSR_PLATFORM_INFO:
		*val = platform_info;
		break;
	case MSR_TURBO_RATIO_LIMIT:
	case MSR_TURBO_RATIO_LIMIT1:
		*val = turbo_ratio_limit;
		break;
	case MSR_PAT:
		*val = vcpu->guest_msrs[IDX_MSR_PAT];
		break;
	default:
		error = EINVAL;
		break;
	}
	return (error);
}

int
vmx_wrmsr(struct vmx_vcpu *vcpu, u_int num, uint64_t val, bool *retu)
{
	uint64_t changed;
	int error;

	error = 0;

	switch (num) {
	case MSR_MCG_CAP:
	case MSR_MCG_STATUS:
		break;		/* ignore writes */
	case MSR_MTRRcap:
	case MSR_MTRRdefType:
	case MSR_MTRR4kBase ... MSR_MTRR4kBase + 7:
	case MSR_MTRR16kBase ... MSR_MTRR16kBase + 1:
	case MSR_MTRR64kBase:
	case MSR_MTRRVarBase ... MSR_MTRRVarBase + (VMM_MTRR_VAR_MAX * 2) - 1:
		if (vm_wrmtrr(&vcpu->mtrr, num, val) != 0) {
			vm_inject_gp(vcpu->vcpu);
		}
		break;
	case MSR_IA32_MISC_ENABLE:
		changed = val ^ misc_enable;
		/*
		 * If the host has disabled the NX feature then the guest
		 * also cannot use it. However, a Linux guest will try to
		 * enable the NX feature by writing to the MISC_ENABLE MSR.
		 *
		 * This can be safely ignored because the memory management
		 * code looks at CPUID.80000001H:EDX.NX to check if the
		 * functionality is actually enabled.
		 */
		changed &= ~(1UL << 34);

		/*
		 * Punt to userspace if any other bits are being modified.
		 */
		if (changed)
			error = EINVAL;

		break;
	case MSR_PAT:
		if (pat_valid(val))
			vcpu->guest_msrs[IDX_MSR_PAT] = val;
		else
			vm_inject_gp(vcpu->vcpu);
		break;
	case MSR_TSC:
		error = vmx_set_tsc_offset(vcpu, val - rdtsc());
		break;
	case MSR_TSC_AUX:
		if (vmx_have_msr_tsc_aux)
			/*
			 * vmx_msr_guest_enter_tsc_aux() will apply this
			 * value when it is called immediately before guest
			 * entry.
			 */
			vcpu->guest_msrs[IDX_MSR_TSC_AUX] = val;
		else
			vm_inject_gp(vcpu->vcpu);
		break;
	default:
		error = EINVAL;
		break;
	}

	return (error);
}