1 /*- 2 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD 3 * 4 * Copyright (c) 2011 NetApp, Inc. 5 * All rights reserved. 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions 9 * are met: 10 * 1. Redistributions of source code must retain the above copyright 11 * notice, this list of conditions and the following disclaimer. 12 * 2. Redistributions in binary form must reproduce the above copyright 13 * notice, this list of conditions and the following disclaimer in the 14 * documentation and/or other materials provided with the distribution. 15 * 16 * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND 17 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 18 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 19 * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE 20 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 21 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 22 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 23 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 24 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 25 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 26 * SUCH DAMAGE. 27 * 28 * $FreeBSD$ 29 */ 30 31 #include <sys/cdefs.h> 32 __FBSDID("$FreeBSD$"); 33 34 #include <sys/param.h> 35 #include <sys/systm.h> 36 #include <sys/proc.h> 37 38 #include <machine/clock.h> 39 #include <machine/cpufunc.h> 40 #include <machine/md_var.h> 41 #include <machine/pcb.h> 42 #include <machine/specialreg.h> 43 #include <machine/vmm.h> 44 45 #include "vmx.h" 46 #include "vmx_msr.h" 47 #include "x86.h" 48 49 static bool 50 vmx_ctl_allows_one_setting(uint64_t msr_val, int bitpos) 51 { 52 53 return ((msr_val & (1UL << (bitpos + 32))) != 0); 54 } 55 56 static bool 57 vmx_ctl_allows_zero_setting(uint64_t msr_val, int bitpos) 58 { 59 60 return ((msr_val & (1UL << bitpos)) == 0); 61 } 62 63 uint32_t 64 vmx_revision(void) 65 { 66 67 return (rdmsr(MSR_VMX_BASIC) & 0xffffffff); 68 } 69 70 /* 71 * Generate a bitmask to be used for the VMCS execution control fields. 72 * 73 * The caller specifies what bits should be set to one in 'ones_mask' 74 * and what bits should be set to zero in 'zeros_mask'. The don't-care 75 * bits are set to the default value. The default values are obtained 76 * based on "Algorithm 3" in Section 27.5.1 "Algorithms for Determining 77 * VMX Capabilities". 78 * 79 * Returns zero on success and non-zero on error. 80 */ 81 int 82 vmx_set_ctlreg(int ctl_reg, int true_ctl_reg, uint32_t ones_mask, 83 uint32_t zeros_mask, uint32_t *retval) 84 { 85 int i; 86 uint64_t val, trueval; 87 bool true_ctls_avail, one_allowed, zero_allowed; 88 89 /* We cannot ask the same bit to be set to both '1' and '0' */ 90 if ((ones_mask ^ zeros_mask) != (ones_mask | zeros_mask)) 91 return (EINVAL); 92 93 true_ctls_avail = (rdmsr(MSR_VMX_BASIC) & (1UL << 55)) != 0; 94 95 val = rdmsr(ctl_reg); 96 if (true_ctls_avail) 97 trueval = rdmsr(true_ctl_reg); /* step c */ 98 else 99 trueval = val; /* step a */ 100 101 for (i = 0; i < 32; i++) { 102 one_allowed = vmx_ctl_allows_one_setting(trueval, i); 103 zero_allowed = vmx_ctl_allows_zero_setting(trueval, i); 104 105 KASSERT(one_allowed || zero_allowed, 106 ("invalid zero/one setting for bit %d of ctl 0x%0x, " 107 "truectl 0x%0x\n", i, ctl_reg, true_ctl_reg)); 108 109 if (zero_allowed && !one_allowed) { /* b(i),c(i) */ 110 if (ones_mask & (1 << i)) 111 return (EINVAL); 112 *retval &= ~(1 << i); 113 } else if (one_allowed && !zero_allowed) { /* b(i),c(i) */ 114 if (zeros_mask & (1 << i)) 115 return (EINVAL); 116 *retval |= 1 << i; 117 } else { 118 if (zeros_mask & (1 << i)) /* b(ii),c(ii) */ 119 *retval &= ~(1 << i); 120 else if (ones_mask & (1 << i)) /* b(ii), c(ii) */ 121 *retval |= 1 << i; 122 else if (!true_ctls_avail) 123 *retval &= ~(1 << i); /* b(iii) */ 124 else if (vmx_ctl_allows_zero_setting(val, i))/* c(iii)*/ 125 *retval &= ~(1 << i); 126 else if (vmx_ctl_allows_one_setting(val, i)) /* c(iv) */ 127 *retval |= 1 << i; 128 else { 129 panic("vmx_set_ctlreg: unable to determine " 130 "correct value of ctl bit %d for msr " 131 "0x%0x and true msr 0x%0x", i, ctl_reg, 132 true_ctl_reg); 133 } 134 } 135 } 136 137 return (0); 138 } 139 140 void 141 msr_bitmap_initialize(char *bitmap) 142 { 143 144 memset(bitmap, 0xff, PAGE_SIZE); 145 } 146 147 int 148 msr_bitmap_change_access(char *bitmap, u_int msr, int access) 149 { 150 int byte, bit; 151 152 if (msr <= 0x00001FFF) 153 byte = msr / 8; 154 else if (msr >= 0xC0000000 && msr <= 0xC0001FFF) 155 byte = 1024 + (msr - 0xC0000000) / 8; 156 else 157 return (EINVAL); 158 159 bit = msr & 0x7; 160 161 if (access & MSR_BITMAP_ACCESS_READ) 162 bitmap[byte] &= ~(1 << bit); 163 else 164 bitmap[byte] |= 1 << bit; 165 166 byte += 2048; 167 if (access & MSR_BITMAP_ACCESS_WRITE) 168 bitmap[byte] &= ~(1 << bit); 169 else 170 bitmap[byte] |= 1 << bit; 171 172 return (0); 173 } 174 175 static uint64_t misc_enable; 176 static uint64_t platform_info; 177 static uint64_t turbo_ratio_limit; 178 static uint64_t host_msrs[GUEST_MSR_NUM]; 179 180 static bool 181 nehalem_cpu(void) 182 { 183 u_int family, model; 184 185 /* 186 * The family:model numbers belonging to the Nehalem microarchitecture 187 * are documented in Section 35.5, Intel SDM dated Feb 2014. 188 */ 189 family = CPUID_TO_FAMILY(cpu_id); 190 model = CPUID_TO_MODEL(cpu_id); 191 if (family == 0x6) { 192 switch (model) { 193 case 0x1A: 194 case 0x1E: 195 case 0x1F: 196 case 0x2E: 197 return (true); 198 default: 199 break; 200 } 201 } 202 return (false); 203 } 204 205 static bool 206 westmere_cpu(void) 207 { 208 u_int family, model; 209 210 /* 211 * The family:model numbers belonging to the Westmere microarchitecture 212 * are documented in Section 35.6, Intel SDM dated Feb 2014. 213 */ 214 family = CPUID_TO_FAMILY(cpu_id); 215 model = CPUID_TO_MODEL(cpu_id); 216 if (family == 0x6) { 217 switch (model) { 218 case 0x25: 219 case 0x2C: 220 return (true); 221 default: 222 break; 223 } 224 } 225 return (false); 226 } 227 228 static bool 229 pat_valid(uint64_t val) 230 { 231 int i, pa; 232 233 /* 234 * From Intel SDM: Table "Memory Types That Can Be Encoded With PAT" 235 * 236 * Extract PA0 through PA7 and validate that each one encodes a 237 * valid memory type. 238 */ 239 for (i = 0; i < 8; i++) { 240 pa = (val >> (i * 8)) & 0xff; 241 if (pa == 2 || pa == 3 || pa >= 8) 242 return (false); 243 } 244 return (true); 245 } 246 247 void 248 vmx_msr_init(void) 249 { 250 uint64_t bus_freq, ratio; 251 int i; 252 253 /* 254 * It is safe to cache the values of the following MSRs because 255 * they don't change based on curcpu, curproc or curthread. 256 */ 257 host_msrs[IDX_MSR_LSTAR] = rdmsr(MSR_LSTAR); 258 host_msrs[IDX_MSR_CSTAR] = rdmsr(MSR_CSTAR); 259 host_msrs[IDX_MSR_STAR] = rdmsr(MSR_STAR); 260 host_msrs[IDX_MSR_SF_MASK] = rdmsr(MSR_SF_MASK); 261 262 /* 263 * Initialize emulated MSRs 264 */ 265 misc_enable = rdmsr(MSR_IA32_MISC_ENABLE); 266 /* 267 * Set mandatory bits 268 * 11: branch trace disabled 269 * 12: PEBS unavailable 270 * Clear unsupported features 271 * 16: SpeedStep enable 272 * 18: enable MONITOR FSM 273 */ 274 misc_enable |= (1 << 12) | (1 << 11); 275 misc_enable &= ~((1 << 18) | (1 << 16)); 276 277 if (nehalem_cpu() || westmere_cpu()) 278 bus_freq = 133330000; /* 133Mhz */ 279 else 280 bus_freq = 100000000; /* 100Mhz */ 281 282 /* 283 * XXXtime 284 * The ratio should really be based on the virtual TSC frequency as 285 * opposed to the host TSC. 286 */ 287 ratio = (tsc_freq / bus_freq) & 0xff; 288 289 /* 290 * The register definition is based on the micro-architecture 291 * but the following bits are always the same: 292 * [15:8] Maximum Non-Turbo Ratio 293 * [28] Programmable Ratio Limit for Turbo Mode 294 * [29] Programmable TDC-TDP Limit for Turbo Mode 295 * [47:40] Maximum Efficiency Ratio 296 * 297 * The other bits can be safely set to 0 on all 298 * micro-architectures up to Haswell. 299 */ 300 platform_info = (ratio << 8) | (ratio << 40); 301 302 /* 303 * The number of valid bits in the MSR_TURBO_RATIO_LIMITx register is 304 * dependent on the maximum cores per package supported by the micro- 305 * architecture. For e.g., Westmere supports 6 cores per package and 306 * uses the low 48 bits. Sandybridge support 8 cores per package and 307 * uses up all 64 bits. 308 * 309 * However, the unused bits are reserved so we pretend that all bits 310 * in this MSR are valid. 311 */ 312 for (i = 0; i < 8; i++) 313 turbo_ratio_limit = (turbo_ratio_limit << 8) | ratio; 314 } 315 316 void 317 vmx_msr_guest_init(struct vmx *vmx, int vcpuid) 318 { 319 uint64_t *guest_msrs; 320 321 guest_msrs = vmx->guest_msrs[vcpuid]; 322 323 /* 324 * The permissions bitmap is shared between all vcpus so initialize it 325 * once when initializing the vBSP. 326 */ 327 if (vcpuid == 0) { 328 guest_msr_rw(vmx, MSR_LSTAR); 329 guest_msr_rw(vmx, MSR_CSTAR); 330 guest_msr_rw(vmx, MSR_STAR); 331 guest_msr_rw(vmx, MSR_SF_MASK); 332 guest_msr_rw(vmx, MSR_KGSBASE); 333 } 334 335 /* 336 * Initialize guest IA32_PAT MSR with default value after reset. 337 */ 338 guest_msrs[IDX_MSR_PAT] = PAT_VALUE(0, PAT_WRITE_BACK) | 339 PAT_VALUE(1, PAT_WRITE_THROUGH) | 340 PAT_VALUE(2, PAT_UNCACHED) | 341 PAT_VALUE(3, PAT_UNCACHEABLE) | 342 PAT_VALUE(4, PAT_WRITE_BACK) | 343 PAT_VALUE(5, PAT_WRITE_THROUGH) | 344 PAT_VALUE(6, PAT_UNCACHED) | 345 PAT_VALUE(7, PAT_UNCACHEABLE); 346 347 return; 348 } 349 350 void 351 vmx_msr_guest_enter(struct vmx *vmx, int vcpuid) 352 { 353 uint64_t *guest_msrs = vmx->guest_msrs[vcpuid]; 354 355 /* Save host MSRs (in particular, KGSBASE) and restore guest MSRs */ 356 update_pcb_bases(curpcb); 357 wrmsr(MSR_LSTAR, guest_msrs[IDX_MSR_LSTAR]); 358 wrmsr(MSR_CSTAR, guest_msrs[IDX_MSR_CSTAR]); 359 wrmsr(MSR_STAR, guest_msrs[IDX_MSR_STAR]); 360 wrmsr(MSR_SF_MASK, guest_msrs[IDX_MSR_SF_MASK]); 361 wrmsr(MSR_KGSBASE, guest_msrs[IDX_MSR_KGSBASE]); 362 } 363 364 void 365 vmx_msr_guest_enter_tsc_aux(struct vmx *vmx, int vcpuid) 366 { 367 uint64_t guest_tsc_aux = vmx->guest_msrs[vcpuid][IDX_MSR_TSC_AUX]; 368 uint32_t host_aux = cpu_auxmsr(); 369 370 if (vmx_have_msr_tsc_aux(vmx) && guest_tsc_aux != host_aux) 371 wrmsr(MSR_TSC_AUX, guest_tsc_aux); 372 } 373 374 void 375 vmx_msr_guest_exit(struct vmx *vmx, int vcpuid) 376 { 377 uint64_t *guest_msrs = vmx->guest_msrs[vcpuid]; 378 379 /* Save guest MSRs */ 380 guest_msrs[IDX_MSR_LSTAR] = rdmsr(MSR_LSTAR); 381 guest_msrs[IDX_MSR_CSTAR] = rdmsr(MSR_CSTAR); 382 guest_msrs[IDX_MSR_STAR] = rdmsr(MSR_STAR); 383 guest_msrs[IDX_MSR_SF_MASK] = rdmsr(MSR_SF_MASK); 384 guest_msrs[IDX_MSR_KGSBASE] = rdmsr(MSR_KGSBASE); 385 386 /* Restore host MSRs */ 387 wrmsr(MSR_LSTAR, host_msrs[IDX_MSR_LSTAR]); 388 wrmsr(MSR_CSTAR, host_msrs[IDX_MSR_CSTAR]); 389 wrmsr(MSR_STAR, host_msrs[IDX_MSR_STAR]); 390 wrmsr(MSR_SF_MASK, host_msrs[IDX_MSR_SF_MASK]); 391 392 /* MSR_KGSBASE will be restored on the way back to userspace */ 393 } 394 395 void 396 vmx_msr_guest_exit_tsc_aux(struct vmx *vmx, int vcpuid) 397 { 398 uint64_t guest_tsc_aux = vmx->guest_msrs[vcpuid][IDX_MSR_TSC_AUX]; 399 uint32_t host_aux = cpu_auxmsr(); 400 401 if (vmx_have_msr_tsc_aux(vmx) && guest_tsc_aux != host_aux) 402 /* 403 * Note that it is not necessary to save the guest value 404 * here; vmx->guest_msrs[vcpuid][IDX_MSR_TSC_AUX] always 405 * contains the current value since it is updated whenever 406 * the guest writes to it (which is expected to be very 407 * rare). 408 */ 409 wrmsr(MSR_TSC_AUX, host_aux); 410 } 411 412 int 413 vmx_rdmsr(struct vmx *vmx, int vcpuid, u_int num, uint64_t *val, bool *retu) 414 { 415 const uint64_t *guest_msrs; 416 int error; 417 418 guest_msrs = vmx->guest_msrs[vcpuid]; 419 error = 0; 420 421 switch (num) { 422 case MSR_MCG_CAP: 423 case MSR_MCG_STATUS: 424 *val = 0; 425 break; 426 case MSR_MTRRcap: 427 case MSR_MTRRdefType: 428 case MSR_MTRR4kBase ... MSR_MTRR4kBase + 7: 429 case MSR_MTRR16kBase ... MSR_MTRR16kBase + 1: 430 case MSR_MTRR64kBase: 431 case MSR_MTRRVarBase ... MSR_MTRRVarBase + (VMM_MTRR_VAR_MAX * 2) - 1: 432 if (vm_rdmtrr(&vmx->mtrr[vcpuid], num, val) != 0) { 433 vm_inject_gp(vmx->vm, vcpuid); 434 } 435 break; 436 case MSR_IA32_MISC_ENABLE: 437 *val = misc_enable; 438 break; 439 case MSR_PLATFORM_INFO: 440 *val = platform_info; 441 break; 442 case MSR_TURBO_RATIO_LIMIT: 443 case MSR_TURBO_RATIO_LIMIT1: 444 *val = turbo_ratio_limit; 445 break; 446 case MSR_PAT: 447 *val = guest_msrs[IDX_MSR_PAT]; 448 break; 449 default: 450 error = EINVAL; 451 break; 452 } 453 return (error); 454 } 455 456 int 457 vmx_wrmsr(struct vmx *vmx, int vcpuid, u_int num, uint64_t val, bool *retu) 458 { 459 uint64_t *guest_msrs; 460 uint64_t changed; 461 int error; 462 463 guest_msrs = vmx->guest_msrs[vcpuid]; 464 error = 0; 465 466 switch (num) { 467 case MSR_MCG_CAP: 468 case MSR_MCG_STATUS: 469 break; /* ignore writes */ 470 case MSR_MTRRcap: 471 case MSR_MTRRdefType: 472 case MSR_MTRR4kBase ... MSR_MTRR4kBase + 7: 473 case MSR_MTRR16kBase ... MSR_MTRR16kBase + 1: 474 case MSR_MTRR64kBase: 475 case MSR_MTRRVarBase ... MSR_MTRRVarBase + (VMM_MTRR_VAR_MAX * 2) - 1: 476 if (vm_wrmtrr(&vmx->mtrr[vcpuid], num, val) != 0) { 477 vm_inject_gp(vmx->vm, vcpuid); 478 } 479 break; 480 case MSR_IA32_MISC_ENABLE: 481 changed = val ^ misc_enable; 482 /* 483 * If the host has disabled the NX feature then the guest 484 * also cannot use it. However, a Linux guest will try to 485 * enable the NX feature by writing to the MISC_ENABLE MSR. 486 * 487 * This can be safely ignored because the memory management 488 * code looks at CPUID.80000001H:EDX.NX to check if the 489 * functionality is actually enabled. 490 */ 491 changed &= ~(1UL << 34); 492 493 /* 494 * Punt to userspace if any other bits are being modified. 495 */ 496 if (changed) 497 error = EINVAL; 498 499 break; 500 case MSR_PAT: 501 if (pat_valid(val)) 502 guest_msrs[IDX_MSR_PAT] = val; 503 else 504 vm_inject_gp(vmx->vm, vcpuid); 505 break; 506 case MSR_TSC: 507 error = vmx_set_tsc_offset(vmx, vcpuid, val - rdtsc()); 508 break; 509 case MSR_TSC_AUX: 510 if (vmx_have_msr_tsc_aux(vmx)) 511 /* 512 * vmx_msr_guest_enter_tsc_aux() will apply this 513 * value when it is called immediately before guest 514 * entry. 515 */ 516 guest_msrs[IDX_MSR_TSC_AUX] = val; 517 else 518 vm_inject_gp(vmx->vm, vcpuid); 519 break; 520 default: 521 error = EINVAL; 522 break; 523 } 524 525 return (error); 526 } 527