1 /*- 2 * SPDX-License-Identifier: BSD-2-Clause 3 * 4 * Copyright (c) 2011 NetApp, Inc. 5 * All rights reserved. 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions 9 * are met: 10 * 1. Redistributions of source code must retain the above copyright 11 * notice, this list of conditions and the following disclaimer. 12 * 2. Redistributions in binary form must reproduce the above copyright 13 * notice, this list of conditions and the following disclaimer in the 14 * documentation and/or other materials provided with the distribution. 15 * 16 * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND 17 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 18 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 19 * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE 20 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 21 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 22 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 23 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 24 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 25 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 26 * SUCH DAMAGE. 27 */ 28 /* 29 * Copyright 2020 Joyent, Inc. 
30 * Copyright 2021 Oxide Computer Company 31 */ 32 33 #include <sys/cdefs.h> 34 35 #include <sys/param.h> 36 #include <sys/systm.h> 37 #include <sys/proc.h> 38 39 #include <machine/clock.h> 40 #include <machine/cpufunc.h> 41 #include <machine/md_var.h> 42 #include <machine/specialreg.h> 43 #include <machine/vmm.h> 44 #include <sys/vmm_kernel.h> 45 46 #include "vmx.h" 47 #include "vmx_msr.h" 48 49 static bool 50 vmx_ctl_allows_one_setting(uint64_t msr_val, int bitpos) 51 { 52 53 return ((msr_val & (1UL << (bitpos + 32))) != 0); 54 } 55 56 static bool 57 vmx_ctl_allows_zero_setting(uint64_t msr_val, int bitpos) 58 { 59 60 return ((msr_val & (1UL << bitpos)) == 0); 61 } 62 63 /* 64 * Generate a bitmask to be used for the VMCS execution control fields. 65 * 66 * The caller specifies what bits should be set to one in 'ones_mask' 67 * and what bits should be set to zero in 'zeros_mask'. The don't-care 68 * bits are set to the default value. The default values are obtained 69 * based on "Algorithm 3" in Section 27.5.1 "Algorithms for Determining 70 * VMX Capabilities". 71 * 72 * Returns zero on success and non-zero on error. 
73 */ 74 int 75 vmx_set_ctlreg(int ctl_reg, int true_ctl_reg, uint32_t ones_mask, 76 uint32_t zeros_mask, uint32_t *retval) 77 { 78 int i; 79 uint64_t val, trueval; 80 bool true_ctls_avail, one_allowed, zero_allowed; 81 82 /* We cannot ask the same bit to be set to both '1' and '0' */ 83 if ((ones_mask ^ zeros_mask) != (ones_mask | zeros_mask)) 84 return (EINVAL); 85 86 true_ctls_avail = (rdmsr(MSR_VMX_BASIC) & (1UL << 55)) != 0; 87 88 val = rdmsr(ctl_reg); 89 if (true_ctls_avail) 90 trueval = rdmsr(true_ctl_reg); /* step c */ 91 else 92 trueval = val; /* step a */ 93 94 for (i = 0; i < 32; i++) { 95 one_allowed = vmx_ctl_allows_one_setting(trueval, i); 96 zero_allowed = vmx_ctl_allows_zero_setting(trueval, i); 97 98 KASSERT(one_allowed || zero_allowed, 99 ("invalid zero/one setting for bit %d of ctl 0x%0x, " 100 "truectl 0x%0x\n", i, ctl_reg, true_ctl_reg)); 101 102 if (zero_allowed && !one_allowed) { /* b(i),c(i) */ 103 if (ones_mask & (1 << i)) 104 return (EINVAL); 105 *retval &= ~(1 << i); 106 } else if (one_allowed && !zero_allowed) { /* b(i),c(i) */ 107 if (zeros_mask & (1 << i)) 108 return (EINVAL); 109 *retval |= 1 << i; 110 } else { 111 if (zeros_mask & (1 << i)) { 112 /* b(ii),c(ii) */ 113 *retval &= ~(1 << i); 114 } else if (ones_mask & (1 << i)) { 115 /* b(ii), c(ii) */ 116 *retval |= 1 << i; 117 } else if (!true_ctls_avail) { 118 /* b(iii) */ 119 *retval &= ~(1 << i); 120 } else if (vmx_ctl_allows_zero_setting(val, i)) { 121 /* c(iii) */ 122 *retval &= ~(1 << i); 123 } else if (vmx_ctl_allows_one_setting(val, i)) { 124 /* c(iv) */ 125 *retval |= 1 << i; 126 } else { 127 panic("vmx_set_ctlreg: unable to determine " 128 "correct value of ctl bit %d for msr " 129 "0x%0x and true msr 0x%0x", i, ctl_reg, 130 true_ctl_reg); 131 } 132 } 133 } 134 135 return (0); 136 } 137 138 void 139 vmx_msr_bitmap_initialize(struct vmx *vmx) 140 { 141 for (uint_t i = 0; i < VM_MAXCPU; i++) { 142 uint8_t *bitmap; 143 144 bitmap = kmem_alloc(PAGESIZE, KM_SLEEP); 145 
VERIFY3U((uintptr_t)bitmap & PAGEOFFSET, ==, 0); 146 memset(bitmap, 0xff, PAGESIZE); 147 148 vmx->msr_bitmap[i] = bitmap; 149 } 150 } 151 152 void 153 vmx_msr_bitmap_destroy(struct vmx *vmx) 154 { 155 for (uint_t i = 0; i < VM_MAXCPU; i++) { 156 VERIFY3P(vmx->msr_bitmap[i], !=, NULL); 157 kmem_free(vmx->msr_bitmap[i], PAGESIZE); 158 vmx->msr_bitmap[i] = NULL; 159 } 160 } 161 162 void 163 vmx_msr_bitmap_change_access(struct vmx *vmx, int vcpuid, uint_t msr, int acc) 164 { 165 uint8_t *bitmap = vmx->msr_bitmap[vcpuid]; 166 int byte, bit; 167 168 if (msr <= 0x00001FFF) { 169 byte = msr / 8; 170 } else if (msr >= 0xC0000000 && msr <= 0xC0001FFF) { 171 byte = 1024 + (msr - 0xC0000000) / 8; 172 } else { 173 panic("Invalid MSR for bitmap: %x", msr); 174 } 175 176 bit = msr & 0x7; 177 178 if (acc & MSR_BITMAP_ACCESS_READ) { 179 bitmap[byte] &= ~(1 << bit); 180 } else { 181 bitmap[byte] |= 1 << bit; 182 } 183 184 byte += 2048; 185 if (acc & MSR_BITMAP_ACCESS_WRITE) { 186 bitmap[byte] &= ~(1 << bit); 187 } else { 188 bitmap[byte] |= 1 << bit; 189 } 190 } 191 192 static uint64_t misc_enable; 193 static uint64_t platform_info; 194 static uint64_t turbo_ratio_limit; 195 196 static bool 197 nehalem_cpu(void) 198 { 199 uint_t family, model; 200 201 /* 202 * The family:model numbers belonging to the Nehalem microarchitecture 203 * are documented in Section 35.5, Intel SDM dated Feb 2014. 204 */ 205 family = CPUID_TO_FAMILY(cpu_id); 206 model = CPUID_TO_MODEL(cpu_id); 207 if (family == 0x6) { 208 switch (model) { 209 case 0x1A: 210 case 0x1E: 211 case 0x1F: 212 case 0x2E: 213 return (true); 214 default: 215 break; 216 } 217 } 218 return (false); 219 } 220 221 static bool 222 westmere_cpu(void) 223 { 224 uint_t family, model; 225 226 /* 227 * The family:model numbers belonging to the Westmere microarchitecture 228 * are documented in Section 35.6, Intel SDM dated Feb 2014. 
229 */ 230 family = CPUID_TO_FAMILY(cpu_id); 231 model = CPUID_TO_MODEL(cpu_id); 232 if (family == 0x6) { 233 switch (model) { 234 case 0x25: 235 case 0x2C: 236 return (true); 237 default: 238 break; 239 } 240 } 241 return (false); 242 } 243 244 static bool 245 pat_valid(uint64_t val) 246 { 247 int i, pa; 248 249 /* 250 * From Intel SDM: Table "Memory Types That Can Be Encoded With PAT" 251 * 252 * Extract PA0 through PA7 and validate that each one encodes a 253 * valid memory type. 254 */ 255 for (i = 0; i < 8; i++) { 256 pa = (val >> (i * 8)) & 0xff; 257 if (pa == 2 || pa == 3 || pa >= 8) 258 return (false); 259 } 260 return (true); 261 } 262 263 void 264 vmx_msr_init(void) 265 { 266 uint64_t bus_freq, ratio; 267 int i; 268 269 /* 270 * Initialize emulated MSRs 271 */ 272 misc_enable = rdmsr(MSR_IA32_MISC_ENABLE); 273 /* 274 * Set mandatory bits 275 * 11: branch trace disabled 276 * 12: PEBS unavailable 277 * Clear unsupported features 278 * 16: SpeedStep enable 279 * 18: enable MONITOR FSM 280 */ 281 misc_enable |= (1 << 12) | (1 << 11); 282 misc_enable &= ~((1 << 18) | (1 << 16)); 283 284 if (nehalem_cpu() || westmere_cpu()) 285 bus_freq = 133330000; /* 133Mhz */ 286 else 287 bus_freq = 100000000; /* 100Mhz */ 288 289 /* 290 * XXXtime 291 * The ratio should really be based on the virtual TSC frequency as 292 * opposed to the host TSC. 293 */ 294 ratio = (tsc_freq / bus_freq) & 0xff; 295 296 /* 297 * The register definition is based on the micro-architecture 298 * but the following bits are always the same: 299 * [15:8] Maximum Non-Turbo Ratio 300 * [28] Programmable Ratio Limit for Turbo Mode 301 * [29] Programmable TDC-TDP Limit for Turbo Mode 302 * [47:40] Maximum Efficiency Ratio 303 * 304 * The other bits can be safely set to 0 on all 305 * micro-architectures up to Haswell. 
	 */
	platform_info = (ratio << 8) | (ratio << 40);

	/*
	 * The number of valid bits in the MSR_TURBO_RATIO_LIMITx register is
	 * dependent on the maximum cores per package supported by the micro-
	 * architecture. For e.g., Westmere supports 6 cores per package and
	 * uses the low 48 bits. Sandybridge support 8 cores per package and
	 * uses up all 64 bits.
	 *
	 * However, the unused bits are reserved so we pretend that all bits
	 * in this MSR are valid.
	 */
	for (i = 0; i < 8; i++)
		turbo_ratio_limit = (turbo_ratio_limit << 8) | ratio;
}

/*
 * Initialize per-vCPU guest MSR state: open direct (pass-through) access
 * in the MSR bitmap for the MSRs that are safe to expose, and seed the
 * emulated IA32_PAT value with its architectural reset default.
 */
void
vmx_msr_guest_init(struct vmx *vmx, int vcpuid)
{
	uint64_t *guest_msrs = vmx->guest_msrs[vcpuid];

	/*
	 * It is safe to allow direct access to MSR_GSBASE and
	 * MSR_FSBASE. The guest FSBASE and GSBASE are saved and
	 * restored during vm-exit and vm-entry respectively. The host
	 * FSBASE and GSBASE are always restored from the vmcs host
	 * state area on vm-exit.
	 *
	 * The SYSENTER_CS/ESP/EIP MSRs are identical to FS/GSBASE in
	 * how they are saved/restored so can be directly accessed by
	 * the guest.
	 *
	 * MSR_EFER is saved and restored in the guest VMCS area on a VM
	 * exit and entry respectively. It is also restored from the
	 * host VMCS area on a VM exit.
	 *
	 * The TSC MSR is exposed read-only. Writes are disallowed as
	 * that will impact the host TSC. If the guest does a write the
	 * "use TSC offsetting" execution control is enabled and the
	 * difference between the host TSC and the guest TSC is written
	 * into the TSC offset in the VMCS.
	 */
	guest_msr_rw(vmx, vcpuid, MSR_GSBASE);
	guest_msr_rw(vmx, vcpuid, MSR_FSBASE);
	guest_msr_rw(vmx, vcpuid, MSR_SYSENTER_CS_MSR);
	guest_msr_rw(vmx, vcpuid, MSR_SYSENTER_ESP_MSR);
	guest_msr_rw(vmx, vcpuid, MSR_SYSENTER_EIP_MSR);
	guest_msr_rw(vmx, vcpuid, MSR_EFER);
	guest_msr_ro(vmx, vcpuid, MSR_TSC);

	/*
	 * The guest may have direct access to these MSRs as they are
	 * saved/restored in vmx_msr_guest_enter() and vmx_msr_guest_exit().
	 */
	guest_msr_rw(vmx, vcpuid, MSR_LSTAR);
	guest_msr_rw(vmx, vcpuid, MSR_CSTAR);
	guest_msr_rw(vmx, vcpuid, MSR_STAR);
	guest_msr_rw(vmx, vcpuid, MSR_SF_MASK);
	guest_msr_rw(vmx, vcpuid, MSR_KGSBASE);

	/*
	 * Initialize guest IA32_PAT MSR with default value after reset.
	 */
	guest_msrs[IDX_MSR_PAT] = PAT_VALUE(0, PAT_WRITE_BACK) |
	    PAT_VALUE(1, PAT_WRITE_THROUGH) |
	    PAT_VALUE(2, PAT_UNCACHED) |
	    PAT_VALUE(3, PAT_UNCACHEABLE) |
	    PAT_VALUE(4, PAT_WRITE_BACK) |
	    PAT_VALUE(5, PAT_WRITE_THROUGH) |
	    PAT_VALUE(6, PAT_UNCACHED) |
	    PAT_VALUE(7, PAT_UNCACHEABLE);
}

/*
 * Switch the syscall-related MSRs from host to guest values on the way
 * into the guest: stash the host copies, then load the guest copies.
 */
void
vmx_msr_guest_enter(struct vmx *vmx, int vcpuid)
{
	uint64_t *guest_msrs = vmx->guest_msrs[vcpuid];
	uint64_t *host_msrs = vmx->host_msrs[vcpuid];

	/* Save host MSRs */
	host_msrs[IDX_MSR_LSTAR] = rdmsr(MSR_LSTAR);
	host_msrs[IDX_MSR_CSTAR] = rdmsr(MSR_CSTAR);
	host_msrs[IDX_MSR_STAR] = rdmsr(MSR_STAR);
	host_msrs[IDX_MSR_SF_MASK] = rdmsr(MSR_SF_MASK);

	/* Save host MSRs (in particular, KGSBASE) and restore guest MSRs */
	wrmsr(MSR_LSTAR, guest_msrs[IDX_MSR_LSTAR]);
	wrmsr(MSR_CSTAR, guest_msrs[IDX_MSR_CSTAR]);
	wrmsr(MSR_STAR, guest_msrs[IDX_MSR_STAR]);
	wrmsr(MSR_SF_MASK, guest_msrs[IDX_MSR_SF_MASK]);
	wrmsr(MSR_KGSBASE, guest_msrs[IDX_MSR_KGSBASE]);
}

/*
 * Counterpart to vmx_msr_guest_enter(): on VM exit, capture the guest's
 * MSR values and restore the host copies saved at entry.
 */
void
vmx_msr_guest_exit(struct vmx *vmx, int vcpuid)
{
	uint64_t *guest_msrs = vmx->guest_msrs[vcpuid];
	uint64_t *host_msrs =
	    vmx->host_msrs[vcpuid];

	/* Save guest MSRs */
	guest_msrs[IDX_MSR_LSTAR] = rdmsr(MSR_LSTAR);
	guest_msrs[IDX_MSR_CSTAR] = rdmsr(MSR_CSTAR);
	guest_msrs[IDX_MSR_STAR] = rdmsr(MSR_STAR);
	guest_msrs[IDX_MSR_SF_MASK] = rdmsr(MSR_SF_MASK);
	guest_msrs[IDX_MSR_KGSBASE] = rdmsr(MSR_KGSBASE);

	/* Restore host MSRs */
	wrmsr(MSR_LSTAR, host_msrs[IDX_MSR_LSTAR]);
	wrmsr(MSR_CSTAR, host_msrs[IDX_MSR_CSTAR]);
	wrmsr(MSR_STAR, host_msrs[IDX_MSR_STAR]);
	wrmsr(MSR_SF_MASK, host_msrs[IDX_MSR_SF_MASK]);

	/* MSR_KGSBASE will be restored on the way back to userspace */
}

/*
 * Emulate a guest read of MSR 'num' for the given vCPU.
 *
 * On success the value is stored in *val and VMR_OK is returned.  MSRs
 * not handled here return VMR_UNHANLDED (spelling follows the
 * vm_msr_result_t definition) so the caller can punt further.
 */
vm_msr_result_t
vmx_rdmsr(struct vmx *vmx, int vcpuid, uint32_t num, uint64_t *val)
{
	const uint64_t *guest_msrs = vmx->guest_msrs[vcpuid];

	switch (num) {
	case MSR_IA32_FEATURE_CONTROL:
		/*
		 * We currently don't support SGX support in guests, so
		 * always report those features as disabled with the MSR
		 * locked so the guest won't attempt to write to it.
		 */
		*val = IA32_FEATURE_CONTROL_LOCK;
		break;
	case MSR_IA32_MISC_ENABLE:
		*val = misc_enable;
		break;
	case MSR_PLATFORM_INFO:
		*val = platform_info;
		break;
	case MSR_TURBO_RATIO_LIMIT:
	case MSR_TURBO_RATIO_LIMIT1:
		*val = turbo_ratio_limit;
		break;
	case MSR_PAT:
		*val = guest_msrs[IDX_MSR_PAT];
		break;
	default:
		return (VMR_UNHANLDED);
	}
	return (VMR_OK);
}

/*
 * Emulate a guest write of 'val' to MSR 'num' for the given vCPU.
 * Returns VMR_OK on success, VMR_GP to inject #GP, or VMR_UNHANLDED to
 * punt the access to userspace.
 */
vm_msr_result_t
vmx_wrmsr(struct vmx *vmx, int vcpuid, uint32_t num, uint64_t val)
{
	uint64_t *guest_msrs = vmx->guest_msrs[vcpuid];
	uint64_t changed;

	switch (num) {
	case MSR_IA32_MISC_ENABLE:
		changed = val ^ misc_enable;
		/*
		 * If the host has disabled the NX feature then the guest
		 * also cannot use it. However, a Linux guest will try to
		 * enable the NX feature by writing to the MISC_ENABLE MSR.
		 *
		 * This can be safely ignored because the memory management
		 * code looks at CPUID.80000001H:EDX.NX to check if the
		 * functionality is actually enabled.
		 */
		changed &= ~(1UL << 34);

		/*
		 * Punt to userspace if any other bits are being modified.
		 */
		if (changed) {
			return (VMR_UNHANLDED);
		}
		break;
	case MSR_PAT:
		/* Reject attempts to program reserved memory-type encodings */
		if (!pat_valid(val)) {
			return (VMR_GP);
		}
		guest_msrs[IDX_MSR_PAT] = val;
		break;
	default:
		return (VMR_UNHANLDED);
	}

	return (VMR_OK);
}