1 /* SPDX-License-Identifier: GPL-2.0-only */ 2 /* 3 * Kernel-based Virtual Machine driver for Linux 4 * 5 * This module enables machines with Intel VT-x extensions to run virtual 6 * machines without emulation or binary translation. 7 * 8 * MMU support 9 * 10 * Copyright (C) 2006 Qumranet, Inc. 11 * Copyright 2010 Red Hat, Inc. and/or its affiliates. 12 * 13 * Authors: 14 * Yaniv Kamay <yaniv@qumranet.com> 15 * Avi Kivity <avi@qumranet.com> 16 */ 17 18 /* 19 * The MMU needs to be able to access/walk 32-bit and 64-bit guest page tables, 20 * as well as guest EPT tables, so the code in this file is compiled thrice, 21 * once per guest PTE type. The per-type defines are #undef'd at the end. 22 */ 23 24 #if PTTYPE == 64 25 #define pt_element_t u64 26 #define guest_walker guest_walker64 27 #define FNAME(name) paging##64_##name 28 #define PT_LEVEL_BITS 9 29 #define PT_GUEST_DIRTY_SHIFT PT_DIRTY_SHIFT 30 #define PT_GUEST_ACCESSED_SHIFT PT_ACCESSED_SHIFT 31 #define PT_HAVE_ACCESSED_DIRTY(mmu) true 32 #ifdef CONFIG_X86_64 33 #define PT_MAX_FULL_LEVELS PT64_ROOT_MAX_LEVEL 34 #else 35 #define PT_MAX_FULL_LEVELS 2 36 #endif 37 #elif PTTYPE == 32 38 #define pt_element_t u32 39 #define guest_walker guest_walker32 40 #define FNAME(name) paging##32_##name 41 #define PT_LEVEL_BITS 10 42 #define PT_MAX_FULL_LEVELS 2 43 #define PT_GUEST_DIRTY_SHIFT PT_DIRTY_SHIFT 44 #define PT_GUEST_ACCESSED_SHIFT PT_ACCESSED_SHIFT 45 #define PT_HAVE_ACCESSED_DIRTY(mmu) true 46 47 #define PT32_DIR_PSE36_SIZE 4 48 #define PT32_DIR_PSE36_SHIFT 13 49 #define PT32_DIR_PSE36_MASK \ 50 (((1ULL << PT32_DIR_PSE36_SIZE) - 1) << PT32_DIR_PSE36_SHIFT) 51 #elif PTTYPE == PTTYPE_EPT 52 #define pt_element_t u64 53 #define guest_walker guest_walkerEPT 54 #define FNAME(name) ept_##name 55 #define PT_LEVEL_BITS 9 56 #define PT_GUEST_DIRTY_SHIFT 9 57 #define PT_GUEST_ACCESSED_SHIFT 8 58 #define PT_HAVE_ACCESSED_DIRTY(mmu) (!(mmu)->cpu_role.base.ad_disabled) 59 #define PT_MAX_FULL_LEVELS PT64_ROOT_MAX_LEVEL 60 #else 61 #error Invalid PTTYPE value 62 #endif 63 64 /* Common logic, but per-type values. These also need to be undefined. */ 65 #define PT_BASE_ADDR_MASK ((pt_element_t)__PT_BASE_ADDR_MASK) 66 #define PT_LVL_ADDR_MASK(lvl) __PT_LVL_ADDR_MASK(PT_BASE_ADDR_MASK, lvl, PT_LEVEL_BITS) 67 #define PT_LVL_OFFSET_MASK(lvl) __PT_LVL_OFFSET_MASK(PT_BASE_ADDR_MASK, lvl, PT_LEVEL_BITS) 68 #define PT_INDEX(addr, lvl) __PT_INDEX(addr, lvl, PT_LEVEL_BITS) 69 70 #define PT_GUEST_DIRTY_MASK (1 << PT_GUEST_DIRTY_SHIFT) 71 #define PT_GUEST_ACCESSED_MASK (1 << PT_GUEST_ACCESSED_SHIFT) 72 73 #define gpte_to_gfn_lvl FNAME(gpte_to_gfn_lvl) 74 #define gpte_to_gfn(pte) gpte_to_gfn_lvl((pte), PG_LEVEL_4K) 75 76 /* 77 * The guest_walker structure emulates the behavior of the hardware page 78 * table walker. 79 */ 80 struct guest_walker { 81 int level; 82 unsigned max_level; 83 gfn_t table_gfn[PT_MAX_FULL_LEVELS]; 84 pt_element_t ptes[PT_MAX_FULL_LEVELS]; 85 pt_element_t prefetch_ptes[PTE_PREFETCH_NUM]; 86 gpa_t pte_gpa[PT_MAX_FULL_LEVELS]; 87 pt_element_t __user *ptep_user[PT_MAX_FULL_LEVELS]; 88 bool pte_writable[PT_MAX_FULL_LEVELS]; 89 unsigned int pt_access[PT_MAX_FULL_LEVELS]; 90 unsigned int pte_access; 91 gfn_t gfn; 92 struct x86_exception fault; 93 }; 94 95 #if PTTYPE == 32 96 static inline gfn_t pse36_gfn_delta(u32 gpte) 97 { 98 int shift = 32 - PT32_DIR_PSE36_SHIFT - PAGE_SHIFT; 99 100 return (gpte & PT32_DIR_PSE36_MASK) << shift; 101 } 102 #endif 103 104 static gfn_t gpte_to_gfn_lvl(pt_element_t gpte, int lvl) 105 { 106 return (gpte & PT_LVL_ADDR_MASK(lvl)) >> PAGE_SHIFT; 107 } 108 109 static inline void FNAME(protect_clean_gpte)(struct kvm_mmu *mmu, unsigned *access, 110 unsigned gpte) 111 { 112 unsigned mask; 113 114 /* dirty bit is not supported, so no need to track it */ 115 if (!PT_HAVE_ACCESSED_DIRTY(mmu)) 116 return; 117 118 BUILD_BUG_ON(PT_WRITABLE_MASK != ACC_WRITE_MASK); 119 120 mask = (unsigned)~ACC_WRITE_MASK; 121 /* Allow write access to dirty gptes */ 122 mask |= (gpte >> (PT_GUEST_DIRTY_SHIFT - PT_WRITABLE_SHIFT)) & 123 PT_WRITABLE_MASK; 124 *access &= mask; 125 } 126 127 static inline int FNAME(is_present_gpte)(struct kvm_mmu *mmu, 128 unsigned long pte) 129 { 130 #if PTTYPE != PTTYPE_EPT 131 return pte & PT_PRESENT_MASK; 132 #else 133 /* 134 * For EPT, an entry is present if any of bits 2:0 are set. 135 * With mode-based execute control, bit 10 also indicates presence. 136 */ 137 return pte & (7 | (mmu_has_mbec(mmu) ? VMX_EPT_USER_EXECUTABLE_MASK : 0)); 138 #endif 139 } 140 141 static bool FNAME(is_bad_mt_xwr)(struct rsvd_bits_validate *rsvd_check, u64 gpte) 142 { 143 #if PTTYPE != PTTYPE_EPT 144 return false; 145 #else 146 return __is_bad_mt_xwr(rsvd_check, gpte); 147 #endif 148 } 149 150 static bool FNAME(is_rsvd_bits_set)(struct kvm_mmu *mmu, u64 gpte, int level) 151 { 152 return __is_rsvd_bits_set(&mmu->guest_rsvd_check, gpte, level) || 153 FNAME(is_bad_mt_xwr)(&mmu->guest_rsvd_check, gpte); 154 } 155 156 static bool FNAME(prefetch_invalid_gpte)(struct kvm_vcpu *vcpu, 157 struct kvm_mmu_page *sp, u64 *spte, 158 u64 gpte) 159 { 160 if (!FNAME(is_present_gpte)(vcpu->arch.mmu, gpte)) 161 goto no_present; 162 163 /* Prefetch only accessed entries (unless A/D bits are disabled). */ 164 if (PT_HAVE_ACCESSED_DIRTY(vcpu->arch.mmu) && 165 !(gpte & PT_GUEST_ACCESSED_MASK)) 166 goto no_present; 167 168 if (FNAME(is_rsvd_bits_set)(vcpu->arch.mmu, gpte, PG_LEVEL_4K)) 169 goto no_present; 170 171 return false; 172 173 no_present: 174 drop_spte(vcpu->kvm, spte); 175 return true; 176 } 177 178 static inline unsigned FNAME(gpte_access)(u64 gpte) 179 { 180 unsigned access; 181 /* 182 * Set bits in ACC_*_MASK even if they might not be used in the 183 * actual checks. For example, if EFER.NX is clear permission_fault() 184 * will ignore ACC_EXEC_MASK, and if MBEC is disabled it will 185 * ignore ACC_USER_EXEC_MASK. 186 */ 187 #if PTTYPE == PTTYPE_EPT 188 access = ((gpte & VMX_EPT_WRITABLE_MASK) ? ACC_WRITE_MASK : 0) | 189 ((gpte & VMX_EPT_EXECUTABLE_MASK) ? ACC_EXEC_MASK : 0) | 190 ((gpte & VMX_EPT_READABLE_MASK) ? ACC_READ_MASK : 0) | 191 ((gpte & VMX_EPT_USER_EXECUTABLE_MASK) ? ACC_USER_EXEC_MASK : 0); 192 #else 193 /* 194 * P is set here, so the page is always readable and W/U/!NX represent 195 * allowed accesses. 196 */ 197 BUILD_BUG_ON(ACC_READ_MASK != PT_PRESENT_MASK); 198 BUILD_BUG_ON(ACC_WRITE_MASK != PT_WRITABLE_MASK); 199 BUILD_BUG_ON(ACC_USER_MASK != PT_USER_MASK); 200 BUILD_BUG_ON(ACC_EXEC_MASK & (PT_WRITABLE_MASK | PT_USER_MASK | PT_PRESENT_MASK)); 201 access = gpte & (PT_WRITABLE_MASK | PT_USER_MASK | PT_PRESENT_MASK); 202 access |= gpte & PT64_NX_MASK ? 0 : ACC_EXEC_MASK; 203 #endif 204 205 return access; 206 } 207 208 static int FNAME(update_accessed_dirty_bits)(struct kvm_vcpu *vcpu, 209 struct kvm_mmu *mmu, 210 struct guest_walker *walker, 211 gpa_t addr, int write_fault) 212 { 213 unsigned level, index; 214 pt_element_t pte, orig_pte; 215 pt_element_t __user *ptep_user; 216 gfn_t table_gfn; 217 int ret; 218 219 /* dirty/accessed bits are not supported, so no need to update them */ 220 if (!PT_HAVE_ACCESSED_DIRTY(mmu)) 221 return 0; 222 223 for (level = walker->max_level; level >= walker->level; --level) { 224 pte = orig_pte = walker->ptes[level - 1]; 225 table_gfn = walker->table_gfn[level - 1]; 226 ptep_user = walker->ptep_user[level - 1]; 227 index = offset_in_page(ptep_user) / sizeof(pt_element_t); 228 if (!(pte & PT_GUEST_ACCESSED_MASK)) { 229 trace_kvm_mmu_set_accessed_bit(table_gfn, index, sizeof(pte)); 230 pte |= PT_GUEST_ACCESSED_MASK; 231 } 232 if (level == walker->level && write_fault && 233 !(pte & PT_GUEST_DIRTY_MASK)) { 234 trace_kvm_mmu_set_dirty_bit(table_gfn, index, sizeof(pte)); 235 #if PTTYPE == PTTYPE_EPT 236 if (kvm_x86_ops.nested_ops->write_log_dirty(vcpu, addr)) 237 return -EINVAL; 238 #endif 239 pte |= PT_GUEST_DIRTY_MASK; 240 } 241 if (pte == orig_pte) 242 continue; 243 244 /* 245 * If the slot is read-only, simply do not process the accessed 246 * and dirty bits. This is the correct thing to do if the slot 247 * is ROM, and page tables in read-as-ROM/write-as-MMIO slots 248 * are only supported if the accessed and dirty bits are already 249 * set in the ROM (so that MMIO writes are never needed). 250 * 251 * Note that NPT does not allow this at all and faults, since 252 * it always wants nested page table entries for the guest 253 * page tables to be writable. And EPT works but will simply 254 * overwrite the read-only memory to set the accessed and dirty 255 * bits. 256 */ 257 if (unlikely(!walker->pte_writable[level - 1])) 258 continue; 259 260 ret = __try_cmpxchg_user(ptep_user, &orig_pte, pte, fault); 261 if (ret) 262 return ret; 263 264 kvm_vcpu_mark_page_dirty(vcpu, table_gfn); 265 walker->ptes[level - 1] = pte; 266 } 267 return 0; 268 } 269 270 static inline unsigned FNAME(gpte_pkeys)(struct kvm_vcpu *vcpu, u64 gpte) 271 { 272 unsigned pkeys = 0; 273 #if PTTYPE == 64 274 pte_t pte = {.pte = gpte}; 275 276 pkeys = pte_flags_pkey(pte_flags(pte)); 277 #endif 278 return pkeys; 279 } 280 281 static inline bool FNAME(is_last_gpte)(struct kvm_mmu *mmu, 282 unsigned int level, unsigned int gpte) 283 { 284 /* 285 * For EPT and PAE paging (both variants), bit 7 is either reserved at 286 * all level or indicates a huge page (ignoring CR3/EPTP). In either 287 * case, bit 7 being set terminates the walk. 288 */ 289 #if PTTYPE == 32 290 /* 291 * 32-bit paging requires special handling because bit 7 is ignored if 292 * CR4.PSE=0, not reserved. Clear bit 7 in the gpte if the level is 293 * greater than the last level for which bit 7 is the PAGE_SIZE bit. 294 * 295 * The RHS has bit 7 set iff level < (2 + PSE). If it is clear, bit 7 296 * is not reserved and does not indicate a large page at this level, 297 * so clear PT_PAGE_SIZE_MASK in gpte if that is the case. 298 */ 299 gpte &= level - (PT32_ROOT_LEVEL + mmu->cpu_role.ext.cr4_pse); 300 #endif 301 /* 302 * PG_LEVEL_4K always terminates. The RHS has bit 7 set 303 * iff level <= PG_LEVEL_4K, which for our purpose means 304 * level == PG_LEVEL_4K; set PT_PAGE_SIZE_MASK in gpte then. 305 */ 306 gpte |= level - PG_LEVEL_4K - 1; 307 308 return gpte & PT_PAGE_SIZE_MASK; 309 } 310 /* 311 * Fetch a guest pte for a guest virtual address, or for an L2's GPA. 312 */ 313 static int FNAME(walk_addr_generic)(struct guest_walker *walker, 314 struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, 315 gpa_t addr, u64 access) 316 { 317 int ret; 318 pt_element_t pte; 319 pt_element_t __user *ptep_user; 320 gfn_t table_gfn; 321 u64 pt_access, pte_access; 322 unsigned index, accessed_dirty, pte_pkey; 323 u64 nested_access; 324 gpa_t pte_gpa; 325 bool have_ad; 326 int offset; 327 u64 walk_nx_mask = 0; 328 const int write_fault = access & PFERR_WRITE_MASK; 329 const int user_fault = access & PFERR_USER_MASK; 330 const int fetch_fault = access & PFERR_FETCH_MASK; 331 /* 332 * Note! Track the error_code that's common to legacy shadow paging 333 * and NPT shadow paging as a u16 to guard against unintentionally 334 * setting any of bits 63:16. Architecturally, the #PF error code is 335 * 32 bits, and Intel CPUs don't support settings bits 31:16. 336 */ 337 u16 errcode = 0; 338 gpa_t real_gpa; 339 gfn_t gfn; 340 341 trace_kvm_mmu_pagetable_walk(addr, access); 342 retry_walk: 343 walker->level = mmu->cpu_role.base.level; 344 pte = kvm_mmu_get_guest_pgd(vcpu, mmu); 345 have_ad = PT_HAVE_ACCESSED_DIRTY(mmu); 346 347 #if PTTYPE == 64 348 walk_nx_mask = 1ULL << PT64_NX_SHIFT; 349 if (walker->level == PT32E_ROOT_LEVEL) { 350 pte = mmu->get_pdptr(vcpu, (addr >> 30) & 3); 351 trace_kvm_mmu_paging_element(pte, walker->level); 352 if (!FNAME(is_present_gpte)(mmu, pte)) 353 goto error; 354 --walker->level; 355 } 356 #endif 357 walker->max_level = walker->level; 358 359 /* 360 * FIXME: on Intel processors, loads of the PDPTE registers for PAE paging 361 * by the MOV to CR instruction are treated as reads and do not cause the 362 * processor to set the dirty flag in any EPT paging-structure entry. 363 */ 364 nested_access = (have_ad ? PFERR_WRITE_MASK : 0) | PFERR_USER_MASK; 365 366 pte_access = ~0; 367 368 /* 369 * Queue a page fault for injection if this assertion fails, as callers 370 * assume that walker.fault contains sane info on a walk failure. I.e. 371 * avoid making the situation worse by inducing even worse badness 372 * between when the assertion fails and when KVM kicks the vCPU out to 373 * userspace (because the VM is bugged). 374 */ 375 if (KVM_BUG_ON(is_long_mode(vcpu) && !is_pae(vcpu), vcpu->kvm)) 376 goto error; 377 378 ++walker->level; 379 380 do { 381 struct kvm_memory_slot *slot; 382 unsigned long host_addr; 383 384 pt_access = pte_access; 385 --walker->level; 386 387 index = PT_INDEX(addr, walker->level); 388 table_gfn = gpte_to_gfn(pte); 389 offset = index * sizeof(pt_element_t); 390 pte_gpa = gfn_to_gpa(table_gfn) + offset; 391 392 BUG_ON(walker->level < 1); 393 walker->table_gfn[walker->level - 1] = table_gfn; 394 walker->pte_gpa[walker->level - 1] = pte_gpa; 395 396 real_gpa = kvm_translate_gpa(vcpu, mmu, gfn_to_gpa(table_gfn), 397 nested_access | PFERR_GUEST_PAGE_MASK, 398 &walker->fault, 0); 399 400 if (unlikely(real_gpa == INVALID_GPA)) 401 return 0; 402 403 slot = kvm_vcpu_gfn_to_memslot(vcpu, gpa_to_gfn(real_gpa)); 404 if (!kvm_is_visible_memslot(slot)) 405 goto error; 406 407 host_addr = gfn_to_hva_memslot_prot(slot, gpa_to_gfn(real_gpa), 408 &walker->pte_writable[walker->level - 1]); 409 if (unlikely(kvm_is_error_hva(host_addr))) 410 goto error; 411 412 ptep_user = (pt_element_t __user *)((void *)host_addr + offset); 413 if (unlikely(get_user(pte, ptep_user))) 414 goto error; 415 walker->ptep_user[walker->level - 1] = ptep_user; 416 417 trace_kvm_mmu_paging_element(pte, walker->level); 418 419 /* 420 * Inverting the NX it lets us AND it like other 421 * permission bits. 422 */ 423 pte_access = pt_access & (pte ^ walk_nx_mask); 424 425 if (unlikely(!FNAME(is_present_gpte)(mmu, pte))) 426 goto error; 427 428 if (unlikely(FNAME(is_rsvd_bits_set)(mmu, pte, walker->level))) { 429 errcode = PFERR_RSVD_MASK | PFERR_PRESENT_MASK; 430 goto error; 431 } 432 433 walker->ptes[walker->level - 1] = pte; 434 435 /* Convert to ACC_*_MASK flags for struct guest_walker. */ 436 walker->pt_access[walker->level - 1] = FNAME(gpte_access)(pt_access ^ walk_nx_mask); 437 } while (!FNAME(is_last_gpte)(mmu, walker->level, pte)); 438 439 pte_pkey = FNAME(gpte_pkeys)(vcpu, pte); 440 accessed_dirty = have_ad ? pte_access & PT_GUEST_ACCESSED_MASK : 0; 441 442 /* Convert to ACC_*_MASK flags for struct guest_walker. */ 443 walker->pte_access = FNAME(gpte_access)(pte_access ^ walk_nx_mask); 444 errcode = permission_fault(vcpu, mmu, walker->pte_access, pte_pkey, access); 445 if (unlikely(errcode)) 446 goto error; 447 448 gfn = gpte_to_gfn_lvl(pte, walker->level); 449 gfn += (addr & PT_LVL_OFFSET_MASK(walker->level)) >> PAGE_SHIFT; 450 451 #if PTTYPE == 32 452 if (walker->level > PG_LEVEL_4K && is_cpuid_PSE36()) 453 gfn += pse36_gfn_delta(pte); 454 #endif 455 456 real_gpa = kvm_translate_gpa(vcpu, mmu, gfn_to_gpa(gfn), 457 access | PFERR_GUEST_FINAL_MASK, 458 &walker->fault, walker->pte_access); 459 if (real_gpa == INVALID_GPA) 460 return 0; 461 462 walker->gfn = real_gpa >> PAGE_SHIFT; 463 464 if (!write_fault) 465 FNAME(protect_clean_gpte)(mmu, &walker->pte_access, pte); 466 else 467 /* 468 * On a write fault, fold the dirty bit into accessed_dirty. 469 * For modes without A/D bits support accessed_dirty will be 470 * always clear. 471 */ 472 accessed_dirty &= pte >> 473 (PT_GUEST_DIRTY_SHIFT - PT_GUEST_ACCESSED_SHIFT); 474 475 if (unlikely(!accessed_dirty)) { 476 ret = FNAME(update_accessed_dirty_bits)(vcpu, mmu, walker, 477 addr, write_fault); 478 if (unlikely(ret < 0)) 479 goto error; 480 else if (ret) 481 goto retry_walk; 482 } 483 484 return 1; 485 486 error: 487 errcode |= write_fault | user_fault; 488 if (fetch_fault && has_pferr_fetch(mmu)) 489 errcode |= PFERR_FETCH_MASK; 490 491 walker->fault.vector = PF_VECTOR; 492 walker->fault.error_code_valid = true; 493 walker->fault.error_code = errcode; 494 495 #if PTTYPE == PTTYPE_EPT 496 /* 497 * Use PFERR_RSVD_MASK in error_code to tell if EPT 498 * misconfiguration requires to be injected. The detection is 499 * done by is_rsvd_bits_set() above. 500 * 501 * We set up the value of exit_qualification to inject: 502 * [2:0] - Derive from the access bits. The exit_qualification might be 503 * out of date if it is serving an EPT misconfiguration. 504 * [5:3] - Calculated by the page walk of the guest EPT page tables 505 * [7:8] - Derived from "fault stage" access bits 506 * [9:11] - Derived from [9:11] of real exit_qualification 507 * 508 * The other bits are set to 0. 509 */ 510 if (!(errcode & PFERR_RSVD_MASK)) { 511 walker->fault.exit_qualification = 0; 512 513 if (write_fault) 514 walker->fault.exit_qualification |= EPT_VIOLATION_ACC_WRITE; 515 else if (fetch_fault) 516 walker->fault.exit_qualification |= EPT_VIOLATION_ACC_INSTR; 517 else 518 walker->fault.exit_qualification |= EPT_VIOLATION_ACC_READ; 519 520 /* 521 * KVM doesn't emulate features that access GPAs directly, e.g. 522 * Intel Processor Trace. Assume the GVA is always valid; when 523 * propagating faults from hardware, KVM will discard this info 524 * and use the EXIT_QUALIFICATION bits from the VMCS. 525 */ 526 walker->fault.exit_qualification |= EPT_VIOLATION_GVA_IS_VALID; 527 528 /* 529 * Accesses to guest paging structures are either "reads" or 530 * "read+write" accesses, so consider them the latter if write_fault 531 * is true. 532 */ 533 if (access & PFERR_GUEST_PAGE_MASK) 534 walker->fault.exit_qualification |= EPT_VIOLATION_ACC_READ; 535 else 536 walker->fault.exit_qualification |= EPT_VIOLATION_GVA_TRANSLATED; 537 538 /* 539 * Note, pte_access holds the raw RWX bits from the EPTE, not 540 * ACC_*_MASK flags! 541 */ 542 walker->fault.exit_qualification |= EPT_VIOLATION_RWX_TO_PROT(pte_access); 543 if (mmu_has_mbec(mmu)) 544 walker->fault.exit_qualification |= 545 EPT_VIOLATION_USER_EXEC_TO_PROT(pte_access); 546 } 547 #endif 548 walker->fault.address = addr; 549 walker->fault.nested_page_fault = mmu != vcpu->arch.walk_mmu; 550 walker->fault.async_page_fault = false; 551 552 #if PTTYPE != PTTYPE_EPT 553 if (walker->fault.nested_page_fault) 554 walker->fault.error_code |= access & PFERR_GUEST_FAULT_STAGE_MASK; 555 #endif 556 557 trace_kvm_mmu_walker_error(walker->fault.error_code); 558 return 0; 559 } 560 561 static int FNAME(walk_addr)(struct guest_walker *walker, 562 struct kvm_vcpu *vcpu, gpa_t addr, u64 access) 563 { 564 return FNAME(walk_addr_generic)(walker, vcpu, vcpu->arch.mmu, addr, 565 access); 566 } 567 568 static bool 569 FNAME(prefetch_gpte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, 570 u64 *spte, pt_element_t gpte) 571 { 572 unsigned pte_access; 573 gfn_t gfn; 574 575 if (FNAME(prefetch_invalid_gpte)(vcpu, sp, spte, gpte)) 576 return false; 577 578 gfn = gpte_to_gfn(gpte); 579 pte_access = sp->role.access & FNAME(gpte_access)(gpte); 580 FNAME(protect_clean_gpte)(vcpu->arch.mmu, &pte_access, gpte); 581 582 return kvm_mmu_prefetch_sptes(vcpu, gfn, spte, 1, pte_access); 583 } 584 585 static bool FNAME(gpte_changed)(struct kvm_vcpu *vcpu, 586 struct guest_walker *gw, int level) 587 { 588 pt_element_t curr_pte; 589 gpa_t base_gpa, pte_gpa = gw->pte_gpa[level - 1]; 590 u64 mask; 591 int r, index; 592 593 if (level == PG_LEVEL_4K) { 594 mask = PTE_PREFETCH_NUM * sizeof(pt_element_t) - 1; 595 base_gpa = pte_gpa & ~mask; 596 index = (pte_gpa - base_gpa) / sizeof(pt_element_t); 597 598 r = kvm_vcpu_read_guest_atomic(vcpu, base_gpa, 599 gw->prefetch_ptes, sizeof(gw->prefetch_ptes)); 600 curr_pte = gw->prefetch_ptes[index]; 601 } else 602 r = kvm_vcpu_read_guest_atomic(vcpu, pte_gpa, 603 &curr_pte, sizeof(curr_pte)); 604 605 return r || curr_pte != gw->ptes[level - 1]; 606 } 607 608 static void FNAME(pte_prefetch)(struct kvm_vcpu *vcpu, struct guest_walker *gw, 609 u64 *sptep) 610 { 611 struct kvm_mmu_page *sp; 612 pt_element_t *gptep = gw->prefetch_ptes; 613 u64 *spte; 614 int i; 615 616 sp = sptep_to_sp(sptep); 617 618 if (sp->role.level > PG_LEVEL_4K) 619 return; 620 621 /* 622 * If addresses are being invalidated, skip prefetching to avoid 623 * accidentally prefetching those addresses. 624 */ 625 if (unlikely(vcpu->kvm->mmu_invalidate_in_progress)) 626 return; 627 628 if (sp->role.direct) 629 return __direct_pte_prefetch(vcpu, sp, sptep); 630 631 i = spte_index(sptep) & ~(PTE_PREFETCH_NUM - 1); 632 spte = sp->spt + i; 633 634 for (i = 0; i < PTE_PREFETCH_NUM; i++, spte++) { 635 if (spte == sptep) 636 continue; 637 638 if (is_shadow_present_pte(*spte)) 639 continue; 640 641 if (!FNAME(prefetch_gpte)(vcpu, sp, spte, gptep[i])) 642 break; 643 } 644 } 645 646 /* 647 * Fetch a shadow pte for a specific level in the paging hierarchy. 648 * If the guest tries to write a write-protected page, we need to 649 * emulate this operation, return 1 to indicate this case. 650 */ 651 static int FNAME(fetch)(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault, 652 struct guest_walker *gw) 653 { 654 struct kvm_mmu_page *sp = NULL; 655 struct kvm_shadow_walk_iterator it; 656 unsigned int direct_access, access; 657 int top_level, ret; 658 gfn_t base_gfn = fault->gfn; 659 660 WARN_ON_ONCE(gw->gfn != base_gfn); 661 direct_access = gw->pte_access; 662 663 top_level = vcpu->arch.mmu->cpu_role.base.level; 664 if (top_level == PT32E_ROOT_LEVEL) 665 top_level = PT32_ROOT_LEVEL; 666 /* 667 * Verify that the top-level gpte is still there. Since the page 668 * is a root page, it is either write protected (and cannot be 669 * changed from now on) or it is invalid (in which case, we don't 670 * really care if it changes underneath us after this point). 671 */ 672 if (FNAME(gpte_changed)(vcpu, gw, top_level)) 673 return RET_PF_RETRY; 674 675 if (WARN_ON_ONCE(!VALID_PAGE(vcpu->arch.mmu->root.hpa))) 676 return RET_PF_RETRY; 677 678 /* 679 * Load a new root and retry the faulting instruction in the extremely 680 * unlikely scenario that the guest root gfn became visible between 681 * loading a dummy root and handling the resulting page fault, e.g. if 682 * userspace create a memslot in the interim. 683 */ 684 if (unlikely(kvm_mmu_is_dummy_root(vcpu->arch.mmu->root.hpa))) { 685 kvm_make_request(KVM_REQ_MMU_FREE_OBSOLETE_ROOTS, vcpu); 686 return RET_PF_RETRY; 687 } 688 689 for_each_shadow_entry(vcpu, fault->addr, it) { 690 gfn_t table_gfn; 691 692 clear_sp_write_flooding_count(it.sptep); 693 if (it.level == gw->level) 694 break; 695 696 table_gfn = gw->table_gfn[it.level - 2]; 697 access = gw->pt_access[it.level - 2]; 698 sp = kvm_mmu_get_child_sp(vcpu, it.sptep, table_gfn, 699 false, access); 700 701 /* 702 * Synchronize the new page before linking it, as the CPU (KVM) 703 * is architecturally disallowed from inserting non-present 704 * entries into the TLB, i.e. the guest isn't required to flush 705 * the TLB when changing the gPTE from non-present to present. 706 * 707 * For PG_LEVEL_4K, kvm_mmu_find_shadow_page() has already 708 * synchronized the page via kvm_sync_page(). 709 * 710 * For higher level pages, which cannot be unsync themselves 711 * but can have unsync children, synchronize via the slower 712 * mmu_sync_children(). If KVM needs to drop mmu_lock due to 713 * contention or to reschedule, instruct the caller to retry 714 * the #PF (mmu_sync_children() ensures forward progress will 715 * be made). 716 */ 717 if (sp != ERR_PTR(-EEXIST) && sp->unsync_children && 718 mmu_sync_children(vcpu, sp, false)) 719 return RET_PF_RETRY; 720 721 /* 722 * Verify that the gpte in the page, which is now either 723 * write-protected or unsync, wasn't modified between the fault 724 * and acquiring mmu_lock. This needs to be done even when 725 * reusing an existing shadow page to ensure the information 726 * gathered by the walker matches the information stored in the 727 * shadow page (which could have been modified by a different 728 * vCPU even if the page was already linked). Holding mmu_lock 729 * prevents the shadow page from changing after this point. 730 */ 731 if (FNAME(gpte_changed)(vcpu, gw, it.level - 1)) 732 return RET_PF_RETRY; 733 734 if (sp != ERR_PTR(-EEXIST)) 735 link_shadow_page(vcpu, it.sptep, sp); 736 737 if (fault->write && table_gfn == fault->gfn) 738 fault->write_fault_to_shadow_pgtable = true; 739 } 740 741 /* 742 * Adjust the hugepage size _after_ resolving indirect shadow pages. 743 * KVM doesn't support mapping hugepages into the guest for gfns that 744 * are being shadowed by KVM, i.e. allocating a new shadow page may 745 * affect the allowed hugepage size. 746 */ 747 kvm_mmu_hugepage_adjust(vcpu, fault); 748 749 trace_kvm_mmu_spte_requested(fault, gw->pte_access); 750 751 for (; shadow_walk_okay(&it); shadow_walk_next(&it)) { 752 /* 753 * We cannot overwrite existing page tables with an NX 754 * large page, as the leaf could be executable. 755 */ 756 if (fault->nx_huge_page_workaround_enabled) 757 disallowed_hugepage_adjust(fault, *it.sptep, it.level); 758 759 base_gfn = gfn_round_for_level(fault->gfn, it.level); 760 if (it.level == fault->goal_level) 761 break; 762 763 validate_direct_spte(vcpu, it.sptep, direct_access); 764 765 sp = kvm_mmu_get_child_sp(vcpu, it.sptep, base_gfn, 766 true, direct_access); 767 if (sp == ERR_PTR(-EEXIST)) 768 continue; 769 770 link_shadow_page(vcpu, it.sptep, sp); 771 if (fault->huge_page_disallowed) 772 account_nx_huge_page(vcpu->kvm, sp, 773 fault->req_level >= it.level); 774 } 775 776 if (WARN_ON_ONCE(it.level != fault->goal_level)) 777 return -EFAULT; 778 779 ret = mmu_set_spte(vcpu, fault->slot, it.sptep, gw->pte_access, 780 base_gfn, fault->pfn, fault); 781 if (ret == RET_PF_SPURIOUS) 782 return ret; 783 784 FNAME(pte_prefetch)(vcpu, gw, it.sptep); 785 return ret; 786 } 787 788 /* 789 * Page fault handler. There are several causes for a page fault: 790 * - there is no shadow pte for the guest pte 791 * - write access through a shadow pte marked read only so that we can set 792 * the dirty bit 793 * - write access to a shadow pte marked read only so we can update the page 794 * dirty bitmap, when userspace requests it 795 * - mmio access; in this case we will never install a present shadow pte 796 * - normal guest page fault due to the guest pte marked not present, not 797 * writable, or not executable 798 * 799 * Returns: 1 if we need to emulate the instruction, 0 otherwise, or 800 * a negative value on error. 801 */ 802 static int FNAME(page_fault)(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault) 803 { 804 struct guest_walker walker; 805 int r; 806 807 WARN_ON_ONCE(fault->is_tdp); 808 809 /* 810 * Look up the guest pte for the faulting address. 811 * If PFEC.RSVD is set, this is a shadow page fault. 812 * The bit needs to be cleared before walking guest page tables. 813 */ 814 r = FNAME(walk_addr)(&walker, vcpu, fault->addr, 815 fault->error_code & ~PFERR_RSVD_MASK); 816 817 /* 818 * The page is not mapped by the guest. Let the guest handle it. 819 */ 820 if (!r) { 821 if (!fault->prefetch) 822 __kvm_inject_emulated_page_fault(vcpu, &walker.fault, true); 823 824 return RET_PF_RETRY; 825 } 826 827 fault->gfn = walker.gfn; 828 fault->max_level = walker.level; 829 fault->slot = kvm_vcpu_gfn_to_memslot(vcpu, fault->gfn); 830 831 if (page_fault_handle_page_track(vcpu, fault)) { 832 shadow_page_table_clear_flood(vcpu, fault->addr); 833 return RET_PF_WRITE_PROTECTED; 834 } 835 836 r = mmu_topup_memory_caches(vcpu, true); 837 if (r) 838 return r; 839 840 r = kvm_mmu_faultin_pfn(vcpu, fault, walker.pte_access); 841 if (r != RET_PF_CONTINUE) 842 return r; 843 844 #if PTTYPE != PTTYPE_EPT 845 /* 846 * Treat the guest PTE protections as writable, supervisor-only if this 847 * is a supervisor write fault and CR0.WP=0 (supervisor accesses ignore 848 * PTE.W if CR0.WP=0). Don't change the access type for emulated MMIO, 849 * otherwise KVM will cache incorrect access information in the SPTE. 850 */ 851 if (fault->write && !(walker.pte_access & ACC_WRITE_MASK) && 852 !is_cr0_wp(vcpu->arch.mmu) && !fault->user && fault->slot) { 853 walker.pte_access |= ACC_WRITE_MASK; 854 walker.pte_access &= ~ACC_USER_MASK; 855 856 /* 857 * If we converted a user page to a kernel page, 858 * so that the kernel can write to it when cr0.wp=0, 859 * then we should prevent the kernel from executing it 860 * if SMEP is enabled. 861 */ 862 if (is_cr4_smep(vcpu->arch.mmu)) 863 walker.pte_access &= ~ACC_EXEC_MASK; 864 } 865 #endif 866 867 r = RET_PF_RETRY; 868 write_lock(&vcpu->kvm->mmu_lock); 869 870 if (is_page_fault_stale(vcpu, fault)) 871 goto out_unlock; 872 873 r = make_mmu_pages_available(vcpu); 874 if (r) 875 goto out_unlock; 876 r = FNAME(fetch)(vcpu, fault, &walker); 877 878 out_unlock: 879 kvm_mmu_finish_page_fault(vcpu, fault, r); 880 write_unlock(&vcpu->kvm->mmu_lock); 881 return r; 882 } 883 884 static gpa_t FNAME(get_level1_sp_gpa)(struct kvm_mmu_page *sp) 885 { 886 int offset = 0; 887 888 WARN_ON_ONCE(sp->role.level != PG_LEVEL_4K); 889 890 if (PTTYPE == 32) 891 offset = sp->role.quadrant << SPTE_LEVEL_BITS; 892 893 return gfn_to_gpa(sp->gfn) + offset * sizeof(pt_element_t); 894 } 895 896 /* Note, @addr is a GPA when gva_to_gpa() translates an L2 GPA to an L1 GPA. */ 897 static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, 898 gpa_t addr, u64 access, 899 struct x86_exception *exception) 900 { 901 struct guest_walker walker; 902 gpa_t gpa = INVALID_GPA; 903 int r; 904 905 #ifndef CONFIG_X86_64 906 /* A 64-bit GVA should be impossible on 32-bit KVM. */ 907 WARN_ON_ONCE((addr >> 32) && mmu == vcpu->arch.walk_mmu); 908 #endif 909 910 r = FNAME(walk_addr_generic)(&walker, vcpu, mmu, addr, access); 911 912 if (r) { 913 gpa = gfn_to_gpa(walker.gfn); 914 gpa |= addr & ~PAGE_MASK; 915 } else if (exception) 916 *exception = walker.fault; 917 918 return gpa; 919 } 920 921 /* 922 * Using the information in sp->shadowed_translation (kvm_mmu_page_get_gfn()) is 923 * safe because SPTEs are protected by mmu_notifiers and memslot generations, so 924 * the pfn for a given gfn can't change unless all SPTEs pointing to the gfn are 925 * nuked first. 926 * 927 * Returns 928 * < 0: failed to sync spte 929 * 0: the spte is synced and no tlb flushing is required 930 * > 0: the spte is synced and tlb flushing is required 931 */ 932 static int FNAME(sync_spte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, int i) 933 { 934 bool host_writable; 935 gpa_t first_pte_gpa; 936 u64 *sptep, spte; 937 struct kvm_memory_slot *slot; 938 unsigned pte_access; 939 pt_element_t gpte; 940 gpa_t pte_gpa; 941 gfn_t gfn; 942 943 if (WARN_ON_ONCE(sp->spt[i] == SHADOW_NONPRESENT_VALUE || 944 !sp->shadowed_translation)) 945 return 0; 946 947 first_pte_gpa = FNAME(get_level1_sp_gpa)(sp); 948 pte_gpa = first_pte_gpa + i * sizeof(pt_element_t); 949 950 if (kvm_vcpu_read_guest_atomic(vcpu, pte_gpa, &gpte, 951 sizeof(pt_element_t))) 952 return -1; 953 954 if (FNAME(prefetch_invalid_gpte)(vcpu, sp, &sp->spt[i], gpte)) 955 return 1; 956 957 gfn = gpte_to_gfn(gpte); 958 pte_access = sp->role.access; 959 pte_access &= FNAME(gpte_access)(gpte); 960 FNAME(protect_clean_gpte)(vcpu->arch.mmu, &pte_access, gpte); 961 962 if (sync_mmio_spte(vcpu, &sp->spt[i], gfn, pte_access)) 963 return 0; 964 965 /* 966 * Drop the SPTE if the new protections result in no effective 967 * "present" bit or if the gfn is changing. The former case 968 * only affects EPT with execute-only support with pte_access==0; 969 * all other paging modes will create a read-only SPTE if 970 * pte_access is zero. 971 */ 972 if ((pte_access | shadow_present_mask) == SHADOW_NONPRESENT_VALUE || 973 gfn != kvm_mmu_page_get_gfn(sp, i)) { 974 drop_spte(vcpu->kvm, &sp->spt[i]); 975 return 1; 976 } 977 /* 978 * Do nothing if the permissions are unchanged. The existing SPTE is 979 * still, and prefetch_invalid_gpte() has verified that the A/D bits 980 * are set in the "new" gPTE, i.e. there is no danger of missing an A/D 981 * update due to A/D bits being set in the SPTE but not the gPTE. 982 */ 983 if (kvm_mmu_page_get_access(sp, i) == pte_access) 984 return 0; 985 986 /* Update the shadowed access bits in case they changed. */ 987 kvm_mmu_page_set_access(sp, i, pte_access); 988 989 sptep = &sp->spt[i]; 990 spte = *sptep; 991 host_writable = spte & shadow_host_writable_mask; 992 slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn); 993 make_spte(vcpu, sp, slot, pte_access, gfn, 994 spte_to_pfn(spte), spte, true, true, 995 host_writable, &spte); 996 997 /* 998 * There is no need to mark the pfn dirty, as the new protections must 999 * be a subset of the old protections, i.e. synchronizing a SPTE cannot 1000 * change the SPTE from read-only to writable. 1001 */ 1002 return mmu_spte_update(sptep, spte); 1003 } 1004 1005 #undef pt_element_t 1006 #undef guest_walker 1007 #undef FNAME 1008 #undef PT_BASE_ADDR_MASK 1009 #undef PT_INDEX 1010 #undef PT_LVL_ADDR_MASK 1011 #undef PT_LVL_OFFSET_MASK 1012 #undef PT_LEVEL_BITS 1013 #undef PT_MAX_FULL_LEVELS 1014 #undef gpte_to_gfn 1015 #undef gpte_to_gfn_lvl 1016 #undef PT_GUEST_ACCESSED_MASK 1017 #undef PT_GUEST_DIRTY_MASK 1018 #undef PT_GUEST_DIRTY_SHIFT 1019 #undef PT_GUEST_ACCESSED_SHIFT 1020 #undef PT_HAVE_ACCESSED_DIRTY 1021