// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2020-2023 Loongson Technology Corporation Limited
 */

#include <linux/highmem.h>
#include <linux/hugetlb.h>
#include <linux/kvm_host.h>
#include <linux/page-flags.h>
#include <linux/uaccess.h>
#include <asm/mmu_context.h>
#include <asm/pgalloc.h>
#include <asm/tlb.h>
#include <asm/kvm_mmu.h>

static inline void kvm_ptw_prepare(struct kvm *kvm, kvm_ptw_ctx *ctx)
{
	ctx->level = kvm->arch.root_level;
	/* pte table */
	ctx->invalid_ptes = kvm->arch.invalid_ptes;
	ctx->pte_shifts = kvm->arch.pte_shifts;
	ctx->pgtable_shift = ctx->pte_shifts[ctx->level];
	ctx->invalid_entry = ctx->invalid_ptes[ctx->level];
	ctx->opaque = kvm;
}

/*
 * Mark a range of guest physical address space old (all accesses fault) in the
 * VM's GPA page table to allow detection of commonly used pages.
 */
static int kvm_mkold_pte(kvm_pte_t *pte, phys_addr_t addr, kvm_ptw_ctx *ctx)
{
	if (kvm_pte_young(*pte)) {
		*pte = kvm_pte_mkold(*pte);
		return 1;
	}

	return 0;
}

/*
 * Mark a range of guest physical address space clean (writes fault) in the VM's
 * GPA page table to allow dirty page tracking.
 */
static int kvm_mkclean_pte(kvm_pte_t *pte, phys_addr_t addr, kvm_ptw_ctx *ctx)
{
	gfn_t offset;
	kvm_pte_t val;

	val = *pte;
	/*
	 * For kvm_arch_mmu_enable_log_dirty_pt_masked() with a mask, start and
	 * end may cross a hugepage: for the first huge page addr is equal to
	 * start, while for the second huge page addr is the base address of
	 * that huge page, rather than the start or end address.
	 */
	if ((ctx->flag & _KVM_HAS_PGMASK) && !kvm_pte_huge(val)) {
		offset = (addr >> PAGE_SHIFT) - ctx->gfn;
		if (!(BIT(offset) & ctx->mask))
			return 0;
	}

	/*
	 * No need to split the huge page now, just set the write-protect
	 * pte bit; the huge page is split on the next write fault.
	 */
	if (kvm_pte_dirty(val)) {
		*pte = kvm_pte_mkclean(val);
		return 1;
	}

	return 0;
}

/*
 * Clear pte entry
 */
static int kvm_flush_pte(kvm_pte_t *pte, phys_addr_t addr, kvm_ptw_ctx *ctx)
{
	struct kvm *kvm;

	kvm = ctx->opaque;
	if (ctx->level)
		kvm->stat.hugepages--;
	else
		kvm->stat.pages--;

	*pte = ctx->invalid_entry;

	return 1;
}

/*
 * kvm_pgd_alloc() - Allocate and initialise a KVM GPA page directory.
 *
 * Allocate a blank KVM GPA page directory (PGD) for representing guest physical
 * to host physical page mappings.
 *
 * Returns:	Pointer to new KVM GPA page directory.
 *		NULL on allocation failure.
 */
kvm_pte_t *kvm_pgd_alloc(void)
{
	kvm_pte_t *pgd;

	pgd = (kvm_pte_t *)__get_free_pages(GFP_KERNEL, 0);
	if (pgd)
		pgd_init((void *)pgd);

	return pgd;
}

/* Fill a newly allocated page-table page with invalid entries, eight per loop */
static void _kvm_pte_init(void *addr, unsigned long val)
{
	unsigned long *p, *end;

	p = (unsigned long *)addr;
	end = p + PTRS_PER_PTE;
	do {
		p[0] = val;
		p[1] = val;
		p[2] = val;
		p[3] = val;
		p[4] = val;
		p += 8;
		p[-3] = val;
		p[-2] = val;
		p[-1] = val;
	} while (p != end);
}

/*
 * Caller must hold kvm->mm_lock
 *
 * Walk the page tables of kvm to find the PTE corresponding to the
 * address @addr. If page tables don't exist for @addr, they will be created
 * from the MMU cache if @cache is not NULL.
 */
static kvm_pte_t *kvm_populate_gpa(struct kvm *kvm,
				struct kvm_mmu_memory_cache *cache,
				unsigned long addr, int level)
{
	kvm_ptw_ctx ctx;
	kvm_pte_t *entry, *child;

	kvm_ptw_prepare(kvm, &ctx);
	child = kvm->arch.pgd;
	while (ctx.level > level) {
		entry = kvm_pgtable_offset(&ctx, child, addr);
		if (kvm_pte_none(&ctx, entry)) {
			if (!cache)
				return NULL;

			child = kvm_mmu_memory_cache_alloc(cache);
			_kvm_pte_init(child, ctx.invalid_ptes[ctx.level - 1]);
			kvm_set_pte(entry, __pa(child));
		} else if (kvm_pte_huge(*entry)) {
			return entry;
		} else
			child = (kvm_pte_t *)__va(PHYSADDR(*entry));
		kvm_ptw_enter(&ctx);
	}

	entry = kvm_pgtable_offset(&ctx, child, addr);

	return entry;
}

/*
 * Page walker for VM shadow mmu at last level
 * The last level is small pte page or huge pmd page
 */
static int kvm_ptw_leaf(kvm_pte_t *dir, phys_addr_t addr, phys_addr_t end, kvm_ptw_ctx *ctx)
{
	int ret;
	phys_addr_t next, start, size;
	struct list_head *list;
	kvm_pte_t *entry, *child;

	ret = 0;
	start = addr;
	child = (kvm_pte_t *)__va(PHYSADDR(*dir));
	entry = kvm_pgtable_offset(ctx, child, addr);
	do {
		next = addr + (0x1UL << ctx->pgtable_shift);
		if (!kvm_pte_present(ctx, entry))
			continue;

		ret |= ctx->ops(entry, addr, ctx);
	} while (entry++, addr = next, addr < end);

	if (kvm_need_flush(ctx)) {
		size = 0x1UL << (ctx->pgtable_shift + PAGE_SHIFT - 3);
		if (start + size == end) {
			list = (struct list_head *)child;
			list_add_tail(list, &ctx->list);
			*dir = ctx->invalid_ptes[ctx->level + 1];
		}
	}

	return ret;
}

/*
 * Page walker for VM shadow mmu at page table dir level
 */
static int kvm_ptw_dir(kvm_pte_t *dir, phys_addr_t addr, phys_addr_t end, kvm_ptw_ctx *ctx)
{
	int ret;
	phys_addr_t next, start, size;
	struct list_head *list;
	kvm_pte_t *entry, *child;

	ret = 0;
	start = addr;
	child = (kvm_pte_t *)__va(PHYSADDR(*dir));
	entry = kvm_pgtable_offset(ctx, child, addr);
	do {
		next = kvm_pgtable_addr_end(ctx, addr, end);
		if (!kvm_pte_present(ctx, entry))
			continue;

		if (kvm_pte_huge(*entry)) {
			ret |= ctx->ops(entry, addr, ctx);
			continue;
		}

		kvm_ptw_enter(ctx);
		if (ctx->level == 0)
			ret |= kvm_ptw_leaf(entry, addr, next, ctx);
		else
			ret |= kvm_ptw_dir(entry, addr, next, ctx);
		kvm_ptw_exit(ctx);
	} while (entry++, addr = next, addr < end);

	if (kvm_need_flush(ctx)) {
		size = 0x1UL << (ctx->pgtable_shift + PAGE_SHIFT - 3);
		if (start + size == end) {
			list = (struct list_head *)child;
			list_add_tail(list, &ctx->list);
			*dir = ctx->invalid_ptes[ctx->level + 1];
		}
	}

	return ret;
}

/*
 * Page walker for VM shadow mmu at page root table
 */
static int kvm_ptw_top(kvm_pte_t *dir, phys_addr_t addr, phys_addr_t end, kvm_ptw_ctx *ctx)
{
	int ret;
	phys_addr_t next;
	kvm_pte_t *entry;

	ret = 0;
	entry = kvm_pgtable_offset(ctx, dir, addr);
	do {
		next = kvm_pgtable_addr_end(ctx, addr, end);
		if (!kvm_pte_present(ctx, entry))
			continue;

		kvm_ptw_enter(ctx);
		ret |= kvm_ptw_dir(entry, addr, next, ctx);
		kvm_ptw_exit(ctx);
	} while (entry++, addr = next, addr < end);

	return ret;
}

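/*
 * Note on the walkers above: a caller fills a kvm_ptw_ctx with the per-entry
 * operation (ctx.ops), an optional flag (_KVM_FLUSH_PGTABLE to collect
 * emptied page-table pages on ctx.list, _KVM_HAS_PGMASK to restrict
 * kvm_mkclean_pte() to the gfns selected by ctx.gfn/ctx.mask), calls
 * kvm_ptw_prepare() and then walks a GPA range with kvm_ptw_top(). A minimal
 * sketch, mirroring kvm_mkclean_gpa_pt() below:
 *
 *	kvm_ptw_ctx ctx;
 *
 *	ctx.ops = kvm_mkclean_pte;
 *	ctx.flag = 0;
 *	kvm_ptw_prepare(kvm, &ctx);
 *	kvm_ptw_top(kvm->arch.pgd, start_gfn << PAGE_SHIFT,
 *		    end_gfn << PAGE_SHIFT, &ctx);
 *
 * kvm_ptw_top() returns non-zero when any entry was modified, in which case
 * callers typically flush the remote TLBs (see kvm_flush_range()).
 */
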
/*
 * kvm_flush_range() - Flush a range of guest physical addresses.
 * @kvm:	KVM pointer.
 * @start_gfn:	Guest frame number of first page in GPA range to flush.
 * @end_gfn:	Guest frame number of last page in GPA range to flush.
 * @lock:	Whether to hold mmu_lock or not
 *
 * Flushes a range of GPA mappings from the GPA page tables.
 */
static void kvm_flush_range(struct kvm *kvm, gfn_t start_gfn, gfn_t end_gfn, int lock)
{
	int ret;
	kvm_ptw_ctx ctx;
	struct list_head *pos, *temp;

	ctx.ops = kvm_flush_pte;
	ctx.flag = _KVM_FLUSH_PGTABLE;
	kvm_ptw_prepare(kvm, &ctx);
	INIT_LIST_HEAD(&ctx.list);

	if (lock) {
		spin_lock(&kvm->mmu_lock);
		ret = kvm_ptw_top(kvm->arch.pgd, start_gfn << PAGE_SHIFT,
					end_gfn << PAGE_SHIFT, &ctx);
		spin_unlock(&kvm->mmu_lock);
	} else
		ret = kvm_ptw_top(kvm->arch.pgd, start_gfn << PAGE_SHIFT,
					end_gfn << PAGE_SHIFT, &ctx);

	/* Flush vpid for each vCPU individually */
	if (ret)
		kvm_flush_remote_tlbs(kvm);

	/*
	 * Free the pte table pages after dropping mmu_lock;
	 * they are linked together on ctx.list.
	 */
	list_for_each_safe(pos, temp, &ctx.list) {
		list_del(pos);
		free_page((unsigned long)pos);
	}
}

/*
 * kvm_mkclean_gpa_pt() - Make a range of guest physical addresses clean.
 * @kvm:	KVM pointer.
 * @start_gfn:	Guest frame number of first page in GPA range to flush.
 * @end_gfn:	Guest frame number of last page in GPA range to flush.
 *
 * Make a range of GPA mappings clean so that guest writes will fault and
 * trigger dirty page logging.
 *
 * The caller must hold the @kvm->mmu_lock spinlock.
 *
 * Returns:	Whether any GPA mappings were modified, which would require
 *		derived mappings (GVA page tables & TLB entries) to be
 *		invalidated.
 */
static int kvm_mkclean_gpa_pt(struct kvm *kvm, gfn_t start_gfn, gfn_t end_gfn)
{
	kvm_ptw_ctx ctx;

	ctx.ops = kvm_mkclean_pte;
	ctx.flag = 0;
	kvm_ptw_prepare(kvm, &ctx);
	return kvm_ptw_top(kvm->arch.pgd, start_gfn << PAGE_SHIFT, end_gfn << PAGE_SHIFT, &ctx);
}

/*
 * kvm_arch_mmu_enable_log_dirty_pt_masked() - write protect dirty pages
 * @kvm:	The KVM pointer
 * @slot:	The memory slot associated with mask
 * @gfn_offset:	The gfn offset in memory slot
 * @mask:	The mask of dirty pages at offset 'gfn_offset' in this memory
 *		slot to be write protected
 *
 * Walks the bits set in @mask and write protects the associated ptes. The
 * caller must acquire @kvm->mmu_lock.
 */
void kvm_arch_mmu_enable_log_dirty_pt_masked(struct kvm *kvm,
		struct kvm_memory_slot *slot, gfn_t gfn_offset, unsigned long mask)
{
	kvm_ptw_ctx ctx;
	gfn_t base_gfn = slot->base_gfn + gfn_offset;
	gfn_t start = base_gfn + __ffs(mask);
	gfn_t end = base_gfn + __fls(mask) + 1;

	ctx.ops = kvm_mkclean_pte;
	ctx.flag = _KVM_HAS_PGMASK;
	ctx.mask = mask;
	ctx.gfn = base_gfn;
	kvm_ptw_prepare(kvm, &ctx);

	kvm_ptw_top(kvm->arch.pgd, start << PAGE_SHIFT, end << PAGE_SHIFT, &ctx);
}

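/*
 * Dirty page logging flow, roughly: when logging is switched on for a slot,
 * kvm_arch_commit_memory_region() below write protects the whole slot via
 * kvm_mkclean_gpa_pt(); as userspace harvests the dirty log,
 * kvm_arch_mmu_enable_log_dirty_pt_masked() above re-protects only the pages
 * selected by its mask. The next guest write faults into kvm_map_page(),
 * which splits huge mappings (kvm_split_huge()) and marks the page dirty
 * again.
 */
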
void kvm_arch_commit_memory_region(struct kvm *kvm,
				   struct kvm_memory_slot *old,
				   const struct kvm_memory_slot *new,
				   enum kvm_mr_change change)
{
	int needs_flush;

	/*
	 * If dirty page logging is enabled, write protect all pages in the slot
	 * ready for dirty logging.
	 *
	 * There is no need to do this in any of the following cases:
	 * CREATE:	No dirty mappings will already exist.
	 * MOVE/DELETE:	The old mappings will already have been cleaned up by
	 *		kvm_arch_flush_shadow_memslot()
	 */
	if (change == KVM_MR_FLAGS_ONLY &&
	    (!(old->flags & KVM_MEM_LOG_DIRTY_PAGES) &&
	     new->flags & KVM_MEM_LOG_DIRTY_PAGES)) {
		spin_lock(&kvm->mmu_lock);
		/* Write protect GPA page table entries */
		needs_flush = kvm_mkclean_gpa_pt(kvm, new->base_gfn,
					new->base_gfn + new->npages);
		spin_unlock(&kvm->mmu_lock);
		if (needs_flush)
			kvm_flush_remote_tlbs(kvm);
	}
}

void kvm_arch_flush_shadow_all(struct kvm *kvm)
{
	kvm_flush_range(kvm, 0, kvm->arch.gpa_size >> PAGE_SHIFT, 0);
}

void kvm_arch_flush_shadow_memslot(struct kvm *kvm, struct kvm_memory_slot *slot)
{
	/*
	 * The slot has been made invalid (ready for moving or deletion), so we
	 * need to ensure that it can no longer be accessed by any guest vCPUs.
	 */
	kvm_flush_range(kvm, slot->base_gfn, slot->base_gfn + slot->npages, 1);
}

bool kvm_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range)
{
	kvm_ptw_ctx ctx;

	ctx.flag = 0;
	ctx.ops = kvm_flush_pte;
	kvm_ptw_prepare(kvm, &ctx);
	INIT_LIST_HEAD(&ctx.list);

	return kvm_ptw_top(kvm->arch.pgd, range->start << PAGE_SHIFT,
			range->end << PAGE_SHIFT, &ctx);
}

bool kvm_set_spte_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
{
	unsigned long prot_bits;
	kvm_pte_t *ptep;
	kvm_pfn_t pfn = pte_pfn(range->arg.pte);
	gpa_t gpa = range->start << PAGE_SHIFT;

	ptep = kvm_populate_gpa(kvm, NULL, gpa, 0);
	if (!ptep)
		return false;

	/* Replacing an absent or old page doesn't need flushes */
	if (!kvm_pte_present(NULL, ptep) || !kvm_pte_young(*ptep)) {
		kvm_set_pte(ptep, 0);
		return false;
	}

	/* Fill new pte if write protected or page migrated */
	prot_bits = _PAGE_PRESENT | __READABLE;
	prot_bits |= _CACHE_MASK & pte_val(range->arg.pte);

	/*
	 * Set _PAGE_WRITE or _PAGE_DIRTY only if both the old and the new pte
	 * carry them:
	 * _PAGE_WRITE so that kvm_map_page_fast() can handle the next write fault
	 * _PAGE_DIRTY since the gpa has already been recorded as a dirty page
	 */
	prot_bits |= __WRITEABLE & *ptep & pte_val(range->arg.pte);
	kvm_set_pte(ptep, kvm_pfn_pte(pfn, __pgprot(prot_bits)));

	return true;
}

bool kvm_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
{
	kvm_ptw_ctx ctx;

	ctx.flag = 0;
	ctx.ops = kvm_mkold_pte;
	kvm_ptw_prepare(kvm, &ctx);

	return kvm_ptw_top(kvm->arch.pgd, range->start << PAGE_SHIFT,
				range->end << PAGE_SHIFT, &ctx);
}

bool kvm_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
{
	gpa_t gpa = range->start << PAGE_SHIFT;
	kvm_pte_t *ptep = kvm_populate_gpa(kvm, NULL, gpa, 0);

	if (ptep && kvm_pte_present(NULL, ptep) && kvm_pte_young(*ptep))
		return true;

	return false;
}

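/*
 * GPA fault handling, in outline (descriptive note): kvm_handle_mm_fault()
 * calls kvm_map_page(), which first tries kvm_map_page_fast() below to
 * resolve faults that only need the existing GPA pte updated (aging,
 * dirtying). If the fast path fails with -EFAULT, the slow path pins the
 * page with gfn_to_pfn_prot(), rechecks mmu_invalidate_seq under mmu_lock
 * and then installs a new pte, possibly as a huge mapping.
 */
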
/*
 * kvm_map_page_fast() - Fast path GPA fault handler.
 * @vcpu:	vCPU pointer.
 * @gpa:	Guest physical address of fault.
 * @write:	Whether the fault was due to a write.
 *
 * Perform fast path GPA fault handling, doing all that can be done without
 * calling into KVM. This handles marking old pages young (for idle page
 * tracking), and dirtying of clean pages (for dirty page logging).
 *
 * Returns:	0 on success, in which case we can update derived mappings and
 *		resume guest execution.
 *		-EFAULT on failure due to absent GPA mapping or write to
 *		read-only page, in which case KVM must be consulted.
 */
static int kvm_map_page_fast(struct kvm_vcpu *vcpu, unsigned long gpa, bool write)
{
	int ret = 0;
	kvm_pfn_t pfn = 0;
	kvm_pte_t *ptep, changed, new;
	gfn_t gfn = gpa >> PAGE_SHIFT;
	struct kvm *kvm = vcpu->kvm;
	struct kvm_memory_slot *slot;

	spin_lock(&kvm->mmu_lock);

	/* Fast path - just check GPA page table for an existing entry */
	ptep = kvm_populate_gpa(kvm, NULL, gpa, 0);
	if (!ptep || !kvm_pte_present(NULL, ptep)) {
		ret = -EFAULT;
		goto out;
	}

	/* Track access to pages marked old */
	new = *ptep;
	if (!kvm_pte_young(new))
		new = kvm_pte_mkyoung(new);
		/* call kvm_set_pfn_accessed() after unlock */

	if (write && !kvm_pte_dirty(new)) {
		if (!kvm_pte_write(new)) {
			ret = -EFAULT;
			goto out;
		}

		if (kvm_pte_huge(new)) {
			/*
			 * Do not set write permission when dirty logging is
			 * enabled for HugePages
			 */
			slot = gfn_to_memslot(kvm, gfn);
			if (kvm_slot_dirty_track_enabled(slot)) {
				ret = -EFAULT;
				goto out;
			}
		}

		/* Track dirtying of writeable pages */
		new = kvm_pte_mkdirty(new);
	}

	changed = new ^ (*ptep);
	if (changed) {
		kvm_set_pte(ptep, new);
		pfn = kvm_pte_pfn(new);
	}
	spin_unlock(&kvm->mmu_lock);

	/*
	 * Fixme: pfn may be freed after mmu_lock
	 * kvm_try_get_pfn(pfn)/kvm_release_pfn pair to prevent this?
	 */
	if (kvm_pte_young(changed))
		kvm_set_pfn_accessed(pfn);

	if (kvm_pte_dirty(changed)) {
		mark_page_dirty(kvm, gfn);
		kvm_set_pfn_dirty(pfn);
	}
	return ret;
out:
	spin_unlock(&kvm->mmu_lock);
	return ret;
}

static bool fault_supports_huge_mapping(struct kvm_memory_slot *memslot,
				unsigned long hva, unsigned long map_size, bool write)
{
	size_t size;
	gpa_t gpa_start;
	hva_t uaddr_start, uaddr_end;

	/* Disable dirty logging on HugePages */
	if (kvm_slot_dirty_track_enabled(memslot) && write)
		return false;

	size = memslot->npages * PAGE_SIZE;
	gpa_start = memslot->base_gfn << PAGE_SHIFT;
	uaddr_start = memslot->userspace_addr;
	uaddr_end = uaddr_start + size;

	/*
	 * Pages belonging to memslots that don't have the same alignment
	 * within a PMD for userspace and GPA cannot be mapped with stage-2
	 * PMD entries, because we'll end up mapping the wrong pages.
	 *
	 * Consider a layout like the following:
	 *
	 *    memslot->userspace_addr:
	 *    +-----+--------------------+--------------------+---+
	 *    |abcde|fgh  Stage-1 block  |    Stage-1 block tv|xyz|
	 *    +-----+--------------------+--------------------+---+
	 *
	 *    memslot->base_gfn << PAGE_SHIFT:
	 *      +---+--------------------+--------------------+-----+
	 *      |abc|def  Stage-2 block  |    Stage-2 block   |tvxyz|
	 *      +---+--------------------+--------------------+-----+
	 *
	 * If we create those stage-2 blocks, we'll end up with this incorrect
	 * mapping:
	 *   d -> f
	 *   e -> g
	 *   f -> h
	 */
	if ((gpa_start & (map_size - 1)) != (uaddr_start & (map_size - 1)))
		return false;

	/*
	 * Next, let's make sure we're not trying to map anything not covered
	 * by the memslot. This means we have to prohibit block size mappings
	 * for the beginning and end of a non-block aligned and non-block sized
	 * memory slot (illustrated by the head and tail parts of the
	 * userspace view above containing pages 'abcde' and 'xyz',
	 * respectively).
	 *
	 * Note that it doesn't matter if we do the check using the
	 * userspace_addr or the base_gfn, as both are equally aligned (per
	 * the check above) and equally sized.
	 */
	return (hva & ~(map_size - 1)) >= uaddr_start &&
		(hva & ~(map_size - 1)) + map_size <= uaddr_end;
}

/*
 * Lookup the mapping level for @gfn in the current mm.
 *
 * WARNING! Use of host_pfn_mapping_level() requires the caller and the end
 * consumer to be tied into KVM's handlers for MMU notifier events!
 *
 * There are several ways to safely use this helper:
 *
 * - Check mmu_invalidate_retry_hva() after grabbing the mapping level, before
 *   consuming it. In this case, mmu_lock doesn't need to be held during the
 *   lookup, but it does need to be held while checking the MMU notifier.
 *
 * - Hold mmu_lock AND ensure there is no in-progress MMU notifier invalidation
 *   event for the hva. This can be done by explicitly checking the MMU notifier
 *   or by ensuring that KVM already has a valid mapping that covers the hva.
 *
 * - Do not use the result to install new mappings, e.g. use the host mapping
 *   level only to decide whether or not to zap an entry. In this case, it's
 *   not required to hold mmu_lock (though it's highly likely the caller will
 *   want to hold mmu_lock anyways, e.g. to modify SPTEs).
 *
 * Note! The lookup can still race with modifications to host page tables, but
 * the above "rules" ensure KVM will not _consume_ the result of the walk if a
 * race with the primary MMU occurs.
 */
static int host_pfn_mapping_level(struct kvm *kvm, gfn_t gfn,
				const struct kvm_memory_slot *slot)
{
	int level = 0;
	unsigned long hva;
	unsigned long flags;
	pgd_t pgd;
	p4d_t p4d;
	pud_t pud;
	pmd_t pmd;

	/*
	 * Note, using the already-retrieved memslot and __gfn_to_hva_memslot()
	 * is not solely for performance, it's also necessary to avoid the
	 * "writable" check in __gfn_to_hva_many(), which will always fail on
	 * read-only memslots due to gfn_to_hva() assuming writes. Earlier
	 * page fault steps have already verified the guest isn't writing a
	 * read-only memslot.
	 */
	hva = __gfn_to_hva_memslot(slot, gfn);

	/*
	 * Disable IRQs to prevent concurrent tear down of host page tables,
	 * e.g. if the primary MMU promotes a P*D to a huge page and then frees
	 * the original page table.
	 */
	local_irq_save(flags);

	/*
	 * Read each entry once. As above, a non-leaf entry can be promoted to
	 * a huge page _during_ this walk. Re-reading the entry could send the
	 * walk into the weeds, e.g. p*d_large() returns false (sees the old
	 * value) and then p*d_offset() walks into the target huge page instead
	 * of the old page table (sees the new value).
	 */
	pgd = READ_ONCE(*pgd_offset(kvm->mm, hva));
	if (pgd_none(pgd))
		goto out;

	p4d = READ_ONCE(*p4d_offset(&pgd, hva));
	if (p4d_none(p4d) || !p4d_present(p4d))
		goto out;

	pud = READ_ONCE(*pud_offset(&p4d, hva));
	if (pud_none(pud) || !pud_present(pud))
		goto out;

	pmd = READ_ONCE(*pmd_offset(&pud, hva));
	if (pmd_none(pmd) || !pmd_present(pmd))
		goto out;

	if (kvm_pte_huge(pmd_val(pmd)))
		level = 1;

out:
	local_irq_restore(flags);
	return level;
}

/*
 * Split huge page
 */
static kvm_pte_t *kvm_split_huge(struct kvm_vcpu *vcpu, kvm_pte_t *ptep, gfn_t gfn)
{
	int i;
	kvm_pte_t val, *child;
	struct kvm *kvm = vcpu->kvm;
	struct kvm_mmu_memory_cache *memcache;

	memcache = &vcpu->arch.mmu_page_cache;
	child = kvm_mmu_memory_cache_alloc(memcache);
	val = kvm_pte_mksmall(*ptep);
	for (i = 0; i < PTRS_PER_PTE; i++) {
		kvm_set_pte(child + i, val);
		val += PAGE_SIZE;
	}

	/* The later kvm_flush_tlb_gpa() will flush the hugepage tlb */
	kvm_set_pte(ptep, __pa(child));

	kvm->stat.hugepages--;
	kvm->stat.pages += PTRS_PER_PTE;

	return child + (gfn & (PTRS_PER_PTE - 1));
}

/*
 * kvm_map_page() - Map a guest physical page.
 * @vcpu:	vCPU pointer.
 * @gpa:	Guest physical address of fault.
 * @write:	Whether the fault was due to a write.
 *
 * Handle GPA faults by creating a new GPA mapping (or updating an existing
 * one).
 *
 * This takes care of marking pages young or dirty (idle/dirty page tracking),
 * asking KVM for the corresponding PFN, and creating a mapping in the GPA page
 * tables. Derived mappings (GVA page tables and TLBs) must be handled by the
 * caller.
 *
 * Returns:	0 on success
 *		-EFAULT if there is no memory region at @gpa or a write was
 *		attempted to a read-only memory region. This is usually handled
 *		as an MMIO access.
 */
static int kvm_map_page(struct kvm_vcpu *vcpu, unsigned long gpa, bool write)
{
	bool writeable;
	int srcu_idx, err, retry_no = 0, level;
	unsigned long hva, mmu_seq, prot_bits;
	kvm_pfn_t pfn;
	kvm_pte_t *ptep, new_pte;
	gfn_t gfn = gpa >> PAGE_SHIFT;
	struct kvm *kvm = vcpu->kvm;
	struct kvm_memory_slot *memslot;
	struct kvm_mmu_memory_cache *memcache = &vcpu->arch.mmu_page_cache;

	/* Try the fast path to handle old / clean pages */
	srcu_idx = srcu_read_lock(&kvm->srcu);
	err = kvm_map_page_fast(vcpu, gpa, write);
	if (!err)
		goto out;

	memslot = gfn_to_memslot(kvm, gfn);
	hva = gfn_to_hva_memslot_prot(memslot, gfn, &writeable);
	if (kvm_is_error_hva(hva) || (write && !writeable)) {
		err = -EFAULT;
		goto out;
	}

	/* We need a minimum of cached pages ready for page table creation */
	err = kvm_mmu_topup_memory_cache(memcache, KVM_MMU_CACHE_MIN_PAGES);
	if (err)
		goto out;

retry:
	/*
	 * Used to check for invalidations in progress, of the pfn that is
	 * returned by gfn_to_pfn_prot() below.
	 */
	mmu_seq = kvm->mmu_invalidate_seq;
	/*
	 * Ensure the read of mmu_invalidate_seq isn't reordered with PTE reads in
	 * gfn_to_pfn_prot() (which calls get_user_pages()), so that we don't
	 * risk the page we get a reference to getting unmapped before we have a
	 * chance to grab the mmu_lock without mmu_invalidate_retry() noticing.
	 *
	 * This smp_rmb() pairs with the effective smp_wmb() of the combination
	 * of the pte_unmap_unlock() after the PTE is zapped, and the
	 * spin_lock() in kvm_mmu_invalidate_<page|range_end>() before
	 * mmu_invalidate_seq is incremented.
	 */
	smp_rmb();

	/* Slow path - ask KVM core whether we can access this GPA */
	pfn = gfn_to_pfn_prot(kvm, gfn, write, &writeable);
	if (is_error_noslot_pfn(pfn)) {
		err = -EFAULT;
		goto out;
	}

	/* Check if an invalidation has taken place since we got pfn */
	spin_lock(&kvm->mmu_lock);
	if (mmu_invalidate_retry_hva(kvm, mmu_seq, hva)) {
		/*
		 * This can happen when mappings are changed asynchronously, but
		 * also synchronously if a COW is triggered by
		 * gfn_to_pfn_prot().
		 */
		spin_unlock(&kvm->mmu_lock);
		kvm_release_pfn_clean(pfn);
		if (retry_no > 100) {
			retry_no = 0;
			schedule();
		}
		retry_no++;
		goto retry;
	}

	/*
	 * For emulated devices such as virtio devices, the actual cache
	 * attribute is determined by the physical machine.
	 * For a pass-through physical device, it should be uncachable.
	 */
	prot_bits = _PAGE_PRESENT | __READABLE;
	if (pfn_valid(pfn))
		prot_bits |= _CACHE_CC;
	else
		prot_bits |= _CACHE_SUC;

	if (writeable) {
		prot_bits |= _PAGE_WRITE;
		if (write)
			prot_bits |= __WRITEABLE;
	}

	/* Disable dirty logging on HugePages */
	level = 0;
	if (!fault_supports_huge_mapping(memslot, hva, PMD_SIZE, write)) {
		level = 0;
	} else {
		level = host_pfn_mapping_level(kvm, gfn, memslot);
		if (level == 1) {
			gfn = gfn & ~(PTRS_PER_PTE - 1);
			pfn = pfn & ~(PTRS_PER_PTE - 1);
		}
	}

	/* Ensure page tables are allocated */
	ptep = kvm_populate_gpa(kvm, memcache, gpa, level);
	new_pte = kvm_pfn_pte(pfn, __pgprot(prot_bits));
	if (level == 1) {
		new_pte = kvm_pte_mkhuge(new_pte);
		/*
		 * The previous pmd entry was invalid_pte_table, so there may be
		 * stale small-page TLB entries; flush these invalid TLB entries
		 * for the current vCPU.
		 */
		kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
		++kvm->stat.hugepages;
	} else if (kvm_pte_huge(*ptep) && write)
		ptep = kvm_split_huge(vcpu, ptep, gfn);
	else
		++kvm->stat.pages;
	kvm_set_pte(ptep, new_pte);
	spin_unlock(&kvm->mmu_lock);

	if (prot_bits & _PAGE_DIRTY) {
		mark_page_dirty_in_slot(kvm, memslot, gfn);
		kvm_set_pfn_dirty(pfn);
	}

	kvm_set_pfn_accessed(pfn);
	kvm_release_pfn_clean(pfn);
out:
	srcu_read_unlock(&kvm->srcu, srcu_idx);
	return err;
}

int kvm_handle_mm_fault(struct kvm_vcpu *vcpu, unsigned long gpa, bool write)
{
	int ret;

	ret = kvm_map_page(vcpu, gpa, write);
	if (ret)
		return ret;

	/* Invalidate this entry in the TLB */
	kvm_flush_tlb_gpa(vcpu, gpa);

	return 0;
}

void kvm_arch_sync_dirty_log(struct kvm *kvm, struct kvm_memory_slot *memslot)
{
}

int kvm_arch_prepare_memory_region(struct kvm *kvm, const struct kvm_memory_slot *old,
				   struct kvm_memory_slot *new, enum kvm_mr_change change)
{
	return 0;
}

void kvm_arch_flush_remote_tlbs_memslot(struct kvm *kvm,
					const struct kvm_memory_slot *memslot)
{
	kvm_flush_remote_tlbs(kvm);
}