1 // SPDX-License-Identifier: GPL-2.0 2 /* 3 * Copyright (C) 2020-2023 Loongson Technology Corporation Limited 4 */ 5 6 #include <linux/highmem.h> 7 #include <linux/hugetlb.h> 8 #include <linux/kvm_host.h> 9 #include <linux/page-flags.h> 10 #include <linux/uaccess.h> 11 #include <asm/mmu_context.h> 12 #include <asm/pgalloc.h> 13 #include <asm/tlb.h> 14 #include <asm/kvm_mmu.h> 15 16 static inline bool kvm_hugepage_capable(struct kvm_memory_slot *slot) 17 { 18 return slot->arch.flags & KVM_MEM_HUGEPAGE_CAPABLE; 19 } 20 21 static inline bool kvm_hugepage_incapable(struct kvm_memory_slot *slot) 22 { 23 return slot->arch.flags & KVM_MEM_HUGEPAGE_INCAPABLE; 24 } 25 26 static inline void kvm_ptw_prepare(struct kvm *kvm, kvm_ptw_ctx *ctx) 27 { 28 ctx->level = kvm->arch.root_level; 29 /* pte table */ 30 ctx->invalid_ptes = kvm->arch.invalid_ptes; 31 ctx->pte_shifts = kvm->arch.pte_shifts; 32 ctx->pgtable_shift = ctx->pte_shifts[ctx->level]; 33 ctx->invalid_entry = ctx->invalid_ptes[ctx->level]; 34 ctx->opaque = kvm; 35 } 36 37 /* 38 * Mark a range of guest physical address space old (all accesses fault) in the 39 * VM's GPA page table to allow detection of commonly used pages. 40 */ 41 static int kvm_mkold_pte(kvm_pte_t *pte, phys_addr_t addr, kvm_ptw_ctx *ctx) 42 { 43 if (kvm_pte_young(*pte)) { 44 *pte = kvm_pte_mkold(*pte); 45 return 1; 46 } 47 48 return 0; 49 } 50 51 /* 52 * Mark a range of guest physical address space clean (writes fault) in the VM's 53 * GPA page table to allow dirty page tracking. 
54 */ 55 static int kvm_mkclean_pte(kvm_pte_t *pte, phys_addr_t addr, kvm_ptw_ctx *ctx) 56 { 57 gfn_t offset; 58 kvm_pte_t val; 59 60 val = *pte; 61 /* 62 * For kvm_arch_mmu_enable_log_dirty_pt_masked with mask, start and end 63 * may cross hugepage, for first huge page parameter addr is equal to 64 * start, however for the second huge page addr is base address of 65 * this huge page, rather than start or end address 66 */ 67 if ((ctx->flag & _KVM_HAS_PGMASK) && !kvm_pte_huge(val)) { 68 offset = (addr >> PAGE_SHIFT) - ctx->gfn; 69 if (!(BIT(offset) & ctx->mask)) 70 return 0; 71 } 72 73 /* 74 * Need not split huge page now, just set write-proect pte bit 75 * Split huge page until next write fault 76 */ 77 if (kvm_pte_dirty(val)) { 78 *pte = kvm_pte_mkclean(val); 79 return 1; 80 } 81 82 return 0; 83 } 84 85 /* 86 * Clear pte entry 87 */ 88 static int kvm_flush_pte(kvm_pte_t *pte, phys_addr_t addr, kvm_ptw_ctx *ctx) 89 { 90 struct kvm *kvm; 91 92 kvm = ctx->opaque; 93 if (ctx->level) 94 kvm->stat.hugepages--; 95 else 96 kvm->stat.pages--; 97 98 *pte = ctx->invalid_entry; 99 100 return 1; 101 } 102 103 /* 104 * kvm_pgd_alloc() - Allocate and initialise a KVM GPA page directory. 105 * 106 * Allocate a blank KVM GPA page directory (PGD) for representing guest physical 107 * to host physical page mappings. 108 * 109 * Returns: Pointer to new KVM GPA page directory. 110 * NULL on allocation failure. 
 */
kvm_pte_t *kvm_pgd_alloc(void)
{
	kvm_pte_t *pgd;

	/* Order-0 allocation: the PGD is a single page */
	pgd = (kvm_pte_t *)__get_free_pages(GFP_KERNEL, 0);
	if (pgd)
		pgd_init((void *)pgd);

	return pgd;
}

/*
 * Fill one page table page with @val.
 *
 * Stores @val into PTRS_PER_PTE consecutive unsigned long slots starting at
 * @addr. The loop is unrolled to eight stores per iteration (five before and
 * three after the pointer bump), so it only terminates correctly when
 * PTRS_PER_PTE is a non-zero multiple of 8.
 */
static void _kvm_pte_init(void *addr, unsigned long val)
{
	unsigned long *p, *end;

	p = (unsigned long *)addr;
	end = p + PTRS_PER_PTE;
	do {
		p[0] = val;
		p[1] = val;
		p[2] = val;
		p[3] = val;
		p[4] = val;
		p += 8;
		p[-3] = val;
		p[-2] = val;
		p[-1] = val;
	} while (p != end);
}

/*
 * Caller must hold kvm->mmu_lock
 *
 * Walk the page tables of kvm to find the PTE corresponding to the
 * address @addr. If page tables don't exist for @addr, they will be created
 * from the MMU cache if @cache is not NULL.
 *
 * The walk descends from the root until the walk level reaches @level.
 *
 * Returns:	Pointer to the entry for @addr at @level, or
 *		pointer to a huge entry met at a higher level, or
 *		NULL if an intermediate table is missing and @cache is NULL.
 */
static kvm_pte_t *kvm_populate_gpa(struct kvm *kvm,
				struct kvm_mmu_memory_cache *cache,
				unsigned long addr, int level)
{
	kvm_ptw_ctx ctx;
	kvm_pte_t *entry, *child;

	kvm_ptw_prepare(kvm, &ctx);
	child = kvm->arch.pgd;
	while (ctx.level > level) {
		entry = kvm_pgtable_offset(&ctx, child, addr);
		if (kvm_pte_none(&ctx, entry)) {
			/* Lookup-only callers pass no cache: report the hole */
			if (!cache)
				return NULL;

			/*
			 * Allocate the next-level table and pre-fill it with
			 * that level's invalid entry before linking it in.
			 */
			child = kvm_mmu_memory_cache_alloc(cache);
			_kvm_pte_init(child, ctx.invalid_ptes[ctx.level - 1]);
			smp_wmb(); /* Make pte visible before pmd */
			kvm_set_pte(entry, __pa(child));
		} else if (kvm_pte_huge(*entry)) {
			/* Huge mapping above @level: hand back the huge entry */
			return entry;
		} else
			child = (kvm_pte_t *)__va(PHYSADDR(*entry));
		/* Step the walk context down to the child table's level */
		kvm_ptw_enter(&ctx);
	}

	entry = kvm_pgtable_offset(&ctx, child, addr);

	return entry;
}

/*
 * Page walker for VM shadow mmu at last level
 * The last level is small pte page or huge pmd page
 *
 * @dir:  entry in the parent table that points to the table being walked
 * @addr: first address of the range, @end: end of the range (exclusive)
 *
 * Applies ctx->ops to every present entry in [@addr, @end) and ORs the
 * results together. When the context requests flushing (kvm_need_flush())
 * and the range covers the whole table, the table page is queued on
 * ctx->list for the caller to free and @dir is reset to an invalid entry.
 *
 * Returns the OR of all ctx->ops return values.
 */
static int kvm_ptw_leaf(kvm_pte_t *dir, phys_addr_t addr, phys_addr_t end, kvm_ptw_ctx *ctx)
{
	int ret;
	phys_addr_t next, start, size;
	struct list_head *list;
	kvm_pte_t *entry, *child;

	ret = 0;
	start = addr;
	child = (kvm_pte_t *)__va(PHYSADDR(*dir));
	entry = kvm_pgtable_offset(ctx, child, addr);
	do {
		next = addr + (0x1UL << ctx->pgtable_shift);
		/*
		 * "continue" in a do/while jumps to the loop condition, so
		 * entry and addr are still advanced for skipped entries.
		 */
		if (!kvm_pte_present(ctx, entry))
			continue;

		ret |= ctx->ops(entry, addr, ctx);
	} while (entry++, addr = next, addr < end);

	if (kvm_need_flush(ctx)) {
		/*
		 * Address span of one complete table at this level.
		 * NOTE(review): "PAGE_SHIFT - 3" presumably is log2 of the
		 * number of 8-byte entries per page - confirm.
		 */
		size = 0x1UL << (ctx->pgtable_shift + PAGE_SHIFT - 3);
		if (start + size == end) {
			/*
			 * The table page itself is reused as the list node;
			 * its contents are no longer needed once unlinked.
			 */
			list = (struct list_head *)child;
			list_add_tail(list, &ctx->list);
			*dir = ctx->invalid_ptes[ctx->level + 1];
		}
	}

	return ret;
}

/*
 * Page walker for VM shadow mmu at page table dir level
 *
 * @dir:  entry in the parent table that points to the table being walked
 * @addr: first address of the range, @end: end of the range (exclusive)
 *
 * Huge entries get ctx->ops applied directly; other present entries are
 * descended into, using kvm_ptw_leaf() once the walk level reaches 0 and
 * recursing into kvm_ptw_dir() otherwise. As in kvm_ptw_leaf(), a fully
 * covered table is queued on ctx->list when flushing is requested.
 *
 * Returns the OR of all ctx->ops return values.
 */
static int kvm_ptw_dir(kvm_pte_t *dir, phys_addr_t addr, phys_addr_t end, kvm_ptw_ctx *ctx)
{
	int ret;
	phys_addr_t next, start, size;
	struct list_head *list;
	kvm_pte_t *entry, *child;

	ret = 0;
	start = addr;
	child = (kvm_pte_t *)__va(PHYSADDR(*dir));
	entry = kvm_pgtable_offset(ctx, child, addr);
	do {
		next = kvm_pgtable_addr_end(ctx, addr, end);
		/* "continue" still runs the advance in the loop condition */
		if (!kvm_pte_present(ctx, entry))
			continue;

		if (kvm_pte_huge(*entry)) {
			/* ops is called before entering, i.e. at dir level */
			ret |= ctx->ops(entry, addr, ctx);
			continue;
		}

		kvm_ptw_enter(ctx);
		if (ctx->level == 0)
			ret |= kvm_ptw_leaf(entry, addr, next, ctx);
		else
			ret |= kvm_ptw_dir(entry, addr, next, ctx);
		kvm_ptw_exit(ctx);
	} while (entry++, addr = next, addr < end);

	if (kvm_need_flush(ctx)) {
		/* Same whole-table test as in kvm_ptw_leaf() */
		size = 0x1UL << (ctx->pgtable_shift + PAGE_SHIFT - 3);
		if (start + size == end) {
			/* Queue the now unused table page for freeing */
			list = (struct list_head *)child;
			list_add_tail(list, &ctx->list);
			*dir = ctx->invalid_ptes[ctx->level + 1];
		}
	}

	return ret;
}

/*
 * Page walker for VM shadow mmu at page root table
 *
 * @dir is the root table itself (not an entry pointing to it). The root
 * table is never queued for freeing here.
 *
 * Returns the OR of all ctx->ops return values for [@addr, @end).
 */
static int kvm_ptw_top(kvm_pte_t *dir, phys_addr_t addr, phys_addr_t end, kvm_ptw_ctx *ctx)
{
	int ret;
	phys_addr_t next;
	kvm_pte_t *entry;

	ret = 0;
	entry = kvm_pgtable_offset(ctx, dir, addr);
	do {
		next = kvm_pgtable_addr_end(ctx, addr, end);
		if (!kvm_pte_present(ctx, entry))
			continue;

		kvm_ptw_enter(ctx);
		ret |= kvm_ptw_dir(entry, addr, next, ctx);
		kvm_ptw_exit(ctx);
	} while (entry++, addr = next, addr < end);

	return ret;
}

/*
 * kvm_flush_range() - Flush a range of guest physical addresses.
 * @kvm:	KVM pointer.
 * @start_gfn:	Guest frame number of first page in GPA range to flush.
 * @end_gfn:	Guest frame number just past the last page in GPA range to
 *		flush (exclusive: the walk stops when addr reaches it).
 * @lock:	Non-zero to take kvm->mmu_lock around the page table walk,
 *		zero to walk without taking it.
 *
 * Flushes a range of GPA mappings from the GPA page tables.
 */
static void kvm_flush_range(struct kvm *kvm, gfn_t start_gfn, gfn_t end_gfn, int lock)
{
	int ret;
	kvm_ptw_ctx ctx;
	struct list_head *pos, *temp;

	ctx.ops = kvm_flush_pte;
	ctx.flag = _KVM_FLUSH_PGTABLE;
	kvm_ptw_prepare(kvm, &ctx);
	/* Collects page table pages that the walk unlinks */
	INIT_LIST_HEAD(&ctx.list);

	if (lock) {
		spin_lock(&kvm->mmu_lock);
		ret = kvm_ptw_top(kvm->arch.pgd, start_gfn << PAGE_SHIFT,
					end_gfn << PAGE_SHIFT, &ctx);
		spin_unlock(&kvm->mmu_lock);
	} else
		ret = kvm_ptw_top(kvm->arch.pgd, start_gfn << PAGE_SHIFT,
					end_gfn << PAGE_SHIFT, &ctx);

	/* Flush vpid for each vCPU individually */
	if (ret)
		kvm_flush_remote_tlbs(kvm);

	/*
	 * free pte table page after mmu_lock
	 * the pte table page is linked together with ctx.list
	 */
	list_for_each_safe(pos, temp, &ctx.list) {
		list_del(pos);
		/* The list node lives at the start of the page being freed */
		free_page((unsigned long)pos);
	}
}

/*
 * kvm_mkclean_gpa_pt() - Make a range of guest physical addresses clean.
 * @kvm:	KVM pointer.
 * @start_gfn:	Guest frame number of first page in GPA range to make clean.
 * @end_gfn:	Guest frame number just past the last page in GPA range to
 *		make clean (exclusive).
 *
 * Make a range of GPA mappings clean so that guest writes will fault and
 * trigger dirty page logging.
334 * 335 * The caller must hold the @kvm->mmu_lock spinlock. 336 * 337 * Returns: Whether any GPA mappings were modified, which would require 338 * derived mappings (GVA page tables & TLB enties) to be 339 * invalidated. 340 */ 341 static int kvm_mkclean_gpa_pt(struct kvm *kvm, gfn_t start_gfn, gfn_t end_gfn) 342 { 343 kvm_ptw_ctx ctx; 344 345 ctx.ops = kvm_mkclean_pte; 346 ctx.flag = 0; 347 kvm_ptw_prepare(kvm, &ctx); 348 return kvm_ptw_top(kvm->arch.pgd, start_gfn << PAGE_SHIFT, end_gfn << PAGE_SHIFT, &ctx); 349 } 350 351 /* 352 * kvm_arch_mmu_enable_log_dirty_pt_masked() - write protect dirty pages 353 * @kvm: The KVM pointer 354 * @slot: The memory slot associated with mask 355 * @gfn_offset: The gfn offset in memory slot 356 * @mask: The mask of dirty pages at offset 'gfn_offset' in this memory 357 * slot to be write protected 358 * 359 * Walks bits set in mask write protects the associated pte's. Caller must 360 * acquire @kvm->mmu_lock. 361 */ 362 void kvm_arch_mmu_enable_log_dirty_pt_masked(struct kvm *kvm, 363 struct kvm_memory_slot *slot, gfn_t gfn_offset, unsigned long mask) 364 { 365 kvm_ptw_ctx ctx; 366 gfn_t base_gfn = slot->base_gfn + gfn_offset; 367 gfn_t start = base_gfn + __ffs(mask); 368 gfn_t end = base_gfn + __fls(mask) + 1; 369 370 ctx.ops = kvm_mkclean_pte; 371 ctx.flag = _KVM_HAS_PGMASK; 372 ctx.mask = mask; 373 ctx.gfn = base_gfn; 374 kvm_ptw_prepare(kvm, &ctx); 375 376 kvm_ptw_top(kvm->arch.pgd, start << PAGE_SHIFT, end << PAGE_SHIFT, &ctx); 377 } 378 379 int kvm_arch_prepare_memory_region(struct kvm *kvm, const struct kvm_memory_slot *old, 380 struct kvm_memory_slot *new, enum kvm_mr_change change) 381 { 382 gpa_t gpa_start; 383 hva_t hva_start; 384 size_t size, gpa_offset, hva_offset; 385 386 if ((change != KVM_MR_MOVE) && (change != KVM_MR_CREATE)) 387 return 0; 388 /* 389 * Prevent userspace from creating a memory region outside of the 390 * VM GPA address space 391 */ 392 if ((new->base_gfn + new->npages) > (kvm->arch.gpa_size >> 
PAGE_SHIFT)) 393 return -ENOMEM; 394 395 new->arch.flags = 0; 396 size = new->npages * PAGE_SIZE; 397 gpa_start = new->base_gfn << PAGE_SHIFT; 398 hva_start = new->userspace_addr; 399 if (IS_ALIGNED(size, PMD_SIZE) && IS_ALIGNED(gpa_start, PMD_SIZE) 400 && IS_ALIGNED(hva_start, PMD_SIZE)) 401 new->arch.flags |= KVM_MEM_HUGEPAGE_CAPABLE; 402 else { 403 /* 404 * Pages belonging to memslots that don't have the same 405 * alignment within a PMD for userspace and GPA cannot be 406 * mapped with PMD entries, because we'll end up mapping 407 * the wrong pages. 408 * 409 * Consider a layout like the following: 410 * 411 * memslot->userspace_addr: 412 * +-----+--------------------+--------------------+---+ 413 * |abcde|fgh Stage-1 block | Stage-1 block tv|xyz| 414 * +-----+--------------------+--------------------+---+ 415 * 416 * memslot->base_gfn << PAGE_SIZE: 417 * +---+--------------------+--------------------+-----+ 418 * |abc|def Stage-2 block | Stage-2 block |tvxyz| 419 * +---+--------------------+--------------------+-----+ 420 * 421 * If we create those stage-2 blocks, we'll end up with this 422 * incorrect mapping: 423 * d -> f 424 * e -> g 425 * f -> h 426 */ 427 gpa_offset = gpa_start & (PMD_SIZE - 1); 428 hva_offset = hva_start & (PMD_SIZE - 1); 429 if (gpa_offset != hva_offset) { 430 new->arch.flags |= KVM_MEM_HUGEPAGE_INCAPABLE; 431 } else { 432 if (gpa_offset == 0) 433 gpa_offset = PMD_SIZE; 434 if ((size + gpa_offset) < (PMD_SIZE * 2)) 435 new->arch.flags |= KVM_MEM_HUGEPAGE_INCAPABLE; 436 } 437 } 438 439 return 0; 440 } 441 442 void kvm_arch_commit_memory_region(struct kvm *kvm, 443 struct kvm_memory_slot *old, 444 const struct kvm_memory_slot *new, 445 enum kvm_mr_change change) 446 { 447 int needs_flush; 448 u32 old_flags = old ? old->flags : 0; 449 u32 new_flags = new ? 
new->flags : 0; 450 bool log_dirty_pages = new_flags & KVM_MEM_LOG_DIRTY_PAGES; 451 452 /* Only track memslot flags changed */ 453 if (change != KVM_MR_FLAGS_ONLY) 454 return; 455 456 /* Discard dirty page tracking on readonly memslot */ 457 if ((old_flags & new_flags) & KVM_MEM_READONLY) 458 return; 459 460 /* 461 * If dirty page logging is enabled, write protect all pages in the slot 462 * ready for dirty logging. 463 * 464 * There is no need to do this in any of the following cases: 465 * CREATE: No dirty mappings will already exist. 466 * MOVE/DELETE: The old mappings will already have been cleaned up by 467 * kvm_arch_flush_shadow_memslot() 468 */ 469 if (!(old_flags & KVM_MEM_LOG_DIRTY_PAGES) && log_dirty_pages) { 470 /* 471 * Initially-all-set does not require write protecting any page 472 * because they're all assumed to be dirty. 473 */ 474 if (kvm_dirty_log_manual_protect_and_init_set(kvm)) 475 return; 476 477 spin_lock(&kvm->mmu_lock); 478 /* Write protect GPA page table entries */ 479 needs_flush = kvm_mkclean_gpa_pt(kvm, new->base_gfn, 480 new->base_gfn + new->npages); 481 spin_unlock(&kvm->mmu_lock); 482 if (needs_flush) 483 kvm_flush_remote_tlbs(kvm); 484 } 485 } 486 487 void kvm_arch_flush_shadow_all(struct kvm *kvm) 488 { 489 kvm_flush_range(kvm, 0, kvm->arch.gpa_size >> PAGE_SHIFT, 0); 490 } 491 492 void kvm_arch_flush_shadow_memslot(struct kvm *kvm, struct kvm_memory_slot *slot) 493 { 494 /* 495 * The slot has been made invalid (ready for moving or deletion), so we 496 * need to ensure that it can no longer be accessed by any guest vCPUs. 
497 */ 498 kvm_flush_range(kvm, slot->base_gfn, slot->base_gfn + slot->npages, 1); 499 } 500 501 bool kvm_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range) 502 { 503 kvm_ptw_ctx ctx; 504 505 ctx.flag = 0; 506 ctx.ops = kvm_flush_pte; 507 kvm_ptw_prepare(kvm, &ctx); 508 INIT_LIST_HEAD(&ctx.list); 509 510 return kvm_ptw_top(kvm->arch.pgd, range->start << PAGE_SHIFT, 511 range->end << PAGE_SHIFT, &ctx); 512 } 513 514 bool kvm_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range) 515 { 516 kvm_ptw_ctx ctx; 517 518 ctx.flag = 0; 519 ctx.ops = kvm_mkold_pte; 520 kvm_ptw_prepare(kvm, &ctx); 521 522 return kvm_ptw_top(kvm->arch.pgd, range->start << PAGE_SHIFT, 523 range->end << PAGE_SHIFT, &ctx); 524 } 525 526 bool kvm_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range) 527 { 528 gpa_t gpa = range->start << PAGE_SHIFT; 529 kvm_pte_t *ptep = kvm_populate_gpa(kvm, NULL, gpa, 0); 530 531 if (ptep && kvm_pte_present(NULL, ptep) && kvm_pte_young(*ptep)) 532 return true; 533 534 return false; 535 } 536 537 /* 538 * kvm_map_page_fast() - Fast path GPA fault handler. 539 * @vcpu: vCPU pointer. 540 * @gpa: Guest physical address of fault. 541 * @write: Whether the fault was due to a write. 542 * 543 * Perform fast path GPA fault handling, doing all that can be done without 544 * calling into KVM. This handles marking old pages young (for idle page 545 * tracking), and dirtying of clean pages (for dirty page logging). 546 * 547 * Returns: 0 on success, in which case we can update derived mappings and 548 * resume guest execution. 549 * -EFAULT on failure due to absent GPA mapping or write to 550 * read-only page, in which case KVM must be consulted. 
 */
static int kvm_map_page_fast(struct kvm_vcpu *vcpu, unsigned long gpa, bool write)
{
	int ret = 0;
	kvm_pfn_t pfn = 0;
	kvm_pte_t *ptep, changed, new;
	gfn_t gfn = gpa >> PAGE_SHIFT;
	struct kvm *kvm = vcpu->kvm;
	struct kvm_memory_slot *slot;
	/* Only assigned and only read when "changed" is non-zero */
	struct page *page;

	spin_lock(&kvm->mmu_lock);

	/* Fast path - just check GPA page table for an existing entry */
	ptep = kvm_populate_gpa(kvm, NULL, gpa, 0);
	if (!ptep || !kvm_pte_present(NULL, ptep)) {
		ret = -EFAULT;
		goto out;
	}

	/* Track access to pages marked old */
	new = kvm_pte_mkyoung(*ptep);
	/* call kvm_set_pfn_accessed() after unlock */

	if (write && !kvm_pte_dirty(new)) {
		/* Write to a mapping without write permission: slow path */
		if (!kvm_pte_write(new)) {
			ret = -EFAULT;
			goto out;
		}

		if (kvm_pte_huge(new)) {
			/*
			 * Do not set write permission when dirty logging is
			 * enabled for HugePages
			 */
			slot = gfn_to_memslot(kvm, gfn);
			if (kvm_slot_dirty_track_enabled(slot)) {
				ret = -EFAULT;
				goto out;
			}
		}

		/* Track dirtying of writeable pages */
		new = kvm_pte_mkdirty(new);
	}

	/* Bits that differ between the old and the new entry */
	changed = new ^ (*ptep);
	if (changed) {
		kvm_set_pte(ptep, new);
		pfn = kvm_pte_pfn(new);
		/*
		 * Take a page reference while still under mmu_lock; it is
		 * dropped after the accessed/dirty bookkeeping below, which
		 * runs once the lock has been released.
		 */
		page = kvm_pfn_to_refcounted_page(pfn);
		if (page)
			get_page(page);
	}
	spin_unlock(&kvm->mmu_lock);

	if (changed) {
		if (kvm_pte_young(changed))
			kvm_set_pfn_accessed(pfn);

		if (kvm_pte_dirty(changed)) {
			mark_page_dirty(kvm, gfn);
			kvm_set_pfn_dirty(pfn);
		}
		if (page)
			put_page(page);
	}
	return ret;
out:
	/* Error exits still hold mmu_lock */
	spin_unlock(&kvm->mmu_lock);
	return ret;
}

/*
 * Decide whether a fault at @hva inside @memslot may be served with a huge
 * (PMD sized) mapping.
 *
 * Returns false for write faults on slots with dirty tracking enabled and
 * for slots flagged huge page incapable, true for slots flagged huge page
 * capable. For all other slots the answer depends on whether @hva lies in
 * the PMD aligned interior of the slot.
 */
static bool fault_supports_huge_mapping(struct kvm_memory_slot *memslot,
				unsigned long hva, bool write)
{
	hva_t start, end;

	/* Disable dirty logging on HugePages */
	if (kvm_slot_dirty_track_enabled(memslot) && write)
		return false;

	if (kvm_hugepage_capable(memslot))
		return true;

	if (kvm_hugepage_incapable(memslot))
		return false;

	start = memslot->userspace_addr;
	end = start + memslot->npages * PAGE_SIZE;

	/*
	 * Next, let's make sure we're not trying to map anything not covered
	 * by the memslot. This means we have to prohibit block size mappings
	 * for the beginning and end of a non-block aligned and non-block sized
	 * memory slot (illustrated by the head and tail parts of the
	 * userspace view in kvm_arch_prepare_memory_region() containing pages
	 * 'abcde' and 'xyz', respectively).
	 *
	 * Note that it doesn't matter if we do the check using the
	 * userspace_addr or the base_gfn, as both are equally aligned (per
	 * the check above) and equally sized.
	 */
	return (hva >= ALIGN(start, PMD_SIZE)) && (hva < ALIGN_DOWN(end, PMD_SIZE));
}

/*
 * Lookup the mapping level for @gfn in the current mm.
 *
 * WARNING!  Use of host_pfn_mapping_level() requires the caller and the end
 * consumer to be tied into KVM's handlers for MMU notifier events!
 *
 * There are several ways to safely use this helper:
 *
 * - Check mmu_invalidate_retry_gfn() after grabbing the mapping level, before
 *   consuming it.  In this case, mmu_lock doesn't need to be held during the
 *   lookup, but it does need to be held while checking the MMU notifier.
 *
 * - Hold mmu_lock AND ensure there is no in-progress MMU notifier invalidation
 *   event for the hva.  This can be done by explicitly checking the MMU
 *   notifier or by ensuring that KVM already has a valid mapping that covers
 *   the hva.
 *
 * - Do not use the result to install new mappings, e.g. use the host mapping
 *   level only to decide whether or not to zap an entry.  In this case, it's
 *   not required to hold mmu_lock (though it's highly likely the caller will
 *   want to hold mmu_lock anyways, e.g. to modify SPTEs).
 *
 * Note!
 * The lookup can still race with modifications to host page tables, but
 * the above "rules" ensure KVM will not _consume_ the result of the walk if a
 * race with the primary MMU occurs.
 *
 * Returns 1 if the host maps the hva backing @gfn with a huge PMD entry,
 * 0 otherwise (including when any level of the host walk is not present).
 */
static int host_pfn_mapping_level(struct kvm *kvm, gfn_t gfn,
				const struct kvm_memory_slot *slot)
{
	int level = 0;
	unsigned long hva;
	unsigned long flags;
	pgd_t pgd;
	p4d_t p4d;
	pud_t pud;
	pmd_t pmd;

	/*
	 * Note, using the already-retrieved memslot and __gfn_to_hva_memslot()
	 * is not solely for performance, it's also necessary to avoid the
	 * "writable" check in __gfn_to_hva_many(), which will always fail on
	 * read-only memslots due to gfn_to_hva() assuming writes.  Earlier
	 * page fault steps have already verified the guest isn't writing a
	 * read-only memslot.
	 */
	hva = __gfn_to_hva_memslot(slot, gfn);

	/*
	 * Disable IRQs to prevent concurrent tear down of host page tables,
	 * e.g. if the primary MMU promotes a P*D to a huge page and then frees
	 * the original page table.
	 */
	local_irq_save(flags);

	/*
	 * Read each entry once.  As above, a non-leaf entry can be promoted to
	 * a huge page _during_ this walk.  Re-reading the entry could send the
	 * walk into the weeds, e.g. p*d_leaf() returns false (sees the old
	 * value) and then p*d_offset() walks into the target huge page instead
	 * of the old page table (sees the new value).
	 */
	pgd = READ_ONCE(*pgd_offset(kvm->mm, hva));
	if (pgd_none(pgd))
		goto out;

	/* Each lower level is looked up through the local snapshot above it */
	p4d = READ_ONCE(*p4d_offset(&pgd, hva));
	if (p4d_none(p4d) || !p4d_present(p4d))
		goto out;

	pud = READ_ONCE(*pud_offset(&p4d, hva));
	if (pud_none(pud) || !pud_present(pud))
		goto out;

	pmd = READ_ONCE(*pmd_offset(&pud, hva));
	if (pmd_none(pmd) || !pmd_present(pmd))
		goto out;

	/* Only PMD level huge pages are reported; the walk stops here */
	if (kvm_pte_huge(pmd_val(pmd)))
		level = 1;

out:
	local_irq_restore(flags);
	return level;
}

/*
 * Split huge page
 *
 * Replace the huge entry at @ptep with a freshly allocated pte table whose
 * PTRS_PER_PTE small entries cover the same range with the same attributes,
 * and update the page statistics accordingly.
 *
 * The table comes from the vCPU's mmu_page_cache.
 *
 * Returns a pointer to the small entry for @gfn inside the new table.
 */
static kvm_pte_t *kvm_split_huge(struct kvm_vcpu *vcpu, kvm_pte_t *ptep, gfn_t gfn)
{
	int i;
	kvm_pte_t val, *child;
	struct kvm *kvm = vcpu->kvm;
	struct kvm_mmu_memory_cache *memcache;

	memcache = &vcpu->arch.mmu_page_cache;
	child = kvm_mmu_memory_cache_alloc(memcache);
	val = kvm_pte_mksmall(*ptep);
	/* Consecutive small entries, each one page further than the last */
	for (i = 0; i < PTRS_PER_PTE; i++) {
		kvm_set_pte(child + i, val);
		val += PAGE_SIZE;
	}

	smp_wmb(); /* Make pte visible before pmd */
	/* The later kvm_flush_tlb_gpa() will flush hugepage tlb */
	kvm_set_pte(ptep, __pa(child));

	/* One huge mapping became PTRS_PER_PTE small mappings */
	kvm->stat.hugepages--;
	kvm->stat.pages += PTRS_PER_PTE;

	return child + (gfn & (PTRS_PER_PTE - 1));
}

/*
 * kvm_map_page() - Map a guest physical page.
 * @vcpu:	vCPU pointer.
 * @gpa:	Guest physical address of fault.
 * @write:	Whether the fault was due to a write.
 *
 * Handle GPA faults by creating a new GPA mapping (or updating an existing
 * one).
 *
 * This takes care of marking pages young or dirty (idle/dirty page tracking),
 * asking KVM for the corresponding PFN, and creating a mapping in the GPA page
 * tables. Derived mappings (GVA page tables and TLBs) must be handled by the
 * caller.
 *
 * Returns:	0 on success
 *		-EFAULT if there is no memory region at @gpa or a write was
 *		attempted to a read-only memory region.
 *		This is usually handled
 *		as an MMIO access.
 */
static int kvm_map_page(struct kvm_vcpu *vcpu, unsigned long gpa, bool write)
{
	bool writeable;
	int srcu_idx, err, retry_no = 0, level;
	unsigned long hva, mmu_seq, prot_bits;
	kvm_pfn_t pfn;
	kvm_pte_t *ptep, new_pte;
	gfn_t gfn = gpa >> PAGE_SHIFT;
	struct kvm *kvm = vcpu->kvm;
	struct kvm_memory_slot *memslot;
	struct kvm_mmu_memory_cache *memcache = &vcpu->arch.mmu_page_cache;

	/* Try the fast path to handle old / clean pages */
	srcu_idx = srcu_read_lock(&kvm->srcu);
	err = kvm_map_page_fast(vcpu, gpa, write);
	if (!err)
		goto out;

	memslot = gfn_to_memslot(kvm, gfn);
	hva = gfn_to_hva_memslot_prot(memslot, gfn, &writeable);
	if (kvm_is_error_hva(hva) || (write && !writeable)) {
		err = -EFAULT;
		goto out;
	}

	/* We need a minimum of cached pages ready for page table creation */
	err = kvm_mmu_topup_memory_cache(memcache, KVM_MMU_CACHE_MIN_PAGES);
	if (err)
		goto out;

retry:
	/*
	 * Used to check for invalidations in progress, of the pfn that is
	 * returned by gfn_to_pfn_prot below.
	 */
	mmu_seq = kvm->mmu_invalidate_seq;
	/*
	 * Ensure the read of mmu_invalidate_seq isn't reordered with PTE reads in
	 * gfn_to_pfn_prot() (which calls get_user_pages()), so that we don't
	 * risk the page we get a reference to getting unmapped before we have a
	 * chance to grab the mmu_lock without mmu_invalidate_retry() noticing.
	 *
	 * This smp_rmb() pairs with the effective smp_wmb() of the combination
	 * of the pte_unmap_unlock() after the PTE is zapped, and the
	 * spin_lock() in kvm_mmu_invalidate_invalidate_<page|range_end>() before
	 * mmu_invalidate_seq is incremented.
	 */
	smp_rmb();

	/* Slow path - ask KVM core whether we can access this GPA */
	pfn = gfn_to_pfn_prot(kvm, gfn, write, &writeable);
	if (is_error_noslot_pfn(pfn)) {
		err = -EFAULT;
		goto out;
	}

	/* Check if an invalidation has taken place since we got pfn */
	spin_lock(&kvm->mmu_lock);
	if (mmu_invalidate_retry_gfn(kvm, mmu_seq, gfn)) {
		/*
		 * This can happen when mappings are changed asynchronously, but
		 * also synchronously if a COW is triggered by
		 * gfn_to_pfn_prot().
		 */
		spin_unlock(&kvm->mmu_lock);
		kvm_release_pfn_clean(pfn);
		/* Call schedule() once more than 100 retries have piled up */
		if (retry_no > 100) {
			retry_no = 0;
			schedule();
		}
		retry_no++;
		goto retry;
	}

	/*
	 * For emulated devices such as a virtio device, the actual cache
	 * attribute is determined by the physical machine.
	 * For a pass-through physical device, it should be uncacheable.
	 */
	prot_bits = _PAGE_PRESENT | __READABLE;
	if (pfn_valid(pfn))
		prot_bits |= _CACHE_CC;
	else
		prot_bits |= _CACHE_SUC;

	if (writeable) {
		prot_bits |= _PAGE_WRITE;
		/* Only a write fault also adds __WRITEABLE up front */
		if (write)
			prot_bits |= __WRITEABLE;
	}

	/* Disable dirty logging on HugePages */
	level = 0;
	if (fault_supports_huge_mapping(memslot, hva, write)) {
		/* Check the page level in the host mmu */
		level = host_pfn_mapping_level(kvm, gfn, memslot);
		if (level == 1) {
			/*
			 * Check page level about secondary mmu
			 * Disable hugepage if it is normal page on
			 * secondary mmu already
			 */
			ptep = kvm_populate_gpa(kvm, NULL, gpa, 0);
			if (ptep && !kvm_pte_huge(*ptep))
				level = 0;
		}

		if (level == 1) {
			/* Round both frame numbers down to the huge page base */
			gfn = gfn & ~(PTRS_PER_PTE - 1);
			pfn = pfn & ~(PTRS_PER_PTE - 1);
		}
	}

	/* Ensure page tables are allocated */
	ptep = kvm_populate_gpa(kvm, memcache, gpa, level);
	new_pte = kvm_pfn_pte(pfn, __pgprot(prot_bits));
	if (level == 1) {
		new_pte = kvm_pte_mkhuge(new_pte);
		/*
		 * previous pmd entry is invalid_pte_table
		 * there is invalid tlb with small page
		 * need flush these invalid tlbs for current vcpu
		 */
		kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
		++kvm->stat.hugepages;
	} else if (kvm_pte_huge(*ptep) && write)
		/* Write fault hit an existing huge entry: split it first */
		ptep = kvm_split_huge(vcpu, ptep, gfn);
	else
		++kvm->stat.pages;
	kvm_set_pte(ptep, new_pte);
	spin_unlock(&kvm->mmu_lock);

	if (prot_bits & _PAGE_DIRTY) {
		mark_page_dirty_in_slot(kvm, memslot, gfn);
		kvm_set_pfn_dirty(pfn);
	}

	kvm_release_pfn_clean(pfn);
out:
	srcu_read_unlock(&kvm->srcu, srcu_idx);
	return err;
}

/*
 * Handle a guest physical address fault for @vcpu.
 *
 * Maps the page through kvm_map_page() and, on success, records @gpa in
 * vcpu->arch.flush_gpa and raises KVM_REQ_TLB_FLUSH_GPA for the vCPU.
 *
 * Returns 0 on success or the error returned by kvm_map_page().
 */
int kvm_handle_mm_fault(struct kvm_vcpu *vcpu, unsigned long gpa, bool write)
{
	int ret;

	ret = kvm_map_page(vcpu, gpa, write);
	if (ret)
		return ret;

	/* Invalidate this entry in the TLB */
	vcpu->arch.flush_gpa = gpa;
	kvm_make_request(KVM_REQ_TLB_FLUSH_GPA, vcpu);

	return 0;
}

/* Intentionally empty: nothing is done here before the dirty log is read. */
void kvm_arch_sync_dirty_log(struct kvm *kvm, struct kvm_memory_slot *memslot)
{
}

/* No per-memslot flush is implemented: all remote TLBs of the VM are flushed. */
void kvm_arch_flush_remote_tlbs_memslot(struct kvm *kvm,
					const struct kvm_memory_slot *memslot)
{
	kvm_flush_remote_tlbs(kvm);
}