1 // SPDX-License-Identifier: GPL-2.0 2 /* 3 * Copyright (C) 2020-2023 Loongson Technology Corporation Limited 4 */ 5 6 #include <linux/highmem.h> 7 #include <linux/hugetlb.h> 8 #include <linux/kvm_host.h> 9 #include <linux/page-flags.h> 10 #include <linux/uaccess.h> 11 #include <asm/mmu_context.h> 12 #include <asm/pgalloc.h> 13 #include <asm/tlb.h> 14 #include <asm/kvm_mmu.h> 15 16 static inline bool kvm_hugepage_capable(struct kvm_memory_slot *slot) 17 { 18 return slot->arch.flags & KVM_MEM_HUGEPAGE_CAPABLE; 19 } 20 21 static inline bool kvm_hugepage_incapable(struct kvm_memory_slot *slot) 22 { 23 return slot->arch.flags & KVM_MEM_HUGEPAGE_INCAPABLE; 24 } 25 26 static inline void kvm_ptw_prepare(struct kvm *kvm, kvm_ptw_ctx *ctx) 27 { 28 ctx->level = kvm->arch.root_level; 29 /* pte table */ 30 ctx->invalid_ptes = kvm->arch.invalid_ptes; 31 ctx->pte_shifts = kvm->arch.pte_shifts; 32 ctx->pgtable_shift = ctx->pte_shifts[ctx->level]; 33 ctx->invalid_entry = ctx->invalid_ptes[ctx->level]; 34 ctx->opaque = kvm; 35 } 36 37 /* 38 * Mark a range of guest physical address space old (all accesses fault) in the 39 * VM's GPA page table to allow detection of commonly used pages. 40 */ 41 static int kvm_mkold_pte(kvm_pte_t *pte, phys_addr_t addr, kvm_ptw_ctx *ctx) 42 { 43 if (kvm_pte_young(*pte)) { 44 *pte = kvm_pte_mkold(*pte); 45 return 1; 46 } 47 48 return 0; 49 } 50 51 /* 52 * Mark a range of guest physical address space clean (writes fault) in the VM's 53 * GPA page table to allow dirty page tracking. 54 */ 55 static int kvm_mkclean_pte(kvm_pte_t *pte, phys_addr_t addr, kvm_ptw_ctx *ctx) 56 { 57 gfn_t offset; 58 kvm_pte_t val; 59 60 val = *pte; 61 /* 62 * For kvm_arch_mmu_enable_log_dirty_pt_masked with mask, start and end 63 * may cross hugepage, for first huge page parameter addr is equal to 64 * start, however for the second huge page addr is base address of 65 * this huge page, rather than start or end address 66 */ 67 if ((ctx->flag & _KVM_HAS_PGMASK) && !kvm_pte_huge(val)) { 68 offset = (addr >> PAGE_SHIFT) - ctx->gfn; 69 if (!(BIT(offset) & ctx->mask)) 70 return 0; 71 } 72 73 /* 74 * Need not split huge page now, just set write-proect pte bit 75 * Split huge page until next write fault 76 */ 77 if (kvm_pte_dirty(val)) { 78 *pte = kvm_pte_mkclean(val); 79 return 1; 80 } 81 82 return 0; 83 } 84 85 /* 86 * Clear pte entry 87 */ 88 static int kvm_flush_pte(kvm_pte_t *pte, phys_addr_t addr, kvm_ptw_ctx *ctx) 89 { 90 struct kvm *kvm; 91 92 kvm = ctx->opaque; 93 if (ctx->level) 94 kvm->stat.hugepages--; 95 else 96 kvm->stat.pages--; 97 98 *pte = ctx->invalid_entry; 99 100 return 1; 101 } 102 103 /* 104 * kvm_pgd_alloc() - Allocate and initialise a KVM GPA page directory. 105 * 106 * Allocate a blank KVM GPA page directory (PGD) for representing guest physical 107 * to host physical page mappings. 108 * 109 * Returns: Pointer to new KVM GPA page directory. 110 * NULL on allocation failure. 111 */ 112 kvm_pte_t *kvm_pgd_alloc(void) 113 { 114 kvm_pte_t *pgd; 115 116 pgd = (kvm_pte_t *)__get_free_pages(GFP_KERNEL, 0); 117 if (pgd) 118 pgd_init((void *)pgd); 119 120 return pgd; 121 } 122 123 static void _kvm_pte_init(void *addr, unsigned long val) 124 { 125 unsigned long *p, *end; 126 127 p = (unsigned long *)addr; 128 end = p + PTRS_PER_PTE; 129 do { 130 p[0] = val; 131 p[1] = val; 132 p[2] = val; 133 p[3] = val; 134 p[4] = val; 135 p += 8; 136 p[-3] = val; 137 p[-2] = val; 138 p[-1] = val; 139 } while (p != end); 140 } 141 142 /* 143 * Caller must hold kvm->mm_lock 144 * 145 * Walk the page tables of kvm to find the PTE corresponding to the 146 * address @addr. If page tables don't exist for @addr, they will be created 147 * from the MMU cache if @cache is not NULL. 148 */ 149 static kvm_pte_t *kvm_populate_gpa(struct kvm *kvm, 150 struct kvm_mmu_memory_cache *cache, 151 unsigned long addr, int level) 152 { 153 kvm_ptw_ctx ctx; 154 kvm_pte_t *entry, *child; 155 156 kvm_ptw_prepare(kvm, &ctx); 157 child = kvm->arch.pgd; 158 while (ctx.level > level) { 159 entry = kvm_pgtable_offset(&ctx, child, addr); 160 if (kvm_pte_none(&ctx, entry)) { 161 if (!cache) 162 return NULL; 163 164 child = kvm_mmu_memory_cache_alloc(cache); 165 _kvm_pte_init(child, ctx.invalid_ptes[ctx.level - 1]); 166 smp_wmb(); /* Make pte visible before pmd */ 167 kvm_set_pte(entry, __pa(child)); 168 } else if (kvm_pte_huge(*entry)) { 169 return entry; 170 } else 171 child = (kvm_pte_t *)__va(PHYSADDR(*entry)); 172 kvm_ptw_enter(&ctx); 173 } 174 175 entry = kvm_pgtable_offset(&ctx, child, addr); 176 177 return entry; 178 } 179 180 /* 181 * Page walker for VM shadow mmu at last level 182 * The last level is small pte page or huge pmd page 183 */ 184 static int kvm_ptw_leaf(kvm_pte_t *dir, phys_addr_t addr, phys_addr_t end, kvm_ptw_ctx *ctx) 185 { 186 int ret; 187 phys_addr_t next, start, size; 188 struct list_head *list; 189 kvm_pte_t *entry, *child; 190 191 ret = 0; 192 start = addr; 193 child = (kvm_pte_t *)__va(PHYSADDR(*dir)); 194 entry = kvm_pgtable_offset(ctx, child, addr); 195 do { 196 next = addr + (0x1UL << ctx->pgtable_shift); 197 if (!kvm_pte_present(ctx, entry)) 198 continue; 199 200 ret |= ctx->ops(entry, addr, ctx); 201 } while (entry++, addr = next, addr < end); 202 203 if (kvm_need_flush(ctx)) { 204 size = 0x1UL << (ctx->pgtable_shift + PAGE_SHIFT - 3); 205 if (start + size == end) { 206 list = (struct list_head *)child; 207 list_add_tail(list, &ctx->list); 208 *dir = ctx->invalid_ptes[ctx->level + 1]; 209 } 210 } 211 212 return ret; 213 } 214 215 /* 216 * Page walker for VM shadow mmu at page table dir level 217 */ 218 static int kvm_ptw_dir(kvm_pte_t *dir, phys_addr_t addr, phys_addr_t end, kvm_ptw_ctx *ctx) 219 { 220 int ret; 221 phys_addr_t next, start, size; 222 struct list_head *list; 223 kvm_pte_t *entry, *child; 224 225 ret = 0; 226 start = addr; 227 child = (kvm_pte_t *)__va(PHYSADDR(*dir)); 228 entry = kvm_pgtable_offset(ctx, child, addr); 229 do { 230 next = kvm_pgtable_addr_end(ctx, addr, end); 231 if (!kvm_pte_present(ctx, entry)) 232 continue; 233 234 if (kvm_pte_huge(*entry)) { 235 ret |= ctx->ops(entry, addr, ctx); 236 continue; 237 } 238 239 kvm_ptw_enter(ctx); 240 if (ctx->level == 0) 241 ret |= kvm_ptw_leaf(entry, addr, next, ctx); 242 else 243 ret |= kvm_ptw_dir(entry, addr, next, ctx); 244 kvm_ptw_exit(ctx); 245 } while (entry++, addr = next, addr < end); 246 247 if (kvm_need_flush(ctx)) { 248 size = 0x1UL << (ctx->pgtable_shift + PAGE_SHIFT - 3); 249 if (start + size == end) { 250 list = (struct list_head *)child; 251 list_add_tail(list, &ctx->list); 252 *dir = ctx->invalid_ptes[ctx->level + 1]; 253 } 254 } 255 256 return ret; 257 } 258 259 /* 260 * Page walker for VM shadow mmu at page root table 261 */ 262 static int kvm_ptw_top(kvm_pte_t *dir, phys_addr_t addr, phys_addr_t end, kvm_ptw_ctx *ctx) 263 { 264 int ret; 265 phys_addr_t next; 266 kvm_pte_t *entry; 267 268 ret = 0; 269 entry = kvm_pgtable_offset(ctx, dir, addr); 270 do { 271 next = kvm_pgtable_addr_end(ctx, addr, end); 272 if (!kvm_pte_present(ctx, entry)) 273 continue; 274 275 kvm_ptw_enter(ctx); 276 ret |= kvm_ptw_dir(entry, addr, next, ctx); 277 kvm_ptw_exit(ctx); 278 } while (entry++, addr = next, addr < end); 279 280 return ret; 281 } 282 283 /* 284 * kvm_flush_range() - Flush a range of guest physical addresses. 285 * @kvm: KVM pointer. 286 * @start_gfn: Guest frame number of first page in GPA range to flush. 287 * @end_gfn: Guest frame number of last page in GPA range to flush. 288 * @lock: Whether to hold mmu_lock or not 289 * 290 * Flushes a range of GPA mappings from the GPA page tables. 291 */ 292 static void kvm_flush_range(struct kvm *kvm, gfn_t start_gfn, gfn_t end_gfn, int lock) 293 { 294 int ret; 295 kvm_ptw_ctx ctx; 296 struct list_head *pos, *temp; 297 298 ctx.ops = kvm_flush_pte; 299 ctx.flag = _KVM_FLUSH_PGTABLE; 300 kvm_ptw_prepare(kvm, &ctx); 301 INIT_LIST_HEAD(&ctx.list); 302 303 if (lock) { 304 spin_lock(&kvm->mmu_lock); 305 ret = kvm_ptw_top(kvm->arch.pgd, start_gfn << PAGE_SHIFT, 306 end_gfn << PAGE_SHIFT, &ctx); 307 spin_unlock(&kvm->mmu_lock); 308 } else 309 ret = kvm_ptw_top(kvm->arch.pgd, start_gfn << PAGE_SHIFT, 310 end_gfn << PAGE_SHIFT, &ctx); 311 312 /* Flush vpid for each vCPU individually */ 313 if (ret) 314 kvm_flush_remote_tlbs(kvm); 315 316 /* 317 * free pte table page after mmu_lock 318 * the pte table page is linked together with ctx.list 319 */ 320 list_for_each_safe(pos, temp, &ctx.list) { 321 list_del(pos); 322 free_page((unsigned long)pos); 323 } 324 } 325 326 /* 327 * kvm_mkclean_gpa_pt() - Make a range of guest physical addresses clean. 328 * @kvm: KVM pointer. 329 * @start_gfn: Guest frame number of first page in GPA range to flush. 330 * @end_gfn: Guest frame number of last page in GPA range to flush. 331 * 332 * Make a range of GPA mappings clean so that guest writes will fault and 333 * trigger dirty page logging. 334 * 335 * The caller must hold the @kvm->mmu_lock spinlock. 336 * 337 * Returns: Whether any GPA mappings were modified, which would require 338 * derived mappings (GVA page tables & TLB enties) to be 339 * invalidated. 340 */ 341 static int kvm_mkclean_gpa_pt(struct kvm *kvm, gfn_t start_gfn, gfn_t end_gfn) 342 { 343 kvm_ptw_ctx ctx; 344 345 ctx.ops = kvm_mkclean_pte; 346 ctx.flag = 0; 347 kvm_ptw_prepare(kvm, &ctx); 348 return kvm_ptw_top(kvm->arch.pgd, start_gfn << PAGE_SHIFT, end_gfn << PAGE_SHIFT, &ctx); 349 } 350 351 /* 352 * kvm_arch_mmu_enable_log_dirty_pt_masked() - write protect dirty pages 353 * @kvm: The KVM pointer 354 * @slot: The memory slot associated with mask 355 * @gfn_offset: The gfn offset in memory slot 356 * @mask: The mask of dirty pages at offset 'gfn_offset' in this memory 357 * slot to be write protected 358 * 359 * Walks bits set in mask write protects the associated pte's. Caller must 360 * acquire @kvm->mmu_lock. 361 */ 362 void kvm_arch_mmu_enable_log_dirty_pt_masked(struct kvm *kvm, 363 struct kvm_memory_slot *slot, gfn_t gfn_offset, unsigned long mask) 364 { 365 kvm_ptw_ctx ctx; 366 gfn_t base_gfn = slot->base_gfn + gfn_offset; 367 gfn_t start = base_gfn + __ffs(mask); 368 gfn_t end = base_gfn + __fls(mask) + 1; 369 370 ctx.ops = kvm_mkclean_pte; 371 ctx.flag = _KVM_HAS_PGMASK; 372 ctx.mask = mask; 373 ctx.gfn = base_gfn; 374 kvm_ptw_prepare(kvm, &ctx); 375 376 kvm_ptw_top(kvm->arch.pgd, start << PAGE_SHIFT, end << PAGE_SHIFT, &ctx); 377 } 378 379 int kvm_arch_prepare_memory_region(struct kvm *kvm, const struct kvm_memory_slot *old, 380 struct kvm_memory_slot *new, enum kvm_mr_change change) 381 { 382 gpa_t gpa_start; 383 hva_t hva_start; 384 size_t size, gpa_offset, hva_offset; 385 386 if ((change != KVM_MR_MOVE) && (change != KVM_MR_CREATE)) 387 return 0; 388 /* 389 * Prevent userspace from creating a memory region outside of the 390 * VM GPA address space 391 */ 392 if ((new->base_gfn + new->npages) > (kvm->arch.gpa_size >> PAGE_SHIFT)) 393 return -ENOMEM; 394 395 new->arch.flags = 0; 396 size = new->npages * PAGE_SIZE; 397 gpa_start = new->base_gfn << PAGE_SHIFT; 398 hva_start = new->userspace_addr; 399 if (IS_ALIGNED(size, PMD_SIZE) && IS_ALIGNED(gpa_start, PMD_SIZE) 400 && IS_ALIGNED(hva_start, PMD_SIZE)) 401 new->arch.flags |= KVM_MEM_HUGEPAGE_CAPABLE; 402 else { 403 /* 404 * Pages belonging to memslots that don't have the same 405 * alignment within a PMD for userspace and GPA cannot be 406 * mapped with PMD entries, because we'll end up mapping 407 * the wrong pages. 408 * 409 * Consider a layout like the following: 410 * 411 * memslot->userspace_addr: 412 * +-----+--------------------+--------------------+---+ 413 * |abcde|fgh Stage-1 block | Stage-1 block tv|xyz| 414 * +-----+--------------------+--------------------+---+ 415 * 416 * memslot->base_gfn << PAGE_SIZE: 417 * +---+--------------------+--------------------+-----+ 418 * |abc|def Stage-2 block | Stage-2 block |tvxyz| 419 * +---+--------------------+--------------------+-----+ 420 * 421 * If we create those stage-2 blocks, we'll end up with this 422 * incorrect mapping: 423 * d -> f 424 * e -> g 425 * f -> h 426 */ 427 gpa_offset = gpa_start & (PMD_SIZE - 1); 428 hva_offset = hva_start & (PMD_SIZE - 1); 429 if (gpa_offset != hva_offset) { 430 new->arch.flags |= KVM_MEM_HUGEPAGE_INCAPABLE; 431 } else { 432 if (gpa_offset == 0) 433 gpa_offset = PMD_SIZE; 434 if ((size + gpa_offset) < (PMD_SIZE * 2)) 435 new->arch.flags |= KVM_MEM_HUGEPAGE_INCAPABLE; 436 } 437 } 438 439 return 0; 440 } 441 442 void kvm_arch_commit_memory_region(struct kvm *kvm, 443 struct kvm_memory_slot *old, 444 const struct kvm_memory_slot *new, 445 enum kvm_mr_change change) 446 { 447 int needs_flush; 448 u32 old_flags = old ? old->flags : 0; 449 u32 new_flags = new ? new->flags : 0; 450 bool log_dirty_pages = new_flags & KVM_MEM_LOG_DIRTY_PAGES; 451 452 /* Only track memslot flags changed */ 453 if (change != KVM_MR_FLAGS_ONLY) 454 return; 455 456 /* Discard dirty page tracking on readonly memslot */ 457 if ((old_flags & new_flags) & KVM_MEM_READONLY) 458 return; 459 460 /* 461 * If dirty page logging is enabled, write protect all pages in the slot 462 * ready for dirty logging. 463 * 464 * There is no need to do this in any of the following cases: 465 * CREATE: No dirty mappings will already exist. 466 * MOVE/DELETE: The old mappings will already have been cleaned up by 467 * kvm_arch_flush_shadow_memslot() 468 */ 469 if (!(old_flags & KVM_MEM_LOG_DIRTY_PAGES) && log_dirty_pages) { 470 /* 471 * Initially-all-set does not require write protecting any page 472 * because they're all assumed to be dirty. 473 */ 474 if (kvm_dirty_log_manual_protect_and_init_set(kvm)) 475 return; 476 477 spin_lock(&kvm->mmu_lock); 478 /* Write protect GPA page table entries */ 479 needs_flush = kvm_mkclean_gpa_pt(kvm, new->base_gfn, 480 new->base_gfn + new->npages); 481 spin_unlock(&kvm->mmu_lock); 482 if (needs_flush) 483 kvm_flush_remote_tlbs(kvm); 484 } 485 } 486 487 void kvm_arch_flush_shadow_all(struct kvm *kvm) 488 { 489 kvm_flush_range(kvm, 0, kvm->arch.gpa_size >> PAGE_SHIFT, 0); 490 } 491 492 void kvm_arch_flush_shadow_memslot(struct kvm *kvm, struct kvm_memory_slot *slot) 493 { 494 /* 495 * The slot has been made invalid (ready for moving or deletion), so we 496 * need to ensure that it can no longer be accessed by any guest vCPUs. 497 */ 498 kvm_flush_range(kvm, slot->base_gfn, slot->base_gfn + slot->npages, 1); 499 } 500 501 bool kvm_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range) 502 { 503 kvm_ptw_ctx ctx; 504 505 ctx.flag = 0; 506 ctx.ops = kvm_flush_pte; 507 kvm_ptw_prepare(kvm, &ctx); 508 INIT_LIST_HEAD(&ctx.list); 509 510 return kvm_ptw_top(kvm->arch.pgd, range->start << PAGE_SHIFT, 511 range->end << PAGE_SHIFT, &ctx); 512 } 513 514 bool kvm_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range) 515 { 516 kvm_ptw_ctx ctx; 517 518 ctx.flag = 0; 519 ctx.ops = kvm_mkold_pte; 520 kvm_ptw_prepare(kvm, &ctx); 521 522 return kvm_ptw_top(kvm->arch.pgd, range->start << PAGE_SHIFT, 523 range->end << PAGE_SHIFT, &ctx); 524 } 525 526 bool kvm_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range) 527 { 528 gpa_t gpa = range->start << PAGE_SHIFT; 529 kvm_pte_t *ptep = kvm_populate_gpa(kvm, NULL, gpa, 0); 530 531 if (ptep && kvm_pte_present(NULL, ptep) && kvm_pte_young(*ptep)) 532 return true; 533 534 return false; 535 } 536 537 /* 538 * kvm_map_page_fast() - Fast path GPA fault handler. 539 * @vcpu: vCPU pointer. 540 * @gpa: Guest physical address of fault. 541 * @write: Whether the fault was due to a write. 542 * 543 * Perform fast path GPA fault handling, doing all that can be done without 544 * calling into KVM. This handles marking old pages young (for idle page 545 * tracking), and dirtying of clean pages (for dirty page logging). 546 * 547 * Returns: 0 on success, in which case we can update derived mappings and 548 * resume guest execution. 549 * -EFAULT on failure due to absent GPA mapping or write to 550 * read-only page, in which case KVM must be consulted. 551 */ 552 static int kvm_map_page_fast(struct kvm_vcpu *vcpu, unsigned long gpa, bool write) 553 { 554 int ret = 0; 555 kvm_pte_t *ptep, changed, new; 556 gfn_t gfn = gpa >> PAGE_SHIFT; 557 struct kvm *kvm = vcpu->kvm; 558 struct kvm_memory_slot *slot; 559 560 spin_lock(&kvm->mmu_lock); 561 562 /* Fast path - just check GPA page table for an existing entry */ 563 ptep = kvm_populate_gpa(kvm, NULL, gpa, 0); 564 if (!ptep || !kvm_pte_present(NULL, ptep)) { 565 ret = -EFAULT; 566 goto out; 567 } 568 569 /* Track access to pages marked old */ 570 new = kvm_pte_mkyoung(*ptep); 571 if (write && !kvm_pte_dirty(new)) { 572 if (!kvm_pte_write(new)) { 573 ret = -EFAULT; 574 goto out; 575 } 576 577 if (kvm_pte_huge(new)) { 578 /* 579 * Do not set write permission when dirty logging is 580 * enabled for HugePages 581 */ 582 slot = gfn_to_memslot(kvm, gfn); 583 if (kvm_slot_dirty_track_enabled(slot)) { 584 ret = -EFAULT; 585 goto out; 586 } 587 } 588 589 /* Track dirtying of writeable pages */ 590 new = kvm_pte_mkdirty(new); 591 } 592 593 changed = new ^ (*ptep); 594 if (changed) 595 kvm_set_pte(ptep, new); 596 597 spin_unlock(&kvm->mmu_lock); 598 599 if (kvm_pte_dirty(changed)) 600 mark_page_dirty(kvm, gfn); 601 602 return ret; 603 out: 604 spin_unlock(&kvm->mmu_lock); 605 return ret; 606 } 607 608 static bool fault_supports_huge_mapping(struct kvm_memory_slot *memslot, 609 unsigned long hva, bool write) 610 { 611 hva_t start, end; 612 613 /* Disable dirty logging on HugePages */ 614 if (kvm_slot_dirty_track_enabled(memslot) && write) 615 return false; 616 617 if (kvm_hugepage_capable(memslot)) 618 return true; 619 620 if (kvm_hugepage_incapable(memslot)) 621 return false; 622 623 start = memslot->userspace_addr; 624 end = start + memslot->npages * PAGE_SIZE; 625 626 /* 627 * Next, let's make sure we're not trying to map anything not covered 628 * by the memslot. This means we have to prohibit block size mappings 629 * for the beginning and end of a non-block aligned and non-block sized 630 * memory slot (illustrated by the head and tail parts of the 631 * userspace view above containing pages 'abcde' and 'xyz', 632 * respectively). 633 * 634 * Note that it doesn't matter if we do the check using the 635 * userspace_addr or the base_gfn, as both are equally aligned (per 636 * the check above) and equally sized. 637 */ 638 return (hva >= ALIGN(start, PMD_SIZE)) && (hva < ALIGN_DOWN(end, PMD_SIZE)); 639 } 640 641 /* 642 * Lookup the mapping level for @gfn in the current mm. 643 * 644 * WARNING! Use of host_pfn_mapping_level() requires the caller and the end 645 * consumer to be tied into KVM's handlers for MMU notifier events! 646 * 647 * There are several ways to safely use this helper: 648 * 649 * - Check mmu_invalidate_retry_gfn() after grabbing the mapping level, before 650 * consuming it. In this case, mmu_lock doesn't need to be held during the 651 * lookup, but it does need to be held while checking the MMU notifier. 652 * 653 * - Hold mmu_lock AND ensure there is no in-progress MMU notifier invalidation 654 * event for the hva. This can be done by explicit checking the MMU notifier 655 * or by ensuring that KVM already has a valid mapping that covers the hva. 656 * 657 * - Do not use the result to install new mappings, e.g. use the host mapping 658 * level only to decide whether or not to zap an entry. In this case, it's 659 * not required to hold mmu_lock (though it's highly likely the caller will 660 * want to hold mmu_lock anyways, e.g. to modify SPTEs). 661 * 662 * Note! The lookup can still race with modifications to host page tables, but 663 * the above "rules" ensure KVM will not _consume_ the result of the walk if a 664 * race with the primary MMU occurs. 665 */ 666 static int host_pfn_mapping_level(struct kvm *kvm, gfn_t gfn, 667 const struct kvm_memory_slot *slot) 668 { 669 int level = 0; 670 unsigned long hva; 671 unsigned long flags; 672 pgd_t pgd; 673 p4d_t p4d; 674 pud_t pud; 675 pmd_t pmd; 676 677 /* 678 * Note, using the already-retrieved memslot and __gfn_to_hva_memslot() 679 * is not solely for performance, it's also necessary to avoid the 680 * "writable" check in __gfn_to_hva_many(), which will always fail on 681 * read-only memslots due to gfn_to_hva() assuming writes. Earlier 682 * page fault steps have already verified the guest isn't writing a 683 * read-only memslot. 684 */ 685 hva = __gfn_to_hva_memslot(slot, gfn); 686 687 /* 688 * Disable IRQs to prevent concurrent tear down of host page tables, 689 * e.g. if the primary MMU promotes a P*D to a huge page and then frees 690 * the original page table. 691 */ 692 local_irq_save(flags); 693 694 /* 695 * Read each entry once. As above, a non-leaf entry can be promoted to 696 * a huge page _during_ this walk. Re-reading the entry could send the 697 * walk into the weeks, e.g. p*d_leaf() returns false (sees the old 698 * value) and then p*d_offset() walks into the target huge page instead 699 * of the old page table (sees the new value). 700 */ 701 pgd = pgdp_get(pgd_offset(kvm->mm, hva)); 702 if (pgd_none(pgd)) 703 goto out; 704 705 p4d = p4dp_get(p4d_offset(&pgd, hva)); 706 if (p4d_none(p4d) || !p4d_present(p4d)) 707 goto out; 708 709 pud = pudp_get(pud_offset(&p4d, hva)); 710 if (pud_none(pud) || !pud_present(pud)) 711 goto out; 712 713 pmd = pmdp_get(pmd_offset(&pud, hva)); 714 if (pmd_none(pmd) || !pmd_present(pmd)) 715 goto out; 716 717 if (kvm_pte_huge(pmd_val(pmd))) 718 level = 1; 719 720 out: 721 local_irq_restore(flags); 722 return level; 723 } 724 725 /* 726 * Split huge page 727 */ 728 static kvm_pte_t *kvm_split_huge(struct kvm_vcpu *vcpu, kvm_pte_t *ptep, gfn_t gfn) 729 { 730 int i; 731 kvm_pte_t val, *child; 732 struct kvm *kvm = vcpu->kvm; 733 struct kvm_mmu_memory_cache *memcache; 734 735 memcache = &vcpu->arch.mmu_page_cache; 736 child = kvm_mmu_memory_cache_alloc(memcache); 737 val = kvm_pte_mksmall(*ptep); 738 for (i = 0; i < PTRS_PER_PTE; i++) { 739 kvm_set_pte(child + i, val); 740 val += PAGE_SIZE; 741 } 742 743 smp_wmb(); /* Make pte visible before pmd */ 744 /* The later kvm_flush_tlb_gpa() will flush hugepage tlb */ 745 kvm_set_pte(ptep, __pa(child)); 746 747 kvm->stat.hugepages--; 748 kvm->stat.pages += PTRS_PER_PTE; 749 750 return child + (gfn & (PTRS_PER_PTE - 1)); 751 } 752 753 /* 754 * kvm_map_page() - Map a guest physical page. 755 * @vcpu: vCPU pointer. 756 * @gpa: Guest physical address of fault. 757 * @write: Whether the fault was due to a write. 758 * 759 * Handle GPA faults by creating a new GPA mapping (or updating an existing 760 * one). 761 * 762 * This takes care of marking pages young or dirty (idle/dirty page tracking), 763 * asking KVM for the corresponding PFN, and creating a mapping in the GPA page 764 * tables. Derived mappings (GVA page tables and TLBs) must be handled by the 765 * caller. 766 * 767 * Returns: 0 on success 768 * -EFAULT if there is no memory region at @gpa or a write was 769 * attempted to a read-only memory region. This is usually handled 770 * as an MMIO access. 771 */ 772 static int kvm_map_page(struct kvm_vcpu *vcpu, unsigned long gpa, bool write) 773 { 774 bool writeable; 775 int srcu_idx, err, retry_no = 0, level; 776 unsigned long hva, mmu_seq, prot_bits; 777 kvm_pfn_t pfn; 778 kvm_pte_t *ptep, new_pte; 779 gfn_t gfn = gpa >> PAGE_SHIFT; 780 struct kvm *kvm = vcpu->kvm; 781 struct kvm_memory_slot *memslot; 782 struct kvm_mmu_memory_cache *memcache = &vcpu->arch.mmu_page_cache; 783 struct page *page; 784 785 /* Try the fast path to handle old / clean pages */ 786 srcu_idx = srcu_read_lock(&kvm->srcu); 787 err = kvm_map_page_fast(vcpu, gpa, write); 788 if (!err) 789 goto out; 790 791 memslot = gfn_to_memslot(kvm, gfn); 792 hva = gfn_to_hva_memslot_prot(memslot, gfn, &writeable); 793 if (kvm_is_error_hva(hva) || (write && !writeable)) { 794 err = -EFAULT; 795 goto out; 796 } 797 798 /* We need a minimum of cached pages ready for page table creation */ 799 err = kvm_mmu_topup_memory_cache(memcache, KVM_MMU_CACHE_MIN_PAGES); 800 if (err) 801 goto out; 802 803 retry: 804 /* 805 * Used to check for invalidations in progress, of the pfn that is 806 * returned by pfn_to_pfn_prot below. 807 */ 808 mmu_seq = kvm->mmu_invalidate_seq; 809 /* 810 * Ensure the read of mmu_invalidate_seq isn't reordered with PTE reads in 811 * kvm_faultin_pfn() (which calls get_user_pages()), so that we don't 812 * risk the page we get a reference to getting unmapped before we have a 813 * chance to grab the mmu_lock without mmu_invalidate_retry() noticing. 814 * 815 * This smp_rmb() pairs with the effective smp_wmb() of the combination 816 * of the pte_unmap_unlock() after the PTE is zapped, and the 817 * spin_lock() in kvm_mmu_invalidate_invalidate_<page|range_end>() before 818 * mmu_invalidate_seq is incremented. 819 */ 820 smp_rmb(); 821 822 /* Slow path - ask KVM core whether we can access this GPA */ 823 pfn = kvm_faultin_pfn(vcpu, gfn, write, &writeable, &page); 824 if (is_error_noslot_pfn(pfn)) { 825 err = -EFAULT; 826 goto out; 827 } 828 829 /* Check if an invalidation has taken place since we got pfn */ 830 spin_lock(&kvm->mmu_lock); 831 if (mmu_invalidate_retry_gfn(kvm, mmu_seq, gfn)) { 832 /* 833 * This can happen when mappings are changed asynchronously, but 834 * also synchronously if a COW is triggered by 835 * kvm_faultin_pfn(). 836 */ 837 spin_unlock(&kvm->mmu_lock); 838 kvm_release_page_unused(page); 839 if (retry_no > 100) { 840 retry_no = 0; 841 schedule(); 842 } 843 retry_no++; 844 goto retry; 845 } 846 847 /* 848 * For emulated devices such virtio device, actual cache attribute is 849 * determined by physical machine. 850 * For pass through physical device, it should be uncachable 851 */ 852 prot_bits = _PAGE_PRESENT | __READABLE; 853 if (pfn_valid(pfn)) 854 prot_bits |= _CACHE_CC; 855 else 856 prot_bits |= _CACHE_SUC; 857 858 if (writeable) { 859 prot_bits |= _PAGE_WRITE; 860 if (write) 861 prot_bits |= __WRITEABLE; 862 } 863 864 /* Disable dirty logging on HugePages */ 865 level = 0; 866 if (fault_supports_huge_mapping(memslot, hva, write)) { 867 /* Check page level about host mmu*/ 868 level = host_pfn_mapping_level(kvm, gfn, memslot); 869 if (level == 1) { 870 /* 871 * Check page level about secondary mmu 872 * Disable hugepage if it is normal page on 873 * secondary mmu already 874 */ 875 ptep = kvm_populate_gpa(kvm, NULL, gpa, 0); 876 if (ptep && !kvm_pte_huge(*ptep)) 877 level = 0; 878 } 879 880 if (level == 1) { 881 gfn = gfn & ~(PTRS_PER_PTE - 1); 882 pfn = pfn & ~(PTRS_PER_PTE - 1); 883 } 884 } 885 886 /* Ensure page tables are allocated */ 887 ptep = kvm_populate_gpa(kvm, memcache, gpa, level); 888 new_pte = kvm_pfn_pte(pfn, __pgprot(prot_bits)); 889 if (level == 1) { 890 new_pte = kvm_pte_mkhuge(new_pte); 891 /* 892 * previous pmd entry is invalid_pte_table 893 * there is invalid tlb with small page 894 * need flush these invalid tlbs for current vcpu 895 */ 896 kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu); 897 ++kvm->stat.hugepages; 898 } else if (kvm_pte_huge(*ptep) && write) 899 ptep = kvm_split_huge(vcpu, ptep, gfn); 900 else 901 ++kvm->stat.pages; 902 kvm_set_pte(ptep, new_pte); 903 904 kvm_release_faultin_page(kvm, page, false, writeable); 905 spin_unlock(&kvm->mmu_lock); 906 907 if (prot_bits & _PAGE_DIRTY) 908 mark_page_dirty_in_slot(kvm, memslot, gfn); 909 910 out: 911 srcu_read_unlock(&kvm->srcu, srcu_idx); 912 return err; 913 } 914 915 int kvm_handle_mm_fault(struct kvm_vcpu *vcpu, unsigned long gpa, bool write) 916 { 917 int ret; 918 919 ret = kvm_map_page(vcpu, gpa, write); 920 if (ret) 921 return ret; 922 923 /* Invalidate this entry in the TLB */ 924 vcpu->arch.flush_gpa = gpa; 925 kvm_make_request(KVM_REQ_TLB_FLUSH_GPA, vcpu); 926 927 return 0; 928 } 929 930 void kvm_arch_sync_dirty_log(struct kvm *kvm, struct kvm_memory_slot *memslot) 931 { 932 } 933 934 void kvm_arch_flush_remote_tlbs_memslot(struct kvm *kvm, 935 const struct kvm_memory_slot *memslot) 936 { 937 kvm_flush_remote_tlbs(kvm); 938 } 939