/*
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License, version 2, as
 * published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
 *
 * Copyright 2010 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com>
 */

#include <linux/types.h>
#include <linux/string.h>
#include <linux/kvm.h>
#include <linux/kvm_host.h>
#include <linux/highmem.h>
#include <linux/gfp.h>
#include <linux/slab.h>
#include <linux/hugetlb.h>
#include <linux/vmalloc.h>

#include <asm/tlbflush.h>
#include <asm/kvm_ppc.h>
#include <asm/kvm_book3s.h>
#include <asm/mmu-hash64.h>
#include <asm/hvcall.h>
#include <asm/synch.h>
#include <asm/ppc-opcode.h>
#include <asm/cputable.h>

/* POWER7 has 10-bit LPIDs, PPC970 has 6-bit LPIDs */
#define MAX_LPID_970	63

/* The Power architecture requires the HPT to be at least 256 kB */
#define PPC_MIN_HPT_ORDER	18

long kvmppc_alloc_hpt(struct kvm *kvm, u32 *htab_orderp)
{
	unsigned long hpt;
	struct revmap_entry *rev;
	struct kvmppc_linear_info *li;
	long order = kvm_hpt_order;

	if (htab_orderp) {
		order = *htab_orderp;
		if (order < PPC_MIN_HPT_ORDER)
			order = PPC_MIN_HPT_ORDER;
	}

	/*
	 * If the user wants a different size from the default,
	 * first try to allocate it from the kernel page allocator.
	 */
	hpt = 0;
	if (order != kvm_hpt_order) {
		hpt = __get_free_pages(GFP_KERNEL|__GFP_ZERO|__GFP_REPEAT|
				       __GFP_NOWARN, order - PAGE_SHIFT);
		if (!hpt)
			--order;
	}

	/* Next try to allocate from the preallocated pool */
	if (!hpt) {
		li = kvm_alloc_hpt();
		if (li) {
			hpt = (ulong)li->base_virt;
			kvm->arch.hpt_li = li;
			order = kvm_hpt_order;
		}
	}

	/* Lastly try successively smaller sizes from the page allocator */
	while (!hpt && order > PPC_MIN_HPT_ORDER) {
		hpt = __get_free_pages(GFP_KERNEL|__GFP_ZERO|__GFP_REPEAT|
				       __GFP_NOWARN, order - PAGE_SHIFT);
		if (!hpt)
			--order;
	}

	if (!hpt)
		return -ENOMEM;

	kvm->arch.hpt_virt = hpt;
	kvm->arch.hpt_order = order;
	/* HPTEs are 2**4 bytes long */
	kvm->arch.hpt_npte = 1ul << (order - 4);
	/* 128 (2**7) bytes in each HPTEG */
	kvm->arch.hpt_mask = (1ul << (order - 7)) - 1;
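
	/*
	 * For example, the minimum order of 18 gives a 256 kB HPT:
	 * 2^(18-4) = 16384 HPTEs and 2^(18-7) = 2048 HPTEGs, so
	 * hpt_mask is 0x7ff.  (order - 18) is also the size encoding
	 * stored in kvm->arch.sdr1 below.
	 */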

	/* Allocate reverse map array */
	rev = vmalloc(sizeof(struct revmap_entry) * kvm->arch.hpt_npte);
	if (!rev) {
		pr_err("kvmppc_alloc_hpt: Couldn't alloc reverse map array\n");
		goto out_freehpt;
	}
	kvm->arch.revmap = rev;
	kvm->arch.sdr1 = __pa(hpt) | (order - 18);

	pr_info("KVM guest htab at %lx (order %ld), LPID %x\n",
		hpt, order, kvm->arch.lpid);

	if (htab_orderp)
		*htab_orderp = order;
	return 0;

 out_freehpt:
	if (kvm->arch.hpt_li)
		kvm_release_hpt(kvm->arch.hpt_li);
	else
		free_pages(hpt, order - PAGE_SHIFT);
	return -ENOMEM;
}

long kvmppc_alloc_reset_hpt(struct kvm *kvm, u32 *htab_orderp)
{
	long err = -EBUSY;
	long order;

	mutex_lock(&kvm->lock);
	if (kvm->arch.rma_setup_done) {
		kvm->arch.rma_setup_done = 0;
		/* order rma_setup_done vs. vcpus_running */
		smp_mb();
		if (atomic_read(&kvm->arch.vcpus_running)) {
			kvm->arch.rma_setup_done = 1;
			goto out;
		}
	}
	if (kvm->arch.hpt_virt) {
		order = kvm->arch.hpt_order;
		/* Set the entire HPT to 0, i.e. invalid HPTEs */
		memset((void *)kvm->arch.hpt_virt, 0, 1ul << order);
		/*
		 * Set the whole last_vcpu array to an invalid vcpu number.
		 * This ensures that each vcpu will flush its TLB on next entry.
		 */
		memset(kvm->arch.last_vcpu, 0xff, sizeof(kvm->arch.last_vcpu));
		*htab_orderp = order;
		err = 0;
	} else {
		err = kvmppc_alloc_hpt(kvm, htab_orderp);
		order = *htab_orderp;
	}
 out:
	mutex_unlock(&kvm->lock);
	return err;
}

void kvmppc_free_hpt(struct kvm *kvm)
{
	kvmppc_free_lpid(kvm->arch.lpid);
	vfree(kvm->arch.revmap);
	if (kvm->arch.hpt_li)
		kvm_release_hpt(kvm->arch.hpt_li);
	else
		free_pages(kvm->arch.hpt_virt,
			   kvm->arch.hpt_order - PAGE_SHIFT);
}

/* Bits in first HPTE dword for pagesize 4k, 64k or 16M */
static inline unsigned long hpte0_pgsize_encoding(unsigned long pgsize)
{
	return (pgsize > 0x1000) ? HPTE_V_LARGE : 0;
}

/* Bits in second HPTE dword for pagesize 4k, 64k or 16M */
static inline unsigned long hpte1_pgsize_encoding(unsigned long pgsize)
{
	return (pgsize == 0x10000) ? 0x1000 : 0;
}
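
/*
 * kvmppc_map_vrma() pre-loads the HPT with bolted entries mapping the
 * guest's virtual real mode area (VRMA).  At most one HPTE per HPTEG
 * is created, so this assumes the HPT is empty and no vcpus are using
 * it yet.
 */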

void kvmppc_map_vrma(struct kvm_vcpu *vcpu, struct kvm_memory_slot *memslot,
		     unsigned long porder)
{
	unsigned long i;
	unsigned long npages;
	unsigned long hp_v, hp_r;
	unsigned long addr, hash;
	unsigned long psize;
	unsigned long hp0, hp1;
	long ret;
	struct kvm *kvm = vcpu->kvm;

	psize = 1ul << porder;
	npages = memslot->npages >> (porder - PAGE_SHIFT);

	/* VRMA can't be > 1TB */
	if (npages > 1ul << (40 - porder))
		npages = 1ul << (40 - porder);
	/* Can't use more than 1 HPTE per HPTEG */
	if (npages > kvm->arch.hpt_mask + 1)
		npages = kvm->arch.hpt_mask + 1;

	hp0 = HPTE_V_1TB_SEG | (VRMA_VSID << (40 - 16)) |
		HPTE_V_BOLTED | hpte0_pgsize_encoding(psize);
	hp1 = hpte1_pgsize_encoding(psize) |
		HPTE_R_R | HPTE_R_C | HPTE_R_M | PP_RWXX;

	for (i = 0; i < npages; ++i) {
		addr = i << porder;
		/* can't use hpt_hash since va > 64 bits */
		hash = (i ^ (VRMA_VSID ^ (VRMA_VSID << 25))) & kvm->arch.hpt_mask;
		/*
		 * We assume that the hash table is empty and no
		 * vcpus are using it at this stage.  Since we create
		 * at most one HPTE per HPTEG, we just assume entry 7
		 * is available and use it.
		 */
		hash = (hash << 3) + 7;
		hp_v = hp0 | ((addr >> 16) & ~0x7fUL);
		hp_r = hp1 | addr;
		ret = kvmppc_virtmode_h_enter(vcpu, H_EXACT, hash, hp_v, hp_r);
		if (ret != H_SUCCESS) {
			pr_err("KVM: map_vrma at %lx failed, ret=%ld\n",
			       addr, ret);
			break;
		}
	}
}

int kvmppc_mmu_hv_init(void)
{
	unsigned long host_lpid, rsvd_lpid;

	if (!cpu_has_feature(CPU_FTR_HVMODE))
		return -EINVAL;

	/* POWER7 has 10-bit LPIDs, PPC970 and e500mc have 6-bit LPIDs */
	if (cpu_has_feature(CPU_FTR_ARCH_206)) {
		host_lpid = mfspr(SPRN_LPID);	/* POWER7 */
		rsvd_lpid = LPID_RSVD;
	} else {
		host_lpid = 0;			/* PPC970 */
		rsvd_lpid = MAX_LPID_970;
	}

	kvmppc_init_lpid(rsvd_lpid + 1);

	kvmppc_claim_lpid(host_lpid);
	/* rsvd_lpid is reserved for use in partition switching */
	kvmppc_claim_lpid(rsvd_lpid);

	return 0;
}

void kvmppc_mmu_destroy(struct kvm_vcpu *vcpu)
{
}

static void kvmppc_mmu_book3s_64_hv_reset_msr(struct kvm_vcpu *vcpu)
{
	kvmppc_set_msr(vcpu, MSR_SF | MSR_ME);
}
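
/*
 * When MMU notifiers are not in use, guest-real-to-host mappings are
 * tracked in the kvm->arch.slot_phys[] arrays.  Each entry packs the
 * host real address of the page in its upper bits and, in the low
 * PAGE_SHIFT bits, the page order, the cache/IO bits and a flag saying
 * whether we hold a page reference (KVMPPC_GOT_PAGE), as set up below.
 */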

/*
 * This is called to get a reference to a guest page if there isn't
 * one already in the kvm->arch.slot_phys[][] arrays.
 */
static long kvmppc_get_guest_page(struct kvm *kvm, unsigned long gfn,
				  struct kvm_memory_slot *memslot,
				  unsigned long psize)
{
	unsigned long start;
	long np, err;
	struct page *page, *hpage, *pages[1];
	unsigned long s, pgsize;
	unsigned long *physp;
	unsigned int is_io, got, pgorder;
	struct vm_area_struct *vma;
	unsigned long pfn, i, npages;

	physp = kvm->arch.slot_phys[memslot->id];
	if (!physp)
		return -EINVAL;
	if (physp[gfn - memslot->base_gfn])
		return 0;

	is_io = 0;
	got = 0;
	page = NULL;
	pgsize = psize;
	err = -EINVAL;
	start = gfn_to_hva_memslot(memslot, gfn);

	/* Instantiate and get the page we want access to */
	np = get_user_pages_fast(start, 1, 1, pages);
	if (np != 1) {
		/* Look up the vma for the page */
		down_read(&current->mm->mmap_sem);
		vma = find_vma(current->mm, start);
		if (!vma || vma->vm_start > start ||
		    start + psize > vma->vm_end ||
		    !(vma->vm_flags & VM_PFNMAP))
			goto up_err;
		is_io = hpte_cache_bits(pgprot_val(vma->vm_page_prot));
		pfn = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT);
		/* check alignment of pfn vs. requested page size */
		if (psize > PAGE_SIZE && (pfn & ((psize >> PAGE_SHIFT) - 1)))
			goto up_err;
		up_read(&current->mm->mmap_sem);

	} else {
		page = pages[0];
		got = KVMPPC_GOT_PAGE;

		/* See if this is a large page */
		s = PAGE_SIZE;
		if (PageHuge(page)) {
			hpage = compound_head(page);
			s <<= compound_order(hpage);
			/* Get the whole large page if slot alignment is ok */
			if (s > psize && slot_is_aligned(memslot, s) &&
			    !(memslot->userspace_addr & (s - 1))) {
				start &= ~(s - 1);
				pgsize = s;
				get_page(hpage);
				put_page(page);
				page = hpage;
			}
		}
		if (s < psize)
			goto out;
		pfn = page_to_pfn(page);
	}

	npages = pgsize >> PAGE_SHIFT;
	pgorder = __ilog2(npages);
	physp += (gfn - memslot->base_gfn) & ~(npages - 1);
	spin_lock(&kvm->arch.slot_phys_lock);
	for (i = 0; i < npages; ++i) {
		if (!physp[i]) {
			physp[i] = ((pfn + i) << PAGE_SHIFT) +
				got + is_io + pgorder;
			got = 0;
		}
	}
	spin_unlock(&kvm->arch.slot_phys_lock);
	err = 0;

 out:
	if (got)
		put_page(page);
	return err;

 up_err:
	up_read(&current->mm->mmap_sem);
	return err;
}
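
/*
 * In kvmppc_virtmode_h_enter() below, pteh and ptel are the first (V)
 * and second (R) doublewords of the HPTE the guest is asking to insert;
 * hpte_page_size() decodes the page size from them and the RPN field of
 * ptel gives the guest real address being mapped.
 */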

/*
 * We come here on an H_ENTER call from the guest when we are not
 * using mmu notifiers and we don't have the requested page pinned
 * already.
 */
long kvmppc_virtmode_h_enter(struct kvm_vcpu *vcpu, unsigned long flags,
			     long pte_index, unsigned long pteh, unsigned long ptel)
{
	struct kvm *kvm = vcpu->kvm;
	unsigned long psize, gpa, gfn;
	struct kvm_memory_slot *memslot;
	long ret;

	if (kvm->arch.using_mmu_notifiers)
		goto do_insert;

	psize = hpte_page_size(pteh, ptel);
	if (!psize)
		return H_PARAMETER;

	pteh &= ~(HPTE_V_HVLOCK | HPTE_V_ABSENT | HPTE_V_VALID);

	/* Find the memslot (if any) for this address */
	gpa = (ptel & HPTE_R_RPN) & ~(psize - 1);
	gfn = gpa >> PAGE_SHIFT;
	memslot = gfn_to_memslot(kvm, gfn);
	if (memslot && !(memslot->flags & KVM_MEMSLOT_INVALID)) {
		if (!slot_is_aligned(memslot, psize))
			return H_PARAMETER;
		if (kvmppc_get_guest_page(kvm, gfn, memslot, psize) < 0)
			return H_PARAMETER;
	}

 do_insert:
	/* Protect linux PTE lookup from page table destruction */
	rcu_read_lock_sched();	/* this disables preemption too */
	vcpu->arch.pgdir = current->mm->pgd;
	ret = kvmppc_h_enter(vcpu, flags, pte_index, pteh, ptel);
	rcu_read_unlock_sched();
	if (ret == H_TOO_HARD) {
		/* this can't happen */
		pr_err("KVM: Oops, kvmppc_h_enter returned too hard!\n");
		ret = H_RESOURCE;	/* or something */
	}
	return ret;

}

static struct kvmppc_slb *kvmppc_mmu_book3s_hv_find_slbe(struct kvm_vcpu *vcpu,
							 gva_t eaddr)
{
	u64 mask;
	int i;

	for (i = 0; i < vcpu->arch.slb_nr; i++) {
		if (!(vcpu->arch.slb[i].orige & SLB_ESID_V))
			continue;

		if (vcpu->arch.slb[i].origv & SLB_VSID_B_1T)
			mask = ESID_MASK_1T;
		else
			mask = ESID_MASK;

		if (((vcpu->arch.slb[i].orige ^ eaddr) & mask) == 0)
			return &vcpu->arch.slb[i];
	}
	return NULL;
}

static unsigned long kvmppc_mmu_get_real_addr(unsigned long v, unsigned long r,
			unsigned long ea)
{
	unsigned long ra_mask;

	ra_mask = hpte_page_size(v, r) - 1;
	return (r & HPTE_R_RPN & ~ra_mask) | (ea & ra_mask);
}
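
/*
 * kvmppc_mmu_book3s_64_hv_xlate() is the mmu->xlate hook: translate a
 * guest effective address to a guest real address and access
 * permissions by finding the guest SLB entry (or using the VRMA
 * segment for real-mode accesses) and looking the address up in the
 * guest HPT.
 */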

static int kvmppc_mmu_book3s_64_hv_xlate(struct kvm_vcpu *vcpu, gva_t eaddr,
			struct kvmppc_pte *gpte, bool data)
{
	struct kvm *kvm = vcpu->kvm;
	struct kvmppc_slb *slbe;
	unsigned long slb_v;
	unsigned long pp, key;
	unsigned long v, gr;
	unsigned long *hptep;
	int index;
	int virtmode = vcpu->arch.shregs.msr & (data ? MSR_DR : MSR_IR);

	/* Get SLB entry */
	if (virtmode) {
		slbe = kvmppc_mmu_book3s_hv_find_slbe(vcpu, eaddr);
		if (!slbe)
			return -EINVAL;
		slb_v = slbe->origv;
	} else {
		/* real mode access */
		slb_v = vcpu->kvm->arch.vrma_slb_v;
	}

	/* Find the HPTE in the hash table */
	index = kvmppc_hv_find_lock_hpte(kvm, eaddr, slb_v,
					 HPTE_V_VALID | HPTE_V_ABSENT);
	if (index < 0)
		return -ENOENT;
	hptep = (unsigned long *)(kvm->arch.hpt_virt + (index << 4));
	v = hptep[0] & ~HPTE_V_HVLOCK;
	gr = kvm->arch.revmap[index].guest_rpte;

	/* Unlock the HPTE */
	asm volatile("lwsync" : : : "memory");
	hptep[0] = v;

	gpte->eaddr = eaddr;
	gpte->vpage = ((v & HPTE_V_AVPN) << 4) | ((eaddr >> 12) & 0xfff);

	/* Get PP bits and key for permission check */
	pp = gr & (HPTE_R_PP0 | HPTE_R_PP);
	key = (vcpu->arch.shregs.msr & MSR_PR) ? SLB_VSID_KP : SLB_VSID_KS;
	key &= slb_v;

	/* Calculate permissions */
	gpte->may_read = hpte_read_permission(pp, key);
	gpte->may_write = hpte_write_permission(pp, key);
	gpte->may_execute = gpte->may_read && !(gr & (HPTE_R_N | HPTE_R_G));

	/* Storage key permission check for POWER7 */
	if (data && virtmode && cpu_has_feature(CPU_FTR_ARCH_206)) {
		int amrfield = hpte_get_skey_perm(gr, vcpu->arch.amr);
		if (amrfield & 1)
			gpte->may_read = 0;
		if (amrfield & 2)
			gpte->may_write = 0;
	}

	/* Get the guest physical address */
	gpte->raddr = kvmppc_mmu_get_real_addr(v, gr, eaddr);
	return 0;
}

/*
 * Quick test for whether an instruction is a load or a store.
 * If the instruction is a load or a store, then this will indicate
 * which it is, at least on server processors.  (Embedded processors
 * have some external PID instructions that don't follow the rule
 * embodied here.)  If the instruction isn't a load or store, then
 * this doesn't return anything useful.
 */
static int instruction_is_store(unsigned int instr)
{
	unsigned int mask;

	mask = 0x10000000;
	if ((instr & 0xfc000000) == 0x7c000000)
		mask = 0x100;		/* major opcode 31 */
	return (instr & mask) != 0;
}
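
/*
 * Example encodings for the test above: stw (primary opcode 36) has the
 * 0x10000000 bit set while lwz (primary opcode 32) does not; for X-form
 * loads and stores under primary opcode 31, stwx (extended opcode 151)
 * has the 0x100 bit set while lwzx (extended opcode 23) does not.
 */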

static int kvmppc_hv_emulate_mmio(struct kvm_run *run, struct kvm_vcpu *vcpu,
				  unsigned long gpa, gva_t ea, int is_store)
{
	int ret;
	u32 last_inst;
	unsigned long srr0 = kvmppc_get_pc(vcpu);

	/* We try to load the last instruction.  We don't let
	 * emulate_instruction do it as it doesn't check what
	 * kvmppc_ld returns.
	 * If we fail, we just return to the guest and try executing it again.
	 */
	if (vcpu->arch.last_inst == KVM_INST_FETCH_FAILED) {
		ret = kvmppc_ld(vcpu, &srr0, sizeof(u32), &last_inst, false);
		if (ret != EMULATE_DONE || last_inst == KVM_INST_FETCH_FAILED)
			return RESUME_GUEST;
		vcpu->arch.last_inst = last_inst;
	}

	/*
	 * WARNING: We do not know for sure whether the instruction we just
	 * read from memory is the same that caused the fault in the first
	 * place.  If the instruction we read is neither a load nor a store,
	 * then it can't access memory, so we don't need to worry about
	 * enforcing access permissions.  So, assuming it is a load or
	 * store, we just check that its direction (load or store) is
	 * consistent with the original fault, since that's what we
	 * checked the access permissions against.  If there is a mismatch
	 * we just return and retry the instruction.
	 */

	if (instruction_is_store(vcpu->arch.last_inst) != !!is_store)
		return RESUME_GUEST;

	/*
	 * Emulated accesses are emulated by looking at the hash for
	 * translation once, then performing the access later.  The
	 * translation could be invalidated in the meantime, at which
	 * point performing the subsequent memory access on the old
	 * physical address could possibly be a security hole for the
	 * guest (but not the host).
	 *
	 * This is less of an issue for MMIO stores since they aren't
	 * globally visible.  It could be an issue for MMIO loads to
	 * a certain extent but we'll ignore it for now.
	 */

	vcpu->arch.paddr_accessed = gpa;
	vcpu->arch.vaddr_accessed = ea;
	return kvmppc_emulate_mmio(run, vcpu);
}
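
/*
 * Handle a guest page fault for which the real-mode code has already
 * located the HPTE (saved in vcpu->arch.pgfault_*): re-validate the
 * entry, resolve the backing host page (or hand the access over to
 * MMIO emulation if there is no memslot), and update the HPTE to
 * point at the new host page.  Retryable races simply return
 * RESUME_GUEST so the guest re-executes the instruction.
 */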

int kvmppc_book3s_hv_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu,
				unsigned long ea, unsigned long dsisr)
{
	struct kvm *kvm = vcpu->kvm;
	unsigned long *hptep, hpte[3], r;
	unsigned long mmu_seq, psize, pte_size;
	unsigned long gfn, hva, pfn;
	struct kvm_memory_slot *memslot;
	unsigned long *rmap;
	struct revmap_entry *rev;
	struct page *page, *pages[1];
	long index, ret, npages;
	unsigned long is_io;
	unsigned int writing, write_ok;
	struct vm_area_struct *vma;
	unsigned long rcbits;

	/*
	 * Real-mode code has already searched the HPT and found the
	 * entry we're interested in.  Lock the entry and check that
	 * it hasn't changed.  If it has, just return and re-execute the
	 * instruction.
	 */
	if (ea != vcpu->arch.pgfault_addr)
		return RESUME_GUEST;
	index = vcpu->arch.pgfault_index;
	hptep = (unsigned long *)(kvm->arch.hpt_virt + (index << 4));
	rev = &kvm->arch.revmap[index];
	preempt_disable();
	while (!try_lock_hpte(hptep, HPTE_V_HVLOCK))
		cpu_relax();
	hpte[0] = hptep[0] & ~HPTE_V_HVLOCK;
	hpte[1] = hptep[1];
	hpte[2] = r = rev->guest_rpte;
	asm volatile("lwsync" : : : "memory");
	hptep[0] = hpte[0];
	preempt_enable();

	if (hpte[0] != vcpu->arch.pgfault_hpte[0] ||
	    hpte[1] != vcpu->arch.pgfault_hpte[1])
		return RESUME_GUEST;

	/* Translate the logical address and get the page */
	psize = hpte_page_size(hpte[0], r);
	gfn = hpte_rpn(r, psize);
	memslot = gfn_to_memslot(kvm, gfn);

	/* No memslot means it's an emulated MMIO region */
	if (!memslot || (memslot->flags & KVM_MEMSLOT_INVALID)) {
		unsigned long gpa = (gfn << PAGE_SHIFT) | (ea & (psize - 1));
		return kvmppc_hv_emulate_mmio(run, vcpu, gpa, ea,
					      dsisr & DSISR_ISSTORE);
	}

	if (!kvm->arch.using_mmu_notifiers)
		return -EFAULT;		/* should never get here */

	/* used to check for invalidations in progress */
	mmu_seq = kvm->mmu_notifier_seq;
	smp_rmb();

	is_io = 0;
	pfn = 0;
	page = NULL;
	pte_size = PAGE_SIZE;
	writing = (dsisr & DSISR_ISSTORE) != 0;
	/* If writing != 0, then the HPTE must allow writing, if we get here */
	write_ok = writing;
	hva = gfn_to_hva_memslot(memslot, gfn);
	npages = get_user_pages_fast(hva, 1, writing, pages);
	if (npages < 1) {
		/* Check if it's an I/O mapping */
		down_read(&current->mm->mmap_sem);
		vma = find_vma(current->mm, hva);
		if (vma && vma->vm_start <= hva && hva + psize <= vma->vm_end &&
		    (vma->vm_flags & VM_PFNMAP)) {
			pfn = vma->vm_pgoff +
				((hva - vma->vm_start) >> PAGE_SHIFT);
			pte_size = psize;
			is_io = hpte_cache_bits(pgprot_val(vma->vm_page_prot));
			write_ok = vma->vm_flags & VM_WRITE;
		}
		up_read(&current->mm->mmap_sem);
		if (!pfn)
			return -EFAULT;
	} else {
		page = pages[0];
		if (PageHuge(page)) {
			page = compound_head(page);
			pte_size <<= compound_order(page);
		}
		/* if the guest wants write access, see if that is OK */
		if (!writing && hpte_is_writable(r)) {
			pte_t *ptep, pte;

			/*
			 * We need to protect against page table destruction
			 * while looking up and updating the pte.
			 */
			rcu_read_lock_sched();
			ptep = find_linux_pte_or_hugepte(current->mm->pgd,
							 hva, NULL);
			if (ptep && pte_present(*ptep)) {
				pte = kvmppc_read_update_linux_pte(ptep, 1);
				if (pte_write(pte))
					write_ok = 1;
			}
			rcu_read_unlock_sched();
		}
		pfn = page_to_pfn(page);
	}

	ret = -EFAULT;
	if (psize > pte_size)
		goto out_put;

	/* Check WIMG vs. the actual page we're accessing */
	if (!hpte_cache_flags_ok(r, is_io)) {
		if (is_io)
			return -EFAULT;
		/*
		 * Allow guest to map emulated device memory as
		 * uncacheable, but actually make it cacheable.
		 */
		r = (r & ~(HPTE_R_W|HPTE_R_I|HPTE_R_G)) | HPTE_R_M;
	}

	/* Set the HPTE to point to pfn */
	r = (r & ~(HPTE_R_PP0 - pte_size)) | (pfn << PAGE_SHIFT);
	if (hpte_is_writable(r) && !write_ok)
		r = hpte_make_readonly(r);
	ret = RESUME_GUEST;
	preempt_disable();
	while (!try_lock_hpte(hptep, HPTE_V_HVLOCK))
		cpu_relax();
	if ((hptep[0] & ~HPTE_V_HVLOCK) != hpte[0] || hptep[1] != hpte[1] ||
	    rev->guest_rpte != hpte[2])
		/* HPTE has been changed under us; let the guest retry */
		goto out_unlock;
	hpte[0] = (hpte[0] & ~HPTE_V_ABSENT) | HPTE_V_VALID;

	rmap = &memslot->arch.rmap[gfn - memslot->base_gfn];
	lock_rmap(rmap);

	/* Check if we might have been invalidated; let the guest retry if so */
	ret = RESUME_GUEST;
	if (mmu_notifier_retry(vcpu, mmu_seq)) {
		unlock_rmap(rmap);
		goto out_unlock;
	}

	/* Only set R/C in real HPTE if set in both *rmap and guest_rpte */
	rcbits = *rmap >> KVMPPC_RMAP_RC_SHIFT;
	r &= rcbits | ~(HPTE_R_R | HPTE_R_C);

	if (hptep[0] & HPTE_V_VALID) {
		/* HPTE was previously valid, so we need to invalidate it */
		unlock_rmap(rmap);
		hptep[0] |= HPTE_V_ABSENT;
		kvmppc_invalidate_hpte(kvm, hptep, index);
		/* don't lose previous R and C bits */
		r |= hptep[1] & (HPTE_R_R | HPTE_R_C);
	} else {
		kvmppc_add_revmap_chain(kvm, rev, rmap, index, 0);
	}

	hptep[1] = r;
	eieio();
	hptep[0] = hpte[0];
	asm volatile("ptesync" : : : "memory");
	preempt_enable();
	if (page && hpte_is_writable(r))
		SetPageDirty(page);

 out_put:
	if (page) {
		/*
		 * We drop pages[0] here, not page because page might
		 * have been set to the head page of a compound, but
		 * we have to drop the reference on the correct tail
		 * page to match the get inside gup()
		 */
		put_page(pages[0]);
	}
	return ret;

 out_unlock:
	hptep[0] &= ~HPTE_V_HVLOCK;
	preempt_enable();
	goto out_put;
}
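
/*
 * The MMU notifier callbacks below all work by walking the per-gfn
 * rmap entries.  Each rmap word holds flag bits (PRESENT, REFERENCED,
 * CHANGED, plus saved R/C bits) and the index of one HPTE that maps
 * the page; any further HPTEs for the same page are linked into a
 * circular chain through the forw/back fields of the revmap entries.
 */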

static int kvm_handle_hva_range(struct kvm *kvm,
				unsigned long start,
				unsigned long end,
				int (*handler)(struct kvm *kvm,
					       unsigned long *rmapp,
					       unsigned long gfn))
{
	int ret;
	int retval = 0;
	struct kvm_memslots *slots;
	struct kvm_memory_slot *memslot;

	slots = kvm_memslots(kvm);
	kvm_for_each_memslot(memslot, slots) {
		unsigned long hva_start, hva_end;
		gfn_t gfn, gfn_end;

		hva_start = max(start, memslot->userspace_addr);
		hva_end = min(end, memslot->userspace_addr +
					(memslot->npages << PAGE_SHIFT));
		if (hva_start >= hva_end)
			continue;
		/*
		 * {gfn(page) | page intersects with [hva_start, hva_end)} =
		 * {gfn, gfn+1, ..., gfn_end-1}.
		 */
		gfn = hva_to_gfn_memslot(hva_start, memslot);
		gfn_end = hva_to_gfn_memslot(hva_end + PAGE_SIZE - 1, memslot);

		for (; gfn < gfn_end; ++gfn) {
			gfn_t gfn_offset = gfn - memslot->base_gfn;

			ret = handler(kvm, &memslot->arch.rmap[gfn_offset], gfn);
			retval |= ret;
		}
	}

	return retval;
}

static int kvm_handle_hva(struct kvm *kvm, unsigned long hva,
			  int (*handler)(struct kvm *kvm, unsigned long *rmapp,
					 unsigned long gfn))
{
	return kvm_handle_hva_range(kvm, hva, hva + 1, handler);
}

static int kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp,
			   unsigned long gfn)
{
	struct revmap_entry *rev = kvm->arch.revmap;
	unsigned long h, i, j;
	unsigned long *hptep;
	unsigned long ptel, psize, rcbits;

	for (;;) {
		lock_rmap(rmapp);
		if (!(*rmapp & KVMPPC_RMAP_PRESENT)) {
			unlock_rmap(rmapp);
			break;
		}

		/*
		 * To avoid an ABBA deadlock with the HPTE lock bit,
		 * we can't spin on the HPTE lock while holding the
		 * rmap chain lock.
		 */
		i = *rmapp & KVMPPC_RMAP_INDEX;
		hptep = (unsigned long *) (kvm->arch.hpt_virt + (i << 4));
		if (!try_lock_hpte(hptep, HPTE_V_HVLOCK)) {
			/* unlock rmap before spinning on the HPTE lock */
			unlock_rmap(rmapp);
			while (hptep[0] & HPTE_V_HVLOCK)
				cpu_relax();
			continue;
		}
		j = rev[i].forw;
		if (j == i) {
			/* chain is now empty */
			*rmapp &= ~(KVMPPC_RMAP_PRESENT | KVMPPC_RMAP_INDEX);
		} else {
			/* remove i from chain */
			h = rev[i].back;
			rev[h].forw = j;
			rev[j].back = h;
			rev[i].forw = rev[i].back = i;
			*rmapp = (*rmapp & ~KVMPPC_RMAP_INDEX) | j;
		}

		/* Now check and modify the HPTE */
		ptel = rev[i].guest_rpte;
		psize = hpte_page_size(hptep[0], ptel);
		if ((hptep[0] & HPTE_V_VALID) &&
		    hpte_rpn(ptel, psize) == gfn) {
			hptep[0] |= HPTE_V_ABSENT;
			kvmppc_invalidate_hpte(kvm, hptep, i);
			/* Harvest R and C */
			rcbits = hptep[1] & (HPTE_R_R | HPTE_R_C);
			*rmapp |= rcbits << KVMPPC_RMAP_RC_SHIFT;
			rev[i].guest_rpte = ptel | rcbits;
		}
		unlock_rmap(rmapp);
		hptep[0] &= ~HPTE_V_HVLOCK;
	}
	return 0;
}

int kvm_unmap_hva(struct kvm *kvm, unsigned long hva)
{
	if (kvm->arch.using_mmu_notifiers)
		kvm_handle_hva(kvm, hva, kvm_unmap_rmapp);
	return 0;
}

int kvm_unmap_hva_range(struct kvm *kvm, unsigned long start, unsigned long end)
{
	if (kvm->arch.using_mmu_notifiers)
		kvm_handle_hva_range(kvm, start, end, kvm_unmap_rmapp);
	return 0;
}
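
/*
 * Per-gfn worker for kvm_age_hva(): report whether the page has been
 * referenced, clearing the REFERENCED flag in the rmap entry and the
 * reference (R) state in any HPTEs that map it (while saving R into
 * the guest view in guest_rpte) so that future references are noticed.
 */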

static int kvm_age_rmapp(struct kvm *kvm, unsigned long *rmapp,
			 unsigned long gfn)
{
	struct revmap_entry *rev = kvm->arch.revmap;
	unsigned long head, i, j;
	unsigned long *hptep;
	int ret = 0;

 retry:
	lock_rmap(rmapp);
	if (*rmapp & KVMPPC_RMAP_REFERENCED) {
		*rmapp &= ~KVMPPC_RMAP_REFERENCED;
		ret = 1;
	}
	if (!(*rmapp & KVMPPC_RMAP_PRESENT)) {
		unlock_rmap(rmapp);
		return ret;
	}

	i = head = *rmapp & KVMPPC_RMAP_INDEX;
	do {
		hptep = (unsigned long *) (kvm->arch.hpt_virt + (i << 4));
		j = rev[i].forw;

		/* If this HPTE isn't referenced, ignore it */
		if (!(hptep[1] & HPTE_R_R))
			continue;

		if (!try_lock_hpte(hptep, HPTE_V_HVLOCK)) {
			/* unlock rmap before spinning on the HPTE lock */
			unlock_rmap(rmapp);
			while (hptep[0] & HPTE_V_HVLOCK)
				cpu_relax();
			goto retry;
		}

		/* Now check and modify the HPTE */
		if ((hptep[0] & HPTE_V_VALID) && (hptep[1] & HPTE_R_R)) {
			kvmppc_clear_ref_hpte(kvm, hptep, i);
			rev[i].guest_rpte |= HPTE_R_R;
			ret = 1;
		}
		hptep[0] &= ~HPTE_V_HVLOCK;
	} while ((i = j) != head);

	unlock_rmap(rmapp);
	return ret;
}

int kvm_age_hva(struct kvm *kvm, unsigned long hva)
{
	if (!kvm->arch.using_mmu_notifiers)
		return 0;
	return kvm_handle_hva(kvm, hva, kvm_age_rmapp);
}

static int kvm_test_age_rmapp(struct kvm *kvm, unsigned long *rmapp,
			      unsigned long gfn)
{
	struct revmap_entry *rev = kvm->arch.revmap;
	unsigned long head, i, j;
	unsigned long *hp;
	int ret = 1;

	if (*rmapp & KVMPPC_RMAP_REFERENCED)
		return 1;

	lock_rmap(rmapp);
	if (*rmapp & KVMPPC_RMAP_REFERENCED)
		goto out;

	if (*rmapp & KVMPPC_RMAP_PRESENT) {
		i = head = *rmapp & KVMPPC_RMAP_INDEX;
		do {
			hp = (unsigned long *)(kvm->arch.hpt_virt + (i << 4));
			j = rev[i].forw;
			if (hp[1] & HPTE_R_R)
				goto out;
		} while ((i = j) != head);
	}
	ret = 0;

 out:
	unlock_rmap(rmapp);
	return ret;
}

int kvm_test_age_hva(struct kvm *kvm, unsigned long hva)
{
	if (!kvm->arch.using_mmu_notifiers)
		return 0;
	return kvm_handle_hva(kvm, hva, kvm_test_age_rmapp);
}

void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte)
{
	if (!kvm->arch.using_mmu_notifiers)
		return;
	kvm_handle_hva(kvm, hva, kvm_unmap_rmapp);
}

static int kvm_test_clear_dirty(struct kvm *kvm, unsigned long *rmapp)
{
	struct revmap_entry *rev = kvm->arch.revmap;
	unsigned long head, i, j;
	unsigned long *hptep;
	int ret = 0;

 retry:
	lock_rmap(rmapp);
	if (*rmapp & KVMPPC_RMAP_CHANGED) {
		*rmapp &= ~KVMPPC_RMAP_CHANGED;
		ret = 1;
	}
	if (!(*rmapp & KVMPPC_RMAP_PRESENT)) {
		unlock_rmap(rmapp);
		return ret;
	}

	i = head = *rmapp & KVMPPC_RMAP_INDEX;
	do {
		hptep = (unsigned long *) (kvm->arch.hpt_virt + (i << 4));
		j = rev[i].forw;

		if (!(hptep[1] & HPTE_R_C))
			continue;

		if (!try_lock_hpte(hptep, HPTE_V_HVLOCK)) {
			/* unlock rmap before spinning on the HPTE lock */
			unlock_rmap(rmapp);
			while (hptep[0] & HPTE_V_HVLOCK)
				cpu_relax();
			goto retry;
		}

		/* Now check and modify the HPTE */
		if ((hptep[0] & HPTE_V_VALID) && (hptep[1] & HPTE_R_C)) {
			/* need to make it temporarily absent to clear C */
			hptep[0] |= HPTE_V_ABSENT;
			kvmppc_invalidate_hpte(kvm, hptep, i);
			hptep[1] &= ~HPTE_R_C;
			eieio();
			hptep[0] = (hptep[0] & ~HPTE_V_ABSENT) | HPTE_V_VALID;
			rev[i].guest_rpte |= HPTE_R_C;
			ret = 1;
		}
		hptep[0] &= ~HPTE_V_HVLOCK;
	} while ((i = j) != head);

	unlock_rmap(rmapp);
	return ret;
}

long kvmppc_hv_get_dirty_log(struct kvm *kvm, struct kvm_memory_slot *memslot)
{
	unsigned long i;
	unsigned long *rmapp, *map;

	preempt_disable();
	rmapp = memslot->arch.rmap;
	map = memslot->dirty_bitmap;
	for (i = 0; i < memslot->npages; ++i) {
		if (kvm_test_clear_dirty(kvm, rmapp))
			__set_bit_le(i, map);
		++rmapp;
	}
	preempt_enable();
	return 0;
}
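
/*
 * kvmppc_pin_guest_page() pins the guest page containing gpa and
 * returns a kernel virtual address for it; *nb_ret is set to the
 * number of bytes from that address to the end of the (possibly huge)
 * page.  kvmppc_unpin_guest_page() drops the reference again.
 */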

void *kvmppc_pin_guest_page(struct kvm *kvm, unsigned long gpa,
			    unsigned long *nb_ret)
{
	struct kvm_memory_slot *memslot;
	unsigned long gfn = gpa >> PAGE_SHIFT;
	struct page *page, *pages[1];
	int npages;
	unsigned long hva, psize, offset;
	unsigned long pa;
	unsigned long *physp;

	memslot = gfn_to_memslot(kvm, gfn);
	if (!memslot || (memslot->flags & KVM_MEMSLOT_INVALID))
		return NULL;
	if (!kvm->arch.using_mmu_notifiers) {
		physp = kvm->arch.slot_phys[memslot->id];
		if (!physp)
			return NULL;
		physp += gfn - memslot->base_gfn;
		pa = *physp;
		if (!pa) {
			if (kvmppc_get_guest_page(kvm, gfn, memslot,
						  PAGE_SIZE) < 0)
				return NULL;
			pa = *physp;
		}
		page = pfn_to_page(pa >> PAGE_SHIFT);
		get_page(page);
	} else {
		hva = gfn_to_hva_memslot(memslot, gfn);
		npages = get_user_pages_fast(hva, 1, 1, pages);
		if (npages < 1)
			return NULL;
		page = pages[0];
	}
	psize = PAGE_SIZE;
	if (PageHuge(page)) {
		page = compound_head(page);
		psize <<= compound_order(page);
	}
	offset = gpa & (psize - 1);
	if (nb_ret)
		*nb_ret = psize - offset;
	return page_address(page) + offset;
}

void kvmppc_unpin_guest_page(struct kvm *kvm, void *va)
{
	struct page *page = virt_to_page(va);

	put_page(page);
}

void kvmppc_mmu_book3s_hv_init(struct kvm_vcpu *vcpu)
{
	struct kvmppc_mmu *mmu = &vcpu->arch.mmu;

	if (cpu_has_feature(CPU_FTR_ARCH_206))
		vcpu->arch.slb_nr = 32;		/* POWER7 */
	else
		vcpu->arch.slb_nr = 64;

	mmu->xlate = kvmppc_mmu_book3s_64_hv_xlate;
	mmu->reset_msr = kvmppc_mmu_book3s_64_hv_reset_msr;

	vcpu->arch.hflags |= BOOK3S_HFLAG_SLB;
}