1 /* 2 * Kernel-based Virtual Machine driver for Linux 3 * 4 * This module enables machines with Intel VT-x extensions to run virtual 5 * machines without emulation or binary translation. 6 * 7 * Copyright (C) 2006 Qumranet, Inc. 8 * Copyright 2010 Red Hat, Inc. and/or its affiliates. 9 * 10 * Authors: 11 * Avi Kivity <avi@qumranet.com> 12 * Yaniv Kamay <yaniv@qumranet.com> 13 * 14 * This work is licensed under the terms of the GNU GPL, version 2. See 15 * the COPYING file in the top-level directory. 16 * 17 */ 18 19 #include "iodev.h" 20 21 #include <linux/kvm_host.h> 22 #include <linux/kvm.h> 23 #include <linux/module.h> 24 #include <linux/errno.h> 25 #include <linux/percpu.h> 26 #include <linux/mm.h> 27 #include <linux/miscdevice.h> 28 #include <linux/vmalloc.h> 29 #include <linux/reboot.h> 30 #include <linux/debugfs.h> 31 #include <linux/highmem.h> 32 #include <linux/file.h> 33 #include <linux/syscore_ops.h> 34 #include <linux/cpu.h> 35 #include <linux/sched.h> 36 #include <linux/cpumask.h> 37 #include <linux/smp.h> 38 #include <linux/anon_inodes.h> 39 #include <linux/profile.h> 40 #include <linux/kvm_para.h> 41 #include <linux/pagemap.h> 42 #include <linux/mman.h> 43 #include <linux/swap.h> 44 #include <linux/bitops.h> 45 #include <linux/spinlock.h> 46 #include <linux/compat.h> 47 #include <linux/srcu.h> 48 #include <linux/hugetlb.h> 49 #include <linux/slab.h> 50 51 #include <asm/processor.h> 52 #include <asm/io.h> 53 #include <asm/uaccess.h> 54 #include <asm/pgtable.h> 55 56 #include "coalesced_mmio.h" 57 #include "async_pf.h" 58 59 #define CREATE_TRACE_POINTS 60 #include <trace/events/kvm.h> 61 62 MODULE_AUTHOR("Qumranet"); 63 MODULE_LICENSE("GPL"); 64 65 /* 66 * Ordering of locks: 67 * 68 * kvm->lock --> kvm->slots_lock --> kvm->irq_lock 69 */ 70 71 DEFINE_RAW_SPINLOCK(kvm_lock); 72 LIST_HEAD(vm_list); 73 74 static cpumask_var_t cpus_hardware_enabled; 75 static int kvm_usage_count = 0; 76 static atomic_t hardware_enable_failed; 77 78 struct kmem_cache *kvm_vcpu_cache; 79 EXPORT_SYMBOL_GPL(kvm_vcpu_cache); 80 81 static __read_mostly struct preempt_ops kvm_preempt_ops; 82 83 struct dentry *kvm_debugfs_dir; 84 85 static long kvm_vcpu_ioctl(struct file *file, unsigned int ioctl, 86 unsigned long arg); 87 static int hardware_enable_all(void); 88 static void hardware_disable_all(void); 89 90 static void kvm_io_bus_destroy(struct kvm_io_bus *bus); 91 92 bool kvm_rebooting; 93 EXPORT_SYMBOL_GPL(kvm_rebooting); 94 95 static bool largepages_enabled = true; 96 97 static struct page *hwpoison_page; 98 static pfn_t hwpoison_pfn; 99 100 static struct page *fault_page; 101 static pfn_t fault_pfn; 102 103 inline int kvm_is_mmio_pfn(pfn_t pfn) 104 { 105 if (pfn_valid(pfn)) { 106 int reserved; 107 struct page *tail = pfn_to_page(pfn); 108 struct page *head = compound_trans_head(tail); 109 reserved = PageReserved(head); 110 if (head != tail) { 111 /* 112 * "head" is not a dangling pointer 113 * (compound_trans_head takes care of that) 114 * but the hugepage may have been splitted 115 * from under us (and we may not hold a 116 * reference count on the head page so it can 117 * be reused before we run PageReferenced), so 118 * we've to check PageTail before returning 119 * what we just read. 
120 */ 121 smp_rmb(); 122 if (PageTail(tail)) 123 return reserved; 124 } 125 return PageReserved(tail); 126 } 127 128 return true; 129 } 130 131 /* 132 * Switches to specified vcpu, until a matching vcpu_put() 133 */ 134 void vcpu_load(struct kvm_vcpu *vcpu) 135 { 136 int cpu; 137 138 mutex_lock(&vcpu->mutex); 139 if (unlikely(vcpu->pid != current->pids[PIDTYPE_PID].pid)) { 140 /* The thread running this VCPU changed. */ 141 struct pid *oldpid = vcpu->pid; 142 struct pid *newpid = get_task_pid(current, PIDTYPE_PID); 143 rcu_assign_pointer(vcpu->pid, newpid); 144 synchronize_rcu(); 145 put_pid(oldpid); 146 } 147 cpu = get_cpu(); 148 preempt_notifier_register(&vcpu->preempt_notifier); 149 kvm_arch_vcpu_load(vcpu, cpu); 150 put_cpu(); 151 } 152 153 void vcpu_put(struct kvm_vcpu *vcpu) 154 { 155 preempt_disable(); 156 kvm_arch_vcpu_put(vcpu); 157 preempt_notifier_unregister(&vcpu->preempt_notifier); 158 preempt_enable(); 159 mutex_unlock(&vcpu->mutex); 160 } 161 162 static void ack_flush(void *_completed) 163 { 164 } 165 166 static bool make_all_cpus_request(struct kvm *kvm, unsigned int req) 167 { 168 int i, cpu, me; 169 cpumask_var_t cpus; 170 bool called = true; 171 struct kvm_vcpu *vcpu; 172 173 zalloc_cpumask_var(&cpus, GFP_ATOMIC); 174 175 me = get_cpu(); 176 kvm_for_each_vcpu(i, vcpu, kvm) { 177 kvm_make_request(req, vcpu); 178 cpu = vcpu->cpu; 179 180 /* Set ->requests bit before we read ->mode */ 181 smp_mb(); 182 183 if (cpus != NULL && cpu != -1 && cpu != me && 184 kvm_vcpu_exiting_guest_mode(vcpu) != OUTSIDE_GUEST_MODE) 185 cpumask_set_cpu(cpu, cpus); 186 } 187 if (unlikely(cpus == NULL)) 188 smp_call_function_many(cpu_online_mask, ack_flush, NULL, 1); 189 else if (!cpumask_empty(cpus)) 190 smp_call_function_many(cpus, ack_flush, NULL, 1); 191 else 192 called = false; 193 put_cpu(); 194 free_cpumask_var(cpus); 195 return called; 196 } 197 198 void kvm_flush_remote_tlbs(struct kvm *kvm) 199 { 200 int dirty_count = kvm->tlbs_dirty; 201 202 smp_mb(); 203 if (make_all_cpus_request(kvm, KVM_REQ_TLB_FLUSH)) 204 ++kvm->stat.remote_tlb_flush; 205 cmpxchg(&kvm->tlbs_dirty, dirty_count, 0); 206 } 207 208 void kvm_reload_remote_mmus(struct kvm *kvm) 209 { 210 make_all_cpus_request(kvm, KVM_REQ_MMU_RELOAD); 211 } 212 213 int kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id) 214 { 215 struct page *page; 216 int r; 217 218 mutex_init(&vcpu->mutex); 219 vcpu->cpu = -1; 220 vcpu->kvm = kvm; 221 vcpu->vcpu_id = id; 222 vcpu->pid = NULL; 223 init_waitqueue_head(&vcpu->wq); 224 kvm_async_pf_vcpu_init(vcpu); 225 226 page = alloc_page(GFP_KERNEL | __GFP_ZERO); 227 if (!page) { 228 r = -ENOMEM; 229 goto fail; 230 } 231 vcpu->run = page_address(page); 232 233 r = kvm_arch_vcpu_init(vcpu); 234 if (r < 0) 235 goto fail_free_run; 236 return 0; 237 238 fail_free_run: 239 free_page((unsigned long)vcpu->run); 240 fail: 241 return r; 242 } 243 EXPORT_SYMBOL_GPL(kvm_vcpu_init); 244 245 void kvm_vcpu_uninit(struct kvm_vcpu *vcpu) 246 { 247 put_pid(vcpu->pid); 248 kvm_arch_vcpu_uninit(vcpu); 249 free_page((unsigned long)vcpu->run); 250 } 251 EXPORT_SYMBOL_GPL(kvm_vcpu_uninit); 252 253 #if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER) 254 static inline struct kvm *mmu_notifier_to_kvm(struct mmu_notifier *mn) 255 { 256 return container_of(mn, struct kvm, mmu_notifier); 257 } 258 259 static void kvm_mmu_notifier_invalidate_page(struct mmu_notifier *mn, 260 struct mm_struct *mm, 261 unsigned long address) 262 { 263 struct kvm *kvm = mmu_notifier_to_kvm(mn); 264 int 
need_tlb_flush, idx; 265 266 /* 267 * When ->invalidate_page runs, the linux pte has been zapped 268 * already but the page is still allocated until 269 * ->invalidate_page returns. So if we increase the sequence 270 * here the kvm page fault will notice if the spte can't be 271 * established because the page is going to be freed. If 272 * instead the kvm page fault establishes the spte before 273 * ->invalidate_page runs, kvm_unmap_hva will release it 274 * before returning. 275 * 276 * The sequence increase only need to be seen at spin_unlock 277 * time, and not at spin_lock time. 278 * 279 * Increasing the sequence after the spin_unlock would be 280 * unsafe because the kvm page fault could then establish the 281 * pte after kvm_unmap_hva returned, without noticing the page 282 * is going to be freed. 283 */ 284 idx = srcu_read_lock(&kvm->srcu); 285 spin_lock(&kvm->mmu_lock); 286 kvm->mmu_notifier_seq++; 287 need_tlb_flush = kvm_unmap_hva(kvm, address) | kvm->tlbs_dirty; 288 spin_unlock(&kvm->mmu_lock); 289 srcu_read_unlock(&kvm->srcu, idx); 290 291 /* we've to flush the tlb before the pages can be freed */ 292 if (need_tlb_flush) 293 kvm_flush_remote_tlbs(kvm); 294 295 } 296 297 static void kvm_mmu_notifier_change_pte(struct mmu_notifier *mn, 298 struct mm_struct *mm, 299 unsigned long address, 300 pte_t pte) 301 { 302 struct kvm *kvm = mmu_notifier_to_kvm(mn); 303 int idx; 304 305 idx = srcu_read_lock(&kvm->srcu); 306 spin_lock(&kvm->mmu_lock); 307 kvm->mmu_notifier_seq++; 308 kvm_set_spte_hva(kvm, address, pte); 309 spin_unlock(&kvm->mmu_lock); 310 srcu_read_unlock(&kvm->srcu, idx); 311 } 312 313 static void kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn, 314 struct mm_struct *mm, 315 unsigned long start, 316 unsigned long end) 317 { 318 struct kvm *kvm = mmu_notifier_to_kvm(mn); 319 int need_tlb_flush = 0, idx; 320 321 idx = srcu_read_lock(&kvm->srcu); 322 spin_lock(&kvm->mmu_lock); 323 /* 324 * The count increase must become visible at unlock time as no 325 * spte can be established without taking the mmu_lock and 326 * count is also read inside the mmu_lock critical section. 327 */ 328 kvm->mmu_notifier_count++; 329 for (; start < end; start += PAGE_SIZE) 330 need_tlb_flush |= kvm_unmap_hva(kvm, start); 331 need_tlb_flush |= kvm->tlbs_dirty; 332 spin_unlock(&kvm->mmu_lock); 333 srcu_read_unlock(&kvm->srcu, idx); 334 335 /* we've to flush the tlb before the pages can be freed */ 336 if (need_tlb_flush) 337 kvm_flush_remote_tlbs(kvm); 338 } 339 340 static void kvm_mmu_notifier_invalidate_range_end(struct mmu_notifier *mn, 341 struct mm_struct *mm, 342 unsigned long start, 343 unsigned long end) 344 { 345 struct kvm *kvm = mmu_notifier_to_kvm(mn); 346 347 spin_lock(&kvm->mmu_lock); 348 /* 349 * This sequence increase will notify the kvm page fault that 350 * the page that is going to be mapped in the spte could have 351 * been freed. 352 */ 353 kvm->mmu_notifier_seq++; 354 /* 355 * The above sequence increase must be visible before the 356 * below count decrease but both values are read by the kvm 357 * page fault under mmu_lock spinlock so we don't need to add 358 * a smb_wmb() here in between the two. 
359 */ 360 kvm->mmu_notifier_count--; 361 spin_unlock(&kvm->mmu_lock); 362 363 BUG_ON(kvm->mmu_notifier_count < 0); 364 } 365 366 static int kvm_mmu_notifier_clear_flush_young(struct mmu_notifier *mn, 367 struct mm_struct *mm, 368 unsigned long address) 369 { 370 struct kvm *kvm = mmu_notifier_to_kvm(mn); 371 int young, idx; 372 373 idx = srcu_read_lock(&kvm->srcu); 374 spin_lock(&kvm->mmu_lock); 375 young = kvm_age_hva(kvm, address); 376 spin_unlock(&kvm->mmu_lock); 377 srcu_read_unlock(&kvm->srcu, idx); 378 379 if (young) 380 kvm_flush_remote_tlbs(kvm); 381 382 return young; 383 } 384 385 static int kvm_mmu_notifier_test_young(struct mmu_notifier *mn, 386 struct mm_struct *mm, 387 unsigned long address) 388 { 389 struct kvm *kvm = mmu_notifier_to_kvm(mn); 390 int young, idx; 391 392 idx = srcu_read_lock(&kvm->srcu); 393 spin_lock(&kvm->mmu_lock); 394 young = kvm_test_age_hva(kvm, address); 395 spin_unlock(&kvm->mmu_lock); 396 srcu_read_unlock(&kvm->srcu, idx); 397 398 return young; 399 } 400 401 static void kvm_mmu_notifier_release(struct mmu_notifier *mn, 402 struct mm_struct *mm) 403 { 404 struct kvm *kvm = mmu_notifier_to_kvm(mn); 405 int idx; 406 407 idx = srcu_read_lock(&kvm->srcu); 408 kvm_arch_flush_shadow(kvm); 409 srcu_read_unlock(&kvm->srcu, idx); 410 } 411 412 static const struct mmu_notifier_ops kvm_mmu_notifier_ops = { 413 .invalidate_page = kvm_mmu_notifier_invalidate_page, 414 .invalidate_range_start = kvm_mmu_notifier_invalidate_range_start, 415 .invalidate_range_end = kvm_mmu_notifier_invalidate_range_end, 416 .clear_flush_young = kvm_mmu_notifier_clear_flush_young, 417 .test_young = kvm_mmu_notifier_test_young, 418 .change_pte = kvm_mmu_notifier_change_pte, 419 .release = kvm_mmu_notifier_release, 420 }; 421 422 static int kvm_init_mmu_notifier(struct kvm *kvm) 423 { 424 kvm->mmu_notifier.ops = &kvm_mmu_notifier_ops; 425 return mmu_notifier_register(&kvm->mmu_notifier, current->mm); 426 } 427 428 #else /* !(CONFIG_MMU_NOTIFIER && KVM_ARCH_WANT_MMU_NOTIFIER) */ 429 430 static int kvm_init_mmu_notifier(struct kvm *kvm) 431 { 432 return 0; 433 } 434 435 #endif /* CONFIG_MMU_NOTIFIER && KVM_ARCH_WANT_MMU_NOTIFIER */ 436 437 static struct kvm *kvm_create_vm(void) 438 { 439 int r, i; 440 struct kvm *kvm = kvm_arch_alloc_vm(); 441 442 if (!kvm) 443 return ERR_PTR(-ENOMEM); 444 445 r = kvm_arch_init_vm(kvm); 446 if (r) 447 goto out_err_nodisable; 448 449 r = hardware_enable_all(); 450 if (r) 451 goto out_err_nodisable; 452 453 #ifdef CONFIG_HAVE_KVM_IRQCHIP 454 INIT_HLIST_HEAD(&kvm->mask_notifier_list); 455 INIT_HLIST_HEAD(&kvm->irq_ack_notifier_list); 456 #endif 457 458 r = -ENOMEM; 459 kvm->memslots = kzalloc(sizeof(struct kvm_memslots), GFP_KERNEL); 460 if (!kvm->memslots) 461 goto out_err_nosrcu; 462 if (init_srcu_struct(&kvm->srcu)) 463 goto out_err_nosrcu; 464 for (i = 0; i < KVM_NR_BUSES; i++) { 465 kvm->buses[i] = kzalloc(sizeof(struct kvm_io_bus), 466 GFP_KERNEL); 467 if (!kvm->buses[i]) 468 goto out_err; 469 } 470 471 spin_lock_init(&kvm->mmu_lock); 472 kvm->mm = current->mm; 473 atomic_inc(&kvm->mm->mm_count); 474 kvm_eventfd_init(kvm); 475 mutex_init(&kvm->lock); 476 mutex_init(&kvm->irq_lock); 477 mutex_init(&kvm->slots_lock); 478 atomic_set(&kvm->users_count, 1); 479 480 r = kvm_init_mmu_notifier(kvm); 481 if (r) 482 goto out_err; 483 484 raw_spin_lock(&kvm_lock); 485 list_add(&kvm->vm_list, &vm_list); 486 raw_spin_unlock(&kvm_lock); 487 488 return kvm; 489 490 out_err: 491 cleanup_srcu_struct(&kvm->srcu); 492 out_err_nosrcu: 493 hardware_disable_all(); 494 
out_err_nodisable: 495 for (i = 0; i < KVM_NR_BUSES; i++) 496 kfree(kvm->buses[i]); 497 kfree(kvm->memslots); 498 kvm_arch_free_vm(kvm); 499 return ERR_PTR(r); 500 } 501 502 static void kvm_destroy_dirty_bitmap(struct kvm_memory_slot *memslot) 503 { 504 if (!memslot->dirty_bitmap) 505 return; 506 507 if (2 * kvm_dirty_bitmap_bytes(memslot) > PAGE_SIZE) 508 vfree(memslot->dirty_bitmap_head); 509 else 510 kfree(memslot->dirty_bitmap_head); 511 512 memslot->dirty_bitmap = NULL; 513 memslot->dirty_bitmap_head = NULL; 514 } 515 516 /* 517 * Free any memory in @free but not in @dont. 518 */ 519 static void kvm_free_physmem_slot(struct kvm_memory_slot *free, 520 struct kvm_memory_slot *dont) 521 { 522 int i; 523 524 if (!dont || free->rmap != dont->rmap) 525 vfree(free->rmap); 526 527 if (!dont || free->dirty_bitmap != dont->dirty_bitmap) 528 kvm_destroy_dirty_bitmap(free); 529 530 531 for (i = 0; i < KVM_NR_PAGE_SIZES - 1; ++i) { 532 if (!dont || free->lpage_info[i] != dont->lpage_info[i]) { 533 vfree(free->lpage_info[i]); 534 free->lpage_info[i] = NULL; 535 } 536 } 537 538 free->npages = 0; 539 free->rmap = NULL; 540 } 541 542 void kvm_free_physmem(struct kvm *kvm) 543 { 544 int i; 545 struct kvm_memslots *slots = kvm->memslots; 546 547 for (i = 0; i < slots->nmemslots; ++i) 548 kvm_free_physmem_slot(&slots->memslots[i], NULL); 549 550 kfree(kvm->memslots); 551 } 552 553 static void kvm_destroy_vm(struct kvm *kvm) 554 { 555 int i; 556 struct mm_struct *mm = kvm->mm; 557 558 kvm_arch_sync_events(kvm); 559 raw_spin_lock(&kvm_lock); 560 list_del(&kvm->vm_list); 561 raw_spin_unlock(&kvm_lock); 562 kvm_free_irq_routing(kvm); 563 for (i = 0; i < KVM_NR_BUSES; i++) 564 kvm_io_bus_destroy(kvm->buses[i]); 565 kvm_coalesced_mmio_free(kvm); 566 #if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER) 567 mmu_notifier_unregister(&kvm->mmu_notifier, kvm->mm); 568 #else 569 kvm_arch_flush_shadow(kvm); 570 #endif 571 kvm_arch_destroy_vm(kvm); 572 kvm_free_physmem(kvm); 573 cleanup_srcu_struct(&kvm->srcu); 574 kvm_arch_free_vm(kvm); 575 hardware_disable_all(); 576 mmdrop(mm); 577 } 578 579 void kvm_get_kvm(struct kvm *kvm) 580 { 581 atomic_inc(&kvm->users_count); 582 } 583 EXPORT_SYMBOL_GPL(kvm_get_kvm); 584 585 void kvm_put_kvm(struct kvm *kvm) 586 { 587 if (atomic_dec_and_test(&kvm->users_count)) 588 kvm_destroy_vm(kvm); 589 } 590 EXPORT_SYMBOL_GPL(kvm_put_kvm); 591 592 593 static int kvm_vm_release(struct inode *inode, struct file *filp) 594 { 595 struct kvm *kvm = filp->private_data; 596 597 kvm_irqfd_release(kvm); 598 599 kvm_put_kvm(kvm); 600 return 0; 601 } 602 603 #ifndef CONFIG_S390 604 /* 605 * Allocation size is twice as large as the actual dirty bitmap size. 606 * This makes it possible to do double buffering: see x86's 607 * kvm_vm_ioctl_get_dirty_log(). 608 */ 609 static int kvm_create_dirty_bitmap(struct kvm_memory_slot *memslot) 610 { 611 unsigned long dirty_bytes = 2 * kvm_dirty_bitmap_bytes(memslot); 612 613 if (dirty_bytes > PAGE_SIZE) 614 memslot->dirty_bitmap = vzalloc(dirty_bytes); 615 else 616 memslot->dirty_bitmap = kzalloc(dirty_bytes, GFP_KERNEL); 617 618 if (!memslot->dirty_bitmap) 619 return -ENOMEM; 620 621 memslot->dirty_bitmap_head = memslot->dirty_bitmap; 622 return 0; 623 } 624 #endif /* !CONFIG_S390 */ 625 626 /* 627 * Allocate some memory and give it an address in the guest physical address 628 * space. 629 * 630 * Discontiguous memory is allowed, mostly for framebuffers. 631 * 632 * Must be called holding mmap_sem for write. 
633 */ 634 int __kvm_set_memory_region(struct kvm *kvm, 635 struct kvm_userspace_memory_region *mem, 636 int user_alloc) 637 { 638 int r; 639 gfn_t base_gfn; 640 unsigned long npages; 641 unsigned long i; 642 struct kvm_memory_slot *memslot; 643 struct kvm_memory_slot old, new; 644 struct kvm_memslots *slots, *old_memslots; 645 646 r = -EINVAL; 647 /* General sanity checks */ 648 if (mem->memory_size & (PAGE_SIZE - 1)) 649 goto out; 650 if (mem->guest_phys_addr & (PAGE_SIZE - 1)) 651 goto out; 652 /* We can read the guest memory with __xxx_user() later on. */ 653 if (user_alloc && 654 ((mem->userspace_addr & (PAGE_SIZE - 1)) || 655 !access_ok(VERIFY_WRITE, 656 (void __user *)(unsigned long)mem->userspace_addr, 657 mem->memory_size))) 658 goto out; 659 if (mem->slot >= KVM_MEMORY_SLOTS + KVM_PRIVATE_MEM_SLOTS) 660 goto out; 661 if (mem->guest_phys_addr + mem->memory_size < mem->guest_phys_addr) 662 goto out; 663 664 memslot = &kvm->memslots->memslots[mem->slot]; 665 base_gfn = mem->guest_phys_addr >> PAGE_SHIFT; 666 npages = mem->memory_size >> PAGE_SHIFT; 667 668 r = -EINVAL; 669 if (npages > KVM_MEM_MAX_NR_PAGES) 670 goto out; 671 672 if (!npages) 673 mem->flags &= ~KVM_MEM_LOG_DIRTY_PAGES; 674 675 new = old = *memslot; 676 677 new.id = mem->slot; 678 new.base_gfn = base_gfn; 679 new.npages = npages; 680 new.flags = mem->flags; 681 682 /* Disallow changing a memory slot's size. */ 683 r = -EINVAL; 684 if (npages && old.npages && npages != old.npages) 685 goto out_free; 686 687 /* Check for overlaps */ 688 r = -EEXIST; 689 for (i = 0; i < KVM_MEMORY_SLOTS; ++i) { 690 struct kvm_memory_slot *s = &kvm->memslots->memslots[i]; 691 692 if (s == memslot || !s->npages) 693 continue; 694 if (!((base_gfn + npages <= s->base_gfn) || 695 (base_gfn >= s->base_gfn + s->npages))) 696 goto out_free; 697 } 698 699 /* Free page dirty bitmap if unneeded */ 700 if (!(new.flags & KVM_MEM_LOG_DIRTY_PAGES)) 701 new.dirty_bitmap = NULL; 702 703 r = -ENOMEM; 704 705 /* Allocate if a slot is being created */ 706 #ifndef CONFIG_S390 707 if (npages && !new.rmap) { 708 new.rmap = vzalloc(npages * sizeof(*new.rmap)); 709 710 if (!new.rmap) 711 goto out_free; 712 713 new.user_alloc = user_alloc; 714 new.userspace_addr = mem->userspace_addr; 715 } 716 if (!npages) 717 goto skip_lpage; 718 719 for (i = 0; i < KVM_NR_PAGE_SIZES - 1; ++i) { 720 unsigned long ugfn; 721 unsigned long j; 722 int lpages; 723 int level = i + 2; 724 725 /* Avoid unused variable warning if no large pages */ 726 (void)level; 727 728 if (new.lpage_info[i]) 729 continue; 730 731 lpages = 1 + ((base_gfn + npages - 1) 732 >> KVM_HPAGE_GFN_SHIFT(level)); 733 lpages -= base_gfn >> KVM_HPAGE_GFN_SHIFT(level); 734 735 new.lpage_info[i] = vzalloc(lpages * sizeof(*new.lpage_info[i])); 736 737 if (!new.lpage_info[i]) 738 goto out_free; 739 740 if (base_gfn & (KVM_PAGES_PER_HPAGE(level) - 1)) 741 new.lpage_info[i][0].write_count = 1; 742 if ((base_gfn+npages) & (KVM_PAGES_PER_HPAGE(level) - 1)) 743 new.lpage_info[i][lpages - 1].write_count = 1; 744 ugfn = new.userspace_addr >> PAGE_SHIFT; 745 /* 746 * If the gfn and userspace address are not aligned wrt each 747 * other, or if explicitly asked to, disable large page 748 * support for this slot 749 */ 750 if ((base_gfn ^ ugfn) & (KVM_PAGES_PER_HPAGE(level) - 1) || 751 !largepages_enabled) 752 for (j = 0; j < lpages; ++j) 753 new.lpage_info[i][j].write_count = 1; 754 } 755 756 skip_lpage: 757 758 /* Allocate page dirty bitmap if needed */ 759 if ((new.flags & KVM_MEM_LOG_DIRTY_PAGES) && !new.dirty_bitmap) { 760 
if (kvm_create_dirty_bitmap(&new) < 0) 761 goto out_free; 762 /* destroy any largepage mappings for dirty tracking */ 763 } 764 #else /* not defined CONFIG_S390 */ 765 new.user_alloc = user_alloc; 766 if (user_alloc) 767 new.userspace_addr = mem->userspace_addr; 768 #endif /* not defined CONFIG_S390 */ 769 770 if (!npages) { 771 r = -ENOMEM; 772 slots = kzalloc(sizeof(struct kvm_memslots), GFP_KERNEL); 773 if (!slots) 774 goto out_free; 775 memcpy(slots, kvm->memslots, sizeof(struct kvm_memslots)); 776 if (mem->slot >= slots->nmemslots) 777 slots->nmemslots = mem->slot + 1; 778 slots->generation++; 779 slots->memslots[mem->slot].flags |= KVM_MEMSLOT_INVALID; 780 781 old_memslots = kvm->memslots; 782 rcu_assign_pointer(kvm->memslots, slots); 783 synchronize_srcu_expedited(&kvm->srcu); 784 /* From this point no new shadow pages pointing to a deleted 785 * memslot will be created. 786 * 787 * validation of sp->gfn happens in: 788 * - gfn_to_hva (kvm_read_guest, gfn_to_pfn) 789 * - kvm_is_visible_gfn (mmu_check_roots) 790 */ 791 kvm_arch_flush_shadow(kvm); 792 kfree(old_memslots); 793 } 794 795 r = kvm_arch_prepare_memory_region(kvm, &new, old, mem, user_alloc); 796 if (r) 797 goto out_free; 798 799 /* map the pages in iommu page table */ 800 if (npages) { 801 r = kvm_iommu_map_pages(kvm, &new); 802 if (r) 803 goto out_free; 804 } 805 806 r = -ENOMEM; 807 slots = kzalloc(sizeof(struct kvm_memslots), GFP_KERNEL); 808 if (!slots) 809 goto out_free; 810 memcpy(slots, kvm->memslots, sizeof(struct kvm_memslots)); 811 if (mem->slot >= slots->nmemslots) 812 slots->nmemslots = mem->slot + 1; 813 slots->generation++; 814 815 /* actual memory is freed via old in kvm_free_physmem_slot below */ 816 if (!npages) { 817 new.rmap = NULL; 818 new.dirty_bitmap = NULL; 819 for (i = 0; i < KVM_NR_PAGE_SIZES - 1; ++i) 820 new.lpage_info[i] = NULL; 821 } 822 823 slots->memslots[mem->slot] = new; 824 old_memslots = kvm->memslots; 825 rcu_assign_pointer(kvm->memslots, slots); 826 synchronize_srcu_expedited(&kvm->srcu); 827 828 kvm_arch_commit_memory_region(kvm, mem, old, user_alloc); 829 830 kvm_free_physmem_slot(&old, &new); 831 kfree(old_memslots); 832 833 return 0; 834 835 out_free: 836 kvm_free_physmem_slot(&new, &old); 837 out: 838 return r; 839 840 } 841 EXPORT_SYMBOL_GPL(__kvm_set_memory_region); 842 843 int kvm_set_memory_region(struct kvm *kvm, 844 struct kvm_userspace_memory_region *mem, 845 int user_alloc) 846 { 847 int r; 848 849 mutex_lock(&kvm->slots_lock); 850 r = __kvm_set_memory_region(kvm, mem, user_alloc); 851 mutex_unlock(&kvm->slots_lock); 852 return r; 853 } 854 EXPORT_SYMBOL_GPL(kvm_set_memory_region); 855 856 int kvm_vm_ioctl_set_memory_region(struct kvm *kvm, 857 struct 858 kvm_userspace_memory_region *mem, 859 int user_alloc) 860 { 861 if (mem->slot >= KVM_MEMORY_SLOTS) 862 return -EINVAL; 863 return kvm_set_memory_region(kvm, mem, user_alloc); 864 } 865 866 int kvm_get_dirty_log(struct kvm *kvm, 867 struct kvm_dirty_log *log, int *is_dirty) 868 { 869 struct kvm_memory_slot *memslot; 870 int r, i; 871 unsigned long n; 872 unsigned long any = 0; 873 874 r = -EINVAL; 875 if (log->slot >= KVM_MEMORY_SLOTS) 876 goto out; 877 878 memslot = &kvm->memslots->memslots[log->slot]; 879 r = -ENOENT; 880 if (!memslot->dirty_bitmap) 881 goto out; 882 883 n = kvm_dirty_bitmap_bytes(memslot); 884 885 for (i = 0; !any && i < n/sizeof(long); ++i) 886 any = memslot->dirty_bitmap[i]; 887 888 r = -EFAULT; 889 if (copy_to_user(log->dirty_bitmap, memslot->dirty_bitmap, n)) 890 goto out; 891 892 if (any) 893 
*is_dirty = 1; 894 895 r = 0; 896 out: 897 return r; 898 } 899 900 void kvm_disable_largepages(void) 901 { 902 largepages_enabled = false; 903 } 904 EXPORT_SYMBOL_GPL(kvm_disable_largepages); 905 906 int is_error_page(struct page *page) 907 { 908 return page == bad_page || page == hwpoison_page || page == fault_page; 909 } 910 EXPORT_SYMBOL_GPL(is_error_page); 911 912 int is_error_pfn(pfn_t pfn) 913 { 914 return pfn == bad_pfn || pfn == hwpoison_pfn || pfn == fault_pfn; 915 } 916 EXPORT_SYMBOL_GPL(is_error_pfn); 917 918 int is_hwpoison_pfn(pfn_t pfn) 919 { 920 return pfn == hwpoison_pfn; 921 } 922 EXPORT_SYMBOL_GPL(is_hwpoison_pfn); 923 924 int is_fault_pfn(pfn_t pfn) 925 { 926 return pfn == fault_pfn; 927 } 928 EXPORT_SYMBOL_GPL(is_fault_pfn); 929 930 static inline unsigned long bad_hva(void) 931 { 932 return PAGE_OFFSET; 933 } 934 935 int kvm_is_error_hva(unsigned long addr) 936 { 937 return addr == bad_hva(); 938 } 939 EXPORT_SYMBOL_GPL(kvm_is_error_hva); 940 941 static struct kvm_memory_slot *__gfn_to_memslot(struct kvm_memslots *slots, 942 gfn_t gfn) 943 { 944 int i; 945 946 for (i = 0; i < slots->nmemslots; ++i) { 947 struct kvm_memory_slot *memslot = &slots->memslots[i]; 948 949 if (gfn >= memslot->base_gfn 950 && gfn < memslot->base_gfn + memslot->npages) 951 return memslot; 952 } 953 return NULL; 954 } 955 956 struct kvm_memory_slot *gfn_to_memslot(struct kvm *kvm, gfn_t gfn) 957 { 958 return __gfn_to_memslot(kvm_memslots(kvm), gfn); 959 } 960 EXPORT_SYMBOL_GPL(gfn_to_memslot); 961 962 int kvm_is_visible_gfn(struct kvm *kvm, gfn_t gfn) 963 { 964 int i; 965 struct kvm_memslots *slots = kvm_memslots(kvm); 966 967 for (i = 0; i < KVM_MEMORY_SLOTS; ++i) { 968 struct kvm_memory_slot *memslot = &slots->memslots[i]; 969 970 if (memslot->flags & KVM_MEMSLOT_INVALID) 971 continue; 972 973 if (gfn >= memslot->base_gfn 974 && gfn < memslot->base_gfn + memslot->npages) 975 return 1; 976 } 977 return 0; 978 } 979 EXPORT_SYMBOL_GPL(kvm_is_visible_gfn); 980 981 unsigned long kvm_host_page_size(struct kvm *kvm, gfn_t gfn) 982 { 983 struct vm_area_struct *vma; 984 unsigned long addr, size; 985 986 size = PAGE_SIZE; 987 988 addr = gfn_to_hva(kvm, gfn); 989 if (kvm_is_error_hva(addr)) 990 return PAGE_SIZE; 991 992 down_read(¤t->mm->mmap_sem); 993 vma = find_vma(current->mm, addr); 994 if (!vma) 995 goto out; 996 997 size = vma_kernel_pagesize(vma); 998 999 out: 1000 up_read(¤t->mm->mmap_sem); 1001 1002 return size; 1003 } 1004 1005 static unsigned long gfn_to_hva_many(struct kvm_memory_slot *slot, gfn_t gfn, 1006 gfn_t *nr_pages) 1007 { 1008 if (!slot || slot->flags & KVM_MEMSLOT_INVALID) 1009 return bad_hva(); 1010 1011 if (nr_pages) 1012 *nr_pages = slot->npages - (gfn - slot->base_gfn); 1013 1014 return gfn_to_hva_memslot(slot, gfn); 1015 } 1016 1017 unsigned long gfn_to_hva(struct kvm *kvm, gfn_t gfn) 1018 { 1019 return gfn_to_hva_many(gfn_to_memslot(kvm, gfn), gfn, NULL); 1020 } 1021 EXPORT_SYMBOL_GPL(gfn_to_hva); 1022 1023 static pfn_t get_fault_pfn(void) 1024 { 1025 get_page(fault_page); 1026 return fault_pfn; 1027 } 1028 1029 int get_user_page_nowait(struct task_struct *tsk, struct mm_struct *mm, 1030 unsigned long start, int write, struct page **page) 1031 { 1032 int flags = FOLL_TOUCH | FOLL_NOWAIT | FOLL_HWPOISON | FOLL_GET; 1033 1034 if (write) 1035 flags |= FOLL_WRITE; 1036 1037 return __get_user_pages(tsk, mm, start, 1, flags, page, NULL, NULL); 1038 } 1039 1040 static inline int check_user_page_hwpoison(unsigned long addr) 1041 { 1042 int rc, flags = FOLL_TOUCH | FOLL_HWPOISON | 
FOLL_WRITE; 1043 1044 rc = __get_user_pages(current, current->mm, addr, 1, 1045 flags, NULL, NULL, NULL); 1046 return rc == -EHWPOISON; 1047 } 1048 1049 static pfn_t hva_to_pfn(struct kvm *kvm, unsigned long addr, bool atomic, 1050 bool *async, bool write_fault, bool *writable) 1051 { 1052 struct page *page[1]; 1053 int npages = 0; 1054 pfn_t pfn; 1055 1056 /* we can do it either atomically or asynchronously, not both */ 1057 BUG_ON(atomic && async); 1058 1059 BUG_ON(!write_fault && !writable); 1060 1061 if (writable) 1062 *writable = true; 1063 1064 if (atomic || async) 1065 npages = __get_user_pages_fast(addr, 1, 1, page); 1066 1067 if (unlikely(npages != 1) && !atomic) { 1068 might_sleep(); 1069 1070 if (writable) 1071 *writable = write_fault; 1072 1073 if (async) { 1074 down_read(¤t->mm->mmap_sem); 1075 npages = get_user_page_nowait(current, current->mm, 1076 addr, write_fault, page); 1077 up_read(¤t->mm->mmap_sem); 1078 } else 1079 npages = get_user_pages_fast(addr, 1, write_fault, 1080 page); 1081 1082 /* map read fault as writable if possible */ 1083 if (unlikely(!write_fault) && npages == 1) { 1084 struct page *wpage[1]; 1085 1086 npages = __get_user_pages_fast(addr, 1, 1, wpage); 1087 if (npages == 1) { 1088 *writable = true; 1089 put_page(page[0]); 1090 page[0] = wpage[0]; 1091 } 1092 npages = 1; 1093 } 1094 } 1095 1096 if (unlikely(npages != 1)) { 1097 struct vm_area_struct *vma; 1098 1099 if (atomic) 1100 return get_fault_pfn(); 1101 1102 down_read(¤t->mm->mmap_sem); 1103 if (npages == -EHWPOISON || 1104 (!async && check_user_page_hwpoison(addr))) { 1105 up_read(¤t->mm->mmap_sem); 1106 get_page(hwpoison_page); 1107 return page_to_pfn(hwpoison_page); 1108 } 1109 1110 vma = find_vma_intersection(current->mm, addr, addr+1); 1111 1112 if (vma == NULL) 1113 pfn = get_fault_pfn(); 1114 else if ((vma->vm_flags & VM_PFNMAP)) { 1115 pfn = ((addr - vma->vm_start) >> PAGE_SHIFT) + 1116 vma->vm_pgoff; 1117 BUG_ON(!kvm_is_mmio_pfn(pfn)); 1118 } else { 1119 if (async && (vma->vm_flags & VM_WRITE)) 1120 *async = true; 1121 pfn = get_fault_pfn(); 1122 } 1123 up_read(¤t->mm->mmap_sem); 1124 } else 1125 pfn = page_to_pfn(page[0]); 1126 1127 return pfn; 1128 } 1129 1130 pfn_t hva_to_pfn_atomic(struct kvm *kvm, unsigned long addr) 1131 { 1132 return hva_to_pfn(kvm, addr, true, NULL, true, NULL); 1133 } 1134 EXPORT_SYMBOL_GPL(hva_to_pfn_atomic); 1135 1136 static pfn_t __gfn_to_pfn(struct kvm *kvm, gfn_t gfn, bool atomic, bool *async, 1137 bool write_fault, bool *writable) 1138 { 1139 unsigned long addr; 1140 1141 if (async) 1142 *async = false; 1143 1144 addr = gfn_to_hva(kvm, gfn); 1145 if (kvm_is_error_hva(addr)) { 1146 get_page(bad_page); 1147 return page_to_pfn(bad_page); 1148 } 1149 1150 return hva_to_pfn(kvm, addr, atomic, async, write_fault, writable); 1151 } 1152 1153 pfn_t gfn_to_pfn_atomic(struct kvm *kvm, gfn_t gfn) 1154 { 1155 return __gfn_to_pfn(kvm, gfn, true, NULL, true, NULL); 1156 } 1157 EXPORT_SYMBOL_GPL(gfn_to_pfn_atomic); 1158 1159 pfn_t gfn_to_pfn_async(struct kvm *kvm, gfn_t gfn, bool *async, 1160 bool write_fault, bool *writable) 1161 { 1162 return __gfn_to_pfn(kvm, gfn, false, async, write_fault, writable); 1163 } 1164 EXPORT_SYMBOL_GPL(gfn_to_pfn_async); 1165 1166 pfn_t gfn_to_pfn(struct kvm *kvm, gfn_t gfn) 1167 { 1168 return __gfn_to_pfn(kvm, gfn, false, NULL, true, NULL); 1169 } 1170 EXPORT_SYMBOL_GPL(gfn_to_pfn); 1171 1172 pfn_t gfn_to_pfn_prot(struct kvm *kvm, gfn_t gfn, bool write_fault, 1173 bool *writable) 1174 { 1175 return __gfn_to_pfn(kvm, gfn, false, NULL, 
write_fault, writable); 1176 } 1177 EXPORT_SYMBOL_GPL(gfn_to_pfn_prot); 1178 1179 pfn_t gfn_to_pfn_memslot(struct kvm *kvm, 1180 struct kvm_memory_slot *slot, gfn_t gfn) 1181 { 1182 unsigned long addr = gfn_to_hva_memslot(slot, gfn); 1183 return hva_to_pfn(kvm, addr, false, NULL, true, NULL); 1184 } 1185 1186 int gfn_to_page_many_atomic(struct kvm *kvm, gfn_t gfn, struct page **pages, 1187 int nr_pages) 1188 { 1189 unsigned long addr; 1190 gfn_t entry; 1191 1192 addr = gfn_to_hva_many(gfn_to_memslot(kvm, gfn), gfn, &entry); 1193 if (kvm_is_error_hva(addr)) 1194 return -1; 1195 1196 if (entry < nr_pages) 1197 return 0; 1198 1199 return __get_user_pages_fast(addr, nr_pages, 1, pages); 1200 } 1201 EXPORT_SYMBOL_GPL(gfn_to_page_many_atomic); 1202 1203 struct page *gfn_to_page(struct kvm *kvm, gfn_t gfn) 1204 { 1205 pfn_t pfn; 1206 1207 pfn = gfn_to_pfn(kvm, gfn); 1208 if (!kvm_is_mmio_pfn(pfn)) 1209 return pfn_to_page(pfn); 1210 1211 WARN_ON(kvm_is_mmio_pfn(pfn)); 1212 1213 get_page(bad_page); 1214 return bad_page; 1215 } 1216 1217 EXPORT_SYMBOL_GPL(gfn_to_page); 1218 1219 void kvm_release_page_clean(struct page *page) 1220 { 1221 kvm_release_pfn_clean(page_to_pfn(page)); 1222 } 1223 EXPORT_SYMBOL_GPL(kvm_release_page_clean); 1224 1225 void kvm_release_pfn_clean(pfn_t pfn) 1226 { 1227 if (!kvm_is_mmio_pfn(pfn)) 1228 put_page(pfn_to_page(pfn)); 1229 } 1230 EXPORT_SYMBOL_GPL(kvm_release_pfn_clean); 1231 1232 void kvm_release_page_dirty(struct page *page) 1233 { 1234 kvm_release_pfn_dirty(page_to_pfn(page)); 1235 } 1236 EXPORT_SYMBOL_GPL(kvm_release_page_dirty); 1237 1238 void kvm_release_pfn_dirty(pfn_t pfn) 1239 { 1240 kvm_set_pfn_dirty(pfn); 1241 kvm_release_pfn_clean(pfn); 1242 } 1243 EXPORT_SYMBOL_GPL(kvm_release_pfn_dirty); 1244 1245 void kvm_set_page_dirty(struct page *page) 1246 { 1247 kvm_set_pfn_dirty(page_to_pfn(page)); 1248 } 1249 EXPORT_SYMBOL_GPL(kvm_set_page_dirty); 1250 1251 void kvm_set_pfn_dirty(pfn_t pfn) 1252 { 1253 if (!kvm_is_mmio_pfn(pfn)) { 1254 struct page *page = pfn_to_page(pfn); 1255 if (!PageReserved(page)) 1256 SetPageDirty(page); 1257 } 1258 } 1259 EXPORT_SYMBOL_GPL(kvm_set_pfn_dirty); 1260 1261 void kvm_set_pfn_accessed(pfn_t pfn) 1262 { 1263 if (!kvm_is_mmio_pfn(pfn)) 1264 mark_page_accessed(pfn_to_page(pfn)); 1265 } 1266 EXPORT_SYMBOL_GPL(kvm_set_pfn_accessed); 1267 1268 void kvm_get_pfn(pfn_t pfn) 1269 { 1270 if (!kvm_is_mmio_pfn(pfn)) 1271 get_page(pfn_to_page(pfn)); 1272 } 1273 EXPORT_SYMBOL_GPL(kvm_get_pfn); 1274 1275 static int next_segment(unsigned long len, int offset) 1276 { 1277 if (len > PAGE_SIZE - offset) 1278 return PAGE_SIZE - offset; 1279 else 1280 return len; 1281 } 1282 1283 int kvm_read_guest_page(struct kvm *kvm, gfn_t gfn, void *data, int offset, 1284 int len) 1285 { 1286 int r; 1287 unsigned long addr; 1288 1289 addr = gfn_to_hva(kvm, gfn); 1290 if (kvm_is_error_hva(addr)) 1291 return -EFAULT; 1292 r = __copy_from_user(data, (void __user *)addr + offset, len); 1293 if (r) 1294 return -EFAULT; 1295 return 0; 1296 } 1297 EXPORT_SYMBOL_GPL(kvm_read_guest_page); 1298 1299 int kvm_read_guest(struct kvm *kvm, gpa_t gpa, void *data, unsigned long len) 1300 { 1301 gfn_t gfn = gpa >> PAGE_SHIFT; 1302 int seg; 1303 int offset = offset_in_page(gpa); 1304 int ret; 1305 1306 while ((seg = next_segment(len, offset)) != 0) { 1307 ret = kvm_read_guest_page(kvm, gfn, data, offset, seg); 1308 if (ret < 0) 1309 return ret; 1310 offset = 0; 1311 len -= seg; 1312 data += seg; 1313 ++gfn; 1314 } 1315 return 0; 1316 } 1317 EXPORT_SYMBOL_GPL(kvm_read_guest); 1318 
1319 int kvm_read_guest_atomic(struct kvm *kvm, gpa_t gpa, void *data, 1320 unsigned long len) 1321 { 1322 int r; 1323 unsigned long addr; 1324 gfn_t gfn = gpa >> PAGE_SHIFT; 1325 int offset = offset_in_page(gpa); 1326 1327 addr = gfn_to_hva(kvm, gfn); 1328 if (kvm_is_error_hva(addr)) 1329 return -EFAULT; 1330 pagefault_disable(); 1331 r = __copy_from_user_inatomic(data, (void __user *)addr + offset, len); 1332 pagefault_enable(); 1333 if (r) 1334 return -EFAULT; 1335 return 0; 1336 } 1337 EXPORT_SYMBOL(kvm_read_guest_atomic); 1338 1339 int kvm_write_guest_page(struct kvm *kvm, gfn_t gfn, const void *data, 1340 int offset, int len) 1341 { 1342 int r; 1343 unsigned long addr; 1344 1345 addr = gfn_to_hva(kvm, gfn); 1346 if (kvm_is_error_hva(addr)) 1347 return -EFAULT; 1348 r = copy_to_user((void __user *)addr + offset, data, len); 1349 if (r) 1350 return -EFAULT; 1351 mark_page_dirty(kvm, gfn); 1352 return 0; 1353 } 1354 EXPORT_SYMBOL_GPL(kvm_write_guest_page); 1355 1356 int kvm_write_guest(struct kvm *kvm, gpa_t gpa, const void *data, 1357 unsigned long len) 1358 { 1359 gfn_t gfn = gpa >> PAGE_SHIFT; 1360 int seg; 1361 int offset = offset_in_page(gpa); 1362 int ret; 1363 1364 while ((seg = next_segment(len, offset)) != 0) { 1365 ret = kvm_write_guest_page(kvm, gfn, data, offset, seg); 1366 if (ret < 0) 1367 return ret; 1368 offset = 0; 1369 len -= seg; 1370 data += seg; 1371 ++gfn; 1372 } 1373 return 0; 1374 } 1375 1376 int kvm_gfn_to_hva_cache_init(struct kvm *kvm, struct gfn_to_hva_cache *ghc, 1377 gpa_t gpa) 1378 { 1379 struct kvm_memslots *slots = kvm_memslots(kvm); 1380 int offset = offset_in_page(gpa); 1381 gfn_t gfn = gpa >> PAGE_SHIFT; 1382 1383 ghc->gpa = gpa; 1384 ghc->generation = slots->generation; 1385 ghc->memslot = __gfn_to_memslot(slots, gfn); 1386 ghc->hva = gfn_to_hva_many(ghc->memslot, gfn, NULL); 1387 if (!kvm_is_error_hva(ghc->hva)) 1388 ghc->hva += offset; 1389 else 1390 return -EFAULT; 1391 1392 return 0; 1393 } 1394 EXPORT_SYMBOL_GPL(kvm_gfn_to_hva_cache_init); 1395 1396 int kvm_write_guest_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc, 1397 void *data, unsigned long len) 1398 { 1399 struct kvm_memslots *slots = kvm_memslots(kvm); 1400 int r; 1401 1402 if (slots->generation != ghc->generation) 1403 kvm_gfn_to_hva_cache_init(kvm, ghc, ghc->gpa); 1404 1405 if (kvm_is_error_hva(ghc->hva)) 1406 return -EFAULT; 1407 1408 r = copy_to_user((void __user *)ghc->hva, data, len); 1409 if (r) 1410 return -EFAULT; 1411 mark_page_dirty_in_slot(kvm, ghc->memslot, ghc->gpa >> PAGE_SHIFT); 1412 1413 return 0; 1414 } 1415 EXPORT_SYMBOL_GPL(kvm_write_guest_cached); 1416 1417 int kvm_clear_guest_page(struct kvm *kvm, gfn_t gfn, int offset, int len) 1418 { 1419 return kvm_write_guest_page(kvm, gfn, (const void *) empty_zero_page, 1420 offset, len); 1421 } 1422 EXPORT_SYMBOL_GPL(kvm_clear_guest_page); 1423 1424 int kvm_clear_guest(struct kvm *kvm, gpa_t gpa, unsigned long len) 1425 { 1426 gfn_t gfn = gpa >> PAGE_SHIFT; 1427 int seg; 1428 int offset = offset_in_page(gpa); 1429 int ret; 1430 1431 while ((seg = next_segment(len, offset)) != 0) { 1432 ret = kvm_clear_guest_page(kvm, gfn, offset, seg); 1433 if (ret < 0) 1434 return ret; 1435 offset = 0; 1436 len -= seg; 1437 ++gfn; 1438 } 1439 return 0; 1440 } 1441 EXPORT_SYMBOL_GPL(kvm_clear_guest); 1442 1443 void mark_page_dirty_in_slot(struct kvm *kvm, struct kvm_memory_slot *memslot, 1444 gfn_t gfn) 1445 { 1446 if (memslot && memslot->dirty_bitmap) { 1447 unsigned long rel_gfn = gfn - memslot->base_gfn; 1448 1449 
__set_bit_le(rel_gfn, memslot->dirty_bitmap); 1450 } 1451 } 1452 1453 void mark_page_dirty(struct kvm *kvm, gfn_t gfn) 1454 { 1455 struct kvm_memory_slot *memslot; 1456 1457 memslot = gfn_to_memslot(kvm, gfn); 1458 mark_page_dirty_in_slot(kvm, memslot, gfn); 1459 } 1460 1461 /* 1462 * The vCPU has executed a HLT instruction with in-kernel mode enabled. 1463 */ 1464 void kvm_vcpu_block(struct kvm_vcpu *vcpu) 1465 { 1466 DEFINE_WAIT(wait); 1467 1468 for (;;) { 1469 prepare_to_wait(&vcpu->wq, &wait, TASK_INTERRUPTIBLE); 1470 1471 if (kvm_arch_vcpu_runnable(vcpu)) { 1472 kvm_make_request(KVM_REQ_UNHALT, vcpu); 1473 break; 1474 } 1475 if (kvm_cpu_has_pending_timer(vcpu)) 1476 break; 1477 if (signal_pending(current)) 1478 break; 1479 1480 schedule(); 1481 } 1482 1483 finish_wait(&vcpu->wq, &wait); 1484 } 1485 1486 void kvm_resched(struct kvm_vcpu *vcpu) 1487 { 1488 if (!need_resched()) 1489 return; 1490 cond_resched(); 1491 } 1492 EXPORT_SYMBOL_GPL(kvm_resched); 1493 1494 void kvm_vcpu_on_spin(struct kvm_vcpu *me) 1495 { 1496 struct kvm *kvm = me->kvm; 1497 struct kvm_vcpu *vcpu; 1498 int last_boosted_vcpu = me->kvm->last_boosted_vcpu; 1499 int yielded = 0; 1500 int pass; 1501 int i; 1502 1503 /* 1504 * We boost the priority of a VCPU that is runnable but not 1505 * currently running, because it got preempted by something 1506 * else and called schedule in __vcpu_run. Hopefully that 1507 * VCPU is holding the lock that we need and will release it. 1508 * We approximate round-robin by starting at the last boosted VCPU. 1509 */ 1510 for (pass = 0; pass < 2 && !yielded; pass++) { 1511 kvm_for_each_vcpu(i, vcpu, kvm) { 1512 struct task_struct *task = NULL; 1513 struct pid *pid; 1514 if (!pass && i < last_boosted_vcpu) { 1515 i = last_boosted_vcpu; 1516 continue; 1517 } else if (pass && i > last_boosted_vcpu) 1518 break; 1519 if (vcpu == me) 1520 continue; 1521 if (waitqueue_active(&vcpu->wq)) 1522 continue; 1523 rcu_read_lock(); 1524 pid = rcu_dereference(vcpu->pid); 1525 if (pid) 1526 task = get_pid_task(vcpu->pid, PIDTYPE_PID); 1527 rcu_read_unlock(); 1528 if (!task) 1529 continue; 1530 if (task->flags & PF_VCPU) { 1531 put_task_struct(task); 1532 continue; 1533 } 1534 if (yield_to(task, 1)) { 1535 put_task_struct(task); 1536 kvm->last_boosted_vcpu = i; 1537 yielded = 1; 1538 break; 1539 } 1540 put_task_struct(task); 1541 } 1542 } 1543 } 1544 EXPORT_SYMBOL_GPL(kvm_vcpu_on_spin); 1545 1546 static int kvm_vcpu_fault(struct vm_area_struct *vma, struct vm_fault *vmf) 1547 { 1548 struct kvm_vcpu *vcpu = vma->vm_file->private_data; 1549 struct page *page; 1550 1551 if (vmf->pgoff == 0) 1552 page = virt_to_page(vcpu->run); 1553 #ifdef CONFIG_X86 1554 else if (vmf->pgoff == KVM_PIO_PAGE_OFFSET) 1555 page = virt_to_page(vcpu->arch.pio_data); 1556 #endif 1557 #ifdef KVM_COALESCED_MMIO_PAGE_OFFSET 1558 else if (vmf->pgoff == KVM_COALESCED_MMIO_PAGE_OFFSET) 1559 page = virt_to_page(vcpu->kvm->coalesced_mmio_ring); 1560 #endif 1561 else 1562 return VM_FAULT_SIGBUS; 1563 get_page(page); 1564 vmf->page = page; 1565 return 0; 1566 } 1567 1568 static const struct vm_operations_struct kvm_vcpu_vm_ops = { 1569 .fault = kvm_vcpu_fault, 1570 }; 1571 1572 static int kvm_vcpu_mmap(struct file *file, struct vm_area_struct *vma) 1573 { 1574 vma->vm_ops = &kvm_vcpu_vm_ops; 1575 return 0; 1576 } 1577 1578 static int kvm_vcpu_release(struct inode *inode, struct file *filp) 1579 { 1580 struct kvm_vcpu *vcpu = filp->private_data; 1581 1582 kvm_put_kvm(vcpu->kvm); 1583 return 0; 1584 } 1585 1586 static struct file_operations 
kvm_vcpu_fops = { 1587 .release = kvm_vcpu_release, 1588 .unlocked_ioctl = kvm_vcpu_ioctl, 1589 .compat_ioctl = kvm_vcpu_ioctl, 1590 .mmap = kvm_vcpu_mmap, 1591 .llseek = noop_llseek, 1592 }; 1593 1594 /* 1595 * Allocates an inode for the vcpu. 1596 */ 1597 static int create_vcpu_fd(struct kvm_vcpu *vcpu) 1598 { 1599 return anon_inode_getfd("kvm-vcpu", &kvm_vcpu_fops, vcpu, O_RDWR); 1600 } 1601 1602 /* 1603 * Creates some virtual cpus. Good luck creating more than one. 1604 */ 1605 static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, u32 id) 1606 { 1607 int r; 1608 struct kvm_vcpu *vcpu, *v; 1609 1610 vcpu = kvm_arch_vcpu_create(kvm, id); 1611 if (IS_ERR(vcpu)) 1612 return PTR_ERR(vcpu); 1613 1614 preempt_notifier_init(&vcpu->preempt_notifier, &kvm_preempt_ops); 1615 1616 r = kvm_arch_vcpu_setup(vcpu); 1617 if (r) 1618 return r; 1619 1620 mutex_lock(&kvm->lock); 1621 if (atomic_read(&kvm->online_vcpus) == KVM_MAX_VCPUS) { 1622 r = -EINVAL; 1623 goto vcpu_destroy; 1624 } 1625 1626 kvm_for_each_vcpu(r, v, kvm) 1627 if (v->vcpu_id == id) { 1628 r = -EEXIST; 1629 goto vcpu_destroy; 1630 } 1631 1632 BUG_ON(kvm->vcpus[atomic_read(&kvm->online_vcpus)]); 1633 1634 /* Now it's all set up, let userspace reach it */ 1635 kvm_get_kvm(kvm); 1636 r = create_vcpu_fd(vcpu); 1637 if (r < 0) { 1638 kvm_put_kvm(kvm); 1639 goto vcpu_destroy; 1640 } 1641 1642 kvm->vcpus[atomic_read(&kvm->online_vcpus)] = vcpu; 1643 smp_wmb(); 1644 atomic_inc(&kvm->online_vcpus); 1645 1646 #ifdef CONFIG_KVM_APIC_ARCHITECTURE 1647 if (kvm->bsp_vcpu_id == id) 1648 kvm->bsp_vcpu = vcpu; 1649 #endif 1650 mutex_unlock(&kvm->lock); 1651 return r; 1652 1653 vcpu_destroy: 1654 mutex_unlock(&kvm->lock); 1655 kvm_arch_vcpu_destroy(vcpu); 1656 return r; 1657 } 1658 1659 static int kvm_vcpu_ioctl_set_sigmask(struct kvm_vcpu *vcpu, sigset_t *sigset) 1660 { 1661 if (sigset) { 1662 sigdelsetmask(sigset, sigmask(SIGKILL)|sigmask(SIGSTOP)); 1663 vcpu->sigset_active = 1; 1664 vcpu->sigset = *sigset; 1665 } else 1666 vcpu->sigset_active = 0; 1667 return 0; 1668 } 1669 1670 static long kvm_vcpu_ioctl(struct file *filp, 1671 unsigned int ioctl, unsigned long arg) 1672 { 1673 struct kvm_vcpu *vcpu = filp->private_data; 1674 void __user *argp = (void __user *)arg; 1675 int r; 1676 struct kvm_fpu *fpu = NULL; 1677 struct kvm_sregs *kvm_sregs = NULL; 1678 1679 if (vcpu->kvm->mm != current->mm) 1680 return -EIO; 1681 1682 #if defined(CONFIG_S390) || defined(CONFIG_PPC) 1683 /* 1684 * Special cases: vcpu ioctls that are asynchronous to vcpu execution, 1685 * so vcpu_load() would break it. 
1686 */ 1687 if (ioctl == KVM_S390_INTERRUPT || ioctl == KVM_INTERRUPT) 1688 return kvm_arch_vcpu_ioctl(filp, ioctl, arg); 1689 #endif 1690 1691 1692 vcpu_load(vcpu); 1693 switch (ioctl) { 1694 case KVM_RUN: 1695 r = -EINVAL; 1696 if (arg) 1697 goto out; 1698 r = kvm_arch_vcpu_ioctl_run(vcpu, vcpu->run); 1699 trace_kvm_userspace_exit(vcpu->run->exit_reason, r); 1700 break; 1701 case KVM_GET_REGS: { 1702 struct kvm_regs *kvm_regs; 1703 1704 r = -ENOMEM; 1705 kvm_regs = kzalloc(sizeof(struct kvm_regs), GFP_KERNEL); 1706 if (!kvm_regs) 1707 goto out; 1708 r = kvm_arch_vcpu_ioctl_get_regs(vcpu, kvm_regs); 1709 if (r) 1710 goto out_free1; 1711 r = -EFAULT; 1712 if (copy_to_user(argp, kvm_regs, sizeof(struct kvm_regs))) 1713 goto out_free1; 1714 r = 0; 1715 out_free1: 1716 kfree(kvm_regs); 1717 break; 1718 } 1719 case KVM_SET_REGS: { 1720 struct kvm_regs *kvm_regs; 1721 1722 r = -ENOMEM; 1723 kvm_regs = kzalloc(sizeof(struct kvm_regs), GFP_KERNEL); 1724 if (!kvm_regs) 1725 goto out; 1726 r = -EFAULT; 1727 if (copy_from_user(kvm_regs, argp, sizeof(struct kvm_regs))) 1728 goto out_free2; 1729 r = kvm_arch_vcpu_ioctl_set_regs(vcpu, kvm_regs); 1730 if (r) 1731 goto out_free2; 1732 r = 0; 1733 out_free2: 1734 kfree(kvm_regs); 1735 break; 1736 } 1737 case KVM_GET_SREGS: { 1738 kvm_sregs = kzalloc(sizeof(struct kvm_sregs), GFP_KERNEL); 1739 r = -ENOMEM; 1740 if (!kvm_sregs) 1741 goto out; 1742 r = kvm_arch_vcpu_ioctl_get_sregs(vcpu, kvm_sregs); 1743 if (r) 1744 goto out; 1745 r = -EFAULT; 1746 if (copy_to_user(argp, kvm_sregs, sizeof(struct kvm_sregs))) 1747 goto out; 1748 r = 0; 1749 break; 1750 } 1751 case KVM_SET_SREGS: { 1752 kvm_sregs = kmalloc(sizeof(struct kvm_sregs), GFP_KERNEL); 1753 r = -ENOMEM; 1754 if (!kvm_sregs) 1755 goto out; 1756 r = -EFAULT; 1757 if (copy_from_user(kvm_sregs, argp, sizeof(struct kvm_sregs))) 1758 goto out; 1759 r = kvm_arch_vcpu_ioctl_set_sregs(vcpu, kvm_sregs); 1760 if (r) 1761 goto out; 1762 r = 0; 1763 break; 1764 } 1765 case KVM_GET_MP_STATE: { 1766 struct kvm_mp_state mp_state; 1767 1768 r = kvm_arch_vcpu_ioctl_get_mpstate(vcpu, &mp_state); 1769 if (r) 1770 goto out; 1771 r = -EFAULT; 1772 if (copy_to_user(argp, &mp_state, sizeof mp_state)) 1773 goto out; 1774 r = 0; 1775 break; 1776 } 1777 case KVM_SET_MP_STATE: { 1778 struct kvm_mp_state mp_state; 1779 1780 r = -EFAULT; 1781 if (copy_from_user(&mp_state, argp, sizeof mp_state)) 1782 goto out; 1783 r = kvm_arch_vcpu_ioctl_set_mpstate(vcpu, &mp_state); 1784 if (r) 1785 goto out; 1786 r = 0; 1787 break; 1788 } 1789 case KVM_TRANSLATE: { 1790 struct kvm_translation tr; 1791 1792 r = -EFAULT; 1793 if (copy_from_user(&tr, argp, sizeof tr)) 1794 goto out; 1795 r = kvm_arch_vcpu_ioctl_translate(vcpu, &tr); 1796 if (r) 1797 goto out; 1798 r = -EFAULT; 1799 if (copy_to_user(argp, &tr, sizeof tr)) 1800 goto out; 1801 r = 0; 1802 break; 1803 } 1804 case KVM_SET_GUEST_DEBUG: { 1805 struct kvm_guest_debug dbg; 1806 1807 r = -EFAULT; 1808 if (copy_from_user(&dbg, argp, sizeof dbg)) 1809 goto out; 1810 r = kvm_arch_vcpu_ioctl_set_guest_debug(vcpu, &dbg); 1811 if (r) 1812 goto out; 1813 r = 0; 1814 break; 1815 } 1816 case KVM_SET_SIGNAL_MASK: { 1817 struct kvm_signal_mask __user *sigmask_arg = argp; 1818 struct kvm_signal_mask kvm_sigmask; 1819 sigset_t sigset, *p; 1820 1821 p = NULL; 1822 if (argp) { 1823 r = -EFAULT; 1824 if (copy_from_user(&kvm_sigmask, argp, 1825 sizeof kvm_sigmask)) 1826 goto out; 1827 r = -EINVAL; 1828 if (kvm_sigmask.len != sizeof sigset) 1829 goto out; 1830 r = -EFAULT; 1831 if (copy_from_user(&sigset, 
sigmask_arg->sigset, 1832 sizeof sigset)) 1833 goto out; 1834 p = &sigset; 1835 } 1836 r = kvm_vcpu_ioctl_set_sigmask(vcpu, p); 1837 break; 1838 } 1839 case KVM_GET_FPU: { 1840 fpu = kzalloc(sizeof(struct kvm_fpu), GFP_KERNEL); 1841 r = -ENOMEM; 1842 if (!fpu) 1843 goto out; 1844 r = kvm_arch_vcpu_ioctl_get_fpu(vcpu, fpu); 1845 if (r) 1846 goto out; 1847 r = -EFAULT; 1848 if (copy_to_user(argp, fpu, sizeof(struct kvm_fpu))) 1849 goto out; 1850 r = 0; 1851 break; 1852 } 1853 case KVM_SET_FPU: { 1854 fpu = kmalloc(sizeof(struct kvm_fpu), GFP_KERNEL); 1855 r = -ENOMEM; 1856 if (!fpu) 1857 goto out; 1858 r = -EFAULT; 1859 if (copy_from_user(fpu, argp, sizeof(struct kvm_fpu))) 1860 goto out; 1861 r = kvm_arch_vcpu_ioctl_set_fpu(vcpu, fpu); 1862 if (r) 1863 goto out; 1864 r = 0; 1865 break; 1866 } 1867 default: 1868 r = kvm_arch_vcpu_ioctl(filp, ioctl, arg); 1869 } 1870 out: 1871 vcpu_put(vcpu); 1872 kfree(fpu); 1873 kfree(kvm_sregs); 1874 return r; 1875 } 1876 1877 static long kvm_vm_ioctl(struct file *filp, 1878 unsigned int ioctl, unsigned long arg) 1879 { 1880 struct kvm *kvm = filp->private_data; 1881 void __user *argp = (void __user *)arg; 1882 int r; 1883 1884 if (kvm->mm != current->mm) 1885 return -EIO; 1886 switch (ioctl) { 1887 case KVM_CREATE_VCPU: 1888 r = kvm_vm_ioctl_create_vcpu(kvm, arg); 1889 if (r < 0) 1890 goto out; 1891 break; 1892 case KVM_SET_USER_MEMORY_REGION: { 1893 struct kvm_userspace_memory_region kvm_userspace_mem; 1894 1895 r = -EFAULT; 1896 if (copy_from_user(&kvm_userspace_mem, argp, 1897 sizeof kvm_userspace_mem)) 1898 goto out; 1899 1900 r = kvm_vm_ioctl_set_memory_region(kvm, &kvm_userspace_mem, 1); 1901 if (r) 1902 goto out; 1903 break; 1904 } 1905 case KVM_GET_DIRTY_LOG: { 1906 struct kvm_dirty_log log; 1907 1908 r = -EFAULT; 1909 if (copy_from_user(&log, argp, sizeof log)) 1910 goto out; 1911 r = kvm_vm_ioctl_get_dirty_log(kvm, &log); 1912 if (r) 1913 goto out; 1914 break; 1915 } 1916 #ifdef KVM_COALESCED_MMIO_PAGE_OFFSET 1917 case KVM_REGISTER_COALESCED_MMIO: { 1918 struct kvm_coalesced_mmio_zone zone; 1919 r = -EFAULT; 1920 if (copy_from_user(&zone, argp, sizeof zone)) 1921 goto out; 1922 r = kvm_vm_ioctl_register_coalesced_mmio(kvm, &zone); 1923 if (r) 1924 goto out; 1925 r = 0; 1926 break; 1927 } 1928 case KVM_UNREGISTER_COALESCED_MMIO: { 1929 struct kvm_coalesced_mmio_zone zone; 1930 r = -EFAULT; 1931 if (copy_from_user(&zone, argp, sizeof zone)) 1932 goto out; 1933 r = kvm_vm_ioctl_unregister_coalesced_mmio(kvm, &zone); 1934 if (r) 1935 goto out; 1936 r = 0; 1937 break; 1938 } 1939 #endif 1940 case KVM_IRQFD: { 1941 struct kvm_irqfd data; 1942 1943 r = -EFAULT; 1944 if (copy_from_user(&data, argp, sizeof data)) 1945 goto out; 1946 r = kvm_irqfd(kvm, data.fd, data.gsi, data.flags); 1947 break; 1948 } 1949 case KVM_IOEVENTFD: { 1950 struct kvm_ioeventfd data; 1951 1952 r = -EFAULT; 1953 if (copy_from_user(&data, argp, sizeof data)) 1954 goto out; 1955 r = kvm_ioeventfd(kvm, &data); 1956 break; 1957 } 1958 #ifdef CONFIG_KVM_APIC_ARCHITECTURE 1959 case KVM_SET_BOOT_CPU_ID: 1960 r = 0; 1961 mutex_lock(&kvm->lock); 1962 if (atomic_read(&kvm->online_vcpus) != 0) 1963 r = -EBUSY; 1964 else 1965 kvm->bsp_vcpu_id = arg; 1966 mutex_unlock(&kvm->lock); 1967 break; 1968 #endif 1969 default: 1970 r = kvm_arch_vm_ioctl(filp, ioctl, arg); 1971 if (r == -ENOTTY) 1972 r = kvm_vm_ioctl_assigned_device(kvm, ioctl, arg); 1973 } 1974 out: 1975 return r; 1976 } 1977 1978 #ifdef CONFIG_COMPAT 1979 struct compat_kvm_dirty_log { 1980 __u32 slot; 1981 __u32 padding1; 1982 union 
{ 1983 compat_uptr_t dirty_bitmap; /* one bit per page */ 1984 __u64 padding2; 1985 }; 1986 }; 1987 1988 static long kvm_vm_compat_ioctl(struct file *filp, 1989 unsigned int ioctl, unsigned long arg) 1990 { 1991 struct kvm *kvm = filp->private_data; 1992 int r; 1993 1994 if (kvm->mm != current->mm) 1995 return -EIO; 1996 switch (ioctl) { 1997 case KVM_GET_DIRTY_LOG: { 1998 struct compat_kvm_dirty_log compat_log; 1999 struct kvm_dirty_log log; 2000 2001 r = -EFAULT; 2002 if (copy_from_user(&compat_log, (void __user *)arg, 2003 sizeof(compat_log))) 2004 goto out; 2005 log.slot = compat_log.slot; 2006 log.padding1 = compat_log.padding1; 2007 log.padding2 = compat_log.padding2; 2008 log.dirty_bitmap = compat_ptr(compat_log.dirty_bitmap); 2009 2010 r = kvm_vm_ioctl_get_dirty_log(kvm, &log); 2011 if (r) 2012 goto out; 2013 break; 2014 } 2015 default: 2016 r = kvm_vm_ioctl(filp, ioctl, arg); 2017 } 2018 2019 out: 2020 return r; 2021 } 2022 #endif 2023 2024 static int kvm_vm_fault(struct vm_area_struct *vma, struct vm_fault *vmf) 2025 { 2026 struct page *page[1]; 2027 unsigned long addr; 2028 int npages; 2029 gfn_t gfn = vmf->pgoff; 2030 struct kvm *kvm = vma->vm_file->private_data; 2031 2032 addr = gfn_to_hva(kvm, gfn); 2033 if (kvm_is_error_hva(addr)) 2034 return VM_FAULT_SIGBUS; 2035 2036 npages = get_user_pages(current, current->mm, addr, 1, 1, 0, page, 2037 NULL); 2038 if (unlikely(npages != 1)) 2039 return VM_FAULT_SIGBUS; 2040 2041 vmf->page = page[0]; 2042 return 0; 2043 } 2044 2045 static const struct vm_operations_struct kvm_vm_vm_ops = { 2046 .fault = kvm_vm_fault, 2047 }; 2048 2049 static int kvm_vm_mmap(struct file *file, struct vm_area_struct *vma) 2050 { 2051 vma->vm_ops = &kvm_vm_vm_ops; 2052 return 0; 2053 } 2054 2055 static struct file_operations kvm_vm_fops = { 2056 .release = kvm_vm_release, 2057 .unlocked_ioctl = kvm_vm_ioctl, 2058 #ifdef CONFIG_COMPAT 2059 .compat_ioctl = kvm_vm_compat_ioctl, 2060 #endif 2061 .mmap = kvm_vm_mmap, 2062 .llseek = noop_llseek, 2063 }; 2064 2065 static int kvm_dev_ioctl_create_vm(void) 2066 { 2067 int r; 2068 struct kvm *kvm; 2069 2070 kvm = kvm_create_vm(); 2071 if (IS_ERR(kvm)) 2072 return PTR_ERR(kvm); 2073 #ifdef KVM_COALESCED_MMIO_PAGE_OFFSET 2074 r = kvm_coalesced_mmio_init(kvm); 2075 if (r < 0) { 2076 kvm_put_kvm(kvm); 2077 return r; 2078 } 2079 #endif 2080 r = anon_inode_getfd("kvm-vm", &kvm_vm_fops, kvm, O_RDWR); 2081 if (r < 0) 2082 kvm_put_kvm(kvm); 2083 2084 return r; 2085 } 2086 2087 static long kvm_dev_ioctl_check_extension_generic(long arg) 2088 { 2089 switch (arg) { 2090 case KVM_CAP_USER_MEMORY: 2091 case KVM_CAP_DESTROY_MEMORY_REGION_WORKS: 2092 case KVM_CAP_JOIN_MEMORY_REGIONS_WORKS: 2093 #ifdef CONFIG_KVM_APIC_ARCHITECTURE 2094 case KVM_CAP_SET_BOOT_CPU_ID: 2095 #endif 2096 case KVM_CAP_INTERNAL_ERROR_DATA: 2097 return 1; 2098 #ifdef CONFIG_HAVE_KVM_IRQCHIP 2099 case KVM_CAP_IRQ_ROUTING: 2100 return KVM_MAX_IRQ_ROUTES; 2101 #endif 2102 default: 2103 break; 2104 } 2105 return kvm_dev_ioctl_check_extension(arg); 2106 } 2107 2108 static long kvm_dev_ioctl(struct file *filp, 2109 unsigned int ioctl, unsigned long arg) 2110 { 2111 long r = -EINVAL; 2112 2113 switch (ioctl) { 2114 case KVM_GET_API_VERSION: 2115 r = -EINVAL; 2116 if (arg) 2117 goto out; 2118 r = KVM_API_VERSION; 2119 break; 2120 case KVM_CREATE_VM: 2121 r = -EINVAL; 2122 if (arg) 2123 goto out; 2124 r = kvm_dev_ioctl_create_vm(); 2125 break; 2126 case KVM_CHECK_EXTENSION: 2127 r = kvm_dev_ioctl_check_extension_generic(arg); 2128 break; 2129 case 
KVM_GET_VCPU_MMAP_SIZE: 2130 r = -EINVAL; 2131 if (arg) 2132 goto out; 2133 r = PAGE_SIZE; /* struct kvm_run */ 2134 #ifdef CONFIG_X86 2135 r += PAGE_SIZE; /* pio data page */ 2136 #endif 2137 #ifdef KVM_COALESCED_MMIO_PAGE_OFFSET 2138 r += PAGE_SIZE; /* coalesced mmio ring page */ 2139 #endif 2140 break; 2141 case KVM_TRACE_ENABLE: 2142 case KVM_TRACE_PAUSE: 2143 case KVM_TRACE_DISABLE: 2144 r = -EOPNOTSUPP; 2145 break; 2146 default: 2147 return kvm_arch_dev_ioctl(filp, ioctl, arg); 2148 } 2149 out: 2150 return r; 2151 } 2152 2153 static struct file_operations kvm_chardev_ops = { 2154 .unlocked_ioctl = kvm_dev_ioctl, 2155 .compat_ioctl = kvm_dev_ioctl, 2156 .llseek = noop_llseek, 2157 }; 2158 2159 static struct miscdevice kvm_dev = { 2160 KVM_MINOR, 2161 "kvm", 2162 &kvm_chardev_ops, 2163 }; 2164 2165 static void hardware_enable_nolock(void *junk) 2166 { 2167 int cpu = raw_smp_processor_id(); 2168 int r; 2169 2170 if (cpumask_test_cpu(cpu, cpus_hardware_enabled)) 2171 return; 2172 2173 cpumask_set_cpu(cpu, cpus_hardware_enabled); 2174 2175 r = kvm_arch_hardware_enable(NULL); 2176 2177 if (r) { 2178 cpumask_clear_cpu(cpu, cpus_hardware_enabled); 2179 atomic_inc(&hardware_enable_failed); 2180 printk(KERN_INFO "kvm: enabling virtualization on " 2181 "CPU%d failed\n", cpu); 2182 } 2183 } 2184 2185 static void hardware_enable(void *junk) 2186 { 2187 raw_spin_lock(&kvm_lock); 2188 hardware_enable_nolock(junk); 2189 raw_spin_unlock(&kvm_lock); 2190 } 2191 2192 static void hardware_disable_nolock(void *junk) 2193 { 2194 int cpu = raw_smp_processor_id(); 2195 2196 if (!cpumask_test_cpu(cpu, cpus_hardware_enabled)) 2197 return; 2198 cpumask_clear_cpu(cpu, cpus_hardware_enabled); 2199 kvm_arch_hardware_disable(NULL); 2200 } 2201 2202 static void hardware_disable(void *junk) 2203 { 2204 raw_spin_lock(&kvm_lock); 2205 hardware_disable_nolock(junk); 2206 raw_spin_unlock(&kvm_lock); 2207 } 2208 2209 static void hardware_disable_all_nolock(void) 2210 { 2211 BUG_ON(!kvm_usage_count); 2212 2213 kvm_usage_count--; 2214 if (!kvm_usage_count) 2215 on_each_cpu(hardware_disable_nolock, NULL, 1); 2216 } 2217 2218 static void hardware_disable_all(void) 2219 { 2220 raw_spin_lock(&kvm_lock); 2221 hardware_disable_all_nolock(); 2222 raw_spin_unlock(&kvm_lock); 2223 } 2224 2225 static int hardware_enable_all(void) 2226 { 2227 int r = 0; 2228 2229 raw_spin_lock(&kvm_lock); 2230 2231 kvm_usage_count++; 2232 if (kvm_usage_count == 1) { 2233 atomic_set(&hardware_enable_failed, 0); 2234 on_each_cpu(hardware_enable_nolock, NULL, 1); 2235 2236 if (atomic_read(&hardware_enable_failed)) { 2237 hardware_disable_all_nolock(); 2238 r = -EBUSY; 2239 } 2240 } 2241 2242 raw_spin_unlock(&kvm_lock); 2243 2244 return r; 2245 } 2246 2247 static int kvm_cpu_hotplug(struct notifier_block *notifier, unsigned long val, 2248 void *v) 2249 { 2250 int cpu = (long)v; 2251 2252 if (!kvm_usage_count) 2253 return NOTIFY_OK; 2254 2255 val &= ~CPU_TASKS_FROZEN; 2256 switch (val) { 2257 case CPU_DYING: 2258 printk(KERN_INFO "kvm: disabling virtualization on CPU%d\n", 2259 cpu); 2260 hardware_disable(NULL); 2261 break; 2262 case CPU_STARTING: 2263 printk(KERN_INFO "kvm: enabling virtualization on CPU%d\n", 2264 cpu); 2265 hardware_enable(NULL); 2266 break; 2267 } 2268 return NOTIFY_OK; 2269 } 2270 2271 2272 asmlinkage void kvm_spurious_fault(void) 2273 { 2274 /* Fault while not rebooting. We want the trace. 
asmlinkage void kvm_spurious_fault(void)
{
	/* Fault while not rebooting.  We want the trace. */
	BUG();
}
EXPORT_SYMBOL_GPL(kvm_spurious_fault);

static int kvm_reboot(struct notifier_block *notifier, unsigned long val,
		      void *v)
{
	/*
	 * Some (well, at least mine) BIOSes hang on reboot if
	 * in vmx root mode.
	 *
	 * And Intel TXT requires VMX to be off on all CPUs when the
	 * system shuts down.
	 */
	printk(KERN_INFO "kvm: exiting hardware virtualization\n");
	kvm_rebooting = true;
	on_each_cpu(hardware_disable_nolock, NULL, 1);
	return NOTIFY_OK;
}

static struct notifier_block kvm_reboot_notifier = {
	.notifier_call = kvm_reboot,
	.priority = 0,
};

static void kvm_io_bus_destroy(struct kvm_io_bus *bus)
{
	int i;

	for (i = 0; i < bus->dev_count; i++) {
		struct kvm_io_device *pos = bus->devs[i];

		kvm_iodevice_destructor(pos);
	}
	kfree(bus);
}

/* kvm_io_bus_write - called under kvm->slots_lock */
int kvm_io_bus_write(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr,
		     int len, const void *val)
{
	int i;
	struct kvm_io_bus *bus;

	bus = srcu_dereference(kvm->buses[bus_idx], &kvm->srcu);
	for (i = 0; i < bus->dev_count; i++)
		if (!kvm_iodevice_write(bus->devs[i], addr, len, val))
			return 0;
	return -EOPNOTSUPP;
}

/* kvm_io_bus_read - called under kvm->slots_lock */
int kvm_io_bus_read(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr,
		    int len, void *val)
{
	int i;
	struct kvm_io_bus *bus;

	bus = srcu_dereference(kvm->buses[bus_idx], &kvm->srcu);
	for (i = 0; i < bus->dev_count; i++)
		if (!kvm_iodevice_read(bus->devs[i], addr, len, val))
			return 0;
	return -EOPNOTSUPP;
}

/* Caller must hold slots_lock. */
int kvm_io_bus_register_dev(struct kvm *kvm, enum kvm_bus bus_idx,
			    struct kvm_io_device *dev)
{
	struct kvm_io_bus *new_bus, *bus;

	bus = kvm->buses[bus_idx];
	if (bus->dev_count > NR_IOBUS_DEVS - 1)
		return -ENOSPC;

	new_bus = kzalloc(sizeof(struct kvm_io_bus), GFP_KERNEL);
	if (!new_bus)
		return -ENOMEM;
	memcpy(new_bus, bus, sizeof(struct kvm_io_bus));
	new_bus->devs[new_bus->dev_count++] = dev;
	rcu_assign_pointer(kvm->buses[bus_idx], new_bus);
	synchronize_srcu_expedited(&kvm->srcu);
	kfree(bus);

	return 0;
}

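/*
 * Editor's sketch (not part of the original source): registering a device
 * on one of the buses.  "example_dev", "example_ops" and "example_register"
 * are hypothetical; the pattern follows iodev.h and the coalesced MMIO
 * code, and the caller must take kvm->slots_lock as required above.
 *
 *	struct example_dev {
 *		struct kvm_io_device dev;	// must be embedded
 *		// ... device state ...
 *	};
 *
 *	static int example_write(struct kvm_io_device *this, gpa_t addr,
 *				 int len, const void *val)
 *	{
 *		// Claim the access: returning 0 stops the bus walk in
 *		// kvm_io_bus_write() above.
 *		return 0;
 *	}
 *
 *	static const struct kvm_io_device_ops example_ops = {
 *		.write = example_write,
 *	};
 *
 *	static int example_register(struct kvm *kvm, struct example_dev *d)
 *	{
 *		int ret;
 *
 *		kvm_iodevice_init(&d->dev, &example_ops);
 *		mutex_lock(&kvm->slots_lock);
 *		ret = kvm_io_bus_register_dev(kvm, KVM_MMIO_BUS, &d->dev);
 *		mutex_unlock(&kvm->slots_lock);
 *		return ret;	// -ENOSPC once NR_IOBUS_DEVS is reached
 *	}
 */
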
/* Caller must hold slots_lock. */
int kvm_io_bus_unregister_dev(struct kvm *kvm, enum kvm_bus bus_idx,
			      struct kvm_io_device *dev)
{
	int i, r;
	struct kvm_io_bus *new_bus, *bus;

	new_bus = kzalloc(sizeof(struct kvm_io_bus), GFP_KERNEL);
	if (!new_bus)
		return -ENOMEM;

	bus = kvm->buses[bus_idx];
	memcpy(new_bus, bus, sizeof(struct kvm_io_bus));

	r = -ENOENT;
	for (i = 0; i < new_bus->dev_count; i++)
		if (new_bus->devs[i] == dev) {
			r = 0;
			new_bus->devs[i] = new_bus->devs[--new_bus->dev_count];
			break;
		}

	if (r) {
		kfree(new_bus);
		return r;
	}

	rcu_assign_pointer(kvm->buses[bus_idx], new_bus);
	synchronize_srcu_expedited(&kvm->srcu);
	kfree(bus);
	return r;
}

static struct notifier_block kvm_cpu_notifier = {
	.notifier_call = kvm_cpu_hotplug,
};

static int vm_stat_get(void *_offset, u64 *val)
{
	unsigned offset = (long)_offset;
	struct kvm *kvm;

	*val = 0;
	raw_spin_lock(&kvm_lock);
	list_for_each_entry(kvm, &vm_list, vm_list)
		*val += *(u32 *)((void *)kvm + offset);
	raw_spin_unlock(&kvm_lock);
	return 0;
}

DEFINE_SIMPLE_ATTRIBUTE(vm_stat_fops, vm_stat_get, NULL, "%llu\n");

static int vcpu_stat_get(void *_offset, u64 *val)
{
	unsigned offset = (long)_offset;
	struct kvm *kvm;
	struct kvm_vcpu *vcpu;
	int i;

	*val = 0;
	raw_spin_lock(&kvm_lock);
	list_for_each_entry(kvm, &vm_list, vm_list)
		kvm_for_each_vcpu(i, vcpu, kvm)
			*val += *(u32 *)((void *)vcpu + offset);

	raw_spin_unlock(&kvm_lock);
	return 0;
}

DEFINE_SIMPLE_ATTRIBUTE(vcpu_stat_fops, vcpu_stat_get, NULL, "%llu\n");

static const struct file_operations *stat_fops[] = {
	[KVM_STAT_VCPU] = &vcpu_stat_fops,
	[KVM_STAT_VM]   = &vm_stat_fops,
};

static void kvm_init_debug(void)
{
	struct kvm_stats_debugfs_item *p;

	kvm_debugfs_dir = debugfs_create_dir("kvm", NULL);
	for (p = debugfs_entries; p->name; ++p)
		p->dentry = debugfs_create_file(p->name, 0444, kvm_debugfs_dir,
						(void *)(long)p->offset,
						stat_fops[p->kind]);
}

static void kvm_exit_debug(void)
{
	struct kvm_stats_debugfs_item *p;

	for (p = debugfs_entries; p->name; ++p)
		debugfs_remove(p->dentry);
	debugfs_remove(kvm_debugfs_dir);
}

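/*
 * Editor's note (not part of the original source): debugfs_entries is
 * provided by the arch code (e.g. arch/x86/kvm/x86.c).  Each entry stores
 * an offset into struct kvm or struct kvm_vcpu, which is what the pointer
 * arithmetic in vm_stat_get()/vcpu_stat_get() above dereferences.  A
 * representative, purely illustrative shape of that table:
 *
 *	#define VM_STAT(x)   offsetof(struct kvm, stat.x), KVM_STAT_VM
 *	#define VCPU_STAT(x) offsetof(struct kvm_vcpu, stat.x), KVM_STAT_VCPU
 *
 *	struct kvm_stats_debugfs_item debugfs_entries[] = {
 *		{ "remote_tlb_flush", VM_STAT(remote_tlb_flush) },
 *		{ "halt_exits",       VCPU_STAT(halt_exits) },
 *		{ NULL }
 *	};
 *
 * Each name then appears as a read-only file under /sys/kernel/debug/kvm/,
 * summed over all VMs (or over all vcpus of all VMs).
 */
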
static int kvm_suspend(void)
{
	if (kvm_usage_count)
		hardware_disable_nolock(NULL);
	return 0;
}

static void kvm_resume(void)
{
	if (kvm_usage_count) {
		WARN_ON(raw_spin_is_locked(&kvm_lock));
		hardware_enable_nolock(NULL);
	}
}

static struct syscore_ops kvm_syscore_ops = {
	.suspend = kvm_suspend,
	.resume = kvm_resume,
};

struct page *bad_page;
pfn_t bad_pfn;

static inline
struct kvm_vcpu *preempt_notifier_to_vcpu(struct preempt_notifier *pn)
{
	return container_of(pn, struct kvm_vcpu, preempt_notifier);
}

static void kvm_sched_in(struct preempt_notifier *pn, int cpu)
{
	struct kvm_vcpu *vcpu = preempt_notifier_to_vcpu(pn);

	kvm_arch_vcpu_load(vcpu, cpu);
}

static void kvm_sched_out(struct preempt_notifier *pn,
			  struct task_struct *next)
{
	struct kvm_vcpu *vcpu = preempt_notifier_to_vcpu(pn);

	kvm_arch_vcpu_put(vcpu);
}

int kvm_init(void *opaque, unsigned vcpu_size, unsigned vcpu_align,
	     struct module *module)
{
	int r;
	int cpu;

	r = kvm_arch_init(opaque);
	if (r)
		goto out_fail;

	bad_page = alloc_page(GFP_KERNEL | __GFP_ZERO);

	if (bad_page == NULL) {
		r = -ENOMEM;
		goto out;
	}

	bad_pfn = page_to_pfn(bad_page);

	hwpoison_page = alloc_page(GFP_KERNEL | __GFP_ZERO);

	if (hwpoison_page == NULL) {
		r = -ENOMEM;
		goto out_free_0;
	}

	hwpoison_pfn = page_to_pfn(hwpoison_page);

	fault_page = alloc_page(GFP_KERNEL | __GFP_ZERO);

	if (fault_page == NULL) {
		r = -ENOMEM;
		goto out_free_0;
	}

	fault_pfn = page_to_pfn(fault_page);

	if (!zalloc_cpumask_var(&cpus_hardware_enabled, GFP_KERNEL)) {
		r = -ENOMEM;
		goto out_free_0;
	}

	r = kvm_arch_hardware_setup();
	if (r < 0)
		goto out_free_0a;

	for_each_online_cpu(cpu) {
		smp_call_function_single(cpu,
				kvm_arch_check_processor_compat,
				&r, 1);
		if (r < 0)
			goto out_free_1;
	}

	r = register_cpu_notifier(&kvm_cpu_notifier);
	if (r)
		goto out_free_2;
	register_reboot_notifier(&kvm_reboot_notifier);

	/* A kmem cache lets us meet the alignment requirements of fx_save. */
	if (!vcpu_align)
		vcpu_align = __alignof__(struct kvm_vcpu);
	kvm_vcpu_cache = kmem_cache_create("kvm_vcpu", vcpu_size, vcpu_align,
					   0, NULL);
	if (!kvm_vcpu_cache) {
		r = -ENOMEM;
		goto out_free_3;
	}

	r = kvm_async_pf_init();
	if (r)
		goto out_free;

	kvm_chardev_ops.owner = module;
	kvm_vm_fops.owner = module;
	kvm_vcpu_fops.owner = module;

	r = misc_register(&kvm_dev);
	if (r) {
		printk(KERN_ERR "kvm: misc device register failed\n");
		goto out_unreg;
	}

	register_syscore_ops(&kvm_syscore_ops);

	kvm_preempt_ops.sched_in = kvm_sched_in;
	kvm_preempt_ops.sched_out = kvm_sched_out;

	kvm_init_debug();

	return 0;

out_unreg:
	kvm_async_pf_deinit();
out_free:
	kmem_cache_destroy(kvm_vcpu_cache);
out_free_3:
	unregister_reboot_notifier(&kvm_reboot_notifier);
	unregister_cpu_notifier(&kvm_cpu_notifier);
out_free_2:
out_free_1:
	kvm_arch_hardware_unsetup();
out_free_0a:
	free_cpumask_var(cpus_hardware_enabled);
out_free_0:
	if (fault_page)
		__free_page(fault_page);
	if (hwpoison_page)
		__free_page(hwpoison_page);
	__free_page(bad_page);
out:
	kvm_arch_exit();
out_fail:
	return r;
}
EXPORT_SYMBOL_GPL(kvm_init);

void kvm_exit(void)
{
	kvm_exit_debug();
	misc_deregister(&kvm_dev);
	kmem_cache_destroy(kvm_vcpu_cache);
	kvm_async_pf_deinit();
	unregister_syscore_ops(&kvm_syscore_ops);
	unregister_reboot_notifier(&kvm_reboot_notifier);
	unregister_cpu_notifier(&kvm_cpu_notifier);
	on_each_cpu(hardware_disable_nolock, NULL, 1);
	kvm_arch_hardware_unsetup();
	kvm_arch_exit();
	free_cpumask_var(cpus_hardware_enabled);
	__free_page(hwpoison_page);
	__free_page(bad_page);
}
EXPORT_SYMBOL_GPL(kvm_exit);
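
/*
 * Editor's sketch (not part of the original source): kvm_init()/kvm_exit()
 * are called from the arch module's init/exit hooks.  The sketch below is
 * modelled on the VMX module; the example_* names are hypothetical and the
 * arch structures are shown only as an illustration.
 *
 *	static int __init example_arch_init(void)
 *	{
 *		// opaque arch ops plus vcpu size/alignment for the
 *		// "kvm_vcpu" kmem cache created in kvm_init() above
 *		return kvm_init(&vmx_x86_ops, sizeof(struct vcpu_vmx),
 *				__alignof__(struct vcpu_vmx), THIS_MODULE);
 *	}
 *
 *	static void __exit example_arch_exit(void)
 *	{
 *		kvm_exit();
 *	}
 *
 *	module_init(example_arch_init);
 *	module_exit(example_arch_exit);
 */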