1 /* 2 * Kernel-based Virtual Machine driver for Linux 3 * 4 * This module enables machines with Intel VT-x extensions to run virtual 5 * machines without emulation or binary translation. 6 * 7 * Copyright (C) 2006 Qumranet, Inc. 8 * Copyright 2010 Red Hat, Inc. and/or its affiliates. 9 * 10 * Authors: 11 * Avi Kivity <avi@qumranet.com> 12 * Yaniv Kamay <yaniv@qumranet.com> 13 * 14 * This work is licensed under the terms of the GNU GPL, version 2. See 15 * the COPYING file in the top-level directory. 16 * 17 */ 18 19 #include "iodev.h" 20 21 #include <linux/kvm_host.h> 22 #include <linux/kvm.h> 23 #include <linux/module.h> 24 #include <linux/errno.h> 25 #include <linux/percpu.h> 26 #include <linux/mm.h> 27 #include <linux/miscdevice.h> 28 #include <linux/vmalloc.h> 29 #include <linux/reboot.h> 30 #include <linux/debugfs.h> 31 #include <linux/highmem.h> 32 #include <linux/file.h> 33 #include <linux/syscore_ops.h> 34 #include <linux/cpu.h> 35 #include <linux/sched.h> 36 #include <linux/cpumask.h> 37 #include <linux/smp.h> 38 #include <linux/anon_inodes.h> 39 #include <linux/profile.h> 40 #include <linux/kvm_para.h> 41 #include <linux/pagemap.h> 42 #include <linux/mman.h> 43 #include <linux/swap.h> 44 #include <linux/bitops.h> 45 #include <linux/spinlock.h> 46 #include <linux/compat.h> 47 #include <linux/srcu.h> 48 #include <linux/hugetlb.h> 49 #include <linux/slab.h> 50 #include <linux/sort.h> 51 #include <linux/bsearch.h> 52 53 #include <asm/processor.h> 54 #include <asm/io.h> 55 #include <asm/uaccess.h> 56 #include <asm/pgtable.h> 57 58 #include "coalesced_mmio.h" 59 #include "async_pf.h" 60 61 #define CREATE_TRACE_POINTS 62 #include <trace/events/kvm.h> 63 64 MODULE_AUTHOR("Qumranet"); 65 MODULE_LICENSE("GPL"); 66 67 /* 68 * Ordering of locks: 69 * 70 * kvm->lock --> kvm->slots_lock --> kvm->irq_lock 71 */ 72 73 DEFINE_RAW_SPINLOCK(kvm_lock); 74 LIST_HEAD(vm_list); 75 76 static cpumask_var_t cpus_hardware_enabled; 77 static int kvm_usage_count = 0; 78 static atomic_t hardware_enable_failed; 79 80 struct kmem_cache *kvm_vcpu_cache; 81 EXPORT_SYMBOL_GPL(kvm_vcpu_cache); 82 83 static __read_mostly struct preempt_ops kvm_preempt_ops; 84 85 struct dentry *kvm_debugfs_dir; 86 87 static long kvm_vcpu_ioctl(struct file *file, unsigned int ioctl, 88 unsigned long arg); 89 #ifdef CONFIG_COMPAT 90 static long kvm_vcpu_compat_ioctl(struct file *file, unsigned int ioctl, 91 unsigned long arg); 92 #endif 93 static int hardware_enable_all(void); 94 static void hardware_disable_all(void); 95 96 static void kvm_io_bus_destroy(struct kvm_io_bus *bus); 97 98 bool kvm_rebooting; 99 EXPORT_SYMBOL_GPL(kvm_rebooting); 100 101 static bool largepages_enabled = true; 102 103 static struct page *hwpoison_page; 104 static pfn_t hwpoison_pfn; 105 106 struct page *fault_page; 107 pfn_t fault_pfn; 108 109 inline int kvm_is_mmio_pfn(pfn_t pfn) 110 { 111 if (pfn_valid(pfn)) { 112 int reserved; 113 struct page *tail = pfn_to_page(pfn); 114 struct page *head = compound_trans_head(tail); 115 reserved = PageReserved(head); 116 if (head != tail) { 117 /* 118 * "head" is not a dangling pointer 119 * (compound_trans_head takes care of that) 120 * but the hugepage may have been splitted 121 * from under us (and we may not hold a 122 * reference count on the head page so it can 123 * be reused before we run PageReferenced), so 124 * we've to check PageTail before returning 125 * what we just read. 
126 */ 127 smp_rmb(); 128 if (PageTail(tail)) 129 return reserved; 130 } 131 return PageReserved(tail); 132 } 133 134 return true; 135 } 136 137 /* 138 * Switches to specified vcpu, until a matching vcpu_put() 139 */ 140 void vcpu_load(struct kvm_vcpu *vcpu) 141 { 142 int cpu; 143 144 mutex_lock(&vcpu->mutex); 145 if (unlikely(vcpu->pid != current->pids[PIDTYPE_PID].pid)) { 146 /* The thread running this VCPU changed. */ 147 struct pid *oldpid = vcpu->pid; 148 struct pid *newpid = get_task_pid(current, PIDTYPE_PID); 149 rcu_assign_pointer(vcpu->pid, newpid); 150 synchronize_rcu(); 151 put_pid(oldpid); 152 } 153 cpu = get_cpu(); 154 preempt_notifier_register(&vcpu->preempt_notifier); 155 kvm_arch_vcpu_load(vcpu, cpu); 156 put_cpu(); 157 } 158 159 void vcpu_put(struct kvm_vcpu *vcpu) 160 { 161 preempt_disable(); 162 kvm_arch_vcpu_put(vcpu); 163 preempt_notifier_unregister(&vcpu->preempt_notifier); 164 preempt_enable(); 165 mutex_unlock(&vcpu->mutex); 166 } 167 168 static void ack_flush(void *_completed) 169 { 170 } 171 172 static bool make_all_cpus_request(struct kvm *kvm, unsigned int req) 173 { 174 int i, cpu, me; 175 cpumask_var_t cpus; 176 bool called = true; 177 struct kvm_vcpu *vcpu; 178 179 zalloc_cpumask_var(&cpus, GFP_ATOMIC); 180 181 me = get_cpu(); 182 kvm_for_each_vcpu(i, vcpu, kvm) { 183 kvm_make_request(req, vcpu); 184 cpu = vcpu->cpu; 185 186 /* Set ->requests bit before we read ->mode */ 187 smp_mb(); 188 189 if (cpus != NULL && cpu != -1 && cpu != me && 190 kvm_vcpu_exiting_guest_mode(vcpu) != OUTSIDE_GUEST_MODE) 191 cpumask_set_cpu(cpu, cpus); 192 } 193 if (unlikely(cpus == NULL)) 194 smp_call_function_many(cpu_online_mask, ack_flush, NULL, 1); 195 else if (!cpumask_empty(cpus)) 196 smp_call_function_many(cpus, ack_flush, NULL, 1); 197 else 198 called = false; 199 put_cpu(); 200 free_cpumask_var(cpus); 201 return called; 202 } 203 204 void kvm_flush_remote_tlbs(struct kvm *kvm) 205 { 206 int dirty_count = kvm->tlbs_dirty; 207 208 smp_mb(); 209 if (make_all_cpus_request(kvm, KVM_REQ_TLB_FLUSH)) 210 ++kvm->stat.remote_tlb_flush; 211 cmpxchg(&kvm->tlbs_dirty, dirty_count, 0); 212 } 213 214 void kvm_reload_remote_mmus(struct kvm *kvm) 215 { 216 make_all_cpus_request(kvm, KVM_REQ_MMU_RELOAD); 217 } 218 219 int kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id) 220 { 221 struct page *page; 222 int r; 223 224 mutex_init(&vcpu->mutex); 225 vcpu->cpu = -1; 226 vcpu->kvm = kvm; 227 vcpu->vcpu_id = id; 228 vcpu->pid = NULL; 229 init_waitqueue_head(&vcpu->wq); 230 kvm_async_pf_vcpu_init(vcpu); 231 232 page = alloc_page(GFP_KERNEL | __GFP_ZERO); 233 if (!page) { 234 r = -ENOMEM; 235 goto fail; 236 } 237 vcpu->run = page_address(page); 238 239 r = kvm_arch_vcpu_init(vcpu); 240 if (r < 0) 241 goto fail_free_run; 242 return 0; 243 244 fail_free_run: 245 free_page((unsigned long)vcpu->run); 246 fail: 247 return r; 248 } 249 EXPORT_SYMBOL_GPL(kvm_vcpu_init); 250 251 void kvm_vcpu_uninit(struct kvm_vcpu *vcpu) 252 { 253 put_pid(vcpu->pid); 254 kvm_arch_vcpu_uninit(vcpu); 255 free_page((unsigned long)vcpu->run); 256 } 257 EXPORT_SYMBOL_GPL(kvm_vcpu_uninit); 258 259 #if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER) 260 static inline struct kvm *mmu_notifier_to_kvm(struct mmu_notifier *mn) 261 { 262 return container_of(mn, struct kvm, mmu_notifier); 263 } 264 265 static void kvm_mmu_notifier_invalidate_page(struct mmu_notifier *mn, 266 struct mm_struct *mm, 267 unsigned long address) 268 { 269 struct kvm *kvm = mmu_notifier_to_kvm(mn); 270 int 
need_tlb_flush, idx; 271 272 /* 273 * When ->invalidate_page runs, the linux pte has been zapped 274 * already but the page is still allocated until 275 * ->invalidate_page returns. So if we increase the sequence 276 * here the kvm page fault will notice if the spte can't be 277 * established because the page is going to be freed. If 278 * instead the kvm page fault establishes the spte before 279 * ->invalidate_page runs, kvm_unmap_hva will release it 280 * before returning. 281 * 282 * The sequence increase only need to be seen at spin_unlock 283 * time, and not at spin_lock time. 284 * 285 * Increasing the sequence after the spin_unlock would be 286 * unsafe because the kvm page fault could then establish the 287 * pte after kvm_unmap_hva returned, without noticing the page 288 * is going to be freed. 289 */ 290 idx = srcu_read_lock(&kvm->srcu); 291 spin_lock(&kvm->mmu_lock); 292 kvm->mmu_notifier_seq++; 293 need_tlb_flush = kvm_unmap_hva(kvm, address) | kvm->tlbs_dirty; 294 spin_unlock(&kvm->mmu_lock); 295 srcu_read_unlock(&kvm->srcu, idx); 296 297 /* we've to flush the tlb before the pages can be freed */ 298 if (need_tlb_flush) 299 kvm_flush_remote_tlbs(kvm); 300 301 } 302 303 static void kvm_mmu_notifier_change_pte(struct mmu_notifier *mn, 304 struct mm_struct *mm, 305 unsigned long address, 306 pte_t pte) 307 { 308 struct kvm *kvm = mmu_notifier_to_kvm(mn); 309 int idx; 310 311 idx = srcu_read_lock(&kvm->srcu); 312 spin_lock(&kvm->mmu_lock); 313 kvm->mmu_notifier_seq++; 314 kvm_set_spte_hva(kvm, address, pte); 315 spin_unlock(&kvm->mmu_lock); 316 srcu_read_unlock(&kvm->srcu, idx); 317 } 318 319 static void kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn, 320 struct mm_struct *mm, 321 unsigned long start, 322 unsigned long end) 323 { 324 struct kvm *kvm = mmu_notifier_to_kvm(mn); 325 int need_tlb_flush = 0, idx; 326 327 idx = srcu_read_lock(&kvm->srcu); 328 spin_lock(&kvm->mmu_lock); 329 /* 330 * The count increase must become visible at unlock time as no 331 * spte can be established without taking the mmu_lock and 332 * count is also read inside the mmu_lock critical section. 333 */ 334 kvm->mmu_notifier_count++; 335 for (; start < end; start += PAGE_SIZE) 336 need_tlb_flush |= kvm_unmap_hva(kvm, start); 337 need_tlb_flush |= kvm->tlbs_dirty; 338 spin_unlock(&kvm->mmu_lock); 339 srcu_read_unlock(&kvm->srcu, idx); 340 341 /* we've to flush the tlb before the pages can be freed */ 342 if (need_tlb_flush) 343 kvm_flush_remote_tlbs(kvm); 344 } 345 346 static void kvm_mmu_notifier_invalidate_range_end(struct mmu_notifier *mn, 347 struct mm_struct *mm, 348 unsigned long start, 349 unsigned long end) 350 { 351 struct kvm *kvm = mmu_notifier_to_kvm(mn); 352 353 spin_lock(&kvm->mmu_lock); 354 /* 355 * This sequence increase will notify the kvm page fault that 356 * the page that is going to be mapped in the spte could have 357 * been freed. 358 */ 359 kvm->mmu_notifier_seq++; 360 /* 361 * The above sequence increase must be visible before the 362 * below count decrease but both values are read by the kvm 363 * page fault under mmu_lock spinlock so we don't need to add 364 * a smb_wmb() here in between the two. 
365 */ 366 kvm->mmu_notifier_count--; 367 spin_unlock(&kvm->mmu_lock); 368 369 BUG_ON(kvm->mmu_notifier_count < 0); 370 } 371 372 static int kvm_mmu_notifier_clear_flush_young(struct mmu_notifier *mn, 373 struct mm_struct *mm, 374 unsigned long address) 375 { 376 struct kvm *kvm = mmu_notifier_to_kvm(mn); 377 int young, idx; 378 379 idx = srcu_read_lock(&kvm->srcu); 380 spin_lock(&kvm->mmu_lock); 381 young = kvm_age_hva(kvm, address); 382 spin_unlock(&kvm->mmu_lock); 383 srcu_read_unlock(&kvm->srcu, idx); 384 385 if (young) 386 kvm_flush_remote_tlbs(kvm); 387 388 return young; 389 } 390 391 static int kvm_mmu_notifier_test_young(struct mmu_notifier *mn, 392 struct mm_struct *mm, 393 unsigned long address) 394 { 395 struct kvm *kvm = mmu_notifier_to_kvm(mn); 396 int young, idx; 397 398 idx = srcu_read_lock(&kvm->srcu); 399 spin_lock(&kvm->mmu_lock); 400 young = kvm_test_age_hva(kvm, address); 401 spin_unlock(&kvm->mmu_lock); 402 srcu_read_unlock(&kvm->srcu, idx); 403 404 return young; 405 } 406 407 static void kvm_mmu_notifier_release(struct mmu_notifier *mn, 408 struct mm_struct *mm) 409 { 410 struct kvm *kvm = mmu_notifier_to_kvm(mn); 411 int idx; 412 413 idx = srcu_read_lock(&kvm->srcu); 414 kvm_arch_flush_shadow(kvm); 415 srcu_read_unlock(&kvm->srcu, idx); 416 } 417 418 static const struct mmu_notifier_ops kvm_mmu_notifier_ops = { 419 .invalidate_page = kvm_mmu_notifier_invalidate_page, 420 .invalidate_range_start = kvm_mmu_notifier_invalidate_range_start, 421 .invalidate_range_end = kvm_mmu_notifier_invalidate_range_end, 422 .clear_flush_young = kvm_mmu_notifier_clear_flush_young, 423 .test_young = kvm_mmu_notifier_test_young, 424 .change_pte = kvm_mmu_notifier_change_pte, 425 .release = kvm_mmu_notifier_release, 426 }; 427 428 static int kvm_init_mmu_notifier(struct kvm *kvm) 429 { 430 kvm->mmu_notifier.ops = &kvm_mmu_notifier_ops; 431 return mmu_notifier_register(&kvm->mmu_notifier, current->mm); 432 } 433 434 #else /* !(CONFIG_MMU_NOTIFIER && KVM_ARCH_WANT_MMU_NOTIFIER) */ 435 436 static int kvm_init_mmu_notifier(struct kvm *kvm) 437 { 438 return 0; 439 } 440 441 #endif /* CONFIG_MMU_NOTIFIER && KVM_ARCH_WANT_MMU_NOTIFIER */ 442 443 static struct kvm *kvm_create_vm(void) 444 { 445 int r, i; 446 struct kvm *kvm = kvm_arch_alloc_vm(); 447 448 if (!kvm) 449 return ERR_PTR(-ENOMEM); 450 451 r = kvm_arch_init_vm(kvm); 452 if (r) 453 goto out_err_nodisable; 454 455 r = hardware_enable_all(); 456 if (r) 457 goto out_err_nodisable; 458 459 #ifdef CONFIG_HAVE_KVM_IRQCHIP 460 INIT_HLIST_HEAD(&kvm->mask_notifier_list); 461 INIT_HLIST_HEAD(&kvm->irq_ack_notifier_list); 462 #endif 463 464 r = -ENOMEM; 465 kvm->memslots = kzalloc(sizeof(struct kvm_memslots), GFP_KERNEL); 466 if (!kvm->memslots) 467 goto out_err_nosrcu; 468 if (init_srcu_struct(&kvm->srcu)) 469 goto out_err_nosrcu; 470 for (i = 0; i < KVM_NR_BUSES; i++) { 471 kvm->buses[i] = kzalloc(sizeof(struct kvm_io_bus), 472 GFP_KERNEL); 473 if (!kvm->buses[i]) 474 goto out_err; 475 } 476 477 spin_lock_init(&kvm->mmu_lock); 478 kvm->mm = current->mm; 479 atomic_inc(&kvm->mm->mm_count); 480 kvm_eventfd_init(kvm); 481 mutex_init(&kvm->lock); 482 mutex_init(&kvm->irq_lock); 483 mutex_init(&kvm->slots_lock); 484 atomic_set(&kvm->users_count, 1); 485 486 r = kvm_init_mmu_notifier(kvm); 487 if (r) 488 goto out_err; 489 490 raw_spin_lock(&kvm_lock); 491 list_add(&kvm->vm_list, &vm_list); 492 raw_spin_unlock(&kvm_lock); 493 494 return kvm; 495 496 out_err: 497 cleanup_srcu_struct(&kvm->srcu); 498 out_err_nosrcu: 499 hardware_disable_all(); 500 
out_err_nodisable: 501 for (i = 0; i < KVM_NR_BUSES; i++) 502 kfree(kvm->buses[i]); 503 kfree(kvm->memslots); 504 kvm_arch_free_vm(kvm); 505 return ERR_PTR(r); 506 } 507 508 static void kvm_destroy_dirty_bitmap(struct kvm_memory_slot *memslot) 509 { 510 if (!memslot->dirty_bitmap) 511 return; 512 513 if (2 * kvm_dirty_bitmap_bytes(memslot) > PAGE_SIZE) 514 vfree(memslot->dirty_bitmap_head); 515 else 516 kfree(memslot->dirty_bitmap_head); 517 518 memslot->dirty_bitmap = NULL; 519 memslot->dirty_bitmap_head = NULL; 520 } 521 522 /* 523 * Free any memory in @free but not in @dont. 524 */ 525 static void kvm_free_physmem_slot(struct kvm_memory_slot *free, 526 struct kvm_memory_slot *dont) 527 { 528 int i; 529 530 if (!dont || free->rmap != dont->rmap) 531 vfree(free->rmap); 532 533 if (!dont || free->dirty_bitmap != dont->dirty_bitmap) 534 kvm_destroy_dirty_bitmap(free); 535 536 537 for (i = 0; i < KVM_NR_PAGE_SIZES - 1; ++i) { 538 if (!dont || free->lpage_info[i] != dont->lpage_info[i]) { 539 vfree(free->lpage_info[i]); 540 free->lpage_info[i] = NULL; 541 } 542 } 543 544 free->npages = 0; 545 free->rmap = NULL; 546 } 547 548 void kvm_free_physmem(struct kvm *kvm) 549 { 550 int i; 551 struct kvm_memslots *slots = kvm->memslots; 552 553 for (i = 0; i < slots->nmemslots; ++i) 554 kvm_free_physmem_slot(&slots->memslots[i], NULL); 555 556 kfree(kvm->memslots); 557 } 558 559 static void kvm_destroy_vm(struct kvm *kvm) 560 { 561 int i; 562 struct mm_struct *mm = kvm->mm; 563 564 kvm_arch_sync_events(kvm); 565 raw_spin_lock(&kvm_lock); 566 list_del(&kvm->vm_list); 567 raw_spin_unlock(&kvm_lock); 568 kvm_free_irq_routing(kvm); 569 for (i = 0; i < KVM_NR_BUSES; i++) 570 kvm_io_bus_destroy(kvm->buses[i]); 571 kvm_coalesced_mmio_free(kvm); 572 #if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER) 573 mmu_notifier_unregister(&kvm->mmu_notifier, kvm->mm); 574 #else 575 kvm_arch_flush_shadow(kvm); 576 #endif 577 kvm_arch_destroy_vm(kvm); 578 kvm_free_physmem(kvm); 579 cleanup_srcu_struct(&kvm->srcu); 580 kvm_arch_free_vm(kvm); 581 hardware_disable_all(); 582 mmdrop(mm); 583 } 584 585 void kvm_get_kvm(struct kvm *kvm) 586 { 587 atomic_inc(&kvm->users_count); 588 } 589 EXPORT_SYMBOL_GPL(kvm_get_kvm); 590 591 void kvm_put_kvm(struct kvm *kvm) 592 { 593 if (atomic_dec_and_test(&kvm->users_count)) 594 kvm_destroy_vm(kvm); 595 } 596 EXPORT_SYMBOL_GPL(kvm_put_kvm); 597 598 599 static int kvm_vm_release(struct inode *inode, struct file *filp) 600 { 601 struct kvm *kvm = filp->private_data; 602 603 kvm_irqfd_release(kvm); 604 605 kvm_put_kvm(kvm); 606 return 0; 607 } 608 609 #ifndef CONFIG_S390 610 /* 611 * Allocation size is twice as large as the actual dirty bitmap size. 612 * This makes it possible to do double buffering: see x86's 613 * kvm_vm_ioctl_get_dirty_log(). 614 */ 615 static int kvm_create_dirty_bitmap(struct kvm_memory_slot *memslot) 616 { 617 unsigned long dirty_bytes = 2 * kvm_dirty_bitmap_bytes(memslot); 618 619 if (dirty_bytes > PAGE_SIZE) 620 memslot->dirty_bitmap = vzalloc(dirty_bytes); 621 else 622 memslot->dirty_bitmap = kzalloc(dirty_bytes, GFP_KERNEL); 623 624 if (!memslot->dirty_bitmap) 625 return -ENOMEM; 626 627 memslot->dirty_bitmap_head = memslot->dirty_bitmap; 628 return 0; 629 } 630 #endif /* !CONFIG_S390 */ 631 632 /* 633 * Allocate some memory and give it an address in the guest physical address 634 * space. 635 * 636 * Discontiguous memory is allowed, mostly for framebuffers. 637 * 638 * Must be called holding mmap_sem for write. 
639 */ 640 int __kvm_set_memory_region(struct kvm *kvm, 641 struct kvm_userspace_memory_region *mem, 642 int user_alloc) 643 { 644 int r; 645 gfn_t base_gfn; 646 unsigned long npages; 647 unsigned long i; 648 struct kvm_memory_slot *memslot; 649 struct kvm_memory_slot old, new; 650 struct kvm_memslots *slots, *old_memslots; 651 652 r = -EINVAL; 653 /* General sanity checks */ 654 if (mem->memory_size & (PAGE_SIZE - 1)) 655 goto out; 656 if (mem->guest_phys_addr & (PAGE_SIZE - 1)) 657 goto out; 658 /* We can read the guest memory with __xxx_user() later on. */ 659 if (user_alloc && 660 ((mem->userspace_addr & (PAGE_SIZE - 1)) || 661 !access_ok(VERIFY_WRITE, 662 (void __user *)(unsigned long)mem->userspace_addr, 663 mem->memory_size))) 664 goto out; 665 if (mem->slot >= KVM_MEMORY_SLOTS + KVM_PRIVATE_MEM_SLOTS) 666 goto out; 667 if (mem->guest_phys_addr + mem->memory_size < mem->guest_phys_addr) 668 goto out; 669 670 memslot = &kvm->memslots->memslots[mem->slot]; 671 base_gfn = mem->guest_phys_addr >> PAGE_SHIFT; 672 npages = mem->memory_size >> PAGE_SHIFT; 673 674 r = -EINVAL; 675 if (npages > KVM_MEM_MAX_NR_PAGES) 676 goto out; 677 678 if (!npages) 679 mem->flags &= ~KVM_MEM_LOG_DIRTY_PAGES; 680 681 new = old = *memslot; 682 683 new.id = mem->slot; 684 new.base_gfn = base_gfn; 685 new.npages = npages; 686 new.flags = mem->flags; 687 688 /* Disallow changing a memory slot's size. */ 689 r = -EINVAL; 690 if (npages && old.npages && npages != old.npages) 691 goto out_free; 692 693 /* Check for overlaps */ 694 r = -EEXIST; 695 for (i = 0; i < KVM_MEMORY_SLOTS; ++i) { 696 struct kvm_memory_slot *s = &kvm->memslots->memslots[i]; 697 698 if (s == memslot || !s->npages) 699 continue; 700 if (!((base_gfn + npages <= s->base_gfn) || 701 (base_gfn >= s->base_gfn + s->npages))) 702 goto out_free; 703 } 704 705 /* Free page dirty bitmap if unneeded */ 706 if (!(new.flags & KVM_MEM_LOG_DIRTY_PAGES)) 707 new.dirty_bitmap = NULL; 708 709 r = -ENOMEM; 710 711 /* Allocate if a slot is being created */ 712 #ifndef CONFIG_S390 713 if (npages && !new.rmap) { 714 new.rmap = vzalloc(npages * sizeof(*new.rmap)); 715 716 if (!new.rmap) 717 goto out_free; 718 719 new.user_alloc = user_alloc; 720 new.userspace_addr = mem->userspace_addr; 721 } 722 if (!npages) 723 goto skip_lpage; 724 725 for (i = 0; i < KVM_NR_PAGE_SIZES - 1; ++i) { 726 unsigned long ugfn; 727 unsigned long j; 728 int lpages; 729 int level = i + 2; 730 731 /* Avoid unused variable warning if no large pages */ 732 (void)level; 733 734 if (new.lpage_info[i]) 735 continue; 736 737 lpages = 1 + ((base_gfn + npages - 1) 738 >> KVM_HPAGE_GFN_SHIFT(level)); 739 lpages -= base_gfn >> KVM_HPAGE_GFN_SHIFT(level); 740 741 new.lpage_info[i] = vzalloc(lpages * sizeof(*new.lpage_info[i])); 742 743 if (!new.lpage_info[i]) 744 goto out_free; 745 746 if (base_gfn & (KVM_PAGES_PER_HPAGE(level) - 1)) 747 new.lpage_info[i][0].write_count = 1; 748 if ((base_gfn+npages) & (KVM_PAGES_PER_HPAGE(level) - 1)) 749 new.lpage_info[i][lpages - 1].write_count = 1; 750 ugfn = new.userspace_addr >> PAGE_SHIFT; 751 /* 752 * If the gfn and userspace address are not aligned wrt each 753 * other, or if explicitly asked to, disable large page 754 * support for this slot 755 */ 756 if ((base_gfn ^ ugfn) & (KVM_PAGES_PER_HPAGE(level) - 1) || 757 !largepages_enabled) 758 for (j = 0; j < lpages; ++j) 759 new.lpage_info[i][j].write_count = 1; 760 } 761 762 skip_lpage: 763 764 /* Allocate page dirty bitmap if needed */ 765 if ((new.flags & KVM_MEM_LOG_DIRTY_PAGES) && !new.dirty_bitmap) { 766 
if (kvm_create_dirty_bitmap(&new) < 0) 767 goto out_free; 768 /* destroy any largepage mappings for dirty tracking */ 769 } 770 #else /* not defined CONFIG_S390 */ 771 new.user_alloc = user_alloc; 772 if (user_alloc) 773 new.userspace_addr = mem->userspace_addr; 774 #endif /* not defined CONFIG_S390 */ 775 776 if (!npages) { 777 r = -ENOMEM; 778 slots = kzalloc(sizeof(struct kvm_memslots), GFP_KERNEL); 779 if (!slots) 780 goto out_free; 781 memcpy(slots, kvm->memslots, sizeof(struct kvm_memslots)); 782 if (mem->slot >= slots->nmemslots) 783 slots->nmemslots = mem->slot + 1; 784 slots->generation++; 785 slots->memslots[mem->slot].flags |= KVM_MEMSLOT_INVALID; 786 787 old_memslots = kvm->memslots; 788 rcu_assign_pointer(kvm->memslots, slots); 789 synchronize_srcu_expedited(&kvm->srcu); 790 /* From this point no new shadow pages pointing to a deleted 791 * memslot will be created. 792 * 793 * validation of sp->gfn happens in: 794 * - gfn_to_hva (kvm_read_guest, gfn_to_pfn) 795 * - kvm_is_visible_gfn (mmu_check_roots) 796 */ 797 kvm_arch_flush_shadow(kvm); 798 kfree(old_memslots); 799 } 800 801 r = kvm_arch_prepare_memory_region(kvm, &new, old, mem, user_alloc); 802 if (r) 803 goto out_free; 804 805 /* map the pages in iommu page table */ 806 if (npages) { 807 r = kvm_iommu_map_pages(kvm, &new); 808 if (r) 809 goto out_free; 810 } 811 812 r = -ENOMEM; 813 slots = kzalloc(sizeof(struct kvm_memslots), GFP_KERNEL); 814 if (!slots) 815 goto out_free; 816 memcpy(slots, kvm->memslots, sizeof(struct kvm_memslots)); 817 if (mem->slot >= slots->nmemslots) 818 slots->nmemslots = mem->slot + 1; 819 slots->generation++; 820 821 /* actual memory is freed via old in kvm_free_physmem_slot below */ 822 if (!npages) { 823 new.rmap = NULL; 824 new.dirty_bitmap = NULL; 825 for (i = 0; i < KVM_NR_PAGE_SIZES - 1; ++i) 826 new.lpage_info[i] = NULL; 827 } 828 829 slots->memslots[mem->slot] = new; 830 old_memslots = kvm->memslots; 831 rcu_assign_pointer(kvm->memslots, slots); 832 synchronize_srcu_expedited(&kvm->srcu); 833 834 kvm_arch_commit_memory_region(kvm, mem, old, user_alloc); 835 836 /* 837 * If the new memory slot is created, we need to clear all 838 * mmio sptes. 
839 */ 840 if (npages && old.base_gfn != mem->guest_phys_addr >> PAGE_SHIFT) 841 kvm_arch_flush_shadow(kvm); 842 843 kvm_free_physmem_slot(&old, &new); 844 kfree(old_memslots); 845 846 return 0; 847 848 out_free: 849 kvm_free_physmem_slot(&new, &old); 850 out: 851 return r; 852 853 } 854 EXPORT_SYMBOL_GPL(__kvm_set_memory_region); 855 856 int kvm_set_memory_region(struct kvm *kvm, 857 struct kvm_userspace_memory_region *mem, 858 int user_alloc) 859 { 860 int r; 861 862 mutex_lock(&kvm->slots_lock); 863 r = __kvm_set_memory_region(kvm, mem, user_alloc); 864 mutex_unlock(&kvm->slots_lock); 865 return r; 866 } 867 EXPORT_SYMBOL_GPL(kvm_set_memory_region); 868 869 int kvm_vm_ioctl_set_memory_region(struct kvm *kvm, 870 struct 871 kvm_userspace_memory_region *mem, 872 int user_alloc) 873 { 874 if (mem->slot >= KVM_MEMORY_SLOTS) 875 return -EINVAL; 876 return kvm_set_memory_region(kvm, mem, user_alloc); 877 } 878 879 int kvm_get_dirty_log(struct kvm *kvm, 880 struct kvm_dirty_log *log, int *is_dirty) 881 { 882 struct kvm_memory_slot *memslot; 883 int r, i; 884 unsigned long n; 885 unsigned long any = 0; 886 887 r = -EINVAL; 888 if (log->slot >= KVM_MEMORY_SLOTS) 889 goto out; 890 891 memslot = &kvm->memslots->memslots[log->slot]; 892 r = -ENOENT; 893 if (!memslot->dirty_bitmap) 894 goto out; 895 896 n = kvm_dirty_bitmap_bytes(memslot); 897 898 for (i = 0; !any && i < n/sizeof(long); ++i) 899 any = memslot->dirty_bitmap[i]; 900 901 r = -EFAULT; 902 if (copy_to_user(log->dirty_bitmap, memslot->dirty_bitmap, n)) 903 goto out; 904 905 if (any) 906 *is_dirty = 1; 907 908 r = 0; 909 out: 910 return r; 911 } 912 913 void kvm_disable_largepages(void) 914 { 915 largepages_enabled = false; 916 } 917 EXPORT_SYMBOL_GPL(kvm_disable_largepages); 918 919 int is_error_page(struct page *page) 920 { 921 return page == bad_page || page == hwpoison_page || page == fault_page; 922 } 923 EXPORT_SYMBOL_GPL(is_error_page); 924 925 int is_error_pfn(pfn_t pfn) 926 { 927 return pfn == bad_pfn || pfn == hwpoison_pfn || pfn == fault_pfn; 928 } 929 EXPORT_SYMBOL_GPL(is_error_pfn); 930 931 int is_hwpoison_pfn(pfn_t pfn) 932 { 933 return pfn == hwpoison_pfn; 934 } 935 EXPORT_SYMBOL_GPL(is_hwpoison_pfn); 936 937 int is_fault_pfn(pfn_t pfn) 938 { 939 return pfn == fault_pfn; 940 } 941 EXPORT_SYMBOL_GPL(is_fault_pfn); 942 943 int is_noslot_pfn(pfn_t pfn) 944 { 945 return pfn == bad_pfn; 946 } 947 EXPORT_SYMBOL_GPL(is_noslot_pfn); 948 949 int is_invalid_pfn(pfn_t pfn) 950 { 951 return pfn == hwpoison_pfn || pfn == fault_pfn; 952 } 953 EXPORT_SYMBOL_GPL(is_invalid_pfn); 954 955 static inline unsigned long bad_hva(void) 956 { 957 return PAGE_OFFSET; 958 } 959 960 int kvm_is_error_hva(unsigned long addr) 961 { 962 return addr == bad_hva(); 963 } 964 EXPORT_SYMBOL_GPL(kvm_is_error_hva); 965 966 static struct kvm_memory_slot *__gfn_to_memslot(struct kvm_memslots *slots, 967 gfn_t gfn) 968 { 969 int i; 970 971 for (i = 0; i < slots->nmemslots; ++i) { 972 struct kvm_memory_slot *memslot = &slots->memslots[i]; 973 974 if (gfn >= memslot->base_gfn 975 && gfn < memslot->base_gfn + memslot->npages) 976 return memslot; 977 } 978 return NULL; 979 } 980 981 struct kvm_memory_slot *gfn_to_memslot(struct kvm *kvm, gfn_t gfn) 982 { 983 return __gfn_to_memslot(kvm_memslots(kvm), gfn); 984 } 985 EXPORT_SYMBOL_GPL(gfn_to_memslot); 986 987 int kvm_is_visible_gfn(struct kvm *kvm, gfn_t gfn) 988 { 989 int i; 990 struct kvm_memslots *slots = kvm_memslots(kvm); 991 992 for (i = 0; i < KVM_MEMORY_SLOTS; ++i) { 993 struct kvm_memory_slot *memslot = 
&slots->memslots[i]; 994 995 if (memslot->flags & KVM_MEMSLOT_INVALID) 996 continue; 997 998 if (gfn >= memslot->base_gfn 999 && gfn < memslot->base_gfn + memslot->npages) 1000 return 1; 1001 } 1002 return 0; 1003 } 1004 EXPORT_SYMBOL_GPL(kvm_is_visible_gfn); 1005 1006 unsigned long kvm_host_page_size(struct kvm *kvm, gfn_t gfn) 1007 { 1008 struct vm_area_struct *vma; 1009 unsigned long addr, size; 1010 1011 size = PAGE_SIZE; 1012 1013 addr = gfn_to_hva(kvm, gfn); 1014 if (kvm_is_error_hva(addr)) 1015 return PAGE_SIZE; 1016 1017 down_read(¤t->mm->mmap_sem); 1018 vma = find_vma(current->mm, addr); 1019 if (!vma) 1020 goto out; 1021 1022 size = vma_kernel_pagesize(vma); 1023 1024 out: 1025 up_read(¤t->mm->mmap_sem); 1026 1027 return size; 1028 } 1029 1030 static unsigned long gfn_to_hva_many(struct kvm_memory_slot *slot, gfn_t gfn, 1031 gfn_t *nr_pages) 1032 { 1033 if (!slot || slot->flags & KVM_MEMSLOT_INVALID) 1034 return bad_hva(); 1035 1036 if (nr_pages) 1037 *nr_pages = slot->npages - (gfn - slot->base_gfn); 1038 1039 return gfn_to_hva_memslot(slot, gfn); 1040 } 1041 1042 unsigned long gfn_to_hva(struct kvm *kvm, gfn_t gfn) 1043 { 1044 return gfn_to_hva_many(gfn_to_memslot(kvm, gfn), gfn, NULL); 1045 } 1046 EXPORT_SYMBOL_GPL(gfn_to_hva); 1047 1048 static pfn_t get_fault_pfn(void) 1049 { 1050 get_page(fault_page); 1051 return fault_pfn; 1052 } 1053 1054 int get_user_page_nowait(struct task_struct *tsk, struct mm_struct *mm, 1055 unsigned long start, int write, struct page **page) 1056 { 1057 int flags = FOLL_TOUCH | FOLL_NOWAIT | FOLL_HWPOISON | FOLL_GET; 1058 1059 if (write) 1060 flags |= FOLL_WRITE; 1061 1062 return __get_user_pages(tsk, mm, start, 1, flags, page, NULL, NULL); 1063 } 1064 1065 static inline int check_user_page_hwpoison(unsigned long addr) 1066 { 1067 int rc, flags = FOLL_TOUCH | FOLL_HWPOISON | FOLL_WRITE; 1068 1069 rc = __get_user_pages(current, current->mm, addr, 1, 1070 flags, NULL, NULL, NULL); 1071 return rc == -EHWPOISON; 1072 } 1073 1074 static pfn_t hva_to_pfn(struct kvm *kvm, unsigned long addr, bool atomic, 1075 bool *async, bool write_fault, bool *writable) 1076 { 1077 struct page *page[1]; 1078 int npages = 0; 1079 pfn_t pfn; 1080 1081 /* we can do it either atomically or asynchronously, not both */ 1082 BUG_ON(atomic && async); 1083 1084 BUG_ON(!write_fault && !writable); 1085 1086 if (writable) 1087 *writable = true; 1088 1089 if (atomic || async) 1090 npages = __get_user_pages_fast(addr, 1, 1, page); 1091 1092 if (unlikely(npages != 1) && !atomic) { 1093 might_sleep(); 1094 1095 if (writable) 1096 *writable = write_fault; 1097 1098 if (async) { 1099 down_read(¤t->mm->mmap_sem); 1100 npages = get_user_page_nowait(current, current->mm, 1101 addr, write_fault, page); 1102 up_read(¤t->mm->mmap_sem); 1103 } else 1104 npages = get_user_pages_fast(addr, 1, write_fault, 1105 page); 1106 1107 /* map read fault as writable if possible */ 1108 if (unlikely(!write_fault) && npages == 1) { 1109 struct page *wpage[1]; 1110 1111 npages = __get_user_pages_fast(addr, 1, 1, wpage); 1112 if (npages == 1) { 1113 *writable = true; 1114 put_page(page[0]); 1115 page[0] = wpage[0]; 1116 } 1117 npages = 1; 1118 } 1119 } 1120 1121 if (unlikely(npages != 1)) { 1122 struct vm_area_struct *vma; 1123 1124 if (atomic) 1125 return get_fault_pfn(); 1126 1127 down_read(¤t->mm->mmap_sem); 1128 if (npages == -EHWPOISON || 1129 (!async && check_user_page_hwpoison(addr))) { 1130 up_read(¤t->mm->mmap_sem); 1131 get_page(hwpoison_page); 1132 return page_to_pfn(hwpoison_page); 1133 } 1134 1135 
vma = find_vma_intersection(current->mm, addr, addr+1); 1136 1137 if (vma == NULL) 1138 pfn = get_fault_pfn(); 1139 else if ((vma->vm_flags & VM_PFNMAP)) { 1140 pfn = ((addr - vma->vm_start) >> PAGE_SHIFT) + 1141 vma->vm_pgoff; 1142 BUG_ON(!kvm_is_mmio_pfn(pfn)); 1143 } else { 1144 if (async && (vma->vm_flags & VM_WRITE)) 1145 *async = true; 1146 pfn = get_fault_pfn(); 1147 } 1148 up_read(¤t->mm->mmap_sem); 1149 } else 1150 pfn = page_to_pfn(page[0]); 1151 1152 return pfn; 1153 } 1154 1155 pfn_t hva_to_pfn_atomic(struct kvm *kvm, unsigned long addr) 1156 { 1157 return hva_to_pfn(kvm, addr, true, NULL, true, NULL); 1158 } 1159 EXPORT_SYMBOL_GPL(hva_to_pfn_atomic); 1160 1161 static pfn_t __gfn_to_pfn(struct kvm *kvm, gfn_t gfn, bool atomic, bool *async, 1162 bool write_fault, bool *writable) 1163 { 1164 unsigned long addr; 1165 1166 if (async) 1167 *async = false; 1168 1169 addr = gfn_to_hva(kvm, gfn); 1170 if (kvm_is_error_hva(addr)) { 1171 get_page(bad_page); 1172 return page_to_pfn(bad_page); 1173 } 1174 1175 return hva_to_pfn(kvm, addr, atomic, async, write_fault, writable); 1176 } 1177 1178 pfn_t gfn_to_pfn_atomic(struct kvm *kvm, gfn_t gfn) 1179 { 1180 return __gfn_to_pfn(kvm, gfn, true, NULL, true, NULL); 1181 } 1182 EXPORT_SYMBOL_GPL(gfn_to_pfn_atomic); 1183 1184 pfn_t gfn_to_pfn_async(struct kvm *kvm, gfn_t gfn, bool *async, 1185 bool write_fault, bool *writable) 1186 { 1187 return __gfn_to_pfn(kvm, gfn, false, async, write_fault, writable); 1188 } 1189 EXPORT_SYMBOL_GPL(gfn_to_pfn_async); 1190 1191 pfn_t gfn_to_pfn(struct kvm *kvm, gfn_t gfn) 1192 { 1193 return __gfn_to_pfn(kvm, gfn, false, NULL, true, NULL); 1194 } 1195 EXPORT_SYMBOL_GPL(gfn_to_pfn); 1196 1197 pfn_t gfn_to_pfn_prot(struct kvm *kvm, gfn_t gfn, bool write_fault, 1198 bool *writable) 1199 { 1200 return __gfn_to_pfn(kvm, gfn, false, NULL, write_fault, writable); 1201 } 1202 EXPORT_SYMBOL_GPL(gfn_to_pfn_prot); 1203 1204 pfn_t gfn_to_pfn_memslot(struct kvm *kvm, 1205 struct kvm_memory_slot *slot, gfn_t gfn) 1206 { 1207 unsigned long addr = gfn_to_hva_memslot(slot, gfn); 1208 return hva_to_pfn(kvm, addr, false, NULL, true, NULL); 1209 } 1210 1211 int gfn_to_page_many_atomic(struct kvm *kvm, gfn_t gfn, struct page **pages, 1212 int nr_pages) 1213 { 1214 unsigned long addr; 1215 gfn_t entry; 1216 1217 addr = gfn_to_hva_many(gfn_to_memslot(kvm, gfn), gfn, &entry); 1218 if (kvm_is_error_hva(addr)) 1219 return -1; 1220 1221 if (entry < nr_pages) 1222 return 0; 1223 1224 return __get_user_pages_fast(addr, nr_pages, 1, pages); 1225 } 1226 EXPORT_SYMBOL_GPL(gfn_to_page_many_atomic); 1227 1228 struct page *gfn_to_page(struct kvm *kvm, gfn_t gfn) 1229 { 1230 pfn_t pfn; 1231 1232 pfn = gfn_to_pfn(kvm, gfn); 1233 if (!kvm_is_mmio_pfn(pfn)) 1234 return pfn_to_page(pfn); 1235 1236 WARN_ON(kvm_is_mmio_pfn(pfn)); 1237 1238 get_page(bad_page); 1239 return bad_page; 1240 } 1241 1242 EXPORT_SYMBOL_GPL(gfn_to_page); 1243 1244 void kvm_release_page_clean(struct page *page) 1245 { 1246 kvm_release_pfn_clean(page_to_pfn(page)); 1247 } 1248 EXPORT_SYMBOL_GPL(kvm_release_page_clean); 1249 1250 void kvm_release_pfn_clean(pfn_t pfn) 1251 { 1252 if (!kvm_is_mmio_pfn(pfn)) 1253 put_page(pfn_to_page(pfn)); 1254 } 1255 EXPORT_SYMBOL_GPL(kvm_release_pfn_clean); 1256 1257 void kvm_release_page_dirty(struct page *page) 1258 { 1259 kvm_release_pfn_dirty(page_to_pfn(page)); 1260 } 1261 EXPORT_SYMBOL_GPL(kvm_release_page_dirty); 1262 1263 void kvm_release_pfn_dirty(pfn_t pfn) 1264 { 1265 kvm_set_pfn_dirty(pfn); 1266 kvm_release_pfn_clean(pfn); 1267 } 1268 
EXPORT_SYMBOL_GPL(kvm_release_pfn_dirty); 1269 1270 void kvm_set_page_dirty(struct page *page) 1271 { 1272 kvm_set_pfn_dirty(page_to_pfn(page)); 1273 } 1274 EXPORT_SYMBOL_GPL(kvm_set_page_dirty); 1275 1276 void kvm_set_pfn_dirty(pfn_t pfn) 1277 { 1278 if (!kvm_is_mmio_pfn(pfn)) { 1279 struct page *page = pfn_to_page(pfn); 1280 if (!PageReserved(page)) 1281 SetPageDirty(page); 1282 } 1283 } 1284 EXPORT_SYMBOL_GPL(kvm_set_pfn_dirty); 1285 1286 void kvm_set_pfn_accessed(pfn_t pfn) 1287 { 1288 if (!kvm_is_mmio_pfn(pfn)) 1289 mark_page_accessed(pfn_to_page(pfn)); 1290 } 1291 EXPORT_SYMBOL_GPL(kvm_set_pfn_accessed); 1292 1293 void kvm_get_pfn(pfn_t pfn) 1294 { 1295 if (!kvm_is_mmio_pfn(pfn)) 1296 get_page(pfn_to_page(pfn)); 1297 } 1298 EXPORT_SYMBOL_GPL(kvm_get_pfn); 1299 1300 static int next_segment(unsigned long len, int offset) 1301 { 1302 if (len > PAGE_SIZE - offset) 1303 return PAGE_SIZE - offset; 1304 else 1305 return len; 1306 } 1307 1308 int kvm_read_guest_page(struct kvm *kvm, gfn_t gfn, void *data, int offset, 1309 int len) 1310 { 1311 int r; 1312 unsigned long addr; 1313 1314 addr = gfn_to_hva(kvm, gfn); 1315 if (kvm_is_error_hva(addr)) 1316 return -EFAULT; 1317 r = __copy_from_user(data, (void __user *)addr + offset, len); 1318 if (r) 1319 return -EFAULT; 1320 return 0; 1321 } 1322 EXPORT_SYMBOL_GPL(kvm_read_guest_page); 1323 1324 int kvm_read_guest(struct kvm *kvm, gpa_t gpa, void *data, unsigned long len) 1325 { 1326 gfn_t gfn = gpa >> PAGE_SHIFT; 1327 int seg; 1328 int offset = offset_in_page(gpa); 1329 int ret; 1330 1331 while ((seg = next_segment(len, offset)) != 0) { 1332 ret = kvm_read_guest_page(kvm, gfn, data, offset, seg); 1333 if (ret < 0) 1334 return ret; 1335 offset = 0; 1336 len -= seg; 1337 data += seg; 1338 ++gfn; 1339 } 1340 return 0; 1341 } 1342 EXPORT_SYMBOL_GPL(kvm_read_guest); 1343 1344 int kvm_read_guest_atomic(struct kvm *kvm, gpa_t gpa, void *data, 1345 unsigned long len) 1346 { 1347 int r; 1348 unsigned long addr; 1349 gfn_t gfn = gpa >> PAGE_SHIFT; 1350 int offset = offset_in_page(gpa); 1351 1352 addr = gfn_to_hva(kvm, gfn); 1353 if (kvm_is_error_hva(addr)) 1354 return -EFAULT; 1355 pagefault_disable(); 1356 r = __copy_from_user_inatomic(data, (void __user *)addr + offset, len); 1357 pagefault_enable(); 1358 if (r) 1359 return -EFAULT; 1360 return 0; 1361 } 1362 EXPORT_SYMBOL(kvm_read_guest_atomic); 1363 1364 int kvm_write_guest_page(struct kvm *kvm, gfn_t gfn, const void *data, 1365 int offset, int len) 1366 { 1367 int r; 1368 unsigned long addr; 1369 1370 addr = gfn_to_hva(kvm, gfn); 1371 if (kvm_is_error_hva(addr)) 1372 return -EFAULT; 1373 r = __copy_to_user((void __user *)addr + offset, data, len); 1374 if (r) 1375 return -EFAULT; 1376 mark_page_dirty(kvm, gfn); 1377 return 0; 1378 } 1379 EXPORT_SYMBOL_GPL(kvm_write_guest_page); 1380 1381 int kvm_write_guest(struct kvm *kvm, gpa_t gpa, const void *data, 1382 unsigned long len) 1383 { 1384 gfn_t gfn = gpa >> PAGE_SHIFT; 1385 int seg; 1386 int offset = offset_in_page(gpa); 1387 int ret; 1388 1389 while ((seg = next_segment(len, offset)) != 0) { 1390 ret = kvm_write_guest_page(kvm, gfn, data, offset, seg); 1391 if (ret < 0) 1392 return ret; 1393 offset = 0; 1394 len -= seg; 1395 data += seg; 1396 ++gfn; 1397 } 1398 return 0; 1399 } 1400 1401 int kvm_gfn_to_hva_cache_init(struct kvm *kvm, struct gfn_to_hva_cache *ghc, 1402 gpa_t gpa) 1403 { 1404 struct kvm_memslots *slots = kvm_memslots(kvm); 1405 int offset = offset_in_page(gpa); 1406 gfn_t gfn = gpa >> PAGE_SHIFT; 1407 1408 ghc->gpa = gpa; 1409 
ghc->generation = slots->generation; 1410 ghc->memslot = __gfn_to_memslot(slots, gfn); 1411 ghc->hva = gfn_to_hva_many(ghc->memslot, gfn, NULL); 1412 if (!kvm_is_error_hva(ghc->hva)) 1413 ghc->hva += offset; 1414 else 1415 return -EFAULT; 1416 1417 return 0; 1418 } 1419 EXPORT_SYMBOL_GPL(kvm_gfn_to_hva_cache_init); 1420 1421 int kvm_write_guest_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc, 1422 void *data, unsigned long len) 1423 { 1424 struct kvm_memslots *slots = kvm_memslots(kvm); 1425 int r; 1426 1427 if (slots->generation != ghc->generation) 1428 kvm_gfn_to_hva_cache_init(kvm, ghc, ghc->gpa); 1429 1430 if (kvm_is_error_hva(ghc->hva)) 1431 return -EFAULT; 1432 1433 r = __copy_to_user((void __user *)ghc->hva, data, len); 1434 if (r) 1435 return -EFAULT; 1436 mark_page_dirty_in_slot(kvm, ghc->memslot, ghc->gpa >> PAGE_SHIFT); 1437 1438 return 0; 1439 } 1440 EXPORT_SYMBOL_GPL(kvm_write_guest_cached); 1441 1442 int kvm_read_guest_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc, 1443 void *data, unsigned long len) 1444 { 1445 struct kvm_memslots *slots = kvm_memslots(kvm); 1446 int r; 1447 1448 if (slots->generation != ghc->generation) 1449 kvm_gfn_to_hva_cache_init(kvm, ghc, ghc->gpa); 1450 1451 if (kvm_is_error_hva(ghc->hva)) 1452 return -EFAULT; 1453 1454 r = __copy_from_user(data, (void __user *)ghc->hva, len); 1455 if (r) 1456 return -EFAULT; 1457 1458 return 0; 1459 } 1460 EXPORT_SYMBOL_GPL(kvm_read_guest_cached); 1461 1462 int kvm_clear_guest_page(struct kvm *kvm, gfn_t gfn, int offset, int len) 1463 { 1464 return kvm_write_guest_page(kvm, gfn, (const void *) empty_zero_page, 1465 offset, len); 1466 } 1467 EXPORT_SYMBOL_GPL(kvm_clear_guest_page); 1468 1469 int kvm_clear_guest(struct kvm *kvm, gpa_t gpa, unsigned long len) 1470 { 1471 gfn_t gfn = gpa >> PAGE_SHIFT; 1472 int seg; 1473 int offset = offset_in_page(gpa); 1474 int ret; 1475 1476 while ((seg = next_segment(len, offset)) != 0) { 1477 ret = kvm_clear_guest_page(kvm, gfn, offset, seg); 1478 if (ret < 0) 1479 return ret; 1480 offset = 0; 1481 len -= seg; 1482 ++gfn; 1483 } 1484 return 0; 1485 } 1486 EXPORT_SYMBOL_GPL(kvm_clear_guest); 1487 1488 void mark_page_dirty_in_slot(struct kvm *kvm, struct kvm_memory_slot *memslot, 1489 gfn_t gfn) 1490 { 1491 if (memslot && memslot->dirty_bitmap) { 1492 unsigned long rel_gfn = gfn - memslot->base_gfn; 1493 1494 __set_bit_le(rel_gfn, memslot->dirty_bitmap); 1495 } 1496 } 1497 1498 void mark_page_dirty(struct kvm *kvm, gfn_t gfn) 1499 { 1500 struct kvm_memory_slot *memslot; 1501 1502 memslot = gfn_to_memslot(kvm, gfn); 1503 mark_page_dirty_in_slot(kvm, memslot, gfn); 1504 } 1505 1506 /* 1507 * The vCPU has executed a HLT instruction with in-kernel mode enabled. 
1508 */ 1509 void kvm_vcpu_block(struct kvm_vcpu *vcpu) 1510 { 1511 DEFINE_WAIT(wait); 1512 1513 for (;;) { 1514 prepare_to_wait(&vcpu->wq, &wait, TASK_INTERRUPTIBLE); 1515 1516 if (kvm_arch_vcpu_runnable(vcpu)) { 1517 kvm_make_request(KVM_REQ_UNHALT, vcpu); 1518 break; 1519 } 1520 if (kvm_cpu_has_pending_timer(vcpu)) 1521 break; 1522 if (signal_pending(current)) 1523 break; 1524 1525 schedule(); 1526 } 1527 1528 finish_wait(&vcpu->wq, &wait); 1529 } 1530 1531 void kvm_resched(struct kvm_vcpu *vcpu) 1532 { 1533 if (!need_resched()) 1534 return; 1535 cond_resched(); 1536 } 1537 EXPORT_SYMBOL_GPL(kvm_resched); 1538 1539 void kvm_vcpu_on_spin(struct kvm_vcpu *me) 1540 { 1541 struct kvm *kvm = me->kvm; 1542 struct kvm_vcpu *vcpu; 1543 int last_boosted_vcpu = me->kvm->last_boosted_vcpu; 1544 int yielded = 0; 1545 int pass; 1546 int i; 1547 1548 /* 1549 * We boost the priority of a VCPU that is runnable but not 1550 * currently running, because it got preempted by something 1551 * else and called schedule in __vcpu_run. Hopefully that 1552 * VCPU is holding the lock that we need and will release it. 1553 * We approximate round-robin by starting at the last boosted VCPU. 1554 */ 1555 for (pass = 0; pass < 2 && !yielded; pass++) { 1556 kvm_for_each_vcpu(i, vcpu, kvm) { 1557 struct task_struct *task = NULL; 1558 struct pid *pid; 1559 if (!pass && i < last_boosted_vcpu) { 1560 i = last_boosted_vcpu; 1561 continue; 1562 } else if (pass && i > last_boosted_vcpu) 1563 break; 1564 if (vcpu == me) 1565 continue; 1566 if (waitqueue_active(&vcpu->wq)) 1567 continue; 1568 rcu_read_lock(); 1569 pid = rcu_dereference(vcpu->pid); 1570 if (pid) 1571 task = get_pid_task(vcpu->pid, PIDTYPE_PID); 1572 rcu_read_unlock(); 1573 if (!task) 1574 continue; 1575 if (task->flags & PF_VCPU) { 1576 put_task_struct(task); 1577 continue; 1578 } 1579 if (yield_to(task, 1)) { 1580 put_task_struct(task); 1581 kvm->last_boosted_vcpu = i; 1582 yielded = 1; 1583 break; 1584 } 1585 put_task_struct(task); 1586 } 1587 } 1588 } 1589 EXPORT_SYMBOL_GPL(kvm_vcpu_on_spin); 1590 1591 static int kvm_vcpu_fault(struct vm_area_struct *vma, struct vm_fault *vmf) 1592 { 1593 struct kvm_vcpu *vcpu = vma->vm_file->private_data; 1594 struct page *page; 1595 1596 if (vmf->pgoff == 0) 1597 page = virt_to_page(vcpu->run); 1598 #ifdef CONFIG_X86 1599 else if (vmf->pgoff == KVM_PIO_PAGE_OFFSET) 1600 page = virt_to_page(vcpu->arch.pio_data); 1601 #endif 1602 #ifdef KVM_COALESCED_MMIO_PAGE_OFFSET 1603 else if (vmf->pgoff == KVM_COALESCED_MMIO_PAGE_OFFSET) 1604 page = virt_to_page(vcpu->kvm->coalesced_mmio_ring); 1605 #endif 1606 else 1607 return VM_FAULT_SIGBUS; 1608 get_page(page); 1609 vmf->page = page; 1610 return 0; 1611 } 1612 1613 static const struct vm_operations_struct kvm_vcpu_vm_ops = { 1614 .fault = kvm_vcpu_fault, 1615 }; 1616 1617 static int kvm_vcpu_mmap(struct file *file, struct vm_area_struct *vma) 1618 { 1619 vma->vm_ops = &kvm_vcpu_vm_ops; 1620 return 0; 1621 } 1622 1623 static int kvm_vcpu_release(struct inode *inode, struct file *filp) 1624 { 1625 struct kvm_vcpu *vcpu = filp->private_data; 1626 1627 kvm_put_kvm(vcpu->kvm); 1628 return 0; 1629 } 1630 1631 static struct file_operations kvm_vcpu_fops = { 1632 .release = kvm_vcpu_release, 1633 .unlocked_ioctl = kvm_vcpu_ioctl, 1634 #ifdef CONFIG_COMPAT 1635 .compat_ioctl = kvm_vcpu_compat_ioctl, 1636 #endif 1637 .mmap = kvm_vcpu_mmap, 1638 .llseek = noop_llseek, 1639 }; 1640 1641 /* 1642 * Allocates an inode for the vcpu. 
1643 */ 1644 static int create_vcpu_fd(struct kvm_vcpu *vcpu) 1645 { 1646 return anon_inode_getfd("kvm-vcpu", &kvm_vcpu_fops, vcpu, O_RDWR); 1647 } 1648 1649 /* 1650 * Creates some virtual cpus. Good luck creating more than one. 1651 */ 1652 static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, u32 id) 1653 { 1654 int r; 1655 struct kvm_vcpu *vcpu, *v; 1656 1657 vcpu = kvm_arch_vcpu_create(kvm, id); 1658 if (IS_ERR(vcpu)) 1659 return PTR_ERR(vcpu); 1660 1661 preempt_notifier_init(&vcpu->preempt_notifier, &kvm_preempt_ops); 1662 1663 r = kvm_arch_vcpu_setup(vcpu); 1664 if (r) 1665 goto vcpu_destroy; 1666 1667 mutex_lock(&kvm->lock); 1668 if (atomic_read(&kvm->online_vcpus) == KVM_MAX_VCPUS) { 1669 r = -EINVAL; 1670 goto unlock_vcpu_destroy; 1671 } 1672 1673 kvm_for_each_vcpu(r, v, kvm) 1674 if (v->vcpu_id == id) { 1675 r = -EEXIST; 1676 goto unlock_vcpu_destroy; 1677 } 1678 1679 BUG_ON(kvm->vcpus[atomic_read(&kvm->online_vcpus)]); 1680 1681 /* Now it's all set up, let userspace reach it */ 1682 kvm_get_kvm(kvm); 1683 r = create_vcpu_fd(vcpu); 1684 if (r < 0) { 1685 kvm_put_kvm(kvm); 1686 goto unlock_vcpu_destroy; 1687 } 1688 1689 kvm->vcpus[atomic_read(&kvm->online_vcpus)] = vcpu; 1690 smp_wmb(); 1691 atomic_inc(&kvm->online_vcpus); 1692 1693 #ifdef CONFIG_KVM_APIC_ARCHITECTURE 1694 if (kvm->bsp_vcpu_id == id) 1695 kvm->bsp_vcpu = vcpu; 1696 #endif 1697 mutex_unlock(&kvm->lock); 1698 return r; 1699 1700 unlock_vcpu_destroy: 1701 mutex_unlock(&kvm->lock); 1702 vcpu_destroy: 1703 kvm_arch_vcpu_destroy(vcpu); 1704 return r; 1705 } 1706 1707 static int kvm_vcpu_ioctl_set_sigmask(struct kvm_vcpu *vcpu, sigset_t *sigset) 1708 { 1709 if (sigset) { 1710 sigdelsetmask(sigset, sigmask(SIGKILL)|sigmask(SIGSTOP)); 1711 vcpu->sigset_active = 1; 1712 vcpu->sigset = *sigset; 1713 } else 1714 vcpu->sigset_active = 0; 1715 return 0; 1716 } 1717 1718 static long kvm_vcpu_ioctl(struct file *filp, 1719 unsigned int ioctl, unsigned long arg) 1720 { 1721 struct kvm_vcpu *vcpu = filp->private_data; 1722 void __user *argp = (void __user *)arg; 1723 int r; 1724 struct kvm_fpu *fpu = NULL; 1725 struct kvm_sregs *kvm_sregs = NULL; 1726 1727 if (vcpu->kvm->mm != current->mm) 1728 return -EIO; 1729 1730 #if defined(CONFIG_S390) || defined(CONFIG_PPC) 1731 /* 1732 * Special cases: vcpu ioctls that are asynchronous to vcpu execution, 1733 * so vcpu_load() would break it. 
1734 */ 1735 if (ioctl == KVM_S390_INTERRUPT || ioctl == KVM_INTERRUPT) 1736 return kvm_arch_vcpu_ioctl(filp, ioctl, arg); 1737 #endif 1738 1739 1740 vcpu_load(vcpu); 1741 switch (ioctl) { 1742 case KVM_RUN: 1743 r = -EINVAL; 1744 if (arg) 1745 goto out; 1746 r = kvm_arch_vcpu_ioctl_run(vcpu, vcpu->run); 1747 trace_kvm_userspace_exit(vcpu->run->exit_reason, r); 1748 break; 1749 case KVM_GET_REGS: { 1750 struct kvm_regs *kvm_regs; 1751 1752 r = -ENOMEM; 1753 kvm_regs = kzalloc(sizeof(struct kvm_regs), GFP_KERNEL); 1754 if (!kvm_regs) 1755 goto out; 1756 r = kvm_arch_vcpu_ioctl_get_regs(vcpu, kvm_regs); 1757 if (r) 1758 goto out_free1; 1759 r = -EFAULT; 1760 if (copy_to_user(argp, kvm_regs, sizeof(struct kvm_regs))) 1761 goto out_free1; 1762 r = 0; 1763 out_free1: 1764 kfree(kvm_regs); 1765 break; 1766 } 1767 case KVM_SET_REGS: { 1768 struct kvm_regs *kvm_regs; 1769 1770 r = -ENOMEM; 1771 kvm_regs = kzalloc(sizeof(struct kvm_regs), GFP_KERNEL); 1772 if (!kvm_regs) 1773 goto out; 1774 r = -EFAULT; 1775 if (copy_from_user(kvm_regs, argp, sizeof(struct kvm_regs))) 1776 goto out_free2; 1777 r = kvm_arch_vcpu_ioctl_set_regs(vcpu, kvm_regs); 1778 if (r) 1779 goto out_free2; 1780 r = 0; 1781 out_free2: 1782 kfree(kvm_regs); 1783 break; 1784 } 1785 case KVM_GET_SREGS: { 1786 kvm_sregs = kzalloc(sizeof(struct kvm_sregs), GFP_KERNEL); 1787 r = -ENOMEM; 1788 if (!kvm_sregs) 1789 goto out; 1790 r = kvm_arch_vcpu_ioctl_get_sregs(vcpu, kvm_sregs); 1791 if (r) 1792 goto out; 1793 r = -EFAULT; 1794 if (copy_to_user(argp, kvm_sregs, sizeof(struct kvm_sregs))) 1795 goto out; 1796 r = 0; 1797 break; 1798 } 1799 case KVM_SET_SREGS: { 1800 kvm_sregs = kmalloc(sizeof(struct kvm_sregs), GFP_KERNEL); 1801 r = -ENOMEM; 1802 if (!kvm_sregs) 1803 goto out; 1804 r = -EFAULT; 1805 if (copy_from_user(kvm_sregs, argp, sizeof(struct kvm_sregs))) 1806 goto out; 1807 r = kvm_arch_vcpu_ioctl_set_sregs(vcpu, kvm_sregs); 1808 if (r) 1809 goto out; 1810 r = 0; 1811 break; 1812 } 1813 case KVM_GET_MP_STATE: { 1814 struct kvm_mp_state mp_state; 1815 1816 r = kvm_arch_vcpu_ioctl_get_mpstate(vcpu, &mp_state); 1817 if (r) 1818 goto out; 1819 r = -EFAULT; 1820 if (copy_to_user(argp, &mp_state, sizeof mp_state)) 1821 goto out; 1822 r = 0; 1823 break; 1824 } 1825 case KVM_SET_MP_STATE: { 1826 struct kvm_mp_state mp_state; 1827 1828 r = -EFAULT; 1829 if (copy_from_user(&mp_state, argp, sizeof mp_state)) 1830 goto out; 1831 r = kvm_arch_vcpu_ioctl_set_mpstate(vcpu, &mp_state); 1832 if (r) 1833 goto out; 1834 r = 0; 1835 break; 1836 } 1837 case KVM_TRANSLATE: { 1838 struct kvm_translation tr; 1839 1840 r = -EFAULT; 1841 if (copy_from_user(&tr, argp, sizeof tr)) 1842 goto out; 1843 r = kvm_arch_vcpu_ioctl_translate(vcpu, &tr); 1844 if (r) 1845 goto out; 1846 r = -EFAULT; 1847 if (copy_to_user(argp, &tr, sizeof tr)) 1848 goto out; 1849 r = 0; 1850 break; 1851 } 1852 case KVM_SET_GUEST_DEBUG: { 1853 struct kvm_guest_debug dbg; 1854 1855 r = -EFAULT; 1856 if (copy_from_user(&dbg, argp, sizeof dbg)) 1857 goto out; 1858 r = kvm_arch_vcpu_ioctl_set_guest_debug(vcpu, &dbg); 1859 if (r) 1860 goto out; 1861 r = 0; 1862 break; 1863 } 1864 case KVM_SET_SIGNAL_MASK: { 1865 struct kvm_signal_mask __user *sigmask_arg = argp; 1866 struct kvm_signal_mask kvm_sigmask; 1867 sigset_t sigset, *p; 1868 1869 p = NULL; 1870 if (argp) { 1871 r = -EFAULT; 1872 if (copy_from_user(&kvm_sigmask, argp, 1873 sizeof kvm_sigmask)) 1874 goto out; 1875 r = -EINVAL; 1876 if (kvm_sigmask.len != sizeof sigset) 1877 goto out; 1878 r = -EFAULT; 1879 if (copy_from_user(&sigset, 
sigmask_arg->sigset, 1880 sizeof sigset)) 1881 goto out; 1882 p = &sigset; 1883 } 1884 r = kvm_vcpu_ioctl_set_sigmask(vcpu, p); 1885 break; 1886 } 1887 case KVM_GET_FPU: { 1888 fpu = kzalloc(sizeof(struct kvm_fpu), GFP_KERNEL); 1889 r = -ENOMEM; 1890 if (!fpu) 1891 goto out; 1892 r = kvm_arch_vcpu_ioctl_get_fpu(vcpu, fpu); 1893 if (r) 1894 goto out; 1895 r = -EFAULT; 1896 if (copy_to_user(argp, fpu, sizeof(struct kvm_fpu))) 1897 goto out; 1898 r = 0; 1899 break; 1900 } 1901 case KVM_SET_FPU: { 1902 fpu = kmalloc(sizeof(struct kvm_fpu), GFP_KERNEL); 1903 r = -ENOMEM; 1904 if (!fpu) 1905 goto out; 1906 r = -EFAULT; 1907 if (copy_from_user(fpu, argp, sizeof(struct kvm_fpu))) 1908 goto out; 1909 r = kvm_arch_vcpu_ioctl_set_fpu(vcpu, fpu); 1910 if (r) 1911 goto out; 1912 r = 0; 1913 break; 1914 } 1915 default: 1916 r = kvm_arch_vcpu_ioctl(filp, ioctl, arg); 1917 } 1918 out: 1919 vcpu_put(vcpu); 1920 kfree(fpu); 1921 kfree(kvm_sregs); 1922 return r; 1923 } 1924 1925 #ifdef CONFIG_COMPAT 1926 static long kvm_vcpu_compat_ioctl(struct file *filp, 1927 unsigned int ioctl, unsigned long arg) 1928 { 1929 struct kvm_vcpu *vcpu = filp->private_data; 1930 void __user *argp = compat_ptr(arg); 1931 int r; 1932 1933 if (vcpu->kvm->mm != current->mm) 1934 return -EIO; 1935 1936 switch (ioctl) { 1937 case KVM_SET_SIGNAL_MASK: { 1938 struct kvm_signal_mask __user *sigmask_arg = argp; 1939 struct kvm_signal_mask kvm_sigmask; 1940 compat_sigset_t csigset; 1941 sigset_t sigset; 1942 1943 if (argp) { 1944 r = -EFAULT; 1945 if (copy_from_user(&kvm_sigmask, argp, 1946 sizeof kvm_sigmask)) 1947 goto out; 1948 r = -EINVAL; 1949 if (kvm_sigmask.len != sizeof csigset) 1950 goto out; 1951 r = -EFAULT; 1952 if (copy_from_user(&csigset, sigmask_arg->sigset, 1953 sizeof csigset)) 1954 goto out; 1955 } 1956 sigset_from_compat(&sigset, &csigset); 1957 r = kvm_vcpu_ioctl_set_sigmask(vcpu, &sigset); 1958 break; 1959 } 1960 default: 1961 r = kvm_vcpu_ioctl(filp, ioctl, arg); 1962 } 1963 1964 out: 1965 return r; 1966 } 1967 #endif 1968 1969 static long kvm_vm_ioctl(struct file *filp, 1970 unsigned int ioctl, unsigned long arg) 1971 { 1972 struct kvm *kvm = filp->private_data; 1973 void __user *argp = (void __user *)arg; 1974 int r; 1975 1976 if (kvm->mm != current->mm) 1977 return -EIO; 1978 switch (ioctl) { 1979 case KVM_CREATE_VCPU: 1980 r = kvm_vm_ioctl_create_vcpu(kvm, arg); 1981 if (r < 0) 1982 goto out; 1983 break; 1984 case KVM_SET_USER_MEMORY_REGION: { 1985 struct kvm_userspace_memory_region kvm_userspace_mem; 1986 1987 r = -EFAULT; 1988 if (copy_from_user(&kvm_userspace_mem, argp, 1989 sizeof kvm_userspace_mem)) 1990 goto out; 1991 1992 r = kvm_vm_ioctl_set_memory_region(kvm, &kvm_userspace_mem, 1); 1993 if (r) 1994 goto out; 1995 break; 1996 } 1997 case KVM_GET_DIRTY_LOG: { 1998 struct kvm_dirty_log log; 1999 2000 r = -EFAULT; 2001 if (copy_from_user(&log, argp, sizeof log)) 2002 goto out; 2003 r = kvm_vm_ioctl_get_dirty_log(kvm, &log); 2004 if (r) 2005 goto out; 2006 break; 2007 } 2008 #ifdef KVM_COALESCED_MMIO_PAGE_OFFSET 2009 case KVM_REGISTER_COALESCED_MMIO: { 2010 struct kvm_coalesced_mmio_zone zone; 2011 r = -EFAULT; 2012 if (copy_from_user(&zone, argp, sizeof zone)) 2013 goto out; 2014 r = kvm_vm_ioctl_register_coalesced_mmio(kvm, &zone); 2015 if (r) 2016 goto out; 2017 r = 0; 2018 break; 2019 } 2020 case KVM_UNREGISTER_COALESCED_MMIO: { 2021 struct kvm_coalesced_mmio_zone zone; 2022 r = -EFAULT; 2023 if (copy_from_user(&zone, argp, sizeof zone)) 2024 goto out; 2025 r = kvm_vm_ioctl_unregister_coalesced_mmio(kvm, 
&zone); 2026 if (r) 2027 goto out; 2028 r = 0; 2029 break; 2030 } 2031 #endif 2032 case KVM_IRQFD: { 2033 struct kvm_irqfd data; 2034 2035 r = -EFAULT; 2036 if (copy_from_user(&data, argp, sizeof data)) 2037 goto out; 2038 r = kvm_irqfd(kvm, data.fd, data.gsi, data.flags); 2039 break; 2040 } 2041 case KVM_IOEVENTFD: { 2042 struct kvm_ioeventfd data; 2043 2044 r = -EFAULT; 2045 if (copy_from_user(&data, argp, sizeof data)) 2046 goto out; 2047 r = kvm_ioeventfd(kvm, &data); 2048 break; 2049 } 2050 #ifdef CONFIG_KVM_APIC_ARCHITECTURE 2051 case KVM_SET_BOOT_CPU_ID: 2052 r = 0; 2053 mutex_lock(&kvm->lock); 2054 if (atomic_read(&kvm->online_vcpus) != 0) 2055 r = -EBUSY; 2056 else 2057 kvm->bsp_vcpu_id = arg; 2058 mutex_unlock(&kvm->lock); 2059 break; 2060 #endif 2061 default: 2062 r = kvm_arch_vm_ioctl(filp, ioctl, arg); 2063 if (r == -ENOTTY) 2064 r = kvm_vm_ioctl_assigned_device(kvm, ioctl, arg); 2065 } 2066 out: 2067 return r; 2068 } 2069 2070 #ifdef CONFIG_COMPAT 2071 struct compat_kvm_dirty_log { 2072 __u32 slot; 2073 __u32 padding1; 2074 union { 2075 compat_uptr_t dirty_bitmap; /* one bit per page */ 2076 __u64 padding2; 2077 }; 2078 }; 2079 2080 static long kvm_vm_compat_ioctl(struct file *filp, 2081 unsigned int ioctl, unsigned long arg) 2082 { 2083 struct kvm *kvm = filp->private_data; 2084 int r; 2085 2086 if (kvm->mm != current->mm) 2087 return -EIO; 2088 switch (ioctl) { 2089 case KVM_GET_DIRTY_LOG: { 2090 struct compat_kvm_dirty_log compat_log; 2091 struct kvm_dirty_log log; 2092 2093 r = -EFAULT; 2094 if (copy_from_user(&compat_log, (void __user *)arg, 2095 sizeof(compat_log))) 2096 goto out; 2097 log.slot = compat_log.slot; 2098 log.padding1 = compat_log.padding1; 2099 log.padding2 = compat_log.padding2; 2100 log.dirty_bitmap = compat_ptr(compat_log.dirty_bitmap); 2101 2102 r = kvm_vm_ioctl_get_dirty_log(kvm, &log); 2103 if (r) 2104 goto out; 2105 break; 2106 } 2107 default: 2108 r = kvm_vm_ioctl(filp, ioctl, arg); 2109 } 2110 2111 out: 2112 return r; 2113 } 2114 #endif 2115 2116 static int kvm_vm_fault(struct vm_area_struct *vma, struct vm_fault *vmf) 2117 { 2118 struct page *page[1]; 2119 unsigned long addr; 2120 int npages; 2121 gfn_t gfn = vmf->pgoff; 2122 struct kvm *kvm = vma->vm_file->private_data; 2123 2124 addr = gfn_to_hva(kvm, gfn); 2125 if (kvm_is_error_hva(addr)) 2126 return VM_FAULT_SIGBUS; 2127 2128 npages = get_user_pages(current, current->mm, addr, 1, 1, 0, page, 2129 NULL); 2130 if (unlikely(npages != 1)) 2131 return VM_FAULT_SIGBUS; 2132 2133 vmf->page = page[0]; 2134 return 0; 2135 } 2136 2137 static const struct vm_operations_struct kvm_vm_vm_ops = { 2138 .fault = kvm_vm_fault, 2139 }; 2140 2141 static int kvm_vm_mmap(struct file *file, struct vm_area_struct *vma) 2142 { 2143 vma->vm_ops = &kvm_vm_vm_ops; 2144 return 0; 2145 } 2146 2147 static struct file_operations kvm_vm_fops = { 2148 .release = kvm_vm_release, 2149 .unlocked_ioctl = kvm_vm_ioctl, 2150 #ifdef CONFIG_COMPAT 2151 .compat_ioctl = kvm_vm_compat_ioctl, 2152 #endif 2153 .mmap = kvm_vm_mmap, 2154 .llseek = noop_llseek, 2155 }; 2156 2157 static int kvm_dev_ioctl_create_vm(void) 2158 { 2159 int r; 2160 struct kvm *kvm; 2161 2162 kvm = kvm_create_vm(); 2163 if (IS_ERR(kvm)) 2164 return PTR_ERR(kvm); 2165 #ifdef KVM_COALESCED_MMIO_PAGE_OFFSET 2166 r = kvm_coalesced_mmio_init(kvm); 2167 if (r < 0) { 2168 kvm_put_kvm(kvm); 2169 return r; 2170 } 2171 #endif 2172 r = anon_inode_getfd("kvm-vm", &kvm_vm_fops, kvm, O_RDWR); 2173 if (r < 0) 2174 kvm_put_kvm(kvm); 2175 2176 return r; 2177 } 2178 2179 static 
long kvm_dev_ioctl_check_extension_generic(long arg) 2180 { 2181 switch (arg) { 2182 case KVM_CAP_USER_MEMORY: 2183 case KVM_CAP_DESTROY_MEMORY_REGION_WORKS: 2184 case KVM_CAP_JOIN_MEMORY_REGIONS_WORKS: 2185 #ifdef CONFIG_KVM_APIC_ARCHITECTURE 2186 case KVM_CAP_SET_BOOT_CPU_ID: 2187 #endif 2188 case KVM_CAP_INTERNAL_ERROR_DATA: 2189 return 1; 2190 #ifdef CONFIG_HAVE_KVM_IRQCHIP 2191 case KVM_CAP_IRQ_ROUTING: 2192 return KVM_MAX_IRQ_ROUTES; 2193 #endif 2194 default: 2195 break; 2196 } 2197 return kvm_dev_ioctl_check_extension(arg); 2198 } 2199 2200 static long kvm_dev_ioctl(struct file *filp, 2201 unsigned int ioctl, unsigned long arg) 2202 { 2203 long r = -EINVAL; 2204 2205 switch (ioctl) { 2206 case KVM_GET_API_VERSION: 2207 r = -EINVAL; 2208 if (arg) 2209 goto out; 2210 r = KVM_API_VERSION; 2211 break; 2212 case KVM_CREATE_VM: 2213 r = -EINVAL; 2214 if (arg) 2215 goto out; 2216 r = kvm_dev_ioctl_create_vm(); 2217 break; 2218 case KVM_CHECK_EXTENSION: 2219 r = kvm_dev_ioctl_check_extension_generic(arg); 2220 break; 2221 case KVM_GET_VCPU_MMAP_SIZE: 2222 r = -EINVAL; 2223 if (arg) 2224 goto out; 2225 r = PAGE_SIZE; /* struct kvm_run */ 2226 #ifdef CONFIG_X86 2227 r += PAGE_SIZE; /* pio data page */ 2228 #endif 2229 #ifdef KVM_COALESCED_MMIO_PAGE_OFFSET 2230 r += PAGE_SIZE; /* coalesced mmio ring page */ 2231 #endif 2232 break; 2233 case KVM_TRACE_ENABLE: 2234 case KVM_TRACE_PAUSE: 2235 case KVM_TRACE_DISABLE: 2236 r = -EOPNOTSUPP; 2237 break; 2238 default: 2239 return kvm_arch_dev_ioctl(filp, ioctl, arg); 2240 } 2241 out: 2242 return r; 2243 } 2244 2245 static struct file_operations kvm_chardev_ops = { 2246 .unlocked_ioctl = kvm_dev_ioctl, 2247 .compat_ioctl = kvm_dev_ioctl, 2248 .llseek = noop_llseek, 2249 }; 2250 2251 static struct miscdevice kvm_dev = { 2252 KVM_MINOR, 2253 "kvm", 2254 &kvm_chardev_ops, 2255 }; 2256 2257 static void hardware_enable_nolock(void *junk) 2258 { 2259 int cpu = raw_smp_processor_id(); 2260 int r; 2261 2262 if (cpumask_test_cpu(cpu, cpus_hardware_enabled)) 2263 return; 2264 2265 cpumask_set_cpu(cpu, cpus_hardware_enabled); 2266 2267 r = kvm_arch_hardware_enable(NULL); 2268 2269 if (r) { 2270 cpumask_clear_cpu(cpu, cpus_hardware_enabled); 2271 atomic_inc(&hardware_enable_failed); 2272 printk(KERN_INFO "kvm: enabling virtualization on " 2273 "CPU%d failed\n", cpu); 2274 } 2275 } 2276 2277 static void hardware_enable(void *junk) 2278 { 2279 raw_spin_lock(&kvm_lock); 2280 hardware_enable_nolock(junk); 2281 raw_spin_unlock(&kvm_lock); 2282 } 2283 2284 static void hardware_disable_nolock(void *junk) 2285 { 2286 int cpu = raw_smp_processor_id(); 2287 2288 if (!cpumask_test_cpu(cpu, cpus_hardware_enabled)) 2289 return; 2290 cpumask_clear_cpu(cpu, cpus_hardware_enabled); 2291 kvm_arch_hardware_disable(NULL); 2292 } 2293 2294 static void hardware_disable(void *junk) 2295 { 2296 raw_spin_lock(&kvm_lock); 2297 hardware_disable_nolock(junk); 2298 raw_spin_unlock(&kvm_lock); 2299 } 2300 2301 static void hardware_disable_all_nolock(void) 2302 { 2303 BUG_ON(!kvm_usage_count); 2304 2305 kvm_usage_count--; 2306 if (!kvm_usage_count) 2307 on_each_cpu(hardware_disable_nolock, NULL, 1); 2308 } 2309 2310 static void hardware_disable_all(void) 2311 { 2312 raw_spin_lock(&kvm_lock); 2313 hardware_disable_all_nolock(); 2314 raw_spin_unlock(&kvm_lock); 2315 } 2316 2317 static int hardware_enable_all(void) 2318 { 2319 int r = 0; 2320 2321 raw_spin_lock(&kvm_lock); 2322 2323 kvm_usage_count++; 2324 if (kvm_usage_count == 1) { 2325 atomic_set(&hardware_enable_failed, 0); 2326 
static int hardware_enable_all(void)
{
	int r = 0;

	raw_spin_lock(&kvm_lock);

	kvm_usage_count++;
	if (kvm_usage_count == 1) {
		atomic_set(&hardware_enable_failed, 0);
		on_each_cpu(hardware_enable_nolock, NULL, 1);

		if (atomic_read(&hardware_enable_failed)) {
			hardware_disable_all_nolock();
			r = -EBUSY;
		}
	}

	raw_spin_unlock(&kvm_lock);

	return r;
}

static int kvm_cpu_hotplug(struct notifier_block *notifier, unsigned long val,
			   void *v)
{
	int cpu = (long)v;

	if (!kvm_usage_count)
		return NOTIFY_OK;

	val &= ~CPU_TASKS_FROZEN;
	switch (val) {
	case CPU_DYING:
		printk(KERN_INFO "kvm: disabling virtualization on CPU%d\n",
		       cpu);
		hardware_disable(NULL);
		break;
	case CPU_STARTING:
		printk(KERN_INFO "kvm: enabling virtualization on CPU%d\n",
		       cpu);
		hardware_enable(NULL);
		break;
	}
	return NOTIFY_OK;
}

asmlinkage void kvm_spurious_fault(void)
{
	/* Fault while not rebooting.  We want the trace. */
	BUG();
}
EXPORT_SYMBOL_GPL(kvm_spurious_fault);

static int kvm_reboot(struct notifier_block *notifier, unsigned long val,
		      void *v)
{
	/*
	 * Some (well, at least mine) BIOSes hang on reboot if
	 * in vmx root mode.
	 *
	 * And Intel TXT requires VMX to be off on all CPUs when the
	 * system shuts down.
	 */
	printk(KERN_INFO "kvm: exiting hardware virtualization\n");
	kvm_rebooting = true;
	on_each_cpu(hardware_disable_nolock, NULL, 1);
	return NOTIFY_OK;
}

static struct notifier_block kvm_reboot_notifier = {
	.notifier_call = kvm_reboot,
	.priority = 0,
};

static void kvm_io_bus_destroy(struct kvm_io_bus *bus)
{
	int i;

	for (i = 0; i < bus->dev_count; i++) {
		struct kvm_io_device *pos = bus->range[i].dev;

		kvm_iodevice_destructor(pos);
	}
	kfree(bus);
}

int kvm_io_bus_sort_cmp(const void *p1, const void *p2)
{
	const struct kvm_io_range *r1 = p1;
	const struct kvm_io_range *r2 = p2;

	if (r1->addr < r2->addr)
		return -1;
	if (r1->addr + r1->len > r2->addr + r2->len)
		return 1;
	return 0;
}

int kvm_io_bus_insert_dev(struct kvm_io_bus *bus, struct kvm_io_device *dev,
			  gpa_t addr, int len)
{
	if (bus->dev_count == NR_IOBUS_DEVS)
		return -ENOSPC;

	bus->range[bus->dev_count++] = (struct kvm_io_range) {
		.addr = addr,
		.len = len,
		.dev = dev,
	};

	sort(bus->range, bus->dev_count, sizeof(struct kvm_io_range),
	     kvm_io_bus_sort_cmp, NULL);

	return 0;
}

int kvm_io_bus_get_first_dev(struct kvm_io_bus *bus,
			     gpa_t addr, int len)
{
	struct kvm_io_range *range, key;
	int off;

	key = (struct kvm_io_range) {
		.addr = addr,
		.len = len,
	};

	range = bsearch(&key, bus->range, bus->dev_count,
			sizeof(struct kvm_io_range), kvm_io_bus_sort_cmp);
	if (range == NULL)
		return -ENOENT;

	off = range - bus->range;

	while (off > 0 && kvm_io_bus_sort_cmp(&key, &bus->range[off-1]) == 0)
		off--;

	return off;
}
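/*
 * Dispatch on an I/O bus works on the sorted bus->range[] array built by
 * kvm_io_bus_insert_dev() above.  kvm_io_bus_sort_cmp() returns 0 when its
 * first range is fully contained in its second, so kvm_io_bus_get_first_dev()
 * can bsearch for any device whose range covers the access and then walk back
 * to the first such entry; the read/write helpers below try each matching
 * device in order until one accepts the access.  For example, a one-byte
 * access at 0x3f9 matches a device registered for the eight bytes starting
 * at 0x3f8.
 */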
/* kvm_io_bus_write - called under kvm->slots_lock */
int kvm_io_bus_write(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr,
		     int len, const void *val)
{
	int idx;
	struct kvm_io_bus *bus;
	struct kvm_io_range range;

	range = (struct kvm_io_range) {
		.addr = addr,
		.len = len,
	};

	bus = srcu_dereference(kvm->buses[bus_idx], &kvm->srcu);
	idx = kvm_io_bus_get_first_dev(bus, addr, len);
	if (idx < 0)
		return -EOPNOTSUPP;

	while (idx < bus->dev_count &&
		kvm_io_bus_sort_cmp(&range, &bus->range[idx]) == 0) {
		if (!kvm_iodevice_write(bus->range[idx].dev, addr, len, val))
			return 0;
		idx++;
	}

	return -EOPNOTSUPP;
}

/* kvm_io_bus_read - called under kvm->slots_lock */
int kvm_io_bus_read(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr,
		    int len, void *val)
{
	int idx;
	struct kvm_io_bus *bus;
	struct kvm_io_range range;

	range = (struct kvm_io_range) {
		.addr = addr,
		.len = len,
	};

	bus = srcu_dereference(kvm->buses[bus_idx], &kvm->srcu);
	idx = kvm_io_bus_get_first_dev(bus, addr, len);
	if (idx < 0)
		return -EOPNOTSUPP;

	while (idx < bus->dev_count &&
		kvm_io_bus_sort_cmp(&range, &bus->range[idx]) == 0) {
		if (!kvm_iodevice_read(bus->range[idx].dev, addr, len, val))
			return 0;
		idx++;
	}

	return -EOPNOTSUPP;
}

/* Caller must hold slots_lock. */
int kvm_io_bus_register_dev(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr,
			    int len, struct kvm_io_device *dev)
{
	struct kvm_io_bus *new_bus, *bus;

	bus = kvm->buses[bus_idx];
	if (bus->dev_count > NR_IOBUS_DEVS-1)
		return -ENOSPC;

	new_bus = kzalloc(sizeof(struct kvm_io_bus), GFP_KERNEL);
	if (!new_bus)
		return -ENOMEM;
	memcpy(new_bus, bus, sizeof(struct kvm_io_bus));
	kvm_io_bus_insert_dev(new_bus, dev, addr, len);
	rcu_assign_pointer(kvm->buses[bus_idx], new_bus);
	synchronize_srcu_expedited(&kvm->srcu);
	kfree(bus);

	return 0;
}

/* Caller must hold slots_lock. */
int kvm_io_bus_unregister_dev(struct kvm *kvm, enum kvm_bus bus_idx,
			      struct kvm_io_device *dev)
{
	int i, r;
	struct kvm_io_bus *new_bus, *bus;

	new_bus = kzalloc(sizeof(struct kvm_io_bus), GFP_KERNEL);
	if (!new_bus)
		return -ENOMEM;

	bus = kvm->buses[bus_idx];
	memcpy(new_bus, bus, sizeof(struct kvm_io_bus));

	r = -ENOENT;
	for (i = 0; i < new_bus->dev_count; i++)
		if (new_bus->range[i].dev == dev) {
			r = 0;
			new_bus->dev_count--;
			new_bus->range[i] = new_bus->range[new_bus->dev_count];
			sort(new_bus->range, new_bus->dev_count,
			     sizeof(struct kvm_io_range),
			     kvm_io_bus_sort_cmp, NULL);
			break;
		}

	if (r) {
		kfree(new_bus);
		return r;
	}

	rcu_assign_pointer(kvm->buses[bus_idx], new_bus);
	synchronize_srcu_expedited(&kvm->srcu);
	kfree(bus);
	return r;
}

static struct notifier_block kvm_cpu_notifier = {
	.notifier_call = kvm_cpu_hotplug,
};

static int vm_stat_get(void *_offset, u64 *val)
{
	unsigned offset = (long)_offset;
	struct kvm *kvm;

	*val = 0;
	raw_spin_lock(&kvm_lock);
	list_for_each_entry(kvm, &vm_list, vm_list)
		*val += *(u32 *)((void *)kvm + offset);
	raw_spin_unlock(&kvm_lock);
	return 0;
}

DEFINE_SIMPLE_ATTRIBUTE(vm_stat_fops, vm_stat_get, NULL, "%llu\n");
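/*
 * The debugfs statistics files are driven by debugfs_entries[]: each entry
 * records whether a counter lives in struct kvm or struct kvm_vcpu together
 * with its byte offset.  vm_stat_get() above and vcpu_stat_get() below simply
 * walk vm_list (and each VM's vcpus) under kvm_lock and sum the u32 counter
 * found at that offset, so reading e.g. kvm/remote_tlb_flush in debugfs
 * aggregates the value across every VM on the host.
 */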
static int vcpu_stat_get(void *_offset, u64 *val)
{
	unsigned offset = (long)_offset;
	struct kvm *kvm;
	struct kvm_vcpu *vcpu;
	int i;

	*val = 0;
	raw_spin_lock(&kvm_lock);
	list_for_each_entry(kvm, &vm_list, vm_list)
		kvm_for_each_vcpu(i, vcpu, kvm)
			*val += *(u32 *)((void *)vcpu + offset);

	raw_spin_unlock(&kvm_lock);
	return 0;
}

DEFINE_SIMPLE_ATTRIBUTE(vcpu_stat_fops, vcpu_stat_get, NULL, "%llu\n");

static const struct file_operations *stat_fops[] = {
	[KVM_STAT_VCPU] = &vcpu_stat_fops,
	[KVM_STAT_VM]   = &vm_stat_fops,
};

static void kvm_init_debug(void)
{
	struct kvm_stats_debugfs_item *p;

	kvm_debugfs_dir = debugfs_create_dir("kvm", NULL);
	for (p = debugfs_entries; p->name; ++p)
		p->dentry = debugfs_create_file(p->name, 0444, kvm_debugfs_dir,
						(void *)(long)p->offset,
						stat_fops[p->kind]);
}

static void kvm_exit_debug(void)
{
	struct kvm_stats_debugfs_item *p;

	for (p = debugfs_entries; p->name; ++p)
		debugfs_remove(p->dentry);
	debugfs_remove(kvm_debugfs_dir);
}

static int kvm_suspend(void)
{
	if (kvm_usage_count)
		hardware_disable_nolock(NULL);
	return 0;
}

static void kvm_resume(void)
{
	if (kvm_usage_count) {
		WARN_ON(raw_spin_is_locked(&kvm_lock));
		hardware_enable_nolock(NULL);
	}
}

static struct syscore_ops kvm_syscore_ops = {
	.suspend = kvm_suspend,
	.resume = kvm_resume,
};

struct page *bad_page;
pfn_t bad_pfn;

static inline
struct kvm_vcpu *preempt_notifier_to_vcpu(struct preempt_notifier *pn)
{
	return container_of(pn, struct kvm_vcpu, preempt_notifier);
}

static void kvm_sched_in(struct preempt_notifier *pn, int cpu)
{
	struct kvm_vcpu *vcpu = preempt_notifier_to_vcpu(pn);

	kvm_arch_vcpu_load(vcpu, cpu);
}

static void kvm_sched_out(struct preempt_notifier *pn,
			  struct task_struct *next)
{
	struct kvm_vcpu *vcpu = preempt_notifier_to_vcpu(pn);

	kvm_arch_vcpu_put(vcpu);
}
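/*
 * The two hooks above are wired into kvm_preempt_ops by kvm_init() below.
 * Once a vcpu thread has registered its preempt notifier, kvm_sched_out()
 * hands the guest state back to the architecture code whenever the thread is
 * scheduled out, and kvm_sched_in() reloads it on whichever CPU the thread
 * lands on next, so a vcpu can migrate between host CPUs transparently.
 */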
int kvm_init(void *opaque, unsigned vcpu_size, unsigned vcpu_align,
	     struct module *module)
{
	int r;
	int cpu;

	r = kvm_arch_init(opaque);
	if (r)
		goto out_fail;

	bad_page = alloc_page(GFP_KERNEL | __GFP_ZERO);

	if (bad_page == NULL) {
		r = -ENOMEM;
		goto out;
	}

	bad_pfn = page_to_pfn(bad_page);

	hwpoison_page = alloc_page(GFP_KERNEL | __GFP_ZERO);

	if (hwpoison_page == NULL) {
		r = -ENOMEM;
		goto out_free_0;
	}

	hwpoison_pfn = page_to_pfn(hwpoison_page);

	fault_page = alloc_page(GFP_KERNEL | __GFP_ZERO);

	if (fault_page == NULL) {
		r = -ENOMEM;
		goto out_free_0;
	}

	fault_pfn = page_to_pfn(fault_page);

	if (!zalloc_cpumask_var(&cpus_hardware_enabled, GFP_KERNEL)) {
		r = -ENOMEM;
		goto out_free_0;
	}

	r = kvm_arch_hardware_setup();
	if (r < 0)
		goto out_free_0a;

	for_each_online_cpu(cpu) {
		smp_call_function_single(cpu,
				kvm_arch_check_processor_compat,
				&r, 1);
		if (r < 0)
			goto out_free_1;
	}

	r = register_cpu_notifier(&kvm_cpu_notifier);
	if (r)
		goto out_free_2;
	register_reboot_notifier(&kvm_reboot_notifier);

	/* A kmem cache lets us meet the alignment requirements of fx_save. */
	if (!vcpu_align)
		vcpu_align = __alignof__(struct kvm_vcpu);
	kvm_vcpu_cache = kmem_cache_create("kvm_vcpu", vcpu_size, vcpu_align,
					   0, NULL);
	if (!kvm_vcpu_cache) {
		r = -ENOMEM;
		goto out_free_3;
	}

	r = kvm_async_pf_init();
	if (r)
		goto out_free;

	kvm_chardev_ops.owner = module;
	kvm_vm_fops.owner = module;
	kvm_vcpu_fops.owner = module;

	r = misc_register(&kvm_dev);
	if (r) {
		printk(KERN_ERR "kvm: misc device register failed\n");
		goto out_unreg;
	}

	register_syscore_ops(&kvm_syscore_ops);

	kvm_preempt_ops.sched_in = kvm_sched_in;
	kvm_preempt_ops.sched_out = kvm_sched_out;

	kvm_init_debug();

	return 0;

out_unreg:
	kvm_async_pf_deinit();
out_free:
	kmem_cache_destroy(kvm_vcpu_cache);
out_free_3:
	unregister_reboot_notifier(&kvm_reboot_notifier);
	unregister_cpu_notifier(&kvm_cpu_notifier);
out_free_2:
out_free_1:
	kvm_arch_hardware_unsetup();
out_free_0a:
	free_cpumask_var(cpus_hardware_enabled);
out_free_0:
	if (fault_page)
		__free_page(fault_page);
	if (hwpoison_page)
		__free_page(hwpoison_page);
	__free_page(bad_page);
out:
	kvm_arch_exit();
out_fail:
	return r;
}
EXPORT_SYMBOL_GPL(kvm_init);

void kvm_exit(void)
{
	kvm_exit_debug();
	misc_deregister(&kvm_dev);
	kmem_cache_destroy(kvm_vcpu_cache);
	kvm_async_pf_deinit();
	unregister_syscore_ops(&kvm_syscore_ops);
	unregister_reboot_notifier(&kvm_reboot_notifier);
	unregister_cpu_notifier(&kvm_cpu_notifier);
	on_each_cpu(hardware_disable_nolock, NULL, 1);
	kvm_arch_hardware_unsetup();
	kvm_arch_exit();
	free_cpumask_var(cpus_hardware_enabled);
	__free_page(fault_page);
	__free_page(hwpoison_page);
	__free_page(bad_page);
}
EXPORT_SYMBOL_GPL(kvm_exit);
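/*
 * kvm_init() and kvm_exit() are the module entry points used by the
 * architecture-specific KVM modules rather than by this file itself.  As an
 * illustrative sketch only (struct vcpu_vmx and vmx_x86_ops belong to the x86
 * VMX module and are assumed here, not defined in this file), an arch module
 * typically wires things up like this:
 *
 *	static int __init vmx_init(void)
 *	{
 *		return kvm_init(&vmx_x86_ops, sizeof(struct vcpu_vmx),
 *				__alignof__(struct vcpu_vmx), THIS_MODULE);
 *	}
 *
 *	static void __exit vmx_exit(void)
 *	{
 *		kvm_exit();
 *	}
 *
 *	module_init(vmx_init);
 *	module_exit(vmx_exit);
 */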