1 /* 2 * Kernel-based Virtual Machine driver for Linux 3 * 4 * This module enables machines with Intel VT-x extensions to run virtual 5 * machines without emulation or binary translation. 6 * 7 * Copyright (C) 2006 Qumranet, Inc. 8 * Copyright 2010 Red Hat, Inc. and/or its affiliates. 9 * 10 * Authors: 11 * Avi Kivity <avi@qumranet.com> 12 * Yaniv Kamay <yaniv@qumranet.com> 13 * 14 * This work is licensed under the terms of the GNU GPL, version 2. See 15 * the COPYING file in the top-level directory. 16 * 17 */ 18 19 #include <kvm/iodev.h> 20 21 #include <linux/kvm_host.h> 22 #include <linux/kvm.h> 23 #include <linux/module.h> 24 #include <linux/errno.h> 25 #include <linux/percpu.h> 26 #include <linux/mm.h> 27 #include <linux/miscdevice.h> 28 #include <linux/vmalloc.h> 29 #include <linux/reboot.h> 30 #include <linux/debugfs.h> 31 #include <linux/highmem.h> 32 #include <linux/file.h> 33 #include <linux/syscore_ops.h> 34 #include <linux/cpu.h> 35 #include <linux/sched.h> 36 #include <linux/cpumask.h> 37 #include <linux/smp.h> 38 #include <linux/anon_inodes.h> 39 #include <linux/profile.h> 40 #include <linux/kvm_para.h> 41 #include <linux/pagemap.h> 42 #include <linux/mman.h> 43 #include <linux/swap.h> 44 #include <linux/bitops.h> 45 #include <linux/spinlock.h> 46 #include <linux/compat.h> 47 #include <linux/srcu.h> 48 #include <linux/hugetlb.h> 49 #include <linux/slab.h> 50 #include <linux/sort.h> 51 #include <linux/bsearch.h> 52 53 #include <asm/processor.h> 54 #include <asm/io.h> 55 #include <asm/ioctl.h> 56 #include <asm/uaccess.h> 57 #include <asm/pgtable.h> 58 59 #include "coalesced_mmio.h" 60 #include "async_pf.h" 61 #include "vfio.h" 62 63 #define CREATE_TRACE_POINTS 64 #include <trace/events/kvm.h> 65 66 MODULE_AUTHOR("Qumranet"); 67 MODULE_LICENSE("GPL"); 68 69 static unsigned int halt_poll_ns; 70 module_param(halt_poll_ns, uint, S_IRUGO | S_IWUSR); 71 72 /* 73 * Ordering of locks: 74 * 75 * kvm->lock --> kvm->slots_lock --> kvm->irq_lock 76 */ 77 78 
DEFINE_SPINLOCK(kvm_lock); 79 static DEFINE_RAW_SPINLOCK(kvm_count_lock); 80 LIST_HEAD(vm_list); 81 82 static cpumask_var_t cpus_hardware_enabled; 83 static int kvm_usage_count; 84 static atomic_t hardware_enable_failed; 85 86 struct kmem_cache *kvm_vcpu_cache; 87 EXPORT_SYMBOL_GPL(kvm_vcpu_cache); 88 89 static __read_mostly struct preempt_ops kvm_preempt_ops; 90 91 struct dentry *kvm_debugfs_dir; 92 EXPORT_SYMBOL_GPL(kvm_debugfs_dir); 93 94 static long kvm_vcpu_ioctl(struct file *file, unsigned int ioctl, 95 unsigned long arg); 96 #ifdef CONFIG_KVM_COMPAT 97 static long kvm_vcpu_compat_ioctl(struct file *file, unsigned int ioctl, 98 unsigned long arg); 99 #endif 100 static int hardware_enable_all(void); 101 static void hardware_disable_all(void); 102 103 static void kvm_io_bus_destroy(struct kvm_io_bus *bus); 104 105 static void kvm_release_pfn_dirty(pfn_t pfn); 106 static void mark_page_dirty_in_slot(struct kvm_memory_slot *memslot, gfn_t gfn); 107 108 __visible bool kvm_rebooting; 109 EXPORT_SYMBOL_GPL(kvm_rebooting); 110 111 static bool largepages_enabled = true; 112 113 bool kvm_is_reserved_pfn(pfn_t pfn) 114 { 115 if (pfn_valid(pfn)) 116 return PageReserved(pfn_to_page(pfn)); 117 118 return true; 119 } 120 121 /* 122 * Switches to specified vcpu, until a matching vcpu_put() 123 */ 124 int vcpu_load(struct kvm_vcpu *vcpu) 125 { 126 int cpu; 127 128 if (mutex_lock_killable(&vcpu->mutex)) 129 return -EINTR; 130 cpu = get_cpu(); 131 preempt_notifier_register(&vcpu->preempt_notifier); 132 kvm_arch_vcpu_load(vcpu, cpu); 133 put_cpu(); 134 return 0; 135 } 136 137 void vcpu_put(struct kvm_vcpu *vcpu) 138 { 139 preempt_disable(); 140 kvm_arch_vcpu_put(vcpu); 141 preempt_notifier_unregister(&vcpu->preempt_notifier); 142 preempt_enable(); 143 mutex_unlock(&vcpu->mutex); 144 } 145 146 static void ack_flush(void *_completed) 147 { 148 } 149 150 bool kvm_make_all_cpus_request(struct kvm *kvm, unsigned int req) 151 { 152 int i, cpu, me; 153 cpumask_var_t cpus; 154 bool 
called = true; 155 struct kvm_vcpu *vcpu; 156 157 zalloc_cpumask_var(&cpus, GFP_ATOMIC); 158 159 me = get_cpu(); 160 kvm_for_each_vcpu(i, vcpu, kvm) { 161 kvm_make_request(req, vcpu); 162 cpu = vcpu->cpu; 163 164 /* Set ->requests bit before we read ->mode */ 165 smp_mb(); 166 167 if (cpus != NULL && cpu != -1 && cpu != me && 168 kvm_vcpu_exiting_guest_mode(vcpu) != OUTSIDE_GUEST_MODE) 169 cpumask_set_cpu(cpu, cpus); 170 } 171 if (unlikely(cpus == NULL)) 172 smp_call_function_many(cpu_online_mask, ack_flush, NULL, 1); 173 else if (!cpumask_empty(cpus)) 174 smp_call_function_many(cpus, ack_flush, NULL, 1); 175 else 176 called = false; 177 put_cpu(); 178 free_cpumask_var(cpus); 179 return called; 180 } 181 182 #ifndef CONFIG_HAVE_KVM_ARCH_TLB_FLUSH_ALL 183 void kvm_flush_remote_tlbs(struct kvm *kvm) 184 { 185 long dirty_count = kvm->tlbs_dirty; 186 187 smp_mb(); 188 if (kvm_make_all_cpus_request(kvm, KVM_REQ_TLB_FLUSH)) 189 ++kvm->stat.remote_tlb_flush; 190 cmpxchg(&kvm->tlbs_dirty, dirty_count, 0); 191 } 192 EXPORT_SYMBOL_GPL(kvm_flush_remote_tlbs); 193 #endif 194 195 void kvm_reload_remote_mmus(struct kvm *kvm) 196 { 197 kvm_make_all_cpus_request(kvm, KVM_REQ_MMU_RELOAD); 198 } 199 200 void kvm_make_mclock_inprogress_request(struct kvm *kvm) 201 { 202 kvm_make_all_cpus_request(kvm, KVM_REQ_MCLOCK_INPROGRESS); 203 } 204 205 void kvm_make_scan_ioapic_request(struct kvm *kvm) 206 { 207 kvm_make_all_cpus_request(kvm, KVM_REQ_SCAN_IOAPIC); 208 } 209 210 int kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id) 211 { 212 struct page *page; 213 int r; 214 215 mutex_init(&vcpu->mutex); 216 vcpu->cpu = -1; 217 vcpu->kvm = kvm; 218 vcpu->vcpu_id = id; 219 vcpu->pid = NULL; 220 init_waitqueue_head(&vcpu->wq); 221 kvm_async_pf_vcpu_init(vcpu); 222 223 page = alloc_page(GFP_KERNEL | __GFP_ZERO); 224 if (!page) { 225 r = -ENOMEM; 226 goto fail; 227 } 228 vcpu->run = page_address(page); 229 230 kvm_vcpu_set_in_spin_loop(vcpu, false); 231 
kvm_vcpu_set_dy_eligible(vcpu, false); 232 vcpu->preempted = false; 233 234 r = kvm_arch_vcpu_init(vcpu); 235 if (r < 0) 236 goto fail_free_run; 237 return 0; 238 239 fail_free_run: 240 free_page((unsigned long)vcpu->run); 241 fail: 242 return r; 243 } 244 EXPORT_SYMBOL_GPL(kvm_vcpu_init); 245 246 void kvm_vcpu_uninit(struct kvm_vcpu *vcpu) 247 { 248 put_pid(vcpu->pid); 249 kvm_arch_vcpu_uninit(vcpu); 250 free_page((unsigned long)vcpu->run); 251 } 252 EXPORT_SYMBOL_GPL(kvm_vcpu_uninit); 253 254 #if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER) 255 static inline struct kvm *mmu_notifier_to_kvm(struct mmu_notifier *mn) 256 { 257 return container_of(mn, struct kvm, mmu_notifier); 258 } 259 260 static void kvm_mmu_notifier_invalidate_page(struct mmu_notifier *mn, 261 struct mm_struct *mm, 262 unsigned long address) 263 { 264 struct kvm *kvm = mmu_notifier_to_kvm(mn); 265 int need_tlb_flush, idx; 266 267 /* 268 * When ->invalidate_page runs, the linux pte has been zapped 269 * already but the page is still allocated until 270 * ->invalidate_page returns. So if we increase the sequence 271 * here the kvm page fault will notice if the spte can't be 272 * established because the page is going to be freed. If 273 * instead the kvm page fault establishes the spte before 274 * ->invalidate_page runs, kvm_unmap_hva will release it 275 * before returning. 276 * 277 * The sequence increase only need to be seen at spin_unlock 278 * time, and not at spin_lock time. 279 * 280 * Increasing the sequence after the spin_unlock would be 281 * unsafe because the kvm page fault could then establish the 282 * pte after kvm_unmap_hva returned, without noticing the page 283 * is going to be freed. 
284 */ 285 idx = srcu_read_lock(&kvm->srcu); 286 spin_lock(&kvm->mmu_lock); 287 288 kvm->mmu_notifier_seq++; 289 need_tlb_flush = kvm_unmap_hva(kvm, address) | kvm->tlbs_dirty; 290 /* we've to flush the tlb before the pages can be freed */ 291 if (need_tlb_flush) 292 kvm_flush_remote_tlbs(kvm); 293 294 spin_unlock(&kvm->mmu_lock); 295 296 kvm_arch_mmu_notifier_invalidate_page(kvm, address); 297 298 srcu_read_unlock(&kvm->srcu, idx); 299 } 300 301 static void kvm_mmu_notifier_change_pte(struct mmu_notifier *mn, 302 struct mm_struct *mm, 303 unsigned long address, 304 pte_t pte) 305 { 306 struct kvm *kvm = mmu_notifier_to_kvm(mn); 307 int idx; 308 309 idx = srcu_read_lock(&kvm->srcu); 310 spin_lock(&kvm->mmu_lock); 311 kvm->mmu_notifier_seq++; 312 kvm_set_spte_hva(kvm, address, pte); 313 spin_unlock(&kvm->mmu_lock); 314 srcu_read_unlock(&kvm->srcu, idx); 315 } 316 317 static void kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn, 318 struct mm_struct *mm, 319 unsigned long start, 320 unsigned long end) 321 { 322 struct kvm *kvm = mmu_notifier_to_kvm(mn); 323 int need_tlb_flush = 0, idx; 324 325 idx = srcu_read_lock(&kvm->srcu); 326 spin_lock(&kvm->mmu_lock); 327 /* 328 * The count increase must become visible at unlock time as no 329 * spte can be established without taking the mmu_lock and 330 * count is also read inside the mmu_lock critical section. 
331 */ 332 kvm->mmu_notifier_count++; 333 need_tlb_flush = kvm_unmap_hva_range(kvm, start, end); 334 need_tlb_flush |= kvm->tlbs_dirty; 335 /* we've to flush the tlb before the pages can be freed */ 336 if (need_tlb_flush) 337 kvm_flush_remote_tlbs(kvm); 338 339 spin_unlock(&kvm->mmu_lock); 340 srcu_read_unlock(&kvm->srcu, idx); 341 } 342 343 static void kvm_mmu_notifier_invalidate_range_end(struct mmu_notifier *mn, 344 struct mm_struct *mm, 345 unsigned long start, 346 unsigned long end) 347 { 348 struct kvm *kvm = mmu_notifier_to_kvm(mn); 349 350 spin_lock(&kvm->mmu_lock); 351 /* 352 * This sequence increase will notify the kvm page fault that 353 * the page that is going to be mapped in the spte could have 354 * been freed. 355 */ 356 kvm->mmu_notifier_seq++; 357 smp_wmb(); 358 /* 359 * The above sequence increase must be visible before the 360 * below count decrease, which is ensured by the smp_wmb above 361 * in conjunction with the smp_rmb in mmu_notifier_retry(). 362 */ 363 kvm->mmu_notifier_count--; 364 spin_unlock(&kvm->mmu_lock); 365 366 BUG_ON(kvm->mmu_notifier_count < 0); 367 } 368 369 static int kvm_mmu_notifier_clear_flush_young(struct mmu_notifier *mn, 370 struct mm_struct *mm, 371 unsigned long start, 372 unsigned long end) 373 { 374 struct kvm *kvm = mmu_notifier_to_kvm(mn); 375 int young, idx; 376 377 idx = srcu_read_lock(&kvm->srcu); 378 spin_lock(&kvm->mmu_lock); 379 380 young = kvm_age_hva(kvm, start, end); 381 if (young) 382 kvm_flush_remote_tlbs(kvm); 383 384 spin_unlock(&kvm->mmu_lock); 385 srcu_read_unlock(&kvm->srcu, idx); 386 387 return young; 388 } 389 390 static int kvm_mmu_notifier_clear_young(struct mmu_notifier *mn, 391 struct mm_struct *mm, 392 unsigned long start, 393 unsigned long end) 394 { 395 struct kvm *kvm = mmu_notifier_to_kvm(mn); 396 int young, idx; 397 398 idx = srcu_read_lock(&kvm->srcu); 399 spin_lock(&kvm->mmu_lock); 400 /* 401 * Even though we do not flush TLB, this will still adversely 402 * affect performance on 
pre-Haswell Intel EPT, where there is 403 * no EPT Access Bit to clear so that we have to tear down EPT 404 * tables instead. If we find this unacceptable, we can always 405 * add a parameter to kvm_age_hva so that it effectively doesn't 406 * do anything on clear_young. 407 * 408 * Also note that currently we never issue secondary TLB flushes 409 * from clear_young, leaving this job up to the regular system 410 * cadence. If we find this inaccurate, we might come up with a 411 * more sophisticated heuristic later. 412 */ 413 young = kvm_age_hva(kvm, start, end); 414 spin_unlock(&kvm->mmu_lock); 415 srcu_read_unlock(&kvm->srcu, idx); 416 417 return young; 418 } 419 420 static int kvm_mmu_notifier_test_young(struct mmu_notifier *mn, 421 struct mm_struct *mm, 422 unsigned long address) 423 { 424 struct kvm *kvm = mmu_notifier_to_kvm(mn); 425 int young, idx; 426 427 idx = srcu_read_lock(&kvm->srcu); 428 spin_lock(&kvm->mmu_lock); 429 young = kvm_test_age_hva(kvm, address); 430 spin_unlock(&kvm->mmu_lock); 431 srcu_read_unlock(&kvm->srcu, idx); 432 433 return young; 434 } 435 436 static void kvm_mmu_notifier_release(struct mmu_notifier *mn, 437 struct mm_struct *mm) 438 { 439 struct kvm *kvm = mmu_notifier_to_kvm(mn); 440 int idx; 441 442 idx = srcu_read_lock(&kvm->srcu); 443 kvm_arch_flush_shadow_all(kvm); 444 srcu_read_unlock(&kvm->srcu, idx); 445 } 446 447 static const struct mmu_notifier_ops kvm_mmu_notifier_ops = { 448 .invalidate_page = kvm_mmu_notifier_invalidate_page, 449 .invalidate_range_start = kvm_mmu_notifier_invalidate_range_start, 450 .invalidate_range_end = kvm_mmu_notifier_invalidate_range_end, 451 .clear_flush_young = kvm_mmu_notifier_clear_flush_young, 452 .clear_young = kvm_mmu_notifier_clear_young, 453 .test_young = kvm_mmu_notifier_test_young, 454 .change_pte = kvm_mmu_notifier_change_pte, 455 .release = kvm_mmu_notifier_release, 456 }; 457 458 static int kvm_init_mmu_notifier(struct kvm *kvm) 459 { 460 kvm->mmu_notifier.ops = 
&kvm_mmu_notifier_ops; 461 return mmu_notifier_register(&kvm->mmu_notifier, current->mm); 462 } 463 464 #else /* !(CONFIG_MMU_NOTIFIER && KVM_ARCH_WANT_MMU_NOTIFIER) */ 465 466 static int kvm_init_mmu_notifier(struct kvm *kvm) 467 { 468 return 0; 469 } 470 471 #endif /* CONFIG_MMU_NOTIFIER && KVM_ARCH_WANT_MMU_NOTIFIER */ 472 473 static struct kvm_memslots *kvm_alloc_memslots(void) 474 { 475 int i; 476 struct kvm_memslots *slots; 477 478 slots = kvm_kvzalloc(sizeof(struct kvm_memslots)); 479 if (!slots) 480 return NULL; 481 482 /* 483 * Init kvm generation close to the maximum to easily test the 484 * code of handling generation number wrap-around. 485 */ 486 slots->generation = -150; 487 for (i = 0; i < KVM_MEM_SLOTS_NUM; i++) 488 slots->id_to_index[i] = slots->memslots[i].id = i; 489 490 return slots; 491 } 492 493 static void kvm_destroy_dirty_bitmap(struct kvm_memory_slot *memslot) 494 { 495 if (!memslot->dirty_bitmap) 496 return; 497 498 kvfree(memslot->dirty_bitmap); 499 memslot->dirty_bitmap = NULL; 500 } 501 502 /* 503 * Free any memory in @free but not in @dont. 
504 */ 505 static void kvm_free_memslot(struct kvm *kvm, struct kvm_memory_slot *free, 506 struct kvm_memory_slot *dont) 507 { 508 if (!dont || free->dirty_bitmap != dont->dirty_bitmap) 509 kvm_destroy_dirty_bitmap(free); 510 511 kvm_arch_free_memslot(kvm, free, dont); 512 513 free->npages = 0; 514 } 515 516 static void kvm_free_memslots(struct kvm *kvm, struct kvm_memslots *slots) 517 { 518 struct kvm_memory_slot *memslot; 519 520 if (!slots) 521 return; 522 523 kvm_for_each_memslot(memslot, slots) 524 kvm_free_memslot(kvm, memslot, NULL); 525 526 kvfree(slots); 527 } 528 529 static struct kvm *kvm_create_vm(unsigned long type) 530 { 531 int r, i; 532 struct kvm *kvm = kvm_arch_alloc_vm(); 533 534 if (!kvm) 535 return ERR_PTR(-ENOMEM); 536 537 r = kvm_arch_init_vm(kvm, type); 538 if (r) 539 goto out_err_no_disable; 540 541 r = hardware_enable_all(); 542 if (r) 543 goto out_err_no_disable; 544 545 #ifdef CONFIG_HAVE_KVM_IRQFD 546 INIT_HLIST_HEAD(&kvm->irq_ack_notifier_list); 547 #endif 548 549 BUILD_BUG_ON(KVM_MEM_SLOTS_NUM > SHRT_MAX); 550 551 r = -ENOMEM; 552 for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) { 553 kvm->memslots[i] = kvm_alloc_memslots(); 554 if (!kvm->memslots[i]) 555 goto out_err_no_srcu; 556 } 557 558 if (init_srcu_struct(&kvm->srcu)) 559 goto out_err_no_srcu; 560 if (init_srcu_struct(&kvm->irq_srcu)) 561 goto out_err_no_irq_srcu; 562 for (i = 0; i < KVM_NR_BUSES; i++) { 563 kvm->buses[i] = kzalloc(sizeof(struct kvm_io_bus), 564 GFP_KERNEL); 565 if (!kvm->buses[i]) 566 goto out_err; 567 } 568 569 spin_lock_init(&kvm->mmu_lock); 570 kvm->mm = current->mm; 571 atomic_inc(&kvm->mm->mm_count); 572 kvm_eventfd_init(kvm); 573 mutex_init(&kvm->lock); 574 mutex_init(&kvm->irq_lock); 575 mutex_init(&kvm->slots_lock); 576 atomic_set(&kvm->users_count, 1); 577 INIT_LIST_HEAD(&kvm->devices); 578 579 r = kvm_init_mmu_notifier(kvm); 580 if (r) 581 goto out_err; 582 583 spin_lock(&kvm_lock); 584 list_add(&kvm->vm_list, &vm_list); 585 spin_unlock(&kvm_lock); 586 587 
preempt_notifier_inc(); 588 589 return kvm; 590 591 out_err: 592 cleanup_srcu_struct(&kvm->irq_srcu); 593 out_err_no_irq_srcu: 594 cleanup_srcu_struct(&kvm->srcu); 595 out_err_no_srcu: 596 hardware_disable_all(); 597 out_err_no_disable: 598 for (i = 0; i < KVM_NR_BUSES; i++) 599 kfree(kvm->buses[i]); 600 for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) 601 kvm_free_memslots(kvm, kvm->memslots[i]); 602 kvm_arch_free_vm(kvm); 603 return ERR_PTR(r); 604 } 605 606 /* 607 * Avoid using vmalloc for a small buffer. 608 * Should not be used when the size is statically known. 609 */ 610 void *kvm_kvzalloc(unsigned long size) 611 { 612 if (size > PAGE_SIZE) 613 return vzalloc(size); 614 else 615 return kzalloc(size, GFP_KERNEL); 616 } 617 618 static void kvm_destroy_devices(struct kvm *kvm) 619 { 620 struct list_head *node, *tmp; 621 622 list_for_each_safe(node, tmp, &kvm->devices) { 623 struct kvm_device *dev = 624 list_entry(node, struct kvm_device, vm_node); 625 626 list_del(node); 627 dev->ops->destroy(dev); 628 } 629 } 630 631 static void kvm_destroy_vm(struct kvm *kvm) 632 { 633 int i; 634 struct mm_struct *mm = kvm->mm; 635 636 kvm_arch_sync_events(kvm); 637 spin_lock(&kvm_lock); 638 list_del(&kvm->vm_list); 639 spin_unlock(&kvm_lock); 640 kvm_free_irq_routing(kvm); 641 for (i = 0; i < KVM_NR_BUSES; i++) 642 kvm_io_bus_destroy(kvm->buses[i]); 643 kvm_coalesced_mmio_free(kvm); 644 #if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER) 645 mmu_notifier_unregister(&kvm->mmu_notifier, kvm->mm); 646 #else 647 kvm_arch_flush_shadow_all(kvm); 648 #endif 649 kvm_arch_destroy_vm(kvm); 650 kvm_destroy_devices(kvm); 651 for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) 652 kvm_free_memslots(kvm, kvm->memslots[i]); 653 cleanup_srcu_struct(&kvm->irq_srcu); 654 cleanup_srcu_struct(&kvm->srcu); 655 kvm_arch_free_vm(kvm); 656 preempt_notifier_dec(); 657 hardware_disable_all(); 658 mmdrop(mm); 659 } 660 661 void kvm_get_kvm(struct kvm *kvm) 662 { 663 
atomic_inc(&kvm->users_count); 664 } 665 EXPORT_SYMBOL_GPL(kvm_get_kvm); 666 667 void kvm_put_kvm(struct kvm *kvm) 668 { 669 if (atomic_dec_and_test(&kvm->users_count)) 670 kvm_destroy_vm(kvm); 671 } 672 EXPORT_SYMBOL_GPL(kvm_put_kvm); 673 674 675 static int kvm_vm_release(struct inode *inode, struct file *filp) 676 { 677 struct kvm *kvm = filp->private_data; 678 679 kvm_irqfd_release(kvm); 680 681 kvm_put_kvm(kvm); 682 return 0; 683 } 684 685 /* 686 * Allocation size is twice as large as the actual dirty bitmap size. 687 * See x86's kvm_vm_ioctl_get_dirty_log() why this is needed. 688 */ 689 static int kvm_create_dirty_bitmap(struct kvm_memory_slot *memslot) 690 { 691 unsigned long dirty_bytes = 2 * kvm_dirty_bitmap_bytes(memslot); 692 693 memslot->dirty_bitmap = kvm_kvzalloc(dirty_bytes); 694 if (!memslot->dirty_bitmap) 695 return -ENOMEM; 696 697 return 0; 698 } 699 700 /* 701 * Insert memslot and re-sort memslots based on their GFN, 702 * so binary search could be used to lookup GFN. 703 * Sorting algorithm takes advantage of having initially 704 * sorted array and known changed memslot position. 705 */ 706 static void update_memslots(struct kvm_memslots *slots, 707 struct kvm_memory_slot *new) 708 { 709 int id = new->id; 710 int i = slots->id_to_index[id]; 711 struct kvm_memory_slot *mslots = slots->memslots; 712 713 WARN_ON(mslots[i].id != id); 714 if (!new->npages) { 715 WARN_ON(!mslots[i].npages); 716 if (mslots[i].npages) 717 slots->used_slots--; 718 } else { 719 if (!mslots[i].npages) 720 slots->used_slots++; 721 } 722 723 while (i < KVM_MEM_SLOTS_NUM - 1 && 724 new->base_gfn <= mslots[i + 1].base_gfn) { 725 if (!mslots[i + 1].npages) 726 break; 727 mslots[i] = mslots[i + 1]; 728 slots->id_to_index[mslots[i].id] = i; 729 i++; 730 } 731 732 /* 733 * The ">=" is needed when creating a slot with base_gfn == 0, 734 * so that it moves before all those with base_gfn == npages == 0. 
735 * 736 * On the other hand, if new->npages is zero, the above loop has 737 * already left i pointing to the beginning of the empty part of 738 * mslots, and the ">=" would move the hole backwards in this 739 * case---which is wrong. So skip the loop when deleting a slot. 740 */ 741 if (new->npages) { 742 while (i > 0 && 743 new->base_gfn >= mslots[i - 1].base_gfn) { 744 mslots[i] = mslots[i - 1]; 745 slots->id_to_index[mslots[i].id] = i; 746 i--; 747 } 748 } else 749 WARN_ON_ONCE(i != slots->used_slots); 750 751 mslots[i] = *new; 752 slots->id_to_index[mslots[i].id] = i; 753 } 754 755 static int check_memory_region_flags(const struct kvm_userspace_memory_region *mem) 756 { 757 u32 valid_flags = KVM_MEM_LOG_DIRTY_PAGES; 758 759 #ifdef __KVM_HAVE_READONLY_MEM 760 valid_flags |= KVM_MEM_READONLY; 761 #endif 762 763 if (mem->flags & ~valid_flags) 764 return -EINVAL; 765 766 return 0; 767 } 768 769 static struct kvm_memslots *install_new_memslots(struct kvm *kvm, 770 int as_id, struct kvm_memslots *slots) 771 { 772 struct kvm_memslots *old_memslots = __kvm_memslots(kvm, as_id); 773 774 /* 775 * Set the low bit in the generation, which disables SPTE caching 776 * until the end of synchronize_srcu_expedited. 777 */ 778 WARN_ON(old_memslots->generation & 1); 779 slots->generation = old_memslots->generation + 1; 780 781 rcu_assign_pointer(kvm->memslots[as_id], slots); 782 synchronize_srcu_expedited(&kvm->srcu); 783 784 /* 785 * Increment the new memslot generation a second time. This prevents 786 * vm exits that race with memslot updates from caching a memslot 787 * generation that will (potentially) be valid forever. 788 */ 789 slots->generation++; 790 791 kvm_arch_memslots_updated(kvm, slots); 792 793 return old_memslots; 794 } 795 796 /* 797 * Allocate some memory and give it an address in the guest physical address 798 * space. 799 * 800 * Discontiguous memory is allowed, mostly for framebuffers. 801 * 802 * Must be called holding kvm->slots_lock for write. 
803 */ 804 int __kvm_set_memory_region(struct kvm *kvm, 805 const struct kvm_userspace_memory_region *mem) 806 { 807 int r; 808 gfn_t base_gfn; 809 unsigned long npages; 810 struct kvm_memory_slot *slot; 811 struct kvm_memory_slot old, new; 812 struct kvm_memslots *slots = NULL, *old_memslots; 813 int as_id, id; 814 enum kvm_mr_change change; 815 816 r = check_memory_region_flags(mem); 817 if (r) 818 goto out; 819 820 r = -EINVAL; 821 as_id = mem->slot >> 16; 822 id = (u16)mem->slot; 823 824 /* General sanity checks */ 825 if (mem->memory_size & (PAGE_SIZE - 1)) 826 goto out; 827 if (mem->guest_phys_addr & (PAGE_SIZE - 1)) 828 goto out; 829 /* We can read the guest memory with __xxx_user() later on. */ 830 if ((id < KVM_USER_MEM_SLOTS) && 831 ((mem->userspace_addr & (PAGE_SIZE - 1)) || 832 !access_ok(VERIFY_WRITE, 833 (void __user *)(unsigned long)mem->userspace_addr, 834 mem->memory_size))) 835 goto out; 836 if (as_id >= KVM_ADDRESS_SPACE_NUM || id >= KVM_MEM_SLOTS_NUM) 837 goto out; 838 if (mem->guest_phys_addr + mem->memory_size < mem->guest_phys_addr) 839 goto out; 840 841 slot = id_to_memslot(__kvm_memslots(kvm, as_id), id); 842 base_gfn = mem->guest_phys_addr >> PAGE_SHIFT; 843 npages = mem->memory_size >> PAGE_SHIFT; 844 845 if (npages > KVM_MEM_MAX_NR_PAGES) 846 goto out; 847 848 new = old = *slot; 849 850 new.id = id; 851 new.base_gfn = base_gfn; 852 new.npages = npages; 853 new.flags = mem->flags; 854 855 if (npages) { 856 if (!old.npages) 857 change = KVM_MR_CREATE; 858 else { /* Modify an existing slot. */ 859 if ((mem->userspace_addr != old.userspace_addr) || 860 (npages != old.npages) || 861 ((new.flags ^ old.flags) & KVM_MEM_READONLY)) 862 goto out; 863 864 if (base_gfn != old.base_gfn) 865 change = KVM_MR_MOVE; 866 else if (new.flags != old.flags) 867 change = KVM_MR_FLAGS_ONLY; 868 else { /* Nothing to change. 
*/ 869 r = 0; 870 goto out; 871 } 872 } 873 } else { 874 if (!old.npages) 875 goto out; 876 877 change = KVM_MR_DELETE; 878 new.base_gfn = 0; 879 new.flags = 0; 880 } 881 882 if ((change == KVM_MR_CREATE) || (change == KVM_MR_MOVE)) { 883 /* Check for overlaps */ 884 r = -EEXIST; 885 kvm_for_each_memslot(slot, __kvm_memslots(kvm, as_id)) { 886 if ((slot->id >= KVM_USER_MEM_SLOTS) || 887 (slot->id == id)) 888 continue; 889 if (!((base_gfn + npages <= slot->base_gfn) || 890 (base_gfn >= slot->base_gfn + slot->npages))) 891 goto out; 892 } 893 } 894 895 /* Free page dirty bitmap if unneeded */ 896 if (!(new.flags & KVM_MEM_LOG_DIRTY_PAGES)) 897 new.dirty_bitmap = NULL; 898 899 r = -ENOMEM; 900 if (change == KVM_MR_CREATE) { 901 new.userspace_addr = mem->userspace_addr; 902 903 if (kvm_arch_create_memslot(kvm, &new, npages)) 904 goto out_free; 905 } 906 907 /* Allocate page dirty bitmap if needed */ 908 if ((new.flags & KVM_MEM_LOG_DIRTY_PAGES) && !new.dirty_bitmap) { 909 if (kvm_create_dirty_bitmap(&new) < 0) 910 goto out_free; 911 } 912 913 slots = kvm_kvzalloc(sizeof(struct kvm_memslots)); 914 if (!slots) 915 goto out_free; 916 memcpy(slots, __kvm_memslots(kvm, as_id), sizeof(struct kvm_memslots)); 917 918 if ((change == KVM_MR_DELETE) || (change == KVM_MR_MOVE)) { 919 slot = id_to_memslot(slots, id); 920 slot->flags |= KVM_MEMSLOT_INVALID; 921 922 old_memslots = install_new_memslots(kvm, as_id, slots); 923 924 /* slot was deleted or moved, clear iommu mapping */ 925 kvm_iommu_unmap_pages(kvm, &old); 926 /* From this point no new shadow pages pointing to a deleted, 927 * or moved, memslot will be created. 928 * 929 * validation of sp->gfn happens in: 930 * - gfn_to_hva (kvm_read_guest, gfn_to_pfn) 931 * - kvm_is_visible_gfn (mmu_check_roots) 932 */ 933 kvm_arch_flush_shadow_memslot(kvm, slot); 934 935 /* 936 * We can re-use the old_memslots from above, the only difference 937 * from the currently installed memslots is the invalid flag. 
This 938 * will get overwritten by update_memslots anyway. 939 */ 940 slots = old_memslots; 941 } 942 943 r = kvm_arch_prepare_memory_region(kvm, &new, mem, change); 944 if (r) 945 goto out_slots; 946 947 /* actual memory is freed via old in kvm_free_memslot below */ 948 if (change == KVM_MR_DELETE) { 949 new.dirty_bitmap = NULL; 950 memset(&new.arch, 0, sizeof(new.arch)); 951 } 952 953 update_memslots(slots, &new); 954 old_memslots = install_new_memslots(kvm, as_id, slots); 955 956 kvm_arch_commit_memory_region(kvm, mem, &old, &new, change); 957 958 kvm_free_memslot(kvm, &old, &new); 959 kvfree(old_memslots); 960 961 /* 962 * IOMMU mapping: New slots need to be mapped. Old slots need to be 963 * un-mapped and re-mapped if their base changes. Since base change 964 * unmapping is handled above with slot deletion, mapping alone is 965 * needed here. Anything else the iommu might care about for existing 966 * slots (size changes, userspace addr changes and read-only flag 967 * changes) is disallowed above, so any other attribute changes getting 968 * here can be skipped. 
969 */ 970 if ((change == KVM_MR_CREATE) || (change == KVM_MR_MOVE)) { 971 r = kvm_iommu_map_pages(kvm, &new); 972 return r; 973 } 974 975 return 0; 976 977 out_slots: 978 kvfree(slots); 979 out_free: 980 kvm_free_memslot(kvm, &new, &old); 981 out: 982 return r; 983 } 984 EXPORT_SYMBOL_GPL(__kvm_set_memory_region); 985 986 int kvm_set_memory_region(struct kvm *kvm, 987 const struct kvm_userspace_memory_region *mem) 988 { 989 int r; 990 991 mutex_lock(&kvm->slots_lock); 992 r = __kvm_set_memory_region(kvm, mem); 993 mutex_unlock(&kvm->slots_lock); 994 return r; 995 } 996 EXPORT_SYMBOL_GPL(kvm_set_memory_region); 997 998 static int kvm_vm_ioctl_set_memory_region(struct kvm *kvm, 999 struct kvm_userspace_memory_region *mem) 1000 { 1001 if ((u16)mem->slot >= KVM_USER_MEM_SLOTS) 1002 return -EINVAL; 1003 1004 return kvm_set_memory_region(kvm, mem); 1005 } 1006 1007 int kvm_get_dirty_log(struct kvm *kvm, 1008 struct kvm_dirty_log *log, int *is_dirty) 1009 { 1010 struct kvm_memslots *slots; 1011 struct kvm_memory_slot *memslot; 1012 int r, i, as_id, id; 1013 unsigned long n; 1014 unsigned long any = 0; 1015 1016 r = -EINVAL; 1017 as_id = log->slot >> 16; 1018 id = (u16)log->slot; 1019 if (as_id >= KVM_ADDRESS_SPACE_NUM || id >= KVM_USER_MEM_SLOTS) 1020 goto out; 1021 1022 slots = __kvm_memslots(kvm, as_id); 1023 memslot = id_to_memslot(slots, id); 1024 r = -ENOENT; 1025 if (!memslot->dirty_bitmap) 1026 goto out; 1027 1028 n = kvm_dirty_bitmap_bytes(memslot); 1029 1030 for (i = 0; !any && i < n/sizeof(long); ++i) 1031 any = memslot->dirty_bitmap[i]; 1032 1033 r = -EFAULT; 1034 if (copy_to_user(log->dirty_bitmap, memslot->dirty_bitmap, n)) 1035 goto out; 1036 1037 if (any) 1038 *is_dirty = 1; 1039 1040 r = 0; 1041 out: 1042 return r; 1043 } 1044 EXPORT_SYMBOL_GPL(kvm_get_dirty_log); 1045 1046 #ifdef CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT 1047 /** 1048 * kvm_get_dirty_log_protect - get a snapshot of dirty pages, and if any pages 1049 * are dirty write protect them for next 
write. 1050 * @kvm: pointer to kvm instance 1051 * @log: slot id and address to which we copy the log 1052 * @is_dirty: flag set if any page is dirty 1053 * 1054 * We need to keep it in mind that VCPU threads can write to the bitmap 1055 * concurrently. So, to avoid losing track of dirty pages we keep the 1056 * following order: 1057 * 1058 * 1. Take a snapshot of the bit and clear it if needed. 1059 * 2. Write protect the corresponding page. 1060 * 3. Copy the snapshot to the userspace. 1061 * 4. Upon return caller flushes TLB's if needed. 1062 * 1063 * Between 2 and 4, the guest may write to the page using the remaining TLB 1064 * entry. This is not a problem because the page is reported dirty using 1065 * the snapshot taken before and step 4 ensures that writes done after 1066 * exiting to userspace will be logged for the next call. 1067 * 1068 */ 1069 int kvm_get_dirty_log_protect(struct kvm *kvm, 1070 struct kvm_dirty_log *log, bool *is_dirty) 1071 { 1072 struct kvm_memslots *slots; 1073 struct kvm_memory_slot *memslot; 1074 int r, i, as_id, id; 1075 unsigned long n; 1076 unsigned long *dirty_bitmap; 1077 unsigned long *dirty_bitmap_buffer; 1078 1079 r = -EINVAL; 1080 as_id = log->slot >> 16; 1081 id = (u16)log->slot; 1082 if (as_id >= KVM_ADDRESS_SPACE_NUM || id >= KVM_USER_MEM_SLOTS) 1083 goto out; 1084 1085 slots = __kvm_memslots(kvm, as_id); 1086 memslot = id_to_memslot(slots, id); 1087 1088 dirty_bitmap = memslot->dirty_bitmap; 1089 r = -ENOENT; 1090 if (!dirty_bitmap) 1091 goto out; 1092 1093 n = kvm_dirty_bitmap_bytes(memslot); 1094 1095 dirty_bitmap_buffer = dirty_bitmap + n / sizeof(long); 1096 memset(dirty_bitmap_buffer, 0, n); 1097 1098 spin_lock(&kvm->mmu_lock); 1099 *is_dirty = false; 1100 for (i = 0; i < n / sizeof(long); i++) { 1101 unsigned long mask; 1102 gfn_t offset; 1103 1104 if (!dirty_bitmap[i]) 1105 continue; 1106 1107 *is_dirty = true; 1108 1109 mask = xchg(&dirty_bitmap[i], 0); 1110 dirty_bitmap_buffer[i] = mask; 1111 1112 if (mask) { 
1113 offset = i * BITS_PER_LONG; 1114 kvm_arch_mmu_enable_log_dirty_pt_masked(kvm, memslot, 1115 offset, mask); 1116 } 1117 } 1118 1119 spin_unlock(&kvm->mmu_lock); 1120 1121 r = -EFAULT; 1122 if (copy_to_user(log->dirty_bitmap, dirty_bitmap_buffer, n)) 1123 goto out; 1124 1125 r = 0; 1126 out: 1127 return r; 1128 } 1129 EXPORT_SYMBOL_GPL(kvm_get_dirty_log_protect); 1130 #endif 1131 1132 bool kvm_largepages_enabled(void) 1133 { 1134 return largepages_enabled; 1135 } 1136 1137 void kvm_disable_largepages(void) 1138 { 1139 largepages_enabled = false; 1140 } 1141 EXPORT_SYMBOL_GPL(kvm_disable_largepages); 1142 1143 struct kvm_memory_slot *gfn_to_memslot(struct kvm *kvm, gfn_t gfn) 1144 { 1145 return __gfn_to_memslot(kvm_memslots(kvm), gfn); 1146 } 1147 EXPORT_SYMBOL_GPL(gfn_to_memslot); 1148 1149 struct kvm_memory_slot *kvm_vcpu_gfn_to_memslot(struct kvm_vcpu *vcpu, gfn_t gfn) 1150 { 1151 return __gfn_to_memslot(kvm_vcpu_memslots(vcpu), gfn); 1152 } 1153 1154 int kvm_is_visible_gfn(struct kvm *kvm, gfn_t gfn) 1155 { 1156 struct kvm_memory_slot *memslot = gfn_to_memslot(kvm, gfn); 1157 1158 if (!memslot || memslot->id >= KVM_USER_MEM_SLOTS || 1159 memslot->flags & KVM_MEMSLOT_INVALID) 1160 return 0; 1161 1162 return 1; 1163 } 1164 EXPORT_SYMBOL_GPL(kvm_is_visible_gfn); 1165 1166 unsigned long kvm_host_page_size(struct kvm *kvm, gfn_t gfn) 1167 { 1168 struct vm_area_struct *vma; 1169 unsigned long addr, size; 1170 1171 size = PAGE_SIZE; 1172 1173 addr = gfn_to_hva(kvm, gfn); 1174 if (kvm_is_error_hva(addr)) 1175 return PAGE_SIZE; 1176 1177 down_read(¤t->mm->mmap_sem); 1178 vma = find_vma(current->mm, addr); 1179 if (!vma) 1180 goto out; 1181 1182 size = vma_kernel_pagesize(vma); 1183 1184 out: 1185 up_read(¤t->mm->mmap_sem); 1186 1187 return size; 1188 } 1189 1190 static bool memslot_is_readonly(struct kvm_memory_slot *slot) 1191 { 1192 return slot->flags & KVM_MEM_READONLY; 1193 } 1194 1195 static unsigned long __gfn_to_hva_many(struct kvm_memory_slot *slot, gfn_t gfn, 
1196 gfn_t *nr_pages, bool write) 1197 { 1198 if (!slot || slot->flags & KVM_MEMSLOT_INVALID) 1199 return KVM_HVA_ERR_BAD; 1200 1201 if (memslot_is_readonly(slot) && write) 1202 return KVM_HVA_ERR_RO_BAD; 1203 1204 if (nr_pages) 1205 *nr_pages = slot->npages - (gfn - slot->base_gfn); 1206 1207 return __gfn_to_hva_memslot(slot, gfn); 1208 } 1209 1210 static unsigned long gfn_to_hva_many(struct kvm_memory_slot *slot, gfn_t gfn, 1211 gfn_t *nr_pages) 1212 { 1213 return __gfn_to_hva_many(slot, gfn, nr_pages, true); 1214 } 1215 1216 unsigned long gfn_to_hva_memslot(struct kvm_memory_slot *slot, 1217 gfn_t gfn) 1218 { 1219 return gfn_to_hva_many(slot, gfn, NULL); 1220 } 1221 EXPORT_SYMBOL_GPL(gfn_to_hva_memslot); 1222 1223 unsigned long gfn_to_hva(struct kvm *kvm, gfn_t gfn) 1224 { 1225 return gfn_to_hva_many(gfn_to_memslot(kvm, gfn), gfn, NULL); 1226 } 1227 EXPORT_SYMBOL_GPL(gfn_to_hva); 1228 1229 unsigned long kvm_vcpu_gfn_to_hva(struct kvm_vcpu *vcpu, gfn_t gfn) 1230 { 1231 return gfn_to_hva_many(kvm_vcpu_gfn_to_memslot(vcpu, gfn), gfn, NULL); 1232 } 1233 EXPORT_SYMBOL_GPL(kvm_vcpu_gfn_to_hva); 1234 1235 /* 1236 * If writable is set to false, the hva returned by this function is only 1237 * allowed to be read. 
1238 */ 1239 unsigned long gfn_to_hva_memslot_prot(struct kvm_memory_slot *slot, 1240 gfn_t gfn, bool *writable) 1241 { 1242 unsigned long hva = __gfn_to_hva_many(slot, gfn, NULL, false); 1243 1244 if (!kvm_is_error_hva(hva) && writable) 1245 *writable = !memslot_is_readonly(slot); 1246 1247 return hva; 1248 } 1249 1250 unsigned long gfn_to_hva_prot(struct kvm *kvm, gfn_t gfn, bool *writable) 1251 { 1252 struct kvm_memory_slot *slot = gfn_to_memslot(kvm, gfn); 1253 1254 return gfn_to_hva_memslot_prot(slot, gfn, writable); 1255 } 1256 1257 unsigned long kvm_vcpu_gfn_to_hva_prot(struct kvm_vcpu *vcpu, gfn_t gfn, bool *writable) 1258 { 1259 struct kvm_memory_slot *slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn); 1260 1261 return gfn_to_hva_memslot_prot(slot, gfn, writable); 1262 } 1263 1264 static int get_user_page_nowait(struct task_struct *tsk, struct mm_struct *mm, 1265 unsigned long start, int write, struct page **page) 1266 { 1267 int flags = FOLL_TOUCH | FOLL_NOWAIT | FOLL_HWPOISON | FOLL_GET; 1268 1269 if (write) 1270 flags |= FOLL_WRITE; 1271 1272 return __get_user_pages(tsk, mm, start, 1, flags, page, NULL, NULL); 1273 } 1274 1275 static inline int check_user_page_hwpoison(unsigned long addr) 1276 { 1277 int rc, flags = FOLL_TOUCH | FOLL_HWPOISON | FOLL_WRITE; 1278 1279 rc = __get_user_pages(current, current->mm, addr, 1, 1280 flags, NULL, NULL, NULL); 1281 return rc == -EHWPOISON; 1282 } 1283 1284 /* 1285 * The atomic path to get the writable pfn which will be stored in @pfn, 1286 * true indicates success, otherwise false is returned. 1287 */ 1288 static bool hva_to_pfn_fast(unsigned long addr, bool atomic, bool *async, 1289 bool write_fault, bool *writable, pfn_t *pfn) 1290 { 1291 struct page *page[1]; 1292 int npages; 1293 1294 if (!(async || atomic)) 1295 return false; 1296 1297 /* 1298 * Fast pin a writable pfn only if it is a write fault request 1299 * or the caller allows to map a writable pfn for a read fault 1300 * request. 
1301 */ 1302 if (!(write_fault || writable)) 1303 return false; 1304 1305 npages = __get_user_pages_fast(addr, 1, 1, page); 1306 if (npages == 1) { 1307 *pfn = page_to_pfn(page[0]); 1308 1309 if (writable) 1310 *writable = true; 1311 return true; 1312 } 1313 1314 return false; 1315 } 1316 1317 /* 1318 * The slow path to get the pfn of the specified host virtual address, 1319 * 1 indicates success, -errno is returned if error is detected. 1320 */ 1321 static int hva_to_pfn_slow(unsigned long addr, bool *async, bool write_fault, 1322 bool *writable, pfn_t *pfn) 1323 { 1324 struct page *page[1]; 1325 int npages = 0; 1326 1327 might_sleep(); 1328 1329 if (writable) 1330 *writable = write_fault; 1331 1332 if (async) { 1333 down_read(¤t->mm->mmap_sem); 1334 npages = get_user_page_nowait(current, current->mm, 1335 addr, write_fault, page); 1336 up_read(¤t->mm->mmap_sem); 1337 } else 1338 npages = __get_user_pages_unlocked(current, current->mm, addr, 1, 1339 write_fault, 0, page, 1340 FOLL_TOUCH|FOLL_HWPOISON); 1341 if (npages != 1) 1342 return npages; 1343 1344 /* map read fault as writable if possible */ 1345 if (unlikely(!write_fault) && writable) { 1346 struct page *wpage[1]; 1347 1348 npages = __get_user_pages_fast(addr, 1, 1, wpage); 1349 if (npages == 1) { 1350 *writable = true; 1351 put_page(page[0]); 1352 page[0] = wpage[0]; 1353 } 1354 1355 npages = 1; 1356 } 1357 *pfn = page_to_pfn(page[0]); 1358 return npages; 1359 } 1360 1361 static bool vma_is_valid(struct vm_area_struct *vma, bool write_fault) 1362 { 1363 if (unlikely(!(vma->vm_flags & VM_READ))) 1364 return false; 1365 1366 if (write_fault && (unlikely(!(vma->vm_flags & VM_WRITE)))) 1367 return false; 1368 1369 return true; 1370 } 1371 1372 /* 1373 * Pin guest page in memory and return its pfn. 
1374 * @addr: host virtual address which maps memory to the guest 1375 * @atomic: whether this function can sleep 1376 * @async: whether this function need to wait IO complete if the 1377 * host page is not in the memory 1378 * @write_fault: whether we should get a writable host page 1379 * @writable: whether it allows to map a writable host page for !@write_fault 1380 * 1381 * The function will map a writable host page for these two cases: 1382 * 1): @write_fault = true 1383 * 2): @write_fault = false && @writable, @writable will tell the caller 1384 * whether the mapping is writable. 1385 */ 1386 static pfn_t hva_to_pfn(unsigned long addr, bool atomic, bool *async, 1387 bool write_fault, bool *writable) 1388 { 1389 struct vm_area_struct *vma; 1390 pfn_t pfn = 0; 1391 int npages; 1392 1393 /* we can do it either atomically or asynchronously, not both */ 1394 BUG_ON(atomic && async); 1395 1396 if (hva_to_pfn_fast(addr, atomic, async, write_fault, writable, &pfn)) 1397 return pfn; 1398 1399 if (atomic) 1400 return KVM_PFN_ERR_FAULT; 1401 1402 npages = hva_to_pfn_slow(addr, async, write_fault, writable, &pfn); 1403 if (npages == 1) 1404 return pfn; 1405 1406 down_read(¤t->mm->mmap_sem); 1407 if (npages == -EHWPOISON || 1408 (!async && check_user_page_hwpoison(addr))) { 1409 pfn = KVM_PFN_ERR_HWPOISON; 1410 goto exit; 1411 } 1412 1413 vma = find_vma_intersection(current->mm, addr, addr + 1); 1414 1415 if (vma == NULL) 1416 pfn = KVM_PFN_ERR_FAULT; 1417 else if ((vma->vm_flags & VM_PFNMAP)) { 1418 pfn = ((addr - vma->vm_start) >> PAGE_SHIFT) + 1419 vma->vm_pgoff; 1420 BUG_ON(!kvm_is_reserved_pfn(pfn)); 1421 } else { 1422 if (async && vma_is_valid(vma, write_fault)) 1423 *async = true; 1424 pfn = KVM_PFN_ERR_FAULT; 1425 } 1426 exit: 1427 up_read(¤t->mm->mmap_sem); 1428 return pfn; 1429 } 1430 1431 pfn_t __gfn_to_pfn_memslot(struct kvm_memory_slot *slot, gfn_t gfn, bool atomic, 1432 bool *async, bool write_fault, bool *writable) 1433 { 1434 unsigned long addr = 
__gfn_to_hva_many(slot, gfn, NULL, write_fault); 1435 1436 if (addr == KVM_HVA_ERR_RO_BAD) 1437 return KVM_PFN_ERR_RO_FAULT; 1438 1439 if (kvm_is_error_hva(addr)) 1440 return KVM_PFN_NOSLOT; 1441 1442 /* Do not map writable pfn in the readonly memslot. */ 1443 if (writable && memslot_is_readonly(slot)) { 1444 *writable = false; 1445 writable = NULL; 1446 } 1447 1448 return hva_to_pfn(addr, atomic, async, write_fault, 1449 writable); 1450 } 1451 EXPORT_SYMBOL_GPL(__gfn_to_pfn_memslot); 1452 1453 pfn_t gfn_to_pfn_prot(struct kvm *kvm, gfn_t gfn, bool write_fault, 1454 bool *writable) 1455 { 1456 return __gfn_to_pfn_memslot(gfn_to_memslot(kvm, gfn), gfn, false, NULL, 1457 write_fault, writable); 1458 } 1459 EXPORT_SYMBOL_GPL(gfn_to_pfn_prot); 1460 1461 pfn_t gfn_to_pfn_memslot(struct kvm_memory_slot *slot, gfn_t gfn) 1462 { 1463 return __gfn_to_pfn_memslot(slot, gfn, false, NULL, true, NULL); 1464 } 1465 EXPORT_SYMBOL_GPL(gfn_to_pfn_memslot); 1466 1467 pfn_t gfn_to_pfn_memslot_atomic(struct kvm_memory_slot *slot, gfn_t gfn) 1468 { 1469 return __gfn_to_pfn_memslot(slot, gfn, true, NULL, true, NULL); 1470 } 1471 EXPORT_SYMBOL_GPL(gfn_to_pfn_memslot_atomic); 1472 1473 pfn_t gfn_to_pfn_atomic(struct kvm *kvm, gfn_t gfn) 1474 { 1475 return gfn_to_pfn_memslot_atomic(gfn_to_memslot(kvm, gfn), gfn); 1476 } 1477 EXPORT_SYMBOL_GPL(gfn_to_pfn_atomic); 1478 1479 pfn_t kvm_vcpu_gfn_to_pfn_atomic(struct kvm_vcpu *vcpu, gfn_t gfn) 1480 { 1481 return gfn_to_pfn_memslot_atomic(kvm_vcpu_gfn_to_memslot(vcpu, gfn), gfn); 1482 } 1483 EXPORT_SYMBOL_GPL(kvm_vcpu_gfn_to_pfn_atomic); 1484 1485 pfn_t gfn_to_pfn(struct kvm *kvm, gfn_t gfn) 1486 { 1487 return gfn_to_pfn_memslot(gfn_to_memslot(kvm, gfn), gfn); 1488 } 1489 EXPORT_SYMBOL_GPL(gfn_to_pfn); 1490 1491 pfn_t kvm_vcpu_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn) 1492 { 1493 return gfn_to_pfn_memslot(kvm_vcpu_gfn_to_memslot(vcpu, gfn), gfn); 1494 } 1495 EXPORT_SYMBOL_GPL(kvm_vcpu_gfn_to_pfn); 1496 1497 int gfn_to_page_many_atomic(struct 
kvm_memory_slot *slot, gfn_t gfn, 1498 struct page **pages, int nr_pages) 1499 { 1500 unsigned long addr; 1501 gfn_t entry; 1502 1503 addr = gfn_to_hva_many(slot, gfn, &entry); 1504 if (kvm_is_error_hva(addr)) 1505 return -1; 1506 1507 if (entry < nr_pages) 1508 return 0; 1509 1510 return __get_user_pages_fast(addr, nr_pages, 1, pages); 1511 } 1512 EXPORT_SYMBOL_GPL(gfn_to_page_many_atomic); 1513 1514 static struct page *kvm_pfn_to_page(pfn_t pfn) 1515 { 1516 if (is_error_noslot_pfn(pfn)) 1517 return KVM_ERR_PTR_BAD_PAGE; 1518 1519 if (kvm_is_reserved_pfn(pfn)) { 1520 WARN_ON(1); 1521 return KVM_ERR_PTR_BAD_PAGE; 1522 } 1523 1524 return pfn_to_page(pfn); 1525 } 1526 1527 struct page *gfn_to_page(struct kvm *kvm, gfn_t gfn) 1528 { 1529 pfn_t pfn; 1530 1531 pfn = gfn_to_pfn(kvm, gfn); 1532 1533 return kvm_pfn_to_page(pfn); 1534 } 1535 EXPORT_SYMBOL_GPL(gfn_to_page); 1536 1537 struct page *kvm_vcpu_gfn_to_page(struct kvm_vcpu *vcpu, gfn_t gfn) 1538 { 1539 pfn_t pfn; 1540 1541 pfn = kvm_vcpu_gfn_to_pfn(vcpu, gfn); 1542 1543 return kvm_pfn_to_page(pfn); 1544 } 1545 EXPORT_SYMBOL_GPL(kvm_vcpu_gfn_to_page); 1546 1547 void kvm_release_page_clean(struct page *page) 1548 { 1549 WARN_ON(is_error_page(page)); 1550 1551 kvm_release_pfn_clean(page_to_pfn(page)); 1552 } 1553 EXPORT_SYMBOL_GPL(kvm_release_page_clean); 1554 1555 void kvm_release_pfn_clean(pfn_t pfn) 1556 { 1557 if (!is_error_noslot_pfn(pfn) && !kvm_is_reserved_pfn(pfn)) 1558 put_page(pfn_to_page(pfn)); 1559 } 1560 EXPORT_SYMBOL_GPL(kvm_release_pfn_clean); 1561 1562 void kvm_release_page_dirty(struct page *page) 1563 { 1564 WARN_ON(is_error_page(page)); 1565 1566 kvm_release_pfn_dirty(page_to_pfn(page)); 1567 } 1568 EXPORT_SYMBOL_GPL(kvm_release_page_dirty); 1569 1570 static void kvm_release_pfn_dirty(pfn_t pfn) 1571 { 1572 kvm_set_pfn_dirty(pfn); 1573 kvm_release_pfn_clean(pfn); 1574 } 1575 1576 void kvm_set_pfn_dirty(pfn_t pfn) 1577 { 1578 if (!kvm_is_reserved_pfn(pfn)) { 1579 struct page *page = pfn_to_page(pfn); 
1580 1581 if (!PageReserved(page)) 1582 SetPageDirty(page); 1583 } 1584 } 1585 EXPORT_SYMBOL_GPL(kvm_set_pfn_dirty); 1586 1587 void kvm_set_pfn_accessed(pfn_t pfn) 1588 { 1589 if (!kvm_is_reserved_pfn(pfn)) 1590 mark_page_accessed(pfn_to_page(pfn)); 1591 } 1592 EXPORT_SYMBOL_GPL(kvm_set_pfn_accessed); 1593 1594 void kvm_get_pfn(pfn_t pfn) 1595 { 1596 if (!kvm_is_reserved_pfn(pfn)) 1597 get_page(pfn_to_page(pfn)); 1598 } 1599 EXPORT_SYMBOL_GPL(kvm_get_pfn); 1600 1601 static int next_segment(unsigned long len, int offset) 1602 { 1603 if (len > PAGE_SIZE - offset) 1604 return PAGE_SIZE - offset; 1605 else 1606 return len; 1607 } 1608 1609 static int __kvm_read_guest_page(struct kvm_memory_slot *slot, gfn_t gfn, 1610 void *data, int offset, int len) 1611 { 1612 int r; 1613 unsigned long addr; 1614 1615 addr = gfn_to_hva_memslot_prot(slot, gfn, NULL); 1616 if (kvm_is_error_hva(addr)) 1617 return -EFAULT; 1618 r = __copy_from_user(data, (void __user *)addr + offset, len); 1619 if (r) 1620 return -EFAULT; 1621 return 0; 1622 } 1623 1624 int kvm_read_guest_page(struct kvm *kvm, gfn_t gfn, void *data, int offset, 1625 int len) 1626 { 1627 struct kvm_memory_slot *slot = gfn_to_memslot(kvm, gfn); 1628 1629 return __kvm_read_guest_page(slot, gfn, data, offset, len); 1630 } 1631 EXPORT_SYMBOL_GPL(kvm_read_guest_page); 1632 1633 int kvm_vcpu_read_guest_page(struct kvm_vcpu *vcpu, gfn_t gfn, void *data, 1634 int offset, int len) 1635 { 1636 struct kvm_memory_slot *slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn); 1637 1638 return __kvm_read_guest_page(slot, gfn, data, offset, len); 1639 } 1640 EXPORT_SYMBOL_GPL(kvm_vcpu_read_guest_page); 1641 1642 int kvm_read_guest(struct kvm *kvm, gpa_t gpa, void *data, unsigned long len) 1643 { 1644 gfn_t gfn = gpa >> PAGE_SHIFT; 1645 int seg; 1646 int offset = offset_in_page(gpa); 1647 int ret; 1648 1649 while ((seg = next_segment(len, offset)) != 0) { 1650 ret = kvm_read_guest_page(kvm, gfn, data, offset, seg); 1651 if (ret < 0) 1652 return ret; 
1653 offset = 0; 1654 len -= seg; 1655 data += seg; 1656 ++gfn; 1657 } 1658 return 0; 1659 } 1660 EXPORT_SYMBOL_GPL(kvm_read_guest); 1661 1662 int kvm_vcpu_read_guest(struct kvm_vcpu *vcpu, gpa_t gpa, void *data, unsigned long len) 1663 { 1664 gfn_t gfn = gpa >> PAGE_SHIFT; 1665 int seg; 1666 int offset = offset_in_page(gpa); 1667 int ret; 1668 1669 while ((seg = next_segment(len, offset)) != 0) { 1670 ret = kvm_vcpu_read_guest_page(vcpu, gfn, data, offset, seg); 1671 if (ret < 0) 1672 return ret; 1673 offset = 0; 1674 len -= seg; 1675 data += seg; 1676 ++gfn; 1677 } 1678 return 0; 1679 } 1680 EXPORT_SYMBOL_GPL(kvm_vcpu_read_guest); 1681 1682 static int __kvm_read_guest_atomic(struct kvm_memory_slot *slot, gfn_t gfn, 1683 void *data, int offset, unsigned long len) 1684 { 1685 int r; 1686 unsigned long addr; 1687 1688 addr = gfn_to_hva_memslot_prot(slot, gfn, NULL); 1689 if (kvm_is_error_hva(addr)) 1690 return -EFAULT; 1691 pagefault_disable(); 1692 r = __copy_from_user_inatomic(data, (void __user *)addr + offset, len); 1693 pagefault_enable(); 1694 if (r) 1695 return -EFAULT; 1696 return 0; 1697 } 1698 1699 int kvm_read_guest_atomic(struct kvm *kvm, gpa_t gpa, void *data, 1700 unsigned long len) 1701 { 1702 gfn_t gfn = gpa >> PAGE_SHIFT; 1703 struct kvm_memory_slot *slot = gfn_to_memslot(kvm, gfn); 1704 int offset = offset_in_page(gpa); 1705 1706 return __kvm_read_guest_atomic(slot, gfn, data, offset, len); 1707 } 1708 EXPORT_SYMBOL_GPL(kvm_read_guest_atomic); 1709 1710 int kvm_vcpu_read_guest_atomic(struct kvm_vcpu *vcpu, gpa_t gpa, 1711 void *data, unsigned long len) 1712 { 1713 gfn_t gfn = gpa >> PAGE_SHIFT; 1714 struct kvm_memory_slot *slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn); 1715 int offset = offset_in_page(gpa); 1716 1717 return __kvm_read_guest_atomic(slot, gfn, data, offset, len); 1718 } 1719 EXPORT_SYMBOL_GPL(kvm_vcpu_read_guest_atomic); 1720 1721 static int __kvm_write_guest_page(struct kvm_memory_slot *memslot, gfn_t gfn, 1722 const void *data, int 
offset, int len) 1723 { 1724 int r; 1725 unsigned long addr; 1726 1727 addr = gfn_to_hva_memslot(memslot, gfn); 1728 if (kvm_is_error_hva(addr)) 1729 return -EFAULT; 1730 r = __copy_to_user((void __user *)addr + offset, data, len); 1731 if (r) 1732 return -EFAULT; 1733 mark_page_dirty_in_slot(memslot, gfn); 1734 return 0; 1735 } 1736 1737 int kvm_write_guest_page(struct kvm *kvm, gfn_t gfn, 1738 const void *data, int offset, int len) 1739 { 1740 struct kvm_memory_slot *slot = gfn_to_memslot(kvm, gfn); 1741 1742 return __kvm_write_guest_page(slot, gfn, data, offset, len); 1743 } 1744 EXPORT_SYMBOL_GPL(kvm_write_guest_page); 1745 1746 int kvm_vcpu_write_guest_page(struct kvm_vcpu *vcpu, gfn_t gfn, 1747 const void *data, int offset, int len) 1748 { 1749 struct kvm_memory_slot *slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn); 1750 1751 return __kvm_write_guest_page(slot, gfn, data, offset, len); 1752 } 1753 EXPORT_SYMBOL_GPL(kvm_vcpu_write_guest_page); 1754 1755 int kvm_write_guest(struct kvm *kvm, gpa_t gpa, const void *data, 1756 unsigned long len) 1757 { 1758 gfn_t gfn = gpa >> PAGE_SHIFT; 1759 int seg; 1760 int offset = offset_in_page(gpa); 1761 int ret; 1762 1763 while ((seg = next_segment(len, offset)) != 0) { 1764 ret = kvm_write_guest_page(kvm, gfn, data, offset, seg); 1765 if (ret < 0) 1766 return ret; 1767 offset = 0; 1768 len -= seg; 1769 data += seg; 1770 ++gfn; 1771 } 1772 return 0; 1773 } 1774 EXPORT_SYMBOL_GPL(kvm_write_guest); 1775 1776 int kvm_vcpu_write_guest(struct kvm_vcpu *vcpu, gpa_t gpa, const void *data, 1777 unsigned long len) 1778 { 1779 gfn_t gfn = gpa >> PAGE_SHIFT; 1780 int seg; 1781 int offset = offset_in_page(gpa); 1782 int ret; 1783 1784 while ((seg = next_segment(len, offset)) != 0) { 1785 ret = kvm_vcpu_write_guest_page(vcpu, gfn, data, offset, seg); 1786 if (ret < 0) 1787 return ret; 1788 offset = 0; 1789 len -= seg; 1790 data += seg; 1791 ++gfn; 1792 } 1793 return 0; 1794 } 1795 EXPORT_SYMBOL_GPL(kvm_vcpu_write_guest); 1796 1797 int 
kvm_gfn_to_hva_cache_init(struct kvm *kvm, struct gfn_to_hva_cache *ghc, 1798 gpa_t gpa, unsigned long len) 1799 { 1800 struct kvm_memslots *slots = kvm_memslots(kvm); 1801 int offset = offset_in_page(gpa); 1802 gfn_t start_gfn = gpa >> PAGE_SHIFT; 1803 gfn_t end_gfn = (gpa + len - 1) >> PAGE_SHIFT; 1804 gfn_t nr_pages_needed = end_gfn - start_gfn + 1; 1805 gfn_t nr_pages_avail; 1806 1807 ghc->gpa = gpa; 1808 ghc->generation = slots->generation; 1809 ghc->len = len; 1810 ghc->memslot = gfn_to_memslot(kvm, start_gfn); 1811 ghc->hva = gfn_to_hva_many(ghc->memslot, start_gfn, NULL); 1812 if (!kvm_is_error_hva(ghc->hva) && nr_pages_needed <= 1) { 1813 ghc->hva += offset; 1814 } else { 1815 /* 1816 * If the requested region crosses two memslots, we still 1817 * verify that the entire region is valid here. 1818 */ 1819 while (start_gfn <= end_gfn) { 1820 ghc->memslot = gfn_to_memslot(kvm, start_gfn); 1821 ghc->hva = gfn_to_hva_many(ghc->memslot, start_gfn, 1822 &nr_pages_avail); 1823 if (kvm_is_error_hva(ghc->hva)) 1824 return -EFAULT; 1825 start_gfn += nr_pages_avail; 1826 } 1827 /* Use the slow path for cross page reads and writes. 
*/ 1828 ghc->memslot = NULL; 1829 } 1830 return 0; 1831 } 1832 EXPORT_SYMBOL_GPL(kvm_gfn_to_hva_cache_init); 1833 1834 int kvm_write_guest_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc, 1835 void *data, unsigned long len) 1836 { 1837 struct kvm_memslots *slots = kvm_memslots(kvm); 1838 int r; 1839 1840 BUG_ON(len > ghc->len); 1841 1842 if (slots->generation != ghc->generation) 1843 kvm_gfn_to_hva_cache_init(kvm, ghc, ghc->gpa, ghc->len); 1844 1845 if (unlikely(!ghc->memslot)) 1846 return kvm_write_guest(kvm, ghc->gpa, data, len); 1847 1848 if (kvm_is_error_hva(ghc->hva)) 1849 return -EFAULT; 1850 1851 r = __copy_to_user((void __user *)ghc->hva, data, len); 1852 if (r) 1853 return -EFAULT; 1854 mark_page_dirty_in_slot(ghc->memslot, ghc->gpa >> PAGE_SHIFT); 1855 1856 return 0; 1857 } 1858 EXPORT_SYMBOL_GPL(kvm_write_guest_cached); 1859 1860 int kvm_read_guest_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc, 1861 void *data, unsigned long len) 1862 { 1863 struct kvm_memslots *slots = kvm_memslots(kvm); 1864 int r; 1865 1866 BUG_ON(len > ghc->len); 1867 1868 if (slots->generation != ghc->generation) 1869 kvm_gfn_to_hva_cache_init(kvm, ghc, ghc->gpa, ghc->len); 1870 1871 if (unlikely(!ghc->memslot)) 1872 return kvm_read_guest(kvm, ghc->gpa, data, len); 1873 1874 if (kvm_is_error_hva(ghc->hva)) 1875 return -EFAULT; 1876 1877 r = __copy_from_user(data, (void __user *)ghc->hva, len); 1878 if (r) 1879 return -EFAULT; 1880 1881 return 0; 1882 } 1883 EXPORT_SYMBOL_GPL(kvm_read_guest_cached); 1884 1885 int kvm_clear_guest_page(struct kvm *kvm, gfn_t gfn, int offset, int len) 1886 { 1887 const void *zero_page = (const void *) __va(page_to_phys(ZERO_PAGE(0))); 1888 1889 return kvm_write_guest_page(kvm, gfn, zero_page, offset, len); 1890 } 1891 EXPORT_SYMBOL_GPL(kvm_clear_guest_page); 1892 1893 int kvm_clear_guest(struct kvm *kvm, gpa_t gpa, unsigned long len) 1894 { 1895 gfn_t gfn = gpa >> PAGE_SHIFT; 1896 int seg; 1897 int offset = offset_in_page(gpa); 1898 int ret; 
1899 1900 while ((seg = next_segment(len, offset)) != 0) { 1901 ret = kvm_clear_guest_page(kvm, gfn, offset, seg); 1902 if (ret < 0) 1903 return ret; 1904 offset = 0; 1905 len -= seg; 1906 ++gfn; 1907 } 1908 return 0; 1909 } 1910 EXPORT_SYMBOL_GPL(kvm_clear_guest); 1911 1912 static void mark_page_dirty_in_slot(struct kvm_memory_slot *memslot, 1913 gfn_t gfn) 1914 { 1915 if (memslot && memslot->dirty_bitmap) { 1916 unsigned long rel_gfn = gfn - memslot->base_gfn; 1917 1918 set_bit_le(rel_gfn, memslot->dirty_bitmap); 1919 } 1920 } 1921 1922 void mark_page_dirty(struct kvm *kvm, gfn_t gfn) 1923 { 1924 struct kvm_memory_slot *memslot; 1925 1926 memslot = gfn_to_memslot(kvm, gfn); 1927 mark_page_dirty_in_slot(memslot, gfn); 1928 } 1929 EXPORT_SYMBOL_GPL(mark_page_dirty); 1930 1931 void kvm_vcpu_mark_page_dirty(struct kvm_vcpu *vcpu, gfn_t gfn) 1932 { 1933 struct kvm_memory_slot *memslot; 1934 1935 memslot = kvm_vcpu_gfn_to_memslot(vcpu, gfn); 1936 mark_page_dirty_in_slot(memslot, gfn); 1937 } 1938 EXPORT_SYMBOL_GPL(kvm_vcpu_mark_page_dirty); 1939 1940 static int kvm_vcpu_check_block(struct kvm_vcpu *vcpu) 1941 { 1942 if (kvm_arch_vcpu_runnable(vcpu)) { 1943 kvm_make_request(KVM_REQ_UNHALT, vcpu); 1944 return -EINTR; 1945 } 1946 if (kvm_cpu_has_pending_timer(vcpu)) 1947 return -EINTR; 1948 if (signal_pending(current)) 1949 return -EINTR; 1950 1951 return 0; 1952 } 1953 1954 /* 1955 * The vCPU has executed a HLT instruction with in-kernel mode enabled. 1956 */ 1957 void kvm_vcpu_block(struct kvm_vcpu *vcpu) 1958 { 1959 ktime_t start, cur; 1960 DEFINE_WAIT(wait); 1961 bool waited = false; 1962 1963 start = cur = ktime_get(); 1964 if (halt_poll_ns) { 1965 ktime_t stop = ktime_add_ns(ktime_get(), halt_poll_ns); 1966 1967 do { 1968 /* 1969 * This sets KVM_REQ_UNHALT if an interrupt 1970 * arrives. 
1971 */ 1972 if (kvm_vcpu_check_block(vcpu) < 0) { 1973 ++vcpu->stat.halt_successful_poll; 1974 goto out; 1975 } 1976 cur = ktime_get(); 1977 } while (single_task_running() && ktime_before(cur, stop)); 1978 } 1979 1980 for (;;) { 1981 prepare_to_wait(&vcpu->wq, &wait, TASK_INTERRUPTIBLE); 1982 1983 if (kvm_vcpu_check_block(vcpu) < 0) 1984 break; 1985 1986 waited = true; 1987 schedule(); 1988 } 1989 1990 finish_wait(&vcpu->wq, &wait); 1991 cur = ktime_get(); 1992 1993 out: 1994 trace_kvm_vcpu_wakeup(ktime_to_ns(cur) - ktime_to_ns(start), waited); 1995 } 1996 EXPORT_SYMBOL_GPL(kvm_vcpu_block); 1997 1998 #ifndef CONFIG_S390 1999 /* 2000 * Kick a sleeping VCPU, or a guest VCPU in guest mode, into host kernel mode. 2001 */ 2002 void kvm_vcpu_kick(struct kvm_vcpu *vcpu) 2003 { 2004 int me; 2005 int cpu = vcpu->cpu; 2006 wait_queue_head_t *wqp; 2007 2008 wqp = kvm_arch_vcpu_wq(vcpu); 2009 if (waitqueue_active(wqp)) { 2010 wake_up_interruptible(wqp); 2011 ++vcpu->stat.halt_wakeup; 2012 } 2013 2014 me = get_cpu(); 2015 if (cpu != me && (unsigned)cpu < nr_cpu_ids && cpu_online(cpu)) 2016 if (kvm_arch_vcpu_should_kick(vcpu)) 2017 smp_send_reschedule(cpu); 2018 put_cpu(); 2019 } 2020 EXPORT_SYMBOL_GPL(kvm_vcpu_kick); 2021 #endif /* !CONFIG_S390 */ 2022 2023 int kvm_vcpu_yield_to(struct kvm_vcpu *target) 2024 { 2025 struct pid *pid; 2026 struct task_struct *task = NULL; 2027 int ret = 0; 2028 2029 rcu_read_lock(); 2030 pid = rcu_dereference(target->pid); 2031 if (pid) 2032 task = get_pid_task(pid, PIDTYPE_PID); 2033 rcu_read_unlock(); 2034 if (!task) 2035 return ret; 2036 ret = yield_to(task, 1); 2037 put_task_struct(task); 2038 2039 return ret; 2040 } 2041 EXPORT_SYMBOL_GPL(kvm_vcpu_yield_to); 2042 2043 /* 2044 * Helper that checks whether a VCPU is eligible for directed yield. 
* Most eligible candidate to yield is decided by following heuristics:
 *
 *  (a) VCPU which has not done pl-exit or cpu relax intercepted recently
 *  (preempted lock holder), indicated by @in_spin_loop.
 *  Set at the beginning and cleared at the end of interception/PLE handler.
 *
 *  (b) VCPU which has done pl-exit/ cpu relax intercepted but did not get
 *  chance last time (mostly it has become eligible now since we have probably
 *  yielded to lockholder in last iteration. This is done by toggling
 *  @dy_eligible each time a VCPU checked for eligibility.)
 *
 *  Yielding to a recently pl-exited/cpu relax intercepted VCPU before yielding
 *  to preempted lock-holder could result in wrong VCPU selection and CPU
 *  burning. Giving priority for a potential lock-holder increases lock
 *  progress.
 *
 *  Since algorithm is based on heuristics, accessing another VCPU data without
 *  locking does not harm. It may result in trying to yield to same VCPU, fail
 *  and continue with next VCPU and so on.
 */
static bool kvm_vcpu_eligible_for_directed_yield(struct kvm_vcpu *vcpu)
{
#ifdef CONFIG_HAVE_KVM_CPU_RELAX_INTERCEPT
	bool eligible;

	eligible = !vcpu->spin_loop.in_spin_loop ||
		    vcpu->spin_loop.dy_eligible;

	/* Toggle dy_eligible for a vcpu that is itself spinning. */
	if (vcpu->spin_loop.in_spin_loop)
		kvm_vcpu_set_dy_eligible(vcpu, !vcpu->spin_loop.dy_eligible);

	return eligible;
#else
	/* Without the config option every vcpu is treated as eligible. */
	return true;
#endif
}

/*
 * Called for a vcpu (@me) that is spinning: try to yield to another vcpu
 * of the same VM.  At most two passes over the vcpus are made and the
 * search gives up after three failed (negative) yield attempts.
 */
void kvm_vcpu_on_spin(struct kvm_vcpu *me)
{
	struct kvm *kvm = me->kvm;
	struct kvm_vcpu *vcpu;
	int last_boosted_vcpu = me->kvm->last_boosted_vcpu;
	int yielded = 0;
	int try = 3;
	int pass;
	int i;

	kvm_vcpu_set_in_spin_loop(me, true);
	/*
	 * We boost the priority of a VCPU that is runnable but not
	 * currently running, because it got preempted by something
	 * else and called schedule in __vcpu_run.  Hopefully that
	 * VCPU is holding the lock that we need and will release it.
	 * We approximate round-robin by starting at the last boosted VCPU.
	 */
	for (pass = 0; pass < 2 && !yielded && try; pass++) {
		kvm_for_each_vcpu(i, vcpu, kvm) {
			/*
			 * Pass 0 covers the vcpus after last_boosted_vcpu,
			 * pass 1 the ones up to and including it.
			 */
			if (!pass && i <= last_boosted_vcpu) {
				i = last_boosted_vcpu;
				continue;
			} else if (pass && i > last_boosted_vcpu)
				break;
			if (!ACCESS_ONCE(vcpu->preempted))
				continue;
			if (vcpu == me)
				continue;
			if (waitqueue_active(&vcpu->wq) && !kvm_arch_vcpu_runnable(vcpu))
				continue;
			if (!kvm_vcpu_eligible_for_directed_yield(vcpu))
				continue;

			yielded = kvm_vcpu_yield_to(vcpu);
			if (yielded > 0) {
				kvm->last_boosted_vcpu = i;
				break;
			} else if (yielded < 0) {
				try--;
				if (!try)
					break;
			}
		}
	}
	kvm_vcpu_set_in_spin_loop(me, false);

	/* Ensure vcpu is not eligible during next spinloop */
	kvm_vcpu_set_dy_eligible(me, false);
}
EXPORT_SYMBOL_GPL(kvm_vcpu_on_spin);

/*
 * Page-fault handler for mmap()ed vcpu file descriptors.  Page offset 0
 * maps vcpu->run; the PIO data page and the coalesced MMIO ring are
 * mapped at their own offsets where configured.  Any other offset is
 * passed to kvm_arch_vcpu_fault().
 */
static int kvm_vcpu_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
{
	struct kvm_vcpu *vcpu = vma->vm_file->private_data;
	struct page *page;

	if (vmf->pgoff == 0)
		page = virt_to_page(vcpu->run);
#ifdef CONFIG_X86
	else if (vmf->pgoff == KVM_PIO_PAGE_OFFSET)
		page = virt_to_page(vcpu->arch.pio_data);
#endif
#ifdef KVM_COALESCED_MMIO_PAGE_OFFSET
	else if (vmf->pgoff == KVM_COALESCED_MMIO_PAGE_OFFSET)
		page = virt_to_page(vcpu->kvm->coalesced_mmio_ring);
#endif
	else
		return kvm_arch_vcpu_fault(vcpu, vmf);
	/* Take a reference on the page handed back through vmf->page. */
	get_page(page);
	vmf->page = page;
	return 0;
}

/* VM operations for vcpu mappings: only a fault handler is provided. */
static const struct vm_operations_struct kvm_vcpu_vm_ops = {
	.fault = kvm_vcpu_fault,
};

/* mmap() on a vcpu fd: install kvm_vcpu_vm_ops on the VMA. */
static int kvm_vcpu_mmap(struct file *file, struct vm_area_struct *vma)
{
	vma->vm_ops = &kvm_vcpu_vm_ops;
	return 0;
}

/* Release of a vcpu fd: drop the VM reference taken at vcpu creation. */
static int kvm_vcpu_release(struct inode *inode, struct file *filp)
{
	struct kvm_vcpu *vcpu = filp->private_data;

	kvm_put_kvm(vcpu->kvm);
	return 0;
}

/* File operations of a vcpu file descriptor. */
static struct file_operations kvm_vcpu_fops = {
	.release        = kvm_vcpu_release,
	.unlocked_ioctl = kvm_vcpu_ioctl,
#ifdef CONFIG_KVM_COMPAT
	.compat_ioctl   = kvm_vcpu_compat_ioctl,
#endif
	.mmap           = kvm_vcpu_mmap,
	.llseek		= noop_llseek,
};

/*
 * Allocates an inode for the vcpu.
 */
static int create_vcpu_fd(struct kvm_vcpu *vcpu)
{
	return anon_inode_getfd("kvm-vcpu", &kvm_vcpu_fops, vcpu, O_RDWR | O_CLOEXEC);
}

/*
 * Creates some virtual cpus.  Good luck creating more than one.
 *
 * Returns the new vcpu file descriptor on success.  Errors visible here:
 * -EINVAL for an @id at or above KVM_MAX_VCPUS, an incompatible vcpu, or
 * a VM that already has KVM_MAX_VCPUS vcpus; -EEXIST for a duplicate @id;
 * otherwise the error from the arch hooks or from fd creation.
 */
static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, u32 id)
{
	int r;
	struct kvm_vcpu *vcpu, *v;

	if (id >= KVM_MAX_VCPUS)
		return -EINVAL;

	vcpu = kvm_arch_vcpu_create(kvm, id);
	if (IS_ERR(vcpu))
		return PTR_ERR(vcpu);

	preempt_notifier_init(&vcpu->preempt_notifier, &kvm_preempt_ops);

	r = kvm_arch_vcpu_setup(vcpu);
	if (r)
		goto vcpu_destroy;

	mutex_lock(&kvm->lock);
	if (!kvm_vcpu_compatible(vcpu)) {
		r = -EINVAL;
		goto unlock_vcpu_destroy;
	}
	if (atomic_read(&kvm->online_vcpus) == KVM_MAX_VCPUS) {
		r = -EINVAL;
		goto unlock_vcpu_destroy;
	}

	/* r is only used as the loop index here. */
	kvm_for_each_vcpu(r, v, kvm)
		if (v->vcpu_id == id) {
			r = -EEXIST;
			goto unlock_vcpu_destroy;
		}

	BUG_ON(kvm->vcpus[atomic_read(&kvm->online_vcpus)]);

	/* Now it's all set up, let userspace reach it */
	kvm_get_kvm(kvm);
	r = create_vcpu_fd(vcpu);
	if (r < 0) {
		kvm_put_kvm(kvm);
		goto unlock_vcpu_destroy;
	}

	kvm->vcpus[atomic_read(&kvm->online_vcpus)] = vcpu;

	/*
	 * Pairs with smp_rmb() in kvm_get_vcpu.  Write kvm->vcpus
	 * before kvm->online_vcpu's incremented value.
	 */
	smp_wmb();
	atomic_inc(&kvm->online_vcpus);

	mutex_unlock(&kvm->lock);
	kvm_arch_vcpu_postcreate(vcpu);
	return r;

unlock_vcpu_destroy:
	mutex_unlock(&kvm->lock);
vcpu_destroy:
	kvm_arch_vcpu_destroy(vcpu);
	return r;
}

/*
 * Install @sigset as the vcpu's signal mask, with SIGKILL and SIGSTOP
 * removed from it, or deactivate the mask when @sigset is NULL.
 * Always returns 0.
 */
static int kvm_vcpu_ioctl_set_sigmask(struct kvm_vcpu *vcpu, sigset_t *sigset)
{
	if (sigset) {
		sigdelsetmask(sigset, sigmask(SIGKILL)|sigmask(SIGSTOP));
		vcpu->sigset_active = 1;
		vcpu->sigset = *sigset;
	} else
		vcpu->sigset_active = 0;
	return 0;
}

/*
 * ioctl entry point of a vcpu file descriptor.
 *
 * Rejects callers whose mm differs from the VM's (-EIO) and ioctls whose
 * type is not KVMIO (-EINVAL).  Apart from the asynchronous special
 * cases below, the vcpu is loaded for the duration of the call.
 * Commands not handled here are passed to kvm_arch_vcpu_ioctl().
 */
static long kvm_vcpu_ioctl(struct file *filp,
			   unsigned int ioctl, unsigned long arg)
{
	struct kvm_vcpu *vcpu = filp->private_data;
	void __user *argp = (void __user *)arg;
	int r;
	/* Freed unconditionally at "out"; NULL when unused. */
	struct kvm_fpu *fpu = NULL;
	struct kvm_sregs *kvm_sregs = NULL;

	if (vcpu->kvm->mm != current->mm)
		return -EIO;

	if (unlikely(_IOC_TYPE(ioctl) != KVMIO))
		return -EINVAL;

#if defined(CONFIG_S390) || defined(CONFIG_PPC) || defined(CONFIG_MIPS)
	/*
	 * Special cases: vcpu ioctls that are asynchronous to vcpu execution,
	 * so vcpu_load() would break it.
	 */
	if (ioctl == KVM_S390_INTERRUPT || ioctl == KVM_S390_IRQ || ioctl == KVM_INTERRUPT)
		return kvm_arch_vcpu_ioctl(filp, ioctl, arg);
#endif


	r = vcpu_load(vcpu);
	if (r)
		return r;
	switch (ioctl) {
	case KVM_RUN:
		r = -EINVAL;
		if (arg)
			goto out;
		if (unlikely(vcpu->pid != current->pids[PIDTYPE_PID].pid)) {
			/* The thread running this VCPU changed. */
			struct pid *oldpid = vcpu->pid;
			struct pid *newpid = get_task_pid(current, PIDTYPE_PID);

			rcu_assign_pointer(vcpu->pid, newpid);
			/* Wait for RCU readers before dropping the old pid. */
			if (oldpid)
				synchronize_rcu();
			put_pid(oldpid);
		}
		r = kvm_arch_vcpu_ioctl_run(vcpu, vcpu->run);
		trace_kvm_userspace_exit(vcpu->run->exit_reason, r);
		break;
	case KVM_GET_REGS: {
		struct kvm_regs *kvm_regs;

		r = -ENOMEM;
		kvm_regs = kzalloc(sizeof(struct kvm_regs), GFP_KERNEL);
		if (!kvm_regs)
			goto out;
		r = kvm_arch_vcpu_ioctl_get_regs(vcpu, kvm_regs);
		if (r)
			goto out_free1;
		r = -EFAULT;
		if (copy_to_user(argp, kvm_regs, sizeof(struct kvm_regs)))
			goto out_free1;
		r = 0;
out_free1:
		kfree(kvm_regs);
		break;
	}
	case KVM_SET_REGS: {
		struct kvm_regs *kvm_regs;

		r = -ENOMEM;
		kvm_regs = memdup_user(argp, sizeof(*kvm_regs));
		if (IS_ERR(kvm_regs)) {
			r = PTR_ERR(kvm_regs);
			goto out;
		}
		r = kvm_arch_vcpu_ioctl_set_regs(vcpu, kvm_regs);
		kfree(kvm_regs);
		break;
	}
	case KVM_GET_SREGS: {
		kvm_sregs = kzalloc(sizeof(struct kvm_sregs), GFP_KERNEL);
		r = -ENOMEM;
		if (!kvm_sregs)
			goto out;
		r = kvm_arch_vcpu_ioctl_get_sregs(vcpu, kvm_sregs);
		if (r)
			goto out;
		r = -EFAULT;
		if (copy_to_user(argp, kvm_sregs, sizeof(struct kvm_sregs)))
			goto out;
		r = 0;
		break;
	}
	case KVM_SET_SREGS: {
		kvm_sregs = memdup_user(argp, sizeof(*kvm_sregs));
		if (IS_ERR(kvm_sregs)) {
			r = PTR_ERR(kvm_sregs);
			/* Do not hand an error pointer to kfree() at "out". */
			kvm_sregs = NULL;
			goto out;
		}
		r = kvm_arch_vcpu_ioctl_set_sregs(vcpu, kvm_sregs);
		break;
	}
	case KVM_GET_MP_STATE: {
		struct kvm_mp_state mp_state;

		r = kvm_arch_vcpu_ioctl_get_mpstate(vcpu, &mp_state);
		if (r)
			goto out;
		r = -EFAULT;
		if (copy_to_user(argp, &mp_state, sizeof(mp_state)))
			goto out;
		r = 0;
		break;
	}
	case KVM_SET_MP_STATE: {
		struct kvm_mp_state mp_state;

		r = -EFAULT;
		if (copy_from_user(&mp_state, argp, sizeof(mp_state)))
			goto out;
		r = kvm_arch_vcpu_ioctl_set_mpstate(vcpu, &mp_state);
		break;
	}
	case KVM_TRANSLATE: {
		struct kvm_translation tr;

		r = -EFAULT;
		if (copy_from_user(&tr, argp, sizeof(tr)))
			goto out;
		r = kvm_arch_vcpu_ioctl_translate(vcpu, &tr);
		if (r)
			goto out;
		r = -EFAULT;
		if (copy_to_user(argp, &tr, sizeof(tr)))
			goto out;
		r = 0;
		break;
	}
	case KVM_SET_GUEST_DEBUG: {
		struct kvm_guest_debug dbg;

		r = -EFAULT;
		if (copy_from_user(&dbg, argp, sizeof(dbg)))
			goto out;
		r = kvm_arch_vcpu_ioctl_set_guest_debug(vcpu, &dbg);
		break;
	}
	case KVM_SET_SIGNAL_MASK: {
		struct kvm_signal_mask __user *sigmask_arg = argp;
		struct kvm_signal_mask kvm_sigmask;
		sigset_t sigset, *p;

		/* A NULL argument deactivates the vcpu signal mask. */
		p = NULL;
		if (argp) {
			r = -EFAULT;
			if (copy_from_user(&kvm_sigmask, argp,
					   sizeof(kvm_sigmask)))
				goto out;
			r = -EINVAL;
			if (kvm_sigmask.len != sizeof(sigset))
				goto out;
			r = -EFAULT;
			if (copy_from_user(&sigset, sigmask_arg->sigset,
					   sizeof(sigset)))
				goto out;
			p = &sigset;
		}
		r = kvm_vcpu_ioctl_set_sigmask(vcpu, p);
		break;
	}
	case KVM_GET_FPU: {
		fpu = kzalloc(sizeof(struct kvm_fpu), GFP_KERNEL);
		r = -ENOMEM;
		if (!fpu)
			goto out;
		r = kvm_arch_vcpu_ioctl_get_fpu(vcpu, fpu);
		if (r)
			goto out;
		r = -EFAULT;
		if (copy_to_user(argp, fpu, sizeof(struct kvm_fpu)))
			goto out;
		r = 0;
		break;
	}
	case KVM_SET_FPU: {
		fpu = memdup_user(argp, sizeof(*fpu));
		if (IS_ERR(fpu)) {
			r = PTR_ERR(fpu);
			/* Do not hand an error pointer to kfree() at "out". */
			fpu = NULL;
			goto out;
		}
		r = kvm_arch_vcpu_ioctl_set_fpu(vcpu, fpu);
		break;
	}
	default:
		r = kvm_arch_vcpu_ioctl(filp, ioctl, arg);
	}
out:
	vcpu_put(vcpu);
	kfree(fpu);
	kfree(kvm_sregs);
	return r;
}

#ifdef CONFIG_KVM_COMPAT
/*
 * Compat ioctl entry point of a vcpu file descriptor.  Only
 * KVM_SET_SIGNAL_MASK is handled here, because it carries a
 * compat_sigset_t that must be converted; every other command is
 * forwarded to kvm_vcpu_ioctl().
 */
static long kvm_vcpu_compat_ioctl(struct file *filp,
				  unsigned int ioctl, unsigned long arg)
{
	struct kvm_vcpu *vcpu = filp->private_data;
	void __user *argp = compat_ptr(arg);
	int r;

	if (vcpu->kvm->mm != current->mm)
		return -EIO;

	switch (ioctl) {
	case KVM_SET_SIGNAL_MASK: {
		struct kvm_signal_mask __user *sigmask_arg = argp;
		struct kvm_signal_mask kvm_sigmask;
		compat_sigset_t csigset;
		sigset_t sigset;

		if (argp) {
			r = -EFAULT;
			if (copy_from_user(&kvm_sigmask, argp,
					   sizeof(kvm_sigmask)))
				goto out;
			r = -EINVAL;
			if (kvm_sigmask.len != sizeof(csigset))
				goto out;
			r = -EFAULT;
			if (copy_from_user(&csigset, sigmask_arg->sigset,
					   sizeof(csigset)))
				goto out;
			sigset_from_compat(&sigset, &csigset);
			r = kvm_vcpu_ioctl_set_sigmask(vcpu, &sigset);
		} else
			r = kvm_vcpu_ioctl_set_sigmask(vcpu, NULL);
		break;
	}
	default:
		r = kvm_vcpu_ioctl(filp, ioctl, arg);
	}

out:
	return r;
}
#endif

/*
 * Copy a struct kvm_device_attr from user address @arg and pass it to
 * @accessor.  Returns -EPERM when @accessor is NULL, -EFAULT when the
 * copy fails, otherwise the accessor's result.
 */
static int kvm_device_ioctl_attr(struct kvm_device *dev,
				 int (*accessor)(struct kvm_device *dev,
						 struct kvm_device_attr *attr),
				 unsigned long arg)
{
	struct kvm_device_attr attr;

	if (!accessor)
		return -EPERM;

	if (copy_from_user(&attr, (void __user *)arg, sizeof(attr)))
		return -EFAULT;

	return accessor(dev, &attr);
}

static long kvm_device_ioctl(struct file *filp, unsigned int ioctl,
			     unsigned long arg)
{
	struct kvm_device *dev = filp->private_data;

	switch (ioctl) {
	case KVM_SET_DEVICE_ATTR:
		return kvm_device_ioctl_attr(dev, dev->ops->set_attr, arg);
	case KVM_GET_DEVICE_ATTR:
		return kvm_device_ioctl_attr(dev, dev->ops->get_attr, arg);
	case
KVM_HAS_DEVICE_ATTR: 2545 return kvm_device_ioctl_attr(dev, dev->ops->has_attr, arg); 2546 default: 2547 if (dev->ops->ioctl) 2548 return dev->ops->ioctl(dev, ioctl, arg); 2549 2550 return -ENOTTY; 2551 } 2552 } 2553 2554 static int kvm_device_release(struct inode *inode, struct file *filp) 2555 { 2556 struct kvm_device *dev = filp->private_data; 2557 struct kvm *kvm = dev->kvm; 2558 2559 kvm_put_kvm(kvm); 2560 return 0; 2561 } 2562 2563 static const struct file_operations kvm_device_fops = { 2564 .unlocked_ioctl = kvm_device_ioctl, 2565 #ifdef CONFIG_KVM_COMPAT 2566 .compat_ioctl = kvm_device_ioctl, 2567 #endif 2568 .release = kvm_device_release, 2569 }; 2570 2571 struct kvm_device *kvm_device_from_filp(struct file *filp) 2572 { 2573 if (filp->f_op != &kvm_device_fops) 2574 return NULL; 2575 2576 return filp->private_data; 2577 } 2578 2579 static struct kvm_device_ops *kvm_device_ops_table[KVM_DEV_TYPE_MAX] = { 2580 #ifdef CONFIG_KVM_MPIC 2581 [KVM_DEV_TYPE_FSL_MPIC_20] = &kvm_mpic_ops, 2582 [KVM_DEV_TYPE_FSL_MPIC_42] = &kvm_mpic_ops, 2583 #endif 2584 2585 #ifdef CONFIG_KVM_XICS 2586 [KVM_DEV_TYPE_XICS] = &kvm_xics_ops, 2587 #endif 2588 }; 2589 2590 int kvm_register_device_ops(struct kvm_device_ops *ops, u32 type) 2591 { 2592 if (type >= ARRAY_SIZE(kvm_device_ops_table)) 2593 return -ENOSPC; 2594 2595 if (kvm_device_ops_table[type] != NULL) 2596 return -EEXIST; 2597 2598 kvm_device_ops_table[type] = ops; 2599 return 0; 2600 } 2601 2602 void kvm_unregister_device_ops(u32 type) 2603 { 2604 if (kvm_device_ops_table[type] != NULL) 2605 kvm_device_ops_table[type] = NULL; 2606 } 2607 2608 static int kvm_ioctl_create_device(struct kvm *kvm, 2609 struct kvm_create_device *cd) 2610 { 2611 struct kvm_device_ops *ops = NULL; 2612 struct kvm_device *dev; 2613 bool test = cd->flags & KVM_CREATE_DEVICE_TEST; 2614 int ret; 2615 2616 if (cd->type >= ARRAY_SIZE(kvm_device_ops_table)) 2617 return -ENODEV; 2618 2619 ops = kvm_device_ops_table[cd->type]; 2620 if (ops == NULL) 2621 
return -ENODEV; 2622 2623 if (test) 2624 return 0; 2625 2626 dev = kzalloc(sizeof(*dev), GFP_KERNEL); 2627 if (!dev) 2628 return -ENOMEM; 2629 2630 dev->ops = ops; 2631 dev->kvm = kvm; 2632 2633 ret = ops->create(dev, cd->type); 2634 if (ret < 0) { 2635 kfree(dev); 2636 return ret; 2637 } 2638 2639 ret = anon_inode_getfd(ops->name, &kvm_device_fops, dev, O_RDWR | O_CLOEXEC); 2640 if (ret < 0) { 2641 ops->destroy(dev); 2642 return ret; 2643 } 2644 2645 list_add(&dev->vm_node, &kvm->devices); 2646 kvm_get_kvm(kvm); 2647 cd->fd = ret; 2648 return 0; 2649 } 2650 2651 static long kvm_vm_ioctl_check_extension_generic(struct kvm *kvm, long arg) 2652 { 2653 switch (arg) { 2654 case KVM_CAP_USER_MEMORY: 2655 case KVM_CAP_DESTROY_MEMORY_REGION_WORKS: 2656 case KVM_CAP_JOIN_MEMORY_REGIONS_WORKS: 2657 case KVM_CAP_INTERNAL_ERROR_DATA: 2658 #ifdef CONFIG_HAVE_KVM_MSI 2659 case KVM_CAP_SIGNAL_MSI: 2660 #endif 2661 #ifdef CONFIG_HAVE_KVM_IRQFD 2662 case KVM_CAP_IRQFD: 2663 case KVM_CAP_IRQFD_RESAMPLE: 2664 #endif 2665 case KVM_CAP_CHECK_EXTENSION_VM: 2666 return 1; 2667 #ifdef CONFIG_HAVE_KVM_IRQ_ROUTING 2668 case KVM_CAP_IRQ_ROUTING: 2669 return KVM_MAX_IRQ_ROUTES; 2670 #endif 2671 #if KVM_ADDRESS_SPACE_NUM > 1 2672 case KVM_CAP_MULTI_ADDRESS_SPACE: 2673 return KVM_ADDRESS_SPACE_NUM; 2674 #endif 2675 default: 2676 break; 2677 } 2678 return kvm_vm_ioctl_check_extension(kvm, arg); 2679 } 2680 2681 static long kvm_vm_ioctl(struct file *filp, 2682 unsigned int ioctl, unsigned long arg) 2683 { 2684 struct kvm *kvm = filp->private_data; 2685 void __user *argp = (void __user *)arg; 2686 int r; 2687 2688 if (kvm->mm != current->mm) 2689 return -EIO; 2690 switch (ioctl) { 2691 case KVM_CREATE_VCPU: 2692 r = kvm_vm_ioctl_create_vcpu(kvm, arg); 2693 break; 2694 case KVM_SET_USER_MEMORY_REGION: { 2695 struct kvm_userspace_memory_region kvm_userspace_mem; 2696 2697 r = -EFAULT; 2698 if (copy_from_user(&kvm_userspace_mem, argp, 2699 sizeof(kvm_userspace_mem))) 2700 goto out; 2701 2702 r = 
kvm_vm_ioctl_set_memory_region(kvm, &kvm_userspace_mem); 2703 break; 2704 } 2705 case KVM_GET_DIRTY_LOG: { 2706 struct kvm_dirty_log log; 2707 2708 r = -EFAULT; 2709 if (copy_from_user(&log, argp, sizeof(log))) 2710 goto out; 2711 r = kvm_vm_ioctl_get_dirty_log(kvm, &log); 2712 break; 2713 } 2714 #ifdef KVM_COALESCED_MMIO_PAGE_OFFSET 2715 case KVM_REGISTER_COALESCED_MMIO: { 2716 struct kvm_coalesced_mmio_zone zone; 2717 2718 r = -EFAULT; 2719 if (copy_from_user(&zone, argp, sizeof(zone))) 2720 goto out; 2721 r = kvm_vm_ioctl_register_coalesced_mmio(kvm, &zone); 2722 break; 2723 } 2724 case KVM_UNREGISTER_COALESCED_MMIO: { 2725 struct kvm_coalesced_mmio_zone zone; 2726 2727 r = -EFAULT; 2728 if (copy_from_user(&zone, argp, sizeof(zone))) 2729 goto out; 2730 r = kvm_vm_ioctl_unregister_coalesced_mmio(kvm, &zone); 2731 break; 2732 } 2733 #endif 2734 case KVM_IRQFD: { 2735 struct kvm_irqfd data; 2736 2737 r = -EFAULT; 2738 if (copy_from_user(&data, argp, sizeof(data))) 2739 goto out; 2740 r = kvm_irqfd(kvm, &data); 2741 break; 2742 } 2743 case KVM_IOEVENTFD: { 2744 struct kvm_ioeventfd data; 2745 2746 r = -EFAULT; 2747 if (copy_from_user(&data, argp, sizeof(data))) 2748 goto out; 2749 r = kvm_ioeventfd(kvm, &data); 2750 break; 2751 } 2752 #ifdef CONFIG_HAVE_KVM_MSI 2753 case KVM_SIGNAL_MSI: { 2754 struct kvm_msi msi; 2755 2756 r = -EFAULT; 2757 if (copy_from_user(&msi, argp, sizeof(msi))) 2758 goto out; 2759 r = kvm_send_userspace_msi(kvm, &msi); 2760 break; 2761 } 2762 #endif 2763 #ifdef __KVM_HAVE_IRQ_LINE 2764 case KVM_IRQ_LINE_STATUS: 2765 case KVM_IRQ_LINE: { 2766 struct kvm_irq_level irq_event; 2767 2768 r = -EFAULT; 2769 if (copy_from_user(&irq_event, argp, sizeof(irq_event))) 2770 goto out; 2771 2772 r = kvm_vm_ioctl_irq_line(kvm, &irq_event, 2773 ioctl == KVM_IRQ_LINE_STATUS); 2774 if (r) 2775 goto out; 2776 2777 r = -EFAULT; 2778 if (ioctl == KVM_IRQ_LINE_STATUS) { 2779 if (copy_to_user(argp, &irq_event, sizeof(irq_event))) 2780 goto out; 2781 } 2782 2783 r = 
0; 2784 break; 2785 } 2786 #endif 2787 #ifdef CONFIG_HAVE_KVM_IRQ_ROUTING 2788 case KVM_SET_GSI_ROUTING: { 2789 struct kvm_irq_routing routing; 2790 struct kvm_irq_routing __user *urouting; 2791 struct kvm_irq_routing_entry *entries; 2792 2793 r = -EFAULT; 2794 if (copy_from_user(&routing, argp, sizeof(routing))) 2795 goto out; 2796 r = -EINVAL; 2797 if (routing.nr >= KVM_MAX_IRQ_ROUTES) 2798 goto out; 2799 if (routing.flags) 2800 goto out; 2801 r = -ENOMEM; 2802 entries = vmalloc(routing.nr * sizeof(*entries)); 2803 if (!entries) 2804 goto out; 2805 r = -EFAULT; 2806 urouting = argp; 2807 if (copy_from_user(entries, urouting->entries, 2808 routing.nr * sizeof(*entries))) 2809 goto out_free_irq_routing; 2810 r = kvm_set_irq_routing(kvm, entries, routing.nr, 2811 routing.flags); 2812 out_free_irq_routing: 2813 vfree(entries); 2814 break; 2815 } 2816 #endif /* CONFIG_HAVE_KVM_IRQ_ROUTING */ 2817 case KVM_CREATE_DEVICE: { 2818 struct kvm_create_device cd; 2819 2820 r = -EFAULT; 2821 if (copy_from_user(&cd, argp, sizeof(cd))) 2822 goto out; 2823 2824 r = kvm_ioctl_create_device(kvm, &cd); 2825 if (r) 2826 goto out; 2827 2828 r = -EFAULT; 2829 if (copy_to_user(argp, &cd, sizeof(cd))) 2830 goto out; 2831 2832 r = 0; 2833 break; 2834 } 2835 case KVM_CHECK_EXTENSION: 2836 r = kvm_vm_ioctl_check_extension_generic(kvm, arg); 2837 break; 2838 default: 2839 r = kvm_arch_vm_ioctl(filp, ioctl, arg); 2840 } 2841 out: 2842 return r; 2843 } 2844 2845 #ifdef CONFIG_KVM_COMPAT 2846 struct compat_kvm_dirty_log { 2847 __u32 slot; 2848 __u32 padding1; 2849 union { 2850 compat_uptr_t dirty_bitmap; /* one bit per page */ 2851 __u64 padding2; 2852 }; 2853 }; 2854 2855 static long kvm_vm_compat_ioctl(struct file *filp, 2856 unsigned int ioctl, unsigned long arg) 2857 { 2858 struct kvm *kvm = filp->private_data; 2859 int r; 2860 2861 if (kvm->mm != current->mm) 2862 return -EIO; 2863 switch (ioctl) { 2864 case KVM_GET_DIRTY_LOG: { 2865 struct compat_kvm_dirty_log compat_log; 2866 struct 
kvm_dirty_log log; 2867 2868 r = -EFAULT; 2869 if (copy_from_user(&compat_log, (void __user *)arg, 2870 sizeof(compat_log))) 2871 goto out; 2872 log.slot = compat_log.slot; 2873 log.padding1 = compat_log.padding1; 2874 log.padding2 = compat_log.padding2; 2875 log.dirty_bitmap = compat_ptr(compat_log.dirty_bitmap); 2876 2877 r = kvm_vm_ioctl_get_dirty_log(kvm, &log); 2878 break; 2879 } 2880 default: 2881 r = kvm_vm_ioctl(filp, ioctl, arg); 2882 } 2883 2884 out: 2885 return r; 2886 } 2887 #endif 2888 2889 static struct file_operations kvm_vm_fops = { 2890 .release = kvm_vm_release, 2891 .unlocked_ioctl = kvm_vm_ioctl, 2892 #ifdef CONFIG_KVM_COMPAT 2893 .compat_ioctl = kvm_vm_compat_ioctl, 2894 #endif 2895 .llseek = noop_llseek, 2896 }; 2897 2898 static int kvm_dev_ioctl_create_vm(unsigned long type) 2899 { 2900 int r; 2901 struct kvm *kvm; 2902 2903 kvm = kvm_create_vm(type); 2904 if (IS_ERR(kvm)) 2905 return PTR_ERR(kvm); 2906 #ifdef KVM_COALESCED_MMIO_PAGE_OFFSET 2907 r = kvm_coalesced_mmio_init(kvm); 2908 if (r < 0) { 2909 kvm_put_kvm(kvm); 2910 return r; 2911 } 2912 #endif 2913 r = anon_inode_getfd("kvm-vm", &kvm_vm_fops, kvm, O_RDWR | O_CLOEXEC); 2914 if (r < 0) 2915 kvm_put_kvm(kvm); 2916 2917 return r; 2918 } 2919 2920 static long kvm_dev_ioctl(struct file *filp, 2921 unsigned int ioctl, unsigned long arg) 2922 { 2923 long r = -EINVAL; 2924 2925 switch (ioctl) { 2926 case KVM_GET_API_VERSION: 2927 if (arg) 2928 goto out; 2929 r = KVM_API_VERSION; 2930 break; 2931 case KVM_CREATE_VM: 2932 r = kvm_dev_ioctl_create_vm(arg); 2933 break; 2934 case KVM_CHECK_EXTENSION: 2935 r = kvm_vm_ioctl_check_extension_generic(NULL, arg); 2936 break; 2937 case KVM_GET_VCPU_MMAP_SIZE: 2938 if (arg) 2939 goto out; 2940 r = PAGE_SIZE; /* struct kvm_run */ 2941 #ifdef CONFIG_X86 2942 r += PAGE_SIZE; /* pio data page */ 2943 #endif 2944 #ifdef KVM_COALESCED_MMIO_PAGE_OFFSET 2945 r += PAGE_SIZE; /* coalesced mmio ring page */ 2946 #endif 2947 break; 2948 case KVM_TRACE_ENABLE: 2949 
case KVM_TRACE_PAUSE: 2950 case KVM_TRACE_DISABLE: 2951 r = -EOPNOTSUPP; 2952 break; 2953 default: 2954 return kvm_arch_dev_ioctl(filp, ioctl, arg); 2955 } 2956 out: 2957 return r; 2958 } 2959 2960 static struct file_operations kvm_chardev_ops = { 2961 .unlocked_ioctl = kvm_dev_ioctl, 2962 .compat_ioctl = kvm_dev_ioctl, 2963 .llseek = noop_llseek, 2964 }; 2965 2966 static struct miscdevice kvm_dev = { 2967 KVM_MINOR, 2968 "kvm", 2969 &kvm_chardev_ops, 2970 }; 2971 2972 static void hardware_enable_nolock(void *junk) 2973 { 2974 int cpu = raw_smp_processor_id(); 2975 int r; 2976 2977 if (cpumask_test_cpu(cpu, cpus_hardware_enabled)) 2978 return; 2979 2980 cpumask_set_cpu(cpu, cpus_hardware_enabled); 2981 2982 r = kvm_arch_hardware_enable(); 2983 2984 if (r) { 2985 cpumask_clear_cpu(cpu, cpus_hardware_enabled); 2986 atomic_inc(&hardware_enable_failed); 2987 pr_info("kvm: enabling virtualization on CPU%d failed\n", cpu); 2988 } 2989 } 2990 2991 static void hardware_enable(void) 2992 { 2993 raw_spin_lock(&kvm_count_lock); 2994 if (kvm_usage_count) 2995 hardware_enable_nolock(NULL); 2996 raw_spin_unlock(&kvm_count_lock); 2997 } 2998 2999 static void hardware_disable_nolock(void *junk) 3000 { 3001 int cpu = raw_smp_processor_id(); 3002 3003 if (!cpumask_test_cpu(cpu, cpus_hardware_enabled)) 3004 return; 3005 cpumask_clear_cpu(cpu, cpus_hardware_enabled); 3006 kvm_arch_hardware_disable(); 3007 } 3008 3009 static void hardware_disable(void) 3010 { 3011 raw_spin_lock(&kvm_count_lock); 3012 if (kvm_usage_count) 3013 hardware_disable_nolock(NULL); 3014 raw_spin_unlock(&kvm_count_lock); 3015 } 3016 3017 static void hardware_disable_all_nolock(void) 3018 { 3019 BUG_ON(!kvm_usage_count); 3020 3021 kvm_usage_count--; 3022 if (!kvm_usage_count) 3023 on_each_cpu(hardware_disable_nolock, NULL, 1); 3024 } 3025 3026 static void hardware_disable_all(void) 3027 { 3028 raw_spin_lock(&kvm_count_lock); 3029 hardware_disable_all_nolock(); 3030 raw_spin_unlock(&kvm_count_lock); 3031 } 3032 
3033 static int hardware_enable_all(void) 3034 { 3035 int r = 0; 3036 3037 raw_spin_lock(&kvm_count_lock); 3038 3039 kvm_usage_count++; 3040 if (kvm_usage_count == 1) { 3041 atomic_set(&hardware_enable_failed, 0); 3042 on_each_cpu(hardware_enable_nolock, NULL, 1); 3043 3044 if (atomic_read(&hardware_enable_failed)) { 3045 hardware_disable_all_nolock(); 3046 r = -EBUSY; 3047 } 3048 } 3049 3050 raw_spin_unlock(&kvm_count_lock); 3051 3052 return r; 3053 } 3054 3055 static int kvm_cpu_hotplug(struct notifier_block *notifier, unsigned long val, 3056 void *v) 3057 { 3058 val &= ~CPU_TASKS_FROZEN; 3059 switch (val) { 3060 case CPU_DYING: 3061 hardware_disable(); 3062 break; 3063 case CPU_STARTING: 3064 hardware_enable(); 3065 break; 3066 } 3067 return NOTIFY_OK; 3068 } 3069 3070 static int kvm_reboot(struct notifier_block *notifier, unsigned long val, 3071 void *v) 3072 { 3073 /* 3074 * Some (well, at least mine) BIOSes hang on reboot if 3075 * in vmx root mode. 3076 * 3077 * And Intel TXT required VMX off for all cpu when system shutdown. 
3078 */ 3079 pr_info("kvm: exiting hardware virtualization\n"); 3080 kvm_rebooting = true; 3081 on_each_cpu(hardware_disable_nolock, NULL, 1); 3082 return NOTIFY_OK; 3083 } 3084 3085 static struct notifier_block kvm_reboot_notifier = { 3086 .notifier_call = kvm_reboot, 3087 .priority = 0, 3088 }; 3089 3090 static void kvm_io_bus_destroy(struct kvm_io_bus *bus) 3091 { 3092 int i; 3093 3094 for (i = 0; i < bus->dev_count; i++) { 3095 struct kvm_io_device *pos = bus->range[i].dev; 3096 3097 kvm_iodevice_destructor(pos); 3098 } 3099 kfree(bus); 3100 } 3101 3102 static inline int kvm_io_bus_cmp(const struct kvm_io_range *r1, 3103 const struct kvm_io_range *r2) 3104 { 3105 if (r1->addr < r2->addr) 3106 return -1; 3107 if (r1->addr + r1->len > r2->addr + r2->len) 3108 return 1; 3109 return 0; 3110 } 3111 3112 static int kvm_io_bus_sort_cmp(const void *p1, const void *p2) 3113 { 3114 return kvm_io_bus_cmp(p1, p2); 3115 } 3116 3117 static int kvm_io_bus_insert_dev(struct kvm_io_bus *bus, struct kvm_io_device *dev, 3118 gpa_t addr, int len) 3119 { 3120 bus->range[bus->dev_count++] = (struct kvm_io_range) { 3121 .addr = addr, 3122 .len = len, 3123 .dev = dev, 3124 }; 3125 3126 sort(bus->range, bus->dev_count, sizeof(struct kvm_io_range), 3127 kvm_io_bus_sort_cmp, NULL); 3128 3129 return 0; 3130 } 3131 3132 static int kvm_io_bus_get_first_dev(struct kvm_io_bus *bus, 3133 gpa_t addr, int len) 3134 { 3135 struct kvm_io_range *range, key; 3136 int off; 3137 3138 key = (struct kvm_io_range) { 3139 .addr = addr, 3140 .len = len, 3141 }; 3142 3143 range = bsearch(&key, bus->range, bus->dev_count, 3144 sizeof(struct kvm_io_range), kvm_io_bus_sort_cmp); 3145 if (range == NULL) 3146 return -ENOENT; 3147 3148 off = range - bus->range; 3149 3150 while (off > 0 && kvm_io_bus_cmp(&key, &bus->range[off-1]) == 0) 3151 off--; 3152 3153 return off; 3154 } 3155 3156 static int __kvm_io_bus_write(struct kvm_vcpu *vcpu, struct kvm_io_bus *bus, 3157 struct kvm_io_range *range, const void *val) 
3158 { 3159 int idx; 3160 3161 idx = kvm_io_bus_get_first_dev(bus, range->addr, range->len); 3162 if (idx < 0) 3163 return -EOPNOTSUPP; 3164 3165 while (idx < bus->dev_count && 3166 kvm_io_bus_cmp(range, &bus->range[idx]) == 0) { 3167 if (!kvm_iodevice_write(vcpu, bus->range[idx].dev, range->addr, 3168 range->len, val)) 3169 return idx; 3170 idx++; 3171 } 3172 3173 return -EOPNOTSUPP; 3174 } 3175 3176 /* kvm_io_bus_write - called under kvm->slots_lock */ 3177 int kvm_io_bus_write(struct kvm_vcpu *vcpu, enum kvm_bus bus_idx, gpa_t addr, 3178 int len, const void *val) 3179 { 3180 struct kvm_io_bus *bus; 3181 struct kvm_io_range range; 3182 int r; 3183 3184 range = (struct kvm_io_range) { 3185 .addr = addr, 3186 .len = len, 3187 }; 3188 3189 bus = srcu_dereference(vcpu->kvm->buses[bus_idx], &vcpu->kvm->srcu); 3190 r = __kvm_io_bus_write(vcpu, bus, &range, val); 3191 return r < 0 ? r : 0; 3192 } 3193 3194 /* kvm_io_bus_write_cookie - called under kvm->slots_lock */ 3195 int kvm_io_bus_write_cookie(struct kvm_vcpu *vcpu, enum kvm_bus bus_idx, 3196 gpa_t addr, int len, const void *val, long cookie) 3197 { 3198 struct kvm_io_bus *bus; 3199 struct kvm_io_range range; 3200 3201 range = (struct kvm_io_range) { 3202 .addr = addr, 3203 .len = len, 3204 }; 3205 3206 bus = srcu_dereference(vcpu->kvm->buses[bus_idx], &vcpu->kvm->srcu); 3207 3208 /* First try the device referenced by cookie. */ 3209 if ((cookie >= 0) && (cookie < bus->dev_count) && 3210 (kvm_io_bus_cmp(&range, &bus->range[cookie]) == 0)) 3211 if (!kvm_iodevice_write(vcpu, bus->range[cookie].dev, addr, len, 3212 val)) 3213 return cookie; 3214 3215 /* 3216 * cookie contained garbage; fall back to search and return the 3217 * correct cookie value. 
3218 */ 3219 return __kvm_io_bus_write(vcpu, bus, &range, val); 3220 } 3221 3222 static int __kvm_io_bus_read(struct kvm_vcpu *vcpu, struct kvm_io_bus *bus, 3223 struct kvm_io_range *range, void *val) 3224 { 3225 int idx; 3226 3227 idx = kvm_io_bus_get_first_dev(bus, range->addr, range->len); 3228 if (idx < 0) 3229 return -EOPNOTSUPP; 3230 3231 while (idx < bus->dev_count && 3232 kvm_io_bus_cmp(range, &bus->range[idx]) == 0) { 3233 if (!kvm_iodevice_read(vcpu, bus->range[idx].dev, range->addr, 3234 range->len, val)) 3235 return idx; 3236 idx++; 3237 } 3238 3239 return -EOPNOTSUPP; 3240 } 3241 EXPORT_SYMBOL_GPL(kvm_io_bus_write); 3242 3243 /* kvm_io_bus_read - called under kvm->slots_lock */ 3244 int kvm_io_bus_read(struct kvm_vcpu *vcpu, enum kvm_bus bus_idx, gpa_t addr, 3245 int len, void *val) 3246 { 3247 struct kvm_io_bus *bus; 3248 struct kvm_io_range range; 3249 int r; 3250 3251 range = (struct kvm_io_range) { 3252 .addr = addr, 3253 .len = len, 3254 }; 3255 3256 bus = srcu_dereference(vcpu->kvm->buses[bus_idx], &vcpu->kvm->srcu); 3257 r = __kvm_io_bus_read(vcpu, bus, &range, val); 3258 return r < 0 ? r : 0; 3259 } 3260 3261 3262 /* Caller must hold slots_lock. 
*/ 3263 int kvm_io_bus_register_dev(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr, 3264 int len, struct kvm_io_device *dev) 3265 { 3266 struct kvm_io_bus *new_bus, *bus; 3267 3268 bus = kvm->buses[bus_idx]; 3269 /* exclude ioeventfd which is limited by maximum fd */ 3270 if (bus->dev_count - bus->ioeventfd_count > NR_IOBUS_DEVS - 1) 3271 return -ENOSPC; 3272 3273 new_bus = kzalloc(sizeof(*bus) + ((bus->dev_count + 1) * 3274 sizeof(struct kvm_io_range)), GFP_KERNEL); 3275 if (!new_bus) 3276 return -ENOMEM; 3277 memcpy(new_bus, bus, sizeof(*bus) + (bus->dev_count * 3278 sizeof(struct kvm_io_range))); 3279 kvm_io_bus_insert_dev(new_bus, dev, addr, len); 3280 rcu_assign_pointer(kvm->buses[bus_idx], new_bus); 3281 synchronize_srcu_expedited(&kvm->srcu); 3282 kfree(bus); 3283 3284 return 0; 3285 } 3286 3287 /* Caller must hold slots_lock. */ 3288 int kvm_io_bus_unregister_dev(struct kvm *kvm, enum kvm_bus bus_idx, 3289 struct kvm_io_device *dev) 3290 { 3291 int i, r; 3292 struct kvm_io_bus *new_bus, *bus; 3293 3294 bus = kvm->buses[bus_idx]; 3295 r = -ENOENT; 3296 for (i = 0; i < bus->dev_count; i++) 3297 if (bus->range[i].dev == dev) { 3298 r = 0; 3299 break; 3300 } 3301 3302 if (r) 3303 return r; 3304 3305 new_bus = kzalloc(sizeof(*bus) + ((bus->dev_count - 1) * 3306 sizeof(struct kvm_io_range)), GFP_KERNEL); 3307 if (!new_bus) 3308 return -ENOMEM; 3309 3310 memcpy(new_bus, bus, sizeof(*bus) + i * sizeof(struct kvm_io_range)); 3311 new_bus->dev_count--; 3312 memcpy(new_bus->range + i, bus->range + i + 1, 3313 (new_bus->dev_count - i) * sizeof(struct kvm_io_range)); 3314 3315 rcu_assign_pointer(kvm->buses[bus_idx], new_bus); 3316 synchronize_srcu_expedited(&kvm->srcu); 3317 kfree(bus); 3318 return r; 3319 } 3320 3321 static struct notifier_block kvm_cpu_notifier = { 3322 .notifier_call = kvm_cpu_hotplug, 3323 }; 3324 3325 static int vm_stat_get(void *_offset, u64 *val) 3326 { 3327 unsigned offset = (long)_offset; 3328 struct kvm *kvm; 3329 3330 *val = 0; 3331 
spin_lock(&kvm_lock); 3332 list_for_each_entry(kvm, &vm_list, vm_list) 3333 *val += *(u32 *)((void *)kvm + offset); 3334 spin_unlock(&kvm_lock); 3335 return 0; 3336 } 3337 3338 DEFINE_SIMPLE_ATTRIBUTE(vm_stat_fops, vm_stat_get, NULL, "%llu\n"); 3339 3340 static int vcpu_stat_get(void *_offset, u64 *val) 3341 { 3342 unsigned offset = (long)_offset; 3343 struct kvm *kvm; 3344 struct kvm_vcpu *vcpu; 3345 int i; 3346 3347 *val = 0; 3348 spin_lock(&kvm_lock); 3349 list_for_each_entry(kvm, &vm_list, vm_list) 3350 kvm_for_each_vcpu(i, vcpu, kvm) 3351 *val += *(u32 *)((void *)vcpu + offset); 3352 3353 spin_unlock(&kvm_lock); 3354 return 0; 3355 } 3356 3357 DEFINE_SIMPLE_ATTRIBUTE(vcpu_stat_fops, vcpu_stat_get, NULL, "%llu\n"); 3358 3359 static const struct file_operations *stat_fops[] = { 3360 [KVM_STAT_VCPU] = &vcpu_stat_fops, 3361 [KVM_STAT_VM] = &vm_stat_fops, 3362 }; 3363 3364 static int kvm_init_debug(void) 3365 { 3366 int r = -EEXIST; 3367 struct kvm_stats_debugfs_item *p; 3368 3369 kvm_debugfs_dir = debugfs_create_dir("kvm", NULL); 3370 if (kvm_debugfs_dir == NULL) 3371 goto out; 3372 3373 for (p = debugfs_entries; p->name; ++p) { 3374 p->dentry = debugfs_create_file(p->name, 0444, kvm_debugfs_dir, 3375 (void *)(long)p->offset, 3376 stat_fops[p->kind]); 3377 if (p->dentry == NULL) 3378 goto out_dir; 3379 } 3380 3381 return 0; 3382 3383 out_dir: 3384 debugfs_remove_recursive(kvm_debugfs_dir); 3385 out: 3386 return r; 3387 } 3388 3389 static void kvm_exit_debug(void) 3390 { 3391 struct kvm_stats_debugfs_item *p; 3392 3393 for (p = debugfs_entries; p->name; ++p) 3394 debugfs_remove(p->dentry); 3395 debugfs_remove(kvm_debugfs_dir); 3396 } 3397 3398 static int kvm_suspend(void) 3399 { 3400 if (kvm_usage_count) 3401 hardware_disable_nolock(NULL); 3402 return 0; 3403 } 3404 3405 static void kvm_resume(void) 3406 { 3407 if (kvm_usage_count) { 3408 WARN_ON(raw_spin_is_locked(&kvm_count_lock)); 3409 hardware_enable_nolock(NULL); 3410 } 3411 } 3412 3413 static struct 
syscore_ops kvm_syscore_ops = { 3414 .suspend = kvm_suspend, 3415 .resume = kvm_resume, 3416 }; 3417 3418 static inline 3419 struct kvm_vcpu *preempt_notifier_to_vcpu(struct preempt_notifier *pn) 3420 { 3421 return container_of(pn, struct kvm_vcpu, preempt_notifier); 3422 } 3423 3424 static void kvm_sched_in(struct preempt_notifier *pn, int cpu) 3425 { 3426 struct kvm_vcpu *vcpu = preempt_notifier_to_vcpu(pn); 3427 3428 if (vcpu->preempted) 3429 vcpu->preempted = false; 3430 3431 kvm_arch_sched_in(vcpu, cpu); 3432 3433 kvm_arch_vcpu_load(vcpu, cpu); 3434 } 3435 3436 static void kvm_sched_out(struct preempt_notifier *pn, 3437 struct task_struct *next) 3438 { 3439 struct kvm_vcpu *vcpu = preempt_notifier_to_vcpu(pn); 3440 3441 if (current->state == TASK_RUNNING) 3442 vcpu->preempted = true; 3443 kvm_arch_vcpu_put(vcpu); 3444 } 3445 3446 int kvm_init(void *opaque, unsigned vcpu_size, unsigned vcpu_align, 3447 struct module *module) 3448 { 3449 int r; 3450 int cpu; 3451 3452 r = kvm_arch_init(opaque); 3453 if (r) 3454 goto out_fail; 3455 3456 /* 3457 * kvm_arch_init makes sure there's at most one caller 3458 * for architectures that support multiple implementations, 3459 * like intel and amd on x86. 3460 * kvm_arch_init must be called before kvm_irqfd_init to avoid creating 3461 * conflicts in case kvm is already setup for another implementation. 
3462 */ 3463 r = kvm_irqfd_init(); 3464 if (r) 3465 goto out_irqfd; 3466 3467 if (!zalloc_cpumask_var(&cpus_hardware_enabled, GFP_KERNEL)) { 3468 r = -ENOMEM; 3469 goto out_free_0; 3470 } 3471 3472 r = kvm_arch_hardware_setup(); 3473 if (r < 0) 3474 goto out_free_0a; 3475 3476 for_each_online_cpu(cpu) { 3477 smp_call_function_single(cpu, 3478 kvm_arch_check_processor_compat, 3479 &r, 1); 3480 if (r < 0) 3481 goto out_free_1; 3482 } 3483 3484 r = register_cpu_notifier(&kvm_cpu_notifier); 3485 if (r) 3486 goto out_free_2; 3487 register_reboot_notifier(&kvm_reboot_notifier); 3488 3489 /* A kmem cache lets us meet the alignment requirements of fx_save. */ 3490 if (!vcpu_align) 3491 vcpu_align = __alignof__(struct kvm_vcpu); 3492 kvm_vcpu_cache = kmem_cache_create("kvm_vcpu", vcpu_size, vcpu_align, 3493 0, NULL); 3494 if (!kvm_vcpu_cache) { 3495 r = -ENOMEM; 3496 goto out_free_3; 3497 } 3498 3499 r = kvm_async_pf_init(); 3500 if (r) 3501 goto out_free; 3502 3503 kvm_chardev_ops.owner = module; 3504 kvm_vm_fops.owner = module; 3505 kvm_vcpu_fops.owner = module; 3506 3507 r = misc_register(&kvm_dev); 3508 if (r) { 3509 pr_err("kvm: misc device register failed\n"); 3510 goto out_unreg; 3511 } 3512 3513 register_syscore_ops(&kvm_syscore_ops); 3514 3515 kvm_preempt_ops.sched_in = kvm_sched_in; 3516 kvm_preempt_ops.sched_out = kvm_sched_out; 3517 3518 r = kvm_init_debug(); 3519 if (r) { 3520 pr_err("kvm: create debugfs files failed\n"); 3521 goto out_undebugfs; 3522 } 3523 3524 r = kvm_vfio_ops_init(); 3525 WARN_ON(r); 3526 3527 return 0; 3528 3529 out_undebugfs: 3530 unregister_syscore_ops(&kvm_syscore_ops); 3531 misc_deregister(&kvm_dev); 3532 out_unreg: 3533 kvm_async_pf_deinit(); 3534 out_free: 3535 kmem_cache_destroy(kvm_vcpu_cache); 3536 out_free_3: 3537 unregister_reboot_notifier(&kvm_reboot_notifier); 3538 unregister_cpu_notifier(&kvm_cpu_notifier); 3539 out_free_2: 3540 out_free_1: 3541 kvm_arch_hardware_unsetup(); 3542 out_free_0a: 3543 
free_cpumask_var(cpus_hardware_enabled); 3544 out_free_0: 3545 kvm_irqfd_exit(); 3546 out_irqfd: 3547 kvm_arch_exit(); 3548 out_fail: 3549 return r; 3550 } 3551 EXPORT_SYMBOL_GPL(kvm_init); 3552 3553 void kvm_exit(void) 3554 { 3555 kvm_exit_debug(); 3556 misc_deregister(&kvm_dev); 3557 kmem_cache_destroy(kvm_vcpu_cache); 3558 kvm_async_pf_deinit(); 3559 unregister_syscore_ops(&kvm_syscore_ops); 3560 unregister_reboot_notifier(&kvm_reboot_notifier); 3561 unregister_cpu_notifier(&kvm_cpu_notifier); 3562 on_each_cpu(hardware_disable_nolock, NULL, 1); 3563 kvm_arch_hardware_unsetup(); 3564 kvm_arch_exit(); 3565 kvm_irqfd_exit(); 3566 free_cpumask_var(cpus_hardware_enabled); 3567 kvm_vfio_ops_exit(); 3568 } 3569 EXPORT_SYMBOL_GPL(kvm_exit); 3570