1 /* 2 * Kernel-based Virtual Machine driver for Linux 3 * 4 * This module enables machines with Intel VT-x extensions to run virtual 5 * machines without emulation or binary translation. 6 * 7 * Copyright (C) 2006 Qumranet, Inc. 8 * 9 * Authors: 10 * Avi Kivity <avi@qumranet.com> 11 * Yaniv Kamay <yaniv@qumranet.com> 12 * 13 * This work is licensed under the terms of the GNU GPL, version 2. See 14 * the COPYING file in the top-level directory. 15 * 16 */ 17 18 #include "iodev.h" 19 20 #include <linux/kvm_host.h> 21 #include <linux/kvm.h> 22 #include <linux/module.h> 23 #include <linux/errno.h> 24 #include <linux/percpu.h> 25 #include <linux/gfp.h> 26 #include <linux/mm.h> 27 #include <linux/miscdevice.h> 28 #include <linux/vmalloc.h> 29 #include <linux/reboot.h> 30 #include <linux/debugfs.h> 31 #include <linux/highmem.h> 32 #include <linux/file.h> 33 #include <linux/sysdev.h> 34 #include <linux/cpu.h> 35 #include <linux/sched.h> 36 #include <linux/cpumask.h> 37 #include <linux/smp.h> 38 #include <linux/anon_inodes.h> 39 #include <linux/profile.h> 40 #include <linux/kvm_para.h> 41 #include <linux/pagemap.h> 42 #include <linux/mman.h> 43 #include <linux/swap.h> 44 45 #include <asm/processor.h> 46 #include <asm/io.h> 47 #include <asm/uaccess.h> 48 #include <asm/pgtable.h> 49 50 #ifdef KVM_COALESCED_MMIO_PAGE_OFFSET 51 #include "coalesced_mmio.h" 52 #endif 53 54 #ifdef KVM_CAP_DEVICE_ASSIGNMENT 55 #include <linux/pci.h> 56 #include <linux/interrupt.h> 57 #include "irq.h" 58 #endif 59 60 MODULE_AUTHOR("Qumranet"); 61 MODULE_LICENSE("GPL"); 62 63 static int msi2intx = 1; 64 module_param(msi2intx, bool, 0); 65 66 DEFINE_SPINLOCK(kvm_lock); 67 LIST_HEAD(vm_list); 68 69 static cpumask_var_t cpus_hardware_enabled; 70 71 struct kmem_cache *kvm_vcpu_cache; 72 EXPORT_SYMBOL_GPL(kvm_vcpu_cache); 73 74 static __read_mostly struct preempt_ops kvm_preempt_ops; 75 76 struct dentry *kvm_debugfs_dir; 77 78 static long kvm_vcpu_ioctl(struct file *file, unsigned int ioctl, 79 unsigned long arg); 80 81 static bool kvm_rebooting; 82 83 #ifdef KVM_CAP_DEVICE_ASSIGNMENT 84 static struct kvm_assigned_dev_kernel *kvm_find_assigned_dev(struct list_head *head, 85 int assigned_dev_id) 86 { 87 struct list_head *ptr; 88 struct kvm_assigned_dev_kernel *match; 89 90 list_for_each(ptr, head) { 91 match = list_entry(ptr, struct kvm_assigned_dev_kernel, list); 92 if (match->assigned_dev_id == assigned_dev_id) 93 return match; 94 } 95 return NULL; 96 } 97 98 static void kvm_assigned_dev_interrupt_work_handler(struct work_struct *work) 99 { 100 struct kvm_assigned_dev_kernel *assigned_dev; 101 102 assigned_dev = container_of(work, struct kvm_assigned_dev_kernel, 103 interrupt_work); 104 105 /* This is taken to safely inject irq inside the guest. 
	 * When the interrupt injection (or the ioapic code) uses a
	 * finer-grained lock, update this.
	 */
	mutex_lock(&assigned_dev->kvm->lock);
	kvm_set_irq(assigned_dev->kvm, assigned_dev->irq_source_id,
		    assigned_dev->guest_irq, 1);

	if (assigned_dev->irq_requested_type & KVM_ASSIGNED_DEV_GUEST_MSI) {
		enable_irq(assigned_dev->host_irq);
		assigned_dev->host_irq_disabled = false;
	}
	mutex_unlock(&assigned_dev->kvm->lock);
}

static irqreturn_t kvm_assigned_dev_intr(int irq, void *dev_id)
{
	struct kvm_assigned_dev_kernel *assigned_dev =
		(struct kvm_assigned_dev_kernel *) dev_id;

	schedule_work(&assigned_dev->interrupt_work);

	disable_irq_nosync(irq);
	assigned_dev->host_irq_disabled = true;

	return IRQ_HANDLED;
}

/* Ack the irq line for an assigned device */
static void kvm_assigned_dev_ack_irq(struct kvm_irq_ack_notifier *kian)
{
	struct kvm_assigned_dev_kernel *dev;

	if (kian->gsi == -1)
		return;

	dev = container_of(kian, struct kvm_assigned_dev_kernel,
			   ack_notifier);

	kvm_set_irq(dev->kvm, dev->irq_source_id, dev->guest_irq, 0);

	/* The guest irq may be shared, so this ack may be
	 * from another device.
	 */
	if (dev->host_irq_disabled) {
		enable_irq(dev->host_irq);
		dev->host_irq_disabled = false;
	}
}

/* The function implicitly holds the kvm->lock mutex, due to cancel_work_sync() */
static void kvm_free_assigned_irq(struct kvm *kvm,
				  struct kvm_assigned_dev_kernel *assigned_dev)
{
	if (!irqchip_in_kernel(kvm))
		return;

	kvm_unregister_irq_ack_notifier(&assigned_dev->ack_notifier);

	if (assigned_dev->irq_source_id != -1)
		kvm_free_irq_source_id(kvm, assigned_dev->irq_source_id);
	assigned_dev->irq_source_id = -1;

	if (!assigned_dev->irq_requested_type)
		return;

	/*
	 * cancel_work_sync() returns true if either:
	 * 1. the work was scheduled and has now been cancelled, or
	 * 2. the work callback has been executed.
	 *
	 * The first case guarantees the irq is disabled and no further
	 * events can arrive.  In the second case the irq may have been
	 * re-enabled (e.g. for MSI), so disable it here to prevent
	 * further events.
	 *
	 * Note that this may result in a nested disable if the interrupt
	 * type is INTx, but that is fine since we are about to free it.
	 *
	 * If this function runs as part of VM destruction, the kvm state
	 * must still be valid at this point, because we may have to wait
	 * for interrupt_work to finish.
186 */ 187 disable_irq_nosync(assigned_dev->host_irq); 188 cancel_work_sync(&assigned_dev->interrupt_work); 189 190 free_irq(assigned_dev->host_irq, (void *)assigned_dev); 191 192 if (assigned_dev->irq_requested_type & KVM_ASSIGNED_DEV_HOST_MSI) 193 pci_disable_msi(assigned_dev->dev); 194 195 assigned_dev->irq_requested_type = 0; 196 } 197 198 199 static void kvm_free_assigned_device(struct kvm *kvm, 200 struct kvm_assigned_dev_kernel 201 *assigned_dev) 202 { 203 kvm_free_assigned_irq(kvm, assigned_dev); 204 205 pci_reset_function(assigned_dev->dev); 206 207 pci_release_regions(assigned_dev->dev); 208 pci_disable_device(assigned_dev->dev); 209 pci_dev_put(assigned_dev->dev); 210 211 list_del(&assigned_dev->list); 212 kfree(assigned_dev); 213 } 214 215 void kvm_free_all_assigned_devices(struct kvm *kvm) 216 { 217 struct list_head *ptr, *ptr2; 218 struct kvm_assigned_dev_kernel *assigned_dev; 219 220 list_for_each_safe(ptr, ptr2, &kvm->arch.assigned_dev_head) { 221 assigned_dev = list_entry(ptr, 222 struct kvm_assigned_dev_kernel, 223 list); 224 225 kvm_free_assigned_device(kvm, assigned_dev); 226 } 227 } 228 229 static int assigned_device_update_intx(struct kvm *kvm, 230 struct kvm_assigned_dev_kernel *adev, 231 struct kvm_assigned_irq *airq) 232 { 233 adev->guest_irq = airq->guest_irq; 234 adev->ack_notifier.gsi = airq->guest_irq; 235 236 if (adev->irq_requested_type & KVM_ASSIGNED_DEV_HOST_INTX) 237 return 0; 238 239 if (irqchip_in_kernel(kvm)) { 240 if (!msi2intx && 241 (adev->irq_requested_type & KVM_ASSIGNED_DEV_HOST_MSI)) { 242 free_irq(adev->host_irq, (void *)adev); 243 pci_disable_msi(adev->dev); 244 } 245 246 if (!capable(CAP_SYS_RAWIO)) 247 return -EPERM; 248 249 if (airq->host_irq) 250 adev->host_irq = airq->host_irq; 251 else 252 adev->host_irq = adev->dev->irq; 253 254 /* Even though this is PCI, we don't want to use shared 255 * interrupts. Sharing host devices with guest-assigned devices 256 * on the same interrupt line is not a happy situation: there 257 * are going to be long delays in accepting, acking, etc. 258 */ 259 if (request_irq(adev->host_irq, kvm_assigned_dev_intr, 260 0, "kvm_assigned_intx_device", (void *)adev)) 261 return -EIO; 262 } 263 264 adev->irq_requested_type = KVM_ASSIGNED_DEV_GUEST_INTX | 265 KVM_ASSIGNED_DEV_HOST_INTX; 266 return 0; 267 } 268 269 #ifdef CONFIG_X86 270 static int assigned_device_update_msi(struct kvm *kvm, 271 struct kvm_assigned_dev_kernel *adev, 272 struct kvm_assigned_irq *airq) 273 { 274 int r; 275 276 adev->guest_irq = airq->guest_irq; 277 if (airq->flags & KVM_DEV_IRQ_ASSIGN_ENABLE_MSI) { 278 /* x86 don't care upper address of guest msi message addr */ 279 adev->irq_requested_type |= KVM_ASSIGNED_DEV_GUEST_MSI; 280 adev->irq_requested_type &= ~KVM_ASSIGNED_DEV_GUEST_INTX; 281 adev->ack_notifier.gsi = -1; 282 } else if (msi2intx) { 283 adev->irq_requested_type |= KVM_ASSIGNED_DEV_GUEST_INTX; 284 adev->irq_requested_type &= ~KVM_ASSIGNED_DEV_GUEST_MSI; 285 adev->ack_notifier.gsi = airq->guest_irq; 286 } else { 287 /* 288 * Guest require to disable device MSI, we disable MSI and 289 * re-enable INTx by default again. Notice it's only for 290 * non-msi2intx. 
291 */ 292 assigned_device_update_intx(kvm, adev, airq); 293 return 0; 294 } 295 296 if (adev->irq_requested_type & KVM_ASSIGNED_DEV_HOST_MSI) 297 return 0; 298 299 if (irqchip_in_kernel(kvm)) { 300 if (!msi2intx) { 301 if (adev->irq_requested_type & 302 KVM_ASSIGNED_DEV_HOST_INTX) 303 free_irq(adev->host_irq, (void *)adev); 304 305 r = pci_enable_msi(adev->dev); 306 if (r) 307 return r; 308 } 309 310 adev->host_irq = adev->dev->irq; 311 if (request_irq(adev->host_irq, kvm_assigned_dev_intr, 0, 312 "kvm_assigned_msi_device", (void *)adev)) 313 return -EIO; 314 } 315 316 if (!msi2intx) 317 adev->irq_requested_type = KVM_ASSIGNED_DEV_GUEST_MSI; 318 319 adev->irq_requested_type |= KVM_ASSIGNED_DEV_HOST_MSI; 320 return 0; 321 } 322 #endif 323 324 static int kvm_vm_ioctl_assign_irq(struct kvm *kvm, 325 struct kvm_assigned_irq 326 *assigned_irq) 327 { 328 int r = 0; 329 struct kvm_assigned_dev_kernel *match; 330 u32 current_flags = 0, changed_flags; 331 332 mutex_lock(&kvm->lock); 333 334 match = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head, 335 assigned_irq->assigned_dev_id); 336 if (!match) { 337 mutex_unlock(&kvm->lock); 338 return -EINVAL; 339 } 340 341 if (!match->irq_requested_type) { 342 INIT_WORK(&match->interrupt_work, 343 kvm_assigned_dev_interrupt_work_handler); 344 if (irqchip_in_kernel(kvm)) { 345 /* Register ack nofitier */ 346 match->ack_notifier.gsi = -1; 347 match->ack_notifier.irq_acked = 348 kvm_assigned_dev_ack_irq; 349 kvm_register_irq_ack_notifier(kvm, 350 &match->ack_notifier); 351 352 /* Request IRQ source ID */ 353 r = kvm_request_irq_source_id(kvm); 354 if (r < 0) 355 goto out_release; 356 else 357 match->irq_source_id = r; 358 359 #ifdef CONFIG_X86 360 /* Determine host device irq type, we can know the 361 * result from dev->msi_enabled */ 362 if (msi2intx) 363 pci_enable_msi(match->dev); 364 #endif 365 } 366 } 367 368 if ((match->irq_requested_type & KVM_ASSIGNED_DEV_HOST_MSI) && 369 (match->irq_requested_type & KVM_ASSIGNED_DEV_GUEST_MSI)) 370 current_flags |= KVM_DEV_IRQ_ASSIGN_ENABLE_MSI; 371 372 changed_flags = assigned_irq->flags ^ current_flags; 373 374 if ((changed_flags & KVM_DEV_IRQ_ASSIGN_MSI_ACTION) || 375 (msi2intx && match->dev->msi_enabled)) { 376 #ifdef CONFIG_X86 377 r = assigned_device_update_msi(kvm, match, assigned_irq); 378 if (r) { 379 printk(KERN_WARNING "kvm: failed to enable " 380 "MSI device!\n"); 381 goto out_release; 382 } 383 #else 384 r = -ENOTTY; 385 #endif 386 } else if (assigned_irq->host_irq == 0 && match->dev->irq == 0) { 387 /* Host device IRQ 0 means don't support INTx */ 388 if (!msi2intx) { 389 printk(KERN_WARNING 390 "kvm: wait device to enable MSI!\n"); 391 r = 0; 392 } else { 393 printk(KERN_WARNING 394 "kvm: failed to enable MSI device!\n"); 395 r = -ENOTTY; 396 goto out_release; 397 } 398 } else { 399 /* Non-sharing INTx mode */ 400 r = assigned_device_update_intx(kvm, match, assigned_irq); 401 if (r) { 402 printk(KERN_WARNING "kvm: failed to enable " 403 "INTx device!\n"); 404 goto out_release; 405 } 406 } 407 408 mutex_unlock(&kvm->lock); 409 return r; 410 out_release: 411 mutex_unlock(&kvm->lock); 412 kvm_free_assigned_device(kvm, match); 413 return r; 414 } 415 416 static int kvm_vm_ioctl_assign_device(struct kvm *kvm, 417 struct kvm_assigned_pci_dev *assigned_dev) 418 { 419 int r = 0; 420 struct kvm_assigned_dev_kernel *match; 421 struct pci_dev *dev; 422 423 down_read(&kvm->slots_lock); 424 mutex_lock(&kvm->lock); 425 426 match = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head, 427 
assigned_dev->assigned_dev_id); 428 if (match) { 429 /* device already assigned */ 430 r = -EINVAL; 431 goto out; 432 } 433 434 match = kzalloc(sizeof(struct kvm_assigned_dev_kernel), GFP_KERNEL); 435 if (match == NULL) { 436 printk(KERN_INFO "%s: Couldn't allocate memory\n", 437 __func__); 438 r = -ENOMEM; 439 goto out; 440 } 441 dev = pci_get_bus_and_slot(assigned_dev->busnr, 442 assigned_dev->devfn); 443 if (!dev) { 444 printk(KERN_INFO "%s: host device not found\n", __func__); 445 r = -EINVAL; 446 goto out_free; 447 } 448 if (pci_enable_device(dev)) { 449 printk(KERN_INFO "%s: Could not enable PCI device\n", __func__); 450 r = -EBUSY; 451 goto out_put; 452 } 453 r = pci_request_regions(dev, "kvm_assigned_device"); 454 if (r) { 455 printk(KERN_INFO "%s: Could not get access to device regions\n", 456 __func__); 457 goto out_disable; 458 } 459 460 pci_reset_function(dev); 461 462 match->assigned_dev_id = assigned_dev->assigned_dev_id; 463 match->host_busnr = assigned_dev->busnr; 464 match->host_devfn = assigned_dev->devfn; 465 match->flags = assigned_dev->flags; 466 match->dev = dev; 467 match->irq_source_id = -1; 468 match->kvm = kvm; 469 470 list_add(&match->list, &kvm->arch.assigned_dev_head); 471 472 if (assigned_dev->flags & KVM_DEV_ASSIGN_ENABLE_IOMMU) { 473 if (!kvm->arch.iommu_domain) { 474 r = kvm_iommu_map_guest(kvm); 475 if (r) 476 goto out_list_del; 477 } 478 r = kvm_assign_device(kvm, match); 479 if (r) 480 goto out_list_del; 481 } 482 483 out: 484 mutex_unlock(&kvm->lock); 485 up_read(&kvm->slots_lock); 486 return r; 487 out_list_del: 488 list_del(&match->list); 489 pci_release_regions(dev); 490 out_disable: 491 pci_disable_device(dev); 492 out_put: 493 pci_dev_put(dev); 494 out_free: 495 kfree(match); 496 mutex_unlock(&kvm->lock); 497 up_read(&kvm->slots_lock); 498 return r; 499 } 500 #endif 501 502 #ifdef KVM_CAP_DEVICE_DEASSIGNMENT 503 static int kvm_vm_ioctl_deassign_device(struct kvm *kvm, 504 struct kvm_assigned_pci_dev *assigned_dev) 505 { 506 int r = 0; 507 struct kvm_assigned_dev_kernel *match; 508 509 mutex_lock(&kvm->lock); 510 511 match = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head, 512 assigned_dev->assigned_dev_id); 513 if (!match) { 514 printk(KERN_INFO "%s: device hasn't been assigned before, " 515 "so cannot be deassigned\n", __func__); 516 r = -EINVAL; 517 goto out; 518 } 519 520 if (match->flags & KVM_DEV_ASSIGN_ENABLE_IOMMU) 521 kvm_deassign_device(kvm, match); 522 523 kvm_free_assigned_device(kvm, match); 524 525 out: 526 mutex_unlock(&kvm->lock); 527 return r; 528 } 529 #endif 530 531 static inline int valid_vcpu(int n) 532 { 533 return likely(n >= 0 && n < KVM_MAX_VCPUS); 534 } 535 536 inline int kvm_is_mmio_pfn(pfn_t pfn) 537 { 538 if (pfn_valid(pfn)) { 539 struct page *page = compound_head(pfn_to_page(pfn)); 540 return PageReserved(page); 541 } 542 543 return true; 544 } 545 546 /* 547 * Switches to specified vcpu, until a matching vcpu_put() 548 */ 549 void vcpu_load(struct kvm_vcpu *vcpu) 550 { 551 int cpu; 552 553 mutex_lock(&vcpu->mutex); 554 cpu = get_cpu(); 555 preempt_notifier_register(&vcpu->preempt_notifier); 556 kvm_arch_vcpu_load(vcpu, cpu); 557 put_cpu(); 558 } 559 560 void vcpu_put(struct kvm_vcpu *vcpu) 561 { 562 preempt_disable(); 563 kvm_arch_vcpu_put(vcpu); 564 preempt_notifier_unregister(&vcpu->preempt_notifier); 565 preempt_enable(); 566 mutex_unlock(&vcpu->mutex); 567 } 568 569 static void ack_flush(void *_completed) 570 { 571 } 572 573 static bool make_all_cpus_request(struct kvm *kvm, unsigned int req) 574 { 575 int i, 
cpu, me; 576 cpumask_var_t cpus; 577 bool called = true; 578 struct kvm_vcpu *vcpu; 579 580 if (alloc_cpumask_var(&cpus, GFP_ATOMIC)) 581 cpumask_clear(cpus); 582 583 me = get_cpu(); 584 for (i = 0; i < KVM_MAX_VCPUS; ++i) { 585 vcpu = kvm->vcpus[i]; 586 if (!vcpu) 587 continue; 588 if (test_and_set_bit(req, &vcpu->requests)) 589 continue; 590 cpu = vcpu->cpu; 591 if (cpus != NULL && cpu != -1 && cpu != me) 592 cpumask_set_cpu(cpu, cpus); 593 } 594 if (unlikely(cpus == NULL)) 595 smp_call_function_many(cpu_online_mask, ack_flush, NULL, 1); 596 else if (!cpumask_empty(cpus)) 597 smp_call_function_many(cpus, ack_flush, NULL, 1); 598 else 599 called = false; 600 put_cpu(); 601 free_cpumask_var(cpus); 602 return called; 603 } 604 605 void kvm_flush_remote_tlbs(struct kvm *kvm) 606 { 607 if (make_all_cpus_request(kvm, KVM_REQ_TLB_FLUSH)) 608 ++kvm->stat.remote_tlb_flush; 609 } 610 611 void kvm_reload_remote_mmus(struct kvm *kvm) 612 { 613 make_all_cpus_request(kvm, KVM_REQ_MMU_RELOAD); 614 } 615 616 int kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id) 617 { 618 struct page *page; 619 int r; 620 621 mutex_init(&vcpu->mutex); 622 vcpu->cpu = -1; 623 vcpu->kvm = kvm; 624 vcpu->vcpu_id = id; 625 init_waitqueue_head(&vcpu->wq); 626 627 page = alloc_page(GFP_KERNEL | __GFP_ZERO); 628 if (!page) { 629 r = -ENOMEM; 630 goto fail; 631 } 632 vcpu->run = page_address(page); 633 634 r = kvm_arch_vcpu_init(vcpu); 635 if (r < 0) 636 goto fail_free_run; 637 return 0; 638 639 fail_free_run: 640 free_page((unsigned long)vcpu->run); 641 fail: 642 return r; 643 } 644 EXPORT_SYMBOL_GPL(kvm_vcpu_init); 645 646 void kvm_vcpu_uninit(struct kvm_vcpu *vcpu) 647 { 648 kvm_arch_vcpu_uninit(vcpu); 649 free_page((unsigned long)vcpu->run); 650 } 651 EXPORT_SYMBOL_GPL(kvm_vcpu_uninit); 652 653 #if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER) 654 static inline struct kvm *mmu_notifier_to_kvm(struct mmu_notifier *mn) 655 { 656 return container_of(mn, struct kvm, mmu_notifier); 657 } 658 659 static void kvm_mmu_notifier_invalidate_page(struct mmu_notifier *mn, 660 struct mm_struct *mm, 661 unsigned long address) 662 { 663 struct kvm *kvm = mmu_notifier_to_kvm(mn); 664 int need_tlb_flush; 665 666 /* 667 * When ->invalidate_page runs, the linux pte has been zapped 668 * already but the page is still allocated until 669 * ->invalidate_page returns. So if we increase the sequence 670 * here the kvm page fault will notice if the spte can't be 671 * established because the page is going to be freed. If 672 * instead the kvm page fault establishes the spte before 673 * ->invalidate_page runs, kvm_unmap_hva will release it 674 * before returning. 675 * 676 * The sequence increase only need to be seen at spin_unlock 677 * time, and not at spin_lock time. 678 * 679 * Increasing the sequence after the spin_unlock would be 680 * unsafe because the kvm page fault could then establish the 681 * pte after kvm_unmap_hva returned, without noticing the page 682 * is going to be freed. 
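	 * (The arch page fault path reads mmu_notifier_seq, together with
	 * mmu_notifier_count, under mmu_lock to detect exactly this race.)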
	 */
	spin_lock(&kvm->mmu_lock);
	kvm->mmu_notifier_seq++;
	need_tlb_flush = kvm_unmap_hva(kvm, address);
	spin_unlock(&kvm->mmu_lock);

	/* we have to flush the tlb before the pages can be freed */
	if (need_tlb_flush)
		kvm_flush_remote_tlbs(kvm);

}

static void kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn,
						    struct mm_struct *mm,
						    unsigned long start,
						    unsigned long end)
{
	struct kvm *kvm = mmu_notifier_to_kvm(mn);
	int need_tlb_flush = 0;

	spin_lock(&kvm->mmu_lock);
	/*
	 * The count increase must become visible at unlock time as no
	 * spte can be established without taking the mmu_lock and
	 * count is also read inside the mmu_lock critical section.
	 */
	kvm->mmu_notifier_count++;
	for (; start < end; start += PAGE_SIZE)
		need_tlb_flush |= kvm_unmap_hva(kvm, start);
	spin_unlock(&kvm->mmu_lock);

	/* we have to flush the tlb before the pages can be freed */
	if (need_tlb_flush)
		kvm_flush_remote_tlbs(kvm);
}

static void kvm_mmu_notifier_invalidate_range_end(struct mmu_notifier *mn,
						  struct mm_struct *mm,
						  unsigned long start,
						  unsigned long end)
{
	struct kvm *kvm = mmu_notifier_to_kvm(mn);

	spin_lock(&kvm->mmu_lock);
	/*
	 * This sequence increase will notify the kvm page fault that
	 * the page that is going to be mapped in the spte could have
	 * been freed.
	 */
	kvm->mmu_notifier_seq++;
	/*
	 * The above sequence increase must be visible before the
	 * below count decrease, but both values are read by the kvm
	 * page fault under the mmu_lock spinlock, so we don't need to
	 * add an smp_wmb() here in between the two.
	 */
	kvm->mmu_notifier_count--;
	spin_unlock(&kvm->mmu_lock);

	BUG_ON(kvm->mmu_notifier_count < 0);
}

static int kvm_mmu_notifier_clear_flush_young(struct mmu_notifier *mn,
					      struct mm_struct *mm,
					      unsigned long address)
{
	struct kvm *kvm = mmu_notifier_to_kvm(mn);
	int young;

	spin_lock(&kvm->mmu_lock);
	young = kvm_age_hva(kvm, address);
	spin_unlock(&kvm->mmu_lock);

	if (young)
		kvm_flush_remote_tlbs(kvm);

	return young;
}

static void kvm_mmu_notifier_release(struct mmu_notifier *mn,
				     struct mm_struct *mm)
{
	struct kvm *kvm = mmu_notifier_to_kvm(mn);
	kvm_arch_flush_shadow(kvm);
}

static const struct mmu_notifier_ops kvm_mmu_notifier_ops = {
	.invalidate_page	= kvm_mmu_notifier_invalidate_page,
	.invalidate_range_start	= kvm_mmu_notifier_invalidate_range_start,
	.invalidate_range_end	= kvm_mmu_notifier_invalidate_range_end,
	.clear_flush_young	= kvm_mmu_notifier_clear_flush_young,
	.release		= kvm_mmu_notifier_release,
};
#endif /* CONFIG_MMU_NOTIFIER && KVM_ARCH_WANT_MMU_NOTIFIER */

static struct kvm *kvm_create_vm(void)
{
	struct kvm *kvm = kvm_arch_create_vm();
#ifdef KVM_COALESCED_MMIO_PAGE_OFFSET
	struct page *page;
#endif

	if (IS_ERR(kvm))
		goto out;
#ifdef CONFIG_HAVE_KVM_IRQCHIP
	INIT_LIST_HEAD(&kvm->irq_routing);
	INIT_HLIST_HEAD(&kvm->mask_notifier_list);
#endif

#ifdef KVM_COALESCED_MMIO_PAGE_OFFSET
	page = alloc_page(GFP_KERNEL | __GFP_ZERO);
	if (!page) {
		kfree(kvm);
		return ERR_PTR(-ENOMEM);
	}
	kvm->coalesced_mmio_ring =
			(struct kvm_coalesced_mmio_ring *)page_address(page);
#endif

#if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER)
	{
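		/*
		 * Hook into the host MM: changes to the userspace mappings
		 * backing guest memory (munmap, swap-out, ...) reach KVM
		 * through the notifier ops defined above, so the shadow
		 * page tables can be kept in sync.
		 */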
804 int err; 805 kvm->mmu_notifier.ops = &kvm_mmu_notifier_ops; 806 err = mmu_notifier_register(&kvm->mmu_notifier, current->mm); 807 if (err) { 808 #ifdef KVM_COALESCED_MMIO_PAGE_OFFSET 809 put_page(page); 810 #endif 811 kfree(kvm); 812 return ERR_PTR(err); 813 } 814 } 815 #endif 816 817 kvm->mm = current->mm; 818 atomic_inc(&kvm->mm->mm_count); 819 spin_lock_init(&kvm->mmu_lock); 820 kvm_io_bus_init(&kvm->pio_bus); 821 mutex_init(&kvm->lock); 822 kvm_io_bus_init(&kvm->mmio_bus); 823 init_rwsem(&kvm->slots_lock); 824 atomic_set(&kvm->users_count, 1); 825 spin_lock(&kvm_lock); 826 list_add(&kvm->vm_list, &vm_list); 827 spin_unlock(&kvm_lock); 828 #ifdef KVM_COALESCED_MMIO_PAGE_OFFSET 829 kvm_coalesced_mmio_init(kvm); 830 #endif 831 out: 832 return kvm; 833 } 834 835 /* 836 * Free any memory in @free but not in @dont. 837 */ 838 static void kvm_free_physmem_slot(struct kvm_memory_slot *free, 839 struct kvm_memory_slot *dont) 840 { 841 if (!dont || free->rmap != dont->rmap) 842 vfree(free->rmap); 843 844 if (!dont || free->dirty_bitmap != dont->dirty_bitmap) 845 vfree(free->dirty_bitmap); 846 847 if (!dont || free->lpage_info != dont->lpage_info) 848 vfree(free->lpage_info); 849 850 free->npages = 0; 851 free->dirty_bitmap = NULL; 852 free->rmap = NULL; 853 free->lpage_info = NULL; 854 } 855 856 void kvm_free_physmem(struct kvm *kvm) 857 { 858 int i; 859 860 for (i = 0; i < kvm->nmemslots; ++i) 861 kvm_free_physmem_slot(&kvm->memslots[i], NULL); 862 } 863 864 static void kvm_destroy_vm(struct kvm *kvm) 865 { 866 struct mm_struct *mm = kvm->mm; 867 868 kvm_arch_sync_events(kvm); 869 spin_lock(&kvm_lock); 870 list_del(&kvm->vm_list); 871 spin_unlock(&kvm_lock); 872 kvm_free_irq_routing(kvm); 873 kvm_io_bus_destroy(&kvm->pio_bus); 874 kvm_io_bus_destroy(&kvm->mmio_bus); 875 #ifdef KVM_COALESCED_MMIO_PAGE_OFFSET 876 if (kvm->coalesced_mmio_ring != NULL) 877 free_page((unsigned long)kvm->coalesced_mmio_ring); 878 #endif 879 #if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER) 880 mmu_notifier_unregister(&kvm->mmu_notifier, kvm->mm); 881 #endif 882 kvm_arch_destroy_vm(kvm); 883 mmdrop(mm); 884 } 885 886 void kvm_get_kvm(struct kvm *kvm) 887 { 888 atomic_inc(&kvm->users_count); 889 } 890 EXPORT_SYMBOL_GPL(kvm_get_kvm); 891 892 void kvm_put_kvm(struct kvm *kvm) 893 { 894 if (atomic_dec_and_test(&kvm->users_count)) 895 kvm_destroy_vm(kvm); 896 } 897 EXPORT_SYMBOL_GPL(kvm_put_kvm); 898 899 900 static int kvm_vm_release(struct inode *inode, struct file *filp) 901 { 902 struct kvm *kvm = filp->private_data; 903 904 kvm_put_kvm(kvm); 905 return 0; 906 } 907 908 /* 909 * Allocate some memory and give it an address in the guest physical address 910 * space. 911 * 912 * Discontiguous memory is allowed, mostly for framebuffers. 913 * 914 * Must be called holding mmap_sem for write. 
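 * (kvm_set_memory_region() below takes kvm->slots_lock for write around
 * this call.)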
 */
int __kvm_set_memory_region(struct kvm *kvm,
			    struct kvm_userspace_memory_region *mem,
			    int user_alloc)
{
	int r;
	gfn_t base_gfn;
	unsigned long npages;
	unsigned long i;
	struct kvm_memory_slot *memslot;
	struct kvm_memory_slot old, new;

	r = -EINVAL;
	/* General sanity checks */
	if (mem->memory_size & (PAGE_SIZE - 1))
		goto out;
	if (mem->guest_phys_addr & (PAGE_SIZE - 1))
		goto out;
	if (user_alloc && (mem->userspace_addr & (PAGE_SIZE - 1)))
		goto out;
	if (mem->slot >= KVM_MEMORY_SLOTS + KVM_PRIVATE_MEM_SLOTS)
		goto out;
	if (mem->guest_phys_addr + mem->memory_size < mem->guest_phys_addr)
		goto out;

	memslot = &kvm->memslots[mem->slot];
	base_gfn = mem->guest_phys_addr >> PAGE_SHIFT;
	npages = mem->memory_size >> PAGE_SHIFT;

	if (!npages)
		mem->flags &= ~KVM_MEM_LOG_DIRTY_PAGES;

	new = old = *memslot;

	new.base_gfn = base_gfn;
	new.npages = npages;
	new.flags = mem->flags;

	/* Disallow changing a memory slot's size. */
	r = -EINVAL;
	if (npages && old.npages && npages != old.npages)
		goto out_free;

	/* Check for overlaps */
	r = -EEXIST;
	for (i = 0; i < KVM_MEMORY_SLOTS; ++i) {
		struct kvm_memory_slot *s = &kvm->memslots[i];

		if (s == memslot)
			continue;
		if (!((base_gfn + npages <= s->base_gfn) ||
		      (base_gfn >= s->base_gfn + s->npages)))
			goto out_free;
	}

	/* Free page dirty bitmap if unneeded */
	if (!(new.flags & KVM_MEM_LOG_DIRTY_PAGES))
		new.dirty_bitmap = NULL;

	r = -ENOMEM;

	/* Allocate if a slot is being created */
#ifndef CONFIG_S390
	if (npages && !new.rmap) {
		new.rmap = vmalloc(npages * sizeof(struct page *));

		if (!new.rmap)
			goto out_free;

		memset(new.rmap, 0, npages * sizeof(*new.rmap));

		new.user_alloc = user_alloc;
		/*
		 * hva_to_rmmap() serializes with the mmu_lock and to be
		 * safe it has to ignore memslots with !user_alloc &&
		 * !userspace_addr.
991 */ 992 if (user_alloc) 993 new.userspace_addr = mem->userspace_addr; 994 else 995 new.userspace_addr = 0; 996 } 997 if (npages && !new.lpage_info) { 998 int largepages = npages / KVM_PAGES_PER_HPAGE; 999 if (npages % KVM_PAGES_PER_HPAGE) 1000 largepages++; 1001 if (base_gfn % KVM_PAGES_PER_HPAGE) 1002 largepages++; 1003 1004 new.lpage_info = vmalloc(largepages * sizeof(*new.lpage_info)); 1005 1006 if (!new.lpage_info) 1007 goto out_free; 1008 1009 memset(new.lpage_info, 0, largepages * sizeof(*new.lpage_info)); 1010 1011 if (base_gfn % KVM_PAGES_PER_HPAGE) 1012 new.lpage_info[0].write_count = 1; 1013 if ((base_gfn+npages) % KVM_PAGES_PER_HPAGE) 1014 new.lpage_info[largepages-1].write_count = 1; 1015 } 1016 1017 /* Allocate page dirty bitmap if needed */ 1018 if ((new.flags & KVM_MEM_LOG_DIRTY_PAGES) && !new.dirty_bitmap) { 1019 unsigned dirty_bytes = ALIGN(npages, BITS_PER_LONG) / 8; 1020 1021 new.dirty_bitmap = vmalloc(dirty_bytes); 1022 if (!new.dirty_bitmap) 1023 goto out_free; 1024 memset(new.dirty_bitmap, 0, dirty_bytes); 1025 } 1026 #endif /* not defined CONFIG_S390 */ 1027 1028 if (!npages) 1029 kvm_arch_flush_shadow(kvm); 1030 1031 spin_lock(&kvm->mmu_lock); 1032 if (mem->slot >= kvm->nmemslots) 1033 kvm->nmemslots = mem->slot + 1; 1034 1035 *memslot = new; 1036 spin_unlock(&kvm->mmu_lock); 1037 1038 r = kvm_arch_set_memory_region(kvm, mem, old, user_alloc); 1039 if (r) { 1040 spin_lock(&kvm->mmu_lock); 1041 *memslot = old; 1042 spin_unlock(&kvm->mmu_lock); 1043 goto out_free; 1044 } 1045 1046 kvm_free_physmem_slot(&old, npages ? &new : NULL); 1047 /* Slot deletion case: we have to update the current slot */ 1048 if (!npages) 1049 *memslot = old; 1050 #ifdef CONFIG_DMAR 1051 /* map the pages in iommu page table */ 1052 r = kvm_iommu_map_pages(kvm, base_gfn, npages); 1053 if (r) 1054 goto out; 1055 #endif 1056 return 0; 1057 1058 out_free: 1059 kvm_free_physmem_slot(&new, &old); 1060 out: 1061 return r; 1062 1063 } 1064 EXPORT_SYMBOL_GPL(__kvm_set_memory_region); 1065 1066 int kvm_set_memory_region(struct kvm *kvm, 1067 struct kvm_userspace_memory_region *mem, 1068 int user_alloc) 1069 { 1070 int r; 1071 1072 down_write(&kvm->slots_lock); 1073 r = __kvm_set_memory_region(kvm, mem, user_alloc); 1074 up_write(&kvm->slots_lock); 1075 return r; 1076 } 1077 EXPORT_SYMBOL_GPL(kvm_set_memory_region); 1078 1079 int kvm_vm_ioctl_set_memory_region(struct kvm *kvm, 1080 struct 1081 kvm_userspace_memory_region *mem, 1082 int user_alloc) 1083 { 1084 if (mem->slot >= KVM_MEMORY_SLOTS) 1085 return -EINVAL; 1086 return kvm_set_memory_region(kvm, mem, user_alloc); 1087 } 1088 1089 int kvm_get_dirty_log(struct kvm *kvm, 1090 struct kvm_dirty_log *log, int *is_dirty) 1091 { 1092 struct kvm_memory_slot *memslot; 1093 int r, i; 1094 int n; 1095 unsigned long any = 0; 1096 1097 r = -EINVAL; 1098 if (log->slot >= KVM_MEMORY_SLOTS) 1099 goto out; 1100 1101 memslot = &kvm->memslots[log->slot]; 1102 r = -ENOENT; 1103 if (!memslot->dirty_bitmap) 1104 goto out; 1105 1106 n = ALIGN(memslot->npages, BITS_PER_LONG) / 8; 1107 1108 for (i = 0; !any && i < n/sizeof(long); ++i) 1109 any = memslot->dirty_bitmap[i]; 1110 1111 r = -EFAULT; 1112 if (copy_to_user(log->dirty_bitmap, memslot->dirty_bitmap, n)) 1113 goto out; 1114 1115 if (any) 1116 *is_dirty = 1; 1117 1118 r = 0; 1119 out: 1120 return r; 1121 } 1122 1123 int is_error_page(struct page *page) 1124 { 1125 return page == bad_page; 1126 } 1127 EXPORT_SYMBOL_GPL(is_error_page); 1128 1129 int is_error_pfn(pfn_t pfn) 1130 { 1131 return pfn == bad_pfn; 1132 } 
EXPORT_SYMBOL_GPL(is_error_pfn);

static inline unsigned long bad_hva(void)
{
	return PAGE_OFFSET;
}

int kvm_is_error_hva(unsigned long addr)
{
	return addr == bad_hva();
}
EXPORT_SYMBOL_GPL(kvm_is_error_hva);

struct kvm_memory_slot *gfn_to_memslot_unaliased(struct kvm *kvm, gfn_t gfn)
{
	int i;

	for (i = 0; i < kvm->nmemslots; ++i) {
		struct kvm_memory_slot *memslot = &kvm->memslots[i];

		if (gfn >= memslot->base_gfn
		    && gfn < memslot->base_gfn + memslot->npages)
			return memslot;
	}
	return NULL;
}
EXPORT_SYMBOL_GPL(gfn_to_memslot_unaliased);

struct kvm_memory_slot *gfn_to_memslot(struct kvm *kvm, gfn_t gfn)
{
	gfn = unalias_gfn(kvm, gfn);
	return gfn_to_memslot_unaliased(kvm, gfn);
}

int kvm_is_visible_gfn(struct kvm *kvm, gfn_t gfn)
{
	int i;

	gfn = unalias_gfn(kvm, gfn);
	for (i = 0; i < KVM_MEMORY_SLOTS; ++i) {
		struct kvm_memory_slot *memslot = &kvm->memslots[i];

		if (gfn >= memslot->base_gfn
		    && gfn < memslot->base_gfn + memslot->npages)
			return 1;
	}
	return 0;
}
EXPORT_SYMBOL_GPL(kvm_is_visible_gfn);

unsigned long gfn_to_hva(struct kvm *kvm, gfn_t gfn)
{
	struct kvm_memory_slot *slot;

	gfn = unalias_gfn(kvm, gfn);
	slot = gfn_to_memslot_unaliased(kvm, gfn);
	if (!slot)
		return bad_hva();
	return (slot->userspace_addr + (gfn - slot->base_gfn) * PAGE_SIZE);
}
EXPORT_SYMBOL_GPL(gfn_to_hva);

pfn_t gfn_to_pfn(struct kvm *kvm, gfn_t gfn)
{
	struct page *page[1];
	unsigned long addr;
	int npages;
	pfn_t pfn;

	might_sleep();

	addr = gfn_to_hva(kvm, gfn);
	if (kvm_is_error_hva(addr)) {
		get_page(bad_page);
		return page_to_pfn(bad_page);
	}

	npages = get_user_pages_fast(addr, 1, 1, page);

	if (unlikely(npages != 1)) {
		struct vm_area_struct *vma;

		down_read(&current->mm->mmap_sem);
		vma = find_vma(current->mm, addr);

		if (vma == NULL || addr < vma->vm_start ||
		    !(vma->vm_flags & VM_PFNMAP)) {
			up_read(&current->mm->mmap_sem);
			get_page(bad_page);
			return page_to_pfn(bad_page);
		}

		pfn = ((addr - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
		up_read(&current->mm->mmap_sem);
		BUG_ON(!kvm_is_mmio_pfn(pfn));
	} else
		pfn = page_to_pfn(page[0]);

	return pfn;
}

EXPORT_SYMBOL_GPL(gfn_to_pfn);

struct page *gfn_to_page(struct kvm *kvm, gfn_t gfn)
{
	pfn_t pfn;

	pfn = gfn_to_pfn(kvm, gfn);
	if (!kvm_is_mmio_pfn(pfn))
		return pfn_to_page(pfn);

	WARN_ON(kvm_is_mmio_pfn(pfn));

	get_page(bad_page);
	return bad_page;
}

EXPORT_SYMBOL_GPL(gfn_to_page);

void kvm_release_page_clean(struct page *page)
{
	kvm_release_pfn_clean(page_to_pfn(page));
}
EXPORT_SYMBOL_GPL(kvm_release_page_clean);

void kvm_release_pfn_clean(pfn_t pfn)
{
	if (!kvm_is_mmio_pfn(pfn))
		put_page(pfn_to_page(pfn));
}
EXPORT_SYMBOL_GPL(kvm_release_pfn_clean);

void kvm_release_page_dirty(struct page *page)
{
	kvm_release_pfn_dirty(page_to_pfn(page));
}
EXPORT_SYMBOL_GPL(kvm_release_page_dirty);

void kvm_release_pfn_dirty(pfn_t pfn)
{
	kvm_set_pfn_dirty(pfn);
	kvm_release_pfn_clean(pfn);
}

EXPORT_SYMBOL_GPL(kvm_release_pfn_dirty); 1277 1278 void kvm_set_page_dirty(struct page *page) 1279 { 1280 kvm_set_pfn_dirty(page_to_pfn(page)); 1281 } 1282 EXPORT_SYMBOL_GPL(kvm_set_page_dirty); 1283 1284 void kvm_set_pfn_dirty(pfn_t pfn) 1285 { 1286 if (!kvm_is_mmio_pfn(pfn)) { 1287 struct page *page = pfn_to_page(pfn); 1288 if (!PageReserved(page)) 1289 SetPageDirty(page); 1290 } 1291 } 1292 EXPORT_SYMBOL_GPL(kvm_set_pfn_dirty); 1293 1294 void kvm_set_pfn_accessed(pfn_t pfn) 1295 { 1296 if (!kvm_is_mmio_pfn(pfn)) 1297 mark_page_accessed(pfn_to_page(pfn)); 1298 } 1299 EXPORT_SYMBOL_GPL(kvm_set_pfn_accessed); 1300 1301 void kvm_get_pfn(pfn_t pfn) 1302 { 1303 if (!kvm_is_mmio_pfn(pfn)) 1304 get_page(pfn_to_page(pfn)); 1305 } 1306 EXPORT_SYMBOL_GPL(kvm_get_pfn); 1307 1308 static int next_segment(unsigned long len, int offset) 1309 { 1310 if (len > PAGE_SIZE - offset) 1311 return PAGE_SIZE - offset; 1312 else 1313 return len; 1314 } 1315 1316 int kvm_read_guest_page(struct kvm *kvm, gfn_t gfn, void *data, int offset, 1317 int len) 1318 { 1319 int r; 1320 unsigned long addr; 1321 1322 addr = gfn_to_hva(kvm, gfn); 1323 if (kvm_is_error_hva(addr)) 1324 return -EFAULT; 1325 r = copy_from_user(data, (void __user *)addr + offset, len); 1326 if (r) 1327 return -EFAULT; 1328 return 0; 1329 } 1330 EXPORT_SYMBOL_GPL(kvm_read_guest_page); 1331 1332 int kvm_read_guest(struct kvm *kvm, gpa_t gpa, void *data, unsigned long len) 1333 { 1334 gfn_t gfn = gpa >> PAGE_SHIFT; 1335 int seg; 1336 int offset = offset_in_page(gpa); 1337 int ret; 1338 1339 while ((seg = next_segment(len, offset)) != 0) { 1340 ret = kvm_read_guest_page(kvm, gfn, data, offset, seg); 1341 if (ret < 0) 1342 return ret; 1343 offset = 0; 1344 len -= seg; 1345 data += seg; 1346 ++gfn; 1347 } 1348 return 0; 1349 } 1350 EXPORT_SYMBOL_GPL(kvm_read_guest); 1351 1352 int kvm_read_guest_atomic(struct kvm *kvm, gpa_t gpa, void *data, 1353 unsigned long len) 1354 { 1355 int r; 1356 unsigned long addr; 1357 gfn_t gfn = gpa >> PAGE_SHIFT; 1358 int offset = offset_in_page(gpa); 1359 1360 addr = gfn_to_hva(kvm, gfn); 1361 if (kvm_is_error_hva(addr)) 1362 return -EFAULT; 1363 pagefault_disable(); 1364 r = __copy_from_user_inatomic(data, (void __user *)addr + offset, len); 1365 pagefault_enable(); 1366 if (r) 1367 return -EFAULT; 1368 return 0; 1369 } 1370 EXPORT_SYMBOL(kvm_read_guest_atomic); 1371 1372 int kvm_write_guest_page(struct kvm *kvm, gfn_t gfn, const void *data, 1373 int offset, int len) 1374 { 1375 int r; 1376 unsigned long addr; 1377 1378 addr = gfn_to_hva(kvm, gfn); 1379 if (kvm_is_error_hva(addr)) 1380 return -EFAULT; 1381 r = copy_to_user((void __user *)addr + offset, data, len); 1382 if (r) 1383 return -EFAULT; 1384 mark_page_dirty(kvm, gfn); 1385 return 0; 1386 } 1387 EXPORT_SYMBOL_GPL(kvm_write_guest_page); 1388 1389 int kvm_write_guest(struct kvm *kvm, gpa_t gpa, const void *data, 1390 unsigned long len) 1391 { 1392 gfn_t gfn = gpa >> PAGE_SHIFT; 1393 int seg; 1394 int offset = offset_in_page(gpa); 1395 int ret; 1396 1397 while ((seg = next_segment(len, offset)) != 0) { 1398 ret = kvm_write_guest_page(kvm, gfn, data, offset, seg); 1399 if (ret < 0) 1400 return ret; 1401 offset = 0; 1402 len -= seg; 1403 data += seg; 1404 ++gfn; 1405 } 1406 return 0; 1407 } 1408 1409 int kvm_clear_guest_page(struct kvm *kvm, gfn_t gfn, int offset, int len) 1410 { 1411 return kvm_write_guest_page(kvm, gfn, empty_zero_page, offset, len); 1412 } 1413 EXPORT_SYMBOL_GPL(kvm_clear_guest_page); 1414 1415 int kvm_clear_guest(struct kvm *kvm, gpa_t gpa, 
unsigned long len) 1416 { 1417 gfn_t gfn = gpa >> PAGE_SHIFT; 1418 int seg; 1419 int offset = offset_in_page(gpa); 1420 int ret; 1421 1422 while ((seg = next_segment(len, offset)) != 0) { 1423 ret = kvm_clear_guest_page(kvm, gfn, offset, seg); 1424 if (ret < 0) 1425 return ret; 1426 offset = 0; 1427 len -= seg; 1428 ++gfn; 1429 } 1430 return 0; 1431 } 1432 EXPORT_SYMBOL_GPL(kvm_clear_guest); 1433 1434 void mark_page_dirty(struct kvm *kvm, gfn_t gfn) 1435 { 1436 struct kvm_memory_slot *memslot; 1437 1438 gfn = unalias_gfn(kvm, gfn); 1439 memslot = gfn_to_memslot_unaliased(kvm, gfn); 1440 if (memslot && memslot->dirty_bitmap) { 1441 unsigned long rel_gfn = gfn - memslot->base_gfn; 1442 1443 /* avoid RMW */ 1444 if (!test_bit(rel_gfn, memslot->dirty_bitmap)) 1445 set_bit(rel_gfn, memslot->dirty_bitmap); 1446 } 1447 } 1448 1449 /* 1450 * The vCPU has executed a HLT instruction with in-kernel mode enabled. 1451 */ 1452 void kvm_vcpu_block(struct kvm_vcpu *vcpu) 1453 { 1454 DEFINE_WAIT(wait); 1455 1456 for (;;) { 1457 prepare_to_wait(&vcpu->wq, &wait, TASK_INTERRUPTIBLE); 1458 1459 if (kvm_cpu_has_interrupt(vcpu) || 1460 kvm_cpu_has_pending_timer(vcpu) || 1461 kvm_arch_vcpu_runnable(vcpu)) { 1462 set_bit(KVM_REQ_UNHALT, &vcpu->requests); 1463 break; 1464 } 1465 if (signal_pending(current)) 1466 break; 1467 1468 vcpu_put(vcpu); 1469 schedule(); 1470 vcpu_load(vcpu); 1471 } 1472 1473 finish_wait(&vcpu->wq, &wait); 1474 } 1475 1476 void kvm_resched(struct kvm_vcpu *vcpu) 1477 { 1478 if (!need_resched()) 1479 return; 1480 cond_resched(); 1481 } 1482 EXPORT_SYMBOL_GPL(kvm_resched); 1483 1484 static int kvm_vcpu_fault(struct vm_area_struct *vma, struct vm_fault *vmf) 1485 { 1486 struct kvm_vcpu *vcpu = vma->vm_file->private_data; 1487 struct page *page; 1488 1489 if (vmf->pgoff == 0) 1490 page = virt_to_page(vcpu->run); 1491 #ifdef CONFIG_X86 1492 else if (vmf->pgoff == KVM_PIO_PAGE_OFFSET) 1493 page = virt_to_page(vcpu->arch.pio_data); 1494 #endif 1495 #ifdef KVM_COALESCED_MMIO_PAGE_OFFSET 1496 else if (vmf->pgoff == KVM_COALESCED_MMIO_PAGE_OFFSET) 1497 page = virt_to_page(vcpu->kvm->coalesced_mmio_ring); 1498 #endif 1499 else 1500 return VM_FAULT_SIGBUS; 1501 get_page(page); 1502 vmf->page = page; 1503 return 0; 1504 } 1505 1506 static struct vm_operations_struct kvm_vcpu_vm_ops = { 1507 .fault = kvm_vcpu_fault, 1508 }; 1509 1510 static int kvm_vcpu_mmap(struct file *file, struct vm_area_struct *vma) 1511 { 1512 vma->vm_ops = &kvm_vcpu_vm_ops; 1513 return 0; 1514 } 1515 1516 static int kvm_vcpu_release(struct inode *inode, struct file *filp) 1517 { 1518 struct kvm_vcpu *vcpu = filp->private_data; 1519 1520 kvm_put_kvm(vcpu->kvm); 1521 return 0; 1522 } 1523 1524 static struct file_operations kvm_vcpu_fops = { 1525 .release = kvm_vcpu_release, 1526 .unlocked_ioctl = kvm_vcpu_ioctl, 1527 .compat_ioctl = kvm_vcpu_ioctl, 1528 .mmap = kvm_vcpu_mmap, 1529 }; 1530 1531 /* 1532 * Allocates an inode for the vcpu. 1533 */ 1534 static int create_vcpu_fd(struct kvm_vcpu *vcpu) 1535 { 1536 int fd = anon_inode_getfd("kvm-vcpu", &kvm_vcpu_fops, vcpu, 0); 1537 if (fd < 0) 1538 kvm_put_kvm(vcpu->kvm); 1539 return fd; 1540 } 1541 1542 /* 1543 * Creates some virtual cpus. Good luck creating more than one. 
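 * The new vcpu is exposed to userspace as an anonymous inode fd; creating
 * it takes an extra reference on the VM (kvm_get_kvm), which is dropped
 * again when the fd is released.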
1544 */ 1545 static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, int n) 1546 { 1547 int r; 1548 struct kvm_vcpu *vcpu; 1549 1550 if (!valid_vcpu(n)) 1551 return -EINVAL; 1552 1553 vcpu = kvm_arch_vcpu_create(kvm, n); 1554 if (IS_ERR(vcpu)) 1555 return PTR_ERR(vcpu); 1556 1557 preempt_notifier_init(&vcpu->preempt_notifier, &kvm_preempt_ops); 1558 1559 r = kvm_arch_vcpu_setup(vcpu); 1560 if (r) 1561 return r; 1562 1563 mutex_lock(&kvm->lock); 1564 if (kvm->vcpus[n]) { 1565 r = -EEXIST; 1566 goto vcpu_destroy; 1567 } 1568 kvm->vcpus[n] = vcpu; 1569 mutex_unlock(&kvm->lock); 1570 1571 /* Now it's all set up, let userspace reach it */ 1572 kvm_get_kvm(kvm); 1573 r = create_vcpu_fd(vcpu); 1574 if (r < 0) 1575 goto unlink; 1576 return r; 1577 1578 unlink: 1579 mutex_lock(&kvm->lock); 1580 kvm->vcpus[n] = NULL; 1581 vcpu_destroy: 1582 mutex_unlock(&kvm->lock); 1583 kvm_arch_vcpu_destroy(vcpu); 1584 return r; 1585 } 1586 1587 static int kvm_vcpu_ioctl_set_sigmask(struct kvm_vcpu *vcpu, sigset_t *sigset) 1588 { 1589 if (sigset) { 1590 sigdelsetmask(sigset, sigmask(SIGKILL)|sigmask(SIGSTOP)); 1591 vcpu->sigset_active = 1; 1592 vcpu->sigset = *sigset; 1593 } else 1594 vcpu->sigset_active = 0; 1595 return 0; 1596 } 1597 1598 static long kvm_vcpu_ioctl(struct file *filp, 1599 unsigned int ioctl, unsigned long arg) 1600 { 1601 struct kvm_vcpu *vcpu = filp->private_data; 1602 void __user *argp = (void __user *)arg; 1603 int r; 1604 struct kvm_fpu *fpu = NULL; 1605 struct kvm_sregs *kvm_sregs = NULL; 1606 1607 if (vcpu->kvm->mm != current->mm) 1608 return -EIO; 1609 switch (ioctl) { 1610 case KVM_RUN: 1611 r = -EINVAL; 1612 if (arg) 1613 goto out; 1614 r = kvm_arch_vcpu_ioctl_run(vcpu, vcpu->run); 1615 break; 1616 case KVM_GET_REGS: { 1617 struct kvm_regs *kvm_regs; 1618 1619 r = -ENOMEM; 1620 kvm_regs = kzalloc(sizeof(struct kvm_regs), GFP_KERNEL); 1621 if (!kvm_regs) 1622 goto out; 1623 r = kvm_arch_vcpu_ioctl_get_regs(vcpu, kvm_regs); 1624 if (r) 1625 goto out_free1; 1626 r = -EFAULT; 1627 if (copy_to_user(argp, kvm_regs, sizeof(struct kvm_regs))) 1628 goto out_free1; 1629 r = 0; 1630 out_free1: 1631 kfree(kvm_regs); 1632 break; 1633 } 1634 case KVM_SET_REGS: { 1635 struct kvm_regs *kvm_regs; 1636 1637 r = -ENOMEM; 1638 kvm_regs = kzalloc(sizeof(struct kvm_regs), GFP_KERNEL); 1639 if (!kvm_regs) 1640 goto out; 1641 r = -EFAULT; 1642 if (copy_from_user(kvm_regs, argp, sizeof(struct kvm_regs))) 1643 goto out_free2; 1644 r = kvm_arch_vcpu_ioctl_set_regs(vcpu, kvm_regs); 1645 if (r) 1646 goto out_free2; 1647 r = 0; 1648 out_free2: 1649 kfree(kvm_regs); 1650 break; 1651 } 1652 case KVM_GET_SREGS: { 1653 kvm_sregs = kzalloc(sizeof(struct kvm_sregs), GFP_KERNEL); 1654 r = -ENOMEM; 1655 if (!kvm_sregs) 1656 goto out; 1657 r = kvm_arch_vcpu_ioctl_get_sregs(vcpu, kvm_sregs); 1658 if (r) 1659 goto out; 1660 r = -EFAULT; 1661 if (copy_to_user(argp, kvm_sregs, sizeof(struct kvm_sregs))) 1662 goto out; 1663 r = 0; 1664 break; 1665 } 1666 case KVM_SET_SREGS: { 1667 kvm_sregs = kmalloc(sizeof(struct kvm_sregs), GFP_KERNEL); 1668 r = -ENOMEM; 1669 if (!kvm_sregs) 1670 goto out; 1671 r = -EFAULT; 1672 if (copy_from_user(kvm_sregs, argp, sizeof(struct kvm_sregs))) 1673 goto out; 1674 r = kvm_arch_vcpu_ioctl_set_sregs(vcpu, kvm_sregs); 1675 if (r) 1676 goto out; 1677 r = 0; 1678 break; 1679 } 1680 case KVM_GET_MP_STATE: { 1681 struct kvm_mp_state mp_state; 1682 1683 r = kvm_arch_vcpu_ioctl_get_mpstate(vcpu, &mp_state); 1684 if (r) 1685 goto out; 1686 r = -EFAULT; 1687 if (copy_to_user(argp, &mp_state, sizeof mp_state)) 
1688 goto out; 1689 r = 0; 1690 break; 1691 } 1692 case KVM_SET_MP_STATE: { 1693 struct kvm_mp_state mp_state; 1694 1695 r = -EFAULT; 1696 if (copy_from_user(&mp_state, argp, sizeof mp_state)) 1697 goto out; 1698 r = kvm_arch_vcpu_ioctl_set_mpstate(vcpu, &mp_state); 1699 if (r) 1700 goto out; 1701 r = 0; 1702 break; 1703 } 1704 case KVM_TRANSLATE: { 1705 struct kvm_translation tr; 1706 1707 r = -EFAULT; 1708 if (copy_from_user(&tr, argp, sizeof tr)) 1709 goto out; 1710 r = kvm_arch_vcpu_ioctl_translate(vcpu, &tr); 1711 if (r) 1712 goto out; 1713 r = -EFAULT; 1714 if (copy_to_user(argp, &tr, sizeof tr)) 1715 goto out; 1716 r = 0; 1717 break; 1718 } 1719 case KVM_SET_GUEST_DEBUG: { 1720 struct kvm_guest_debug dbg; 1721 1722 r = -EFAULT; 1723 if (copy_from_user(&dbg, argp, sizeof dbg)) 1724 goto out; 1725 r = kvm_arch_vcpu_ioctl_set_guest_debug(vcpu, &dbg); 1726 if (r) 1727 goto out; 1728 r = 0; 1729 break; 1730 } 1731 case KVM_SET_SIGNAL_MASK: { 1732 struct kvm_signal_mask __user *sigmask_arg = argp; 1733 struct kvm_signal_mask kvm_sigmask; 1734 sigset_t sigset, *p; 1735 1736 p = NULL; 1737 if (argp) { 1738 r = -EFAULT; 1739 if (copy_from_user(&kvm_sigmask, argp, 1740 sizeof kvm_sigmask)) 1741 goto out; 1742 r = -EINVAL; 1743 if (kvm_sigmask.len != sizeof sigset) 1744 goto out; 1745 r = -EFAULT; 1746 if (copy_from_user(&sigset, sigmask_arg->sigset, 1747 sizeof sigset)) 1748 goto out; 1749 p = &sigset; 1750 } 1751 r = kvm_vcpu_ioctl_set_sigmask(vcpu, &sigset); 1752 break; 1753 } 1754 case KVM_GET_FPU: { 1755 fpu = kzalloc(sizeof(struct kvm_fpu), GFP_KERNEL); 1756 r = -ENOMEM; 1757 if (!fpu) 1758 goto out; 1759 r = kvm_arch_vcpu_ioctl_get_fpu(vcpu, fpu); 1760 if (r) 1761 goto out; 1762 r = -EFAULT; 1763 if (copy_to_user(argp, fpu, sizeof(struct kvm_fpu))) 1764 goto out; 1765 r = 0; 1766 break; 1767 } 1768 case KVM_SET_FPU: { 1769 fpu = kmalloc(sizeof(struct kvm_fpu), GFP_KERNEL); 1770 r = -ENOMEM; 1771 if (!fpu) 1772 goto out; 1773 r = -EFAULT; 1774 if (copy_from_user(fpu, argp, sizeof(struct kvm_fpu))) 1775 goto out; 1776 r = kvm_arch_vcpu_ioctl_set_fpu(vcpu, fpu); 1777 if (r) 1778 goto out; 1779 r = 0; 1780 break; 1781 } 1782 default: 1783 r = kvm_arch_vcpu_ioctl(filp, ioctl, arg); 1784 } 1785 out: 1786 kfree(fpu); 1787 kfree(kvm_sregs); 1788 return r; 1789 } 1790 1791 static long kvm_vm_ioctl(struct file *filp, 1792 unsigned int ioctl, unsigned long arg) 1793 { 1794 struct kvm *kvm = filp->private_data; 1795 void __user *argp = (void __user *)arg; 1796 int r; 1797 1798 if (kvm->mm != current->mm) 1799 return -EIO; 1800 switch (ioctl) { 1801 case KVM_CREATE_VCPU: 1802 r = kvm_vm_ioctl_create_vcpu(kvm, arg); 1803 if (r < 0) 1804 goto out; 1805 break; 1806 case KVM_SET_USER_MEMORY_REGION: { 1807 struct kvm_userspace_memory_region kvm_userspace_mem; 1808 1809 r = -EFAULT; 1810 if (copy_from_user(&kvm_userspace_mem, argp, 1811 sizeof kvm_userspace_mem)) 1812 goto out; 1813 1814 r = kvm_vm_ioctl_set_memory_region(kvm, &kvm_userspace_mem, 1); 1815 if (r) 1816 goto out; 1817 break; 1818 } 1819 case KVM_GET_DIRTY_LOG: { 1820 struct kvm_dirty_log log; 1821 1822 r = -EFAULT; 1823 if (copy_from_user(&log, argp, sizeof log)) 1824 goto out; 1825 r = kvm_vm_ioctl_get_dirty_log(kvm, &log); 1826 if (r) 1827 goto out; 1828 break; 1829 } 1830 #ifdef KVM_COALESCED_MMIO_PAGE_OFFSET 1831 case KVM_REGISTER_COALESCED_MMIO: { 1832 struct kvm_coalesced_mmio_zone zone; 1833 r = -EFAULT; 1834 if (copy_from_user(&zone, argp, sizeof zone)) 1835 goto out; 1836 r = -ENXIO; 1837 r = 
kvm_vm_ioctl_register_coalesced_mmio(kvm, &zone); 1838 if (r) 1839 goto out; 1840 r = 0; 1841 break; 1842 } 1843 case KVM_UNREGISTER_COALESCED_MMIO: { 1844 struct kvm_coalesced_mmio_zone zone; 1845 r = -EFAULT; 1846 if (copy_from_user(&zone, argp, sizeof zone)) 1847 goto out; 1848 r = -ENXIO; 1849 r = kvm_vm_ioctl_unregister_coalesced_mmio(kvm, &zone); 1850 if (r) 1851 goto out; 1852 r = 0; 1853 break; 1854 } 1855 #endif 1856 #ifdef KVM_CAP_DEVICE_ASSIGNMENT 1857 case KVM_ASSIGN_PCI_DEVICE: { 1858 struct kvm_assigned_pci_dev assigned_dev; 1859 1860 r = -EFAULT; 1861 if (copy_from_user(&assigned_dev, argp, sizeof assigned_dev)) 1862 goto out; 1863 r = kvm_vm_ioctl_assign_device(kvm, &assigned_dev); 1864 if (r) 1865 goto out; 1866 break; 1867 } 1868 case KVM_ASSIGN_IRQ: { 1869 struct kvm_assigned_irq assigned_irq; 1870 1871 r = -EFAULT; 1872 if (copy_from_user(&assigned_irq, argp, sizeof assigned_irq)) 1873 goto out; 1874 r = kvm_vm_ioctl_assign_irq(kvm, &assigned_irq); 1875 if (r) 1876 goto out; 1877 break; 1878 } 1879 #endif 1880 #ifdef KVM_CAP_DEVICE_DEASSIGNMENT 1881 case KVM_DEASSIGN_PCI_DEVICE: { 1882 struct kvm_assigned_pci_dev assigned_dev; 1883 1884 r = -EFAULT; 1885 if (copy_from_user(&assigned_dev, argp, sizeof assigned_dev)) 1886 goto out; 1887 r = kvm_vm_ioctl_deassign_device(kvm, &assigned_dev); 1888 if (r) 1889 goto out; 1890 break; 1891 } 1892 #endif 1893 #ifdef KVM_CAP_IRQ_ROUTING 1894 case KVM_SET_GSI_ROUTING: { 1895 struct kvm_irq_routing routing; 1896 struct kvm_irq_routing __user *urouting; 1897 struct kvm_irq_routing_entry *entries; 1898 1899 r = -EFAULT; 1900 if (copy_from_user(&routing, argp, sizeof(routing))) 1901 goto out; 1902 r = -EINVAL; 1903 if (routing.nr >= KVM_MAX_IRQ_ROUTES) 1904 goto out; 1905 if (routing.flags) 1906 goto out; 1907 r = -ENOMEM; 1908 entries = vmalloc(routing.nr * sizeof(*entries)); 1909 if (!entries) 1910 goto out; 1911 r = -EFAULT; 1912 urouting = argp; 1913 if (copy_from_user(entries, urouting->entries, 1914 routing.nr * sizeof(*entries))) 1915 goto out_free_irq_routing; 1916 r = kvm_set_irq_routing(kvm, entries, routing.nr, 1917 routing.flags); 1918 out_free_irq_routing: 1919 vfree(entries); 1920 break; 1921 } 1922 #endif 1923 default: 1924 r = kvm_arch_vm_ioctl(filp, ioctl, arg); 1925 } 1926 out: 1927 return r; 1928 } 1929 1930 static int kvm_vm_fault(struct vm_area_struct *vma, struct vm_fault *vmf) 1931 { 1932 struct page *page[1]; 1933 unsigned long addr; 1934 int npages; 1935 gfn_t gfn = vmf->pgoff; 1936 struct kvm *kvm = vma->vm_file->private_data; 1937 1938 addr = gfn_to_hva(kvm, gfn); 1939 if (kvm_is_error_hva(addr)) 1940 return VM_FAULT_SIGBUS; 1941 1942 npages = get_user_pages(current, current->mm, addr, 1, 1, 0, page, 1943 NULL); 1944 if (unlikely(npages != 1)) 1945 return VM_FAULT_SIGBUS; 1946 1947 vmf->page = page[0]; 1948 return 0; 1949 } 1950 1951 static struct vm_operations_struct kvm_vm_vm_ops = { 1952 .fault = kvm_vm_fault, 1953 }; 1954 1955 static int kvm_vm_mmap(struct file *file, struct vm_area_struct *vma) 1956 { 1957 vma->vm_ops = &kvm_vm_vm_ops; 1958 return 0; 1959 } 1960 1961 static struct file_operations kvm_vm_fops = { 1962 .release = kvm_vm_release, 1963 .unlocked_ioctl = kvm_vm_ioctl, 1964 .compat_ioctl = kvm_vm_ioctl, 1965 .mmap = kvm_vm_mmap, 1966 }; 1967 1968 static int kvm_dev_ioctl_create_vm(void) 1969 { 1970 int fd; 1971 struct kvm *kvm; 1972 1973 kvm = kvm_create_vm(); 1974 if (IS_ERR(kvm)) 1975 return PTR_ERR(kvm); 1976 fd = anon_inode_getfd("kvm-vm", &kvm_vm_fops, kvm, 0); 1977 if (fd < 0) 1978 
kvm_put_kvm(kvm); 1979 1980 return fd; 1981 } 1982 1983 static long kvm_dev_ioctl_check_extension_generic(long arg) 1984 { 1985 switch (arg) { 1986 case KVM_CAP_USER_MEMORY: 1987 case KVM_CAP_DESTROY_MEMORY_REGION_WORKS: 1988 return 1; 1989 #ifdef CONFIG_HAVE_KVM_IRQCHIP 1990 case KVM_CAP_IRQ_ROUTING: 1991 return KVM_MAX_IRQ_ROUTES; 1992 #endif 1993 default: 1994 break; 1995 } 1996 return kvm_dev_ioctl_check_extension(arg); 1997 } 1998 1999 static long kvm_dev_ioctl(struct file *filp, 2000 unsigned int ioctl, unsigned long arg) 2001 { 2002 long r = -EINVAL; 2003 2004 switch (ioctl) { 2005 case KVM_GET_API_VERSION: 2006 r = -EINVAL; 2007 if (arg) 2008 goto out; 2009 r = KVM_API_VERSION; 2010 break; 2011 case KVM_CREATE_VM: 2012 r = -EINVAL; 2013 if (arg) 2014 goto out; 2015 r = kvm_dev_ioctl_create_vm(); 2016 break; 2017 case KVM_CHECK_EXTENSION: 2018 r = kvm_dev_ioctl_check_extension_generic(arg); 2019 break; 2020 case KVM_GET_VCPU_MMAP_SIZE: 2021 r = -EINVAL; 2022 if (arg) 2023 goto out; 2024 r = PAGE_SIZE; /* struct kvm_run */ 2025 #ifdef CONFIG_X86 2026 r += PAGE_SIZE; /* pio data page */ 2027 #endif 2028 #ifdef KVM_COALESCED_MMIO_PAGE_OFFSET 2029 r += PAGE_SIZE; /* coalesced mmio ring page */ 2030 #endif 2031 break; 2032 case KVM_TRACE_ENABLE: 2033 case KVM_TRACE_PAUSE: 2034 case KVM_TRACE_DISABLE: 2035 r = kvm_trace_ioctl(ioctl, arg); 2036 break; 2037 default: 2038 return kvm_arch_dev_ioctl(filp, ioctl, arg); 2039 } 2040 out: 2041 return r; 2042 } 2043 2044 static struct file_operations kvm_chardev_ops = { 2045 .unlocked_ioctl = kvm_dev_ioctl, 2046 .compat_ioctl = kvm_dev_ioctl, 2047 }; 2048 2049 static struct miscdevice kvm_dev = { 2050 KVM_MINOR, 2051 "kvm", 2052 &kvm_chardev_ops, 2053 }; 2054 2055 static void hardware_enable(void *junk) 2056 { 2057 int cpu = raw_smp_processor_id(); 2058 2059 if (cpumask_test_cpu(cpu, cpus_hardware_enabled)) 2060 return; 2061 cpumask_set_cpu(cpu, cpus_hardware_enabled); 2062 kvm_arch_hardware_enable(NULL); 2063 } 2064 2065 static void hardware_disable(void *junk) 2066 { 2067 int cpu = raw_smp_processor_id(); 2068 2069 if (!cpumask_test_cpu(cpu, cpus_hardware_enabled)) 2070 return; 2071 cpumask_clear_cpu(cpu, cpus_hardware_enabled); 2072 kvm_arch_hardware_disable(NULL); 2073 } 2074 2075 static int kvm_cpu_hotplug(struct notifier_block *notifier, unsigned long val, 2076 void *v) 2077 { 2078 int cpu = (long)v; 2079 2080 val &= ~CPU_TASKS_FROZEN; 2081 switch (val) { 2082 case CPU_DYING: 2083 printk(KERN_INFO "kvm: disabling virtualization on CPU%d\n", 2084 cpu); 2085 hardware_disable(NULL); 2086 break; 2087 case CPU_UP_CANCELED: 2088 printk(KERN_INFO "kvm: disabling virtualization on CPU%d\n", 2089 cpu); 2090 smp_call_function_single(cpu, hardware_disable, NULL, 1); 2091 break; 2092 case CPU_ONLINE: 2093 printk(KERN_INFO "kvm: enabling virtualization on CPU%d\n", 2094 cpu); 2095 smp_call_function_single(cpu, hardware_enable, NULL, 1); 2096 break; 2097 } 2098 return NOTIFY_OK; 2099 } 2100 2101 2102 asmlinkage void kvm_handle_fault_on_reboot(void) 2103 { 2104 if (kvm_rebooting) 2105 /* spin while reset goes on */ 2106 while (true) 2107 ; 2108 /* Fault while not rebooting. We want the trace. */ 2109 BUG(); 2110 } 2111 EXPORT_SYMBOL_GPL(kvm_handle_fault_on_reboot); 2112 2113 static int kvm_reboot(struct notifier_block *notifier, unsigned long val, 2114 void *v) 2115 { 2116 if (val == SYS_RESTART) { 2117 /* 2118 * Some (well, at least mine) BIOSes hang on reboot if 2119 * in vmx root mode. 
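 * Disable hardware virtualization on every CPU before the restart goes
 * ahead; setting kvm_rebooting also makes faults taken on the way down
 * spin in kvm_handle_fault_on_reboot() instead of hitting BUG().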
2120 */ 2121 printk(KERN_INFO "kvm: exiting hardware virtualization\n"); 2122 kvm_rebooting = true; 2123 on_each_cpu(hardware_disable, NULL, 1); 2124 } 2125 return NOTIFY_OK; 2126 } 2127 2128 static struct notifier_block kvm_reboot_notifier = { 2129 .notifier_call = kvm_reboot, 2130 .priority = 0, 2131 }; 2132 2133 void kvm_io_bus_init(struct kvm_io_bus *bus) 2134 { 2135 memset(bus, 0, sizeof(*bus)); 2136 } 2137 2138 void kvm_io_bus_destroy(struct kvm_io_bus *bus) 2139 { 2140 int i; 2141 2142 for (i = 0; i < bus->dev_count; i++) { 2143 struct kvm_io_device *pos = bus->devs[i]; 2144 2145 kvm_iodevice_destructor(pos); 2146 } 2147 } 2148 2149 struct kvm_io_device *kvm_io_bus_find_dev(struct kvm_io_bus *bus, 2150 gpa_t addr, int len, int is_write) 2151 { 2152 int i; 2153 2154 for (i = 0; i < bus->dev_count; i++) { 2155 struct kvm_io_device *pos = bus->devs[i]; 2156 2157 if (pos->in_range(pos, addr, len, is_write)) 2158 return pos; 2159 } 2160 2161 return NULL; 2162 } 2163 2164 void kvm_io_bus_register_dev(struct kvm_io_bus *bus, struct kvm_io_device *dev) 2165 { 2166 BUG_ON(bus->dev_count > (NR_IOBUS_DEVS-1)); 2167 2168 bus->devs[bus->dev_count++] = dev; 2169 } 2170 2171 static struct notifier_block kvm_cpu_notifier = { 2172 .notifier_call = kvm_cpu_hotplug, 2173 .priority = 20, /* must be > scheduler priority */ 2174 }; 2175 2176 static int vm_stat_get(void *_offset, u64 *val) 2177 { 2178 unsigned offset = (long)_offset; 2179 struct kvm *kvm; 2180 2181 *val = 0; 2182 spin_lock(&kvm_lock); 2183 list_for_each_entry(kvm, &vm_list, vm_list) 2184 *val += *(u32 *)((void *)kvm + offset); 2185 spin_unlock(&kvm_lock); 2186 return 0; 2187 } 2188 2189 DEFINE_SIMPLE_ATTRIBUTE(vm_stat_fops, vm_stat_get, NULL, "%llu\n"); 2190 2191 static int vcpu_stat_get(void *_offset, u64 *val) 2192 { 2193 unsigned offset = (long)_offset; 2194 struct kvm *kvm; 2195 struct kvm_vcpu *vcpu; 2196 int i; 2197 2198 *val = 0; 2199 spin_lock(&kvm_lock); 2200 list_for_each_entry(kvm, &vm_list, vm_list) 2201 for (i = 0; i < KVM_MAX_VCPUS; ++i) { 2202 vcpu = kvm->vcpus[i]; 2203 if (vcpu) 2204 *val += *(u32 *)((void *)vcpu + offset); 2205 } 2206 spin_unlock(&kvm_lock); 2207 return 0; 2208 } 2209 2210 DEFINE_SIMPLE_ATTRIBUTE(vcpu_stat_fops, vcpu_stat_get, NULL, "%llu\n"); 2211 2212 static struct file_operations *stat_fops[] = { 2213 [KVM_STAT_VCPU] = &vcpu_stat_fops, 2214 [KVM_STAT_VM] = &vm_stat_fops, 2215 }; 2216 2217 static void kvm_init_debug(void) 2218 { 2219 struct kvm_stats_debugfs_item *p; 2220 2221 kvm_debugfs_dir = debugfs_create_dir("kvm", NULL); 2222 for (p = debugfs_entries; p->name; ++p) 2223 p->dentry = debugfs_create_file(p->name, 0444, kvm_debugfs_dir, 2224 (void *)(long)p->offset, 2225 stat_fops[p->kind]); 2226 } 2227 2228 static void kvm_exit_debug(void) 2229 { 2230 struct kvm_stats_debugfs_item *p; 2231 2232 for (p = debugfs_entries; p->name; ++p) 2233 debugfs_remove(p->dentry); 2234 debugfs_remove(kvm_debugfs_dir); 2235 } 2236 2237 static int kvm_suspend(struct sys_device *dev, pm_message_t state) 2238 { 2239 hardware_disable(NULL); 2240 return 0; 2241 } 2242 2243 static int kvm_resume(struct sys_device *dev) 2244 { 2245 hardware_enable(NULL); 2246 return 0; 2247 } 2248 2249 static struct sysdev_class kvm_sysdev_class = { 2250 .name = "kvm", 2251 .suspend = kvm_suspend, 2252 .resume = kvm_resume, 2253 }; 2254 2255 static struct sys_device kvm_sysdev = { 2256 .id = 0, 2257 .cls = &kvm_sysdev_class, 2258 }; 2259 2260 struct page *bad_page; 2261 pfn_t bad_pfn; 2262 2263 static inline 2264 struct kvm_vcpu 
*preempt_notifier_to_vcpu(struct preempt_notifier *pn) 2265 { 2266 return container_of(pn, struct kvm_vcpu, preempt_notifier); 2267 } 2268 2269 static void kvm_sched_in(struct preempt_notifier *pn, int cpu) 2270 { 2271 struct kvm_vcpu *vcpu = preempt_notifier_to_vcpu(pn); 2272 2273 kvm_arch_vcpu_load(vcpu, cpu); 2274 } 2275 2276 static void kvm_sched_out(struct preempt_notifier *pn, 2277 struct task_struct *next) 2278 { 2279 struct kvm_vcpu *vcpu = preempt_notifier_to_vcpu(pn); 2280 2281 kvm_arch_vcpu_put(vcpu); 2282 } 2283 2284 int kvm_init(void *opaque, unsigned int vcpu_size, 2285 struct module *module) 2286 { 2287 int r; 2288 int cpu; 2289 2290 kvm_init_debug(); 2291 2292 r = kvm_arch_init(opaque); 2293 if (r) 2294 goto out_fail; 2295 2296 bad_page = alloc_page(GFP_KERNEL | __GFP_ZERO); 2297 2298 if (bad_page == NULL) { 2299 r = -ENOMEM; 2300 goto out; 2301 } 2302 2303 bad_pfn = page_to_pfn(bad_page); 2304 2305 if (!alloc_cpumask_var(&cpus_hardware_enabled, GFP_KERNEL)) { 2306 r = -ENOMEM; 2307 goto out_free_0; 2308 } 2309 2310 r = kvm_arch_hardware_setup(); 2311 if (r < 0) 2312 goto out_free_0a; 2313 2314 for_each_online_cpu(cpu) { 2315 smp_call_function_single(cpu, 2316 kvm_arch_check_processor_compat, 2317 &r, 1); 2318 if (r < 0) 2319 goto out_free_1; 2320 } 2321 2322 on_each_cpu(hardware_enable, NULL, 1); 2323 r = register_cpu_notifier(&kvm_cpu_notifier); 2324 if (r) 2325 goto out_free_2; 2326 register_reboot_notifier(&kvm_reboot_notifier); 2327 2328 r = sysdev_class_register(&kvm_sysdev_class); 2329 if (r) 2330 goto out_free_3; 2331 2332 r = sysdev_register(&kvm_sysdev); 2333 if (r) 2334 goto out_free_4; 2335 2336 /* A kmem cache lets us meet the alignment requirements of fx_save. */ 2337 kvm_vcpu_cache = kmem_cache_create("kvm_vcpu", vcpu_size, 2338 __alignof__(struct kvm_vcpu), 2339 0, NULL); 2340 if (!kvm_vcpu_cache) { 2341 r = -ENOMEM; 2342 goto out_free_5; 2343 } 2344 2345 kvm_chardev_ops.owner = module; 2346 kvm_vm_fops.owner = module; 2347 kvm_vcpu_fops.owner = module; 2348 2349 r = misc_register(&kvm_dev); 2350 if (r) { 2351 printk(KERN_ERR "kvm: misc device register failed\n"); 2352 goto out_free; 2353 } 2354 2355 kvm_preempt_ops.sched_in = kvm_sched_in; 2356 kvm_preempt_ops.sched_out = kvm_sched_out; 2357 #ifndef CONFIG_X86 2358 msi2intx = 0; 2359 #endif 2360 2361 return 0; 2362 2363 out_free: 2364 kmem_cache_destroy(kvm_vcpu_cache); 2365 out_free_5: 2366 sysdev_unregister(&kvm_sysdev); 2367 out_free_4: 2368 sysdev_class_unregister(&kvm_sysdev_class); 2369 out_free_3: 2370 unregister_reboot_notifier(&kvm_reboot_notifier); 2371 unregister_cpu_notifier(&kvm_cpu_notifier); 2372 out_free_2: 2373 on_each_cpu(hardware_disable, NULL, 1); 2374 out_free_1: 2375 kvm_arch_hardware_unsetup(); 2376 out_free_0a: 2377 free_cpumask_var(cpus_hardware_enabled); 2378 out_free_0: 2379 __free_page(bad_page); 2380 out: 2381 kvm_arch_exit(); 2382 kvm_exit_debug(); 2383 out_fail: 2384 return r; 2385 } 2386 EXPORT_SYMBOL_GPL(kvm_init); 2387 2388 void kvm_exit(void) 2389 { 2390 kvm_trace_cleanup(); 2391 misc_deregister(&kvm_dev); 2392 kmem_cache_destroy(kvm_vcpu_cache); 2393 sysdev_unregister(&kvm_sysdev); 2394 sysdev_class_unregister(&kvm_sysdev_class); 2395 unregister_reboot_notifier(&kvm_reboot_notifier); 2396 unregister_cpu_notifier(&kvm_cpu_notifier); 2397 on_each_cpu(hardware_disable, NULL, 1); 2398 kvm_arch_hardware_unsetup(); 2399 kvm_arch_exit(); 2400 kvm_exit_debug(); 2401 free_cpumask_var(cpus_hardware_enabled); 2402 __free_page(bad_page); 2403 } 2404 EXPORT_SYMBOL_GPL(kvm_exit); 2405