1 /* 2 * Kernel-based Virtual Machine driver for Linux 3 * 4 * This module enables machines with Intel VT-x extensions to run virtual 5 * machines without emulation or binary translation. 6 * 7 * Copyright (C) 2006 Qumranet, Inc. 8 * 9 * Authors: 10 * Avi Kivity <avi@qumranet.com> 11 * Yaniv Kamay <yaniv@qumranet.com> 12 * 13 * This work is licensed under the terms of the GNU GPL, version 2. See 14 * the COPYING file in the top-level directory. 15 * 16 */ 17 18 #include "iodev.h" 19 20 #include <linux/kvm_host.h> 21 #include <linux/kvm.h> 22 #include <linux/module.h> 23 #include <linux/errno.h> 24 #include <linux/percpu.h> 25 #include <linux/gfp.h> 26 #include <linux/mm.h> 27 #include <linux/miscdevice.h> 28 #include <linux/vmalloc.h> 29 #include <linux/reboot.h> 30 #include <linux/debugfs.h> 31 #include <linux/highmem.h> 32 #include <linux/file.h> 33 #include <linux/sysdev.h> 34 #include <linux/cpu.h> 35 #include <linux/sched.h> 36 #include <linux/cpumask.h> 37 #include <linux/smp.h> 38 #include <linux/anon_inodes.h> 39 #include <linux/profile.h> 40 #include <linux/kvm_para.h> 41 #include <linux/pagemap.h> 42 #include <linux/mman.h> 43 #include <linux/swap.h> 44 #include <linux/bitops.h> 45 #include <linux/spinlock.h> 46 47 #include <asm/processor.h> 48 #include <asm/io.h> 49 #include <asm/uaccess.h> 50 #include <asm/pgtable.h> 51 52 #ifdef KVM_COALESCED_MMIO_PAGE_OFFSET 53 #include "coalesced_mmio.h" 54 #endif 55 56 #ifdef KVM_CAP_DEVICE_ASSIGNMENT 57 #include <linux/pci.h> 58 #include <linux/interrupt.h> 59 #include "irq.h" 60 #endif 61 62 MODULE_AUTHOR("Qumranet"); 63 MODULE_LICENSE("GPL"); 64 65 DEFINE_SPINLOCK(kvm_lock); 66 LIST_HEAD(vm_list); 67 68 static cpumask_var_t cpus_hardware_enabled; 69 70 struct kmem_cache *kvm_vcpu_cache; 71 EXPORT_SYMBOL_GPL(kvm_vcpu_cache); 72 73 static __read_mostly struct preempt_ops kvm_preempt_ops; 74 75 struct dentry *kvm_debugfs_dir; 76 77 static long kvm_vcpu_ioctl(struct file *file, unsigned int ioctl, 78 unsigned long arg); 79 80 static bool kvm_rebooting; 81 82 #ifdef KVM_CAP_DEVICE_ASSIGNMENT 83 static struct kvm_assigned_dev_kernel *kvm_find_assigned_dev(struct list_head *head, 84 int assigned_dev_id) 85 { 86 struct list_head *ptr; 87 struct kvm_assigned_dev_kernel *match; 88 89 list_for_each(ptr, head) { 90 match = list_entry(ptr, struct kvm_assigned_dev_kernel, list); 91 if (match->assigned_dev_id == assigned_dev_id) 92 return match; 93 } 94 return NULL; 95 } 96 97 static int find_index_from_host_irq(struct kvm_assigned_dev_kernel 98 *assigned_dev, int irq) 99 { 100 int i, index; 101 struct msix_entry *host_msix_entries; 102 103 host_msix_entries = assigned_dev->host_msix_entries; 104 105 index = -1; 106 for (i = 0; i < assigned_dev->entries_nr; i++) 107 if (irq == host_msix_entries[i].vector) { 108 index = i; 109 break; 110 } 111 if (index < 0) { 112 printk(KERN_WARNING "Fail to find correlated MSI-X entry!\n"); 113 return 0; 114 } 115 116 return index; 117 } 118 119 static void kvm_assigned_dev_interrupt_work_handler(struct work_struct *work) 120 { 121 struct kvm_assigned_dev_kernel *assigned_dev; 122 struct kvm *kvm; 123 int irq, i; 124 125 assigned_dev = container_of(work, struct kvm_assigned_dev_kernel, 126 interrupt_work); 127 kvm = assigned_dev->kvm; 128 129 /* This is taken to safely inject irq inside the guest. 
When 130 * the interrupt injection (or the ioapic code) uses a 131 * finer-grained lock, update this 132 */ 133 mutex_lock(&kvm->lock); 134 spin_lock_irq(&assigned_dev->assigned_dev_lock); 135 if (assigned_dev->irq_requested_type & KVM_DEV_IRQ_HOST_MSIX) { 136 struct kvm_guest_msix_entry *guest_entries = 137 assigned_dev->guest_msix_entries; 138 for (i = 0; i < assigned_dev->entries_nr; i++) { 139 if (!(guest_entries[i].flags & 140 KVM_ASSIGNED_MSIX_PENDING)) 141 continue; 142 guest_entries[i].flags &= ~KVM_ASSIGNED_MSIX_PENDING; 143 kvm_set_irq(assigned_dev->kvm, 144 assigned_dev->irq_source_id, 145 guest_entries[i].vector, 1); 146 irq = assigned_dev->host_msix_entries[i].vector; 147 if (irq != 0) 148 enable_irq(irq); 149 assigned_dev->host_irq_disabled = false; 150 } 151 } else { 152 kvm_set_irq(assigned_dev->kvm, assigned_dev->irq_source_id, 153 assigned_dev->guest_irq, 1); 154 if (assigned_dev->irq_requested_type & 155 KVM_DEV_IRQ_GUEST_MSI) { 156 enable_irq(assigned_dev->host_irq); 157 assigned_dev->host_irq_disabled = false; 158 } 159 } 160 161 spin_unlock_irq(&assigned_dev->assigned_dev_lock); 162 mutex_unlock(&assigned_dev->kvm->lock); 163 } 164 165 static irqreturn_t kvm_assigned_dev_intr(int irq, void *dev_id) 166 { 167 unsigned long flags; 168 struct kvm_assigned_dev_kernel *assigned_dev = 169 (struct kvm_assigned_dev_kernel *) dev_id; 170 171 spin_lock_irqsave(&assigned_dev->assigned_dev_lock, flags); 172 if (assigned_dev->irq_requested_type & KVM_DEV_IRQ_HOST_MSIX) { 173 int index = find_index_from_host_irq(assigned_dev, irq); 174 if (index < 0) 175 goto out; 176 assigned_dev->guest_msix_entries[index].flags |= 177 KVM_ASSIGNED_MSIX_PENDING; 178 } 179 180 schedule_work(&assigned_dev->interrupt_work); 181 182 disable_irq_nosync(irq); 183 assigned_dev->host_irq_disabled = true; 184 185 out: 186 spin_unlock_irqrestore(&assigned_dev->assigned_dev_lock, flags); 187 return IRQ_HANDLED; 188 } 189 190 /* Ack the irq line for an assigned device */ 191 static void kvm_assigned_dev_ack_irq(struct kvm_irq_ack_notifier *kian) 192 { 193 struct kvm_assigned_dev_kernel *dev; 194 unsigned long flags; 195 196 if (kian->gsi == -1) 197 return; 198 199 dev = container_of(kian, struct kvm_assigned_dev_kernel, 200 ack_notifier); 201 202 kvm_set_irq(dev->kvm, dev->irq_source_id, dev->guest_irq, 0); 203 204 /* The guest irq may be shared so this ack may be 205 * from another device. 206 */ 207 spin_lock_irqsave(&dev->assigned_dev_lock, flags); 208 if (dev->host_irq_disabled) { 209 enable_irq(dev->host_irq); 210 dev->host_irq_disabled = false; 211 } 212 spin_unlock_irqrestore(&dev->assigned_dev_lock, flags); 213 } 214 215 static void deassign_guest_irq(struct kvm *kvm, 216 struct kvm_assigned_dev_kernel *assigned_dev) 217 { 218 kvm_unregister_irq_ack_notifier(&assigned_dev->ack_notifier); 219 assigned_dev->ack_notifier.gsi = -1; 220 221 if (assigned_dev->irq_source_id != -1) 222 kvm_free_irq_source_id(kvm, assigned_dev->irq_source_id); 223 assigned_dev->irq_source_id = -1; 224 assigned_dev->irq_requested_type &= ~(KVM_DEV_IRQ_GUEST_MASK); 225 } 226 227 /* The function implicit hold kvm->lock mutex due to cancel_work_sync() */ 228 static void deassign_host_irq(struct kvm *kvm, 229 struct kvm_assigned_dev_kernel *assigned_dev) 230 { 231 /* 232 * In kvm_free_device_irq, cancel_work_sync return true if: 233 * 1. work is scheduled, and then cancelled. 234 * 2. work callback is executed. 235 * 236 * The first one ensured that the irq is disabled and no more events 237 * would happen. 
But for the second one, the irq may be enabled (e.g. 238 * for MSI). So we disable irq here to prevent further events. 239 * 240 * Notice this maybe result in nested disable if the interrupt type is 241 * INTx, but it's OK for we are going to free it. 242 * 243 * If this function is a part of VM destroy, please ensure that till 244 * now, the kvm state is still legal for probably we also have to wait 245 * interrupt_work done. 246 */ 247 if (assigned_dev->irq_requested_type & KVM_DEV_IRQ_HOST_MSIX) { 248 int i; 249 for (i = 0; i < assigned_dev->entries_nr; i++) 250 disable_irq_nosync(assigned_dev-> 251 host_msix_entries[i].vector); 252 253 cancel_work_sync(&assigned_dev->interrupt_work); 254 255 for (i = 0; i < assigned_dev->entries_nr; i++) 256 free_irq(assigned_dev->host_msix_entries[i].vector, 257 (void *)assigned_dev); 258 259 assigned_dev->entries_nr = 0; 260 kfree(assigned_dev->host_msix_entries); 261 kfree(assigned_dev->guest_msix_entries); 262 pci_disable_msix(assigned_dev->dev); 263 } else { 264 /* Deal with MSI and INTx */ 265 disable_irq_nosync(assigned_dev->host_irq); 266 cancel_work_sync(&assigned_dev->interrupt_work); 267 268 free_irq(assigned_dev->host_irq, (void *)assigned_dev); 269 270 if (assigned_dev->irq_requested_type & KVM_DEV_IRQ_HOST_MSI) 271 pci_disable_msi(assigned_dev->dev); 272 } 273 274 assigned_dev->irq_requested_type &= ~(KVM_DEV_IRQ_HOST_MASK); 275 } 276 277 static int kvm_deassign_irq(struct kvm *kvm, 278 struct kvm_assigned_dev_kernel *assigned_dev, 279 unsigned long irq_requested_type) 280 { 281 unsigned long guest_irq_type, host_irq_type; 282 283 if (!irqchip_in_kernel(kvm)) 284 return -EINVAL; 285 /* no irq assignment to deassign */ 286 if (!assigned_dev->irq_requested_type) 287 return -ENXIO; 288 289 host_irq_type = irq_requested_type & KVM_DEV_IRQ_HOST_MASK; 290 guest_irq_type = irq_requested_type & KVM_DEV_IRQ_GUEST_MASK; 291 292 if (host_irq_type) 293 deassign_host_irq(kvm, assigned_dev); 294 if (guest_irq_type) 295 deassign_guest_irq(kvm, assigned_dev); 296 297 return 0; 298 } 299 300 static void kvm_free_assigned_irq(struct kvm *kvm, 301 struct kvm_assigned_dev_kernel *assigned_dev) 302 { 303 kvm_deassign_irq(kvm, assigned_dev, assigned_dev->irq_requested_type); 304 } 305 306 static void kvm_free_assigned_device(struct kvm *kvm, 307 struct kvm_assigned_dev_kernel 308 *assigned_dev) 309 { 310 kvm_free_assigned_irq(kvm, assigned_dev); 311 312 pci_reset_function(assigned_dev->dev); 313 314 pci_release_regions(assigned_dev->dev); 315 pci_disable_device(assigned_dev->dev); 316 pci_dev_put(assigned_dev->dev); 317 318 list_del(&assigned_dev->list); 319 kfree(assigned_dev); 320 } 321 322 void kvm_free_all_assigned_devices(struct kvm *kvm) 323 { 324 struct list_head *ptr, *ptr2; 325 struct kvm_assigned_dev_kernel *assigned_dev; 326 327 list_for_each_safe(ptr, ptr2, &kvm->arch.assigned_dev_head) { 328 assigned_dev = list_entry(ptr, 329 struct kvm_assigned_dev_kernel, 330 list); 331 332 kvm_free_assigned_device(kvm, assigned_dev); 333 } 334 } 335 336 static int assigned_device_enable_host_intx(struct kvm *kvm, 337 struct kvm_assigned_dev_kernel *dev) 338 { 339 dev->host_irq = dev->dev->irq; 340 /* Even though this is PCI, we don't want to use shared 341 * interrupts. Sharing host devices with guest-assigned devices 342 * on the same interrupt line is not a happy situation: there 343 * are going to be long delays in accepting, acking, etc. 
344 */ 345 if (request_irq(dev->host_irq, kvm_assigned_dev_intr, 346 0, "kvm_assigned_intx_device", (void *)dev)) 347 return -EIO; 348 return 0; 349 } 350 351 #ifdef __KVM_HAVE_MSI 352 static int assigned_device_enable_host_msi(struct kvm *kvm, 353 struct kvm_assigned_dev_kernel *dev) 354 { 355 int r; 356 357 if (!dev->dev->msi_enabled) { 358 r = pci_enable_msi(dev->dev); 359 if (r) 360 return r; 361 } 362 363 dev->host_irq = dev->dev->irq; 364 if (request_irq(dev->host_irq, kvm_assigned_dev_intr, 0, 365 "kvm_assigned_msi_device", (void *)dev)) { 366 pci_disable_msi(dev->dev); 367 return -EIO; 368 } 369 370 return 0; 371 } 372 #endif 373 374 #ifdef __KVM_HAVE_MSIX 375 static int assigned_device_enable_host_msix(struct kvm *kvm, 376 struct kvm_assigned_dev_kernel *dev) 377 { 378 int i, r = -EINVAL; 379 380 /* host_msix_entries and guest_msix_entries should have been 381 * initialized */ 382 if (dev->entries_nr == 0) 383 return r; 384 385 r = pci_enable_msix(dev->dev, dev->host_msix_entries, dev->entries_nr); 386 if (r) 387 return r; 388 389 for (i = 0; i < dev->entries_nr; i++) { 390 r = request_irq(dev->host_msix_entries[i].vector, 391 kvm_assigned_dev_intr, 0, 392 "kvm_assigned_msix_device", 393 (void *)dev); 394 /* FIXME: free requested_irq's on failure */ 395 if (r) 396 return r; 397 } 398 399 return 0; 400 } 401 402 #endif 403 404 static int assigned_device_enable_guest_intx(struct kvm *kvm, 405 struct kvm_assigned_dev_kernel *dev, 406 struct kvm_assigned_irq *irq) 407 { 408 dev->guest_irq = irq->guest_irq; 409 dev->ack_notifier.gsi = irq->guest_irq; 410 return 0; 411 } 412 413 #ifdef __KVM_HAVE_MSI 414 static int assigned_device_enable_guest_msi(struct kvm *kvm, 415 struct kvm_assigned_dev_kernel *dev, 416 struct kvm_assigned_irq *irq) 417 { 418 dev->guest_irq = irq->guest_irq; 419 dev->ack_notifier.gsi = -1; 420 return 0; 421 } 422 #endif 423 #ifdef __KVM_HAVE_MSIX 424 static int assigned_device_enable_guest_msix(struct kvm *kvm, 425 struct kvm_assigned_dev_kernel *dev, 426 struct kvm_assigned_irq *irq) 427 { 428 dev->guest_irq = irq->guest_irq; 429 dev->ack_notifier.gsi = -1; 430 return 0; 431 } 432 #endif 433 434 static int assign_host_irq(struct kvm *kvm, 435 struct kvm_assigned_dev_kernel *dev, 436 __u32 host_irq_type) 437 { 438 int r = -EEXIST; 439 440 if (dev->irq_requested_type & KVM_DEV_IRQ_HOST_MASK) 441 return r; 442 443 switch (host_irq_type) { 444 case KVM_DEV_IRQ_HOST_INTX: 445 r = assigned_device_enable_host_intx(kvm, dev); 446 break; 447 #ifdef __KVM_HAVE_MSI 448 case KVM_DEV_IRQ_HOST_MSI: 449 r = assigned_device_enable_host_msi(kvm, dev); 450 break; 451 #endif 452 #ifdef __KVM_HAVE_MSIX 453 case KVM_DEV_IRQ_HOST_MSIX: 454 r = assigned_device_enable_host_msix(kvm, dev); 455 break; 456 #endif 457 default: 458 r = -EINVAL; 459 } 460 461 if (!r) 462 dev->irq_requested_type |= host_irq_type; 463 464 return r; 465 } 466 467 static int assign_guest_irq(struct kvm *kvm, 468 struct kvm_assigned_dev_kernel *dev, 469 struct kvm_assigned_irq *irq, 470 unsigned long guest_irq_type) 471 { 472 int id; 473 int r = -EEXIST; 474 475 if (dev->irq_requested_type & KVM_DEV_IRQ_GUEST_MASK) 476 return r; 477 478 id = kvm_request_irq_source_id(kvm); 479 if (id < 0) 480 return id; 481 482 dev->irq_source_id = id; 483 484 switch (guest_irq_type) { 485 case KVM_DEV_IRQ_GUEST_INTX: 486 r = assigned_device_enable_guest_intx(kvm, dev, irq); 487 break; 488 #ifdef __KVM_HAVE_MSI 489 case KVM_DEV_IRQ_GUEST_MSI: 490 r = assigned_device_enable_guest_msi(kvm, dev, irq); 491 break; 492 #endif 493 #ifdef 
__KVM_HAVE_MSIX 494 case KVM_DEV_IRQ_GUEST_MSIX: 495 r = assigned_device_enable_guest_msix(kvm, dev, irq); 496 break; 497 #endif 498 default: 499 r = -EINVAL; 500 } 501 502 if (!r) { 503 dev->irq_requested_type |= guest_irq_type; 504 kvm_register_irq_ack_notifier(kvm, &dev->ack_notifier); 505 } else 506 kvm_free_irq_source_id(kvm, dev->irq_source_id); 507 508 return r; 509 } 510 511 /* TODO Deal with KVM_DEV_IRQ_ASSIGNED_MASK_MSIX */ 512 static int kvm_vm_ioctl_assign_irq(struct kvm *kvm, 513 struct kvm_assigned_irq *assigned_irq) 514 { 515 int r = -EINVAL; 516 struct kvm_assigned_dev_kernel *match; 517 unsigned long host_irq_type, guest_irq_type; 518 519 if (!capable(CAP_SYS_RAWIO)) 520 return -EPERM; 521 522 if (!irqchip_in_kernel(kvm)) 523 return r; 524 525 mutex_lock(&kvm->lock); 526 r = -ENODEV; 527 match = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head, 528 assigned_irq->assigned_dev_id); 529 if (!match) 530 goto out; 531 532 host_irq_type = (assigned_irq->flags & KVM_DEV_IRQ_HOST_MASK); 533 guest_irq_type = (assigned_irq->flags & KVM_DEV_IRQ_GUEST_MASK); 534 535 r = -EINVAL; 536 /* can only assign one type at a time */ 537 if (hweight_long(host_irq_type) > 1) 538 goto out; 539 if (hweight_long(guest_irq_type) > 1) 540 goto out; 541 if (host_irq_type == 0 && guest_irq_type == 0) 542 goto out; 543 544 r = 0; 545 if (host_irq_type) 546 r = assign_host_irq(kvm, match, host_irq_type); 547 if (r) 548 goto out; 549 550 if (guest_irq_type) 551 r = assign_guest_irq(kvm, match, assigned_irq, guest_irq_type); 552 out: 553 mutex_unlock(&kvm->lock); 554 return r; 555 } 556 557 static int kvm_vm_ioctl_deassign_dev_irq(struct kvm *kvm, 558 struct kvm_assigned_irq 559 *assigned_irq) 560 { 561 int r = -ENODEV; 562 struct kvm_assigned_dev_kernel *match; 563 564 mutex_lock(&kvm->lock); 565 566 match = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head, 567 assigned_irq->assigned_dev_id); 568 if (!match) 569 goto out; 570 571 r = kvm_deassign_irq(kvm, match, assigned_irq->flags); 572 out: 573 mutex_unlock(&kvm->lock); 574 return r; 575 } 576 577 static int kvm_vm_ioctl_assign_device(struct kvm *kvm, 578 struct kvm_assigned_pci_dev *assigned_dev) 579 { 580 int r = 0; 581 struct kvm_assigned_dev_kernel *match; 582 struct pci_dev *dev; 583 584 down_read(&kvm->slots_lock); 585 mutex_lock(&kvm->lock); 586 587 match = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head, 588 assigned_dev->assigned_dev_id); 589 if (match) { 590 /* device already assigned */ 591 r = -EEXIST; 592 goto out; 593 } 594 595 match = kzalloc(sizeof(struct kvm_assigned_dev_kernel), GFP_KERNEL); 596 if (match == NULL) { 597 printk(KERN_INFO "%s: Couldn't allocate memory\n", 598 __func__); 599 r = -ENOMEM; 600 goto out; 601 } 602 dev = pci_get_bus_and_slot(assigned_dev->busnr, 603 assigned_dev->devfn); 604 if (!dev) { 605 printk(KERN_INFO "%s: host device not found\n", __func__); 606 r = -EINVAL; 607 goto out_free; 608 } 609 if (pci_enable_device(dev)) { 610 printk(KERN_INFO "%s: Could not enable PCI device\n", __func__); 611 r = -EBUSY; 612 goto out_put; 613 } 614 r = pci_request_regions(dev, "kvm_assigned_device"); 615 if (r) { 616 printk(KERN_INFO "%s: Could not get access to device regions\n", 617 __func__); 618 goto out_disable; 619 } 620 621 pci_reset_function(dev); 622 623 match->assigned_dev_id = assigned_dev->assigned_dev_id; 624 match->host_busnr = assigned_dev->busnr; 625 match->host_devfn = assigned_dev->devfn; 626 match->flags = assigned_dev->flags; 627 match->dev = dev; 628 spin_lock_init(&match->assigned_dev_lock); 629 
match->irq_source_id = -1; 630 match->kvm = kvm; 631 match->ack_notifier.irq_acked = kvm_assigned_dev_ack_irq; 632 INIT_WORK(&match->interrupt_work, 633 kvm_assigned_dev_interrupt_work_handler); 634 635 list_add(&match->list, &kvm->arch.assigned_dev_head); 636 637 if (assigned_dev->flags & KVM_DEV_ASSIGN_ENABLE_IOMMU) { 638 if (!kvm->arch.iommu_domain) { 639 r = kvm_iommu_map_guest(kvm); 640 if (r) 641 goto out_list_del; 642 } 643 r = kvm_assign_device(kvm, match); 644 if (r) 645 goto out_list_del; 646 } 647 648 out: 649 mutex_unlock(&kvm->lock); 650 up_read(&kvm->slots_lock); 651 return r; 652 out_list_del: 653 list_del(&match->list); 654 pci_release_regions(dev); 655 out_disable: 656 pci_disable_device(dev); 657 out_put: 658 pci_dev_put(dev); 659 out_free: 660 kfree(match); 661 mutex_unlock(&kvm->lock); 662 up_read(&kvm->slots_lock); 663 return r; 664 } 665 #endif 666 667 #ifdef KVM_CAP_DEVICE_DEASSIGNMENT 668 static int kvm_vm_ioctl_deassign_device(struct kvm *kvm, 669 struct kvm_assigned_pci_dev *assigned_dev) 670 { 671 int r = 0; 672 struct kvm_assigned_dev_kernel *match; 673 674 mutex_lock(&kvm->lock); 675 676 match = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head, 677 assigned_dev->assigned_dev_id); 678 if (!match) { 679 printk(KERN_INFO "%s: device hasn't been assigned before, " 680 "so cannot be deassigned\n", __func__); 681 r = -EINVAL; 682 goto out; 683 } 684 685 if (match->flags & KVM_DEV_ASSIGN_ENABLE_IOMMU) 686 kvm_deassign_device(kvm, match); 687 688 kvm_free_assigned_device(kvm, match); 689 690 out: 691 mutex_unlock(&kvm->lock); 692 return r; 693 } 694 #endif 695 696 static inline int valid_vcpu(int n) 697 { 698 return likely(n >= 0 && n < KVM_MAX_VCPUS); 699 } 700 701 inline int kvm_is_mmio_pfn(pfn_t pfn) 702 { 703 if (pfn_valid(pfn)) { 704 struct page *page = compound_head(pfn_to_page(pfn)); 705 return PageReserved(page); 706 } 707 708 return true; 709 } 710 711 /* 712 * Switches to specified vcpu, until a matching vcpu_put() 713 */ 714 void vcpu_load(struct kvm_vcpu *vcpu) 715 { 716 int cpu; 717 718 mutex_lock(&vcpu->mutex); 719 cpu = get_cpu(); 720 preempt_notifier_register(&vcpu->preempt_notifier); 721 kvm_arch_vcpu_load(vcpu, cpu); 722 put_cpu(); 723 } 724 725 void vcpu_put(struct kvm_vcpu *vcpu) 726 { 727 preempt_disable(); 728 kvm_arch_vcpu_put(vcpu); 729 preempt_notifier_unregister(&vcpu->preempt_notifier); 730 preempt_enable(); 731 mutex_unlock(&vcpu->mutex); 732 } 733 734 static void ack_flush(void *_completed) 735 { 736 } 737 738 static bool make_all_cpus_request(struct kvm *kvm, unsigned int req) 739 { 740 int i, cpu, me; 741 cpumask_var_t cpus; 742 bool called = true; 743 struct kvm_vcpu *vcpu; 744 745 if (alloc_cpumask_var(&cpus, GFP_ATOMIC)) 746 cpumask_clear(cpus); 747 748 me = get_cpu(); 749 spin_lock(&kvm->requests_lock); 750 for (i = 0; i < KVM_MAX_VCPUS; ++i) { 751 vcpu = kvm->vcpus[i]; 752 if (!vcpu) 753 continue; 754 if (test_and_set_bit(req, &vcpu->requests)) 755 continue; 756 cpu = vcpu->cpu; 757 if (cpus != NULL && cpu != -1 && cpu != me) 758 cpumask_set_cpu(cpu, cpus); 759 } 760 if (unlikely(cpus == NULL)) 761 smp_call_function_many(cpu_online_mask, ack_flush, NULL, 1); 762 else if (!cpumask_empty(cpus)) 763 smp_call_function_many(cpus, ack_flush, NULL, 1); 764 else 765 called = false; 766 spin_unlock(&kvm->requests_lock); 767 put_cpu(); 768 free_cpumask_var(cpus); 769 return called; 770 } 771 772 void kvm_flush_remote_tlbs(struct kvm *kvm) 773 { 774 if (make_all_cpus_request(kvm, KVM_REQ_TLB_FLUSH)) 775 ++kvm->stat.remote_tlb_flush; 776 } 777 
778 void kvm_reload_remote_mmus(struct kvm *kvm) 779 { 780 make_all_cpus_request(kvm, KVM_REQ_MMU_RELOAD); 781 } 782 783 int kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id) 784 { 785 struct page *page; 786 int r; 787 788 mutex_init(&vcpu->mutex); 789 vcpu->cpu = -1; 790 vcpu->kvm = kvm; 791 vcpu->vcpu_id = id; 792 init_waitqueue_head(&vcpu->wq); 793 794 page = alloc_page(GFP_KERNEL | __GFP_ZERO); 795 if (!page) { 796 r = -ENOMEM; 797 goto fail; 798 } 799 vcpu->run = page_address(page); 800 801 r = kvm_arch_vcpu_init(vcpu); 802 if (r < 0) 803 goto fail_free_run; 804 return 0; 805 806 fail_free_run: 807 free_page((unsigned long)vcpu->run); 808 fail: 809 return r; 810 } 811 EXPORT_SYMBOL_GPL(kvm_vcpu_init); 812 813 void kvm_vcpu_uninit(struct kvm_vcpu *vcpu) 814 { 815 kvm_arch_vcpu_uninit(vcpu); 816 free_page((unsigned long)vcpu->run); 817 } 818 EXPORT_SYMBOL_GPL(kvm_vcpu_uninit); 819 820 #if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER) 821 static inline struct kvm *mmu_notifier_to_kvm(struct mmu_notifier *mn) 822 { 823 return container_of(mn, struct kvm, mmu_notifier); 824 } 825 826 static void kvm_mmu_notifier_invalidate_page(struct mmu_notifier *mn, 827 struct mm_struct *mm, 828 unsigned long address) 829 { 830 struct kvm *kvm = mmu_notifier_to_kvm(mn); 831 int need_tlb_flush; 832 833 /* 834 * When ->invalidate_page runs, the linux pte has been zapped 835 * already but the page is still allocated until 836 * ->invalidate_page returns. So if we increase the sequence 837 * here the kvm page fault will notice if the spte can't be 838 * established because the page is going to be freed. If 839 * instead the kvm page fault establishes the spte before 840 * ->invalidate_page runs, kvm_unmap_hva will release it 841 * before returning. 842 * 843 * The sequence increase only need to be seen at spin_unlock 844 * time, and not at spin_lock time. 845 * 846 * Increasing the sequence after the spin_unlock would be 847 * unsafe because the kvm page fault could then establish the 848 * pte after kvm_unmap_hva returned, without noticing the page 849 * is going to be freed. 850 */ 851 spin_lock(&kvm->mmu_lock); 852 kvm->mmu_notifier_seq++; 853 need_tlb_flush = kvm_unmap_hva(kvm, address); 854 spin_unlock(&kvm->mmu_lock); 855 856 /* we've to flush the tlb before the pages can be freed */ 857 if (need_tlb_flush) 858 kvm_flush_remote_tlbs(kvm); 859 860 } 861 862 static void kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn, 863 struct mm_struct *mm, 864 unsigned long start, 865 unsigned long end) 866 { 867 struct kvm *kvm = mmu_notifier_to_kvm(mn); 868 int need_tlb_flush = 0; 869 870 spin_lock(&kvm->mmu_lock); 871 /* 872 * The count increase must become visible at unlock time as no 873 * spte can be established without taking the mmu_lock and 874 * count is also read inside the mmu_lock critical section. 
875 */ 876 kvm->mmu_notifier_count++; 877 for (; start < end; start += PAGE_SIZE) 878 need_tlb_flush |= kvm_unmap_hva(kvm, start); 879 spin_unlock(&kvm->mmu_lock); 880 881 /* we've to flush the tlb before the pages can be freed */ 882 if (need_tlb_flush) 883 kvm_flush_remote_tlbs(kvm); 884 } 885 886 static void kvm_mmu_notifier_invalidate_range_end(struct mmu_notifier *mn, 887 struct mm_struct *mm, 888 unsigned long start, 889 unsigned long end) 890 { 891 struct kvm *kvm = mmu_notifier_to_kvm(mn); 892 893 spin_lock(&kvm->mmu_lock); 894 /* 895 * This sequence increase will notify the kvm page fault that 896 * the page that is going to be mapped in the spte could have 897 * been freed. 898 */ 899 kvm->mmu_notifier_seq++; 900 /* 901 * The above sequence increase must be visible before the 902 * below count decrease but both values are read by the kvm 903 * page fault under mmu_lock spinlock so we don't need to add 904 * a smb_wmb() here in between the two. 905 */ 906 kvm->mmu_notifier_count--; 907 spin_unlock(&kvm->mmu_lock); 908 909 BUG_ON(kvm->mmu_notifier_count < 0); 910 } 911 912 static int kvm_mmu_notifier_clear_flush_young(struct mmu_notifier *mn, 913 struct mm_struct *mm, 914 unsigned long address) 915 { 916 struct kvm *kvm = mmu_notifier_to_kvm(mn); 917 int young; 918 919 spin_lock(&kvm->mmu_lock); 920 young = kvm_age_hva(kvm, address); 921 spin_unlock(&kvm->mmu_lock); 922 923 if (young) 924 kvm_flush_remote_tlbs(kvm); 925 926 return young; 927 } 928 929 static void kvm_mmu_notifier_release(struct mmu_notifier *mn, 930 struct mm_struct *mm) 931 { 932 struct kvm *kvm = mmu_notifier_to_kvm(mn); 933 kvm_arch_flush_shadow(kvm); 934 } 935 936 static const struct mmu_notifier_ops kvm_mmu_notifier_ops = { 937 .invalidate_page = kvm_mmu_notifier_invalidate_page, 938 .invalidate_range_start = kvm_mmu_notifier_invalidate_range_start, 939 .invalidate_range_end = kvm_mmu_notifier_invalidate_range_end, 940 .clear_flush_young = kvm_mmu_notifier_clear_flush_young, 941 .release = kvm_mmu_notifier_release, 942 }; 943 #endif /* CONFIG_MMU_NOTIFIER && KVM_ARCH_WANT_MMU_NOTIFIER */ 944 945 static struct kvm *kvm_create_vm(void) 946 { 947 struct kvm *kvm = kvm_arch_create_vm(); 948 #ifdef KVM_COALESCED_MMIO_PAGE_OFFSET 949 struct page *page; 950 #endif 951 952 if (IS_ERR(kvm)) 953 goto out; 954 #ifdef CONFIG_HAVE_KVM_IRQCHIP 955 INIT_LIST_HEAD(&kvm->irq_routing); 956 INIT_HLIST_HEAD(&kvm->mask_notifier_list); 957 #endif 958 959 #ifdef KVM_COALESCED_MMIO_PAGE_OFFSET 960 page = alloc_page(GFP_KERNEL | __GFP_ZERO); 961 if (!page) { 962 kfree(kvm); 963 return ERR_PTR(-ENOMEM); 964 } 965 kvm->coalesced_mmio_ring = 966 (struct kvm_coalesced_mmio_ring *)page_address(page); 967 #endif 968 969 #if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER) 970 { 971 int err; 972 kvm->mmu_notifier.ops = &kvm_mmu_notifier_ops; 973 err = mmu_notifier_register(&kvm->mmu_notifier, current->mm); 974 if (err) { 975 #ifdef KVM_COALESCED_MMIO_PAGE_OFFSET 976 put_page(page); 977 #endif 978 kfree(kvm); 979 return ERR_PTR(err); 980 } 981 } 982 #endif 983 984 kvm->mm = current->mm; 985 atomic_inc(&kvm->mm->mm_count); 986 spin_lock_init(&kvm->mmu_lock); 987 spin_lock_init(&kvm->requests_lock); 988 kvm_io_bus_init(&kvm->pio_bus); 989 mutex_init(&kvm->lock); 990 kvm_io_bus_init(&kvm->mmio_bus); 991 init_rwsem(&kvm->slots_lock); 992 atomic_set(&kvm->users_count, 1); 993 spin_lock(&kvm_lock); 994 list_add(&kvm->vm_list, &vm_list); 995 spin_unlock(&kvm_lock); 996 #ifdef KVM_COALESCED_MMIO_PAGE_OFFSET 997 
kvm_coalesced_mmio_init(kvm); 998 #endif 999 out: 1000 return kvm; 1001 } 1002 1003 /* 1004 * Free any memory in @free but not in @dont. 1005 */ 1006 static void kvm_free_physmem_slot(struct kvm_memory_slot *free, 1007 struct kvm_memory_slot *dont) 1008 { 1009 if (!dont || free->rmap != dont->rmap) 1010 vfree(free->rmap); 1011 1012 if (!dont || free->dirty_bitmap != dont->dirty_bitmap) 1013 vfree(free->dirty_bitmap); 1014 1015 if (!dont || free->lpage_info != dont->lpage_info) 1016 vfree(free->lpage_info); 1017 1018 free->npages = 0; 1019 free->dirty_bitmap = NULL; 1020 free->rmap = NULL; 1021 free->lpage_info = NULL; 1022 } 1023 1024 void kvm_free_physmem(struct kvm *kvm) 1025 { 1026 int i; 1027 1028 for (i = 0; i < kvm->nmemslots; ++i) 1029 kvm_free_physmem_slot(&kvm->memslots[i], NULL); 1030 } 1031 1032 static void kvm_destroy_vm(struct kvm *kvm) 1033 { 1034 struct mm_struct *mm = kvm->mm; 1035 1036 kvm_arch_sync_events(kvm); 1037 spin_lock(&kvm_lock); 1038 list_del(&kvm->vm_list); 1039 spin_unlock(&kvm_lock); 1040 kvm_free_irq_routing(kvm); 1041 kvm_io_bus_destroy(&kvm->pio_bus); 1042 kvm_io_bus_destroy(&kvm->mmio_bus); 1043 #ifdef KVM_COALESCED_MMIO_PAGE_OFFSET 1044 if (kvm->coalesced_mmio_ring != NULL) 1045 free_page((unsigned long)kvm->coalesced_mmio_ring); 1046 #endif 1047 #if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER) 1048 mmu_notifier_unregister(&kvm->mmu_notifier, kvm->mm); 1049 #else 1050 kvm_arch_flush_shadow(kvm); 1051 #endif 1052 kvm_arch_destroy_vm(kvm); 1053 mmdrop(mm); 1054 } 1055 1056 void kvm_get_kvm(struct kvm *kvm) 1057 { 1058 atomic_inc(&kvm->users_count); 1059 } 1060 EXPORT_SYMBOL_GPL(kvm_get_kvm); 1061 1062 void kvm_put_kvm(struct kvm *kvm) 1063 { 1064 if (atomic_dec_and_test(&kvm->users_count)) 1065 kvm_destroy_vm(kvm); 1066 } 1067 EXPORT_SYMBOL_GPL(kvm_put_kvm); 1068 1069 1070 static int kvm_vm_release(struct inode *inode, struct file *filp) 1071 { 1072 struct kvm *kvm = filp->private_data; 1073 1074 kvm_put_kvm(kvm); 1075 return 0; 1076 } 1077 1078 /* 1079 * Allocate some memory and give it an address in the guest physical address 1080 * space. 1081 * 1082 * Discontiguous memory is allowed, mostly for framebuffers. 1083 * 1084 * Must be called holding mmap_sem for write. 1085 */ 1086 int __kvm_set_memory_region(struct kvm *kvm, 1087 struct kvm_userspace_memory_region *mem, 1088 int user_alloc) 1089 { 1090 int r; 1091 gfn_t base_gfn; 1092 unsigned long npages, ugfn; 1093 unsigned long largepages, i; 1094 struct kvm_memory_slot *memslot; 1095 struct kvm_memory_slot old, new; 1096 1097 r = -EINVAL; 1098 /* General sanity checks */ 1099 if (mem->memory_size & (PAGE_SIZE - 1)) 1100 goto out; 1101 if (mem->guest_phys_addr & (PAGE_SIZE - 1)) 1102 goto out; 1103 if (user_alloc && (mem->userspace_addr & (PAGE_SIZE - 1))) 1104 goto out; 1105 if (mem->slot >= KVM_MEMORY_SLOTS + KVM_PRIVATE_MEM_SLOTS) 1106 goto out; 1107 if (mem->guest_phys_addr + mem->memory_size < mem->guest_phys_addr) 1108 goto out; 1109 1110 memslot = &kvm->memslots[mem->slot]; 1111 base_gfn = mem->guest_phys_addr >> PAGE_SHIFT; 1112 npages = mem->memory_size >> PAGE_SHIFT; 1113 1114 if (!npages) 1115 mem->flags &= ~KVM_MEM_LOG_DIRTY_PAGES; 1116 1117 new = old = *memslot; 1118 1119 new.base_gfn = base_gfn; 1120 new.npages = npages; 1121 new.flags = mem->flags; 1122 1123 /* Disallow changing a memory slot's size. 
*/ 1124 r = -EINVAL; 1125 if (npages && old.npages && npages != old.npages) 1126 goto out_free; 1127 1128 /* Check for overlaps */ 1129 r = -EEXIST; 1130 for (i = 0; i < KVM_MEMORY_SLOTS; ++i) { 1131 struct kvm_memory_slot *s = &kvm->memslots[i]; 1132 1133 if (s == memslot || !s->npages) 1134 continue; 1135 if (!((base_gfn + npages <= s->base_gfn) || 1136 (base_gfn >= s->base_gfn + s->npages))) 1137 goto out_free; 1138 } 1139 1140 /* Free page dirty bitmap if unneeded */ 1141 if (!(new.flags & KVM_MEM_LOG_DIRTY_PAGES)) 1142 new.dirty_bitmap = NULL; 1143 1144 r = -ENOMEM; 1145 1146 /* Allocate if a slot is being created */ 1147 #ifndef CONFIG_S390 1148 if (npages && !new.rmap) { 1149 new.rmap = vmalloc(npages * sizeof(struct page *)); 1150 1151 if (!new.rmap) 1152 goto out_free; 1153 1154 memset(new.rmap, 0, npages * sizeof(*new.rmap)); 1155 1156 new.user_alloc = user_alloc; 1157 /* 1158 * hva_to_rmmap() serialzies with the mmu_lock and to be 1159 * safe it has to ignore memslots with !user_alloc && 1160 * !userspace_addr. 1161 */ 1162 if (user_alloc) 1163 new.userspace_addr = mem->userspace_addr; 1164 else 1165 new.userspace_addr = 0; 1166 } 1167 if (npages && !new.lpage_info) { 1168 largepages = 1 + (base_gfn + npages - 1) / KVM_PAGES_PER_HPAGE; 1169 largepages -= base_gfn / KVM_PAGES_PER_HPAGE; 1170 1171 new.lpage_info = vmalloc(largepages * sizeof(*new.lpage_info)); 1172 1173 if (!new.lpage_info) 1174 goto out_free; 1175 1176 memset(new.lpage_info, 0, largepages * sizeof(*new.lpage_info)); 1177 1178 if (base_gfn % KVM_PAGES_PER_HPAGE) 1179 new.lpage_info[0].write_count = 1; 1180 if ((base_gfn+npages) % KVM_PAGES_PER_HPAGE) 1181 new.lpage_info[largepages-1].write_count = 1; 1182 ugfn = new.userspace_addr >> PAGE_SHIFT; 1183 /* 1184 * If the gfn and userspace address are not aligned wrt each 1185 * other, disable large page support for this slot 1186 */ 1187 if ((base_gfn ^ ugfn) & (KVM_PAGES_PER_HPAGE - 1)) 1188 for (i = 0; i < largepages; ++i) 1189 new.lpage_info[i].write_count = 1; 1190 } 1191 1192 /* Allocate page dirty bitmap if needed */ 1193 if ((new.flags & KVM_MEM_LOG_DIRTY_PAGES) && !new.dirty_bitmap) { 1194 unsigned dirty_bytes = ALIGN(npages, BITS_PER_LONG) / 8; 1195 1196 new.dirty_bitmap = vmalloc(dirty_bytes); 1197 if (!new.dirty_bitmap) 1198 goto out_free; 1199 memset(new.dirty_bitmap, 0, dirty_bytes); 1200 if (old.npages) 1201 kvm_arch_flush_shadow(kvm); 1202 } 1203 #endif /* not defined CONFIG_S390 */ 1204 1205 if (!npages) 1206 kvm_arch_flush_shadow(kvm); 1207 1208 spin_lock(&kvm->mmu_lock); 1209 if (mem->slot >= kvm->nmemslots) 1210 kvm->nmemslots = mem->slot + 1; 1211 1212 *memslot = new; 1213 spin_unlock(&kvm->mmu_lock); 1214 1215 r = kvm_arch_set_memory_region(kvm, mem, old, user_alloc); 1216 if (r) { 1217 spin_lock(&kvm->mmu_lock); 1218 *memslot = old; 1219 spin_unlock(&kvm->mmu_lock); 1220 goto out_free; 1221 } 1222 1223 kvm_free_physmem_slot(&old, npages ? 
&new : NULL); 1224 /* Slot deletion case: we have to update the current slot */ 1225 spin_lock(&kvm->mmu_lock); 1226 if (!npages) 1227 *memslot = old; 1228 spin_unlock(&kvm->mmu_lock); 1229 #ifdef CONFIG_DMAR 1230 /* map the pages in iommu page table */ 1231 r = kvm_iommu_map_pages(kvm, base_gfn, npages); 1232 if (r) 1233 goto out; 1234 #endif 1235 return 0; 1236 1237 out_free: 1238 kvm_free_physmem_slot(&new, &old); 1239 out: 1240 return r; 1241 1242 } 1243 EXPORT_SYMBOL_GPL(__kvm_set_memory_region); 1244 1245 int kvm_set_memory_region(struct kvm *kvm, 1246 struct kvm_userspace_memory_region *mem, 1247 int user_alloc) 1248 { 1249 int r; 1250 1251 down_write(&kvm->slots_lock); 1252 r = __kvm_set_memory_region(kvm, mem, user_alloc); 1253 up_write(&kvm->slots_lock); 1254 return r; 1255 } 1256 EXPORT_SYMBOL_GPL(kvm_set_memory_region); 1257 1258 int kvm_vm_ioctl_set_memory_region(struct kvm *kvm, 1259 struct 1260 kvm_userspace_memory_region *mem, 1261 int user_alloc) 1262 { 1263 if (mem->slot >= KVM_MEMORY_SLOTS) 1264 return -EINVAL; 1265 return kvm_set_memory_region(kvm, mem, user_alloc); 1266 } 1267 1268 int kvm_get_dirty_log(struct kvm *kvm, 1269 struct kvm_dirty_log *log, int *is_dirty) 1270 { 1271 struct kvm_memory_slot *memslot; 1272 int r, i; 1273 int n; 1274 unsigned long any = 0; 1275 1276 r = -EINVAL; 1277 if (log->slot >= KVM_MEMORY_SLOTS) 1278 goto out; 1279 1280 memslot = &kvm->memslots[log->slot]; 1281 r = -ENOENT; 1282 if (!memslot->dirty_bitmap) 1283 goto out; 1284 1285 n = ALIGN(memslot->npages, BITS_PER_LONG) / 8; 1286 1287 for (i = 0; !any && i < n/sizeof(long); ++i) 1288 any = memslot->dirty_bitmap[i]; 1289 1290 r = -EFAULT; 1291 if (copy_to_user(log->dirty_bitmap, memslot->dirty_bitmap, n)) 1292 goto out; 1293 1294 if (any) 1295 *is_dirty = 1; 1296 1297 r = 0; 1298 out: 1299 return r; 1300 } 1301 1302 int is_error_page(struct page *page) 1303 { 1304 return page == bad_page; 1305 } 1306 EXPORT_SYMBOL_GPL(is_error_page); 1307 1308 int is_error_pfn(pfn_t pfn) 1309 { 1310 return pfn == bad_pfn; 1311 } 1312 EXPORT_SYMBOL_GPL(is_error_pfn); 1313 1314 static inline unsigned long bad_hva(void) 1315 { 1316 return PAGE_OFFSET; 1317 } 1318 1319 int kvm_is_error_hva(unsigned long addr) 1320 { 1321 return addr == bad_hva(); 1322 } 1323 EXPORT_SYMBOL_GPL(kvm_is_error_hva); 1324 1325 struct kvm_memory_slot *gfn_to_memslot_unaliased(struct kvm *kvm, gfn_t gfn) 1326 { 1327 int i; 1328 1329 for (i = 0; i < kvm->nmemslots; ++i) { 1330 struct kvm_memory_slot *memslot = &kvm->memslots[i]; 1331 1332 if (gfn >= memslot->base_gfn 1333 && gfn < memslot->base_gfn + memslot->npages) 1334 return memslot; 1335 } 1336 return NULL; 1337 } 1338 EXPORT_SYMBOL_GPL(gfn_to_memslot_unaliased); 1339 1340 struct kvm_memory_slot *gfn_to_memslot(struct kvm *kvm, gfn_t gfn) 1341 { 1342 gfn = unalias_gfn(kvm, gfn); 1343 return gfn_to_memslot_unaliased(kvm, gfn); 1344 } 1345 1346 int kvm_is_visible_gfn(struct kvm *kvm, gfn_t gfn) 1347 { 1348 int i; 1349 1350 gfn = unalias_gfn(kvm, gfn); 1351 for (i = 0; i < KVM_MEMORY_SLOTS; ++i) { 1352 struct kvm_memory_slot *memslot = &kvm->memslots[i]; 1353 1354 if (gfn >= memslot->base_gfn 1355 && gfn < memslot->base_gfn + memslot->npages) 1356 return 1; 1357 } 1358 return 0; 1359 } 1360 EXPORT_SYMBOL_GPL(kvm_is_visible_gfn); 1361 1362 unsigned long gfn_to_hva(struct kvm *kvm, gfn_t gfn) 1363 { 1364 struct kvm_memory_slot *slot; 1365 1366 gfn = unalias_gfn(kvm, gfn); 1367 slot = gfn_to_memslot_unaliased(kvm, gfn); 1368 if (!slot) 1369 return bad_hva(); 1370 return 
(slot->userspace_addr + (gfn - slot->base_gfn) * PAGE_SIZE); 1371 } 1372 EXPORT_SYMBOL_GPL(gfn_to_hva); 1373 1374 pfn_t gfn_to_pfn(struct kvm *kvm, gfn_t gfn) 1375 { 1376 struct page *page[1]; 1377 unsigned long addr; 1378 int npages; 1379 pfn_t pfn; 1380 1381 might_sleep(); 1382 1383 addr = gfn_to_hva(kvm, gfn); 1384 if (kvm_is_error_hva(addr)) { 1385 get_page(bad_page); 1386 return page_to_pfn(bad_page); 1387 } 1388 1389 npages = get_user_pages_fast(addr, 1, 1, page); 1390 1391 if (unlikely(npages != 1)) { 1392 struct vm_area_struct *vma; 1393 1394 down_read(¤t->mm->mmap_sem); 1395 vma = find_vma(current->mm, addr); 1396 1397 if (vma == NULL || addr < vma->vm_start || 1398 !(vma->vm_flags & VM_PFNMAP)) { 1399 up_read(¤t->mm->mmap_sem); 1400 get_page(bad_page); 1401 return page_to_pfn(bad_page); 1402 } 1403 1404 pfn = ((addr - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff; 1405 up_read(¤t->mm->mmap_sem); 1406 BUG_ON(!kvm_is_mmio_pfn(pfn)); 1407 } else 1408 pfn = page_to_pfn(page[0]); 1409 1410 return pfn; 1411 } 1412 1413 EXPORT_SYMBOL_GPL(gfn_to_pfn); 1414 1415 struct page *gfn_to_page(struct kvm *kvm, gfn_t gfn) 1416 { 1417 pfn_t pfn; 1418 1419 pfn = gfn_to_pfn(kvm, gfn); 1420 if (!kvm_is_mmio_pfn(pfn)) 1421 return pfn_to_page(pfn); 1422 1423 WARN_ON(kvm_is_mmio_pfn(pfn)); 1424 1425 get_page(bad_page); 1426 return bad_page; 1427 } 1428 1429 EXPORT_SYMBOL_GPL(gfn_to_page); 1430 1431 void kvm_release_page_clean(struct page *page) 1432 { 1433 kvm_release_pfn_clean(page_to_pfn(page)); 1434 } 1435 EXPORT_SYMBOL_GPL(kvm_release_page_clean); 1436 1437 void kvm_release_pfn_clean(pfn_t pfn) 1438 { 1439 if (!kvm_is_mmio_pfn(pfn)) 1440 put_page(pfn_to_page(pfn)); 1441 } 1442 EXPORT_SYMBOL_GPL(kvm_release_pfn_clean); 1443 1444 void kvm_release_page_dirty(struct page *page) 1445 { 1446 kvm_release_pfn_dirty(page_to_pfn(page)); 1447 } 1448 EXPORT_SYMBOL_GPL(kvm_release_page_dirty); 1449 1450 void kvm_release_pfn_dirty(pfn_t pfn) 1451 { 1452 kvm_set_pfn_dirty(pfn); 1453 kvm_release_pfn_clean(pfn); 1454 } 1455 EXPORT_SYMBOL_GPL(kvm_release_pfn_dirty); 1456 1457 void kvm_set_page_dirty(struct page *page) 1458 { 1459 kvm_set_pfn_dirty(page_to_pfn(page)); 1460 } 1461 EXPORT_SYMBOL_GPL(kvm_set_page_dirty); 1462 1463 void kvm_set_pfn_dirty(pfn_t pfn) 1464 { 1465 if (!kvm_is_mmio_pfn(pfn)) { 1466 struct page *page = pfn_to_page(pfn); 1467 if (!PageReserved(page)) 1468 SetPageDirty(page); 1469 } 1470 } 1471 EXPORT_SYMBOL_GPL(kvm_set_pfn_dirty); 1472 1473 void kvm_set_pfn_accessed(pfn_t pfn) 1474 { 1475 if (!kvm_is_mmio_pfn(pfn)) 1476 mark_page_accessed(pfn_to_page(pfn)); 1477 } 1478 EXPORT_SYMBOL_GPL(kvm_set_pfn_accessed); 1479 1480 void kvm_get_pfn(pfn_t pfn) 1481 { 1482 if (!kvm_is_mmio_pfn(pfn)) 1483 get_page(pfn_to_page(pfn)); 1484 } 1485 EXPORT_SYMBOL_GPL(kvm_get_pfn); 1486 1487 static int next_segment(unsigned long len, int offset) 1488 { 1489 if (len > PAGE_SIZE - offset) 1490 return PAGE_SIZE - offset; 1491 else 1492 return len; 1493 } 1494 1495 int kvm_read_guest_page(struct kvm *kvm, gfn_t gfn, void *data, int offset, 1496 int len) 1497 { 1498 int r; 1499 unsigned long addr; 1500 1501 addr = gfn_to_hva(kvm, gfn); 1502 if (kvm_is_error_hva(addr)) 1503 return -EFAULT; 1504 r = copy_from_user(data, (void __user *)addr + offset, len); 1505 if (r) 1506 return -EFAULT; 1507 return 0; 1508 } 1509 EXPORT_SYMBOL_GPL(kvm_read_guest_page); 1510 1511 int kvm_read_guest(struct kvm *kvm, gpa_t gpa, void *data, unsigned long len) 1512 { 1513 gfn_t gfn = gpa >> PAGE_SHIFT; 1514 int seg; 1515 int offset = 
offset_in_page(gpa); 1516 int ret; 1517 1518 while ((seg = next_segment(len, offset)) != 0) { 1519 ret = kvm_read_guest_page(kvm, gfn, data, offset, seg); 1520 if (ret < 0) 1521 return ret; 1522 offset = 0; 1523 len -= seg; 1524 data += seg; 1525 ++gfn; 1526 } 1527 return 0; 1528 } 1529 EXPORT_SYMBOL_GPL(kvm_read_guest); 1530 1531 int kvm_read_guest_atomic(struct kvm *kvm, gpa_t gpa, void *data, 1532 unsigned long len) 1533 { 1534 int r; 1535 unsigned long addr; 1536 gfn_t gfn = gpa >> PAGE_SHIFT; 1537 int offset = offset_in_page(gpa); 1538 1539 addr = gfn_to_hva(kvm, gfn); 1540 if (kvm_is_error_hva(addr)) 1541 return -EFAULT; 1542 pagefault_disable(); 1543 r = __copy_from_user_inatomic(data, (void __user *)addr + offset, len); 1544 pagefault_enable(); 1545 if (r) 1546 return -EFAULT; 1547 return 0; 1548 } 1549 EXPORT_SYMBOL(kvm_read_guest_atomic); 1550 1551 int kvm_write_guest_page(struct kvm *kvm, gfn_t gfn, const void *data, 1552 int offset, int len) 1553 { 1554 int r; 1555 unsigned long addr; 1556 1557 addr = gfn_to_hva(kvm, gfn); 1558 if (kvm_is_error_hva(addr)) 1559 return -EFAULT; 1560 r = copy_to_user((void __user *)addr + offset, data, len); 1561 if (r) 1562 return -EFAULT; 1563 mark_page_dirty(kvm, gfn); 1564 return 0; 1565 } 1566 EXPORT_SYMBOL_GPL(kvm_write_guest_page); 1567 1568 int kvm_write_guest(struct kvm *kvm, gpa_t gpa, const void *data, 1569 unsigned long len) 1570 { 1571 gfn_t gfn = gpa >> PAGE_SHIFT; 1572 int seg; 1573 int offset = offset_in_page(gpa); 1574 int ret; 1575 1576 while ((seg = next_segment(len, offset)) != 0) { 1577 ret = kvm_write_guest_page(kvm, gfn, data, offset, seg); 1578 if (ret < 0) 1579 return ret; 1580 offset = 0; 1581 len -= seg; 1582 data += seg; 1583 ++gfn; 1584 } 1585 return 0; 1586 } 1587 1588 int kvm_clear_guest_page(struct kvm *kvm, gfn_t gfn, int offset, int len) 1589 { 1590 return kvm_write_guest_page(kvm, gfn, empty_zero_page, offset, len); 1591 } 1592 EXPORT_SYMBOL_GPL(kvm_clear_guest_page); 1593 1594 int kvm_clear_guest(struct kvm *kvm, gpa_t gpa, unsigned long len) 1595 { 1596 gfn_t gfn = gpa >> PAGE_SHIFT; 1597 int seg; 1598 int offset = offset_in_page(gpa); 1599 int ret; 1600 1601 while ((seg = next_segment(len, offset)) != 0) { 1602 ret = kvm_clear_guest_page(kvm, gfn, offset, seg); 1603 if (ret < 0) 1604 return ret; 1605 offset = 0; 1606 len -= seg; 1607 ++gfn; 1608 } 1609 return 0; 1610 } 1611 EXPORT_SYMBOL_GPL(kvm_clear_guest); 1612 1613 void mark_page_dirty(struct kvm *kvm, gfn_t gfn) 1614 { 1615 struct kvm_memory_slot *memslot; 1616 1617 gfn = unalias_gfn(kvm, gfn); 1618 memslot = gfn_to_memslot_unaliased(kvm, gfn); 1619 if (memslot && memslot->dirty_bitmap) { 1620 unsigned long rel_gfn = gfn - memslot->base_gfn; 1621 1622 /* avoid RMW */ 1623 if (!test_bit(rel_gfn, memslot->dirty_bitmap)) 1624 set_bit(rel_gfn, memslot->dirty_bitmap); 1625 } 1626 } 1627 1628 /* 1629 * The vCPU has executed a HLT instruction with in-kernel mode enabled. 
1630 */ 1631 void kvm_vcpu_block(struct kvm_vcpu *vcpu) 1632 { 1633 DEFINE_WAIT(wait); 1634 1635 for (;;) { 1636 prepare_to_wait(&vcpu->wq, &wait, TASK_INTERRUPTIBLE); 1637 1638 if ((kvm_arch_interrupt_allowed(vcpu) && 1639 kvm_cpu_has_interrupt(vcpu)) || 1640 kvm_arch_vcpu_runnable(vcpu)) { 1641 set_bit(KVM_REQ_UNHALT, &vcpu->requests); 1642 break; 1643 } 1644 if (kvm_cpu_has_pending_timer(vcpu)) 1645 break; 1646 if (signal_pending(current)) 1647 break; 1648 1649 vcpu_put(vcpu); 1650 schedule(); 1651 vcpu_load(vcpu); 1652 } 1653 1654 finish_wait(&vcpu->wq, &wait); 1655 } 1656 1657 void kvm_resched(struct kvm_vcpu *vcpu) 1658 { 1659 if (!need_resched()) 1660 return; 1661 cond_resched(); 1662 } 1663 EXPORT_SYMBOL_GPL(kvm_resched); 1664 1665 static int kvm_vcpu_fault(struct vm_area_struct *vma, struct vm_fault *vmf) 1666 { 1667 struct kvm_vcpu *vcpu = vma->vm_file->private_data; 1668 struct page *page; 1669 1670 if (vmf->pgoff == 0) 1671 page = virt_to_page(vcpu->run); 1672 #ifdef CONFIG_X86 1673 else if (vmf->pgoff == KVM_PIO_PAGE_OFFSET) 1674 page = virt_to_page(vcpu->arch.pio_data); 1675 #endif 1676 #ifdef KVM_COALESCED_MMIO_PAGE_OFFSET 1677 else if (vmf->pgoff == KVM_COALESCED_MMIO_PAGE_OFFSET) 1678 page = virt_to_page(vcpu->kvm->coalesced_mmio_ring); 1679 #endif 1680 else 1681 return VM_FAULT_SIGBUS; 1682 get_page(page); 1683 vmf->page = page; 1684 return 0; 1685 } 1686 1687 static struct vm_operations_struct kvm_vcpu_vm_ops = { 1688 .fault = kvm_vcpu_fault, 1689 }; 1690 1691 static int kvm_vcpu_mmap(struct file *file, struct vm_area_struct *vma) 1692 { 1693 vma->vm_ops = &kvm_vcpu_vm_ops; 1694 return 0; 1695 } 1696 1697 static int kvm_vcpu_release(struct inode *inode, struct file *filp) 1698 { 1699 struct kvm_vcpu *vcpu = filp->private_data; 1700 1701 kvm_put_kvm(vcpu->kvm); 1702 return 0; 1703 } 1704 1705 static struct file_operations kvm_vcpu_fops = { 1706 .release = kvm_vcpu_release, 1707 .unlocked_ioctl = kvm_vcpu_ioctl, 1708 .compat_ioctl = kvm_vcpu_ioctl, 1709 .mmap = kvm_vcpu_mmap, 1710 }; 1711 1712 /* 1713 * Allocates an inode for the vcpu. 1714 */ 1715 static int create_vcpu_fd(struct kvm_vcpu *vcpu) 1716 { 1717 int fd = anon_inode_getfd("kvm-vcpu", &kvm_vcpu_fops, vcpu, 0); 1718 if (fd < 0) 1719 kvm_put_kvm(vcpu->kvm); 1720 return fd; 1721 } 1722 1723 /* 1724 * Creates some virtual cpus. Good luck creating more than one. 
1725 */ 1726 static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, int n) 1727 { 1728 int r; 1729 struct kvm_vcpu *vcpu; 1730 1731 if (!valid_vcpu(n)) 1732 return -EINVAL; 1733 1734 vcpu = kvm_arch_vcpu_create(kvm, n); 1735 if (IS_ERR(vcpu)) 1736 return PTR_ERR(vcpu); 1737 1738 preempt_notifier_init(&vcpu->preempt_notifier, &kvm_preempt_ops); 1739 1740 r = kvm_arch_vcpu_setup(vcpu); 1741 if (r) 1742 return r; 1743 1744 mutex_lock(&kvm->lock); 1745 if (kvm->vcpus[n]) { 1746 r = -EEXIST; 1747 goto vcpu_destroy; 1748 } 1749 kvm->vcpus[n] = vcpu; 1750 mutex_unlock(&kvm->lock); 1751 1752 /* Now it's all set up, let userspace reach it */ 1753 kvm_get_kvm(kvm); 1754 r = create_vcpu_fd(vcpu); 1755 if (r < 0) 1756 goto unlink; 1757 return r; 1758 1759 unlink: 1760 mutex_lock(&kvm->lock); 1761 kvm->vcpus[n] = NULL; 1762 vcpu_destroy: 1763 mutex_unlock(&kvm->lock); 1764 kvm_arch_vcpu_destroy(vcpu); 1765 return r; 1766 } 1767 1768 static int kvm_vcpu_ioctl_set_sigmask(struct kvm_vcpu *vcpu, sigset_t *sigset) 1769 { 1770 if (sigset) { 1771 sigdelsetmask(sigset, sigmask(SIGKILL)|sigmask(SIGSTOP)); 1772 vcpu->sigset_active = 1; 1773 vcpu->sigset = *sigset; 1774 } else 1775 vcpu->sigset_active = 0; 1776 return 0; 1777 } 1778 1779 #ifdef __KVM_HAVE_MSIX 1780 static int kvm_vm_ioctl_set_msix_nr(struct kvm *kvm, 1781 struct kvm_assigned_msix_nr *entry_nr) 1782 { 1783 int r = 0; 1784 struct kvm_assigned_dev_kernel *adev; 1785 1786 mutex_lock(&kvm->lock); 1787 1788 adev = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head, 1789 entry_nr->assigned_dev_id); 1790 if (!adev) { 1791 r = -EINVAL; 1792 goto msix_nr_out; 1793 } 1794 1795 if (adev->entries_nr == 0) { 1796 adev->entries_nr = entry_nr->entry_nr; 1797 if (adev->entries_nr == 0 || 1798 adev->entries_nr >= KVM_MAX_MSIX_PER_DEV) { 1799 r = -EINVAL; 1800 goto msix_nr_out; 1801 } 1802 1803 adev->host_msix_entries = kzalloc(sizeof(struct msix_entry) * 1804 entry_nr->entry_nr, 1805 GFP_KERNEL); 1806 if (!adev->host_msix_entries) { 1807 r = -ENOMEM; 1808 goto msix_nr_out; 1809 } 1810 adev->guest_msix_entries = kzalloc( 1811 sizeof(struct kvm_guest_msix_entry) * 1812 entry_nr->entry_nr, GFP_KERNEL); 1813 if (!adev->guest_msix_entries) { 1814 kfree(adev->host_msix_entries); 1815 r = -ENOMEM; 1816 goto msix_nr_out; 1817 } 1818 } else /* Not allowed set MSI-X number twice */ 1819 r = -EINVAL; 1820 msix_nr_out: 1821 mutex_unlock(&kvm->lock); 1822 return r; 1823 } 1824 1825 static int kvm_vm_ioctl_set_msix_entry(struct kvm *kvm, 1826 struct kvm_assigned_msix_entry *entry) 1827 { 1828 int r = 0, i; 1829 struct kvm_assigned_dev_kernel *adev; 1830 1831 mutex_lock(&kvm->lock); 1832 1833 adev = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head, 1834 entry->assigned_dev_id); 1835 1836 if (!adev) { 1837 r = -EINVAL; 1838 goto msix_entry_out; 1839 } 1840 1841 for (i = 0; i < adev->entries_nr; i++) 1842 if (adev->guest_msix_entries[i].vector == 0 || 1843 adev->guest_msix_entries[i].entry == entry->entry) { 1844 adev->guest_msix_entries[i].entry = entry->entry; 1845 adev->guest_msix_entries[i].vector = entry->gsi; 1846 adev->host_msix_entries[i].entry = entry->entry; 1847 break; 1848 } 1849 if (i == adev->entries_nr) { 1850 r = -ENOSPC; 1851 goto msix_entry_out; 1852 } 1853 1854 msix_entry_out: 1855 mutex_unlock(&kvm->lock); 1856 1857 return r; 1858 } 1859 #endif 1860 1861 static long kvm_vcpu_ioctl(struct file *filp, 1862 unsigned int ioctl, unsigned long arg) 1863 { 1864 struct kvm_vcpu *vcpu = filp->private_data; 1865 void __user *argp = (void __user *)arg; 1866 int r; 1867 
struct kvm_fpu *fpu = NULL; 1868 struct kvm_sregs *kvm_sregs = NULL; 1869 1870 if (vcpu->kvm->mm != current->mm) 1871 return -EIO; 1872 switch (ioctl) { 1873 case KVM_RUN: 1874 r = -EINVAL; 1875 if (arg) 1876 goto out; 1877 r = kvm_arch_vcpu_ioctl_run(vcpu, vcpu->run); 1878 break; 1879 case KVM_GET_REGS: { 1880 struct kvm_regs *kvm_regs; 1881 1882 r = -ENOMEM; 1883 kvm_regs = kzalloc(sizeof(struct kvm_regs), GFP_KERNEL); 1884 if (!kvm_regs) 1885 goto out; 1886 r = kvm_arch_vcpu_ioctl_get_regs(vcpu, kvm_regs); 1887 if (r) 1888 goto out_free1; 1889 r = -EFAULT; 1890 if (copy_to_user(argp, kvm_regs, sizeof(struct kvm_regs))) 1891 goto out_free1; 1892 r = 0; 1893 out_free1: 1894 kfree(kvm_regs); 1895 break; 1896 } 1897 case KVM_SET_REGS: { 1898 struct kvm_regs *kvm_regs; 1899 1900 r = -ENOMEM; 1901 kvm_regs = kzalloc(sizeof(struct kvm_regs), GFP_KERNEL); 1902 if (!kvm_regs) 1903 goto out; 1904 r = -EFAULT; 1905 if (copy_from_user(kvm_regs, argp, sizeof(struct kvm_regs))) 1906 goto out_free2; 1907 r = kvm_arch_vcpu_ioctl_set_regs(vcpu, kvm_regs); 1908 if (r) 1909 goto out_free2; 1910 r = 0; 1911 out_free2: 1912 kfree(kvm_regs); 1913 break; 1914 } 1915 case KVM_GET_SREGS: { 1916 kvm_sregs = kzalloc(sizeof(struct kvm_sregs), GFP_KERNEL); 1917 r = -ENOMEM; 1918 if (!kvm_sregs) 1919 goto out; 1920 r = kvm_arch_vcpu_ioctl_get_sregs(vcpu, kvm_sregs); 1921 if (r) 1922 goto out; 1923 r = -EFAULT; 1924 if (copy_to_user(argp, kvm_sregs, sizeof(struct kvm_sregs))) 1925 goto out; 1926 r = 0; 1927 break; 1928 } 1929 case KVM_SET_SREGS: { 1930 kvm_sregs = kmalloc(sizeof(struct kvm_sregs), GFP_KERNEL); 1931 r = -ENOMEM; 1932 if (!kvm_sregs) 1933 goto out; 1934 r = -EFAULT; 1935 if (copy_from_user(kvm_sregs, argp, sizeof(struct kvm_sregs))) 1936 goto out; 1937 r = kvm_arch_vcpu_ioctl_set_sregs(vcpu, kvm_sregs); 1938 if (r) 1939 goto out; 1940 r = 0; 1941 break; 1942 } 1943 case KVM_GET_MP_STATE: { 1944 struct kvm_mp_state mp_state; 1945 1946 r = kvm_arch_vcpu_ioctl_get_mpstate(vcpu, &mp_state); 1947 if (r) 1948 goto out; 1949 r = -EFAULT; 1950 if (copy_to_user(argp, &mp_state, sizeof mp_state)) 1951 goto out; 1952 r = 0; 1953 break; 1954 } 1955 case KVM_SET_MP_STATE: { 1956 struct kvm_mp_state mp_state; 1957 1958 r = -EFAULT; 1959 if (copy_from_user(&mp_state, argp, sizeof mp_state)) 1960 goto out; 1961 r = kvm_arch_vcpu_ioctl_set_mpstate(vcpu, &mp_state); 1962 if (r) 1963 goto out; 1964 r = 0; 1965 break; 1966 } 1967 case KVM_TRANSLATE: { 1968 struct kvm_translation tr; 1969 1970 r = -EFAULT; 1971 if (copy_from_user(&tr, argp, sizeof tr)) 1972 goto out; 1973 r = kvm_arch_vcpu_ioctl_translate(vcpu, &tr); 1974 if (r) 1975 goto out; 1976 r = -EFAULT; 1977 if (copy_to_user(argp, &tr, sizeof tr)) 1978 goto out; 1979 r = 0; 1980 break; 1981 } 1982 case KVM_SET_GUEST_DEBUG: { 1983 struct kvm_guest_debug dbg; 1984 1985 r = -EFAULT; 1986 if (copy_from_user(&dbg, argp, sizeof dbg)) 1987 goto out; 1988 r = kvm_arch_vcpu_ioctl_set_guest_debug(vcpu, &dbg); 1989 if (r) 1990 goto out; 1991 r = 0; 1992 break; 1993 } 1994 case KVM_SET_SIGNAL_MASK: { 1995 struct kvm_signal_mask __user *sigmask_arg = argp; 1996 struct kvm_signal_mask kvm_sigmask; 1997 sigset_t sigset, *p; 1998 1999 p = NULL; 2000 if (argp) { 2001 r = -EFAULT; 2002 if (copy_from_user(&kvm_sigmask, argp, 2003 sizeof kvm_sigmask)) 2004 goto out; 2005 r = -EINVAL; 2006 if (kvm_sigmask.len != sizeof sigset) 2007 goto out; 2008 r = -EFAULT; 2009 if (copy_from_user(&sigset, sigmask_arg->sigset, 2010 sizeof sigset)) 2011 goto out; 2012 p = &sigset; 2013 } 2014 r = 
kvm_vcpu_ioctl_set_sigmask(vcpu, &sigset); 2015 break; 2016 } 2017 case KVM_GET_FPU: { 2018 fpu = kzalloc(sizeof(struct kvm_fpu), GFP_KERNEL); 2019 r = -ENOMEM; 2020 if (!fpu) 2021 goto out; 2022 r = kvm_arch_vcpu_ioctl_get_fpu(vcpu, fpu); 2023 if (r) 2024 goto out; 2025 r = -EFAULT; 2026 if (copy_to_user(argp, fpu, sizeof(struct kvm_fpu))) 2027 goto out; 2028 r = 0; 2029 break; 2030 } 2031 case KVM_SET_FPU: { 2032 fpu = kmalloc(sizeof(struct kvm_fpu), GFP_KERNEL); 2033 r = -ENOMEM; 2034 if (!fpu) 2035 goto out; 2036 r = -EFAULT; 2037 if (copy_from_user(fpu, argp, sizeof(struct kvm_fpu))) 2038 goto out; 2039 r = kvm_arch_vcpu_ioctl_set_fpu(vcpu, fpu); 2040 if (r) 2041 goto out; 2042 r = 0; 2043 break; 2044 } 2045 default: 2046 r = kvm_arch_vcpu_ioctl(filp, ioctl, arg); 2047 } 2048 out: 2049 kfree(fpu); 2050 kfree(kvm_sregs); 2051 return r; 2052 } 2053 2054 static long kvm_vm_ioctl(struct file *filp, 2055 unsigned int ioctl, unsigned long arg) 2056 { 2057 struct kvm *kvm = filp->private_data; 2058 void __user *argp = (void __user *)arg; 2059 int r; 2060 2061 if (kvm->mm != current->mm) 2062 return -EIO; 2063 switch (ioctl) { 2064 case KVM_CREATE_VCPU: 2065 r = kvm_vm_ioctl_create_vcpu(kvm, arg); 2066 if (r < 0) 2067 goto out; 2068 break; 2069 case KVM_SET_USER_MEMORY_REGION: { 2070 struct kvm_userspace_memory_region kvm_userspace_mem; 2071 2072 r = -EFAULT; 2073 if (copy_from_user(&kvm_userspace_mem, argp, 2074 sizeof kvm_userspace_mem)) 2075 goto out; 2076 2077 r = kvm_vm_ioctl_set_memory_region(kvm, &kvm_userspace_mem, 1); 2078 if (r) 2079 goto out; 2080 break; 2081 } 2082 case KVM_GET_DIRTY_LOG: { 2083 struct kvm_dirty_log log; 2084 2085 r = -EFAULT; 2086 if (copy_from_user(&log, argp, sizeof log)) 2087 goto out; 2088 r = kvm_vm_ioctl_get_dirty_log(kvm, &log); 2089 if (r) 2090 goto out; 2091 break; 2092 } 2093 #ifdef KVM_COALESCED_MMIO_PAGE_OFFSET 2094 case KVM_REGISTER_COALESCED_MMIO: { 2095 struct kvm_coalesced_mmio_zone zone; 2096 r = -EFAULT; 2097 if (copy_from_user(&zone, argp, sizeof zone)) 2098 goto out; 2099 r = -ENXIO; 2100 r = kvm_vm_ioctl_register_coalesced_mmio(kvm, &zone); 2101 if (r) 2102 goto out; 2103 r = 0; 2104 break; 2105 } 2106 case KVM_UNREGISTER_COALESCED_MMIO: { 2107 struct kvm_coalesced_mmio_zone zone; 2108 r = -EFAULT; 2109 if (copy_from_user(&zone, argp, sizeof zone)) 2110 goto out; 2111 r = -ENXIO; 2112 r = kvm_vm_ioctl_unregister_coalesced_mmio(kvm, &zone); 2113 if (r) 2114 goto out; 2115 r = 0; 2116 break; 2117 } 2118 #endif 2119 #ifdef KVM_CAP_DEVICE_ASSIGNMENT 2120 case KVM_ASSIGN_PCI_DEVICE: { 2121 struct kvm_assigned_pci_dev assigned_dev; 2122 2123 r = -EFAULT; 2124 if (copy_from_user(&assigned_dev, argp, sizeof assigned_dev)) 2125 goto out; 2126 r = kvm_vm_ioctl_assign_device(kvm, &assigned_dev); 2127 if (r) 2128 goto out; 2129 break; 2130 } 2131 case KVM_ASSIGN_IRQ: { 2132 r = -EOPNOTSUPP; 2133 break; 2134 } 2135 #ifdef KVM_CAP_ASSIGN_DEV_IRQ 2136 case KVM_ASSIGN_DEV_IRQ: { 2137 struct kvm_assigned_irq assigned_irq; 2138 2139 r = -EFAULT; 2140 if (copy_from_user(&assigned_irq, argp, sizeof assigned_irq)) 2141 goto out; 2142 r = kvm_vm_ioctl_assign_irq(kvm, &assigned_irq); 2143 if (r) 2144 goto out; 2145 break; 2146 } 2147 case KVM_DEASSIGN_DEV_IRQ: { 2148 struct kvm_assigned_irq assigned_irq; 2149 2150 r = -EFAULT; 2151 if (copy_from_user(&assigned_irq, argp, sizeof assigned_irq)) 2152 goto out; 2153 r = kvm_vm_ioctl_deassign_dev_irq(kvm, &assigned_irq); 2154 if (r) 2155 goto out; 2156 break; 2157 } 2158 #endif 2159 #endif 2160 #ifdef 
KVM_CAP_DEVICE_DEASSIGNMENT 2161 case KVM_DEASSIGN_PCI_DEVICE: { 2162 struct kvm_assigned_pci_dev assigned_dev; 2163 2164 r = -EFAULT; 2165 if (copy_from_user(&assigned_dev, argp, sizeof assigned_dev)) 2166 goto out; 2167 r = kvm_vm_ioctl_deassign_device(kvm, &assigned_dev); 2168 if (r) 2169 goto out; 2170 break; 2171 } 2172 #endif 2173 #ifdef KVM_CAP_IRQ_ROUTING 2174 case KVM_SET_GSI_ROUTING: { 2175 struct kvm_irq_routing routing; 2176 struct kvm_irq_routing __user *urouting; 2177 struct kvm_irq_routing_entry *entries; 2178 2179 r = -EFAULT; 2180 if (copy_from_user(&routing, argp, sizeof(routing))) 2181 goto out; 2182 r = -EINVAL; 2183 if (routing.nr >= KVM_MAX_IRQ_ROUTES) 2184 goto out; 2185 if (routing.flags) 2186 goto out; 2187 r = -ENOMEM; 2188 entries = vmalloc(routing.nr * sizeof(*entries)); 2189 if (!entries) 2190 goto out; 2191 r = -EFAULT; 2192 urouting = argp; 2193 if (copy_from_user(entries, urouting->entries, 2194 routing.nr * sizeof(*entries))) 2195 goto out_free_irq_routing; 2196 r = kvm_set_irq_routing(kvm, entries, routing.nr, 2197 routing.flags); 2198 out_free_irq_routing: 2199 vfree(entries); 2200 break; 2201 } 2202 #ifdef __KVM_HAVE_MSIX 2203 case KVM_ASSIGN_SET_MSIX_NR: { 2204 struct kvm_assigned_msix_nr entry_nr; 2205 r = -EFAULT; 2206 if (copy_from_user(&entry_nr, argp, sizeof entry_nr)) 2207 goto out; 2208 r = kvm_vm_ioctl_set_msix_nr(kvm, &entry_nr); 2209 if (r) 2210 goto out; 2211 break; 2212 } 2213 case KVM_ASSIGN_SET_MSIX_ENTRY: { 2214 struct kvm_assigned_msix_entry entry; 2215 r = -EFAULT; 2216 if (copy_from_user(&entry, argp, sizeof entry)) 2217 goto out; 2218 r = kvm_vm_ioctl_set_msix_entry(kvm, &entry); 2219 if (r) 2220 goto out; 2221 break; 2222 } 2223 #endif 2224 #endif /* KVM_CAP_IRQ_ROUTING */ 2225 default: 2226 r = kvm_arch_vm_ioctl(filp, ioctl, arg); 2227 } 2228 out: 2229 return r; 2230 } 2231 2232 static int kvm_vm_fault(struct vm_area_struct *vma, struct vm_fault *vmf) 2233 { 2234 struct page *page[1]; 2235 unsigned long addr; 2236 int npages; 2237 gfn_t gfn = vmf->pgoff; 2238 struct kvm *kvm = vma->vm_file->private_data; 2239 2240 addr = gfn_to_hva(kvm, gfn); 2241 if (kvm_is_error_hva(addr)) 2242 return VM_FAULT_SIGBUS; 2243 2244 npages = get_user_pages(current, current->mm, addr, 1, 1, 0, page, 2245 NULL); 2246 if (unlikely(npages != 1)) 2247 return VM_FAULT_SIGBUS; 2248 2249 vmf->page = page[0]; 2250 return 0; 2251 } 2252 2253 static struct vm_operations_struct kvm_vm_vm_ops = { 2254 .fault = kvm_vm_fault, 2255 }; 2256 2257 static int kvm_vm_mmap(struct file *file, struct vm_area_struct *vma) 2258 { 2259 vma->vm_ops = &kvm_vm_vm_ops; 2260 return 0; 2261 } 2262 2263 static struct file_operations kvm_vm_fops = { 2264 .release = kvm_vm_release, 2265 .unlocked_ioctl = kvm_vm_ioctl, 2266 .compat_ioctl = kvm_vm_ioctl, 2267 .mmap = kvm_vm_mmap, 2268 }; 2269 2270 static int kvm_dev_ioctl_create_vm(void) 2271 { 2272 int fd; 2273 struct kvm *kvm; 2274 2275 kvm = kvm_create_vm(); 2276 if (IS_ERR(kvm)) 2277 return PTR_ERR(kvm); 2278 fd = anon_inode_getfd("kvm-vm", &kvm_vm_fops, kvm, 0); 2279 if (fd < 0) 2280 kvm_put_kvm(kvm); 2281 2282 return fd; 2283 } 2284 2285 static long kvm_dev_ioctl_check_extension_generic(long arg) 2286 { 2287 switch (arg) { 2288 case KVM_CAP_USER_MEMORY: 2289 case KVM_CAP_DESTROY_MEMORY_REGION_WORKS: 2290 case KVM_CAP_JOIN_MEMORY_REGIONS_WORKS: 2291 return 1; 2292 #ifdef CONFIG_HAVE_KVM_IRQCHIP 2293 case KVM_CAP_IRQ_ROUTING: 2294 return KVM_MAX_IRQ_ROUTES; 2295 #endif 2296 default: 2297 break; 2298 } 2299 return 

static long kvm_dev_ioctl(struct file *filp,
			  unsigned int ioctl, unsigned long arg)
{
	long r = -EINVAL;

	switch (ioctl) {
	case KVM_GET_API_VERSION:
		r = -EINVAL;
		if (arg)
			goto out;
		r = KVM_API_VERSION;
		break;
	case KVM_CREATE_VM:
		r = -EINVAL;
		if (arg)
			goto out;
		r = kvm_dev_ioctl_create_vm();
		break;
	case KVM_CHECK_EXTENSION:
		r = kvm_dev_ioctl_check_extension_generic(arg);
		break;
	case KVM_GET_VCPU_MMAP_SIZE:
		r = -EINVAL;
		if (arg)
			goto out;
		r = PAGE_SIZE;     /* struct kvm_run */
#ifdef CONFIG_X86
		r += PAGE_SIZE;    /* pio data page */
#endif
#ifdef KVM_COALESCED_MMIO_PAGE_OFFSET
		r += PAGE_SIZE;    /* coalesced mmio ring page */
#endif
		break;
	case KVM_TRACE_ENABLE:
	case KVM_TRACE_PAUSE:
	case KVM_TRACE_DISABLE:
		r = kvm_trace_ioctl(ioctl, arg);
		break;
	default:
		return kvm_arch_dev_ioctl(filp, ioctl, arg);
	}
out:
	return r;
}

static struct file_operations kvm_chardev_ops = {
	.unlocked_ioctl = kvm_dev_ioctl,
	.compat_ioctl   = kvm_dev_ioctl,
};

static struct miscdevice kvm_dev = {
	KVM_MINOR,
	"kvm",
	&kvm_chardev_ops,
};
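
/*
 * Editor's note -- illustrative sketch, not part of this file: the misc
 * device above is what appears as /dev/kvm.  Typical userspace bring-up
 * against kvm_dev_ioctl() is roughly (variable names are placeholders,
 * error handling omitted):
 *
 *	kvm_fd = open("/dev/kvm", O_RDWR);
 *	if (ioctl(kvm_fd, KVM_GET_API_VERSION, 0) != KVM_API_VERSION)
 *		return -1;
 *	vm_fd = ioctl(kvm_fd, KVM_CREATE_VM, 0);
 *	mmap_size = ioctl(kvm_fd, KVM_GET_VCPU_MMAP_SIZE, 0);
 *
 * mmap_size is later used to mmap() each vcpu fd and reach its struct
 * kvm_run, plus the pio and coalesced-mmio pages accounted for above.
 */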

static void hardware_enable(void *junk)
{
	int cpu = raw_smp_processor_id();

	if (cpumask_test_cpu(cpu, cpus_hardware_enabled))
		return;
	cpumask_set_cpu(cpu, cpus_hardware_enabled);
	kvm_arch_hardware_enable(NULL);
}

static void hardware_disable(void *junk)
{
	int cpu = raw_smp_processor_id();

	if (!cpumask_test_cpu(cpu, cpus_hardware_enabled))
		return;
	cpumask_clear_cpu(cpu, cpus_hardware_enabled);
	kvm_arch_hardware_disable(NULL);
}

static int kvm_cpu_hotplug(struct notifier_block *notifier, unsigned long val,
			   void *v)
{
	int cpu = (long)v;

	val &= ~CPU_TASKS_FROZEN;
	switch (val) {
	case CPU_DYING:
		printk(KERN_INFO "kvm: disabling virtualization on CPU%d\n",
		       cpu);
		hardware_disable(NULL);
		break;
	case CPU_UP_CANCELED:
		printk(KERN_INFO "kvm: disabling virtualization on CPU%d\n",
		       cpu);
		smp_call_function_single(cpu, hardware_disable, NULL, 1);
		break;
	case CPU_ONLINE:
		printk(KERN_INFO "kvm: enabling virtualization on CPU%d\n",
		       cpu);
		smp_call_function_single(cpu, hardware_enable, NULL, 1);
		break;
	}
	return NOTIFY_OK;
}


asmlinkage void kvm_handle_fault_on_reboot(void)
{
	if (kvm_rebooting)
		/* spin while reset goes on */
		while (true)
			;
	/* Fault while not rebooting.  We want the trace. */
	BUG();
}
EXPORT_SYMBOL_GPL(kvm_handle_fault_on_reboot);

static int kvm_reboot(struct notifier_block *notifier, unsigned long val,
		      void *v)
{
	/*
	 * Some (well, at least mine) BIOSes hang on reboot if
	 * in vmx root mode.
	 *
	 * And Intel TXT requires VMX to be off on all CPUs when the
	 * system shuts down.
	 */
	printk(KERN_INFO "kvm: exiting hardware virtualization\n");
	kvm_rebooting = true;
	on_each_cpu(hardware_disable, NULL, 1);
	return NOTIFY_OK;
}

static struct notifier_block kvm_reboot_notifier = {
	.notifier_call = kvm_reboot,
	.priority = 0,
};

void kvm_io_bus_init(struct kvm_io_bus *bus)
{
	memset(bus, 0, sizeof(*bus));
}

void kvm_io_bus_destroy(struct kvm_io_bus *bus)
{
	int i;

	for (i = 0; i < bus->dev_count; i++) {
		struct kvm_io_device *pos = bus->devs[i];

		kvm_iodevice_destructor(pos);
	}
}

struct kvm_io_device *kvm_io_bus_find_dev(struct kvm_io_bus *bus,
					  gpa_t addr, int len, int is_write)
{
	int i;

	for (i = 0; i < bus->dev_count; i++) {
		struct kvm_io_device *pos = bus->devs[i];

		if (pos->in_range(pos, addr, len, is_write))
			return pos;
	}

	return NULL;
}

void kvm_io_bus_register_dev(struct kvm_io_bus *bus, struct kvm_io_device *dev)
{
	BUG_ON(bus->dev_count > (NR_IOBUS_DEVS-1));

	bus->devs[bus->dev_count++] = dev;
}

static struct notifier_block kvm_cpu_notifier = {
	.notifier_call = kvm_cpu_hotplug,
	.priority = 20, /* must be > scheduler priority */
};

static int vm_stat_get(void *_offset, u64 *val)
{
	unsigned offset = (long)_offset;
	struct kvm *kvm;

	*val = 0;
	spin_lock(&kvm_lock);
	list_for_each_entry(kvm, &vm_list, vm_list)
		*val += *(u32 *)((void *)kvm + offset);
	spin_unlock(&kvm_lock);
	return 0;
}

DEFINE_SIMPLE_ATTRIBUTE(vm_stat_fops, vm_stat_get, NULL, "%llu\n");

static int vcpu_stat_get(void *_offset, u64 *val)
{
	unsigned offset = (long)_offset;
	struct kvm *kvm;
	struct kvm_vcpu *vcpu;
	int i;

	*val = 0;
	spin_lock(&kvm_lock);
	list_for_each_entry(kvm, &vm_list, vm_list)
		for (i = 0; i < KVM_MAX_VCPUS; ++i) {
			vcpu = kvm->vcpus[i];
			if (vcpu)
				*val += *(u32 *)((void *)vcpu + offset);
		}
	spin_unlock(&kvm_lock);
	return 0;
}

DEFINE_SIMPLE_ATTRIBUTE(vcpu_stat_fops, vcpu_stat_get, NULL, "%llu\n");

static struct file_operations *stat_fops[] = {
	[KVM_STAT_VCPU] = &vcpu_stat_fops,
	[KVM_STAT_VM]   = &vm_stat_fops,
};

static void kvm_init_debug(void)
{
	struct kvm_stats_debugfs_item *p;

	kvm_debugfs_dir = debugfs_create_dir("kvm", NULL);
	for (p = debugfs_entries; p->name; ++p)
		p->dentry = debugfs_create_file(p->name, 0444, kvm_debugfs_dir,
						(void *)(long)p->offset,
						stat_fops[p->kind]);
}

static void kvm_exit_debug(void)
{
	struct kvm_stats_debugfs_item *p;

	for (p = debugfs_entries; p->name; ++p)
		debugfs_remove(p->dentry);
	debugfs_remove(kvm_debugfs_dir);
}

static int kvm_suspend(struct sys_device *dev, pm_message_t state)
{
	hardware_disable(NULL);
	return 0;
}

static int kvm_resume(struct sys_device *dev)
{
	hardware_enable(NULL);
	return 0;
}

static struct sysdev_class kvm_sysdev_class = {
	.name = "kvm",
	.suspend = kvm_suspend,
	.resume = kvm_resume,
};

static struct sys_device kvm_sysdev = {
	.id = 0,
	.cls = &kvm_sysdev_class,
};

struct page *bad_page;
pfn_t bad_pfn;

static inline
struct kvm_vcpu *preempt_notifier_to_vcpu(struct preempt_notifier *pn)
{
	return container_of(pn, struct kvm_vcpu, preempt_notifier);
}

static void kvm_sched_in(struct preempt_notifier *pn, int cpu)
{
	struct kvm_vcpu *vcpu = preempt_notifier_to_vcpu(pn);

	kvm_arch_vcpu_load(vcpu, cpu);
}

static void kvm_sched_out(struct preempt_notifier *pn,
			  struct task_struct *next)
{
	struct kvm_vcpu *vcpu = preempt_notifier_to_vcpu(pn);

	kvm_arch_vcpu_put(vcpu);
}
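
/*
 * Editor's note -- explanatory, not part of the original file: kvm_sched_in()
 * and kvm_sched_out() are the two halves of a preempt notifier.  The vcpu
 * load/put path in this file registers &vcpu->preempt_notifier (initialised
 * with kvm_preempt_ops, which is wired up in kvm_init() below), so that
 * architecture state is saved with kvm_arch_vcpu_put() whenever the vcpu
 * thread is scheduled out and reloaded with kvm_arch_vcpu_load() when it
 * runs again, possibly on a different cpu.  The general pattern, assuming a
 * notifier already set up with preempt_notifier_init(), is:
 *
 *	preempt_notifier_register(&vcpu->preempt_notifier);
 *	(run while relying on per-cpu hardware state)
 *	preempt_notifier_unregister(&vcpu->preempt_notifier);
 */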

int kvm_init(void *opaque, unsigned int vcpu_size,
	     struct module *module)
{
	int r;
	int cpu;

	kvm_init_debug();

	r = kvm_arch_init(opaque);
	if (r)
		goto out_fail;

	bad_page = alloc_page(GFP_KERNEL | __GFP_ZERO);

	if (bad_page == NULL) {
		r = -ENOMEM;
		goto out;
	}

	bad_pfn = page_to_pfn(bad_page);

	if (!zalloc_cpumask_var(&cpus_hardware_enabled, GFP_KERNEL)) {
		r = -ENOMEM;
		goto out_free_0;
	}

	r = kvm_arch_hardware_setup();
	if (r < 0)
		goto out_free_0a;

	for_each_online_cpu(cpu) {
		smp_call_function_single(cpu,
				kvm_arch_check_processor_compat,
				&r, 1);
		if (r < 0)
			goto out_free_1;
	}

	on_each_cpu(hardware_enable, NULL, 1);
	r = register_cpu_notifier(&kvm_cpu_notifier);
	if (r)
		goto out_free_2;
	register_reboot_notifier(&kvm_reboot_notifier);

	r = sysdev_class_register(&kvm_sysdev_class);
	if (r)
		goto out_free_3;

	r = sysdev_register(&kvm_sysdev);
	if (r)
		goto out_free_4;

	/* A kmem cache lets us meet the alignment requirements of fx_save. */
	kvm_vcpu_cache = kmem_cache_create("kvm_vcpu", vcpu_size,
					   __alignof__(struct kvm_vcpu),
					   0, NULL);
	if (!kvm_vcpu_cache) {
		r = -ENOMEM;
		goto out_free_5;
	}

	kvm_chardev_ops.owner = module;
	kvm_vm_fops.owner = module;
	kvm_vcpu_fops.owner = module;

	r = misc_register(&kvm_dev);
	if (r) {
		printk(KERN_ERR "kvm: misc device register failed\n");
		goto out_free;
	}

	kvm_preempt_ops.sched_in = kvm_sched_in;
	kvm_preempt_ops.sched_out = kvm_sched_out;

	return 0;

out_free:
	kmem_cache_destroy(kvm_vcpu_cache);
out_free_5:
	sysdev_unregister(&kvm_sysdev);
out_free_4:
	sysdev_class_unregister(&kvm_sysdev_class);
out_free_3:
	unregister_reboot_notifier(&kvm_reboot_notifier);
	unregister_cpu_notifier(&kvm_cpu_notifier);
out_free_2:
	on_each_cpu(hardware_disable, NULL, 1);
out_free_1:
	kvm_arch_hardware_unsetup();
out_free_0a:
	free_cpumask_var(cpus_hardware_enabled);
out_free_0:
	__free_page(bad_page);
out:
	kvm_arch_exit();
	kvm_exit_debug();
out_fail:
	return r;
}
EXPORT_SYMBOL_GPL(kvm_init);

void kvm_exit(void)
{
	kvm_trace_cleanup();
	misc_deregister(&kvm_dev);
	kmem_cache_destroy(kvm_vcpu_cache);
	sysdev_unregister(&kvm_sysdev);
	sysdev_class_unregister(&kvm_sysdev_class);
	unregister_reboot_notifier(&kvm_reboot_notifier);
	unregister_cpu_notifier(&kvm_cpu_notifier);
	on_each_cpu(hardware_disable, NULL, 1);
	kvm_arch_hardware_unsetup();
	kvm_arch_exit();
	kvm_exit_debug();
	free_cpumask_var(cpus_hardware_enabled);
	__free_page(bad_page);
}
EXPORT_SYMBOL_GPL(kvm_exit);
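
/*
 * Editor's note -- illustrative sketch, not part of this file: kvm_init() and
 * kvm_exit() are called from the architecture module's init/exit hooks.  On
 * x86, for example, the VMX module does roughly the following (the exact
 * names belong to that module, not to this file):
 *
 *	static int __init vmx_init(void)
 *	{
 *		return kvm_init(&vmx_x86_ops, sizeof(struct vcpu_vmx),
 *				THIS_MODULE);
 *	}
 *
 *	static void __exit vmx_exit(void)
 *	{
 *		kvm_exit();
 *	}
 *
 * The opaque pointer is passed straight through to kvm_arch_init(), and
 * vcpu_size sizes the kvm_vcpu kmem cache created above.
 */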