1 /* 2 * Kernel-based Virtual Machine driver for Linux 3 * 4 * derived from drivers/kvm/kvm_main.c 5 * 6 * Copyright (C) 2006 Qumranet, Inc. 7 * Copyright (C) 2008 Qumranet, Inc. 8 * Copyright IBM Corporation, 2008 9 * Copyright 2010 Red Hat, Inc. and/or its affilates. 10 * 11 * Authors: 12 * Avi Kivity <avi@qumranet.com> 13 * Yaniv Kamay <yaniv@qumranet.com> 14 * Amit Shah <amit.shah@qumranet.com> 15 * Ben-Ami Yassour <benami@il.ibm.com> 16 * 17 * This work is licensed under the terms of the GNU GPL, version 2. See 18 * the COPYING file in the top-level directory. 19 * 20 */ 21 22 #include <linux/kvm_host.h> 23 #include "irq.h" 24 #include "mmu.h" 25 #include "i8254.h" 26 #include "tss.h" 27 #include "kvm_cache_regs.h" 28 #include "x86.h" 29 30 #include <linux/clocksource.h> 31 #include <linux/interrupt.h> 32 #include <linux/kvm.h> 33 #include <linux/fs.h> 34 #include <linux/vmalloc.h> 35 #include <linux/module.h> 36 #include <linux/mman.h> 37 #include <linux/highmem.h> 38 #include <linux/iommu.h> 39 #include <linux/intel-iommu.h> 40 #include <linux/cpufreq.h> 41 #include <linux/user-return-notifier.h> 42 #include <linux/srcu.h> 43 #include <linux/slab.h> 44 #include <linux/perf_event.h> 45 #include <linux/uaccess.h> 46 #include <trace/events/kvm.h> 47 48 #define CREATE_TRACE_POINTS 49 #include "trace.h" 50 51 #include <asm/debugreg.h> 52 #include <asm/msr.h> 53 #include <asm/desc.h> 54 #include <asm/mtrr.h> 55 #include <asm/mce.h> 56 #include <asm/i387.h> 57 #include <asm/xcr.h> 58 59 #define MAX_IO_MSRS 256 60 #define CR0_RESERVED_BITS \ 61 (~(unsigned long)(X86_CR0_PE | X86_CR0_MP | X86_CR0_EM | X86_CR0_TS \ 62 | X86_CR0_ET | X86_CR0_NE | X86_CR0_WP | X86_CR0_AM \ 63 | X86_CR0_NW | X86_CR0_CD | X86_CR0_PG)) 64 #define CR4_RESERVED_BITS \ 65 (~(unsigned long)(X86_CR4_VME | X86_CR4_PVI | X86_CR4_TSD | X86_CR4_DE\ 66 | X86_CR4_PSE | X86_CR4_PAE | X86_CR4_MCE \ 67 | X86_CR4_PGE | X86_CR4_PCE | X86_CR4_OSFXSR \ 68 | X86_CR4_OSXSAVE \ 69 | X86_CR4_OSXMMEXCPT | X86_CR4_VMXE)) 70 71 #define CR8_RESERVED_BITS (~(unsigned long)X86_CR8_TPR) 72 73 #define KVM_MAX_MCE_BANKS 32 74 #define KVM_MCE_CAP_SUPPORTED MCG_CTL_P 75 76 /* EFER defaults: 77 * - enable syscall per default because its emulated by KVM 78 * - enable LME and LMA per default on 64 bit KVM 79 */ 80 #ifdef CONFIG_X86_64 81 static u64 __read_mostly efer_reserved_bits = 0xfffffffffffffafeULL; 82 #else 83 static u64 __read_mostly efer_reserved_bits = 0xfffffffffffffffeULL; 84 #endif 85 86 #define VM_STAT(x) offsetof(struct kvm, stat.x), KVM_STAT_VM 87 #define VCPU_STAT(x) offsetof(struct kvm_vcpu, stat.x), KVM_STAT_VCPU 88 89 static void update_cr8_intercept(struct kvm_vcpu *vcpu); 90 static int kvm_dev_ioctl_get_supported_cpuid(struct kvm_cpuid2 *cpuid, 91 struct kvm_cpuid_entry2 __user *entries); 92 93 struct kvm_x86_ops *kvm_x86_ops; 94 EXPORT_SYMBOL_GPL(kvm_x86_ops); 95 96 int ignore_msrs = 0; 97 module_param_named(ignore_msrs, ignore_msrs, bool, S_IRUGO | S_IWUSR); 98 99 #define KVM_NR_SHARED_MSRS 16 100 101 struct kvm_shared_msrs_global { 102 int nr; 103 u32 msrs[KVM_NR_SHARED_MSRS]; 104 }; 105 106 struct kvm_shared_msrs { 107 struct user_return_notifier urn; 108 bool registered; 109 struct kvm_shared_msr_values { 110 u64 host; 111 u64 curr; 112 } values[KVM_NR_SHARED_MSRS]; 113 }; 114 115 static struct kvm_shared_msrs_global __read_mostly shared_msrs_global; 116 static DEFINE_PER_CPU(struct kvm_shared_msrs, shared_msrs); 117 118 struct kvm_stats_debugfs_item debugfs_entries[] = { 119 { "pf_fixed", VCPU_STAT(pf_fixed) }, 120 { 
"pf_guest", VCPU_STAT(pf_guest) }, 121 { "tlb_flush", VCPU_STAT(tlb_flush) }, 122 { "invlpg", VCPU_STAT(invlpg) }, 123 { "exits", VCPU_STAT(exits) }, 124 { "io_exits", VCPU_STAT(io_exits) }, 125 { "mmio_exits", VCPU_STAT(mmio_exits) }, 126 { "signal_exits", VCPU_STAT(signal_exits) }, 127 { "irq_window", VCPU_STAT(irq_window_exits) }, 128 { "nmi_window", VCPU_STAT(nmi_window_exits) }, 129 { "halt_exits", VCPU_STAT(halt_exits) }, 130 { "halt_wakeup", VCPU_STAT(halt_wakeup) }, 131 { "hypercalls", VCPU_STAT(hypercalls) }, 132 { "request_irq", VCPU_STAT(request_irq_exits) }, 133 { "irq_exits", VCPU_STAT(irq_exits) }, 134 { "host_state_reload", VCPU_STAT(host_state_reload) }, 135 { "efer_reload", VCPU_STAT(efer_reload) }, 136 { "fpu_reload", VCPU_STAT(fpu_reload) }, 137 { "insn_emulation", VCPU_STAT(insn_emulation) }, 138 { "insn_emulation_fail", VCPU_STAT(insn_emulation_fail) }, 139 { "irq_injections", VCPU_STAT(irq_injections) }, 140 { "nmi_injections", VCPU_STAT(nmi_injections) }, 141 { "mmu_shadow_zapped", VM_STAT(mmu_shadow_zapped) }, 142 { "mmu_pte_write", VM_STAT(mmu_pte_write) }, 143 { "mmu_pte_updated", VM_STAT(mmu_pte_updated) }, 144 { "mmu_pde_zapped", VM_STAT(mmu_pde_zapped) }, 145 { "mmu_flooded", VM_STAT(mmu_flooded) }, 146 { "mmu_recycled", VM_STAT(mmu_recycled) }, 147 { "mmu_cache_miss", VM_STAT(mmu_cache_miss) }, 148 { "mmu_unsync", VM_STAT(mmu_unsync) }, 149 { "remote_tlb_flush", VM_STAT(remote_tlb_flush) }, 150 { "largepages", VM_STAT(lpages) }, 151 { NULL } 152 }; 153 154 u64 __read_mostly host_xcr0; 155 156 static inline u32 bit(int bitno) 157 { 158 return 1 << (bitno & 31); 159 } 160 161 static void kvm_on_user_return(struct user_return_notifier *urn) 162 { 163 unsigned slot; 164 struct kvm_shared_msrs *locals 165 = container_of(urn, struct kvm_shared_msrs, urn); 166 struct kvm_shared_msr_values *values; 167 168 for (slot = 0; slot < shared_msrs_global.nr; ++slot) { 169 values = &locals->values[slot]; 170 if (values->host != values->curr) { 171 wrmsrl(shared_msrs_global.msrs[slot], values->host); 172 values->curr = values->host; 173 } 174 } 175 locals->registered = false; 176 user_return_notifier_unregister(urn); 177 } 178 179 static void shared_msr_update(unsigned slot, u32 msr) 180 { 181 struct kvm_shared_msrs *smsr; 182 u64 value; 183 184 smsr = &__get_cpu_var(shared_msrs); 185 /* only read, and nobody should modify it at this time, 186 * so don't need lock */ 187 if (slot >= shared_msrs_global.nr) { 188 printk(KERN_ERR "kvm: invalid MSR slot!"); 189 return; 190 } 191 rdmsrl_safe(msr, &value); 192 smsr->values[slot].host = value; 193 smsr->values[slot].curr = value; 194 } 195 196 void kvm_define_shared_msr(unsigned slot, u32 msr) 197 { 198 if (slot >= shared_msrs_global.nr) 199 shared_msrs_global.nr = slot + 1; 200 shared_msrs_global.msrs[slot] = msr; 201 /* we need ensured the shared_msr_global have been updated */ 202 smp_wmb(); 203 } 204 EXPORT_SYMBOL_GPL(kvm_define_shared_msr); 205 206 static void kvm_shared_msr_cpu_online(void) 207 { 208 unsigned i; 209 210 for (i = 0; i < shared_msrs_global.nr; ++i) 211 shared_msr_update(i, shared_msrs_global.msrs[i]); 212 } 213 214 void kvm_set_shared_msr(unsigned slot, u64 value, u64 mask) 215 { 216 struct kvm_shared_msrs *smsr = &__get_cpu_var(shared_msrs); 217 218 if (((value ^ smsr->values[slot].curr) & mask) == 0) 219 return; 220 smsr->values[slot].curr = value; 221 wrmsrl(shared_msrs_global.msrs[slot], value); 222 if (!smsr->registered) { 223 smsr->urn.on_user_return = kvm_on_user_return; 224 
user_return_notifier_register(&smsr->urn); 225 smsr->registered = true; 226 } 227 } 228 EXPORT_SYMBOL_GPL(kvm_set_shared_msr); 229 230 static void drop_user_return_notifiers(void *ignore) 231 { 232 struct kvm_shared_msrs *smsr = &__get_cpu_var(shared_msrs); 233 234 if (smsr->registered) 235 kvm_on_user_return(&smsr->urn); 236 } 237 238 u64 kvm_get_apic_base(struct kvm_vcpu *vcpu) 239 { 240 if (irqchip_in_kernel(vcpu->kvm)) 241 return vcpu->arch.apic_base; 242 else 243 return vcpu->arch.apic_base; 244 } 245 EXPORT_SYMBOL_GPL(kvm_get_apic_base); 246 247 void kvm_set_apic_base(struct kvm_vcpu *vcpu, u64 data) 248 { 249 /* TODO: reserve bits check */ 250 if (irqchip_in_kernel(vcpu->kvm)) 251 kvm_lapic_set_base(vcpu, data); 252 else 253 vcpu->arch.apic_base = data; 254 } 255 EXPORT_SYMBOL_GPL(kvm_set_apic_base); 256 257 #define EXCPT_BENIGN 0 258 #define EXCPT_CONTRIBUTORY 1 259 #define EXCPT_PF 2 260 261 static int exception_class(int vector) 262 { 263 switch (vector) { 264 case PF_VECTOR: 265 return EXCPT_PF; 266 case DE_VECTOR: 267 case TS_VECTOR: 268 case NP_VECTOR: 269 case SS_VECTOR: 270 case GP_VECTOR: 271 return EXCPT_CONTRIBUTORY; 272 default: 273 break; 274 } 275 return EXCPT_BENIGN; 276 } 277 278 static void kvm_multiple_exception(struct kvm_vcpu *vcpu, 279 unsigned nr, bool has_error, u32 error_code, 280 bool reinject) 281 { 282 u32 prev_nr; 283 int class1, class2; 284 285 if (!vcpu->arch.exception.pending) { 286 queue: 287 vcpu->arch.exception.pending = true; 288 vcpu->arch.exception.has_error_code = has_error; 289 vcpu->arch.exception.nr = nr; 290 vcpu->arch.exception.error_code = error_code; 291 vcpu->arch.exception.reinject = reinject; 292 return; 293 } 294 295 /* to check exception */ 296 prev_nr = vcpu->arch.exception.nr; 297 if (prev_nr == DF_VECTOR) { 298 /* triple fault -> shutdown */ 299 kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu); 300 return; 301 } 302 class1 = exception_class(prev_nr); 303 class2 = exception_class(nr); 304 if ((class1 == EXCPT_CONTRIBUTORY && class2 == EXCPT_CONTRIBUTORY) 305 || (class1 == EXCPT_PF && class2 != EXCPT_BENIGN)) { 306 /* generate double fault per SDM Table 5-5 */ 307 vcpu->arch.exception.pending = true; 308 vcpu->arch.exception.has_error_code = true; 309 vcpu->arch.exception.nr = DF_VECTOR; 310 vcpu->arch.exception.error_code = 0; 311 } else 312 /* replace previous exception with a new one in a hope 313 that instruction re-execution will regenerate lost 314 exception */ 315 goto queue; 316 } 317 318 void kvm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr) 319 { 320 kvm_multiple_exception(vcpu, nr, false, 0, false); 321 } 322 EXPORT_SYMBOL_GPL(kvm_queue_exception); 323 324 void kvm_requeue_exception(struct kvm_vcpu *vcpu, unsigned nr) 325 { 326 kvm_multiple_exception(vcpu, nr, false, 0, true); 327 } 328 EXPORT_SYMBOL_GPL(kvm_requeue_exception); 329 330 void kvm_inject_page_fault(struct kvm_vcpu *vcpu, unsigned long addr, 331 u32 error_code) 332 { 333 ++vcpu->stat.pf_guest; 334 vcpu->arch.cr2 = addr; 335 kvm_queue_exception_e(vcpu, PF_VECTOR, error_code); 336 } 337 338 void kvm_inject_nmi(struct kvm_vcpu *vcpu) 339 { 340 vcpu->arch.nmi_pending = 1; 341 } 342 EXPORT_SYMBOL_GPL(kvm_inject_nmi); 343 344 void kvm_queue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code) 345 { 346 kvm_multiple_exception(vcpu, nr, true, error_code, false); 347 } 348 EXPORT_SYMBOL_GPL(kvm_queue_exception_e); 349 350 void kvm_requeue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code) 351 { 352 kvm_multiple_exception(vcpu, nr, true, 
error_code, true); 353 } 354 EXPORT_SYMBOL_GPL(kvm_requeue_exception_e); 355 356 /* 357 * Checks if cpl <= required_cpl; if true, return true. Otherwise queue 358 * a #GP and return false. 359 */ 360 bool kvm_require_cpl(struct kvm_vcpu *vcpu, int required_cpl) 361 { 362 if (kvm_x86_ops->get_cpl(vcpu) <= required_cpl) 363 return true; 364 kvm_queue_exception_e(vcpu, GP_VECTOR, 0); 365 return false; 366 } 367 EXPORT_SYMBOL_GPL(kvm_require_cpl); 368 369 /* 370 * Load the pae pdptrs. Return true is they are all valid. 371 */ 372 int load_pdptrs(struct kvm_vcpu *vcpu, unsigned long cr3) 373 { 374 gfn_t pdpt_gfn = cr3 >> PAGE_SHIFT; 375 unsigned offset = ((cr3 & (PAGE_SIZE-1)) >> 5) << 2; 376 int i; 377 int ret; 378 u64 pdpte[ARRAY_SIZE(vcpu->arch.pdptrs)]; 379 380 ret = kvm_read_guest_page(vcpu->kvm, pdpt_gfn, pdpte, 381 offset * sizeof(u64), sizeof(pdpte)); 382 if (ret < 0) { 383 ret = 0; 384 goto out; 385 } 386 for (i = 0; i < ARRAY_SIZE(pdpte); ++i) { 387 if (is_present_gpte(pdpte[i]) && 388 (pdpte[i] & vcpu->arch.mmu.rsvd_bits_mask[0][2])) { 389 ret = 0; 390 goto out; 391 } 392 } 393 ret = 1; 394 395 memcpy(vcpu->arch.pdptrs, pdpte, sizeof(vcpu->arch.pdptrs)); 396 __set_bit(VCPU_EXREG_PDPTR, 397 (unsigned long *)&vcpu->arch.regs_avail); 398 __set_bit(VCPU_EXREG_PDPTR, 399 (unsigned long *)&vcpu->arch.regs_dirty); 400 out: 401 402 return ret; 403 } 404 EXPORT_SYMBOL_GPL(load_pdptrs); 405 406 static bool pdptrs_changed(struct kvm_vcpu *vcpu) 407 { 408 u64 pdpte[ARRAY_SIZE(vcpu->arch.pdptrs)]; 409 bool changed = true; 410 int r; 411 412 if (is_long_mode(vcpu) || !is_pae(vcpu)) 413 return false; 414 415 if (!test_bit(VCPU_EXREG_PDPTR, 416 (unsigned long *)&vcpu->arch.regs_avail)) 417 return true; 418 419 r = kvm_read_guest(vcpu->kvm, vcpu->arch.cr3 & ~31u, pdpte, sizeof(pdpte)); 420 if (r < 0) 421 goto out; 422 changed = memcmp(pdpte, vcpu->arch.pdptrs, sizeof(pdpte)) != 0; 423 out: 424 425 return changed; 426 } 427 428 int kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) 429 { 430 unsigned long old_cr0 = kvm_read_cr0(vcpu); 431 unsigned long update_bits = X86_CR0_PG | X86_CR0_WP | 432 X86_CR0_CD | X86_CR0_NW; 433 434 cr0 |= X86_CR0_ET; 435 436 #ifdef CONFIG_X86_64 437 if (cr0 & 0xffffffff00000000UL) 438 return 1; 439 #endif 440 441 cr0 &= ~CR0_RESERVED_BITS; 442 443 if ((cr0 & X86_CR0_NW) && !(cr0 & X86_CR0_CD)) 444 return 1; 445 446 if ((cr0 & X86_CR0_PG) && !(cr0 & X86_CR0_PE)) 447 return 1; 448 449 if (!is_paging(vcpu) && (cr0 & X86_CR0_PG)) { 450 #ifdef CONFIG_X86_64 451 if ((vcpu->arch.efer & EFER_LME)) { 452 int cs_db, cs_l; 453 454 if (!is_pae(vcpu)) 455 return 1; 456 kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l); 457 if (cs_l) 458 return 1; 459 } else 460 #endif 461 if (is_pae(vcpu) && !load_pdptrs(vcpu, vcpu->arch.cr3)) 462 return 1; 463 } 464 465 kvm_x86_ops->set_cr0(vcpu, cr0); 466 467 if ((cr0 ^ old_cr0) & update_bits) 468 kvm_mmu_reset_context(vcpu); 469 return 0; 470 } 471 EXPORT_SYMBOL_GPL(kvm_set_cr0); 472 473 void kvm_lmsw(struct kvm_vcpu *vcpu, unsigned long msw) 474 { 475 (void)kvm_set_cr0(vcpu, kvm_read_cr0_bits(vcpu, ~0x0eul) | (msw & 0x0f)); 476 } 477 EXPORT_SYMBOL_GPL(kvm_lmsw); 478 479 int __kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr) 480 { 481 u64 xcr0; 482 483 /* Only support XCR_XFEATURE_ENABLED_MASK(xcr0) now */ 484 if (index != XCR_XFEATURE_ENABLED_MASK) 485 return 1; 486 xcr0 = xcr; 487 if (kvm_x86_ops->get_cpl(vcpu) != 0) 488 return 1; 489 if (!(xcr0 & XSTATE_FP)) 490 return 1; 491 if ((xcr0 & XSTATE_YMM) && !(xcr0 & XSTATE_SSE)) 492 return 
1; 493 if (xcr0 & ~host_xcr0) 494 return 1; 495 vcpu->arch.xcr0 = xcr0; 496 vcpu->guest_xcr0_loaded = 0; 497 return 0; 498 } 499 500 int kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr) 501 { 502 if (__kvm_set_xcr(vcpu, index, xcr)) { 503 kvm_inject_gp(vcpu, 0); 504 return 1; 505 } 506 return 0; 507 } 508 EXPORT_SYMBOL_GPL(kvm_set_xcr); 509 510 static bool guest_cpuid_has_xsave(struct kvm_vcpu *vcpu) 511 { 512 struct kvm_cpuid_entry2 *best; 513 514 best = kvm_find_cpuid_entry(vcpu, 1, 0); 515 return best && (best->ecx & bit(X86_FEATURE_XSAVE)); 516 } 517 518 static void update_cpuid(struct kvm_vcpu *vcpu) 519 { 520 struct kvm_cpuid_entry2 *best; 521 522 best = kvm_find_cpuid_entry(vcpu, 1, 0); 523 if (!best) 524 return; 525 526 /* Update OSXSAVE bit */ 527 if (cpu_has_xsave && best->function == 0x1) { 528 best->ecx &= ~(bit(X86_FEATURE_OSXSAVE)); 529 if (kvm_read_cr4_bits(vcpu, X86_CR4_OSXSAVE)) 530 best->ecx |= bit(X86_FEATURE_OSXSAVE); 531 } 532 } 533 534 int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) 535 { 536 unsigned long old_cr4 = kvm_read_cr4(vcpu); 537 unsigned long pdptr_bits = X86_CR4_PGE | X86_CR4_PSE | X86_CR4_PAE; 538 539 if (cr4 & CR4_RESERVED_BITS) 540 return 1; 541 542 if (!guest_cpuid_has_xsave(vcpu) && (cr4 & X86_CR4_OSXSAVE)) 543 return 1; 544 545 if (is_long_mode(vcpu)) { 546 if (!(cr4 & X86_CR4_PAE)) 547 return 1; 548 } else if (is_paging(vcpu) && (cr4 & X86_CR4_PAE) 549 && ((cr4 ^ old_cr4) & pdptr_bits) 550 && !load_pdptrs(vcpu, vcpu->arch.cr3)) 551 return 1; 552 553 if (cr4 & X86_CR4_VMXE) 554 return 1; 555 556 kvm_x86_ops->set_cr4(vcpu, cr4); 557 558 if ((cr4 ^ old_cr4) & pdptr_bits) 559 kvm_mmu_reset_context(vcpu); 560 561 if ((cr4 ^ old_cr4) & X86_CR4_OSXSAVE) 562 update_cpuid(vcpu); 563 564 return 0; 565 } 566 EXPORT_SYMBOL_GPL(kvm_set_cr4); 567 568 int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) 569 { 570 if (cr3 == vcpu->arch.cr3 && !pdptrs_changed(vcpu)) { 571 kvm_mmu_sync_roots(vcpu); 572 kvm_mmu_flush_tlb(vcpu); 573 return 0; 574 } 575 576 if (is_long_mode(vcpu)) { 577 if (cr3 & CR3_L_MODE_RESERVED_BITS) 578 return 1; 579 } else { 580 if (is_pae(vcpu)) { 581 if (cr3 & CR3_PAE_RESERVED_BITS) 582 return 1; 583 if (is_paging(vcpu) && !load_pdptrs(vcpu, cr3)) 584 return 1; 585 } 586 /* 587 * We don't check reserved bits in nonpae mode, because 588 * this isn't enforced, and VMware depends on this. 589 */ 590 } 591 592 /* 593 * Does the new cr3 value map to physical memory? (Note, we 594 * catch an invalid cr3 even in real-mode, because it would 595 * cause trouble later on when we turn on paging anyway.) 596 * 597 * A real CPU would silently accept an invalid cr3 and would 598 * attempt to use it - with largely undefined (and often hard 599 * to debug) behavior on the guest side. 
600 */ 601 if (unlikely(!gfn_to_memslot(vcpu->kvm, cr3 >> PAGE_SHIFT))) 602 return 1; 603 vcpu->arch.cr3 = cr3; 604 vcpu->arch.mmu.new_cr3(vcpu); 605 return 0; 606 } 607 EXPORT_SYMBOL_GPL(kvm_set_cr3); 608 609 int __kvm_set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8) 610 { 611 if (cr8 & CR8_RESERVED_BITS) 612 return 1; 613 if (irqchip_in_kernel(vcpu->kvm)) 614 kvm_lapic_set_tpr(vcpu, cr8); 615 else 616 vcpu->arch.cr8 = cr8; 617 return 0; 618 } 619 620 void kvm_set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8) 621 { 622 if (__kvm_set_cr8(vcpu, cr8)) 623 kvm_inject_gp(vcpu, 0); 624 } 625 EXPORT_SYMBOL_GPL(kvm_set_cr8); 626 627 unsigned long kvm_get_cr8(struct kvm_vcpu *vcpu) 628 { 629 if (irqchip_in_kernel(vcpu->kvm)) 630 return kvm_lapic_get_cr8(vcpu); 631 else 632 return vcpu->arch.cr8; 633 } 634 EXPORT_SYMBOL_GPL(kvm_get_cr8); 635 636 static int __kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val) 637 { 638 switch (dr) { 639 case 0 ... 3: 640 vcpu->arch.db[dr] = val; 641 if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)) 642 vcpu->arch.eff_db[dr] = val; 643 break; 644 case 4: 645 if (kvm_read_cr4_bits(vcpu, X86_CR4_DE)) 646 return 1; /* #UD */ 647 /* fall through */ 648 case 6: 649 if (val & 0xffffffff00000000ULL) 650 return -1; /* #GP */ 651 vcpu->arch.dr6 = (val & DR6_VOLATILE) | DR6_FIXED_1; 652 break; 653 case 5: 654 if (kvm_read_cr4_bits(vcpu, X86_CR4_DE)) 655 return 1; /* #UD */ 656 /* fall through */ 657 default: /* 7 */ 658 if (val & 0xffffffff00000000ULL) 659 return -1; /* #GP */ 660 vcpu->arch.dr7 = (val & DR7_VOLATILE) | DR7_FIXED_1; 661 if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)) { 662 kvm_x86_ops->set_dr7(vcpu, vcpu->arch.dr7); 663 vcpu->arch.switch_db_regs = (val & DR7_BP_EN_MASK); 664 } 665 break; 666 } 667 668 return 0; 669 } 670 671 int kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val) 672 { 673 int res; 674 675 res = __kvm_set_dr(vcpu, dr, val); 676 if (res > 0) 677 kvm_queue_exception(vcpu, UD_VECTOR); 678 else if (res < 0) 679 kvm_inject_gp(vcpu, 0); 680 681 return res; 682 } 683 EXPORT_SYMBOL_GPL(kvm_set_dr); 684 685 static int _kvm_get_dr(struct kvm_vcpu *vcpu, int dr, unsigned long *val) 686 { 687 switch (dr) { 688 case 0 ... 3: 689 *val = vcpu->arch.db[dr]; 690 break; 691 case 4: 692 if (kvm_read_cr4_bits(vcpu, X86_CR4_DE)) 693 return 1; 694 /* fall through */ 695 case 6: 696 *val = vcpu->arch.dr6; 697 break; 698 case 5: 699 if (kvm_read_cr4_bits(vcpu, X86_CR4_DE)) 700 return 1; 701 /* fall through */ 702 default: /* 7 */ 703 *val = vcpu->arch.dr7; 704 break; 705 } 706 707 return 0; 708 } 709 710 int kvm_get_dr(struct kvm_vcpu *vcpu, int dr, unsigned long *val) 711 { 712 if (_kvm_get_dr(vcpu, dr, val)) { 713 kvm_queue_exception(vcpu, UD_VECTOR); 714 return 1; 715 } 716 return 0; 717 } 718 EXPORT_SYMBOL_GPL(kvm_get_dr); 719 720 /* 721 * List of msr numbers which we expose to userspace through KVM_GET_MSRS 722 * and KVM_SET_MSRS, and KVM_GET_MSR_INDEX_LIST. 723 * 724 * This list is modified at module load time to reflect the 725 * capabilities of the host cpu. This capabilities test skips MSRs that are 726 * kvm-specific. Those are put in the beginning of the list. 
727 */ 728 729 #define KVM_SAVE_MSRS_BEGIN 7 730 static u32 msrs_to_save[] = { 731 MSR_KVM_SYSTEM_TIME, MSR_KVM_WALL_CLOCK, 732 MSR_KVM_SYSTEM_TIME_NEW, MSR_KVM_WALL_CLOCK_NEW, 733 HV_X64_MSR_GUEST_OS_ID, HV_X64_MSR_HYPERCALL, 734 HV_X64_MSR_APIC_ASSIST_PAGE, 735 MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP, 736 MSR_STAR, 737 #ifdef CONFIG_X86_64 738 MSR_CSTAR, MSR_KERNEL_GS_BASE, MSR_SYSCALL_MASK, MSR_LSTAR, 739 #endif 740 MSR_IA32_TSC, MSR_IA32_PERF_STATUS, MSR_IA32_CR_PAT, MSR_VM_HSAVE_PA 741 }; 742 743 static unsigned num_msrs_to_save; 744 745 static u32 emulated_msrs[] = { 746 MSR_IA32_MISC_ENABLE, 747 MSR_IA32_MCG_STATUS, 748 MSR_IA32_MCG_CTL, 749 }; 750 751 static int set_efer(struct kvm_vcpu *vcpu, u64 efer) 752 { 753 u64 old_efer = vcpu->arch.efer; 754 755 if (efer & efer_reserved_bits) 756 return 1; 757 758 if (is_paging(vcpu) 759 && (vcpu->arch.efer & EFER_LME) != (efer & EFER_LME)) 760 return 1; 761 762 if (efer & EFER_FFXSR) { 763 struct kvm_cpuid_entry2 *feat; 764 765 feat = kvm_find_cpuid_entry(vcpu, 0x80000001, 0); 766 if (!feat || !(feat->edx & bit(X86_FEATURE_FXSR_OPT))) 767 return 1; 768 } 769 770 if (efer & EFER_SVME) { 771 struct kvm_cpuid_entry2 *feat; 772 773 feat = kvm_find_cpuid_entry(vcpu, 0x80000001, 0); 774 if (!feat || !(feat->ecx & bit(X86_FEATURE_SVM))) 775 return 1; 776 } 777 778 efer &= ~EFER_LMA; 779 efer |= vcpu->arch.efer & EFER_LMA; 780 781 kvm_x86_ops->set_efer(vcpu, efer); 782 783 vcpu->arch.mmu.base_role.nxe = (efer & EFER_NX) && !tdp_enabled; 784 kvm_mmu_reset_context(vcpu); 785 786 /* Update reserved bits */ 787 if ((efer ^ old_efer) & EFER_NX) 788 kvm_mmu_reset_context(vcpu); 789 790 return 0; 791 } 792 793 void kvm_enable_efer_bits(u64 mask) 794 { 795 efer_reserved_bits &= ~mask; 796 } 797 EXPORT_SYMBOL_GPL(kvm_enable_efer_bits); 798 799 800 /* 801 * Writes msr value into into the appropriate "register". 802 * Returns 0 on success, non-0 otherwise. 803 * Assumes vcpu_load() was already called. 804 */ 805 int kvm_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data) 806 { 807 return kvm_x86_ops->set_msr(vcpu, msr_index, data); 808 } 809 810 /* 811 * Adapt set_msr() to msr_io()'s calling convention 812 */ 813 static int do_set_msr(struct kvm_vcpu *vcpu, unsigned index, u64 *data) 814 { 815 return kvm_set_msr(vcpu, index, *data); 816 } 817 818 static void kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock) 819 { 820 int version; 821 int r; 822 struct pvclock_wall_clock wc; 823 struct timespec boot; 824 825 if (!wall_clock) 826 return; 827 828 r = kvm_read_guest(kvm, wall_clock, &version, sizeof(version)); 829 if (r) 830 return; 831 832 if (version & 1) 833 ++version; /* first time write, random junk */ 834 835 ++version; 836 837 kvm_write_guest(kvm, wall_clock, &version, sizeof(version)); 838 839 /* 840 * The guest calculates current wall clock time by adding 841 * system time (updated by kvm_write_guest_time below) to the 842 * wall clock specified here. guest system time equals host 843 * system time for us, thus we must fill in host boot time here. 
844 */ 845 getboottime(&boot); 846 847 wc.sec = boot.tv_sec; 848 wc.nsec = boot.tv_nsec; 849 wc.version = version; 850 851 kvm_write_guest(kvm, wall_clock, &wc, sizeof(wc)); 852 853 version++; 854 kvm_write_guest(kvm, wall_clock, &version, sizeof(version)); 855 } 856 857 static uint32_t div_frac(uint32_t dividend, uint32_t divisor) 858 { 859 uint32_t quotient, remainder; 860 861 /* Don't try to replace with do_div(), this one calculates 862 * "(dividend << 32) / divisor" */ 863 __asm__ ( "divl %4" 864 : "=a" (quotient), "=d" (remainder) 865 : "0" (0), "1" (dividend), "r" (divisor) ); 866 return quotient; 867 } 868 869 static void kvm_set_time_scale(uint32_t tsc_khz, struct pvclock_vcpu_time_info *hv_clock) 870 { 871 uint64_t nsecs = 1000000000LL; 872 int32_t shift = 0; 873 uint64_t tps64; 874 uint32_t tps32; 875 876 tps64 = tsc_khz * 1000LL; 877 while (tps64 > nsecs*2) { 878 tps64 >>= 1; 879 shift--; 880 } 881 882 tps32 = (uint32_t)tps64; 883 while (tps32 <= (uint32_t)nsecs) { 884 tps32 <<= 1; 885 shift++; 886 } 887 888 hv_clock->tsc_shift = shift; 889 hv_clock->tsc_to_system_mul = div_frac(nsecs, tps32); 890 891 pr_debug("%s: tsc_khz %u, tsc_shift %d, tsc_mul %u\n", 892 __func__, tsc_khz, hv_clock->tsc_shift, 893 hv_clock->tsc_to_system_mul); 894 } 895 896 static DEFINE_PER_CPU(unsigned long, cpu_tsc_khz); 897 898 static void kvm_write_guest_time(struct kvm_vcpu *v) 899 { 900 struct timespec ts; 901 unsigned long flags; 902 struct kvm_vcpu_arch *vcpu = &v->arch; 903 void *shared_kaddr; 904 unsigned long this_tsc_khz; 905 906 if ((!vcpu->time_page)) 907 return; 908 909 this_tsc_khz = get_cpu_var(cpu_tsc_khz); 910 if (unlikely(vcpu->hv_clock_tsc_khz != this_tsc_khz)) { 911 kvm_set_time_scale(this_tsc_khz, &vcpu->hv_clock); 912 vcpu->hv_clock_tsc_khz = this_tsc_khz; 913 } 914 put_cpu_var(cpu_tsc_khz); 915 916 /* Keep irq disabled to prevent changes to the clock */ 917 local_irq_save(flags); 918 kvm_get_msr(v, MSR_IA32_TSC, &vcpu->hv_clock.tsc_timestamp); 919 ktime_get_ts(&ts); 920 monotonic_to_bootbased(&ts); 921 local_irq_restore(flags); 922 923 /* With all the info we got, fill in the values */ 924 925 vcpu->hv_clock.system_time = ts.tv_nsec + 926 (NSEC_PER_SEC * (u64)ts.tv_sec) + v->kvm->arch.kvmclock_offset; 927 928 vcpu->hv_clock.flags = 0; 929 930 /* 931 * The interface expects us to write an even number signaling that the 932 * update is finished. Since the guest won't see the intermediate 933 * state, we just increase by 2 at the end. 934 */ 935 vcpu->hv_clock.version += 2; 936 937 shared_kaddr = kmap_atomic(vcpu->time_page, KM_USER0); 938 939 memcpy(shared_kaddr + vcpu->time_offset, &vcpu->hv_clock, 940 sizeof(vcpu->hv_clock)); 941 942 kunmap_atomic(shared_kaddr, KM_USER0); 943 944 mark_page_dirty(v->kvm, vcpu->time >> PAGE_SHIFT); 945 } 946 947 static int kvm_request_guest_time_update(struct kvm_vcpu *v) 948 { 949 struct kvm_vcpu_arch *vcpu = &v->arch; 950 951 if (!vcpu->time_page) 952 return 0; 953 kvm_make_request(KVM_REQ_KVMCLOCK_UPDATE, v); 954 return 1; 955 } 956 957 static bool msr_mtrr_valid(unsigned msr) 958 { 959 switch (msr) { 960 case 0x200 ... 
0x200 + 2 * KVM_NR_VAR_MTRR - 1: 961 case MSR_MTRRfix64K_00000: 962 case MSR_MTRRfix16K_80000: 963 case MSR_MTRRfix16K_A0000: 964 case MSR_MTRRfix4K_C0000: 965 case MSR_MTRRfix4K_C8000: 966 case MSR_MTRRfix4K_D0000: 967 case MSR_MTRRfix4K_D8000: 968 case MSR_MTRRfix4K_E0000: 969 case MSR_MTRRfix4K_E8000: 970 case MSR_MTRRfix4K_F0000: 971 case MSR_MTRRfix4K_F8000: 972 case MSR_MTRRdefType: 973 case MSR_IA32_CR_PAT: 974 return true; 975 case 0x2f8: 976 return true; 977 } 978 return false; 979 } 980 981 static bool valid_pat_type(unsigned t) 982 { 983 return t < 8 && (1 << t) & 0xf3; /* 0, 1, 4, 5, 6, 7 */ 984 } 985 986 static bool valid_mtrr_type(unsigned t) 987 { 988 return t < 8 && (1 << t) & 0x73; /* 0, 1, 4, 5, 6 */ 989 } 990 991 static bool mtrr_valid(struct kvm_vcpu *vcpu, u32 msr, u64 data) 992 { 993 int i; 994 995 if (!msr_mtrr_valid(msr)) 996 return false; 997 998 if (msr == MSR_IA32_CR_PAT) { 999 for (i = 0; i < 8; i++) 1000 if (!valid_pat_type((data >> (i * 8)) & 0xff)) 1001 return false; 1002 return true; 1003 } else if (msr == MSR_MTRRdefType) { 1004 if (data & ~0xcff) 1005 return false; 1006 return valid_mtrr_type(data & 0xff); 1007 } else if (msr >= MSR_MTRRfix64K_00000 && msr <= MSR_MTRRfix4K_F8000) { 1008 for (i = 0; i < 8 ; i++) 1009 if (!valid_mtrr_type((data >> (i * 8)) & 0xff)) 1010 return false; 1011 return true; 1012 } 1013 1014 /* variable MTRRs */ 1015 return valid_mtrr_type(data & 0xff); 1016 } 1017 1018 static int set_msr_mtrr(struct kvm_vcpu *vcpu, u32 msr, u64 data) 1019 { 1020 u64 *p = (u64 *)&vcpu->arch.mtrr_state.fixed_ranges; 1021 1022 if (!mtrr_valid(vcpu, msr, data)) 1023 return 1; 1024 1025 if (msr == MSR_MTRRdefType) { 1026 vcpu->arch.mtrr_state.def_type = data; 1027 vcpu->arch.mtrr_state.enabled = (data & 0xc00) >> 10; 1028 } else if (msr == MSR_MTRRfix64K_00000) 1029 p[0] = data; 1030 else if (msr == MSR_MTRRfix16K_80000 || msr == MSR_MTRRfix16K_A0000) 1031 p[1 + msr - MSR_MTRRfix16K_80000] = data; 1032 else if (msr >= MSR_MTRRfix4K_C0000 && msr <= MSR_MTRRfix4K_F8000) 1033 p[3 + msr - MSR_MTRRfix4K_C0000] = data; 1034 else if (msr == MSR_IA32_CR_PAT) 1035 vcpu->arch.pat = data; 1036 else { /* Variable MTRRs */ 1037 int idx, is_mtrr_mask; 1038 u64 *pt; 1039 1040 idx = (msr - 0x200) / 2; 1041 is_mtrr_mask = msr - 0x200 - 2 * idx; 1042 if (!is_mtrr_mask) 1043 pt = 1044 (u64 *)&vcpu->arch.mtrr_state.var_ranges[idx].base_lo; 1045 else 1046 pt = 1047 (u64 *)&vcpu->arch.mtrr_state.var_ranges[idx].mask_lo; 1048 *pt = data; 1049 } 1050 1051 kvm_mmu_reset_context(vcpu); 1052 return 0; 1053 } 1054 1055 static int set_msr_mce(struct kvm_vcpu *vcpu, u32 msr, u64 data) 1056 { 1057 u64 mcg_cap = vcpu->arch.mcg_cap; 1058 unsigned bank_num = mcg_cap & 0xff; 1059 1060 switch (msr) { 1061 case MSR_IA32_MCG_STATUS: 1062 vcpu->arch.mcg_status = data; 1063 break; 1064 case MSR_IA32_MCG_CTL: 1065 if (!(mcg_cap & MCG_CTL_P)) 1066 return 1; 1067 if (data != 0 && data != ~(u64)0) 1068 return -1; 1069 vcpu->arch.mcg_ctl = data; 1070 break; 1071 default: 1072 if (msr >= MSR_IA32_MC0_CTL && 1073 msr < MSR_IA32_MC0_CTL + 4 * bank_num) { 1074 u32 offset = msr - MSR_IA32_MC0_CTL; 1075 /* only 0 or all 1s can be written to IA32_MCi_CTL 1076 * some Linux kernels though clear bit 10 in bank 4 to 1077 * workaround a BIOS/GART TBL issue on AMD K8s, ignore 1078 * this to avoid an uncatched #GP in the guest 1079 */ 1080 if ((offset & 0x3) == 0 && 1081 data != 0 && (data | (1 << 10)) != ~(u64)0) 1082 return -1; 1083 vcpu->arch.mce_banks[offset] = data; 1084 break; 1085 } 1086 return 1; 1087 
} 1088 return 0; 1089 } 1090 1091 static int xen_hvm_config(struct kvm_vcpu *vcpu, u64 data) 1092 { 1093 struct kvm *kvm = vcpu->kvm; 1094 int lm = is_long_mode(vcpu); 1095 u8 *blob_addr = lm ? (u8 *)(long)kvm->arch.xen_hvm_config.blob_addr_64 1096 : (u8 *)(long)kvm->arch.xen_hvm_config.blob_addr_32; 1097 u8 blob_size = lm ? kvm->arch.xen_hvm_config.blob_size_64 1098 : kvm->arch.xen_hvm_config.blob_size_32; 1099 u32 page_num = data & ~PAGE_MASK; 1100 u64 page_addr = data & PAGE_MASK; 1101 u8 *page; 1102 int r; 1103 1104 r = -E2BIG; 1105 if (page_num >= blob_size) 1106 goto out; 1107 r = -ENOMEM; 1108 page = kzalloc(PAGE_SIZE, GFP_KERNEL); 1109 if (!page) 1110 goto out; 1111 r = -EFAULT; 1112 if (copy_from_user(page, blob_addr + (page_num * PAGE_SIZE), PAGE_SIZE)) 1113 goto out_free; 1114 if (kvm_write_guest(kvm, page_addr, page, PAGE_SIZE)) 1115 goto out_free; 1116 r = 0; 1117 out_free: 1118 kfree(page); 1119 out: 1120 return r; 1121 } 1122 1123 static bool kvm_hv_hypercall_enabled(struct kvm *kvm) 1124 { 1125 return kvm->arch.hv_hypercall & HV_X64_MSR_HYPERCALL_ENABLE; 1126 } 1127 1128 static bool kvm_hv_msr_partition_wide(u32 msr) 1129 { 1130 bool r = false; 1131 switch (msr) { 1132 case HV_X64_MSR_GUEST_OS_ID: 1133 case HV_X64_MSR_HYPERCALL: 1134 r = true; 1135 break; 1136 } 1137 1138 return r; 1139 } 1140 1141 static int set_msr_hyperv_pw(struct kvm_vcpu *vcpu, u32 msr, u64 data) 1142 { 1143 struct kvm *kvm = vcpu->kvm; 1144 1145 switch (msr) { 1146 case HV_X64_MSR_GUEST_OS_ID: 1147 kvm->arch.hv_guest_os_id = data; 1148 /* setting guest os id to zero disables hypercall page */ 1149 if (!kvm->arch.hv_guest_os_id) 1150 kvm->arch.hv_hypercall &= ~HV_X64_MSR_HYPERCALL_ENABLE; 1151 break; 1152 case HV_X64_MSR_HYPERCALL: { 1153 u64 gfn; 1154 unsigned long addr; 1155 u8 instructions[4]; 1156 1157 /* if guest os id is not set hypercall should remain disabled */ 1158 if (!kvm->arch.hv_guest_os_id) 1159 break; 1160 if (!(data & HV_X64_MSR_HYPERCALL_ENABLE)) { 1161 kvm->arch.hv_hypercall = data; 1162 break; 1163 } 1164 gfn = data >> HV_X64_MSR_HYPERCALL_PAGE_ADDRESS_SHIFT; 1165 addr = gfn_to_hva(kvm, gfn); 1166 if (kvm_is_error_hva(addr)) 1167 return 1; 1168 kvm_x86_ops->patch_hypercall(vcpu, instructions); 1169 ((unsigned char *)instructions)[3] = 0xc3; /* ret */ 1170 if (copy_to_user((void __user *)addr, instructions, 4)) 1171 return 1; 1172 kvm->arch.hv_hypercall = data; 1173 break; 1174 } 1175 default: 1176 pr_unimpl(vcpu, "HYPER-V unimplemented wrmsr: 0x%x " 1177 "data 0x%llx\n", msr, data); 1178 return 1; 1179 } 1180 return 0; 1181 } 1182 1183 static int set_msr_hyperv(struct kvm_vcpu *vcpu, u32 msr, u64 data) 1184 { 1185 switch (msr) { 1186 case HV_X64_MSR_APIC_ASSIST_PAGE: { 1187 unsigned long addr; 1188 1189 if (!(data & HV_X64_MSR_APIC_ASSIST_PAGE_ENABLE)) { 1190 vcpu->arch.hv_vapic = data; 1191 break; 1192 } 1193 addr = gfn_to_hva(vcpu->kvm, data >> 1194 HV_X64_MSR_APIC_ASSIST_PAGE_ADDRESS_SHIFT); 1195 if (kvm_is_error_hva(addr)) 1196 return 1; 1197 if (clear_user((void __user *)addr, PAGE_SIZE)) 1198 return 1; 1199 vcpu->arch.hv_vapic = data; 1200 break; 1201 } 1202 case HV_X64_MSR_EOI: 1203 return kvm_hv_vapic_msr_write(vcpu, APIC_EOI, data); 1204 case HV_X64_MSR_ICR: 1205 return kvm_hv_vapic_msr_write(vcpu, APIC_ICR, data); 1206 case HV_X64_MSR_TPR: 1207 return kvm_hv_vapic_msr_write(vcpu, APIC_TASKPRI, data); 1208 default: 1209 pr_unimpl(vcpu, "HYPER-V unimplemented wrmsr: 0x%x " 1210 "data 0x%llx\n", msr, data); 1211 return 1; 1212 } 1213 1214 return 0; 1215 } 1216 1217 int 
kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data) 1218 { 1219 switch (msr) { 1220 case MSR_EFER: 1221 return set_efer(vcpu, data); 1222 case MSR_K7_HWCR: 1223 data &= ~(u64)0x40; /* ignore flush filter disable */ 1224 data &= ~(u64)0x100; /* ignore ignne emulation enable */ 1225 if (data != 0) { 1226 pr_unimpl(vcpu, "unimplemented HWCR wrmsr: 0x%llx\n", 1227 data); 1228 return 1; 1229 } 1230 break; 1231 case MSR_FAM10H_MMIO_CONF_BASE: 1232 if (data != 0) { 1233 pr_unimpl(vcpu, "unimplemented MMIO_CONF_BASE wrmsr: " 1234 "0x%llx\n", data); 1235 return 1; 1236 } 1237 break; 1238 case MSR_AMD64_NB_CFG: 1239 break; 1240 case MSR_IA32_DEBUGCTLMSR: 1241 if (!data) { 1242 /* We support the non-activated case already */ 1243 break; 1244 } else if (data & ~(DEBUGCTLMSR_LBR | DEBUGCTLMSR_BTF)) { 1245 /* Values other than LBR and BTF are vendor-specific, 1246 thus reserved and should throw a #GP */ 1247 return 1; 1248 } 1249 pr_unimpl(vcpu, "%s: MSR_IA32_DEBUGCTLMSR 0x%llx, nop\n", 1250 __func__, data); 1251 break; 1252 case MSR_IA32_UCODE_REV: 1253 case MSR_IA32_UCODE_WRITE: 1254 case MSR_VM_HSAVE_PA: 1255 case MSR_AMD64_PATCH_LOADER: 1256 break; 1257 case 0x200 ... 0x2ff: 1258 return set_msr_mtrr(vcpu, msr, data); 1259 case MSR_IA32_APICBASE: 1260 kvm_set_apic_base(vcpu, data); 1261 break; 1262 case APIC_BASE_MSR ... APIC_BASE_MSR + 0x3ff: 1263 return kvm_x2apic_msr_write(vcpu, msr, data); 1264 case MSR_IA32_MISC_ENABLE: 1265 vcpu->arch.ia32_misc_enable_msr = data; 1266 break; 1267 case MSR_KVM_WALL_CLOCK_NEW: 1268 case MSR_KVM_WALL_CLOCK: 1269 vcpu->kvm->arch.wall_clock = data; 1270 kvm_write_wall_clock(vcpu->kvm, data); 1271 break; 1272 case MSR_KVM_SYSTEM_TIME_NEW: 1273 case MSR_KVM_SYSTEM_TIME: { 1274 if (vcpu->arch.time_page) { 1275 kvm_release_page_dirty(vcpu->arch.time_page); 1276 vcpu->arch.time_page = NULL; 1277 } 1278 1279 vcpu->arch.time = data; 1280 1281 /* we verify if the enable bit is set... */ 1282 if (!(data & 1)) 1283 break; 1284 1285 /* ...but clean it before doing the actual write */ 1286 vcpu->arch.time_offset = data & ~(PAGE_MASK | 1); 1287 1288 vcpu->arch.time_page = 1289 gfn_to_page(vcpu->kvm, data >> PAGE_SHIFT); 1290 1291 if (is_error_page(vcpu->arch.time_page)) { 1292 kvm_release_page_clean(vcpu->arch.time_page); 1293 vcpu->arch.time_page = NULL; 1294 } 1295 1296 kvm_request_guest_time_update(vcpu); 1297 break; 1298 } 1299 case MSR_IA32_MCG_CTL: 1300 case MSR_IA32_MCG_STATUS: 1301 case MSR_IA32_MC0_CTL ... MSR_IA32_MC0_CTL + 4 * KVM_MAX_MCE_BANKS - 1: 1302 return set_msr_mce(vcpu, msr, data); 1303 1304 /* Performance counters are not protected by a CPUID bit, 1305 * so we should check all of them in the generic path for the sake of 1306 * cross vendor migration. 1307 * Writing a zero into the event select MSRs disables them, 1308 * which we perfectly emulate ;-). Any other value should be at least 1309 * reported, some guests depend on them. 1310 */ 1311 case MSR_P6_EVNTSEL0: 1312 case MSR_P6_EVNTSEL1: 1313 case MSR_K7_EVNTSEL0: 1314 case MSR_K7_EVNTSEL1: 1315 case MSR_K7_EVNTSEL2: 1316 case MSR_K7_EVNTSEL3: 1317 if (data != 0) 1318 pr_unimpl(vcpu, "unimplemented perfctr wrmsr: " 1319 "0x%x data 0x%llx\n", msr, data); 1320 break; 1321 /* at least RHEL 4 unconditionally writes to the perfctr registers, 1322 * so we ignore writes to make it happy. 
1323 */ 1324 case MSR_P6_PERFCTR0: 1325 case MSR_P6_PERFCTR1: 1326 case MSR_K7_PERFCTR0: 1327 case MSR_K7_PERFCTR1: 1328 case MSR_K7_PERFCTR2: 1329 case MSR_K7_PERFCTR3: 1330 pr_unimpl(vcpu, "unimplemented perfctr wrmsr: " 1331 "0x%x data 0x%llx\n", msr, data); 1332 break; 1333 case HV_X64_MSR_GUEST_OS_ID ... HV_X64_MSR_SINT15: 1334 if (kvm_hv_msr_partition_wide(msr)) { 1335 int r; 1336 mutex_lock(&vcpu->kvm->lock); 1337 r = set_msr_hyperv_pw(vcpu, msr, data); 1338 mutex_unlock(&vcpu->kvm->lock); 1339 return r; 1340 } else 1341 return set_msr_hyperv(vcpu, msr, data); 1342 break; 1343 default: 1344 if (msr && (msr == vcpu->kvm->arch.xen_hvm_config.msr)) 1345 return xen_hvm_config(vcpu, data); 1346 if (!ignore_msrs) { 1347 pr_unimpl(vcpu, "unhandled wrmsr: 0x%x data %llx\n", 1348 msr, data); 1349 return 1; 1350 } else { 1351 pr_unimpl(vcpu, "ignored wrmsr: 0x%x data %llx\n", 1352 msr, data); 1353 break; 1354 } 1355 } 1356 return 0; 1357 } 1358 EXPORT_SYMBOL_GPL(kvm_set_msr_common); 1359 1360 1361 /* 1362 * Reads an msr value (of 'msr_index') into 'pdata'. 1363 * Returns 0 on success, non-0 otherwise. 1364 * Assumes vcpu_load() was already called. 1365 */ 1366 int kvm_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata) 1367 { 1368 return kvm_x86_ops->get_msr(vcpu, msr_index, pdata); 1369 } 1370 1371 static int get_msr_mtrr(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) 1372 { 1373 u64 *p = (u64 *)&vcpu->arch.mtrr_state.fixed_ranges; 1374 1375 if (!msr_mtrr_valid(msr)) 1376 return 1; 1377 1378 if (msr == MSR_MTRRdefType) 1379 *pdata = vcpu->arch.mtrr_state.def_type + 1380 (vcpu->arch.mtrr_state.enabled << 10); 1381 else if (msr == MSR_MTRRfix64K_00000) 1382 *pdata = p[0]; 1383 else if (msr == MSR_MTRRfix16K_80000 || msr == MSR_MTRRfix16K_A0000) 1384 *pdata = p[1 + msr - MSR_MTRRfix16K_80000]; 1385 else if (msr >= MSR_MTRRfix4K_C0000 && msr <= MSR_MTRRfix4K_F8000) 1386 *pdata = p[3 + msr - MSR_MTRRfix4K_C0000]; 1387 else if (msr == MSR_IA32_CR_PAT) 1388 *pdata = vcpu->arch.pat; 1389 else { /* Variable MTRRs */ 1390 int idx, is_mtrr_mask; 1391 u64 *pt; 1392 1393 idx = (msr - 0x200) / 2; 1394 is_mtrr_mask = msr - 0x200 - 2 * idx; 1395 if (!is_mtrr_mask) 1396 pt = 1397 (u64 *)&vcpu->arch.mtrr_state.var_ranges[idx].base_lo; 1398 else 1399 pt = 1400 (u64 *)&vcpu->arch.mtrr_state.var_ranges[idx].mask_lo; 1401 *pdata = *pt; 1402 } 1403 1404 return 0; 1405 } 1406 1407 static int get_msr_mce(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) 1408 { 1409 u64 data; 1410 u64 mcg_cap = vcpu->arch.mcg_cap; 1411 unsigned bank_num = mcg_cap & 0xff; 1412 1413 switch (msr) { 1414 case MSR_IA32_P5_MC_ADDR: 1415 case MSR_IA32_P5_MC_TYPE: 1416 data = 0; 1417 break; 1418 case MSR_IA32_MCG_CAP: 1419 data = vcpu->arch.mcg_cap; 1420 break; 1421 case MSR_IA32_MCG_CTL: 1422 if (!(mcg_cap & MCG_CTL_P)) 1423 return 1; 1424 data = vcpu->arch.mcg_ctl; 1425 break; 1426 case MSR_IA32_MCG_STATUS: 1427 data = vcpu->arch.mcg_status; 1428 break; 1429 default: 1430 if (msr >= MSR_IA32_MC0_CTL && 1431 msr < MSR_IA32_MC0_CTL + 4 * bank_num) { 1432 u32 offset = msr - MSR_IA32_MC0_CTL; 1433 data = vcpu->arch.mce_banks[offset]; 1434 break; 1435 } 1436 return 1; 1437 } 1438 *pdata = data; 1439 return 0; 1440 } 1441 1442 static int get_msr_hyperv_pw(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) 1443 { 1444 u64 data = 0; 1445 struct kvm *kvm = vcpu->kvm; 1446 1447 switch (msr) { 1448 case HV_X64_MSR_GUEST_OS_ID: 1449 data = kvm->arch.hv_guest_os_id; 1450 break; 1451 case HV_X64_MSR_HYPERCALL: 1452 data = kvm->arch.hv_hypercall; 1453 break; 1454 
default: 1455 pr_unimpl(vcpu, "Hyper-V unhandled rdmsr: 0x%x\n", msr); 1456 return 1; 1457 } 1458 1459 *pdata = data; 1460 return 0; 1461 } 1462 1463 static int get_msr_hyperv(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) 1464 { 1465 u64 data = 0; 1466 1467 switch (msr) { 1468 case HV_X64_MSR_VP_INDEX: { 1469 int r; 1470 struct kvm_vcpu *v; 1471 kvm_for_each_vcpu(r, v, vcpu->kvm) 1472 if (v == vcpu) 1473 data = r; 1474 break; 1475 } 1476 case HV_X64_MSR_EOI: 1477 return kvm_hv_vapic_msr_read(vcpu, APIC_EOI, pdata); 1478 case HV_X64_MSR_ICR: 1479 return kvm_hv_vapic_msr_read(vcpu, APIC_ICR, pdata); 1480 case HV_X64_MSR_TPR: 1481 return kvm_hv_vapic_msr_read(vcpu, APIC_TASKPRI, pdata); 1482 default: 1483 pr_unimpl(vcpu, "Hyper-V unhandled rdmsr: 0x%x\n", msr); 1484 return 1; 1485 } 1486 *pdata = data; 1487 return 0; 1488 } 1489 1490 int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) 1491 { 1492 u64 data; 1493 1494 switch (msr) { 1495 case MSR_IA32_PLATFORM_ID: 1496 case MSR_IA32_UCODE_REV: 1497 case MSR_IA32_EBL_CR_POWERON: 1498 case MSR_IA32_DEBUGCTLMSR: 1499 case MSR_IA32_LASTBRANCHFROMIP: 1500 case MSR_IA32_LASTBRANCHTOIP: 1501 case MSR_IA32_LASTINTFROMIP: 1502 case MSR_IA32_LASTINTTOIP: 1503 case MSR_K8_SYSCFG: 1504 case MSR_K7_HWCR: 1505 case MSR_VM_HSAVE_PA: 1506 case MSR_P6_PERFCTR0: 1507 case MSR_P6_PERFCTR1: 1508 case MSR_P6_EVNTSEL0: 1509 case MSR_P6_EVNTSEL1: 1510 case MSR_K7_EVNTSEL0: 1511 case MSR_K7_PERFCTR0: 1512 case MSR_K8_INT_PENDING_MSG: 1513 case MSR_AMD64_NB_CFG: 1514 case MSR_FAM10H_MMIO_CONF_BASE: 1515 data = 0; 1516 break; 1517 case MSR_MTRRcap: 1518 data = 0x500 | KVM_NR_VAR_MTRR; 1519 break; 1520 case 0x200 ... 0x2ff: 1521 return get_msr_mtrr(vcpu, msr, pdata); 1522 case 0xcd: /* fsb frequency */ 1523 data = 3; 1524 break; 1525 case MSR_IA32_APICBASE: 1526 data = kvm_get_apic_base(vcpu); 1527 break; 1528 case APIC_BASE_MSR ... APIC_BASE_MSR + 0x3ff: 1529 return kvm_x2apic_msr_read(vcpu, msr, pdata); 1530 break; 1531 case MSR_IA32_MISC_ENABLE: 1532 data = vcpu->arch.ia32_misc_enable_msr; 1533 break; 1534 case MSR_IA32_PERF_STATUS: 1535 /* TSC increment by tick */ 1536 data = 1000ULL; 1537 /* CPU multiplier */ 1538 data |= (((uint64_t)4ULL) << 40); 1539 break; 1540 case MSR_EFER: 1541 data = vcpu->arch.efer; 1542 break; 1543 case MSR_KVM_WALL_CLOCK: 1544 case MSR_KVM_WALL_CLOCK_NEW: 1545 data = vcpu->kvm->arch.wall_clock; 1546 break; 1547 case MSR_KVM_SYSTEM_TIME: 1548 case MSR_KVM_SYSTEM_TIME_NEW: 1549 data = vcpu->arch.time; 1550 break; 1551 case MSR_IA32_P5_MC_ADDR: 1552 case MSR_IA32_P5_MC_TYPE: 1553 case MSR_IA32_MCG_CAP: 1554 case MSR_IA32_MCG_CTL: 1555 case MSR_IA32_MCG_STATUS: 1556 case MSR_IA32_MC0_CTL ... MSR_IA32_MC0_CTL + 4 * KVM_MAX_MCE_BANKS - 1: 1557 return get_msr_mce(vcpu, msr, pdata); 1558 case HV_X64_MSR_GUEST_OS_ID ... HV_X64_MSR_SINT15: 1559 if (kvm_hv_msr_partition_wide(msr)) { 1560 int r; 1561 mutex_lock(&vcpu->kvm->lock); 1562 r = get_msr_hyperv_pw(vcpu, msr, pdata); 1563 mutex_unlock(&vcpu->kvm->lock); 1564 return r; 1565 } else 1566 return get_msr_hyperv(vcpu, msr, pdata); 1567 break; 1568 default: 1569 if (!ignore_msrs) { 1570 pr_unimpl(vcpu, "unhandled rdmsr: 0x%x\n", msr); 1571 return 1; 1572 } else { 1573 pr_unimpl(vcpu, "ignored rdmsr: 0x%x\n", msr); 1574 data = 0; 1575 } 1576 break; 1577 } 1578 *pdata = data; 1579 return 0; 1580 } 1581 EXPORT_SYMBOL_GPL(kvm_get_msr_common); 1582 1583 /* 1584 * Read or write a bunch of msrs. All parameters are kernel addresses. 1585 * 1586 * @return number of msrs set successfully. 
1587 */ 1588 static int __msr_io(struct kvm_vcpu *vcpu, struct kvm_msrs *msrs, 1589 struct kvm_msr_entry *entries, 1590 int (*do_msr)(struct kvm_vcpu *vcpu, 1591 unsigned index, u64 *data)) 1592 { 1593 int i, idx; 1594 1595 idx = srcu_read_lock(&vcpu->kvm->srcu); 1596 for (i = 0; i < msrs->nmsrs; ++i) 1597 if (do_msr(vcpu, entries[i].index, &entries[i].data)) 1598 break; 1599 srcu_read_unlock(&vcpu->kvm->srcu, idx); 1600 1601 return i; 1602 } 1603 1604 /* 1605 * Read or write a bunch of msrs. Parameters are user addresses. 1606 * 1607 * @return number of msrs set successfully. 1608 */ 1609 static int msr_io(struct kvm_vcpu *vcpu, struct kvm_msrs __user *user_msrs, 1610 int (*do_msr)(struct kvm_vcpu *vcpu, 1611 unsigned index, u64 *data), 1612 int writeback) 1613 { 1614 struct kvm_msrs msrs; 1615 struct kvm_msr_entry *entries; 1616 int r, n; 1617 unsigned size; 1618 1619 r = -EFAULT; 1620 if (copy_from_user(&msrs, user_msrs, sizeof msrs)) 1621 goto out; 1622 1623 r = -E2BIG; 1624 if (msrs.nmsrs >= MAX_IO_MSRS) 1625 goto out; 1626 1627 r = -ENOMEM; 1628 size = sizeof(struct kvm_msr_entry) * msrs.nmsrs; 1629 entries = kmalloc(size, GFP_KERNEL); 1630 if (!entries) 1631 goto out; 1632 1633 r = -EFAULT; 1634 if (copy_from_user(entries, user_msrs->entries, size)) 1635 goto out_free; 1636 1637 r = n = __msr_io(vcpu, &msrs, entries, do_msr); 1638 if (r < 0) 1639 goto out_free; 1640 1641 r = -EFAULT; 1642 if (writeback && copy_to_user(user_msrs->entries, entries, size)) 1643 goto out_free; 1644 1645 r = n; 1646 1647 out_free: 1648 kfree(entries); 1649 out: 1650 return r; 1651 } 1652 1653 int kvm_dev_ioctl_check_extension(long ext) 1654 { 1655 int r; 1656 1657 switch (ext) { 1658 case KVM_CAP_IRQCHIP: 1659 case KVM_CAP_HLT: 1660 case KVM_CAP_MMU_SHADOW_CACHE_CONTROL: 1661 case KVM_CAP_SET_TSS_ADDR: 1662 case KVM_CAP_EXT_CPUID: 1663 case KVM_CAP_CLOCKSOURCE: 1664 case KVM_CAP_PIT: 1665 case KVM_CAP_NOP_IO_DELAY: 1666 case KVM_CAP_MP_STATE: 1667 case KVM_CAP_SYNC_MMU: 1668 case KVM_CAP_REINJECT_CONTROL: 1669 case KVM_CAP_IRQ_INJECT_STATUS: 1670 case KVM_CAP_ASSIGN_DEV_IRQ: 1671 case KVM_CAP_IRQFD: 1672 case KVM_CAP_IOEVENTFD: 1673 case KVM_CAP_PIT2: 1674 case KVM_CAP_PIT_STATE2: 1675 case KVM_CAP_SET_IDENTITY_MAP_ADDR: 1676 case KVM_CAP_XEN_HVM: 1677 case KVM_CAP_ADJUST_CLOCK: 1678 case KVM_CAP_VCPU_EVENTS: 1679 case KVM_CAP_HYPERV: 1680 case KVM_CAP_HYPERV_VAPIC: 1681 case KVM_CAP_HYPERV_SPIN: 1682 case KVM_CAP_PCI_SEGMENT: 1683 case KVM_CAP_DEBUGREGS: 1684 case KVM_CAP_X86_ROBUST_SINGLESTEP: 1685 case KVM_CAP_XSAVE: 1686 r = 1; 1687 break; 1688 case KVM_CAP_COALESCED_MMIO: 1689 r = KVM_COALESCED_MMIO_PAGE_OFFSET; 1690 break; 1691 case KVM_CAP_VAPIC: 1692 r = !kvm_x86_ops->cpu_has_accelerated_tpr(); 1693 break; 1694 case KVM_CAP_NR_VCPUS: 1695 r = KVM_MAX_VCPUS; 1696 break; 1697 case KVM_CAP_NR_MEMSLOTS: 1698 r = KVM_MEMORY_SLOTS; 1699 break; 1700 case KVM_CAP_PV_MMU: /* obsolete */ 1701 r = 0; 1702 break; 1703 case KVM_CAP_IOMMU: 1704 r = iommu_found(); 1705 break; 1706 case KVM_CAP_MCE: 1707 r = KVM_MAX_MCE_BANKS; 1708 break; 1709 case KVM_CAP_XCRS: 1710 r = cpu_has_xsave; 1711 break; 1712 default: 1713 r = 0; 1714 break; 1715 } 1716 return r; 1717 1718 } 1719 1720 long kvm_arch_dev_ioctl(struct file *filp, 1721 unsigned int ioctl, unsigned long arg) 1722 { 1723 void __user *argp = (void __user *)arg; 1724 long r; 1725 1726 switch (ioctl) { 1727 case KVM_GET_MSR_INDEX_LIST: { 1728 struct kvm_msr_list __user *user_msr_list = argp; 1729 struct kvm_msr_list msr_list; 1730 unsigned n; 1731 1732 r = 
-EFAULT; 1733 if (copy_from_user(&msr_list, user_msr_list, sizeof msr_list)) 1734 goto out; 1735 n = msr_list.nmsrs; 1736 msr_list.nmsrs = num_msrs_to_save + ARRAY_SIZE(emulated_msrs); 1737 if (copy_to_user(user_msr_list, &msr_list, sizeof msr_list)) 1738 goto out; 1739 r = -E2BIG; 1740 if (n < msr_list.nmsrs) 1741 goto out; 1742 r = -EFAULT; 1743 if (copy_to_user(user_msr_list->indices, &msrs_to_save, 1744 num_msrs_to_save * sizeof(u32))) 1745 goto out; 1746 if (copy_to_user(user_msr_list->indices + num_msrs_to_save, 1747 &emulated_msrs, 1748 ARRAY_SIZE(emulated_msrs) * sizeof(u32))) 1749 goto out; 1750 r = 0; 1751 break; 1752 } 1753 case KVM_GET_SUPPORTED_CPUID: { 1754 struct kvm_cpuid2 __user *cpuid_arg = argp; 1755 struct kvm_cpuid2 cpuid; 1756 1757 r = -EFAULT; 1758 if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid)) 1759 goto out; 1760 r = kvm_dev_ioctl_get_supported_cpuid(&cpuid, 1761 cpuid_arg->entries); 1762 if (r) 1763 goto out; 1764 1765 r = -EFAULT; 1766 if (copy_to_user(cpuid_arg, &cpuid, sizeof cpuid)) 1767 goto out; 1768 r = 0; 1769 break; 1770 } 1771 case KVM_X86_GET_MCE_CAP_SUPPORTED: { 1772 u64 mce_cap; 1773 1774 mce_cap = KVM_MCE_CAP_SUPPORTED; 1775 r = -EFAULT; 1776 if (copy_to_user(argp, &mce_cap, sizeof mce_cap)) 1777 goto out; 1778 r = 0; 1779 break; 1780 } 1781 default: 1782 r = -EINVAL; 1783 } 1784 out: 1785 return r; 1786 } 1787 1788 static void wbinvd_ipi(void *garbage) 1789 { 1790 wbinvd(); 1791 } 1792 1793 static bool need_emulate_wbinvd(struct kvm_vcpu *vcpu) 1794 { 1795 return vcpu->kvm->arch.iommu_domain && 1796 !(vcpu->kvm->arch.iommu_flags & KVM_IOMMU_CACHE_COHERENCY); 1797 } 1798 1799 void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) 1800 { 1801 /* Address WBINVD may be executed by guest */ 1802 if (need_emulate_wbinvd(vcpu)) { 1803 if (kvm_x86_ops->has_wbinvd_exit()) 1804 cpumask_set_cpu(cpu, vcpu->arch.wbinvd_dirty_mask); 1805 else if (vcpu->cpu != -1 && vcpu->cpu != cpu) 1806 smp_call_function_single(vcpu->cpu, 1807 wbinvd_ipi, NULL, 1); 1808 } 1809 1810 kvm_x86_ops->vcpu_load(vcpu, cpu); 1811 if (unlikely(per_cpu(cpu_tsc_khz, cpu) == 0)) { 1812 unsigned long khz = cpufreq_quick_get(cpu); 1813 if (!khz) 1814 khz = tsc_khz; 1815 per_cpu(cpu_tsc_khz, cpu) = khz; 1816 } 1817 kvm_request_guest_time_update(vcpu); 1818 } 1819 1820 void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu) 1821 { 1822 kvm_x86_ops->vcpu_put(vcpu); 1823 kvm_put_guest_fpu(vcpu); 1824 } 1825 1826 static int is_efer_nx(void) 1827 { 1828 unsigned long long efer = 0; 1829 1830 rdmsrl_safe(MSR_EFER, &efer); 1831 return efer & EFER_NX; 1832 } 1833 1834 static void cpuid_fix_nx_cap(struct kvm_vcpu *vcpu) 1835 { 1836 int i; 1837 struct kvm_cpuid_entry2 *e, *entry; 1838 1839 entry = NULL; 1840 for (i = 0; i < vcpu->arch.cpuid_nent; ++i) { 1841 e = &vcpu->arch.cpuid_entries[i]; 1842 if (e->function == 0x80000001) { 1843 entry = e; 1844 break; 1845 } 1846 } 1847 if (entry && (entry->edx & (1 << 20)) && !is_efer_nx()) { 1848 entry->edx &= ~(1 << 20); 1849 printk(KERN_INFO "kvm: guest NX capability removed\n"); 1850 } 1851 } 1852 1853 /* when an old userspace process fills a new kernel module */ 1854 static int kvm_vcpu_ioctl_set_cpuid(struct kvm_vcpu *vcpu, 1855 struct kvm_cpuid *cpuid, 1856 struct kvm_cpuid_entry __user *entries) 1857 { 1858 int r, i; 1859 struct kvm_cpuid_entry *cpuid_entries; 1860 1861 r = -E2BIG; 1862 if (cpuid->nent > KVM_MAX_CPUID_ENTRIES) 1863 goto out; 1864 r = -ENOMEM; 1865 cpuid_entries = vmalloc(sizeof(struct kvm_cpuid_entry) * cpuid->nent); 1866 if (!cpuid_entries) 
1867 goto out; 1868 r = -EFAULT; 1869 if (copy_from_user(cpuid_entries, entries, 1870 cpuid->nent * sizeof(struct kvm_cpuid_entry))) 1871 goto out_free; 1872 for (i = 0; i < cpuid->nent; i++) { 1873 vcpu->arch.cpuid_entries[i].function = cpuid_entries[i].function; 1874 vcpu->arch.cpuid_entries[i].eax = cpuid_entries[i].eax; 1875 vcpu->arch.cpuid_entries[i].ebx = cpuid_entries[i].ebx; 1876 vcpu->arch.cpuid_entries[i].ecx = cpuid_entries[i].ecx; 1877 vcpu->arch.cpuid_entries[i].edx = cpuid_entries[i].edx; 1878 vcpu->arch.cpuid_entries[i].index = 0; 1879 vcpu->arch.cpuid_entries[i].flags = 0; 1880 vcpu->arch.cpuid_entries[i].padding[0] = 0; 1881 vcpu->arch.cpuid_entries[i].padding[1] = 0; 1882 vcpu->arch.cpuid_entries[i].padding[2] = 0; 1883 } 1884 vcpu->arch.cpuid_nent = cpuid->nent; 1885 cpuid_fix_nx_cap(vcpu); 1886 r = 0; 1887 kvm_apic_set_version(vcpu); 1888 kvm_x86_ops->cpuid_update(vcpu); 1889 update_cpuid(vcpu); 1890 1891 out_free: 1892 vfree(cpuid_entries); 1893 out: 1894 return r; 1895 } 1896 1897 static int kvm_vcpu_ioctl_set_cpuid2(struct kvm_vcpu *vcpu, 1898 struct kvm_cpuid2 *cpuid, 1899 struct kvm_cpuid_entry2 __user *entries) 1900 { 1901 int r; 1902 1903 r = -E2BIG; 1904 if (cpuid->nent > KVM_MAX_CPUID_ENTRIES) 1905 goto out; 1906 r = -EFAULT; 1907 if (copy_from_user(&vcpu->arch.cpuid_entries, entries, 1908 cpuid->nent * sizeof(struct kvm_cpuid_entry2))) 1909 goto out; 1910 vcpu->arch.cpuid_nent = cpuid->nent; 1911 kvm_apic_set_version(vcpu); 1912 kvm_x86_ops->cpuid_update(vcpu); 1913 update_cpuid(vcpu); 1914 return 0; 1915 1916 out: 1917 return r; 1918 } 1919 1920 static int kvm_vcpu_ioctl_get_cpuid2(struct kvm_vcpu *vcpu, 1921 struct kvm_cpuid2 *cpuid, 1922 struct kvm_cpuid_entry2 __user *entries) 1923 { 1924 int r; 1925 1926 r = -E2BIG; 1927 if (cpuid->nent < vcpu->arch.cpuid_nent) 1928 goto out; 1929 r = -EFAULT; 1930 if (copy_to_user(entries, &vcpu->arch.cpuid_entries, 1931 vcpu->arch.cpuid_nent * sizeof(struct kvm_cpuid_entry2))) 1932 goto out; 1933 return 0; 1934 1935 out: 1936 cpuid->nent = vcpu->arch.cpuid_nent; 1937 return r; 1938 } 1939 1940 static void do_cpuid_1_ent(struct kvm_cpuid_entry2 *entry, u32 function, 1941 u32 index) 1942 { 1943 entry->function = function; 1944 entry->index = index; 1945 cpuid_count(entry->function, entry->index, 1946 &entry->eax, &entry->ebx, &entry->ecx, &entry->edx); 1947 entry->flags = 0; 1948 } 1949 1950 #define F(x) bit(X86_FEATURE_##x) 1951 1952 static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, 1953 u32 index, int *nent, int maxnent) 1954 { 1955 unsigned f_nx = is_efer_nx() ? F(NX) : 0; 1956 #ifdef CONFIG_X86_64 1957 unsigned f_gbpages = (kvm_x86_ops->get_lpage_level() == PT_PDPE_LEVEL) 1958 ? F(GBPAGES) : 0; 1959 unsigned f_lm = F(LM); 1960 #else 1961 unsigned f_gbpages = 0; 1962 unsigned f_lm = 0; 1963 #endif 1964 unsigned f_rdtscp = kvm_x86_ops->rdtscp_supported() ? 
F(RDTSCP) : 0; 1965 1966 /* cpuid 1.edx */ 1967 const u32 kvm_supported_word0_x86_features = 1968 F(FPU) | F(VME) | F(DE) | F(PSE) | 1969 F(TSC) | F(MSR) | F(PAE) | F(MCE) | 1970 F(CX8) | F(APIC) | 0 /* Reserved */ | F(SEP) | 1971 F(MTRR) | F(PGE) | F(MCA) | F(CMOV) | 1972 F(PAT) | F(PSE36) | 0 /* PSN */ | F(CLFLSH) | 1973 0 /* Reserved, DS, ACPI */ | F(MMX) | 1974 F(FXSR) | F(XMM) | F(XMM2) | F(SELFSNOOP) | 1975 0 /* HTT, TM, Reserved, PBE */; 1976 /* cpuid 0x80000001.edx */ 1977 const u32 kvm_supported_word1_x86_features = 1978 F(FPU) | F(VME) | F(DE) | F(PSE) | 1979 F(TSC) | F(MSR) | F(PAE) | F(MCE) | 1980 F(CX8) | F(APIC) | 0 /* Reserved */ | F(SYSCALL) | 1981 F(MTRR) | F(PGE) | F(MCA) | F(CMOV) | 1982 F(PAT) | F(PSE36) | 0 /* Reserved */ | 1983 f_nx | 0 /* Reserved */ | F(MMXEXT) | F(MMX) | 1984 F(FXSR) | F(FXSR_OPT) | f_gbpages | f_rdtscp | 1985 0 /* Reserved */ | f_lm | F(3DNOWEXT) | F(3DNOW); 1986 /* cpuid 1.ecx */ 1987 const u32 kvm_supported_word4_x86_features = 1988 F(XMM3) | F(PCLMULQDQ) | 0 /* DTES64, MONITOR */ | 1989 0 /* DS-CPL, VMX, SMX, EST */ | 1990 0 /* TM2 */ | F(SSSE3) | 0 /* CNXT-ID */ | 0 /* Reserved */ | 1991 0 /* Reserved */ | F(CX16) | 0 /* xTPR Update, PDCM */ | 1992 0 /* Reserved, DCA */ | F(XMM4_1) | 1993 F(XMM4_2) | F(X2APIC) | F(MOVBE) | F(POPCNT) | 1994 0 /* Reserved, AES */ | F(XSAVE) | 0 /* OSXSAVE */ | F(AVX); 1995 /* cpuid 0x80000001.ecx */ 1996 const u32 kvm_supported_word6_x86_features = 1997 F(LAHF_LM) | F(CMP_LEGACY) | F(SVM) | 0 /* ExtApicSpace */ | 1998 F(CR8_LEGACY) | F(ABM) | F(SSE4A) | F(MISALIGNSSE) | 1999 F(3DNOWPREFETCH) | 0 /* OSVW */ | 0 /* IBS */ | F(SSE5) | 2000 0 /* SKINIT */ | 0 /* WDT */; 2001 2002 /* all calls to cpuid_count() should be made on the same cpu */ 2003 get_cpu(); 2004 do_cpuid_1_ent(entry, function, index); 2005 ++*nent; 2006 2007 switch (function) { 2008 case 0: 2009 entry->eax = min(entry->eax, (u32)0xd); 2010 break; 2011 case 1: 2012 entry->edx &= kvm_supported_word0_x86_features; 2013 entry->ecx &= kvm_supported_word4_x86_features; 2014 /* we support x2apic emulation even if host does not support 2015 * it since we emulate x2apic in software */ 2016 entry->ecx |= F(X2APIC); 2017 break; 2018 /* function 2 entries are STATEFUL. That is, repeated cpuid commands 2019 * may return different values. This forces us to get_cpu() before 2020 * issuing the first command, and also to emulate this annoying behavior 2021 * in kvm_emulate_cpuid() using KVM_CPUID_FLAG_STATE_READ_NEXT */ 2022 case 2: { 2023 int t, times = entry->eax & 0xff; 2024 2025 entry->flags |= KVM_CPUID_FLAG_STATEFUL_FUNC; 2026 entry->flags |= KVM_CPUID_FLAG_STATE_READ_NEXT; 2027 for (t = 1; t < times && *nent < maxnent; ++t) { 2028 do_cpuid_1_ent(&entry[t], function, 0); 2029 entry[t].flags |= KVM_CPUID_FLAG_STATEFUL_FUNC; 2030 ++*nent; 2031 } 2032 break; 2033 } 2034 /* function 4 and 0xb have additional index. 
*/ 2035 case 4: { 2036 int i, cache_type; 2037 2038 entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX; 2039 /* read more entries until cache_type is zero */ 2040 for (i = 1; *nent < maxnent; ++i) { 2041 cache_type = entry[i - 1].eax & 0x1f; 2042 if (!cache_type) 2043 break; 2044 do_cpuid_1_ent(&entry[i], function, i); 2045 entry[i].flags |= 2046 KVM_CPUID_FLAG_SIGNIFCANT_INDEX; 2047 ++*nent; 2048 } 2049 break; 2050 } 2051 case 0xb: { 2052 int i, level_type; 2053 2054 entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX; 2055 /* read more entries until level_type is zero */ 2056 for (i = 1; *nent < maxnent; ++i) { 2057 level_type = entry[i - 1].ecx & 0xff00; 2058 if (!level_type) 2059 break; 2060 do_cpuid_1_ent(&entry[i], function, i); 2061 entry[i].flags |= 2062 KVM_CPUID_FLAG_SIGNIFCANT_INDEX; 2063 ++*nent; 2064 } 2065 break; 2066 } 2067 case 0xd: { 2068 int i; 2069 2070 entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX; 2071 for (i = 1; *nent < maxnent; ++i) { 2072 if (entry[i - 1].eax == 0 && i != 2) 2073 break; 2074 do_cpuid_1_ent(&entry[i], function, i); 2075 entry[i].flags |= 2076 KVM_CPUID_FLAG_SIGNIFCANT_INDEX; 2077 ++*nent; 2078 } 2079 break; 2080 } 2081 case KVM_CPUID_SIGNATURE: { 2082 char signature[12] = "KVMKVMKVM\0\0"; 2083 u32 *sigptr = (u32 *)signature; 2084 entry->eax = 0; 2085 entry->ebx = sigptr[0]; 2086 entry->ecx = sigptr[1]; 2087 entry->edx = sigptr[2]; 2088 break; 2089 } 2090 case KVM_CPUID_FEATURES: 2091 entry->eax = (1 << KVM_FEATURE_CLOCKSOURCE) | 2092 (1 << KVM_FEATURE_NOP_IO_DELAY) | 2093 (1 << KVM_FEATURE_CLOCKSOURCE2) | 2094 (1 << KVM_FEATURE_CLOCKSOURCE_STABLE_BIT); 2095 entry->ebx = 0; 2096 entry->ecx = 0; 2097 entry->edx = 0; 2098 break; 2099 case 0x80000000: 2100 entry->eax = min(entry->eax, 0x8000001a); 2101 break; 2102 case 0x80000001: 2103 entry->edx &= kvm_supported_word1_x86_features; 2104 entry->ecx &= kvm_supported_word6_x86_features; 2105 break; 2106 } 2107 2108 kvm_x86_ops->set_supported_cpuid(function, entry); 2109 2110 put_cpu(); 2111 } 2112 2113 #undef F 2114 2115 static int kvm_dev_ioctl_get_supported_cpuid(struct kvm_cpuid2 *cpuid, 2116 struct kvm_cpuid_entry2 __user *entries) 2117 { 2118 struct kvm_cpuid_entry2 *cpuid_entries; 2119 int limit, nent = 0, r = -E2BIG; 2120 u32 func; 2121 2122 if (cpuid->nent < 1) 2123 goto out; 2124 if (cpuid->nent > KVM_MAX_CPUID_ENTRIES) 2125 cpuid->nent = KVM_MAX_CPUID_ENTRIES; 2126 r = -ENOMEM; 2127 cpuid_entries = vmalloc(sizeof(struct kvm_cpuid_entry2) * cpuid->nent); 2128 if (!cpuid_entries) 2129 goto out; 2130 2131 do_cpuid_ent(&cpuid_entries[0], 0, 0, &nent, cpuid->nent); 2132 limit = cpuid_entries[0].eax; 2133 for (func = 1; func <= limit && nent < cpuid->nent; ++func) 2134 do_cpuid_ent(&cpuid_entries[nent], func, 0, 2135 &nent, cpuid->nent); 2136 r = -E2BIG; 2137 if (nent >= cpuid->nent) 2138 goto out_free; 2139 2140 do_cpuid_ent(&cpuid_entries[nent], 0x80000000, 0, &nent, cpuid->nent); 2141 limit = cpuid_entries[nent - 1].eax; 2142 for (func = 0x80000001; func <= limit && nent < cpuid->nent; ++func) 2143 do_cpuid_ent(&cpuid_entries[nent], func, 0, 2144 &nent, cpuid->nent); 2145 2146 2147 2148 r = -E2BIG; 2149 if (nent >= cpuid->nent) 2150 goto out_free; 2151 2152 do_cpuid_ent(&cpuid_entries[nent], KVM_CPUID_SIGNATURE, 0, &nent, 2153 cpuid->nent); 2154 2155 r = -E2BIG; 2156 if (nent >= cpuid->nent) 2157 goto out_free; 2158 2159 do_cpuid_ent(&cpuid_entries[nent], KVM_CPUID_FEATURES, 0, &nent, 2160 cpuid->nent); 2161 2162 r = -E2BIG; 2163 if (nent >= cpuid->nent) 2164 goto out_free; 2165 2166 r = -EFAULT; 2167 
if (copy_to_user(entries, cpuid_entries, 2168 nent * sizeof(struct kvm_cpuid_entry2))) 2169 goto out_free; 2170 cpuid->nent = nent; 2171 r = 0; 2172 2173 out_free: 2174 vfree(cpuid_entries); 2175 out: 2176 return r; 2177 } 2178 2179 static int kvm_vcpu_ioctl_get_lapic(struct kvm_vcpu *vcpu, 2180 struct kvm_lapic_state *s) 2181 { 2182 memcpy(s->regs, vcpu->arch.apic->regs, sizeof *s); 2183 2184 return 0; 2185 } 2186 2187 static int kvm_vcpu_ioctl_set_lapic(struct kvm_vcpu *vcpu, 2188 struct kvm_lapic_state *s) 2189 { 2190 memcpy(vcpu->arch.apic->regs, s->regs, sizeof *s); 2191 kvm_apic_post_state_restore(vcpu); 2192 update_cr8_intercept(vcpu); 2193 2194 return 0; 2195 } 2196 2197 static int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu, 2198 struct kvm_interrupt *irq) 2199 { 2200 if (irq->irq < 0 || irq->irq >= 256) 2201 return -EINVAL; 2202 if (irqchip_in_kernel(vcpu->kvm)) 2203 return -ENXIO; 2204 2205 kvm_queue_interrupt(vcpu, irq->irq, false); 2206 2207 return 0; 2208 } 2209 2210 static int kvm_vcpu_ioctl_nmi(struct kvm_vcpu *vcpu) 2211 { 2212 kvm_inject_nmi(vcpu); 2213 2214 return 0; 2215 } 2216 2217 static int vcpu_ioctl_tpr_access_reporting(struct kvm_vcpu *vcpu, 2218 struct kvm_tpr_access_ctl *tac) 2219 { 2220 if (tac->flags) 2221 return -EINVAL; 2222 vcpu->arch.tpr_access_reporting = !!tac->enabled; 2223 return 0; 2224 } 2225 2226 static int kvm_vcpu_ioctl_x86_setup_mce(struct kvm_vcpu *vcpu, 2227 u64 mcg_cap) 2228 { 2229 int r; 2230 unsigned bank_num = mcg_cap & 0xff, bank; 2231 2232 r = -EINVAL; 2233 if (!bank_num || bank_num >= KVM_MAX_MCE_BANKS) 2234 goto out; 2235 if (mcg_cap & ~(KVM_MCE_CAP_SUPPORTED | 0xff | 0xff0000)) 2236 goto out; 2237 r = 0; 2238 vcpu->arch.mcg_cap = mcg_cap; 2239 /* Init IA32_MCG_CTL to all 1s */ 2240 if (mcg_cap & MCG_CTL_P) 2241 vcpu->arch.mcg_ctl = ~(u64)0; 2242 /* Init IA32_MCi_CTL to all 1s */ 2243 for (bank = 0; bank < bank_num; bank++) 2244 vcpu->arch.mce_banks[bank*4] = ~(u64)0; 2245 out: 2246 return r; 2247 } 2248 2249 static int kvm_vcpu_ioctl_x86_set_mce(struct kvm_vcpu *vcpu, 2250 struct kvm_x86_mce *mce) 2251 { 2252 u64 mcg_cap = vcpu->arch.mcg_cap; 2253 unsigned bank_num = mcg_cap & 0xff; 2254 u64 *banks = vcpu->arch.mce_banks; 2255 2256 if (mce->bank >= bank_num || !(mce->status & MCI_STATUS_VAL)) 2257 return -EINVAL; 2258 /* 2259 * if IA32_MCG_CTL is not all 1s, the uncorrected error 2260 * reporting is disabled 2261 */ 2262 if ((mce->status & MCI_STATUS_UC) && (mcg_cap & MCG_CTL_P) && 2263 vcpu->arch.mcg_ctl != ~(u64)0) 2264 return 0; 2265 banks += 4 * mce->bank; 2266 /* 2267 * if IA32_MCi_CTL is not all 1s, the uncorrected error 2268 * reporting is disabled for the bank 2269 */ 2270 if ((mce->status & MCI_STATUS_UC) && banks[0] != ~(u64)0) 2271 return 0; 2272 if (mce->status & MCI_STATUS_UC) { 2273 if ((vcpu->arch.mcg_status & MCG_STATUS_MCIP) || 2274 !kvm_read_cr4_bits(vcpu, X86_CR4_MCE)) { 2275 printk(KERN_DEBUG "kvm: set_mce: " 2276 "injects mce exception while " 2277 "previous one is in progress!\n"); 2278 kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu); 2279 return 0; 2280 } 2281 if (banks[1] & MCI_STATUS_VAL) 2282 mce->status |= MCI_STATUS_OVER; 2283 banks[2] = mce->addr; 2284 banks[3] = mce->misc; 2285 vcpu->arch.mcg_status = mce->mcg_status; 2286 banks[1] = mce->status; 2287 kvm_queue_exception(vcpu, MC_VECTOR); 2288 } else if (!(banks[1] & MCI_STATUS_VAL) 2289 || !(banks[1] & MCI_STATUS_UC)) { 2290 if (banks[1] & MCI_STATUS_VAL) 2291 mce->status |= MCI_STATUS_OVER; 2292 banks[2] = mce->addr; 2293 banks[3] = mce->misc; 2294 
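/*
 * Layout reminder: vcpu->arch.mce_banks models each MCE bank as four
 * consecutive u64s in IA32_MCi_{CTL,STATUS,ADDR,MISC} order (hence the
 * bank*4 indexing above), so the slots written in this branch -
 * banks[1], banks[2] and banks[3] - are the bank's STATUS, ADDR and
 * MISC registers.
 */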
banks[1] = mce->status; 2295 } else 2296 banks[1] |= MCI_STATUS_OVER; 2297 return 0; 2298 } 2299 2300 static void kvm_vcpu_ioctl_x86_get_vcpu_events(struct kvm_vcpu *vcpu, 2301 struct kvm_vcpu_events *events) 2302 { 2303 events->exception.injected = 2304 vcpu->arch.exception.pending && 2305 !kvm_exception_is_soft(vcpu->arch.exception.nr); 2306 events->exception.nr = vcpu->arch.exception.nr; 2307 events->exception.has_error_code = vcpu->arch.exception.has_error_code; 2308 events->exception.error_code = vcpu->arch.exception.error_code; 2309 2310 events->interrupt.injected = 2311 vcpu->arch.interrupt.pending && !vcpu->arch.interrupt.soft; 2312 events->interrupt.nr = vcpu->arch.interrupt.nr; 2313 events->interrupt.soft = 0; 2314 events->interrupt.shadow = 2315 kvm_x86_ops->get_interrupt_shadow(vcpu, 2316 KVM_X86_SHADOW_INT_MOV_SS | KVM_X86_SHADOW_INT_STI); 2317 2318 events->nmi.injected = vcpu->arch.nmi_injected; 2319 events->nmi.pending = vcpu->arch.nmi_pending; 2320 events->nmi.masked = kvm_x86_ops->get_nmi_mask(vcpu); 2321 2322 events->sipi_vector = vcpu->arch.sipi_vector; 2323 2324 events->flags = (KVM_VCPUEVENT_VALID_NMI_PENDING 2325 | KVM_VCPUEVENT_VALID_SIPI_VECTOR 2326 | KVM_VCPUEVENT_VALID_SHADOW); 2327 } 2328 2329 static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu, 2330 struct kvm_vcpu_events *events) 2331 { 2332 if (events->flags & ~(KVM_VCPUEVENT_VALID_NMI_PENDING 2333 | KVM_VCPUEVENT_VALID_SIPI_VECTOR 2334 | KVM_VCPUEVENT_VALID_SHADOW)) 2335 return -EINVAL; 2336 2337 vcpu->arch.exception.pending = events->exception.injected; 2338 vcpu->arch.exception.nr = events->exception.nr; 2339 vcpu->arch.exception.has_error_code = events->exception.has_error_code; 2340 vcpu->arch.exception.error_code = events->exception.error_code; 2341 2342 vcpu->arch.interrupt.pending = events->interrupt.injected; 2343 vcpu->arch.interrupt.nr = events->interrupt.nr; 2344 vcpu->arch.interrupt.soft = events->interrupt.soft; 2345 if (vcpu->arch.interrupt.pending && irqchip_in_kernel(vcpu->kvm)) 2346 kvm_pic_clear_isr_ack(vcpu->kvm); 2347 if (events->flags & KVM_VCPUEVENT_VALID_SHADOW) 2348 kvm_x86_ops->set_interrupt_shadow(vcpu, 2349 events->interrupt.shadow); 2350 2351 vcpu->arch.nmi_injected = events->nmi.injected; 2352 if (events->flags & KVM_VCPUEVENT_VALID_NMI_PENDING) 2353 vcpu->arch.nmi_pending = events->nmi.pending; 2354 kvm_x86_ops->set_nmi_mask(vcpu, events->nmi.masked); 2355 2356 if (events->flags & KVM_VCPUEVENT_VALID_SIPI_VECTOR) 2357 vcpu->arch.sipi_vector = events->sipi_vector; 2358 2359 return 0; 2360 } 2361 2362 static void kvm_vcpu_ioctl_x86_get_debugregs(struct kvm_vcpu *vcpu, 2363 struct kvm_debugregs *dbgregs) 2364 { 2365 memcpy(dbgregs->db, vcpu->arch.db, sizeof(vcpu->arch.db)); 2366 dbgregs->dr6 = vcpu->arch.dr6; 2367 dbgregs->dr7 = vcpu->arch.dr7; 2368 dbgregs->flags = 0; 2369 } 2370 2371 static int kvm_vcpu_ioctl_x86_set_debugregs(struct kvm_vcpu *vcpu, 2372 struct kvm_debugregs *dbgregs) 2373 { 2374 if (dbgregs->flags) 2375 return -EINVAL; 2376 2377 memcpy(vcpu->arch.db, dbgregs->db, sizeof(vcpu->arch.db)); 2378 vcpu->arch.dr6 = dbgregs->dr6; 2379 vcpu->arch.dr7 = dbgregs->dr7; 2380 2381 return 0; 2382 } 2383 2384 static void kvm_vcpu_ioctl_x86_get_xsave(struct kvm_vcpu *vcpu, 2385 struct kvm_xsave *guest_xsave) 2386 { 2387 if (cpu_has_xsave) 2388 memcpy(guest_xsave->region, 2389 &vcpu->arch.guest_fpu.state->xsave, 2390 sizeof(struct xsave_struct)); 2391 else { 2392 memcpy(guest_xsave->region, 2393 &vcpu->arch.guest_fpu.state->fxsave, 2394 sizeof(struct 
i387_fxsave_struct)); 2395 *(u64 *)&guest_xsave->region[XSAVE_HDR_OFFSET / sizeof(u32)] = 2396 XSTATE_FPSSE; 2397 } 2398 } 2399 2400 static int kvm_vcpu_ioctl_x86_set_xsave(struct kvm_vcpu *vcpu, 2401 struct kvm_xsave *guest_xsave) 2402 { 2403 u64 xstate_bv = 2404 *(u64 *)&guest_xsave->region[XSAVE_HDR_OFFSET / sizeof(u32)]; 2405 2406 if (cpu_has_xsave) 2407 memcpy(&vcpu->arch.guest_fpu.state->xsave, 2408 guest_xsave->region, sizeof(struct xsave_struct)); 2409 else { 2410 if (xstate_bv & ~XSTATE_FPSSE) 2411 return -EINVAL; 2412 memcpy(&vcpu->arch.guest_fpu.state->fxsave, 2413 guest_xsave->region, sizeof(struct i387_fxsave_struct)); 2414 } 2415 return 0; 2416 } 2417 2418 static void kvm_vcpu_ioctl_x86_get_xcrs(struct kvm_vcpu *vcpu, 2419 struct kvm_xcrs *guest_xcrs) 2420 { 2421 if (!cpu_has_xsave) { 2422 guest_xcrs->nr_xcrs = 0; 2423 return; 2424 } 2425 2426 guest_xcrs->nr_xcrs = 1; 2427 guest_xcrs->flags = 0; 2428 guest_xcrs->xcrs[0].xcr = XCR_XFEATURE_ENABLED_MASK; 2429 guest_xcrs->xcrs[0].value = vcpu->arch.xcr0; 2430 } 2431 2432 static int kvm_vcpu_ioctl_x86_set_xcrs(struct kvm_vcpu *vcpu, 2433 struct kvm_xcrs *guest_xcrs) 2434 { 2435 int i, r = 0; 2436 2437 if (!cpu_has_xsave) 2438 return -EINVAL; 2439 2440 if (guest_xcrs->nr_xcrs > KVM_MAX_XCRS || guest_xcrs->flags) 2441 return -EINVAL; 2442 2443 for (i = 0; i < guest_xcrs->nr_xcrs; i++) 2444 /* Only support XCR0 currently */ 2445 if (guest_xcrs->xcrs[0].xcr == XCR_XFEATURE_ENABLED_MASK) { 2446 r = __kvm_set_xcr(vcpu, XCR_XFEATURE_ENABLED_MASK, 2447 guest_xcrs->xcrs[0].value); 2448 break; 2449 } 2450 if (r) 2451 r = -EINVAL; 2452 return r; 2453 } 2454 2455 long kvm_arch_vcpu_ioctl(struct file *filp, 2456 unsigned int ioctl, unsigned long arg) 2457 { 2458 struct kvm_vcpu *vcpu = filp->private_data; 2459 void __user *argp = (void __user *)arg; 2460 int r; 2461 union { 2462 struct kvm_lapic_state *lapic; 2463 struct kvm_xsave *xsave; 2464 struct kvm_xcrs *xcrs; 2465 void *buffer; 2466 } u; 2467 2468 u.buffer = NULL; 2469 switch (ioctl) { 2470 case KVM_GET_LAPIC: { 2471 r = -EINVAL; 2472 if (!vcpu->arch.apic) 2473 goto out; 2474 u.lapic = kzalloc(sizeof(struct kvm_lapic_state), GFP_KERNEL); 2475 2476 r = -ENOMEM; 2477 if (!u.lapic) 2478 goto out; 2479 r = kvm_vcpu_ioctl_get_lapic(vcpu, u.lapic); 2480 if (r) 2481 goto out; 2482 r = -EFAULT; 2483 if (copy_to_user(argp, u.lapic, sizeof(struct kvm_lapic_state))) 2484 goto out; 2485 r = 0; 2486 break; 2487 } 2488 case KVM_SET_LAPIC: { 2489 r = -EINVAL; 2490 if (!vcpu->arch.apic) 2491 goto out; 2492 u.lapic = kmalloc(sizeof(struct kvm_lapic_state), GFP_KERNEL); 2493 r = -ENOMEM; 2494 if (!u.lapic) 2495 goto out; 2496 r = -EFAULT; 2497 if (copy_from_user(u.lapic, argp, sizeof(struct kvm_lapic_state))) 2498 goto out; 2499 r = kvm_vcpu_ioctl_set_lapic(vcpu, u.lapic); 2500 if (r) 2501 goto out; 2502 r = 0; 2503 break; 2504 } 2505 case KVM_INTERRUPT: { 2506 struct kvm_interrupt irq; 2507 2508 r = -EFAULT; 2509 if (copy_from_user(&irq, argp, sizeof irq)) 2510 goto out; 2511 r = kvm_vcpu_ioctl_interrupt(vcpu, &irq); 2512 if (r) 2513 goto out; 2514 r = 0; 2515 break; 2516 } 2517 case KVM_NMI: { 2518 r = kvm_vcpu_ioctl_nmi(vcpu); 2519 if (r) 2520 goto out; 2521 r = 0; 2522 break; 2523 } 2524 case KVM_SET_CPUID: { 2525 struct kvm_cpuid __user *cpuid_arg = argp; 2526 struct kvm_cpuid cpuid; 2527 2528 r = -EFAULT; 2529 if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid)) 2530 goto out; 2531 r = kvm_vcpu_ioctl_set_cpuid(vcpu, &cpuid, cpuid_arg->entries); 2532 if (r) 2533 goto out; 2534 break; 2535 } 2536 
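/*
 * Illustrative userspace sketch (not kernel code; kvm_fd and vcpu_fd are
 * assumed to come from open("/dev/kvm") and KVM_CREATE_VCPU, headers
 * <linux/kvm.h>, <sys/ioctl.h> and <stdlib.h> are assumed): a VMM
 * typically feeds the KVM_SET_CPUID2 handler below by copying the
 * host-supported leaves wholesale, roughly:
 *
 *	struct kvm_cpuid2 *c;
 *
 *	c = calloc(1, sizeof(*c) + 100 * sizeof(struct kvm_cpuid_entry2));
 *	c->nent = 100;
 *	if (ioctl(kvm_fd, KVM_GET_SUPPORTED_CPUID, c) == 0)
 *		ioctl(vcpu_fd, KVM_SET_CPUID2, c);
 *
 * Error handling (including the -E2BIG resize case) is omitted.
 */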
case KVM_SET_CPUID2: { 2537 struct kvm_cpuid2 __user *cpuid_arg = argp; 2538 struct kvm_cpuid2 cpuid; 2539 2540 r = -EFAULT; 2541 if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid)) 2542 goto out; 2543 r = kvm_vcpu_ioctl_set_cpuid2(vcpu, &cpuid, 2544 cpuid_arg->entries); 2545 if (r) 2546 goto out; 2547 break; 2548 } 2549 case KVM_GET_CPUID2: { 2550 struct kvm_cpuid2 __user *cpuid_arg = argp; 2551 struct kvm_cpuid2 cpuid; 2552 2553 r = -EFAULT; 2554 if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid)) 2555 goto out; 2556 r = kvm_vcpu_ioctl_get_cpuid2(vcpu, &cpuid, 2557 cpuid_arg->entries); 2558 if (r) 2559 goto out; 2560 r = -EFAULT; 2561 if (copy_to_user(cpuid_arg, &cpuid, sizeof cpuid)) 2562 goto out; 2563 r = 0; 2564 break; 2565 } 2566 case KVM_GET_MSRS: 2567 r = msr_io(vcpu, argp, kvm_get_msr, 1); 2568 break; 2569 case KVM_SET_MSRS: 2570 r = msr_io(vcpu, argp, do_set_msr, 0); 2571 break; 2572 case KVM_TPR_ACCESS_REPORTING: { 2573 struct kvm_tpr_access_ctl tac; 2574 2575 r = -EFAULT; 2576 if (copy_from_user(&tac, argp, sizeof tac)) 2577 goto out; 2578 r = vcpu_ioctl_tpr_access_reporting(vcpu, &tac); 2579 if (r) 2580 goto out; 2581 r = -EFAULT; 2582 if (copy_to_user(argp, &tac, sizeof tac)) 2583 goto out; 2584 r = 0; 2585 break; 2586 }; 2587 case KVM_SET_VAPIC_ADDR: { 2588 struct kvm_vapic_addr va; 2589 2590 r = -EINVAL; 2591 if (!irqchip_in_kernel(vcpu->kvm)) 2592 goto out; 2593 r = -EFAULT; 2594 if (copy_from_user(&va, argp, sizeof va)) 2595 goto out; 2596 r = 0; 2597 kvm_lapic_set_vapic_addr(vcpu, va.vapic_addr); 2598 break; 2599 } 2600 case KVM_X86_SETUP_MCE: { 2601 u64 mcg_cap; 2602 2603 r = -EFAULT; 2604 if (copy_from_user(&mcg_cap, argp, sizeof mcg_cap)) 2605 goto out; 2606 r = kvm_vcpu_ioctl_x86_setup_mce(vcpu, mcg_cap); 2607 break; 2608 } 2609 case KVM_X86_SET_MCE: { 2610 struct kvm_x86_mce mce; 2611 2612 r = -EFAULT; 2613 if (copy_from_user(&mce, argp, sizeof mce)) 2614 goto out; 2615 r = kvm_vcpu_ioctl_x86_set_mce(vcpu, &mce); 2616 break; 2617 } 2618 case KVM_GET_VCPU_EVENTS: { 2619 struct kvm_vcpu_events events; 2620 2621 kvm_vcpu_ioctl_x86_get_vcpu_events(vcpu, &events); 2622 2623 r = -EFAULT; 2624 if (copy_to_user(argp, &events, sizeof(struct kvm_vcpu_events))) 2625 break; 2626 r = 0; 2627 break; 2628 } 2629 case KVM_SET_VCPU_EVENTS: { 2630 struct kvm_vcpu_events events; 2631 2632 r = -EFAULT; 2633 if (copy_from_user(&events, argp, sizeof(struct kvm_vcpu_events))) 2634 break; 2635 2636 r = kvm_vcpu_ioctl_x86_set_vcpu_events(vcpu, &events); 2637 break; 2638 } 2639 case KVM_GET_DEBUGREGS: { 2640 struct kvm_debugregs dbgregs; 2641 2642 kvm_vcpu_ioctl_x86_get_debugregs(vcpu, &dbgregs); 2643 2644 r = -EFAULT; 2645 if (copy_to_user(argp, &dbgregs, 2646 sizeof(struct kvm_debugregs))) 2647 break; 2648 r = 0; 2649 break; 2650 } 2651 case KVM_SET_DEBUGREGS: { 2652 struct kvm_debugregs dbgregs; 2653 2654 r = -EFAULT; 2655 if (copy_from_user(&dbgregs, argp, 2656 sizeof(struct kvm_debugregs))) 2657 break; 2658 2659 r = kvm_vcpu_ioctl_x86_set_debugregs(vcpu, &dbgregs); 2660 break; 2661 } 2662 case KVM_GET_XSAVE: { 2663 u.xsave = kzalloc(sizeof(struct kvm_xsave), GFP_KERNEL); 2664 r = -ENOMEM; 2665 if (!u.xsave) 2666 break; 2667 2668 kvm_vcpu_ioctl_x86_get_xsave(vcpu, u.xsave); 2669 2670 r = -EFAULT; 2671 if (copy_to_user(argp, u.xsave, sizeof(struct kvm_xsave))) 2672 break; 2673 r = 0; 2674 break; 2675 } 2676 case KVM_SET_XSAVE: { 2677 u.xsave = kzalloc(sizeof(struct kvm_xsave), GFP_KERNEL); 2678 r = -ENOMEM; 2679 if (!u.xsave) 2680 break; 2681 2682 r = -EFAULT; 2683 if 
(copy_from_user(u.xsave, argp, sizeof(struct kvm_xsave))) 2684 break; 2685 2686 r = kvm_vcpu_ioctl_x86_set_xsave(vcpu, u.xsave); 2687 break; 2688 } 2689 case KVM_GET_XCRS: { 2690 u.xcrs = kzalloc(sizeof(struct kvm_xcrs), GFP_KERNEL); 2691 r = -ENOMEM; 2692 if (!u.xcrs) 2693 break; 2694 2695 kvm_vcpu_ioctl_x86_get_xcrs(vcpu, u.xcrs); 2696 2697 r = -EFAULT; 2698 if (copy_to_user(argp, u.xcrs, 2699 sizeof(struct kvm_xcrs))) 2700 break; 2701 r = 0; 2702 break; 2703 } 2704 case KVM_SET_XCRS: { 2705 u.xcrs = kzalloc(sizeof(struct kvm_xcrs), GFP_KERNEL); 2706 r = -ENOMEM; 2707 if (!u.xcrs) 2708 break; 2709 2710 r = -EFAULT; 2711 if (copy_from_user(u.xcrs, argp, 2712 sizeof(struct kvm_xcrs))) 2713 break; 2714 2715 r = kvm_vcpu_ioctl_x86_set_xcrs(vcpu, u.xcrs); 2716 break; 2717 } 2718 default: 2719 r = -EINVAL; 2720 } 2721 out: 2722 kfree(u.buffer); 2723 return r; 2724 } 2725 2726 static int kvm_vm_ioctl_set_tss_addr(struct kvm *kvm, unsigned long addr) 2727 { 2728 int ret; 2729 2730 if (addr > (unsigned int)(-3 * PAGE_SIZE)) 2731 return -1; 2732 ret = kvm_x86_ops->set_tss_addr(kvm, addr); 2733 return ret; 2734 } 2735 2736 static int kvm_vm_ioctl_set_identity_map_addr(struct kvm *kvm, 2737 u64 ident_addr) 2738 { 2739 kvm->arch.ept_identity_map_addr = ident_addr; 2740 return 0; 2741 } 2742 2743 static int kvm_vm_ioctl_set_nr_mmu_pages(struct kvm *kvm, 2744 u32 kvm_nr_mmu_pages) 2745 { 2746 if (kvm_nr_mmu_pages < KVM_MIN_ALLOC_MMU_PAGES) 2747 return -EINVAL; 2748 2749 mutex_lock(&kvm->slots_lock); 2750 spin_lock(&kvm->mmu_lock); 2751 2752 kvm_mmu_change_mmu_pages(kvm, kvm_nr_mmu_pages); 2753 kvm->arch.n_requested_mmu_pages = kvm_nr_mmu_pages; 2754 2755 spin_unlock(&kvm->mmu_lock); 2756 mutex_unlock(&kvm->slots_lock); 2757 return 0; 2758 } 2759 2760 static int kvm_vm_ioctl_get_nr_mmu_pages(struct kvm *kvm) 2761 { 2762 return kvm->arch.n_alloc_mmu_pages; 2763 } 2764 2765 static int kvm_vm_ioctl_get_irqchip(struct kvm *kvm, struct kvm_irqchip *chip) 2766 { 2767 int r; 2768 2769 r = 0; 2770 switch (chip->chip_id) { 2771 case KVM_IRQCHIP_PIC_MASTER: 2772 memcpy(&chip->chip.pic, 2773 &pic_irqchip(kvm)->pics[0], 2774 sizeof(struct kvm_pic_state)); 2775 break; 2776 case KVM_IRQCHIP_PIC_SLAVE: 2777 memcpy(&chip->chip.pic, 2778 &pic_irqchip(kvm)->pics[1], 2779 sizeof(struct kvm_pic_state)); 2780 break; 2781 case KVM_IRQCHIP_IOAPIC: 2782 r = kvm_get_ioapic(kvm, &chip->chip.ioapic); 2783 break; 2784 default: 2785 r = -EINVAL; 2786 break; 2787 } 2788 return r; 2789 } 2790 2791 static int kvm_vm_ioctl_set_irqchip(struct kvm *kvm, struct kvm_irqchip *chip) 2792 { 2793 int r; 2794 2795 r = 0; 2796 switch (chip->chip_id) { 2797 case KVM_IRQCHIP_PIC_MASTER: 2798 raw_spin_lock(&pic_irqchip(kvm)->lock); 2799 memcpy(&pic_irqchip(kvm)->pics[0], 2800 &chip->chip.pic, 2801 sizeof(struct kvm_pic_state)); 2802 raw_spin_unlock(&pic_irqchip(kvm)->lock); 2803 break; 2804 case KVM_IRQCHIP_PIC_SLAVE: 2805 raw_spin_lock(&pic_irqchip(kvm)->lock); 2806 memcpy(&pic_irqchip(kvm)->pics[1], 2807 &chip->chip.pic, 2808 sizeof(struct kvm_pic_state)); 2809 raw_spin_unlock(&pic_irqchip(kvm)->lock); 2810 break; 2811 case KVM_IRQCHIP_IOAPIC: 2812 r = kvm_set_ioapic(kvm, &chip->chip.ioapic); 2813 break; 2814 default: 2815 r = -EINVAL; 2816 break; 2817 } 2818 kvm_pic_update_irq(pic_irqchip(kvm)); 2819 return r; 2820 } 2821 2822 static int kvm_vm_ioctl_get_pit(struct kvm *kvm, struct kvm_pit_state *ps) 2823 { 2824 int r = 0; 2825 2826 mutex_lock(&kvm->arch.vpit->pit_state.lock); 2827 memcpy(ps, &kvm->arch.vpit->pit_state, sizeof(struct 
kvm_pit_state)); 2828 mutex_unlock(&kvm->arch.vpit->pit_state.lock); 2829 return r; 2830 } 2831 2832 static int kvm_vm_ioctl_set_pit(struct kvm *kvm, struct kvm_pit_state *ps) 2833 { 2834 int r = 0; 2835 2836 mutex_lock(&kvm->arch.vpit->pit_state.lock); 2837 memcpy(&kvm->arch.vpit->pit_state, ps, sizeof(struct kvm_pit_state)); 2838 kvm_pit_load_count(kvm, 0, ps->channels[0].count, 0); 2839 mutex_unlock(&kvm->arch.vpit->pit_state.lock); 2840 return r; 2841 } 2842 2843 static int kvm_vm_ioctl_get_pit2(struct kvm *kvm, struct kvm_pit_state2 *ps) 2844 { 2845 int r = 0; 2846 2847 mutex_lock(&kvm->arch.vpit->pit_state.lock); 2848 memcpy(ps->channels, &kvm->arch.vpit->pit_state.channels, 2849 sizeof(ps->channels)); 2850 ps->flags = kvm->arch.vpit->pit_state.flags; 2851 mutex_unlock(&kvm->arch.vpit->pit_state.lock); 2852 return r; 2853 } 2854 2855 static int kvm_vm_ioctl_set_pit2(struct kvm *kvm, struct kvm_pit_state2 *ps) 2856 { 2857 int r = 0, start = 0; 2858 u32 prev_legacy, cur_legacy; 2859 mutex_lock(&kvm->arch.vpit->pit_state.lock); 2860 prev_legacy = kvm->arch.vpit->pit_state.flags & KVM_PIT_FLAGS_HPET_LEGACY; 2861 cur_legacy = ps->flags & KVM_PIT_FLAGS_HPET_LEGACY; 2862 if (!prev_legacy && cur_legacy) 2863 start = 1; 2864 memcpy(&kvm->arch.vpit->pit_state.channels, &ps->channels, 2865 sizeof(kvm->arch.vpit->pit_state.channels)); 2866 kvm->arch.vpit->pit_state.flags = ps->flags; 2867 kvm_pit_load_count(kvm, 0, kvm->arch.vpit->pit_state.channels[0].count, start); 2868 mutex_unlock(&kvm->arch.vpit->pit_state.lock); 2869 return r; 2870 } 2871 2872 static int kvm_vm_ioctl_reinject(struct kvm *kvm, 2873 struct kvm_reinject_control *control) 2874 { 2875 if (!kvm->arch.vpit) 2876 return -ENXIO; 2877 mutex_lock(&kvm->arch.vpit->pit_state.lock); 2878 kvm->arch.vpit->pit_state.pit_timer.reinject = control->pit_reinject; 2879 mutex_unlock(&kvm->arch.vpit->pit_state.lock); 2880 return 0; 2881 } 2882 2883 /* 2884 * Get (and clear) the dirty memory log for a memory slot. 2885 */ 2886 int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, 2887 struct kvm_dirty_log *log) 2888 { 2889 int r, i; 2890 struct kvm_memory_slot *memslot; 2891 unsigned long n; 2892 unsigned long is_dirty = 0; 2893 2894 mutex_lock(&kvm->slots_lock); 2895 2896 r = -EINVAL; 2897 if (log->slot >= KVM_MEMORY_SLOTS) 2898 goto out; 2899 2900 memslot = &kvm->memslots->memslots[log->slot]; 2901 r = -ENOENT; 2902 if (!memslot->dirty_bitmap) 2903 goto out; 2904 2905 n = kvm_dirty_bitmap_bytes(memslot); 2906 2907 for (i = 0; !is_dirty && i < n/sizeof(long); i++) 2908 is_dirty = memslot->dirty_bitmap[i]; 2909 2910 /* If nothing is dirty, don't bother messing with page tables. 
*/ 2911 if (is_dirty) { 2912 struct kvm_memslots *slots, *old_slots; 2913 unsigned long *dirty_bitmap; 2914 2915 spin_lock(&kvm->mmu_lock); 2916 kvm_mmu_slot_remove_write_access(kvm, log->slot); 2917 spin_unlock(&kvm->mmu_lock); 2918 2919 r = -ENOMEM; 2920 dirty_bitmap = vmalloc(n); 2921 if (!dirty_bitmap) 2922 goto out; 2923 memset(dirty_bitmap, 0, n); 2924 2925 r = -ENOMEM; 2926 slots = kzalloc(sizeof(struct kvm_memslots), GFP_KERNEL); 2927 if (!slots) { 2928 vfree(dirty_bitmap); 2929 goto out; 2930 } 2931 memcpy(slots, kvm->memslots, sizeof(struct kvm_memslots)); 2932 slots->memslots[log->slot].dirty_bitmap = dirty_bitmap; 2933 2934 old_slots = kvm->memslots; 2935 rcu_assign_pointer(kvm->memslots, slots); 2936 synchronize_srcu_expedited(&kvm->srcu); 2937 dirty_bitmap = old_slots->memslots[log->slot].dirty_bitmap; 2938 kfree(old_slots); 2939 2940 r = -EFAULT; 2941 if (copy_to_user(log->dirty_bitmap, dirty_bitmap, n)) { 2942 vfree(dirty_bitmap); 2943 goto out; 2944 } 2945 vfree(dirty_bitmap); 2946 } else { 2947 r = -EFAULT; 2948 if (clear_user(log->dirty_bitmap, n)) 2949 goto out; 2950 } 2951 2952 r = 0; 2953 out: 2954 mutex_unlock(&kvm->slots_lock); 2955 return r; 2956 } 2957 2958 long kvm_arch_vm_ioctl(struct file *filp, 2959 unsigned int ioctl, unsigned long arg) 2960 { 2961 struct kvm *kvm = filp->private_data; 2962 void __user *argp = (void __user *)arg; 2963 int r = -ENOTTY; 2964 /* 2965 * This union makes it completely explicit to gcc-3.x 2966 * that these two variables' stack usage should be 2967 * combined, not added together. 2968 */ 2969 union { 2970 struct kvm_pit_state ps; 2971 struct kvm_pit_state2 ps2; 2972 struct kvm_pit_config pit_config; 2973 } u; 2974 2975 switch (ioctl) { 2976 case KVM_SET_TSS_ADDR: 2977 r = kvm_vm_ioctl_set_tss_addr(kvm, arg); 2978 if (r < 0) 2979 goto out; 2980 break; 2981 case KVM_SET_IDENTITY_MAP_ADDR: { 2982 u64 ident_addr; 2983 2984 r = -EFAULT; 2985 if (copy_from_user(&ident_addr, argp, sizeof ident_addr)) 2986 goto out; 2987 r = kvm_vm_ioctl_set_identity_map_addr(kvm, ident_addr); 2988 if (r < 0) 2989 goto out; 2990 break; 2991 } 2992 case KVM_SET_NR_MMU_PAGES: 2993 r = kvm_vm_ioctl_set_nr_mmu_pages(kvm, arg); 2994 if (r) 2995 goto out; 2996 break; 2997 case KVM_GET_NR_MMU_PAGES: 2998 r = kvm_vm_ioctl_get_nr_mmu_pages(kvm); 2999 break; 3000 case KVM_CREATE_IRQCHIP: { 3001 struct kvm_pic *vpic; 3002 3003 mutex_lock(&kvm->lock); 3004 r = -EEXIST; 3005 if (kvm->arch.vpic) 3006 goto create_irqchip_unlock; 3007 r = -ENOMEM; 3008 vpic = kvm_create_pic(kvm); 3009 if (vpic) { 3010 r = kvm_ioapic_init(kvm); 3011 if (r) { 3012 kvm_io_bus_unregister_dev(kvm, KVM_PIO_BUS, 3013 &vpic->dev); 3014 kfree(vpic); 3015 goto create_irqchip_unlock; 3016 } 3017 } else 3018 goto create_irqchip_unlock; 3019 smp_wmb(); 3020 kvm->arch.vpic = vpic; 3021 smp_wmb(); 3022 r = kvm_setup_default_irq_routing(kvm); 3023 if (r) { 3024 mutex_lock(&kvm->irq_lock); 3025 kvm_ioapic_destroy(kvm); 3026 kvm_destroy_pic(kvm); 3027 mutex_unlock(&kvm->irq_lock); 3028 } 3029 create_irqchip_unlock: 3030 mutex_unlock(&kvm->lock); 3031 break; 3032 } 3033 case KVM_CREATE_PIT: 3034 u.pit_config.flags = KVM_PIT_SPEAKER_DUMMY; 3035 goto create_pit; 3036 case KVM_CREATE_PIT2: 3037 r = -EFAULT; 3038 if (copy_from_user(&u.pit_config, argp, 3039 sizeof(struct kvm_pit_config))) 3040 goto out; 3041 create_pit: 3042 mutex_lock(&kvm->slots_lock); 3043 r = -EEXIST; 3044 if (kvm->arch.vpit) 3045 goto create_pit_unlock; 3046 r = -ENOMEM; 3047 kvm->arch.vpit = kvm_create_pit(kvm, u.pit_config.flags); 3048 if 
(kvm->arch.vpit) 3049 r = 0; 3050 create_pit_unlock: 3051 mutex_unlock(&kvm->slots_lock); 3052 break; 3053 case KVM_IRQ_LINE_STATUS: 3054 case KVM_IRQ_LINE: { 3055 struct kvm_irq_level irq_event; 3056 3057 r = -EFAULT; 3058 if (copy_from_user(&irq_event, argp, sizeof irq_event)) 3059 goto out; 3060 r = -ENXIO; 3061 if (irqchip_in_kernel(kvm)) { 3062 __s32 status; 3063 status = kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID, 3064 irq_event.irq, irq_event.level); 3065 if (ioctl == KVM_IRQ_LINE_STATUS) { 3066 r = -EFAULT; 3067 irq_event.status = status; 3068 if (copy_to_user(argp, &irq_event, 3069 sizeof irq_event)) 3070 goto out; 3071 } 3072 r = 0; 3073 } 3074 break; 3075 } 3076 case KVM_GET_IRQCHIP: { 3077 /* 0: PIC master, 1: PIC slave, 2: IOAPIC */ 3078 struct kvm_irqchip *chip = kmalloc(sizeof(*chip), GFP_KERNEL); 3079 3080 r = -ENOMEM; 3081 if (!chip) 3082 goto out; 3083 r = -EFAULT; 3084 if (copy_from_user(chip, argp, sizeof *chip)) 3085 goto get_irqchip_out; 3086 r = -ENXIO; 3087 if (!irqchip_in_kernel(kvm)) 3088 goto get_irqchip_out; 3089 r = kvm_vm_ioctl_get_irqchip(kvm, chip); 3090 if (r) 3091 goto get_irqchip_out; 3092 r = -EFAULT; 3093 if (copy_to_user(argp, chip, sizeof *chip)) 3094 goto get_irqchip_out; 3095 r = 0; 3096 get_irqchip_out: 3097 kfree(chip); 3098 if (r) 3099 goto out; 3100 break; 3101 } 3102 case KVM_SET_IRQCHIP: { 3103 /* 0: PIC master, 1: PIC slave, 2: IOAPIC */ 3104 struct kvm_irqchip *chip = kmalloc(sizeof(*chip), GFP_KERNEL); 3105 3106 r = -ENOMEM; 3107 if (!chip) 3108 goto out; 3109 r = -EFAULT; 3110 if (copy_from_user(chip, argp, sizeof *chip)) 3111 goto set_irqchip_out; 3112 r = -ENXIO; 3113 if (!irqchip_in_kernel(kvm)) 3114 goto set_irqchip_out; 3115 r = kvm_vm_ioctl_set_irqchip(kvm, chip); 3116 if (r) 3117 goto set_irqchip_out; 3118 r = 0; 3119 set_irqchip_out: 3120 kfree(chip); 3121 if (r) 3122 goto out; 3123 break; 3124 } 3125 case KVM_GET_PIT: { 3126 r = -EFAULT; 3127 if (copy_from_user(&u.ps, argp, sizeof(struct kvm_pit_state))) 3128 goto out; 3129 r = -ENXIO; 3130 if (!kvm->arch.vpit) 3131 goto out; 3132 r = kvm_vm_ioctl_get_pit(kvm, &u.ps); 3133 if (r) 3134 goto out; 3135 r = -EFAULT; 3136 if (copy_to_user(argp, &u.ps, sizeof(struct kvm_pit_state))) 3137 goto out; 3138 r = 0; 3139 break; 3140 } 3141 case KVM_SET_PIT: { 3142 r = -EFAULT; 3143 if (copy_from_user(&u.ps, argp, sizeof u.ps)) 3144 goto out; 3145 r = -ENXIO; 3146 if (!kvm->arch.vpit) 3147 goto out; 3148 r = kvm_vm_ioctl_set_pit(kvm, &u.ps); 3149 if (r) 3150 goto out; 3151 r = 0; 3152 break; 3153 } 3154 case KVM_GET_PIT2: { 3155 r = -ENXIO; 3156 if (!kvm->arch.vpit) 3157 goto out; 3158 r = kvm_vm_ioctl_get_pit2(kvm, &u.ps2); 3159 if (r) 3160 goto out; 3161 r = -EFAULT; 3162 if (copy_to_user(argp, &u.ps2, sizeof(u.ps2))) 3163 goto out; 3164 r = 0; 3165 break; 3166 } 3167 case KVM_SET_PIT2: { 3168 r = -EFAULT; 3169 if (copy_from_user(&u.ps2, argp, sizeof(u.ps2))) 3170 goto out; 3171 r = -ENXIO; 3172 if (!kvm->arch.vpit) 3173 goto out; 3174 r = kvm_vm_ioctl_set_pit2(kvm, &u.ps2); 3175 if (r) 3176 goto out; 3177 r = 0; 3178 break; 3179 } 3180 case KVM_REINJECT_CONTROL: { 3181 struct kvm_reinject_control control; 3182 r = -EFAULT; 3183 if (copy_from_user(&control, argp, sizeof(control))) 3184 goto out; 3185 r = kvm_vm_ioctl_reinject(kvm, &control); 3186 if (r) 3187 goto out; 3188 r = 0; 3189 break; 3190 } 3191 case KVM_XEN_HVM_CONFIG: { 3192 r = -EFAULT; 3193 if (copy_from_user(&kvm->arch.xen_hvm_config, argp, 3194 sizeof(struct kvm_xen_hvm_config))) 3195 goto out; 3196 r = -EINVAL; 3197 if 
(kvm->arch.xen_hvm_config.flags) 3198 goto out; 3199 r = 0; 3200 break; 3201 } 3202 case KVM_SET_CLOCK: { 3203 struct timespec now; 3204 struct kvm_clock_data user_ns; 3205 u64 now_ns; 3206 s64 delta; 3207 3208 r = -EFAULT; 3209 if (copy_from_user(&user_ns, argp, sizeof(user_ns))) 3210 goto out; 3211 3212 r = -EINVAL; 3213 if (user_ns.flags) 3214 goto out; 3215 3216 r = 0; 3217 ktime_get_ts(&now); 3218 now_ns = timespec_to_ns(&now); 3219 delta = user_ns.clock - now_ns; 3220 kvm->arch.kvmclock_offset = delta; 3221 break; 3222 } 3223 case KVM_GET_CLOCK: { 3224 struct timespec now; 3225 struct kvm_clock_data user_ns; 3226 u64 now_ns; 3227 3228 ktime_get_ts(&now); 3229 now_ns = timespec_to_ns(&now); 3230 user_ns.clock = kvm->arch.kvmclock_offset + now_ns; 3231 user_ns.flags = 0; 3232 3233 r = -EFAULT; 3234 if (copy_to_user(argp, &user_ns, sizeof(user_ns))) 3235 goto out; 3236 r = 0; 3237 break; 3238 } 3239 3240 default: 3241 ; 3242 } 3243 out: 3244 return r; 3245 } 3246 3247 static void kvm_init_msr_list(void) 3248 { 3249 u32 dummy[2]; 3250 unsigned i, j; 3251 3252 /* skip the first msrs in the list. KVM-specific */ 3253 for (i = j = KVM_SAVE_MSRS_BEGIN; i < ARRAY_SIZE(msrs_to_save); i++) { 3254 if (rdmsr_safe(msrs_to_save[i], &dummy[0], &dummy[1]) < 0) 3255 continue; 3256 if (j < i) 3257 msrs_to_save[j] = msrs_to_save[i]; 3258 j++; 3259 } 3260 num_msrs_to_save = j; 3261 } 3262 3263 static int vcpu_mmio_write(struct kvm_vcpu *vcpu, gpa_t addr, int len, 3264 const void *v) 3265 { 3266 if (vcpu->arch.apic && 3267 !kvm_iodevice_write(&vcpu->arch.apic->dev, addr, len, v)) 3268 return 0; 3269 3270 return kvm_io_bus_write(vcpu->kvm, KVM_MMIO_BUS, addr, len, v); 3271 } 3272 3273 static int vcpu_mmio_read(struct kvm_vcpu *vcpu, gpa_t addr, int len, void *v) 3274 { 3275 if (vcpu->arch.apic && 3276 !kvm_iodevice_read(&vcpu->arch.apic->dev, addr, len, v)) 3277 return 0; 3278 3279 return kvm_io_bus_read(vcpu->kvm, KVM_MMIO_BUS, addr, len, v); 3280 } 3281 3282 static void kvm_set_segment(struct kvm_vcpu *vcpu, 3283 struct kvm_segment *var, int seg) 3284 { 3285 kvm_x86_ops->set_segment(vcpu, var, seg); 3286 } 3287 3288 void kvm_get_segment(struct kvm_vcpu *vcpu, 3289 struct kvm_segment *var, int seg) 3290 { 3291 kvm_x86_ops->get_segment(vcpu, var, seg); 3292 } 3293 3294 gpa_t kvm_mmu_gva_to_gpa_read(struct kvm_vcpu *vcpu, gva_t gva, u32 *error) 3295 { 3296 u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0; 3297 return vcpu->arch.mmu.gva_to_gpa(vcpu, gva, access, error); 3298 } 3299 3300 gpa_t kvm_mmu_gva_to_gpa_fetch(struct kvm_vcpu *vcpu, gva_t gva, u32 *error) 3301 { 3302 u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0; 3303 access |= PFERR_FETCH_MASK; 3304 return vcpu->arch.mmu.gva_to_gpa(vcpu, gva, access, error); 3305 } 3306 3307 gpa_t kvm_mmu_gva_to_gpa_write(struct kvm_vcpu *vcpu, gva_t gva, u32 *error) 3308 { 3309 u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? 
PFERR_USER_MASK : 0; 3310 access |= PFERR_WRITE_MASK; 3311 return vcpu->arch.mmu.gva_to_gpa(vcpu, gva, access, error); 3312 } 3313 3314 /* uses this to access any guest's mapped memory without checking CPL */ 3315 gpa_t kvm_mmu_gva_to_gpa_system(struct kvm_vcpu *vcpu, gva_t gva, u32 *error) 3316 { 3317 return vcpu->arch.mmu.gva_to_gpa(vcpu, gva, 0, error); 3318 } 3319 3320 static int kvm_read_guest_virt_helper(gva_t addr, void *val, unsigned int bytes, 3321 struct kvm_vcpu *vcpu, u32 access, 3322 u32 *error) 3323 { 3324 void *data = val; 3325 int r = X86EMUL_CONTINUE; 3326 3327 while (bytes) { 3328 gpa_t gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr, access, error); 3329 unsigned offset = addr & (PAGE_SIZE-1); 3330 unsigned toread = min(bytes, (unsigned)PAGE_SIZE - offset); 3331 int ret; 3332 3333 if (gpa == UNMAPPED_GVA) { 3334 r = X86EMUL_PROPAGATE_FAULT; 3335 goto out; 3336 } 3337 ret = kvm_read_guest(vcpu->kvm, gpa, data, toread); 3338 if (ret < 0) { 3339 r = X86EMUL_IO_NEEDED; 3340 goto out; 3341 } 3342 3343 bytes -= toread; 3344 data += toread; 3345 addr += toread; 3346 } 3347 out: 3348 return r; 3349 } 3350 3351 /* used for instruction fetching */ 3352 static int kvm_fetch_guest_virt(gva_t addr, void *val, unsigned int bytes, 3353 struct kvm_vcpu *vcpu, u32 *error) 3354 { 3355 u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0; 3356 return kvm_read_guest_virt_helper(addr, val, bytes, vcpu, 3357 access | PFERR_FETCH_MASK, error); 3358 } 3359 3360 static int kvm_read_guest_virt(gva_t addr, void *val, unsigned int bytes, 3361 struct kvm_vcpu *vcpu, u32 *error) 3362 { 3363 u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0; 3364 return kvm_read_guest_virt_helper(addr, val, bytes, vcpu, access, 3365 error); 3366 } 3367 3368 static int kvm_read_guest_virt_system(gva_t addr, void *val, unsigned int bytes, 3369 struct kvm_vcpu *vcpu, u32 *error) 3370 { 3371 return kvm_read_guest_virt_helper(addr, val, bytes, vcpu, 0, error); 3372 } 3373 3374 static int kvm_write_guest_virt_system(gva_t addr, void *val, 3375 unsigned int bytes, 3376 struct kvm_vcpu *vcpu, 3377 u32 *error) 3378 { 3379 void *data = val; 3380 int r = X86EMUL_CONTINUE; 3381 3382 while (bytes) { 3383 gpa_t gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr, 3384 PFERR_WRITE_MASK, error); 3385 unsigned offset = addr & (PAGE_SIZE-1); 3386 unsigned towrite = min(bytes, (unsigned)PAGE_SIZE - offset); 3387 int ret; 3388 3389 if (gpa == UNMAPPED_GVA) { 3390 r = X86EMUL_PROPAGATE_FAULT; 3391 goto out; 3392 } 3393 ret = kvm_write_guest(vcpu->kvm, gpa, data, towrite); 3394 if (ret < 0) { 3395 r = X86EMUL_IO_NEEDED; 3396 goto out; 3397 } 3398 3399 bytes -= towrite; 3400 data += towrite; 3401 addr += towrite; 3402 } 3403 out: 3404 return r; 3405 } 3406 3407 static int emulator_read_emulated(unsigned long addr, 3408 void *val, 3409 unsigned int bytes, 3410 unsigned int *error_code, 3411 struct kvm_vcpu *vcpu) 3412 { 3413 gpa_t gpa; 3414 3415 if (vcpu->mmio_read_completed) { 3416 memcpy(val, vcpu->mmio_data, bytes); 3417 trace_kvm_mmio(KVM_TRACE_MMIO_READ, bytes, 3418 vcpu->mmio_phys_addr, *(u64 *)val); 3419 vcpu->mmio_read_completed = 0; 3420 return X86EMUL_CONTINUE; 3421 } 3422 3423 gpa = kvm_mmu_gva_to_gpa_read(vcpu, addr, error_code); 3424 3425 if (gpa == UNMAPPED_GVA) 3426 return X86EMUL_PROPAGATE_FAULT; 3427 3428 /* For APIC access vmexit */ 3429 if ((gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE) 3430 goto mmio; 3431 3432 if (kvm_read_guest_virt(addr, val, bytes, vcpu, NULL) 3433 == X86EMUL_CONTINUE) 3434 return 
X86EMUL_CONTINUE; 3435 3436 mmio: 3437 /* 3438 * Is this MMIO handled locally? 3439 */ 3440 if (!vcpu_mmio_read(vcpu, gpa, bytes, val)) { 3441 trace_kvm_mmio(KVM_TRACE_MMIO_READ, bytes, gpa, *(u64 *)val); 3442 return X86EMUL_CONTINUE; 3443 } 3444 3445 trace_kvm_mmio(KVM_TRACE_MMIO_READ_UNSATISFIED, bytes, gpa, 0); 3446 3447 vcpu->mmio_needed = 1; 3448 vcpu->run->exit_reason = KVM_EXIT_MMIO; 3449 vcpu->run->mmio.phys_addr = vcpu->mmio_phys_addr = gpa; 3450 vcpu->run->mmio.len = vcpu->mmio_size = bytes; 3451 vcpu->run->mmio.is_write = vcpu->mmio_is_write = 0; 3452 3453 return X86EMUL_IO_NEEDED; 3454 } 3455 3456 int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa, 3457 const void *val, int bytes) 3458 { 3459 int ret; 3460 3461 ret = kvm_write_guest(vcpu->kvm, gpa, val, bytes); 3462 if (ret < 0) 3463 return 0; 3464 kvm_mmu_pte_write(vcpu, gpa, val, bytes, 1); 3465 return 1; 3466 } 3467 3468 static int emulator_write_emulated_onepage(unsigned long addr, 3469 const void *val, 3470 unsigned int bytes, 3471 unsigned int *error_code, 3472 struct kvm_vcpu *vcpu) 3473 { 3474 gpa_t gpa; 3475 3476 gpa = kvm_mmu_gva_to_gpa_write(vcpu, addr, error_code); 3477 3478 if (gpa == UNMAPPED_GVA) 3479 return X86EMUL_PROPAGATE_FAULT; 3480 3481 /* For APIC access vmexit */ 3482 if ((gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE) 3483 goto mmio; 3484 3485 if (emulator_write_phys(vcpu, gpa, val, bytes)) 3486 return X86EMUL_CONTINUE; 3487 3488 mmio: 3489 trace_kvm_mmio(KVM_TRACE_MMIO_WRITE, bytes, gpa, *(u64 *)val); 3490 /* 3491 * Is this MMIO handled locally? 3492 */ 3493 if (!vcpu_mmio_write(vcpu, gpa, bytes, val)) 3494 return X86EMUL_CONTINUE; 3495 3496 vcpu->mmio_needed = 1; 3497 vcpu->run->exit_reason = KVM_EXIT_MMIO; 3498 vcpu->run->mmio.phys_addr = vcpu->mmio_phys_addr = gpa; 3499 vcpu->run->mmio.len = vcpu->mmio_size = bytes; 3500 vcpu->run->mmio.is_write = vcpu->mmio_is_write = 1; 3501 memcpy(vcpu->run->mmio.data, val, bytes); 3502 3503 return X86EMUL_CONTINUE; 3504 } 3505 3506 int emulator_write_emulated(unsigned long addr, 3507 const void *val, 3508 unsigned int bytes, 3509 unsigned int *error_code, 3510 struct kvm_vcpu *vcpu) 3511 { 3512 /* Crossing a page boundary? 
*/ 3513 if (((addr + bytes - 1) ^ addr) & PAGE_MASK) { 3514 int rc, now; 3515 3516 now = -addr & ~PAGE_MASK; 3517 rc = emulator_write_emulated_onepage(addr, val, now, error_code, 3518 vcpu); 3519 if (rc != X86EMUL_CONTINUE) 3520 return rc; 3521 addr += now; 3522 val += now; 3523 bytes -= now; 3524 } 3525 return emulator_write_emulated_onepage(addr, val, bytes, error_code, 3526 vcpu); 3527 } 3528 3529 #define CMPXCHG_TYPE(t, ptr, old, new) \ 3530 (cmpxchg((t *)(ptr), *(t *)(old), *(t *)(new)) == *(t *)(old)) 3531 3532 #ifdef CONFIG_X86_64 3533 # define CMPXCHG64(ptr, old, new) CMPXCHG_TYPE(u64, ptr, old, new) 3534 #else 3535 # define CMPXCHG64(ptr, old, new) \ 3536 (cmpxchg64((u64 *)(ptr), *(u64 *)(old), *(u64 *)(new)) == *(u64 *)(old)) 3537 #endif 3538 3539 static int emulator_cmpxchg_emulated(unsigned long addr, 3540 const void *old, 3541 const void *new, 3542 unsigned int bytes, 3543 unsigned int *error_code, 3544 struct kvm_vcpu *vcpu) 3545 { 3546 gpa_t gpa; 3547 struct page *page; 3548 char *kaddr; 3549 bool exchanged; 3550 3551 /* guests cmpxchg8b have to be emulated atomically */ 3552 if (bytes > 8 || (bytes & (bytes - 1))) 3553 goto emul_write; 3554 3555 gpa = kvm_mmu_gva_to_gpa_write(vcpu, addr, NULL); 3556 3557 if (gpa == UNMAPPED_GVA || 3558 (gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE) 3559 goto emul_write; 3560 3561 if (((gpa + bytes - 1) & PAGE_MASK) != (gpa & PAGE_MASK)) 3562 goto emul_write; 3563 3564 page = gfn_to_page(vcpu->kvm, gpa >> PAGE_SHIFT); 3565 if (is_error_page(page)) { 3566 kvm_release_page_clean(page); 3567 goto emul_write; 3568 } 3569 3570 kaddr = kmap_atomic(page, KM_USER0); 3571 kaddr += offset_in_page(gpa); 3572 switch (bytes) { 3573 case 1: 3574 exchanged = CMPXCHG_TYPE(u8, kaddr, old, new); 3575 break; 3576 case 2: 3577 exchanged = CMPXCHG_TYPE(u16, kaddr, old, new); 3578 break; 3579 case 4: 3580 exchanged = CMPXCHG_TYPE(u32, kaddr, old, new); 3581 break; 3582 case 8: 3583 exchanged = CMPXCHG64(kaddr, old, new); 3584 break; 3585 default: 3586 BUG(); 3587 } 3588 kunmap_atomic(kaddr, KM_USER0); 3589 kvm_release_page_dirty(page); 3590 3591 if (!exchanged) 3592 return X86EMUL_CMPXCHG_FAILED; 3593 3594 kvm_mmu_pte_write(vcpu, gpa, new, bytes, 1); 3595 3596 return X86EMUL_CONTINUE; 3597 3598 emul_write: 3599 printk_once(KERN_WARNING "kvm: emulating exchange as write\n"); 3600 3601 return emulator_write_emulated(addr, new, bytes, error_code, vcpu); 3602 } 3603 3604 static int kernel_pio(struct kvm_vcpu *vcpu, void *pd) 3605 { 3606 /* TODO: String I/O for in kernel device */ 3607 int r; 3608 3609 if (vcpu->arch.pio.in) 3610 r = kvm_io_bus_read(vcpu->kvm, KVM_PIO_BUS, vcpu->arch.pio.port, 3611 vcpu->arch.pio.size, pd); 3612 else 3613 r = kvm_io_bus_write(vcpu->kvm, KVM_PIO_BUS, 3614 vcpu->arch.pio.port, vcpu->arch.pio.size, 3615 pd); 3616 return r; 3617 } 3618 3619 3620 static int emulator_pio_in_emulated(int size, unsigned short port, void *val, 3621 unsigned int count, struct kvm_vcpu *vcpu) 3622 { 3623 if (vcpu->arch.pio.count) 3624 goto data_avail; 3625 3626 trace_kvm_pio(1, port, size, 1); 3627 3628 vcpu->arch.pio.port = port; 3629 vcpu->arch.pio.in = 1; 3630 vcpu->arch.pio.count = count; 3631 vcpu->arch.pio.size = size; 3632 3633 if (!kernel_pio(vcpu, vcpu->arch.pio_data)) { 3634 data_avail: 3635 memcpy(val, vcpu->arch.pio_data, size * count); 3636 vcpu->arch.pio.count = 0; 3637 return 1; 3638 } 3639 3640 vcpu->run->exit_reason = KVM_EXIT_IO; 3641 vcpu->run->io.direction = KVM_EXIT_IO_IN; 3642 vcpu->run->io.size = size; 3643 vcpu->run->io.data_offset = 
KVM_PIO_PAGE_OFFSET * PAGE_SIZE; 3644 vcpu->run->io.count = count; 3645 vcpu->run->io.port = port; 3646 3647 return 0; 3648 } 3649 3650 static int emulator_pio_out_emulated(int size, unsigned short port, 3651 const void *val, unsigned int count, 3652 struct kvm_vcpu *vcpu) 3653 { 3654 trace_kvm_pio(0, port, size, 1); 3655 3656 vcpu->arch.pio.port = port; 3657 vcpu->arch.pio.in = 0; 3658 vcpu->arch.pio.count = count; 3659 vcpu->arch.pio.size = size; 3660 3661 memcpy(vcpu->arch.pio_data, val, size * count); 3662 3663 if (!kernel_pio(vcpu, vcpu->arch.pio_data)) { 3664 vcpu->arch.pio.count = 0; 3665 return 1; 3666 } 3667 3668 vcpu->run->exit_reason = KVM_EXIT_IO; 3669 vcpu->run->io.direction = KVM_EXIT_IO_OUT; 3670 vcpu->run->io.size = size; 3671 vcpu->run->io.data_offset = KVM_PIO_PAGE_OFFSET * PAGE_SIZE; 3672 vcpu->run->io.count = count; 3673 vcpu->run->io.port = port; 3674 3675 return 0; 3676 } 3677 3678 static unsigned long get_segment_base(struct kvm_vcpu *vcpu, int seg) 3679 { 3680 return kvm_x86_ops->get_segment_base(vcpu, seg); 3681 } 3682 3683 int emulate_invlpg(struct kvm_vcpu *vcpu, gva_t address) 3684 { 3685 kvm_mmu_invlpg(vcpu, address); 3686 return X86EMUL_CONTINUE; 3687 } 3688 3689 int kvm_emulate_wbinvd(struct kvm_vcpu *vcpu) 3690 { 3691 if (!need_emulate_wbinvd(vcpu)) 3692 return X86EMUL_CONTINUE; 3693 3694 if (kvm_x86_ops->has_wbinvd_exit()) { 3695 smp_call_function_many(vcpu->arch.wbinvd_dirty_mask, 3696 wbinvd_ipi, NULL, 1); 3697 cpumask_clear(vcpu->arch.wbinvd_dirty_mask); 3698 } 3699 wbinvd(); 3700 return X86EMUL_CONTINUE; 3701 } 3702 EXPORT_SYMBOL_GPL(kvm_emulate_wbinvd); 3703 3704 int emulate_clts(struct kvm_vcpu *vcpu) 3705 { 3706 kvm_x86_ops->set_cr0(vcpu, kvm_read_cr0_bits(vcpu, ~X86_CR0_TS)); 3707 kvm_x86_ops->fpu_activate(vcpu); 3708 return X86EMUL_CONTINUE; 3709 } 3710 3711 int emulator_get_dr(int dr, unsigned long *dest, struct kvm_vcpu *vcpu) 3712 { 3713 return _kvm_get_dr(vcpu, dr, dest); 3714 } 3715 3716 int emulator_set_dr(int dr, unsigned long value, struct kvm_vcpu *vcpu) 3717 { 3718 3719 return __kvm_set_dr(vcpu, dr, value); 3720 } 3721 3722 static u64 mk_cr_64(u64 curr_cr, u32 new_val) 3723 { 3724 return (curr_cr & ~((1ULL << 32) - 1)) | new_val; 3725 } 3726 3727 static unsigned long emulator_get_cr(int cr, struct kvm_vcpu *vcpu) 3728 { 3729 unsigned long value; 3730 3731 switch (cr) { 3732 case 0: 3733 value = kvm_read_cr0(vcpu); 3734 break; 3735 case 2: 3736 value = vcpu->arch.cr2; 3737 break; 3738 case 3: 3739 value = vcpu->arch.cr3; 3740 break; 3741 case 4: 3742 value = kvm_read_cr4(vcpu); 3743 break; 3744 case 8: 3745 value = kvm_get_cr8(vcpu); 3746 break; 3747 default: 3748 vcpu_printf(vcpu, "%s: unexpected cr %u\n", __func__, cr); 3749 return 0; 3750 } 3751 3752 return value; 3753 } 3754 3755 static int emulator_set_cr(int cr, unsigned long val, struct kvm_vcpu *vcpu) 3756 { 3757 int res = 0; 3758 3759 switch (cr) { 3760 case 0: 3761 res = kvm_set_cr0(vcpu, mk_cr_64(kvm_read_cr0(vcpu), val)); 3762 break; 3763 case 2: 3764 vcpu->arch.cr2 = val; 3765 break; 3766 case 3: 3767 res = kvm_set_cr3(vcpu, val); 3768 break; 3769 case 4: 3770 res = kvm_set_cr4(vcpu, mk_cr_64(kvm_read_cr4(vcpu), val)); 3771 break; 3772 case 8: 3773 res = __kvm_set_cr8(vcpu, val & 0xfUL); 3774 break; 3775 default: 3776 vcpu_printf(vcpu, "%s: unexpected cr %u\n", __func__, cr); 3777 res = -1; 3778 } 3779 3780 return res; 3781 } 3782 3783 static int emulator_get_cpl(struct kvm_vcpu *vcpu) 3784 { 3785 return kvm_x86_ops->get_cpl(vcpu); 3786 } 3787 3788 static void 
emulator_get_gdt(struct desc_ptr *dt, struct kvm_vcpu *vcpu) 3789 { 3790 kvm_x86_ops->get_gdt(vcpu, dt); 3791 } 3792 3793 static unsigned long emulator_get_cached_segment_base(int seg, 3794 struct kvm_vcpu *vcpu) 3795 { 3796 return get_segment_base(vcpu, seg); 3797 } 3798 3799 static bool emulator_get_cached_descriptor(struct desc_struct *desc, int seg, 3800 struct kvm_vcpu *vcpu) 3801 { 3802 struct kvm_segment var; 3803 3804 kvm_get_segment(vcpu, &var, seg); 3805 3806 if (var.unusable) 3807 return false; 3808 3809 if (var.g) 3810 var.limit >>= 12; 3811 set_desc_limit(desc, var.limit); 3812 set_desc_base(desc, (unsigned long)var.base); 3813 desc->type = var.type; 3814 desc->s = var.s; 3815 desc->dpl = var.dpl; 3816 desc->p = var.present; 3817 desc->avl = var.avl; 3818 desc->l = var.l; 3819 desc->d = var.db; 3820 desc->g = var.g; 3821 3822 return true; 3823 } 3824 3825 static void emulator_set_cached_descriptor(struct desc_struct *desc, int seg, 3826 struct kvm_vcpu *vcpu) 3827 { 3828 struct kvm_segment var; 3829 3830 /* needed to preserve selector */ 3831 kvm_get_segment(vcpu, &var, seg); 3832 3833 var.base = get_desc_base(desc); 3834 var.limit = get_desc_limit(desc); 3835 if (desc->g) 3836 var.limit = (var.limit << 12) | 0xfff; 3837 var.type = desc->type; 3838 var.present = desc->p; 3839 var.dpl = desc->dpl; 3840 var.db = desc->d; 3841 var.s = desc->s; 3842 var.l = desc->l; 3843 var.g = desc->g; 3844 var.avl = desc->avl; 3845 var.present = desc->p; 3846 var.unusable = !var.present; 3847 var.padding = 0; 3848 3849 kvm_set_segment(vcpu, &var, seg); 3850 return; 3851 } 3852 3853 static u16 emulator_get_segment_selector(int seg, struct kvm_vcpu *vcpu) 3854 { 3855 struct kvm_segment kvm_seg; 3856 3857 kvm_get_segment(vcpu, &kvm_seg, seg); 3858 return kvm_seg.selector; 3859 } 3860 3861 static void emulator_set_segment_selector(u16 sel, int seg, 3862 struct kvm_vcpu *vcpu) 3863 { 3864 struct kvm_segment kvm_seg; 3865 3866 kvm_get_segment(vcpu, &kvm_seg, seg); 3867 kvm_seg.selector = sel; 3868 kvm_set_segment(vcpu, &kvm_seg, seg); 3869 } 3870 3871 static struct x86_emulate_ops emulate_ops = { 3872 .read_std = kvm_read_guest_virt_system, 3873 .write_std = kvm_write_guest_virt_system, 3874 .fetch = kvm_fetch_guest_virt, 3875 .read_emulated = emulator_read_emulated, 3876 .write_emulated = emulator_write_emulated, 3877 .cmpxchg_emulated = emulator_cmpxchg_emulated, 3878 .pio_in_emulated = emulator_pio_in_emulated, 3879 .pio_out_emulated = emulator_pio_out_emulated, 3880 .get_cached_descriptor = emulator_get_cached_descriptor, 3881 .set_cached_descriptor = emulator_set_cached_descriptor, 3882 .get_segment_selector = emulator_get_segment_selector, 3883 .set_segment_selector = emulator_set_segment_selector, 3884 .get_cached_segment_base = emulator_get_cached_segment_base, 3885 .get_gdt = emulator_get_gdt, 3886 .get_cr = emulator_get_cr, 3887 .set_cr = emulator_set_cr, 3888 .cpl = emulator_get_cpl, 3889 .get_dr = emulator_get_dr, 3890 .set_dr = emulator_set_dr, 3891 .set_msr = kvm_set_msr, 3892 .get_msr = kvm_get_msr, 3893 }; 3894 3895 static void cache_all_regs(struct kvm_vcpu *vcpu) 3896 { 3897 kvm_register_read(vcpu, VCPU_REGS_RAX); 3898 kvm_register_read(vcpu, VCPU_REGS_RSP); 3899 kvm_register_read(vcpu, VCPU_REGS_RIP); 3900 vcpu->arch.regs_dirty = ~0; 3901 } 3902 3903 static void toggle_interruptibility(struct kvm_vcpu *vcpu, u32 mask) 3904 { 3905 u32 int_shadow = kvm_x86_ops->get_interrupt_shadow(vcpu, mask); 3906 /* 3907 * an sti; sti; sequence only disable interrupts for the first 3908 * 
instruction. So, if the last instruction, be it emulated or 3909 * not, left the system with the INT_STI flag enabled, it 3910 * means that the last instruction is an sti. We should not 3911 * leave the flag on in this case. The same goes for mov ss 3912 */ 3913 if (!(int_shadow & mask)) 3914 kvm_x86_ops->set_interrupt_shadow(vcpu, mask); 3915 } 3916 3917 static void inject_emulated_exception(struct kvm_vcpu *vcpu) 3918 { 3919 struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt; 3920 if (ctxt->exception == PF_VECTOR) 3921 kvm_inject_page_fault(vcpu, ctxt->cr2, ctxt->error_code); 3922 else if (ctxt->error_code_valid) 3923 kvm_queue_exception_e(vcpu, ctxt->exception, ctxt->error_code); 3924 else 3925 kvm_queue_exception(vcpu, ctxt->exception); 3926 } 3927 3928 static int handle_emulation_failure(struct kvm_vcpu *vcpu) 3929 { 3930 ++vcpu->stat.insn_emulation_fail; 3931 trace_kvm_emulate_insn_failed(vcpu); 3932 vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; 3933 vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION; 3934 vcpu->run->internal.ndata = 0; 3935 kvm_queue_exception(vcpu, UD_VECTOR); 3936 return EMULATE_FAIL; 3937 } 3938 3939 static bool reexecute_instruction(struct kvm_vcpu *vcpu, gva_t gva) 3940 { 3941 gpa_t gpa; 3942 3943 if (tdp_enabled) 3944 return false; 3945 3946 /* 3947 * if emulation was due to an access to a shadowed page table 3948 * and it failed, try to unshadow the page and re-enter the 3949 * guest to let the CPU execute the instruction. 3950 */ 3951 if (kvm_mmu_unprotect_page_virt(vcpu, gva)) 3952 return true; 3953 3954 gpa = kvm_mmu_gva_to_gpa_system(vcpu, gva, NULL); 3955 3956 if (gpa == UNMAPPED_GVA) 3957 return true; /* let cpu generate fault */ 3958 3959 if (!kvm_is_error_hva(gfn_to_hva(vcpu->kvm, gpa >> PAGE_SHIFT))) 3960 return true; 3961 3962 return false; 3963 } 3964 3965 int emulate_instruction(struct kvm_vcpu *vcpu, 3966 unsigned long cr2, 3967 u16 error_code, 3968 int emulation_type) 3969 { 3970 int r; 3971 struct decode_cache *c = &vcpu->arch.emulate_ctxt.decode; 3972 3973 kvm_clear_exception_queue(vcpu); 3974 vcpu->arch.mmio_fault_cr2 = cr2; 3975 /* 3976 * TODO: fix emulate.c to use guest_read/write_register 3977 * instead of direct ->regs accesses; it can save hundreds of cycles 3978 * on Intel for instructions that don't read/change RSP, 3979 * for example. 3980 */ 3981 cache_all_regs(vcpu); 3982 3983 if (!(emulation_type & EMULTYPE_NO_DECODE)) { 3984 int cs_db, cs_l; 3985 kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l); 3986 3987 vcpu->arch.emulate_ctxt.vcpu = vcpu; 3988 vcpu->arch.emulate_ctxt.eflags = kvm_x86_ops->get_rflags(vcpu); 3989 vcpu->arch.emulate_ctxt.eip = kvm_rip_read(vcpu); 3990 vcpu->arch.emulate_ctxt.mode = 3991 (!is_protmode(vcpu)) ? X86EMUL_MODE_REAL : 3992 (vcpu->arch.emulate_ctxt.eflags & X86_EFLAGS_VM) 3993 ? X86EMUL_MODE_VM86 : cs_l 3994 ? X86EMUL_MODE_PROT64 : cs_db 3995 ?
X86EMUL_MODE_PROT32 : X86EMUL_MODE_PROT16; 3996 memset(c, 0, sizeof(struct decode_cache)); 3997 memcpy(c->regs, vcpu->arch.regs, sizeof c->regs); 3998 vcpu->arch.emulate_ctxt.interruptibility = 0; 3999 vcpu->arch.emulate_ctxt.exception = -1; 4000 4001 r = x86_decode_insn(&vcpu->arch.emulate_ctxt, &emulate_ops); 4002 trace_kvm_emulate_insn_start(vcpu); 4003 4004 /* Only allow emulation of specific instructions on #UD 4005 * (namely VMMCALL, sysenter, sysexit, syscall)*/ 4006 if (emulation_type & EMULTYPE_TRAP_UD) { 4007 if (!c->twobyte) 4008 return EMULATE_FAIL; 4009 switch (c->b) { 4010 case 0x01: /* VMMCALL */ 4011 if (c->modrm_mod != 3 || c->modrm_rm != 1) 4012 return EMULATE_FAIL; 4013 break; 4014 case 0x34: /* sysenter */ 4015 case 0x35: /* sysexit */ 4016 if (c->modrm_mod != 0 || c->modrm_rm != 0) 4017 return EMULATE_FAIL; 4018 break; 4019 case 0x05: /* syscall */ 4020 if (c->modrm_mod != 0 || c->modrm_rm != 0) 4021 return EMULATE_FAIL; 4022 break; 4023 default: 4024 return EMULATE_FAIL; 4025 } 4026 4027 if (!(c->modrm_reg == 0 || c->modrm_reg == 3)) 4028 return EMULATE_FAIL; 4029 } 4030 4031 ++vcpu->stat.insn_emulation; 4032 if (r) { 4033 if (reexecute_instruction(vcpu, cr2)) 4034 return EMULATE_DONE; 4035 if (emulation_type & EMULTYPE_SKIP) 4036 return EMULATE_FAIL; 4037 return handle_emulation_failure(vcpu); 4038 } 4039 } 4040 4041 if (emulation_type & EMULTYPE_SKIP) { 4042 kvm_rip_write(vcpu, vcpu->arch.emulate_ctxt.decode.eip); 4043 return EMULATE_DONE; 4044 } 4045 4046 /* this is needed for vmware backdor interface to work since it 4047 changes registers values during IO operation */ 4048 memcpy(c->regs, vcpu->arch.regs, sizeof c->regs); 4049 4050 restart: 4051 r = x86_emulate_insn(&vcpu->arch.emulate_ctxt, &emulate_ops); 4052 4053 if (r) { /* emulation failed */ 4054 if (reexecute_instruction(vcpu, cr2)) 4055 return EMULATE_DONE; 4056 4057 return handle_emulation_failure(vcpu); 4058 } 4059 4060 toggle_interruptibility(vcpu, vcpu->arch.emulate_ctxt.interruptibility); 4061 kvm_x86_ops->set_rflags(vcpu, vcpu->arch.emulate_ctxt.eflags); 4062 memcpy(vcpu->arch.regs, c->regs, sizeof c->regs); 4063 kvm_rip_write(vcpu, vcpu->arch.emulate_ctxt.eip); 4064 4065 if (vcpu->arch.emulate_ctxt.exception >= 0) { 4066 inject_emulated_exception(vcpu); 4067 return EMULATE_DONE; 4068 } 4069 4070 if (vcpu->arch.pio.count) { 4071 if (!vcpu->arch.pio.in) 4072 vcpu->arch.pio.count = 0; 4073 return EMULATE_DO_MMIO; 4074 } 4075 4076 if (vcpu->mmio_needed) { 4077 if (vcpu->mmio_is_write) 4078 vcpu->mmio_needed = 0; 4079 return EMULATE_DO_MMIO; 4080 } 4081 4082 if (vcpu->arch.emulate_ctxt.restart) 4083 goto restart; 4084 4085 return EMULATE_DONE; 4086 } 4087 EXPORT_SYMBOL_GPL(emulate_instruction); 4088 4089 int kvm_fast_pio_out(struct kvm_vcpu *vcpu, int size, unsigned short port) 4090 { 4091 unsigned long val = kvm_register_read(vcpu, VCPU_REGS_RAX); 4092 int ret = emulator_pio_out_emulated(size, port, &val, 1, vcpu); 4093 /* do not return to emulator after return from userspace */ 4094 vcpu->arch.pio.count = 0; 4095 return ret; 4096 } 4097 EXPORT_SYMBOL_GPL(kvm_fast_pio_out); 4098 4099 static void bounce_off(void *info) 4100 { 4101 /* nothing */ 4102 } 4103 4104 static int kvmclock_cpufreq_notifier(struct notifier_block *nb, unsigned long val, 4105 void *data) 4106 { 4107 struct cpufreq_freqs *freq = data; 4108 struct kvm *kvm; 4109 struct kvm_vcpu *vcpu; 4110 int i, send_ipi = 0; 4111 4112 if (val == CPUFREQ_PRECHANGE && freq->old > freq->new) 4113 return 0; 4114 if (val == CPUFREQ_POSTCHANGE && 
freq->old < freq->new) 4115 return 0; 4116 per_cpu(cpu_tsc_khz, freq->cpu) = freq->new; 4117 4118 spin_lock(&kvm_lock); 4119 list_for_each_entry(kvm, &vm_list, vm_list) { 4120 kvm_for_each_vcpu(i, vcpu, kvm) { 4121 if (vcpu->cpu != freq->cpu) 4122 continue; 4123 if (!kvm_request_guest_time_update(vcpu)) 4124 continue; 4125 if (vcpu->cpu != smp_processor_id()) 4126 send_ipi++; 4127 } 4128 } 4129 spin_unlock(&kvm_lock); 4130 4131 if (freq->old < freq->new && send_ipi) { 4132 /* 4133 * We upscale the frequency. Must make the guest 4134 * doesn't see old kvmclock values while running with 4135 * the new frequency, otherwise we risk the guest sees 4136 * time go backwards. 4137 * 4138 * In case we update the frequency for another cpu 4139 * (which might be in guest context) send an interrupt 4140 * to kick the cpu out of guest context. Next time 4141 * guest context is entered kvmclock will be updated, 4142 * so the guest will not see stale values. 4143 */ 4144 smp_call_function_single(freq->cpu, bounce_off, NULL, 1); 4145 } 4146 return 0; 4147 } 4148 4149 static struct notifier_block kvmclock_cpufreq_notifier_block = { 4150 .notifier_call = kvmclock_cpufreq_notifier 4151 }; 4152 4153 static void kvm_timer_init(void) 4154 { 4155 int cpu; 4156 4157 if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) { 4158 cpufreq_register_notifier(&kvmclock_cpufreq_notifier_block, 4159 CPUFREQ_TRANSITION_NOTIFIER); 4160 for_each_online_cpu(cpu) { 4161 unsigned long khz = cpufreq_get(cpu); 4162 if (!khz) 4163 khz = tsc_khz; 4164 per_cpu(cpu_tsc_khz, cpu) = khz; 4165 } 4166 } else { 4167 for_each_possible_cpu(cpu) 4168 per_cpu(cpu_tsc_khz, cpu) = tsc_khz; 4169 } 4170 } 4171 4172 static DEFINE_PER_CPU(struct kvm_vcpu *, current_vcpu); 4173 4174 static int kvm_is_in_guest(void) 4175 { 4176 return percpu_read(current_vcpu) != NULL; 4177 } 4178 4179 static int kvm_is_user_mode(void) 4180 { 4181 int user_mode = 3; 4182 4183 if (percpu_read(current_vcpu)) 4184 user_mode = kvm_x86_ops->get_cpl(percpu_read(current_vcpu)); 4185 4186 return user_mode != 0; 4187 } 4188 4189 static unsigned long kvm_get_guest_ip(void) 4190 { 4191 unsigned long ip = 0; 4192 4193 if (percpu_read(current_vcpu)) 4194 ip = kvm_rip_read(percpu_read(current_vcpu)); 4195 4196 return ip; 4197 } 4198 4199 static struct perf_guest_info_callbacks kvm_guest_cbs = { 4200 .is_in_guest = kvm_is_in_guest, 4201 .is_user_mode = kvm_is_user_mode, 4202 .get_guest_ip = kvm_get_guest_ip, 4203 }; 4204 4205 void kvm_before_handle_nmi(struct kvm_vcpu *vcpu) 4206 { 4207 percpu_write(current_vcpu, vcpu); 4208 } 4209 EXPORT_SYMBOL_GPL(kvm_before_handle_nmi); 4210 4211 void kvm_after_handle_nmi(struct kvm_vcpu *vcpu) 4212 { 4213 percpu_write(current_vcpu, NULL); 4214 } 4215 EXPORT_SYMBOL_GPL(kvm_after_handle_nmi); 4216 4217 int kvm_arch_init(void *opaque) 4218 { 4219 int r; 4220 struct kvm_x86_ops *ops = (struct kvm_x86_ops *)opaque; 4221 4222 if (kvm_x86_ops) { 4223 printk(KERN_ERR "kvm: already loaded the other module\n"); 4224 r = -EEXIST; 4225 goto out; 4226 } 4227 4228 if (!ops->cpu_has_kvm_support()) { 4229 printk(KERN_ERR "kvm: no hardware support\n"); 4230 r = -EOPNOTSUPP; 4231 goto out; 4232 } 4233 if (ops->disabled_by_bios()) { 4234 printk(KERN_ERR "kvm: disabled by bios\n"); 4235 r = -EOPNOTSUPP; 4236 goto out; 4237 } 4238 4239 r = kvm_mmu_module_init(); 4240 if (r) 4241 goto out; 4242 4243 kvm_init_msr_list(); 4244 4245 kvm_x86_ops = ops; 4246 kvm_mmu_set_nonpresent_ptes(0ull, 0ull); 4247 kvm_mmu_set_base_ptes(PT_PRESENT_MASK); 4248 
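/*
 * (The PTE mask setup calls around this point tell the common MMU code
 * which shadow-PTE bits stand for present, user, accessed, dirty and NX
 * when shadow page tables are built; a vendor module may later install
 * different masks, e.g. VMX when EPT is in use.)
 */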
kvm_mmu_set_mask_ptes(PT_USER_MASK, PT_ACCESSED_MASK, 4249 PT_DIRTY_MASK, PT64_NX_MASK, 0); 4250 4251 kvm_timer_init(); 4252 4253 perf_register_guest_info_callbacks(&kvm_guest_cbs); 4254 4255 if (cpu_has_xsave) 4256 host_xcr0 = xgetbv(XCR_XFEATURE_ENABLED_MASK); 4257 4258 return 0; 4259 4260 out: 4261 return r; 4262 } 4263 4264 void kvm_arch_exit(void) 4265 { 4266 perf_unregister_guest_info_callbacks(&kvm_guest_cbs); 4267 4268 if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) 4269 cpufreq_unregister_notifier(&kvmclock_cpufreq_notifier_block, 4270 CPUFREQ_TRANSITION_NOTIFIER); 4271 kvm_x86_ops = NULL; 4272 kvm_mmu_module_exit(); 4273 } 4274 4275 int kvm_emulate_halt(struct kvm_vcpu *vcpu) 4276 { 4277 ++vcpu->stat.halt_exits; 4278 if (irqchip_in_kernel(vcpu->kvm)) { 4279 vcpu->arch.mp_state = KVM_MP_STATE_HALTED; 4280 return 1; 4281 } else { 4282 vcpu->run->exit_reason = KVM_EXIT_HLT; 4283 return 0; 4284 } 4285 } 4286 EXPORT_SYMBOL_GPL(kvm_emulate_halt); 4287 4288 static inline gpa_t hc_gpa(struct kvm_vcpu *vcpu, unsigned long a0, 4289 unsigned long a1) 4290 { 4291 if (is_long_mode(vcpu)) 4292 return a0; 4293 else 4294 return a0 | ((gpa_t)a1 << 32); 4295 } 4296 4297 int kvm_hv_hypercall(struct kvm_vcpu *vcpu) 4298 { 4299 u64 param, ingpa, outgpa, ret; 4300 uint16_t code, rep_idx, rep_cnt, res = HV_STATUS_SUCCESS, rep_done = 0; 4301 bool fast, longmode; 4302 int cs_db, cs_l; 4303 4304 /* 4305 * hypercall generates UD from non zero cpl and real mode 4306 * per HYPER-V spec 4307 */ 4308 if (kvm_x86_ops->get_cpl(vcpu) != 0 || !is_protmode(vcpu)) { 4309 kvm_queue_exception(vcpu, UD_VECTOR); 4310 return 0; 4311 } 4312 4313 kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l); 4314 longmode = is_long_mode(vcpu) && cs_l == 1; 4315 4316 if (!longmode) { 4317 param = ((u64)kvm_register_read(vcpu, VCPU_REGS_RDX) << 32) | 4318 (kvm_register_read(vcpu, VCPU_REGS_RAX) & 0xffffffff); 4319 ingpa = ((u64)kvm_register_read(vcpu, VCPU_REGS_RBX) << 32) | 4320 (kvm_register_read(vcpu, VCPU_REGS_RCX) & 0xffffffff); 4321 outgpa = ((u64)kvm_register_read(vcpu, VCPU_REGS_RDI) << 32) | 4322 (kvm_register_read(vcpu, VCPU_REGS_RSI) & 0xffffffff); 4323 } 4324 #ifdef CONFIG_X86_64 4325 else { 4326 param = kvm_register_read(vcpu, VCPU_REGS_RCX); 4327 ingpa = kvm_register_read(vcpu, VCPU_REGS_RDX); 4328 outgpa = kvm_register_read(vcpu, VCPU_REGS_R8); 4329 } 4330 #endif 4331 4332 code = param & 0xffff; 4333 fast = (param >> 16) & 0x1; 4334 rep_cnt = (param >> 32) & 0xfff; 4335 rep_idx = (param >> 48) & 0xfff; 4336 4337 trace_kvm_hv_hypercall(code, fast, rep_cnt, rep_idx, ingpa, outgpa); 4338 4339 switch (code) { 4340 case HV_X64_HV_NOTIFY_LONG_SPIN_WAIT: 4341 kvm_vcpu_on_spin(vcpu); 4342 break; 4343 default: 4344 res = HV_STATUS_INVALID_HYPERCALL_CODE; 4345 break; 4346 } 4347 4348 ret = res | (((u64)rep_done & 0xfff) << 32); 4349 if (longmode) { 4350 kvm_register_write(vcpu, VCPU_REGS_RAX, ret); 4351 } else { 4352 kvm_register_write(vcpu, VCPU_REGS_RDX, ret >> 32); 4353 kvm_register_write(vcpu, VCPU_REGS_RAX, ret & 0xffffffff); 4354 } 4355 4356 return 1; 4357 } 4358 4359 int kvm_emulate_hypercall(struct kvm_vcpu *vcpu) 4360 { 4361 unsigned long nr, a0, a1, a2, a3, ret; 4362 int r = 1; 4363 4364 if (kvm_hv_hypercall_enabled(vcpu->kvm)) 4365 return kvm_hv_hypercall(vcpu); 4366 4367 nr = kvm_register_read(vcpu, VCPU_REGS_RAX); 4368 a0 = kvm_register_read(vcpu, VCPU_REGS_RBX); 4369 a1 = kvm_register_read(vcpu, VCPU_REGS_RCX); 4370 a2 = kvm_register_read(vcpu, VCPU_REGS_RDX); 4371 a3 = kvm_register_read(vcpu, VCPU_REGS_RSI); 4372 4373 
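/*
 * ABI note with a guest-side sketch: the hypercall number travels in RAX
 * and up to four arguments in RBX, RCX, RDX and RSI, with the return
 * value coming back in RAX, which is what the register reads above and
 * the RAX write at the end of this function implement. A hypothetical
 * guest helper, mirroring the kvm_hypercall*() wrappers in the guest's
 * kvm_para.h ("vmcall" is the Intel spelling; kvm_fix_hypercall() below
 * patches in the instruction appropriate for the host vendor), might
 * look roughly like:
 *
 *	static inline long example_kvm_hypercall1(unsigned int nr,
 *						  unsigned long p1)
 *	{
 *		long ret;
 *		asm volatile("vmcall"
 *			     : "=a" (ret)
 *			     : "a" (nr), "b" (p1)
 *			     : "memory");
 *		return ret;
 *	}
 */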
trace_kvm_hypercall(nr, a0, a1, a2, a3); 4374 4375 if (!is_long_mode(vcpu)) { 4376 nr &= 0xFFFFFFFF; 4377 a0 &= 0xFFFFFFFF; 4378 a1 &= 0xFFFFFFFF; 4379 a2 &= 0xFFFFFFFF; 4380 a3 &= 0xFFFFFFFF; 4381 } 4382 4383 if (kvm_x86_ops->get_cpl(vcpu) != 0) { 4384 ret = -KVM_EPERM; 4385 goto out; 4386 } 4387 4388 switch (nr) { 4389 case KVM_HC_VAPIC_POLL_IRQ: 4390 ret = 0; 4391 break; 4392 case KVM_HC_MMU_OP: 4393 r = kvm_pv_mmu_op(vcpu, a0, hc_gpa(vcpu, a1, a2), &ret); 4394 break; 4395 default: 4396 ret = -KVM_ENOSYS; 4397 break; 4398 } 4399 out: 4400 kvm_register_write(vcpu, VCPU_REGS_RAX, ret); 4401 ++vcpu->stat.hypercalls; 4402 return r; 4403 } 4404 EXPORT_SYMBOL_GPL(kvm_emulate_hypercall); 4405 4406 int kvm_fix_hypercall(struct kvm_vcpu *vcpu) 4407 { 4408 char instruction[3]; 4409 unsigned long rip = kvm_rip_read(vcpu); 4410 4411 /* 4412 * Blow out the MMU to ensure that no other VCPU has an active mapping 4413 * to ensure that the updated hypercall appears atomically across all 4414 * VCPUs. 4415 */ 4416 kvm_mmu_zap_all(vcpu->kvm); 4417 4418 kvm_x86_ops->patch_hypercall(vcpu, instruction); 4419 4420 return emulator_write_emulated(rip, instruction, 3, NULL, vcpu); 4421 } 4422 4423 void realmode_lgdt(struct kvm_vcpu *vcpu, u16 limit, unsigned long base) 4424 { 4425 struct desc_ptr dt = { limit, base }; 4426 4427 kvm_x86_ops->set_gdt(vcpu, &dt); 4428 } 4429 4430 void realmode_lidt(struct kvm_vcpu *vcpu, u16 limit, unsigned long base) 4431 { 4432 struct desc_ptr dt = { limit, base }; 4433 4434 kvm_x86_ops->set_idt(vcpu, &dt); 4435 } 4436 4437 static int move_to_next_stateful_cpuid_entry(struct kvm_vcpu *vcpu, int i) 4438 { 4439 struct kvm_cpuid_entry2 *e = &vcpu->arch.cpuid_entries[i]; 4440 int j, nent = vcpu->arch.cpuid_nent; 4441 4442 e->flags &= ~KVM_CPUID_FLAG_STATE_READ_NEXT; 4443 /* when no next entry is found, the current entry[i] is reselected */ 4444 for (j = i + 1; ; j = (j + 1) % nent) { 4445 struct kvm_cpuid_entry2 *ej = &vcpu->arch.cpuid_entries[j]; 4446 if (ej->function == e->function) { 4447 ej->flags |= KVM_CPUID_FLAG_STATE_READ_NEXT; 4448 return j; 4449 } 4450 } 4451 return 0; /* silence gcc, even though control never reaches here */ 4452 } 4453 4454 /* find an entry with matching function, matching index (if needed), and that 4455 * should be read next (if it's stateful) */ 4456 static int is_matching_cpuid_entry(struct kvm_cpuid_entry2 *e, 4457 u32 function, u32 index) 4458 { 4459 if (e->function != function) 4460 return 0; 4461 if ((e->flags & KVM_CPUID_FLAG_SIGNIFCANT_INDEX) && e->index != index) 4462 return 0; 4463 if ((e->flags & KVM_CPUID_FLAG_STATEFUL_FUNC) && 4464 !(e->flags & KVM_CPUID_FLAG_STATE_READ_NEXT)) 4465 return 0; 4466 return 1; 4467 } 4468 4469 struct kvm_cpuid_entry2 *kvm_find_cpuid_entry(struct kvm_vcpu *vcpu, 4470 u32 function, u32 index) 4471 { 4472 int i; 4473 struct kvm_cpuid_entry2 *best = NULL; 4474 4475 for (i = 0; i < vcpu->arch.cpuid_nent; ++i) { 4476 struct kvm_cpuid_entry2 *e; 4477 4478 e = &vcpu->arch.cpuid_entries[i]; 4479 if (is_matching_cpuid_entry(e, function, index)) { 4480 if (e->flags & KVM_CPUID_FLAG_STATEFUL_FUNC) 4481 move_to_next_stateful_cpuid_entry(vcpu, i); 4482 best = e; 4483 break; 4484 } 4485 /* 4486 * Both basic or both extended? 
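 * If there is no exact match, fall back to the entry with the highest
 * function number in the same range as the request; bit 31 separates
 * the basic (0x0000xxxx) leaves from the extended (0x8000xxxx) ones.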
4487 */ 4488 if (((e->function ^ function) & 0x80000000) == 0) 4489 if (!best || e->function > best->function) 4490 best = e; 4491 } 4492 return best; 4493 } 4494 EXPORT_SYMBOL_GPL(kvm_find_cpuid_entry); 4495 4496 int cpuid_maxphyaddr(struct kvm_vcpu *vcpu) 4497 { 4498 struct kvm_cpuid_entry2 *best; 4499 4500 best = kvm_find_cpuid_entry(vcpu, 0x80000000, 0); 4501 if (!best || best->eax < 0x80000008) 4502 goto not_found; 4503 best = kvm_find_cpuid_entry(vcpu, 0x80000008, 0); 4504 if (best) 4505 return best->eax & 0xff; 4506 not_found: 4507 return 36; 4508 } 4509 4510 void kvm_emulate_cpuid(struct kvm_vcpu *vcpu) 4511 { 4512 u32 function, index; 4513 struct kvm_cpuid_entry2 *best; 4514 4515 function = kvm_register_read(vcpu, VCPU_REGS_RAX); 4516 index = kvm_register_read(vcpu, VCPU_REGS_RCX); 4517 kvm_register_write(vcpu, VCPU_REGS_RAX, 0); 4518 kvm_register_write(vcpu, VCPU_REGS_RBX, 0); 4519 kvm_register_write(vcpu, VCPU_REGS_RCX, 0); 4520 kvm_register_write(vcpu, VCPU_REGS_RDX, 0); 4521 best = kvm_find_cpuid_entry(vcpu, function, index); 4522 if (best) { 4523 kvm_register_write(vcpu, VCPU_REGS_RAX, best->eax); 4524 kvm_register_write(vcpu, VCPU_REGS_RBX, best->ebx); 4525 kvm_register_write(vcpu, VCPU_REGS_RCX, best->ecx); 4526 kvm_register_write(vcpu, VCPU_REGS_RDX, best->edx); 4527 } 4528 kvm_x86_ops->skip_emulated_instruction(vcpu); 4529 trace_kvm_cpuid(function, 4530 kvm_register_read(vcpu, VCPU_REGS_RAX), 4531 kvm_register_read(vcpu, VCPU_REGS_RBX), 4532 kvm_register_read(vcpu, VCPU_REGS_RCX), 4533 kvm_register_read(vcpu, VCPU_REGS_RDX)); 4534 } 4535 EXPORT_SYMBOL_GPL(kvm_emulate_cpuid); 4536 4537 /* 4538 * Check if userspace requested an interrupt window, and that the 4539 * interrupt window is open. 4540 * 4541 * No need to exit to userspace if we already have an interrupt queued. 
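 * This only matters when the interrupt chip is emulated in userspace
 * (!irqchip_in_kernel); with an in-kernel irqchip, interrupt injection
 * is handled entirely in the kernel and no such exit is needed.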
4542 */ 4543 static int dm_request_for_irq_injection(struct kvm_vcpu *vcpu) 4544 { 4545 return (!irqchip_in_kernel(vcpu->kvm) && !kvm_cpu_has_interrupt(vcpu) && 4546 vcpu->run->request_interrupt_window && 4547 kvm_arch_interrupt_allowed(vcpu)); 4548 } 4549 4550 static void post_kvm_run_save(struct kvm_vcpu *vcpu) 4551 { 4552 struct kvm_run *kvm_run = vcpu->run; 4553 4554 kvm_run->if_flag = (kvm_get_rflags(vcpu) & X86_EFLAGS_IF) != 0; 4555 kvm_run->cr8 = kvm_get_cr8(vcpu); 4556 kvm_run->apic_base = kvm_get_apic_base(vcpu); 4557 if (irqchip_in_kernel(vcpu->kvm)) 4558 kvm_run->ready_for_interrupt_injection = 1; 4559 else 4560 kvm_run->ready_for_interrupt_injection = 4561 kvm_arch_interrupt_allowed(vcpu) && 4562 !kvm_cpu_has_interrupt(vcpu) && 4563 !kvm_event_needs_reinjection(vcpu); 4564 } 4565 4566 static void vapic_enter(struct kvm_vcpu *vcpu) 4567 { 4568 struct kvm_lapic *apic = vcpu->arch.apic; 4569 struct page *page; 4570 4571 if (!apic || !apic->vapic_addr) 4572 return; 4573 4574 page = gfn_to_page(vcpu->kvm, apic->vapic_addr >> PAGE_SHIFT); 4575 4576 vcpu->arch.apic->vapic_page = page; 4577 } 4578 4579 static void vapic_exit(struct kvm_vcpu *vcpu) 4580 { 4581 struct kvm_lapic *apic = vcpu->arch.apic; 4582 int idx; 4583 4584 if (!apic || !apic->vapic_addr) 4585 return; 4586 4587 idx = srcu_read_lock(&vcpu->kvm->srcu); 4588 kvm_release_page_dirty(apic->vapic_page); 4589 mark_page_dirty(vcpu->kvm, apic->vapic_addr >> PAGE_SHIFT); 4590 srcu_read_unlock(&vcpu->kvm->srcu, idx); 4591 } 4592 4593 static void update_cr8_intercept(struct kvm_vcpu *vcpu) 4594 { 4595 int max_irr, tpr; 4596 4597 if (!kvm_x86_ops->update_cr8_intercept) 4598 return; 4599 4600 if (!vcpu->arch.apic) 4601 return; 4602 4603 if (!vcpu->arch.apic->vapic_addr) 4604 max_irr = kvm_lapic_find_highest_irr(vcpu); 4605 else 4606 max_irr = -1; 4607 4608 if (max_irr != -1) 4609 max_irr >>= 4; 4610 4611 tpr = kvm_lapic_get_cr8(vcpu); 4612 4613 kvm_x86_ops->update_cr8_intercept(vcpu, tpr, max_irr); 4614 } 4615 4616 static void inject_pending_event(struct kvm_vcpu *vcpu) 4617 { 4618 /* try to reinject previous events if any */ 4619 if (vcpu->arch.exception.pending) { 4620 trace_kvm_inj_exception(vcpu->arch.exception.nr, 4621 vcpu->arch.exception.has_error_code, 4622 vcpu->arch.exception.error_code); 4623 kvm_x86_ops->queue_exception(vcpu, vcpu->arch.exception.nr, 4624 vcpu->arch.exception.has_error_code, 4625 vcpu->arch.exception.error_code, 4626 vcpu->arch.exception.reinject); 4627 return; 4628 } 4629 4630 if (vcpu->arch.nmi_injected) { 4631 kvm_x86_ops->set_nmi(vcpu); 4632 return; 4633 } 4634 4635 if (vcpu->arch.interrupt.pending) { 4636 kvm_x86_ops->set_irq(vcpu); 4637 return; 4638 } 4639 4640 /* try to inject new event if pending */ 4641 if (vcpu->arch.nmi_pending) { 4642 if (kvm_x86_ops->nmi_allowed(vcpu)) { 4643 vcpu->arch.nmi_pending = false; 4644 vcpu->arch.nmi_injected = true; 4645 kvm_x86_ops->set_nmi(vcpu); 4646 } 4647 } else if (kvm_cpu_has_interrupt(vcpu)) { 4648 if (kvm_x86_ops->interrupt_allowed(vcpu)) { 4649 kvm_queue_interrupt(vcpu, kvm_cpu_get_interrupt(vcpu), 4650 false); 4651 kvm_x86_ops->set_irq(vcpu); 4652 } 4653 } 4654 } 4655 4656 static void kvm_load_guest_xcr0(struct kvm_vcpu *vcpu) 4657 { 4658 if (kvm_read_cr4_bits(vcpu, X86_CR4_OSXSAVE) && 4659 !vcpu->guest_xcr0_loaded) { 4660 /* kvm_set_xcr() also depends on this */ 4661 xsetbv(XCR_XFEATURE_ENABLED_MASK, vcpu->arch.xcr0); 4662 vcpu->guest_xcr0_loaded = 1; 4663 } 4664 } 4665 4666 static void kvm_put_guest_xcr0(struct kvm_vcpu *vcpu) 4667 { 4668 if 
(vcpu->guest_xcr0_loaded) { 4669 if (vcpu->arch.xcr0 != host_xcr0) 4670 xsetbv(XCR_XFEATURE_ENABLED_MASK, host_xcr0); 4671 vcpu->guest_xcr0_loaded = 0; 4672 } 4673 } 4674 4675 static int vcpu_enter_guest(struct kvm_vcpu *vcpu) 4676 { 4677 int r; 4678 bool req_int_win = !irqchip_in_kernel(vcpu->kvm) && 4679 vcpu->run->request_interrupt_window; 4680 4681 if (vcpu->requests) { 4682 if (kvm_check_request(KVM_REQ_MMU_RELOAD, vcpu)) 4683 kvm_mmu_unload(vcpu); 4684 if (kvm_check_request(KVM_REQ_MIGRATE_TIMER, vcpu)) 4685 __kvm_migrate_timers(vcpu); 4686 if (kvm_check_request(KVM_REQ_KVMCLOCK_UPDATE, vcpu)) 4687 kvm_write_guest_time(vcpu); 4688 if (kvm_check_request(KVM_REQ_MMU_SYNC, vcpu)) 4689 kvm_mmu_sync_roots(vcpu); 4690 if (kvm_check_request(KVM_REQ_TLB_FLUSH, vcpu)) 4691 kvm_x86_ops->tlb_flush(vcpu); 4692 if (kvm_check_request(KVM_REQ_REPORT_TPR_ACCESS, vcpu)) { 4693 vcpu->run->exit_reason = KVM_EXIT_TPR_ACCESS; 4694 r = 0; 4695 goto out; 4696 } 4697 if (kvm_check_request(KVM_REQ_TRIPLE_FAULT, vcpu)) { 4698 vcpu->run->exit_reason = KVM_EXIT_SHUTDOWN; 4699 r = 0; 4700 goto out; 4701 } 4702 if (kvm_check_request(KVM_REQ_DEACTIVATE_FPU, vcpu)) { 4703 vcpu->fpu_active = 0; 4704 kvm_x86_ops->fpu_deactivate(vcpu); 4705 } 4706 } 4707 4708 r = kvm_mmu_reload(vcpu); 4709 if (unlikely(r)) 4710 goto out; 4711 4712 preempt_disable(); 4713 4714 kvm_x86_ops->prepare_guest_switch(vcpu); 4715 if (vcpu->fpu_active) 4716 kvm_load_guest_fpu(vcpu); 4717 kvm_load_guest_xcr0(vcpu); 4718 4719 atomic_set(&vcpu->guest_mode, 1); 4720 smp_wmb(); 4721 4722 local_irq_disable(); 4723 4724 if (!atomic_read(&vcpu->guest_mode) || vcpu->requests 4725 || need_resched() || signal_pending(current)) { 4726 atomic_set(&vcpu->guest_mode, 0); 4727 smp_wmb(); 4728 local_irq_enable(); 4729 preempt_enable(); 4730 r = 1; 4731 goto out; 4732 } 4733 4734 inject_pending_event(vcpu); 4735 4736 /* enable NMI/IRQ window open exits if needed */ 4737 if (vcpu->arch.nmi_pending) 4738 kvm_x86_ops->enable_nmi_window(vcpu); 4739 else if (kvm_cpu_has_interrupt(vcpu) || req_int_win) 4740 kvm_x86_ops->enable_irq_window(vcpu); 4741 4742 if (kvm_lapic_enabled(vcpu)) { 4743 update_cr8_intercept(vcpu); 4744 kvm_lapic_sync_to_vapic(vcpu); 4745 } 4746 4747 srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx); 4748 4749 kvm_guest_enter(); 4750 4751 if (unlikely(vcpu->arch.switch_db_regs)) { 4752 set_debugreg(0, 7); 4753 set_debugreg(vcpu->arch.eff_db[0], 0); 4754 set_debugreg(vcpu->arch.eff_db[1], 1); 4755 set_debugreg(vcpu->arch.eff_db[2], 2); 4756 set_debugreg(vcpu->arch.eff_db[3], 3); 4757 } 4758 4759 trace_kvm_entry(vcpu->vcpu_id); 4760 kvm_x86_ops->run(vcpu); 4761 4762 /* 4763 * If the guest has used debug registers, at least dr7 4764 * will be disabled while returning to the host. 4765 * If we don't have active breakpoints in the host, we don't 4766 * care about the messed up debug address registers. But if 4767 * we have some of them active, restore the old state. 4768 */ 4769 if (hw_breakpoint_active()) 4770 hw_breakpoint_restore(); 4771 4772 atomic_set(&vcpu->guest_mode, 0); 4773 smp_wmb(); 4774 local_irq_enable(); 4775 4776 ++vcpu->stat.exits; 4777 4778 /* 4779 * We must have an instruction between local_irq_enable() and 4780 * kvm_guest_exit(), so the timer interrupt isn't delayed by 4781 * the interrupt shadow. The stat.exits increment will do nicely. 
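 * (On x86, local_irq_enable() is an sti, whose interrupt shadow keeps
 * interrupts blocked for one more instruction.)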
4782 * But we need to prevent reordering, hence this barrier(): 4783 */ 4784 barrier(); 4785 4786 kvm_guest_exit(); 4787 4788 preempt_enable(); 4789 4790 vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu); 4791 4792 /* 4793 * Profile KVM exit RIPs: 4794 */ 4795 if (unlikely(prof_on == KVM_PROFILING)) { 4796 unsigned long rip = kvm_rip_read(vcpu); 4797 profile_hit(KVM_PROFILING, (void *)rip); 4798 } 4799 4800 4801 kvm_lapic_sync_from_vapic(vcpu); 4802 4803 r = kvm_x86_ops->handle_exit(vcpu); 4804 out: 4805 return r; 4806 } 4807 4808 4809 static int __vcpu_run(struct kvm_vcpu *vcpu) 4810 { 4811 int r; 4812 struct kvm *kvm = vcpu->kvm; 4813 4814 if (unlikely(vcpu->arch.mp_state == KVM_MP_STATE_SIPI_RECEIVED)) { 4815 pr_debug("vcpu %d received sipi with vector # %x\n", 4816 vcpu->vcpu_id, vcpu->arch.sipi_vector); 4817 kvm_lapic_reset(vcpu); 4818 r = kvm_arch_vcpu_reset(vcpu); 4819 if (r) 4820 return r; 4821 vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; 4822 } 4823 4824 vcpu->srcu_idx = srcu_read_lock(&kvm->srcu); 4825 vapic_enter(vcpu); 4826 4827 r = 1; 4828 while (r > 0) { 4829 if (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE) 4830 r = vcpu_enter_guest(vcpu); 4831 else { 4832 srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx); 4833 kvm_vcpu_block(vcpu); 4834 vcpu->srcu_idx = srcu_read_lock(&kvm->srcu); 4835 if (kvm_check_request(KVM_REQ_UNHALT, vcpu)) 4836 { 4837 switch(vcpu->arch.mp_state) { 4838 case KVM_MP_STATE_HALTED: 4839 vcpu->arch.mp_state = 4840 KVM_MP_STATE_RUNNABLE; 4841 case KVM_MP_STATE_RUNNABLE: 4842 break; 4843 case KVM_MP_STATE_SIPI_RECEIVED: 4844 default: 4845 r = -EINTR; 4846 break; 4847 } 4848 } 4849 } 4850 4851 if (r <= 0) 4852 break; 4853 4854 clear_bit(KVM_REQ_PENDING_TIMER, &vcpu->requests); 4855 if (kvm_cpu_has_pending_timer(vcpu)) 4856 kvm_inject_pending_timer_irqs(vcpu); 4857 4858 if (dm_request_for_irq_injection(vcpu)) { 4859 r = -EINTR; 4860 vcpu->run->exit_reason = KVM_EXIT_INTR; 4861 ++vcpu->stat.request_irq_exits; 4862 } 4863 if (signal_pending(current)) { 4864 r = -EINTR; 4865 vcpu->run->exit_reason = KVM_EXIT_INTR; 4866 ++vcpu->stat.signal_exits; 4867 } 4868 if (need_resched()) { 4869 srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx); 4870 kvm_resched(vcpu); 4871 vcpu->srcu_idx = srcu_read_lock(&kvm->srcu); 4872 } 4873 } 4874 4875 srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx); 4876 4877 vapic_exit(vcpu); 4878 4879 return r; 4880 } 4881 4882 int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) 4883 { 4884 int r; 4885 sigset_t sigsaved; 4886 4887 if (vcpu->sigset_active) 4888 sigprocmask(SIG_SETMASK, &vcpu->sigset, &sigsaved); 4889 4890 if (unlikely(vcpu->arch.mp_state == KVM_MP_STATE_UNINITIALIZED)) { 4891 kvm_vcpu_block(vcpu); 4892 clear_bit(KVM_REQ_UNHALT, &vcpu->requests); 4893 r = -EAGAIN; 4894 goto out; 4895 } 4896 4897 /* re-sync apic's tpr */ 4898 if (!irqchip_in_kernel(vcpu->kvm)) 4899 kvm_set_cr8(vcpu, kvm_run->cr8); 4900 4901 if (vcpu->arch.pio.count || vcpu->mmio_needed || 4902 vcpu->arch.emulate_ctxt.restart) { 4903 if (vcpu->mmio_needed) { 4904 memcpy(vcpu->mmio_data, kvm_run->mmio.data, 8); 4905 vcpu->mmio_read_completed = 1; 4906 vcpu->mmio_needed = 0; 4907 } 4908 vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu); 4909 r = emulate_instruction(vcpu, 0, 0, EMULTYPE_NO_DECODE); 4910 srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx); 4911 if (r != EMULATE_DONE) { 4912 r = 0; 4913 goto out; 4914 } 4915 } 4916 if (kvm_run->exit_reason == KVM_EXIT_HYPERCALL) 4917 kvm_register_write(vcpu, VCPU_REGS_RAX, 4918 kvm_run->hypercall.ret); 4919 4920 r 
= __vcpu_run(vcpu); 4921 4922 out: 4923 post_kvm_run_save(vcpu); 4924 if (vcpu->sigset_active) 4925 sigprocmask(SIG_SETMASK, &sigsaved, NULL); 4926 4927 return r; 4928 } 4929 4930 int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) 4931 { 4932 regs->rax = kvm_register_read(vcpu, VCPU_REGS_RAX); 4933 regs->rbx = kvm_register_read(vcpu, VCPU_REGS_RBX); 4934 regs->rcx = kvm_register_read(vcpu, VCPU_REGS_RCX); 4935 regs->rdx = kvm_register_read(vcpu, VCPU_REGS_RDX); 4936 regs->rsi = kvm_register_read(vcpu, VCPU_REGS_RSI); 4937 regs->rdi = kvm_register_read(vcpu, VCPU_REGS_RDI); 4938 regs->rsp = kvm_register_read(vcpu, VCPU_REGS_RSP); 4939 regs->rbp = kvm_register_read(vcpu, VCPU_REGS_RBP); 4940 #ifdef CONFIG_X86_64 4941 regs->r8 = kvm_register_read(vcpu, VCPU_REGS_R8); 4942 regs->r9 = kvm_register_read(vcpu, VCPU_REGS_R9); 4943 regs->r10 = kvm_register_read(vcpu, VCPU_REGS_R10); 4944 regs->r11 = kvm_register_read(vcpu, VCPU_REGS_R11); 4945 regs->r12 = kvm_register_read(vcpu, VCPU_REGS_R12); 4946 regs->r13 = kvm_register_read(vcpu, VCPU_REGS_R13); 4947 regs->r14 = kvm_register_read(vcpu, VCPU_REGS_R14); 4948 regs->r15 = kvm_register_read(vcpu, VCPU_REGS_R15); 4949 #endif 4950 4951 regs->rip = kvm_rip_read(vcpu); 4952 regs->rflags = kvm_get_rflags(vcpu); 4953 4954 return 0; 4955 } 4956 4957 int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) 4958 { 4959 kvm_register_write(vcpu, VCPU_REGS_RAX, regs->rax); 4960 kvm_register_write(vcpu, VCPU_REGS_RBX, regs->rbx); 4961 kvm_register_write(vcpu, VCPU_REGS_RCX, regs->rcx); 4962 kvm_register_write(vcpu, VCPU_REGS_RDX, regs->rdx); 4963 kvm_register_write(vcpu, VCPU_REGS_RSI, regs->rsi); 4964 kvm_register_write(vcpu, VCPU_REGS_RDI, regs->rdi); 4965 kvm_register_write(vcpu, VCPU_REGS_RSP, regs->rsp); 4966 kvm_register_write(vcpu, VCPU_REGS_RBP, regs->rbp); 4967 #ifdef CONFIG_X86_64 4968 kvm_register_write(vcpu, VCPU_REGS_R8, regs->r8); 4969 kvm_register_write(vcpu, VCPU_REGS_R9, regs->r9); 4970 kvm_register_write(vcpu, VCPU_REGS_R10, regs->r10); 4971 kvm_register_write(vcpu, VCPU_REGS_R11, regs->r11); 4972 kvm_register_write(vcpu, VCPU_REGS_R12, regs->r12); 4973 kvm_register_write(vcpu, VCPU_REGS_R13, regs->r13); 4974 kvm_register_write(vcpu, VCPU_REGS_R14, regs->r14); 4975 kvm_register_write(vcpu, VCPU_REGS_R15, regs->r15); 4976 #endif 4977 4978 kvm_rip_write(vcpu, regs->rip); 4979 kvm_set_rflags(vcpu, regs->rflags); 4980 4981 vcpu->arch.exception.pending = false; 4982 4983 return 0; 4984 } 4985 4986 void kvm_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l) 4987 { 4988 struct kvm_segment cs; 4989 4990 kvm_get_segment(vcpu, &cs, VCPU_SREG_CS); 4991 *db = cs.db; 4992 *l = cs.l; 4993 } 4994 EXPORT_SYMBOL_GPL(kvm_get_cs_db_l_bits); 4995 4996 int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu, 4997 struct kvm_sregs *sregs) 4998 { 4999 struct desc_ptr dt; 5000 5001 kvm_get_segment(vcpu, &sregs->cs, VCPU_SREG_CS); 5002 kvm_get_segment(vcpu, &sregs->ds, VCPU_SREG_DS); 5003 kvm_get_segment(vcpu, &sregs->es, VCPU_SREG_ES); 5004 kvm_get_segment(vcpu, &sregs->fs, VCPU_SREG_FS); 5005 kvm_get_segment(vcpu, &sregs->gs, VCPU_SREG_GS); 5006 kvm_get_segment(vcpu, &sregs->ss, VCPU_SREG_SS); 5007 5008 kvm_get_segment(vcpu, &sregs->tr, VCPU_SREG_TR); 5009 kvm_get_segment(vcpu, &sregs->ldt, VCPU_SREG_LDTR); 5010 5011 kvm_x86_ops->get_idt(vcpu, &dt); 5012 sregs->idt.limit = dt.size; 5013 sregs->idt.base = dt.address; 5014 kvm_x86_ops->get_gdt(vcpu, &dt); 5015 sregs->gdt.limit = dt.size; 5016 sregs->gdt.base = 
dt.address; 5017 5018 sregs->cr0 = kvm_read_cr0(vcpu); 5019 sregs->cr2 = vcpu->arch.cr2; 5020 sregs->cr3 = vcpu->arch.cr3; 5021 sregs->cr4 = kvm_read_cr4(vcpu); 5022 sregs->cr8 = kvm_get_cr8(vcpu); 5023 sregs->efer = vcpu->arch.efer; 5024 sregs->apic_base = kvm_get_apic_base(vcpu); 5025 5026 memset(sregs->interrupt_bitmap, 0, sizeof sregs->interrupt_bitmap); 5027 5028 if (vcpu->arch.interrupt.pending && !vcpu->arch.interrupt.soft) 5029 set_bit(vcpu->arch.interrupt.nr, 5030 (unsigned long *)sregs->interrupt_bitmap); 5031 5032 return 0; 5033 } 5034 5035 int kvm_arch_vcpu_ioctl_get_mpstate(struct kvm_vcpu *vcpu, 5036 struct kvm_mp_state *mp_state) 5037 { 5038 mp_state->mp_state = vcpu->arch.mp_state; 5039 return 0; 5040 } 5041 5042 int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu, 5043 struct kvm_mp_state *mp_state) 5044 { 5045 vcpu->arch.mp_state = mp_state->mp_state; 5046 return 0; 5047 } 5048 5049 int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason, 5050 bool has_error_code, u32 error_code) 5051 { 5052 struct decode_cache *c = &vcpu->arch.emulate_ctxt.decode; 5053 int cs_db, cs_l, ret; 5054 cache_all_regs(vcpu); 5055 5056 kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l); 5057 5058 vcpu->arch.emulate_ctxt.vcpu = vcpu; 5059 vcpu->arch.emulate_ctxt.eflags = kvm_x86_ops->get_rflags(vcpu); 5060 vcpu->arch.emulate_ctxt.eip = kvm_rip_read(vcpu); 5061 vcpu->arch.emulate_ctxt.mode = 5062 (!is_protmode(vcpu)) ? X86EMUL_MODE_REAL : 5063 (vcpu->arch.emulate_ctxt.eflags & X86_EFLAGS_VM) 5064 ? X86EMUL_MODE_VM86 : cs_l 5065 ? X86EMUL_MODE_PROT64 : cs_db 5066 ? X86EMUL_MODE_PROT32 : X86EMUL_MODE_PROT16; 5067 memset(c, 0, sizeof(struct decode_cache)); 5068 memcpy(c->regs, vcpu->arch.regs, sizeof c->regs); 5069 5070 ret = emulator_task_switch(&vcpu->arch.emulate_ctxt, &emulate_ops, 5071 tss_selector, reason, has_error_code, 5072 error_code); 5073 5074 if (ret) 5075 return EMULATE_FAIL; 5076 5077 memcpy(vcpu->arch.regs, c->regs, sizeof c->regs); 5078 kvm_rip_write(vcpu, vcpu->arch.emulate_ctxt.eip); 5079 kvm_x86_ops->set_rflags(vcpu, vcpu->arch.emulate_ctxt.eflags); 5080 return EMULATE_DONE; 5081 } 5082 EXPORT_SYMBOL_GPL(kvm_task_switch); 5083 5084 int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu, 5085 struct kvm_sregs *sregs) 5086 { 5087 int mmu_reset_needed = 0; 5088 int pending_vec, max_bits; 5089 struct desc_ptr dt; 5090 5091 dt.size = sregs->idt.limit; 5092 dt.address = sregs->idt.base; 5093 kvm_x86_ops->set_idt(vcpu, &dt); 5094 dt.size = sregs->gdt.limit; 5095 dt.address = sregs->gdt.base; 5096 kvm_x86_ops->set_gdt(vcpu, &dt); 5097 5098 vcpu->arch.cr2 = sregs->cr2; 5099 mmu_reset_needed |= vcpu->arch.cr3 != sregs->cr3; 5100 vcpu->arch.cr3 = sregs->cr3; 5101 5102 kvm_set_cr8(vcpu, sregs->cr8); 5103 5104 mmu_reset_needed |= vcpu->arch.efer != sregs->efer; 5105 kvm_x86_ops->set_efer(vcpu, sregs->efer); 5106 kvm_set_apic_base(vcpu, sregs->apic_base); 5107 5108 mmu_reset_needed |= kvm_read_cr0(vcpu) != sregs->cr0; 5109 kvm_x86_ops->set_cr0(vcpu, sregs->cr0); 5110 vcpu->arch.cr0 = sregs->cr0; 5111 5112 mmu_reset_needed |= kvm_read_cr4(vcpu) != sregs->cr4; 5113 kvm_x86_ops->set_cr4(vcpu, sregs->cr4); 5114 if (!is_long_mode(vcpu) && is_pae(vcpu)) { 5115 load_pdptrs(vcpu, vcpu->arch.cr3); 5116 mmu_reset_needed = 1; 5117 } 5118 5119 if (mmu_reset_needed) 5120 kvm_mmu_reset_context(vcpu); 5121 5122 max_bits = (sizeof sregs->interrupt_bitmap) << 3; 5123 pending_vec = find_first_bit( 5124 (const unsigned long *)sregs->interrupt_bitmap, max_bits); 5125 if (pending_vec < 
max_bits) { 5126 kvm_queue_interrupt(vcpu, pending_vec, false); 5127 pr_debug("Set back pending irq %d\n", pending_vec); 5128 if (irqchip_in_kernel(vcpu->kvm)) 5129 kvm_pic_clear_isr_ack(vcpu->kvm); 5130 } 5131 5132 kvm_set_segment(vcpu, &sregs->cs, VCPU_SREG_CS); 5133 kvm_set_segment(vcpu, &sregs->ds, VCPU_SREG_DS); 5134 kvm_set_segment(vcpu, &sregs->es, VCPU_SREG_ES); 5135 kvm_set_segment(vcpu, &sregs->fs, VCPU_SREG_FS); 5136 kvm_set_segment(vcpu, &sregs->gs, VCPU_SREG_GS); 5137 kvm_set_segment(vcpu, &sregs->ss, VCPU_SREG_SS); 5138 5139 kvm_set_segment(vcpu, &sregs->tr, VCPU_SREG_TR); 5140 kvm_set_segment(vcpu, &sregs->ldt, VCPU_SREG_LDTR); 5141 5142 update_cr8_intercept(vcpu); 5143 5144 /* Older userspace won't unhalt the vcpu on reset. */ 5145 if (kvm_vcpu_is_bsp(vcpu) && kvm_rip_read(vcpu) == 0xfff0 && 5146 sregs->cs.selector == 0xf000 && sregs->cs.base == 0xffff0000 && 5147 !is_protmode(vcpu)) 5148 vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; 5149 5150 return 0; 5151 } 5152 5153 int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu, 5154 struct kvm_guest_debug *dbg) 5155 { 5156 unsigned long rflags; 5157 int i, r; 5158 5159 if (dbg->control & (KVM_GUESTDBG_INJECT_DB | KVM_GUESTDBG_INJECT_BP)) { 5160 r = -EBUSY; 5161 if (vcpu->arch.exception.pending) 5162 goto out; 5163 if (dbg->control & KVM_GUESTDBG_INJECT_DB) 5164 kvm_queue_exception(vcpu, DB_VECTOR); 5165 else 5166 kvm_queue_exception(vcpu, BP_VECTOR); 5167 } 5168 5169 /* 5170 * Read rflags as long as potentially injected trace flags are still 5171 * filtered out. 5172 */ 5173 rflags = kvm_get_rflags(vcpu); 5174 5175 vcpu->guest_debug = dbg->control; 5176 if (!(vcpu->guest_debug & KVM_GUESTDBG_ENABLE)) 5177 vcpu->guest_debug = 0; 5178 5179 if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP) { 5180 for (i = 0; i < KVM_NR_DB_REGS; ++i) 5181 vcpu->arch.eff_db[i] = dbg->arch.debugreg[i]; 5182 vcpu->arch.switch_db_regs = 5183 (dbg->arch.debugreg[7] & DR7_BP_EN_MASK); 5184 } else { 5185 for (i = 0; i < KVM_NR_DB_REGS; i++) 5186 vcpu->arch.eff_db[i] = vcpu->arch.db[i]; 5187 vcpu->arch.switch_db_regs = (vcpu->arch.dr7 & DR7_BP_EN_MASK); 5188 } 5189 5190 if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP) 5191 vcpu->arch.singlestep_rip = kvm_rip_read(vcpu) + 5192 get_segment_base(vcpu, VCPU_SREG_CS); 5193 5194 /* 5195 * Trigger an rflags update that will inject or remove the trace 5196 * flags. 5197 */ 5198 kvm_set_rflags(vcpu, rflags); 5199 5200 kvm_x86_ops->set_guest_debug(vcpu, dbg); 5201 5202 r = 0; 5203 5204 out: 5205 5206 return r; 5207 } 5208 5209 /* 5210 * Translate a guest virtual address to a guest physical address. 
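 * The lookup walks the guest page tables through the vcpu's MMU
 * (kvm_mmu_gva_to_gpa_system) under the kvm->srcu read lock; an
 * unmapped address yields UNMAPPED_GVA and is reported via tr->valid.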
5211 */ 5212 int kvm_arch_vcpu_ioctl_translate(struct kvm_vcpu *vcpu, 5213 struct kvm_translation *tr) 5214 { 5215 unsigned long vaddr = tr->linear_address; 5216 gpa_t gpa; 5217 int idx; 5218 5219 idx = srcu_read_lock(&vcpu->kvm->srcu); 5220 gpa = kvm_mmu_gva_to_gpa_system(vcpu, vaddr, NULL); 5221 srcu_read_unlock(&vcpu->kvm->srcu, idx); 5222 tr->physical_address = gpa; 5223 tr->valid = gpa != UNMAPPED_GVA; 5224 tr->writeable = 1; 5225 tr->usermode = 0; 5226 5227 return 0; 5228 } 5229 5230 int kvm_arch_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu) 5231 { 5232 struct i387_fxsave_struct *fxsave = 5233 &vcpu->arch.guest_fpu.state->fxsave; 5234 5235 memcpy(fpu->fpr, fxsave->st_space, 128); 5236 fpu->fcw = fxsave->cwd; 5237 fpu->fsw = fxsave->swd; 5238 fpu->ftwx = fxsave->twd; 5239 fpu->last_opcode = fxsave->fop; 5240 fpu->last_ip = fxsave->rip; 5241 fpu->last_dp = fxsave->rdp; 5242 memcpy(fpu->xmm, fxsave->xmm_space, sizeof fxsave->xmm_space); 5243 5244 return 0; 5245 } 5246 5247 int kvm_arch_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu) 5248 { 5249 struct i387_fxsave_struct *fxsave = 5250 &vcpu->arch.guest_fpu.state->fxsave; 5251 5252 memcpy(fxsave->st_space, fpu->fpr, 128); 5253 fxsave->cwd = fpu->fcw; 5254 fxsave->swd = fpu->fsw; 5255 fxsave->twd = fpu->ftwx; 5256 fxsave->fop = fpu->last_opcode; 5257 fxsave->rip = fpu->last_ip; 5258 fxsave->rdp = fpu->last_dp; 5259 memcpy(fxsave->xmm_space, fpu->xmm, sizeof fxsave->xmm_space); 5260 5261 return 0; 5262 } 5263 5264 int fx_init(struct kvm_vcpu *vcpu) 5265 { 5266 int err; 5267 5268 err = fpu_alloc(&vcpu->arch.guest_fpu); 5269 if (err) 5270 return err; 5271 5272 fpu_finit(&vcpu->arch.guest_fpu); 5273 5274 /* 5275 * Ensure guest xcr0 is valid for loading 5276 */ 5277 vcpu->arch.xcr0 = XSTATE_FP; 5278 5279 vcpu->arch.cr0 |= X86_CR0_ET; 5280 5281 return 0; 5282 } 5283 EXPORT_SYMBOL_GPL(fx_init); 5284 5285 static void fx_free(struct kvm_vcpu *vcpu) 5286 { 5287 fpu_free(&vcpu->arch.guest_fpu); 5288 } 5289 5290 void kvm_load_guest_fpu(struct kvm_vcpu *vcpu) 5291 { 5292 if (vcpu->guest_fpu_loaded) 5293 return; 5294 5295 /* 5296 * Restore all possible states in the guest, 5297 * and assume host would use all available bits. 5298 * Guest xcr0 would be loaded later. 
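 * kvm_put_guest_xcr0() below switches back to the host's xcr0, so the
 * full extended state is restored here; the guest's own xcr0 is loaded
 * again by kvm_load_guest_xcr0() before the next guest entry.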
5299 */ 5300 kvm_put_guest_xcr0(vcpu); 5301 vcpu->guest_fpu_loaded = 1; 5302 unlazy_fpu(current); 5303 fpu_restore_checking(&vcpu->arch.guest_fpu); 5304 trace_kvm_fpu(1); 5305 } 5306 5307 void kvm_put_guest_fpu(struct kvm_vcpu *vcpu) 5308 { 5309 kvm_put_guest_xcr0(vcpu); 5310 5311 if (!vcpu->guest_fpu_loaded) 5312 return; 5313 5314 vcpu->guest_fpu_loaded = 0; 5315 fpu_save_init(&vcpu->arch.guest_fpu); 5316 ++vcpu->stat.fpu_reload; 5317 kvm_make_request(KVM_REQ_DEACTIVATE_FPU, vcpu); 5318 trace_kvm_fpu(0); 5319 } 5320 5321 void kvm_arch_vcpu_free(struct kvm_vcpu *vcpu) 5322 { 5323 if (vcpu->arch.time_page) { 5324 kvm_release_page_dirty(vcpu->arch.time_page); 5325 vcpu->arch.time_page = NULL; 5326 } 5327 5328 free_cpumask_var(vcpu->arch.wbinvd_dirty_mask); 5329 fx_free(vcpu); 5330 kvm_x86_ops->vcpu_free(vcpu); 5331 } 5332 5333 struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm, 5334 unsigned int id) 5335 { 5336 return kvm_x86_ops->vcpu_create(kvm, id); 5337 } 5338 5339 int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu) 5340 { 5341 int r; 5342 5343 vcpu->arch.mtrr_state.have_fixed = 1; 5344 vcpu_load(vcpu); 5345 r = kvm_arch_vcpu_reset(vcpu); 5346 if (r == 0) 5347 r = kvm_mmu_setup(vcpu); 5348 vcpu_put(vcpu); 5349 if (r < 0) 5350 goto free_vcpu; 5351 5352 return 0; 5353 free_vcpu: 5354 kvm_x86_ops->vcpu_free(vcpu); 5355 return r; 5356 } 5357 5358 void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu) 5359 { 5360 vcpu_load(vcpu); 5361 kvm_mmu_unload(vcpu); 5362 vcpu_put(vcpu); 5363 5364 fx_free(vcpu); 5365 kvm_x86_ops->vcpu_free(vcpu); 5366 } 5367 5368 int kvm_arch_vcpu_reset(struct kvm_vcpu *vcpu) 5369 { 5370 vcpu->arch.nmi_pending = false; 5371 vcpu->arch.nmi_injected = false; 5372 5373 vcpu->arch.switch_db_regs = 0; 5374 memset(vcpu->arch.db, 0, sizeof(vcpu->arch.db)); 5375 vcpu->arch.dr6 = DR6_FIXED_1; 5376 vcpu->arch.dr7 = DR7_FIXED_1; 5377 5378 return kvm_x86_ops->vcpu_reset(vcpu); 5379 } 5380 5381 int kvm_arch_hardware_enable(void *garbage) 5382 { 5383 /* 5384 * Since this may be called from a hotplug notification, 5385 * we can't get the CPU frequency directly.
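 * Clearing cpu_tsc_khz below forces the per-cpu TSC frequency to be
 * recomputed lazily, once the CPU is back online and the frequency
 * can be queried again.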
5386 */ 5387 if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) { 5388 int cpu = raw_smp_processor_id(); 5389 per_cpu(cpu_tsc_khz, cpu) = 0; 5390 } 5391 5392 kvm_shared_msr_cpu_online(); 5393 5394 return kvm_x86_ops->hardware_enable(garbage); 5395 } 5396 5397 void kvm_arch_hardware_disable(void *garbage) 5398 { 5399 kvm_x86_ops->hardware_disable(garbage); 5400 drop_user_return_notifiers(garbage); 5401 } 5402 5403 int kvm_arch_hardware_setup(void) 5404 { 5405 return kvm_x86_ops->hardware_setup(); 5406 } 5407 5408 void kvm_arch_hardware_unsetup(void) 5409 { 5410 kvm_x86_ops->hardware_unsetup(); 5411 } 5412 5413 void kvm_arch_check_processor_compat(void *rtn) 5414 { 5415 kvm_x86_ops->check_processor_compatibility(rtn); 5416 } 5417 5418 int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu) 5419 { 5420 struct page *page; 5421 struct kvm *kvm; 5422 int r; 5423 5424 BUG_ON(vcpu->kvm == NULL); 5425 kvm = vcpu->kvm; 5426 5427 vcpu->arch.mmu.root_hpa = INVALID_PAGE; 5428 if (!irqchip_in_kernel(kvm) || kvm_vcpu_is_bsp(vcpu)) 5429 vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; 5430 else 5431 vcpu->arch.mp_state = KVM_MP_STATE_UNINITIALIZED; 5432 5433 page = alloc_page(GFP_KERNEL | __GFP_ZERO); 5434 if (!page) { 5435 r = -ENOMEM; 5436 goto fail; 5437 } 5438 vcpu->arch.pio_data = page_address(page); 5439 5440 r = kvm_mmu_create(vcpu); 5441 if (r < 0) 5442 goto fail_free_pio_data; 5443 5444 if (irqchip_in_kernel(kvm)) { 5445 r = kvm_create_lapic(vcpu); 5446 if (r < 0) 5447 goto fail_mmu_destroy; 5448 } 5449 5450 vcpu->arch.mce_banks = kzalloc(KVM_MAX_MCE_BANKS * sizeof(u64) * 4, 5451 GFP_KERNEL); 5452 if (!vcpu->arch.mce_banks) { 5453 r = -ENOMEM; 5454 goto fail_free_lapic; 5455 } 5456 vcpu->arch.mcg_cap = KVM_MAX_MCE_BANKS; 5457 5458 if (!zalloc_cpumask_var(&vcpu->arch.wbinvd_dirty_mask, GFP_KERNEL)) 5459 goto fail_free_mce_banks; 5460 5461 return 0; 5462 fail_free_mce_banks: 5463 kfree(vcpu->arch.mce_banks); 5464 fail_free_lapic: 5465 kvm_free_lapic(vcpu); 5466 fail_mmu_destroy: 5467 kvm_mmu_destroy(vcpu); 5468 fail_free_pio_data: 5469 free_page((unsigned long)vcpu->arch.pio_data); 5470 fail: 5471 return r; 5472 } 5473 5474 void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu) 5475 { 5476 int idx; 5477 5478 kfree(vcpu->arch.mce_banks); 5479 kvm_free_lapic(vcpu); 5480 idx = srcu_read_lock(&vcpu->kvm->srcu); 5481 kvm_mmu_destroy(vcpu); 5482 srcu_read_unlock(&vcpu->kvm->srcu, idx); 5483 free_page((unsigned long)vcpu->arch.pio_data); 5484 } 5485 5486 struct kvm *kvm_arch_create_vm(void) 5487 { 5488 struct kvm *kvm = kzalloc(sizeof(struct kvm), GFP_KERNEL); 5489 5490 if (!kvm) 5491 return ERR_PTR(-ENOMEM); 5492 5493 INIT_LIST_HEAD(&kvm->arch.active_mmu_pages); 5494 INIT_LIST_HEAD(&kvm->arch.assigned_dev_head); 5495 5496 /* Reserve bit 0 of irq_sources_bitmap for userspace irq source */ 5497 set_bit(KVM_USERSPACE_IRQ_SOURCE_ID, &kvm->arch.irq_sources_bitmap); 5498 5499 rdtscll(kvm->arch.vm_init_tsc); 5500 5501 return kvm; 5502 } 5503 5504 static void kvm_unload_vcpu_mmu(struct kvm_vcpu *vcpu) 5505 { 5506 vcpu_load(vcpu); 5507 kvm_mmu_unload(vcpu); 5508 vcpu_put(vcpu); 5509 } 5510 5511 static void kvm_free_vcpus(struct kvm *kvm) 5512 { 5513 unsigned int i; 5514 struct kvm_vcpu *vcpu; 5515 5516 /* 5517 * Unpin any mmu pages first. 
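 * Each vcpu may still hold references to shadow page-table roots, so
 * unload every vcpu's MMU before the vcpus themselves (and with them
 * the shadow pages) are freed.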
5518 */ 5519 kvm_for_each_vcpu(i, vcpu, kvm) 5520 kvm_unload_vcpu_mmu(vcpu); 5521 kvm_for_each_vcpu(i, vcpu, kvm) 5522 kvm_arch_vcpu_free(vcpu); 5523 5524 mutex_lock(&kvm->lock); 5525 for (i = 0; i < atomic_read(&kvm->online_vcpus); i++) 5526 kvm->vcpus[i] = NULL; 5527 5528 atomic_set(&kvm->online_vcpus, 0); 5529 mutex_unlock(&kvm->lock); 5530 } 5531 5532 void kvm_arch_sync_events(struct kvm *kvm) 5533 { 5534 kvm_free_all_assigned_devices(kvm); 5535 kvm_free_pit(kvm); 5536 } 5537 5538 void kvm_arch_destroy_vm(struct kvm *kvm) 5539 { 5540 kvm_iommu_unmap_guest(kvm); 5541 kfree(kvm->arch.vpic); 5542 kfree(kvm->arch.vioapic); 5543 kvm_free_vcpus(kvm); 5544 kvm_free_physmem(kvm); 5545 if (kvm->arch.apic_access_page) 5546 put_page(kvm->arch.apic_access_page); 5547 if (kvm->arch.ept_identity_pagetable) 5548 put_page(kvm->arch.ept_identity_pagetable); 5549 cleanup_srcu_struct(&kvm->srcu); 5550 kfree(kvm); 5551 } 5552 5553 int kvm_arch_prepare_memory_region(struct kvm *kvm, 5554 struct kvm_memory_slot *memslot, 5555 struct kvm_memory_slot old, 5556 struct kvm_userspace_memory_region *mem, 5557 int user_alloc) 5558 { 5559 int npages = memslot->npages; 5560 int map_flags = MAP_PRIVATE | MAP_ANONYMOUS; 5561 5562 /* Prevent internal slot pages from being moved by fork()/COW. */ 5563 if (memslot->id >= KVM_MEMORY_SLOTS) 5564 map_flags = MAP_SHARED | MAP_ANONYMOUS; 5565 5566 /* To keep backward compatibility with older userspace, 5567 * x86 needs to handle the !user_alloc case. 5568 */ 5569 if (!user_alloc) { 5570 if (npages && !old.rmap) { 5571 unsigned long userspace_addr; 5572 5573 down_write(&current->mm->mmap_sem); 5574 userspace_addr = do_mmap(NULL, 0, 5575 npages * PAGE_SIZE, 5576 PROT_READ | PROT_WRITE, 5577 map_flags, 5578 0); 5579 up_write(&current->mm->mmap_sem); 5580 5581 if (IS_ERR((void *)userspace_addr)) 5582 return PTR_ERR((void *)userspace_addr); 5583 5584 memslot->userspace_addr = userspace_addr; 5585 } 5586 } 5587 5588 5589 return 0; 5590 } 5591 5592 void kvm_arch_commit_memory_region(struct kvm *kvm, 5593 struct kvm_userspace_memory_region *mem, 5594 struct kvm_memory_slot old, 5595 int user_alloc) 5596 { 5597 5598 int npages = mem->memory_size >> PAGE_SHIFT; 5599 5600 if (!user_alloc && !old.user_alloc && old.rmap && !npages) { 5601 int ret; 5602 5603 down_write(&current->mm->mmap_sem); 5604 ret = do_munmap(current->mm, old.userspace_addr, 5605 old.npages * PAGE_SIZE); 5606 up_write(&current->mm->mmap_sem); 5607 if (ret < 0) 5608 printk(KERN_WARNING 5609 "kvm_vm_ioctl_set_memory_region: " 5610 "failed to munmap memory\n"); 5611 } 5612 5613 spin_lock(&kvm->mmu_lock); 5614 if (!kvm->arch.n_requested_mmu_pages) { 5615 unsigned int nr_mmu_pages = kvm_mmu_calculate_mmu_pages(kvm); 5616 kvm_mmu_change_mmu_pages(kvm, nr_mmu_pages); 5617 } 5618 5619 kvm_mmu_slot_remove_write_access(kvm, mem->slot); 5620 spin_unlock(&kvm->mmu_lock); 5621 } 5622 5623 void kvm_arch_flush_shadow(struct kvm *kvm) 5624 { 5625 kvm_mmu_zap_all(kvm); 5626 kvm_reload_remote_mmus(kvm); 5627 } 5628 5629 int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu) 5630 { 5631 return vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE 5632 || vcpu->arch.mp_state == KVM_MP_STATE_SIPI_RECEIVED 5633 || vcpu->arch.nmi_pending || 5634 (kvm_arch_interrupt_allowed(vcpu) && 5635 kvm_cpu_has_interrupt(vcpu)); 5636 } 5637 5638 void kvm_vcpu_kick(struct kvm_vcpu *vcpu) 5639 { 5640 int me; 5641 int cpu = vcpu->cpu; 5642 5643 if (waitqueue_active(&vcpu->wq)) { 5644 wake_up_interruptible(&vcpu->wq); 5645 ++vcpu->stat.halt_wakeup; 5646 } 5647 5648 me = get_cpu(); 5649 if (cpu != me
&& (unsigned)cpu < nr_cpu_ids && cpu_online(cpu)) 5650 if (atomic_xchg(&vcpu->guest_mode, 0)) 5651 smp_send_reschedule(cpu); 5652 put_cpu(); 5653 } 5654 5655 int kvm_arch_interrupt_allowed(struct kvm_vcpu *vcpu) 5656 { 5657 return kvm_x86_ops->interrupt_allowed(vcpu); 5658 } 5659 5660 bool kvm_is_linear_rip(struct kvm_vcpu *vcpu, unsigned long linear_rip) 5661 { 5662 unsigned long current_rip = kvm_rip_read(vcpu) + 5663 get_segment_base(vcpu, VCPU_SREG_CS); 5664 5665 return current_rip == linear_rip; 5666 } 5667 EXPORT_SYMBOL_GPL(kvm_is_linear_rip); 5668 5669 unsigned long kvm_get_rflags(struct kvm_vcpu *vcpu) 5670 { 5671 unsigned long rflags; 5672 5673 rflags = kvm_x86_ops->get_rflags(vcpu); 5674 if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP) 5675 rflags &= ~X86_EFLAGS_TF; 5676 return rflags; 5677 } 5678 EXPORT_SYMBOL_GPL(kvm_get_rflags); 5679 5680 void kvm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags) 5681 { 5682 if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP && 5683 kvm_is_linear_rip(vcpu, vcpu->arch.singlestep_rip)) 5684 rflags |= X86_EFLAGS_TF; 5685 kvm_x86_ops->set_rflags(vcpu, rflags); 5686 } 5687 EXPORT_SYMBOL_GPL(kvm_set_rflags); 5688 5689 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_exit); 5690 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_inj_virq); 5691 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_page_fault); 5692 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_msr); 5693 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_cr); 5694 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_vmrun); 5695 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_vmexit); 5696 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_vmexit_inject); 5697 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_intr_vmexit); 5698 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_invlpga); 5699 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_skinit); 5700 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_intercepts); 5701