/*
 * Kernel-based Virtual Machine driver for Linux
 *
 * derived from drivers/kvm/kvm_main.c
 *
 * Copyright (C) 2006 Qumranet, Inc.
 * Copyright (C) 2008 Qumranet, Inc.
 * Copyright IBM Corporation, 2008
 *
 * Authors:
 *   Avi Kivity   <avi@qumranet.com>
 *   Yaniv Kamay  <yaniv@qumranet.com>
 *   Amit Shah    <amit.shah@qumranet.com>
 *   Ben-Ami Yassour <benami@il.ibm.com>
 *
 * This work is licensed under the terms of the GNU GPL, version 2.  See
 * the COPYING file in the top-level directory.
 *
 */

#include <linux/kvm_host.h>
#include "irq.h"
#include "mmu.h"
#include "i8254.h"
#include "tss.h"
#include "kvm_cache_regs.h"
#include "x86.h"

#include <linux/clocksource.h>
#include <linux/interrupt.h>
#include <linux/kvm.h>
#include <linux/fs.h>
#include <linux/vmalloc.h>
#include <linux/module.h>
#include <linux/mman.h>
#include <linux/highmem.h>
#include <linux/iommu.h>
#include <linux/intel-iommu.h>

#include <asm/uaccess.h>
#include <asm/msr.h>
#include <asm/desc.h>
#include <asm/mtrr.h>

#define MAX_IO_MSRS 256
#define CR0_RESERVED_BITS						\
	(~(unsigned long)(X86_CR0_PE | X86_CR0_MP | X86_CR0_EM | X86_CR0_TS \
			  | X86_CR0_ET | X86_CR0_NE | X86_CR0_WP | X86_CR0_AM \
			  | X86_CR0_NW | X86_CR0_CD | X86_CR0_PG))
#define CR4_RESERVED_BITS						\
	(~(unsigned long)(X86_CR4_VME | X86_CR4_PVI | X86_CR4_TSD | X86_CR4_DE\
			  | X86_CR4_PSE | X86_CR4_PAE | X86_CR4_MCE	\
			  | X86_CR4_PGE | X86_CR4_PCE | X86_CR4_OSFXSR	\
			  | X86_CR4_OSXMMEXCPT | X86_CR4_VMXE))

#define CR8_RESERVED_BITS (~(unsigned long)X86_CR8_TPR)
/* EFER defaults:
 * - enable syscall per default because it's emulated by KVM
 * - enable LME and LMA per default on 64 bit KVM
 */
#ifdef CONFIG_X86_64
static u64 __read_mostly efer_reserved_bits = 0xfffffffffffffafeULL;
#else
static u64 __read_mostly efer_reserved_bits = 0xfffffffffffffffeULL;
#endif

#define VM_STAT(x) offsetof(struct kvm, stat.x), KVM_STAT_VM
#define VCPU_STAT(x) offsetof(struct kvm_vcpu, stat.x), KVM_STAT_VCPU

static int kvm_dev_ioctl_get_supported_cpuid(struct kvm_cpuid2 *cpuid,
				    struct kvm_cpuid_entry2 __user *entries);

struct kvm_x86_ops *kvm_x86_ops;
EXPORT_SYMBOL_GPL(kvm_x86_ops);

struct kvm_stats_debugfs_item debugfs_entries[] = {
	{ "pf_fixed", VCPU_STAT(pf_fixed) },
	{ "pf_guest", VCPU_STAT(pf_guest) },
	{ "tlb_flush", VCPU_STAT(tlb_flush) },
	{ "invlpg", VCPU_STAT(invlpg) },
	{ "exits", VCPU_STAT(exits) },
	{ "io_exits", VCPU_STAT(io_exits) },
	{ "mmio_exits", VCPU_STAT(mmio_exits) },
	{ "signal_exits", VCPU_STAT(signal_exits) },
	{ "irq_window", VCPU_STAT(irq_window_exits) },
	{ "nmi_window", VCPU_STAT(nmi_window_exits) },
	{ "halt_exits", VCPU_STAT(halt_exits) },
	{ "halt_wakeup", VCPU_STAT(halt_wakeup) },
	{ "hypercalls", VCPU_STAT(hypercalls) },
	{ "request_irq", VCPU_STAT(request_irq_exits) },
	{ "request_nmi", VCPU_STAT(request_nmi_exits) },
	{ "irq_exits", VCPU_STAT(irq_exits) },
	{ "host_state_reload", VCPU_STAT(host_state_reload) },
	{ "efer_reload", VCPU_STAT(efer_reload) },
	{ "fpu_reload", VCPU_STAT(fpu_reload) },
	{ "insn_emulation", VCPU_STAT(insn_emulation) },
	{ "insn_emulation_fail", VCPU_STAT(insn_emulation_fail) },
	{ "irq_injections", VCPU_STAT(irq_injections) },
	{ "nmi_injections", VCPU_STAT(nmi_injections) },
	{ "mmu_shadow_zapped", VM_STAT(mmu_shadow_zapped) },
	{ "mmu_pte_write", VM_STAT(mmu_pte_write) },
	{
"mmu_pte_updated", VM_STAT(mmu_pte_updated) }, 103 { "mmu_pde_zapped", VM_STAT(mmu_pde_zapped) }, 104 { "mmu_flooded", VM_STAT(mmu_flooded) }, 105 { "mmu_recycled", VM_STAT(mmu_recycled) }, 106 { "mmu_cache_miss", VM_STAT(mmu_cache_miss) }, 107 { "mmu_unsync", VM_STAT(mmu_unsync) }, 108 { "mmu_unsync_global", VM_STAT(mmu_unsync_global) }, 109 { "remote_tlb_flush", VM_STAT(remote_tlb_flush) }, 110 { "largepages", VM_STAT(lpages) }, 111 { NULL } 112 }; 113 114 unsigned long segment_base(u16 selector) 115 { 116 struct descriptor_table gdt; 117 struct desc_struct *d; 118 unsigned long table_base; 119 unsigned long v; 120 121 if (selector == 0) 122 return 0; 123 124 asm("sgdt %0" : "=m"(gdt)); 125 table_base = gdt.base; 126 127 if (selector & 4) { /* from ldt */ 128 u16 ldt_selector; 129 130 asm("sldt %0" : "=g"(ldt_selector)); 131 table_base = segment_base(ldt_selector); 132 } 133 d = (struct desc_struct *)(table_base + (selector & ~7)); 134 v = d->base0 | ((unsigned long)d->base1 << 16) | 135 ((unsigned long)d->base2 << 24); 136 #ifdef CONFIG_X86_64 137 if (d->s == 0 && (d->type == 2 || d->type == 9 || d->type == 11)) 138 v |= ((unsigned long)((struct ldttss_desc64 *)d)->base3) << 32; 139 #endif 140 return v; 141 } 142 EXPORT_SYMBOL_GPL(segment_base); 143 144 u64 kvm_get_apic_base(struct kvm_vcpu *vcpu) 145 { 146 if (irqchip_in_kernel(vcpu->kvm)) 147 return vcpu->arch.apic_base; 148 else 149 return vcpu->arch.apic_base; 150 } 151 EXPORT_SYMBOL_GPL(kvm_get_apic_base); 152 153 void kvm_set_apic_base(struct kvm_vcpu *vcpu, u64 data) 154 { 155 /* TODO: reserve bits check */ 156 if (irqchip_in_kernel(vcpu->kvm)) 157 kvm_lapic_set_base(vcpu, data); 158 else 159 vcpu->arch.apic_base = data; 160 } 161 EXPORT_SYMBOL_GPL(kvm_set_apic_base); 162 163 void kvm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr) 164 { 165 WARN_ON(vcpu->arch.exception.pending); 166 vcpu->arch.exception.pending = true; 167 vcpu->arch.exception.has_error_code = false; 168 vcpu->arch.exception.nr = nr; 169 } 170 EXPORT_SYMBOL_GPL(kvm_queue_exception); 171 172 void kvm_inject_page_fault(struct kvm_vcpu *vcpu, unsigned long addr, 173 u32 error_code) 174 { 175 ++vcpu->stat.pf_guest; 176 if (vcpu->arch.exception.pending) { 177 if (vcpu->arch.exception.nr == PF_VECTOR) { 178 printk(KERN_DEBUG "kvm: inject_page_fault:" 179 " double fault 0x%lx\n", addr); 180 vcpu->arch.exception.nr = DF_VECTOR; 181 vcpu->arch.exception.error_code = 0; 182 } else if (vcpu->arch.exception.nr == DF_VECTOR) { 183 /* triple fault -> shutdown */ 184 set_bit(KVM_REQ_TRIPLE_FAULT, &vcpu->requests); 185 } 186 return; 187 } 188 vcpu->arch.cr2 = addr; 189 kvm_queue_exception_e(vcpu, PF_VECTOR, error_code); 190 } 191 192 void kvm_inject_nmi(struct kvm_vcpu *vcpu) 193 { 194 vcpu->arch.nmi_pending = 1; 195 } 196 EXPORT_SYMBOL_GPL(kvm_inject_nmi); 197 198 void kvm_queue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code) 199 { 200 WARN_ON(vcpu->arch.exception.pending); 201 vcpu->arch.exception.pending = true; 202 vcpu->arch.exception.has_error_code = true; 203 vcpu->arch.exception.nr = nr; 204 vcpu->arch.exception.error_code = error_code; 205 } 206 EXPORT_SYMBOL_GPL(kvm_queue_exception_e); 207 208 static void __queue_exception(struct kvm_vcpu *vcpu) 209 { 210 kvm_x86_ops->queue_exception(vcpu, vcpu->arch.exception.nr, 211 vcpu->arch.exception.has_error_code, 212 vcpu->arch.exception.error_code); 213 } 214 215 /* 216 * Load the pae pdptrs. Return true is they are all valid. 
217 */ 218 int load_pdptrs(struct kvm_vcpu *vcpu, unsigned long cr3) 219 { 220 gfn_t pdpt_gfn = cr3 >> PAGE_SHIFT; 221 unsigned offset = ((cr3 & (PAGE_SIZE-1)) >> 5) << 2; 222 int i; 223 int ret; 224 u64 pdpte[ARRAY_SIZE(vcpu->arch.pdptrs)]; 225 226 ret = kvm_read_guest_page(vcpu->kvm, pdpt_gfn, pdpte, 227 offset * sizeof(u64), sizeof(pdpte)); 228 if (ret < 0) { 229 ret = 0; 230 goto out; 231 } 232 for (i = 0; i < ARRAY_SIZE(pdpte); ++i) { 233 if ((pdpte[i] & 1) && (pdpte[i] & 0xfffffff0000001e6ull)) { 234 ret = 0; 235 goto out; 236 } 237 } 238 ret = 1; 239 240 memcpy(vcpu->arch.pdptrs, pdpte, sizeof(vcpu->arch.pdptrs)); 241 out: 242 243 return ret; 244 } 245 EXPORT_SYMBOL_GPL(load_pdptrs); 246 247 static bool pdptrs_changed(struct kvm_vcpu *vcpu) 248 { 249 u64 pdpte[ARRAY_SIZE(vcpu->arch.pdptrs)]; 250 bool changed = true; 251 int r; 252 253 if (is_long_mode(vcpu) || !is_pae(vcpu)) 254 return false; 255 256 r = kvm_read_guest(vcpu->kvm, vcpu->arch.cr3 & ~31u, pdpte, sizeof(pdpte)); 257 if (r < 0) 258 goto out; 259 changed = memcmp(pdpte, vcpu->arch.pdptrs, sizeof(pdpte)) != 0; 260 out: 261 262 return changed; 263 } 264 265 void kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) 266 { 267 if (cr0 & CR0_RESERVED_BITS) { 268 printk(KERN_DEBUG "set_cr0: 0x%lx #GP, reserved bits 0x%lx\n", 269 cr0, vcpu->arch.cr0); 270 kvm_inject_gp(vcpu, 0); 271 return; 272 } 273 274 if ((cr0 & X86_CR0_NW) && !(cr0 & X86_CR0_CD)) { 275 printk(KERN_DEBUG "set_cr0: #GP, CD == 0 && NW == 1\n"); 276 kvm_inject_gp(vcpu, 0); 277 return; 278 } 279 280 if ((cr0 & X86_CR0_PG) && !(cr0 & X86_CR0_PE)) { 281 printk(KERN_DEBUG "set_cr0: #GP, set PG flag " 282 "and a clear PE flag\n"); 283 kvm_inject_gp(vcpu, 0); 284 return; 285 } 286 287 if (!is_paging(vcpu) && (cr0 & X86_CR0_PG)) { 288 #ifdef CONFIG_X86_64 289 if ((vcpu->arch.shadow_efer & EFER_LME)) { 290 int cs_db, cs_l; 291 292 if (!is_pae(vcpu)) { 293 printk(KERN_DEBUG "set_cr0: #GP, start paging " 294 "in long mode while PAE is disabled\n"); 295 kvm_inject_gp(vcpu, 0); 296 return; 297 } 298 kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l); 299 if (cs_l) { 300 printk(KERN_DEBUG "set_cr0: #GP, start paging " 301 "in long mode while CS.L == 1\n"); 302 kvm_inject_gp(vcpu, 0); 303 return; 304 305 } 306 } else 307 #endif 308 if (is_pae(vcpu) && !load_pdptrs(vcpu, vcpu->arch.cr3)) { 309 printk(KERN_DEBUG "set_cr0: #GP, pdptrs " 310 "reserved bits\n"); 311 kvm_inject_gp(vcpu, 0); 312 return; 313 } 314 315 } 316 317 kvm_x86_ops->set_cr0(vcpu, cr0); 318 vcpu->arch.cr0 = cr0; 319 320 kvm_mmu_sync_global(vcpu); 321 kvm_mmu_reset_context(vcpu); 322 return; 323 } 324 EXPORT_SYMBOL_GPL(kvm_set_cr0); 325 326 void kvm_lmsw(struct kvm_vcpu *vcpu, unsigned long msw) 327 { 328 kvm_set_cr0(vcpu, (vcpu->arch.cr0 & ~0x0ful) | (msw & 0x0f)); 329 KVMTRACE_1D(LMSW, vcpu, 330 (u32)((vcpu->arch.cr0 & ~0x0ful) | (msw & 0x0f)), 331 handler); 332 } 333 EXPORT_SYMBOL_GPL(kvm_lmsw); 334 335 void kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) 336 { 337 if (cr4 & CR4_RESERVED_BITS) { 338 printk(KERN_DEBUG "set_cr4: #GP, reserved bits\n"); 339 kvm_inject_gp(vcpu, 0); 340 return; 341 } 342 343 if (is_long_mode(vcpu)) { 344 if (!(cr4 & X86_CR4_PAE)) { 345 printk(KERN_DEBUG "set_cr4: #GP, clearing PAE while " 346 "in long mode\n"); 347 kvm_inject_gp(vcpu, 0); 348 return; 349 } 350 } else if (is_paging(vcpu) && !is_pae(vcpu) && (cr4 & X86_CR4_PAE) 351 && !load_pdptrs(vcpu, vcpu->arch.cr3)) { 352 printk(KERN_DEBUG "set_cr4: #GP, pdptrs reserved bits\n"); 353 kvm_inject_gp(vcpu, 0); 354 return; 
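	/*
	 * For illustration (not part of the original file): the checks in
	 * kvm_set_cr0(), kvm_set_cr4() and set_efer() jointly enforce the
	 * usual long-mode enable order.  A 64-bit guest is expected to do
	 * roughly (Linux-style helpers, error handling omitted):
	 *
	 *	write_cr4(read_cr4() | X86_CR4_PAE);	 set PAE first
	 *	wrmsrl(MSR_EFER, efer | EFER_LME);	 then EFER.LME
	 *	write_cr0(read_cr0() | X86_CR0_PG);	 PG last; hw sets LMA
	 *
	 * The #GP paths in these handlers reject the problematic variants:
	 * enabling PG with LME set but PAE clear, toggling LME while paging
	 * is on, or clearing PAE while in long mode.
	 */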
355 } 356 357 if (cr4 & X86_CR4_VMXE) { 358 printk(KERN_DEBUG "set_cr4: #GP, setting VMXE\n"); 359 kvm_inject_gp(vcpu, 0); 360 return; 361 } 362 kvm_x86_ops->set_cr4(vcpu, cr4); 363 vcpu->arch.cr4 = cr4; 364 kvm_mmu_sync_global(vcpu); 365 kvm_mmu_reset_context(vcpu); 366 } 367 EXPORT_SYMBOL_GPL(kvm_set_cr4); 368 369 void kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) 370 { 371 if (cr3 == vcpu->arch.cr3 && !pdptrs_changed(vcpu)) { 372 kvm_mmu_sync_roots(vcpu); 373 kvm_mmu_flush_tlb(vcpu); 374 return; 375 } 376 377 if (is_long_mode(vcpu)) { 378 if (cr3 & CR3_L_MODE_RESERVED_BITS) { 379 printk(KERN_DEBUG "set_cr3: #GP, reserved bits\n"); 380 kvm_inject_gp(vcpu, 0); 381 return; 382 } 383 } else { 384 if (is_pae(vcpu)) { 385 if (cr3 & CR3_PAE_RESERVED_BITS) { 386 printk(KERN_DEBUG 387 "set_cr3: #GP, reserved bits\n"); 388 kvm_inject_gp(vcpu, 0); 389 return; 390 } 391 if (is_paging(vcpu) && !load_pdptrs(vcpu, cr3)) { 392 printk(KERN_DEBUG "set_cr3: #GP, pdptrs " 393 "reserved bits\n"); 394 kvm_inject_gp(vcpu, 0); 395 return; 396 } 397 } 398 /* 399 * We don't check reserved bits in nonpae mode, because 400 * this isn't enforced, and VMware depends on this. 401 */ 402 } 403 404 /* 405 * Does the new cr3 value map to physical memory? (Note, we 406 * catch an invalid cr3 even in real-mode, because it would 407 * cause trouble later on when we turn on paging anyway.) 408 * 409 * A real CPU would silently accept an invalid cr3 and would 410 * attempt to use it - with largely undefined (and often hard 411 * to debug) behavior on the guest side. 412 */ 413 if (unlikely(!gfn_to_memslot(vcpu->kvm, cr3 >> PAGE_SHIFT))) 414 kvm_inject_gp(vcpu, 0); 415 else { 416 vcpu->arch.cr3 = cr3; 417 vcpu->arch.mmu.new_cr3(vcpu); 418 } 419 } 420 EXPORT_SYMBOL_GPL(kvm_set_cr3); 421 422 void kvm_set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8) 423 { 424 if (cr8 & CR8_RESERVED_BITS) { 425 printk(KERN_DEBUG "set_cr8: #GP, reserved bits 0x%lx\n", cr8); 426 kvm_inject_gp(vcpu, 0); 427 return; 428 } 429 if (irqchip_in_kernel(vcpu->kvm)) 430 kvm_lapic_set_tpr(vcpu, cr8); 431 else 432 vcpu->arch.cr8 = cr8; 433 } 434 EXPORT_SYMBOL_GPL(kvm_set_cr8); 435 436 unsigned long kvm_get_cr8(struct kvm_vcpu *vcpu) 437 { 438 if (irqchip_in_kernel(vcpu->kvm)) 439 return kvm_lapic_get_cr8(vcpu); 440 else 441 return vcpu->arch.cr8; 442 } 443 EXPORT_SYMBOL_GPL(kvm_get_cr8); 444 445 /* 446 * List of msr numbers which we expose to userspace through KVM_GET_MSRS 447 * and KVM_SET_MSRS, and KVM_GET_MSR_INDEX_LIST. 448 * 449 * This list is modified at module load time to reflect the 450 * capabilities of the host cpu. 
451 */ 452 static u32 msrs_to_save[] = { 453 MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP, 454 MSR_K6_STAR, 455 #ifdef CONFIG_X86_64 456 MSR_CSTAR, MSR_KERNEL_GS_BASE, MSR_SYSCALL_MASK, MSR_LSTAR, 457 #endif 458 MSR_IA32_TIME_STAMP_COUNTER, MSR_KVM_SYSTEM_TIME, MSR_KVM_WALL_CLOCK, 459 MSR_IA32_PERF_STATUS, MSR_IA32_CR_PAT 460 }; 461 462 static unsigned num_msrs_to_save; 463 464 static u32 emulated_msrs[] = { 465 MSR_IA32_MISC_ENABLE, 466 }; 467 468 static void set_efer(struct kvm_vcpu *vcpu, u64 efer) 469 { 470 if (efer & efer_reserved_bits) { 471 printk(KERN_DEBUG "set_efer: 0x%llx #GP, reserved bits\n", 472 efer); 473 kvm_inject_gp(vcpu, 0); 474 return; 475 } 476 477 if (is_paging(vcpu) 478 && (vcpu->arch.shadow_efer & EFER_LME) != (efer & EFER_LME)) { 479 printk(KERN_DEBUG "set_efer: #GP, change LME while paging\n"); 480 kvm_inject_gp(vcpu, 0); 481 return; 482 } 483 484 kvm_x86_ops->set_efer(vcpu, efer); 485 486 efer &= ~EFER_LMA; 487 efer |= vcpu->arch.shadow_efer & EFER_LMA; 488 489 vcpu->arch.shadow_efer = efer; 490 } 491 492 void kvm_enable_efer_bits(u64 mask) 493 { 494 efer_reserved_bits &= ~mask; 495 } 496 EXPORT_SYMBOL_GPL(kvm_enable_efer_bits); 497 498 499 /* 500 * Writes msr value into into the appropriate "register". 501 * Returns 0 on success, non-0 otherwise. 502 * Assumes vcpu_load() was already called. 503 */ 504 int kvm_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data) 505 { 506 return kvm_x86_ops->set_msr(vcpu, msr_index, data); 507 } 508 509 /* 510 * Adapt set_msr() to msr_io()'s calling convention 511 */ 512 static int do_set_msr(struct kvm_vcpu *vcpu, unsigned index, u64 *data) 513 { 514 return kvm_set_msr(vcpu, index, *data); 515 } 516 517 static void kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock) 518 { 519 static int version; 520 struct pvclock_wall_clock wc; 521 struct timespec now, sys, boot; 522 523 if (!wall_clock) 524 return; 525 526 version++; 527 528 kvm_write_guest(kvm, wall_clock, &version, sizeof(version)); 529 530 /* 531 * The guest calculates current wall clock time by adding 532 * system time (updated by kvm_write_guest_time below) to the 533 * wall clock specified here. guest system time equals host 534 * system time for us, thus we must fill in host boot time here. 
535 */ 536 now = current_kernel_time(); 537 ktime_get_ts(&sys); 538 boot = ns_to_timespec(timespec_to_ns(&now) - timespec_to_ns(&sys)); 539 540 wc.sec = boot.tv_sec; 541 wc.nsec = boot.tv_nsec; 542 wc.version = version; 543 544 kvm_write_guest(kvm, wall_clock, &wc, sizeof(wc)); 545 546 version++; 547 kvm_write_guest(kvm, wall_clock, &version, sizeof(version)); 548 } 549 550 static uint32_t div_frac(uint32_t dividend, uint32_t divisor) 551 { 552 uint32_t quotient, remainder; 553 554 /* Don't try to replace with do_div(), this one calculates 555 * "(dividend << 32) / divisor" */ 556 __asm__ ( "divl %4" 557 : "=a" (quotient), "=d" (remainder) 558 : "0" (0), "1" (dividend), "r" (divisor) ); 559 return quotient; 560 } 561 562 static void kvm_set_time_scale(uint32_t tsc_khz, struct pvclock_vcpu_time_info *hv_clock) 563 { 564 uint64_t nsecs = 1000000000LL; 565 int32_t shift = 0; 566 uint64_t tps64; 567 uint32_t tps32; 568 569 tps64 = tsc_khz * 1000LL; 570 while (tps64 > nsecs*2) { 571 tps64 >>= 1; 572 shift--; 573 } 574 575 tps32 = (uint32_t)tps64; 576 while (tps32 <= (uint32_t)nsecs) { 577 tps32 <<= 1; 578 shift++; 579 } 580 581 hv_clock->tsc_shift = shift; 582 hv_clock->tsc_to_system_mul = div_frac(nsecs, tps32); 583 584 pr_debug("%s: tsc_khz %u, tsc_shift %d, tsc_mul %u\n", 585 __func__, tsc_khz, hv_clock->tsc_shift, 586 hv_clock->tsc_to_system_mul); 587 } 588 589 static void kvm_write_guest_time(struct kvm_vcpu *v) 590 { 591 struct timespec ts; 592 unsigned long flags; 593 struct kvm_vcpu_arch *vcpu = &v->arch; 594 void *shared_kaddr; 595 596 if ((!vcpu->time_page)) 597 return; 598 599 if (unlikely(vcpu->hv_clock_tsc_khz != tsc_khz)) { 600 kvm_set_time_scale(tsc_khz, &vcpu->hv_clock); 601 vcpu->hv_clock_tsc_khz = tsc_khz; 602 } 603 604 /* Keep irq disabled to prevent changes to the clock */ 605 local_irq_save(flags); 606 kvm_get_msr(v, MSR_IA32_TIME_STAMP_COUNTER, 607 &vcpu->hv_clock.tsc_timestamp); 608 ktime_get_ts(&ts); 609 local_irq_restore(flags); 610 611 /* With all the info we got, fill in the values */ 612 613 vcpu->hv_clock.system_time = ts.tv_nsec + 614 (NSEC_PER_SEC * (u64)ts.tv_sec); 615 /* 616 * The interface expects us to write an even number signaling that the 617 * update is finished. Since the guest won't see the intermediate 618 * state, we just increase by 2 at the end. 619 */ 620 vcpu->hv_clock.version += 2; 621 622 shared_kaddr = kmap_atomic(vcpu->time_page, KM_USER0); 623 624 memcpy(shared_kaddr + vcpu->time_offset, &vcpu->hv_clock, 625 sizeof(vcpu->hv_clock)); 626 627 kunmap_atomic(shared_kaddr, KM_USER0); 628 629 mark_page_dirty(v->kvm, vcpu->time >> PAGE_SHIFT); 630 } 631 632 static bool msr_mtrr_valid(unsigned msr) 633 { 634 switch (msr) { 635 case 0x200 ... 
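	/*
	 * For illustration (not part of the original file): kvm_set_time_scale()
	 * above chooses tsc_shift and tsc_to_system_mul so that a guest can
	 * convert a TSC delta into nanoseconds with the usual pvclock recipe
	 * (field names as in struct pvclock_vcpu_time_info, wide multiply
	 * assumed):
	 *
	 *	delta = rdtsc() - tsc_timestamp;
	 *	if (tsc_shift >= 0)
	 *		delta <<= tsc_shift;
	 *	else
	 *		delta >>= -tsc_shift;
	 *	ns  = (delta * tsc_to_system_mul) >> 32;
	 *	now = system_time + ns;
	 *
	 * div_frac() computes tsc_to_system_mul as (NSEC_PER_SEC << 32) / tps32,
	 * with tps32 pre-scaled into (1e9, 2e9] so the multiplier fits in 32 bits.
	 */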
0x200 + 2 * KVM_NR_VAR_MTRR - 1: 636 case MSR_MTRRfix64K_00000: 637 case MSR_MTRRfix16K_80000: 638 case MSR_MTRRfix16K_A0000: 639 case MSR_MTRRfix4K_C0000: 640 case MSR_MTRRfix4K_C8000: 641 case MSR_MTRRfix4K_D0000: 642 case MSR_MTRRfix4K_D8000: 643 case MSR_MTRRfix4K_E0000: 644 case MSR_MTRRfix4K_E8000: 645 case MSR_MTRRfix4K_F0000: 646 case MSR_MTRRfix4K_F8000: 647 case MSR_MTRRdefType: 648 case MSR_IA32_CR_PAT: 649 return true; 650 case 0x2f8: 651 return true; 652 } 653 return false; 654 } 655 656 static int set_msr_mtrr(struct kvm_vcpu *vcpu, u32 msr, u64 data) 657 { 658 u64 *p = (u64 *)&vcpu->arch.mtrr_state.fixed_ranges; 659 660 if (!msr_mtrr_valid(msr)) 661 return 1; 662 663 if (msr == MSR_MTRRdefType) { 664 vcpu->arch.mtrr_state.def_type = data; 665 vcpu->arch.mtrr_state.enabled = (data & 0xc00) >> 10; 666 } else if (msr == MSR_MTRRfix64K_00000) 667 p[0] = data; 668 else if (msr == MSR_MTRRfix16K_80000 || msr == MSR_MTRRfix16K_A0000) 669 p[1 + msr - MSR_MTRRfix16K_80000] = data; 670 else if (msr >= MSR_MTRRfix4K_C0000 && msr <= MSR_MTRRfix4K_F8000) 671 p[3 + msr - MSR_MTRRfix4K_C0000] = data; 672 else if (msr == MSR_IA32_CR_PAT) 673 vcpu->arch.pat = data; 674 else { /* Variable MTRRs */ 675 int idx, is_mtrr_mask; 676 u64 *pt; 677 678 idx = (msr - 0x200) / 2; 679 is_mtrr_mask = msr - 0x200 - 2 * idx; 680 if (!is_mtrr_mask) 681 pt = 682 (u64 *)&vcpu->arch.mtrr_state.var_ranges[idx].base_lo; 683 else 684 pt = 685 (u64 *)&vcpu->arch.mtrr_state.var_ranges[idx].mask_lo; 686 *pt = data; 687 } 688 689 kvm_mmu_reset_context(vcpu); 690 return 0; 691 } 692 693 int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data) 694 { 695 switch (msr) { 696 case MSR_EFER: 697 set_efer(vcpu, data); 698 break; 699 case MSR_IA32_MC0_STATUS: 700 pr_unimpl(vcpu, "%s: MSR_IA32_MC0_STATUS 0x%llx, nop\n", 701 __func__, data); 702 break; 703 case MSR_IA32_MCG_STATUS: 704 pr_unimpl(vcpu, "%s: MSR_IA32_MCG_STATUS 0x%llx, nop\n", 705 __func__, data); 706 break; 707 case MSR_IA32_MCG_CTL: 708 pr_unimpl(vcpu, "%s: MSR_IA32_MCG_CTL 0x%llx, nop\n", 709 __func__, data); 710 break; 711 case MSR_IA32_DEBUGCTLMSR: 712 if (!data) { 713 /* We support the non-activated case already */ 714 break; 715 } else if (data & ~(DEBUGCTLMSR_LBR | DEBUGCTLMSR_BTF)) { 716 /* Values other than LBR and BTF are vendor-specific, 717 thus reserved and should throw a #GP */ 718 return 1; 719 } 720 pr_unimpl(vcpu, "%s: MSR_IA32_DEBUGCTLMSR 0x%llx, nop\n", 721 __func__, data); 722 break; 723 case MSR_IA32_UCODE_REV: 724 case MSR_IA32_UCODE_WRITE: 725 break; 726 case 0x200 ... 0x2ff: 727 return set_msr_mtrr(vcpu, msr, data); 728 case MSR_IA32_APICBASE: 729 kvm_set_apic_base(vcpu, data); 730 break; 731 case MSR_IA32_MISC_ENABLE: 732 vcpu->arch.ia32_misc_enable_msr = data; 733 break; 734 case MSR_KVM_WALL_CLOCK: 735 vcpu->kvm->arch.wall_clock = data; 736 kvm_write_wall_clock(vcpu->kvm, data); 737 break; 738 case MSR_KVM_SYSTEM_TIME: { 739 if (vcpu->arch.time_page) { 740 kvm_release_page_dirty(vcpu->arch.time_page); 741 vcpu->arch.time_page = NULL; 742 } 743 744 vcpu->arch.time = data; 745 746 /* we verify if the enable bit is set... 
*/ 747 if (!(data & 1)) 748 break; 749 750 /* ...but clean it before doing the actual write */ 751 vcpu->arch.time_offset = data & ~(PAGE_MASK | 1); 752 753 vcpu->arch.time_page = 754 gfn_to_page(vcpu->kvm, data >> PAGE_SHIFT); 755 756 if (is_error_page(vcpu->arch.time_page)) { 757 kvm_release_page_clean(vcpu->arch.time_page); 758 vcpu->arch.time_page = NULL; 759 } 760 761 kvm_write_guest_time(vcpu); 762 break; 763 } 764 default: 765 pr_unimpl(vcpu, "unhandled wrmsr: 0x%x data %llx\n", msr, data); 766 return 1; 767 } 768 return 0; 769 } 770 EXPORT_SYMBOL_GPL(kvm_set_msr_common); 771 772 773 /* 774 * Reads an msr value (of 'msr_index') into 'pdata'. 775 * Returns 0 on success, non-0 otherwise. 776 * Assumes vcpu_load() was already called. 777 */ 778 int kvm_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata) 779 { 780 return kvm_x86_ops->get_msr(vcpu, msr_index, pdata); 781 } 782 783 static int get_msr_mtrr(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) 784 { 785 u64 *p = (u64 *)&vcpu->arch.mtrr_state.fixed_ranges; 786 787 if (!msr_mtrr_valid(msr)) 788 return 1; 789 790 if (msr == MSR_MTRRdefType) 791 *pdata = vcpu->arch.mtrr_state.def_type + 792 (vcpu->arch.mtrr_state.enabled << 10); 793 else if (msr == MSR_MTRRfix64K_00000) 794 *pdata = p[0]; 795 else if (msr == MSR_MTRRfix16K_80000 || msr == MSR_MTRRfix16K_A0000) 796 *pdata = p[1 + msr - MSR_MTRRfix16K_80000]; 797 else if (msr >= MSR_MTRRfix4K_C0000 && msr <= MSR_MTRRfix4K_F8000) 798 *pdata = p[3 + msr - MSR_MTRRfix4K_C0000]; 799 else if (msr == MSR_IA32_CR_PAT) 800 *pdata = vcpu->arch.pat; 801 else { /* Variable MTRRs */ 802 int idx, is_mtrr_mask; 803 u64 *pt; 804 805 idx = (msr - 0x200) / 2; 806 is_mtrr_mask = msr - 0x200 - 2 * idx; 807 if (!is_mtrr_mask) 808 pt = 809 (u64 *)&vcpu->arch.mtrr_state.var_ranges[idx].base_lo; 810 else 811 pt = 812 (u64 *)&vcpu->arch.mtrr_state.var_ranges[idx].mask_lo; 813 *pdata = *pt; 814 } 815 816 return 0; 817 } 818 819 int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) 820 { 821 u64 data; 822 823 switch (msr) { 824 case 0xc0010010: /* SYSCFG */ 825 case 0xc0010015: /* HWCR */ 826 case MSR_IA32_PLATFORM_ID: 827 case MSR_IA32_P5_MC_ADDR: 828 case MSR_IA32_P5_MC_TYPE: 829 case MSR_IA32_MC0_CTL: 830 case MSR_IA32_MCG_STATUS: 831 case MSR_IA32_MCG_CAP: 832 case MSR_IA32_MCG_CTL: 833 case MSR_IA32_MC0_MISC: 834 case MSR_IA32_MC0_MISC+4: 835 case MSR_IA32_MC0_MISC+8: 836 case MSR_IA32_MC0_MISC+12: 837 case MSR_IA32_MC0_MISC+16: 838 case MSR_IA32_MC0_MISC+20: 839 case MSR_IA32_UCODE_REV: 840 case MSR_IA32_EBL_CR_POWERON: 841 case MSR_IA32_DEBUGCTLMSR: 842 case MSR_IA32_LASTBRANCHFROMIP: 843 case MSR_IA32_LASTBRANCHTOIP: 844 case MSR_IA32_LASTINTFROMIP: 845 case MSR_IA32_LASTINTTOIP: 846 data = 0; 847 break; 848 case MSR_MTRRcap: 849 data = 0x500 | KVM_NR_VAR_MTRR; 850 break; 851 case 0x200 ... 
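	/*
	 * For illustration (not part of the original file), the variable-MTRR
	 * indexing used by set_msr_mtrr() and get_msr_mtrr(): MSR 0x200 + 2n
	 * is MTRRphysBase(n) and 0x200 + 2n + 1 is MTRRphysMask(n), e.g.
	 *
	 *	msr 0x203: idx = (0x203 - 0x200) / 2 = 1, is_mtrr_mask = 1
	 *	           -> var_ranges[1].mask_lo
	 *	msr 0x206: idx = 3, is_mtrr_mask = 0
	 *	           -> var_ranges[3].base_lo
	 */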
0x2ff: 852 return get_msr_mtrr(vcpu, msr, pdata); 853 case 0xcd: /* fsb frequency */ 854 data = 3; 855 break; 856 case MSR_IA32_APICBASE: 857 data = kvm_get_apic_base(vcpu); 858 break; 859 case MSR_IA32_MISC_ENABLE: 860 data = vcpu->arch.ia32_misc_enable_msr; 861 break; 862 case MSR_IA32_PERF_STATUS: 863 /* TSC increment by tick */ 864 data = 1000ULL; 865 /* CPU multiplier */ 866 data |= (((uint64_t)4ULL) << 40); 867 break; 868 case MSR_EFER: 869 data = vcpu->arch.shadow_efer; 870 break; 871 case MSR_KVM_WALL_CLOCK: 872 data = vcpu->kvm->arch.wall_clock; 873 break; 874 case MSR_KVM_SYSTEM_TIME: 875 data = vcpu->arch.time; 876 break; 877 default: 878 pr_unimpl(vcpu, "unhandled rdmsr: 0x%x\n", msr); 879 return 1; 880 } 881 *pdata = data; 882 return 0; 883 } 884 EXPORT_SYMBOL_GPL(kvm_get_msr_common); 885 886 /* 887 * Read or write a bunch of msrs. All parameters are kernel addresses. 888 * 889 * @return number of msrs set successfully. 890 */ 891 static int __msr_io(struct kvm_vcpu *vcpu, struct kvm_msrs *msrs, 892 struct kvm_msr_entry *entries, 893 int (*do_msr)(struct kvm_vcpu *vcpu, 894 unsigned index, u64 *data)) 895 { 896 int i; 897 898 vcpu_load(vcpu); 899 900 down_read(&vcpu->kvm->slots_lock); 901 for (i = 0; i < msrs->nmsrs; ++i) 902 if (do_msr(vcpu, entries[i].index, &entries[i].data)) 903 break; 904 up_read(&vcpu->kvm->slots_lock); 905 906 vcpu_put(vcpu); 907 908 return i; 909 } 910 911 /* 912 * Read or write a bunch of msrs. Parameters are user addresses. 913 * 914 * @return number of msrs set successfully. 915 */ 916 static int msr_io(struct kvm_vcpu *vcpu, struct kvm_msrs __user *user_msrs, 917 int (*do_msr)(struct kvm_vcpu *vcpu, 918 unsigned index, u64 *data), 919 int writeback) 920 { 921 struct kvm_msrs msrs; 922 struct kvm_msr_entry *entries; 923 int r, n; 924 unsigned size; 925 926 r = -EFAULT; 927 if (copy_from_user(&msrs, user_msrs, sizeof msrs)) 928 goto out; 929 930 r = -E2BIG; 931 if (msrs.nmsrs >= MAX_IO_MSRS) 932 goto out; 933 934 r = -ENOMEM; 935 size = sizeof(struct kvm_msr_entry) * msrs.nmsrs; 936 entries = vmalloc(size); 937 if (!entries) 938 goto out; 939 940 r = -EFAULT; 941 if (copy_from_user(entries, user_msrs->entries, size)) 942 goto out_free; 943 944 r = n = __msr_io(vcpu, &msrs, entries, do_msr); 945 if (r < 0) 946 goto out_free; 947 948 r = -EFAULT; 949 if (writeback && copy_to_user(user_msrs->entries, entries, size)) 950 goto out_free; 951 952 r = n; 953 954 out_free: 955 vfree(entries); 956 out: 957 return r; 958 } 959 960 int kvm_dev_ioctl_check_extension(long ext) 961 { 962 int r; 963 964 switch (ext) { 965 case KVM_CAP_IRQCHIP: 966 case KVM_CAP_HLT: 967 case KVM_CAP_MMU_SHADOW_CACHE_CONTROL: 968 case KVM_CAP_SET_TSS_ADDR: 969 case KVM_CAP_EXT_CPUID: 970 case KVM_CAP_PIT: 971 case KVM_CAP_NOP_IO_DELAY: 972 case KVM_CAP_MP_STATE: 973 case KVM_CAP_SYNC_MMU: 974 r = 1; 975 break; 976 case KVM_CAP_COALESCED_MMIO: 977 r = KVM_COALESCED_MMIO_PAGE_OFFSET; 978 break; 979 case KVM_CAP_VAPIC: 980 r = !kvm_x86_ops->cpu_has_accelerated_tpr(); 981 break; 982 case KVM_CAP_NR_VCPUS: 983 r = KVM_MAX_VCPUS; 984 break; 985 case KVM_CAP_NR_MEMSLOTS: 986 r = KVM_MEMORY_SLOTS; 987 break; 988 case KVM_CAP_PV_MMU: 989 r = !tdp_enabled; 990 break; 991 case KVM_CAP_IOMMU: 992 r = iommu_found(); 993 break; 994 case KVM_CAP_CLOCKSOURCE: 995 r = boot_cpu_has(X86_FEATURE_CONSTANT_TSC); 996 break; 997 default: 998 r = 0; 999 break; 1000 } 1001 return r; 1002 1003 } 1004 1005 long kvm_arch_dev_ioctl(struct file *filp, 1006 unsigned int ioctl, unsigned long arg) 1007 { 1008 void 
__user *argp = (void __user *)arg; 1009 long r; 1010 1011 switch (ioctl) { 1012 case KVM_GET_MSR_INDEX_LIST: { 1013 struct kvm_msr_list __user *user_msr_list = argp; 1014 struct kvm_msr_list msr_list; 1015 unsigned n; 1016 1017 r = -EFAULT; 1018 if (copy_from_user(&msr_list, user_msr_list, sizeof msr_list)) 1019 goto out; 1020 n = msr_list.nmsrs; 1021 msr_list.nmsrs = num_msrs_to_save + ARRAY_SIZE(emulated_msrs); 1022 if (copy_to_user(user_msr_list, &msr_list, sizeof msr_list)) 1023 goto out; 1024 r = -E2BIG; 1025 if (n < num_msrs_to_save) 1026 goto out; 1027 r = -EFAULT; 1028 if (copy_to_user(user_msr_list->indices, &msrs_to_save, 1029 num_msrs_to_save * sizeof(u32))) 1030 goto out; 1031 if (copy_to_user(user_msr_list->indices 1032 + num_msrs_to_save * sizeof(u32), 1033 &emulated_msrs, 1034 ARRAY_SIZE(emulated_msrs) * sizeof(u32))) 1035 goto out; 1036 r = 0; 1037 break; 1038 } 1039 case KVM_GET_SUPPORTED_CPUID: { 1040 struct kvm_cpuid2 __user *cpuid_arg = argp; 1041 struct kvm_cpuid2 cpuid; 1042 1043 r = -EFAULT; 1044 if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid)) 1045 goto out; 1046 r = kvm_dev_ioctl_get_supported_cpuid(&cpuid, 1047 cpuid_arg->entries); 1048 if (r) 1049 goto out; 1050 1051 r = -EFAULT; 1052 if (copy_to_user(cpuid_arg, &cpuid, sizeof cpuid)) 1053 goto out; 1054 r = 0; 1055 break; 1056 } 1057 default: 1058 r = -EINVAL; 1059 } 1060 out: 1061 return r; 1062 } 1063 1064 void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) 1065 { 1066 kvm_x86_ops->vcpu_load(vcpu, cpu); 1067 kvm_write_guest_time(vcpu); 1068 } 1069 1070 void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu) 1071 { 1072 kvm_x86_ops->vcpu_put(vcpu); 1073 kvm_put_guest_fpu(vcpu); 1074 } 1075 1076 static int is_efer_nx(void) 1077 { 1078 u64 efer; 1079 1080 rdmsrl(MSR_EFER, efer); 1081 return efer & EFER_NX; 1082 } 1083 1084 static void cpuid_fix_nx_cap(struct kvm_vcpu *vcpu) 1085 { 1086 int i; 1087 struct kvm_cpuid_entry2 *e, *entry; 1088 1089 entry = NULL; 1090 for (i = 0; i < vcpu->arch.cpuid_nent; ++i) { 1091 e = &vcpu->arch.cpuid_entries[i]; 1092 if (e->function == 0x80000001) { 1093 entry = e; 1094 break; 1095 } 1096 } 1097 if (entry && (entry->edx & (1 << 20)) && !is_efer_nx()) { 1098 entry->edx &= ~(1 << 20); 1099 printk(KERN_INFO "kvm: guest NX capability removed\n"); 1100 } 1101 } 1102 1103 /* when an old userspace process fills a new kernel module */ 1104 static int kvm_vcpu_ioctl_set_cpuid(struct kvm_vcpu *vcpu, 1105 struct kvm_cpuid *cpuid, 1106 struct kvm_cpuid_entry __user *entries) 1107 { 1108 int r, i; 1109 struct kvm_cpuid_entry *cpuid_entries; 1110 1111 r = -E2BIG; 1112 if (cpuid->nent > KVM_MAX_CPUID_ENTRIES) 1113 goto out; 1114 r = -ENOMEM; 1115 cpuid_entries = vmalloc(sizeof(struct kvm_cpuid_entry) * cpuid->nent); 1116 if (!cpuid_entries) 1117 goto out; 1118 r = -EFAULT; 1119 if (copy_from_user(cpuid_entries, entries, 1120 cpuid->nent * sizeof(struct kvm_cpuid_entry))) 1121 goto out_free; 1122 for (i = 0; i < cpuid->nent; i++) { 1123 vcpu->arch.cpuid_entries[i].function = cpuid_entries[i].function; 1124 vcpu->arch.cpuid_entries[i].eax = cpuid_entries[i].eax; 1125 vcpu->arch.cpuid_entries[i].ebx = cpuid_entries[i].ebx; 1126 vcpu->arch.cpuid_entries[i].ecx = cpuid_entries[i].ecx; 1127 vcpu->arch.cpuid_entries[i].edx = cpuid_entries[i].edx; 1128 vcpu->arch.cpuid_entries[i].index = 0; 1129 vcpu->arch.cpuid_entries[i].flags = 0; 1130 vcpu->arch.cpuid_entries[i].padding[0] = 0; 1131 vcpu->arch.cpuid_entries[i].padding[1] = 0; 1132 vcpu->arch.cpuid_entries[i].padding[2] = 0; 1133 } 1134 
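	/*
	 * For illustration (not part of the original file) -- a minimal sketch
	 * of how userspace usually drives the CPUID ioctls handled in this
	 * file: query KVM_GET_SUPPORTED_CPUID on the /dev/kvm fd, then hand
	 * the (possibly trimmed) result to the vcpu.  Error handling and the
	 * E2BIG retry are omitted; the entry count 64 is an arbitrary guess.
	 *
	 *	struct kvm_cpuid2 *c;
	 *
	 *	c = calloc(1, sizeof(*c) + 64 * sizeof(c->entries[0]));
	 *	c->nent = 64;
	 *	ioctl(kvm_fd, KVM_GET_SUPPORTED_CPUID, c);
	 *	ioctl(vcpu_fd, KVM_SET_CPUID2, c);
	 */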
vcpu->arch.cpuid_nent = cpuid->nent; 1135 cpuid_fix_nx_cap(vcpu); 1136 r = 0; 1137 1138 out_free: 1139 vfree(cpuid_entries); 1140 out: 1141 return r; 1142 } 1143 1144 static int kvm_vcpu_ioctl_set_cpuid2(struct kvm_vcpu *vcpu, 1145 struct kvm_cpuid2 *cpuid, 1146 struct kvm_cpuid_entry2 __user *entries) 1147 { 1148 int r; 1149 1150 r = -E2BIG; 1151 if (cpuid->nent > KVM_MAX_CPUID_ENTRIES) 1152 goto out; 1153 r = -EFAULT; 1154 if (copy_from_user(&vcpu->arch.cpuid_entries, entries, 1155 cpuid->nent * sizeof(struct kvm_cpuid_entry2))) 1156 goto out; 1157 vcpu->arch.cpuid_nent = cpuid->nent; 1158 return 0; 1159 1160 out: 1161 return r; 1162 } 1163 1164 static int kvm_vcpu_ioctl_get_cpuid2(struct kvm_vcpu *vcpu, 1165 struct kvm_cpuid2 *cpuid, 1166 struct kvm_cpuid_entry2 __user *entries) 1167 { 1168 int r; 1169 1170 r = -E2BIG; 1171 if (cpuid->nent < vcpu->arch.cpuid_nent) 1172 goto out; 1173 r = -EFAULT; 1174 if (copy_to_user(entries, &vcpu->arch.cpuid_entries, 1175 vcpu->arch.cpuid_nent * sizeof(struct kvm_cpuid_entry2))) 1176 goto out; 1177 return 0; 1178 1179 out: 1180 cpuid->nent = vcpu->arch.cpuid_nent; 1181 return r; 1182 } 1183 1184 static inline u32 bit(int bitno) 1185 { 1186 return 1 << (bitno & 31); 1187 } 1188 1189 static void do_cpuid_1_ent(struct kvm_cpuid_entry2 *entry, u32 function, 1190 u32 index) 1191 { 1192 entry->function = function; 1193 entry->index = index; 1194 cpuid_count(entry->function, entry->index, 1195 &entry->eax, &entry->ebx, &entry->ecx, &entry->edx); 1196 entry->flags = 0; 1197 } 1198 1199 static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, 1200 u32 index, int *nent, int maxnent) 1201 { 1202 const u32 kvm_supported_word0_x86_features = bit(X86_FEATURE_FPU) | 1203 bit(X86_FEATURE_VME) | bit(X86_FEATURE_DE) | 1204 bit(X86_FEATURE_PSE) | bit(X86_FEATURE_TSC) | 1205 bit(X86_FEATURE_MSR) | bit(X86_FEATURE_PAE) | 1206 bit(X86_FEATURE_CX8) | bit(X86_FEATURE_APIC) | 1207 bit(X86_FEATURE_SEP) | bit(X86_FEATURE_PGE) | 1208 bit(X86_FEATURE_CMOV) | bit(X86_FEATURE_PSE36) | 1209 bit(X86_FEATURE_CLFLSH) | bit(X86_FEATURE_MMX) | 1210 bit(X86_FEATURE_FXSR) | bit(X86_FEATURE_XMM) | 1211 bit(X86_FEATURE_XMM2) | bit(X86_FEATURE_SELFSNOOP); 1212 const u32 kvm_supported_word1_x86_features = bit(X86_FEATURE_FPU) | 1213 bit(X86_FEATURE_VME) | bit(X86_FEATURE_DE) | 1214 bit(X86_FEATURE_PSE) | bit(X86_FEATURE_TSC) | 1215 bit(X86_FEATURE_MSR) | bit(X86_FEATURE_PAE) | 1216 bit(X86_FEATURE_CX8) | bit(X86_FEATURE_APIC) | 1217 bit(X86_FEATURE_PGE) | 1218 bit(X86_FEATURE_CMOV) | bit(X86_FEATURE_PSE36) | 1219 bit(X86_FEATURE_MMX) | bit(X86_FEATURE_FXSR) | 1220 bit(X86_FEATURE_SYSCALL) | 1221 (bit(X86_FEATURE_NX) && is_efer_nx()) | 1222 #ifdef CONFIG_X86_64 1223 bit(X86_FEATURE_LM) | 1224 #endif 1225 bit(X86_FEATURE_MMXEXT) | 1226 bit(X86_FEATURE_3DNOWEXT) | 1227 bit(X86_FEATURE_3DNOW); 1228 const u32 kvm_supported_word3_x86_features = 1229 bit(X86_FEATURE_XMM3) | bit(X86_FEATURE_CX16); 1230 const u32 kvm_supported_word6_x86_features = 1231 bit(X86_FEATURE_LAHF_LM) | bit(X86_FEATURE_CMP_LEGACY); 1232 1233 /* all func 2 cpuid_count() should be called on the same cpu */ 1234 get_cpu(); 1235 do_cpuid_1_ent(entry, function, index); 1236 ++*nent; 1237 1238 switch (function) { 1239 case 0: 1240 entry->eax = min(entry->eax, (u32)0xb); 1241 break; 1242 case 1: 1243 entry->edx &= kvm_supported_word0_x86_features; 1244 entry->ecx &= kvm_supported_word3_x86_features; 1245 break; 1246 /* function 2 entries are STATEFUL. That is, repeated cpuid commands 1247 * may return different values. 
This forces us to get_cpu() before 1248 * issuing the first command, and also to emulate this annoying behavior 1249 * in kvm_emulate_cpuid() using KVM_CPUID_FLAG_STATE_READ_NEXT */ 1250 case 2: { 1251 int t, times = entry->eax & 0xff; 1252 1253 entry->flags |= KVM_CPUID_FLAG_STATEFUL_FUNC; 1254 entry->flags |= KVM_CPUID_FLAG_STATE_READ_NEXT; 1255 for (t = 1; t < times && *nent < maxnent; ++t) { 1256 do_cpuid_1_ent(&entry[t], function, 0); 1257 entry[t].flags |= KVM_CPUID_FLAG_STATEFUL_FUNC; 1258 ++*nent; 1259 } 1260 break; 1261 } 1262 /* function 4 and 0xb have additional index. */ 1263 case 4: { 1264 int i, cache_type; 1265 1266 entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX; 1267 /* read more entries until cache_type is zero */ 1268 for (i = 1; *nent < maxnent; ++i) { 1269 cache_type = entry[i - 1].eax & 0x1f; 1270 if (!cache_type) 1271 break; 1272 do_cpuid_1_ent(&entry[i], function, i); 1273 entry[i].flags |= 1274 KVM_CPUID_FLAG_SIGNIFCANT_INDEX; 1275 ++*nent; 1276 } 1277 break; 1278 } 1279 case 0xb: { 1280 int i, level_type; 1281 1282 entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX; 1283 /* read more entries until level_type is zero */ 1284 for (i = 1; *nent < maxnent; ++i) { 1285 level_type = entry[i - 1].ecx & 0xff00; 1286 if (!level_type) 1287 break; 1288 do_cpuid_1_ent(&entry[i], function, i); 1289 entry[i].flags |= 1290 KVM_CPUID_FLAG_SIGNIFCANT_INDEX; 1291 ++*nent; 1292 } 1293 break; 1294 } 1295 case 0x80000000: 1296 entry->eax = min(entry->eax, 0x8000001a); 1297 break; 1298 case 0x80000001: 1299 entry->edx &= kvm_supported_word1_x86_features; 1300 entry->ecx &= kvm_supported_word6_x86_features; 1301 break; 1302 } 1303 put_cpu(); 1304 } 1305 1306 static int kvm_dev_ioctl_get_supported_cpuid(struct kvm_cpuid2 *cpuid, 1307 struct kvm_cpuid_entry2 __user *entries) 1308 { 1309 struct kvm_cpuid_entry2 *cpuid_entries; 1310 int limit, nent = 0, r = -E2BIG; 1311 u32 func; 1312 1313 if (cpuid->nent < 1) 1314 goto out; 1315 r = -ENOMEM; 1316 cpuid_entries = vmalloc(sizeof(struct kvm_cpuid_entry2) * cpuid->nent); 1317 if (!cpuid_entries) 1318 goto out; 1319 1320 do_cpuid_ent(&cpuid_entries[0], 0, 0, &nent, cpuid->nent); 1321 limit = cpuid_entries[0].eax; 1322 for (func = 1; func <= limit && nent < cpuid->nent; ++func) 1323 do_cpuid_ent(&cpuid_entries[nent], func, 0, 1324 &nent, cpuid->nent); 1325 r = -E2BIG; 1326 if (nent >= cpuid->nent) 1327 goto out_free; 1328 1329 do_cpuid_ent(&cpuid_entries[nent], 0x80000000, 0, &nent, cpuid->nent); 1330 limit = cpuid_entries[nent - 1].eax; 1331 for (func = 0x80000001; func <= limit && nent < cpuid->nent; ++func) 1332 do_cpuid_ent(&cpuid_entries[nent], func, 0, 1333 &nent, cpuid->nent); 1334 r = -EFAULT; 1335 if (copy_to_user(entries, cpuid_entries, 1336 nent * sizeof(struct kvm_cpuid_entry2))) 1337 goto out_free; 1338 cpuid->nent = nent; 1339 r = 0; 1340 1341 out_free: 1342 vfree(cpuid_entries); 1343 out: 1344 return r; 1345 } 1346 1347 static int kvm_vcpu_ioctl_get_lapic(struct kvm_vcpu *vcpu, 1348 struct kvm_lapic_state *s) 1349 { 1350 vcpu_load(vcpu); 1351 memcpy(s->regs, vcpu->arch.apic->regs, sizeof *s); 1352 vcpu_put(vcpu); 1353 1354 return 0; 1355 } 1356 1357 static int kvm_vcpu_ioctl_set_lapic(struct kvm_vcpu *vcpu, 1358 struct kvm_lapic_state *s) 1359 { 1360 vcpu_load(vcpu); 1361 memcpy(vcpu->arch.apic->regs, s->regs, sizeof *s); 1362 kvm_apic_post_state_restore(vcpu); 1363 vcpu_put(vcpu); 1364 1365 return 0; 1366 } 1367 1368 static int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu, 1369 struct kvm_interrupt *irq) 1370 { 1371 if (irq->irq < 
0 || irq->irq >= 256) 1372 return -EINVAL; 1373 if (irqchip_in_kernel(vcpu->kvm)) 1374 return -ENXIO; 1375 vcpu_load(vcpu); 1376 1377 set_bit(irq->irq, vcpu->arch.irq_pending); 1378 set_bit(irq->irq / BITS_PER_LONG, &vcpu->arch.irq_summary); 1379 1380 vcpu_put(vcpu); 1381 1382 return 0; 1383 } 1384 1385 static int kvm_vcpu_ioctl_nmi(struct kvm_vcpu *vcpu) 1386 { 1387 vcpu_load(vcpu); 1388 kvm_inject_nmi(vcpu); 1389 vcpu_put(vcpu); 1390 1391 return 0; 1392 } 1393 1394 static int vcpu_ioctl_tpr_access_reporting(struct kvm_vcpu *vcpu, 1395 struct kvm_tpr_access_ctl *tac) 1396 { 1397 if (tac->flags) 1398 return -EINVAL; 1399 vcpu->arch.tpr_access_reporting = !!tac->enabled; 1400 return 0; 1401 } 1402 1403 long kvm_arch_vcpu_ioctl(struct file *filp, 1404 unsigned int ioctl, unsigned long arg) 1405 { 1406 struct kvm_vcpu *vcpu = filp->private_data; 1407 void __user *argp = (void __user *)arg; 1408 int r; 1409 struct kvm_lapic_state *lapic = NULL; 1410 1411 switch (ioctl) { 1412 case KVM_GET_LAPIC: { 1413 lapic = kzalloc(sizeof(struct kvm_lapic_state), GFP_KERNEL); 1414 1415 r = -ENOMEM; 1416 if (!lapic) 1417 goto out; 1418 r = kvm_vcpu_ioctl_get_lapic(vcpu, lapic); 1419 if (r) 1420 goto out; 1421 r = -EFAULT; 1422 if (copy_to_user(argp, lapic, sizeof(struct kvm_lapic_state))) 1423 goto out; 1424 r = 0; 1425 break; 1426 } 1427 case KVM_SET_LAPIC: { 1428 lapic = kmalloc(sizeof(struct kvm_lapic_state), GFP_KERNEL); 1429 r = -ENOMEM; 1430 if (!lapic) 1431 goto out; 1432 r = -EFAULT; 1433 if (copy_from_user(lapic, argp, sizeof(struct kvm_lapic_state))) 1434 goto out; 1435 r = kvm_vcpu_ioctl_set_lapic(vcpu, lapic); 1436 if (r) 1437 goto out; 1438 r = 0; 1439 break; 1440 } 1441 case KVM_INTERRUPT: { 1442 struct kvm_interrupt irq; 1443 1444 r = -EFAULT; 1445 if (copy_from_user(&irq, argp, sizeof irq)) 1446 goto out; 1447 r = kvm_vcpu_ioctl_interrupt(vcpu, &irq); 1448 if (r) 1449 goto out; 1450 r = 0; 1451 break; 1452 } 1453 case KVM_NMI: { 1454 r = kvm_vcpu_ioctl_nmi(vcpu); 1455 if (r) 1456 goto out; 1457 r = 0; 1458 break; 1459 } 1460 case KVM_SET_CPUID: { 1461 struct kvm_cpuid __user *cpuid_arg = argp; 1462 struct kvm_cpuid cpuid; 1463 1464 r = -EFAULT; 1465 if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid)) 1466 goto out; 1467 r = kvm_vcpu_ioctl_set_cpuid(vcpu, &cpuid, cpuid_arg->entries); 1468 if (r) 1469 goto out; 1470 break; 1471 } 1472 case KVM_SET_CPUID2: { 1473 struct kvm_cpuid2 __user *cpuid_arg = argp; 1474 struct kvm_cpuid2 cpuid; 1475 1476 r = -EFAULT; 1477 if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid)) 1478 goto out; 1479 r = kvm_vcpu_ioctl_set_cpuid2(vcpu, &cpuid, 1480 cpuid_arg->entries); 1481 if (r) 1482 goto out; 1483 break; 1484 } 1485 case KVM_GET_CPUID2: { 1486 struct kvm_cpuid2 __user *cpuid_arg = argp; 1487 struct kvm_cpuid2 cpuid; 1488 1489 r = -EFAULT; 1490 if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid)) 1491 goto out; 1492 r = kvm_vcpu_ioctl_get_cpuid2(vcpu, &cpuid, 1493 cpuid_arg->entries); 1494 if (r) 1495 goto out; 1496 r = -EFAULT; 1497 if (copy_to_user(cpuid_arg, &cpuid, sizeof cpuid)) 1498 goto out; 1499 r = 0; 1500 break; 1501 } 1502 case KVM_GET_MSRS: 1503 r = msr_io(vcpu, argp, kvm_get_msr, 1); 1504 break; 1505 case KVM_SET_MSRS: 1506 r = msr_io(vcpu, argp, do_set_msr, 0); 1507 break; 1508 case KVM_TPR_ACCESS_REPORTING: { 1509 struct kvm_tpr_access_ctl tac; 1510 1511 r = -EFAULT; 1512 if (copy_from_user(&tac, argp, sizeof tac)) 1513 goto out; 1514 r = vcpu_ioctl_tpr_access_reporting(vcpu, &tac); 1515 if (r) 1516 goto out; 1517 r = -EFAULT; 1518 if 
(copy_to_user(argp, &tac, sizeof tac)) 1519 goto out; 1520 r = 0; 1521 break; 1522 }; 1523 case KVM_SET_VAPIC_ADDR: { 1524 struct kvm_vapic_addr va; 1525 1526 r = -EINVAL; 1527 if (!irqchip_in_kernel(vcpu->kvm)) 1528 goto out; 1529 r = -EFAULT; 1530 if (copy_from_user(&va, argp, sizeof va)) 1531 goto out; 1532 r = 0; 1533 kvm_lapic_set_vapic_addr(vcpu, va.vapic_addr); 1534 break; 1535 } 1536 default: 1537 r = -EINVAL; 1538 } 1539 out: 1540 if (lapic) 1541 kfree(lapic); 1542 return r; 1543 } 1544 1545 static int kvm_vm_ioctl_set_tss_addr(struct kvm *kvm, unsigned long addr) 1546 { 1547 int ret; 1548 1549 if (addr > (unsigned int)(-3 * PAGE_SIZE)) 1550 return -1; 1551 ret = kvm_x86_ops->set_tss_addr(kvm, addr); 1552 return ret; 1553 } 1554 1555 static int kvm_vm_ioctl_set_nr_mmu_pages(struct kvm *kvm, 1556 u32 kvm_nr_mmu_pages) 1557 { 1558 if (kvm_nr_mmu_pages < KVM_MIN_ALLOC_MMU_PAGES) 1559 return -EINVAL; 1560 1561 down_write(&kvm->slots_lock); 1562 1563 kvm_mmu_change_mmu_pages(kvm, kvm_nr_mmu_pages); 1564 kvm->arch.n_requested_mmu_pages = kvm_nr_mmu_pages; 1565 1566 up_write(&kvm->slots_lock); 1567 return 0; 1568 } 1569 1570 static int kvm_vm_ioctl_get_nr_mmu_pages(struct kvm *kvm) 1571 { 1572 return kvm->arch.n_alloc_mmu_pages; 1573 } 1574 1575 gfn_t unalias_gfn(struct kvm *kvm, gfn_t gfn) 1576 { 1577 int i; 1578 struct kvm_mem_alias *alias; 1579 1580 for (i = 0; i < kvm->arch.naliases; ++i) { 1581 alias = &kvm->arch.aliases[i]; 1582 if (gfn >= alias->base_gfn 1583 && gfn < alias->base_gfn + alias->npages) 1584 return alias->target_gfn + gfn - alias->base_gfn; 1585 } 1586 return gfn; 1587 } 1588 1589 /* 1590 * Set a new alias region. Aliases map a portion of physical memory into 1591 * another portion. This is useful for memory windows, for example the PC 1592 * VGA region. 
1593 */ 1594 static int kvm_vm_ioctl_set_memory_alias(struct kvm *kvm, 1595 struct kvm_memory_alias *alias) 1596 { 1597 int r, n; 1598 struct kvm_mem_alias *p; 1599 1600 r = -EINVAL; 1601 /* General sanity checks */ 1602 if (alias->memory_size & (PAGE_SIZE - 1)) 1603 goto out; 1604 if (alias->guest_phys_addr & (PAGE_SIZE - 1)) 1605 goto out; 1606 if (alias->slot >= KVM_ALIAS_SLOTS) 1607 goto out; 1608 if (alias->guest_phys_addr + alias->memory_size 1609 < alias->guest_phys_addr) 1610 goto out; 1611 if (alias->target_phys_addr + alias->memory_size 1612 < alias->target_phys_addr) 1613 goto out; 1614 1615 down_write(&kvm->slots_lock); 1616 spin_lock(&kvm->mmu_lock); 1617 1618 p = &kvm->arch.aliases[alias->slot]; 1619 p->base_gfn = alias->guest_phys_addr >> PAGE_SHIFT; 1620 p->npages = alias->memory_size >> PAGE_SHIFT; 1621 p->target_gfn = alias->target_phys_addr >> PAGE_SHIFT; 1622 1623 for (n = KVM_ALIAS_SLOTS; n > 0; --n) 1624 if (kvm->arch.aliases[n - 1].npages) 1625 break; 1626 kvm->arch.naliases = n; 1627 1628 spin_unlock(&kvm->mmu_lock); 1629 kvm_mmu_zap_all(kvm); 1630 1631 up_write(&kvm->slots_lock); 1632 1633 return 0; 1634 1635 out: 1636 return r; 1637 } 1638 1639 static int kvm_vm_ioctl_get_irqchip(struct kvm *kvm, struct kvm_irqchip *chip) 1640 { 1641 int r; 1642 1643 r = 0; 1644 switch (chip->chip_id) { 1645 case KVM_IRQCHIP_PIC_MASTER: 1646 memcpy(&chip->chip.pic, 1647 &pic_irqchip(kvm)->pics[0], 1648 sizeof(struct kvm_pic_state)); 1649 break; 1650 case KVM_IRQCHIP_PIC_SLAVE: 1651 memcpy(&chip->chip.pic, 1652 &pic_irqchip(kvm)->pics[1], 1653 sizeof(struct kvm_pic_state)); 1654 break; 1655 case KVM_IRQCHIP_IOAPIC: 1656 memcpy(&chip->chip.ioapic, 1657 ioapic_irqchip(kvm), 1658 sizeof(struct kvm_ioapic_state)); 1659 break; 1660 default: 1661 r = -EINVAL; 1662 break; 1663 } 1664 return r; 1665 } 1666 1667 static int kvm_vm_ioctl_set_irqchip(struct kvm *kvm, struct kvm_irqchip *chip) 1668 { 1669 int r; 1670 1671 r = 0; 1672 switch (chip->chip_id) { 1673 case KVM_IRQCHIP_PIC_MASTER: 1674 memcpy(&pic_irqchip(kvm)->pics[0], 1675 &chip->chip.pic, 1676 sizeof(struct kvm_pic_state)); 1677 break; 1678 case KVM_IRQCHIP_PIC_SLAVE: 1679 memcpy(&pic_irqchip(kvm)->pics[1], 1680 &chip->chip.pic, 1681 sizeof(struct kvm_pic_state)); 1682 break; 1683 case KVM_IRQCHIP_IOAPIC: 1684 memcpy(ioapic_irqchip(kvm), 1685 &chip->chip.ioapic, 1686 sizeof(struct kvm_ioapic_state)); 1687 break; 1688 default: 1689 r = -EINVAL; 1690 break; 1691 } 1692 kvm_pic_update_irq(pic_irqchip(kvm)); 1693 return r; 1694 } 1695 1696 static int kvm_vm_ioctl_get_pit(struct kvm *kvm, struct kvm_pit_state *ps) 1697 { 1698 int r = 0; 1699 1700 memcpy(ps, &kvm->arch.vpit->pit_state, sizeof(struct kvm_pit_state)); 1701 return r; 1702 } 1703 1704 static int kvm_vm_ioctl_set_pit(struct kvm *kvm, struct kvm_pit_state *ps) 1705 { 1706 int r = 0; 1707 1708 memcpy(&kvm->arch.vpit->pit_state, ps, sizeof(struct kvm_pit_state)); 1709 kvm_pit_load_count(kvm, 0, ps->channels[0].count); 1710 return r; 1711 } 1712 1713 /* 1714 * Get (and clear) the dirty memory log for a memory slot. 1715 */ 1716 int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, 1717 struct kvm_dirty_log *log) 1718 { 1719 int r; 1720 int n; 1721 struct kvm_memory_slot *memslot; 1722 int is_dirty = 0; 1723 1724 down_write(&kvm->slots_lock); 1725 1726 r = kvm_get_dirty_log(kvm, log, &is_dirty); 1727 if (r) 1728 goto out; 1729 1730 /* If nothing is dirty, don't bother messing with page tables. 
*/ 1731 if (is_dirty) { 1732 kvm_mmu_slot_remove_write_access(kvm, log->slot); 1733 kvm_flush_remote_tlbs(kvm); 1734 memslot = &kvm->memslots[log->slot]; 1735 n = ALIGN(memslot->npages, BITS_PER_LONG) / 8; 1736 memset(memslot->dirty_bitmap, 0, n); 1737 } 1738 r = 0; 1739 out: 1740 up_write(&kvm->slots_lock); 1741 return r; 1742 } 1743 1744 long kvm_arch_vm_ioctl(struct file *filp, 1745 unsigned int ioctl, unsigned long arg) 1746 { 1747 struct kvm *kvm = filp->private_data; 1748 void __user *argp = (void __user *)arg; 1749 int r = -EINVAL; 1750 /* 1751 * This union makes it completely explicit to gcc-3.x 1752 * that these two variables' stack usage should be 1753 * combined, not added together. 1754 */ 1755 union { 1756 struct kvm_pit_state ps; 1757 struct kvm_memory_alias alias; 1758 } u; 1759 1760 switch (ioctl) { 1761 case KVM_SET_TSS_ADDR: 1762 r = kvm_vm_ioctl_set_tss_addr(kvm, arg); 1763 if (r < 0) 1764 goto out; 1765 break; 1766 case KVM_SET_MEMORY_REGION: { 1767 struct kvm_memory_region kvm_mem; 1768 struct kvm_userspace_memory_region kvm_userspace_mem; 1769 1770 r = -EFAULT; 1771 if (copy_from_user(&kvm_mem, argp, sizeof kvm_mem)) 1772 goto out; 1773 kvm_userspace_mem.slot = kvm_mem.slot; 1774 kvm_userspace_mem.flags = kvm_mem.flags; 1775 kvm_userspace_mem.guest_phys_addr = kvm_mem.guest_phys_addr; 1776 kvm_userspace_mem.memory_size = kvm_mem.memory_size; 1777 r = kvm_vm_ioctl_set_memory_region(kvm, &kvm_userspace_mem, 0); 1778 if (r) 1779 goto out; 1780 break; 1781 } 1782 case KVM_SET_NR_MMU_PAGES: 1783 r = kvm_vm_ioctl_set_nr_mmu_pages(kvm, arg); 1784 if (r) 1785 goto out; 1786 break; 1787 case KVM_GET_NR_MMU_PAGES: 1788 r = kvm_vm_ioctl_get_nr_mmu_pages(kvm); 1789 break; 1790 case KVM_SET_MEMORY_ALIAS: 1791 r = -EFAULT; 1792 if (copy_from_user(&u.alias, argp, sizeof(struct kvm_memory_alias))) 1793 goto out; 1794 r = kvm_vm_ioctl_set_memory_alias(kvm, &u.alias); 1795 if (r) 1796 goto out; 1797 break; 1798 case KVM_CREATE_IRQCHIP: 1799 r = -ENOMEM; 1800 kvm->arch.vpic = kvm_create_pic(kvm); 1801 if (kvm->arch.vpic) { 1802 r = kvm_ioapic_init(kvm); 1803 if (r) { 1804 kfree(kvm->arch.vpic); 1805 kvm->arch.vpic = NULL; 1806 goto out; 1807 } 1808 } else 1809 goto out; 1810 break; 1811 case KVM_CREATE_PIT: 1812 r = -ENOMEM; 1813 kvm->arch.vpit = kvm_create_pit(kvm); 1814 if (kvm->arch.vpit) 1815 r = 0; 1816 break; 1817 case KVM_IRQ_LINE: { 1818 struct kvm_irq_level irq_event; 1819 1820 r = -EFAULT; 1821 if (copy_from_user(&irq_event, argp, sizeof irq_event)) 1822 goto out; 1823 if (irqchip_in_kernel(kvm)) { 1824 mutex_lock(&kvm->lock); 1825 kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID, 1826 irq_event.irq, irq_event.level); 1827 mutex_unlock(&kvm->lock); 1828 r = 0; 1829 } 1830 break; 1831 } 1832 case KVM_GET_IRQCHIP: { 1833 /* 0: PIC master, 1: PIC slave, 2: IOAPIC */ 1834 struct kvm_irqchip *chip = kmalloc(sizeof(*chip), GFP_KERNEL); 1835 1836 r = -ENOMEM; 1837 if (!chip) 1838 goto out; 1839 r = -EFAULT; 1840 if (copy_from_user(chip, argp, sizeof *chip)) 1841 goto get_irqchip_out; 1842 r = -ENXIO; 1843 if (!irqchip_in_kernel(kvm)) 1844 goto get_irqchip_out; 1845 r = kvm_vm_ioctl_get_irqchip(kvm, chip); 1846 if (r) 1847 goto get_irqchip_out; 1848 r = -EFAULT; 1849 if (copy_to_user(argp, chip, sizeof *chip)) 1850 goto get_irqchip_out; 1851 r = 0; 1852 get_irqchip_out: 1853 kfree(chip); 1854 if (r) 1855 goto out; 1856 break; 1857 } 1858 case KVM_SET_IRQCHIP: { 1859 /* 0: PIC master, 1: PIC slave, 2: IOAPIC */ 1860 struct kvm_irqchip *chip = kmalloc(sizeof(*chip), GFP_KERNEL); 1861 1862 r 
= -ENOMEM; 1863 if (!chip) 1864 goto out; 1865 r = -EFAULT; 1866 if (copy_from_user(chip, argp, sizeof *chip)) 1867 goto set_irqchip_out; 1868 r = -ENXIO; 1869 if (!irqchip_in_kernel(kvm)) 1870 goto set_irqchip_out; 1871 r = kvm_vm_ioctl_set_irqchip(kvm, chip); 1872 if (r) 1873 goto set_irqchip_out; 1874 r = 0; 1875 set_irqchip_out: 1876 kfree(chip); 1877 if (r) 1878 goto out; 1879 break; 1880 } 1881 case KVM_GET_PIT: { 1882 r = -EFAULT; 1883 if (copy_from_user(&u.ps, argp, sizeof(struct kvm_pit_state))) 1884 goto out; 1885 r = -ENXIO; 1886 if (!kvm->arch.vpit) 1887 goto out; 1888 r = kvm_vm_ioctl_get_pit(kvm, &u.ps); 1889 if (r) 1890 goto out; 1891 r = -EFAULT; 1892 if (copy_to_user(argp, &u.ps, sizeof(struct kvm_pit_state))) 1893 goto out; 1894 r = 0; 1895 break; 1896 } 1897 case KVM_SET_PIT: { 1898 r = -EFAULT; 1899 if (copy_from_user(&u.ps, argp, sizeof u.ps)) 1900 goto out; 1901 r = -ENXIO; 1902 if (!kvm->arch.vpit) 1903 goto out; 1904 r = kvm_vm_ioctl_set_pit(kvm, &u.ps); 1905 if (r) 1906 goto out; 1907 r = 0; 1908 break; 1909 } 1910 default: 1911 ; 1912 } 1913 out: 1914 return r; 1915 } 1916 1917 static void kvm_init_msr_list(void) 1918 { 1919 u32 dummy[2]; 1920 unsigned i, j; 1921 1922 for (i = j = 0; i < ARRAY_SIZE(msrs_to_save); i++) { 1923 if (rdmsr_safe(msrs_to_save[i], &dummy[0], &dummy[1]) < 0) 1924 continue; 1925 if (j < i) 1926 msrs_to_save[j] = msrs_to_save[i]; 1927 j++; 1928 } 1929 num_msrs_to_save = j; 1930 } 1931 1932 /* 1933 * Only apic need an MMIO device hook, so shortcut now.. 1934 */ 1935 static struct kvm_io_device *vcpu_find_pervcpu_dev(struct kvm_vcpu *vcpu, 1936 gpa_t addr, int len, 1937 int is_write) 1938 { 1939 struct kvm_io_device *dev; 1940 1941 if (vcpu->arch.apic) { 1942 dev = &vcpu->arch.apic->dev; 1943 if (dev->in_range(dev, addr, len, is_write)) 1944 return dev; 1945 } 1946 return NULL; 1947 } 1948 1949 1950 static struct kvm_io_device *vcpu_find_mmio_dev(struct kvm_vcpu *vcpu, 1951 gpa_t addr, int len, 1952 int is_write) 1953 { 1954 struct kvm_io_device *dev; 1955 1956 dev = vcpu_find_pervcpu_dev(vcpu, addr, len, is_write); 1957 if (dev == NULL) 1958 dev = kvm_io_bus_find_dev(&vcpu->kvm->mmio_bus, addr, len, 1959 is_write); 1960 return dev; 1961 } 1962 1963 int emulator_read_std(unsigned long addr, 1964 void *val, 1965 unsigned int bytes, 1966 struct kvm_vcpu *vcpu) 1967 { 1968 void *data = val; 1969 int r = X86EMUL_CONTINUE; 1970 1971 while (bytes) { 1972 gpa_t gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr); 1973 unsigned offset = addr & (PAGE_SIZE-1); 1974 unsigned tocopy = min(bytes, (unsigned)PAGE_SIZE - offset); 1975 int ret; 1976 1977 if (gpa == UNMAPPED_GVA) { 1978 r = X86EMUL_PROPAGATE_FAULT; 1979 goto out; 1980 } 1981 ret = kvm_read_guest(vcpu->kvm, gpa, data, tocopy); 1982 if (ret < 0) { 1983 r = X86EMUL_UNHANDLEABLE; 1984 goto out; 1985 } 1986 1987 bytes -= tocopy; 1988 data += tocopy; 1989 addr += tocopy; 1990 } 1991 out: 1992 return r; 1993 } 1994 EXPORT_SYMBOL_GPL(emulator_read_std); 1995 1996 static int emulator_read_emulated(unsigned long addr, 1997 void *val, 1998 unsigned int bytes, 1999 struct kvm_vcpu *vcpu) 2000 { 2001 struct kvm_io_device *mmio_dev; 2002 gpa_t gpa; 2003 2004 if (vcpu->mmio_read_completed) { 2005 memcpy(val, vcpu->mmio_data, bytes); 2006 vcpu->mmio_read_completed = 0; 2007 return X86EMUL_CONTINUE; 2008 } 2009 2010 gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr); 2011 2012 /* For APIC access vmexit */ 2013 if ((gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE) 2014 goto mmio; 2015 2016 if (emulator_read_std(addr, val, bytes, 
vcpu) 2017 == X86EMUL_CONTINUE) 2018 return X86EMUL_CONTINUE; 2019 if (gpa == UNMAPPED_GVA) 2020 return X86EMUL_PROPAGATE_FAULT; 2021 2022 mmio: 2023 /* 2024 * Is this MMIO handled locally? 2025 */ 2026 mutex_lock(&vcpu->kvm->lock); 2027 mmio_dev = vcpu_find_mmio_dev(vcpu, gpa, bytes, 0); 2028 if (mmio_dev) { 2029 kvm_iodevice_read(mmio_dev, gpa, bytes, val); 2030 mutex_unlock(&vcpu->kvm->lock); 2031 return X86EMUL_CONTINUE; 2032 } 2033 mutex_unlock(&vcpu->kvm->lock); 2034 2035 vcpu->mmio_needed = 1; 2036 vcpu->mmio_phys_addr = gpa; 2037 vcpu->mmio_size = bytes; 2038 vcpu->mmio_is_write = 0; 2039 2040 return X86EMUL_UNHANDLEABLE; 2041 } 2042 2043 int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa, 2044 const void *val, int bytes) 2045 { 2046 int ret; 2047 2048 ret = kvm_write_guest(vcpu->kvm, gpa, val, bytes); 2049 if (ret < 0) 2050 return 0; 2051 kvm_mmu_pte_write(vcpu, gpa, val, bytes, 1); 2052 return 1; 2053 } 2054 2055 static int emulator_write_emulated_onepage(unsigned long addr, 2056 const void *val, 2057 unsigned int bytes, 2058 struct kvm_vcpu *vcpu) 2059 { 2060 struct kvm_io_device *mmio_dev; 2061 gpa_t gpa; 2062 2063 gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr); 2064 2065 if (gpa == UNMAPPED_GVA) { 2066 kvm_inject_page_fault(vcpu, addr, 2); 2067 return X86EMUL_PROPAGATE_FAULT; 2068 } 2069 2070 /* For APIC access vmexit */ 2071 if ((gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE) 2072 goto mmio; 2073 2074 if (emulator_write_phys(vcpu, gpa, val, bytes)) 2075 return X86EMUL_CONTINUE; 2076 2077 mmio: 2078 /* 2079 * Is this MMIO handled locally? 2080 */ 2081 mutex_lock(&vcpu->kvm->lock); 2082 mmio_dev = vcpu_find_mmio_dev(vcpu, gpa, bytes, 1); 2083 if (mmio_dev) { 2084 kvm_iodevice_write(mmio_dev, gpa, bytes, val); 2085 mutex_unlock(&vcpu->kvm->lock); 2086 return X86EMUL_CONTINUE; 2087 } 2088 mutex_unlock(&vcpu->kvm->lock); 2089 2090 vcpu->mmio_needed = 1; 2091 vcpu->mmio_phys_addr = gpa; 2092 vcpu->mmio_size = bytes; 2093 vcpu->mmio_is_write = 1; 2094 memcpy(vcpu->mmio_data, val, bytes); 2095 2096 return X86EMUL_CONTINUE; 2097 } 2098 2099 int emulator_write_emulated(unsigned long addr, 2100 const void *val, 2101 unsigned int bytes, 2102 struct kvm_vcpu *vcpu) 2103 { 2104 /* Crossing a page boundary? 
*/ 2105 if (((addr + bytes - 1) ^ addr) & PAGE_MASK) { 2106 int rc, now; 2107 2108 now = -addr & ~PAGE_MASK; 2109 rc = emulator_write_emulated_onepage(addr, val, now, vcpu); 2110 if (rc != X86EMUL_CONTINUE) 2111 return rc; 2112 addr += now; 2113 val += now; 2114 bytes -= now; 2115 } 2116 return emulator_write_emulated_onepage(addr, val, bytes, vcpu); 2117 } 2118 EXPORT_SYMBOL_GPL(emulator_write_emulated); 2119 2120 static int emulator_cmpxchg_emulated(unsigned long addr, 2121 const void *old, 2122 const void *new, 2123 unsigned int bytes, 2124 struct kvm_vcpu *vcpu) 2125 { 2126 static int reported; 2127 2128 if (!reported) { 2129 reported = 1; 2130 printk(KERN_WARNING "kvm: emulating exchange as write\n"); 2131 } 2132 #ifndef CONFIG_X86_64 2133 /* guests cmpxchg8b have to be emulated atomically */ 2134 if (bytes == 8) { 2135 gpa_t gpa; 2136 struct page *page; 2137 char *kaddr; 2138 u64 val; 2139 2140 gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr); 2141 2142 if (gpa == UNMAPPED_GVA || 2143 (gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE) 2144 goto emul_write; 2145 2146 if (((gpa + bytes - 1) & PAGE_MASK) != (gpa & PAGE_MASK)) 2147 goto emul_write; 2148 2149 val = *(u64 *)new; 2150 2151 page = gfn_to_page(vcpu->kvm, gpa >> PAGE_SHIFT); 2152 2153 kaddr = kmap_atomic(page, KM_USER0); 2154 set_64bit((u64 *)(kaddr + offset_in_page(gpa)), val); 2155 kunmap_atomic(kaddr, KM_USER0); 2156 kvm_release_page_dirty(page); 2157 } 2158 emul_write: 2159 #endif 2160 2161 return emulator_write_emulated(addr, new, bytes, vcpu); 2162 } 2163 2164 static unsigned long get_segment_base(struct kvm_vcpu *vcpu, int seg) 2165 { 2166 return kvm_x86_ops->get_segment_base(vcpu, seg); 2167 } 2168 2169 int emulate_invlpg(struct kvm_vcpu *vcpu, gva_t address) 2170 { 2171 kvm_mmu_invlpg(vcpu, address); 2172 return X86EMUL_CONTINUE; 2173 } 2174 2175 int emulate_clts(struct kvm_vcpu *vcpu) 2176 { 2177 KVMTRACE_0D(CLTS, vcpu, handler); 2178 kvm_x86_ops->set_cr0(vcpu, vcpu->arch.cr0 & ~X86_CR0_TS); 2179 return X86EMUL_CONTINUE; 2180 } 2181 2182 int emulator_get_dr(struct x86_emulate_ctxt *ctxt, int dr, unsigned long *dest) 2183 { 2184 struct kvm_vcpu *vcpu = ctxt->vcpu; 2185 2186 switch (dr) { 2187 case 0 ... 3: 2188 *dest = kvm_x86_ops->get_dr(vcpu, dr); 2189 return X86EMUL_CONTINUE; 2190 default: 2191 pr_unimpl(vcpu, "%s: unexpected dr %u\n", __func__, dr); 2192 return X86EMUL_UNHANDLEABLE; 2193 } 2194 } 2195 2196 int emulator_set_dr(struct x86_emulate_ctxt *ctxt, int dr, unsigned long value) 2197 { 2198 unsigned long mask = (ctxt->mode == X86EMUL_MODE_PROT64) ? 
		~0ULL : ~0U;
	int exception;

	kvm_x86_ops->set_dr(ctxt->vcpu, dr, value & mask, &exception);
	if (exception) {
		/* FIXME: better handling */
		return X86EMUL_UNHANDLEABLE;
	}
	return X86EMUL_CONTINUE;
}

void kvm_report_emulation_failure(struct kvm_vcpu *vcpu, const char *context)
{
	u8 opcodes[4];
	unsigned long rip = kvm_rip_read(vcpu);
	unsigned long rip_linear;

	if (!printk_ratelimit())
		return;

	rip_linear = rip + get_segment_base(vcpu, VCPU_SREG_CS);

	emulator_read_std(rip_linear, (void *)opcodes, 4, vcpu);

	printk(KERN_ERR "emulation failed (%s) rip %lx %02x %02x %02x %02x\n",
	       context, rip, opcodes[0], opcodes[1], opcodes[2], opcodes[3]);
}
EXPORT_SYMBOL_GPL(kvm_report_emulation_failure);

static struct x86_emulate_ops emulate_ops = {
	.read_std = emulator_read_std,
	.read_emulated = emulator_read_emulated,
	.write_emulated = emulator_write_emulated,
	.cmpxchg_emulated = emulator_cmpxchg_emulated,
};

static void cache_all_regs(struct kvm_vcpu *vcpu)
{
	kvm_register_read(vcpu, VCPU_REGS_RAX);
	kvm_register_read(vcpu, VCPU_REGS_RSP);
	kvm_register_read(vcpu, VCPU_REGS_RIP);
	vcpu->arch.regs_dirty = ~0;
}

int emulate_instruction(struct kvm_vcpu *vcpu,
			struct kvm_run *run,
			unsigned long cr2,
			u16 error_code,
			int emulation_type)
{
	int r;
	struct decode_cache *c;

	kvm_clear_exception_queue(vcpu);
	vcpu->arch.mmio_fault_cr2 = cr2;
	/*
	 * TODO: fix x86_emulate.c to use guest_read/write_register
	 * instead of direct ->regs accesses; that can save a hundred
	 * cycles on Intel for instructions that don't read/change RSP,
	 * for example.
	 */
	cache_all_regs(vcpu);

	vcpu->mmio_is_write = 0;
	vcpu->arch.pio.string = 0;

	if (!(emulation_type & EMULTYPE_NO_DECODE)) {
		int cs_db, cs_l;
		kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l);

		vcpu->arch.emulate_ctxt.vcpu = vcpu;
		vcpu->arch.emulate_ctxt.eflags = kvm_x86_ops->get_rflags(vcpu);
		vcpu->arch.emulate_ctxt.mode =
			(vcpu->arch.emulate_ctxt.eflags & X86_EFLAGS_VM)
			? X86EMUL_MODE_REAL : cs_l
			? X86EMUL_MODE_PROT64 : cs_db
			?
X86EMUL_MODE_PROT32 : X86EMUL_MODE_PROT16; 2275 2276 r = x86_decode_insn(&vcpu->arch.emulate_ctxt, &emulate_ops); 2277 2278 /* Reject the instructions other than VMCALL/VMMCALL when 2279 * try to emulate invalid opcode */ 2280 c = &vcpu->arch.emulate_ctxt.decode; 2281 if ((emulation_type & EMULTYPE_TRAP_UD) && 2282 (!(c->twobyte && c->b == 0x01 && 2283 (c->modrm_reg == 0 || c->modrm_reg == 3) && 2284 c->modrm_mod == 3 && c->modrm_rm == 1))) 2285 return EMULATE_FAIL; 2286 2287 ++vcpu->stat.insn_emulation; 2288 if (r) { 2289 ++vcpu->stat.insn_emulation_fail; 2290 if (kvm_mmu_unprotect_page_virt(vcpu, cr2)) 2291 return EMULATE_DONE; 2292 return EMULATE_FAIL; 2293 } 2294 } 2295 2296 r = x86_emulate_insn(&vcpu->arch.emulate_ctxt, &emulate_ops); 2297 2298 if (vcpu->arch.pio.string) 2299 return EMULATE_DO_MMIO; 2300 2301 if ((r || vcpu->mmio_is_write) && run) { 2302 run->exit_reason = KVM_EXIT_MMIO; 2303 run->mmio.phys_addr = vcpu->mmio_phys_addr; 2304 memcpy(run->mmio.data, vcpu->mmio_data, 8); 2305 run->mmio.len = vcpu->mmio_size; 2306 run->mmio.is_write = vcpu->mmio_is_write; 2307 } 2308 2309 if (r) { 2310 if (kvm_mmu_unprotect_page_virt(vcpu, cr2)) 2311 return EMULATE_DONE; 2312 if (!vcpu->mmio_needed) { 2313 kvm_report_emulation_failure(vcpu, "mmio"); 2314 return EMULATE_FAIL; 2315 } 2316 return EMULATE_DO_MMIO; 2317 } 2318 2319 kvm_x86_ops->set_rflags(vcpu, vcpu->arch.emulate_ctxt.eflags); 2320 2321 if (vcpu->mmio_is_write) { 2322 vcpu->mmio_needed = 0; 2323 return EMULATE_DO_MMIO; 2324 } 2325 2326 return EMULATE_DONE; 2327 } 2328 EXPORT_SYMBOL_GPL(emulate_instruction); 2329 2330 static void free_pio_guest_pages(struct kvm_vcpu *vcpu) 2331 { 2332 int i; 2333 2334 for (i = 0; i < ARRAY_SIZE(vcpu->arch.pio.guest_pages); ++i) 2335 if (vcpu->arch.pio.guest_pages[i]) { 2336 kvm_release_page_dirty(vcpu->arch.pio.guest_pages[i]); 2337 vcpu->arch.pio.guest_pages[i] = NULL; 2338 } 2339 } 2340 2341 static int pio_copy_data(struct kvm_vcpu *vcpu) 2342 { 2343 void *p = vcpu->arch.pio_data; 2344 void *q; 2345 unsigned bytes; 2346 int nr_pages = vcpu->arch.pio.guest_pages[1] ? 2 : 1; 2347 2348 q = vmap(vcpu->arch.pio.guest_pages, nr_pages, VM_READ|VM_WRITE, 2349 PAGE_KERNEL); 2350 if (!q) { 2351 free_pio_guest_pages(vcpu); 2352 return -ENOMEM; 2353 } 2354 q += vcpu->arch.pio.guest_page_offset; 2355 bytes = vcpu->arch.pio.size * vcpu->arch.pio.cur_count; 2356 if (vcpu->arch.pio.in) 2357 memcpy(q, p, bytes); 2358 else 2359 memcpy(p, q, bytes); 2360 q -= vcpu->arch.pio.guest_page_offset; 2361 vunmap(q); 2362 free_pio_guest_pages(vcpu); 2363 return 0; 2364 } 2365 2366 int complete_pio(struct kvm_vcpu *vcpu) 2367 { 2368 struct kvm_pio_request *io = &vcpu->arch.pio; 2369 long delta; 2370 int r; 2371 unsigned long val; 2372 2373 if (!io->string) { 2374 if (io->in) { 2375 val = kvm_register_read(vcpu, VCPU_REGS_RAX); 2376 memcpy(&val, vcpu->arch.pio_data, io->size); 2377 kvm_register_write(vcpu, VCPU_REGS_RAX, val); 2378 } 2379 } else { 2380 if (io->in) { 2381 r = pio_copy_data(vcpu); 2382 if (r) 2383 return r; 2384 } 2385 2386 delta = 1; 2387 if (io->rep) { 2388 delta *= io->cur_count; 2389 /* 2390 * The size of the register should really depend on 2391 * current address size. 
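			 * For example, a rep ins/outs executed with a 16-bit
			 * address size should only update CX, while the code
			 * below always reads and writes the full RCX.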
2392 */ 2393 val = kvm_register_read(vcpu, VCPU_REGS_RCX); 2394 val -= delta; 2395 kvm_register_write(vcpu, VCPU_REGS_RCX, val); 2396 } 2397 if (io->down) 2398 delta = -delta; 2399 delta *= io->size; 2400 if (io->in) { 2401 val = kvm_register_read(vcpu, VCPU_REGS_RDI); 2402 val += delta; 2403 kvm_register_write(vcpu, VCPU_REGS_RDI, val); 2404 } else { 2405 val = kvm_register_read(vcpu, VCPU_REGS_RSI); 2406 val += delta; 2407 kvm_register_write(vcpu, VCPU_REGS_RSI, val); 2408 } 2409 } 2410 2411 io->count -= io->cur_count; 2412 io->cur_count = 0; 2413 2414 return 0; 2415 } 2416 2417 static void kernel_pio(struct kvm_io_device *pio_dev, 2418 struct kvm_vcpu *vcpu, 2419 void *pd) 2420 { 2421 /* TODO: String I/O for in kernel device */ 2422 2423 mutex_lock(&vcpu->kvm->lock); 2424 if (vcpu->arch.pio.in) 2425 kvm_iodevice_read(pio_dev, vcpu->arch.pio.port, 2426 vcpu->arch.pio.size, 2427 pd); 2428 else 2429 kvm_iodevice_write(pio_dev, vcpu->arch.pio.port, 2430 vcpu->arch.pio.size, 2431 pd); 2432 mutex_unlock(&vcpu->kvm->lock); 2433 } 2434 2435 static void pio_string_write(struct kvm_io_device *pio_dev, 2436 struct kvm_vcpu *vcpu) 2437 { 2438 struct kvm_pio_request *io = &vcpu->arch.pio; 2439 void *pd = vcpu->arch.pio_data; 2440 int i; 2441 2442 mutex_lock(&vcpu->kvm->lock); 2443 for (i = 0; i < io->cur_count; i++) { 2444 kvm_iodevice_write(pio_dev, io->port, 2445 io->size, 2446 pd); 2447 pd += io->size; 2448 } 2449 mutex_unlock(&vcpu->kvm->lock); 2450 } 2451 2452 static struct kvm_io_device *vcpu_find_pio_dev(struct kvm_vcpu *vcpu, 2453 gpa_t addr, int len, 2454 int is_write) 2455 { 2456 return kvm_io_bus_find_dev(&vcpu->kvm->pio_bus, addr, len, is_write); 2457 } 2458 2459 int kvm_emulate_pio(struct kvm_vcpu *vcpu, struct kvm_run *run, int in, 2460 int size, unsigned port) 2461 { 2462 struct kvm_io_device *pio_dev; 2463 unsigned long val; 2464 2465 vcpu->run->exit_reason = KVM_EXIT_IO; 2466 vcpu->run->io.direction = in ? KVM_EXIT_IO_IN : KVM_EXIT_IO_OUT; 2467 vcpu->run->io.size = vcpu->arch.pio.size = size; 2468 vcpu->run->io.data_offset = KVM_PIO_PAGE_OFFSET * PAGE_SIZE; 2469 vcpu->run->io.count = vcpu->arch.pio.count = vcpu->arch.pio.cur_count = 1; 2470 vcpu->run->io.port = vcpu->arch.pio.port = port; 2471 vcpu->arch.pio.in = in; 2472 vcpu->arch.pio.string = 0; 2473 vcpu->arch.pio.down = 0; 2474 vcpu->arch.pio.guest_page_offset = 0; 2475 vcpu->arch.pio.rep = 0; 2476 2477 if (vcpu->run->io.direction == KVM_EXIT_IO_IN) 2478 KVMTRACE_2D(IO_READ, vcpu, vcpu->run->io.port, (u32)size, 2479 handler); 2480 else 2481 KVMTRACE_2D(IO_WRITE, vcpu, vcpu->run->io.port, (u32)size, 2482 handler); 2483 2484 val = kvm_register_read(vcpu, VCPU_REGS_RAX); 2485 memcpy(vcpu->arch.pio_data, &val, 4); 2486 2487 pio_dev = vcpu_find_pio_dev(vcpu, port, size, !in); 2488 if (pio_dev) { 2489 kernel_pio(pio_dev, vcpu, vcpu->arch.pio_data); 2490 complete_pio(vcpu); 2491 return 1; 2492 } 2493 return 0; 2494 } 2495 EXPORT_SYMBOL_GPL(kvm_emulate_pio); 2496 2497 int kvm_emulate_pio_string(struct kvm_vcpu *vcpu, struct kvm_run *run, int in, 2498 int size, unsigned long count, int down, 2499 gva_t address, int rep, unsigned port) 2500 { 2501 unsigned now, in_page; 2502 int i, ret = 0; 2503 int nr_pages = 1; 2504 struct page *page; 2505 struct kvm_io_device *pio_dev; 2506 2507 vcpu->run->exit_reason = KVM_EXIT_IO; 2508 vcpu->run->io.direction = in ? 
KVM_EXIT_IO_IN : KVM_EXIT_IO_OUT; 2509 vcpu->run->io.size = vcpu->arch.pio.size = size; 2510 vcpu->run->io.data_offset = KVM_PIO_PAGE_OFFSET * PAGE_SIZE; 2511 vcpu->run->io.count = vcpu->arch.pio.count = vcpu->arch.pio.cur_count = count; 2512 vcpu->run->io.port = vcpu->arch.pio.port = port; 2513 vcpu->arch.pio.in = in; 2514 vcpu->arch.pio.string = 1; 2515 vcpu->arch.pio.down = down; 2516 vcpu->arch.pio.guest_page_offset = offset_in_page(address); 2517 vcpu->arch.pio.rep = rep; 2518 2519 if (vcpu->run->io.direction == KVM_EXIT_IO_IN) 2520 KVMTRACE_2D(IO_READ, vcpu, vcpu->run->io.port, (u32)size, 2521 handler); 2522 else 2523 KVMTRACE_2D(IO_WRITE, vcpu, vcpu->run->io.port, (u32)size, 2524 handler); 2525 2526 if (!count) { 2527 kvm_x86_ops->skip_emulated_instruction(vcpu); 2528 return 1; 2529 } 2530 2531 if (!down) 2532 in_page = PAGE_SIZE - offset_in_page(address); 2533 else 2534 in_page = offset_in_page(address) + size; 2535 now = min(count, (unsigned long)in_page / size); 2536 if (!now) { 2537 /* 2538 * String I/O straddles page boundary. Pin two guest pages 2539 * so that we satisfy atomicity constraints. Do just one 2540 * transaction to avoid complexity. 2541 */ 2542 nr_pages = 2; 2543 now = 1; 2544 } 2545 if (down) { 2546 /* 2547 * String I/O in reverse. Yuck. Kill the guest, fix later. 2548 */ 2549 pr_unimpl(vcpu, "guest string pio down\n"); 2550 kvm_inject_gp(vcpu, 0); 2551 return 1; 2552 } 2553 vcpu->run->io.count = now; 2554 vcpu->arch.pio.cur_count = now; 2555 2556 if (vcpu->arch.pio.cur_count == vcpu->arch.pio.count) 2557 kvm_x86_ops->skip_emulated_instruction(vcpu); 2558 2559 for (i = 0; i < nr_pages; ++i) { 2560 page = gva_to_page(vcpu, address + i * PAGE_SIZE); 2561 vcpu->arch.pio.guest_pages[i] = page; 2562 if (!page) { 2563 kvm_inject_gp(vcpu, 0); 2564 free_pio_guest_pages(vcpu); 2565 return 1; 2566 } 2567 } 2568 2569 pio_dev = vcpu_find_pio_dev(vcpu, port, 2570 vcpu->arch.pio.cur_count, 2571 !vcpu->arch.pio.in); 2572 if (!vcpu->arch.pio.in) { 2573 /* string PIO write */ 2574 ret = pio_copy_data(vcpu); 2575 if (ret >= 0 && pio_dev) { 2576 pio_string_write(pio_dev, vcpu); 2577 complete_pio(vcpu); 2578 if (vcpu->arch.pio.count == 0) 2579 ret = 1; 2580 } 2581 } else if (pio_dev) 2582 pr_unimpl(vcpu, "no string pio read support yet, " 2583 "port %x size %d count %ld\n", 2584 port, size, count); 2585 2586 return ret; 2587 } 2588 EXPORT_SYMBOL_GPL(kvm_emulate_pio_string); 2589 2590 int kvm_arch_init(void *opaque) 2591 { 2592 int r; 2593 struct kvm_x86_ops *ops = (struct kvm_x86_ops *)opaque; 2594 2595 if (kvm_x86_ops) { 2596 printk(KERN_ERR "kvm: already loaded the other module\n"); 2597 r = -EEXIST; 2598 goto out; 2599 } 2600 2601 if (!ops->cpu_has_kvm_support()) { 2602 printk(KERN_ERR "kvm: no hardware support\n"); 2603 r = -EOPNOTSUPP; 2604 goto out; 2605 } 2606 if (ops->disabled_by_bios()) { 2607 printk(KERN_ERR "kvm: disabled by bios\n"); 2608 r = -EOPNOTSUPP; 2609 goto out; 2610 } 2611 2612 r = kvm_mmu_module_init(); 2613 if (r) 2614 goto out; 2615 2616 kvm_init_msr_list(); 2617 2618 kvm_x86_ops = ops; 2619 kvm_mmu_set_nonpresent_ptes(0ull, 0ull); 2620 kvm_mmu_set_base_ptes(PT_PRESENT_MASK); 2621 kvm_mmu_set_mask_ptes(PT_USER_MASK, PT_ACCESSED_MASK, 2622 PT_DIRTY_MASK, PT64_NX_MASK, 0, 0); 2623 return 0; 2624 2625 out: 2626 return r; 2627 } 2628 2629 void kvm_arch_exit(void) 2630 { 2631 kvm_x86_ops = NULL; 2632 kvm_mmu_module_exit(); 2633 } 2634 2635 int kvm_emulate_halt(struct kvm_vcpu *vcpu) 2636 { 2637 ++vcpu->stat.halt_exits; 2638 KVMTRACE_0D(HLT, vcpu, handler); 2639 if 
(irqchip_in_kernel(vcpu->kvm)) { 2640 vcpu->arch.mp_state = KVM_MP_STATE_HALTED; 2641 return 1; 2642 } else { 2643 vcpu->run->exit_reason = KVM_EXIT_HLT; 2644 return 0; 2645 } 2646 } 2647 EXPORT_SYMBOL_GPL(kvm_emulate_halt); 2648 2649 static inline gpa_t hc_gpa(struct kvm_vcpu *vcpu, unsigned long a0, 2650 unsigned long a1) 2651 { 2652 if (is_long_mode(vcpu)) 2653 return a0; 2654 else 2655 return a0 | ((gpa_t)a1 << 32); 2656 } 2657 2658 int kvm_emulate_hypercall(struct kvm_vcpu *vcpu) 2659 { 2660 unsigned long nr, a0, a1, a2, a3, ret; 2661 int r = 1; 2662 2663 nr = kvm_register_read(vcpu, VCPU_REGS_RAX); 2664 a0 = kvm_register_read(vcpu, VCPU_REGS_RBX); 2665 a1 = kvm_register_read(vcpu, VCPU_REGS_RCX); 2666 a2 = kvm_register_read(vcpu, VCPU_REGS_RDX); 2667 a3 = kvm_register_read(vcpu, VCPU_REGS_RSI); 2668 2669 KVMTRACE_1D(VMMCALL, vcpu, (u32)nr, handler); 2670 2671 if (!is_long_mode(vcpu)) { 2672 nr &= 0xFFFFFFFF; 2673 a0 &= 0xFFFFFFFF; 2674 a1 &= 0xFFFFFFFF; 2675 a2 &= 0xFFFFFFFF; 2676 a3 &= 0xFFFFFFFF; 2677 } 2678 2679 switch (nr) { 2680 case KVM_HC_VAPIC_POLL_IRQ: 2681 ret = 0; 2682 break; 2683 case KVM_HC_MMU_OP: 2684 r = kvm_pv_mmu_op(vcpu, a0, hc_gpa(vcpu, a1, a2), &ret); 2685 break; 2686 default: 2687 ret = -KVM_ENOSYS; 2688 break; 2689 } 2690 kvm_register_write(vcpu, VCPU_REGS_RAX, ret); 2691 ++vcpu->stat.hypercalls; 2692 return r; 2693 } 2694 EXPORT_SYMBOL_GPL(kvm_emulate_hypercall); 2695 2696 int kvm_fix_hypercall(struct kvm_vcpu *vcpu) 2697 { 2698 char instruction[3]; 2699 int ret = 0; 2700 unsigned long rip = kvm_rip_read(vcpu); 2701 2702 2703 /* 2704 * Blow out the MMU to ensure that no other VCPU has an active mapping 2705 * to ensure that the updated hypercall appears atomically across all 2706 * VCPUs. 2707 */ 2708 kvm_mmu_zap_all(vcpu->kvm); 2709 2710 kvm_x86_ops->patch_hypercall(vcpu, instruction); 2711 if (emulator_write_emulated(rip, instruction, 3, vcpu) 2712 != X86EMUL_CONTINUE) 2713 ret = -EFAULT; 2714 2715 return ret; 2716 } 2717 2718 static u64 mk_cr_64(u64 curr_cr, u32 new_val) 2719 { 2720 return (curr_cr & ~((1ULL << 32) - 1)) | new_val; 2721 } 2722 2723 void realmode_lgdt(struct kvm_vcpu *vcpu, u16 limit, unsigned long base) 2724 { 2725 struct descriptor_table dt = { limit, base }; 2726 2727 kvm_x86_ops->set_gdt(vcpu, &dt); 2728 } 2729 2730 void realmode_lidt(struct kvm_vcpu *vcpu, u16 limit, unsigned long base) 2731 { 2732 struct descriptor_table dt = { limit, base }; 2733 2734 kvm_x86_ops->set_idt(vcpu, &dt); 2735 } 2736 2737 void realmode_lmsw(struct kvm_vcpu *vcpu, unsigned long msw, 2738 unsigned long *rflags) 2739 { 2740 kvm_lmsw(vcpu, msw); 2741 *rflags = kvm_x86_ops->get_rflags(vcpu); 2742 } 2743 2744 unsigned long realmode_get_cr(struct kvm_vcpu *vcpu, int cr) 2745 { 2746 unsigned long value; 2747 2748 kvm_x86_ops->decache_cr4_guest_bits(vcpu); 2749 switch (cr) { 2750 case 0: 2751 value = vcpu->arch.cr0; 2752 break; 2753 case 2: 2754 value = vcpu->arch.cr2; 2755 break; 2756 case 3: 2757 value = vcpu->arch.cr3; 2758 break; 2759 case 4: 2760 value = vcpu->arch.cr4; 2761 break; 2762 case 8: 2763 value = kvm_get_cr8(vcpu); 2764 break; 2765 default: 2766 vcpu_printf(vcpu, "%s: unexpected cr %u\n", __func__, cr); 2767 return 0; 2768 } 2769 KVMTRACE_3D(CR_READ, vcpu, (u32)cr, (u32)value, 2770 (u32)((u64)value >> 32), handler); 2771 2772 return value; 2773 } 2774 2775 void realmode_set_cr(struct kvm_vcpu *vcpu, int cr, unsigned long val, 2776 unsigned long *rflags) 2777 { 2778 KVMTRACE_3D(CR_WRITE, vcpu, (u32)cr, (u32)val, 2779 (u32)((u64)val >> 32), handler); 
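	/*
	 * Dispatch the write to the matching kvm_set_crN() helper.  The
	 * emulator hands us a 32-bit value, so mk_cr_64() preserves the
	 * current upper half of CR0/CR4; CR8 only carries the 4-bit TPR,
	 * hence the 0xf mask.
	 */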
2780 2781 switch (cr) { 2782 case 0: 2783 kvm_set_cr0(vcpu, mk_cr_64(vcpu->arch.cr0, val)); 2784 *rflags = kvm_x86_ops->get_rflags(vcpu); 2785 break; 2786 case 2: 2787 vcpu->arch.cr2 = val; 2788 break; 2789 case 3: 2790 kvm_set_cr3(vcpu, val); 2791 break; 2792 case 4: 2793 kvm_set_cr4(vcpu, mk_cr_64(vcpu->arch.cr4, val)); 2794 break; 2795 case 8: 2796 kvm_set_cr8(vcpu, val & 0xfUL); 2797 break; 2798 default: 2799 vcpu_printf(vcpu, "%s: unexpected cr %u\n", __func__, cr); 2800 } 2801 } 2802 2803 static int move_to_next_stateful_cpuid_entry(struct kvm_vcpu *vcpu, int i) 2804 { 2805 struct kvm_cpuid_entry2 *e = &vcpu->arch.cpuid_entries[i]; 2806 int j, nent = vcpu->arch.cpuid_nent; 2807 2808 e->flags &= ~KVM_CPUID_FLAG_STATE_READ_NEXT; 2809 /* when no next entry is found, the current entry[i] is reselected */ 2810 for (j = i + 1; ; j = (j + 1) % nent) { 2811 struct kvm_cpuid_entry2 *ej = &vcpu->arch.cpuid_entries[j]; 2812 if (ej->function == e->function) { 2813 ej->flags |= KVM_CPUID_FLAG_STATE_READ_NEXT; 2814 return j; 2815 } 2816 } 2817 return 0; /* silence gcc, even though control never reaches here */ 2818 } 2819 2820 /* find an entry with matching function, matching index (if needed), and that 2821 * should be read next (if it's stateful) */ 2822 static int is_matching_cpuid_entry(struct kvm_cpuid_entry2 *e, 2823 u32 function, u32 index) 2824 { 2825 if (e->function != function) 2826 return 0; 2827 if ((e->flags & KVM_CPUID_FLAG_SIGNIFCANT_INDEX) && e->index != index) 2828 return 0; 2829 if ((e->flags & KVM_CPUID_FLAG_STATEFUL_FUNC) && 2830 !(e->flags & KVM_CPUID_FLAG_STATE_READ_NEXT)) 2831 return 0; 2832 return 1; 2833 } 2834 2835 void kvm_emulate_cpuid(struct kvm_vcpu *vcpu) 2836 { 2837 int i; 2838 u32 function, index; 2839 struct kvm_cpuid_entry2 *e, *best; 2840 2841 function = kvm_register_read(vcpu, VCPU_REGS_RAX); 2842 index = kvm_register_read(vcpu, VCPU_REGS_RCX); 2843 kvm_register_write(vcpu, VCPU_REGS_RAX, 0); 2844 kvm_register_write(vcpu, VCPU_REGS_RBX, 0); 2845 kvm_register_write(vcpu, VCPU_REGS_RCX, 0); 2846 kvm_register_write(vcpu, VCPU_REGS_RDX, 0); 2847 best = NULL; 2848 for (i = 0; i < vcpu->arch.cpuid_nent; ++i) { 2849 e = &vcpu->arch.cpuid_entries[i]; 2850 if (is_matching_cpuid_entry(e, function, index)) { 2851 if (e->flags & KVM_CPUID_FLAG_STATEFUL_FUNC) 2852 move_to_next_stateful_cpuid_entry(vcpu, i); 2853 best = e; 2854 break; 2855 } 2856 /* 2857 * Both basic or both extended? 2858 */ 2859 if (((e->function ^ function) & 0x80000000) == 0) 2860 if (!best || e->function > best->function) 2861 best = e; 2862 } 2863 if (best) { 2864 kvm_register_write(vcpu, VCPU_REGS_RAX, best->eax); 2865 kvm_register_write(vcpu, VCPU_REGS_RBX, best->ebx); 2866 kvm_register_write(vcpu, VCPU_REGS_RCX, best->ecx); 2867 kvm_register_write(vcpu, VCPU_REGS_RDX, best->edx); 2868 } 2869 kvm_x86_ops->skip_emulated_instruction(vcpu); 2870 KVMTRACE_5D(CPUID, vcpu, function, 2871 (u32)kvm_register_read(vcpu, VCPU_REGS_RAX), 2872 (u32)kvm_register_read(vcpu, VCPU_REGS_RBX), 2873 (u32)kvm_register_read(vcpu, VCPU_REGS_RCX), 2874 (u32)kvm_register_read(vcpu, VCPU_REGS_RDX), handler); 2875 } 2876 EXPORT_SYMBOL_GPL(kvm_emulate_cpuid); 2877 2878 /* 2879 * Check if userspace requested an interrupt window, and that the 2880 * interrupt window is open. 2881 * 2882 * No need to exit to userspace if we already have an interrupt queued. 
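 *
 * All four conditions below must hold: userspace asked for an exit with
 * request_interrupt_window, no interrupt is already pending
 * (irq_summary == 0), the interrupt window is open, and the guest has
 * IF set in its flags.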
2883 */ 2884 static int dm_request_for_irq_injection(struct kvm_vcpu *vcpu, 2885 struct kvm_run *kvm_run) 2886 { 2887 return (!vcpu->arch.irq_summary && 2888 kvm_run->request_interrupt_window && 2889 vcpu->arch.interrupt_window_open && 2890 (kvm_x86_ops->get_rflags(vcpu) & X86_EFLAGS_IF)); 2891 } 2892 2893 static void post_kvm_run_save(struct kvm_vcpu *vcpu, 2894 struct kvm_run *kvm_run) 2895 { 2896 kvm_run->if_flag = (kvm_x86_ops->get_rflags(vcpu) & X86_EFLAGS_IF) != 0; 2897 kvm_run->cr8 = kvm_get_cr8(vcpu); 2898 kvm_run->apic_base = kvm_get_apic_base(vcpu); 2899 if (irqchip_in_kernel(vcpu->kvm)) 2900 kvm_run->ready_for_interrupt_injection = 1; 2901 else 2902 kvm_run->ready_for_interrupt_injection = 2903 (vcpu->arch.interrupt_window_open && 2904 vcpu->arch.irq_summary == 0); 2905 } 2906 2907 static void vapic_enter(struct kvm_vcpu *vcpu) 2908 { 2909 struct kvm_lapic *apic = vcpu->arch.apic; 2910 struct page *page; 2911 2912 if (!apic || !apic->vapic_addr) 2913 return; 2914 2915 page = gfn_to_page(vcpu->kvm, apic->vapic_addr >> PAGE_SHIFT); 2916 2917 vcpu->arch.apic->vapic_page = page; 2918 } 2919 2920 static void vapic_exit(struct kvm_vcpu *vcpu) 2921 { 2922 struct kvm_lapic *apic = vcpu->arch.apic; 2923 2924 if (!apic || !apic->vapic_addr) 2925 return; 2926 2927 down_read(&vcpu->kvm->slots_lock); 2928 kvm_release_page_dirty(apic->vapic_page); 2929 mark_page_dirty(vcpu->kvm, apic->vapic_addr >> PAGE_SHIFT); 2930 up_read(&vcpu->kvm->slots_lock); 2931 } 2932 2933 static int vcpu_enter_guest(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) 2934 { 2935 int r; 2936 2937 if (vcpu->requests) 2938 if (test_and_clear_bit(KVM_REQ_MMU_RELOAD, &vcpu->requests)) 2939 kvm_mmu_unload(vcpu); 2940 2941 r = kvm_mmu_reload(vcpu); 2942 if (unlikely(r)) 2943 goto out; 2944 2945 if (vcpu->requests) { 2946 if (test_and_clear_bit(KVM_REQ_MIGRATE_TIMER, &vcpu->requests)) 2947 __kvm_migrate_timers(vcpu); 2948 if (test_and_clear_bit(KVM_REQ_MMU_SYNC, &vcpu->requests)) 2949 kvm_mmu_sync_roots(vcpu); 2950 if (test_and_clear_bit(KVM_REQ_TLB_FLUSH, &vcpu->requests)) 2951 kvm_x86_ops->tlb_flush(vcpu); 2952 if (test_and_clear_bit(KVM_REQ_REPORT_TPR_ACCESS, 2953 &vcpu->requests)) { 2954 kvm_run->exit_reason = KVM_EXIT_TPR_ACCESS; 2955 r = 0; 2956 goto out; 2957 } 2958 if (test_and_clear_bit(KVM_REQ_TRIPLE_FAULT, &vcpu->requests)) { 2959 kvm_run->exit_reason = KVM_EXIT_SHUTDOWN; 2960 r = 0; 2961 goto out; 2962 } 2963 } 2964 2965 clear_bit(KVM_REQ_PENDING_TIMER, &vcpu->requests); 2966 kvm_inject_pending_timer_irqs(vcpu); 2967 2968 preempt_disable(); 2969 2970 kvm_x86_ops->prepare_guest_switch(vcpu); 2971 kvm_load_guest_fpu(vcpu); 2972 2973 local_irq_disable(); 2974 2975 if (vcpu->requests || need_resched() || signal_pending(current)) { 2976 local_irq_enable(); 2977 preempt_enable(); 2978 r = 1; 2979 goto out; 2980 } 2981 2982 if (vcpu->guest_debug.enabled) 2983 kvm_x86_ops->guest_debug_pre(vcpu); 2984 2985 vcpu->guest_mode = 1; 2986 /* 2987 * Make sure that guest_mode assignment won't happen after 2988 * testing the pending IRQ vector bitmap. 
2989 */ 2990 smp_wmb(); 2991 2992 if (vcpu->arch.exception.pending) 2993 __queue_exception(vcpu); 2994 else if (irqchip_in_kernel(vcpu->kvm)) 2995 kvm_x86_ops->inject_pending_irq(vcpu); 2996 else 2997 kvm_x86_ops->inject_pending_vectors(vcpu, kvm_run); 2998 2999 kvm_lapic_sync_to_vapic(vcpu); 3000 3001 up_read(&vcpu->kvm->slots_lock); 3002 3003 kvm_guest_enter(); 3004 3005 3006 KVMTRACE_0D(VMENTRY, vcpu, entryexit); 3007 kvm_x86_ops->run(vcpu, kvm_run); 3008 3009 vcpu->guest_mode = 0; 3010 local_irq_enable(); 3011 3012 ++vcpu->stat.exits; 3013 3014 /* 3015 * We must have an instruction between local_irq_enable() and 3016 * kvm_guest_exit(), so the timer interrupt isn't delayed by 3017 * the interrupt shadow. The stat.exits increment will do nicely. 3018 * But we need to prevent reordering, hence this barrier(): 3019 */ 3020 barrier(); 3021 3022 kvm_guest_exit(); 3023 3024 preempt_enable(); 3025 3026 down_read(&vcpu->kvm->slots_lock); 3027 3028 /* 3029 * Profile KVM exit RIPs: 3030 */ 3031 if (unlikely(prof_on == KVM_PROFILING)) { 3032 unsigned long rip = kvm_rip_read(vcpu); 3033 profile_hit(KVM_PROFILING, (void *)rip); 3034 } 3035 3036 if (vcpu->arch.exception.pending && kvm_x86_ops->exception_injected(vcpu)) 3037 vcpu->arch.exception.pending = false; 3038 3039 kvm_lapic_sync_from_vapic(vcpu); 3040 3041 r = kvm_x86_ops->handle_exit(kvm_run, vcpu); 3042 out: 3043 return r; 3044 } 3045 3046 static int __vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) 3047 { 3048 int r; 3049 3050 if (unlikely(vcpu->arch.mp_state == KVM_MP_STATE_SIPI_RECEIVED)) { 3051 pr_debug("vcpu %d received sipi with vector # %x\n", 3052 vcpu->vcpu_id, vcpu->arch.sipi_vector); 3053 kvm_lapic_reset(vcpu); 3054 r = kvm_arch_vcpu_reset(vcpu); 3055 if (r) 3056 return r; 3057 vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; 3058 } 3059 3060 down_read(&vcpu->kvm->slots_lock); 3061 vapic_enter(vcpu); 3062 3063 r = 1; 3064 while (r > 0) { 3065 if (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE) 3066 r = vcpu_enter_guest(vcpu, kvm_run); 3067 else { 3068 up_read(&vcpu->kvm->slots_lock); 3069 kvm_vcpu_block(vcpu); 3070 down_read(&vcpu->kvm->slots_lock); 3071 if (test_and_clear_bit(KVM_REQ_UNHALT, &vcpu->requests)) 3072 if (vcpu->arch.mp_state == KVM_MP_STATE_HALTED) 3073 vcpu->arch.mp_state = 3074 KVM_MP_STATE_RUNNABLE; 3075 if (vcpu->arch.mp_state != KVM_MP_STATE_RUNNABLE) 3076 r = -EINTR; 3077 } 3078 3079 if (r > 0) { 3080 if (dm_request_for_irq_injection(vcpu, kvm_run)) { 3081 r = -EINTR; 3082 kvm_run->exit_reason = KVM_EXIT_INTR; 3083 ++vcpu->stat.request_irq_exits; 3084 } 3085 if (signal_pending(current)) { 3086 r = -EINTR; 3087 kvm_run->exit_reason = KVM_EXIT_INTR; 3088 ++vcpu->stat.signal_exits; 3089 } 3090 if (need_resched()) { 3091 up_read(&vcpu->kvm->slots_lock); 3092 kvm_resched(vcpu); 3093 down_read(&vcpu->kvm->slots_lock); 3094 } 3095 } 3096 } 3097 3098 up_read(&vcpu->kvm->slots_lock); 3099 post_kvm_run_save(vcpu, kvm_run); 3100 3101 vapic_exit(vcpu); 3102 3103 return r; 3104 } 3105 3106 int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) 3107 { 3108 int r; 3109 sigset_t sigsaved; 3110 3111 vcpu_load(vcpu); 3112 3113 if (vcpu->sigset_active) 3114 sigprocmask(SIG_SETMASK, &vcpu->sigset, &sigsaved); 3115 3116 if (unlikely(vcpu->arch.mp_state == KVM_MP_STATE_UNINITIALIZED)) { 3117 kvm_vcpu_block(vcpu); 3118 clear_bit(KVM_REQ_UNHALT, &vcpu->requests); 3119 r = -EAGAIN; 3120 goto out; 3121 } 3122 3123 /* re-sync apic's tpr */ 3124 if (!irqchip_in_kernel(vcpu->kvm)) 3125 kvm_set_cr8(vcpu, 
kvm_run->cr8); 3126 3127 if (vcpu->arch.pio.cur_count) { 3128 r = complete_pio(vcpu); 3129 if (r) 3130 goto out; 3131 } 3132 #if CONFIG_HAS_IOMEM 3133 if (vcpu->mmio_needed) { 3134 memcpy(vcpu->mmio_data, kvm_run->mmio.data, 8); 3135 vcpu->mmio_read_completed = 1; 3136 vcpu->mmio_needed = 0; 3137 3138 down_read(&vcpu->kvm->slots_lock); 3139 r = emulate_instruction(vcpu, kvm_run, 3140 vcpu->arch.mmio_fault_cr2, 0, 3141 EMULTYPE_NO_DECODE); 3142 up_read(&vcpu->kvm->slots_lock); 3143 if (r == EMULATE_DO_MMIO) { 3144 /* 3145 * Read-modify-write. Back to userspace. 3146 */ 3147 r = 0; 3148 goto out; 3149 } 3150 } 3151 #endif 3152 if (kvm_run->exit_reason == KVM_EXIT_HYPERCALL) 3153 kvm_register_write(vcpu, VCPU_REGS_RAX, 3154 kvm_run->hypercall.ret); 3155 3156 r = __vcpu_run(vcpu, kvm_run); 3157 3158 out: 3159 if (vcpu->sigset_active) 3160 sigprocmask(SIG_SETMASK, &sigsaved, NULL); 3161 3162 vcpu_put(vcpu); 3163 return r; 3164 } 3165 3166 int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) 3167 { 3168 vcpu_load(vcpu); 3169 3170 regs->rax = kvm_register_read(vcpu, VCPU_REGS_RAX); 3171 regs->rbx = kvm_register_read(vcpu, VCPU_REGS_RBX); 3172 regs->rcx = kvm_register_read(vcpu, VCPU_REGS_RCX); 3173 regs->rdx = kvm_register_read(vcpu, VCPU_REGS_RDX); 3174 regs->rsi = kvm_register_read(vcpu, VCPU_REGS_RSI); 3175 regs->rdi = kvm_register_read(vcpu, VCPU_REGS_RDI); 3176 regs->rsp = kvm_register_read(vcpu, VCPU_REGS_RSP); 3177 regs->rbp = kvm_register_read(vcpu, VCPU_REGS_RBP); 3178 #ifdef CONFIG_X86_64 3179 regs->r8 = kvm_register_read(vcpu, VCPU_REGS_R8); 3180 regs->r9 = kvm_register_read(vcpu, VCPU_REGS_R9); 3181 regs->r10 = kvm_register_read(vcpu, VCPU_REGS_R10); 3182 regs->r11 = kvm_register_read(vcpu, VCPU_REGS_R11); 3183 regs->r12 = kvm_register_read(vcpu, VCPU_REGS_R12); 3184 regs->r13 = kvm_register_read(vcpu, VCPU_REGS_R13); 3185 regs->r14 = kvm_register_read(vcpu, VCPU_REGS_R14); 3186 regs->r15 = kvm_register_read(vcpu, VCPU_REGS_R15); 3187 #endif 3188 3189 regs->rip = kvm_rip_read(vcpu); 3190 regs->rflags = kvm_x86_ops->get_rflags(vcpu); 3191 3192 /* 3193 * Don't leak debug flags in case they were set for guest debugging 3194 */ 3195 if (vcpu->guest_debug.enabled && vcpu->guest_debug.singlestep) 3196 regs->rflags &= ~(X86_EFLAGS_TF | X86_EFLAGS_RF); 3197 3198 vcpu_put(vcpu); 3199 3200 return 0; 3201 } 3202 3203 int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) 3204 { 3205 vcpu_load(vcpu); 3206 3207 kvm_register_write(vcpu, VCPU_REGS_RAX, regs->rax); 3208 kvm_register_write(vcpu, VCPU_REGS_RBX, regs->rbx); 3209 kvm_register_write(vcpu, VCPU_REGS_RCX, regs->rcx); 3210 kvm_register_write(vcpu, VCPU_REGS_RDX, regs->rdx); 3211 kvm_register_write(vcpu, VCPU_REGS_RSI, regs->rsi); 3212 kvm_register_write(vcpu, VCPU_REGS_RDI, regs->rdi); 3213 kvm_register_write(vcpu, VCPU_REGS_RSP, regs->rsp); 3214 kvm_register_write(vcpu, VCPU_REGS_RBP, regs->rbp); 3215 #ifdef CONFIG_X86_64 3216 kvm_register_write(vcpu, VCPU_REGS_R8, regs->r8); 3217 kvm_register_write(vcpu, VCPU_REGS_R9, regs->r9); 3218 kvm_register_write(vcpu, VCPU_REGS_R10, regs->r10); 3219 kvm_register_write(vcpu, VCPU_REGS_R11, regs->r11); 3220 kvm_register_write(vcpu, VCPU_REGS_R12, regs->r12); 3221 kvm_register_write(vcpu, VCPU_REGS_R13, regs->r13); 3222 kvm_register_write(vcpu, VCPU_REGS_R14, regs->r14); 3223 kvm_register_write(vcpu, VCPU_REGS_R15, regs->r15); 3224 3225 #endif 3226 3227 kvm_rip_write(vcpu, regs->rip); 3228 kvm_x86_ops->set_rflags(vcpu, regs->rflags); 3229 3230 3231 
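	/*
	 * Userspace has replaced the register state wholesale, so drop any
	 * exception that was queued against the old state instead of
	 * injecting it on the next entry.
	 */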
vcpu->arch.exception.pending = false; 3232 3233 vcpu_put(vcpu); 3234 3235 return 0; 3236 } 3237 3238 void kvm_get_segment(struct kvm_vcpu *vcpu, 3239 struct kvm_segment *var, int seg) 3240 { 3241 kvm_x86_ops->get_segment(vcpu, var, seg); 3242 } 3243 3244 void kvm_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l) 3245 { 3246 struct kvm_segment cs; 3247 3248 kvm_get_segment(vcpu, &cs, VCPU_SREG_CS); 3249 *db = cs.db; 3250 *l = cs.l; 3251 } 3252 EXPORT_SYMBOL_GPL(kvm_get_cs_db_l_bits); 3253 3254 int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu, 3255 struct kvm_sregs *sregs) 3256 { 3257 struct descriptor_table dt; 3258 int pending_vec; 3259 3260 vcpu_load(vcpu); 3261 3262 kvm_get_segment(vcpu, &sregs->cs, VCPU_SREG_CS); 3263 kvm_get_segment(vcpu, &sregs->ds, VCPU_SREG_DS); 3264 kvm_get_segment(vcpu, &sregs->es, VCPU_SREG_ES); 3265 kvm_get_segment(vcpu, &sregs->fs, VCPU_SREG_FS); 3266 kvm_get_segment(vcpu, &sregs->gs, VCPU_SREG_GS); 3267 kvm_get_segment(vcpu, &sregs->ss, VCPU_SREG_SS); 3268 3269 kvm_get_segment(vcpu, &sregs->tr, VCPU_SREG_TR); 3270 kvm_get_segment(vcpu, &sregs->ldt, VCPU_SREG_LDTR); 3271 3272 kvm_x86_ops->get_idt(vcpu, &dt); 3273 sregs->idt.limit = dt.limit; 3274 sregs->idt.base = dt.base; 3275 kvm_x86_ops->get_gdt(vcpu, &dt); 3276 sregs->gdt.limit = dt.limit; 3277 sregs->gdt.base = dt.base; 3278 3279 kvm_x86_ops->decache_cr4_guest_bits(vcpu); 3280 sregs->cr0 = vcpu->arch.cr0; 3281 sregs->cr2 = vcpu->arch.cr2; 3282 sregs->cr3 = vcpu->arch.cr3; 3283 sregs->cr4 = vcpu->arch.cr4; 3284 sregs->cr8 = kvm_get_cr8(vcpu); 3285 sregs->efer = vcpu->arch.shadow_efer; 3286 sregs->apic_base = kvm_get_apic_base(vcpu); 3287 3288 if (irqchip_in_kernel(vcpu->kvm)) { 3289 memset(sregs->interrupt_bitmap, 0, 3290 sizeof sregs->interrupt_bitmap); 3291 pending_vec = kvm_x86_ops->get_irq(vcpu); 3292 if (pending_vec >= 0) 3293 set_bit(pending_vec, 3294 (unsigned long *)sregs->interrupt_bitmap); 3295 } else 3296 memcpy(sregs->interrupt_bitmap, vcpu->arch.irq_pending, 3297 sizeof sregs->interrupt_bitmap); 3298 3299 vcpu_put(vcpu); 3300 3301 return 0; 3302 } 3303 3304 int kvm_arch_vcpu_ioctl_get_mpstate(struct kvm_vcpu *vcpu, 3305 struct kvm_mp_state *mp_state) 3306 { 3307 vcpu_load(vcpu); 3308 mp_state->mp_state = vcpu->arch.mp_state; 3309 vcpu_put(vcpu); 3310 return 0; 3311 } 3312 3313 int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu, 3314 struct kvm_mp_state *mp_state) 3315 { 3316 vcpu_load(vcpu); 3317 vcpu->arch.mp_state = mp_state->mp_state; 3318 vcpu_put(vcpu); 3319 return 0; 3320 } 3321 3322 static void kvm_set_segment(struct kvm_vcpu *vcpu, 3323 struct kvm_segment *var, int seg) 3324 { 3325 kvm_x86_ops->set_segment(vcpu, var, seg); 3326 } 3327 3328 static void seg_desct_to_kvm_desct(struct desc_struct *seg_desc, u16 selector, 3329 struct kvm_segment *kvm_desct) 3330 { 3331 kvm_desct->base = seg_desc->base0; 3332 kvm_desct->base |= seg_desc->base1 << 16; 3333 kvm_desct->base |= seg_desc->base2 << 24; 3334 kvm_desct->limit = seg_desc->limit0; 3335 kvm_desct->limit |= seg_desc->limit << 16; 3336 if (seg_desc->g) { 3337 kvm_desct->limit <<= 12; 3338 kvm_desct->limit |= 0xfff; 3339 } 3340 kvm_desct->selector = selector; 3341 kvm_desct->type = seg_desc->type; 3342 kvm_desct->present = seg_desc->p; 3343 kvm_desct->dpl = seg_desc->dpl; 3344 kvm_desct->db = seg_desc->d; 3345 kvm_desct->s = seg_desc->s; 3346 kvm_desct->l = seg_desc->l; 3347 kvm_desct->g = seg_desc->g; 3348 kvm_desct->avl = seg_desc->avl; 3349 if (!selector) 3350 kvm_desct->unusable = 1; 3351 else 3352 
kvm_desct->unusable = 0; 3353 kvm_desct->padding = 0; 3354 } 3355 3356 static void get_segment_descriptor_dtable(struct kvm_vcpu *vcpu, 3357 u16 selector, 3358 struct descriptor_table *dtable) 3359 { 3360 if (selector & 1 << 2) { 3361 struct kvm_segment kvm_seg; 3362 3363 kvm_get_segment(vcpu, &kvm_seg, VCPU_SREG_LDTR); 3364 3365 if (kvm_seg.unusable) 3366 dtable->limit = 0; 3367 else 3368 dtable->limit = kvm_seg.limit; 3369 dtable->base = kvm_seg.base; 3370 } 3371 else 3372 kvm_x86_ops->get_gdt(vcpu, dtable); 3373 } 3374 3375 /* allowed just for 8 bytes segments */ 3376 static int load_guest_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector, 3377 struct desc_struct *seg_desc) 3378 { 3379 gpa_t gpa; 3380 struct descriptor_table dtable; 3381 u16 index = selector >> 3; 3382 3383 get_segment_descriptor_dtable(vcpu, selector, &dtable); 3384 3385 if (dtable.limit < index * 8 + 7) { 3386 kvm_queue_exception_e(vcpu, GP_VECTOR, selector & 0xfffc); 3387 return 1; 3388 } 3389 gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, dtable.base); 3390 gpa += index * 8; 3391 return kvm_read_guest(vcpu->kvm, gpa, seg_desc, 8); 3392 } 3393 3394 /* allowed just for 8 bytes segments */ 3395 static int save_guest_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector, 3396 struct desc_struct *seg_desc) 3397 { 3398 gpa_t gpa; 3399 struct descriptor_table dtable; 3400 u16 index = selector >> 3; 3401 3402 get_segment_descriptor_dtable(vcpu, selector, &dtable); 3403 3404 if (dtable.limit < index * 8 + 7) 3405 return 1; 3406 gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, dtable.base); 3407 gpa += index * 8; 3408 return kvm_write_guest(vcpu->kvm, gpa, seg_desc, 8); 3409 } 3410 3411 static u32 get_tss_base_addr(struct kvm_vcpu *vcpu, 3412 struct desc_struct *seg_desc) 3413 { 3414 u32 base_addr; 3415 3416 base_addr = seg_desc->base0; 3417 base_addr |= (seg_desc->base1 << 16); 3418 base_addr |= (seg_desc->base2 << 24); 3419 3420 return vcpu->arch.mmu.gva_to_gpa(vcpu, base_addr); 3421 } 3422 3423 static u16 get_segment_selector(struct kvm_vcpu *vcpu, int seg) 3424 { 3425 struct kvm_segment kvm_seg; 3426 3427 kvm_get_segment(vcpu, &kvm_seg, seg); 3428 return kvm_seg.selector; 3429 } 3430 3431 static int load_segment_descriptor_to_kvm_desct(struct kvm_vcpu *vcpu, 3432 u16 selector, 3433 struct kvm_segment *kvm_seg) 3434 { 3435 struct desc_struct seg_desc; 3436 3437 if (load_guest_segment_descriptor(vcpu, selector, &seg_desc)) 3438 return 1; 3439 seg_desct_to_kvm_desct(&seg_desc, selector, kvm_seg); 3440 return 0; 3441 } 3442 3443 static int kvm_load_realmode_segment(struct kvm_vcpu *vcpu, u16 selector, int seg) 3444 { 3445 struct kvm_segment segvar = { 3446 .base = selector << 4, 3447 .limit = 0xffff, 3448 .selector = selector, 3449 .type = 3, 3450 .present = 1, 3451 .dpl = 3, 3452 .db = 0, 3453 .s = 1, 3454 .l = 0, 3455 .g = 0, 3456 .avl = 0, 3457 .unusable = 0, 3458 }; 3459 kvm_x86_ops->set_segment(vcpu, &segvar, seg); 3460 return 0; 3461 } 3462 3463 int kvm_load_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector, 3464 int type_bits, int seg) 3465 { 3466 struct kvm_segment kvm_seg; 3467 3468 if (!(vcpu->arch.cr0 & X86_CR0_PE)) 3469 return kvm_load_realmode_segment(vcpu, selector, seg); 3470 if (load_segment_descriptor_to_kvm_desct(vcpu, selector, &kvm_seg)) 3471 return 1; 3472 kvm_seg.type |= type_bits; 3473 3474 if (seg != VCPU_SREG_SS && seg != VCPU_SREG_CS && 3475 seg != VCPU_SREG_LDTR) 3476 if (!kvm_seg.s) 3477 kvm_seg.unusable = 1; 3478 3479 kvm_set_segment(vcpu, &kvm_seg, seg); 3480 return 0; 3481 } 3482 3483 static void 
save_state_to_tss32(struct kvm_vcpu *vcpu, 3484 struct tss_segment_32 *tss) 3485 { 3486 tss->cr3 = vcpu->arch.cr3; 3487 tss->eip = kvm_rip_read(vcpu); 3488 tss->eflags = kvm_x86_ops->get_rflags(vcpu); 3489 tss->eax = kvm_register_read(vcpu, VCPU_REGS_RAX); 3490 tss->ecx = kvm_register_read(vcpu, VCPU_REGS_RCX); 3491 tss->edx = kvm_register_read(vcpu, VCPU_REGS_RDX); 3492 tss->ebx = kvm_register_read(vcpu, VCPU_REGS_RBX); 3493 tss->esp = kvm_register_read(vcpu, VCPU_REGS_RSP); 3494 tss->ebp = kvm_register_read(vcpu, VCPU_REGS_RBP); 3495 tss->esi = kvm_register_read(vcpu, VCPU_REGS_RSI); 3496 tss->edi = kvm_register_read(vcpu, VCPU_REGS_RDI); 3497 tss->es = get_segment_selector(vcpu, VCPU_SREG_ES); 3498 tss->cs = get_segment_selector(vcpu, VCPU_SREG_CS); 3499 tss->ss = get_segment_selector(vcpu, VCPU_SREG_SS); 3500 tss->ds = get_segment_selector(vcpu, VCPU_SREG_DS); 3501 tss->fs = get_segment_selector(vcpu, VCPU_SREG_FS); 3502 tss->gs = get_segment_selector(vcpu, VCPU_SREG_GS); 3503 tss->ldt_selector = get_segment_selector(vcpu, VCPU_SREG_LDTR); 3504 tss->prev_task_link = get_segment_selector(vcpu, VCPU_SREG_TR); 3505 } 3506 3507 static int load_state_from_tss32(struct kvm_vcpu *vcpu, 3508 struct tss_segment_32 *tss) 3509 { 3510 kvm_set_cr3(vcpu, tss->cr3); 3511 3512 kvm_rip_write(vcpu, tss->eip); 3513 kvm_x86_ops->set_rflags(vcpu, tss->eflags | 2); 3514 3515 kvm_register_write(vcpu, VCPU_REGS_RAX, tss->eax); 3516 kvm_register_write(vcpu, VCPU_REGS_RCX, tss->ecx); 3517 kvm_register_write(vcpu, VCPU_REGS_RDX, tss->edx); 3518 kvm_register_write(vcpu, VCPU_REGS_RBX, tss->ebx); 3519 kvm_register_write(vcpu, VCPU_REGS_RSP, tss->esp); 3520 kvm_register_write(vcpu, VCPU_REGS_RBP, tss->ebp); 3521 kvm_register_write(vcpu, VCPU_REGS_RSI, tss->esi); 3522 kvm_register_write(vcpu, VCPU_REGS_RDI, tss->edi); 3523 3524 if (kvm_load_segment_descriptor(vcpu, tss->ldt_selector, 0, VCPU_SREG_LDTR)) 3525 return 1; 3526 3527 if (kvm_load_segment_descriptor(vcpu, tss->es, 1, VCPU_SREG_ES)) 3528 return 1; 3529 3530 if (kvm_load_segment_descriptor(vcpu, tss->cs, 9, VCPU_SREG_CS)) 3531 return 1; 3532 3533 if (kvm_load_segment_descriptor(vcpu, tss->ss, 1, VCPU_SREG_SS)) 3534 return 1; 3535 3536 if (kvm_load_segment_descriptor(vcpu, tss->ds, 1, VCPU_SREG_DS)) 3537 return 1; 3538 3539 if (kvm_load_segment_descriptor(vcpu, tss->fs, 1, VCPU_SREG_FS)) 3540 return 1; 3541 3542 if (kvm_load_segment_descriptor(vcpu, tss->gs, 1, VCPU_SREG_GS)) 3543 return 1; 3544 return 0; 3545 } 3546 3547 static void save_state_to_tss16(struct kvm_vcpu *vcpu, 3548 struct tss_segment_16 *tss) 3549 { 3550 tss->ip = kvm_rip_read(vcpu); 3551 tss->flag = kvm_x86_ops->get_rflags(vcpu); 3552 tss->ax = kvm_register_read(vcpu, VCPU_REGS_RAX); 3553 tss->cx = kvm_register_read(vcpu, VCPU_REGS_RCX); 3554 tss->dx = kvm_register_read(vcpu, VCPU_REGS_RDX); 3555 tss->bx = kvm_register_read(vcpu, VCPU_REGS_RBX); 3556 tss->sp = kvm_register_read(vcpu, VCPU_REGS_RSP); 3557 tss->bp = kvm_register_read(vcpu, VCPU_REGS_RBP); 3558 tss->si = kvm_register_read(vcpu, VCPU_REGS_RSI); 3559 tss->di = kvm_register_read(vcpu, VCPU_REGS_RDI); 3560 3561 tss->es = get_segment_selector(vcpu, VCPU_SREG_ES); 3562 tss->cs = get_segment_selector(vcpu, VCPU_SREG_CS); 3563 tss->ss = get_segment_selector(vcpu, VCPU_SREG_SS); 3564 tss->ds = get_segment_selector(vcpu, VCPU_SREG_DS); 3565 tss->ldt = get_segment_selector(vcpu, VCPU_SREG_LDTR); 3566 tss->prev_task_link = get_segment_selector(vcpu, VCPU_SREG_TR); 3567 } 3568 3569 static int load_state_from_tss16(struct kvm_vcpu *vcpu, 3570 
struct tss_segment_16 *tss) 3571 { 3572 kvm_rip_write(vcpu, tss->ip); 3573 kvm_x86_ops->set_rflags(vcpu, tss->flag | 2); 3574 kvm_register_write(vcpu, VCPU_REGS_RAX, tss->ax); 3575 kvm_register_write(vcpu, VCPU_REGS_RCX, tss->cx); 3576 kvm_register_write(vcpu, VCPU_REGS_RDX, tss->dx); 3577 kvm_register_write(vcpu, VCPU_REGS_RBX, tss->bx); 3578 kvm_register_write(vcpu, VCPU_REGS_RSP, tss->sp); 3579 kvm_register_write(vcpu, VCPU_REGS_RBP, tss->bp); 3580 kvm_register_write(vcpu, VCPU_REGS_RSI, tss->si); 3581 kvm_register_write(vcpu, VCPU_REGS_RDI, tss->di); 3582 3583 if (kvm_load_segment_descriptor(vcpu, tss->ldt, 0, VCPU_SREG_LDTR)) 3584 return 1; 3585 3586 if (kvm_load_segment_descriptor(vcpu, tss->es, 1, VCPU_SREG_ES)) 3587 return 1; 3588 3589 if (kvm_load_segment_descriptor(vcpu, tss->cs, 9, VCPU_SREG_CS)) 3590 return 1; 3591 3592 if (kvm_load_segment_descriptor(vcpu, tss->ss, 1, VCPU_SREG_SS)) 3593 return 1; 3594 3595 if (kvm_load_segment_descriptor(vcpu, tss->ds, 1, VCPU_SREG_DS)) 3596 return 1; 3597 return 0; 3598 } 3599 3600 static int kvm_task_switch_16(struct kvm_vcpu *vcpu, u16 tss_selector, 3601 u32 old_tss_base, 3602 struct desc_struct *nseg_desc) 3603 { 3604 struct tss_segment_16 tss_segment_16; 3605 int ret = 0; 3606 3607 if (kvm_read_guest(vcpu->kvm, old_tss_base, &tss_segment_16, 3608 sizeof tss_segment_16)) 3609 goto out; 3610 3611 save_state_to_tss16(vcpu, &tss_segment_16); 3612 3613 if (kvm_write_guest(vcpu->kvm, old_tss_base, &tss_segment_16, 3614 sizeof tss_segment_16)) 3615 goto out; 3616 3617 if (kvm_read_guest(vcpu->kvm, get_tss_base_addr(vcpu, nseg_desc), 3618 &tss_segment_16, sizeof tss_segment_16)) 3619 goto out; 3620 3621 if (load_state_from_tss16(vcpu, &tss_segment_16)) 3622 goto out; 3623 3624 ret = 1; 3625 out: 3626 return ret; 3627 } 3628 3629 static int kvm_task_switch_32(struct kvm_vcpu *vcpu, u16 tss_selector, 3630 u32 old_tss_base, 3631 struct desc_struct *nseg_desc) 3632 { 3633 struct tss_segment_32 tss_segment_32; 3634 int ret = 0; 3635 3636 if (kvm_read_guest(vcpu->kvm, old_tss_base, &tss_segment_32, 3637 sizeof tss_segment_32)) 3638 goto out; 3639 3640 save_state_to_tss32(vcpu, &tss_segment_32); 3641 3642 if (kvm_write_guest(vcpu->kvm, old_tss_base, &tss_segment_32, 3643 sizeof tss_segment_32)) 3644 goto out; 3645 3646 if (kvm_read_guest(vcpu->kvm, get_tss_base_addr(vcpu, nseg_desc), 3647 &tss_segment_32, sizeof tss_segment_32)) 3648 goto out; 3649 3650 if (load_state_from_tss32(vcpu, &tss_segment_32)) 3651 goto out; 3652 3653 ret = 1; 3654 out: 3655 return ret; 3656 } 3657 3658 int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason) 3659 { 3660 struct kvm_segment tr_seg; 3661 struct desc_struct cseg_desc; 3662 struct desc_struct nseg_desc; 3663 int ret = 0; 3664 u32 old_tss_base = get_segment_base(vcpu, VCPU_SREG_TR); 3665 u16 old_tss_sel = get_segment_selector(vcpu, VCPU_SREG_TR); 3666 3667 old_tss_base = vcpu->arch.mmu.gva_to_gpa(vcpu, old_tss_base); 3668 3669 /* FIXME: Handle errors. Failure to read either TSS or their 3670 * descriptors should generate a pagefault. 
3671 */ 3672 if (load_guest_segment_descriptor(vcpu, tss_selector, &nseg_desc)) 3673 goto out; 3674 3675 if (load_guest_segment_descriptor(vcpu, old_tss_sel, &cseg_desc)) 3676 goto out; 3677 3678 if (reason != TASK_SWITCH_IRET) { 3679 int cpl; 3680 3681 cpl = kvm_x86_ops->get_cpl(vcpu); 3682 if ((tss_selector & 3) > nseg_desc.dpl || cpl > nseg_desc.dpl) { 3683 kvm_queue_exception_e(vcpu, GP_VECTOR, 0); 3684 return 1; 3685 } 3686 } 3687 3688 if (!nseg_desc.p || (nseg_desc.limit0 | nseg_desc.limit << 16) < 0x67) { 3689 kvm_queue_exception_e(vcpu, TS_VECTOR, tss_selector & 0xfffc); 3690 return 1; 3691 } 3692 3693 if (reason == TASK_SWITCH_IRET || reason == TASK_SWITCH_JMP) { 3694 cseg_desc.type &= ~(1 << 1); //clear the B flag 3695 save_guest_segment_descriptor(vcpu, old_tss_sel, &cseg_desc); 3696 } 3697 3698 if (reason == TASK_SWITCH_IRET) { 3699 u32 eflags = kvm_x86_ops->get_rflags(vcpu); 3700 kvm_x86_ops->set_rflags(vcpu, eflags & ~X86_EFLAGS_NT); 3701 } 3702 3703 kvm_x86_ops->skip_emulated_instruction(vcpu); 3704 3705 if (nseg_desc.type & 8) 3706 ret = kvm_task_switch_32(vcpu, tss_selector, old_tss_base, 3707 &nseg_desc); 3708 else 3709 ret = kvm_task_switch_16(vcpu, tss_selector, old_tss_base, 3710 &nseg_desc); 3711 3712 if (reason == TASK_SWITCH_CALL || reason == TASK_SWITCH_GATE) { 3713 u32 eflags = kvm_x86_ops->get_rflags(vcpu); 3714 kvm_x86_ops->set_rflags(vcpu, eflags | X86_EFLAGS_NT); 3715 } 3716 3717 if (reason != TASK_SWITCH_IRET) { 3718 nseg_desc.type |= (1 << 1); 3719 save_guest_segment_descriptor(vcpu, tss_selector, 3720 &nseg_desc); 3721 } 3722 3723 kvm_x86_ops->set_cr0(vcpu, vcpu->arch.cr0 | X86_CR0_TS); 3724 seg_desct_to_kvm_desct(&nseg_desc, tss_selector, &tr_seg); 3725 tr_seg.type = 11; 3726 kvm_set_segment(vcpu, &tr_seg, VCPU_SREG_TR); 3727 out: 3728 return ret; 3729 } 3730 EXPORT_SYMBOL_GPL(kvm_task_switch); 3731 3732 int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu, 3733 struct kvm_sregs *sregs) 3734 { 3735 int mmu_reset_needed = 0; 3736 int i, pending_vec, max_bits; 3737 struct descriptor_table dt; 3738 3739 vcpu_load(vcpu); 3740 3741 dt.limit = sregs->idt.limit; 3742 dt.base = sregs->idt.base; 3743 kvm_x86_ops->set_idt(vcpu, &dt); 3744 dt.limit = sregs->gdt.limit; 3745 dt.base = sregs->gdt.base; 3746 kvm_x86_ops->set_gdt(vcpu, &dt); 3747 3748 vcpu->arch.cr2 = sregs->cr2; 3749 mmu_reset_needed |= vcpu->arch.cr3 != sregs->cr3; 3750 vcpu->arch.cr3 = sregs->cr3; 3751 3752 kvm_set_cr8(vcpu, sregs->cr8); 3753 3754 mmu_reset_needed |= vcpu->arch.shadow_efer != sregs->efer; 3755 kvm_x86_ops->set_efer(vcpu, sregs->efer); 3756 kvm_set_apic_base(vcpu, sregs->apic_base); 3757 3758 kvm_x86_ops->decache_cr4_guest_bits(vcpu); 3759 3760 mmu_reset_needed |= vcpu->arch.cr0 != sregs->cr0; 3761 kvm_x86_ops->set_cr0(vcpu, sregs->cr0); 3762 vcpu->arch.cr0 = sregs->cr0; 3763 3764 mmu_reset_needed |= vcpu->arch.cr4 != sregs->cr4; 3765 kvm_x86_ops->set_cr4(vcpu, sregs->cr4); 3766 if (!is_long_mode(vcpu) && is_pae(vcpu)) 3767 load_pdptrs(vcpu, vcpu->arch.cr3); 3768 3769 if (mmu_reset_needed) 3770 kvm_mmu_reset_context(vcpu); 3771 3772 if (!irqchip_in_kernel(vcpu->kvm)) { 3773 memcpy(vcpu->arch.irq_pending, sregs->interrupt_bitmap, 3774 sizeof vcpu->arch.irq_pending); 3775 vcpu->arch.irq_summary = 0; 3776 for (i = 0; i < ARRAY_SIZE(vcpu->arch.irq_pending); ++i) 3777 if (vcpu->arch.irq_pending[i]) 3778 __set_bit(i, &vcpu->arch.irq_summary); 3779 } else { 3780 max_bits = (sizeof sregs->interrupt_bitmap) << 3; 3781 pending_vec = find_first_bit( 3782 (const unsigned long 
*)sregs->interrupt_bitmap, 3783 max_bits); 3784 /* Only pending external irq is handled here */ 3785 if (pending_vec < max_bits) { 3786 kvm_x86_ops->set_irq(vcpu, pending_vec); 3787 pr_debug("Set back pending irq %d\n", 3788 pending_vec); 3789 } 3790 kvm_pic_clear_isr_ack(vcpu->kvm); 3791 } 3792 3793 kvm_set_segment(vcpu, &sregs->cs, VCPU_SREG_CS); 3794 kvm_set_segment(vcpu, &sregs->ds, VCPU_SREG_DS); 3795 kvm_set_segment(vcpu, &sregs->es, VCPU_SREG_ES); 3796 kvm_set_segment(vcpu, &sregs->fs, VCPU_SREG_FS); 3797 kvm_set_segment(vcpu, &sregs->gs, VCPU_SREG_GS); 3798 kvm_set_segment(vcpu, &sregs->ss, VCPU_SREG_SS); 3799 3800 kvm_set_segment(vcpu, &sregs->tr, VCPU_SREG_TR); 3801 kvm_set_segment(vcpu, &sregs->ldt, VCPU_SREG_LDTR); 3802 3803 /* Older userspace won't unhalt the vcpu on reset. */ 3804 if (vcpu->vcpu_id == 0 && kvm_rip_read(vcpu) == 0xfff0 && 3805 sregs->cs.selector == 0xf000 && sregs->cs.base == 0xffff0000 && 3806 !(vcpu->arch.cr0 & X86_CR0_PE)) 3807 vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; 3808 3809 vcpu_put(vcpu); 3810 3811 return 0; 3812 } 3813 3814 int kvm_arch_vcpu_ioctl_debug_guest(struct kvm_vcpu *vcpu, 3815 struct kvm_debug_guest *dbg) 3816 { 3817 int r; 3818 3819 vcpu_load(vcpu); 3820 3821 r = kvm_x86_ops->set_guest_debug(vcpu, dbg); 3822 3823 vcpu_put(vcpu); 3824 3825 return r; 3826 } 3827 3828 /* 3829 * fxsave fpu state. Taken from x86_64/processor.h. To be killed when 3830 * we have asm/x86/processor.h 3831 */ 3832 struct fxsave { 3833 u16 cwd; 3834 u16 swd; 3835 u16 twd; 3836 u16 fop; 3837 u64 rip; 3838 u64 rdp; 3839 u32 mxcsr; 3840 u32 mxcsr_mask; 3841 u32 st_space[32]; /* 8*16 bytes for each FP-reg = 128 bytes */ 3842 #ifdef CONFIG_X86_64 3843 u32 xmm_space[64]; /* 16*16 bytes for each XMM-reg = 256 bytes */ 3844 #else 3845 u32 xmm_space[32]; /* 8*16 bytes for each XMM-reg = 128 bytes */ 3846 #endif 3847 }; 3848 3849 /* 3850 * Translate a guest virtual address to a guest physical address. 
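 * (the KVM_TRANSLATE ioctl).  The walk uses the vcpu's current paging
 * mode via ->gva_to_gpa(); an unmapped address is reported with
 * valid == 0, while writeable and usermode are not computed and are
 * always returned as 1 and 0.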
 */
int kvm_arch_vcpu_ioctl_translate(struct kvm_vcpu *vcpu,
				  struct kvm_translation *tr)
{
	unsigned long vaddr = tr->linear_address;
	gpa_t gpa;

	vcpu_load(vcpu);
	down_read(&vcpu->kvm->slots_lock);
	gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, vaddr);
	up_read(&vcpu->kvm->slots_lock);
	tr->physical_address = gpa;
	tr->valid = gpa != UNMAPPED_GVA;
	tr->writeable = 1;
	tr->usermode = 0;
	vcpu_put(vcpu);

	return 0;
}

int kvm_arch_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
{
	struct fxsave *fxsave = (struct fxsave *)&vcpu->arch.guest_fx_image;

	vcpu_load(vcpu);

	memcpy(fpu->fpr, fxsave->st_space, 128);
	fpu->fcw = fxsave->cwd;
	fpu->fsw = fxsave->swd;
	fpu->ftwx = fxsave->twd;
	fpu->last_opcode = fxsave->fop;
	fpu->last_ip = fxsave->rip;
	fpu->last_dp = fxsave->rdp;
	memcpy(fpu->xmm, fxsave->xmm_space, sizeof fxsave->xmm_space);

	vcpu_put(vcpu);

	return 0;
}

int kvm_arch_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
{
	struct fxsave *fxsave = (struct fxsave *)&vcpu->arch.guest_fx_image;

	vcpu_load(vcpu);

	memcpy(fxsave->st_space, fpu->fpr, 128);
	fxsave->cwd = fpu->fcw;
	fxsave->swd = fpu->fsw;
	fxsave->twd = fpu->ftwx;
	fxsave->fop = fpu->last_opcode;
	fxsave->rip = fpu->last_ip;
	fxsave->rdp = fpu->last_dp;
	memcpy(fxsave->xmm_space, fpu->xmm, sizeof fxsave->xmm_space);

	vcpu_put(vcpu);

	return 0;
}

void fx_init(struct kvm_vcpu *vcpu)
{
	unsigned after_mxcsr_mask;

	/*
	 * Touch the fpu the first time in a non-atomic context: if this is
	 * the first fpu instruction, the exception handler will fire before
	 * the instruction returns and it'll have to allocate ram with
	 * GFP_KERNEL.
3920 */ 3921 if (!used_math()) 3922 kvm_fx_save(&vcpu->arch.host_fx_image); 3923 3924 /* Initialize guest FPU by resetting ours and saving into guest's */ 3925 preempt_disable(); 3926 kvm_fx_save(&vcpu->arch.host_fx_image); 3927 kvm_fx_finit(); 3928 kvm_fx_save(&vcpu->arch.guest_fx_image); 3929 kvm_fx_restore(&vcpu->arch.host_fx_image); 3930 preempt_enable(); 3931 3932 vcpu->arch.cr0 |= X86_CR0_ET; 3933 after_mxcsr_mask = offsetof(struct i387_fxsave_struct, st_space); 3934 vcpu->arch.guest_fx_image.mxcsr = 0x1f80; 3935 memset((void *)&vcpu->arch.guest_fx_image + after_mxcsr_mask, 3936 0, sizeof(struct i387_fxsave_struct) - after_mxcsr_mask); 3937 } 3938 EXPORT_SYMBOL_GPL(fx_init); 3939 3940 void kvm_load_guest_fpu(struct kvm_vcpu *vcpu) 3941 { 3942 if (!vcpu->fpu_active || vcpu->guest_fpu_loaded) 3943 return; 3944 3945 vcpu->guest_fpu_loaded = 1; 3946 kvm_fx_save(&vcpu->arch.host_fx_image); 3947 kvm_fx_restore(&vcpu->arch.guest_fx_image); 3948 } 3949 EXPORT_SYMBOL_GPL(kvm_load_guest_fpu); 3950 3951 void kvm_put_guest_fpu(struct kvm_vcpu *vcpu) 3952 { 3953 if (!vcpu->guest_fpu_loaded) 3954 return; 3955 3956 vcpu->guest_fpu_loaded = 0; 3957 kvm_fx_save(&vcpu->arch.guest_fx_image); 3958 kvm_fx_restore(&vcpu->arch.host_fx_image); 3959 ++vcpu->stat.fpu_reload; 3960 } 3961 EXPORT_SYMBOL_GPL(kvm_put_guest_fpu); 3962 3963 void kvm_arch_vcpu_free(struct kvm_vcpu *vcpu) 3964 { 3965 kvm_x86_ops->vcpu_free(vcpu); 3966 } 3967 3968 struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm, 3969 unsigned int id) 3970 { 3971 return kvm_x86_ops->vcpu_create(kvm, id); 3972 } 3973 3974 int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu) 3975 { 3976 int r; 3977 3978 /* We do fxsave: this must be aligned. */ 3979 BUG_ON((unsigned long)&vcpu->arch.host_fx_image & 0xF); 3980 3981 vcpu->arch.mtrr_state.have_fixed = 1; 3982 vcpu_load(vcpu); 3983 r = kvm_arch_vcpu_reset(vcpu); 3984 if (r == 0) 3985 r = kvm_mmu_setup(vcpu); 3986 vcpu_put(vcpu); 3987 if (r < 0) 3988 goto free_vcpu; 3989 3990 return 0; 3991 free_vcpu: 3992 kvm_x86_ops->vcpu_free(vcpu); 3993 return r; 3994 } 3995 3996 void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu) 3997 { 3998 vcpu_load(vcpu); 3999 kvm_mmu_unload(vcpu); 4000 vcpu_put(vcpu); 4001 4002 kvm_x86_ops->vcpu_free(vcpu); 4003 } 4004 4005 int kvm_arch_vcpu_reset(struct kvm_vcpu *vcpu) 4006 { 4007 vcpu->arch.nmi_pending = false; 4008 vcpu->arch.nmi_injected = false; 4009 4010 return kvm_x86_ops->vcpu_reset(vcpu); 4011 } 4012 4013 void kvm_arch_hardware_enable(void *garbage) 4014 { 4015 kvm_x86_ops->hardware_enable(garbage); 4016 } 4017 4018 void kvm_arch_hardware_disable(void *garbage) 4019 { 4020 kvm_x86_ops->hardware_disable(garbage); 4021 } 4022 4023 int kvm_arch_hardware_setup(void) 4024 { 4025 return kvm_x86_ops->hardware_setup(); 4026 } 4027 4028 void kvm_arch_hardware_unsetup(void) 4029 { 4030 kvm_x86_ops->hardware_unsetup(); 4031 } 4032 4033 void kvm_arch_check_processor_compat(void *rtn) 4034 { 4035 kvm_x86_ops->check_processor_compatibility(rtn); 4036 } 4037 4038 int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu) 4039 { 4040 struct page *page; 4041 struct kvm *kvm; 4042 int r; 4043 4044 BUG_ON(vcpu->kvm == NULL); 4045 kvm = vcpu->kvm; 4046 4047 vcpu->arch.mmu.root_hpa = INVALID_PAGE; 4048 if (!irqchip_in_kernel(kvm) || vcpu->vcpu_id == 0) 4049 vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; 4050 else 4051 vcpu->arch.mp_state = KVM_MP_STATE_UNINITIALIZED; 4052 4053 page = alloc_page(GFP_KERNEL | __GFP_ZERO); 4054 if (!page) { 4055 r = -ENOMEM; 4056 goto fail; 4057 } 4058 vcpu->arch.pio_data = 
	page_address(page);

	r = kvm_mmu_create(vcpu);
	if (r < 0)
		goto fail_free_pio_data;

	if (irqchip_in_kernel(kvm)) {
		r = kvm_create_lapic(vcpu);
		if (r < 0)
			goto fail_mmu_destroy;
	}

	return 0;

fail_mmu_destroy:
	kvm_mmu_destroy(vcpu);
fail_free_pio_data:
	free_page((unsigned long)vcpu->arch.pio_data);
fail:
	return r;
}

void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu)
{
	kvm_free_lapic(vcpu);
	down_read(&vcpu->kvm->slots_lock);
	kvm_mmu_destroy(vcpu);
	up_read(&vcpu->kvm->slots_lock);
	free_page((unsigned long)vcpu->arch.pio_data);
}

struct kvm *kvm_arch_create_vm(void)
{
	struct kvm *kvm = kzalloc(sizeof(struct kvm), GFP_KERNEL);

	if (!kvm)
		return ERR_PTR(-ENOMEM);

	INIT_LIST_HEAD(&kvm->arch.active_mmu_pages);
	INIT_LIST_HEAD(&kvm->arch.oos_global_pages);
	INIT_LIST_HEAD(&kvm->arch.assigned_dev_head);

	/* Reserve bit 0 of irq_sources_bitmap for userspace irq source */
	set_bit(KVM_USERSPACE_IRQ_SOURCE_ID, &kvm->arch.irq_sources_bitmap);

	return kvm;
}

static void kvm_unload_vcpu_mmu(struct kvm_vcpu *vcpu)
{
	vcpu_load(vcpu);
	kvm_mmu_unload(vcpu);
	vcpu_put(vcpu);
}

static void kvm_free_vcpus(struct kvm *kvm)
{
	unsigned int i;

	/*
	 * Unpin any mmu pages first.
	 */
	for (i = 0; i < KVM_MAX_VCPUS; ++i)
		if (kvm->vcpus[i])
			kvm_unload_vcpu_mmu(kvm->vcpus[i]);
	for (i = 0; i < KVM_MAX_VCPUS; ++i) {
		if (kvm->vcpus[i]) {
			kvm_arch_vcpu_free(kvm->vcpus[i]);
			kvm->vcpus[i] = NULL;
		}
	}

}

void kvm_arch_sync_events(struct kvm *kvm)
{
	kvm_free_all_assigned_devices(kvm);
}

void kvm_arch_destroy_vm(struct kvm *kvm)
{
	kvm_iommu_unmap_guest(kvm);
	kvm_free_pit(kvm);
	kfree(kvm->arch.vpic);
	kfree(kvm->arch.vioapic);
	kvm_free_vcpus(kvm);
	kvm_free_physmem(kvm);
	if (kvm->arch.apic_access_page)
		put_page(kvm->arch.apic_access_page);
	if (kvm->arch.ept_identity_pagetable)
		put_page(kvm->arch.ept_identity_pagetable);
	kfree(kvm);
}

int kvm_arch_set_memory_region(struct kvm *kvm,
				struct kvm_userspace_memory_region *mem,
				struct kvm_memory_slot old,
				int user_alloc)
{
	int npages = mem->memory_size >> PAGE_SHIFT;
	struct kvm_memory_slot *memslot = &kvm->memslots[mem->slot];

	/*
	 * To keep backward compatibility with older userspace,
	 * x86 needs to handle the !user_alloc case.
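	 * For such slots the kernel mmap()s anonymous memory on behalf of
	 * the process when the slot is created and munmap()s it again when
	 * the slot goes away, instead of using a userspace-supplied
	 * address.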
	 */
	if (!user_alloc) {
		if (npages && !old.rmap) {
			unsigned long userspace_addr;

			down_write(&current->mm->mmap_sem);
			userspace_addr = do_mmap(NULL, 0,
						 npages * PAGE_SIZE,
						 PROT_READ | PROT_WRITE,
						 MAP_PRIVATE | MAP_ANONYMOUS,
						 0);
			up_write(&current->mm->mmap_sem);

			if (IS_ERR((void *)userspace_addr))
				return PTR_ERR((void *)userspace_addr);

			/* set userspace_addr atomically for kvm_hva_to_rmapp */
			spin_lock(&kvm->mmu_lock);
			memslot->userspace_addr = userspace_addr;
			spin_unlock(&kvm->mmu_lock);
		} else {
			if (!old.user_alloc && old.rmap) {
				int ret;

				down_write(&current->mm->mmap_sem);
				ret = do_munmap(current->mm, old.userspace_addr,
						old.npages * PAGE_SIZE);
				up_write(&current->mm->mmap_sem);
				if (ret < 0)
					printk(KERN_WARNING
					       "kvm_vm_ioctl_set_memory_region: "
					       "failed to munmap memory\n");
			}
		}
	}

	if (!kvm->arch.n_requested_mmu_pages) {
		unsigned int nr_mmu_pages = kvm_mmu_calculate_mmu_pages(kvm);
		kvm_mmu_change_mmu_pages(kvm, nr_mmu_pages);
	}

	kvm_mmu_slot_remove_write_access(kvm, mem->slot);
	kvm_flush_remote_tlbs(kvm);

	return 0;
}

void kvm_arch_flush_shadow(struct kvm *kvm)
{
	kvm_mmu_zap_all(kvm);
}

int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu)
{
	return vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE
		|| vcpu->arch.mp_state == KVM_MP_STATE_SIPI_RECEIVED
		|| vcpu->arch.nmi_pending;
}

static void vcpu_kick_intr(void *info)
{
#ifdef DEBUG
	struct kvm_vcpu *vcpu = (struct kvm_vcpu *)info;
	printk(KERN_DEBUG "vcpu_kick_intr %p \n", vcpu);
#endif
}

void kvm_vcpu_kick(struct kvm_vcpu *vcpu)
{
	int ipi_pcpu = vcpu->cpu;
	int cpu = get_cpu();

	if (waitqueue_active(&vcpu->wq)) {
		wake_up_interruptible(&vcpu->wq);
		++vcpu->stat.halt_wakeup;
	}
	/*
	 * We may be called synchronously with irqs disabled in guest mode,
	 * so we need not call smp_call_function_single() in that case.
	 */
	if (vcpu->guest_mode && vcpu->cpu != cpu)
		smp_call_function_single(ipi_pcpu, vcpu_kick_intr, vcpu, 0);
	put_cpu();
}
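
/*
 * Illustrative sketch (not part of this file; error handling omitted;
 * handle_pio_out()/handle_pio_in() are hypothetical helpers): how a
 * userspace VMM typically services the KVM_EXIT_IO exits produced by
 * kvm_emulate_pio() above.  The shared kvm_run area is mmap()ed from the
 * vcpu fd (size obtained via KVM_GET_VCPU_MMAP_SIZE); the pio data page
 * lives at run + run->io.data_offset, i.e. KVM_PIO_PAGE_OFFSET * PAGE_SIZE
 * as set up in this file.  For a port read, userspace stores the result
 * into that page before the next KVM_RUN, and complete_pio() copies it
 * back into the guest on re-entry.
 *
 *	struct kvm_run *run = mmap(NULL, mmap_size, PROT_READ | PROT_WRITE,
 *				   MAP_SHARED, vcpu_fd, 0);
 *	for (;;) {
 *		ioctl(vcpu_fd, KVM_RUN, 0);
 *		switch (run->exit_reason) {
 *		case KVM_EXIT_IO: {
 *			__u8 *data = (__u8 *)run + run->io.data_offset;
 *
 *			if (run->io.direction == KVM_EXIT_IO_OUT)
 *				handle_pio_out(run->io.port, data,
 *					       run->io.size, run->io.count);
 *			else
 *				handle_pio_in(run->io.port, data,
 *					      run->io.size, run->io.count);
 *			break;
 *		}
 *		case KVM_EXIT_MMIO:
 *			... use run->mmio.{phys_addr, data, len, is_write} ...
 *			break;
 *		}
 *	}
 */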