xref: /linux/virt/kvm/kvm_main.c (revision 492c826b9facefa84995f4dea917e301b5ee0884)
1 /*
2  * Kernel-based Virtual Machine driver for Linux
3  *
4  * This module enables machines with Intel VT-x extensions to run virtual
5  * machines without emulation or binary translation.
6  *
7  * Copyright (C) 2006 Qumranet, Inc.
8  * Copyright 2010 Red Hat, Inc. and/or its affiliates.
9  *
10  * Authors:
11  *   Avi Kivity   <avi@qumranet.com>
12  *   Yaniv Kamay  <yaniv@qumranet.com>
13  *
14  * This work is licensed under the terms of the GNU GPL, version 2.  See
15  * the COPYING file in the top-level directory.
16  *
17  */
18 
19 #include "iodev.h"
20 
21 #include <linux/kvm_host.h>
22 #include <linux/kvm.h>
23 #include <linux/module.h>
24 #include <linux/errno.h>
25 #include <linux/percpu.h>
26 #include <linux/mm.h>
27 #include <linux/miscdevice.h>
28 #include <linux/vmalloc.h>
29 #include <linux/reboot.h>
30 #include <linux/debugfs.h>
31 #include <linux/highmem.h>
32 #include <linux/file.h>
33 #include <linux/syscore_ops.h>
34 #include <linux/cpu.h>
35 #include <linux/sched.h>
36 #include <linux/cpumask.h>
37 #include <linux/smp.h>
38 #include <linux/anon_inodes.h>
39 #include <linux/profile.h>
40 #include <linux/kvm_para.h>
41 #include <linux/pagemap.h>
42 #include <linux/mman.h>
43 #include <linux/swap.h>
44 #include <linux/bitops.h>
45 #include <linux/spinlock.h>
46 #include <linux/compat.h>
47 #include <linux/srcu.h>
48 #include <linux/hugetlb.h>
49 #include <linux/slab.h>
50 
51 #include <asm/processor.h>
52 #include <asm/io.h>
53 #include <asm/uaccess.h>
54 #include <asm/pgtable.h>
55 
56 #include "coalesced_mmio.h"
57 #include "async_pf.h"
58 
59 #define CREATE_TRACE_POINTS
60 #include <trace/events/kvm.h>
61 
62 MODULE_AUTHOR("Qumranet");
63 MODULE_LICENSE("GPL");
64 
65 /*
66  * Ordering of locks:
67  *
68  * 		kvm->lock --> kvm->slots_lock --> kvm->irq_lock
69  */
70 
71 DEFINE_RAW_SPINLOCK(kvm_lock);
72 LIST_HEAD(vm_list);
73 
74 static cpumask_var_t cpus_hardware_enabled;
75 static int kvm_usage_count = 0;
76 static atomic_t hardware_enable_failed;
77 
78 struct kmem_cache *kvm_vcpu_cache;
79 EXPORT_SYMBOL_GPL(kvm_vcpu_cache);
80 
81 static __read_mostly struct preempt_ops kvm_preempt_ops;
82 
83 struct dentry *kvm_debugfs_dir;
84 
85 static long kvm_vcpu_ioctl(struct file *file, unsigned int ioctl,
86 			   unsigned long arg);
87 static int hardware_enable_all(void);
88 static void hardware_disable_all(void);
89 
90 static void kvm_io_bus_destroy(struct kvm_io_bus *bus);
91 
92 bool kvm_rebooting;
93 EXPORT_SYMBOL_GPL(kvm_rebooting);
94 
95 static bool largepages_enabled = true;
96 
97 static struct page *hwpoison_page;
98 static pfn_t hwpoison_pfn;
99 
100 static struct page *fault_page;
101 static pfn_t fault_pfn;
102 
103 inline int kvm_is_mmio_pfn(pfn_t pfn)
104 {
105 	if (pfn_valid(pfn)) {
106 		int reserved;
107 		struct page *tail = pfn_to_page(pfn);
108 		struct page *head = compound_trans_head(tail);
109 		reserved = PageReserved(head);
110 		if (head != tail) {
111 			/*
112 			 * "head" is not a dangling pointer
113 			 * (compound_trans_head takes care of that)
114 			 * but the hugepage may have been splitted
115 			 * from under us (and we may not hold a
116 			 * reference count on the head page so it can
117 			 * be reused before we run PageReferenced), so
118 			 * we've to check PageTail before returning
119 			 * what we just read.
120 			 */
121 			smp_rmb();
122 			if (PageTail(tail))
123 				return reserved;
124 		}
125 		return PageReserved(tail);
126 	}
127 
128 	return true;
129 }
130 
131 /*
132  * Switches to specified vcpu, until a matching vcpu_put()
133  */
134 void vcpu_load(struct kvm_vcpu *vcpu)
135 {
136 	int cpu;
137 
138 	mutex_lock(&vcpu->mutex);
139 	if (unlikely(vcpu->pid != current->pids[PIDTYPE_PID].pid)) {
140 		/* The thread running this VCPU changed. */
141 		struct pid *oldpid = vcpu->pid;
142 		struct pid *newpid = get_task_pid(current, PIDTYPE_PID);
143 		rcu_assign_pointer(vcpu->pid, newpid);
144 		synchronize_rcu();
145 		put_pid(oldpid);
146 	}
147 	cpu = get_cpu();
148 	preempt_notifier_register(&vcpu->preempt_notifier);
149 	kvm_arch_vcpu_load(vcpu, cpu);
150 	put_cpu();
151 }
152 
153 void vcpu_put(struct kvm_vcpu *vcpu)
154 {
155 	preempt_disable();
156 	kvm_arch_vcpu_put(vcpu);
157 	preempt_notifier_unregister(&vcpu->preempt_notifier);
158 	preempt_enable();
159 	mutex_unlock(&vcpu->mutex);
160 }
161 
162 static void ack_flush(void *_completed)
163 {
164 }
165 
166 static bool make_all_cpus_request(struct kvm *kvm, unsigned int req)
167 {
168 	int i, cpu, me;
169 	cpumask_var_t cpus;
170 	bool called = true;
171 	struct kvm_vcpu *vcpu;
172 
173 	zalloc_cpumask_var(&cpus, GFP_ATOMIC);
174 
175 	me = get_cpu();
176 	kvm_for_each_vcpu(i, vcpu, kvm) {
177 		kvm_make_request(req, vcpu);
178 		cpu = vcpu->cpu;
179 
180 		/* Set ->requests bit before we read ->mode */
181 		smp_mb();
182 
183 		if (cpus != NULL && cpu != -1 && cpu != me &&
184 		      kvm_vcpu_exiting_guest_mode(vcpu) != OUTSIDE_GUEST_MODE)
185 			cpumask_set_cpu(cpu, cpus);
186 	}
187 	if (unlikely(cpus == NULL))
188 		smp_call_function_many(cpu_online_mask, ack_flush, NULL, 1);
189 	else if (!cpumask_empty(cpus))
190 		smp_call_function_many(cpus, ack_flush, NULL, 1);
191 	else
192 		called = false;
193 	put_cpu();
194 	free_cpumask_var(cpus);
195 	return called;
196 }
197 
198 void kvm_flush_remote_tlbs(struct kvm *kvm)
199 {
200 	int dirty_count = kvm->tlbs_dirty;
201 
202 	smp_mb();
203 	if (make_all_cpus_request(kvm, KVM_REQ_TLB_FLUSH))
204 		++kvm->stat.remote_tlb_flush;
205 	cmpxchg(&kvm->tlbs_dirty, dirty_count, 0);
206 }
207 
208 void kvm_reload_remote_mmus(struct kvm *kvm)
209 {
210 	make_all_cpus_request(kvm, KVM_REQ_MMU_RELOAD);
211 }
212 
213 int kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id)
214 {
215 	struct page *page;
216 	int r;
217 
218 	mutex_init(&vcpu->mutex);
219 	vcpu->cpu = -1;
220 	vcpu->kvm = kvm;
221 	vcpu->vcpu_id = id;
222 	vcpu->pid = NULL;
223 	init_waitqueue_head(&vcpu->wq);
224 	kvm_async_pf_vcpu_init(vcpu);
225 
226 	page = alloc_page(GFP_KERNEL | __GFP_ZERO);
227 	if (!page) {
228 		r = -ENOMEM;
229 		goto fail;
230 	}
231 	vcpu->run = page_address(page);
232 
233 	r = kvm_arch_vcpu_init(vcpu);
234 	if (r < 0)
235 		goto fail_free_run;
236 	return 0;
237 
238 fail_free_run:
239 	free_page((unsigned long)vcpu->run);
240 fail:
241 	return r;
242 }
243 EXPORT_SYMBOL_GPL(kvm_vcpu_init);
244 
245 void kvm_vcpu_uninit(struct kvm_vcpu *vcpu)
246 {
247 	put_pid(vcpu->pid);
248 	kvm_arch_vcpu_uninit(vcpu);
249 	free_page((unsigned long)vcpu->run);
250 }
251 EXPORT_SYMBOL_GPL(kvm_vcpu_uninit);
252 
253 #if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER)
254 static inline struct kvm *mmu_notifier_to_kvm(struct mmu_notifier *mn)
255 {
256 	return container_of(mn, struct kvm, mmu_notifier);
257 }
258 
259 static void kvm_mmu_notifier_invalidate_page(struct mmu_notifier *mn,
260 					     struct mm_struct *mm,
261 					     unsigned long address)
262 {
263 	struct kvm *kvm = mmu_notifier_to_kvm(mn);
264 	int need_tlb_flush, idx;
265 
266 	/*
267 	 * When ->invalidate_page runs, the linux pte has been zapped
268 	 * already but the page is still allocated until
269 	 * ->invalidate_page returns. So if we increase the sequence
270 	 * here the kvm page fault will notice if the spte can't be
271 	 * established because the page is going to be freed. If
272 	 * instead the kvm page fault establishes the spte before
273 	 * ->invalidate_page runs, kvm_unmap_hva will release it
274 	 * before returning.
275 	 *
276 	 * The sequence increase only need to be seen at spin_unlock
277 	 * time, and not at spin_lock time.
278 	 *
279 	 * Increasing the sequence after the spin_unlock would be
280 	 * unsafe because the kvm page fault could then establish the
281 	 * pte after kvm_unmap_hva returned, without noticing the page
282 	 * is going to be freed.
283 	 */
284 	idx = srcu_read_lock(&kvm->srcu);
285 	spin_lock(&kvm->mmu_lock);
286 	kvm->mmu_notifier_seq++;
287 	need_tlb_flush = kvm_unmap_hva(kvm, address) | kvm->tlbs_dirty;
288 	spin_unlock(&kvm->mmu_lock);
289 	srcu_read_unlock(&kvm->srcu, idx);
290 
291 	/* we've to flush the tlb before the pages can be freed */
292 	if (need_tlb_flush)
293 		kvm_flush_remote_tlbs(kvm);
294 
295 }
296 
297 static void kvm_mmu_notifier_change_pte(struct mmu_notifier *mn,
298 					struct mm_struct *mm,
299 					unsigned long address,
300 					pte_t pte)
301 {
302 	struct kvm *kvm = mmu_notifier_to_kvm(mn);
303 	int idx;
304 
305 	idx = srcu_read_lock(&kvm->srcu);
306 	spin_lock(&kvm->mmu_lock);
307 	kvm->mmu_notifier_seq++;
308 	kvm_set_spte_hva(kvm, address, pte);
309 	spin_unlock(&kvm->mmu_lock);
310 	srcu_read_unlock(&kvm->srcu, idx);
311 }
312 
313 static void kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn,
314 						    struct mm_struct *mm,
315 						    unsigned long start,
316 						    unsigned long end)
317 {
318 	struct kvm *kvm = mmu_notifier_to_kvm(mn);
319 	int need_tlb_flush = 0, idx;
320 
321 	idx = srcu_read_lock(&kvm->srcu);
322 	spin_lock(&kvm->mmu_lock);
323 	/*
324 	 * The count increase must become visible at unlock time as no
325 	 * spte can be established without taking the mmu_lock and
326 	 * count is also read inside the mmu_lock critical section.
327 	 */
328 	kvm->mmu_notifier_count++;
329 	for (; start < end; start += PAGE_SIZE)
330 		need_tlb_flush |= kvm_unmap_hva(kvm, start);
331 	need_tlb_flush |= kvm->tlbs_dirty;
332 	spin_unlock(&kvm->mmu_lock);
333 	srcu_read_unlock(&kvm->srcu, idx);
334 
335 	/* we've to flush the tlb before the pages can be freed */
336 	if (need_tlb_flush)
337 		kvm_flush_remote_tlbs(kvm);
338 }
339 
340 static void kvm_mmu_notifier_invalidate_range_end(struct mmu_notifier *mn,
341 						  struct mm_struct *mm,
342 						  unsigned long start,
343 						  unsigned long end)
344 {
345 	struct kvm *kvm = mmu_notifier_to_kvm(mn);
346 
347 	spin_lock(&kvm->mmu_lock);
348 	/*
349 	 * This sequence increase will notify the kvm page fault that
350 	 * the page that is going to be mapped in the spte could have
351 	 * been freed.
352 	 */
353 	kvm->mmu_notifier_seq++;
354 	/*
355 	 * The above sequence increase must be visible before the
356 	 * below count decrease but both values are read by the kvm
357 	 * page fault under mmu_lock spinlock so we don't need to add
358 	 * a smb_wmb() here in between the two.
359 	 */
360 	kvm->mmu_notifier_count--;
361 	spin_unlock(&kvm->mmu_lock);
362 
363 	BUG_ON(kvm->mmu_notifier_count < 0);
364 }
365 
366 static int kvm_mmu_notifier_clear_flush_young(struct mmu_notifier *mn,
367 					      struct mm_struct *mm,
368 					      unsigned long address)
369 {
370 	struct kvm *kvm = mmu_notifier_to_kvm(mn);
371 	int young, idx;
372 
373 	idx = srcu_read_lock(&kvm->srcu);
374 	spin_lock(&kvm->mmu_lock);
375 	young = kvm_age_hva(kvm, address);
376 	spin_unlock(&kvm->mmu_lock);
377 	srcu_read_unlock(&kvm->srcu, idx);
378 
379 	if (young)
380 		kvm_flush_remote_tlbs(kvm);
381 
382 	return young;
383 }
384 
385 static int kvm_mmu_notifier_test_young(struct mmu_notifier *mn,
386 				       struct mm_struct *mm,
387 				       unsigned long address)
388 {
389 	struct kvm *kvm = mmu_notifier_to_kvm(mn);
390 	int young, idx;
391 
392 	idx = srcu_read_lock(&kvm->srcu);
393 	spin_lock(&kvm->mmu_lock);
394 	young = kvm_test_age_hva(kvm, address);
395 	spin_unlock(&kvm->mmu_lock);
396 	srcu_read_unlock(&kvm->srcu, idx);
397 
398 	return young;
399 }
400 
401 static void kvm_mmu_notifier_release(struct mmu_notifier *mn,
402 				     struct mm_struct *mm)
403 {
404 	struct kvm *kvm = mmu_notifier_to_kvm(mn);
405 	int idx;
406 
407 	idx = srcu_read_lock(&kvm->srcu);
408 	kvm_arch_flush_shadow(kvm);
409 	srcu_read_unlock(&kvm->srcu, idx);
410 }
411 
412 static const struct mmu_notifier_ops kvm_mmu_notifier_ops = {
413 	.invalidate_page	= kvm_mmu_notifier_invalidate_page,
414 	.invalidate_range_start	= kvm_mmu_notifier_invalidate_range_start,
415 	.invalidate_range_end	= kvm_mmu_notifier_invalidate_range_end,
416 	.clear_flush_young	= kvm_mmu_notifier_clear_flush_young,
417 	.test_young		= kvm_mmu_notifier_test_young,
418 	.change_pte		= kvm_mmu_notifier_change_pte,
419 	.release		= kvm_mmu_notifier_release,
420 };
421 
422 static int kvm_init_mmu_notifier(struct kvm *kvm)
423 {
424 	kvm->mmu_notifier.ops = &kvm_mmu_notifier_ops;
425 	return mmu_notifier_register(&kvm->mmu_notifier, current->mm);
426 }
427 
428 #else  /* !(CONFIG_MMU_NOTIFIER && KVM_ARCH_WANT_MMU_NOTIFIER) */
429 
430 static int kvm_init_mmu_notifier(struct kvm *kvm)
431 {
432 	return 0;
433 }
434 
435 #endif /* CONFIG_MMU_NOTIFIER && KVM_ARCH_WANT_MMU_NOTIFIER */
436 
437 static struct kvm *kvm_create_vm(void)
438 {
439 	int r, i;
440 	struct kvm *kvm = kvm_arch_alloc_vm();
441 
442 	if (!kvm)
443 		return ERR_PTR(-ENOMEM);
444 
445 	r = kvm_arch_init_vm(kvm);
446 	if (r)
447 		goto out_err_nodisable;
448 
449 	r = hardware_enable_all();
450 	if (r)
451 		goto out_err_nodisable;
452 
453 #ifdef CONFIG_HAVE_KVM_IRQCHIP
454 	INIT_HLIST_HEAD(&kvm->mask_notifier_list);
455 	INIT_HLIST_HEAD(&kvm->irq_ack_notifier_list);
456 #endif
457 
458 	r = -ENOMEM;
459 	kvm->memslots = kzalloc(sizeof(struct kvm_memslots), GFP_KERNEL);
460 	if (!kvm->memslots)
461 		goto out_err_nosrcu;
462 	if (init_srcu_struct(&kvm->srcu))
463 		goto out_err_nosrcu;
464 	for (i = 0; i < KVM_NR_BUSES; i++) {
465 		kvm->buses[i] = kzalloc(sizeof(struct kvm_io_bus),
466 					GFP_KERNEL);
467 		if (!kvm->buses[i])
468 			goto out_err;
469 	}
470 	spin_lock_init(&kvm->mmu_lock);
471 
472 	r = kvm_init_mmu_notifier(kvm);
473 	if (r)
474 		goto out_err;
475 
476 	kvm->mm = current->mm;
477 	atomic_inc(&kvm->mm->mm_count);
478 	kvm_eventfd_init(kvm);
479 	mutex_init(&kvm->lock);
480 	mutex_init(&kvm->irq_lock);
481 	mutex_init(&kvm->slots_lock);
482 	atomic_set(&kvm->users_count, 1);
483 	raw_spin_lock(&kvm_lock);
484 	list_add(&kvm->vm_list, &vm_list);
485 	raw_spin_unlock(&kvm_lock);
486 
487 	return kvm;
488 
489 out_err:
490 	cleanup_srcu_struct(&kvm->srcu);
491 out_err_nosrcu:
492 	hardware_disable_all();
493 out_err_nodisable:
494 	for (i = 0; i < KVM_NR_BUSES; i++)
495 		kfree(kvm->buses[i]);
496 	kfree(kvm->memslots);
497 	kvm_arch_free_vm(kvm);
498 	return ERR_PTR(r);
499 }
500 
501 static void kvm_destroy_dirty_bitmap(struct kvm_memory_slot *memslot)
502 {
503 	if (!memslot->dirty_bitmap)
504 		return;
505 
506 	if (2 * kvm_dirty_bitmap_bytes(memslot) > PAGE_SIZE)
507 		vfree(memslot->dirty_bitmap_head);
508 	else
509 		kfree(memslot->dirty_bitmap_head);
510 
511 	memslot->dirty_bitmap = NULL;
512 	memslot->dirty_bitmap_head = NULL;
513 }
514 
515 /*
516  * Free any memory in @free but not in @dont.
517  */
518 static void kvm_free_physmem_slot(struct kvm_memory_slot *free,
519 				  struct kvm_memory_slot *dont)
520 {
521 	int i;
522 
523 	if (!dont || free->rmap != dont->rmap)
524 		vfree(free->rmap);
525 
526 	if (!dont || free->dirty_bitmap != dont->dirty_bitmap)
527 		kvm_destroy_dirty_bitmap(free);
528 
529 
530 	for (i = 0; i < KVM_NR_PAGE_SIZES - 1; ++i) {
531 		if (!dont || free->lpage_info[i] != dont->lpage_info[i]) {
532 			vfree(free->lpage_info[i]);
533 			free->lpage_info[i] = NULL;
534 		}
535 	}
536 
537 	free->npages = 0;
538 	free->rmap = NULL;
539 }
540 
541 void kvm_free_physmem(struct kvm *kvm)
542 {
543 	int i;
544 	struct kvm_memslots *slots = kvm->memslots;
545 
546 	for (i = 0; i < slots->nmemslots; ++i)
547 		kvm_free_physmem_slot(&slots->memslots[i], NULL);
548 
549 	kfree(kvm->memslots);
550 }
551 
552 static void kvm_destroy_vm(struct kvm *kvm)
553 {
554 	int i;
555 	struct mm_struct *mm = kvm->mm;
556 
557 	kvm_arch_sync_events(kvm);
558 	raw_spin_lock(&kvm_lock);
559 	list_del(&kvm->vm_list);
560 	raw_spin_unlock(&kvm_lock);
561 	kvm_free_irq_routing(kvm);
562 	for (i = 0; i < KVM_NR_BUSES; i++)
563 		kvm_io_bus_destroy(kvm->buses[i]);
564 	kvm_coalesced_mmio_free(kvm);
565 #if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER)
566 	mmu_notifier_unregister(&kvm->mmu_notifier, kvm->mm);
567 #else
568 	kvm_arch_flush_shadow(kvm);
569 #endif
570 	kvm_arch_destroy_vm(kvm);
571 	kvm_free_physmem(kvm);
572 	cleanup_srcu_struct(&kvm->srcu);
573 	kvm_arch_free_vm(kvm);
574 	hardware_disable_all();
575 	mmdrop(mm);
576 }
577 
578 void kvm_get_kvm(struct kvm *kvm)
579 {
580 	atomic_inc(&kvm->users_count);
581 }
582 EXPORT_SYMBOL_GPL(kvm_get_kvm);
583 
584 void kvm_put_kvm(struct kvm *kvm)
585 {
586 	if (atomic_dec_and_test(&kvm->users_count))
587 		kvm_destroy_vm(kvm);
588 }
589 EXPORT_SYMBOL_GPL(kvm_put_kvm);
590 
591 
592 static int kvm_vm_release(struct inode *inode, struct file *filp)
593 {
594 	struct kvm *kvm = filp->private_data;
595 
596 	kvm_irqfd_release(kvm);
597 
598 	kvm_put_kvm(kvm);
599 	return 0;
600 }
601 
602 #ifndef CONFIG_S390
603 /*
604  * Allocation size is twice as large as the actual dirty bitmap size.
605  * This makes it possible to do double buffering: see x86's
606  * kvm_vm_ioctl_get_dirty_log().
607  */
608 static int kvm_create_dirty_bitmap(struct kvm_memory_slot *memslot)
609 {
610 	unsigned long dirty_bytes = 2 * kvm_dirty_bitmap_bytes(memslot);
611 
612 	if (dirty_bytes > PAGE_SIZE)
613 		memslot->dirty_bitmap = vzalloc(dirty_bytes);
614 	else
615 		memslot->dirty_bitmap = kzalloc(dirty_bytes, GFP_KERNEL);
616 
617 	if (!memslot->dirty_bitmap)
618 		return -ENOMEM;
619 
620 	memslot->dirty_bitmap_head = memslot->dirty_bitmap;
621 	return 0;
622 }
623 #endif /* !CONFIG_S390 */
624 
625 /*
626  * Allocate some memory and give it an address in the guest physical address
627  * space.
628  *
629  * Discontiguous memory is allowed, mostly for framebuffers.
630  *
631  * Must be called holding mmap_sem for write.
632  */
633 int __kvm_set_memory_region(struct kvm *kvm,
634 			    struct kvm_userspace_memory_region *mem,
635 			    int user_alloc)
636 {
637 	int r;
638 	gfn_t base_gfn;
639 	unsigned long npages;
640 	unsigned long i;
641 	struct kvm_memory_slot *memslot;
642 	struct kvm_memory_slot old, new;
643 	struct kvm_memslots *slots, *old_memslots;
644 
645 	r = -EINVAL;
646 	/* General sanity checks */
647 	if (mem->memory_size & (PAGE_SIZE - 1))
648 		goto out;
649 	if (mem->guest_phys_addr & (PAGE_SIZE - 1))
650 		goto out;
651 	/* We can read the guest memory with __xxx_user() later on. */
652 	if (user_alloc &&
653 	    ((mem->userspace_addr & (PAGE_SIZE - 1)) ||
654 	     !access_ok(VERIFY_WRITE, mem->userspace_addr, mem->memory_size)))
655 		goto out;
656 	if (mem->slot >= KVM_MEMORY_SLOTS + KVM_PRIVATE_MEM_SLOTS)
657 		goto out;
658 	if (mem->guest_phys_addr + mem->memory_size < mem->guest_phys_addr)
659 		goto out;
660 
661 	memslot = &kvm->memslots->memslots[mem->slot];
662 	base_gfn = mem->guest_phys_addr >> PAGE_SHIFT;
663 	npages = mem->memory_size >> PAGE_SHIFT;
664 
665 	r = -EINVAL;
666 	if (npages > KVM_MEM_MAX_NR_PAGES)
667 		goto out;
668 
669 	if (!npages)
670 		mem->flags &= ~KVM_MEM_LOG_DIRTY_PAGES;
671 
672 	new = old = *memslot;
673 
674 	new.id = mem->slot;
675 	new.base_gfn = base_gfn;
676 	new.npages = npages;
677 	new.flags = mem->flags;
678 
679 	/* Disallow changing a memory slot's size. */
680 	r = -EINVAL;
681 	if (npages && old.npages && npages != old.npages)
682 		goto out_free;
683 
684 	/* Check for overlaps */
685 	r = -EEXIST;
686 	for (i = 0; i < KVM_MEMORY_SLOTS; ++i) {
687 		struct kvm_memory_slot *s = &kvm->memslots->memslots[i];
688 
689 		if (s == memslot || !s->npages)
690 			continue;
691 		if (!((base_gfn + npages <= s->base_gfn) ||
692 		      (base_gfn >= s->base_gfn + s->npages)))
693 			goto out_free;
694 	}
695 
696 	/* Free page dirty bitmap if unneeded */
697 	if (!(new.flags & KVM_MEM_LOG_DIRTY_PAGES))
698 		new.dirty_bitmap = NULL;
699 
700 	r = -ENOMEM;
701 
702 	/* Allocate if a slot is being created */
703 #ifndef CONFIG_S390
704 	if (npages && !new.rmap) {
705 		new.rmap = vzalloc(npages * sizeof(*new.rmap));
706 
707 		if (!new.rmap)
708 			goto out_free;
709 
710 		new.user_alloc = user_alloc;
711 		new.userspace_addr = mem->userspace_addr;
712 	}
713 	if (!npages)
714 		goto skip_lpage;
715 
716 	for (i = 0; i < KVM_NR_PAGE_SIZES - 1; ++i) {
717 		unsigned long ugfn;
718 		unsigned long j;
719 		int lpages;
720 		int level = i + 2;
721 
722 		/* Avoid unused variable warning if no large pages */
723 		(void)level;
724 
725 		if (new.lpage_info[i])
726 			continue;
727 
728 		lpages = 1 + ((base_gfn + npages - 1)
729 			     >> KVM_HPAGE_GFN_SHIFT(level));
730 		lpages -= base_gfn >> KVM_HPAGE_GFN_SHIFT(level);
731 
732 		new.lpage_info[i] = vzalloc(lpages * sizeof(*new.lpage_info[i]));
733 
734 		if (!new.lpage_info[i])
735 			goto out_free;
736 
737 		if (base_gfn & (KVM_PAGES_PER_HPAGE(level) - 1))
738 			new.lpage_info[i][0].write_count = 1;
739 		if ((base_gfn+npages) & (KVM_PAGES_PER_HPAGE(level) - 1))
740 			new.lpage_info[i][lpages - 1].write_count = 1;
741 		ugfn = new.userspace_addr >> PAGE_SHIFT;
742 		/*
743 		 * If the gfn and userspace address are not aligned wrt each
744 		 * other, or if explicitly asked to, disable large page
745 		 * support for this slot
746 		 */
747 		if ((base_gfn ^ ugfn) & (KVM_PAGES_PER_HPAGE(level) - 1) ||
748 		    !largepages_enabled)
749 			for (j = 0; j < lpages; ++j)
750 				new.lpage_info[i][j].write_count = 1;
751 	}
752 
753 skip_lpage:
754 
755 	/* Allocate page dirty bitmap if needed */
756 	if ((new.flags & KVM_MEM_LOG_DIRTY_PAGES) && !new.dirty_bitmap) {
757 		if (kvm_create_dirty_bitmap(&new) < 0)
758 			goto out_free;
759 		/* destroy any largepage mappings for dirty tracking */
760 	}
761 #else  /* not defined CONFIG_S390 */
762 	new.user_alloc = user_alloc;
763 	if (user_alloc)
764 		new.userspace_addr = mem->userspace_addr;
765 #endif /* not defined CONFIG_S390 */
766 
767 	if (!npages) {
768 		r = -ENOMEM;
769 		slots = kzalloc(sizeof(struct kvm_memslots), GFP_KERNEL);
770 		if (!slots)
771 			goto out_free;
772 		memcpy(slots, kvm->memslots, sizeof(struct kvm_memslots));
773 		if (mem->slot >= slots->nmemslots)
774 			slots->nmemslots = mem->slot + 1;
775 		slots->generation++;
776 		slots->memslots[mem->slot].flags |= KVM_MEMSLOT_INVALID;
777 
778 		old_memslots = kvm->memslots;
779 		rcu_assign_pointer(kvm->memslots, slots);
780 		synchronize_srcu_expedited(&kvm->srcu);
781 		/* From this point no new shadow pages pointing to a deleted
782 		 * memslot will be created.
783 		 *
784 		 * validation of sp->gfn happens in:
785 		 * 	- gfn_to_hva (kvm_read_guest, gfn_to_pfn)
786 		 * 	- kvm_is_visible_gfn (mmu_check_roots)
787 		 */
788 		kvm_arch_flush_shadow(kvm);
789 		kfree(old_memslots);
790 	}
791 
792 	r = kvm_arch_prepare_memory_region(kvm, &new, old, mem, user_alloc);
793 	if (r)
794 		goto out_free;
795 
796 	/* map the pages in iommu page table */
797 	if (npages) {
798 		r = kvm_iommu_map_pages(kvm, &new);
799 		if (r)
800 			goto out_free;
801 	}
802 
803 	r = -ENOMEM;
804 	slots = kzalloc(sizeof(struct kvm_memslots), GFP_KERNEL);
805 	if (!slots)
806 		goto out_free;
807 	memcpy(slots, kvm->memslots, sizeof(struct kvm_memslots));
808 	if (mem->slot >= slots->nmemslots)
809 		slots->nmemslots = mem->slot + 1;
810 	slots->generation++;
811 
812 	/* actual memory is freed via old in kvm_free_physmem_slot below */
813 	if (!npages) {
814 		new.rmap = NULL;
815 		new.dirty_bitmap = NULL;
816 		for (i = 0; i < KVM_NR_PAGE_SIZES - 1; ++i)
817 			new.lpage_info[i] = NULL;
818 	}
819 
820 	slots->memslots[mem->slot] = new;
821 	old_memslots = kvm->memslots;
822 	rcu_assign_pointer(kvm->memslots, slots);
823 	synchronize_srcu_expedited(&kvm->srcu);
824 
825 	kvm_arch_commit_memory_region(kvm, mem, old, user_alloc);
826 
827 	kvm_free_physmem_slot(&old, &new);
828 	kfree(old_memslots);
829 
830 	return 0;
831 
832 out_free:
833 	kvm_free_physmem_slot(&new, &old);
834 out:
835 	return r;
836 
837 }
838 EXPORT_SYMBOL_GPL(__kvm_set_memory_region);
839 
840 int kvm_set_memory_region(struct kvm *kvm,
841 			  struct kvm_userspace_memory_region *mem,
842 			  int user_alloc)
843 {
844 	int r;
845 
846 	mutex_lock(&kvm->slots_lock);
847 	r = __kvm_set_memory_region(kvm, mem, user_alloc);
848 	mutex_unlock(&kvm->slots_lock);
849 	return r;
850 }
851 EXPORT_SYMBOL_GPL(kvm_set_memory_region);
852 
853 int kvm_vm_ioctl_set_memory_region(struct kvm *kvm,
854 				   struct
855 				   kvm_userspace_memory_region *mem,
856 				   int user_alloc)
857 {
858 	if (mem->slot >= KVM_MEMORY_SLOTS)
859 		return -EINVAL;
860 	return kvm_set_memory_region(kvm, mem, user_alloc);
861 }
862 
863 int kvm_get_dirty_log(struct kvm *kvm,
864 			struct kvm_dirty_log *log, int *is_dirty)
865 {
866 	struct kvm_memory_slot *memslot;
867 	int r, i;
868 	unsigned long n;
869 	unsigned long any = 0;
870 
871 	r = -EINVAL;
872 	if (log->slot >= KVM_MEMORY_SLOTS)
873 		goto out;
874 
875 	memslot = &kvm->memslots->memslots[log->slot];
876 	r = -ENOENT;
877 	if (!memslot->dirty_bitmap)
878 		goto out;
879 
880 	n = kvm_dirty_bitmap_bytes(memslot);
881 
882 	for (i = 0; !any && i < n/sizeof(long); ++i)
883 		any = memslot->dirty_bitmap[i];
884 
885 	r = -EFAULT;
886 	if (copy_to_user(log->dirty_bitmap, memslot->dirty_bitmap, n))
887 		goto out;
888 
889 	if (any)
890 		*is_dirty = 1;
891 
892 	r = 0;
893 out:
894 	return r;
895 }
896 
897 void kvm_disable_largepages(void)
898 {
899 	largepages_enabled = false;
900 }
901 EXPORT_SYMBOL_GPL(kvm_disable_largepages);
902 
903 int is_error_page(struct page *page)
904 {
905 	return page == bad_page || page == hwpoison_page || page == fault_page;
906 }
907 EXPORT_SYMBOL_GPL(is_error_page);
908 
909 int is_error_pfn(pfn_t pfn)
910 {
911 	return pfn == bad_pfn || pfn == hwpoison_pfn || pfn == fault_pfn;
912 }
913 EXPORT_SYMBOL_GPL(is_error_pfn);
914 
915 int is_hwpoison_pfn(pfn_t pfn)
916 {
917 	return pfn == hwpoison_pfn;
918 }
919 EXPORT_SYMBOL_GPL(is_hwpoison_pfn);
920 
921 int is_fault_pfn(pfn_t pfn)
922 {
923 	return pfn == fault_pfn;
924 }
925 EXPORT_SYMBOL_GPL(is_fault_pfn);
926 
927 static inline unsigned long bad_hva(void)
928 {
929 	return PAGE_OFFSET;
930 }
931 
932 int kvm_is_error_hva(unsigned long addr)
933 {
934 	return addr == bad_hva();
935 }
936 EXPORT_SYMBOL_GPL(kvm_is_error_hva);
937 
938 static struct kvm_memory_slot *__gfn_to_memslot(struct kvm_memslots *slots,
939 						gfn_t gfn)
940 {
941 	int i;
942 
943 	for (i = 0; i < slots->nmemslots; ++i) {
944 		struct kvm_memory_slot *memslot = &slots->memslots[i];
945 
946 		if (gfn >= memslot->base_gfn
947 		    && gfn < memslot->base_gfn + memslot->npages)
948 			return memslot;
949 	}
950 	return NULL;
951 }
952 
953 struct kvm_memory_slot *gfn_to_memslot(struct kvm *kvm, gfn_t gfn)
954 {
955 	return __gfn_to_memslot(kvm_memslots(kvm), gfn);
956 }
957 EXPORT_SYMBOL_GPL(gfn_to_memslot);
958 
959 int kvm_is_visible_gfn(struct kvm *kvm, gfn_t gfn)
960 {
961 	int i;
962 	struct kvm_memslots *slots = kvm_memslots(kvm);
963 
964 	for (i = 0; i < KVM_MEMORY_SLOTS; ++i) {
965 		struct kvm_memory_slot *memslot = &slots->memslots[i];
966 
967 		if (memslot->flags & KVM_MEMSLOT_INVALID)
968 			continue;
969 
970 		if (gfn >= memslot->base_gfn
971 		    && gfn < memslot->base_gfn + memslot->npages)
972 			return 1;
973 	}
974 	return 0;
975 }
976 EXPORT_SYMBOL_GPL(kvm_is_visible_gfn);
977 
978 unsigned long kvm_host_page_size(struct kvm *kvm, gfn_t gfn)
979 {
980 	struct vm_area_struct *vma;
981 	unsigned long addr, size;
982 
983 	size = PAGE_SIZE;
984 
985 	addr = gfn_to_hva(kvm, gfn);
986 	if (kvm_is_error_hva(addr))
987 		return PAGE_SIZE;
988 
989 	down_read(&current->mm->mmap_sem);
990 	vma = find_vma(current->mm, addr);
991 	if (!vma)
992 		goto out;
993 
994 	size = vma_kernel_pagesize(vma);
995 
996 out:
997 	up_read(&current->mm->mmap_sem);
998 
999 	return size;
1000 }
1001 
1002 static unsigned long gfn_to_hva_many(struct kvm_memory_slot *slot, gfn_t gfn,
1003 				     gfn_t *nr_pages)
1004 {
1005 	if (!slot || slot->flags & KVM_MEMSLOT_INVALID)
1006 		return bad_hva();
1007 
1008 	if (nr_pages)
1009 		*nr_pages = slot->npages - (gfn - slot->base_gfn);
1010 
1011 	return gfn_to_hva_memslot(slot, gfn);
1012 }
1013 
1014 unsigned long gfn_to_hva(struct kvm *kvm, gfn_t gfn)
1015 {
1016 	return gfn_to_hva_many(gfn_to_memslot(kvm, gfn), gfn, NULL);
1017 }
1018 EXPORT_SYMBOL_GPL(gfn_to_hva);
1019 
1020 static pfn_t get_fault_pfn(void)
1021 {
1022 	get_page(fault_page);
1023 	return fault_pfn;
1024 }
1025 
1026 int get_user_page_nowait(struct task_struct *tsk, struct mm_struct *mm,
1027 	unsigned long start, int write, struct page **page)
1028 {
1029 	int flags = FOLL_TOUCH | FOLL_NOWAIT | FOLL_HWPOISON | FOLL_GET;
1030 
1031 	if (write)
1032 		flags |= FOLL_WRITE;
1033 
1034 	return __get_user_pages(tsk, mm, start, 1, flags, page, NULL, NULL);
1035 }
1036 
1037 static inline int check_user_page_hwpoison(unsigned long addr)
1038 {
1039 	int rc, flags = FOLL_TOUCH | FOLL_HWPOISON | FOLL_WRITE;
1040 
1041 	rc = __get_user_pages(current, current->mm, addr, 1,
1042 			      flags, NULL, NULL, NULL);
1043 	return rc == -EHWPOISON;
1044 }
1045 
1046 static pfn_t hva_to_pfn(struct kvm *kvm, unsigned long addr, bool atomic,
1047 			bool *async, bool write_fault, bool *writable)
1048 {
1049 	struct page *page[1];
1050 	int npages = 0;
1051 	pfn_t pfn;
1052 
1053 	/* we can do it either atomically or asynchronously, not both */
1054 	BUG_ON(atomic && async);
1055 
1056 	BUG_ON(!write_fault && !writable);
1057 
1058 	if (writable)
1059 		*writable = true;
1060 
1061 	if (atomic || async)
1062 		npages = __get_user_pages_fast(addr, 1, 1, page);
1063 
1064 	if (unlikely(npages != 1) && !atomic) {
1065 		might_sleep();
1066 
1067 		if (writable)
1068 			*writable = write_fault;
1069 
1070 		if (async) {
1071 			down_read(&current->mm->mmap_sem);
1072 			npages = get_user_page_nowait(current, current->mm,
1073 						     addr, write_fault, page);
1074 			up_read(&current->mm->mmap_sem);
1075 		} else
1076 			npages = get_user_pages_fast(addr, 1, write_fault,
1077 						     page);
1078 
1079 		/* map read fault as writable if possible */
1080 		if (unlikely(!write_fault) && npages == 1) {
1081 			struct page *wpage[1];
1082 
1083 			npages = __get_user_pages_fast(addr, 1, 1, wpage);
1084 			if (npages == 1) {
1085 				*writable = true;
1086 				put_page(page[0]);
1087 				page[0] = wpage[0];
1088 			}
1089 			npages = 1;
1090 		}
1091 	}
1092 
1093 	if (unlikely(npages != 1)) {
1094 		struct vm_area_struct *vma;
1095 
1096 		if (atomic)
1097 			return get_fault_pfn();
1098 
1099 		down_read(&current->mm->mmap_sem);
1100 		if (npages == -EHWPOISON ||
1101 			(!async && check_user_page_hwpoison(addr))) {
1102 			up_read(&current->mm->mmap_sem);
1103 			get_page(hwpoison_page);
1104 			return page_to_pfn(hwpoison_page);
1105 		}
1106 
1107 		vma = find_vma_intersection(current->mm, addr, addr+1);
1108 
1109 		if (vma == NULL)
1110 			pfn = get_fault_pfn();
1111 		else if ((vma->vm_flags & VM_PFNMAP)) {
1112 			pfn = ((addr - vma->vm_start) >> PAGE_SHIFT) +
1113 				vma->vm_pgoff;
1114 			BUG_ON(!kvm_is_mmio_pfn(pfn));
1115 		} else {
1116 			if (async && (vma->vm_flags & VM_WRITE))
1117 				*async = true;
1118 			pfn = get_fault_pfn();
1119 		}
1120 		up_read(&current->mm->mmap_sem);
1121 	} else
1122 		pfn = page_to_pfn(page[0]);
1123 
1124 	return pfn;
1125 }
1126 
1127 pfn_t hva_to_pfn_atomic(struct kvm *kvm, unsigned long addr)
1128 {
1129 	return hva_to_pfn(kvm, addr, true, NULL, true, NULL);
1130 }
1131 EXPORT_SYMBOL_GPL(hva_to_pfn_atomic);
1132 
1133 static pfn_t __gfn_to_pfn(struct kvm *kvm, gfn_t gfn, bool atomic, bool *async,
1134 			  bool write_fault, bool *writable)
1135 {
1136 	unsigned long addr;
1137 
1138 	if (async)
1139 		*async = false;
1140 
1141 	addr = gfn_to_hva(kvm, gfn);
1142 	if (kvm_is_error_hva(addr)) {
1143 		get_page(bad_page);
1144 		return page_to_pfn(bad_page);
1145 	}
1146 
1147 	return hva_to_pfn(kvm, addr, atomic, async, write_fault, writable);
1148 }
1149 
1150 pfn_t gfn_to_pfn_atomic(struct kvm *kvm, gfn_t gfn)
1151 {
1152 	return __gfn_to_pfn(kvm, gfn, true, NULL, true, NULL);
1153 }
1154 EXPORT_SYMBOL_GPL(gfn_to_pfn_atomic);
1155 
1156 pfn_t gfn_to_pfn_async(struct kvm *kvm, gfn_t gfn, bool *async,
1157 		       bool write_fault, bool *writable)
1158 {
1159 	return __gfn_to_pfn(kvm, gfn, false, async, write_fault, writable);
1160 }
1161 EXPORT_SYMBOL_GPL(gfn_to_pfn_async);
1162 
1163 pfn_t gfn_to_pfn(struct kvm *kvm, gfn_t gfn)
1164 {
1165 	return __gfn_to_pfn(kvm, gfn, false, NULL, true, NULL);
1166 }
1167 EXPORT_SYMBOL_GPL(gfn_to_pfn);
1168 
1169 pfn_t gfn_to_pfn_prot(struct kvm *kvm, gfn_t gfn, bool write_fault,
1170 		      bool *writable)
1171 {
1172 	return __gfn_to_pfn(kvm, gfn, false, NULL, write_fault, writable);
1173 }
1174 EXPORT_SYMBOL_GPL(gfn_to_pfn_prot);
1175 
1176 pfn_t gfn_to_pfn_memslot(struct kvm *kvm,
1177 			 struct kvm_memory_slot *slot, gfn_t gfn)
1178 {
1179 	unsigned long addr = gfn_to_hva_memslot(slot, gfn);
1180 	return hva_to_pfn(kvm, addr, false, NULL, true, NULL);
1181 }
1182 
1183 int gfn_to_page_many_atomic(struct kvm *kvm, gfn_t gfn, struct page **pages,
1184 								  int nr_pages)
1185 {
1186 	unsigned long addr;
1187 	gfn_t entry;
1188 
1189 	addr = gfn_to_hva_many(gfn_to_memslot(kvm, gfn), gfn, &entry);
1190 	if (kvm_is_error_hva(addr))
1191 		return -1;
1192 
1193 	if (entry < nr_pages)
1194 		return 0;
1195 
1196 	return __get_user_pages_fast(addr, nr_pages, 1, pages);
1197 }
1198 EXPORT_SYMBOL_GPL(gfn_to_page_many_atomic);
1199 
1200 struct page *gfn_to_page(struct kvm *kvm, gfn_t gfn)
1201 {
1202 	pfn_t pfn;
1203 
1204 	pfn = gfn_to_pfn(kvm, gfn);
1205 	if (!kvm_is_mmio_pfn(pfn))
1206 		return pfn_to_page(pfn);
1207 
1208 	WARN_ON(kvm_is_mmio_pfn(pfn));
1209 
1210 	get_page(bad_page);
1211 	return bad_page;
1212 }
1213 
1214 EXPORT_SYMBOL_GPL(gfn_to_page);
1215 
1216 void kvm_release_page_clean(struct page *page)
1217 {
1218 	kvm_release_pfn_clean(page_to_pfn(page));
1219 }
1220 EXPORT_SYMBOL_GPL(kvm_release_page_clean);
1221 
1222 void kvm_release_pfn_clean(pfn_t pfn)
1223 {
1224 	if (!kvm_is_mmio_pfn(pfn))
1225 		put_page(pfn_to_page(pfn));
1226 }
1227 EXPORT_SYMBOL_GPL(kvm_release_pfn_clean);
1228 
1229 void kvm_release_page_dirty(struct page *page)
1230 {
1231 	kvm_release_pfn_dirty(page_to_pfn(page));
1232 }
1233 EXPORT_SYMBOL_GPL(kvm_release_page_dirty);
1234 
1235 void kvm_release_pfn_dirty(pfn_t pfn)
1236 {
1237 	kvm_set_pfn_dirty(pfn);
1238 	kvm_release_pfn_clean(pfn);
1239 }
1240 EXPORT_SYMBOL_GPL(kvm_release_pfn_dirty);
1241 
1242 void kvm_set_page_dirty(struct page *page)
1243 {
1244 	kvm_set_pfn_dirty(page_to_pfn(page));
1245 }
1246 EXPORT_SYMBOL_GPL(kvm_set_page_dirty);
1247 
1248 void kvm_set_pfn_dirty(pfn_t pfn)
1249 {
1250 	if (!kvm_is_mmio_pfn(pfn)) {
1251 		struct page *page = pfn_to_page(pfn);
1252 		if (!PageReserved(page))
1253 			SetPageDirty(page);
1254 	}
1255 }
1256 EXPORT_SYMBOL_GPL(kvm_set_pfn_dirty);
1257 
1258 void kvm_set_pfn_accessed(pfn_t pfn)
1259 {
1260 	if (!kvm_is_mmio_pfn(pfn))
1261 		mark_page_accessed(pfn_to_page(pfn));
1262 }
1263 EXPORT_SYMBOL_GPL(kvm_set_pfn_accessed);
1264 
1265 void kvm_get_pfn(pfn_t pfn)
1266 {
1267 	if (!kvm_is_mmio_pfn(pfn))
1268 		get_page(pfn_to_page(pfn));
1269 }
1270 EXPORT_SYMBOL_GPL(kvm_get_pfn);
1271 
1272 static int next_segment(unsigned long len, int offset)
1273 {
1274 	if (len > PAGE_SIZE - offset)
1275 		return PAGE_SIZE - offset;
1276 	else
1277 		return len;
1278 }
1279 
1280 int kvm_read_guest_page(struct kvm *kvm, gfn_t gfn, void *data, int offset,
1281 			int len)
1282 {
1283 	int r;
1284 	unsigned long addr;
1285 
1286 	addr = gfn_to_hva(kvm, gfn);
1287 	if (kvm_is_error_hva(addr))
1288 		return -EFAULT;
1289 	r = __copy_from_user(data, (void __user *)addr + offset, len);
1290 	if (r)
1291 		return -EFAULT;
1292 	return 0;
1293 }
1294 EXPORT_SYMBOL_GPL(kvm_read_guest_page);
1295 
1296 int kvm_read_guest(struct kvm *kvm, gpa_t gpa, void *data, unsigned long len)
1297 {
1298 	gfn_t gfn = gpa >> PAGE_SHIFT;
1299 	int seg;
1300 	int offset = offset_in_page(gpa);
1301 	int ret;
1302 
1303 	while ((seg = next_segment(len, offset)) != 0) {
1304 		ret = kvm_read_guest_page(kvm, gfn, data, offset, seg);
1305 		if (ret < 0)
1306 			return ret;
1307 		offset = 0;
1308 		len -= seg;
1309 		data += seg;
1310 		++gfn;
1311 	}
1312 	return 0;
1313 }
1314 EXPORT_SYMBOL_GPL(kvm_read_guest);
1315 
1316 int kvm_read_guest_atomic(struct kvm *kvm, gpa_t gpa, void *data,
1317 			  unsigned long len)
1318 {
1319 	int r;
1320 	unsigned long addr;
1321 	gfn_t gfn = gpa >> PAGE_SHIFT;
1322 	int offset = offset_in_page(gpa);
1323 
1324 	addr = gfn_to_hva(kvm, gfn);
1325 	if (kvm_is_error_hva(addr))
1326 		return -EFAULT;
1327 	pagefault_disable();
1328 	r = __copy_from_user_inatomic(data, (void __user *)addr + offset, len);
1329 	pagefault_enable();
1330 	if (r)
1331 		return -EFAULT;
1332 	return 0;
1333 }
1334 EXPORT_SYMBOL(kvm_read_guest_atomic);
1335 
1336 int kvm_write_guest_page(struct kvm *kvm, gfn_t gfn, const void *data,
1337 			 int offset, int len)
1338 {
1339 	int r;
1340 	unsigned long addr;
1341 
1342 	addr = gfn_to_hva(kvm, gfn);
1343 	if (kvm_is_error_hva(addr))
1344 		return -EFAULT;
1345 	r = copy_to_user((void __user *)addr + offset, data, len);
1346 	if (r)
1347 		return -EFAULT;
1348 	mark_page_dirty(kvm, gfn);
1349 	return 0;
1350 }
1351 EXPORT_SYMBOL_GPL(kvm_write_guest_page);
1352 
1353 int kvm_write_guest(struct kvm *kvm, gpa_t gpa, const void *data,
1354 		    unsigned long len)
1355 {
1356 	gfn_t gfn = gpa >> PAGE_SHIFT;
1357 	int seg;
1358 	int offset = offset_in_page(gpa);
1359 	int ret;
1360 
1361 	while ((seg = next_segment(len, offset)) != 0) {
1362 		ret = kvm_write_guest_page(kvm, gfn, data, offset, seg);
1363 		if (ret < 0)
1364 			return ret;
1365 		offset = 0;
1366 		len -= seg;
1367 		data += seg;
1368 		++gfn;
1369 	}
1370 	return 0;
1371 }
1372 
1373 int kvm_gfn_to_hva_cache_init(struct kvm *kvm, struct gfn_to_hva_cache *ghc,
1374 			      gpa_t gpa)
1375 {
1376 	struct kvm_memslots *slots = kvm_memslots(kvm);
1377 	int offset = offset_in_page(gpa);
1378 	gfn_t gfn = gpa >> PAGE_SHIFT;
1379 
1380 	ghc->gpa = gpa;
1381 	ghc->generation = slots->generation;
1382 	ghc->memslot = __gfn_to_memslot(slots, gfn);
1383 	ghc->hva = gfn_to_hva_many(ghc->memslot, gfn, NULL);
1384 	if (!kvm_is_error_hva(ghc->hva))
1385 		ghc->hva += offset;
1386 	else
1387 		return -EFAULT;
1388 
1389 	return 0;
1390 }
1391 EXPORT_SYMBOL_GPL(kvm_gfn_to_hva_cache_init);
1392 
1393 int kvm_write_guest_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc,
1394 			   void *data, unsigned long len)
1395 {
1396 	struct kvm_memslots *slots = kvm_memslots(kvm);
1397 	int r;
1398 
1399 	if (slots->generation != ghc->generation)
1400 		kvm_gfn_to_hva_cache_init(kvm, ghc, ghc->gpa);
1401 
1402 	if (kvm_is_error_hva(ghc->hva))
1403 		return -EFAULT;
1404 
1405 	r = copy_to_user((void __user *)ghc->hva, data, len);
1406 	if (r)
1407 		return -EFAULT;
1408 	mark_page_dirty_in_slot(kvm, ghc->memslot, ghc->gpa >> PAGE_SHIFT);
1409 
1410 	return 0;
1411 }
1412 EXPORT_SYMBOL_GPL(kvm_write_guest_cached);
1413 
1414 int kvm_clear_guest_page(struct kvm *kvm, gfn_t gfn, int offset, int len)
1415 {
1416 	return kvm_write_guest_page(kvm, gfn, (const void *) empty_zero_page,
1417 				    offset, len);
1418 }
1419 EXPORT_SYMBOL_GPL(kvm_clear_guest_page);
1420 
1421 int kvm_clear_guest(struct kvm *kvm, gpa_t gpa, unsigned long len)
1422 {
1423 	gfn_t gfn = gpa >> PAGE_SHIFT;
1424 	int seg;
1425 	int offset = offset_in_page(gpa);
1426 	int ret;
1427 
1428         while ((seg = next_segment(len, offset)) != 0) {
1429 		ret = kvm_clear_guest_page(kvm, gfn, offset, seg);
1430 		if (ret < 0)
1431 			return ret;
1432 		offset = 0;
1433 		len -= seg;
1434 		++gfn;
1435 	}
1436 	return 0;
1437 }
1438 EXPORT_SYMBOL_GPL(kvm_clear_guest);
1439 
1440 void mark_page_dirty_in_slot(struct kvm *kvm, struct kvm_memory_slot *memslot,
1441 			     gfn_t gfn)
1442 {
1443 	if (memslot && memslot->dirty_bitmap) {
1444 		unsigned long rel_gfn = gfn - memslot->base_gfn;
1445 
1446 		__set_bit_le(rel_gfn, memslot->dirty_bitmap);
1447 	}
1448 }
1449 
1450 void mark_page_dirty(struct kvm *kvm, gfn_t gfn)
1451 {
1452 	struct kvm_memory_slot *memslot;
1453 
1454 	memslot = gfn_to_memslot(kvm, gfn);
1455 	mark_page_dirty_in_slot(kvm, memslot, gfn);
1456 }
1457 
1458 /*
1459  * The vCPU has executed a HLT instruction with in-kernel mode enabled.
1460  */
1461 void kvm_vcpu_block(struct kvm_vcpu *vcpu)
1462 {
1463 	DEFINE_WAIT(wait);
1464 
1465 	for (;;) {
1466 		prepare_to_wait(&vcpu->wq, &wait, TASK_INTERRUPTIBLE);
1467 
1468 		if (kvm_arch_vcpu_runnable(vcpu)) {
1469 			kvm_make_request(KVM_REQ_UNHALT, vcpu);
1470 			break;
1471 		}
1472 		if (kvm_cpu_has_pending_timer(vcpu))
1473 			break;
1474 		if (signal_pending(current))
1475 			break;
1476 
1477 		schedule();
1478 	}
1479 
1480 	finish_wait(&vcpu->wq, &wait);
1481 }
1482 
1483 void kvm_resched(struct kvm_vcpu *vcpu)
1484 {
1485 	if (!need_resched())
1486 		return;
1487 	cond_resched();
1488 }
1489 EXPORT_SYMBOL_GPL(kvm_resched);
1490 
1491 void kvm_vcpu_on_spin(struct kvm_vcpu *me)
1492 {
1493 	struct kvm *kvm = me->kvm;
1494 	struct kvm_vcpu *vcpu;
1495 	int last_boosted_vcpu = me->kvm->last_boosted_vcpu;
1496 	int yielded = 0;
1497 	int pass;
1498 	int i;
1499 
1500 	/*
1501 	 * We boost the priority of a VCPU that is runnable but not
1502 	 * currently running, because it got preempted by something
1503 	 * else and called schedule in __vcpu_run.  Hopefully that
1504 	 * VCPU is holding the lock that we need and will release it.
1505 	 * We approximate round-robin by starting at the last boosted VCPU.
1506 	 */
1507 	for (pass = 0; pass < 2 && !yielded; pass++) {
1508 		kvm_for_each_vcpu(i, vcpu, kvm) {
1509 			struct task_struct *task = NULL;
1510 			struct pid *pid;
1511 			if (!pass && i < last_boosted_vcpu) {
1512 				i = last_boosted_vcpu;
1513 				continue;
1514 			} else if (pass && i > last_boosted_vcpu)
1515 				break;
1516 			if (vcpu == me)
1517 				continue;
1518 			if (waitqueue_active(&vcpu->wq))
1519 				continue;
1520 			rcu_read_lock();
1521 			pid = rcu_dereference(vcpu->pid);
1522 			if (pid)
1523 				task = get_pid_task(vcpu->pid, PIDTYPE_PID);
1524 			rcu_read_unlock();
1525 			if (!task)
1526 				continue;
1527 			if (task->flags & PF_VCPU) {
1528 				put_task_struct(task);
1529 				continue;
1530 			}
1531 			if (yield_to(task, 1)) {
1532 				put_task_struct(task);
1533 				kvm->last_boosted_vcpu = i;
1534 				yielded = 1;
1535 				break;
1536 			}
1537 			put_task_struct(task);
1538 		}
1539 	}
1540 }
1541 EXPORT_SYMBOL_GPL(kvm_vcpu_on_spin);
1542 
1543 static int kvm_vcpu_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
1544 {
1545 	struct kvm_vcpu *vcpu = vma->vm_file->private_data;
1546 	struct page *page;
1547 
1548 	if (vmf->pgoff == 0)
1549 		page = virt_to_page(vcpu->run);
1550 #ifdef CONFIG_X86
1551 	else if (vmf->pgoff == KVM_PIO_PAGE_OFFSET)
1552 		page = virt_to_page(vcpu->arch.pio_data);
1553 #endif
1554 #ifdef KVM_COALESCED_MMIO_PAGE_OFFSET
1555 	else if (vmf->pgoff == KVM_COALESCED_MMIO_PAGE_OFFSET)
1556 		page = virt_to_page(vcpu->kvm->coalesced_mmio_ring);
1557 #endif
1558 	else
1559 		return VM_FAULT_SIGBUS;
1560 	get_page(page);
1561 	vmf->page = page;
1562 	return 0;
1563 }
1564 
1565 static const struct vm_operations_struct kvm_vcpu_vm_ops = {
1566 	.fault = kvm_vcpu_fault,
1567 };
1568 
1569 static int kvm_vcpu_mmap(struct file *file, struct vm_area_struct *vma)
1570 {
1571 	vma->vm_ops = &kvm_vcpu_vm_ops;
1572 	return 0;
1573 }
1574 
1575 static int kvm_vcpu_release(struct inode *inode, struct file *filp)
1576 {
1577 	struct kvm_vcpu *vcpu = filp->private_data;
1578 
1579 	kvm_put_kvm(vcpu->kvm);
1580 	return 0;
1581 }
1582 
1583 static struct file_operations kvm_vcpu_fops = {
1584 	.release        = kvm_vcpu_release,
1585 	.unlocked_ioctl = kvm_vcpu_ioctl,
1586 	.compat_ioctl   = kvm_vcpu_ioctl,
1587 	.mmap           = kvm_vcpu_mmap,
1588 	.llseek		= noop_llseek,
1589 };
1590 
1591 /*
1592  * Allocates an inode for the vcpu.
1593  */
1594 static int create_vcpu_fd(struct kvm_vcpu *vcpu)
1595 {
1596 	return anon_inode_getfd("kvm-vcpu", &kvm_vcpu_fops, vcpu, O_RDWR);
1597 }
1598 
1599 /*
1600  * Creates some virtual cpus.  Good luck creating more than one.
1601  */
1602 static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, u32 id)
1603 {
1604 	int r;
1605 	struct kvm_vcpu *vcpu, *v;
1606 
1607 	vcpu = kvm_arch_vcpu_create(kvm, id);
1608 	if (IS_ERR(vcpu))
1609 		return PTR_ERR(vcpu);
1610 
1611 	preempt_notifier_init(&vcpu->preempt_notifier, &kvm_preempt_ops);
1612 
1613 	r = kvm_arch_vcpu_setup(vcpu);
1614 	if (r)
1615 		return r;
1616 
1617 	mutex_lock(&kvm->lock);
1618 	if (atomic_read(&kvm->online_vcpus) == KVM_MAX_VCPUS) {
1619 		r = -EINVAL;
1620 		goto vcpu_destroy;
1621 	}
1622 
1623 	kvm_for_each_vcpu(r, v, kvm)
1624 		if (v->vcpu_id == id) {
1625 			r = -EEXIST;
1626 			goto vcpu_destroy;
1627 		}
1628 
1629 	BUG_ON(kvm->vcpus[atomic_read(&kvm->online_vcpus)]);
1630 
1631 	/* Now it's all set up, let userspace reach it */
1632 	kvm_get_kvm(kvm);
1633 	r = create_vcpu_fd(vcpu);
1634 	if (r < 0) {
1635 		kvm_put_kvm(kvm);
1636 		goto vcpu_destroy;
1637 	}
1638 
1639 	kvm->vcpus[atomic_read(&kvm->online_vcpus)] = vcpu;
1640 	smp_wmb();
1641 	atomic_inc(&kvm->online_vcpus);
1642 
1643 #ifdef CONFIG_KVM_APIC_ARCHITECTURE
1644 	if (kvm->bsp_vcpu_id == id)
1645 		kvm->bsp_vcpu = vcpu;
1646 #endif
1647 	mutex_unlock(&kvm->lock);
1648 	return r;
1649 
1650 vcpu_destroy:
1651 	mutex_unlock(&kvm->lock);
1652 	kvm_arch_vcpu_destroy(vcpu);
1653 	return r;
1654 }
1655 
1656 static int kvm_vcpu_ioctl_set_sigmask(struct kvm_vcpu *vcpu, sigset_t *sigset)
1657 {
1658 	if (sigset) {
1659 		sigdelsetmask(sigset, sigmask(SIGKILL)|sigmask(SIGSTOP));
1660 		vcpu->sigset_active = 1;
1661 		vcpu->sigset = *sigset;
1662 	} else
1663 		vcpu->sigset_active = 0;
1664 	return 0;
1665 }
1666 
1667 static long kvm_vcpu_ioctl(struct file *filp,
1668 			   unsigned int ioctl, unsigned long arg)
1669 {
1670 	struct kvm_vcpu *vcpu = filp->private_data;
1671 	void __user *argp = (void __user *)arg;
1672 	int r;
1673 	struct kvm_fpu *fpu = NULL;
1674 	struct kvm_sregs *kvm_sregs = NULL;
1675 
1676 	if (vcpu->kvm->mm != current->mm)
1677 		return -EIO;
1678 
1679 #if defined(CONFIG_S390) || defined(CONFIG_PPC)
1680 	/*
1681 	 * Special cases: vcpu ioctls that are asynchronous to vcpu execution,
1682 	 * so vcpu_load() would break it.
1683 	 */
1684 	if (ioctl == KVM_S390_INTERRUPT || ioctl == KVM_INTERRUPT)
1685 		return kvm_arch_vcpu_ioctl(filp, ioctl, arg);
1686 #endif
1687 
1688 
1689 	vcpu_load(vcpu);
1690 	switch (ioctl) {
1691 	case KVM_RUN:
1692 		r = -EINVAL;
1693 		if (arg)
1694 			goto out;
1695 		r = kvm_arch_vcpu_ioctl_run(vcpu, vcpu->run);
1696 		trace_kvm_userspace_exit(vcpu->run->exit_reason, r);
1697 		break;
1698 	case KVM_GET_REGS: {
1699 		struct kvm_regs *kvm_regs;
1700 
1701 		r = -ENOMEM;
1702 		kvm_regs = kzalloc(sizeof(struct kvm_regs), GFP_KERNEL);
1703 		if (!kvm_regs)
1704 			goto out;
1705 		r = kvm_arch_vcpu_ioctl_get_regs(vcpu, kvm_regs);
1706 		if (r)
1707 			goto out_free1;
1708 		r = -EFAULT;
1709 		if (copy_to_user(argp, kvm_regs, sizeof(struct kvm_regs)))
1710 			goto out_free1;
1711 		r = 0;
1712 out_free1:
1713 		kfree(kvm_regs);
1714 		break;
1715 	}
1716 	case KVM_SET_REGS: {
1717 		struct kvm_regs *kvm_regs;
1718 
1719 		r = -ENOMEM;
1720 		kvm_regs = kzalloc(sizeof(struct kvm_regs), GFP_KERNEL);
1721 		if (!kvm_regs)
1722 			goto out;
1723 		r = -EFAULT;
1724 		if (copy_from_user(kvm_regs, argp, sizeof(struct kvm_regs)))
1725 			goto out_free2;
1726 		r = kvm_arch_vcpu_ioctl_set_regs(vcpu, kvm_regs);
1727 		if (r)
1728 			goto out_free2;
1729 		r = 0;
1730 out_free2:
1731 		kfree(kvm_regs);
1732 		break;
1733 	}
1734 	case KVM_GET_SREGS: {
1735 		kvm_sregs = kzalloc(sizeof(struct kvm_sregs), GFP_KERNEL);
1736 		r = -ENOMEM;
1737 		if (!kvm_sregs)
1738 			goto out;
1739 		r = kvm_arch_vcpu_ioctl_get_sregs(vcpu, kvm_sregs);
1740 		if (r)
1741 			goto out;
1742 		r = -EFAULT;
1743 		if (copy_to_user(argp, kvm_sregs, sizeof(struct kvm_sregs)))
1744 			goto out;
1745 		r = 0;
1746 		break;
1747 	}
1748 	case KVM_SET_SREGS: {
1749 		kvm_sregs = kmalloc(sizeof(struct kvm_sregs), GFP_KERNEL);
1750 		r = -ENOMEM;
1751 		if (!kvm_sregs)
1752 			goto out;
1753 		r = -EFAULT;
1754 		if (copy_from_user(kvm_sregs, argp, sizeof(struct kvm_sregs)))
1755 			goto out;
1756 		r = kvm_arch_vcpu_ioctl_set_sregs(vcpu, kvm_sregs);
1757 		if (r)
1758 			goto out;
1759 		r = 0;
1760 		break;
1761 	}
1762 	case KVM_GET_MP_STATE: {
1763 		struct kvm_mp_state mp_state;
1764 
1765 		r = kvm_arch_vcpu_ioctl_get_mpstate(vcpu, &mp_state);
1766 		if (r)
1767 			goto out;
1768 		r = -EFAULT;
1769 		if (copy_to_user(argp, &mp_state, sizeof mp_state))
1770 			goto out;
1771 		r = 0;
1772 		break;
1773 	}
1774 	case KVM_SET_MP_STATE: {
1775 		struct kvm_mp_state mp_state;
1776 
1777 		r = -EFAULT;
1778 		if (copy_from_user(&mp_state, argp, sizeof mp_state))
1779 			goto out;
1780 		r = kvm_arch_vcpu_ioctl_set_mpstate(vcpu, &mp_state);
1781 		if (r)
1782 			goto out;
1783 		r = 0;
1784 		break;
1785 	}
1786 	case KVM_TRANSLATE: {
1787 		struct kvm_translation tr;
1788 
1789 		r = -EFAULT;
1790 		if (copy_from_user(&tr, argp, sizeof tr))
1791 			goto out;
1792 		r = kvm_arch_vcpu_ioctl_translate(vcpu, &tr);
1793 		if (r)
1794 			goto out;
1795 		r = -EFAULT;
1796 		if (copy_to_user(argp, &tr, sizeof tr))
1797 			goto out;
1798 		r = 0;
1799 		break;
1800 	}
1801 	case KVM_SET_GUEST_DEBUG: {
1802 		struct kvm_guest_debug dbg;
1803 
1804 		r = -EFAULT;
1805 		if (copy_from_user(&dbg, argp, sizeof dbg))
1806 			goto out;
1807 		r = kvm_arch_vcpu_ioctl_set_guest_debug(vcpu, &dbg);
1808 		if (r)
1809 			goto out;
1810 		r = 0;
1811 		break;
1812 	}
1813 	case KVM_SET_SIGNAL_MASK: {
1814 		struct kvm_signal_mask __user *sigmask_arg = argp;
1815 		struct kvm_signal_mask kvm_sigmask;
1816 		sigset_t sigset, *p;
1817 
1818 		p = NULL;
1819 		if (argp) {
1820 			r = -EFAULT;
1821 			if (copy_from_user(&kvm_sigmask, argp,
1822 					   sizeof kvm_sigmask))
1823 				goto out;
1824 			r = -EINVAL;
1825 			if (kvm_sigmask.len != sizeof sigset)
1826 				goto out;
1827 			r = -EFAULT;
1828 			if (copy_from_user(&sigset, sigmask_arg->sigset,
1829 					   sizeof sigset))
1830 				goto out;
1831 			p = &sigset;
1832 		}
1833 		r = kvm_vcpu_ioctl_set_sigmask(vcpu, p);
1834 		break;
1835 	}
1836 	case KVM_GET_FPU: {
1837 		fpu = kzalloc(sizeof(struct kvm_fpu), GFP_KERNEL);
1838 		r = -ENOMEM;
1839 		if (!fpu)
1840 			goto out;
1841 		r = kvm_arch_vcpu_ioctl_get_fpu(vcpu, fpu);
1842 		if (r)
1843 			goto out;
1844 		r = -EFAULT;
1845 		if (copy_to_user(argp, fpu, sizeof(struct kvm_fpu)))
1846 			goto out;
1847 		r = 0;
1848 		break;
1849 	}
1850 	case KVM_SET_FPU: {
1851 		fpu = kmalloc(sizeof(struct kvm_fpu), GFP_KERNEL);
1852 		r = -ENOMEM;
1853 		if (!fpu)
1854 			goto out;
1855 		r = -EFAULT;
1856 		if (copy_from_user(fpu, argp, sizeof(struct kvm_fpu)))
1857 			goto out;
1858 		r = kvm_arch_vcpu_ioctl_set_fpu(vcpu, fpu);
1859 		if (r)
1860 			goto out;
1861 		r = 0;
1862 		break;
1863 	}
1864 	default:
1865 		r = kvm_arch_vcpu_ioctl(filp, ioctl, arg);
1866 	}
1867 out:
1868 	vcpu_put(vcpu);
1869 	kfree(fpu);
1870 	kfree(kvm_sregs);
1871 	return r;
1872 }
1873 
1874 static long kvm_vm_ioctl(struct file *filp,
1875 			   unsigned int ioctl, unsigned long arg)
1876 {
1877 	struct kvm *kvm = filp->private_data;
1878 	void __user *argp = (void __user *)arg;
1879 	int r;
1880 
1881 	if (kvm->mm != current->mm)
1882 		return -EIO;
1883 	switch (ioctl) {
1884 	case KVM_CREATE_VCPU:
1885 		r = kvm_vm_ioctl_create_vcpu(kvm, arg);
1886 		if (r < 0)
1887 			goto out;
1888 		break;
1889 	case KVM_SET_USER_MEMORY_REGION: {
1890 		struct kvm_userspace_memory_region kvm_userspace_mem;
1891 
1892 		r = -EFAULT;
1893 		if (copy_from_user(&kvm_userspace_mem, argp,
1894 						sizeof kvm_userspace_mem))
1895 			goto out;
1896 
1897 		r = kvm_vm_ioctl_set_memory_region(kvm, &kvm_userspace_mem, 1);
1898 		if (r)
1899 			goto out;
1900 		break;
1901 	}
1902 	case KVM_GET_DIRTY_LOG: {
1903 		struct kvm_dirty_log log;
1904 
1905 		r = -EFAULT;
1906 		if (copy_from_user(&log, argp, sizeof log))
1907 			goto out;
1908 		r = kvm_vm_ioctl_get_dirty_log(kvm, &log);
1909 		if (r)
1910 			goto out;
1911 		break;
1912 	}
1913 #ifdef KVM_COALESCED_MMIO_PAGE_OFFSET
1914 	case KVM_REGISTER_COALESCED_MMIO: {
1915 		struct kvm_coalesced_mmio_zone zone;
1916 		r = -EFAULT;
1917 		if (copy_from_user(&zone, argp, sizeof zone))
1918 			goto out;
1919 		r = kvm_vm_ioctl_register_coalesced_mmio(kvm, &zone);
1920 		if (r)
1921 			goto out;
1922 		r = 0;
1923 		break;
1924 	}
1925 	case KVM_UNREGISTER_COALESCED_MMIO: {
1926 		struct kvm_coalesced_mmio_zone zone;
1927 		r = -EFAULT;
1928 		if (copy_from_user(&zone, argp, sizeof zone))
1929 			goto out;
1930 		r = kvm_vm_ioctl_unregister_coalesced_mmio(kvm, &zone);
1931 		if (r)
1932 			goto out;
1933 		r = 0;
1934 		break;
1935 	}
1936 #endif
1937 	case KVM_IRQFD: {
1938 		struct kvm_irqfd data;
1939 
1940 		r = -EFAULT;
1941 		if (copy_from_user(&data, argp, sizeof data))
1942 			goto out;
1943 		r = kvm_irqfd(kvm, data.fd, data.gsi, data.flags);
1944 		break;
1945 	}
1946 	case KVM_IOEVENTFD: {
1947 		struct kvm_ioeventfd data;
1948 
1949 		r = -EFAULT;
1950 		if (copy_from_user(&data, argp, sizeof data))
1951 			goto out;
1952 		r = kvm_ioeventfd(kvm, &data);
1953 		break;
1954 	}
1955 #ifdef CONFIG_KVM_APIC_ARCHITECTURE
1956 	case KVM_SET_BOOT_CPU_ID:
1957 		r = 0;
1958 		mutex_lock(&kvm->lock);
1959 		if (atomic_read(&kvm->online_vcpus) != 0)
1960 			r = -EBUSY;
1961 		else
1962 			kvm->bsp_vcpu_id = arg;
1963 		mutex_unlock(&kvm->lock);
1964 		break;
1965 #endif
1966 	default:
1967 		r = kvm_arch_vm_ioctl(filp, ioctl, arg);
1968 		if (r == -ENOTTY)
1969 			r = kvm_vm_ioctl_assigned_device(kvm, ioctl, arg);
1970 	}
1971 out:
1972 	return r;
1973 }
1974 
1975 #ifdef CONFIG_COMPAT
1976 struct compat_kvm_dirty_log {
1977 	__u32 slot;
1978 	__u32 padding1;
1979 	union {
1980 		compat_uptr_t dirty_bitmap; /* one bit per page */
1981 		__u64 padding2;
1982 	};
1983 };
1984 
1985 static long kvm_vm_compat_ioctl(struct file *filp,
1986 			   unsigned int ioctl, unsigned long arg)
1987 {
1988 	struct kvm *kvm = filp->private_data;
1989 	int r;
1990 
1991 	if (kvm->mm != current->mm)
1992 		return -EIO;
1993 	switch (ioctl) {
1994 	case KVM_GET_DIRTY_LOG: {
1995 		struct compat_kvm_dirty_log compat_log;
1996 		struct kvm_dirty_log log;
1997 
1998 		r = -EFAULT;
1999 		if (copy_from_user(&compat_log, (void __user *)arg,
2000 				   sizeof(compat_log)))
2001 			goto out;
2002 		log.slot	 = compat_log.slot;
2003 		log.padding1	 = compat_log.padding1;
2004 		log.padding2	 = compat_log.padding2;
2005 		log.dirty_bitmap = compat_ptr(compat_log.dirty_bitmap);
2006 
2007 		r = kvm_vm_ioctl_get_dirty_log(kvm, &log);
2008 		if (r)
2009 			goto out;
2010 		break;
2011 	}
2012 	default:
2013 		r = kvm_vm_ioctl(filp, ioctl, arg);
2014 	}
2015 
2016 out:
2017 	return r;
2018 }
2019 #endif
2020 
2021 static int kvm_vm_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
2022 {
2023 	struct page *page[1];
2024 	unsigned long addr;
2025 	int npages;
2026 	gfn_t gfn = vmf->pgoff;
2027 	struct kvm *kvm = vma->vm_file->private_data;
2028 
2029 	addr = gfn_to_hva(kvm, gfn);
2030 	if (kvm_is_error_hva(addr))
2031 		return VM_FAULT_SIGBUS;
2032 
2033 	npages = get_user_pages(current, current->mm, addr, 1, 1, 0, page,
2034 				NULL);
2035 	if (unlikely(npages != 1))
2036 		return VM_FAULT_SIGBUS;
2037 
2038 	vmf->page = page[0];
2039 	return 0;
2040 }
2041 
2042 static const struct vm_operations_struct kvm_vm_vm_ops = {
2043 	.fault = kvm_vm_fault,
2044 };
2045 
2046 static int kvm_vm_mmap(struct file *file, struct vm_area_struct *vma)
2047 {
2048 	vma->vm_ops = &kvm_vm_vm_ops;
2049 	return 0;
2050 }
2051 
2052 static struct file_operations kvm_vm_fops = {
2053 	.release        = kvm_vm_release,
2054 	.unlocked_ioctl = kvm_vm_ioctl,
2055 #ifdef CONFIG_COMPAT
2056 	.compat_ioctl   = kvm_vm_compat_ioctl,
2057 #endif
2058 	.mmap           = kvm_vm_mmap,
2059 	.llseek		= noop_llseek,
2060 };
2061 
2062 static int kvm_dev_ioctl_create_vm(void)
2063 {
2064 	int r;
2065 	struct kvm *kvm;
2066 
2067 	kvm = kvm_create_vm();
2068 	if (IS_ERR(kvm))
2069 		return PTR_ERR(kvm);
2070 #ifdef KVM_COALESCED_MMIO_PAGE_OFFSET
2071 	r = kvm_coalesced_mmio_init(kvm);
2072 	if (r < 0) {
2073 		kvm_put_kvm(kvm);
2074 		return r;
2075 	}
2076 #endif
2077 	r = anon_inode_getfd("kvm-vm", &kvm_vm_fops, kvm, O_RDWR);
2078 	if (r < 0)
2079 		kvm_put_kvm(kvm);
2080 
2081 	return r;
2082 }
2083 
2084 static long kvm_dev_ioctl_check_extension_generic(long arg)
2085 {
2086 	switch (arg) {
2087 	case KVM_CAP_USER_MEMORY:
2088 	case KVM_CAP_DESTROY_MEMORY_REGION_WORKS:
2089 	case KVM_CAP_JOIN_MEMORY_REGIONS_WORKS:
2090 #ifdef CONFIG_KVM_APIC_ARCHITECTURE
2091 	case KVM_CAP_SET_BOOT_CPU_ID:
2092 #endif
2093 	case KVM_CAP_INTERNAL_ERROR_DATA:
2094 		return 1;
2095 #ifdef CONFIG_HAVE_KVM_IRQCHIP
2096 	case KVM_CAP_IRQ_ROUTING:
2097 		return KVM_MAX_IRQ_ROUTES;
2098 #endif
2099 	default:
2100 		break;
2101 	}
2102 	return kvm_dev_ioctl_check_extension(arg);
2103 }
2104 
2105 static long kvm_dev_ioctl(struct file *filp,
2106 			  unsigned int ioctl, unsigned long arg)
2107 {
2108 	long r = -EINVAL;
2109 
2110 	switch (ioctl) {
2111 	case KVM_GET_API_VERSION:
2112 		r = -EINVAL;
2113 		if (arg)
2114 			goto out;
2115 		r = KVM_API_VERSION;
2116 		break;
2117 	case KVM_CREATE_VM:
2118 		r = -EINVAL;
2119 		if (arg)
2120 			goto out;
2121 		r = kvm_dev_ioctl_create_vm();
2122 		break;
2123 	case KVM_CHECK_EXTENSION:
2124 		r = kvm_dev_ioctl_check_extension_generic(arg);
2125 		break;
2126 	case KVM_GET_VCPU_MMAP_SIZE:
2127 		r = -EINVAL;
2128 		if (arg)
2129 			goto out;
2130 		r = PAGE_SIZE;     /* struct kvm_run */
2131 #ifdef CONFIG_X86
2132 		r += PAGE_SIZE;    /* pio data page */
2133 #endif
2134 #ifdef KVM_COALESCED_MMIO_PAGE_OFFSET
2135 		r += PAGE_SIZE;    /* coalesced mmio ring page */
2136 #endif
2137 		break;
2138 	case KVM_TRACE_ENABLE:
2139 	case KVM_TRACE_PAUSE:
2140 	case KVM_TRACE_DISABLE:
2141 		r = -EOPNOTSUPP;
2142 		break;
2143 	default:
2144 		return kvm_arch_dev_ioctl(filp, ioctl, arg);
2145 	}
2146 out:
2147 	return r;
2148 }
2149 
2150 static struct file_operations kvm_chardev_ops = {
2151 	.unlocked_ioctl = kvm_dev_ioctl,
2152 	.compat_ioctl   = kvm_dev_ioctl,
2153 	.llseek		= noop_llseek,
2154 };
2155 
2156 static struct miscdevice kvm_dev = {
2157 	KVM_MINOR,
2158 	"kvm",
2159 	&kvm_chardev_ops,
2160 };
2161 
2162 static void hardware_enable_nolock(void *junk)
2163 {
2164 	int cpu = raw_smp_processor_id();
2165 	int r;
2166 
2167 	if (cpumask_test_cpu(cpu, cpus_hardware_enabled))
2168 		return;
2169 
2170 	cpumask_set_cpu(cpu, cpus_hardware_enabled);
2171 
2172 	r = kvm_arch_hardware_enable(NULL);
2173 
2174 	if (r) {
2175 		cpumask_clear_cpu(cpu, cpus_hardware_enabled);
2176 		atomic_inc(&hardware_enable_failed);
2177 		printk(KERN_INFO "kvm: enabling virtualization on "
2178 				 "CPU%d failed\n", cpu);
2179 	}
2180 }
2181 
2182 static void hardware_enable(void *junk)
2183 {
2184 	raw_spin_lock(&kvm_lock);
2185 	hardware_enable_nolock(junk);
2186 	raw_spin_unlock(&kvm_lock);
2187 }
2188 
2189 static void hardware_disable_nolock(void *junk)
2190 {
2191 	int cpu = raw_smp_processor_id();
2192 
2193 	if (!cpumask_test_cpu(cpu, cpus_hardware_enabled))
2194 		return;
2195 	cpumask_clear_cpu(cpu, cpus_hardware_enabled);
2196 	kvm_arch_hardware_disable(NULL);
2197 }
2198 
2199 static void hardware_disable(void *junk)
2200 {
2201 	raw_spin_lock(&kvm_lock);
2202 	hardware_disable_nolock(junk);
2203 	raw_spin_unlock(&kvm_lock);
2204 }
2205 
2206 static void hardware_disable_all_nolock(void)
2207 {
2208 	BUG_ON(!kvm_usage_count);
2209 
2210 	kvm_usage_count--;
2211 	if (!kvm_usage_count)
2212 		on_each_cpu(hardware_disable_nolock, NULL, 1);
2213 }
2214 
2215 static void hardware_disable_all(void)
2216 {
2217 	raw_spin_lock(&kvm_lock);
2218 	hardware_disable_all_nolock();
2219 	raw_spin_unlock(&kvm_lock);
2220 }
2221 
2222 static int hardware_enable_all(void)
2223 {
2224 	int r = 0;
2225 
2226 	raw_spin_lock(&kvm_lock);
2227 
2228 	kvm_usage_count++;
2229 	if (kvm_usage_count == 1) {
2230 		atomic_set(&hardware_enable_failed, 0);
2231 		on_each_cpu(hardware_enable_nolock, NULL, 1);
2232 
2233 		if (atomic_read(&hardware_enable_failed)) {
2234 			hardware_disable_all_nolock();
2235 			r = -EBUSY;
2236 		}
2237 	}
2238 
2239 	raw_spin_unlock(&kvm_lock);
2240 
2241 	return r;
2242 }
2243 
2244 static int kvm_cpu_hotplug(struct notifier_block *notifier, unsigned long val,
2245 			   void *v)
2246 {
2247 	int cpu = (long)v;
2248 
2249 	if (!kvm_usage_count)
2250 		return NOTIFY_OK;
2251 
2252 	val &= ~CPU_TASKS_FROZEN;
2253 	switch (val) {
2254 	case CPU_DYING:
2255 		printk(KERN_INFO "kvm: disabling virtualization on CPU%d\n",
2256 		       cpu);
2257 		hardware_disable(NULL);
2258 		break;
2259 	case CPU_STARTING:
2260 		printk(KERN_INFO "kvm: enabling virtualization on CPU%d\n",
2261 		       cpu);
2262 		hardware_enable(NULL);
2263 		break;
2264 	}
2265 	return NOTIFY_OK;
2266 }
2267 
2268 
2269 asmlinkage void kvm_spurious_fault(void)
2270 {
2271 	/* Fault while not rebooting.  We want the trace. */
2272 	BUG();
2273 }
2274 EXPORT_SYMBOL_GPL(kvm_spurious_fault);
2275 
2276 static int kvm_reboot(struct notifier_block *notifier, unsigned long val,
2277 		      void *v)
2278 {
2279 	/*
2280 	 * Some (well, at least mine) BIOSes hang on reboot if
2281 	 * in vmx root mode.
2282 	 *
2283 	 * And Intel TXT required VMX off for all cpu when system shutdown.
2284 	 */
2285 	printk(KERN_INFO "kvm: exiting hardware virtualization\n");
2286 	kvm_rebooting = true;
2287 	on_each_cpu(hardware_disable_nolock, NULL, 1);
2288 	return NOTIFY_OK;
2289 }
2290 
2291 static struct notifier_block kvm_reboot_notifier = {
2292 	.notifier_call = kvm_reboot,
2293 	.priority = 0,
2294 };
2295 
2296 static void kvm_io_bus_destroy(struct kvm_io_bus *bus)
2297 {
2298 	int i;
2299 
2300 	for (i = 0; i < bus->dev_count; i++) {
2301 		struct kvm_io_device *pos = bus->devs[i];
2302 
2303 		kvm_iodevice_destructor(pos);
2304 	}
2305 	kfree(bus);
2306 }
2307 
2308 /* kvm_io_bus_write - called under kvm->slots_lock */
2309 int kvm_io_bus_write(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr,
2310 		     int len, const void *val)
2311 {
2312 	int i;
2313 	struct kvm_io_bus *bus;
2314 
2315 	bus = srcu_dereference(kvm->buses[bus_idx], &kvm->srcu);
2316 	for (i = 0; i < bus->dev_count; i++)
2317 		if (!kvm_iodevice_write(bus->devs[i], addr, len, val))
2318 			return 0;
2319 	return -EOPNOTSUPP;
2320 }
2321 
2322 /* kvm_io_bus_read - called under kvm->slots_lock */
2323 int kvm_io_bus_read(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr,
2324 		    int len, void *val)
2325 {
2326 	int i;
2327 	struct kvm_io_bus *bus;
2328 
2329 	bus = srcu_dereference(kvm->buses[bus_idx], &kvm->srcu);
2330 	for (i = 0; i < bus->dev_count; i++)
2331 		if (!kvm_iodevice_read(bus->devs[i], addr, len, val))
2332 			return 0;
2333 	return -EOPNOTSUPP;
2334 }
2335 
2336 /* Caller must hold slots_lock. */
2337 int kvm_io_bus_register_dev(struct kvm *kvm, enum kvm_bus bus_idx,
2338 			    struct kvm_io_device *dev)
2339 {
2340 	struct kvm_io_bus *new_bus, *bus;
2341 
2342 	bus = kvm->buses[bus_idx];
2343 	if (bus->dev_count > NR_IOBUS_DEVS-1)
2344 		return -ENOSPC;
2345 
2346 	new_bus = kzalloc(sizeof(struct kvm_io_bus), GFP_KERNEL);
2347 	if (!new_bus)
2348 		return -ENOMEM;
2349 	memcpy(new_bus, bus, sizeof(struct kvm_io_bus));
2350 	new_bus->devs[new_bus->dev_count++] = dev;
2351 	rcu_assign_pointer(kvm->buses[bus_idx], new_bus);
2352 	synchronize_srcu_expedited(&kvm->srcu);
2353 	kfree(bus);
2354 
2355 	return 0;
2356 }
2357 
2358 /* Caller must hold slots_lock. */
2359 int kvm_io_bus_unregister_dev(struct kvm *kvm, enum kvm_bus bus_idx,
2360 			      struct kvm_io_device *dev)
2361 {
2362 	int i, r;
2363 	struct kvm_io_bus *new_bus, *bus;
2364 
2365 	new_bus = kzalloc(sizeof(struct kvm_io_bus), GFP_KERNEL);
2366 	if (!new_bus)
2367 		return -ENOMEM;
2368 
2369 	bus = kvm->buses[bus_idx];
2370 	memcpy(new_bus, bus, sizeof(struct kvm_io_bus));
2371 
2372 	r = -ENOENT;
2373 	for (i = 0; i < new_bus->dev_count; i++)
2374 		if (new_bus->devs[i] == dev) {
2375 			r = 0;
2376 			new_bus->devs[i] = new_bus->devs[--new_bus->dev_count];
2377 			break;
2378 		}
2379 
2380 	if (r) {
2381 		kfree(new_bus);
2382 		return r;
2383 	}
2384 
2385 	rcu_assign_pointer(kvm->buses[bus_idx], new_bus);
2386 	synchronize_srcu_expedited(&kvm->srcu);
2387 	kfree(bus);
2388 	return r;
2389 }
2390 
2391 static struct notifier_block kvm_cpu_notifier = {
2392 	.notifier_call = kvm_cpu_hotplug,
2393 };
2394 
2395 static int vm_stat_get(void *_offset, u64 *val)
2396 {
2397 	unsigned offset = (long)_offset;
2398 	struct kvm *kvm;
2399 
2400 	*val = 0;
2401 	raw_spin_lock(&kvm_lock);
2402 	list_for_each_entry(kvm, &vm_list, vm_list)
2403 		*val += *(u32 *)((void *)kvm + offset);
2404 	raw_spin_unlock(&kvm_lock);
2405 	return 0;
2406 }
2407 
2408 DEFINE_SIMPLE_ATTRIBUTE(vm_stat_fops, vm_stat_get, NULL, "%llu\n");
2409 
2410 static int vcpu_stat_get(void *_offset, u64 *val)
2411 {
2412 	unsigned offset = (long)_offset;
2413 	struct kvm *kvm;
2414 	struct kvm_vcpu *vcpu;
2415 	int i;
2416 
2417 	*val = 0;
2418 	raw_spin_lock(&kvm_lock);
2419 	list_for_each_entry(kvm, &vm_list, vm_list)
2420 		kvm_for_each_vcpu(i, vcpu, kvm)
2421 			*val += *(u32 *)((void *)vcpu + offset);
2422 
2423 	raw_spin_unlock(&kvm_lock);
2424 	return 0;
2425 }
2426 
2427 DEFINE_SIMPLE_ATTRIBUTE(vcpu_stat_fops, vcpu_stat_get, NULL, "%llu\n");
2428 
2429 static const struct file_operations *stat_fops[] = {
2430 	[KVM_STAT_VCPU] = &vcpu_stat_fops,
2431 	[KVM_STAT_VM]   = &vm_stat_fops,
2432 };
2433 
2434 static void kvm_init_debug(void)
2435 {
2436 	struct kvm_stats_debugfs_item *p;
2437 
2438 	kvm_debugfs_dir = debugfs_create_dir("kvm", NULL);
2439 	for (p = debugfs_entries; p->name; ++p)
2440 		p->dentry = debugfs_create_file(p->name, 0444, kvm_debugfs_dir,
2441 						(void *)(long)p->offset,
2442 						stat_fops[p->kind]);
2443 }
2444 
2445 static void kvm_exit_debug(void)
2446 {
2447 	struct kvm_stats_debugfs_item *p;
2448 
2449 	for (p = debugfs_entries; p->name; ++p)
2450 		debugfs_remove(p->dentry);
2451 	debugfs_remove(kvm_debugfs_dir);
2452 }
2453 
2454 static int kvm_suspend(void)
2455 {
2456 	if (kvm_usage_count)
2457 		hardware_disable_nolock(NULL);
2458 	return 0;
2459 }
2460 
2461 static void kvm_resume(void)
2462 {
2463 	if (kvm_usage_count) {
2464 		WARN_ON(raw_spin_is_locked(&kvm_lock));
2465 		hardware_enable_nolock(NULL);
2466 	}
2467 }
2468 
2469 static struct syscore_ops kvm_syscore_ops = {
2470 	.suspend = kvm_suspend,
2471 	.resume = kvm_resume,
2472 };
2473 
2474 struct page *bad_page;
2475 pfn_t bad_pfn;
2476 
2477 static inline
2478 struct kvm_vcpu *preempt_notifier_to_vcpu(struct preempt_notifier *pn)
2479 {
2480 	return container_of(pn, struct kvm_vcpu, preempt_notifier);
2481 }
2482 
2483 static void kvm_sched_in(struct preempt_notifier *pn, int cpu)
2484 {
2485 	struct kvm_vcpu *vcpu = preempt_notifier_to_vcpu(pn);
2486 
2487 	kvm_arch_vcpu_load(vcpu, cpu);
2488 }
2489 
2490 static void kvm_sched_out(struct preempt_notifier *pn,
2491 			  struct task_struct *next)
2492 {
2493 	struct kvm_vcpu *vcpu = preempt_notifier_to_vcpu(pn);
2494 
2495 	kvm_arch_vcpu_put(vcpu);
2496 }
2497 
2498 int kvm_init(void *opaque, unsigned vcpu_size, unsigned vcpu_align,
2499 		  struct module *module)
2500 {
2501 	int r;
2502 	int cpu;
2503 
2504 	r = kvm_arch_init(opaque);
2505 	if (r)
2506 		goto out_fail;
2507 
2508 	bad_page = alloc_page(GFP_KERNEL | __GFP_ZERO);
2509 
2510 	if (bad_page == NULL) {
2511 		r = -ENOMEM;
2512 		goto out;
2513 	}
2514 
2515 	bad_pfn = page_to_pfn(bad_page);
2516 
2517 	hwpoison_page = alloc_page(GFP_KERNEL | __GFP_ZERO);
2518 
2519 	if (hwpoison_page == NULL) {
2520 		r = -ENOMEM;
2521 		goto out_free_0;
2522 	}
2523 
2524 	hwpoison_pfn = page_to_pfn(hwpoison_page);
2525 
2526 	fault_page = alloc_page(GFP_KERNEL | __GFP_ZERO);
2527 
2528 	if (fault_page == NULL) {
2529 		r = -ENOMEM;
2530 		goto out_free_0;
2531 	}
2532 
2533 	fault_pfn = page_to_pfn(fault_page);
2534 
2535 	if (!zalloc_cpumask_var(&cpus_hardware_enabled, GFP_KERNEL)) {
2536 		r = -ENOMEM;
2537 		goto out_free_0;
2538 	}
2539 
2540 	r = kvm_arch_hardware_setup();
2541 	if (r < 0)
2542 		goto out_free_0a;
2543 
2544 	for_each_online_cpu(cpu) {
2545 		smp_call_function_single(cpu,
2546 				kvm_arch_check_processor_compat,
2547 				&r, 1);
2548 		if (r < 0)
2549 			goto out_free_1;
2550 	}
2551 
2552 	r = register_cpu_notifier(&kvm_cpu_notifier);
2553 	if (r)
2554 		goto out_free_2;
2555 	register_reboot_notifier(&kvm_reboot_notifier);
2556 
2557 	/* A kmem cache lets us meet the alignment requirements of fx_save. */
2558 	if (!vcpu_align)
2559 		vcpu_align = __alignof__(struct kvm_vcpu);
2560 	kvm_vcpu_cache = kmem_cache_create("kvm_vcpu", vcpu_size, vcpu_align,
2561 					   0, NULL);
2562 	if (!kvm_vcpu_cache) {
2563 		r = -ENOMEM;
2564 		goto out_free_3;
2565 	}
2566 
2567 	r = kvm_async_pf_init();
2568 	if (r)
2569 		goto out_free;
2570 
2571 	kvm_chardev_ops.owner = module;
2572 	kvm_vm_fops.owner = module;
2573 	kvm_vcpu_fops.owner = module;
2574 
2575 	r = misc_register(&kvm_dev);
2576 	if (r) {
2577 		printk(KERN_ERR "kvm: misc device register failed\n");
2578 		goto out_unreg;
2579 	}
2580 
2581 	register_syscore_ops(&kvm_syscore_ops);
2582 
2583 	kvm_preempt_ops.sched_in = kvm_sched_in;
2584 	kvm_preempt_ops.sched_out = kvm_sched_out;
2585 
2586 	kvm_init_debug();
2587 
2588 	return 0;
2589 
2590 out_unreg:
2591 	kvm_async_pf_deinit();
2592 out_free:
2593 	kmem_cache_destroy(kvm_vcpu_cache);
2594 out_free_3:
2595 	unregister_reboot_notifier(&kvm_reboot_notifier);
2596 	unregister_cpu_notifier(&kvm_cpu_notifier);
2597 out_free_2:
2598 out_free_1:
2599 	kvm_arch_hardware_unsetup();
2600 out_free_0a:
2601 	free_cpumask_var(cpus_hardware_enabled);
2602 out_free_0:
2603 	if (fault_page)
2604 		__free_page(fault_page);
2605 	if (hwpoison_page)
2606 		__free_page(hwpoison_page);
2607 	__free_page(bad_page);
2608 out:
2609 	kvm_arch_exit();
2610 out_fail:
2611 	return r;
2612 }
2613 EXPORT_SYMBOL_GPL(kvm_init);
2614 
2615 void kvm_exit(void)
2616 {
2617 	kvm_exit_debug();
2618 	misc_deregister(&kvm_dev);
2619 	kmem_cache_destroy(kvm_vcpu_cache);
2620 	kvm_async_pf_deinit();
2621 	unregister_syscore_ops(&kvm_syscore_ops);
2622 	unregister_reboot_notifier(&kvm_reboot_notifier);
2623 	unregister_cpu_notifier(&kvm_cpu_notifier);
2624 	on_each_cpu(hardware_disable_nolock, NULL, 1);
2625 	kvm_arch_hardware_unsetup();
2626 	kvm_arch_exit();
2627 	free_cpumask_var(cpus_hardware_enabled);
2628 	__free_page(hwpoison_page);
2629 	__free_page(bad_page);
2630 }
2631 EXPORT_SYMBOL_GPL(kvm_exit);
2632