xref: /linux/tools/testing/selftests/kvm/lib/kvm_util.c (revision effa76856f2d7111f8c44de49f15ebdfccea8ccc)
1  // SPDX-License-Identifier: GPL-2.0-only
2  /*
3   * tools/testing/selftests/kvm/lib/kvm_util.c
4   *
5   * Copyright (C) 2018, Google LLC.
6   */
7  
8  #define _GNU_SOURCE /* for program_invocation_name */
9  #include "test_util.h"
10  #include "kvm_util.h"
11  #include "processor.h"
12  
13  #include <assert.h>
14  #include <sched.h>
15  #include <sys/mman.h>
16  #include <sys/types.h>
17  #include <sys/stat.h>
18  #include <unistd.h>
19  #include <linux/kernel.h>
20  
21  #define KVM_UTIL_MIN_PFN	2
22  
23  static int vcpu_mmap_sz(void);
24  
25  int open_path_or_exit(const char *path, int flags)
26  {
27  	int fd;
28  
29  	fd = open(path, flags);
30  	__TEST_REQUIRE(fd >= 0, "%s not available (errno: %d)", path, errno);
31  
32  	return fd;
33  }
34  
35  /*
36   * Open KVM_DEV_PATH if available, otherwise exit the entire program.
37   *
38   * Input Args:
39   *   flags - The flags to pass when opening KVM_DEV_PATH.
40   *
41   * Return:
42   *   The opened file descriptor of /dev/kvm.
43   */
44  static int _open_kvm_dev_path_or_exit(int flags)
45  {
46  	return open_path_or_exit(KVM_DEV_PATH, flags);
47  }
48  
49  int open_kvm_dev_path_or_exit(void)
50  {
51  	return _open_kvm_dev_path_or_exit(O_RDONLY);
52  }
53  
54  static bool get_module_param_bool(const char *module_name, const char *param)
55  {
56  	const int path_size = 128;
57  	char path[path_size];
58  	char value;
59  	ssize_t r;
60  	int fd;
61  
62  	r = snprintf(path, path_size, "/sys/module/%s/parameters/%s",
63  		     module_name, param);
64  	TEST_ASSERT(r < path_size,
65  		    "Failed to construct sysfs path in %d bytes.", path_size);
66  
67  	fd = open_path_or_exit(path, O_RDONLY);
68  
69  	r = read(fd, &value, 1);
70  	TEST_ASSERT(r == 1, "read(%s) failed", path);
71  
72  	r = close(fd);
73  	TEST_ASSERT(!r, "close(%s) failed", path);
74  
75  	if (value == 'Y')
76  		return true;
77  	else if (value == 'N')
78  		return false;
79  
80  	TEST_FAIL("Unrecognized value '%c' for boolean module param", value);
81  }
82  
83  bool get_kvm_intel_param_bool(const char *param)
84  {
85  	return get_module_param_bool("kvm_intel", param);
86  }
87  
88  bool get_kvm_amd_param_bool(const char *param)
89  {
90  	return get_module_param_bool("kvm_amd", param);
91  }
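
/*
 * Example (hypothetical caller; the parameter name "ept" is only an
 * illustration): a test that depends on a module feature can gate itself
 * on the corresponding boolean parameter:
 *
 *	TEST_REQUIRE(get_kvm_intel_param_bool("ept"));
 */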
92  
93  /*
94   * Capability
95   *
96   * Input Args:
97   *   cap - Capability
98   *
99   * Output Args: None
100   *
101   * Return:
102   *   On success, the value corresponding to the capability (KVM_CAP_*)
103   *   specified by the value of cap.  On failure, a TEST_ASSERT failure
104   *   is produced.
105   *
106   * Looks up and returns the value corresponding to the capability
107   * (KVM_CAP_*) given by cap.
108   */
109  unsigned int kvm_check_cap(long cap)
110  {
111  	int ret;
112  	int kvm_fd;
113  
114  	kvm_fd = open_kvm_dev_path_or_exit();
115  	ret = __kvm_ioctl(kvm_fd, KVM_CHECK_EXTENSION, (void *)cap);
116  	TEST_ASSERT(ret >= 0, KVM_IOCTL_ERROR(KVM_CHECK_EXTENSION, ret));
117  
118  	close(kvm_fd);
119  
120  	return (unsigned int)ret;
121  }
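
/*
 * Typical usage sketch (hypothetical test code): size a test based on what
 * the host supports.
 *
 *	uint32_t max_vcpus = kvm_check_cap(KVM_CAP_MAX_VCPUS);
 *
 *	TEST_REQUIRE(max_vcpus >= 2);
 */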
122  
123  void vm_enable_dirty_ring(struct kvm_vm *vm, uint32_t ring_size)
124  {
125  	if (vm_check_cap(vm, KVM_CAP_DIRTY_LOG_RING_ACQ_REL))
126  		vm_enable_cap(vm, KVM_CAP_DIRTY_LOG_RING_ACQ_REL, ring_size);
127  	else
128  		vm_enable_cap(vm, KVM_CAP_DIRTY_LOG_RING, ring_size);
129  	vm->dirty_ring_size = ring_size;
130  }
131  
132  static void vm_open(struct kvm_vm *vm)
133  {
134  	vm->kvm_fd = _open_kvm_dev_path_or_exit(O_RDWR);
135  
136  	TEST_REQUIRE(kvm_has_cap(KVM_CAP_IMMEDIATE_EXIT));
137  
138  	vm->fd = __kvm_ioctl(vm->kvm_fd, KVM_CREATE_VM, (void *)vm->type);
139  	TEST_ASSERT(vm->fd >= 0, KVM_IOCTL_ERROR(KVM_CREATE_VM, vm->fd));
140  }
141  
142  const char *vm_guest_mode_string(uint32_t i)
143  {
144  	static const char * const strings[] = {
145  		[VM_MODE_P52V48_4K]	= "PA-bits:52,  VA-bits:48,  4K pages",
146  		[VM_MODE_P52V48_64K]	= "PA-bits:52,  VA-bits:48, 64K pages",
147  		[VM_MODE_P48V48_4K]	= "PA-bits:48,  VA-bits:48,  4K pages",
148  		[VM_MODE_P48V48_16K]	= "PA-bits:48,  VA-bits:48, 16K pages",
149  		[VM_MODE_P48V48_64K]	= "PA-bits:48,  VA-bits:48, 64K pages",
150  		[VM_MODE_P40V48_4K]	= "PA-bits:40,  VA-bits:48,  4K pages",
151  		[VM_MODE_P40V48_16K]	= "PA-bits:40,  VA-bits:48, 16K pages",
152  		[VM_MODE_P40V48_64K]	= "PA-bits:40,  VA-bits:48, 64K pages",
153  		[VM_MODE_PXXV48_4K]	= "PA-bits:ANY, VA-bits:48,  4K pages",
154  		[VM_MODE_P47V64_4K]	= "PA-bits:47,  VA-bits:64,  4K pages",
155  		[VM_MODE_P44V64_4K]	= "PA-bits:44,  VA-bits:64,  4K pages",
156  		[VM_MODE_P36V48_4K]	= "PA-bits:36,  VA-bits:48,  4K pages",
157  		[VM_MODE_P36V48_16K]	= "PA-bits:36,  VA-bits:48, 16K pages",
158  		[VM_MODE_P36V48_64K]	= "PA-bits:36,  VA-bits:48, 64K pages",
159  		[VM_MODE_P36V47_16K]	= "PA-bits:36,  VA-bits:47, 16K pages",
160  	};
161  	_Static_assert(sizeof(strings)/sizeof(char *) == NUM_VM_MODES,
162  		       "Missing new mode strings?");
163  
164  	TEST_ASSERT(i < NUM_VM_MODES, "Guest mode ID %d too big", i);
165  
166  	return strings[i];
167  }
168  
169  const struct vm_guest_mode_params vm_guest_mode_params[] = {
170  	[VM_MODE_P52V48_4K]	= { 52, 48,  0x1000, 12 },
171  	[VM_MODE_P52V48_64K]	= { 52, 48, 0x10000, 16 },
172  	[VM_MODE_P48V48_4K]	= { 48, 48,  0x1000, 12 },
173  	[VM_MODE_P48V48_16K]	= { 48, 48,  0x4000, 14 },
174  	[VM_MODE_P48V48_64K]	= { 48, 48, 0x10000, 16 },
175  	[VM_MODE_P40V48_4K]	= { 40, 48,  0x1000, 12 },
176  	[VM_MODE_P40V48_16K]	= { 40, 48,  0x4000, 14 },
177  	[VM_MODE_P40V48_64K]	= { 40, 48, 0x10000, 16 },
178  	[VM_MODE_PXXV48_4K]	= {  0,  0,  0x1000, 12 },
179  	[VM_MODE_P47V64_4K]	= { 47, 64,  0x1000, 12 },
180  	[VM_MODE_P44V64_4K]	= { 44, 64,  0x1000, 12 },
181  	[VM_MODE_P36V48_4K]	= { 36, 48,  0x1000, 12 },
182  	[VM_MODE_P36V48_16K]	= { 36, 48,  0x4000, 14 },
183  	[VM_MODE_P36V48_64K]	= { 36, 48, 0x10000, 16 },
184  	[VM_MODE_P36V47_16K]	= { 36, 47,  0x4000, 14 },
185  };
186  _Static_assert(sizeof(vm_guest_mode_params)/sizeof(struct vm_guest_mode_params) == NUM_VM_MODES,
187  	       "Missing new mode params?");
188  
189  /*
190   * Initializes vm->vpages_valid to match the canonical VA space of the
191   * architecture.
192   *
193   * The default implementation is valid for architectures which split the
194   * range addressed by a single page table into a low and high region
195   * based on the MSB of the VA. On architectures with this behavior
196   * the VA region spans [0, 2^(va_bits - 1)) and [-(2^(va_bits - 1)), -1].
197   */
198  __weak void vm_vaddr_populate_bitmap(struct kvm_vm *vm)
199  {
200  	sparsebit_set_num(vm->vpages_valid,
201  		0, (1ULL << (vm->va_bits - 1)) >> vm->page_shift);
202  	sparsebit_set_num(vm->vpages_valid,
203  		(~((1ULL << (vm->va_bits - 1)) - 1)) >> vm->page_shift,
204  		(1ULL << (vm->va_bits - 1)) >> vm->page_shift);
205  }
206  
207  struct kvm_vm *____vm_create(enum vm_guest_mode mode)
208  {
209  	struct kvm_vm *vm;
210  
211  	vm = calloc(1, sizeof(*vm));
212  	TEST_ASSERT(vm != NULL, "Insufficient Memory");
213  
214  	INIT_LIST_HEAD(&vm->vcpus);
215  	vm->regions.gpa_tree = RB_ROOT;
216  	vm->regions.hva_tree = RB_ROOT;
217  	hash_init(vm->regions.slot_hash);
218  
219  	vm->mode = mode;
220  	vm->type = 0;
221  
222  	vm->pa_bits = vm_guest_mode_params[mode].pa_bits;
223  	vm->va_bits = vm_guest_mode_params[mode].va_bits;
224  	vm->page_size = vm_guest_mode_params[mode].page_size;
225  	vm->page_shift = vm_guest_mode_params[mode].page_shift;
226  
227  	/* Setup mode specific traits. */
228  	switch (vm->mode) {
229  	case VM_MODE_P52V48_4K:
230  		vm->pgtable_levels = 4;
231  		break;
232  	case VM_MODE_P52V48_64K:
233  		vm->pgtable_levels = 3;
234  		break;
235  	case VM_MODE_P48V48_4K:
236  		vm->pgtable_levels = 4;
237  		break;
238  	case VM_MODE_P48V48_64K:
239  		vm->pgtable_levels = 3;
240  		break;
241  	case VM_MODE_P40V48_4K:
242  	case VM_MODE_P36V48_4K:
243  		vm->pgtable_levels = 4;
244  		break;
245  	case VM_MODE_P40V48_64K:
246  	case VM_MODE_P36V48_64K:
247  		vm->pgtable_levels = 3;
248  		break;
249  	case VM_MODE_P48V48_16K:
250  	case VM_MODE_P40V48_16K:
251  	case VM_MODE_P36V48_16K:
252  		vm->pgtable_levels = 4;
253  		break;
254  	case VM_MODE_P36V47_16K:
255  		vm->pgtable_levels = 3;
256  		break;
257  	case VM_MODE_PXXV48_4K:
258  #ifdef __x86_64__
259  		kvm_get_cpu_address_width(&vm->pa_bits, &vm->va_bits);
260  		/*
261  		 * Ignore KVM support for 5-level paging (vm->va_bits == 57),
262  		 * it doesn't take effect unless CR4.LA57 is set, which it
263  		 * isn't for this VM_MODE.
264  		 */
265  		TEST_ASSERT(vm->va_bits == 48 || vm->va_bits == 57,
266  			    "Linear address width (%d bits) not supported",
267  			    vm->va_bits);
268  		pr_debug("Guest physical address width detected: %d\n",
269  			 vm->pa_bits);
270  		vm->pgtable_levels = 4;
271  		vm->va_bits = 48;
272  #else
273  		TEST_FAIL("VM_MODE_PXXV48_4K not supported on non-x86 platforms");
274  #endif
275  		break;
276  	case VM_MODE_P47V64_4K:
277  		vm->pgtable_levels = 5;
278  		break;
279  	case VM_MODE_P44V64_4K:
280  		vm->pgtable_levels = 5;
281  		break;
282  	default:
283  		TEST_FAIL("Unknown guest mode, mode: 0x%x", mode);
284  	}
285  
286  #ifdef __aarch64__
287  	if (vm->pa_bits != 40)
288  		vm->type = KVM_VM_TYPE_ARM_IPA_SIZE(vm->pa_bits);
289  #endif
290  
291  	vm_open(vm);
292  
293  	/* Limit to VA-bit canonical virtual addresses. */
294  	vm->vpages_valid = sparsebit_alloc();
295  	vm_vaddr_populate_bitmap(vm);
296  
297  	/* Limit physical addresses to PA-bits. */
298  	vm->max_gfn = vm_compute_max_gfn(vm);
299  
300  	/* Allocate and setup memory for guest. */
301  	vm->vpages_mapped = sparsebit_alloc();
302  
303  	return vm;
304  }
305  
306  static uint64_t vm_nr_pages_required(enum vm_guest_mode mode,
307  				     uint32_t nr_runnable_vcpus,
308  				     uint64_t extra_mem_pages)
309  {
310  	uint64_t nr_pages;
311  
312  	TEST_ASSERT(nr_runnable_vcpus,
313  		    "Use vm_create_barebones() for VMs that _never_ have vCPUs\n");
314  
315  	TEST_ASSERT(nr_runnable_vcpus <= kvm_check_cap(KVM_CAP_MAX_VCPUS),
316  		    "nr_vcpus = %d too large for host, max-vcpus = %d",
317  		    nr_runnable_vcpus, kvm_check_cap(KVM_CAP_MAX_VCPUS));
318  
319  	/*
320  	 * Arbitrarily allocate 512 pages (2MiB when the page size is 4KiB) for the
321  	 * test code and other per-VM assets that will be loaded into memslot0.
322  	 */
323  	nr_pages = 512;
324  
325  	/* Account for the per-vCPU stacks on behalf of the test. */
326  	nr_pages += nr_runnable_vcpus * DEFAULT_STACK_PGS;
327  
328  	/*
329  	 * Account for the number of pages needed for the page tables.  The
330  	 * maximum page table size for a memory region will be when the
331  	 * smallest page size is used. Considering each page contains x page
332  	 * table descriptors, the total extra size for page tables (for extra
333  	 * N pages) will be: N/x+N/x^2+N/x^3+... which is definitely smaller
334  	 * than N/x*2.
335  	 */
336  	nr_pages += (nr_pages + extra_mem_pages) / PTES_PER_MIN_PAGE * 2;
337  
338  	return vm_adjust_num_guest_pages(mode, nr_pages);
339  }
340  
341  struct kvm_vm *__vm_create(enum vm_guest_mode mode, uint32_t nr_runnable_vcpus,
342  			   uint64_t nr_extra_pages)
343  {
344  	uint64_t nr_pages = vm_nr_pages_required(mode, nr_runnable_vcpus,
345  						 nr_extra_pages);
346  	struct userspace_mem_region *slot0;
347  	struct kvm_vm *vm;
348  	int i;
349  
350  	pr_debug("%s: mode='%s' pages='%ld'\n", __func__,
351  		 vm_guest_mode_string(mode), nr_pages);
352  
353  	vm = ____vm_create(mode);
354  
355  	vm_userspace_mem_region_add(vm, VM_MEM_SRC_ANONYMOUS, 0, 0, nr_pages, 0);
356  	for (i = 0; i < NR_MEM_REGIONS; i++)
357  		vm->memslots[i] = 0;
358  
359  	kvm_vm_elf_load(vm, program_invocation_name);
360  
361  	/*
362  	 * TODO: Add proper defines to protect the library's memslots, and then
363  	 * carve out memslot1 for the ucall MMIO address.  KVM treats writes to
364  	 * read-only memslots as MMIO, and creating a read-only memslot for the
365  	 * MMIO region would prevent silently clobbering the MMIO region.
366  	 */
367  	slot0 = memslot2region(vm, 0);
368  	ucall_init(vm, slot0->region.guest_phys_addr + slot0->region.memory_size);
369  
370  	kvm_arch_vm_post_create(vm);
371  
372  	return vm;
373  }
374  
375  /*
376   * VM Create with customized parameters
377   *
378   * Input Args:
379   *   mode - VM Mode (e.g. VM_MODE_P52V48_4K)
380   *   nr_vcpus - VCPU count
381   *   extra_mem_pages - Non-slot0 physical memory total size
382   *   guest_code - Guest entry point
383   *   vcpus - Array that is filled with pointers to the created vCPUs
384   *
385   * Output Args: None
386   *
387   * Return:
388   *   Pointer to opaque structure that describes the created VM.
389   *
390   * Creates a VM with the mode specified by mode (e.g. VM_MODE_P52V48_4K).
391   * extra_mem_pages is only used to calculate the maximum page table size;
392   * this function does not allocate any real memory for non-slot0 regions.
393   */
394  struct kvm_vm *__vm_create_with_vcpus(enum vm_guest_mode mode, uint32_t nr_vcpus,
395  				      uint64_t extra_mem_pages,
396  				      void *guest_code, struct kvm_vcpu *vcpus[])
397  {
398  	struct kvm_vm *vm;
399  	int i;
400  
401  	TEST_ASSERT(!nr_vcpus || vcpus, "Must provide vCPU array");
402  
403  	vm = __vm_create(mode, nr_vcpus, extra_mem_pages);
404  
405  	for (i = 0; i < nr_vcpus; ++i)
406  		vcpus[i] = vm_vcpu_add(vm, i, guest_code);
407  
408  	return vm;
409  }
410  
411  struct kvm_vm *__vm_create_with_one_vcpu(struct kvm_vcpu **vcpu,
412  					 uint64_t extra_mem_pages,
413  					 void *guest_code)
414  {
415  	struct kvm_vcpu *vcpus[1];
416  	struct kvm_vm *vm;
417  
418  	vm = __vm_create_with_vcpus(VM_MODE_DEFAULT, 1, extra_mem_pages,
419  				    guest_code, vcpus);
420  
421  	*vcpu = vcpus[0];
422  	return vm;
423  }
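
/*
 * Minimal end-to-end sketch (guest_code is a hypothetical guest entry point):
 *
 *	struct kvm_vcpu *vcpu;
 *	struct kvm_vm *vm;
 *
 *	vm = __vm_create_with_one_vcpu(&vcpu, 0, guest_code);
 *	vcpu_run(vcpu);
 *	kvm_vm_free(vm);
 */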
424  
425  /*
426   * VM Restart
427   *
428   * Input Args:
429   *   vm - VM that has been released before
430   *
431   * Output Args: None
432   *
433   * Reopens the file descriptors associated with the VM and reinstates the
434   * global state, such as the irqchip and the memory regions that are mapped
435   * into the guest.
436   */
437  void kvm_vm_restart(struct kvm_vm *vmp)
438  {
439  	int ctr;
440  	struct userspace_mem_region *region;
441  
442  	vm_open(vmp);
443  	if (vmp->has_irqchip)
444  		vm_create_irqchip(vmp);
445  
446  	hash_for_each(vmp->regions.slot_hash, ctr, region, slot_node) {
447  		int ret = ioctl(vmp->fd, KVM_SET_USER_MEMORY_REGION, &region->region);
448  		TEST_ASSERT(ret == 0, "KVM_SET_USER_MEMORY_REGION IOCTL failed,\n"
449  			    "  rc: %i errno: %i\n"
450  			    "  slot: %u flags: 0x%x\n"
451  			    "  guest_phys_addr: 0x%llx size: 0x%llx",
452  			    ret, errno, region->region.slot,
453  			    region->region.flags,
454  			    region->region.guest_phys_addr,
455  			    region->region.memory_size);
456  	}
457  }
458  
459  __weak struct kvm_vcpu *vm_arch_vcpu_recreate(struct kvm_vm *vm,
460  					      uint32_t vcpu_id)
461  {
462  	return __vm_vcpu_add(vm, vcpu_id);
463  }
464  
465  struct kvm_vcpu *vm_recreate_with_one_vcpu(struct kvm_vm *vm)
466  {
467  	kvm_vm_restart(vm);
468  
469  	return vm_vcpu_recreate(vm, 0);
470  }
471  
472  void kvm_pin_this_task_to_pcpu(uint32_t pcpu)
473  {
474  	cpu_set_t mask;
475  	int r;
476  
477  	CPU_ZERO(&mask);
478  	CPU_SET(pcpu, &mask);
479  	r = sched_setaffinity(0, sizeof(mask), &mask);
480  	TEST_ASSERT(!r, "sched_setaffinity() failed for pCPU '%u'.\n", pcpu);
481  }
482  
483  static uint32_t parse_pcpu(const char *cpu_str, const cpu_set_t *allowed_mask)
484  {
485  	uint32_t pcpu = atoi_non_negative("CPU number", cpu_str);
486  
487  	TEST_ASSERT(CPU_ISSET(pcpu, allowed_mask),
488  		    "Not allowed to run on pCPU '%d', check cgroups?\n", pcpu);
489  	return pcpu;
490  }
491  
492  void kvm_parse_vcpu_pinning(const char *pcpus_string, uint32_t vcpu_to_pcpu[],
493  			    int nr_vcpus)
494  {
495  	cpu_set_t allowed_mask;
496  	char *cpu, *cpu_list;
497  	char delim[2] = ",";
498  	int i, r;
499  
500  	cpu_list = strdup(pcpus_string);
501  	TEST_ASSERT(cpu_list, "strdup() allocation failed.\n");
502  
503  	r = sched_getaffinity(0, sizeof(allowed_mask), &allowed_mask);
504  	TEST_ASSERT(!r, "sched_getaffinity() failed");
505  
506  	cpu = strtok(cpu_list, delim);
507  
508  	/* 1. Get all pcpus for vcpus. */
509  	for (i = 0; i < nr_vcpus; i++) {
510  		TEST_ASSERT(cpu, "pCPU not provided for vCPU '%d'\n", i);
511  		vcpu_to_pcpu[i] = parse_pcpu(cpu, &allowed_mask);
512  		cpu = strtok(NULL, delim);
513  	}
514  
515  	/* 2. Check if the main worker needs to be pinned. */
516  	if (cpu) {
517  		kvm_pin_this_task_to_pcpu(parse_pcpu(cpu, &allowed_mask));
518  		cpu = strtok(NULL, delim);
519  	}
520  
521  	TEST_ASSERT(!cpu, "pCPU list contains trailing garbage characters '%s'", cpu);
522  	free(cpu_list);
523  }
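
/*
 * Example of the accepted pCPU string format (values are arbitrary): for a
 * test with 3 vCPUs, "4,5,6,7" assigns pCPUs 4-6 to vCPUs 0-2 and pins the
 * main task to pCPU 7; the trailing entry for the main task is optional.
 *
 *	uint32_t vcpu_to_pcpu[3];
 *
 *	kvm_parse_vcpu_pinning("4,5,6,7", vcpu_to_pcpu, 3);
 *
 * Each vCPU worker thread is then expected to pin itself, e.g. worker i does:
 *
 *	kvm_pin_this_task_to_pcpu(vcpu_to_pcpu[i]);
 */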
524  
525  /*
526   * Userspace Memory Region Find
527   *
528   * Input Args:
529   *   vm - Virtual Machine
530   *   start - Starting VM physical address
531   *   end - Ending VM physical address, inclusive.
532   *
533   * Output Args: None
534   *
535   * Return:
536   *   Pointer to overlapping region, NULL if no such region.
537   *
538   * Searches for a region with any physical memory that overlaps with
539   * any portion of the guest physical addresses from start to end
540   * inclusive.  If multiple overlapping regions exist, a pointer to any
541   * of the regions is returned.  Null is returned only when no overlapping
542   * region exists.
543   */
544  static struct userspace_mem_region *
545  userspace_mem_region_find(struct kvm_vm *vm, uint64_t start, uint64_t end)
546  {
547  	struct rb_node *node;
548  
549  	for (node = vm->regions.gpa_tree.rb_node; node; ) {
550  		struct userspace_mem_region *region =
551  			container_of(node, struct userspace_mem_region, gpa_node);
552  		uint64_t existing_start = region->region.guest_phys_addr;
553  		uint64_t existing_end = region->region.guest_phys_addr
554  			+ region->region.memory_size - 1;
555  		if (start <= existing_end && end >= existing_start)
556  			return region;
557  
558  		if (start < existing_start)
559  			node = node->rb_left;
560  		else
561  			node = node->rb_right;
562  	}
563  
564  	return NULL;
565  }
566  
567  /*
568   * KVM Userspace Memory Region Find
569   *
570   * Input Args:
571   *   vm - Virtual Machine
572   *   start - Starting VM physical address
573   *   end - Ending VM physical address, inclusive.
574   *
575   * Output Args: None
576   *
577   * Return:
578   *   Pointer to overlapping region, NULL if no such region.
579   *
580   * Public interface to userspace_mem_region_find. Allows tests to look up
581   * the memslot data structure for a given range of guest physical memory.
582   */
583  struct kvm_userspace_memory_region *
584  kvm_userspace_memory_region_find(struct kvm_vm *vm, uint64_t start,
585  				 uint64_t end)
586  {
587  	struct userspace_mem_region *region;
588  
589  	region = userspace_mem_region_find(vm, start, end);
590  	if (!region)
591  		return NULL;
592  
593  	return &region->region;
594  }
595  
596  __weak void vcpu_arch_free(struct kvm_vcpu *vcpu)
597  {
598  
599  }
600  
601  /*
602   * VM VCPU Remove
603   *
604   * Input Args:
605   *   vcpu - VCPU to remove
606   *
607   * Output Args: None
608   *
609   * Return: None, TEST_ASSERT failures for all error conditions
610   *
611   * Removes a vCPU from a VM and frees its resources.
612   */
613  static void vm_vcpu_rm(struct kvm_vm *vm, struct kvm_vcpu *vcpu)
614  {
615  	int ret;
616  
617  	if (vcpu->dirty_gfns) {
618  		ret = munmap(vcpu->dirty_gfns, vm->dirty_ring_size);
619  		TEST_ASSERT(!ret, __KVM_SYSCALL_ERROR("munmap()", ret));
620  		vcpu->dirty_gfns = NULL;
621  	}
622  
623  	ret = munmap(vcpu->run, vcpu_mmap_sz());
624  	TEST_ASSERT(!ret, __KVM_SYSCALL_ERROR("munmap()", ret));
625  
626  	ret = close(vcpu->fd);
627  	TEST_ASSERT(!ret,  __KVM_SYSCALL_ERROR("close()", ret));
628  
629  	list_del(&vcpu->list);
630  
631  	vcpu_arch_free(vcpu);
632  	free(vcpu);
633  }
634  
635  void kvm_vm_release(struct kvm_vm *vmp)
636  {
637  	struct kvm_vcpu *vcpu, *tmp;
638  	int ret;
639  
640  	list_for_each_entry_safe(vcpu, tmp, &vmp->vcpus, list)
641  		vm_vcpu_rm(vmp, vcpu);
642  
643  	ret = close(vmp->fd);
644  	TEST_ASSERT(!ret,  __KVM_SYSCALL_ERROR("close()", ret));
645  
646  	ret = close(vmp->kvm_fd);
647  	TEST_ASSERT(!ret,  __KVM_SYSCALL_ERROR("close()", ret));
648  }
649  
650  static void __vm_mem_region_delete(struct kvm_vm *vm,
651  				   struct userspace_mem_region *region,
652  				   bool unlink)
653  {
654  	int ret;
655  
656  	if (unlink) {
657  		rb_erase(&region->gpa_node, &vm->regions.gpa_tree);
658  		rb_erase(&region->hva_node, &vm->regions.hva_tree);
659  		hash_del(&region->slot_node);
660  	}
661  
662  	region->region.memory_size = 0;
663  	vm_ioctl(vm, KVM_SET_USER_MEMORY_REGION, &region->region);
664  
665  	sparsebit_free(&region->unused_phy_pages);
666  	ret = munmap(region->mmap_start, region->mmap_size);
667  	TEST_ASSERT(!ret, __KVM_SYSCALL_ERROR("munmap()", ret));
668  	if (region->fd >= 0) {
669  		/* There's an extra map when using shared memory. */
670  		ret = munmap(region->mmap_alias, region->mmap_size);
671  		TEST_ASSERT(!ret, __KVM_SYSCALL_ERROR("munmap()", ret));
672  		close(region->fd);
673  	}
674  
675  	free(region);
676  }
677  
678  /*
679   * Destroys and frees the VM pointed to by vmp.
680   */
681  void kvm_vm_free(struct kvm_vm *vmp)
682  {
683  	int ctr;
684  	struct hlist_node *node;
685  	struct userspace_mem_region *region;
686  
687  	if (vmp == NULL)
688  		return;
689  
690  	/* Free cached stats metadata and close FD */
691  	if (vmp->stats_fd) {
692  		free(vmp->stats_desc);
693  		close(vmp->stats_fd);
694  	}
695  
696  	/* Free userspace_mem_regions. */
697  	hash_for_each_safe(vmp->regions.slot_hash, ctr, node, region, slot_node)
698  		__vm_mem_region_delete(vmp, region, false);
699  
700  	/* Free sparsebit arrays. */
701  	sparsebit_free(&vmp->vpages_valid);
702  	sparsebit_free(&vmp->vpages_mapped);
703  
704  	kvm_vm_release(vmp);
705  
706  	/* Free the structure describing the VM. */
707  	free(vmp);
708  }
709  
710  int kvm_memfd_alloc(size_t size, bool hugepages)
711  {
712  	int memfd_flags = MFD_CLOEXEC;
713  	int fd, r;
714  
715  	if (hugepages)
716  		memfd_flags |= MFD_HUGETLB;
717  
718  	fd = memfd_create("kvm_selftest", memfd_flags);
719  	TEST_ASSERT(fd != -1, __KVM_SYSCALL_ERROR("memfd_create()", fd));
720  
721  	r = ftruncate(fd, size);
722  	TEST_ASSERT(!r, __KVM_SYSCALL_ERROR("ftruncate()", r));
723  
724  	r = fallocate(fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, 0, size);
725  	TEST_ASSERT(!r, __KVM_SYSCALL_ERROR("fallocate()", r));
726  
727  	return fd;
728  }
729  
730  /*
731   * Memory Compare, host virtual to guest virtual
732   *
733   * Input Args:
734   *   hva - Starting host virtual address
735   *   vm - Virtual Machine
736   *   gva - Starting guest virtual address
737   *   len - number of bytes to compare
738   *
739   * Output Args: None
740   *
741   * Input/Output Args: None
742   *
743   * Return:
744   *   Returns 0 if the bytes starting at hva for a length of len
745   *   are equal to the guest virtual bytes starting at gva.  Returns
746   *   a value < 0, if bytes at hva are less than those at gva.
747   *   Otherwise a value > 0 is returned.
748   *
749   * Compares the bytes starting at the host virtual address hva, for
750   * a length of len, to the guest bytes starting at the guest virtual
751   * address given by gva.
752   */
753  int kvm_memcmp_hva_gva(void *hva, struct kvm_vm *vm, vm_vaddr_t gva, size_t len)
754  {
755  	size_t amt;
756  
757  	/*
758  	 * Compare a batch of bytes until either a match is found
759  	 * or all the bytes have been compared.
760  	 */
761  	for (uintptr_t offset = 0; offset < len; offset += amt) {
762  		uintptr_t ptr1 = (uintptr_t)hva + offset;
763  
764  		/*
765  		 * Determine host address for guest virtual address
766  		 * at offset.
767  		 */
768  		uintptr_t ptr2 = (uintptr_t)addr_gva2hva(vm, gva + offset);
769  
770  		/*
771  		 * Determine amount to compare on this pass.
772  		 * Don't allow the comparsion to cross a page boundary.
773  		 * Don't allow the comparison to cross a page boundary.
774  		amt = len - offset;
775  		if ((ptr1 >> vm->page_shift) != ((ptr1 + amt) >> vm->page_shift))
776  			amt = vm->page_size - (ptr1 % vm->page_size);
777  		if ((ptr2 >> vm->page_shift) != ((ptr2 + amt) >> vm->page_shift))
778  			amt = vm->page_size - (ptr2 % vm->page_size);
779  
780  		assert((ptr1 >> vm->page_shift) == ((ptr1 + amt - 1) >> vm->page_shift));
781  		assert((ptr2 >> vm->page_shift) == ((ptr2 + amt - 1) >> vm->page_shift));
782  
783  		/*
784  		 * Perform the comparison.  If there is a difference
785  		 * return that result to the caller, otherwise need
786  		 * to continue on looking for a mismatch.
787  		 */
788  		int ret = memcmp((void *)ptr1, (void *)ptr2, amt);
789  		if (ret != 0)
790  			return ret;
791  	}
792  
793  	/*
794  	 * No mismatch found.  Let the caller know the two memory
795  	 * areas are equal.
796  	 */
797  	return 0;
798  }
799  
800  static void vm_userspace_mem_region_gpa_insert(struct rb_root *gpa_tree,
801  					       struct userspace_mem_region *region)
802  {
803  	struct rb_node **cur, *parent;
804  
805  	for (cur = &gpa_tree->rb_node, parent = NULL; *cur; ) {
806  		struct userspace_mem_region *cregion;
807  
808  		cregion = container_of(*cur, typeof(*cregion), gpa_node);
809  		parent = *cur;
810  		if (region->region.guest_phys_addr <
811  		    cregion->region.guest_phys_addr)
812  			cur = &(*cur)->rb_left;
813  		else {
814  			TEST_ASSERT(region->region.guest_phys_addr !=
815  				    cregion->region.guest_phys_addr,
816  				    "Duplicate GPA in region tree");
817  
818  			cur = &(*cur)->rb_right;
819  		}
820  	}
821  
822  	rb_link_node(&region->gpa_node, parent, cur);
823  	rb_insert_color(&region->gpa_node, gpa_tree);
824  }
825  
826  static void vm_userspace_mem_region_hva_insert(struct rb_root *hva_tree,
827  					       struct userspace_mem_region *region)
828  {
829  	struct rb_node **cur, *parent;
830  
831  	for (cur = &hva_tree->rb_node, parent = NULL; *cur; ) {
832  		struct userspace_mem_region *cregion;
833  
834  		cregion = container_of(*cur, typeof(*cregion), hva_node);
835  		parent = *cur;
836  		if (region->host_mem < cregion->host_mem)
837  			cur = &(*cur)->rb_left;
838  		else {
839  			TEST_ASSERT(region->host_mem !=
840  				    cregion->host_mem,
841  				    "Duplicate HVA in region tree");
842  
843  			cur = &(*cur)->rb_right;
844  		}
845  	}
846  
847  	rb_link_node(&region->hva_node, parent, cur);
848  	rb_insert_color(&region->hva_node, hva_tree);
849  }
850  
851  
852  int __vm_set_user_memory_region(struct kvm_vm *vm, uint32_t slot, uint32_t flags,
853  				uint64_t gpa, uint64_t size, void *hva)
854  {
855  	struct kvm_userspace_memory_region region = {
856  		.slot = slot,
857  		.flags = flags,
858  		.guest_phys_addr = gpa,
859  		.memory_size = size,
860  		.userspace_addr = (uintptr_t)hva,
861  	};
862  
863  	return ioctl(vm->fd, KVM_SET_USER_MEMORY_REGION, &region);
864  }
865  
866  void vm_set_user_memory_region(struct kvm_vm *vm, uint32_t slot, uint32_t flags,
867  			       uint64_t gpa, uint64_t size, void *hva)
868  {
869  	int ret = __vm_set_user_memory_region(vm, slot, flags, gpa, size, hva);
870  
871  	TEST_ASSERT(!ret, "KVM_SET_USER_MEMORY_REGION failed, errno = %d (%s)",
872  		    errno, strerror(errno));
873  }
874  
875  /*
876   * VM Userspace Memory Region Add
877   *
878   * Input Args:
879   *   vm - Virtual Machine
880   *   src_type - Storage source for this region.
881   *              NULL to use anonymous memory.
882   *   guest_paddr - Starting guest physical address
883   *   slot - KVM region slot
884   *   npages - Number of physical pages
885   *   flags - KVM memory region flags (e.g. KVM_MEM_LOG_DIRTY_PAGES)
886   *
887   * Output Args: None
888   *
889   * Return: None
890   *
891   * Allocates a memory area of the number of pages specified by npages
892   * and maps it to the VM specified by vm, at a starting physical address
893   * given by guest_paddr.  The region is created with a KVM region slot
894   * given by slot, which must be unique and < KVM_MEM_SLOTS_NUM.  The
895   * region is created with the flags given by flags.
896   */
897  void vm_userspace_mem_region_add(struct kvm_vm *vm,
898  	enum vm_mem_backing_src_type src_type,
899  	uint64_t guest_paddr, uint32_t slot, uint64_t npages,
900  	uint32_t flags)
901  {
902  	int ret;
903  	struct userspace_mem_region *region;
904  	size_t backing_src_pagesz = get_backing_src_pagesz(src_type);
905  	size_t alignment;
906  
907  	TEST_ASSERT(vm_adjust_num_guest_pages(vm->mode, npages) == npages,
908  		"Number of guest pages is not compatible with the host. "
909  		"Try npages=%d", vm_adjust_num_guest_pages(vm->mode, npages));
910  
911  	TEST_ASSERT((guest_paddr % vm->page_size) == 0, "Guest physical "
912  		"address not on a page boundary.\n"
913  		"  guest_paddr: 0x%lx vm->page_size: 0x%x",
914  		guest_paddr, vm->page_size);
915  	TEST_ASSERT((((guest_paddr >> vm->page_shift) + npages) - 1)
916  		<= vm->max_gfn, "Physical range beyond maximum "
917  		"supported physical address,\n"
918  		"  guest_paddr: 0x%lx npages: 0x%lx\n"
919  		"  vm->max_gfn: 0x%lx vm->page_size: 0x%x",
920  		guest_paddr, npages, vm->max_gfn, vm->page_size);
921  
922  	/*
923  	 * Confirm a mem region with an overlapping address doesn't
924  	 * already exist.
925  	 */
926  	region = (struct userspace_mem_region *) userspace_mem_region_find(
927  		vm, guest_paddr, (guest_paddr + npages * vm->page_size) - 1);
928  	if (region != NULL)
929  		TEST_FAIL("overlapping userspace_mem_region already "
930  			"exists\n"
931  			"  requested guest_paddr: 0x%lx npages: 0x%lx "
932  			"page_size: 0x%x\n"
933  			"  existing guest_paddr: 0x%lx size: 0x%lx",
934  			guest_paddr, npages, vm->page_size,
935  			(uint64_t) region->region.guest_phys_addr,
936  			(uint64_t) region->region.memory_size);
937  
938  	/* Confirm no region with the requested slot already exists. */
939  	hash_for_each_possible(vm->regions.slot_hash, region, slot_node,
940  			       slot) {
941  		if (region->region.slot != slot)
942  			continue;
943  
944  		TEST_FAIL("A mem region with the requested slot "
945  			"already exists.\n"
946  			"  requested slot: %u paddr: 0x%lx npages: 0x%lx\n"
947  			"  existing slot: %u paddr: 0x%lx size: 0x%lx",
948  			slot, guest_paddr, npages,
949  			region->region.slot,
950  			(uint64_t) region->region.guest_phys_addr,
951  			(uint64_t) region->region.memory_size);
952  	}
953  
954  	/* Allocate and initialize new mem region structure. */
955  	region = calloc(1, sizeof(*region));
956  	TEST_ASSERT(region != NULL, "Insufficient Memory");
957  	region->mmap_size = npages * vm->page_size;
958  
959  #ifdef __s390x__
960  	/* On s390x, the host address must be aligned to 1M (due to PGSTEs) */
961  	alignment = 0x100000;
962  #else
963  	alignment = 1;
964  #endif
965  
966  	/*
967  	 * When using THP, mmap is not guaranteed to return a hugepage-aligned
968  	 * address, so we have to pad the mmap. Padding is not needed for HugeTLB
969  	 * because mmap will always return an address aligned to the HugeTLB
970  	 * page size.
971  	 */
972  	if (src_type == VM_MEM_SRC_ANONYMOUS_THP)
973  		alignment = max(backing_src_pagesz, alignment);
974  
975  	ASSERT_EQ(guest_paddr, align_up(guest_paddr, backing_src_pagesz));
976  
977  	/* Add enough memory to align up if necessary */
978  	if (alignment > 1)
979  		region->mmap_size += alignment;
980  
981  	region->fd = -1;
982  	if (backing_src_is_shared(src_type))
983  		region->fd = kvm_memfd_alloc(region->mmap_size,
984  					     src_type == VM_MEM_SRC_SHARED_HUGETLB);
985  
986  	region->mmap_start = mmap(NULL, region->mmap_size,
987  				  PROT_READ | PROT_WRITE,
988  				  vm_mem_backing_src_alias(src_type)->flag,
989  				  region->fd, 0);
990  	TEST_ASSERT(region->mmap_start != MAP_FAILED,
991  		    __KVM_SYSCALL_ERROR("mmap()", (int)(unsigned long)MAP_FAILED));
992  
993  	TEST_ASSERT(!is_backing_src_hugetlb(src_type) ||
994  		    region->mmap_start == align_ptr_up(region->mmap_start, backing_src_pagesz),
995  		    "mmap_start %p is not aligned to HugeTLB page size 0x%lx",
996  		    region->mmap_start, backing_src_pagesz);
997  
998  	/* Align host address */
999  	region->host_mem = align_ptr_up(region->mmap_start, alignment);
1000  
1001  	/* As needed perform madvise */
1002  	if ((src_type == VM_MEM_SRC_ANONYMOUS ||
1003  	     src_type == VM_MEM_SRC_ANONYMOUS_THP) && thp_configured()) {
1004  		ret = madvise(region->host_mem, npages * vm->page_size,
1005  			      src_type == VM_MEM_SRC_ANONYMOUS ? MADV_NOHUGEPAGE : MADV_HUGEPAGE);
1006  		TEST_ASSERT(ret == 0, "madvise failed, addr: %p length: 0x%lx src_type: %s",
1007  			    region->host_mem, npages * vm->page_size,
1008  			    vm_mem_backing_src_alias(src_type)->name);
1009  	}
1010  
1011  	region->backing_src_type = src_type;
1012  	region->unused_phy_pages = sparsebit_alloc();
1013  	sparsebit_set_num(region->unused_phy_pages,
1014  		guest_paddr >> vm->page_shift, npages);
1015  	region->region.slot = slot;
1016  	region->region.flags = flags;
1017  	region->region.guest_phys_addr = guest_paddr;
1018  	region->region.memory_size = npages * vm->page_size;
1019  	region->region.userspace_addr = (uintptr_t) region->host_mem;
1020  	ret = __vm_ioctl(vm, KVM_SET_USER_MEMORY_REGION, &region->region);
1021  	TEST_ASSERT(ret == 0, "KVM_SET_USER_MEMORY_REGION IOCTL failed,\n"
1022  		"  rc: %i errno: %i\n"
1023  		"  slot: %u flags: 0x%x\n"
1024  		"  guest_phys_addr: 0x%lx size: 0x%lx",
1025  		ret, errno, slot, flags,
1026  		guest_paddr, (uint64_t) region->region.memory_size);
1027  
1028  	/* Add to quick lookup data structures */
1029  	vm_userspace_mem_region_gpa_insert(&vm->regions.gpa_tree, region);
1030  	vm_userspace_mem_region_hva_insert(&vm->regions.hva_tree, region);
1031  	hash_add(vm->regions.slot_hash, &region->slot_node, slot);
1032  
1033  	/* If shared memory, create an alias. */
1034  	if (region->fd >= 0) {
1035  		region->mmap_alias = mmap(NULL, region->mmap_size,
1036  					  PROT_READ | PROT_WRITE,
1037  					  vm_mem_backing_src_alias(src_type)->flag,
1038  					  region->fd, 0);
1039  		TEST_ASSERT(region->mmap_alias != MAP_FAILED,
1040  			    __KVM_SYSCALL_ERROR("mmap()",  (int)(unsigned long)MAP_FAILED));
1041  
1042  		/* Align host alias address */
1043  		region->host_alias = align_ptr_up(region->mmap_alias, alignment);
1044  	}
1045  }
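
/*
 * Example (slot, GPA and page count are arbitrary): back 64 guest pages at
 * GPA 0x10000000 with anonymous memory in memslot 1 and enable dirty logging
 * on the region:
 *
 *	vm_userspace_mem_region_add(vm, VM_MEM_SRC_ANONYMOUS, 0x10000000, 1,
 *				    64, KVM_MEM_LOG_DIRTY_PAGES);
 */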
1046  
1047  /*
1048   * Memslot to region
1049   *
1050   * Input Args:
1051   *   vm - Virtual Machine
1052   *   memslot - KVM memory slot ID
1053   *
1054   * Output Args: None
1055   *
1056   * Return:
1057   *   Pointer to memory region structure that describe memory region
1058   *   using kvm memory slot ID given by memslot.  TEST_ASSERT failure
1059   *   on error (e.g. currently no memory region using memslot as a KVM
1060   *   memory slot ID).
1061   */
1062  struct userspace_mem_region *
1063  memslot2region(struct kvm_vm *vm, uint32_t memslot)
1064  {
1065  	struct userspace_mem_region *region;
1066  
1067  	hash_for_each_possible(vm->regions.slot_hash, region, slot_node,
1068  			       memslot)
1069  		if (region->region.slot == memslot)
1070  			return region;
1071  
1072  	fprintf(stderr, "No mem region with the requested slot found,\n"
1073  		"  requested slot: %u\n", memslot);
1074  	fputs("---- vm dump ----\n", stderr);
1075  	vm_dump(stderr, vm, 2);
1076  	TEST_FAIL("Mem region not found");
1077  	return NULL;
1078  }
1079  
1080  /*
1081   * VM Memory Region Flags Set
1082   *
1083   * Input Args:
1084   *   vm - Virtual Machine
1085   *   slot - Slot of the memory region to modify; flags - New KVM region flags
1086   *
1087   * Output Args: None
1088   *
1089   * Return: None
1090   *
1091   * Sets the flags of the memory region specified by the value of slot,
1092   * to the values given by flags.
1093   */
1094  void vm_mem_region_set_flags(struct kvm_vm *vm, uint32_t slot, uint32_t flags)
1095  {
1096  	int ret;
1097  	struct userspace_mem_region *region;
1098  
1099  	region = memslot2region(vm, slot);
1100  
1101  	region->region.flags = flags;
1102  
1103  	ret = __vm_ioctl(vm, KVM_SET_USER_MEMORY_REGION, &region->region);
1104  
1105  	TEST_ASSERT(ret == 0, "KVM_SET_USER_MEMORY_REGION IOCTL failed,\n"
1106  		"  rc: %i errno: %i slot: %u flags: 0x%x",
1107  		ret, errno, slot, flags);
1108  }
1109  
1110  /*
1111   * VM Memory Region Move
1112   *
1113   * Input Args:
1114   *   vm - Virtual Machine
1115   *   slot - Slot of the memory region to move
1116   *   new_gpa - Starting guest physical address
1117   *
1118   * Output Args: None
1119   *
1120   * Return: None
1121   *
1122   * Change the gpa of a memory region.
1123   */
1124  void vm_mem_region_move(struct kvm_vm *vm, uint32_t slot, uint64_t new_gpa)
1125  {
1126  	struct userspace_mem_region *region;
1127  	int ret;
1128  
1129  	region = memslot2region(vm, slot);
1130  
1131  	region->region.guest_phys_addr = new_gpa;
1132  
1133  	ret = __vm_ioctl(vm, KVM_SET_USER_MEMORY_REGION, &region->region);
1134  
1135  	TEST_ASSERT(!ret, "KVM_SET_USER_MEMORY_REGION failed\n"
1136  		    "ret: %i errno: %i slot: %u new_gpa: 0x%lx",
1137  		    ret, errno, slot, new_gpa);
1138  }
1139  
1140  /*
1141   * VM Memory Region Delete
1142   *
1143   * Input Args:
1144   *   vm - Virtual Machine
1145   *   slot - Slot of the memory region to delete
1146   *
1147   * Output Args: None
1148   *
1149   * Return: None
1150   *
1151   * Delete a memory region.
1152   */
1153  void vm_mem_region_delete(struct kvm_vm *vm, uint32_t slot)
1154  {
1155  	__vm_mem_region_delete(vm, memslot2region(vm, slot), true);
1156  }
1157  
1158  /* Returns the size of a vCPU's kvm_run structure. */
1159  static int vcpu_mmap_sz(void)
1160  {
1161  	int dev_fd, ret;
1162  
1163  	dev_fd = open_kvm_dev_path_or_exit();
1164  
1165  	ret = ioctl(dev_fd, KVM_GET_VCPU_MMAP_SIZE, NULL);
1166  	TEST_ASSERT(ret >= sizeof(struct kvm_run),
1167  		    KVM_IOCTL_ERROR(KVM_GET_VCPU_MMAP_SIZE, ret));
1168  
1169  	close(dev_fd);
1170  
1171  	return ret;
1172  }
1173  
1174  static bool vcpu_exists(struct kvm_vm *vm, uint32_t vcpu_id)
1175  {
1176  	struct kvm_vcpu *vcpu;
1177  
1178  	list_for_each_entry(vcpu, &vm->vcpus, list) {
1179  		if (vcpu->id == vcpu_id)
1180  			return true;
1181  	}
1182  
1183  	return false;
1184  }
1185  
1186  /*
1187   * Adds a virtual CPU to the VM specified by vm with the ID given by vcpu_id.
1188   * No additional vCPU setup is done.  Returns the vCPU.
1189   */
1190  struct kvm_vcpu *__vm_vcpu_add(struct kvm_vm *vm, uint32_t vcpu_id)
1191  {
1192  	struct kvm_vcpu *vcpu;
1193  
1194  	/* Confirm a vcpu with the specified id doesn't already exist. */
1195  	TEST_ASSERT(!vcpu_exists(vm, vcpu_id), "vCPU%d already exists\n", vcpu_id);
1196  
1197  	/* Allocate and initialize new vcpu structure. */
1198  	vcpu = calloc(1, sizeof(*vcpu));
1199  	TEST_ASSERT(vcpu != NULL, "Insufficient Memory");
1200  
1201  	vcpu->vm = vm;
1202  	vcpu->id = vcpu_id;
1203  	vcpu->fd = __vm_ioctl(vm, KVM_CREATE_VCPU, (void *)(unsigned long)vcpu_id);
1204  	TEST_ASSERT(vcpu->fd >= 0, KVM_IOCTL_ERROR(KVM_CREATE_VCPU, vcpu->fd));
1205  
1206  	TEST_ASSERT(vcpu_mmap_sz() >= sizeof(*vcpu->run), "vcpu mmap size "
1207  		"smaller than expected, vcpu_mmap_sz: %i expected_min: %zi",
1208  		vcpu_mmap_sz(), sizeof(*vcpu->run));
1209  	vcpu->run = (struct kvm_run *) mmap(NULL, vcpu_mmap_sz(),
1210  		PROT_READ | PROT_WRITE, MAP_SHARED, vcpu->fd, 0);
1211  	TEST_ASSERT(vcpu->run != MAP_FAILED,
1212  		    __KVM_SYSCALL_ERROR("mmap()", (int)(unsigned long)MAP_FAILED));
1213  
1214  	/* Add to linked-list of VCPUs. */
1215  	list_add(&vcpu->list, &vm->vcpus);
1216  
1217  	return vcpu;
1218  }
1219  
1220  /*
1221   * VM Virtual Address Unused Gap
1222   *
1223   * Input Args:
1224   *   vm - Virtual Machine
1225   *   sz - Size (bytes)
1226   *   vaddr_min - Minimum Virtual Address
1227   *
1228   * Output Args: None
1229   *
1230   * Return:
1231   *   Lowest virtual address at or below vaddr_min, with at least
1232   *   sz unused bytes.  TEST_ASSERT failure if no area of at least
1233   *   size sz is available.
1234   *
1235   * Within the VM specified by vm, locates the lowest starting virtual
1236   * address >= vaddr_min, that has at least sz unallocated bytes.  A
1237   * TEST_ASSERT failure occurs for invalid input or no area of at least
1238   * sz unallocated bytes >= vaddr_min is available.
1239   */
1240  vm_vaddr_t vm_vaddr_unused_gap(struct kvm_vm *vm, size_t sz,
1241  			       vm_vaddr_t vaddr_min)
1242  {
1243  	uint64_t pages = (sz + vm->page_size - 1) >> vm->page_shift;
1244  
1245  	/* Determine lowest permitted virtual page index. */
1246  	uint64_t pgidx_start = (vaddr_min + vm->page_size - 1) >> vm->page_shift;
1247  	if ((pgidx_start * vm->page_size) < vaddr_min)
1248  		goto no_va_found;
1249  
1250  	/* Loop over section with enough valid virtual page indexes. */
1251  	if (!sparsebit_is_set_num(vm->vpages_valid,
1252  		pgidx_start, pages))
1253  		pgidx_start = sparsebit_next_set_num(vm->vpages_valid,
1254  			pgidx_start, pages);
1255  	do {
1256  		/*
1257  		 * Are there enough unused virtual pages available at
1258  		 * the currently proposed starting virtual page index.
1259  		 * If not, adjust proposed starting index to next
1260  		 * possible.
1261  		 */
1262  		if (sparsebit_is_clear_num(vm->vpages_mapped,
1263  			pgidx_start, pages))
1264  			goto va_found;
1265  		pgidx_start = sparsebit_next_clear_num(vm->vpages_mapped,
1266  			pgidx_start, pages);
1267  		if (pgidx_start == 0)
1268  			goto no_va_found;
1269  
1270  		/*
1271  		 * If needed, adjust proposed starting virtual address,
1272  		 * to next range of valid virtual addresses.
1273  		 */
1274  		if (!sparsebit_is_set_num(vm->vpages_valid,
1275  			pgidx_start, pages)) {
1276  			pgidx_start = sparsebit_next_set_num(
1277  				vm->vpages_valid, pgidx_start, pages);
1278  			if (pgidx_start == 0)
1279  				goto no_va_found;
1280  		}
1281  	} while (pgidx_start != 0);
1282  
1283  no_va_found:
1284  	TEST_FAIL("No vaddr of specified pages available, pages: 0x%lx", pages);
1285  
1286  	/* NOT REACHED */
1287  	return -1;
1288  
1289  va_found:
1290  	TEST_ASSERT(sparsebit_is_set_num(vm->vpages_valid,
1291  		pgidx_start, pages),
1292  		"Unexpected, invalid virtual page index range,\n"
1293  		"  pgidx_start: 0x%lx\n"
1294  		"  pages: 0x%lx",
1295  		pgidx_start, pages);
1296  	TEST_ASSERT(sparsebit_is_clear_num(vm->vpages_mapped,
1297  		pgidx_start, pages),
1298  		"Unexpected, pages already mapped,\n"
1299  		"  pgidx_start: 0x%lx\n"
1300  		"  pages: 0x%lx",
1301  		pgidx_start, pages);
1302  
1303  	return pgidx_start * vm->page_size;
1304  }
1305  
1306  vm_vaddr_t __vm_vaddr_alloc(struct kvm_vm *vm, size_t sz, vm_vaddr_t vaddr_min,
1307  			    enum kvm_mem_region_type type)
1308  {
1309  	uint64_t pages = (sz >> vm->page_shift) + ((sz % vm->page_size) != 0);
1310  
1311  	virt_pgd_alloc(vm);
1312  	vm_paddr_t paddr = vm_phy_pages_alloc(vm, pages,
1313  					      KVM_UTIL_MIN_PFN * vm->page_size,
1314  					      vm->memslots[type]);
1315  
1316  	/*
1317  	 * Find an unused range of virtual page addresses of at least
1318  	 * pages in length.
1319  	 */
1320  	vm_vaddr_t vaddr_start = vm_vaddr_unused_gap(vm, sz, vaddr_min);
1321  
1322  	/* Map the virtual pages. */
1323  	for (vm_vaddr_t vaddr = vaddr_start; pages > 0;
1324  		pages--, vaddr += vm->page_size, paddr += vm->page_size) {
1325  
1326  		virt_pg_map(vm, vaddr, paddr);
1327  
1328  		sparsebit_set(vm->vpages_mapped, vaddr >> vm->page_shift);
1329  	}
1330  
1331  	return vaddr_start;
1332  }
1333  
1334  /*
1335   * VM Virtual Address Allocate
1336   *
1337   * Input Args:
1338   *   vm - Virtual Machine
1339   *   sz - Size in bytes
1340   *   vaddr_min - Minimum starting virtual address
1341   *
1342   * Output Args: None
1343   *
1344   * Return:
1345   *   Starting guest virtual address
1346   *
1347   * Allocates at least sz bytes within the virtual address space of the vm
1348   * given by vm.  The allocated bytes are mapped to a virtual address >=
1349   * the address given by vaddr_min.  Note that each allocation uses a
1350   * unique set of pages, with the minimum real allocation being at least
1351   * a page. The allocated physical space comes from the TEST_DATA memory region.
1352   */
1353  vm_vaddr_t vm_vaddr_alloc(struct kvm_vm *vm, size_t sz, vm_vaddr_t vaddr_min)
1354  {
1355  	return __vm_vaddr_alloc(vm, sz, vaddr_min, MEM_REGION_TEST_DATA);
1356  }
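
/*
 * Usage sketch (the size is an arbitrary example): allocate guest-virtual
 * scratch space and zero it through the host-side mapping provided by
 * addr_gva2hva():
 *
 *	vm_vaddr_t gva = vm_vaddr_alloc(vm, 0x2000, KVM_UTIL_MIN_VADDR);
 *
 *	memset(addr_gva2hva(vm, gva), 0, 0x2000);
 */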
1357  
1358  /*
1359   * VM Virtual Address Allocate Pages
1360   *
1361   * Input Args:
1362   *   vm - Virtual Machine
1363   *
1364   * Output Args: None
1365   *
1366   * Return:
1367   *   Starting guest virtual address
1368   *
1369   * Allocates at least N system pages worth of bytes within the virtual address
1370   * space of the vm.
1371   */
1372  vm_vaddr_t vm_vaddr_alloc_pages(struct kvm_vm *vm, int nr_pages)
1373  {
1374  	return vm_vaddr_alloc(vm, nr_pages * getpagesize(), KVM_UTIL_MIN_VADDR);
1375  }
1376  
1377  vm_vaddr_t __vm_vaddr_alloc_page(struct kvm_vm *vm, enum kvm_mem_region_type type)
1378  {
1379  	return __vm_vaddr_alloc(vm, getpagesize(), KVM_UTIL_MIN_VADDR, type);
1380  }
1381  
1382  /*
1383   * VM Virtual Address Allocate Page
1384   *
1385   * Input Args:
1386   *   vm - Virtual Machine
1387   *
1388   * Output Args: None
1389   *
1390   * Return:
1391   *   Starting guest virtual address
1392   *
1393   * Allocates at least one system page worth of bytes within the virtual address
1394   * space of the vm.
1395   */
1396  vm_vaddr_t vm_vaddr_alloc_page(struct kvm_vm *vm)
1397  {
1398  	return vm_vaddr_alloc_pages(vm, 1);
1399  }
1400  
1401  /*
1402   * Map a range of VM virtual address to the VM's physical address
1403   *
1404   * Input Args:
1405   *   vm - Virtual Machine
1406   *   vaddr - Virtual address to map
1407   *   paddr - VM Physical Address
1408   *   npages - The number of pages to map
1409   *
1410   * Output Args: None
1411   *
1412   * Return: None
1413   *
1414   * Within the VM given by @vm, creates a virtual translation for
1415   * @npages starting at @vaddr to the page range starting at @paddr.
1416   */
1417  void virt_map(struct kvm_vm *vm, uint64_t vaddr, uint64_t paddr,
1418  	      unsigned int npages)
1419  {
1420  	size_t page_size = vm->page_size;
1421  	size_t size = npages * page_size;
1422  
1423  	TEST_ASSERT(vaddr + size > vaddr, "Vaddr overflow");
1424  	TEST_ASSERT(paddr + size > paddr, "Paddr overflow");
1425  
1426  	while (npages--) {
1427  		virt_pg_map(vm, vaddr, paddr);
1428  		sparsebit_set(vm->vpages_mapped, vaddr >> vm->page_shift);
1429  
1430  		vaddr += page_size;
1431  		paddr += page_size;
1432  	}
1433  }
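
/*
 * Usage sketch (addresses are arbitrary and assume the GPA range is already
 * backed by a memslot): identity-map 16 pages so the guest sees the range at
 * the same address the host-side test uses as the GPA:
 *
 *	virt_map(vm, 0xc0000000, 0xc0000000, 16);
 */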
1434  
1435  /*
1436   * Address VM Physical to Host Virtual
1437   *
1438   * Input Args:
1439   *   vm - Virtual Machine
1440   *   gpa - VM physical address
1441   *
1442   * Output Args: None
1443   *
1444   * Return:
1445   *   Equivalent host virtual address
1446   *
1447   * Locates the memory region containing the VM physical address given
1448   * by gpa, within the VM given by vm.  When found, the host virtual
1449   * address providing the memory to the vm physical address is returned.
1450   * A TEST_ASSERT failure occurs if no region containing gpa exists.
1451   */
1452  void *addr_gpa2hva(struct kvm_vm *vm, vm_paddr_t gpa)
1453  {
1454  	struct userspace_mem_region *region;
1455  
1456  	region = userspace_mem_region_find(vm, gpa, gpa);
1457  	if (!region) {
1458  		TEST_FAIL("No vm physical memory at 0x%lx", gpa);
1459  		return NULL;
1460  	}
1461  
1462  	return (void *)((uintptr_t)region->host_mem
1463  		+ (gpa - region->region.guest_phys_addr));
1464  }
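
/*
 * Typical use (TEST_GPA is a hypothetical constant): the host side of a test
 * pokes a value into guest physical memory before resuming the vCPU:
 *
 *	uint64_t *hva = addr_gpa2hva(vm, TEST_GPA);
 *
 *	*hva = 0xdeadbeef;
 */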
1465  
1466  /*
1467   * Address Host Virtual to VM Physical
1468   *
1469   * Input Args:
1470   *   vm - Virtual Machine
1471   *   hva - Host virtual address
1472   *
1473   * Output Args: None
1474   *
1475   * Return:
1476   *   Equivalent VM physical address
1477   *
1478   * Locates the memory region containing the host virtual address given
1479   * by hva, within the VM given by vm.  When found, the equivalent
1480   * VM physical address is returned. A TEST_ASSERT failure occurs if no
1481   * region containing hva exists.
1482   */
1483  vm_paddr_t addr_hva2gpa(struct kvm_vm *vm, void *hva)
1484  {
1485  	struct rb_node *node;
1486  
1487  	for (node = vm->regions.hva_tree.rb_node; node; ) {
1488  		struct userspace_mem_region *region =
1489  			container_of(node, struct userspace_mem_region, hva_node);
1490  
1491  		if (hva >= region->host_mem) {
1492  			if (hva <= (region->host_mem
1493  				+ region->region.memory_size - 1))
1494  				return (vm_paddr_t)((uintptr_t)
1495  					region->region.guest_phys_addr
1496  					+ (hva - (uintptr_t)region->host_mem));
1497  
1498  			node = node->rb_right;
1499  		} else
1500  			node = node->rb_left;
1501  	}
1502  
1503  	TEST_FAIL("No mapping to a guest physical address, hva: %p", hva);
1504  	return -1;
1505  }
1506  
1507  /*
1508   * Address VM physical to Host Virtual *alias*.
1509   *
1510   * Input Args:
1511   *   vm - Virtual Machine
1512   *   gpa - VM physical address
1513   *
1514   * Output Args: None
1515   *
1516   * Return:
1517   *   Equivalent address within the host virtual *alias* area, or NULL
1518   *   (without failing the test) if the guest memory is not shared (so
1519   *   no alias exists).
1520   *
1521   * Create a writable, shared virtual=>physical alias for the specific GPA.
1522   * The primary use case is to allow the host selftest to manipulate guest
1523   * memory without mapping said memory in the guest's address space. And, for
1524   * userfaultfd-based demand paging, to do so without triggering userfaults.
1525   */
1526  void *addr_gpa2alias(struct kvm_vm *vm, vm_paddr_t gpa)
1527  {
1528  	struct userspace_mem_region *region;
1529  	uintptr_t offset;
1530  
1531  	region = userspace_mem_region_find(vm, gpa, gpa);
1532  	if (!region)
1533  		return NULL;
1534  
1535  	if (!region->host_alias)
1536  		return NULL;
1537  
1538  	offset = gpa - region->region.guest_phys_addr;
1539  	return (void *) ((uintptr_t) region->host_alias + offset);
1540  }
1541  
1542  /* Create an interrupt controller chip for the specified VM. */
1543  void vm_create_irqchip(struct kvm_vm *vm)
1544  {
1545  	vm_ioctl(vm, KVM_CREATE_IRQCHIP, NULL);
1546  
1547  	vm->has_irqchip = true;
1548  }
1549  
1550  int _vcpu_run(struct kvm_vcpu *vcpu)
1551  {
1552  	int rc;
1553  
1554  	do {
1555  		rc = __vcpu_run(vcpu);
1556  	} while (rc == -1 && errno == EINTR);
1557  
1558  	assert_on_unhandled_exception(vcpu);
1559  
1560  	return rc;
1561  }
1562  
1563  /*
1564   * Invoke KVM_RUN on a vCPU until KVM returns something other than -EINTR.
1565   * Assert if KVM returns an error (other than -EINTR).
1566   */
1567  void vcpu_run(struct kvm_vcpu *vcpu)
1568  {
1569  	int ret = _vcpu_run(vcpu);
1570  
1571  	TEST_ASSERT(!ret, KVM_IOCTL_ERROR(KVM_RUN, ret));
1572  }
1573  
1574  void vcpu_run_complete_io(struct kvm_vcpu *vcpu)
1575  {
1576  	int ret;
1577  
1578  	vcpu->run->immediate_exit = 1;
1579  	ret = __vcpu_run(vcpu);
1580  	vcpu->run->immediate_exit = 0;
1581  
1582  	TEST_ASSERT(ret == -1 && errno == EINTR,
1583  		    "KVM_RUN IOCTL didn't exit immediately, rc: %i, errno: %i",
1584  		    ret, errno);
1585  }
1586  
1587  /*
1588   * Get the list of guest registers which are supported for
1589   * KVM_GET_ONE_REG/KVM_SET_ONE_REG ioctls.  Returns a kvm_reg_list pointer,
1590   * it is the caller's responsibility to free the list.
1591   */
1592  struct kvm_reg_list *vcpu_get_reg_list(struct kvm_vcpu *vcpu)
1593  {
1594  	struct kvm_reg_list reg_list_n = { .n = 0 }, *reg_list;
1595  	int ret;
1596  
1597  	ret = __vcpu_ioctl(vcpu, KVM_GET_REG_LIST, &reg_list_n);
1598  	TEST_ASSERT(ret == -1 && errno == E2BIG, "KVM_GET_REG_LIST n=0");
1599  
1600  	reg_list = calloc(1, sizeof(*reg_list) + reg_list_n.n * sizeof(__u64));
1601  	reg_list->n = reg_list_n.n;
1602  	vcpu_ioctl(vcpu, KVM_GET_REG_LIST, reg_list);
1603  	return reg_list;
1604  }
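
/*
 * Caller-side sketch: walk the returned list and free it when done.
 *
 *	struct kvm_reg_list *reg_list = vcpu_get_reg_list(vcpu);
 *	int i;
 *
 *	for (i = 0; i < reg_list->n; i++)
 *		pr_debug("reg[%d] = 0x%llx\n", i, reg_list->reg[i]);
 *	free(reg_list);
 */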
1605  
1606  void *vcpu_map_dirty_ring(struct kvm_vcpu *vcpu)
1607  {
1608  	uint32_t page_size = getpagesize();
1609  	uint32_t size = vcpu->vm->dirty_ring_size;
1610  
1611  	TEST_ASSERT(size > 0, "Should enable dirty ring first");
1612  
1613  	if (!vcpu->dirty_gfns) {
1614  		void *addr;
1615  
1616  		addr = mmap(NULL, size, PROT_READ, MAP_PRIVATE, vcpu->fd,
1617  			    page_size * KVM_DIRTY_LOG_PAGE_OFFSET);
1618  		TEST_ASSERT(addr == MAP_FAILED, "Dirty ring mapped private");
1619  
1620  		addr = mmap(NULL, size, PROT_READ | PROT_EXEC, MAP_PRIVATE, vcpu->fd,
1621  			    page_size * KVM_DIRTY_LOG_PAGE_OFFSET);
1622  		TEST_ASSERT(addr == MAP_FAILED, "Dirty ring mapped exec");
1623  
1624  		addr = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED, vcpu->fd,
1625  			    page_size * KVM_DIRTY_LOG_PAGE_OFFSET);
1626  		TEST_ASSERT(addr != MAP_FAILED, "Dirty ring map failed");
1627  
1628  		vcpu->dirty_gfns = addr;
1629  		vcpu->dirty_gfns_count = size / sizeof(struct kvm_dirty_gfn);
1630  	}
1631  
1632  	return vcpu->dirty_gfns;
1633  }
1634  
1635  /*
1636   * Device Ioctl
1637   */
1638  
1639  int __kvm_has_device_attr(int dev_fd, uint32_t group, uint64_t attr)
1640  {
1641  	struct kvm_device_attr attribute = {
1642  		.group = group,
1643  		.attr = attr,
1644  		.flags = 0,
1645  	};
1646  
1647  	return ioctl(dev_fd, KVM_HAS_DEVICE_ATTR, &attribute);
1648  }
1649  
1650  int __kvm_test_create_device(struct kvm_vm *vm, uint64_t type)
1651  {
1652  	struct kvm_create_device create_dev = {
1653  		.type = type,
1654  		.flags = KVM_CREATE_DEVICE_TEST,
1655  	};
1656  
1657  	return __vm_ioctl(vm, KVM_CREATE_DEVICE, &create_dev);
1658  }
1659  
1660  int __kvm_create_device(struct kvm_vm *vm, uint64_t type)
1661  {
1662  	struct kvm_create_device create_dev = {
1663  		.type = type,
1664  		.fd = -1,
1665  		.flags = 0,
1666  	};
1667  	int err;
1668  
1669  	err = __vm_ioctl(vm, KVM_CREATE_DEVICE, &create_dev);
1670  	TEST_ASSERT(err <= 0, "KVM_CREATE_DEVICE shouldn't return a positive value");
1671  	return err ? : create_dev.fd;
1672  }
1673  
1674  int __kvm_device_attr_get(int dev_fd, uint32_t group, uint64_t attr, void *val)
1675  {
1676  	struct kvm_device_attr kvmattr = {
1677  		.group = group,
1678  		.attr = attr,
1679  		.flags = 0,
1680  		.addr = (uintptr_t)val,
1681  	};
1682  
1683  	return __kvm_ioctl(dev_fd, KVM_GET_DEVICE_ATTR, &kvmattr);
1684  }
1685  
1686  int __kvm_device_attr_set(int dev_fd, uint32_t group, uint64_t attr, void *val)
1687  {
1688  	struct kvm_device_attr kvmattr = {
1689  		.group = group,
1690  		.attr = attr,
1691  		.flags = 0,
1692  		.addr = (uintptr_t)val,
1693  	};
1694  
1695  	return __kvm_ioctl(dev_fd, KVM_SET_DEVICE_ATTR, &kvmattr);
1696  }
1697  
1698  /*
1699   * IRQ related functions.
1700   */
1701  
1702  int _kvm_irq_line(struct kvm_vm *vm, uint32_t irq, int level)
1703  {
1704  	struct kvm_irq_level irq_level = {
1705  		.irq    = irq,
1706  		.level  = level,
1707  	};
1708  
1709  	return __vm_ioctl(vm, KVM_IRQ_LINE, &irq_level);
1710  }
1711  
1712  void kvm_irq_line(struct kvm_vm *vm, uint32_t irq, int level)
1713  {
1714  	int ret = _kvm_irq_line(vm, irq, level);
1715  
1716  	TEST_ASSERT(ret >= 0, KVM_IOCTL_ERROR(KVM_IRQ_LINE, ret));
1717  }
1718  
1719  struct kvm_irq_routing *kvm_gsi_routing_create(void)
1720  {
1721  	struct kvm_irq_routing *routing;
1722  	size_t size;
1723  
1724  	size = sizeof(struct kvm_irq_routing);
1725  	/* Allocate space for the max number of entries: this wastes 196 KBs. */
1726  	size += KVM_MAX_IRQ_ROUTES * sizeof(struct kvm_irq_routing_entry);
1727  	routing = calloc(1, size);
1728  	assert(routing);
1729  
1730  	return routing;
1731  }
1732  
1733  void kvm_gsi_routing_irqchip_add(struct kvm_irq_routing *routing,
1734  		uint32_t gsi, uint32_t pin)
1735  {
1736  	int i;
1737  
1738  	assert(routing);
1739  	assert(routing->nr < KVM_MAX_IRQ_ROUTES);
1740  
1741  	i = routing->nr;
1742  	routing->entries[i].gsi = gsi;
1743  	routing->entries[i].type = KVM_IRQ_ROUTING_IRQCHIP;
1744  	routing->entries[i].flags = 0;
1745  	routing->entries[i].u.irqchip.irqchip = 0;
1746  	routing->entries[i].u.irqchip.pin = pin;
1747  	routing->nr++;
1748  }
1749  
1750  int _kvm_gsi_routing_write(struct kvm_vm *vm, struct kvm_irq_routing *routing)
1751  {
1752  	int ret;
1753  
1754  	assert(routing);
1755  	ret = __vm_ioctl(vm, KVM_SET_GSI_ROUTING, routing);
1756  	free(routing);
1757  
1758  	return ret;
1759  }
1760  
1761  void kvm_gsi_routing_write(struct kvm_vm *vm, struct kvm_irq_routing *routing)
1762  {
1763  	int ret;
1764  
1765  	ret = _kvm_gsi_routing_write(vm, routing);
1766  	TEST_ASSERT(!ret, KVM_IOCTL_ERROR(KVM_SET_GSI_ROUTING, ret));
1767  }
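
/*
 * Illustrative sketch (editorial addition, hypothetical helper): typical GSI
 * routing usage.  Assumes "vm" already has an in-kernel irqchip; the GSI and
 * pin numbers are arbitrary examples.
 */
static void __attribute__((unused)) example_gsi_routing(struct kvm_vm *vm)
{
	struct kvm_irq_routing *routing = kvm_gsi_routing_create();

	/* Route GSIs 0 and 1 to irqchip pins 0 and 1. */
	kvm_gsi_routing_irqchip_add(routing, 0, 0);
	kvm_gsi_routing_irqchip_add(routing, 1, 1);

	/* Installs the table, frees "routing", and asserts on failure. */
	kvm_gsi_routing_write(vm, routing);
}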
1768  
1769  /*
1770   * VM Dump
1771   *
1772   * Input Args:
1773   *   vm - Virtual Machine
1774   *   indent - Left margin indent amount
1775   *
1776   * Output Args:
1777   *   stream - Output FILE stream
1778   *
1779   * Return: None
1780   *
1781   * Dumps the current state of the VM given by vm, to the FILE stream
1782   * given by stream.
1783   */
1784  void vm_dump(FILE *stream, struct kvm_vm *vm, uint8_t indent)
1785  {
1786  	int ctr;
1787  	struct userspace_mem_region *region;
1788  	struct kvm_vcpu *vcpu;
1789  
1790  	fprintf(stream, "%*smode: 0x%x\n", indent, "", vm->mode);
1791  	fprintf(stream, "%*sfd: %i\n", indent, "", vm->fd);
1792  	fprintf(stream, "%*spage_size: 0x%x\n", indent, "", vm->page_size);
1793  	fprintf(stream, "%*sMem Regions:\n", indent, "");
1794  	hash_for_each(vm->regions.slot_hash, ctr, region, slot_node) {
1795  		fprintf(stream, "%*sguest_phys: 0x%lx size: 0x%lx "
1796  			"host_virt: %p\n", indent + 2, "",
1797  			(uint64_t) region->region.guest_phys_addr,
1798  			(uint64_t) region->region.memory_size,
1799  			region->host_mem);
1800  		fprintf(stream, "%*sunused_phy_pages: ", indent + 2, "");
1801  		sparsebit_dump(stream, region->unused_phy_pages, 0);
1802  	}
1803  	fprintf(stream, "%*sMapped Virtual Pages:\n", indent, "");
1804  	sparsebit_dump(stream, vm->vpages_mapped, indent + 2);
1805  	fprintf(stream, "%*spgd_created: %u\n", indent, "",
1806  		vm->pgd_created);
1807  	if (vm->pgd_created) {
1808  		fprintf(stream, "%*sVirtual Translation Tables:\n",
1809  			indent + 2, "");
1810  		virt_dump(stream, vm, indent + 4);
1811  	}
1812  	fprintf(stream, "%*sVCPUs:\n", indent, "");
1813  
1814  	list_for_each_entry(vcpu, &vm->vcpus, list)
1815  		vcpu_dump(stream, vcpu, indent + 2);
1816  }
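
/*
 * Illustrative sketch (editorial addition, hypothetical helper): dump the
 * entire VM to stderr with a two-space indent when a test hits an unexpected
 * state; vm_phy_pages_alloc() below does the same before aborting.
 */
static void __attribute__((unused)) example_debug_dump(struct kvm_vm *vm)
{
	fputs("---- vm dump ----\n", stderr);
	vm_dump(stderr, vm, 2);
}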
1817  
1818  /* Known KVM exit reasons */
1819  static struct exit_reason {
1820  	unsigned int reason;
1821  	const char *name;
1822  } exit_reasons_known[] = {
1823  	{KVM_EXIT_UNKNOWN, "UNKNOWN"},
1824  	{KVM_EXIT_EXCEPTION, "EXCEPTION"},
1825  	{KVM_EXIT_IO, "IO"},
1826  	{KVM_EXIT_HYPERCALL, "HYPERCALL"},
1827  	{KVM_EXIT_DEBUG, "DEBUG"},
1828  	{KVM_EXIT_HLT, "HLT"},
1829  	{KVM_EXIT_MMIO, "MMIO"},
1830  	{KVM_EXIT_IRQ_WINDOW_OPEN, "IRQ_WINDOW_OPEN"},
1831  	{KVM_EXIT_SHUTDOWN, "SHUTDOWN"},
1832  	{KVM_EXIT_FAIL_ENTRY, "FAIL_ENTRY"},
1833  	{KVM_EXIT_INTR, "INTR"},
1834  	{KVM_EXIT_SET_TPR, "SET_TPR"},
1835  	{KVM_EXIT_TPR_ACCESS, "TPR_ACCESS"},
1836  	{KVM_EXIT_S390_SIEIC, "S390_SIEIC"},
1837  	{KVM_EXIT_S390_RESET, "S390_RESET"},
1838  	{KVM_EXIT_DCR, "DCR"},
1839  	{KVM_EXIT_NMI, "NMI"},
1840  	{KVM_EXIT_INTERNAL_ERROR, "INTERNAL_ERROR"},
1841  	{KVM_EXIT_OSI, "OSI"},
1842  	{KVM_EXIT_PAPR_HCALL, "PAPR_HCALL"},
1843  	{KVM_EXIT_DIRTY_RING_FULL, "DIRTY_RING_FULL"},
1844  	{KVM_EXIT_X86_RDMSR, "RDMSR"},
1845  	{KVM_EXIT_X86_WRMSR, "WRMSR"},
1846  	{KVM_EXIT_XEN, "XEN"},
1847  #ifdef KVM_EXIT_MEMORY_NOT_PRESENT
1848  	{KVM_EXIT_MEMORY_NOT_PRESENT, "MEMORY_NOT_PRESENT"},
1849  #endif
1850  };
1851  
1852  /*
1853   * Exit Reason String
1854   *
1855   * Input Args:
1856   *   exit_reason - Exit reason
1857   *
1858   * Output Args: None
1859   *
1860   * Return:
1861   *   Constant string pointer describing the exit reason.
1862   *
1863   * Locates and returns a constant string that describes the KVM exit
1864   * reason given by exit_reason.  If no such string is found, a constant
1865   * string of "Unknown" is returned.
1866   */
1867  const char *exit_reason_str(unsigned int exit_reason)
1868  {
1869  	unsigned int n1;
1870  
1871  	for (n1 = 0; n1 < ARRAY_SIZE(exit_reasons_known); n1++) {
1872  		if (exit_reason == exit_reasons_known[n1].reason)
1873  			return exit_reasons_known[n1].name;
1874  	}
1875  
1876  	return "Unknown";
1877  }
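
/*
 * Illustrative sketch (editorial addition, hypothetical helper): report an
 * unexpected exit reason in human readable form instead of as a bare number.
 */
static void __attribute__((unused)) example_expect_io(struct kvm_vcpu *vcpu)
{
	vcpu_run(vcpu);
	TEST_ASSERT(vcpu->run->exit_reason == KVM_EXIT_IO,
		    "Unexpected exit reason: %u (%s)",
		    vcpu->run->exit_reason,
		    exit_reason_str(vcpu->run->exit_reason));
}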
1878  
1879  /*
1880   * Physical Contiguous Page Allocator
1881   *
1882   * Input Args:
1883   *   vm - Virtual Machine
1884   *   num - number of pages
1885   *   paddr_min - Physical address minimum
1886   *   memslot - Memory region to allocate page from
1887   *
1888   * Output Args: None
1889   *
1890   * Return:
1891   *   Starting physical address
1892   *
1893   * Within the VM specified by vm, locates a range of available physical
1894   * pages at or above paddr_min. If found, the pages are marked as in use
1895   * and their base address is returned. A TEST_ASSERT failure occurs if
1896   * not enough pages are available at or above paddr_min.
1897   */
1898  vm_paddr_t vm_phy_pages_alloc(struct kvm_vm *vm, size_t num,
1899  			      vm_paddr_t paddr_min, uint32_t memslot)
1900  {
1901  	struct userspace_mem_region *region;
1902  	sparsebit_idx_t pg, base;
1903  
1904  	TEST_ASSERT(num > 0, "Must allocate at least one page");
1905  
1906  	TEST_ASSERT((paddr_min % vm->page_size) == 0, "Min physical address "
1907  		"not divisible by page size.\n"
1908  		"  paddr_min: 0x%lx page_size: 0x%x",
1909  		paddr_min, vm->page_size);
1910  
1911  	region = memslot2region(vm, memslot);
1912  	base = pg = paddr_min >> vm->page_shift;
1913  
1914  	do {
1915  		for (; pg < base + num; ++pg) {
1916  			if (!sparsebit_is_set(region->unused_phy_pages, pg)) {
1917  				base = pg = sparsebit_next_set(region->unused_phy_pages, pg);
1918  				break;
1919  			}
1920  		}
1921  	} while (pg && pg != base + num);
1922  
1923  	if (pg == 0) {
1924  		fprintf(stderr, "No guest physical page available, "
1925  			"paddr_min: 0x%lx page_size: 0x%x memslot: %u\n",
1926  			paddr_min, vm->page_size, memslot);
1927  		fputs("---- vm dump ----\n", stderr);
1928  		vm_dump(stderr, vm, 2);
1929  		abort();
1930  	}
1931  
1932  	for (pg = base; pg < base + num; ++pg)
1933  		sparsebit_clear(region->unused_phy_pages, pg);
1934  
1935  	return base * vm->page_size;
1936  }
1937  
1938  vm_paddr_t vm_phy_page_alloc(struct kvm_vm *vm, vm_paddr_t paddr_min,
1939  			     uint32_t memslot)
1940  {
1941  	return vm_phy_pages_alloc(vm, 1, paddr_min, memslot);
1942  }
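
/*
 * Illustrative sketch (editorial addition, hypothetical helper): allocate
 * four contiguous guest physical pages from memslot 0, at or above 1 MiB,
 * and touch them from the host.  The numbers are arbitrary examples.
 */
static void __attribute__((unused)) example_phy_alloc(struct kvm_vm *vm)
{
	vm_paddr_t gpa = vm_phy_pages_alloc(vm, 4, 0x100000, 0);
	uint8_t *hva = addr_gpa2hva(vm, gpa);

	*hva = 0xaa;	/* host-side write to the first new page */
}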
1943  
1944  /* Arbitrary minimum physical address used for virtual translation tables. */
1945  #define KVM_GUEST_PAGE_TABLE_MIN_PADDR 0x180000
1946  
1947  vm_paddr_t vm_alloc_page_table(struct kvm_vm *vm)
1948  {
1949  	return vm_phy_page_alloc(vm, KVM_GUEST_PAGE_TABLE_MIN_PADDR,
1950  				 vm->memslots[MEM_REGION_PT]);
1951  }
1952  
1953  /*
1954   * Address Guest Virtual to Host Virtual
1955   *
1956   * Input Args:
1957   *   vm - Virtual Machine
1958   *   gva - VM virtual address
1959   *
1960   * Output Args: None
1961   *
1962   * Return:
1963   *   Equivalent host virtual address
1964   */
1965  void *addr_gva2hva(struct kvm_vm *vm, vm_vaddr_t gva)
1966  {
1967  	return addr_gpa2hva(vm, addr_gva2gpa(vm, gva));
1968  }
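
/*
 * Illustrative sketch (editorial addition, hypothetical helper): host-side
 * write through a guest virtual address.  Assumes "gva" was allocated with
 * vm_vaddr_alloc() or similar, so that a translation exists.
 */
static void __attribute__((unused)) example_host_write(struct kvm_vm *vm,
							vm_vaddr_t gva)
{
	uint64_t *hva = addr_gva2hva(vm, gva);

	*hva = 0xcafe;	/* visible to the guest at "gva" */
}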
1969  
1970  unsigned long __weak vm_compute_max_gfn(struct kvm_vm *vm)
1971  {
1972  	return ((1ULL << vm->pa_bits) >> vm->page_shift) - 1;
1973  }
1974  
1975  static unsigned int vm_calc_num_pages(unsigned int num_pages,
1976  				      unsigned int page_shift,
1977  				      unsigned int new_page_shift,
1978  				      bool ceil)
1979  {
1980  	unsigned int n;
1981  
1982  	if (page_shift >= new_page_shift)
1983  		return num_pages * (1 << (page_shift - new_page_shift));
1984  	n = 1 << (new_page_shift - page_shift);	/* shift is >= 0 here */
1985  	return num_pages / n + !!(ceil && num_pages % n);
1986  }
1987  
1988  static inline int getpageshift(void)
1989  {
1990  	return __builtin_ffs(getpagesize()) - 1;
1991  }
1992  
1993  unsigned int
1994  vm_num_host_pages(enum vm_guest_mode mode, unsigned int num_guest_pages)
1995  {
1996  	return vm_calc_num_pages(num_guest_pages,
1997  				 vm_guest_mode_params[mode].page_shift,
1998  				 getpageshift(), true);
1999  }
2000  
2001  unsigned int
2002  vm_num_guest_pages(enum vm_guest_mode mode, unsigned int num_host_pages)
2003  {
2004  	return vm_calc_num_pages(num_host_pages, getpageshift(),
2005  				 vm_guest_mode_params[mode].page_shift, false);
2006  }
2007  
2008  unsigned int vm_calc_num_guest_pages(enum vm_guest_mode mode, size_t size)
2009  {
2010  	unsigned int n;
2011  	n = DIV_ROUND_UP(size, vm_guest_mode_params[mode].page_size);
2012  	return vm_adjust_num_guest_pages(mode, n);
2013  }
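
/*
 * Worked example (editorial addition): with 64 KiB guest pages on a 4 KiB
 * host, one guest page spans 16 host pages, so vm_num_host_pages(mode, 3)
 * returns 48, while vm_num_guest_pages(mode, 40) rounds down to 2 because
 * only 2 full guest pages fit in 40 host pages.
 */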
2014  
2015  /*
2016   * Read binary stats descriptors
2017   *
2018   * Input Args:
2019   *   stats_fd - the file descriptor for the binary stats file from which to read
2020   *   header - the binary stats metadata header corresponding to the given FD
2021   *
2022   * Output Args: None
2023   *
2024   * Return:
2025   *   A pointer to a newly allocated series of stat descriptors.
2026   *   Caller is responsible for freeing the returned kvm_stats_desc.
2027   *
2028   * Read the stats descriptors from the binary stats interface.
2029   */
2030  struct kvm_stats_desc *read_stats_descriptors(int stats_fd,
2031  					      struct kvm_stats_header *header)
2032  {
2033  	struct kvm_stats_desc *stats_desc;
2034  	ssize_t desc_size, total_size, ret;
2035  
2036  	desc_size = get_stats_descriptor_size(header);
2037  	total_size = header->num_desc * desc_size;
2038  
2039  	stats_desc = calloc(header->num_desc, desc_size);
2040  	TEST_ASSERT(stats_desc, "Failed to allocate memory for stats descriptors");
2041  
2042  	ret = pread(stats_fd, stats_desc, total_size, header->desc_offset);
2043  	TEST_ASSERT(ret == total_size, "Failed to read KVM stats descriptors");
2044  
2045  	return stats_desc;
2046  }
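
/*
 * Illustrative sketch (editorial addition, hypothetical helper): the usual
 * binary stats setup.  The caller owns both the descriptor array and the fd.
 */
static void __attribute__((unused)) example_stats_setup(struct kvm_vm *vm)
{
	struct kvm_stats_header header;
	struct kvm_stats_desc *desc;
	int stats_fd = vm_get_stats_fd(vm);

	read_stats_header(stats_fd, &header);
	desc = read_stats_descriptors(stats_fd, &header);

	/* ... look up individual descriptors and read their data ... */

	free(desc);
	close(stats_fd);
}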
2047  
2048  /*
2049   * Read stat data for a particular stat
2050   *
2051   * Input Args:
2052   *   stats_fd - the file descriptor for the binary stats file from which to read
2053   *   header - the binary stats metadata header corresponding to the given FD
2054   *   desc - the binary stat metadata for the particular stat to be read
2055   *   max_elements - the maximum number of 8-byte values to read into data
2056   *
2057   * Output Args:
2058   *   data - the buffer into which stat data should be read
2059   *
2060   * Read the data values of a specified stat from the binary stats interface.
2061   */
2062  void read_stat_data(int stats_fd, struct kvm_stats_header *header,
2063  		    struct kvm_stats_desc *desc, uint64_t *data,
2064  		    size_t max_elements)
2065  {
2066  	size_t nr_elements = min_t(ssize_t, desc->size, max_elements);
2067  	size_t size = nr_elements * sizeof(*data);
2068  	ssize_t ret;
2069  
2070  	TEST_ASSERT(desc->size, "No elements in stat '%s'", desc->name);
2071  	TEST_ASSERT(max_elements, "Zero elements requested for stat '%s'", desc->name);
2072  
2073  	ret = pread(stats_fd, data, size,
2074  		    header->data_offset + desc->offset);
2075  
2076  	TEST_ASSERT(ret >= 0, "pread() failed on stat '%s', errno: %i (%s)",
2077  		    desc->name, errno, strerror(errno));
2078  	TEST_ASSERT(ret == size,
2079  		    "pread() on stat '%s' read %ld bytes, wanted %lu bytes",
2080  		    desc->name, ret, size);
2081  }
2082  
2083  /*
2084   * Read the data of the named stat
2085   *
2086   * Input Args:
2087   *   vm - the VM for which the stat should be read
2088   *   stat_name - the name of the stat to read
2089   *   max_elements - the maximum number of 8-byte values to read into data
2090   *
2091   * Output Args:
2092   *   data - the buffer into which stat data should be read
2093   *
2094   * Read the data values of a specified stat from the binary stats interface.
2095   */
2096  void __vm_get_stat(struct kvm_vm *vm, const char *stat_name, uint64_t *data,
2097  		   size_t max_elements)
2098  {
2099  	struct kvm_stats_desc *desc;
2100  	size_t size_desc;
2101  	int i;
2102  
2103  	if (!vm->stats_fd) {
2104  		vm->stats_fd = vm_get_stats_fd(vm);
2105  		read_stats_header(vm->stats_fd, &vm->stats_header);
2106  		vm->stats_desc = read_stats_descriptors(vm->stats_fd,
2107  							&vm->stats_header);
2108  	}
2109  
2110  	size_desc = get_stats_descriptor_size(&vm->stats_header);
2111  
2112  	for (i = 0; i < vm->stats_header.num_desc; ++i) {
2113  		desc = (void *)vm->stats_desc + (i * size_desc);
2114  
2115  		if (strcmp(desc->name, stat_name))
2116  			continue;
2117  
2118  		read_stat_data(vm->stats_fd, &vm->stats_header, desc,
2119  			       data, max_elements);
2120  
2121  		break;
2122  	}
2123  }
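
/*
 * Illustrative sketch (editorial addition, hypothetical helper): read a
 * single VM stat by name.  "remote_tlb_flush" is only an example; available
 * stat names depend on the architecture and kernel version.
 */
static void __attribute__((unused)) example_read_one_stat(struct kvm_vm *vm)
{
	uint64_t val = 0;

	__vm_get_stat(vm, "remote_tlb_flush", &val, 1);
	pr_info("remote_tlb_flush: %lu\n", val);
}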
2124  
2125  __weak void kvm_arch_vm_post_create(struct kvm_vm *vm)
2126  {
2127  }
2128  
2129  __weak void kvm_selftest_arch_init(void)
2130  {
2131  }
2132  
2133  void __attribute((constructor)) kvm_selftest_init(void)
2134  {
2135  	/* Tell stdout not to buffer its content. */
2136  	setbuf(stdout, NULL);
2137  
2138  	kvm_selftest_arch_init();
2139  }
2140