1 // SPDX-License-Identifier: GPL-2.0-only 2 /* 3 * tools/testing/selftests/kvm/lib/kvm_util.c 4 * 5 * Copyright (C) 2018, Google LLC. 6 */ 7 #include "test_util.h" 8 #include "kvm_util.h" 9 #include "processor.h" 10 #include "ucall_common.h" 11 12 #include <assert.h> 13 #include <sched.h> 14 #include <sys/mman.h> 15 #include <sys/types.h> 16 #include <sys/stat.h> 17 #include <unistd.h> 18 #include <linux/kernel.h> 19 20 #define KVM_UTIL_MIN_PFN 2 21 22 uint32_t guest_random_seed; 23 struct guest_random_state guest_rng; 24 static uint32_t last_guest_seed; 25 26 static int vcpu_mmap_sz(void); 27 28 int open_path_or_exit(const char *path, int flags) 29 { 30 int fd; 31 32 fd = open(path, flags); 33 __TEST_REQUIRE(fd >= 0 || errno != ENOENT, "Cannot open %s: %s", path, strerror(errno)); 34 TEST_ASSERT(fd >= 0, "Failed to open '%s'", path); 35 36 return fd; 37 } 38 39 /* 40 * Open KVM_DEV_PATH if available, otherwise exit the entire program. 41 * 42 * Input Args: 43 * flags - The flags to pass when opening KVM_DEV_PATH. 44 * 45 * Return: 46 * The opened file descriptor of /dev/kvm. 47 */ 48 static int _open_kvm_dev_path_or_exit(int flags) 49 { 50 return open_path_or_exit(KVM_DEV_PATH, flags); 51 } 52 53 int open_kvm_dev_path_or_exit(void) 54 { 55 return _open_kvm_dev_path_or_exit(O_RDONLY); 56 } 57 58 static ssize_t get_module_param(const char *module_name, const char *param, 59 void *buffer, size_t buffer_size) 60 { 61 const int path_size = 128; 62 char path[path_size]; 63 ssize_t bytes_read; 64 int fd, r; 65 66 r = snprintf(path, path_size, "/sys/module/%s/parameters/%s", 67 module_name, param); 68 TEST_ASSERT(r < path_size, 69 "Failed to construct sysfs path in %d bytes.", path_size); 70 71 fd = open_path_or_exit(path, O_RDONLY); 72 73 bytes_read = read(fd, buffer, buffer_size); 74 TEST_ASSERT(bytes_read > 0, "read(%s) returned %ld, wanted %ld bytes", 75 path, bytes_read, buffer_size); 76 77 r = close(fd); 78 TEST_ASSERT(!r, "close(%s) failed", path); 79 return bytes_read; 80 } 81 82 static int get_module_param_integer(const char *module_name, const char *param) 83 { 84 /* 85 * 16 bytes to hold a 64-bit value (1 byte per char), 1 byte for the 86 * NUL char, and 1 byte because the kernel sucks and inserts a newline 87 * at the end. 88 */ 89 char value[16 + 1 + 1]; 90 ssize_t r; 91 92 memset(value, '\0', sizeof(value)); 93 94 r = get_module_param(module_name, param, value, sizeof(value)); 95 TEST_ASSERT(value[r - 1] == '\n', 96 "Expected trailing newline, got char '%c'", value[r - 1]); 97 98 /* 99 * Squash the newline, otherwise atoi_paranoid() will complain about 100 * trailing non-NUL characters in the string. 101 */ 102 value[r - 1] = '\0'; 103 return atoi_paranoid(value); 104 } 105 106 static bool get_module_param_bool(const char *module_name, const char *param) 107 { 108 char value; 109 ssize_t r; 110 111 r = get_module_param(module_name, param, &value, sizeof(value)); 112 TEST_ASSERT_EQ(r, 1); 113 114 if (value == 'Y') 115 return true; 116 else if (value == 'N') 117 return false; 118 119 TEST_FAIL("Unrecognized value '%c' for boolean module param", value); 120 } 121 122 bool get_kvm_param_bool(const char *param) 123 { 124 return get_module_param_bool("kvm", param); 125 } 126 127 bool get_kvm_intel_param_bool(const char *param) 128 { 129 return get_module_param_bool("kvm_intel", param); 130 } 131 132 bool get_kvm_amd_param_bool(const char *param) 133 { 134 return get_module_param_bool("kvm_amd", param); 135 } 136 137 int get_kvm_param_integer(const char *param) 138 { 139 return get_module_param_integer("kvm", param); 140 } 141 142 int get_kvm_intel_param_integer(const char *param) 143 { 144 return get_module_param_integer("kvm_intel", param); 145 } 146 147 int get_kvm_amd_param_integer(const char *param) 148 { 149 return get_module_param_integer("kvm_amd", param); 150 } 151 152 /* 153 * Capability 154 * 155 * Input Args: 156 * cap - Capability 157 * 158 * Output Args: None 159 * 160 * Return: 161 * On success, the Value corresponding to the capability (KVM_CAP_*) 162 * specified by the value of cap. On failure a TEST_ASSERT failure 163 * is produced. 164 * 165 * Looks up and returns the value corresponding to the capability 166 * (KVM_CAP_*) given by cap. 167 */ 168 unsigned int kvm_check_cap(long cap) 169 { 170 int ret; 171 int kvm_fd; 172 173 kvm_fd = open_kvm_dev_path_or_exit(); 174 ret = __kvm_ioctl(kvm_fd, KVM_CHECK_EXTENSION, (void *)cap); 175 TEST_ASSERT(ret >= 0, KVM_IOCTL_ERROR(KVM_CHECK_EXTENSION, ret)); 176 177 close(kvm_fd); 178 179 return (unsigned int)ret; 180 } 181 182 void vm_enable_dirty_ring(struct kvm_vm *vm, uint32_t ring_size) 183 { 184 if (vm_check_cap(vm, KVM_CAP_DIRTY_LOG_RING_ACQ_REL)) 185 vm_enable_cap(vm, KVM_CAP_DIRTY_LOG_RING_ACQ_REL, ring_size); 186 else 187 vm_enable_cap(vm, KVM_CAP_DIRTY_LOG_RING, ring_size); 188 vm->dirty_ring_size = ring_size; 189 } 190 191 static void vm_open(struct kvm_vm *vm) 192 { 193 vm->kvm_fd = _open_kvm_dev_path_or_exit(O_RDWR); 194 195 TEST_REQUIRE(kvm_has_cap(KVM_CAP_IMMEDIATE_EXIT)); 196 197 vm->fd = __kvm_ioctl(vm->kvm_fd, KVM_CREATE_VM, (void *)vm->type); 198 TEST_ASSERT(vm->fd >= 0, KVM_IOCTL_ERROR(KVM_CREATE_VM, vm->fd)); 199 } 200 201 const char *vm_guest_mode_string(uint32_t i) 202 { 203 static const char * const strings[] = { 204 [VM_MODE_P52V48_4K] = "PA-bits:52, VA-bits:48, 4K pages", 205 [VM_MODE_P52V48_16K] = "PA-bits:52, VA-bits:48, 16K pages", 206 [VM_MODE_P52V48_64K] = "PA-bits:52, VA-bits:48, 64K pages", 207 [VM_MODE_P48V48_4K] = "PA-bits:48, VA-bits:48, 4K pages", 208 [VM_MODE_P48V48_16K] = "PA-bits:48, VA-bits:48, 16K pages", 209 [VM_MODE_P48V48_64K] = "PA-bits:48, VA-bits:48, 64K pages", 210 [VM_MODE_P40V48_4K] = "PA-bits:40, VA-bits:48, 4K pages", 211 [VM_MODE_P40V48_16K] = "PA-bits:40, VA-bits:48, 16K pages", 212 [VM_MODE_P40V48_64K] = "PA-bits:40, VA-bits:48, 64K pages", 213 [VM_MODE_PXXV48_4K] = "PA-bits:ANY, VA-bits:48, 4K pages", 214 [VM_MODE_P47V64_4K] = "PA-bits:47, VA-bits:64, 4K pages", 215 [VM_MODE_P44V64_4K] = "PA-bits:44, VA-bits:64, 4K pages", 216 [VM_MODE_P36V48_4K] = "PA-bits:36, VA-bits:48, 4K pages", 217 [VM_MODE_P36V48_16K] = "PA-bits:36, VA-bits:48, 16K pages", 218 [VM_MODE_P36V48_64K] = "PA-bits:36, VA-bits:48, 64K pages", 219 [VM_MODE_P36V47_16K] = "PA-bits:36, VA-bits:47, 16K pages", 220 }; 221 _Static_assert(sizeof(strings)/sizeof(char *) == NUM_VM_MODES, 222 "Missing new mode strings?"); 223 224 TEST_ASSERT(i < NUM_VM_MODES, "Guest mode ID %d too big", i); 225 226 return strings[i]; 227 } 228 229 const struct vm_guest_mode_params vm_guest_mode_params[] = { 230 [VM_MODE_P52V48_4K] = { 52, 48, 0x1000, 12 }, 231 [VM_MODE_P52V48_16K] = { 52, 48, 0x4000, 14 }, 232 [VM_MODE_P52V48_64K] = { 52, 48, 0x10000, 16 }, 233 [VM_MODE_P48V48_4K] = { 48, 48, 0x1000, 12 }, 234 [VM_MODE_P48V48_16K] = { 48, 48, 0x4000, 14 }, 235 [VM_MODE_P48V48_64K] = { 48, 48, 0x10000, 16 }, 236 [VM_MODE_P40V48_4K] = { 40, 48, 0x1000, 12 }, 237 [VM_MODE_P40V48_16K] = { 40, 48, 0x4000, 14 }, 238 [VM_MODE_P40V48_64K] = { 40, 48, 0x10000, 16 }, 239 [VM_MODE_PXXV48_4K] = { 0, 0, 0x1000, 12 }, 240 [VM_MODE_P47V64_4K] = { 47, 64, 0x1000, 12 }, 241 [VM_MODE_P44V64_4K] = { 44, 64, 0x1000, 12 }, 242 [VM_MODE_P36V48_4K] = { 36, 48, 0x1000, 12 }, 243 [VM_MODE_P36V48_16K] = { 36, 48, 0x4000, 14 }, 244 [VM_MODE_P36V48_64K] = { 36, 48, 0x10000, 16 }, 245 [VM_MODE_P36V47_16K] = { 36, 47, 0x4000, 14 }, 246 }; 247 _Static_assert(sizeof(vm_guest_mode_params)/sizeof(struct vm_guest_mode_params) == NUM_VM_MODES, 248 "Missing new mode params?"); 249 250 /* 251 * Initializes vm->vpages_valid to match the canonical VA space of the 252 * architecture. 253 * 254 * The default implementation is valid for architectures which split the 255 * range addressed by a single page table into a low and high region 256 * based on the MSB of the VA. On architectures with this behavior 257 * the VA region spans [0, 2^(va_bits - 1)), [-(2^(va_bits - 1), -1]. 258 */ 259 __weak void vm_vaddr_populate_bitmap(struct kvm_vm *vm) 260 { 261 sparsebit_set_num(vm->vpages_valid, 262 0, (1ULL << (vm->va_bits - 1)) >> vm->page_shift); 263 sparsebit_set_num(vm->vpages_valid, 264 (~((1ULL << (vm->va_bits - 1)) - 1)) >> vm->page_shift, 265 (1ULL << (vm->va_bits - 1)) >> vm->page_shift); 266 } 267 268 struct kvm_vm *____vm_create(struct vm_shape shape) 269 { 270 struct kvm_vm *vm; 271 272 vm = calloc(1, sizeof(*vm)); 273 TEST_ASSERT(vm != NULL, "Insufficient Memory"); 274 275 INIT_LIST_HEAD(&vm->vcpus); 276 vm->regions.gpa_tree = RB_ROOT; 277 vm->regions.hva_tree = RB_ROOT; 278 hash_init(vm->regions.slot_hash); 279 280 vm->mode = shape.mode; 281 vm->type = shape.type; 282 283 vm->pa_bits = vm_guest_mode_params[vm->mode].pa_bits; 284 vm->va_bits = vm_guest_mode_params[vm->mode].va_bits; 285 vm->page_size = vm_guest_mode_params[vm->mode].page_size; 286 vm->page_shift = vm_guest_mode_params[vm->mode].page_shift; 287 288 /* Setup mode specific traits. */ 289 switch (vm->mode) { 290 case VM_MODE_P52V48_4K: 291 vm->pgtable_levels = 4; 292 break; 293 case VM_MODE_P52V48_64K: 294 vm->pgtable_levels = 3; 295 break; 296 case VM_MODE_P48V48_4K: 297 vm->pgtable_levels = 4; 298 break; 299 case VM_MODE_P48V48_64K: 300 vm->pgtable_levels = 3; 301 break; 302 case VM_MODE_P40V48_4K: 303 case VM_MODE_P36V48_4K: 304 vm->pgtable_levels = 4; 305 break; 306 case VM_MODE_P40V48_64K: 307 case VM_MODE_P36V48_64K: 308 vm->pgtable_levels = 3; 309 break; 310 case VM_MODE_P52V48_16K: 311 case VM_MODE_P48V48_16K: 312 case VM_MODE_P40V48_16K: 313 case VM_MODE_P36V48_16K: 314 vm->pgtable_levels = 4; 315 break; 316 case VM_MODE_P36V47_16K: 317 vm->pgtable_levels = 3; 318 break; 319 case VM_MODE_PXXV48_4K: 320 #ifdef __x86_64__ 321 kvm_get_cpu_address_width(&vm->pa_bits, &vm->va_bits); 322 kvm_init_vm_address_properties(vm); 323 /* 324 * Ignore KVM support for 5-level paging (vm->va_bits == 57), 325 * it doesn't take effect unless a CR4.LA57 is set, which it 326 * isn't for this mode (48-bit virtual address space). 327 */ 328 TEST_ASSERT(vm->va_bits == 48 || vm->va_bits == 57, 329 "Linear address width (%d bits) not supported", 330 vm->va_bits); 331 pr_debug("Guest physical address width detected: %d\n", 332 vm->pa_bits); 333 vm->pgtable_levels = 4; 334 vm->va_bits = 48; 335 #else 336 TEST_FAIL("VM_MODE_PXXV48_4K not supported on non-x86 platforms"); 337 #endif 338 break; 339 case VM_MODE_P47V64_4K: 340 vm->pgtable_levels = 5; 341 break; 342 case VM_MODE_P44V64_4K: 343 vm->pgtable_levels = 5; 344 break; 345 default: 346 TEST_FAIL("Unknown guest mode: 0x%x", vm->mode); 347 } 348 349 #ifdef __aarch64__ 350 TEST_ASSERT(!vm->type, "ARM doesn't support test-provided types"); 351 if (vm->pa_bits != 40) 352 vm->type = KVM_VM_TYPE_ARM_IPA_SIZE(vm->pa_bits); 353 #endif 354 355 vm_open(vm); 356 357 /* Limit to VA-bit canonical virtual addresses. */ 358 vm->vpages_valid = sparsebit_alloc(); 359 vm_vaddr_populate_bitmap(vm); 360 361 /* Limit physical addresses to PA-bits. */ 362 vm->max_gfn = vm_compute_max_gfn(vm); 363 364 /* Allocate and setup memory for guest. */ 365 vm->vpages_mapped = sparsebit_alloc(); 366 367 return vm; 368 } 369 370 static uint64_t vm_nr_pages_required(enum vm_guest_mode mode, 371 uint32_t nr_runnable_vcpus, 372 uint64_t extra_mem_pages) 373 { 374 uint64_t page_size = vm_guest_mode_params[mode].page_size; 375 uint64_t nr_pages; 376 377 TEST_ASSERT(nr_runnable_vcpus, 378 "Use vm_create_barebones() for VMs that _never_ have vCPUs"); 379 380 TEST_ASSERT(nr_runnable_vcpus <= kvm_check_cap(KVM_CAP_MAX_VCPUS), 381 "nr_vcpus = %d too large for host, max-vcpus = %d", 382 nr_runnable_vcpus, kvm_check_cap(KVM_CAP_MAX_VCPUS)); 383 384 /* 385 * Arbitrarily allocate 512 pages (2mb when page size is 4kb) for the 386 * test code and other per-VM assets that will be loaded into memslot0. 387 */ 388 nr_pages = 512; 389 390 /* Account for the per-vCPU stacks on behalf of the test. */ 391 nr_pages += nr_runnable_vcpus * DEFAULT_STACK_PGS; 392 393 /* 394 * Account for the number of pages needed for the page tables. The 395 * maximum page table size for a memory region will be when the 396 * smallest page size is used. Considering each page contains x page 397 * table descriptors, the total extra size for page tables (for extra 398 * N pages) will be: N/x+N/x^2+N/x^3+... which is definitely smaller 399 * than N/x*2. 400 */ 401 nr_pages += (nr_pages + extra_mem_pages) / PTES_PER_MIN_PAGE * 2; 402 403 /* Account for the number of pages needed by ucall. */ 404 nr_pages += ucall_nr_pages_required(page_size); 405 406 return vm_adjust_num_guest_pages(mode, nr_pages); 407 } 408 409 struct kvm_vm *__vm_create(struct vm_shape shape, uint32_t nr_runnable_vcpus, 410 uint64_t nr_extra_pages) 411 { 412 uint64_t nr_pages = vm_nr_pages_required(shape.mode, nr_runnable_vcpus, 413 nr_extra_pages); 414 struct userspace_mem_region *slot0; 415 struct kvm_vm *vm; 416 int i; 417 418 pr_debug("%s: mode='%s' type='%d', pages='%ld'\n", __func__, 419 vm_guest_mode_string(shape.mode), shape.type, nr_pages); 420 421 vm = ____vm_create(shape); 422 423 vm_userspace_mem_region_add(vm, VM_MEM_SRC_ANONYMOUS, 0, 0, nr_pages, 0); 424 for (i = 0; i < NR_MEM_REGIONS; i++) 425 vm->memslots[i] = 0; 426 427 kvm_vm_elf_load(vm, program_invocation_name); 428 429 /* 430 * TODO: Add proper defines to protect the library's memslots, and then 431 * carve out memslot1 for the ucall MMIO address. KVM treats writes to 432 * read-only memslots as MMIO, and creating a read-only memslot for the 433 * MMIO region would prevent silently clobbering the MMIO region. 434 */ 435 slot0 = memslot2region(vm, 0); 436 ucall_init(vm, slot0->region.guest_phys_addr + slot0->region.memory_size); 437 438 if (guest_random_seed != last_guest_seed) { 439 pr_info("Random seed: 0x%x\n", guest_random_seed); 440 last_guest_seed = guest_random_seed; 441 } 442 guest_rng = new_guest_random_state(guest_random_seed); 443 sync_global_to_guest(vm, guest_rng); 444 445 kvm_arch_vm_post_create(vm); 446 447 return vm; 448 } 449 450 /* 451 * VM Create with customized parameters 452 * 453 * Input Args: 454 * mode - VM Mode (e.g. VM_MODE_P52V48_4K) 455 * nr_vcpus - VCPU count 456 * extra_mem_pages - Non-slot0 physical memory total size 457 * guest_code - Guest entry point 458 * vcpuids - VCPU IDs 459 * 460 * Output Args: None 461 * 462 * Return: 463 * Pointer to opaque structure that describes the created VM. 464 * 465 * Creates a VM with the mode specified by mode (e.g. VM_MODE_P52V48_4K). 466 * extra_mem_pages is only used to calculate the maximum page table size, 467 * no real memory allocation for non-slot0 memory in this function. 468 */ 469 struct kvm_vm *__vm_create_with_vcpus(struct vm_shape shape, uint32_t nr_vcpus, 470 uint64_t extra_mem_pages, 471 void *guest_code, struct kvm_vcpu *vcpus[]) 472 { 473 struct kvm_vm *vm; 474 int i; 475 476 TEST_ASSERT(!nr_vcpus || vcpus, "Must provide vCPU array"); 477 478 vm = __vm_create(shape, nr_vcpus, extra_mem_pages); 479 480 for (i = 0; i < nr_vcpus; ++i) 481 vcpus[i] = vm_vcpu_add(vm, i, guest_code); 482 483 return vm; 484 } 485 486 struct kvm_vm *__vm_create_shape_with_one_vcpu(struct vm_shape shape, 487 struct kvm_vcpu **vcpu, 488 uint64_t extra_mem_pages, 489 void *guest_code) 490 { 491 struct kvm_vcpu *vcpus[1]; 492 struct kvm_vm *vm; 493 494 vm = __vm_create_with_vcpus(shape, 1, extra_mem_pages, guest_code, vcpus); 495 496 *vcpu = vcpus[0]; 497 return vm; 498 } 499 500 /* 501 * VM Restart 502 * 503 * Input Args: 504 * vm - VM that has been released before 505 * 506 * Output Args: None 507 * 508 * Reopens the file descriptors associated to the VM and reinstates the 509 * global state, such as the irqchip and the memory regions that are mapped 510 * into the guest. 511 */ 512 void kvm_vm_restart(struct kvm_vm *vmp) 513 { 514 int ctr; 515 struct userspace_mem_region *region; 516 517 vm_open(vmp); 518 if (vmp->has_irqchip) 519 vm_create_irqchip(vmp); 520 521 hash_for_each(vmp->regions.slot_hash, ctr, region, slot_node) { 522 int ret = ioctl(vmp->fd, KVM_SET_USER_MEMORY_REGION2, ®ion->region); 523 524 TEST_ASSERT(ret == 0, "KVM_SET_USER_MEMORY_REGION2 IOCTL failed,\n" 525 " rc: %i errno: %i\n" 526 " slot: %u flags: 0x%x\n" 527 " guest_phys_addr: 0x%llx size: 0x%llx", 528 ret, errno, region->region.slot, 529 region->region.flags, 530 region->region.guest_phys_addr, 531 region->region.memory_size); 532 } 533 } 534 535 __weak struct kvm_vcpu *vm_arch_vcpu_recreate(struct kvm_vm *vm, 536 uint32_t vcpu_id) 537 { 538 return __vm_vcpu_add(vm, vcpu_id); 539 } 540 541 struct kvm_vcpu *vm_recreate_with_one_vcpu(struct kvm_vm *vm) 542 { 543 kvm_vm_restart(vm); 544 545 return vm_vcpu_recreate(vm, 0); 546 } 547 548 void kvm_pin_this_task_to_pcpu(uint32_t pcpu) 549 { 550 cpu_set_t mask; 551 int r; 552 553 CPU_ZERO(&mask); 554 CPU_SET(pcpu, &mask); 555 r = sched_setaffinity(0, sizeof(mask), &mask); 556 TEST_ASSERT(!r, "sched_setaffinity() failed for pCPU '%u'.", pcpu); 557 } 558 559 static uint32_t parse_pcpu(const char *cpu_str, const cpu_set_t *allowed_mask) 560 { 561 uint32_t pcpu = atoi_non_negative("CPU number", cpu_str); 562 563 TEST_ASSERT(CPU_ISSET(pcpu, allowed_mask), 564 "Not allowed to run on pCPU '%d', check cgroups?", pcpu); 565 return pcpu; 566 } 567 568 void kvm_print_vcpu_pinning_help(void) 569 { 570 const char *name = program_invocation_name; 571 572 printf(" -c: Pin tasks to physical CPUs. Takes a list of comma separated\n" 573 " values (target pCPU), one for each vCPU, plus an optional\n" 574 " entry for the main application task (specified via entry\n" 575 " <nr_vcpus + 1>). If used, entries must be provided for all\n" 576 " vCPUs, i.e. pinning vCPUs is all or nothing.\n\n" 577 " E.g. to create 3 vCPUs, pin vCPU0=>pCPU22, vCPU1=>pCPU23,\n" 578 " vCPU2=>pCPU24, and pin the application task to pCPU50:\n\n" 579 " %s -v 3 -c 22,23,24,50\n\n" 580 " To leave the application task unpinned, drop the final entry:\n\n" 581 " %s -v 3 -c 22,23,24\n\n" 582 " (default: no pinning)\n", name, name); 583 } 584 585 void kvm_parse_vcpu_pinning(const char *pcpus_string, uint32_t vcpu_to_pcpu[], 586 int nr_vcpus) 587 { 588 cpu_set_t allowed_mask; 589 char *cpu, *cpu_list; 590 char delim[2] = ","; 591 int i, r; 592 593 cpu_list = strdup(pcpus_string); 594 TEST_ASSERT(cpu_list, "strdup() allocation failed."); 595 596 r = sched_getaffinity(0, sizeof(allowed_mask), &allowed_mask); 597 TEST_ASSERT(!r, "sched_getaffinity() failed"); 598 599 cpu = strtok(cpu_list, delim); 600 601 /* 1. Get all pcpus for vcpus. */ 602 for (i = 0; i < nr_vcpus; i++) { 603 TEST_ASSERT(cpu, "pCPU not provided for vCPU '%d'", i); 604 vcpu_to_pcpu[i] = parse_pcpu(cpu, &allowed_mask); 605 cpu = strtok(NULL, delim); 606 } 607 608 /* 2. Check if the main worker needs to be pinned. */ 609 if (cpu) { 610 kvm_pin_this_task_to_pcpu(parse_pcpu(cpu, &allowed_mask)); 611 cpu = strtok(NULL, delim); 612 } 613 614 TEST_ASSERT(!cpu, "pCPU list contains trailing garbage characters '%s'", cpu); 615 free(cpu_list); 616 } 617 618 /* 619 * Userspace Memory Region Find 620 * 621 * Input Args: 622 * vm - Virtual Machine 623 * start - Starting VM physical address 624 * end - Ending VM physical address, inclusive. 625 * 626 * Output Args: None 627 * 628 * Return: 629 * Pointer to overlapping region, NULL if no such region. 630 * 631 * Searches for a region with any physical memory that overlaps with 632 * any portion of the guest physical addresses from start to end 633 * inclusive. If multiple overlapping regions exist, a pointer to any 634 * of the regions is returned. Null is returned only when no overlapping 635 * region exists. 636 */ 637 static struct userspace_mem_region * 638 userspace_mem_region_find(struct kvm_vm *vm, uint64_t start, uint64_t end) 639 { 640 struct rb_node *node; 641 642 for (node = vm->regions.gpa_tree.rb_node; node; ) { 643 struct userspace_mem_region *region = 644 container_of(node, struct userspace_mem_region, gpa_node); 645 uint64_t existing_start = region->region.guest_phys_addr; 646 uint64_t existing_end = region->region.guest_phys_addr 647 + region->region.memory_size - 1; 648 if (start <= existing_end && end >= existing_start) 649 return region; 650 651 if (start < existing_start) 652 node = node->rb_left; 653 else 654 node = node->rb_right; 655 } 656 657 return NULL; 658 } 659 660 __weak void vcpu_arch_free(struct kvm_vcpu *vcpu) 661 { 662 663 } 664 665 /* 666 * VM VCPU Remove 667 * 668 * Input Args: 669 * vcpu - VCPU to remove 670 * 671 * Output Args: None 672 * 673 * Return: None, TEST_ASSERT failures for all error conditions 674 * 675 * Removes a vCPU from a VM and frees its resources. 676 */ 677 static void vm_vcpu_rm(struct kvm_vm *vm, struct kvm_vcpu *vcpu) 678 { 679 int ret; 680 681 if (vcpu->dirty_gfns) { 682 ret = munmap(vcpu->dirty_gfns, vm->dirty_ring_size); 683 TEST_ASSERT(!ret, __KVM_SYSCALL_ERROR("munmap()", ret)); 684 vcpu->dirty_gfns = NULL; 685 } 686 687 ret = munmap(vcpu->run, vcpu_mmap_sz()); 688 TEST_ASSERT(!ret, __KVM_SYSCALL_ERROR("munmap()", ret)); 689 690 ret = close(vcpu->fd); 691 TEST_ASSERT(!ret, __KVM_SYSCALL_ERROR("close()", ret)); 692 693 list_del(&vcpu->list); 694 695 vcpu_arch_free(vcpu); 696 free(vcpu); 697 } 698 699 void kvm_vm_release(struct kvm_vm *vmp) 700 { 701 struct kvm_vcpu *vcpu, *tmp; 702 int ret; 703 704 list_for_each_entry_safe(vcpu, tmp, &vmp->vcpus, list) 705 vm_vcpu_rm(vmp, vcpu); 706 707 ret = close(vmp->fd); 708 TEST_ASSERT(!ret, __KVM_SYSCALL_ERROR("close()", ret)); 709 710 ret = close(vmp->kvm_fd); 711 TEST_ASSERT(!ret, __KVM_SYSCALL_ERROR("close()", ret)); 712 } 713 714 static void __vm_mem_region_delete(struct kvm_vm *vm, 715 struct userspace_mem_region *region) 716 { 717 int ret; 718 719 rb_erase(®ion->gpa_node, &vm->regions.gpa_tree); 720 rb_erase(®ion->hva_node, &vm->regions.hva_tree); 721 hash_del(®ion->slot_node); 722 723 sparsebit_free(®ion->unused_phy_pages); 724 sparsebit_free(®ion->protected_phy_pages); 725 ret = munmap(region->mmap_start, region->mmap_size); 726 TEST_ASSERT(!ret, __KVM_SYSCALL_ERROR("munmap()", ret)); 727 if (region->fd >= 0) { 728 /* There's an extra map when using shared memory. */ 729 ret = munmap(region->mmap_alias, region->mmap_size); 730 TEST_ASSERT(!ret, __KVM_SYSCALL_ERROR("munmap()", ret)); 731 close(region->fd); 732 } 733 if (region->region.guest_memfd >= 0) 734 close(region->region.guest_memfd); 735 736 free(region); 737 } 738 739 /* 740 * Destroys and frees the VM pointed to by vmp. 741 */ 742 void kvm_vm_free(struct kvm_vm *vmp) 743 { 744 int ctr; 745 struct hlist_node *node; 746 struct userspace_mem_region *region; 747 748 if (vmp == NULL) 749 return; 750 751 /* Free cached stats metadata and close FD */ 752 if (vmp->stats_fd) { 753 free(vmp->stats_desc); 754 close(vmp->stats_fd); 755 } 756 757 /* Free userspace_mem_regions. */ 758 hash_for_each_safe(vmp->regions.slot_hash, ctr, node, region, slot_node) 759 __vm_mem_region_delete(vmp, region); 760 761 /* Free sparsebit arrays. */ 762 sparsebit_free(&vmp->vpages_valid); 763 sparsebit_free(&vmp->vpages_mapped); 764 765 kvm_vm_release(vmp); 766 767 /* Free the structure describing the VM. */ 768 free(vmp); 769 } 770 771 int kvm_memfd_alloc(size_t size, bool hugepages) 772 { 773 int memfd_flags = MFD_CLOEXEC; 774 int fd, r; 775 776 if (hugepages) 777 memfd_flags |= MFD_HUGETLB; 778 779 fd = memfd_create("kvm_selftest", memfd_flags); 780 TEST_ASSERT(fd != -1, __KVM_SYSCALL_ERROR("memfd_create()", fd)); 781 782 r = ftruncate(fd, size); 783 TEST_ASSERT(!r, __KVM_SYSCALL_ERROR("ftruncate()", r)); 784 785 r = fallocate(fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, 0, size); 786 TEST_ASSERT(!r, __KVM_SYSCALL_ERROR("fallocate()", r)); 787 788 return fd; 789 } 790 791 static void vm_userspace_mem_region_gpa_insert(struct rb_root *gpa_tree, 792 struct userspace_mem_region *region) 793 { 794 struct rb_node **cur, *parent; 795 796 for (cur = &gpa_tree->rb_node, parent = NULL; *cur; ) { 797 struct userspace_mem_region *cregion; 798 799 cregion = container_of(*cur, typeof(*cregion), gpa_node); 800 parent = *cur; 801 if (region->region.guest_phys_addr < 802 cregion->region.guest_phys_addr) 803 cur = &(*cur)->rb_left; 804 else { 805 TEST_ASSERT(region->region.guest_phys_addr != 806 cregion->region.guest_phys_addr, 807 "Duplicate GPA in region tree"); 808 809 cur = &(*cur)->rb_right; 810 } 811 } 812 813 rb_link_node(®ion->gpa_node, parent, cur); 814 rb_insert_color(®ion->gpa_node, gpa_tree); 815 } 816 817 static void vm_userspace_mem_region_hva_insert(struct rb_root *hva_tree, 818 struct userspace_mem_region *region) 819 { 820 struct rb_node **cur, *parent; 821 822 for (cur = &hva_tree->rb_node, parent = NULL; *cur; ) { 823 struct userspace_mem_region *cregion; 824 825 cregion = container_of(*cur, typeof(*cregion), hva_node); 826 parent = *cur; 827 if (region->host_mem < cregion->host_mem) 828 cur = &(*cur)->rb_left; 829 else { 830 TEST_ASSERT(region->host_mem != 831 cregion->host_mem, 832 "Duplicate HVA in region tree"); 833 834 cur = &(*cur)->rb_right; 835 } 836 } 837 838 rb_link_node(®ion->hva_node, parent, cur); 839 rb_insert_color(®ion->hva_node, hva_tree); 840 } 841 842 843 int __vm_set_user_memory_region(struct kvm_vm *vm, uint32_t slot, uint32_t flags, 844 uint64_t gpa, uint64_t size, void *hva) 845 { 846 struct kvm_userspace_memory_region region = { 847 .slot = slot, 848 .flags = flags, 849 .guest_phys_addr = gpa, 850 .memory_size = size, 851 .userspace_addr = (uintptr_t)hva, 852 }; 853 854 return ioctl(vm->fd, KVM_SET_USER_MEMORY_REGION, ®ion); 855 } 856 857 void vm_set_user_memory_region(struct kvm_vm *vm, uint32_t slot, uint32_t flags, 858 uint64_t gpa, uint64_t size, void *hva) 859 { 860 int ret = __vm_set_user_memory_region(vm, slot, flags, gpa, size, hva); 861 862 TEST_ASSERT(!ret, "KVM_SET_USER_MEMORY_REGION failed, errno = %d (%s)", 863 errno, strerror(errno)); 864 } 865 866 #define TEST_REQUIRE_SET_USER_MEMORY_REGION2() \ 867 __TEST_REQUIRE(kvm_has_cap(KVM_CAP_USER_MEMORY2), \ 868 "KVM selftests now require KVM_SET_USER_MEMORY_REGION2 (introduced in v6.8)") 869 870 int __vm_set_user_memory_region2(struct kvm_vm *vm, uint32_t slot, uint32_t flags, 871 uint64_t gpa, uint64_t size, void *hva, 872 uint32_t guest_memfd, uint64_t guest_memfd_offset) 873 { 874 struct kvm_userspace_memory_region2 region = { 875 .slot = slot, 876 .flags = flags, 877 .guest_phys_addr = gpa, 878 .memory_size = size, 879 .userspace_addr = (uintptr_t)hva, 880 .guest_memfd = guest_memfd, 881 .guest_memfd_offset = guest_memfd_offset, 882 }; 883 884 TEST_REQUIRE_SET_USER_MEMORY_REGION2(); 885 886 return ioctl(vm->fd, KVM_SET_USER_MEMORY_REGION2, ®ion); 887 } 888 889 void vm_set_user_memory_region2(struct kvm_vm *vm, uint32_t slot, uint32_t flags, 890 uint64_t gpa, uint64_t size, void *hva, 891 uint32_t guest_memfd, uint64_t guest_memfd_offset) 892 { 893 int ret = __vm_set_user_memory_region2(vm, slot, flags, gpa, size, hva, 894 guest_memfd, guest_memfd_offset); 895 896 TEST_ASSERT(!ret, "KVM_SET_USER_MEMORY_REGION2 failed, errno = %d (%s)", 897 errno, strerror(errno)); 898 } 899 900 901 /* FIXME: This thing needs to be ripped apart and rewritten. */ 902 void vm_mem_add(struct kvm_vm *vm, enum vm_mem_backing_src_type src_type, 903 uint64_t guest_paddr, uint32_t slot, uint64_t npages, 904 uint32_t flags, int guest_memfd, uint64_t guest_memfd_offset) 905 { 906 int ret; 907 struct userspace_mem_region *region; 908 size_t backing_src_pagesz = get_backing_src_pagesz(src_type); 909 size_t mem_size = npages * vm->page_size; 910 size_t alignment; 911 912 TEST_REQUIRE_SET_USER_MEMORY_REGION2(); 913 914 TEST_ASSERT(vm_adjust_num_guest_pages(vm->mode, npages) == npages, 915 "Number of guest pages is not compatible with the host. " 916 "Try npages=%d", vm_adjust_num_guest_pages(vm->mode, npages)); 917 918 TEST_ASSERT((guest_paddr % vm->page_size) == 0, "Guest physical " 919 "address not on a page boundary.\n" 920 " guest_paddr: 0x%lx vm->page_size: 0x%x", 921 guest_paddr, vm->page_size); 922 TEST_ASSERT((((guest_paddr >> vm->page_shift) + npages) - 1) 923 <= vm->max_gfn, "Physical range beyond maximum " 924 "supported physical address,\n" 925 " guest_paddr: 0x%lx npages: 0x%lx\n" 926 " vm->max_gfn: 0x%lx vm->page_size: 0x%x", 927 guest_paddr, npages, vm->max_gfn, vm->page_size); 928 929 /* 930 * Confirm a mem region with an overlapping address doesn't 931 * already exist. 932 */ 933 region = (struct userspace_mem_region *) userspace_mem_region_find( 934 vm, guest_paddr, (guest_paddr + npages * vm->page_size) - 1); 935 if (region != NULL) 936 TEST_FAIL("overlapping userspace_mem_region already " 937 "exists\n" 938 " requested guest_paddr: 0x%lx npages: 0x%lx " 939 "page_size: 0x%x\n" 940 " existing guest_paddr: 0x%lx size: 0x%lx", 941 guest_paddr, npages, vm->page_size, 942 (uint64_t) region->region.guest_phys_addr, 943 (uint64_t) region->region.memory_size); 944 945 /* Confirm no region with the requested slot already exists. */ 946 hash_for_each_possible(vm->regions.slot_hash, region, slot_node, 947 slot) { 948 if (region->region.slot != slot) 949 continue; 950 951 TEST_FAIL("A mem region with the requested slot " 952 "already exists.\n" 953 " requested slot: %u paddr: 0x%lx npages: 0x%lx\n" 954 " existing slot: %u paddr: 0x%lx size: 0x%lx", 955 slot, guest_paddr, npages, 956 region->region.slot, 957 (uint64_t) region->region.guest_phys_addr, 958 (uint64_t) region->region.memory_size); 959 } 960 961 /* Allocate and initialize new mem region structure. */ 962 region = calloc(1, sizeof(*region)); 963 TEST_ASSERT(region != NULL, "Insufficient Memory"); 964 region->mmap_size = mem_size; 965 966 #ifdef __s390x__ 967 /* On s390x, the host address must be aligned to 1M (due to PGSTEs) */ 968 alignment = 0x100000; 969 #else 970 alignment = 1; 971 #endif 972 973 /* 974 * When using THP mmap is not guaranteed to returned a hugepage aligned 975 * address so we have to pad the mmap. Padding is not needed for HugeTLB 976 * because mmap will always return an address aligned to the HugeTLB 977 * page size. 978 */ 979 if (src_type == VM_MEM_SRC_ANONYMOUS_THP) 980 alignment = max(backing_src_pagesz, alignment); 981 982 TEST_ASSERT_EQ(guest_paddr, align_up(guest_paddr, backing_src_pagesz)); 983 984 /* Add enough memory to align up if necessary */ 985 if (alignment > 1) 986 region->mmap_size += alignment; 987 988 region->fd = -1; 989 if (backing_src_is_shared(src_type)) 990 region->fd = kvm_memfd_alloc(region->mmap_size, 991 src_type == VM_MEM_SRC_SHARED_HUGETLB); 992 993 region->mmap_start = mmap(NULL, region->mmap_size, 994 PROT_READ | PROT_WRITE, 995 vm_mem_backing_src_alias(src_type)->flag, 996 region->fd, 0); 997 TEST_ASSERT(region->mmap_start != MAP_FAILED, 998 __KVM_SYSCALL_ERROR("mmap()", (int)(unsigned long)MAP_FAILED)); 999 1000 TEST_ASSERT(!is_backing_src_hugetlb(src_type) || 1001 region->mmap_start == align_ptr_up(region->mmap_start, backing_src_pagesz), 1002 "mmap_start %p is not aligned to HugeTLB page size 0x%lx", 1003 region->mmap_start, backing_src_pagesz); 1004 1005 /* Align host address */ 1006 region->host_mem = align_ptr_up(region->mmap_start, alignment); 1007 1008 /* As needed perform madvise */ 1009 if ((src_type == VM_MEM_SRC_ANONYMOUS || 1010 src_type == VM_MEM_SRC_ANONYMOUS_THP) && thp_configured()) { 1011 ret = madvise(region->host_mem, mem_size, 1012 src_type == VM_MEM_SRC_ANONYMOUS ? MADV_NOHUGEPAGE : MADV_HUGEPAGE); 1013 TEST_ASSERT(ret == 0, "madvise failed, addr: %p length: 0x%lx src_type: %s", 1014 region->host_mem, mem_size, 1015 vm_mem_backing_src_alias(src_type)->name); 1016 } 1017 1018 region->backing_src_type = src_type; 1019 1020 if (flags & KVM_MEM_GUEST_MEMFD) { 1021 if (guest_memfd < 0) { 1022 uint32_t guest_memfd_flags = 0; 1023 TEST_ASSERT(!guest_memfd_offset, 1024 "Offset must be zero when creating new guest_memfd"); 1025 guest_memfd = vm_create_guest_memfd(vm, mem_size, guest_memfd_flags); 1026 } else { 1027 /* 1028 * Install a unique fd for each memslot so that the fd 1029 * can be closed when the region is deleted without 1030 * needing to track if the fd is owned by the framework 1031 * or by the caller. 1032 */ 1033 guest_memfd = dup(guest_memfd); 1034 TEST_ASSERT(guest_memfd >= 0, __KVM_SYSCALL_ERROR("dup()", guest_memfd)); 1035 } 1036 1037 region->region.guest_memfd = guest_memfd; 1038 region->region.guest_memfd_offset = guest_memfd_offset; 1039 } else { 1040 region->region.guest_memfd = -1; 1041 } 1042 1043 region->unused_phy_pages = sparsebit_alloc(); 1044 if (vm_arch_has_protected_memory(vm)) 1045 region->protected_phy_pages = sparsebit_alloc(); 1046 sparsebit_set_num(region->unused_phy_pages, 1047 guest_paddr >> vm->page_shift, npages); 1048 region->region.slot = slot; 1049 region->region.flags = flags; 1050 region->region.guest_phys_addr = guest_paddr; 1051 region->region.memory_size = npages * vm->page_size; 1052 region->region.userspace_addr = (uintptr_t) region->host_mem; 1053 ret = __vm_ioctl(vm, KVM_SET_USER_MEMORY_REGION2, ®ion->region); 1054 TEST_ASSERT(ret == 0, "KVM_SET_USER_MEMORY_REGION2 IOCTL failed,\n" 1055 " rc: %i errno: %i\n" 1056 " slot: %u flags: 0x%x\n" 1057 " guest_phys_addr: 0x%lx size: 0x%lx guest_memfd: %d", 1058 ret, errno, slot, flags, 1059 guest_paddr, (uint64_t) region->region.memory_size, 1060 region->region.guest_memfd); 1061 1062 /* Add to quick lookup data structures */ 1063 vm_userspace_mem_region_gpa_insert(&vm->regions.gpa_tree, region); 1064 vm_userspace_mem_region_hva_insert(&vm->regions.hva_tree, region); 1065 hash_add(vm->regions.slot_hash, ®ion->slot_node, slot); 1066 1067 /* If shared memory, create an alias. */ 1068 if (region->fd >= 0) { 1069 region->mmap_alias = mmap(NULL, region->mmap_size, 1070 PROT_READ | PROT_WRITE, 1071 vm_mem_backing_src_alias(src_type)->flag, 1072 region->fd, 0); 1073 TEST_ASSERT(region->mmap_alias != MAP_FAILED, 1074 __KVM_SYSCALL_ERROR("mmap()", (int)(unsigned long)MAP_FAILED)); 1075 1076 /* Align host alias address */ 1077 region->host_alias = align_ptr_up(region->mmap_alias, alignment); 1078 } 1079 } 1080 1081 void vm_userspace_mem_region_add(struct kvm_vm *vm, 1082 enum vm_mem_backing_src_type src_type, 1083 uint64_t guest_paddr, uint32_t slot, 1084 uint64_t npages, uint32_t flags) 1085 { 1086 vm_mem_add(vm, src_type, guest_paddr, slot, npages, flags, -1, 0); 1087 } 1088 1089 /* 1090 * Memslot to region 1091 * 1092 * Input Args: 1093 * vm - Virtual Machine 1094 * memslot - KVM memory slot ID 1095 * 1096 * Output Args: None 1097 * 1098 * Return: 1099 * Pointer to memory region structure that describe memory region 1100 * using kvm memory slot ID given by memslot. TEST_ASSERT failure 1101 * on error (e.g. currently no memory region using memslot as a KVM 1102 * memory slot ID). 1103 */ 1104 struct userspace_mem_region * 1105 memslot2region(struct kvm_vm *vm, uint32_t memslot) 1106 { 1107 struct userspace_mem_region *region; 1108 1109 hash_for_each_possible(vm->regions.slot_hash, region, slot_node, 1110 memslot) 1111 if (region->region.slot == memslot) 1112 return region; 1113 1114 fprintf(stderr, "No mem region with the requested slot found,\n" 1115 " requested slot: %u\n", memslot); 1116 fputs("---- vm dump ----\n", stderr); 1117 vm_dump(stderr, vm, 2); 1118 TEST_FAIL("Mem region not found"); 1119 return NULL; 1120 } 1121 1122 /* 1123 * VM Memory Region Flags Set 1124 * 1125 * Input Args: 1126 * vm - Virtual Machine 1127 * flags - Starting guest physical address 1128 * 1129 * Output Args: None 1130 * 1131 * Return: None 1132 * 1133 * Sets the flags of the memory region specified by the value of slot, 1134 * to the values given by flags. 1135 */ 1136 void vm_mem_region_set_flags(struct kvm_vm *vm, uint32_t slot, uint32_t flags) 1137 { 1138 int ret; 1139 struct userspace_mem_region *region; 1140 1141 region = memslot2region(vm, slot); 1142 1143 region->region.flags = flags; 1144 1145 ret = __vm_ioctl(vm, KVM_SET_USER_MEMORY_REGION2, ®ion->region); 1146 1147 TEST_ASSERT(ret == 0, "KVM_SET_USER_MEMORY_REGION2 IOCTL failed,\n" 1148 " rc: %i errno: %i slot: %u flags: 0x%x", 1149 ret, errno, slot, flags); 1150 } 1151 1152 /* 1153 * VM Memory Region Move 1154 * 1155 * Input Args: 1156 * vm - Virtual Machine 1157 * slot - Slot of the memory region to move 1158 * new_gpa - Starting guest physical address 1159 * 1160 * Output Args: None 1161 * 1162 * Return: None 1163 * 1164 * Change the gpa of a memory region. 1165 */ 1166 void vm_mem_region_move(struct kvm_vm *vm, uint32_t slot, uint64_t new_gpa) 1167 { 1168 struct userspace_mem_region *region; 1169 int ret; 1170 1171 region = memslot2region(vm, slot); 1172 1173 region->region.guest_phys_addr = new_gpa; 1174 1175 ret = __vm_ioctl(vm, KVM_SET_USER_MEMORY_REGION2, ®ion->region); 1176 1177 TEST_ASSERT(!ret, "KVM_SET_USER_MEMORY_REGION2 failed\n" 1178 "ret: %i errno: %i slot: %u new_gpa: 0x%lx", 1179 ret, errno, slot, new_gpa); 1180 } 1181 1182 /* 1183 * VM Memory Region Delete 1184 * 1185 * Input Args: 1186 * vm - Virtual Machine 1187 * slot - Slot of the memory region to delete 1188 * 1189 * Output Args: None 1190 * 1191 * Return: None 1192 * 1193 * Delete a memory region. 1194 */ 1195 void vm_mem_region_delete(struct kvm_vm *vm, uint32_t slot) 1196 { 1197 struct userspace_mem_region *region = memslot2region(vm, slot); 1198 1199 region->region.memory_size = 0; 1200 vm_ioctl(vm, KVM_SET_USER_MEMORY_REGION2, ®ion->region); 1201 1202 __vm_mem_region_delete(vm, region); 1203 } 1204 1205 void vm_guest_mem_fallocate(struct kvm_vm *vm, uint64_t base, uint64_t size, 1206 bool punch_hole) 1207 { 1208 const int mode = FALLOC_FL_KEEP_SIZE | (punch_hole ? FALLOC_FL_PUNCH_HOLE : 0); 1209 struct userspace_mem_region *region; 1210 uint64_t end = base + size; 1211 uint64_t gpa, len; 1212 off_t fd_offset; 1213 int ret; 1214 1215 for (gpa = base; gpa < end; gpa += len) { 1216 uint64_t offset; 1217 1218 region = userspace_mem_region_find(vm, gpa, gpa); 1219 TEST_ASSERT(region && region->region.flags & KVM_MEM_GUEST_MEMFD, 1220 "Private memory region not found for GPA 0x%lx", gpa); 1221 1222 offset = gpa - region->region.guest_phys_addr; 1223 fd_offset = region->region.guest_memfd_offset + offset; 1224 len = min_t(uint64_t, end - gpa, region->region.memory_size - offset); 1225 1226 ret = fallocate(region->region.guest_memfd, mode, fd_offset, len); 1227 TEST_ASSERT(!ret, "fallocate() failed to %s at %lx (len = %lu), fd = %d, mode = %x, offset = %lx", 1228 punch_hole ? "punch hole" : "allocate", gpa, len, 1229 region->region.guest_memfd, mode, fd_offset); 1230 } 1231 } 1232 1233 /* Returns the size of a vCPU's kvm_run structure. */ 1234 static int vcpu_mmap_sz(void) 1235 { 1236 int dev_fd, ret; 1237 1238 dev_fd = open_kvm_dev_path_or_exit(); 1239 1240 ret = ioctl(dev_fd, KVM_GET_VCPU_MMAP_SIZE, NULL); 1241 TEST_ASSERT(ret >= sizeof(struct kvm_run), 1242 KVM_IOCTL_ERROR(KVM_GET_VCPU_MMAP_SIZE, ret)); 1243 1244 close(dev_fd); 1245 1246 return ret; 1247 } 1248 1249 static bool vcpu_exists(struct kvm_vm *vm, uint32_t vcpu_id) 1250 { 1251 struct kvm_vcpu *vcpu; 1252 1253 list_for_each_entry(vcpu, &vm->vcpus, list) { 1254 if (vcpu->id == vcpu_id) 1255 return true; 1256 } 1257 1258 return false; 1259 } 1260 1261 /* 1262 * Adds a virtual CPU to the VM specified by vm with the ID given by vcpu_id. 1263 * No additional vCPU setup is done. Returns the vCPU. 1264 */ 1265 struct kvm_vcpu *__vm_vcpu_add(struct kvm_vm *vm, uint32_t vcpu_id) 1266 { 1267 struct kvm_vcpu *vcpu; 1268 1269 /* Confirm a vcpu with the specified id doesn't already exist. */ 1270 TEST_ASSERT(!vcpu_exists(vm, vcpu_id), "vCPU%d already exists", vcpu_id); 1271 1272 /* Allocate and initialize new vcpu structure. */ 1273 vcpu = calloc(1, sizeof(*vcpu)); 1274 TEST_ASSERT(vcpu != NULL, "Insufficient Memory"); 1275 1276 vcpu->vm = vm; 1277 vcpu->id = vcpu_id; 1278 vcpu->fd = __vm_ioctl(vm, KVM_CREATE_VCPU, (void *)(unsigned long)vcpu_id); 1279 TEST_ASSERT_VM_VCPU_IOCTL(vcpu->fd >= 0, KVM_CREATE_VCPU, vcpu->fd, vm); 1280 1281 TEST_ASSERT(vcpu_mmap_sz() >= sizeof(*vcpu->run), "vcpu mmap size " 1282 "smaller than expected, vcpu_mmap_sz: %i expected_min: %zi", 1283 vcpu_mmap_sz(), sizeof(*vcpu->run)); 1284 vcpu->run = (struct kvm_run *) mmap(NULL, vcpu_mmap_sz(), 1285 PROT_READ | PROT_WRITE, MAP_SHARED, vcpu->fd, 0); 1286 TEST_ASSERT(vcpu->run != MAP_FAILED, 1287 __KVM_SYSCALL_ERROR("mmap()", (int)(unsigned long)MAP_FAILED)); 1288 1289 /* Add to linked-list of VCPUs. */ 1290 list_add(&vcpu->list, &vm->vcpus); 1291 1292 return vcpu; 1293 } 1294 1295 /* 1296 * VM Virtual Address Unused Gap 1297 * 1298 * Input Args: 1299 * vm - Virtual Machine 1300 * sz - Size (bytes) 1301 * vaddr_min - Minimum Virtual Address 1302 * 1303 * Output Args: None 1304 * 1305 * Return: 1306 * Lowest virtual address at or below vaddr_min, with at least 1307 * sz unused bytes. TEST_ASSERT failure if no area of at least 1308 * size sz is available. 1309 * 1310 * Within the VM specified by vm, locates the lowest starting virtual 1311 * address >= vaddr_min, that has at least sz unallocated bytes. A 1312 * TEST_ASSERT failure occurs for invalid input or no area of at least 1313 * sz unallocated bytes >= vaddr_min is available. 1314 */ 1315 vm_vaddr_t vm_vaddr_unused_gap(struct kvm_vm *vm, size_t sz, 1316 vm_vaddr_t vaddr_min) 1317 { 1318 uint64_t pages = (sz + vm->page_size - 1) >> vm->page_shift; 1319 1320 /* Determine lowest permitted virtual page index. */ 1321 uint64_t pgidx_start = (vaddr_min + vm->page_size - 1) >> vm->page_shift; 1322 if ((pgidx_start * vm->page_size) < vaddr_min) 1323 goto no_va_found; 1324 1325 /* Loop over section with enough valid virtual page indexes. */ 1326 if (!sparsebit_is_set_num(vm->vpages_valid, 1327 pgidx_start, pages)) 1328 pgidx_start = sparsebit_next_set_num(vm->vpages_valid, 1329 pgidx_start, pages); 1330 do { 1331 /* 1332 * Are there enough unused virtual pages available at 1333 * the currently proposed starting virtual page index. 1334 * If not, adjust proposed starting index to next 1335 * possible. 1336 */ 1337 if (sparsebit_is_clear_num(vm->vpages_mapped, 1338 pgidx_start, pages)) 1339 goto va_found; 1340 pgidx_start = sparsebit_next_clear_num(vm->vpages_mapped, 1341 pgidx_start, pages); 1342 if (pgidx_start == 0) 1343 goto no_va_found; 1344 1345 /* 1346 * If needed, adjust proposed starting virtual address, 1347 * to next range of valid virtual addresses. 1348 */ 1349 if (!sparsebit_is_set_num(vm->vpages_valid, 1350 pgidx_start, pages)) { 1351 pgidx_start = sparsebit_next_set_num( 1352 vm->vpages_valid, pgidx_start, pages); 1353 if (pgidx_start == 0) 1354 goto no_va_found; 1355 } 1356 } while (pgidx_start != 0); 1357 1358 no_va_found: 1359 TEST_FAIL("No vaddr of specified pages available, pages: 0x%lx", pages); 1360 1361 /* NOT REACHED */ 1362 return -1; 1363 1364 va_found: 1365 TEST_ASSERT(sparsebit_is_set_num(vm->vpages_valid, 1366 pgidx_start, pages), 1367 "Unexpected, invalid virtual page index range,\n" 1368 " pgidx_start: 0x%lx\n" 1369 " pages: 0x%lx", 1370 pgidx_start, pages); 1371 TEST_ASSERT(sparsebit_is_clear_num(vm->vpages_mapped, 1372 pgidx_start, pages), 1373 "Unexpected, pages already mapped,\n" 1374 " pgidx_start: 0x%lx\n" 1375 " pages: 0x%lx", 1376 pgidx_start, pages); 1377 1378 return pgidx_start * vm->page_size; 1379 } 1380 1381 static vm_vaddr_t ____vm_vaddr_alloc(struct kvm_vm *vm, size_t sz, 1382 vm_vaddr_t vaddr_min, 1383 enum kvm_mem_region_type type, 1384 bool protected) 1385 { 1386 uint64_t pages = (sz >> vm->page_shift) + ((sz % vm->page_size) != 0); 1387 1388 virt_pgd_alloc(vm); 1389 vm_paddr_t paddr = __vm_phy_pages_alloc(vm, pages, 1390 KVM_UTIL_MIN_PFN * vm->page_size, 1391 vm->memslots[type], protected); 1392 1393 /* 1394 * Find an unused range of virtual page addresses of at least 1395 * pages in length. 1396 */ 1397 vm_vaddr_t vaddr_start = vm_vaddr_unused_gap(vm, sz, vaddr_min); 1398 1399 /* Map the virtual pages. */ 1400 for (vm_vaddr_t vaddr = vaddr_start; pages > 0; 1401 pages--, vaddr += vm->page_size, paddr += vm->page_size) { 1402 1403 virt_pg_map(vm, vaddr, paddr); 1404 1405 sparsebit_set(vm->vpages_mapped, vaddr >> vm->page_shift); 1406 } 1407 1408 return vaddr_start; 1409 } 1410 1411 vm_vaddr_t __vm_vaddr_alloc(struct kvm_vm *vm, size_t sz, vm_vaddr_t vaddr_min, 1412 enum kvm_mem_region_type type) 1413 { 1414 return ____vm_vaddr_alloc(vm, sz, vaddr_min, type, 1415 vm_arch_has_protected_memory(vm)); 1416 } 1417 1418 vm_vaddr_t vm_vaddr_alloc_shared(struct kvm_vm *vm, size_t sz, 1419 vm_vaddr_t vaddr_min, 1420 enum kvm_mem_region_type type) 1421 { 1422 return ____vm_vaddr_alloc(vm, sz, vaddr_min, type, false); 1423 } 1424 1425 /* 1426 * VM Virtual Address Allocate 1427 * 1428 * Input Args: 1429 * vm - Virtual Machine 1430 * sz - Size in bytes 1431 * vaddr_min - Minimum starting virtual address 1432 * 1433 * Output Args: None 1434 * 1435 * Return: 1436 * Starting guest virtual address 1437 * 1438 * Allocates at least sz bytes within the virtual address space of the vm 1439 * given by vm. The allocated bytes are mapped to a virtual address >= 1440 * the address given by vaddr_min. Note that each allocation uses a 1441 * a unique set of pages, with the minimum real allocation being at least 1442 * a page. The allocated physical space comes from the TEST_DATA memory region. 1443 */ 1444 vm_vaddr_t vm_vaddr_alloc(struct kvm_vm *vm, size_t sz, vm_vaddr_t vaddr_min) 1445 { 1446 return __vm_vaddr_alloc(vm, sz, vaddr_min, MEM_REGION_TEST_DATA); 1447 } 1448 1449 /* 1450 * VM Virtual Address Allocate Pages 1451 * 1452 * Input Args: 1453 * vm - Virtual Machine 1454 * 1455 * Output Args: None 1456 * 1457 * Return: 1458 * Starting guest virtual address 1459 * 1460 * Allocates at least N system pages worth of bytes within the virtual address 1461 * space of the vm. 1462 */ 1463 vm_vaddr_t vm_vaddr_alloc_pages(struct kvm_vm *vm, int nr_pages) 1464 { 1465 return vm_vaddr_alloc(vm, nr_pages * getpagesize(), KVM_UTIL_MIN_VADDR); 1466 } 1467 1468 vm_vaddr_t __vm_vaddr_alloc_page(struct kvm_vm *vm, enum kvm_mem_region_type type) 1469 { 1470 return __vm_vaddr_alloc(vm, getpagesize(), KVM_UTIL_MIN_VADDR, type); 1471 } 1472 1473 /* 1474 * VM Virtual Address Allocate Page 1475 * 1476 * Input Args: 1477 * vm - Virtual Machine 1478 * 1479 * Output Args: None 1480 * 1481 * Return: 1482 * Starting guest virtual address 1483 * 1484 * Allocates at least one system page worth of bytes within the virtual address 1485 * space of the vm. 1486 */ 1487 vm_vaddr_t vm_vaddr_alloc_page(struct kvm_vm *vm) 1488 { 1489 return vm_vaddr_alloc_pages(vm, 1); 1490 } 1491 1492 /* 1493 * Map a range of VM virtual address to the VM's physical address 1494 * 1495 * Input Args: 1496 * vm - Virtual Machine 1497 * vaddr - Virtuall address to map 1498 * paddr - VM Physical Address 1499 * npages - The number of pages to map 1500 * 1501 * Output Args: None 1502 * 1503 * Return: None 1504 * 1505 * Within the VM given by @vm, creates a virtual translation for 1506 * @npages starting at @vaddr to the page range starting at @paddr. 1507 */ 1508 void virt_map(struct kvm_vm *vm, uint64_t vaddr, uint64_t paddr, 1509 unsigned int npages) 1510 { 1511 size_t page_size = vm->page_size; 1512 size_t size = npages * page_size; 1513 1514 TEST_ASSERT(vaddr + size > vaddr, "Vaddr overflow"); 1515 TEST_ASSERT(paddr + size > paddr, "Paddr overflow"); 1516 1517 while (npages--) { 1518 virt_pg_map(vm, vaddr, paddr); 1519 sparsebit_set(vm->vpages_mapped, vaddr >> vm->page_shift); 1520 1521 vaddr += page_size; 1522 paddr += page_size; 1523 } 1524 } 1525 1526 /* 1527 * Address VM Physical to Host Virtual 1528 * 1529 * Input Args: 1530 * vm - Virtual Machine 1531 * gpa - VM physical address 1532 * 1533 * Output Args: None 1534 * 1535 * Return: 1536 * Equivalent host virtual address 1537 * 1538 * Locates the memory region containing the VM physical address given 1539 * by gpa, within the VM given by vm. When found, the host virtual 1540 * address providing the memory to the vm physical address is returned. 1541 * A TEST_ASSERT failure occurs if no region containing gpa exists. 1542 */ 1543 void *addr_gpa2hva(struct kvm_vm *vm, vm_paddr_t gpa) 1544 { 1545 struct userspace_mem_region *region; 1546 1547 gpa = vm_untag_gpa(vm, gpa); 1548 1549 region = userspace_mem_region_find(vm, gpa, gpa); 1550 if (!region) { 1551 TEST_FAIL("No vm physical memory at 0x%lx", gpa); 1552 return NULL; 1553 } 1554 1555 return (void *)((uintptr_t)region->host_mem 1556 + (gpa - region->region.guest_phys_addr)); 1557 } 1558 1559 /* 1560 * Address Host Virtual to VM Physical 1561 * 1562 * Input Args: 1563 * vm - Virtual Machine 1564 * hva - Host virtual address 1565 * 1566 * Output Args: None 1567 * 1568 * Return: 1569 * Equivalent VM physical address 1570 * 1571 * Locates the memory region containing the host virtual address given 1572 * by hva, within the VM given by vm. When found, the equivalent 1573 * VM physical address is returned. A TEST_ASSERT failure occurs if no 1574 * region containing hva exists. 1575 */ 1576 vm_paddr_t addr_hva2gpa(struct kvm_vm *vm, void *hva) 1577 { 1578 struct rb_node *node; 1579 1580 for (node = vm->regions.hva_tree.rb_node; node; ) { 1581 struct userspace_mem_region *region = 1582 container_of(node, struct userspace_mem_region, hva_node); 1583 1584 if (hva >= region->host_mem) { 1585 if (hva <= (region->host_mem 1586 + region->region.memory_size - 1)) 1587 return (vm_paddr_t)((uintptr_t) 1588 region->region.guest_phys_addr 1589 + (hva - (uintptr_t)region->host_mem)); 1590 1591 node = node->rb_right; 1592 } else 1593 node = node->rb_left; 1594 } 1595 1596 TEST_FAIL("No mapping to a guest physical address, hva: %p", hva); 1597 return -1; 1598 } 1599 1600 /* 1601 * Address VM physical to Host Virtual *alias*. 1602 * 1603 * Input Args: 1604 * vm - Virtual Machine 1605 * gpa - VM physical address 1606 * 1607 * Output Args: None 1608 * 1609 * Return: 1610 * Equivalent address within the host virtual *alias* area, or NULL 1611 * (without failing the test) if the guest memory is not shared (so 1612 * no alias exists). 1613 * 1614 * Create a writable, shared virtual=>physical alias for the specific GPA. 1615 * The primary use case is to allow the host selftest to manipulate guest 1616 * memory without mapping said memory in the guest's address space. And, for 1617 * userfaultfd-based demand paging, to do so without triggering userfaults. 1618 */ 1619 void *addr_gpa2alias(struct kvm_vm *vm, vm_paddr_t gpa) 1620 { 1621 struct userspace_mem_region *region; 1622 uintptr_t offset; 1623 1624 region = userspace_mem_region_find(vm, gpa, gpa); 1625 if (!region) 1626 return NULL; 1627 1628 if (!region->host_alias) 1629 return NULL; 1630 1631 offset = gpa - region->region.guest_phys_addr; 1632 return (void *) ((uintptr_t) region->host_alias + offset); 1633 } 1634 1635 /* Create an interrupt controller chip for the specified VM. */ 1636 void vm_create_irqchip(struct kvm_vm *vm) 1637 { 1638 vm_ioctl(vm, KVM_CREATE_IRQCHIP, NULL); 1639 1640 vm->has_irqchip = true; 1641 } 1642 1643 int _vcpu_run(struct kvm_vcpu *vcpu) 1644 { 1645 int rc; 1646 1647 do { 1648 rc = __vcpu_run(vcpu); 1649 } while (rc == -1 && errno == EINTR); 1650 1651 assert_on_unhandled_exception(vcpu); 1652 1653 return rc; 1654 } 1655 1656 /* 1657 * Invoke KVM_RUN on a vCPU until KVM returns something other than -EINTR. 1658 * Assert if the KVM returns an error (other than -EINTR). 1659 */ 1660 void vcpu_run(struct kvm_vcpu *vcpu) 1661 { 1662 int ret = _vcpu_run(vcpu); 1663 1664 TEST_ASSERT(!ret, KVM_IOCTL_ERROR(KVM_RUN, ret)); 1665 } 1666 1667 void vcpu_run_complete_io(struct kvm_vcpu *vcpu) 1668 { 1669 int ret; 1670 1671 vcpu->run->immediate_exit = 1; 1672 ret = __vcpu_run(vcpu); 1673 vcpu->run->immediate_exit = 0; 1674 1675 TEST_ASSERT(ret == -1 && errno == EINTR, 1676 "KVM_RUN IOCTL didn't exit immediately, rc: %i, errno: %i", 1677 ret, errno); 1678 } 1679 1680 /* 1681 * Get the list of guest registers which are supported for 1682 * KVM_GET_ONE_REG/KVM_SET_ONE_REG ioctls. Returns a kvm_reg_list pointer, 1683 * it is the caller's responsibility to free the list. 1684 */ 1685 struct kvm_reg_list *vcpu_get_reg_list(struct kvm_vcpu *vcpu) 1686 { 1687 struct kvm_reg_list reg_list_n = { .n = 0 }, *reg_list; 1688 int ret; 1689 1690 ret = __vcpu_ioctl(vcpu, KVM_GET_REG_LIST, ®_list_n); 1691 TEST_ASSERT(ret == -1 && errno == E2BIG, "KVM_GET_REG_LIST n=0"); 1692 1693 reg_list = calloc(1, sizeof(*reg_list) + reg_list_n.n * sizeof(__u64)); 1694 reg_list->n = reg_list_n.n; 1695 vcpu_ioctl(vcpu, KVM_GET_REG_LIST, reg_list); 1696 return reg_list; 1697 } 1698 1699 void *vcpu_map_dirty_ring(struct kvm_vcpu *vcpu) 1700 { 1701 uint32_t page_size = getpagesize(); 1702 uint32_t size = vcpu->vm->dirty_ring_size; 1703 1704 TEST_ASSERT(size > 0, "Should enable dirty ring first"); 1705 1706 if (!vcpu->dirty_gfns) { 1707 void *addr; 1708 1709 addr = mmap(NULL, size, PROT_READ, MAP_PRIVATE, vcpu->fd, 1710 page_size * KVM_DIRTY_LOG_PAGE_OFFSET); 1711 TEST_ASSERT(addr == MAP_FAILED, "Dirty ring mapped private"); 1712 1713 addr = mmap(NULL, size, PROT_READ | PROT_EXEC, MAP_PRIVATE, vcpu->fd, 1714 page_size * KVM_DIRTY_LOG_PAGE_OFFSET); 1715 TEST_ASSERT(addr == MAP_FAILED, "Dirty ring mapped exec"); 1716 1717 addr = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED, vcpu->fd, 1718 page_size * KVM_DIRTY_LOG_PAGE_OFFSET); 1719 TEST_ASSERT(addr != MAP_FAILED, "Dirty ring map failed"); 1720 1721 vcpu->dirty_gfns = addr; 1722 vcpu->dirty_gfns_count = size / sizeof(struct kvm_dirty_gfn); 1723 } 1724 1725 return vcpu->dirty_gfns; 1726 } 1727 1728 /* 1729 * Device Ioctl 1730 */ 1731 1732 int __kvm_has_device_attr(int dev_fd, uint32_t group, uint64_t attr) 1733 { 1734 struct kvm_device_attr attribute = { 1735 .group = group, 1736 .attr = attr, 1737 .flags = 0, 1738 }; 1739 1740 return ioctl(dev_fd, KVM_HAS_DEVICE_ATTR, &attribute); 1741 } 1742 1743 int __kvm_test_create_device(struct kvm_vm *vm, uint64_t type) 1744 { 1745 struct kvm_create_device create_dev = { 1746 .type = type, 1747 .flags = KVM_CREATE_DEVICE_TEST, 1748 }; 1749 1750 return __vm_ioctl(vm, KVM_CREATE_DEVICE, &create_dev); 1751 } 1752 1753 int __kvm_create_device(struct kvm_vm *vm, uint64_t type) 1754 { 1755 struct kvm_create_device create_dev = { 1756 .type = type, 1757 .fd = -1, 1758 .flags = 0, 1759 }; 1760 int err; 1761 1762 err = __vm_ioctl(vm, KVM_CREATE_DEVICE, &create_dev); 1763 TEST_ASSERT(err <= 0, "KVM_CREATE_DEVICE shouldn't return a positive value"); 1764 return err ? : create_dev.fd; 1765 } 1766 1767 int __kvm_device_attr_get(int dev_fd, uint32_t group, uint64_t attr, void *val) 1768 { 1769 struct kvm_device_attr kvmattr = { 1770 .group = group, 1771 .attr = attr, 1772 .flags = 0, 1773 .addr = (uintptr_t)val, 1774 }; 1775 1776 return __kvm_ioctl(dev_fd, KVM_GET_DEVICE_ATTR, &kvmattr); 1777 } 1778 1779 int __kvm_device_attr_set(int dev_fd, uint32_t group, uint64_t attr, void *val) 1780 { 1781 struct kvm_device_attr kvmattr = { 1782 .group = group, 1783 .attr = attr, 1784 .flags = 0, 1785 .addr = (uintptr_t)val, 1786 }; 1787 1788 return __kvm_ioctl(dev_fd, KVM_SET_DEVICE_ATTR, &kvmattr); 1789 } 1790 1791 /* 1792 * IRQ related functions. 1793 */ 1794 1795 int _kvm_irq_line(struct kvm_vm *vm, uint32_t irq, int level) 1796 { 1797 struct kvm_irq_level irq_level = { 1798 .irq = irq, 1799 .level = level, 1800 }; 1801 1802 return __vm_ioctl(vm, KVM_IRQ_LINE, &irq_level); 1803 } 1804 1805 void kvm_irq_line(struct kvm_vm *vm, uint32_t irq, int level) 1806 { 1807 int ret = _kvm_irq_line(vm, irq, level); 1808 1809 TEST_ASSERT(ret >= 0, KVM_IOCTL_ERROR(KVM_IRQ_LINE, ret)); 1810 } 1811 1812 struct kvm_irq_routing *kvm_gsi_routing_create(void) 1813 { 1814 struct kvm_irq_routing *routing; 1815 size_t size; 1816 1817 size = sizeof(struct kvm_irq_routing); 1818 /* Allocate space for the max number of entries: this wastes 196 KBs. */ 1819 size += KVM_MAX_IRQ_ROUTES * sizeof(struct kvm_irq_routing_entry); 1820 routing = calloc(1, size); 1821 assert(routing); 1822 1823 return routing; 1824 } 1825 1826 void kvm_gsi_routing_irqchip_add(struct kvm_irq_routing *routing, 1827 uint32_t gsi, uint32_t pin) 1828 { 1829 int i; 1830 1831 assert(routing); 1832 assert(routing->nr < KVM_MAX_IRQ_ROUTES); 1833 1834 i = routing->nr; 1835 routing->entries[i].gsi = gsi; 1836 routing->entries[i].type = KVM_IRQ_ROUTING_IRQCHIP; 1837 routing->entries[i].flags = 0; 1838 routing->entries[i].u.irqchip.irqchip = 0; 1839 routing->entries[i].u.irqchip.pin = pin; 1840 routing->nr++; 1841 } 1842 1843 int _kvm_gsi_routing_write(struct kvm_vm *vm, struct kvm_irq_routing *routing) 1844 { 1845 int ret; 1846 1847 assert(routing); 1848 ret = __vm_ioctl(vm, KVM_SET_GSI_ROUTING, routing); 1849 free(routing); 1850 1851 return ret; 1852 } 1853 1854 void kvm_gsi_routing_write(struct kvm_vm *vm, struct kvm_irq_routing *routing) 1855 { 1856 int ret; 1857 1858 ret = _kvm_gsi_routing_write(vm, routing); 1859 TEST_ASSERT(!ret, KVM_IOCTL_ERROR(KVM_SET_GSI_ROUTING, ret)); 1860 } 1861 1862 /* 1863 * VM Dump 1864 * 1865 * Input Args: 1866 * vm - Virtual Machine 1867 * indent - Left margin indent amount 1868 * 1869 * Output Args: 1870 * stream - Output FILE stream 1871 * 1872 * Return: None 1873 * 1874 * Dumps the current state of the VM given by vm, to the FILE stream 1875 * given by stream. 1876 */ 1877 void vm_dump(FILE *stream, struct kvm_vm *vm, uint8_t indent) 1878 { 1879 int ctr; 1880 struct userspace_mem_region *region; 1881 struct kvm_vcpu *vcpu; 1882 1883 fprintf(stream, "%*smode: 0x%x\n", indent, "", vm->mode); 1884 fprintf(stream, "%*sfd: %i\n", indent, "", vm->fd); 1885 fprintf(stream, "%*spage_size: 0x%x\n", indent, "", vm->page_size); 1886 fprintf(stream, "%*sMem Regions:\n", indent, ""); 1887 hash_for_each(vm->regions.slot_hash, ctr, region, slot_node) { 1888 fprintf(stream, "%*sguest_phys: 0x%lx size: 0x%lx " 1889 "host_virt: %p\n", indent + 2, "", 1890 (uint64_t) region->region.guest_phys_addr, 1891 (uint64_t) region->region.memory_size, 1892 region->host_mem); 1893 fprintf(stream, "%*sunused_phy_pages: ", indent + 2, ""); 1894 sparsebit_dump(stream, region->unused_phy_pages, 0); 1895 if (region->protected_phy_pages) { 1896 fprintf(stream, "%*sprotected_phy_pages: ", indent + 2, ""); 1897 sparsebit_dump(stream, region->protected_phy_pages, 0); 1898 } 1899 } 1900 fprintf(stream, "%*sMapped Virtual Pages:\n", indent, ""); 1901 sparsebit_dump(stream, vm->vpages_mapped, indent + 2); 1902 fprintf(stream, "%*spgd_created: %u\n", indent, "", 1903 vm->pgd_created); 1904 if (vm->pgd_created) { 1905 fprintf(stream, "%*sVirtual Translation Tables:\n", 1906 indent + 2, ""); 1907 virt_dump(stream, vm, indent + 4); 1908 } 1909 fprintf(stream, "%*sVCPUs:\n", indent, ""); 1910 1911 list_for_each_entry(vcpu, &vm->vcpus, list) 1912 vcpu_dump(stream, vcpu, indent + 2); 1913 } 1914 1915 #define KVM_EXIT_STRING(x) {KVM_EXIT_##x, #x} 1916 1917 /* Known KVM exit reasons */ 1918 static struct exit_reason { 1919 unsigned int reason; 1920 const char *name; 1921 } exit_reasons_known[] = { 1922 KVM_EXIT_STRING(UNKNOWN), 1923 KVM_EXIT_STRING(EXCEPTION), 1924 KVM_EXIT_STRING(IO), 1925 KVM_EXIT_STRING(HYPERCALL), 1926 KVM_EXIT_STRING(DEBUG), 1927 KVM_EXIT_STRING(HLT), 1928 KVM_EXIT_STRING(MMIO), 1929 KVM_EXIT_STRING(IRQ_WINDOW_OPEN), 1930 KVM_EXIT_STRING(SHUTDOWN), 1931 KVM_EXIT_STRING(FAIL_ENTRY), 1932 KVM_EXIT_STRING(INTR), 1933 KVM_EXIT_STRING(SET_TPR), 1934 KVM_EXIT_STRING(TPR_ACCESS), 1935 KVM_EXIT_STRING(S390_SIEIC), 1936 KVM_EXIT_STRING(S390_RESET), 1937 KVM_EXIT_STRING(DCR), 1938 KVM_EXIT_STRING(NMI), 1939 KVM_EXIT_STRING(INTERNAL_ERROR), 1940 KVM_EXIT_STRING(OSI), 1941 KVM_EXIT_STRING(PAPR_HCALL), 1942 KVM_EXIT_STRING(S390_UCONTROL), 1943 KVM_EXIT_STRING(WATCHDOG), 1944 KVM_EXIT_STRING(S390_TSCH), 1945 KVM_EXIT_STRING(EPR), 1946 KVM_EXIT_STRING(SYSTEM_EVENT), 1947 KVM_EXIT_STRING(S390_STSI), 1948 KVM_EXIT_STRING(IOAPIC_EOI), 1949 KVM_EXIT_STRING(HYPERV), 1950 KVM_EXIT_STRING(ARM_NISV), 1951 KVM_EXIT_STRING(X86_RDMSR), 1952 KVM_EXIT_STRING(X86_WRMSR), 1953 KVM_EXIT_STRING(DIRTY_RING_FULL), 1954 KVM_EXIT_STRING(AP_RESET_HOLD), 1955 KVM_EXIT_STRING(X86_BUS_LOCK), 1956 KVM_EXIT_STRING(XEN), 1957 KVM_EXIT_STRING(RISCV_SBI), 1958 KVM_EXIT_STRING(RISCV_CSR), 1959 KVM_EXIT_STRING(NOTIFY), 1960 #ifdef KVM_EXIT_MEMORY_NOT_PRESENT 1961 KVM_EXIT_STRING(MEMORY_NOT_PRESENT), 1962 #endif 1963 }; 1964 1965 /* 1966 * Exit Reason String 1967 * 1968 * Input Args: 1969 * exit_reason - Exit reason 1970 * 1971 * Output Args: None 1972 * 1973 * Return: 1974 * Constant string pointer describing the exit reason. 1975 * 1976 * Locates and returns a constant string that describes the KVM exit 1977 * reason given by exit_reason. If no such string is found, a constant 1978 * string of "Unknown" is returned. 1979 */ 1980 const char *exit_reason_str(unsigned int exit_reason) 1981 { 1982 unsigned int n1; 1983 1984 for (n1 = 0; n1 < ARRAY_SIZE(exit_reasons_known); n1++) { 1985 if (exit_reason == exit_reasons_known[n1].reason) 1986 return exit_reasons_known[n1].name; 1987 } 1988 1989 return "Unknown"; 1990 } 1991 1992 /* 1993 * Physical Contiguous Page Allocator 1994 * 1995 * Input Args: 1996 * vm - Virtual Machine 1997 * num - number of pages 1998 * paddr_min - Physical address minimum 1999 * memslot - Memory region to allocate page from 2000 * protected - True if the pages will be used as protected/private memory 2001 * 2002 * Output Args: None 2003 * 2004 * Return: 2005 * Starting physical address 2006 * 2007 * Within the VM specified by vm, locates a range of available physical 2008 * pages at or above paddr_min. If found, the pages are marked as in use 2009 * and their base address is returned. A TEST_ASSERT failure occurs if 2010 * not enough pages are available at or above paddr_min. 2011 */ 2012 vm_paddr_t __vm_phy_pages_alloc(struct kvm_vm *vm, size_t num, 2013 vm_paddr_t paddr_min, uint32_t memslot, 2014 bool protected) 2015 { 2016 struct userspace_mem_region *region; 2017 sparsebit_idx_t pg, base; 2018 2019 TEST_ASSERT(num > 0, "Must allocate at least one page"); 2020 2021 TEST_ASSERT((paddr_min % vm->page_size) == 0, "Min physical address " 2022 "not divisible by page size.\n" 2023 " paddr_min: 0x%lx page_size: 0x%x", 2024 paddr_min, vm->page_size); 2025 2026 region = memslot2region(vm, memslot); 2027 TEST_ASSERT(!protected || region->protected_phy_pages, 2028 "Region doesn't support protected memory"); 2029 2030 base = pg = paddr_min >> vm->page_shift; 2031 do { 2032 for (; pg < base + num; ++pg) { 2033 if (!sparsebit_is_set(region->unused_phy_pages, pg)) { 2034 base = pg = sparsebit_next_set(region->unused_phy_pages, pg); 2035 break; 2036 } 2037 } 2038 } while (pg && pg != base + num); 2039 2040 if (pg == 0) { 2041 fprintf(stderr, "No guest physical page available, " 2042 "paddr_min: 0x%lx page_size: 0x%x memslot: %u\n", 2043 paddr_min, vm->page_size, memslot); 2044 fputs("---- vm dump ----\n", stderr); 2045 vm_dump(stderr, vm, 2); 2046 abort(); 2047 } 2048 2049 for (pg = base; pg < base + num; ++pg) { 2050 sparsebit_clear(region->unused_phy_pages, pg); 2051 if (protected) 2052 sparsebit_set(region->protected_phy_pages, pg); 2053 } 2054 2055 return base * vm->page_size; 2056 } 2057 2058 vm_paddr_t vm_phy_page_alloc(struct kvm_vm *vm, vm_paddr_t paddr_min, 2059 uint32_t memslot) 2060 { 2061 return vm_phy_pages_alloc(vm, 1, paddr_min, memslot); 2062 } 2063 2064 vm_paddr_t vm_alloc_page_table(struct kvm_vm *vm) 2065 { 2066 return vm_phy_page_alloc(vm, KVM_GUEST_PAGE_TABLE_MIN_PADDR, 2067 vm->memslots[MEM_REGION_PT]); 2068 } 2069 2070 /* 2071 * Address Guest Virtual to Host Virtual 2072 * 2073 * Input Args: 2074 * vm - Virtual Machine 2075 * gva - VM virtual address 2076 * 2077 * Output Args: None 2078 * 2079 * Return: 2080 * Equivalent host virtual address 2081 */ 2082 void *addr_gva2hva(struct kvm_vm *vm, vm_vaddr_t gva) 2083 { 2084 return addr_gpa2hva(vm, addr_gva2gpa(vm, gva)); 2085 } 2086 2087 unsigned long __weak vm_compute_max_gfn(struct kvm_vm *vm) 2088 { 2089 return ((1ULL << vm->pa_bits) >> vm->page_shift) - 1; 2090 } 2091 2092 static unsigned int vm_calc_num_pages(unsigned int num_pages, 2093 unsigned int page_shift, 2094 unsigned int new_page_shift, 2095 bool ceil) 2096 { 2097 unsigned int n = 1 << (new_page_shift - page_shift); 2098 2099 if (page_shift >= new_page_shift) 2100 return num_pages * (1 << (page_shift - new_page_shift)); 2101 2102 return num_pages / n + !!(ceil && num_pages % n); 2103 } 2104 2105 static inline int getpageshift(void) 2106 { 2107 return __builtin_ffs(getpagesize()) - 1; 2108 } 2109 2110 unsigned int 2111 vm_num_host_pages(enum vm_guest_mode mode, unsigned int num_guest_pages) 2112 { 2113 return vm_calc_num_pages(num_guest_pages, 2114 vm_guest_mode_params[mode].page_shift, 2115 getpageshift(), true); 2116 } 2117 2118 unsigned int 2119 vm_num_guest_pages(enum vm_guest_mode mode, unsigned int num_host_pages) 2120 { 2121 return vm_calc_num_pages(num_host_pages, getpageshift(), 2122 vm_guest_mode_params[mode].page_shift, false); 2123 } 2124 2125 unsigned int vm_calc_num_guest_pages(enum vm_guest_mode mode, size_t size) 2126 { 2127 unsigned int n; 2128 n = DIV_ROUND_UP(size, vm_guest_mode_params[mode].page_size); 2129 return vm_adjust_num_guest_pages(mode, n); 2130 } 2131 2132 /* 2133 * Read binary stats descriptors 2134 * 2135 * Input Args: 2136 * stats_fd - the file descriptor for the binary stats file from which to read 2137 * header - the binary stats metadata header corresponding to the given FD 2138 * 2139 * Output Args: None 2140 * 2141 * Return: 2142 * A pointer to a newly allocated series of stat descriptors. 2143 * Caller is responsible for freeing the returned kvm_stats_desc. 2144 * 2145 * Read the stats descriptors from the binary stats interface. 2146 */ 2147 struct kvm_stats_desc *read_stats_descriptors(int stats_fd, 2148 struct kvm_stats_header *header) 2149 { 2150 struct kvm_stats_desc *stats_desc; 2151 ssize_t desc_size, total_size, ret; 2152 2153 desc_size = get_stats_descriptor_size(header); 2154 total_size = header->num_desc * desc_size; 2155 2156 stats_desc = calloc(header->num_desc, desc_size); 2157 TEST_ASSERT(stats_desc, "Allocate memory for stats descriptors"); 2158 2159 ret = pread(stats_fd, stats_desc, total_size, header->desc_offset); 2160 TEST_ASSERT(ret == total_size, "Read KVM stats descriptors"); 2161 2162 return stats_desc; 2163 } 2164 2165 /* 2166 * Read stat data for a particular stat 2167 * 2168 * Input Args: 2169 * stats_fd - the file descriptor for the binary stats file from which to read 2170 * header - the binary stats metadata header corresponding to the given FD 2171 * desc - the binary stat metadata for the particular stat to be read 2172 * max_elements - the maximum number of 8-byte values to read into data 2173 * 2174 * Output Args: 2175 * data - the buffer into which stat data should be read 2176 * 2177 * Read the data values of a specified stat from the binary stats interface. 2178 */ 2179 void read_stat_data(int stats_fd, struct kvm_stats_header *header, 2180 struct kvm_stats_desc *desc, uint64_t *data, 2181 size_t max_elements) 2182 { 2183 size_t nr_elements = min_t(ssize_t, desc->size, max_elements); 2184 size_t size = nr_elements * sizeof(*data); 2185 ssize_t ret; 2186 2187 TEST_ASSERT(desc->size, "No elements in stat '%s'", desc->name); 2188 TEST_ASSERT(max_elements, "Zero elements requested for stat '%s'", desc->name); 2189 2190 ret = pread(stats_fd, data, size, 2191 header->data_offset + desc->offset); 2192 2193 TEST_ASSERT(ret >= 0, "pread() failed on stat '%s', errno: %i (%s)", 2194 desc->name, errno, strerror(errno)); 2195 TEST_ASSERT(ret == size, 2196 "pread() on stat '%s' read %ld bytes, wanted %lu bytes", 2197 desc->name, size, ret); 2198 } 2199 2200 /* 2201 * Read the data of the named stat 2202 * 2203 * Input Args: 2204 * vm - the VM for which the stat should be read 2205 * stat_name - the name of the stat to read 2206 * max_elements - the maximum number of 8-byte values to read into data 2207 * 2208 * Output Args: 2209 * data - the buffer into which stat data should be read 2210 * 2211 * Read the data values of a specified stat from the binary stats interface. 2212 */ 2213 void __vm_get_stat(struct kvm_vm *vm, const char *stat_name, uint64_t *data, 2214 size_t max_elements) 2215 { 2216 struct kvm_stats_desc *desc; 2217 size_t size_desc; 2218 int i; 2219 2220 if (!vm->stats_fd) { 2221 vm->stats_fd = vm_get_stats_fd(vm); 2222 read_stats_header(vm->stats_fd, &vm->stats_header); 2223 vm->stats_desc = read_stats_descriptors(vm->stats_fd, 2224 &vm->stats_header); 2225 } 2226 2227 size_desc = get_stats_descriptor_size(&vm->stats_header); 2228 2229 for (i = 0; i < vm->stats_header.num_desc; ++i) { 2230 desc = (void *)vm->stats_desc + (i * size_desc); 2231 2232 if (strcmp(desc->name, stat_name)) 2233 continue; 2234 2235 read_stat_data(vm->stats_fd, &vm->stats_header, desc, 2236 data, max_elements); 2237 2238 break; 2239 } 2240 } 2241 2242 __weak void kvm_arch_vm_post_create(struct kvm_vm *vm) 2243 { 2244 } 2245 2246 __weak void kvm_selftest_arch_init(void) 2247 { 2248 } 2249 2250 void __attribute((constructor)) kvm_selftest_init(void) 2251 { 2252 /* Tell stdout not to buffer its content. */ 2253 setbuf(stdout, NULL); 2254 2255 guest_random_seed = last_guest_seed = random(); 2256 pr_info("Random seed: 0x%x\n", guest_random_seed); 2257 2258 kvm_selftest_arch_init(); 2259 } 2260 2261 bool vm_is_gpa_protected(struct kvm_vm *vm, vm_paddr_t paddr) 2262 { 2263 sparsebit_idx_t pg = 0; 2264 struct userspace_mem_region *region; 2265 2266 if (!vm_arch_has_protected_memory(vm)) 2267 return false; 2268 2269 region = userspace_mem_region_find(vm, paddr, paddr); 2270 TEST_ASSERT(region, "No vm physical memory at 0x%lx", paddr); 2271 2272 pg = paddr >> vm->page_shift; 2273 return sparsebit_is_set(region->protected_phy_pages, pg); 2274 } 2275