1 // SPDX-License-Identifier: GPL-2.0-only 2 /* 3 * tools/testing/selftests/kvm/lib/kvm_util.c 4 * 5 * Copyright (C) 2018, Google LLC. 6 */ 7 #include "test_util.h" 8 #include "kvm_util.h" 9 #include "processor.h" 10 #include "ucall_common.h" 11 12 #include <assert.h> 13 #include <sched.h> 14 #include <sys/mman.h> 15 #include <sys/resource.h> 16 #include <sys/types.h> 17 #include <sys/stat.h> 18 #include <unistd.h> 19 #include <linux/kernel.h> 20 21 #define KVM_UTIL_MIN_PFN 2 22 23 uint32_t guest_random_seed; 24 struct guest_random_state guest_rng; 25 static uint32_t last_guest_seed; 26 27 static int vcpu_mmap_sz(void); 28 29 int __open_path_or_exit(const char *path, int flags, const char *enoent_help) 30 { 31 int fd; 32 33 fd = open(path, flags); 34 if (fd < 0) 35 goto error; 36 37 return fd; 38 39 error: 40 if (errno == EACCES || errno == ENOENT) 41 ksft_exit_skip("- Cannot open '%s': %s. %s\n", 42 path, strerror(errno), 43 errno == EACCES ? "Root required?" : enoent_help); 44 TEST_FAIL("Failed to open '%s'", path); 45 } 46 47 int open_path_or_exit(const char *path, int flags) 48 { 49 return __open_path_or_exit(path, flags, ""); 50 } 51 52 /* 53 * Open KVM_DEV_PATH if available, otherwise exit the entire program. 54 * 55 * Input Args: 56 * flags - The flags to pass when opening KVM_DEV_PATH. 57 * 58 * Return: 59 * The opened file descriptor of /dev/kvm. 60 */ 61 static int _open_kvm_dev_path_or_exit(int flags) 62 { 63 return __open_path_or_exit(KVM_DEV_PATH, flags, "Is KVM loaded and enabled?"); 64 } 65 66 int open_kvm_dev_path_or_exit(void) 67 { 68 return _open_kvm_dev_path_or_exit(O_RDONLY); 69 } 70 71 static ssize_t get_module_param(const char *module_name, const char *param, 72 void *buffer, size_t buffer_size) 73 { 74 const int path_size = 128; 75 char path[path_size]; 76 ssize_t bytes_read; 77 int fd, r; 78 79 /* Verify KVM is loaded, to provide a more helpful SKIP message. */ 80 close(open_kvm_dev_path_or_exit()); 81 82 r = snprintf(path, path_size, "/sys/module/%s/parameters/%s", 83 module_name, param); 84 TEST_ASSERT(r < path_size, 85 "Failed to construct sysfs path in %d bytes.", path_size); 86 87 fd = open_path_or_exit(path, O_RDONLY); 88 89 bytes_read = read(fd, buffer, buffer_size); 90 TEST_ASSERT(bytes_read > 0, "read(%s) returned %ld, wanted %ld bytes", 91 path, bytes_read, buffer_size); 92 93 r = close(fd); 94 TEST_ASSERT(!r, "close(%s) failed", path); 95 return bytes_read; 96 } 97 98 static int get_module_param_integer(const char *module_name, const char *param) 99 { 100 /* 101 * 16 bytes to hold a 64-bit value (1 byte per char), 1 byte for the 102 * NUL char, and 1 byte because the kernel sucks and inserts a newline 103 * at the end. 104 */ 105 char value[16 + 1 + 1]; 106 ssize_t r; 107 108 memset(value, '\0', sizeof(value)); 109 110 r = get_module_param(module_name, param, value, sizeof(value)); 111 TEST_ASSERT(value[r - 1] == '\n', 112 "Expected trailing newline, got char '%c'", value[r - 1]); 113 114 /* 115 * Squash the newline, otherwise atoi_paranoid() will complain about 116 * trailing non-NUL characters in the string. 
117 */ 118 value[r - 1] = '\0'; 119 return atoi_paranoid(value); 120 } 121 122 static bool get_module_param_bool(const char *module_name, const char *param) 123 { 124 char value; 125 ssize_t r; 126 127 r = get_module_param(module_name, param, &value, sizeof(value)); 128 TEST_ASSERT_EQ(r, 1); 129 130 if (value == 'Y') 131 return true; 132 else if (value == 'N') 133 return false; 134 135 TEST_FAIL("Unrecognized value '%c' for boolean module param", value); 136 } 137 138 bool get_kvm_param_bool(const char *param) 139 { 140 return get_module_param_bool("kvm", param); 141 } 142 143 bool get_kvm_intel_param_bool(const char *param) 144 { 145 return get_module_param_bool("kvm_intel", param); 146 } 147 148 bool get_kvm_amd_param_bool(const char *param) 149 { 150 return get_module_param_bool("kvm_amd", param); 151 } 152 153 int get_kvm_param_integer(const char *param) 154 { 155 return get_module_param_integer("kvm", param); 156 } 157 158 int get_kvm_intel_param_integer(const char *param) 159 { 160 return get_module_param_integer("kvm_intel", param); 161 } 162 163 int get_kvm_amd_param_integer(const char *param) 164 { 165 return get_module_param_integer("kvm_amd", param); 166 } 167 168 /* 169 * Capability 170 * 171 * Input Args: 172 * cap - Capability 173 * 174 * Output Args: None 175 * 176 * Return: 177 * On success, the Value corresponding to the capability (KVM_CAP_*) 178 * specified by the value of cap. On failure a TEST_ASSERT failure 179 * is produced. 180 * 181 * Looks up and returns the value corresponding to the capability 182 * (KVM_CAP_*) given by cap. 183 */ 184 unsigned int kvm_check_cap(long cap) 185 { 186 int ret; 187 int kvm_fd; 188 189 kvm_fd = open_kvm_dev_path_or_exit(); 190 ret = __kvm_ioctl(kvm_fd, KVM_CHECK_EXTENSION, (void *)cap); 191 TEST_ASSERT(ret >= 0, KVM_IOCTL_ERROR(KVM_CHECK_EXTENSION, ret)); 192 193 close(kvm_fd); 194 195 return (unsigned int)ret; 196 } 197 198 void vm_enable_dirty_ring(struct kvm_vm *vm, uint32_t ring_size) 199 { 200 if (vm_check_cap(vm, KVM_CAP_DIRTY_LOG_RING_ACQ_REL)) 201 vm_enable_cap(vm, KVM_CAP_DIRTY_LOG_RING_ACQ_REL, ring_size); 202 else 203 vm_enable_cap(vm, KVM_CAP_DIRTY_LOG_RING, ring_size); 204 vm->dirty_ring_size = ring_size; 205 } 206 207 static void vm_open(struct kvm_vm *vm) 208 { 209 vm->kvm_fd = _open_kvm_dev_path_or_exit(O_RDWR); 210 211 TEST_REQUIRE(kvm_has_cap(KVM_CAP_IMMEDIATE_EXIT)); 212 213 vm->fd = __kvm_ioctl(vm->kvm_fd, KVM_CREATE_VM, (void *)vm->type); 214 TEST_ASSERT(vm->fd >= 0, KVM_IOCTL_ERROR(KVM_CREATE_VM, vm->fd)); 215 216 if (kvm_has_cap(KVM_CAP_BINARY_STATS_FD)) 217 vm->stats.fd = vm_get_stats_fd(vm); 218 else 219 vm->stats.fd = -1; 220 } 221 222 const char *vm_guest_mode_string(uint32_t i) 223 { 224 static const char * const strings[] = { 225 [VM_MODE_P52V48_4K] = "PA-bits:52, VA-bits:48, 4K pages", 226 [VM_MODE_P52V48_16K] = "PA-bits:52, VA-bits:48, 16K pages", 227 [VM_MODE_P52V48_64K] = "PA-bits:52, VA-bits:48, 64K pages", 228 [VM_MODE_P48V48_4K] = "PA-bits:48, VA-bits:48, 4K pages", 229 [VM_MODE_P48V48_16K] = "PA-bits:48, VA-bits:48, 16K pages", 230 [VM_MODE_P48V48_64K] = "PA-bits:48, VA-bits:48, 64K pages", 231 [VM_MODE_P40V48_4K] = "PA-bits:40, VA-bits:48, 4K pages", 232 [VM_MODE_P40V48_16K] = "PA-bits:40, VA-bits:48, 16K pages", 233 [VM_MODE_P40V48_64K] = "PA-bits:40, VA-bits:48, 64K pages", 234 [VM_MODE_PXXV48_4K] = "PA-bits:ANY, VA-bits:48, 4K pages", 235 [VM_MODE_P47V64_4K] = "PA-bits:47, VA-bits:64, 4K pages", 236 [VM_MODE_P44V64_4K] = "PA-bits:44, VA-bits:64, 4K pages", 237 [VM_MODE_P36V48_4K] = 
"PA-bits:36, VA-bits:48, 4K pages", 238 [VM_MODE_P36V48_16K] = "PA-bits:36, VA-bits:48, 16K pages", 239 [VM_MODE_P36V48_64K] = "PA-bits:36, VA-bits:48, 64K pages", 240 [VM_MODE_P47V47_16K] = "PA-bits:47, VA-bits:47, 16K pages", 241 [VM_MODE_P36V47_16K] = "PA-bits:36, VA-bits:47, 16K pages", 242 }; 243 _Static_assert(sizeof(strings)/sizeof(char *) == NUM_VM_MODES, 244 "Missing new mode strings?"); 245 246 TEST_ASSERT(i < NUM_VM_MODES, "Guest mode ID %d too big", i); 247 248 return strings[i]; 249 } 250 251 const struct vm_guest_mode_params vm_guest_mode_params[] = { 252 [VM_MODE_P52V48_4K] = { 52, 48, 0x1000, 12 }, 253 [VM_MODE_P52V48_16K] = { 52, 48, 0x4000, 14 }, 254 [VM_MODE_P52V48_64K] = { 52, 48, 0x10000, 16 }, 255 [VM_MODE_P48V48_4K] = { 48, 48, 0x1000, 12 }, 256 [VM_MODE_P48V48_16K] = { 48, 48, 0x4000, 14 }, 257 [VM_MODE_P48V48_64K] = { 48, 48, 0x10000, 16 }, 258 [VM_MODE_P40V48_4K] = { 40, 48, 0x1000, 12 }, 259 [VM_MODE_P40V48_16K] = { 40, 48, 0x4000, 14 }, 260 [VM_MODE_P40V48_64K] = { 40, 48, 0x10000, 16 }, 261 [VM_MODE_PXXV48_4K] = { 0, 0, 0x1000, 12 }, 262 [VM_MODE_P47V64_4K] = { 47, 64, 0x1000, 12 }, 263 [VM_MODE_P44V64_4K] = { 44, 64, 0x1000, 12 }, 264 [VM_MODE_P36V48_4K] = { 36, 48, 0x1000, 12 }, 265 [VM_MODE_P36V48_16K] = { 36, 48, 0x4000, 14 }, 266 [VM_MODE_P36V48_64K] = { 36, 48, 0x10000, 16 }, 267 [VM_MODE_P47V47_16K] = { 47, 47, 0x4000, 14 }, 268 [VM_MODE_P36V47_16K] = { 36, 47, 0x4000, 14 }, 269 }; 270 _Static_assert(sizeof(vm_guest_mode_params)/sizeof(struct vm_guest_mode_params) == NUM_VM_MODES, 271 "Missing new mode params?"); 272 273 /* 274 * Initializes vm->vpages_valid to match the canonical VA space of the 275 * architecture. 276 * 277 * The default implementation is valid for architectures which split the 278 * range addressed by a single page table into a low and high region 279 * based on the MSB of the VA. On architectures with this behavior 280 * the VA region spans [0, 2^(va_bits - 1)), [-(2^(va_bits - 1), -1]. 281 */ 282 __weak void vm_vaddr_populate_bitmap(struct kvm_vm *vm) 283 { 284 sparsebit_set_num(vm->vpages_valid, 285 0, (1ULL << (vm->va_bits - 1)) >> vm->page_shift); 286 sparsebit_set_num(vm->vpages_valid, 287 (~((1ULL << (vm->va_bits - 1)) - 1)) >> vm->page_shift, 288 (1ULL << (vm->va_bits - 1)) >> vm->page_shift); 289 } 290 291 struct kvm_vm *____vm_create(struct vm_shape shape) 292 { 293 struct kvm_vm *vm; 294 295 vm = calloc(1, sizeof(*vm)); 296 TEST_ASSERT(vm != NULL, "Insufficient Memory"); 297 298 INIT_LIST_HEAD(&vm->vcpus); 299 vm->regions.gpa_tree = RB_ROOT; 300 vm->regions.hva_tree = RB_ROOT; 301 hash_init(vm->regions.slot_hash); 302 303 vm->mode = shape.mode; 304 vm->type = shape.type; 305 306 vm->pa_bits = vm_guest_mode_params[vm->mode].pa_bits; 307 vm->va_bits = vm_guest_mode_params[vm->mode].va_bits; 308 vm->page_size = vm_guest_mode_params[vm->mode].page_size; 309 vm->page_shift = vm_guest_mode_params[vm->mode].page_shift; 310 311 /* Setup mode specific traits. 
*/ 312 switch (vm->mode) { 313 case VM_MODE_P52V48_4K: 314 vm->pgtable_levels = 4; 315 break; 316 case VM_MODE_P52V48_64K: 317 vm->pgtable_levels = 3; 318 break; 319 case VM_MODE_P48V48_4K: 320 vm->pgtable_levels = 4; 321 break; 322 case VM_MODE_P48V48_64K: 323 vm->pgtable_levels = 3; 324 break; 325 case VM_MODE_P40V48_4K: 326 case VM_MODE_P36V48_4K: 327 vm->pgtable_levels = 4; 328 break; 329 case VM_MODE_P40V48_64K: 330 case VM_MODE_P36V48_64K: 331 vm->pgtable_levels = 3; 332 break; 333 case VM_MODE_P52V48_16K: 334 case VM_MODE_P48V48_16K: 335 case VM_MODE_P40V48_16K: 336 case VM_MODE_P36V48_16K: 337 vm->pgtable_levels = 4; 338 break; 339 case VM_MODE_P47V47_16K: 340 case VM_MODE_P36V47_16K: 341 vm->pgtable_levels = 3; 342 break; 343 case VM_MODE_PXXV48_4K: 344 #ifdef __x86_64__ 345 kvm_get_cpu_address_width(&vm->pa_bits, &vm->va_bits); 346 kvm_init_vm_address_properties(vm); 347 /* 348 * Ignore KVM support for 5-level paging (vm->va_bits == 57), 349 * it doesn't take effect unless a CR4.LA57 is set, which it 350 * isn't for this mode (48-bit virtual address space). 351 */ 352 TEST_ASSERT(vm->va_bits == 48 || vm->va_bits == 57, 353 "Linear address width (%d bits) not supported", 354 vm->va_bits); 355 pr_debug("Guest physical address width detected: %d\n", 356 vm->pa_bits); 357 vm->pgtable_levels = 4; 358 vm->va_bits = 48; 359 #else 360 TEST_FAIL("VM_MODE_PXXV48_4K not supported on non-x86 platforms"); 361 #endif 362 break; 363 case VM_MODE_P47V64_4K: 364 vm->pgtable_levels = 5; 365 break; 366 case VM_MODE_P44V64_4K: 367 vm->pgtable_levels = 5; 368 break; 369 default: 370 TEST_FAIL("Unknown guest mode: 0x%x", vm->mode); 371 } 372 373 #ifdef __aarch64__ 374 TEST_ASSERT(!vm->type, "ARM doesn't support test-provided types"); 375 if (vm->pa_bits != 40) 376 vm->type = KVM_VM_TYPE_ARM_IPA_SIZE(vm->pa_bits); 377 #endif 378 379 vm_open(vm); 380 381 /* Limit to VA-bit canonical virtual addresses. */ 382 vm->vpages_valid = sparsebit_alloc(); 383 vm_vaddr_populate_bitmap(vm); 384 385 /* Limit physical addresses to PA-bits. */ 386 vm->max_gfn = vm_compute_max_gfn(vm); 387 388 /* Allocate and setup memory for guest. */ 389 vm->vpages_mapped = sparsebit_alloc(); 390 391 return vm; 392 } 393 394 static uint64_t vm_nr_pages_required(enum vm_guest_mode mode, 395 uint32_t nr_runnable_vcpus, 396 uint64_t extra_mem_pages) 397 { 398 uint64_t page_size = vm_guest_mode_params[mode].page_size; 399 uint64_t nr_pages; 400 401 TEST_ASSERT(nr_runnable_vcpus, 402 "Use vm_create_barebones() for VMs that _never_ have vCPUs"); 403 404 TEST_ASSERT(nr_runnable_vcpus <= kvm_check_cap(KVM_CAP_MAX_VCPUS), 405 "nr_vcpus = %d too large for host, max-vcpus = %d", 406 nr_runnable_vcpus, kvm_check_cap(KVM_CAP_MAX_VCPUS)); 407 408 /* 409 * Arbitrarily allocate 512 pages (2mb when page size is 4kb) for the 410 * test code and other per-VM assets that will be loaded into memslot0. 411 */ 412 nr_pages = 512; 413 414 /* Account for the per-vCPU stacks on behalf of the test. */ 415 nr_pages += nr_runnable_vcpus * DEFAULT_STACK_PGS; 416 417 /* 418 * Account for the number of pages needed for the page tables. The 419 * maximum page table size for a memory region will be when the 420 * smallest page size is used. Considering each page contains x page 421 * table descriptors, the total extra size for page tables (for extra 422 * N pages) will be: N/x+N/x^2+N/x^3+... which is definitely smaller 423 * than N/x*2. 
424 */ 425 nr_pages += (nr_pages + extra_mem_pages) / PTES_PER_MIN_PAGE * 2; 426 427 /* Account for the number of pages needed by ucall. */ 428 nr_pages += ucall_nr_pages_required(page_size); 429 430 return vm_adjust_num_guest_pages(mode, nr_pages); 431 } 432 433 void kvm_set_files_rlimit(uint32_t nr_vcpus) 434 { 435 /* 436 * Each vCPU will open two file descriptors: the vCPU itself and the 437 * vCPU's binary stats file descriptor. Add an arbitrary amount of 438 * buffer for all other files a test may open. 439 */ 440 int nr_fds_wanted = nr_vcpus * 2 + 100; 441 struct rlimit rl; 442 443 /* 444 * Check that we're allowed to open nr_fds_wanted file descriptors and 445 * try raising the limits if needed. 446 */ 447 TEST_ASSERT(!getrlimit(RLIMIT_NOFILE, &rl), "getrlimit() failed!"); 448 449 if (rl.rlim_cur < nr_fds_wanted) { 450 rl.rlim_cur = nr_fds_wanted; 451 if (rl.rlim_max < nr_fds_wanted) { 452 int old_rlim_max = rl.rlim_max; 453 454 rl.rlim_max = nr_fds_wanted; 455 __TEST_REQUIRE(setrlimit(RLIMIT_NOFILE, &rl) >= 0, 456 "RLIMIT_NOFILE hard limit is too low (%d, wanted %d)", 457 old_rlim_max, nr_fds_wanted); 458 } else { 459 TEST_ASSERT(!setrlimit(RLIMIT_NOFILE, &rl), "setrlimit() failed!"); 460 } 461 } 462 463 } 464 465 static bool is_guest_memfd_required(struct vm_shape shape) 466 { 467 #ifdef __x86_64__ 468 return shape.type == KVM_X86_SNP_VM; 469 #else 470 return false; 471 #endif 472 } 473 474 struct kvm_vm *__vm_create(struct vm_shape shape, uint32_t nr_runnable_vcpus, 475 uint64_t nr_extra_pages) 476 { 477 uint64_t nr_pages = vm_nr_pages_required(shape.mode, nr_runnable_vcpus, 478 nr_extra_pages); 479 struct userspace_mem_region *slot0; 480 struct kvm_vm *vm; 481 int i, flags; 482 483 kvm_set_files_rlimit(nr_runnable_vcpus); 484 485 pr_debug("%s: mode='%s' type='%d', pages='%ld'\n", __func__, 486 vm_guest_mode_string(shape.mode), shape.type, nr_pages); 487 488 vm = ____vm_create(shape); 489 490 /* 491 * Force GUEST_MEMFD for the primary memory region if necessary, e.g. 492 * for CoCo VMs that require GUEST_MEMFD backed private memory. 493 */ 494 flags = 0; 495 if (is_guest_memfd_required(shape)) 496 flags |= KVM_MEM_GUEST_MEMFD; 497 498 vm_userspace_mem_region_add(vm, VM_MEM_SRC_ANONYMOUS, 0, 0, nr_pages, flags); 499 for (i = 0; i < NR_MEM_REGIONS; i++) 500 vm->memslots[i] = 0; 501 502 kvm_vm_elf_load(vm, program_invocation_name); 503 504 /* 505 * TODO: Add proper defines to protect the library's memslots, and then 506 * carve out memslot1 for the ucall MMIO address. KVM treats writes to 507 * read-only memslots as MMIO, and creating a read-only memslot for the 508 * MMIO region would prevent silently clobbering the MMIO region. 509 */ 510 slot0 = memslot2region(vm, 0); 511 ucall_init(vm, slot0->region.guest_phys_addr + slot0->region.memory_size); 512 513 if (guest_random_seed != last_guest_seed) { 514 pr_info("Random seed: 0x%x\n", guest_random_seed); 515 last_guest_seed = guest_random_seed; 516 } 517 guest_rng = new_guest_random_state(guest_random_seed); 518 sync_global_to_guest(vm, guest_rng); 519 520 kvm_arch_vm_post_create(vm); 521 522 return vm; 523 } 524 525 /* 526 * VM Create with customized parameters 527 * 528 * Input Args: 529 * mode - VM Mode (e.g. VM_MODE_P52V48_4K) 530 * nr_vcpus - VCPU count 531 * extra_mem_pages - Non-slot0 physical memory total size 532 * guest_code - Guest entry point 533 * vcpuids - VCPU IDs 534 * 535 * Output Args: None 536 * 537 * Return: 538 * Pointer to opaque structure that describes the created VM. 
539 * 540 * Creates a VM with the mode specified by mode (e.g. VM_MODE_P52V48_4K). 541 * extra_mem_pages is only used to calculate the maximum page table size, 542 * no real memory allocation for non-slot0 memory in this function. 543 */ 544 struct kvm_vm *__vm_create_with_vcpus(struct vm_shape shape, uint32_t nr_vcpus, 545 uint64_t extra_mem_pages, 546 void *guest_code, struct kvm_vcpu *vcpus[]) 547 { 548 struct kvm_vm *vm; 549 int i; 550 551 TEST_ASSERT(!nr_vcpus || vcpus, "Must provide vCPU array"); 552 553 vm = __vm_create(shape, nr_vcpus, extra_mem_pages); 554 555 for (i = 0; i < nr_vcpus; ++i) 556 vcpus[i] = vm_vcpu_add(vm, i, guest_code); 557 558 return vm; 559 } 560 561 struct kvm_vm *__vm_create_shape_with_one_vcpu(struct vm_shape shape, 562 struct kvm_vcpu **vcpu, 563 uint64_t extra_mem_pages, 564 void *guest_code) 565 { 566 struct kvm_vcpu *vcpus[1]; 567 struct kvm_vm *vm; 568 569 vm = __vm_create_with_vcpus(shape, 1, extra_mem_pages, guest_code, vcpus); 570 571 *vcpu = vcpus[0]; 572 return vm; 573 } 574 575 /* 576 * VM Restart 577 * 578 * Input Args: 579 * vm - VM that has been released before 580 * 581 * Output Args: None 582 * 583 * Reopens the file descriptors associated to the VM and reinstates the 584 * global state, such as the irqchip and the memory regions that are mapped 585 * into the guest. 586 */ 587 void kvm_vm_restart(struct kvm_vm *vmp) 588 { 589 int ctr; 590 struct userspace_mem_region *region; 591 592 vm_open(vmp); 593 if (vmp->has_irqchip) 594 vm_create_irqchip(vmp); 595 596 hash_for_each(vmp->regions.slot_hash, ctr, region, slot_node) { 597 int ret = ioctl(vmp->fd, KVM_SET_USER_MEMORY_REGION2, ®ion->region); 598 599 TEST_ASSERT(ret == 0, "KVM_SET_USER_MEMORY_REGION2 IOCTL failed,\n" 600 " rc: %i errno: %i\n" 601 " slot: %u flags: 0x%x\n" 602 " guest_phys_addr: 0x%llx size: 0x%llx", 603 ret, errno, region->region.slot, 604 region->region.flags, 605 region->region.guest_phys_addr, 606 region->region.memory_size); 607 } 608 } 609 610 __weak struct kvm_vcpu *vm_arch_vcpu_recreate(struct kvm_vm *vm, 611 uint32_t vcpu_id) 612 { 613 return __vm_vcpu_add(vm, vcpu_id); 614 } 615 616 struct kvm_vcpu *vm_recreate_with_one_vcpu(struct kvm_vm *vm) 617 { 618 kvm_vm_restart(vm); 619 620 return vm_vcpu_recreate(vm, 0); 621 } 622 623 int __pin_task_to_cpu(pthread_t task, int cpu) 624 { 625 cpu_set_t cpuset; 626 627 CPU_ZERO(&cpuset); 628 CPU_SET(cpu, &cpuset); 629 630 return pthread_setaffinity_np(task, sizeof(cpuset), &cpuset); 631 } 632 633 static uint32_t parse_pcpu(const char *cpu_str, const cpu_set_t *allowed_mask) 634 { 635 uint32_t pcpu = atoi_non_negative("CPU number", cpu_str); 636 637 TEST_ASSERT(CPU_ISSET(pcpu, allowed_mask), 638 "Not allowed to run on pCPU '%d', check cgroups?", pcpu); 639 return pcpu; 640 } 641 642 void kvm_print_vcpu_pinning_help(void) 643 { 644 const char *name = program_invocation_name; 645 646 printf(" -c: Pin tasks to physical CPUs. Takes a list of comma separated\n" 647 " values (target pCPU), one for each vCPU, plus an optional\n" 648 " entry for the main application task (specified via entry\n" 649 " <nr_vcpus + 1>). If used, entries must be provided for all\n" 650 " vCPUs, i.e. pinning vCPUs is all or nothing.\n\n" 651 " E.g. 
to create 3 vCPUs, pin vCPU0=>pCPU22, vCPU1=>pCPU23,\n" 652 " vCPU2=>pCPU24, and pin the application task to pCPU50:\n\n" 653 " %s -v 3 -c 22,23,24,50\n\n" 654 " To leave the application task unpinned, drop the final entry:\n\n" 655 " %s -v 3 -c 22,23,24\n\n" 656 " (default: no pinning)\n", name, name); 657 } 658 659 void kvm_parse_vcpu_pinning(const char *pcpus_string, uint32_t vcpu_to_pcpu[], 660 int nr_vcpus) 661 { 662 cpu_set_t allowed_mask; 663 char *cpu, *cpu_list; 664 char delim[2] = ","; 665 int i, r; 666 667 cpu_list = strdup(pcpus_string); 668 TEST_ASSERT(cpu_list, "strdup() allocation failed."); 669 670 r = sched_getaffinity(0, sizeof(allowed_mask), &allowed_mask); 671 TEST_ASSERT(!r, "sched_getaffinity() failed"); 672 673 cpu = strtok(cpu_list, delim); 674 675 /* 1. Get all pcpus for vcpus. */ 676 for (i = 0; i < nr_vcpus; i++) { 677 TEST_ASSERT(cpu, "pCPU not provided for vCPU '%d'", i); 678 vcpu_to_pcpu[i] = parse_pcpu(cpu, &allowed_mask); 679 cpu = strtok(NULL, delim); 680 } 681 682 /* 2. Check if the main worker needs to be pinned. */ 683 if (cpu) { 684 pin_self_to_cpu(parse_pcpu(cpu, &allowed_mask)); 685 cpu = strtok(NULL, delim); 686 } 687 688 TEST_ASSERT(!cpu, "pCPU list contains trailing garbage characters '%s'", cpu); 689 free(cpu_list); 690 } 691 692 /* 693 * Userspace Memory Region Find 694 * 695 * Input Args: 696 * vm - Virtual Machine 697 * start - Starting VM physical address 698 * end - Ending VM physical address, inclusive. 699 * 700 * Output Args: None 701 * 702 * Return: 703 * Pointer to overlapping region, NULL if no such region. 704 * 705 * Searches for a region with any physical memory that overlaps with 706 * any portion of the guest physical addresses from start to end 707 * inclusive. If multiple overlapping regions exist, a pointer to any 708 * of the regions is returned. Null is returned only when no overlapping 709 * region exists. 710 */ 711 static struct userspace_mem_region * 712 userspace_mem_region_find(struct kvm_vm *vm, uint64_t start, uint64_t end) 713 { 714 struct rb_node *node; 715 716 for (node = vm->regions.gpa_tree.rb_node; node; ) { 717 struct userspace_mem_region *region = 718 container_of(node, struct userspace_mem_region, gpa_node); 719 uint64_t existing_start = region->region.guest_phys_addr; 720 uint64_t existing_end = region->region.guest_phys_addr 721 + region->region.memory_size - 1; 722 if (start <= existing_end && end >= existing_start) 723 return region; 724 725 if (start < existing_start) 726 node = node->rb_left; 727 else 728 node = node->rb_right; 729 } 730 731 return NULL; 732 } 733 734 static void kvm_stats_release(struct kvm_binary_stats *stats) 735 { 736 int ret; 737 738 if (stats->fd < 0) 739 return; 740 741 if (stats->desc) { 742 free(stats->desc); 743 stats->desc = NULL; 744 } 745 746 ret = close(stats->fd); 747 TEST_ASSERT(!ret, __KVM_SYSCALL_ERROR("close()", ret)); 748 stats->fd = -1; 749 } 750 751 __weak void vcpu_arch_free(struct kvm_vcpu *vcpu) 752 { 753 754 } 755 756 /* 757 * VM VCPU Remove 758 * 759 * Input Args: 760 * vcpu - VCPU to remove 761 * 762 * Output Args: None 763 * 764 * Return: None, TEST_ASSERT failures for all error conditions 765 * 766 * Removes a vCPU from a VM and frees its resources. 
767 */ 768 static void vm_vcpu_rm(struct kvm_vm *vm, struct kvm_vcpu *vcpu) 769 { 770 int ret; 771 772 if (vcpu->dirty_gfns) { 773 ret = munmap(vcpu->dirty_gfns, vm->dirty_ring_size); 774 TEST_ASSERT(!ret, __KVM_SYSCALL_ERROR("munmap()", ret)); 775 vcpu->dirty_gfns = NULL; 776 } 777 778 ret = munmap(vcpu->run, vcpu_mmap_sz()); 779 TEST_ASSERT(!ret, __KVM_SYSCALL_ERROR("munmap()", ret)); 780 781 ret = close(vcpu->fd); 782 TEST_ASSERT(!ret, __KVM_SYSCALL_ERROR("close()", ret)); 783 784 kvm_stats_release(&vcpu->stats); 785 786 list_del(&vcpu->list); 787 788 vcpu_arch_free(vcpu); 789 free(vcpu); 790 } 791 792 void kvm_vm_release(struct kvm_vm *vmp) 793 { 794 struct kvm_vcpu *vcpu, *tmp; 795 int ret; 796 797 list_for_each_entry_safe(vcpu, tmp, &vmp->vcpus, list) 798 vm_vcpu_rm(vmp, vcpu); 799 800 ret = close(vmp->fd); 801 TEST_ASSERT(!ret, __KVM_SYSCALL_ERROR("close()", ret)); 802 803 ret = close(vmp->kvm_fd); 804 TEST_ASSERT(!ret, __KVM_SYSCALL_ERROR("close()", ret)); 805 806 /* Free cached stats metadata and close FD */ 807 kvm_stats_release(&vmp->stats); 808 } 809 810 static void __vm_mem_region_delete(struct kvm_vm *vm, 811 struct userspace_mem_region *region) 812 { 813 int ret; 814 815 rb_erase(®ion->gpa_node, &vm->regions.gpa_tree); 816 rb_erase(®ion->hva_node, &vm->regions.hva_tree); 817 hash_del(®ion->slot_node); 818 819 sparsebit_free(®ion->unused_phy_pages); 820 sparsebit_free(®ion->protected_phy_pages); 821 ret = munmap(region->mmap_start, region->mmap_size); 822 TEST_ASSERT(!ret, __KVM_SYSCALL_ERROR("munmap()", ret)); 823 if (region->fd >= 0) { 824 /* There's an extra map when using shared memory. */ 825 ret = munmap(region->mmap_alias, region->mmap_size); 826 TEST_ASSERT(!ret, __KVM_SYSCALL_ERROR("munmap()", ret)); 827 close(region->fd); 828 } 829 if (region->region.guest_memfd >= 0) 830 close(region->region.guest_memfd); 831 832 free(region); 833 } 834 835 /* 836 * Destroys and frees the VM pointed to by vmp. 837 */ 838 void kvm_vm_free(struct kvm_vm *vmp) 839 { 840 int ctr; 841 struct hlist_node *node; 842 struct userspace_mem_region *region; 843 844 if (vmp == NULL) 845 return; 846 847 /* Free userspace_mem_regions. */ 848 hash_for_each_safe(vmp->regions.slot_hash, ctr, node, region, slot_node) 849 __vm_mem_region_delete(vmp, region); 850 851 /* Free sparsebit arrays. */ 852 sparsebit_free(&vmp->vpages_valid); 853 sparsebit_free(&vmp->vpages_mapped); 854 855 kvm_vm_release(vmp); 856 857 /* Free the structure describing the VM. 
*/ 858 free(vmp); 859 } 860 861 int kvm_memfd_alloc(size_t size, bool hugepages) 862 { 863 int memfd_flags = MFD_CLOEXEC; 864 int fd, r; 865 866 if (hugepages) 867 memfd_flags |= MFD_HUGETLB; 868 869 fd = memfd_create("kvm_selftest", memfd_flags); 870 TEST_ASSERT(fd != -1, __KVM_SYSCALL_ERROR("memfd_create()", fd)); 871 872 r = ftruncate(fd, size); 873 TEST_ASSERT(!r, __KVM_SYSCALL_ERROR("ftruncate()", r)); 874 875 r = fallocate(fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, 0, size); 876 TEST_ASSERT(!r, __KVM_SYSCALL_ERROR("fallocate()", r)); 877 878 return fd; 879 } 880 881 static void vm_userspace_mem_region_gpa_insert(struct rb_root *gpa_tree, 882 struct userspace_mem_region *region) 883 { 884 struct rb_node **cur, *parent; 885 886 for (cur = &gpa_tree->rb_node, parent = NULL; *cur; ) { 887 struct userspace_mem_region *cregion; 888 889 cregion = container_of(*cur, typeof(*cregion), gpa_node); 890 parent = *cur; 891 if (region->region.guest_phys_addr < 892 cregion->region.guest_phys_addr) 893 cur = &(*cur)->rb_left; 894 else { 895 TEST_ASSERT(region->region.guest_phys_addr != 896 cregion->region.guest_phys_addr, 897 "Duplicate GPA in region tree"); 898 899 cur = &(*cur)->rb_right; 900 } 901 } 902 903 rb_link_node(®ion->gpa_node, parent, cur); 904 rb_insert_color(®ion->gpa_node, gpa_tree); 905 } 906 907 static void vm_userspace_mem_region_hva_insert(struct rb_root *hva_tree, 908 struct userspace_mem_region *region) 909 { 910 struct rb_node **cur, *parent; 911 912 for (cur = &hva_tree->rb_node, parent = NULL; *cur; ) { 913 struct userspace_mem_region *cregion; 914 915 cregion = container_of(*cur, typeof(*cregion), hva_node); 916 parent = *cur; 917 if (region->host_mem < cregion->host_mem) 918 cur = &(*cur)->rb_left; 919 else { 920 TEST_ASSERT(region->host_mem != 921 cregion->host_mem, 922 "Duplicate HVA in region tree"); 923 924 cur = &(*cur)->rb_right; 925 } 926 } 927 928 rb_link_node(®ion->hva_node, parent, cur); 929 rb_insert_color(®ion->hva_node, hva_tree); 930 } 931 932 933 int __vm_set_user_memory_region(struct kvm_vm *vm, uint32_t slot, uint32_t flags, 934 uint64_t gpa, uint64_t size, void *hva) 935 { 936 struct kvm_userspace_memory_region region = { 937 .slot = slot, 938 .flags = flags, 939 .guest_phys_addr = gpa, 940 .memory_size = size, 941 .userspace_addr = (uintptr_t)hva, 942 }; 943 944 return ioctl(vm->fd, KVM_SET_USER_MEMORY_REGION, ®ion); 945 } 946 947 void vm_set_user_memory_region(struct kvm_vm *vm, uint32_t slot, uint32_t flags, 948 uint64_t gpa, uint64_t size, void *hva) 949 { 950 int ret = __vm_set_user_memory_region(vm, slot, flags, gpa, size, hva); 951 952 TEST_ASSERT(!ret, "KVM_SET_USER_MEMORY_REGION failed, errno = %d (%s)", 953 errno, strerror(errno)); 954 } 955 956 #define TEST_REQUIRE_SET_USER_MEMORY_REGION2() \ 957 __TEST_REQUIRE(kvm_has_cap(KVM_CAP_USER_MEMORY2), \ 958 "KVM selftests now require KVM_SET_USER_MEMORY_REGION2 (introduced in v6.8)") 959 960 int __vm_set_user_memory_region2(struct kvm_vm *vm, uint32_t slot, uint32_t flags, 961 uint64_t gpa, uint64_t size, void *hva, 962 uint32_t guest_memfd, uint64_t guest_memfd_offset) 963 { 964 struct kvm_userspace_memory_region2 region = { 965 .slot = slot, 966 .flags = flags, 967 .guest_phys_addr = gpa, 968 .memory_size = size, 969 .userspace_addr = (uintptr_t)hva, 970 .guest_memfd = guest_memfd, 971 .guest_memfd_offset = guest_memfd_offset, 972 }; 973 974 TEST_REQUIRE_SET_USER_MEMORY_REGION2(); 975 976 return ioctl(vm->fd, KVM_SET_USER_MEMORY_REGION2, ®ion); 977 } 978 979 void 
vm_set_user_memory_region2(struct kvm_vm *vm, uint32_t slot, uint32_t flags, 980 uint64_t gpa, uint64_t size, void *hva, 981 uint32_t guest_memfd, uint64_t guest_memfd_offset) 982 { 983 int ret = __vm_set_user_memory_region2(vm, slot, flags, gpa, size, hva, 984 guest_memfd, guest_memfd_offset); 985 986 TEST_ASSERT(!ret, "KVM_SET_USER_MEMORY_REGION2 failed, errno = %d (%s)", 987 errno, strerror(errno)); 988 } 989 990 991 /* FIXME: This thing needs to be ripped apart and rewritten. */ 992 void vm_mem_add(struct kvm_vm *vm, enum vm_mem_backing_src_type src_type, 993 uint64_t guest_paddr, uint32_t slot, uint64_t npages, 994 uint32_t flags, int guest_memfd, uint64_t guest_memfd_offset) 995 { 996 int ret; 997 struct userspace_mem_region *region; 998 size_t backing_src_pagesz = get_backing_src_pagesz(src_type); 999 size_t mem_size = npages * vm->page_size; 1000 size_t alignment; 1001 1002 TEST_REQUIRE_SET_USER_MEMORY_REGION2(); 1003 1004 TEST_ASSERT(vm_adjust_num_guest_pages(vm->mode, npages) == npages, 1005 "Number of guest pages is not compatible with the host. " 1006 "Try npages=%d", vm_adjust_num_guest_pages(vm->mode, npages)); 1007 1008 TEST_ASSERT((guest_paddr % vm->page_size) == 0, "Guest physical " 1009 "address not on a page boundary.\n" 1010 " guest_paddr: 0x%lx vm->page_size: 0x%x", 1011 guest_paddr, vm->page_size); 1012 TEST_ASSERT((((guest_paddr >> vm->page_shift) + npages) - 1) 1013 <= vm->max_gfn, "Physical range beyond maximum " 1014 "supported physical address,\n" 1015 " guest_paddr: 0x%lx npages: 0x%lx\n" 1016 " vm->max_gfn: 0x%lx vm->page_size: 0x%x", 1017 guest_paddr, npages, vm->max_gfn, vm->page_size); 1018 1019 /* 1020 * Confirm a mem region with an overlapping address doesn't 1021 * already exist. 1022 */ 1023 region = (struct userspace_mem_region *) userspace_mem_region_find( 1024 vm, guest_paddr, (guest_paddr + npages * vm->page_size) - 1); 1025 if (region != NULL) 1026 TEST_FAIL("overlapping userspace_mem_region already " 1027 "exists\n" 1028 " requested guest_paddr: 0x%lx npages: 0x%lx " 1029 "page_size: 0x%x\n" 1030 " existing guest_paddr: 0x%lx size: 0x%lx", 1031 guest_paddr, npages, vm->page_size, 1032 (uint64_t) region->region.guest_phys_addr, 1033 (uint64_t) region->region.memory_size); 1034 1035 /* Confirm no region with the requested slot already exists. */ 1036 hash_for_each_possible(vm->regions.slot_hash, region, slot_node, 1037 slot) { 1038 if (region->region.slot != slot) 1039 continue; 1040 1041 TEST_FAIL("A mem region with the requested slot " 1042 "already exists.\n" 1043 " requested slot: %u paddr: 0x%lx npages: 0x%lx\n" 1044 " existing slot: %u paddr: 0x%lx size: 0x%lx", 1045 slot, guest_paddr, npages, 1046 region->region.slot, 1047 (uint64_t) region->region.guest_phys_addr, 1048 (uint64_t) region->region.memory_size); 1049 } 1050 1051 /* Allocate and initialize new mem region structure. */ 1052 region = calloc(1, sizeof(*region)); 1053 TEST_ASSERT(region != NULL, "Insufficient Memory"); 1054 region->mmap_size = mem_size; 1055 1056 #ifdef __s390x__ 1057 /* On s390x, the host address must be aligned to 1M (due to PGSTEs) */ 1058 alignment = 0x100000; 1059 #else 1060 alignment = 1; 1061 #endif 1062 1063 /* 1064 * When using THP mmap is not guaranteed to returned a hugepage aligned 1065 * address so we have to pad the mmap. Padding is not needed for HugeTLB 1066 * because mmap will always return an address aligned to the HugeTLB 1067 * page size. 
1068 */ 1069 if (src_type == VM_MEM_SRC_ANONYMOUS_THP) 1070 alignment = max(backing_src_pagesz, alignment); 1071 1072 TEST_ASSERT_EQ(guest_paddr, align_up(guest_paddr, backing_src_pagesz)); 1073 1074 /* Add enough memory to align up if necessary */ 1075 if (alignment > 1) 1076 region->mmap_size += alignment; 1077 1078 region->fd = -1; 1079 if (backing_src_is_shared(src_type)) 1080 region->fd = kvm_memfd_alloc(region->mmap_size, 1081 src_type == VM_MEM_SRC_SHARED_HUGETLB); 1082 1083 region->mmap_start = mmap(NULL, region->mmap_size, 1084 PROT_READ | PROT_WRITE, 1085 vm_mem_backing_src_alias(src_type)->flag, 1086 region->fd, 0); 1087 TEST_ASSERT(region->mmap_start != MAP_FAILED, 1088 __KVM_SYSCALL_ERROR("mmap()", (int)(unsigned long)MAP_FAILED)); 1089 1090 TEST_ASSERT(!is_backing_src_hugetlb(src_type) || 1091 region->mmap_start == align_ptr_up(region->mmap_start, backing_src_pagesz), 1092 "mmap_start %p is not aligned to HugeTLB page size 0x%lx", 1093 region->mmap_start, backing_src_pagesz); 1094 1095 /* Align host address */ 1096 region->host_mem = align_ptr_up(region->mmap_start, alignment); 1097 1098 /* As needed perform madvise */ 1099 if ((src_type == VM_MEM_SRC_ANONYMOUS || 1100 src_type == VM_MEM_SRC_ANONYMOUS_THP) && thp_configured()) { 1101 ret = madvise(region->host_mem, mem_size, 1102 src_type == VM_MEM_SRC_ANONYMOUS ? MADV_NOHUGEPAGE : MADV_HUGEPAGE); 1103 TEST_ASSERT(ret == 0, "madvise failed, addr: %p length: 0x%lx src_type: %s", 1104 region->host_mem, mem_size, 1105 vm_mem_backing_src_alias(src_type)->name); 1106 } 1107 1108 region->backing_src_type = src_type; 1109 1110 if (flags & KVM_MEM_GUEST_MEMFD) { 1111 if (guest_memfd < 0) { 1112 uint32_t guest_memfd_flags = 0; 1113 TEST_ASSERT(!guest_memfd_offset, 1114 "Offset must be zero when creating new guest_memfd"); 1115 guest_memfd = vm_create_guest_memfd(vm, mem_size, guest_memfd_flags); 1116 } else { 1117 /* 1118 * Install a unique fd for each memslot so that the fd 1119 * can be closed when the region is deleted without 1120 * needing to track if the fd is owned by the framework 1121 * or by the caller. 
1122 */ 1123 guest_memfd = dup(guest_memfd); 1124 TEST_ASSERT(guest_memfd >= 0, __KVM_SYSCALL_ERROR("dup()", guest_memfd)); 1125 } 1126 1127 region->region.guest_memfd = guest_memfd; 1128 region->region.guest_memfd_offset = guest_memfd_offset; 1129 } else { 1130 region->region.guest_memfd = -1; 1131 } 1132 1133 region->unused_phy_pages = sparsebit_alloc(); 1134 if (vm_arch_has_protected_memory(vm)) 1135 region->protected_phy_pages = sparsebit_alloc(); 1136 sparsebit_set_num(region->unused_phy_pages, 1137 guest_paddr >> vm->page_shift, npages); 1138 region->region.slot = slot; 1139 region->region.flags = flags; 1140 region->region.guest_phys_addr = guest_paddr; 1141 region->region.memory_size = npages * vm->page_size; 1142 region->region.userspace_addr = (uintptr_t) region->host_mem; 1143 ret = __vm_ioctl(vm, KVM_SET_USER_MEMORY_REGION2, ®ion->region); 1144 TEST_ASSERT(ret == 0, "KVM_SET_USER_MEMORY_REGION2 IOCTL failed,\n" 1145 " rc: %i errno: %i\n" 1146 " slot: %u flags: 0x%x\n" 1147 " guest_phys_addr: 0x%lx size: 0x%lx guest_memfd: %d", 1148 ret, errno, slot, flags, 1149 guest_paddr, (uint64_t) region->region.memory_size, 1150 region->region.guest_memfd); 1151 1152 /* Add to quick lookup data structures */ 1153 vm_userspace_mem_region_gpa_insert(&vm->regions.gpa_tree, region); 1154 vm_userspace_mem_region_hva_insert(&vm->regions.hva_tree, region); 1155 hash_add(vm->regions.slot_hash, ®ion->slot_node, slot); 1156 1157 /* If shared memory, create an alias. */ 1158 if (region->fd >= 0) { 1159 region->mmap_alias = mmap(NULL, region->mmap_size, 1160 PROT_READ | PROT_WRITE, 1161 vm_mem_backing_src_alias(src_type)->flag, 1162 region->fd, 0); 1163 TEST_ASSERT(region->mmap_alias != MAP_FAILED, 1164 __KVM_SYSCALL_ERROR("mmap()", (int)(unsigned long)MAP_FAILED)); 1165 1166 /* Align host alias address */ 1167 region->host_alias = align_ptr_up(region->mmap_alias, alignment); 1168 } 1169 } 1170 1171 void vm_userspace_mem_region_add(struct kvm_vm *vm, 1172 enum vm_mem_backing_src_type src_type, 1173 uint64_t guest_paddr, uint32_t slot, 1174 uint64_t npages, uint32_t flags) 1175 { 1176 vm_mem_add(vm, src_type, guest_paddr, slot, npages, flags, -1, 0); 1177 } 1178 1179 /* 1180 * Memslot to region 1181 * 1182 * Input Args: 1183 * vm - Virtual Machine 1184 * memslot - KVM memory slot ID 1185 * 1186 * Output Args: None 1187 * 1188 * Return: 1189 * Pointer to memory region structure that describe memory region 1190 * using kvm memory slot ID given by memslot. TEST_ASSERT failure 1191 * on error (e.g. currently no memory region using memslot as a KVM 1192 * memory slot ID). 1193 */ 1194 struct userspace_mem_region * 1195 memslot2region(struct kvm_vm *vm, uint32_t memslot) 1196 { 1197 struct userspace_mem_region *region; 1198 1199 hash_for_each_possible(vm->regions.slot_hash, region, slot_node, 1200 memslot) 1201 if (region->region.slot == memslot) 1202 return region; 1203 1204 fprintf(stderr, "No mem region with the requested slot found,\n" 1205 " requested slot: %u\n", memslot); 1206 fputs("---- vm dump ----\n", stderr); 1207 vm_dump(stderr, vm, 2); 1208 TEST_FAIL("Mem region not found"); 1209 return NULL; 1210 } 1211 1212 /* 1213 * VM Memory Region Flags Set 1214 * 1215 * Input Args: 1216 * vm - Virtual Machine 1217 * flags - Starting guest physical address 1218 * 1219 * Output Args: None 1220 * 1221 * Return: None 1222 * 1223 * Sets the flags of the memory region specified by the value of slot, 1224 * to the values given by flags. 
1225 */ 1226 void vm_mem_region_set_flags(struct kvm_vm *vm, uint32_t slot, uint32_t flags) 1227 { 1228 int ret; 1229 struct userspace_mem_region *region; 1230 1231 region = memslot2region(vm, slot); 1232 1233 region->region.flags = flags; 1234 1235 ret = __vm_ioctl(vm, KVM_SET_USER_MEMORY_REGION2, ®ion->region); 1236 1237 TEST_ASSERT(ret == 0, "KVM_SET_USER_MEMORY_REGION2 IOCTL failed,\n" 1238 " rc: %i errno: %i slot: %u flags: 0x%x", 1239 ret, errno, slot, flags); 1240 } 1241 1242 /* 1243 * VM Memory Region Move 1244 * 1245 * Input Args: 1246 * vm - Virtual Machine 1247 * slot - Slot of the memory region to move 1248 * new_gpa - Starting guest physical address 1249 * 1250 * Output Args: None 1251 * 1252 * Return: None 1253 * 1254 * Change the gpa of a memory region. 1255 */ 1256 void vm_mem_region_move(struct kvm_vm *vm, uint32_t slot, uint64_t new_gpa) 1257 { 1258 struct userspace_mem_region *region; 1259 int ret; 1260 1261 region = memslot2region(vm, slot); 1262 1263 region->region.guest_phys_addr = new_gpa; 1264 1265 ret = __vm_ioctl(vm, KVM_SET_USER_MEMORY_REGION2, ®ion->region); 1266 1267 TEST_ASSERT(!ret, "KVM_SET_USER_MEMORY_REGION2 failed\n" 1268 "ret: %i errno: %i slot: %u new_gpa: 0x%lx", 1269 ret, errno, slot, new_gpa); 1270 } 1271 1272 /* 1273 * VM Memory Region Delete 1274 * 1275 * Input Args: 1276 * vm - Virtual Machine 1277 * slot - Slot of the memory region to delete 1278 * 1279 * Output Args: None 1280 * 1281 * Return: None 1282 * 1283 * Delete a memory region. 1284 */ 1285 void vm_mem_region_delete(struct kvm_vm *vm, uint32_t slot) 1286 { 1287 struct userspace_mem_region *region = memslot2region(vm, slot); 1288 1289 region->region.memory_size = 0; 1290 vm_ioctl(vm, KVM_SET_USER_MEMORY_REGION2, ®ion->region); 1291 1292 __vm_mem_region_delete(vm, region); 1293 } 1294 1295 void vm_guest_mem_fallocate(struct kvm_vm *vm, uint64_t base, uint64_t size, 1296 bool punch_hole) 1297 { 1298 const int mode = FALLOC_FL_KEEP_SIZE | (punch_hole ? FALLOC_FL_PUNCH_HOLE : 0); 1299 struct userspace_mem_region *region; 1300 uint64_t end = base + size; 1301 uint64_t gpa, len; 1302 off_t fd_offset; 1303 int ret; 1304 1305 for (gpa = base; gpa < end; gpa += len) { 1306 uint64_t offset; 1307 1308 region = userspace_mem_region_find(vm, gpa, gpa); 1309 TEST_ASSERT(region && region->region.flags & KVM_MEM_GUEST_MEMFD, 1310 "Private memory region not found for GPA 0x%lx", gpa); 1311 1312 offset = gpa - region->region.guest_phys_addr; 1313 fd_offset = region->region.guest_memfd_offset + offset; 1314 len = min_t(uint64_t, end - gpa, region->region.memory_size - offset); 1315 1316 ret = fallocate(region->region.guest_memfd, mode, fd_offset, len); 1317 TEST_ASSERT(!ret, "fallocate() failed to %s at %lx (len = %lu), fd = %d, mode = %x, offset = %lx", 1318 punch_hole ? "punch hole" : "allocate", gpa, len, 1319 region->region.guest_memfd, mode, fd_offset); 1320 } 1321 } 1322 1323 /* Returns the size of a vCPU's kvm_run structure. 
*/ 1324 static int vcpu_mmap_sz(void) 1325 { 1326 int dev_fd, ret; 1327 1328 dev_fd = open_kvm_dev_path_or_exit(); 1329 1330 ret = ioctl(dev_fd, KVM_GET_VCPU_MMAP_SIZE, NULL); 1331 TEST_ASSERT(ret >= sizeof(struct kvm_run), 1332 KVM_IOCTL_ERROR(KVM_GET_VCPU_MMAP_SIZE, ret)); 1333 1334 close(dev_fd); 1335 1336 return ret; 1337 } 1338 1339 static bool vcpu_exists(struct kvm_vm *vm, uint32_t vcpu_id) 1340 { 1341 struct kvm_vcpu *vcpu; 1342 1343 list_for_each_entry(vcpu, &vm->vcpus, list) { 1344 if (vcpu->id == vcpu_id) 1345 return true; 1346 } 1347 1348 return false; 1349 } 1350 1351 /* 1352 * Adds a virtual CPU to the VM specified by vm with the ID given by vcpu_id. 1353 * No additional vCPU setup is done. Returns the vCPU. 1354 */ 1355 struct kvm_vcpu *__vm_vcpu_add(struct kvm_vm *vm, uint32_t vcpu_id) 1356 { 1357 struct kvm_vcpu *vcpu; 1358 1359 /* Confirm a vcpu with the specified id doesn't already exist. */ 1360 TEST_ASSERT(!vcpu_exists(vm, vcpu_id), "vCPU%d already exists", vcpu_id); 1361 1362 /* Allocate and initialize new vcpu structure. */ 1363 vcpu = calloc(1, sizeof(*vcpu)); 1364 TEST_ASSERT(vcpu != NULL, "Insufficient Memory"); 1365 1366 vcpu->vm = vm; 1367 vcpu->id = vcpu_id; 1368 vcpu->fd = __vm_ioctl(vm, KVM_CREATE_VCPU, (void *)(unsigned long)vcpu_id); 1369 TEST_ASSERT_VM_VCPU_IOCTL(vcpu->fd >= 0, KVM_CREATE_VCPU, vcpu->fd, vm); 1370 1371 TEST_ASSERT(vcpu_mmap_sz() >= sizeof(*vcpu->run), "vcpu mmap size " 1372 "smaller than expected, vcpu_mmap_sz: %i expected_min: %zi", 1373 vcpu_mmap_sz(), sizeof(*vcpu->run)); 1374 vcpu->run = (struct kvm_run *) mmap(NULL, vcpu_mmap_sz(), 1375 PROT_READ | PROT_WRITE, MAP_SHARED, vcpu->fd, 0); 1376 TEST_ASSERT(vcpu->run != MAP_FAILED, 1377 __KVM_SYSCALL_ERROR("mmap()", (int)(unsigned long)MAP_FAILED)); 1378 1379 if (kvm_has_cap(KVM_CAP_BINARY_STATS_FD)) 1380 vcpu->stats.fd = vcpu_get_stats_fd(vcpu); 1381 else 1382 vcpu->stats.fd = -1; 1383 1384 /* Add to linked-list of VCPUs. */ 1385 list_add(&vcpu->list, &vm->vcpus); 1386 1387 return vcpu; 1388 } 1389 1390 /* 1391 * VM Virtual Address Unused Gap 1392 * 1393 * Input Args: 1394 * vm - Virtual Machine 1395 * sz - Size (bytes) 1396 * vaddr_min - Minimum Virtual Address 1397 * 1398 * Output Args: None 1399 * 1400 * Return: 1401 * Lowest virtual address at or below vaddr_min, with at least 1402 * sz unused bytes. TEST_ASSERT failure if no area of at least 1403 * size sz is available. 1404 * 1405 * Within the VM specified by vm, locates the lowest starting virtual 1406 * address >= vaddr_min, that has at least sz unallocated bytes. A 1407 * TEST_ASSERT failure occurs for invalid input or no area of at least 1408 * sz unallocated bytes >= vaddr_min is available. 1409 */ 1410 vm_vaddr_t vm_vaddr_unused_gap(struct kvm_vm *vm, size_t sz, 1411 vm_vaddr_t vaddr_min) 1412 { 1413 uint64_t pages = (sz + vm->page_size - 1) >> vm->page_shift; 1414 1415 /* Determine lowest permitted virtual page index. */ 1416 uint64_t pgidx_start = (vaddr_min + vm->page_size - 1) >> vm->page_shift; 1417 if ((pgidx_start * vm->page_size) < vaddr_min) 1418 goto no_va_found; 1419 1420 /* Loop over section with enough valid virtual page indexes. */ 1421 if (!sparsebit_is_set_num(vm->vpages_valid, 1422 pgidx_start, pages)) 1423 pgidx_start = sparsebit_next_set_num(vm->vpages_valid, 1424 pgidx_start, pages); 1425 do { 1426 /* 1427 * Are there enough unused virtual pages available at 1428 * the currently proposed starting virtual page index. 1429 * If not, adjust proposed starting index to next 1430 * possible. 
1431 */ 1432 if (sparsebit_is_clear_num(vm->vpages_mapped, 1433 pgidx_start, pages)) 1434 goto va_found; 1435 pgidx_start = sparsebit_next_clear_num(vm->vpages_mapped, 1436 pgidx_start, pages); 1437 if (pgidx_start == 0) 1438 goto no_va_found; 1439 1440 /* 1441 * If needed, adjust proposed starting virtual address, 1442 * to next range of valid virtual addresses. 1443 */ 1444 if (!sparsebit_is_set_num(vm->vpages_valid, 1445 pgidx_start, pages)) { 1446 pgidx_start = sparsebit_next_set_num( 1447 vm->vpages_valid, pgidx_start, pages); 1448 if (pgidx_start == 0) 1449 goto no_va_found; 1450 } 1451 } while (pgidx_start != 0); 1452 1453 no_va_found: 1454 TEST_FAIL("No vaddr of specified pages available, pages: 0x%lx", pages); 1455 1456 /* NOT REACHED */ 1457 return -1; 1458 1459 va_found: 1460 TEST_ASSERT(sparsebit_is_set_num(vm->vpages_valid, 1461 pgidx_start, pages), 1462 "Unexpected, invalid virtual page index range,\n" 1463 " pgidx_start: 0x%lx\n" 1464 " pages: 0x%lx", 1465 pgidx_start, pages); 1466 TEST_ASSERT(sparsebit_is_clear_num(vm->vpages_mapped, 1467 pgidx_start, pages), 1468 "Unexpected, pages already mapped,\n" 1469 " pgidx_start: 0x%lx\n" 1470 " pages: 0x%lx", 1471 pgidx_start, pages); 1472 1473 return pgidx_start * vm->page_size; 1474 } 1475 1476 static vm_vaddr_t ____vm_vaddr_alloc(struct kvm_vm *vm, size_t sz, 1477 vm_vaddr_t vaddr_min, 1478 enum kvm_mem_region_type type, 1479 bool protected) 1480 { 1481 uint64_t pages = (sz >> vm->page_shift) + ((sz % vm->page_size) != 0); 1482 1483 virt_pgd_alloc(vm); 1484 vm_paddr_t paddr = __vm_phy_pages_alloc(vm, pages, 1485 KVM_UTIL_MIN_PFN * vm->page_size, 1486 vm->memslots[type], protected); 1487 1488 /* 1489 * Find an unused range of virtual page addresses of at least 1490 * pages in length. 1491 */ 1492 vm_vaddr_t vaddr_start = vm_vaddr_unused_gap(vm, sz, vaddr_min); 1493 1494 /* Map the virtual pages. */ 1495 for (vm_vaddr_t vaddr = vaddr_start; pages > 0; 1496 pages--, vaddr += vm->page_size, paddr += vm->page_size) { 1497 1498 virt_pg_map(vm, vaddr, paddr); 1499 1500 sparsebit_set(vm->vpages_mapped, vaddr >> vm->page_shift); 1501 } 1502 1503 return vaddr_start; 1504 } 1505 1506 vm_vaddr_t __vm_vaddr_alloc(struct kvm_vm *vm, size_t sz, vm_vaddr_t vaddr_min, 1507 enum kvm_mem_region_type type) 1508 { 1509 return ____vm_vaddr_alloc(vm, sz, vaddr_min, type, 1510 vm_arch_has_protected_memory(vm)); 1511 } 1512 1513 vm_vaddr_t vm_vaddr_alloc_shared(struct kvm_vm *vm, size_t sz, 1514 vm_vaddr_t vaddr_min, 1515 enum kvm_mem_region_type type) 1516 { 1517 return ____vm_vaddr_alloc(vm, sz, vaddr_min, type, false); 1518 } 1519 1520 /* 1521 * VM Virtual Address Allocate 1522 * 1523 * Input Args: 1524 * vm - Virtual Machine 1525 * sz - Size in bytes 1526 * vaddr_min - Minimum starting virtual address 1527 * 1528 * Output Args: None 1529 * 1530 * Return: 1531 * Starting guest virtual address 1532 * 1533 * Allocates at least sz bytes within the virtual address space of the vm 1534 * given by vm. The allocated bytes are mapped to a virtual address >= 1535 * the address given by vaddr_min. Note that each allocation uses a 1536 * a unique set of pages, with the minimum real allocation being at least 1537 * a page. The allocated physical space comes from the TEST_DATA memory region. 
1538 */ 1539 vm_vaddr_t vm_vaddr_alloc(struct kvm_vm *vm, size_t sz, vm_vaddr_t vaddr_min) 1540 { 1541 return __vm_vaddr_alloc(vm, sz, vaddr_min, MEM_REGION_TEST_DATA); 1542 } 1543 1544 /* 1545 * VM Virtual Address Allocate Pages 1546 * 1547 * Input Args: 1548 * vm - Virtual Machine 1549 * 1550 * Output Args: None 1551 * 1552 * Return: 1553 * Starting guest virtual address 1554 * 1555 * Allocates at least N system pages worth of bytes within the virtual address 1556 * space of the vm. 1557 */ 1558 vm_vaddr_t vm_vaddr_alloc_pages(struct kvm_vm *vm, int nr_pages) 1559 { 1560 return vm_vaddr_alloc(vm, nr_pages * getpagesize(), KVM_UTIL_MIN_VADDR); 1561 } 1562 1563 vm_vaddr_t __vm_vaddr_alloc_page(struct kvm_vm *vm, enum kvm_mem_region_type type) 1564 { 1565 return __vm_vaddr_alloc(vm, getpagesize(), KVM_UTIL_MIN_VADDR, type); 1566 } 1567 1568 /* 1569 * VM Virtual Address Allocate Page 1570 * 1571 * Input Args: 1572 * vm - Virtual Machine 1573 * 1574 * Output Args: None 1575 * 1576 * Return: 1577 * Starting guest virtual address 1578 * 1579 * Allocates at least one system page worth of bytes within the virtual address 1580 * space of the vm. 1581 */ 1582 vm_vaddr_t vm_vaddr_alloc_page(struct kvm_vm *vm) 1583 { 1584 return vm_vaddr_alloc_pages(vm, 1); 1585 } 1586 1587 /* 1588 * Map a range of VM virtual address to the VM's physical address 1589 * 1590 * Input Args: 1591 * vm - Virtual Machine 1592 * vaddr - Virtuall address to map 1593 * paddr - VM Physical Address 1594 * npages - The number of pages to map 1595 * 1596 * Output Args: None 1597 * 1598 * Return: None 1599 * 1600 * Within the VM given by @vm, creates a virtual translation for 1601 * @npages starting at @vaddr to the page range starting at @paddr. 1602 */ 1603 void virt_map(struct kvm_vm *vm, uint64_t vaddr, uint64_t paddr, 1604 unsigned int npages) 1605 { 1606 size_t page_size = vm->page_size; 1607 size_t size = npages * page_size; 1608 1609 TEST_ASSERT(vaddr + size > vaddr, "Vaddr overflow"); 1610 TEST_ASSERT(paddr + size > paddr, "Paddr overflow"); 1611 1612 while (npages--) { 1613 virt_pg_map(vm, vaddr, paddr); 1614 sparsebit_set(vm->vpages_mapped, vaddr >> vm->page_shift); 1615 1616 vaddr += page_size; 1617 paddr += page_size; 1618 } 1619 } 1620 1621 /* 1622 * Address VM Physical to Host Virtual 1623 * 1624 * Input Args: 1625 * vm - Virtual Machine 1626 * gpa - VM physical address 1627 * 1628 * Output Args: None 1629 * 1630 * Return: 1631 * Equivalent host virtual address 1632 * 1633 * Locates the memory region containing the VM physical address given 1634 * by gpa, within the VM given by vm. When found, the host virtual 1635 * address providing the memory to the vm physical address is returned. 1636 * A TEST_ASSERT failure occurs if no region containing gpa exists. 
1637 */ 1638 void *addr_gpa2hva(struct kvm_vm *vm, vm_paddr_t gpa) 1639 { 1640 struct userspace_mem_region *region; 1641 1642 gpa = vm_untag_gpa(vm, gpa); 1643 1644 region = userspace_mem_region_find(vm, gpa, gpa); 1645 if (!region) { 1646 TEST_FAIL("No vm physical memory at 0x%lx", gpa); 1647 return NULL; 1648 } 1649 1650 return (void *)((uintptr_t)region->host_mem 1651 + (gpa - region->region.guest_phys_addr)); 1652 } 1653 1654 /* 1655 * Address Host Virtual to VM Physical 1656 * 1657 * Input Args: 1658 * vm - Virtual Machine 1659 * hva - Host virtual address 1660 * 1661 * Output Args: None 1662 * 1663 * Return: 1664 * Equivalent VM physical address 1665 * 1666 * Locates the memory region containing the host virtual address given 1667 * by hva, within the VM given by vm. When found, the equivalent 1668 * VM physical address is returned. A TEST_ASSERT failure occurs if no 1669 * region containing hva exists. 1670 */ 1671 vm_paddr_t addr_hva2gpa(struct kvm_vm *vm, void *hva) 1672 { 1673 struct rb_node *node; 1674 1675 for (node = vm->regions.hva_tree.rb_node; node; ) { 1676 struct userspace_mem_region *region = 1677 container_of(node, struct userspace_mem_region, hva_node); 1678 1679 if (hva >= region->host_mem) { 1680 if (hva <= (region->host_mem 1681 + region->region.memory_size - 1)) 1682 return (vm_paddr_t)((uintptr_t) 1683 region->region.guest_phys_addr 1684 + (hva - (uintptr_t)region->host_mem)); 1685 1686 node = node->rb_right; 1687 } else 1688 node = node->rb_left; 1689 } 1690 1691 TEST_FAIL("No mapping to a guest physical address, hva: %p", hva); 1692 return -1; 1693 } 1694 1695 /* 1696 * Address VM physical to Host Virtual *alias*. 1697 * 1698 * Input Args: 1699 * vm - Virtual Machine 1700 * gpa - VM physical address 1701 * 1702 * Output Args: None 1703 * 1704 * Return: 1705 * Equivalent address within the host virtual *alias* area, or NULL 1706 * (without failing the test) if the guest memory is not shared (so 1707 * no alias exists). 1708 * 1709 * Create a writable, shared virtual=>physical alias for the specific GPA. 1710 * The primary use case is to allow the host selftest to manipulate guest 1711 * memory without mapping said memory in the guest's address space. And, for 1712 * userfaultfd-based demand paging, to do so without triggering userfaults. 1713 */ 1714 void *addr_gpa2alias(struct kvm_vm *vm, vm_paddr_t gpa) 1715 { 1716 struct userspace_mem_region *region; 1717 uintptr_t offset; 1718 1719 region = userspace_mem_region_find(vm, gpa, gpa); 1720 if (!region) 1721 return NULL; 1722 1723 if (!region->host_alias) 1724 return NULL; 1725 1726 offset = gpa - region->region.guest_phys_addr; 1727 return (void *) ((uintptr_t) region->host_alias + offset); 1728 } 1729 1730 /* Create an interrupt controller chip for the specified VM. */ 1731 void vm_create_irqchip(struct kvm_vm *vm) 1732 { 1733 int r; 1734 1735 /* 1736 * Allocate a fully in-kernel IRQ chip by default, but fall back to a 1737 * split model (x86 only) if that fails (KVM x86 allows compiling out 1738 * support for KVM_CREATE_IRQCHIP). 
1739 */ 1740 r = __vm_ioctl(vm, KVM_CREATE_IRQCHIP, NULL); 1741 if (r && errno == ENOTTY && kvm_has_cap(KVM_CAP_SPLIT_IRQCHIP)) 1742 vm_enable_cap(vm, KVM_CAP_SPLIT_IRQCHIP, 24); 1743 else 1744 TEST_ASSERT_VM_VCPU_IOCTL(!r, KVM_CREATE_IRQCHIP, r, vm); 1745 1746 vm->has_irqchip = true; 1747 } 1748 1749 int _vcpu_run(struct kvm_vcpu *vcpu) 1750 { 1751 int rc; 1752 1753 do { 1754 rc = __vcpu_run(vcpu); 1755 } while (rc == -1 && errno == EINTR); 1756 1757 if (!rc) 1758 assert_on_unhandled_exception(vcpu); 1759 1760 return rc; 1761 } 1762 1763 /* 1764 * Invoke KVM_RUN on a vCPU until KVM returns something other than -EINTR. 1765 * Assert if the KVM returns an error (other than -EINTR). 1766 */ 1767 void vcpu_run(struct kvm_vcpu *vcpu) 1768 { 1769 int ret = _vcpu_run(vcpu); 1770 1771 TEST_ASSERT(!ret, KVM_IOCTL_ERROR(KVM_RUN, ret)); 1772 } 1773 1774 void vcpu_run_complete_io(struct kvm_vcpu *vcpu) 1775 { 1776 int ret; 1777 1778 vcpu->run->immediate_exit = 1; 1779 ret = __vcpu_run(vcpu); 1780 vcpu->run->immediate_exit = 0; 1781 1782 TEST_ASSERT(ret == -1 && errno == EINTR, 1783 "KVM_RUN IOCTL didn't exit immediately, rc: %i, errno: %i", 1784 ret, errno); 1785 } 1786 1787 /* 1788 * Get the list of guest registers which are supported for 1789 * KVM_GET_ONE_REG/KVM_SET_ONE_REG ioctls. Returns a kvm_reg_list pointer, 1790 * it is the caller's responsibility to free the list. 1791 */ 1792 struct kvm_reg_list *vcpu_get_reg_list(struct kvm_vcpu *vcpu) 1793 { 1794 struct kvm_reg_list reg_list_n = { .n = 0 }, *reg_list; 1795 int ret; 1796 1797 ret = __vcpu_ioctl(vcpu, KVM_GET_REG_LIST, ®_list_n); 1798 TEST_ASSERT(ret == -1 && errno == E2BIG, "KVM_GET_REG_LIST n=0"); 1799 1800 reg_list = calloc(1, sizeof(*reg_list) + reg_list_n.n * sizeof(__u64)); 1801 reg_list->n = reg_list_n.n; 1802 vcpu_ioctl(vcpu, KVM_GET_REG_LIST, reg_list); 1803 return reg_list; 1804 } 1805 1806 void *vcpu_map_dirty_ring(struct kvm_vcpu *vcpu) 1807 { 1808 uint32_t page_size = getpagesize(); 1809 uint32_t size = vcpu->vm->dirty_ring_size; 1810 1811 TEST_ASSERT(size > 0, "Should enable dirty ring first"); 1812 1813 if (!vcpu->dirty_gfns) { 1814 void *addr; 1815 1816 addr = mmap(NULL, size, PROT_READ, MAP_PRIVATE, vcpu->fd, 1817 page_size * KVM_DIRTY_LOG_PAGE_OFFSET); 1818 TEST_ASSERT(addr == MAP_FAILED, "Dirty ring mapped private"); 1819 1820 addr = mmap(NULL, size, PROT_READ | PROT_EXEC, MAP_PRIVATE, vcpu->fd, 1821 page_size * KVM_DIRTY_LOG_PAGE_OFFSET); 1822 TEST_ASSERT(addr == MAP_FAILED, "Dirty ring mapped exec"); 1823 1824 addr = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED, vcpu->fd, 1825 page_size * KVM_DIRTY_LOG_PAGE_OFFSET); 1826 TEST_ASSERT(addr != MAP_FAILED, "Dirty ring map failed"); 1827 1828 vcpu->dirty_gfns = addr; 1829 vcpu->dirty_gfns_count = size / sizeof(struct kvm_dirty_gfn); 1830 } 1831 1832 return vcpu->dirty_gfns; 1833 } 1834 1835 /* 1836 * Device Ioctl 1837 */ 1838 1839 int __kvm_has_device_attr(int dev_fd, uint32_t group, uint64_t attr) 1840 { 1841 struct kvm_device_attr attribute = { 1842 .group = group, 1843 .attr = attr, 1844 .flags = 0, 1845 }; 1846 1847 return ioctl(dev_fd, KVM_HAS_DEVICE_ATTR, &attribute); 1848 } 1849 1850 int __kvm_test_create_device(struct kvm_vm *vm, uint64_t type) 1851 { 1852 struct kvm_create_device create_dev = { 1853 .type = type, 1854 .flags = KVM_CREATE_DEVICE_TEST, 1855 }; 1856 1857 return __vm_ioctl(vm, KVM_CREATE_DEVICE, &create_dev); 1858 } 1859 1860 int __kvm_create_device(struct kvm_vm *vm, uint64_t type) 1861 { 1862 struct kvm_create_device create_dev = { 1863 .type 
void *vcpu_map_dirty_ring(struct kvm_vcpu *vcpu)
{
	uint32_t page_size = getpagesize();
	uint32_t size = vcpu->vm->dirty_ring_size;

	TEST_ASSERT(size > 0, "Should enable dirty ring first");

	if (!vcpu->dirty_gfns) {
		void *addr;

		/* Verify the ring cannot be mapped private or executable. */
		addr = mmap(NULL, size, PROT_READ, MAP_PRIVATE, vcpu->fd,
			    page_size * KVM_DIRTY_LOG_PAGE_OFFSET);
		TEST_ASSERT(addr == MAP_FAILED, "Dirty ring mapped private");

		addr = mmap(NULL, size, PROT_READ | PROT_EXEC, MAP_PRIVATE, vcpu->fd,
			    page_size * KVM_DIRTY_LOG_PAGE_OFFSET);
		TEST_ASSERT(addr == MAP_FAILED, "Dirty ring mapped exec");

		addr = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED, vcpu->fd,
			    page_size * KVM_DIRTY_LOG_PAGE_OFFSET);
		TEST_ASSERT(addr != MAP_FAILED, "Dirty ring map failed");

		vcpu->dirty_gfns = addr;
		vcpu->dirty_gfns_count = size / sizeof(struct kvm_dirty_gfn);
	}

	return vcpu->dirty_gfns;
}

/*
 * Device Ioctl
 */

int __kvm_has_device_attr(int dev_fd, uint32_t group, uint64_t attr)
{
	struct kvm_device_attr attribute = {
		.group = group,
		.attr = attr,
		.flags = 0,
	};

	return ioctl(dev_fd, KVM_HAS_DEVICE_ATTR, &attribute);
}

int __kvm_test_create_device(struct kvm_vm *vm, uint64_t type)
{
	struct kvm_create_device create_dev = {
		.type = type,
		.flags = KVM_CREATE_DEVICE_TEST,
	};

	return __vm_ioctl(vm, KVM_CREATE_DEVICE, &create_dev);
}

int __kvm_create_device(struct kvm_vm *vm, uint64_t type)
{
	struct kvm_create_device create_dev = {
		.type = type,
		.fd = -1,
		.flags = 0,
	};
	int err;

	err = __vm_ioctl(vm, KVM_CREATE_DEVICE, &create_dev);
	TEST_ASSERT(err <= 0, "KVM_CREATE_DEVICE shouldn't return a positive value");
	return err ? : create_dev.fd;
}

int __kvm_device_attr_get(int dev_fd, uint32_t group, uint64_t attr, void *val)
{
	struct kvm_device_attr kvmattr = {
		.group = group,
		.attr = attr,
		.flags = 0,
		.addr = (uintptr_t)val,
	};

	return __kvm_ioctl(dev_fd, KVM_GET_DEVICE_ATTR, &kvmattr);
}

int __kvm_device_attr_set(int dev_fd, uint32_t group, uint64_t attr, void *val)
{
	struct kvm_device_attr kvmattr = {
		.group = group,
		.attr = attr,
		.flags = 0,
		.addr = (uintptr_t)val,
	};

	return __kvm_ioctl(dev_fd, KVM_SET_DEVICE_ATTR, &kvmattr);
}

/*
 * IRQ related functions.
 */

int _kvm_irq_line(struct kvm_vm *vm, uint32_t irq, int level)
{
	struct kvm_irq_level irq_level = {
		.irq = irq,
		.level = level,
	};

	return __vm_ioctl(vm, KVM_IRQ_LINE, &irq_level);
}

void kvm_irq_line(struct kvm_vm *vm, uint32_t irq, int level)
{
	int ret = _kvm_irq_line(vm, irq, level);

	TEST_ASSERT(ret >= 0, KVM_IOCTL_ERROR(KVM_IRQ_LINE, ret));
}

struct kvm_irq_routing *kvm_gsi_routing_create(void)
{
	struct kvm_irq_routing *routing;
	size_t size;

	size = sizeof(struct kvm_irq_routing);
	/* Allocate space for the max number of entries: this wastes 196 KBs. */
	size += KVM_MAX_IRQ_ROUTES * sizeof(struct kvm_irq_routing_entry);
	routing = calloc(1, size);
	assert(routing);

	return routing;
}

void kvm_gsi_routing_irqchip_add(struct kvm_irq_routing *routing,
				 uint32_t gsi, uint32_t pin)
{
	int i;

	assert(routing);
	assert(routing->nr < KVM_MAX_IRQ_ROUTES);

	i = routing->nr;
	routing->entries[i].gsi = gsi;
	routing->entries[i].type = KVM_IRQ_ROUTING_IRQCHIP;
	routing->entries[i].flags = 0;
	routing->entries[i].u.irqchip.irqchip = 0;
	routing->entries[i].u.irqchip.pin = pin;
	routing->nr++;
}

int _kvm_gsi_routing_write(struct kvm_vm *vm, struct kvm_irq_routing *routing)
{
	int ret;

	assert(routing);
	ret = __vm_ioctl(vm, KVM_SET_GSI_ROUTING, routing);
	/* The routing table is freed here, regardless of the result. */
	free(routing);

	return ret;
}

void kvm_gsi_routing_write(struct kvm_vm *vm, struct kvm_irq_routing *routing)
{
	int ret;

	ret = _kvm_gsi_routing_write(vm, routing);
	TEST_ASSERT(!ret, KVM_IOCTL_ERROR(KVM_SET_GSI_ROUTING, ret));
}
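/*
 * Example usage (illustrative sketch only; the GSI and pin numbers are
 * arbitrary, and kvm_gsi_routing_write() frees the table):
 *
 *	struct kvm_irq_routing *routing = kvm_gsi_routing_create();
 *
 *	kvm_gsi_routing_irqchip_add(routing, 32, 0);
 *	kvm_gsi_routing_irqchip_add(routing, 33, 1);
 *	kvm_gsi_routing_write(vm, routing);
 */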
/*
 * VM Dump
 *
 * Input Args:
 *   vm - Virtual Machine
 *   indent - Left margin indent amount
 *
 * Output Args:
 *   stream - Output FILE stream
 *
 * Return: None
 *
 * Dumps the current state of the VM given by vm, to the FILE stream
 * given by stream.
 */
void vm_dump(FILE *stream, struct kvm_vm *vm, uint8_t indent)
{
	int ctr;
	struct userspace_mem_region *region;
	struct kvm_vcpu *vcpu;

	fprintf(stream, "%*smode: 0x%x\n", indent, "", vm->mode);
	fprintf(stream, "%*sfd: %i\n", indent, "", vm->fd);
	fprintf(stream, "%*spage_size: 0x%x\n", indent, "", vm->page_size);
	fprintf(stream, "%*sMem Regions:\n", indent, "");
	hash_for_each(vm->regions.slot_hash, ctr, region, slot_node) {
		fprintf(stream, "%*sguest_phys: 0x%lx size: 0x%lx "
			"host_virt: %p\n", indent + 2, "",
			(uint64_t) region->region.guest_phys_addr,
			(uint64_t) region->region.memory_size,
			region->host_mem);
		fprintf(stream, "%*sunused_phy_pages: ", indent + 2, "");
		sparsebit_dump(stream, region->unused_phy_pages, 0);
		if (region->protected_phy_pages) {
			fprintf(stream, "%*sprotected_phy_pages: ", indent + 2, "");
			sparsebit_dump(stream, region->protected_phy_pages, 0);
		}
	}
	fprintf(stream, "%*sMapped Virtual Pages:\n", indent, "");
	sparsebit_dump(stream, vm->vpages_mapped, indent + 2);
	fprintf(stream, "%*spgd_created: %u\n", indent, "",
		vm->pgd_created);
	if (vm->pgd_created) {
		fprintf(stream, "%*sVirtual Translation Tables:\n",
			indent + 2, "");
		virt_dump(stream, vm, indent + 4);
	}
	fprintf(stream, "%*sVCPUs:\n", indent, "");

	list_for_each_entry(vcpu, &vm->vcpus, list)
		vcpu_dump(stream, vcpu, indent + 2);
}

#define KVM_EXIT_STRING(x) {KVM_EXIT_##x, #x}

/* Known KVM exit reasons */
static struct exit_reason {
	unsigned int reason;
	const char *name;
} exit_reasons_known[] = {
	KVM_EXIT_STRING(UNKNOWN),
	KVM_EXIT_STRING(EXCEPTION),
	KVM_EXIT_STRING(IO),
	KVM_EXIT_STRING(HYPERCALL),
	KVM_EXIT_STRING(DEBUG),
	KVM_EXIT_STRING(HLT),
	KVM_EXIT_STRING(MMIO),
	KVM_EXIT_STRING(IRQ_WINDOW_OPEN),
	KVM_EXIT_STRING(SHUTDOWN),
	KVM_EXIT_STRING(FAIL_ENTRY),
	KVM_EXIT_STRING(INTR),
	KVM_EXIT_STRING(SET_TPR),
	KVM_EXIT_STRING(TPR_ACCESS),
	KVM_EXIT_STRING(S390_SIEIC),
	KVM_EXIT_STRING(S390_RESET),
	KVM_EXIT_STRING(DCR),
	KVM_EXIT_STRING(NMI),
	KVM_EXIT_STRING(INTERNAL_ERROR),
	KVM_EXIT_STRING(OSI),
	KVM_EXIT_STRING(PAPR_HCALL),
	KVM_EXIT_STRING(S390_UCONTROL),
	KVM_EXIT_STRING(WATCHDOG),
	KVM_EXIT_STRING(S390_TSCH),
	KVM_EXIT_STRING(EPR),
	KVM_EXIT_STRING(SYSTEM_EVENT),
	KVM_EXIT_STRING(S390_STSI),
	KVM_EXIT_STRING(IOAPIC_EOI),
	KVM_EXIT_STRING(HYPERV),
	KVM_EXIT_STRING(ARM_NISV),
	KVM_EXIT_STRING(X86_RDMSR),
	KVM_EXIT_STRING(X86_WRMSR),
	KVM_EXIT_STRING(DIRTY_RING_FULL),
	KVM_EXIT_STRING(AP_RESET_HOLD),
	KVM_EXIT_STRING(X86_BUS_LOCK),
	KVM_EXIT_STRING(XEN),
	KVM_EXIT_STRING(RISCV_SBI),
	KVM_EXIT_STRING(RISCV_CSR),
	KVM_EXIT_STRING(NOTIFY),
	KVM_EXIT_STRING(LOONGARCH_IOCSR),
	KVM_EXIT_STRING(MEMORY_FAULT),
};

/*
 * Exit Reason String
 *
 * Input Args:
 *   exit_reason - Exit reason
 *
 * Output Args: None
 *
 * Return:
 *   Constant string pointer describing the exit reason.
 *
 * Locates and returns a constant string that describes the KVM exit
 * reason given by exit_reason.  If no such string is found, a constant
 * string of "Unknown" is returned.
 */
const char *exit_reason_str(unsigned int exit_reason)
{
	unsigned int n1;

	for (n1 = 0; n1 < ARRAY_SIZE(exit_reasons_known); n1++) {
		if (exit_reason == exit_reasons_known[n1].reason)
			return exit_reasons_known[n1].name;
	}

	return "Unknown";
}
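/*
 * Example usage (illustrative sketch only):
 *
 *	vcpu_run(vcpu);
 *	pr_info("Guest exited with reason: %s\n",
 *		exit_reason_str(vcpu->run->exit_reason));
 */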
/*
 * Physical Contiguous Page Allocator
 *
 * Input Args:
 *   vm - Virtual Machine
 *   num - number of pages
 *   paddr_min - Physical address minimum
 *   memslot - Memory region to allocate page from
 *   protected - True if the pages will be used as protected/private memory
 *
 * Output Args: None
 *
 * Return:
 *   Starting physical address
 *
 * Within the VM specified by vm, locates a range of available physical
 * pages at or above paddr_min.  If found, the pages are marked as in use
 * and their base address is returned.  A TEST_ASSERT failure occurs if
 * not enough pages are available at or above paddr_min.
 */
vm_paddr_t __vm_phy_pages_alloc(struct kvm_vm *vm, size_t num,
				vm_paddr_t paddr_min, uint32_t memslot,
				bool protected)
{
	struct userspace_mem_region *region;
	sparsebit_idx_t pg, base;

	TEST_ASSERT(num > 0, "Must allocate at least one page");

	TEST_ASSERT((paddr_min % vm->page_size) == 0, "Min physical address "
		"not divisible by page size.\n"
		"  paddr_min: 0x%lx page_size: 0x%x",
		paddr_min, vm->page_size);

	region = memslot2region(vm, memslot);
	TEST_ASSERT(!protected || region->protected_phy_pages,
		    "Region doesn't support protected memory");

	/* Search for 'num' contiguous unused pages at or above paddr_min. */
	base = pg = paddr_min >> vm->page_shift;
	do {
		for (; pg < base + num; ++pg) {
			if (!sparsebit_is_set(region->unused_phy_pages, pg)) {
				base = pg = sparsebit_next_set(region->unused_phy_pages, pg);
				break;
			}
		}
	} while (pg && pg != base + num);

	if (pg == 0) {
		fprintf(stderr, "No guest physical page available, "
			"paddr_min: 0x%lx page_size: 0x%x memslot: %u\n",
			paddr_min, vm->page_size, memslot);
		fputs("---- vm dump ----\n", stderr);
		vm_dump(stderr, vm, 2);
		abort();
	}

	for (pg = base; pg < base + num; ++pg) {
		sparsebit_clear(region->unused_phy_pages, pg);
		if (protected)
			sparsebit_set(region->protected_phy_pages, pg);
	}

	return base * vm->page_size;
}

vm_paddr_t vm_phy_page_alloc(struct kvm_vm *vm, vm_paddr_t paddr_min,
			     uint32_t memslot)
{
	return vm_phy_pages_alloc(vm, 1, paddr_min, memslot);
}

vm_paddr_t vm_alloc_page_table(struct kvm_vm *vm)
{
	return vm_phy_page_alloc(vm, KVM_GUEST_PAGE_TABLE_MIN_PADDR,
				 vm->memslots[MEM_REGION_PT]);
}
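/*
 * Example usage (illustrative sketch only; the memslot, minimum address, and
 * page count are arbitrary):
 *
 *	vm_paddr_t gpa = vm_phy_pages_alloc(vm, 4, 0x10000, 0);
 *
 *	memset(addr_gpa2hva(vm, gpa), 0, 4 * vm->page_size);
 */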
/*
 * Address Guest Virtual to Host Virtual
 *
 * Input Args:
 *   vm - Virtual Machine
 *   gva - VM virtual address
 *
 * Output Args: None
 *
 * Return:
 *   Equivalent host virtual address
 */
void *addr_gva2hva(struct kvm_vm *vm, vm_vaddr_t gva)
{
	return addr_gpa2hva(vm, addr_gva2gpa(vm, gva));
}

unsigned long __weak vm_compute_max_gfn(struct kvm_vm *vm)
{
	return ((1ULL << vm->pa_bits) >> vm->page_shift) - 1;
}

/*
 * Convert a page count from one page size (page_shift) to another
 * (new_page_shift), rounding up if 'ceil' is true.
 */
static unsigned int vm_calc_num_pages(unsigned int num_pages,
				      unsigned int page_shift,
				      unsigned int new_page_shift,
				      bool ceil)
{
	unsigned int n = 1 << (new_page_shift - page_shift);

	if (page_shift >= new_page_shift)
		return num_pages * (1 << (page_shift - new_page_shift));

	return num_pages / n + !!(ceil && num_pages % n);
}

static inline int getpageshift(void)
{
	return __builtin_ffs(getpagesize()) - 1;
}

unsigned int
vm_num_host_pages(enum vm_guest_mode mode, unsigned int num_guest_pages)
{
	return vm_calc_num_pages(num_guest_pages,
				 vm_guest_mode_params[mode].page_shift,
				 getpageshift(), true);
}

unsigned int
vm_num_guest_pages(enum vm_guest_mode mode, unsigned int num_host_pages)
{
	return vm_calc_num_pages(num_host_pages, getpageshift(),
				 vm_guest_mode_params[mode].page_shift, false);
}

unsigned int vm_calc_num_guest_pages(enum vm_guest_mode mode, size_t size)
{
	unsigned int n;

	n = DIV_ROUND_UP(size, vm_guest_mode_params[mode].page_size);
	return vm_adjust_num_guest_pages(mode, n);
}

/*
 * Read binary stats descriptors
 *
 * Input Args:
 *   stats_fd - the file descriptor for the binary stats file from which to read
 *   header - the binary stats metadata header corresponding to the given FD
 *
 * Output Args: None
 *
 * Return:
 *   A pointer to a newly allocated series of stat descriptors.
 *   Caller is responsible for freeing the returned kvm_stats_desc.
 *
 * Read the stats descriptors from the binary stats interface.
 */
struct kvm_stats_desc *read_stats_descriptors(int stats_fd,
					      struct kvm_stats_header *header)
{
	struct kvm_stats_desc *stats_desc;
	ssize_t desc_size, total_size, ret;

	desc_size = get_stats_descriptor_size(header);
	total_size = header->num_desc * desc_size;

	stats_desc = calloc(header->num_desc, desc_size);
	TEST_ASSERT(stats_desc, "Allocate memory for stats descriptors");

	ret = pread(stats_fd, stats_desc, total_size, header->desc_offset);
	TEST_ASSERT(ret == total_size, "Read KVM stats descriptors");

	return stats_desc;
}

/*
 * Read stat data for a particular stat
 *
 * Input Args:
 *   stats_fd - the file descriptor for the binary stats file from which to read
 *   header - the binary stats metadata header corresponding to the given FD
 *   desc - the binary stat metadata for the particular stat to be read
 *   max_elements - the maximum number of 8-byte values to read into data
 *
 * Output Args:
 *   data - the buffer into which stat data should be read
 *
 * Read the data values of a specified stat from the binary stats interface.
 */
void read_stat_data(int stats_fd, struct kvm_stats_header *header,
		    struct kvm_stats_desc *desc, uint64_t *data,
		    size_t max_elements)
{
	size_t nr_elements = min_t(ssize_t, desc->size, max_elements);
	size_t size = nr_elements * sizeof(*data);
	ssize_t ret;

	TEST_ASSERT(desc->size, "No elements in stat '%s'", desc->name);
	TEST_ASSERT(max_elements, "Zero elements requested for stat '%s'", desc->name);

	ret = pread(stats_fd, data, size,
		    header->data_offset + desc->offset);

	TEST_ASSERT(ret >= 0, "pread() failed on stat '%s', errno: %i (%s)",
		    desc->name, errno, strerror(errno));
	TEST_ASSERT(ret == size,
		    "pread() on stat '%s' read %ld bytes, wanted %lu bytes",
		    desc->name, ret, size);
}

void kvm_get_stat(struct kvm_binary_stats *stats, const char *name,
		  uint64_t *data, size_t max_elements)
{
	struct kvm_stats_desc *desc;
	size_t size_desc;
	int i;

	/* Read the header and descriptors on first use, then cache them. */
	if (!stats->desc) {
		read_stats_header(stats->fd, &stats->header);
		stats->desc = read_stats_descriptors(stats->fd, &stats->header);
	}

	size_desc = get_stats_descriptor_size(&stats->header);

	for (i = 0; i < stats->header.num_desc; ++i) {
		desc = (void *)stats->desc + (i * size_desc);

		if (strcmp(desc->name, name))
			continue;

		read_stat_data(stats->fd, &stats->header, desc, data, max_elements);
		return;
	}

	TEST_FAIL("Unable to find stat '%s'", name);
}
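/*
 * Example usage (illustrative sketch only; the stat name is hypothetical and
 * depends on the architecture and kernel version):
 *
 *	uint64_t pages;
 *
 *	kvm_get_stat(&vm->stats, "pages_4k", &pages, 1);
 *	pr_info("pages_4k = %lu\n", pages);
 */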
__weak void kvm_arch_vm_post_create(struct kvm_vm *vm)
{
}

__weak void kvm_selftest_arch_init(void)
{
}

void __attribute((constructor)) kvm_selftest_init(void)
{
	/* Tell stdout not to buffer its content. */
	setbuf(stdout, NULL);

	guest_random_seed = last_guest_seed = random();
	pr_info("Random seed: 0x%x\n", guest_random_seed);

	kvm_selftest_arch_init();
}

bool vm_is_gpa_protected(struct kvm_vm *vm, vm_paddr_t paddr)
{
	sparsebit_idx_t pg = 0;
	struct userspace_mem_region *region;

	if (!vm_arch_has_protected_memory(vm))
		return false;

	region = userspace_mem_region_find(vm, paddr, paddr);
	TEST_ASSERT(region, "No vm physical memory at 0x%lx", paddr);

	pg = paddr >> vm->page_shift;
	return sparsebit_is_set(region->protected_phy_pages, pg);
}