// SPDX-License-Identifier: GPL-2.0
/*
 * access_tracking_perf_test
 *
 * Copyright (C) 2021, Google, Inc.
 *
 * This test measures the performance effects of KVM's access tracking.
 * Access tracking is driven by the MMU notifiers test_young, clear_young, and
 * clear_flush_young. These notifiers do not have a direct userspace API,
 * however the clear_young notifier can be triggered by marking pages as idle
 * in /sys/kernel/mm/page_idle/bitmap. This test leverages that mechanism to
 * enable access tracking on guest memory.
 *
 * To measure performance this test runs a VM with a configurable number of
 * vCPUs that each touch every page in disjoint regions of memory. Performance
 * is measured in the time it takes all vCPUs to finish touching their
 * predefined region.
 *
 * Note that a deterministic correctness test of access tracking is not
 * possible using page_idle as it exists today. This is for a few reasons:
 *
 * 1. page_idle only issues clear_young notifiers, which lack a TLB flush. This
 *    means subsequent guest accesses are not guaranteed to see page table
 *    updates made by KVM until some time in the future.
 *
 * 2. page_idle only operates on LRU pages. Newly allocated pages are not
 *    immediately allocated to LRU lists. Instead they are held in a "pagevec",
 *    which is drained to LRU lists some time in the future. There is no
 *    userspace API to force this drain to occur.
 *
 * These limitations are worked around in this test by using a large enough
 * region of memory for each vCPU such that the number of translations cached
 * in the TLB and the number of pages held in pagevecs are a small fraction of
 * the overall workload. If either of those conditions is not met (for example
 * when running nested, where the TLB size is effectively unlimited), this test
 * will print a warning rather than silently passing.
 */
#include <inttypes.h>
#include <limits.h>
#include <pthread.h>
#include <sys/mman.h>
#include <sys/types.h>
#include <sys/stat.h>

#include "kvm_util.h"
#include "test_util.h"
#include "memstress.h"
#include "guest_modes.h"
#include "processor.h"

/* Global variable used to synchronize all of the vCPU threads. */
static int iteration;

/* Defines what vCPU threads should do during a given iteration. */
static enum {
	/* Run the vCPU to access all its memory. */
	ITERATION_ACCESS_MEMORY,
	/* Mark the vCPU's memory idle in page_idle. */
	ITERATION_MARK_IDLE,
} iteration_work;

/* Set to true when vCPU threads should exit. */
static bool done;

/* The iteration that was last completed by each vCPU. */
static int vcpu_last_completed_iteration[KVM_MAX_VCPUS];

/* Whether to overlap the regions of memory vCPUs access. */
static bool overlap_memory_access;

struct test_params {
	/* The backing source for the region of memory. */
	enum vm_mem_backing_src_type backing_src;

	/* The amount of memory to allocate for each vCPU. */
	uint64_t vcpu_memory_bytes;

	/* The number of vCPUs to create in the VM. */
	int nr_vcpus;
};

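/*
 * Read a single 64-bit entry from a file. Both /proc/self/pagemap and the
 * page_idle bitmap are arrays of 64-bit words, so this helper serves both.
 */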
static uint64_t pread_uint64(int fd, const char *filename, uint64_t index)
{
	uint64_t value;
	off_t offset = index * sizeof(value);

	TEST_ASSERT(pread(fd, &value, sizeof(value), offset) == sizeof(value),
		    "pread from %s offset 0x%" PRIx64 " failed!",
		    filename, offset);

	return value;
}

/* Bit 63 of a pagemap entry: the page is present in RAM. */
#define PAGEMAP_PRESENT (1ULL << 63)
/* Bits 0-54 of a pagemap entry: the page frame number (PFN). */
#define PAGEMAP_PFN_MASK ((1ULL << 55) - 1)

/*
 * Translate a guest virtual address to its backing host PFN via
 * /proc/self/pagemap. Returns 0 if the page is not present.
 */
static uint64_t lookup_pfn(int pagemap_fd, struct kvm_vm *vm, uint64_t gva)
{
	uint64_t hva = (uint64_t) addr_gva2hva(vm, gva);
	uint64_t entry;
	uint64_t pfn;

	entry = pread_uint64(pagemap_fd, "pagemap", hva / getpagesize());
	if (!(entry & PAGEMAP_PRESENT))
		return 0;

	pfn = entry & PAGEMAP_PFN_MASK;
	__TEST_REQUIRE(pfn, "Looking up PFNs requires CAP_SYS_ADMIN");

	return pfn;
}

/* Return whether the page_idle bitmap still reports the PFN as idle. */
static bool is_page_idle(int page_idle_fd, uint64_t pfn)
{
	uint64_t bits = pread_uint64(page_idle_fd, "page_idle", pfn / 64);

	return !!((bits >> (pfn % 64)) & 1);
}

/* Set the PFN's bit in the page_idle bitmap, marking the page idle. */
static void mark_page_idle(int page_idle_fd, uint64_t pfn)
{
	uint64_t bits = 1ULL << (pfn % 64);

	TEST_ASSERT(pwrite(page_idle_fd, &bits, 8, 8 * (pfn / 64)) == 8,
		    "Set page_idle bits for PFN 0x%" PRIx64, pfn);
}

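/*
 * Mark every page in the vCPU's memory region idle via page_idle. Pages that
 * already read back as idle were not seen as accessed by access tracking
 * since the previous marking pass; they are tallied so the checks at the end
 * of this function can warn (or assert) if too many accesses went unnoticed.
 */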
static void mark_vcpu_memory_idle(struct kvm_vm *vm,
				  struct memstress_vcpu_args *vcpu_args)
{
	int vcpu_idx = vcpu_args->vcpu_idx;
	uint64_t base_gva = vcpu_args->gva;
	uint64_t pages = vcpu_args->pages;
	uint64_t page;
	uint64_t still_idle = 0;
	uint64_t no_pfn = 0;
	int page_idle_fd;
	int pagemap_fd;

	/* If vCPUs are using an overlapping region, let vCPU 0 mark it idle. */
	if (overlap_memory_access && vcpu_idx)
		return;

	page_idle_fd = open("/sys/kernel/mm/page_idle/bitmap", O_RDWR);
	TEST_ASSERT(page_idle_fd > 0, "Failed to open page_idle.");

	pagemap_fd = open("/proc/self/pagemap", O_RDONLY);
	TEST_ASSERT(pagemap_fd > 0, "Failed to open pagemap.");

	for (page = 0; page < pages; page++) {
		uint64_t gva = base_gva + page * memstress_args.guest_page_size;
		uint64_t pfn = lookup_pfn(pagemap_fd, vm, gva);

		if (!pfn) {
			no_pfn++;
			continue;
		}

		if (is_page_idle(page_idle_fd, pfn)) {
			still_idle++;
			continue;
		}

		mark_page_idle(page_idle_fd, pfn);
	}

	/*
	 * Assumption: Less than 1% of pages are going to be swapped out from
	 * under us during this test.
	 */
	TEST_ASSERT(no_pfn < pages / 100,
		    "vCPU %d: No PFN for %" PRIu64 " out of %" PRIu64 " pages.",
		    vcpu_idx, no_pfn, pages);

	/*
	 * Check that at least 90% of memory has been marked idle (the rest
	 * might not be marked idle because the pages have not yet made it to an
	 * LRU list or the translations are still cached in the TLB). 90% is
	 * arbitrary; high enough that we ensure most memory access went through
	 * access tracking but low enough as to not make the test too brittle
	 * over time and across architectures.
	 *
	 * When running the guest as a nested VM, "warn" instead of asserting
	 * as the TLB size is effectively unlimited and KVM doesn't explicitly
	 * flush the TLB when aging SPTEs. As a result, more pages are cached
	 * and the guest won't see the "idle" bit cleared.
	 */
	if (still_idle >= pages / 10) {
#ifdef __x86_64__
		TEST_ASSERT(this_cpu_has(X86_FEATURE_HYPERVISOR),
			    "vCPU%d: Too many pages still idle (%lu out of %lu)",
			    vcpu_idx, still_idle, pages);
#endif
		printf("WARNING: vCPU%d: Too many pages still idle (%lu out of %lu), "
		       "this will affect performance results.\n",
		       vcpu_idx, still_idle, pages);
	}

	close(page_idle_fd);
	close(pagemap_fd);
}

/* Assert that the vCPU exited to userspace with the expected ucall. */
static void assert_ucall(struct kvm_vcpu *vcpu, uint64_t expected_ucall)
{
	struct ucall uc;
	uint64_t actual_ucall = get_ucall(vcpu, &uc);

	TEST_ASSERT(expected_ucall == actual_ucall,
		    "Guest exited unexpectedly (expected ucall %" PRIu64
		    ", got %" PRIu64 ")",
		    expected_ucall, actual_ucall);
}

/* Busy-wait until the main thread bumps the iteration counter or sets done. */
static bool spin_wait_for_next_iteration(int *current_iteration)
{
	int last_iteration = *current_iteration;

	do {
		if (READ_ONCE(done))
			return false;

		*current_iteration = READ_ONCE(iteration);
	} while (last_iteration == *current_iteration);

	return true;
}

static void vcpu_thread_main(struct memstress_vcpu_args *vcpu_args)
{
	struct kvm_vcpu *vcpu = vcpu_args->vcpu;
	struct kvm_vm *vm = memstress_args.vm;
	int vcpu_idx = vcpu_args->vcpu_idx;
	int current_iteration = 0;

	while (spin_wait_for_next_iteration(&current_iteration)) {
		switch (READ_ONCE(iteration_work)) {
		case ITERATION_ACCESS_MEMORY:
			vcpu_run(vcpu);
			assert_ucall(vcpu, UCALL_SYNC);
			break;
		case ITERATION_MARK_IDLE:
			mark_vcpu_memory_idle(vm, vcpu_args);
			break;
		}

		vcpu_last_completed_iteration[vcpu_idx] = current_iteration;
	}
}

static void spin_wait_for_vcpu(int vcpu_idx, int target_iteration)
{
	while (READ_ONCE(vcpu_last_completed_iteration[vcpu_idx]) !=
	       target_iteration) {
		continue;
	}
}

/* The type of memory accesses to perform in the VM. */
enum access_type {
	ACCESS_READ,
	ACCESS_WRITE,
};

/* Kick off one iteration on all vCPUs and report how long it took to finish. */
static void run_iteration(struct kvm_vm *vm, int nr_vcpus, const char *description)
{
	struct timespec ts_start;
	struct timespec ts_elapsed;
	int next_iteration, i;

	/* Kick off the vCPUs by incrementing iteration. */
	next_iteration = ++iteration;

	clock_gettime(CLOCK_MONOTONIC, &ts_start);

	/* Wait for all vCPUs to finish the iteration. */
	for (i = 0; i < nr_vcpus; i++)
		spin_wait_for_vcpu(i, next_iteration);

	ts_elapsed = timespec_elapsed(ts_start);
	pr_info("%-30s: %ld.%09lds\n",
		description, ts_elapsed.tv_sec, ts_elapsed.tv_nsec);
}

static void access_memory(struct kvm_vm *vm, int nr_vcpus,
			  enum access_type access, const char *description)
{
	memstress_set_write_percent(vm, (access == ACCESS_READ) ? 0 : 100);
	iteration_work = ITERATION_ACCESS_MEMORY;
	run_iteration(vm, nr_vcpus, description);
}

static void mark_memory_idle(struct kvm_vm *vm, int nr_vcpus)
{
	/*
	 * Even though this parallelizes the work across vCPUs, this is still a
	 * very slow operation because page_idle forces the test to mark one pfn
	 * at a time and the clear_young notifier serializes on the KVM MMU
	 * lock.
	 */
	pr_debug("Marking VM memory idle (slow)...\n");
	iteration_work = ITERATION_MARK_IDLE;
	run_iteration(vm, nr_vcpus, "Mark memory idle");
}

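/*
 * Run the test for one guest mode: create the VM and vCPU threads, time
 * accesses to populated memory as a control, then time the same accesses
 * after the memory has been marked idle.
 */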
static void run_test(enum vm_guest_mode mode, void *arg)
{
	struct test_params *params = arg;
	struct kvm_vm *vm;
	int nr_vcpus = params->nr_vcpus;

	vm = memstress_create_vm(mode, nr_vcpus, params->vcpu_memory_bytes, 1,
				 params->backing_src, !overlap_memory_access);

	memstress_start_vcpu_threads(nr_vcpus, vcpu_thread_main);

	pr_info("\n");
	access_memory(vm, nr_vcpus, ACCESS_WRITE, "Populating memory");

	/* As a control, read and write to the populated memory first. */
	access_memory(vm, nr_vcpus, ACCESS_WRITE, "Writing to populated memory");
	access_memory(vm, nr_vcpus, ACCESS_READ, "Reading from populated memory");

	/* Repeat on memory that has been marked as idle. */
	mark_memory_idle(vm, nr_vcpus);
	access_memory(vm, nr_vcpus, ACCESS_WRITE, "Writing to idle memory");
	mark_memory_idle(vm, nr_vcpus);
	access_memory(vm, nr_vcpus, ACCESS_READ, "Reading from idle memory");

	/* Set done to signal the vCPU threads to exit. */
	done = true;

	memstress_join_vcpu_threads(nr_vcpus);
	memstress_destroy_vm(vm);
}

static void help(char *name)
{
	puts("");
	printf("usage: %s [-h] [-m mode] [-b vcpu_bytes] [-v vcpus] [-o] [-s mem_type]\n",
	       name);
	puts("");
	printf(" -h: Display this help message.\n");
	guest_modes_help();
	printf(" -b: specify the size of the memory region which should be\n"
	       "     dirtied by each vCPU. e.g. 10M or 3G.\n"
	       "     (default: 1G)\n");
	printf(" -v: specify the number of vCPUs to run.\n");
	printf(" -o: Overlap guest memory accesses instead of partitioning\n"
	       "     them into a separate region of memory for each vCPU.\n");
	backing_src_help("-s");
	puts("");
	exit(0);
}

int main(int argc, char *argv[])
{
	struct test_params params = {
		.backing_src = DEFAULT_VM_MEM_SRC,
		.vcpu_memory_bytes = DEFAULT_PER_VCPU_MEM_SIZE,
		.nr_vcpus = 1,
	};
	int page_idle_fd;
	int opt;

	guest_modes_append_default();

	while ((opt = getopt(argc, argv, "hm:b:v:os:")) != -1) {
		switch (opt) {
		case 'm':
			guest_modes_cmdline(optarg);
			break;
		case 'b':
			params.vcpu_memory_bytes = parse_size(optarg);
			break;
		case 'v':
			params.nr_vcpus = atoi_positive("Number of vCPUs", optarg);
			break;
		case 'o':
			overlap_memory_access = true;
			break;
		case 's':
			params.backing_src = parse_backing_src_type(optarg);
			break;
		case 'h':
		default:
			help(argv[0]);
			break;
		}
	}

	page_idle_fd = open("/sys/kernel/mm/page_idle/bitmap", O_RDWR);
	__TEST_REQUIRE(page_idle_fd >= 0,
		       "CONFIG_IDLE_PAGE_TRACKING is not enabled");
	close(page_idle_fd);

	for_each_guest_mode(run_test, &params);

	return 0;
}