1 // SPDX-License-Identifier: GPL-2.0 2 /* 3 * vgic_lpi_stress - Stress test for KVM's ITS emulation 4 * 5 * Copyright (c) 2024 Google LLC 6 */ 7 8 #include <linux/sizes.h> 9 #include <pthread.h> 10 #include <stdatomic.h> 11 #include <sys/sysinfo.h> 12 13 #include "kvm_util.h" 14 #include "gic.h" 15 #include "gic_v3.h" 16 #include "gic_v3_its.h" 17 #include "processor.h" 18 #include "ucall.h" 19 #include "vgic.h" 20 21 #define TEST_MEMSLOT_INDEX 1 22 23 #define GIC_LPI_OFFSET 8192 24 25 static size_t nr_iterations = 1000; 26 static vm_paddr_t gpa_base; 27 28 static struct kvm_vm *vm; 29 static struct kvm_vcpu **vcpus; 30 static int its_fd; 31 32 static struct test_data { 33 bool request_vcpus_stop; 34 u32 nr_cpus; 35 u32 nr_devices; 36 u32 nr_event_ids; 37 38 vm_paddr_t device_table; 39 vm_paddr_t collection_table; 40 vm_paddr_t cmdq_base; 41 void *cmdq_base_va; 42 vm_paddr_t itt_tables; 43 44 vm_paddr_t lpi_prop_table; 45 vm_paddr_t lpi_pend_tables; 46 } test_data = { 47 .nr_cpus = 1, 48 .nr_devices = 1, 49 .nr_event_ids = 16, 50 }; 51 52 static void guest_irq_handler(struct ex_regs *regs) 53 { 54 u32 intid = gic_get_and_ack_irq(); 55 56 if (intid == IAR_SPURIOUS) 57 return; 58 59 GUEST_ASSERT(intid >= GIC_LPI_OFFSET); 60 gic_set_eoi(intid); 61 } 62 63 static void guest_setup_its_mappings(void) 64 { 65 u32 coll_id, device_id, event_id, intid = GIC_LPI_OFFSET; 66 u32 nr_events = test_data.nr_event_ids; 67 u32 nr_devices = test_data.nr_devices; 68 u32 nr_cpus = test_data.nr_cpus; 69 70 for (coll_id = 0; coll_id < nr_cpus; coll_id++) 71 its_send_mapc_cmd(test_data.cmdq_base_va, coll_id, coll_id, true); 72 73 /* Round-robin the LPIs to all of the vCPUs in the VM */ 74 coll_id = 0; 75 for (device_id = 0; device_id < nr_devices; device_id++) { 76 vm_paddr_t itt_base = test_data.itt_tables + (device_id * SZ_64K); 77 78 its_send_mapd_cmd(test_data.cmdq_base_va, device_id, 79 itt_base, SZ_64K, true); 80 81 for (event_id = 0; event_id < nr_events; event_id++) { 82 its_send_mapti_cmd(test_data.cmdq_base_va, device_id, 83 event_id, coll_id, intid++); 84 85 coll_id = (coll_id + 1) % test_data.nr_cpus; 86 } 87 } 88 } 89 90 static void guest_invalidate_all_rdists(void) 91 { 92 int i; 93 94 for (i = 0; i < test_data.nr_cpus; i++) 95 its_send_invall_cmd(test_data.cmdq_base_va, i); 96 } 97 98 static void guest_setup_gic(void) 99 { 100 static atomic_int nr_cpus_ready = 0; 101 u32 cpuid = guest_get_vcpuid(); 102 103 gic_init(GIC_V3, test_data.nr_cpus); 104 gic_rdist_enable_lpis(test_data.lpi_prop_table, SZ_64K, 105 test_data.lpi_pend_tables + (cpuid * SZ_64K)); 106 107 atomic_fetch_add(&nr_cpus_ready, 1); 108 109 if (cpuid > 0) 110 return; 111 112 while (atomic_load(&nr_cpus_ready) < test_data.nr_cpus) 113 cpu_relax(); 114 115 its_init(test_data.collection_table, SZ_64K, 116 test_data.device_table, SZ_64K, 117 test_data.cmdq_base, SZ_64K); 118 119 guest_setup_its_mappings(); 120 guest_invalidate_all_rdists(); 121 122 /* SYNC to ensure ITS setup is complete */ 123 for (cpuid = 0; cpuid < test_data.nr_cpus; cpuid++) 124 its_send_sync_cmd(test_data.cmdq_base_va, cpuid); 125 } 126 127 static void guest_code(size_t nr_lpis) 128 { 129 guest_setup_gic(); 130 local_irq_enable(); 131 132 GUEST_SYNC(0); 133 134 /* 135 * Don't use WFI here to avoid blocking the vCPU thread indefinitely and 136 * never getting the stop signal. 137 */ 138 while (!READ_ONCE(test_data.request_vcpus_stop)) 139 cpu_relax(); 140 141 GUEST_DONE(); 142 } 143 144 static void setup_memslot(void) 145 { 146 size_t pages; 147 size_t sz; 148 149 /* 150 * For the ITS: 151 * - A single level device table 152 * - A single level collection table 153 * - The command queue 154 * - An ITT for each device 155 */ 156 sz = (3 + test_data.nr_devices) * SZ_64K; 157 158 /* 159 * For the redistributors: 160 * - A shared LPI configuration table 161 * - An LPI pending table for each vCPU 162 */ 163 sz += (1 + test_data.nr_cpus) * SZ_64K; 164 165 pages = sz / vm->page_size; 166 gpa_base = ((vm_compute_max_gfn(vm) + 1) * vm->page_size) - sz; 167 vm_userspace_mem_region_add(vm, VM_MEM_SRC_ANONYMOUS, gpa_base, 168 TEST_MEMSLOT_INDEX, pages, 0); 169 } 170 171 #define LPI_PROP_DEFAULT_PRIO 0xa0 172 173 static void configure_lpis(void) 174 { 175 size_t nr_lpis = test_data.nr_devices * test_data.nr_event_ids; 176 u8 *tbl = addr_gpa2hva(vm, test_data.lpi_prop_table); 177 size_t i; 178 179 for (i = 0; i < nr_lpis; i++) { 180 tbl[i] = LPI_PROP_DEFAULT_PRIO | 181 LPI_PROP_GROUP1 | 182 LPI_PROP_ENABLED; 183 } 184 } 185 186 static void setup_test_data(void) 187 { 188 size_t pages_per_64k = vm_calc_num_guest_pages(vm->mode, SZ_64K); 189 u32 nr_devices = test_data.nr_devices; 190 u32 nr_cpus = test_data.nr_cpus; 191 vm_paddr_t cmdq_base; 192 193 test_data.device_table = vm_phy_pages_alloc(vm, pages_per_64k, 194 gpa_base, 195 TEST_MEMSLOT_INDEX); 196 197 test_data.collection_table = vm_phy_pages_alloc(vm, pages_per_64k, 198 gpa_base, 199 TEST_MEMSLOT_INDEX); 200 201 cmdq_base = vm_phy_pages_alloc(vm, pages_per_64k, gpa_base, 202 TEST_MEMSLOT_INDEX); 203 virt_map(vm, cmdq_base, cmdq_base, pages_per_64k); 204 test_data.cmdq_base = cmdq_base; 205 test_data.cmdq_base_va = (void *)cmdq_base; 206 207 test_data.itt_tables = vm_phy_pages_alloc(vm, pages_per_64k * nr_devices, 208 gpa_base, TEST_MEMSLOT_INDEX); 209 210 test_data.lpi_prop_table = vm_phy_pages_alloc(vm, pages_per_64k, 211 gpa_base, TEST_MEMSLOT_INDEX); 212 configure_lpis(); 213 214 test_data.lpi_pend_tables = vm_phy_pages_alloc(vm, pages_per_64k * nr_cpus, 215 gpa_base, TEST_MEMSLOT_INDEX); 216 217 sync_global_to_guest(vm, test_data); 218 } 219 220 static void setup_gic(void) 221 { 222 its_fd = vgic_its_setup(vm); 223 } 224 225 static void signal_lpi(u32 device_id, u32 event_id) 226 { 227 vm_paddr_t db_addr = GITS_BASE_GPA + GITS_TRANSLATER; 228 229 struct kvm_msi msi = { 230 .address_lo = db_addr, 231 .address_hi = db_addr >> 32, 232 .data = event_id, 233 .devid = device_id, 234 .flags = KVM_MSI_VALID_DEVID, 235 }; 236 237 /* 238 * KVM_SIGNAL_MSI returns 1 if the MSI wasn't 'blocked' by the VM, 239 * which for arm64 implies having a valid translation in the ITS. 240 */ 241 TEST_ASSERT(__vm_ioctl(vm, KVM_SIGNAL_MSI, &msi) == 1, 242 "KVM_SIGNAL_MSI ioctl failed"); 243 } 244 245 static pthread_barrier_t test_setup_barrier; 246 247 static void *lpi_worker_thread(void *data) 248 { 249 u32 device_id = (size_t)data; 250 u32 event_id; 251 size_t i; 252 253 pthread_barrier_wait(&test_setup_barrier); 254 255 for (i = 0; i < nr_iterations; i++) 256 for (event_id = 0; event_id < test_data.nr_event_ids; event_id++) 257 signal_lpi(device_id, event_id); 258 259 return NULL; 260 } 261 262 static void *vcpu_worker_thread(void *data) 263 { 264 struct kvm_vcpu *vcpu = data; 265 struct ucall uc; 266 267 while (true) { 268 vcpu_run(vcpu); 269 270 switch (get_ucall(vcpu, &uc)) { 271 case UCALL_SYNC: 272 pthread_barrier_wait(&test_setup_barrier); 273 continue; 274 case UCALL_DONE: 275 return NULL; 276 case UCALL_ABORT: 277 REPORT_GUEST_ASSERT(uc); 278 break; 279 default: 280 TEST_FAIL("Unknown ucall: %lu", uc.cmd); 281 } 282 } 283 284 return NULL; 285 } 286 287 static void report_stats(struct timespec delta) 288 { 289 double nr_lpis; 290 double time; 291 292 nr_lpis = test_data.nr_devices * test_data.nr_event_ids * nr_iterations; 293 294 time = delta.tv_sec; 295 time += ((double)delta.tv_nsec) / NSEC_PER_SEC; 296 297 pr_info("Rate: %.2f LPIs/sec\n", nr_lpis / time); 298 } 299 300 static void run_test(void) 301 { 302 u32 nr_devices = test_data.nr_devices; 303 u32 nr_vcpus = test_data.nr_cpus; 304 pthread_t *lpi_threads = malloc(nr_devices * sizeof(pthread_t)); 305 pthread_t *vcpu_threads = malloc(nr_vcpus * sizeof(pthread_t)); 306 struct timespec start, delta; 307 size_t i; 308 309 TEST_ASSERT(lpi_threads && vcpu_threads, "Failed to allocate pthread arrays"); 310 311 pthread_barrier_init(&test_setup_barrier, NULL, nr_vcpus + nr_devices + 1); 312 313 for (i = 0; i < nr_vcpus; i++) 314 pthread_create(&vcpu_threads[i], NULL, vcpu_worker_thread, vcpus[i]); 315 316 for (i = 0; i < nr_devices; i++) 317 pthread_create(&lpi_threads[i], NULL, lpi_worker_thread, (void *)i); 318 319 pthread_barrier_wait(&test_setup_barrier); 320 321 clock_gettime(CLOCK_MONOTONIC, &start); 322 323 for (i = 0; i < nr_devices; i++) 324 pthread_join(lpi_threads[i], NULL); 325 326 delta = timespec_elapsed(start); 327 write_guest_global(vm, test_data.request_vcpus_stop, true); 328 329 for (i = 0; i < nr_vcpus; i++) 330 pthread_join(vcpu_threads[i], NULL); 331 332 report_stats(delta); 333 } 334 335 static void setup_vm(void) 336 { 337 int i; 338 339 vcpus = malloc(test_data.nr_cpus * sizeof(struct kvm_vcpu *)); 340 TEST_ASSERT(vcpus, "Failed to allocate vCPU array"); 341 342 vm = vm_create_with_vcpus(test_data.nr_cpus, guest_code, vcpus); 343 344 vm_init_descriptor_tables(vm); 345 for (i = 0; i < test_data.nr_cpus; i++) 346 vcpu_init_descriptor_tables(vcpus[i]); 347 348 vm_install_exception_handler(vm, VECTOR_IRQ_CURRENT, guest_irq_handler); 349 350 setup_memslot(); 351 352 setup_gic(); 353 354 setup_test_data(); 355 } 356 357 static void destroy_vm(void) 358 { 359 close(its_fd); 360 kvm_vm_free(vm); 361 free(vcpus); 362 } 363 364 static void pr_usage(const char *name) 365 { 366 pr_info("%s [-v NR_VCPUS] [-d NR_DEVICES] [-e NR_EVENTS] [-i ITERS] -h\n", name); 367 pr_info(" -v:\tnumber of vCPUs (default: %u)\n", test_data.nr_cpus); 368 pr_info(" -d:\tnumber of devices (default: %u)\n", test_data.nr_devices); 369 pr_info(" -e:\tnumber of event IDs per device (default: %u)\n", test_data.nr_event_ids); 370 pr_info(" -i:\tnumber of iterations (default: %lu)\n", nr_iterations); 371 } 372 373 int main(int argc, char **argv) 374 { 375 u32 nr_threads; 376 int c; 377 378 TEST_REQUIRE(kvm_supports_vgic_v3()); 379 380 while ((c = getopt(argc, argv, "hv:d:e:i:")) != -1) { 381 switch (c) { 382 case 'v': 383 test_data.nr_cpus = atoi(optarg); 384 break; 385 case 'd': 386 test_data.nr_devices = atoi(optarg); 387 break; 388 case 'e': 389 test_data.nr_event_ids = atoi(optarg); 390 break; 391 case 'i': 392 nr_iterations = strtoul(optarg, NULL, 0); 393 break; 394 case 'h': 395 default: 396 pr_usage(argv[0]); 397 return 1; 398 } 399 } 400 401 nr_threads = test_data.nr_cpus + test_data.nr_devices; 402 if (nr_threads > get_nprocs()) 403 pr_info("WARNING: running %u threads on %d CPUs; performance is degraded.\n", 404 nr_threads, get_nprocs()); 405 406 setup_vm(); 407 408 run_test(); 409 410 destroy_vm(); 411 412 return 0; 413 } 414