// SPDX-License-Identifier: GPL-2.0
/*
 * vgic_lpi_stress - Stress test for KVM's ITS emulation
 *
 * Copyright (c) 2024 Google LLC
 */

#include <linux/sizes.h>
#include <pthread.h>
#include <stdatomic.h>
#include <sys/sysinfo.h>

#include "kvm_util.h"
#include "gic.h"
#include "gic_v3.h"
#include "gic_v3_its.h"
#include "processor.h"
#include "ucall.h"
#include "vgic.h"

#define TEST_MEMSLOT_INDEX	1

#define GIC_LPI_OFFSET		8192

static size_t nr_iterations = 1000;
static vm_paddr_t gpa_base;

static struct kvm_vm *vm;
static struct kvm_vcpu **vcpus;
static int its_fd;

static struct test_data {
	bool		request_vcpus_stop;
	u32		nr_cpus;
	u32		nr_devices;
	u32		nr_event_ids;

	vm_paddr_t	device_table;
	vm_paddr_t	collection_table;
	vm_paddr_t	cmdq_base;
	void		*cmdq_base_va;
	vm_paddr_t	itt_tables;

	vm_paddr_t	lpi_prop_table;
	vm_paddr_t	lpi_pend_tables;
} test_data = {
	.nr_cpus	= 1,
	.nr_devices	= 1,
	.nr_event_ids	= 16,
};

static void guest_irq_handler(struct ex_regs *regs)
{
	u32 intid = gic_get_and_ack_irq();

	if (intid == IAR_SPURIOUS)
		return;

	GUEST_ASSERT(intid >= GIC_LPI_OFFSET);
	gic_set_eoi(intid);
}

static void guest_setup_its_mappings(void)
{
	u32 coll_id, device_id, event_id, intid = GIC_LPI_OFFSET;
	u32 nr_events = test_data.nr_event_ids;
	u32 nr_devices = test_data.nr_devices;
	u32 nr_cpus = test_data.nr_cpus;

	for (coll_id = 0; coll_id < nr_cpus; coll_id++)
		its_send_mapc_cmd(test_data.cmdq_base_va, coll_id, coll_id, true);

	/* Round-robin the LPIs to all of the vCPUs in the VM */
	coll_id = 0;
	for (device_id = 0; device_id < nr_devices; device_id++) {
		vm_paddr_t itt_base = test_data.itt_tables + (device_id * SZ_64K);

		its_send_mapd_cmd(test_data.cmdq_base_va, device_id,
				  itt_base, SZ_64K, true);

		for (event_id = 0; event_id < nr_events; event_id++) {
			its_send_mapti_cmd(test_data.cmdq_base_va, device_id,
					   event_id, coll_id, intid++);

			coll_id = (coll_id + 1) % nr_cpus;
		}
	}
}

static void guest_invalidate_all_rdists(void)
{
	int i;

	for (i = 0; i < test_data.nr_cpus; i++)
		its_send_invall_cmd(test_data.cmdq_base_va, i);
}

static void guest_setup_gic(void)
{
	static atomic_int nr_cpus_ready = 0;
	u32 cpuid = guest_get_vcpuid();

	gic_init(GIC_V3, test_data.nr_cpus);
	gic_rdist_enable_lpis(test_data.lpi_prop_table, SZ_64K,
			      test_data.lpi_pend_tables + (cpuid * SZ_64K));

	atomic_fetch_add(&nr_cpus_ready, 1);

	if (cpuid > 0)
		return;

	/*
	 * vCPU 0 initializes the ITS on behalf of everyone, so wait until
	 * all vCPUs have enabled LPIs at their redistributors.
	 */
	while (atomic_load(&nr_cpus_ready) < test_data.nr_cpus)
		cpu_relax();

	its_init(test_data.collection_table, SZ_64K,
		 test_data.device_table, SZ_64K,
		 test_data.cmdq_base, SZ_64K);

	guest_setup_its_mappings();
	guest_invalidate_all_rdists();
}

static void guest_code(size_t nr_lpis)
{
	guest_setup_gic();
	local_irq_enable();

	GUEST_SYNC(0);

	/*
	 * Don't use WFI here to avoid blocking the vCPU thread indefinitely
	 * and never getting the stop signal.
	 */
	while (!READ_ONCE(test_data.request_vcpus_stop))
		cpu_relax();

	GUEST_DONE();
}
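/*
 * Host-side setup. Everything the guest's ITS and redistributors need
 * (device table, collection table, command queue, per-device ITTs, LPI
 * configuration table, per-vCPU pending tables) is carved out of a single
 * dedicated memslot placed at the very top of guest physical memory.
 */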
static void setup_memslot(void)
{
	size_t pages;
	size_t sz;

	/*
	 * For the ITS:
	 * - A single level device table
	 * - A single level collection table
	 * - The command queue
	 * - An ITT for each device
	 */
	sz = (3 + test_data.nr_devices) * SZ_64K;

	/*
	 * For the redistributors:
	 * - A shared LPI configuration table
	 * - An LPI pending table for each vCPU
	 */
	sz += (1 + test_data.nr_cpus) * SZ_64K;

	pages = sz / vm->page_size;
	gpa_base = ((vm_compute_max_gfn(vm) + 1) * vm->page_size) - sz;
	vm_userspace_mem_region_add(vm, VM_MEM_SRC_ANONYMOUS, gpa_base,
				    TEST_MEMSLOT_INDEX, pages, 0);
}

#define LPI_PROP_DEFAULT_PRIO	0xa0

static void configure_lpis(void)
{
	size_t nr_lpis = test_data.nr_devices * test_data.nr_event_ids;
	u8 *tbl = addr_gpa2hva(vm, test_data.lpi_prop_table);
	size_t i;

	for (i = 0; i < nr_lpis; i++) {
		tbl[i] = LPI_PROP_DEFAULT_PRIO |
			 LPI_PROP_GROUP1 |
			 LPI_PROP_ENABLED;
	}
}

static void setup_test_data(void)
{
	size_t pages_per_64k = vm_calc_num_guest_pages(vm->mode, SZ_64K);
	u32 nr_devices = test_data.nr_devices;
	u32 nr_cpus = test_data.nr_cpus;
	vm_paddr_t cmdq_base;

	test_data.device_table = vm_phy_pages_alloc(vm, pages_per_64k,
						    gpa_base,
						    TEST_MEMSLOT_INDEX);

	test_data.collection_table = vm_phy_pages_alloc(vm, pages_per_64k,
							gpa_base,
							TEST_MEMSLOT_INDEX);

	/* Identity-map the command queue so the guest can use the GPA as a VA */
	cmdq_base = vm_phy_pages_alloc(vm, pages_per_64k, gpa_base,
				       TEST_MEMSLOT_INDEX);
	virt_map(vm, cmdq_base, cmdq_base, pages_per_64k);
	test_data.cmdq_base = cmdq_base;
	test_data.cmdq_base_va = (void *)cmdq_base;

	test_data.itt_tables = vm_phy_pages_alloc(vm, pages_per_64k * nr_devices,
						  gpa_base, TEST_MEMSLOT_INDEX);

	test_data.lpi_prop_table = vm_phy_pages_alloc(vm, pages_per_64k,
						      gpa_base, TEST_MEMSLOT_INDEX);
	configure_lpis();

	test_data.lpi_pend_tables = vm_phy_pages_alloc(vm, pages_per_64k * nr_cpus,
						       gpa_base, TEST_MEMSLOT_INDEX);

	sync_global_to_guest(vm, test_data);
}

static void setup_gic(void)
{
	its_fd = vgic_its_setup(vm);
}

static void signal_lpi(u32 device_id, u32 event_id)
{
	vm_paddr_t db_addr = GITS_BASE_GPA + GITS_TRANSLATER;

	struct kvm_msi msi = {
		.address_lo	= db_addr,
		.address_hi	= db_addr >> 32,
		.data		= event_id,
		.devid		= device_id,
		.flags		= KVM_MSI_VALID_DEVID,
	};

	/*
	 * KVM_SIGNAL_MSI returns 1 if the MSI wasn't 'blocked' by the VM,
	 * which for arm64 implies having a valid translation in the ITS.
	 */
	TEST_ASSERT(__vm_ioctl(vm, KVM_SIGNAL_MSI, &msi) == 1,
		    "KVM_SIGNAL_MSI ioctl failed");
}
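/*
 * Each "device" is modeled as a host thread that rings the ITS doorbell
 * with KVM_SIGNAL_MSI rather than a memory-mapped write to GITS_TRANSLATER.
 * The vITS translates the (device_id, event_id) pair through the tables the
 * guest programmed above into an LPI on the redistributor of whichever
 * collection the event was mapped to.
 */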
static pthread_barrier_t test_setup_barrier;

static void *lpi_worker_thread(void *data)
{
	u32 device_id = (size_t)data;
	u32 event_id;
	size_t i;

	pthread_barrier_wait(&test_setup_barrier);

	for (i = 0; i < nr_iterations; i++)
		for (event_id = 0; event_id < test_data.nr_event_ids; event_id++)
			signal_lpi(device_id, event_id);

	return NULL;
}

static void *vcpu_worker_thread(void *data)
{
	struct kvm_vcpu *vcpu = data;
	struct ucall uc;

	while (true) {
		vcpu_run(vcpu);

		switch (get_ucall(vcpu, &uc)) {
		case UCALL_SYNC:
			pthread_barrier_wait(&test_setup_barrier);
			continue;
		case UCALL_DONE:
			return NULL;
		case UCALL_ABORT:
			REPORT_GUEST_ASSERT(uc);
			break;
		default:
			TEST_FAIL("Unknown ucall: %lu", uc.cmd);
		}
	}

	return NULL;
}

static void report_stats(struct timespec delta)
{
	double nr_lpis;
	double time;

	nr_lpis = test_data.nr_devices * test_data.nr_event_ids * nr_iterations;

	time = delta.tv_sec;
	time += ((double)delta.tv_nsec) / NSEC_PER_SEC;

	pr_info("Rate: %.2f LPIs/sec\n", nr_lpis / time);
}

static void run_test(void)
{
	u32 nr_devices = test_data.nr_devices;
	u32 nr_vcpus = test_data.nr_cpus;
	pthread_t *lpi_threads = malloc(nr_devices * sizeof(pthread_t));
	pthread_t *vcpu_threads = malloc(nr_vcpus * sizeof(pthread_t));
	struct timespec start, delta;
	size_t i;

	TEST_ASSERT(lpi_threads && vcpu_threads, "Failed to allocate pthread arrays");

	/* One barrier party per vCPU thread, per device thread, plus main */
	pthread_barrier_init(&test_setup_barrier, NULL, nr_vcpus + nr_devices + 1);

	for (i = 0; i < nr_vcpus; i++)
		pthread_create(&vcpu_threads[i], NULL, vcpu_worker_thread, vcpus[i]);

	for (i = 0; i < nr_devices; i++)
		pthread_create(&lpi_threads[i], NULL, lpi_worker_thread, (void *)i);

	pthread_barrier_wait(&test_setup_barrier);

	clock_gettime(CLOCK_MONOTONIC, &start);

	for (i = 0; i < nr_devices; i++)
		pthread_join(lpi_threads[i], NULL);

	delta = timespec_elapsed(start);
	write_guest_global(vm, test_data.request_vcpus_stop, true);

	for (i = 0; i < nr_vcpus; i++)
		pthread_join(vcpu_threads[i], NULL);

	report_stats(delta);
}

static void setup_vm(void)
{
	int i;

	vcpus = malloc(test_data.nr_cpus * sizeof(struct kvm_vcpu *));
	TEST_ASSERT(vcpus, "Failed to allocate vCPU array");

	vm = vm_create_with_vcpus(test_data.nr_cpus, guest_code, vcpus);

	vm_init_descriptor_tables(vm);
	for (i = 0; i < test_data.nr_cpus; i++)
		vcpu_init_descriptor_tables(vcpus[i]);

	vm_install_exception_handler(vm, VECTOR_IRQ_CURRENT, guest_irq_handler);

	setup_memslot();

	setup_gic();

	setup_test_data();
}

static void destroy_vm(void)
{
	close(its_fd);
	kvm_vm_free(vm);
	free(vcpus);
}

static void pr_usage(const char *name)
{
	pr_info("%s [-v NR_VCPUS] [-d NR_DEVICES] [-e NR_EVENTS] [-i ITERS] [-h]\n", name);
	pr_info("  -v:\tnumber of vCPUs (default: %u)\n", test_data.nr_cpus);
	pr_info("  -d:\tnumber of devices (default: %u)\n", test_data.nr_devices);
	pr_info("  -e:\tnumber of event IDs per device (default: %u)\n", test_data.nr_event_ids);
	pr_info("  -i:\tnumber of iterations (default: %lu)\n", nr_iterations);
}
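/*
 * Example invocation (the numbers are illustrative, not defaults):
 *
 *   ./vgic_lpi_stress -v 4 -d 16 -e 32 -i 1000
 *
 * maps 16 devices with 32 event IDs each, round-robined across 4 vCPUs,
 * and injects every (device, event) pair 1000 times, i.e. 512000 LPIs.
 */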
int main(int argc, char **argv)
{
	u32 nr_threads;
	int c;

	TEST_REQUIRE(kvm_supports_vgic_v3());

	while ((c = getopt(argc, argv, "hv:d:e:i:")) != -1) {
		switch (c) {
		case 'v':
			test_data.nr_cpus = atoi(optarg);
			break;
		case 'd':
			test_data.nr_devices = atoi(optarg);
			break;
		case 'e':
			test_data.nr_event_ids = atoi(optarg);
			break;
		case 'i':
			nr_iterations = strtoul(optarg, NULL, 0);
			break;
		case 'h':
		default:
			pr_usage(argv[0]);
			return 1;
		}
	}

	nr_threads = test_data.nr_cpus + test_data.nr_devices;
	if (nr_threads > get_nprocs())
		pr_info("WARNING: running %u threads on %d CPUs; performance is degraded.\n",
			nr_threads, get_nprocs());

	setup_vm();

	run_test();

	destroy_vm();

	return 0;
}