xref: /linux/tools/testing/selftests/kvm/mmu_stress_test.c (revision df2e3152f1cb798ed8ffa7e488c50261e6dc50e3)
// SPDX-License-Identifier: GPL-2.0
#include <stdio.h>
#include <stdlib.h>
#include <pthread.h>
#include <semaphore.h>
#include <sys/types.h>
#include <signal.h>
#include <errno.h>
#include <linux/bitmap.h>
#include <linux/bitops.h>
#include <linux/atomic.h>
#include <linux/sizes.h>

#include "kvm_util.h"
#include "test_util.h"
#include "guest_modes.h"
#include "processor.h"
#include "ucall_common.h"

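/*
 * Set by the boss thread once mprotect(PROT_READ) on the test memory has
 * completed; guests poll this flag to know when it's safe to stop writing.
 */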
static bool mprotect_ro_done;

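/*
 * Guest workload: walk the assigned GPA range and sync with the host after
 * each stage.  Stages 0 and 1 write every page, stage 2 reads every page,
 * stage 3 writes while the host revokes write access (and so is expected to
 * fault), and stage 4 writes again once write access has been restored.
 */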
static void guest_code(uint64_t start_gpa, uint64_t end_gpa, uint64_t stride)
{
	uint64_t gpa;
	int i;

	for (i = 0; i < 2; i++) {
		for (gpa = start_gpa; gpa < end_gpa; gpa += stride)
			vcpu_arch_put_guest(*((volatile uint64_t *)gpa), gpa);
		GUEST_SYNC(i);
	}

	for (gpa = start_gpa; gpa < end_gpa; gpa += stride)
		*((volatile uint64_t *)gpa);
	GUEST_SYNC(2);

	/*
	 * Write to the region while mprotect(PROT_READ) is underway.  Keep
	 * looping until the memory is guaranteed to be read-only, otherwise
	 * vCPUs may complete their writes and advance to the next stage
	 * prematurely.
	 *
	 * For architectures that support skipping the faulting instruction,
	 * generate the store via inline assembly to ensure the exact length
	 * of the instruction is known and stable (vcpu_arch_put_guest() on
	 * fixed-length architectures should work, but the cost of paranoia
	 * is low in this case).  For x86, hand-code the exact opcode so that
	 * there is no room for variability in the generated instruction.
	 */
	do {
		for (gpa = start_gpa; gpa < end_gpa; gpa += stride)
#ifdef __x86_64__
			asm volatile(".byte 0x48,0x89,0x00" :: "a"(gpa) : "memory"); /* mov %rax, (%rax) */
#elif defined(__aarch64__)
			asm volatile("str %0, [%0]" :: "r" (gpa) : "memory");
#else
			vcpu_arch_put_guest(*((volatile uint64_t *)gpa), gpa);
#endif
	} while (!READ_ONCE(mprotect_ro_done));

	/*
	 * Only architectures that write the entire range can explicitly sync,
	 * as other architectures will be stuck on the write fault.
	 */
#if defined(__x86_64__) || defined(__aarch64__)
	GUEST_SYNC(3);
#endif

	for (gpa = start_gpa; gpa < end_gpa; gpa += stride)
		vcpu_arch_put_guest(*((volatile uint64_t *)gpa), gpa);
	GUEST_SYNC(4);

	GUEST_ASSERT(0);
}

struct vcpu_info {
	struct kvm_vcpu *vcpu;
	uint64_t start_gpa;
	uint64_t end_gpa;
};

static int nr_vcpus;
static atomic_t rendezvous;

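/*
 * Rendezvous protocol: the boss primes the counter to nr_vcpus + 1 (positive
 * or negative, alternating each round), each vCPU moves the count one step
 * toward +/-1, and then spins until the boss flips the sign to release
 * everyone into the next stage.
 */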
static void rendezvous_with_boss(void)
{
	int orig = atomic_read(&rendezvous);

	if (orig > 0) {
		atomic_dec_and_test(&rendezvous);
		while (atomic_read(&rendezvous) > 0)
			cpu_relax();
	} else {
		atomic_inc(&rendezvous);
		while (atomic_read(&rendezvous) < 0)
			cpu_relax();
	}
}

static void assert_sync_stage(struct kvm_vcpu *vcpu, int stage)
{
	struct ucall uc;

	TEST_ASSERT_EQ(get_ucall(vcpu, &uc), UCALL_SYNC);
	TEST_ASSERT_EQ(uc.args[1], stage);
}

static void run_vcpu(struct kvm_vcpu *vcpu, int stage)
{
	vcpu_run(vcpu);
	assert_sync_stage(vcpu, stage);
}

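/*
 * Per-vCPU worker: drives a single vCPU through each stage of the test,
 * coordinating with the boss thread via the rendezvous counter.
 */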
static void *vcpu_worker(void *data)
{
	struct kvm_sregs __maybe_unused sregs;
	struct vcpu_info *info = data;
	struct kvm_vcpu *vcpu = info->vcpu;
	struct kvm_vm *vm = vcpu->vm;
	int r;

	vcpu_args_set(vcpu, 3, info->start_gpa, info->end_gpa, vm->page_size);

	rendezvous_with_boss();

	/* Stage 0, write all of guest memory. */
	run_vcpu(vcpu, 0);
	rendezvous_with_boss();
#ifdef __x86_64__
	vcpu_sregs_get(vcpu, &sregs);
	/* Toggle CR0.WP to trigger a MMU context reset. */
	sregs.cr0 ^= X86_CR0_WP;
	vcpu_sregs_set(vcpu, &sregs);
#endif
	rendezvous_with_boss();

	/* Stage 1, re-write all of guest memory. */
	run_vcpu(vcpu, 1);
	rendezvous_with_boss();

	/* Stage 2, read all of guest memory, which is now read-only. */
	run_vcpu(vcpu, 2);

	/*
	 * Stage 3, write guest memory and verify KVM returns -EFAULT once the
	 * mprotect(PROT_READ) lands.  Only architectures that can skip the
	 * faulting instruction validate writes to *all* of guest memory and
	 * sync for this stage; vCPUs on other architectures remain stuck on
	 * the first write fault.  Go to stage 3 without a rendezvous.
	 */
	do {
		r = _vcpu_run(vcpu);
	} while (!r);
	TEST_ASSERT(r == -1 && errno == EFAULT,
		    "Expected EFAULT on write to RO memory, got r = %d, errno = %d", r, errno);

#if defined(__x86_64__) || defined(__aarch64__)
	/*
	 * Verify *all* writes from the guest hit EFAULT due to the VMA now
	 * being read-only.  Only x86 and arm64 do this at present, as skipping
	 * the instruction that hits the EFAULT requires advancing the program
	 * counter, which is arch-specific and relies on inline assembly.
	 */
#ifdef __x86_64__
	vcpu->run->kvm_valid_regs = KVM_SYNC_X86_REGS;
#endif
	for (;;) {
		r = _vcpu_run(vcpu);
		if (!r)
			break;
		TEST_ASSERT_EQ(errno, EFAULT);
#if defined(__x86_64__)
		WRITE_ONCE(vcpu->run->kvm_dirty_regs, KVM_SYNC_X86_REGS);
		vcpu->run->s.regs.regs.rip += 3;
#elif defined(__aarch64__)
		vcpu_set_reg(vcpu, ARM64_CORE_REG(regs.pc),
			     vcpu_get_reg(vcpu, ARM64_CORE_REG(regs.pc)) + 4);
#endif
	}
	assert_sync_stage(vcpu, 3);
#endif /* __x86_64__ || __aarch64__ */
	rendezvous_with_boss();

	/*
	 * Stage 4.  Run to completion, waiting for mprotect(PROT_WRITE) to
	 * make the memory writable again.
	 */
	do {
		r = _vcpu_run(vcpu);
	} while (r && errno == EFAULT);
	TEST_ASSERT_EQ(r, 0);
	assert_sync_stage(vcpu, 4);
	rendezvous_with_boss();

	return NULL;
}

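/*
 * Carve the [start_gpa, end_gpa) range into equal, page-aligned chunks and
 * spawn one worker thread per vCPU to drive its chunk.
 */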
static pthread_t *spawn_workers(struct kvm_vm *vm, struct kvm_vcpu **vcpus,
				uint64_t start_gpa, uint64_t end_gpa)
{
	struct vcpu_info *info;
	uint64_t gpa, nr_bytes;
	pthread_t *threads;
	int i;

	threads = malloc(nr_vcpus * sizeof(*threads));
	TEST_ASSERT(threads, "Failed to allocate vCPU threads");

	info = malloc(nr_vcpus * sizeof(*info));
	TEST_ASSERT(info, "Failed to allocate vCPU gpa ranges");

	nr_bytes = ((end_gpa - start_gpa) / nr_vcpus) &
			~((uint64_t)vm->page_size - 1);
	TEST_ASSERT(nr_bytes, "C'mon, no way you have %d CPUs", nr_vcpus);

	for (i = 0, gpa = start_gpa; i < nr_vcpus; i++, gpa += nr_bytes) {
		info[i].vcpu = vcpus[i];
		info[i].start_gpa = gpa;
		info[i].end_gpa = gpa + nr_bytes;
		pthread_create(&threads[i], NULL, vcpu_worker, &info[i]);
	}
	return threads;
}

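/*
 * Boss-side rendezvous: wait for every vCPU to arrive (|count| drops to 1),
 * record the completion time of the just-finished action, then flip the
 * counter's sign to release the vCPUs into the next stage.
 */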
static void rendezvous_with_vcpus(struct timespec *time, const char *name)
{
	int i, rendezvoused;

	pr_info("Waiting for vCPUs to finish %s...\n", name);

	rendezvoused = atomic_read(&rendezvous);
	for (i = 0; abs(rendezvoused) != 1; i++) {
		usleep(100);
		if (!(i & 0x3f))
			pr_info("\r%d vCPUs haven't rendezvoused...",
				abs(rendezvoused) - 1);
		rendezvoused = atomic_read(&rendezvous);
	}

	clock_gettime(CLOCK_MONOTONIC, time);

	/* Release the vCPUs after getting the time of the previous action. */
	pr_info("\rAll vCPUs finished %s, releasing...\n", name);
	if (rendezvoused > 0)
		atomic_set(&rendezvous, -nr_vcpus - 1);
	else
		atomic_set(&rendezvous, nr_vcpus + 1);
}

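/* Default to using 3/4 of the CPUs this task is allowed to run on. */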
static void calc_default_nr_vcpus(void)
{
	cpu_set_t possible_mask;
	int r;

	r = sched_getaffinity(0, sizeof(possible_mask), &possible_mask);
	TEST_ASSERT(!r, "sched_getaffinity failed, errno = %d (%s)",
		    errno, strerror(errno));

	nr_vcpus = CPU_COUNT(&possible_mask) * 3/4;
	TEST_ASSERT(nr_vcpus > 0, "Uh, no CPUs?");
}

int main(int argc, char *argv[])
{
	/*
	 * Skip the first 4gb and slot0.  slot0 maps <1gb and is used to back
	 * the guest's code, stack, and page tables.  Because selftests creates
	 * an IRQCHIP, a.k.a. a local APIC, KVM creates an internal memslot
	 * just below the 4gb boundary.  This test could create memory at
	 * 1gb-3gb, but it's simpler to skip straight to 4gb.
	 */
	const uint64_t start_gpa = SZ_4G;
	const int first_slot = 1;

	struct timespec time_start, time_run1, time_reset, time_run2, time_ro, time_rw;
	uint64_t max_gpa, gpa, slot_size, max_mem, i;
	int max_slots, slot, opt, fd;
	bool hugepages = false;
	struct kvm_vcpu **vcpus;
	pthread_t *threads;
	struct kvm_vm *vm;
	void *mem;

	/*
	 * Default to 2gb so that maxing out systems with MAXPHYADDR=46, which
	 * are quite common for x86, requires changing only max_mem (KVM allows
	 * 32k memslots, 32k * 2gb == ~64tb of guest memory).
	 */
	slot_size = SZ_2G;

	max_slots = kvm_check_cap(KVM_CAP_NR_MEMSLOTS);
	TEST_ASSERT(max_slots > first_slot, "KVM is broken");

	/* All KVM MMUs should be able to survive a 128gb guest. */
	max_mem = 128ull * SZ_1G;

	calc_default_nr_vcpus();

	while ((opt = getopt(argc, argv, "c:h:m:s:H")) != -1) {
		switch (opt) {
		case 'c':
			nr_vcpus = atoi_positive("Number of vCPUs", optarg);
			break;
		case 'm':
			max_mem = 1ull * atoi_positive("Memory size", optarg) * SZ_1G;
			break;
		case 's':
			slot_size = 1ull * atoi_positive("Slot size", optarg) * SZ_1G;
			break;
		case 'H':
			hugepages = true;
			break;
		case 'h':
		default:
			printf("usage: %s [-c nr_vcpus] [-m max_mem_in_gb] [-s slot_size_in_gb] [-H]\n", argv[0]);
			exit(1);
		}
	}

	vcpus = malloc(nr_vcpus * sizeof(*vcpus));
	TEST_ASSERT(vcpus, "Failed to allocate vCPU array");

	vm = __vm_create_with_vcpus(VM_SHAPE_DEFAULT, nr_vcpus,
#ifdef __x86_64__
				    max_mem / SZ_1G,
#else
				    max_mem / vm_guest_mode_params[VM_MODE_DEFAULT].page_size,
#endif
				    guest_code, vcpus);

	max_gpa = vm->max_gfn << vm->page_shift;
	TEST_ASSERT(max_gpa > (4 * slot_size), "MAXPHYADDR <4gb ");

	fd = kvm_memfd_alloc(slot_size, hugepages);
	mem = mmap(NULL, slot_size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
	TEST_ASSERT(mem != MAP_FAILED, "mmap() failed");

	TEST_ASSERT(!madvise(mem, slot_size, MADV_NOHUGEPAGE), "madvise() failed");

	/* Pre-fault the memory to avoid taking mmap_sem on guest page faults. */
	for (i = 0; i < slot_size; i += vm->page_size)
		((uint8_t *)mem)[i] = 0xaa;

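	/*
	 * Back every slot with the same host mapping, i.e. alias all of guest
	 * memory to a single slot_size chunk of host memory; the guest can
	 * thus access far more memory than the host actually allocates.
	 */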
	gpa = 0;
	for (slot = first_slot; slot < max_slots; slot++) {
		gpa = start_gpa + ((slot - first_slot) * slot_size);
		if (gpa + slot_size > max_gpa)
			break;

		if ((gpa - start_gpa) >= max_mem)
			break;

		vm_set_user_memory_region(vm, slot, 0, gpa, slot_size, mem);

#ifdef __x86_64__
		/* Identity map memory in the guest using 1gb pages. */
		for (i = 0; i < slot_size; i += SZ_1G)
			__virt_pg_map(vm, gpa + i, gpa + i, PG_LEVEL_1G);
#else
		for (i = 0; i < slot_size; i += vm->page_size)
			virt_pg_map(vm, gpa + i, gpa + i);
#endif
	}

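	/* Prime the rendezvous counter for all vCPUs plus the boss thread. */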
	atomic_set(&rendezvous, nr_vcpus + 1);
	threads = spawn_workers(vm, vcpus, start_gpa, gpa);

	free(vcpus);
	vcpus = NULL;

	pr_info("Running with %lugb of guest memory and %u vCPUs\n",
		(gpa - start_gpa) / SZ_1G, nr_vcpus);

	rendezvous_with_vcpus(&time_start, "spawning");
	rendezvous_with_vcpus(&time_run1, "run 1");
	rendezvous_with_vcpus(&time_reset, "reset");
	rendezvous_with_vcpus(&time_run2, "run 2");

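	/*
	 * Revoke write access and only then tell the guests the memory is
	 * guaranteed to be read-only, so that vCPUs can't finish their write
	 * loop and advance to the next stage while the memory is still
	 * writable.
	 */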
	mprotect(mem, slot_size, PROT_READ);
	usleep(10);
	mprotect_ro_done = true;
	sync_global_to_guest(vm, mprotect_ro_done);

	rendezvous_with_vcpus(&time_ro, "mprotect RO");
	mprotect(mem, slot_size, PROT_READ | PROT_WRITE);
	rendezvous_with_vcpus(&time_rw, "mprotect RW");

	time_rw    = timespec_sub(time_rw,     time_ro);
	time_ro    = timespec_sub(time_ro,     time_run2);
	time_run2  = timespec_sub(time_run2,   time_reset);
	time_reset = timespec_sub(time_reset,  time_run1);
	time_run1  = timespec_sub(time_run1,   time_start);

	pr_info("run1 = %ld.%.9lds, reset = %ld.%.9lds, run2 = %ld.%.9lds, "
		"ro = %ld.%.9lds, rw = %ld.%.9lds\n",
		time_run1.tv_sec, time_run1.tv_nsec,
		time_reset.tv_sec, time_reset.tv_nsec,
		time_run2.tv_sec, time_run2.tv_nsec,
		time_ro.tv_sec, time_ro.tv_nsec,
		time_rw.tv_sec, time_rw.tv_nsec);

	/*
	 * Delete even numbered slots (arbitrary) and unmap the first half of
	 * the backing (also arbitrary) to verify KVM correctly drops all
	 * references to the removed regions.
	 */
	for (slot = (slot - 1) & ~1ull; slot >= first_slot; slot -= 2)
		vm_set_user_memory_region(vm, slot, 0, 0, 0, NULL);

	munmap(mem, slot_size / 2);

	/* Sanity check that the vCPUs actually ran. */
	for (i = 0; i < nr_vcpus; i++)
		pthread_join(threads[i], NULL);

	/*
	 * Deliberately exit without deleting the remaining memslots or closing
	 * kvm_fd to test cleanup via mmu_notifier.release.
	 */
}