xref: /linux/drivers/gpu/drm/xe/tests/xe_bo.c (revision 2c1ed907520c50326b8f604907a8478b27881a2e)
1 // SPDX-License-Identifier: GPL-2.0 AND MIT
2 /*
3  * Copyright © 2022 Intel Corporation
4  */
5 
6 #include <kunit/test.h>
7 #include <kunit/visibility.h>
8 
9 #include <linux/iosys-map.h>
10 #include <linux/math64.h>
11 #include <linux/prandom.h>
12 #include <linux/swap.h>
13 
14 #include <uapi/linux/sysinfo.h>
15 
16 #include "tests/xe_kunit_helpers.h"
17 #include "tests/xe_pci_test.h"
18 #include "tests/xe_test.h"
19 
20 #include "xe_bo_evict.h"
21 #include "xe_pci.h"
22 #include "xe_pm.h"
23 
/*
 * Run one CCS migration round on @bo and inspect the CCS data that ends up
 * in the bo's system-memory backing store.
 *
 * @tile:       tile whose migrate context is used for the optional clear
 * @bo:         buffer object under test; the caller holds its lock
 * @clear:      when true, clear the bo (XE_MIGRATE_CLEAR_FLAG_FULL) before
 *              evicting it
 * @get_val:    value expected in the first and last checked CCS qword
 * @assign_val: value written to those two qwords for the next round
 * @test:       KUnit context used for failure reporting
 *
 * Returns 0 on success, or a negative error code when a step fails or the
 * CCS contents do not match @get_val. Every failure is also reported through
 * KUNIT_FAIL().
 */
static int ccs_test_migrate(struct xe_tile *tile, struct xe_bo *bo,
			    bool clear, u64 get_val, u64 assign_val,
			    struct kunit *test)
{
	struct dma_fence *clear_fence;
	struct ttm_tt *tt;
	pgoff_t first_ccs_page;
	long remaining;
	u64 *ccs_map;
	u32 last;
	int err;

	/* Bring the bo (back) into VRAM if it is not already there. */
	err = xe_bo_validate(bo, NULL, false);
	if (err) {
		KUNIT_FAIL(test, "Failed to validate bo.\n");
		return err;
	}

	/* On request, wipe both the bo and its CCS data while in VRAM. */
	if (clear) {
		clear_fence = xe_migrate_clear(tile->migrate, bo,
					       bo->ttm.resource,
					       XE_MIGRATE_CLEAR_FLAG_FULL);
		if (IS_ERR(clear_fence)) {
			KUNIT_FAIL(test, "Failed to submit bo clear.\n");
			return PTR_ERR(clear_fence);
		}

		/* The fence reference is dropped whatever the wait result. */
		remaining = dma_fence_wait_timeout(clear_fence, false, 5 * HZ);
		dma_fence_put(clear_fence);
		if (remaining <= 0) {
			KUNIT_FAIL(test, "Timeout while clearing bo.\n");
			return -ETIME;
		}
	}

	/* Push the bo out to system memory; the CCS data should follow. */
	err = xe_bo_evict(bo, true);
	if (err) {
		KUNIT_FAIL(test, "Failed to evict bo.\n");
		return err;
	}

	/* Wait for every migration blit queued on the bo to complete. */
	remaining = dma_resv_wait_timeout(bo->ttm.base.resv,
					  DMA_RESV_USAGE_KERNEL, true, 5 * HZ);
	if (remaining <= 0) {
		KUNIT_FAIL(test, "Failed to sync bo eviction.\n");
		return -ETIME;
	}

	/*
	 * The bo and its CCS data should now live in system pages. Check
	 * that the backing store exists, compare the data, and prime the
	 * values for the next round while the page is still mapped.
	 */
	tt = bo->ttm.ttm;
	if (!(tt && ttm_tt_is_populated(tt))) {
		KUNIT_FAIL(test, "Bo was not in expected placement.\n");
		return -EINVAL;
	}

	first_ccs_page = xe_bo_ccs_pages_start(bo) >> PAGE_SHIFT;
	if (first_ccs_page >= tt->num_pages) {
		KUNIT_FAIL(test, "No TTM CCS pages present.\n");
		return -EINVAL;
	}

	ccs_map = kmap_local_page(tt->pages[first_ccs_page]);

	/* First CCS qword. */
	if (ccs_map[0] != get_val) {
		KUNIT_FAIL(test,
			   "Expected CCS readout 0x%016llx, got 0x%016llx.\n",
			   (unsigned long long)get_val,
			   (unsigned long long)ccs_map[0]);
		err = -EINVAL;
	}

	/* Last CCS qword, capped to the last qword of the mapped page. */
	last = xe_device_ccs_bytes(tile_to_xe(tile), bo->size);
	last = min_t(u32, last, PAGE_SIZE) / sizeof(u64) - 1;
	if (ccs_map[last] != get_val) {
		KUNIT_FAIL(test,
			   "Expected CCS readout 0x%016llx, got 0x%016llx.\n",
			   (unsigned long long)get_val,
			   (unsigned long long)ccs_map[last]);
		err = -EINVAL;
	}

	ccs_map[0] = assign_val;
	ccs_map[last] = assign_val;
	kunmap_local(ccs_map);

	return err;
}
125 
ccs_test_run_tile(struct xe_device * xe,struct xe_tile * tile,struct kunit * test)126 static void ccs_test_run_tile(struct xe_device *xe, struct xe_tile *tile,
127 			      struct kunit *test)
128 {
129 	struct xe_bo *bo;
130 
131 	int ret;
132 
133 	/* TODO: Sanity check */
134 	unsigned int bo_flags = XE_BO_FLAG_VRAM_IF_DGFX(tile);
135 
136 	if (IS_DGFX(xe))
137 		kunit_info(test, "Testing vram id %u\n", tile->id);
138 	else
139 		kunit_info(test, "Testing system memory\n");
140 
141 	bo = xe_bo_create_user(xe, NULL, NULL, SZ_1M, DRM_XE_GEM_CPU_CACHING_WC,
142 			       bo_flags);
143 	if (IS_ERR(bo)) {
144 		KUNIT_FAIL(test, "Failed to create bo.\n");
145 		return;
146 	}
147 
148 	xe_bo_lock(bo, false);
149 
150 	kunit_info(test, "Verifying that CCS data is cleared on creation.\n");
151 	ret = ccs_test_migrate(tile, bo, false, 0ULL, 0xdeadbeefdeadbeefULL,
152 			       test);
153 	if (ret)
154 		goto out_unlock;
155 
156 	kunit_info(test, "Verifying that CCS data survives migration.\n");
157 	ret = ccs_test_migrate(tile, bo, false, 0xdeadbeefdeadbeefULL,
158 			       0xdeadbeefdeadbeefULL, test);
159 	if (ret)
160 		goto out_unlock;
161 
162 	kunit_info(test, "Verifying that CCS data can be properly cleared.\n");
163 	ret = ccs_test_migrate(tile, bo, true, 0ULL, 0ULL, test);
164 
165 out_unlock:
166 	xe_bo_unlock(bo);
167 	xe_bo_put(bo);
168 }
169 
ccs_test_run_device(struct xe_device * xe)170 static int ccs_test_run_device(struct xe_device *xe)
171 {
172 	struct kunit *test = kunit_get_current_test();
173 	struct xe_tile *tile;
174 	int id;
175 
176 	if (!xe_device_has_flat_ccs(xe)) {
177 		kunit_skip(test, "non-flat-ccs device\n");
178 		return 0;
179 	}
180 
181 	/* For xe2+ dgfx, we don't handle ccs metadata */
182 	if (GRAPHICS_VER(xe) >= 20 && IS_DGFX(xe)) {
183 		kunit_skip(test, "xe2+ dgfx device\n");
184 		return 0;
185 	}
186 
187 	xe_pm_runtime_get(xe);
188 
189 	for_each_tile(tile, xe, id) {
190 		/* For igfx run only for primary tile */
191 		if (!IS_DGFX(xe) && id > 0)
192 			continue;
193 		ccs_test_run_tile(xe, tile, test);
194 	}
195 
196 	xe_pm_runtime_put(xe);
197 
198 	return 0;
199 }
200 
xe_ccs_migrate_kunit(struct kunit * test)201 static void xe_ccs_migrate_kunit(struct kunit *test)
202 {
203 	struct xe_device *xe = test->priv;
204 
205 	ccs_test_run_device(xe);
206 }
207 
evict_test_run_tile(struct xe_device * xe,struct xe_tile * tile,struct kunit * test)208 static int evict_test_run_tile(struct xe_device *xe, struct xe_tile *tile, struct kunit *test)
209 {
210 	struct xe_bo *bo, *external;
211 	unsigned int bo_flags = XE_BO_FLAG_VRAM_IF_DGFX(tile);
212 	struct xe_vm *vm = xe_migrate_get_vm(xe_device_get_root_tile(xe)->migrate);
213 	struct xe_gt *__gt;
214 	int err, i, id;
215 
216 	kunit_info(test, "Testing device %s vram id %u\n",
217 		   dev_name(xe->drm.dev), tile->id);
218 
219 	for (i = 0; i < 2; ++i) {
220 		xe_vm_lock(vm, false);
221 		bo = xe_bo_create_user(xe, NULL, vm, 0x10000,
222 				       DRM_XE_GEM_CPU_CACHING_WC,
223 				       bo_flags);
224 		xe_vm_unlock(vm);
225 		if (IS_ERR(bo)) {
226 			KUNIT_FAIL(test, "bo create err=%pe\n", bo);
227 			break;
228 		}
229 
230 		external = xe_bo_create_user(xe, NULL, NULL, 0x10000,
231 					     DRM_XE_GEM_CPU_CACHING_WC,
232 					     bo_flags);
233 		if (IS_ERR(external)) {
234 			KUNIT_FAIL(test, "external bo create err=%pe\n", external);
235 			goto cleanup_bo;
236 		}
237 
238 		xe_bo_lock(external, false);
239 		err = xe_bo_pin_external(external);
240 		xe_bo_unlock(external);
241 		if (err) {
242 			KUNIT_FAIL(test, "external bo pin err=%pe\n",
243 				   ERR_PTR(err));
244 			goto cleanup_external;
245 		}
246 
247 		err = xe_bo_evict_all(xe);
248 		if (err) {
249 			KUNIT_FAIL(test, "evict err=%pe\n", ERR_PTR(err));
250 			goto cleanup_all;
251 		}
252 
253 		for_each_gt(__gt, xe, id)
254 			xe_gt_sanitize(__gt);
255 		err = xe_bo_restore_kernel(xe);
256 		/*
257 		 * Snapshotting the CTB and copying back a potentially old
258 		 * version seems risky, depending on what might have been
259 		 * inflight. Also it seems snapshotting the ADS object and
260 		 * copying back results in serious breakage. Normally when
261 		 * calling xe_bo_restore_kernel() we always fully restart the
262 		 * GT, which re-intializes such things.  We could potentially
263 		 * skip saving and restoring such objects in xe_bo_evict_all()
264 		 * however seems quite fragile not to also restart the GT. Try
265 		 * to do that here by triggering a GT reset.
266 		 */
267 		for_each_gt(__gt, xe, id)
268 			xe_gt_reset(__gt);
269 
270 		if (err) {
271 			KUNIT_FAIL(test, "restore kernel err=%pe\n",
272 				   ERR_PTR(err));
273 			goto cleanup_all;
274 		}
275 
276 		err = xe_bo_restore_user(xe);
277 		if (err) {
278 			KUNIT_FAIL(test, "restore user err=%pe\n", ERR_PTR(err));
279 			goto cleanup_all;
280 		}
281 
282 		if (!xe_bo_is_vram(external)) {
283 			KUNIT_FAIL(test, "external bo is not vram\n");
284 			err = -EPROTO;
285 			goto cleanup_all;
286 		}
287 
288 		if (xe_bo_is_vram(bo)) {
289 			KUNIT_FAIL(test, "bo is vram\n");
290 			err = -EPROTO;
291 			goto cleanup_all;
292 		}
293 
294 		if (i) {
295 			down_read(&vm->lock);
296 			xe_vm_lock(vm, false);
297 			err = xe_bo_validate(bo, bo->vm, false);
298 			xe_vm_unlock(vm);
299 			up_read(&vm->lock);
300 			if (err) {
301 				KUNIT_FAIL(test, "bo valid err=%pe\n",
302 					   ERR_PTR(err));
303 				goto cleanup_all;
304 			}
305 			xe_bo_lock(external, false);
306 			err = xe_bo_validate(external, NULL, false);
307 			xe_bo_unlock(external);
308 			if (err) {
309 				KUNIT_FAIL(test, "external bo valid err=%pe\n",
310 					   ERR_PTR(err));
311 				goto cleanup_all;
312 			}
313 		}
314 
315 		xe_bo_lock(external, false);
316 		xe_bo_unpin_external(external);
317 		xe_bo_unlock(external);
318 
319 		xe_bo_put(external);
320 
321 		xe_bo_lock(bo, false);
322 		__xe_bo_unset_bulk_move(bo);
323 		xe_bo_unlock(bo);
324 		xe_bo_put(bo);
325 		continue;
326 
327 cleanup_all:
328 		xe_bo_lock(external, false);
329 		xe_bo_unpin_external(external);
330 		xe_bo_unlock(external);
331 cleanup_external:
332 		xe_bo_put(external);
333 cleanup_bo:
334 		xe_bo_lock(bo, false);
335 		__xe_bo_unset_bulk_move(bo);
336 		xe_bo_unlock(bo);
337 		xe_bo_put(bo);
338 		break;
339 	}
340 
341 	xe_vm_put(vm);
342 
343 	return 0;
344 }
345 
evict_test_run_device(struct xe_device * xe)346 static int evict_test_run_device(struct xe_device *xe)
347 {
348 	struct kunit *test = kunit_get_current_test();
349 	struct xe_tile *tile;
350 	int id;
351 
352 	if (!IS_DGFX(xe)) {
353 		kunit_skip(test, "non-discrete device\n");
354 		return 0;
355 	}
356 
357 	xe_pm_runtime_get(xe);
358 
359 	for_each_tile(tile, xe, id)
360 		evict_test_run_tile(xe, tile, test);
361 
362 	xe_pm_runtime_put(xe);
363 
364 	return 0;
365 }
366 
xe_bo_evict_kunit(struct kunit * test)367 static void xe_bo_evict_kunit(struct kunit *test)
368 {
369 	struct xe_device *xe = test->priv;
370 
371 	evict_test_run_device(xe);
372 }
373 
374 struct xe_bo_link {
375 	struct list_head link;
376 	struct xe_bo *bo;
377 	u32 val;
378 };
379 
/* Size of each bo created by the shrink test; also the allocation step. */
#define XE_BO_SHRINK_SIZE ((unsigned long)SZ_64M)
381 
shrink_test_fill_random(struct xe_bo * bo,struct rnd_state * state,struct xe_bo_link * link)382 static int shrink_test_fill_random(struct xe_bo *bo, struct rnd_state *state,
383 				   struct xe_bo_link *link)
384 {
385 	struct iosys_map map;
386 	int ret = ttm_bo_vmap(&bo->ttm, &map);
387 	size_t __maybe_unused i;
388 
389 	if (ret)
390 		return ret;
391 
392 	for (i = 0; i < bo->ttm.base.size; i += sizeof(u32)) {
393 		u32 val = prandom_u32_state(state);
394 
395 		iosys_map_wr(&map, i, u32, val);
396 		if (i == 0)
397 			link->val = val;
398 	}
399 
400 	ttm_bo_vunmap(&bo->ttm, &map);
401 	return 0;
402 }
403 
shrink_test_verify(struct kunit * test,struct xe_bo * bo,unsigned int bo_nr,struct rnd_state * state,struct xe_bo_link * link)404 static bool shrink_test_verify(struct kunit *test, struct xe_bo *bo,
405 			       unsigned int bo_nr, struct rnd_state *state,
406 			       struct xe_bo_link *link)
407 {
408 	struct iosys_map map;
409 	int ret = ttm_bo_vmap(&bo->ttm, &map);
410 	size_t i;
411 	bool failed = false;
412 
413 	if (ret) {
414 		KUNIT_FAIL(test, "Error mapping bo %u for content check.\n", bo_nr);
415 		return true;
416 	}
417 
418 	for (i = 0; i < bo->ttm.base.size; i += sizeof(u32)) {
419 		u32 val = prandom_u32_state(state);
420 
421 		if (iosys_map_rd(&map, i, u32) != val) {
422 			KUNIT_FAIL(test, "Content not preserved, bo %u offset 0x%016llx",
423 				   bo_nr, (unsigned long long)i);
424 			kunit_info(test, "Failed value is 0x%08x, recorded 0x%08x\n",
425 				   (unsigned int)iosys_map_rd(&map, i, u32), val);
426 			if (i == 0 && val != link->val)
427 				kunit_info(test, "Looks like PRNG is out of sync.\n");
428 			failed = true;
429 			break;
430 		}
431 	}
432 
433 	ttm_bo_vunmap(&bo->ttm, &map);
434 
435 	return failed;
436 }
437 
438 /*
439  * Try to create system bos corresponding to twice the amount
440  * of available system memory to test shrinker functionality.
441  * If no swap space is available to accommodate the
442  * memory overcommit, mark bos purgeable.
443  */
shrink_test_run_device(struct xe_device * xe)444 static int shrink_test_run_device(struct xe_device *xe)
445 {
446 	struct kunit *test = kunit_get_current_test();
447 	LIST_HEAD(bos);
448 	struct xe_bo_link *link, *next;
449 	struct sysinfo si;
450 	u64 ram, ram_and_swap, purgeable = 0, alloced, to_alloc, limit;
451 	unsigned int interrupted = 0, successful = 0, count = 0;
452 	struct rnd_state prng;
453 	u64 rand_seed;
454 	bool failed = false;
455 
456 	rand_seed = get_random_u64();
457 	prandom_seed_state(&prng, rand_seed);
458 	kunit_info(test, "Random seed is 0x%016llx.\n",
459 		   (unsigned long long)rand_seed);
460 
461 	/* Skip if execution time is expected to be too long. */
462 
463 	limit = SZ_32G;
464 	/* IGFX with flat CCS needs to copy when swapping / shrinking */
465 	if (!IS_DGFX(xe) && xe_device_has_flat_ccs(xe))
466 		limit = SZ_16G;
467 
468 	si_meminfo(&si);
469 	ram = (size_t)si.freeram * si.mem_unit;
470 	if (ram > limit) {
471 		kunit_skip(test, "Too long expected execution time.\n");
472 		return 0;
473 	}
474 	to_alloc = ram * 2;
475 
476 	ram_and_swap = ram + get_nr_swap_pages() * PAGE_SIZE;
477 	if (to_alloc > ram_and_swap)
478 		purgeable = to_alloc - ram_and_swap;
479 	purgeable += div64_u64(purgeable, 5);
480 
481 	kunit_info(test, "Free ram is %lu bytes. Will allocate twice of that.\n",
482 		   (unsigned long)ram);
483 	for (alloced = 0; alloced < to_alloc; alloced += XE_BO_SHRINK_SIZE) {
484 		struct xe_bo *bo;
485 		unsigned int mem_type;
486 		struct xe_ttm_tt *xe_tt;
487 
488 		link = kzalloc(sizeof(*link), GFP_KERNEL);
489 		if (!link) {
490 			KUNIT_FAIL(test, "Unexpected link allocation failure\n");
491 			failed = true;
492 			break;
493 		}
494 
495 		INIT_LIST_HEAD(&link->link);
496 
497 		/* We can create bos using WC caching here. But it is slower. */
498 		bo = xe_bo_create_user(xe, NULL, NULL, XE_BO_SHRINK_SIZE,
499 				       DRM_XE_GEM_CPU_CACHING_WB,
500 				       XE_BO_FLAG_SYSTEM);
501 		if (IS_ERR(bo)) {
502 			if (bo != ERR_PTR(-ENOMEM) && bo != ERR_PTR(-ENOSPC) &&
503 			    bo != ERR_PTR(-EINTR) && bo != ERR_PTR(-ERESTARTSYS))
504 				KUNIT_FAIL(test, "Error creating bo: %pe\n", bo);
505 			kfree(link);
506 			failed = true;
507 			break;
508 		}
509 		xe_bo_lock(bo, false);
510 		xe_tt = container_of(bo->ttm.ttm, typeof(*xe_tt), ttm);
511 
512 		/*
513 		 * Allocate purgeable bos first, because if we do it the
514 		 * other way around, they may not be subject to swapping...
515 		 */
516 		if (alloced < purgeable) {
517 			xe_tt->purgeable = true;
518 			bo->ttm.priority = 0;
519 		} else {
520 			int ret = shrink_test_fill_random(bo, &prng, link);
521 
522 			if (ret) {
523 				xe_bo_unlock(bo);
524 				xe_bo_put(bo);
525 				KUNIT_FAIL(test, "Error filling bo with random data: %pe\n",
526 					   ERR_PTR(ret));
527 				kfree(link);
528 				failed = true;
529 				break;
530 			}
531 		}
532 
533 		mem_type = bo->ttm.resource->mem_type;
534 		xe_bo_unlock(bo);
535 		link->bo = bo;
536 		list_add_tail(&link->link, &bos);
537 
538 		if (mem_type != XE_PL_TT) {
539 			KUNIT_FAIL(test, "Bo in incorrect memory type: %u\n",
540 				   bo->ttm.resource->mem_type);
541 			failed = true;
542 		}
543 		cond_resched();
544 		if (signal_pending(current))
545 			break;
546 	}
547 
548 	/*
549 	 * Read back and destroy bos. Reset the pseudo-random seed to get an
550 	 * identical pseudo-random number sequence for readback.
551 	 */
552 	prandom_seed_state(&prng, rand_seed);
553 	list_for_each_entry_safe(link, next, &bos, link) {
554 		static struct ttm_operation_ctx ctx = {.interruptible = true};
555 		struct xe_bo *bo = link->bo;
556 		struct xe_ttm_tt *xe_tt;
557 		int ret;
558 
559 		count++;
560 		if (!signal_pending(current) && !failed) {
561 			bool purgeable, intr = false;
562 
563 			xe_bo_lock(bo, NULL);
564 
565 			/* xe_tt->purgeable is cleared on validate. */
566 			xe_tt = container_of(bo->ttm.ttm, typeof(*xe_tt), ttm);
567 			purgeable = xe_tt->purgeable;
568 			do {
569 				ret = ttm_bo_validate(&bo->ttm, &tt_placement, &ctx);
570 				if (ret == -EINTR)
571 					intr = true;
572 			} while (ret == -EINTR && !signal_pending(current));
573 
574 			if (!ret && !purgeable)
575 				failed = shrink_test_verify(test, bo, count, &prng, link);
576 
577 			xe_bo_unlock(bo);
578 			if (ret) {
579 				KUNIT_FAIL(test, "Validation failed: %pe\n",
580 					   ERR_PTR(ret));
581 				failed = true;
582 			} else if (intr) {
583 				interrupted++;
584 			} else {
585 				successful++;
586 			}
587 		}
588 		xe_bo_put(link->bo);
589 		list_del(&link->link);
590 		kfree(link);
591 	}
592 	kunit_info(test, "Readbacks interrupted: %u successful: %u\n",
593 		   interrupted, successful);
594 
595 	return 0;
596 }
597 
xe_bo_shrink_kunit(struct kunit * test)598 static void xe_bo_shrink_kunit(struct kunit *test)
599 {
600 	struct xe_device *xe = test->priv;
601 
602 	shrink_test_run_device(xe);
603 }
604 
605 static struct kunit_case xe_bo_tests[] = {
606 	KUNIT_CASE_PARAM(xe_ccs_migrate_kunit, xe_pci_live_device_gen_param),
607 	KUNIT_CASE_PARAM(xe_bo_evict_kunit, xe_pci_live_device_gen_param),
608 	{}
609 };
610 
611 VISIBLE_IF_KUNIT
612 struct kunit_suite xe_bo_test_suite = {
613 	.name = "xe_bo",
614 	.test_cases = xe_bo_tests,
615 	.init = xe_kunit_helper_xe_device_live_test_init,
616 };
617 EXPORT_SYMBOL_IF_KUNIT(xe_bo_test_suite);
618 
619 static struct kunit_case xe_bo_shrink_test[] = {
620 	KUNIT_CASE_PARAM_ATTR(xe_bo_shrink_kunit, xe_pci_live_device_gen_param,
621 			      {.speed = KUNIT_SPEED_SLOW}),
622 	{}
623 };
624 
625 VISIBLE_IF_KUNIT
626 struct kunit_suite xe_bo_shrink_test_suite = {
627 	.name = "xe_bo_shrink",
628 	.test_cases = xe_bo_shrink_test,
629 	.init = xe_kunit_helper_xe_device_live_test_init,
630 };
631 EXPORT_SYMBOL_IF_KUNIT(xe_bo_shrink_test_suite);
632