// SPDX-License-Identifier: MIT
/*
 * Copyright © 2025 Intel Corporation
 */

#include "xe_vm_madvise.h"

#include <linux/nospec.h>
#include <drm/xe_drm.h>

#include "xe_bo.h"
#include "xe_pat.h"
#include "xe_pt.h"
#include "xe_svm.h"
#include "xe_tlb_inval.h"
#include "xe_vm.h"

struct xe_vmas_in_madvise_range {
	u64 addr;
	u64 range;
	struct xe_vma **vmas;
	int num_vmas;
	bool has_bo_vmas;
	bool has_svm_userptr_vmas;
};

/**
 * struct xe_madvise_details - Argument to madvise_funcs
 * @dpagemap: Reference-counted pointer to a struct drm_pagemap.
 * @has_purged_bo: Track if any BO was purged (for purgeable state)
 * @retained_ptr: User pointer for retained value (for purgeable state)
 *
 * The madvise IOCTL handler may, in addition to the user-space
 * args, have additional info to pass into the madvise_func that
 * handles the madvise type. Use a struct xe_madvise_details
 * for that and extend the struct as necessary.
 */
struct xe_madvise_details {
	struct drm_pagemap *dpagemap;
	bool has_purged_bo;
	u64 retained_ptr;
};
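
/*
 * Illustration only (madvise_example() and example_val are hypothetical,
 * not part of this file): new per-type state is passed by adding a member
 * above and reading it in the handler, rather than by changing the common
 * madvise_func signature:
 *
 *	struct xe_madvise_details {
 *		...
 *		u32 example_val;
 *	};
 *
 *	static void madvise_example(struct xe_device *xe, struct xe_vm *vm,
 *				    struct xe_vma **vmas, int num_vmas,
 *				    struct drm_xe_madvise *op,
 *				    struct xe_madvise_details *details)
 *	{
 *		... apply details->example_val to each VMA ...
 *	}
 */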

static int get_vmas(struct xe_vm *vm, struct xe_vmas_in_madvise_range *madvise_range)
{
	u64 addr = madvise_range->addr;
	u64 range = madvise_range->range;

	struct xe_vma **__vmas;
	struct drm_gpuva *gpuva;
	int max_vmas = 8;

	lockdep_assert_held(&vm->lock);

	madvise_range->num_vmas = 0;
	madvise_range->vmas = kmalloc_array(max_vmas, sizeof(*madvise_range->vmas), GFP_KERNEL);
	if (!madvise_range->vmas)
		return -ENOMEM;

	vm_dbg(&vm->xe->drm, "VMAs in range: start=0x%016llx, end=0x%016llx", addr, addr + range);

	drm_gpuvm_for_each_va_range(gpuva, &vm->gpuvm, addr, addr + range) {
		struct xe_vma *vma = gpuva_to_vma(gpuva);

		if (xe_vma_bo(vma))
			madvise_range->has_bo_vmas = true;
		else if (xe_vma_is_cpu_addr_mirror(vma) || xe_vma_is_userptr(vma))
			madvise_range->has_svm_userptr_vmas = true;

		if (madvise_range->num_vmas == max_vmas) {
			max_vmas <<= 1;
			__vmas = krealloc(madvise_range->vmas,
					  max_vmas * sizeof(*madvise_range->vmas),
					  GFP_KERNEL);
			if (!__vmas) {
				kfree(madvise_range->vmas);
				return -ENOMEM;
			}
			madvise_range->vmas = __vmas;
		}

		madvise_range->vmas[madvise_range->num_vmas] = vma;
		madvise_range->num_vmas++;
	}

	if (!madvise_range->num_vmas)
		kfree(madvise_range->vmas);

	vm_dbg(&vm->xe->drm, "madvise_range->num_vmas = %d\n", madvise_range->num_vmas);

	return 0;
}

static void madvise_preferred_mem_loc(struct xe_device *xe, struct xe_vm *vm,
				      struct xe_vma **vmas, int num_vmas,
				      struct drm_xe_madvise *op,
				      struct xe_madvise_details *details)
{
	int i;

	xe_assert(vm->xe, op->type == DRM_XE_MEM_RANGE_ATTR_PREFERRED_LOC);

	for (i = 0; i < num_vmas; i++) {
		struct xe_vma *vma = vmas[i];
		struct xe_vma_preferred_loc *loc = &vma->attr.preferred_loc;

		/* TODO: Extend attributes to BO-based VMAs */
		if ((loc->devmem_fd == op->preferred_mem_loc.devmem_fd &&
		     loc->migration_policy == op->preferred_mem_loc.migration_policy) ||
		    !xe_vma_is_cpu_addr_mirror(vma)) {
			vma->skip_invalidation = true;
		} else {
			vma->skip_invalidation = false;
			loc->devmem_fd = op->preferred_mem_loc.devmem_fd;
			/*
			 * Until multi-device support is added, migration_policy
			 * has no effect and can be ignored.
			 */
			loc->migration_policy = op->preferred_mem_loc.migration_policy;
			drm_pagemap_put(loc->dpagemap);
			loc->dpagemap = NULL;
			if (details->dpagemap)
				loc->dpagemap = drm_pagemap_get(details->dpagemap);
		}
	}
}

static void madvise_atomic(struct xe_device *xe, struct xe_vm *vm,
			   struct xe_vma **vmas, int num_vmas,
			   struct drm_xe_madvise *op,
			   struct xe_madvise_details *details)
{
	struct xe_bo *bo;
	int i;

	xe_assert(vm->xe, op->type == DRM_XE_MEM_RANGE_ATTR_ATOMIC);
	xe_assert(vm->xe, op->atomic.val <= DRM_XE_ATOMIC_CPU);

	for (i = 0; i < num_vmas; i++) {
		if (xe_vma_is_userptr(vmas[i]) &&
		    !(op->atomic.val == DRM_XE_ATOMIC_DEVICE &&
		      xe->info.has_device_atomics_on_smem)) {
			vmas[i]->skip_invalidation = true;
			continue;
		}

		if (vmas[i]->attr.atomic_access == op->atomic.val) {
			vmas[i]->skip_invalidation = true;
		} else {
			vmas[i]->skip_invalidation = false;
			vmas[i]->attr.atomic_access = op->atomic.val;
		}

		bo = xe_vma_bo(vmas[i]);
		if (!bo || bo->attr.atomic_access == op->atomic.val)
			continue;

		vmas[i]->skip_invalidation = false;
		xe_bo_assert_held(bo);
		bo->attr.atomic_access = op->atomic.val;

		/* Invalidate the CPU page table so the BO can migrate to SMEM on next access */
		if (xe_bo_is_vram(bo) &&
		    (bo->attr.atomic_access == DRM_XE_ATOMIC_CPU ||
		     bo->attr.atomic_access == DRM_XE_ATOMIC_GLOBAL))
			ttm_bo_unmap_virtual(&bo->ttm);
	}
}

static void madvise_pat_index(struct xe_device *xe, struct xe_vm *vm,
			      struct xe_vma **vmas, int num_vmas,
			      struct drm_xe_madvise *op,
			      struct xe_madvise_details *details)
{
	int i;

	xe_assert(vm->xe, op->type == DRM_XE_MEM_RANGE_ATTR_PAT);

	for (i = 0; i < num_vmas; i++) {
		if (vmas[i]->attr.pat_index == op->pat_index.val) {
			vmas[i]->skip_invalidation = true;
		} else {
			vmas[i]->skip_invalidation = false;
			vmas[i]->attr.pat_index = op->pat_index.val;
		}
	}
}

/**
 * madvise_purgeable - Handle purgeable buffer object advice
 * @xe: XE device
 * @vm: VM
 * @vmas: Array of VMAs
 * @num_vmas: Number of VMAs
 * @op: Madvise operation
 * @details: Madvise details for return values
 *
 * Handles DONTNEED/WILLNEED/PURGED states. Tracks if any BO was purged
 * in details->has_purged_bo for later copy to userspace.
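 *
 * A minimal sketch of the intended userspace flow (assuming the
 * drm_xe_madvise uapi in drm/xe_drm.h; error handling omitted, and
 * reinitialize_contents() stands in for the application's recovery path):
 *
 *	u32 retained = 0;
 *
 *	args.type = DRM_XE_VMA_ATTR_PURGEABLE_STATE;
 *	args.purge_state_val.val = DRM_XE_VMA_PURGEABLE_STATE_DONTNEED;
 *	ioctl(fd, DRM_IOCTL_XE_MADVISE, &args);
 *
 *	args.purge_state_val.val = DRM_XE_VMA_PURGEABLE_STATE_WILLNEED;
 *	args.purge_state_val.retained_ptr = (__u64)(uintptr_t)&retained;
 *	ioctl(fd, DRM_IOCTL_XE_MADVISE, &args);
 *	if (!retained)
 *		reinitialize_contents();
 *
 * retained must be zero-initialized by userspace and is written as 1 only
 * if no BO in the range was purged in the meantime.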
 */
static void madvise_purgeable(struct xe_device *xe, struct xe_vm *vm,
			      struct xe_vma **vmas, int num_vmas,
			      struct drm_xe_madvise *op,
			      struct xe_madvise_details *details)
{
	int i;

	xe_assert(vm->xe, op->type == DRM_XE_VMA_ATTR_PURGEABLE_STATE);

	for (i = 0; i < num_vmas; i++) {
		struct xe_bo *bo = xe_vma_bo(vmas[i]);

		if (!bo) {
			/* Purgeable state applies to BOs only, skip non-BO VMAs */
			vmas[i]->skip_invalidation = true;
			continue;
		}

		/* BO must be locked before modifying madv state */
		xe_bo_assert_held(bo);

		/*
		 * Once purged, always purged. Cannot transition back to WILLNEED.
		 * This matches i915 semantics where purged BOs are permanently invalid.
		 */
		if (xe_bo_is_purged(bo)) {
			details->has_purged_bo = true;
			vmas[i]->skip_invalidation = true;
			continue;
		}

		switch (op->purge_state_val.val) {
		case DRM_XE_VMA_PURGEABLE_STATE_WILLNEED:
			vmas[i]->skip_invalidation = true;
			/* Only act on a real DONTNEED -> WILLNEED transition. */
			if (vmas[i]->attr.purgeable_state == XE_MADV_PURGEABLE_DONTNEED) {
				vmas[i]->attr.purgeable_state = XE_MADV_PURGEABLE_WILLNEED;
				xe_bo_willneed_get_locked(bo);
			}
			break;
		case DRM_XE_VMA_PURGEABLE_STATE_DONTNEED:
			/*
			 * Don't zap PTEs at DONTNEED time -- pages are still
			 * alive. The zap happens in xe_bo_move_notify() right
			 * before the shrinker frees them.
			 */
			vmas[i]->skip_invalidation = true;

			/* Only act on a real WILLNEED -> DONTNEED transition. */
			if (vmas[i]->attr.purgeable_state == XE_MADV_PURGEABLE_WILLNEED) {
				vmas[i]->attr.purgeable_state = XE_MADV_PURGEABLE_DONTNEED;
				xe_bo_willneed_put_locked(bo);
			}
			break;
		default:
			/* Should never hit - values validated in madvise_args_are_sane() */
			xe_assert(vm->xe, 0);
			return;
		}
	}
}

typedef void (*madvise_func)(struct xe_device *xe, struct xe_vm *vm,
			     struct xe_vma **vmas, int num_vmas,
			     struct drm_xe_madvise *op,
			     struct xe_madvise_details *details);

static const madvise_func madvise_funcs[] = {
	[DRM_XE_MEM_RANGE_ATTR_PREFERRED_LOC] = madvise_preferred_mem_loc,
	[DRM_XE_MEM_RANGE_ATTR_ATOMIC] = madvise_atomic,
	[DRM_XE_MEM_RANGE_ATTR_PAT] = madvise_pat_index,
	[DRM_XE_VMA_ATTR_PURGEABLE_STATE] = madvise_purgeable,
};
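
/*
 * Wiring up a new advice type is then a matter of validating it in
 * madvise_args_are_sane(), adding a handler with the madvise_func
 * signature and listing it here, e.g. (hypothetical):
 *
 *	[DRM_XE_MEM_RANGE_ATTR_EXAMPLE] = madvise_example,
 *
 * xe_vm_madvise_ioctl() dispatches on args->type through this table via
 * array_index_nospec() and rejects any type without an entry.
 */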

static u8 xe_zap_ptes_in_madvise_range(struct xe_vm *vm, u64 start, u64 end)
{
	struct drm_gpuva *gpuva;
	struct xe_tile *tile;
	u8 id, tile_mask = 0;

	lockdep_assert_held_write(&vm->lock);

	/* Wait for pending binds */
	if (dma_resv_wait_timeout(xe_vm_resv(vm), DMA_RESV_USAGE_BOOKKEEP,
				  false, MAX_SCHEDULE_TIMEOUT) <= 0)
		XE_WARN_ON(1);

	drm_gpuvm_for_each_va_range(gpuva, &vm->gpuvm, start, end) {
		struct xe_vma *vma = gpuva_to_vma(gpuva);

		if (vma->skip_invalidation || xe_vma_is_null(vma))
			continue;

		if (xe_vma_is_cpu_addr_mirror(vma)) {
			tile_mask |= xe_svm_ranges_zap_ptes_in_range(vm,
								     xe_vma_start(vma),
								     xe_vma_end(vma));
		} else {
			for_each_tile(tile, vm->xe, id) {
				if (xe_pt_zap_ptes(tile, vma)) {
					tile_mask |= BIT(id);

					/*
					 * WRITE_ONCE pairs with READ_ONCE
					 * in xe_vm_has_valid_gpu_mapping()
					 */
					WRITE_ONCE(vma->tile_invalidated,
						   vma->tile_invalidated | BIT(id));
				}
			}
		}
	}

	return tile_mask;
}

static int xe_vm_invalidate_madvise_range(struct xe_vm *vm, u64 start, u64 end)
{
	u8 tile_mask = xe_zap_ptes_in_madvise_range(vm, start, end);
	struct xe_tlb_inval_batch batch;
	int err;

	if (!tile_mask)
		return 0;

	xe_device_wmb(vm->xe);

	err = xe_tlb_inval_range_tilemask_submit(vm->xe, vm->usm.asid, start, end,
						 tile_mask, &batch);
	if (!err)
		xe_tlb_inval_batch_wait(&batch);

	return err;
}

static bool madvise_args_are_sane(struct xe_device *xe, const struct drm_xe_madvise *args)
{
	if (XE_IOCTL_DBG(xe, !args))
		return false;

	if (XE_IOCTL_DBG(xe, !IS_ALIGNED(args->start, SZ_4K)))
		return false;

	if (XE_IOCTL_DBG(xe, !IS_ALIGNED(args->range, SZ_4K)))
		return false;

	if (XE_IOCTL_DBG(xe, args->range < SZ_4K))
		return false;

	switch (args->type) {
	case DRM_XE_MEM_RANGE_ATTR_PREFERRED_LOC:
	{
		s32 fd = (s32)args->preferred_mem_loc.devmem_fd;

		if (XE_IOCTL_DBG(xe, fd < DRM_XE_PREFERRED_LOC_DEFAULT_SYSTEM))
			return false;

		if (XE_IOCTL_DBG(xe, fd <= DRM_XE_PREFERRED_LOC_DEFAULT_DEVICE &&
				 args->preferred_mem_loc.region_instance != 0))
			return false;

		if (XE_IOCTL_DBG(xe, args->preferred_mem_loc.migration_policy >
				     DRM_XE_MIGRATE_ONLY_SYSTEM_PAGES))
			return false;

		if (XE_IOCTL_DBG(xe, args->preferred_mem_loc.reserved))
			return false;
		break;
	}
	case DRM_XE_MEM_RANGE_ATTR_ATOMIC:
		if (XE_IOCTL_DBG(xe, args->atomic.val > DRM_XE_ATOMIC_CPU))
			return false;

		if (XE_IOCTL_DBG(xe, args->atomic.pad))
			return false;

		if (XE_IOCTL_DBG(xe, args->atomic.reserved))
			return false;

		break;
	case DRM_XE_MEM_RANGE_ATTR_PAT:
	{
		u16 pat_index, coh_mode;

		if (XE_IOCTL_DBG(xe, args->pat_index.val >= xe->pat.n_entries))
			return false;

		pat_index = array_index_nospec(args->pat_index.val, xe->pat.n_entries);
		coh_mode = xe_pat_index_get_coh_mode(xe, pat_index);
		if (XE_IOCTL_DBG(xe, !coh_mode))
			return false;

		if (XE_WARN_ON(coh_mode > XE_COH_2WAY))
			return false;

		if (XE_IOCTL_DBG(xe, args->pat_index.pad))
			return false;

		if (XE_IOCTL_DBG(xe, args->pat_index.reserved))
			return false;
		break;
	}
	case DRM_XE_VMA_ATTR_PURGEABLE_STATE:
	{
		u32 val = args->purge_state_val.val;

		if (XE_IOCTL_DBG(xe, !(val == DRM_XE_VMA_PURGEABLE_STATE_WILLNEED ||
				       val == DRM_XE_VMA_PURGEABLE_STATE_DONTNEED)))
			return false;

		if (XE_IOCTL_DBG(xe, args->purge_state_val.pad))
			return false;

		break;
	}
	default:
		if (XE_IOCTL_DBG(xe, 1))
			return false;
	}

	if (XE_IOCTL_DBG(xe, args->reserved[0] || args->reserved[1]))
		return false;

	return true;
}

static int xe_madvise_details_init(struct xe_vm *vm, const struct drm_xe_madvise *args,
				   struct xe_madvise_details *details)
{
	struct xe_device *xe = vm->xe;

	memset(details, 0, sizeof(*details));

	/* Store retained pointer for purgeable state */
	if (args->type == DRM_XE_VMA_ATTR_PURGEABLE_STATE) {
		details->retained_ptr = args->purge_state_val.retained_ptr;
		return 0;
	}

	if (args->type == DRM_XE_MEM_RANGE_ATTR_PREFERRED_LOC) {
		int fd = args->preferred_mem_loc.devmem_fd;
		struct drm_pagemap *dpagemap;

		if (fd <= 0)
			return 0;

		dpagemap = xe_drm_pagemap_from_fd(args->preferred_mem_loc.devmem_fd,
						  args->preferred_mem_loc.region_instance);
		if (XE_IOCTL_DBG(xe, IS_ERR(dpagemap)))
			return PTR_ERR(dpagemap);

		/* Don't allow a foreign placement without a fast interconnect! */
		if (XE_IOCTL_DBG(xe, dpagemap->pagemap->owner != vm->svm.peer.owner)) {
			drm_pagemap_put(dpagemap);
			return -ENOLINK;
		}
		details->dpagemap = dpagemap;
	}

	return 0;
}

static void xe_madvise_details_fini(struct xe_madvise_details *details)
{
	drm_pagemap_put(details->dpagemap);
}

static int xe_madvise_purgeable_retained_to_user(const struct xe_madvise_details *details)
{
	u32 retained;

	if (!details->retained_ptr)
		return 0;

	retained = !details->has_purged_bo;

	if (put_user(retained, (u32 __user *)u64_to_user_ptr(details->retained_ptr)))
		return -EFAULT;

	return 0;
}

static bool check_pat_args_are_sane(struct xe_device *xe,
				    struct xe_vmas_in_madvise_range *madvise_range,
				    u16 pat_index)
{
	u16 coh_mode = xe_pat_index_get_coh_mode(xe, pat_index);
	int i;

	/*
	 * Using coh_none with CPU cached buffers is not allowed on iGPU.
	 * On iGPU the GPU shares the LLC with the CPU, so with coh_none
	 * the GPU bypasses CPU caches and reads directly from DRAM,
	 * potentially seeing stale sensitive data from previously freed
	 * pages. On dGPU this restriction does not apply, because the
	 * platform does not provide a non-coherent system memory access
	 * path that would violate the DMA coherency contract.
	 */
	if (coh_mode != XE_COH_NONE || IS_DGFX(xe))
		return true;

	for (i = 0; i < madvise_range->num_vmas; i++) {
		struct xe_vma *vma = madvise_range->vmas[i];
		struct xe_bo *bo = xe_vma_bo(vma);

		if (bo) {
			/* BO with WB caching + COH_NONE is not allowed */
			if (XE_IOCTL_DBG(xe, bo->cpu_caching == DRM_XE_GEM_CPU_CACHING_WB))
				return false;
			/* Imported dma-buf without caching info, assume cached */
			if (XE_IOCTL_DBG(xe, !bo->cpu_caching))
				return false;
		} else if (XE_IOCTL_DBG(xe, xe_vma_is_cpu_addr_mirror(vma) ||
					xe_vma_is_userptr(vma))) {
			/* System memory (userptr/SVM) is always CPU cached */
			return false;
		}
	}

	return true;
}

static bool check_bo_args_are_sane(struct xe_vm *vm, struct xe_vma **vmas,
				   int num_vmas, u32 atomic_val)
{
	struct xe_device *xe = vm->xe;
	struct xe_bo *bo;
	int i;

	for (i = 0; i < num_vmas; i++) {
		bo = xe_vma_bo(vmas[i]);
		if (!bo)
			continue;
		/*
		 * NOTE: The following atomic checks are platform-specific. For example,
		 * if a device supports CXL atomics, these may not be necessary or
		 * may behave differently.
		 */
		if (XE_IOCTL_DBG(xe, atomic_val == DRM_XE_ATOMIC_CPU &&
				 !(bo->flags & XE_BO_FLAG_SYSTEM)))
			return false;

		if (XE_IOCTL_DBG(xe, atomic_val == DRM_XE_ATOMIC_DEVICE &&
				 !(bo->flags & XE_BO_FLAG_VRAM0) &&
				 !(bo->flags & XE_BO_FLAG_VRAM1) &&
				 !(bo->flags & XE_BO_FLAG_SYSTEM &&
				   xe->info.has_device_atomics_on_smem)))
			return false;

		if (XE_IOCTL_DBG(xe, atomic_val == DRM_XE_ATOMIC_GLOBAL &&
				 (!(bo->flags & XE_BO_FLAG_SYSTEM) ||
				  (!(bo->flags & XE_BO_FLAG_VRAM0) &&
				   !(bo->flags & XE_BO_FLAG_VRAM1)))))
			return false;
	}
	return true;
}

/**
 * xe_vm_madvise_ioctl - Handle the MADVISE ioctl for a VM
 * @dev: DRM device pointer
 * @data: Pointer to ioctl data (drm_xe_madvise*)
 * @file: DRM file pointer
 *
 * Handles the MADVISE ioctl to provide memory advice for VMAs within
 * the input range.
 *
 * Return: 0 on success or a negative error code on failure.
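 *
 * A minimal userspace sketch (assuming the drm_xe_madvise uapi in
 * drm/xe_drm.h; fd, vm_id, addr and size are placeholders):
 *
 *	struct drm_xe_madvise args = {
 *		.vm_id = vm_id,
 *		.start = addr,
 *		.range = size,
 *		.type = DRM_XE_MEM_RANGE_ATTR_ATOMIC,
 *		.atomic.val = DRM_XE_ATOMIC_DEVICE,
 *	};
 *	int ret = ioctl(fd, DRM_IOCTL_XE_MADVISE, &args);
 *
 * start and range must be 4 KiB aligned with range at least 4 KiB, and
 * the VM must not be closed or banned; otherwise the call fails with
 * -EINVAL or -ENOENT as validated below.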
 */
int xe_vm_madvise_ioctl(struct drm_device *dev, void *data, struct drm_file *file)
{
	struct xe_device *xe = to_xe_device(dev);
	struct xe_file *xef = to_xe_file(file);
	struct drm_xe_madvise *args = data;
	struct xe_vmas_in_madvise_range madvise_range = {
		/*
		 * Userspace may pass canonical (sign-extended) addresses.
		 * Strip the sign extension to get the internal non-canonical
		 * form used by the GPUVM, matching xe_vm_bind_ioctl() behavior.
		 */
		.addr = xe_device_uncanonicalize_addr(xe, args->start),
		.range = args->range,
	};
	struct xe_madvise_details details;
	u16 pat_index, coh_mode;
	struct xe_vm *vm;
	struct drm_exec exec;
	int err, attr_type;
	bool do_retained;

	vm = xe_vm_lookup(xef, args->vm_id);
	if (XE_IOCTL_DBG(xe, !vm))
		return -EINVAL;

	if (!madvise_args_are_sane(vm->xe, args)) {
		err = -EINVAL;
		goto put_vm;
	}

	/* Cache whether we need to write retained, and validate it's initialized to 0 */
	do_retained = args->type == DRM_XE_VMA_ATTR_PURGEABLE_STATE &&
		      args->purge_state_val.retained_ptr;
	if (do_retained) {
		u32 retained;
		u32 __user *retained_ptr;

		retained_ptr = u64_to_user_ptr(args->purge_state_val.retained_ptr);
		if (get_user(retained, retained_ptr)) {
			err = -EFAULT;
			goto put_vm;
		}

		if (XE_IOCTL_DBG(xe, retained != 0)) {
			err = -EINVAL;
			goto put_vm;
		}
	}

	xe_svm_flush(vm);

	err = down_write_killable(&vm->lock);
	if (err)
		goto put_vm;

	if (XE_IOCTL_DBG(xe, xe_vm_is_closed_or_banned(vm))) {
		err = -ENOENT;
		goto unlock_vm;
	}

	err = xe_madvise_details_init(vm, args, &details);
	if (err)
		goto unlock_vm;

	err = xe_vm_alloc_madvise_vma(vm, madvise_range.addr, args->range);
	if (err)
		goto madv_fini;

	err = get_vmas(vm, &madvise_range);
	if (err || !madvise_range.num_vmas)
		goto madv_fini;

	if (args->type == DRM_XE_MEM_RANGE_ATTR_PAT) {
		pat_index = array_index_nospec(args->pat_index.val, xe->pat.n_entries);
		coh_mode = xe_pat_index_get_coh_mode(xe, pat_index);
		if (XE_IOCTL_DBG(xe, madvise_range.has_svm_userptr_vmas &&
				 xe_device_is_l2_flush_optimized(xe) &&
				 (pat_index != 19 && coh_mode != XE_COH_2WAY))) {
			err = -EINVAL;
			goto madv_fini;
		}

		if (!check_pat_args_are_sane(xe, &madvise_range,
					     args->pat_index.val)) {
			err = -EINVAL;
			goto free_vmas;
		}
	}

	if (madvise_range.has_bo_vmas) {
		if (args->type == DRM_XE_MEM_RANGE_ATTR_ATOMIC) {
			if (!check_bo_args_are_sane(vm, madvise_range.vmas,
						    madvise_range.num_vmas,
						    args->atomic.val)) {
				err = -EINVAL;
				goto free_vmas;
			}
		}

		drm_exec_init(&exec, DRM_EXEC_IGNORE_DUPLICATES | DRM_EXEC_INTERRUPTIBLE_WAIT, 0);
		drm_exec_until_all_locked(&exec) {
			for (int i = 0; i < madvise_range.num_vmas; i++) {
				struct xe_bo *bo = xe_vma_bo(madvise_range.vmas[i]);

				if (!bo)
					continue;

				if (args->type == DRM_XE_MEM_RANGE_ATTR_PAT) {
					if (XE_IOCTL_DBG(xe, bo->ttm.base.import_attach &&
							 xe_device_is_l2_flush_optimized(xe) &&
							 (pat_index != 19 &&
							  coh_mode != XE_COH_2WAY))) {
						err = -EINVAL;
						goto err_fini;
					}
				}

				err = drm_exec_lock_obj(&exec, &bo->ttm.base);
				drm_exec_retry_on_contention(&exec);
				if (err)
					goto err_fini;
			}
		}
	}

	attr_type = array_index_nospec(args->type, ARRAY_SIZE(madvise_funcs));

	/* Ensure the madvise function exists for this type, before taking the notifier lock */
	if (!madvise_funcs[attr_type]) {
		err = -EINVAL;
		goto err_fini;
	}

	if (madvise_range.has_svm_userptr_vmas) {
		err = xe_svm_notifier_lock_interruptible(vm);
		if (err)
			goto err_fini;
	}

	madvise_funcs[attr_type](xe, vm, madvise_range.vmas, madvise_range.num_vmas, args,
				 &details);

	err = xe_vm_invalidate_madvise_range(vm, madvise_range.addr,
					     madvise_range.addr + args->range);

	if (madvise_range.has_svm_userptr_vmas)
		xe_svm_notifier_unlock(vm);

err_fini:
	if (madvise_range.has_bo_vmas)
		drm_exec_fini(&exec);
free_vmas:
	kfree(madvise_range.vmas);
	madvise_range.vmas = NULL;
madv_fini:
	xe_madvise_details_fini(&details);
unlock_vm:
	up_write(&vm->lock);

	/* Write retained value to user after releasing all locks */
	if (!err && do_retained)
		err = xe_madvise_purgeable_retained_to_user(&details);
put_vm:
	xe_vm_put(vm);
	return err;
}