xref: /linux/drivers/gpu/drm/xe/xe_vm_madvise.c (revision 06bc7ff0a1e0f2b0102e1314e3527a7ec0997851)
// SPDX-License-Identifier: MIT
/*
 * Copyright © 2025 Intel Corporation
 */

#include "xe_vm_madvise.h"

#include <linux/nospec.h>
#include <drm/xe_drm.h>

#include "xe_bo.h"
#include "xe_pat.h"
#include "xe_pt.h"
#include "xe_svm.h"
#include "xe_tlb_inval.h"
#include "xe_vm.h"

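/**
 * struct xe_vmas_in_madvise_range - VMAs collected for a madvise range
 * @addr: Start address of the madvise range
 * @range: Size of the madvise range
 * @vmas: Array of VMAs overlapping the range
 * @num_vmas: Number of entries in @vmas
 * @has_bo_vmas: True if at least one VMA in the range is BO-backed
 * @has_svm_userptr_vmas: True if at least one VMA is CPU-address-mirror or userptr
 */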
struct xe_vmas_in_madvise_range {
	u64 addr;
	u64 range;
	struct xe_vma **vmas;
	int num_vmas;
	bool has_bo_vmas;
	bool has_svm_userptr_vmas;
};

/**
 * struct xe_madvise_details - Argument to madvise_funcs
 * @dpagemap: Reference-counted pointer to a struct drm_pagemap.
 * @has_purged_bo: Track if any BO was purged (for purgeable state)
 * @retained_ptr: User pointer for retained value (for purgeable state)
 *
 * The madvise IOCTL handler may, in addition to the user-space
 * args, have additional info to pass into the madvise_func that
 * handles the madvise type. Use a struct xe_madvise_details
 * for that and extend the struct as necessary.
 */
struct xe_madvise_details {
	struct drm_pagemap *dpagemap;
	bool has_purged_bo;
	u64 retained_ptr;
};

static int get_vmas(struct xe_vm *vm, struct xe_vmas_in_madvise_range *madvise_range)
{
	u64 addr = madvise_range->addr;
	u64 range = madvise_range->range;

	struct xe_vma **__vmas;
	struct drm_gpuva *gpuva;
	int max_vmas = 8;

	lockdep_assert_held(&vm->lock);

	madvise_range->num_vmas = 0;
	madvise_range->vmas = kmalloc_objs(*madvise_range->vmas, max_vmas);
	if (!madvise_range->vmas)
		return -ENOMEM;

	vm_dbg(&vm->xe->drm, "VMAs in range: start=0x%016llx, end=0x%016llx", addr, addr + range);

	drm_gpuvm_for_each_va_range(gpuva, &vm->gpuvm, addr, addr + range) {
		struct xe_vma *vma = gpuva_to_vma(gpuva);

		if (xe_vma_bo(vma))
			madvise_range->has_bo_vmas = true;
		else if (xe_vma_is_cpu_addr_mirror(vma) || xe_vma_is_userptr(vma))
			madvise_range->has_svm_userptr_vmas = true;

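		/* Array full: double the capacity (amortized O(1) growth per VMA). */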
		if (madvise_range->num_vmas == max_vmas) {
			max_vmas <<= 1;
			__vmas = krealloc(madvise_range->vmas,
					  max_vmas * sizeof(*madvise_range->vmas),
					  GFP_KERNEL);
			if (!__vmas) {
				kfree(madvise_range->vmas);
				return -ENOMEM;
			}
			madvise_range->vmas = __vmas;
		}

		madvise_range->vmas[madvise_range->num_vmas] = vma;
		(madvise_range->num_vmas)++;
	}

	if (!madvise_range->num_vmas)
		kfree(madvise_range->vmas);

	vm_dbg(&vm->xe->drm, "madvise_range->num_vmas = %d\n", madvise_range->num_vmas);

	return 0;
}

static void madvise_preferred_mem_loc(struct xe_device *xe, struct xe_vm *vm,
				      struct xe_vma **vmas, int num_vmas,
				      struct drm_xe_madvise *op,
				      struct xe_madvise_details *details)
{
	int i;

	xe_assert(vm->xe, op->type == DRM_XE_MEM_RANGE_ATTR_PREFERRED_LOC);

	for (i = 0; i < num_vmas; i++) {
		struct xe_vma *vma = vmas[i];
		struct xe_vma_preferred_loc *loc = &vma->attr.preferred_loc;

		/* TODO: Extend attributes to BO-based VMAs */
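		/* Unchanged placement (or non-SVM VMA): skip PTE zap and TLB invalidation. */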
		if ((loc->devmem_fd == op->preferred_mem_loc.devmem_fd &&
		     loc->migration_policy == op->preferred_mem_loc.migration_policy) ||
		    !xe_vma_is_cpu_addr_mirror(vma)) {
			vma->skip_invalidation = true;
		} else {
			vma->skip_invalidation = false;
			loc->devmem_fd = op->preferred_mem_loc.devmem_fd;
			/* Until multi-device support is added, migration_policy
			 * is unused and can be ignored.
			 */
			loc->migration_policy = op->preferred_mem_loc.migration_policy;
			drm_pagemap_put(loc->dpagemap);
			loc->dpagemap = NULL;
			if (details->dpagemap)
				loc->dpagemap = drm_pagemap_get(details->dpagemap);
		}
	}
}

static void madvise_atomic(struct xe_device *xe, struct xe_vm *vm,
			   struct xe_vma **vmas, int num_vmas,
			   struct drm_xe_madvise *op,
			   struct xe_madvise_details *details)
{
	struct xe_bo *bo;
	int i;

	xe_assert(vm->xe, op->type == DRM_XE_MEM_RANGE_ATTR_ATOMIC);
	xe_assert(vm->xe, op->atomic.val <= DRM_XE_ATOMIC_CPU);

	for (i = 0; i < num_vmas; i++) {
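		/*
		 * Userptr pages cannot migrate to VRAM, so an atomic-access
		 * hint is a no-op for them unless it requests
		 * DRM_XE_ATOMIC_DEVICE and the device supports atomics on
		 * system memory.
		 */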
		if (xe_vma_is_userptr(vmas[i]) &&
		    !(op->atomic.val == DRM_XE_ATOMIC_DEVICE &&
		      xe->info.has_device_atomics_on_smem)) {
			vmas[i]->skip_invalidation = true;
			continue;
		}

		if (vmas[i]->attr.atomic_access == op->atomic.val) {
			vmas[i]->skip_invalidation = true;
		} else {
			vmas[i]->skip_invalidation = false;
			vmas[i]->attr.atomic_access = op->atomic.val;
		}

		bo = xe_vma_bo(vmas[i]);
		if (!bo || bo->attr.atomic_access == op->atomic.val)
			continue;

		vmas[i]->skip_invalidation = false;
		xe_bo_assert_held(bo);
		bo->attr.atomic_access = op->atomic.val;

		/* Invalidate the CPU page table, so the BO can migrate to SMEM on the next access */
		if (xe_bo_is_vram(bo) &&
		    (bo->attr.atomic_access == DRM_XE_ATOMIC_CPU ||
		     bo->attr.atomic_access == DRM_XE_ATOMIC_GLOBAL))
			ttm_bo_unmap_virtual(&bo->ttm);
	}
}

static void madvise_pat_index(struct xe_device *xe, struct xe_vm *vm,
			      struct xe_vma **vmas, int num_vmas,
			      struct drm_xe_madvise *op,
			      struct xe_madvise_details *details)
{
	int i;

	xe_assert(vm->xe, op->type == DRM_XE_MEM_RANGE_ATTR_PAT);

	for (i = 0; i < num_vmas; i++) {
		if (vmas[i]->attr.pat_index == op->pat_index.val) {
			vmas[i]->skip_invalidation = true;
		} else {
			vmas[i]->skip_invalidation = false;
			vmas[i]->attr.pat_index = op->pat_index.val;
		}
	}
}

/**
 * xe_bo_is_dmabuf_shared() - Check if BO is shared via dma-buf
 * @bo: Buffer object
 *
 * Prevent marking imported or exported dma-bufs as purgeable.
 * For imported BOs, Xe doesn't own the backing store and cannot
 * safely reclaim pages (exporter or other devices may still be
 * using them). For exported BOs, external devices may have active
 * mappings we cannot track.
 *
 * Return: true if BO is imported or exported, false otherwise
 */
static bool xe_bo_is_dmabuf_shared(struct xe_bo *bo)
{
	struct drm_gem_object *obj = &bo->ttm.base;

	/* Imported: exporter owns backing store */
	if (drm_gem_is_imported(obj))
		return true;

	/* Exported: external devices may be accessing */
	if (obj->dma_buf)
		return true;

	return false;
}

/**
 * enum xe_bo_vmas_purge_state - VMA purgeable state aggregation
 *
 * Distinguishes whether a BO's VMAs are all DONTNEED, have at least
 * one WILLNEED, or have no VMAs at all.
 *
 * Enum values align with XE_MADV_PURGEABLE_* states for consistency.
 */
enum xe_bo_vmas_purge_state {
	/** @XE_BO_VMAS_STATE_WILLNEED: At least one VMA is WILLNEED */
	XE_BO_VMAS_STATE_WILLNEED = 0,
	/** @XE_BO_VMAS_STATE_DONTNEED: All VMAs are DONTNEED */
	XE_BO_VMAS_STATE_DONTNEED = 1,
	/** @XE_BO_VMAS_STATE_NO_VMAS: BO has no VMAs */
	XE_BO_VMAS_STATE_NO_VMAS = 2,
};

/*
 * xe_bo_recompute_purgeable_state() casts between xe_bo_vmas_purge_state and
 * xe_madv_purgeable_state. Enforce that WILLNEED=0 and DONTNEED=1 match across
 * both enums so the single-line cast is always valid.
 */
static_assert(XE_BO_VMAS_STATE_WILLNEED == (int)XE_MADV_PURGEABLE_WILLNEED,
	      "VMA purge state WILLNEED must equal madv purgeable WILLNEED");
static_assert(XE_BO_VMAS_STATE_DONTNEED == (int)XE_MADV_PURGEABLE_DONTNEED,
	      "VMA purge state DONTNEED must equal madv purgeable DONTNEED");

/**
 * xe_bo_all_vmas_dontneed() - Determine BO VMA purgeable state
 * @bo: Buffer object
 *
 * Check all VMAs across all VMs to determine aggregate purgeable state.
 * Shared BOs require unanimous DONTNEED state from all mappings.
 *
 * Caller must hold the BO dma-resv lock.
 *
 * Return: XE_BO_VMAS_STATE_DONTNEED if all VMAs are DONTNEED,
 *         XE_BO_VMAS_STATE_WILLNEED if at least one VMA is not DONTNEED,
 *         XE_BO_VMAS_STATE_NO_VMAS if BO has no VMAs
 */
static enum xe_bo_vmas_purge_state xe_bo_all_vmas_dontneed(struct xe_bo *bo)
{
	struct drm_gpuvm_bo *vm_bo;
	struct drm_gpuva *gpuva;
	struct drm_gem_object *obj = &bo->ttm.base;
	bool has_vmas = false;

	xe_bo_assert_held(bo);

	/* Shared dma-bufs cannot be purgeable */
	if (xe_bo_is_dmabuf_shared(bo))
		return XE_BO_VMAS_STATE_WILLNEED;

	drm_gem_for_each_gpuvm_bo(vm_bo, obj) {
		drm_gpuvm_bo_for_each_va(gpuva, vm_bo) {
			struct xe_vma *vma = gpuva_to_vma(gpuva);

			has_vmas = true;

			/* Any non-DONTNEED VMA prevents purging */
			if (vma->attr.purgeable_state != XE_MADV_PURGEABLE_DONTNEED)
				return XE_BO_VMAS_STATE_WILLNEED;
		}
	}

	/*
	 * No VMAs => preserve the existing BO purgeable state. This avoids
	 * incorrectly flipping DONTNEED -> WILLNEED when the last VMA is
	 * unmapped.
	 */
	if (!has_vmas)
		return XE_BO_VMAS_STATE_NO_VMAS;

	return XE_BO_VMAS_STATE_DONTNEED;
}

/**
 * xe_bo_recompute_purgeable_state() - Recompute BO purgeable state from VMAs
 * @bo: Buffer object
 *
 * Walk all VMAs to determine if the BO should be purgeable or not.
 * Shared BOs require unanimous DONTNEED state from all mappings.
 * If the BO has no VMAs the existing state is preserved.
 *
 * Locking: Caller must hold the BO dma-resv lock. When iterating GPUVM lists,
 * the VM lock must also be held (write) to prevent concurrent VMA
 * modifications. This is satisfied at both call sites:
 * - xe_vma_destroy(): holds vm->lock write
 * - madvise_purgeable(): holds vm->lock write (from madvise ioctl path)
 *
 * Return: nothing
 */
void xe_bo_recompute_purgeable_state(struct xe_bo *bo)
{
	enum xe_bo_vmas_purge_state vma_state;

	if (!bo)
		return;

	xe_bo_assert_held(bo);

	/*
	 * Once purged, always purged. Cannot transition back to WILLNEED.
	 * This matches i915 semantics where purged BOs are permanently invalid.
	 */
	if (bo->madv_purgeable == XE_MADV_PURGEABLE_PURGED)
		return;

	vma_state = xe_bo_all_vmas_dontneed(bo);

	if (vma_state != (enum xe_bo_vmas_purge_state)bo->madv_purgeable &&
	    vma_state != XE_BO_VMAS_STATE_NO_VMAS)
		xe_bo_set_purgeable_state(bo, (enum xe_madv_purgeable_state)vma_state);
}

/**
 * madvise_purgeable() - Handle purgeable buffer object advice
 * @xe: XE device
 * @vm: VM
 * @vmas: Array of VMAs
 * @num_vmas: Number of VMAs
 * @op: Madvise operation
 * @details: Madvise details for return values
 *
 * Handles DONTNEED/WILLNEED/PURGED states. Tracks if any BO was purged
 * in details->has_purged_bo for later copy to userspace.
 */
static void madvise_purgeable(struct xe_device *xe, struct xe_vm *vm,
			      struct xe_vma **vmas, int num_vmas,
			      struct drm_xe_madvise *op,
			      struct xe_madvise_details *details)
{
	int i;

	xe_assert(vm->xe, op->type == DRM_XE_VMA_ATTR_PURGEABLE_STATE);

	for (i = 0; i < num_vmas; i++) {
		struct xe_bo *bo = xe_vma_bo(vmas[i]);

		if (!bo) {
			/* Purgeable state applies to BOs only, skip non-BO VMAs */
			vmas[i]->skip_invalidation = true;
			continue;
		}

		/* BO must be locked before modifying madv state */
		xe_bo_assert_held(bo);

		/* Skip shared dma-bufs - no PTEs to zap */
		if (xe_bo_is_dmabuf_shared(bo)) {
			vmas[i]->skip_invalidation = true;
			continue;
		}

		/*
		 * Once purged, always purged. Cannot transition back to WILLNEED.
		 * This matches i915 semantics where purged BOs are permanently invalid.
		 */
		if (xe_bo_is_purged(bo)) {
			details->has_purged_bo = true;
			vmas[i]->skip_invalidation = true;
			continue;
		}

		switch (op->purge_state_val.val) {
		case DRM_XE_VMA_PURGEABLE_STATE_WILLNEED:
			vmas[i]->attr.purgeable_state = XE_MADV_PURGEABLE_WILLNEED;
			vmas[i]->skip_invalidation = true;

			xe_bo_recompute_purgeable_state(bo);
			break;
		case DRM_XE_VMA_PURGEABLE_STATE_DONTNEED:
			vmas[i]->attr.purgeable_state = XE_MADV_PURGEABLE_DONTNEED;
			/*
			 * Don't zap PTEs at DONTNEED time -- pages are still
			 * alive. The zap happens in xe_bo_move_notify() right
			 * before the shrinker frees them.
			 */
			vmas[i]->skip_invalidation = true;

			xe_bo_recompute_purgeable_state(bo);
			break;
		default:
			/* Should never hit - values validated in madvise_args_are_sane() */
			xe_assert(vm->xe, 0);
			return;
		}
	}
}

typedef void (*madvise_func)(struct xe_device *xe, struct xe_vm *vm,
			     struct xe_vma **vmas, int num_vmas,
			     struct drm_xe_madvise *op,
			     struct xe_madvise_details *details);

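/*
 * Dispatch table, indexed by the uAPI attribute type. The ioctl handler
 * bounds the index with array_index_nospec() and NULL-checks the entry
 * before calling it.
 */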
static const madvise_func madvise_funcs[] = {
	[DRM_XE_MEM_RANGE_ATTR_PREFERRED_LOC] = madvise_preferred_mem_loc,
	[DRM_XE_MEM_RANGE_ATTR_ATOMIC] = madvise_atomic,
	[DRM_XE_MEM_RANGE_ATTR_PAT] = madvise_pat_index,
	[DRM_XE_VMA_ATTR_PURGEABLE_STATE] = madvise_purgeable,
};

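/*
 * Zap the GPU PTEs of all non-skipped VMAs in [start, end) and return a
 * bitmask of tiles whose TLBs must subsequently be invalidated.
 */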
static u8 xe_zap_ptes_in_madvise_range(struct xe_vm *vm, u64 start, u64 end)
{
	struct drm_gpuva *gpuva;
	struct xe_tile *tile;
	u8 id, tile_mask = 0;

	lockdep_assert_held_write(&vm->lock);

	/* Wait for pending binds */
	if (dma_resv_wait_timeout(xe_vm_resv(vm), DMA_RESV_USAGE_BOOKKEEP,
				  false, MAX_SCHEDULE_TIMEOUT) <= 0)
		XE_WARN_ON(1);

	drm_gpuvm_for_each_va_range(gpuva, &vm->gpuvm, start, end) {
		struct xe_vma *vma = gpuva_to_vma(gpuva);

		if (vma->skip_invalidation || xe_vma_is_null(vma))
			continue;

		if (xe_vma_is_cpu_addr_mirror(vma)) {
			tile_mask |= xe_svm_ranges_zap_ptes_in_range(vm,
								     xe_vma_start(vma),
								     xe_vma_end(vma));
		} else {
			for_each_tile(tile, vm->xe, id) {
				if (xe_pt_zap_ptes(tile, vma)) {
					tile_mask |= BIT(id);

					/*
					 * WRITE_ONCE pairs with READ_ONCE
					 * in xe_vm_has_valid_gpu_mapping()
					 */
					WRITE_ONCE(vma->tile_invalidated,
						   vma->tile_invalidated | BIT(id));
				}
			}
		}
	}

	return tile_mask;
}

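/*
 * Zap PTEs in the range, then issue a ranged TLB invalidation across all
 * affected tiles and wait for it to complete.
 */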
static int xe_vm_invalidate_madvise_range(struct xe_vm *vm, u64 start, u64 end)
{
	u8 tile_mask = xe_zap_ptes_in_madvise_range(vm, start, end);
	struct xe_tlb_inval_batch batch;
	int err;

	if (!tile_mask)
		return 0;

	xe_device_wmb(vm->xe);

	err = xe_tlb_inval_range_tilemask_submit(vm->xe, vm->usm.asid, start, end,
						 tile_mask, &batch);
	if (!err)
		xe_tlb_inval_batch_wait(&batch);

	return err;
}

static bool madvise_args_are_sane(struct xe_device *xe, const struct drm_xe_madvise *args)
{
	if (XE_IOCTL_DBG(xe, !args))
		return false;

	if (XE_IOCTL_DBG(xe, !IS_ALIGNED(args->start, SZ_4K)))
		return false;

	if (XE_IOCTL_DBG(xe, !IS_ALIGNED(args->range, SZ_4K)))
		return false;

	if (XE_IOCTL_DBG(xe, args->range < SZ_4K))
		return false;

	switch (args->type) {
	case DRM_XE_MEM_RANGE_ATTR_PREFERRED_LOC:
	{
		s32 fd = (s32)args->preferred_mem_loc.devmem_fd;

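		/*
		 * devmem_fd doubles as a selector: the
		 * DRM_XE_PREFERRED_LOC_DEFAULT_* sentinels pick a default
		 * placement (no region_instance allowed), while a positive
		 * value names a device fd.
		 */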
		if (XE_IOCTL_DBG(xe, fd < DRM_XE_PREFERRED_LOC_DEFAULT_SYSTEM))
			return false;

		if (XE_IOCTL_DBG(xe, fd <= DRM_XE_PREFERRED_LOC_DEFAULT_DEVICE &&
				 args->preferred_mem_loc.region_instance != 0))
			return false;

		if (XE_IOCTL_DBG(xe, args->preferred_mem_loc.migration_policy >
				     DRM_XE_MIGRATE_ONLY_SYSTEM_PAGES))
			return false;

		if (XE_IOCTL_DBG(xe, args->preferred_mem_loc.reserved))
			return false;
		break;
	}
	case DRM_XE_MEM_RANGE_ATTR_ATOMIC:
		if (XE_IOCTL_DBG(xe, args->atomic.val > DRM_XE_ATOMIC_CPU))
			return false;

		if (XE_IOCTL_DBG(xe, args->atomic.pad))
			return false;

		if (XE_IOCTL_DBG(xe, args->atomic.reserved))
			return false;

		break;
	case DRM_XE_MEM_RANGE_ATTR_PAT:
	{
		u16 pat_index, coh_mode;

		if (XE_IOCTL_DBG(xe, args->pat_index.val >= xe->pat.n_entries))
			return false;

		pat_index = array_index_nospec(args->pat_index.val, xe->pat.n_entries);
		coh_mode = xe_pat_index_get_coh_mode(xe, pat_index);
		if (XE_IOCTL_DBG(xe, !coh_mode))
			return false;

		if (XE_WARN_ON(coh_mode > XE_COH_2WAY))
			return false;

		if (XE_IOCTL_DBG(xe, args->pat_index.pad))
			return false;

		if (XE_IOCTL_DBG(xe, args->pat_index.reserved))
			return false;
		break;
	}
	case DRM_XE_VMA_ATTR_PURGEABLE_STATE:
	{
		u32 val = args->purge_state_val.val;

		if (XE_IOCTL_DBG(xe, !(val == DRM_XE_VMA_PURGEABLE_STATE_WILLNEED ||
				       val == DRM_XE_VMA_PURGEABLE_STATE_DONTNEED)))
			return false;

		if (XE_IOCTL_DBG(xe, args->purge_state_val.pad))
			return false;

		break;
	}
	default:
		if (XE_IOCTL_DBG(xe, 1))
			return false;
	}

	if (XE_IOCTL_DBG(xe, args->reserved[0] || args->reserved[1]))
		return false;

	return true;
}

static int xe_madvise_details_init(struct xe_vm *vm, const struct drm_xe_madvise *args,
				   struct xe_madvise_details *details)
{
	struct xe_device *xe = vm->xe;

	memset(details, 0, sizeof(*details));

	/* Store retained pointer for purgeable state */
	if (args->type == DRM_XE_VMA_ATTR_PURGEABLE_STATE) {
		details->retained_ptr = args->purge_state_val.retained_ptr;
		return 0;
	}

	if (args->type == DRM_XE_MEM_RANGE_ATTR_PREFERRED_LOC) {
		int fd = args->preferred_mem_loc.devmem_fd;
		struct drm_pagemap *dpagemap;

		if (fd <= 0)
			return 0;

		dpagemap = xe_drm_pagemap_from_fd(args->preferred_mem_loc.devmem_fd,
						  args->preferred_mem_loc.region_instance);
		if (XE_IOCTL_DBG(xe, IS_ERR(dpagemap)))
			return PTR_ERR(dpagemap);

		/* Don't allow a foreign placement without a fast interconnect! */
		if (XE_IOCTL_DBG(xe, dpagemap->pagemap->owner != vm->svm.peer.owner)) {
			drm_pagemap_put(dpagemap);
			return -ENOLINK;
		}
		details->dpagemap = dpagemap;
	}

	return 0;
}

static void xe_madvise_details_fini(struct xe_madvise_details *details)
{
	drm_pagemap_put(details->dpagemap);
}

static int xe_madvise_purgeable_retained_to_user(const struct xe_madvise_details *details)
{
	u32 retained;

	if (!details->retained_ptr)
		return 0;

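	/* Report 1 if the backing store survived, 0 if any BO was already purged. */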
	retained = !details->has_purged_bo;

	if (put_user(retained, (u32 __user *)u64_to_user_ptr(details->retained_ptr)))
		return -EFAULT;

	return 0;
}

static bool check_pat_args_are_sane(struct xe_device *xe,
				    struct xe_vmas_in_madvise_range *madvise_range,
				    u16 pat_index)
{
	u16 coh_mode = xe_pat_index_get_coh_mode(xe, pat_index);
	int i;

	/*
	 * Using coh_none with CPU cached buffers is not allowed on iGPU.
	 * On iGPU the GPU shares the LLC with the CPU, so with coh_none
	 * the GPU bypasses CPU caches and reads directly from DRAM,
	 * potentially seeing stale sensitive data from previously freed
	 * pages. On dGPU this restriction does not apply, because the
	 * platform does not provide a non-coherent system memory access
	 * path that would violate the DMA coherency contract.
	 */
	if (coh_mode != XE_COH_NONE || IS_DGFX(xe))
		return true;

	for (i = 0; i < madvise_range->num_vmas; i++) {
		struct xe_vma *vma = madvise_range->vmas[i];
		struct xe_bo *bo = xe_vma_bo(vma);

		if (bo) {
			/* BO with WB caching + COH_NONE is not allowed */
			if (XE_IOCTL_DBG(xe, bo->cpu_caching == DRM_XE_GEM_CPU_CACHING_WB))
				return false;
			/* Imported dma-buf without caching info, assume cached */
			if (XE_IOCTL_DBG(xe, !bo->cpu_caching))
				return false;
		} else if (XE_IOCTL_DBG(xe, xe_vma_is_cpu_addr_mirror(vma) ||
					    xe_vma_is_userptr(vma)))
			/* System memory (userptr/SVM) is always CPU cached */
			return false;
	}

	return true;
}

static bool check_bo_args_are_sane(struct xe_vm *vm, struct xe_vma **vmas,
				   int num_vmas, u32 atomic_val)
{
	struct xe_device *xe = vm->xe;
	struct xe_bo *bo;
	int i;

	for (i = 0; i < num_vmas; i++) {
		bo = xe_vma_bo(vmas[i]);
		if (!bo)
			continue;
		/*
		 * NOTE: The following atomic checks are platform-specific. For example,
		 * if a device supports CXL atomics, these may not be necessary or
		 * may behave differently.
		 */
		if (XE_IOCTL_DBG(xe, atomic_val == DRM_XE_ATOMIC_CPU &&
				 !(bo->flags & XE_BO_FLAG_SYSTEM)))
			return false;

		if (XE_IOCTL_DBG(xe, atomic_val == DRM_XE_ATOMIC_DEVICE &&
				 !(bo->flags & XE_BO_FLAG_VRAM0) &&
				 !(bo->flags & XE_BO_FLAG_VRAM1) &&
				 !(bo->flags & XE_BO_FLAG_SYSTEM &&
				   xe->info.has_device_atomics_on_smem)))
			return false;

		if (XE_IOCTL_DBG(xe, atomic_val == DRM_XE_ATOMIC_GLOBAL &&
				 (!(bo->flags & XE_BO_FLAG_SYSTEM) ||
				  (!(bo->flags & XE_BO_FLAG_VRAM0) &&
				   !(bo->flags & XE_BO_FLAG_VRAM1)))))
			return false;
	}
	return true;
}

/**
 * xe_vm_madvise_ioctl - Handle MADVISE ioctl for a VM
 * @dev: DRM device pointer
 * @data: Pointer to ioctl data (drm_xe_madvise*)
 * @file: DRM file pointer
 *
 * Handles the MADVISE ioctl to provide memory advice for VMAs within
 * the input range.
 *
 * Return: 0 on success or a negative error code on failure.
 */
int xe_vm_madvise_ioctl(struct drm_device *dev, void *data, struct drm_file *file)
{
	struct xe_device *xe = to_xe_device(dev);
	struct xe_file *xef = to_xe_file(file);
	struct drm_xe_madvise *args = data;
	struct xe_vmas_in_madvise_range madvise_range = {
		/*
		 * Userspace may pass canonical (sign-extended) addresses.
		 * Strip the sign extension to get the internal non-canonical
		 * form used by the GPUVM, matching xe_vm_bind_ioctl() behavior.
		 */
		.addr = xe_device_uncanonicalize_addr(xe, args->start),
		.range = args->range,
	};
	struct xe_madvise_details details;
	u16 pat_index, coh_mode;
	struct xe_vm *vm;
	struct drm_exec exec;
	int err, attr_type;
	bool do_retained;

	vm = xe_vm_lookup(xef, args->vm_id);
	if (XE_IOCTL_DBG(xe, !vm))
		return -EINVAL;

	if (!madvise_args_are_sane(vm->xe, args)) {
		err = -EINVAL;
		goto put_vm;
	}

	/* Cache whether we need to write retained, and validate it's initialized to 0 */
	do_retained = args->type == DRM_XE_VMA_ATTR_PURGEABLE_STATE &&
		      args->purge_state_val.retained_ptr;
	if (do_retained) {
		u32 retained;
		u32 __user *retained_ptr;

		retained_ptr = u64_to_user_ptr(args->purge_state_val.retained_ptr);
		if (get_user(retained, retained_ptr)) {
			err = -EFAULT;
			goto put_vm;
		}

		if (XE_IOCTL_DBG(xe, retained != 0)) {
			err = -EINVAL;
			goto put_vm;
		}
	}

	xe_svm_flush(vm);

	err = down_write_killable(&vm->lock);
	if (err)
		goto put_vm;

	if (XE_IOCTL_DBG(xe, xe_vm_is_closed_or_banned(vm))) {
		err = -ENOENT;
		goto unlock_vm;
	}

	err = xe_madvise_details_init(vm, args, &details);
	if (err)
		goto unlock_vm;

	err = xe_vm_alloc_madvise_vma(vm, madvise_range.addr, args->range);
	if (err)
		goto madv_fini;

	err = get_vmas(vm, &madvise_range);
	if (err || !madvise_range.num_vmas)
		goto madv_fini;

	if (args->type == DRM_XE_MEM_RANGE_ATTR_PAT) {
		pat_index = array_index_nospec(args->pat_index.val, xe->pat.n_entries);
		coh_mode = xe_pat_index_get_coh_mode(xe, pat_index);
		if (XE_IOCTL_DBG(xe, madvise_range.has_svm_userptr_vmas &&
				 xe_device_is_l2_flush_optimized(xe) &&
				 (pat_index != 19 && coh_mode != XE_COH_2WAY))) {
			err = -EINVAL;
			goto madv_fini;
		}

		if (!check_pat_args_are_sane(xe, &madvise_range,
					     args->pat_index.val)) {
			err = -EINVAL;
			goto free_vmas;
		}
	}

	if (madvise_range.has_bo_vmas) {
		if (args->type == DRM_XE_MEM_RANGE_ATTR_ATOMIC) {
			if (!check_bo_args_are_sane(vm, madvise_range.vmas,
						    madvise_range.num_vmas,
						    args->atomic.val)) {
				err = -EINVAL;
				goto free_vmas;
			}
		}

		drm_exec_init(&exec, DRM_EXEC_IGNORE_DUPLICATES | DRM_EXEC_INTERRUPTIBLE_WAIT, 0);
		drm_exec_until_all_locked(&exec) {
			for (int i = 0; i < madvise_range.num_vmas; i++) {
				struct xe_bo *bo = xe_vma_bo(madvise_range.vmas[i]);

				if (!bo)
					continue;

				if (args->type == DRM_XE_MEM_RANGE_ATTR_PAT) {
					if (XE_IOCTL_DBG(xe, bo->ttm.base.import_attach &&
							 xe_device_is_l2_flush_optimized(xe) &&
							 (pat_index != 19 &&
							  coh_mode != XE_COH_2WAY))) {
						err = -EINVAL;
						goto err_fini;
					}
				}

				err = drm_exec_lock_obj(&exec, &bo->ttm.base);
				drm_exec_retry_on_contention(&exec);
				if (err)
					goto err_fini;
			}
		}
	}

	attr_type = array_index_nospec(args->type, ARRAY_SIZE(madvise_funcs));

	/*
	 * Ensure the madvise function exists for this type. Check before
	 * taking the SVM notifier lock so the error path need not unlock it.
	 */
	if (!madvise_funcs[attr_type]) {
		err = -EINVAL;
		goto err_fini;
	}

	if (madvise_range.has_svm_userptr_vmas) {
		err = xe_svm_notifier_lock_interruptible(vm);
		if (err)
			goto err_fini;
	}

	madvise_funcs[attr_type](xe, vm, madvise_range.vmas, madvise_range.num_vmas, args,
				 &details);

	err = xe_vm_invalidate_madvise_range(vm, madvise_range.addr,
					     madvise_range.addr + args->range);

	if (madvise_range.has_svm_userptr_vmas)
		xe_svm_notifier_unlock(vm);

err_fini:
	if (madvise_range.has_bo_vmas)
		drm_exec_fini(&exec);
free_vmas:
	kfree(madvise_range.vmas);
	madvise_range.vmas = NULL;
madv_fini:
	xe_madvise_details_fini(&details);
unlock_vm:
	up_write(&vm->lock);

	/* Write the retained value to the user after releasing all locks */
	if (!err && do_retained)
		err = xe_madvise_purgeable_retained_to_user(&details);
put_vm:
	xe_vm_put(vm);
	return err;
}
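
/*
 * Illustrative userspace sketch (not part of this file): one way a client
 * might invoke this ioctl to mark a range as preferring the default device
 * placement. Struct and field names match the uAPI used above; the ioctl
 * request macro DRM_IOCTL_XE_MADVISE and the DRM_XE_PREFERRED_LOC_DEFAULT_*
 * sentinels are assumed from <drm/xe_drm.h> at this revision. Both start
 * and range must be 4 KiB aligned, with range at least 4 KiB (see
 * madvise_args_are_sane()).
 *
 *	struct drm_xe_madvise args = {
 *		.vm_id = vm_id,
 *		.start = addr,
 *		.range = 64 * 1024,
 *		.type = DRM_XE_MEM_RANGE_ATTR_PREFERRED_LOC,
 *		.preferred_mem_loc = {
 *			.devmem_fd = DRM_XE_PREFERRED_LOC_DEFAULT_DEVICE,
 *		},
 *	};
 *
 *	if (ioctl(drm_fd, DRM_IOCTL_XE_MADVISE, &args))
 *		perror("DRM_IOCTL_XE_MADVISE");
 */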