xref: /linux/drivers/gpu/drm/xe/xe_vm_madvise.c (revision b6c0783ff278671e38fed978fefb732101ac8836)
1 // SPDX-License-Identifier: MIT
2 /*
3  * Copyright © 2025 Intel Corporation
4  */
5 
6 #include "xe_vm_madvise.h"
7 
8 #include <linux/nospec.h>
9 #include <drm/xe_drm.h>
10 
11 #include "xe_bo.h"
12 #include "xe_pat.h"
13 #include "xe_pt.h"
14 #include "xe_svm.h"
15 #include "xe_tlb_inval.h"
16 
17 struct xe_vmas_in_madvise_range {
18 	u64 addr;
19 	u64 range;
20 	struct xe_vma **vmas;
21 	int num_vmas;
22 	bool has_bo_vmas;
23 	bool has_svm_userptr_vmas;
24 };
25 
/**
 * struct xe_madvise_details - Extra argument handed to the madvise_funcs
 * @dpagemap: Reference-counted pointer to a struct drm_pagemap, or NULL
 *	      when the request does not resolve to one.
 *
 * Besides the user-space arguments, the madvise IOCTL handler may need
 * to pass additional, kernel-derived information to the madvise_func
 * that implements a given madvise type. A struct xe_madvise_details
 * carries that information; extend the struct as necessary.
 */
struct xe_madvise_details {
	struct drm_pagemap *dpagemap;
};
38 
39 static int get_vmas(struct xe_vm *vm, struct xe_vmas_in_madvise_range *madvise_range)
40 {
41 	u64 addr = madvise_range->addr;
42 	u64 range = madvise_range->range;
43 
44 	struct xe_vma  **__vmas;
45 	struct drm_gpuva *gpuva;
46 	int max_vmas = 8;
47 
48 	lockdep_assert_held(&vm->lock);
49 
50 	madvise_range->num_vmas = 0;
51 	madvise_range->vmas = kmalloc_objs(*madvise_range->vmas, max_vmas);
52 	if (!madvise_range->vmas)
53 		return -ENOMEM;
54 
55 	vm_dbg(&vm->xe->drm, "VMA's in range: start=0x%016llx, end=0x%016llx", addr, addr + range);
56 
57 	drm_gpuvm_for_each_va_range(gpuva, &vm->gpuvm, addr, addr + range) {
58 		struct xe_vma *vma = gpuva_to_vma(gpuva);
59 
60 		if (xe_vma_bo(vma))
61 			madvise_range->has_bo_vmas = true;
62 		else if (xe_vma_is_cpu_addr_mirror(vma) || xe_vma_is_userptr(vma))
63 			madvise_range->has_svm_userptr_vmas = true;
64 
65 		if (madvise_range->num_vmas == max_vmas) {
66 			max_vmas <<= 1;
67 			__vmas = krealloc(madvise_range->vmas,
68 					  max_vmas * sizeof(*madvise_range->vmas),
69 					  GFP_KERNEL);
70 			if (!__vmas) {
71 				kfree(madvise_range->vmas);
72 				return -ENOMEM;
73 			}
74 			madvise_range->vmas = __vmas;
75 		}
76 
77 		madvise_range->vmas[madvise_range->num_vmas] = vma;
78 		(madvise_range->num_vmas)++;
79 	}
80 
81 	if (!madvise_range->num_vmas)
82 		kfree(madvise_range->vmas);
83 
84 	vm_dbg(&vm->xe->drm, "madvise_range-num_vmas = %d\n", madvise_range->num_vmas);
85 
86 	return 0;
87 }
88 
89 static void madvise_preferred_mem_loc(struct xe_device *xe, struct xe_vm *vm,
90 				      struct xe_vma **vmas, int num_vmas,
91 				      struct drm_xe_madvise *op,
92 				      struct xe_madvise_details *details)
93 {
94 	int i;
95 
96 	xe_assert(vm->xe, op->type == DRM_XE_MEM_RANGE_ATTR_PREFERRED_LOC);
97 
98 	for (i = 0; i < num_vmas; i++) {
99 		struct xe_vma *vma = vmas[i];
100 		struct xe_vma_preferred_loc *loc = &vma->attr.preferred_loc;
101 
102 		/*TODO: Extend attributes to bo based vmas */
103 		if ((loc->devmem_fd == op->preferred_mem_loc.devmem_fd &&
104 		     loc->migration_policy == op->preferred_mem_loc.migration_policy) ||
105 		    !xe_vma_is_cpu_addr_mirror(vma)) {
106 			vma->skip_invalidation = true;
107 		} else {
108 			vma->skip_invalidation = false;
109 			loc->devmem_fd = op->preferred_mem_loc.devmem_fd;
110 			/* Till multi-device support is not added migration_policy
111 			 * is of no use and can be ignored.
112 			 */
113 			loc->migration_policy = op->preferred_mem_loc.migration_policy;
114 			drm_pagemap_put(loc->dpagemap);
115 			loc->dpagemap = NULL;
116 			if (details->dpagemap)
117 				loc->dpagemap = drm_pagemap_get(details->dpagemap);
118 		}
119 	}
120 }
121 
122 static void madvise_atomic(struct xe_device *xe, struct xe_vm *vm,
123 			   struct xe_vma **vmas, int num_vmas,
124 			   struct drm_xe_madvise *op,
125 			   struct xe_madvise_details *details)
126 {
127 	struct xe_bo *bo;
128 	int i;
129 
130 	xe_assert(vm->xe, op->type == DRM_XE_MEM_RANGE_ATTR_ATOMIC);
131 	xe_assert(vm->xe, op->atomic.val <= DRM_XE_ATOMIC_CPU);
132 
133 	for (i = 0; i < num_vmas; i++) {
134 		if (xe_vma_is_userptr(vmas[i]) &&
135 		    !(op->atomic.val == DRM_XE_ATOMIC_DEVICE &&
136 		      xe->info.has_device_atomics_on_smem)) {
137 			vmas[i]->skip_invalidation = true;
138 			continue;
139 		}
140 
141 		if (vmas[i]->attr.atomic_access == op->atomic.val) {
142 			vmas[i]->skip_invalidation = true;
143 		} else {
144 			vmas[i]->skip_invalidation = false;
145 			vmas[i]->attr.atomic_access = op->atomic.val;
146 		}
147 
148 		bo = xe_vma_bo(vmas[i]);
149 		if (!bo || bo->attr.atomic_access == op->atomic.val)
150 			continue;
151 
152 		vmas[i]->skip_invalidation = false;
153 		xe_bo_assert_held(bo);
154 		bo->attr.atomic_access = op->atomic.val;
155 
156 		/* Invalidate cpu page table, so bo can migrate to smem in next access */
157 		if (xe_bo_is_vram(bo) &&
158 		    (bo->attr.atomic_access == DRM_XE_ATOMIC_CPU ||
159 		     bo->attr.atomic_access == DRM_XE_ATOMIC_GLOBAL))
160 			ttm_bo_unmap_virtual(&bo->ttm);
161 	}
162 }
163 
164 static void madvise_pat_index(struct xe_device *xe, struct xe_vm *vm,
165 			      struct xe_vma **vmas, int num_vmas,
166 			      struct drm_xe_madvise *op,
167 			      struct xe_madvise_details *details)
168 {
169 	int i;
170 
171 	xe_assert(vm->xe, op->type == DRM_XE_MEM_RANGE_ATTR_PAT);
172 
173 	for (i = 0; i < num_vmas; i++) {
174 		if (vmas[i]->attr.pat_index == op->pat_index.val) {
175 			vmas[i]->skip_invalidation = true;
176 		} else {
177 			vmas[i]->skip_invalidation = false;
178 			vmas[i]->attr.pat_index = op->pat_index.val;
179 		}
180 	}
181 }
182 
183 typedef void (*madvise_func)(struct xe_device *xe, struct xe_vm *vm,
184 			     struct xe_vma **vmas, int num_vmas,
185 			     struct drm_xe_madvise *op,
186 			     struct xe_madvise_details *details);
187 
188 static const madvise_func madvise_funcs[] = {
189 	[DRM_XE_MEM_RANGE_ATTR_PREFERRED_LOC] = madvise_preferred_mem_loc,
190 	[DRM_XE_MEM_RANGE_ATTR_ATOMIC] = madvise_atomic,
191 	[DRM_XE_MEM_RANGE_ATTR_PAT] = madvise_pat_index,
192 };
193 
194 static u8 xe_zap_ptes_in_madvise_range(struct xe_vm *vm, u64 start, u64 end)
195 {
196 	struct drm_gpuva *gpuva;
197 	struct xe_tile *tile;
198 	u8 id, tile_mask = 0;
199 
200 	lockdep_assert_held_write(&vm->lock);
201 
202 	/* Wait for pending binds */
203 	if (dma_resv_wait_timeout(xe_vm_resv(vm), DMA_RESV_USAGE_BOOKKEEP,
204 				  false, MAX_SCHEDULE_TIMEOUT) <= 0)
205 		XE_WARN_ON(1);
206 
207 	drm_gpuvm_for_each_va_range(gpuva, &vm->gpuvm, start, end) {
208 		struct xe_vma *vma = gpuva_to_vma(gpuva);
209 
210 		if (vma->skip_invalidation || xe_vma_is_null(vma))
211 			continue;
212 
213 		if (xe_vma_is_cpu_addr_mirror(vma)) {
214 			tile_mask |= xe_svm_ranges_zap_ptes_in_range(vm,
215 								      xe_vma_start(vma),
216 								      xe_vma_end(vma));
217 		} else {
218 			for_each_tile(tile, vm->xe, id) {
219 				if (xe_pt_zap_ptes(tile, vma)) {
220 					tile_mask |= BIT(id);
221 
222 					/*
223 					 * WRITE_ONCE pairs with READ_ONCE
224 					 * in xe_vm_has_valid_gpu_mapping()
225 					 */
226 					WRITE_ONCE(vma->tile_invalidated,
227 						   vma->tile_invalidated | BIT(id));
228 				}
229 			}
230 		}
231 	}
232 
233 	return tile_mask;
234 }
235 
236 static int xe_vm_invalidate_madvise_range(struct xe_vm *vm, u64 start, u64 end)
237 {
238 	u8 tile_mask = xe_zap_ptes_in_madvise_range(vm, start, end);
239 	struct xe_tlb_inval_batch batch;
240 	int err;
241 
242 	if (!tile_mask)
243 		return 0;
244 
245 	xe_device_wmb(vm->xe);
246 
247 	err = xe_tlb_inval_range_tilemask_submit(vm->xe, vm->usm.asid, start, end,
248 						 tile_mask, &batch);
249 	if (!err)
250 		xe_tlb_inval_batch_wait(&batch);
251 
252 	return err;
253 }
254 
255 static bool madvise_args_are_sane(struct xe_device *xe, const struct drm_xe_madvise *args)
256 {
257 	if (XE_IOCTL_DBG(xe, !args))
258 		return false;
259 
260 	if (XE_IOCTL_DBG(xe, !IS_ALIGNED(args->start, SZ_4K)))
261 		return false;
262 
263 	if (XE_IOCTL_DBG(xe, !IS_ALIGNED(args->range, SZ_4K)))
264 		return false;
265 
266 	if (XE_IOCTL_DBG(xe, args->range < SZ_4K))
267 		return false;
268 
269 	switch (args->type) {
270 	case DRM_XE_MEM_RANGE_ATTR_PREFERRED_LOC:
271 	{
272 		s32 fd = (s32)args->preferred_mem_loc.devmem_fd;
273 
274 		if (XE_IOCTL_DBG(xe, fd < DRM_XE_PREFERRED_LOC_DEFAULT_SYSTEM))
275 			return false;
276 
277 		if (XE_IOCTL_DBG(xe, fd <= DRM_XE_PREFERRED_LOC_DEFAULT_DEVICE &&
278 				 args->preferred_mem_loc.region_instance != 0))
279 			return false;
280 
281 		if (XE_IOCTL_DBG(xe, args->preferred_mem_loc.migration_policy >
282 				     DRM_XE_MIGRATE_ONLY_SYSTEM_PAGES))
283 			return false;
284 
285 		if (XE_IOCTL_DBG(xe, args->preferred_mem_loc.reserved))
286 			return false;
287 		break;
288 	}
289 	case DRM_XE_MEM_RANGE_ATTR_ATOMIC:
290 		if (XE_IOCTL_DBG(xe, args->atomic.val > DRM_XE_ATOMIC_CPU))
291 			return false;
292 
293 		if (XE_IOCTL_DBG(xe, args->atomic.pad))
294 			return false;
295 
296 		if (XE_IOCTL_DBG(xe, args->atomic.reserved))
297 			return false;
298 
299 		break;
300 	case DRM_XE_MEM_RANGE_ATTR_PAT:
301 	{
302 		u16 pat_index, coh_mode;
303 
304 		if (XE_IOCTL_DBG(xe, args->pat_index.val >= xe->pat.n_entries))
305 			return false;
306 
307 		pat_index = array_index_nospec(args->pat_index.val, xe->pat.n_entries);
308 		coh_mode = xe_pat_index_get_coh_mode(xe, pat_index);
309 		if (XE_IOCTL_DBG(xe, !coh_mode))
310 			return false;
311 
312 		if (XE_WARN_ON(coh_mode > XE_COH_AT_LEAST_1WAY))
313 			return false;
314 
315 		if (XE_IOCTL_DBG(xe, args->pat_index.pad))
316 			return false;
317 
318 		if (XE_IOCTL_DBG(xe, args->pat_index.reserved))
319 			return false;
320 		break;
321 	}
322 	default:
323 		if (XE_IOCTL_DBG(xe, 1))
324 			return false;
325 	}
326 
327 	if (XE_IOCTL_DBG(xe, args->reserved[0] || args->reserved[1]))
328 		return false;
329 
330 	return true;
331 }
332 
333 static int xe_madvise_details_init(struct xe_vm *vm, const struct drm_xe_madvise *args,
334 				   struct xe_madvise_details *details)
335 {
336 	struct xe_device *xe = vm->xe;
337 
338 	memset(details, 0, sizeof(*details));
339 
340 	if (args->type == DRM_XE_MEM_RANGE_ATTR_PREFERRED_LOC) {
341 		int fd = args->preferred_mem_loc.devmem_fd;
342 		struct drm_pagemap *dpagemap;
343 
344 		if (fd <= 0)
345 			return 0;
346 
347 		dpagemap = xe_drm_pagemap_from_fd(args->preferred_mem_loc.devmem_fd,
348 						  args->preferred_mem_loc.region_instance);
349 		if (XE_IOCTL_DBG(xe, IS_ERR(dpagemap)))
350 			return PTR_ERR(dpagemap);
351 
352 		/* Don't allow a foreign placement without a fast interconnect! */
353 		if (XE_IOCTL_DBG(xe, dpagemap->pagemap->owner != vm->svm.peer.owner)) {
354 			drm_pagemap_put(dpagemap);
355 			return -ENOLINK;
356 		}
357 		details->dpagemap = dpagemap;
358 	}
359 
360 	return 0;
361 }
362 
363 static void xe_madvise_details_fini(struct xe_madvise_details *details)
364 {
365 	drm_pagemap_put(details->dpagemap);
366 }
367 
368 static bool check_bo_args_are_sane(struct xe_vm *vm, struct xe_vma **vmas,
369 				   int num_vmas, u32 atomic_val)
370 {
371 	struct xe_device *xe = vm->xe;
372 	struct xe_bo *bo;
373 	int i;
374 
375 	for (i = 0; i < num_vmas; i++) {
376 		bo = xe_vma_bo(vmas[i]);
377 		if (!bo)
378 			continue;
379 		/*
380 		 * NOTE: The following atomic checks are platform-specific. For example,
381 		 * if a device supports CXL atomics, these may not be necessary or
382 		 * may behave differently.
383 		 */
384 		if (XE_IOCTL_DBG(xe, atomic_val == DRM_XE_ATOMIC_CPU &&
385 				 !(bo->flags & XE_BO_FLAG_SYSTEM)))
386 			return false;
387 
388 		if (XE_IOCTL_DBG(xe, atomic_val == DRM_XE_ATOMIC_DEVICE &&
389 				 !(bo->flags & XE_BO_FLAG_VRAM0) &&
390 				 !(bo->flags & XE_BO_FLAG_VRAM1) &&
391 				 !(bo->flags & XE_BO_FLAG_SYSTEM &&
392 				   xe->info.has_device_atomics_on_smem)))
393 			return false;
394 
395 		if (XE_IOCTL_DBG(xe, atomic_val == DRM_XE_ATOMIC_GLOBAL &&
396 				 (!(bo->flags & XE_BO_FLAG_SYSTEM) ||
397 				  (!(bo->flags & XE_BO_FLAG_VRAM0) &&
398 				   !(bo->flags & XE_BO_FLAG_VRAM1)))))
399 			return false;
400 	}
401 	return true;
402 }
403 /**
404  * xe_vm_madvise_ioctl - Handle MADVise ioctl for a VM
405  * @dev: DRM device pointer
406  * @data: Pointer to ioctl data (drm_xe_madvise*)
407  * @file: DRM file pointer
408  *
409  * Handles the MADVISE ioctl to provide memory advice for vma's within
410  * input range.
411  *
412  * Return: 0 on success or a negative error code on failure.
413  */
414 int xe_vm_madvise_ioctl(struct drm_device *dev, void *data, struct drm_file *file)
415 {
416 	struct xe_device *xe = to_xe_device(dev);
417 	struct xe_file *xef = to_xe_file(file);
418 	struct drm_xe_madvise *args = data;
419 	struct xe_vmas_in_madvise_range madvise_range = {.addr = args->start,
420 							 .range =  args->range, };
421 	struct xe_madvise_details details;
422 	struct xe_vm *vm;
423 	struct drm_exec exec;
424 	int err, attr_type;
425 
426 	vm = xe_vm_lookup(xef, args->vm_id);
427 	if (XE_IOCTL_DBG(xe, !vm))
428 		return -EINVAL;
429 
430 	if (!madvise_args_are_sane(vm->xe, args)) {
431 		err = -EINVAL;
432 		goto put_vm;
433 	}
434 
435 	xe_svm_flush(vm);
436 
437 	err = down_write_killable(&vm->lock);
438 	if (err)
439 		goto put_vm;
440 
441 	if (XE_IOCTL_DBG(xe, xe_vm_is_closed_or_banned(vm))) {
442 		err = -ENOENT;
443 		goto unlock_vm;
444 	}
445 
446 	err = xe_madvise_details_init(vm, args, &details);
447 	if (err)
448 		goto unlock_vm;
449 
450 	err = xe_vm_alloc_madvise_vma(vm, args->start, args->range);
451 	if (err)
452 		goto madv_fini;
453 
454 	err = get_vmas(vm, &madvise_range);
455 	if (err || !madvise_range.num_vmas)
456 		goto madv_fini;
457 
458 	if (madvise_range.has_bo_vmas) {
459 		if (args->type == DRM_XE_MEM_RANGE_ATTR_ATOMIC) {
460 			if (!check_bo_args_are_sane(vm, madvise_range.vmas,
461 						    madvise_range.num_vmas,
462 						    args->atomic.val)) {
463 				err = -EINVAL;
464 				goto free_vmas;
465 			}
466 		}
467 
468 		drm_exec_init(&exec, DRM_EXEC_IGNORE_DUPLICATES | DRM_EXEC_INTERRUPTIBLE_WAIT, 0);
469 		drm_exec_until_all_locked(&exec) {
470 			for (int i = 0; i < madvise_range.num_vmas; i++) {
471 				struct xe_bo *bo = xe_vma_bo(madvise_range.vmas[i]);
472 
473 				if (!bo)
474 					continue;
475 				err = drm_exec_lock_obj(&exec, &bo->ttm.base);
476 				drm_exec_retry_on_contention(&exec);
477 				if (err)
478 					goto err_fini;
479 			}
480 		}
481 	}
482 
483 	if (madvise_range.has_svm_userptr_vmas) {
484 		err = xe_svm_notifier_lock_interruptible(vm);
485 		if (err)
486 			goto err_fini;
487 	}
488 
489 	attr_type = array_index_nospec(args->type, ARRAY_SIZE(madvise_funcs));
490 	madvise_funcs[attr_type](xe, vm, madvise_range.vmas, madvise_range.num_vmas, args,
491 				 &details);
492 
493 	err = xe_vm_invalidate_madvise_range(vm, args->start, args->start + args->range);
494 
495 	if (madvise_range.has_svm_userptr_vmas)
496 		xe_svm_notifier_unlock(vm);
497 
498 err_fini:
499 	if (madvise_range.has_bo_vmas)
500 		drm_exec_fini(&exec);
501 free_vmas:
502 	kfree(madvise_range.vmas);
503 	madvise_range.vmas = NULL;
504 madv_fini:
505 	xe_madvise_details_fini(&details);
506 unlock_vm:
507 	up_write(&vm->lock);
508 put_vm:
509 	xe_vm_put(vm);
510 	return err;
511 }
512