// SPDX-License-Identifier: MIT
/*
 * Copyright © 2025 Intel Corporation
 */

#include "xe_vm_madvise.h"

#include <linux/nospec.h>
#include <drm/xe_drm.h>

#include "xe_bo.h"
#include "xe_pat.h"
#include "xe_pt.h"
#include "xe_svm.h"

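/**
 * struct xe_vmas_in_madvise_range - VMAs overlapping a madvise range
 * @addr: Start address of the madvise range
 * @range: Size of the madvise range in bytes
 * @vmas: Array of VMAs overlapping the range, allocated by get_vmas()
 * @num_vmas: Number of entries in @vmas
 * @has_bo_vmas: True if at least one VMA in the range is BO-backed
 * @has_svm_userptr_vmas: True if at least one VMA in the range is a CPU
 * address mirror or userptr VMA
 */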
struct xe_vmas_in_madvise_range {
	u64 addr;
	u64 range;
	struct xe_vma **vmas;
	int num_vmas;
	bool has_bo_vmas;
	bool has_svm_userptr_vmas;
};

/**
 * struct xe_madvise_details - Argument to madvise_funcs
 * @dpagemap: Reference-counted pointer to a struct drm_pagemap.
 *
 * The madvise IOCTL handler may, in addition to the user-space
 * args, have additional info to pass into the madvise_func that
 * handles the madvise type. Use a struct xe_madvise_details
 * for that and extend the struct as necessary.
 */
struct xe_madvise_details {
	struct drm_pagemap *dpagemap;
};

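/*
 * Collect all VMAs overlapping the madvise range into a dynamically grown
 * array and record whether any of them are BO-backed or SVM/userptr VMAs.
 */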
static int get_vmas(struct xe_vm *vm, struct xe_vmas_in_madvise_range *madvise_range)
{
	u64 addr = madvise_range->addr;
	u64 range = madvise_range->range;

	struct xe_vma **__vmas;
	struct drm_gpuva *gpuva;
	int max_vmas = 8;

	lockdep_assert_held(&vm->lock);

	madvise_range->num_vmas = 0;
	madvise_range->vmas = kmalloc_objs(*madvise_range->vmas, max_vmas);
	if (!madvise_range->vmas)
		return -ENOMEM;

	vm_dbg(&vm->xe->drm, "VMAs in range: start=0x%016llx, end=0x%016llx", addr, addr + range);

	drm_gpuvm_for_each_va_range(gpuva, &vm->gpuvm, addr, addr + range) {
		struct xe_vma *vma = gpuva_to_vma(gpuva);

		if (xe_vma_bo(vma))
			madvise_range->has_bo_vmas = true;
		else if (xe_vma_is_cpu_addr_mirror(vma) || xe_vma_is_userptr(vma))
			madvise_range->has_svm_userptr_vmas = true;

		if (madvise_range->num_vmas == max_vmas) {
			max_vmas <<= 1;
			__vmas = krealloc(madvise_range->vmas,
					  max_vmas * sizeof(*madvise_range->vmas),
					  GFP_KERNEL);
			if (!__vmas) {
				kfree(madvise_range->vmas);
				return -ENOMEM;
			}
			madvise_range->vmas = __vmas;
		}

		madvise_range->vmas[madvise_range->num_vmas] = vma;
		(madvise_range->num_vmas)++;
	}

	if (!madvise_range->num_vmas)
		kfree(madvise_range->vmas);

	vm_dbg(&vm->xe->drm, "madvise_range->num_vmas = %d\n", madvise_range->num_vmas);

	return 0;
}

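/*
 * Update the preferred memory location attribute of each CPU address mirror
 * VMA in the range. VMAs whose preferred location already matches the
 * requested one are flagged to skip invalidation.
 */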
static void madvise_preferred_mem_loc(struct xe_device *xe, struct xe_vm *vm,
				      struct xe_vma **vmas, int num_vmas,
				      struct drm_xe_madvise *op,
				      struct xe_madvise_details *details)
{
	int i;

	xe_assert(vm->xe, op->type == DRM_XE_MEM_RANGE_ATTR_PREFERRED_LOC);

	for (i = 0; i < num_vmas; i++) {
		struct xe_vma *vma = vmas[i];
		struct xe_vma_preferred_loc *loc = &vma->attr.preferred_loc;

		/* TODO: Extend attributes to BO-based VMAs */
		if ((loc->devmem_fd == op->preferred_mem_loc.devmem_fd &&
		     loc->migration_policy == op->preferred_mem_loc.migration_policy) ||
		    !xe_vma_is_cpu_addr_mirror(vma)) {
			vma->skip_invalidation = true;
		} else {
			vma->skip_invalidation = false;
			loc->devmem_fd = op->preferred_mem_loc.devmem_fd;
			/*
			 * Until multi-device support is added, migration_policy
			 * is unused and can be ignored.
			 */
			loc->migration_policy = op->preferred_mem_loc.migration_policy;
			drm_pagemap_put(loc->dpagemap);
			loc->dpagemap = NULL;
			if (details->dpagemap)
				loc->dpagemap = drm_pagemap_get(details->dpagemap);
		}
	}
}

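/*
 * Apply the requested atomic access mode to each VMA (and its backing BO, if
 * any) in the range. VMAs whose attribute already matches are flagged to skip
 * invalidation; VRAM BOs switching to CPU or GLOBAL atomics additionally get
 * their CPU mappings unmapped so the next access can migrate them to SMEM.
 */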
static void madvise_atomic(struct xe_device *xe, struct xe_vm *vm,
			   struct xe_vma **vmas, int num_vmas,
			   struct drm_xe_madvise *op,
			   struct xe_madvise_details *details)
{
	struct xe_bo *bo;
	int i;

	xe_assert(vm->xe, op->type == DRM_XE_MEM_RANGE_ATTR_ATOMIC);
	xe_assert(vm->xe, op->atomic.val <= DRM_XE_ATOMIC_CPU);

	for (i = 0; i < num_vmas; i++) {
		if (xe_vma_is_userptr(vmas[i]) &&
		    !(op->atomic.val == DRM_XE_ATOMIC_DEVICE &&
		      xe->info.has_device_atomics_on_smem)) {
			vmas[i]->skip_invalidation = true;
			continue;
		}

		if (vmas[i]->attr.atomic_access == op->atomic.val) {
			vmas[i]->skip_invalidation = true;
		} else {
			vmas[i]->skip_invalidation = false;
			vmas[i]->attr.atomic_access = op->atomic.val;
		}

		bo = xe_vma_bo(vmas[i]);
		if (!bo || bo->attr.atomic_access == op->atomic.val)
			continue;

		vmas[i]->skip_invalidation = false;
		xe_bo_assert_held(bo);
		bo->attr.atomic_access = op->atomic.val;

		/* Invalidate cpu page table, so bo can migrate to smem in next access */
		if (xe_bo_is_vram(bo) &&
		    (bo->attr.atomic_access == DRM_XE_ATOMIC_CPU ||
		     bo->attr.atomic_access == DRM_XE_ATOMIC_GLOBAL))
			ttm_bo_unmap_virtual(&bo->ttm);
	}
}

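/*
 * Apply the requested PAT index to each VMA in the range, flagging VMAs whose
 * PAT index already matches the requested value to skip invalidation.
 */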
static void madvise_pat_index(struct xe_device *xe, struct xe_vm *vm,
			      struct xe_vma **vmas, int num_vmas,
			      struct drm_xe_madvise *op,
			      struct xe_madvise_details *details)
{
	int i;

	xe_assert(vm->xe, op->type == DRM_XE_MEM_RANGE_ATTR_PAT);

	for (i = 0; i < num_vmas; i++) {
		if (vmas[i]->attr.pat_index == op->pat_index.val) {
			vmas[i]->skip_invalidation = true;
		} else {
			vmas[i]->skip_invalidation = false;
			vmas[i]->attr.pat_index = op->pat_index.val;
		}
	}
}

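/* Madvise handler signature and per-type dispatch table, indexed by drm_xe_madvise::type */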
typedef void (*madvise_func)(struct xe_device *xe, struct xe_vm *vm,
			     struct xe_vma **vmas, int num_vmas,
			     struct drm_xe_madvise *op,
			     struct xe_madvise_details *details);

static const madvise_func madvise_funcs[] = {
	[DRM_XE_MEM_RANGE_ATTR_PREFERRED_LOC] = madvise_preferred_mem_loc,
	[DRM_XE_MEM_RANGE_ATTR_ATOMIC] = madvise_atomic,
	[DRM_XE_MEM_RANGE_ATTR_PAT] = madvise_pat_index,
};

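/*
 * Zap the GPU PTEs of every VMA in the range that is not flagged to skip
 * invalidation, returning a mask of tiles whose TLBs need to be invalidated.
 */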
static u8 xe_zap_ptes_in_madvise_range(struct xe_vm *vm, u64 start, u64 end)
{
	struct drm_gpuva *gpuva;
	struct xe_tile *tile;
	u8 id, tile_mask = 0;

	lockdep_assert_held_write(&vm->lock);

	/* Wait for pending binds */
	if (dma_resv_wait_timeout(xe_vm_resv(vm), DMA_RESV_USAGE_BOOKKEEP,
				  false, MAX_SCHEDULE_TIMEOUT) <= 0)
		XE_WARN_ON(1);

	drm_gpuvm_for_each_va_range(gpuva, &vm->gpuvm, start, end) {
		struct xe_vma *vma = gpuva_to_vma(gpuva);

		if (vma->skip_invalidation || xe_vma_is_null(vma))
			continue;

		if (xe_vma_is_cpu_addr_mirror(vma)) {
			tile_mask |= xe_svm_ranges_zap_ptes_in_range(vm,
								     xe_vma_start(vma),
								     xe_vma_end(vma));
		} else {
			for_each_tile(tile, vm->xe, id) {
				if (xe_pt_zap_ptes(tile, vma)) {
					tile_mask |= BIT(id);

					/*
					 * WRITE_ONCE pairs with READ_ONCE
					 * in xe_vm_has_valid_gpu_mapping()
					 */
					WRITE_ONCE(vma->tile_invalidated,
						   vma->tile_invalidated | BIT(id));
				}
			}
		}
	}

	return tile_mask;
}

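/*
 * Zap PTEs in the range and, if anything was zapped, issue a TLB invalidation
 * for the affected tiles.
 */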
static int xe_vm_invalidate_madvise_range(struct xe_vm *vm, u64 start, u64 end)
{
	u8 tile_mask = xe_zap_ptes_in_madvise_range(vm, start, end);

	if (!tile_mask)
		return 0;

	xe_device_wmb(vm->xe);

	return xe_vm_range_tilemask_tlb_inval(vm, start, end, tile_mask);
}

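/*
 * Validate the user-supplied madvise arguments: alignment and size of the
 * range plus the per-type attribute values, padding and reserved fields.
 */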
static bool madvise_args_are_sane(struct xe_device *xe, const struct drm_xe_madvise *args)
{
	if (XE_IOCTL_DBG(xe, !args))
		return false;

	if (XE_IOCTL_DBG(xe, !IS_ALIGNED(args->start, SZ_4K)))
		return false;

	if (XE_IOCTL_DBG(xe, !IS_ALIGNED(args->range, SZ_4K)))
		return false;

	if (XE_IOCTL_DBG(xe, args->range < SZ_4K))
		return false;

	switch (args->type) {
	case DRM_XE_MEM_RANGE_ATTR_PREFERRED_LOC:
	{
		s32 fd = (s32)args->preferred_mem_loc.devmem_fd;

		if (XE_IOCTL_DBG(xe, fd < DRM_XE_PREFERRED_LOC_DEFAULT_SYSTEM))
			return false;

		if (XE_IOCTL_DBG(xe, fd <= DRM_XE_PREFERRED_LOC_DEFAULT_DEVICE &&
				 args->preferred_mem_loc.region_instance != 0))
			return false;

		if (XE_IOCTL_DBG(xe, args->preferred_mem_loc.migration_policy >
				 DRM_XE_MIGRATE_ONLY_SYSTEM_PAGES))
			return false;

		if (XE_IOCTL_DBG(xe, args->preferred_mem_loc.reserved))
			return false;
		break;
	}
	case DRM_XE_MEM_RANGE_ATTR_ATOMIC:
		if (XE_IOCTL_DBG(xe, args->atomic.val > DRM_XE_ATOMIC_CPU))
			return false;

		if (XE_IOCTL_DBG(xe, args->atomic.pad))
			return false;

		if (XE_IOCTL_DBG(xe, args->atomic.reserved))
			return false;

		break;
	case DRM_XE_MEM_RANGE_ATTR_PAT:
	{
		u16 pat_index, coh_mode;

		if (XE_IOCTL_DBG(xe, args->pat_index.val >= xe->pat.n_entries))
			return false;

		pat_index = array_index_nospec(args->pat_index.val, xe->pat.n_entries);
		coh_mode = xe_pat_index_get_coh_mode(xe, pat_index);
		if (XE_IOCTL_DBG(xe, !coh_mode))
			return false;

		if (XE_WARN_ON(coh_mode > XE_COH_AT_LEAST_1WAY))
			return false;

		if (XE_IOCTL_DBG(xe, args->pat_index.pad))
			return false;

		if (XE_IOCTL_DBG(xe, args->pat_index.reserved))
			return false;
		break;
	}
	default:
		if (XE_IOCTL_DBG(xe, 1))
			return false;
	}

	if (XE_IOCTL_DBG(xe, args->reserved[0] || args->reserved[1]))
		return false;

	return true;
}

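/*
 * Resolve additional, kernel-internal madvise arguments from the user args;
 * for a preferred-location madvise with a devmem_fd this takes a reference on
 * the corresponding drm_pagemap, which xe_madvise_details_fini() releases.
 */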
static int xe_madvise_details_init(struct xe_vm *vm, const struct drm_xe_madvise *args,
				   struct xe_madvise_details *details)
{
	struct xe_device *xe = vm->xe;

	memset(details, 0, sizeof(*details));

	if (args->type == DRM_XE_MEM_RANGE_ATTR_PREFERRED_LOC) {
		int fd = args->preferred_mem_loc.devmem_fd;
		struct drm_pagemap *dpagemap;

		if (fd <= 0)
			return 0;

		dpagemap = xe_drm_pagemap_from_fd(args->preferred_mem_loc.devmem_fd,
						  args->preferred_mem_loc.region_instance);
		if (XE_IOCTL_DBG(xe, IS_ERR(dpagemap)))
			return PTR_ERR(dpagemap);

		/* Don't allow a foreign placement without a fast interconnect! */
		if (XE_IOCTL_DBG(xe, dpagemap->pagemap->owner != vm->svm.peer.owner)) {
			drm_pagemap_put(dpagemap);
			return -ENOLINK;
		}
		details->dpagemap = dpagemap;
	}

	return 0;
}

static void xe_madvise_details_fini(struct xe_madvise_details *details)
{
	drm_pagemap_put(details->dpagemap);
}

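/*
 * Reject atomic access modes that the placement of a BO-backed VMA cannot
 * support, e.g. CPU atomics on a VRAM-only BO.
 */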
static bool check_bo_args_are_sane(struct xe_vm *vm, struct xe_vma **vmas,
				   int num_vmas, u32 atomic_val)
{
	struct xe_device *xe = vm->xe;
	struct xe_bo *bo;
	int i;

	for (i = 0; i < num_vmas; i++) {
		bo = xe_vma_bo(vmas[i]);
		if (!bo)
			continue;
		/*
		 * NOTE: The following atomic checks are platform-specific. For example,
		 * if a device supports CXL atomics, these may not be necessary or
		 * may behave differently.
		 */
		if (XE_IOCTL_DBG(xe, atomic_val == DRM_XE_ATOMIC_CPU &&
				 !(bo->flags & XE_BO_FLAG_SYSTEM)))
			return false;

		if (XE_IOCTL_DBG(xe, atomic_val == DRM_XE_ATOMIC_DEVICE &&
				 !(bo->flags & XE_BO_FLAG_VRAM0) &&
				 !(bo->flags & XE_BO_FLAG_VRAM1) &&
				 !(bo->flags & XE_BO_FLAG_SYSTEM &&
				   xe->info.has_device_atomics_on_smem)))
			return false;

		if (XE_IOCTL_DBG(xe, atomic_val == DRM_XE_ATOMIC_GLOBAL &&
				 (!(bo->flags & XE_BO_FLAG_SYSTEM) ||
				  (!(bo->flags & XE_BO_FLAG_VRAM0) &&
				   !(bo->flags & XE_BO_FLAG_VRAM1)))))
			return false;
	}
	return true;
}
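
/*
 * Illustrative (hypothetical) user-space call into this ioctl, assuming a VM
 * created with DRM_IOCTL_XE_VM_CREATE and a CPU address mirror range starting
 * at addr; the exact uAPI struct layout and ioctl number live in
 * include/uapi/drm/xe_drm.h, so treat this only as a sketch:
 *
 *	struct drm_xe_madvise madvise = {
 *		.vm_id = vm_id,
 *		.start = addr,
 *		.range = 0x10000,
 *		.type = DRM_XE_MEM_RANGE_ATTR_ATOMIC,
 *		.atomic = { .val = DRM_XE_ATOMIC_DEVICE },
 *	};
 *
 *	err = ioctl(drm_fd, DRM_IOCTL_XE_MADVISE, &madvise);
 */
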
/**
 * xe_vm_madvise_ioctl - Handle MADVISE ioctl for a VM
 * @dev: DRM device pointer
 * @data: Pointer to ioctl data (drm_xe_madvise*)
 * @file: DRM file pointer
 *
 * Handles the MADVISE ioctl to provide memory advice for VMAs within
 * the input range.
 *
 * Return: 0 on success or a negative error code on failure.
 */
int xe_vm_madvise_ioctl(struct drm_device *dev, void *data, struct drm_file *file)
{
	struct xe_device *xe = to_xe_device(dev);
	struct xe_file *xef = to_xe_file(file);
	struct drm_xe_madvise *args = data;
	struct xe_vmas_in_madvise_range madvise_range = {.addr = args->start,
							 .range = args->range, };
	struct xe_madvise_details details;
	struct xe_vm *vm;
	struct drm_exec exec;
	int err, attr_type;

	vm = xe_vm_lookup(xef, args->vm_id);
	if (XE_IOCTL_DBG(xe, !vm))
		return -EINVAL;

	if (!madvise_args_are_sane(vm->xe, args)) {
		err = -EINVAL;
		goto put_vm;
	}

	xe_svm_flush(vm);

	err = down_write_killable(&vm->lock);
	if (err)
		goto put_vm;

	if (XE_IOCTL_DBG(xe, xe_vm_is_closed_or_banned(vm))) {
		err = -ENOENT;
		goto unlock_vm;
	}

	err = xe_madvise_details_init(vm, args, &details);
	if (err)
		goto unlock_vm;

	err = xe_vm_alloc_madvise_vma(vm, args->start, args->range);
	if (err)
		goto madv_fini;

	err = get_vmas(vm, &madvise_range);
	if (err || !madvise_range.num_vmas)
		goto madv_fini;

	if (madvise_range.has_bo_vmas) {
		if (args->type == DRM_XE_MEM_RANGE_ATTR_ATOMIC) {
			if (!check_bo_args_are_sane(vm, madvise_range.vmas,
						    madvise_range.num_vmas,
						    args->atomic.val)) {
				err = -EINVAL;
				goto free_vmas;
			}
		}

		drm_exec_init(&exec, DRM_EXEC_IGNORE_DUPLICATES | DRM_EXEC_INTERRUPTIBLE_WAIT, 0);
		drm_exec_until_all_locked(&exec) {
			for (int i = 0; i < madvise_range.num_vmas; i++) {
				struct xe_bo *bo = xe_vma_bo(madvise_range.vmas[i]);

				if (!bo)
					continue;
				err = drm_exec_lock_obj(&exec, &bo->ttm.base);
				drm_exec_retry_on_contention(&exec);
				if (err)
					goto err_fini;
			}
		}
	}

	if (madvise_range.has_svm_userptr_vmas) {
		err = xe_svm_notifier_lock_interruptible(vm);
		if (err)
			goto err_fini;
	}

	attr_type = array_index_nospec(args->type, ARRAY_SIZE(madvise_funcs));
	madvise_funcs[attr_type](xe, vm, madvise_range.vmas, madvise_range.num_vmas, args,
				 &details);

	err = xe_vm_invalidate_madvise_range(vm, args->start, args->start + args->range);

	if (madvise_range.has_svm_userptr_vmas)
		xe_svm_notifier_unlock(vm);

err_fini:
	if (madvise_range.has_bo_vmas)
		drm_exec_fini(&exec);
free_vmas:
	kfree(madvise_range.vmas);
	madvise_range.vmas = NULL;
madv_fini:
	xe_madvise_details_fini(&details);
unlock_vm:
	up_write(&vm->lock);
put_vm:
	xe_vm_put(vm);
	return err;
}