xref: /linux/drivers/gpu/drm/xe/xe_vm.c (revision f9f0b4a1f35d39a1a2a2f8ec46eb7b81efc70a63)
1 // SPDX-License-Identifier: MIT
2 /*
3  * Copyright © 2021 Intel Corporation
4  */
5 
6 #include "xe_vm.h"
7 
8 #include <linux/dma-fence-array.h>
9 #include <linux/nospec.h>
10 
11 #include <drm/drm_drv.h>
12 #include <drm/drm_exec.h>
13 #include <drm/drm_print.h>
14 #include <drm/ttm/ttm_tt.h>
15 #include <uapi/drm/xe_drm.h>
16 #include <linux/ascii85.h>
17 #include <linux/delay.h>
18 #include <linux/kthread.h>
19 #include <linux/mm.h>
20 #include <linux/swap.h>
21 
22 #include <generated/xe_wa_oob.h>
23 
24 #include "regs/xe_gtt_defs.h"
25 #include "xe_assert.h"
26 #include "xe_bo.h"
27 #include "xe_device.h"
28 #include "xe_drm_client.h"
29 #include "xe_exec_queue.h"
30 #include "xe_migrate.h"
31 #include "xe_pat.h"
32 #include "xe_pm.h"
33 #include "xe_preempt_fence.h"
34 #include "xe_pt.h"
35 #include "xe_pxp.h"
36 #include "xe_sriov_vf.h"
37 #include "xe_svm.h"
38 #include "xe_sync.h"
39 #include "xe_tile.h"
40 #include "xe_tlb_inval.h"
41 #include "xe_trace_bo.h"
42 #include "xe_wa.h"
43 
44 static struct drm_gem_object *xe_vm_obj(struct xe_vm *vm)
45 {
46 	return vm->gpuvm.r_obj;
47 }
48 
/**
 * xe_vm_drm_exec_lock() - Lock the vm's resv with a drm_exec transaction
 * @vm: The vm whose resv is to be locked.
 * @exec: The drm_exec transaction.
 *
 * Helper to lock the vm's resv as part of a drm_exec transaction.
 *
 * Return: %0 on success. See drm_exec_lock_obj() for error codes.
 */
int xe_vm_drm_exec_lock(struct xe_vm *vm, struct drm_exec *exec)
{
	struct drm_gem_object *obj = xe_vm_obj(vm);

	return drm_exec_lock_obj(exec, obj);
}
62 
63 static bool preempt_fences_waiting(struct xe_vm *vm)
64 {
65 	struct xe_exec_queue *q;
66 
67 	lockdep_assert_held(&vm->lock);
68 	xe_vm_assert_held(vm);
69 
70 	list_for_each_entry(q, &vm->preempt.exec_queues, lr.link) {
71 		if (!q->lr.pfence ||
72 		    test_bit(DMA_FENCE_FLAG_ENABLE_SIGNAL_BIT,
73 			     &q->lr.pfence->flags)) {
74 			return true;
75 		}
76 	}
77 
78 	return false;
79 }
80 
81 static void free_preempt_fences(struct list_head *list)
82 {
83 	struct list_head *link, *next;
84 
85 	list_for_each_safe(link, next, list)
86 		xe_preempt_fence_free(to_preempt_fence_from_link(link));
87 }
88 
89 static int alloc_preempt_fences(struct xe_vm *vm, struct list_head *list,
90 				unsigned int *count)
91 {
92 	lockdep_assert_held(&vm->lock);
93 	xe_vm_assert_held(vm);
94 
95 	if (*count >= vm->preempt.num_exec_queues)
96 		return 0;
97 
98 	for (; *count < vm->preempt.num_exec_queues; ++(*count)) {
99 		struct xe_preempt_fence *pfence = xe_preempt_fence_alloc();
100 
101 		if (IS_ERR(pfence))
102 			return PTR_ERR(pfence);
103 
104 		list_move_tail(xe_preempt_fence_link(pfence), list);
105 	}
106 
107 	return 0;
108 }
109 
/*
 * Wait for the preempt fence of every exec queue on the VM to signal, then
 * drop and clear each fence reference.
 *
 * On a VF that supports migration a short (HZ / 5) timeout is used instead
 * of an unbounded wait, so this thread cannot indefinitely block
 * post-migration recovery work; a timeout in that case returns -EAGAIN so
 * the caller retries.
 *
 * Return: 0 on success, -EAGAIN on a short-wait timeout (VF only), -ETIME
 * if a wait failed or a fence signaled with -ETIME (VM must be killed).
 */
static int wait_for_existing_preempt_fences(struct xe_vm *vm)
{
	struct xe_exec_queue *q;
	bool vf_migration = IS_SRIOV_VF(vm->xe) &&
		xe_sriov_vf_migration_supported(vm->xe);
	signed long wait_time = vf_migration ? HZ / 5 : MAX_SCHEDULE_TIMEOUT;

	xe_vm_assert_held(vm);

	list_for_each_entry(q, &vm->preempt.exec_queues, lr.link) {
		if (q->lr.pfence) {
			long timeout;

			timeout = dma_fence_wait_timeout(q->lr.pfence, false,
							 wait_time);
			/* 0 == timed out; only possible with the short VF wait */
			if (!timeout) {
				xe_assert(vm->xe, vf_migration);
				return -EAGAIN;
			}

			/* Only -ETIME on fence indicates VM needs to be killed */
			if (timeout < 0 || q->lr.pfence->error == -ETIME)
				return -ETIME;

			dma_fence_put(q->lr.pfence);
			q->lr.pfence = NULL;
		}
	}

	return 0;
}
141 
142 static bool xe_vm_is_idle(struct xe_vm *vm)
143 {
144 	struct xe_exec_queue *q;
145 
146 	xe_vm_assert_held(vm);
147 	list_for_each_entry(q, &vm->preempt.exec_queues, lr.link) {
148 		if (!xe_exec_queue_is_idle(q))
149 			return false;
150 	}
151 
152 	return true;
153 }
154 
/*
 * Arm one pre-allocated preempt fence from @list for each exec queue on the
 * VM, replacing (and dropping the reference to) the queue's old fence.
 *
 * NOTE(review): each iteration re-reads list->next without explicitly
 * unlinking it, so xe_preempt_fence_arm() is presumably responsible for
 * removing the entry from @list — confirm against its implementation.
 */
static void arm_preempt_fences(struct xe_vm *vm, struct list_head *list)
{
	struct list_head *link;
	struct xe_exec_queue *q;

	list_for_each_entry(q, &vm->preempt.exec_queues, lr.link) {
		struct dma_fence *fence;

		link = list->next;
		/* The caller must have pre-allocated one fence per queue. */
		xe_assert(vm->xe, link != list);

		fence = xe_preempt_fence_arm(to_preempt_fence_from_link(link),
					     q, q->lr.context,
					     ++q->lr.seqno);
		dma_fence_put(q->lr.pfence);
		q->lr.pfence = fence;
	}
}
173 
174 static int add_preempt_fences(struct xe_vm *vm, struct xe_bo *bo)
175 {
176 	struct xe_exec_queue *q;
177 	int err;
178 
179 	xe_bo_assert_held(bo);
180 
181 	if (!vm->preempt.num_exec_queues)
182 		return 0;
183 
184 	err = dma_resv_reserve_fences(bo->ttm.base.resv, vm->preempt.num_exec_queues);
185 	if (err)
186 		return err;
187 
188 	list_for_each_entry(q, &vm->preempt.exec_queues, lr.link)
189 		if (q->lr.pfence) {
190 			dma_resv_add_fence(bo->ttm.base.resv,
191 					   q->lr.pfence,
192 					   DMA_RESV_USAGE_BOOKKEEP);
193 		}
194 
195 	return 0;
196 }
197 
198 static void resume_and_reinstall_preempt_fences(struct xe_vm *vm,
199 						struct drm_exec *exec)
200 {
201 	struct xe_exec_queue *q;
202 
203 	lockdep_assert_held(&vm->lock);
204 	xe_vm_assert_held(vm);
205 
206 	list_for_each_entry(q, &vm->preempt.exec_queues, lr.link) {
207 		q->ops->resume(q);
208 
209 		drm_gpuvm_resv_add_fence(&vm->gpuvm, exec, q->lr.pfence,
210 					 DMA_RESV_USAGE_BOOKKEEP, DMA_RESV_USAGE_BOOKKEEP);
211 	}
212 }
213 
/**
 * xe_vm_add_compute_exec_queue() - Attach a long-running exec queue to a VM
 * @vm: The VM, which must be in preempt-fence mode.
 * @q: The exec queue to attach.
 *
 * Creates a preempt fence for @q, links the queue onto the VM's preempt
 * list and publishes the fence on the VM's resv. If a preemption or a
 * userptr invalidation is already in flight, the new fence is triggered
 * immediately so it syncs up with the other preempt fences on the VM.
 *
 * Return: 0 on success, negative error code on failure.
 */
int xe_vm_add_compute_exec_queue(struct xe_vm *vm, struct xe_exec_queue *q)
{
	struct drm_gpuvm_exec vm_exec = {
		.vm = &vm->gpuvm,
		.flags = DRM_EXEC_INTERRUPTIBLE_WAIT,
		.num_fences = 1,
	};
	struct drm_exec *exec = &vm_exec.exec;
	struct xe_validation_ctx ctx;
	struct dma_fence *pfence;
	int err;
	bool wait;

	xe_assert(vm->xe, xe_vm_in_preempt_fence_mode(vm));

	down_write(&vm->lock);
	/* Lock the VM resv (and extobjs) with a fence slot reserved. */
	err = xe_validation_exec_lock(&ctx, &vm_exec, &vm->xe->val);
	if (err)
		goto out_up_write;

	pfence = xe_preempt_fence_create(q, q->lr.context,
					 ++q->lr.seqno);
	if (IS_ERR(pfence)) {
		err = PTR_ERR(pfence);
		goto out_fini;
	}

	list_add(&q->lr.link, &vm->preempt.exec_queues);
	++vm->preempt.num_exec_queues;
	q->lr.pfence = pfence;

	xe_svm_notifier_lock(vm);

	drm_gpuvm_resv_add_fence(&vm->gpuvm, exec, pfence,
				 DMA_RESV_USAGE_BOOKKEEP, DMA_RESV_USAGE_BOOKKEEP);

	/*
	 * Check to see if a preemption on VM is in flight or userptr
	 * invalidation, if so trigger this preempt fence to sync state with
	 * other preempt fences on the VM.
	 */
	wait = __xe_vm_userptr_needs_repin(vm) || preempt_fences_waiting(vm);
	if (wait)
		dma_fence_enable_sw_signaling(pfence);

	xe_svm_notifier_unlock(vm);

out_fini:
	xe_validation_ctx_fini(&ctx);
out_up_write:
	up_write(&vm->lock);

	return err;
}
ALLOW_ERROR_INJECTION(xe_vm_add_compute_exec_queue, ERRNO);
269 
270 /**
271  * xe_vm_remove_compute_exec_queue() - Remove compute exec queue from VM
272  * @vm: The VM.
273  * @q: The exec_queue
274  *
275  * Note that this function might be called multiple times on the same queue.
276  */
277 void xe_vm_remove_compute_exec_queue(struct xe_vm *vm, struct xe_exec_queue *q)
278 {
279 	if (!xe_vm_in_preempt_fence_mode(vm))
280 		return;
281 
282 	down_write(&vm->lock);
283 	if (!list_empty(&q->lr.link)) {
284 		list_del_init(&q->lr.link);
285 		--vm->preempt.num_exec_queues;
286 	}
287 	if (q->lr.pfence) {
288 		dma_fence_enable_sw_signaling(q->lr.pfence);
289 		dma_fence_put(q->lr.pfence);
290 		q->lr.pfence = NULL;
291 	}
292 	up_write(&vm->lock);
293 }
294 
/* Rebind retry timeout, in milliseconds. */
#define XE_VM_REBIND_RETRY_TIMEOUT_MS 1000
296 
/**
 * xe_vm_kill() - VM Kill
 * @vm: The VM.
 * @unlocked: Flag indicates the VM's dma-resv is not held
 *
 * Kill the VM by setting banned flag indicated VM is no longer available for
 * use. If in preempt fence mode, also kill all exec queue attached to the VM.
 */
void xe_vm_kill(struct xe_vm *vm, bool unlocked)
{
	struct xe_exec_queue *q;

	lockdep_assert_held(&vm->lock);

	/* The banned flag is written under the VM resv. */
	if (unlocked)
		xe_vm_lock(vm, false);

	vm->flags |= XE_VM_FLAG_BANNED;
	trace_xe_vm_kill(vm);

	/* In preempt-fence mode this list is non-empty; kill each queue. */
	list_for_each_entry(q, &vm->preempt.exec_queues, lr.link)
		q->ops->kill(q);

	if (unlocked)
		xe_vm_unlock(vm);

	/* TODO: Inform user the VM is banned */
}
325 
/*
 * drm_gpuvm validate callback: re-validate an evicted vm_bo's backing store.
 *
 * All VMAs bound to the object are moved to the VM's rebind list *before*
 * validation, so the subsequent rebind pass still picks them up even when
 * validation below fails and the whole transaction is retried.
 */
static int xe_gpuvm_validate(struct drm_gpuvm_bo *vm_bo, struct drm_exec *exec)
{
	struct xe_vm *vm = gpuvm_to_vm(vm_bo->vm);
	struct drm_gpuva *gpuva;
	int ret;

	lockdep_assert_held(&vm->lock);
	drm_gpuvm_bo_for_each_va(gpuva, vm_bo)
		list_move_tail(&gpuva_to_vma(gpuva)->combined_links.rebind,
			       &vm->rebind_list);

	/* Don't validate while a PM suspend is blocking activity. */
	if (!try_wait_for_completion(&vm->xe->pm_block))
		return -EAGAIN;

	ret = xe_bo_validate(gem_to_xe_bo(vm_bo->obj), vm, false, exec);
	if (ret)
		return ret;

	/* Only clear the evicted state once validation actually succeeded. */
	vm_bo->evicted = false;
	return 0;
}
347 
/**
 * xe_vm_validate_rebind() - Validate buffer objects and rebind vmas
 * @vm: The vm for which we are rebinding.
 * @exec: The struct drm_exec with the locked GEM objects.
 * @num_fences: The number of fences to reserve for the operation, not
 * including rebinds and validations.
 *
 * Validates all evicted gem objects and rebinds their vmas. Note that
 * rebindings may cause evictions and hence the validation-rebind
 * sequence is rerun until there are no more objects to validate.
 *
 * Return: 0 on success, negative error code on error. In particular,
 * may return -EINTR or -ERESTARTSYS if interrupted, and -EDEADLK if
 * the drm_exec transaction needs to be restarted.
 */
int xe_vm_validate_rebind(struct xe_vm *vm, struct drm_exec *exec,
			  unsigned int num_fences)
{
	struct drm_gem_object *obj;
	unsigned long index;
	int ret;

	do {
		ret = drm_gpuvm_validate(&vm->gpuvm, exec);
		if (ret)
			return ret;

		ret = xe_vm_rebind(vm, false);
		if (ret)
			return ret;
	} while (!list_empty(&vm->gpuvm.evict.list));	/* rebinds may re-evict */

	/* Reserve the fence slots the caller will use after this returns. */
	drm_exec_for_each_locked_object(exec, index, obj) {
		ret = dma_resv_reserve_fences(obj->resv, num_fences);
		if (ret)
			return ret;
	}

	return 0;
}
388 
/*
 * Locking/validation stage of the preempt rebind worker.
 *
 * Locks the VM resv and, only when work is actually required (queues not
 * idle and preempt fences missing/triggered), locks all external objects,
 * waits out the in-flight preempt fences and validates/rebinds everything
 * evicted. Sets *done when no further rebind work is needed.
 *
 * Return: 0 on success (check *done), negative error code otherwise;
 * -EDEADLK/-EAGAIN cause the caller's drm_exec loop to retry.
 */
static int xe_preempt_work_begin(struct drm_exec *exec, struct xe_vm *vm,
				 bool *done)
{
	int err;

	err = drm_gpuvm_prepare_vm(&vm->gpuvm, exec, 0);
	if (err)
		return err;

	/* Nothing running: deactivate the rebind and skip the heavy work. */
	if (xe_vm_is_idle(vm)) {
		vm->preempt.rebind_deactivated = true;
		*done = true;
		return 0;
	}

	/* No preemption in flight — nothing to re-validate or re-arm. */
	if (!preempt_fences_waiting(vm)) {
		*done = true;
		return 0;
	}

	err = drm_gpuvm_prepare_objects(&vm->gpuvm, exec, 0);
	if (err)
		return err;

	err = wait_for_existing_preempt_fences(vm);
	if (err)
		return err;

	/*
	 * Add validation and rebinding to the locking loop since both can
	 * cause evictions which may require blocing dma_resv locks.
	 * The fence reservation here is intended for the new preempt fences
	 * we attach at the end of the rebind work.
	 */
	return xe_vm_validate_rebind(vm, exec, vm->preempt.num_exec_queues);
}
425 
426 static bool vm_suspend_rebind_worker(struct xe_vm *vm)
427 {
428 	struct xe_device *xe = vm->xe;
429 	bool ret = false;
430 
431 	mutex_lock(&xe->rebind_resume_lock);
432 	if (!try_wait_for_completion(&vm->xe->pm_block)) {
433 		ret = true;
434 		list_move_tail(&vm->preempt.pm_activate_link, &xe->rebind_resume_list);
435 	}
436 	mutex_unlock(&xe->rebind_resume_lock);
437 
438 	return ret;
439 }
440 
441 /**
442  * xe_vm_resume_rebind_worker() - Resume the rebind worker.
443  * @vm: The vm whose preempt worker to resume.
444  *
445  * Resume a preempt worker that was previously suspended by
446  * vm_suspend_rebind_worker().
447  */
448 void xe_vm_resume_rebind_worker(struct xe_vm *vm)
449 {
450 	queue_work(vm->xe->ordered_wq, &vm->preempt.rebind_work);
451 }
452 
/*
 * Preempt rebind worker: after a preemption / eviction / userptr
 * invalidation, re-validate and rebind everything on the VM, then re-arm
 * fresh preempt fences and resume the long-running exec queues.
 *
 * Runs with vm->lock held in write mode; retries from the top on -EAGAIN
 * (contention, OOM, racing invalidations), except on migration-capable VFs
 * where it requeues itself instead of looping. Any other error kills the VM.
 */
static void preempt_rebind_work_func(struct work_struct *w)
{
	struct xe_vm *vm = container_of(w, struct xe_vm, preempt.rebind_work);
	struct xe_validation_ctx ctx;
	struct drm_exec exec;
	unsigned int fence_count = 0;
	LIST_HEAD(preempt_fences);
	int err = 0;
	long wait;
	int __maybe_unused tries = 0;

	xe_assert(vm->xe, xe_vm_in_preempt_fence_mode(vm));
	trace_xe_vm_rebind_worker_enter(vm);

	down_write(&vm->lock);

	if (xe_vm_is_closed_or_banned(vm)) {
		up_write(&vm->lock);
		trace_xe_vm_rebind_worker_exit(vm);
		return;
	}

retry:
	/* Park ourselves if a PM suspend is in progress; resumed later. */
	if (!try_wait_for_completion(&vm->xe->pm_block) && vm_suspend_rebind_worker(vm)) {
		up_write(&vm->lock);
		/* We don't actually block but don't make progress. */
		xe_pm_might_block_on_suspend();
		return;
	}

	/* Re-pin any userptrs invalidated since the last rebind. */
	if (xe_vm_userptr_check_repin(vm)) {
		err = xe_vm_userptr_pin(vm);
		if (err)
			goto out_unlock_outer;
	}

	err = xe_validation_ctx_init(&ctx, &vm->xe->val, &exec,
				     (struct xe_val_flags) {.interruptible = true});
	if (err)
		goto out_unlock_outer;

	drm_exec_until_all_locked(&exec) {
		bool done = false;

		err = xe_preempt_work_begin(&exec, vm, &done);
		drm_exec_retry_on_contention(&exec);
		xe_validation_retry_on_oom(&ctx, &err);
		if (err || done) {
			xe_validation_ctx_fini(&ctx);
			goto out_unlock_outer;
		}
	}

	/* Pre-allocate the replacement preempt fences (one per queue). */
	err = alloc_preempt_fences(vm, &preempt_fences, &fence_count);
	if (err)
		goto out_unlock;

	xe_vm_set_validation_exec(vm, &exec);
	err = xe_vm_rebind(vm, true);
	xe_vm_set_validation_exec(vm, NULL);
	if (err)
		goto out_unlock;

	/* Wait on rebinds and munmap style VM unbinds */
	wait = dma_resv_wait_timeout(xe_vm_resv(vm),
				     DMA_RESV_USAGE_KERNEL,
				     false, MAX_SCHEDULE_TIMEOUT);
	if (wait <= 0) {
		err = -ETIME;
		goto out_unlock;
	}

#define retry_required(__tries, __vm) \
	(IS_ENABLED(CONFIG_DRM_XE_USERPTR_INVAL_INJECT) ? \
	(!(__tries)++ || __xe_vm_userptr_needs_repin(__vm)) : \
	__xe_vm_userptr_needs_repin(__vm))

	/* A racing invalidation invalidates the work done above: retry. */
	xe_svm_notifier_lock(vm);
	if (retry_required(tries, vm)) {
		xe_svm_notifier_unlock(vm);
		err = -EAGAIN;
		goto out_unlock;
	}

#undef retry_required

	spin_lock(&vm->xe->ttm.lru_lock);
	ttm_lru_bulk_move_tail(&vm->lru_bulk_move);
	spin_unlock(&vm->xe->ttm.lru_lock);

	/* Point of no return. */
	arm_preempt_fences(vm, &preempt_fences);
	resume_and_reinstall_preempt_fences(vm, &exec);
	xe_svm_notifier_unlock(vm);

out_unlock:
	xe_validation_ctx_fini(&ctx);
out_unlock_outer:
	if (err == -EAGAIN) {
		trace_xe_vm_rebind_worker_retry(vm);

		/*
		 * We can't block in workers on a VF which supports migration
		 * given this can block the VF post-migration workers from
		 * getting scheduled.
		 */
		if (IS_SRIOV_VF(vm->xe) &&
		    xe_sriov_vf_migration_supported(vm->xe)) {
			up_write(&vm->lock);
			xe_vm_queue_rebind_worker(vm);
			return;
		}

		goto retry;
	}

	if (err) {
		drm_warn(&vm->xe->drm, "VM worker error: %d\n", err);
		xe_vm_kill(vm, true);
	}
	up_write(&vm->lock);

	free_preempt_fences(&preempt_fences);

	trace_xe_vm_rebind_worker_exit(vm);
}
579 
/*
 * Allocate the per-tile page-table update op arrays sized earlier via
 * xe_vma_ops_incr_pt_update_ops().
 *
 * The allocations can be large, hence __GFP_RETRY_MAYFAIL | __GFP_NOWARN.
 * For an array of binds a failure is reported as -ENOBUFS (so the caller
 * can distinguish it and e.g. split the array), otherwise -ENOMEM.
 */
static int xe_vma_ops_alloc(struct xe_vma_ops *vops, bool array_of_binds)
{
	int i;

	for (i = 0; i < XE_MAX_TILES_PER_DEVICE; ++i) {
		if (!vops->pt_update_ops[i].num_ops)
			continue;

		vops->pt_update_ops[i].ops =
			kmalloc_objs(*vops->pt_update_ops[i].ops,
				     vops->pt_update_ops[i].num_ops,
				     GFP_KERNEL | __GFP_RETRY_MAYFAIL | __GFP_NOWARN);
		if (!vops->pt_update_ops[i].ops)
			return array_of_binds ? -ENOBUFS : -ENOMEM;
	}

	return 0;
}
ALLOW_ERROR_INJECTION(xe_vma_ops_alloc, ERRNO);
599 
600 static void xe_vma_svm_prefetch_op_fini(struct xe_vma_op *op)
601 {
602 	struct xe_vma *vma;
603 
604 	vma = gpuva_to_vma(op->base.prefetch.va);
605 
606 	if (op->base.op == DRM_GPUVA_OP_PREFETCH && xe_vma_is_cpu_addr_mirror(vma))
607 		xa_destroy(&op->prefetch_range.range);
608 }
609 
610 static void xe_vma_svm_prefetch_ops_fini(struct xe_vma_ops *vops)
611 {
612 	struct xe_vma_op *op;
613 
614 	if (!(vops->flags & XE_VMA_OPS_FLAG_HAS_SVM_PREFETCH))
615 		return;
616 
617 	list_for_each_entry(op, &vops->list, link)
618 		xe_vma_svm_prefetch_op_fini(op);
619 }
620 
621 static void xe_vma_ops_fini(struct xe_vma_ops *vops)
622 {
623 	int i;
624 
625 	xe_vma_svm_prefetch_ops_fini(vops);
626 
627 	for (i = 0; i < XE_MAX_TILES_PER_DEVICE; ++i)
628 		kfree(vops->pt_update_ops[i].ops);
629 }
630 
631 static void xe_vma_ops_incr_pt_update_ops(struct xe_vma_ops *vops, u8 tile_mask, int inc_val)
632 {
633 	int i;
634 
635 	if (!inc_val)
636 		return;
637 
638 	for (i = 0; i < XE_MAX_TILES_PER_DEVICE; ++i)
639 		if (BIT(i) & tile_mask)
640 			vops->pt_update_ops[i].num_ops += inc_val;
641 }
642 
/* gpuva flags carried over into ops that re-create a VMA (see the
 * vma_flags assignment in xe_vm_populate_rebind()).
 */
#define XE_VMA_CREATE_MASK (		    \
	XE_VMA_READ_ONLY |		    \
	XE_VMA_DUMPABLE |		    \
	XE_VMA_SYSTEM_ALLOCATOR |           \
	DRM_GPUVA_SPARSE |		    \
	XE_VMA_MADV_AUTORESET)
649 
650 static void xe_vm_populate_rebind(struct xe_vma_op *op, struct xe_vma *vma,
651 				  u8 tile_mask)
652 {
653 	INIT_LIST_HEAD(&op->link);
654 	op->tile_mask = tile_mask;
655 	op->base.op = DRM_GPUVA_OP_MAP;
656 	op->base.map.va.addr = vma->gpuva.va.addr;
657 	op->base.map.va.range = vma->gpuva.va.range;
658 	op->base.map.gem.obj = vma->gpuva.gem.obj;
659 	op->base.map.gem.offset = vma->gpuva.gem.offset;
660 	op->map.vma = vma;
661 	op->map.immediate = true;
662 	op->map.vma_flags = vma->gpuva.flags & XE_VMA_CREATE_MASK;
663 }
664 
665 static int xe_vm_ops_add_rebind(struct xe_vma_ops *vops, struct xe_vma *vma,
666 				u8 tile_mask)
667 {
668 	struct xe_vma_op *op;
669 
670 	op = kzalloc_obj(*op);
671 	if (!op)
672 		return -ENOMEM;
673 
674 	xe_vm_populate_rebind(op, vma, tile_mask);
675 	list_add_tail(&op->link, &vops->list);
676 	xe_vma_ops_incr_pt_update_ops(vops, tile_mask, 1);
677 
678 	return 0;
679 }
680 
/* Forward declarations — defined later in this file. */
static struct dma_fence *ops_execute(struct xe_vm *vm,
				     struct xe_vma_ops *vops);
static void xe_vma_ops_init(struct xe_vma_ops *vops, struct xe_vm *vm,
			    struct xe_exec_queue *q,
			    struct xe_sync_entry *syncs, u32 num_syncs);
686 
/**
 * xe_vm_rebind() - Rebind all VMAs on the VM's rebind list
 * @vm: The VM.
 * @rebind_worker: True when called from the preempt rebind worker.
 *
 * Builds a MAP op for every VMA on the rebind list and executes them as a
 * single xe_vma_ops batch. On success the VMAs are removed from the rebind
 * list; the ops themselves are always freed before returning.
 *
 * Return: 0 on success, negative error code on failure.
 */
int xe_vm_rebind(struct xe_vm *vm, bool rebind_worker)
{
	struct dma_fence *fence;
	struct xe_vma *vma, *next;
	struct xe_vma_ops vops;
	struct xe_vma_op *op, *next_op;
	int err, i;

	lockdep_assert_held(&vm->lock);
	/* Outside the worker, LR-mode rebinds are handled by the worker. */
	if ((xe_vm_in_lr_mode(vm) && !rebind_worker) ||
	    list_empty(&vm->rebind_list))
		return 0;

	xe_vma_ops_init(&vops, vm, NULL, NULL, 0);
	for (i = 0; i < XE_MAX_TILES_PER_DEVICE; ++i)
		vops.pt_update_ops[i].wait_vm_bookkeep = true;

	xe_vm_assert_held(vm);
	list_for_each_entry(vma, &vm->rebind_list, combined_links.rebind) {
		xe_assert(vm->xe, vma->tile_present);

		if (rebind_worker)
			trace_xe_vma_rebind_worker(vma);
		else
			trace_xe_vma_rebind_exec(vma);

		err = xe_vm_ops_add_rebind(&vops, vma,
					   vma->tile_present);
		if (err)
			goto free_ops;
	}

	err = xe_vma_ops_alloc(&vops, false);
	if (err)
		goto free_ops;

	fence = ops_execute(vm, &vops);
	if (IS_ERR(fence)) {
		err = PTR_ERR(fence);
	} else {
		dma_fence_put(fence);
		/* All rebinds submitted — clear the rebind list. */
		list_for_each_entry_safe(vma, next, &vm->rebind_list,
					 combined_links.rebind)
			list_del_init(&vma->combined_links.rebind);
	}
free_ops:
	list_for_each_entry_safe(op, next_op, &vops.list, link) {
		list_del(&op->link);
		kfree(op);
	}
	xe_vma_ops_fini(&vops);

	return err;
}
741 
/**
 * xe_vma_rebind() - Rebind a single VMA (fault mode)
 * @vm: The VM, which must be in fault mode.
 * @vma: The VMA to rebind.
 * @tile_mask: Tiles to rebind on.
 *
 * Return: fence signaling rebind completion on success, ERR_PTR on failure.
 */
struct dma_fence *xe_vma_rebind(struct xe_vm *vm, struct xe_vma *vma, u8 tile_mask)
{
	struct dma_fence *fence = NULL;
	struct xe_vma_ops vops;
	struct xe_vma_op *op, *next_op;
	struct xe_tile *tile;
	u8 id;
	int err;

	lockdep_assert_held(&vm->lock);
	xe_vm_assert_held(vm);
	xe_assert(vm->xe, xe_vm_in_fault_mode(vm));

	xe_vma_ops_init(&vops, vm, NULL, NULL, 0);
	vops.flags |= XE_VMA_OPS_FLAG_SKIP_TLB_WAIT;
	/* NOTE(review): mixes @id and tile->id as indices — presumably
	 * identical within for_each_tile(); confirm against the macro.
	 */
	for_each_tile(tile, vm->xe, id) {
		vops.pt_update_ops[id].wait_vm_bookkeep = true;
		vops.pt_update_ops[tile->id].q =
			xe_migrate_exec_queue(tile->migrate);
	}

	err = xe_vm_ops_add_rebind(&vops, vma, tile_mask);
	if (err)
		return ERR_PTR(err);

	err = xe_vma_ops_alloc(&vops, false);
	if (err) {
		fence = ERR_PTR(err);
		goto free_ops;
	}

	fence = ops_execute(vm, &vops);

free_ops:
	list_for_each_entry_safe(op, next_op, &vops.list, link) {
		list_del(&op->link);
		kfree(op);
	}
	xe_vma_ops_fini(&vops);

	return fence;
}
784 
785 static void xe_vm_populate_range_rebind(struct xe_vma_op *op,
786 					struct xe_vma *vma,
787 					struct xe_svm_range *range,
788 					u8 tile_mask)
789 {
790 	INIT_LIST_HEAD(&op->link);
791 	op->tile_mask = tile_mask;
792 	op->base.op = DRM_GPUVA_OP_DRIVER;
793 	op->subop = XE_VMA_SUBOP_MAP_RANGE;
794 	op->map_range.vma = vma;
795 	op->map_range.range = range;
796 }
797 
798 static int
799 xe_vm_ops_add_range_rebind(struct xe_vma_ops *vops,
800 			   struct xe_vma *vma,
801 			   struct xe_svm_range *range,
802 			   u8 tile_mask)
803 {
804 	struct xe_vma_op *op;
805 
806 	op = kzalloc_obj(*op);
807 	if (!op)
808 		return -ENOMEM;
809 
810 	xe_vm_populate_range_rebind(op, vma, range, tile_mask);
811 	list_add_tail(&op->link, &vops->list);
812 	xe_vma_ops_incr_pt_update_ops(vops, tile_mask, 1);
813 
814 	return 0;
815 }
816 
/**
 * xe_vm_range_rebind() - VM range (re)bind
 * @vm: The VM which the range belongs to.
 * @vma: The VMA which the range belongs to.
 * @range: SVM range to rebind.
 * @tile_mask: Tile mask to bind the range to.
 *
 * (re)bind SVM range setting up GPU page tables for the range.
 *
 * Return: dma fence for rebind to signal completion on success, ERR_PTR on
 * failure
 */
struct dma_fence *xe_vm_range_rebind(struct xe_vm *vm,
				     struct xe_vma *vma,
				     struct xe_svm_range *range,
				     u8 tile_mask)
{
	struct dma_fence *fence = NULL;
	struct xe_vma_ops vops;
	struct xe_vma_op *op, *next_op;
	struct xe_tile *tile;
	u8 id;
	int err;

	lockdep_assert_held(&vm->lock);
	xe_vm_assert_held(vm);
	xe_assert(vm->xe, xe_vm_in_fault_mode(vm));
	xe_assert(vm->xe, xe_vma_is_cpu_addr_mirror(vma));

	xe_vma_ops_init(&vops, vm, NULL, NULL, 0);
	vops.flags |= XE_VMA_OPS_FLAG_SKIP_TLB_WAIT;
	/* Run the PT updates on each tile's migrate queue. */
	for_each_tile(tile, vm->xe, id) {
		vops.pt_update_ops[id].wait_vm_bookkeep = true;
		vops.pt_update_ops[tile->id].q =
			xe_migrate_exec_queue(tile->migrate);
	}

	err = xe_vm_ops_add_range_rebind(&vops, vma, range, tile_mask);
	if (err)
		return ERR_PTR(err);

	err = xe_vma_ops_alloc(&vops, false);
	if (err) {
		fence = ERR_PTR(err);
		goto free_ops;
	}

	fence = ops_execute(vm, &vops);

free_ops:
	/* Ops are freed regardless of success; the fence carries completion. */
	list_for_each_entry_safe(op, next_op, &vops.list, link) {
		list_del(&op->link);
		kfree(op);
	}
	xe_vma_ops_fini(&vops);

	return fence;
}
875 
876 static void xe_vm_populate_range_unbind(struct xe_vma_op *op,
877 					struct xe_svm_range *range)
878 {
879 	INIT_LIST_HEAD(&op->link);
880 	op->tile_mask = range->tile_present;
881 	op->base.op = DRM_GPUVA_OP_DRIVER;
882 	op->subop = XE_VMA_SUBOP_UNMAP_RANGE;
883 	op->unmap_range.range = range;
884 }
885 
886 static int
887 xe_vm_ops_add_range_unbind(struct xe_vma_ops *vops,
888 			   struct xe_svm_range *range)
889 {
890 	struct xe_vma_op *op;
891 
892 	op = kzalloc_obj(*op);
893 	if (!op)
894 		return -ENOMEM;
895 
896 	xe_vm_populate_range_unbind(op, range);
897 	list_add_tail(&op->link, &vops->list);
898 	xe_vma_ops_incr_pt_update_ops(vops, range->tile_present, 1);
899 
900 	return 0;
901 }
902 
/**
 * xe_vm_range_unbind() - VM range unbind
 * @vm: The VM which the range belongs to.
 * @range: SVM range to rebind.
 *
 * Unbind SVM range removing the GPU page tables for the range.
 *
 * Return: dma fence for unbind to signal completion on success, ERR_PTR on
 * failure
 */
struct dma_fence *xe_vm_range_unbind(struct xe_vm *vm,
				     struct xe_svm_range *range)
{
	struct dma_fence *fence = NULL;
	struct xe_vma_ops vops;
	struct xe_vma_op *op, *next_op;
	struct xe_tile *tile;
	u8 id;
	int err;

	lockdep_assert_held(&vm->lock);
	xe_vm_assert_held(vm);
	xe_assert(vm->xe, xe_vm_in_fault_mode(vm));

	/* Nothing bound anywhere — report immediate completion. */
	if (!range->tile_present)
		return dma_fence_get_stub();

	xe_vma_ops_init(&vops, vm, NULL, NULL, 0);
	/* Run the PT updates on each tile's migrate queue. */
	for_each_tile(tile, vm->xe, id) {
		vops.pt_update_ops[id].wait_vm_bookkeep = true;
		vops.pt_update_ops[tile->id].q =
			xe_migrate_exec_queue(tile->migrate);
	}

	err = xe_vm_ops_add_range_unbind(&vops, range);
	if (err)
		return ERR_PTR(err);

	err = xe_vma_ops_alloc(&vops, false);
	if (err) {
		fence = ERR_PTR(err);
		goto free_ops;
	}

	fence = ops_execute(vm, &vops);

free_ops:
	list_for_each_entry_safe(op, next_op, &vops.list, link) {
		list_del(&op->link);
		kfree(op);
	}
	xe_vma_ops_fini(&vops);

	return fence;
}
958 
/* Drop the references held by @attr (the preferred-location pagemap). */
static void xe_vma_mem_attr_fini(struct xe_vma_mem_attr *attr)
{
	drm_pagemap_put(attr->preferred_loc.dpagemap);
}
963 
964 static void xe_vma_free(struct xe_vma *vma)
965 {
966 	xe_vma_mem_attr_fini(&vma->attr);
967 
968 	if (xe_vma_is_userptr(vma))
969 		kfree(to_userptr_vma(vma));
970 	else
971 		kfree(vma);
972 }
973 
/**
 * xe_vma_mem_attr_copy() - copy an xe_vma_mem_attr structure.
 * @to: Destination.
 * @from: Source.
 *
 * Copies an xe_vma_mem_attr structure taking care to get reference
 * counting of individual members right.
 */
void xe_vma_mem_attr_copy(struct xe_vma_mem_attr *to, struct xe_vma_mem_attr *from)
{
	/* NOTE(review): if @to == @from this drops and re-takes the dpagemap
	 * reference, leaving a short window with none held — confirm callers
	 * never alias the two arguments.
	 */
	xe_vma_mem_attr_fini(to);
	*to = *from;
	if (to->preferred_loc.dpagemap)
		drm_pagemap_get(to->preferred_loc.dpagemap);
}
989 
/*
 * Allocate and initialize a VMA for @vm.
 *
 * The VMA kind is derived from the arguments: a BO-backed VMA when @bo is
 * set; otherwise a null (sparse), CPU-address-mirror (system allocator) or
 * userptr VMA depending on @flags. For userptr/null/mirror VMAs a VM
 * reference is taken; for BO VMAs a GEM object reference is taken instead.
 *
 * Return: the new VMA, or ERR_PTR on allocation/setup failure.
 */
static struct xe_vma *xe_vma_create(struct xe_vm *vm,
				    struct xe_bo *bo,
				    u64 bo_offset_or_userptr,
				    u64 start, u64 end,
				    struct xe_vma_mem_attr *attr,
				    unsigned int flags)
{
	struct xe_vma *vma;
	struct xe_tile *tile;
	u8 id;
	bool is_null = (flags & DRM_GPUVA_SPARSE);
	bool is_cpu_addr_mirror = (flags & XE_VMA_SYSTEM_ALLOCATOR);

	xe_assert(vm->xe, start < end);
	xe_assert(vm->xe, end < vm->size);

	/*
	 * Allocate and ensure that the xe_vma_is_userptr() return
	 * matches what was allocated.
	 */
	if (!bo && !is_null && !is_cpu_addr_mirror) {
		struct xe_userptr_vma *uvma = kzalloc_obj(*uvma);

		if (!uvma)
			return ERR_PTR(-ENOMEM);

		vma = &uvma->vma;
	} else {
		vma = kzalloc_obj(*vma);
		if (!vma)
			return ERR_PTR(-ENOMEM);

		if (bo)
			vma->gpuva.gem.obj = &bo->ttm.base;
	}

	INIT_LIST_HEAD(&vma->combined_links.rebind);

	INIT_LIST_HEAD(&vma->gpuva.gem.entry);
	vma->gpuva.vm = &vm->gpuvm;
	vma->gpuva.va.addr = start;
	vma->gpuva.va.range = end - start + 1;	/* @end is inclusive */
	vma->gpuva.flags = flags;

	/* New VMAs are considered present on all tiles. */
	for_each_tile(tile, vm->xe, id)
		vma->tile_mask |= 0x1 << id;

	if (vm->xe->info.has_atomic_enable_pte_bit)
		vma->gpuva.flags |= XE_VMA_ATOMIC_PTE_BIT;

	xe_vma_mem_attr_copy(&vma->attr, attr);
	if (bo) {
		struct drm_gpuvm_bo *vm_bo;

		xe_bo_assert_held(bo);

		vm_bo = drm_gpuvm_bo_obtain_locked(vma->gpuva.vm, &bo->ttm.base);
		if (IS_ERR(vm_bo)) {
			xe_vma_free(vma);
			return ERR_CAST(vm_bo);
		}

		drm_gpuvm_bo_extobj_add(vm_bo);
		/* VMA holds a GEM object reference for its lifetime. */
		drm_gem_object_get(&bo->ttm.base);
		vma->gpuva.gem.offset = bo_offset_or_userptr;
		drm_gpuva_link(&vma->gpuva, vm_bo);
		drm_gpuvm_bo_put(vm_bo);
	} else /* userptr or null */ {
		if (!is_null && !is_cpu_addr_mirror) {
			struct xe_userptr_vma *uvma = to_userptr_vma(vma);
			u64 size = end - start + 1;
			int err;

			/* For userptr, gem.offset holds the CPU address. */
			vma->gpuva.gem.offset = bo_offset_or_userptr;

			err = xe_userptr_setup(uvma, xe_vma_userptr(vma), size);
			if (err) {
				xe_vma_free(vma);
				return ERR_PTR(err);
			}
		}

		/* Non-BO VMAs pin the VM instead of a GEM object. */
		xe_vm_get(vm);
	}

	return vma;
}
1077 
1078 static void xe_vma_destroy_late(struct xe_vma *vma)
1079 {
1080 	struct xe_vm *vm = xe_vma_vm(vma);
1081 
1082 	if (vma->ufence) {
1083 		xe_sync_ufence_put(vma->ufence);
1084 		vma->ufence = NULL;
1085 	}
1086 
1087 	if (xe_vma_is_userptr(vma)) {
1088 		struct xe_userptr_vma *uvma = to_userptr_vma(vma);
1089 
1090 		xe_userptr_remove(uvma);
1091 		xe_vm_put(vm);
1092 	} else if (xe_vma_is_null(vma) || xe_vma_is_cpu_addr_mirror(vma)) {
1093 		xe_vm_put(vm);
1094 	} else {
1095 		xe_bo_put(xe_vma_bo(vma));
1096 	}
1097 
1098 	xe_vma_free(vma);
1099 }
1100 
1101 static void vma_destroy_work_func(struct work_struct *w)
1102 {
1103 	struct xe_vma *vma =
1104 		container_of(w, struct xe_vma, destroy_work);
1105 
1106 	xe_vma_destroy_late(vma);
1107 }
1108 
1109 static void vma_destroy_cb(struct dma_fence *fence,
1110 			   struct dma_fence_cb *cb)
1111 {
1112 	struct xe_vma *vma = container_of(cb, struct xe_vma, destroy_cb);
1113 
1114 	INIT_WORK(&vma->destroy_work, vma_destroy_work_func);
1115 	queue_work(system_dfl_wq, &vma->destroy_work);
1116 }
1117 
/*
 * Start destroying @vma. Unlinks it from its tracking structures and then
 * either tears it down immediately, or — when @fence is given — defers the
 * final teardown until @fence signals (via vma_destroy_cb).
 */
static void xe_vma_destroy(struct xe_vma *vma, struct dma_fence *fence)
{
	struct xe_vm *vm = xe_vma_vm(vma);

	lockdep_assert_held_write(&vm->lock);
	xe_assert(vm->xe, list_empty(&vma->combined_links.destroy));

	if (xe_vma_is_userptr(vma)) {
		xe_assert(vm->xe, vma->gpuva.flags & XE_VMA_DESTROYED);
		xe_userptr_destroy(to_userptr_vma(vma));
	} else if (!xe_vma_is_null(vma) && !xe_vma_is_cpu_addr_mirror(vma)) {
		/* BO-backed VMA: requires the BO resv to unlink the gpuva. */
		xe_bo_assert_held(xe_vma_bo(vma));

		drm_gpuva_unlink(&vma->gpuva);
	}

	xe_vm_assert_held(vm);
	if (fence) {
		int ret = dma_fence_add_callback(fence, &vma->destroy_cb,
						 vma_destroy_cb);

		/* -ENOENT means the fence already signaled: destroy now. */
		if (ret) {
			XE_WARN_ON(ret != -ENOENT);
			xe_vma_destroy_late(vma);
		}
	} else {
		xe_vma_destroy_late(vma);
	}
}
1147 
1148 /**
1149  * xe_vm_lock_vma() - drm_exec utility to lock a vma
1150  * @exec: The drm_exec object we're currently locking for.
1151  * @vma: The vma for witch we want to lock the vm resv and any attached
1152  * object's resv.
1153  *
1154  * Return: 0 on success, negative error code on error. In particular
1155  * may return -EDEADLK on WW transaction contention and -EINTR if
1156  * an interruptible wait is terminated by a signal.
1157  */
1158 int xe_vm_lock_vma(struct drm_exec *exec, struct xe_vma *vma)
1159 {
1160 	struct xe_vm *vm = xe_vma_vm(vma);
1161 	struct xe_bo *bo = xe_vma_bo(vma);
1162 	int err;
1163 
1164 	XE_WARN_ON(!vm);
1165 
1166 	err = drm_exec_lock_obj(exec, xe_vm_obj(vm));
1167 	if (!err && bo && !bo->vm)
1168 		err = drm_exec_lock_obj(exec, &bo->ttm.base);
1169 
1170 	return err;
1171 }
1172 
/*
 * Destroy @vma without the caller holding the vm's (and bo's) resv; the
 * required locks are taken via a local drm_exec / validation transaction.
 */
static void xe_vma_destroy_unlocked(struct xe_vma *vma)
{
	struct xe_device *xe = xe_vma_vm(vma)->xe;
	struct xe_validation_ctx ctx;
	struct drm_exec exec;
	int err = 0;

	xe_validation_guard(&ctx, &xe->val, &exec, (struct xe_val_flags) {}, err) {
		err = xe_vm_lock_vma(&exec, vma);
		drm_exec_retry_on_contention(&exec);
		if (XE_WARN_ON(err))
			break;
		xe_vma_destroy(vma, NULL);
	}
	/* Locking shouldn't fail here short of a bug; nothing to propagate. */
	xe_assert(xe, !err);
}
1189 
1190 struct xe_vma *
1191 xe_vm_find_overlapping_vma(struct xe_vm *vm, u64 start, u64 range)
1192 {
1193 	struct drm_gpuva *gpuva;
1194 
1195 	lockdep_assert_held(&vm->lock);
1196 
1197 	if (xe_vm_is_closed_or_banned(vm))
1198 		return NULL;
1199 
1200 	xe_assert(vm->xe, start + range <= vm->size);
1201 
1202 	gpuva = drm_gpuva_find_first(&vm->gpuvm, start, range);
1203 
1204 	return gpuva ? gpuva_to_vma(gpuva) : NULL;
1205 }
1206 
/*
 * Insert @vma into the vm's gpuva tree. snap_mutex keeps the tree stable
 * while held — presumably against concurrent snapshot capture; confirm
 * against the snap_mutex users elsewhere in this file.
 */
static int xe_vm_insert_vma(struct xe_vm *vm, struct xe_vma *vma)
{
	int err;

	xe_assert(vm->xe, xe_vma_vm(vma) == vm);
	lockdep_assert_held(&vm->lock);

	mutex_lock(&vm->snap_mutex);
	err = drm_gpuva_insert(&vm->gpuvm, &vma->gpuva);
	mutex_unlock(&vm->snap_mutex);
	XE_WARN_ON(err);	/* Shouldn't be possible */

	return err;
}
1221 
/*
 * Remove @vma from the vm's gpuva tree and drop it from the fault-vma
 * fast-lookup cache if it is cached there.
 */
static void xe_vm_remove_vma(struct xe_vm *vm, struct xe_vma *vma)
{
	xe_assert(vm->xe, xe_vma_vm(vma) == vm);
	lockdep_assert_held(&vm->lock);

	mutex_lock(&vm->snap_mutex);
	drm_gpuva_remove(&vma->gpuva);
	mutex_unlock(&vm->snap_mutex);
	if (vm->usm.last_fault_vma == vma)
		vm->usm.last_fault_vma = NULL;
}
1233 
1234 static struct drm_gpuva_op *xe_vm_op_alloc(void)
1235 {
1236 	struct xe_vma_op *op;
1237 
1238 	op = kzalloc_obj(*op);
1239 
1240 	if (unlikely(!op))
1241 		return NULL;
1242 
1243 	return &op->base;
1244 }
1245 
static void xe_vm_free(struct drm_gpuvm *gpuvm);

/* Hooks connecting the drm_gpuvm core to xe's op allocation, validation
 * and vm teardown.
 */
static const struct drm_gpuvm_ops gpuvm_ops = {
	.op_alloc = xe_vm_op_alloc,
	.vm_bo_validate = xe_gpuvm_validate,
	.vm_free = xe_vm_free,
};
1253 
1254 static u64 pde_encode_pat_index(u16 pat_index)
1255 {
1256 	u64 pte = 0;
1257 
1258 	if (pat_index & BIT(0))
1259 		pte |= XE_PPGTT_PTE_PAT0;
1260 
1261 	if (pat_index & BIT(1))
1262 		pte |= XE_PPGTT_PTE_PAT1;
1263 
1264 	return pte;
1265 }
1266 
1267 static u64 pte_encode_pat_index(u16 pat_index, u32 pt_level)
1268 {
1269 	u64 pte = 0;
1270 
1271 	if (pat_index & BIT(0))
1272 		pte |= XE_PPGTT_PTE_PAT0;
1273 
1274 	if (pat_index & BIT(1))
1275 		pte |= XE_PPGTT_PTE_PAT1;
1276 
1277 	if (pat_index & BIT(2)) {
1278 		if (pt_level)
1279 			pte |= XE_PPGTT_PDE_PDPE_PAT2;
1280 		else
1281 			pte |= XE_PPGTT_PTE_PAT2;
1282 	}
1283 
1284 	if (pat_index & BIT(3))
1285 		pte |= XELPG_PPGTT_PTE_PAT3;
1286 
1287 	if (pat_index & (BIT(4)))
1288 		pte |= XE2_PPGTT_PTE_PAT4;
1289 
1290 	return pte;
1291 }
1292 
1293 static u64 pte_encode_ps(u32 pt_level)
1294 {
1295 	XE_WARN_ON(pt_level > MAX_HUGEPTE_LEVEL);
1296 
1297 	if (pt_level == 1)
1298 		return XE_PDE_PS_2M;
1299 	else if (pt_level == 2)
1300 		return XE_PDPE_PS_1G;
1301 
1302 	return 0;
1303 }
1304 
1305 static u16 pde_pat_index(struct xe_bo *bo)
1306 {
1307 	struct xe_device *xe = xe_bo_device(bo);
1308 	u16 pat_index;
1309 
1310 	/*
1311 	 * We only have two bits to encode the PAT index in non-leaf nodes, but
1312 	 * these only point to other paging structures so we only need a minimal
1313 	 * selection of options. The user PAT index is only for encoding leaf
1314 	 * nodes, where we have use of more bits to do the encoding. The
1315 	 * non-leaf nodes are instead under driver control so the chosen index
1316 	 * here should be distinct from the user PAT index. Also the
1317 	 * corresponding coherency of the PAT index should be tied to the
1318 	 * allocation type of the page table (or at least we should pick
1319 	 * something which is always safe).
1320 	 */
1321 	if (!xe_bo_is_vram(bo) && bo->ttm.ttm->caching == ttm_cached)
1322 		pat_index = xe->pat.idx[XE_CACHE_WB];
1323 	else
1324 		pat_index = xe->pat.idx[XE_CACHE_NONE];
1325 
1326 	xe_assert(xe, pat_index <= 3);
1327 
1328 	return pat_index;
1329 }
1330 
1331 static u64 xelp_pde_encode_bo(struct xe_bo *bo, u64 bo_offset)
1332 {
1333 	u64 pde;
1334 
1335 	pde = xe_bo_addr(bo, bo_offset, XE_PAGE_SIZE);
1336 	pde |= XE_PAGE_PRESENT | XE_PAGE_RW;
1337 	pde |= pde_encode_pat_index(pde_pat_index(bo));
1338 
1339 	return pde;
1340 }
1341 
1342 static u64 xelp_pte_encode_bo(struct xe_bo *bo, u64 bo_offset,
1343 			      u16 pat_index, u32 pt_level)
1344 {
1345 	u64 pte;
1346 
1347 	pte = xe_bo_addr(bo, bo_offset, XE_PAGE_SIZE);
1348 	pte |= XE_PAGE_PRESENT | XE_PAGE_RW;
1349 	pte |= pte_encode_pat_index(pat_index, pt_level);
1350 	pte |= pte_encode_ps(pt_level);
1351 
1352 	if (xe_bo_is_vram(bo) || xe_bo_is_stolen_devmem(bo))
1353 		pte |= XE_PPGTT_PTE_DM;
1354 
1355 	return pte;
1356 }
1357 
1358 static u64 xelp_pte_encode_vma(u64 pte, struct xe_vma *vma,
1359 			       u16 pat_index, u32 pt_level)
1360 {
1361 	pte |= XE_PAGE_PRESENT;
1362 
1363 	if (likely(!xe_vma_read_only(vma)))
1364 		pte |= XE_PAGE_RW;
1365 
1366 	pte |= pte_encode_pat_index(pat_index, pt_level);
1367 	pte |= pte_encode_ps(pt_level);
1368 
1369 	if (unlikely(xe_vma_is_null(vma)))
1370 		pte |= XE_PTE_NULL;
1371 
1372 	return pte;
1373 }
1374 
1375 static u64 xelp_pte_encode_addr(struct xe_device *xe, u64 addr,
1376 				u16 pat_index,
1377 				u32 pt_level, bool devmem, u64 flags)
1378 {
1379 	u64 pte;
1380 
1381 	/* Avoid passing random bits directly as flags */
1382 	xe_assert(xe, !(flags & ~XE_PTE_PS64));
1383 
1384 	pte = addr;
1385 	pte |= XE_PAGE_PRESENT | XE_PAGE_RW;
1386 	pte |= pte_encode_pat_index(pat_index, pt_level);
1387 	pte |= pte_encode_ps(pt_level);
1388 
1389 	if (devmem)
1390 		pte |= XE_PPGTT_PTE_DM;
1391 
1392 	pte |= flags;
1393 
1394 	return pte;
1395 }
1396 
/* PTE/PDE encoding vtable used by the page-table code. */
static const struct xe_pt_ops xelp_pt_ops = {
	.pte_encode_bo = xelp_pte_encode_bo,
	.pte_encode_vma = xelp_pte_encode_vma,
	.pte_encode_addr = xelp_pte_encode_addr,
	.pde_encode_bo = xelp_pde_encode_bo,
};
1403 
1404 static void vm_destroy_work_func(struct work_struct *w);
1405 
1406 /**
1407  * xe_vm_create_scratch() - Setup a scratch memory pagetable tree for the
1408  * given tile and vm.
1409  * @xe: xe device.
1410  * @tile: tile to set up for.
1411  * @vm: vm to set up for.
1412  * @exec: The struct drm_exec object used to lock the vm resv.
1413  *
1414  * Sets up a pagetable tree with one page-table per level and a single
1415  * leaf PTE. All pagetable entries point to the single page-table or,
1416  * for MAX_HUGEPTE_LEVEL, a NULL huge PTE returning 0 on read and
1417  * writes become NOPs.
1418  *
1419  * Return: 0 on success, negative error code on error.
1420  */
1421 static int xe_vm_create_scratch(struct xe_device *xe, struct xe_tile *tile,
1422 				struct xe_vm *vm, struct drm_exec *exec)
1423 {
1424 	u8 id = tile->id;
1425 	int i;
1426 
1427 	for (i = MAX_HUGEPTE_LEVEL; i < vm->pt_root[id]->level; i++) {
1428 		vm->scratch_pt[id][i] = xe_pt_create(vm, tile, i, exec);
1429 		if (IS_ERR(vm->scratch_pt[id][i])) {
1430 			int err = PTR_ERR(vm->scratch_pt[id][i]);
1431 
1432 			vm->scratch_pt[id][i] = NULL;
1433 			return err;
1434 		}
1435 		xe_pt_populate_empty(tile, vm, vm->scratch_pt[id][i]);
1436 	}
1437 
1438 	return 0;
1439 }
1440 ALLOW_ERROR_INJECTION(xe_vm_create_scratch, ERRNO);
1441 
/*
 * Free all scratch page tables of @vm. Safe to call when scratch is
 * disabled or only partially set up (NULL entries are skipped).
 */
static void xe_vm_free_scratch(struct xe_vm *vm)
{
	struct xe_tile *tile;
	u8 id;

	if (!xe_vm_has_scratch(vm))
		return;

	for_each_tile(tile, vm->xe, id) {
		u32 i;

		if (!vm->pt_root[id])
			continue;

		for (i = MAX_HUGEPTE_LEVEL; i < vm->pt_root[id]->level; ++i)
			if (vm->scratch_pt[id][i])
				xe_pt_destroy(vm->scratch_pt[id][i], vm->flags, NULL);
	}
}
1461 
1462 static void xe_vm_pt_destroy(struct xe_vm *vm)
1463 {
1464 	struct xe_tile *tile;
1465 	u8 id;
1466 
1467 	xe_vm_assert_held(vm);
1468 
1469 	for_each_tile(tile, vm->xe, id) {
1470 		if (vm->pt_root[id]) {
1471 			xe_pt_destroy(vm->pt_root[id], vm->flags, NULL);
1472 			vm->pt_root[id] = NULL;
1473 		}
1474 	}
1475 }
1476 
/*
 * Prime lockdep with the lock ordering this driver relies on:
 * exec_queues.lock may be taken in reclaim context, and the GuC CT lock
 * nests inside exec_queues.lock. A no-op unless CONFIG_PROVE_LOCKING.
 */
static void xe_vm_init_prove_locking(struct xe_device *xe, struct xe_vm *vm)
{
	if (!IS_ENABLED(CONFIG_PROVE_LOCKING))
		return;

	/* Tell lockdep exec_queues.lock can be acquired under fs_reclaim. */
	fs_reclaim_acquire(GFP_KERNEL);
	might_lock(&vm->exec_queues.lock);
	fs_reclaim_release(GFP_KERNEL);

	/* Establish exec_queues.lock -> GuC CT lock ordering. */
	down_read(&vm->exec_queues.lock);
	might_lock(&xe_root_mmio_gt(xe)->uc.guc.ct.lock);
	up_read(&vm->exec_queues.lock);
}
1490 
/**
 * xe_vm_create() - Create a vm
 * @xe: xe device.
 * @flags: XE_VM_FLAG_* flags selecting the vm's mode of operation.
 * @xef: xe file the vm is created on behalf of, or NULL for a
 * kernel-internal vm.
 *
 * Return: pointer to the new vm on success, ERR_PTR() on failure.
 */
struct xe_vm *xe_vm_create(struct xe_device *xe, u32 flags, struct xe_file *xef)
{
	struct drm_gem_object *vm_resv_obj;
	struct xe_validation_ctx ctx;
	struct drm_exec exec;
	struct xe_vm *vm;
	int err;
	struct xe_tile *tile;
	u8 id;

	/*
	 * Since the GSCCS is not user-accessible, we don't expect a GSC VM to
	 * ever be in faulting mode.
	 */
	xe_assert(xe, !((flags & XE_VM_FLAG_GSC) && (flags & XE_VM_FLAG_FAULT_MODE)));

	vm = kzalloc(sizeof(*vm), GFP_KERNEL);
	if (!vm)
		return ERR_PTR(-ENOMEM);

	vm->xe = xe;

	vm->size = 1ull << xe->info.va_bits;
	vm->flags = flags;

	if (xef)
		vm->xef = xe_file_get(xef);
	/**
	 * GSC VMs are kernel-owned, only used for PXP ops and can sometimes be
	 * manipulated under the PXP mutex. However, the PXP mutex can be taken
	 * under a user-VM lock when the PXP session is started at exec_queue
	 * creation time. Those are different VMs and therefore there is no risk
	 * of deadlock, but we need to tell lockdep that this is the case or it
	 * will print a warning.
	 */
	if (flags & XE_VM_FLAG_GSC) {
		static struct lock_class_key gsc_vm_key;

		__init_rwsem(&vm->lock, "gsc_vm", &gsc_vm_key);
	} else {
		init_rwsem(&vm->lock);
	}
	mutex_init(&vm->snap_mutex);

	INIT_LIST_HEAD(&vm->rebind_list);

	INIT_LIST_HEAD(&vm->userptr.repin_list);
	INIT_LIST_HEAD(&vm->userptr.invalidated);
	spin_lock_init(&vm->userptr.invalidated_lock);

	ttm_lru_bulk_move_init(&vm->lru_bulk_move);

	INIT_WORK(&vm->destroy_work, vm_destroy_work_func);

	INIT_LIST_HEAD(&vm->preempt.exec_queues);
	for (id = 0; id < XE_MAX_TILES_PER_DEVICE * XE_MAX_GT_PER_TILE; ++id)
		INIT_LIST_HEAD(&vm->exec_queues.list[id]);
	if (flags & XE_VM_FLAG_FAULT_MODE)
		vm->preempt.min_run_period_ms = xe->min_run_period_pf_ms;
	else
		vm->preempt.min_run_period_ms = xe->min_run_period_lr_ms;

	init_rwsem(&vm->exec_queues.lock);
	xe_vm_init_prove_locking(xe, vm);

	for_each_tile(tile, xe, id)
		xe_range_fence_tree_init(&vm->rftree[id]);

	vm->pt_ops = &xelp_pt_ops;

	/*
	 * Long-running workloads are not protected by the scheduler references.
	 * By design, run_job for long-running workloads returns NULL and the
	 * scheduler drops all the references of it, hence protecting the VM
	 * for this case is necessary.
	 */
	if (flags & XE_VM_FLAG_LR_MODE) {
		INIT_WORK(&vm->preempt.rebind_work, preempt_rebind_work_func);
		xe_pm_runtime_get_noresume(xe);
		INIT_LIST_HEAD(&vm->preempt.pm_activate_link);
	}

	err = xe_svm_init(vm);
	if (err)
		goto err_no_resv;

	vm_resv_obj = drm_gpuvm_resv_object_alloc(&xe->drm);
	if (!vm_resv_obj) {
		err = -ENOMEM;
		goto err_svm_fini;
	}

	drm_gpuvm_init(&vm->gpuvm, "Xe VM", DRM_GPUVM_RESV_PROTECTED, &xe->drm,
		       vm_resv_obj, 0, vm->size, 0, 0, &gpuvm_ops);

	/* drm_gpuvm_init() holds its own reference to the resv object. */
	drm_gem_object_put(vm_resv_obj);

	err = 0;
	xe_validation_guard(&ctx, &xe->val, &exec, (struct xe_val_flags) {.interruptible = true},
			    err) {
		err = xe_vm_drm_exec_lock(vm, &exec);
		drm_exec_retry_on_contention(&exec);

		if (IS_DGFX(xe) && xe->info.vram_flags & XE_VRAM_FLAGS_NEED64K)
			vm->flags |= XE_VM_FLAG_64K;

		/* Create a page-table root per tile (one tile for migration vms). */
		for_each_tile(tile, xe, id) {
			if (flags & XE_VM_FLAG_MIGRATION &&
			    tile->id != XE_VM_FLAG_TILE_ID(flags))
				continue;

			vm->pt_root[id] = xe_pt_create(vm, tile, xe->info.vm_max_level,
						       &exec);
			if (IS_ERR(vm->pt_root[id])) {
				err = PTR_ERR(vm->pt_root[id]);
				vm->pt_root[id] = NULL;
				xe_vm_pt_destroy(vm);
				drm_exec_retry_on_contention(&exec);
				xe_validation_retry_on_oom(&ctx, &err);
				break;
			}
		}
		if (err)
			break;

		if (xe_vm_has_scratch(vm)) {
			for_each_tile(tile, xe, id) {
				if (!vm->pt_root[id])
					continue;

				err = xe_vm_create_scratch(xe, tile, vm, &exec);
				if (err) {
					xe_vm_free_scratch(vm);
					xe_vm_pt_destroy(vm);
					drm_exec_retry_on_contention(&exec);
					xe_validation_retry_on_oom(&ctx, &err);
					break;
				}
			}
			if (err)
				break;
			vm->batch_invalidate_tlb = true;
		}

		/*
		 * NOTE(review): rebind_work was already initialized in the
		 * LR-mode block above; this INIT_WORK looks redundant —
		 * harmless while the work is unqueued, but worth confirming.
		 */
		if (vm->flags & XE_VM_FLAG_LR_MODE) {
			INIT_WORK(&vm->preempt.rebind_work, preempt_rebind_work_func);
			vm->batch_invalidate_tlb = false;
		}

		/* Fill pt_root after allocating scratch tables */
		for_each_tile(tile, xe, id) {
			if (!vm->pt_root[id])
				continue;

			xe_pt_populate_empty(tile, vm, vm->pt_root[id]);
		}
	}
	if (err)
		goto err_close;

	/* Kernel migration VM shouldn't have a circular loop.. */
	if (!(flags & XE_VM_FLAG_MIGRATION)) {
		for_each_tile(tile, xe, id) {
			struct xe_exec_queue *q;
			u32 create_flags = EXEC_QUEUE_FLAG_VM;

			if (!vm->pt_root[id])
				continue;

			if (!xef) /* Not from userspace */
				create_flags |= EXEC_QUEUE_FLAG_KERNEL;

			q = xe_exec_queue_create_bind(xe, tile, vm, create_flags, 0);
			if (IS_ERR(q)) {
				err = PTR_ERR(q);
				goto err_close;
			}
			vm->q[id] = q;
		}
	}

	if (xef && xe->info.has_asid) {
		u32 asid;

		down_write(&xe->usm.lock);
		err = xa_alloc_cyclic(&xe->usm.asid_to_vm, &asid, vm,
				      XA_LIMIT(1, XE_MAX_ASID - 1),
				      &xe->usm.next_asid, GFP_NOWAIT);
		up_write(&xe->usm.lock);
		if (err < 0)
			goto err_close;

		vm->usm.asid = asid;
	}

	trace_xe_vm_create(vm);

	return vm;

err_close:
	/* Once the gpuvm is initialized, the full teardown path applies. */
	xe_vm_close_and_put(vm);
	return ERR_PTR(err);

err_svm_fini:
	if (flags & XE_VM_FLAG_FAULT_MODE) {
		vm->size = 0; /* close the vm */
		xe_svm_fini(vm);
	}
err_no_resv:
	mutex_destroy(&vm->snap_mutex);
	for_each_tile(tile, xe, id)
		xe_range_fence_tree_fini(&vm->rftree[id]);
	ttm_lru_bulk_move_fini(&xe->ttm, &vm->lru_bulk_move);
	if (vm->xef)
		xe_file_put(vm->xef);
	kfree(vm);
	if (flags & XE_VM_FLAG_LR_MODE)
		xe_pm_runtime_put(xe);
	return ERR_PTR(err);
}
1711 
/*
 * Close the vm: zero its size so it reads as closed, then clear the
 * page-table roots and invalidate TLBs so the HW can no longer access
 * anything through this vm.
 */
static void xe_vm_close(struct xe_vm *vm)
{
	struct xe_device *xe = vm->xe;
	bool bound;
	int idx;

	bound = drm_dev_enter(&xe->drm, &idx);

	down_write(&vm->lock);
	if (xe_vm_in_fault_mode(vm))
		xe_svm_notifier_lock(vm);

	vm->size = 0;

	if (!((vm->flags & XE_VM_FLAG_MIGRATION))) {
		struct xe_tile *tile;
		struct xe_gt *gt;
		u8 id;

		/* Wait for pending binds */
		dma_resv_wait_timeout(xe_vm_resv(vm),
				      DMA_RESV_USAGE_BOOKKEEP,
				      false, MAX_SCHEDULE_TIMEOUT);

		/* Only touch the HW if the device hasn't been unplugged. */
		if (bound) {
			for_each_tile(tile, xe, id)
				if (vm->pt_root[id])
					xe_pt_clear(xe, vm->pt_root[id]);

			for_each_gt(gt, xe, id)
				xe_tlb_inval_vm(&gt->tlb_inval, vm);
		}
	}

	if (xe_vm_in_fault_mode(vm))
		xe_svm_notifier_unlock(vm);
	up_write(&vm->lock);

	if (bound)
		drm_dev_exit(idx);
}
1753 
/**
 * xe_vm_close_and_put() - Close the vm and drop the creation reference
 * @vm: the vm
 *
 * Tears down all mappings, bind queues, scratch tables and page-table
 * roots, releases the ASID (if any), then drops the reference taken at
 * creation. Final freeing happens asynchronously via xe_vm_free() once
 * all remaining references are gone.
 */
void xe_vm_close_and_put(struct xe_vm *vm)
{
	LIST_HEAD(contested);
	struct xe_device *xe = vm->xe;
	struct xe_tile *tile;
	struct xe_vma *vma, *next_vma;
	struct drm_gpuva *gpuva, *next;
	u8 id;

	xe_assert(xe, !vm->preempt.num_exec_queues);

	xe_vm_close(vm);
	if (xe_vm_in_preempt_fence_mode(vm)) {
		mutex_lock(&xe->rebind_resume_lock);
		list_del_init(&vm->preempt.pm_activate_link);
		mutex_unlock(&xe->rebind_resume_lock);
		flush_work(&vm->preempt.rebind_work);
	}
	if (xe_vm_in_fault_mode(vm))
		xe_svm_close(vm);

	/* Drop last fences before killing the bind queues. */
	down_write(&vm->lock);
	for_each_tile(tile, xe, id) {
		if (vm->q[id]) {
			int i;

			xe_exec_queue_last_fence_put(vm->q[id], vm);
			for_each_tlb_inval(i)
				xe_exec_queue_tlb_inval_last_fence_put(vm->q[id], vm, i);
		}
	}
	up_write(&vm->lock);

	for_each_tile(tile, xe, id) {
		if (vm->q[id]) {
			xe_exec_queue_kill(vm->q[id]);
			xe_exec_queue_put(vm->q[id]);
			vm->q[id] = NULL;
		}
	}

	down_write(&vm->lock);
	xe_vm_lock(vm, false);
	drm_gpuvm_for_each_va_safe(gpuva, next, &vm->gpuvm) {
		vma = gpuva_to_vma(gpuva);

		if (xe_vma_has_no_bo(vma)) {
			xe_svm_notifier_lock(vm);
			vma->gpuva.flags |= XE_VMA_DESTROYED;
			xe_svm_notifier_unlock(vm);
		}

		xe_vm_remove_vma(vm, vma);

		/* easy case, remove from VMA? */
		if (xe_vma_has_no_bo(vma) || xe_vma_bo(vma)->vm) {
			list_del_init(&vma->combined_links.rebind);
			xe_vma_destroy(vma, NULL);
			continue;
		}

		/* External bo: its resv isn't held here; destroy later. */
		list_move_tail(&vma->combined_links.destroy, &contested);
		vma->gpuva.flags |= XE_VMA_DESTROYED;
	}

	/*
	 * All vm operations will add shared fences to resv.
	 * The only exception is eviction for a shared object,
	 * but even so, the unbind when evicted would still
	 * install a fence to resv. Hence it's safe to
	 * destroy the pagetables immediately.
	 */
	xe_vm_free_scratch(vm);
	xe_vm_pt_destroy(vm);
	xe_vm_unlock(vm);

	/*
	 * VM is now dead, cannot re-add nodes to vm->vmas if it's NULL
	 * Since we hold a refcount to the bo, we can remove and free
	 * the members safely without locking.
	 */
	list_for_each_entry_safe(vma, next_vma, &contested,
				 combined_links.destroy) {
		list_del_init(&vma->combined_links.destroy);
		xe_vma_destroy_unlocked(vma);
	}

	xe_svm_fini(vm);

	up_write(&vm->lock);

	down_write(&xe->usm.lock);
	if (vm->usm.asid) {
		void *lookup;

		xe_assert(xe, xe->info.has_asid);
		xe_assert(xe, !(vm->flags & XE_VM_FLAG_MIGRATION));

		lookup = xa_erase(&xe->usm.asid_to_vm, vm->usm.asid);
		xe_assert(xe, lookup == vm);
	}
	up_write(&xe->usm.lock);

	for_each_tile(tile, xe, id)
		xe_range_fence_tree_fini(&vm->rftree[id]);

	xe_vm_put(vm);
}
1862 
/*
 * Final vm teardown; queued from xe_vm_free() once the last reference is
 * dropped, so it runs in process context and may sleep.
 */
static void vm_destroy_work_func(struct work_struct *w)
{
	struct xe_vm *vm =
		container_of(w, struct xe_vm, destroy_work);
	struct xe_device *xe = vm->xe;
	struct xe_tile *tile;
	u8 id;

	/* xe_vm_close_and_put was not called? */
	xe_assert(xe, !vm->size);

	if (xe_vm_in_preempt_fence_mode(vm))
		flush_work(&vm->preempt.rebind_work);

	mutex_destroy(&vm->snap_mutex);

	/* Release the runtime-pm reference taken for LR vms at creation. */
	if (vm->flags & XE_VM_FLAG_LR_MODE)
		xe_pm_runtime_put(xe);

	for_each_tile(tile, xe, id)
		XE_WARN_ON(vm->pt_root[id]);

	trace_xe_vm_free(vm);

	ttm_lru_bulk_move_fini(&xe->ttm, &vm->lru_bulk_move);

	if (vm->xef)
		xe_file_put(vm->xef);

	kfree(vm);
}
1894 
/* drm_gpuvm vm_free hook: defers teardown to process context. */
static void xe_vm_free(struct drm_gpuvm *gpuvm)
{
	struct xe_vm *vm = container_of(gpuvm, struct xe_vm, gpuvm);

	/* To destroy the VM we need to be able to sleep */
	queue_work(system_dfl_wq, &vm->destroy_work);
}
1902 
/*
 * Look up the vm with user-visible @id in @xef, taking a reference the
 * caller must drop with xe_vm_put(). Returns NULL if no such vm exists.
 */
struct xe_vm *xe_vm_lookup(struct xe_file *xef, u32 id)
{
	struct xe_vm *vm;

	mutex_lock(&xef->vm.lock);
	vm = xa_load(&xef->vm.xa, id);
	if (vm)
		xe_vm_get(vm);
	mutex_unlock(&xef->vm.lock);

	return vm;
}
1915 
/* Encode the PDE for @vm's page-table root bo on @tile. */
u64 xe_vm_pdp4_descriptor(struct xe_vm *vm, struct xe_tile *tile)
{
	return vm->pt_ops->pde_encode_bo(vm->pt_root[tile->id]->bo, 0);
}
1920 
1921 static struct xe_exec_queue *
1922 to_wait_exec_queue(struct xe_vm *vm, struct xe_exec_queue *q)
1923 {
1924 	return q ? q : vm->q[0];
1925 }
1926 
1927 static struct xe_user_fence *
1928 find_ufence_get(struct xe_sync_entry *syncs, u32 num_syncs)
1929 {
1930 	unsigned int i;
1931 
1932 	for (i = 0; i < num_syncs; i++) {
1933 		struct xe_sync_entry *e = &syncs[i];
1934 
1935 		if (xe_sync_is_ufence(e))
1936 			return xe_sync_ufence_get(e);
1937 	}
1938 
1939 	return NULL;
1940 }
1941 
/* The complete set of DRM_XE_VM_CREATE flags accepted from userspace. */
#define ALL_DRM_XE_VM_CREATE_FLAGS (DRM_XE_VM_CREATE_FLAG_SCRATCH_PAGE | \
				    DRM_XE_VM_CREATE_FLAG_LR_MODE | \
				    DRM_XE_VM_CREATE_FLAG_FAULT_MODE | \
				    DRM_XE_VM_CREATE_FLAG_NO_VM_OVERCOMMIT)
1946 
/*
 * IOCTL: validate the userspace flags, create the vm and publish a
 * user-visible id for it.
 */
int xe_vm_create_ioctl(struct drm_device *dev, void *data,
		       struct drm_file *file)
{
	struct xe_device *xe = to_xe_device(dev);
	struct xe_file *xef = to_xe_file(file);
	struct drm_xe_vm_create *args = data;
	struct xe_gt *wa_gt = xe_root_mmio_gt(xe);
	struct xe_vm *vm;
	u32 id;
	int err;
	u32 flags = 0;

	if (XE_IOCTL_DBG(xe, args->extensions))
		return -EINVAL;

	/* Workaround 22014953428: force scratch pages on affected GTs. */
	if (wa_gt && XE_GT_WA(wa_gt, 22014953428))
		args->flags |= DRM_XE_VM_CREATE_FLAG_SCRATCH_PAGE;

	if (XE_IOCTL_DBG(xe, args->flags & DRM_XE_VM_CREATE_FLAG_FAULT_MODE &&
			 !xe->info.has_usm))
		return -EINVAL;

	if (XE_IOCTL_DBG(xe, args->reserved[0] || args->reserved[1]))
		return -EINVAL;

	if (XE_IOCTL_DBG(xe, args->flags & ~ALL_DRM_XE_VM_CREATE_FLAGS))
		return -EINVAL;

	if (XE_IOCTL_DBG(xe, args->flags & DRM_XE_VM_CREATE_FLAG_SCRATCH_PAGE &&
			 args->flags & DRM_XE_VM_CREATE_FLAG_FAULT_MODE &&
			 !xe->info.needs_scratch))
		return -EINVAL;

	/* Fault mode implies LR mode. */
	if (XE_IOCTL_DBG(xe, !(args->flags & DRM_XE_VM_CREATE_FLAG_LR_MODE) &&
			 args->flags & DRM_XE_VM_CREATE_FLAG_FAULT_MODE))
		return -EINVAL;

	if (XE_IOCTL_DBG(xe, !(args->flags & DRM_XE_VM_CREATE_FLAG_FAULT_MODE) &&
			 args->flags & DRM_XE_VM_CREATE_FLAG_NO_VM_OVERCOMMIT))
		return -EINVAL;

	if (args->flags & DRM_XE_VM_CREATE_FLAG_SCRATCH_PAGE)
		flags |= XE_VM_FLAG_SCRATCH_PAGE;
	if (args->flags & DRM_XE_VM_CREATE_FLAG_LR_MODE)
		flags |= XE_VM_FLAG_LR_MODE;
	if (args->flags & DRM_XE_VM_CREATE_FLAG_FAULT_MODE)
		flags |= XE_VM_FLAG_FAULT_MODE;
	if (args->flags & DRM_XE_VM_CREATE_FLAG_NO_VM_OVERCOMMIT)
		flags |= XE_VM_FLAG_NO_VM_OVERCOMMIT;

	vm = xe_vm_create(xe, flags, xef);
	if (IS_ERR(vm))
		return PTR_ERR(vm);

#if IS_ENABLED(CONFIG_DRM_XE_DEBUG_MEM)
	/* Warning: Security issue - never enable by default */
	args->reserved[0] = xe_bo_main_addr(vm->pt_root[0]->bo, XE_PAGE_SIZE);
#endif

	/* user id alloc must always be last in ioctl to prevent UAF */
	err = xa_alloc(&xef->vm.xa, &id, vm, xa_limit_32b, GFP_KERNEL);
	if (err)
		goto err_close_and_put;

	args->vm_id = id;

	return 0;

err_close_and_put:
	xe_vm_close_and_put(vm);

	return err;
}
2020 
/*
 * IOCTL: remove the user-visible id and close the vm. Refused with -EBUSY
 * while exec queues still count against the vm's preempt fences.
 */
int xe_vm_destroy_ioctl(struct drm_device *dev, void *data,
			struct drm_file *file)
{
	struct xe_device *xe = to_xe_device(dev);
	struct xe_file *xef = to_xe_file(file);
	struct drm_xe_vm_destroy *args = data;
	struct xe_vm *vm;
	int err = 0;

	if (XE_IOCTL_DBG(xe, args->pad) ||
	    XE_IOCTL_DBG(xe, args->reserved[0] || args->reserved[1]))
		return -EINVAL;

	/* Load and erase under the same lock so the id can't be raced. */
	mutex_lock(&xef->vm.lock);
	vm = xa_load(&xef->vm.xa, args->vm_id);
	if (XE_IOCTL_DBG(xe, !vm))
		err = -ENOENT;
	else if (XE_IOCTL_DBG(xe, vm->preempt.num_exec_queues))
		err = -EBUSY;
	else
		xa_erase(&xef->vm.xa, args->vm_id);
	mutex_unlock(&xef->vm.lock);

	if (!err)
		xe_vm_close_and_put(vm);

	return err;
}
2049 
2050 static int xe_vm_query_vmas(struct xe_vm *vm, u64 start, u64 end)
2051 {
2052 	struct drm_gpuva *gpuva;
2053 	u32 num_vmas = 0;
2054 
2055 	lockdep_assert_held(&vm->lock);
2056 	drm_gpuvm_for_each_va_range(gpuva, &vm->gpuvm, start, end)
2057 		num_vmas++;
2058 
2059 	return num_vmas;
2060 }
2061 
2062 static int get_mem_attrs(struct xe_vm *vm, u32 *num_vmas, u64 start,
2063 			 u64 end, struct drm_xe_mem_range_attr *attrs)
2064 {
2065 	struct drm_gpuva *gpuva;
2066 	int i = 0;
2067 
2068 	lockdep_assert_held(&vm->lock);
2069 
2070 	drm_gpuvm_for_each_va_range(gpuva, &vm->gpuvm, start, end) {
2071 		struct xe_vma *vma = gpuva_to_vma(gpuva);
2072 
2073 		if (i == *num_vmas)
2074 			return -ENOSPC;
2075 
2076 		attrs[i].start = xe_vma_start(vma);
2077 		attrs[i].end = xe_vma_end(vma);
2078 		attrs[i].atomic.val = vma->attr.atomic_access;
2079 		attrs[i].pat_index.val = vma->attr.pat_index;
2080 		attrs[i].preferred_mem_loc.devmem_fd = vma->attr.preferred_loc.devmem_fd;
2081 		attrs[i].preferred_mem_loc.migration_policy =
2082 		vma->attr.preferred_loc.migration_policy;
2083 
2084 		i++;
2085 	}
2086 
2087 	*num_vmas = i;
2088 	return 0;
2089 }
2090 
2091 int xe_vm_query_vmas_attrs_ioctl(struct drm_device *dev, void *data, struct drm_file *file)
2092 {
2093 	struct xe_device *xe = to_xe_device(dev);
2094 	struct xe_file *xef = to_xe_file(file);
2095 	struct drm_xe_mem_range_attr *mem_attrs;
2096 	struct drm_xe_vm_query_mem_range_attr *args = data;
2097 	u64 __user *attrs_user = u64_to_user_ptr(args->vector_of_mem_attr);
2098 	struct xe_vm *vm;
2099 	int err = 0;
2100 
2101 	if (XE_IOCTL_DBG(xe,
2102 			 ((args->num_mem_ranges == 0 &&
2103 			  (attrs_user || args->sizeof_mem_range_attr != 0)) ||
2104 			 (args->num_mem_ranges > 0 &&
2105 			  (!attrs_user ||
2106 			   args->sizeof_mem_range_attr !=
2107 			   sizeof(struct drm_xe_mem_range_attr))))))
2108 		return -EINVAL;
2109 
2110 	vm = xe_vm_lookup(xef, args->vm_id);
2111 	if (XE_IOCTL_DBG(xe, !vm))
2112 		return -EINVAL;
2113 
2114 	err = down_read_interruptible(&vm->lock);
2115 	if (err)
2116 		goto put_vm;
2117 
2118 	attrs_user = u64_to_user_ptr(args->vector_of_mem_attr);
2119 
2120 	if (args->num_mem_ranges == 0 && !attrs_user) {
2121 		args->num_mem_ranges = xe_vm_query_vmas(vm, args->start, args->start + args->range);
2122 		args->sizeof_mem_range_attr = sizeof(struct drm_xe_mem_range_attr);
2123 		goto unlock_vm;
2124 	}
2125 
2126 	mem_attrs = kvmalloc_array(args->num_mem_ranges, args->sizeof_mem_range_attr,
2127 				   GFP_KERNEL | __GFP_ACCOUNT |
2128 				   __GFP_RETRY_MAYFAIL | __GFP_NOWARN);
2129 	if (!mem_attrs) {
2130 		err = args->num_mem_ranges > 1 ? -ENOBUFS : -ENOMEM;
2131 		goto unlock_vm;
2132 	}
2133 
2134 	memset(mem_attrs, 0, args->num_mem_ranges * args->sizeof_mem_range_attr);
2135 	err = get_mem_attrs(vm, &args->num_mem_ranges, args->start,
2136 			    args->start + args->range, mem_attrs);
2137 	if (err)
2138 		goto free_mem_attrs;
2139 
2140 	err = copy_to_user(attrs_user, mem_attrs,
2141 			   args->sizeof_mem_range_attr * args->num_mem_ranges);
2142 	if (err)
2143 		err = -EFAULT;
2144 
2145 free_mem_attrs:
2146 	kvfree(mem_attrs);
2147 unlock_vm:
2148 	up_read(&vm->lock);
2149 put_vm:
2150 	xe_vm_put(vm);
2151 	return err;
2152 }
2153 
2154 static bool vma_matches(struct xe_vma *vma, u64 page_addr)
2155 {
2156 	if (page_addr > xe_vma_end(vma) - 1 ||
2157 	    page_addr + SZ_4K - 1 < xe_vma_start(vma))
2158 		return false;
2159 
2160 	return true;
2161 }
2162 
2163 /**
2164  * xe_vm_find_vma_by_addr() - Find a VMA by its address
2165  *
2166  * @vm: the xe_vm the vma belongs to
2167  * @page_addr: address to look up
2168  */
2169 struct xe_vma *xe_vm_find_vma_by_addr(struct xe_vm *vm, u64 page_addr)
2170 {
2171 	struct xe_vma *vma = NULL;
2172 
2173 	if (vm->usm.last_fault_vma) {   /* Fast lookup */
2174 		if (vma_matches(vm->usm.last_fault_vma, page_addr))
2175 			vma = vm->usm.last_fault_vma;
2176 	}
2177 	if (!vma)
2178 		vma = xe_vm_find_overlapping_vma(vm, page_addr, SZ_4K);
2179 
2180 	return vma;
2181 }
2182 
/* Region index (as used by the prefetch bind op) -> TTM placement. */
static const u32 region_to_mem_type[] = {
	XE_PL_TT,
	XE_PL_VRAM0,
	XE_PL_VRAM1,
};
2188 
/*
 * Flag @vma as destroyed under the notifier lock (so concurrent
 * invalidation sees the flag) and, when it was already committed,
 * unlink it from the gpuva tree.
 */
static void prep_vma_destroy(struct xe_vm *vm, struct xe_vma *vma,
			     bool post_commit)
{
	xe_svm_notifier_lock(vm);
	vma->gpuva.flags |= XE_VMA_DESTROYED;
	xe_svm_notifier_unlock(vm);
	if (post_commit)
		xe_vm_remove_vma(vm, vma);
}
2198 
/* Shorthand cast for printing u64 values with %llx. */
#undef ULL
#define ULL	unsigned long long
2201 
/* Debug dump of a single gpuva op; compiles to a no-op without
 * CONFIG_DRM_XE_DEBUG_VM.
 */
#if IS_ENABLED(CONFIG_DRM_XE_DEBUG_VM)
static void print_op(struct xe_device *xe, struct drm_gpuva_op *op)
{
	struct xe_vma *vma;

	switch (op->op) {
	case DRM_GPUVA_OP_MAP:
		vm_dbg(&xe->drm, "MAP: addr=0x%016llx, range=0x%016llx",
		       (ULL)op->map.va.addr, (ULL)op->map.va.range);
		break;
	case DRM_GPUVA_OP_REMAP:
		vma = gpuva_to_vma(op->remap.unmap->va);
		vm_dbg(&xe->drm, "REMAP:UNMAP: addr=0x%016llx, range=0x%016llx, keep=%d",
		       (ULL)xe_vma_start(vma), (ULL)xe_vma_size(vma),
		       op->remap.unmap->keep ? 1 : 0);
		if (op->remap.prev)
			vm_dbg(&xe->drm,
			       "REMAP:PREV: addr=0x%016llx, range=0x%016llx",
			       (ULL)op->remap.prev->va.addr,
			       (ULL)op->remap.prev->va.range);
		if (op->remap.next)
			vm_dbg(&xe->drm,
			       "REMAP:NEXT: addr=0x%016llx, range=0x%016llx",
			       (ULL)op->remap.next->va.addr,
			       (ULL)op->remap.next->va.range);
		break;
	case DRM_GPUVA_OP_UNMAP:
		vma = gpuva_to_vma(op->unmap.va);
		vm_dbg(&xe->drm, "UNMAP: addr=0x%016llx, range=0x%016llx, keep=%d",
		       (ULL)xe_vma_start(vma), (ULL)xe_vma_size(vma),
		       op->unmap.keep ? 1 : 0);
		break;
	case DRM_GPUVA_OP_PREFETCH:
		vma = gpuva_to_vma(op->prefetch.va);
		vm_dbg(&xe->drm, "PREFETCH: addr=0x%016llx, range=0x%016llx",
		       (ULL)xe_vma_start(vma), (ULL)xe_vma_size(vma));
		break;
	default:
		drm_warn(&xe->drm, "NOT POSSIBLE\n");
	}
}
#else
static void print_op(struct xe_device *xe, struct drm_gpuva_op *op)
{
}
#endif
2248 
2249 static bool __xe_vm_needs_clear_scratch_pages(struct xe_vm *vm, u32 bind_flags)
2250 {
2251 	if (!xe_vm_in_fault_mode(vm))
2252 		return false;
2253 
2254 	if (!xe_vm_has_scratch(vm))
2255 		return false;
2256 
2257 	if (bind_flags & DRM_XE_VM_BIND_FLAG_IMMEDIATE)
2258 		return false;
2259 
2260 	return true;
2261 }
2262 
2263 static void xe_svm_prefetch_gpuva_ops_fini(struct drm_gpuva_ops *ops)
2264 {
2265 	struct drm_gpuva_op *__op;
2266 
2267 	drm_gpuva_for_each_op(__op, ops) {
2268 		struct xe_vma_op *op = gpuva_op_to_vma_op(__op);
2269 
2270 		xe_vma_svm_prefetch_op_fini(op);
2271 	}
2272 }
2273 
2274 /*
2275  * Create operations list from IOCTL arguments, setup operations fields so parse
2276  * and commit steps are decoupled from IOCTL arguments. This step can fail.
2277  */
static struct drm_gpuva_ops *
vm_bind_ioctl_ops_create(struct xe_vm *vm, struct xe_vma_ops *vops,
			 struct xe_bo *bo, u64 bo_offset_or_userptr,
			 u64 addr, u64 range,
			 u32 operation, u32 flags,
			 u32 prefetch_region, u16 pat_index)
{
	struct drm_gem_object *obj = bo ? &bo->ttm.base : NULL;
	struct drm_gpuva_ops *ops;
	struct drm_gpuva_op *__op;
	struct drm_gpuvm_bo *vm_bo;
	u64 range_start = addr;
	u64 range_end = addr + range;
	int err;

	lockdep_assert_held_write(&vm->lock);

	vm_dbg(&vm->xe->drm,
	       "op=%d, addr=0x%016llx, range=0x%016llx, bo_offset_or_userptr=0x%016llx",
	       operation, (ULL)addr, (ULL)range,
	       (ULL)bo_offset_or_userptr);

	/* Step 1: let GPUVM build the split/merge op list for the request. */
	switch (operation) {
	case DRM_XE_VM_BIND_OP_MAP:
		if (flags & DRM_XE_VM_BIND_FLAG_CPU_ADDR_MIRROR) {
			/*
			 * CPU-addr-mirror maps may clamp the range to an
			 * existing mirror VMA and are allowed to unmap SVM
			 * mappings they overlap.
			 */
			xe_vm_find_cpu_addr_mirror_vma_range(vm, &range_start, &range_end);
			vops->flags |= XE_VMA_OPS_FLAG_ALLOW_SVM_UNMAP;
		}

		fallthrough;
	case DRM_XE_VM_BIND_OP_MAP_USERPTR: {
		struct drm_gpuvm_map_req map_req = {
			.map.va.addr = range_start,
			.map.va.range = range_end - range_start,
			.map.gem.obj = obj,
			.map.gem.offset = bo_offset_or_userptr,
		};

		ops = drm_gpuvm_sm_map_ops_create(&vm->gpuvm, &map_req);
		break;
	}
	case DRM_XE_VM_BIND_OP_UNMAP:
		ops = drm_gpuvm_sm_unmap_ops_create(&vm->gpuvm, addr, range);
		break;
	case DRM_XE_VM_BIND_OP_PREFETCH:
		ops = drm_gpuvm_prefetch_ops_create(&vm->gpuvm, addr, range);
		break;
	case DRM_XE_VM_BIND_OP_UNMAP_ALL:
		xe_assert(vm->xe, bo);

		/* Building the unmap list walks the BO's gpuvm_bo; needs
		 * the BO resv held. */
		err = xe_bo_lock(bo, true);
		if (err)
			return ERR_PTR(err);

		vm_bo = drm_gpuvm_bo_obtain_locked(&vm->gpuvm, obj);
		if (IS_ERR(vm_bo)) {
			xe_bo_unlock(bo);
			return ERR_CAST(vm_bo);
		}

		ops = drm_gpuvm_bo_unmap_ops_create(vm_bo);
		drm_gpuvm_bo_put(vm_bo);
		xe_bo_unlock(bo);
		break;
	default:
		drm_warn(&vm->xe->drm, "NOT POSSIBLE\n");
		ops = ERR_PTR(-EINVAL);
	}
	if (IS_ERR(ops))
		return ops;

	/* Step 2: annotate each op with xe-specific state from the ioctl
	 * flags; for SVM prefetches, also pre-create the range objects. */
	drm_gpuva_for_each_op(__op, ops) {
		struct xe_vma_op *op = gpuva_op_to_vma_op(__op);

		if (__op->op == DRM_GPUVA_OP_MAP) {
			op->map.immediate =
				flags & DRM_XE_VM_BIND_FLAG_IMMEDIATE;
			if (flags & DRM_XE_VM_BIND_FLAG_READONLY)
				op->map.vma_flags |= XE_VMA_READ_ONLY;
			if (flags & DRM_XE_VM_BIND_FLAG_NULL)
				op->map.vma_flags |= DRM_GPUVA_SPARSE;
			if (flags & DRM_XE_VM_BIND_FLAG_CPU_ADDR_MIRROR)
				op->map.vma_flags |= XE_VMA_SYSTEM_ALLOCATOR;
			if (flags & DRM_XE_VM_BIND_FLAG_DUMPABLE)
				op->map.vma_flags |= XE_VMA_DUMPABLE;
			if (flags & DRM_XE_VM_BIND_FLAG_MADVISE_AUTORESET)
				op->map.vma_flags |= XE_VMA_MADV_AUTORESET;
			op->map.pat_index = pat_index;
			op->map.invalidate_on_bind =
				__xe_vm_needs_clear_scratch_pages(vm, flags);
		} else if (__op->op == DRM_GPUVA_OP_PREFETCH) {
			struct xe_vma *vma = gpuva_to_vma(op->base.prefetch.va);
			struct xe_tile *tile;
			struct xe_svm_range *svm_range;
			struct drm_gpusvm_ctx ctx = {};
			struct drm_pagemap *dpagemap = NULL;
			u8 id, tile_mask = 0;
			u32 i;

			/* Non-mirror VMAs just record the target region. */
			if (!xe_vma_is_cpu_addr_mirror(vma)) {
				op->prefetch.region = prefetch_region;
				break;
			}

			ctx.read_only = xe_vma_read_only(vma);
			ctx.devmem_possible = IS_DGFX(vm->xe) &&
					      IS_ENABLED(CONFIG_DRM_XE_PAGEMAP);

			for_each_tile(tile, vm->xe, id)
				tile_mask |= 0x1 << id;

			xa_init_flags(&op->prefetch_range.range, XA_FLAGS_ALLOC);
			op->prefetch_range.ranges_count = 0;

			/* Resolve the destination pagemap: either follow a
			 * previous madvise preferred-location hint, or the
			 * explicit region's tile-local pagemap. */
			if (prefetch_region == DRM_XE_CONSULT_MEM_ADVISE_PREF_LOC) {
				dpagemap = xe_vma_resolve_pagemap(vma,
								  xe_device_get_root_tile(vm->xe));
			} else if (prefetch_region) {
				tile = &vm->xe->tiles[region_to_mem_type[prefetch_region] -
						      XE_PL_VRAM0];
				dpagemap = xe_tile_local_pagemap(tile);
			}

			op->prefetch_range.dpagemap = dpagemap;
alloc_next_range:
			svm_range = xe_svm_range_find_or_insert(vm, addr, vma, &ctx);

			/* -ENOENT: no CPU mapping at addr; skip forward to
			 * the next CPU VMA, or stop if there is none. */
			if (PTR_ERR(svm_range) == -ENOENT) {
				u64 ret = xe_svm_find_vma_start(vm, addr, range_end, vma);

				addr = ret == ULONG_MAX ? 0 : ret;
				if (addr)
					goto alloc_next_range;
				else
					goto print_op_label;
			}

			if (IS_ERR(svm_range)) {
				err = PTR_ERR(svm_range);
				goto unwind_prefetch_ops;
			}

			/* Already valid for all tiles in the right placement:
			 * nothing to prefetch for this range. */
			if (xe_svm_range_validate(vm, svm_range, tile_mask, dpagemap)) {
				xe_svm_range_debug(svm_range, "PREFETCH - RANGE IS VALID");
				goto check_next_range;
			}

			err = xa_alloc(&op->prefetch_range.range,
				       &i, svm_range, xa_limit_32b,
				       GFP_KERNEL);

			if (err)
				goto unwind_prefetch_ops;

			op->prefetch_range.ranges_count++;
			vops->flags |= XE_VMA_OPS_FLAG_HAS_SVM_PREFETCH;
			xe_svm_range_debug(svm_range, "PREFETCH - RANGE CREATED");
check_next_range:
			/* Keep walking until the requested range or the VMA
			 * is fully covered. */
			if (range_end > xe_svm_range_end(svm_range) &&
			    xe_svm_range_end(svm_range) < xe_vma_end(vma)) {
				addr = xe_svm_range_end(svm_range);
				goto alloc_next_range;
			}
		}
print_op_label:
		print_op(vm->xe, __op);
	}

	return ops;

unwind_prefetch_ops:
	xe_svm_prefetch_gpuva_ops_fini(ops);
	drm_gpuva_ops_free(&vm->gpuvm, ops);
	return ERR_PTR(err);
}
2453 
2454 ALLOW_ERROR_INJECTION(vm_bind_ioctl_ops_create, ERRNO);
2455 
/*
 * Create an xe_vma for a GPUVM map op. For BO-backed VMAs, locks the VM
 * resv (external BOs only) and the BO, creates the VMA, and installs
 * preempt fences on external BOs. Userptr VMAs additionally get their
 * pages pinned. Returns the new VMA or an ERR_PTR.
 */
static struct xe_vma *new_vma(struct xe_vm *vm, struct drm_gpuva_op_map *op,
			      struct xe_vma_mem_attr *attr, unsigned int flags)
{
	struct xe_bo *bo = op->gem.obj ? gem_to_xe_bo(op->gem.obj) : NULL;
	struct xe_validation_ctx ctx;
	struct drm_exec exec;
	struct xe_vma *vma;
	int err = 0;

	lockdep_assert_held_write(&vm->lock);

	if (bo) {
		err = 0;
		/* Retry loop: body re-runs on drm_exec contention. */
		xe_validation_guard(&ctx, &vm->xe->val, &exec,
				    (struct xe_val_flags) {.interruptible = true}, err) {
			/* !bo->vm == external BO: it doesn't share the VM's
			 * resv, so the VM resv must be locked explicitly. */
			if (!bo->vm) {
				err = drm_exec_lock_obj(&exec, xe_vm_obj(vm));
				drm_exec_retry_on_contention(&exec);
			}
			if (!err) {
				err = drm_exec_lock_obj(&exec, &bo->ttm.base);
				drm_exec_retry_on_contention(&exec);
			}
			if (err)
				return ERR_PTR(err);

			vma = xe_vma_create(vm, bo, op->gem.offset,
					    op->va.addr, op->va.addr +
					    op->va.range - 1, attr, flags);
			if (IS_ERR(vma))
				return vma;

			/* External BOs need the VM's preempt fences attached;
			 * on failure undo the VMA created just above. */
			if (!bo->vm) {
				err = add_preempt_fences(vm, bo);
				if (err) {
					prep_vma_destroy(vm, vma, false);
					xe_vma_destroy(vma, NULL);
				}
			}
		}
		if (err)
			return ERR_PTR(err);
	} else {
		vma = xe_vma_create(vm, NULL, op->gem.offset,
				    op->va.addr, op->va.addr +
				    op->va.range - 1, attr, flags);
		if (IS_ERR(vma))
			return vma;

		if (xe_vma_is_userptr(vma)) {
			err = xe_vma_userptr_pin_pages(to_userptr_vma(vma));
			/*
			 * -EBUSY has dedicated meaning that a user fence
			 * attached to the VMA is busy, in practice
			 * xe_vma_userptr_pin_pages can only fail with -EBUSY if
			 * we are low on memory so convert this to -ENOMEM.
			 */
			if (err == -EBUSY)
				err = -ENOMEM;
		}
	}
	if (err) {
		prep_vma_destroy(vm, vma, false);
		xe_vma_destroy_unlocked(vma);
		vma = ERR_PTR(err);
	}

	return vma;
}
2525 
2526 static u64 xe_vma_max_pte_size(struct xe_vma *vma)
2527 {
2528 	if (vma->gpuva.flags & XE_VMA_PTE_1G)
2529 		return SZ_1G;
2530 	else if (vma->gpuva.flags & (XE_VMA_PTE_2M | XE_VMA_PTE_COMPACT))
2531 		return SZ_2M;
2532 	else if (vma->gpuva.flags & XE_VMA_PTE_64K)
2533 		return SZ_64K;
2534 	else if (vma->gpuva.flags & XE_VMA_PTE_4K)
2535 		return SZ_4K;
2536 
2537 	return SZ_1G;	/* Uninitialized, used max size */
2538 }
2539 
2540 static void xe_vma_set_pte_size(struct xe_vma *vma, u64 size)
2541 {
2542 	switch (size) {
2543 	case SZ_1G:
2544 		vma->gpuva.flags |= XE_VMA_PTE_1G;
2545 		break;
2546 	case SZ_2M:
2547 		vma->gpuva.flags |= XE_VMA_PTE_2M;
2548 		break;
2549 	case SZ_64K:
2550 		vma->gpuva.flags |= XE_VMA_PTE_64K;
2551 		break;
2552 	case SZ_4K:
2553 		vma->gpuva.flags |= XE_VMA_PTE_4K;
2554 		break;
2555 	}
2556 }
2557 
/*
 * Commit a parsed VMA op into the VM's VA tree: insert new VMAs, mark
 * unmapped ones for destruction, and set the XE_VMA_OP_*COMMITTED flags
 * so a failure later can be unwound precisely. Returns 0 or a negative
 * errno (insertion failures are OR-ed together).
 */
static int xe_vma_op_commit(struct xe_vm *vm, struct xe_vma_op *op)
{
	int err = 0;

	lockdep_assert_held_write(&vm->lock);

	switch (op->base.op) {
	case DRM_GPUVA_OP_MAP:
		err |= xe_vm_insert_vma(vm, op->map.vma);
		if (!err)
			op->flags |= XE_VMA_OP_COMMITTED;
		break;
	case DRM_GPUVA_OP_REMAP:
	{
		/* Capture before prep_vma_destroy so skipped prev/next VMAs
		 * can inherit the old VMA's tile_present below. */
		u8 tile_present =
			gpuva_to_vma(op->base.remap.unmap->va)->tile_present;

		prep_vma_destroy(vm, gpuva_to_vma(op->base.remap.unmap->va),
				 true);
		op->flags |= XE_VMA_OP_COMMITTED;

		if (op->remap.prev) {
			err |= xe_vm_insert_vma(vm, op->remap.prev);
			if (!err)
				op->flags |= XE_VMA_OP_PREV_COMMITTED;
			/* skip_prev: no rebind needed; clear the pointer so
			 * later stages don't touch it. */
			if (!err && op->remap.skip_prev) {
				op->remap.prev->tile_present =
					tile_present;
				op->remap.prev = NULL;
			}
		}
		if (op->remap.next) {
			err |= xe_vm_insert_vma(vm, op->remap.next);
			if (!err)
				op->flags |= XE_VMA_OP_NEXT_COMMITTED;
			if (!err && op->remap.skip_next) {
				op->remap.next->tile_present =
					tile_present;
				op->remap.next = NULL;
			}
		}

		/* Adjust for partial unbind after removing VMA from VM */
		if (!err) {
			op->base.remap.unmap->va->va.addr = op->remap.start;
			op->base.remap.unmap->va->va.range = op->remap.range;
		}
		break;
	}
	case DRM_GPUVA_OP_UNMAP:
		prep_vma_destroy(vm, gpuva_to_vma(op->base.unmap.va), true);
		op->flags |= XE_VMA_OP_COMMITTED;
		break;
	case DRM_GPUVA_OP_PREFETCH:
		op->flags |= XE_VMA_OP_COMMITTED;
		break;
	default:
		drm_warn(&vm->xe->drm, "NOT POSSIBLE\n");
	}

	return err;
}
2620 
2621 /**
2622  * xe_vma_has_default_mem_attrs - Check if a VMA has default memory attributes
2623  * @vma: Pointer to the xe_vma structure to check
2624  *
2625  * This function determines whether the given VMA (Virtual Memory Area)
2626  * has its memory attributes set to their default values. Specifically,
2627  * it checks the following conditions:
2628  *
2629  * - `atomic_access` is `DRM_XE_VMA_ATOMIC_UNDEFINED`
2630  * - `pat_index` is equal to `default_pat_index`
2631  * - `preferred_loc.devmem_fd` is `DRM_XE_PREFERRED_LOC_DEFAULT_DEVICE`
2632  * - `preferred_loc.migration_policy` is `DRM_XE_MIGRATE_ALL_PAGES`
2633  *
2634  * Return: true if all attributes are at their default values, false otherwise.
2635  */
2636 bool xe_vma_has_default_mem_attrs(struct xe_vma *vma)
2637 {
2638 	return (vma->attr.atomic_access == DRM_XE_ATOMIC_UNDEFINED &&
2639 		vma->attr.pat_index ==  vma->attr.default_pat_index &&
2640 		vma->attr.preferred_loc.devmem_fd == DRM_XE_PREFERRED_LOC_DEFAULT_DEVICE &&
2641 		vma->attr.preferred_loc.migration_policy == DRM_XE_MIGRATE_ALL_PAGES);
2642 }
2643 
/*
 * Parse a GPUVM op list into xe_vma_ops: create the new VMAs, link each op
 * into @vops, count the page-table updates needed per tile, and commit
 * each op into the VM's VA tree. Runs under the VM lock held for write.
 * Returns 0 or a negative errno; on error the caller is expected to unwind
 * via vm_bind_ioctl_ops_unwind().
 */
static int vm_bind_ioctl_ops_parse(struct xe_vm *vm, struct drm_gpuva_ops *ops,
				   struct xe_vma_ops *vops)
{
	struct xe_device *xe = vm->xe;
	struct drm_gpuva_op *__op;
	struct xe_tile *tile;
	u8 id, tile_mask = 0;
	int err = 0;

	lockdep_assert_held_write(&vm->lock);

	for_each_tile(tile, vm->xe, id)
		tile_mask |= 0x1 << id;

	drm_gpuva_for_each_op(__op, ops) {
		struct xe_vma_op *op = gpuva_op_to_vma_op(__op);
		struct xe_vma *vma;
		unsigned int flags = 0;

		INIT_LIST_HEAD(&op->link);
		list_add_tail(&op->link, &vops->list);
		op->tile_mask = tile_mask;

		switch (op->base.op) {
		case DRM_GPUVA_OP_MAP:
		{
			/* Fresh maps start from default memory attributes,
			 * seeded with the ioctl-supplied PAT index. */
			struct xe_vma_mem_attr default_attr = {
				.preferred_loc = {
					.devmem_fd = DRM_XE_PREFERRED_LOC_DEFAULT_DEVICE,
					.migration_policy = DRM_XE_MIGRATE_ALL_PAGES,
				},
				.atomic_access = DRM_XE_ATOMIC_UNDEFINED,
				.default_pat_index = op->map.pat_index,
				.pat_index = op->map.pat_index,
			};

			flags |= op->map.vma_flags & XE_VMA_CREATE_MASK;

			vma = new_vma(vm, &op->base.map, &default_attr,
				      flags);
			if (IS_ERR(vma))
				return PTR_ERR(vma);

			op->map.vma = vma;
			/* Count a PT update unless this is a deferred bind of
			 * a system-allocator VMA (bound on fault instead). */
			if (((op->map.immediate || !xe_vm_in_fault_mode(vm)) &&
			     !(op->map.vma_flags & XE_VMA_SYSTEM_ALLOCATOR)) ||
			    op->map.invalidate_on_bind)
				xe_vma_ops_incr_pt_update_ops(vops,
							      op->tile_mask, 1);
			break;
		}
		case DRM_GPUVA_OP_REMAP:
		{
			struct xe_vma *old =
				gpuva_to_vma(op->base.remap.unmap->va);
			bool skip = xe_vma_is_cpu_addr_mirror(old);
			u64 start = xe_vma_start(old), end = xe_vma_end(old);
			int num_remap_ops = 0;

			/* Narrow [start, end) to just the region actually
			 * being unmapped (excluding kept prev/next parts). */
			if (op->base.remap.prev)
				start = op->base.remap.prev->va.addr +
					op->base.remap.prev->va.range;
			if (op->base.remap.next)
				end = op->base.remap.next->va.addr;

			if (xe_vma_is_cpu_addr_mirror(old) &&
			    xe_svm_has_mapping(vm, start, end)) {
				if (vops->flags & XE_VMA_OPS_FLAG_MADVISE)
					xe_svm_unmap_address_range(vm, start, end);
				else
					return -EBUSY;
			}

			op->remap.start = xe_vma_start(old);
			op->remap.range = xe_vma_size(old);

			/* New prev/next VMAs inherit the old VMA's creation
			 * flags and memory attributes. */
			flags |= op->base.remap.unmap->va->flags & XE_VMA_CREATE_MASK;
			if (op->base.remap.prev) {
				vma = new_vma(vm, op->base.remap.prev,
					      &old->attr, flags);
				if (IS_ERR(vma))
					return PTR_ERR(vma);

				op->remap.prev = vma;

				/*
				 * Userptr creates a new SG mapping so
				 * we must also rebind.
				 */
				op->remap.skip_prev = skip ||
					(!xe_vma_is_userptr(old) &&
					IS_ALIGNED(xe_vma_end(vma),
						   xe_vma_max_pte_size(old)));
				if (op->remap.skip_prev) {
					xe_vma_set_pte_size(vma, xe_vma_max_pte_size(old));
					op->remap.range -=
						xe_vma_end(vma) -
						xe_vma_start(old);
					op->remap.start = xe_vma_end(vma);
					vm_dbg(&xe->drm, "REMAP:SKIP_PREV: addr=0x%016llx, range=0x%016llx",
					       (ULL)op->remap.start,
					       (ULL)op->remap.range);
				} else {
					num_remap_ops++;
				}
			}

			if (op->base.remap.next) {
				vma = new_vma(vm, op->base.remap.next,
					      &old->attr, flags);
				if (IS_ERR(vma))
					return PTR_ERR(vma);

				op->remap.next = vma;

				/*
				 * Userptr creates a new SG mapping so
				 * we must also rebind.
				 */
				op->remap.skip_next = skip ||
					(!xe_vma_is_userptr(old) &&
					IS_ALIGNED(xe_vma_start(vma),
						   xe_vma_max_pte_size(old)));
				if (op->remap.skip_next) {
					xe_vma_set_pte_size(vma, xe_vma_max_pte_size(old));
					op->remap.range -=
						xe_vma_end(old) -
						xe_vma_start(vma);
					vm_dbg(&xe->drm, "REMAP:SKIP_NEXT: addr=0x%016llx, range=0x%016llx",
					       (ULL)op->remap.start,
					       (ULL)op->remap.range);
				} else {
					num_remap_ops++;
				}
			}
			/* The unmap of the old VMA itself, unless it is a
			 * CPU-addr-mirror VMA (no GPU PTEs to touch here). */
			if (!skip)
				num_remap_ops++;

			xe_vma_ops_incr_pt_update_ops(vops, op->tile_mask, num_remap_ops);
			break;
		}
		case DRM_GPUVA_OP_UNMAP:
			vma = gpuva_to_vma(op->base.unmap.va);

			if (xe_vma_is_cpu_addr_mirror(vma) &&
			    xe_svm_has_mapping(vm, xe_vma_start(vma),
					       xe_vma_end(vma)) &&
			    !(vops->flags & XE_VMA_OPS_FLAG_ALLOW_SVM_UNMAP))
				return -EBUSY;

			if (!xe_vma_is_cpu_addr_mirror(vma))
				xe_vma_ops_incr_pt_update_ops(vops, op->tile_mask, 1);
			break;
		case DRM_GPUVA_OP_PREFETCH:
			vma = gpuva_to_vma(op->base.prefetch.va);

			if (xe_vma_is_userptr(vma)) {
				err = xe_vma_userptr_pin_pages(to_userptr_vma(vma));
				if (err)
					return err;
			}

			/* SVM prefetches need one PT update per range created
			 * in vm_bind_ioctl_ops_create(). */
			if (xe_vma_is_cpu_addr_mirror(vma))
				xe_vma_ops_incr_pt_update_ops(vops, op->tile_mask,
							      op->prefetch_range.ranges_count);
			else
				xe_vma_ops_incr_pt_update_ops(vops, op->tile_mask, 1);

			break;
		default:
			drm_warn(&vm->xe->drm, "NOT POSSIBLE\n");
		}

		err = xe_vma_op_commit(vm, op);
		if (err)
			return err;
	}

	return 0;
}
2824 
/*
 * Undo a single committed (or partially committed) VMA op: destroy VMAs
 * created for map/remap and re-insert VMAs that were removed. The
 * *post_commit flags say how far xe_vma_op_commit() got for this op.
 */
static void xe_vma_op_unwind(struct xe_vm *vm, struct xe_vma_op *op,
			     bool post_commit, bool prev_post_commit,
			     bool next_post_commit)
{
	lockdep_assert_held_write(&vm->lock);

	switch (op->base.op) {
	case DRM_GPUVA_OP_MAP:
		if (op->map.vma) {
			prep_vma_destroy(vm, op->map.vma, post_commit);
			xe_vma_destroy_unlocked(op->map.vma);
		}
		break;
	case DRM_GPUVA_OP_UNMAP:
	{
		struct xe_vma *vma = gpuva_to_vma(op->base.unmap.va);

		if (vma) {
			/* Clearing XE_VMA_DESTROYED races with the SVM
			 * notifier, hence the notifier lock. */
			xe_svm_notifier_lock(vm);
			vma->gpuva.flags &= ~XE_VMA_DESTROYED;
			xe_svm_notifier_unlock(vm);
			if (post_commit)
				xe_vm_insert_vma(vm, vma);
		}
		break;
	}
	case DRM_GPUVA_OP_REMAP:
	{
		struct xe_vma *vma = gpuva_to_vma(op->base.remap.unmap->va);

		/* Tear down the prev/next VMAs created for the remap ... */
		if (op->remap.prev) {
			prep_vma_destroy(vm, op->remap.prev, prev_post_commit);
			xe_vma_destroy_unlocked(op->remap.prev);
		}
		if (op->remap.next) {
			prep_vma_destroy(vm, op->remap.next, next_post_commit);
			xe_vma_destroy_unlocked(op->remap.next);
		}
		/* ... and resurrect the VMA the remap was splitting. */
		if (vma) {
			xe_svm_notifier_lock(vm);
			vma->gpuva.flags &= ~XE_VMA_DESTROYED;
			xe_svm_notifier_unlock(vm);
			if (post_commit)
				xe_vm_insert_vma(vm, vma);
		}
		break;
	}
	case DRM_GPUVA_OP_PREFETCH:
		/* Nothing to do */
		break;
	default:
		drm_warn(&vm->xe->drm, "NOT POSSIBLE\n");
	}
}
2879 
2880 static void vm_bind_ioctl_ops_unwind(struct xe_vm *vm,
2881 				     struct drm_gpuva_ops **ops,
2882 				     int num_ops_list)
2883 {
2884 	int i;
2885 
2886 	for (i = num_ops_list - 1; i >= 0; --i) {
2887 		struct drm_gpuva_ops *__ops = ops[i];
2888 		struct drm_gpuva_op *__op;
2889 
2890 		if (!__ops)
2891 			continue;
2892 
2893 		drm_gpuva_for_each_op_reverse(__op, __ops) {
2894 			struct xe_vma_op *op = gpuva_op_to_vma_op(__op);
2895 
2896 			xe_vma_op_unwind(vm, op,
2897 					 op->flags & XE_VMA_OP_COMMITTED,
2898 					 op->flags & XE_VMA_OP_PREV_COMMITTED,
2899 					 op->flags & XE_VMA_OP_NEXT_COMMITTED);
2900 		}
2901 	}
2902 }
2903 
2904 static int vma_lock_and_validate(struct drm_exec *exec, struct xe_vma *vma,
2905 				 bool res_evict, bool validate)
2906 {
2907 	struct xe_bo *bo = xe_vma_bo(vma);
2908 	struct xe_vm *vm = xe_vma_vm(vma);
2909 	int err = 0;
2910 
2911 	if (bo) {
2912 		if (!bo->vm)
2913 			err = drm_exec_lock_obj(exec, &bo->ttm.base);
2914 		if (!err && validate)
2915 			err = xe_bo_validate(bo, vm,
2916 					     xe_vm_allow_vm_eviction(vm) &&
2917 					     res_evict, exec);
2918 	}
2919 
2920 	return err;
2921 }
2922 
2923 static int check_ufence(struct xe_vma *vma)
2924 {
2925 	if (vma->ufence) {
2926 		struct xe_user_fence * const f = vma->ufence;
2927 
2928 		if (!xe_sync_ufence_get_status(f))
2929 			return -EBUSY;
2930 
2931 		vma->ufence = NULL;
2932 		xe_sync_ufence_put(f);
2933 	}
2934 
2935 	return 0;
2936 }
2937 
/*
 * Migrate and populate pages for every SVM range attached to a prefetch
 * op. No-op for non-CPU-addr-mirror VMAs. Returns 0, -ENODATA when the
 * user should retry (e.g. VRAM allocation failure), or a negative errno.
 */
static int prefetch_ranges(struct xe_vm *vm, struct xe_vma_op *op)
{
	bool devmem_possible = IS_DGFX(vm->xe) && IS_ENABLED(CONFIG_DRM_XE_PAGEMAP);
	struct xe_vma *vma = gpuva_to_vma(op->base.prefetch.va);
	struct drm_pagemap *dpagemap = op->prefetch_range.dpagemap;
	int err = 0;

	struct xe_svm_range *svm_range;
	struct drm_gpusvm_ctx ctx = {};
	unsigned long i;

	if (!xe_vma_is_cpu_addr_mirror(vma))
		return 0;

	ctx.read_only = xe_vma_read_only(vma);
	ctx.devmem_possible = devmem_possible;
	ctx.check_pages_threshold = devmem_possible ? SZ_64K : 0;
	ctx.device_private_page_owner = xe_svm_private_page_owner(vm, !dpagemap);

	/* TODO: Threading the migration */
	xa_for_each(&op->prefetch_range.range, i, svm_range) {
		/* No destination pagemap: prefetch target is system memory. */
		if (!dpagemap)
			xe_svm_range_migrate_to_smem(vm, svm_range);

		if (IS_ENABLED(CONFIG_DRM_XE_DEBUG_VM)) {
			drm_dbg(&vm->xe->drm,
				"Prefetch pagemap is %s start 0x%016lx end 0x%016lx\n",
				dpagemap ? dpagemap->drm->unique : "system",
				xe_svm_range_start(svm_range), xe_svm_range_end(svm_range));
		}

		if (xe_svm_range_needs_migrate_to_vram(svm_range, vma, dpagemap)) {
			err = xe_svm_alloc_vram(svm_range, &ctx, dpagemap);
			if (err) {
				/* -ENODATA tells userspace to retry. */
				drm_dbg(&vm->xe->drm, "VRAM allocation failed, retry from userspace, asid=%u, gpusvm=%p, errno=%pe\n",
					vm->usm.asid, &vm->svm.gpusvm, ERR_PTR(err));
				return -ENODATA;
			}
			xe_svm_range_debug(svm_range, "PREFETCH - RANGE MIGRATED TO VRAM");
		}

		err = xe_svm_range_get_pages(vm, svm_range, &ctx);
		if (err) {
			drm_dbg(&vm->xe->drm, "Get pages failed, asid=%u, gpusvm=%p, errno=%pe\n",
				vm->usm.asid, &vm->svm.gpusvm, ERR_PTR(err));
			/* Map unsupported/fault/permission errors to the
			 * retryable -ENODATA as well. */
			if (err == -EOPNOTSUPP || err == -EFAULT || err == -EPERM)
				err = -ENODATA;
			return err;
		}
		xe_svm_range_debug(svm_range, "PREFETCH - RANGE GET PAGES DONE");
	}

	return err;
}
2992 
/*
 * Lock and validate all objects touched by one VMA op, as part of the
 * drm_exec transaction for a bind. For prefetches of BO-backed VMAs this
 * also migrates the BO to the requested memory region.
 */
static int op_lock_and_prep(struct drm_exec *exec, struct xe_vm *vm,
			    struct xe_vma_ops *vops, struct xe_vma_op *op)
{
	int err = 0;
	bool res_evict;

	/*
	 * We only allow evicting a BO within the VM if it is not part of an
	 * array of binds, as an array of binds can evict another BO within the
	 * bind.
	 */
	res_evict = !(vops->flags & XE_VMA_OPS_ARRAY_OF_BINDS);

	switch (op->base.op) {
	case DRM_GPUVA_OP_MAP:
		if (!op->map.invalidate_on_bind)
			err = vma_lock_and_validate(exec, op->map.vma,
						    res_evict,
						    !xe_vm_in_fault_mode(vm) ||
						    op->map.immediate);
		break;
	case DRM_GPUVA_OP_REMAP:
		/* The VMA being unmapped must not have a pending ufence. */
		err = check_ufence(gpuva_to_vma(op->base.remap.unmap->va));
		if (err)
			break;

		err = vma_lock_and_validate(exec,
					    gpuva_to_vma(op->base.remap.unmap->va),
					    res_evict, false);
		if (!err && op->remap.prev)
			err = vma_lock_and_validate(exec, op->remap.prev,
						    res_evict, true);
		if (!err && op->remap.next)
			err = vma_lock_and_validate(exec, op->remap.next,
						    res_evict, true);
		break;
	case DRM_GPUVA_OP_UNMAP:
		err = check_ufence(gpuva_to_vma(op->base.unmap.va));
		if (err)
			break;

		err = vma_lock_and_validate(exec,
					    gpuva_to_vma(op->base.unmap.va),
					    res_evict, false);
		break;
	case DRM_GPUVA_OP_PREFETCH:
	{
		struct xe_vma *vma = gpuva_to_vma(op->base.prefetch.va);
		u32 region;

		/*
		 * NOTE(review): region stays uninitialized on the
		 * cpu-addr-mirror path; its use below is guarded by
		 * !xe_vma_has_no_bo(vma) — confirm mirror VMAs never have a
		 * BO. Also, the assert uses '<=' against
		 * ARRAY_SIZE(region_to_mem_type), which permits an index one
		 * past the array — verify against ioctl-level validation.
		 */
		if (!xe_vma_is_cpu_addr_mirror(vma)) {
			region = op->prefetch.region;
			xe_assert(vm->xe, region == DRM_XE_CONSULT_MEM_ADVISE_PREF_LOC ||
				  region <= ARRAY_SIZE(region_to_mem_type));
		}

		err = vma_lock_and_validate(exec,
					    gpuva_to_vma(op->base.prefetch.va),
					    res_evict, false);
		if (!err && !xe_vma_has_no_bo(vma))
			err = xe_bo_migrate(xe_vma_bo(vma),
					    region_to_mem_type[region],
					    NULL,
					    exec);
		break;
	}
	default:
		drm_warn(&vm->xe->drm, "NOT POSSIBLE\n");
	}

	return err;
}
3065 
3066 static int vm_bind_ioctl_ops_prefetch_ranges(struct xe_vm *vm, struct xe_vma_ops *vops)
3067 {
3068 	struct xe_vma_op *op;
3069 	int err;
3070 
3071 	if (!(vops->flags & XE_VMA_OPS_FLAG_HAS_SVM_PREFETCH))
3072 		return 0;
3073 
3074 	list_for_each_entry(op, &vops->list, link) {
3075 		if (op->base.op  == DRM_GPUVA_OP_PREFETCH) {
3076 			err = prefetch_ranges(vm, op);
3077 			if (err)
3078 				return err;
3079 		}
3080 	}
3081 
3082 	return 0;
3083 }
3084 
3085 static int vm_bind_ioctl_ops_lock_and_prep(struct drm_exec *exec,
3086 					   struct xe_vm *vm,
3087 					   struct xe_vma_ops *vops)
3088 {
3089 	struct xe_vma_op *op;
3090 	int err;
3091 
3092 	err = drm_exec_lock_obj(exec, xe_vm_obj(vm));
3093 	if (err)
3094 		return err;
3095 
3096 	list_for_each_entry(op, &vops->list, link) {
3097 		err = op_lock_and_prep(exec, vm, vops, op);
3098 		if (err)
3099 			return err;
3100 	}
3101 
3102 #ifdef TEST_VM_OPS_ERROR
3103 	if (vops->inject_error &&
3104 	    vm->xe->vm_inject_error_position == FORCE_OP_ERROR_LOCK)
3105 		return -ENOSPC;
3106 #endif
3107 
3108 	return 0;
3109 }
3110 
3111 static void op_trace(struct xe_vma_op *op)
3112 {
3113 	switch (op->base.op) {
3114 	case DRM_GPUVA_OP_MAP:
3115 		trace_xe_vma_bind(op->map.vma);
3116 		break;
3117 	case DRM_GPUVA_OP_REMAP:
3118 		trace_xe_vma_unbind(gpuva_to_vma(op->base.remap.unmap->va));
3119 		if (op->remap.prev)
3120 			trace_xe_vma_bind(op->remap.prev);
3121 		if (op->remap.next)
3122 			trace_xe_vma_bind(op->remap.next);
3123 		break;
3124 	case DRM_GPUVA_OP_UNMAP:
3125 		trace_xe_vma_unbind(gpuva_to_vma(op->base.unmap.va));
3126 		break;
3127 	case DRM_GPUVA_OP_PREFETCH:
3128 		trace_xe_vma_bind(gpuva_to_vma(op->base.prefetch.va));
3129 		break;
3130 	case DRM_GPUVA_OP_DRIVER:
3131 		break;
3132 	default:
3133 		XE_WARN_ON("NOT POSSIBLE");
3134 	}
3135 }
3136 
3137 static void trace_xe_vm_ops_execute(struct xe_vma_ops *vops)
3138 {
3139 	struct xe_vma_op *op;
3140 
3141 	list_for_each_entry(op, &vops->list, link)
3142 		op_trace(op);
3143 }
3144 
3145 static int vm_ops_setup_tile_args(struct xe_vm *vm, struct xe_vma_ops *vops)
3146 {
3147 	struct xe_exec_queue *q = vops->q;
3148 	struct xe_tile *tile;
3149 	int number_tiles = 0;
3150 	u8 id;
3151 
3152 	for_each_tile(tile, vm->xe, id) {
3153 		if (vops->pt_update_ops[id].num_ops)
3154 			++number_tiles;
3155 
3156 		if (vops->pt_update_ops[id].q)
3157 			continue;
3158 
3159 		if (q) {
3160 			vops->pt_update_ops[id].q = q;
3161 			if (vm->pt_root[id] && !list_empty(&q->multi_gt_list))
3162 				q = list_next_entry(q, multi_gt_list);
3163 		} else {
3164 			vops->pt_update_ops[id].q = vm->q[id];
3165 		}
3166 	}
3167 
3168 	return number_tiles;
3169 }
3170 
/*
 * Execute the prepared PT updates on every active tile and return a single
 * dma_fence_array combining the per-tile bind fences and (unless skipped)
 * the per-tile TLB-invalidation fences. Returns ERR_PTR(-ENODATA) when no
 * tile has any work, or another ERR_PTR on failure (with all per-tile
 * prepare work aborted).
 */
static struct dma_fence *ops_execute(struct xe_vm *vm,
				     struct xe_vma_ops *vops)
{
	struct xe_tile *tile;
	struct dma_fence *fence = NULL;
	struct dma_fence **fences = NULL;
	struct dma_fence_array *cf = NULL;
	int number_tiles = 0, current_fence = 0, n_fence = 0, err, i;
	u8 id;

	number_tiles = vm_ops_setup_tile_args(vm, vops);
	if (number_tiles == 0)
		return ERR_PTR(-ENODATA);

	/* Fence count: one per tile, plus one per TLB-invalidation slot per
	 * tile unless TLB waits are skipped. Counts every tile; idle tiles
	 * contribute stub fences below. */
	for_each_tile(tile, vm->xe, id) {
		++n_fence;

		if (!(vops->flags & XE_VMA_OPS_FLAG_SKIP_TLB_WAIT))
			for_each_tlb_inval(i)
				++n_fence;
	}

	fences = kmalloc_objs(*fences, n_fence);
	if (!fences) {
		fence = ERR_PTR(-ENOMEM);
		goto err_trace;
	}

	cf = dma_fence_array_alloc(n_fence);
	if (!cf) {
		fence = ERR_PTR(-ENOMEM);
		goto err_out;
	}

	/* Phase 1: prepare all tiles; any failure aborts everything. */
	for_each_tile(tile, vm->xe, id) {
		if (!vops->pt_update_ops[id].num_ops)
			continue;

		err = xe_pt_update_ops_prepare(tile, vops);
		if (err) {
			fence = ERR_PTR(err);
			goto err_out;
		}
	}

	trace_xe_vm_ops_execute(vops);

	/* Phase 2: run per-tile updates and collect the resulting fences. */
	for_each_tile(tile, vm->xe, id) {
		struct xe_exec_queue *q = vops->pt_update_ops[tile->id].q;

		fence = NULL;
		if (!vops->pt_update_ops[id].num_ops)
			goto collect_fences;

		fence = xe_pt_update_ops_run(tile, vops);
		if (IS_ERR(fence))
			goto err_out;

collect_fences:
		/* Idle tiles contribute a pre-signaled stub fence. */
		fences[current_fence++] = fence ?: dma_fence_get_stub();
		if (vops->flags & XE_VMA_OPS_FLAG_SKIP_TLB_WAIT)
			continue;

		xe_migrate_job_lock(tile->migrate, q);
		for_each_tlb_inval(i)
			fences[current_fence++] =
				xe_exec_queue_tlb_inval_last_fence_get(q, vm, i);
		xe_migrate_job_unlock(tile->migrate, q);
	}

	xe_assert(vm->xe, current_fence == n_fence);
	/* The array takes ownership of @fences. */
	dma_fence_array_init(cf, n_fence, fences, dma_fence_context_alloc(1),
			     1, false);
	fence = &cf->base;

	for_each_tile(tile, vm->xe, id) {
		if (!vops->pt_update_ops[id].num_ops)
			continue;

		xe_pt_update_ops_fini(tile, vops);
	}

	return fence;

err_out:
	for_each_tile(tile, vm->xe, id) {
		if (!vops->pt_update_ops[id].num_ops)
			continue;

		xe_pt_update_ops_abort(tile, vops);
	}
	while (current_fence)
		dma_fence_put(fences[--current_fence]);
	kfree(fences);
	kfree(cf);

err_trace:
	trace_xe_vm_ops_fail(vm);
	return fence;
}
3271 
3272 static void vma_add_ufence(struct xe_vma *vma, struct xe_user_fence *ufence)
3273 {
3274 	if (vma->ufence)
3275 		xe_sync_ufence_put(vma->ufence);
3276 	vma->ufence = __xe_sync_ufence_get(ufence);
3277 }
3278 
3279 static void op_add_ufence(struct xe_vm *vm, struct xe_vma_op *op,
3280 			  struct xe_user_fence *ufence)
3281 {
3282 	switch (op->base.op) {
3283 	case DRM_GPUVA_OP_MAP:
3284 		if (!xe_vma_is_cpu_addr_mirror(op->map.vma))
3285 			vma_add_ufence(op->map.vma, ufence);
3286 		break;
3287 	case DRM_GPUVA_OP_REMAP:
3288 		if (op->remap.prev)
3289 			vma_add_ufence(op->remap.prev, ufence);
3290 		if (op->remap.next)
3291 			vma_add_ufence(op->remap.next, ufence);
3292 		break;
3293 	case DRM_GPUVA_OP_UNMAP:
3294 		break;
3295 	case DRM_GPUVA_OP_PREFETCH:
3296 		vma_add_ufence(gpuva_to_vma(op->base.prefetch.va), ufence);
3297 		break;
3298 	default:
3299 		drm_warn(&vm->xe->drm, "NOT POSSIBLE\n");
3300 	}
3301 }
3302 
3303 static void vm_bind_ioctl_ops_fini(struct xe_vm *vm, struct xe_vma_ops *vops,
3304 				   struct dma_fence *fence)
3305 {
3306 	struct xe_user_fence *ufence;
3307 	struct xe_vma_op *op;
3308 	int i;
3309 
3310 	ufence = find_ufence_get(vops->syncs, vops->num_syncs);
3311 	list_for_each_entry(op, &vops->list, link) {
3312 		if (ufence)
3313 			op_add_ufence(vm, op, ufence);
3314 
3315 		if (op->base.op == DRM_GPUVA_OP_UNMAP)
3316 			xe_vma_destroy(gpuva_to_vma(op->base.unmap.va), fence);
3317 		else if (op->base.op == DRM_GPUVA_OP_REMAP)
3318 			xe_vma_destroy(gpuva_to_vma(op->base.remap.unmap->va),
3319 				       fence);
3320 	}
3321 	if (ufence)
3322 		xe_sync_ufence_put(ufence);
3323 	if (fence) {
3324 		for (i = 0; i < vops->num_syncs; i++)
3325 			xe_sync_entry_signal(vops->syncs + i, fence);
3326 	}
3327 }
3328 
/*
 * Lock, prepare and execute a fully-built set of VMA ops.
 *
 * Runs inside an xe_validation_guard transaction so that both drm_exec
 * contention and validation OOM cause the whole body to be retried.
 * Returns the fence tracking completion of the ops, or an ERR_PTR.
 * On -ENODATA from ops_execute (nothing to do), syncs are still
 * finalized via vm_bind_ioctl_ops_fini(vm, vops, NULL).
 */
static struct dma_fence *vm_bind_ioctl_ops_execute(struct xe_vm *vm,
						   struct xe_vma_ops *vops)
{
	struct xe_validation_ctx ctx;
	struct drm_exec exec;
	struct dma_fence *fence;
	int err = 0;

	lockdep_assert_held_write(&vm->lock);

	xe_validation_guard(&ctx, &vm->xe->val, &exec,
			    ((struct xe_val_flags) {
				    .interruptible = true,
				    .exec_ignore_duplicates = true,
			    }), err) {
		err = vm_bind_ioctl_ops_lock_and_prep(&exec, vm, vops);
		/* Both retry helpers restart the guard body from the top. */
		drm_exec_retry_on_contention(&exec);
		xe_validation_retry_on_oom(&ctx, &err);
		if (err)
			return ERR_PTR(err);

		/* Publish the exec context so BO validation inside ops_execute can use it. */
		xe_vm_set_validation_exec(vm, &exec);
		fence = ops_execute(vm, vops);
		xe_vm_set_validation_exec(vm, NULL);
		if (IS_ERR(fence)) {
			if (PTR_ERR(fence) == -ENODATA)
				vm_bind_ioctl_ops_fini(vm, vops, NULL);
			return fence;
		}

		vm_bind_ioctl_ops_fini(vm, vops, fence);
	}

	/*
	 * NOTE(review): assumes the guard sets @err whenever its body never
	 * ran, so @fence is not read uninitialized here — confirm against
	 * xe_validation_guard's definition.
	 */
	return err ? ERR_PTR(err) : fence;
}
ALLOW_ERROR_INJECTION(vm_bind_ioctl_ops_execute, ERRNO);
3365 
/* Bind-op flags accepted from userspace by the VM_BIND ioctl. */
#define SUPPORTED_FLAGS_STUB  \
	(DRM_XE_VM_BIND_FLAG_READONLY | \
	 DRM_XE_VM_BIND_FLAG_IMMEDIATE | \
	 DRM_XE_VM_BIND_FLAG_NULL | \
	 DRM_XE_VM_BIND_FLAG_DUMPABLE | \
	 DRM_XE_VM_BIND_FLAG_CHECK_PXP | \
	 DRM_XE_VM_BIND_FLAG_CPU_ADDR_MIRROR | \
	 DRM_XE_VM_BIND_FLAG_MADVISE_AUTORESET)

/* Test builds additionally accept FORCE_OP_ERROR to exercise error paths. */
#ifdef TEST_VM_OPS_ERROR
#define SUPPORTED_FLAGS	(SUPPORTED_FLAGS_STUB | FORCE_OP_ERROR)
#else
#define SUPPORTED_FLAGS	SUPPORTED_FLAGS_STUB
#endif

/* Alignment mask for platforms requiring 64K VM_BIND granularity. */
#define XE_64K_PAGE_MASK 0xffffull
#define ALL_DRM_XE_SYNCS_FLAGS (DRM_XE_SYNCS_FLAG_WAIT_FOR_OP)
3383 
/*
 * Validate the userspace-supplied VM_BIND arguments and resolve the
 * array of bind ops.
 *
 * For a single bind, *bind_ops points at the struct embedded in @args
 * and must NOT be freed; for num_binds > 1 a kvmalloc'ed copy of the
 * user vector is returned and later released with kvfree().
 *
 * Returns 0 on success, negative error code on invalid input or
 * allocation/copy failure (in which case *bind_ops is NULL).
 */
static int vm_bind_ioctl_check_args(struct xe_device *xe, struct xe_vm *vm,
				    struct drm_xe_vm_bind *args,
				    struct drm_xe_vm_bind_op **bind_ops)
{
	int err;
	int i;

	/* Reject non-zero padding/reserved fields for forward compatibility. */
	if (XE_IOCTL_DBG(xe, args->pad || args->pad2) ||
	    XE_IOCTL_DBG(xe, args->reserved[0] || args->reserved[1]))
		return -EINVAL;

	if (XE_IOCTL_DBG(xe, args->extensions))
		return -EINVAL;

	if (XE_IOCTL_DBG(xe, args->num_syncs > DRM_XE_MAX_SYNCS))
		return -EINVAL;

	if (args->num_binds > 1) {
		u64 __user *bind_user =
			u64_to_user_ptr(args->vector_of_binds);

		*bind_ops = kvmalloc_objs(struct drm_xe_vm_bind_op,
					  args->num_binds,
					  GFP_KERNEL | __GFP_ACCOUNT | __GFP_RETRY_MAYFAIL | __GFP_NOWARN);
		if (!*bind_ops)
			return args->num_binds > 1 ? -ENOBUFS : -ENOMEM;

		err = copy_from_user(*bind_ops, bind_user,
				     sizeof(struct drm_xe_vm_bind_op) *
				     args->num_binds);
		if (XE_IOCTL_DBG(xe, err)) {
			err = -EFAULT;
			goto free_bind_ops;
		}
	} else {
		/* Single bind: use the op embedded in the ioctl args. */
		*bind_ops = &args->bind;
	}

	/* Per-op validation of addresses, flags, op type and PAT index. */
	for (i = 0; i < args->num_binds; ++i) {
		u64 range = (*bind_ops)[i].range;
		u64 addr = (*bind_ops)[i].addr;
		u32 op = (*bind_ops)[i].op;
		u32 flags = (*bind_ops)[i].flags;
		u32 obj = (*bind_ops)[i].obj;
		u64 obj_offset = (*bind_ops)[i].obj_offset;
		u32 prefetch_region = (*bind_ops)[i].prefetch_mem_region_instance;
		bool is_null = flags & DRM_XE_VM_BIND_FLAG_NULL;
		bool is_cpu_addr_mirror = flags &
			DRM_XE_VM_BIND_FLAG_CPU_ADDR_MIRROR;
		u16 pat_index = (*bind_ops)[i].pat_index;
		u16 coh_mode;
		bool comp_en;

		/* CPU address mirror binds require SVM support and a fault-mode VM. */
		if (XE_IOCTL_DBG(xe, is_cpu_addr_mirror &&
				 (!xe_vm_in_fault_mode(vm) ||
				 !IS_ENABLED(CONFIG_DRM_XE_GPUSVM)))) {
			err = -EINVAL;
			goto free_bind_ops;
		}

		if (XE_IOCTL_DBG(xe, pat_index >= xe->pat.n_entries)) {
			err = -EINVAL;
			goto free_bind_ops;
		}

		/* Clamp the (already range-checked) index against speculation. */
		pat_index = array_index_nospec(pat_index, xe->pat.n_entries);
		(*bind_ops)[i].pat_index = pat_index;
		coh_mode = xe_pat_index_get_coh_mode(xe, pat_index);
		comp_en = xe_pat_index_get_comp_en(xe, pat_index);
		if (XE_IOCTL_DBG(xe, !coh_mode)) { /* hw reserved */
			err = -EINVAL;
			goto free_bind_ops;
		}

		if (XE_WARN_ON(coh_mode > XE_COH_AT_LEAST_1WAY)) {
			err = -EINVAL;
			goto free_bind_ops;
		}

		/* Cross-checks between op type, flags, object and coherency. */
		if (XE_IOCTL_DBG(xe, op > DRM_XE_VM_BIND_OP_PREFETCH) ||
		    XE_IOCTL_DBG(xe, flags & ~SUPPORTED_FLAGS) ||
		    XE_IOCTL_DBG(xe, obj && (is_null || is_cpu_addr_mirror)) ||
		    XE_IOCTL_DBG(xe, obj_offset && (is_null ||
						    is_cpu_addr_mirror)) ||
		    XE_IOCTL_DBG(xe, op != DRM_XE_VM_BIND_OP_MAP &&
				 (is_null || is_cpu_addr_mirror)) ||
		    XE_IOCTL_DBG(xe, !obj &&
				 op == DRM_XE_VM_BIND_OP_MAP &&
				 !is_null && !is_cpu_addr_mirror) ||
		    XE_IOCTL_DBG(xe, !obj &&
				 op == DRM_XE_VM_BIND_OP_UNMAP_ALL) ||
		    XE_IOCTL_DBG(xe, addr &&
				 op == DRM_XE_VM_BIND_OP_UNMAP_ALL) ||
		    XE_IOCTL_DBG(xe, range &&
				 op == DRM_XE_VM_BIND_OP_UNMAP_ALL) ||
		    XE_IOCTL_DBG(xe, obj &&
				 op == DRM_XE_VM_BIND_OP_MAP_USERPTR) ||
		    XE_IOCTL_DBG(xe, coh_mode == XE_COH_NONE &&
				 op == DRM_XE_VM_BIND_OP_MAP_USERPTR) ||
		    XE_IOCTL_DBG(xe, comp_en &&
				 op == DRM_XE_VM_BIND_OP_MAP_USERPTR) ||
		    XE_IOCTL_DBG(xe, op == DRM_XE_VM_BIND_OP_MAP_USERPTR &&
				 !IS_ENABLED(CONFIG_DRM_GPUSVM)) ||
		    XE_IOCTL_DBG(xe, obj &&
				 op == DRM_XE_VM_BIND_OP_PREFETCH) ||
		    XE_IOCTL_DBG(xe, prefetch_region &&
				 op != DRM_XE_VM_BIND_OP_PREFETCH) ||
		    XE_IOCTL_DBG(xe, (prefetch_region != DRM_XE_CONSULT_MEM_ADVISE_PREF_LOC &&
				      /* Guard against undefined shift in BIT(prefetch_region) */
				      (prefetch_region >= (sizeof(xe->info.mem_region_mask) * 8) ||
				      !(BIT(prefetch_region) & xe->info.mem_region_mask)))) ||
		    XE_IOCTL_DBG(xe, obj &&
				 op == DRM_XE_VM_BIND_OP_UNMAP) ||
		    XE_IOCTL_DBG(xe, (flags & DRM_XE_VM_BIND_FLAG_MADVISE_AUTORESET) &&
				 (!is_cpu_addr_mirror || op != DRM_XE_VM_BIND_OP_MAP))) {
			err = -EINVAL;
			goto free_bind_ops;
		}

		/* Address, range and offset must be page aligned; range non-zero
		 * except for UNMAP_ALL which carries no range.
		 */
		if (XE_IOCTL_DBG(xe, obj_offset & ~PAGE_MASK) ||
		    XE_IOCTL_DBG(xe, addr & ~PAGE_MASK) ||
		    XE_IOCTL_DBG(xe, range & ~PAGE_MASK) ||
		    XE_IOCTL_DBG(xe, !range &&
				 op != DRM_XE_VM_BIND_OP_UNMAP_ALL)) {
			err = -EINVAL;
			goto free_bind_ops;
		}
	}

	return 0;

free_bind_ops:
	/* Only the multi-bind path owns a heap allocation. */
	if (args->num_binds > 1)
		kvfree(*bind_ops);
	*bind_ops = NULL;
	return err;
}
3521 
3522 static int vm_bind_ioctl_signal_fences(struct xe_vm *vm,
3523 				       struct xe_exec_queue *q,
3524 				       struct xe_sync_entry *syncs,
3525 				       int num_syncs)
3526 {
3527 	struct dma_fence *fence = NULL;
3528 	int i, err = 0;
3529 
3530 	if (num_syncs) {
3531 		fence = xe_sync_in_fence_get(syncs, num_syncs,
3532 					     to_wait_exec_queue(vm, q), vm);
3533 		if (IS_ERR(fence))
3534 			return PTR_ERR(fence);
3535 
3536 		for (i = 0; i < num_syncs; i++)
3537 			xe_sync_entry_signal(&syncs[i], fence);
3538 	}
3539 
3540 	dma_fence_put(fence);
3541 
3542 	return err;
3543 }
3544 
3545 static void xe_vma_ops_init(struct xe_vma_ops *vops, struct xe_vm *vm,
3546 			    struct xe_exec_queue *q,
3547 			    struct xe_sync_entry *syncs, u32 num_syncs)
3548 {
3549 	memset(vops, 0, sizeof(*vops));
3550 	INIT_LIST_HEAD(&vops->list);
3551 	vops->vm = vm;
3552 	vops->q = q;
3553 	vops->syncs = syncs;
3554 	vops->num_syncs = num_syncs;
3555 	vops->flags = 0;
3556 }
3557 
/*
 * Validate a single bind op against the BO it maps: size/offset bounds,
 * 64K alignment where the platform requires it, coherency-vs-caching
 * rules, compression restrictions, and PXP key validity.
 *
 * Returns 0 if the mapping is allowed, -EINVAL on an invalid
 * combination, or -ENOEXEC if a protected BO's PXP key is stale.
 */
static int xe_vm_bind_ioctl_validate_bo(struct xe_device *xe, struct xe_bo *bo,
					u64 addr, u64 range, u64 obj_offset,
					u16 pat_index, u32 op, u32 bind_flags)
{
	u16 coh_mode;
	bool comp_en;

	/* BOs opted out of compression must not use a compressing PAT index. */
	if (XE_IOCTL_DBG(xe, (bo->flags & XE_BO_FLAG_NO_COMPRESSION) &&
			 xe_pat_index_get_comp_en(xe, pat_index)))
		return -EINVAL;

	/* Range/offset must stay within the BO (subtraction avoids overflow). */
	if (XE_IOCTL_DBG(xe, range > xe_bo_size(bo)) ||
	    XE_IOCTL_DBG(xe, obj_offset >
			 xe_bo_size(bo) - range)) {
		return -EINVAL;
	}

	/*
	 * Some platforms require 64k VM_BIND alignment,
	 * specifically those with XE_VRAM_FLAGS_NEED64K.
	 *
	 * Other platforms may have BO's set to 64k physical placement,
	 * but can be mapped at 4k offsets anyway. This check is only
	 * there for the former case.
	 */
	if ((bo->flags & XE_BO_FLAG_INTERNAL_64K) &&
	    (xe->info.vram_flags & XE_VRAM_FLAGS_NEED64K)) {
		if (XE_IOCTL_DBG(xe, obj_offset &
				 XE_64K_PAGE_MASK) ||
		    XE_IOCTL_DBG(xe, addr & XE_64K_PAGE_MASK) ||
		    XE_IOCTL_DBG(xe, range & XE_64K_PAGE_MASK)) {
			return -EINVAL;
		}
	}

	/* Non-coherent PAT modes are incompatible with CPU write-back caching. */
	coh_mode = xe_pat_index_get_coh_mode(xe, pat_index);
	if (bo->cpu_caching) {
		if (XE_IOCTL_DBG(xe, coh_mode == XE_COH_NONE &&
				 bo->cpu_caching == DRM_XE_GEM_CPU_CACHING_WB)) {
			return -EINVAL;
		}
	} else if (XE_IOCTL_DBG(xe, coh_mode == XE_COH_NONE)) {
		/*
		 * Imported dma-buf from a different device should
		 * require 1way or 2way coherency since we don't know
		 * how it was mapped on the CPU. Just assume is it
		 * potentially cached on CPU side.
		 */
		return -EINVAL;
	}

	/*
	 * Ensures that imported buffer objects (dma-bufs) are not mapped
	 * with a PAT index that enables compression.
	 */
	comp_en = xe_pat_index_get_comp_en(xe, pat_index);
	if (XE_IOCTL_DBG(xe, bo->ttm.base.import_attach && comp_en))
		return -EINVAL;

	/* If a BO is protected it can only be mapped if the key is still valid */
	if ((bind_flags & DRM_XE_VM_BIND_FLAG_CHECK_PXP) && xe_bo_is_protected(bo) &&
	    op != DRM_XE_VM_BIND_OP_UNMAP && op != DRM_XE_VM_BIND_OP_UNMAP_ALL)
		if (XE_IOCTL_DBG(xe, xe_pxp_bo_key_check(xe->pxp, bo) != 0))
			return -ENOEXEC;

	return 0;
}
3625 
/**
 * xe_vm_bind_ioctl() - Handler for DRM_IOCTL_XE_VM_BIND
 * @dev: DRM device
 * @data: ioctl payload, a struct drm_xe_vm_bind
 * @file: DRM file the ioctl arrived on
 *
 * Validates the arguments, looks up the VM, optional exec queue, BOs and
 * syncs, builds the combined list of VMA operations for all binds and
 * executes them, signaling the syncs on completion. Cleanup is done via
 * the chained goto labels in reverse order of acquisition.
 *
 * Return: 0 on success, negative error code on failure.
 */
int xe_vm_bind_ioctl(struct drm_device *dev, void *data, struct drm_file *file)
{
	struct xe_device *xe = to_xe_device(dev);
	struct xe_file *xef = to_xe_file(file);
	struct drm_xe_vm_bind *args = data;
	struct drm_xe_sync __user *syncs_user;
	struct xe_bo **bos = NULL;
	struct drm_gpuva_ops **ops = NULL;
	struct xe_vm *vm;
	struct xe_exec_queue *q = NULL;
	u32 num_syncs, num_ufence = 0;
	struct xe_sync_entry *syncs = NULL;
	struct drm_xe_vm_bind_op *bind_ops = NULL;
	struct xe_vma_ops vops;
	struct dma_fence *fence;
	int err;
	int i;

	vm = xe_vm_lookup(xef, args->vm_id);
	if (XE_IOCTL_DBG(xe, !vm))
		return -EINVAL;

	err = vm_bind_ioctl_check_args(xe, vm, args, &bind_ops);
	if (err)
		goto put_vm;

	/* An explicit exec queue must be a VM-bind queue belonging to this VM. */
	if (args->exec_queue_id) {
		q = xe_exec_queue_lookup(xef, args->exec_queue_id);
		if (XE_IOCTL_DBG(xe, !q)) {
			err = -ENOENT;
			goto free_bind_ops;
		}

		if (XE_IOCTL_DBG(xe, !(q->flags & EXEC_QUEUE_FLAG_VM))) {
			err = -EINVAL;
			goto put_exec_queue;
		}
	}

	if (XE_IOCTL_DBG(xe, q && vm != q->user_vm)) {
		err = -EINVAL;
		goto put_exec_queue;
	}

	/* Ensure all UNMAPs visible */
	xe_svm_flush(vm);

	err = down_write_killable(&vm->lock);
	if (err)
		goto put_exec_queue;

	if (XE_IOCTL_DBG(xe, xe_vm_is_closed_or_banned(vm))) {
		err = -ENOENT;
		goto release_vm_lock;
	}

	/* Every bind must fall entirely within the VM's address space. */
	for (i = 0; i < args->num_binds; ++i) {
		u64 range = bind_ops[i].range;
		u64 addr = bind_ops[i].addr;

		if (XE_IOCTL_DBG(xe, range > vm->size) ||
		    XE_IOCTL_DBG(xe, addr > vm->size - range)) {
			err = -EINVAL;
			goto release_vm_lock;
		}
	}

	if (args->num_binds) {
		bos = kvzalloc_objs(*bos, args->num_binds,
				    GFP_KERNEL | __GFP_ACCOUNT | __GFP_RETRY_MAYFAIL | __GFP_NOWARN);
		if (!bos) {
			err = -ENOMEM;
			goto release_vm_lock;
		}

		ops = kvzalloc_objs(*ops, args->num_binds,
				    GFP_KERNEL | __GFP_ACCOUNT | __GFP_RETRY_MAYFAIL | __GFP_NOWARN);
		if (!ops) {
			err = -ENOMEM;
			goto free_bos;
		}
	}

	/* Resolve and validate the BO (if any) referenced by each bind op. */
	for (i = 0; i < args->num_binds; ++i) {
		struct drm_gem_object *gem_obj;
		u64 range = bind_ops[i].range;
		u64 addr = bind_ops[i].addr;
		u32 obj = bind_ops[i].obj;
		u64 obj_offset = bind_ops[i].obj_offset;
		u16 pat_index = bind_ops[i].pat_index;
		u32 op = bind_ops[i].op;
		u32 bind_flags = bind_ops[i].flags;

		if (!obj)
			continue;

		gem_obj = drm_gem_object_lookup(file, obj);
		if (XE_IOCTL_DBG(xe, !gem_obj)) {
			err = -ENOENT;
			goto put_obj;
		}
		bos[i] = gem_to_xe_bo(gem_obj);

		err = xe_vm_bind_ioctl_validate_bo(xe, bos[i], addr, range,
						   obj_offset, pat_index, op,
						   bind_flags);
		if (err)
			goto put_obj;
	}

	if (args->num_syncs) {
		syncs = kzalloc_objs(*syncs, args->num_syncs);
		if (!syncs) {
			err = -ENOMEM;
			goto put_obj;
		}
	}

	/* Parse user syncs; user fences are only allowed with actual binds. */
	syncs_user = u64_to_user_ptr(args->syncs);
	for (num_syncs = 0; num_syncs < args->num_syncs; num_syncs++) {
		struct xe_exec_queue *__q = q ?: vm->q[0];

		err = xe_sync_entry_parse(xe, xef, &syncs[num_syncs],
					  &syncs_user[num_syncs],
					  __q->ufence_syncobj,
					  ++__q->ufence_timeline_value,
					  (xe_vm_in_lr_mode(vm) ?
					   SYNC_PARSE_FLAG_LR_MODE : 0) |
					  (!args->num_binds ?
					   SYNC_PARSE_FLAG_DISALLOW_USER_FENCE : 0));
		if (err)
			goto free_syncs;

		if (xe_sync_is_ufence(&syncs[num_syncs]))
			num_ufence++;
	}

	/* At most one user fence per bind ioctl. */
	if (XE_IOCTL_DBG(xe, num_ufence > 1)) {
		err = -EINVAL;
		goto free_syncs;
	}

	/* No binds: -ENODATA routes to the signal-only path below. */
	if (!args->num_binds) {
		err = -ENODATA;
		goto free_syncs;
	}

	/* Build one combined ops list covering all bind ops. */
	xe_vma_ops_init(&vops, vm, q, syncs, num_syncs);
	if (args->num_binds > 1)
		vops.flags |= XE_VMA_OPS_ARRAY_OF_BINDS;
	for (i = 0; i < args->num_binds; ++i) {
		u64 range = bind_ops[i].range;
		u64 addr = bind_ops[i].addr;
		u32 op = bind_ops[i].op;
		u32 flags = bind_ops[i].flags;
		u64 obj_offset = bind_ops[i].obj_offset;
		u32 prefetch_region = bind_ops[i].prefetch_mem_region_instance;
		u16 pat_index = bind_ops[i].pat_index;

		ops[i] = vm_bind_ioctl_ops_create(vm, &vops, bos[i], obj_offset,
						  addr, range, op, flags,
						  prefetch_region, pat_index);
		if (IS_ERR(ops[i])) {
			err = PTR_ERR(ops[i]);
			ops[i] = NULL;
			goto unwind_ops;
		}

		err = vm_bind_ioctl_ops_parse(vm, ops[i], &vops);
		if (err)
			goto unwind_ops;

#ifdef TEST_VM_OPS_ERROR
		if (flags & FORCE_OP_ERROR) {
			vops.inject_error = true;
			vm->xe->vm_inject_error_position =
				(vm->xe->vm_inject_error_position + 1) %
				FORCE_OP_ERROR_COUNT;
		}
#endif
	}

	/* Nothing to do */
	if (list_empty(&vops.list)) {
		err = -ENODATA;
		goto unwind_ops;
	}

	err = xe_vma_ops_alloc(&vops, args->num_binds > 1);
	if (err)
		goto unwind_ops;

	err = vm_bind_ioctl_ops_prefetch_ranges(vm, &vops);
	if (err)
		goto unwind_ops;

	fence = vm_bind_ioctl_ops_execute(vm, &vops);
	if (IS_ERR(fence))
		err = PTR_ERR(fence);
	else
		dma_fence_put(fence);

unwind_ops:
	/* -ENODATA means nothing executed; still signal syncs further down. */
	if (err && err != -ENODATA)
		vm_bind_ioctl_ops_unwind(vm, ops, args->num_binds);
	xe_vma_ops_fini(&vops);
	for (i = args->num_binds - 1; i >= 0; --i)
		if (ops[i])
			drm_gpuva_ops_free(&vm->gpuvm, ops[i]);
free_syncs:
	if (err == -ENODATA)
		err = vm_bind_ioctl_signal_fences(vm, q, syncs, num_syncs);
	while (num_syncs--)
		xe_sync_entry_cleanup(&syncs[num_syncs]);

	kfree(syncs);
put_obj:
	for (i = 0; i < args->num_binds; ++i)
		xe_bo_put(bos[i]);

	kvfree(ops);
free_bos:
	kvfree(bos);
release_vm_lock:
	up_write(&vm->lock);
put_exec_queue:
	if (q)
		xe_exec_queue_put(q);
free_bind_ops:
	/* check_args only heap-allocates bind_ops for the multi-bind case. */
	if (args->num_binds > 1)
		kvfree(bind_ops);
put_vm:
	xe_vm_put(vm);
	return err;
}
3861 
3862 /**
3863  * xe_vm_bind_kernel_bo - bind a kernel BO to a VM
3864  * @vm: VM to bind the BO to
3865  * @bo: BO to bind
3866  * @q: exec queue to use for the bind (optional)
3867  * @addr: address at which to bind the BO
3868  * @cache_lvl: PAT cache level to use
3869  *
3870  * Execute a VM bind map operation on a kernel-owned BO to bind it into a
3871  * kernel-owned VM.
3872  *
3873  * Returns a dma_fence to track the binding completion if the job to do so was
3874  * successfully submitted, an error pointer otherwise.
3875  */
3876 struct dma_fence *xe_vm_bind_kernel_bo(struct xe_vm *vm, struct xe_bo *bo,
3877 				       struct xe_exec_queue *q, u64 addr,
3878 				       enum xe_cache_level cache_lvl)
3879 {
3880 	struct xe_vma_ops vops;
3881 	struct drm_gpuva_ops *ops = NULL;
3882 	struct dma_fence *fence;
3883 	int err;
3884 
3885 	xe_bo_get(bo);
3886 	xe_vm_get(vm);
3887 	if (q)
3888 		xe_exec_queue_get(q);
3889 
3890 	down_write(&vm->lock);
3891 
3892 	xe_vma_ops_init(&vops, vm, q, NULL, 0);
3893 
3894 	ops = vm_bind_ioctl_ops_create(vm, &vops, bo, 0, addr, xe_bo_size(bo),
3895 				       DRM_XE_VM_BIND_OP_MAP, 0, 0,
3896 				       vm->xe->pat.idx[cache_lvl]);
3897 	if (IS_ERR(ops)) {
3898 		err = PTR_ERR(ops);
3899 		goto release_vm_lock;
3900 	}
3901 
3902 	err = vm_bind_ioctl_ops_parse(vm, ops, &vops);
3903 	if (err)
3904 		goto release_vm_lock;
3905 
3906 	xe_assert(vm->xe, !list_empty(&vops.list));
3907 
3908 	err = xe_vma_ops_alloc(&vops, false);
3909 	if (err)
3910 		goto unwind_ops;
3911 
3912 	fence = vm_bind_ioctl_ops_execute(vm, &vops);
3913 	if (IS_ERR(fence))
3914 		err = PTR_ERR(fence);
3915 
3916 unwind_ops:
3917 	if (err && err != -ENODATA)
3918 		vm_bind_ioctl_ops_unwind(vm, &ops, 1);
3919 
3920 	xe_vma_ops_fini(&vops);
3921 	drm_gpuva_ops_free(&vm->gpuvm, ops);
3922 
3923 release_vm_lock:
3924 	up_write(&vm->lock);
3925 
3926 	if (q)
3927 		xe_exec_queue_put(q);
3928 	xe_vm_put(vm);
3929 	xe_bo_put(bo);
3930 
3931 	if (err)
3932 		fence = ERR_PTR(err);
3933 
3934 	return fence;
3935 }
3936 
3937 /**
3938  * xe_vm_lock() - Lock the vm's dma_resv object
3939  * @vm: The struct xe_vm whose lock is to be locked
3940  * @intr: Whether to perform any wait interruptible
3941  *
3942  * Return: 0 on success, -EINTR if @intr is true and the wait for a
3943  * contended lock was interrupted. If @intr is false, the function
3944  * always returns 0.
3945  */
3946 int xe_vm_lock(struct xe_vm *vm, bool intr)
3947 {
3948 	int ret;
3949 
3950 	if (intr)
3951 		ret = dma_resv_lock_interruptible(xe_vm_resv(vm), NULL);
3952 	else
3953 		ret = dma_resv_lock(xe_vm_resv(vm), NULL);
3954 
3955 	return ret;
3956 }
3957 
3958 /**
3959  * xe_vm_unlock() - Unlock the vm's dma_resv object
3960  * @vm: The struct xe_vm whose lock is to be released.
3961  *
3962  * Unlock a buffer object lock that was locked by xe_vm_lock().
3963  */
3964 void xe_vm_unlock(struct xe_vm *vm)
3965 {
3966 	dma_resv_unlock(xe_vm_resv(vm));
3967 }
3968 
3969 /**
3970  * xe_vm_range_tilemask_tlb_inval - Issue a TLB invalidation on this tilemask for an
3971  * address range
3972  * @vm: The VM
3973  * @start: start address
3974  * @end: end address
3975  * @tile_mask: mask for which gt's issue tlb invalidation
3976  *
3977  * Issue a range based TLB invalidation for gt's in tilemask
3978  *
3979  * Returns 0 for success, negative error code otherwise.
3980  */
3981 int xe_vm_range_tilemask_tlb_inval(struct xe_vm *vm, u64 start,
3982 				   u64 end, u8 tile_mask)
3983 {
3984 	struct xe_tlb_inval_fence
3985 		fence[XE_MAX_TILES_PER_DEVICE * XE_MAX_GT_PER_TILE];
3986 	struct xe_tile *tile;
3987 	u32 fence_id = 0;
3988 	u8 id;
3989 	int err;
3990 
3991 	if (!tile_mask)
3992 		return 0;
3993 
3994 	for_each_tile(tile, vm->xe, id) {
3995 		if (!(tile_mask & BIT(id)))
3996 			continue;
3997 
3998 		xe_tlb_inval_fence_init(&tile->primary_gt->tlb_inval,
3999 					&fence[fence_id], true);
4000 
4001 		err = xe_tlb_inval_range(&tile->primary_gt->tlb_inval,
4002 					 &fence[fence_id], start, end,
4003 					 vm->usm.asid, NULL);
4004 		if (err)
4005 			goto wait;
4006 		++fence_id;
4007 
4008 		if (!tile->media_gt)
4009 			continue;
4010 
4011 		xe_tlb_inval_fence_init(&tile->media_gt->tlb_inval,
4012 					&fence[fence_id], true);
4013 
4014 		err = xe_tlb_inval_range(&tile->media_gt->tlb_inval,
4015 					 &fence[fence_id], start, end,
4016 					 vm->usm.asid, NULL);
4017 		if (err)
4018 			goto wait;
4019 		++fence_id;
4020 	}
4021 
4022 wait:
4023 	for (id = 0; id < fence_id; ++id)
4024 		xe_tlb_inval_fence_wait(&fence[id]);
4025 
4026 	return err;
4027 }
4028 
4029 /**
4030  * xe_vm_invalidate_vma - invalidate GPU mappings for VMA without a lock
4031  * @vma: VMA to invalidate
4032  *
4033  * Walks a list of page tables leaves which it memset the entries owned by this
4034  * VMA to zero, invalidates the TLBs, and block until TLBs invalidation is
4035  * complete.
4036  *
4037  * Returns 0 for success, negative error code otherwise.
4038  */
4039 int xe_vm_invalidate_vma(struct xe_vma *vma)
4040 {
4041 	struct xe_device *xe = xe_vma_vm(vma)->xe;
4042 	struct xe_vm *vm = xe_vma_vm(vma);
4043 	struct xe_tile *tile;
4044 	u8 tile_mask = 0;
4045 	int ret = 0;
4046 	u8 id;
4047 
4048 	xe_assert(xe, !xe_vma_is_null(vma));
4049 	xe_assert(xe, !xe_vma_is_cpu_addr_mirror(vma));
4050 	trace_xe_vma_invalidate(vma);
4051 
4052 	vm_dbg(&vm->xe->drm,
4053 	       "INVALIDATE: addr=0x%016llx, range=0x%016llx",
4054 		xe_vma_start(vma), xe_vma_size(vma));
4055 
4056 	/*
4057 	 * Check that we don't race with page-table updates, tile_invalidated
4058 	 * update is safe
4059 	 */
4060 	if (IS_ENABLED(CONFIG_PROVE_LOCKING)) {
4061 		if (xe_vma_is_userptr(vma)) {
4062 			lockdep_assert(lockdep_is_held_type(&vm->svm.gpusvm.notifier_lock, 0) ||
4063 				       (lockdep_is_held_type(&vm->svm.gpusvm.notifier_lock, 1) &&
4064 					lockdep_is_held(&xe_vm_resv(vm)->lock.base)));
4065 
4066 			WARN_ON_ONCE(!mmu_interval_check_retry
4067 				     (&to_userptr_vma(vma)->userptr.notifier,
4068 				      to_userptr_vma(vma)->userptr.pages.notifier_seq));
4069 			WARN_ON_ONCE(!dma_resv_test_signaled(xe_vm_resv(vm),
4070 							     DMA_RESV_USAGE_BOOKKEEP));
4071 
4072 		} else {
4073 			xe_bo_assert_held(xe_vma_bo(vma));
4074 		}
4075 	}
4076 
4077 	for_each_tile(tile, xe, id)
4078 		if (xe_pt_zap_ptes(tile, vma))
4079 			tile_mask |= BIT(id);
4080 
4081 	xe_device_wmb(xe);
4082 
4083 	ret = xe_vm_range_tilemask_tlb_inval(xe_vma_vm(vma), xe_vma_start(vma),
4084 					     xe_vma_end(vma), tile_mask);
4085 
4086 	/* WRITE_ONCE pairs with READ_ONCE in xe_vm_has_valid_gpu_mapping() */
4087 	WRITE_ONCE(vma->tile_invalidated, vma->tile_mask);
4088 
4089 	return ret;
4090 }
4091 
4092 int xe_vm_validate_protected(struct xe_vm *vm)
4093 {
4094 	struct drm_gpuva *gpuva;
4095 	int err = 0;
4096 
4097 	if (!vm)
4098 		return -ENODEV;
4099 
4100 	mutex_lock(&vm->snap_mutex);
4101 
4102 	drm_gpuvm_for_each_va(gpuva, &vm->gpuvm) {
4103 		struct xe_vma *vma = gpuva_to_vma(gpuva);
4104 		struct xe_bo *bo = vma->gpuva.gem.obj ?
4105 			gem_to_xe_bo(vma->gpuva.gem.obj) : NULL;
4106 
4107 		if (!bo)
4108 			continue;
4109 
4110 		if (xe_bo_is_protected(bo)) {
4111 			err = xe_pxp_bo_key_check(vm->xe->pxp, bo);
4112 			if (err)
4113 				break;
4114 		}
4115 	}
4116 
4117 	mutex_unlock(&vm->snap_mutex);
4118 	return err;
4119 }
4120 
/* Coredump snapshot of a VM's dumpable mappings. */
struct xe_vm_snapshot {
	int uapi_flags;		/* DRM_XE_VM_CREATE_FLAG_* reconstructed from vm->flags */
	unsigned long num_snaps;	/* number of entries in snap[] */
	struct {
		u64 ofs, bo_ofs;	/* GPU VA; offset into BO or userptr address */
		unsigned long len;	/* mapping length in bytes */
#define XE_VM_SNAP_FLAG_USERPTR		BIT(0)
#define XE_VM_SNAP_FLAG_READ_ONLY	BIT(1)
#define XE_VM_SNAP_FLAG_IS_NULL		BIT(2)
		unsigned long flags;	/* XE_VM_SNAP_FLAG_* */
		int uapi_mem_region;	/* 0 = system, 1/2 = VRAM0/1, -1 = n/a */
		int pat_index;
		int cpu_caching;
		struct xe_bo *bo;	/* ref held until capture_delayed drops it */
		void *data;		/* captured contents, or ERR_PTR on failure */
		struct mm_struct *mm;	/* held mm for userptr capture, or NULL */
	} snap[];		/* flexible array, sized by num_snaps */
};
4139 
/*
 * Capture snapshot metadata for all dumpable VMAs of @vm under the snap
 * mutex. Only references (BO gets, mm gets) are taken here — the actual
 * memory contents are copied later by xe_vm_snapshot_capture_delayed().
 *
 * Returns the snapshot, ERR_PTR(-ENOMEM) on allocation failure, or
 * ERR_PTR(-ENODEV) when there is nothing dumpable. NULL @vm yields NULL.
 */
struct xe_vm_snapshot *xe_vm_snapshot_capture(struct xe_vm *vm)
{
	unsigned long num_snaps = 0, i;
	struct xe_vm_snapshot *snap = NULL;
	struct drm_gpuva *gpuva;

	if (!vm)
		return NULL;

	mutex_lock(&vm->snap_mutex);
	/* First pass: count dumpable VMAs to size the allocation. */
	drm_gpuvm_for_each_va(gpuva, &vm->gpuvm) {
		if (gpuva->flags & XE_VMA_DUMPABLE)
			num_snaps++;
	}

	/* GFP_NOWAIT: this may run from a coredump/reset context. */
	if (num_snaps)
		snap = kvzalloc(offsetof(struct xe_vm_snapshot, snap[num_snaps]), GFP_NOWAIT);
	if (!snap) {
		snap = num_snaps ? ERR_PTR(-ENOMEM) : ERR_PTR(-ENODEV);
		goto out_unlock;
	}

	/* Translate internal VM flags back to their uapi equivalents. */
	if (vm->flags & XE_VM_FLAG_FAULT_MODE)
		snap->uapi_flags |= DRM_XE_VM_CREATE_FLAG_FAULT_MODE;
	if (vm->flags & XE_VM_FLAG_LR_MODE)
		snap->uapi_flags |= DRM_XE_VM_CREATE_FLAG_LR_MODE;
	if (vm->flags & XE_VM_FLAG_SCRATCH_PAGE)
		snap->uapi_flags |= DRM_XE_VM_CREATE_FLAG_SCRATCH_PAGE;

	snap->num_snaps = num_snaps;
	i = 0;
	/* Second pass: record per-VMA metadata and pin backing references. */
	drm_gpuvm_for_each_va(gpuva, &vm->gpuvm) {
		struct xe_vma *vma = gpuva_to_vma(gpuva);
		struct xe_bo *bo = vma->gpuva.gem.obj ?
			gem_to_xe_bo(vma->gpuva.gem.obj) : NULL;

		if (!(gpuva->flags & XE_VMA_DUMPABLE))
			continue;

		snap->snap[i].ofs = xe_vma_start(vma);
		snap->snap[i].len = xe_vma_size(vma);
		snap->snap[i].flags = xe_vma_read_only(vma) ?
			XE_VM_SNAP_FLAG_READ_ONLY : 0;
		snap->snap[i].pat_index = vma->attr.pat_index;
		if (bo) {
			snap->snap[i].cpu_caching = bo->cpu_caching;
			snap->snap[i].bo = xe_bo_get(bo);
			snap->snap[i].bo_ofs = xe_vma_bo_offset(vma);
			switch (bo->ttm.resource->mem_type) {
			case XE_PL_SYSTEM:
			case XE_PL_TT:
				snap->snap[i].uapi_mem_region = 0;
				break;
			case XE_PL_VRAM0:
				snap->snap[i].uapi_mem_region = 1;
				break;
			case XE_PL_VRAM1:
				snap->snap[i].uapi_mem_region = 2;
				break;
			}
		} else if (xe_vma_is_userptr(vma)) {
			struct mm_struct *mm =
				to_userptr_vma(vma)->userptr.notifier.mm;

			/* Hold the mm so the delayed copy can still access it. */
			if (mmget_not_zero(mm))
				snap->snap[i].mm = mm;
			else
				snap->snap[i].data = ERR_PTR(-EFAULT);

			snap->snap[i].bo_ofs = xe_vma_userptr(vma);
			snap->snap[i].flags |= XE_VM_SNAP_FLAG_USERPTR;
			snap->snap[i].uapi_mem_region = 0;
		} else if (xe_vma_is_null(vma)) {
			snap->snap[i].flags |= XE_VM_SNAP_FLAG_IS_NULL;
			snap->snap[i].uapi_mem_region = -1;
		} else {
			snap->snap[i].data = ERR_PTR(-ENOENT);
			snap->snap[i].uapi_mem_region = -1;
		}
		i++;
	}

out_unlock:
	mutex_unlock(&vm->snap_mutex);
	return snap;
}
4226 
/*
 * Second phase of VM snapshotting: copy the actual memory contents for
 * each entry recorded by xe_vm_snapshot_capture(), then drop the BO/mm
 * references taken there. Per-entry failures are stored as ERR_PTR in
 * snap->snap[i].data rather than aborting the whole capture.
 */
void xe_vm_snapshot_capture_delayed(struct xe_vm_snapshot *snap)
{
	if (IS_ERR_OR_NULL(snap))
		return;

	for (int i = 0; i < snap->num_snaps; i++) {
		struct xe_bo *bo = snap->snap[i].bo;
		int err;

		/* Skip entries that already failed or have no contents. */
		if (IS_ERR(snap->snap[i].data) ||
		    snap->snap[i].flags & XE_VM_SNAP_FLAG_IS_NULL)
			continue;

		snap->snap[i].data = kvmalloc(snap->snap[i].len, GFP_USER);
		if (!snap->snap[i].data) {
			snap->snap[i].data = ERR_PTR(-ENOMEM);
			goto cleanup_bo;
		}

		if (bo) {
			err = xe_bo_read(bo, snap->snap[i].bo_ofs,
					 snap->snap[i].data, snap->snap[i].len);
		} else {
			/* Userptr: temporarily adopt the captured mm to copy. */
			void __user *userptr = (void __user *)(size_t)snap->snap[i].bo_ofs;

			kthread_use_mm(snap->snap[i].mm);
			if (!copy_from_user(snap->snap[i].data, userptr, snap->snap[i].len))
				err = 0;
			else
				err = -EFAULT;
			kthread_unuse_mm(snap->snap[i].mm);

			mmput(snap->snap[i].mm);
			snap->snap[i].mm = NULL;
		}

		if (err) {
			kvfree(snap->snap[i].data);
			snap->snap[i].data = ERR_PTR(err);
		}

cleanup_bo:
		/* Always release the BO ref from the capture phase. */
		xe_bo_put(bo);
		snap->snap[i].bo = NULL;
	}
}
4273 
/*
 * Print a captured VM snapshot to @p in the devcoredump format:
 * per-mapping properties followed by the contents ascii85-encoded.
 * Stops early if the coredump printer's buffer fills up.
 */
void xe_vm_snapshot_print(struct xe_vm_snapshot *snap, struct drm_printer *p)
{
	unsigned long i, j;

	if (IS_ERR_OR_NULL(snap)) {
		drm_printf(p, "[0].error: %li\n", PTR_ERR(snap));
		return;
	}

	drm_printf(p, "VM.uapi_flags: 0x%x\n", snap->uapi_flags);
	for (i = 0; i < snap->num_snaps; i++) {
		drm_printf(p, "[%llx].length: 0x%lx\n", snap->snap[i].ofs, snap->snap[i].len);

		drm_printf(p, "[%llx].properties: %s|%s|mem_region=0x%lx|pat_index=%d|cpu_caching=%d\n",
			   snap->snap[i].ofs,
			   snap->snap[i].flags & XE_VM_SNAP_FLAG_READ_ONLY ?
			   "read_only" : "read_write",
			   snap->snap[i].flags & XE_VM_SNAP_FLAG_IS_NULL ?
			   "null_sparse" :
			   snap->snap[i].flags & XE_VM_SNAP_FLAG_USERPTR ?
			   "userptr" : "bo",
			   snap->snap[i].uapi_mem_region == -1 ? 0 :
			   BIT(snap->snap[i].uapi_mem_region),
			   snap->snap[i].pat_index,
			   snap->snap[i].cpu_caching);

		if (IS_ERR(snap->snap[i].data)) {
			drm_printf(p, "[%llx].error: %li\n", snap->snap[i].ofs,
				   PTR_ERR(snap->snap[i].data));
			continue;
		}

		/* NULL/sparse mappings have no contents to dump. */
		if (snap->snap[i].flags & XE_VM_SNAP_FLAG_IS_NULL)
			continue;

		drm_printf(p, "[%llx].data: ", snap->snap[i].ofs);

		/* Contents are dumped u32 at a time; len is page aligned. */
		for (j = 0; j < snap->snap[i].len; j += sizeof(u32)) {
			u32 *val = snap->snap[i].data + j;
			char dumped[ASCII85_BUFSZ];

			drm_puts(p, ascii85_encode(*val, dumped));
		}

		drm_puts(p, "\n");

		if (drm_coredump_printer_is_full(p))
			return;
	}
}
4324 
4325 void xe_vm_snapshot_free(struct xe_vm_snapshot *snap)
4326 {
4327 	unsigned long i;
4328 
4329 	if (IS_ERR_OR_NULL(snap))
4330 		return;
4331 
4332 	for (i = 0; i < snap->num_snaps; i++) {
4333 		if (!IS_ERR(snap->snap[i].data))
4334 			kvfree(snap->snap[i].data);
4335 		xe_bo_put(snap->snap[i].bo);
4336 		if (snap->snap[i].mm)
4337 			mmput(snap->snap[i].mm);
4338 	}
4339 	kvfree(snap);
4340 }
4341 
4342 /**
4343  * xe_vma_need_vram_for_atomic - Check if VMA needs VRAM migration for atomic operations
4344  * @xe: Pointer to the Xe device structure
4345  * @vma: Pointer to the virtual memory area (VMA) structure
4346  * @is_atomic: In pagefault path and atomic operation
4347  *
4348  * This function determines whether the given VMA needs to be migrated to
4349  * VRAM in order to do atomic GPU operation.
4350  *
4351  * Return:
4352  *   1        - Migration to VRAM is required
4353  *   0        - Migration is not required
4354  *   -EACCES  - Invalid access for atomic memory attr
4355  *
4356  */
4357 int xe_vma_need_vram_for_atomic(struct xe_device *xe, struct xe_vma *vma, bool is_atomic)
4358 {
4359 	u32 atomic_access = xe_vma_bo(vma) ? xe_vma_bo(vma)->attr.atomic_access :
4360 					     vma->attr.atomic_access;
4361 
4362 	if (!IS_DGFX(xe) || !is_atomic)
4363 		return false;
4364 
4365 	/*
4366 	 * NOTE: The checks implemented here are platform-specific. For
4367 	 * instance, on a device supporting CXL atomics, these would ideally
4368 	 * work universally without additional handling.
4369 	 */
4370 	switch (atomic_access) {
4371 	case DRM_XE_ATOMIC_DEVICE:
4372 		return !xe->info.has_device_atomics_on_smem;
4373 
4374 	case DRM_XE_ATOMIC_CPU:
4375 		return -EACCES;
4376 
4377 	case DRM_XE_ATOMIC_UNDEFINED:
4378 	case DRM_XE_ATOMIC_GLOBAL:
4379 	default:
4380 		return 1;
4381 	}
4382 }
4383 
/*
 * xe_vm_alloc_vma() - Split/merge VMAs to cover a user-supplied range
 * @vm: VM to operate on; vm->lock must be held for write.
 * @map_req: GPUVM map request describing the target address range.
 * @is_madvise: true when called from the madvise path (split only, no
 *              unmaps expected), false for the CPU-address-mirror path.
 *
 * Builds a drm_gpuva op list (madvise or sm_map flavor), parses it into
 * xe_vma_ops, then commits it under the vm resv lock: destroyed VMAs are
 * freed, and for madvise the memory attributes of the remapped VMA are
 * carried over to the newly created one.
 *
 * Return: 0 on success, negative error code on failure.
 */
static int xe_vm_alloc_vma(struct xe_vm *vm,
			   struct drm_gpuvm_map_req *map_req,
			   bool is_madvise)
{
	struct xe_vma_ops vops;
	struct drm_gpuva_ops *ops = NULL;
	struct drm_gpuva_op *__op;
	unsigned int vma_flags = 0;
	bool remap_op = false;
	struct xe_vma_mem_attr tmp_attr = {};
	u16 default_pat;
	int err;

	lockdep_assert_held_write(&vm->lock);

	/* madvise only splits existing VMAs; sm_map may also merge/unmap */
	if (is_madvise)
		ops = drm_gpuvm_madvise_ops_create(&vm->gpuvm, map_req);
	else
		ops = drm_gpuvm_sm_map_ops_create(&vm->gpuvm, map_req);

	if (IS_ERR(ops))
		return PTR_ERR(ops);

	/* Nothing to do for this range */
	if (list_empty(&ops->list)) {
		err = 0;
		goto free_ops;
	}

	/*
	 * First pass: propagate flags/PAT from the VMAs being torn down to
	 * the MAP ops that will recreate coverage of the range.
	 */
	drm_gpuva_for_each_op(__op, ops) {
		struct xe_vma_op *op = gpuva_op_to_vma_op(__op);
		struct xe_vma *vma = NULL;

		if (!is_madvise) {
			if (__op->op == DRM_GPUVA_OP_UNMAP) {
				vma = gpuva_to_vma(op->base.unmap.va);
				/* Only default-attr VMAs should be merged away */
				XE_WARN_ON(!xe_vma_has_default_mem_attrs(vma));
				default_pat = vma->attr.default_pat_index;
				vma_flags = vma->gpuva.flags;
			}

			if (__op->op == DRM_GPUVA_OP_REMAP) {
				vma = gpuva_to_vma(op->base.remap.unmap->va);
				default_pat = vma->attr.default_pat_index;
				vma_flags = vma->gpuva.flags;
			}

			if (__op->op == DRM_GPUVA_OP_MAP) {
				/*
				 * NOTE(review): default_pat/vma_flags are only
				 * assigned by a preceding UNMAP/REMAP op;
				 * presumably sm_map ops always emit one before
				 * any MAP — confirm, otherwise default_pat is
				 * read uninitialized here.
				 */
				op->map.vma_flags |= vma_flags & XE_VMA_CREATE_MASK;
				op->map.pat_index = default_pat;
			}
		} else {
			if (__op->op == DRM_GPUVA_OP_REMAP) {
				vma = gpuva_to_vma(op->base.remap.unmap->va);
				/* madvise expects exactly one REMAP per MAP */
				xe_assert(vm->xe, !remap_op);
				xe_assert(vm->xe, xe_vma_has_no_bo(vma));
				remap_op = true;
				vma_flags = vma->gpuva.flags;
			}

			if (__op->op == DRM_GPUVA_OP_MAP) {
				xe_assert(vm->xe, remap_op);
				remap_op = false;
				/*
				 * In case of madvise ops DRM_GPUVA_OP_MAP is
				 * always after DRM_GPUVA_OP_REMAP, so ensure
				 * to propagate the flags from the vma we're
				 * unmapping.
				 */
				op->map.vma_flags |= vma_flags & XE_VMA_CREATE_MASK;
			}
		}
		print_op(vm->xe, __op);
	}

	xe_vma_ops_init(&vops, vm, NULL, NULL, 0);

	if (is_madvise)
		vops.flags |= XE_VMA_OPS_FLAG_MADVISE;
	else
		vops.flags |= XE_VMA_OPS_FLAG_ALLOW_SVM_UNMAP;

	err = vm_bind_ioctl_ops_parse(vm, ops, &vops);
	if (err)
		goto unwind_ops;

	/* Second pass, under the vm resv: commit destroys and attr copies */
	xe_vm_lock(vm, false);

	drm_gpuva_for_each_op(__op, ops) {
		struct xe_vma_op *op = gpuva_op_to_vma_op(__op);
		struct xe_vma *vma;

		if (__op->op == DRM_GPUVA_OP_UNMAP) {
			vma = gpuva_to_vma(op->base.unmap.va);
			/* There should be no unmap for madvise */
			if (is_madvise)
				XE_WARN_ON("UNEXPECTED UNMAP");

			xe_vma_destroy(vma, NULL);
		} else if (__op->op == DRM_GPUVA_OP_REMAP) {
			vma = gpuva_to_vma(op->base.remap.unmap->va);
			/* In case of madvise ops Store attributes for REMAP UNMAPPED
			 * VMA, so they can be assigned to newly MAP created vma.
			 */
			if (is_madvise)
				xe_vma_mem_attr_copy(&tmp_attr, &vma->attr);

			xe_vma_destroy(gpuva_to_vma(op->base.remap.unmap->va), NULL);
		} else if (__op->op == DRM_GPUVA_OP_MAP) {
			vma = op->map.vma;
			/* In case of madvise call, MAP will always be followed by REMAP.
			 * Therefore temp_attr will always have sane values, making it safe to
			 * copy them to new vma.
			 */
			if (is_madvise)
				xe_vma_mem_attr_copy(&vma->attr, &tmp_attr);
		}
	}

	xe_vm_unlock(vm);
	drm_gpuva_ops_free(&vm->gpuvm, ops);
	xe_vma_mem_attr_fini(&tmp_attr);
	return 0;

unwind_ops:
	vm_bind_ioctl_ops_unwind(vm, &ops, 1);
free_ops:
	drm_gpuva_ops_free(&vm->gpuvm, ops);
	return err;
}
4513 
4514 /**
4515  * xe_vm_alloc_madvise_vma - Allocate VMA's with madvise ops
4516  * @vm: Pointer to the xe_vm structure
4517  * @start: Starting input address
4518  * @range: Size of the input range
4519  *
4520  * This function splits existing vma to create new vma for user provided input range
4521  *
4522  * Return: 0 if success
4523  */
4524 int xe_vm_alloc_madvise_vma(struct xe_vm *vm, uint64_t start, uint64_t range)
4525 {
4526 	struct drm_gpuvm_map_req map_req = {
4527 		.map.va.addr = start,
4528 		.map.va.range = range,
4529 	};
4530 
4531 	lockdep_assert_held_write(&vm->lock);
4532 
4533 	vm_dbg(&vm->xe->drm, "MADVISE_OPS_CREATE: addr=0x%016llx, size=0x%016llx", start, range);
4534 
4535 	return xe_vm_alloc_vma(vm, &map_req, true);
4536 }
4537 
4538 static bool is_cpu_addr_vma_with_default_attr(struct xe_vma *vma)
4539 {
4540 	return vma && xe_vma_is_cpu_addr_mirror(vma) &&
4541 	       xe_vma_has_default_mem_attrs(vma);
4542 }
4543 
4544 /**
4545  * xe_vm_find_cpu_addr_mirror_vma_range - Extend a VMA range to include adjacent CPU-mirrored VMAs
4546  * @vm: VM to search within
4547  * @start: Input/output pointer to the starting address of the range
4548  * @end: Input/output pointer to the end address of the range
4549  *
4550  * Given a range defined by @start and @range, this function checks the VMAs
4551  * immediately before and after the range. If those neighboring VMAs are
4552  * CPU-address-mirrored and have default memory attributes, the function
4553  * updates @start and @range to include them. This extended range can then
4554  * be used for merging or other operations that require a unified VMA.
4555  *
4556  * The function does not perform the merge itself; it only computes the
4557  * mergeable boundaries.
4558  */
4559 void xe_vm_find_cpu_addr_mirror_vma_range(struct xe_vm *vm, u64 *start, u64 *end)
4560 {
4561 	struct xe_vma *prev, *next;
4562 
4563 	lockdep_assert_held(&vm->lock);
4564 
4565 	if (*start >= SZ_4K) {
4566 		prev = xe_vm_find_vma_by_addr(vm, *start - SZ_4K);
4567 		if (is_cpu_addr_vma_with_default_attr(prev))
4568 			*start = xe_vma_start(prev);
4569 	}
4570 
4571 	if (*end < vm->size) {
4572 		next = xe_vm_find_vma_by_addr(vm, *end + 1);
4573 		if (is_cpu_addr_vma_with_default_attr(next))
4574 			*end = xe_vma_end(next);
4575 	}
4576 }
4577 
4578 /**
4579  * xe_vm_alloc_cpu_addr_mirror_vma - Allocate CPU addr mirror vma
4580  * @vm: Pointer to the xe_vm structure
4581  * @start: Starting input address
4582  * @range: Size of the input range
4583  *
4584  * This function splits/merges existing vma to create new vma for user provided input range
4585  *
4586  * Return: 0 if success
4587  */
4588 int xe_vm_alloc_cpu_addr_mirror_vma(struct xe_vm *vm, uint64_t start, uint64_t range)
4589 {
4590 	struct drm_gpuvm_map_req map_req = {
4591 		.map.va.addr = start,
4592 		.map.va.range = range,
4593 	};
4594 
4595 	lockdep_assert_held_write(&vm->lock);
4596 
4597 	vm_dbg(&vm->xe->drm, "CPU_ADDR_MIRROR_VMA_OPS_CREATE: addr=0x%016llx, size=0x%016llx",
4598 	       start, range);
4599 
4600 	return xe_vm_alloc_vma(vm, &map_req, false);
4601 }
4602 
4603 /**
4604  * xe_vm_add_exec_queue() - Add exec queue to VM
4605  * @vm: The VM.
4606  * @q: The exec_queue
4607  *
4608  * Add exec queue to VM, skipped if the device does not have context based TLB
4609  * invalidations.
4610  */
4611 void xe_vm_add_exec_queue(struct xe_vm *vm, struct xe_exec_queue *q)
4612 {
4613 	struct xe_device *xe = vm->xe;
4614 
4615 	/* User VMs and queues only */
4616 	xe_assert(xe, !(q->flags & EXEC_QUEUE_FLAG_KERNEL));
4617 	xe_assert(xe, !(q->flags & EXEC_QUEUE_FLAG_PERMANENT));
4618 	xe_assert(xe, !(q->flags & EXEC_QUEUE_FLAG_VM));
4619 	xe_assert(xe, !(q->flags & EXEC_QUEUE_FLAG_MIGRATE));
4620 	xe_assert(xe, vm->xef);
4621 	xe_assert(xe, vm == q->vm);
4622 
4623 	if (!xe->info.has_ctx_tlb_inval)
4624 		return;
4625 
4626 	down_write(&vm->exec_queues.lock);
4627 	list_add(&q->vm_exec_queue_link, &vm->exec_queues.list[q->gt->info.id]);
4628 	++vm->exec_queues.count[q->gt->info.id];
4629 	up_write(&vm->exec_queues.lock);
4630 }
4631 
4632 /**
4633  * xe_vm_remove_exec_queue() - Remove exec queue from VM
4634  * @vm: The VM.
4635  * @q: The exec_queue
4636  *
4637  * Remove exec queue from VM, skipped if the device does not have context based
4638  * TLB invalidations.
4639  */
4640 void xe_vm_remove_exec_queue(struct xe_vm *vm, struct xe_exec_queue *q)
4641 {
4642 	if (!vm->xe->info.has_ctx_tlb_inval)
4643 		return;
4644 
4645 	down_write(&vm->exec_queues.lock);
4646 	if (!list_empty(&q->vm_exec_queue_link)) {
4647 		list_del(&q->vm_exec_queue_link);
4648 		--vm->exec_queues.count[q->gt->info.id];
4649 	}
4650 	up_write(&vm->exec_queues.lock);
4651 }
4652