xref: /linux/drivers/gpu/drm/xe/xe_vm.c (revision 6f17ab9a63e670bd62a287f95e3982f99eafd77e)
1 // SPDX-License-Identifier: MIT
2 /*
3  * Copyright © 2021 Intel Corporation
4  */
5 
6 #include "xe_vm.h"
7 
8 #include <linux/dma-fence-array.h>
9 #include <linux/nospec.h>
10 
11 #include <drm/drm_drv.h>
12 #include <drm/drm_exec.h>
13 #include <drm/drm_print.h>
14 #include <drm/ttm/ttm_tt.h>
15 #include <uapi/drm/xe_drm.h>
16 #include <linux/ascii85.h>
17 #include <linux/delay.h>
18 #include <linux/kthread.h>
19 #include <linux/mm.h>
20 #include <linux/swap.h>
21 
22 #include <generated/xe_wa_oob.h>
23 
24 #include "regs/xe_gtt_defs.h"
25 #include "xe_assert.h"
26 #include "xe_bo.h"
27 #include "xe_device.h"
28 #include "xe_drm_client.h"
29 #include "xe_exec_queue.h"
30 #include "xe_gt_pagefault.h"
31 #include "xe_migrate.h"
32 #include "xe_pat.h"
33 #include "xe_pm.h"
34 #include "xe_preempt_fence.h"
35 #include "xe_pt.h"
36 #include "xe_pxp.h"
37 #include "xe_res_cursor.h"
38 #include "xe_svm.h"
39 #include "xe_sync.h"
40 #include "xe_tile.h"
41 #include "xe_tlb_inval.h"
42 #include "xe_trace_bo.h"
43 #include "xe_wa.h"
44 #include "xe_hmm.h"
45 
46 static struct drm_gem_object *xe_vm_obj(struct xe_vm *vm)
47 {
48 	return vm->gpuvm.r_obj;
49 }
50 
51 /**
52  * xe_vma_userptr_check_repin() - Advisory check for repin needed
53  * @uvma: The userptr vma
54  *
55  * Check if the userptr vma has been invalidated since last successful
56  * repin. The check is advisory only and can the function can be called
57  * without the vm->userptr.notifier_lock held. There is no guarantee that the
58  * vma userptr will remain valid after a lockless check, so typically
59  * the call needs to be followed by a proper check under the notifier_lock.
60  *
61  * Return: 0 if userptr vma is valid, -EAGAIN otherwise; repin recommended.
62  */
63 int xe_vma_userptr_check_repin(struct xe_userptr_vma *uvma)
64 {
65 	return mmu_interval_check_retry(&uvma->userptr.notifier,
66 					uvma->userptr.notifier_seq) ?
67 		-EAGAIN : 0;
68 }
69 
70 int xe_vma_userptr_pin_pages(struct xe_userptr_vma *uvma)
71 {
72 	struct xe_vma *vma = &uvma->vma;
73 	struct xe_vm *vm = xe_vma_vm(vma);
74 	struct xe_device *xe = vm->xe;
75 
76 	lockdep_assert_held(&vm->lock);
77 	xe_assert(xe, xe_vma_is_userptr(vma));
78 
79 	return xe_hmm_userptr_populate_range(uvma, false);
80 }
81 
82 static bool preempt_fences_waiting(struct xe_vm *vm)
83 {
84 	struct xe_exec_queue *q;
85 
86 	lockdep_assert_held(&vm->lock);
87 	xe_vm_assert_held(vm);
88 
89 	list_for_each_entry(q, &vm->preempt.exec_queues, lr.link) {
90 		if (!q->lr.pfence ||
91 		    test_bit(DMA_FENCE_FLAG_ENABLE_SIGNAL_BIT,
92 			     &q->lr.pfence->flags)) {
93 			return true;
94 		}
95 	}
96 
97 	return false;
98 }
99 
100 static void free_preempt_fences(struct list_head *list)
101 {
102 	struct list_head *link, *next;
103 
104 	list_for_each_safe(link, next, list)
105 		xe_preempt_fence_free(to_preempt_fence_from_link(link));
106 }
107 
108 static int alloc_preempt_fences(struct xe_vm *vm, struct list_head *list,
109 				unsigned int *count)
110 {
111 	lockdep_assert_held(&vm->lock);
112 	xe_vm_assert_held(vm);
113 
114 	if (*count >= vm->preempt.num_exec_queues)
115 		return 0;
116 
117 	for (; *count < vm->preempt.num_exec_queues; ++(*count)) {
118 		struct xe_preempt_fence *pfence = xe_preempt_fence_alloc();
119 
120 		if (IS_ERR(pfence))
121 			return PTR_ERR(pfence);
122 
123 		list_move_tail(xe_preempt_fence_link(pfence), list);
124 	}
125 
126 	return 0;
127 }
128 
129 static int wait_for_existing_preempt_fences(struct xe_vm *vm)
130 {
131 	struct xe_exec_queue *q;
132 
133 	xe_vm_assert_held(vm);
134 
135 	list_for_each_entry(q, &vm->preempt.exec_queues, lr.link) {
136 		if (q->lr.pfence) {
137 			long timeout = dma_fence_wait(q->lr.pfence, false);
138 
139 			/* Only -ETIME on fence indicates VM needs to be killed */
140 			if (timeout < 0 || q->lr.pfence->error == -ETIME)
141 				return -ETIME;
142 
143 			dma_fence_put(q->lr.pfence);
144 			q->lr.pfence = NULL;
145 		}
146 	}
147 
148 	return 0;
149 }
150 
151 static bool xe_vm_is_idle(struct xe_vm *vm)
152 {
153 	struct xe_exec_queue *q;
154 
155 	xe_vm_assert_held(vm);
156 	list_for_each_entry(q, &vm->preempt.exec_queues, lr.link) {
157 		if (!xe_exec_queue_is_idle(q))
158 			return false;
159 	}
160 
161 	return true;
162 }
163 
164 static void arm_preempt_fences(struct xe_vm *vm, struct list_head *list)
165 {
166 	struct list_head *link;
167 	struct xe_exec_queue *q;
168 
169 	list_for_each_entry(q, &vm->preempt.exec_queues, lr.link) {
170 		struct dma_fence *fence;
171 
172 		link = list->next;
173 		xe_assert(vm->xe, link != list);
174 
175 		fence = xe_preempt_fence_arm(to_preempt_fence_from_link(link),
176 					     q, q->lr.context,
177 					     ++q->lr.seqno);
178 		dma_fence_put(q->lr.pfence);
179 		q->lr.pfence = fence;
180 	}
181 }
182 
183 static int add_preempt_fences(struct xe_vm *vm, struct xe_bo *bo)
184 {
185 	struct xe_exec_queue *q;
186 	int err;
187 
188 	xe_bo_assert_held(bo);
189 
190 	if (!vm->preempt.num_exec_queues)
191 		return 0;
192 
193 	err = dma_resv_reserve_fences(bo->ttm.base.resv, vm->preempt.num_exec_queues);
194 	if (err)
195 		return err;
196 
197 	list_for_each_entry(q, &vm->preempt.exec_queues, lr.link)
198 		if (q->lr.pfence) {
199 			dma_resv_add_fence(bo->ttm.base.resv,
200 					   q->lr.pfence,
201 					   DMA_RESV_USAGE_BOOKKEEP);
202 		}
203 
204 	return 0;
205 }
206 
207 static void resume_and_reinstall_preempt_fences(struct xe_vm *vm,
208 						struct drm_exec *exec)
209 {
210 	struct xe_exec_queue *q;
211 
212 	lockdep_assert_held(&vm->lock);
213 	xe_vm_assert_held(vm);
214 
215 	list_for_each_entry(q, &vm->preempt.exec_queues, lr.link) {
216 		q->ops->resume(q);
217 
218 		drm_gpuvm_resv_add_fence(&vm->gpuvm, exec, q->lr.pfence,
219 					 DMA_RESV_USAGE_BOOKKEEP, DMA_RESV_USAGE_BOOKKEEP);
220 	}
221 }
222 
223 int xe_vm_add_compute_exec_queue(struct xe_vm *vm, struct xe_exec_queue *q)
224 {
225 	struct drm_gpuvm_exec vm_exec = {
226 		.vm = &vm->gpuvm,
227 		.flags = DRM_EXEC_INTERRUPTIBLE_WAIT,
228 		.num_fences = 1,
229 	};
230 	struct drm_exec *exec = &vm_exec.exec;
231 	struct dma_fence *pfence;
232 	int err;
233 	bool wait;
234 
235 	xe_assert(vm->xe, xe_vm_in_preempt_fence_mode(vm));
236 
237 	down_write(&vm->lock);
238 	err = drm_gpuvm_exec_lock(&vm_exec);
239 	if (err)
240 		goto out_up_write;
241 
242 	pfence = xe_preempt_fence_create(q, q->lr.context,
243 					 ++q->lr.seqno);
244 	if (!pfence) {
245 		err = -ENOMEM;
246 		goto out_fini;
247 	}
248 
249 	list_add(&q->lr.link, &vm->preempt.exec_queues);
250 	++vm->preempt.num_exec_queues;
251 	q->lr.pfence = pfence;
252 
253 	down_read(&vm->userptr.notifier_lock);
254 
255 	drm_gpuvm_resv_add_fence(&vm->gpuvm, exec, pfence,
256 				 DMA_RESV_USAGE_BOOKKEEP, DMA_RESV_USAGE_BOOKKEEP);
257 
258 	/*
259 	 * Check to see if a preemption on VM is in flight or userptr
260 	 * invalidation, if so trigger this preempt fence to sync state with
261 	 * other preempt fences on the VM.
262 	 */
263 	wait = __xe_vm_userptr_needs_repin(vm) || preempt_fences_waiting(vm);
264 	if (wait)
265 		dma_fence_enable_sw_signaling(pfence);
266 
267 	up_read(&vm->userptr.notifier_lock);
268 
269 out_fini:
270 	drm_exec_fini(exec);
271 out_up_write:
272 	up_write(&vm->lock);
273 
274 	return err;
275 }
276 ALLOW_ERROR_INJECTION(xe_vm_add_compute_exec_queue, ERRNO);
277 
278 /**
279  * xe_vm_remove_compute_exec_queue() - Remove compute exec queue from VM
280  * @vm: The VM.
281  * @q: The exec_queue
282  *
283  * Note that this function might be called multiple times on the same queue.
284  */
285 void xe_vm_remove_compute_exec_queue(struct xe_vm *vm, struct xe_exec_queue *q)
286 {
287 	if (!xe_vm_in_preempt_fence_mode(vm))
288 		return;
289 
290 	down_write(&vm->lock);
291 	if (!list_empty(&q->lr.link)) {
292 		list_del_init(&q->lr.link);
293 		--vm->preempt.num_exec_queues;
294 	}
295 	if (q->lr.pfence) {
296 		dma_fence_enable_sw_signaling(q->lr.pfence);
297 		dma_fence_put(q->lr.pfence);
298 		q->lr.pfence = NULL;
299 	}
300 	up_write(&vm->lock);
301 }
302 
303 /**
304  * __xe_vm_userptr_needs_repin() - Check whether the VM does have userptrs
305  * that need repinning.
306  * @vm: The VM.
307  *
308  * This function checks for whether the VM has userptrs that need repinning,
309  * and provides a release-type barrier on the userptr.notifier_lock after
310  * checking.
311  *
312  * Return: 0 if there are no userptrs needing repinning, -EAGAIN if there are.
313  */
314 int __xe_vm_userptr_needs_repin(struct xe_vm *vm)
315 {
316 	lockdep_assert_held_read(&vm->userptr.notifier_lock);
317 
318 	return (list_empty(&vm->userptr.repin_list) &&
319 		list_empty(&vm->userptr.invalidated)) ? 0 : -EAGAIN;
320 }
321 
322 #define XE_VM_REBIND_RETRY_TIMEOUT_MS 1000
323 
324 /**
325  * xe_vm_kill() - VM Kill
326  * @vm: The VM.
327  * @unlocked: Flag indicates the VM's dma-resv is not held
328  *
329  * Kill the VM by setting banned flag indicated VM is no longer available for
330  * use. If in preempt fence mode, also kill all exec queue attached to the VM.
331  */
332 void xe_vm_kill(struct xe_vm *vm, bool unlocked)
333 {
334 	struct xe_exec_queue *q;
335 
336 	lockdep_assert_held(&vm->lock);
337 
338 	if (unlocked)
339 		xe_vm_lock(vm, false);
340 
341 	vm->flags |= XE_VM_FLAG_BANNED;
342 	trace_xe_vm_kill(vm);
343 
344 	list_for_each_entry(q, &vm->preempt.exec_queues, lr.link)
345 		q->ops->kill(q);
346 
347 	if (unlocked)
348 		xe_vm_unlock(vm);
349 
350 	/* TODO: Inform user the VM is banned */
351 }
352 
353 /**
354  * xe_vm_validate_should_retry() - Whether to retry after a validate error.
355  * @exec: The drm_exec object used for locking before validation.
356  * @err: The error returned from ttm_bo_validate().
357  * @end: A ktime_t cookie that should be set to 0 before first use and
358  * that should be reused on subsequent calls.
359  *
360  * With multiple active VMs, under memory pressure, it is possible that
361  * ttm_bo_validate() run into -EDEADLK and in such case returns -ENOMEM.
362  * Until ttm properly handles locking in such scenarios, best thing the
363  * driver can do is retry with a timeout. Check if that is necessary, and
364  * if so unlock the drm_exec's objects while keeping the ticket to prepare
365  * for a rerun.
366  *
367  * Return: true if a retry after drm_exec_init() is recommended;
368  * false otherwise.
369  */
370 bool xe_vm_validate_should_retry(struct drm_exec *exec, int err, ktime_t *end)
371 {
372 	ktime_t cur;
373 
374 	if (err != -ENOMEM)
375 		return false;
376 
377 	cur = ktime_get();
378 	*end = *end ? : ktime_add_ms(cur, XE_VM_REBIND_RETRY_TIMEOUT_MS);
379 	if (!ktime_before(cur, *end))
380 		return false;
381 
382 	msleep(20);
383 	return true;
384 }
385 
386 static int xe_gpuvm_validate(struct drm_gpuvm_bo *vm_bo, struct drm_exec *exec)
387 {
388 	struct xe_vm *vm = gpuvm_to_vm(vm_bo->vm);
389 	struct drm_gpuva *gpuva;
390 	int ret;
391 
392 	lockdep_assert_held(&vm->lock);
393 	drm_gpuvm_bo_for_each_va(gpuva, vm_bo)
394 		list_move_tail(&gpuva_to_vma(gpuva)->combined_links.rebind,
395 			       &vm->rebind_list);
396 
397 	if (!try_wait_for_completion(&vm->xe->pm_block))
398 		return -EAGAIN;
399 
400 	ret = xe_bo_validate(gem_to_xe_bo(vm_bo->obj), vm, false);
401 	if (ret)
402 		return ret;
403 
404 	vm_bo->evicted = false;
405 	return 0;
406 }
407 
408 /**
409  * xe_vm_validate_rebind() - Validate buffer objects and rebind vmas
410  * @vm: The vm for which we are rebinding.
411  * @exec: The struct drm_exec with the locked GEM objects.
412  * @num_fences: The number of fences to reserve for the operation, not
413  * including rebinds and validations.
414  *
415  * Validates all evicted gem objects and rebinds their vmas. Note that
416  * rebindings may cause evictions and hence the validation-rebind
417  * sequence is rerun until there are no more objects to validate.
418  *
419  * Return: 0 on success, negative error code on error. In particular,
420  * may return -EINTR or -ERESTARTSYS if interrupted, and -EDEADLK if
421  * the drm_exec transaction needs to be restarted.
422  */
423 int xe_vm_validate_rebind(struct xe_vm *vm, struct drm_exec *exec,
424 			  unsigned int num_fences)
425 {
426 	struct drm_gem_object *obj;
427 	unsigned long index;
428 	int ret;
429 
430 	do {
431 		ret = drm_gpuvm_validate(&vm->gpuvm, exec);
432 		if (ret)
433 			return ret;
434 
435 		ret = xe_vm_rebind(vm, false);
436 		if (ret)
437 			return ret;
438 	} while (!list_empty(&vm->gpuvm.evict.list));
439 
440 	drm_exec_for_each_locked_object(exec, index, obj) {
441 		ret = dma_resv_reserve_fences(obj->resv, num_fences);
442 		if (ret)
443 			return ret;
444 	}
445 
446 	return 0;
447 }
448 
449 static int xe_preempt_work_begin(struct drm_exec *exec, struct xe_vm *vm,
450 				 bool *done)
451 {
452 	int err;
453 
454 	err = drm_gpuvm_prepare_vm(&vm->gpuvm, exec, 0);
455 	if (err)
456 		return err;
457 
458 	if (xe_vm_is_idle(vm)) {
459 		vm->preempt.rebind_deactivated = true;
460 		*done = true;
461 		return 0;
462 	}
463 
464 	if (!preempt_fences_waiting(vm)) {
465 		*done = true;
466 		return 0;
467 	}
468 
469 	err = drm_gpuvm_prepare_objects(&vm->gpuvm, exec, 0);
470 	if (err)
471 		return err;
472 
473 	err = wait_for_existing_preempt_fences(vm);
474 	if (err)
475 		return err;
476 
477 	/*
478 	 * Add validation and rebinding to the locking loop since both can
479 	 * cause evictions which may require blocing dma_resv locks.
480 	 * The fence reservation here is intended for the new preempt fences
481 	 * we attach at the end of the rebind work.
482 	 */
483 	return xe_vm_validate_rebind(vm, exec, vm->preempt.num_exec_queues);
484 }
485 
486 static bool vm_suspend_rebind_worker(struct xe_vm *vm)
487 {
488 	struct xe_device *xe = vm->xe;
489 	bool ret = false;
490 
491 	mutex_lock(&xe->rebind_resume_lock);
492 	if (!try_wait_for_completion(&vm->xe->pm_block)) {
493 		ret = true;
494 		list_move_tail(&vm->preempt.pm_activate_link, &xe->rebind_resume_list);
495 	}
496 	mutex_unlock(&xe->rebind_resume_lock);
497 
498 	return ret;
499 }
500 
501 /**
502  * xe_vm_resume_rebind_worker() - Resume the rebind worker.
503  * @vm: The vm whose preempt worker to resume.
504  *
505  * Resume a preempt worker that was previously suspended by
506  * vm_suspend_rebind_worker().
507  */
508 void xe_vm_resume_rebind_worker(struct xe_vm *vm)
509 {
510 	queue_work(vm->xe->ordered_wq, &vm->preempt.rebind_work);
511 }
512 
513 static void preempt_rebind_work_func(struct work_struct *w)
514 {
515 	struct xe_vm *vm = container_of(w, struct xe_vm, preempt.rebind_work);
516 	struct drm_exec exec;
517 	unsigned int fence_count = 0;
518 	LIST_HEAD(preempt_fences);
519 	ktime_t end = 0;
520 	int err = 0;
521 	long wait;
522 	int __maybe_unused tries = 0;
523 
524 	xe_assert(vm->xe, xe_vm_in_preempt_fence_mode(vm));
525 	trace_xe_vm_rebind_worker_enter(vm);
526 
527 	down_write(&vm->lock);
528 
529 	if (xe_vm_is_closed_or_banned(vm)) {
530 		up_write(&vm->lock);
531 		trace_xe_vm_rebind_worker_exit(vm);
532 		return;
533 	}
534 
535 retry:
536 	if (!try_wait_for_completion(&vm->xe->pm_block) && vm_suspend_rebind_worker(vm)) {
537 		up_write(&vm->lock);
538 		return;
539 	}
540 
541 	if (xe_vm_userptr_check_repin(vm)) {
542 		err = xe_vm_userptr_pin(vm);
543 		if (err)
544 			goto out_unlock_outer;
545 	}
546 
547 	drm_exec_init(&exec, DRM_EXEC_INTERRUPTIBLE_WAIT, 0);
548 
549 	drm_exec_until_all_locked(&exec) {
550 		bool done = false;
551 
552 		err = xe_preempt_work_begin(&exec, vm, &done);
553 		drm_exec_retry_on_contention(&exec);
554 		if (err || done) {
555 			drm_exec_fini(&exec);
556 			if (err && xe_vm_validate_should_retry(&exec, err, &end))
557 				err = -EAGAIN;
558 
559 			goto out_unlock_outer;
560 		}
561 	}
562 
563 	err = alloc_preempt_fences(vm, &preempt_fences, &fence_count);
564 	if (err)
565 		goto out_unlock;
566 
567 	err = xe_vm_rebind(vm, true);
568 	if (err)
569 		goto out_unlock;
570 
571 	/* Wait on rebinds and munmap style VM unbinds */
572 	wait = dma_resv_wait_timeout(xe_vm_resv(vm),
573 				     DMA_RESV_USAGE_KERNEL,
574 				     false, MAX_SCHEDULE_TIMEOUT);
575 	if (wait <= 0) {
576 		err = -ETIME;
577 		goto out_unlock;
578 	}
579 
580 #define retry_required(__tries, __vm) \
581 	(IS_ENABLED(CONFIG_DRM_XE_USERPTR_INVAL_INJECT) ? \
582 	(!(__tries)++ || __xe_vm_userptr_needs_repin(__vm)) : \
583 	__xe_vm_userptr_needs_repin(__vm))
584 
585 	down_read(&vm->userptr.notifier_lock);
586 	if (retry_required(tries, vm)) {
587 		up_read(&vm->userptr.notifier_lock);
588 		err = -EAGAIN;
589 		goto out_unlock;
590 	}
591 
592 #undef retry_required
593 
594 	spin_lock(&vm->xe->ttm.lru_lock);
595 	ttm_lru_bulk_move_tail(&vm->lru_bulk_move);
596 	spin_unlock(&vm->xe->ttm.lru_lock);
597 
598 	/* Point of no return. */
599 	arm_preempt_fences(vm, &preempt_fences);
600 	resume_and_reinstall_preempt_fences(vm, &exec);
601 	up_read(&vm->userptr.notifier_lock);
602 
603 out_unlock:
604 	drm_exec_fini(&exec);
605 out_unlock_outer:
606 	if (err == -EAGAIN) {
607 		trace_xe_vm_rebind_worker_retry(vm);
608 		goto retry;
609 	}
610 
611 	if (err) {
612 		drm_warn(&vm->xe->drm, "VM worker error: %d\n", err);
613 		xe_vm_kill(vm, true);
614 	}
615 	up_write(&vm->lock);
616 
617 	free_preempt_fences(&preempt_fences);
618 
619 	trace_xe_vm_rebind_worker_exit(vm);
620 }
621 
622 static void __vma_userptr_invalidate(struct xe_vm *vm, struct xe_userptr_vma *uvma)
623 {
624 	struct xe_userptr *userptr = &uvma->userptr;
625 	struct xe_vma *vma = &uvma->vma;
626 	struct dma_resv_iter cursor;
627 	struct dma_fence *fence;
628 	long err;
629 
630 	/*
631 	 * Tell exec and rebind worker they need to repin and rebind this
632 	 * userptr.
633 	 */
634 	if (!xe_vm_in_fault_mode(vm) &&
635 	    !(vma->gpuva.flags & XE_VMA_DESTROYED)) {
636 		spin_lock(&vm->userptr.invalidated_lock);
637 		list_move_tail(&userptr->invalidate_link,
638 			       &vm->userptr.invalidated);
639 		spin_unlock(&vm->userptr.invalidated_lock);
640 	}
641 
642 	/*
643 	 * Preempt fences turn into schedule disables, pipeline these.
644 	 * Note that even in fault mode, we need to wait for binds and
645 	 * unbinds to complete, and those are attached as BOOKMARK fences
646 	 * to the vm.
647 	 */
648 	dma_resv_iter_begin(&cursor, xe_vm_resv(vm),
649 			    DMA_RESV_USAGE_BOOKKEEP);
650 	dma_resv_for_each_fence_unlocked(&cursor, fence)
651 		dma_fence_enable_sw_signaling(fence);
652 	dma_resv_iter_end(&cursor);
653 
654 	err = dma_resv_wait_timeout(xe_vm_resv(vm),
655 				    DMA_RESV_USAGE_BOOKKEEP,
656 				    false, MAX_SCHEDULE_TIMEOUT);
657 	XE_WARN_ON(err <= 0);
658 
659 	if (xe_vm_in_fault_mode(vm) && userptr->initial_bind) {
660 		err = xe_vm_invalidate_vma(vma);
661 		XE_WARN_ON(err);
662 	}
663 
664 	xe_hmm_userptr_unmap(uvma);
665 }
666 
667 static bool vma_userptr_invalidate(struct mmu_interval_notifier *mni,
668 				   const struct mmu_notifier_range *range,
669 				   unsigned long cur_seq)
670 {
671 	struct xe_userptr_vma *uvma = container_of(mni, typeof(*uvma), userptr.notifier);
672 	struct xe_vma *vma = &uvma->vma;
673 	struct xe_vm *vm = xe_vma_vm(vma);
674 
675 	xe_assert(vm->xe, xe_vma_is_userptr(vma));
676 	trace_xe_vma_userptr_invalidate(vma);
677 
678 	if (!mmu_notifier_range_blockable(range))
679 		return false;
680 
681 	vm_dbg(&xe_vma_vm(vma)->xe->drm,
682 	       "NOTIFIER: addr=0x%016llx, range=0x%016llx",
683 		xe_vma_start(vma), xe_vma_size(vma));
684 
685 	down_write(&vm->userptr.notifier_lock);
686 	mmu_interval_set_seq(mni, cur_seq);
687 
688 	__vma_userptr_invalidate(vm, uvma);
689 	up_write(&vm->userptr.notifier_lock);
690 	trace_xe_vma_userptr_invalidate_complete(vma);
691 
692 	return true;
693 }
694 
695 static const struct mmu_interval_notifier_ops vma_userptr_notifier_ops = {
696 	.invalidate = vma_userptr_invalidate,
697 };
698 
699 #if IS_ENABLED(CONFIG_DRM_XE_USERPTR_INVAL_INJECT)
700 /**
701  * xe_vma_userptr_force_invalidate() - force invalidate a userptr
702  * @uvma: The userptr vma to invalidate
703  *
704  * Perform a forced userptr invalidation for testing purposes.
705  */
706 void xe_vma_userptr_force_invalidate(struct xe_userptr_vma *uvma)
707 {
708 	struct xe_vm *vm = xe_vma_vm(&uvma->vma);
709 
710 	/* Protect against concurrent userptr pinning */
711 	lockdep_assert_held(&vm->lock);
712 	/* Protect against concurrent notifiers */
713 	lockdep_assert_held(&vm->userptr.notifier_lock);
714 	/*
715 	 * Protect against concurrent instances of this function and
716 	 * the critical exec sections
717 	 */
718 	xe_vm_assert_held(vm);
719 
720 	if (!mmu_interval_read_retry(&uvma->userptr.notifier,
721 				     uvma->userptr.notifier_seq))
722 		uvma->userptr.notifier_seq -= 2;
723 	__vma_userptr_invalidate(vm, uvma);
724 }
725 #endif
726 
727 int xe_vm_userptr_pin(struct xe_vm *vm)
728 {
729 	struct xe_userptr_vma *uvma, *next;
730 	int err = 0;
731 
732 	xe_assert(vm->xe, !xe_vm_in_fault_mode(vm));
733 	lockdep_assert_held_write(&vm->lock);
734 
735 	/* Collect invalidated userptrs */
736 	spin_lock(&vm->userptr.invalidated_lock);
737 	xe_assert(vm->xe, list_empty(&vm->userptr.repin_list));
738 	list_for_each_entry_safe(uvma, next, &vm->userptr.invalidated,
739 				 userptr.invalidate_link) {
740 		list_del_init(&uvma->userptr.invalidate_link);
741 		list_add_tail(&uvma->userptr.repin_link,
742 			      &vm->userptr.repin_list);
743 	}
744 	spin_unlock(&vm->userptr.invalidated_lock);
745 
746 	/* Pin and move to bind list */
747 	list_for_each_entry_safe(uvma, next, &vm->userptr.repin_list,
748 				 userptr.repin_link) {
749 		err = xe_vma_userptr_pin_pages(uvma);
750 		if (err == -EFAULT) {
751 			list_del_init(&uvma->userptr.repin_link);
752 			/*
753 			 * We might have already done the pin once already, but
754 			 * then had to retry before the re-bind happened, due
755 			 * some other condition in the caller, but in the
756 			 * meantime the userptr got dinged by the notifier such
757 			 * that we need to revalidate here, but this time we hit
758 			 * the EFAULT. In such a case make sure we remove
759 			 * ourselves from the rebind list to avoid going down in
760 			 * flames.
761 			 */
762 			if (!list_empty(&uvma->vma.combined_links.rebind))
763 				list_del_init(&uvma->vma.combined_links.rebind);
764 
765 			/* Wait for pending binds */
766 			xe_vm_lock(vm, false);
767 			dma_resv_wait_timeout(xe_vm_resv(vm),
768 					      DMA_RESV_USAGE_BOOKKEEP,
769 					      false, MAX_SCHEDULE_TIMEOUT);
770 
771 			down_read(&vm->userptr.notifier_lock);
772 			err = xe_vm_invalidate_vma(&uvma->vma);
773 			up_read(&vm->userptr.notifier_lock);
774 			xe_vm_unlock(vm);
775 			if (err)
776 				break;
777 		} else {
778 			if (err)
779 				break;
780 
781 			list_del_init(&uvma->userptr.repin_link);
782 			list_move_tail(&uvma->vma.combined_links.rebind,
783 				       &vm->rebind_list);
784 		}
785 	}
786 
787 	if (err) {
788 		down_write(&vm->userptr.notifier_lock);
789 		spin_lock(&vm->userptr.invalidated_lock);
790 		list_for_each_entry_safe(uvma, next, &vm->userptr.repin_list,
791 					 userptr.repin_link) {
792 			list_del_init(&uvma->userptr.repin_link);
793 			list_move_tail(&uvma->userptr.invalidate_link,
794 				       &vm->userptr.invalidated);
795 		}
796 		spin_unlock(&vm->userptr.invalidated_lock);
797 		up_write(&vm->userptr.notifier_lock);
798 	}
799 	return err;
800 }
801 
802 /**
803  * xe_vm_userptr_check_repin() - Check whether the VM might have userptrs
804  * that need repinning.
805  * @vm: The VM.
806  *
807  * This function does an advisory check for whether the VM has userptrs that
808  * need repinning.
809  *
810  * Return: 0 if there are no indications of userptrs needing repinning,
811  * -EAGAIN if there are.
812  */
813 int xe_vm_userptr_check_repin(struct xe_vm *vm)
814 {
815 	return (list_empty_careful(&vm->userptr.repin_list) &&
816 		list_empty_careful(&vm->userptr.invalidated)) ? 0 : -EAGAIN;
817 }
818 
819 static int xe_vma_ops_alloc(struct xe_vma_ops *vops, bool array_of_binds)
820 {
821 	int i;
822 
823 	for (i = 0; i < XE_MAX_TILES_PER_DEVICE; ++i) {
824 		if (!vops->pt_update_ops[i].num_ops)
825 			continue;
826 
827 		vops->pt_update_ops[i].ops =
828 			kmalloc_array(vops->pt_update_ops[i].num_ops,
829 				      sizeof(*vops->pt_update_ops[i].ops),
830 				      GFP_KERNEL | __GFP_RETRY_MAYFAIL | __GFP_NOWARN);
831 		if (!vops->pt_update_ops[i].ops)
832 			return array_of_binds ? -ENOBUFS : -ENOMEM;
833 	}
834 
835 	return 0;
836 }
837 ALLOW_ERROR_INJECTION(xe_vma_ops_alloc, ERRNO);
838 
839 static void xe_vma_svm_prefetch_op_fini(struct xe_vma_op *op)
840 {
841 	struct xe_vma *vma;
842 
843 	vma = gpuva_to_vma(op->base.prefetch.va);
844 
845 	if (op->base.op == DRM_GPUVA_OP_PREFETCH && xe_vma_is_cpu_addr_mirror(vma))
846 		xa_destroy(&op->prefetch_range.range);
847 }
848 
849 static void xe_vma_svm_prefetch_ops_fini(struct xe_vma_ops *vops)
850 {
851 	struct xe_vma_op *op;
852 
853 	if (!(vops->flags & XE_VMA_OPS_FLAG_HAS_SVM_PREFETCH))
854 		return;
855 
856 	list_for_each_entry(op, &vops->list, link)
857 		xe_vma_svm_prefetch_op_fini(op);
858 }
859 
860 static void xe_vma_ops_fini(struct xe_vma_ops *vops)
861 {
862 	int i;
863 
864 	xe_vma_svm_prefetch_ops_fini(vops);
865 
866 	for (i = 0; i < XE_MAX_TILES_PER_DEVICE; ++i)
867 		kfree(vops->pt_update_ops[i].ops);
868 }
869 
870 static void xe_vma_ops_incr_pt_update_ops(struct xe_vma_ops *vops, u8 tile_mask, int inc_val)
871 {
872 	int i;
873 
874 	if (!inc_val)
875 		return;
876 
877 	for (i = 0; i < XE_MAX_TILES_PER_DEVICE; ++i)
878 		if (BIT(i) & tile_mask)
879 			vops->pt_update_ops[i].num_ops += inc_val;
880 }
881 
882 static void xe_vm_populate_rebind(struct xe_vma_op *op, struct xe_vma *vma,
883 				  u8 tile_mask)
884 {
885 	INIT_LIST_HEAD(&op->link);
886 	op->tile_mask = tile_mask;
887 	op->base.op = DRM_GPUVA_OP_MAP;
888 	op->base.map.va.addr = vma->gpuva.va.addr;
889 	op->base.map.va.range = vma->gpuva.va.range;
890 	op->base.map.gem.obj = vma->gpuva.gem.obj;
891 	op->base.map.gem.offset = vma->gpuva.gem.offset;
892 	op->map.vma = vma;
893 	op->map.immediate = true;
894 	op->map.dumpable = vma->gpuva.flags & XE_VMA_DUMPABLE;
895 	op->map.is_null = xe_vma_is_null(vma);
896 }
897 
898 static int xe_vm_ops_add_rebind(struct xe_vma_ops *vops, struct xe_vma *vma,
899 				u8 tile_mask)
900 {
901 	struct xe_vma_op *op;
902 
903 	op = kzalloc(sizeof(*op), GFP_KERNEL);
904 	if (!op)
905 		return -ENOMEM;
906 
907 	xe_vm_populate_rebind(op, vma, tile_mask);
908 	list_add_tail(&op->link, &vops->list);
909 	xe_vma_ops_incr_pt_update_ops(vops, tile_mask, 1);
910 
911 	return 0;
912 }
913 
914 static struct dma_fence *ops_execute(struct xe_vm *vm,
915 				     struct xe_vma_ops *vops);
916 static void xe_vma_ops_init(struct xe_vma_ops *vops, struct xe_vm *vm,
917 			    struct xe_exec_queue *q,
918 			    struct xe_sync_entry *syncs, u32 num_syncs);
919 
920 int xe_vm_rebind(struct xe_vm *vm, bool rebind_worker)
921 {
922 	struct dma_fence *fence;
923 	struct xe_vma *vma, *next;
924 	struct xe_vma_ops vops;
925 	struct xe_vma_op *op, *next_op;
926 	int err, i;
927 
928 	lockdep_assert_held(&vm->lock);
929 	if ((xe_vm_in_lr_mode(vm) && !rebind_worker) ||
930 	    list_empty(&vm->rebind_list))
931 		return 0;
932 
933 	xe_vma_ops_init(&vops, vm, NULL, NULL, 0);
934 	for (i = 0; i < XE_MAX_TILES_PER_DEVICE; ++i)
935 		vops.pt_update_ops[i].wait_vm_bookkeep = true;
936 
937 	xe_vm_assert_held(vm);
938 	list_for_each_entry(vma, &vm->rebind_list, combined_links.rebind) {
939 		xe_assert(vm->xe, vma->tile_present);
940 
941 		if (rebind_worker)
942 			trace_xe_vma_rebind_worker(vma);
943 		else
944 			trace_xe_vma_rebind_exec(vma);
945 
946 		err = xe_vm_ops_add_rebind(&vops, vma,
947 					   vma->tile_present);
948 		if (err)
949 			goto free_ops;
950 	}
951 
952 	err = xe_vma_ops_alloc(&vops, false);
953 	if (err)
954 		goto free_ops;
955 
956 	fence = ops_execute(vm, &vops);
957 	if (IS_ERR(fence)) {
958 		err = PTR_ERR(fence);
959 	} else {
960 		dma_fence_put(fence);
961 		list_for_each_entry_safe(vma, next, &vm->rebind_list,
962 					 combined_links.rebind)
963 			list_del_init(&vma->combined_links.rebind);
964 	}
965 free_ops:
966 	list_for_each_entry_safe(op, next_op, &vops.list, link) {
967 		list_del(&op->link);
968 		kfree(op);
969 	}
970 	xe_vma_ops_fini(&vops);
971 
972 	return err;
973 }
974 
975 struct dma_fence *xe_vma_rebind(struct xe_vm *vm, struct xe_vma *vma, u8 tile_mask)
976 {
977 	struct dma_fence *fence = NULL;
978 	struct xe_vma_ops vops;
979 	struct xe_vma_op *op, *next_op;
980 	struct xe_tile *tile;
981 	u8 id;
982 	int err;
983 
984 	lockdep_assert_held(&vm->lock);
985 	xe_vm_assert_held(vm);
986 	xe_assert(vm->xe, xe_vm_in_fault_mode(vm));
987 
988 	xe_vma_ops_init(&vops, vm, NULL, NULL, 0);
989 	for_each_tile(tile, vm->xe, id) {
990 		vops.pt_update_ops[id].wait_vm_bookkeep = true;
991 		vops.pt_update_ops[tile->id].q =
992 			xe_migrate_exec_queue(tile->migrate);
993 	}
994 
995 	err = xe_vm_ops_add_rebind(&vops, vma, tile_mask);
996 	if (err)
997 		return ERR_PTR(err);
998 
999 	err = xe_vma_ops_alloc(&vops, false);
1000 	if (err) {
1001 		fence = ERR_PTR(err);
1002 		goto free_ops;
1003 	}
1004 
1005 	fence = ops_execute(vm, &vops);
1006 
1007 free_ops:
1008 	list_for_each_entry_safe(op, next_op, &vops.list, link) {
1009 		list_del(&op->link);
1010 		kfree(op);
1011 	}
1012 	xe_vma_ops_fini(&vops);
1013 
1014 	return fence;
1015 }
1016 
1017 static void xe_vm_populate_range_rebind(struct xe_vma_op *op,
1018 					struct xe_vma *vma,
1019 					struct xe_svm_range *range,
1020 					u8 tile_mask)
1021 {
1022 	INIT_LIST_HEAD(&op->link);
1023 	op->tile_mask = tile_mask;
1024 	op->base.op = DRM_GPUVA_OP_DRIVER;
1025 	op->subop = XE_VMA_SUBOP_MAP_RANGE;
1026 	op->map_range.vma = vma;
1027 	op->map_range.range = range;
1028 }
1029 
1030 static int
1031 xe_vm_ops_add_range_rebind(struct xe_vma_ops *vops,
1032 			   struct xe_vma *vma,
1033 			   struct xe_svm_range *range,
1034 			   u8 tile_mask)
1035 {
1036 	struct xe_vma_op *op;
1037 
1038 	op = kzalloc(sizeof(*op), GFP_KERNEL);
1039 	if (!op)
1040 		return -ENOMEM;
1041 
1042 	xe_vm_populate_range_rebind(op, vma, range, tile_mask);
1043 	list_add_tail(&op->link, &vops->list);
1044 	xe_vma_ops_incr_pt_update_ops(vops, tile_mask, 1);
1045 
1046 	return 0;
1047 }
1048 
1049 /**
1050  * xe_vm_range_rebind() - VM range (re)bind
1051  * @vm: The VM which the range belongs to.
1052  * @vma: The VMA which the range belongs to.
1053  * @range: SVM range to rebind.
1054  * @tile_mask: Tile mask to bind the range to.
1055  *
1056  * (re)bind SVM range setting up GPU page tables for the range.
1057  *
1058  * Return: dma fence for rebind to signal completion on succees, ERR_PTR on
1059  * failure
1060  */
1061 struct dma_fence *xe_vm_range_rebind(struct xe_vm *vm,
1062 				     struct xe_vma *vma,
1063 				     struct xe_svm_range *range,
1064 				     u8 tile_mask)
1065 {
1066 	struct dma_fence *fence = NULL;
1067 	struct xe_vma_ops vops;
1068 	struct xe_vma_op *op, *next_op;
1069 	struct xe_tile *tile;
1070 	u8 id;
1071 	int err;
1072 
1073 	lockdep_assert_held(&vm->lock);
1074 	xe_vm_assert_held(vm);
1075 	xe_assert(vm->xe, xe_vm_in_fault_mode(vm));
1076 	xe_assert(vm->xe, xe_vma_is_cpu_addr_mirror(vma));
1077 
1078 	xe_vma_ops_init(&vops, vm, NULL, NULL, 0);
1079 	for_each_tile(tile, vm->xe, id) {
1080 		vops.pt_update_ops[id].wait_vm_bookkeep = true;
1081 		vops.pt_update_ops[tile->id].q =
1082 			xe_migrate_exec_queue(tile->migrate);
1083 	}
1084 
1085 	err = xe_vm_ops_add_range_rebind(&vops, vma, range, tile_mask);
1086 	if (err)
1087 		return ERR_PTR(err);
1088 
1089 	err = xe_vma_ops_alloc(&vops, false);
1090 	if (err) {
1091 		fence = ERR_PTR(err);
1092 		goto free_ops;
1093 	}
1094 
1095 	fence = ops_execute(vm, &vops);
1096 
1097 free_ops:
1098 	list_for_each_entry_safe(op, next_op, &vops.list, link) {
1099 		list_del(&op->link);
1100 		kfree(op);
1101 	}
1102 	xe_vma_ops_fini(&vops);
1103 
1104 	return fence;
1105 }
1106 
1107 static void xe_vm_populate_range_unbind(struct xe_vma_op *op,
1108 					struct xe_svm_range *range)
1109 {
1110 	INIT_LIST_HEAD(&op->link);
1111 	op->tile_mask = range->tile_present;
1112 	op->base.op = DRM_GPUVA_OP_DRIVER;
1113 	op->subop = XE_VMA_SUBOP_UNMAP_RANGE;
1114 	op->unmap_range.range = range;
1115 }
1116 
1117 static int
1118 xe_vm_ops_add_range_unbind(struct xe_vma_ops *vops,
1119 			   struct xe_svm_range *range)
1120 {
1121 	struct xe_vma_op *op;
1122 
1123 	op = kzalloc(sizeof(*op), GFP_KERNEL);
1124 	if (!op)
1125 		return -ENOMEM;
1126 
1127 	xe_vm_populate_range_unbind(op, range);
1128 	list_add_tail(&op->link, &vops->list);
1129 	xe_vma_ops_incr_pt_update_ops(vops, range->tile_present, 1);
1130 
1131 	return 0;
1132 }
1133 
1134 /**
1135  * xe_vm_range_unbind() - VM range unbind
1136  * @vm: The VM which the range belongs to.
1137  * @range: SVM range to rebind.
1138  *
1139  * Unbind SVM range removing the GPU page tables for the range.
1140  *
1141  * Return: dma fence for unbind to signal completion on succees, ERR_PTR on
1142  * failure
1143  */
1144 struct dma_fence *xe_vm_range_unbind(struct xe_vm *vm,
1145 				     struct xe_svm_range *range)
1146 {
1147 	struct dma_fence *fence = NULL;
1148 	struct xe_vma_ops vops;
1149 	struct xe_vma_op *op, *next_op;
1150 	struct xe_tile *tile;
1151 	u8 id;
1152 	int err;
1153 
1154 	lockdep_assert_held(&vm->lock);
1155 	xe_vm_assert_held(vm);
1156 	xe_assert(vm->xe, xe_vm_in_fault_mode(vm));
1157 
1158 	if (!range->tile_present)
1159 		return dma_fence_get_stub();
1160 
1161 	xe_vma_ops_init(&vops, vm, NULL, NULL, 0);
1162 	for_each_tile(tile, vm->xe, id) {
1163 		vops.pt_update_ops[id].wait_vm_bookkeep = true;
1164 		vops.pt_update_ops[tile->id].q =
1165 			xe_migrate_exec_queue(tile->migrate);
1166 	}
1167 
1168 	err = xe_vm_ops_add_range_unbind(&vops, range);
1169 	if (err)
1170 		return ERR_PTR(err);
1171 
1172 	err = xe_vma_ops_alloc(&vops, false);
1173 	if (err) {
1174 		fence = ERR_PTR(err);
1175 		goto free_ops;
1176 	}
1177 
1178 	fence = ops_execute(vm, &vops);
1179 
1180 free_ops:
1181 	list_for_each_entry_safe(op, next_op, &vops.list, link) {
1182 		list_del(&op->link);
1183 		kfree(op);
1184 	}
1185 	xe_vma_ops_fini(&vops);
1186 
1187 	return fence;
1188 }
1189 
1190 static void xe_vma_free(struct xe_vma *vma)
1191 {
1192 	if (xe_vma_is_userptr(vma))
1193 		kfree(to_userptr_vma(vma));
1194 	else
1195 		kfree(vma);
1196 }
1197 
1198 #define VMA_CREATE_FLAG_READ_ONLY		BIT(0)
1199 #define VMA_CREATE_FLAG_IS_NULL			BIT(1)
1200 #define VMA_CREATE_FLAG_DUMPABLE		BIT(2)
1201 #define VMA_CREATE_FLAG_IS_SYSTEM_ALLOCATOR	BIT(3)
1202 
1203 static struct xe_vma *xe_vma_create(struct xe_vm *vm,
1204 				    struct xe_bo *bo,
1205 				    u64 bo_offset_or_userptr,
1206 				    u64 start, u64 end,
1207 				    struct xe_vma_mem_attr *attr,
1208 				    unsigned int flags)
1209 {
1210 	struct xe_vma *vma;
1211 	struct xe_tile *tile;
1212 	u8 id;
1213 	bool read_only = (flags & VMA_CREATE_FLAG_READ_ONLY);
1214 	bool is_null = (flags & VMA_CREATE_FLAG_IS_NULL);
1215 	bool dumpable = (flags & VMA_CREATE_FLAG_DUMPABLE);
1216 	bool is_cpu_addr_mirror =
1217 		(flags & VMA_CREATE_FLAG_IS_SYSTEM_ALLOCATOR);
1218 
1219 	xe_assert(vm->xe, start < end);
1220 	xe_assert(vm->xe, end < vm->size);
1221 
1222 	/*
1223 	 * Allocate and ensure that the xe_vma_is_userptr() return
1224 	 * matches what was allocated.
1225 	 */
1226 	if (!bo && !is_null && !is_cpu_addr_mirror) {
1227 		struct xe_userptr_vma *uvma = kzalloc(sizeof(*uvma), GFP_KERNEL);
1228 
1229 		if (!uvma)
1230 			return ERR_PTR(-ENOMEM);
1231 
1232 		vma = &uvma->vma;
1233 	} else {
1234 		vma = kzalloc(sizeof(*vma), GFP_KERNEL);
1235 		if (!vma)
1236 			return ERR_PTR(-ENOMEM);
1237 
1238 		if (is_cpu_addr_mirror)
1239 			vma->gpuva.flags |= XE_VMA_SYSTEM_ALLOCATOR;
1240 		if (is_null)
1241 			vma->gpuva.flags |= DRM_GPUVA_SPARSE;
1242 		if (bo)
1243 			vma->gpuva.gem.obj = &bo->ttm.base;
1244 	}
1245 
1246 	INIT_LIST_HEAD(&vma->combined_links.rebind);
1247 
1248 	INIT_LIST_HEAD(&vma->gpuva.gem.entry);
1249 	vma->gpuva.vm = &vm->gpuvm;
1250 	vma->gpuva.va.addr = start;
1251 	vma->gpuva.va.range = end - start + 1;
1252 	if (read_only)
1253 		vma->gpuva.flags |= XE_VMA_READ_ONLY;
1254 	if (dumpable)
1255 		vma->gpuva.flags |= XE_VMA_DUMPABLE;
1256 
1257 	for_each_tile(tile, vm->xe, id)
1258 		vma->tile_mask |= 0x1 << id;
1259 
1260 	if (vm->xe->info.has_atomic_enable_pte_bit)
1261 		vma->gpuva.flags |= XE_VMA_ATOMIC_PTE_BIT;
1262 
1263 	vma->attr = *attr;
1264 
1265 	if (bo) {
1266 		struct drm_gpuvm_bo *vm_bo;
1267 
1268 		xe_bo_assert_held(bo);
1269 
1270 		vm_bo = drm_gpuvm_bo_obtain(vma->gpuva.vm, &bo->ttm.base);
1271 		if (IS_ERR(vm_bo)) {
1272 			xe_vma_free(vma);
1273 			return ERR_CAST(vm_bo);
1274 		}
1275 
1276 		drm_gpuvm_bo_extobj_add(vm_bo);
1277 		drm_gem_object_get(&bo->ttm.base);
1278 		vma->gpuva.gem.offset = bo_offset_or_userptr;
1279 		drm_gpuva_link(&vma->gpuva, vm_bo);
1280 		drm_gpuvm_bo_put(vm_bo);
1281 	} else /* userptr or null */ {
1282 		if (!is_null && !is_cpu_addr_mirror) {
1283 			struct xe_userptr *userptr = &to_userptr_vma(vma)->userptr;
1284 			u64 size = end - start + 1;
1285 			int err;
1286 
1287 			INIT_LIST_HEAD(&userptr->invalidate_link);
1288 			INIT_LIST_HEAD(&userptr->repin_link);
1289 			vma->gpuva.gem.offset = bo_offset_or_userptr;
1290 			mutex_init(&userptr->unmap_mutex);
1291 
1292 			err = mmu_interval_notifier_insert(&userptr->notifier,
1293 							   current->mm,
1294 							   xe_vma_userptr(vma), size,
1295 							   &vma_userptr_notifier_ops);
1296 			if (err) {
1297 				xe_vma_free(vma);
1298 				return ERR_PTR(err);
1299 			}
1300 
1301 			userptr->notifier_seq = LONG_MAX;
1302 		}
1303 
1304 		xe_vm_get(vm);
1305 	}
1306 
1307 	return vma;
1308 }
1309 
1310 static void xe_vma_destroy_late(struct xe_vma *vma)
1311 {
1312 	struct xe_vm *vm = xe_vma_vm(vma);
1313 
1314 	if (vma->ufence) {
1315 		xe_sync_ufence_put(vma->ufence);
1316 		vma->ufence = NULL;
1317 	}
1318 
1319 	if (xe_vma_is_userptr(vma)) {
1320 		struct xe_userptr_vma *uvma = to_userptr_vma(vma);
1321 		struct xe_userptr *userptr = &uvma->userptr;
1322 
1323 		if (userptr->sg)
1324 			xe_hmm_userptr_free_sg(uvma);
1325 
1326 		/*
1327 		 * Since userptr pages are not pinned, we can't remove
1328 		 * the notifier until we're sure the GPU is not accessing
1329 		 * them anymore
1330 		 */
1331 		mmu_interval_notifier_remove(&userptr->notifier);
1332 		mutex_destroy(&userptr->unmap_mutex);
1333 		xe_vm_put(vm);
1334 	} else if (xe_vma_is_null(vma) || xe_vma_is_cpu_addr_mirror(vma)) {
1335 		xe_vm_put(vm);
1336 	} else {
1337 		xe_bo_put(xe_vma_bo(vma));
1338 	}
1339 
1340 	xe_vma_free(vma);
1341 }
1342 
1343 static void vma_destroy_work_func(struct work_struct *w)
1344 {
1345 	struct xe_vma *vma =
1346 		container_of(w, struct xe_vma, destroy_work);
1347 
1348 	xe_vma_destroy_late(vma);
1349 }
1350 
1351 static void vma_destroy_cb(struct dma_fence *fence,
1352 			   struct dma_fence_cb *cb)
1353 {
1354 	struct xe_vma *vma = container_of(cb, struct xe_vma, destroy_cb);
1355 
1356 	INIT_WORK(&vma->destroy_work, vma_destroy_work_func);
1357 	queue_work(system_unbound_wq, &vma->destroy_work);
1358 }
1359 
1360 static void xe_vma_destroy(struct xe_vma *vma, struct dma_fence *fence)
1361 {
1362 	struct xe_vm *vm = xe_vma_vm(vma);
1363 
1364 	lockdep_assert_held_write(&vm->lock);
1365 	xe_assert(vm->xe, list_empty(&vma->combined_links.destroy));
1366 
1367 	if (xe_vma_is_userptr(vma)) {
1368 		xe_assert(vm->xe, vma->gpuva.flags & XE_VMA_DESTROYED);
1369 
1370 		spin_lock(&vm->userptr.invalidated_lock);
1371 		xe_assert(vm->xe, list_empty(&to_userptr_vma(vma)->userptr.repin_link));
1372 		list_del(&to_userptr_vma(vma)->userptr.invalidate_link);
1373 		spin_unlock(&vm->userptr.invalidated_lock);
1374 	} else if (!xe_vma_is_null(vma) && !xe_vma_is_cpu_addr_mirror(vma)) {
1375 		xe_bo_assert_held(xe_vma_bo(vma));
1376 
1377 		drm_gpuva_unlink(&vma->gpuva);
1378 	}
1379 
1380 	xe_vm_assert_held(vm);
1381 	if (fence) {
1382 		int ret = dma_fence_add_callback(fence, &vma->destroy_cb,
1383 						 vma_destroy_cb);
1384 
1385 		if (ret) {
1386 			XE_WARN_ON(ret != -ENOENT);
1387 			xe_vma_destroy_late(vma);
1388 		}
1389 	} else {
1390 		xe_vma_destroy_late(vma);
1391 	}
1392 }
1393 
1394 /**
1395  * xe_vm_lock_vma() - drm_exec utility to lock a vma
1396  * @exec: The drm_exec object we're currently locking for.
1397  * @vma: The vma for witch we want to lock the vm resv and any attached
1398  * object's resv.
1399  *
1400  * Return: 0 on success, negative error code on error. In particular
1401  * may return -EDEADLK on WW transaction contention and -EINTR if
1402  * an interruptible wait is terminated by a signal.
1403  */
1404 int xe_vm_lock_vma(struct drm_exec *exec, struct xe_vma *vma)
1405 {
1406 	struct xe_vm *vm = xe_vma_vm(vma);
1407 	struct xe_bo *bo = xe_vma_bo(vma);
1408 	int err;
1409 
1410 	XE_WARN_ON(!vm);
1411 
1412 	err = drm_exec_lock_obj(exec, xe_vm_obj(vm));
1413 	if (!err && bo && !bo->vm)
1414 		err = drm_exec_lock_obj(exec, &bo->ttm.base);
1415 
1416 	return err;
1417 }
1418 
1419 static void xe_vma_destroy_unlocked(struct xe_vma *vma)
1420 {
1421 	struct drm_exec exec;
1422 	int err;
1423 
1424 	drm_exec_init(&exec, 0, 0);
1425 	drm_exec_until_all_locked(&exec) {
1426 		err = xe_vm_lock_vma(&exec, vma);
1427 		drm_exec_retry_on_contention(&exec);
1428 		if (XE_WARN_ON(err))
1429 			break;
1430 	}
1431 
1432 	xe_vma_destroy(vma, NULL);
1433 
1434 	drm_exec_fini(&exec);
1435 }
1436 
1437 struct xe_vma *
1438 xe_vm_find_overlapping_vma(struct xe_vm *vm, u64 start, u64 range)
1439 {
1440 	struct drm_gpuva *gpuva;
1441 
1442 	lockdep_assert_held(&vm->lock);
1443 
1444 	if (xe_vm_is_closed_or_banned(vm))
1445 		return NULL;
1446 
1447 	xe_assert(vm->xe, start + range <= vm->size);
1448 
1449 	gpuva = drm_gpuva_find_first(&vm->gpuvm, start, range);
1450 
1451 	return gpuva ? gpuva_to_vma(gpuva) : NULL;
1452 }
1453 
1454 static int xe_vm_insert_vma(struct xe_vm *vm, struct xe_vma *vma)
1455 {
1456 	int err;
1457 
1458 	xe_assert(vm->xe, xe_vma_vm(vma) == vm);
1459 	lockdep_assert_held(&vm->lock);
1460 
1461 	mutex_lock(&vm->snap_mutex);
1462 	err = drm_gpuva_insert(&vm->gpuvm, &vma->gpuva);
1463 	mutex_unlock(&vm->snap_mutex);
1464 	XE_WARN_ON(err);	/* Shouldn't be possible */
1465 
1466 	return err;
1467 }
1468 
1469 static void xe_vm_remove_vma(struct xe_vm *vm, struct xe_vma *vma)
1470 {
1471 	xe_assert(vm->xe, xe_vma_vm(vma) == vm);
1472 	lockdep_assert_held(&vm->lock);
1473 
1474 	mutex_lock(&vm->snap_mutex);
1475 	drm_gpuva_remove(&vma->gpuva);
1476 	mutex_unlock(&vm->snap_mutex);
1477 	if (vm->usm.last_fault_vma == vma)
1478 		vm->usm.last_fault_vma = NULL;
1479 }
1480 
1481 static struct drm_gpuva_op *xe_vm_op_alloc(void)
1482 {
1483 	struct xe_vma_op *op;
1484 
1485 	op = kzalloc(sizeof(*op), GFP_KERNEL);
1486 
1487 	if (unlikely(!op))
1488 		return NULL;
1489 
1490 	return &op->base;
1491 }
1492 
1493 static void xe_vm_free(struct drm_gpuvm *gpuvm);
1494 
1495 static const struct drm_gpuvm_ops gpuvm_ops = {
1496 	.op_alloc = xe_vm_op_alloc,
1497 	.vm_bo_validate = xe_gpuvm_validate,
1498 	.vm_free = xe_vm_free,
1499 };
1500 
1501 static u64 pde_encode_pat_index(u16 pat_index)
1502 {
1503 	u64 pte = 0;
1504 
1505 	if (pat_index & BIT(0))
1506 		pte |= XE_PPGTT_PTE_PAT0;
1507 
1508 	if (pat_index & BIT(1))
1509 		pte |= XE_PPGTT_PTE_PAT1;
1510 
1511 	return pte;
1512 }
1513 
1514 static u64 pte_encode_pat_index(u16 pat_index, u32 pt_level)
1515 {
1516 	u64 pte = 0;
1517 
1518 	if (pat_index & BIT(0))
1519 		pte |= XE_PPGTT_PTE_PAT0;
1520 
1521 	if (pat_index & BIT(1))
1522 		pte |= XE_PPGTT_PTE_PAT1;
1523 
1524 	if (pat_index & BIT(2)) {
1525 		if (pt_level)
1526 			pte |= XE_PPGTT_PDE_PDPE_PAT2;
1527 		else
1528 			pte |= XE_PPGTT_PTE_PAT2;
1529 	}
1530 
1531 	if (pat_index & BIT(3))
1532 		pte |= XELPG_PPGTT_PTE_PAT3;
1533 
1534 	if (pat_index & (BIT(4)))
1535 		pte |= XE2_PPGTT_PTE_PAT4;
1536 
1537 	return pte;
1538 }
1539 
1540 static u64 pte_encode_ps(u32 pt_level)
1541 {
1542 	XE_WARN_ON(pt_level > MAX_HUGEPTE_LEVEL);
1543 
1544 	if (pt_level == 1)
1545 		return XE_PDE_PS_2M;
1546 	else if (pt_level == 2)
1547 		return XE_PDPE_PS_1G;
1548 
1549 	return 0;
1550 }
1551 
1552 static u16 pde_pat_index(struct xe_bo *bo)
1553 {
1554 	struct xe_device *xe = xe_bo_device(bo);
1555 	u16 pat_index;
1556 
1557 	/*
1558 	 * We only have two bits to encode the PAT index in non-leaf nodes, but
1559 	 * these only point to other paging structures so we only need a minimal
1560 	 * selection of options. The user PAT index is only for encoding leaf
1561 	 * nodes, where we have use of more bits to do the encoding. The
1562 	 * non-leaf nodes are instead under driver control so the chosen index
1563 	 * here should be distict from the user PAT index. Also the
1564 	 * corresponding coherency of the PAT index should be tied to the
1565 	 * allocation type of the page table (or at least we should pick
1566 	 * something which is always safe).
1567 	 */
1568 	if (!xe_bo_is_vram(bo) && bo->ttm.ttm->caching == ttm_cached)
1569 		pat_index = xe->pat.idx[XE_CACHE_WB];
1570 	else
1571 		pat_index = xe->pat.idx[XE_CACHE_NONE];
1572 
1573 	xe_assert(xe, pat_index <= 3);
1574 
1575 	return pat_index;
1576 }
1577 
1578 static u64 xelp_pde_encode_bo(struct xe_bo *bo, u64 bo_offset)
1579 {
1580 	u64 pde;
1581 
1582 	pde = xe_bo_addr(bo, bo_offset, XE_PAGE_SIZE);
1583 	pde |= XE_PAGE_PRESENT | XE_PAGE_RW;
1584 	pde |= pde_encode_pat_index(pde_pat_index(bo));
1585 
1586 	return pde;
1587 }
1588 
1589 static u64 xelp_pte_encode_bo(struct xe_bo *bo, u64 bo_offset,
1590 			      u16 pat_index, u32 pt_level)
1591 {
1592 	u64 pte;
1593 
1594 	pte = xe_bo_addr(bo, bo_offset, XE_PAGE_SIZE);
1595 	pte |= XE_PAGE_PRESENT | XE_PAGE_RW;
1596 	pte |= pte_encode_pat_index(pat_index, pt_level);
1597 	pte |= pte_encode_ps(pt_level);
1598 
1599 	if (xe_bo_is_vram(bo) || xe_bo_is_stolen_devmem(bo))
1600 		pte |= XE_PPGTT_PTE_DM;
1601 
1602 	return pte;
1603 }
1604 
1605 static u64 xelp_pte_encode_vma(u64 pte, struct xe_vma *vma,
1606 			       u16 pat_index, u32 pt_level)
1607 {
1608 	pte |= XE_PAGE_PRESENT;
1609 
1610 	if (likely(!xe_vma_read_only(vma)))
1611 		pte |= XE_PAGE_RW;
1612 
1613 	pte |= pte_encode_pat_index(pat_index, pt_level);
1614 	pte |= pte_encode_ps(pt_level);
1615 
1616 	if (unlikely(xe_vma_is_null(vma)))
1617 		pte |= XE_PTE_NULL;
1618 
1619 	return pte;
1620 }
1621 
1622 static u64 xelp_pte_encode_addr(struct xe_device *xe, u64 addr,
1623 				u16 pat_index,
1624 				u32 pt_level, bool devmem, u64 flags)
1625 {
1626 	u64 pte;
1627 
1628 	/* Avoid passing random bits directly as flags */
1629 	xe_assert(xe, !(flags & ~XE_PTE_PS64));
1630 
1631 	pte = addr;
1632 	pte |= XE_PAGE_PRESENT | XE_PAGE_RW;
1633 	pte |= pte_encode_pat_index(pat_index, pt_level);
1634 	pte |= pte_encode_ps(pt_level);
1635 
1636 	if (devmem)
1637 		pte |= XE_PPGTT_PTE_DM;
1638 
1639 	pte |= flags;
1640 
1641 	return pte;
1642 }
1643 
1644 static const struct xe_pt_ops xelp_pt_ops = {
1645 	.pte_encode_bo = xelp_pte_encode_bo,
1646 	.pte_encode_vma = xelp_pte_encode_vma,
1647 	.pte_encode_addr = xelp_pte_encode_addr,
1648 	.pde_encode_bo = xelp_pde_encode_bo,
1649 };
1650 
1651 static void vm_destroy_work_func(struct work_struct *w);
1652 
1653 /**
1654  * xe_vm_create_scratch() - Setup a scratch memory pagetable tree for the
1655  * given tile and vm.
1656  * @xe: xe device.
1657  * @tile: tile to set up for.
1658  * @vm: vm to set up for.
1659  *
1660  * Sets up a pagetable tree with one page-table per level and a single
1661  * leaf PTE. All pagetable entries point to the single page-table or,
1662  * for MAX_HUGEPTE_LEVEL, a NULL huge PTE returning 0 on read and
1663  * writes become NOPs.
1664  *
1665  * Return: 0 on success, negative error code on error.
1666  */
1667 static int xe_vm_create_scratch(struct xe_device *xe, struct xe_tile *tile,
1668 				struct xe_vm *vm)
1669 {
1670 	u8 id = tile->id;
1671 	int i;
1672 
1673 	for (i = MAX_HUGEPTE_LEVEL; i < vm->pt_root[id]->level; i++) {
1674 		vm->scratch_pt[id][i] = xe_pt_create(vm, tile, i);
1675 		if (IS_ERR(vm->scratch_pt[id][i])) {
1676 			int err = PTR_ERR(vm->scratch_pt[id][i]);
1677 
1678 			vm->scratch_pt[id][i] = NULL;
1679 			return err;
1680 		}
1681 
1682 		xe_pt_populate_empty(tile, vm, vm->scratch_pt[id][i]);
1683 	}
1684 
1685 	return 0;
1686 }
1687 ALLOW_ERROR_INJECTION(xe_vm_create_scratch, ERRNO);
1688 
1689 static void xe_vm_free_scratch(struct xe_vm *vm)
1690 {
1691 	struct xe_tile *tile;
1692 	u8 id;
1693 
1694 	if (!xe_vm_has_scratch(vm))
1695 		return;
1696 
1697 	for_each_tile(tile, vm->xe, id) {
1698 		u32 i;
1699 
1700 		if (!vm->pt_root[id])
1701 			continue;
1702 
1703 		for (i = MAX_HUGEPTE_LEVEL; i < vm->pt_root[id]->level; ++i)
1704 			if (vm->scratch_pt[id][i])
1705 				xe_pt_destroy(vm->scratch_pt[id][i], vm->flags, NULL);
1706 	}
1707 }
1708 
1709 struct xe_vm *xe_vm_create(struct xe_device *xe, u32 flags, struct xe_file *xef)
1710 {
1711 	struct drm_gem_object *vm_resv_obj;
1712 	struct xe_vm *vm;
1713 	int err, number_tiles = 0;
1714 	struct xe_tile *tile;
1715 	u8 id;
1716 
1717 	/*
1718 	 * Since the GSCCS is not user-accessible, we don't expect a GSC VM to
1719 	 * ever be in faulting mode.
1720 	 */
1721 	xe_assert(xe, !((flags & XE_VM_FLAG_GSC) && (flags & XE_VM_FLAG_FAULT_MODE)));
1722 
1723 	vm = kzalloc(sizeof(*vm), GFP_KERNEL);
1724 	if (!vm)
1725 		return ERR_PTR(-ENOMEM);
1726 
1727 	vm->xe = xe;
1728 
1729 	vm->size = 1ull << xe->info.va_bits;
1730 	vm->flags = flags;
1731 
1732 	if (xef)
1733 		vm->xef = xe_file_get(xef);
1734 	/**
1735 	 * GSC VMs are kernel-owned, only used for PXP ops and can sometimes be
1736 	 * manipulated under the PXP mutex. However, the PXP mutex can be taken
1737 	 * under a user-VM lock when the PXP session is started at exec_queue
1738 	 * creation time. Those are different VMs and therefore there is no risk
1739 	 * of deadlock, but we need to tell lockdep that this is the case or it
1740 	 * will print a warning.
1741 	 */
1742 	if (flags & XE_VM_FLAG_GSC) {
1743 		static struct lock_class_key gsc_vm_key;
1744 
1745 		__init_rwsem(&vm->lock, "gsc_vm", &gsc_vm_key);
1746 	} else {
1747 		init_rwsem(&vm->lock);
1748 	}
1749 	mutex_init(&vm->snap_mutex);
1750 
1751 	INIT_LIST_HEAD(&vm->rebind_list);
1752 
1753 	INIT_LIST_HEAD(&vm->userptr.repin_list);
1754 	INIT_LIST_HEAD(&vm->userptr.invalidated);
1755 	init_rwsem(&vm->userptr.notifier_lock);
1756 	spin_lock_init(&vm->userptr.invalidated_lock);
1757 
1758 	ttm_lru_bulk_move_init(&vm->lru_bulk_move);
1759 
1760 	INIT_WORK(&vm->destroy_work, vm_destroy_work_func);
1761 
1762 	INIT_LIST_HEAD(&vm->preempt.exec_queues);
1763 	vm->preempt.min_run_period_ms = 10;	/* FIXME: Wire up to uAPI */
1764 
1765 	for_each_tile(tile, xe, id)
1766 		xe_range_fence_tree_init(&vm->rftree[id]);
1767 
1768 	vm->pt_ops = &xelp_pt_ops;
1769 
1770 	/*
1771 	 * Long-running workloads are not protected by the scheduler references.
1772 	 * By design, run_job for long-running workloads returns NULL and the
1773 	 * scheduler drops all the references of it, hence protecting the VM
1774 	 * for this case is necessary.
1775 	 */
1776 	if (flags & XE_VM_FLAG_LR_MODE) {
1777 		INIT_WORK(&vm->preempt.rebind_work, preempt_rebind_work_func);
1778 		xe_pm_runtime_get_noresume(xe);
1779 		INIT_LIST_HEAD(&vm->preempt.pm_activate_link);
1780 	}
1781 
1782 	if (flags & XE_VM_FLAG_FAULT_MODE) {
1783 		err = xe_svm_init(vm);
1784 		if (err)
1785 			goto err_no_resv;
1786 	}
1787 
1788 	vm_resv_obj = drm_gpuvm_resv_object_alloc(&xe->drm);
1789 	if (!vm_resv_obj) {
1790 		err = -ENOMEM;
1791 		goto err_svm_fini;
1792 	}
1793 
1794 	drm_gpuvm_init(&vm->gpuvm, "Xe VM", DRM_GPUVM_RESV_PROTECTED, &xe->drm,
1795 		       vm_resv_obj, 0, vm->size, 0, 0, &gpuvm_ops);
1796 
1797 	drm_gem_object_put(vm_resv_obj);
1798 
1799 	err = xe_vm_lock(vm, true);
1800 	if (err)
1801 		goto err_close;
1802 
1803 	if (IS_DGFX(xe) && xe->info.vram_flags & XE_VRAM_FLAGS_NEED64K)
1804 		vm->flags |= XE_VM_FLAG_64K;
1805 
1806 	for_each_tile(tile, xe, id) {
1807 		if (flags & XE_VM_FLAG_MIGRATION &&
1808 		    tile->id != XE_VM_FLAG_TILE_ID(flags))
1809 			continue;
1810 
1811 		vm->pt_root[id] = xe_pt_create(vm, tile, xe->info.vm_max_level);
1812 		if (IS_ERR(vm->pt_root[id])) {
1813 			err = PTR_ERR(vm->pt_root[id]);
1814 			vm->pt_root[id] = NULL;
1815 			goto err_unlock_close;
1816 		}
1817 	}
1818 
1819 	if (xe_vm_has_scratch(vm)) {
1820 		for_each_tile(tile, xe, id) {
1821 			if (!vm->pt_root[id])
1822 				continue;
1823 
1824 			err = xe_vm_create_scratch(xe, tile, vm);
1825 			if (err)
1826 				goto err_unlock_close;
1827 		}
1828 		vm->batch_invalidate_tlb = true;
1829 	}
1830 
1831 	if (vm->flags & XE_VM_FLAG_LR_MODE)
1832 		vm->batch_invalidate_tlb = false;
1833 
1834 	/* Fill pt_root after allocating scratch tables */
1835 	for_each_tile(tile, xe, id) {
1836 		if (!vm->pt_root[id])
1837 			continue;
1838 
1839 		xe_pt_populate_empty(tile, vm, vm->pt_root[id]);
1840 	}
1841 	xe_vm_unlock(vm);
1842 
1843 	/* Kernel migration VM shouldn't have a circular loop.. */
1844 	if (!(flags & XE_VM_FLAG_MIGRATION)) {
1845 		for_each_tile(tile, xe, id) {
1846 			struct xe_exec_queue *q;
1847 			u32 create_flags = EXEC_QUEUE_FLAG_VM;
1848 
1849 			if (!vm->pt_root[id])
1850 				continue;
1851 
1852 			q = xe_exec_queue_create_bind(xe, tile, create_flags, 0);
1853 			if (IS_ERR(q)) {
1854 				err = PTR_ERR(q);
1855 				goto err_close;
1856 			}
1857 			vm->q[id] = q;
1858 			number_tiles++;
1859 		}
1860 	}
1861 
1862 	if (number_tiles > 1)
1863 		vm->composite_fence_ctx = dma_fence_context_alloc(1);
1864 
1865 	if (xef && xe->info.has_asid) {
1866 		u32 asid;
1867 
1868 		down_write(&xe->usm.lock);
1869 		err = xa_alloc_cyclic(&xe->usm.asid_to_vm, &asid, vm,
1870 				      XA_LIMIT(1, XE_MAX_ASID - 1),
1871 				      &xe->usm.next_asid, GFP_KERNEL);
1872 		up_write(&xe->usm.lock);
1873 		if (err < 0)
1874 			goto err_unlock_close;
1875 
1876 		vm->usm.asid = asid;
1877 	}
1878 
1879 	trace_xe_vm_create(vm);
1880 
1881 	return vm;
1882 
1883 err_unlock_close:
1884 	xe_vm_unlock(vm);
1885 err_close:
1886 	xe_vm_close_and_put(vm);
1887 	return ERR_PTR(err);
1888 
1889 err_svm_fini:
1890 	if (flags & XE_VM_FLAG_FAULT_MODE) {
1891 		vm->size = 0; /* close the vm */
1892 		xe_svm_fini(vm);
1893 	}
1894 err_no_resv:
1895 	mutex_destroy(&vm->snap_mutex);
1896 	for_each_tile(tile, xe, id)
1897 		xe_range_fence_tree_fini(&vm->rftree[id]);
1898 	ttm_lru_bulk_move_fini(&xe->ttm, &vm->lru_bulk_move);
1899 	if (vm->xef)
1900 		xe_file_put(vm->xef);
1901 	kfree(vm);
1902 	if (flags & XE_VM_FLAG_LR_MODE)
1903 		xe_pm_runtime_put(xe);
1904 	return ERR_PTR(err);
1905 }
1906 
1907 static void xe_vm_close(struct xe_vm *vm)
1908 {
1909 	struct xe_device *xe = vm->xe;
1910 	bool bound;
1911 	int idx;
1912 
1913 	bound = drm_dev_enter(&xe->drm, &idx);
1914 
1915 	down_write(&vm->lock);
1916 	if (xe_vm_in_fault_mode(vm))
1917 		xe_svm_notifier_lock(vm);
1918 
1919 	vm->size = 0;
1920 
1921 	if (!((vm->flags & XE_VM_FLAG_MIGRATION))) {
1922 		struct xe_tile *tile;
1923 		struct xe_gt *gt;
1924 		u8 id;
1925 
1926 		/* Wait for pending binds */
1927 		dma_resv_wait_timeout(xe_vm_resv(vm),
1928 				      DMA_RESV_USAGE_BOOKKEEP,
1929 				      false, MAX_SCHEDULE_TIMEOUT);
1930 
1931 		if (bound) {
1932 			for_each_tile(tile, xe, id)
1933 				if (vm->pt_root[id])
1934 					xe_pt_clear(xe, vm->pt_root[id]);
1935 
1936 			for_each_gt(gt, xe, id)
1937 				xe_tlb_inval_vm(&gt->tlb_inval, vm);
1938 		}
1939 	}
1940 
1941 	if (xe_vm_in_fault_mode(vm))
1942 		xe_svm_notifier_unlock(vm);
1943 	up_write(&vm->lock);
1944 
1945 	if (bound)
1946 		drm_dev_exit(idx);
1947 }
1948 
1949 void xe_vm_close_and_put(struct xe_vm *vm)
1950 {
1951 	LIST_HEAD(contested);
1952 	struct xe_device *xe = vm->xe;
1953 	struct xe_tile *tile;
1954 	struct xe_vma *vma, *next_vma;
1955 	struct drm_gpuva *gpuva, *next;
1956 	u8 id;
1957 
1958 	xe_assert(xe, !vm->preempt.num_exec_queues);
1959 
1960 	xe_vm_close(vm);
1961 	if (xe_vm_in_preempt_fence_mode(vm)) {
1962 		mutex_lock(&xe->rebind_resume_lock);
1963 		list_del_init(&vm->preempt.pm_activate_link);
1964 		mutex_unlock(&xe->rebind_resume_lock);
1965 		flush_work(&vm->preempt.rebind_work);
1966 	}
1967 	if (xe_vm_in_fault_mode(vm))
1968 		xe_svm_close(vm);
1969 
1970 	down_write(&vm->lock);
1971 	for_each_tile(tile, xe, id) {
1972 		if (vm->q[id])
1973 			xe_exec_queue_last_fence_put(vm->q[id], vm);
1974 	}
1975 	up_write(&vm->lock);
1976 
1977 	for_each_tile(tile, xe, id) {
1978 		if (vm->q[id]) {
1979 			xe_exec_queue_kill(vm->q[id]);
1980 			xe_exec_queue_put(vm->q[id]);
1981 			vm->q[id] = NULL;
1982 		}
1983 	}
1984 
1985 	down_write(&vm->lock);
1986 	xe_vm_lock(vm, false);
1987 	drm_gpuvm_for_each_va_safe(gpuva, next, &vm->gpuvm) {
1988 		vma = gpuva_to_vma(gpuva);
1989 
1990 		if (xe_vma_has_no_bo(vma)) {
1991 			down_read(&vm->userptr.notifier_lock);
1992 			vma->gpuva.flags |= XE_VMA_DESTROYED;
1993 			up_read(&vm->userptr.notifier_lock);
1994 		}
1995 
1996 		xe_vm_remove_vma(vm, vma);
1997 
1998 		/* easy case, remove from VMA? */
1999 		if (xe_vma_has_no_bo(vma) || xe_vma_bo(vma)->vm) {
2000 			list_del_init(&vma->combined_links.rebind);
2001 			xe_vma_destroy(vma, NULL);
2002 			continue;
2003 		}
2004 
2005 		list_move_tail(&vma->combined_links.destroy, &contested);
2006 		vma->gpuva.flags |= XE_VMA_DESTROYED;
2007 	}
2008 
2009 	/*
2010 	 * All vm operations will add shared fences to resv.
2011 	 * The only exception is eviction for a shared object,
2012 	 * but even so, the unbind when evicted would still
2013 	 * install a fence to resv. Hence it's safe to
2014 	 * destroy the pagetables immediately.
2015 	 */
2016 	xe_vm_free_scratch(vm);
2017 
2018 	for_each_tile(tile, xe, id) {
2019 		if (vm->pt_root[id]) {
2020 			xe_pt_destroy(vm->pt_root[id], vm->flags, NULL);
2021 			vm->pt_root[id] = NULL;
2022 		}
2023 	}
2024 	xe_vm_unlock(vm);
2025 
2026 	/*
2027 	 * VM is now dead, cannot re-add nodes to vm->vmas if it's NULL
2028 	 * Since we hold a refcount to the bo, we can remove and free
2029 	 * the members safely without locking.
2030 	 */
2031 	list_for_each_entry_safe(vma, next_vma, &contested,
2032 				 combined_links.destroy) {
2033 		list_del_init(&vma->combined_links.destroy);
2034 		xe_vma_destroy_unlocked(vma);
2035 	}
2036 
2037 	if (xe_vm_in_fault_mode(vm))
2038 		xe_svm_fini(vm);
2039 
2040 	up_write(&vm->lock);
2041 
2042 	down_write(&xe->usm.lock);
2043 	if (vm->usm.asid) {
2044 		void *lookup;
2045 
2046 		xe_assert(xe, xe->info.has_asid);
2047 		xe_assert(xe, !(vm->flags & XE_VM_FLAG_MIGRATION));
2048 
2049 		lookup = xa_erase(&xe->usm.asid_to_vm, vm->usm.asid);
2050 		xe_assert(xe, lookup == vm);
2051 	}
2052 	up_write(&xe->usm.lock);
2053 
2054 	for_each_tile(tile, xe, id)
2055 		xe_range_fence_tree_fini(&vm->rftree[id]);
2056 
2057 	xe_vm_put(vm);
2058 }
2059 
2060 static void vm_destroy_work_func(struct work_struct *w)
2061 {
2062 	struct xe_vm *vm =
2063 		container_of(w, struct xe_vm, destroy_work);
2064 	struct xe_device *xe = vm->xe;
2065 	struct xe_tile *tile;
2066 	u8 id;
2067 
2068 	/* xe_vm_close_and_put was not called? */
2069 	xe_assert(xe, !vm->size);
2070 
2071 	if (xe_vm_in_preempt_fence_mode(vm))
2072 		flush_work(&vm->preempt.rebind_work);
2073 
2074 	mutex_destroy(&vm->snap_mutex);
2075 
2076 	if (vm->flags & XE_VM_FLAG_LR_MODE)
2077 		xe_pm_runtime_put(xe);
2078 
2079 	for_each_tile(tile, xe, id)
2080 		XE_WARN_ON(vm->pt_root[id]);
2081 
2082 	trace_xe_vm_free(vm);
2083 
2084 	ttm_lru_bulk_move_fini(&xe->ttm, &vm->lru_bulk_move);
2085 
2086 	if (vm->xef)
2087 		xe_file_put(vm->xef);
2088 
2089 	kfree(vm);
2090 }
2091 
2092 static void xe_vm_free(struct drm_gpuvm *gpuvm)
2093 {
2094 	struct xe_vm *vm = container_of(gpuvm, struct xe_vm, gpuvm);
2095 
2096 	/* To destroy the VM we need to be able to sleep */
2097 	queue_work(system_unbound_wq, &vm->destroy_work);
2098 }
2099 
2100 struct xe_vm *xe_vm_lookup(struct xe_file *xef, u32 id)
2101 {
2102 	struct xe_vm *vm;
2103 
2104 	mutex_lock(&xef->vm.lock);
2105 	vm = xa_load(&xef->vm.xa, id);
2106 	if (vm)
2107 		xe_vm_get(vm);
2108 	mutex_unlock(&xef->vm.lock);
2109 
2110 	return vm;
2111 }
2112 
2113 u64 xe_vm_pdp4_descriptor(struct xe_vm *vm, struct xe_tile *tile)
2114 {
2115 	return vm->pt_ops->pde_encode_bo(vm->pt_root[tile->id]->bo, 0);
2116 }
2117 
2118 static struct xe_exec_queue *
2119 to_wait_exec_queue(struct xe_vm *vm, struct xe_exec_queue *q)
2120 {
2121 	return q ? q : vm->q[0];
2122 }
2123 
2124 static struct xe_user_fence *
2125 find_ufence_get(struct xe_sync_entry *syncs, u32 num_syncs)
2126 {
2127 	unsigned int i;
2128 
2129 	for (i = 0; i < num_syncs; i++) {
2130 		struct xe_sync_entry *e = &syncs[i];
2131 
2132 		if (xe_sync_is_ufence(e))
2133 			return xe_sync_ufence_get(e);
2134 	}
2135 
2136 	return NULL;
2137 }
2138 
2139 #define ALL_DRM_XE_VM_CREATE_FLAGS (DRM_XE_VM_CREATE_FLAG_SCRATCH_PAGE | \
2140 				    DRM_XE_VM_CREATE_FLAG_LR_MODE | \
2141 				    DRM_XE_VM_CREATE_FLAG_FAULT_MODE)
2142 
2143 int xe_vm_create_ioctl(struct drm_device *dev, void *data,
2144 		       struct drm_file *file)
2145 {
2146 	struct xe_device *xe = to_xe_device(dev);
2147 	struct xe_file *xef = to_xe_file(file);
2148 	struct drm_xe_vm_create *args = data;
2149 	struct xe_vm *vm;
2150 	u32 id;
2151 	int err;
2152 	u32 flags = 0;
2153 
2154 	if (XE_IOCTL_DBG(xe, args->extensions))
2155 		return -EINVAL;
2156 
2157 	if (XE_GT_WA(xe_root_mmio_gt(xe), 14016763929))
2158 		args->flags |= DRM_XE_VM_CREATE_FLAG_SCRATCH_PAGE;
2159 
2160 	if (XE_IOCTL_DBG(xe, args->flags & DRM_XE_VM_CREATE_FLAG_FAULT_MODE &&
2161 			 !xe->info.has_usm))
2162 		return -EINVAL;
2163 
2164 	if (XE_IOCTL_DBG(xe, args->reserved[0] || args->reserved[1]))
2165 		return -EINVAL;
2166 
2167 	if (XE_IOCTL_DBG(xe, args->flags & ~ALL_DRM_XE_VM_CREATE_FLAGS))
2168 		return -EINVAL;
2169 
2170 	if (XE_IOCTL_DBG(xe, args->flags & DRM_XE_VM_CREATE_FLAG_SCRATCH_PAGE &&
2171 			 args->flags & DRM_XE_VM_CREATE_FLAG_FAULT_MODE &&
2172 			 !xe->info.needs_scratch))
2173 		return -EINVAL;
2174 
2175 	if (XE_IOCTL_DBG(xe, !(args->flags & DRM_XE_VM_CREATE_FLAG_LR_MODE) &&
2176 			 args->flags & DRM_XE_VM_CREATE_FLAG_FAULT_MODE))
2177 		return -EINVAL;
2178 
2179 	if (args->flags & DRM_XE_VM_CREATE_FLAG_SCRATCH_PAGE)
2180 		flags |= XE_VM_FLAG_SCRATCH_PAGE;
2181 	if (args->flags & DRM_XE_VM_CREATE_FLAG_LR_MODE)
2182 		flags |= XE_VM_FLAG_LR_MODE;
2183 	if (args->flags & DRM_XE_VM_CREATE_FLAG_FAULT_MODE)
2184 		flags |= XE_VM_FLAG_FAULT_MODE;
2185 
2186 	vm = xe_vm_create(xe, flags, xef);
2187 	if (IS_ERR(vm))
2188 		return PTR_ERR(vm);
2189 
2190 #if IS_ENABLED(CONFIG_DRM_XE_DEBUG_MEM)
2191 	/* Warning: Security issue - never enable by default */
2192 	args->reserved[0] = xe_bo_main_addr(vm->pt_root[0]->bo, XE_PAGE_SIZE);
2193 #endif
2194 
2195 	/* user id alloc must always be last in ioctl to prevent UAF */
2196 	err = xa_alloc(&xef->vm.xa, &id, vm, xa_limit_32b, GFP_KERNEL);
2197 	if (err)
2198 		goto err_close_and_put;
2199 
2200 	args->vm_id = id;
2201 
2202 	return 0;
2203 
2204 err_close_and_put:
2205 	xe_vm_close_and_put(vm);
2206 
2207 	return err;
2208 }
2209 
2210 int xe_vm_destroy_ioctl(struct drm_device *dev, void *data,
2211 			struct drm_file *file)
2212 {
2213 	struct xe_device *xe = to_xe_device(dev);
2214 	struct xe_file *xef = to_xe_file(file);
2215 	struct drm_xe_vm_destroy *args = data;
2216 	struct xe_vm *vm;
2217 	int err = 0;
2218 
2219 	if (XE_IOCTL_DBG(xe, args->pad) ||
2220 	    XE_IOCTL_DBG(xe, args->reserved[0] || args->reserved[1]))
2221 		return -EINVAL;
2222 
2223 	mutex_lock(&xef->vm.lock);
2224 	vm = xa_load(&xef->vm.xa, args->vm_id);
2225 	if (XE_IOCTL_DBG(xe, !vm))
2226 		err = -ENOENT;
2227 	else if (XE_IOCTL_DBG(xe, vm->preempt.num_exec_queues))
2228 		err = -EBUSY;
2229 	else
2230 		xa_erase(&xef->vm.xa, args->vm_id);
2231 	mutex_unlock(&xef->vm.lock);
2232 
2233 	if (!err)
2234 		xe_vm_close_and_put(vm);
2235 
2236 	return err;
2237 }
2238 
2239 static int xe_vm_query_vmas(struct xe_vm *vm, u64 start, u64 end)
2240 {
2241 	struct drm_gpuva *gpuva;
2242 	u32 num_vmas = 0;
2243 
2244 	lockdep_assert_held(&vm->lock);
2245 	drm_gpuvm_for_each_va_range(gpuva, &vm->gpuvm, start, end)
2246 		num_vmas++;
2247 
2248 	return num_vmas;
2249 }
2250 
2251 static int get_mem_attrs(struct xe_vm *vm, u32 *num_vmas, u64 start,
2252 			 u64 end, struct drm_xe_mem_range_attr *attrs)
2253 {
2254 	struct drm_gpuva *gpuva;
2255 	int i = 0;
2256 
2257 	lockdep_assert_held(&vm->lock);
2258 
2259 	drm_gpuvm_for_each_va_range(gpuva, &vm->gpuvm, start, end) {
2260 		struct xe_vma *vma = gpuva_to_vma(gpuva);
2261 
2262 		if (i == *num_vmas)
2263 			return -ENOSPC;
2264 
2265 		attrs[i].start = xe_vma_start(vma);
2266 		attrs[i].end = xe_vma_end(vma);
2267 		attrs[i].atomic.val = vma->attr.atomic_access;
2268 		attrs[i].pat_index.val = vma->attr.pat_index;
2269 		attrs[i].preferred_mem_loc.devmem_fd = vma->attr.preferred_loc.devmem_fd;
2270 		attrs[i].preferred_mem_loc.migration_policy =
2271 		vma->attr.preferred_loc.migration_policy;
2272 
2273 		i++;
2274 	}
2275 
2276 	*num_vmas = i;
2277 	return 0;
2278 }
2279 
2280 int xe_vm_query_vmas_attrs_ioctl(struct drm_device *dev, void *data, struct drm_file *file)
2281 {
2282 	struct xe_device *xe = to_xe_device(dev);
2283 	struct xe_file *xef = to_xe_file(file);
2284 	struct drm_xe_mem_range_attr *mem_attrs;
2285 	struct drm_xe_vm_query_mem_range_attr *args = data;
2286 	u64 __user *attrs_user = u64_to_user_ptr(args->vector_of_mem_attr);
2287 	struct xe_vm *vm;
2288 	int err = 0;
2289 
2290 	if (XE_IOCTL_DBG(xe,
2291 			 ((args->num_mem_ranges == 0 &&
2292 			  (attrs_user || args->sizeof_mem_range_attr != 0)) ||
2293 			 (args->num_mem_ranges > 0 &&
2294 			  (!attrs_user ||
2295 			   args->sizeof_mem_range_attr !=
2296 			   sizeof(struct drm_xe_mem_range_attr))))))
2297 		return -EINVAL;
2298 
2299 	vm = xe_vm_lookup(xef, args->vm_id);
2300 	if (XE_IOCTL_DBG(xe, !vm))
2301 		return -EINVAL;
2302 
2303 	err = down_read_interruptible(&vm->lock);
2304 	if (err)
2305 		goto put_vm;
2306 
2307 	attrs_user = u64_to_user_ptr(args->vector_of_mem_attr);
2308 
2309 	if (args->num_mem_ranges == 0 && !attrs_user) {
2310 		args->num_mem_ranges = xe_vm_query_vmas(vm, args->start, args->start + args->range);
2311 		args->sizeof_mem_range_attr = sizeof(struct drm_xe_mem_range_attr);
2312 		goto unlock_vm;
2313 	}
2314 
2315 	mem_attrs = kvmalloc_array(args->num_mem_ranges, args->sizeof_mem_range_attr,
2316 				   GFP_KERNEL | __GFP_ACCOUNT |
2317 				   __GFP_RETRY_MAYFAIL | __GFP_NOWARN);
2318 	if (!mem_attrs) {
2319 		err = args->num_mem_ranges > 1 ? -ENOBUFS : -ENOMEM;
2320 		goto unlock_vm;
2321 	}
2322 
2323 	memset(mem_attrs, 0, args->num_mem_ranges * args->sizeof_mem_range_attr);
2324 	err = get_mem_attrs(vm, &args->num_mem_ranges, args->start,
2325 			    args->start + args->range, mem_attrs);
2326 	if (err)
2327 		goto free_mem_attrs;
2328 
2329 	err = copy_to_user(attrs_user, mem_attrs,
2330 			   args->sizeof_mem_range_attr * args->num_mem_ranges);
2331 
2332 free_mem_attrs:
2333 	kvfree(mem_attrs);
2334 unlock_vm:
2335 	up_read(&vm->lock);
2336 put_vm:
2337 	xe_vm_put(vm);
2338 	return err;
2339 }
2340 
2341 static bool vma_matches(struct xe_vma *vma, u64 page_addr)
2342 {
2343 	if (page_addr > xe_vma_end(vma) - 1 ||
2344 	    page_addr + SZ_4K - 1 < xe_vma_start(vma))
2345 		return false;
2346 
2347 	return true;
2348 }
2349 
2350 /**
2351  * xe_vm_find_vma_by_addr() - Find a VMA by its address
2352  *
2353  * @vm: the xe_vm the vma belongs to
2354  * @page_addr: address to look up
2355  */
2356 struct xe_vma *xe_vm_find_vma_by_addr(struct xe_vm *vm, u64 page_addr)
2357 {
2358 	struct xe_vma *vma = NULL;
2359 
2360 	if (vm->usm.last_fault_vma) {   /* Fast lookup */
2361 		if (vma_matches(vm->usm.last_fault_vma, page_addr))
2362 			vma = vm->usm.last_fault_vma;
2363 	}
2364 	if (!vma)
2365 		vma = xe_vm_find_overlapping_vma(vm, page_addr, SZ_4K);
2366 
2367 	return vma;
2368 }
2369 
2370 static const u32 region_to_mem_type[] = {
2371 	XE_PL_TT,
2372 	XE_PL_VRAM0,
2373 	XE_PL_VRAM1,
2374 };
2375 
2376 static void prep_vma_destroy(struct xe_vm *vm, struct xe_vma *vma,
2377 			     bool post_commit)
2378 {
2379 	down_read(&vm->userptr.notifier_lock);
2380 	vma->gpuva.flags |= XE_VMA_DESTROYED;
2381 	up_read(&vm->userptr.notifier_lock);
2382 	if (post_commit)
2383 		xe_vm_remove_vma(vm, vma);
2384 }
2385 
2386 #undef ULL
2387 #define ULL	unsigned long long
2388 
2389 #if IS_ENABLED(CONFIG_DRM_XE_DEBUG_VM)
2390 static void print_op(struct xe_device *xe, struct drm_gpuva_op *op)
2391 {
2392 	struct xe_vma *vma;
2393 
2394 	switch (op->op) {
2395 	case DRM_GPUVA_OP_MAP:
2396 		vm_dbg(&xe->drm, "MAP: addr=0x%016llx, range=0x%016llx",
2397 		       (ULL)op->map.va.addr, (ULL)op->map.va.range);
2398 		break;
2399 	case DRM_GPUVA_OP_REMAP:
2400 		vma = gpuva_to_vma(op->remap.unmap->va);
2401 		vm_dbg(&xe->drm, "REMAP:UNMAP: addr=0x%016llx, range=0x%016llx, keep=%d",
2402 		       (ULL)xe_vma_start(vma), (ULL)xe_vma_size(vma),
2403 		       op->remap.unmap->keep ? 1 : 0);
2404 		if (op->remap.prev)
2405 			vm_dbg(&xe->drm,
2406 			       "REMAP:PREV: addr=0x%016llx, range=0x%016llx",
2407 			       (ULL)op->remap.prev->va.addr,
2408 			       (ULL)op->remap.prev->va.range);
2409 		if (op->remap.next)
2410 			vm_dbg(&xe->drm,
2411 			       "REMAP:NEXT: addr=0x%016llx, range=0x%016llx",
2412 			       (ULL)op->remap.next->va.addr,
2413 			       (ULL)op->remap.next->va.range);
2414 		break;
2415 	case DRM_GPUVA_OP_UNMAP:
2416 		vma = gpuva_to_vma(op->unmap.va);
2417 		vm_dbg(&xe->drm, "UNMAP: addr=0x%016llx, range=0x%016llx, keep=%d",
2418 		       (ULL)xe_vma_start(vma), (ULL)xe_vma_size(vma),
2419 		       op->unmap.keep ? 1 : 0);
2420 		break;
2421 	case DRM_GPUVA_OP_PREFETCH:
2422 		vma = gpuva_to_vma(op->prefetch.va);
2423 		vm_dbg(&xe->drm, "PREFETCH: addr=0x%016llx, range=0x%016llx",
2424 		       (ULL)xe_vma_start(vma), (ULL)xe_vma_size(vma));
2425 		break;
2426 	default:
2427 		drm_warn(&xe->drm, "NOT POSSIBLE");
2428 	}
2429 }
2430 #else
2431 static void print_op(struct xe_device *xe, struct drm_gpuva_op *op)
2432 {
2433 }
2434 #endif
2435 
2436 static bool __xe_vm_needs_clear_scratch_pages(struct xe_vm *vm, u32 bind_flags)
2437 {
2438 	if (!xe_vm_in_fault_mode(vm))
2439 		return false;
2440 
2441 	if (!xe_vm_has_scratch(vm))
2442 		return false;
2443 
2444 	if (bind_flags & DRM_XE_VM_BIND_FLAG_IMMEDIATE)
2445 		return false;
2446 
2447 	return true;
2448 }
2449 
2450 static void xe_svm_prefetch_gpuva_ops_fini(struct drm_gpuva_ops *ops)
2451 {
2452 	struct drm_gpuva_op *__op;
2453 
2454 	drm_gpuva_for_each_op(__op, ops) {
2455 		struct xe_vma_op *op = gpuva_op_to_vma_op(__op);
2456 
2457 		xe_vma_svm_prefetch_op_fini(op);
2458 	}
2459 }
2460 
2461 /*
2462  * Create operations list from IOCTL arguments, setup operations fields so parse
2463  * and commit steps are decoupled from IOCTL arguments. This step can fail.
2464  */
2465 static struct drm_gpuva_ops *
2466 vm_bind_ioctl_ops_create(struct xe_vm *vm, struct xe_vma_ops *vops,
2467 			 struct xe_bo *bo, u64 bo_offset_or_userptr,
2468 			 u64 addr, u64 range,
2469 			 u32 operation, u32 flags,
2470 			 u32 prefetch_region, u16 pat_index)
2471 {
2472 	struct drm_gem_object *obj = bo ? &bo->ttm.base : NULL;
2473 	struct drm_gpuva_ops *ops;
2474 	struct drm_gpuva_op *__op;
2475 	struct drm_gpuvm_bo *vm_bo;
2476 	u64 range_end = addr + range;
2477 	int err;
2478 
2479 	lockdep_assert_held_write(&vm->lock);
2480 
2481 	vm_dbg(&vm->xe->drm,
2482 	       "op=%d, addr=0x%016llx, range=0x%016llx, bo_offset_or_userptr=0x%016llx",
2483 	       operation, (ULL)addr, (ULL)range,
2484 	       (ULL)bo_offset_or_userptr);
2485 
2486 	switch (operation) {
2487 	case DRM_XE_VM_BIND_OP_MAP:
2488 	case DRM_XE_VM_BIND_OP_MAP_USERPTR: {
2489 		struct drm_gpuvm_map_req map_req = {
2490 			.map.va.addr = addr,
2491 			.map.va.range = range,
2492 			.map.gem.obj = obj,
2493 			.map.gem.offset = bo_offset_or_userptr,
2494 		};
2495 
2496 		ops = drm_gpuvm_sm_map_ops_create(&vm->gpuvm, &map_req);
2497 		break;
2498 	}
2499 	case DRM_XE_VM_BIND_OP_UNMAP:
2500 		ops = drm_gpuvm_sm_unmap_ops_create(&vm->gpuvm, addr, range);
2501 		break;
2502 	case DRM_XE_VM_BIND_OP_PREFETCH:
2503 		ops = drm_gpuvm_prefetch_ops_create(&vm->gpuvm, addr, range);
2504 		break;
2505 	case DRM_XE_VM_BIND_OP_UNMAP_ALL:
2506 		xe_assert(vm->xe, bo);
2507 
2508 		err = xe_bo_lock(bo, true);
2509 		if (err)
2510 			return ERR_PTR(err);
2511 
2512 		vm_bo = drm_gpuvm_bo_obtain(&vm->gpuvm, obj);
2513 		if (IS_ERR(vm_bo)) {
2514 			xe_bo_unlock(bo);
2515 			return ERR_CAST(vm_bo);
2516 		}
2517 
2518 		ops = drm_gpuvm_bo_unmap_ops_create(vm_bo);
2519 		drm_gpuvm_bo_put(vm_bo);
2520 		xe_bo_unlock(bo);
2521 		break;
2522 	default:
2523 		drm_warn(&vm->xe->drm, "NOT POSSIBLE");
2524 		ops = ERR_PTR(-EINVAL);
2525 	}
2526 	if (IS_ERR(ops))
2527 		return ops;
2528 
2529 	drm_gpuva_for_each_op(__op, ops) {
2530 		struct xe_vma_op *op = gpuva_op_to_vma_op(__op);
2531 
2532 		if (__op->op == DRM_GPUVA_OP_MAP) {
2533 			op->map.immediate =
2534 				flags & DRM_XE_VM_BIND_FLAG_IMMEDIATE;
2535 			op->map.read_only =
2536 				flags & DRM_XE_VM_BIND_FLAG_READONLY;
2537 			op->map.is_null = flags & DRM_XE_VM_BIND_FLAG_NULL;
2538 			op->map.is_cpu_addr_mirror = flags &
2539 				DRM_XE_VM_BIND_FLAG_CPU_ADDR_MIRROR;
2540 			op->map.dumpable = flags & DRM_XE_VM_BIND_FLAG_DUMPABLE;
2541 			op->map.pat_index = pat_index;
2542 			op->map.invalidate_on_bind =
2543 				__xe_vm_needs_clear_scratch_pages(vm, flags);
2544 		} else if (__op->op == DRM_GPUVA_OP_PREFETCH) {
2545 			struct xe_vma *vma = gpuva_to_vma(op->base.prefetch.va);
2546 			struct xe_tile *tile;
2547 			struct xe_svm_range *svm_range;
2548 			struct drm_gpusvm_ctx ctx = {};
2549 			struct drm_pagemap *dpagemap;
2550 			u8 id, tile_mask = 0;
2551 			u32 i;
2552 
2553 			if (!xe_vma_is_cpu_addr_mirror(vma)) {
2554 				op->prefetch.region = prefetch_region;
2555 				break;
2556 			}
2557 
2558 			ctx.read_only = xe_vma_read_only(vma);
2559 			ctx.devmem_possible = IS_DGFX(vm->xe) &&
2560 					      IS_ENABLED(CONFIG_DRM_XE_PAGEMAP);
2561 
2562 			for_each_tile(tile, vm->xe, id)
2563 				tile_mask |= 0x1 << id;
2564 
2565 			xa_init_flags(&op->prefetch_range.range, XA_FLAGS_ALLOC);
2566 			op->prefetch_range.ranges_count = 0;
2567 			tile = NULL;
2568 
2569 			if (prefetch_region == DRM_XE_CONSULT_MEM_ADVISE_PREF_LOC) {
2570 				dpagemap = xe_vma_resolve_pagemap(vma,
2571 								  xe_device_get_root_tile(vm->xe));
2572 				/*
2573 				 * TODO: Once multigpu support is enabled will need
2574 				 * something to dereference tile from dpagemap.
2575 				 */
2576 				if (dpagemap)
2577 					tile = xe_device_get_root_tile(vm->xe);
2578 			} else if (prefetch_region) {
2579 				tile = &vm->xe->tiles[region_to_mem_type[prefetch_region] -
2580 						      XE_PL_VRAM0];
2581 			}
2582 
2583 			op->prefetch_range.tile = tile;
2584 alloc_next_range:
2585 			svm_range = xe_svm_range_find_or_insert(vm, addr, vma, &ctx);
2586 
2587 			if (PTR_ERR(svm_range) == -ENOENT) {
2588 				u64 ret = xe_svm_find_vma_start(vm, addr, range_end, vma);
2589 
2590 				addr = ret == ULONG_MAX ? 0 : ret;
2591 				if (addr)
2592 					goto alloc_next_range;
2593 				else
2594 					goto print_op_label;
2595 			}
2596 
2597 			if (IS_ERR(svm_range)) {
2598 				err = PTR_ERR(svm_range);
2599 				goto unwind_prefetch_ops;
2600 			}
2601 
2602 			if (xe_svm_range_validate(vm, svm_range, tile_mask, !!tile)) {
2603 				xe_svm_range_debug(svm_range, "PREFETCH - RANGE IS VALID");
2604 				goto check_next_range;
2605 			}
2606 
2607 			err = xa_alloc(&op->prefetch_range.range,
2608 				       &i, svm_range, xa_limit_32b,
2609 				       GFP_KERNEL);
2610 
2611 			if (err)
2612 				goto unwind_prefetch_ops;
2613 
2614 			op->prefetch_range.ranges_count++;
2615 			vops->flags |= XE_VMA_OPS_FLAG_HAS_SVM_PREFETCH;
2616 			xe_svm_range_debug(svm_range, "PREFETCH - RANGE CREATED");
2617 check_next_range:
2618 			if (range_end > xe_svm_range_end(svm_range) &&
2619 			    xe_svm_range_end(svm_range) < xe_vma_end(vma)) {
2620 				addr = xe_svm_range_end(svm_range);
2621 				goto alloc_next_range;
2622 			}
2623 		}
2624 print_op_label:
2625 		print_op(vm->xe, __op);
2626 	}
2627 
2628 	return ops;
2629 
2630 unwind_prefetch_ops:
2631 	xe_svm_prefetch_gpuva_ops_fini(ops);
2632 	drm_gpuva_ops_free(&vm->gpuvm, ops);
2633 	return ERR_PTR(err);
2634 }
2635 
2636 ALLOW_ERROR_INJECTION(vm_bind_ioctl_ops_create, ERRNO);
2637 
2638 static struct xe_vma *new_vma(struct xe_vm *vm, struct drm_gpuva_op_map *op,
2639 			      struct xe_vma_mem_attr *attr, unsigned int flags)
2640 {
2641 	struct xe_bo *bo = op->gem.obj ? gem_to_xe_bo(op->gem.obj) : NULL;
2642 	struct drm_exec exec;
2643 	struct xe_vma *vma;
2644 	int err = 0;
2645 
2646 	lockdep_assert_held_write(&vm->lock);
2647 
2648 	if (bo) {
2649 		drm_exec_init(&exec, DRM_EXEC_INTERRUPTIBLE_WAIT, 0);
2650 		drm_exec_until_all_locked(&exec) {
2651 			err = 0;
2652 			if (!bo->vm) {
2653 				err = drm_exec_lock_obj(&exec, xe_vm_obj(vm));
2654 				drm_exec_retry_on_contention(&exec);
2655 			}
2656 			if (!err) {
2657 				err = drm_exec_lock_obj(&exec, &bo->ttm.base);
2658 				drm_exec_retry_on_contention(&exec);
2659 			}
2660 			if (err) {
2661 				drm_exec_fini(&exec);
2662 				return ERR_PTR(err);
2663 			}
2664 		}
2665 	}
2666 	vma = xe_vma_create(vm, bo, op->gem.offset,
2667 			    op->va.addr, op->va.addr +
2668 			    op->va.range - 1, attr, flags);
2669 	if (IS_ERR(vma))
2670 		goto err_unlock;
2671 
2672 	if (xe_vma_is_userptr(vma))
2673 		err = xe_vma_userptr_pin_pages(to_userptr_vma(vma));
2674 	else if (!xe_vma_has_no_bo(vma) && !bo->vm)
2675 		err = add_preempt_fences(vm, bo);
2676 
2677 err_unlock:
2678 	if (bo)
2679 		drm_exec_fini(&exec);
2680 
2681 	if (err) {
2682 		prep_vma_destroy(vm, vma, false);
2683 		xe_vma_destroy_unlocked(vma);
2684 		vma = ERR_PTR(err);
2685 	}
2686 
2687 	return vma;
2688 }
2689 
2690 static u64 xe_vma_max_pte_size(struct xe_vma *vma)
2691 {
2692 	if (vma->gpuva.flags & XE_VMA_PTE_1G)
2693 		return SZ_1G;
2694 	else if (vma->gpuva.flags & (XE_VMA_PTE_2M | XE_VMA_PTE_COMPACT))
2695 		return SZ_2M;
2696 	else if (vma->gpuva.flags & XE_VMA_PTE_64K)
2697 		return SZ_64K;
2698 	else if (vma->gpuva.flags & XE_VMA_PTE_4K)
2699 		return SZ_4K;
2700 
2701 	return SZ_1G;	/* Uninitialized, used max size */
2702 }
2703 
2704 static void xe_vma_set_pte_size(struct xe_vma *vma, u64 size)
2705 {
2706 	switch (size) {
2707 	case SZ_1G:
2708 		vma->gpuva.flags |= XE_VMA_PTE_1G;
2709 		break;
2710 	case SZ_2M:
2711 		vma->gpuva.flags |= XE_VMA_PTE_2M;
2712 		break;
2713 	case SZ_64K:
2714 		vma->gpuva.flags |= XE_VMA_PTE_64K;
2715 		break;
2716 	case SZ_4K:
2717 		vma->gpuva.flags |= XE_VMA_PTE_4K;
2718 		break;
2719 	}
2720 }
2721 
2722 static int xe_vma_op_commit(struct xe_vm *vm, struct xe_vma_op *op)
2723 {
2724 	int err = 0;
2725 
2726 	lockdep_assert_held_write(&vm->lock);
2727 
2728 	switch (op->base.op) {
2729 	case DRM_GPUVA_OP_MAP:
2730 		err |= xe_vm_insert_vma(vm, op->map.vma);
2731 		if (!err)
2732 			op->flags |= XE_VMA_OP_COMMITTED;
2733 		break;
2734 	case DRM_GPUVA_OP_REMAP:
2735 	{
2736 		u8 tile_present =
2737 			gpuva_to_vma(op->base.remap.unmap->va)->tile_present;
2738 
2739 		prep_vma_destroy(vm, gpuva_to_vma(op->base.remap.unmap->va),
2740 				 true);
2741 		op->flags |= XE_VMA_OP_COMMITTED;
2742 
2743 		if (op->remap.prev) {
2744 			err |= xe_vm_insert_vma(vm, op->remap.prev);
2745 			if (!err)
2746 				op->flags |= XE_VMA_OP_PREV_COMMITTED;
2747 			if (!err && op->remap.skip_prev) {
2748 				op->remap.prev->tile_present =
2749 					tile_present;
2750 				op->remap.prev = NULL;
2751 			}
2752 		}
2753 		if (op->remap.next) {
2754 			err |= xe_vm_insert_vma(vm, op->remap.next);
2755 			if (!err)
2756 				op->flags |= XE_VMA_OP_NEXT_COMMITTED;
2757 			if (!err && op->remap.skip_next) {
2758 				op->remap.next->tile_present =
2759 					tile_present;
2760 				op->remap.next = NULL;
2761 			}
2762 		}
2763 
2764 		/* Adjust for partial unbind after removing VMA from VM */
2765 		if (!err) {
2766 			op->base.remap.unmap->va->va.addr = op->remap.start;
2767 			op->base.remap.unmap->va->va.range = op->remap.range;
2768 		}
2769 		break;
2770 	}
2771 	case DRM_GPUVA_OP_UNMAP:
2772 		prep_vma_destroy(vm, gpuva_to_vma(op->base.unmap.va), true);
2773 		op->flags |= XE_VMA_OP_COMMITTED;
2774 		break;
2775 	case DRM_GPUVA_OP_PREFETCH:
2776 		op->flags |= XE_VMA_OP_COMMITTED;
2777 		break;
2778 	default:
2779 		drm_warn(&vm->xe->drm, "NOT POSSIBLE");
2780 	}
2781 
2782 	return err;
2783 }
2784 
2785 /**
2786  * xe_vma_has_default_mem_attrs - Check if a VMA has default memory attributes
2787  * @vma: Pointer to the xe_vma structure to check
2788  *
2789  * This function determines whether the given VMA (Virtual Memory Area)
2790  * has its memory attributes set to their default values. Specifically,
2791  * it checks the following conditions:
2792  *
2793  * - `atomic_access` is `DRM_XE_VMA_ATOMIC_UNDEFINED`
2794  * - `pat_index` is equal to `default_pat_index`
2795  * - `preferred_loc.devmem_fd` is `DRM_XE_PREFERRED_LOC_DEFAULT_DEVICE`
2796  * - `preferred_loc.migration_policy` is `DRM_XE_MIGRATE_ALL_PAGES`
2797  *
2798  * Return: true if all attributes are at their default values, false otherwise.
2799  */
2800 bool xe_vma_has_default_mem_attrs(struct xe_vma *vma)
2801 {
2802 	return (vma->attr.atomic_access == DRM_XE_ATOMIC_UNDEFINED &&
2803 		vma->attr.pat_index ==  vma->attr.default_pat_index &&
2804 		vma->attr.preferred_loc.devmem_fd == DRM_XE_PREFERRED_LOC_DEFAULT_DEVICE &&
2805 		vma->attr.preferred_loc.migration_policy == DRM_XE_MIGRATE_ALL_PAGES);
2806 }
2807 
2808 static int vm_bind_ioctl_ops_parse(struct xe_vm *vm, struct drm_gpuva_ops *ops,
2809 				   struct xe_vma_ops *vops)
2810 {
2811 	struct xe_device *xe = vm->xe;
2812 	struct drm_gpuva_op *__op;
2813 	struct xe_tile *tile;
2814 	u8 id, tile_mask = 0;
2815 	int err = 0;
2816 
2817 	lockdep_assert_held_write(&vm->lock);
2818 
2819 	for_each_tile(tile, vm->xe, id)
2820 		tile_mask |= 0x1 << id;
2821 
2822 	drm_gpuva_for_each_op(__op, ops) {
2823 		struct xe_vma_op *op = gpuva_op_to_vma_op(__op);
2824 		struct xe_vma *vma;
2825 		unsigned int flags = 0;
2826 
2827 		INIT_LIST_HEAD(&op->link);
2828 		list_add_tail(&op->link, &vops->list);
2829 		op->tile_mask = tile_mask;
2830 
2831 		switch (op->base.op) {
2832 		case DRM_GPUVA_OP_MAP:
2833 		{
2834 			struct xe_vma_mem_attr default_attr = {
2835 				.preferred_loc = {
2836 					.devmem_fd = DRM_XE_PREFERRED_LOC_DEFAULT_DEVICE,
2837 					.migration_policy = DRM_XE_MIGRATE_ALL_PAGES,
2838 				},
2839 				.atomic_access = DRM_XE_ATOMIC_UNDEFINED,
2840 				.default_pat_index = op->map.pat_index,
2841 				.pat_index = op->map.pat_index,
2842 			};
2843 
2844 			flags |= op->map.read_only ?
2845 				VMA_CREATE_FLAG_READ_ONLY : 0;
2846 			flags |= op->map.is_null ?
2847 				VMA_CREATE_FLAG_IS_NULL : 0;
2848 			flags |= op->map.dumpable ?
2849 				VMA_CREATE_FLAG_DUMPABLE : 0;
2850 			flags |= op->map.is_cpu_addr_mirror ?
2851 				VMA_CREATE_FLAG_IS_SYSTEM_ALLOCATOR : 0;
2852 
2853 			vma = new_vma(vm, &op->base.map, &default_attr,
2854 				      flags);
2855 			if (IS_ERR(vma))
2856 				return PTR_ERR(vma);
2857 
2858 			op->map.vma = vma;
2859 			if (((op->map.immediate || !xe_vm_in_fault_mode(vm)) &&
2860 			     !op->map.is_cpu_addr_mirror) ||
2861 			    op->map.invalidate_on_bind)
2862 				xe_vma_ops_incr_pt_update_ops(vops,
2863 							      op->tile_mask, 1);
2864 			break;
2865 		}
2866 		case DRM_GPUVA_OP_REMAP:
2867 		{
2868 			struct xe_vma *old =
2869 				gpuva_to_vma(op->base.remap.unmap->va);
2870 			bool skip = xe_vma_is_cpu_addr_mirror(old);
2871 			u64 start = xe_vma_start(old), end = xe_vma_end(old);
2872 			int num_remap_ops = 0;
2873 
2874 			if (op->base.remap.prev)
2875 				start = op->base.remap.prev->va.addr +
2876 					op->base.remap.prev->va.range;
2877 			if (op->base.remap.next)
2878 				end = op->base.remap.next->va.addr;
2879 
2880 			if (xe_vma_is_cpu_addr_mirror(old) &&
2881 			    xe_svm_has_mapping(vm, start, end)) {
2882 				if (vops->flags & XE_VMA_OPS_FLAG_MADVISE)
2883 					xe_svm_unmap_address_range(vm, start, end);
2884 				else
2885 					return -EBUSY;
2886 			}
2887 
2888 			op->remap.start = xe_vma_start(old);
2889 			op->remap.range = xe_vma_size(old);
2890 
2891 			flags |= op->base.remap.unmap->va->flags &
2892 				XE_VMA_READ_ONLY ?
2893 				VMA_CREATE_FLAG_READ_ONLY : 0;
2894 			flags |= op->base.remap.unmap->va->flags &
2895 				DRM_GPUVA_SPARSE ?
2896 				VMA_CREATE_FLAG_IS_NULL : 0;
2897 			flags |= op->base.remap.unmap->va->flags &
2898 				XE_VMA_DUMPABLE ?
2899 				VMA_CREATE_FLAG_DUMPABLE : 0;
2900 			flags |= xe_vma_is_cpu_addr_mirror(old) ?
2901 				VMA_CREATE_FLAG_IS_SYSTEM_ALLOCATOR : 0;
2902 
2903 			if (op->base.remap.prev) {
2904 				vma = new_vma(vm, op->base.remap.prev,
2905 					      &old->attr, flags);
2906 				if (IS_ERR(vma))
2907 					return PTR_ERR(vma);
2908 
2909 				op->remap.prev = vma;
2910 
2911 				/*
2912 				 * Userptr creates a new SG mapping so
2913 				 * we must also rebind.
2914 				 */
2915 				op->remap.skip_prev = skip ||
2916 					(!xe_vma_is_userptr(old) &&
2917 					IS_ALIGNED(xe_vma_end(vma),
2918 						   xe_vma_max_pte_size(old)));
2919 				if (op->remap.skip_prev) {
2920 					xe_vma_set_pte_size(vma, xe_vma_max_pte_size(old));
2921 					op->remap.range -=
2922 						xe_vma_end(vma) -
2923 						xe_vma_start(old);
2924 					op->remap.start = xe_vma_end(vma);
2925 					vm_dbg(&xe->drm, "REMAP:SKIP_PREV: addr=0x%016llx, range=0x%016llx",
2926 					       (ULL)op->remap.start,
2927 					       (ULL)op->remap.range);
2928 				} else {
2929 					num_remap_ops++;
2930 				}
2931 			}
2932 
2933 			if (op->base.remap.next) {
2934 				vma = new_vma(vm, op->base.remap.next,
2935 					      &old->attr, flags);
2936 				if (IS_ERR(vma))
2937 					return PTR_ERR(vma);
2938 
2939 				op->remap.next = vma;
2940 
2941 				/*
2942 				 * Userptr creates a new SG mapping so
2943 				 * we must also rebind.
2944 				 */
2945 				op->remap.skip_next = skip ||
2946 					(!xe_vma_is_userptr(old) &&
2947 					IS_ALIGNED(xe_vma_start(vma),
2948 						   xe_vma_max_pte_size(old)));
2949 				if (op->remap.skip_next) {
2950 					xe_vma_set_pte_size(vma, xe_vma_max_pte_size(old));
2951 					op->remap.range -=
2952 						xe_vma_end(old) -
2953 						xe_vma_start(vma);
2954 					vm_dbg(&xe->drm, "REMAP:SKIP_NEXT: addr=0x%016llx, range=0x%016llx",
2955 					       (ULL)op->remap.start,
2956 					       (ULL)op->remap.range);
2957 				} else {
2958 					num_remap_ops++;
2959 				}
2960 			}
2961 			if (!skip)
2962 				num_remap_ops++;
2963 
2964 			xe_vma_ops_incr_pt_update_ops(vops, op->tile_mask, num_remap_ops);
2965 			break;
2966 		}
2967 		case DRM_GPUVA_OP_UNMAP:
2968 			vma = gpuva_to_vma(op->base.unmap.va);
2969 
2970 			if (xe_vma_is_cpu_addr_mirror(vma) &&
2971 			    xe_svm_has_mapping(vm, xe_vma_start(vma),
2972 					       xe_vma_end(vma)))
2973 				return -EBUSY;
2974 
2975 			if (!xe_vma_is_cpu_addr_mirror(vma))
2976 				xe_vma_ops_incr_pt_update_ops(vops, op->tile_mask, 1);
2977 			break;
2978 		case DRM_GPUVA_OP_PREFETCH:
2979 			vma = gpuva_to_vma(op->base.prefetch.va);
2980 
2981 			if (xe_vma_is_userptr(vma)) {
2982 				err = xe_vma_userptr_pin_pages(to_userptr_vma(vma));
2983 				if (err)
2984 					return err;
2985 			}
2986 
2987 			if (xe_vma_is_cpu_addr_mirror(vma))
2988 				xe_vma_ops_incr_pt_update_ops(vops, op->tile_mask,
2989 							      op->prefetch_range.ranges_count);
2990 			else
2991 				xe_vma_ops_incr_pt_update_ops(vops, op->tile_mask, 1);
2992 
2993 			break;
2994 		default:
2995 			drm_warn(&vm->xe->drm, "NOT POSSIBLE");
2996 		}
2997 
2998 		err = xe_vma_op_commit(vm, op);
2999 		if (err)
3000 			return err;
3001 	}
3002 
3003 	return 0;
3004 }
3005 
3006 static void xe_vma_op_unwind(struct xe_vm *vm, struct xe_vma_op *op,
3007 			     bool post_commit, bool prev_post_commit,
3008 			     bool next_post_commit)
3009 {
3010 	lockdep_assert_held_write(&vm->lock);
3011 
3012 	switch (op->base.op) {
3013 	case DRM_GPUVA_OP_MAP:
3014 		if (op->map.vma) {
3015 			prep_vma_destroy(vm, op->map.vma, post_commit);
3016 			xe_vma_destroy_unlocked(op->map.vma);
3017 		}
3018 		break;
3019 	case DRM_GPUVA_OP_UNMAP:
3020 	{
3021 		struct xe_vma *vma = gpuva_to_vma(op->base.unmap.va);
3022 
3023 		if (vma) {
3024 			down_read(&vm->userptr.notifier_lock);
3025 			vma->gpuva.flags &= ~XE_VMA_DESTROYED;
3026 			up_read(&vm->userptr.notifier_lock);
3027 			if (post_commit)
3028 				xe_vm_insert_vma(vm, vma);
3029 		}
3030 		break;
3031 	}
3032 	case DRM_GPUVA_OP_REMAP:
3033 	{
3034 		struct xe_vma *vma = gpuva_to_vma(op->base.remap.unmap->va);
3035 
3036 		if (op->remap.prev) {
3037 			prep_vma_destroy(vm, op->remap.prev, prev_post_commit);
3038 			xe_vma_destroy_unlocked(op->remap.prev);
3039 		}
3040 		if (op->remap.next) {
3041 			prep_vma_destroy(vm, op->remap.next, next_post_commit);
3042 			xe_vma_destroy_unlocked(op->remap.next);
3043 		}
3044 		if (vma) {
3045 			down_read(&vm->userptr.notifier_lock);
3046 			vma->gpuva.flags &= ~XE_VMA_DESTROYED;
3047 			up_read(&vm->userptr.notifier_lock);
3048 			if (post_commit)
3049 				xe_vm_insert_vma(vm, vma);
3050 		}
3051 		break;
3052 	}
3053 	case DRM_GPUVA_OP_PREFETCH:
3054 		/* Nothing to do */
3055 		break;
3056 	default:
3057 		drm_warn(&vm->xe->drm, "NOT POSSIBLE");
3058 	}
3059 }
3060 
3061 static void vm_bind_ioctl_ops_unwind(struct xe_vm *vm,
3062 				     struct drm_gpuva_ops **ops,
3063 				     int num_ops_list)
3064 {
3065 	int i;
3066 
3067 	for (i = num_ops_list - 1; i >= 0; --i) {
3068 		struct drm_gpuva_ops *__ops = ops[i];
3069 		struct drm_gpuva_op *__op;
3070 
3071 		if (!__ops)
3072 			continue;
3073 
3074 		drm_gpuva_for_each_op_reverse(__op, __ops) {
3075 			struct xe_vma_op *op = gpuva_op_to_vma_op(__op);
3076 
3077 			xe_vma_op_unwind(vm, op,
3078 					 op->flags & XE_VMA_OP_COMMITTED,
3079 					 op->flags & XE_VMA_OP_PREV_COMMITTED,
3080 					 op->flags & XE_VMA_OP_NEXT_COMMITTED);
3081 		}
3082 	}
3083 }
3084 
3085 static int vma_lock_and_validate(struct drm_exec *exec, struct xe_vma *vma,
3086 				 bool validate)
3087 {
3088 	struct xe_bo *bo = xe_vma_bo(vma);
3089 	struct xe_vm *vm = xe_vma_vm(vma);
3090 	int err = 0;
3091 
3092 	if (bo) {
3093 		if (!bo->vm)
3094 			err = drm_exec_lock_obj(exec, &bo->ttm.base);
3095 		if (!err && validate)
3096 			err = xe_bo_validate(bo, vm,
3097 					     !xe_vm_in_preempt_fence_mode(vm));
3098 	}
3099 
3100 	return err;
3101 }
3102 
3103 static int check_ufence(struct xe_vma *vma)
3104 {
3105 	if (vma->ufence) {
3106 		struct xe_user_fence * const f = vma->ufence;
3107 
3108 		if (!xe_sync_ufence_get_status(f))
3109 			return -EBUSY;
3110 
3111 		vma->ufence = NULL;
3112 		xe_sync_ufence_put(f);
3113 	}
3114 
3115 	return 0;
3116 }
3117 
3118 static int prefetch_ranges(struct xe_vm *vm, struct xe_vma_op *op)
3119 {
3120 	bool devmem_possible = IS_DGFX(vm->xe) && IS_ENABLED(CONFIG_DRM_XE_PAGEMAP);
3121 	struct xe_vma *vma = gpuva_to_vma(op->base.prefetch.va);
3122 	struct xe_tile *tile = op->prefetch_range.tile;
3123 	int err = 0;
3124 
3125 	struct xe_svm_range *svm_range;
3126 	struct drm_gpusvm_ctx ctx = {};
3127 	unsigned long i;
3128 
3129 	if (!xe_vma_is_cpu_addr_mirror(vma))
3130 		return 0;
3131 
3132 	ctx.read_only = xe_vma_read_only(vma);
3133 	ctx.devmem_possible = devmem_possible;
3134 	ctx.check_pages_threshold = devmem_possible ? SZ_64K : 0;
3135 
3136 	/* TODO: Threading the migration */
3137 	xa_for_each(&op->prefetch_range.range, i, svm_range) {
3138 		if (!tile)
3139 			xe_svm_range_migrate_to_smem(vm, svm_range);
3140 
3141 		if (xe_svm_range_needs_migrate_to_vram(svm_range, vma, !!tile)) {
3142 			err = xe_svm_alloc_vram(tile, svm_range, &ctx);
3143 			if (err) {
3144 				drm_dbg(&vm->xe->drm, "VRAM allocation failed, retry from userspace, asid=%u, gpusvm=%p, errno=%pe\n",
3145 					vm->usm.asid, &vm->svm.gpusvm, ERR_PTR(err));
3146 				return -ENODATA;
3147 			}
3148 			xe_svm_range_debug(svm_range, "PREFETCH - RANGE MIGRATED TO VRAM");
3149 		}
3150 
3151 		err = xe_svm_range_get_pages(vm, svm_range, &ctx);
3152 		if (err) {
3153 			drm_dbg(&vm->xe->drm, "Get pages failed, asid=%u, gpusvm=%p, errno=%pe\n",
3154 				vm->usm.asid, &vm->svm.gpusvm, ERR_PTR(err));
3155 			if (err == -EOPNOTSUPP || err == -EFAULT || err == -EPERM)
3156 				err = -ENODATA;
3157 			return err;
3158 		}
3159 		xe_svm_range_debug(svm_range, "PREFETCH - RANGE GET PAGES DONE");
3160 	}
3161 
3162 	return err;
3163 }
3164 
3165 static int op_lock_and_prep(struct drm_exec *exec, struct xe_vm *vm,
3166 			    struct xe_vma_op *op)
3167 {
3168 	int err = 0;
3169 
3170 	switch (op->base.op) {
3171 	case DRM_GPUVA_OP_MAP:
3172 		if (!op->map.invalidate_on_bind)
3173 			err = vma_lock_and_validate(exec, op->map.vma,
3174 						    !xe_vm_in_fault_mode(vm) ||
3175 						    op->map.immediate);
3176 		break;
3177 	case DRM_GPUVA_OP_REMAP:
3178 		err = check_ufence(gpuva_to_vma(op->base.remap.unmap->va));
3179 		if (err)
3180 			break;
3181 
3182 		err = vma_lock_and_validate(exec,
3183 					    gpuva_to_vma(op->base.remap.unmap->va),
3184 					    false);
3185 		if (!err && op->remap.prev)
3186 			err = vma_lock_and_validate(exec, op->remap.prev, true);
3187 		if (!err && op->remap.next)
3188 			err = vma_lock_and_validate(exec, op->remap.next, true);
3189 		break;
3190 	case DRM_GPUVA_OP_UNMAP:
3191 		err = check_ufence(gpuva_to_vma(op->base.unmap.va));
3192 		if (err)
3193 			break;
3194 
3195 		err = vma_lock_and_validate(exec,
3196 					    gpuva_to_vma(op->base.unmap.va),
3197 					    false);
3198 		break;
3199 	case DRM_GPUVA_OP_PREFETCH:
3200 	{
3201 		struct xe_vma *vma = gpuva_to_vma(op->base.prefetch.va);
3202 		u32 region;
3203 
3204 		if (!xe_vma_is_cpu_addr_mirror(vma)) {
3205 			region = op->prefetch.region;
3206 			xe_assert(vm->xe, region == DRM_XE_CONSULT_MEM_ADVISE_PREF_LOC ||
3207 				  region <= ARRAY_SIZE(region_to_mem_type));
3208 		}
3209 
3210 		err = vma_lock_and_validate(exec,
3211 					    gpuva_to_vma(op->base.prefetch.va),
3212 					    false);
3213 		if (!err && !xe_vma_has_no_bo(vma))
3214 			err = xe_bo_migrate(xe_vma_bo(vma),
3215 					    region_to_mem_type[region]);
3216 		break;
3217 	}
3218 	default:
3219 		drm_warn(&vm->xe->drm, "NOT POSSIBLE");
3220 	}
3221 
3222 	return err;
3223 }
3224 
3225 static int vm_bind_ioctl_ops_prefetch_ranges(struct xe_vm *vm, struct xe_vma_ops *vops)
3226 {
3227 	struct xe_vma_op *op;
3228 	int err;
3229 
3230 	if (!(vops->flags & XE_VMA_OPS_FLAG_HAS_SVM_PREFETCH))
3231 		return 0;
3232 
3233 	list_for_each_entry(op, &vops->list, link) {
3234 		if (op->base.op  == DRM_GPUVA_OP_PREFETCH) {
3235 			err = prefetch_ranges(vm, op);
3236 			if (err)
3237 				return err;
3238 		}
3239 	}
3240 
3241 	return 0;
3242 }
3243 
3244 static int vm_bind_ioctl_ops_lock_and_prep(struct drm_exec *exec,
3245 					   struct xe_vm *vm,
3246 					   struct xe_vma_ops *vops)
3247 {
3248 	struct xe_vma_op *op;
3249 	int err;
3250 
3251 	err = drm_exec_lock_obj(exec, xe_vm_obj(vm));
3252 	if (err)
3253 		return err;
3254 
3255 	list_for_each_entry(op, &vops->list, link) {
3256 		err = op_lock_and_prep(exec, vm, op);
3257 		if (err)
3258 			return err;
3259 	}
3260 
3261 #ifdef TEST_VM_OPS_ERROR
3262 	if (vops->inject_error &&
3263 	    vm->xe->vm_inject_error_position == FORCE_OP_ERROR_LOCK)
3264 		return -ENOSPC;
3265 #endif
3266 
3267 	return 0;
3268 }
3269 
3270 static void op_trace(struct xe_vma_op *op)
3271 {
3272 	switch (op->base.op) {
3273 	case DRM_GPUVA_OP_MAP:
3274 		trace_xe_vma_bind(op->map.vma);
3275 		break;
3276 	case DRM_GPUVA_OP_REMAP:
3277 		trace_xe_vma_unbind(gpuva_to_vma(op->base.remap.unmap->va));
3278 		if (op->remap.prev)
3279 			trace_xe_vma_bind(op->remap.prev);
3280 		if (op->remap.next)
3281 			trace_xe_vma_bind(op->remap.next);
3282 		break;
3283 	case DRM_GPUVA_OP_UNMAP:
3284 		trace_xe_vma_unbind(gpuva_to_vma(op->base.unmap.va));
3285 		break;
3286 	case DRM_GPUVA_OP_PREFETCH:
3287 		trace_xe_vma_bind(gpuva_to_vma(op->base.prefetch.va));
3288 		break;
3289 	case DRM_GPUVA_OP_DRIVER:
3290 		break;
3291 	default:
3292 		XE_WARN_ON("NOT POSSIBLE");
3293 	}
3294 }
3295 
3296 static void trace_xe_vm_ops_execute(struct xe_vma_ops *vops)
3297 {
3298 	struct xe_vma_op *op;
3299 
3300 	list_for_each_entry(op, &vops->list, link)
3301 		op_trace(op);
3302 }
3303 
3304 static int vm_ops_setup_tile_args(struct xe_vm *vm, struct xe_vma_ops *vops)
3305 {
3306 	struct xe_exec_queue *q = vops->q;
3307 	struct xe_tile *tile;
3308 	int number_tiles = 0;
3309 	u8 id;
3310 
3311 	for_each_tile(tile, vm->xe, id) {
3312 		if (vops->pt_update_ops[id].num_ops)
3313 			++number_tiles;
3314 
3315 		if (vops->pt_update_ops[id].q)
3316 			continue;
3317 
3318 		if (q) {
3319 			vops->pt_update_ops[id].q = q;
3320 			if (vm->pt_root[id] && !list_empty(&q->multi_gt_list))
3321 				q = list_next_entry(q, multi_gt_list);
3322 		} else {
3323 			vops->pt_update_ops[id].q = vm->q[id];
3324 		}
3325 	}
3326 
3327 	return number_tiles;
3328 }
3329 
3330 static struct dma_fence *ops_execute(struct xe_vm *vm,
3331 				     struct xe_vma_ops *vops)
3332 {
3333 	struct xe_tile *tile;
3334 	struct dma_fence *fence = NULL;
3335 	struct dma_fence **fences = NULL;
3336 	struct dma_fence_array *cf = NULL;
3337 	int number_tiles = 0, current_fence = 0, err;
3338 	u8 id;
3339 
3340 	number_tiles = vm_ops_setup_tile_args(vm, vops);
3341 	if (number_tiles == 0)
3342 		return ERR_PTR(-ENODATA);
3343 
3344 	if (number_tiles > 1) {
3345 		fences = kmalloc_array(number_tiles, sizeof(*fences),
3346 				       GFP_KERNEL);
3347 		if (!fences) {
3348 			fence = ERR_PTR(-ENOMEM);
3349 			goto err_trace;
3350 		}
3351 	}
3352 
3353 	for_each_tile(tile, vm->xe, id) {
3354 		if (!vops->pt_update_ops[id].num_ops)
3355 			continue;
3356 
3357 		err = xe_pt_update_ops_prepare(tile, vops);
3358 		if (err) {
3359 			fence = ERR_PTR(err);
3360 			goto err_out;
3361 		}
3362 	}
3363 
3364 	trace_xe_vm_ops_execute(vops);
3365 
3366 	for_each_tile(tile, vm->xe, id) {
3367 		if (!vops->pt_update_ops[id].num_ops)
3368 			continue;
3369 
3370 		fence = xe_pt_update_ops_run(tile, vops);
3371 		if (IS_ERR(fence))
3372 			goto err_out;
3373 
3374 		if (fences)
3375 			fences[current_fence++] = fence;
3376 	}
3377 
3378 	if (fences) {
3379 		cf = dma_fence_array_create(number_tiles, fences,
3380 					    vm->composite_fence_ctx,
3381 					    vm->composite_fence_seqno++,
3382 					    false);
3383 		if (!cf) {
3384 			--vm->composite_fence_seqno;
3385 			fence = ERR_PTR(-ENOMEM);
3386 			goto err_out;
3387 		}
3388 		fence = &cf->base;
3389 	}
3390 
3391 	for_each_tile(tile, vm->xe, id) {
3392 		if (!vops->pt_update_ops[id].num_ops)
3393 			continue;
3394 
3395 		xe_pt_update_ops_fini(tile, vops);
3396 	}
3397 
3398 	return fence;
3399 
3400 err_out:
3401 	for_each_tile(tile, vm->xe, id) {
3402 		if (!vops->pt_update_ops[id].num_ops)
3403 			continue;
3404 
3405 		xe_pt_update_ops_abort(tile, vops);
3406 	}
3407 	while (current_fence)
3408 		dma_fence_put(fences[--current_fence]);
3409 	kfree(fences);
3410 	kfree(cf);
3411 
3412 err_trace:
3413 	trace_xe_vm_ops_fail(vm);
3414 	return fence;
3415 }
3416 
3417 static void vma_add_ufence(struct xe_vma *vma, struct xe_user_fence *ufence)
3418 {
3419 	if (vma->ufence)
3420 		xe_sync_ufence_put(vma->ufence);
3421 	vma->ufence = __xe_sync_ufence_get(ufence);
3422 }
3423 
3424 static void op_add_ufence(struct xe_vm *vm, struct xe_vma_op *op,
3425 			  struct xe_user_fence *ufence)
3426 {
3427 	switch (op->base.op) {
3428 	case DRM_GPUVA_OP_MAP:
3429 		vma_add_ufence(op->map.vma, ufence);
3430 		break;
3431 	case DRM_GPUVA_OP_REMAP:
3432 		if (op->remap.prev)
3433 			vma_add_ufence(op->remap.prev, ufence);
3434 		if (op->remap.next)
3435 			vma_add_ufence(op->remap.next, ufence);
3436 		break;
3437 	case DRM_GPUVA_OP_UNMAP:
3438 		break;
3439 	case DRM_GPUVA_OP_PREFETCH:
3440 		vma_add_ufence(gpuva_to_vma(op->base.prefetch.va), ufence);
3441 		break;
3442 	default:
3443 		drm_warn(&vm->xe->drm, "NOT POSSIBLE");
3444 	}
3445 }
3446 
3447 static void vm_bind_ioctl_ops_fini(struct xe_vm *vm, struct xe_vma_ops *vops,
3448 				   struct dma_fence *fence)
3449 {
3450 	struct xe_exec_queue *wait_exec_queue = to_wait_exec_queue(vm, vops->q);
3451 	struct xe_user_fence *ufence;
3452 	struct xe_vma_op *op;
3453 	int i;
3454 
3455 	ufence = find_ufence_get(vops->syncs, vops->num_syncs);
3456 	list_for_each_entry(op, &vops->list, link) {
3457 		if (ufence)
3458 			op_add_ufence(vm, op, ufence);
3459 
3460 		if (op->base.op == DRM_GPUVA_OP_UNMAP)
3461 			xe_vma_destroy(gpuva_to_vma(op->base.unmap.va), fence);
3462 		else if (op->base.op == DRM_GPUVA_OP_REMAP)
3463 			xe_vma_destroy(gpuva_to_vma(op->base.remap.unmap->va),
3464 				       fence);
3465 	}
3466 	if (ufence)
3467 		xe_sync_ufence_put(ufence);
3468 	if (fence) {
3469 		for (i = 0; i < vops->num_syncs; i++)
3470 			xe_sync_entry_signal(vops->syncs + i, fence);
3471 		xe_exec_queue_last_fence_set(wait_exec_queue, vm, fence);
3472 	}
3473 }
3474 
3475 static struct dma_fence *vm_bind_ioctl_ops_execute(struct xe_vm *vm,
3476 						   struct xe_vma_ops *vops)
3477 {
3478 	struct drm_exec exec;
3479 	struct dma_fence *fence;
3480 	int err;
3481 
3482 	lockdep_assert_held_write(&vm->lock);
3483 
3484 	drm_exec_init(&exec, DRM_EXEC_INTERRUPTIBLE_WAIT |
3485 		      DRM_EXEC_IGNORE_DUPLICATES, 0);
3486 	drm_exec_until_all_locked(&exec) {
3487 		err = vm_bind_ioctl_ops_lock_and_prep(&exec, vm, vops);
3488 		drm_exec_retry_on_contention(&exec);
3489 		if (err) {
3490 			fence = ERR_PTR(err);
3491 			goto unlock;
3492 		}
3493 
3494 		fence = ops_execute(vm, vops);
3495 		if (IS_ERR(fence)) {
3496 			if (PTR_ERR(fence) == -ENODATA)
3497 				vm_bind_ioctl_ops_fini(vm, vops, NULL);
3498 			goto unlock;
3499 		}
3500 
3501 		vm_bind_ioctl_ops_fini(vm, vops, fence);
3502 	}
3503 
3504 unlock:
3505 	drm_exec_fini(&exec);
3506 	return fence;
3507 }
3508 ALLOW_ERROR_INJECTION(vm_bind_ioctl_ops_execute, ERRNO);
3509 
3510 #define SUPPORTED_FLAGS_STUB  \
3511 	(DRM_XE_VM_BIND_FLAG_READONLY | \
3512 	 DRM_XE_VM_BIND_FLAG_IMMEDIATE | \
3513 	 DRM_XE_VM_BIND_FLAG_NULL | \
3514 	 DRM_XE_VM_BIND_FLAG_DUMPABLE | \
3515 	 DRM_XE_VM_BIND_FLAG_CHECK_PXP | \
3516 	 DRM_XE_VM_BIND_FLAG_CPU_ADDR_MIRROR)
3517 
3518 #ifdef TEST_VM_OPS_ERROR
3519 #define SUPPORTED_FLAGS	(SUPPORTED_FLAGS_STUB | FORCE_OP_ERROR)
3520 #else
3521 #define SUPPORTED_FLAGS	SUPPORTED_FLAGS_STUB
3522 #endif
3523 
3524 #define XE_64K_PAGE_MASK 0xffffull
3525 #define ALL_DRM_XE_SYNCS_FLAGS (DRM_XE_SYNCS_FLAG_WAIT_FOR_OP)
3526 
3527 static int vm_bind_ioctl_check_args(struct xe_device *xe, struct xe_vm *vm,
3528 				    struct drm_xe_vm_bind *args,
3529 				    struct drm_xe_vm_bind_op **bind_ops)
3530 {
3531 	int err;
3532 	int i;
3533 
3534 	if (XE_IOCTL_DBG(xe, args->pad || args->pad2) ||
3535 	    XE_IOCTL_DBG(xe, args->reserved[0] || args->reserved[1]))
3536 		return -EINVAL;
3537 
3538 	if (XE_IOCTL_DBG(xe, args->extensions))
3539 		return -EINVAL;
3540 
3541 	if (args->num_binds > 1) {
3542 		u64 __user *bind_user =
3543 			u64_to_user_ptr(args->vector_of_binds);
3544 
3545 		*bind_ops = kvmalloc_array(args->num_binds,
3546 					   sizeof(struct drm_xe_vm_bind_op),
3547 					   GFP_KERNEL | __GFP_ACCOUNT |
3548 					   __GFP_RETRY_MAYFAIL | __GFP_NOWARN);
3549 		if (!*bind_ops)
3550 			return args->num_binds > 1 ? -ENOBUFS : -ENOMEM;
3551 
3552 		err = copy_from_user(*bind_ops, bind_user,
3553 				     sizeof(struct drm_xe_vm_bind_op) *
3554 				     args->num_binds);
3555 		if (XE_IOCTL_DBG(xe, err)) {
3556 			err = -EFAULT;
3557 			goto free_bind_ops;
3558 		}
3559 	} else {
3560 		*bind_ops = &args->bind;
3561 	}
3562 
3563 	for (i = 0; i < args->num_binds; ++i) {
3564 		u64 range = (*bind_ops)[i].range;
3565 		u64 addr = (*bind_ops)[i].addr;
3566 		u32 op = (*bind_ops)[i].op;
3567 		u32 flags = (*bind_ops)[i].flags;
3568 		u32 obj = (*bind_ops)[i].obj;
3569 		u64 obj_offset = (*bind_ops)[i].obj_offset;
3570 		u32 prefetch_region = (*bind_ops)[i].prefetch_mem_region_instance;
3571 		bool is_null = flags & DRM_XE_VM_BIND_FLAG_NULL;
3572 		bool is_cpu_addr_mirror = flags &
3573 			DRM_XE_VM_BIND_FLAG_CPU_ADDR_MIRROR;
3574 		u16 pat_index = (*bind_ops)[i].pat_index;
3575 		u16 coh_mode;
3576 
3577 		if (XE_IOCTL_DBG(xe, is_cpu_addr_mirror &&
3578 				 (!xe_vm_in_fault_mode(vm) ||
3579 				 !IS_ENABLED(CONFIG_DRM_XE_GPUSVM)))) {
3580 			err = -EINVAL;
3581 			goto free_bind_ops;
3582 		}
3583 
3584 		if (XE_IOCTL_DBG(xe, pat_index >= xe->pat.n_entries)) {
3585 			err = -EINVAL;
3586 			goto free_bind_ops;
3587 		}
3588 
3589 		pat_index = array_index_nospec(pat_index, xe->pat.n_entries);
3590 		(*bind_ops)[i].pat_index = pat_index;
3591 		coh_mode = xe_pat_index_get_coh_mode(xe, pat_index);
3592 		if (XE_IOCTL_DBG(xe, !coh_mode)) { /* hw reserved */
3593 			err = -EINVAL;
3594 			goto free_bind_ops;
3595 		}
3596 
3597 		if (XE_WARN_ON(coh_mode > XE_COH_AT_LEAST_1WAY)) {
3598 			err = -EINVAL;
3599 			goto free_bind_ops;
3600 		}
3601 
3602 		if (XE_IOCTL_DBG(xe, op > DRM_XE_VM_BIND_OP_PREFETCH) ||
3603 		    XE_IOCTL_DBG(xe, flags & ~SUPPORTED_FLAGS) ||
3604 		    XE_IOCTL_DBG(xe, obj && (is_null || is_cpu_addr_mirror)) ||
3605 		    XE_IOCTL_DBG(xe, obj_offset && (is_null ||
3606 						    is_cpu_addr_mirror)) ||
3607 		    XE_IOCTL_DBG(xe, op != DRM_XE_VM_BIND_OP_MAP &&
3608 				 (is_null || is_cpu_addr_mirror)) ||
3609 		    XE_IOCTL_DBG(xe, !obj &&
3610 				 op == DRM_XE_VM_BIND_OP_MAP &&
3611 				 !is_null && !is_cpu_addr_mirror) ||
3612 		    XE_IOCTL_DBG(xe, !obj &&
3613 				 op == DRM_XE_VM_BIND_OP_UNMAP_ALL) ||
3614 		    XE_IOCTL_DBG(xe, addr &&
3615 				 op == DRM_XE_VM_BIND_OP_UNMAP_ALL) ||
3616 		    XE_IOCTL_DBG(xe, range &&
3617 				 op == DRM_XE_VM_BIND_OP_UNMAP_ALL) ||
3618 		    XE_IOCTL_DBG(xe, obj &&
3619 				 op == DRM_XE_VM_BIND_OP_MAP_USERPTR) ||
3620 		    XE_IOCTL_DBG(xe, coh_mode == XE_COH_NONE &&
3621 				 op == DRM_XE_VM_BIND_OP_MAP_USERPTR) ||
3622 		    XE_IOCTL_DBG(xe, obj &&
3623 				 op == DRM_XE_VM_BIND_OP_PREFETCH) ||
3624 		    XE_IOCTL_DBG(xe, prefetch_region &&
3625 				 op != DRM_XE_VM_BIND_OP_PREFETCH) ||
3626 		    XE_IOCTL_DBG(xe,  (prefetch_region != DRM_XE_CONSULT_MEM_ADVISE_PREF_LOC &&
3627 				       !(BIT(prefetch_region) & xe->info.mem_region_mask))) ||
3628 		    XE_IOCTL_DBG(xe, obj &&
3629 				 op == DRM_XE_VM_BIND_OP_UNMAP)) {
3630 			err = -EINVAL;
3631 			goto free_bind_ops;
3632 		}
3633 
3634 		if (XE_IOCTL_DBG(xe, obj_offset & ~PAGE_MASK) ||
3635 		    XE_IOCTL_DBG(xe, addr & ~PAGE_MASK) ||
3636 		    XE_IOCTL_DBG(xe, range & ~PAGE_MASK) ||
3637 		    XE_IOCTL_DBG(xe, !range &&
3638 				 op != DRM_XE_VM_BIND_OP_UNMAP_ALL)) {
3639 			err = -EINVAL;
3640 			goto free_bind_ops;
3641 		}
3642 	}
3643 
3644 	return 0;
3645 
3646 free_bind_ops:
3647 	if (args->num_binds > 1)
3648 		kvfree(*bind_ops);
3649 	*bind_ops = NULL;
3650 	return err;
3651 }
3652 
3653 static int vm_bind_ioctl_signal_fences(struct xe_vm *vm,
3654 				       struct xe_exec_queue *q,
3655 				       struct xe_sync_entry *syncs,
3656 				       int num_syncs)
3657 {
3658 	struct dma_fence *fence;
3659 	int i, err = 0;
3660 
3661 	fence = xe_sync_in_fence_get(syncs, num_syncs,
3662 				     to_wait_exec_queue(vm, q), vm);
3663 	if (IS_ERR(fence))
3664 		return PTR_ERR(fence);
3665 
3666 	for (i = 0; i < num_syncs; i++)
3667 		xe_sync_entry_signal(&syncs[i], fence);
3668 
3669 	xe_exec_queue_last_fence_set(to_wait_exec_queue(vm, q), vm,
3670 				     fence);
3671 	dma_fence_put(fence);
3672 
3673 	return err;
3674 }
3675 
3676 static void xe_vma_ops_init(struct xe_vma_ops *vops, struct xe_vm *vm,
3677 			    struct xe_exec_queue *q,
3678 			    struct xe_sync_entry *syncs, u32 num_syncs)
3679 {
3680 	memset(vops, 0, sizeof(*vops));
3681 	INIT_LIST_HEAD(&vops->list);
3682 	vops->vm = vm;
3683 	vops->q = q;
3684 	vops->syncs = syncs;
3685 	vops->num_syncs = num_syncs;
3686 	vops->flags = 0;
3687 }
3688 
3689 static int xe_vm_bind_ioctl_validate_bo(struct xe_device *xe, struct xe_bo *bo,
3690 					u64 addr, u64 range, u64 obj_offset,
3691 					u16 pat_index, u32 op, u32 bind_flags)
3692 {
3693 	u16 coh_mode;
3694 
3695 	if (XE_IOCTL_DBG(xe, range > xe_bo_size(bo)) ||
3696 	    XE_IOCTL_DBG(xe, obj_offset >
3697 			 xe_bo_size(bo) - range)) {
3698 		return -EINVAL;
3699 	}
3700 
3701 	/*
3702 	 * Some platforms require 64k VM_BIND alignment,
3703 	 * specifically those with XE_VRAM_FLAGS_NEED64K.
3704 	 *
3705 	 * Other platforms may have BO's set to 64k physical placement,
3706 	 * but can be mapped at 4k offsets anyway. This check is only
3707 	 * there for the former case.
3708 	 */
3709 	if ((bo->flags & XE_BO_FLAG_INTERNAL_64K) &&
3710 	    (xe->info.vram_flags & XE_VRAM_FLAGS_NEED64K)) {
3711 		if (XE_IOCTL_DBG(xe, obj_offset &
3712 				 XE_64K_PAGE_MASK) ||
3713 		    XE_IOCTL_DBG(xe, addr & XE_64K_PAGE_MASK) ||
3714 		    XE_IOCTL_DBG(xe, range & XE_64K_PAGE_MASK)) {
3715 			return -EINVAL;
3716 		}
3717 	}
3718 
3719 	coh_mode = xe_pat_index_get_coh_mode(xe, pat_index);
3720 	if (bo->cpu_caching) {
3721 		if (XE_IOCTL_DBG(xe, coh_mode == XE_COH_NONE &&
3722 				 bo->cpu_caching == DRM_XE_GEM_CPU_CACHING_WB)) {
3723 			return -EINVAL;
3724 		}
3725 	} else if (XE_IOCTL_DBG(xe, coh_mode == XE_COH_NONE)) {
3726 		/*
3727 		 * Imported dma-buf from a different device should
3728 		 * require 1way or 2way coherency since we don't know
3729 		 * how it was mapped on the CPU. Just assume is it
3730 		 * potentially cached on CPU side.
3731 		 */
3732 		return -EINVAL;
3733 	}
3734 
3735 	/* If a BO is protected it can only be mapped if the key is still valid */
3736 	if ((bind_flags & DRM_XE_VM_BIND_FLAG_CHECK_PXP) && xe_bo_is_protected(bo) &&
3737 	    op != DRM_XE_VM_BIND_OP_UNMAP && op != DRM_XE_VM_BIND_OP_UNMAP_ALL)
3738 		if (XE_IOCTL_DBG(xe, xe_pxp_bo_key_check(xe->pxp, bo) != 0))
3739 			return -ENOEXEC;
3740 
3741 	return 0;
3742 }
3743 
3744 int xe_vm_bind_ioctl(struct drm_device *dev, void *data, struct drm_file *file)
3745 {
3746 	struct xe_device *xe = to_xe_device(dev);
3747 	struct xe_file *xef = to_xe_file(file);
3748 	struct drm_xe_vm_bind *args = data;
3749 	struct drm_xe_sync __user *syncs_user;
3750 	struct xe_bo **bos = NULL;
3751 	struct drm_gpuva_ops **ops = NULL;
3752 	struct xe_vm *vm;
3753 	struct xe_exec_queue *q = NULL;
3754 	u32 num_syncs, num_ufence = 0;
3755 	struct xe_sync_entry *syncs = NULL;
3756 	struct drm_xe_vm_bind_op *bind_ops = NULL;
3757 	struct xe_vma_ops vops;
3758 	struct dma_fence *fence;
3759 	int err;
3760 	int i;
3761 
3762 	vm = xe_vm_lookup(xef, args->vm_id);
3763 	if (XE_IOCTL_DBG(xe, !vm))
3764 		return -EINVAL;
3765 
3766 	err = vm_bind_ioctl_check_args(xe, vm, args, &bind_ops);
3767 	if (err)
3768 		goto put_vm;
3769 
3770 	if (args->exec_queue_id) {
3771 		q = xe_exec_queue_lookup(xef, args->exec_queue_id);
3772 		if (XE_IOCTL_DBG(xe, !q)) {
3773 			err = -ENOENT;
3774 			goto free_bind_ops;
3775 		}
3776 
3777 		if (XE_IOCTL_DBG(xe, !(q->flags & EXEC_QUEUE_FLAG_VM))) {
3778 			err = -EINVAL;
3779 			goto put_exec_queue;
3780 		}
3781 	}
3782 
3783 	/* Ensure all UNMAPs visible */
3784 	xe_svm_flush(vm);
3785 
3786 	err = down_write_killable(&vm->lock);
3787 	if (err)
3788 		goto put_exec_queue;
3789 
3790 	if (XE_IOCTL_DBG(xe, xe_vm_is_closed_or_banned(vm))) {
3791 		err = -ENOENT;
3792 		goto release_vm_lock;
3793 	}
3794 
3795 	for (i = 0; i < args->num_binds; ++i) {
3796 		u64 range = bind_ops[i].range;
3797 		u64 addr = bind_ops[i].addr;
3798 
3799 		if (XE_IOCTL_DBG(xe, range > vm->size) ||
3800 		    XE_IOCTL_DBG(xe, addr > vm->size - range)) {
3801 			err = -EINVAL;
3802 			goto release_vm_lock;
3803 		}
3804 	}
3805 
3806 	if (args->num_binds) {
3807 		bos = kvcalloc(args->num_binds, sizeof(*bos),
3808 			       GFP_KERNEL | __GFP_ACCOUNT |
3809 			       __GFP_RETRY_MAYFAIL | __GFP_NOWARN);
3810 		if (!bos) {
3811 			err = -ENOMEM;
3812 			goto release_vm_lock;
3813 		}
3814 
3815 		ops = kvcalloc(args->num_binds, sizeof(*ops),
3816 			       GFP_KERNEL | __GFP_ACCOUNT |
3817 			       __GFP_RETRY_MAYFAIL | __GFP_NOWARN);
3818 		if (!ops) {
3819 			err = -ENOMEM;
3820 			goto free_bos;
3821 		}
3822 	}
3823 
3824 	for (i = 0; i < args->num_binds; ++i) {
3825 		struct drm_gem_object *gem_obj;
3826 		u64 range = bind_ops[i].range;
3827 		u64 addr = bind_ops[i].addr;
3828 		u32 obj = bind_ops[i].obj;
3829 		u64 obj_offset = bind_ops[i].obj_offset;
3830 		u16 pat_index = bind_ops[i].pat_index;
3831 		u32 op = bind_ops[i].op;
3832 		u32 bind_flags = bind_ops[i].flags;
3833 
3834 		if (!obj)
3835 			continue;
3836 
3837 		gem_obj = drm_gem_object_lookup(file, obj);
3838 		if (XE_IOCTL_DBG(xe, !gem_obj)) {
3839 			err = -ENOENT;
3840 			goto put_obj;
3841 		}
3842 		bos[i] = gem_to_xe_bo(gem_obj);
3843 
3844 		err = xe_vm_bind_ioctl_validate_bo(xe, bos[i], addr, range,
3845 						   obj_offset, pat_index, op,
3846 						   bind_flags);
3847 		if (err)
3848 			goto put_obj;
3849 	}
3850 
3851 	if (args->num_syncs) {
3852 		syncs = kcalloc(args->num_syncs, sizeof(*syncs), GFP_KERNEL);
3853 		if (!syncs) {
3854 			err = -ENOMEM;
3855 			goto put_obj;
3856 		}
3857 	}
3858 
3859 	syncs_user = u64_to_user_ptr(args->syncs);
3860 	for (num_syncs = 0; num_syncs < args->num_syncs; num_syncs++) {
3861 		err = xe_sync_entry_parse(xe, xef, &syncs[num_syncs],
3862 					  &syncs_user[num_syncs],
3863 					  (xe_vm_in_lr_mode(vm) ?
3864 					   SYNC_PARSE_FLAG_LR_MODE : 0) |
3865 					  (!args->num_binds ?
3866 					   SYNC_PARSE_FLAG_DISALLOW_USER_FENCE : 0));
3867 		if (err)
3868 			goto free_syncs;
3869 
3870 		if (xe_sync_is_ufence(&syncs[num_syncs]))
3871 			num_ufence++;
3872 	}
3873 
3874 	if (XE_IOCTL_DBG(xe, num_ufence > 1)) {
3875 		err = -EINVAL;
3876 		goto free_syncs;
3877 	}
3878 
3879 	if (!args->num_binds) {
3880 		err = -ENODATA;
3881 		goto free_syncs;
3882 	}
3883 
3884 	xe_vma_ops_init(&vops, vm, q, syncs, num_syncs);
3885 	for (i = 0; i < args->num_binds; ++i) {
3886 		u64 range = bind_ops[i].range;
3887 		u64 addr = bind_ops[i].addr;
3888 		u32 op = bind_ops[i].op;
3889 		u32 flags = bind_ops[i].flags;
3890 		u64 obj_offset = bind_ops[i].obj_offset;
3891 		u32 prefetch_region = bind_ops[i].prefetch_mem_region_instance;
3892 		u16 pat_index = bind_ops[i].pat_index;
3893 
3894 		ops[i] = vm_bind_ioctl_ops_create(vm, &vops, bos[i], obj_offset,
3895 						  addr, range, op, flags,
3896 						  prefetch_region, pat_index);
3897 		if (IS_ERR(ops[i])) {
3898 			err = PTR_ERR(ops[i]);
3899 			ops[i] = NULL;
3900 			goto unwind_ops;
3901 		}
3902 
3903 		err = vm_bind_ioctl_ops_parse(vm, ops[i], &vops);
3904 		if (err)
3905 			goto unwind_ops;
3906 
3907 #ifdef TEST_VM_OPS_ERROR
3908 		if (flags & FORCE_OP_ERROR) {
3909 			vops.inject_error = true;
3910 			vm->xe->vm_inject_error_position =
3911 				(vm->xe->vm_inject_error_position + 1) %
3912 				FORCE_OP_ERROR_COUNT;
3913 		}
3914 #endif
3915 	}
3916 
3917 	/* Nothing to do */
3918 	if (list_empty(&vops.list)) {
3919 		err = -ENODATA;
3920 		goto unwind_ops;
3921 	}
3922 
3923 	err = xe_vma_ops_alloc(&vops, args->num_binds > 1);
3924 	if (err)
3925 		goto unwind_ops;
3926 
3927 	err = vm_bind_ioctl_ops_prefetch_ranges(vm, &vops);
3928 	if (err)
3929 		goto unwind_ops;
3930 
3931 	fence = vm_bind_ioctl_ops_execute(vm, &vops);
3932 	if (IS_ERR(fence))
3933 		err = PTR_ERR(fence);
3934 	else
3935 		dma_fence_put(fence);
3936 
3937 unwind_ops:
3938 	if (err && err != -ENODATA)
3939 		vm_bind_ioctl_ops_unwind(vm, ops, args->num_binds);
3940 	xe_vma_ops_fini(&vops);
3941 	for (i = args->num_binds - 1; i >= 0; --i)
3942 		if (ops[i])
3943 			drm_gpuva_ops_free(&vm->gpuvm, ops[i]);
3944 free_syncs:
3945 	if (err == -ENODATA)
3946 		err = vm_bind_ioctl_signal_fences(vm, q, syncs, num_syncs);
3947 	while (num_syncs--)
3948 		xe_sync_entry_cleanup(&syncs[num_syncs]);
3949 
3950 	kfree(syncs);
3951 put_obj:
3952 	for (i = 0; i < args->num_binds; ++i)
3953 		xe_bo_put(bos[i]);
3954 
3955 	kvfree(ops);
3956 free_bos:
3957 	kvfree(bos);
3958 release_vm_lock:
3959 	up_write(&vm->lock);
3960 put_exec_queue:
3961 	if (q)
3962 		xe_exec_queue_put(q);
3963 free_bind_ops:
3964 	if (args->num_binds > 1)
3965 		kvfree(bind_ops);
3966 put_vm:
3967 	xe_vm_put(vm);
3968 	return err;
3969 }
3970 
3971 /**
3972  * xe_vm_bind_kernel_bo - bind a kernel BO to a VM
3973  * @vm: VM to bind the BO to
3974  * @bo: BO to bind
3975  * @q: exec queue to use for the bind (optional)
3976  * @addr: address at which to bind the BO
3977  * @cache_lvl: PAT cache level to use
3978  *
3979  * Execute a VM bind map operation on a kernel-owned BO to bind it into a
3980  * kernel-owned VM.
3981  *
3982  * Returns a dma_fence to track the binding completion if the job to do so was
3983  * successfully submitted, an error pointer otherwise.
3984  */
3985 struct dma_fence *xe_vm_bind_kernel_bo(struct xe_vm *vm, struct xe_bo *bo,
3986 				       struct xe_exec_queue *q, u64 addr,
3987 				       enum xe_cache_level cache_lvl)
3988 {
3989 	struct xe_vma_ops vops;
3990 	struct drm_gpuva_ops *ops = NULL;
3991 	struct dma_fence *fence;
3992 	int err;
3993 
3994 	xe_bo_get(bo);
3995 	xe_vm_get(vm);
3996 	if (q)
3997 		xe_exec_queue_get(q);
3998 
3999 	down_write(&vm->lock);
4000 
4001 	xe_vma_ops_init(&vops, vm, q, NULL, 0);
4002 
4003 	ops = vm_bind_ioctl_ops_create(vm, &vops, bo, 0, addr, xe_bo_size(bo),
4004 				       DRM_XE_VM_BIND_OP_MAP, 0, 0,
4005 				       vm->xe->pat.idx[cache_lvl]);
4006 	if (IS_ERR(ops)) {
4007 		err = PTR_ERR(ops);
4008 		goto release_vm_lock;
4009 	}
4010 
4011 	err = vm_bind_ioctl_ops_parse(vm, ops, &vops);
4012 	if (err)
4013 		goto release_vm_lock;
4014 
4015 	xe_assert(vm->xe, !list_empty(&vops.list));
4016 
4017 	err = xe_vma_ops_alloc(&vops, false);
4018 	if (err)
4019 		goto unwind_ops;
4020 
4021 	fence = vm_bind_ioctl_ops_execute(vm, &vops);
4022 	if (IS_ERR(fence))
4023 		err = PTR_ERR(fence);
4024 
4025 unwind_ops:
4026 	if (err && err != -ENODATA)
4027 		vm_bind_ioctl_ops_unwind(vm, &ops, 1);
4028 
4029 	xe_vma_ops_fini(&vops);
4030 	drm_gpuva_ops_free(&vm->gpuvm, ops);
4031 
4032 release_vm_lock:
4033 	up_write(&vm->lock);
4034 
4035 	if (q)
4036 		xe_exec_queue_put(q);
4037 	xe_vm_put(vm);
4038 	xe_bo_put(bo);
4039 
4040 	if (err)
4041 		fence = ERR_PTR(err);
4042 
4043 	return fence;
4044 }
4045 
4046 /**
4047  * xe_vm_lock() - Lock the vm's dma_resv object
4048  * @vm: The struct xe_vm whose lock is to be locked
4049  * @intr: Whether to perform any wait interruptible
4050  *
4051  * Return: 0 on success, -EINTR if @intr is true and the wait for a
4052  * contended lock was interrupted. If @intr is false, the function
4053  * always returns 0.
4054  */
4055 int xe_vm_lock(struct xe_vm *vm, bool intr)
4056 {
4057 	if (intr)
4058 		return dma_resv_lock_interruptible(xe_vm_resv(vm), NULL);
4059 
4060 	return dma_resv_lock(xe_vm_resv(vm), NULL);
4061 }
4062 
4063 /**
4064  * xe_vm_unlock() - Unlock the vm's dma_resv object
4065  * @vm: The struct xe_vm whose lock is to be released.
4066  *
4067  * Unlock a buffer object lock that was locked by xe_vm_lock().
4068  */
4069 void xe_vm_unlock(struct xe_vm *vm)
4070 {
4071 	dma_resv_unlock(xe_vm_resv(vm));
4072 }
4073 
4074 /**
4075  * xe_vm_range_tilemask_tlb_inval - Issue a TLB invalidation on this tilemask for an
4076  * address range
4077  * @vm: The VM
4078  * @start: start address
4079  * @end: end address
4080  * @tile_mask: mask for which gt's issue tlb invalidation
4081  *
4082  * Issue a range based TLB invalidation for gt's in tilemask
4083  *
4084  * Returns 0 for success, negative error code otherwise.
4085  */
4086 int xe_vm_range_tilemask_tlb_inval(struct xe_vm *vm, u64 start,
4087 				   u64 end, u8 tile_mask)
4088 {
4089 	struct xe_tlb_inval_fence
4090 		fence[XE_MAX_TILES_PER_DEVICE * XE_MAX_GT_PER_TILE];
4091 	struct xe_tile *tile;
4092 	u32 fence_id = 0;
4093 	u8 id;
4094 	int err;
4095 
4096 	if (!tile_mask)
4097 		return 0;
4098 
4099 	for_each_tile(tile, vm->xe, id) {
4100 		if (!(tile_mask & BIT(id)))
4101 			continue;
4102 
4103 		xe_tlb_inval_fence_init(&tile->primary_gt->tlb_inval,
4104 					&fence[fence_id], true);
4105 
4106 		err = xe_tlb_inval_range(&tile->primary_gt->tlb_inval,
4107 					 &fence[fence_id], start, end,
4108 					 vm->usm.asid);
4109 		if (err)
4110 			goto wait;
4111 		++fence_id;
4112 
4113 		if (!tile->media_gt)
4114 			continue;
4115 
4116 		xe_tlb_inval_fence_init(&tile->media_gt->tlb_inval,
4117 					&fence[fence_id], true);
4118 
4119 		err = xe_tlb_inval_range(&tile->media_gt->tlb_inval,
4120 					 &fence[fence_id], start, end,
4121 					 vm->usm.asid);
4122 		if (err)
4123 			goto wait;
4124 		++fence_id;
4125 	}
4126 
4127 wait:
4128 	for (id = 0; id < fence_id; ++id)
4129 		xe_tlb_inval_fence_wait(&fence[id]);
4130 
4131 	return err;
4132 }
4133 
4134 /**
4135  * xe_vm_invalidate_vma - invalidate GPU mappings for VMA without a lock
4136  * @vma: VMA to invalidate
4137  *
4138  * Walks a list of page tables leaves which it memset the entries owned by this
4139  * VMA to zero, invalidates the TLBs, and block until TLBs invalidation is
4140  * complete.
4141  *
4142  * Returns 0 for success, negative error code otherwise.
4143  */
4144 int xe_vm_invalidate_vma(struct xe_vma *vma)
4145 {
4146 	struct xe_device *xe = xe_vma_vm(vma)->xe;
4147 	struct xe_vm *vm = xe_vma_vm(vma);
4148 	struct xe_tile *tile;
4149 	u8 tile_mask = 0;
4150 	int ret = 0;
4151 	u8 id;
4152 
4153 	xe_assert(xe, !xe_vma_is_null(vma));
4154 	xe_assert(xe, !xe_vma_is_cpu_addr_mirror(vma));
4155 	trace_xe_vma_invalidate(vma);
4156 
4157 	vm_dbg(&vm->xe->drm,
4158 	       "INVALIDATE: addr=0x%016llx, range=0x%016llx",
4159 		xe_vma_start(vma), xe_vma_size(vma));
4160 
4161 	/*
4162 	 * Check that we don't race with page-table updates, tile_invalidated
4163 	 * update is safe
4164 	 */
4165 	if (IS_ENABLED(CONFIG_PROVE_LOCKING)) {
4166 		if (xe_vma_is_userptr(vma)) {
4167 			lockdep_assert(lockdep_is_held_type(&vm->userptr.notifier_lock, 0) ||
4168 				       (lockdep_is_held_type(&vm->userptr.notifier_lock, 1) &&
4169 					lockdep_is_held(&xe_vm_resv(vm)->lock.base)));
4170 
4171 			WARN_ON_ONCE(!mmu_interval_check_retry
4172 				     (&to_userptr_vma(vma)->userptr.notifier,
4173 				      to_userptr_vma(vma)->userptr.notifier_seq));
4174 			WARN_ON_ONCE(!dma_resv_test_signaled(xe_vm_resv(vm),
4175 							     DMA_RESV_USAGE_BOOKKEEP));
4176 
4177 		} else {
4178 			xe_bo_assert_held(xe_vma_bo(vma));
4179 		}
4180 	}
4181 
4182 	for_each_tile(tile, xe, id)
4183 		if (xe_pt_zap_ptes(tile, vma))
4184 			tile_mask |= BIT(id);
4185 
4186 	xe_device_wmb(xe);
4187 
4188 	ret = xe_vm_range_tilemask_tlb_inval(xe_vma_vm(vma), xe_vma_start(vma),
4189 					     xe_vma_end(vma), tile_mask);
4190 
4191 	/* WRITE_ONCE pairs with READ_ONCE in xe_vm_has_valid_gpu_mapping() */
4192 	WRITE_ONCE(vma->tile_invalidated, vma->tile_mask);
4193 
4194 	return ret;
4195 }
4196 
4197 int xe_vm_validate_protected(struct xe_vm *vm)
4198 {
4199 	struct drm_gpuva *gpuva;
4200 	int err = 0;
4201 
4202 	if (!vm)
4203 		return -ENODEV;
4204 
4205 	mutex_lock(&vm->snap_mutex);
4206 
4207 	drm_gpuvm_for_each_va(gpuva, &vm->gpuvm) {
4208 		struct xe_vma *vma = gpuva_to_vma(gpuva);
4209 		struct xe_bo *bo = vma->gpuva.gem.obj ?
4210 			gem_to_xe_bo(vma->gpuva.gem.obj) : NULL;
4211 
4212 		if (!bo)
4213 			continue;
4214 
4215 		if (xe_bo_is_protected(bo)) {
4216 			err = xe_pxp_bo_key_check(vm->xe->pxp, bo);
4217 			if (err)
4218 				break;
4219 		}
4220 	}
4221 
4222 	mutex_unlock(&vm->snap_mutex);
4223 	return err;
4224 }
4225 
4226 struct xe_vm_snapshot {
4227 	unsigned long num_snaps;
4228 	struct {
4229 		u64 ofs, bo_ofs;
4230 		unsigned long len;
4231 		struct xe_bo *bo;
4232 		void *data;
4233 		struct mm_struct *mm;
4234 	} snap[];
4235 };
4236 
4237 struct xe_vm_snapshot *xe_vm_snapshot_capture(struct xe_vm *vm)
4238 {
4239 	unsigned long num_snaps = 0, i;
4240 	struct xe_vm_snapshot *snap = NULL;
4241 	struct drm_gpuva *gpuva;
4242 
4243 	if (!vm)
4244 		return NULL;
4245 
4246 	mutex_lock(&vm->snap_mutex);
4247 	drm_gpuvm_for_each_va(gpuva, &vm->gpuvm) {
4248 		if (gpuva->flags & XE_VMA_DUMPABLE)
4249 			num_snaps++;
4250 	}
4251 
4252 	if (num_snaps)
4253 		snap = kvzalloc(offsetof(struct xe_vm_snapshot, snap[num_snaps]), GFP_NOWAIT);
4254 	if (!snap) {
4255 		snap = num_snaps ? ERR_PTR(-ENOMEM) : ERR_PTR(-ENODEV);
4256 		goto out_unlock;
4257 	}
4258 
4259 	snap->num_snaps = num_snaps;
4260 	i = 0;
4261 	drm_gpuvm_for_each_va(gpuva, &vm->gpuvm) {
4262 		struct xe_vma *vma = gpuva_to_vma(gpuva);
4263 		struct xe_bo *bo = vma->gpuva.gem.obj ?
4264 			gem_to_xe_bo(vma->gpuva.gem.obj) : NULL;
4265 
4266 		if (!(gpuva->flags & XE_VMA_DUMPABLE))
4267 			continue;
4268 
4269 		snap->snap[i].ofs = xe_vma_start(vma);
4270 		snap->snap[i].len = xe_vma_size(vma);
4271 		if (bo) {
4272 			snap->snap[i].bo = xe_bo_get(bo);
4273 			snap->snap[i].bo_ofs = xe_vma_bo_offset(vma);
4274 		} else if (xe_vma_is_userptr(vma)) {
4275 			struct mm_struct *mm =
4276 				to_userptr_vma(vma)->userptr.notifier.mm;
4277 
4278 			if (mmget_not_zero(mm))
4279 				snap->snap[i].mm = mm;
4280 			else
4281 				snap->snap[i].data = ERR_PTR(-EFAULT);
4282 
4283 			snap->snap[i].bo_ofs = xe_vma_userptr(vma);
4284 		} else {
4285 			snap->snap[i].data = ERR_PTR(-ENOENT);
4286 		}
4287 		i++;
4288 	}
4289 
4290 out_unlock:
4291 	mutex_unlock(&vm->snap_mutex);
4292 	return snap;
4293 }
4294 
4295 void xe_vm_snapshot_capture_delayed(struct xe_vm_snapshot *snap)
4296 {
4297 	if (IS_ERR_OR_NULL(snap))
4298 		return;
4299 
4300 	for (int i = 0; i < snap->num_snaps; i++) {
4301 		struct xe_bo *bo = snap->snap[i].bo;
4302 		int err;
4303 
4304 		if (IS_ERR(snap->snap[i].data))
4305 			continue;
4306 
4307 		snap->snap[i].data = kvmalloc(snap->snap[i].len, GFP_USER);
4308 		if (!snap->snap[i].data) {
4309 			snap->snap[i].data = ERR_PTR(-ENOMEM);
4310 			goto cleanup_bo;
4311 		}
4312 
4313 		if (bo) {
4314 			err = xe_bo_read(bo, snap->snap[i].bo_ofs,
4315 					 snap->snap[i].data, snap->snap[i].len);
4316 		} else {
4317 			void __user *userptr = (void __user *)(size_t)snap->snap[i].bo_ofs;
4318 
4319 			kthread_use_mm(snap->snap[i].mm);
4320 			if (!copy_from_user(snap->snap[i].data, userptr, snap->snap[i].len))
4321 				err = 0;
4322 			else
4323 				err = -EFAULT;
4324 			kthread_unuse_mm(snap->snap[i].mm);
4325 
4326 			mmput(snap->snap[i].mm);
4327 			snap->snap[i].mm = NULL;
4328 		}
4329 
4330 		if (err) {
4331 			kvfree(snap->snap[i].data);
4332 			snap->snap[i].data = ERR_PTR(err);
4333 		}
4334 
4335 cleanup_bo:
4336 		xe_bo_put(bo);
4337 		snap->snap[i].bo = NULL;
4338 	}
4339 }
4340 
4341 void xe_vm_snapshot_print(struct xe_vm_snapshot *snap, struct drm_printer *p)
4342 {
4343 	unsigned long i, j;
4344 
4345 	if (IS_ERR_OR_NULL(snap)) {
4346 		drm_printf(p, "[0].error: %li\n", PTR_ERR(snap));
4347 		return;
4348 	}
4349 
4350 	for (i = 0; i < snap->num_snaps; i++) {
4351 		drm_printf(p, "[%llx].length: 0x%lx\n", snap->snap[i].ofs, snap->snap[i].len);
4352 
4353 		if (IS_ERR(snap->snap[i].data)) {
4354 			drm_printf(p, "[%llx].error: %li\n", snap->snap[i].ofs,
4355 				   PTR_ERR(snap->snap[i].data));
4356 			continue;
4357 		}
4358 
4359 		drm_printf(p, "[%llx].data: ", snap->snap[i].ofs);
4360 
4361 		for (j = 0; j < snap->snap[i].len; j += sizeof(u32)) {
4362 			u32 *val = snap->snap[i].data + j;
4363 			char dumped[ASCII85_BUFSZ];
4364 
4365 			drm_puts(p, ascii85_encode(*val, dumped));
4366 		}
4367 
4368 		drm_puts(p, "\n");
4369 
4370 		if (drm_coredump_printer_is_full(p))
4371 			return;
4372 	}
4373 }
4374 
4375 void xe_vm_snapshot_free(struct xe_vm_snapshot *snap)
4376 {
4377 	unsigned long i;
4378 
4379 	if (IS_ERR_OR_NULL(snap))
4380 		return;
4381 
4382 	for (i = 0; i < snap->num_snaps; i++) {
4383 		if (!IS_ERR(snap->snap[i].data))
4384 			kvfree(snap->snap[i].data);
4385 		xe_bo_put(snap->snap[i].bo);
4386 		if (snap->snap[i].mm)
4387 			mmput(snap->snap[i].mm);
4388 	}
4389 	kvfree(snap);
4390 }
4391 
4392 /**
4393  * xe_vma_need_vram_for_atomic - Check if VMA needs VRAM migration for atomic operations
4394  * @xe: Pointer to the XE device structure
4395  * @vma: Pointer to the virtual memory area (VMA) structure
4396  * @is_atomic: In pagefault path and atomic operation
4397  *
4398  * This function determines whether the given VMA needs to be migrated to
4399  * VRAM in order to do atomic GPU operation.
4400  *
4401  * Return:
4402  *   1        - Migration to VRAM is required
4403  *   0        - Migration is not required
4404  *   -EACCES  - Invalid access for atomic memory attr
4405  *
4406  */
4407 int xe_vma_need_vram_for_atomic(struct xe_device *xe, struct xe_vma *vma, bool is_atomic)
4408 {
4409 	u32 atomic_access = xe_vma_bo(vma) ? xe_vma_bo(vma)->attr.atomic_access :
4410 					     vma->attr.atomic_access;
4411 
4412 	if (!IS_DGFX(xe) || !is_atomic)
4413 		return false;
4414 
4415 	/*
4416 	 * NOTE: The checks implemented here are platform-specific. For
4417 	 * instance, on a device supporting CXL atomics, these would ideally
4418 	 * work universally without additional handling.
4419 	 */
4420 	switch (atomic_access) {
4421 	case DRM_XE_ATOMIC_DEVICE:
4422 		return !xe->info.has_device_atomics_on_smem;
4423 
4424 	case DRM_XE_ATOMIC_CPU:
4425 		return -EACCES;
4426 
4427 	case DRM_XE_ATOMIC_UNDEFINED:
4428 	case DRM_XE_ATOMIC_GLOBAL:
4429 	default:
4430 		return 1;
4431 	}
4432 }
4433 
4434 static int xe_vm_alloc_vma(struct xe_vm *vm,
4435 			   struct drm_gpuvm_map_req *map_req,
4436 			   bool is_madvise)
4437 {
4438 	struct xe_vma_ops vops;
4439 	struct drm_gpuva_ops *ops = NULL;
4440 	struct drm_gpuva_op *__op;
4441 	bool is_cpu_addr_mirror = false;
4442 	bool remap_op = false;
4443 	struct xe_vma_mem_attr tmp_attr;
4444 	u16 default_pat;
4445 	int err;
4446 
4447 	lockdep_assert_held_write(&vm->lock);
4448 
4449 	if (is_madvise)
4450 		ops = drm_gpuvm_madvise_ops_create(&vm->gpuvm, map_req);
4451 	else
4452 		ops = drm_gpuvm_sm_map_ops_create(&vm->gpuvm, map_req);
4453 
4454 	if (IS_ERR(ops))
4455 		return PTR_ERR(ops);
4456 
4457 	if (list_empty(&ops->list)) {
4458 		err = 0;
4459 		goto free_ops;
4460 	}
4461 
4462 	drm_gpuva_for_each_op(__op, ops) {
4463 		struct xe_vma_op *op = gpuva_op_to_vma_op(__op);
4464 		struct xe_vma *vma = NULL;
4465 
4466 		if (!is_madvise) {
4467 			if (__op->op == DRM_GPUVA_OP_UNMAP) {
4468 				vma = gpuva_to_vma(op->base.unmap.va);
4469 				XE_WARN_ON(!xe_vma_has_default_mem_attrs(vma));
4470 				default_pat = vma->attr.default_pat_index;
4471 			}
4472 
4473 			if (__op->op == DRM_GPUVA_OP_REMAP) {
4474 				vma = gpuva_to_vma(op->base.remap.unmap->va);
4475 				default_pat = vma->attr.default_pat_index;
4476 			}
4477 
4478 			if (__op->op == DRM_GPUVA_OP_MAP) {
4479 				op->map.is_cpu_addr_mirror = true;
4480 				op->map.pat_index = default_pat;
4481 			}
4482 		} else {
4483 			if (__op->op == DRM_GPUVA_OP_REMAP) {
4484 				vma = gpuva_to_vma(op->base.remap.unmap->va);
4485 				xe_assert(vm->xe, !remap_op);
4486 				xe_assert(vm->xe, xe_vma_has_no_bo(vma));
4487 				remap_op = true;
4488 
4489 				if (xe_vma_is_cpu_addr_mirror(vma))
4490 					is_cpu_addr_mirror = true;
4491 				else
4492 					is_cpu_addr_mirror = false;
4493 			}
4494 
4495 			if (__op->op == DRM_GPUVA_OP_MAP) {
4496 				xe_assert(vm->xe, remap_op);
4497 				remap_op = false;
4498 				/*
4499 				 * In case of madvise ops DRM_GPUVA_OP_MAP is
4500 				 * always after DRM_GPUVA_OP_REMAP, so ensure
4501 				 * we assign op->map.is_cpu_addr_mirror true
4502 				 * if REMAP is for xe_vma_is_cpu_addr_mirror vma
4503 				 */
4504 				op->map.is_cpu_addr_mirror = is_cpu_addr_mirror;
4505 			}
4506 		}
4507 		print_op(vm->xe, __op);
4508 	}
4509 
4510 	xe_vma_ops_init(&vops, vm, NULL, NULL, 0);
4511 
4512 	if (is_madvise)
4513 		vops.flags |= XE_VMA_OPS_FLAG_MADVISE;
4514 
4515 	err = vm_bind_ioctl_ops_parse(vm, ops, &vops);
4516 	if (err)
4517 		goto unwind_ops;
4518 
4519 	xe_vm_lock(vm, false);
4520 
4521 	drm_gpuva_for_each_op(__op, ops) {
4522 		struct xe_vma_op *op = gpuva_op_to_vma_op(__op);
4523 		struct xe_vma *vma;
4524 
4525 		if (__op->op == DRM_GPUVA_OP_UNMAP) {
4526 			vma = gpuva_to_vma(op->base.unmap.va);
4527 			/* There should be no unmap for madvise */
4528 			if (is_madvise)
4529 				XE_WARN_ON("UNEXPECTED UNMAP");
4530 
4531 			xe_vma_destroy(vma, NULL);
4532 		} else if (__op->op == DRM_GPUVA_OP_REMAP) {
4533 			vma = gpuva_to_vma(op->base.remap.unmap->va);
4534 			/* In case of madvise ops Store attributes for REMAP UNMAPPED
4535 			 * VMA, so they can be assigned to newly MAP created vma.
4536 			 */
4537 			if (is_madvise)
4538 				tmp_attr = vma->attr;
4539 
4540 			xe_vma_destroy(gpuva_to_vma(op->base.remap.unmap->va), NULL);
4541 		} else if (__op->op == DRM_GPUVA_OP_MAP) {
4542 			vma = op->map.vma;
4543 			/* In case of madvise call, MAP will always be follwed by REMAP.
4544 			 * Therefore temp_attr will always have sane values, making it safe to
4545 			 * copy them to new vma.
4546 			 */
4547 			if (is_madvise)
4548 				vma->attr = tmp_attr;
4549 		}
4550 	}
4551 
4552 	xe_vm_unlock(vm);
4553 	drm_gpuva_ops_free(&vm->gpuvm, ops);
4554 	return 0;
4555 
4556 unwind_ops:
4557 	vm_bind_ioctl_ops_unwind(vm, &ops, 1);
4558 free_ops:
4559 	drm_gpuva_ops_free(&vm->gpuvm, ops);
4560 	return err;
4561 }
4562 
4563 /**
4564  * xe_vm_alloc_madvise_vma - Allocate VMA's with madvise ops
4565  * @vm: Pointer to the xe_vm structure
4566  * @start: Starting input address
4567  * @range: Size of the input range
4568  *
4569  * This function splits existing vma to create new vma for user provided input range
4570  *
4571  * Return: 0 if success
4572  */
4573 int xe_vm_alloc_madvise_vma(struct xe_vm *vm, uint64_t start, uint64_t range)
4574 {
4575 	struct drm_gpuvm_map_req map_req = {
4576 		.map.va.addr = start,
4577 		.map.va.range = range,
4578 	};
4579 
4580 	lockdep_assert_held_write(&vm->lock);
4581 
4582 	vm_dbg(&vm->xe->drm, "MADVISE_OPS_CREATE: addr=0x%016llx, size=0x%016llx", start, range);
4583 
4584 	return xe_vm_alloc_vma(vm, &map_req, true);
4585 }
4586 
4587 /**
4588  * xe_vm_alloc_cpu_addr_mirror_vma - Allocate CPU addr mirror vma
4589  * @vm: Pointer to the xe_vm structure
4590  * @start: Starting input address
4591  * @range: Size of the input range
4592  *
4593  * This function splits/merges existing vma to create new vma for user provided input range
4594  *
4595  * Return: 0 if success
4596  */
4597 int xe_vm_alloc_cpu_addr_mirror_vma(struct xe_vm *vm, uint64_t start, uint64_t range)
4598 {
4599 	struct drm_gpuvm_map_req map_req = {
4600 		.map.va.addr = start,
4601 		.map.va.range = range,
4602 	};
4603 
4604 	lockdep_assert_held_write(&vm->lock);
4605 
4606 	vm_dbg(&vm->xe->drm, "CPU_ADDR_MIRROR_VMA_OPS_CREATE: addr=0x%016llx, size=0x%016llx",
4607 	       start, range);
4608 
4609 	return xe_vm_alloc_vma(vm, &map_req, false);
4610 }
4611