xref: /linux/drivers/gpu/drm/xe/xe_vm.c (revision 29042df3acdc7364af1c251b2a05f7c1c8fe0401)
1 // SPDX-License-Identifier: MIT
2 /*
3  * Copyright © 2021 Intel Corporation
4  */
5 
6 #include "xe_vm.h"
7 
8 #include <linux/dma-fence-array.h>
9 #include <linux/nospec.h>
10 
11 #include <drm/drm_drv.h>
12 #include <drm/drm_exec.h>
13 #include <drm/drm_print.h>
14 #include <drm/ttm/ttm_tt.h>
15 #include <uapi/drm/xe_drm.h>
16 #include <linux/ascii85.h>
17 #include <linux/delay.h>
18 #include <linux/kthread.h>
19 #include <linux/mm.h>
20 #include <linux/swap.h>
21 
22 #include <generated/xe_wa_oob.h>
23 
24 #include "regs/xe_gtt_defs.h"
25 #include "xe_assert.h"
26 #include "xe_bo.h"
27 #include "xe_device.h"
28 #include "xe_drm_client.h"
29 #include "xe_exec_queue.h"
30 #include "xe_gt_pagefault.h"
31 #include "xe_gt_tlb_invalidation.h"
32 #include "xe_migrate.h"
33 #include "xe_pat.h"
34 #include "xe_pm.h"
35 #include "xe_preempt_fence.h"
36 #include "xe_pt.h"
37 #include "xe_pxp.h"
38 #include "xe_res_cursor.h"
39 #include "xe_svm.h"
40 #include "xe_sync.h"
41 #include "xe_trace_bo.h"
42 #include "xe_wa.h"
43 #include "xe_hmm.h"
44 
45 static struct drm_gem_object *xe_vm_obj(struct xe_vm *vm)
46 {
47 	return vm->gpuvm.r_obj;
48 }
49 
50 /**
51  * xe_vma_userptr_check_repin() - Advisory check for repin needed
52  * @uvma: The userptr vma
53  *
54  * Check if the userptr vma has been invalidated since last successful
55  * repin. The check is advisory only and can the function can be called
56  * without the vm->userptr.notifier_lock held. There is no guarantee that the
57  * vma userptr will remain valid after a lockless check, so typically
58  * the call needs to be followed by a proper check under the notifier_lock.
59  *
60  * Return: 0 if userptr vma is valid, -EAGAIN otherwise; repin recommended.
61  */
62 int xe_vma_userptr_check_repin(struct xe_userptr_vma *uvma)
63 {
64 	return mmu_interval_check_retry(&uvma->userptr.notifier,
65 					uvma->userptr.notifier_seq) ?
66 		-EAGAIN : 0;
67 }
68 
69 int xe_vma_userptr_pin_pages(struct xe_userptr_vma *uvma)
70 {
71 	struct xe_vma *vma = &uvma->vma;
72 	struct xe_vm *vm = xe_vma_vm(vma);
73 	struct xe_device *xe = vm->xe;
74 
75 	lockdep_assert_held(&vm->lock);
76 	xe_assert(xe, xe_vma_is_userptr(vma));
77 
78 	return xe_hmm_userptr_populate_range(uvma, false);
79 }
80 
81 static bool preempt_fences_waiting(struct xe_vm *vm)
82 {
83 	struct xe_exec_queue *q;
84 
85 	lockdep_assert_held(&vm->lock);
86 	xe_vm_assert_held(vm);
87 
88 	list_for_each_entry(q, &vm->preempt.exec_queues, lr.link) {
89 		if (!q->lr.pfence ||
90 		    test_bit(DMA_FENCE_FLAG_ENABLE_SIGNAL_BIT,
91 			     &q->lr.pfence->flags)) {
92 			return true;
93 		}
94 	}
95 
96 	return false;
97 }
98 
99 static void free_preempt_fences(struct list_head *list)
100 {
101 	struct list_head *link, *next;
102 
103 	list_for_each_safe(link, next, list)
104 		xe_preempt_fence_free(to_preempt_fence_from_link(link));
105 }
106 
107 static int alloc_preempt_fences(struct xe_vm *vm, struct list_head *list,
108 				unsigned int *count)
109 {
110 	lockdep_assert_held(&vm->lock);
111 	xe_vm_assert_held(vm);
112 
113 	if (*count >= vm->preempt.num_exec_queues)
114 		return 0;
115 
116 	for (; *count < vm->preempt.num_exec_queues; ++(*count)) {
117 		struct xe_preempt_fence *pfence = xe_preempt_fence_alloc();
118 
119 		if (IS_ERR(pfence))
120 			return PTR_ERR(pfence);
121 
122 		list_move_tail(xe_preempt_fence_link(pfence), list);
123 	}
124 
125 	return 0;
126 }
127 
128 static int wait_for_existing_preempt_fences(struct xe_vm *vm)
129 {
130 	struct xe_exec_queue *q;
131 
132 	xe_vm_assert_held(vm);
133 
134 	list_for_each_entry(q, &vm->preempt.exec_queues, lr.link) {
135 		if (q->lr.pfence) {
136 			long timeout = dma_fence_wait(q->lr.pfence, false);
137 
138 			/* Only -ETIME on fence indicates VM needs to be killed */
139 			if (timeout < 0 || q->lr.pfence->error == -ETIME)
140 				return -ETIME;
141 
142 			dma_fence_put(q->lr.pfence);
143 			q->lr.pfence = NULL;
144 		}
145 	}
146 
147 	return 0;
148 }
149 
150 static bool xe_vm_is_idle(struct xe_vm *vm)
151 {
152 	struct xe_exec_queue *q;
153 
154 	xe_vm_assert_held(vm);
155 	list_for_each_entry(q, &vm->preempt.exec_queues, lr.link) {
156 		if (!xe_exec_queue_is_idle(q))
157 			return false;
158 	}
159 
160 	return true;
161 }
162 
163 static void arm_preempt_fences(struct xe_vm *vm, struct list_head *list)
164 {
165 	struct list_head *link;
166 	struct xe_exec_queue *q;
167 
168 	list_for_each_entry(q, &vm->preempt.exec_queues, lr.link) {
169 		struct dma_fence *fence;
170 
171 		link = list->next;
172 		xe_assert(vm->xe, link != list);
173 
174 		fence = xe_preempt_fence_arm(to_preempt_fence_from_link(link),
175 					     q, q->lr.context,
176 					     ++q->lr.seqno);
177 		dma_fence_put(q->lr.pfence);
178 		q->lr.pfence = fence;
179 	}
180 }
181 
182 static int add_preempt_fences(struct xe_vm *vm, struct xe_bo *bo)
183 {
184 	struct xe_exec_queue *q;
185 	int err;
186 
187 	xe_bo_assert_held(bo);
188 
189 	if (!vm->preempt.num_exec_queues)
190 		return 0;
191 
192 	err = dma_resv_reserve_fences(bo->ttm.base.resv, vm->preempt.num_exec_queues);
193 	if (err)
194 		return err;
195 
196 	list_for_each_entry(q, &vm->preempt.exec_queues, lr.link)
197 		if (q->lr.pfence) {
198 			dma_resv_add_fence(bo->ttm.base.resv,
199 					   q->lr.pfence,
200 					   DMA_RESV_USAGE_BOOKKEEP);
201 		}
202 
203 	return 0;
204 }
205 
206 static void resume_and_reinstall_preempt_fences(struct xe_vm *vm,
207 						struct drm_exec *exec)
208 {
209 	struct xe_exec_queue *q;
210 
211 	lockdep_assert_held(&vm->lock);
212 	xe_vm_assert_held(vm);
213 
214 	list_for_each_entry(q, &vm->preempt.exec_queues, lr.link) {
215 		q->ops->resume(q);
216 
217 		drm_gpuvm_resv_add_fence(&vm->gpuvm, exec, q->lr.pfence,
218 					 DMA_RESV_USAGE_BOOKKEEP, DMA_RESV_USAGE_BOOKKEEP);
219 	}
220 }
221 
222 int xe_vm_add_compute_exec_queue(struct xe_vm *vm, struct xe_exec_queue *q)
223 {
224 	struct drm_gpuvm_exec vm_exec = {
225 		.vm = &vm->gpuvm,
226 		.flags = DRM_EXEC_INTERRUPTIBLE_WAIT,
227 		.num_fences = 1,
228 	};
229 	struct drm_exec *exec = &vm_exec.exec;
230 	struct dma_fence *pfence;
231 	int err;
232 	bool wait;
233 
234 	xe_assert(vm->xe, xe_vm_in_preempt_fence_mode(vm));
235 
236 	down_write(&vm->lock);
237 	err = drm_gpuvm_exec_lock(&vm_exec);
238 	if (err)
239 		goto out_up_write;
240 
241 	pfence = xe_preempt_fence_create(q, q->lr.context,
242 					 ++q->lr.seqno);
243 	if (!pfence) {
244 		err = -ENOMEM;
245 		goto out_fini;
246 	}
247 
248 	list_add(&q->lr.link, &vm->preempt.exec_queues);
249 	++vm->preempt.num_exec_queues;
250 	q->lr.pfence = pfence;
251 
252 	down_read(&vm->userptr.notifier_lock);
253 
254 	drm_gpuvm_resv_add_fence(&vm->gpuvm, exec, pfence,
255 				 DMA_RESV_USAGE_BOOKKEEP, DMA_RESV_USAGE_BOOKKEEP);
256 
257 	/*
258 	 * Check to see if a preemption on VM is in flight or userptr
259 	 * invalidation, if so trigger this preempt fence to sync state with
260 	 * other preempt fences on the VM.
261 	 */
262 	wait = __xe_vm_userptr_needs_repin(vm) || preempt_fences_waiting(vm);
263 	if (wait)
264 		dma_fence_enable_sw_signaling(pfence);
265 
266 	up_read(&vm->userptr.notifier_lock);
267 
268 out_fini:
269 	drm_exec_fini(exec);
270 out_up_write:
271 	up_write(&vm->lock);
272 
273 	return err;
274 }
275 ALLOW_ERROR_INJECTION(xe_vm_add_compute_exec_queue, ERRNO);
276 
277 /**
278  * xe_vm_remove_compute_exec_queue() - Remove compute exec queue from VM
279  * @vm: The VM.
280  * @q: The exec_queue
281  *
282  * Note that this function might be called multiple times on the same queue.
283  */
284 void xe_vm_remove_compute_exec_queue(struct xe_vm *vm, struct xe_exec_queue *q)
285 {
286 	if (!xe_vm_in_preempt_fence_mode(vm))
287 		return;
288 
289 	down_write(&vm->lock);
290 	if (!list_empty(&q->lr.link)) {
291 		list_del_init(&q->lr.link);
292 		--vm->preempt.num_exec_queues;
293 	}
294 	if (q->lr.pfence) {
295 		dma_fence_enable_sw_signaling(q->lr.pfence);
296 		dma_fence_put(q->lr.pfence);
297 		q->lr.pfence = NULL;
298 	}
299 	up_write(&vm->lock);
300 }
301 
302 /**
303  * __xe_vm_userptr_needs_repin() - Check whether the VM does have userptrs
304  * that need repinning.
305  * @vm: The VM.
306  *
307  * This function checks for whether the VM has userptrs that need repinning,
308  * and provides a release-type barrier on the userptr.notifier_lock after
309  * checking.
310  *
311  * Return: 0 if there are no userptrs needing repinning, -EAGAIN if there are.
312  */
313 int __xe_vm_userptr_needs_repin(struct xe_vm *vm)
314 {
315 	lockdep_assert_held_read(&vm->userptr.notifier_lock);
316 
317 	return (list_empty(&vm->userptr.repin_list) &&
318 		list_empty(&vm->userptr.invalidated)) ? 0 : -EAGAIN;
319 }
320 
321 #define XE_VM_REBIND_RETRY_TIMEOUT_MS 1000
322 
323 /**
324  * xe_vm_kill() - VM Kill
325  * @vm: The VM.
326  * @unlocked: Flag indicates the VM's dma-resv is not held
327  *
328  * Kill the VM by setting banned flag indicated VM is no longer available for
329  * use. If in preempt fence mode, also kill all exec queue attached to the VM.
330  */
331 void xe_vm_kill(struct xe_vm *vm, bool unlocked)
332 {
333 	struct xe_exec_queue *q;
334 
335 	lockdep_assert_held(&vm->lock);
336 
337 	if (unlocked)
338 		xe_vm_lock(vm, false);
339 
340 	vm->flags |= XE_VM_FLAG_BANNED;
341 	trace_xe_vm_kill(vm);
342 
343 	list_for_each_entry(q, &vm->preempt.exec_queues, lr.link)
344 		q->ops->kill(q);
345 
346 	if (unlocked)
347 		xe_vm_unlock(vm);
348 
349 	/* TODO: Inform user the VM is banned */
350 }
351 
352 /**
353  * xe_vm_validate_should_retry() - Whether to retry after a validate error.
354  * @exec: The drm_exec object used for locking before validation.
355  * @err: The error returned from ttm_bo_validate().
356  * @end: A ktime_t cookie that should be set to 0 before first use and
357  * that should be reused on subsequent calls.
358  *
359  * With multiple active VMs, under memory pressure, it is possible that
360  * ttm_bo_validate() run into -EDEADLK and in such case returns -ENOMEM.
361  * Until ttm properly handles locking in such scenarios, best thing the
362  * driver can do is retry with a timeout. Check if that is necessary, and
363  * if so unlock the drm_exec's objects while keeping the ticket to prepare
364  * for a rerun.
365  *
366  * Return: true if a retry after drm_exec_init() is recommended;
367  * false otherwise.
368  */
369 bool xe_vm_validate_should_retry(struct drm_exec *exec, int err, ktime_t *end)
370 {
371 	ktime_t cur;
372 
373 	if (err != -ENOMEM)
374 		return false;
375 
376 	cur = ktime_get();
377 	*end = *end ? : ktime_add_ms(cur, XE_VM_REBIND_RETRY_TIMEOUT_MS);
378 	if (!ktime_before(cur, *end))
379 		return false;
380 
381 	msleep(20);
382 	return true;
383 }
384 
385 static int xe_gpuvm_validate(struct drm_gpuvm_bo *vm_bo, struct drm_exec *exec)
386 {
387 	struct xe_vm *vm = gpuvm_to_vm(vm_bo->vm);
388 	struct drm_gpuva *gpuva;
389 	int ret;
390 
391 	lockdep_assert_held(&vm->lock);
392 	drm_gpuvm_bo_for_each_va(gpuva, vm_bo)
393 		list_move_tail(&gpuva_to_vma(gpuva)->combined_links.rebind,
394 			       &vm->rebind_list);
395 
396 	ret = xe_bo_validate(gem_to_xe_bo(vm_bo->obj), vm, false);
397 	if (ret)
398 		return ret;
399 
400 	vm_bo->evicted = false;
401 	return 0;
402 }
403 
404 /**
405  * xe_vm_validate_rebind() - Validate buffer objects and rebind vmas
406  * @vm: The vm for which we are rebinding.
407  * @exec: The struct drm_exec with the locked GEM objects.
408  * @num_fences: The number of fences to reserve for the operation, not
409  * including rebinds and validations.
410  *
411  * Validates all evicted gem objects and rebinds their vmas. Note that
412  * rebindings may cause evictions and hence the validation-rebind
413  * sequence is rerun until there are no more objects to validate.
414  *
415  * Return: 0 on success, negative error code on error. In particular,
416  * may return -EINTR or -ERESTARTSYS if interrupted, and -EDEADLK if
417  * the drm_exec transaction needs to be restarted.
418  */
419 int xe_vm_validate_rebind(struct xe_vm *vm, struct drm_exec *exec,
420 			  unsigned int num_fences)
421 {
422 	struct drm_gem_object *obj;
423 	unsigned long index;
424 	int ret;
425 
426 	do {
427 		ret = drm_gpuvm_validate(&vm->gpuvm, exec);
428 		if (ret)
429 			return ret;
430 
431 		ret = xe_vm_rebind(vm, false);
432 		if (ret)
433 			return ret;
434 	} while (!list_empty(&vm->gpuvm.evict.list));
435 
436 	drm_exec_for_each_locked_object(exec, index, obj) {
437 		ret = dma_resv_reserve_fences(obj->resv, num_fences);
438 		if (ret)
439 			return ret;
440 	}
441 
442 	return 0;
443 }
444 
445 static int xe_preempt_work_begin(struct drm_exec *exec, struct xe_vm *vm,
446 				 bool *done)
447 {
448 	int err;
449 
450 	err = drm_gpuvm_prepare_vm(&vm->gpuvm, exec, 0);
451 	if (err)
452 		return err;
453 
454 	if (xe_vm_is_idle(vm)) {
455 		vm->preempt.rebind_deactivated = true;
456 		*done = true;
457 		return 0;
458 	}
459 
460 	if (!preempt_fences_waiting(vm)) {
461 		*done = true;
462 		return 0;
463 	}
464 
465 	err = drm_gpuvm_prepare_objects(&vm->gpuvm, exec, 0);
466 	if (err)
467 		return err;
468 
469 	err = wait_for_existing_preempt_fences(vm);
470 	if (err)
471 		return err;
472 
473 	/*
474 	 * Add validation and rebinding to the locking loop since both can
475 	 * cause evictions which may require blocing dma_resv locks.
476 	 * The fence reservation here is intended for the new preempt fences
477 	 * we attach at the end of the rebind work.
478 	 */
479 	return xe_vm_validate_rebind(vm, exec, vm->preempt.num_exec_queues);
480 }
481 
482 static void preempt_rebind_work_func(struct work_struct *w)
483 {
484 	struct xe_vm *vm = container_of(w, struct xe_vm, preempt.rebind_work);
485 	struct drm_exec exec;
486 	unsigned int fence_count = 0;
487 	LIST_HEAD(preempt_fences);
488 	ktime_t end = 0;
489 	int err = 0;
490 	long wait;
491 	int __maybe_unused tries = 0;
492 
493 	xe_assert(vm->xe, xe_vm_in_preempt_fence_mode(vm));
494 	trace_xe_vm_rebind_worker_enter(vm);
495 
496 	down_write(&vm->lock);
497 
498 	if (xe_vm_is_closed_or_banned(vm)) {
499 		up_write(&vm->lock);
500 		trace_xe_vm_rebind_worker_exit(vm);
501 		return;
502 	}
503 
504 retry:
505 	if (xe_vm_userptr_check_repin(vm)) {
506 		err = xe_vm_userptr_pin(vm);
507 		if (err)
508 			goto out_unlock_outer;
509 	}
510 
511 	drm_exec_init(&exec, DRM_EXEC_INTERRUPTIBLE_WAIT, 0);
512 
513 	drm_exec_until_all_locked(&exec) {
514 		bool done = false;
515 
516 		err = xe_preempt_work_begin(&exec, vm, &done);
517 		drm_exec_retry_on_contention(&exec);
518 		if (err || done) {
519 			drm_exec_fini(&exec);
520 			if (err && xe_vm_validate_should_retry(&exec, err, &end))
521 				err = -EAGAIN;
522 
523 			goto out_unlock_outer;
524 		}
525 	}
526 
527 	err = alloc_preempt_fences(vm, &preempt_fences, &fence_count);
528 	if (err)
529 		goto out_unlock;
530 
531 	err = xe_vm_rebind(vm, true);
532 	if (err)
533 		goto out_unlock;
534 
535 	/* Wait on rebinds and munmap style VM unbinds */
536 	wait = dma_resv_wait_timeout(xe_vm_resv(vm),
537 				     DMA_RESV_USAGE_KERNEL,
538 				     false, MAX_SCHEDULE_TIMEOUT);
539 	if (wait <= 0) {
540 		err = -ETIME;
541 		goto out_unlock;
542 	}
543 
544 #define retry_required(__tries, __vm) \
545 	(IS_ENABLED(CONFIG_DRM_XE_USERPTR_INVAL_INJECT) ? \
546 	(!(__tries)++ || __xe_vm_userptr_needs_repin(__vm)) : \
547 	__xe_vm_userptr_needs_repin(__vm))
548 
549 	down_read(&vm->userptr.notifier_lock);
550 	if (retry_required(tries, vm)) {
551 		up_read(&vm->userptr.notifier_lock);
552 		err = -EAGAIN;
553 		goto out_unlock;
554 	}
555 
556 #undef retry_required
557 
558 	spin_lock(&vm->xe->ttm.lru_lock);
559 	ttm_lru_bulk_move_tail(&vm->lru_bulk_move);
560 	spin_unlock(&vm->xe->ttm.lru_lock);
561 
562 	/* Point of no return. */
563 	arm_preempt_fences(vm, &preempt_fences);
564 	resume_and_reinstall_preempt_fences(vm, &exec);
565 	up_read(&vm->userptr.notifier_lock);
566 
567 out_unlock:
568 	drm_exec_fini(&exec);
569 out_unlock_outer:
570 	if (err == -EAGAIN) {
571 		trace_xe_vm_rebind_worker_retry(vm);
572 		goto retry;
573 	}
574 
575 	if (err) {
576 		drm_warn(&vm->xe->drm, "VM worker error: %d\n", err);
577 		xe_vm_kill(vm, true);
578 	}
579 	up_write(&vm->lock);
580 
581 	free_preempt_fences(&preempt_fences);
582 
583 	trace_xe_vm_rebind_worker_exit(vm);
584 }
585 
586 static void __vma_userptr_invalidate(struct xe_vm *vm, struct xe_userptr_vma *uvma)
587 {
588 	struct xe_userptr *userptr = &uvma->userptr;
589 	struct xe_vma *vma = &uvma->vma;
590 	struct dma_resv_iter cursor;
591 	struct dma_fence *fence;
592 	long err;
593 
594 	/*
595 	 * Tell exec and rebind worker they need to repin and rebind this
596 	 * userptr.
597 	 */
598 	if (!xe_vm_in_fault_mode(vm) &&
599 	    !(vma->gpuva.flags & XE_VMA_DESTROYED)) {
600 		spin_lock(&vm->userptr.invalidated_lock);
601 		list_move_tail(&userptr->invalidate_link,
602 			       &vm->userptr.invalidated);
603 		spin_unlock(&vm->userptr.invalidated_lock);
604 	}
605 
606 	/*
607 	 * Preempt fences turn into schedule disables, pipeline these.
608 	 * Note that even in fault mode, we need to wait for binds and
609 	 * unbinds to complete, and those are attached as BOOKMARK fences
610 	 * to the vm.
611 	 */
612 	dma_resv_iter_begin(&cursor, xe_vm_resv(vm),
613 			    DMA_RESV_USAGE_BOOKKEEP);
614 	dma_resv_for_each_fence_unlocked(&cursor, fence)
615 		dma_fence_enable_sw_signaling(fence);
616 	dma_resv_iter_end(&cursor);
617 
618 	err = dma_resv_wait_timeout(xe_vm_resv(vm),
619 				    DMA_RESV_USAGE_BOOKKEEP,
620 				    false, MAX_SCHEDULE_TIMEOUT);
621 	XE_WARN_ON(err <= 0);
622 
623 	if (xe_vm_in_fault_mode(vm) && userptr->initial_bind) {
624 		err = xe_vm_invalidate_vma(vma);
625 		XE_WARN_ON(err);
626 	}
627 
628 	xe_hmm_userptr_unmap(uvma);
629 }
630 
631 static bool vma_userptr_invalidate(struct mmu_interval_notifier *mni,
632 				   const struct mmu_notifier_range *range,
633 				   unsigned long cur_seq)
634 {
635 	struct xe_userptr_vma *uvma = container_of(mni, typeof(*uvma), userptr.notifier);
636 	struct xe_vma *vma = &uvma->vma;
637 	struct xe_vm *vm = xe_vma_vm(vma);
638 
639 	xe_assert(vm->xe, xe_vma_is_userptr(vma));
640 	trace_xe_vma_userptr_invalidate(vma);
641 
642 	if (!mmu_notifier_range_blockable(range))
643 		return false;
644 
645 	vm_dbg(&xe_vma_vm(vma)->xe->drm,
646 	       "NOTIFIER: addr=0x%016llx, range=0x%016llx",
647 		xe_vma_start(vma), xe_vma_size(vma));
648 
649 	down_write(&vm->userptr.notifier_lock);
650 	mmu_interval_set_seq(mni, cur_seq);
651 
652 	__vma_userptr_invalidate(vm, uvma);
653 	up_write(&vm->userptr.notifier_lock);
654 	trace_xe_vma_userptr_invalidate_complete(vma);
655 
656 	return true;
657 }
658 
659 static const struct mmu_interval_notifier_ops vma_userptr_notifier_ops = {
660 	.invalidate = vma_userptr_invalidate,
661 };
662 
663 #if IS_ENABLED(CONFIG_DRM_XE_USERPTR_INVAL_INJECT)
664 /**
665  * xe_vma_userptr_force_invalidate() - force invalidate a userptr
666  * @uvma: The userptr vma to invalidate
667  *
668  * Perform a forced userptr invalidation for testing purposes.
669  */
670 void xe_vma_userptr_force_invalidate(struct xe_userptr_vma *uvma)
671 {
672 	struct xe_vm *vm = xe_vma_vm(&uvma->vma);
673 
674 	/* Protect against concurrent userptr pinning */
675 	lockdep_assert_held(&vm->lock);
676 	/* Protect against concurrent notifiers */
677 	lockdep_assert_held(&vm->userptr.notifier_lock);
678 	/*
679 	 * Protect against concurrent instances of this function and
680 	 * the critical exec sections
681 	 */
682 	xe_vm_assert_held(vm);
683 
684 	if (!mmu_interval_read_retry(&uvma->userptr.notifier,
685 				     uvma->userptr.notifier_seq))
686 		uvma->userptr.notifier_seq -= 2;
687 	__vma_userptr_invalidate(vm, uvma);
688 }
689 #endif
690 
691 int xe_vm_userptr_pin(struct xe_vm *vm)
692 {
693 	struct xe_userptr_vma *uvma, *next;
694 	int err = 0;
695 
696 	xe_assert(vm->xe, !xe_vm_in_fault_mode(vm));
697 	lockdep_assert_held_write(&vm->lock);
698 
699 	/* Collect invalidated userptrs */
700 	spin_lock(&vm->userptr.invalidated_lock);
701 	xe_assert(vm->xe, list_empty(&vm->userptr.repin_list));
702 	list_for_each_entry_safe(uvma, next, &vm->userptr.invalidated,
703 				 userptr.invalidate_link) {
704 		list_del_init(&uvma->userptr.invalidate_link);
705 		list_add_tail(&uvma->userptr.repin_link,
706 			      &vm->userptr.repin_list);
707 	}
708 	spin_unlock(&vm->userptr.invalidated_lock);
709 
710 	/* Pin and move to bind list */
711 	list_for_each_entry_safe(uvma, next, &vm->userptr.repin_list,
712 				 userptr.repin_link) {
713 		err = xe_vma_userptr_pin_pages(uvma);
714 		if (err == -EFAULT) {
715 			list_del_init(&uvma->userptr.repin_link);
716 			/*
717 			 * We might have already done the pin once already, but
718 			 * then had to retry before the re-bind happened, due
719 			 * some other condition in the caller, but in the
720 			 * meantime the userptr got dinged by the notifier such
721 			 * that we need to revalidate here, but this time we hit
722 			 * the EFAULT. In such a case make sure we remove
723 			 * ourselves from the rebind list to avoid going down in
724 			 * flames.
725 			 */
726 			if (!list_empty(&uvma->vma.combined_links.rebind))
727 				list_del_init(&uvma->vma.combined_links.rebind);
728 
729 			/* Wait for pending binds */
730 			xe_vm_lock(vm, false);
731 			dma_resv_wait_timeout(xe_vm_resv(vm),
732 					      DMA_RESV_USAGE_BOOKKEEP,
733 					      false, MAX_SCHEDULE_TIMEOUT);
734 
735 			down_read(&vm->userptr.notifier_lock);
736 			err = xe_vm_invalidate_vma(&uvma->vma);
737 			up_read(&vm->userptr.notifier_lock);
738 			xe_vm_unlock(vm);
739 			if (err)
740 				break;
741 		} else {
742 			if (err)
743 				break;
744 
745 			list_del_init(&uvma->userptr.repin_link);
746 			list_move_tail(&uvma->vma.combined_links.rebind,
747 				       &vm->rebind_list);
748 		}
749 	}
750 
751 	if (err) {
752 		down_write(&vm->userptr.notifier_lock);
753 		spin_lock(&vm->userptr.invalidated_lock);
754 		list_for_each_entry_safe(uvma, next, &vm->userptr.repin_list,
755 					 userptr.repin_link) {
756 			list_del_init(&uvma->userptr.repin_link);
757 			list_move_tail(&uvma->userptr.invalidate_link,
758 				       &vm->userptr.invalidated);
759 		}
760 		spin_unlock(&vm->userptr.invalidated_lock);
761 		up_write(&vm->userptr.notifier_lock);
762 	}
763 	return err;
764 }
765 
766 /**
767  * xe_vm_userptr_check_repin() - Check whether the VM might have userptrs
768  * that need repinning.
769  * @vm: The VM.
770  *
771  * This function does an advisory check for whether the VM has userptrs that
772  * need repinning.
773  *
774  * Return: 0 if there are no indications of userptrs needing repinning,
775  * -EAGAIN if there are.
776  */
777 int xe_vm_userptr_check_repin(struct xe_vm *vm)
778 {
779 	return (list_empty_careful(&vm->userptr.repin_list) &&
780 		list_empty_careful(&vm->userptr.invalidated)) ? 0 : -EAGAIN;
781 }
782 
783 static int xe_vma_ops_alloc(struct xe_vma_ops *vops, bool array_of_binds)
784 {
785 	int i;
786 
787 	for (i = 0; i < XE_MAX_TILES_PER_DEVICE; ++i) {
788 		if (!vops->pt_update_ops[i].num_ops)
789 			continue;
790 
791 		vops->pt_update_ops[i].ops =
792 			kmalloc_array(vops->pt_update_ops[i].num_ops,
793 				      sizeof(*vops->pt_update_ops[i].ops),
794 				      GFP_KERNEL | __GFP_RETRY_MAYFAIL | __GFP_NOWARN);
795 		if (!vops->pt_update_ops[i].ops)
796 			return array_of_binds ? -ENOBUFS : -ENOMEM;
797 	}
798 
799 	return 0;
800 }
801 ALLOW_ERROR_INJECTION(xe_vma_ops_alloc, ERRNO);
802 
803 static void xe_vma_svm_prefetch_op_fini(struct xe_vma_op *op)
804 {
805 	struct xe_vma *vma;
806 
807 	vma = gpuva_to_vma(op->base.prefetch.va);
808 
809 	if (op->base.op == DRM_GPUVA_OP_PREFETCH && xe_vma_is_cpu_addr_mirror(vma))
810 		xa_destroy(&op->prefetch_range.range);
811 }
812 
813 static void xe_vma_svm_prefetch_ops_fini(struct xe_vma_ops *vops)
814 {
815 	struct xe_vma_op *op;
816 
817 	if (!(vops->flags & XE_VMA_OPS_FLAG_HAS_SVM_PREFETCH))
818 		return;
819 
820 	list_for_each_entry(op, &vops->list, link)
821 		xe_vma_svm_prefetch_op_fini(op);
822 }
823 
824 static void xe_vma_ops_fini(struct xe_vma_ops *vops)
825 {
826 	int i;
827 
828 	xe_vma_svm_prefetch_ops_fini(vops);
829 
830 	for (i = 0; i < XE_MAX_TILES_PER_DEVICE; ++i)
831 		kfree(vops->pt_update_ops[i].ops);
832 }
833 
834 static void xe_vma_ops_incr_pt_update_ops(struct xe_vma_ops *vops, u8 tile_mask, int inc_val)
835 {
836 	int i;
837 
838 	if (!inc_val)
839 		return;
840 
841 	for (i = 0; i < XE_MAX_TILES_PER_DEVICE; ++i)
842 		if (BIT(i) & tile_mask)
843 			vops->pt_update_ops[i].num_ops += inc_val;
844 }
845 
846 static void xe_vm_populate_rebind(struct xe_vma_op *op, struct xe_vma *vma,
847 				  u8 tile_mask)
848 {
849 	INIT_LIST_HEAD(&op->link);
850 	op->tile_mask = tile_mask;
851 	op->base.op = DRM_GPUVA_OP_MAP;
852 	op->base.map.va.addr = vma->gpuva.va.addr;
853 	op->base.map.va.range = vma->gpuva.va.range;
854 	op->base.map.gem.obj = vma->gpuva.gem.obj;
855 	op->base.map.gem.offset = vma->gpuva.gem.offset;
856 	op->map.vma = vma;
857 	op->map.immediate = true;
858 	op->map.dumpable = vma->gpuva.flags & XE_VMA_DUMPABLE;
859 	op->map.is_null = xe_vma_is_null(vma);
860 }
861 
862 static int xe_vm_ops_add_rebind(struct xe_vma_ops *vops, struct xe_vma *vma,
863 				u8 tile_mask)
864 {
865 	struct xe_vma_op *op;
866 
867 	op = kzalloc(sizeof(*op), GFP_KERNEL);
868 	if (!op)
869 		return -ENOMEM;
870 
871 	xe_vm_populate_rebind(op, vma, tile_mask);
872 	list_add_tail(&op->link, &vops->list);
873 	xe_vma_ops_incr_pt_update_ops(vops, tile_mask, 1);
874 
875 	return 0;
876 }
877 
878 static struct dma_fence *ops_execute(struct xe_vm *vm,
879 				     struct xe_vma_ops *vops);
880 static void xe_vma_ops_init(struct xe_vma_ops *vops, struct xe_vm *vm,
881 			    struct xe_exec_queue *q,
882 			    struct xe_sync_entry *syncs, u32 num_syncs);
883 
884 int xe_vm_rebind(struct xe_vm *vm, bool rebind_worker)
885 {
886 	struct dma_fence *fence;
887 	struct xe_vma *vma, *next;
888 	struct xe_vma_ops vops;
889 	struct xe_vma_op *op, *next_op;
890 	int err, i;
891 
892 	lockdep_assert_held(&vm->lock);
893 	if ((xe_vm_in_lr_mode(vm) && !rebind_worker) ||
894 	    list_empty(&vm->rebind_list))
895 		return 0;
896 
897 	xe_vma_ops_init(&vops, vm, NULL, NULL, 0);
898 	for (i = 0; i < XE_MAX_TILES_PER_DEVICE; ++i)
899 		vops.pt_update_ops[i].wait_vm_bookkeep = true;
900 
901 	xe_vm_assert_held(vm);
902 	list_for_each_entry(vma, &vm->rebind_list, combined_links.rebind) {
903 		xe_assert(vm->xe, vma->tile_present);
904 
905 		if (rebind_worker)
906 			trace_xe_vma_rebind_worker(vma);
907 		else
908 			trace_xe_vma_rebind_exec(vma);
909 
910 		err = xe_vm_ops_add_rebind(&vops, vma,
911 					   vma->tile_present);
912 		if (err)
913 			goto free_ops;
914 	}
915 
916 	err = xe_vma_ops_alloc(&vops, false);
917 	if (err)
918 		goto free_ops;
919 
920 	fence = ops_execute(vm, &vops);
921 	if (IS_ERR(fence)) {
922 		err = PTR_ERR(fence);
923 	} else {
924 		dma_fence_put(fence);
925 		list_for_each_entry_safe(vma, next, &vm->rebind_list,
926 					 combined_links.rebind)
927 			list_del_init(&vma->combined_links.rebind);
928 	}
929 free_ops:
930 	list_for_each_entry_safe(op, next_op, &vops.list, link) {
931 		list_del(&op->link);
932 		kfree(op);
933 	}
934 	xe_vma_ops_fini(&vops);
935 
936 	return err;
937 }
938 
939 struct dma_fence *xe_vma_rebind(struct xe_vm *vm, struct xe_vma *vma, u8 tile_mask)
940 {
941 	struct dma_fence *fence = NULL;
942 	struct xe_vma_ops vops;
943 	struct xe_vma_op *op, *next_op;
944 	struct xe_tile *tile;
945 	u8 id;
946 	int err;
947 
948 	lockdep_assert_held(&vm->lock);
949 	xe_vm_assert_held(vm);
950 	xe_assert(vm->xe, xe_vm_in_fault_mode(vm));
951 
952 	xe_vma_ops_init(&vops, vm, NULL, NULL, 0);
953 	for_each_tile(tile, vm->xe, id) {
954 		vops.pt_update_ops[id].wait_vm_bookkeep = true;
955 		vops.pt_update_ops[tile->id].q =
956 			xe_migrate_exec_queue(tile->migrate);
957 	}
958 
959 	err = xe_vm_ops_add_rebind(&vops, vma, tile_mask);
960 	if (err)
961 		return ERR_PTR(err);
962 
963 	err = xe_vma_ops_alloc(&vops, false);
964 	if (err) {
965 		fence = ERR_PTR(err);
966 		goto free_ops;
967 	}
968 
969 	fence = ops_execute(vm, &vops);
970 
971 free_ops:
972 	list_for_each_entry_safe(op, next_op, &vops.list, link) {
973 		list_del(&op->link);
974 		kfree(op);
975 	}
976 	xe_vma_ops_fini(&vops);
977 
978 	return fence;
979 }
980 
981 static void xe_vm_populate_range_rebind(struct xe_vma_op *op,
982 					struct xe_vma *vma,
983 					struct xe_svm_range *range,
984 					u8 tile_mask)
985 {
986 	INIT_LIST_HEAD(&op->link);
987 	op->tile_mask = tile_mask;
988 	op->base.op = DRM_GPUVA_OP_DRIVER;
989 	op->subop = XE_VMA_SUBOP_MAP_RANGE;
990 	op->map_range.vma = vma;
991 	op->map_range.range = range;
992 }
993 
994 static int
995 xe_vm_ops_add_range_rebind(struct xe_vma_ops *vops,
996 			   struct xe_vma *vma,
997 			   struct xe_svm_range *range,
998 			   u8 tile_mask)
999 {
1000 	struct xe_vma_op *op;
1001 
1002 	op = kzalloc(sizeof(*op), GFP_KERNEL);
1003 	if (!op)
1004 		return -ENOMEM;
1005 
1006 	xe_vm_populate_range_rebind(op, vma, range, tile_mask);
1007 	list_add_tail(&op->link, &vops->list);
1008 	xe_vma_ops_incr_pt_update_ops(vops, tile_mask, 1);
1009 
1010 	return 0;
1011 }
1012 
1013 /**
1014  * xe_vm_range_rebind() - VM range (re)bind
1015  * @vm: The VM which the range belongs to.
1016  * @vma: The VMA which the range belongs to.
1017  * @range: SVM range to rebind.
1018  * @tile_mask: Tile mask to bind the range to.
1019  *
1020  * (re)bind SVM range setting up GPU page tables for the range.
1021  *
1022  * Return: dma fence for rebind to signal completion on succees, ERR_PTR on
1023  * failure
1024  */
1025 struct dma_fence *xe_vm_range_rebind(struct xe_vm *vm,
1026 				     struct xe_vma *vma,
1027 				     struct xe_svm_range *range,
1028 				     u8 tile_mask)
1029 {
1030 	struct dma_fence *fence = NULL;
1031 	struct xe_vma_ops vops;
1032 	struct xe_vma_op *op, *next_op;
1033 	struct xe_tile *tile;
1034 	u8 id;
1035 	int err;
1036 
1037 	lockdep_assert_held(&vm->lock);
1038 	xe_vm_assert_held(vm);
1039 	xe_assert(vm->xe, xe_vm_in_fault_mode(vm));
1040 	xe_assert(vm->xe, xe_vma_is_cpu_addr_mirror(vma));
1041 
1042 	xe_vma_ops_init(&vops, vm, NULL, NULL, 0);
1043 	for_each_tile(tile, vm->xe, id) {
1044 		vops.pt_update_ops[id].wait_vm_bookkeep = true;
1045 		vops.pt_update_ops[tile->id].q =
1046 			xe_migrate_exec_queue(tile->migrate);
1047 	}
1048 
1049 	err = xe_vm_ops_add_range_rebind(&vops, vma, range, tile_mask);
1050 	if (err)
1051 		return ERR_PTR(err);
1052 
1053 	err = xe_vma_ops_alloc(&vops, false);
1054 	if (err) {
1055 		fence = ERR_PTR(err);
1056 		goto free_ops;
1057 	}
1058 
1059 	fence = ops_execute(vm, &vops);
1060 
1061 free_ops:
1062 	list_for_each_entry_safe(op, next_op, &vops.list, link) {
1063 		list_del(&op->link);
1064 		kfree(op);
1065 	}
1066 	xe_vma_ops_fini(&vops);
1067 
1068 	return fence;
1069 }
1070 
1071 static void xe_vm_populate_range_unbind(struct xe_vma_op *op,
1072 					struct xe_svm_range *range)
1073 {
1074 	INIT_LIST_HEAD(&op->link);
1075 	op->tile_mask = range->tile_present;
1076 	op->base.op = DRM_GPUVA_OP_DRIVER;
1077 	op->subop = XE_VMA_SUBOP_UNMAP_RANGE;
1078 	op->unmap_range.range = range;
1079 }
1080 
1081 static int
1082 xe_vm_ops_add_range_unbind(struct xe_vma_ops *vops,
1083 			   struct xe_svm_range *range)
1084 {
1085 	struct xe_vma_op *op;
1086 
1087 	op = kzalloc(sizeof(*op), GFP_KERNEL);
1088 	if (!op)
1089 		return -ENOMEM;
1090 
1091 	xe_vm_populate_range_unbind(op, range);
1092 	list_add_tail(&op->link, &vops->list);
1093 	xe_vma_ops_incr_pt_update_ops(vops, range->tile_present, 1);
1094 
1095 	return 0;
1096 }
1097 
1098 /**
1099  * xe_vm_range_unbind() - VM range unbind
1100  * @vm: The VM which the range belongs to.
1101  * @range: SVM range to rebind.
1102  *
1103  * Unbind SVM range removing the GPU page tables for the range.
1104  *
1105  * Return: dma fence for unbind to signal completion on succees, ERR_PTR on
1106  * failure
1107  */
1108 struct dma_fence *xe_vm_range_unbind(struct xe_vm *vm,
1109 				     struct xe_svm_range *range)
1110 {
1111 	struct dma_fence *fence = NULL;
1112 	struct xe_vma_ops vops;
1113 	struct xe_vma_op *op, *next_op;
1114 	struct xe_tile *tile;
1115 	u8 id;
1116 	int err;
1117 
1118 	lockdep_assert_held(&vm->lock);
1119 	xe_vm_assert_held(vm);
1120 	xe_assert(vm->xe, xe_vm_in_fault_mode(vm));
1121 
1122 	if (!range->tile_present)
1123 		return dma_fence_get_stub();
1124 
1125 	xe_vma_ops_init(&vops, vm, NULL, NULL, 0);
1126 	for_each_tile(tile, vm->xe, id) {
1127 		vops.pt_update_ops[id].wait_vm_bookkeep = true;
1128 		vops.pt_update_ops[tile->id].q =
1129 			xe_migrate_exec_queue(tile->migrate);
1130 	}
1131 
1132 	err = xe_vm_ops_add_range_unbind(&vops, range);
1133 	if (err)
1134 		return ERR_PTR(err);
1135 
1136 	err = xe_vma_ops_alloc(&vops, false);
1137 	if (err) {
1138 		fence = ERR_PTR(err);
1139 		goto free_ops;
1140 	}
1141 
1142 	fence = ops_execute(vm, &vops);
1143 
1144 free_ops:
1145 	list_for_each_entry_safe(op, next_op, &vops.list, link) {
1146 		list_del(&op->link);
1147 		kfree(op);
1148 	}
1149 	xe_vma_ops_fini(&vops);
1150 
1151 	return fence;
1152 }
1153 
1154 static void xe_vma_free(struct xe_vma *vma)
1155 {
1156 	if (xe_vma_is_userptr(vma))
1157 		kfree(to_userptr_vma(vma));
1158 	else
1159 		kfree(vma);
1160 }
1161 
1162 #define VMA_CREATE_FLAG_READ_ONLY		BIT(0)
1163 #define VMA_CREATE_FLAG_IS_NULL			BIT(1)
1164 #define VMA_CREATE_FLAG_DUMPABLE		BIT(2)
1165 #define VMA_CREATE_FLAG_IS_SYSTEM_ALLOCATOR	BIT(3)
1166 
1167 static struct xe_vma *xe_vma_create(struct xe_vm *vm,
1168 				    struct xe_bo *bo,
1169 				    u64 bo_offset_or_userptr,
1170 				    u64 start, u64 end,
1171 				    u16 pat_index, unsigned int flags)
1172 {
1173 	struct xe_vma *vma;
1174 	struct xe_tile *tile;
1175 	u8 id;
1176 	bool read_only = (flags & VMA_CREATE_FLAG_READ_ONLY);
1177 	bool is_null = (flags & VMA_CREATE_FLAG_IS_NULL);
1178 	bool dumpable = (flags & VMA_CREATE_FLAG_DUMPABLE);
1179 	bool is_cpu_addr_mirror =
1180 		(flags & VMA_CREATE_FLAG_IS_SYSTEM_ALLOCATOR);
1181 
1182 	xe_assert(vm->xe, start < end);
1183 	xe_assert(vm->xe, end < vm->size);
1184 
1185 	/*
1186 	 * Allocate and ensure that the xe_vma_is_userptr() return
1187 	 * matches what was allocated.
1188 	 */
1189 	if (!bo && !is_null && !is_cpu_addr_mirror) {
1190 		struct xe_userptr_vma *uvma = kzalloc(sizeof(*uvma), GFP_KERNEL);
1191 
1192 		if (!uvma)
1193 			return ERR_PTR(-ENOMEM);
1194 
1195 		vma = &uvma->vma;
1196 	} else {
1197 		vma = kzalloc(sizeof(*vma), GFP_KERNEL);
1198 		if (!vma)
1199 			return ERR_PTR(-ENOMEM);
1200 
1201 		if (is_cpu_addr_mirror)
1202 			vma->gpuva.flags |= XE_VMA_SYSTEM_ALLOCATOR;
1203 		if (is_null)
1204 			vma->gpuva.flags |= DRM_GPUVA_SPARSE;
1205 		if (bo)
1206 			vma->gpuva.gem.obj = &bo->ttm.base;
1207 	}
1208 
1209 	INIT_LIST_HEAD(&vma->combined_links.rebind);
1210 
1211 	INIT_LIST_HEAD(&vma->gpuva.gem.entry);
1212 	vma->gpuva.vm = &vm->gpuvm;
1213 	vma->gpuva.va.addr = start;
1214 	vma->gpuva.va.range = end - start + 1;
1215 	if (read_only)
1216 		vma->gpuva.flags |= XE_VMA_READ_ONLY;
1217 	if (dumpable)
1218 		vma->gpuva.flags |= XE_VMA_DUMPABLE;
1219 
1220 	for_each_tile(tile, vm->xe, id)
1221 		vma->tile_mask |= 0x1 << id;
1222 
1223 	if (vm->xe->info.has_atomic_enable_pte_bit)
1224 		vma->gpuva.flags |= XE_VMA_ATOMIC_PTE_BIT;
1225 
1226 	vma->pat_index = pat_index;
1227 
1228 	if (bo) {
1229 		struct drm_gpuvm_bo *vm_bo;
1230 
1231 		xe_bo_assert_held(bo);
1232 
1233 		vm_bo = drm_gpuvm_bo_obtain(vma->gpuva.vm, &bo->ttm.base);
1234 		if (IS_ERR(vm_bo)) {
1235 			xe_vma_free(vma);
1236 			return ERR_CAST(vm_bo);
1237 		}
1238 
1239 		drm_gpuvm_bo_extobj_add(vm_bo);
1240 		drm_gem_object_get(&bo->ttm.base);
1241 		vma->gpuva.gem.offset = bo_offset_or_userptr;
1242 		drm_gpuva_link(&vma->gpuva, vm_bo);
1243 		drm_gpuvm_bo_put(vm_bo);
1244 	} else /* userptr or null */ {
1245 		if (!is_null && !is_cpu_addr_mirror) {
1246 			struct xe_userptr *userptr = &to_userptr_vma(vma)->userptr;
1247 			u64 size = end - start + 1;
1248 			int err;
1249 
1250 			INIT_LIST_HEAD(&userptr->invalidate_link);
1251 			INIT_LIST_HEAD(&userptr->repin_link);
1252 			vma->gpuva.gem.offset = bo_offset_or_userptr;
1253 			mutex_init(&userptr->unmap_mutex);
1254 
1255 			err = mmu_interval_notifier_insert(&userptr->notifier,
1256 							   current->mm,
1257 							   xe_vma_userptr(vma), size,
1258 							   &vma_userptr_notifier_ops);
1259 			if (err) {
1260 				xe_vma_free(vma);
1261 				return ERR_PTR(err);
1262 			}
1263 
1264 			userptr->notifier_seq = LONG_MAX;
1265 		}
1266 
1267 		xe_vm_get(vm);
1268 	}
1269 
1270 	return vma;
1271 }
1272 
1273 static void xe_vma_destroy_late(struct xe_vma *vma)
1274 {
1275 	struct xe_vm *vm = xe_vma_vm(vma);
1276 
1277 	if (vma->ufence) {
1278 		xe_sync_ufence_put(vma->ufence);
1279 		vma->ufence = NULL;
1280 	}
1281 
1282 	if (xe_vma_is_userptr(vma)) {
1283 		struct xe_userptr_vma *uvma = to_userptr_vma(vma);
1284 		struct xe_userptr *userptr = &uvma->userptr;
1285 
1286 		if (userptr->sg)
1287 			xe_hmm_userptr_free_sg(uvma);
1288 
1289 		/*
1290 		 * Since userptr pages are not pinned, we can't remove
1291 		 * the notifier until we're sure the GPU is not accessing
1292 		 * them anymore
1293 		 */
1294 		mmu_interval_notifier_remove(&userptr->notifier);
1295 		mutex_destroy(&userptr->unmap_mutex);
1296 		xe_vm_put(vm);
1297 	} else if (xe_vma_is_null(vma) || xe_vma_is_cpu_addr_mirror(vma)) {
1298 		xe_vm_put(vm);
1299 	} else {
1300 		xe_bo_put(xe_vma_bo(vma));
1301 	}
1302 
1303 	xe_vma_free(vma);
1304 }
1305 
1306 static void vma_destroy_work_func(struct work_struct *w)
1307 {
1308 	struct xe_vma *vma =
1309 		container_of(w, struct xe_vma, destroy_work);
1310 
1311 	xe_vma_destroy_late(vma);
1312 }
1313 
1314 static void vma_destroy_cb(struct dma_fence *fence,
1315 			   struct dma_fence_cb *cb)
1316 {
1317 	struct xe_vma *vma = container_of(cb, struct xe_vma, destroy_cb);
1318 
1319 	INIT_WORK(&vma->destroy_work, vma_destroy_work_func);
1320 	queue_work(system_unbound_wq, &vma->destroy_work);
1321 }
1322 
1323 static void xe_vma_destroy(struct xe_vma *vma, struct dma_fence *fence)
1324 {
1325 	struct xe_vm *vm = xe_vma_vm(vma);
1326 
1327 	lockdep_assert_held_write(&vm->lock);
1328 	xe_assert(vm->xe, list_empty(&vma->combined_links.destroy));
1329 
1330 	if (xe_vma_is_userptr(vma)) {
1331 		xe_assert(vm->xe, vma->gpuva.flags & XE_VMA_DESTROYED);
1332 
1333 		spin_lock(&vm->userptr.invalidated_lock);
1334 		xe_assert(vm->xe, list_empty(&to_userptr_vma(vma)->userptr.repin_link));
1335 		list_del(&to_userptr_vma(vma)->userptr.invalidate_link);
1336 		spin_unlock(&vm->userptr.invalidated_lock);
1337 	} else if (!xe_vma_is_null(vma) && !xe_vma_is_cpu_addr_mirror(vma)) {
1338 		xe_bo_assert_held(xe_vma_bo(vma));
1339 
1340 		drm_gpuva_unlink(&vma->gpuva);
1341 	}
1342 
1343 	xe_vm_assert_held(vm);
1344 	if (fence) {
1345 		int ret = dma_fence_add_callback(fence, &vma->destroy_cb,
1346 						 vma_destroy_cb);
1347 
1348 		if (ret) {
1349 			XE_WARN_ON(ret != -ENOENT);
1350 			xe_vma_destroy_late(vma);
1351 		}
1352 	} else {
1353 		xe_vma_destroy_late(vma);
1354 	}
1355 }
1356 
1357 /**
1358  * xe_vm_lock_vma() - drm_exec utility to lock a vma
1359  * @exec: The drm_exec object we're currently locking for.
1360  * @vma: The vma for witch we want to lock the vm resv and any attached
1361  * object's resv.
1362  *
1363  * Return: 0 on success, negative error code on error. In particular
1364  * may return -EDEADLK on WW transaction contention and -EINTR if
1365  * an interruptible wait is terminated by a signal.
1366  */
1367 int xe_vm_lock_vma(struct drm_exec *exec, struct xe_vma *vma)
1368 {
1369 	struct xe_vm *vm = xe_vma_vm(vma);
1370 	struct xe_bo *bo = xe_vma_bo(vma);
1371 	int err;
1372 
1373 	XE_WARN_ON(!vm);
1374 
1375 	err = drm_exec_lock_obj(exec, xe_vm_obj(vm));
1376 	if (!err && bo && !bo->vm)
1377 		err = drm_exec_lock_obj(exec, &bo->ttm.base);
1378 
1379 	return err;
1380 }
1381 
1382 static void xe_vma_destroy_unlocked(struct xe_vma *vma)
1383 {
1384 	struct drm_exec exec;
1385 	int err;
1386 
1387 	drm_exec_init(&exec, 0, 0);
1388 	drm_exec_until_all_locked(&exec) {
1389 		err = xe_vm_lock_vma(&exec, vma);
1390 		drm_exec_retry_on_contention(&exec);
1391 		if (XE_WARN_ON(err))
1392 			break;
1393 	}
1394 
1395 	xe_vma_destroy(vma, NULL);
1396 
1397 	drm_exec_fini(&exec);
1398 }
1399 
1400 struct xe_vma *
1401 xe_vm_find_overlapping_vma(struct xe_vm *vm, u64 start, u64 range)
1402 {
1403 	struct drm_gpuva *gpuva;
1404 
1405 	lockdep_assert_held(&vm->lock);
1406 
1407 	if (xe_vm_is_closed_or_banned(vm))
1408 		return NULL;
1409 
1410 	xe_assert(vm->xe, start + range <= vm->size);
1411 
1412 	gpuva = drm_gpuva_find_first(&vm->gpuvm, start, range);
1413 
1414 	return gpuva ? gpuva_to_vma(gpuva) : NULL;
1415 }
1416 
1417 static int xe_vm_insert_vma(struct xe_vm *vm, struct xe_vma *vma)
1418 {
1419 	int err;
1420 
1421 	xe_assert(vm->xe, xe_vma_vm(vma) == vm);
1422 	lockdep_assert_held(&vm->lock);
1423 
1424 	mutex_lock(&vm->snap_mutex);
1425 	err = drm_gpuva_insert(&vm->gpuvm, &vma->gpuva);
1426 	mutex_unlock(&vm->snap_mutex);
1427 	XE_WARN_ON(err);	/* Shouldn't be possible */
1428 
1429 	return err;
1430 }
1431 
1432 static void xe_vm_remove_vma(struct xe_vm *vm, struct xe_vma *vma)
1433 {
1434 	xe_assert(vm->xe, xe_vma_vm(vma) == vm);
1435 	lockdep_assert_held(&vm->lock);
1436 
1437 	mutex_lock(&vm->snap_mutex);
1438 	drm_gpuva_remove(&vma->gpuva);
1439 	mutex_unlock(&vm->snap_mutex);
1440 	if (vm->usm.last_fault_vma == vma)
1441 		vm->usm.last_fault_vma = NULL;
1442 }
1443 
1444 static struct drm_gpuva_op *xe_vm_op_alloc(void)
1445 {
1446 	struct xe_vma_op *op;
1447 
1448 	op = kzalloc(sizeof(*op), GFP_KERNEL);
1449 
1450 	if (unlikely(!op))
1451 		return NULL;
1452 
1453 	return &op->base;
1454 }
1455 
1456 static void xe_vm_free(struct drm_gpuvm *gpuvm);
1457 
1458 static const struct drm_gpuvm_ops gpuvm_ops = {
1459 	.op_alloc = xe_vm_op_alloc,
1460 	.vm_bo_validate = xe_gpuvm_validate,
1461 	.vm_free = xe_vm_free,
1462 };
1463 
1464 static u64 pde_encode_pat_index(u16 pat_index)
1465 {
1466 	u64 pte = 0;
1467 
1468 	if (pat_index & BIT(0))
1469 		pte |= XE_PPGTT_PTE_PAT0;
1470 
1471 	if (pat_index & BIT(1))
1472 		pte |= XE_PPGTT_PTE_PAT1;
1473 
1474 	return pte;
1475 }
1476 
1477 static u64 pte_encode_pat_index(u16 pat_index, u32 pt_level)
1478 {
1479 	u64 pte = 0;
1480 
1481 	if (pat_index & BIT(0))
1482 		pte |= XE_PPGTT_PTE_PAT0;
1483 
1484 	if (pat_index & BIT(1))
1485 		pte |= XE_PPGTT_PTE_PAT1;
1486 
1487 	if (pat_index & BIT(2)) {
1488 		if (pt_level)
1489 			pte |= XE_PPGTT_PDE_PDPE_PAT2;
1490 		else
1491 			pte |= XE_PPGTT_PTE_PAT2;
1492 	}
1493 
1494 	if (pat_index & BIT(3))
1495 		pte |= XELPG_PPGTT_PTE_PAT3;
1496 
1497 	if (pat_index & (BIT(4)))
1498 		pte |= XE2_PPGTT_PTE_PAT4;
1499 
1500 	return pte;
1501 }
1502 
1503 static u64 pte_encode_ps(u32 pt_level)
1504 {
1505 	XE_WARN_ON(pt_level > MAX_HUGEPTE_LEVEL);
1506 
1507 	if (pt_level == 1)
1508 		return XE_PDE_PS_2M;
1509 	else if (pt_level == 2)
1510 		return XE_PDPE_PS_1G;
1511 
1512 	return 0;
1513 }
1514 
1515 static u16 pde_pat_index(struct xe_bo *bo)
1516 {
1517 	struct xe_device *xe = xe_bo_device(bo);
1518 	u16 pat_index;
1519 
1520 	/*
1521 	 * We only have two bits to encode the PAT index in non-leaf nodes, but
1522 	 * these only point to other paging structures so we only need a minimal
1523 	 * selection of options. The user PAT index is only for encoding leaf
1524 	 * nodes, where we have use of more bits to do the encoding. The
1525 	 * non-leaf nodes are instead under driver control so the chosen index
1526 	 * here should be distict from the user PAT index. Also the
1527 	 * corresponding coherency of the PAT index should be tied to the
1528 	 * allocation type of the page table (or at least we should pick
1529 	 * something which is always safe).
1530 	 */
1531 	if (!xe_bo_is_vram(bo) && bo->ttm.ttm->caching == ttm_cached)
1532 		pat_index = xe->pat.idx[XE_CACHE_WB];
1533 	else
1534 		pat_index = xe->pat.idx[XE_CACHE_NONE];
1535 
1536 	xe_assert(xe, pat_index <= 3);
1537 
1538 	return pat_index;
1539 }
1540 
1541 static u64 xelp_pde_encode_bo(struct xe_bo *bo, u64 bo_offset)
1542 {
1543 	u64 pde;
1544 
1545 	pde = xe_bo_addr(bo, bo_offset, XE_PAGE_SIZE);
1546 	pde |= XE_PAGE_PRESENT | XE_PAGE_RW;
1547 	pde |= pde_encode_pat_index(pde_pat_index(bo));
1548 
1549 	return pde;
1550 }
1551 
1552 static u64 xelp_pte_encode_bo(struct xe_bo *bo, u64 bo_offset,
1553 			      u16 pat_index, u32 pt_level)
1554 {
1555 	u64 pte;
1556 
1557 	pte = xe_bo_addr(bo, bo_offset, XE_PAGE_SIZE);
1558 	pte |= XE_PAGE_PRESENT | XE_PAGE_RW;
1559 	pte |= pte_encode_pat_index(pat_index, pt_level);
1560 	pte |= pte_encode_ps(pt_level);
1561 
1562 	if (xe_bo_is_vram(bo) || xe_bo_is_stolen_devmem(bo))
1563 		pte |= XE_PPGTT_PTE_DM;
1564 
1565 	return pte;
1566 }
1567 
1568 static u64 xelp_pte_encode_vma(u64 pte, struct xe_vma *vma,
1569 			       u16 pat_index, u32 pt_level)
1570 {
1571 	pte |= XE_PAGE_PRESENT;
1572 
1573 	if (likely(!xe_vma_read_only(vma)))
1574 		pte |= XE_PAGE_RW;
1575 
1576 	pte |= pte_encode_pat_index(pat_index, pt_level);
1577 	pte |= pte_encode_ps(pt_level);
1578 
1579 	if (unlikely(xe_vma_is_null(vma)))
1580 		pte |= XE_PTE_NULL;
1581 
1582 	return pte;
1583 }
1584 
1585 static u64 xelp_pte_encode_addr(struct xe_device *xe, u64 addr,
1586 				u16 pat_index,
1587 				u32 pt_level, bool devmem, u64 flags)
1588 {
1589 	u64 pte;
1590 
1591 	/* Avoid passing random bits directly as flags */
1592 	xe_assert(xe, !(flags & ~XE_PTE_PS64));
1593 
1594 	pte = addr;
1595 	pte |= XE_PAGE_PRESENT | XE_PAGE_RW;
1596 	pte |= pte_encode_pat_index(pat_index, pt_level);
1597 	pte |= pte_encode_ps(pt_level);
1598 
1599 	if (devmem)
1600 		pte |= XE_PPGTT_PTE_DM;
1601 
1602 	pte |= flags;
1603 
1604 	return pte;
1605 }
1606 
1607 static const struct xe_pt_ops xelp_pt_ops = {
1608 	.pte_encode_bo = xelp_pte_encode_bo,
1609 	.pte_encode_vma = xelp_pte_encode_vma,
1610 	.pte_encode_addr = xelp_pte_encode_addr,
1611 	.pde_encode_bo = xelp_pde_encode_bo,
1612 };
1613 
1614 static void vm_destroy_work_func(struct work_struct *w);
1615 
1616 /**
1617  * xe_vm_create_scratch() - Setup a scratch memory pagetable tree for the
1618  * given tile and vm.
1619  * @xe: xe device.
1620  * @tile: tile to set up for.
1621  * @vm: vm to set up for.
1622  *
1623  * Sets up a pagetable tree with one page-table per level and a single
1624  * leaf PTE. All pagetable entries point to the single page-table or,
1625  * for MAX_HUGEPTE_LEVEL, a NULL huge PTE returning 0 on read and
1626  * writes become NOPs.
1627  *
1628  * Return: 0 on success, negative error code on error.
1629  */
1630 static int xe_vm_create_scratch(struct xe_device *xe, struct xe_tile *tile,
1631 				struct xe_vm *vm)
1632 {
1633 	u8 id = tile->id;
1634 	int i;
1635 
1636 	for (i = MAX_HUGEPTE_LEVEL; i < vm->pt_root[id]->level; i++) {
1637 		vm->scratch_pt[id][i] = xe_pt_create(vm, tile, i);
1638 		if (IS_ERR(vm->scratch_pt[id][i]))
1639 			return PTR_ERR(vm->scratch_pt[id][i]);
1640 
1641 		xe_pt_populate_empty(tile, vm, vm->scratch_pt[id][i]);
1642 	}
1643 
1644 	return 0;
1645 }
1646 ALLOW_ERROR_INJECTION(xe_vm_create_scratch, ERRNO);
1647 
1648 static void xe_vm_free_scratch(struct xe_vm *vm)
1649 {
1650 	struct xe_tile *tile;
1651 	u8 id;
1652 
1653 	if (!xe_vm_has_scratch(vm))
1654 		return;
1655 
1656 	for_each_tile(tile, vm->xe, id) {
1657 		u32 i;
1658 
1659 		if (!vm->pt_root[id])
1660 			continue;
1661 
1662 		for (i = MAX_HUGEPTE_LEVEL; i < vm->pt_root[id]->level; ++i)
1663 			if (vm->scratch_pt[id][i])
1664 				xe_pt_destroy(vm->scratch_pt[id][i], vm->flags, NULL);
1665 	}
1666 }
1667 
1668 struct xe_vm *xe_vm_create(struct xe_device *xe, u32 flags, struct xe_file *xef)
1669 {
1670 	struct drm_gem_object *vm_resv_obj;
1671 	struct xe_vm *vm;
1672 	int err, number_tiles = 0;
1673 	struct xe_tile *tile;
1674 	u8 id;
1675 
1676 	/*
1677 	 * Since the GSCCS is not user-accessible, we don't expect a GSC VM to
1678 	 * ever be in faulting mode.
1679 	 */
1680 	xe_assert(xe, !((flags & XE_VM_FLAG_GSC) && (flags & XE_VM_FLAG_FAULT_MODE)));
1681 
1682 	vm = kzalloc(sizeof(*vm), GFP_KERNEL);
1683 	if (!vm)
1684 		return ERR_PTR(-ENOMEM);
1685 
1686 	vm->xe = xe;
1687 
1688 	vm->size = 1ull << xe->info.va_bits;
1689 	vm->flags = flags;
1690 
1691 	if (xef)
1692 		vm->xef = xe_file_get(xef);
1693 	/**
1694 	 * GSC VMs are kernel-owned, only used for PXP ops and can sometimes be
1695 	 * manipulated under the PXP mutex. However, the PXP mutex can be taken
1696 	 * under a user-VM lock when the PXP session is started at exec_queue
1697 	 * creation time. Those are different VMs and therefore there is no risk
1698 	 * of deadlock, but we need to tell lockdep that this is the case or it
1699 	 * will print a warning.
1700 	 */
1701 	if (flags & XE_VM_FLAG_GSC) {
1702 		static struct lock_class_key gsc_vm_key;
1703 
1704 		__init_rwsem(&vm->lock, "gsc_vm", &gsc_vm_key);
1705 	} else {
1706 		init_rwsem(&vm->lock);
1707 	}
1708 	mutex_init(&vm->snap_mutex);
1709 
1710 	INIT_LIST_HEAD(&vm->rebind_list);
1711 
1712 	INIT_LIST_HEAD(&vm->userptr.repin_list);
1713 	INIT_LIST_HEAD(&vm->userptr.invalidated);
1714 	init_rwsem(&vm->userptr.notifier_lock);
1715 	spin_lock_init(&vm->userptr.invalidated_lock);
1716 
1717 	ttm_lru_bulk_move_init(&vm->lru_bulk_move);
1718 
1719 	INIT_WORK(&vm->destroy_work, vm_destroy_work_func);
1720 
1721 	INIT_LIST_HEAD(&vm->preempt.exec_queues);
1722 	vm->preempt.min_run_period_ms = 10;	/* FIXME: Wire up to uAPI */
1723 
1724 	for_each_tile(tile, xe, id)
1725 		xe_range_fence_tree_init(&vm->rftree[id]);
1726 
1727 	vm->pt_ops = &xelp_pt_ops;
1728 
1729 	/*
1730 	 * Long-running workloads are not protected by the scheduler references.
1731 	 * By design, run_job for long-running workloads returns NULL and the
1732 	 * scheduler drops all the references of it, hence protecting the VM
1733 	 * for this case is necessary.
1734 	 */
1735 	if (flags & XE_VM_FLAG_LR_MODE) {
1736 		INIT_WORK(&vm->preempt.rebind_work, preempt_rebind_work_func);
1737 		xe_pm_runtime_get_noresume(xe);
1738 	}
1739 
1740 	if (flags & XE_VM_FLAG_FAULT_MODE) {
1741 		err = xe_svm_init(vm);
1742 		if (err)
1743 			goto err_no_resv;
1744 	}
1745 
1746 	vm_resv_obj = drm_gpuvm_resv_object_alloc(&xe->drm);
1747 	if (!vm_resv_obj) {
1748 		err = -ENOMEM;
1749 		goto err_svm_fini;
1750 	}
1751 
1752 	drm_gpuvm_init(&vm->gpuvm, "Xe VM", DRM_GPUVM_RESV_PROTECTED, &xe->drm,
1753 		       vm_resv_obj, 0, vm->size, 0, 0, &gpuvm_ops);
1754 
1755 	drm_gem_object_put(vm_resv_obj);
1756 
1757 	err = xe_vm_lock(vm, true);
1758 	if (err)
1759 		goto err_close;
1760 
1761 	if (IS_DGFX(xe) && xe->info.vram_flags & XE_VRAM_FLAGS_NEED64K)
1762 		vm->flags |= XE_VM_FLAG_64K;
1763 
1764 	for_each_tile(tile, xe, id) {
1765 		if (flags & XE_VM_FLAG_MIGRATION &&
1766 		    tile->id != XE_VM_FLAG_TILE_ID(flags))
1767 			continue;
1768 
1769 		vm->pt_root[id] = xe_pt_create(vm, tile, xe->info.vm_max_level);
1770 		if (IS_ERR(vm->pt_root[id])) {
1771 			err = PTR_ERR(vm->pt_root[id]);
1772 			vm->pt_root[id] = NULL;
1773 			goto err_unlock_close;
1774 		}
1775 	}
1776 
1777 	if (xe_vm_has_scratch(vm)) {
1778 		for_each_tile(tile, xe, id) {
1779 			if (!vm->pt_root[id])
1780 				continue;
1781 
1782 			err = xe_vm_create_scratch(xe, tile, vm);
1783 			if (err)
1784 				goto err_unlock_close;
1785 		}
1786 		vm->batch_invalidate_tlb = true;
1787 	}
1788 
1789 	if (vm->flags & XE_VM_FLAG_LR_MODE)
1790 		vm->batch_invalidate_tlb = false;
1791 
1792 	/* Fill pt_root after allocating scratch tables */
1793 	for_each_tile(tile, xe, id) {
1794 		if (!vm->pt_root[id])
1795 			continue;
1796 
1797 		xe_pt_populate_empty(tile, vm, vm->pt_root[id]);
1798 	}
1799 	xe_vm_unlock(vm);
1800 
1801 	/* Kernel migration VM shouldn't have a circular loop.. */
1802 	if (!(flags & XE_VM_FLAG_MIGRATION)) {
1803 		for_each_tile(tile, xe, id) {
1804 			struct xe_exec_queue *q;
1805 			u32 create_flags = EXEC_QUEUE_FLAG_VM;
1806 
1807 			if (!vm->pt_root[id])
1808 				continue;
1809 
1810 			q = xe_exec_queue_create_bind(xe, tile, create_flags, 0);
1811 			if (IS_ERR(q)) {
1812 				err = PTR_ERR(q);
1813 				goto err_close;
1814 			}
1815 			vm->q[id] = q;
1816 			number_tiles++;
1817 		}
1818 	}
1819 
1820 	if (number_tiles > 1)
1821 		vm->composite_fence_ctx = dma_fence_context_alloc(1);
1822 
1823 	if (xef && xe->info.has_asid) {
1824 		u32 asid;
1825 
1826 		down_write(&xe->usm.lock);
1827 		err = xa_alloc_cyclic(&xe->usm.asid_to_vm, &asid, vm,
1828 				      XA_LIMIT(1, XE_MAX_ASID - 1),
1829 				      &xe->usm.next_asid, GFP_KERNEL);
1830 		up_write(&xe->usm.lock);
1831 		if (err < 0)
1832 			goto err_unlock_close;
1833 
1834 		vm->usm.asid = asid;
1835 	}
1836 
1837 	trace_xe_vm_create(vm);
1838 
1839 	return vm;
1840 
1841 err_unlock_close:
1842 	xe_vm_unlock(vm);
1843 err_close:
1844 	xe_vm_close_and_put(vm);
1845 	return ERR_PTR(err);
1846 
1847 err_svm_fini:
1848 	if (flags & XE_VM_FLAG_FAULT_MODE) {
1849 		vm->size = 0; /* close the vm */
1850 		xe_svm_fini(vm);
1851 	}
1852 err_no_resv:
1853 	mutex_destroy(&vm->snap_mutex);
1854 	for_each_tile(tile, xe, id)
1855 		xe_range_fence_tree_fini(&vm->rftree[id]);
1856 	ttm_lru_bulk_move_fini(&xe->ttm, &vm->lru_bulk_move);
1857 	if (vm->xef)
1858 		xe_file_put(vm->xef);
1859 	kfree(vm);
1860 	if (flags & XE_VM_FLAG_LR_MODE)
1861 		xe_pm_runtime_put(xe);
1862 	return ERR_PTR(err);
1863 }
1864 
1865 static void xe_vm_close(struct xe_vm *vm)
1866 {
1867 	struct xe_device *xe = vm->xe;
1868 	bool bound;
1869 	int idx;
1870 
1871 	bound = drm_dev_enter(&xe->drm, &idx);
1872 
1873 	down_write(&vm->lock);
1874 	if (xe_vm_in_fault_mode(vm))
1875 		xe_svm_notifier_lock(vm);
1876 
1877 	vm->size = 0;
1878 
1879 	if (!((vm->flags & XE_VM_FLAG_MIGRATION))) {
1880 		struct xe_tile *tile;
1881 		struct xe_gt *gt;
1882 		u8 id;
1883 
1884 		/* Wait for pending binds */
1885 		dma_resv_wait_timeout(xe_vm_resv(vm),
1886 				      DMA_RESV_USAGE_BOOKKEEP,
1887 				      false, MAX_SCHEDULE_TIMEOUT);
1888 
1889 		if (bound) {
1890 			for_each_tile(tile, xe, id)
1891 				if (vm->pt_root[id])
1892 					xe_pt_clear(xe, vm->pt_root[id]);
1893 
1894 			for_each_gt(gt, xe, id)
1895 				xe_gt_tlb_invalidation_vm(gt, vm);
1896 		}
1897 	}
1898 
1899 	if (xe_vm_in_fault_mode(vm))
1900 		xe_svm_notifier_unlock(vm);
1901 	up_write(&vm->lock);
1902 
1903 	if (bound)
1904 		drm_dev_exit(idx);
1905 }
1906 
1907 void xe_vm_close_and_put(struct xe_vm *vm)
1908 {
1909 	LIST_HEAD(contested);
1910 	struct xe_device *xe = vm->xe;
1911 	struct xe_tile *tile;
1912 	struct xe_vma *vma, *next_vma;
1913 	struct drm_gpuva *gpuva, *next;
1914 	u8 id;
1915 
1916 	xe_assert(xe, !vm->preempt.num_exec_queues);
1917 
1918 	xe_vm_close(vm);
1919 	if (xe_vm_in_preempt_fence_mode(vm))
1920 		flush_work(&vm->preempt.rebind_work);
1921 	if (xe_vm_in_fault_mode(vm))
1922 		xe_svm_close(vm);
1923 
1924 	down_write(&vm->lock);
1925 	for_each_tile(tile, xe, id) {
1926 		if (vm->q[id])
1927 			xe_exec_queue_last_fence_put(vm->q[id], vm);
1928 	}
1929 	up_write(&vm->lock);
1930 
1931 	for_each_tile(tile, xe, id) {
1932 		if (vm->q[id]) {
1933 			xe_exec_queue_kill(vm->q[id]);
1934 			xe_exec_queue_put(vm->q[id]);
1935 			vm->q[id] = NULL;
1936 		}
1937 	}
1938 
1939 	down_write(&vm->lock);
1940 	xe_vm_lock(vm, false);
1941 	drm_gpuvm_for_each_va_safe(gpuva, next, &vm->gpuvm) {
1942 		vma = gpuva_to_vma(gpuva);
1943 
1944 		if (xe_vma_has_no_bo(vma)) {
1945 			down_read(&vm->userptr.notifier_lock);
1946 			vma->gpuva.flags |= XE_VMA_DESTROYED;
1947 			up_read(&vm->userptr.notifier_lock);
1948 		}
1949 
1950 		xe_vm_remove_vma(vm, vma);
1951 
1952 		/* easy case, remove from VMA? */
1953 		if (xe_vma_has_no_bo(vma) || xe_vma_bo(vma)->vm) {
1954 			list_del_init(&vma->combined_links.rebind);
1955 			xe_vma_destroy(vma, NULL);
1956 			continue;
1957 		}
1958 
1959 		list_move_tail(&vma->combined_links.destroy, &contested);
1960 		vma->gpuva.flags |= XE_VMA_DESTROYED;
1961 	}
1962 
1963 	/*
1964 	 * All vm operations will add shared fences to resv.
1965 	 * The only exception is eviction for a shared object,
1966 	 * but even so, the unbind when evicted would still
1967 	 * install a fence to resv. Hence it's safe to
1968 	 * destroy the pagetables immediately.
1969 	 */
1970 	xe_vm_free_scratch(vm);
1971 
1972 	for_each_tile(tile, xe, id) {
1973 		if (vm->pt_root[id]) {
1974 			xe_pt_destroy(vm->pt_root[id], vm->flags, NULL);
1975 			vm->pt_root[id] = NULL;
1976 		}
1977 	}
1978 	xe_vm_unlock(vm);
1979 
1980 	/*
1981 	 * VM is now dead, cannot re-add nodes to vm->vmas if it's NULL
1982 	 * Since we hold a refcount to the bo, we can remove and free
1983 	 * the members safely without locking.
1984 	 */
1985 	list_for_each_entry_safe(vma, next_vma, &contested,
1986 				 combined_links.destroy) {
1987 		list_del_init(&vma->combined_links.destroy);
1988 		xe_vma_destroy_unlocked(vma);
1989 	}
1990 
1991 	if (xe_vm_in_fault_mode(vm))
1992 		xe_svm_fini(vm);
1993 
1994 	up_write(&vm->lock);
1995 
1996 	down_write(&xe->usm.lock);
1997 	if (vm->usm.asid) {
1998 		void *lookup;
1999 
2000 		xe_assert(xe, xe->info.has_asid);
2001 		xe_assert(xe, !(vm->flags & XE_VM_FLAG_MIGRATION));
2002 
2003 		lookup = xa_erase(&xe->usm.asid_to_vm, vm->usm.asid);
2004 		xe_assert(xe, lookup == vm);
2005 	}
2006 	up_write(&xe->usm.lock);
2007 
2008 	for_each_tile(tile, xe, id)
2009 		xe_range_fence_tree_fini(&vm->rftree[id]);
2010 
2011 	xe_vm_put(vm);
2012 }
2013 
2014 static void vm_destroy_work_func(struct work_struct *w)
2015 {
2016 	struct xe_vm *vm =
2017 		container_of(w, struct xe_vm, destroy_work);
2018 	struct xe_device *xe = vm->xe;
2019 	struct xe_tile *tile;
2020 	u8 id;
2021 
2022 	/* xe_vm_close_and_put was not called? */
2023 	xe_assert(xe, !vm->size);
2024 
2025 	if (xe_vm_in_preempt_fence_mode(vm))
2026 		flush_work(&vm->preempt.rebind_work);
2027 
2028 	mutex_destroy(&vm->snap_mutex);
2029 
2030 	if (vm->flags & XE_VM_FLAG_LR_MODE)
2031 		xe_pm_runtime_put(xe);
2032 
2033 	for_each_tile(tile, xe, id)
2034 		XE_WARN_ON(vm->pt_root[id]);
2035 
2036 	trace_xe_vm_free(vm);
2037 
2038 	ttm_lru_bulk_move_fini(&xe->ttm, &vm->lru_bulk_move);
2039 
2040 	if (vm->xef)
2041 		xe_file_put(vm->xef);
2042 
2043 	kfree(vm);
2044 }
2045 
2046 static void xe_vm_free(struct drm_gpuvm *gpuvm)
2047 {
2048 	struct xe_vm *vm = container_of(gpuvm, struct xe_vm, gpuvm);
2049 
2050 	/* To destroy the VM we need to be able to sleep */
2051 	queue_work(system_unbound_wq, &vm->destroy_work);
2052 }
2053 
2054 struct xe_vm *xe_vm_lookup(struct xe_file *xef, u32 id)
2055 {
2056 	struct xe_vm *vm;
2057 
2058 	mutex_lock(&xef->vm.lock);
2059 	vm = xa_load(&xef->vm.xa, id);
2060 	if (vm)
2061 		xe_vm_get(vm);
2062 	mutex_unlock(&xef->vm.lock);
2063 
2064 	return vm;
2065 }
2066 
2067 u64 xe_vm_pdp4_descriptor(struct xe_vm *vm, struct xe_tile *tile)
2068 {
2069 	return vm->pt_ops->pde_encode_bo(vm->pt_root[tile->id]->bo, 0);
2070 }
2071 
2072 static struct xe_exec_queue *
2073 to_wait_exec_queue(struct xe_vm *vm, struct xe_exec_queue *q)
2074 {
2075 	return q ? q : vm->q[0];
2076 }
2077 
2078 static struct xe_user_fence *
2079 find_ufence_get(struct xe_sync_entry *syncs, u32 num_syncs)
2080 {
2081 	unsigned int i;
2082 
2083 	for (i = 0; i < num_syncs; i++) {
2084 		struct xe_sync_entry *e = &syncs[i];
2085 
2086 		if (xe_sync_is_ufence(e))
2087 			return xe_sync_ufence_get(e);
2088 	}
2089 
2090 	return NULL;
2091 }
2092 
2093 #define ALL_DRM_XE_VM_CREATE_FLAGS (DRM_XE_VM_CREATE_FLAG_SCRATCH_PAGE | \
2094 				    DRM_XE_VM_CREATE_FLAG_LR_MODE | \
2095 				    DRM_XE_VM_CREATE_FLAG_FAULT_MODE)
2096 
2097 int xe_vm_create_ioctl(struct drm_device *dev, void *data,
2098 		       struct drm_file *file)
2099 {
2100 	struct xe_device *xe = to_xe_device(dev);
2101 	struct xe_file *xef = to_xe_file(file);
2102 	struct drm_xe_vm_create *args = data;
2103 	struct xe_vm *vm;
2104 	u32 id;
2105 	int err;
2106 	u32 flags = 0;
2107 
2108 	if (XE_IOCTL_DBG(xe, args->extensions))
2109 		return -EINVAL;
2110 
2111 	if (XE_GT_WA(xe_root_mmio_gt(xe), 14016763929))
2112 		args->flags |= DRM_XE_VM_CREATE_FLAG_SCRATCH_PAGE;
2113 
2114 	if (XE_IOCTL_DBG(xe, args->flags & DRM_XE_VM_CREATE_FLAG_FAULT_MODE &&
2115 			 !xe->info.has_usm))
2116 		return -EINVAL;
2117 
2118 	if (XE_IOCTL_DBG(xe, args->reserved[0] || args->reserved[1]))
2119 		return -EINVAL;
2120 
2121 	if (XE_IOCTL_DBG(xe, args->flags & ~ALL_DRM_XE_VM_CREATE_FLAGS))
2122 		return -EINVAL;
2123 
2124 	if (XE_IOCTL_DBG(xe, args->flags & DRM_XE_VM_CREATE_FLAG_SCRATCH_PAGE &&
2125 			 args->flags & DRM_XE_VM_CREATE_FLAG_FAULT_MODE &&
2126 			 !xe->info.needs_scratch))
2127 		return -EINVAL;
2128 
2129 	if (XE_IOCTL_DBG(xe, !(args->flags & DRM_XE_VM_CREATE_FLAG_LR_MODE) &&
2130 			 args->flags & DRM_XE_VM_CREATE_FLAG_FAULT_MODE))
2131 		return -EINVAL;
2132 
2133 	if (args->flags & DRM_XE_VM_CREATE_FLAG_SCRATCH_PAGE)
2134 		flags |= XE_VM_FLAG_SCRATCH_PAGE;
2135 	if (args->flags & DRM_XE_VM_CREATE_FLAG_LR_MODE)
2136 		flags |= XE_VM_FLAG_LR_MODE;
2137 	if (args->flags & DRM_XE_VM_CREATE_FLAG_FAULT_MODE)
2138 		flags |= XE_VM_FLAG_FAULT_MODE;
2139 
2140 	vm = xe_vm_create(xe, flags, xef);
2141 	if (IS_ERR(vm))
2142 		return PTR_ERR(vm);
2143 
2144 #if IS_ENABLED(CONFIG_DRM_XE_DEBUG_MEM)
2145 	/* Warning: Security issue - never enable by default */
2146 	args->reserved[0] = xe_bo_main_addr(vm->pt_root[0]->bo, XE_PAGE_SIZE);
2147 #endif
2148 
2149 	/* user id alloc must always be last in ioctl to prevent UAF */
2150 	err = xa_alloc(&xef->vm.xa, &id, vm, xa_limit_32b, GFP_KERNEL);
2151 	if (err)
2152 		goto err_close_and_put;
2153 
2154 	args->vm_id = id;
2155 
2156 	return 0;
2157 
2158 err_close_and_put:
2159 	xe_vm_close_and_put(vm);
2160 
2161 	return err;
2162 }
2163 
2164 int xe_vm_destroy_ioctl(struct drm_device *dev, void *data,
2165 			struct drm_file *file)
2166 {
2167 	struct xe_device *xe = to_xe_device(dev);
2168 	struct xe_file *xef = to_xe_file(file);
2169 	struct drm_xe_vm_destroy *args = data;
2170 	struct xe_vm *vm;
2171 	int err = 0;
2172 
2173 	if (XE_IOCTL_DBG(xe, args->pad) ||
2174 	    XE_IOCTL_DBG(xe, args->reserved[0] || args->reserved[1]))
2175 		return -EINVAL;
2176 
2177 	mutex_lock(&xef->vm.lock);
2178 	vm = xa_load(&xef->vm.xa, args->vm_id);
2179 	if (XE_IOCTL_DBG(xe, !vm))
2180 		err = -ENOENT;
2181 	else if (XE_IOCTL_DBG(xe, vm->preempt.num_exec_queues))
2182 		err = -EBUSY;
2183 	else
2184 		xa_erase(&xef->vm.xa, args->vm_id);
2185 	mutex_unlock(&xef->vm.lock);
2186 
2187 	if (!err)
2188 		xe_vm_close_and_put(vm);
2189 
2190 	return err;
2191 }
2192 
2193 static bool vma_matches(struct xe_vma *vma, u64 page_addr)
2194 {
2195 	if (page_addr > xe_vma_end(vma) - 1 ||
2196 	    page_addr + SZ_4K - 1 < xe_vma_start(vma))
2197 		return false;
2198 
2199 	return true;
2200 }
2201 
2202 /**
2203  * xe_vm_find_vma_by_addr() - Find a VMA by its address
2204  *
2205  * @vm: the xe_vm the vma belongs to
2206  * @page_addr: address to look up
2207  */
2208 struct xe_vma *xe_vm_find_vma_by_addr(struct xe_vm *vm, u64 page_addr)
2209 {
2210 	struct xe_vma *vma = NULL;
2211 
2212 	if (vm->usm.last_fault_vma) {   /* Fast lookup */
2213 		if (vma_matches(vm->usm.last_fault_vma, page_addr))
2214 			vma = vm->usm.last_fault_vma;
2215 	}
2216 	if (!vma)
2217 		vma = xe_vm_find_overlapping_vma(vm, page_addr, SZ_4K);
2218 
2219 	return vma;
2220 }
2221 
2222 static const u32 region_to_mem_type[] = {
2223 	XE_PL_TT,
2224 	XE_PL_VRAM0,
2225 	XE_PL_VRAM1,
2226 };
2227 
2228 static void prep_vma_destroy(struct xe_vm *vm, struct xe_vma *vma,
2229 			     bool post_commit)
2230 {
2231 	down_read(&vm->userptr.notifier_lock);
2232 	vma->gpuva.flags |= XE_VMA_DESTROYED;
2233 	up_read(&vm->userptr.notifier_lock);
2234 	if (post_commit)
2235 		xe_vm_remove_vma(vm, vma);
2236 }
2237 
2238 #undef ULL
2239 #define ULL	unsigned long long
2240 
2241 #if IS_ENABLED(CONFIG_DRM_XE_DEBUG_VM)
2242 static void print_op(struct xe_device *xe, struct drm_gpuva_op *op)
2243 {
2244 	struct xe_vma *vma;
2245 
2246 	switch (op->op) {
2247 	case DRM_GPUVA_OP_MAP:
2248 		vm_dbg(&xe->drm, "MAP: addr=0x%016llx, range=0x%016llx",
2249 		       (ULL)op->map.va.addr, (ULL)op->map.va.range);
2250 		break;
2251 	case DRM_GPUVA_OP_REMAP:
2252 		vma = gpuva_to_vma(op->remap.unmap->va);
2253 		vm_dbg(&xe->drm, "REMAP:UNMAP: addr=0x%016llx, range=0x%016llx, keep=%d",
2254 		       (ULL)xe_vma_start(vma), (ULL)xe_vma_size(vma),
2255 		       op->remap.unmap->keep ? 1 : 0);
2256 		if (op->remap.prev)
2257 			vm_dbg(&xe->drm,
2258 			       "REMAP:PREV: addr=0x%016llx, range=0x%016llx",
2259 			       (ULL)op->remap.prev->va.addr,
2260 			       (ULL)op->remap.prev->va.range);
2261 		if (op->remap.next)
2262 			vm_dbg(&xe->drm,
2263 			       "REMAP:NEXT: addr=0x%016llx, range=0x%016llx",
2264 			       (ULL)op->remap.next->va.addr,
2265 			       (ULL)op->remap.next->va.range);
2266 		break;
2267 	case DRM_GPUVA_OP_UNMAP:
2268 		vma = gpuva_to_vma(op->unmap.va);
2269 		vm_dbg(&xe->drm, "UNMAP: addr=0x%016llx, range=0x%016llx, keep=%d",
2270 		       (ULL)xe_vma_start(vma), (ULL)xe_vma_size(vma),
2271 		       op->unmap.keep ? 1 : 0);
2272 		break;
2273 	case DRM_GPUVA_OP_PREFETCH:
2274 		vma = gpuva_to_vma(op->prefetch.va);
2275 		vm_dbg(&xe->drm, "PREFETCH: addr=0x%016llx, range=0x%016llx",
2276 		       (ULL)xe_vma_start(vma), (ULL)xe_vma_size(vma));
2277 		break;
2278 	default:
2279 		drm_warn(&xe->drm, "NOT POSSIBLE");
2280 	}
2281 }
2282 #else
2283 static void print_op(struct xe_device *xe, struct drm_gpuva_op *op)
2284 {
2285 }
2286 #endif
2287 
2288 static bool __xe_vm_needs_clear_scratch_pages(struct xe_vm *vm, u32 bind_flags)
2289 {
2290 	if (!xe_vm_in_fault_mode(vm))
2291 		return false;
2292 
2293 	if (!xe_vm_has_scratch(vm))
2294 		return false;
2295 
2296 	if (bind_flags & DRM_XE_VM_BIND_FLAG_IMMEDIATE)
2297 		return false;
2298 
2299 	return true;
2300 }
2301 
2302 static void xe_svm_prefetch_gpuva_ops_fini(struct drm_gpuva_ops *ops)
2303 {
2304 	struct drm_gpuva_op *__op;
2305 
2306 	drm_gpuva_for_each_op(__op, ops) {
2307 		struct xe_vma_op *op = gpuva_op_to_vma_op(__op);
2308 
2309 		xe_vma_svm_prefetch_op_fini(op);
2310 	}
2311 }
2312 
2313 /*
2314  * Create operations list from IOCTL arguments, setup operations fields so parse
2315  * and commit steps are decoupled from IOCTL arguments. This step can fail.
2316  */
2317 static struct drm_gpuva_ops *
2318 vm_bind_ioctl_ops_create(struct xe_vm *vm, struct xe_vma_ops *vops,
2319 			 struct xe_bo *bo, u64 bo_offset_or_userptr,
2320 			 u64 addr, u64 range,
2321 			 u32 operation, u32 flags,
2322 			 u32 prefetch_region, u16 pat_index)
2323 {
2324 	struct drm_gem_object *obj = bo ? &bo->ttm.base : NULL;
2325 	struct drm_gpuva_ops *ops;
2326 	struct drm_gpuva_op *__op;
2327 	struct drm_gpuvm_bo *vm_bo;
2328 	u64 range_end = addr + range;
2329 	int err;
2330 
2331 	lockdep_assert_held_write(&vm->lock);
2332 
2333 	vm_dbg(&vm->xe->drm,
2334 	       "op=%d, addr=0x%016llx, range=0x%016llx, bo_offset_or_userptr=0x%016llx",
2335 	       operation, (ULL)addr, (ULL)range,
2336 	       (ULL)bo_offset_or_userptr);
2337 
2338 	switch (operation) {
2339 	case DRM_XE_VM_BIND_OP_MAP:
2340 	case DRM_XE_VM_BIND_OP_MAP_USERPTR:
2341 		ops = drm_gpuvm_sm_map_ops_create(&vm->gpuvm, addr, range,
2342 						  obj, bo_offset_or_userptr);
2343 		break;
2344 	case DRM_XE_VM_BIND_OP_UNMAP:
2345 		ops = drm_gpuvm_sm_unmap_ops_create(&vm->gpuvm, addr, range);
2346 		break;
2347 	case DRM_XE_VM_BIND_OP_PREFETCH:
2348 		ops = drm_gpuvm_prefetch_ops_create(&vm->gpuvm, addr, range);
2349 		break;
2350 	case DRM_XE_VM_BIND_OP_UNMAP_ALL:
2351 		xe_assert(vm->xe, bo);
2352 
2353 		err = xe_bo_lock(bo, true);
2354 		if (err)
2355 			return ERR_PTR(err);
2356 
2357 		vm_bo = drm_gpuvm_bo_obtain(&vm->gpuvm, obj);
2358 		if (IS_ERR(vm_bo)) {
2359 			xe_bo_unlock(bo);
2360 			return ERR_CAST(vm_bo);
2361 		}
2362 
2363 		ops = drm_gpuvm_bo_unmap_ops_create(vm_bo);
2364 		drm_gpuvm_bo_put(vm_bo);
2365 		xe_bo_unlock(bo);
2366 		break;
2367 	default:
2368 		drm_warn(&vm->xe->drm, "NOT POSSIBLE");
2369 		ops = ERR_PTR(-EINVAL);
2370 	}
2371 	if (IS_ERR(ops))
2372 		return ops;
2373 
2374 	drm_gpuva_for_each_op(__op, ops) {
2375 		struct xe_vma_op *op = gpuva_op_to_vma_op(__op);
2376 
2377 		if (__op->op == DRM_GPUVA_OP_MAP) {
2378 			op->map.immediate =
2379 				flags & DRM_XE_VM_BIND_FLAG_IMMEDIATE;
2380 			op->map.read_only =
2381 				flags & DRM_XE_VM_BIND_FLAG_READONLY;
2382 			op->map.is_null = flags & DRM_XE_VM_BIND_FLAG_NULL;
2383 			op->map.is_cpu_addr_mirror = flags &
2384 				DRM_XE_VM_BIND_FLAG_CPU_ADDR_MIRROR;
2385 			op->map.dumpable = flags & DRM_XE_VM_BIND_FLAG_DUMPABLE;
2386 			op->map.pat_index = pat_index;
2387 			op->map.invalidate_on_bind =
2388 				__xe_vm_needs_clear_scratch_pages(vm, flags);
2389 		} else if (__op->op == DRM_GPUVA_OP_PREFETCH) {
2390 			struct xe_vma *vma = gpuva_to_vma(op->base.prefetch.va);
2391 			struct xe_svm_range *svm_range;
2392 			struct drm_gpusvm_ctx ctx = {};
2393 			struct xe_tile *tile;
2394 			u8 id, tile_mask = 0;
2395 			u32 i;
2396 
2397 			if (!xe_vma_is_cpu_addr_mirror(vma)) {
2398 				op->prefetch.region = prefetch_region;
2399 				break;
2400 			}
2401 
2402 			ctx.read_only = xe_vma_read_only(vma);
2403 			ctx.devmem_possible = IS_DGFX(vm->xe) &&
2404 					      IS_ENABLED(CONFIG_DRM_XE_PAGEMAP);
2405 
2406 			for_each_tile(tile, vm->xe, id)
2407 				tile_mask |= 0x1 << id;
2408 
2409 			xa_init_flags(&op->prefetch_range.range, XA_FLAGS_ALLOC);
2410 			op->prefetch_range.region = prefetch_region;
2411 			op->prefetch_range.ranges_count = 0;
2412 alloc_next_range:
2413 			svm_range = xe_svm_range_find_or_insert(vm, addr, vma, &ctx);
2414 
2415 			if (PTR_ERR(svm_range) == -ENOENT) {
2416 				u64 ret = xe_svm_find_vma_start(vm, addr, range_end, vma);
2417 
2418 				addr = ret == ULONG_MAX ? 0 : ret;
2419 				if (addr)
2420 					goto alloc_next_range;
2421 				else
2422 					goto print_op_label;
2423 			}
2424 
2425 			if (IS_ERR(svm_range)) {
2426 				err = PTR_ERR(svm_range);
2427 				goto unwind_prefetch_ops;
2428 			}
2429 
2430 			if (xe_svm_range_validate(vm, svm_range, tile_mask, !!prefetch_region)) {
2431 				xe_svm_range_debug(svm_range, "PREFETCH - RANGE IS VALID");
2432 				goto check_next_range;
2433 			}
2434 
2435 			err = xa_alloc(&op->prefetch_range.range,
2436 				       &i, svm_range, xa_limit_32b,
2437 				       GFP_KERNEL);
2438 
2439 			if (err)
2440 				goto unwind_prefetch_ops;
2441 
2442 			op->prefetch_range.ranges_count++;
2443 			vops->flags |= XE_VMA_OPS_FLAG_HAS_SVM_PREFETCH;
2444 			xe_svm_range_debug(svm_range, "PREFETCH - RANGE CREATED");
2445 check_next_range:
2446 			if (range_end > xe_svm_range_end(svm_range) &&
2447 			    xe_svm_range_end(svm_range) < xe_vma_end(vma)) {
2448 				addr = xe_svm_range_end(svm_range);
2449 				goto alloc_next_range;
2450 			}
2451 		}
2452 print_op_label:
2453 		print_op(vm->xe, __op);
2454 	}
2455 
2456 	return ops;
2457 
2458 unwind_prefetch_ops:
2459 	xe_svm_prefetch_gpuva_ops_fini(ops);
2460 	drm_gpuva_ops_free(&vm->gpuvm, ops);
2461 	return ERR_PTR(err);
2462 }
2463 
2464 ALLOW_ERROR_INJECTION(vm_bind_ioctl_ops_create, ERRNO);
2465 
2466 static struct xe_vma *new_vma(struct xe_vm *vm, struct drm_gpuva_op_map *op,
2467 			      u16 pat_index, unsigned int flags)
2468 {
2469 	struct xe_bo *bo = op->gem.obj ? gem_to_xe_bo(op->gem.obj) : NULL;
2470 	struct drm_exec exec;
2471 	struct xe_vma *vma;
2472 	int err = 0;
2473 
2474 	lockdep_assert_held_write(&vm->lock);
2475 
2476 	if (bo) {
2477 		drm_exec_init(&exec, DRM_EXEC_INTERRUPTIBLE_WAIT, 0);
2478 		drm_exec_until_all_locked(&exec) {
2479 			err = 0;
2480 			if (!bo->vm) {
2481 				err = drm_exec_lock_obj(&exec, xe_vm_obj(vm));
2482 				drm_exec_retry_on_contention(&exec);
2483 			}
2484 			if (!err) {
2485 				err = drm_exec_lock_obj(&exec, &bo->ttm.base);
2486 				drm_exec_retry_on_contention(&exec);
2487 			}
2488 			if (err) {
2489 				drm_exec_fini(&exec);
2490 				return ERR_PTR(err);
2491 			}
2492 		}
2493 	}
2494 	vma = xe_vma_create(vm, bo, op->gem.offset,
2495 			    op->va.addr, op->va.addr +
2496 			    op->va.range - 1, pat_index, flags);
2497 	if (IS_ERR(vma))
2498 		goto err_unlock;
2499 
2500 	if (xe_vma_is_userptr(vma))
2501 		err = xe_vma_userptr_pin_pages(to_userptr_vma(vma));
2502 	else if (!xe_vma_has_no_bo(vma) && !bo->vm)
2503 		err = add_preempt_fences(vm, bo);
2504 
2505 err_unlock:
2506 	if (bo)
2507 		drm_exec_fini(&exec);
2508 
2509 	if (err) {
2510 		prep_vma_destroy(vm, vma, false);
2511 		xe_vma_destroy_unlocked(vma);
2512 		vma = ERR_PTR(err);
2513 	}
2514 
2515 	return vma;
2516 }
2517 
2518 static u64 xe_vma_max_pte_size(struct xe_vma *vma)
2519 {
2520 	if (vma->gpuva.flags & XE_VMA_PTE_1G)
2521 		return SZ_1G;
2522 	else if (vma->gpuva.flags & (XE_VMA_PTE_2M | XE_VMA_PTE_COMPACT))
2523 		return SZ_2M;
2524 	else if (vma->gpuva.flags & XE_VMA_PTE_64K)
2525 		return SZ_64K;
2526 	else if (vma->gpuva.flags & XE_VMA_PTE_4K)
2527 		return SZ_4K;
2528 
2529 	return SZ_1G;	/* Uninitialized, used max size */
2530 }
2531 
2532 static void xe_vma_set_pte_size(struct xe_vma *vma, u64 size)
2533 {
2534 	switch (size) {
2535 	case SZ_1G:
2536 		vma->gpuva.flags |= XE_VMA_PTE_1G;
2537 		break;
2538 	case SZ_2M:
2539 		vma->gpuva.flags |= XE_VMA_PTE_2M;
2540 		break;
2541 	case SZ_64K:
2542 		vma->gpuva.flags |= XE_VMA_PTE_64K;
2543 		break;
2544 	case SZ_4K:
2545 		vma->gpuva.flags |= XE_VMA_PTE_4K;
2546 		break;
2547 	}
2548 }
2549 
2550 static int xe_vma_op_commit(struct xe_vm *vm, struct xe_vma_op *op)
2551 {
2552 	int err = 0;
2553 
2554 	lockdep_assert_held_write(&vm->lock);
2555 
2556 	switch (op->base.op) {
2557 	case DRM_GPUVA_OP_MAP:
2558 		err |= xe_vm_insert_vma(vm, op->map.vma);
2559 		if (!err)
2560 			op->flags |= XE_VMA_OP_COMMITTED;
2561 		break;
2562 	case DRM_GPUVA_OP_REMAP:
2563 	{
2564 		u8 tile_present =
2565 			gpuva_to_vma(op->base.remap.unmap->va)->tile_present;
2566 
2567 		prep_vma_destroy(vm, gpuva_to_vma(op->base.remap.unmap->va),
2568 				 true);
2569 		op->flags |= XE_VMA_OP_COMMITTED;
2570 
2571 		if (op->remap.prev) {
2572 			err |= xe_vm_insert_vma(vm, op->remap.prev);
2573 			if (!err)
2574 				op->flags |= XE_VMA_OP_PREV_COMMITTED;
2575 			if (!err && op->remap.skip_prev) {
2576 				op->remap.prev->tile_present =
2577 					tile_present;
2578 				op->remap.prev = NULL;
2579 			}
2580 		}
2581 		if (op->remap.next) {
2582 			err |= xe_vm_insert_vma(vm, op->remap.next);
2583 			if (!err)
2584 				op->flags |= XE_VMA_OP_NEXT_COMMITTED;
2585 			if (!err && op->remap.skip_next) {
2586 				op->remap.next->tile_present =
2587 					tile_present;
2588 				op->remap.next = NULL;
2589 			}
2590 		}
2591 
2592 		/* Adjust for partial unbind after removing VMA from VM */
2593 		if (!err) {
2594 			op->base.remap.unmap->va->va.addr = op->remap.start;
2595 			op->base.remap.unmap->va->va.range = op->remap.range;
2596 		}
2597 		break;
2598 	}
2599 	case DRM_GPUVA_OP_UNMAP:
2600 		prep_vma_destroy(vm, gpuva_to_vma(op->base.unmap.va), true);
2601 		op->flags |= XE_VMA_OP_COMMITTED;
2602 		break;
2603 	case DRM_GPUVA_OP_PREFETCH:
2604 		op->flags |= XE_VMA_OP_COMMITTED;
2605 		break;
2606 	default:
2607 		drm_warn(&vm->xe->drm, "NOT POSSIBLE");
2608 	}
2609 
2610 	return err;
2611 }
2612 
2613 static int vm_bind_ioctl_ops_parse(struct xe_vm *vm, struct drm_gpuva_ops *ops,
2614 				   struct xe_vma_ops *vops)
2615 {
2616 	struct xe_device *xe = vm->xe;
2617 	struct drm_gpuva_op *__op;
2618 	struct xe_tile *tile;
2619 	u8 id, tile_mask = 0;
2620 	int err = 0;
2621 
2622 	lockdep_assert_held_write(&vm->lock);
2623 
2624 	for_each_tile(tile, vm->xe, id)
2625 		tile_mask |= 0x1 << id;
2626 
2627 	drm_gpuva_for_each_op(__op, ops) {
2628 		struct xe_vma_op *op = gpuva_op_to_vma_op(__op);
2629 		struct xe_vma *vma;
2630 		unsigned int flags = 0;
2631 
2632 		INIT_LIST_HEAD(&op->link);
2633 		list_add_tail(&op->link, &vops->list);
2634 		op->tile_mask = tile_mask;
2635 
2636 		switch (op->base.op) {
2637 		case DRM_GPUVA_OP_MAP:
2638 		{
2639 			flags |= op->map.read_only ?
2640 				VMA_CREATE_FLAG_READ_ONLY : 0;
2641 			flags |= op->map.is_null ?
2642 				VMA_CREATE_FLAG_IS_NULL : 0;
2643 			flags |= op->map.dumpable ?
2644 				VMA_CREATE_FLAG_DUMPABLE : 0;
2645 			flags |= op->map.is_cpu_addr_mirror ?
2646 				VMA_CREATE_FLAG_IS_SYSTEM_ALLOCATOR : 0;
2647 
2648 			vma = new_vma(vm, &op->base.map, op->map.pat_index,
2649 				      flags);
2650 			if (IS_ERR(vma))
2651 				return PTR_ERR(vma);
2652 
2653 			op->map.vma = vma;
2654 			if (((op->map.immediate || !xe_vm_in_fault_mode(vm)) &&
2655 			     !op->map.is_cpu_addr_mirror) ||
2656 			    op->map.invalidate_on_bind)
2657 				xe_vma_ops_incr_pt_update_ops(vops,
2658 							      op->tile_mask, 1);
2659 			break;
2660 		}
2661 		case DRM_GPUVA_OP_REMAP:
2662 		{
2663 			struct xe_vma *old =
2664 				gpuva_to_vma(op->base.remap.unmap->va);
2665 			bool skip = xe_vma_is_cpu_addr_mirror(old);
2666 			u64 start = xe_vma_start(old), end = xe_vma_end(old);
2667 			int num_remap_ops = 0;
2668 
2669 			if (op->base.remap.prev)
2670 				start = op->base.remap.prev->va.addr +
2671 					op->base.remap.prev->va.range;
2672 			if (op->base.remap.next)
2673 				end = op->base.remap.next->va.addr;
2674 
2675 			if (xe_vma_is_cpu_addr_mirror(old) &&
2676 			    xe_svm_has_mapping(vm, start, end))
2677 				return -EBUSY;
2678 
2679 			op->remap.start = xe_vma_start(old);
2680 			op->remap.range = xe_vma_size(old);
2681 
2682 			flags |= op->base.remap.unmap->va->flags &
2683 				XE_VMA_READ_ONLY ?
2684 				VMA_CREATE_FLAG_READ_ONLY : 0;
2685 			flags |= op->base.remap.unmap->va->flags &
2686 				DRM_GPUVA_SPARSE ?
2687 				VMA_CREATE_FLAG_IS_NULL : 0;
2688 			flags |= op->base.remap.unmap->va->flags &
2689 				XE_VMA_DUMPABLE ?
2690 				VMA_CREATE_FLAG_DUMPABLE : 0;
2691 			flags |= xe_vma_is_cpu_addr_mirror(old) ?
2692 				VMA_CREATE_FLAG_IS_SYSTEM_ALLOCATOR : 0;
2693 
2694 			if (op->base.remap.prev) {
2695 				vma = new_vma(vm, op->base.remap.prev,
2696 					      old->pat_index, flags);
2697 				if (IS_ERR(vma))
2698 					return PTR_ERR(vma);
2699 
2700 				op->remap.prev = vma;
2701 
2702 				/*
2703 				 * Userptr creates a new SG mapping so
2704 				 * we must also rebind.
2705 				 */
2706 				op->remap.skip_prev = skip ||
2707 					(!xe_vma_is_userptr(old) &&
2708 					IS_ALIGNED(xe_vma_end(vma),
2709 						   xe_vma_max_pte_size(old)));
2710 				if (op->remap.skip_prev) {
2711 					xe_vma_set_pte_size(vma, xe_vma_max_pte_size(old));
2712 					op->remap.range -=
2713 						xe_vma_end(vma) -
2714 						xe_vma_start(old);
2715 					op->remap.start = xe_vma_end(vma);
2716 					vm_dbg(&xe->drm, "REMAP:SKIP_PREV: addr=0x%016llx, range=0x%016llx",
2717 					       (ULL)op->remap.start,
2718 					       (ULL)op->remap.range);
2719 				} else {
2720 					num_remap_ops++;
2721 				}
2722 			}
2723 
2724 			if (op->base.remap.next) {
2725 				vma = new_vma(vm, op->base.remap.next,
2726 					      old->pat_index, flags);
2727 				if (IS_ERR(vma))
2728 					return PTR_ERR(vma);
2729 
2730 				op->remap.next = vma;
2731 
2732 				/*
2733 				 * Userptr creates a new SG mapping so
2734 				 * we must also rebind.
2735 				 */
2736 				op->remap.skip_next = skip ||
2737 					(!xe_vma_is_userptr(old) &&
2738 					IS_ALIGNED(xe_vma_start(vma),
2739 						   xe_vma_max_pte_size(old)));
2740 				if (op->remap.skip_next) {
2741 					xe_vma_set_pte_size(vma, xe_vma_max_pte_size(old));
2742 					op->remap.range -=
2743 						xe_vma_end(old) -
2744 						xe_vma_start(vma);
2745 					vm_dbg(&xe->drm, "REMAP:SKIP_NEXT: addr=0x%016llx, range=0x%016llx",
2746 					       (ULL)op->remap.start,
2747 					       (ULL)op->remap.range);
2748 				} else {
2749 					num_remap_ops++;
2750 				}
2751 			}
2752 			if (!skip)
2753 				num_remap_ops++;
2754 
2755 			xe_vma_ops_incr_pt_update_ops(vops, op->tile_mask, num_remap_ops);
2756 			break;
2757 		}
2758 		case DRM_GPUVA_OP_UNMAP:
2759 			vma = gpuva_to_vma(op->base.unmap.va);
2760 
2761 			if (xe_vma_is_cpu_addr_mirror(vma) &&
2762 			    xe_svm_has_mapping(vm, xe_vma_start(vma),
2763 					       xe_vma_end(vma)))
2764 				return -EBUSY;
2765 
2766 			if (!xe_vma_is_cpu_addr_mirror(vma))
2767 				xe_vma_ops_incr_pt_update_ops(vops, op->tile_mask, 1);
2768 			break;
2769 		case DRM_GPUVA_OP_PREFETCH:
2770 			vma = gpuva_to_vma(op->base.prefetch.va);
2771 
2772 			if (xe_vma_is_userptr(vma)) {
2773 				err = xe_vma_userptr_pin_pages(to_userptr_vma(vma));
2774 				if (err)
2775 					return err;
2776 			}
2777 
2778 			if (xe_vma_is_cpu_addr_mirror(vma))
2779 				xe_vma_ops_incr_pt_update_ops(vops, op->tile_mask,
2780 							      op->prefetch_range.ranges_count);
2781 			else
2782 				xe_vma_ops_incr_pt_update_ops(vops, op->tile_mask, 1);
2783 
2784 			break;
2785 		default:
2786 			drm_warn(&vm->xe->drm, "NOT POSSIBLE");
2787 		}
2788 
2789 		err = xe_vma_op_commit(vm, op);
2790 		if (err)
2791 			return err;
2792 	}
2793 
2794 	return 0;
2795 }
2796 
2797 static void xe_vma_op_unwind(struct xe_vm *vm, struct xe_vma_op *op,
2798 			     bool post_commit, bool prev_post_commit,
2799 			     bool next_post_commit)
2800 {
2801 	lockdep_assert_held_write(&vm->lock);
2802 
2803 	switch (op->base.op) {
2804 	case DRM_GPUVA_OP_MAP:
2805 		if (op->map.vma) {
2806 			prep_vma_destroy(vm, op->map.vma, post_commit);
2807 			xe_vma_destroy_unlocked(op->map.vma);
2808 		}
2809 		break;
2810 	case DRM_GPUVA_OP_UNMAP:
2811 	{
2812 		struct xe_vma *vma = gpuva_to_vma(op->base.unmap.va);
2813 
2814 		if (vma) {
2815 			down_read(&vm->userptr.notifier_lock);
2816 			vma->gpuva.flags &= ~XE_VMA_DESTROYED;
2817 			up_read(&vm->userptr.notifier_lock);
2818 			if (post_commit)
2819 				xe_vm_insert_vma(vm, vma);
2820 		}
2821 		break;
2822 	}
2823 	case DRM_GPUVA_OP_REMAP:
2824 	{
2825 		struct xe_vma *vma = gpuva_to_vma(op->base.remap.unmap->va);
2826 
2827 		if (op->remap.prev) {
2828 			prep_vma_destroy(vm, op->remap.prev, prev_post_commit);
2829 			xe_vma_destroy_unlocked(op->remap.prev);
2830 		}
2831 		if (op->remap.next) {
2832 			prep_vma_destroy(vm, op->remap.next, next_post_commit);
2833 			xe_vma_destroy_unlocked(op->remap.next);
2834 		}
2835 		if (vma) {
2836 			down_read(&vm->userptr.notifier_lock);
2837 			vma->gpuva.flags &= ~XE_VMA_DESTROYED;
2838 			up_read(&vm->userptr.notifier_lock);
2839 			if (post_commit)
2840 				xe_vm_insert_vma(vm, vma);
2841 		}
2842 		break;
2843 	}
2844 	case DRM_GPUVA_OP_PREFETCH:
2845 		/* Nothing to do */
2846 		break;
2847 	default:
2848 		drm_warn(&vm->xe->drm, "NOT POSSIBLE");
2849 	}
2850 }
2851 
2852 static void vm_bind_ioctl_ops_unwind(struct xe_vm *vm,
2853 				     struct drm_gpuva_ops **ops,
2854 				     int num_ops_list)
2855 {
2856 	int i;
2857 
2858 	for (i = num_ops_list - 1; i >= 0; --i) {
2859 		struct drm_gpuva_ops *__ops = ops[i];
2860 		struct drm_gpuva_op *__op;
2861 
2862 		if (!__ops)
2863 			continue;
2864 
2865 		drm_gpuva_for_each_op_reverse(__op, __ops) {
2866 			struct xe_vma_op *op = gpuva_op_to_vma_op(__op);
2867 
2868 			xe_vma_op_unwind(vm, op,
2869 					 op->flags & XE_VMA_OP_COMMITTED,
2870 					 op->flags & XE_VMA_OP_PREV_COMMITTED,
2871 					 op->flags & XE_VMA_OP_NEXT_COMMITTED);
2872 		}
2873 	}
2874 }
2875 
2876 static int vma_lock_and_validate(struct drm_exec *exec, struct xe_vma *vma,
2877 				 bool validate)
2878 {
2879 	struct xe_bo *bo = xe_vma_bo(vma);
2880 	struct xe_vm *vm = xe_vma_vm(vma);
2881 	int err = 0;
2882 
2883 	if (bo) {
2884 		if (!bo->vm)
2885 			err = drm_exec_lock_obj(exec, &bo->ttm.base);
2886 		if (!err && validate)
2887 			err = xe_bo_validate(bo, vm,
2888 					     !xe_vm_in_preempt_fence_mode(vm));
2889 	}
2890 
2891 	return err;
2892 }
2893 
2894 static int check_ufence(struct xe_vma *vma)
2895 {
2896 	if (vma->ufence) {
2897 		struct xe_user_fence * const f = vma->ufence;
2898 
2899 		if (!xe_sync_ufence_get_status(f))
2900 			return -EBUSY;
2901 
2902 		vma->ufence = NULL;
2903 		xe_sync_ufence_put(f);
2904 	}
2905 
2906 	return 0;
2907 }
2908 
2909 static int prefetch_ranges(struct xe_vm *vm, struct xe_vma_op *op)
2910 {
2911 	bool devmem_possible = IS_DGFX(vm->xe) && IS_ENABLED(CONFIG_DRM_XE_PAGEMAP);
2912 	struct xe_vma *vma = gpuva_to_vma(op->base.prefetch.va);
2913 	int err = 0;
2914 
2915 	struct xe_svm_range *svm_range;
2916 	struct drm_gpusvm_ctx ctx = {};
2917 	struct xe_tile *tile;
2918 	unsigned long i;
2919 	u32 region;
2920 
2921 	if (!xe_vma_is_cpu_addr_mirror(vma))
2922 		return 0;
2923 
2924 	region = op->prefetch_range.region;
2925 
2926 	ctx.read_only = xe_vma_read_only(vma);
2927 	ctx.devmem_possible = devmem_possible;
2928 	ctx.check_pages_threshold = devmem_possible ? SZ_64K : 0;
2929 
2930 	/* TODO: Threading the migration */
2931 	xa_for_each(&op->prefetch_range.range, i, svm_range) {
2932 		if (!region)
2933 			xe_svm_range_migrate_to_smem(vm, svm_range);
2934 
2935 		if (xe_svm_range_needs_migrate_to_vram(svm_range, vma, region)) {
2936 			tile = &vm->xe->tiles[region_to_mem_type[region] - XE_PL_VRAM0];
2937 			err = xe_svm_alloc_vram(tile, svm_range, &ctx);
2938 			if (err) {
2939 				drm_dbg(&vm->xe->drm, "VRAM allocation failed, retry from userspace, asid=%u, gpusvm=%p, errno=%pe\n",
2940 					vm->usm.asid, &vm->svm.gpusvm, ERR_PTR(err));
2941 				return -ENODATA;
2942 			}
2943 			xe_svm_range_debug(svm_range, "PREFETCH - RANGE MIGRATED TO VRAM");
2944 		}
2945 
2946 		err = xe_svm_range_get_pages(vm, svm_range, &ctx);
2947 		if (err) {
2948 			drm_dbg(&vm->xe->drm, "Get pages failed, asid=%u, gpusvm=%p, errno=%pe\n",
2949 				vm->usm.asid, &vm->svm.gpusvm, ERR_PTR(err));
2950 			if (err == -EOPNOTSUPP || err == -EFAULT || err == -EPERM)
2951 				err = -ENODATA;
2952 			return err;
2953 		}
2954 		xe_svm_range_debug(svm_range, "PREFETCH - RANGE GET PAGES DONE");
2955 	}
2956 
2957 	return err;
2958 }
2959 
2960 static int op_lock_and_prep(struct drm_exec *exec, struct xe_vm *vm,
2961 			    struct xe_vma_op *op)
2962 {
2963 	int err = 0;
2964 
2965 	switch (op->base.op) {
2966 	case DRM_GPUVA_OP_MAP:
2967 		if (!op->map.invalidate_on_bind)
2968 			err = vma_lock_and_validate(exec, op->map.vma,
2969 						    !xe_vm_in_fault_mode(vm) ||
2970 						    op->map.immediate);
2971 		break;
2972 	case DRM_GPUVA_OP_REMAP:
2973 		err = check_ufence(gpuva_to_vma(op->base.remap.unmap->va));
2974 		if (err)
2975 			break;
2976 
2977 		err = vma_lock_and_validate(exec,
2978 					    gpuva_to_vma(op->base.remap.unmap->va),
2979 					    false);
2980 		if (!err && op->remap.prev)
2981 			err = vma_lock_and_validate(exec, op->remap.prev, true);
2982 		if (!err && op->remap.next)
2983 			err = vma_lock_and_validate(exec, op->remap.next, true);
2984 		break;
2985 	case DRM_GPUVA_OP_UNMAP:
2986 		err = check_ufence(gpuva_to_vma(op->base.unmap.va));
2987 		if (err)
2988 			break;
2989 
2990 		err = vma_lock_and_validate(exec,
2991 					    gpuva_to_vma(op->base.unmap.va),
2992 					    false);
2993 		break;
2994 	case DRM_GPUVA_OP_PREFETCH:
2995 	{
2996 		struct xe_vma *vma = gpuva_to_vma(op->base.prefetch.va);
2997 		u32 region;
2998 
2999 		if (xe_vma_is_cpu_addr_mirror(vma))
3000 			region = op->prefetch_range.region;
3001 		else
3002 			region = op->prefetch.region;
3003 
3004 		xe_assert(vm->xe, region <= ARRAY_SIZE(region_to_mem_type));
3005 
3006 		err = vma_lock_and_validate(exec,
3007 					    gpuva_to_vma(op->base.prefetch.va),
3008 					    false);
3009 		if (!err && !xe_vma_has_no_bo(vma))
3010 			err = xe_bo_migrate(xe_vma_bo(vma),
3011 					    region_to_mem_type[region]);
3012 		break;
3013 	}
3014 	default:
3015 		drm_warn(&vm->xe->drm, "NOT POSSIBLE");
3016 	}
3017 
3018 	return err;
3019 }
3020 
3021 static int vm_bind_ioctl_ops_prefetch_ranges(struct xe_vm *vm, struct xe_vma_ops *vops)
3022 {
3023 	struct xe_vma_op *op;
3024 	int err;
3025 
3026 	if (!(vops->flags & XE_VMA_OPS_FLAG_HAS_SVM_PREFETCH))
3027 		return 0;
3028 
3029 	list_for_each_entry(op, &vops->list, link) {
3030 		if (op->base.op  == DRM_GPUVA_OP_PREFETCH) {
3031 			err = prefetch_ranges(vm, op);
3032 			if (err)
3033 				return err;
3034 		}
3035 	}
3036 
3037 	return 0;
3038 }
3039 
3040 static int vm_bind_ioctl_ops_lock_and_prep(struct drm_exec *exec,
3041 					   struct xe_vm *vm,
3042 					   struct xe_vma_ops *vops)
3043 {
3044 	struct xe_vma_op *op;
3045 	int err;
3046 
3047 	err = drm_exec_lock_obj(exec, xe_vm_obj(vm));
3048 	if (err)
3049 		return err;
3050 
3051 	list_for_each_entry(op, &vops->list, link) {
3052 		err = op_lock_and_prep(exec, vm, op);
3053 		if (err)
3054 			return err;
3055 	}
3056 
3057 #ifdef TEST_VM_OPS_ERROR
3058 	if (vops->inject_error &&
3059 	    vm->xe->vm_inject_error_position == FORCE_OP_ERROR_LOCK)
3060 		return -ENOSPC;
3061 #endif
3062 
3063 	return 0;
3064 }
3065 
3066 static void op_trace(struct xe_vma_op *op)
3067 {
3068 	switch (op->base.op) {
3069 	case DRM_GPUVA_OP_MAP:
3070 		trace_xe_vma_bind(op->map.vma);
3071 		break;
3072 	case DRM_GPUVA_OP_REMAP:
3073 		trace_xe_vma_unbind(gpuva_to_vma(op->base.remap.unmap->va));
3074 		if (op->remap.prev)
3075 			trace_xe_vma_bind(op->remap.prev);
3076 		if (op->remap.next)
3077 			trace_xe_vma_bind(op->remap.next);
3078 		break;
3079 	case DRM_GPUVA_OP_UNMAP:
3080 		trace_xe_vma_unbind(gpuva_to_vma(op->base.unmap.va));
3081 		break;
3082 	case DRM_GPUVA_OP_PREFETCH:
3083 		trace_xe_vma_bind(gpuva_to_vma(op->base.prefetch.va));
3084 		break;
3085 	case DRM_GPUVA_OP_DRIVER:
3086 		break;
3087 	default:
3088 		XE_WARN_ON("NOT POSSIBLE");
3089 	}
3090 }
3091 
3092 static void trace_xe_vm_ops_execute(struct xe_vma_ops *vops)
3093 {
3094 	struct xe_vma_op *op;
3095 
3096 	list_for_each_entry(op, &vops->list, link)
3097 		op_trace(op);
3098 }
3099 
3100 static int vm_ops_setup_tile_args(struct xe_vm *vm, struct xe_vma_ops *vops)
3101 {
3102 	struct xe_exec_queue *q = vops->q;
3103 	struct xe_tile *tile;
3104 	int number_tiles = 0;
3105 	u8 id;
3106 
3107 	for_each_tile(tile, vm->xe, id) {
3108 		if (vops->pt_update_ops[id].num_ops)
3109 			++number_tiles;
3110 
3111 		if (vops->pt_update_ops[id].q)
3112 			continue;
3113 
3114 		if (q) {
3115 			vops->pt_update_ops[id].q = q;
3116 			if (vm->pt_root[id] && !list_empty(&q->multi_gt_list))
3117 				q = list_next_entry(q, multi_gt_list);
3118 		} else {
3119 			vops->pt_update_ops[id].q = vm->q[id];
3120 		}
3121 	}
3122 
3123 	return number_tiles;
3124 }
3125 
3126 static struct dma_fence *ops_execute(struct xe_vm *vm,
3127 				     struct xe_vma_ops *vops)
3128 {
3129 	struct xe_tile *tile;
3130 	struct dma_fence *fence = NULL;
3131 	struct dma_fence **fences = NULL;
3132 	struct dma_fence_array *cf = NULL;
3133 	int number_tiles = 0, current_fence = 0, err;
3134 	u8 id;
3135 
3136 	number_tiles = vm_ops_setup_tile_args(vm, vops);
3137 	if (number_tiles == 0)
3138 		return ERR_PTR(-ENODATA);
3139 
3140 	if (number_tiles > 1) {
3141 		fences = kmalloc_array(number_tiles, sizeof(*fences),
3142 				       GFP_KERNEL);
3143 		if (!fences) {
3144 			fence = ERR_PTR(-ENOMEM);
3145 			goto err_trace;
3146 		}
3147 	}
3148 
3149 	for_each_tile(tile, vm->xe, id) {
3150 		if (!vops->pt_update_ops[id].num_ops)
3151 			continue;
3152 
3153 		err = xe_pt_update_ops_prepare(tile, vops);
3154 		if (err) {
3155 			fence = ERR_PTR(err);
3156 			goto err_out;
3157 		}
3158 	}
3159 
3160 	trace_xe_vm_ops_execute(vops);
3161 
3162 	for_each_tile(tile, vm->xe, id) {
3163 		if (!vops->pt_update_ops[id].num_ops)
3164 			continue;
3165 
3166 		fence = xe_pt_update_ops_run(tile, vops);
3167 		if (IS_ERR(fence))
3168 			goto err_out;
3169 
3170 		if (fences)
3171 			fences[current_fence++] = fence;
3172 	}
3173 
3174 	if (fences) {
3175 		cf = dma_fence_array_create(number_tiles, fences,
3176 					    vm->composite_fence_ctx,
3177 					    vm->composite_fence_seqno++,
3178 					    false);
3179 		if (!cf) {
3180 			--vm->composite_fence_seqno;
3181 			fence = ERR_PTR(-ENOMEM);
3182 			goto err_out;
3183 		}
3184 		fence = &cf->base;
3185 	}
3186 
3187 	for_each_tile(tile, vm->xe, id) {
3188 		if (!vops->pt_update_ops[id].num_ops)
3189 			continue;
3190 
3191 		xe_pt_update_ops_fini(tile, vops);
3192 	}
3193 
3194 	return fence;
3195 
3196 err_out:
3197 	for_each_tile(tile, vm->xe, id) {
3198 		if (!vops->pt_update_ops[id].num_ops)
3199 			continue;
3200 
3201 		xe_pt_update_ops_abort(tile, vops);
3202 	}
3203 	while (current_fence)
3204 		dma_fence_put(fences[--current_fence]);
3205 	kfree(fences);
3206 	kfree(cf);
3207 
3208 err_trace:
3209 	trace_xe_vm_ops_fail(vm);
3210 	return fence;
3211 }
3212 
3213 static void vma_add_ufence(struct xe_vma *vma, struct xe_user_fence *ufence)
3214 {
3215 	if (vma->ufence)
3216 		xe_sync_ufence_put(vma->ufence);
3217 	vma->ufence = __xe_sync_ufence_get(ufence);
3218 }
3219 
3220 static void op_add_ufence(struct xe_vm *vm, struct xe_vma_op *op,
3221 			  struct xe_user_fence *ufence)
3222 {
3223 	switch (op->base.op) {
3224 	case DRM_GPUVA_OP_MAP:
3225 		vma_add_ufence(op->map.vma, ufence);
3226 		break;
3227 	case DRM_GPUVA_OP_REMAP:
3228 		if (op->remap.prev)
3229 			vma_add_ufence(op->remap.prev, ufence);
3230 		if (op->remap.next)
3231 			vma_add_ufence(op->remap.next, ufence);
3232 		break;
3233 	case DRM_GPUVA_OP_UNMAP:
3234 		break;
3235 	case DRM_GPUVA_OP_PREFETCH:
3236 		vma_add_ufence(gpuva_to_vma(op->base.prefetch.va), ufence);
3237 		break;
3238 	default:
3239 		drm_warn(&vm->xe->drm, "NOT POSSIBLE");
3240 	}
3241 }
3242 
3243 static void vm_bind_ioctl_ops_fini(struct xe_vm *vm, struct xe_vma_ops *vops,
3244 				   struct dma_fence *fence)
3245 {
3246 	struct xe_exec_queue *wait_exec_queue = to_wait_exec_queue(vm, vops->q);
3247 	struct xe_user_fence *ufence;
3248 	struct xe_vma_op *op;
3249 	int i;
3250 
3251 	ufence = find_ufence_get(vops->syncs, vops->num_syncs);
3252 	list_for_each_entry(op, &vops->list, link) {
3253 		if (ufence)
3254 			op_add_ufence(vm, op, ufence);
3255 
3256 		if (op->base.op == DRM_GPUVA_OP_UNMAP)
3257 			xe_vma_destroy(gpuva_to_vma(op->base.unmap.va), fence);
3258 		else if (op->base.op == DRM_GPUVA_OP_REMAP)
3259 			xe_vma_destroy(gpuva_to_vma(op->base.remap.unmap->va),
3260 				       fence);
3261 	}
3262 	if (ufence)
3263 		xe_sync_ufence_put(ufence);
3264 	if (fence) {
3265 		for (i = 0; i < vops->num_syncs; i++)
3266 			xe_sync_entry_signal(vops->syncs + i, fence);
3267 		xe_exec_queue_last_fence_set(wait_exec_queue, vm, fence);
3268 	}
3269 }
3270 
3271 static struct dma_fence *vm_bind_ioctl_ops_execute(struct xe_vm *vm,
3272 						   struct xe_vma_ops *vops)
3273 {
3274 	struct drm_exec exec;
3275 	struct dma_fence *fence;
3276 	int err;
3277 
3278 	lockdep_assert_held_write(&vm->lock);
3279 
3280 	drm_exec_init(&exec, DRM_EXEC_INTERRUPTIBLE_WAIT |
3281 		      DRM_EXEC_IGNORE_DUPLICATES, 0);
3282 	drm_exec_until_all_locked(&exec) {
3283 		err = vm_bind_ioctl_ops_lock_and_prep(&exec, vm, vops);
3284 		drm_exec_retry_on_contention(&exec);
3285 		if (err) {
3286 			fence = ERR_PTR(err);
3287 			goto unlock;
3288 		}
3289 
3290 		fence = ops_execute(vm, vops);
3291 		if (IS_ERR(fence)) {
3292 			if (PTR_ERR(fence) == -ENODATA)
3293 				vm_bind_ioctl_ops_fini(vm, vops, NULL);
3294 			goto unlock;
3295 		}
3296 
3297 		vm_bind_ioctl_ops_fini(vm, vops, fence);
3298 	}
3299 
3300 unlock:
3301 	drm_exec_fini(&exec);
3302 	return fence;
3303 }
3304 ALLOW_ERROR_INJECTION(vm_bind_ioctl_ops_execute, ERRNO);
3305 
3306 #define SUPPORTED_FLAGS_STUB  \
3307 	(DRM_XE_VM_BIND_FLAG_READONLY | \
3308 	 DRM_XE_VM_BIND_FLAG_IMMEDIATE | \
3309 	 DRM_XE_VM_BIND_FLAG_NULL | \
3310 	 DRM_XE_VM_BIND_FLAG_DUMPABLE | \
3311 	 DRM_XE_VM_BIND_FLAG_CHECK_PXP | \
3312 	 DRM_XE_VM_BIND_FLAG_CPU_ADDR_MIRROR)
3313 
3314 #ifdef TEST_VM_OPS_ERROR
3315 #define SUPPORTED_FLAGS	(SUPPORTED_FLAGS_STUB | FORCE_OP_ERROR)
3316 #else
3317 #define SUPPORTED_FLAGS	SUPPORTED_FLAGS_STUB
3318 #endif
3319 
3320 #define XE_64K_PAGE_MASK 0xffffull
3321 #define ALL_DRM_XE_SYNCS_FLAGS (DRM_XE_SYNCS_FLAG_WAIT_FOR_OP)
3322 
3323 static int vm_bind_ioctl_check_args(struct xe_device *xe, struct xe_vm *vm,
3324 				    struct drm_xe_vm_bind *args,
3325 				    struct drm_xe_vm_bind_op **bind_ops)
3326 {
3327 	int err;
3328 	int i;
3329 
3330 	if (XE_IOCTL_DBG(xe, args->pad || args->pad2) ||
3331 	    XE_IOCTL_DBG(xe, args->reserved[0] || args->reserved[1]))
3332 		return -EINVAL;
3333 
3334 	if (XE_IOCTL_DBG(xe, args->extensions))
3335 		return -EINVAL;
3336 
3337 	if (args->num_binds > 1) {
3338 		u64 __user *bind_user =
3339 			u64_to_user_ptr(args->vector_of_binds);
3340 
3341 		*bind_ops = kvmalloc_array(args->num_binds,
3342 					   sizeof(struct drm_xe_vm_bind_op),
3343 					   GFP_KERNEL | __GFP_ACCOUNT |
3344 					   __GFP_RETRY_MAYFAIL | __GFP_NOWARN);
3345 		if (!*bind_ops)
3346 			return args->num_binds > 1 ? -ENOBUFS : -ENOMEM;
3347 
3348 		err = copy_from_user(*bind_ops, bind_user,
3349 				     sizeof(struct drm_xe_vm_bind_op) *
3350 				     args->num_binds);
3351 		if (XE_IOCTL_DBG(xe, err)) {
3352 			err = -EFAULT;
3353 			goto free_bind_ops;
3354 		}
3355 	} else {
3356 		*bind_ops = &args->bind;
3357 	}
3358 
3359 	for (i = 0; i < args->num_binds; ++i) {
3360 		u64 range = (*bind_ops)[i].range;
3361 		u64 addr = (*bind_ops)[i].addr;
3362 		u32 op = (*bind_ops)[i].op;
3363 		u32 flags = (*bind_ops)[i].flags;
3364 		u32 obj = (*bind_ops)[i].obj;
3365 		u64 obj_offset = (*bind_ops)[i].obj_offset;
3366 		u32 prefetch_region = (*bind_ops)[i].prefetch_mem_region_instance;
3367 		bool is_null = flags & DRM_XE_VM_BIND_FLAG_NULL;
3368 		bool is_cpu_addr_mirror = flags &
3369 			DRM_XE_VM_BIND_FLAG_CPU_ADDR_MIRROR;
3370 		u16 pat_index = (*bind_ops)[i].pat_index;
3371 		u16 coh_mode;
3372 
3373 		if (XE_IOCTL_DBG(xe, is_cpu_addr_mirror &&
3374 				 (!xe_vm_in_fault_mode(vm) ||
3375 				 !IS_ENABLED(CONFIG_DRM_XE_GPUSVM)))) {
3376 			err = -EINVAL;
3377 			goto free_bind_ops;
3378 		}
3379 
3380 		if (XE_IOCTL_DBG(xe, pat_index >= xe->pat.n_entries)) {
3381 			err = -EINVAL;
3382 			goto free_bind_ops;
3383 		}
3384 
3385 		pat_index = array_index_nospec(pat_index, xe->pat.n_entries);
3386 		(*bind_ops)[i].pat_index = pat_index;
3387 		coh_mode = xe_pat_index_get_coh_mode(xe, pat_index);
3388 		if (XE_IOCTL_DBG(xe, !coh_mode)) { /* hw reserved */
3389 			err = -EINVAL;
3390 			goto free_bind_ops;
3391 		}
3392 
3393 		if (XE_WARN_ON(coh_mode > XE_COH_AT_LEAST_1WAY)) {
3394 			err = -EINVAL;
3395 			goto free_bind_ops;
3396 		}
3397 
3398 		if (XE_IOCTL_DBG(xe, op > DRM_XE_VM_BIND_OP_PREFETCH) ||
3399 		    XE_IOCTL_DBG(xe, flags & ~SUPPORTED_FLAGS) ||
3400 		    XE_IOCTL_DBG(xe, obj && (is_null || is_cpu_addr_mirror)) ||
3401 		    XE_IOCTL_DBG(xe, obj_offset && (is_null ||
3402 						    is_cpu_addr_mirror)) ||
3403 		    XE_IOCTL_DBG(xe, op != DRM_XE_VM_BIND_OP_MAP &&
3404 				 (is_null || is_cpu_addr_mirror)) ||
3405 		    XE_IOCTL_DBG(xe, !obj &&
3406 				 op == DRM_XE_VM_BIND_OP_MAP &&
3407 				 !is_null && !is_cpu_addr_mirror) ||
3408 		    XE_IOCTL_DBG(xe, !obj &&
3409 				 op == DRM_XE_VM_BIND_OP_UNMAP_ALL) ||
3410 		    XE_IOCTL_DBG(xe, addr &&
3411 				 op == DRM_XE_VM_BIND_OP_UNMAP_ALL) ||
3412 		    XE_IOCTL_DBG(xe, range &&
3413 				 op == DRM_XE_VM_BIND_OP_UNMAP_ALL) ||
3414 		    XE_IOCTL_DBG(xe, obj &&
3415 				 op == DRM_XE_VM_BIND_OP_MAP_USERPTR) ||
3416 		    XE_IOCTL_DBG(xe, coh_mode == XE_COH_NONE &&
3417 				 op == DRM_XE_VM_BIND_OP_MAP_USERPTR) ||
3418 		    XE_IOCTL_DBG(xe, obj &&
3419 				 op == DRM_XE_VM_BIND_OP_PREFETCH) ||
3420 		    XE_IOCTL_DBG(xe, prefetch_region &&
3421 				 op != DRM_XE_VM_BIND_OP_PREFETCH) ||
3422 		    XE_IOCTL_DBG(xe, !(BIT(prefetch_region) &
3423 				       xe->info.mem_region_mask)) ||
3424 		    XE_IOCTL_DBG(xe, obj &&
3425 				 op == DRM_XE_VM_BIND_OP_UNMAP)) {
3426 			err = -EINVAL;
3427 			goto free_bind_ops;
3428 		}
3429 
3430 		if (XE_IOCTL_DBG(xe, obj_offset & ~PAGE_MASK) ||
3431 		    XE_IOCTL_DBG(xe, addr & ~PAGE_MASK) ||
3432 		    XE_IOCTL_DBG(xe, range & ~PAGE_MASK) ||
3433 		    XE_IOCTL_DBG(xe, !range &&
3434 				 op != DRM_XE_VM_BIND_OP_UNMAP_ALL)) {
3435 			err = -EINVAL;
3436 			goto free_bind_ops;
3437 		}
3438 	}
3439 
3440 	return 0;
3441 
3442 free_bind_ops:
3443 	if (args->num_binds > 1)
3444 		kvfree(*bind_ops);
3445 	*bind_ops = NULL;
3446 	return err;
3447 }
3448 
3449 static int vm_bind_ioctl_signal_fences(struct xe_vm *vm,
3450 				       struct xe_exec_queue *q,
3451 				       struct xe_sync_entry *syncs,
3452 				       int num_syncs)
3453 {
3454 	struct dma_fence *fence;
3455 	int i, err = 0;
3456 
3457 	fence = xe_sync_in_fence_get(syncs, num_syncs,
3458 				     to_wait_exec_queue(vm, q), vm);
3459 	if (IS_ERR(fence))
3460 		return PTR_ERR(fence);
3461 
3462 	for (i = 0; i < num_syncs; i++)
3463 		xe_sync_entry_signal(&syncs[i], fence);
3464 
3465 	xe_exec_queue_last_fence_set(to_wait_exec_queue(vm, q), vm,
3466 				     fence);
3467 	dma_fence_put(fence);
3468 
3469 	return err;
3470 }
3471 
3472 static void xe_vma_ops_init(struct xe_vma_ops *vops, struct xe_vm *vm,
3473 			    struct xe_exec_queue *q,
3474 			    struct xe_sync_entry *syncs, u32 num_syncs)
3475 {
3476 	memset(vops, 0, sizeof(*vops));
3477 	INIT_LIST_HEAD(&vops->list);
3478 	vops->vm = vm;
3479 	vops->q = q;
3480 	vops->syncs = syncs;
3481 	vops->num_syncs = num_syncs;
3482 	vops->flags = 0;
3483 }
3484 
3485 static int xe_vm_bind_ioctl_validate_bo(struct xe_device *xe, struct xe_bo *bo,
3486 					u64 addr, u64 range, u64 obj_offset,
3487 					u16 pat_index, u32 op, u32 bind_flags)
3488 {
3489 	u16 coh_mode;
3490 
3491 	if (XE_IOCTL_DBG(xe, range > xe_bo_size(bo)) ||
3492 	    XE_IOCTL_DBG(xe, obj_offset >
3493 			 xe_bo_size(bo) - range)) {
3494 		return -EINVAL;
3495 	}
3496 
3497 	/*
3498 	 * Some platforms require 64k VM_BIND alignment,
3499 	 * specifically those with XE_VRAM_FLAGS_NEED64K.
3500 	 *
3501 	 * Other platforms may have BO's set to 64k physical placement,
3502 	 * but can be mapped at 4k offsets anyway. This check is only
3503 	 * there for the former case.
3504 	 */
3505 	if ((bo->flags & XE_BO_FLAG_INTERNAL_64K) &&
3506 	    (xe->info.vram_flags & XE_VRAM_FLAGS_NEED64K)) {
3507 		if (XE_IOCTL_DBG(xe, obj_offset &
3508 				 XE_64K_PAGE_MASK) ||
3509 		    XE_IOCTL_DBG(xe, addr & XE_64K_PAGE_MASK) ||
3510 		    XE_IOCTL_DBG(xe, range & XE_64K_PAGE_MASK)) {
3511 			return -EINVAL;
3512 		}
3513 	}
3514 
3515 	coh_mode = xe_pat_index_get_coh_mode(xe, pat_index);
3516 	if (bo->cpu_caching) {
3517 		if (XE_IOCTL_DBG(xe, coh_mode == XE_COH_NONE &&
3518 				 bo->cpu_caching == DRM_XE_GEM_CPU_CACHING_WB)) {
3519 			return -EINVAL;
3520 		}
3521 	} else if (XE_IOCTL_DBG(xe, coh_mode == XE_COH_NONE)) {
3522 		/*
3523 		 * Imported dma-buf from a different device should
3524 		 * require 1way or 2way coherency since we don't know
3525 		 * how it was mapped on the CPU. Just assume is it
3526 		 * potentially cached on CPU side.
3527 		 */
3528 		return -EINVAL;
3529 	}
3530 
3531 	/* If a BO is protected it can only be mapped if the key is still valid */
3532 	if ((bind_flags & DRM_XE_VM_BIND_FLAG_CHECK_PXP) && xe_bo_is_protected(bo) &&
3533 	    op != DRM_XE_VM_BIND_OP_UNMAP && op != DRM_XE_VM_BIND_OP_UNMAP_ALL)
3534 		if (XE_IOCTL_DBG(xe, xe_pxp_bo_key_check(xe->pxp, bo) != 0))
3535 			return -ENOEXEC;
3536 
3537 	return 0;
3538 }
3539 
3540 int xe_vm_bind_ioctl(struct drm_device *dev, void *data, struct drm_file *file)
3541 {
3542 	struct xe_device *xe = to_xe_device(dev);
3543 	struct xe_file *xef = to_xe_file(file);
3544 	struct drm_xe_vm_bind *args = data;
3545 	struct drm_xe_sync __user *syncs_user;
3546 	struct xe_bo **bos = NULL;
3547 	struct drm_gpuva_ops **ops = NULL;
3548 	struct xe_vm *vm;
3549 	struct xe_exec_queue *q = NULL;
3550 	u32 num_syncs, num_ufence = 0;
3551 	struct xe_sync_entry *syncs = NULL;
3552 	struct drm_xe_vm_bind_op *bind_ops = NULL;
3553 	struct xe_vma_ops vops;
3554 	struct dma_fence *fence;
3555 	int err;
3556 	int i;
3557 
3558 	vm = xe_vm_lookup(xef, args->vm_id);
3559 	if (XE_IOCTL_DBG(xe, !vm))
3560 		return -EINVAL;
3561 
3562 	err = vm_bind_ioctl_check_args(xe, vm, args, &bind_ops);
3563 	if (err)
3564 		goto put_vm;
3565 
3566 	if (args->exec_queue_id) {
3567 		q = xe_exec_queue_lookup(xef, args->exec_queue_id);
3568 		if (XE_IOCTL_DBG(xe, !q)) {
3569 			err = -ENOENT;
3570 			goto free_bind_ops;
3571 		}
3572 
3573 		if (XE_IOCTL_DBG(xe, !(q->flags & EXEC_QUEUE_FLAG_VM))) {
3574 			err = -EINVAL;
3575 			goto put_exec_queue;
3576 		}
3577 	}
3578 
3579 	/* Ensure all UNMAPs visible */
3580 	xe_svm_flush(vm);
3581 
3582 	err = down_write_killable(&vm->lock);
3583 	if (err)
3584 		goto put_exec_queue;
3585 
3586 	if (XE_IOCTL_DBG(xe, xe_vm_is_closed_or_banned(vm))) {
3587 		err = -ENOENT;
3588 		goto release_vm_lock;
3589 	}
3590 
3591 	for (i = 0; i < args->num_binds; ++i) {
3592 		u64 range = bind_ops[i].range;
3593 		u64 addr = bind_ops[i].addr;
3594 
3595 		if (XE_IOCTL_DBG(xe, range > vm->size) ||
3596 		    XE_IOCTL_DBG(xe, addr > vm->size - range)) {
3597 			err = -EINVAL;
3598 			goto release_vm_lock;
3599 		}
3600 	}
3601 
3602 	if (args->num_binds) {
3603 		bos = kvcalloc(args->num_binds, sizeof(*bos),
3604 			       GFP_KERNEL | __GFP_ACCOUNT |
3605 			       __GFP_RETRY_MAYFAIL | __GFP_NOWARN);
3606 		if (!bos) {
3607 			err = -ENOMEM;
3608 			goto release_vm_lock;
3609 		}
3610 
3611 		ops = kvcalloc(args->num_binds, sizeof(*ops),
3612 			       GFP_KERNEL | __GFP_ACCOUNT |
3613 			       __GFP_RETRY_MAYFAIL | __GFP_NOWARN);
3614 		if (!ops) {
3615 			err = -ENOMEM;
3616 			goto free_bos;
3617 		}
3618 	}
3619 
3620 	for (i = 0; i < args->num_binds; ++i) {
3621 		struct drm_gem_object *gem_obj;
3622 		u64 range = bind_ops[i].range;
3623 		u64 addr = bind_ops[i].addr;
3624 		u32 obj = bind_ops[i].obj;
3625 		u64 obj_offset = bind_ops[i].obj_offset;
3626 		u16 pat_index = bind_ops[i].pat_index;
3627 		u32 op = bind_ops[i].op;
3628 		u32 bind_flags = bind_ops[i].flags;
3629 
3630 		if (!obj)
3631 			continue;
3632 
3633 		gem_obj = drm_gem_object_lookup(file, obj);
3634 		if (XE_IOCTL_DBG(xe, !gem_obj)) {
3635 			err = -ENOENT;
3636 			goto put_obj;
3637 		}
3638 		bos[i] = gem_to_xe_bo(gem_obj);
3639 
3640 		err = xe_vm_bind_ioctl_validate_bo(xe, bos[i], addr, range,
3641 						   obj_offset, pat_index, op,
3642 						   bind_flags);
3643 		if (err)
3644 			goto put_obj;
3645 	}
3646 
3647 	if (args->num_syncs) {
3648 		syncs = kcalloc(args->num_syncs, sizeof(*syncs), GFP_KERNEL);
3649 		if (!syncs) {
3650 			err = -ENOMEM;
3651 			goto put_obj;
3652 		}
3653 	}
3654 
3655 	syncs_user = u64_to_user_ptr(args->syncs);
3656 	for (num_syncs = 0; num_syncs < args->num_syncs; num_syncs++) {
3657 		err = xe_sync_entry_parse(xe, xef, &syncs[num_syncs],
3658 					  &syncs_user[num_syncs],
3659 					  (xe_vm_in_lr_mode(vm) ?
3660 					   SYNC_PARSE_FLAG_LR_MODE : 0) |
3661 					  (!args->num_binds ?
3662 					   SYNC_PARSE_FLAG_DISALLOW_USER_FENCE : 0));
3663 		if (err)
3664 			goto free_syncs;
3665 
3666 		if (xe_sync_is_ufence(&syncs[num_syncs]))
3667 			num_ufence++;
3668 	}
3669 
3670 	if (XE_IOCTL_DBG(xe, num_ufence > 1)) {
3671 		err = -EINVAL;
3672 		goto free_syncs;
3673 	}
3674 
3675 	if (!args->num_binds) {
3676 		err = -ENODATA;
3677 		goto free_syncs;
3678 	}
3679 
3680 	xe_vma_ops_init(&vops, vm, q, syncs, num_syncs);
3681 	for (i = 0; i < args->num_binds; ++i) {
3682 		u64 range = bind_ops[i].range;
3683 		u64 addr = bind_ops[i].addr;
3684 		u32 op = bind_ops[i].op;
3685 		u32 flags = bind_ops[i].flags;
3686 		u64 obj_offset = bind_ops[i].obj_offset;
3687 		u32 prefetch_region = bind_ops[i].prefetch_mem_region_instance;
3688 		u16 pat_index = bind_ops[i].pat_index;
3689 
3690 		ops[i] = vm_bind_ioctl_ops_create(vm, &vops, bos[i], obj_offset,
3691 						  addr, range, op, flags,
3692 						  prefetch_region, pat_index);
3693 		if (IS_ERR(ops[i])) {
3694 			err = PTR_ERR(ops[i]);
3695 			ops[i] = NULL;
3696 			goto unwind_ops;
3697 		}
3698 
3699 		err = vm_bind_ioctl_ops_parse(vm, ops[i], &vops);
3700 		if (err)
3701 			goto unwind_ops;
3702 
3703 #ifdef TEST_VM_OPS_ERROR
3704 		if (flags & FORCE_OP_ERROR) {
3705 			vops.inject_error = true;
3706 			vm->xe->vm_inject_error_position =
3707 				(vm->xe->vm_inject_error_position + 1) %
3708 				FORCE_OP_ERROR_COUNT;
3709 		}
3710 #endif
3711 	}
3712 
3713 	/* Nothing to do */
3714 	if (list_empty(&vops.list)) {
3715 		err = -ENODATA;
3716 		goto unwind_ops;
3717 	}
3718 
3719 	err = xe_vma_ops_alloc(&vops, args->num_binds > 1);
3720 	if (err)
3721 		goto unwind_ops;
3722 
3723 	err = vm_bind_ioctl_ops_prefetch_ranges(vm, &vops);
3724 	if (err)
3725 		goto unwind_ops;
3726 
3727 	fence = vm_bind_ioctl_ops_execute(vm, &vops);
3728 	if (IS_ERR(fence))
3729 		err = PTR_ERR(fence);
3730 	else
3731 		dma_fence_put(fence);
3732 
3733 unwind_ops:
3734 	if (err && err != -ENODATA)
3735 		vm_bind_ioctl_ops_unwind(vm, ops, args->num_binds);
3736 	xe_vma_ops_fini(&vops);
3737 	for (i = args->num_binds - 1; i >= 0; --i)
3738 		if (ops[i])
3739 			drm_gpuva_ops_free(&vm->gpuvm, ops[i]);
3740 free_syncs:
3741 	if (err == -ENODATA)
3742 		err = vm_bind_ioctl_signal_fences(vm, q, syncs, num_syncs);
3743 	while (num_syncs--)
3744 		xe_sync_entry_cleanup(&syncs[num_syncs]);
3745 
3746 	kfree(syncs);
3747 put_obj:
3748 	for (i = 0; i < args->num_binds; ++i)
3749 		xe_bo_put(bos[i]);
3750 
3751 	kvfree(ops);
3752 free_bos:
3753 	kvfree(bos);
3754 release_vm_lock:
3755 	up_write(&vm->lock);
3756 put_exec_queue:
3757 	if (q)
3758 		xe_exec_queue_put(q);
3759 free_bind_ops:
3760 	if (args->num_binds > 1)
3761 		kvfree(bind_ops);
3762 put_vm:
3763 	xe_vm_put(vm);
3764 	return err;
3765 }
3766 
3767 /**
3768  * xe_vm_bind_kernel_bo - bind a kernel BO to a VM
3769  * @vm: VM to bind the BO to
3770  * @bo: BO to bind
3771  * @q: exec queue to use for the bind (optional)
3772  * @addr: address at which to bind the BO
3773  * @cache_lvl: PAT cache level to use
3774  *
3775  * Execute a VM bind map operation on a kernel-owned BO to bind it into a
3776  * kernel-owned VM.
3777  *
3778  * Returns a dma_fence to track the binding completion if the job to do so was
3779  * successfully submitted, an error pointer otherwise.
3780  */
3781 struct dma_fence *xe_vm_bind_kernel_bo(struct xe_vm *vm, struct xe_bo *bo,
3782 				       struct xe_exec_queue *q, u64 addr,
3783 				       enum xe_cache_level cache_lvl)
3784 {
3785 	struct xe_vma_ops vops;
3786 	struct drm_gpuva_ops *ops = NULL;
3787 	struct dma_fence *fence;
3788 	int err;
3789 
3790 	xe_bo_get(bo);
3791 	xe_vm_get(vm);
3792 	if (q)
3793 		xe_exec_queue_get(q);
3794 
3795 	down_write(&vm->lock);
3796 
3797 	xe_vma_ops_init(&vops, vm, q, NULL, 0);
3798 
3799 	ops = vm_bind_ioctl_ops_create(vm, &vops, bo, 0, addr, xe_bo_size(bo),
3800 				       DRM_XE_VM_BIND_OP_MAP, 0, 0,
3801 				       vm->xe->pat.idx[cache_lvl]);
3802 	if (IS_ERR(ops)) {
3803 		err = PTR_ERR(ops);
3804 		goto release_vm_lock;
3805 	}
3806 
3807 	err = vm_bind_ioctl_ops_parse(vm, ops, &vops);
3808 	if (err)
3809 		goto release_vm_lock;
3810 
3811 	xe_assert(vm->xe, !list_empty(&vops.list));
3812 
3813 	err = xe_vma_ops_alloc(&vops, false);
3814 	if (err)
3815 		goto unwind_ops;
3816 
3817 	fence = vm_bind_ioctl_ops_execute(vm, &vops);
3818 	if (IS_ERR(fence))
3819 		err = PTR_ERR(fence);
3820 
3821 unwind_ops:
3822 	if (err && err != -ENODATA)
3823 		vm_bind_ioctl_ops_unwind(vm, &ops, 1);
3824 
3825 	xe_vma_ops_fini(&vops);
3826 	drm_gpuva_ops_free(&vm->gpuvm, ops);
3827 
3828 release_vm_lock:
3829 	up_write(&vm->lock);
3830 
3831 	if (q)
3832 		xe_exec_queue_put(q);
3833 	xe_vm_put(vm);
3834 	xe_bo_put(bo);
3835 
3836 	if (err)
3837 		fence = ERR_PTR(err);
3838 
3839 	return fence;
3840 }
3841 
3842 /**
3843  * xe_vm_lock() - Lock the vm's dma_resv object
3844  * @vm: The struct xe_vm whose lock is to be locked
3845  * @intr: Whether to perform any wait interruptible
3846  *
3847  * Return: 0 on success, -EINTR if @intr is true and the wait for a
3848  * contended lock was interrupted. If @intr is false, the function
3849  * always returns 0.
3850  */
3851 int xe_vm_lock(struct xe_vm *vm, bool intr)
3852 {
3853 	if (intr)
3854 		return dma_resv_lock_interruptible(xe_vm_resv(vm), NULL);
3855 
3856 	return dma_resv_lock(xe_vm_resv(vm), NULL);
3857 }
3858 
3859 /**
3860  * xe_vm_unlock() - Unlock the vm's dma_resv object
3861  * @vm: The struct xe_vm whose lock is to be released.
3862  *
3863  * Unlock a buffer object lock that was locked by xe_vm_lock().
3864  */
3865 void xe_vm_unlock(struct xe_vm *vm)
3866 {
3867 	dma_resv_unlock(xe_vm_resv(vm));
3868 }
3869 
3870 /**
3871  * xe_vm_range_tilemask_tlb_invalidation - Issue a TLB invalidation on this tilemask for an
3872  * address range
3873  * @vm: The VM
3874  * @start: start address
3875  * @end: end address
3876  * @tile_mask: mask for which gt's issue tlb invalidation
3877  *
3878  * Issue a range based TLB invalidation for gt's in tilemask
3879  *
3880  * Returns 0 for success, negative error code otherwise.
3881  */
3882 int xe_vm_range_tilemask_tlb_invalidation(struct xe_vm *vm, u64 start,
3883 					  u64 end, u8 tile_mask)
3884 {
3885 	struct xe_gt_tlb_invalidation_fence fence[XE_MAX_TILES_PER_DEVICE * XE_MAX_GT_PER_TILE];
3886 	struct xe_tile *tile;
3887 	u32 fence_id = 0;
3888 	u8 id;
3889 	int err;
3890 
3891 	if (!tile_mask)
3892 		return 0;
3893 
3894 	for_each_tile(tile, vm->xe, id) {
3895 		if (tile_mask & BIT(id)) {
3896 			xe_gt_tlb_invalidation_fence_init(tile->primary_gt,
3897 							  &fence[fence_id], true);
3898 
3899 			err = xe_gt_tlb_invalidation_range(tile->primary_gt,
3900 							   &fence[fence_id],
3901 							   start,
3902 							   end,
3903 							   vm->usm.asid);
3904 			if (err)
3905 				goto wait;
3906 			++fence_id;
3907 
3908 			if (!tile->media_gt)
3909 				continue;
3910 
3911 			xe_gt_tlb_invalidation_fence_init(tile->media_gt,
3912 							  &fence[fence_id], true);
3913 
3914 			err = xe_gt_tlb_invalidation_range(tile->media_gt,
3915 							   &fence[fence_id],
3916 							   start,
3917 							   end,
3918 							   vm->usm.asid);
3919 			if (err)
3920 				goto wait;
3921 			++fence_id;
3922 		}
3923 	}
3924 
3925 wait:
3926 	for (id = 0; id < fence_id; ++id)
3927 		xe_gt_tlb_invalidation_fence_wait(&fence[id]);
3928 
3929 	return err;
3930 }
3931 
3932 /**
3933  * xe_vm_invalidate_vma - invalidate GPU mappings for VMA without a lock
3934  * @vma: VMA to invalidate
3935  *
3936  * Walks a list of page tables leaves which it memset the entries owned by this
3937  * VMA to zero, invalidates the TLBs, and block until TLBs invalidation is
3938  * complete.
3939  *
3940  * Returns 0 for success, negative error code otherwise.
3941  */
3942 int xe_vm_invalidate_vma(struct xe_vma *vma)
3943 {
3944 	struct xe_device *xe = xe_vma_vm(vma)->xe;
3945 	struct xe_vm *vm = xe_vma_vm(vma);
3946 	struct xe_tile *tile;
3947 	u8 tile_mask = 0;
3948 	int ret = 0;
3949 	u8 id;
3950 
3951 	xe_assert(xe, !xe_vma_is_null(vma));
3952 	xe_assert(xe, !xe_vma_is_cpu_addr_mirror(vma));
3953 	trace_xe_vma_invalidate(vma);
3954 
3955 	vm_dbg(&vm->xe->drm,
3956 	       "INVALIDATE: addr=0x%016llx, range=0x%016llx",
3957 		xe_vma_start(vma), xe_vma_size(vma));
3958 
3959 	/*
3960 	 * Check that we don't race with page-table updates, tile_invalidated
3961 	 * update is safe
3962 	 */
3963 	if (IS_ENABLED(CONFIG_PROVE_LOCKING)) {
3964 		if (xe_vma_is_userptr(vma)) {
3965 			lockdep_assert(lockdep_is_held_type(&vm->userptr.notifier_lock, 0) ||
3966 				       (lockdep_is_held_type(&vm->userptr.notifier_lock, 1) &&
3967 					lockdep_is_held(&xe_vm_resv(vm)->lock.base)));
3968 
3969 			WARN_ON_ONCE(!mmu_interval_check_retry
3970 				     (&to_userptr_vma(vma)->userptr.notifier,
3971 				      to_userptr_vma(vma)->userptr.notifier_seq));
3972 			WARN_ON_ONCE(!dma_resv_test_signaled(xe_vm_resv(vm),
3973 							     DMA_RESV_USAGE_BOOKKEEP));
3974 
3975 		} else {
3976 			xe_bo_assert_held(xe_vma_bo(vma));
3977 		}
3978 	}
3979 
3980 	for_each_tile(tile, xe, id)
3981 		if (xe_pt_zap_ptes(tile, vma))
3982 			tile_mask |= BIT(id);
3983 
3984 	xe_device_wmb(xe);
3985 
3986 	ret = xe_vm_range_tilemask_tlb_invalidation(xe_vma_vm(vma), xe_vma_start(vma),
3987 						    xe_vma_end(vma), tile_mask);
3988 
3989 	/* WRITE_ONCE pairs with READ_ONCE in xe_vm_has_valid_gpu_mapping() */
3990 	WRITE_ONCE(vma->tile_invalidated, vma->tile_mask);
3991 
3992 	return ret;
3993 }
3994 
3995 int xe_vm_validate_protected(struct xe_vm *vm)
3996 {
3997 	struct drm_gpuva *gpuva;
3998 	int err = 0;
3999 
4000 	if (!vm)
4001 		return -ENODEV;
4002 
4003 	mutex_lock(&vm->snap_mutex);
4004 
4005 	drm_gpuvm_for_each_va(gpuva, &vm->gpuvm) {
4006 		struct xe_vma *vma = gpuva_to_vma(gpuva);
4007 		struct xe_bo *bo = vma->gpuva.gem.obj ?
4008 			gem_to_xe_bo(vma->gpuva.gem.obj) : NULL;
4009 
4010 		if (!bo)
4011 			continue;
4012 
4013 		if (xe_bo_is_protected(bo)) {
4014 			err = xe_pxp_bo_key_check(vm->xe->pxp, bo);
4015 			if (err)
4016 				break;
4017 		}
4018 	}
4019 
4020 	mutex_unlock(&vm->snap_mutex);
4021 	return err;
4022 }
4023 
4024 struct xe_vm_snapshot {
4025 	unsigned long num_snaps;
4026 	struct {
4027 		u64 ofs, bo_ofs;
4028 		unsigned long len;
4029 		struct xe_bo *bo;
4030 		void *data;
4031 		struct mm_struct *mm;
4032 	} snap[];
4033 };
4034 
4035 struct xe_vm_snapshot *xe_vm_snapshot_capture(struct xe_vm *vm)
4036 {
4037 	unsigned long num_snaps = 0, i;
4038 	struct xe_vm_snapshot *snap = NULL;
4039 	struct drm_gpuva *gpuva;
4040 
4041 	if (!vm)
4042 		return NULL;
4043 
4044 	mutex_lock(&vm->snap_mutex);
4045 	drm_gpuvm_for_each_va(gpuva, &vm->gpuvm) {
4046 		if (gpuva->flags & XE_VMA_DUMPABLE)
4047 			num_snaps++;
4048 	}
4049 
4050 	if (num_snaps)
4051 		snap = kvzalloc(offsetof(struct xe_vm_snapshot, snap[num_snaps]), GFP_NOWAIT);
4052 	if (!snap) {
4053 		snap = num_snaps ? ERR_PTR(-ENOMEM) : ERR_PTR(-ENODEV);
4054 		goto out_unlock;
4055 	}
4056 
4057 	snap->num_snaps = num_snaps;
4058 	i = 0;
4059 	drm_gpuvm_for_each_va(gpuva, &vm->gpuvm) {
4060 		struct xe_vma *vma = gpuva_to_vma(gpuva);
4061 		struct xe_bo *bo = vma->gpuva.gem.obj ?
4062 			gem_to_xe_bo(vma->gpuva.gem.obj) : NULL;
4063 
4064 		if (!(gpuva->flags & XE_VMA_DUMPABLE))
4065 			continue;
4066 
4067 		snap->snap[i].ofs = xe_vma_start(vma);
4068 		snap->snap[i].len = xe_vma_size(vma);
4069 		if (bo) {
4070 			snap->snap[i].bo = xe_bo_get(bo);
4071 			snap->snap[i].bo_ofs = xe_vma_bo_offset(vma);
4072 		} else if (xe_vma_is_userptr(vma)) {
4073 			struct mm_struct *mm =
4074 				to_userptr_vma(vma)->userptr.notifier.mm;
4075 
4076 			if (mmget_not_zero(mm))
4077 				snap->snap[i].mm = mm;
4078 			else
4079 				snap->snap[i].data = ERR_PTR(-EFAULT);
4080 
4081 			snap->snap[i].bo_ofs = xe_vma_userptr(vma);
4082 		} else {
4083 			snap->snap[i].data = ERR_PTR(-ENOENT);
4084 		}
4085 		i++;
4086 	}
4087 
4088 out_unlock:
4089 	mutex_unlock(&vm->snap_mutex);
4090 	return snap;
4091 }
4092 
4093 void xe_vm_snapshot_capture_delayed(struct xe_vm_snapshot *snap)
4094 {
4095 	if (IS_ERR_OR_NULL(snap))
4096 		return;
4097 
4098 	for (int i = 0; i < snap->num_snaps; i++) {
4099 		struct xe_bo *bo = snap->snap[i].bo;
4100 		int err;
4101 
4102 		if (IS_ERR(snap->snap[i].data))
4103 			continue;
4104 
4105 		snap->snap[i].data = kvmalloc(snap->snap[i].len, GFP_USER);
4106 		if (!snap->snap[i].data) {
4107 			snap->snap[i].data = ERR_PTR(-ENOMEM);
4108 			goto cleanup_bo;
4109 		}
4110 
4111 		if (bo) {
4112 			err = xe_bo_read(bo, snap->snap[i].bo_ofs,
4113 					 snap->snap[i].data, snap->snap[i].len);
4114 		} else {
4115 			void __user *userptr = (void __user *)(size_t)snap->snap[i].bo_ofs;
4116 
4117 			kthread_use_mm(snap->snap[i].mm);
4118 			if (!copy_from_user(snap->snap[i].data, userptr, snap->snap[i].len))
4119 				err = 0;
4120 			else
4121 				err = -EFAULT;
4122 			kthread_unuse_mm(snap->snap[i].mm);
4123 
4124 			mmput(snap->snap[i].mm);
4125 			snap->snap[i].mm = NULL;
4126 		}
4127 
4128 		if (err) {
4129 			kvfree(snap->snap[i].data);
4130 			snap->snap[i].data = ERR_PTR(err);
4131 		}
4132 
4133 cleanup_bo:
4134 		xe_bo_put(bo);
4135 		snap->snap[i].bo = NULL;
4136 	}
4137 }
4138 
4139 void xe_vm_snapshot_print(struct xe_vm_snapshot *snap, struct drm_printer *p)
4140 {
4141 	unsigned long i, j;
4142 
4143 	if (IS_ERR_OR_NULL(snap)) {
4144 		drm_printf(p, "[0].error: %li\n", PTR_ERR(snap));
4145 		return;
4146 	}
4147 
4148 	for (i = 0; i < snap->num_snaps; i++) {
4149 		drm_printf(p, "[%llx].length: 0x%lx\n", snap->snap[i].ofs, snap->snap[i].len);
4150 
4151 		if (IS_ERR(snap->snap[i].data)) {
4152 			drm_printf(p, "[%llx].error: %li\n", snap->snap[i].ofs,
4153 				   PTR_ERR(snap->snap[i].data));
4154 			continue;
4155 		}
4156 
4157 		drm_printf(p, "[%llx].data: ", snap->snap[i].ofs);
4158 
4159 		for (j = 0; j < snap->snap[i].len; j += sizeof(u32)) {
4160 			u32 *val = snap->snap[i].data + j;
4161 			char dumped[ASCII85_BUFSZ];
4162 
4163 			drm_puts(p, ascii85_encode(*val, dumped));
4164 		}
4165 
4166 		drm_puts(p, "\n");
4167 
4168 		if (drm_coredump_printer_is_full(p))
4169 			return;
4170 	}
4171 }
4172 
4173 void xe_vm_snapshot_free(struct xe_vm_snapshot *snap)
4174 {
4175 	unsigned long i;
4176 
4177 	if (IS_ERR_OR_NULL(snap))
4178 		return;
4179 
4180 	for (i = 0; i < snap->num_snaps; i++) {
4181 		if (!IS_ERR(snap->snap[i].data))
4182 			kvfree(snap->snap[i].data);
4183 		xe_bo_put(snap->snap[i].bo);
4184 		if (snap->snap[i].mm)
4185 			mmput(snap->snap[i].mm);
4186 	}
4187 	kvfree(snap);
4188 }
4189