xref: /linux/drivers/gpu/drm/xe/xe_vm.c (revision af53f0fd99c3bbb3afd29f1612c9e88c5a92cc01)
1 // SPDX-License-Identifier: MIT
2 /*
3  * Copyright © 2021 Intel Corporation
4  */
5 
6 #include "xe_vm.h"
7 
8 #include <linux/dma-fence-array.h>
9 #include <linux/nospec.h>
10 
11 #include <drm/drm_drv.h>
12 #include <drm/drm_exec.h>
13 #include <drm/drm_print.h>
14 #include <drm/ttm/ttm_tt.h>
15 #include <uapi/drm/xe_drm.h>
16 #include <linux/ascii85.h>
17 #include <linux/delay.h>
18 #include <linux/kthread.h>
19 #include <linux/mm.h>
20 #include <linux/swap.h>
21 
22 #include <generated/xe_wa_oob.h>
23 
24 #include "regs/xe_gtt_defs.h"
25 #include "xe_assert.h"
26 #include "xe_bo.h"
27 #include "xe_device.h"
28 #include "xe_drm_client.h"
29 #include "xe_exec_queue.h"
30 #include "xe_gt_pagefault.h"
31 #include "xe_gt_tlb_invalidation.h"
32 #include "xe_migrate.h"
33 #include "xe_pat.h"
34 #include "xe_pm.h"
35 #include "xe_preempt_fence.h"
36 #include "xe_pt.h"
37 #include "xe_pxp.h"
38 #include "xe_res_cursor.h"
39 #include "xe_svm.h"
40 #include "xe_sync.h"
41 #include "xe_trace_bo.h"
42 #include "xe_wa.h"
43 #include "xe_hmm.h"
44 
45 static struct drm_gem_object *xe_vm_obj(struct xe_vm *vm)
46 {
47 	return vm->gpuvm.r_obj;
48 }
49 
50 /**
51  * xe_vma_userptr_check_repin() - Advisory check for repin needed
52  * @uvma: The userptr vma
53  *
54  * Check if the userptr vma has been invalidated since last successful
55  * repin. The check is advisory only and the function can be called
56  * without the vm->userptr.notifier_lock held. There is no guarantee that the
57  * vma userptr will remain valid after a lockless check, so typically
58  * the call needs to be followed by a proper check under the notifier_lock.
59  *
60  * Return: 0 if userptr vma is valid, -EAGAIN otherwise; repin recommended.
61  */
62 int xe_vma_userptr_check_repin(struct xe_userptr_vma *uvma)
63 {
64 	return mmu_interval_check_retry(&uvma->userptr.notifier,
65 					uvma->userptr.notifier_seq) ?
66 		-EAGAIN : 0;
67 }
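
/*
 * Usage sketch (illustrative only, not part of the driver): the advisory
 * lockless check above is typically paired with an authoritative re-check
 * under the notifier lock, along the lines of:
 *
 *	if (xe_vma_userptr_check_repin(uvma))
 *		return -EAGAIN;				/* hint: repin first */
 *
 *	down_read(&vm->userptr.notifier_lock);
 *	err = __xe_vm_userptr_needs_repin(vm);		/* proper check */
 *	up_read(&vm->userptr.notifier_lock);
 */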
68 
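/**
 * xe_vma_userptr_pin_pages() - Pin the pages backing a userptr vma
 * @uvma: The userptr vma
 *
 * Populate the HMM range backing the userptr vma so that it can be
 * (re)bound. The vm->lock must be held.
 *
 * Return: 0 on success, negative error code on failure.
 */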
69 int xe_vma_userptr_pin_pages(struct xe_userptr_vma *uvma)
70 {
71 	struct xe_vma *vma = &uvma->vma;
72 	struct xe_vm *vm = xe_vma_vm(vma);
73 	struct xe_device *xe = vm->xe;
74 
75 	lockdep_assert_held(&vm->lock);
76 	xe_assert(xe, xe_vma_is_userptr(vma));
77 
78 	return xe_hmm_userptr_populate_range(uvma, false);
79 }
80 
81 static bool preempt_fences_waiting(struct xe_vm *vm)
82 {
83 	struct xe_exec_queue *q;
84 
85 	lockdep_assert_held(&vm->lock);
86 	xe_vm_assert_held(vm);
87 
88 	list_for_each_entry(q, &vm->preempt.exec_queues, lr.link) {
89 		if (!q->lr.pfence ||
90 		    test_bit(DMA_FENCE_FLAG_ENABLE_SIGNAL_BIT,
91 			     &q->lr.pfence->flags)) {
92 			return true;
93 		}
94 	}
95 
96 	return false;
97 }
98 
99 static void free_preempt_fences(struct list_head *list)
100 {
101 	struct list_head *link, *next;
102 
103 	list_for_each_safe(link, next, list)
104 		xe_preempt_fence_free(to_preempt_fence_from_link(link));
105 }
106 
107 static int alloc_preempt_fences(struct xe_vm *vm, struct list_head *list,
108 				unsigned int *count)
109 {
110 	lockdep_assert_held(&vm->lock);
111 	xe_vm_assert_held(vm);
112 
113 	if (*count >= vm->preempt.num_exec_queues)
114 		return 0;
115 
116 	for (; *count < vm->preempt.num_exec_queues; ++(*count)) {
117 		struct xe_preempt_fence *pfence = xe_preempt_fence_alloc();
118 
119 		if (IS_ERR(pfence))
120 			return PTR_ERR(pfence);
121 
122 		list_move_tail(xe_preempt_fence_link(pfence), list);
123 	}
124 
125 	return 0;
126 }
127 
128 static int wait_for_existing_preempt_fences(struct xe_vm *vm)
129 {
130 	struct xe_exec_queue *q;
131 
132 	xe_vm_assert_held(vm);
133 
134 	list_for_each_entry(q, &vm->preempt.exec_queues, lr.link) {
135 		if (q->lr.pfence) {
136 			long timeout = dma_fence_wait(q->lr.pfence, false);
137 
138 			/* Only -ETIME on the fence indicates the VM needs to be killed */
139 			if (timeout < 0 || q->lr.pfence->error == -ETIME)
140 				return -ETIME;
141 
142 			dma_fence_put(q->lr.pfence);
143 			q->lr.pfence = NULL;
144 		}
145 	}
146 
147 	return 0;
148 }
149 
150 static bool xe_vm_is_idle(struct xe_vm *vm)
151 {
152 	struct xe_exec_queue *q;
153 
154 	xe_vm_assert_held(vm);
155 	list_for_each_entry(q, &vm->preempt.exec_queues, lr.link) {
156 		if (!xe_exec_queue_is_idle(q))
157 			return false;
158 	}
159 
160 	return true;
161 }
162 
163 static void arm_preempt_fences(struct xe_vm *vm, struct list_head *list)
164 {
165 	struct list_head *link;
166 	struct xe_exec_queue *q;
167 
168 	list_for_each_entry(q, &vm->preempt.exec_queues, lr.link) {
169 		struct dma_fence *fence;
170 
171 		link = list->next;
172 		xe_assert(vm->xe, link != list);
173 
174 		fence = xe_preempt_fence_arm(to_preempt_fence_from_link(link),
175 					     q, q->lr.context,
176 					     ++q->lr.seqno);
177 		dma_fence_put(q->lr.pfence);
178 		q->lr.pfence = fence;
179 	}
180 }
181 
182 static int add_preempt_fences(struct xe_vm *vm, struct xe_bo *bo)
183 {
184 	struct xe_exec_queue *q;
185 	int err;
186 
187 	xe_bo_assert_held(bo);
188 
189 	if (!vm->preempt.num_exec_queues)
190 		return 0;
191 
192 	err = dma_resv_reserve_fences(bo->ttm.base.resv, vm->preempt.num_exec_queues);
193 	if (err)
194 		return err;
195 
196 	list_for_each_entry(q, &vm->preempt.exec_queues, lr.link)
197 		if (q->lr.pfence) {
198 			dma_resv_add_fence(bo->ttm.base.resv,
199 					   q->lr.pfence,
200 					   DMA_RESV_USAGE_BOOKKEEP);
201 		}
202 
203 	return 0;
204 }
205 
206 static void resume_and_reinstall_preempt_fences(struct xe_vm *vm,
207 						struct drm_exec *exec)
208 {
209 	struct xe_exec_queue *q;
210 
211 	lockdep_assert_held(&vm->lock);
212 	xe_vm_assert_held(vm);
213 
214 	list_for_each_entry(q, &vm->preempt.exec_queues, lr.link) {
215 		q->ops->resume(q);
216 
217 		drm_gpuvm_resv_add_fence(&vm->gpuvm, exec, q->lr.pfence,
218 					 DMA_RESV_USAGE_BOOKKEEP, DMA_RESV_USAGE_BOOKKEEP);
219 	}
220 }
221 
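/**
 * xe_vm_add_compute_exec_queue() - Add a compute exec queue to the VM
 * @vm: The VM.
 * @q: The exec_queue
 *
 * Create and attach a preempt fence for @q and add the queue to the VM's
 * list of preempt-fence mode exec queues.
 *
 * Return: 0 on success, negative error code on failure.
 */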
222 int xe_vm_add_compute_exec_queue(struct xe_vm *vm, struct xe_exec_queue *q)
223 {
224 	struct drm_gpuvm_exec vm_exec = {
225 		.vm = &vm->gpuvm,
226 		.flags = DRM_EXEC_INTERRUPTIBLE_WAIT,
227 		.num_fences = 1,
228 	};
229 	struct drm_exec *exec = &vm_exec.exec;
230 	struct dma_fence *pfence;
231 	int err;
232 	bool wait;
233 
234 	xe_assert(vm->xe, xe_vm_in_preempt_fence_mode(vm));
235 
236 	down_write(&vm->lock);
237 	err = drm_gpuvm_exec_lock(&vm_exec);
238 	if (err)
239 		goto out_up_write;
240 
241 	pfence = xe_preempt_fence_create(q, q->lr.context,
242 					 ++q->lr.seqno);
243 	if (!pfence) {
244 		err = -ENOMEM;
245 		goto out_fini;
246 	}
247 
248 	list_add(&q->lr.link, &vm->preempt.exec_queues);
249 	++vm->preempt.num_exec_queues;
250 	q->lr.pfence = pfence;
251 
252 	down_read(&vm->userptr.notifier_lock);
253 
254 	drm_gpuvm_resv_add_fence(&vm->gpuvm, exec, pfence,
255 				 DMA_RESV_USAGE_BOOKKEEP, DMA_RESV_USAGE_BOOKKEEP);
256 
257 	/*
258 	 * Check to see if a preemption on the VM or a userptr invalidation
259 	 * is in flight; if so, trigger this preempt fence to sync state with
260 	 * the other preempt fences on the VM.
261 	 */
262 	wait = __xe_vm_userptr_needs_repin(vm) || preempt_fences_waiting(vm);
263 	if (wait)
264 		dma_fence_enable_sw_signaling(pfence);
265 
266 	up_read(&vm->userptr.notifier_lock);
267 
268 out_fini:
269 	drm_exec_fini(exec);
270 out_up_write:
271 	up_write(&vm->lock);
272 
273 	return err;
274 }
275 ALLOW_ERROR_INJECTION(xe_vm_add_compute_exec_queue, ERRNO);
276 
277 /**
278  * xe_vm_remove_compute_exec_queue() - Remove compute exec queue from VM
279  * @vm: The VM.
280  * @q: The exec_queue
281  *
282  * Note that this function might be called multiple times on the same queue.
283  */
284 void xe_vm_remove_compute_exec_queue(struct xe_vm *vm, struct xe_exec_queue *q)
285 {
286 	if (!xe_vm_in_preempt_fence_mode(vm))
287 		return;
288 
289 	down_write(&vm->lock);
290 	if (!list_empty(&q->lr.link)) {
291 		list_del_init(&q->lr.link);
292 		--vm->preempt.num_exec_queues;
293 	}
294 	if (q->lr.pfence) {
295 		dma_fence_enable_sw_signaling(q->lr.pfence);
296 		dma_fence_put(q->lr.pfence);
297 		q->lr.pfence = NULL;
298 	}
299 	up_write(&vm->lock);
300 }
301 
302 /**
303  * __xe_vm_userptr_needs_repin() - Check whether the VM does have userptrs
304  * that need repinning.
305  * @vm: The VM.
306  *
307  * This function checks for whether the VM has userptrs that need repinning,
308  * and provides a release-type barrier on the userptr.notifier_lock after
309  * checking.
310  *
311  * Return: 0 if there are no userptrs needing repinning, -EAGAIN if there are.
312  */
313 int __xe_vm_userptr_needs_repin(struct xe_vm *vm)
314 {
315 	lockdep_assert_held_read(&vm->userptr.notifier_lock);
316 
317 	return (list_empty(&vm->userptr.repin_list) &&
318 		list_empty(&vm->userptr.invalidated)) ? 0 : -EAGAIN;
319 }
320 
321 #define XE_VM_REBIND_RETRY_TIMEOUT_MS 1000
322 
323 /**
324  * xe_vm_kill() - VM Kill
325  * @vm: The VM.
326  * @unlocked: Flag indicating the VM's dma-resv is not held
327  *
328  * Kill the VM by setting the banned flag, indicating the VM is no longer available
329  * for use. If in preempt fence mode, also kill all exec queues attached to the VM.
330  */
331 void xe_vm_kill(struct xe_vm *vm, bool unlocked)
332 {
333 	struct xe_exec_queue *q;
334 
335 	lockdep_assert_held(&vm->lock);
336 
337 	if (unlocked)
338 		xe_vm_lock(vm, false);
339 
340 	vm->flags |= XE_VM_FLAG_BANNED;
341 	trace_xe_vm_kill(vm);
342 
343 	list_for_each_entry(q, &vm->preempt.exec_queues, lr.link)
344 		q->ops->kill(q);
345 
346 	if (unlocked)
347 		xe_vm_unlock(vm);
348 
349 	/* TODO: Inform user the VM is banned */
350 }
351 
352 /**
353  * xe_vm_validate_should_retry() - Whether to retry after a validate error.
354  * @exec: The drm_exec object used for locking before validation.
355  * @err: The error returned from ttm_bo_validate().
356  * @end: A ktime_t cookie that should be set to 0 before first use and
357  * that should be reused on subsequent calls.
358  *
359  * With multiple active VMs, under memory pressure, it is possible that
360  * ttm_bo_validate() runs into -EDEADLK and in such a case returns -ENOMEM.
361  * Until TTM properly handles locking in such scenarios, the best thing the
362  * driver can do is retry with a timeout. Check if that is necessary, and
363  * if so unlock the drm_exec's objects while keeping the ticket to prepare
364  * for a rerun.
365  *
366  * Return: true if a retry after drm_exec_init() is recommended;
367  * false otherwise.
368  */
369 bool xe_vm_validate_should_retry(struct drm_exec *exec, int err, ktime_t *end)
370 {
371 	ktime_t cur;
372 
373 	if (err != -ENOMEM)
374 		return false;
375 
376 	cur = ktime_get();
377 	*end = *end ? : ktime_add_ms(cur, XE_VM_REBIND_RETRY_TIMEOUT_MS);
378 	if (!ktime_before(cur, *end))
379 		return false;
380 
381 	msleep(20);
382 	return true;
383 }
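
/*
 * Illustrative retry pattern (a sketch; see preempt_rebind_work_func() below
 * for a real caller). The ktime cookie starts at 0 and is reused on every
 * attempt so the whole sequence is bounded by XE_VM_REBIND_RETRY_TIMEOUT_MS:
 *
 *	ktime_t end = 0;
 *	int err;
 *
 * retry:
 *	drm_exec_init(&exec, DRM_EXEC_INTERRUPTIBLE_WAIT, 0);
 *	err = lock_and_validate(&exec);		/* hypothetical helper */
 *	drm_exec_fini(&exec);
 *	if (err && xe_vm_validate_should_retry(&exec, err, &end))
 *		goto retry;
 */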
384 
385 static int xe_gpuvm_validate(struct drm_gpuvm_bo *vm_bo, struct drm_exec *exec)
386 {
387 	struct xe_vm *vm = gpuvm_to_vm(vm_bo->vm);
388 	struct drm_gpuva *gpuva;
389 	int ret;
390 
391 	lockdep_assert_held(&vm->lock);
392 	drm_gpuvm_bo_for_each_va(gpuva, vm_bo)
393 		list_move_tail(&gpuva_to_vma(gpuva)->combined_links.rebind,
394 			       &vm->rebind_list);
395 
396 	ret = xe_bo_validate(gem_to_xe_bo(vm_bo->obj), vm, false);
397 	if (ret)
398 		return ret;
399 
400 	vm_bo->evicted = false;
401 	return 0;
402 }
403 
404 /**
405  * xe_vm_validate_rebind() - Validate buffer objects and rebind vmas
406  * @vm: The vm for which we are rebinding.
407  * @exec: The struct drm_exec with the locked GEM objects.
408  * @num_fences: The number of fences to reserve for the operation, not
409  * including rebinds and validations.
410  *
411  * Validates all evicted gem objects and rebinds their vmas. Note that
412  * rebindings may cause evictions and hence the validation-rebind
413  * sequence is rerun until there are no more objects to validate.
414  *
415  * Return: 0 on success, negative error code on error. In particular,
416  * may return -EINTR or -ERESTARTSYS if interrupted, and -EDEADLK if
417  * the drm_exec transaction needs to be restarted.
418  */
419 int xe_vm_validate_rebind(struct xe_vm *vm, struct drm_exec *exec,
420 			  unsigned int num_fences)
421 {
422 	struct drm_gem_object *obj;
423 	unsigned long index;
424 	int ret;
425 
426 	do {
427 		ret = drm_gpuvm_validate(&vm->gpuvm, exec);
428 		if (ret)
429 			return ret;
430 
431 		ret = xe_vm_rebind(vm, false);
432 		if (ret)
433 			return ret;
434 	} while (!list_empty(&vm->gpuvm.evict.list));
435 
436 	drm_exec_for_each_locked_object(exec, index, obj) {
437 		ret = dma_resv_reserve_fences(obj->resv, num_fences);
438 		if (ret)
439 			return ret;
440 	}
441 
442 	return 0;
443 }
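
/*
 * Sketch of a typical caller (illustrative only): validation and rebind run
 * inside the drm_exec locking loop so that -EDEADLK restarts the transaction:
 *
 *	drm_exec_until_all_locked(&exec) {
 *		err = drm_gpuvm_prepare_objects(&vm->gpuvm, &exec, 0);
 *		if (!err)
 *			err = xe_vm_validate_rebind(vm, &exec, 1);
 *		drm_exec_retry_on_contention(&exec);
 *	}
 */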
444 
445 static int xe_preempt_work_begin(struct drm_exec *exec, struct xe_vm *vm,
446 				 bool *done)
447 {
448 	int err;
449 
450 	err = drm_gpuvm_prepare_vm(&vm->gpuvm, exec, 0);
451 	if (err)
452 		return err;
453 
454 	if (xe_vm_is_idle(vm)) {
455 		vm->preempt.rebind_deactivated = true;
456 		*done = true;
457 		return 0;
458 	}
459 
460 	if (!preempt_fences_waiting(vm)) {
461 		*done = true;
462 		return 0;
463 	}
464 
465 	err = drm_gpuvm_prepare_objects(&vm->gpuvm, exec, 0);
466 	if (err)
467 		return err;
468 
469 	err = wait_for_existing_preempt_fences(vm);
470 	if (err)
471 		return err;
472 
473 	/*
474 	 * Add validation and rebinding to the locking loop since both can
475 	 * cause evictions which may require blocking dma_resv locks.
476 	 * The fence reservation here is intended for the new preempt fences
477 	 * we attach at the end of the rebind work.
478 	 */
479 	return xe_vm_validate_rebind(vm, exec, vm->preempt.num_exec_queues);
480 }
481 
482 static void preempt_rebind_work_func(struct work_struct *w)
483 {
484 	struct xe_vm *vm = container_of(w, struct xe_vm, preempt.rebind_work);
485 	struct drm_exec exec;
486 	unsigned int fence_count = 0;
487 	LIST_HEAD(preempt_fences);
488 	ktime_t end = 0;
489 	int err = 0;
490 	long wait;
491 	int __maybe_unused tries = 0;
492 
493 	xe_assert(vm->xe, xe_vm_in_preempt_fence_mode(vm));
494 	trace_xe_vm_rebind_worker_enter(vm);
495 
496 	down_write(&vm->lock);
497 
498 	if (xe_vm_is_closed_or_banned(vm)) {
499 		up_write(&vm->lock);
500 		trace_xe_vm_rebind_worker_exit(vm);
501 		return;
502 	}
503 
504 retry:
505 	if (xe_vm_userptr_check_repin(vm)) {
506 		err = xe_vm_userptr_pin(vm);
507 		if (err)
508 			goto out_unlock_outer;
509 	}
510 
511 	drm_exec_init(&exec, DRM_EXEC_INTERRUPTIBLE_WAIT, 0);
512 
513 	drm_exec_until_all_locked(&exec) {
514 		bool done = false;
515 
516 		err = xe_preempt_work_begin(&exec, vm, &done);
517 		drm_exec_retry_on_contention(&exec);
518 		if (err || done) {
519 			drm_exec_fini(&exec);
520 			if (err && xe_vm_validate_should_retry(&exec, err, &end))
521 				err = -EAGAIN;
522 
523 			goto out_unlock_outer;
524 		}
525 	}
526 
527 	err = alloc_preempt_fences(vm, &preempt_fences, &fence_count);
528 	if (err)
529 		goto out_unlock;
530 
531 	err = xe_vm_rebind(vm, true);
532 	if (err)
533 		goto out_unlock;
534 
535 	/* Wait on rebinds and munmap style VM unbinds */
536 	wait = dma_resv_wait_timeout(xe_vm_resv(vm),
537 				     DMA_RESV_USAGE_KERNEL,
538 				     false, MAX_SCHEDULE_TIMEOUT);
539 	if (wait <= 0) {
540 		err = -ETIME;
541 		goto out_unlock;
542 	}
543 
544 #define retry_required(__tries, __vm) \
545 	(IS_ENABLED(CONFIG_DRM_XE_USERPTR_INVAL_INJECT) ? \
546 	(!(__tries)++ || __xe_vm_userptr_needs_repin(__vm)) : \
547 	__xe_vm_userptr_needs_repin(__vm))
548 
549 	down_read(&vm->userptr.notifier_lock);
550 	if (retry_required(tries, vm)) {
551 		up_read(&vm->userptr.notifier_lock);
552 		err = -EAGAIN;
553 		goto out_unlock;
554 	}
555 
556 #undef retry_required
557 
558 	spin_lock(&vm->xe->ttm.lru_lock);
559 	ttm_lru_bulk_move_tail(&vm->lru_bulk_move);
560 	spin_unlock(&vm->xe->ttm.lru_lock);
561 
562 	/* Point of no return. */
563 	arm_preempt_fences(vm, &preempt_fences);
564 	resume_and_reinstall_preempt_fences(vm, &exec);
565 	up_read(&vm->userptr.notifier_lock);
566 
567 out_unlock:
568 	drm_exec_fini(&exec);
569 out_unlock_outer:
570 	if (err == -EAGAIN) {
571 		trace_xe_vm_rebind_worker_retry(vm);
572 		goto retry;
573 	}
574 
575 	if (err) {
576 		drm_warn(&vm->xe->drm, "VM worker error: %d\n", err);
577 		xe_vm_kill(vm, true);
578 	}
579 	up_write(&vm->lock);
580 
581 	free_preempt_fences(&preempt_fences);
582 
583 	trace_xe_vm_rebind_worker_exit(vm);
584 }
585 
586 static void __vma_userptr_invalidate(struct xe_vm *vm, struct xe_userptr_vma *uvma)
587 {
588 	struct xe_userptr *userptr = &uvma->userptr;
589 	struct xe_vma *vma = &uvma->vma;
590 	struct dma_resv_iter cursor;
591 	struct dma_fence *fence;
592 	long err;
593 
594 	/*
595 	 * Tell exec and rebind worker they need to repin and rebind this
596 	 * userptr.
597 	 */
598 	if (!xe_vm_in_fault_mode(vm) &&
599 	    !(vma->gpuva.flags & XE_VMA_DESTROYED)) {
600 		spin_lock(&vm->userptr.invalidated_lock);
601 		list_move_tail(&userptr->invalidate_link,
602 			       &vm->userptr.invalidated);
603 		spin_unlock(&vm->userptr.invalidated_lock);
604 	}
605 
606 	/*
607 	 * Preempt fences turn into schedule disables, pipeline these.
608 	 * Note that even in fault mode, we need to wait for binds and
609 	 * unbinds to complete, and those are attached as BOOKKEEP fences
610 	 * to the vm.
611 	 */
612 	dma_resv_iter_begin(&cursor, xe_vm_resv(vm),
613 			    DMA_RESV_USAGE_BOOKKEEP);
614 	dma_resv_for_each_fence_unlocked(&cursor, fence)
615 		dma_fence_enable_sw_signaling(fence);
616 	dma_resv_iter_end(&cursor);
617 
618 	err = dma_resv_wait_timeout(xe_vm_resv(vm),
619 				    DMA_RESV_USAGE_BOOKKEEP,
620 				    false, MAX_SCHEDULE_TIMEOUT);
621 	XE_WARN_ON(err <= 0);
622 
623 	if (xe_vm_in_fault_mode(vm) && userptr->initial_bind) {
624 		err = xe_vm_invalidate_vma(vma);
625 		XE_WARN_ON(err);
626 	}
627 
628 	xe_hmm_userptr_unmap(uvma);
629 }
630 
631 static bool vma_userptr_invalidate(struct mmu_interval_notifier *mni,
632 				   const struct mmu_notifier_range *range,
633 				   unsigned long cur_seq)
634 {
635 	struct xe_userptr_vma *uvma = container_of(mni, typeof(*uvma), userptr.notifier);
636 	struct xe_vma *vma = &uvma->vma;
637 	struct xe_vm *vm = xe_vma_vm(vma);
638 
639 	xe_assert(vm->xe, xe_vma_is_userptr(vma));
640 	trace_xe_vma_userptr_invalidate(vma);
641 
642 	if (!mmu_notifier_range_blockable(range))
643 		return false;
644 
645 	vm_dbg(&xe_vma_vm(vma)->xe->drm,
646 	       "NOTIFIER: addr=0x%016llx, range=0x%016llx",
647 		xe_vma_start(vma), xe_vma_size(vma));
648 
649 	down_write(&vm->userptr.notifier_lock);
650 	mmu_interval_set_seq(mni, cur_seq);
651 
652 	__vma_userptr_invalidate(vm, uvma);
653 	up_write(&vm->userptr.notifier_lock);
654 	trace_xe_vma_userptr_invalidate_complete(vma);
655 
656 	return true;
657 }
658 
659 static const struct mmu_interval_notifier_ops vma_userptr_notifier_ops = {
660 	.invalidate = vma_userptr_invalidate,
661 };
662 
663 #if IS_ENABLED(CONFIG_DRM_XE_USERPTR_INVAL_INJECT)
664 /**
665  * xe_vma_userptr_force_invalidate() - force invalidate a userptr
666  * @uvma: The userptr vma to invalidate
667  *
668  * Perform a forced userptr invalidation for testing purposes.
669  */
670 void xe_vma_userptr_force_invalidate(struct xe_userptr_vma *uvma)
671 {
672 	struct xe_vm *vm = xe_vma_vm(&uvma->vma);
673 
674 	/* Protect against concurrent userptr pinning */
675 	lockdep_assert_held(&vm->lock);
676 	/* Protect against concurrent notifiers */
677 	lockdep_assert_held(&vm->userptr.notifier_lock);
678 	/*
679 	 * Protect against concurrent instances of this function and
680 	 * the critical exec sections
681 	 */
682 	xe_vm_assert_held(vm);
683 
684 	if (!mmu_interval_read_retry(&uvma->userptr.notifier,
685 				     uvma->userptr.notifier_seq))
686 		uvma->userptr.notifier_seq -= 2;
687 	__vma_userptr_invalidate(vm, uvma);
688 }
689 #endif
690 
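/**
 * xe_vm_userptr_pin() - Pin all invalidated userptrs in a VM
 * @vm: The VM.
 *
 * Collect the userptr vmas that have been invalidated by the MMU notifier,
 * repin their pages and move them to the rebind list. Must be called with
 * the vm->lock held for write and not in fault mode.
 *
 * Return: 0 on success, negative error code on failure.
 */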
691 int xe_vm_userptr_pin(struct xe_vm *vm)
692 {
693 	struct xe_userptr_vma *uvma, *next;
694 	int err = 0;
695 
696 	xe_assert(vm->xe, !xe_vm_in_fault_mode(vm));
697 	lockdep_assert_held_write(&vm->lock);
698 
699 	/* Collect invalidated userptrs */
700 	spin_lock(&vm->userptr.invalidated_lock);
701 	xe_assert(vm->xe, list_empty(&vm->userptr.repin_list));
702 	list_for_each_entry_safe(uvma, next, &vm->userptr.invalidated,
703 				 userptr.invalidate_link) {
704 		list_del_init(&uvma->userptr.invalidate_link);
705 		list_add_tail(&uvma->userptr.repin_link,
706 			      &vm->userptr.repin_list);
707 	}
708 	spin_unlock(&vm->userptr.invalidated_lock);
709 
710 	/* Pin and move to bind list */
711 	list_for_each_entry_safe(uvma, next, &vm->userptr.repin_list,
712 				 userptr.repin_link) {
713 		err = xe_vma_userptr_pin_pages(uvma);
714 		if (err == -EFAULT) {
715 			list_del_init(&uvma->userptr.repin_link);
716 			/*
717 			 * We might have already done the pin once, but then
718 			 * had to retry before the re-bind happened, due to
719 			 * some other condition in the caller; in the
720 			 * meantime the userptr got dinged by the notifier such
721 			 * that we need to revalidate here, but this time we hit
722 			 * the EFAULT. In such a case make sure we remove
723 			 * ourselves from the rebind list to avoid going down in
724 			 * flames.
725 			 */
726 			if (!list_empty(&uvma->vma.combined_links.rebind))
727 				list_del_init(&uvma->vma.combined_links.rebind);
728 
729 			/* Wait for pending binds */
730 			xe_vm_lock(vm, false);
731 			dma_resv_wait_timeout(xe_vm_resv(vm),
732 					      DMA_RESV_USAGE_BOOKKEEP,
733 					      false, MAX_SCHEDULE_TIMEOUT);
734 
735 			err = xe_vm_invalidate_vma(&uvma->vma);
736 			xe_vm_unlock(vm);
737 			if (err)
738 				break;
739 		} else {
740 			if (err)
741 				break;
742 
743 			list_del_init(&uvma->userptr.repin_link);
744 			list_move_tail(&uvma->vma.combined_links.rebind,
745 				       &vm->rebind_list);
746 		}
747 	}
748 
749 	if (err) {
750 		down_write(&vm->userptr.notifier_lock);
751 		spin_lock(&vm->userptr.invalidated_lock);
752 		list_for_each_entry_safe(uvma, next, &vm->userptr.repin_list,
753 					 userptr.repin_link) {
754 			list_del_init(&uvma->userptr.repin_link);
755 			list_move_tail(&uvma->userptr.invalidate_link,
756 				       &vm->userptr.invalidated);
757 		}
758 		spin_unlock(&vm->userptr.invalidated_lock);
759 		up_write(&vm->userptr.notifier_lock);
760 	}
761 	return err;
762 }
763 
764 /**
765  * xe_vm_userptr_check_repin() - Check whether the VM might have userptrs
766  * that need repinning.
767  * @vm: The VM.
768  *
769  * This function does an advisory check for whether the VM has userptrs that
770  * need repinning.
771  *
772  * Return: 0 if there are no indications of userptrs needing repinning,
773  * -EAGAIN if there are.
774  */
775 int xe_vm_userptr_check_repin(struct xe_vm *vm)
776 {
777 	return (list_empty_careful(&vm->userptr.repin_list) &&
778 		list_empty_careful(&vm->userptr.invalidated)) ? 0 : -EAGAIN;
779 }
780 
781 static int xe_vma_ops_alloc(struct xe_vma_ops *vops, bool array_of_binds)
782 {
783 	int i;
784 
785 	for (i = 0; i < XE_MAX_TILES_PER_DEVICE; ++i) {
786 		if (!vops->pt_update_ops[i].num_ops)
787 			continue;
788 
789 		vops->pt_update_ops[i].ops =
790 			kmalloc_array(vops->pt_update_ops[i].num_ops,
791 				      sizeof(*vops->pt_update_ops[i].ops),
792 				      GFP_KERNEL | __GFP_RETRY_MAYFAIL | __GFP_NOWARN);
793 		if (!vops->pt_update_ops[i].ops)
794 			return array_of_binds ? -ENOBUFS : -ENOMEM;
795 	}
796 
797 	return 0;
798 }
799 ALLOW_ERROR_INJECTION(xe_vma_ops_alloc, ERRNO);
800 
801 static void xe_vma_svm_prefetch_op_fini(struct xe_vma_op *op)
802 {
803 	struct xe_vma *vma;
804 
805 	vma = gpuva_to_vma(op->base.prefetch.va);
806 
807 	if (op->base.op == DRM_GPUVA_OP_PREFETCH && xe_vma_is_cpu_addr_mirror(vma))
808 		xa_destroy(&op->prefetch_range.range);
809 }
810 
811 static void xe_vma_svm_prefetch_ops_fini(struct xe_vma_ops *vops)
812 {
813 	struct xe_vma_op *op;
814 
815 	if (!(vops->flags & XE_VMA_OPS_FLAG_HAS_SVM_PREFETCH))
816 		return;
817 
818 	list_for_each_entry(op, &vops->list, link)
819 		xe_vma_svm_prefetch_op_fini(op);
820 }
821 
822 static void xe_vma_ops_fini(struct xe_vma_ops *vops)
823 {
824 	int i;
825 
826 	xe_vma_svm_prefetch_ops_fini(vops);
827 
828 	for (i = 0; i < XE_MAX_TILES_PER_DEVICE; ++i)
829 		kfree(vops->pt_update_ops[i].ops);
830 }
831 
832 static void xe_vma_ops_incr_pt_update_ops(struct xe_vma_ops *vops, u8 tile_mask, int inc_val)
833 {
834 	int i;
835 
836 	if (!inc_val)
837 		return;
838 
839 	for (i = 0; i < XE_MAX_TILES_PER_DEVICE; ++i)
840 		if (BIT(i) & tile_mask)
841 			vops->pt_update_ops[i].num_ops += inc_val;
842 }
843 
844 static void xe_vm_populate_rebind(struct xe_vma_op *op, struct xe_vma *vma,
845 				  u8 tile_mask)
846 {
847 	INIT_LIST_HEAD(&op->link);
848 	op->tile_mask = tile_mask;
849 	op->base.op = DRM_GPUVA_OP_MAP;
850 	op->base.map.va.addr = vma->gpuva.va.addr;
851 	op->base.map.va.range = vma->gpuva.va.range;
852 	op->base.map.gem.obj = vma->gpuva.gem.obj;
853 	op->base.map.gem.offset = vma->gpuva.gem.offset;
854 	op->map.vma = vma;
855 	op->map.immediate = true;
856 	op->map.dumpable = vma->gpuva.flags & XE_VMA_DUMPABLE;
857 	op->map.is_null = xe_vma_is_null(vma);
858 }
859 
860 static int xe_vm_ops_add_rebind(struct xe_vma_ops *vops, struct xe_vma *vma,
861 				u8 tile_mask)
862 {
863 	struct xe_vma_op *op;
864 
865 	op = kzalloc(sizeof(*op), GFP_KERNEL);
866 	if (!op)
867 		return -ENOMEM;
868 
869 	xe_vm_populate_rebind(op, vma, tile_mask);
870 	list_add_tail(&op->link, &vops->list);
871 	xe_vma_ops_incr_pt_update_ops(vops, tile_mask, 1);
872 
873 	return 0;
874 }
875 
876 static struct dma_fence *ops_execute(struct xe_vm *vm,
877 				     struct xe_vma_ops *vops);
878 static void xe_vma_ops_init(struct xe_vma_ops *vops, struct xe_vm *vm,
879 			    struct xe_exec_queue *q,
880 			    struct xe_sync_entry *syncs, u32 num_syncs);
881 
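/**
 * xe_vm_rebind() - Rebind all vmas on the VM's rebind list
 * @vm: The VM.
 * @rebind_worker: Whether the caller is the preempt rebind worker.
 *
 * Build and execute map operations for every vma on vm->rebind_list and,
 * on success, remove the vmas from the list. Requires the vm->lock and the
 * VM dma-resv to be held.
 *
 * Return: 0 on success, negative error code on failure.
 */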
882 int xe_vm_rebind(struct xe_vm *vm, bool rebind_worker)
883 {
884 	struct dma_fence *fence;
885 	struct xe_vma *vma, *next;
886 	struct xe_vma_ops vops;
887 	struct xe_vma_op *op, *next_op;
888 	int err, i;
889 
890 	lockdep_assert_held(&vm->lock);
891 	if ((xe_vm_in_lr_mode(vm) && !rebind_worker) ||
892 	    list_empty(&vm->rebind_list))
893 		return 0;
894 
895 	xe_vma_ops_init(&vops, vm, NULL, NULL, 0);
896 	for (i = 0; i < XE_MAX_TILES_PER_DEVICE; ++i)
897 		vops.pt_update_ops[i].wait_vm_bookkeep = true;
898 
899 	xe_vm_assert_held(vm);
900 	list_for_each_entry(vma, &vm->rebind_list, combined_links.rebind) {
901 		xe_assert(vm->xe, vma->tile_present);
902 
903 		if (rebind_worker)
904 			trace_xe_vma_rebind_worker(vma);
905 		else
906 			trace_xe_vma_rebind_exec(vma);
907 
908 		err = xe_vm_ops_add_rebind(&vops, vma,
909 					   vma->tile_present);
910 		if (err)
911 			goto free_ops;
912 	}
913 
914 	err = xe_vma_ops_alloc(&vops, false);
915 	if (err)
916 		goto free_ops;
917 
918 	fence = ops_execute(vm, &vops);
919 	if (IS_ERR(fence)) {
920 		err = PTR_ERR(fence);
921 	} else {
922 		dma_fence_put(fence);
923 		list_for_each_entry_safe(vma, next, &vm->rebind_list,
924 					 combined_links.rebind)
925 			list_del_init(&vma->combined_links.rebind);
926 	}
927 free_ops:
928 	list_for_each_entry_safe(op, next_op, &vops.list, link) {
929 		list_del(&op->link);
930 		kfree(op);
931 	}
932 	xe_vma_ops_fini(&vops);
933 
934 	return err;
935 }
936 
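/**
 * xe_vma_rebind() - Rebind a single vma
 * @vm: The VM.
 * @vma: The vma to rebind.
 * @tile_mask: Tile mask of tiles for which to rebind.
 *
 * (Re)bind the vma on the tiles in @tile_mask using the per-tile migrate
 * exec queues. Intended for fault-mode VMs; the vm->lock and the VM dma-resv
 * must be held.
 *
 * Return: dma fence for the rebind to signal completion on success,
 * ERR_PTR on failure.
 */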
937 struct dma_fence *xe_vma_rebind(struct xe_vm *vm, struct xe_vma *vma, u8 tile_mask)
938 {
939 	struct dma_fence *fence = NULL;
940 	struct xe_vma_ops vops;
941 	struct xe_vma_op *op, *next_op;
942 	struct xe_tile *tile;
943 	u8 id;
944 	int err;
945 
946 	lockdep_assert_held(&vm->lock);
947 	xe_vm_assert_held(vm);
948 	xe_assert(vm->xe, xe_vm_in_fault_mode(vm));
949 
950 	xe_vma_ops_init(&vops, vm, NULL, NULL, 0);
951 	for_each_tile(tile, vm->xe, id) {
952 		vops.pt_update_ops[id].wait_vm_bookkeep = true;
953 		vops.pt_update_ops[tile->id].q =
954 			xe_tile_migrate_exec_queue(tile);
955 	}
956 
957 	err = xe_vm_ops_add_rebind(&vops, vma, tile_mask);
958 	if (err)
959 		return ERR_PTR(err);
960 
961 	err = xe_vma_ops_alloc(&vops, false);
962 	if (err) {
963 		fence = ERR_PTR(err);
964 		goto free_ops;
965 	}
966 
967 	fence = ops_execute(vm, &vops);
968 
969 free_ops:
970 	list_for_each_entry_safe(op, next_op, &vops.list, link) {
971 		list_del(&op->link);
972 		kfree(op);
973 	}
974 	xe_vma_ops_fini(&vops);
975 
976 	return fence;
977 }
978 
979 static void xe_vm_populate_range_rebind(struct xe_vma_op *op,
980 					struct xe_vma *vma,
981 					struct xe_svm_range *range,
982 					u8 tile_mask)
983 {
984 	INIT_LIST_HEAD(&op->link);
985 	op->tile_mask = tile_mask;
986 	op->base.op = DRM_GPUVA_OP_DRIVER;
987 	op->subop = XE_VMA_SUBOP_MAP_RANGE;
988 	op->map_range.vma = vma;
989 	op->map_range.range = range;
990 }
991 
992 static int
993 xe_vm_ops_add_range_rebind(struct xe_vma_ops *vops,
994 			   struct xe_vma *vma,
995 			   struct xe_svm_range *range,
996 			   u8 tile_mask)
997 {
998 	struct xe_vma_op *op;
999 
1000 	op = kzalloc(sizeof(*op), GFP_KERNEL);
1001 	if (!op)
1002 		return -ENOMEM;
1003 
1004 	xe_vm_populate_range_rebind(op, vma, range, tile_mask);
1005 	list_add_tail(&op->link, &vops->list);
1006 	xe_vma_ops_incr_pt_update_ops(vops, tile_mask, 1);
1007 
1008 	return 0;
1009 }
1010 
1011 /**
1012  * xe_vm_range_rebind() - VM range (re)bind
1013  * @vm: The VM which the range belongs to.
1014  * @vma: The VMA which the range belongs to.
1015  * @range: SVM range to rebind.
1016  * @tile_mask: Tile mask to bind the range to.
1017  *
1018  * (re)bind SVM range setting up GPU page tables for the range.
1019  *
1020  * Return: dma fence for rebind to signal completion on success, ERR_PTR on
1021  * failure
1022  */
1023 struct dma_fence *xe_vm_range_rebind(struct xe_vm *vm,
1024 				     struct xe_vma *vma,
1025 				     struct xe_svm_range *range,
1026 				     u8 tile_mask)
1027 {
1028 	struct dma_fence *fence = NULL;
1029 	struct xe_vma_ops vops;
1030 	struct xe_vma_op *op, *next_op;
1031 	struct xe_tile *tile;
1032 	u8 id;
1033 	int err;
1034 
1035 	lockdep_assert_held(&vm->lock);
1036 	xe_vm_assert_held(vm);
1037 	xe_assert(vm->xe, xe_vm_in_fault_mode(vm));
1038 	xe_assert(vm->xe, xe_vma_is_cpu_addr_mirror(vma));
1039 
1040 	xe_vma_ops_init(&vops, vm, NULL, NULL, 0);
1041 	for_each_tile(tile, vm->xe, id) {
1042 		vops.pt_update_ops[id].wait_vm_bookkeep = true;
1043 		vops.pt_update_ops[tile->id].q =
1044 			xe_tile_migrate_exec_queue(tile);
1045 	}
1046 
1047 	err = xe_vm_ops_add_range_rebind(&vops, vma, range, tile_mask);
1048 	if (err)
1049 		return ERR_PTR(err);
1050 
1051 	err = xe_vma_ops_alloc(&vops, false);
1052 	if (err) {
1053 		fence = ERR_PTR(err);
1054 		goto free_ops;
1055 	}
1056 
1057 	fence = ops_execute(vm, &vops);
1058 
1059 free_ops:
1060 	list_for_each_entry_safe(op, next_op, &vops.list, link) {
1061 		list_del(&op->link);
1062 		kfree(op);
1063 	}
1064 	xe_vma_ops_fini(&vops);
1065 
1066 	return fence;
1067 }
1068 
1069 static void xe_vm_populate_range_unbind(struct xe_vma_op *op,
1070 					struct xe_svm_range *range)
1071 {
1072 	INIT_LIST_HEAD(&op->link);
1073 	op->tile_mask = range->tile_present;
1074 	op->base.op = DRM_GPUVA_OP_DRIVER;
1075 	op->subop = XE_VMA_SUBOP_UNMAP_RANGE;
1076 	op->unmap_range.range = range;
1077 }
1078 
1079 static int
1080 xe_vm_ops_add_range_unbind(struct xe_vma_ops *vops,
1081 			   struct xe_svm_range *range)
1082 {
1083 	struct xe_vma_op *op;
1084 
1085 	op = kzalloc(sizeof(*op), GFP_KERNEL);
1086 	if (!op)
1087 		return -ENOMEM;
1088 
1089 	xe_vm_populate_range_unbind(op, range);
1090 	list_add_tail(&op->link, &vops->list);
1091 	xe_vma_ops_incr_pt_update_ops(vops, range->tile_present, 1);
1092 
1093 	return 0;
1094 }
1095 
1096 /**
1097  * xe_vm_range_unbind() - VM range unbind
1098  * @vm: The VM which the range belongs to.
1099  * @range: SVM range to unbind.
1100  *
1101  * Unbind SVM range removing the GPU page tables for the range.
1102  *
1103  * Return: dma fence for unbind to signal completion on success, ERR_PTR on
1104  * failure
1105  */
1106 struct dma_fence *xe_vm_range_unbind(struct xe_vm *vm,
1107 				     struct xe_svm_range *range)
1108 {
1109 	struct dma_fence *fence = NULL;
1110 	struct xe_vma_ops vops;
1111 	struct xe_vma_op *op, *next_op;
1112 	struct xe_tile *tile;
1113 	u8 id;
1114 	int err;
1115 
1116 	lockdep_assert_held(&vm->lock);
1117 	xe_vm_assert_held(vm);
1118 	xe_assert(vm->xe, xe_vm_in_fault_mode(vm));
1119 
1120 	if (!range->tile_present)
1121 		return dma_fence_get_stub();
1122 
1123 	xe_vma_ops_init(&vops, vm, NULL, NULL, 0);
1124 	for_each_tile(tile, vm->xe, id) {
1125 		vops.pt_update_ops[id].wait_vm_bookkeep = true;
1126 		vops.pt_update_ops[tile->id].q =
1127 			xe_tile_migrate_exec_queue(tile);
1128 	}
1129 
1130 	err = xe_vm_ops_add_range_unbind(&vops, range);
1131 	if (err)
1132 		return ERR_PTR(err);
1133 
1134 	err = xe_vma_ops_alloc(&vops, false);
1135 	if (err) {
1136 		fence = ERR_PTR(err);
1137 		goto free_ops;
1138 	}
1139 
1140 	fence = ops_execute(vm, &vops);
1141 
1142 free_ops:
1143 	list_for_each_entry_safe(op, next_op, &vops.list, link) {
1144 		list_del(&op->link);
1145 		kfree(op);
1146 	}
1147 	xe_vma_ops_fini(&vops);
1148 
1149 	return fence;
1150 }
1151 
1152 static void xe_vma_free(struct xe_vma *vma)
1153 {
1154 	if (xe_vma_is_userptr(vma))
1155 		kfree(to_userptr_vma(vma));
1156 	else
1157 		kfree(vma);
1158 }
1159 
1160 #define VMA_CREATE_FLAG_READ_ONLY		BIT(0)
1161 #define VMA_CREATE_FLAG_IS_NULL			BIT(1)
1162 #define VMA_CREATE_FLAG_DUMPABLE		BIT(2)
1163 #define VMA_CREATE_FLAG_IS_SYSTEM_ALLOCATOR	BIT(3)
1164 
1165 static struct xe_vma *xe_vma_create(struct xe_vm *vm,
1166 				    struct xe_bo *bo,
1167 				    u64 bo_offset_or_userptr,
1168 				    u64 start, u64 end,
1169 				    u16 pat_index, unsigned int flags)
1170 {
1171 	struct xe_vma *vma;
1172 	struct xe_tile *tile;
1173 	u8 id;
1174 	bool read_only = (flags & VMA_CREATE_FLAG_READ_ONLY);
1175 	bool is_null = (flags & VMA_CREATE_FLAG_IS_NULL);
1176 	bool dumpable = (flags & VMA_CREATE_FLAG_DUMPABLE);
1177 	bool is_cpu_addr_mirror =
1178 		(flags & VMA_CREATE_FLAG_IS_SYSTEM_ALLOCATOR);
1179 
1180 	xe_assert(vm->xe, start < end);
1181 	xe_assert(vm->xe, end < vm->size);
1182 
1183 	/*
1184 	 * Allocate and ensure that the xe_vma_is_userptr() return
1185 	 * matches what was allocated.
1186 	 */
1187 	if (!bo && !is_null && !is_cpu_addr_mirror) {
1188 		struct xe_userptr_vma *uvma = kzalloc(sizeof(*uvma), GFP_KERNEL);
1189 
1190 		if (!uvma)
1191 			return ERR_PTR(-ENOMEM);
1192 
1193 		vma = &uvma->vma;
1194 	} else {
1195 		vma = kzalloc(sizeof(*vma), GFP_KERNEL);
1196 		if (!vma)
1197 			return ERR_PTR(-ENOMEM);
1198 
1199 		if (is_cpu_addr_mirror)
1200 			vma->gpuva.flags |= XE_VMA_SYSTEM_ALLOCATOR;
1201 		if (is_null)
1202 			vma->gpuva.flags |= DRM_GPUVA_SPARSE;
1203 		if (bo)
1204 			vma->gpuva.gem.obj = &bo->ttm.base;
1205 	}
1206 
1207 	INIT_LIST_HEAD(&vma->combined_links.rebind);
1208 
1209 	INIT_LIST_HEAD(&vma->gpuva.gem.entry);
1210 	vma->gpuva.vm = &vm->gpuvm;
1211 	vma->gpuva.va.addr = start;
1212 	vma->gpuva.va.range = end - start + 1;
1213 	if (read_only)
1214 		vma->gpuva.flags |= XE_VMA_READ_ONLY;
1215 	if (dumpable)
1216 		vma->gpuva.flags |= XE_VMA_DUMPABLE;
1217 
1218 	for_each_tile(tile, vm->xe, id)
1219 		vma->tile_mask |= 0x1 << id;
1220 
1221 	if (vm->xe->info.has_atomic_enable_pte_bit)
1222 		vma->gpuva.flags |= XE_VMA_ATOMIC_PTE_BIT;
1223 
1224 	vma->pat_index = pat_index;
1225 
1226 	if (bo) {
1227 		struct drm_gpuvm_bo *vm_bo;
1228 
1229 		xe_bo_assert_held(bo);
1230 
1231 		vm_bo = drm_gpuvm_bo_obtain(vma->gpuva.vm, &bo->ttm.base);
1232 		if (IS_ERR(vm_bo)) {
1233 			xe_vma_free(vma);
1234 			return ERR_CAST(vm_bo);
1235 		}
1236 
1237 		drm_gpuvm_bo_extobj_add(vm_bo);
1238 		drm_gem_object_get(&bo->ttm.base);
1239 		vma->gpuva.gem.offset = bo_offset_or_userptr;
1240 		drm_gpuva_link(&vma->gpuva, vm_bo);
1241 		drm_gpuvm_bo_put(vm_bo);
1242 	} else /* userptr or null */ {
1243 		if (!is_null && !is_cpu_addr_mirror) {
1244 			struct xe_userptr *userptr = &to_userptr_vma(vma)->userptr;
1245 			u64 size = end - start + 1;
1246 			int err;
1247 
1248 			INIT_LIST_HEAD(&userptr->invalidate_link);
1249 			INIT_LIST_HEAD(&userptr->repin_link);
1250 			vma->gpuva.gem.offset = bo_offset_or_userptr;
1251 			mutex_init(&userptr->unmap_mutex);
1252 
1253 			err = mmu_interval_notifier_insert(&userptr->notifier,
1254 							   current->mm,
1255 							   xe_vma_userptr(vma), size,
1256 							   &vma_userptr_notifier_ops);
1257 			if (err) {
1258 				xe_vma_free(vma);
1259 				return ERR_PTR(err);
1260 			}
1261 
1262 			userptr->notifier_seq = LONG_MAX;
1263 		}
1264 
1265 		xe_vm_get(vm);
1266 	}
1267 
1268 	return vma;
1269 }
1270 
1271 static void xe_vma_destroy_late(struct xe_vma *vma)
1272 {
1273 	struct xe_vm *vm = xe_vma_vm(vma);
1274 
1275 	if (vma->ufence) {
1276 		xe_sync_ufence_put(vma->ufence);
1277 		vma->ufence = NULL;
1278 	}
1279 
1280 	if (xe_vma_is_userptr(vma)) {
1281 		struct xe_userptr_vma *uvma = to_userptr_vma(vma);
1282 		struct xe_userptr *userptr = &uvma->userptr;
1283 
1284 		if (userptr->sg)
1285 			xe_hmm_userptr_free_sg(uvma);
1286 
1287 		/*
1288 		 * Since userptr pages are not pinned, we can't remove
1289 		 * the notifier until we're sure the GPU is not accessing
1290 		 * them anymore
1291 		 */
1292 		mmu_interval_notifier_remove(&userptr->notifier);
1293 		mutex_destroy(&userptr->unmap_mutex);
1294 		xe_vm_put(vm);
1295 	} else if (xe_vma_is_null(vma) || xe_vma_is_cpu_addr_mirror(vma)) {
1296 		xe_vm_put(vm);
1297 	} else {
1298 		xe_bo_put(xe_vma_bo(vma));
1299 	}
1300 
1301 	xe_vma_free(vma);
1302 }
1303 
1304 static void vma_destroy_work_func(struct work_struct *w)
1305 {
1306 	struct xe_vma *vma =
1307 		container_of(w, struct xe_vma, destroy_work);
1308 
1309 	xe_vma_destroy_late(vma);
1310 }
1311 
1312 static void vma_destroy_cb(struct dma_fence *fence,
1313 			   struct dma_fence_cb *cb)
1314 {
1315 	struct xe_vma *vma = container_of(cb, struct xe_vma, destroy_cb);
1316 
1317 	INIT_WORK(&vma->destroy_work, vma_destroy_work_func);
1318 	queue_work(system_unbound_wq, &vma->destroy_work);
1319 }
1320 
1321 static void xe_vma_destroy(struct xe_vma *vma, struct dma_fence *fence)
1322 {
1323 	struct xe_vm *vm = xe_vma_vm(vma);
1324 
1325 	lockdep_assert_held_write(&vm->lock);
1326 	xe_assert(vm->xe, list_empty(&vma->combined_links.destroy));
1327 
1328 	if (xe_vma_is_userptr(vma)) {
1329 		xe_assert(vm->xe, vma->gpuva.flags & XE_VMA_DESTROYED);
1330 
1331 		spin_lock(&vm->userptr.invalidated_lock);
1332 		xe_assert(vm->xe, list_empty(&to_userptr_vma(vma)->userptr.repin_link));
1333 		list_del(&to_userptr_vma(vma)->userptr.invalidate_link);
1334 		spin_unlock(&vm->userptr.invalidated_lock);
1335 	} else if (!xe_vma_is_null(vma) && !xe_vma_is_cpu_addr_mirror(vma)) {
1336 		xe_bo_assert_held(xe_vma_bo(vma));
1337 
1338 		drm_gpuva_unlink(&vma->gpuva);
1339 	}
1340 
1341 	xe_vm_assert_held(vm);
1342 	if (fence) {
1343 		int ret = dma_fence_add_callback(fence, &vma->destroy_cb,
1344 						 vma_destroy_cb);
1345 
1346 		if (ret) {
1347 			XE_WARN_ON(ret != -ENOENT);
1348 			xe_vma_destroy_late(vma);
1349 		}
1350 	} else {
1351 		xe_vma_destroy_late(vma);
1352 	}
1353 }
1354 
1355 /**
1356  * xe_vm_lock_vma() - drm_exec utility to lock a vma
1357  * @exec: The drm_exec object we're currently locking for.
1358  * @vma: The vma for which we want to lock the vm resv and any attached
1359  * object's resv.
1360  *
1361  * Return: 0 on success, negative error code on error. In particular
1362  * may return -EDEADLK on WW transaction contention and -EINTR if
1363  * an interruptible wait is terminated by a signal.
1364  */
1365 int xe_vm_lock_vma(struct drm_exec *exec, struct xe_vma *vma)
1366 {
1367 	struct xe_vm *vm = xe_vma_vm(vma);
1368 	struct xe_bo *bo = xe_vma_bo(vma);
1369 	int err;
1370 
1371 	XE_WARN_ON(!vm);
1372 
1373 	err = drm_exec_lock_obj(exec, xe_vm_obj(vm));
1374 	if (!err && bo && !bo->vm)
1375 		err = drm_exec_lock_obj(exec, &bo->ttm.base);
1376 
1377 	return err;
1378 }
1379 
1380 static void xe_vma_destroy_unlocked(struct xe_vma *vma)
1381 {
1382 	struct drm_exec exec;
1383 	int err;
1384 
1385 	drm_exec_init(&exec, 0, 0);
1386 	drm_exec_until_all_locked(&exec) {
1387 		err = xe_vm_lock_vma(&exec, vma);
1388 		drm_exec_retry_on_contention(&exec);
1389 		if (XE_WARN_ON(err))
1390 			break;
1391 	}
1392 
1393 	xe_vma_destroy(vma, NULL);
1394 
1395 	drm_exec_fini(&exec);
1396 }
1397 
1398 struct xe_vma *
1399 xe_vm_find_overlapping_vma(struct xe_vm *vm, u64 start, u64 range)
1400 {
1401 	struct drm_gpuva *gpuva;
1402 
1403 	lockdep_assert_held(&vm->lock);
1404 
1405 	if (xe_vm_is_closed_or_banned(vm))
1406 		return NULL;
1407 
1408 	xe_assert(vm->xe, start + range <= vm->size);
1409 
1410 	gpuva = drm_gpuva_find_first(&vm->gpuvm, start, range);
1411 
1412 	return gpuva ? gpuva_to_vma(gpuva) : NULL;
1413 }
1414 
1415 static int xe_vm_insert_vma(struct xe_vm *vm, struct xe_vma *vma)
1416 {
1417 	int err;
1418 
1419 	xe_assert(vm->xe, xe_vma_vm(vma) == vm);
1420 	lockdep_assert_held(&vm->lock);
1421 
1422 	mutex_lock(&vm->snap_mutex);
1423 	err = drm_gpuva_insert(&vm->gpuvm, &vma->gpuva);
1424 	mutex_unlock(&vm->snap_mutex);
1425 	XE_WARN_ON(err);	/* Shouldn't be possible */
1426 
1427 	return err;
1428 }
1429 
1430 static void xe_vm_remove_vma(struct xe_vm *vm, struct xe_vma *vma)
1431 {
1432 	xe_assert(vm->xe, xe_vma_vm(vma) == vm);
1433 	lockdep_assert_held(&vm->lock);
1434 
1435 	mutex_lock(&vm->snap_mutex);
1436 	drm_gpuva_remove(&vma->gpuva);
1437 	mutex_unlock(&vm->snap_mutex);
1438 	if (vm->usm.last_fault_vma == vma)
1439 		vm->usm.last_fault_vma = NULL;
1440 }
1441 
1442 static struct drm_gpuva_op *xe_vm_op_alloc(void)
1443 {
1444 	struct xe_vma_op *op;
1445 
1446 	op = kzalloc(sizeof(*op), GFP_KERNEL);
1447 
1448 	if (unlikely(!op))
1449 		return NULL;
1450 
1451 	return &op->base;
1452 }
1453 
1454 static void xe_vm_free(struct drm_gpuvm *gpuvm);
1455 
1456 static const struct drm_gpuvm_ops gpuvm_ops = {
1457 	.op_alloc = xe_vm_op_alloc,
1458 	.vm_bo_validate = xe_gpuvm_validate,
1459 	.vm_free = xe_vm_free,
1460 };
1461 
1462 static u64 pde_encode_pat_index(u16 pat_index)
1463 {
1464 	u64 pte = 0;
1465 
1466 	if (pat_index & BIT(0))
1467 		pte |= XE_PPGTT_PTE_PAT0;
1468 
1469 	if (pat_index & BIT(1))
1470 		pte |= XE_PPGTT_PTE_PAT1;
1471 
1472 	return pte;
1473 }
1474 
1475 static u64 pte_encode_pat_index(u16 pat_index, u32 pt_level)
1476 {
1477 	u64 pte = 0;
1478 
1479 	if (pat_index & BIT(0))
1480 		pte |= XE_PPGTT_PTE_PAT0;
1481 
1482 	if (pat_index & BIT(1))
1483 		pte |= XE_PPGTT_PTE_PAT1;
1484 
1485 	if (pat_index & BIT(2)) {
1486 		if (pt_level)
1487 			pte |= XE_PPGTT_PDE_PDPE_PAT2;
1488 		else
1489 			pte |= XE_PPGTT_PTE_PAT2;
1490 	}
1491 
1492 	if (pat_index & BIT(3))
1493 		pte |= XELPG_PPGTT_PTE_PAT3;
1494 
1495 	if (pat_index & (BIT(4)))
1496 		pte |= XE2_PPGTT_PTE_PAT4;
1497 
1498 	return pte;
1499 }
1500 
1501 static u64 pte_encode_ps(u32 pt_level)
1502 {
1503 	XE_WARN_ON(pt_level > MAX_HUGEPTE_LEVEL);
1504 
1505 	if (pt_level == 1)
1506 		return XE_PDE_PS_2M;
1507 	else if (pt_level == 2)
1508 		return XE_PDPE_PS_1G;
1509 
1510 	return 0;
1511 }
1512 
1513 static u64 xelp_pde_encode_bo(struct xe_bo *bo, u64 bo_offset,
1514 			      const u16 pat_index)
1515 {
1516 	u64 pde;
1517 
1518 	pde = xe_bo_addr(bo, bo_offset, XE_PAGE_SIZE);
1519 	pde |= XE_PAGE_PRESENT | XE_PAGE_RW;
1520 	pde |= pde_encode_pat_index(pat_index);
1521 
1522 	return pde;
1523 }
1524 
1525 static u64 xelp_pte_encode_bo(struct xe_bo *bo, u64 bo_offset,
1526 			      u16 pat_index, u32 pt_level)
1527 {
1528 	u64 pte;
1529 
1530 	pte = xe_bo_addr(bo, bo_offset, XE_PAGE_SIZE);
1531 	pte |= XE_PAGE_PRESENT | XE_PAGE_RW;
1532 	pte |= pte_encode_pat_index(pat_index, pt_level);
1533 	pte |= pte_encode_ps(pt_level);
1534 
1535 	if (xe_bo_is_vram(bo) || xe_bo_is_stolen_devmem(bo))
1536 		pte |= XE_PPGTT_PTE_DM;
1537 
1538 	return pte;
1539 }
1540 
1541 static u64 xelp_pte_encode_vma(u64 pte, struct xe_vma *vma,
1542 			       u16 pat_index, u32 pt_level)
1543 {
1544 	pte |= XE_PAGE_PRESENT;
1545 
1546 	if (likely(!xe_vma_read_only(vma)))
1547 		pte |= XE_PAGE_RW;
1548 
1549 	pte |= pte_encode_pat_index(pat_index, pt_level);
1550 	pte |= pte_encode_ps(pt_level);
1551 
1552 	if (unlikely(xe_vma_is_null(vma)))
1553 		pte |= XE_PTE_NULL;
1554 
1555 	return pte;
1556 }
1557 
1558 static u64 xelp_pte_encode_addr(struct xe_device *xe, u64 addr,
1559 				u16 pat_index,
1560 				u32 pt_level, bool devmem, u64 flags)
1561 {
1562 	u64 pte;
1563 
1564 	/* Avoid passing random bits directly as flags */
1565 	xe_assert(xe, !(flags & ~XE_PTE_PS64));
1566 
1567 	pte = addr;
1568 	pte |= XE_PAGE_PRESENT | XE_PAGE_RW;
1569 	pte |= pte_encode_pat_index(pat_index, pt_level);
1570 	pte |= pte_encode_ps(pt_level);
1571 
1572 	if (devmem)
1573 		pte |= XE_PPGTT_PTE_DM;
1574 
1575 	pte |= flags;
1576 
1577 	return pte;
1578 }
1579 
1580 static const struct xe_pt_ops xelp_pt_ops = {
1581 	.pte_encode_bo = xelp_pte_encode_bo,
1582 	.pte_encode_vma = xelp_pte_encode_vma,
1583 	.pte_encode_addr = xelp_pte_encode_addr,
1584 	.pde_encode_bo = xelp_pde_encode_bo,
1585 };
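
/*
 * Illustrative example (a sketch of the helpers above): encoding a 2M
 * device-memory PTE at @addr with a given PAT index combines the pieces as
 *
 *	pte = xelp_pte_encode_addr(xe, addr, pat_index, 1, true, 0);
 *	// addr | PRESENT | RW | PAT bits | XE_PDE_PS_2M | XE_PPGTT_PTE_DM
 */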
1586 
1587 static void vm_destroy_work_func(struct work_struct *w);
1588 
1589 /**
1590  * xe_vm_create_scratch() - Setup a scratch memory pagetable tree for the
1591  * given tile and vm.
1592  * @xe: xe device.
1593  * @tile: tile to set up for.
1594  * @vm: vm to set up for.
1595  *
1596  * Sets up a pagetable tree with one page-table per level and a single
1597  * leaf PTE. All pagetable entries point to the single page-table or,
1598  * for MAX_HUGEPTE_LEVEL, a NULL huge PTE returning 0 on read and
1599  * for MAX_HUGEPTE_LEVEL, a NULL huge PTE that returns 0 on reads and
1600  * turns writes into NOPs.
1601  * Return: 0 on success, negative error code on error.
1602  */
1603 static int xe_vm_create_scratch(struct xe_device *xe, struct xe_tile *tile,
1604 				struct xe_vm *vm)
1605 {
1606 	u8 id = tile->id;
1607 	int i;
1608 
1609 	for (i = MAX_HUGEPTE_LEVEL; i < vm->pt_root[id]->level; i++) {
1610 		vm->scratch_pt[id][i] = xe_pt_create(vm, tile, i);
1611 		if (IS_ERR(vm->scratch_pt[id][i]))
1612 			return PTR_ERR(vm->scratch_pt[id][i]);
1613 
1614 		xe_pt_populate_empty(tile, vm, vm->scratch_pt[id][i]);
1615 	}
1616 
1617 	return 0;
1618 }
1619 ALLOW_ERROR_INJECTION(xe_vm_create_scratch, ERRNO);
1620 
1621 static void xe_vm_free_scratch(struct xe_vm *vm)
1622 {
1623 	struct xe_tile *tile;
1624 	u8 id;
1625 
1626 	if (!xe_vm_has_scratch(vm))
1627 		return;
1628 
1629 	for_each_tile(tile, vm->xe, id) {
1630 		u32 i;
1631 
1632 		if (!vm->pt_root[id])
1633 			continue;
1634 
1635 		for (i = MAX_HUGEPTE_LEVEL; i < vm->pt_root[id]->level; ++i)
1636 			if (vm->scratch_pt[id][i])
1637 				xe_pt_destroy(vm->scratch_pt[id][i], vm->flags, NULL);
1638 	}
1639 }
1640 
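/**
 * xe_vm_create() - Create a VM
 * @xe: The xe device.
 * @flags: XE_VM_FLAG_* creation flags.
 *
 * Allocate the VM, its per-tile page-table roots and, unless this is a
 * migration VM, a bind exec queue per tile. Scratch page tables are set up
 * when requested via the flags.
 *
 * Return: Pointer to the new VM on success, ERR_PTR on failure.
 */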
1641 struct xe_vm *xe_vm_create(struct xe_device *xe, u32 flags)
1642 {
1643 	struct drm_gem_object *vm_resv_obj;
1644 	struct xe_vm *vm;
1645 	int err, number_tiles = 0;
1646 	struct xe_tile *tile;
1647 	u8 id;
1648 
1649 	/*
1650 	 * Since the GSCCS is not user-accessible, we don't expect a GSC VM to
1651 	 * ever be in faulting mode.
1652 	 */
1653 	xe_assert(xe, !((flags & XE_VM_FLAG_GSC) && (flags & XE_VM_FLAG_FAULT_MODE)));
1654 
1655 	vm = kzalloc(sizeof(*vm), GFP_KERNEL);
1656 	if (!vm)
1657 		return ERR_PTR(-ENOMEM);
1658 
1659 	vm->xe = xe;
1660 
1661 	vm->size = 1ull << xe->info.va_bits;
1662 
1663 	vm->flags = flags;
1664 
1665 	/**
1666 	 * GSC VMs are kernel-owned, only used for PXP ops and can sometimes be
1667 	 * manipulated under the PXP mutex. However, the PXP mutex can be taken
1668 	 * under a user-VM lock when the PXP session is started at exec_queue
1669 	 * creation time. Those are different VMs and therefore there is no risk
1670 	 * of deadlock, but we need to tell lockdep that this is the case or it
1671 	 * will print a warning.
1672 	 */
1673 	if (flags & XE_VM_FLAG_GSC) {
1674 		static struct lock_class_key gsc_vm_key;
1675 
1676 		__init_rwsem(&vm->lock, "gsc_vm", &gsc_vm_key);
1677 	} else {
1678 		init_rwsem(&vm->lock);
1679 	}
1680 	mutex_init(&vm->snap_mutex);
1681 
1682 	INIT_LIST_HEAD(&vm->rebind_list);
1683 
1684 	INIT_LIST_HEAD(&vm->userptr.repin_list);
1685 	INIT_LIST_HEAD(&vm->userptr.invalidated);
1686 	init_rwsem(&vm->userptr.notifier_lock);
1687 	spin_lock_init(&vm->userptr.invalidated_lock);
1688 
1689 	ttm_lru_bulk_move_init(&vm->lru_bulk_move);
1690 
1691 	INIT_WORK(&vm->destroy_work, vm_destroy_work_func);
1692 
1693 	INIT_LIST_HEAD(&vm->preempt.exec_queues);
1694 	vm->preempt.min_run_period_ms = 10;	/* FIXME: Wire up to uAPI */
1695 
1696 	for_each_tile(tile, xe, id)
1697 		xe_range_fence_tree_init(&vm->rftree[id]);
1698 
1699 	vm->pt_ops = &xelp_pt_ops;
1700 
1701 	/*
1702 	 * Long-running workloads are not protected by the scheduler references.
1703 	 * By design, run_job for long-running workloads returns NULL and the
1704 	 * scheduler drops all of its references, hence protecting the VM
1705 	 * in this case is necessary.
1706 	 */
1707 	if (flags & XE_VM_FLAG_LR_MODE)
1708 		xe_pm_runtime_get_noresume(xe);
1709 
1710 	vm_resv_obj = drm_gpuvm_resv_object_alloc(&xe->drm);
1711 	if (!vm_resv_obj) {
1712 		err = -ENOMEM;
1713 		goto err_no_resv;
1714 	}
1715 
1716 	drm_gpuvm_init(&vm->gpuvm, "Xe VM", DRM_GPUVM_RESV_PROTECTED, &xe->drm,
1717 		       vm_resv_obj, 0, vm->size, 0, 0, &gpuvm_ops);
1718 
1719 	drm_gem_object_put(vm_resv_obj);
1720 
1721 	err = xe_vm_lock(vm, true);
1722 	if (err)
1723 		goto err_close;
1724 
1725 	if (IS_DGFX(xe) && xe->info.vram_flags & XE_VRAM_FLAGS_NEED64K)
1726 		vm->flags |= XE_VM_FLAG_64K;
1727 
1728 	for_each_tile(tile, xe, id) {
1729 		if (flags & XE_VM_FLAG_MIGRATION &&
1730 		    tile->id != XE_VM_FLAG_TILE_ID(flags))
1731 			continue;
1732 
1733 		vm->pt_root[id] = xe_pt_create(vm, tile, xe->info.vm_max_level);
1734 		if (IS_ERR(vm->pt_root[id])) {
1735 			err = PTR_ERR(vm->pt_root[id]);
1736 			vm->pt_root[id] = NULL;
1737 			goto err_unlock_close;
1738 		}
1739 	}
1740 
1741 	if (xe_vm_has_scratch(vm)) {
1742 		for_each_tile(tile, xe, id) {
1743 			if (!vm->pt_root[id])
1744 				continue;
1745 
1746 			err = xe_vm_create_scratch(xe, tile, vm);
1747 			if (err)
1748 				goto err_unlock_close;
1749 		}
1750 		vm->batch_invalidate_tlb = true;
1751 	}
1752 
1753 	if (vm->flags & XE_VM_FLAG_LR_MODE) {
1754 		INIT_WORK(&vm->preempt.rebind_work, preempt_rebind_work_func);
1755 		vm->batch_invalidate_tlb = false;
1756 	}
1757 
1758 	/* Fill pt_root after allocating scratch tables */
1759 	for_each_tile(tile, xe, id) {
1760 		if (!vm->pt_root[id])
1761 			continue;
1762 
1763 		xe_pt_populate_empty(tile, vm, vm->pt_root[id]);
1764 	}
1765 	xe_vm_unlock(vm);
1766 
1767 	/* Kernel migration VM shouldn't have a circular loop. */
1768 	if (!(flags & XE_VM_FLAG_MIGRATION)) {
1769 		for_each_tile(tile, xe, id) {
1770 			struct xe_exec_queue *q;
1771 			u32 create_flags = EXEC_QUEUE_FLAG_VM;
1772 
1773 			if (!vm->pt_root[id])
1774 				continue;
1775 
1776 			q = xe_exec_queue_create_bind(xe, tile, create_flags, 0);
1777 			if (IS_ERR(q)) {
1778 				err = PTR_ERR(q);
1779 				goto err_close;
1780 			}
1781 			vm->q[id] = q;
1782 			number_tiles++;
1783 		}
1784 	}
1785 
1786 	if (flags & XE_VM_FLAG_FAULT_MODE) {
1787 		err = xe_svm_init(vm);
1788 		if (err)
1789 			goto err_close;
1790 	}
1791 
1792 	if (number_tiles > 1)
1793 		vm->composite_fence_ctx = dma_fence_context_alloc(1);
1794 
1795 	trace_xe_vm_create(vm);
1796 
1797 	return vm;
1798 
1799 err_unlock_close:
1800 	xe_vm_unlock(vm);
1801 err_close:
1802 	xe_vm_close_and_put(vm);
1803 	return ERR_PTR(err);
1804 
1805 err_no_resv:
1806 	mutex_destroy(&vm->snap_mutex);
1807 	for_each_tile(tile, xe, id)
1808 		xe_range_fence_tree_fini(&vm->rftree[id]);
1809 	ttm_lru_bulk_move_fini(&xe->ttm, &vm->lru_bulk_move);
1810 	kfree(vm);
1811 	if (flags & XE_VM_FLAG_LR_MODE)
1812 		xe_pm_runtime_put(xe);
1813 	return ERR_PTR(err);
1814 }
1815 
1816 static void xe_vm_close(struct xe_vm *vm)
1817 {
1818 	struct xe_device *xe = vm->xe;
1819 	bool bound;
1820 	int idx;
1821 
1822 	bound = drm_dev_enter(&xe->drm, &idx);
1823 
1824 	down_write(&vm->lock);
1825 	if (xe_vm_in_fault_mode(vm))
1826 		xe_svm_notifier_lock(vm);
1827 
1828 	vm->size = 0;
1829 
1830 	if (!((vm->flags & XE_VM_FLAG_MIGRATION))) {
1831 		struct xe_tile *tile;
1832 		struct xe_gt *gt;
1833 		u8 id;
1834 
1835 		/* Wait for pending binds */
1836 		dma_resv_wait_timeout(xe_vm_resv(vm),
1837 				      DMA_RESV_USAGE_BOOKKEEP,
1838 				      false, MAX_SCHEDULE_TIMEOUT);
1839 
1840 		if (bound) {
1841 			for_each_tile(tile, xe, id)
1842 				if (vm->pt_root[id])
1843 					xe_pt_clear(xe, vm->pt_root[id]);
1844 
1845 			for_each_gt(gt, xe, id)
1846 				xe_gt_tlb_invalidation_vm(gt, vm);
1847 		}
1848 	}
1849 
1850 	if (xe_vm_in_fault_mode(vm))
1851 		xe_svm_notifier_unlock(vm);
1852 	up_write(&vm->lock);
1853 
1854 	if (bound)
1855 		drm_dev_exit(idx);
1856 }
1857 
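/**
 * xe_vm_close_and_put() - Close a VM and drop the creation reference
 * @vm: The VM.
 *
 * Close the VM, flush the rebind worker, destroy all vmas and page tables,
 * release the ASID (if any) and finally drop the reference taken at
 * creation time.
 */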
1858 void xe_vm_close_and_put(struct xe_vm *vm)
1859 {
1860 	LIST_HEAD(contested);
1861 	struct xe_device *xe = vm->xe;
1862 	struct xe_tile *tile;
1863 	struct xe_vma *vma, *next_vma;
1864 	struct drm_gpuva *gpuva, *next;
1865 	u8 id;
1866 
1867 	xe_assert(xe, !vm->preempt.num_exec_queues);
1868 
1869 	xe_vm_close(vm);
1870 	if (xe_vm_in_preempt_fence_mode(vm))
1871 		flush_work(&vm->preempt.rebind_work);
1872 	if (xe_vm_in_fault_mode(vm))
1873 		xe_svm_close(vm);
1874 
1875 	down_write(&vm->lock);
1876 	for_each_tile(tile, xe, id) {
1877 		if (vm->q[id])
1878 			xe_exec_queue_last_fence_put(vm->q[id], vm);
1879 	}
1880 	up_write(&vm->lock);
1881 
1882 	for_each_tile(tile, xe, id) {
1883 		if (vm->q[id]) {
1884 			xe_exec_queue_kill(vm->q[id]);
1885 			xe_exec_queue_put(vm->q[id]);
1886 			vm->q[id] = NULL;
1887 		}
1888 	}
1889 
1890 	down_write(&vm->lock);
1891 	xe_vm_lock(vm, false);
1892 	drm_gpuvm_for_each_va_safe(gpuva, next, &vm->gpuvm) {
1893 		vma = gpuva_to_vma(gpuva);
1894 
1895 		if (xe_vma_has_no_bo(vma)) {
1896 			down_read(&vm->userptr.notifier_lock);
1897 			vma->gpuva.flags |= XE_VMA_DESTROYED;
1898 			up_read(&vm->userptr.notifier_lock);
1899 		}
1900 
1901 		xe_vm_remove_vma(vm, vma);
1902 
1903 		/* easy case, remove from VMA? */
1904 		if (xe_vma_has_no_bo(vma) || xe_vma_bo(vma)->vm) {
1905 			list_del_init(&vma->combined_links.rebind);
1906 			xe_vma_destroy(vma, NULL);
1907 			continue;
1908 		}
1909 
1910 		list_move_tail(&vma->combined_links.destroy, &contested);
1911 		vma->gpuva.flags |= XE_VMA_DESTROYED;
1912 	}
1913 
1914 	/*
1915 	 * All vm operations will add shared fences to resv.
1916 	 * The only exception is eviction for a shared object,
1917 	 * but even so, the unbind when evicted would still
1918 	 * install a fence to resv. Hence it's safe to
1919 	 * destroy the pagetables immediately.
1920 	 */
1921 	xe_vm_free_scratch(vm);
1922 
1923 	for_each_tile(tile, xe, id) {
1924 		if (vm->pt_root[id]) {
1925 			xe_pt_destroy(vm->pt_root[id], vm->flags, NULL);
1926 			vm->pt_root[id] = NULL;
1927 		}
1928 	}
1929 	xe_vm_unlock(vm);
1930 
1931 	/*
1932 	 * VM is now dead, cannot re-add nodes to vm->vmas if it's NULL
1933 	 * Since we hold a refcount to the bo, we can remove and free
1934 	 * the members safely without locking.
1935 	 */
1936 	list_for_each_entry_safe(vma, next_vma, &contested,
1937 				 combined_links.destroy) {
1938 		list_del_init(&vma->combined_links.destroy);
1939 		xe_vma_destroy_unlocked(vma);
1940 	}
1941 
1942 	if (xe_vm_in_fault_mode(vm))
1943 		xe_svm_fini(vm);
1944 
1945 	up_write(&vm->lock);
1946 
1947 	down_write(&xe->usm.lock);
1948 	if (vm->usm.asid) {
1949 		void *lookup;
1950 
1951 		xe_assert(xe, xe->info.has_asid);
1952 		xe_assert(xe, !(vm->flags & XE_VM_FLAG_MIGRATION));
1953 
1954 		lookup = xa_erase(&xe->usm.asid_to_vm, vm->usm.asid);
1955 		xe_assert(xe, lookup == vm);
1956 	}
1957 	up_write(&xe->usm.lock);
1958 
1959 	for_each_tile(tile, xe, id)
1960 		xe_range_fence_tree_fini(&vm->rftree[id]);
1961 
1962 	xe_vm_put(vm);
1963 }
1964 
1965 static void vm_destroy_work_func(struct work_struct *w)
1966 {
1967 	struct xe_vm *vm =
1968 		container_of(w, struct xe_vm, destroy_work);
1969 	struct xe_device *xe = vm->xe;
1970 	struct xe_tile *tile;
1971 	u8 id;
1972 
1973 	/* xe_vm_close_and_put was not called? */
1974 	xe_assert(xe, !vm->size);
1975 
1976 	if (xe_vm_in_preempt_fence_mode(vm))
1977 		flush_work(&vm->preempt.rebind_work);
1978 
1979 	mutex_destroy(&vm->snap_mutex);
1980 
1981 	if (vm->flags & XE_VM_FLAG_LR_MODE)
1982 		xe_pm_runtime_put(xe);
1983 
1984 	for_each_tile(tile, xe, id)
1985 		XE_WARN_ON(vm->pt_root[id]);
1986 
1987 	trace_xe_vm_free(vm);
1988 
1989 	ttm_lru_bulk_move_fini(&xe->ttm, &vm->lru_bulk_move);
1990 
1991 	if (vm->xef)
1992 		xe_file_put(vm->xef);
1993 
1994 	kfree(vm);
1995 }
1996 
1997 static void xe_vm_free(struct drm_gpuvm *gpuvm)
1998 {
1999 	struct xe_vm *vm = container_of(gpuvm, struct xe_vm, gpuvm);
2000 
2001 	/* To destroy the VM we need to be able to sleep */
2002 	queue_work(system_unbound_wq, &vm->destroy_work);
2003 }
2004 
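/**
 * xe_vm_lookup() - Look up a VM by its user-visible id
 * @xef: the xe_file that owns the VM id namespace
 * @id: the VM id to look up
 *
 * Takes a reference on the VM if found; the caller is expected to drop it
 * again with xe_vm_put() when done.
 *
 * Return: the referenced xe_vm, or NULL if no VM with @id exists.
 */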
2005 struct xe_vm *xe_vm_lookup(struct xe_file *xef, u32 id)
2006 {
2007 	struct xe_vm *vm;
2008 
2009 	mutex_lock(&xef->vm.lock);
2010 	vm = xa_load(&xef->vm.xa, id);
2011 	if (vm)
2012 		xe_vm_get(vm);
2013 	mutex_unlock(&xef->vm.lock);
2014 
2015 	return vm;
2016 }
2017 
2018 u64 xe_vm_pdp4_descriptor(struct xe_vm *vm, struct xe_tile *tile)
2019 {
2020 	return vm->pt_ops->pde_encode_bo(vm->pt_root[tile->id]->bo, 0,
2021 					 tile_to_xe(tile)->pat.idx[XE_CACHE_WB]);
2022 }
2023 
2024 static struct xe_exec_queue *
2025 to_wait_exec_queue(struct xe_vm *vm, struct xe_exec_queue *q)
2026 {
2027 	return q ? q : vm->q[0];
2028 }
2029 
2030 static struct xe_user_fence *
2031 find_ufence_get(struct xe_sync_entry *syncs, u32 num_syncs)
2032 {
2033 	unsigned int i;
2034 
2035 	for (i = 0; i < num_syncs; i++) {
2036 		struct xe_sync_entry *e = &syncs[i];
2037 
2038 		if (xe_sync_is_ufence(e))
2039 			return xe_sync_ufence_get(e);
2040 	}
2041 
2042 	return NULL;
2043 }
2044 
2045 #define ALL_DRM_XE_VM_CREATE_FLAGS (DRM_XE_VM_CREATE_FLAG_SCRATCH_PAGE | \
2046 				    DRM_XE_VM_CREATE_FLAG_LR_MODE | \
2047 				    DRM_XE_VM_CREATE_FLAG_FAULT_MODE)
2048 
2049 int xe_vm_create_ioctl(struct drm_device *dev, void *data,
2050 		       struct drm_file *file)
2051 {
2052 	struct xe_device *xe = to_xe_device(dev);
2053 	struct xe_file *xef = to_xe_file(file);
2054 	struct drm_xe_vm_create *args = data;
2055 	struct xe_tile *tile;
2056 	struct xe_vm *vm;
2057 	u32 id, asid;
2058 	int err;
2059 	u32 flags = 0;
2060 
2061 	if (XE_IOCTL_DBG(xe, args->extensions))
2062 		return -EINVAL;
2063 
2064 	if (XE_WA(xe_root_mmio_gt(xe), 14016763929))
2065 		args->flags |= DRM_XE_VM_CREATE_FLAG_SCRATCH_PAGE;
2066 
2067 	if (XE_IOCTL_DBG(xe, args->flags & DRM_XE_VM_CREATE_FLAG_FAULT_MODE &&
2068 			 !xe->info.has_usm))
2069 		return -EINVAL;
2070 
2071 	if (XE_IOCTL_DBG(xe, args->reserved[0] || args->reserved[1]))
2072 		return -EINVAL;
2073 
2074 	if (XE_IOCTL_DBG(xe, args->flags & ~ALL_DRM_XE_VM_CREATE_FLAGS))
2075 		return -EINVAL;
2076 
2077 	if (XE_IOCTL_DBG(xe, args->flags & DRM_XE_VM_CREATE_FLAG_SCRATCH_PAGE &&
2078 			 args->flags & DRM_XE_VM_CREATE_FLAG_FAULT_MODE &&
2079 			 !xe->info.needs_scratch))
2080 		return -EINVAL;
2081 
2082 	if (XE_IOCTL_DBG(xe, !(args->flags & DRM_XE_VM_CREATE_FLAG_LR_MODE) &&
2083 			 args->flags & DRM_XE_VM_CREATE_FLAG_FAULT_MODE))
2084 		return -EINVAL;
2085 
2086 	if (args->flags & DRM_XE_VM_CREATE_FLAG_SCRATCH_PAGE)
2087 		flags |= XE_VM_FLAG_SCRATCH_PAGE;
2088 	if (args->flags & DRM_XE_VM_CREATE_FLAG_LR_MODE)
2089 		flags |= XE_VM_FLAG_LR_MODE;
2090 	if (args->flags & DRM_XE_VM_CREATE_FLAG_FAULT_MODE)
2091 		flags |= XE_VM_FLAG_FAULT_MODE;
2092 
2093 	vm = xe_vm_create(xe, flags);
2094 	if (IS_ERR(vm))
2095 		return PTR_ERR(vm);
2096 
2097 	if (xe->info.has_asid) {
2098 		down_write(&xe->usm.lock);
2099 		err = xa_alloc_cyclic(&xe->usm.asid_to_vm, &asid, vm,
2100 				      XA_LIMIT(1, XE_MAX_ASID - 1),
2101 				      &xe->usm.next_asid, GFP_KERNEL);
2102 		up_write(&xe->usm.lock);
2103 		if (err < 0)
2104 			goto err_close_and_put;
2105 
2106 		vm->usm.asid = asid;
2107 	}
2108 
2109 	vm->xef = xe_file_get(xef);
2110 
2111 	/* Record the VM pagetable BO memory against the owning client */
2112 	for_each_tile(tile, xe, id)
2113 		if (vm->pt_root[id])
2114 			xe_drm_client_add_bo(vm->xef->client, vm->pt_root[id]->bo);
2115 
2116 #if IS_ENABLED(CONFIG_DRM_XE_DEBUG_MEM)
2117 	/* Warning: Security issue - never enable by default */
2118 	args->reserved[0] = xe_bo_main_addr(vm->pt_root[0]->bo, XE_PAGE_SIZE);
2119 #endif
2120 
2121 	/* user id alloc must always be last in ioctl to prevent UAF */
2122 	err = xa_alloc(&xef->vm.xa, &id, vm, xa_limit_32b, GFP_KERNEL);
2123 	if (err)
2124 		goto err_close_and_put;
2125 
2126 	args->vm_id = id;
2127 
2128 	return 0;
2129 
2130 err_close_and_put:
2131 	xe_vm_close_and_put(vm);
2132 
2133 	return err;
2134 }
2135 
2136 int xe_vm_destroy_ioctl(struct drm_device *dev, void *data,
2137 			struct drm_file *file)
2138 {
2139 	struct xe_device *xe = to_xe_device(dev);
2140 	struct xe_file *xef = to_xe_file(file);
2141 	struct drm_xe_vm_destroy *args = data;
2142 	struct xe_vm *vm;
2143 	int err = 0;
2144 
2145 	if (XE_IOCTL_DBG(xe, args->pad) ||
2146 	    XE_IOCTL_DBG(xe, args->reserved[0] || args->reserved[1]))
2147 		return -EINVAL;
2148 
2149 	mutex_lock(&xef->vm.lock);
2150 	vm = xa_load(&xef->vm.xa, args->vm_id);
2151 	if (XE_IOCTL_DBG(xe, !vm))
2152 		err = -ENOENT;
2153 	else if (XE_IOCTL_DBG(xe, vm->preempt.num_exec_queues))
2154 		err = -EBUSY;
2155 	else
2156 		xa_erase(&xef->vm.xa, args->vm_id);
2157 	mutex_unlock(&xef->vm.lock);
2158 
2159 	if (!err)
2160 		xe_vm_close_and_put(vm);
2161 
2162 	return err;
2163 }
2164 
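/*
 * Check whether the 4K page starting at @page_addr overlaps the VMA's
 * address range; used for the fast last-fault-VMA lookup below.
 */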
2165 static bool vma_matches(struct xe_vma *vma, u64 page_addr)
2166 {
2167 	if (page_addr > xe_vma_end(vma) - 1 ||
2168 	    page_addr + SZ_4K - 1 < xe_vma_start(vma))
2169 		return false;
2170 
2171 	return true;
2172 }
2173 
2174 /**
2175  * xe_vm_find_vma_by_addr() - Find a VMA by its address
2176  *
2177  * @vm: the xe_vm the vma belongs to
2178  * @page_addr: address to look up
 *
 * Return: the xe_vma covering @page_addr, or NULL if no VMA is found.
2179  */
2180 struct xe_vma *xe_vm_find_vma_by_addr(struct xe_vm *vm, u64 page_addr)
2181 {
2182 	struct xe_vma *vma = NULL;
2183 
2184 	if (vm->usm.last_fault_vma) {   /* Fast lookup */
2185 		if (vma_matches(vm->usm.last_fault_vma, page_addr))
2186 			vma = vm->usm.last_fault_vma;
2187 	}
2188 	if (!vma)
2189 		vma = xe_vm_find_overlapping_vma(vm, page_addr, SZ_4K);
2190 
2191 	return vma;
2192 }
2193 
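/*
 * Maps the uAPI prefetch memory-region instance to a TTM placement:
 * 0 -> system/TT, 1 -> VRAM0, 2 -> VRAM1.
 */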
2194 static const u32 region_to_mem_type[] = {
2195 	XE_PL_TT,
2196 	XE_PL_VRAM0,
2197 	XE_PL_VRAM1,
2198 };
2199 
2200 static void prep_vma_destroy(struct xe_vm *vm, struct xe_vma *vma,
2201 			     bool post_commit)
2202 {
2203 	down_read(&vm->userptr.notifier_lock);
2204 	vma->gpuva.flags |= XE_VMA_DESTROYED;
2205 	up_read(&vm->userptr.notifier_lock);
2206 	if (post_commit)
2207 		xe_vm_remove_vma(vm, vma);
2208 }
2209 
2210 #undef ULL
2211 #define ULL	unsigned long long
2212 
2213 #if IS_ENABLED(CONFIG_DRM_XE_DEBUG_VM)
2214 static void print_op(struct xe_device *xe, struct drm_gpuva_op *op)
2215 {
2216 	struct xe_vma *vma;
2217 
2218 	switch (op->op) {
2219 	case DRM_GPUVA_OP_MAP:
2220 		vm_dbg(&xe->drm, "MAP: addr=0x%016llx, range=0x%016llx",
2221 		       (ULL)op->map.va.addr, (ULL)op->map.va.range);
2222 		break;
2223 	case DRM_GPUVA_OP_REMAP:
2224 		vma = gpuva_to_vma(op->remap.unmap->va);
2225 		vm_dbg(&xe->drm, "REMAP:UNMAP: addr=0x%016llx, range=0x%016llx, keep=%d",
2226 		       (ULL)xe_vma_start(vma), (ULL)xe_vma_size(vma),
2227 		       op->remap.unmap->keep ? 1 : 0);
2228 		if (op->remap.prev)
2229 			vm_dbg(&xe->drm,
2230 			       "REMAP:PREV: addr=0x%016llx, range=0x%016llx",
2231 			       (ULL)op->remap.prev->va.addr,
2232 			       (ULL)op->remap.prev->va.range);
2233 		if (op->remap.next)
2234 			vm_dbg(&xe->drm,
2235 			       "REMAP:NEXT: addr=0x%016llx, range=0x%016llx",
2236 			       (ULL)op->remap.next->va.addr,
2237 			       (ULL)op->remap.next->va.range);
2238 		break;
2239 	case DRM_GPUVA_OP_UNMAP:
2240 		vma = gpuva_to_vma(op->unmap.va);
2241 		vm_dbg(&xe->drm, "UNMAP: addr=0x%016llx, range=0x%016llx, keep=%d",
2242 		       (ULL)xe_vma_start(vma), (ULL)xe_vma_size(vma),
2243 		       op->unmap.keep ? 1 : 0);
2244 		break;
2245 	case DRM_GPUVA_OP_PREFETCH:
2246 		vma = gpuva_to_vma(op->prefetch.va);
2247 		vm_dbg(&xe->drm, "PREFETCH: addr=0x%016llx, range=0x%016llx",
2248 		       (ULL)xe_vma_start(vma), (ULL)xe_vma_size(vma));
2249 		break;
2250 	default:
2251 		drm_warn(&xe->drm, "NOT POSSIBLE");
2252 	}
2253 }
2254 #else
2255 static void print_op(struct xe_device *xe, struct drm_gpuva_op *op)
2256 {
2257 }
2258 #endif
2259 
2260 static bool __xe_vm_needs_clear_scratch_pages(struct xe_vm *vm, u32 bind_flags)
2261 {
2262 	if (!xe_vm_in_fault_mode(vm))
2263 		return false;
2264 
2265 	if (!xe_vm_has_scratch(vm))
2266 		return false;
2267 
2268 	if (bind_flags & DRM_XE_VM_BIND_FLAG_IMMEDIATE)
2269 		return false;
2270 
2271 	return true;
2272 }
2273 
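/*
 * Release the SVM prefetch state attached to each op; used when unwinding a
 * failed operations-list creation.
 */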
2274 static void xe_svm_prefetch_gpuva_ops_fini(struct drm_gpuva_ops *ops)
2275 {
2276 	struct drm_gpuva_op *__op;
2277 
2278 	drm_gpuva_for_each_op(__op, ops) {
2279 		struct xe_vma_op *op = gpuva_op_to_vma_op(__op);
2280 
2281 		xe_vma_svm_prefetch_op_fini(op);
2282 	}
2283 }
2284 
2285 /*
2286  * Create the operations list from the IOCTL arguments and set up operation
2287  * fields so the parse and commit steps are decoupled from them. This step can fail.
2288  */
2289 static struct drm_gpuva_ops *
2290 vm_bind_ioctl_ops_create(struct xe_vm *vm, struct xe_vma_ops *vops,
2291 			 struct xe_bo *bo, u64 bo_offset_or_userptr,
2292 			 u64 addr, u64 range,
2293 			 u32 operation, u32 flags,
2294 			 u32 prefetch_region, u16 pat_index)
2295 {
2296 	struct drm_gem_object *obj = bo ? &bo->ttm.base : NULL;
2297 	struct drm_gpuva_ops *ops;
2298 	struct drm_gpuva_op *__op;
2299 	struct drm_gpuvm_bo *vm_bo;
2300 	u64 range_end = addr + range;
2301 	int err;
2302 
2303 	lockdep_assert_held_write(&vm->lock);
2304 
2305 	vm_dbg(&vm->xe->drm,
2306 	       "op=%d, addr=0x%016llx, range=0x%016llx, bo_offset_or_userptr=0x%016llx",
2307 	       operation, (ULL)addr, (ULL)range,
2308 	       (ULL)bo_offset_or_userptr);
2309 
2310 	switch (operation) {
2311 	case DRM_XE_VM_BIND_OP_MAP:
2312 	case DRM_XE_VM_BIND_OP_MAP_USERPTR:
2313 		ops = drm_gpuvm_sm_map_ops_create(&vm->gpuvm, addr, range,
2314 						  obj, bo_offset_or_userptr);
2315 		break;
2316 	case DRM_XE_VM_BIND_OP_UNMAP:
2317 		ops = drm_gpuvm_sm_unmap_ops_create(&vm->gpuvm, addr, range);
2318 		break;
2319 	case DRM_XE_VM_BIND_OP_PREFETCH:
2320 		ops = drm_gpuvm_prefetch_ops_create(&vm->gpuvm, addr, range);
2321 		break;
2322 	case DRM_XE_VM_BIND_OP_UNMAP_ALL:
2323 		xe_assert(vm->xe, bo);
2324 
2325 		err = xe_bo_lock(bo, true);
2326 		if (err)
2327 			return ERR_PTR(err);
2328 
2329 		vm_bo = drm_gpuvm_bo_obtain(&vm->gpuvm, obj);
2330 		if (IS_ERR(vm_bo)) {
2331 			xe_bo_unlock(bo);
2332 			return ERR_CAST(vm_bo);
2333 		}
2334 
2335 		ops = drm_gpuvm_bo_unmap_ops_create(vm_bo);
2336 		drm_gpuvm_bo_put(vm_bo);
2337 		xe_bo_unlock(bo);
2338 		break;
2339 	default:
2340 		drm_warn(&vm->xe->drm, "NOT POSSIBLE");
2341 		ops = ERR_PTR(-EINVAL);
2342 	}
2343 	if (IS_ERR(ops))
2344 		return ops;
2345 
2346 	drm_gpuva_for_each_op(__op, ops) {
2347 		struct xe_vma_op *op = gpuva_op_to_vma_op(__op);
2348 
2349 		if (__op->op == DRM_GPUVA_OP_MAP) {
2350 			op->map.immediate =
2351 				flags & DRM_XE_VM_BIND_FLAG_IMMEDIATE;
2352 			op->map.read_only =
2353 				flags & DRM_XE_VM_BIND_FLAG_READONLY;
2354 			op->map.is_null = flags & DRM_XE_VM_BIND_FLAG_NULL;
2355 			op->map.is_cpu_addr_mirror = flags &
2356 				DRM_XE_VM_BIND_FLAG_CPU_ADDR_MIRROR;
2357 			op->map.dumpable = flags & DRM_XE_VM_BIND_FLAG_DUMPABLE;
2358 			op->map.pat_index = pat_index;
2359 			op->map.invalidate_on_bind =
2360 				__xe_vm_needs_clear_scratch_pages(vm, flags);
2361 		} else if (__op->op == DRM_GPUVA_OP_PREFETCH) {
2362 			struct xe_vma *vma = gpuva_to_vma(op->base.prefetch.va);
2363 			struct xe_svm_range *svm_range;
2364 			struct drm_gpusvm_ctx ctx = {};
2365 			struct xe_tile *tile;
2366 			u8 id, tile_mask = 0;
2367 			u32 i;
2368 
2369 			if (!xe_vma_is_cpu_addr_mirror(vma)) {
2370 				op->prefetch.region = prefetch_region;
2371 				break;
2372 			}
2373 
2374 			ctx.read_only = xe_vma_read_only(vma);
2375 			ctx.devmem_possible = IS_DGFX(vm->xe) &&
2376 					      IS_ENABLED(CONFIG_DRM_XE_DEVMEM_MIRROR);
2377 
2378 			for_each_tile(tile, vm->xe, id)
2379 				tile_mask |= 0x1 << id;
2380 
2381 			xa_init_flags(&op->prefetch_range.range, XA_FLAGS_ALLOC);
2382 			op->prefetch_range.region = prefetch_region;
2383 			op->prefetch_range.ranges_count = 0;
2384 alloc_next_range:
2385 			svm_range = xe_svm_range_find_or_insert(vm, addr, vma, &ctx);
2386 
2387 			if (PTR_ERR(svm_range) == -ENOENT) {
2388 				u64 ret = xe_svm_find_vma_start(vm, addr, range_end, vma);
2389 
2390 				addr = ret == ULONG_MAX ? 0 : ret;
2391 				if (addr)
2392 					goto alloc_next_range;
2393 				else
2394 					goto print_op_label;
2395 			}
2396 
2397 			if (IS_ERR(svm_range)) {
2398 				err = PTR_ERR(svm_range);
2399 				goto unwind_prefetch_ops;
2400 			}
2401 
2402 			if (xe_svm_range_validate(vm, svm_range, tile_mask, !!prefetch_region)) {
2403 				xe_svm_range_debug(svm_range, "PREFETCH - RANGE IS VALID");
2404 				goto check_next_range;
2405 			}
2406 
2407 			err = xa_alloc(&op->prefetch_range.range,
2408 				       &i, svm_range, xa_limit_32b,
2409 				       GFP_KERNEL);
2410 
2411 			if (err)
2412 				goto unwind_prefetch_ops;
2413 
2414 			op->prefetch_range.ranges_count++;
2415 			vops->flags |= XE_VMA_OPS_FLAG_HAS_SVM_PREFETCH;
2416 			xe_svm_range_debug(svm_range, "PREFETCH - RANGE CREATED");
2417 check_next_range:
2418 			if (range_end > xe_svm_range_end(svm_range) &&
2419 			    xe_svm_range_end(svm_range) < xe_vma_end(vma)) {
2420 				addr = xe_svm_range_end(svm_range);
2421 				goto alloc_next_range;
2422 			}
2423 		}
2424 print_op_label:
2425 		print_op(vm->xe, __op);
2426 	}
2427 
2428 	return ops;
2429 
2430 unwind_prefetch_ops:
2431 	xe_svm_prefetch_gpuva_ops_fini(ops);
2432 	drm_gpuva_ops_free(&vm->gpuvm, ops);
2433 	return ERR_PTR(err);
2434 }
2435 
2436 ALLOW_ERROR_INJECTION(vm_bind_ioctl_ops_create, ERRNO);
2437 
2438 static struct xe_vma *new_vma(struct xe_vm *vm, struct drm_gpuva_op_map *op,
2439 			      u16 pat_index, unsigned int flags)
2440 {
2441 	struct xe_bo *bo = op->gem.obj ? gem_to_xe_bo(op->gem.obj) : NULL;
2442 	struct drm_exec exec;
2443 	struct xe_vma *vma;
2444 	int err = 0;
2445 
2446 	lockdep_assert_held_write(&vm->lock);
2447 
2448 	if (bo) {
2449 		drm_exec_init(&exec, DRM_EXEC_INTERRUPTIBLE_WAIT, 0);
2450 		drm_exec_until_all_locked(&exec) {
2451 			err = 0;
2452 			if (!bo->vm) {
2453 				err = drm_exec_lock_obj(&exec, xe_vm_obj(vm));
2454 				drm_exec_retry_on_contention(&exec);
2455 			}
2456 			if (!err) {
2457 				err = drm_exec_lock_obj(&exec, &bo->ttm.base);
2458 				drm_exec_retry_on_contention(&exec);
2459 			}
2460 			if (err) {
2461 				drm_exec_fini(&exec);
2462 				return ERR_PTR(err);
2463 			}
2464 		}
2465 	}
2466 	vma = xe_vma_create(vm, bo, op->gem.offset,
2467 			    op->va.addr, op->va.addr +
2468 			    op->va.range - 1, pat_index, flags);
2469 	if (IS_ERR(vma))
2470 		goto err_unlock;
2471 
2472 	if (xe_vma_is_userptr(vma))
2473 		err = xe_vma_userptr_pin_pages(to_userptr_vma(vma));
2474 	else if (!xe_vma_has_no_bo(vma) && !bo->vm)
2475 		err = add_preempt_fences(vm, bo);
2476 
2477 err_unlock:
2478 	if (bo)
2479 		drm_exec_fini(&exec);
2480 
2481 	if (err) {
2482 		prep_vma_destroy(vm, vma, false);
2483 		xe_vma_destroy_unlocked(vma);
2484 		vma = ERR_PTR(err);
2485 	}
2486 
2487 	return vma;
2488 }
2489 
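/*
 * Largest page-table entry size the VMA has been marked as using via the
 * XE_VMA_PTE_* flags; defaults to 1G when no flag has been set yet.
 */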
2490 static u64 xe_vma_max_pte_size(struct xe_vma *vma)
2491 {
2492 	if (vma->gpuva.flags & XE_VMA_PTE_1G)
2493 		return SZ_1G;
2494 	else if (vma->gpuva.flags & (XE_VMA_PTE_2M | XE_VMA_PTE_COMPACT))
2495 		return SZ_2M;
2496 	else if (vma->gpuva.flags & XE_VMA_PTE_64K)
2497 		return SZ_64K;
2498 	else if (vma->gpuva.flags & XE_VMA_PTE_4K)
2499 		return SZ_4K;
2500 
2501 	return SZ_1G;	/* Uninitialized, use max size */
2502 }
2503 
2504 static void xe_vma_set_pte_size(struct xe_vma *vma, u64 size)
2505 {
2506 	switch (size) {
2507 	case SZ_1G:
2508 		vma->gpuva.flags |= XE_VMA_PTE_1G;
2509 		break;
2510 	case SZ_2M:
2511 		vma->gpuva.flags |= XE_VMA_PTE_2M;
2512 		break;
2513 	case SZ_64K:
2514 		vma->gpuva.flags |= XE_VMA_PTE_64K;
2515 		break;
2516 	case SZ_4K:
2517 		vma->gpuva.flags |= XE_VMA_PTE_4K;
2518 		break;
2519 	}
2520 }
2521 
2522 static int xe_vma_op_commit(struct xe_vm *vm, struct xe_vma_op *op)
2523 {
2524 	int err = 0;
2525 
2526 	lockdep_assert_held_write(&vm->lock);
2527 
2528 	switch (op->base.op) {
2529 	case DRM_GPUVA_OP_MAP:
2530 		err |= xe_vm_insert_vma(vm, op->map.vma);
2531 		if (!err)
2532 			op->flags |= XE_VMA_OP_COMMITTED;
2533 		break;
2534 	case DRM_GPUVA_OP_REMAP:
2535 	{
2536 		u8 tile_present =
2537 			gpuva_to_vma(op->base.remap.unmap->va)->tile_present;
2538 
2539 		prep_vma_destroy(vm, gpuva_to_vma(op->base.remap.unmap->va),
2540 				 true);
2541 		op->flags |= XE_VMA_OP_COMMITTED;
2542 
2543 		if (op->remap.prev) {
2544 			err |= xe_vm_insert_vma(vm, op->remap.prev);
2545 			if (!err)
2546 				op->flags |= XE_VMA_OP_PREV_COMMITTED;
2547 			if (!err && op->remap.skip_prev) {
2548 				op->remap.prev->tile_present =
2549 					tile_present;
2550 				op->remap.prev = NULL;
2551 			}
2552 		}
2553 		if (op->remap.next) {
2554 			err |= xe_vm_insert_vma(vm, op->remap.next);
2555 			if (!err)
2556 				op->flags |= XE_VMA_OP_NEXT_COMMITTED;
2557 			if (!err && op->remap.skip_next) {
2558 				op->remap.next->tile_present =
2559 					tile_present;
2560 				op->remap.next = NULL;
2561 			}
2562 		}
2563 
2564 		/* Adjust for partial unbind after removing VMA from VM */
2565 		if (!err) {
2566 			op->base.remap.unmap->va->va.addr = op->remap.start;
2567 			op->base.remap.unmap->va->va.range = op->remap.range;
2568 		}
2569 		break;
2570 	}
2571 	case DRM_GPUVA_OP_UNMAP:
2572 		prep_vma_destroy(vm, gpuva_to_vma(op->base.unmap.va), true);
2573 		op->flags |= XE_VMA_OP_COMMITTED;
2574 		break;
2575 	case DRM_GPUVA_OP_PREFETCH:
2576 		op->flags |= XE_VMA_OP_COMMITTED;
2577 		break;
2578 	default:
2579 		drm_warn(&vm->xe->drm, "NOT POSSIBLE");
2580 	}
2581 
2582 	return err;
2583 }
2584 
2585 static int vm_bind_ioctl_ops_parse(struct xe_vm *vm, struct drm_gpuva_ops *ops,
2586 				   struct xe_vma_ops *vops)
2587 {
2588 	struct xe_device *xe = vm->xe;
2589 	struct drm_gpuva_op *__op;
2590 	struct xe_tile *tile;
2591 	u8 id, tile_mask = 0;
2592 	int err = 0;
2593 
2594 	lockdep_assert_held_write(&vm->lock);
2595 
2596 	for_each_tile(tile, vm->xe, id)
2597 		tile_mask |= 0x1 << id;
2598 
2599 	drm_gpuva_for_each_op(__op, ops) {
2600 		struct xe_vma_op *op = gpuva_op_to_vma_op(__op);
2601 		struct xe_vma *vma;
2602 		unsigned int flags = 0;
2603 
2604 		INIT_LIST_HEAD(&op->link);
2605 		list_add_tail(&op->link, &vops->list);
2606 		op->tile_mask = tile_mask;
2607 
2608 		switch (op->base.op) {
2609 		case DRM_GPUVA_OP_MAP:
2610 		{
2611 			flags |= op->map.read_only ?
2612 				VMA_CREATE_FLAG_READ_ONLY : 0;
2613 			flags |= op->map.is_null ?
2614 				VMA_CREATE_FLAG_IS_NULL : 0;
2615 			flags |= op->map.dumpable ?
2616 				VMA_CREATE_FLAG_DUMPABLE : 0;
2617 			flags |= op->map.is_cpu_addr_mirror ?
2618 				VMA_CREATE_FLAG_IS_SYSTEM_ALLOCATOR : 0;
2619 
2620 			vma = new_vma(vm, &op->base.map, op->map.pat_index,
2621 				      flags);
2622 			if (IS_ERR(vma))
2623 				return PTR_ERR(vma);
2624 
2625 			op->map.vma = vma;
2626 			if (((op->map.immediate || !xe_vm_in_fault_mode(vm)) &&
2627 			     !op->map.is_cpu_addr_mirror) ||
2628 			    op->map.invalidate_on_bind)
2629 				xe_vma_ops_incr_pt_update_ops(vops,
2630 							      op->tile_mask, 1);
2631 			break;
2632 		}
2633 		case DRM_GPUVA_OP_REMAP:
2634 		{
2635 			struct xe_vma *old =
2636 				gpuva_to_vma(op->base.remap.unmap->va);
2637 			bool skip = xe_vma_is_cpu_addr_mirror(old);
2638 			u64 start = xe_vma_start(old), end = xe_vma_end(old);
2639 			int num_remap_ops = 0;
2640 
2641 			if (op->base.remap.prev)
2642 				start = op->base.remap.prev->va.addr +
2643 					op->base.remap.prev->va.range;
2644 			if (op->base.remap.next)
2645 				end = op->base.remap.next->va.addr;
2646 
2647 			if (xe_vma_is_cpu_addr_mirror(old) &&
2648 			    xe_svm_has_mapping(vm, start, end))
2649 				return -EBUSY;
2650 
2651 			op->remap.start = xe_vma_start(old);
2652 			op->remap.range = xe_vma_size(old);
2653 
2654 			flags |= op->base.remap.unmap->va->flags &
2655 				XE_VMA_READ_ONLY ?
2656 				VMA_CREATE_FLAG_READ_ONLY : 0;
2657 			flags |= op->base.remap.unmap->va->flags &
2658 				DRM_GPUVA_SPARSE ?
2659 				VMA_CREATE_FLAG_IS_NULL : 0;
2660 			flags |= op->base.remap.unmap->va->flags &
2661 				XE_VMA_DUMPABLE ?
2662 				VMA_CREATE_FLAG_DUMPABLE : 0;
2663 			flags |= xe_vma_is_cpu_addr_mirror(old) ?
2664 				VMA_CREATE_FLAG_IS_SYSTEM_ALLOCATOR : 0;
2665 
2666 			if (op->base.remap.prev) {
2667 				vma = new_vma(vm, op->base.remap.prev,
2668 					      old->pat_index, flags);
2669 				if (IS_ERR(vma))
2670 					return PTR_ERR(vma);
2671 
2672 				op->remap.prev = vma;
2673 
2674 				/*
2675 				 * Userptr creates a new SG mapping so
2676 				 * we must also rebind.
2677 				 */
2678 				op->remap.skip_prev = skip ||
2679 					(!xe_vma_is_userptr(old) &&
2680 					IS_ALIGNED(xe_vma_end(vma),
2681 						   xe_vma_max_pte_size(old)));
2682 				if (op->remap.skip_prev) {
2683 					xe_vma_set_pte_size(vma, xe_vma_max_pte_size(old));
2684 					op->remap.range -=
2685 						xe_vma_end(vma) -
2686 						xe_vma_start(old);
2687 					op->remap.start = xe_vma_end(vma);
2688 					vm_dbg(&xe->drm, "REMAP:SKIP_PREV: addr=0x%016llx, range=0x%016llx",
2689 					       (ULL)op->remap.start,
2690 					       (ULL)op->remap.range);
2691 				} else {
2692 					num_remap_ops++;
2693 				}
2694 			}
2695 
2696 			if (op->base.remap.next) {
2697 				vma = new_vma(vm, op->base.remap.next,
2698 					      old->pat_index, flags);
2699 				if (IS_ERR(vma))
2700 					return PTR_ERR(vma);
2701 
2702 				op->remap.next = vma;
2703 
2704 				/*
2705 				 * Userptr creates a new SG mapping so
2706 				 * we must also rebind.
2707 				 */
2708 				op->remap.skip_next = skip ||
2709 					(!xe_vma_is_userptr(old) &&
2710 					IS_ALIGNED(xe_vma_start(vma),
2711 						   xe_vma_max_pte_size(old)));
2712 				if (op->remap.skip_next) {
2713 					xe_vma_set_pte_size(vma, xe_vma_max_pte_size(old));
2714 					op->remap.range -=
2715 						xe_vma_end(old) -
2716 						xe_vma_start(vma);
2717 					vm_dbg(&xe->drm, "REMAP:SKIP_NEXT: addr=0x%016llx, range=0x%016llx",
2718 					       (ULL)op->remap.start,
2719 					       (ULL)op->remap.range);
2720 				} else {
2721 					num_remap_ops++;
2722 				}
2723 			}
2724 			if (!skip)
2725 				num_remap_ops++;
2726 
2727 			xe_vma_ops_incr_pt_update_ops(vops, op->tile_mask, num_remap_ops);
2728 			break;
2729 		}
2730 		case DRM_GPUVA_OP_UNMAP:
2731 			vma = gpuva_to_vma(op->base.unmap.va);
2732 
2733 			if (xe_vma_is_cpu_addr_mirror(vma) &&
2734 			    xe_svm_has_mapping(vm, xe_vma_start(vma),
2735 					       xe_vma_end(vma)))
2736 				return -EBUSY;
2737 
2738 			if (!xe_vma_is_cpu_addr_mirror(vma))
2739 				xe_vma_ops_incr_pt_update_ops(vops, op->tile_mask, 1);
2740 			break;
2741 		case DRM_GPUVA_OP_PREFETCH:
2742 			vma = gpuva_to_vma(op->base.prefetch.va);
2743 
2744 			if (xe_vma_is_userptr(vma)) {
2745 				err = xe_vma_userptr_pin_pages(to_userptr_vma(vma));
2746 				if (err)
2747 					return err;
2748 			}
2749 
2750 			if (xe_vma_is_cpu_addr_mirror(vma))
2751 				xe_vma_ops_incr_pt_update_ops(vops, op->tile_mask,
2752 							      op->prefetch_range.ranges_count);
2753 			else
2754 				xe_vma_ops_incr_pt_update_ops(vops, op->tile_mask, 1);
2755 
2756 			break;
2757 		default:
2758 			drm_warn(&vm->xe->drm, "NOT POSSIBLE");
2759 		}
2760 
2761 		err = xe_vma_op_commit(vm, op);
2762 		if (err)
2763 			return err;
2764 	}
2765 
2766 	return 0;
2767 }
2768 
2769 static void xe_vma_op_unwind(struct xe_vm *vm, struct xe_vma_op *op,
2770 			     bool post_commit, bool prev_post_commit,
2771 			     bool next_post_commit)
2772 {
2773 	lockdep_assert_held_write(&vm->lock);
2774 
2775 	switch (op->base.op) {
2776 	case DRM_GPUVA_OP_MAP:
2777 		if (op->map.vma) {
2778 			prep_vma_destroy(vm, op->map.vma, post_commit);
2779 			xe_vma_destroy_unlocked(op->map.vma);
2780 		}
2781 		break;
2782 	case DRM_GPUVA_OP_UNMAP:
2783 	{
2784 		struct xe_vma *vma = gpuva_to_vma(op->base.unmap.va);
2785 
2786 		if (vma) {
2787 			down_read(&vm->userptr.notifier_lock);
2788 			vma->gpuva.flags &= ~XE_VMA_DESTROYED;
2789 			up_read(&vm->userptr.notifier_lock);
2790 			if (post_commit)
2791 				xe_vm_insert_vma(vm, vma);
2792 		}
2793 		break;
2794 	}
2795 	case DRM_GPUVA_OP_REMAP:
2796 	{
2797 		struct xe_vma *vma = gpuva_to_vma(op->base.remap.unmap->va);
2798 
2799 		if (op->remap.prev) {
2800 			prep_vma_destroy(vm, op->remap.prev, prev_post_commit);
2801 			xe_vma_destroy_unlocked(op->remap.prev);
2802 		}
2803 		if (op->remap.next) {
2804 			prep_vma_destroy(vm, op->remap.next, next_post_commit);
2805 			xe_vma_destroy_unlocked(op->remap.next);
2806 		}
2807 		if (vma) {
2808 			down_read(&vm->userptr.notifier_lock);
2809 			vma->gpuva.flags &= ~XE_VMA_DESTROYED;
2810 			up_read(&vm->userptr.notifier_lock);
2811 			if (post_commit)
2812 				xe_vm_insert_vma(vm, vma);
2813 		}
2814 		break;
2815 	}
2816 	case DRM_GPUVA_OP_PREFETCH:
2817 		/* Nothing to do */
2818 		break;
2819 	default:
2820 		drm_warn(&vm->xe->drm, "NOT POSSIBLE");
2821 	}
2822 }
2823 
2824 static void vm_bind_ioctl_ops_unwind(struct xe_vm *vm,
2825 				     struct drm_gpuva_ops **ops,
2826 				     int num_ops_list)
2827 {
2828 	int i;
2829 
2830 	for (i = num_ops_list - 1; i >= 0; --i) {
2831 		struct drm_gpuva_ops *__ops = ops[i];
2832 		struct drm_gpuva_op *__op;
2833 
2834 		if (!__ops)
2835 			continue;
2836 
2837 		drm_gpuva_for_each_op_reverse(__op, __ops) {
2838 			struct xe_vma_op *op = gpuva_op_to_vma_op(__op);
2839 
2840 			xe_vma_op_unwind(vm, op,
2841 					 op->flags & XE_VMA_OP_COMMITTED,
2842 					 op->flags & XE_VMA_OP_PREV_COMMITTED,
2843 					 op->flags & XE_VMA_OP_NEXT_COMMITTED);
2844 		}
2845 	}
2846 }
2847 
2848 static int vma_lock_and_validate(struct drm_exec *exec, struct xe_vma *vma,
2849 				 bool validate)
2850 {
2851 	struct xe_bo *bo = xe_vma_bo(vma);
2852 	struct xe_vm *vm = xe_vma_vm(vma);
2853 	int err = 0;
2854 
2855 	if (bo) {
2856 		if (!bo->vm)
2857 			err = drm_exec_lock_obj(exec, &bo->ttm.base);
2858 		if (!err && validate)
2859 			err = xe_bo_validate(bo, vm,
2860 					     !xe_vm_in_preempt_fence_mode(vm));
2861 	}
2862 
2863 	return err;
2864 }
2865 
2866 static int check_ufence(struct xe_vma *vma)
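/*
 * Fail with -EBUSY if a user fence attached to the VMA has not signalled
 * yet; otherwise drop the now-stale user-fence reference.
 */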
2867 {
2868 	if (vma->ufence) {
2869 		struct xe_user_fence * const f = vma->ufence;
2870 
2871 		if (!xe_sync_ufence_get_status(f))
2872 			return -EBUSY;
2873 
2874 		vma->ufence = NULL;
2875 		xe_sync_ufence_put(f);
2876 	}
2877 
2878 	return 0;
2879 }
2880 
2881 static int prefetch_ranges(struct xe_vm *vm, struct xe_vma_op *op)
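/*
 * Migrate (if requested) and populate pages for every SVM range collected
 * for a prefetch op on a CPU-address-mirror VMA.
 */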
2882 {
2883 	bool devmem_possible = IS_DGFX(vm->xe) && IS_ENABLED(CONFIG_DRM_XE_DEVMEM_MIRROR);
2884 	struct xe_vma *vma = gpuva_to_vma(op->base.prefetch.va);
2885 	int err = 0;
2886 
2887 	struct xe_svm_range *svm_range;
2888 	struct drm_gpusvm_ctx ctx = {};
2889 	struct xe_tile *tile;
2890 	unsigned long i;
2891 	u32 region;
2892 
2893 	if (!xe_vma_is_cpu_addr_mirror(vma))
2894 		return 0;
2895 
2896 	region = op->prefetch_range.region;
2897 
2898 	ctx.read_only = xe_vma_read_only(vma);
2899 	ctx.devmem_possible = devmem_possible;
2900 	ctx.check_pages_threshold = devmem_possible ? SZ_64K : 0;
2901 
2902 	/* TODO: Thread the migration */
2903 	xa_for_each(&op->prefetch_range.range, i, svm_range) {
2904 		if (!region)
2905 			xe_svm_range_migrate_to_smem(vm, svm_range);
2906 
2907 		if (xe_svm_range_needs_migrate_to_vram(svm_range, vma, region)) {
2908 			tile = &vm->xe->tiles[region_to_mem_type[region] - XE_PL_VRAM0];
2909 			err = xe_svm_alloc_vram(vm, tile, svm_range, &ctx);
2910 			if (err) {
2911 				drm_dbg(&vm->xe->drm, "VRAM allocation failed, retry from userspace, asid=%u, gpusvm=%p, errno=%pe\n",
2912 					vm->usm.asid, &vm->svm.gpusvm, ERR_PTR(err));
2913 				return -ENODATA;
2914 			}
2915 			xe_svm_range_debug(svm_range, "PREFETCH - RANGE MIGRATED TO VRAM");
2916 		}
2917 
2918 		err = xe_svm_range_get_pages(vm, svm_range, &ctx);
2919 		if (err) {
2920 			if (err == -EOPNOTSUPP || err == -EFAULT || err == -EPERM)
2921 				err = -ENODATA;
2922 			drm_dbg(&vm->xe->drm, "Get pages failed, asid=%u, gpusvm=%p, errno=%pe\n",
2923 				vm->usm.asid, &vm->svm.gpusvm, ERR_PTR(err));
2924 			return err;
2925 		}
2926 		xe_svm_range_debug(svm_range, "PREFETCH - RANGE GET PAGES DONE");
2927 	}
2928 
2929 	return err;
2930 }
2931 
2932 static int op_lock_and_prep(struct drm_exec *exec, struct xe_vm *vm,
2933 			    struct xe_vma_op *op)
2934 {
2935 	int err = 0;
2936 
2937 	switch (op->base.op) {
2938 	case DRM_GPUVA_OP_MAP:
2939 		if (!op->map.invalidate_on_bind)
2940 			err = vma_lock_and_validate(exec, op->map.vma,
2941 						    !xe_vm_in_fault_mode(vm) ||
2942 						    op->map.immediate);
2943 		break;
2944 	case DRM_GPUVA_OP_REMAP:
2945 		err = check_ufence(gpuva_to_vma(op->base.remap.unmap->va));
2946 		if (err)
2947 			break;
2948 
2949 		err = vma_lock_and_validate(exec,
2950 					    gpuva_to_vma(op->base.remap.unmap->va),
2951 					    false);
2952 		if (!err && op->remap.prev)
2953 			err = vma_lock_and_validate(exec, op->remap.prev, true);
2954 		if (!err && op->remap.next)
2955 			err = vma_lock_and_validate(exec, op->remap.next, true);
2956 		break;
2957 	case DRM_GPUVA_OP_UNMAP:
2958 		err = check_ufence(gpuva_to_vma(op->base.unmap.va));
2959 		if (err)
2960 			break;
2961 
2962 		err = vma_lock_and_validate(exec,
2963 					    gpuva_to_vma(op->base.unmap.va),
2964 					    false);
2965 		break;
2966 	case DRM_GPUVA_OP_PREFETCH:
2967 	{
2968 		struct xe_vma *vma = gpuva_to_vma(op->base.prefetch.va);
2969 		u32 region;
2970 
2971 		if (xe_vma_is_cpu_addr_mirror(vma))
2972 			region = op->prefetch_range.region;
2973 		else
2974 			region = op->prefetch.region;
2975 
2976 		xe_assert(vm->xe, region < ARRAY_SIZE(region_to_mem_type));
2977 
2978 		err = vma_lock_and_validate(exec,
2979 					    gpuva_to_vma(op->base.prefetch.va),
2980 					    false);
2981 		if (!err && !xe_vma_has_no_bo(vma))
2982 			err = xe_bo_migrate(xe_vma_bo(vma),
2983 					    region_to_mem_type[region]);
2984 		break;
2985 	}
2986 	default:
2987 		drm_warn(&vm->xe->drm, "NOT POSSIBLE");
2988 	}
2989 
2990 	return err;
2991 }
2992 
2993 static int vm_bind_ioctl_ops_prefetch_ranges(struct xe_vm *vm, struct xe_vma_ops *vops)
2994 {
2995 	struct xe_vma_op *op;
2996 	int err;
2997 
2998 	if (!(vops->flags & XE_VMA_OPS_FLAG_HAS_SVM_PREFETCH))
2999 		return 0;
3000 
3001 	list_for_each_entry(op, &vops->list, link) {
3002 		if (op->base.op == DRM_GPUVA_OP_PREFETCH) {
3003 			err = prefetch_ranges(vm, op);
3004 			if (err)
3005 				return err;
3006 		}
3007 	}
3008 
3009 	return 0;
3010 }
3011 
3012 static int vm_bind_ioctl_ops_lock_and_prep(struct drm_exec *exec,
3013 					   struct xe_vm *vm,
3014 					   struct xe_vma_ops *vops)
3015 {
3016 	struct xe_vma_op *op;
3017 	int err;
3018 
3019 	err = drm_exec_lock_obj(exec, xe_vm_obj(vm));
3020 	if (err)
3021 		return err;
3022 
3023 	list_for_each_entry(op, &vops->list, link) {
3024 		err = op_lock_and_prep(exec, vm, op);
3025 		if (err)
3026 			return err;
3027 	}
3028 
3029 #ifdef TEST_VM_OPS_ERROR
3030 	if (vops->inject_error &&
3031 	    vm->xe->vm_inject_error_position == FORCE_OP_ERROR_LOCK)
3032 		return -ENOSPC;
3033 #endif
3034 
3035 	return 0;
3036 }
3037 
3038 static void op_trace(struct xe_vma_op *op)
3039 {
3040 	switch (op->base.op) {
3041 	case DRM_GPUVA_OP_MAP:
3042 		trace_xe_vma_bind(op->map.vma);
3043 		break;
3044 	case DRM_GPUVA_OP_REMAP:
3045 		trace_xe_vma_unbind(gpuva_to_vma(op->base.remap.unmap->va));
3046 		if (op->remap.prev)
3047 			trace_xe_vma_bind(op->remap.prev);
3048 		if (op->remap.next)
3049 			trace_xe_vma_bind(op->remap.next);
3050 		break;
3051 	case DRM_GPUVA_OP_UNMAP:
3052 		trace_xe_vma_unbind(gpuva_to_vma(op->base.unmap.va));
3053 		break;
3054 	case DRM_GPUVA_OP_PREFETCH:
3055 		trace_xe_vma_bind(gpuva_to_vma(op->base.prefetch.va));
3056 		break;
3057 	case DRM_GPUVA_OP_DRIVER:
3058 		break;
3059 	default:
3060 		XE_WARN_ON("NOT POSSIBLE");
3061 	}
3062 }
3063 
3064 static void trace_xe_vm_ops_execute(struct xe_vma_ops *vops)
3065 {
3066 	struct xe_vma_op *op;
3067 
3068 	list_for_each_entry(op, &vops->list, link)
3069 		op_trace(op);
3070 }
3071 
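/*
 * Assign a bind exec queue to each tile that has pending PT update ops and
 * return the number of such tiles.
 */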
3072 static int vm_ops_setup_tile_args(struct xe_vm *vm, struct xe_vma_ops *vops)
3073 {
3074 	struct xe_exec_queue *q = vops->q;
3075 	struct xe_tile *tile;
3076 	int number_tiles = 0;
3077 	u8 id;
3078 
3079 	for_each_tile(tile, vm->xe, id) {
3080 		if (vops->pt_update_ops[id].num_ops)
3081 			++number_tiles;
3082 
3083 		if (vops->pt_update_ops[id].q)
3084 			continue;
3085 
3086 		if (q) {
3087 			vops->pt_update_ops[id].q = q;
3088 			if (vm->pt_root[id] && !list_empty(&q->multi_gt_list))
3089 				q = list_next_entry(q, multi_gt_list);
3090 		} else {
3091 			vops->pt_update_ops[id].q = vm->q[id];
3092 		}
3093 	}
3094 
3095 	return number_tiles;
3096 }
3097 
3098 static struct dma_fence *ops_execute(struct xe_vm *vm,
3099 				     struct xe_vma_ops *vops)
3100 {
3101 	struct xe_tile *tile;
3102 	struct dma_fence *fence = NULL;
3103 	struct dma_fence **fences = NULL;
3104 	struct dma_fence_array *cf = NULL;
3105 	int number_tiles = 0, current_fence = 0, err;
3106 	u8 id;
3107 
3108 	number_tiles = vm_ops_setup_tile_args(vm, vops);
3109 	if (number_tiles == 0)
3110 		return ERR_PTR(-ENODATA);
3111 
3112 	if (number_tiles > 1) {
3113 		fences = kmalloc_array(number_tiles, sizeof(*fences),
3114 				       GFP_KERNEL);
3115 		if (!fences) {
3116 			fence = ERR_PTR(-ENOMEM);
3117 			goto err_trace;
3118 		}
3119 	}
3120 
3121 	for_each_tile(tile, vm->xe, id) {
3122 		if (!vops->pt_update_ops[id].num_ops)
3123 			continue;
3124 
3125 		err = xe_pt_update_ops_prepare(tile, vops);
3126 		if (err) {
3127 			fence = ERR_PTR(err);
3128 			goto err_out;
3129 		}
3130 	}
3131 
3132 	trace_xe_vm_ops_execute(vops);
3133 
3134 	for_each_tile(tile, vm->xe, id) {
3135 		if (!vops->pt_update_ops[id].num_ops)
3136 			continue;
3137 
3138 		fence = xe_pt_update_ops_run(tile, vops);
3139 		if (IS_ERR(fence))
3140 			goto err_out;
3141 
3142 		if (fences)
3143 			fences[current_fence++] = fence;
3144 	}
3145 
3146 	if (fences) {
3147 		cf = dma_fence_array_create(number_tiles, fences,
3148 					    vm->composite_fence_ctx,
3149 					    vm->composite_fence_seqno++,
3150 					    false);
3151 		if (!cf) {
3152 			--vm->composite_fence_seqno;
3153 			fence = ERR_PTR(-ENOMEM);
3154 			goto err_out;
3155 		}
3156 		fence = &cf->base;
3157 	}
3158 
3159 	for_each_tile(tile, vm->xe, id) {
3160 		if (!vops->pt_update_ops[id].num_ops)
3161 			continue;
3162 
3163 		xe_pt_update_ops_fini(tile, vops);
3164 	}
3165 
3166 	return fence;
3167 
3168 err_out:
3169 	for_each_tile(tile, vm->xe, id) {
3170 		if (!vops->pt_update_ops[id].num_ops)
3171 			continue;
3172 
3173 		xe_pt_update_ops_abort(tile, vops);
3174 	}
3175 	while (current_fence)
3176 		dma_fence_put(fences[--current_fence]);
3177 	kfree(fences);
3178 	kfree(cf);
3179 
3180 err_trace:
3181 	trace_xe_vm_ops_fail(vm);
3182 	return fence;
3183 }
3184 
3185 static void vma_add_ufence(struct xe_vma *vma, struct xe_user_fence *ufence)
3186 {
3187 	if (vma->ufence)
3188 		xe_sync_ufence_put(vma->ufence);
3189 	vma->ufence = __xe_sync_ufence_get(ufence);
3190 }
3191 
3192 static void op_add_ufence(struct xe_vm *vm, struct xe_vma_op *op,
3193 			  struct xe_user_fence *ufence)
3194 {
3195 	switch (op->base.op) {
3196 	case DRM_GPUVA_OP_MAP:
3197 		vma_add_ufence(op->map.vma, ufence);
3198 		break;
3199 	case DRM_GPUVA_OP_REMAP:
3200 		if (op->remap.prev)
3201 			vma_add_ufence(op->remap.prev, ufence);
3202 		if (op->remap.next)
3203 			vma_add_ufence(op->remap.next, ufence);
3204 		break;
3205 	case DRM_GPUVA_OP_UNMAP:
3206 		break;
3207 	case DRM_GPUVA_OP_PREFETCH:
3208 		vma_add_ufence(gpuva_to_vma(op->base.prefetch.va), ufence);
3209 		break;
3210 	default:
3211 		drm_warn(&vm->xe->drm, "NOT POSSIBLE");
3212 	}
3213 }
3214 
3215 static void vm_bind_ioctl_ops_fini(struct xe_vm *vm, struct xe_vma_ops *vops,
3216 				   struct dma_fence *fence)
3217 {
3218 	struct xe_exec_queue *wait_exec_queue = to_wait_exec_queue(vm, vops->q);
3219 	struct xe_user_fence *ufence;
3220 	struct xe_vma_op *op;
3221 	int i;
3222 
3223 	ufence = find_ufence_get(vops->syncs, vops->num_syncs);
3224 	list_for_each_entry(op, &vops->list, link) {
3225 		if (ufence)
3226 			op_add_ufence(vm, op, ufence);
3227 
3228 		if (op->base.op == DRM_GPUVA_OP_UNMAP)
3229 			xe_vma_destroy(gpuva_to_vma(op->base.unmap.va), fence);
3230 		else if (op->base.op == DRM_GPUVA_OP_REMAP)
3231 			xe_vma_destroy(gpuva_to_vma(op->base.remap.unmap->va),
3232 				       fence);
3233 	}
3234 	if (ufence)
3235 		xe_sync_ufence_put(ufence);
3236 	if (fence) {
3237 		for (i = 0; i < vops->num_syncs; i++)
3238 			xe_sync_entry_signal(vops->syncs + i, fence);
3239 		xe_exec_queue_last_fence_set(wait_exec_queue, vm, fence);
3240 	}
3241 }
3242 
3243 static struct dma_fence *vm_bind_ioctl_ops_execute(struct xe_vm *vm,
3244 						   struct xe_vma_ops *vops)
3245 {
3246 	struct drm_exec exec;
3247 	struct dma_fence *fence;
3248 	int err;
3249 
3250 	lockdep_assert_held_write(&vm->lock);
3251 
3252 	drm_exec_init(&exec, DRM_EXEC_INTERRUPTIBLE_WAIT |
3253 		      DRM_EXEC_IGNORE_DUPLICATES, 0);
3254 	drm_exec_until_all_locked(&exec) {
3255 		err = vm_bind_ioctl_ops_lock_and_prep(&exec, vm, vops);
3256 		drm_exec_retry_on_contention(&exec);
3257 		if (err) {
3258 			fence = ERR_PTR(err);
3259 			goto unlock;
3260 		}
3261 
3262 		fence = ops_execute(vm, vops);
3263 		if (IS_ERR(fence)) {
3264 			if (PTR_ERR(fence) == -ENODATA)
3265 				vm_bind_ioctl_ops_fini(vm, vops, NULL);
3266 			goto unlock;
3267 		}
3268 
3269 		vm_bind_ioctl_ops_fini(vm, vops, fence);
3270 	}
3271 
3272 unlock:
3273 	drm_exec_fini(&exec);
3274 	return fence;
3275 }
3276 ALLOW_ERROR_INJECTION(vm_bind_ioctl_ops_execute, ERRNO);
3277 
3278 #define SUPPORTED_FLAGS_STUB  \
3279 	(DRM_XE_VM_BIND_FLAG_READONLY | \
3280 	 DRM_XE_VM_BIND_FLAG_IMMEDIATE | \
3281 	 DRM_XE_VM_BIND_FLAG_NULL | \
3282 	 DRM_XE_VM_BIND_FLAG_DUMPABLE | \
3283 	 DRM_XE_VM_BIND_FLAG_CHECK_PXP | \
3284 	 DRM_XE_VM_BIND_FLAG_CPU_ADDR_MIRROR)
3285 
3286 #ifdef TEST_VM_OPS_ERROR
3287 #define SUPPORTED_FLAGS	(SUPPORTED_FLAGS_STUB | FORCE_OP_ERROR)
3288 #else
3289 #define SUPPORTED_FLAGS	SUPPORTED_FLAGS_STUB
3290 #endif
3291 
3292 #define XE_64K_PAGE_MASK 0xffffull
3293 #define ALL_DRM_XE_SYNCS_FLAGS (DRM_XE_SYNCS_FLAG_WAIT_FOR_OP)
3294 
3295 static int vm_bind_ioctl_check_args(struct xe_device *xe, struct xe_vm *vm,
3296 				    struct drm_xe_vm_bind *args,
3297 				    struct drm_xe_vm_bind_op **bind_ops)
3298 {
3299 	int err;
3300 	int i;
3301 
3302 	if (XE_IOCTL_DBG(xe, args->pad || args->pad2) ||
3303 	    XE_IOCTL_DBG(xe, args->reserved[0] || args->reserved[1]))
3304 		return -EINVAL;
3305 
3306 	if (XE_IOCTL_DBG(xe, args->extensions))
3307 		return -EINVAL;
3308 
3309 	if (args->num_binds > 1) {
3310 		u64 __user *bind_user =
3311 			u64_to_user_ptr(args->vector_of_binds);
3312 
3313 		*bind_ops = kvmalloc_array(args->num_binds,
3314 					   sizeof(struct drm_xe_vm_bind_op),
3315 					   GFP_KERNEL | __GFP_ACCOUNT |
3316 					   __GFP_RETRY_MAYFAIL | __GFP_NOWARN);
3317 		if (!*bind_ops)
3318 			return args->num_binds > 1 ? -ENOBUFS : -ENOMEM;
3319 
3320 		err = copy_from_user(*bind_ops, bind_user,
3321 				     sizeof(struct drm_xe_vm_bind_op) *
3322 				     args->num_binds);
3323 		if (XE_IOCTL_DBG(xe, err)) {
3324 			err = -EFAULT;
3325 			goto free_bind_ops;
3326 		}
3327 	} else {
3328 		*bind_ops = &args->bind;
3329 	}
3330 
3331 	for (i = 0; i < args->num_binds; ++i) {
3332 		u64 range = (*bind_ops)[i].range;
3333 		u64 addr = (*bind_ops)[i].addr;
3334 		u32 op = (*bind_ops)[i].op;
3335 		u32 flags = (*bind_ops)[i].flags;
3336 		u32 obj = (*bind_ops)[i].obj;
3337 		u64 obj_offset = (*bind_ops)[i].obj_offset;
3338 		u32 prefetch_region = (*bind_ops)[i].prefetch_mem_region_instance;
3339 		bool is_null = flags & DRM_XE_VM_BIND_FLAG_NULL;
3340 		bool is_cpu_addr_mirror = flags &
3341 			DRM_XE_VM_BIND_FLAG_CPU_ADDR_MIRROR;
3342 		u16 pat_index = (*bind_ops)[i].pat_index;
3343 		u16 coh_mode;
3344 
3345 		if (XE_IOCTL_DBG(xe, is_cpu_addr_mirror &&
3346 				 (!xe_vm_in_fault_mode(vm) ||
3347 				 !IS_ENABLED(CONFIG_DRM_XE_GPUSVM)))) {
3348 			err = -EINVAL;
3349 			goto free_bind_ops;
3350 		}
3351 
3352 		if (XE_IOCTL_DBG(xe, pat_index >= xe->pat.n_entries)) {
3353 			err = -EINVAL;
3354 			goto free_bind_ops;
3355 		}
3356 
3357 		pat_index = array_index_nospec(pat_index, xe->pat.n_entries);
3358 		(*bind_ops)[i].pat_index = pat_index;
3359 		coh_mode = xe_pat_index_get_coh_mode(xe, pat_index);
3360 		if (XE_IOCTL_DBG(xe, !coh_mode)) { /* hw reserved */
3361 			err = -EINVAL;
3362 			goto free_bind_ops;
3363 		}
3364 
3365 		if (XE_WARN_ON(coh_mode > XE_COH_AT_LEAST_1WAY)) {
3366 			err = -EINVAL;
3367 			goto free_bind_ops;
3368 		}
3369 
3370 		if (XE_IOCTL_DBG(xe, op > DRM_XE_VM_BIND_OP_PREFETCH) ||
3371 		    XE_IOCTL_DBG(xe, flags & ~SUPPORTED_FLAGS) ||
3372 		    XE_IOCTL_DBG(xe, obj && (is_null || is_cpu_addr_mirror)) ||
3373 		    XE_IOCTL_DBG(xe, obj_offset && (is_null ||
3374 						    is_cpu_addr_mirror)) ||
3375 		    XE_IOCTL_DBG(xe, op != DRM_XE_VM_BIND_OP_MAP &&
3376 				 (is_null || is_cpu_addr_mirror)) ||
3377 		    XE_IOCTL_DBG(xe, !obj &&
3378 				 op == DRM_XE_VM_BIND_OP_MAP &&
3379 				 !is_null && !is_cpu_addr_mirror) ||
3380 		    XE_IOCTL_DBG(xe, !obj &&
3381 				 op == DRM_XE_VM_BIND_OP_UNMAP_ALL) ||
3382 		    XE_IOCTL_DBG(xe, addr &&
3383 				 op == DRM_XE_VM_BIND_OP_UNMAP_ALL) ||
3384 		    XE_IOCTL_DBG(xe, range &&
3385 				 op == DRM_XE_VM_BIND_OP_UNMAP_ALL) ||
3386 		    XE_IOCTL_DBG(xe, obj &&
3387 				 op == DRM_XE_VM_BIND_OP_MAP_USERPTR) ||
3388 		    XE_IOCTL_DBG(xe, coh_mode == XE_COH_NONE &&
3389 				 op == DRM_XE_VM_BIND_OP_MAP_USERPTR) ||
3390 		    XE_IOCTL_DBG(xe, obj &&
3391 				 op == DRM_XE_VM_BIND_OP_PREFETCH) ||
3392 		    XE_IOCTL_DBG(xe, prefetch_region &&
3393 				 op != DRM_XE_VM_BIND_OP_PREFETCH) ||
3394 		    XE_IOCTL_DBG(xe, !(BIT(prefetch_region) &
3395 				       xe->info.mem_region_mask)) ||
3396 		    XE_IOCTL_DBG(xe, obj &&
3397 				 op == DRM_XE_VM_BIND_OP_UNMAP)) {
3398 			err = -EINVAL;
3399 			goto free_bind_ops;
3400 		}
3401 
3402 		if (XE_IOCTL_DBG(xe, obj_offset & ~PAGE_MASK) ||
3403 		    XE_IOCTL_DBG(xe, addr & ~PAGE_MASK) ||
3404 		    XE_IOCTL_DBG(xe, range & ~PAGE_MASK) ||
3405 		    XE_IOCTL_DBG(xe, !range &&
3406 				 op != DRM_XE_VM_BIND_OP_UNMAP_ALL)) {
3407 			err = -EINVAL;
3408 			goto free_bind_ops;
3409 		}
3410 	}
3411 
3412 	return 0;
3413 
3414 free_bind_ops:
3415 	if (args->num_binds > 1)
3416 		kvfree(*bind_ops);
3417 	return err;
3418 }
3419 
3420 static int vm_bind_ioctl_signal_fences(struct xe_vm *vm,
3421 				       struct xe_exec_queue *q,
3422 				       struct xe_sync_entry *syncs,
3423 				       int num_syncs)
3424 {
3425 	struct dma_fence *fence;
3426 	int i, err = 0;
3427 
3428 	fence = xe_sync_in_fence_get(syncs, num_syncs,
3429 				     to_wait_exec_queue(vm, q), vm);
3430 	if (IS_ERR(fence))
3431 		return PTR_ERR(fence);
3432 
3433 	for (i = 0; i < num_syncs; i++)
3434 		xe_sync_entry_signal(&syncs[i], fence);
3435 
3436 	xe_exec_queue_last_fence_set(to_wait_exec_queue(vm, q), vm,
3437 				     fence);
3438 	dma_fence_put(fence);
3439 
3440 	return err;
3441 }
3442 
3443 static void xe_vma_ops_init(struct xe_vma_ops *vops, struct xe_vm *vm,
3444 			    struct xe_exec_queue *q,
3445 			    struct xe_sync_entry *syncs, u32 num_syncs)
3446 {
3447 	memset(vops, 0, sizeof(*vops));
3448 	INIT_LIST_HEAD(&vops->list);
3449 	vops->vm = vm;
3450 	vops->q = q;
3451 	vops->syncs = syncs;
3452 	vops->num_syncs = num_syncs;
3453 	vops->flags = 0;
3454 }
3455 
3456 static int xe_vm_bind_ioctl_validate_bo(struct xe_device *xe, struct xe_bo *bo,
3457 					u64 addr, u64 range, u64 obj_offset,
3458 					u16 pat_index, u32 op, u32 bind_flags)
3459 {
3460 	u16 coh_mode;
3461 
3462 	if (XE_IOCTL_DBG(xe, range > bo->size) ||
3463 	    XE_IOCTL_DBG(xe, obj_offset >
3464 			 bo->size - range)) {
3465 		return -EINVAL;
3466 	}
3467 
3468 	/*
3469 	 * Some platforms require 64k VM_BIND alignment,
3470 	 * specifically those with XE_VRAM_FLAGS_NEED64K.
3471 	 *
3472 	 * Other platforms may have BOs set to 64k physical placement,
3473 	 * but can be mapped at 4k offsets anyway. This check is only
3474 	 * there for the former case.
3475 	 */
3476 	if ((bo->flags & XE_BO_FLAG_INTERNAL_64K) &&
3477 	    (xe->info.vram_flags & XE_VRAM_FLAGS_NEED64K)) {
3478 		if (XE_IOCTL_DBG(xe, obj_offset &
3479 				 XE_64K_PAGE_MASK) ||
3480 		    XE_IOCTL_DBG(xe, addr & XE_64K_PAGE_MASK) ||
3481 		    XE_IOCTL_DBG(xe, range & XE_64K_PAGE_MASK)) {
3482 			return -EINVAL;
3483 		}
3484 	}
3485 
3486 	coh_mode = xe_pat_index_get_coh_mode(xe, pat_index);
3487 	if (bo->cpu_caching) {
3488 		if (XE_IOCTL_DBG(xe, coh_mode == XE_COH_NONE &&
3489 				 bo->cpu_caching == DRM_XE_GEM_CPU_CACHING_WB)) {
3490 			return -EINVAL;
3491 		}
3492 	} else if (XE_IOCTL_DBG(xe, coh_mode == XE_COH_NONE)) {
3493 		/*
3494 		 * An imported dma-buf from a different device should
3495 		 * require 1-way or 2-way coherency since we don't know
3496 		 * how it was mapped on the CPU. Just assume it is
3497 		 * potentially cached on the CPU side.
3498 		 */
3499 		return -EINVAL;
3500 	}
3501 
3502 	/* If a BO is protected it can only be mapped if the key is still valid */
3503 	if ((bind_flags & DRM_XE_VM_BIND_FLAG_CHECK_PXP) && xe_bo_is_protected(bo) &&
3504 	    op != DRM_XE_VM_BIND_OP_UNMAP && op != DRM_XE_VM_BIND_OP_UNMAP_ALL)
3505 		if (XE_IOCTL_DBG(xe, xe_pxp_bo_key_check(xe->pxp, bo) != 0))
3506 			return -ENOEXEC;
3507 
3508 	return 0;
3509 }
3510 
3511 int xe_vm_bind_ioctl(struct drm_device *dev, void *data, struct drm_file *file)
3512 {
3513 	struct xe_device *xe = to_xe_device(dev);
3514 	struct xe_file *xef = to_xe_file(file);
3515 	struct drm_xe_vm_bind *args = data;
3516 	struct drm_xe_sync __user *syncs_user;
3517 	struct xe_bo **bos = NULL;
3518 	struct drm_gpuva_ops **ops = NULL;
3519 	struct xe_vm *vm;
3520 	struct xe_exec_queue *q = NULL;
3521 	u32 num_syncs, num_ufence = 0;
3522 	struct xe_sync_entry *syncs = NULL;
3523 	struct drm_xe_vm_bind_op *bind_ops;
3524 	struct xe_vma_ops vops;
3525 	struct dma_fence *fence;
3526 	int err;
3527 	int i;
3528 
3529 	vm = xe_vm_lookup(xef, args->vm_id);
3530 	if (XE_IOCTL_DBG(xe, !vm))
3531 		return -EINVAL;
3532 
3533 	err = vm_bind_ioctl_check_args(xe, vm, args, &bind_ops);
3534 	if (err)
3535 		goto put_vm;
3536 
3537 	if (args->exec_queue_id) {
3538 		q = xe_exec_queue_lookup(xef, args->exec_queue_id);
3539 		if (XE_IOCTL_DBG(xe, !q)) {
3540 			err = -ENOENT;
3541 			goto put_vm;
3542 		}
3543 
3544 		if (XE_IOCTL_DBG(xe, !(q->flags & EXEC_QUEUE_FLAG_VM))) {
3545 			err = -EINVAL;
3546 			goto put_exec_queue;
3547 		}
3548 	}
3549 
3550 	/* Ensure all UNMAPs are visible */
3551 	xe_svm_flush(vm);
3552 
3553 	err = down_write_killable(&vm->lock);
3554 	if (err)
3555 		goto put_exec_queue;
3556 
3557 	if (XE_IOCTL_DBG(xe, xe_vm_is_closed_or_banned(vm))) {
3558 		err = -ENOENT;
3559 		goto release_vm_lock;
3560 	}
3561 
3562 	for (i = 0; i < args->num_binds; ++i) {
3563 		u64 range = bind_ops[i].range;
3564 		u64 addr = bind_ops[i].addr;
3565 
3566 		if (XE_IOCTL_DBG(xe, range > vm->size) ||
3567 		    XE_IOCTL_DBG(xe, addr > vm->size - range)) {
3568 			err = -EINVAL;
3569 			goto release_vm_lock;
3570 		}
3571 	}
3572 
3573 	if (args->num_binds) {
3574 		bos = kvcalloc(args->num_binds, sizeof(*bos),
3575 			       GFP_KERNEL | __GFP_ACCOUNT |
3576 			       __GFP_RETRY_MAYFAIL | __GFP_NOWARN);
3577 		if (!bos) {
3578 			err = -ENOMEM;
3579 			goto release_vm_lock;
3580 		}
3581 
3582 		ops = kvcalloc(args->num_binds, sizeof(*ops),
3583 			       GFP_KERNEL | __GFP_ACCOUNT |
3584 			       __GFP_RETRY_MAYFAIL | __GFP_NOWARN);
3585 		if (!ops) {
3586 			err = -ENOMEM;
3587 			goto release_vm_lock;
3588 		}
3589 	}
3590 
3591 	for (i = 0; i < args->num_binds; ++i) {
3592 		struct drm_gem_object *gem_obj;
3593 		u64 range = bind_ops[i].range;
3594 		u64 addr = bind_ops[i].addr;
3595 		u32 obj = bind_ops[i].obj;
3596 		u64 obj_offset = bind_ops[i].obj_offset;
3597 		u16 pat_index = bind_ops[i].pat_index;
3598 		u32 op = bind_ops[i].op;
3599 		u32 bind_flags = bind_ops[i].flags;
3600 
3601 		if (!obj)
3602 			continue;
3603 
3604 		gem_obj = drm_gem_object_lookup(file, obj);
3605 		if (XE_IOCTL_DBG(xe, !gem_obj)) {
3606 			err = -ENOENT;
3607 			goto put_obj;
3608 		}
3609 		bos[i] = gem_to_xe_bo(gem_obj);
3610 
3611 		err = xe_vm_bind_ioctl_validate_bo(xe, bos[i], addr, range,
3612 						   obj_offset, pat_index, op,
3613 						   bind_flags);
3614 		if (err)
3615 			goto put_obj;
3616 	}
3617 
3618 	if (args->num_syncs) {
3619 		syncs = kcalloc(args->num_syncs, sizeof(*syncs), GFP_KERNEL);
3620 		if (!syncs) {
3621 			err = -ENOMEM;
3622 			goto put_obj;
3623 		}
3624 	}
3625 
3626 	syncs_user = u64_to_user_ptr(args->syncs);
3627 	for (num_syncs = 0; num_syncs < args->num_syncs; num_syncs++) {
3628 		err = xe_sync_entry_parse(xe, xef, &syncs[num_syncs],
3629 					  &syncs_user[num_syncs],
3630 					  (xe_vm_in_lr_mode(vm) ?
3631 					   SYNC_PARSE_FLAG_LR_MODE : 0) |
3632 					  (!args->num_binds ?
3633 					   SYNC_PARSE_FLAG_DISALLOW_USER_FENCE : 0));
3634 		if (err)
3635 			goto free_syncs;
3636 
3637 		if (xe_sync_is_ufence(&syncs[num_syncs]))
3638 			num_ufence++;
3639 	}
3640 
3641 	if (XE_IOCTL_DBG(xe, num_ufence > 1)) {
3642 		err = -EINVAL;
3643 		goto free_syncs;
3644 	}
3645 
3646 	if (!args->num_binds) {
3647 		err = -ENODATA;
3648 		goto free_syncs;
3649 	}
3650 
3651 	xe_vma_ops_init(&vops, vm, q, syncs, num_syncs);
3652 	for (i = 0; i < args->num_binds; ++i) {
3653 		u64 range = bind_ops[i].range;
3654 		u64 addr = bind_ops[i].addr;
3655 		u32 op = bind_ops[i].op;
3656 		u32 flags = bind_ops[i].flags;
3657 		u64 obj_offset = bind_ops[i].obj_offset;
3658 		u32 prefetch_region = bind_ops[i].prefetch_mem_region_instance;
3659 		u16 pat_index = bind_ops[i].pat_index;
3660 
3661 		ops[i] = vm_bind_ioctl_ops_create(vm, &vops, bos[i], obj_offset,
3662 						  addr, range, op, flags,
3663 						  prefetch_region, pat_index);
3664 		if (IS_ERR(ops[i])) {
3665 			err = PTR_ERR(ops[i]);
3666 			ops[i] = NULL;
3667 			goto unwind_ops;
3668 		}
3669 
3670 		err = vm_bind_ioctl_ops_parse(vm, ops[i], &vops);
3671 		if (err)
3672 			goto unwind_ops;
3673 
3674 #ifdef TEST_VM_OPS_ERROR
3675 		if (flags & FORCE_OP_ERROR) {
3676 			vops.inject_error = true;
3677 			vm->xe->vm_inject_error_position =
3678 				(vm->xe->vm_inject_error_position + 1) %
3679 				FORCE_OP_ERROR_COUNT;
3680 		}
3681 #endif
3682 	}
3683 
3684 	/* Nothing to do */
3685 	if (list_empty(&vops.list)) {
3686 		err = -ENODATA;
3687 		goto unwind_ops;
3688 	}
3689 
3690 	err = xe_vma_ops_alloc(&vops, args->num_binds > 1);
3691 	if (err)
3692 		goto unwind_ops;
3693 
3694 	err = vm_bind_ioctl_ops_prefetch_ranges(vm, &vops);
3695 	if (err)
3696 		goto unwind_ops;
3697 
3698 	fence = vm_bind_ioctl_ops_execute(vm, &vops);
3699 	if (IS_ERR(fence))
3700 		err = PTR_ERR(fence);
3701 	else
3702 		dma_fence_put(fence);
3703 
3704 unwind_ops:
3705 	if (err && err != -ENODATA)
3706 		vm_bind_ioctl_ops_unwind(vm, ops, args->num_binds);
3707 	xe_vma_ops_fini(&vops);
3708 	for (i = args->num_binds - 1; i >= 0; --i)
3709 		if (ops[i])
3710 			drm_gpuva_ops_free(&vm->gpuvm, ops[i]);
3711 free_syncs:
3712 	if (err == -ENODATA)
3713 		err = vm_bind_ioctl_signal_fences(vm, q, syncs, num_syncs);
3714 	while (num_syncs--)
3715 		xe_sync_entry_cleanup(&syncs[num_syncs]);
3716 
3717 	kfree(syncs);
3718 put_obj:
3719 	for (i = 0; i < args->num_binds; ++i)
3720 		xe_bo_put(bos[i]);
3721 release_vm_lock:
3722 	up_write(&vm->lock);
3723 put_exec_queue:
3724 	if (q)
3725 		xe_exec_queue_put(q);
3726 put_vm:
3727 	xe_vm_put(vm);
3728 	kvfree(bos);
3729 	kvfree(ops);
3730 	if (args->num_binds > 1)
3731 		kvfree(bind_ops);
3732 	return err;
3733 }
3734 
3735 /**
3736  * xe_vm_bind_kernel_bo - bind a kernel BO to a VM
3737  * @vm: VM to bind the BO to
3738  * @bo: BO to bind
3739  * @q: exec queue to use for the bind (optional)
3740  * @addr: address at which to bind the BO
3741  * @cache_lvl: PAT cache level to use
3742  *
3743  * Execute a VM bind map operation on a kernel-owned BO to bind it into a
3744  * kernel-owned VM.
3745  *
3746  * Return: a dma_fence to track the binding completion if the job to do so was
3747  * successfully submitted, an error pointer otherwise.
3748  */
3749 struct dma_fence *xe_vm_bind_kernel_bo(struct xe_vm *vm, struct xe_bo *bo,
3750 				       struct xe_exec_queue *q, u64 addr,
3751 				       enum xe_cache_level cache_lvl)
3752 {
3753 	struct xe_vma_ops vops;
3754 	struct drm_gpuva_ops *ops = NULL;
3755 	struct dma_fence *fence;
3756 	int err;
3757 
3758 	xe_bo_get(bo);
3759 	xe_vm_get(vm);
3760 	if (q)
3761 		xe_exec_queue_get(q);
3762 
3763 	down_write(&vm->lock);
3764 
3765 	xe_vma_ops_init(&vops, vm, q, NULL, 0);
3766 
3767 	ops = vm_bind_ioctl_ops_create(vm, &vops, bo, 0, addr, bo->size,
3768 				       DRM_XE_VM_BIND_OP_MAP, 0, 0,
3769 				       vm->xe->pat.idx[cache_lvl]);
3770 	if (IS_ERR(ops)) {
3771 		err = PTR_ERR(ops);
3772 		goto release_vm_lock;
3773 	}
3774 
3775 	err = vm_bind_ioctl_ops_parse(vm, ops, &vops);
3776 	if (err)
3777 		goto release_vm_lock;
3778 
3779 	xe_assert(vm->xe, !list_empty(&vops.list));
3780 
3781 	err = xe_vma_ops_alloc(&vops, false);
3782 	if (err)
3783 		goto unwind_ops;
3784 
3785 	fence = vm_bind_ioctl_ops_execute(vm, &vops);
3786 	if (IS_ERR(fence))
3787 		err = PTR_ERR(fence);
3788 
3789 unwind_ops:
3790 	if (err && err != -ENODATA)
3791 		vm_bind_ioctl_ops_unwind(vm, &ops, 1);
3792 
3793 	xe_vma_ops_fini(&vops);
3794 	drm_gpuva_ops_free(&vm->gpuvm, ops);
3795 
3796 release_vm_lock:
3797 	up_write(&vm->lock);
3798 
3799 	if (q)
3800 		xe_exec_queue_put(q);
3801 	xe_vm_put(vm);
3802 	xe_bo_put(bo);
3803 
3804 	if (err)
3805 		fence = ERR_PTR(err);
3806 
3807 	return fence;
3808 }
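
/*
 * Illustrative usage sketch (not part of the driver): a kernel-internal
 * caller would typically wait on the returned fence before using the
 * mapping, e.g.
 *
 *	fence = xe_vm_bind_kernel_bo(vm, bo, NULL, addr, XE_CACHE_WB);
 *	if (IS_ERR(fence))
 *		return PTR_ERR(fence);
 *	dma_fence_wait(fence, false);
 *	dma_fence_put(fence);
 *
 * XE_CACHE_WB is used here only as an example cache level.
 */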
3809 
3810 /**
3811  * xe_vm_lock() - Lock the vm's dma_resv object
3812  * @vm: The struct xe_vm whose lock is to be locked
3813  * @intr: Whether to perform any wait interruptible
3814  *
3815  * Return: 0 on success, -EINTR if @intr is true and the wait for a
3816  * contended lock was interrupted. If @intr is false, the function
3817  * always returns 0.
3818  */
3819 int xe_vm_lock(struct xe_vm *vm, bool intr)
3820 {
3821 	if (intr)
3822 		return dma_resv_lock_interruptible(xe_vm_resv(vm), NULL);
3823 
3824 	return dma_resv_lock(xe_vm_resv(vm), NULL);
3825 }
3826 
3827 /**
3828  * xe_vm_unlock() - Unlock the vm's dma_resv object
3829  * @vm: The struct xe_vm whose lock is to be released.
3830  *
3831  * Unlock the vm's dma_resv object previously locked by xe_vm_lock().
3832  */
3833 void xe_vm_unlock(struct xe_vm *vm)
3834 {
3835 	dma_resv_unlock(xe_vm_resv(vm));
3836 }
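
/*
 * Illustrative lock/unlock pairing (sketch only): callers bracket work on
 * the VM's dma_resv with these helpers, e.g.
 *
 *	err = xe_vm_lock(vm, true);
 *	if (err)
 *		return err;
 *	...operate on objects protected by the VM resv...
 *	xe_vm_unlock(vm);
 */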
3837 
3838 /**
3839  * xe_vm_invalidate_vma - invalidate GPU mappings for VMA without a lock
3840  * @vma: VMA to invalidate
3841  *
3842  * Walks the list of page-table leaves, zeroing the entries owned by this
3843  * VMA, invalidates the TLBs, and blocks until the TLB invalidation is
3844  * complete.
3845  *
3846  * Return: 0 for success, negative error code otherwise.
3847  */
3848 int xe_vm_invalidate_vma(struct xe_vma *vma)
3849 {
3850 	struct xe_device *xe = xe_vma_vm(vma)->xe;
3851 	struct xe_tile *tile;
3852 	struct xe_gt_tlb_invalidation_fence
3853 		fence[XE_MAX_TILES_PER_DEVICE * XE_MAX_GT_PER_TILE];
3854 	u8 id;
3855 	u32 fence_id = 0;
3856 	int ret = 0;
3857 
3858 	xe_assert(xe, !xe_vma_is_null(vma));
3859 	xe_assert(xe, !xe_vma_is_cpu_addr_mirror(vma));
3860 	trace_xe_vma_invalidate(vma);
3861 
3862 	vm_dbg(&xe_vma_vm(vma)->xe->drm,
3863 	       "INVALIDATE: addr=0x%016llx, range=0x%016llx",
3864 		xe_vma_start(vma), xe_vma_size(vma));
3865 
3866 	/* Check that we don't race with page-table updates */
3867 	if (IS_ENABLED(CONFIG_PROVE_LOCKING)) {
3868 		if (xe_vma_is_userptr(vma)) {
3869 			WARN_ON_ONCE(!mmu_interval_check_retry
3870 				     (&to_userptr_vma(vma)->userptr.notifier,
3871 				      to_userptr_vma(vma)->userptr.notifier_seq));
3872 			WARN_ON_ONCE(!dma_resv_test_signaled(xe_vm_resv(xe_vma_vm(vma)),
3873 							     DMA_RESV_USAGE_BOOKKEEP));
3874 
3875 		} else {
3876 			xe_bo_assert_held(xe_vma_bo(vma));
3877 		}
3878 	}
3879 
3880 	for_each_tile(tile, xe, id) {
3881 		if (xe_pt_zap_ptes(tile, vma)) {
3882 			xe_device_wmb(xe);
3883 			xe_gt_tlb_invalidation_fence_init(tile->primary_gt,
3884 							  &fence[fence_id],
3885 							  true);
3886 
3887 			ret = xe_gt_tlb_invalidation_vma(tile->primary_gt,
3888 							 &fence[fence_id], vma);
3889 			if (ret)
3890 				goto wait;
3891 			++fence_id;
3892 
3893 			if (!tile->media_gt)
3894 				continue;
3895 
3896 			xe_gt_tlb_invalidation_fence_init(tile->media_gt,
3897 							  &fence[fence_id],
3898 							  true);
3899 
3900 			ret = xe_gt_tlb_invalidation_vma(tile->media_gt,
3901 							 &fence[fence_id], vma);
3902 			if (ret)
3903 				goto wait;
3904 			++fence_id;
3905 		}
3906 	}
3907 
3908 wait:
3909 	for (id = 0; id < fence_id; ++id)
3910 		xe_gt_tlb_invalidation_fence_wait(&fence[id]);
3911 
3912 	vma->tile_invalidated = vma->tile_mask;
3913 
3914 	return ret;
3915 }
3916 
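/**
 * xe_vm_validate_protected() - Check PXP keys of protected BOs mapped in a VM
 * @vm: The VM to check, may be NULL.
 *
 * Walk all mappings in the VM under the snap_mutex and, for every mapping
 * backed by a PXP-protected BO, check that the BO's PXP key is still valid.
 *
 * Return: 0 on success, -ENODEV if @vm is NULL, or a negative error code
 * propagated from xe_pxp_bo_key_check().
 */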
3917 int xe_vm_validate_protected(struct xe_vm *vm)
3918 {
3919 	struct drm_gpuva *gpuva;
3920 	int err = 0;
3921 
3922 	if (!vm)
3923 		return -ENODEV;
3924 
3925 	mutex_lock(&vm->snap_mutex);
3926 
3927 	drm_gpuvm_for_each_va(gpuva, &vm->gpuvm) {
3928 		struct xe_vma *vma = gpuva_to_vma(gpuva);
3929 		struct xe_bo *bo = vma->gpuva.gem.obj ?
3930 			gem_to_xe_bo(vma->gpuva.gem.obj) : NULL;
3931 
3932 		if (!bo)
3933 			continue;
3934 
3935 		if (xe_bo_is_protected(bo)) {
3936 			err = xe_pxp_bo_key_check(vm->xe->pxp, bo);
3937 			if (err)
3938 				break;
3939 		}
3940 	}
3941 
3942 	mutex_unlock(&vm->snap_mutex);
3943 	return err;
3944 }
3945 
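/*
 * struct xe_vm_snapshot describes a two-stage capture of a VM's dumpable
 * mappings: xe_vm_snapshot_capture() only records offsets, sizes and
 * references to the backing BO or userptr mm, while
 * xe_vm_snapshot_capture_delayed() later copies the actual contents into each
 * entry's data buffer. A sketch of the expected consumer flow, assuming the
 * caller already has a VM and a drm_printer 'p':
 *
 *	struct xe_vm_snapshot *snap = xe_vm_snapshot_capture(vm);
 *
 *	xe_vm_snapshot_capture_delayed(snap);
 *	xe_vm_snapshot_print(snap, p);
 *	xe_vm_snapshot_free(snap);
 *
 * All four functions accept NULL or error pointers, so no explicit error
 * handling is needed between the calls.
 */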
3946 struct xe_vm_snapshot {
3947 	unsigned long num_snaps;
3948 	struct {
3949 		u64 ofs, bo_ofs;
3950 		unsigned long len;
3951 		struct xe_bo *bo;
3952 		void *data;
3953 		struct mm_struct *mm;
3954 	} snap[];
3955 };
3956 
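/**
 * xe_vm_snapshot_capture() - Capture a snapshot of a VM's dumpable mappings
 * @vm: The VM to snapshot, may be NULL.
 *
 * Under the vm's snap_mutex, record the GPU offset, size and a reference to
 * the backing BO (or, for userptrs, the user offset and a reference to the
 * owning mm) of every dumpable VMA. The mapping contents are not copied here;
 * that is deferred to xe_vm_snapshot_capture_delayed(). The snapshot itself
 * is allocated with GFP_NOWAIT and may therefore fail under memory pressure.
 *
 * Return: Pointer to the snapshot on success, NULL if @vm is NULL,
 * ERR_PTR(-ENODEV) if there are no dumpable VMAs, or ERR_PTR(-ENOMEM) on
 * allocation failure.
 */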
3957 struct xe_vm_snapshot *xe_vm_snapshot_capture(struct xe_vm *vm)
3958 {
3959 	unsigned long num_snaps = 0, i;
3960 	struct xe_vm_snapshot *snap = NULL;
3961 	struct drm_gpuva *gpuva;
3962 
3963 	if (!vm)
3964 		return NULL;
3965 
3966 	mutex_lock(&vm->snap_mutex);
3967 	drm_gpuvm_for_each_va(gpuva, &vm->gpuvm) {
3968 		if (gpuva->flags & XE_VMA_DUMPABLE)
3969 			num_snaps++;
3970 	}
3971 
3972 	if (num_snaps)
3973 		snap = kvzalloc(offsetof(struct xe_vm_snapshot, snap[num_snaps]), GFP_NOWAIT);
3974 	if (!snap) {
3975 		snap = num_snaps ? ERR_PTR(-ENOMEM) : ERR_PTR(-ENODEV);
3976 		goto out_unlock;
3977 	}
3978 
3979 	snap->num_snaps = num_snaps;
3980 	i = 0;
3981 	drm_gpuvm_for_each_va(gpuva, &vm->gpuvm) {
3982 		struct xe_vma *vma = gpuva_to_vma(gpuva);
3983 		struct xe_bo *bo = vma->gpuva.gem.obj ?
3984 			gem_to_xe_bo(vma->gpuva.gem.obj) : NULL;
3985 
3986 		if (!(gpuva->flags & XE_VMA_DUMPABLE))
3987 			continue;
3988 
3989 		snap->snap[i].ofs = xe_vma_start(vma);
3990 		snap->snap[i].len = xe_vma_size(vma);
3991 		if (bo) {
3992 			snap->snap[i].bo = xe_bo_get(bo);
3993 			snap->snap[i].bo_ofs = xe_vma_bo_offset(vma);
3994 		} else if (xe_vma_is_userptr(vma)) {
3995 			struct mm_struct *mm =
3996 				to_userptr_vma(vma)->userptr.notifier.mm;
3997 
3998 			if (mmget_not_zero(mm))
3999 				snap->snap[i].mm = mm;
4000 			else
4001 				snap->snap[i].data = ERR_PTR(-EFAULT);
4002 
4003 			snap->snap[i].bo_ofs = xe_vma_userptr(vma);
4004 		} else {
4005 			snap->snap[i].data = ERR_PTR(-ENOENT);
4006 		}
4007 		i++;
4008 	}
4009 
4010 out_unlock:
4011 	mutex_unlock(&vm->snap_mutex);
4012 	return snap;
4013 }
4014 
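/**
 * xe_vm_snapshot_capture_delayed() - Copy the contents of a captured snapshot
 * @snap: Snapshot returned by xe_vm_snapshot_capture(), may be an error
 *	  pointer or NULL.
 *
 * For each entry recorded by xe_vm_snapshot_capture(), allocate a buffer and
 * copy the mapping contents from the backing BO or, for userptrs, from the
 * captured mm. Per-entry failures are stored as error pointers in the entry's
 * data field and reported by xe_vm_snapshot_print(). References taken at
 * capture time are normally dropped here; xe_vm_snapshot_free() cleans up
 * whatever remains on error.
 */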
4015 void xe_vm_snapshot_capture_delayed(struct xe_vm_snapshot *snap)
4016 {
4017 	if (IS_ERR_OR_NULL(snap))
4018 		return;
4019 
4020 	for (int i = 0; i < snap->num_snaps; i++) {
4021 		struct xe_bo *bo = snap->snap[i].bo;
4022 		int err;
4023 
4024 		if (IS_ERR(snap->snap[i].data))
4025 			continue;
4026 
4027 		snap->snap[i].data = kvmalloc(snap->snap[i].len, GFP_USER);
4028 		if (!snap->snap[i].data) {
4029 			snap->snap[i].data = ERR_PTR(-ENOMEM);
4030 			goto cleanup_bo;
4031 		}
4032 
4033 		if (bo) {
4034 			err = xe_bo_read(bo, snap->snap[i].bo_ofs,
4035 					 snap->snap[i].data, snap->snap[i].len);
4036 		} else {
4037 			void __user *userptr = (void __user *)(size_t)snap->snap[i].bo_ofs;
4038 
4039 			kthread_use_mm(snap->snap[i].mm);
4040 			if (!copy_from_user(snap->snap[i].data, userptr, snap->snap[i].len))
4041 				err = 0;
4042 			else
4043 				err = -EFAULT;
4044 			kthread_unuse_mm(snap->snap[i].mm);
4045 
4046 			mmput(snap->snap[i].mm);
4047 			snap->snap[i].mm = NULL;
4048 		}
4049 
4050 		if (err) {
4051 			kvfree(snap->snap[i].data);
4052 			snap->snap[i].data = ERR_PTR(err);
4053 		}
4054 
4055 cleanup_bo:
4056 		xe_bo_put(bo);
4057 		snap->snap[i].bo = NULL;
4058 	}
4059 }
4060 
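/**
 * xe_vm_snapshot_print() - Print a VM snapshot to a drm_printer
 * @snap: Snapshot to print, may be an error pointer or NULL.
 * @p: The drm_printer to print to.
 *
 * Print each entry's GPU offset, length and ASCII85-encoded contents, or the
 * per-entry error if the contents could not be captured. Printing stops early
 * if a coredump printer runs out of space.
 */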
4061 void xe_vm_snapshot_print(struct xe_vm_snapshot *snap, struct drm_printer *p)
4062 {
4063 	unsigned long i, j;
4064 
4065 	if (IS_ERR_OR_NULL(snap)) {
4066 		drm_printf(p, "[0].error: %li\n", PTR_ERR(snap));
4067 		return;
4068 	}
4069 
4070 	for (i = 0; i < snap->num_snaps; i++) {
4071 		drm_printf(p, "[%llx].length: 0x%lx\n", snap->snap[i].ofs, snap->snap[i].len);
4072 
4073 		if (IS_ERR(snap->snap[i].data)) {
4074 			drm_printf(p, "[%llx].error: %li\n", snap->snap[i].ofs,
4075 				   PTR_ERR(snap->snap[i].data));
4076 			continue;
4077 		}
4078 
4079 		drm_printf(p, "[%llx].data: ", snap->snap[i].ofs);
4080 
4081 		for (j = 0; j < snap->snap[i].len; j += sizeof(u32)) {
4082 			u32 *val = snap->snap[i].data + j;
4083 			char dumped[ASCII85_BUFSZ];
4084 
4085 			drm_puts(p, ascii85_encode(*val, dumped));
4086 		}
4087 
4088 		drm_puts(p, "\n");
4089 
4090 		if (drm_coredump_printer_is_full(p))
4091 			return;
4092 	}
4093 }
4094 
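/**
 * xe_vm_snapshot_free() - Free a VM snapshot
 * @snap: Snapshot to free, may be an error pointer or NULL.
 *
 * Free the per-entry data buffers, drop any BO or mm references still held by
 * the snapshot, and free the snapshot itself.
 */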
4095 void xe_vm_snapshot_free(struct xe_vm_snapshot *snap)
4096 {
4097 	unsigned long i;
4098 
4099 	if (IS_ERR_OR_NULL(snap))
4100 		return;
4101 
4102 	for (i = 0; i < snap->num_snaps; i++) {
4103 		if (!IS_ERR(snap->snap[i].data))
4104 			kvfree(snap->snap[i].data);
4105 		xe_bo_put(snap->snap[i].bo);
4106 		if (snap->snap[i].mm)
4107 			mmput(snap->snap[i].mm);
4108 	}
4109 	kvfree(snap);
4110 }
4111