xref: /linux/drivers/gpu/drm/xe/xe_vm.c (revision 31eea29d727ce35b747e68c6be350ca07b7ecd9b)
1 // SPDX-License-Identifier: MIT
2 /*
3  * Copyright © 2021 Intel Corporation
4  */
5 
6 #include "xe_vm.h"
7 
8 #include <linux/dma-fence-array.h>
9 #include <linux/nospec.h>
10 
11 #include <drm/drm_drv.h>
12 #include <drm/drm_exec.h>
13 #include <drm/drm_print.h>
14 #include <drm/ttm/ttm_tt.h>
15 #include <uapi/drm/xe_drm.h>
16 #include <linux/ascii85.h>
17 #include <linux/delay.h>
18 #include <linux/kthread.h>
19 #include <linux/mm.h>
20 #include <linux/swap.h>
21 
22 #include <generated/xe_wa_oob.h>
23 
24 #include "regs/xe_gtt_defs.h"
25 #include "xe_assert.h"
26 #include "xe_bo.h"
27 #include "xe_device.h"
28 #include "xe_drm_client.h"
29 #include "xe_exec_queue.h"
30 #include "xe_gt_pagefault.h"
31 #include "xe_gt_tlb_invalidation.h"
32 #include "xe_migrate.h"
33 #include "xe_pat.h"
34 #include "xe_pm.h"
35 #include "xe_preempt_fence.h"
36 #include "xe_pt.h"
37 #include "xe_pxp.h"
38 #include "xe_res_cursor.h"
39 #include "xe_svm.h"
40 #include "xe_sync.h"
41 #include "xe_trace_bo.h"
42 #include "xe_wa.h"
43 #include "xe_hmm.h"
44 
45 static struct drm_gem_object *xe_vm_obj(struct xe_vm *vm)
46 {
47 	return vm->gpuvm.r_obj;
48 }
49 
50 /**
51  * xe_vma_userptr_check_repin() - Advisory check for repin needed
52  * @uvma: The userptr vma
53  *
54  * Check if the userptr vma has been invalidated since last successful
55  * repin. The check is advisory only and can the function can be called
56  * without the vm->userptr.notifier_lock held. There is no guarantee that the
57  * vma userptr will remain valid after a lockless check, so typically
58  * the call needs to be followed by a proper check under the notifier_lock.
59  *
60  * Return: 0 if userptr vma is valid, -EAGAIN otherwise; repin recommended.
61  */
62 int xe_vma_userptr_check_repin(struct xe_userptr_vma *uvma)
63 {
64 	return mmu_interval_check_retry(&uvma->userptr.notifier,
65 					uvma->userptr.notifier_seq) ?
66 		-EAGAIN : 0;
67 }
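
/*
 * Illustrative sketch (editor's note, not driver code): the advisory
 * check is typically paired with a repin outside the notifier lock and
 * an authoritative re-check under it, roughly:
 *
 *	if (xe_vm_userptr_check_repin(vm))
 *		err = xe_vm_userptr_pin(vm);
 *	...
 *	down_read(&vm->userptr.notifier_lock);
 *	err = __xe_vm_userptr_needs_repin(vm);
 *	up_read(&vm->userptr.notifier_lock);
 *
 * See preempt_rebind_work_func() for the full pattern used by the rebind
 * worker.
 */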
68 
69 int xe_vma_userptr_pin_pages(struct xe_userptr_vma *uvma)
70 {
71 	struct xe_vma *vma = &uvma->vma;
72 	struct xe_vm *vm = xe_vma_vm(vma);
73 	struct xe_device *xe = vm->xe;
74 
75 	lockdep_assert_held(&vm->lock);
76 	xe_assert(xe, xe_vma_is_userptr(vma));
77 
78 	return xe_hmm_userptr_populate_range(uvma, false);
79 }
80 
81 static bool preempt_fences_waiting(struct xe_vm *vm)
82 {
83 	struct xe_exec_queue *q;
84 
85 	lockdep_assert_held(&vm->lock);
86 	xe_vm_assert_held(vm);
87 
88 	list_for_each_entry(q, &vm->preempt.exec_queues, lr.link) {
89 		if (!q->lr.pfence ||
90 		    test_bit(DMA_FENCE_FLAG_ENABLE_SIGNAL_BIT,
91 			     &q->lr.pfence->flags)) {
92 			return true;
93 		}
94 	}
95 
96 	return false;
97 }
98 
99 static void free_preempt_fences(struct list_head *list)
100 {
101 	struct list_head *link, *next;
102 
103 	list_for_each_safe(link, next, list)
104 		xe_preempt_fence_free(to_preempt_fence_from_link(link));
105 }
106 
107 static int alloc_preempt_fences(struct xe_vm *vm, struct list_head *list,
108 				unsigned int *count)
109 {
110 	lockdep_assert_held(&vm->lock);
111 	xe_vm_assert_held(vm);
112 
113 	if (*count >= vm->preempt.num_exec_queues)
114 		return 0;
115 
116 	for (; *count < vm->preempt.num_exec_queues; ++(*count)) {
117 		struct xe_preempt_fence *pfence = xe_preempt_fence_alloc();
118 
119 		if (IS_ERR(pfence))
120 			return PTR_ERR(pfence);
121 
122 		list_move_tail(xe_preempt_fence_link(pfence), list);
123 	}
124 
125 	return 0;
126 }
127 
128 static int wait_for_existing_preempt_fences(struct xe_vm *vm)
129 {
130 	struct xe_exec_queue *q;
131 
132 	xe_vm_assert_held(vm);
133 
134 	list_for_each_entry(q, &vm->preempt.exec_queues, lr.link) {
135 		if (q->lr.pfence) {
136 			long timeout = dma_fence_wait(q->lr.pfence, false);
137 
138 			/* Only -ETIME on fence indicates VM needs to be killed */
139 			if (timeout < 0 || q->lr.pfence->error == -ETIME)
140 				return -ETIME;
141 
142 			dma_fence_put(q->lr.pfence);
143 			q->lr.pfence = NULL;
144 		}
145 	}
146 
147 	return 0;
148 }
149 
150 static bool xe_vm_is_idle(struct xe_vm *vm)
151 {
152 	struct xe_exec_queue *q;
153 
154 	xe_vm_assert_held(vm);
155 	list_for_each_entry(q, &vm->preempt.exec_queues, lr.link) {
156 		if (!xe_exec_queue_is_idle(q))
157 			return false;
158 	}
159 
160 	return true;
161 }
162 
163 static void arm_preempt_fences(struct xe_vm *vm, struct list_head *list)
164 {
165 	struct list_head *link;
166 	struct xe_exec_queue *q;
167 
168 	list_for_each_entry(q, &vm->preempt.exec_queues, lr.link) {
169 		struct dma_fence *fence;
170 
171 		link = list->next;
172 		xe_assert(vm->xe, link != list);
173 
174 		fence = xe_preempt_fence_arm(to_preempt_fence_from_link(link),
175 					     q, q->lr.context,
176 					     ++q->lr.seqno);
177 		dma_fence_put(q->lr.pfence);
178 		q->lr.pfence = fence;
179 	}
180 }
181 
182 static int add_preempt_fences(struct xe_vm *vm, struct xe_bo *bo)
183 {
184 	struct xe_exec_queue *q;
185 	int err;
186 
187 	xe_bo_assert_held(bo);
188 
189 	if (!vm->preempt.num_exec_queues)
190 		return 0;
191 
192 	err = dma_resv_reserve_fences(bo->ttm.base.resv, vm->preempt.num_exec_queues);
193 	if (err)
194 		return err;
195 
196 	list_for_each_entry(q, &vm->preempt.exec_queues, lr.link)
197 		if (q->lr.pfence) {
198 			dma_resv_add_fence(bo->ttm.base.resv,
199 					   q->lr.pfence,
200 					   DMA_RESV_USAGE_BOOKKEEP);
201 		}
202 
203 	return 0;
204 }
205 
206 static void resume_and_reinstall_preempt_fences(struct xe_vm *vm,
207 						struct drm_exec *exec)
208 {
209 	struct xe_exec_queue *q;
210 
211 	lockdep_assert_held(&vm->lock);
212 	xe_vm_assert_held(vm);
213 
214 	list_for_each_entry(q, &vm->preempt.exec_queues, lr.link) {
215 		q->ops->resume(q);
216 
217 		drm_gpuvm_resv_add_fence(&vm->gpuvm, exec, q->lr.pfence,
218 					 DMA_RESV_USAGE_BOOKKEEP, DMA_RESV_USAGE_BOOKKEEP);
219 	}
220 }
221 
222 int xe_vm_add_compute_exec_queue(struct xe_vm *vm, struct xe_exec_queue *q)
223 {
224 	struct drm_gpuvm_exec vm_exec = {
225 		.vm = &vm->gpuvm,
226 		.flags = DRM_EXEC_INTERRUPTIBLE_WAIT,
227 		.num_fences = 1,
228 	};
229 	struct drm_exec *exec = &vm_exec.exec;
230 	struct dma_fence *pfence;
231 	int err;
232 	bool wait;
233 
234 	xe_assert(vm->xe, xe_vm_in_preempt_fence_mode(vm));
235 
236 	down_write(&vm->lock);
237 	err = drm_gpuvm_exec_lock(&vm_exec);
238 	if (err)
239 		goto out_up_write;
240 
241 	pfence = xe_preempt_fence_create(q, q->lr.context,
242 					 ++q->lr.seqno);
243 	if (!pfence) {
244 		err = -ENOMEM;
245 		goto out_fini;
246 	}
247 
248 	list_add(&q->lr.link, &vm->preempt.exec_queues);
249 	++vm->preempt.num_exec_queues;
250 	q->lr.pfence = pfence;
251 
252 	down_read(&vm->userptr.notifier_lock);
253 
254 	drm_gpuvm_resv_add_fence(&vm->gpuvm, exec, pfence,
255 				 DMA_RESV_USAGE_BOOKKEEP, DMA_RESV_USAGE_BOOKKEEP);
256 
257 	/*
258 	 * Check to see if a preemption on the VM or a userptr invalidation
259 	 * is in flight; if so, trigger this preempt fence to sync state with
260 	 * other preempt fences on the VM.
261 	 */
262 	wait = __xe_vm_userptr_needs_repin(vm) || preempt_fences_waiting(vm);
263 	if (wait)
264 		dma_fence_enable_sw_signaling(pfence);
265 
266 	up_read(&vm->userptr.notifier_lock);
267 
268 out_fini:
269 	drm_exec_fini(exec);
270 out_up_write:
271 	up_write(&vm->lock);
272 
273 	return err;
274 }
275 ALLOW_ERROR_INJECTION(xe_vm_add_compute_exec_queue, ERRNO);
276 
277 /**
278  * xe_vm_remove_compute_exec_queue() - Remove compute exec queue from VM
279  * @vm: The VM.
280  * @q: The exec_queue
281  *
282  * Note that this function might be called multiple times on the same queue.
283  */
284 void xe_vm_remove_compute_exec_queue(struct xe_vm *vm, struct xe_exec_queue *q)
285 {
286 	if (!xe_vm_in_preempt_fence_mode(vm))
287 		return;
288 
289 	down_write(&vm->lock);
290 	if (!list_empty(&q->lr.link)) {
291 		list_del_init(&q->lr.link);
292 		--vm->preempt.num_exec_queues;
293 	}
294 	if (q->lr.pfence) {
295 		dma_fence_enable_sw_signaling(q->lr.pfence);
296 		dma_fence_put(q->lr.pfence);
297 		q->lr.pfence = NULL;
298 	}
299 	up_write(&vm->lock);
300 }
301 
302 /**
303  * __xe_vm_userptr_needs_repin() - Check whether the VM does have userptrs
304  * that need repinning.
305  * @vm: The VM.
306  *
307  * This function checks whether the VM has userptrs that need repinning,
308  * and provides a release-type barrier on the userptr.notifier_lock after
309  * checking.
310  *
311  * Return: 0 if there are no userptrs needing repinning, -EAGAIN if there are.
312  */
313 int __xe_vm_userptr_needs_repin(struct xe_vm *vm)
314 {
315 	lockdep_assert_held_read(&vm->userptr.notifier_lock);
316 
317 	return (list_empty(&vm->userptr.repin_list) &&
318 		list_empty(&vm->userptr.invalidated)) ? 0 : -EAGAIN;
319 }
320 
321 #define XE_VM_REBIND_RETRY_TIMEOUT_MS 1000
322 
323 /**
324  * xe_vm_kill() - VM Kill
325  * @vm: The VM.
326  * @unlocked: Flag indicating the VM's dma-resv is not held
327  *
328  * Kill the VM by setting the banned flag, indicating the VM is no longer available
329  * for use. If in preempt fence mode, also kill all exec queues attached to the VM.
330  */
331 void xe_vm_kill(struct xe_vm *vm, bool unlocked)
332 {
333 	struct xe_exec_queue *q;
334 
335 	lockdep_assert_held(&vm->lock);
336 
337 	if (unlocked)
338 		xe_vm_lock(vm, false);
339 
340 	vm->flags |= XE_VM_FLAG_BANNED;
341 	trace_xe_vm_kill(vm);
342 
343 	list_for_each_entry(q, &vm->preempt.exec_queues, lr.link)
344 		q->ops->kill(q);
345 
346 	if (unlocked)
347 		xe_vm_unlock(vm);
348 
349 	/* TODO: Inform user the VM is banned */
350 }
351 
352 /**
353  * xe_vm_validate_should_retry() - Whether to retry after a validate error.
354  * @exec: The drm_exec object used for locking before validation.
355  * @err: The error returned from ttm_bo_validate().
356  * @end: A ktime_t cookie that should be set to 0 before first use and
357  * that should be reused on subsequent calls.
358  *
359  * With multiple active VMs, under memory pressure, it is possible that
360  * ttm_bo_validate() runs into -EDEADLK and in such a case returns -ENOMEM.
361  * Until ttm properly handles locking in such scenarios, the best thing the
362  * driver can do is retry with a timeout. Check if that is necessary, and
363  * if so unlock the drm_exec's objects while keeping the ticket to prepare
364  * for a rerun.
365  *
366  * Return: true if a retry after drm_exec_init() is recommended;
367  * false otherwise.
368  */
369 bool xe_vm_validate_should_retry(struct drm_exec *exec, int err, ktime_t *end)
370 {
371 	ktime_t cur;
372 
373 	if (err != -ENOMEM)
374 		return false;
375 
376 	cur = ktime_get();
377 	*end = *end ? : ktime_add_ms(cur, XE_VM_REBIND_RETRY_TIMEOUT_MS);
378 	if (!ktime_before(cur, *end))
379 		return false;
380 
381 	msleep(20);
382 	return true;
383 }
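
/*
 * Illustrative sketch (editor's note, not driver code): callers keep a
 * single ktime_t cookie across retries, roughly:
 *
 *	ktime_t end = 0;
 *
 *	retry:
 *	drm_exec_init(&exec, DRM_EXEC_INTERRUPTIBLE_WAIT, 0);
 *	err = lock_and_validate(&exec);
 *	drm_exec_fini(&exec);
 *	if (err && xe_vm_validate_should_retry(&exec, err, &end))
 *		goto retry;
 *
 * where lock_and_validate() stands in for the caller's drm_exec loop; see
 * preempt_rebind_work_func() for the real pattern used by the rebind worker.
 */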
384 
385 static int xe_gpuvm_validate(struct drm_gpuvm_bo *vm_bo, struct drm_exec *exec)
386 {
387 	struct xe_vm *vm = gpuvm_to_vm(vm_bo->vm);
388 	struct drm_gpuva *gpuva;
389 	int ret;
390 
391 	lockdep_assert_held(&vm->lock);
392 	drm_gpuvm_bo_for_each_va(gpuva, vm_bo)
393 		list_move_tail(&gpuva_to_vma(gpuva)->combined_links.rebind,
394 			       &vm->rebind_list);
395 
396 	ret = xe_bo_validate(gem_to_xe_bo(vm_bo->obj), vm, false);
397 	if (ret)
398 		return ret;
399 
400 	vm_bo->evicted = false;
401 	return 0;
402 }
403 
404 /**
405  * xe_vm_validate_rebind() - Validate buffer objects and rebind vmas
406  * @vm: The vm for which we are rebinding.
407  * @exec: The struct drm_exec with the locked GEM objects.
408  * @num_fences: The number of fences to reserve for the operation, not
409  * including rebinds and validations.
410  *
411  * Validates all evicted gem objects and rebinds their vmas. Note that
412  * rebindings may cause evictions and hence the validation-rebind
413  * sequence is rerun until there are no more objects to validate.
414  *
415  * Return: 0 on success, negative error code on error. In particular,
416  * may return -EINTR or -ERESTARTSYS if interrupted, and -EDEADLK if
417  * the drm_exec transaction needs to be restarted.
418  */
419 int xe_vm_validate_rebind(struct xe_vm *vm, struct drm_exec *exec,
420 			  unsigned int num_fences)
421 {
422 	struct drm_gem_object *obj;
423 	unsigned long index;
424 	int ret;
425 
426 	do {
427 		ret = drm_gpuvm_validate(&vm->gpuvm, exec);
428 		if (ret)
429 			return ret;
430 
431 		ret = xe_vm_rebind(vm, false);
432 		if (ret)
433 			return ret;
434 	} while (!list_empty(&vm->gpuvm.evict.list));
435 
436 	drm_exec_for_each_locked_object(exec, index, obj) {
437 		ret = dma_resv_reserve_fences(obj->resv, num_fences);
438 		if (ret)
439 			return ret;
440 	}
441 
442 	return 0;
443 }
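
/*
 * Illustrative sketch (editor's note, not driver code): this is normally
 * called from within a drm_exec_until_all_locked() loop so that -EDEADLK
 * restarts the transaction, roughly:
 *
 *	drm_exec_until_all_locked(&exec) {
 *		err = drm_gpuvm_prepare_vm(&vm->gpuvm, &exec, 0);
 *		if (!err)
 *			err = xe_vm_validate_rebind(vm, &exec, num_fences);
 *		drm_exec_retry_on_contention(&exec);
 *	}
 *
 * xe_preempt_work_begin() below follows this pattern.
 */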
444 
445 static int xe_preempt_work_begin(struct drm_exec *exec, struct xe_vm *vm,
446 				 bool *done)
447 {
448 	int err;
449 
450 	err = drm_gpuvm_prepare_vm(&vm->gpuvm, exec, 0);
451 	if (err)
452 		return err;
453 
454 	if (xe_vm_is_idle(vm)) {
455 		vm->preempt.rebind_deactivated = true;
456 		*done = true;
457 		return 0;
458 	}
459 
460 	if (!preempt_fences_waiting(vm)) {
461 		*done = true;
462 		return 0;
463 	}
464 
465 	err = drm_gpuvm_prepare_objects(&vm->gpuvm, exec, 0);
466 	if (err)
467 		return err;
468 
469 	err = wait_for_existing_preempt_fences(vm);
470 	if (err)
471 		return err;
472 
473 	/*
474 	 * Add validation and rebinding to the locking loop since both can
475 	 * cause evictions which may require blocking dma_resv locks.
476 	 * The fence reservation here is intended for the new preempt fences
477 	 * we attach at the end of the rebind work.
478 	 */
479 	return xe_vm_validate_rebind(vm, exec, vm->preempt.num_exec_queues);
480 }
481 
482 static void preempt_rebind_work_func(struct work_struct *w)
483 {
484 	struct xe_vm *vm = container_of(w, struct xe_vm, preempt.rebind_work);
485 	struct drm_exec exec;
486 	unsigned int fence_count = 0;
487 	LIST_HEAD(preempt_fences);
488 	ktime_t end = 0;
489 	int err = 0;
490 	long wait;
491 	int __maybe_unused tries = 0;
492 
493 	xe_assert(vm->xe, xe_vm_in_preempt_fence_mode(vm));
494 	trace_xe_vm_rebind_worker_enter(vm);
495 
496 	down_write(&vm->lock);
497 
498 	if (xe_vm_is_closed_or_banned(vm)) {
499 		up_write(&vm->lock);
500 		trace_xe_vm_rebind_worker_exit(vm);
501 		return;
502 	}
503 
504 retry:
505 	if (xe_vm_userptr_check_repin(vm)) {
506 		err = xe_vm_userptr_pin(vm);
507 		if (err)
508 			goto out_unlock_outer;
509 	}
510 
511 	drm_exec_init(&exec, DRM_EXEC_INTERRUPTIBLE_WAIT, 0);
512 
513 	drm_exec_until_all_locked(&exec) {
514 		bool done = false;
515 
516 		err = xe_preempt_work_begin(&exec, vm, &done);
517 		drm_exec_retry_on_contention(&exec);
518 		if (err || done) {
519 			drm_exec_fini(&exec);
520 			if (err && xe_vm_validate_should_retry(&exec, err, &end))
521 				err = -EAGAIN;
522 
523 			goto out_unlock_outer;
524 		}
525 	}
526 
527 	err = alloc_preempt_fences(vm, &preempt_fences, &fence_count);
528 	if (err)
529 		goto out_unlock;
530 
531 	err = xe_vm_rebind(vm, true);
532 	if (err)
533 		goto out_unlock;
534 
535 	/* Wait on rebinds and munmap style VM unbinds */
536 	wait = dma_resv_wait_timeout(xe_vm_resv(vm),
537 				     DMA_RESV_USAGE_KERNEL,
538 				     false, MAX_SCHEDULE_TIMEOUT);
539 	if (wait <= 0) {
540 		err = -ETIME;
541 		goto out_unlock;
542 	}
543 
544 #define retry_required(__tries, __vm) \
545 	(IS_ENABLED(CONFIG_DRM_XE_USERPTR_INVAL_INJECT) ? \
546 	(!(__tries)++ || __xe_vm_userptr_needs_repin(__vm)) : \
547 	__xe_vm_userptr_needs_repin(__vm))
548 
549 	down_read(&vm->userptr.notifier_lock);
550 	if (retry_required(tries, vm)) {
551 		up_read(&vm->userptr.notifier_lock);
552 		err = -EAGAIN;
553 		goto out_unlock;
554 	}
555 
556 #undef retry_required
557 
558 	spin_lock(&vm->xe->ttm.lru_lock);
559 	ttm_lru_bulk_move_tail(&vm->lru_bulk_move);
560 	spin_unlock(&vm->xe->ttm.lru_lock);
561 
562 	/* Point of no return. */
563 	arm_preempt_fences(vm, &preempt_fences);
564 	resume_and_reinstall_preempt_fences(vm, &exec);
565 	up_read(&vm->userptr.notifier_lock);
566 
567 out_unlock:
568 	drm_exec_fini(&exec);
569 out_unlock_outer:
570 	if (err == -EAGAIN) {
571 		trace_xe_vm_rebind_worker_retry(vm);
572 		goto retry;
573 	}
574 
575 	if (err) {
576 		drm_warn(&vm->xe->drm, "VM worker error: %d\n", err);
577 		xe_vm_kill(vm, true);
578 	}
579 	up_write(&vm->lock);
580 
581 	free_preempt_fences(&preempt_fences);
582 
583 	trace_xe_vm_rebind_worker_exit(vm);
584 }
585 
586 static void __vma_userptr_invalidate(struct xe_vm *vm, struct xe_userptr_vma *uvma)
587 {
588 	struct xe_userptr *userptr = &uvma->userptr;
589 	struct xe_vma *vma = &uvma->vma;
590 	struct dma_resv_iter cursor;
591 	struct dma_fence *fence;
592 	long err;
593 
594 	/*
595 	 * Tell exec and rebind worker they need to repin and rebind this
596 	 * userptr.
597 	 */
598 	if (!xe_vm_in_fault_mode(vm) &&
599 	    !(vma->gpuva.flags & XE_VMA_DESTROYED)) {
600 		spin_lock(&vm->userptr.invalidated_lock);
601 		list_move_tail(&userptr->invalidate_link,
602 			       &vm->userptr.invalidated);
603 		spin_unlock(&vm->userptr.invalidated_lock);
604 	}
605 
606 	/*
607 	 * Preempt fences turn into schedule disables, pipeline these.
608 	 * Note that even in fault mode, we need to wait for binds and
609 	 * unbinds to complete, and those are attached as BOOKKEEP fences
610 	 * to the vm.
611 	 */
612 	dma_resv_iter_begin(&cursor, xe_vm_resv(vm),
613 			    DMA_RESV_USAGE_BOOKKEEP);
614 	dma_resv_for_each_fence_unlocked(&cursor, fence)
615 		dma_fence_enable_sw_signaling(fence);
616 	dma_resv_iter_end(&cursor);
617 
618 	err = dma_resv_wait_timeout(xe_vm_resv(vm),
619 				    DMA_RESV_USAGE_BOOKKEEP,
620 				    false, MAX_SCHEDULE_TIMEOUT);
621 	XE_WARN_ON(err <= 0);
622 
623 	if (xe_vm_in_fault_mode(vm) && userptr->initial_bind) {
624 		err = xe_vm_invalidate_vma(vma);
625 		XE_WARN_ON(err);
626 	}
627 
628 	xe_hmm_userptr_unmap(uvma);
629 }
630 
631 static bool vma_userptr_invalidate(struct mmu_interval_notifier *mni,
632 				   const struct mmu_notifier_range *range,
633 				   unsigned long cur_seq)
634 {
635 	struct xe_userptr_vma *uvma = container_of(mni, typeof(*uvma), userptr.notifier);
636 	struct xe_vma *vma = &uvma->vma;
637 	struct xe_vm *vm = xe_vma_vm(vma);
638 
639 	xe_assert(vm->xe, xe_vma_is_userptr(vma));
640 	trace_xe_vma_userptr_invalidate(vma);
641 
642 	if (!mmu_notifier_range_blockable(range))
643 		return false;
644 
645 	vm_dbg(&xe_vma_vm(vma)->xe->drm,
646 	       "NOTIFIER: addr=0x%016llx, range=0x%016llx",
647 		xe_vma_start(vma), xe_vma_size(vma));
648 
649 	down_write(&vm->userptr.notifier_lock);
650 	mmu_interval_set_seq(mni, cur_seq);
651 
652 	__vma_userptr_invalidate(vm, uvma);
653 	up_write(&vm->userptr.notifier_lock);
654 	trace_xe_vma_userptr_invalidate_complete(vma);
655 
656 	return true;
657 }
658 
659 static const struct mmu_interval_notifier_ops vma_userptr_notifier_ops = {
660 	.invalidate = vma_userptr_invalidate,
661 };
662 
663 #if IS_ENABLED(CONFIG_DRM_XE_USERPTR_INVAL_INJECT)
664 /**
665  * xe_vma_userptr_force_invalidate() - force invalidate a userptr
666  * @uvma: The userptr vma to invalidate
667  *
668  * Perform a forced userptr invalidation for testing purposes.
669  */
670 void xe_vma_userptr_force_invalidate(struct xe_userptr_vma *uvma)
671 {
672 	struct xe_vm *vm = xe_vma_vm(&uvma->vma);
673 
674 	/* Protect against concurrent userptr pinning */
675 	lockdep_assert_held(&vm->lock);
676 	/* Protect against concurrent notifiers */
677 	lockdep_assert_held(&vm->userptr.notifier_lock);
678 	/*
679 	 * Protect against concurrent instances of this function and
680 	 * the critical exec sections
681 	 */
682 	xe_vm_assert_held(vm);
683 
684 	if (!mmu_interval_read_retry(&uvma->userptr.notifier,
685 				     uvma->userptr.notifier_seq))
686 		uvma->userptr.notifier_seq -= 2;
687 	__vma_userptr_invalidate(vm, uvma);
688 }
689 #endif
690 
691 int xe_vm_userptr_pin(struct xe_vm *vm)
692 {
693 	struct xe_userptr_vma *uvma, *next;
694 	int err = 0;
695 
696 	xe_assert(vm->xe, !xe_vm_in_fault_mode(vm));
697 	lockdep_assert_held_write(&vm->lock);
698 
699 	/* Collect invalidated userptrs */
700 	spin_lock(&vm->userptr.invalidated_lock);
701 	xe_assert(vm->xe, list_empty(&vm->userptr.repin_list));
702 	list_for_each_entry_safe(uvma, next, &vm->userptr.invalidated,
703 				 userptr.invalidate_link) {
704 		list_del_init(&uvma->userptr.invalidate_link);
705 		list_add_tail(&uvma->userptr.repin_link,
706 			      &vm->userptr.repin_list);
707 	}
708 	spin_unlock(&vm->userptr.invalidated_lock);
709 
710 	/* Pin and move to bind list */
711 	list_for_each_entry_safe(uvma, next, &vm->userptr.repin_list,
712 				 userptr.repin_link) {
713 		err = xe_vma_userptr_pin_pages(uvma);
714 		if (err == -EFAULT) {
715 			list_del_init(&uvma->userptr.repin_link);
716 			/*
717 			 * We might have already done the pin once, but then had
718 			 * to retry before the re-bind happened, due to some
719 			 * other condition in the caller, but in the
720 			 * meantime the userptr got dinged by the notifier such
721 			 * that we need to revalidate here, but this time we hit
722 			 * the EFAULT. In such a case make sure we remove
723 			 * ourselves from the rebind list to avoid going down in
724 			 * flames.
725 			 */
726 			if (!list_empty(&uvma->vma.combined_links.rebind))
727 				list_del_init(&uvma->vma.combined_links.rebind);
728 
729 			/* Wait for pending binds */
730 			xe_vm_lock(vm, false);
731 			dma_resv_wait_timeout(xe_vm_resv(vm),
732 					      DMA_RESV_USAGE_BOOKKEEP,
733 					      false, MAX_SCHEDULE_TIMEOUT);
734 
735 			down_read(&vm->userptr.notifier_lock);
736 			err = xe_vm_invalidate_vma(&uvma->vma);
737 			up_read(&vm->userptr.notifier_lock);
738 			xe_vm_unlock(vm);
739 			if (err)
740 				break;
741 		} else {
742 			if (err)
743 				break;
744 
745 			list_del_init(&uvma->userptr.repin_link);
746 			list_move_tail(&uvma->vma.combined_links.rebind,
747 				       &vm->rebind_list);
748 		}
749 	}
750 
751 	if (err) {
752 		down_write(&vm->userptr.notifier_lock);
753 		spin_lock(&vm->userptr.invalidated_lock);
754 		list_for_each_entry_safe(uvma, next, &vm->userptr.repin_list,
755 					 userptr.repin_link) {
756 			list_del_init(&uvma->userptr.repin_link);
757 			list_move_tail(&uvma->userptr.invalidate_link,
758 				       &vm->userptr.invalidated);
759 		}
760 		spin_unlock(&vm->userptr.invalidated_lock);
761 		up_write(&vm->userptr.notifier_lock);
762 	}
763 	return err;
764 }
765 
766 /**
767  * xe_vm_userptr_check_repin() - Check whether the VM might have userptrs
768  * that need repinning.
769  * @vm: The VM.
770  *
771  * This function does an advisory check for whether the VM has userptrs that
772  * need repinning.
773  *
774  * Return: 0 if there are no indications of userptrs needing repinning,
775  * -EAGAIN if there are.
776  */
777 int xe_vm_userptr_check_repin(struct xe_vm *vm)
778 {
779 	return (list_empty_careful(&vm->userptr.repin_list) &&
780 		list_empty_careful(&vm->userptr.invalidated)) ? 0 : -EAGAIN;
781 }
782 
783 static int xe_vma_ops_alloc(struct xe_vma_ops *vops, bool array_of_binds)
784 {
785 	int i;
786 
787 	for (i = 0; i < XE_MAX_TILES_PER_DEVICE; ++i) {
788 		if (!vops->pt_update_ops[i].num_ops)
789 			continue;
790 
791 		vops->pt_update_ops[i].ops =
792 			kmalloc_array(vops->pt_update_ops[i].num_ops,
793 				      sizeof(*vops->pt_update_ops[i].ops),
794 				      GFP_KERNEL | __GFP_RETRY_MAYFAIL | __GFP_NOWARN);
795 		if (!vops->pt_update_ops[i].ops)
796 			return array_of_binds ? -ENOBUFS : -ENOMEM;
797 	}
798 
799 	return 0;
800 }
801 ALLOW_ERROR_INJECTION(xe_vma_ops_alloc, ERRNO);
802 
803 static void xe_vma_svm_prefetch_op_fini(struct xe_vma_op *op)
804 {
805 	struct xe_vma *vma;
806 
807 	vma = gpuva_to_vma(op->base.prefetch.va);
808 
809 	if (op->base.op == DRM_GPUVA_OP_PREFETCH && xe_vma_is_cpu_addr_mirror(vma))
810 		xa_destroy(&op->prefetch_range.range);
811 }
812 
813 static void xe_vma_svm_prefetch_ops_fini(struct xe_vma_ops *vops)
814 {
815 	struct xe_vma_op *op;
816 
817 	if (!(vops->flags & XE_VMA_OPS_FLAG_HAS_SVM_PREFETCH))
818 		return;
819 
820 	list_for_each_entry(op, &vops->list, link)
821 		xe_vma_svm_prefetch_op_fini(op);
822 }
823 
824 static void xe_vma_ops_fini(struct xe_vma_ops *vops)
825 {
826 	int i;
827 
828 	xe_vma_svm_prefetch_ops_fini(vops);
829 
830 	for (i = 0; i < XE_MAX_TILES_PER_DEVICE; ++i)
831 		kfree(vops->pt_update_ops[i].ops);
832 }
833 
834 static void xe_vma_ops_incr_pt_update_ops(struct xe_vma_ops *vops, u8 tile_mask, int inc_val)
835 {
836 	int i;
837 
838 	if (!inc_val)
839 		return;
840 
841 	for (i = 0; i < XE_MAX_TILES_PER_DEVICE; ++i)
842 		if (BIT(i) & tile_mask)
843 			vops->pt_update_ops[i].num_ops += inc_val;
844 }
845 
846 static void xe_vm_populate_rebind(struct xe_vma_op *op, struct xe_vma *vma,
847 				  u8 tile_mask)
848 {
849 	INIT_LIST_HEAD(&op->link);
850 	op->tile_mask = tile_mask;
851 	op->base.op = DRM_GPUVA_OP_MAP;
852 	op->base.map.va.addr = vma->gpuva.va.addr;
853 	op->base.map.va.range = vma->gpuva.va.range;
854 	op->base.map.gem.obj = vma->gpuva.gem.obj;
855 	op->base.map.gem.offset = vma->gpuva.gem.offset;
856 	op->map.vma = vma;
857 	op->map.immediate = true;
858 	op->map.dumpable = vma->gpuva.flags & XE_VMA_DUMPABLE;
859 	op->map.is_null = xe_vma_is_null(vma);
860 }
861 
862 static int xe_vm_ops_add_rebind(struct xe_vma_ops *vops, struct xe_vma *vma,
863 				u8 tile_mask)
864 {
865 	struct xe_vma_op *op;
866 
867 	op = kzalloc(sizeof(*op), GFP_KERNEL);
868 	if (!op)
869 		return -ENOMEM;
870 
871 	xe_vm_populate_rebind(op, vma, tile_mask);
872 	list_add_tail(&op->link, &vops->list);
873 	xe_vma_ops_incr_pt_update_ops(vops, tile_mask, 1);
874 
875 	return 0;
876 }
877 
878 static struct dma_fence *ops_execute(struct xe_vm *vm,
879 				     struct xe_vma_ops *vops);
880 static void xe_vma_ops_init(struct xe_vma_ops *vops, struct xe_vm *vm,
881 			    struct xe_exec_queue *q,
882 			    struct xe_sync_entry *syncs, u32 num_syncs);
883 
884 int xe_vm_rebind(struct xe_vm *vm, bool rebind_worker)
885 {
886 	struct dma_fence *fence;
887 	struct xe_vma *vma, *next;
888 	struct xe_vma_ops vops;
889 	struct xe_vma_op *op, *next_op;
890 	int err, i;
891 
892 	lockdep_assert_held(&vm->lock);
893 	if ((xe_vm_in_lr_mode(vm) && !rebind_worker) ||
894 	    list_empty(&vm->rebind_list))
895 		return 0;
896 
897 	xe_vma_ops_init(&vops, vm, NULL, NULL, 0);
898 	for (i = 0; i < XE_MAX_TILES_PER_DEVICE; ++i)
899 		vops.pt_update_ops[i].wait_vm_bookkeep = true;
900 
901 	xe_vm_assert_held(vm);
902 	list_for_each_entry(vma, &vm->rebind_list, combined_links.rebind) {
903 		xe_assert(vm->xe, vma->tile_present);
904 
905 		if (rebind_worker)
906 			trace_xe_vma_rebind_worker(vma);
907 		else
908 			trace_xe_vma_rebind_exec(vma);
909 
910 		err = xe_vm_ops_add_rebind(&vops, vma,
911 					   vma->tile_present);
912 		if (err)
913 			goto free_ops;
914 	}
915 
916 	err = xe_vma_ops_alloc(&vops, false);
917 	if (err)
918 		goto free_ops;
919 
920 	fence = ops_execute(vm, &vops);
921 	if (IS_ERR(fence)) {
922 		err = PTR_ERR(fence);
923 	} else {
924 		dma_fence_put(fence);
925 		list_for_each_entry_safe(vma, next, &vm->rebind_list,
926 					 combined_links.rebind)
927 			list_del_init(&vma->combined_links.rebind);
928 	}
929 free_ops:
930 	list_for_each_entry_safe(op, next_op, &vops.list, link) {
931 		list_del(&op->link);
932 		kfree(op);
933 	}
934 	xe_vma_ops_fini(&vops);
935 
936 	return err;
937 }
938 
939 struct dma_fence *xe_vma_rebind(struct xe_vm *vm, struct xe_vma *vma, u8 tile_mask)
940 {
941 	struct dma_fence *fence = NULL;
942 	struct xe_vma_ops vops;
943 	struct xe_vma_op *op, *next_op;
944 	struct xe_tile *tile;
945 	u8 id;
946 	int err;
947 
948 	lockdep_assert_held(&vm->lock);
949 	xe_vm_assert_held(vm);
950 	xe_assert(vm->xe, xe_vm_in_fault_mode(vm));
951 
952 	xe_vma_ops_init(&vops, vm, NULL, NULL, 0);
953 	for_each_tile(tile, vm->xe, id) {
954 		vops.pt_update_ops[id].wait_vm_bookkeep = true;
955 		vops.pt_update_ops[tile->id].q =
956 			xe_tile_migrate_exec_queue(tile);
957 	}
958 
959 	err = xe_vm_ops_add_rebind(&vops, vma, tile_mask);
960 	if (err)
961 		return ERR_PTR(err);
962 
963 	err = xe_vma_ops_alloc(&vops, false);
964 	if (err) {
965 		fence = ERR_PTR(err);
966 		goto free_ops;
967 	}
968 
969 	fence = ops_execute(vm, &vops);
970 
971 free_ops:
972 	list_for_each_entry_safe(op, next_op, &vops.list, link) {
973 		list_del(&op->link);
974 		kfree(op);
975 	}
976 	xe_vma_ops_fini(&vops);
977 
978 	return fence;
979 }
980 
981 static void xe_vm_populate_range_rebind(struct xe_vma_op *op,
982 					struct xe_vma *vma,
983 					struct xe_svm_range *range,
984 					u8 tile_mask)
985 {
986 	INIT_LIST_HEAD(&op->link);
987 	op->tile_mask = tile_mask;
988 	op->base.op = DRM_GPUVA_OP_DRIVER;
989 	op->subop = XE_VMA_SUBOP_MAP_RANGE;
990 	op->map_range.vma = vma;
991 	op->map_range.range = range;
992 }
993 
994 static int
995 xe_vm_ops_add_range_rebind(struct xe_vma_ops *vops,
996 			   struct xe_vma *vma,
997 			   struct xe_svm_range *range,
998 			   u8 tile_mask)
999 {
1000 	struct xe_vma_op *op;
1001 
1002 	op = kzalloc(sizeof(*op), GFP_KERNEL);
1003 	if (!op)
1004 		return -ENOMEM;
1005 
1006 	xe_vm_populate_range_rebind(op, vma, range, tile_mask);
1007 	list_add_tail(&op->link, &vops->list);
1008 	xe_vma_ops_incr_pt_update_ops(vops, tile_mask, 1);
1009 
1010 	return 0;
1011 }
1012 
1013 /**
1014  * xe_vm_range_rebind() - VM range (re)bind
1015  * @vm: The VM which the range belongs to.
1016  * @vma: The VMA which the range belongs to.
1017  * @range: SVM range to rebind.
1018  * @tile_mask: Tile mask to bind the range to.
1019  *
1020  * (re)bind SVM range setting up GPU page tables for the range.
1021  *
1022  * Return: dma fence for rebind to signal completion on success, ERR_PTR on
1023  * failure
1024  */
1025 struct dma_fence *xe_vm_range_rebind(struct xe_vm *vm,
1026 				     struct xe_vma *vma,
1027 				     struct xe_svm_range *range,
1028 				     u8 tile_mask)
1029 {
1030 	struct dma_fence *fence = NULL;
1031 	struct xe_vma_ops vops;
1032 	struct xe_vma_op *op, *next_op;
1033 	struct xe_tile *tile;
1034 	u8 id;
1035 	int err;
1036 
1037 	lockdep_assert_held(&vm->lock);
1038 	xe_vm_assert_held(vm);
1039 	xe_assert(vm->xe, xe_vm_in_fault_mode(vm));
1040 	xe_assert(vm->xe, xe_vma_is_cpu_addr_mirror(vma));
1041 
1042 	xe_vma_ops_init(&vops, vm, NULL, NULL, 0);
1043 	for_each_tile(tile, vm->xe, id) {
1044 		vops.pt_update_ops[id].wait_vm_bookkeep = true;
1045 		vops.pt_update_ops[tile->id].q =
1046 			xe_tile_migrate_exec_queue(tile);
1047 	}
1048 
1049 	err = xe_vm_ops_add_range_rebind(&vops, vma, range, tile_mask);
1050 	if (err)
1051 		return ERR_PTR(err);
1052 
1053 	err = xe_vma_ops_alloc(&vops, false);
1054 	if (err) {
1055 		fence = ERR_PTR(err);
1056 		goto free_ops;
1057 	}
1058 
1059 	fence = ops_execute(vm, &vops);
1060 
1061 free_ops:
1062 	list_for_each_entry_safe(op, next_op, &vops.list, link) {
1063 		list_del(&op->link);
1064 		kfree(op);
1065 	}
1066 	xe_vma_ops_fini(&vops);
1067 
1068 	return fence;
1069 }
1070 
1071 static void xe_vm_populate_range_unbind(struct xe_vma_op *op,
1072 					struct xe_svm_range *range)
1073 {
1074 	INIT_LIST_HEAD(&op->link);
1075 	op->tile_mask = range->tile_present;
1076 	op->base.op = DRM_GPUVA_OP_DRIVER;
1077 	op->subop = XE_VMA_SUBOP_UNMAP_RANGE;
1078 	op->unmap_range.range = range;
1079 }
1080 
1081 static int
1082 xe_vm_ops_add_range_unbind(struct xe_vma_ops *vops,
1083 			   struct xe_svm_range *range)
1084 {
1085 	struct xe_vma_op *op;
1086 
1087 	op = kzalloc(sizeof(*op), GFP_KERNEL);
1088 	if (!op)
1089 		return -ENOMEM;
1090 
1091 	xe_vm_populate_range_unbind(op, range);
1092 	list_add_tail(&op->link, &vops->list);
1093 	xe_vma_ops_incr_pt_update_ops(vops, range->tile_present, 1);
1094 
1095 	return 0;
1096 }
1097 
1098 /**
1099  * xe_vm_range_unbind() - VM range unbind
1100  * @vm: The VM which the range belongs to.
1101  * @range: SVM range to unbind.
1102  *
1103  * Unbind SVM range removing the GPU page tables for the range.
1104  *
1105  * Return: dma fence for unbind to signal completion on success, ERR_PTR on
1106  * failure
1107  */
1108 struct dma_fence *xe_vm_range_unbind(struct xe_vm *vm,
1109 				     struct xe_svm_range *range)
1110 {
1111 	struct dma_fence *fence = NULL;
1112 	struct xe_vma_ops vops;
1113 	struct xe_vma_op *op, *next_op;
1114 	struct xe_tile *tile;
1115 	u8 id;
1116 	int err;
1117 
1118 	lockdep_assert_held(&vm->lock);
1119 	xe_vm_assert_held(vm);
1120 	xe_assert(vm->xe, xe_vm_in_fault_mode(vm));
1121 
1122 	if (!range->tile_present)
1123 		return dma_fence_get_stub();
1124 
1125 	xe_vma_ops_init(&vops, vm, NULL, NULL, 0);
1126 	for_each_tile(tile, vm->xe, id) {
1127 		vops.pt_update_ops[id].wait_vm_bookkeep = true;
1128 		vops.pt_update_ops[tile->id].q =
1129 			xe_tile_migrate_exec_queue(tile);
1130 	}
1131 
1132 	err = xe_vm_ops_add_range_unbind(&vops, range);
1133 	if (err)
1134 		return ERR_PTR(err);
1135 
1136 	err = xe_vma_ops_alloc(&vops, false);
1137 	if (err) {
1138 		fence = ERR_PTR(err);
1139 		goto free_ops;
1140 	}
1141 
1142 	fence = ops_execute(vm, &vops);
1143 
1144 free_ops:
1145 	list_for_each_entry_safe(op, next_op, &vops.list, link) {
1146 		list_del(&op->link);
1147 		kfree(op);
1148 	}
1149 	xe_vma_ops_fini(&vops);
1150 
1151 	return fence;
1152 }
1153 
1154 static void xe_vma_free(struct xe_vma *vma)
1155 {
1156 	if (xe_vma_is_userptr(vma))
1157 		kfree(to_userptr_vma(vma));
1158 	else
1159 		kfree(vma);
1160 }
1161 
1162 #define VMA_CREATE_FLAG_READ_ONLY		BIT(0)
1163 #define VMA_CREATE_FLAG_IS_NULL			BIT(1)
1164 #define VMA_CREATE_FLAG_DUMPABLE		BIT(2)
1165 #define VMA_CREATE_FLAG_IS_SYSTEM_ALLOCATOR	BIT(3)
1166 
1167 static struct xe_vma *xe_vma_create(struct xe_vm *vm,
1168 				    struct xe_bo *bo,
1169 				    u64 bo_offset_or_userptr,
1170 				    u64 start, u64 end,
1171 				    u16 pat_index, unsigned int flags)
1172 {
1173 	struct xe_vma *vma;
1174 	struct xe_tile *tile;
1175 	u8 id;
1176 	bool read_only = (flags & VMA_CREATE_FLAG_READ_ONLY);
1177 	bool is_null = (flags & VMA_CREATE_FLAG_IS_NULL);
1178 	bool dumpable = (flags & VMA_CREATE_FLAG_DUMPABLE);
1179 	bool is_cpu_addr_mirror =
1180 		(flags & VMA_CREATE_FLAG_IS_SYSTEM_ALLOCATOR);
1181 
1182 	xe_assert(vm->xe, start < end);
1183 	xe_assert(vm->xe, end < vm->size);
1184 
1185 	/*
1186 	 * Allocate and ensure that the xe_vma_is_userptr() return
1187 	 * matches what was allocated.
1188 	 */
1189 	if (!bo && !is_null && !is_cpu_addr_mirror) {
1190 		struct xe_userptr_vma *uvma = kzalloc(sizeof(*uvma), GFP_KERNEL);
1191 
1192 		if (!uvma)
1193 			return ERR_PTR(-ENOMEM);
1194 
1195 		vma = &uvma->vma;
1196 	} else {
1197 		vma = kzalloc(sizeof(*vma), GFP_KERNEL);
1198 		if (!vma)
1199 			return ERR_PTR(-ENOMEM);
1200 
1201 		if (is_cpu_addr_mirror)
1202 			vma->gpuva.flags |= XE_VMA_SYSTEM_ALLOCATOR;
1203 		if (is_null)
1204 			vma->gpuva.flags |= DRM_GPUVA_SPARSE;
1205 		if (bo)
1206 			vma->gpuva.gem.obj = &bo->ttm.base;
1207 	}
1208 
1209 	INIT_LIST_HEAD(&vma->combined_links.rebind);
1210 
1211 	INIT_LIST_HEAD(&vma->gpuva.gem.entry);
1212 	vma->gpuva.vm = &vm->gpuvm;
1213 	vma->gpuva.va.addr = start;
1214 	vma->gpuva.va.range = end - start + 1;
1215 	if (read_only)
1216 		vma->gpuva.flags |= XE_VMA_READ_ONLY;
1217 	if (dumpable)
1218 		vma->gpuva.flags |= XE_VMA_DUMPABLE;
1219 
1220 	for_each_tile(tile, vm->xe, id)
1221 		vma->tile_mask |= 0x1 << id;
1222 
1223 	if (vm->xe->info.has_atomic_enable_pte_bit)
1224 		vma->gpuva.flags |= XE_VMA_ATOMIC_PTE_BIT;
1225 
1226 	vma->pat_index = pat_index;
1227 
1228 	if (bo) {
1229 		struct drm_gpuvm_bo *vm_bo;
1230 
1231 		xe_bo_assert_held(bo);
1232 
1233 		vm_bo = drm_gpuvm_bo_obtain(vma->gpuva.vm, &bo->ttm.base);
1234 		if (IS_ERR(vm_bo)) {
1235 			xe_vma_free(vma);
1236 			return ERR_CAST(vm_bo);
1237 		}
1238 
1239 		drm_gpuvm_bo_extobj_add(vm_bo);
1240 		drm_gem_object_get(&bo->ttm.base);
1241 		vma->gpuva.gem.offset = bo_offset_or_userptr;
1242 		drm_gpuva_link(&vma->gpuva, vm_bo);
1243 		drm_gpuvm_bo_put(vm_bo);
1244 	} else /* userptr or null */ {
1245 		if (!is_null && !is_cpu_addr_mirror) {
1246 			struct xe_userptr *userptr = &to_userptr_vma(vma)->userptr;
1247 			u64 size = end - start + 1;
1248 			int err;
1249 
1250 			INIT_LIST_HEAD(&userptr->invalidate_link);
1251 			INIT_LIST_HEAD(&userptr->repin_link);
1252 			vma->gpuva.gem.offset = bo_offset_or_userptr;
1253 			mutex_init(&userptr->unmap_mutex);
1254 
1255 			err = mmu_interval_notifier_insert(&userptr->notifier,
1256 							   current->mm,
1257 							   xe_vma_userptr(vma), size,
1258 							   &vma_userptr_notifier_ops);
1259 			if (err) {
1260 				xe_vma_free(vma);
1261 				return ERR_PTR(err);
1262 			}
1263 
1264 			userptr->notifier_seq = LONG_MAX;
1265 		}
1266 
1267 		xe_vm_get(vm);
1268 	}
1269 
1270 	return vma;
1271 }
1272 
1273 static void xe_vma_destroy_late(struct xe_vma *vma)
1274 {
1275 	struct xe_vm *vm = xe_vma_vm(vma);
1276 
1277 	if (vma->ufence) {
1278 		xe_sync_ufence_put(vma->ufence);
1279 		vma->ufence = NULL;
1280 	}
1281 
1282 	if (xe_vma_is_userptr(vma)) {
1283 		struct xe_userptr_vma *uvma = to_userptr_vma(vma);
1284 		struct xe_userptr *userptr = &uvma->userptr;
1285 
1286 		if (userptr->sg)
1287 			xe_hmm_userptr_free_sg(uvma);
1288 
1289 		/*
1290 		 * Since userptr pages are not pinned, we can't remove
1291 		 * the notifier until we're sure the GPU is not accessing
1292 		 * them anymore
1293 		 */
1294 		mmu_interval_notifier_remove(&userptr->notifier);
1295 		mutex_destroy(&userptr->unmap_mutex);
1296 		xe_vm_put(vm);
1297 	} else if (xe_vma_is_null(vma) || xe_vma_is_cpu_addr_mirror(vma)) {
1298 		xe_vm_put(vm);
1299 	} else {
1300 		xe_bo_put(xe_vma_bo(vma));
1301 	}
1302 
1303 	xe_vma_free(vma);
1304 }
1305 
1306 static void vma_destroy_work_func(struct work_struct *w)
1307 {
1308 	struct xe_vma *vma =
1309 		container_of(w, struct xe_vma, destroy_work);
1310 
1311 	xe_vma_destroy_late(vma);
1312 }
1313 
1314 static void vma_destroy_cb(struct dma_fence *fence,
1315 			   struct dma_fence_cb *cb)
1316 {
1317 	struct xe_vma *vma = container_of(cb, struct xe_vma, destroy_cb);
1318 
1319 	INIT_WORK(&vma->destroy_work, vma_destroy_work_func);
1320 	queue_work(system_unbound_wq, &vma->destroy_work);
1321 }
1322 
1323 static void xe_vma_destroy(struct xe_vma *vma, struct dma_fence *fence)
1324 {
1325 	struct xe_vm *vm = xe_vma_vm(vma);
1326 
1327 	lockdep_assert_held_write(&vm->lock);
1328 	xe_assert(vm->xe, list_empty(&vma->combined_links.destroy));
1329 
1330 	if (xe_vma_is_userptr(vma)) {
1331 		xe_assert(vm->xe, vma->gpuva.flags & XE_VMA_DESTROYED);
1332 
1333 		spin_lock(&vm->userptr.invalidated_lock);
1334 		xe_assert(vm->xe, list_empty(&to_userptr_vma(vma)->userptr.repin_link));
1335 		list_del(&to_userptr_vma(vma)->userptr.invalidate_link);
1336 		spin_unlock(&vm->userptr.invalidated_lock);
1337 	} else if (!xe_vma_is_null(vma) && !xe_vma_is_cpu_addr_mirror(vma)) {
1338 		xe_bo_assert_held(xe_vma_bo(vma));
1339 
1340 		drm_gpuva_unlink(&vma->gpuva);
1341 	}
1342 
1343 	xe_vm_assert_held(vm);
1344 	if (fence) {
1345 		int ret = dma_fence_add_callback(fence, &vma->destroy_cb,
1346 						 vma_destroy_cb);
1347 
1348 		if (ret) {
1349 			XE_WARN_ON(ret != -ENOENT);
1350 			xe_vma_destroy_late(vma);
1351 		}
1352 	} else {
1353 		xe_vma_destroy_late(vma);
1354 	}
1355 }
1356 
1357 /**
1358  * xe_vm_lock_vma() - drm_exec utility to lock a vma
1359  * @exec: The drm_exec object we're currently locking for.
1360  * @vma: The vma for which we want to lock the vm resv and any attached
1361  * object's resv.
1362  *
1363  * Return: 0 on success, negative error code on error. In particular
1364  * may return -EDEADLK on WW transaction contention and -EINTR if
1365  * an interruptible wait is terminated by a signal.
1366  */
1367 int xe_vm_lock_vma(struct drm_exec *exec, struct xe_vma *vma)
1368 {
1369 	struct xe_vm *vm = xe_vma_vm(vma);
1370 	struct xe_bo *bo = xe_vma_bo(vma);
1371 	int err;
1372 
1373 	XE_WARN_ON(!vm);
1374 
1375 	err = drm_exec_lock_obj(exec, xe_vm_obj(vm));
1376 	if (!err && bo && !bo->vm)
1377 		err = drm_exec_lock_obj(exec, &bo->ttm.base);
1378 
1379 	return err;
1380 }
1381 
1382 static void xe_vma_destroy_unlocked(struct xe_vma *vma)
1383 {
1384 	struct drm_exec exec;
1385 	int err;
1386 
1387 	drm_exec_init(&exec, 0, 0);
1388 	drm_exec_until_all_locked(&exec) {
1389 		err = xe_vm_lock_vma(&exec, vma);
1390 		drm_exec_retry_on_contention(&exec);
1391 		if (XE_WARN_ON(err))
1392 			break;
1393 	}
1394 
1395 	xe_vma_destroy(vma, NULL);
1396 
1397 	drm_exec_fini(&exec);
1398 }
1399 
1400 struct xe_vma *
1401 xe_vm_find_overlapping_vma(struct xe_vm *vm, u64 start, u64 range)
1402 {
1403 	struct drm_gpuva *gpuva;
1404 
1405 	lockdep_assert_held(&vm->lock);
1406 
1407 	if (xe_vm_is_closed_or_banned(vm))
1408 		return NULL;
1409 
1410 	xe_assert(vm->xe, start + range <= vm->size);
1411 
1412 	gpuva = drm_gpuva_find_first(&vm->gpuvm, start, range);
1413 
1414 	return gpuva ? gpuva_to_vma(gpuva) : NULL;
1415 }
1416 
1417 static int xe_vm_insert_vma(struct xe_vm *vm, struct xe_vma *vma)
1418 {
1419 	int err;
1420 
1421 	xe_assert(vm->xe, xe_vma_vm(vma) == vm);
1422 	lockdep_assert_held(&vm->lock);
1423 
1424 	mutex_lock(&vm->snap_mutex);
1425 	err = drm_gpuva_insert(&vm->gpuvm, &vma->gpuva);
1426 	mutex_unlock(&vm->snap_mutex);
1427 	XE_WARN_ON(err);	/* Shouldn't be possible */
1428 
1429 	return err;
1430 }
1431 
1432 static void xe_vm_remove_vma(struct xe_vm *vm, struct xe_vma *vma)
1433 {
1434 	xe_assert(vm->xe, xe_vma_vm(vma) == vm);
1435 	lockdep_assert_held(&vm->lock);
1436 
1437 	mutex_lock(&vm->snap_mutex);
1438 	drm_gpuva_remove(&vma->gpuva);
1439 	mutex_unlock(&vm->snap_mutex);
1440 	if (vm->usm.last_fault_vma == vma)
1441 		vm->usm.last_fault_vma = NULL;
1442 }
1443 
1444 static struct drm_gpuva_op *xe_vm_op_alloc(void)
1445 {
1446 	struct xe_vma_op *op;
1447 
1448 	op = kzalloc(sizeof(*op), GFP_KERNEL);
1449 
1450 	if (unlikely(!op))
1451 		return NULL;
1452 
1453 	return &op->base;
1454 }
1455 
1456 static void xe_vm_free(struct drm_gpuvm *gpuvm);
1457 
1458 static const struct drm_gpuvm_ops gpuvm_ops = {
1459 	.op_alloc = xe_vm_op_alloc,
1460 	.vm_bo_validate = xe_gpuvm_validate,
1461 	.vm_free = xe_vm_free,
1462 };
1463 
1464 static u64 pde_encode_pat_index(u16 pat_index)
1465 {
1466 	u64 pte = 0;
1467 
1468 	if (pat_index & BIT(0))
1469 		pte |= XE_PPGTT_PTE_PAT0;
1470 
1471 	if (pat_index & BIT(1))
1472 		pte |= XE_PPGTT_PTE_PAT1;
1473 
1474 	return pte;
1475 }
1476 
1477 static u64 pte_encode_pat_index(u16 pat_index, u32 pt_level)
1478 {
1479 	u64 pte = 0;
1480 
1481 	if (pat_index & BIT(0))
1482 		pte |= XE_PPGTT_PTE_PAT0;
1483 
1484 	if (pat_index & BIT(1))
1485 		pte |= XE_PPGTT_PTE_PAT1;
1486 
1487 	if (pat_index & BIT(2)) {
1488 		if (pt_level)
1489 			pte |= XE_PPGTT_PDE_PDPE_PAT2;
1490 		else
1491 			pte |= XE_PPGTT_PTE_PAT2;
1492 	}
1493 
1494 	if (pat_index & BIT(3))
1495 		pte |= XELPG_PPGTT_PTE_PAT3;
1496 
1497 	if (pat_index & (BIT(4)))
1498 		pte |= XE2_PPGTT_PTE_PAT4;
1499 
1500 	return pte;
1501 }
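
/*
 * Worked example (editor's note): a pat_index of 0b10110 (22) would set
 * XE2_PPGTT_PTE_PAT4, XE_PPGTT_PTE_PAT2 (or XE_PPGTT_PDE_PDPE_PAT2 for a
 * non-zero pt_level) and XE_PPGTT_PTE_PAT1 in the returned bits.
 */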
1502 
1503 static u64 pte_encode_ps(u32 pt_level)
1504 {
1505 	XE_WARN_ON(pt_level > MAX_HUGEPTE_LEVEL);
1506 
1507 	if (pt_level == 1)
1508 		return XE_PDE_PS_2M;
1509 	else if (pt_level == 2)
1510 		return XE_PDPE_PS_1G;
1511 
1512 	return 0;
1513 }
1514 
1515 static u64 xelp_pde_encode_bo(struct xe_bo *bo, u64 bo_offset,
1516 			      const u16 pat_index)
1517 {
1518 	u64 pde;
1519 
1520 	pde = xe_bo_addr(bo, bo_offset, XE_PAGE_SIZE);
1521 	pde |= XE_PAGE_PRESENT | XE_PAGE_RW;
1522 	pde |= pde_encode_pat_index(pat_index);
1523 
1524 	return pde;
1525 }
1526 
1527 static u64 xelp_pte_encode_bo(struct xe_bo *bo, u64 bo_offset,
1528 			      u16 pat_index, u32 pt_level)
1529 {
1530 	u64 pte;
1531 
1532 	pte = xe_bo_addr(bo, bo_offset, XE_PAGE_SIZE);
1533 	pte |= XE_PAGE_PRESENT | XE_PAGE_RW;
1534 	pte |= pte_encode_pat_index(pat_index, pt_level);
1535 	pte |= pte_encode_ps(pt_level);
1536 
1537 	if (xe_bo_is_vram(bo) || xe_bo_is_stolen_devmem(bo))
1538 		pte |= XE_PPGTT_PTE_DM;
1539 
1540 	return pte;
1541 }
1542 
1543 static u64 xelp_pte_encode_vma(u64 pte, struct xe_vma *vma,
1544 			       u16 pat_index, u32 pt_level)
1545 {
1546 	pte |= XE_PAGE_PRESENT;
1547 
1548 	if (likely(!xe_vma_read_only(vma)))
1549 		pte |= XE_PAGE_RW;
1550 
1551 	pte |= pte_encode_pat_index(pat_index, pt_level);
1552 	pte |= pte_encode_ps(pt_level);
1553 
1554 	if (unlikely(xe_vma_is_null(vma)))
1555 		pte |= XE_PTE_NULL;
1556 
1557 	return pte;
1558 }
1559 
1560 static u64 xelp_pte_encode_addr(struct xe_device *xe, u64 addr,
1561 				u16 pat_index,
1562 				u32 pt_level, bool devmem, u64 flags)
1563 {
1564 	u64 pte;
1565 
1566 	/* Avoid passing random bits directly as flags */
1567 	xe_assert(xe, !(flags & ~XE_PTE_PS64));
1568 
1569 	pte = addr;
1570 	pte |= XE_PAGE_PRESENT | XE_PAGE_RW;
1571 	pte |= pte_encode_pat_index(pat_index, pt_level);
1572 	pte |= pte_encode_ps(pt_level);
1573 
1574 	if (devmem)
1575 		pte |= XE_PPGTT_PTE_DM;
1576 
1577 	pte |= flags;
1578 
1579 	return pte;
1580 }
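
/*
 * Worked example (editor's note): encoding a 2 MiB device-memory mapping,
 * i.e. xelp_pte_encode_addr(xe, addr, pat_index, 1, true, 0), yields
 * addr | XE_PAGE_PRESENT | XE_PAGE_RW | the PAT bits for pat_index |
 * XE_PDE_PS_2M | XE_PPGTT_PTE_DM.
 */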
1581 
1582 static const struct xe_pt_ops xelp_pt_ops = {
1583 	.pte_encode_bo = xelp_pte_encode_bo,
1584 	.pte_encode_vma = xelp_pte_encode_vma,
1585 	.pte_encode_addr = xelp_pte_encode_addr,
1586 	.pde_encode_bo = xelp_pde_encode_bo,
1587 };
1588 
1589 static void vm_destroy_work_func(struct work_struct *w);
1590 
1591 /**
1592  * xe_vm_create_scratch() - Setup a scratch memory pagetable tree for the
1593  * given tile and vm.
1594  * @xe: xe device.
1595  * @tile: tile to set up for.
1596  * @vm: vm to set up for.
1597  *
1598  * Sets up a pagetable tree with one page-table per level and a single
1599  * leaf PTE. All pagetable entries point to the single page-table or,
1600  * for MAX_HUGEPTE_LEVEL, a NULL huge PTE that returns 0 on reads and
1601  * turns writes into NOPs.
1602  *
1603  * Return: 0 on success, negative error code on error.
1604  */
1605 static int xe_vm_create_scratch(struct xe_device *xe, struct xe_tile *tile,
1606 				struct xe_vm *vm)
1607 {
1608 	u8 id = tile->id;
1609 	int i;
1610 
1611 	for (i = MAX_HUGEPTE_LEVEL; i < vm->pt_root[id]->level; i++) {
1612 		vm->scratch_pt[id][i] = xe_pt_create(vm, tile, i);
1613 		if (IS_ERR(vm->scratch_pt[id][i]))
1614 			return PTR_ERR(vm->scratch_pt[id][i]);
1615 
1616 		xe_pt_populate_empty(tile, vm, vm->scratch_pt[id][i]);
1617 	}
1618 
1619 	return 0;
1620 }
1621 ALLOW_ERROR_INJECTION(xe_vm_create_scratch, ERRNO);
1622 
1623 static void xe_vm_free_scratch(struct xe_vm *vm)
1624 {
1625 	struct xe_tile *tile;
1626 	u8 id;
1627 
1628 	if (!xe_vm_has_scratch(vm))
1629 		return;
1630 
1631 	for_each_tile(tile, vm->xe, id) {
1632 		u32 i;
1633 
1634 		if (!vm->pt_root[id])
1635 			continue;
1636 
1637 		for (i = MAX_HUGEPTE_LEVEL; i < vm->pt_root[id]->level; ++i)
1638 			if (vm->scratch_pt[id][i])
1639 				xe_pt_destroy(vm->scratch_pt[id][i], vm->flags, NULL);
1640 	}
1641 }
1642 
1643 struct xe_vm *xe_vm_create(struct xe_device *xe, u32 flags)
1644 {
1645 	struct drm_gem_object *vm_resv_obj;
1646 	struct xe_vm *vm;
1647 	int err, number_tiles = 0;
1648 	struct xe_tile *tile;
1649 	u8 id;
1650 
1651 	/*
1652 	 * Since the GSCCS is not user-accessible, we don't expect a GSC VM to
1653 	 * ever be in faulting mode.
1654 	 */
1655 	xe_assert(xe, !((flags & XE_VM_FLAG_GSC) && (flags & XE_VM_FLAG_FAULT_MODE)));
1656 
1657 	vm = kzalloc(sizeof(*vm), GFP_KERNEL);
1658 	if (!vm)
1659 		return ERR_PTR(-ENOMEM);
1660 
1661 	vm->xe = xe;
1662 
1663 	vm->size = 1ull << xe->info.va_bits;
1664 
1665 	vm->flags = flags;
1666 
1667 	/*
1668 	 * GSC VMs are kernel-owned, only used for PXP ops and can sometimes be
1669 	 * manipulated under the PXP mutex. However, the PXP mutex can be taken
1670 	 * under a user-VM lock when the PXP session is started at exec_queue
1671 	 * creation time. Those are different VMs and therefore there is no risk
1672 	 * of deadlock, but we need to tell lockdep that this is the case or it
1673 	 * will print a warning.
1674 	 */
1675 	if (flags & XE_VM_FLAG_GSC) {
1676 		static struct lock_class_key gsc_vm_key;
1677 
1678 		__init_rwsem(&vm->lock, "gsc_vm", &gsc_vm_key);
1679 	} else {
1680 		init_rwsem(&vm->lock);
1681 	}
1682 	mutex_init(&vm->snap_mutex);
1683 
1684 	INIT_LIST_HEAD(&vm->rebind_list);
1685 
1686 	INIT_LIST_HEAD(&vm->userptr.repin_list);
1687 	INIT_LIST_HEAD(&vm->userptr.invalidated);
1688 	init_rwsem(&vm->userptr.notifier_lock);
1689 	spin_lock_init(&vm->userptr.invalidated_lock);
1690 
1691 	ttm_lru_bulk_move_init(&vm->lru_bulk_move);
1692 
1693 	INIT_WORK(&vm->destroy_work, vm_destroy_work_func);
1694 
1695 	INIT_LIST_HEAD(&vm->preempt.exec_queues);
1696 	vm->preempt.min_run_period_ms = 10;	/* FIXME: Wire up to uAPI */
1697 
1698 	for_each_tile(tile, xe, id)
1699 		xe_range_fence_tree_init(&vm->rftree[id]);
1700 
1701 	vm->pt_ops = &xelp_pt_ops;
1702 
1703 	/*
1704 	 * Long-running workloads are not protected by the scheduler references.
1705 	 * By design, run_job for long-running workloads returns NULL and the
1706 	 * scheduler drops all of its references, hence protecting the VM
1707 	 * in this case is necessary.
1708 	 */
1709 	if (flags & XE_VM_FLAG_LR_MODE) {
1710 		INIT_WORK(&vm->preempt.rebind_work, preempt_rebind_work_func);
1711 		xe_pm_runtime_get_noresume(xe);
1712 	}
1713 
1714 	if (flags & XE_VM_FLAG_FAULT_MODE) {
1715 		err = xe_svm_init(vm);
1716 		if (err)
1717 			goto err_no_resv;
1718 	}
1719 
1720 	vm_resv_obj = drm_gpuvm_resv_object_alloc(&xe->drm);
1721 	if (!vm_resv_obj) {
1722 		err = -ENOMEM;
1723 		goto err_svm_fini;
1724 	}
1725 
1726 	drm_gpuvm_init(&vm->gpuvm, "Xe VM", DRM_GPUVM_RESV_PROTECTED, &xe->drm,
1727 		       vm_resv_obj, 0, vm->size, 0, 0, &gpuvm_ops);
1728 
1729 	drm_gem_object_put(vm_resv_obj);
1730 
1731 	err = xe_vm_lock(vm, true);
1732 	if (err)
1733 		goto err_close;
1734 
1735 	if (IS_DGFX(xe) && xe->info.vram_flags & XE_VRAM_FLAGS_NEED64K)
1736 		vm->flags |= XE_VM_FLAG_64K;
1737 
1738 	for_each_tile(tile, xe, id) {
1739 		if (flags & XE_VM_FLAG_MIGRATION &&
1740 		    tile->id != XE_VM_FLAG_TILE_ID(flags))
1741 			continue;
1742 
1743 		vm->pt_root[id] = xe_pt_create(vm, tile, xe->info.vm_max_level);
1744 		if (IS_ERR(vm->pt_root[id])) {
1745 			err = PTR_ERR(vm->pt_root[id]);
1746 			vm->pt_root[id] = NULL;
1747 			goto err_unlock_close;
1748 		}
1749 	}
1750 
1751 	if (xe_vm_has_scratch(vm)) {
1752 		for_each_tile(tile, xe, id) {
1753 			if (!vm->pt_root[id])
1754 				continue;
1755 
1756 			err = xe_vm_create_scratch(xe, tile, vm);
1757 			if (err)
1758 				goto err_unlock_close;
1759 		}
1760 		vm->batch_invalidate_tlb = true;
1761 	}
1762 
1763 	if (vm->flags & XE_VM_FLAG_LR_MODE)
1764 		vm->batch_invalidate_tlb = false;
1765 
1766 	/* Fill pt_root after allocating scratch tables */
1767 	for_each_tile(tile, xe, id) {
1768 		if (!vm->pt_root[id])
1769 			continue;
1770 
1771 		xe_pt_populate_empty(tile, vm, vm->pt_root[id]);
1772 	}
1773 	xe_vm_unlock(vm);
1774 
1775 	/* Kernel migration VM shouldn't have a circular loop. */
1776 	if (!(flags & XE_VM_FLAG_MIGRATION)) {
1777 		for_each_tile(tile, xe, id) {
1778 			struct xe_exec_queue *q;
1779 			u32 create_flags = EXEC_QUEUE_FLAG_VM;
1780 
1781 			if (!vm->pt_root[id])
1782 				continue;
1783 
1784 			q = xe_exec_queue_create_bind(xe, tile, create_flags, 0);
1785 			if (IS_ERR(q)) {
1786 				err = PTR_ERR(q);
1787 				goto err_close;
1788 			}
1789 			vm->q[id] = q;
1790 			number_tiles++;
1791 		}
1792 	}
1793 
1794 	if (number_tiles > 1)
1795 		vm->composite_fence_ctx = dma_fence_context_alloc(1);
1796 
1797 	trace_xe_vm_create(vm);
1798 
1799 	return vm;
1800 
1801 err_unlock_close:
1802 	xe_vm_unlock(vm);
1803 err_close:
1804 	xe_vm_close_and_put(vm);
1805 	return ERR_PTR(err);
1806 
1807 err_svm_fini:
1808 	if (flags & XE_VM_FLAG_FAULT_MODE) {
1809 		vm->size = 0; /* close the vm */
1810 		xe_svm_fini(vm);
1811 	}
1812 err_no_resv:
1813 	mutex_destroy(&vm->snap_mutex);
1814 	for_each_tile(tile, xe, id)
1815 		xe_range_fence_tree_fini(&vm->rftree[id]);
1816 	ttm_lru_bulk_move_fini(&xe->ttm, &vm->lru_bulk_move);
1817 	kfree(vm);
1818 	if (flags & XE_VM_FLAG_LR_MODE)
1819 		xe_pm_runtime_put(xe);
1820 	return ERR_PTR(err);
1821 }
1822 
1823 static void xe_vm_close(struct xe_vm *vm)
1824 {
1825 	struct xe_device *xe = vm->xe;
1826 	bool bound;
1827 	int idx;
1828 
1829 	bound = drm_dev_enter(&xe->drm, &idx);
1830 
1831 	down_write(&vm->lock);
1832 	if (xe_vm_in_fault_mode(vm))
1833 		xe_svm_notifier_lock(vm);
1834 
1835 	vm->size = 0;
1836 
1837 	if (!((vm->flags & XE_VM_FLAG_MIGRATION))) {
1838 		struct xe_tile *tile;
1839 		struct xe_gt *gt;
1840 		u8 id;
1841 
1842 		/* Wait for pending binds */
1843 		dma_resv_wait_timeout(xe_vm_resv(vm),
1844 				      DMA_RESV_USAGE_BOOKKEEP,
1845 				      false, MAX_SCHEDULE_TIMEOUT);
1846 
1847 		if (bound) {
1848 			for_each_tile(tile, xe, id)
1849 				if (vm->pt_root[id])
1850 					xe_pt_clear(xe, vm->pt_root[id]);
1851 
1852 			for_each_gt(gt, xe, id)
1853 				xe_gt_tlb_invalidation_vm(gt, vm);
1854 		}
1855 	}
1856 
1857 	if (xe_vm_in_fault_mode(vm))
1858 		xe_svm_notifier_unlock(vm);
1859 	up_write(&vm->lock);
1860 
1861 	if (bound)
1862 		drm_dev_exit(idx);
1863 }
1864 
1865 void xe_vm_close_and_put(struct xe_vm *vm)
1866 {
1867 	LIST_HEAD(contested);
1868 	struct xe_device *xe = vm->xe;
1869 	struct xe_tile *tile;
1870 	struct xe_vma *vma, *next_vma;
1871 	struct drm_gpuva *gpuva, *next;
1872 	u8 id;
1873 
1874 	xe_assert(xe, !vm->preempt.num_exec_queues);
1875 
1876 	xe_vm_close(vm);
1877 	if (xe_vm_in_preempt_fence_mode(vm))
1878 		flush_work(&vm->preempt.rebind_work);
1879 	if (xe_vm_in_fault_mode(vm))
1880 		xe_svm_close(vm);
1881 
1882 	down_write(&vm->lock);
1883 	for_each_tile(tile, xe, id) {
1884 		if (vm->q[id])
1885 			xe_exec_queue_last_fence_put(vm->q[id], vm);
1886 	}
1887 	up_write(&vm->lock);
1888 
1889 	for_each_tile(tile, xe, id) {
1890 		if (vm->q[id]) {
1891 			xe_exec_queue_kill(vm->q[id]);
1892 			xe_exec_queue_put(vm->q[id]);
1893 			vm->q[id] = NULL;
1894 		}
1895 	}
1896 
1897 	down_write(&vm->lock);
1898 	xe_vm_lock(vm, false);
1899 	drm_gpuvm_for_each_va_safe(gpuva, next, &vm->gpuvm) {
1900 		vma = gpuva_to_vma(gpuva);
1901 
1902 		if (xe_vma_has_no_bo(vma)) {
1903 			down_read(&vm->userptr.notifier_lock);
1904 			vma->gpuva.flags |= XE_VMA_DESTROYED;
1905 			up_read(&vm->userptr.notifier_lock);
1906 		}
1907 
1908 		xe_vm_remove_vma(vm, vma);
1909 
1910 		/* easy case, remove from VMA? */
1911 		if (xe_vma_has_no_bo(vma) || xe_vma_bo(vma)->vm) {
1912 			list_del_init(&vma->combined_links.rebind);
1913 			xe_vma_destroy(vma, NULL);
1914 			continue;
1915 		}
1916 
1917 		list_move_tail(&vma->combined_links.destroy, &contested);
1918 		vma->gpuva.flags |= XE_VMA_DESTROYED;
1919 	}
1920 
1921 	/*
1922 	 * All vm operations will add shared fences to resv.
1923 	 * The only exception is eviction for a shared object,
1924 	 * but even so, the unbind when evicted would still
1925 	 * install a fence to resv. Hence it's safe to
1926 	 * destroy the pagetables immediately.
1927 	 */
1928 	xe_vm_free_scratch(vm);
1929 
1930 	for_each_tile(tile, xe, id) {
1931 		if (vm->pt_root[id]) {
1932 			xe_pt_destroy(vm->pt_root[id], vm->flags, NULL);
1933 			vm->pt_root[id] = NULL;
1934 		}
1935 	}
1936 	xe_vm_unlock(vm);
1937 
1938 	/*
1939 	 * VM is now dead, cannot re-add nodes to vm->vmas if it's NULL.
1940 	 * Since we hold a refcount to the bo, we can remove and free
1941 	 * the members safely without locking.
1942 	 */
1943 	list_for_each_entry_safe(vma, next_vma, &contested,
1944 				 combined_links.destroy) {
1945 		list_del_init(&vma->combined_links.destroy);
1946 		xe_vma_destroy_unlocked(vma);
1947 	}
1948 
1949 	if (xe_vm_in_fault_mode(vm))
1950 		xe_svm_fini(vm);
1951 
1952 	up_write(&vm->lock);
1953 
1954 	down_write(&xe->usm.lock);
1955 	if (vm->usm.asid) {
1956 		void *lookup;
1957 
1958 		xe_assert(xe, xe->info.has_asid);
1959 		xe_assert(xe, !(vm->flags & XE_VM_FLAG_MIGRATION));
1960 
1961 		lookup = xa_erase(&xe->usm.asid_to_vm, vm->usm.asid);
1962 		xe_assert(xe, lookup == vm);
1963 	}
1964 	up_write(&xe->usm.lock);
1965 
1966 	for_each_tile(tile, xe, id)
1967 		xe_range_fence_tree_fini(&vm->rftree[id]);
1968 
1969 	xe_vm_put(vm);
1970 }
1971 
1972 static void vm_destroy_work_func(struct work_struct *w)
1973 {
1974 	struct xe_vm *vm =
1975 		container_of(w, struct xe_vm, destroy_work);
1976 	struct xe_device *xe = vm->xe;
1977 	struct xe_tile *tile;
1978 	u8 id;
1979 
1980 	/* xe_vm_close_and_put was not called? */
1981 	xe_assert(xe, !vm->size);
1982 
1983 	if (xe_vm_in_preempt_fence_mode(vm))
1984 		flush_work(&vm->preempt.rebind_work);
1985 
1986 	mutex_destroy(&vm->snap_mutex);
1987 
1988 	if (vm->flags & XE_VM_FLAG_LR_MODE)
1989 		xe_pm_runtime_put(xe);
1990 
1991 	for_each_tile(tile, xe, id)
1992 		XE_WARN_ON(vm->pt_root[id]);
1993 
1994 	trace_xe_vm_free(vm);
1995 
1996 	ttm_lru_bulk_move_fini(&xe->ttm, &vm->lru_bulk_move);
1997 
1998 	if (vm->xef)
1999 		xe_file_put(vm->xef);
2000 
2001 	kfree(vm);
2002 }
2003 
2004 static void xe_vm_free(struct drm_gpuvm *gpuvm)
2005 {
2006 	struct xe_vm *vm = container_of(gpuvm, struct xe_vm, gpuvm);
2007 
2008 	/* To destroy the VM we need to be able to sleep */
2009 	queue_work(system_unbound_wq, &vm->destroy_work);
2010 }
2011 
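     /**
      * xe_vm_lookup() - Look up a VM by its user-visible id
      * @xef: the xe_file that owns the id
      * @id: the VM id handed out by the VM create ioctl
      *
      * Takes an extra reference on the VM if found; the caller must drop it
      * with xe_vm_put().
      *
      * Return: pointer to the VM, or NULL if no VM with @id exists.
      */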
2012 struct xe_vm *xe_vm_lookup(struct xe_file *xef, u32 id)
2013 {
2014 	struct xe_vm *vm;
2015 
2016 	mutex_lock(&xef->vm.lock);
2017 	vm = xa_load(&xef->vm.xa, id);
2018 	if (vm)
2019 		xe_vm_get(vm);
2020 	mutex_unlock(&xef->vm.lock);
2021 
2022 	return vm;
2023 }
2024 
2025 u64 xe_vm_pdp4_descriptor(struct xe_vm *vm, struct xe_tile *tile)
2026 {
2027 	return vm->pt_ops->pde_encode_bo(vm->pt_root[tile->id]->bo, 0,
2028 					 tile_to_xe(tile)->pat.idx[XE_CACHE_WB]);
2029 }
2030 
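     /* Pick the queue to wait on: the supplied exec queue, or the VM's default queue */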
2031 static struct xe_exec_queue *
2032 to_wait_exec_queue(struct xe_vm *vm, struct xe_exec_queue *q)
2033 {
2034 	return q ? q : vm->q[0];
2035 }
2036 
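     /* Return a reference to the first user fence found in the sync array, if any */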
2037 static struct xe_user_fence *
2038 find_ufence_get(struct xe_sync_entry *syncs, u32 num_syncs)
2039 {
2040 	unsigned int i;
2041 
2042 	for (i = 0; i < num_syncs; i++) {
2043 		struct xe_sync_entry *e = &syncs[i];
2044 
2045 		if (xe_sync_is_ufence(e))
2046 			return xe_sync_ufence_get(e);
2047 	}
2048 
2049 	return NULL;
2050 }
2051 
2052 #define ALL_DRM_XE_VM_CREATE_FLAGS (DRM_XE_VM_CREATE_FLAG_SCRATCH_PAGE | \
2053 				    DRM_XE_VM_CREATE_FLAG_LR_MODE | \
2054 				    DRM_XE_VM_CREATE_FLAG_FAULT_MODE)
2055 
2056 int xe_vm_create_ioctl(struct drm_device *dev, void *data,
2057 		       struct drm_file *file)
2058 {
2059 	struct xe_device *xe = to_xe_device(dev);
2060 	struct xe_file *xef = to_xe_file(file);
2061 	struct drm_xe_vm_create *args = data;
2062 	struct xe_tile *tile;
2063 	struct xe_vm *vm;
2064 	u32 id, asid;
2065 	int err;
2066 	u32 flags = 0;
2067 
2068 	if (XE_IOCTL_DBG(xe, args->extensions))
2069 		return -EINVAL;
2070 
2071 	if (XE_WA(xe_root_mmio_gt(xe), 14016763929))
2072 		args->flags |= DRM_XE_VM_CREATE_FLAG_SCRATCH_PAGE;
2073 
2074 	if (XE_IOCTL_DBG(xe, args->flags & DRM_XE_VM_CREATE_FLAG_FAULT_MODE &&
2075 			 !xe->info.has_usm))
2076 		return -EINVAL;
2077 
2078 	if (XE_IOCTL_DBG(xe, args->reserved[0] || args->reserved[1]))
2079 		return -EINVAL;
2080 
2081 	if (XE_IOCTL_DBG(xe, args->flags & ~ALL_DRM_XE_VM_CREATE_FLAGS))
2082 		return -EINVAL;
2083 
2084 	if (XE_IOCTL_DBG(xe, args->flags & DRM_XE_VM_CREATE_FLAG_SCRATCH_PAGE &&
2085 			 args->flags & DRM_XE_VM_CREATE_FLAG_FAULT_MODE &&
2086 			 !xe->info.needs_scratch))
2087 		return -EINVAL;
2088 
2089 	if (XE_IOCTL_DBG(xe, !(args->flags & DRM_XE_VM_CREATE_FLAG_LR_MODE) &&
2090 			 args->flags & DRM_XE_VM_CREATE_FLAG_FAULT_MODE))
2091 		return -EINVAL;
2092 
2093 	if (args->flags & DRM_XE_VM_CREATE_FLAG_SCRATCH_PAGE)
2094 		flags |= XE_VM_FLAG_SCRATCH_PAGE;
2095 	if (args->flags & DRM_XE_VM_CREATE_FLAG_LR_MODE)
2096 		flags |= XE_VM_FLAG_LR_MODE;
2097 	if (args->flags & DRM_XE_VM_CREATE_FLAG_FAULT_MODE)
2098 		flags |= XE_VM_FLAG_FAULT_MODE;
2099 
2100 	vm = xe_vm_create(xe, flags);
2101 	if (IS_ERR(vm))
2102 		return PTR_ERR(vm);
2103 
2104 	if (xe->info.has_asid) {
2105 		down_write(&xe->usm.lock);
2106 		err = xa_alloc_cyclic(&xe->usm.asid_to_vm, &asid, vm,
2107 				      XA_LIMIT(1, XE_MAX_ASID - 1),
2108 				      &xe->usm.next_asid, GFP_KERNEL);
2109 		up_write(&xe->usm.lock);
2110 		if (err < 0)
2111 			goto err_close_and_put;
2112 
2113 		vm->usm.asid = asid;
2114 	}
2115 
2116 	vm->xef = xe_file_get(xef);
2117 
2118 	/* Record the VM's pagetable BO memory against the client */
2119 	for_each_tile(tile, xe, id)
2120 		if (vm->pt_root[id])
2121 			xe_drm_client_add_bo(vm->xef->client, vm->pt_root[id]->bo);
2122 
2123 #if IS_ENABLED(CONFIG_DRM_XE_DEBUG_MEM)
2124 	/* Warning: Security issue - never enable by default */
2125 	args->reserved[0] = xe_bo_main_addr(vm->pt_root[0]->bo, XE_PAGE_SIZE);
2126 #endif
2127 
2128 	/* The user VM id allocation must always be last in the ioctl to prevent a UAF */
2129 	err = xa_alloc(&xef->vm.xa, &id, vm, xa_limit_32b, GFP_KERNEL);
2130 	if (err)
2131 		goto err_close_and_put;
2132 
2133 	args->vm_id = id;
2134 
2135 	return 0;
2136 
2137 err_close_and_put:
2138 	xe_vm_close_and_put(vm);
2139 
2140 	return err;
2141 }
2142 
2143 int xe_vm_destroy_ioctl(struct drm_device *dev, void *data,
2144 			struct drm_file *file)
2145 {
2146 	struct xe_device *xe = to_xe_device(dev);
2147 	struct xe_file *xef = to_xe_file(file);
2148 	struct drm_xe_vm_destroy *args = data;
2149 	struct xe_vm *vm;
2150 	int err = 0;
2151 
2152 	if (XE_IOCTL_DBG(xe, args->pad) ||
2153 	    XE_IOCTL_DBG(xe, args->reserved[0] || args->reserved[1]))
2154 		return -EINVAL;
2155 
2156 	mutex_lock(&xef->vm.lock);
2157 	vm = xa_load(&xef->vm.xa, args->vm_id);
2158 	if (XE_IOCTL_DBG(xe, !vm))
2159 		err = -ENOENT;
2160 	else if (XE_IOCTL_DBG(xe, vm->preempt.num_exec_queues))
2161 		err = -EBUSY;
2162 	else
2163 		xa_erase(&xef->vm.xa, args->vm_id);
2164 	mutex_unlock(&xef->vm.lock);
2165 
2166 	if (!err)
2167 		xe_vm_close_and_put(vm);
2168 
2169 	return err;
2170 }
2171 
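     /* Check whether the 4K page at @page_addr overlaps @vma */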
2172 static bool vma_matches(struct xe_vma *vma, u64 page_addr)
2173 {
2174 	if (page_addr > xe_vma_end(vma) - 1 ||
2175 	    page_addr + SZ_4K - 1 < xe_vma_start(vma))
2176 		return false;
2177 
2178 	return true;
2179 }
2180 
2181 /**
2182  * xe_vm_find_vma_by_addr() - Find a VMA by its address
2183  *
2184  * @vm: the xe_vm the vma belongs to
2185  * @page_addr: address to look up
2186  *
      * Return: the VMA covering @page_addr, or NULL if no such VMA exists.
      */
2187 struct xe_vma *xe_vm_find_vma_by_addr(struct xe_vm *vm, u64 page_addr)
2188 {
2189 	struct xe_vma *vma = NULL;
2190 
2191 	if (vm->usm.last_fault_vma) {   /* Fast lookup */
2192 		if (vma_matches(vm->usm.last_fault_vma, page_addr))
2193 			vma = vm->usm.last_fault_vma;
2194 	}
2195 	if (!vma)
2196 		vma = xe_vm_find_overlapping_vma(vm, page_addr, SZ_4K);
2197 
2198 	return vma;
2199 }
2200 
2201 static const u32 region_to_mem_type[] = {
2202 	XE_PL_TT,
2203 	XE_PL_VRAM0,
2204 	XE_PL_VRAM1,
2205 };
2206 
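     /*
      * Mark a VMA as destroyed under the userptr notifier lock and, if it has
      * already been committed, remove it from the VM.
      */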
2207 static void prep_vma_destroy(struct xe_vm *vm, struct xe_vma *vma,
2208 			     bool post_commit)
2209 {
2210 	down_read(&vm->userptr.notifier_lock);
2211 	vma->gpuva.flags |= XE_VMA_DESTROYED;
2212 	up_read(&vm->userptr.notifier_lock);
2213 	if (post_commit)
2214 		xe_vm_remove_vma(vm, vma);
2215 }
2216 
2217 #undef ULL
2218 #define ULL	unsigned long long
2219 
2220 #if IS_ENABLED(CONFIG_DRM_XE_DEBUG_VM)
2221 static void print_op(struct xe_device *xe, struct drm_gpuva_op *op)
2222 {
2223 	struct xe_vma *vma;
2224 
2225 	switch (op->op) {
2226 	case DRM_GPUVA_OP_MAP:
2227 		vm_dbg(&xe->drm, "MAP: addr=0x%016llx, range=0x%016llx",
2228 		       (ULL)op->map.va.addr, (ULL)op->map.va.range);
2229 		break;
2230 	case DRM_GPUVA_OP_REMAP:
2231 		vma = gpuva_to_vma(op->remap.unmap->va);
2232 		vm_dbg(&xe->drm, "REMAP:UNMAP: addr=0x%016llx, range=0x%016llx, keep=%d",
2233 		       (ULL)xe_vma_start(vma), (ULL)xe_vma_size(vma),
2234 		       op->remap.unmap->keep ? 1 : 0);
2235 		if (op->remap.prev)
2236 			vm_dbg(&xe->drm,
2237 			       "REMAP:PREV: addr=0x%016llx, range=0x%016llx",
2238 			       (ULL)op->remap.prev->va.addr,
2239 			       (ULL)op->remap.prev->va.range);
2240 		if (op->remap.next)
2241 			vm_dbg(&xe->drm,
2242 			       "REMAP:NEXT: addr=0x%016llx, range=0x%016llx",
2243 			       (ULL)op->remap.next->va.addr,
2244 			       (ULL)op->remap.next->va.range);
2245 		break;
2246 	case DRM_GPUVA_OP_UNMAP:
2247 		vma = gpuva_to_vma(op->unmap.va);
2248 		vm_dbg(&xe->drm, "UNMAP: addr=0x%016llx, range=0x%016llx, keep=%d",
2249 		       (ULL)xe_vma_start(vma), (ULL)xe_vma_size(vma),
2250 		       op->unmap.keep ? 1 : 0);
2251 		break;
2252 	case DRM_GPUVA_OP_PREFETCH:
2253 		vma = gpuva_to_vma(op->prefetch.va);
2254 		vm_dbg(&xe->drm, "PREFETCH: addr=0x%016llx, range=0x%016llx",
2255 		       (ULL)xe_vma_start(vma), (ULL)xe_vma_size(vma));
2256 		break;
2257 	default:
2258 		drm_warn(&xe->drm, "NOT POSSIBLE");
2259 	}
2260 }
2261 #else
2262 static void print_op(struct xe_device *xe, struct drm_gpuva_op *op)
2263 {
2264 }
2265 #endif
2266 
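     /*
      * Scratch PTEs covering the bind range need to be cleared at bind time when
      * the VM is in fault mode, has scratch pages and the bind is not immediate.
      */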
2267 static bool __xe_vm_needs_clear_scratch_pages(struct xe_vm *vm, u32 bind_flags)
2268 {
2269 	if (!xe_vm_in_fault_mode(vm))
2270 		return false;
2271 
2272 	if (!xe_vm_has_scratch(vm))
2273 		return false;
2274 
2275 	if (bind_flags & DRM_XE_VM_BIND_FLAG_IMMEDIATE)
2276 		return false;
2277 
2278 	return true;
2279 }
2280 
2281 static void xe_svm_prefetch_gpuva_ops_fini(struct drm_gpuva_ops *ops)
2282 {
2283 	struct drm_gpuva_op *__op;
2284 
2285 	drm_gpuva_for_each_op(__op, ops) {
2286 		struct xe_vma_op *op = gpuva_op_to_vma_op(__op);
2287 
2288 		xe_vma_svm_prefetch_op_fini(op);
2289 	}
2290 }
2291 
2292 /*
2293  * Create the operations list from the IOCTL arguments and set up operation
2294  * fields so the parse and commit steps are decoupled from them. This step can fail.
2295  */
2296 static struct drm_gpuva_ops *
2297 vm_bind_ioctl_ops_create(struct xe_vm *vm, struct xe_vma_ops *vops,
2298 			 struct xe_bo *bo, u64 bo_offset_or_userptr,
2299 			 u64 addr, u64 range,
2300 			 u32 operation, u32 flags,
2301 			 u32 prefetch_region, u16 pat_index)
2302 {
2303 	struct drm_gem_object *obj = bo ? &bo->ttm.base : NULL;
2304 	struct drm_gpuva_ops *ops;
2305 	struct drm_gpuva_op *__op;
2306 	struct drm_gpuvm_bo *vm_bo;
2307 	u64 range_end = addr + range;
2308 	int err;
2309 
2310 	lockdep_assert_held_write(&vm->lock);
2311 
2312 	vm_dbg(&vm->xe->drm,
2313 	       "op=%d, addr=0x%016llx, range=0x%016llx, bo_offset_or_userptr=0x%016llx",
2314 	       operation, (ULL)addr, (ULL)range,
2315 	       (ULL)bo_offset_or_userptr);
2316 
2317 	switch (operation) {
2318 	case DRM_XE_VM_BIND_OP_MAP:
2319 	case DRM_XE_VM_BIND_OP_MAP_USERPTR: {
2320 		struct drm_gpuvm_map_req map_req = {
2321 			.map.va.addr = addr,
2322 			.map.va.range = range,
2323 			.map.gem.obj = obj,
2324 			.map.gem.offset = bo_offset_or_userptr,
2325 		};
2326 
2327 		ops = drm_gpuvm_sm_map_ops_create(&vm->gpuvm, &map_req);
2328 		break;
2329 	}
2330 	case DRM_XE_VM_BIND_OP_UNMAP:
2331 		ops = drm_gpuvm_sm_unmap_ops_create(&vm->gpuvm, addr, range);
2332 		break;
2333 	case DRM_XE_VM_BIND_OP_PREFETCH:
2334 		ops = drm_gpuvm_prefetch_ops_create(&vm->gpuvm, addr, range);
2335 		break;
2336 	case DRM_XE_VM_BIND_OP_UNMAP_ALL:
2337 		xe_assert(vm->xe, bo);
2338 
2339 		err = xe_bo_lock(bo, true);
2340 		if (err)
2341 			return ERR_PTR(err);
2342 
2343 		vm_bo = drm_gpuvm_bo_obtain(&vm->gpuvm, obj);
2344 		if (IS_ERR(vm_bo)) {
2345 			xe_bo_unlock(bo);
2346 			return ERR_CAST(vm_bo);
2347 		}
2348 
2349 		ops = drm_gpuvm_bo_unmap_ops_create(vm_bo);
2350 		drm_gpuvm_bo_put(vm_bo);
2351 		xe_bo_unlock(bo);
2352 		break;
2353 	default:
2354 		drm_warn(&vm->xe->drm, "NOT POSSIBLE");
2355 		ops = ERR_PTR(-EINVAL);
2356 	}
2357 	if (IS_ERR(ops))
2358 		return ops;
2359 
2360 	drm_gpuva_for_each_op(__op, ops) {
2361 		struct xe_vma_op *op = gpuva_op_to_vma_op(__op);
2362 
2363 		if (__op->op == DRM_GPUVA_OP_MAP) {
2364 			op->map.immediate =
2365 				flags & DRM_XE_VM_BIND_FLAG_IMMEDIATE;
2366 			op->map.read_only =
2367 				flags & DRM_XE_VM_BIND_FLAG_READONLY;
2368 			op->map.is_null = flags & DRM_XE_VM_BIND_FLAG_NULL;
2369 			op->map.is_cpu_addr_mirror = flags &
2370 				DRM_XE_VM_BIND_FLAG_CPU_ADDR_MIRROR;
2371 			op->map.dumpable = flags & DRM_XE_VM_BIND_FLAG_DUMPABLE;
2372 			op->map.pat_index = pat_index;
2373 			op->map.invalidate_on_bind =
2374 				__xe_vm_needs_clear_scratch_pages(vm, flags);
2375 		} else if (__op->op == DRM_GPUVA_OP_PREFETCH) {
2376 			struct xe_vma *vma = gpuva_to_vma(op->base.prefetch.va);
2377 			struct xe_svm_range *svm_range;
2378 			struct drm_gpusvm_ctx ctx = {};
2379 			struct xe_tile *tile;
2380 			u8 id, tile_mask = 0;
2381 			u32 i;
2382 
2383 			if (!xe_vma_is_cpu_addr_mirror(vma)) {
2384 				op->prefetch.region = prefetch_region;
2385 				break;
2386 			}
2387 
2388 			ctx.read_only = xe_vma_read_only(vma);
2389 			ctx.devmem_possible = IS_DGFX(vm->xe) &&
2390 					      IS_ENABLED(CONFIG_DRM_XE_PAGEMAP);
2391 
2392 			for_each_tile(tile, vm->xe, id)
2393 				tile_mask |= 0x1 << id;
2394 
2395 			xa_init_flags(&op->prefetch_range.range, XA_FLAGS_ALLOC);
2396 			op->prefetch_range.region = prefetch_region;
2397 			op->prefetch_range.ranges_count = 0;
2398 alloc_next_range:
2399 			svm_range = xe_svm_range_find_or_insert(vm, addr, vma, &ctx);
2400 
2401 			if (PTR_ERR(svm_range) == -ENOENT) {
2402 				u64 ret = xe_svm_find_vma_start(vm, addr, range_end, vma);
2403 
2404 				addr = ret == ULONG_MAX ? 0 : ret;
2405 				if (addr)
2406 					goto alloc_next_range;
2407 				else
2408 					goto print_op_label;
2409 			}
2410 
2411 			if (IS_ERR(svm_range)) {
2412 				err = PTR_ERR(svm_range);
2413 				goto unwind_prefetch_ops;
2414 			}
2415 
2416 			if (xe_svm_range_validate(vm, svm_range, tile_mask, !!prefetch_region)) {
2417 				xe_svm_range_debug(svm_range, "PREFETCH - RANGE IS VALID");
2418 				goto check_next_range;
2419 			}
2420 
2421 			err = xa_alloc(&op->prefetch_range.range,
2422 				       &i, svm_range, xa_limit_32b,
2423 				       GFP_KERNEL);
2424 
2425 			if (err)
2426 				goto unwind_prefetch_ops;
2427 
2428 			op->prefetch_range.ranges_count++;
2429 			vops->flags |= XE_VMA_OPS_FLAG_HAS_SVM_PREFETCH;
2430 			xe_svm_range_debug(svm_range, "PREFETCH - RANGE CREATED");
2431 check_next_range:
2432 			if (range_end > xe_svm_range_end(svm_range) &&
2433 			    xe_svm_range_end(svm_range) < xe_vma_end(vma)) {
2434 				addr = xe_svm_range_end(svm_range);
2435 				goto alloc_next_range;
2436 			}
2437 		}
2438 print_op_label:
2439 		print_op(vm->xe, __op);
2440 	}
2441 
2442 	return ops;
2443 
2444 unwind_prefetch_ops:
2445 	xe_svm_prefetch_gpuva_ops_fini(ops);
2446 	drm_gpuva_ops_free(&vm->gpuvm, ops);
2447 	return ERR_PTR(err);
2448 }
2449 
2450 ALLOW_ERROR_INJECTION(vm_bind_ioctl_ops_create, ERRNO);
2451 
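     /*
      * Allocate and initialize a VMA for a MAP op: lock the VM's resv (for an
      * external BO) and the BO itself, create the VMA, then pin userptr pages
      * or add preempt fences for an external BO. The VMA is destroyed again on
      * failure.
      */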
2452 static struct xe_vma *new_vma(struct xe_vm *vm, struct drm_gpuva_op_map *op,
2453 			      u16 pat_index, unsigned int flags)
2454 {
2455 	struct xe_bo *bo = op->gem.obj ? gem_to_xe_bo(op->gem.obj) : NULL;
2456 	struct drm_exec exec;
2457 	struct xe_vma *vma;
2458 	int err = 0;
2459 
2460 	lockdep_assert_held_write(&vm->lock);
2461 
2462 	if (bo) {
2463 		drm_exec_init(&exec, DRM_EXEC_INTERRUPTIBLE_WAIT, 0);
2464 		drm_exec_until_all_locked(&exec) {
2465 			err = 0;
2466 			if (!bo->vm) {
2467 				err = drm_exec_lock_obj(&exec, xe_vm_obj(vm));
2468 				drm_exec_retry_on_contention(&exec);
2469 			}
2470 			if (!err) {
2471 				err = drm_exec_lock_obj(&exec, &bo->ttm.base);
2472 				drm_exec_retry_on_contention(&exec);
2473 			}
2474 			if (err) {
2475 				drm_exec_fini(&exec);
2476 				return ERR_PTR(err);
2477 			}
2478 		}
2479 	}
2480 	vma = xe_vma_create(vm, bo, op->gem.offset,
2481 			    op->va.addr, op->va.addr +
2482 			    op->va.range - 1, pat_index, flags);
2483 	if (IS_ERR(vma))
2484 		goto err_unlock;
2485 
2486 	if (xe_vma_is_userptr(vma))
2487 		err = xe_vma_userptr_pin_pages(to_userptr_vma(vma));
2488 	else if (!xe_vma_has_no_bo(vma) && !bo->vm)
2489 		err = add_preempt_fences(vm, bo);
2490 
2491 err_unlock:
2492 	if (bo)
2493 		drm_exec_fini(&exec);
2494 
2495 	if (err) {
2496 		prep_vma_destroy(vm, vma, false);
2497 		xe_vma_destroy_unlocked(vma);
2498 		vma = ERR_PTR(err);
2499 	}
2500 
2501 	return vma;
2502 }
2503 
2504 static u64 xe_vma_max_pte_size(struct xe_vma *vma)
2505 {
2506 	if (vma->gpuva.flags & XE_VMA_PTE_1G)
2507 		return SZ_1G;
2508 	else if (vma->gpuva.flags & (XE_VMA_PTE_2M | XE_VMA_PTE_COMPACT))
2509 		return SZ_2M;
2510 	else if (vma->gpuva.flags & XE_VMA_PTE_64K)
2511 		return SZ_64K;
2512 	else if (vma->gpuva.flags & XE_VMA_PTE_4K)
2513 		return SZ_4K;
2514 
2515 	return SZ_1G;	/* Uninitialized, use max size */
2516 }
2517 
2518 static void xe_vma_set_pte_size(struct xe_vma *vma, u64 size)
2519 {
2520 	switch (size) {
2521 	case SZ_1G:
2522 		vma->gpuva.flags |= XE_VMA_PTE_1G;
2523 		break;
2524 	case SZ_2M:
2525 		vma->gpuva.flags |= XE_VMA_PTE_2M;
2526 		break;
2527 	case SZ_64K:
2528 		vma->gpuva.flags |= XE_VMA_PTE_64K;
2529 		break;
2530 	case SZ_4K:
2531 		vma->gpuva.flags |= XE_VMA_PTE_4K;
2532 		break;
2533 	}
2534 }
2535 
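     /*
      * Commit a parsed operation into the VM: insert newly created VMAs, mark
      * unmapped VMAs as destroyed, and set the COMMITTED flags used for unwind.
      */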
2536 static int xe_vma_op_commit(struct xe_vm *vm, struct xe_vma_op *op)
2537 {
2538 	int err = 0;
2539 
2540 	lockdep_assert_held_write(&vm->lock);
2541 
2542 	switch (op->base.op) {
2543 	case DRM_GPUVA_OP_MAP:
2544 		err |= xe_vm_insert_vma(vm, op->map.vma);
2545 		if (!err)
2546 			op->flags |= XE_VMA_OP_COMMITTED;
2547 		break;
2548 	case DRM_GPUVA_OP_REMAP:
2549 	{
2550 		u8 tile_present =
2551 			gpuva_to_vma(op->base.remap.unmap->va)->tile_present;
2552 
2553 		prep_vma_destroy(vm, gpuva_to_vma(op->base.remap.unmap->va),
2554 				 true);
2555 		op->flags |= XE_VMA_OP_COMMITTED;
2556 
2557 		if (op->remap.prev) {
2558 			err |= xe_vm_insert_vma(vm, op->remap.prev);
2559 			if (!err)
2560 				op->flags |= XE_VMA_OP_PREV_COMMITTED;
2561 			if (!err && op->remap.skip_prev) {
2562 				op->remap.prev->tile_present =
2563 					tile_present;
2564 				op->remap.prev = NULL;
2565 			}
2566 		}
2567 		if (op->remap.next) {
2568 			err |= xe_vm_insert_vma(vm, op->remap.next);
2569 			if (!err)
2570 				op->flags |= XE_VMA_OP_NEXT_COMMITTED;
2571 			if (!err && op->remap.skip_next) {
2572 				op->remap.next->tile_present =
2573 					tile_present;
2574 				op->remap.next = NULL;
2575 			}
2576 		}
2577 
2578 		/* Adjust for partial unbind after removing VMA from VM */
2579 		if (!err) {
2580 			op->base.remap.unmap->va->va.addr = op->remap.start;
2581 			op->base.remap.unmap->va->va.range = op->remap.range;
2582 		}
2583 		break;
2584 	}
2585 	case DRM_GPUVA_OP_UNMAP:
2586 		prep_vma_destroy(vm, gpuva_to_vma(op->base.unmap.va), true);
2587 		op->flags |= XE_VMA_OP_COMMITTED;
2588 		break;
2589 	case DRM_GPUVA_OP_PREFETCH:
2590 		op->flags |= XE_VMA_OP_COMMITTED;
2591 		break;
2592 	default:
2593 		drm_warn(&vm->xe->drm, "NOT POSSIBLE");
2594 	}
2595 
2596 	return err;
2597 }
2598 
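     /*
      * Turn the GPUVA ops into driver VMA ops: create VMAs for MAP/REMAP,
      * account the required page-table updates per tile and commit each op so
      * the whole series can be unwound if a later step fails.
      */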
2599 static int vm_bind_ioctl_ops_parse(struct xe_vm *vm, struct drm_gpuva_ops *ops,
2600 				   struct xe_vma_ops *vops)
2601 {
2602 	struct xe_device *xe = vm->xe;
2603 	struct drm_gpuva_op *__op;
2604 	struct xe_tile *tile;
2605 	u8 id, tile_mask = 0;
2606 	int err = 0;
2607 
2608 	lockdep_assert_held_write(&vm->lock);
2609 
2610 	for_each_tile(tile, vm->xe, id)
2611 		tile_mask |= 0x1 << id;
2612 
2613 	drm_gpuva_for_each_op(__op, ops) {
2614 		struct xe_vma_op *op = gpuva_op_to_vma_op(__op);
2615 		struct xe_vma *vma;
2616 		unsigned int flags = 0;
2617 
2618 		INIT_LIST_HEAD(&op->link);
2619 		list_add_tail(&op->link, &vops->list);
2620 		op->tile_mask = tile_mask;
2621 
2622 		switch (op->base.op) {
2623 		case DRM_GPUVA_OP_MAP:
2624 		{
2625 			flags |= op->map.read_only ?
2626 				VMA_CREATE_FLAG_READ_ONLY : 0;
2627 			flags |= op->map.is_null ?
2628 				VMA_CREATE_FLAG_IS_NULL : 0;
2629 			flags |= op->map.dumpable ?
2630 				VMA_CREATE_FLAG_DUMPABLE : 0;
2631 			flags |= op->map.is_cpu_addr_mirror ?
2632 				VMA_CREATE_FLAG_IS_SYSTEM_ALLOCATOR : 0;
2633 
2634 			vma = new_vma(vm, &op->base.map, op->map.pat_index,
2635 				      flags);
2636 			if (IS_ERR(vma))
2637 				return PTR_ERR(vma);
2638 
2639 			op->map.vma = vma;
2640 			if (((op->map.immediate || !xe_vm_in_fault_mode(vm)) &&
2641 			     !op->map.is_cpu_addr_mirror) ||
2642 			    op->map.invalidate_on_bind)
2643 				xe_vma_ops_incr_pt_update_ops(vops,
2644 							      op->tile_mask, 1);
2645 			break;
2646 		}
2647 		case DRM_GPUVA_OP_REMAP:
2648 		{
2649 			struct xe_vma *old =
2650 				gpuva_to_vma(op->base.remap.unmap->va);
2651 			bool skip = xe_vma_is_cpu_addr_mirror(old);
2652 			u64 start = xe_vma_start(old), end = xe_vma_end(old);
2653 			int num_remap_ops = 0;
2654 
2655 			if (op->base.remap.prev)
2656 				start = op->base.remap.prev->va.addr +
2657 					op->base.remap.prev->va.range;
2658 			if (op->base.remap.next)
2659 				end = op->base.remap.next->va.addr;
2660 
2661 			if (xe_vma_is_cpu_addr_mirror(old) &&
2662 			    xe_svm_has_mapping(vm, start, end))
2663 				return -EBUSY;
2664 
2665 			op->remap.start = xe_vma_start(old);
2666 			op->remap.range = xe_vma_size(old);
2667 
2668 			flags |= op->base.remap.unmap->va->flags &
2669 				XE_VMA_READ_ONLY ?
2670 				VMA_CREATE_FLAG_READ_ONLY : 0;
2671 			flags |= op->base.remap.unmap->va->flags &
2672 				DRM_GPUVA_SPARSE ?
2673 				VMA_CREATE_FLAG_IS_NULL : 0;
2674 			flags |= op->base.remap.unmap->va->flags &
2675 				XE_VMA_DUMPABLE ?
2676 				VMA_CREATE_FLAG_DUMPABLE : 0;
2677 			flags |= xe_vma_is_cpu_addr_mirror(old) ?
2678 				VMA_CREATE_FLAG_IS_SYSTEM_ALLOCATOR : 0;
2679 
2680 			if (op->base.remap.prev) {
2681 				vma = new_vma(vm, op->base.remap.prev,
2682 					      old->pat_index, flags);
2683 				if (IS_ERR(vma))
2684 					return PTR_ERR(vma);
2685 
2686 				op->remap.prev = vma;
2687 
2688 				/*
2689 				 * Userptr creates a new SG mapping so
2690 				 * we must also rebind.
2691 				 */
2692 				op->remap.skip_prev = skip ||
2693 					(!xe_vma_is_userptr(old) &&
2694 					IS_ALIGNED(xe_vma_end(vma),
2695 						   xe_vma_max_pte_size(old)));
2696 				if (op->remap.skip_prev) {
2697 					xe_vma_set_pte_size(vma, xe_vma_max_pte_size(old));
2698 					op->remap.range -=
2699 						xe_vma_end(vma) -
2700 						xe_vma_start(old);
2701 					op->remap.start = xe_vma_end(vma);
2702 					vm_dbg(&xe->drm, "REMAP:SKIP_PREV: addr=0x%016llx, range=0x%016llx",
2703 					       (ULL)op->remap.start,
2704 					       (ULL)op->remap.range);
2705 				} else {
2706 					num_remap_ops++;
2707 				}
2708 			}
2709 
2710 			if (op->base.remap.next) {
2711 				vma = new_vma(vm, op->base.remap.next,
2712 					      old->pat_index, flags);
2713 				if (IS_ERR(vma))
2714 					return PTR_ERR(vma);
2715 
2716 				op->remap.next = vma;
2717 
2718 				/*
2719 				 * Userptr creates a new SG mapping so
2720 				 * we must also rebind.
2721 				 */
2722 				op->remap.skip_next = skip ||
2723 					(!xe_vma_is_userptr(old) &&
2724 					IS_ALIGNED(xe_vma_start(vma),
2725 						   xe_vma_max_pte_size(old)));
2726 				if (op->remap.skip_next) {
2727 					xe_vma_set_pte_size(vma, xe_vma_max_pte_size(old));
2728 					op->remap.range -=
2729 						xe_vma_end(old) -
2730 						xe_vma_start(vma);
2731 					vm_dbg(&xe->drm, "REMAP:SKIP_NEXT: addr=0x%016llx, range=0x%016llx",
2732 					       (ULL)op->remap.start,
2733 					       (ULL)op->remap.range);
2734 				} else {
2735 					num_remap_ops++;
2736 				}
2737 			}
2738 			if (!skip)
2739 				num_remap_ops++;
2740 
2741 			xe_vma_ops_incr_pt_update_ops(vops, op->tile_mask, num_remap_ops);
2742 			break;
2743 		}
2744 		case DRM_GPUVA_OP_UNMAP:
2745 			vma = gpuva_to_vma(op->base.unmap.va);
2746 
2747 			if (xe_vma_is_cpu_addr_mirror(vma) &&
2748 			    xe_svm_has_mapping(vm, xe_vma_start(vma),
2749 					       xe_vma_end(vma)))
2750 				return -EBUSY;
2751 
2752 			if (!xe_vma_is_cpu_addr_mirror(vma))
2753 				xe_vma_ops_incr_pt_update_ops(vops, op->tile_mask, 1);
2754 			break;
2755 		case DRM_GPUVA_OP_PREFETCH:
2756 			vma = gpuva_to_vma(op->base.prefetch.va);
2757 
2758 			if (xe_vma_is_userptr(vma)) {
2759 				err = xe_vma_userptr_pin_pages(to_userptr_vma(vma));
2760 				if (err)
2761 					return err;
2762 			}
2763 
2764 			if (xe_vma_is_cpu_addr_mirror(vma))
2765 				xe_vma_ops_incr_pt_update_ops(vops, op->tile_mask,
2766 							      op->prefetch_range.ranges_count);
2767 			else
2768 				xe_vma_ops_incr_pt_update_ops(vops, op->tile_mask, 1);
2769 
2770 			break;
2771 		default:
2772 			drm_warn(&vm->xe->drm, "NOT POSSIBLE");
2773 		}
2774 
2775 		err = xe_vma_op_commit(vm, op);
2776 		if (err)
2777 			return err;
2778 	}
2779 
2780 	return 0;
2781 }
2782 
2783 static void xe_vma_op_unwind(struct xe_vm *vm, struct xe_vma_op *op,
2784 			     bool post_commit, bool prev_post_commit,
2785 			     bool next_post_commit)
2786 {
2787 	lockdep_assert_held_write(&vm->lock);
2788 
2789 	switch (op->base.op) {
2790 	case DRM_GPUVA_OP_MAP:
2791 		if (op->map.vma) {
2792 			prep_vma_destroy(vm, op->map.vma, post_commit);
2793 			xe_vma_destroy_unlocked(op->map.vma);
2794 		}
2795 		break;
2796 	case DRM_GPUVA_OP_UNMAP:
2797 	{
2798 		struct xe_vma *vma = gpuva_to_vma(op->base.unmap.va);
2799 
2800 		if (vma) {
2801 			down_read(&vm->userptr.notifier_lock);
2802 			vma->gpuva.flags &= ~XE_VMA_DESTROYED;
2803 			up_read(&vm->userptr.notifier_lock);
2804 			if (post_commit)
2805 				xe_vm_insert_vma(vm, vma);
2806 		}
2807 		break;
2808 	}
2809 	case DRM_GPUVA_OP_REMAP:
2810 	{
2811 		struct xe_vma *vma = gpuva_to_vma(op->base.remap.unmap->va);
2812 
2813 		if (op->remap.prev) {
2814 			prep_vma_destroy(vm, op->remap.prev, prev_post_commit);
2815 			xe_vma_destroy_unlocked(op->remap.prev);
2816 		}
2817 		if (op->remap.next) {
2818 			prep_vma_destroy(vm, op->remap.next, next_post_commit);
2819 			xe_vma_destroy_unlocked(op->remap.next);
2820 		}
2821 		if (vma) {
2822 			down_read(&vm->userptr.notifier_lock);
2823 			vma->gpuva.flags &= ~XE_VMA_DESTROYED;
2824 			up_read(&vm->userptr.notifier_lock);
2825 			if (post_commit)
2826 				xe_vm_insert_vma(vm, vma);
2827 		}
2828 		break;
2829 	}
2830 	case DRM_GPUVA_OP_PREFETCH:
2831 		/* Nothing to do */
2832 		break;
2833 	default:
2834 		drm_warn(&vm->xe->drm, "NOT POSSIBLE");
2835 	}
2836 }
2837 
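     /* Unwind all committed operations of every ops list, in reverse order */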
2838 static void vm_bind_ioctl_ops_unwind(struct xe_vm *vm,
2839 				     struct drm_gpuva_ops **ops,
2840 				     int num_ops_list)
2841 {
2842 	int i;
2843 
2844 	for (i = num_ops_list - 1; i >= 0; --i) {
2845 		struct drm_gpuva_ops *__ops = ops[i];
2846 		struct drm_gpuva_op *__op;
2847 
2848 		if (!__ops)
2849 			continue;
2850 
2851 		drm_gpuva_for_each_op_reverse(__op, __ops) {
2852 			struct xe_vma_op *op = gpuva_op_to_vma_op(__op);
2853 
2854 			xe_vma_op_unwind(vm, op,
2855 					 op->flags & XE_VMA_OP_COMMITTED,
2856 					 op->flags & XE_VMA_OP_PREV_COMMITTED,
2857 					 op->flags & XE_VMA_OP_NEXT_COMMITTED);
2858 		}
2859 	}
2860 }
2861 
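     /*
      * Lock the VMA's BO if it is external (VM-private BOs share the VM's resv,
      * which is already locked) and optionally validate it.
      */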
2862 static int vma_lock_and_validate(struct drm_exec *exec, struct xe_vma *vma,
2863 				 bool validate)
2864 {
2865 	struct xe_bo *bo = xe_vma_bo(vma);
2866 	struct xe_vm *vm = xe_vma_vm(vma);
2867 	int err = 0;
2868 
2869 	if (bo) {
2870 		if (!bo->vm)
2871 			err = drm_exec_lock_obj(exec, &bo->ttm.base);
2872 		if (!err && validate)
2873 			err = xe_bo_validate(bo, vm,
2874 					     !xe_vm_in_preempt_fence_mode(vm));
2875 	}
2876 
2877 	return err;
2878 }
2879 
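     /* Reject with -EBUSY if the VMA's user fence has not signalled; otherwise drop it */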
2880 static int check_ufence(struct xe_vma *vma)
2881 {
2882 	if (vma->ufence) {
2883 		struct xe_user_fence * const f = vma->ufence;
2884 
2885 		if (!xe_sync_ufence_get_status(f))
2886 			return -EBUSY;
2887 
2888 		vma->ufence = NULL;
2889 		xe_sync_ufence_put(f);
2890 	}
2891 
2892 	return 0;
2893 }
2894 
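     /*
      * For an SVM prefetch op, migrate each attached range to the requested
      * region (SMEM or VRAM) and populate its pages ahead of the bind.
      */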
2895 static int prefetch_ranges(struct xe_vm *vm, struct xe_vma_op *op)
2896 {
2897 	bool devmem_possible = IS_DGFX(vm->xe) && IS_ENABLED(CONFIG_DRM_XE_PAGEMAP);
2898 	struct xe_vma *vma = gpuva_to_vma(op->base.prefetch.va);
2899 	int err = 0;
2900 
2901 	struct xe_svm_range *svm_range;
2902 	struct drm_gpusvm_ctx ctx = {};
2903 	struct xe_tile *tile;
2904 	unsigned long i;
2905 	u32 region;
2906 
2907 	if (!xe_vma_is_cpu_addr_mirror(vma))
2908 		return 0;
2909 
2910 	region = op->prefetch_range.region;
2911 
2912 	ctx.read_only = xe_vma_read_only(vma);
2913 	ctx.devmem_possible = devmem_possible;
2914 	ctx.check_pages_threshold = devmem_possible ? SZ_64K : 0;
2915 
2916 	/* TODO: Threading the migration */
2917 	xa_for_each(&op->prefetch_range.range, i, svm_range) {
2918 		if (!region)
2919 			xe_svm_range_migrate_to_smem(vm, svm_range);
2920 
2921 		if (xe_svm_range_needs_migrate_to_vram(svm_range, vma, region)) {
2922 			tile = &vm->xe->tiles[region_to_mem_type[region] - XE_PL_VRAM0];
2923 			err = xe_svm_alloc_vram(tile, svm_range, &ctx);
2924 			if (err) {
2925 				drm_dbg(&vm->xe->drm, "VRAM allocation failed, retry from userspace, asid=%u, gpusvm=%p, errno=%pe\n",
2926 					vm->usm.asid, &vm->svm.gpusvm, ERR_PTR(err));
2927 				return -ENODATA;
2928 			}
2929 			xe_svm_range_debug(svm_range, "PREFETCH - RANGE MIGRATED TO VRAM");
2930 		}
2931 
2932 		err = xe_svm_range_get_pages(vm, svm_range, &ctx);
2933 		if (err) {
2934 			drm_dbg(&vm->xe->drm, "Get pages failed, asid=%u, gpusvm=%p, errno=%pe\n",
2935 				vm->usm.asid, &vm->svm.gpusvm, ERR_PTR(err));
2936 			if (err == -EOPNOTSUPP || err == -EFAULT || err == -EPERM)
2937 				err = -ENODATA;
2938 			return err;
2939 		}
2940 		xe_svm_range_debug(svm_range, "PREFETCH - RANGE GET PAGES DONE");
2941 	}
2942 
2943 	return err;
2944 }
2945 
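     /* Lock and, where needed, validate the BOs touched by a single operation */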
2946 static int op_lock_and_prep(struct drm_exec *exec, struct xe_vm *vm,
2947 			    struct xe_vma_op *op)
2948 {
2949 	int err = 0;
2950 
2951 	switch (op->base.op) {
2952 	case DRM_GPUVA_OP_MAP:
2953 		if (!op->map.invalidate_on_bind)
2954 			err = vma_lock_and_validate(exec, op->map.vma,
2955 						    !xe_vm_in_fault_mode(vm) ||
2956 						    op->map.immediate);
2957 		break;
2958 	case DRM_GPUVA_OP_REMAP:
2959 		err = check_ufence(gpuva_to_vma(op->base.remap.unmap->va));
2960 		if (err)
2961 			break;
2962 
2963 		err = vma_lock_and_validate(exec,
2964 					    gpuva_to_vma(op->base.remap.unmap->va),
2965 					    false);
2966 		if (!err && op->remap.prev)
2967 			err = vma_lock_and_validate(exec, op->remap.prev, true);
2968 		if (!err && op->remap.next)
2969 			err = vma_lock_and_validate(exec, op->remap.next, true);
2970 		break;
2971 	case DRM_GPUVA_OP_UNMAP:
2972 		err = check_ufence(gpuva_to_vma(op->base.unmap.va));
2973 		if (err)
2974 			break;
2975 
2976 		err = vma_lock_and_validate(exec,
2977 					    gpuva_to_vma(op->base.unmap.va),
2978 					    false);
2979 		break;
2980 	case DRM_GPUVA_OP_PREFETCH:
2981 	{
2982 		struct xe_vma *vma = gpuva_to_vma(op->base.prefetch.va);
2983 		u32 region;
2984 
2985 		if (xe_vma_is_cpu_addr_mirror(vma))
2986 			region = op->prefetch_range.region;
2987 		else
2988 			region = op->prefetch.region;
2989 
2990 		xe_assert(vm->xe, region <= ARRAY_SIZE(region_to_mem_type));
2991 
2992 		err = vma_lock_and_validate(exec,
2993 					    gpuva_to_vma(op->base.prefetch.va),
2994 					    false);
2995 		if (!err && !xe_vma_has_no_bo(vma))
2996 			err = xe_bo_migrate(xe_vma_bo(vma),
2997 					    region_to_mem_type[region]);
2998 		break;
2999 	}
3000 	default:
3001 		drm_warn(&vm->xe->drm, "NOT POSSIBLE");
3002 	}
3003 
3004 	return err;
3005 }
3006 
3007 static int vm_bind_ioctl_ops_prefetch_ranges(struct xe_vm *vm, struct xe_vma_ops *vops)
3008 {
3009 	struct xe_vma_op *op;
3010 	int err;
3011 
3012 	if (!(vops->flags & XE_VMA_OPS_FLAG_HAS_SVM_PREFETCH))
3013 		return 0;
3014 
3015 	list_for_each_entry(op, &vops->list, link) {
3016 		if (op->base.op == DRM_GPUVA_OP_PREFETCH) {
3017 			err = prefetch_ranges(vm, op);
3018 			if (err)
3019 				return err;
3020 		}
3021 	}
3022 
3023 	return 0;
3024 }
3025 
3026 static int vm_bind_ioctl_ops_lock_and_prep(struct drm_exec *exec,
3027 					   struct xe_vm *vm,
3028 					   struct xe_vma_ops *vops)
3029 {
3030 	struct xe_vma_op *op;
3031 	int err;
3032 
3033 	err = drm_exec_lock_obj(exec, xe_vm_obj(vm));
3034 	if (err)
3035 		return err;
3036 
3037 	list_for_each_entry(op, &vops->list, link) {
3038 		err = op_lock_and_prep(exec, vm, op);
3039 		if (err)
3040 			return err;
3041 	}
3042 
3043 #ifdef TEST_VM_OPS_ERROR
3044 	if (vops->inject_error &&
3045 	    vm->xe->vm_inject_error_position == FORCE_OP_ERROR_LOCK)
3046 		return -ENOSPC;
3047 #endif
3048 
3049 	return 0;
3050 }
3051 
3052 static void op_trace(struct xe_vma_op *op)
3053 {
3054 	switch (op->base.op) {
3055 	case DRM_GPUVA_OP_MAP:
3056 		trace_xe_vma_bind(op->map.vma);
3057 		break;
3058 	case DRM_GPUVA_OP_REMAP:
3059 		trace_xe_vma_unbind(gpuva_to_vma(op->base.remap.unmap->va));
3060 		if (op->remap.prev)
3061 			trace_xe_vma_bind(op->remap.prev);
3062 		if (op->remap.next)
3063 			trace_xe_vma_bind(op->remap.next);
3064 		break;
3065 	case DRM_GPUVA_OP_UNMAP:
3066 		trace_xe_vma_unbind(gpuva_to_vma(op->base.unmap.va));
3067 		break;
3068 	case DRM_GPUVA_OP_PREFETCH:
3069 		trace_xe_vma_bind(gpuva_to_vma(op->base.prefetch.va));
3070 		break;
3071 	case DRM_GPUVA_OP_DRIVER:
3072 		break;
3073 	default:
3074 		XE_WARN_ON("NOT POSSIBLE");
3075 	}
3076 }
3077 
3078 static void trace_xe_vm_ops_execute(struct xe_vma_ops *vops)
3079 {
3080 	struct xe_vma_op *op;
3081 
3082 	list_for_each_entry(op, &vops->list, link)
3083 		op_trace(op);
3084 }
3085 
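     /*
      * Assign a bind queue to each tile with page-table updates and return the
      * number of tiles that have work to do.
      */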
3086 static int vm_ops_setup_tile_args(struct xe_vm *vm, struct xe_vma_ops *vops)
3087 {
3088 	struct xe_exec_queue *q = vops->q;
3089 	struct xe_tile *tile;
3090 	int number_tiles = 0;
3091 	u8 id;
3092 
3093 	for_each_tile(tile, vm->xe, id) {
3094 		if (vops->pt_update_ops[id].num_ops)
3095 			++number_tiles;
3096 
3097 		if (vops->pt_update_ops[id].q)
3098 			continue;
3099 
3100 		if (q) {
3101 			vops->pt_update_ops[id].q = q;
3102 			if (vm->pt_root[id] && !list_empty(&q->multi_gt_list))
3103 				q = list_next_entry(q, multi_gt_list);
3104 		} else {
3105 			vops->pt_update_ops[id].q = vm->q[id];
3106 		}
3107 	}
3108 
3109 	return number_tiles;
3110 }
3111 
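     /*
      * Prepare and run the page-table updates on every tile with work, joining
      * the per-tile fences into a dma_fence_array when more than one tile is
      * involved. Returns the (composite) fence or an error pointer.
      */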
3112 static struct dma_fence *ops_execute(struct xe_vm *vm,
3113 				     struct xe_vma_ops *vops)
3114 {
3115 	struct xe_tile *tile;
3116 	struct dma_fence *fence = NULL;
3117 	struct dma_fence **fences = NULL;
3118 	struct dma_fence_array *cf = NULL;
3119 	int number_tiles = 0, current_fence = 0, err;
3120 	u8 id;
3121 
3122 	number_tiles = vm_ops_setup_tile_args(vm, vops);
3123 	if (number_tiles == 0)
3124 		return ERR_PTR(-ENODATA);
3125 
3126 	if (number_tiles > 1) {
3127 		fences = kmalloc_array(number_tiles, sizeof(*fences),
3128 				       GFP_KERNEL);
3129 		if (!fences) {
3130 			fence = ERR_PTR(-ENOMEM);
3131 			goto err_trace;
3132 		}
3133 	}
3134 
3135 	for_each_tile(tile, vm->xe, id) {
3136 		if (!vops->pt_update_ops[id].num_ops)
3137 			continue;
3138 
3139 		err = xe_pt_update_ops_prepare(tile, vops);
3140 		if (err) {
3141 			fence = ERR_PTR(err);
3142 			goto err_out;
3143 		}
3144 	}
3145 
3146 	trace_xe_vm_ops_execute(vops);
3147 
3148 	for_each_tile(tile, vm->xe, id) {
3149 		if (!vops->pt_update_ops[id].num_ops)
3150 			continue;
3151 
3152 		fence = xe_pt_update_ops_run(tile, vops);
3153 		if (IS_ERR(fence))
3154 			goto err_out;
3155 
3156 		if (fences)
3157 			fences[current_fence++] = fence;
3158 	}
3159 
3160 	if (fences) {
3161 		cf = dma_fence_array_create(number_tiles, fences,
3162 					    vm->composite_fence_ctx,
3163 					    vm->composite_fence_seqno++,
3164 					    false);
3165 		if (!cf) {
3166 			--vm->composite_fence_seqno;
3167 			fence = ERR_PTR(-ENOMEM);
3168 			goto err_out;
3169 		}
3170 		fence = &cf->base;
3171 	}
3172 
3173 	for_each_tile(tile, vm->xe, id) {
3174 		if (!vops->pt_update_ops[id].num_ops)
3175 			continue;
3176 
3177 		xe_pt_update_ops_fini(tile, vops);
3178 	}
3179 
3180 	return fence;
3181 
3182 err_out:
3183 	for_each_tile(tile, vm->xe, id) {
3184 		if (!vops->pt_update_ops[id].num_ops)
3185 			continue;
3186 
3187 		xe_pt_update_ops_abort(tile, vops);
3188 	}
3189 	while (current_fence)
3190 		dma_fence_put(fences[--current_fence]);
3191 	kfree(fences);
3192 	kfree(cf);
3193 
3194 err_trace:
3195 	trace_xe_vm_ops_fail(vm);
3196 	return fence;
3197 }
3198 
3199 static void vma_add_ufence(struct xe_vma *vma, struct xe_user_fence *ufence)
3200 {
3201 	if (vma->ufence)
3202 		xe_sync_ufence_put(vma->ufence);
3203 	vma->ufence = __xe_sync_ufence_get(ufence);
3204 }
3205 
3206 static void op_add_ufence(struct xe_vm *vm, struct xe_vma_op *op,
3207 			  struct xe_user_fence *ufence)
3208 {
3209 	switch (op->base.op) {
3210 	case DRM_GPUVA_OP_MAP:
3211 		vma_add_ufence(op->map.vma, ufence);
3212 		break;
3213 	case DRM_GPUVA_OP_REMAP:
3214 		if (op->remap.prev)
3215 			vma_add_ufence(op->remap.prev, ufence);
3216 		if (op->remap.next)
3217 			vma_add_ufence(op->remap.next, ufence);
3218 		break;
3219 	case DRM_GPUVA_OP_UNMAP:
3220 		break;
3221 	case DRM_GPUVA_OP_PREFETCH:
3222 		vma_add_ufence(gpuva_to_vma(op->base.prefetch.va), ufence);
3223 		break;
3224 	default:
3225 		drm_warn(&vm->xe->drm, "NOT POSSIBLE");
3226 	}
3227 }
3228 
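     /*
      * Post-execution bookkeeping: attach the user fence to the bound VMAs,
      * destroy unmapped VMAs against @fence and signal the sync entries.
      */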
3229 static void vm_bind_ioctl_ops_fini(struct xe_vm *vm, struct xe_vma_ops *vops,
3230 				   struct dma_fence *fence)
3231 {
3232 	struct xe_exec_queue *wait_exec_queue = to_wait_exec_queue(vm, vops->q);
3233 	struct xe_user_fence *ufence;
3234 	struct xe_vma_op *op;
3235 	int i;
3236 
3237 	ufence = find_ufence_get(vops->syncs, vops->num_syncs);
3238 	list_for_each_entry(op, &vops->list, link) {
3239 		if (ufence)
3240 			op_add_ufence(vm, op, ufence);
3241 
3242 		if (op->base.op == DRM_GPUVA_OP_UNMAP)
3243 			xe_vma_destroy(gpuva_to_vma(op->base.unmap.va), fence);
3244 		else if (op->base.op == DRM_GPUVA_OP_REMAP)
3245 			xe_vma_destroy(gpuva_to_vma(op->base.remap.unmap->va),
3246 				       fence);
3247 	}
3248 	if (ufence)
3249 		xe_sync_ufence_put(ufence);
3250 	if (fence) {
3251 		for (i = 0; i < vops->num_syncs; i++)
3252 			xe_sync_entry_signal(vops->syncs + i, fence);
3253 		xe_exec_queue_last_fence_set(wait_exec_queue, vm, fence);
3254 	}
3255 }
3256 
3257 static struct dma_fence *vm_bind_ioctl_ops_execute(struct xe_vm *vm,
3258 						   struct xe_vma_ops *vops)
3259 {
3260 	struct drm_exec exec;
3261 	struct dma_fence *fence;
3262 	int err;
3263 
3264 	lockdep_assert_held_write(&vm->lock);
3265 
3266 	drm_exec_init(&exec, DRM_EXEC_INTERRUPTIBLE_WAIT |
3267 		      DRM_EXEC_IGNORE_DUPLICATES, 0);
3268 	drm_exec_until_all_locked(&exec) {
3269 		err = vm_bind_ioctl_ops_lock_and_prep(&exec, vm, vops);
3270 		drm_exec_retry_on_contention(&exec);
3271 		if (err) {
3272 			fence = ERR_PTR(err);
3273 			goto unlock;
3274 		}
3275 
3276 		fence = ops_execute(vm, vops);
3277 		if (IS_ERR(fence)) {
3278 			if (PTR_ERR(fence) == -ENODATA)
3279 				vm_bind_ioctl_ops_fini(vm, vops, NULL);
3280 			goto unlock;
3281 		}
3282 
3283 		vm_bind_ioctl_ops_fini(vm, vops, fence);
3284 	}
3285 
3286 unlock:
3287 	drm_exec_fini(&exec);
3288 	return fence;
3289 }
3290 ALLOW_ERROR_INJECTION(vm_bind_ioctl_ops_execute, ERRNO);
3291 
3292 #define SUPPORTED_FLAGS_STUB  \
3293 	(DRM_XE_VM_BIND_FLAG_READONLY | \
3294 	 DRM_XE_VM_BIND_FLAG_IMMEDIATE | \
3295 	 DRM_XE_VM_BIND_FLAG_NULL | \
3296 	 DRM_XE_VM_BIND_FLAG_DUMPABLE | \
3297 	 DRM_XE_VM_BIND_FLAG_CHECK_PXP | \
3298 	 DRM_XE_VM_BIND_FLAG_CPU_ADDR_MIRROR)
3299 
3300 #ifdef TEST_VM_OPS_ERROR
3301 #define SUPPORTED_FLAGS	(SUPPORTED_FLAGS_STUB | FORCE_OP_ERROR)
3302 #else
3303 #define SUPPORTED_FLAGS	SUPPORTED_FLAGS_STUB
3304 #endif
3305 
3306 #define XE_64K_PAGE_MASK 0xffffull
3307 #define ALL_DRM_XE_SYNCS_FLAGS (DRM_XE_SYNCS_FLAG_WAIT_FOR_OP)
3308 
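     /*
      * Validate the bind ioctl arguments, copying the bind-op array from
      * userspace when more than one bind is requested.
      */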
3309 static int vm_bind_ioctl_check_args(struct xe_device *xe, struct xe_vm *vm,
3310 				    struct drm_xe_vm_bind *args,
3311 				    struct drm_xe_vm_bind_op **bind_ops)
3312 {
3313 	int err;
3314 	int i;
3315 
3316 	if (XE_IOCTL_DBG(xe, args->pad || args->pad2) ||
3317 	    XE_IOCTL_DBG(xe, args->reserved[0] || args->reserved[1]))
3318 		return -EINVAL;
3319 
3320 	if (XE_IOCTL_DBG(xe, args->extensions))
3321 		return -EINVAL;
3322 
3323 	if (args->num_binds > 1) {
3324 		u64 __user *bind_user =
3325 			u64_to_user_ptr(args->vector_of_binds);
3326 
3327 		*bind_ops = kvmalloc_array(args->num_binds,
3328 					   sizeof(struct drm_xe_vm_bind_op),
3329 					   GFP_KERNEL | __GFP_ACCOUNT |
3330 					   __GFP_RETRY_MAYFAIL | __GFP_NOWARN);
3331 		if (!*bind_ops)
3332 			return args->num_binds > 1 ? -ENOBUFS : -ENOMEM;
3333 
3334 		err = copy_from_user(*bind_ops, bind_user,
3335 				     sizeof(struct drm_xe_vm_bind_op) *
3336 				     args->num_binds);
3337 		if (XE_IOCTL_DBG(xe, err)) {
3338 			err = -EFAULT;
3339 			goto free_bind_ops;
3340 		}
3341 	} else {
3342 		*bind_ops = &args->bind;
3343 	}
3344 
3345 	for (i = 0; i < args->num_binds; ++i) {
3346 		u64 range = (*bind_ops)[i].range;
3347 		u64 addr = (*bind_ops)[i].addr;
3348 		u32 op = (*bind_ops)[i].op;
3349 		u32 flags = (*bind_ops)[i].flags;
3350 		u32 obj = (*bind_ops)[i].obj;
3351 		u64 obj_offset = (*bind_ops)[i].obj_offset;
3352 		u32 prefetch_region = (*bind_ops)[i].prefetch_mem_region_instance;
3353 		bool is_null = flags & DRM_XE_VM_BIND_FLAG_NULL;
3354 		bool is_cpu_addr_mirror = flags &
3355 			DRM_XE_VM_BIND_FLAG_CPU_ADDR_MIRROR;
3356 		u16 pat_index = (*bind_ops)[i].pat_index;
3357 		u16 coh_mode;
3358 
3359 		if (XE_IOCTL_DBG(xe, is_cpu_addr_mirror &&
3360 				 (!xe_vm_in_fault_mode(vm) ||
3361 				 !IS_ENABLED(CONFIG_DRM_XE_GPUSVM)))) {
3362 			err = -EINVAL;
3363 			goto free_bind_ops;
3364 		}
3365 
3366 		if (XE_IOCTL_DBG(xe, pat_index >= xe->pat.n_entries)) {
3367 			err = -EINVAL;
3368 			goto free_bind_ops;
3369 		}
3370 
3371 		pat_index = array_index_nospec(pat_index, xe->pat.n_entries);
3372 		(*bind_ops)[i].pat_index = pat_index;
3373 		coh_mode = xe_pat_index_get_coh_mode(xe, pat_index);
3374 		if (XE_IOCTL_DBG(xe, !coh_mode)) { /* hw reserved */
3375 			err = -EINVAL;
3376 			goto free_bind_ops;
3377 		}
3378 
3379 		if (XE_WARN_ON(coh_mode > XE_COH_AT_LEAST_1WAY)) {
3380 			err = -EINVAL;
3381 			goto free_bind_ops;
3382 		}
3383 
3384 		if (XE_IOCTL_DBG(xe, op > DRM_XE_VM_BIND_OP_PREFETCH) ||
3385 		    XE_IOCTL_DBG(xe, flags & ~SUPPORTED_FLAGS) ||
3386 		    XE_IOCTL_DBG(xe, obj && (is_null || is_cpu_addr_mirror)) ||
3387 		    XE_IOCTL_DBG(xe, obj_offset && (is_null ||
3388 						    is_cpu_addr_mirror)) ||
3389 		    XE_IOCTL_DBG(xe, op != DRM_XE_VM_BIND_OP_MAP &&
3390 				 (is_null || is_cpu_addr_mirror)) ||
3391 		    XE_IOCTL_DBG(xe, !obj &&
3392 				 op == DRM_XE_VM_BIND_OP_MAP &&
3393 				 !is_null && !is_cpu_addr_mirror) ||
3394 		    XE_IOCTL_DBG(xe, !obj &&
3395 				 op == DRM_XE_VM_BIND_OP_UNMAP_ALL) ||
3396 		    XE_IOCTL_DBG(xe, addr &&
3397 				 op == DRM_XE_VM_BIND_OP_UNMAP_ALL) ||
3398 		    XE_IOCTL_DBG(xe, range &&
3399 				 op == DRM_XE_VM_BIND_OP_UNMAP_ALL) ||
3400 		    XE_IOCTL_DBG(xe, obj &&
3401 				 op == DRM_XE_VM_BIND_OP_MAP_USERPTR) ||
3402 		    XE_IOCTL_DBG(xe, coh_mode == XE_COH_NONE &&
3403 				 op == DRM_XE_VM_BIND_OP_MAP_USERPTR) ||
3404 		    XE_IOCTL_DBG(xe, obj &&
3405 				 op == DRM_XE_VM_BIND_OP_PREFETCH) ||
3406 		    XE_IOCTL_DBG(xe, prefetch_region &&
3407 				 op != DRM_XE_VM_BIND_OP_PREFETCH) ||
3408 		    XE_IOCTL_DBG(xe, !(BIT(prefetch_region) &
3409 				       xe->info.mem_region_mask)) ||
3410 		    XE_IOCTL_DBG(xe, obj &&
3411 				 op == DRM_XE_VM_BIND_OP_UNMAP)) {
3412 			err = -EINVAL;
3413 			goto free_bind_ops;
3414 		}
3415 
3416 		if (XE_IOCTL_DBG(xe, obj_offset & ~PAGE_MASK) ||
3417 		    XE_IOCTL_DBG(xe, addr & ~PAGE_MASK) ||
3418 		    XE_IOCTL_DBG(xe, range & ~PAGE_MASK) ||
3419 		    XE_IOCTL_DBG(xe, !range &&
3420 				 op != DRM_XE_VM_BIND_OP_UNMAP_ALL)) {
3421 			err = -EINVAL;
3422 			goto free_bind_ops;
3423 		}
3424 	}
3425 
3426 	return 0;
3427 
3428 free_bind_ops:
3429 	if (args->num_binds > 1)
3430 		kvfree(*bind_ops);
3431 	return err;
3432 }
3433 
3434 static int vm_bind_ioctl_signal_fences(struct xe_vm *vm,
3435 				       struct xe_exec_queue *q,
3436 				       struct xe_sync_entry *syncs,
3437 				       int num_syncs)
3438 {
3439 	struct dma_fence *fence;
3440 	int i, err = 0;
3441 
3442 	fence = xe_sync_in_fence_get(syncs, num_syncs,
3443 				     to_wait_exec_queue(vm, q), vm);
3444 	if (IS_ERR(fence))
3445 		return PTR_ERR(fence);
3446 
3447 	for (i = 0; i < num_syncs; i++)
3448 		xe_sync_entry_signal(&syncs[i], fence);
3449 
3450 	xe_exec_queue_last_fence_set(to_wait_exec_queue(vm, q), vm,
3451 				     fence);
3452 	dma_fence_put(fence);
3453 
3454 	return err;
3455 }
3456 
3457 static void xe_vma_ops_init(struct xe_vma_ops *vops, struct xe_vm *vm,
3458 			    struct xe_exec_queue *q,
3459 			    struct xe_sync_entry *syncs, u32 num_syncs)
3460 {
3461 	memset(vops, 0, sizeof(*vops));
3462 	INIT_LIST_HEAD(&vops->list);
3463 	vops->vm = vm;
3464 	vops->q = q;
3465 	vops->syncs = syncs;
3466 	vops->num_syncs = num_syncs;
3467 	vops->flags = 0;
3468 }
3469 
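     /* Per-BO checks for a bind op: size/offset bounds, 64k alignment, coherency and PXP */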
3470 static int xe_vm_bind_ioctl_validate_bo(struct xe_device *xe, struct xe_bo *bo,
3471 					u64 addr, u64 range, u64 obj_offset,
3472 					u16 pat_index, u32 op, u32 bind_flags)
3473 {
3474 	u16 coh_mode;
3475 
3476 	if (XE_IOCTL_DBG(xe, range > xe_bo_size(bo)) ||
3477 	    XE_IOCTL_DBG(xe, obj_offset >
3478 			 xe_bo_size(bo) - range)) {
3479 		return -EINVAL;
3480 	}
3481 
3482 	/*
3483 	 * Some platforms require 64k VM_BIND alignment,
3484 	 * specifically those with XE_VRAM_FLAGS_NEED64K.
3485 	 *
3486 	 * Other platforms may have BOs set to 64k physical placement,
3487 	 * but those can still be mapped at 4k offsets. This check is
3488 	 * only there for the former case.
3489 	 */
3490 	if ((bo->flags & XE_BO_FLAG_INTERNAL_64K) &&
3491 	    (xe->info.vram_flags & XE_VRAM_FLAGS_NEED64K)) {
3492 		if (XE_IOCTL_DBG(xe, obj_offset &
3493 				 XE_64K_PAGE_MASK) ||
3494 		    XE_IOCTL_DBG(xe, addr & XE_64K_PAGE_MASK) ||
3495 		    XE_IOCTL_DBG(xe, range & XE_64K_PAGE_MASK)) {
3496 			return -EINVAL;
3497 		}
3498 	}
3499 
3500 	coh_mode = xe_pat_index_get_coh_mode(xe, pat_index);
3501 	if (bo->cpu_caching) {
3502 		if (XE_IOCTL_DBG(xe, coh_mode == XE_COH_NONE &&
3503 				 bo->cpu_caching == DRM_XE_GEM_CPU_CACHING_WB)) {
3504 			return -EINVAL;
3505 		}
3506 	} else if (XE_IOCTL_DBG(xe, coh_mode == XE_COH_NONE)) {
3507 		/*
3508 		 * An imported dma-buf from a different device should
3509 		 * require 1-way or 2-way coherency since we don't know
3510 		 * how it was mapped on the CPU. Just assume it is
3511 		 * potentially cached on the CPU side.
3512 		 */
3513 		return -EINVAL;
3514 	}
3515 
3516 	/* If a BO is protected it can only be mapped if the key is still valid */
3517 	if ((bind_flags & DRM_XE_VM_BIND_FLAG_CHECK_PXP) && xe_bo_is_protected(bo) &&
3518 	    op != DRM_XE_VM_BIND_OP_UNMAP && op != DRM_XE_VM_BIND_OP_UNMAP_ALL)
3519 		if (XE_IOCTL_DBG(xe, xe_pxp_bo_key_check(xe->pxp, bo) != 0))
3520 			return -ENOEXEC;
3521 
3522 	return 0;
3523 }
3524 
3525 int xe_vm_bind_ioctl(struct drm_device *dev, void *data, struct drm_file *file)
3526 {
3527 	struct xe_device *xe = to_xe_device(dev);
3528 	struct xe_file *xef = to_xe_file(file);
3529 	struct drm_xe_vm_bind *args = data;
3530 	struct drm_xe_sync __user *syncs_user;
3531 	struct xe_bo **bos = NULL;
3532 	struct drm_gpuva_ops **ops = NULL;
3533 	struct xe_vm *vm;
3534 	struct xe_exec_queue *q = NULL;
3535 	u32 num_syncs, num_ufence = 0;
3536 	struct xe_sync_entry *syncs = NULL;
3537 	struct drm_xe_vm_bind_op *bind_ops;
3538 	struct xe_vma_ops vops;
3539 	struct dma_fence *fence;
3540 	int err;
3541 	int i;
3542 
3543 	vm = xe_vm_lookup(xef, args->vm_id);
3544 	if (XE_IOCTL_DBG(xe, !vm))
3545 		return -EINVAL;
3546 
3547 	err = vm_bind_ioctl_check_args(xe, vm, args, &bind_ops);
3548 	if (err)
3549 		goto put_vm;
3550 
3551 	if (args->exec_queue_id) {
3552 		q = xe_exec_queue_lookup(xef, args->exec_queue_id);
3553 		if (XE_IOCTL_DBG(xe, !q)) {
3554 			err = -ENOENT;
3555 			goto put_vm;
3556 		}
3557 
3558 		if (XE_IOCTL_DBG(xe, !(q->flags & EXEC_QUEUE_FLAG_VM))) {
3559 			err = -EINVAL;
3560 			goto put_exec_queue;
3561 		}
3562 	}
3563 
3564 	/* Ensure all UNMAPs visible */
3565 	xe_svm_flush(vm);
3566 
3567 	err = down_write_killable(&vm->lock);
3568 	if (err)
3569 		goto put_exec_queue;
3570 
3571 	if (XE_IOCTL_DBG(xe, xe_vm_is_closed_or_banned(vm))) {
3572 		err = -ENOENT;
3573 		goto release_vm_lock;
3574 	}
3575 
3576 	for (i = 0; i < args->num_binds; ++i) {
3577 		u64 range = bind_ops[i].range;
3578 		u64 addr = bind_ops[i].addr;
3579 
3580 		if (XE_IOCTL_DBG(xe, range > vm->size) ||
3581 		    XE_IOCTL_DBG(xe, addr > vm->size - range)) {
3582 			err = -EINVAL;
3583 			goto release_vm_lock;
3584 		}
3585 	}
3586 
3587 	if (args->num_binds) {
3588 		bos = kvcalloc(args->num_binds, sizeof(*bos),
3589 			       GFP_KERNEL | __GFP_ACCOUNT |
3590 			       __GFP_RETRY_MAYFAIL | __GFP_NOWARN);
3591 		if (!bos) {
3592 			err = -ENOMEM;
3593 			goto release_vm_lock;
3594 		}
3595 
3596 		ops = kvcalloc(args->num_binds, sizeof(*ops),
3597 			       GFP_KERNEL | __GFP_ACCOUNT |
3598 			       __GFP_RETRY_MAYFAIL | __GFP_NOWARN);
3599 		if (!ops) {
3600 			err = -ENOMEM;
3601 			goto release_vm_lock;
3602 		}
3603 	}
3604 
3605 	for (i = 0; i < args->num_binds; ++i) {
3606 		struct drm_gem_object *gem_obj;
3607 		u64 range = bind_ops[i].range;
3608 		u64 addr = bind_ops[i].addr;
3609 		u32 obj = bind_ops[i].obj;
3610 		u64 obj_offset = bind_ops[i].obj_offset;
3611 		u16 pat_index = bind_ops[i].pat_index;
3612 		u32 op = bind_ops[i].op;
3613 		u32 bind_flags = bind_ops[i].flags;
3614 
3615 		if (!obj)
3616 			continue;
3617 
3618 		gem_obj = drm_gem_object_lookup(file, obj);
3619 		if (XE_IOCTL_DBG(xe, !gem_obj)) {
3620 			err = -ENOENT;
3621 			goto put_obj;
3622 		}
3623 		bos[i] = gem_to_xe_bo(gem_obj);
3624 
3625 		err = xe_vm_bind_ioctl_validate_bo(xe, bos[i], addr, range,
3626 						   obj_offset, pat_index, op,
3627 						   bind_flags);
3628 		if (err)
3629 			goto put_obj;
3630 	}
3631 
3632 	if (args->num_syncs) {
3633 		syncs = kcalloc(args->num_syncs, sizeof(*syncs), GFP_KERNEL);
3634 		if (!syncs) {
3635 			err = -ENOMEM;
3636 			goto put_obj;
3637 		}
3638 	}
3639 
3640 	syncs_user = u64_to_user_ptr(args->syncs);
3641 	for (num_syncs = 0; num_syncs < args->num_syncs; num_syncs++) {
3642 		err = xe_sync_entry_parse(xe, xef, &syncs[num_syncs],
3643 					  &syncs_user[num_syncs],
3644 					  (xe_vm_in_lr_mode(vm) ?
3645 					   SYNC_PARSE_FLAG_LR_MODE : 0) |
3646 					  (!args->num_binds ?
3647 					   SYNC_PARSE_FLAG_DISALLOW_USER_FENCE : 0));
3648 		if (err)
3649 			goto free_syncs;
3650 
3651 		if (xe_sync_is_ufence(&syncs[num_syncs]))
3652 			num_ufence++;
3653 	}
3654 
3655 	if (XE_IOCTL_DBG(xe, num_ufence > 1)) {
3656 		err = -EINVAL;
3657 		goto free_syncs;
3658 	}
3659 
3660 	if (!args->num_binds) {
3661 		err = -ENODATA;
3662 		goto free_syncs;
3663 	}
3664 
3665 	xe_vma_ops_init(&vops, vm, q, syncs, num_syncs);
3666 	for (i = 0; i < args->num_binds; ++i) {
3667 		u64 range = bind_ops[i].range;
3668 		u64 addr = bind_ops[i].addr;
3669 		u32 op = bind_ops[i].op;
3670 		u32 flags = bind_ops[i].flags;
3671 		u64 obj_offset = bind_ops[i].obj_offset;
3672 		u32 prefetch_region = bind_ops[i].prefetch_mem_region_instance;
3673 		u16 pat_index = bind_ops[i].pat_index;
3674 
3675 		ops[i] = vm_bind_ioctl_ops_create(vm, &vops, bos[i], obj_offset,
3676 						  addr, range, op, flags,
3677 						  prefetch_region, pat_index);
3678 		if (IS_ERR(ops[i])) {
3679 			err = PTR_ERR(ops[i]);
3680 			ops[i] = NULL;
3681 			goto unwind_ops;
3682 		}
3683 
3684 		err = vm_bind_ioctl_ops_parse(vm, ops[i], &vops);
3685 		if (err)
3686 			goto unwind_ops;
3687 
3688 #ifdef TEST_VM_OPS_ERROR
3689 		if (flags & FORCE_OP_ERROR) {
3690 			vops.inject_error = true;
3691 			vm->xe->vm_inject_error_position =
3692 				(vm->xe->vm_inject_error_position + 1) %
3693 				FORCE_OP_ERROR_COUNT;
3694 		}
3695 #endif
3696 	}
3697 
3698 	/* Nothing to do */
3699 	if (list_empty(&vops.list)) {
3700 		err = -ENODATA;
3701 		goto unwind_ops;
3702 	}
3703 
3704 	err = xe_vma_ops_alloc(&vops, args->num_binds > 1);
3705 	if (err)
3706 		goto unwind_ops;
3707 
3708 	err = vm_bind_ioctl_ops_prefetch_ranges(vm, &vops);
3709 	if (err)
3710 		goto unwind_ops;
3711 
3712 	fence = vm_bind_ioctl_ops_execute(vm, &vops);
3713 	if (IS_ERR(fence))
3714 		err = PTR_ERR(fence);
3715 	else
3716 		dma_fence_put(fence);
3717 
3718 unwind_ops:
3719 	if (err && err != -ENODATA)
3720 		vm_bind_ioctl_ops_unwind(vm, ops, args->num_binds);
3721 	xe_vma_ops_fini(&vops);
3722 	for (i = args->num_binds - 1; i >= 0; --i)
3723 		if (ops[i])
3724 			drm_gpuva_ops_free(&vm->gpuvm, ops[i]);
3725 free_syncs:
3726 	if (err == -ENODATA)
3727 		err = vm_bind_ioctl_signal_fences(vm, q, syncs, num_syncs);
3728 	while (num_syncs--)
3729 		xe_sync_entry_cleanup(&syncs[num_syncs]);
3730 
3731 	kfree(syncs);
3732 put_obj:
3733 	for (i = 0; i < args->num_binds; ++i)
3734 		xe_bo_put(bos[i]);
3735 release_vm_lock:
3736 	up_write(&vm->lock);
3737 put_exec_queue:
3738 	if (q)
3739 		xe_exec_queue_put(q);
3740 put_vm:
3741 	xe_vm_put(vm);
3742 	kvfree(bos);
3743 	kvfree(ops);
3744 	if (args->num_binds > 1)
3745 		kvfree(bind_ops);
3746 	return err;
3747 }
3748 
3749 /**
3750  * xe_vm_bind_kernel_bo - bind a kernel BO to a VM
3751  * @vm: VM to bind the BO to
3752  * @bo: BO to bind
3753  * @q: exec queue to use for the bind (optional)
3754  * @addr: address at which to bind the BO
3755  * @cache_lvl: PAT cache level to use
3756  *
3757  * Execute a VM bind map operation on a kernel-owned BO to bind it into a
3758  * kernel-owned VM.
3759  *
3760  * Returns a dma_fence to track the binding completion if the job to do so was
3761  * successfully submitted, an error pointer otherwise.
3762  */
3763 struct dma_fence *xe_vm_bind_kernel_bo(struct xe_vm *vm, struct xe_bo *bo,
3764 				       struct xe_exec_queue *q, u64 addr,
3765 				       enum xe_cache_level cache_lvl)
3766 {
3767 	struct xe_vma_ops vops;
3768 	struct drm_gpuva_ops *ops = NULL;
3769 	struct dma_fence *fence;
3770 	int err;
3771 
3772 	xe_bo_get(bo);
3773 	xe_vm_get(vm);
3774 	if (q)
3775 		xe_exec_queue_get(q);
3776 
3777 	down_write(&vm->lock);
3778 
3779 	xe_vma_ops_init(&vops, vm, q, NULL, 0);
3780 
3781 	ops = vm_bind_ioctl_ops_create(vm, &vops, bo, 0, addr, xe_bo_size(bo),
3782 				       DRM_XE_VM_BIND_OP_MAP, 0, 0,
3783 				       vm->xe->pat.idx[cache_lvl]);
3784 	if (IS_ERR(ops)) {
3785 		err = PTR_ERR(ops);
3786 		goto release_vm_lock;
3787 	}
3788 
3789 	err = vm_bind_ioctl_ops_parse(vm, ops, &vops);
3790 	if (err)
3791 		goto release_vm_lock;
3792 
3793 	xe_assert(vm->xe, !list_empty(&vops.list));
3794 
3795 	err = xe_vma_ops_alloc(&vops, false);
3796 	if (err)
3797 		goto unwind_ops;
3798 
3799 	fence = vm_bind_ioctl_ops_execute(vm, &vops);
3800 	if (IS_ERR(fence))
3801 		err = PTR_ERR(fence);
3802 
3803 unwind_ops:
3804 	if (err && err != -ENODATA)
3805 		vm_bind_ioctl_ops_unwind(vm, &ops, 1);
3806 
3807 	xe_vma_ops_fini(&vops);
3808 	drm_gpuva_ops_free(&vm->gpuvm, ops);
3809 
3810 release_vm_lock:
3811 	up_write(&vm->lock);
3812 
3813 	if (q)
3814 		xe_exec_queue_put(q);
3815 	xe_vm_put(vm);
3816 	xe_bo_put(bo);
3817 
3818 	if (err)
3819 		fence = ERR_PTR(err);
3820 
3821 	return fence;
3822 }
3823 
3824 /**
3825  * xe_vm_lock() - Lock the vm's dma_resv object
3826  * @vm: The struct xe_vm whose lock is to be locked
3827  * @intr: Whether to perform any waits interruptibly
3828  *
3829  * Return: 0 on success, -EINTR if @intr is true and the wait for a
3830  * contended lock was interrupted. If @intr is false, the function
3831  * always returns 0.
3832  */
3833 int xe_vm_lock(struct xe_vm *vm, bool intr)
3834 {
3835 	if (intr)
3836 		return dma_resv_lock_interruptible(xe_vm_resv(vm), NULL);
3837 
3838 	return dma_resv_lock(xe_vm_resv(vm), NULL);
3839 }
3840 
3841 /**
3842  * xe_vm_unlock() - Unlock the vm's dma_resv object
3843  * @vm: The struct xe_vm whose lock is to be released.
3844  *
3845  * Unlock the vm's dma_resv object that was locked by xe_vm_lock().
3846  */
3847 void xe_vm_unlock(struct xe_vm *vm)
3848 {
3849 	dma_resv_unlock(xe_vm_resv(vm));
3850 }
3851 
3852 /**
3853  * xe_vm_range_tilemask_tlb_invalidation - Issue a TLB invalidation on this tilemask for an
3854  * address range
3855  * @vm: The VM
3856  * @start: start address
3857  * @end: end address
3858  * @tile_mask: mask of tiles whose GTs should receive the TLB invalidation
3859  *
3860  * Issue a range-based TLB invalidation for the GTs of the tiles in @tile_mask.
3861  *
3862  * Returns 0 for success, negative error code otherwise.
3863  */
3864 int xe_vm_range_tilemask_tlb_invalidation(struct xe_vm *vm, u64 start,
3865 					  u64 end, u8 tile_mask)
3866 {
3867 	struct xe_gt_tlb_invalidation_fence fence[XE_MAX_TILES_PER_DEVICE * XE_MAX_GT_PER_TILE];
3868 	struct xe_tile *tile;
3869 	u32 fence_id = 0;
3870 	u8 id;
3871 	int err;
3872 
3873 	if (!tile_mask)
3874 		return 0;
3875 
3876 	for_each_tile(tile, vm->xe, id) {
3877 		if (tile_mask & BIT(id)) {
3878 			xe_gt_tlb_invalidation_fence_init(tile->primary_gt,
3879 							  &fence[fence_id], true);
3880 
3881 			err = xe_gt_tlb_invalidation_range(tile->primary_gt,
3882 							   &fence[fence_id],
3883 							   start,
3884 							   end,
3885 							   vm->usm.asid);
3886 			if (err)
3887 				goto wait;
3888 			++fence_id;
3889 
3890 			if (!tile->media_gt)
3891 				continue;
3892 
3893 			xe_gt_tlb_invalidation_fence_init(tile->media_gt,
3894 							  &fence[fence_id], true);
3895 
3896 			err = xe_gt_tlb_invalidation_range(tile->media_gt,
3897 							   &fence[fence_id],
3898 							   start,
3899 							   end,
3900 							   vm->usm.asid);
3901 			if (err)
3902 				goto wait;
3903 			++fence_id;
3904 		}
3905 	}
3906 
3907 wait:
3908 	for (id = 0; id < fence_id; ++id)
3909 		xe_gt_tlb_invalidation_fence_wait(&fence[id]);
3910 
3911 	return err;
3912 }
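
/*
 * Usage sketch (illustrative only, not used by the driver): build a tile mask
 * covering every tile of the device and invalidate an address range on it.
 * The function name example_invalidate_range_all_tiles() is hypothetical; the
 * mask could equally be restricted to a subset of tiles.
 */
static int __maybe_unused
example_invalidate_range_all_tiles(struct xe_vm *vm, u64 start, u64 end)
{
	struct xe_tile *tile;
	u8 tile_mask = 0;
	u8 id;

	for_each_tile(tile, vm->xe, id)
		tile_mask |= BIT(id);

	/* Issues the invalidations and waits for them before returning */
	return xe_vm_range_tilemask_tlb_invalidation(vm, start, end, tile_mask);
}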
3913 
3914 /**
3915  * xe_vm_invalidate_vma() - Invalidate the GPU mappings of a VMA without a lock
3916  * @vma: VMA to invalidate
3917  *
3918  * Walks the page-table leaves, zeroing (memset) the entries owned by this VMA,
3919  * then issues a TLB invalidation and blocks until the invalidation is
3920  * complete.
3921  *
3922  * Return: 0 on success, negative error code otherwise.
3923  */
3924 int xe_vm_invalidate_vma(struct xe_vma *vma)
3925 {
3926 	struct xe_device *xe = xe_vma_vm(vma)->xe;
3927 	struct xe_vm *vm = xe_vma_vm(vma);
3928 	struct xe_tile *tile;
3929 	u8 tile_mask = 0;
3930 	int ret = 0;
3931 	u8 id;
3932 
3933 	xe_assert(xe, !xe_vma_is_null(vma));
3934 	xe_assert(xe, !xe_vma_is_cpu_addr_mirror(vma));
3935 	trace_xe_vma_invalidate(vma);
3936 
3937 	vm_dbg(&vm->xe->drm,
3938 	       "INVALIDATE: addr=0x%016llx, range=0x%016llx",
3939 		xe_vma_start(vma), xe_vma_size(vma));
3940 
3941 	/*
3942 	 * Check that we don't race with page-table updates; updating
3943 	 * tile_invalidated is safe.
3944 	 */
3945 	if (IS_ENABLED(CONFIG_PROVE_LOCKING)) {
3946 		if (xe_vma_is_userptr(vma)) {
3947 			lockdep_assert(lockdep_is_held_type(&vm->userptr.notifier_lock, 0) ||
3948 				       (lockdep_is_held_type(&vm->userptr.notifier_lock, 1) &&
3949 					lockdep_is_held(&xe_vm_resv(vm)->lock.base)));
3950 
3951 			WARN_ON_ONCE(!mmu_interval_check_retry
3952 				     (&to_userptr_vma(vma)->userptr.notifier,
3953 				      to_userptr_vma(vma)->userptr.notifier_seq));
3954 			WARN_ON_ONCE(!dma_resv_test_signaled(xe_vm_resv(vm),
3955 							     DMA_RESV_USAGE_BOOKKEEP));
3956 
3957 		} else {
3958 			xe_bo_assert_held(xe_vma_bo(vma));
3959 		}
3960 	}
3961 
3962 	for_each_tile(tile, xe, id)
3963 		if (xe_pt_zap_ptes(tile, vma))
3964 			tile_mask |= BIT(id);
3965 
3966 	xe_device_wmb(xe);
3967 
3968 	ret = xe_vm_range_tilemask_tlb_invalidation(xe_vma_vm(vma), xe_vma_start(vma),
3969 						    xe_vma_end(vma), tile_mask);
3970 
3971 	/* WRITE_ONCE pairs with READ_ONCE in xe_vm_has_valid_gpu_mapping() */
3972 	WRITE_ONCE(vma->tile_invalidated, vma->tile_mask);
3973 
3974 	return ret;
3975 }
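
/*
 * Sketch (illustrative only, not the driver's implementation): the kind of
 * lockless reader the WRITE_ONCE() in xe_vm_invalidate_vma() pairs with. The
 * driver's real check lives in xe_vm_has_valid_gpu_mapping() and takes more
 * state into account; the helper name example_tile_not_invalidated() is
 * hypothetical and only illustrates the READ_ONCE() pairing.
 */
static bool __maybe_unused example_tile_not_invalidated(struct xe_vma *vma,
							u8 tile_id)
{
	/* Pairs with the WRITE_ONCE() in xe_vm_invalidate_vma() */
	return !(READ_ONCE(vma->tile_invalidated) & BIT(tile_id));
}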
3976 
3977 int xe_vm_validate_protected(struct xe_vm *vm)
3978 {
3979 	struct drm_gpuva *gpuva;
3980 	int err = 0;
3981 
3982 	if (!vm)
3983 		return -ENODEV;
3984 
3985 	mutex_lock(&vm->snap_mutex);
3986 
3987 	drm_gpuvm_for_each_va(gpuva, &vm->gpuvm) {
3988 		struct xe_vma *vma = gpuva_to_vma(gpuva);
3989 		struct xe_bo *bo = vma->gpuva.gem.obj ?
3990 			gem_to_xe_bo(vma->gpuva.gem.obj) : NULL;
3991 
3992 		if (!bo)
3993 			continue;
3994 
3995 		if (xe_bo_is_protected(bo)) {
3996 			err = xe_pxp_bo_key_check(vm->xe->pxp, bo);
3997 			if (err)
3998 				break;
3999 		}
4000 	}
4001 
4002 	mutex_unlock(&vm->snap_mutex);
4003 	return err;
4004 }
4005 
4006 struct xe_vm_snapshot {
4007 	unsigned long num_snaps;
4008 	struct {
4009 		u64 ofs, bo_ofs;
4010 		unsigned long len;
4011 		struct xe_bo *bo;
4012 		void *data;
4013 		struct mm_struct *mm;
4014 	} snap[];
4015 };
4016 
4017 struct xe_vm_snapshot *xe_vm_snapshot_capture(struct xe_vm *vm)
4018 {
4019 	unsigned long num_snaps = 0, i;
4020 	struct xe_vm_snapshot *snap = NULL;
4021 	struct drm_gpuva *gpuva;
4022 
4023 	if (!vm)
4024 		return NULL;
4025 
4026 	mutex_lock(&vm->snap_mutex);
4027 	drm_gpuvm_for_each_va(gpuva, &vm->gpuvm) {
4028 		if (gpuva->flags & XE_VMA_DUMPABLE)
4029 			num_snaps++;
4030 	}
4031 
4032 	if (num_snaps)
4033 		snap = kvzalloc(offsetof(struct xe_vm_snapshot, snap[num_snaps]), GFP_NOWAIT);
4034 	if (!snap) {
4035 		snap = num_snaps ? ERR_PTR(-ENOMEM) : ERR_PTR(-ENODEV);
4036 		goto out_unlock;
4037 	}
4038 
4039 	snap->num_snaps = num_snaps;
4040 	i = 0;
4041 	drm_gpuvm_for_each_va(gpuva, &vm->gpuvm) {
4042 		struct xe_vma *vma = gpuva_to_vma(gpuva);
4043 		struct xe_bo *bo = vma->gpuva.gem.obj ?
4044 			gem_to_xe_bo(vma->gpuva.gem.obj) : NULL;
4045 
4046 		if (!(gpuva->flags & XE_VMA_DUMPABLE))
4047 			continue;
4048 
4049 		snap->snap[i].ofs = xe_vma_start(vma);
4050 		snap->snap[i].len = xe_vma_size(vma);
4051 		if (bo) {
4052 			snap->snap[i].bo = xe_bo_get(bo);
4053 			snap->snap[i].bo_ofs = xe_vma_bo_offset(vma);
4054 		} else if (xe_vma_is_userptr(vma)) {
4055 			struct mm_struct *mm =
4056 				to_userptr_vma(vma)->userptr.notifier.mm;
4057 
4058 			if (mmget_not_zero(mm))
4059 				snap->snap[i].mm = mm;
4060 			else
4061 				snap->snap[i].data = ERR_PTR(-EFAULT);
4062 
4063 			snap->snap[i].bo_ofs = xe_vma_userptr(vma);
4064 		} else {
4065 			snap->snap[i].data = ERR_PTR(-ENOENT);
4066 		}
4067 		i++;
4068 	}
4069 
4070 out_unlock:
4071 	mutex_unlock(&vm->snap_mutex);
4072 	return snap;
4073 }
4074 
4075 void xe_vm_snapshot_capture_delayed(struct xe_vm_snapshot *snap)
4076 {
4077 	if (IS_ERR_OR_NULL(snap))
4078 		return;
4079 
4080 	for (int i = 0; i < snap->num_snaps; i++) {
4081 		struct xe_bo *bo = snap->snap[i].bo;
4082 		int err;
4083 
4084 		if (IS_ERR(snap->snap[i].data))
4085 			continue;
4086 
4087 		snap->snap[i].data = kvmalloc(snap->snap[i].len, GFP_USER);
4088 		if (!snap->snap[i].data) {
4089 			snap->snap[i].data = ERR_PTR(-ENOMEM);
4090 			goto cleanup_bo;
4091 		}
4092 
4093 		if (bo) {
4094 			err = xe_bo_read(bo, snap->snap[i].bo_ofs,
4095 					 snap->snap[i].data, snap->snap[i].len);
4096 		} else {
4097 			void __user *userptr = (void __user *)(size_t)snap->snap[i].bo_ofs;
4098 
4099 			kthread_use_mm(snap->snap[i].mm);
4100 			if (!copy_from_user(snap->snap[i].data, userptr, snap->snap[i].len))
4101 				err = 0;
4102 			else
4103 				err = -EFAULT;
4104 			kthread_unuse_mm(snap->snap[i].mm);
4105 
4106 			mmput(snap->snap[i].mm);
4107 			snap->snap[i].mm = NULL;
4108 		}
4109 
4110 		if (err) {
4111 			kvfree(snap->snap[i].data);
4112 			snap->snap[i].data = ERR_PTR(err);
4113 		}
4114 
4115 cleanup_bo:
4116 		xe_bo_put(bo);
4117 		snap->snap[i].bo = NULL;
4118 	}
4119 }
4120 
4121 void xe_vm_snapshot_print(struct xe_vm_snapshot *snap, struct drm_printer *p)
4122 {
4123 	unsigned long i, j;
4124 
4125 	if (IS_ERR_OR_NULL(snap)) {
4126 		drm_printf(p, "[0].error: %li\n", PTR_ERR(snap));
4127 		return;
4128 	}
4129 
4130 	for (i = 0; i < snap->num_snaps; i++) {
4131 		drm_printf(p, "[%llx].length: 0x%lx\n", snap->snap[i].ofs, snap->snap[i].len);
4132 
4133 		if (IS_ERR(snap->snap[i].data)) {
4134 			drm_printf(p, "[%llx].error: %li\n", snap->snap[i].ofs,
4135 				   PTR_ERR(snap->snap[i].data));
4136 			continue;
4137 		}
4138 
4139 		drm_printf(p, "[%llx].data: ", snap->snap[i].ofs);
4140 
4141 		for (j = 0; j < snap->snap[i].len; j += sizeof(u32)) {
4142 			u32 *val = snap->snap[i].data + j;
4143 			char dumped[ASCII85_BUFSZ];
4144 
4145 			drm_puts(p, ascii85_encode(*val, dumped));
4146 		}
4147 
4148 		drm_puts(p, "\n");
4149 
4150 		if (drm_coredump_printer_is_full(p))
4151 			return;
4152 	}
4153 }
4154 
4155 void xe_vm_snapshot_free(struct xe_vm_snapshot *snap)
4156 {
4157 	unsigned long i;
4158 
4159 	if (IS_ERR_OR_NULL(snap))
4160 		return;
4161 
4162 	for (i = 0; i < snap->num_snaps; i++) {
4163 		if (!IS_ERR(snap->snap[i].data))
4164 			kvfree(snap->snap[i].data);
4165 		xe_bo_put(snap->snap[i].bo);
4166 		if (snap->snap[i].mm)
4167 			mmput(snap->snap[i].mm);
4168 	}
4169 	kvfree(snap);
4170 }
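
/*
 * Usage sketch (illustrative only, not used by the driver): the intended life
 * cycle of a VM snapshot. xe_vm_snapshot_capture() only records VMA metadata
 * and takes BO/mm references (GFP_NOWAIT allocation under snap_mutex), the
 * actual data copy is deferred to xe_vm_snapshot_capture_delayed() which may
 * sleep and fault in user pages, and the result is emitted with
 * xe_vm_snapshot_print() before being freed. The function name
 * example_vm_coredump() and the drm_printer source are hypothetical. All four
 * helpers tolerate an IS_ERR_OR_NULL() snapshot, so no error check is needed
 * between the calls.
 */
static void __maybe_unused example_vm_coredump(struct xe_vm *vm,
					       struct drm_printer *p)
{
	struct xe_vm_snapshot *snap;

	snap = xe_vm_snapshot_capture(vm);

	/* Later, from a context that may sleep: */
	xe_vm_snapshot_capture_delayed(snap);

	xe_vm_snapshot_print(snap, p);
	xe_vm_snapshot_free(snap);
}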
4171