xref: /linux/drivers/gpu/drm/xe/xe_vm.c (revision bfb4a6c721517a11b277e8841f8a7a64b1b14b72)
1 // SPDX-License-Identifier: MIT
2 /*
3  * Copyright © 2021 Intel Corporation
4  */
5 
6 #include "xe_vm.h"
7 
8 #include <linux/dma-fence-array.h>
9 #include <linux/nospec.h>
10 
11 #include <drm/drm_drv.h>
12 #include <drm/drm_exec.h>
13 #include <drm/drm_print.h>
14 #include <drm/ttm/ttm_tt.h>
15 #include <uapi/drm/xe_drm.h>
16 #include <linux/ascii85.h>
17 #include <linux/delay.h>
18 #include <linux/kthread.h>
19 #include <linux/mm.h>
20 #include <linux/swap.h>
21 
22 #include <generated/xe_wa_oob.h>
23 
24 #include "regs/xe_gtt_defs.h"
25 #include "xe_assert.h"
26 #include "xe_bo.h"
27 #include "xe_device.h"
28 #include "xe_drm_client.h"
29 #include "xe_exec_queue.h"
30 #include "xe_gt_pagefault.h"
31 #include "xe_gt_tlb_invalidation.h"
32 #include "xe_migrate.h"
33 #include "xe_pat.h"
34 #include "xe_pm.h"
35 #include "xe_preempt_fence.h"
36 #include "xe_pt.h"
37 #include "xe_pxp.h"
38 #include "xe_res_cursor.h"
39 #include "xe_svm.h"
40 #include "xe_sync.h"
41 #include "xe_trace_bo.h"
42 #include "xe_wa.h"
43 #include "xe_hmm.h"
44 
45 static struct drm_gem_object *xe_vm_obj(struct xe_vm *vm)
46 {
47 	return vm->gpuvm.r_obj;
48 }
49 
50 /**
51  * xe_vma_userptr_check_repin() - Advisory check for repin needed
52  * @uvma: The userptr vma
53  *
54  * Check if the userptr vma has been invalidated since last successful
55  * repin. The check is advisory only and can the function can be called
56  * without the vm->userptr.notifier_lock held. There is no guarantee that the
57  * vma userptr will remain valid after a lockless check, so typically
58  * the call needs to be followed by a proper check under the notifier_lock.
59  *
60  * Return: 0 if userptr vma is valid, -EAGAIN otherwise; repin recommended.
61  */
62 int xe_vma_userptr_check_repin(struct xe_userptr_vma *uvma)
63 {
64 	return mmu_interval_check_retry(&uvma->userptr.notifier,
65 					uvma->userptr.notifier_seq) ?
66 		-EAGAIN : 0;
67 }
68 
69 int xe_vma_userptr_pin_pages(struct xe_userptr_vma *uvma)
70 {
71 	struct xe_vma *vma = &uvma->vma;
72 	struct xe_vm *vm = xe_vma_vm(vma);
73 	struct xe_device *xe = vm->xe;
74 
75 	lockdep_assert_held(&vm->lock);
76 	xe_assert(xe, xe_vma_is_userptr(vma));
77 
78 	return xe_hmm_userptr_populate_range(uvma, false);
79 }
80 
81 static bool preempt_fences_waiting(struct xe_vm *vm)
82 {
83 	struct xe_exec_queue *q;
84 
85 	lockdep_assert_held(&vm->lock);
86 	xe_vm_assert_held(vm);
87 
88 	list_for_each_entry(q, &vm->preempt.exec_queues, lr.link) {
89 		if (!q->lr.pfence ||
90 		    test_bit(DMA_FENCE_FLAG_ENABLE_SIGNAL_BIT,
91 			     &q->lr.pfence->flags)) {
92 			return true;
93 		}
94 	}
95 
96 	return false;
97 }
98 
99 static void free_preempt_fences(struct list_head *list)
100 {
101 	struct list_head *link, *next;
102 
103 	list_for_each_safe(link, next, list)
104 		xe_preempt_fence_free(to_preempt_fence_from_link(link));
105 }
106 
107 static int alloc_preempt_fences(struct xe_vm *vm, struct list_head *list,
108 				unsigned int *count)
109 {
110 	lockdep_assert_held(&vm->lock);
111 	xe_vm_assert_held(vm);
112 
113 	if (*count >= vm->preempt.num_exec_queues)
114 		return 0;
115 
116 	for (; *count < vm->preempt.num_exec_queues; ++(*count)) {
117 		struct xe_preempt_fence *pfence = xe_preempt_fence_alloc();
118 
119 		if (IS_ERR(pfence))
120 			return PTR_ERR(pfence);
121 
122 		list_move_tail(xe_preempt_fence_link(pfence), list);
123 	}
124 
125 	return 0;
126 }
127 
128 static int wait_for_existing_preempt_fences(struct xe_vm *vm)
129 {
130 	struct xe_exec_queue *q;
131 
132 	xe_vm_assert_held(vm);
133 
134 	list_for_each_entry(q, &vm->preempt.exec_queues, lr.link) {
135 		if (q->lr.pfence) {
136 			long timeout = dma_fence_wait(q->lr.pfence, false);
137 
138 			/* Only -ETIME on fence indicates VM needs to be killed */
139 			if (timeout < 0 || q->lr.pfence->error == -ETIME)
140 				return -ETIME;
141 
142 			dma_fence_put(q->lr.pfence);
143 			q->lr.pfence = NULL;
144 		}
145 	}
146 
147 	return 0;
148 }
149 
150 static bool xe_vm_is_idle(struct xe_vm *vm)
151 {
152 	struct xe_exec_queue *q;
153 
154 	xe_vm_assert_held(vm);
155 	list_for_each_entry(q, &vm->preempt.exec_queues, lr.link) {
156 		if (!xe_exec_queue_is_idle(q))
157 			return false;
158 	}
159 
160 	return true;
161 }
162 
163 static void arm_preempt_fences(struct xe_vm *vm, struct list_head *list)
164 {
165 	struct list_head *link;
166 	struct xe_exec_queue *q;
167 
168 	list_for_each_entry(q, &vm->preempt.exec_queues, lr.link) {
169 		struct dma_fence *fence;
170 
171 		link = list->next;
172 		xe_assert(vm->xe, link != list);
173 
174 		fence = xe_preempt_fence_arm(to_preempt_fence_from_link(link),
175 					     q, q->lr.context,
176 					     ++q->lr.seqno);
177 		dma_fence_put(q->lr.pfence);
178 		q->lr.pfence = fence;
179 	}
180 }
181 
182 static int add_preempt_fences(struct xe_vm *vm, struct xe_bo *bo)
183 {
184 	struct xe_exec_queue *q;
185 	int err;
186 
187 	xe_bo_assert_held(bo);
188 
189 	if (!vm->preempt.num_exec_queues)
190 		return 0;
191 
192 	err = dma_resv_reserve_fences(bo->ttm.base.resv, vm->preempt.num_exec_queues);
193 	if (err)
194 		return err;
195 
196 	list_for_each_entry(q, &vm->preempt.exec_queues, lr.link)
197 		if (q->lr.pfence) {
198 			dma_resv_add_fence(bo->ttm.base.resv,
199 					   q->lr.pfence,
200 					   DMA_RESV_USAGE_BOOKKEEP);
201 		}
202 
203 	return 0;
204 }
205 
206 static void resume_and_reinstall_preempt_fences(struct xe_vm *vm,
207 						struct drm_exec *exec)
208 {
209 	struct xe_exec_queue *q;
210 
211 	lockdep_assert_held(&vm->lock);
212 	xe_vm_assert_held(vm);
213 
214 	list_for_each_entry(q, &vm->preempt.exec_queues, lr.link) {
215 		q->ops->resume(q);
216 
217 		drm_gpuvm_resv_add_fence(&vm->gpuvm, exec, q->lr.pfence,
218 					 DMA_RESV_USAGE_BOOKKEEP, DMA_RESV_USAGE_BOOKKEEP);
219 	}
220 }
221 
222 int xe_vm_add_compute_exec_queue(struct xe_vm *vm, struct xe_exec_queue *q)
223 {
224 	struct drm_gpuvm_exec vm_exec = {
225 		.vm = &vm->gpuvm,
226 		.flags = DRM_EXEC_INTERRUPTIBLE_WAIT,
227 		.num_fences = 1,
228 	};
229 	struct drm_exec *exec = &vm_exec.exec;
230 	struct dma_fence *pfence;
231 	int err;
232 	bool wait;
233 
234 	xe_assert(vm->xe, xe_vm_in_preempt_fence_mode(vm));
235 
236 	down_write(&vm->lock);
237 	err = drm_gpuvm_exec_lock(&vm_exec);
238 	if (err)
239 		goto out_up_write;
240 
241 	pfence = xe_preempt_fence_create(q, q->lr.context,
242 					 ++q->lr.seqno);
243 	if (!pfence) {
244 		err = -ENOMEM;
245 		goto out_fini;
246 	}
247 
248 	list_add(&q->lr.link, &vm->preempt.exec_queues);
249 	++vm->preempt.num_exec_queues;
250 	q->lr.pfence = pfence;
251 
252 	down_read(&vm->userptr.notifier_lock);
253 
254 	drm_gpuvm_resv_add_fence(&vm->gpuvm, exec, pfence,
255 				 DMA_RESV_USAGE_BOOKKEEP, DMA_RESV_USAGE_BOOKKEEP);
256 
257 	/*
258 	 * Check to see if a preemption on VM is in flight or userptr
259 	 * invalidation, if so trigger this preempt fence to sync state with
260 	 * other preempt fences on the VM.
261 	 */
262 	wait = __xe_vm_userptr_needs_repin(vm) || preempt_fences_waiting(vm);
263 	if (wait)
264 		dma_fence_enable_sw_signaling(pfence);
265 
266 	up_read(&vm->userptr.notifier_lock);
267 
268 out_fini:
269 	drm_exec_fini(exec);
270 out_up_write:
271 	up_write(&vm->lock);
272 
273 	return err;
274 }
275 ALLOW_ERROR_INJECTION(xe_vm_add_compute_exec_queue, ERRNO);
276 
277 /**
278  * xe_vm_remove_compute_exec_queue() - Remove compute exec queue from VM
279  * @vm: The VM.
280  * @q: The exec_queue
281  *
282  * Note that this function might be called multiple times on the same queue.
283  */
284 void xe_vm_remove_compute_exec_queue(struct xe_vm *vm, struct xe_exec_queue *q)
285 {
286 	if (!xe_vm_in_preempt_fence_mode(vm))
287 		return;
288 
289 	down_write(&vm->lock);
290 	if (!list_empty(&q->lr.link)) {
291 		list_del_init(&q->lr.link);
292 		--vm->preempt.num_exec_queues;
293 	}
294 	if (q->lr.pfence) {
295 		dma_fence_enable_sw_signaling(q->lr.pfence);
296 		dma_fence_put(q->lr.pfence);
297 		q->lr.pfence = NULL;
298 	}
299 	up_write(&vm->lock);
300 }
301 
302 /**
303  * __xe_vm_userptr_needs_repin() - Check whether the VM does have userptrs
304  * that need repinning.
305  * @vm: The VM.
306  *
307  * This function checks for whether the VM has userptrs that need repinning,
308  * and provides a release-type barrier on the userptr.notifier_lock after
309  * checking.
310  *
311  * Return: 0 if there are no userptrs needing repinning, -EAGAIN if there are.
312  */
313 int __xe_vm_userptr_needs_repin(struct xe_vm *vm)
314 {
315 	lockdep_assert_held_read(&vm->userptr.notifier_lock);
316 
317 	return (list_empty(&vm->userptr.repin_list) &&
318 		list_empty(&vm->userptr.invalidated)) ? 0 : -EAGAIN;
319 }
320 
/*
 * Retry window, in milliseconds, that xe_vm_validate_should_retry() adds to
 * the current time when it first arms its deadline cookie.
 */
#define XE_VM_REBIND_RETRY_TIMEOUT_MS 1000
322 
323 /**
324  * xe_vm_kill() - VM Kill
325  * @vm: The VM.
326  * @unlocked: Flag indicates the VM's dma-resv is not held
327  *
328  * Kill the VM by setting banned flag indicated VM is no longer available for
329  * use. If in preempt fence mode, also kill all exec queue attached to the VM.
330  */
331 void xe_vm_kill(struct xe_vm *vm, bool unlocked)
332 {
333 	struct xe_exec_queue *q;
334 
335 	lockdep_assert_held(&vm->lock);
336 
337 	if (unlocked)
338 		xe_vm_lock(vm, false);
339 
340 	vm->flags |= XE_VM_FLAG_BANNED;
341 	trace_xe_vm_kill(vm);
342 
343 	list_for_each_entry(q, &vm->preempt.exec_queues, lr.link)
344 		q->ops->kill(q);
345 
346 	if (unlocked)
347 		xe_vm_unlock(vm);
348 
349 	/* TODO: Inform user the VM is banned */
350 }
351 
352 /**
353  * xe_vm_validate_should_retry() - Whether to retry after a validate error.
354  * @exec: The drm_exec object used for locking before validation.
355  * @err: The error returned from ttm_bo_validate().
356  * @end: A ktime_t cookie that should be set to 0 before first use and
357  * that should be reused on subsequent calls.
358  *
359  * With multiple active VMs, under memory pressure, it is possible that
360  * ttm_bo_validate() run into -EDEADLK and in such case returns -ENOMEM.
361  * Until ttm properly handles locking in such scenarios, best thing the
362  * driver can do is retry with a timeout. Check if that is necessary, and
363  * if so unlock the drm_exec's objects while keeping the ticket to prepare
364  * for a rerun.
365  *
366  * Return: true if a retry after drm_exec_init() is recommended;
367  * false otherwise.
368  */
369 bool xe_vm_validate_should_retry(struct drm_exec *exec, int err, ktime_t *end)
370 {
371 	ktime_t cur;
372 
373 	if (err != -ENOMEM)
374 		return false;
375 
376 	cur = ktime_get();
377 	*end = *end ? : ktime_add_ms(cur, XE_VM_REBIND_RETRY_TIMEOUT_MS);
378 	if (!ktime_before(cur, *end))
379 		return false;
380 
381 	msleep(20);
382 	return true;
383 }
384 
385 static int xe_gpuvm_validate(struct drm_gpuvm_bo *vm_bo, struct drm_exec *exec)
386 {
387 	struct xe_vm *vm = gpuvm_to_vm(vm_bo->vm);
388 	struct drm_gpuva *gpuva;
389 	int ret;
390 
391 	lockdep_assert_held(&vm->lock);
392 	drm_gpuvm_bo_for_each_va(gpuva, vm_bo)
393 		list_move_tail(&gpuva_to_vma(gpuva)->combined_links.rebind,
394 			       &vm->rebind_list);
395 
396 	ret = xe_bo_validate(gem_to_xe_bo(vm_bo->obj), vm, false);
397 	if (ret)
398 		return ret;
399 
400 	vm_bo->evicted = false;
401 	return 0;
402 }
403 
404 /**
405  * xe_vm_validate_rebind() - Validate buffer objects and rebind vmas
406  * @vm: The vm for which we are rebinding.
407  * @exec: The struct drm_exec with the locked GEM objects.
408  * @num_fences: The number of fences to reserve for the operation, not
409  * including rebinds and validations.
410  *
411  * Validates all evicted gem objects and rebinds their vmas. Note that
412  * rebindings may cause evictions and hence the validation-rebind
413  * sequence is rerun until there are no more objects to validate.
414  *
415  * Return: 0 on success, negative error code on error. In particular,
416  * may return -EINTR or -ERESTARTSYS if interrupted, and -EDEADLK if
417  * the drm_exec transaction needs to be restarted.
418  */
419 int xe_vm_validate_rebind(struct xe_vm *vm, struct drm_exec *exec,
420 			  unsigned int num_fences)
421 {
422 	struct drm_gem_object *obj;
423 	unsigned long index;
424 	int ret;
425 
426 	do {
427 		ret = drm_gpuvm_validate(&vm->gpuvm, exec);
428 		if (ret)
429 			return ret;
430 
431 		ret = xe_vm_rebind(vm, false);
432 		if (ret)
433 			return ret;
434 	} while (!list_empty(&vm->gpuvm.evict.list));
435 
436 	drm_exec_for_each_locked_object(exec, index, obj) {
437 		ret = dma_resv_reserve_fences(obj->resv, num_fences);
438 		if (ret)
439 			return ret;
440 	}
441 
442 	return 0;
443 }
444 
445 static int xe_preempt_work_begin(struct drm_exec *exec, struct xe_vm *vm,
446 				 bool *done)
447 {
448 	int err;
449 
450 	err = drm_gpuvm_prepare_vm(&vm->gpuvm, exec, 0);
451 	if (err)
452 		return err;
453 
454 	if (xe_vm_is_idle(vm)) {
455 		vm->preempt.rebind_deactivated = true;
456 		*done = true;
457 		return 0;
458 	}
459 
460 	if (!preempt_fences_waiting(vm)) {
461 		*done = true;
462 		return 0;
463 	}
464 
465 	err = drm_gpuvm_prepare_objects(&vm->gpuvm, exec, 0);
466 	if (err)
467 		return err;
468 
469 	err = wait_for_existing_preempt_fences(vm);
470 	if (err)
471 		return err;
472 
473 	/*
474 	 * Add validation and rebinding to the locking loop since both can
475 	 * cause evictions which may require blocing dma_resv locks.
476 	 * The fence reservation here is intended for the new preempt fences
477 	 * we attach at the end of the rebind work.
478 	 */
479 	return xe_vm_validate_rebind(vm, exec, vm->preempt.num_exec_queues);
480 }
481 
482 static void preempt_rebind_work_func(struct work_struct *w)
483 {
484 	struct xe_vm *vm = container_of(w, struct xe_vm, preempt.rebind_work);
485 	struct drm_exec exec;
486 	unsigned int fence_count = 0;
487 	LIST_HEAD(preempt_fences);
488 	ktime_t end = 0;
489 	int err = 0;
490 	long wait;
491 	int __maybe_unused tries = 0;
492 
493 	xe_assert(vm->xe, xe_vm_in_preempt_fence_mode(vm));
494 	trace_xe_vm_rebind_worker_enter(vm);
495 
496 	down_write(&vm->lock);
497 
498 	if (xe_vm_is_closed_or_banned(vm)) {
499 		up_write(&vm->lock);
500 		trace_xe_vm_rebind_worker_exit(vm);
501 		return;
502 	}
503 
504 retry:
505 	if (xe_vm_userptr_check_repin(vm)) {
506 		err = xe_vm_userptr_pin(vm);
507 		if (err)
508 			goto out_unlock_outer;
509 	}
510 
511 	drm_exec_init(&exec, DRM_EXEC_INTERRUPTIBLE_WAIT, 0);
512 
513 	drm_exec_until_all_locked(&exec) {
514 		bool done = false;
515 
516 		err = xe_preempt_work_begin(&exec, vm, &done);
517 		drm_exec_retry_on_contention(&exec);
518 		if (err || done) {
519 			drm_exec_fini(&exec);
520 			if (err && xe_vm_validate_should_retry(&exec, err, &end))
521 				err = -EAGAIN;
522 
523 			goto out_unlock_outer;
524 		}
525 	}
526 
527 	err = alloc_preempt_fences(vm, &preempt_fences, &fence_count);
528 	if (err)
529 		goto out_unlock;
530 
531 	err = xe_vm_rebind(vm, true);
532 	if (err)
533 		goto out_unlock;
534 
535 	/* Wait on rebinds and munmap style VM unbinds */
536 	wait = dma_resv_wait_timeout(xe_vm_resv(vm),
537 				     DMA_RESV_USAGE_KERNEL,
538 				     false, MAX_SCHEDULE_TIMEOUT);
539 	if (wait <= 0) {
540 		err = -ETIME;
541 		goto out_unlock;
542 	}
543 
544 #define retry_required(__tries, __vm) \
545 	(IS_ENABLED(CONFIG_DRM_XE_USERPTR_INVAL_INJECT) ? \
546 	(!(__tries)++ || __xe_vm_userptr_needs_repin(__vm)) : \
547 	__xe_vm_userptr_needs_repin(__vm))
548 
549 	down_read(&vm->userptr.notifier_lock);
550 	if (retry_required(tries, vm)) {
551 		up_read(&vm->userptr.notifier_lock);
552 		err = -EAGAIN;
553 		goto out_unlock;
554 	}
555 
556 #undef retry_required
557 
558 	spin_lock(&vm->xe->ttm.lru_lock);
559 	ttm_lru_bulk_move_tail(&vm->lru_bulk_move);
560 	spin_unlock(&vm->xe->ttm.lru_lock);
561 
562 	/* Point of no return. */
563 	arm_preempt_fences(vm, &preempt_fences);
564 	resume_and_reinstall_preempt_fences(vm, &exec);
565 	up_read(&vm->userptr.notifier_lock);
566 
567 out_unlock:
568 	drm_exec_fini(&exec);
569 out_unlock_outer:
570 	if (err == -EAGAIN) {
571 		trace_xe_vm_rebind_worker_retry(vm);
572 		goto retry;
573 	}
574 
575 	if (err) {
576 		drm_warn(&vm->xe->drm, "VM worker error: %d\n", err);
577 		xe_vm_kill(vm, true);
578 	}
579 	up_write(&vm->lock);
580 
581 	free_preempt_fences(&preempt_fences);
582 
583 	trace_xe_vm_rebind_worker_exit(vm);
584 }
585 
586 static void __vma_userptr_invalidate(struct xe_vm *vm, struct xe_userptr_vma *uvma)
587 {
588 	struct xe_userptr *userptr = &uvma->userptr;
589 	struct xe_vma *vma = &uvma->vma;
590 	struct dma_resv_iter cursor;
591 	struct dma_fence *fence;
592 	long err;
593 
594 	/*
595 	 * Tell exec and rebind worker they need to repin and rebind this
596 	 * userptr.
597 	 */
598 	if (!xe_vm_in_fault_mode(vm) &&
599 	    !(vma->gpuva.flags & XE_VMA_DESTROYED)) {
600 		spin_lock(&vm->userptr.invalidated_lock);
601 		list_move_tail(&userptr->invalidate_link,
602 			       &vm->userptr.invalidated);
603 		spin_unlock(&vm->userptr.invalidated_lock);
604 	}
605 
606 	/*
607 	 * Preempt fences turn into schedule disables, pipeline these.
608 	 * Note that even in fault mode, we need to wait for binds and
609 	 * unbinds to complete, and those are attached as BOOKMARK fences
610 	 * to the vm.
611 	 */
612 	dma_resv_iter_begin(&cursor, xe_vm_resv(vm),
613 			    DMA_RESV_USAGE_BOOKKEEP);
614 	dma_resv_for_each_fence_unlocked(&cursor, fence)
615 		dma_fence_enable_sw_signaling(fence);
616 	dma_resv_iter_end(&cursor);
617 
618 	err = dma_resv_wait_timeout(xe_vm_resv(vm),
619 				    DMA_RESV_USAGE_BOOKKEEP,
620 				    false, MAX_SCHEDULE_TIMEOUT);
621 	XE_WARN_ON(err <= 0);
622 
623 	if (xe_vm_in_fault_mode(vm) && userptr->initial_bind) {
624 		err = xe_vm_invalidate_vma(vma);
625 		XE_WARN_ON(err);
626 	}
627 
628 	xe_hmm_userptr_unmap(uvma);
629 }
630 
631 static bool vma_userptr_invalidate(struct mmu_interval_notifier *mni,
632 				   const struct mmu_notifier_range *range,
633 				   unsigned long cur_seq)
634 {
635 	struct xe_userptr_vma *uvma = container_of(mni, typeof(*uvma), userptr.notifier);
636 	struct xe_vma *vma = &uvma->vma;
637 	struct xe_vm *vm = xe_vma_vm(vma);
638 
639 	xe_assert(vm->xe, xe_vma_is_userptr(vma));
640 	trace_xe_vma_userptr_invalidate(vma);
641 
642 	if (!mmu_notifier_range_blockable(range))
643 		return false;
644 
645 	vm_dbg(&xe_vma_vm(vma)->xe->drm,
646 	       "NOTIFIER: addr=0x%016llx, range=0x%016llx",
647 		xe_vma_start(vma), xe_vma_size(vma));
648 
649 	down_write(&vm->userptr.notifier_lock);
650 	mmu_interval_set_seq(mni, cur_seq);
651 
652 	__vma_userptr_invalidate(vm, uvma);
653 	up_write(&vm->userptr.notifier_lock);
654 	trace_xe_vma_userptr_invalidate_complete(vma);
655 
656 	return true;
657 }
658 
659 static const struct mmu_interval_notifier_ops vma_userptr_notifier_ops = {
660 	.invalidate = vma_userptr_invalidate,
661 };
662 
663 #if IS_ENABLED(CONFIG_DRM_XE_USERPTR_INVAL_INJECT)
664 /**
665  * xe_vma_userptr_force_invalidate() - force invalidate a userptr
666  * @uvma: The userptr vma to invalidate
667  *
668  * Perform a forced userptr invalidation for testing purposes.
669  */
670 void xe_vma_userptr_force_invalidate(struct xe_userptr_vma *uvma)
671 {
672 	struct xe_vm *vm = xe_vma_vm(&uvma->vma);
673 
674 	/* Protect against concurrent userptr pinning */
675 	lockdep_assert_held(&vm->lock);
676 	/* Protect against concurrent notifiers */
677 	lockdep_assert_held(&vm->userptr.notifier_lock);
678 	/*
679 	 * Protect against concurrent instances of this function and
680 	 * the critical exec sections
681 	 */
682 	xe_vm_assert_held(vm);
683 
684 	if (!mmu_interval_read_retry(&uvma->userptr.notifier,
685 				     uvma->userptr.notifier_seq))
686 		uvma->userptr.notifier_seq -= 2;
687 	__vma_userptr_invalidate(vm, uvma);
688 }
689 #endif
690 
691 int xe_vm_userptr_pin(struct xe_vm *vm)
692 {
693 	struct xe_userptr_vma *uvma, *next;
694 	int err = 0;
695 
696 	xe_assert(vm->xe, !xe_vm_in_fault_mode(vm));
697 	lockdep_assert_held_write(&vm->lock);
698 
699 	/* Collect invalidated userptrs */
700 	spin_lock(&vm->userptr.invalidated_lock);
701 	xe_assert(vm->xe, list_empty(&vm->userptr.repin_list));
702 	list_for_each_entry_safe(uvma, next, &vm->userptr.invalidated,
703 				 userptr.invalidate_link) {
704 		list_del_init(&uvma->userptr.invalidate_link);
705 		list_add_tail(&uvma->userptr.repin_link,
706 			      &vm->userptr.repin_list);
707 	}
708 	spin_unlock(&vm->userptr.invalidated_lock);
709 
710 	/* Pin and move to bind list */
711 	list_for_each_entry_safe(uvma, next, &vm->userptr.repin_list,
712 				 userptr.repin_link) {
713 		err = xe_vma_userptr_pin_pages(uvma);
714 		if (err == -EFAULT) {
715 			list_del_init(&uvma->userptr.repin_link);
716 			/*
717 			 * We might have already done the pin once already, but
718 			 * then had to retry before the re-bind happened, due
719 			 * some other condition in the caller, but in the
720 			 * meantime the userptr got dinged by the notifier such
721 			 * that we need to revalidate here, but this time we hit
722 			 * the EFAULT. In such a case make sure we remove
723 			 * ourselves from the rebind list to avoid going down in
724 			 * flames.
725 			 */
726 			if (!list_empty(&uvma->vma.combined_links.rebind))
727 				list_del_init(&uvma->vma.combined_links.rebind);
728 
729 			/* Wait for pending binds */
730 			xe_vm_lock(vm, false);
731 			dma_resv_wait_timeout(xe_vm_resv(vm),
732 					      DMA_RESV_USAGE_BOOKKEEP,
733 					      false, MAX_SCHEDULE_TIMEOUT);
734 
735 			err = xe_vm_invalidate_vma(&uvma->vma);
736 			xe_vm_unlock(vm);
737 			if (err)
738 				break;
739 		} else {
740 			if (err)
741 				break;
742 
743 			list_del_init(&uvma->userptr.repin_link);
744 			list_move_tail(&uvma->vma.combined_links.rebind,
745 				       &vm->rebind_list);
746 		}
747 	}
748 
749 	if (err) {
750 		down_write(&vm->userptr.notifier_lock);
751 		spin_lock(&vm->userptr.invalidated_lock);
752 		list_for_each_entry_safe(uvma, next, &vm->userptr.repin_list,
753 					 userptr.repin_link) {
754 			list_del_init(&uvma->userptr.repin_link);
755 			list_move_tail(&uvma->userptr.invalidate_link,
756 				       &vm->userptr.invalidated);
757 		}
758 		spin_unlock(&vm->userptr.invalidated_lock);
759 		up_write(&vm->userptr.notifier_lock);
760 	}
761 	return err;
762 }
763 
764 /**
765  * xe_vm_userptr_check_repin() - Check whether the VM might have userptrs
766  * that need repinning.
767  * @vm: The VM.
768  *
769  * This function does an advisory check for whether the VM has userptrs that
770  * need repinning.
771  *
772  * Return: 0 if there are no indications of userptrs needing repinning,
773  * -EAGAIN if there are.
774  */
775 int xe_vm_userptr_check_repin(struct xe_vm *vm)
776 {
777 	return (list_empty_careful(&vm->userptr.repin_list) &&
778 		list_empty_careful(&vm->userptr.invalidated)) ? 0 : -EAGAIN;
779 }
780 
781 static int xe_vma_ops_alloc(struct xe_vma_ops *vops, bool array_of_binds)
782 {
783 	int i;
784 
785 	for (i = 0; i < XE_MAX_TILES_PER_DEVICE; ++i) {
786 		if (!vops->pt_update_ops[i].num_ops)
787 			continue;
788 
789 		vops->pt_update_ops[i].ops =
790 			kmalloc_array(vops->pt_update_ops[i].num_ops,
791 				      sizeof(*vops->pt_update_ops[i].ops),
792 				      GFP_KERNEL | __GFP_RETRY_MAYFAIL | __GFP_NOWARN);
793 		if (!vops->pt_update_ops[i].ops)
794 			return array_of_binds ? -ENOBUFS : -ENOMEM;
795 	}
796 
797 	return 0;
798 }
799 ALLOW_ERROR_INJECTION(xe_vma_ops_alloc, ERRNO);
800 
801 static void xe_vma_ops_fini(struct xe_vma_ops *vops)
802 {
803 	int i;
804 
805 	for (i = 0; i < XE_MAX_TILES_PER_DEVICE; ++i)
806 		kfree(vops->pt_update_ops[i].ops);
807 }
808 
809 static void xe_vma_ops_incr_pt_update_ops(struct xe_vma_ops *vops, u8 tile_mask)
810 {
811 	int i;
812 
813 	for (i = 0; i < XE_MAX_TILES_PER_DEVICE; ++i)
814 		if (BIT(i) & tile_mask)
815 			++vops->pt_update_ops[i].num_ops;
816 }
817 
818 static void xe_vm_populate_rebind(struct xe_vma_op *op, struct xe_vma *vma,
819 				  u8 tile_mask)
820 {
821 	INIT_LIST_HEAD(&op->link);
822 	op->tile_mask = tile_mask;
823 	op->base.op = DRM_GPUVA_OP_MAP;
824 	op->base.map.va.addr = vma->gpuva.va.addr;
825 	op->base.map.va.range = vma->gpuva.va.range;
826 	op->base.map.gem.obj = vma->gpuva.gem.obj;
827 	op->base.map.gem.offset = vma->gpuva.gem.offset;
828 	op->map.vma = vma;
829 	op->map.immediate = true;
830 	op->map.dumpable = vma->gpuva.flags & XE_VMA_DUMPABLE;
831 	op->map.is_null = xe_vma_is_null(vma);
832 }
833 
834 static int xe_vm_ops_add_rebind(struct xe_vma_ops *vops, struct xe_vma *vma,
835 				u8 tile_mask)
836 {
837 	struct xe_vma_op *op;
838 
839 	op = kzalloc(sizeof(*op), GFP_KERNEL);
840 	if (!op)
841 		return -ENOMEM;
842 
843 	xe_vm_populate_rebind(op, vma, tile_mask);
844 	list_add_tail(&op->link, &vops->list);
845 	xe_vma_ops_incr_pt_update_ops(vops, tile_mask);
846 
847 	return 0;
848 }
849 
850 static struct dma_fence *ops_execute(struct xe_vm *vm,
851 				     struct xe_vma_ops *vops);
852 static void xe_vma_ops_init(struct xe_vma_ops *vops, struct xe_vm *vm,
853 			    struct xe_exec_queue *q,
854 			    struct xe_sync_entry *syncs, u32 num_syncs);
855 
856 int xe_vm_rebind(struct xe_vm *vm, bool rebind_worker)
857 {
858 	struct dma_fence *fence;
859 	struct xe_vma *vma, *next;
860 	struct xe_vma_ops vops;
861 	struct xe_vma_op *op, *next_op;
862 	int err, i;
863 
864 	lockdep_assert_held(&vm->lock);
865 	if ((xe_vm_in_lr_mode(vm) && !rebind_worker) ||
866 	    list_empty(&vm->rebind_list))
867 		return 0;
868 
869 	xe_vma_ops_init(&vops, vm, NULL, NULL, 0);
870 	for (i = 0; i < XE_MAX_TILES_PER_DEVICE; ++i)
871 		vops.pt_update_ops[i].wait_vm_bookkeep = true;
872 
873 	xe_vm_assert_held(vm);
874 	list_for_each_entry(vma, &vm->rebind_list, combined_links.rebind) {
875 		xe_assert(vm->xe, vma->tile_present);
876 
877 		if (rebind_worker)
878 			trace_xe_vma_rebind_worker(vma);
879 		else
880 			trace_xe_vma_rebind_exec(vma);
881 
882 		err = xe_vm_ops_add_rebind(&vops, vma,
883 					   vma->tile_present);
884 		if (err)
885 			goto free_ops;
886 	}
887 
888 	err = xe_vma_ops_alloc(&vops, false);
889 	if (err)
890 		goto free_ops;
891 
892 	fence = ops_execute(vm, &vops);
893 	if (IS_ERR(fence)) {
894 		err = PTR_ERR(fence);
895 	} else {
896 		dma_fence_put(fence);
897 		list_for_each_entry_safe(vma, next, &vm->rebind_list,
898 					 combined_links.rebind)
899 			list_del_init(&vma->combined_links.rebind);
900 	}
901 free_ops:
902 	list_for_each_entry_safe(op, next_op, &vops.list, link) {
903 		list_del(&op->link);
904 		kfree(op);
905 	}
906 	xe_vma_ops_fini(&vops);
907 
908 	return err;
909 }
910 
911 struct dma_fence *xe_vma_rebind(struct xe_vm *vm, struct xe_vma *vma, u8 tile_mask)
912 {
913 	struct dma_fence *fence = NULL;
914 	struct xe_vma_ops vops;
915 	struct xe_vma_op *op, *next_op;
916 	struct xe_tile *tile;
917 	u8 id;
918 	int err;
919 
920 	lockdep_assert_held(&vm->lock);
921 	xe_vm_assert_held(vm);
922 	xe_assert(vm->xe, xe_vm_in_fault_mode(vm));
923 
924 	xe_vma_ops_init(&vops, vm, NULL, NULL, 0);
925 	for_each_tile(tile, vm->xe, id) {
926 		vops.pt_update_ops[id].wait_vm_bookkeep = true;
927 		vops.pt_update_ops[tile->id].q =
928 			xe_tile_migrate_exec_queue(tile);
929 	}
930 
931 	err = xe_vm_ops_add_rebind(&vops, vma, tile_mask);
932 	if (err)
933 		return ERR_PTR(err);
934 
935 	err = xe_vma_ops_alloc(&vops, false);
936 	if (err) {
937 		fence = ERR_PTR(err);
938 		goto free_ops;
939 	}
940 
941 	fence = ops_execute(vm, &vops);
942 
943 free_ops:
944 	list_for_each_entry_safe(op, next_op, &vops.list, link) {
945 		list_del(&op->link);
946 		kfree(op);
947 	}
948 	xe_vma_ops_fini(&vops);
949 
950 	return fence;
951 }
952 
953 static void xe_vm_populate_range_rebind(struct xe_vma_op *op,
954 					struct xe_vma *vma,
955 					struct xe_svm_range *range,
956 					u8 tile_mask)
957 {
958 	INIT_LIST_HEAD(&op->link);
959 	op->tile_mask = tile_mask;
960 	op->base.op = DRM_GPUVA_OP_DRIVER;
961 	op->subop = XE_VMA_SUBOP_MAP_RANGE;
962 	op->map_range.vma = vma;
963 	op->map_range.range = range;
964 }
965 
966 static int
967 xe_vm_ops_add_range_rebind(struct xe_vma_ops *vops,
968 			   struct xe_vma *vma,
969 			   struct xe_svm_range *range,
970 			   u8 tile_mask)
971 {
972 	struct xe_vma_op *op;
973 
974 	op = kzalloc(sizeof(*op), GFP_KERNEL);
975 	if (!op)
976 		return -ENOMEM;
977 
978 	xe_vm_populate_range_rebind(op, vma, range, tile_mask);
979 	list_add_tail(&op->link, &vops->list);
980 	xe_vma_ops_incr_pt_update_ops(vops, tile_mask);
981 
982 	return 0;
983 }
984 
985 /**
986  * xe_vm_range_rebind() - VM range (re)bind
987  * @vm: The VM which the range belongs to.
988  * @vma: The VMA which the range belongs to.
989  * @range: SVM range to rebind.
990  * @tile_mask: Tile mask to bind the range to.
991  *
992  * (re)bind SVM range setting up GPU page tables for the range.
993  *
994  * Return: dma fence for rebind to signal completion on succees, ERR_PTR on
995  * failure
996  */
997 struct dma_fence *xe_vm_range_rebind(struct xe_vm *vm,
998 				     struct xe_vma *vma,
999 				     struct xe_svm_range *range,
1000 				     u8 tile_mask)
1001 {
1002 	struct dma_fence *fence = NULL;
1003 	struct xe_vma_ops vops;
1004 	struct xe_vma_op *op, *next_op;
1005 	struct xe_tile *tile;
1006 	u8 id;
1007 	int err;
1008 
1009 	lockdep_assert_held(&vm->lock);
1010 	xe_vm_assert_held(vm);
1011 	xe_assert(vm->xe, xe_vm_in_fault_mode(vm));
1012 	xe_assert(vm->xe, xe_vma_is_cpu_addr_mirror(vma));
1013 
1014 	xe_vma_ops_init(&vops, vm, NULL, NULL, 0);
1015 	for_each_tile(tile, vm->xe, id) {
1016 		vops.pt_update_ops[id].wait_vm_bookkeep = true;
1017 		vops.pt_update_ops[tile->id].q =
1018 			xe_tile_migrate_exec_queue(tile);
1019 	}
1020 
1021 	err = xe_vm_ops_add_range_rebind(&vops, vma, range, tile_mask);
1022 	if (err)
1023 		return ERR_PTR(err);
1024 
1025 	err = xe_vma_ops_alloc(&vops, false);
1026 	if (err) {
1027 		fence = ERR_PTR(err);
1028 		goto free_ops;
1029 	}
1030 
1031 	fence = ops_execute(vm, &vops);
1032 
1033 free_ops:
1034 	list_for_each_entry_safe(op, next_op, &vops.list, link) {
1035 		list_del(&op->link);
1036 		kfree(op);
1037 	}
1038 	xe_vma_ops_fini(&vops);
1039 
1040 	return fence;
1041 }
1042 
1043 static void xe_vm_populate_range_unbind(struct xe_vma_op *op,
1044 					struct xe_svm_range *range)
1045 {
1046 	INIT_LIST_HEAD(&op->link);
1047 	op->tile_mask = range->tile_present;
1048 	op->base.op = DRM_GPUVA_OP_DRIVER;
1049 	op->subop = XE_VMA_SUBOP_UNMAP_RANGE;
1050 	op->unmap_range.range = range;
1051 }
1052 
1053 static int
1054 xe_vm_ops_add_range_unbind(struct xe_vma_ops *vops,
1055 			   struct xe_svm_range *range)
1056 {
1057 	struct xe_vma_op *op;
1058 
1059 	op = kzalloc(sizeof(*op), GFP_KERNEL);
1060 	if (!op)
1061 		return -ENOMEM;
1062 
1063 	xe_vm_populate_range_unbind(op, range);
1064 	list_add_tail(&op->link, &vops->list);
1065 	xe_vma_ops_incr_pt_update_ops(vops, range->tile_present);
1066 
1067 	return 0;
1068 }
1069 
1070 /**
1071  * xe_vm_range_unbind() - VM range unbind
1072  * @vm: The VM which the range belongs to.
1073  * @range: SVM range to rebind.
1074  *
1075  * Unbind SVM range removing the GPU page tables for the range.
1076  *
1077  * Return: dma fence for unbind to signal completion on succees, ERR_PTR on
1078  * failure
1079  */
1080 struct dma_fence *xe_vm_range_unbind(struct xe_vm *vm,
1081 				     struct xe_svm_range *range)
1082 {
1083 	struct dma_fence *fence = NULL;
1084 	struct xe_vma_ops vops;
1085 	struct xe_vma_op *op, *next_op;
1086 	struct xe_tile *tile;
1087 	u8 id;
1088 	int err;
1089 
1090 	lockdep_assert_held(&vm->lock);
1091 	xe_vm_assert_held(vm);
1092 	xe_assert(vm->xe, xe_vm_in_fault_mode(vm));
1093 
1094 	if (!range->tile_present)
1095 		return dma_fence_get_stub();
1096 
1097 	xe_vma_ops_init(&vops, vm, NULL, NULL, 0);
1098 	for_each_tile(tile, vm->xe, id) {
1099 		vops.pt_update_ops[id].wait_vm_bookkeep = true;
1100 		vops.pt_update_ops[tile->id].q =
1101 			xe_tile_migrate_exec_queue(tile);
1102 	}
1103 
1104 	err = xe_vm_ops_add_range_unbind(&vops, range);
1105 	if (err)
1106 		return ERR_PTR(err);
1107 
1108 	err = xe_vma_ops_alloc(&vops, false);
1109 	if (err) {
1110 		fence = ERR_PTR(err);
1111 		goto free_ops;
1112 	}
1113 
1114 	fence = ops_execute(vm, &vops);
1115 
1116 free_ops:
1117 	list_for_each_entry_safe(op, next_op, &vops.list, link) {
1118 		list_del(&op->link);
1119 		kfree(op);
1120 	}
1121 	xe_vma_ops_fini(&vops);
1122 
1123 	return fence;
1124 }
1125 
/*
 * Free the memory backing @vma. A userptr VMA is freed through its
 * containing struct xe_userptr_vma; any other VMA is freed directly.
 */
static void xe_vma_free(struct xe_vma *vma)
{
	if (!xe_vma_is_userptr(vma)) {
		kfree(vma);
		return;
	}

	kfree(to_userptr_vma(vma));
}
1133 
/*
 * Flags accepted by xe_vma_create(). Each one selects a property of the
 * VMA being created: read-only mapping, NULL (sparse) mapping, dumpable
 * mapping, or a CPU address mirror (system allocator) mapping.
 */
#define VMA_CREATE_FLAG_READ_ONLY		BIT(0)
#define VMA_CREATE_FLAG_IS_NULL			BIT(1)
#define VMA_CREATE_FLAG_DUMPABLE		BIT(2)
#define VMA_CREATE_FLAG_IS_SYSTEM_ALLOCATOR	BIT(3)
1138 
1139 static struct xe_vma *xe_vma_create(struct xe_vm *vm,
1140 				    struct xe_bo *bo,
1141 				    u64 bo_offset_or_userptr,
1142 				    u64 start, u64 end,
1143 				    u16 pat_index, unsigned int flags)
1144 {
1145 	struct xe_vma *vma;
1146 	struct xe_tile *tile;
1147 	u8 id;
1148 	bool read_only = (flags & VMA_CREATE_FLAG_READ_ONLY);
1149 	bool is_null = (flags & VMA_CREATE_FLAG_IS_NULL);
1150 	bool dumpable = (flags & VMA_CREATE_FLAG_DUMPABLE);
1151 	bool is_cpu_addr_mirror =
1152 		(flags & VMA_CREATE_FLAG_IS_SYSTEM_ALLOCATOR);
1153 
1154 	xe_assert(vm->xe, start < end);
1155 	xe_assert(vm->xe, end < vm->size);
1156 
1157 	/*
1158 	 * Allocate and ensure that the xe_vma_is_userptr() return
1159 	 * matches what was allocated.
1160 	 */
1161 	if (!bo && !is_null && !is_cpu_addr_mirror) {
1162 		struct xe_userptr_vma *uvma = kzalloc(sizeof(*uvma), GFP_KERNEL);
1163 
1164 		if (!uvma)
1165 			return ERR_PTR(-ENOMEM);
1166 
1167 		vma = &uvma->vma;
1168 	} else {
1169 		vma = kzalloc(sizeof(*vma), GFP_KERNEL);
1170 		if (!vma)
1171 			return ERR_PTR(-ENOMEM);
1172 
1173 		if (is_cpu_addr_mirror)
1174 			vma->gpuva.flags |= XE_VMA_SYSTEM_ALLOCATOR;
1175 		if (is_null)
1176 			vma->gpuva.flags |= DRM_GPUVA_SPARSE;
1177 		if (bo)
1178 			vma->gpuva.gem.obj = &bo->ttm.base;
1179 	}
1180 
1181 	INIT_LIST_HEAD(&vma->combined_links.rebind);
1182 
1183 	INIT_LIST_HEAD(&vma->gpuva.gem.entry);
1184 	vma->gpuva.vm = &vm->gpuvm;
1185 	vma->gpuva.va.addr = start;
1186 	vma->gpuva.va.range = end - start + 1;
1187 	if (read_only)
1188 		vma->gpuva.flags |= XE_VMA_READ_ONLY;
1189 	if (dumpable)
1190 		vma->gpuva.flags |= XE_VMA_DUMPABLE;
1191 
1192 	for_each_tile(tile, vm->xe, id)
1193 		vma->tile_mask |= 0x1 << id;
1194 
1195 	if (vm->xe->info.has_atomic_enable_pte_bit)
1196 		vma->gpuva.flags |= XE_VMA_ATOMIC_PTE_BIT;
1197 
1198 	vma->pat_index = pat_index;
1199 
1200 	if (bo) {
1201 		struct drm_gpuvm_bo *vm_bo;
1202 
1203 		xe_bo_assert_held(bo);
1204 
1205 		vm_bo = drm_gpuvm_bo_obtain(vma->gpuva.vm, &bo->ttm.base);
1206 		if (IS_ERR(vm_bo)) {
1207 			xe_vma_free(vma);
1208 			return ERR_CAST(vm_bo);
1209 		}
1210 
1211 		drm_gpuvm_bo_extobj_add(vm_bo);
1212 		drm_gem_object_get(&bo->ttm.base);
1213 		vma->gpuva.gem.offset = bo_offset_or_userptr;
1214 		drm_gpuva_link(&vma->gpuva, vm_bo);
1215 		drm_gpuvm_bo_put(vm_bo);
1216 	} else /* userptr or null */ {
1217 		if (!is_null && !is_cpu_addr_mirror) {
1218 			struct xe_userptr *userptr = &to_userptr_vma(vma)->userptr;
1219 			u64 size = end - start + 1;
1220 			int err;
1221 
1222 			INIT_LIST_HEAD(&userptr->invalidate_link);
1223 			INIT_LIST_HEAD(&userptr->repin_link);
1224 			vma->gpuva.gem.offset = bo_offset_or_userptr;
1225 			mutex_init(&userptr->unmap_mutex);
1226 
1227 			err = mmu_interval_notifier_insert(&userptr->notifier,
1228 							   current->mm,
1229 							   xe_vma_userptr(vma), size,
1230 							   &vma_userptr_notifier_ops);
1231 			if (err) {
1232 				xe_vma_free(vma);
1233 				return ERR_PTR(err);
1234 			}
1235 
1236 			userptr->notifier_seq = LONG_MAX;
1237 		}
1238 
1239 		xe_vm_get(vm);
1240 	}
1241 
1242 	return vma;
1243 }
1244 
1245 static void xe_vma_destroy_late(struct xe_vma *vma)
1246 {
1247 	struct xe_vm *vm = xe_vma_vm(vma);
1248 
1249 	if (vma->ufence) {
1250 		xe_sync_ufence_put(vma->ufence);
1251 		vma->ufence = NULL;
1252 	}
1253 
1254 	if (xe_vma_is_userptr(vma)) {
1255 		struct xe_userptr_vma *uvma = to_userptr_vma(vma);
1256 		struct xe_userptr *userptr = &uvma->userptr;
1257 
1258 		if (userptr->sg)
1259 			xe_hmm_userptr_free_sg(uvma);
1260 
1261 		/*
1262 		 * Since userptr pages are not pinned, we can't remove
1263 		 * the notifier until we're sure the GPU is not accessing
1264 		 * them anymore
1265 		 */
1266 		mmu_interval_notifier_remove(&userptr->notifier);
1267 		mutex_destroy(&userptr->unmap_mutex);
1268 		xe_vm_put(vm);
1269 	} else if (xe_vma_is_null(vma) || xe_vma_is_cpu_addr_mirror(vma)) {
1270 		xe_vm_put(vm);
1271 	} else {
1272 		xe_bo_put(xe_vma_bo(vma));
1273 	}
1274 
1275 	xe_vma_free(vma);
1276 }
1277 
1278 static void vma_destroy_work_func(struct work_struct *w)
1279 {
1280 	struct xe_vma *vma =
1281 		container_of(w, struct xe_vma, destroy_work);
1282 
1283 	xe_vma_destroy_late(vma);
1284 }
1285 
1286 static void vma_destroy_cb(struct dma_fence *fence,
1287 			   struct dma_fence_cb *cb)
1288 {
1289 	struct xe_vma *vma = container_of(cb, struct xe_vma, destroy_cb);
1290 
1291 	INIT_WORK(&vma->destroy_work, vma_destroy_work_func);
1292 	queue_work(system_unbound_wq, &vma->destroy_work);
1293 }
1294 
1295 static void xe_vma_destroy(struct xe_vma *vma, struct dma_fence *fence)
1296 {
1297 	struct xe_vm *vm = xe_vma_vm(vma);
1298 
1299 	lockdep_assert_held_write(&vm->lock);
1300 	xe_assert(vm->xe, list_empty(&vma->combined_links.destroy));
1301 
1302 	if (xe_vma_is_userptr(vma)) {
1303 		xe_assert(vm->xe, vma->gpuva.flags & XE_VMA_DESTROYED);
1304 
1305 		spin_lock(&vm->userptr.invalidated_lock);
1306 		xe_assert(vm->xe, list_empty(&to_userptr_vma(vma)->userptr.repin_link));
1307 		list_del(&to_userptr_vma(vma)->userptr.invalidate_link);
1308 		spin_unlock(&vm->userptr.invalidated_lock);
1309 	} else if (!xe_vma_is_null(vma) && !xe_vma_is_cpu_addr_mirror(vma)) {
1310 		xe_bo_assert_held(xe_vma_bo(vma));
1311 
1312 		drm_gpuva_unlink(&vma->gpuva);
1313 	}
1314 
1315 	xe_vm_assert_held(vm);
1316 	if (fence) {
1317 		int ret = dma_fence_add_callback(fence, &vma->destroy_cb,
1318 						 vma_destroy_cb);
1319 
1320 		if (ret) {
1321 			XE_WARN_ON(ret != -ENOENT);
1322 			xe_vma_destroy_late(vma);
1323 		}
1324 	} else {
1325 		xe_vma_destroy_late(vma);
1326 	}
1327 }
1328 
1329 /**
1330  * xe_vm_lock_vma() - drm_exec utility to lock a vma
1331  * @exec: The drm_exec object we're currently locking for.
1332  * @vma: The vma for witch we want to lock the vm resv and any attached
1333  * object's resv.
1334  *
1335  * Return: 0 on success, negative error code on error. In particular
1336  * may return -EDEADLK on WW transaction contention and -EINTR if
1337  * an interruptible wait is terminated by a signal.
1338  */
1339 int xe_vm_lock_vma(struct drm_exec *exec, struct xe_vma *vma)
1340 {
1341 	struct xe_vm *vm = xe_vma_vm(vma);
1342 	struct xe_bo *bo = xe_vma_bo(vma);
1343 	int err;
1344 
1345 	XE_WARN_ON(!vm);
1346 
1347 	err = drm_exec_lock_obj(exec, xe_vm_obj(vm));
1348 	if (!err && bo && !bo->vm)
1349 		err = drm_exec_lock_obj(exec, &bo->ttm.base);
1350 
1351 	return err;
1352 }
1353 
/*
 * Destroy @vma immediately (no fence to wait on), taking the locks that
 * xe_vm_lock_vma() acquires for it through a local drm_exec context.
 */
static void xe_vma_destroy_unlocked(struct xe_vma *vma)
{
	struct drm_exec exec;
	int err;

	drm_exec_init(&exec, 0, 0);
	drm_exec_until_all_locked(&exec) {
		err = xe_vm_lock_vma(&exec, vma);
		drm_exec_retry_on_contention(&exec);
		/* A locking error is unexpected: warn, then destroy anyway */
		if (XE_WARN_ON(err))
			break;
	}

	xe_vma_destroy(vma, NULL);

	drm_exec_fini(&exec);
}
1371 
1372 struct xe_vma *
1373 xe_vm_find_overlapping_vma(struct xe_vm *vm, u64 start, u64 range)
1374 {
1375 	struct drm_gpuva *gpuva;
1376 
1377 	lockdep_assert_held(&vm->lock);
1378 
1379 	if (xe_vm_is_closed_or_banned(vm))
1380 		return NULL;
1381 
1382 	xe_assert(vm->xe, start + range <= vm->size);
1383 
1384 	gpuva = drm_gpuva_find_first(&vm->gpuvm, start, range);
1385 
1386 	return gpuva ? gpuva_to_vma(gpuva) : NULL;
1387 }
1388 
1389 static int xe_vm_insert_vma(struct xe_vm *vm, struct xe_vma *vma)
1390 {
1391 	int err;
1392 
1393 	xe_assert(vm->xe, xe_vma_vm(vma) == vm);
1394 	lockdep_assert_held(&vm->lock);
1395 
1396 	mutex_lock(&vm->snap_mutex);
1397 	err = drm_gpuva_insert(&vm->gpuvm, &vma->gpuva);
1398 	mutex_unlock(&vm->snap_mutex);
1399 	XE_WARN_ON(err);	/* Shouldn't be possible */
1400 
1401 	return err;
1402 }
1403 
1404 static void xe_vm_remove_vma(struct xe_vm *vm, struct xe_vma *vma)
1405 {
1406 	xe_assert(vm->xe, xe_vma_vm(vma) == vm);
1407 	lockdep_assert_held(&vm->lock);
1408 
1409 	mutex_lock(&vm->snap_mutex);
1410 	drm_gpuva_remove(&vma->gpuva);
1411 	mutex_unlock(&vm->snap_mutex);
1412 	if (vm->usm.last_fault_vma == vma)
1413 		vm->usm.last_fault_vma = NULL;
1414 }
1415 
1416 static struct drm_gpuva_op *xe_vm_op_alloc(void)
1417 {
1418 	struct xe_vma_op *op;
1419 
1420 	op = kzalloc(sizeof(*op), GFP_KERNEL);
1421 
1422 	if (unlikely(!op))
1423 		return NULL;
1424 
1425 	return &op->base;
1426 }
1427 
1428 static void xe_vm_free(struct drm_gpuvm *gpuvm);
1429 
/* GPUVM callbacks for Xe VMs; passed to drm_gpuvm_init() in xe_vm_create(). */
static const struct drm_gpuvm_ops gpuvm_ops = {
	.op_alloc = xe_vm_op_alloc,
	.vm_bo_validate = xe_gpuvm_validate,
	.vm_free = xe_vm_free,
};
1435 
1436 static u64 pde_encode_pat_index(u16 pat_index)
1437 {
1438 	u64 pte = 0;
1439 
1440 	if (pat_index & BIT(0))
1441 		pte |= XE_PPGTT_PTE_PAT0;
1442 
1443 	if (pat_index & BIT(1))
1444 		pte |= XE_PPGTT_PTE_PAT1;
1445 
1446 	return pte;
1447 }
1448 
1449 static u64 pte_encode_pat_index(u16 pat_index, u32 pt_level)
1450 {
1451 	u64 pte = 0;
1452 
1453 	if (pat_index & BIT(0))
1454 		pte |= XE_PPGTT_PTE_PAT0;
1455 
1456 	if (pat_index & BIT(1))
1457 		pte |= XE_PPGTT_PTE_PAT1;
1458 
1459 	if (pat_index & BIT(2)) {
1460 		if (pt_level)
1461 			pte |= XE_PPGTT_PDE_PDPE_PAT2;
1462 		else
1463 			pte |= XE_PPGTT_PTE_PAT2;
1464 	}
1465 
1466 	if (pat_index & BIT(3))
1467 		pte |= XELPG_PPGTT_PTE_PAT3;
1468 
1469 	if (pat_index & (BIT(4)))
1470 		pte |= XE2_PPGTT_PTE_PAT4;
1471 
1472 	return pte;
1473 }
1474 
1475 static u64 pte_encode_ps(u32 pt_level)
1476 {
1477 	XE_WARN_ON(pt_level > MAX_HUGEPTE_LEVEL);
1478 
1479 	if (pt_level == 1)
1480 		return XE_PDE_PS_2M;
1481 	else if (pt_level == 2)
1482 		return XE_PDPE_PS_1G;
1483 
1484 	return 0;
1485 }
1486 
1487 static u64 xelp_pde_encode_bo(struct xe_bo *bo, u64 bo_offset,
1488 			      const u16 pat_index)
1489 {
1490 	u64 pde;
1491 
1492 	pde = xe_bo_addr(bo, bo_offset, XE_PAGE_SIZE);
1493 	pde |= XE_PAGE_PRESENT | XE_PAGE_RW;
1494 	pde |= pde_encode_pat_index(pat_index);
1495 
1496 	return pde;
1497 }
1498 
1499 static u64 xelp_pte_encode_bo(struct xe_bo *bo, u64 bo_offset,
1500 			      u16 pat_index, u32 pt_level)
1501 {
1502 	u64 pte;
1503 
1504 	pte = xe_bo_addr(bo, bo_offset, XE_PAGE_SIZE);
1505 	pte |= XE_PAGE_PRESENT | XE_PAGE_RW;
1506 	pte |= pte_encode_pat_index(pat_index, pt_level);
1507 	pte |= pte_encode_ps(pt_level);
1508 
1509 	if (xe_bo_is_vram(bo) || xe_bo_is_stolen_devmem(bo))
1510 		pte |= XE_PPGTT_PTE_DM;
1511 
1512 	return pte;
1513 }
1514 
1515 static u64 xelp_pte_encode_vma(u64 pte, struct xe_vma *vma,
1516 			       u16 pat_index, u32 pt_level)
1517 {
1518 	pte |= XE_PAGE_PRESENT;
1519 
1520 	if (likely(!xe_vma_read_only(vma)))
1521 		pte |= XE_PAGE_RW;
1522 
1523 	pte |= pte_encode_pat_index(pat_index, pt_level);
1524 	pte |= pte_encode_ps(pt_level);
1525 
1526 	if (unlikely(xe_vma_is_null(vma)))
1527 		pte |= XE_PTE_NULL;
1528 
1529 	return pte;
1530 }
1531 
1532 static u64 xelp_pte_encode_addr(struct xe_device *xe, u64 addr,
1533 				u16 pat_index,
1534 				u32 pt_level, bool devmem, u64 flags)
1535 {
1536 	u64 pte;
1537 
1538 	/* Avoid passing random bits directly as flags */
1539 	xe_assert(xe, !(flags & ~XE_PTE_PS64));
1540 
1541 	pte = addr;
1542 	pte |= XE_PAGE_PRESENT | XE_PAGE_RW;
1543 	pte |= pte_encode_pat_index(pat_index, pt_level);
1544 	pte |= pte_encode_ps(pt_level);
1545 
1546 	if (devmem)
1547 		pte |= XE_PPGTT_PTE_DM;
1548 
1549 	pte |= flags;
1550 
1551 	return pte;
1552 }
1553 
/* Page-table entry encoders; installed as vm->pt_ops by xe_vm_create(). */
static const struct xe_pt_ops xelp_pt_ops = {
	.pte_encode_bo = xelp_pte_encode_bo,
	.pte_encode_vma = xelp_pte_encode_vma,
	.pte_encode_addr = xelp_pte_encode_addr,
	.pde_encode_bo = xelp_pde_encode_bo,
};
1560 
1561 static void vm_destroy_work_func(struct work_struct *w);
1562 
1563 /**
1564  * xe_vm_create_scratch() - Setup a scratch memory pagetable tree for the
1565  * given tile and vm.
1566  * @xe: xe device.
1567  * @tile: tile to set up for.
1568  * @vm: vm to set up for.
1569  *
1570  * Sets up a pagetable tree with one page-table per level and a single
1571  * leaf PTE. All pagetable entries point to the single page-table or,
1572  * for MAX_HUGEPTE_LEVEL, a NULL huge PTE returning 0 on read and
1573  * writes become NOPs.
1574  *
1575  * Return: 0 on success, negative error code on error.
1576  */
1577 static int xe_vm_create_scratch(struct xe_device *xe, struct xe_tile *tile,
1578 				struct xe_vm *vm)
1579 {
1580 	u8 id = tile->id;
1581 	int i;
1582 
1583 	for (i = MAX_HUGEPTE_LEVEL; i < vm->pt_root[id]->level; i++) {
1584 		vm->scratch_pt[id][i] = xe_pt_create(vm, tile, i);
1585 		if (IS_ERR(vm->scratch_pt[id][i]))
1586 			return PTR_ERR(vm->scratch_pt[id][i]);
1587 
1588 		xe_pt_populate_empty(tile, vm, vm->scratch_pt[id][i]);
1589 	}
1590 
1591 	return 0;
1592 }
1593 ALLOW_ERROR_INJECTION(xe_vm_create_scratch, ERRNO);
1594 
1595 static void xe_vm_free_scratch(struct xe_vm *vm)
1596 {
1597 	struct xe_tile *tile;
1598 	u8 id;
1599 
1600 	if (!xe_vm_has_scratch(vm))
1601 		return;
1602 
1603 	for_each_tile(tile, vm->xe, id) {
1604 		u32 i;
1605 
1606 		if (!vm->pt_root[id])
1607 			continue;
1608 
1609 		for (i = MAX_HUGEPTE_LEVEL; i < vm->pt_root[id]->level; ++i)
1610 			if (vm->scratch_pt[id][i])
1611 				xe_pt_destroy(vm->scratch_pt[id][i], vm->flags, NULL);
1612 	}
1613 }
1614 
/*
 * xe_vm_create() - Allocate and initialise a VM.
 * @xe: xe device.
 * @flags: XE_VM_FLAG_* flags describing the VM.
 *
 * Sets up locks, lists and the GPUVM base object, creates the root page
 * table (and, for VMs with scratch, the scratch page tables) per tile,
 * and, except for migration VMs, creates one bind exec queue per tile
 * that has a root page table.
 *
 * Return: the new VM on success, ERR_PTR() on failure.
 */
struct xe_vm *xe_vm_create(struct xe_device *xe, u32 flags)
{
	struct drm_gem_object *vm_resv_obj;
	struct xe_vm *vm;
	int err, number_tiles = 0;
	struct xe_tile *tile;
	u8 id;

	/*
	 * Since the GSCCS is not user-accessible, we don't expect a GSC VM to
	 * ever be in faulting mode.
	 */
	xe_assert(xe, !((flags & XE_VM_FLAG_GSC) && (flags & XE_VM_FLAG_FAULT_MODE)));

	vm = kzalloc(sizeof(*vm), GFP_KERNEL);
	if (!vm)
		return ERR_PTR(-ENOMEM);

	vm->xe = xe;

	vm->size = 1ull << xe->info.va_bits;

	vm->flags = flags;

	/*
	 * GSC VMs are kernel-owned, only used for PXP ops and can sometimes be
	 * manipulated under the PXP mutex. However, the PXP mutex can be taken
	 * under a user-VM lock when the PXP session is started at exec_queue
	 * creation time. Those are different VMs and therefore there is no risk
	 * of deadlock, but we need to tell lockdep that this is the case or it
	 * will print a warning.
	 */
	if (flags & XE_VM_FLAG_GSC) {
		static struct lock_class_key gsc_vm_key;

		__init_rwsem(&vm->lock, "gsc_vm", &gsc_vm_key);
	} else {
		init_rwsem(&vm->lock);
	}
	mutex_init(&vm->snap_mutex);

	INIT_LIST_HEAD(&vm->rebind_list);

	INIT_LIST_HEAD(&vm->userptr.repin_list);
	INIT_LIST_HEAD(&vm->userptr.invalidated);
	init_rwsem(&vm->userptr.notifier_lock);
	spin_lock_init(&vm->userptr.invalidated_lock);

	ttm_lru_bulk_move_init(&vm->lru_bulk_move);

	INIT_WORK(&vm->destroy_work, vm_destroy_work_func);

	INIT_LIST_HEAD(&vm->preempt.exec_queues);
	vm->preempt.min_run_period_ms = 10;	/* FIXME: Wire up to uAPI */

	for_each_tile(tile, xe, id)
		xe_range_fence_tree_init(&vm->rftree[id]);

	vm->pt_ops = &xelp_pt_ops;

	/*
	 * Long-running workloads are not protected by the scheduler references.
	 * By design, run_job for long-running workloads returns NULL and the
	 * scheduler drops all the references of it, hence protecting the VM
	 * for this case is necessary.
	 */
	if (flags & XE_VM_FLAG_LR_MODE) {
		INIT_WORK(&vm->preempt.rebind_work, preempt_rebind_work_func);
		xe_pm_runtime_get_noresume(xe);
	}

	if (flags & XE_VM_FLAG_FAULT_MODE) {
		err = xe_svm_init(vm);
		if (err)
			goto err_no_resv;
	}

	vm_resv_obj = drm_gpuvm_resv_object_alloc(&xe->drm);
	if (!vm_resv_obj) {
		err = -ENOMEM;
		goto err_svm_fini;
	}

	drm_gpuvm_init(&vm->gpuvm, "Xe VM", DRM_GPUVM_RESV_PROTECTED, &xe->drm,
		       vm_resv_obj, 0, vm->size, 0, 0, &gpuvm_ops);

	drm_gem_object_put(vm_resv_obj);

	/*
	 * From here on the gpuvm is initialised, so failures unwind through
	 * xe_vm_close_and_put() instead of the manual teardown labels below.
	 */
	err = xe_vm_lock(vm, true);
	if (err)
		goto err_close;

	if (IS_DGFX(xe) && xe->info.vram_flags & XE_VRAM_FLAGS_NEED64K)
		vm->flags |= XE_VM_FLAG_64K;

	for_each_tile(tile, xe, id) {
		/* A migration VM only gets a root on the tile it was made for */
		if (flags & XE_VM_FLAG_MIGRATION &&
		    tile->id != XE_VM_FLAG_TILE_ID(flags))
			continue;

		vm->pt_root[id] = xe_pt_create(vm, tile, xe->info.vm_max_level);
		if (IS_ERR(vm->pt_root[id])) {
			err = PTR_ERR(vm->pt_root[id]);
			/* Clear the slot so cleanup does not see an ERR_PTR */
			vm->pt_root[id] = NULL;
			goto err_unlock_close;
		}
	}

	if (xe_vm_has_scratch(vm)) {
		for_each_tile(tile, xe, id) {
			if (!vm->pt_root[id])
				continue;

			err = xe_vm_create_scratch(xe, tile, vm);
			if (err)
				goto err_unlock_close;
		}
		vm->batch_invalidate_tlb = true;
	}

	if (vm->flags & XE_VM_FLAG_LR_MODE)
		vm->batch_invalidate_tlb = false;

	/* Fill pt_root after allocating scratch tables */
	for_each_tile(tile, xe, id) {
		if (!vm->pt_root[id])
			continue;

		xe_pt_populate_empty(tile, vm, vm->pt_root[id]);
	}
	xe_vm_unlock(vm);

	/* Kernel migration VM shouldn't have a circular loop.. */
	if (!(flags & XE_VM_FLAG_MIGRATION)) {
		for_each_tile(tile, xe, id) {
			struct xe_exec_queue *q;
			u32 create_flags = EXEC_QUEUE_FLAG_VM;

			if (!vm->pt_root[id])
				continue;

			q = xe_exec_queue_create_bind(xe, tile, create_flags, 0);
			if (IS_ERR(q)) {
				err = PTR_ERR(q);
				goto err_close;
			}
			vm->q[id] = q;
			number_tiles++;
		}
	}

	/* A separate fence context is only allocated with more than one bind queue */
	if (number_tiles > 1)
		vm->composite_fence_ctx = dma_fence_context_alloc(1);

	trace_xe_vm_create(vm);

	return vm;

err_unlock_close:
	xe_vm_unlock(vm);
err_close:
	xe_vm_close_and_put(vm);
	return ERR_PTR(err);

	/* Failures before drm_gpuvm_init(): undo the early setup by hand */
err_svm_fini:
	if (flags & XE_VM_FLAG_FAULT_MODE) {
		vm->size = 0; /* close the vm */
		xe_svm_fini(vm);
	}
err_no_resv:
	mutex_destroy(&vm->snap_mutex);
	for_each_tile(tile, xe, id)
		xe_range_fence_tree_fini(&vm->rftree[id]);
	ttm_lru_bulk_move_fini(&xe->ttm, &vm->lru_bulk_move);
	kfree(vm);
	/* Balance the xe_pm_runtime_get_noresume() taken for LR mode above */
	if (flags & XE_VM_FLAG_LR_MODE)
		xe_pm_runtime_put(xe);
	return ERR_PTR(err);
}
1794 
/*
 * Mark @vm as closed by setting vm->size to 0. For non-migration VMs this
 * also waits for pending binds and, if the device is still bound, clears
 * the root page tables and invalidates the TLBs for the VM on every GT.
 */
static void xe_vm_close(struct xe_vm *vm)
{
	struct xe_device *xe = vm->xe;
	bool bound;
	int idx;

	/* Hardware is only touched below when the device is still bound */
	bound = drm_dev_enter(&xe->drm, &idx);

	down_write(&vm->lock);
	if (xe_vm_in_fault_mode(vm))
		xe_svm_notifier_lock(vm);

	/* A size of zero is what marks the VM as closed */
	vm->size = 0;

	if (!((vm->flags & XE_VM_FLAG_MIGRATION))) {
		struct xe_tile *tile;
		struct xe_gt *gt;
		u8 id;

		/* Wait for pending binds */
		dma_resv_wait_timeout(xe_vm_resv(vm),
				      DMA_RESV_USAGE_BOOKKEEP,
				      false, MAX_SCHEDULE_TIMEOUT);

		if (bound) {
			for_each_tile(tile, xe, id)
				if (vm->pt_root[id])
					xe_pt_clear(xe, vm->pt_root[id]);

			for_each_gt(gt, xe, id)
				xe_gt_tlb_invalidation_vm(gt, vm);
		}
	}

	/* Release in the reverse order of acquisition */
	if (xe_vm_in_fault_mode(vm))
		xe_svm_notifier_unlock(vm);
	up_write(&vm->lock);

	if (bound)
		drm_dev_exit(idx);
}
1836 
/*
 * xe_vm_close_and_put() - Close a VM, tear down its contents and drop a
 * reference.
 * @vm: The VM to close.
 *
 * Closes the VM, releases its bind exec queues, destroys every VMA and the
 * page tables, removes the ASID lookup entry if one was assigned, and
 * finally puts the VM. The VM must have no preempt-fence exec queues left.
 */
void xe_vm_close_and_put(struct xe_vm *vm)
{
	LIST_HEAD(contested);
	struct xe_device *xe = vm->xe;
	struct xe_tile *tile;
	struct xe_vma *vma, *next_vma;
	struct drm_gpuva *gpuva, *next;
	u8 id;

	xe_assert(xe, !vm->preempt.num_exec_queues);

	xe_vm_close(vm);
	if (xe_vm_in_preempt_fence_mode(vm))
		flush_work(&vm->preempt.rebind_work);
	if (xe_vm_in_fault_mode(vm))
		xe_svm_close(vm);

	/* Drop the last fences of the bind queues under vm->lock */
	down_write(&vm->lock);
	for_each_tile(tile, xe, id) {
		if (vm->q[id])
			xe_exec_queue_last_fence_put(vm->q[id], vm);
	}
	up_write(&vm->lock);

	/* Kill and release the per-tile bind queues without vm->lock held */
	for_each_tile(tile, xe, id) {
		if (vm->q[id]) {
			xe_exec_queue_kill(vm->q[id]);
			xe_exec_queue_put(vm->q[id]);
			vm->q[id] = NULL;
		}
	}

	down_write(&vm->lock);
	xe_vm_lock(vm, false);
	drm_gpuvm_for_each_va_safe(gpuva, next, &vm->gpuvm) {
		vma = gpuva_to_vma(gpuva);

		if (xe_vma_has_no_bo(vma)) {
			down_read(&vm->userptr.notifier_lock);
			vma->gpuva.flags |= XE_VMA_DESTROYED;
			up_read(&vm->userptr.notifier_lock);
		}

		xe_vm_remove_vma(vm, vma);

		/* easy case, remove from VMA? */
		if (xe_vma_has_no_bo(vma) || xe_vma_bo(vma)->vm) {
			list_del_init(&vma->combined_links.rebind);
			xe_vma_destroy(vma, NULL);
			continue;
		}

		/*
		 * Remaining VMAs (BO not tied to a VM) are collected and
		 * destroyed after the vm resv lock is dropped, see below.
		 */
		list_move_tail(&vma->combined_links.destroy, &contested);
		vma->gpuva.flags |= XE_VMA_DESTROYED;
	}

	/*
	 * All vm operations will add shared fences to resv.
	 * The only exception is eviction for a shared object,
	 * but even so, the unbind when evicted would still
	 * install a fence to resv. Hence it's safe to
	 * destroy the pagetables immediately.
	 */
	xe_vm_free_scratch(vm);

	for_each_tile(tile, xe, id) {
		if (vm->pt_root[id]) {
			xe_pt_destroy(vm->pt_root[id], vm->flags, NULL);
			vm->pt_root[id] = NULL;
		}
	}
	xe_vm_unlock(vm);

	/*
	 * VM is now dead, cannot re-add nodes to vm->vmas if it's NULL
	 * Since we hold a refcount to the bo, we can remove and free
	 * the members safely without locking.
	 */
	list_for_each_entry_safe(vma, next_vma, &contested,
				 combined_links.destroy) {
		list_del_init(&vma->combined_links.destroy);
		xe_vma_destroy_unlocked(vma);
	}

	if (xe_vm_in_fault_mode(vm))
		xe_svm_fini(vm);

	up_write(&vm->lock);

	/* Remove the ASID -> VM mapping, if this VM was given an ASID */
	down_write(&xe->usm.lock);
	if (vm->usm.asid) {
		void *lookup;

		xe_assert(xe, xe->info.has_asid);
		xe_assert(xe, !(vm->flags & XE_VM_FLAG_MIGRATION));

		lookup = xa_erase(&xe->usm.asid_to_vm, vm->usm.asid);
		xe_assert(xe, lookup == vm);
	}
	up_write(&xe->usm.lock);

	for_each_tile(tile, xe, id)
		xe_range_fence_tree_fini(&vm->rftree[id]);

	xe_vm_put(vm);
}
1943 
1944 static void vm_destroy_work_func(struct work_struct *w)
1945 {
1946 	struct xe_vm *vm =
1947 		container_of(w, struct xe_vm, destroy_work);
1948 	struct xe_device *xe = vm->xe;
1949 	struct xe_tile *tile;
1950 	u8 id;
1951 
1952 	/* xe_vm_close_and_put was not called? */
1953 	xe_assert(xe, !vm->size);
1954 
1955 	if (xe_vm_in_preempt_fence_mode(vm))
1956 		flush_work(&vm->preempt.rebind_work);
1957 
1958 	mutex_destroy(&vm->snap_mutex);
1959 
1960 	if (vm->flags & XE_VM_FLAG_LR_MODE)
1961 		xe_pm_runtime_put(xe);
1962 
1963 	for_each_tile(tile, xe, id)
1964 		XE_WARN_ON(vm->pt_root[id]);
1965 
1966 	trace_xe_vm_free(vm);
1967 
1968 	ttm_lru_bulk_move_fini(&xe->ttm, &vm->lru_bulk_move);
1969 
1970 	if (vm->xef)
1971 		xe_file_put(vm->xef);
1972 
1973 	kfree(vm);
1974 }
1975 
1976 static void xe_vm_free(struct drm_gpuvm *gpuvm)
1977 {
1978 	struct xe_vm *vm = container_of(gpuvm, struct xe_vm, gpuvm);
1979 
1980 	/* To destroy the VM we need to be able to sleep */
1981 	queue_work(system_unbound_wq, &vm->destroy_work);
1982 }
1983 
1984 struct xe_vm *xe_vm_lookup(struct xe_file *xef, u32 id)
1985 {
1986 	struct xe_vm *vm;
1987 
1988 	mutex_lock(&xef->vm.lock);
1989 	vm = xa_load(&xef->vm.xa, id);
1990 	if (vm)
1991 		xe_vm_get(vm);
1992 	mutex_unlock(&xef->vm.lock);
1993 
1994 	return vm;
1995 }
1996 
1997 u64 xe_vm_pdp4_descriptor(struct xe_vm *vm, struct xe_tile *tile)
1998 {
1999 	return vm->pt_ops->pde_encode_bo(vm->pt_root[tile->id]->bo, 0,
2000 					 tile_to_xe(tile)->pat.idx[XE_CACHE_WB]);
2001 }
2002 
2003 static struct xe_exec_queue *
2004 to_wait_exec_queue(struct xe_vm *vm, struct xe_exec_queue *q)
2005 {
2006 	return q ? q : vm->q[0];
2007 }
2008 
2009 static struct xe_user_fence *
2010 find_ufence_get(struct xe_sync_entry *syncs, u32 num_syncs)
2011 {
2012 	unsigned int i;
2013 
2014 	for (i = 0; i < num_syncs; i++) {
2015 		struct xe_sync_entry *e = &syncs[i];
2016 
2017 		if (xe_sync_is_ufence(e))
2018 			return xe_sync_ufence_get(e);
2019 	}
2020 
2021 	return NULL;
2022 }
2023 
/*
 * Every VM-create uAPI flag accepted by xe_vm_create_ioctl(); any bit
 * outside this mask makes the ioctl fail with -EINVAL.
 */
#define ALL_DRM_XE_VM_CREATE_FLAGS (DRM_XE_VM_CREATE_FLAG_SCRATCH_PAGE | \
				    DRM_XE_VM_CREATE_FLAG_LR_MODE | \
				    DRM_XE_VM_CREATE_FLAG_FAULT_MODE)
2027 
/*
 * xe_vm_create_ioctl() - VM create ioctl handler.
 * @dev: DRM device.
 * @data: struct drm_xe_vm_create arguments; vm_id is written on success.
 * @file: DRM file the VM is created for.
 *
 * Validates the arguments, creates the VM, allocates an ASID when the
 * device has ASIDs, and finally publishes the VM in the file's VM table.
 *
 * Return: 0 on success, negative error code on failure.
 */
int xe_vm_create_ioctl(struct drm_device *dev, void *data,
		       struct drm_file *file)
{
	struct xe_device *xe = to_xe_device(dev);
	struct xe_file *xef = to_xe_file(file);
	struct drm_xe_vm_create *args = data;
	struct xe_tile *tile;
	struct xe_vm *vm;
	u32 id, asid;
	int err;
	u32 flags = 0;

	if (XE_IOCTL_DBG(xe, args->extensions))
		return -EINVAL;

	/* This workaround forces a scratch page regardless of what was asked */
	if (XE_WA(xe_root_mmio_gt(xe), 14016763929))
		args->flags |= DRM_XE_VM_CREATE_FLAG_SCRATCH_PAGE;

	if (XE_IOCTL_DBG(xe, args->flags & DRM_XE_VM_CREATE_FLAG_FAULT_MODE &&
			 !xe->info.has_usm))
		return -EINVAL;

	if (XE_IOCTL_DBG(xe, args->reserved[0] || args->reserved[1]))
		return -EINVAL;

	if (XE_IOCTL_DBG(xe, args->flags & ~ALL_DRM_XE_VM_CREATE_FLAGS))
		return -EINVAL;

	/* Scratch + fault mode is only accepted when the device needs scratch */
	if (XE_IOCTL_DBG(xe, args->flags & DRM_XE_VM_CREATE_FLAG_SCRATCH_PAGE &&
			 args->flags & DRM_XE_VM_CREATE_FLAG_FAULT_MODE &&
			 !xe->info.needs_scratch))
		return -EINVAL;

	/* Fault mode requires LR mode */
	if (XE_IOCTL_DBG(xe, !(args->flags & DRM_XE_VM_CREATE_FLAG_LR_MODE) &&
			 args->flags & DRM_XE_VM_CREATE_FLAG_FAULT_MODE))
		return -EINVAL;

	/* Translate uAPI flags into the driver-internal XE_VM_FLAG_* set */
	if (args->flags & DRM_XE_VM_CREATE_FLAG_SCRATCH_PAGE)
		flags |= XE_VM_FLAG_SCRATCH_PAGE;
	if (args->flags & DRM_XE_VM_CREATE_FLAG_LR_MODE)
		flags |= XE_VM_FLAG_LR_MODE;
	if (args->flags & DRM_XE_VM_CREATE_FLAG_FAULT_MODE)
		flags |= XE_VM_FLAG_FAULT_MODE;

	vm = xe_vm_create(xe, flags);
	if (IS_ERR(vm))
		return PTR_ERR(vm);

	if (xe->info.has_asid) {
		down_write(&xe->usm.lock);
		err = xa_alloc_cyclic(&xe->usm.asid_to_vm, &asid, vm,
				      XA_LIMIT(1, XE_MAX_ASID - 1),
				      &xe->usm.next_asid, GFP_KERNEL);
		up_write(&xe->usm.lock);
		if (err < 0)
			goto err_close_and_put;

		vm->usm.asid = asid;
	}

	vm->xef = xe_file_get(xef);

	/* Record BO memory for VM pagetable created against client */
	for_each_tile(tile, xe, id)
		if (vm->pt_root[id])
			xe_drm_client_add_bo(vm->xef->client, vm->pt_root[id]->bo);

#if IS_ENABLED(CONFIG_DRM_XE_DEBUG_MEM)
	/* Warning: Security issue - never enable by default */
	args->reserved[0] = xe_bo_main_addr(vm->pt_root[0]->bo, XE_PAGE_SIZE);
#endif

	/* user id alloc must always be last in ioctl to prevent UAF */
	err = xa_alloc(&xef->vm.xa, &id, vm, xa_limit_32b, GFP_KERNEL);
	if (err)
		goto err_close_and_put;

	args->vm_id = id;

	return 0;

err_close_and_put:
	xe_vm_close_and_put(vm);

	return err;
}
2114 
2115 int xe_vm_destroy_ioctl(struct drm_device *dev, void *data,
2116 			struct drm_file *file)
2117 {
2118 	struct xe_device *xe = to_xe_device(dev);
2119 	struct xe_file *xef = to_xe_file(file);
2120 	struct drm_xe_vm_destroy *args = data;
2121 	struct xe_vm *vm;
2122 	int err = 0;
2123 
2124 	if (XE_IOCTL_DBG(xe, args->pad) ||
2125 	    XE_IOCTL_DBG(xe, args->reserved[0] || args->reserved[1]))
2126 		return -EINVAL;
2127 
2128 	mutex_lock(&xef->vm.lock);
2129 	vm = xa_load(&xef->vm.xa, args->vm_id);
2130 	if (XE_IOCTL_DBG(xe, !vm))
2131 		err = -ENOENT;
2132 	else if (XE_IOCTL_DBG(xe, vm->preempt.num_exec_queues))
2133 		err = -EBUSY;
2134 	else
2135 		xa_erase(&xef->vm.xa, args->vm_id);
2136 	mutex_unlock(&xef->vm.lock);
2137 
2138 	if (!err)
2139 		xe_vm_close_and_put(vm);
2140 
2141 	return err;
2142 }
2143 
/*
 * Lookup table from a region index to an XE_PL_* memory placement.
 * NOTE(review): presumably indexed by the prefetch region supplied through
 * the bind uAPI - confirm against the users of this table.
 */
static const u32 region_to_mem_type[] = {
	XE_PL_TT,
	XE_PL_VRAM0,
	XE_PL_VRAM1,
};
2149 
2150 static void prep_vma_destroy(struct xe_vm *vm, struct xe_vma *vma,
2151 			     bool post_commit)
2152 {
2153 	down_read(&vm->userptr.notifier_lock);
2154 	vma->gpuva.flags |= XE_VMA_DESTROYED;
2155 	up_read(&vm->userptr.notifier_lock);
2156 	if (post_commit)
2157 		xe_vm_remove_vma(vm, vma);
2158 }
2159 
2160 #undef ULL
2161 #define ULL	unsigned long long
2162 
#if IS_ENABLED(CONFIG_DRM_XE_DEBUG_VM)
/*
 * Debug helper: log the type, address and range of a GPUVA operation.
 * Only built with CONFIG_DRM_XE_DEBUG_VM; otherwise it is an empty stub.
 */
static void print_op(struct xe_device *xe, struct drm_gpuva_op *op)
{
	struct xe_vma *vma;

	switch (op->op) {
	case DRM_GPUVA_OP_MAP:
		vm_dbg(&xe->drm, "MAP: addr=0x%016llx, range=0x%016llx",
		       (ULL)op->map.va.addr, (ULL)op->map.va.range);
		break;
	case DRM_GPUVA_OP_REMAP:
		/* Log the unmapped VMA, then the optional prev/next remainders */
		vma = gpuva_to_vma(op->remap.unmap->va);
		vm_dbg(&xe->drm, "REMAP:UNMAP: addr=0x%016llx, range=0x%016llx, keep=%d",
		       (ULL)xe_vma_start(vma), (ULL)xe_vma_size(vma),
		       op->remap.unmap->keep ? 1 : 0);
		if (op->remap.prev)
			vm_dbg(&xe->drm,
			       "REMAP:PREV: addr=0x%016llx, range=0x%016llx",
			       (ULL)op->remap.prev->va.addr,
			       (ULL)op->remap.prev->va.range);
		if (op->remap.next)
			vm_dbg(&xe->drm,
			       "REMAP:NEXT: addr=0x%016llx, range=0x%016llx",
			       (ULL)op->remap.next->va.addr,
			       (ULL)op->remap.next->va.range);
		break;
	case DRM_GPUVA_OP_UNMAP:
		vma = gpuva_to_vma(op->unmap.va);
		vm_dbg(&xe->drm, "UNMAP: addr=0x%016llx, range=0x%016llx, keep=%d",
		       (ULL)xe_vma_start(vma), (ULL)xe_vma_size(vma),
		       op->unmap.keep ? 1 : 0);
		break;
	case DRM_GPUVA_OP_PREFETCH:
		vma = gpuva_to_vma(op->prefetch.va);
		vm_dbg(&xe->drm, "PREFETCH: addr=0x%016llx, range=0x%016llx",
		       (ULL)xe_vma_start(vma), (ULL)xe_vma_size(vma));
		break;
	default:
		/* Any other op type is unexpected here */
		drm_warn(&xe->drm, "NOT POSSIBLE");
	}
}
#else
/* Stub used when CONFIG_DRM_XE_DEBUG_VM is disabled. */
static void print_op(struct xe_device *xe, struct drm_gpuva_op *op)
{
}
#endif
2209 
2210 static bool __xe_vm_needs_clear_scratch_pages(struct xe_vm *vm, u32 bind_flags)
2211 {
2212 	if (!xe_vm_in_fault_mode(vm))
2213 		return false;
2214 
2215 	if (!xe_vm_has_scratch(vm))
2216 		return false;
2217 
2218 	if (bind_flags & DRM_XE_VM_BIND_FLAG_IMMEDIATE)
2219 		return false;
2220 
2221 	return true;
2222 }
2223 
2224 /*
2225  * Create operations list from IOCTL arguments, setup operations fields so parse
2226  * and commit steps are decoupled from IOCTL arguments. This step can fail.
2227  */
2228 static struct drm_gpuva_ops *
2229 vm_bind_ioctl_ops_create(struct xe_vm *vm, struct xe_bo *bo,
2230 			 u64 bo_offset_or_userptr, u64 addr, u64 range,
2231 			 u32 operation, u32 flags,
2232 			 u32 prefetch_region, u16 pat_index)
2233 {
2234 	struct drm_gem_object *obj = bo ? &bo->ttm.base : NULL;
2235 	struct drm_gpuva_ops *ops;
2236 	struct drm_gpuva_op *__op;
2237 	struct drm_gpuvm_bo *vm_bo;
2238 	int err;
2239 
2240 	lockdep_assert_held_write(&vm->lock);
2241 
2242 	vm_dbg(&vm->xe->drm,
2243 	       "op=%d, addr=0x%016llx, range=0x%016llx, bo_offset_or_userptr=0x%016llx",
2244 	       operation, (ULL)addr, (ULL)range,
2245 	       (ULL)bo_offset_or_userptr);
2246 
2247 	switch (operation) {
2248 	case DRM_XE_VM_BIND_OP_MAP:
2249 	case DRM_XE_VM_BIND_OP_MAP_USERPTR:
2250 		ops = drm_gpuvm_sm_map_ops_create(&vm->gpuvm, addr, range,
2251 						  obj, bo_offset_or_userptr);
2252 		break;
2253 	case DRM_XE_VM_BIND_OP_UNMAP:
2254 		ops = drm_gpuvm_sm_unmap_ops_create(&vm->gpuvm, addr, range);
2255 		break;
2256 	case DRM_XE_VM_BIND_OP_PREFETCH:
2257 		ops = drm_gpuvm_prefetch_ops_create(&vm->gpuvm, addr, range);
2258 		break;
2259 	case DRM_XE_VM_BIND_OP_UNMAP_ALL:
2260 		xe_assert(vm->xe, bo);
2261 
2262 		err = xe_bo_lock(bo, true);
2263 		if (err)
2264 			return ERR_PTR(err);
2265 
2266 		vm_bo = drm_gpuvm_bo_obtain(&vm->gpuvm, obj);
2267 		if (IS_ERR(vm_bo)) {
2268 			xe_bo_unlock(bo);
2269 			return ERR_CAST(vm_bo);
2270 		}
2271 
2272 		ops = drm_gpuvm_bo_unmap_ops_create(vm_bo);
2273 		drm_gpuvm_bo_put(vm_bo);
2274 		xe_bo_unlock(bo);
2275 		break;
2276 	default:
2277 		drm_warn(&vm->xe->drm, "NOT POSSIBLE");
2278 		ops = ERR_PTR(-EINVAL);
2279 	}
2280 	if (IS_ERR(ops))
2281 		return ops;
2282 
2283 	drm_gpuva_for_each_op(__op, ops) {
2284 		struct xe_vma_op *op = gpuva_op_to_vma_op(__op);
2285 
2286 		if (__op->op == DRM_GPUVA_OP_MAP) {
2287 			op->map.immediate =
2288 				flags & DRM_XE_VM_BIND_FLAG_IMMEDIATE;
2289 			op->map.read_only =
2290 				flags & DRM_XE_VM_BIND_FLAG_READONLY;
2291 			op->map.is_null = flags & DRM_XE_VM_BIND_FLAG_NULL;
2292 			op->map.is_cpu_addr_mirror = flags &
2293 				DRM_XE_VM_BIND_FLAG_CPU_ADDR_MIRROR;
2294 			op->map.dumpable = flags & DRM_XE_VM_BIND_FLAG_DUMPABLE;
2295 			op->map.pat_index = pat_index;
2296 			op->map.invalidate_on_bind =
2297 				__xe_vm_needs_clear_scratch_pages(vm, flags);
2298 		} else if (__op->op == DRM_GPUVA_OP_PREFETCH) {
2299 			op->prefetch.region = prefetch_region;
2300 		}
2301 
2302 		print_op(vm->xe, __op);
2303 	}
2304 
2305 	return ops;
2306 }
2307 ALLOW_ERROR_INJECTION(vm_bind_ioctl_ops_create, ERRNO);
2308 
/*
 * Create the xe_vma described by a GPUVA map operation.
 *
 * @vm: VM the new VMA belongs to; vm->lock must be held for write.
 * @op: map operation supplying the GEM object (may be NULL), the offset into
 *      it, and the virtual address range.
 * @pat_index: PAT index passed through to xe_vma_create().
 * @flags: VMA_CREATE_FLAG_* bits passed through to xe_vma_create().
 *
 * When the operation carries a GEM object, that object is locked through
 * drm_exec for the duration of the creation; the VM's own object is locked
 * first when bo->vm is NULL. After creation, userptr VMAs get their pages
 * pinned, and BO-backed VMAs whose bo->vm is NULL get preempt fences added.
 *
 * Return: the new VMA, or an ERR_PTR() on failure.
 */
static struct xe_vma *new_vma(struct xe_vm *vm, struct drm_gpuva_op_map *op,
			      u16 pat_index, unsigned int flags)
{
	struct xe_bo *bo = op->gem.obj ? gem_to_xe_bo(op->gem.obj) : NULL;
	struct drm_exec exec;
	struct xe_vma *vma;
	int err = 0;

	lockdep_assert_held_write(&vm->lock);

	if (bo) {
		drm_exec_init(&exec, DRM_EXEC_INTERRUPTIBLE_WAIT, 0);
		drm_exec_until_all_locked(&exec) {
			/* Reset on every pass: the loop body re-runs after contention */
			err = 0;
			if (!bo->vm) {
				err = drm_exec_lock_obj(&exec, xe_vm_obj(vm));
				drm_exec_retry_on_contention(&exec);
			}
			if (!err) {
				err = drm_exec_lock_obj(&exec, &bo->ttm.base);
				drm_exec_retry_on_contention(&exec);
			}
			if (err) {
				drm_exec_fini(&exec);
				return ERR_PTR(err);
			}
		}
	}
	vma = xe_vma_create(vm, bo, op->gem.offset,
			    op->va.addr, op->va.addr +
			    op->va.range - 1, pat_index, flags);
	/* err is still 0 here, so the ERR_PTR in vma is returned unchanged below */
	if (IS_ERR(vma))
		goto err_unlock;

	if (xe_vma_is_userptr(vma))
		err = xe_vma_userptr_pin_pages(to_userptr_vma(vma));
	else if (!xe_vma_has_no_bo(vma) && !bo->vm)
		err = add_preempt_fences(vm, bo);

err_unlock:
	/* exec was only initialised when there is a BO */
	if (bo)
		drm_exec_fini(&exec);

	if (err) {
		/* Post-creation step failed: tear the VMA down again */
		prep_vma_destroy(vm, vma, false);
		xe_vma_destroy_unlocked(vma);
		vma = ERR_PTR(err);
	}

	return vma;
}
2360 
2361 static u64 xe_vma_max_pte_size(struct xe_vma *vma)
2362 {
2363 	if (vma->gpuva.flags & XE_VMA_PTE_1G)
2364 		return SZ_1G;
2365 	else if (vma->gpuva.flags & (XE_VMA_PTE_2M | XE_VMA_PTE_COMPACT))
2366 		return SZ_2M;
2367 	else if (vma->gpuva.flags & XE_VMA_PTE_64K)
2368 		return SZ_64K;
2369 	else if (vma->gpuva.flags & XE_VMA_PTE_4K)
2370 		return SZ_4K;
2371 
2372 	return SZ_1G;	/* Uninitialized, used max size */
2373 }
2374 
2375 static void xe_vma_set_pte_size(struct xe_vma *vma, u64 size)
2376 {
2377 	switch (size) {
2378 	case SZ_1G:
2379 		vma->gpuva.flags |= XE_VMA_PTE_1G;
2380 		break;
2381 	case SZ_2M:
2382 		vma->gpuva.flags |= XE_VMA_PTE_2M;
2383 		break;
2384 	case SZ_64K:
2385 		vma->gpuva.flags |= XE_VMA_PTE_64K;
2386 		break;
2387 	case SZ_4K:
2388 		vma->gpuva.flags |= XE_VMA_PTE_4K;
2389 		break;
2390 	}
2391 }
2392 
2393 static int xe_vma_op_commit(struct xe_vm *vm, struct xe_vma_op *op)
2394 {
2395 	int err = 0;
2396 
2397 	lockdep_assert_held_write(&vm->lock);
2398 
2399 	switch (op->base.op) {
2400 	case DRM_GPUVA_OP_MAP:
2401 		err |= xe_vm_insert_vma(vm, op->map.vma);
2402 		if (!err)
2403 			op->flags |= XE_VMA_OP_COMMITTED;
2404 		break;
2405 	case DRM_GPUVA_OP_REMAP:
2406 	{
2407 		u8 tile_present =
2408 			gpuva_to_vma(op->base.remap.unmap->va)->tile_present;
2409 
2410 		prep_vma_destroy(vm, gpuva_to_vma(op->base.remap.unmap->va),
2411 				 true);
2412 		op->flags |= XE_VMA_OP_COMMITTED;
2413 
2414 		if (op->remap.prev) {
2415 			err |= xe_vm_insert_vma(vm, op->remap.prev);
2416 			if (!err)
2417 				op->flags |= XE_VMA_OP_PREV_COMMITTED;
2418 			if (!err && op->remap.skip_prev) {
2419 				op->remap.prev->tile_present =
2420 					tile_present;
2421 				op->remap.prev = NULL;
2422 			}
2423 		}
2424 		if (op->remap.next) {
2425 			err |= xe_vm_insert_vma(vm, op->remap.next);
2426 			if (!err)
2427 				op->flags |= XE_VMA_OP_NEXT_COMMITTED;
2428 			if (!err && op->remap.skip_next) {
2429 				op->remap.next->tile_present =
2430 					tile_present;
2431 				op->remap.next = NULL;
2432 			}
2433 		}
2434 
2435 		/* Adjust for partial unbind after removing VMA from VM */
2436 		if (!err) {
2437 			op->base.remap.unmap->va->va.addr = op->remap.start;
2438 			op->base.remap.unmap->va->va.range = op->remap.range;
2439 		}
2440 		break;
2441 	}
2442 	case DRM_GPUVA_OP_UNMAP:
2443 		prep_vma_destroy(vm, gpuva_to_vma(op->base.unmap.va), true);
2444 		op->flags |= XE_VMA_OP_COMMITTED;
2445 		break;
2446 	case DRM_GPUVA_OP_PREFETCH:
2447 		op->flags |= XE_VMA_OP_COMMITTED;
2448 		break;
2449 	default:
2450 		drm_warn(&vm->xe->drm, "NOT POSSIBLE");
2451 	}
2452 
2453 	return err;
2454 }
2455 
/*
 * Walk a GPUVA operations list, create the VMAs each operation needs, count
 * the page-table updates that will be required, and commit each operation to
 * the VM via xe_vma_op_commit().
 *
 * @vm: VM being modified; vm->lock must be held for write.
 * @ops: operations list to parse.
 * @vops: accumulator; every operation is appended to vops->list and the
 *        page-table update counters are bumped through
 *        xe_vma_ops_incr_pt_update_ops().
 *
 * An operation is linked into vops->list before it is processed, so on an
 * error return the list already contains the failing operation and all
 * earlier ones. NOTE(review): the caller is presumably expected to unwind
 * them - confirm against the call site.
 *
 * Return: 0 on success; a negative errno from VMA creation, userptr pinning
 * or commit; -EBUSY when a CPU-address-mirror VMA that still has an SVM
 * mapping in the affected range would be unmapped or remapped.
 */
static int vm_bind_ioctl_ops_parse(struct xe_vm *vm, struct drm_gpuva_ops *ops,
				   struct xe_vma_ops *vops)
{
	struct xe_device *xe = vm->xe;
	struct drm_gpuva_op *__op;
	struct xe_tile *tile;
	u8 id, tile_mask = 0;
	int err = 0;

	lockdep_assert_held_write(&vm->lock);

	/* One bit per tile of the device */
	for_each_tile(tile, vm->xe, id)
		tile_mask |= 0x1 << id;

	drm_gpuva_for_each_op(__op, ops) {
		struct xe_vma_op *op = gpuva_op_to_vma_op(__op);
		struct xe_vma *vma;
		unsigned int flags = 0;

		INIT_LIST_HEAD(&op->link);
		list_add_tail(&op->link, &vops->list);
		op->tile_mask = tile_mask;

		switch (op->base.op) {
		case DRM_GPUVA_OP_MAP:
		{
			/* Translate the per-op booleans into VMA creation flags */
			flags |= op->map.read_only ?
				VMA_CREATE_FLAG_READ_ONLY : 0;
			flags |= op->map.is_null ?
				VMA_CREATE_FLAG_IS_NULL : 0;
			flags |= op->map.dumpable ?
				VMA_CREATE_FLAG_DUMPABLE : 0;
			flags |= op->map.is_cpu_addr_mirror ?
				VMA_CREATE_FLAG_IS_SYSTEM_ALLOCATOR : 0;

			vma = new_vma(vm, &op->base.map, op->map.pat_index,
				      flags);
			if (IS_ERR(vma))
				return PTR_ERR(vma);

			op->map.vma = vma;
			/*
			 * Count a page-table update when the map is bound now
			 * (immediate, or the VM is not in fault mode) and is
			 * not a CPU address mirror, or when invalidate_on_bind
			 * is set.
			 */
			if (((op->map.immediate || !xe_vm_in_fault_mode(vm)) &&
			     !op->map.is_cpu_addr_mirror) ||
			    op->map.invalidate_on_bind)
				xe_vma_ops_incr_pt_update_ops(vops,
							      op->tile_mask);
			break;
		}
		case DRM_GPUVA_OP_REMAP:
		{
			struct xe_vma *old =
				gpuva_to_vma(op->base.remap.unmap->va);
			bool skip = xe_vma_is_cpu_addr_mirror(old);
			u64 start = xe_vma_start(old), end = xe_vma_end(old);

			/*
			 * [start, end) is the part of the old VMA not covered
			 * by the prev/next remainders.
			 */
			if (op->base.remap.prev)
				start = op->base.remap.prev->va.addr +
					op->base.remap.prev->va.range;
			if (op->base.remap.next)
				end = op->base.remap.next->va.addr;

			if (xe_vma_is_cpu_addr_mirror(old) &&
			    xe_svm_has_mapping(vm, start, end))
				return -EBUSY;

			/* Start from the whole old VMA; narrowed below on skips */
			op->remap.start = xe_vma_start(old);
			op->remap.range = xe_vma_size(old);

			/* The remainders inherit the old VMA's attributes */
			flags |= op->base.remap.unmap->va->flags &
				XE_VMA_READ_ONLY ?
				VMA_CREATE_FLAG_READ_ONLY : 0;
			flags |= op->base.remap.unmap->va->flags &
				DRM_GPUVA_SPARSE ?
				VMA_CREATE_FLAG_IS_NULL : 0;
			flags |= op->base.remap.unmap->va->flags &
				XE_VMA_DUMPABLE ?
				VMA_CREATE_FLAG_DUMPABLE : 0;
			flags |= xe_vma_is_cpu_addr_mirror(old) ?
				VMA_CREATE_FLAG_IS_SYSTEM_ALLOCATOR : 0;

			if (op->base.remap.prev) {
				vma = new_vma(vm, op->base.remap.prev,
					      old->pat_index, flags);
				if (IS_ERR(vma))
					return PTR_ERR(vma);

				op->remap.prev = vma;

				/*
				 * Userptr creates a new SG mapping so
				 * we must also rebind.
				 */
				op->remap.skip_prev = skip ||
					(!xe_vma_is_userptr(old) &&
					IS_ALIGNED(xe_vma_end(vma),
						   xe_vma_max_pte_size(old)));
				if (op->remap.skip_prev) {
					/*
					 * The remainder is not rebound: shrink
					 * the unbind range from the front by
					 * the size of prev.
					 */
					xe_vma_set_pte_size(vma, xe_vma_max_pte_size(old));
					op->remap.range -=
						xe_vma_end(vma) -
						xe_vma_start(old);
					op->remap.start = xe_vma_end(vma);
					vm_dbg(&xe->drm, "REMAP:SKIP_PREV: addr=0x%016llx, range=0x%016llx",
					       (ULL)op->remap.start,
					       (ULL)op->remap.range);
				} else {
					xe_vma_ops_incr_pt_update_ops(vops, op->tile_mask);
				}
			}

			if (op->base.remap.next) {
				vma = new_vma(vm, op->base.remap.next,
					      old->pat_index, flags);
				if (IS_ERR(vma))
					return PTR_ERR(vma);

				op->remap.next = vma;

				/*
				 * Userptr creates a new SG mapping so
				 * we must also rebind.
				 */
				op->remap.skip_next = skip ||
					(!xe_vma_is_userptr(old) &&
					IS_ALIGNED(xe_vma_start(vma),
						   xe_vma_max_pte_size(old)));
				if (op->remap.skip_next) {
					/*
					 * The remainder is not rebound: shrink
					 * the unbind range from the back by
					 * the size of next.
					 */
					xe_vma_set_pte_size(vma, xe_vma_max_pte_size(old));
					op->remap.range -=
						xe_vma_end(old) -
						xe_vma_start(vma);
					vm_dbg(&xe->drm, "REMAP:SKIP_NEXT: addr=0x%016llx, range=0x%016llx",
					       (ULL)op->remap.start,
					       (ULL)op->remap.range);
				} else {
					xe_vma_ops_incr_pt_update_ops(vops, op->tile_mask);
				}
			}
			/* One more update for the unbind of the old VMA itself */
			if (!skip)
				xe_vma_ops_incr_pt_update_ops(vops, op->tile_mask);
			break;
		}
		case DRM_GPUVA_OP_UNMAP:
			vma = gpuva_to_vma(op->base.unmap.va);

			if (xe_vma_is_cpu_addr_mirror(vma) &&
			    xe_svm_has_mapping(vm, xe_vma_start(vma),
					       xe_vma_end(vma)))
				return -EBUSY;

			if (!xe_vma_is_cpu_addr_mirror(vma))
				xe_vma_ops_incr_pt_update_ops(vops, op->tile_mask);
			break;
		case DRM_GPUVA_OP_PREFETCH:
			vma = gpuva_to_vma(op->base.prefetch.va);

			if (xe_vma_is_userptr(vma)) {
				err = xe_vma_userptr_pin_pages(to_userptr_vma(vma));
				if (err)
					return err;
			}

			if (!xe_vma_is_cpu_addr_mirror(vma))
				xe_vma_ops_incr_pt_update_ops(vops, op->tile_mask);
			break;
		default:
			drm_warn(&vm->xe->drm, "NOT POSSIBLE");
		}

		/* Make the operation visible in the VM's VMA tracking */
		err = xe_vma_op_commit(vm, op);
		if (err)
			return err;
	}

	return 0;
}
2632 
2633 static void xe_vma_op_unwind(struct xe_vm *vm, struct xe_vma_op *op,
2634 			     bool post_commit, bool prev_post_commit,
2635 			     bool next_post_commit)
2636 {
2637 	lockdep_assert_held_write(&vm->lock);
2638 
2639 	switch (op->base.op) {
2640 	case DRM_GPUVA_OP_MAP:
2641 		if (op->map.vma) {
2642 			prep_vma_destroy(vm, op->map.vma, post_commit);
2643 			xe_vma_destroy_unlocked(op->map.vma);
2644 		}
2645 		break;
2646 	case DRM_GPUVA_OP_UNMAP:
2647 	{
2648 		struct xe_vma *vma = gpuva_to_vma(op->base.unmap.va);
2649 
2650 		if (vma) {
2651 			down_read(&vm->userptr.notifier_lock);
2652 			vma->gpuva.flags &= ~XE_VMA_DESTROYED;
2653 			up_read(&vm->userptr.notifier_lock);
2654 			if (post_commit)
2655 				xe_vm_insert_vma(vm, vma);
2656 		}
2657 		break;
2658 	}
2659 	case DRM_GPUVA_OP_REMAP:
2660 	{
2661 		struct xe_vma *vma = gpuva_to_vma(op->base.remap.unmap->va);
2662 
2663 		if (op->remap.prev) {
2664 			prep_vma_destroy(vm, op->remap.prev, prev_post_commit);
2665 			xe_vma_destroy_unlocked(op->remap.prev);
2666 		}
2667 		if (op->remap.next) {
2668 			prep_vma_destroy(vm, op->remap.next, next_post_commit);
2669 			xe_vma_destroy_unlocked(op->remap.next);
2670 		}
2671 		if (vma) {
2672 			down_read(&vm->userptr.notifier_lock);
2673 			vma->gpuva.flags &= ~XE_VMA_DESTROYED;
2674 			up_read(&vm->userptr.notifier_lock);
2675 			if (post_commit)
2676 				xe_vm_insert_vma(vm, vma);
2677 		}
2678 		break;
2679 	}
2680 	case DRM_GPUVA_OP_PREFETCH:
2681 		/* Nothing to do */
2682 		break;
2683 	default:
2684 		drm_warn(&vm->xe->drm, "NOT POSSIBLE");
2685 	}
2686 }
2687 
2688 static void vm_bind_ioctl_ops_unwind(struct xe_vm *vm,
2689 				     struct drm_gpuva_ops **ops,
2690 				     int num_ops_list)
2691 {
2692 	int i;
2693 
2694 	for (i = num_ops_list - 1; i >= 0; --i) {
2695 		struct drm_gpuva_ops *__ops = ops[i];
2696 		struct drm_gpuva_op *__op;
2697 
2698 		if (!__ops)
2699 			continue;
2700 
2701 		drm_gpuva_for_each_op_reverse(__op, __ops) {
2702 			struct xe_vma_op *op = gpuva_op_to_vma_op(__op);
2703 
2704 			xe_vma_op_unwind(vm, op,
2705 					 op->flags & XE_VMA_OP_COMMITTED,
2706 					 op->flags & XE_VMA_OP_PREV_COMMITTED,
2707 					 op->flags & XE_VMA_OP_NEXT_COMMITTED);
2708 		}
2709 	}
2710 }
2711 
2712 static int vma_lock_and_validate(struct drm_exec *exec, struct xe_vma *vma,
2713 				 bool validate)
2714 {
2715 	struct xe_bo *bo = xe_vma_bo(vma);
2716 	struct xe_vm *vm = xe_vma_vm(vma);
2717 	int err = 0;
2718 
2719 	if (bo) {
2720 		if (!bo->vm)
2721 			err = drm_exec_lock_obj(exec, &bo->ttm.base);
2722 		if (!err && validate)
2723 			err = xe_bo_validate(bo, vm,
2724 					     !xe_vm_in_preempt_fence_mode(vm));
2725 	}
2726 
2727 	return err;
2728 }
2729 
2730 static int check_ufence(struct xe_vma *vma)
2731 {
2732 	if (vma->ufence) {
2733 		struct xe_user_fence * const f = vma->ufence;
2734 
2735 		if (!xe_sync_ufence_get_status(f))
2736 			return -EBUSY;
2737 
2738 		vma->ufence = NULL;
2739 		xe_sync_ufence_put(f);
2740 	}
2741 
2742 	return 0;
2743 }
2744 
2745 static int op_lock_and_prep(struct drm_exec *exec, struct xe_vm *vm,
2746 			    struct xe_vma_op *op)
2747 {
2748 	int err = 0;
2749 
2750 	switch (op->base.op) {
2751 	case DRM_GPUVA_OP_MAP:
2752 		if (!op->map.invalidate_on_bind)
2753 			err = vma_lock_and_validate(exec, op->map.vma,
2754 						    !xe_vm_in_fault_mode(vm) ||
2755 						    op->map.immediate);
2756 		break;
2757 	case DRM_GPUVA_OP_REMAP:
2758 		err = check_ufence(gpuva_to_vma(op->base.remap.unmap->va));
2759 		if (err)
2760 			break;
2761 
2762 		err = vma_lock_and_validate(exec,
2763 					    gpuva_to_vma(op->base.remap.unmap->va),
2764 					    false);
2765 		if (!err && op->remap.prev)
2766 			err = vma_lock_and_validate(exec, op->remap.prev, true);
2767 		if (!err && op->remap.next)
2768 			err = vma_lock_and_validate(exec, op->remap.next, true);
2769 		break;
2770 	case DRM_GPUVA_OP_UNMAP:
2771 		err = check_ufence(gpuva_to_vma(op->base.unmap.va));
2772 		if (err)
2773 			break;
2774 
2775 		err = vma_lock_and_validate(exec,
2776 					    gpuva_to_vma(op->base.unmap.va),
2777 					    false);
2778 		break;
2779 	case DRM_GPUVA_OP_PREFETCH:
2780 	{
2781 		struct xe_vma *vma = gpuva_to_vma(op->base.prefetch.va);
2782 		u32 region = op->prefetch.region;
2783 
2784 		xe_assert(vm->xe, region <= ARRAY_SIZE(region_to_mem_type));
2785 
2786 		err = vma_lock_and_validate(exec,
2787 					    gpuva_to_vma(op->base.prefetch.va),
2788 					    false);
2789 		if (!err && !xe_vma_has_no_bo(vma))
2790 			err = xe_bo_migrate(xe_vma_bo(vma),
2791 					    region_to_mem_type[region]);
2792 		break;
2793 	}
2794 	default:
2795 		drm_warn(&vm->xe->drm, "NOT POSSIBLE");
2796 	}
2797 
2798 	return err;
2799 }
2800 
2801 static int vm_bind_ioctl_ops_lock_and_prep(struct drm_exec *exec,
2802 					   struct xe_vm *vm,
2803 					   struct xe_vma_ops *vops)
2804 {
2805 	struct xe_vma_op *op;
2806 	int err;
2807 
2808 	err = drm_exec_lock_obj(exec, xe_vm_obj(vm));
2809 	if (err)
2810 		return err;
2811 
2812 	list_for_each_entry(op, &vops->list, link) {
2813 		err = op_lock_and_prep(exec, vm, op);
2814 		if (err)
2815 			return err;
2816 	}
2817 
2818 #ifdef TEST_VM_OPS_ERROR
2819 	if (vops->inject_error &&
2820 	    vm->xe->vm_inject_error_position == FORCE_OP_ERROR_LOCK)
2821 		return -ENOSPC;
2822 #endif
2823 
2824 	return 0;
2825 }
2826 
2827 static void op_trace(struct xe_vma_op *op)
2828 {
2829 	switch (op->base.op) {
2830 	case DRM_GPUVA_OP_MAP:
2831 		trace_xe_vma_bind(op->map.vma);
2832 		break;
2833 	case DRM_GPUVA_OP_REMAP:
2834 		trace_xe_vma_unbind(gpuva_to_vma(op->base.remap.unmap->va));
2835 		if (op->remap.prev)
2836 			trace_xe_vma_bind(op->remap.prev);
2837 		if (op->remap.next)
2838 			trace_xe_vma_bind(op->remap.next);
2839 		break;
2840 	case DRM_GPUVA_OP_UNMAP:
2841 		trace_xe_vma_unbind(gpuva_to_vma(op->base.unmap.va));
2842 		break;
2843 	case DRM_GPUVA_OP_PREFETCH:
2844 		trace_xe_vma_bind(gpuva_to_vma(op->base.prefetch.va));
2845 		break;
2846 	case DRM_GPUVA_OP_DRIVER:
2847 		break;
2848 	default:
2849 		XE_WARN_ON("NOT POSSIBLE");
2850 	}
2851 }
2852 
2853 static void trace_xe_vm_ops_execute(struct xe_vma_ops *vops)
2854 {
2855 	struct xe_vma_op *op;
2856 
2857 	list_for_each_entry(op, &vops->list, link)
2858 		op_trace(op);
2859 }
2860 
2861 static int vm_ops_setup_tile_args(struct xe_vm *vm, struct xe_vma_ops *vops)
2862 {
2863 	struct xe_exec_queue *q = vops->q;
2864 	struct xe_tile *tile;
2865 	int number_tiles = 0;
2866 	u8 id;
2867 
2868 	for_each_tile(tile, vm->xe, id) {
2869 		if (vops->pt_update_ops[id].num_ops)
2870 			++number_tiles;
2871 
2872 		if (vops->pt_update_ops[id].q)
2873 			continue;
2874 
2875 		if (q) {
2876 			vops->pt_update_ops[id].q = q;
2877 			if (vm->pt_root[id] && !list_empty(&q->multi_gt_list))
2878 				q = list_next_entry(q, multi_gt_list);
2879 		} else {
2880 			vops->pt_update_ops[id].q = vm->q[id];
2881 		}
2882 	}
2883 
2884 	return number_tiles;
2885 }
2886 
/*
 * Run the page-table updates queued on @vops on every tile that has work.
 *
 * The updates are first prepared on all tiles, then run on all tiles, then
 * finalised on all tiles. With a single busy tile the fence returned by that
 * tile's run is returned directly; with several, the per-tile fences are
 * wrapped in a dma_fence_array.
 *
 * Return: the fence for the whole update; ERR_PTR(-ENODATA) when no tile has
 * any page-table update to perform; another ERR_PTR() on failure, in which
 * case the updates are aborted on every busy tile.
 */
static struct dma_fence *ops_execute(struct xe_vm *vm,
				     struct xe_vma_ops *vops)
{
	struct xe_tile *tile;
	struct dma_fence *fence = NULL;
	struct dma_fence **fences = NULL;
	struct dma_fence_array *cf = NULL;
	int number_tiles = 0, current_fence = 0, err;
	u8 id;

	number_tiles = vm_ops_setup_tile_args(vm, vops);
	if (number_tiles == 0)
		return ERR_PTR(-ENODATA);

	/* A fence array is only needed when more than one tile has work */
	if (number_tiles > 1) {
		fences = kmalloc_array(number_tiles, sizeof(*fences),
				       GFP_KERNEL);
		if (!fences) {
			fence = ERR_PTR(-ENOMEM);
			goto err_trace;
		}
	}

	/* Phase 1: prepare on every busy tile */
	for_each_tile(tile, vm->xe, id) {
		if (!vops->pt_update_ops[id].num_ops)
			continue;

		err = xe_pt_update_ops_prepare(tile, vops);
		if (err) {
			fence = ERR_PTR(err);
			goto err_out;
		}
	}

	trace_xe_vm_ops_execute(vops);

	/* Phase 2: run on every busy tile, collecting the fences */
	for_each_tile(tile, vm->xe, id) {
		if (!vops->pt_update_ops[id].num_ops)
			continue;

		fence = xe_pt_update_ops_run(tile, vops);
		if (IS_ERR(fence))
			goto err_out;

		if (fences)
			fences[current_fence++] = fence;
	}

	if (fences) {
		cf = dma_fence_array_create(number_tiles, fences,
					    vm->composite_fence_ctx,
					    vm->composite_fence_seqno++,
					    false);
		if (!cf) {
			/* Give back the sequence number consumed above */
			--vm->composite_fence_seqno;
			fence = ERR_PTR(-ENOMEM);
			goto err_out;
		}
		fence = &cf->base;
	}

	/* Phase 3: finalise on every busy tile */
	for_each_tile(tile, vm->xe, id) {
		if (!vops->pt_update_ops[id].num_ops)
			continue;

		xe_pt_update_ops_fini(tile, vops);
	}

	return fence;

err_out:
	for_each_tile(tile, vm->xe, id) {
		if (!vops->pt_update_ops[id].num_ops)
			continue;

		xe_pt_update_ops_abort(tile, vops);
	}
	/* Drop the references on fences collected so far */
	while (current_fence)
		dma_fence_put(fences[--current_fence]);
	kfree(fences);
	/* cf is still NULL on every path that reaches this label */
	kfree(cf);

err_trace:
	trace_xe_vm_ops_fail(vm);
	return fence;
}
2973 
2974 static void vma_add_ufence(struct xe_vma *vma, struct xe_user_fence *ufence)
2975 {
2976 	if (vma->ufence)
2977 		xe_sync_ufence_put(vma->ufence);
2978 	vma->ufence = __xe_sync_ufence_get(ufence);
2979 }
2980 
2981 static void op_add_ufence(struct xe_vm *vm, struct xe_vma_op *op,
2982 			  struct xe_user_fence *ufence)
2983 {
2984 	switch (op->base.op) {
2985 	case DRM_GPUVA_OP_MAP:
2986 		vma_add_ufence(op->map.vma, ufence);
2987 		break;
2988 	case DRM_GPUVA_OP_REMAP:
2989 		if (op->remap.prev)
2990 			vma_add_ufence(op->remap.prev, ufence);
2991 		if (op->remap.next)
2992 			vma_add_ufence(op->remap.next, ufence);
2993 		break;
2994 	case DRM_GPUVA_OP_UNMAP:
2995 		break;
2996 	case DRM_GPUVA_OP_PREFETCH:
2997 		vma_add_ufence(gpuva_to_vma(op->base.prefetch.va), ufence);
2998 		break;
2999 	default:
3000 		drm_warn(&vm->xe->drm, "NOT POSSIBLE");
3001 	}
3002 }
3003 
3004 static void vm_bind_ioctl_ops_fini(struct xe_vm *vm, struct xe_vma_ops *vops,
3005 				   struct dma_fence *fence)
3006 {
3007 	struct xe_exec_queue *wait_exec_queue = to_wait_exec_queue(vm, vops->q);
3008 	struct xe_user_fence *ufence;
3009 	struct xe_vma_op *op;
3010 	int i;
3011 
3012 	ufence = find_ufence_get(vops->syncs, vops->num_syncs);
3013 	list_for_each_entry(op, &vops->list, link) {
3014 		if (ufence)
3015 			op_add_ufence(vm, op, ufence);
3016 
3017 		if (op->base.op == DRM_GPUVA_OP_UNMAP)
3018 			xe_vma_destroy(gpuva_to_vma(op->base.unmap.va), fence);
3019 		else if (op->base.op == DRM_GPUVA_OP_REMAP)
3020 			xe_vma_destroy(gpuva_to_vma(op->base.remap.unmap->va),
3021 				       fence);
3022 	}
3023 	if (ufence)
3024 		xe_sync_ufence_put(ufence);
3025 	if (fence) {
3026 		for (i = 0; i < vops->num_syncs; i++)
3027 			xe_sync_entry_signal(vops->syncs + i, fence);
3028 		xe_exec_queue_last_fence_set(wait_exec_queue, vm, fence);
3029 	}
3030 }
3031 
/*
 * Lock everything the queued operations need, execute the page-table
 * updates and finish the operations.
 *
 * All locking happens inside one drm_exec transaction, which is dropped
 * before returning. Must be called with vm->lock held for write.
 *
 * Return: the fence from ops_execute() on success, or an ERR_PTR(). When
 * ops_execute() reports -ENODATA the operations are still finished (with a
 * NULL fence) and ERR_PTR(-ENODATA) is returned to the caller.
 */
static struct dma_fence *vm_bind_ioctl_ops_execute(struct xe_vm *vm,
						   struct xe_vma_ops *vops)
{
	struct drm_exec exec;
	struct dma_fence *fence;
	int err;

	lockdep_assert_held_write(&vm->lock);

	drm_exec_init(&exec, DRM_EXEC_INTERRUPTIBLE_WAIT |
		      DRM_EXEC_IGNORE_DUPLICATES, 0);
	drm_exec_until_all_locked(&exec) {
		err = vm_bind_ioctl_ops_lock_and_prep(&exec, vm, vops);
		/* Restarts the locking loop if a lock was contended */
		drm_exec_retry_on_contention(&exec);
		if (err) {
			fence = ERR_PTR(err);
			goto unlock;
		}

		fence = ops_execute(vm, vops);
		if (IS_ERR(fence)) {
			/* -ENODATA: nothing to update, but the ops still need finishing */
			if (PTR_ERR(fence) == -ENODATA)
				vm_bind_ioctl_ops_fini(vm, vops, NULL);
			goto unlock;
		}

		vm_bind_ioctl_ops_fini(vm, vops, fence);
	}

unlock:
	drm_exec_fini(&exec);
	return fence;
}
3065 ALLOW_ERROR_INJECTION(vm_bind_ioctl_ops_execute, ERRNO);
3066 
/* Bind flags accepted from userspace in every build configuration */
#define SUPPORTED_FLAGS_STUB  \
	(DRM_XE_VM_BIND_FLAG_READONLY | \
	 DRM_XE_VM_BIND_FLAG_IMMEDIATE | \
	 DRM_XE_VM_BIND_FLAG_NULL | \
	 DRM_XE_VM_BIND_FLAG_DUMPABLE | \
	 DRM_XE_VM_BIND_FLAG_CHECK_PXP | \
	 DRM_XE_VM_BIND_FLAG_CPU_ADDR_MIRROR)

/*
 * Full set of accepted bind flags; builds with TEST_VM_OPS_ERROR additionally
 * accept FORCE_OP_ERROR.
 */
#ifdef TEST_VM_OPS_ERROR
#define SUPPORTED_FLAGS	(SUPPORTED_FLAGS_STUB | FORCE_OP_ERROR)
#else
#define SUPPORTED_FLAGS	SUPPORTED_FLAGS_STUB
#endif

/* Low 16 bits: a non-zero result of (x & mask) means x is not 64KiB aligned */
#define XE_64K_PAGE_MASK 0xffffull
/* All sync flags currently defined */
#define ALL_DRM_XE_SYNCS_FLAGS (DRM_XE_SYNCS_FLAG_WAIT_FOR_OP)
3083 
3084 static int vm_bind_ioctl_check_args(struct xe_device *xe, struct xe_vm *vm,
3085 				    struct drm_xe_vm_bind *args,
3086 				    struct drm_xe_vm_bind_op **bind_ops)
3087 {
3088 	int err;
3089 	int i;
3090 
3091 	if (XE_IOCTL_DBG(xe, args->pad || args->pad2) ||
3092 	    XE_IOCTL_DBG(xe, args->reserved[0] || args->reserved[1]))
3093 		return -EINVAL;
3094 
3095 	if (XE_IOCTL_DBG(xe, args->extensions))
3096 		return -EINVAL;
3097 
3098 	if (args->num_binds > 1) {
3099 		u64 __user *bind_user =
3100 			u64_to_user_ptr(args->vector_of_binds);
3101 
3102 		*bind_ops = kvmalloc_array(args->num_binds,
3103 					   sizeof(struct drm_xe_vm_bind_op),
3104 					   GFP_KERNEL | __GFP_ACCOUNT |
3105 					   __GFP_RETRY_MAYFAIL | __GFP_NOWARN);
3106 		if (!*bind_ops)
3107 			return args->num_binds > 1 ? -ENOBUFS : -ENOMEM;
3108 
3109 		err = copy_from_user(*bind_ops, bind_user,
3110 				     sizeof(struct drm_xe_vm_bind_op) *
3111 				     args->num_binds);
3112 		if (XE_IOCTL_DBG(xe, err)) {
3113 			err = -EFAULT;
3114 			goto free_bind_ops;
3115 		}
3116 	} else {
3117 		*bind_ops = &args->bind;
3118 	}
3119 
3120 	for (i = 0; i < args->num_binds; ++i) {
3121 		u64 range = (*bind_ops)[i].range;
3122 		u64 addr = (*bind_ops)[i].addr;
3123 		u32 op = (*bind_ops)[i].op;
3124 		u32 flags = (*bind_ops)[i].flags;
3125 		u32 obj = (*bind_ops)[i].obj;
3126 		u64 obj_offset = (*bind_ops)[i].obj_offset;
3127 		u32 prefetch_region = (*bind_ops)[i].prefetch_mem_region_instance;
3128 		bool is_null = flags & DRM_XE_VM_BIND_FLAG_NULL;
3129 		bool is_cpu_addr_mirror = flags &
3130 			DRM_XE_VM_BIND_FLAG_CPU_ADDR_MIRROR;
3131 		u16 pat_index = (*bind_ops)[i].pat_index;
3132 		u16 coh_mode;
3133 
3134 		if (XE_IOCTL_DBG(xe, is_cpu_addr_mirror &&
3135 				 (!xe_vm_in_fault_mode(vm) ||
3136 				 !IS_ENABLED(CONFIG_DRM_XE_GPUSVM)))) {
3137 			err = -EINVAL;
3138 			goto free_bind_ops;
3139 		}
3140 
3141 		if (XE_IOCTL_DBG(xe, pat_index >= xe->pat.n_entries)) {
3142 			err = -EINVAL;
3143 			goto free_bind_ops;
3144 		}
3145 
3146 		pat_index = array_index_nospec(pat_index, xe->pat.n_entries);
3147 		(*bind_ops)[i].pat_index = pat_index;
3148 		coh_mode = xe_pat_index_get_coh_mode(xe, pat_index);
3149 		if (XE_IOCTL_DBG(xe, !coh_mode)) { /* hw reserved */
3150 			err = -EINVAL;
3151 			goto free_bind_ops;
3152 		}
3153 
3154 		if (XE_WARN_ON(coh_mode > XE_COH_AT_LEAST_1WAY)) {
3155 			err = -EINVAL;
3156 			goto free_bind_ops;
3157 		}
3158 
3159 		if (XE_IOCTL_DBG(xe, op > DRM_XE_VM_BIND_OP_PREFETCH) ||
3160 		    XE_IOCTL_DBG(xe, flags & ~SUPPORTED_FLAGS) ||
3161 		    XE_IOCTL_DBG(xe, obj && (is_null || is_cpu_addr_mirror)) ||
3162 		    XE_IOCTL_DBG(xe, obj_offset && (is_null ||
3163 						    is_cpu_addr_mirror)) ||
3164 		    XE_IOCTL_DBG(xe, op != DRM_XE_VM_BIND_OP_MAP &&
3165 				 (is_null || is_cpu_addr_mirror)) ||
3166 		    XE_IOCTL_DBG(xe, !obj &&
3167 				 op == DRM_XE_VM_BIND_OP_MAP &&
3168 				 !is_null && !is_cpu_addr_mirror) ||
3169 		    XE_IOCTL_DBG(xe, !obj &&
3170 				 op == DRM_XE_VM_BIND_OP_UNMAP_ALL) ||
3171 		    XE_IOCTL_DBG(xe, addr &&
3172 				 op == DRM_XE_VM_BIND_OP_UNMAP_ALL) ||
3173 		    XE_IOCTL_DBG(xe, range &&
3174 				 op == DRM_XE_VM_BIND_OP_UNMAP_ALL) ||
3175 		    XE_IOCTL_DBG(xe, obj &&
3176 				 op == DRM_XE_VM_BIND_OP_MAP_USERPTR) ||
3177 		    XE_IOCTL_DBG(xe, coh_mode == XE_COH_NONE &&
3178 				 op == DRM_XE_VM_BIND_OP_MAP_USERPTR) ||
3179 		    XE_IOCTL_DBG(xe, obj &&
3180 				 op == DRM_XE_VM_BIND_OP_PREFETCH) ||
3181 		    XE_IOCTL_DBG(xe, prefetch_region &&
3182 				 op != DRM_XE_VM_BIND_OP_PREFETCH) ||
3183 		    XE_IOCTL_DBG(xe, !(BIT(prefetch_region) &
3184 				       xe->info.mem_region_mask)) ||
3185 		    XE_IOCTL_DBG(xe, obj &&
3186 				 op == DRM_XE_VM_BIND_OP_UNMAP)) {
3187 			err = -EINVAL;
3188 			goto free_bind_ops;
3189 		}
3190 
3191 		if (XE_IOCTL_DBG(xe, obj_offset & ~PAGE_MASK) ||
3192 		    XE_IOCTL_DBG(xe, addr & ~PAGE_MASK) ||
3193 		    XE_IOCTL_DBG(xe, range & ~PAGE_MASK) ||
3194 		    XE_IOCTL_DBG(xe, !range &&
3195 				 op != DRM_XE_VM_BIND_OP_UNMAP_ALL)) {
3196 			err = -EINVAL;
3197 			goto free_bind_ops;
3198 		}
3199 	}
3200 
3201 	return 0;
3202 
3203 free_bind_ops:
3204 	if (args->num_binds > 1)
3205 		kvfree(*bind_ops);
3206 	return err;
3207 }
3208 
3209 static int vm_bind_ioctl_signal_fences(struct xe_vm *vm,
3210 				       struct xe_exec_queue *q,
3211 				       struct xe_sync_entry *syncs,
3212 				       int num_syncs)
3213 {
3214 	struct dma_fence *fence;
3215 	int i, err = 0;
3216 
3217 	fence = xe_sync_in_fence_get(syncs, num_syncs,
3218 				     to_wait_exec_queue(vm, q), vm);
3219 	if (IS_ERR(fence))
3220 		return PTR_ERR(fence);
3221 
3222 	for (i = 0; i < num_syncs; i++)
3223 		xe_sync_entry_signal(&syncs[i], fence);
3224 
3225 	xe_exec_queue_last_fence_set(to_wait_exec_queue(vm, q), vm,
3226 				     fence);
3227 	dma_fence_put(fence);
3228 
3229 	return err;
3230 }
3231 
3232 static void xe_vma_ops_init(struct xe_vma_ops *vops, struct xe_vm *vm,
3233 			    struct xe_exec_queue *q,
3234 			    struct xe_sync_entry *syncs, u32 num_syncs)
3235 {
3236 	memset(vops, 0, sizeof(*vops));
3237 	INIT_LIST_HEAD(&vops->list);
3238 	vops->vm = vm;
3239 	vops->q = q;
3240 	vops->syncs = syncs;
3241 	vops->num_syncs = num_syncs;
3242 }
3243 
3244 static int xe_vm_bind_ioctl_validate_bo(struct xe_device *xe, struct xe_bo *bo,
3245 					u64 addr, u64 range, u64 obj_offset,
3246 					u16 pat_index, u32 op, u32 bind_flags)
3247 {
3248 	u16 coh_mode;
3249 
3250 	if (XE_IOCTL_DBG(xe, range > bo->size) ||
3251 	    XE_IOCTL_DBG(xe, obj_offset >
3252 			 bo->size - range)) {
3253 		return -EINVAL;
3254 	}
3255 
3256 	/*
3257 	 * Some platforms require 64k VM_BIND alignment,
3258 	 * specifically those with XE_VRAM_FLAGS_NEED64K.
3259 	 *
3260 	 * Other platforms may have BO's set to 64k physical placement,
3261 	 * but can be mapped at 4k offsets anyway. This check is only
3262 	 * there for the former case.
3263 	 */
3264 	if ((bo->flags & XE_BO_FLAG_INTERNAL_64K) &&
3265 	    (xe->info.vram_flags & XE_VRAM_FLAGS_NEED64K)) {
3266 		if (XE_IOCTL_DBG(xe, obj_offset &
3267 				 XE_64K_PAGE_MASK) ||
3268 		    XE_IOCTL_DBG(xe, addr & XE_64K_PAGE_MASK) ||
3269 		    XE_IOCTL_DBG(xe, range & XE_64K_PAGE_MASK)) {
3270 			return -EINVAL;
3271 		}
3272 	}
3273 
3274 	coh_mode = xe_pat_index_get_coh_mode(xe, pat_index);
3275 	if (bo->cpu_caching) {
3276 		if (XE_IOCTL_DBG(xe, coh_mode == XE_COH_NONE &&
3277 				 bo->cpu_caching == DRM_XE_GEM_CPU_CACHING_WB)) {
3278 			return -EINVAL;
3279 		}
3280 	} else if (XE_IOCTL_DBG(xe, coh_mode == XE_COH_NONE)) {
3281 		/*
3282 		 * Imported dma-buf from a different device should
3283 		 * require 1way or 2way coherency since we don't know
3284 		 * how it was mapped on the CPU. Just assume is it
3285 		 * potentially cached on CPU side.
3286 		 */
3287 		return -EINVAL;
3288 	}
3289 
3290 	/* If a BO is protected it can only be mapped if the key is still valid */
3291 	if ((bind_flags & DRM_XE_VM_BIND_FLAG_CHECK_PXP) && xe_bo_is_protected(bo) &&
3292 	    op != DRM_XE_VM_BIND_OP_UNMAP && op != DRM_XE_VM_BIND_OP_UNMAP_ALL)
3293 		if (XE_IOCTL_DBG(xe, xe_pxp_bo_key_check(xe->pxp, bo) != 0))
3294 			return -ENOEXEC;
3295 
3296 	return 0;
3297 }
3298 
3299 int xe_vm_bind_ioctl(struct drm_device *dev, void *data, struct drm_file *file)
3300 {
3301 	struct xe_device *xe = to_xe_device(dev);
3302 	struct xe_file *xef = to_xe_file(file);
3303 	struct drm_xe_vm_bind *args = data;
3304 	struct drm_xe_sync __user *syncs_user;
3305 	struct xe_bo **bos = NULL;
3306 	struct drm_gpuva_ops **ops = NULL;
3307 	struct xe_vm *vm;
3308 	struct xe_exec_queue *q = NULL;
3309 	u32 num_syncs, num_ufence = 0;
3310 	struct xe_sync_entry *syncs = NULL;
3311 	struct drm_xe_vm_bind_op *bind_ops;
3312 	struct xe_vma_ops vops;
3313 	struct dma_fence *fence;
3314 	int err;
3315 	int i;
3316 
3317 	vm = xe_vm_lookup(xef, args->vm_id);
3318 	if (XE_IOCTL_DBG(xe, !vm))
3319 		return -EINVAL;
3320 
3321 	err = vm_bind_ioctl_check_args(xe, vm, args, &bind_ops);
3322 	if (err)
3323 		goto put_vm;
3324 
3325 	if (args->exec_queue_id) {
3326 		q = xe_exec_queue_lookup(xef, args->exec_queue_id);
3327 		if (XE_IOCTL_DBG(xe, !q)) {
3328 			err = -ENOENT;
3329 			goto put_vm;
3330 		}
3331 
3332 		if (XE_IOCTL_DBG(xe, !(q->flags & EXEC_QUEUE_FLAG_VM))) {
3333 			err = -EINVAL;
3334 			goto put_exec_queue;
3335 		}
3336 	}
3337 
3338 	/* Ensure all UNMAPs visible */
3339 	xe_svm_flush(vm);
3340 
3341 	err = down_write_killable(&vm->lock);
3342 	if (err)
3343 		goto put_exec_queue;
3344 
3345 	if (XE_IOCTL_DBG(xe, xe_vm_is_closed_or_banned(vm))) {
3346 		err = -ENOENT;
3347 		goto release_vm_lock;
3348 	}
3349 
3350 	for (i = 0; i < args->num_binds; ++i) {
3351 		u64 range = bind_ops[i].range;
3352 		u64 addr = bind_ops[i].addr;
3353 
3354 		if (XE_IOCTL_DBG(xe, range > vm->size) ||
3355 		    XE_IOCTL_DBG(xe, addr > vm->size - range)) {
3356 			err = -EINVAL;
3357 			goto release_vm_lock;
3358 		}
3359 	}
3360 
3361 	if (args->num_binds) {
3362 		bos = kvcalloc(args->num_binds, sizeof(*bos),
3363 			       GFP_KERNEL | __GFP_ACCOUNT |
3364 			       __GFP_RETRY_MAYFAIL | __GFP_NOWARN);
3365 		if (!bos) {
3366 			err = -ENOMEM;
3367 			goto release_vm_lock;
3368 		}
3369 
3370 		ops = kvcalloc(args->num_binds, sizeof(*ops),
3371 			       GFP_KERNEL | __GFP_ACCOUNT |
3372 			       __GFP_RETRY_MAYFAIL | __GFP_NOWARN);
3373 		if (!ops) {
3374 			err = -ENOMEM;
3375 			goto release_vm_lock;
3376 		}
3377 	}
3378 
3379 	for (i = 0; i < args->num_binds; ++i) {
3380 		struct drm_gem_object *gem_obj;
3381 		u64 range = bind_ops[i].range;
3382 		u64 addr = bind_ops[i].addr;
3383 		u32 obj = bind_ops[i].obj;
3384 		u64 obj_offset = bind_ops[i].obj_offset;
3385 		u16 pat_index = bind_ops[i].pat_index;
3386 		u32 op = bind_ops[i].op;
3387 		u32 bind_flags = bind_ops[i].flags;
3388 
3389 		if (!obj)
3390 			continue;
3391 
3392 		gem_obj = drm_gem_object_lookup(file, obj);
3393 		if (XE_IOCTL_DBG(xe, !gem_obj)) {
3394 			err = -ENOENT;
3395 			goto put_obj;
3396 		}
3397 		bos[i] = gem_to_xe_bo(gem_obj);
3398 
3399 		err = xe_vm_bind_ioctl_validate_bo(xe, bos[i], addr, range,
3400 						   obj_offset, pat_index, op,
3401 						   bind_flags);
3402 		if (err)
3403 			goto put_obj;
3404 	}
3405 
3406 	if (args->num_syncs) {
3407 		syncs = kcalloc(args->num_syncs, sizeof(*syncs), GFP_KERNEL);
3408 		if (!syncs) {
3409 			err = -ENOMEM;
3410 			goto put_obj;
3411 		}
3412 	}
3413 
3414 	syncs_user = u64_to_user_ptr(args->syncs);
3415 	for (num_syncs = 0; num_syncs < args->num_syncs; num_syncs++) {
3416 		err = xe_sync_entry_parse(xe, xef, &syncs[num_syncs],
3417 					  &syncs_user[num_syncs],
3418 					  (xe_vm_in_lr_mode(vm) ?
3419 					   SYNC_PARSE_FLAG_LR_MODE : 0) |
3420 					  (!args->num_binds ?
3421 					   SYNC_PARSE_FLAG_DISALLOW_USER_FENCE : 0));
3422 		if (err)
3423 			goto free_syncs;
3424 
3425 		if (xe_sync_is_ufence(&syncs[num_syncs]))
3426 			num_ufence++;
3427 	}
3428 
3429 	if (XE_IOCTL_DBG(xe, num_ufence > 1)) {
3430 		err = -EINVAL;
3431 		goto free_syncs;
3432 	}
3433 
3434 	if (!args->num_binds) {
3435 		err = -ENODATA;
3436 		goto free_syncs;
3437 	}
3438 
3439 	xe_vma_ops_init(&vops, vm, q, syncs, num_syncs);
3440 	for (i = 0; i < args->num_binds; ++i) {
3441 		u64 range = bind_ops[i].range;
3442 		u64 addr = bind_ops[i].addr;
3443 		u32 op = bind_ops[i].op;
3444 		u32 flags = bind_ops[i].flags;
3445 		u64 obj_offset = bind_ops[i].obj_offset;
3446 		u32 prefetch_region = bind_ops[i].prefetch_mem_region_instance;
3447 		u16 pat_index = bind_ops[i].pat_index;
3448 
3449 		ops[i] = vm_bind_ioctl_ops_create(vm, bos[i], obj_offset,
3450 						  addr, range, op, flags,
3451 						  prefetch_region, pat_index);
3452 		if (IS_ERR(ops[i])) {
3453 			err = PTR_ERR(ops[i]);
3454 			ops[i] = NULL;
3455 			goto unwind_ops;
3456 		}
3457 
3458 		err = vm_bind_ioctl_ops_parse(vm, ops[i], &vops);
3459 		if (err)
3460 			goto unwind_ops;
3461 
3462 #ifdef TEST_VM_OPS_ERROR
3463 		if (flags & FORCE_OP_ERROR) {
3464 			vops.inject_error = true;
3465 			vm->xe->vm_inject_error_position =
3466 				(vm->xe->vm_inject_error_position + 1) %
3467 				FORCE_OP_ERROR_COUNT;
3468 		}
3469 #endif
3470 	}
3471 
3472 	/* Nothing to do */
3473 	if (list_empty(&vops.list)) {
3474 		err = -ENODATA;
3475 		goto unwind_ops;
3476 	}
3477 
3478 	err = xe_vma_ops_alloc(&vops, args->num_binds > 1);
3479 	if (err)
3480 		goto unwind_ops;
3481 
3482 	fence = vm_bind_ioctl_ops_execute(vm, &vops);
3483 	if (IS_ERR(fence))
3484 		err = PTR_ERR(fence);
3485 	else
3486 		dma_fence_put(fence);
3487 
3488 unwind_ops:
3489 	if (err && err != -ENODATA)
3490 		vm_bind_ioctl_ops_unwind(vm, ops, args->num_binds);
3491 	xe_vma_ops_fini(&vops);
3492 	for (i = args->num_binds - 1; i >= 0; --i)
3493 		if (ops[i])
3494 			drm_gpuva_ops_free(&vm->gpuvm, ops[i]);
3495 free_syncs:
3496 	if (err == -ENODATA)
3497 		err = vm_bind_ioctl_signal_fences(vm, q, syncs, num_syncs);
3498 	while (num_syncs--)
3499 		xe_sync_entry_cleanup(&syncs[num_syncs]);
3500 
3501 	kfree(syncs);
3502 put_obj:
3503 	for (i = 0; i < args->num_binds; ++i)
3504 		xe_bo_put(bos[i]);
3505 release_vm_lock:
3506 	up_write(&vm->lock);
3507 put_exec_queue:
3508 	if (q)
3509 		xe_exec_queue_put(q);
3510 put_vm:
3511 	xe_vm_put(vm);
3512 	kvfree(bos);
3513 	kvfree(ops);
3514 	if (args->num_binds > 1)
3515 		kvfree(bind_ops);
3516 	return err;
3517 }
3518 
3519 /**
3520  * xe_vm_bind_kernel_bo - bind a kernel BO to a VM
3521  * @vm: VM to bind the BO to
3522  * @bo: BO to bind
3523  * @q: exec queue to use for the bind (optional)
3524  * @addr: address at which to bind the BO
3525  * @cache_lvl: PAT cache level to use
3526  *
3527  * Execute a VM bind map operation on a kernel-owned BO to bind it into a
3528  * kernel-owned VM.
3529  *
3530  * Returns a dma_fence to track the binding completion if the job to do so was
3531  * successfully submitted, an error pointer otherwise.
3532  */
3533 struct dma_fence *xe_vm_bind_kernel_bo(struct xe_vm *vm, struct xe_bo *bo,
3534 				       struct xe_exec_queue *q, u64 addr,
3535 				       enum xe_cache_level cache_lvl)
3536 {
3537 	struct xe_vma_ops vops;
3538 	struct drm_gpuva_ops *ops = NULL;
3539 	struct dma_fence *fence;
3540 	int err;
3541 
3542 	xe_bo_get(bo);
3543 	xe_vm_get(vm);
3544 	if (q)
3545 		xe_exec_queue_get(q);
3546 
3547 	down_write(&vm->lock);
3548 
3549 	xe_vma_ops_init(&vops, vm, q, NULL, 0);
3550 
3551 	ops = vm_bind_ioctl_ops_create(vm, bo, 0, addr, bo->size,
3552 				       DRM_XE_VM_BIND_OP_MAP, 0, 0,
3553 				       vm->xe->pat.idx[cache_lvl]);
3554 	if (IS_ERR(ops)) {
3555 		err = PTR_ERR(ops);
3556 		goto release_vm_lock;
3557 	}
3558 
3559 	err = vm_bind_ioctl_ops_parse(vm, ops, &vops);
3560 	if (err)
3561 		goto release_vm_lock;
3562 
3563 	xe_assert(vm->xe, !list_empty(&vops.list));
3564 
3565 	err = xe_vma_ops_alloc(&vops, false);
3566 	if (err)
3567 		goto unwind_ops;
3568 
3569 	fence = vm_bind_ioctl_ops_execute(vm, &vops);
3570 	if (IS_ERR(fence))
3571 		err = PTR_ERR(fence);
3572 
3573 unwind_ops:
3574 	if (err && err != -ENODATA)
3575 		vm_bind_ioctl_ops_unwind(vm, &ops, 1);
3576 
3577 	xe_vma_ops_fini(&vops);
3578 	drm_gpuva_ops_free(&vm->gpuvm, ops);
3579 
3580 release_vm_lock:
3581 	up_write(&vm->lock);
3582 
3583 	if (q)
3584 		xe_exec_queue_put(q);
3585 	xe_vm_put(vm);
3586 	xe_bo_put(bo);
3587 
3588 	if (err)
3589 		fence = ERR_PTR(err);
3590 
3591 	return fence;
3592 }
3593 
3594 /**
3595  * xe_vm_lock() - Lock the vm's dma_resv object
3596  * @vm: The struct xe_vm whose lock is to be locked
3597  * @intr: Whether to perform any wait interruptible
3598  *
3599  * Return: 0 on success, -EINTR if @intr is true and the wait for a
3600  * contended lock was interrupted. If @intr is false, the function
3601  * always returns 0.
3602  */
3603 int xe_vm_lock(struct xe_vm *vm, bool intr)
3604 {
3605 	if (intr)
3606 		return dma_resv_lock_interruptible(xe_vm_resv(vm), NULL);
3607 
3608 	return dma_resv_lock(xe_vm_resv(vm), NULL);
3609 }
3610 
/**
 * xe_vm_unlock() - Unlock the vm's dma_resv object
 * @vm: The struct xe_vm whose dma_resv is to be released.
 *
 * Counterpart of xe_vm_lock(): drops the dma_resv lock taken there.
 */
void xe_vm_unlock(struct xe_vm *vm)
{
	struct dma_resv *resv = xe_vm_resv(vm);

	dma_resv_unlock(resv);
}
3621 
/**
 * xe_vm_invalidate_vma - invalidate GPU mappings for VMA without a lock
 * @vma: VMA to invalidate
 *
 * For every tile, zaps the page-table entries owned by this VMA and, where
 * entries were zapped, issues a TLB invalidation on the tile's primary GT and
 * (if present) its media GT. Blocks until all successfully issued TLB
 * invalidations have completed.
 *
 * Must not be called on NULL or CPU-address-mirror VMAs (asserted).
 *
 * Returns 0 for success, negative error code otherwise.
 */
int xe_vm_invalidate_vma(struct xe_vma *vma)
{
	struct xe_device *xe = xe_vma_vm(vma)->xe;
	struct xe_tile *tile;
	/* One on-stack fence slot per possible GT (primary + media per tile) */
	struct xe_gt_tlb_invalidation_fence
		fence[XE_MAX_TILES_PER_DEVICE * XE_MAX_GT_PER_TILE];
	u8 id;
	/* Number of fences whose invalidation was issued successfully */
	u32 fence_id = 0;
	int ret = 0;

	xe_assert(xe, !xe_vma_is_null(vma));
	xe_assert(xe, !xe_vma_is_cpu_addr_mirror(vma));
	trace_xe_vma_invalidate(vma);

	vm_dbg(&xe_vma_vm(vma)->xe->drm,
	       "INVALIDATE: addr=0x%016llx, range=0x%016llx",
		xe_vma_start(vma), xe_vma_size(vma));

	/* Check that we don't race with page-table updates */
	if (IS_ENABLED(CONFIG_PROVE_LOCKING)) {
		if (xe_vma_is_userptr(vma)) {
			/*
			 * Userptr: warn unless the notifier sequence already
			 * calls for a retry and all bookkeep fences on the
			 * VM's resv have signalled.
			 */
			WARN_ON_ONCE(!mmu_interval_check_retry
				     (&to_userptr_vma(vma)->userptr.notifier,
				      to_userptr_vma(vma)->userptr.notifier_seq));
			WARN_ON_ONCE(!dma_resv_test_signaled(xe_vm_resv(xe_vma_vm(vma)),
							     DMA_RESV_USAGE_BOOKKEEP));

		} else {
			/* BO-backed: the BO lock must be held by the caller */
			xe_bo_assert_held(xe_vma_bo(vma));
		}
	}

	for_each_tile(tile, xe, id) {
		/* Only invalidate TLBs on tiles where PTEs were zapped */
		if (xe_pt_zap_ptes(tile, vma)) {
			/* Barrier between the PTE zap and the TLB invalidation */
			xe_device_wmb(xe);
			xe_gt_tlb_invalidation_fence_init(tile->primary_gt,
							  &fence[fence_id],
							  true);

			ret = xe_gt_tlb_invalidation_vma(tile->primary_gt,
							 &fence[fence_id], vma);
			/*
			 * On failure, stop issuing and wait only for the
			 * fences counted so far; the failed one is not counted.
			 */
			if (ret)
				goto wait;
			++fence_id;

			if (!tile->media_gt)
				continue;

			xe_gt_tlb_invalidation_fence_init(tile->media_gt,
							  &fence[fence_id],
							  true);

			ret = xe_gt_tlb_invalidation_vma(tile->media_gt,
							 &fence[fence_id], vma);
			if (ret)
				goto wait;
			++fence_id;
		}
	}

wait:
	for (id = 0; id < fence_id; ++id)
		xe_gt_tlb_invalidation_fence_wait(&fence[id]);

	/* Recorded unconditionally, i.e. also when ret holds an error */
	vma->tile_invalidated = vma->tile_mask;

	return ret;
}
3700 
3701 int xe_vm_validate_protected(struct xe_vm *vm)
3702 {
3703 	struct drm_gpuva *gpuva;
3704 	int err = 0;
3705 
3706 	if (!vm)
3707 		return -ENODEV;
3708 
3709 	mutex_lock(&vm->snap_mutex);
3710 
3711 	drm_gpuvm_for_each_va(gpuva, &vm->gpuvm) {
3712 		struct xe_vma *vma = gpuva_to_vma(gpuva);
3713 		struct xe_bo *bo = vma->gpuva.gem.obj ?
3714 			gem_to_xe_bo(vma->gpuva.gem.obj) : NULL;
3715 
3716 		if (!bo)
3717 			continue;
3718 
3719 		if (xe_bo_is_protected(bo)) {
3720 			err = xe_pxp_bo_key_check(vm->xe->pxp, bo);
3721 			if (err)
3722 				break;
3723 		}
3724 	}
3725 
3726 	mutex_unlock(&vm->snap_mutex);
3727 	return err;
3728 }
3729 
/*
 * Snapshot of the dumpable VMAs of a VM.
 *
 * Allocated and filled in by xe_vm_snapshot_capture() with one entry per VMA
 * flagged XE_VMA_DUMPABLE; contents are read later by
 * xe_vm_snapshot_capture_delayed().
 */
struct xe_vm_snapshot {
	/* Number of entries in snap[] */
	unsigned long num_snaps;
	struct {
		/*
		 * ofs: start address of the VMA.
		 * bo_ofs: offset of the VMA within its BO, or the userptr
		 * address for userptr VMAs.
		 */
		u64 ofs, bo_ofs;
		/* Size of the VMA in bytes */
		unsigned long len;
		/* Referenced backing BO, NULL if the VMA has none */
		struct xe_bo *bo;
		/* Captured contents, or an ERR_PTR() describing the failure */
		void *data;
		/* Referenced mm of a userptr VMA, NULL otherwise */
		struct mm_struct *mm;
	} snap[];
};
3740 
3741 struct xe_vm_snapshot *xe_vm_snapshot_capture(struct xe_vm *vm)
3742 {
3743 	unsigned long num_snaps = 0, i;
3744 	struct xe_vm_snapshot *snap = NULL;
3745 	struct drm_gpuva *gpuva;
3746 
3747 	if (!vm)
3748 		return NULL;
3749 
3750 	mutex_lock(&vm->snap_mutex);
3751 	drm_gpuvm_for_each_va(gpuva, &vm->gpuvm) {
3752 		if (gpuva->flags & XE_VMA_DUMPABLE)
3753 			num_snaps++;
3754 	}
3755 
3756 	if (num_snaps)
3757 		snap = kvzalloc(offsetof(struct xe_vm_snapshot, snap[num_snaps]), GFP_NOWAIT);
3758 	if (!snap) {
3759 		snap = num_snaps ? ERR_PTR(-ENOMEM) : ERR_PTR(-ENODEV);
3760 		goto out_unlock;
3761 	}
3762 
3763 	snap->num_snaps = num_snaps;
3764 	i = 0;
3765 	drm_gpuvm_for_each_va(gpuva, &vm->gpuvm) {
3766 		struct xe_vma *vma = gpuva_to_vma(gpuva);
3767 		struct xe_bo *bo = vma->gpuva.gem.obj ?
3768 			gem_to_xe_bo(vma->gpuva.gem.obj) : NULL;
3769 
3770 		if (!(gpuva->flags & XE_VMA_DUMPABLE))
3771 			continue;
3772 
3773 		snap->snap[i].ofs = xe_vma_start(vma);
3774 		snap->snap[i].len = xe_vma_size(vma);
3775 		if (bo) {
3776 			snap->snap[i].bo = xe_bo_get(bo);
3777 			snap->snap[i].bo_ofs = xe_vma_bo_offset(vma);
3778 		} else if (xe_vma_is_userptr(vma)) {
3779 			struct mm_struct *mm =
3780 				to_userptr_vma(vma)->userptr.notifier.mm;
3781 
3782 			if (mmget_not_zero(mm))
3783 				snap->snap[i].mm = mm;
3784 			else
3785 				snap->snap[i].data = ERR_PTR(-EFAULT);
3786 
3787 			snap->snap[i].bo_ofs = xe_vma_userptr(vma);
3788 		} else {
3789 			snap->snap[i].data = ERR_PTR(-ENOENT);
3790 		}
3791 		i++;
3792 	}
3793 
3794 out_unlock:
3795 	mutex_unlock(&vm->snap_mutex);
3796 	return snap;
3797 }
3798 
3799 void xe_vm_snapshot_capture_delayed(struct xe_vm_snapshot *snap)
3800 {
3801 	if (IS_ERR_OR_NULL(snap))
3802 		return;
3803 
3804 	for (int i = 0; i < snap->num_snaps; i++) {
3805 		struct xe_bo *bo = snap->snap[i].bo;
3806 		int err;
3807 
3808 		if (IS_ERR(snap->snap[i].data))
3809 			continue;
3810 
3811 		snap->snap[i].data = kvmalloc(snap->snap[i].len, GFP_USER);
3812 		if (!snap->snap[i].data) {
3813 			snap->snap[i].data = ERR_PTR(-ENOMEM);
3814 			goto cleanup_bo;
3815 		}
3816 
3817 		if (bo) {
3818 			err = xe_bo_read(bo, snap->snap[i].bo_ofs,
3819 					 snap->snap[i].data, snap->snap[i].len);
3820 		} else {
3821 			void __user *userptr = (void __user *)(size_t)snap->snap[i].bo_ofs;
3822 
3823 			kthread_use_mm(snap->snap[i].mm);
3824 			if (!copy_from_user(snap->snap[i].data, userptr, snap->snap[i].len))
3825 				err = 0;
3826 			else
3827 				err = -EFAULT;
3828 			kthread_unuse_mm(snap->snap[i].mm);
3829 
3830 			mmput(snap->snap[i].mm);
3831 			snap->snap[i].mm = NULL;
3832 		}
3833 
3834 		if (err) {
3835 			kvfree(snap->snap[i].data);
3836 			snap->snap[i].data = ERR_PTR(err);
3837 		}
3838 
3839 cleanup_bo:
3840 		xe_bo_put(bo);
3841 		snap->snap[i].bo = NULL;
3842 	}
3843 }
3844 
3845 void xe_vm_snapshot_print(struct xe_vm_snapshot *snap, struct drm_printer *p)
3846 {
3847 	unsigned long i, j;
3848 
3849 	if (IS_ERR_OR_NULL(snap)) {
3850 		drm_printf(p, "[0].error: %li\n", PTR_ERR(snap));
3851 		return;
3852 	}
3853 
3854 	for (i = 0; i < snap->num_snaps; i++) {
3855 		drm_printf(p, "[%llx].length: 0x%lx\n", snap->snap[i].ofs, snap->snap[i].len);
3856 
3857 		if (IS_ERR(snap->snap[i].data)) {
3858 			drm_printf(p, "[%llx].error: %li\n", snap->snap[i].ofs,
3859 				   PTR_ERR(snap->snap[i].data));
3860 			continue;
3861 		}
3862 
3863 		drm_printf(p, "[%llx].data: ", snap->snap[i].ofs);
3864 
3865 		for (j = 0; j < snap->snap[i].len; j += sizeof(u32)) {
3866 			u32 *val = snap->snap[i].data + j;
3867 			char dumped[ASCII85_BUFSZ];
3868 
3869 			drm_puts(p, ascii85_encode(*val, dumped));
3870 		}
3871 
3872 		drm_puts(p, "\n");
3873 
3874 		if (drm_coredump_printer_is_full(p))
3875 			return;
3876 	}
3877 }
3878 
3879 void xe_vm_snapshot_free(struct xe_vm_snapshot *snap)
3880 {
3881 	unsigned long i;
3882 
3883 	if (IS_ERR_OR_NULL(snap))
3884 		return;
3885 
3886 	for (i = 0; i < snap->num_snaps; i++) {
3887 		if (!IS_ERR(snap->snap[i].data))
3888 			kvfree(snap->snap[i].data);
3889 		xe_bo_put(snap->snap[i].bo);
3890 		if (snap->snap[i].mm)
3891 			mmput(snap->snap[i].mm);
3892 	}
3893 	kvfree(snap);
3894 }
3895