xref: /linux/drivers/gpu/drm/xe/xe_vm.c (revision 1cc3462159babb69c84c39cb1b4e262aef3ea325)
1 // SPDX-License-Identifier: MIT
2 /*
3  * Copyright © 2021 Intel Corporation
4  */
5 
6 #include "xe_vm.h"
7 
8 #include <linux/dma-fence-array.h>
9 #include <linux/nospec.h>
10 
11 #include <drm/drm_exec.h>
12 #include <drm/drm_print.h>
13 #include <drm/ttm/ttm_tt.h>
14 #include <uapi/drm/xe_drm.h>
15 #include <linux/ascii85.h>
16 #include <linux/delay.h>
17 #include <linux/kthread.h>
18 #include <linux/mm.h>
19 #include <linux/swap.h>
20 
21 #include <generated/xe_wa_oob.h>
22 
23 #include "regs/xe_gtt_defs.h"
24 #include "xe_assert.h"
25 #include "xe_bo.h"
26 #include "xe_device.h"
27 #include "xe_drm_client.h"
28 #include "xe_exec_queue.h"
29 #include "xe_gt_pagefault.h"
30 #include "xe_gt_tlb_invalidation.h"
31 #include "xe_migrate.h"
32 #include "xe_pat.h"
33 #include "xe_pm.h"
34 #include "xe_preempt_fence.h"
35 #include "xe_pt.h"
36 #include "xe_res_cursor.h"
37 #include "xe_sync.h"
38 #include "xe_trace_bo.h"
39 #include "xe_wa.h"
40 #include "xe_hmm.h"
41 
42 static struct drm_gem_object *xe_vm_obj(struct xe_vm *vm)
43 {
44 	return vm->gpuvm.r_obj;
45 }
46 
47 /**
48  * xe_vma_userptr_check_repin() - Advisory check for repin needed
49  * @uvma: The userptr vma
50  *
51  * Check if the userptr vma has been invalidated since last successful
52  * repin. The check is advisory only and the function can be called
53  * without the vm->userptr.notifier_lock held. There is no guarantee that the
54  * vma userptr will remain valid after a lockless check, so typically
55  * the call needs to be followed by a proper check under the notifier_lock.
56  *
57  * Return: 0 if userptr vma is valid, -EAGAIN otherwise; repin recommended.
58  */
59 int xe_vma_userptr_check_repin(struct xe_userptr_vma *uvma)
60 {
61 	return mmu_interval_check_retry(&uvma->userptr.notifier,
62 					uvma->userptr.notifier_seq) ?
63 		-EAGAIN : 0;
64 }
65 
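/*
 * Usage sketch (editor's illustration, not part of the driver): the advisory
 * check above is typically paired with a repin and a final check under the
 * notifier lock, roughly:
 *
 *	if (xe_vma_userptr_check_repin(uvma))
 *		err = xe_vma_userptr_pin_pages(uvma);
 *
 *	down_read(&vm->userptr.notifier_lock);
 *	if (__xe_vm_userptr_needs_repin(vm))
 *		err = -EAGAIN;
 *	up_read(&vm->userptr.notifier_lock);
 *
 * The real retry plumbing depends on the caller (see preempt_rebind_work_func()
 * below); this only illustrates the lockless-check / repin / recheck pattern.
 */
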
66 int xe_vma_userptr_pin_pages(struct xe_userptr_vma *uvma)
67 {
68 	struct xe_vma *vma = &uvma->vma;
69 	struct xe_vm *vm = xe_vma_vm(vma);
70 	struct xe_device *xe = vm->xe;
71 
72 	lockdep_assert_held(&vm->lock);
73 	xe_assert(xe, xe_vma_is_userptr(vma));
74 
75 	return xe_hmm_userptr_populate_range(uvma, false);
76 }
77 
78 static bool preempt_fences_waiting(struct xe_vm *vm)
79 {
80 	struct xe_exec_queue *q;
81 
82 	lockdep_assert_held(&vm->lock);
83 	xe_vm_assert_held(vm);
84 
85 	list_for_each_entry(q, &vm->preempt.exec_queues, lr.link) {
86 		if (!q->lr.pfence ||
87 		    test_bit(DMA_FENCE_FLAG_ENABLE_SIGNAL_BIT,
88 			     &q->lr.pfence->flags)) {
89 			return true;
90 		}
91 	}
92 
93 	return false;
94 }
95 
96 static void free_preempt_fences(struct list_head *list)
97 {
98 	struct list_head *link, *next;
99 
100 	list_for_each_safe(link, next, list)
101 		xe_preempt_fence_free(to_preempt_fence_from_link(link));
102 }
103 
104 static int alloc_preempt_fences(struct xe_vm *vm, struct list_head *list,
105 				unsigned int *count)
106 {
107 	lockdep_assert_held(&vm->lock);
108 	xe_vm_assert_held(vm);
109 
110 	if (*count >= vm->preempt.num_exec_queues)
111 		return 0;
112 
113 	for (; *count < vm->preempt.num_exec_queues; ++(*count)) {
114 		struct xe_preempt_fence *pfence = xe_preempt_fence_alloc();
115 
116 		if (IS_ERR(pfence))
117 			return PTR_ERR(pfence);
118 
119 		list_move_tail(xe_preempt_fence_link(pfence), list);
120 	}
121 
122 	return 0;
123 }
124 
125 static int wait_for_existing_preempt_fences(struct xe_vm *vm)
126 {
127 	struct xe_exec_queue *q;
128 
129 	xe_vm_assert_held(vm);
130 
131 	list_for_each_entry(q, &vm->preempt.exec_queues, lr.link) {
132 		if (q->lr.pfence) {
133 			long timeout = dma_fence_wait(q->lr.pfence, false);
134 
135 			/* Only -ETIME on fence indicates VM needs to be killed */
136 			if (timeout < 0 || q->lr.pfence->error == -ETIME)
137 				return -ETIME;
138 
139 			dma_fence_put(q->lr.pfence);
140 			q->lr.pfence = NULL;
141 		}
142 	}
143 
144 	return 0;
145 }
146 
147 static bool xe_vm_is_idle(struct xe_vm *vm)
148 {
149 	struct xe_exec_queue *q;
150 
151 	xe_vm_assert_held(vm);
152 	list_for_each_entry(q, &vm->preempt.exec_queues, lr.link) {
153 		if (!xe_exec_queue_is_idle(q))
154 			return false;
155 	}
156 
157 	return true;
158 }
159 
160 static void arm_preempt_fences(struct xe_vm *vm, struct list_head *list)
161 {
162 	struct list_head *link;
163 	struct xe_exec_queue *q;
164 
165 	list_for_each_entry(q, &vm->preempt.exec_queues, lr.link) {
166 		struct dma_fence *fence;
167 
168 		link = list->next;
169 		xe_assert(vm->xe, link != list);
170 
171 		fence = xe_preempt_fence_arm(to_preempt_fence_from_link(link),
172 					     q, q->lr.context,
173 					     ++q->lr.seqno);
174 		dma_fence_put(q->lr.pfence);
175 		q->lr.pfence = fence;
176 	}
177 }
178 
179 static int add_preempt_fences(struct xe_vm *vm, struct xe_bo *bo)
180 {
181 	struct xe_exec_queue *q;
182 	int err;
183 
184 	xe_bo_assert_held(bo);
185 
186 	if (!vm->preempt.num_exec_queues)
187 		return 0;
188 
189 	err = dma_resv_reserve_fences(bo->ttm.base.resv, vm->preempt.num_exec_queues);
190 	if (err)
191 		return err;
192 
193 	list_for_each_entry(q, &vm->preempt.exec_queues, lr.link)
194 		if (q->lr.pfence) {
195 			dma_resv_add_fence(bo->ttm.base.resv,
196 					   q->lr.pfence,
197 					   DMA_RESV_USAGE_BOOKKEEP);
198 		}
199 
200 	return 0;
201 }
202 
203 static void resume_and_reinstall_preempt_fences(struct xe_vm *vm,
204 						struct drm_exec *exec)
205 {
206 	struct xe_exec_queue *q;
207 
208 	lockdep_assert_held(&vm->lock);
209 	xe_vm_assert_held(vm);
210 
211 	list_for_each_entry(q, &vm->preempt.exec_queues, lr.link) {
212 		q->ops->resume(q);
213 
214 		drm_gpuvm_resv_add_fence(&vm->gpuvm, exec, q->lr.pfence,
215 					 DMA_RESV_USAGE_BOOKKEEP, DMA_RESV_USAGE_BOOKKEEP);
216 	}
217 }
218 
219 int xe_vm_add_compute_exec_queue(struct xe_vm *vm, struct xe_exec_queue *q)
220 {
221 	struct drm_gpuvm_exec vm_exec = {
222 		.vm = &vm->gpuvm,
223 		.flags = DRM_EXEC_INTERRUPTIBLE_WAIT,
224 		.num_fences = 1,
225 	};
226 	struct drm_exec *exec = &vm_exec.exec;
227 	struct dma_fence *pfence;
228 	int err;
229 	bool wait;
230 
231 	xe_assert(vm->xe, xe_vm_in_preempt_fence_mode(vm));
232 
233 	down_write(&vm->lock);
234 	err = drm_gpuvm_exec_lock(&vm_exec);
235 	if (err)
236 		goto out_up_write;
237 
238 	pfence = xe_preempt_fence_create(q, q->lr.context,
239 					 ++q->lr.seqno);
240 	if (!pfence) {
241 		err = -ENOMEM;
242 		goto out_fini;
243 	}
244 
245 	list_add(&q->lr.link, &vm->preempt.exec_queues);
246 	++vm->preempt.num_exec_queues;
247 	q->lr.pfence = pfence;
248 
249 	down_read(&vm->userptr.notifier_lock);
250 
251 	drm_gpuvm_resv_add_fence(&vm->gpuvm, exec, pfence,
252 				 DMA_RESV_USAGE_BOOKKEEP, DMA_RESV_USAGE_BOOKKEEP);
253 
254 	/*
255 	 * Check to see if a preemption on the VM or a userptr invalidation
256 	 * is in flight; if so, trigger this preempt fence to sync state with
257 	 * the other preempt fences on the VM.
258 	 */
259 	wait = __xe_vm_userptr_needs_repin(vm) || preempt_fences_waiting(vm);
260 	if (wait)
261 		dma_fence_enable_sw_signaling(pfence);
262 
263 	up_read(&vm->userptr.notifier_lock);
264 
265 out_fini:
266 	drm_exec_fini(exec);
267 out_up_write:
268 	up_write(&vm->lock);
269 
270 	return err;
271 }
272 
273 /**
274  * xe_vm_remove_compute_exec_queue() - Remove compute exec queue from VM
275  * @vm: The VM.
276  * @q: The exec_queue
277  *
278  * Note that this function might be called multiple times on the same queue.
279  */
280 void xe_vm_remove_compute_exec_queue(struct xe_vm *vm, struct xe_exec_queue *q)
281 {
282 	if (!xe_vm_in_preempt_fence_mode(vm))
283 		return;
284 
285 	down_write(&vm->lock);
286 	if (!list_empty(&q->lr.link)) {
287 		list_del_init(&q->lr.link);
288 		--vm->preempt.num_exec_queues;
289 	}
290 	if (q->lr.pfence) {
291 		dma_fence_enable_sw_signaling(q->lr.pfence);
292 		dma_fence_put(q->lr.pfence);
293 		q->lr.pfence = NULL;
294 	}
295 	up_write(&vm->lock);
296 }
297 
298 /**
299  * __xe_vm_userptr_needs_repin() - Check whether the VM does have userptrs
300  * that need repinning.
301  * @vm: The VM.
302  *
303  * This function checks whether the VM has userptrs that need repinning,
304  * and provides a release-type barrier on the userptr.notifier_lock after
305  * checking.
306  *
307  * Return: 0 if there are no userptrs needing repinning, -EAGAIN if there are.
308  */
309 int __xe_vm_userptr_needs_repin(struct xe_vm *vm)
310 {
311 	lockdep_assert_held_read(&vm->userptr.notifier_lock);
312 
313 	return (list_empty(&vm->userptr.repin_list) &&
314 		list_empty(&vm->userptr.invalidated)) ? 0 : -EAGAIN;
315 }
316 
317 #define XE_VM_REBIND_RETRY_TIMEOUT_MS 1000
318 
319 /**
320  * xe_vm_kill() - VM Kill
321  * @vm: The VM.
322  * @unlocked: Flag indicating the VM's dma-resv is not held
323  *
324  * Kill the VM by setting the banned flag, indicating the VM is no longer available
325  * for use. If in preempt fence mode, also kill all exec queues attached to the VM.
326  */
327 void xe_vm_kill(struct xe_vm *vm, bool unlocked)
328 {
329 	struct xe_exec_queue *q;
330 
331 	lockdep_assert_held(&vm->lock);
332 
333 	if (unlocked)
334 		xe_vm_lock(vm, false);
335 
336 	vm->flags |= XE_VM_FLAG_BANNED;
337 	trace_xe_vm_kill(vm);
338 
339 	list_for_each_entry(q, &vm->preempt.exec_queues, lr.link)
340 		q->ops->kill(q);
341 
342 	if (unlocked)
343 		xe_vm_unlock(vm);
344 
345 	/* TODO: Inform user the VM is banned */
346 }
347 
348 /**
349  * xe_vm_validate_should_retry() - Whether to retry after a validate error.
350  * @exec: The drm_exec object used for locking before validation.
351  * @err: The error returned from ttm_bo_validate().
352  * @end: A ktime_t cookie that should be set to 0 before first use and
353  * that should be reused on subsequent calls.
354  *
355  * With multiple active VMs, under memory pressure, it is possible that
356  * ttm_bo_validate() runs into -EDEADLK and in such a case returns -ENOMEM.
357  * Until ttm properly handles locking in such scenarios, the best thing the
358  * driver can do is retry with a timeout. Check if that is necessary, and
359  * if so unlock the drm_exec's objects while keeping the ticket to prepare
360  * for a rerun.
361  *
362  * Return: true if a retry after drm_exec_init() is recommended;
363  * false otherwise.
364  */
365 bool xe_vm_validate_should_retry(struct drm_exec *exec, int err, ktime_t *end)
366 {
367 	ktime_t cur;
368 
369 	if (err != -ENOMEM)
370 		return false;
371 
372 	cur = ktime_get();
373 	*end = *end ? : ktime_add_ms(cur, XE_VM_REBIND_RETRY_TIMEOUT_MS);
374 	if (!ktime_before(cur, *end))
375 		return false;
376 
377 	msleep(20);
378 	return true;
379 }
380 
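/*
 * Retry sketch (editor's illustration; lock_and_validate() is a hypothetical
 * helper that runs the drm_exec locking loop and ttm validation). The ktime_t
 * cookie must start at zero and be reused across attempts of the same
 * operation:
 *
 *	struct drm_exec exec;
 *	ktime_t end = 0;
 *	int err;
 *
 *	retry:
 *	err = lock_and_validate(vm, &exec);
 *	drm_exec_fini(&exec);
 *	if (err && xe_vm_validate_should_retry(&exec, err, &end))
 *		goto retry;
 *
 * preempt_rebind_work_func() below follows this pattern, turning a transient
 * -ENOMEM from validation into an -EAGAIN retry of the rebind worker.
 */
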
381 static int xe_gpuvm_validate(struct drm_gpuvm_bo *vm_bo, struct drm_exec *exec)
382 {
383 	struct xe_vm *vm = gpuvm_to_vm(vm_bo->vm);
384 	struct drm_gpuva *gpuva;
385 	int ret;
386 
387 	lockdep_assert_held(&vm->lock);
388 	drm_gpuvm_bo_for_each_va(gpuva, vm_bo)
389 		list_move_tail(&gpuva_to_vma(gpuva)->combined_links.rebind,
390 			       &vm->rebind_list);
391 
392 	ret = xe_bo_validate(gem_to_xe_bo(vm_bo->obj), vm, false);
393 	if (ret)
394 		return ret;
395 
396 	vm_bo->evicted = false;
397 	return 0;
398 }
399 
400 /**
401  * xe_vm_validate_rebind() - Validate buffer objects and rebind vmas
402  * @vm: The vm for which we are rebinding.
403  * @exec: The struct drm_exec with the locked GEM objects.
404  * @num_fences: The number of fences to reserve for the operation, not
405  * including rebinds and validations.
406  *
407  * Validates all evicted gem objects and rebinds their vmas. Note that
408  * rebindings may cause evictions and hence the validation-rebind
409  * sequence is rerun until there are no more objects to validate.
410  *
411  * Return: 0 on success, negative error code on error. In particular,
412  * may return -EINTR or -ERESTARTSYS if interrupted, and -EDEADLK if
413  * the drm_exec transaction needs to be restarted.
414  */
415 int xe_vm_validate_rebind(struct xe_vm *vm, struct drm_exec *exec,
416 			  unsigned int num_fences)
417 {
418 	struct drm_gem_object *obj;
419 	unsigned long index;
420 	int ret;
421 
422 	do {
423 		ret = drm_gpuvm_validate(&vm->gpuvm, exec);
424 		if (ret)
425 			return ret;
426 
427 		ret = xe_vm_rebind(vm, false);
428 		if (ret)
429 			return ret;
430 	} while (!list_empty(&vm->gpuvm.evict.list));
431 
432 	drm_exec_for_each_locked_object(exec, index, obj) {
433 		ret = dma_resv_reserve_fences(obj->resv, num_fences);
434 		if (ret)
435 			return ret;
436 	}
437 
438 	return 0;
439 }
440 
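/*
 * Usage sketch (editor's illustration): this is intended to be called from
 * within a drm_exec locking loop, with @num_fences covering only the fences
 * the caller itself will add afterwards (e.g. a single job fence on an exec
 * path); per the kernel-doc above it does not include what the rebinds and
 * validations themselves need. lock_vm_and_bos() below is a hypothetical
 * helper standing in for the caller's locking step:
 *
 *	drm_exec_until_all_locked(&exec) {
 *		err = lock_vm_and_bos(&exec, vm);
 *		if (!err)
 *			err = xe_vm_validate_rebind(vm, &exec, 1);
 *		drm_exec_retry_on_contention(&exec);
 *	}
 *
 * xe_preempt_work_begin() below is the in-tree example of this pattern.
 */
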
441 static int xe_preempt_work_begin(struct drm_exec *exec, struct xe_vm *vm,
442 				 bool *done)
443 {
444 	int err;
445 
446 	err = drm_gpuvm_prepare_vm(&vm->gpuvm, exec, 0);
447 	if (err)
448 		return err;
449 
450 	if (xe_vm_is_idle(vm)) {
451 		vm->preempt.rebind_deactivated = true;
452 		*done = true;
453 		return 0;
454 	}
455 
456 	if (!preempt_fences_waiting(vm)) {
457 		*done = true;
458 		return 0;
459 	}
460 
461 	err = drm_gpuvm_prepare_objects(&vm->gpuvm, exec, 0);
462 	if (err)
463 		return err;
464 
465 	err = wait_for_existing_preempt_fences(vm);
466 	if (err)
467 		return err;
468 
469 	/*
470 	 * Add validation and rebinding to the locking loop since both can
471 	 * cause evictions which may require blocking dma_resv locks.
472 	 * The fence reservation here is intended for the new preempt fences
473 	 * we attach at the end of the rebind work.
474 	 */
475 	return xe_vm_validate_rebind(vm, exec, vm->preempt.num_exec_queues);
476 }
477 
478 static void preempt_rebind_work_func(struct work_struct *w)
479 {
480 	struct xe_vm *vm = container_of(w, struct xe_vm, preempt.rebind_work);
481 	struct drm_exec exec;
482 	unsigned int fence_count = 0;
483 	LIST_HEAD(preempt_fences);
484 	ktime_t end = 0;
485 	int err = 0;
486 	long wait;
487 	int __maybe_unused tries = 0;
488 
489 	xe_assert(vm->xe, xe_vm_in_preempt_fence_mode(vm));
490 	trace_xe_vm_rebind_worker_enter(vm);
491 
492 	down_write(&vm->lock);
493 
494 	if (xe_vm_is_closed_or_banned(vm)) {
495 		up_write(&vm->lock);
496 		trace_xe_vm_rebind_worker_exit(vm);
497 		return;
498 	}
499 
500 retry:
501 	if (xe_vm_userptr_check_repin(vm)) {
502 		err = xe_vm_userptr_pin(vm);
503 		if (err)
504 			goto out_unlock_outer;
505 	}
506 
507 	drm_exec_init(&exec, DRM_EXEC_INTERRUPTIBLE_WAIT, 0);
508 
509 	drm_exec_until_all_locked(&exec) {
510 		bool done = false;
511 
512 		err = xe_preempt_work_begin(&exec, vm, &done);
513 		drm_exec_retry_on_contention(&exec);
514 		if (err || done) {
515 			drm_exec_fini(&exec);
516 			if (err && xe_vm_validate_should_retry(&exec, err, &end))
517 				err = -EAGAIN;
518 
519 			goto out_unlock_outer;
520 		}
521 	}
522 
523 	err = alloc_preempt_fences(vm, &preempt_fences, &fence_count);
524 	if (err)
525 		goto out_unlock;
526 
527 	err = xe_vm_rebind(vm, true);
528 	if (err)
529 		goto out_unlock;
530 
531 	/* Wait on rebinds and munmap style VM unbinds */
532 	wait = dma_resv_wait_timeout(xe_vm_resv(vm),
533 				     DMA_RESV_USAGE_KERNEL,
534 				     false, MAX_SCHEDULE_TIMEOUT);
535 	if (wait <= 0) {
536 		err = -ETIME;
537 		goto out_unlock;
538 	}
539 
540 #define retry_required(__tries, __vm) \
541 	(IS_ENABLED(CONFIG_DRM_XE_USERPTR_INVAL_INJECT) ? \
542 	(!(__tries)++ || __xe_vm_userptr_needs_repin(__vm)) : \
543 	__xe_vm_userptr_needs_repin(__vm))
544 
545 	down_read(&vm->userptr.notifier_lock);
546 	if (retry_required(tries, vm)) {
547 		up_read(&vm->userptr.notifier_lock);
548 		err = -EAGAIN;
549 		goto out_unlock;
550 	}
551 
552 #undef retry_required
553 
554 	spin_lock(&vm->xe->ttm.lru_lock);
555 	ttm_lru_bulk_move_tail(&vm->lru_bulk_move);
556 	spin_unlock(&vm->xe->ttm.lru_lock);
557 
558 	/* Point of no return. */
559 	arm_preempt_fences(vm, &preempt_fences);
560 	resume_and_reinstall_preempt_fences(vm, &exec);
561 	up_read(&vm->userptr.notifier_lock);
562 
563 out_unlock:
564 	drm_exec_fini(&exec);
565 out_unlock_outer:
566 	if (err == -EAGAIN) {
567 		trace_xe_vm_rebind_worker_retry(vm);
568 		goto retry;
569 	}
570 
571 	if (err) {
572 		drm_warn(&vm->xe->drm, "VM worker error: %d\n", err);
573 		xe_vm_kill(vm, true);
574 	}
575 	up_write(&vm->lock);
576 
577 	free_preempt_fences(&preempt_fences);
578 
579 	trace_xe_vm_rebind_worker_exit(vm);
580 }
581 
582 static bool vma_userptr_invalidate(struct mmu_interval_notifier *mni,
583 				   const struct mmu_notifier_range *range,
584 				   unsigned long cur_seq)
585 {
586 	struct xe_userptr *userptr = container_of(mni, typeof(*userptr), notifier);
587 	struct xe_userptr_vma *uvma = container_of(userptr, typeof(*uvma), userptr);
588 	struct xe_vma *vma = &uvma->vma;
589 	struct xe_vm *vm = xe_vma_vm(vma);
590 	struct dma_resv_iter cursor;
591 	struct dma_fence *fence;
592 	long err;
593 
594 	xe_assert(vm->xe, xe_vma_is_userptr(vma));
595 	trace_xe_vma_userptr_invalidate(vma);
596 
597 	if (!mmu_notifier_range_blockable(range))
598 		return false;
599 
600 	vm_dbg(&xe_vma_vm(vma)->xe->drm,
601 	       "NOTIFIER: addr=0x%016llx, range=0x%016llx",
602 		xe_vma_start(vma), xe_vma_size(vma));
603 
604 	down_write(&vm->userptr.notifier_lock);
605 	mmu_interval_set_seq(mni, cur_seq);
606 
607 	/* No need to stop gpu access if the userptr is not yet bound. */
608 	if (!userptr->initial_bind) {
609 		up_write(&vm->userptr.notifier_lock);
610 		return true;
611 	}
612 
613 	/*
614 	 * Tell exec and rebind worker they need to repin and rebind this
615 	 * userptr.
616 	 */
617 	if (!xe_vm_in_fault_mode(vm) &&
618 	    !(vma->gpuva.flags & XE_VMA_DESTROYED) && vma->tile_present) {
619 		spin_lock(&vm->userptr.invalidated_lock);
620 		list_move_tail(&userptr->invalidate_link,
621 			       &vm->userptr.invalidated);
622 		spin_unlock(&vm->userptr.invalidated_lock);
623 	}
624 
625 	up_write(&vm->userptr.notifier_lock);
626 
627 	/*
628 	 * Preempt fences turn into schedule disables, pipeline these.
629 	 * Note that even in fault mode, we need to wait for binds and
630 	 * unbinds to complete, and those are attached as BOOKKEEP fences
631 	 * to the vm.
632 	 */
633 	dma_resv_iter_begin(&cursor, xe_vm_resv(vm),
634 			    DMA_RESV_USAGE_BOOKKEEP);
635 	dma_resv_for_each_fence_unlocked(&cursor, fence)
636 		dma_fence_enable_sw_signaling(fence);
637 	dma_resv_iter_end(&cursor);
638 
639 	err = dma_resv_wait_timeout(xe_vm_resv(vm),
640 				    DMA_RESV_USAGE_BOOKKEEP,
641 				    false, MAX_SCHEDULE_TIMEOUT);
642 	XE_WARN_ON(err <= 0);
643 
644 	if (xe_vm_in_fault_mode(vm)) {
645 		err = xe_vm_invalidate_vma(vma);
646 		XE_WARN_ON(err);
647 	}
648 
649 	trace_xe_vma_userptr_invalidate_complete(vma);
650 
651 	return true;
652 }
653 
654 static const struct mmu_interval_notifier_ops vma_userptr_notifier_ops = {
655 	.invalidate = vma_userptr_invalidate,
656 };
657 
658 int xe_vm_userptr_pin(struct xe_vm *vm)
659 {
660 	struct xe_userptr_vma *uvma, *next;
661 	int err = 0;
662 	LIST_HEAD(tmp_evict);
663 
664 	xe_assert(vm->xe, !xe_vm_in_fault_mode(vm));
665 	lockdep_assert_held_write(&vm->lock);
666 
667 	/* Collect invalidated userptrs */
668 	spin_lock(&vm->userptr.invalidated_lock);
669 	xe_assert(vm->xe, list_empty(&vm->userptr.repin_list));
670 	list_for_each_entry_safe(uvma, next, &vm->userptr.invalidated,
671 				 userptr.invalidate_link) {
672 		list_del_init(&uvma->userptr.invalidate_link);
673 		list_add_tail(&uvma->userptr.repin_link,
674 			      &vm->userptr.repin_list);
675 	}
676 	spin_unlock(&vm->userptr.invalidated_lock);
677 
678 	/* Pin and move to bind list */
679 	list_for_each_entry_safe(uvma, next, &vm->userptr.repin_list,
680 				 userptr.repin_link) {
681 		err = xe_vma_userptr_pin_pages(uvma);
682 		if (err == -EFAULT) {
683 			list_del_init(&uvma->userptr.repin_link);
684 			/*
685 			 * We might have already done the pin once, but then
686 			 * had to retry before the re-bind happened, due to
687 			 * some other condition in the caller, but in the
688 			 * meantime the userptr got dinged by the notifier such
689 			 * that we need to revalidate here, but this time we hit
690 			 * the EFAULT. In such a case make sure we remove
691 			 * ourselves from the rebind list to avoid going down in
692 			 * flames.
693 			 */
694 			if (!list_empty(&uvma->vma.combined_links.rebind))
695 				list_del_init(&uvma->vma.combined_links.rebind);
696 
697 			/* Wait for pending binds */
698 			xe_vm_lock(vm, false);
699 			dma_resv_wait_timeout(xe_vm_resv(vm),
700 					      DMA_RESV_USAGE_BOOKKEEP,
701 					      false, MAX_SCHEDULE_TIMEOUT);
702 
703 			err = xe_vm_invalidate_vma(&uvma->vma);
704 			xe_vm_unlock(vm);
705 			if (err)
706 				break;
707 		} else {
708 			if (err)
709 				break;
710 
711 			list_del_init(&uvma->userptr.repin_link);
712 			list_move_tail(&uvma->vma.combined_links.rebind,
713 				       &vm->rebind_list);
714 		}
715 	}
716 
717 	if (err) {
718 		down_write(&vm->userptr.notifier_lock);
719 		spin_lock(&vm->userptr.invalidated_lock);
720 		list_for_each_entry_safe(uvma, next, &vm->userptr.repin_list,
721 					 userptr.repin_link) {
722 			list_del_init(&uvma->userptr.repin_link);
723 			list_move_tail(&uvma->userptr.invalidate_link,
724 				       &vm->userptr.invalidated);
725 		}
726 		spin_unlock(&vm->userptr.invalidated_lock);
727 		up_write(&vm->userptr.notifier_lock);
728 	}
729 	return err;
730 }
731 
732 /**
733  * xe_vm_userptr_check_repin() - Check whether the VM might have userptrs
734  * that need repinning.
735  * @vm: The VM.
736  *
737  * This function does an advisory check for whether the VM has userptrs that
738  * need repinning.
739  *
740  * Return: 0 if there are no indications of userptrs needing repinning,
741  * -EAGAIN if there are.
742  */
743 int xe_vm_userptr_check_repin(struct xe_vm *vm)
744 {
745 	return (list_empty_careful(&vm->userptr.repin_list) &&
746 		list_empty_careful(&vm->userptr.invalidated)) ? 0 : -EAGAIN;
747 }
748 
749 static int xe_vma_ops_alloc(struct xe_vma_ops *vops, bool array_of_binds)
750 {
751 	int i;
752 
753 	for (i = 0; i < XE_MAX_TILES_PER_DEVICE; ++i) {
754 		if (!vops->pt_update_ops[i].num_ops)
755 			continue;
756 
757 		vops->pt_update_ops[i].ops =
758 			kmalloc_array(vops->pt_update_ops[i].num_ops,
759 				      sizeof(*vops->pt_update_ops[i].ops),
760 				      GFP_KERNEL | __GFP_RETRY_MAYFAIL | __GFP_NOWARN);
761 		if (!vops->pt_update_ops[i].ops)
762 			return array_of_binds ? -ENOBUFS : -ENOMEM;
763 	}
764 
765 	return 0;
766 }
767 ALLOW_ERROR_INJECTION(xe_vma_ops_alloc, ERRNO);
768 
769 static void xe_vma_ops_fini(struct xe_vma_ops *vops)
770 {
771 	int i;
772 
773 	for (i = 0; i < XE_MAX_TILES_PER_DEVICE; ++i)
774 		kfree(vops->pt_update_ops[i].ops);
775 }
776 
777 static void xe_vma_ops_incr_pt_update_ops(struct xe_vma_ops *vops, u8 tile_mask)
778 {
779 	int i;
780 
781 	for (i = 0; i < XE_MAX_TILES_PER_DEVICE; ++i)
782 		if (BIT(i) & tile_mask)
783 			++vops->pt_update_ops[i].num_ops;
784 }
785 
786 static void xe_vm_populate_rebind(struct xe_vma_op *op, struct xe_vma *vma,
787 				  u8 tile_mask)
788 {
789 	INIT_LIST_HEAD(&op->link);
790 	op->tile_mask = tile_mask;
791 	op->base.op = DRM_GPUVA_OP_MAP;
792 	op->base.map.va.addr = vma->gpuva.va.addr;
793 	op->base.map.va.range = vma->gpuva.va.range;
794 	op->base.map.gem.obj = vma->gpuva.gem.obj;
795 	op->base.map.gem.offset = vma->gpuva.gem.offset;
796 	op->map.vma = vma;
797 	op->map.immediate = true;
798 	op->map.dumpable = vma->gpuva.flags & XE_VMA_DUMPABLE;
799 	op->map.is_null = xe_vma_is_null(vma);
800 }
801 
802 static int xe_vm_ops_add_rebind(struct xe_vma_ops *vops, struct xe_vma *vma,
803 				u8 tile_mask)
804 {
805 	struct xe_vma_op *op;
806 
807 	op = kzalloc(sizeof(*op), GFP_KERNEL);
808 	if (!op)
809 		return -ENOMEM;
810 
811 	xe_vm_populate_rebind(op, vma, tile_mask);
812 	list_add_tail(&op->link, &vops->list);
813 	xe_vma_ops_incr_pt_update_ops(vops, tile_mask);
814 
815 	return 0;
816 }
817 
818 static struct dma_fence *ops_execute(struct xe_vm *vm,
819 				     struct xe_vma_ops *vops);
820 static void xe_vma_ops_init(struct xe_vma_ops *vops, struct xe_vm *vm,
821 			    struct xe_exec_queue *q,
822 			    struct xe_sync_entry *syncs, u32 num_syncs);
823 
824 int xe_vm_rebind(struct xe_vm *vm, bool rebind_worker)
825 {
826 	struct dma_fence *fence;
827 	struct xe_vma *vma, *next;
828 	struct xe_vma_ops vops;
829 	struct xe_vma_op *op, *next_op;
830 	int err, i;
831 
832 	lockdep_assert_held(&vm->lock);
833 	if ((xe_vm_in_lr_mode(vm) && !rebind_worker) ||
834 	    list_empty(&vm->rebind_list))
835 		return 0;
836 
837 	xe_vma_ops_init(&vops, vm, NULL, NULL, 0);
838 	for (i = 0; i < XE_MAX_TILES_PER_DEVICE; ++i)
839 		vops.pt_update_ops[i].wait_vm_bookkeep = true;
840 
841 	xe_vm_assert_held(vm);
842 	list_for_each_entry(vma, &vm->rebind_list, combined_links.rebind) {
843 		xe_assert(vm->xe, vma->tile_present);
844 
845 		if (rebind_worker)
846 			trace_xe_vma_rebind_worker(vma);
847 		else
848 			trace_xe_vma_rebind_exec(vma);
849 
850 		err = xe_vm_ops_add_rebind(&vops, vma,
851 					   vma->tile_present);
852 		if (err)
853 			goto free_ops;
854 	}
855 
856 	err = xe_vma_ops_alloc(&vops, false);
857 	if (err)
858 		goto free_ops;
859 
860 	fence = ops_execute(vm, &vops);
861 	if (IS_ERR(fence)) {
862 		err = PTR_ERR(fence);
863 	} else {
864 		dma_fence_put(fence);
865 		list_for_each_entry_safe(vma, next, &vm->rebind_list,
866 					 combined_links.rebind)
867 			list_del_init(&vma->combined_links.rebind);
868 	}
869 free_ops:
870 	list_for_each_entry_safe(op, next_op, &vops.list, link) {
871 		list_del(&op->link);
872 		kfree(op);
873 	}
874 	xe_vma_ops_fini(&vops);
875 
876 	return err;
877 }
878 
879 struct dma_fence *xe_vma_rebind(struct xe_vm *vm, struct xe_vma *vma, u8 tile_mask)
880 {
881 	struct dma_fence *fence = NULL;
882 	struct xe_vma_ops vops;
883 	struct xe_vma_op *op, *next_op;
884 	struct xe_tile *tile;
885 	u8 id;
886 	int err;
887 
888 	lockdep_assert_held(&vm->lock);
889 	xe_vm_assert_held(vm);
890 	xe_assert(vm->xe, xe_vm_in_fault_mode(vm));
891 
892 	xe_vma_ops_init(&vops, vm, NULL, NULL, 0);
893 	for_each_tile(tile, vm->xe, id) {
894 		vops.pt_update_ops[id].wait_vm_bookkeep = true;
895 		vops.pt_update_ops[tile->id].q =
896 			xe_tile_migrate_exec_queue(tile);
897 	}
898 
899 	err = xe_vm_ops_add_rebind(&vops, vma, tile_mask);
900 	if (err)
901 		return ERR_PTR(err);
902 
903 	err = xe_vma_ops_alloc(&vops, false);
904 	if (err) {
905 		fence = ERR_PTR(err);
906 		goto free_ops;
907 	}
908 
909 	fence = ops_execute(vm, &vops);
910 
911 free_ops:
912 	list_for_each_entry_safe(op, next_op, &vops.list, link) {
913 		list_del(&op->link);
914 		kfree(op);
915 	}
916 	xe_vma_ops_fini(&vops);
917 
918 	return fence;
919 }
920 
921 static void xe_vma_free(struct xe_vma *vma)
922 {
923 	if (xe_vma_is_userptr(vma))
924 		kfree(to_userptr_vma(vma));
925 	else
926 		kfree(vma);
927 }
928 
929 #define VMA_CREATE_FLAG_READ_ONLY	BIT(0)
930 #define VMA_CREATE_FLAG_IS_NULL		BIT(1)
931 #define VMA_CREATE_FLAG_DUMPABLE	BIT(2)
932 
933 static struct xe_vma *xe_vma_create(struct xe_vm *vm,
934 				    struct xe_bo *bo,
935 				    u64 bo_offset_or_userptr,
936 				    u64 start, u64 end,
937 				    u16 pat_index, unsigned int flags)
938 {
939 	struct xe_vma *vma;
940 	struct xe_tile *tile;
941 	u8 id;
942 	bool read_only = (flags & VMA_CREATE_FLAG_READ_ONLY);
943 	bool is_null = (flags & VMA_CREATE_FLAG_IS_NULL);
944 	bool dumpable = (flags & VMA_CREATE_FLAG_DUMPABLE);
945 
946 	xe_assert(vm->xe, start < end);
947 	xe_assert(vm->xe, end < vm->size);
948 
949 	/*
950 	 * Allocate and ensure that the xe_vma_is_userptr() return
951 	 * matches what was allocated.
952 	 */
953 	if (!bo && !is_null) {
954 		struct xe_userptr_vma *uvma = kzalloc(sizeof(*uvma), GFP_KERNEL);
955 
956 		if (!uvma)
957 			return ERR_PTR(-ENOMEM);
958 
959 		vma = &uvma->vma;
960 	} else {
961 		vma = kzalloc(sizeof(*vma), GFP_KERNEL);
962 		if (!vma)
963 			return ERR_PTR(-ENOMEM);
964 
965 		if (is_null)
966 			vma->gpuva.flags |= DRM_GPUVA_SPARSE;
967 		if (bo)
968 			vma->gpuva.gem.obj = &bo->ttm.base;
969 	}
970 
971 	INIT_LIST_HEAD(&vma->combined_links.rebind);
972 
973 	INIT_LIST_HEAD(&vma->gpuva.gem.entry);
974 	vma->gpuva.vm = &vm->gpuvm;
975 	vma->gpuva.va.addr = start;
976 	vma->gpuva.va.range = end - start + 1;
977 	if (read_only)
978 		vma->gpuva.flags |= XE_VMA_READ_ONLY;
979 	if (dumpable)
980 		vma->gpuva.flags |= XE_VMA_DUMPABLE;
981 
982 	for_each_tile(tile, vm->xe, id)
983 		vma->tile_mask |= 0x1 << id;
984 
985 	if (vm->xe->info.has_atomic_enable_pte_bit)
986 		vma->gpuva.flags |= XE_VMA_ATOMIC_PTE_BIT;
987 
988 	vma->pat_index = pat_index;
989 
990 	if (bo) {
991 		struct drm_gpuvm_bo *vm_bo;
992 
993 		xe_bo_assert_held(bo);
994 
995 		vm_bo = drm_gpuvm_bo_obtain(vma->gpuva.vm, &bo->ttm.base);
996 		if (IS_ERR(vm_bo)) {
997 			xe_vma_free(vma);
998 			return ERR_CAST(vm_bo);
999 		}
1000 
1001 		drm_gpuvm_bo_extobj_add(vm_bo);
1002 		drm_gem_object_get(&bo->ttm.base);
1003 		vma->gpuva.gem.offset = bo_offset_or_userptr;
1004 		drm_gpuva_link(&vma->gpuva, vm_bo);
1005 		drm_gpuvm_bo_put(vm_bo);
1006 	} else /* userptr or null */ {
1007 		if (!is_null) {
1008 			struct xe_userptr *userptr = &to_userptr_vma(vma)->userptr;
1009 			u64 size = end - start + 1;
1010 			int err;
1011 
1012 			INIT_LIST_HEAD(&userptr->invalidate_link);
1013 			INIT_LIST_HEAD(&userptr->repin_link);
1014 			vma->gpuva.gem.offset = bo_offset_or_userptr;
1015 
1016 			err = mmu_interval_notifier_insert(&userptr->notifier,
1017 							   current->mm,
1018 							   xe_vma_userptr(vma), size,
1019 							   &vma_userptr_notifier_ops);
1020 			if (err) {
1021 				xe_vma_free(vma);
1022 				return ERR_PTR(err);
1023 			}
1024 
1025 			userptr->notifier_seq = LONG_MAX;
1026 		}
1027 
1028 		xe_vm_get(vm);
1029 	}
1030 
1031 	return vma;
1032 }
1033 
1034 static void xe_vma_destroy_late(struct xe_vma *vma)
1035 {
1036 	struct xe_vm *vm = xe_vma_vm(vma);
1037 
1038 	if (vma->ufence) {
1039 		xe_sync_ufence_put(vma->ufence);
1040 		vma->ufence = NULL;
1041 	}
1042 
1043 	if (xe_vma_is_userptr(vma)) {
1044 		struct xe_userptr_vma *uvma = to_userptr_vma(vma);
1045 		struct xe_userptr *userptr = &uvma->userptr;
1046 
1047 		if (userptr->sg)
1048 			xe_hmm_userptr_free_sg(uvma);
1049 
1050 		/*
1051 		 * Since userptr pages are not pinned, we can't remove
1052 		 * the notifier until we're sure the GPU is not accessing
1053 		 * them anymore
1054 		 */
1055 		mmu_interval_notifier_remove(&userptr->notifier);
1056 		xe_vm_put(vm);
1057 	} else if (xe_vma_is_null(vma)) {
1058 		xe_vm_put(vm);
1059 	} else {
1060 		xe_bo_put(xe_vma_bo(vma));
1061 	}
1062 
1063 	xe_vma_free(vma);
1064 }
1065 
1066 static void vma_destroy_work_func(struct work_struct *w)
1067 {
1068 	struct xe_vma *vma =
1069 		container_of(w, struct xe_vma, destroy_work);
1070 
1071 	xe_vma_destroy_late(vma);
1072 }
1073 
1074 static void vma_destroy_cb(struct dma_fence *fence,
1075 			   struct dma_fence_cb *cb)
1076 {
1077 	struct xe_vma *vma = container_of(cb, struct xe_vma, destroy_cb);
1078 
1079 	INIT_WORK(&vma->destroy_work, vma_destroy_work_func);
1080 	queue_work(system_unbound_wq, &vma->destroy_work);
1081 }
1082 
1083 static void xe_vma_destroy(struct xe_vma *vma, struct dma_fence *fence)
1084 {
1085 	struct xe_vm *vm = xe_vma_vm(vma);
1086 
1087 	lockdep_assert_held_write(&vm->lock);
1088 	xe_assert(vm->xe, list_empty(&vma->combined_links.destroy));
1089 
1090 	if (xe_vma_is_userptr(vma)) {
1091 		xe_assert(vm->xe, vma->gpuva.flags & XE_VMA_DESTROYED);
1092 
1093 		spin_lock(&vm->userptr.invalidated_lock);
1094 		xe_assert(vm->xe, list_empty(&to_userptr_vma(vma)->userptr.repin_link));
1095 		list_del(&to_userptr_vma(vma)->userptr.invalidate_link);
1096 		spin_unlock(&vm->userptr.invalidated_lock);
1097 	} else if (!xe_vma_is_null(vma)) {
1098 		xe_bo_assert_held(xe_vma_bo(vma));
1099 
1100 		drm_gpuva_unlink(&vma->gpuva);
1101 	}
1102 
1103 	xe_vm_assert_held(vm);
1104 	if (fence) {
1105 		int ret = dma_fence_add_callback(fence, &vma->destroy_cb,
1106 						 vma_destroy_cb);
1107 
1108 		if (ret) {
1109 			XE_WARN_ON(ret != -ENOENT);
1110 			xe_vma_destroy_late(vma);
1111 		}
1112 	} else {
1113 		xe_vma_destroy_late(vma);
1114 	}
1115 }
1116 
1117 /**
1118  * xe_vm_lock_vma() - drm_exec utility to lock a vma
1119  * @exec: The drm_exec object we're currently locking for.
1120  * @vma: The vma for which we want to lock the vm resv and any attached
1121  * object's resv.
1122  *
1123  * Return: 0 on success, negative error code on error. In particular
1124  * may return -EDEADLK on WW transaction contention and -EINTR if
1125  * an interruptible wait is terminated by a signal.
1126  */
1127 int xe_vm_lock_vma(struct drm_exec *exec, struct xe_vma *vma)
1128 {
1129 	struct xe_vm *vm = xe_vma_vm(vma);
1130 	struct xe_bo *bo = xe_vma_bo(vma);
1131 	int err;
1132 
1133 	XE_WARN_ON(!vm);
1134 
1135 	err = drm_exec_lock_obj(exec, xe_vm_obj(vm));
1136 	if (!err && bo && !bo->vm)
1137 		err = drm_exec_lock_obj(exec, &bo->ttm.base);
1138 
1139 	return err;
1140 }
1141 
1142 static void xe_vma_destroy_unlocked(struct xe_vma *vma)
1143 {
1144 	struct drm_exec exec;
1145 	int err;
1146 
1147 	drm_exec_init(&exec, 0, 0);
1148 	drm_exec_until_all_locked(&exec) {
1149 		err = xe_vm_lock_vma(&exec, vma);
1150 		drm_exec_retry_on_contention(&exec);
1151 		if (XE_WARN_ON(err))
1152 			break;
1153 	}
1154 
1155 	xe_vma_destroy(vma, NULL);
1156 
1157 	drm_exec_fini(&exec);
1158 }
1159 
1160 struct xe_vma *
1161 xe_vm_find_overlapping_vma(struct xe_vm *vm, u64 start, u64 range)
1162 {
1163 	struct drm_gpuva *gpuva;
1164 
1165 	lockdep_assert_held(&vm->lock);
1166 
1167 	if (xe_vm_is_closed_or_banned(vm))
1168 		return NULL;
1169 
1170 	xe_assert(vm->xe, start + range <= vm->size);
1171 
1172 	gpuva = drm_gpuva_find_first(&vm->gpuvm, start, range);
1173 
1174 	return gpuva ? gpuva_to_vma(gpuva) : NULL;
1175 }
1176 
1177 static int xe_vm_insert_vma(struct xe_vm *vm, struct xe_vma *vma)
1178 {
1179 	int err;
1180 
1181 	xe_assert(vm->xe, xe_vma_vm(vma) == vm);
1182 	lockdep_assert_held(&vm->lock);
1183 
1184 	mutex_lock(&vm->snap_mutex);
1185 	err = drm_gpuva_insert(&vm->gpuvm, &vma->gpuva);
1186 	mutex_unlock(&vm->snap_mutex);
1187 	XE_WARN_ON(err);	/* Shouldn't be possible */
1188 
1189 	return err;
1190 }
1191 
1192 static void xe_vm_remove_vma(struct xe_vm *vm, struct xe_vma *vma)
1193 {
1194 	xe_assert(vm->xe, xe_vma_vm(vma) == vm);
1195 	lockdep_assert_held(&vm->lock);
1196 
1197 	mutex_lock(&vm->snap_mutex);
1198 	drm_gpuva_remove(&vma->gpuva);
1199 	mutex_unlock(&vm->snap_mutex);
1200 	if (vm->usm.last_fault_vma == vma)
1201 		vm->usm.last_fault_vma = NULL;
1202 }
1203 
1204 static struct drm_gpuva_op *xe_vm_op_alloc(void)
1205 {
1206 	struct xe_vma_op *op;
1207 
1208 	op = kzalloc(sizeof(*op), GFP_KERNEL);
1209 
1210 	if (unlikely(!op))
1211 		return NULL;
1212 
1213 	return &op->base;
1214 }
1215 
1216 static void xe_vm_free(struct drm_gpuvm *gpuvm);
1217 
1218 static const struct drm_gpuvm_ops gpuvm_ops = {
1219 	.op_alloc = xe_vm_op_alloc,
1220 	.vm_bo_validate = xe_gpuvm_validate,
1221 	.vm_free = xe_vm_free,
1222 };
1223 
1224 static u64 pde_encode_pat_index(u16 pat_index)
1225 {
1226 	u64 pte = 0;
1227 
1228 	if (pat_index & BIT(0))
1229 		pte |= XE_PPGTT_PTE_PAT0;
1230 
1231 	if (pat_index & BIT(1))
1232 		pte |= XE_PPGTT_PTE_PAT1;
1233 
1234 	return pte;
1235 }
1236 
1237 static u64 pte_encode_pat_index(u16 pat_index, u32 pt_level)
1238 {
1239 	u64 pte = 0;
1240 
1241 	if (pat_index & BIT(0))
1242 		pte |= XE_PPGTT_PTE_PAT0;
1243 
1244 	if (pat_index & BIT(1))
1245 		pte |= XE_PPGTT_PTE_PAT1;
1246 
1247 	if (pat_index & BIT(2)) {
1248 		if (pt_level)
1249 			pte |= XE_PPGTT_PDE_PDPE_PAT2;
1250 		else
1251 			pte |= XE_PPGTT_PTE_PAT2;
1252 	}
1253 
1254 	if (pat_index & BIT(3))
1255 		pte |= XELPG_PPGTT_PTE_PAT3;
1256 
1257 	if (pat_index & (BIT(4)))
1258 		pte |= XE2_PPGTT_PTE_PAT4;
1259 
1260 	return pte;
1261 }
1262 
1263 static u64 pte_encode_ps(u32 pt_level)
1264 {
1265 	XE_WARN_ON(pt_level > MAX_HUGEPTE_LEVEL);
1266 
1267 	if (pt_level == 1)
1268 		return XE_PDE_PS_2M;
1269 	else if (pt_level == 2)
1270 		return XE_PDPE_PS_1G;
1271 
1272 	return 0;
1273 }
1274 
1275 static u64 xelp_pde_encode_bo(struct xe_bo *bo, u64 bo_offset,
1276 			      const u16 pat_index)
1277 {
1278 	u64 pde;
1279 
1280 	pde = xe_bo_addr(bo, bo_offset, XE_PAGE_SIZE);
1281 	pde |= XE_PAGE_PRESENT | XE_PAGE_RW;
1282 	pde |= pde_encode_pat_index(pat_index);
1283 
1284 	return pde;
1285 }
1286 
1287 static u64 xelp_pte_encode_bo(struct xe_bo *bo, u64 bo_offset,
1288 			      u16 pat_index, u32 pt_level)
1289 {
1290 	u64 pte;
1291 
1292 	pte = xe_bo_addr(bo, bo_offset, XE_PAGE_SIZE);
1293 	pte |= XE_PAGE_PRESENT | XE_PAGE_RW;
1294 	pte |= pte_encode_pat_index(pat_index, pt_level);
1295 	pte |= pte_encode_ps(pt_level);
1296 
1297 	if (xe_bo_is_vram(bo) || xe_bo_is_stolen_devmem(bo))
1298 		pte |= XE_PPGTT_PTE_DM;
1299 
1300 	return pte;
1301 }
1302 
1303 static u64 xelp_pte_encode_vma(u64 pte, struct xe_vma *vma,
1304 			       u16 pat_index, u32 pt_level)
1305 {
1306 	pte |= XE_PAGE_PRESENT;
1307 
1308 	if (likely(!xe_vma_read_only(vma)))
1309 		pte |= XE_PAGE_RW;
1310 
1311 	pte |= pte_encode_pat_index(pat_index, pt_level);
1312 	pte |= pte_encode_ps(pt_level);
1313 
1314 	if (unlikely(xe_vma_is_null(vma)))
1315 		pte |= XE_PTE_NULL;
1316 
1317 	return pte;
1318 }
1319 
1320 static u64 xelp_pte_encode_addr(struct xe_device *xe, u64 addr,
1321 				u16 pat_index,
1322 				u32 pt_level, bool devmem, u64 flags)
1323 {
1324 	u64 pte;
1325 
1326 	/* Avoid passing random bits directly as flags */
1327 	xe_assert(xe, !(flags & ~XE_PTE_PS64));
1328 
1329 	pte = addr;
1330 	pte |= XE_PAGE_PRESENT | XE_PAGE_RW;
1331 	pte |= pte_encode_pat_index(pat_index, pt_level);
1332 	pte |= pte_encode_ps(pt_level);
1333 
1334 	if (devmem)
1335 		pte |= XE_PPGTT_PTE_DM;
1336 
1337 	pte |= flags;
1338 
1339 	return pte;
1340 }
1341 
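/*
 * Worked example (editor's illustration): for a 2M entry (pt_level == 1) in
 * device memory, with flags == 0 and a pat_index that has bits 0 and 2 set,
 * xelp_pte_encode_addr(xe, addr, pat_index, 1, true, 0) produces
 *
 *	pte = addr
 *	    | XE_PAGE_PRESENT | XE_PAGE_RW
 *	    | XE_PPGTT_PTE_PAT0 | XE_PPGTT_PDE_PDPE_PAT2
 *	    | XE_PDE_PS_2M
 *	    | XE_PPGTT_PTE_DM;
 *
 * i.e. pat_index bit 2 selects the PDE/PDPE variant of PAT2 whenever
 * pt_level != 0, and pte_encode_ps() adds the huge-page size bit.
 */
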
1342 static const struct xe_pt_ops xelp_pt_ops = {
1343 	.pte_encode_bo = xelp_pte_encode_bo,
1344 	.pte_encode_vma = xelp_pte_encode_vma,
1345 	.pte_encode_addr = xelp_pte_encode_addr,
1346 	.pde_encode_bo = xelp_pde_encode_bo,
1347 };
1348 
1349 static void vm_destroy_work_func(struct work_struct *w);
1350 
1351 /**
1352  * xe_vm_create_scratch() - Setup a scratch memory pagetable tree for the
1353  * given tile and vm.
1354  * @xe: xe device.
1355  * @tile: tile to set up for.
1356  * @vm: vm to set up for.
1357  *
1358  * Sets up a pagetable tree with one page-table per level and a single
1359  * leaf PTE. All pagetable entries point to the single page-table or,
1360  * for MAX_HUGEPTE_LEVEL, a NULL huge PTE returning 0 on read and
1361  * for MAX_HUGEPTE_LEVEL, a NULL huge PTE that returns 0 on reads and
1362  * for which writes become NOPs.
1363  * Return: 0 on success, negative error code on error.
1364  */
1365 static int xe_vm_create_scratch(struct xe_device *xe, struct xe_tile *tile,
1366 				struct xe_vm *vm)
1367 {
1368 	u8 id = tile->id;
1369 	int i;
1370 
1371 	for (i = MAX_HUGEPTE_LEVEL; i < vm->pt_root[id]->level; i++) {
1372 		vm->scratch_pt[id][i] = xe_pt_create(vm, tile, i);
1373 		if (IS_ERR(vm->scratch_pt[id][i]))
1374 			return PTR_ERR(vm->scratch_pt[id][i]);
1375 
1376 		xe_pt_populate_empty(tile, vm, vm->scratch_pt[id][i]);
1377 	}
1378 
1379 	return 0;
1380 }
1381 ALLOW_ERROR_INJECTION(xe_vm_create_scratch, ERRNO);
1382 
1383 static void xe_vm_free_scratch(struct xe_vm *vm)
1384 {
1385 	struct xe_tile *tile;
1386 	u8 id;
1387 
1388 	if (!xe_vm_has_scratch(vm))
1389 		return;
1390 
1391 	for_each_tile(tile, vm->xe, id) {
1392 		u32 i;
1393 
1394 		if (!vm->pt_root[id])
1395 			continue;
1396 
1397 		for (i = MAX_HUGEPTE_LEVEL; i < vm->pt_root[id]->level; ++i)
1398 			if (vm->scratch_pt[id][i])
1399 				xe_pt_destroy(vm->scratch_pt[id][i], vm->flags, NULL);
1400 	}
1401 }
1402 
1403 struct xe_vm *xe_vm_create(struct xe_device *xe, u32 flags)
1404 {
1405 	struct drm_gem_object *vm_resv_obj;
1406 	struct xe_vm *vm;
1407 	int err, number_tiles = 0;
1408 	struct xe_tile *tile;
1409 	u8 id;
1410 
1411 	vm = kzalloc(sizeof(*vm), GFP_KERNEL);
1412 	if (!vm)
1413 		return ERR_PTR(-ENOMEM);
1414 
1415 	vm->xe = xe;
1416 
1417 	vm->size = 1ull << xe->info.va_bits;
1418 
1419 	vm->flags = flags;
1420 
1421 	init_rwsem(&vm->lock);
1422 	mutex_init(&vm->snap_mutex);
1423 
1424 	INIT_LIST_HEAD(&vm->rebind_list);
1425 
1426 	INIT_LIST_HEAD(&vm->userptr.repin_list);
1427 	INIT_LIST_HEAD(&vm->userptr.invalidated);
1428 	init_rwsem(&vm->userptr.notifier_lock);
1429 	spin_lock_init(&vm->userptr.invalidated_lock);
1430 
1431 	ttm_lru_bulk_move_init(&vm->lru_bulk_move);
1432 
1433 	INIT_WORK(&vm->destroy_work, vm_destroy_work_func);
1434 
1435 	INIT_LIST_HEAD(&vm->preempt.exec_queues);
1436 	vm->preempt.min_run_period_ms = 10;	/* FIXME: Wire up to uAPI */
1437 
1438 	for_each_tile(tile, xe, id)
1439 		xe_range_fence_tree_init(&vm->rftree[id]);
1440 
1441 	vm->pt_ops = &xelp_pt_ops;
1442 
1443 	/*
1444 	 * Long-running workloads are not protected by the scheduler references.
1445 	 * By design, run_job for long-running workloads returns NULL and the
1446 	 * scheduler drops all of its references, hence protecting the VM
1447 	 * for this case is necessary.
1448 	 */
1449 	if (flags & XE_VM_FLAG_LR_MODE)
1450 		xe_pm_runtime_get_noresume(xe);
1451 
1452 	vm_resv_obj = drm_gpuvm_resv_object_alloc(&xe->drm);
1453 	if (!vm_resv_obj) {
1454 		err = -ENOMEM;
1455 		goto err_no_resv;
1456 	}
1457 
1458 	drm_gpuvm_init(&vm->gpuvm, "Xe VM", DRM_GPUVM_RESV_PROTECTED, &xe->drm,
1459 		       vm_resv_obj, 0, vm->size, 0, 0, &gpuvm_ops);
1460 
1461 	drm_gem_object_put(vm_resv_obj);
1462 
1463 	err = xe_vm_lock(vm, true);
1464 	if (err)
1465 		goto err_close;
1466 
1467 	if (IS_DGFX(xe) && xe->info.vram_flags & XE_VRAM_FLAGS_NEED64K)
1468 		vm->flags |= XE_VM_FLAG_64K;
1469 
1470 	for_each_tile(tile, xe, id) {
1471 		if (flags & XE_VM_FLAG_MIGRATION &&
1472 		    tile->id != XE_VM_FLAG_TILE_ID(flags))
1473 			continue;
1474 
1475 		vm->pt_root[id] = xe_pt_create(vm, tile, xe->info.vm_max_level);
1476 		if (IS_ERR(vm->pt_root[id])) {
1477 			err = PTR_ERR(vm->pt_root[id]);
1478 			vm->pt_root[id] = NULL;
1479 			goto err_unlock_close;
1480 		}
1481 	}
1482 
1483 	if (xe_vm_has_scratch(vm)) {
1484 		for_each_tile(tile, xe, id) {
1485 			if (!vm->pt_root[id])
1486 				continue;
1487 
1488 			err = xe_vm_create_scratch(xe, tile, vm);
1489 			if (err)
1490 				goto err_unlock_close;
1491 		}
1492 		vm->batch_invalidate_tlb = true;
1493 	}
1494 
1495 	if (vm->flags & XE_VM_FLAG_LR_MODE) {
1496 		INIT_WORK(&vm->preempt.rebind_work, preempt_rebind_work_func);
1497 		vm->batch_invalidate_tlb = false;
1498 	}
1499 
1500 	/* Fill pt_root after allocating scratch tables */
1501 	for_each_tile(tile, xe, id) {
1502 		if (!vm->pt_root[id])
1503 			continue;
1504 
1505 		xe_pt_populate_empty(tile, vm, vm->pt_root[id]);
1506 	}
1507 	xe_vm_unlock(vm);
1508 
1509 	/* Kernel migration VM shouldn't have a circular loop. */
1510 	if (!(flags & XE_VM_FLAG_MIGRATION)) {
1511 		for_each_tile(tile, xe, id) {
1512 			struct xe_exec_queue *q;
1513 			u32 create_flags = EXEC_QUEUE_FLAG_VM;
1514 
1515 			if (!vm->pt_root[id])
1516 				continue;
1517 
1518 			q = xe_exec_queue_create_bind(xe, tile, create_flags, 0);
1519 			if (IS_ERR(q)) {
1520 				err = PTR_ERR(q);
1521 				goto err_close;
1522 			}
1523 			vm->q[id] = q;
1524 			number_tiles++;
1525 		}
1526 	}
1527 
1528 	if (number_tiles > 1)
1529 		vm->composite_fence_ctx = dma_fence_context_alloc(1);
1530 
1531 	trace_xe_vm_create(vm);
1532 
1533 	return vm;
1534 
1535 err_unlock_close:
1536 	xe_vm_unlock(vm);
1537 err_close:
1538 	xe_vm_close_and_put(vm);
1539 	return ERR_PTR(err);
1540 
1541 err_no_resv:
1542 	mutex_destroy(&vm->snap_mutex);
1543 	for_each_tile(tile, xe, id)
1544 		xe_range_fence_tree_fini(&vm->rftree[id]);
1545 	ttm_lru_bulk_move_fini(&xe->ttm, &vm->lru_bulk_move);
1546 	kfree(vm);
1547 	if (flags & XE_VM_FLAG_LR_MODE)
1548 		xe_pm_runtime_put(xe);
1549 	return ERR_PTR(err);
1550 }
1551 
1552 static void xe_vm_close(struct xe_vm *vm)
1553 {
1554 	down_write(&vm->lock);
1555 	vm->size = 0;
1556 	up_write(&vm->lock);
1557 }
1558 
1559 void xe_vm_close_and_put(struct xe_vm *vm)
1560 {
1561 	LIST_HEAD(contested);
1562 	struct xe_device *xe = vm->xe;
1563 	struct xe_tile *tile;
1564 	struct xe_vma *vma, *next_vma;
1565 	struct drm_gpuva *gpuva, *next;
1566 	u8 id;
1567 
1568 	xe_assert(xe, !vm->preempt.num_exec_queues);
1569 
1570 	xe_vm_close(vm);
1571 	if (xe_vm_in_preempt_fence_mode(vm))
1572 		flush_work(&vm->preempt.rebind_work);
1573 
1574 	down_write(&vm->lock);
1575 	for_each_tile(tile, xe, id) {
1576 		if (vm->q[id])
1577 			xe_exec_queue_last_fence_put(vm->q[id], vm);
1578 	}
1579 	up_write(&vm->lock);
1580 
1581 	for_each_tile(tile, xe, id) {
1582 		if (vm->q[id]) {
1583 			xe_exec_queue_kill(vm->q[id]);
1584 			xe_exec_queue_put(vm->q[id]);
1585 			vm->q[id] = NULL;
1586 		}
1587 	}
1588 
1589 	down_write(&vm->lock);
1590 	xe_vm_lock(vm, false);
1591 	drm_gpuvm_for_each_va_safe(gpuva, next, &vm->gpuvm) {
1592 		vma = gpuva_to_vma(gpuva);
1593 
1594 		if (xe_vma_has_no_bo(vma)) {
1595 			down_read(&vm->userptr.notifier_lock);
1596 			vma->gpuva.flags |= XE_VMA_DESTROYED;
1597 			up_read(&vm->userptr.notifier_lock);
1598 		}
1599 
1600 		xe_vm_remove_vma(vm, vma);
1601 
1602 		/* easy case, remove from VMA? */
1603 		if (xe_vma_has_no_bo(vma) || xe_vma_bo(vma)->vm) {
1604 			list_del_init(&vma->combined_links.rebind);
1605 			xe_vma_destroy(vma, NULL);
1606 			continue;
1607 		}
1608 
1609 		list_move_tail(&vma->combined_links.destroy, &contested);
1610 		vma->gpuva.flags |= XE_VMA_DESTROYED;
1611 	}
1612 
1613 	/*
1614 	 * All vm operations will add shared fences to resv.
1615 	 * The only exception is eviction for a shared object,
1616 	 * but even so, the unbind when evicted would still
1617 	 * install a fence to resv. Hence it's safe to
1618 	 * destroy the pagetables immediately.
1619 	 */
1620 	xe_vm_free_scratch(vm);
1621 
1622 	for_each_tile(tile, xe, id) {
1623 		if (vm->pt_root[id]) {
1624 			xe_pt_destroy(vm->pt_root[id], vm->flags, NULL);
1625 			vm->pt_root[id] = NULL;
1626 		}
1627 	}
1628 	xe_vm_unlock(vm);
1629 
1630 	/*
1631 	 * VM is now dead, cannot re-add nodes to vm->vmas if it's NULL.
1632 	 * Since we hold a refcount to the bo, we can remove and free
1633 	 * the members safely without locking.
1634 	 */
1635 	list_for_each_entry_safe(vma, next_vma, &contested,
1636 				 combined_links.destroy) {
1637 		list_del_init(&vma->combined_links.destroy);
1638 		xe_vma_destroy_unlocked(vma);
1639 	}
1640 
1641 	up_write(&vm->lock);
1642 
1643 	down_write(&xe->usm.lock);
1644 	if (vm->usm.asid) {
1645 		void *lookup;
1646 
1647 		xe_assert(xe, xe->info.has_asid);
1648 		xe_assert(xe, !(vm->flags & XE_VM_FLAG_MIGRATION));
1649 
1650 		lookup = xa_erase(&xe->usm.asid_to_vm, vm->usm.asid);
1651 		xe_assert(xe, lookup == vm);
1652 	}
1653 	up_write(&xe->usm.lock);
1654 
1655 	for_each_tile(tile, xe, id)
1656 		xe_range_fence_tree_fini(&vm->rftree[id]);
1657 
1658 	xe_vm_put(vm);
1659 }
1660 
1661 static void vm_destroy_work_func(struct work_struct *w)
1662 {
1663 	struct xe_vm *vm =
1664 		container_of(w, struct xe_vm, destroy_work);
1665 	struct xe_device *xe = vm->xe;
1666 	struct xe_tile *tile;
1667 	u8 id;
1668 
1669 	/* xe_vm_close_and_put was not called? */
1670 	xe_assert(xe, !vm->size);
1671 
1672 	if (xe_vm_in_preempt_fence_mode(vm))
1673 		flush_work(&vm->preempt.rebind_work);
1674 
1675 	mutex_destroy(&vm->snap_mutex);
1676 
1677 	if (vm->flags & XE_VM_FLAG_LR_MODE)
1678 		xe_pm_runtime_put(xe);
1679 
1680 	for_each_tile(tile, xe, id)
1681 		XE_WARN_ON(vm->pt_root[id]);
1682 
1683 	trace_xe_vm_free(vm);
1684 
1685 	ttm_lru_bulk_move_fini(&xe->ttm, &vm->lru_bulk_move);
1686 
1687 	if (vm->xef)
1688 		xe_file_put(vm->xef);
1689 
1690 	kfree(vm);
1691 }
1692 
1693 static void xe_vm_free(struct drm_gpuvm *gpuvm)
1694 {
1695 	struct xe_vm *vm = container_of(gpuvm, struct xe_vm, gpuvm);
1696 
1697 	/* To destroy the VM we need to be able to sleep */
1698 	queue_work(system_unbound_wq, &vm->destroy_work);
1699 }
1700 
1701 struct xe_vm *xe_vm_lookup(struct xe_file *xef, u32 id)
1702 {
1703 	struct xe_vm *vm;
1704 
1705 	mutex_lock(&xef->vm.lock);
1706 	vm = xa_load(&xef->vm.xa, id);
1707 	if (vm)
1708 		xe_vm_get(vm);
1709 	mutex_unlock(&xef->vm.lock);
1710 
1711 	return vm;
1712 }
1713 
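/*
 * Usage sketch (editor's illustration): the reference returned by
 * xe_vm_lookup() must be dropped with xe_vm_put() when the caller is done,
 * e.g. in an ioctl handler (vm_id being whatever id userspace passed in):
 *
 *	struct xe_vm *vm = xe_vm_lookup(xef, vm_id);
 *
 *	if (XE_IOCTL_DBG(xe, !vm))
 *		return -ENOENT;
 *	...
 *	xe_vm_put(vm);
 */
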
1714 u64 xe_vm_pdp4_descriptor(struct xe_vm *vm, struct xe_tile *tile)
1715 {
1716 	return vm->pt_ops->pde_encode_bo(vm->pt_root[tile->id]->bo, 0,
1717 					 tile_to_xe(tile)->pat.idx[XE_CACHE_WB]);
1718 }
1719 
1720 static struct xe_exec_queue *
1721 to_wait_exec_queue(struct xe_vm *vm, struct xe_exec_queue *q)
1722 {
1723 	return q ? q : vm->q[0];
1724 }
1725 
1726 static struct xe_user_fence *
1727 find_ufence_get(struct xe_sync_entry *syncs, u32 num_syncs)
1728 {
1729 	unsigned int i;
1730 
1731 	for (i = 0; i < num_syncs; i++) {
1732 		struct xe_sync_entry *e = &syncs[i];
1733 
1734 		if (xe_sync_is_ufence(e))
1735 			return xe_sync_ufence_get(e);
1736 	}
1737 
1738 	return NULL;
1739 }
1740 
1741 #define ALL_DRM_XE_VM_CREATE_FLAGS (DRM_XE_VM_CREATE_FLAG_SCRATCH_PAGE | \
1742 				    DRM_XE_VM_CREATE_FLAG_LR_MODE | \
1743 				    DRM_XE_VM_CREATE_FLAG_FAULT_MODE)
1744 
1745 int xe_vm_create_ioctl(struct drm_device *dev, void *data,
1746 		       struct drm_file *file)
1747 {
1748 	struct xe_device *xe = to_xe_device(dev);
1749 	struct xe_file *xef = to_xe_file(file);
1750 	struct drm_xe_vm_create *args = data;
1751 	struct xe_tile *tile;
1752 	struct xe_vm *vm;
1753 	u32 id, asid;
1754 	int err;
1755 	u32 flags = 0;
1756 
1757 	if (XE_IOCTL_DBG(xe, args->extensions))
1758 		return -EINVAL;
1759 
1760 	if (XE_WA(xe_root_mmio_gt(xe), 14016763929))
1761 		args->flags |= DRM_XE_VM_CREATE_FLAG_SCRATCH_PAGE;
1762 
1763 	if (XE_IOCTL_DBG(xe, args->flags & DRM_XE_VM_CREATE_FLAG_FAULT_MODE &&
1764 			 !xe->info.has_usm))
1765 		return -EINVAL;
1766 
1767 	if (XE_IOCTL_DBG(xe, args->reserved[0] || args->reserved[1]))
1768 		return -EINVAL;
1769 
1770 	if (XE_IOCTL_DBG(xe, args->flags & ~ALL_DRM_XE_VM_CREATE_FLAGS))
1771 		return -EINVAL;
1772 
1773 	if (XE_IOCTL_DBG(xe, args->flags & DRM_XE_VM_CREATE_FLAG_SCRATCH_PAGE &&
1774 			 args->flags & DRM_XE_VM_CREATE_FLAG_FAULT_MODE))
1775 		return -EINVAL;
1776 
1777 	if (XE_IOCTL_DBG(xe, !(args->flags & DRM_XE_VM_CREATE_FLAG_LR_MODE) &&
1778 			 args->flags & DRM_XE_VM_CREATE_FLAG_FAULT_MODE))
1779 		return -EINVAL;
1780 
1781 	if (XE_IOCTL_DBG(xe, args->extensions))
1782 		return -EINVAL;
1783 
1784 	if (args->flags & DRM_XE_VM_CREATE_FLAG_SCRATCH_PAGE)
1785 		flags |= XE_VM_FLAG_SCRATCH_PAGE;
1786 	if (args->flags & DRM_XE_VM_CREATE_FLAG_LR_MODE)
1787 		flags |= XE_VM_FLAG_LR_MODE;
1788 	if (args->flags & DRM_XE_VM_CREATE_FLAG_FAULT_MODE)
1789 		flags |= XE_VM_FLAG_FAULT_MODE;
1790 
1791 	vm = xe_vm_create(xe, flags);
1792 	if (IS_ERR(vm))
1793 		return PTR_ERR(vm);
1794 
1795 	if (xe->info.has_asid) {
1796 		down_write(&xe->usm.lock);
1797 		err = xa_alloc_cyclic(&xe->usm.asid_to_vm, &asid, vm,
1798 				      XA_LIMIT(1, XE_MAX_ASID - 1),
1799 				      &xe->usm.next_asid, GFP_KERNEL);
1800 		up_write(&xe->usm.lock);
1801 		if (err < 0)
1802 			goto err_close_and_put;
1803 
1804 		vm->usm.asid = asid;
1805 	}
1806 
1807 	vm->xef = xe_file_get(xef);
1808 
1809 	/* Record BO memory for VM pagetable created against client */
1810 	for_each_tile(tile, xe, id)
1811 		if (vm->pt_root[id])
1812 			xe_drm_client_add_bo(vm->xef->client, vm->pt_root[id]->bo);
1813 
1814 #if IS_ENABLED(CONFIG_DRM_XE_DEBUG_MEM)
1815 	/* Warning: Security issue - never enable by default */
1816 	args->reserved[0] = xe_bo_main_addr(vm->pt_root[0]->bo, XE_PAGE_SIZE);
1817 #endif
1818 
1819 	/* user id alloc must always be last in ioctl to prevent UAF */
1820 	err = xa_alloc(&xef->vm.xa, &id, vm, xa_limit_32b, GFP_KERNEL);
1821 	if (err)
1822 		goto err_close_and_put;
1823 
1824 	args->vm_id = id;
1825 
1826 	return 0;
1827 
1828 err_close_and_put:
1829 	xe_vm_close_and_put(vm);
1830 
1831 	return err;
1832 }
1833 
1834 int xe_vm_destroy_ioctl(struct drm_device *dev, void *data,
1835 			struct drm_file *file)
1836 {
1837 	struct xe_device *xe = to_xe_device(dev);
1838 	struct xe_file *xef = to_xe_file(file);
1839 	struct drm_xe_vm_destroy *args = data;
1840 	struct xe_vm *vm;
1841 	int err = 0;
1842 
1843 	if (XE_IOCTL_DBG(xe, args->pad) ||
1844 	    XE_IOCTL_DBG(xe, args->reserved[0] || args->reserved[1]))
1845 		return -EINVAL;
1846 
1847 	mutex_lock(&xef->vm.lock);
1848 	vm = xa_load(&xef->vm.xa, args->vm_id);
1849 	if (XE_IOCTL_DBG(xe, !vm))
1850 		err = -ENOENT;
1851 	else if (XE_IOCTL_DBG(xe, vm->preempt.num_exec_queues))
1852 		err = -EBUSY;
1853 	else
1854 		xa_erase(&xef->vm.xa, args->vm_id);
1855 	mutex_unlock(&xef->vm.lock);
1856 
1857 	if (!err)
1858 		xe_vm_close_and_put(vm);
1859 
1860 	return err;
1861 }
1862 
1863 static const u32 region_to_mem_type[] = {
1864 	XE_PL_TT,
1865 	XE_PL_VRAM0,
1866 	XE_PL_VRAM1,
1867 };
1868 
1869 static void prep_vma_destroy(struct xe_vm *vm, struct xe_vma *vma,
1870 			     bool post_commit)
1871 {
1872 	down_read(&vm->userptr.notifier_lock);
1873 	vma->gpuva.flags |= XE_VMA_DESTROYED;
1874 	up_read(&vm->userptr.notifier_lock);
1875 	if (post_commit)
1876 		xe_vm_remove_vma(vm, vma);
1877 }
1878 
1879 #undef ULL
1880 #define ULL	unsigned long long
1881 
1882 #if IS_ENABLED(CONFIG_DRM_XE_DEBUG_VM)
1883 static void print_op(struct xe_device *xe, struct drm_gpuva_op *op)
1884 {
1885 	struct xe_vma *vma;
1886 
1887 	switch (op->op) {
1888 	case DRM_GPUVA_OP_MAP:
1889 		vm_dbg(&xe->drm, "MAP: addr=0x%016llx, range=0x%016llx",
1890 		       (ULL)op->map.va.addr, (ULL)op->map.va.range);
1891 		break;
1892 	case DRM_GPUVA_OP_REMAP:
1893 		vma = gpuva_to_vma(op->remap.unmap->va);
1894 		vm_dbg(&xe->drm, "REMAP:UNMAP: addr=0x%016llx, range=0x%016llx, keep=%d",
1895 		       (ULL)xe_vma_start(vma), (ULL)xe_vma_size(vma),
1896 		       op->remap.unmap->keep ? 1 : 0);
1897 		if (op->remap.prev)
1898 			vm_dbg(&xe->drm,
1899 			       "REMAP:PREV: addr=0x%016llx, range=0x%016llx",
1900 			       (ULL)op->remap.prev->va.addr,
1901 			       (ULL)op->remap.prev->va.range);
1902 		if (op->remap.next)
1903 			vm_dbg(&xe->drm,
1904 			       "REMAP:NEXT: addr=0x%016llx, range=0x%016llx",
1905 			       (ULL)op->remap.next->va.addr,
1906 			       (ULL)op->remap.next->va.range);
1907 		break;
1908 	case DRM_GPUVA_OP_UNMAP:
1909 		vma = gpuva_to_vma(op->unmap.va);
1910 		vm_dbg(&xe->drm, "UNMAP: addr=0x%016llx, range=0x%016llx, keep=%d",
1911 		       (ULL)xe_vma_start(vma), (ULL)xe_vma_size(vma),
1912 		       op->unmap.keep ? 1 : 0);
1913 		break;
1914 	case DRM_GPUVA_OP_PREFETCH:
1915 		vma = gpuva_to_vma(op->prefetch.va);
1916 		vm_dbg(&xe->drm, "PREFETCH: addr=0x%016llx, range=0x%016llx",
1917 		       (ULL)xe_vma_start(vma), (ULL)xe_vma_size(vma));
1918 		break;
1919 	default:
1920 		drm_warn(&xe->drm, "NOT POSSIBLE");
1921 	}
1922 }
1923 #else
1924 static void print_op(struct xe_device *xe, struct drm_gpuva_op *op)
1925 {
1926 }
1927 #endif
1928 
1929 /*
1930  * Create operations list from IOCTL arguments, and set up operation fields so the parse
1931  * and commit steps are decoupled from IOCTL arguments. This step can fail.
1932  */
1933 static struct drm_gpuva_ops *
1934 vm_bind_ioctl_ops_create(struct xe_vm *vm, struct xe_bo *bo,
1935 			 u64 bo_offset_or_userptr, u64 addr, u64 range,
1936 			 u32 operation, u32 flags,
1937 			 u32 prefetch_region, u16 pat_index)
1938 {
1939 	struct drm_gem_object *obj = bo ? &bo->ttm.base : NULL;
1940 	struct drm_gpuva_ops *ops;
1941 	struct drm_gpuva_op *__op;
1942 	struct drm_gpuvm_bo *vm_bo;
1943 	int err;
1944 
1945 	lockdep_assert_held_write(&vm->lock);
1946 
1947 	vm_dbg(&vm->xe->drm,
1948 	       "op=%d, addr=0x%016llx, range=0x%016llx, bo_offset_or_userptr=0x%016llx",
1949 	       operation, (ULL)addr, (ULL)range,
1950 	       (ULL)bo_offset_or_userptr);
1951 
1952 	switch (operation) {
1953 	case DRM_XE_VM_BIND_OP_MAP:
1954 	case DRM_XE_VM_BIND_OP_MAP_USERPTR:
1955 		ops = drm_gpuvm_sm_map_ops_create(&vm->gpuvm, addr, range,
1956 						  obj, bo_offset_or_userptr);
1957 		break;
1958 	case DRM_XE_VM_BIND_OP_UNMAP:
1959 		ops = drm_gpuvm_sm_unmap_ops_create(&vm->gpuvm, addr, range);
1960 		break;
1961 	case DRM_XE_VM_BIND_OP_PREFETCH:
1962 		ops = drm_gpuvm_prefetch_ops_create(&vm->gpuvm, addr, range);
1963 		break;
1964 	case DRM_XE_VM_BIND_OP_UNMAP_ALL:
1965 		xe_assert(vm->xe, bo);
1966 
1967 		err = xe_bo_lock(bo, true);
1968 		if (err)
1969 			return ERR_PTR(err);
1970 
1971 		vm_bo = drm_gpuvm_bo_obtain(&vm->gpuvm, obj);
1972 		if (IS_ERR(vm_bo)) {
1973 			xe_bo_unlock(bo);
1974 			return ERR_CAST(vm_bo);
1975 		}
1976 
1977 		ops = drm_gpuvm_bo_unmap_ops_create(vm_bo);
1978 		drm_gpuvm_bo_put(vm_bo);
1979 		xe_bo_unlock(bo);
1980 		break;
1981 	default:
1982 		drm_warn(&vm->xe->drm, "NOT POSSIBLE");
1983 		ops = ERR_PTR(-EINVAL);
1984 	}
1985 	if (IS_ERR(ops))
1986 		return ops;
1987 
1988 	drm_gpuva_for_each_op(__op, ops) {
1989 		struct xe_vma_op *op = gpuva_op_to_vma_op(__op);
1990 
1991 		if (__op->op == DRM_GPUVA_OP_MAP) {
1992 			op->map.immediate =
1993 				flags & DRM_XE_VM_BIND_FLAG_IMMEDIATE;
1994 			op->map.read_only =
1995 				flags & DRM_XE_VM_BIND_FLAG_READONLY;
1996 			op->map.is_null = flags & DRM_XE_VM_BIND_FLAG_NULL;
1997 			op->map.dumpable = flags & DRM_XE_VM_BIND_FLAG_DUMPABLE;
1998 			op->map.pat_index = pat_index;
1999 		} else if (__op->op == DRM_GPUVA_OP_PREFETCH) {
2000 			op->prefetch.region = prefetch_region;
2001 		}
2002 
2003 		print_op(vm->xe, __op);
2004 	}
2005 
2006 	return ops;
2007 }
2008 ALLOW_ERROR_INJECTION(vm_bind_ioctl_ops_create, ERRNO);
2009 
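/*
 * Allocate and initialize a VMA for a MAP operation. For BO-backed
 * mappings the BO's dma_resv (and, for external BOs, also the VM's) is
 * held via drm_exec while the VMA is created; userptr pages are pinned
 * and preempt fences are added for external BOs. On failure the
 * partially constructed VMA is destroyed and an ERR_PTR is returned.
 */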
2010 static struct xe_vma *new_vma(struct xe_vm *vm, struct drm_gpuva_op_map *op,
2011 			      u16 pat_index, unsigned int flags)
2012 {
2013 	struct xe_bo *bo = op->gem.obj ? gem_to_xe_bo(op->gem.obj) : NULL;
2014 	struct drm_exec exec;
2015 	struct xe_vma *vma;
2016 	int err = 0;
2017 
2018 	lockdep_assert_held_write(&vm->lock);
2019 
2020 	if (bo) {
2021 		drm_exec_init(&exec, DRM_EXEC_INTERRUPTIBLE_WAIT, 0);
2022 		drm_exec_until_all_locked(&exec) {
2023 			err = 0;
2024 			if (!bo->vm) {
2025 				err = drm_exec_lock_obj(&exec, xe_vm_obj(vm));
2026 				drm_exec_retry_on_contention(&exec);
2027 			}
2028 			if (!err) {
2029 				err = drm_exec_lock_obj(&exec, &bo->ttm.base);
2030 				drm_exec_retry_on_contention(&exec);
2031 			}
2032 			if (err) {
2033 				drm_exec_fini(&exec);
2034 				return ERR_PTR(err);
2035 			}
2036 		}
2037 	}
2038 	vma = xe_vma_create(vm, bo, op->gem.offset,
2039 			    op->va.addr, op->va.addr +
2040 			    op->va.range - 1, pat_index, flags);
2041 	if (IS_ERR(vma))
2042 		goto err_unlock;
2043 
2044 	if (xe_vma_is_userptr(vma))
2045 		err = xe_vma_userptr_pin_pages(to_userptr_vma(vma));
2046 	else if (!xe_vma_has_no_bo(vma) && !bo->vm)
2047 		err = add_preempt_fences(vm, bo);
2048 
2049 err_unlock:
2050 	if (bo)
2051 		drm_exec_fini(&exec);
2052 
2053 	if (err) {
2054 		prep_vma_destroy(vm, vma, false);
2055 		xe_vma_destroy_unlocked(vma);
2056 		vma = ERR_PTR(err);
2057 	}
2058 
2059 	return vma;
2060 }
2061 
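/*
 * Return the largest page-table entry size recorded for this VMA via the
 * XE_VMA_PTE_* flags, defaulting to SZ_1G when no flag has been set yet.
 */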
2062 static u64 xe_vma_max_pte_size(struct xe_vma *vma)
2063 {
2064 	if (vma->gpuva.flags & XE_VMA_PTE_1G)
2065 		return SZ_1G;
2066 	else if (vma->gpuva.flags & (XE_VMA_PTE_2M | XE_VMA_PTE_COMPACT))
2067 		return SZ_2M;
2068 	else if (vma->gpuva.flags & XE_VMA_PTE_64K)
2069 		return SZ_64K;
2070 	else if (vma->gpuva.flags & XE_VMA_PTE_4K)
2071 		return SZ_4K;
2072 
2073 	return SZ_1G;	/* Uninitialized, use max size */
2074 }
2075 
2076 static void xe_vma_set_pte_size(struct xe_vma *vma, u64 size)
2077 {
2078 	switch (size) {
2079 	case SZ_1G:
2080 		vma->gpuva.flags |= XE_VMA_PTE_1G;
2081 		break;
2082 	case SZ_2M:
2083 		vma->gpuva.flags |= XE_VMA_PTE_2M;
2084 		break;
2085 	case SZ_64K:
2086 		vma->gpuva.flags |= XE_VMA_PTE_64K;
2087 		break;
2088 	case SZ_4K:
2089 		vma->gpuva.flags |= XE_VMA_PTE_4K;
2090 		break;
2091 	}
2092 }
2093 
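/*
 * Commit a parsed operation to the VM's VMA tree: insert newly created
 * VMAs, mark VMAs being unmapped as destroyed and, for REMAP, shrink the
 * unmapped range to account for the pieces that are kept. The COMMITTED
 * flags set here drive the unwind path on error.
 */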
2094 static int xe_vma_op_commit(struct xe_vm *vm, struct xe_vma_op *op)
2095 {
2096 	int err = 0;
2097 
2098 	lockdep_assert_held_write(&vm->lock);
2099 
2100 	switch (op->base.op) {
2101 	case DRM_GPUVA_OP_MAP:
2102 		err |= xe_vm_insert_vma(vm, op->map.vma);
2103 		if (!err)
2104 			op->flags |= XE_VMA_OP_COMMITTED;
2105 		break;
2106 	case DRM_GPUVA_OP_REMAP:
2107 	{
2108 		u8 tile_present =
2109 			gpuva_to_vma(op->base.remap.unmap->va)->tile_present;
2110 
2111 		prep_vma_destroy(vm, gpuva_to_vma(op->base.remap.unmap->va),
2112 				 true);
2113 		op->flags |= XE_VMA_OP_COMMITTED;
2114 
2115 		if (op->remap.prev) {
2116 			err |= xe_vm_insert_vma(vm, op->remap.prev);
2117 			if (!err)
2118 				op->flags |= XE_VMA_OP_PREV_COMMITTED;
2119 			if (!err && op->remap.skip_prev) {
2120 				op->remap.prev->tile_present =
2121 					tile_present;
2122 				op->remap.prev = NULL;
2123 			}
2124 		}
2125 		if (op->remap.next) {
2126 			err |= xe_vm_insert_vma(vm, op->remap.next);
2127 			if (!err)
2128 				op->flags |= XE_VMA_OP_NEXT_COMMITTED;
2129 			if (!err && op->remap.skip_next) {
2130 				op->remap.next->tile_present =
2131 					tile_present;
2132 				op->remap.next = NULL;
2133 			}
2134 		}
2135 
2136 		/* Adjust for partial unbind after removing VMA from VM */
2137 		if (!err) {
2138 			op->base.remap.unmap->va->va.addr = op->remap.start;
2139 			op->base.remap.unmap->va->va.range = op->remap.range;
2140 		}
2141 		break;
2142 	}
2143 	case DRM_GPUVA_OP_UNMAP:
2144 		prep_vma_destroy(vm, gpuva_to_vma(op->base.unmap.va), true);
2145 		op->flags |= XE_VMA_OP_COMMITTED;
2146 		break;
2147 	case DRM_GPUVA_OP_PREFETCH:
2148 		op->flags |= XE_VMA_OP_COMMITTED;
2149 		break;
2150 	default:
2151 		drm_warn(&vm->xe->drm, "NOT POSSIBLE");
2152 	}
2153 
2154 	return err;
2155 }
2156 
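/*
 * Translate a drm_gpuva_ops list into xe_vma_ops: create VMAs for MAP and
 * REMAP operations, decide whether the REMAP prev/next pieces can skip a
 * rebind, account the required page-table updates per tile and commit each
 * operation so it can be unwound on failure.
 */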
2157 static int vm_bind_ioctl_ops_parse(struct xe_vm *vm, struct drm_gpuva_ops *ops,
2158 				   struct xe_vma_ops *vops)
2159 {
2160 	struct xe_device *xe = vm->xe;
2161 	struct drm_gpuva_op *__op;
2162 	struct xe_tile *tile;
2163 	u8 id, tile_mask = 0;
2164 	int err = 0;
2165 
2166 	lockdep_assert_held_write(&vm->lock);
2167 
2168 	for_each_tile(tile, vm->xe, id)
2169 		tile_mask |= 0x1 << id;
2170 
2171 	drm_gpuva_for_each_op(__op, ops) {
2172 		struct xe_vma_op *op = gpuva_op_to_vma_op(__op);
2173 		struct xe_vma *vma;
2174 		unsigned int flags = 0;
2175 
2176 		INIT_LIST_HEAD(&op->link);
2177 		list_add_tail(&op->link, &vops->list);
2178 		op->tile_mask = tile_mask;
2179 
2180 		switch (op->base.op) {
2181 		case DRM_GPUVA_OP_MAP:
2182 		{
2183 			flags |= op->map.read_only ?
2184 				VMA_CREATE_FLAG_READ_ONLY : 0;
2185 			flags |= op->map.is_null ?
2186 				VMA_CREATE_FLAG_IS_NULL : 0;
2187 			flags |= op->map.dumpable ?
2188 				VMA_CREATE_FLAG_DUMPABLE : 0;
2189 
2190 			vma = new_vma(vm, &op->base.map, op->map.pat_index,
2191 				      flags);
2192 			if (IS_ERR(vma))
2193 				return PTR_ERR(vma);
2194 
2195 			op->map.vma = vma;
2196 			if (op->map.immediate || !xe_vm_in_fault_mode(vm))
2197 				xe_vma_ops_incr_pt_update_ops(vops,
2198 							      op->tile_mask);
2199 			break;
2200 		}
2201 		case DRM_GPUVA_OP_REMAP:
2202 		{
2203 			struct xe_vma *old =
2204 				gpuva_to_vma(op->base.remap.unmap->va);
2205 
2206 			op->remap.start = xe_vma_start(old);
2207 			op->remap.range = xe_vma_size(old);
2208 
2209 			if (op->base.remap.prev) {
2210 				flags |= op->base.remap.unmap->va->flags &
2211 					XE_VMA_READ_ONLY ?
2212 					VMA_CREATE_FLAG_READ_ONLY : 0;
2213 				flags |= op->base.remap.unmap->va->flags &
2214 					DRM_GPUVA_SPARSE ?
2215 					VMA_CREATE_FLAG_IS_NULL : 0;
2216 				flags |= op->base.remap.unmap->va->flags &
2217 					XE_VMA_DUMPABLE ?
2218 					VMA_CREATE_FLAG_DUMPABLE : 0;
2219 
2220 				vma = new_vma(vm, op->base.remap.prev,
2221 					      old->pat_index, flags);
2222 				if (IS_ERR(vma))
2223 					return PTR_ERR(vma);
2224 
2225 				op->remap.prev = vma;
2226 
2227 				/*
2228 				 * Userptr creates a new SG mapping so
2229 				 * we must also rebind.
2230 				 */
2231 				op->remap.skip_prev = !xe_vma_is_userptr(old) &&
2232 					IS_ALIGNED(xe_vma_end(vma),
2233 						   xe_vma_max_pte_size(old));
2234 				if (op->remap.skip_prev) {
2235 					xe_vma_set_pte_size(vma, xe_vma_max_pte_size(old));
2236 					op->remap.range -=
2237 						xe_vma_end(vma) -
2238 						xe_vma_start(old);
2239 					op->remap.start = xe_vma_end(vma);
2240 					vm_dbg(&xe->drm, "REMAP:SKIP_PREV: addr=0x%016llx, range=0x%016llx",
2241 					       (ULL)op->remap.start,
2242 					       (ULL)op->remap.range);
2243 				} else {
2244 					xe_vma_ops_incr_pt_update_ops(vops, op->tile_mask);
2245 				}
2246 			}
2247 
2248 			if (op->base.remap.next) {
2249 				flags |= op->base.remap.unmap->va->flags &
2250 					XE_VMA_READ_ONLY ?
2251 					VMA_CREATE_FLAG_READ_ONLY : 0;
2252 				flags |= op->base.remap.unmap->va->flags &
2253 					DRM_GPUVA_SPARSE ?
2254 					VMA_CREATE_FLAG_IS_NULL : 0;
2255 				flags |= op->base.remap.unmap->va->flags &
2256 					XE_VMA_DUMPABLE ?
2257 					VMA_CREATE_FLAG_DUMPABLE : 0;
2258 
2259 				vma = new_vma(vm, op->base.remap.next,
2260 					      old->pat_index, flags);
2261 				if (IS_ERR(vma))
2262 					return PTR_ERR(vma);
2263 
2264 				op->remap.next = vma;
2265 
2266 				/*
2267 				 * Userptr creates a new SG mapping so
2268 				 * we must also rebind.
2269 				 */
2270 				op->remap.skip_next = !xe_vma_is_userptr(old) &&
2271 					IS_ALIGNED(xe_vma_start(vma),
2272 						   xe_vma_max_pte_size(old));
2273 				if (op->remap.skip_next) {
2274 					xe_vma_set_pte_size(vma, xe_vma_max_pte_size(old));
2275 					op->remap.range -=
2276 						xe_vma_end(old) -
2277 						xe_vma_start(vma);
2278 					vm_dbg(&xe->drm, "REMAP:SKIP_NEXT: addr=0x%016llx, range=0x%016llx",
2279 					       (ULL)op->remap.start,
2280 					       (ULL)op->remap.range);
2281 				} else {
2282 					xe_vma_ops_incr_pt_update_ops(vops, op->tile_mask);
2283 				}
2284 			}
2285 			xe_vma_ops_incr_pt_update_ops(vops, op->tile_mask);
2286 			break;
2287 		}
2288 		case DRM_GPUVA_OP_UNMAP:
2289 		case DRM_GPUVA_OP_PREFETCH:
2290 			/* FIXME: Need to skip some prefetch ops */
2291 			xe_vma_ops_incr_pt_update_ops(vops, op->tile_mask);
2292 			break;
2293 		default:
2294 			drm_warn(&vm->xe->drm, "NOT POSSIBLE");
2295 		}
2296 
2297 		err = xe_vma_op_commit(vm, op);
2298 		if (err)
2299 			return err;
2300 	}
2301 
2302 	return 0;
2303 }
2304 
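/*
 * Undo a single (possibly partially) committed operation: destroy VMAs
 * created for MAP/REMAP and re-insert VMAs that an UNMAP or REMAP had
 * removed from the VM, clearing their DESTROYED flag.
 */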
2305 static void xe_vma_op_unwind(struct xe_vm *vm, struct xe_vma_op *op,
2306 			     bool post_commit, bool prev_post_commit,
2307 			     bool next_post_commit)
2308 {
2309 	lockdep_assert_held_write(&vm->lock);
2310 
2311 	switch (op->base.op) {
2312 	case DRM_GPUVA_OP_MAP:
2313 		if (op->map.vma) {
2314 			prep_vma_destroy(vm, op->map.vma, post_commit);
2315 			xe_vma_destroy_unlocked(op->map.vma);
2316 		}
2317 		break;
2318 	case DRM_GPUVA_OP_UNMAP:
2319 	{
2320 		struct xe_vma *vma = gpuva_to_vma(op->base.unmap.va);
2321 
2322 		if (vma) {
2323 			down_read(&vm->userptr.notifier_lock);
2324 			vma->gpuva.flags &= ~XE_VMA_DESTROYED;
2325 			up_read(&vm->userptr.notifier_lock);
2326 			if (post_commit)
2327 				xe_vm_insert_vma(vm, vma);
2328 		}
2329 		break;
2330 	}
2331 	case DRM_GPUVA_OP_REMAP:
2332 	{
2333 		struct xe_vma *vma = gpuva_to_vma(op->base.remap.unmap->va);
2334 
2335 		if (op->remap.prev) {
2336 			prep_vma_destroy(vm, op->remap.prev, prev_post_commit);
2337 			xe_vma_destroy_unlocked(op->remap.prev);
2338 		}
2339 		if (op->remap.next) {
2340 			prep_vma_destroy(vm, op->remap.next, next_post_commit);
2341 			xe_vma_destroy_unlocked(op->remap.next);
2342 		}
2343 		if (vma) {
2344 			down_read(&vm->userptr.notifier_lock);
2345 			vma->gpuva.flags &= ~XE_VMA_DESTROYED;
2346 			up_read(&vm->userptr.notifier_lock);
2347 			if (post_commit)
2348 				xe_vm_insert_vma(vm, vma);
2349 		}
2350 		break;
2351 	}
2352 	case DRM_GPUVA_OP_PREFETCH:
2353 		/* Nothing to do */
2354 		break;
2355 	default:
2356 		drm_warn(&vm->xe->drm, "NOT POSSIBLE");
2357 	}
2358 }
2359 
2360 static void vm_bind_ioctl_ops_unwind(struct xe_vm *vm,
2361 				     struct drm_gpuva_ops **ops,
2362 				     int num_ops_list)
2363 {
2364 	int i;
2365 
2366 	for (i = num_ops_list - 1; i >= 0; --i) {
2367 		struct drm_gpuva_ops *__ops = ops[i];
2368 		struct drm_gpuva_op *__op;
2369 
2370 		if (!__ops)
2371 			continue;
2372 
2373 		drm_gpuva_for_each_op_reverse(__op, __ops) {
2374 			struct xe_vma_op *op = gpuva_op_to_vma_op(__op);
2375 
2376 			xe_vma_op_unwind(vm, op,
2377 					 op->flags & XE_VMA_OP_COMMITTED,
2378 					 op->flags & XE_VMA_OP_PREV_COMMITTED,
2379 					 op->flags & XE_VMA_OP_NEXT_COMMITTED);
2380 		}
2381 	}
2382 }
2383 
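/*
 * Lock the BO backing @vma if it is external (VM-private BOs share the
 * VM's dma_resv, which the caller has already locked) and, if requested,
 * validate it so it is resident before the bind is executed.
 */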
2384 static int vma_lock_and_validate(struct drm_exec *exec, struct xe_vma *vma,
2385 				 bool validate)
2386 {
2387 	struct xe_bo *bo = xe_vma_bo(vma);
2388 	struct xe_vm *vm = xe_vma_vm(vma);
2389 	int err = 0;
2390 
2391 	if (bo) {
2392 		if (!bo->vm)
2393 			err = drm_exec_lock_obj(exec, &bo->ttm.base);
2394 		if (!err && validate)
2395 			err = xe_bo_validate(bo, vm,
2396 					     !xe_vm_in_preempt_fence_mode(vm));
2397 	}
2398 
2399 	return err;
2400 }
2401 
2402 static int check_ufence(struct xe_vma *vma)
2403 {
2404 	if (vma->ufence) {
2405 		struct xe_user_fence * const f = vma->ufence;
2406 
2407 		if (!xe_sync_ufence_get_status(f))
2408 			return -EBUSY;
2409 
2410 		vma->ufence = NULL;
2411 		xe_sync_ufence_put(f);
2412 	}
2413 
2414 	return 0;
2415 }
2416 
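/*
 * Per-operation locking and preparation: reject unmaps/remaps of VMAs
 * that still carry an unsignalled user fence, lock and validate the
 * involved BOs and, for PREFETCH, migrate the BO to the requested
 * memory region.
 */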
2417 static int op_lock_and_prep(struct drm_exec *exec, struct xe_vm *vm,
2418 			    struct xe_vma_op *op)
2419 {
2420 	int err = 0;
2421 
2422 	switch (op->base.op) {
2423 	case DRM_GPUVA_OP_MAP:
2424 		err = vma_lock_and_validate(exec, op->map.vma,
2425 					    !xe_vm_in_fault_mode(vm) ||
2426 					    op->map.immediate);
2427 		break;
2428 	case DRM_GPUVA_OP_REMAP:
2429 		err = check_ufence(gpuva_to_vma(op->base.remap.unmap->va));
2430 		if (err)
2431 			break;
2432 
2433 		err = vma_lock_and_validate(exec,
2434 					    gpuva_to_vma(op->base.remap.unmap->va),
2435 					    false);
2436 		if (!err && op->remap.prev)
2437 			err = vma_lock_and_validate(exec, op->remap.prev, true);
2438 		if (!err && op->remap.next)
2439 			err = vma_lock_and_validate(exec, op->remap.next, true);
2440 		break;
2441 	case DRM_GPUVA_OP_UNMAP:
2442 		err = check_ufence(gpuva_to_vma(op->base.unmap.va));
2443 		if (err)
2444 			break;
2445 
2446 		err = vma_lock_and_validate(exec,
2447 					    gpuva_to_vma(op->base.unmap.va),
2448 					    false);
2449 		break;
2450 	case DRM_GPUVA_OP_PREFETCH:
2451 	{
2452 		struct xe_vma *vma = gpuva_to_vma(op->base.prefetch.va);
2453 		u32 region = op->prefetch.region;
2454 
2455 		xe_assert(vm->xe, region < ARRAY_SIZE(region_to_mem_type));
2456 
2457 		err = vma_lock_and_validate(exec,
2458 					    gpuva_to_vma(op->base.prefetch.va),
2459 					    false);
2460 		if (!err && !xe_vma_has_no_bo(vma))
2461 			err = xe_bo_migrate(xe_vma_bo(vma),
2462 					    region_to_mem_type[region]);
2463 		break;
2464 	}
2465 	default:
2466 		drm_warn(&vm->xe->drm, "NOT POSSIBLE");
2467 	}
2468 
2469 	return err;
2470 }
2471 
2472 static int vm_bind_ioctl_ops_lock_and_prep(struct drm_exec *exec,
2473 					   struct xe_vm *vm,
2474 					   struct xe_vma_ops *vops)
2475 {
2476 	struct xe_vma_op *op;
2477 	int err;
2478 
2479 	err = drm_exec_lock_obj(exec, xe_vm_obj(vm));
2480 	if (err)
2481 		return err;
2482 
2483 	list_for_each_entry(op, &vops->list, link) {
2484 		err = op_lock_and_prep(exec, vm, op);
2485 		if (err)
2486 			return err;
2487 	}
2488 
2489 #ifdef TEST_VM_OPS_ERROR
2490 	if (vops->inject_error &&
2491 	    vm->xe->vm_inject_error_position == FORCE_OP_ERROR_LOCK)
2492 		return -ENOSPC;
2493 #endif
2494 
2495 	return 0;
2496 }
2497 
2498 static void op_trace(struct xe_vma_op *op)
2499 {
2500 	switch (op->base.op) {
2501 	case DRM_GPUVA_OP_MAP:
2502 		trace_xe_vma_bind(op->map.vma);
2503 		break;
2504 	case DRM_GPUVA_OP_REMAP:
2505 		trace_xe_vma_unbind(gpuva_to_vma(op->base.remap.unmap->va));
2506 		if (op->remap.prev)
2507 			trace_xe_vma_bind(op->remap.prev);
2508 		if (op->remap.next)
2509 			trace_xe_vma_bind(op->remap.next);
2510 		break;
2511 	case DRM_GPUVA_OP_UNMAP:
2512 		trace_xe_vma_unbind(gpuva_to_vma(op->base.unmap.va));
2513 		break;
2514 	case DRM_GPUVA_OP_PREFETCH:
2515 		trace_xe_vma_bind(gpuva_to_vma(op->base.prefetch.va));
2516 		break;
2517 	default:
2518 		XE_WARN_ON("NOT POSSIBLE");
2519 	}
2520 }
2521 
2522 static void trace_xe_vm_ops_execute(struct xe_vma_ops *vops)
2523 {
2524 	struct xe_vma_op *op;
2525 
2526 	list_for_each_entry(op, &vops->list, link)
2527 		op_trace(op);
2528 }
2529 
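/*
 * Assign an exec queue to every tile that has page-table update work,
 * either from the bind queue's multi-GT list or from the VM's default
 * queues, and return the number of tiles involved.
 */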
2530 static int vm_ops_setup_tile_args(struct xe_vm *vm, struct xe_vma_ops *vops)
2531 {
2532 	struct xe_exec_queue *q = vops->q;
2533 	struct xe_tile *tile;
2534 	int number_tiles = 0;
2535 	u8 id;
2536 
2537 	for_each_tile(tile, vm->xe, id) {
2538 		if (vops->pt_update_ops[id].num_ops)
2539 			++number_tiles;
2540 
2541 		if (vops->pt_update_ops[id].q)
2542 			continue;
2543 
2544 		if (q) {
2545 			vops->pt_update_ops[id].q = q;
2546 			if (vm->pt_root[id] && !list_empty(&q->multi_gt_list))
2547 				q = list_next_entry(q, multi_gt_list);
2548 		} else {
2549 			vops->pt_update_ops[id].q = vm->q[id];
2550 		}
2551 	}
2552 
2553 	return number_tiles;
2554 }
2555 
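/*
 * Prepare and run the page-table update jobs for every tile that has
 * work, collecting the per-tile fences into a dma_fence_array when more
 * than one tile is involved. On error all prepared updates are aborted
 * and an ERR_PTR is returned.
 */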
2556 static struct dma_fence *ops_execute(struct xe_vm *vm,
2557 				     struct xe_vma_ops *vops)
2558 {
2559 	struct xe_tile *tile;
2560 	struct dma_fence *fence = NULL;
2561 	struct dma_fence **fences = NULL;
2562 	struct dma_fence_array *cf = NULL;
2563 	int number_tiles = 0, current_fence = 0, err;
2564 	u8 id;
2565 
2566 	number_tiles = vm_ops_setup_tile_args(vm, vops);
2567 	if (number_tiles == 0)
2568 		return ERR_PTR(-ENODATA);
2569 
2570 	if (number_tiles > 1) {
2571 		fences = kmalloc_array(number_tiles, sizeof(*fences),
2572 				       GFP_KERNEL);
2573 		if (!fences) {
2574 			fence = ERR_PTR(-ENOMEM);
2575 			goto err_trace;
2576 		}
2577 	}
2578 
2579 	for_each_tile(tile, vm->xe, id) {
2580 		if (!vops->pt_update_ops[id].num_ops)
2581 			continue;
2582 
2583 		err = xe_pt_update_ops_prepare(tile, vops);
2584 		if (err) {
2585 			fence = ERR_PTR(err);
2586 			goto err_out;
2587 		}
2588 	}
2589 
2590 	trace_xe_vm_ops_execute(vops);
2591 
2592 	for_each_tile(tile, vm->xe, id) {
2593 		if (!vops->pt_update_ops[id].num_ops)
2594 			continue;
2595 
2596 		fence = xe_pt_update_ops_run(tile, vops);
2597 		if (IS_ERR(fence))
2598 			goto err_out;
2599 
2600 		if (fences)
2601 			fences[current_fence++] = fence;
2602 	}
2603 
2604 	if (fences) {
2605 		cf = dma_fence_array_create(number_tiles, fences,
2606 					    vm->composite_fence_ctx,
2607 					    vm->composite_fence_seqno++,
2608 					    false);
2609 		if (!cf) {
2610 			--vm->composite_fence_seqno;
2611 			fence = ERR_PTR(-ENOMEM);
2612 			goto err_out;
2613 		}
2614 		fence = &cf->base;
2615 	}
2616 
2617 	for_each_tile(tile, vm->xe, id) {
2618 		if (!vops->pt_update_ops[id].num_ops)
2619 			continue;
2620 
2621 		xe_pt_update_ops_fini(tile, vops);
2622 	}
2623 
2624 	return fence;
2625 
2626 err_out:
2627 	for_each_tile(tile, vm->xe, id) {
2628 		if (!vops->pt_update_ops[id].num_ops)
2629 			continue;
2630 
2631 		xe_pt_update_ops_abort(tile, vops);
2632 	}
2633 	while (current_fence)
2634 		dma_fence_put(fences[--current_fence]);
2635 	kfree(fences);
2636 	kfree(cf);
2637 
2638 err_trace:
2639 	trace_xe_vm_ops_fail(vm);
2640 	return fence;
2641 }
2642 
2643 static void vma_add_ufence(struct xe_vma *vma, struct xe_user_fence *ufence)
2644 {
2645 	if (vma->ufence)
2646 		xe_sync_ufence_put(vma->ufence);
2647 	vma->ufence = __xe_sync_ufence_get(ufence);
2648 }
2649 
2650 static void op_add_ufence(struct xe_vm *vm, struct xe_vma_op *op,
2651 			  struct xe_user_fence *ufence)
2652 {
2653 	switch (op->base.op) {
2654 	case DRM_GPUVA_OP_MAP:
2655 		vma_add_ufence(op->map.vma, ufence);
2656 		break;
2657 	case DRM_GPUVA_OP_REMAP:
2658 		if (op->remap.prev)
2659 			vma_add_ufence(op->remap.prev, ufence);
2660 		if (op->remap.next)
2661 			vma_add_ufence(op->remap.next, ufence);
2662 		break;
2663 	case DRM_GPUVA_OP_UNMAP:
2664 		break;
2665 	case DRM_GPUVA_OP_PREFETCH:
2666 		vma_add_ufence(gpuva_to_vma(op->base.prefetch.va), ufence);
2667 		break;
2668 	default:
2669 		drm_warn(&vm->xe->drm, "NOT POSSIBLE");
2670 	}
2671 }
2672 
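/*
 * Finalize an executed bind: attach the user fence (if any) to the
 * affected VMAs, schedule destruction of VMAs replaced by UNMAP/REMAP
 * for when @fence signals, signal the sync entries and record @fence as
 * the last fence of the wait exec queue.
 */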
2673 static void vm_bind_ioctl_ops_fini(struct xe_vm *vm, struct xe_vma_ops *vops,
2674 				   struct dma_fence *fence)
2675 {
2676 	struct xe_exec_queue *wait_exec_queue = to_wait_exec_queue(vm, vops->q);
2677 	struct xe_user_fence *ufence;
2678 	struct xe_vma_op *op;
2679 	int i;
2680 
2681 	ufence = find_ufence_get(vops->syncs, vops->num_syncs);
2682 	list_for_each_entry(op, &vops->list, link) {
2683 		if (ufence)
2684 			op_add_ufence(vm, op, ufence);
2685 
2686 		if (op->base.op == DRM_GPUVA_OP_UNMAP)
2687 			xe_vma_destroy(gpuva_to_vma(op->base.unmap.va), fence);
2688 		else if (op->base.op == DRM_GPUVA_OP_REMAP)
2689 			xe_vma_destroy(gpuva_to_vma(op->base.remap.unmap->va),
2690 				       fence);
2691 	}
2692 	if (ufence)
2693 		xe_sync_ufence_put(ufence);
2694 	for (i = 0; i < vops->num_syncs; i++)
2695 		xe_sync_entry_signal(vops->syncs + i, fence);
2696 	xe_exec_queue_last_fence_set(wait_exec_queue, vm, fence);
2697 	dma_fence_put(fence);
2698 }
2699 
2700 static int vm_bind_ioctl_ops_execute(struct xe_vm *vm,
2701 				     struct xe_vma_ops *vops)
2702 {
2703 	struct drm_exec exec;
2704 	struct dma_fence *fence;
2705 	int err;
2706 
2707 	lockdep_assert_held_write(&vm->lock);
2708 
2709 	drm_exec_init(&exec, DRM_EXEC_INTERRUPTIBLE_WAIT |
2710 		      DRM_EXEC_IGNORE_DUPLICATES, 0);
2711 	drm_exec_until_all_locked(&exec) {
2712 		err = vm_bind_ioctl_ops_lock_and_prep(&exec, vm, vops);
2713 		drm_exec_retry_on_contention(&exec);
2714 		if (err)
2715 			goto unlock;
2716 
2717 		fence = ops_execute(vm, vops);
2718 		if (IS_ERR(fence)) {
2719 			err = PTR_ERR(fence);
2720 			goto unlock;
2721 		}
2722 
2723 		vm_bind_ioctl_ops_fini(vm, vops, fence);
2724 	}
2725 
2726 unlock:
2727 	drm_exec_fini(&exec);
2728 	return err;
2729 }
2730 ALLOW_ERROR_INJECTION(vm_bind_ioctl_ops_execute, ERRNO);
2731 
2732 #define SUPPORTED_FLAGS_STUB  \
2733 	(DRM_XE_VM_BIND_FLAG_READONLY | \
2734 	 DRM_XE_VM_BIND_FLAG_IMMEDIATE | \
2735 	 DRM_XE_VM_BIND_FLAG_NULL | \
2736 	 DRM_XE_VM_BIND_FLAG_DUMPABLE)
2737 
2738 #ifdef TEST_VM_OPS_ERROR
2739 #define SUPPORTED_FLAGS	(SUPPORTED_FLAGS_STUB | FORCE_OP_ERROR)
2740 #else
2741 #define SUPPORTED_FLAGS	SUPPORTED_FLAGS_STUB
2742 #endif
2743 
2744 #define XE_64K_PAGE_MASK 0xffffull
2745 #define ALL_DRM_XE_SYNCS_FLAGS (DRM_XE_SYNCS_FLAG_WAIT_FOR_OP)
2746 
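/*
 * Validate the VM bind IOCTL arguments and, for multi-bind calls, copy
 * the array of bind operations from user space. Operation/flag
 * combinations, PAT indices, address and range alignment are all sanity
 * checked here before any VM state is touched.
 */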
2747 static int vm_bind_ioctl_check_args(struct xe_device *xe,
2748 				    struct drm_xe_vm_bind *args,
2749 				    struct drm_xe_vm_bind_op **bind_ops)
2750 {
2751 	int err;
2752 	int i;
2753 
2754 	if (XE_IOCTL_DBG(xe, args->pad || args->pad2) ||
2755 	    XE_IOCTL_DBG(xe, args->reserved[0] || args->reserved[1]))
2756 		return -EINVAL;
2757 
2758 	if (XE_IOCTL_DBG(xe, args->extensions))
2759 		return -EINVAL;
2760 
2761 	if (args->num_binds > 1) {
2762 		u64 __user *bind_user =
2763 			u64_to_user_ptr(args->vector_of_binds);
2764 
2765 		*bind_ops = kvmalloc_array(args->num_binds,
2766 					   sizeof(struct drm_xe_vm_bind_op),
2767 					   GFP_KERNEL | __GFP_ACCOUNT |
2768 					   __GFP_RETRY_MAYFAIL | __GFP_NOWARN);
2769 		if (!*bind_ops)
2770 			return args->num_binds > 1 ? -ENOBUFS : -ENOMEM;
2771 
2772 		err = __copy_from_user(*bind_ops, bind_user,
2773 				       sizeof(struct drm_xe_vm_bind_op) *
2774 				       args->num_binds);
2775 		if (XE_IOCTL_DBG(xe, err)) {
2776 			err = -EFAULT;
2777 			goto free_bind_ops;
2778 		}
2779 	} else {
2780 		*bind_ops = &args->bind;
2781 	}
2782 
2783 	for (i = 0; i < args->num_binds; ++i) {
2784 		u64 range = (*bind_ops)[i].range;
2785 		u64 addr = (*bind_ops)[i].addr;
2786 		u32 op = (*bind_ops)[i].op;
2787 		u32 flags = (*bind_ops)[i].flags;
2788 		u32 obj = (*bind_ops)[i].obj;
2789 		u64 obj_offset = (*bind_ops)[i].obj_offset;
2790 		u32 prefetch_region = (*bind_ops)[i].prefetch_mem_region_instance;
2791 		bool is_null = flags & DRM_XE_VM_BIND_FLAG_NULL;
2792 		u16 pat_index = (*bind_ops)[i].pat_index;
2793 		u16 coh_mode;
2794 
2795 		if (XE_IOCTL_DBG(xe, pat_index >= xe->pat.n_entries)) {
2796 			err = -EINVAL;
2797 			goto free_bind_ops;
2798 		}
2799 
2800 		pat_index = array_index_nospec(pat_index, xe->pat.n_entries);
2801 		(*bind_ops)[i].pat_index = pat_index;
2802 		coh_mode = xe_pat_index_get_coh_mode(xe, pat_index);
2803 		if (XE_IOCTL_DBG(xe, !coh_mode)) { /* hw reserved */
2804 			err = -EINVAL;
2805 			goto free_bind_ops;
2806 		}
2807 
2808 		if (XE_WARN_ON(coh_mode > XE_COH_AT_LEAST_1WAY)) {
2809 			err = -EINVAL;
2810 			goto free_bind_ops;
2811 		}
2812 
2813 		if (XE_IOCTL_DBG(xe, op > DRM_XE_VM_BIND_OP_PREFETCH) ||
2814 		    XE_IOCTL_DBG(xe, flags & ~SUPPORTED_FLAGS) ||
2815 		    XE_IOCTL_DBG(xe, obj && is_null) ||
2816 		    XE_IOCTL_DBG(xe, obj_offset && is_null) ||
2817 		    XE_IOCTL_DBG(xe, op != DRM_XE_VM_BIND_OP_MAP &&
2818 				 is_null) ||
2819 		    XE_IOCTL_DBG(xe, !obj &&
2820 				 op == DRM_XE_VM_BIND_OP_MAP &&
2821 				 !is_null) ||
2822 		    XE_IOCTL_DBG(xe, !obj &&
2823 				 op == DRM_XE_VM_BIND_OP_UNMAP_ALL) ||
2824 		    XE_IOCTL_DBG(xe, addr &&
2825 				 op == DRM_XE_VM_BIND_OP_UNMAP_ALL) ||
2826 		    XE_IOCTL_DBG(xe, range &&
2827 				 op == DRM_XE_VM_BIND_OP_UNMAP_ALL) ||
2828 		    XE_IOCTL_DBG(xe, obj &&
2829 				 op == DRM_XE_VM_BIND_OP_MAP_USERPTR) ||
2830 		    XE_IOCTL_DBG(xe, coh_mode == XE_COH_NONE &&
2831 				 op == DRM_XE_VM_BIND_OP_MAP_USERPTR) ||
2832 		    XE_IOCTL_DBG(xe, obj &&
2833 				 op == DRM_XE_VM_BIND_OP_PREFETCH) ||
2834 		    XE_IOCTL_DBG(xe, prefetch_region &&
2835 				 op != DRM_XE_VM_BIND_OP_PREFETCH) ||
2836 		    XE_IOCTL_DBG(xe, !(BIT(prefetch_region) &
2837 				       xe->info.mem_region_mask)) ||
2838 		    XE_IOCTL_DBG(xe, obj &&
2839 				 op == DRM_XE_VM_BIND_OP_UNMAP)) {
2840 			err = -EINVAL;
2841 			goto free_bind_ops;
2842 		}
2843 
2844 		if (XE_IOCTL_DBG(xe, obj_offset & ~PAGE_MASK) ||
2845 		    XE_IOCTL_DBG(xe, addr & ~PAGE_MASK) ||
2846 		    XE_IOCTL_DBG(xe, range & ~PAGE_MASK) ||
2847 		    XE_IOCTL_DBG(xe, !range &&
2848 				 op != DRM_XE_VM_BIND_OP_UNMAP_ALL)) {
2849 			err = -EINVAL;
2850 			goto free_bind_ops;
2851 		}
2852 	}
2853 
2854 	return 0;
2855 
2856 free_bind_ops:
2857 	if (args->num_binds > 1)
2858 		kvfree(*bind_ops);
2859 	return err;
2860 }
2861 
2862 static int vm_bind_ioctl_signal_fences(struct xe_vm *vm,
2863 				       struct xe_exec_queue *q,
2864 				       struct xe_sync_entry *syncs,
2865 				       int num_syncs)
2866 {
2867 	struct dma_fence *fence;
2868 	int i, err = 0;
2869 
2870 	fence = xe_sync_in_fence_get(syncs, num_syncs,
2871 				     to_wait_exec_queue(vm, q), vm);
2872 	if (IS_ERR(fence))
2873 		return PTR_ERR(fence);
2874 
2875 	for (i = 0; i < num_syncs; i++)
2876 		xe_sync_entry_signal(&syncs[i], fence);
2877 
2878 	xe_exec_queue_last_fence_set(to_wait_exec_queue(vm, q), vm,
2879 				     fence);
2880 	dma_fence_put(fence);
2881 
2882 	return err;
2883 }
2884 
2885 static void xe_vma_ops_init(struct xe_vma_ops *vops, struct xe_vm *vm,
2886 			    struct xe_exec_queue *q,
2887 			    struct xe_sync_entry *syncs, u32 num_syncs)
2888 {
2889 	memset(vops, 0, sizeof(*vops));
2890 	INIT_LIST_HEAD(&vops->list);
2891 	vops->vm = vm;
2892 	vops->q = q;
2893 	vops->syncs = syncs;
2894 	vops->num_syncs = num_syncs;
2895 }
2896 
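/*
 * Check a BO-backed bind against the BO itself: the range must fit
 * within the BO, 64k-placed BOs on platforms that need 64k VRAM
 * alignment must be bound at 64k-aligned offsets, and the PAT coherency
 * mode must be compatible with the BO's CPU caching mode.
 */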
2897 static int xe_vm_bind_ioctl_validate_bo(struct xe_device *xe, struct xe_bo *bo,
2898 					u64 addr, u64 range, u64 obj_offset,
2899 					u16 pat_index)
2900 {
2901 	u16 coh_mode;
2902 
2903 	if (XE_IOCTL_DBG(xe, range > bo->size) ||
2904 	    XE_IOCTL_DBG(xe, obj_offset >
2905 			 bo->size - range)) {
2906 		return -EINVAL;
2907 	}
2908 
2909 	/*
2910 	 * Some platforms require 64k VM_BIND alignment,
2911 	 * specifically those with XE_VRAM_FLAGS_NEED64K.
2912 	 *
2913 	 * Other platforms may have BOs placed at 64k granularity in
2914 	 * physical memory but still allow them to be mapped at 4k
2915 	 * offsets. This check only applies to the former case.
2916 	 */
2917 	if ((bo->flags & XE_BO_FLAG_INTERNAL_64K) &&
2918 	    (xe->info.vram_flags & XE_VRAM_FLAGS_NEED64K)) {
2919 		if (XE_IOCTL_DBG(xe, obj_offset &
2920 				 XE_64K_PAGE_MASK) ||
2921 		    XE_IOCTL_DBG(xe, addr & XE_64K_PAGE_MASK) ||
2922 		    XE_IOCTL_DBG(xe, range & XE_64K_PAGE_MASK)) {
2923 			return -EINVAL;
2924 		}
2925 	}
2926 
2927 	coh_mode = xe_pat_index_get_coh_mode(xe, pat_index);
2928 	if (bo->cpu_caching) {
2929 		if (XE_IOCTL_DBG(xe, coh_mode == XE_COH_NONE &&
2930 				 bo->cpu_caching == DRM_XE_GEM_CPU_CACHING_WB)) {
2931 			return -EINVAL;
2932 		}
2933 	} else if (XE_IOCTL_DBG(xe, coh_mode == XE_COH_NONE)) {
2934 		/*
2935 		 * A dma-buf imported from a different device should
2936 		 * require 1-way or 2-way coherency since we don't know
2937 		 * how it was mapped on the CPU. Just assume it is
2938 		 * potentially cached on the CPU side.
2939 		 */
2940 		return -EINVAL;
2941 	}
2942 
2943 	return 0;
2944 }
2945 
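/**
 * xe_vm_bind_ioctl() - Handle a VM bind IOCTL
 * @dev: DRM device
 * @data: Pointer to struct drm_xe_vm_bind
 * @file: DRM file
 *
 * Validates the arguments, looks up the exec queue, VM, BOs and syncs,
 * builds and parses the GPUVA operation lists for all binds and then
 * executes them. On failure any committed operations are unwound; when
 * there is nothing to do, the in-syncs are still waited on and the
 * out-syncs signalled.
 *
 * Return: 0 on success, negative error code on failure.
 */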
2946 int xe_vm_bind_ioctl(struct drm_device *dev, void *data, struct drm_file *file)
2947 {
2948 	struct xe_device *xe = to_xe_device(dev);
2949 	struct xe_file *xef = to_xe_file(file);
2950 	struct drm_xe_vm_bind *args = data;
2951 	struct drm_xe_sync __user *syncs_user;
2952 	struct xe_bo **bos = NULL;
2953 	struct drm_gpuva_ops **ops = NULL;
2954 	struct xe_vm *vm;
2955 	struct xe_exec_queue *q = NULL;
2956 	u32 num_syncs, num_ufence = 0;
2957 	struct xe_sync_entry *syncs = NULL;
2958 	struct drm_xe_vm_bind_op *bind_ops;
2959 	struct xe_vma_ops vops;
2960 	int err;
2961 	int i;
2962 
2963 	err = vm_bind_ioctl_check_args(xe, args, &bind_ops);
2964 	if (err)
2965 		return err;
2966 
2967 	if (args->exec_queue_id) {
2968 		q = xe_exec_queue_lookup(xef, args->exec_queue_id);
2969 		if (XE_IOCTL_DBG(xe, !q)) {
2970 			err = -ENOENT;
2971 			goto free_objs;
2972 		}
2973 
2974 		if (XE_IOCTL_DBG(xe, !(q->flags & EXEC_QUEUE_FLAG_VM))) {
2975 			err = -EINVAL;
2976 			goto put_exec_queue;
2977 		}
2978 	}
2979 
2980 	vm = xe_vm_lookup(xef, args->vm_id);
2981 	if (XE_IOCTL_DBG(xe, !vm)) {
2982 		err = -EINVAL;
2983 		goto put_exec_queue;
2984 	}
2985 
2986 	err = down_write_killable(&vm->lock);
2987 	if (err)
2988 		goto put_vm;
2989 
2990 	if (XE_IOCTL_DBG(xe, xe_vm_is_closed_or_banned(vm))) {
2991 		err = -ENOENT;
2992 		goto release_vm_lock;
2993 	}
2994 
2995 	for (i = 0; i < args->num_binds; ++i) {
2996 		u64 range = bind_ops[i].range;
2997 		u64 addr = bind_ops[i].addr;
2998 
2999 		if (XE_IOCTL_DBG(xe, range > vm->size) ||
3000 		    XE_IOCTL_DBG(xe, addr > vm->size - range)) {
3001 			err = -EINVAL;
3002 			goto release_vm_lock;
3003 		}
3004 	}
3005 
3006 	if (args->num_binds) {
3007 		bos = kvcalloc(args->num_binds, sizeof(*bos),
3008 			       GFP_KERNEL | __GFP_ACCOUNT |
3009 			       __GFP_RETRY_MAYFAIL | __GFP_NOWARN);
3010 		if (!bos) {
3011 			err = -ENOMEM;
3012 			goto release_vm_lock;
3013 		}
3014 
3015 		ops = kvcalloc(args->num_binds, sizeof(*ops),
3016 			       GFP_KERNEL | __GFP_ACCOUNT |
3017 			       __GFP_RETRY_MAYFAIL | __GFP_NOWARN);
3018 		if (!ops) {
3019 			err = -ENOMEM;
3020 			goto release_vm_lock;
3021 		}
3022 	}
3023 
3024 	for (i = 0; i < args->num_binds; ++i) {
3025 		struct drm_gem_object *gem_obj;
3026 		u64 range = bind_ops[i].range;
3027 		u64 addr = bind_ops[i].addr;
3028 		u32 obj = bind_ops[i].obj;
3029 		u64 obj_offset = bind_ops[i].obj_offset;
3030 		u16 pat_index = bind_ops[i].pat_index;
3031 
3032 		if (!obj)
3033 			continue;
3034 
3035 		gem_obj = drm_gem_object_lookup(file, obj);
3036 		if (XE_IOCTL_DBG(xe, !gem_obj)) {
3037 			err = -ENOENT;
3038 			goto put_obj;
3039 		}
3040 		bos[i] = gem_to_xe_bo(gem_obj);
3041 
3042 		err = xe_vm_bind_ioctl_validate_bo(xe, bos[i], addr, range,
3043 						   obj_offset, pat_index);
3044 		if (err)
3045 			goto put_obj;
3046 	}
3047 
3048 	if (args->num_syncs) {
3049 		syncs = kcalloc(args->num_syncs, sizeof(*syncs), GFP_KERNEL);
3050 		if (!syncs) {
3051 			err = -ENOMEM;
3052 			goto put_obj;
3053 		}
3054 	}
3055 
3056 	syncs_user = u64_to_user_ptr(args->syncs);
3057 	for (num_syncs = 0; num_syncs < args->num_syncs; num_syncs++) {
3058 		err = xe_sync_entry_parse(xe, xef, &syncs[num_syncs],
3059 					  &syncs_user[num_syncs],
3060 					  (xe_vm_in_lr_mode(vm) ?
3061 					   SYNC_PARSE_FLAG_LR_MODE : 0) |
3062 					  (!args->num_binds ?
3063 					   SYNC_PARSE_FLAG_DISALLOW_USER_FENCE : 0));
3064 		if (err)
3065 			goto free_syncs;
3066 
3067 		if (xe_sync_is_ufence(&syncs[num_syncs]))
3068 			num_ufence++;
3069 	}
3070 
3071 	if (XE_IOCTL_DBG(xe, num_ufence > 1)) {
3072 		err = -EINVAL;
3073 		goto free_syncs;
3074 	}
3075 
3076 	if (!args->num_binds) {
3077 		err = -ENODATA;
3078 		goto free_syncs;
3079 	}
3080 
3081 	xe_vma_ops_init(&vops, vm, q, syncs, num_syncs);
3082 	for (i = 0; i < args->num_binds; ++i) {
3083 		u64 range = bind_ops[i].range;
3084 		u64 addr = bind_ops[i].addr;
3085 		u32 op = bind_ops[i].op;
3086 		u32 flags = bind_ops[i].flags;
3087 		u64 obj_offset = bind_ops[i].obj_offset;
3088 		u32 prefetch_region = bind_ops[i].prefetch_mem_region_instance;
3089 		u16 pat_index = bind_ops[i].pat_index;
3090 
3091 		ops[i] = vm_bind_ioctl_ops_create(vm, bos[i], obj_offset,
3092 						  addr, range, op, flags,
3093 						  prefetch_region, pat_index);
3094 		if (IS_ERR(ops[i])) {
3095 			err = PTR_ERR(ops[i]);
3096 			ops[i] = NULL;
3097 			goto unwind_ops;
3098 		}
3099 
3100 		err = vm_bind_ioctl_ops_parse(vm, ops[i], &vops);
3101 		if (err)
3102 			goto unwind_ops;
3103 
3104 #ifdef TEST_VM_OPS_ERROR
3105 		if (flags & FORCE_OP_ERROR) {
3106 			vops.inject_error = true;
3107 			vm->xe->vm_inject_error_position =
3108 				(vm->xe->vm_inject_error_position + 1) %
3109 				FORCE_OP_ERROR_COUNT;
3110 		}
3111 #endif
3112 	}
3113 
3114 	/* Nothing to do */
3115 	if (list_empty(&vops.list)) {
3116 		err = -ENODATA;
3117 		goto unwind_ops;
3118 	}
3119 
3120 	err = xe_vma_ops_alloc(&vops, args->num_binds > 1);
3121 	if (err)
3122 		goto unwind_ops;
3123 
3124 	err = vm_bind_ioctl_ops_execute(vm, &vops);
3125 
3126 unwind_ops:
3127 	if (err && err != -ENODATA)
3128 		vm_bind_ioctl_ops_unwind(vm, ops, args->num_binds);
3129 	xe_vma_ops_fini(&vops);
3130 	for (i = args->num_binds - 1; i >= 0; --i)
3131 		if (ops[i])
3132 			drm_gpuva_ops_free(&vm->gpuvm, ops[i]);
3133 free_syncs:
3134 	if (err == -ENODATA)
3135 		err = vm_bind_ioctl_signal_fences(vm, q, syncs, num_syncs);
3136 	while (num_syncs--)
3137 		xe_sync_entry_cleanup(&syncs[num_syncs]);
3138 
3139 	kfree(syncs);
3140 put_obj:
3141 	for (i = 0; i < args->num_binds; ++i)
3142 		xe_bo_put(bos[i]);
3143 release_vm_lock:
3144 	up_write(&vm->lock);
3145 put_vm:
3146 	xe_vm_put(vm);
3147 put_exec_queue:
3148 	if (q)
3149 		xe_exec_queue_put(q);
3150 free_objs:
3151 	kvfree(bos);
3152 	kvfree(ops);
3153 	if (args->num_binds > 1)
3154 		kvfree(bind_ops);
3155 	return err;
3156 }
3157 
3158 /**
3159  * xe_vm_lock() - Lock the vm's dma_resv object
3160  * @vm: The struct xe_vm whose lock is to be locked
3161  * @vm: The struct xe_vm whose dma_resv is to be locked
3162  * @intr: Whether to perform any waits interruptibly
3163  * Return: 0 on success, -EINTR if @intr is true and the wait for a
3164  * contended lock was interrupted. If @intr is false, the function
3165  * always returns 0.
3166  */
3167 int xe_vm_lock(struct xe_vm *vm, bool intr)
3168 {
3169 	if (intr)
3170 		return dma_resv_lock_interruptible(xe_vm_resv(vm), NULL);
3171 
3172 	return dma_resv_lock(xe_vm_resv(vm), NULL);
3173 }
3174 
3175 /**
3176  * xe_vm_unlock() - Unlock the vm's dma_resv object
3177  * @vm: The struct xe_vm whose lock is to be released.
3178  *
3179  * Unlock the vm's dma_resv object that was previously locked by xe_vm_lock().
3180  */
3181 void xe_vm_unlock(struct xe_vm *vm)
3182 {
3183 	dma_resv_unlock(xe_vm_resv(vm));
3184 }
3185 
3186 /**
3187  * xe_vm_invalidate_vma - invalidate GPU mappings for VMA without a lock
3188  * @vma: VMA to invalidate
3189  *
3190  * Walks the page-table leaves, zeroes the entries owned by this VMA,
3191  * invalidates the TLBs and blocks until the TLB invalidation is
3192  * complete.
3193  *
3194  * Returns 0 for success, negative error code otherwise.
3195  * Return: 0 for success, negative error code otherwise.
3196 int xe_vm_invalidate_vma(struct xe_vma *vma)
3197 {
3198 	struct xe_device *xe = xe_vma_vm(vma)->xe;
3199 	struct xe_tile *tile;
3200 	struct xe_gt_tlb_invalidation_fence
3201 		fence[XE_MAX_TILES_PER_DEVICE * XE_MAX_GT_PER_TILE];
3202 	u8 id;
3203 	u32 fence_id = 0;
3204 	int ret = 0;
3205 
3206 	xe_assert(xe, !xe_vma_is_null(vma));
3207 	trace_xe_vma_invalidate(vma);
3208 
3209 	vm_dbg(&xe_vma_vm(vma)->xe->drm,
3210 	       "INVALIDATE: addr=0x%016llx, range=0x%016llx",
3211 		xe_vma_start(vma), xe_vma_size(vma));
3212 
3213 	/* Check that we don't race with page-table updates */
3214 	if (IS_ENABLED(CONFIG_PROVE_LOCKING)) {
3215 		if (xe_vma_is_userptr(vma)) {
3216 			WARN_ON_ONCE(!mmu_interval_check_retry
3217 				     (&to_userptr_vma(vma)->userptr.notifier,
3218 				      to_userptr_vma(vma)->userptr.notifier_seq));
3219 			WARN_ON_ONCE(!dma_resv_test_signaled(xe_vm_resv(xe_vma_vm(vma)),
3220 							     DMA_RESV_USAGE_BOOKKEEP));
3221 
3222 		} else {
3223 			xe_bo_assert_held(xe_vma_bo(vma));
3224 		}
3225 	}
3226 
3227 	for_each_tile(tile, xe, id) {
3228 		if (xe_pt_zap_ptes(tile, vma)) {
3229 			xe_device_wmb(xe);
3230 			xe_gt_tlb_invalidation_fence_init(tile->primary_gt,
3231 							  &fence[fence_id],
3232 							  true);
3233 
3234 			ret = xe_gt_tlb_invalidation_vma(tile->primary_gt,
3235 							 &fence[fence_id], vma);
3236 			if (ret)
3237 				goto wait;
3238 			++fence_id;
3239 
3240 			if (!tile->media_gt)
3241 				continue;
3242 
3243 			xe_gt_tlb_invalidation_fence_init(tile->media_gt,
3244 							  &fence[fence_id],
3245 							  true);
3246 
3247 			ret = xe_gt_tlb_invalidation_vma(tile->media_gt,
3248 							 &fence[fence_id], vma);
3249 			if (ret)
3250 				goto wait;
3251 			++fence_id;
3252 		}
3253 	}
3254 
3255 wait:
3256 	for (id = 0; id < fence_id; ++id)
3257 		xe_gt_tlb_invalidation_fence_wait(&fence[id]);
3258 
3259 	vma->tile_invalidated = vma->tile_mask;
3260 
3261 	return ret;
3262 }
3263 
3264 struct xe_vm_snapshot {
3265 	unsigned long num_snaps;
3266 	struct {
3267 		u64 ofs, bo_ofs;
3268 		unsigned long len;
3269 		struct xe_bo *bo;
3270 		void *data;
3271 		struct mm_struct *mm;
3272 	} snap[];
3273 };
3274 
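/**
 * xe_vm_snapshot_capture() - Capture the dumpable VMAs of a VM
 * @vm: The VM to snapshot, may be NULL.
 *
 * Records the address, size and backing (BO reference or userptr mm
 * reference) of every VMA marked dumpable. The contents are copied
 * later by xe_vm_snapshot_capture_delayed().
 *
 * Return: Pointer to the snapshot, an ERR_PTR on failure or when there
 * is nothing to capture, or NULL if @vm is NULL.
 */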
3275 struct xe_vm_snapshot *xe_vm_snapshot_capture(struct xe_vm *vm)
3276 {
3277 	unsigned long num_snaps = 0, i;
3278 	struct xe_vm_snapshot *snap = NULL;
3279 	struct drm_gpuva *gpuva;
3280 
3281 	if (!vm)
3282 		return NULL;
3283 
3284 	mutex_lock(&vm->snap_mutex);
3285 	drm_gpuvm_for_each_va(gpuva, &vm->gpuvm) {
3286 		if (gpuva->flags & XE_VMA_DUMPABLE)
3287 			num_snaps++;
3288 	}
3289 
3290 	if (num_snaps)
3291 		snap = kvzalloc(offsetof(struct xe_vm_snapshot, snap[num_snaps]), GFP_NOWAIT);
3292 	if (!snap) {
3293 		snap = num_snaps ? ERR_PTR(-ENOMEM) : ERR_PTR(-ENODEV);
3294 		goto out_unlock;
3295 	}
3296 
3297 	snap->num_snaps = num_snaps;
3298 	i = 0;
3299 	drm_gpuvm_for_each_va(gpuva, &vm->gpuvm) {
3300 		struct xe_vma *vma = gpuva_to_vma(gpuva);
3301 		struct xe_bo *bo = vma->gpuva.gem.obj ?
3302 			gem_to_xe_bo(vma->gpuva.gem.obj) : NULL;
3303 
3304 		if (!(gpuva->flags & XE_VMA_DUMPABLE))
3305 			continue;
3306 
3307 		snap->snap[i].ofs = xe_vma_start(vma);
3308 		snap->snap[i].len = xe_vma_size(vma);
3309 		if (bo) {
3310 			snap->snap[i].bo = xe_bo_get(bo);
3311 			snap->snap[i].bo_ofs = xe_vma_bo_offset(vma);
3312 		} else if (xe_vma_is_userptr(vma)) {
3313 			struct mm_struct *mm =
3314 				to_userptr_vma(vma)->userptr.notifier.mm;
3315 
3316 			if (mmget_not_zero(mm))
3317 				snap->snap[i].mm = mm;
3318 			else
3319 				snap->snap[i].data = ERR_PTR(-EFAULT);
3320 
3321 			snap->snap[i].bo_ofs = xe_vma_userptr(vma);
3322 		} else {
3323 			snap->snap[i].data = ERR_PTR(-ENOENT);
3324 		}
3325 		i++;
3326 	}
3327 
3328 out_unlock:
3329 	mutex_unlock(&vm->snap_mutex);
3330 	return snap;
3331 }
3332 
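/**
 * xe_vm_snapshot_capture_delayed() - Copy the contents of a VM snapshot
 * @snap: The snapshot to fill in, may be an ERR_PTR or NULL.
 *
 * Allocates a buffer for each captured range and copies its data from
 * either the backing BO or the userptr's mm, dropping the references
 * taken at capture time. Ranges that cannot be read have their data
 * pointer replaced with an ERR_PTR.
 */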
3333 void xe_vm_snapshot_capture_delayed(struct xe_vm_snapshot *snap)
3334 {
3335 	if (IS_ERR_OR_NULL(snap))
3336 		return;
3337 
3338 	for (int i = 0; i < snap->num_snaps; i++) {
3339 		struct xe_bo *bo = snap->snap[i].bo;
3340 		int err;
3341 
3342 		if (IS_ERR(snap->snap[i].data))
3343 			continue;
3344 
3345 		snap->snap[i].data = kvmalloc(snap->snap[i].len, GFP_USER);
3346 		if (!snap->snap[i].data) {
3347 			snap->snap[i].data = ERR_PTR(-ENOMEM);
3348 			goto cleanup_bo;
3349 		}
3350 
3351 		if (bo) {
3352 			err = xe_bo_read(bo, snap->snap[i].bo_ofs,
3353 					 snap->snap[i].data, snap->snap[i].len);
3354 		} else {
3355 			void __user *userptr = (void __user *)(size_t)snap->snap[i].bo_ofs;
3356 
3357 			kthread_use_mm(snap->snap[i].mm);
3358 			if (!copy_from_user(snap->snap[i].data, userptr, snap->snap[i].len))
3359 				err = 0;
3360 			else
3361 				err = -EFAULT;
3362 			kthread_unuse_mm(snap->snap[i].mm);
3363 
3364 			mmput(snap->snap[i].mm);
3365 			snap->snap[i].mm = NULL;
3366 		}
3367 
3368 		if (err) {
3369 			kvfree(snap->snap[i].data);
3370 			snap->snap[i].data = ERR_PTR(err);
3371 		}
3372 
3373 cleanup_bo:
3374 		xe_bo_put(bo);
3375 		snap->snap[i].bo = NULL;
3376 	}
3377 }
3378 
3379 void xe_vm_snapshot_print(struct xe_vm_snapshot *snap, struct drm_printer *p)
3380 {
3381 	unsigned long i, j;
3382 
3383 	if (IS_ERR_OR_NULL(snap)) {
3384 		drm_printf(p, "[0].error: %li\n", PTR_ERR(snap));
3385 		return;
3386 	}
3387 
3388 	for (i = 0; i < snap->num_snaps; i++) {
3389 		drm_printf(p, "[%llx].length: 0x%lx\n", snap->snap[i].ofs, snap->snap[i].len);
3390 
3391 		if (IS_ERR(snap->snap[i].data)) {
3392 			drm_printf(p, "[%llx].error: %li\n", snap->snap[i].ofs,
3393 				   PTR_ERR(snap->snap[i].data));
3394 			continue;
3395 		}
3396 
3397 		drm_printf(p, "[%llx].data: ", snap->snap[i].ofs);
3398 
3399 		for (j = 0; j < snap->snap[i].len; j += sizeof(u32)) {
3400 			u32 *val = snap->snap[i].data + j;
3401 			char dumped[ASCII85_BUFSZ];
3402 
3403 			drm_puts(p, ascii85_encode(*val, dumped));
3404 		}
3405 
3406 		drm_puts(p, "\n");
3407 	}
3408 }
3409 
3410 void xe_vm_snapshot_free(struct xe_vm_snapshot *snap)
3411 {
3412 	unsigned long i;
3413 
3414 	if (IS_ERR_OR_NULL(snap))
3415 		return;
3416 
3417 	for (i = 0; i < snap->num_snaps; i++) {
3418 		if (!IS_ERR(snap->snap[i].data))
3419 			kvfree(snap->snap[i].data);
3420 		xe_bo_put(snap->snap[i].bo);
3421 		if (snap->snap[i].mm)
3422 			mmput(snap->snap[i].mm);
3423 	}
3424 	kvfree(snap);
3425 }
3426