xref: /linux/drivers/gpu/drm/xe/xe_vm.c (revision 8cdcef1c2f82d207aa8b2a02298fbc17191c6261)
1 // SPDX-License-Identifier: MIT
2 /*
3  * Copyright © 2021 Intel Corporation
4  */
5 
6 #include "xe_vm.h"
7 
8 #include <linux/dma-fence-array.h>
9 #include <linux/nospec.h>
10 
11 #include <drm/drm_exec.h>
12 #include <drm/drm_print.h>
13 #include <drm/ttm/ttm_execbuf_util.h>
14 #include <drm/ttm/ttm_tt.h>
15 #include <drm/xe_drm.h>
16 #include <linux/delay.h>
17 #include <linux/kthread.h>
18 #include <linux/mm.h>
19 #include <linux/swap.h>
20 
21 #include "xe_assert.h"
22 #include "xe_bo.h"
23 #include "xe_device.h"
24 #include "xe_drm_client.h"
25 #include "xe_exec_queue.h"
26 #include "xe_gt.h"
27 #include "xe_gt_pagefault.h"
28 #include "xe_gt_tlb_invalidation.h"
29 #include "xe_migrate.h"
30 #include "xe_pat.h"
31 #include "xe_pm.h"
32 #include "xe_preempt_fence.h"
33 #include "xe_pt.h"
34 #include "xe_res_cursor.h"
35 #include "xe_sync.h"
36 #include "xe_trace.h"
37 #include "generated/xe_wa_oob.h"
38 #include "xe_wa.h"
39 
40 #define TEST_VM_ASYNC_OPS_ERROR
41 
42 static struct drm_gem_object *xe_vm_obj(struct xe_vm *vm)
43 {
44 	return vm->gpuvm.r_obj;
45 }
46 
47 /**
48  * xe_vma_userptr_check_repin() - Advisory check for repin needed
49  * @vma: The userptr vma
50  *
51  * Check if the userptr vma has been invalidated since last successful
52  * repin. The check is advisory only and the function can be called
53  * without the vm->userptr.notifier_lock held. There is no guarantee that the
54  * vma userptr will remain valid after a lockless check, so typically
55  * the call needs to be followed by a proper check under the notifier_lock.
56  *
57  * Return: 0 if userptr vma is valid, -EAGAIN otherwise; repin recommended.
58  */
59 int xe_vma_userptr_check_repin(struct xe_vma *vma)
60 {
61 	return mmu_interval_check_retry(&vma->userptr.notifier,
62 					vma->userptr.notifier_seq) ?
63 		-EAGAIN : 0;
64 }
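/*
 * Example (editor's sketch, not upstream code): the advisory check above is
 * typically paired with a repin and a re-check under the notifier lock
 * before the result is relied upon, e.g. (the retry label and err handling
 * in the caller are hypothetical):
 *
 *	if (xe_vma_userptr_check_repin(vma) == -EAGAIN) {
 *		err = xe_vma_userptr_pin_pages(vma);
 *		if (err)
 *			return err;
 *	}
 *
 *	down_read(&vm->userptr.notifier_lock);
 *	if (xe_vma_userptr_check_repin(vma) == -EAGAIN) {
 *		up_read(&vm->userptr.notifier_lock);
 *		goto retry;
 *	}
 *	... install fences / submit while holding the notifier lock ...
 *	up_read(&vm->userptr.notifier_lock);
 */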
65 
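/**
 * xe_vma_userptr_pin_pages() - Map the current user pages of a userptr vma
 * @vma: The userptr vma
 *
 * Look up the user pages backing @vma with get_user_pages_fast(), build an
 * sg table for them and dma-map it, replacing any previous mapping, and
 * record the notifier sequence number. The page references are dropped
 * again before returning; the MMU notifier is relied upon for coherency.
 * Retries internally if the range is invalidated while this runs. Must be
 * called with the vm->lock held.
 *
 * Return: 0 on success, negative error code on failure.
 */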
66 int xe_vma_userptr_pin_pages(struct xe_vma *vma)
67 {
68 	struct xe_vm *vm = xe_vma_vm(vma);
69 	struct xe_device *xe = vm->xe;
70 	const unsigned long num_pages = xe_vma_size(vma) >> PAGE_SHIFT;
71 	struct page **pages;
72 	bool in_kthread = !current->mm;
73 	unsigned long notifier_seq;
74 	int pinned, ret, i;
75 	bool read_only = xe_vma_read_only(vma);
76 
77 	lockdep_assert_held(&vm->lock);
78 	xe_assert(xe, xe_vma_is_userptr(vma));
79 retry:
80 	if (vma->gpuva.flags & XE_VMA_DESTROYED)
81 		return 0;
82 
83 	notifier_seq = mmu_interval_read_begin(&vma->userptr.notifier);
84 	if (notifier_seq == vma->userptr.notifier_seq)
85 		return 0;
86 
87 	pages = kvmalloc_array(num_pages, sizeof(*pages), GFP_KERNEL);
88 	if (!pages)
89 		return -ENOMEM;
90 
91 	if (vma->userptr.sg) {
92 		dma_unmap_sgtable(xe->drm.dev,
93 				  vma->userptr.sg,
94 				  read_only ? DMA_TO_DEVICE :
95 				  DMA_BIDIRECTIONAL, 0);
96 		sg_free_table(vma->userptr.sg);
97 		vma->userptr.sg = NULL;
98 	}
99 
100 	pinned = ret = 0;
101 	if (in_kthread) {
102 		if (!mmget_not_zero(vma->userptr.notifier.mm)) {
103 			ret = -EFAULT;
104 			goto mm_closed;
105 		}
106 		kthread_use_mm(vma->userptr.notifier.mm);
107 	}
108 
109 	while (pinned < num_pages) {
110 		ret = get_user_pages_fast(xe_vma_userptr(vma) +
111 					  pinned * PAGE_SIZE,
112 					  num_pages - pinned,
113 					  read_only ? 0 : FOLL_WRITE,
114 					  &pages[pinned]);
115 		if (ret < 0) {
116 			if (in_kthread)
117 				ret = 0;
118 			break;
119 		}
120 
121 		pinned += ret;
122 		ret = 0;
123 	}
124 
125 	if (in_kthread) {
126 		kthread_unuse_mm(vma->userptr.notifier.mm);
127 		mmput(vma->userptr.notifier.mm);
128 	}
129 mm_closed:
130 	if (ret)
131 		goto out;
132 
133 	ret = sg_alloc_table_from_pages_segment(&vma->userptr.sgt, pages,
134 						pinned, 0,
135 						(u64)pinned << PAGE_SHIFT,
136 						xe_sg_segment_size(xe->drm.dev),
137 						GFP_KERNEL);
138 	if (ret) {
139 		vma->userptr.sg = NULL;
140 		goto out;
141 	}
142 	vma->userptr.sg = &vma->userptr.sgt;
143 
144 	ret = dma_map_sgtable(xe->drm.dev, vma->userptr.sg,
145 			      read_only ? DMA_TO_DEVICE :
146 			      DMA_BIDIRECTIONAL,
147 			      DMA_ATTR_SKIP_CPU_SYNC |
148 			      DMA_ATTR_NO_KERNEL_MAPPING);
149 	if (ret) {
150 		sg_free_table(vma->userptr.sg);
151 		vma->userptr.sg = NULL;
152 		goto out;
153 	}
154 
155 	for (i = 0; i < pinned; ++i) {
156 		if (!read_only) {
157 			lock_page(pages[i]);
158 			set_page_dirty(pages[i]);
159 			unlock_page(pages[i]);
160 		}
161 
162 		mark_page_accessed(pages[i]);
163 	}
164 
165 out:
166 	release_pages(pages, pinned);
167 	kvfree(pages);
168 
169 	if (!(ret < 0)) {
170 		vma->userptr.notifier_seq = notifier_seq;
171 		if (xe_vma_userptr_check_repin(vma) == -EAGAIN)
172 			goto retry;
173 	}
174 
175 	return ret < 0 ? ret : 0;
176 }
177 
178 static bool preempt_fences_waiting(struct xe_vm *vm)
179 {
180 	struct xe_exec_queue *q;
181 
182 	lockdep_assert_held(&vm->lock);
183 	xe_vm_assert_held(vm);
184 
185 	list_for_each_entry(q, &vm->preempt.exec_queues, compute.link) {
186 		if (!q->compute.pfence ||
187 		    test_bit(DMA_FENCE_FLAG_ENABLE_SIGNAL_BIT,
188 			     &q->compute.pfence->flags)) {
189 			return true;
190 		}
191 	}
192 
193 	return false;
194 }
195 
196 static void free_preempt_fences(struct list_head *list)
197 {
198 	struct list_head *link, *next;
199 
200 	list_for_each_safe(link, next, list)
201 		xe_preempt_fence_free(to_preempt_fence_from_link(link));
202 }
203 
204 static int alloc_preempt_fences(struct xe_vm *vm, struct list_head *list,
205 				unsigned int *count)
206 {
207 	lockdep_assert_held(&vm->lock);
208 	xe_vm_assert_held(vm);
209 
210 	if (*count >= vm->preempt.num_exec_queues)
211 		return 0;
212 
213 	for (; *count < vm->preempt.num_exec_queues; ++(*count)) {
214 		struct xe_preempt_fence *pfence = xe_preempt_fence_alloc();
215 
216 		if (IS_ERR(pfence))
217 			return PTR_ERR(pfence);
218 
219 		list_move_tail(xe_preempt_fence_link(pfence), list);
220 	}
221 
222 	return 0;
223 }
224 
225 static int wait_for_existing_preempt_fences(struct xe_vm *vm)
226 {
227 	struct xe_exec_queue *q;
228 
229 	xe_vm_assert_held(vm);
230 
231 	list_for_each_entry(q, &vm->preempt.exec_queues, compute.link) {
232 		if (q->compute.pfence) {
233 			long timeout = dma_fence_wait(q->compute.pfence, false);
234 
235 			if (timeout < 0)
236 				return -ETIME;
237 			dma_fence_put(q->compute.pfence);
238 			q->compute.pfence = NULL;
239 		}
240 	}
241 
242 	return 0;
243 }
244 
245 static bool xe_vm_is_idle(struct xe_vm *vm)
246 {
247 	struct xe_exec_queue *q;
248 
249 	xe_vm_assert_held(vm);
250 	list_for_each_entry(q, &vm->preempt.exec_queues, compute.link) {
251 		if (!xe_exec_queue_is_idle(q))
252 			return false;
253 	}
254 
255 	return true;
256 }
257 
258 static void arm_preempt_fences(struct xe_vm *vm, struct list_head *list)
259 {
260 	struct list_head *link;
261 	struct xe_exec_queue *q;
262 
263 	list_for_each_entry(q, &vm->preempt.exec_queues, compute.link) {
264 		struct dma_fence *fence;
265 
266 		link = list->next;
267 		xe_assert(vm->xe, link != list);
268 
269 		fence = xe_preempt_fence_arm(to_preempt_fence_from_link(link),
270 					     q, q->compute.context,
271 					     ++q->compute.seqno);
272 		dma_fence_put(q->compute.pfence);
273 		q->compute.pfence = fence;
274 	}
275 }
276 
277 static int add_preempt_fences(struct xe_vm *vm, struct xe_bo *bo)
278 {
279 	struct xe_exec_queue *q;
280 	int err;
281 
282 	err = xe_bo_lock(bo, true);
283 	if (err)
284 		return err;
285 
286 	err = dma_resv_reserve_fences(bo->ttm.base.resv, vm->preempt.num_exec_queues);
287 	if (err)
288 		goto out_unlock;
289 
290 	list_for_each_entry(q, &vm->preempt.exec_queues, compute.link)
291 		if (q->compute.pfence) {
292 			dma_resv_add_fence(bo->ttm.base.resv,
293 					   q->compute.pfence,
294 					   DMA_RESV_USAGE_BOOKKEEP);
295 		}
296 
297 out_unlock:
298 	xe_bo_unlock(bo);
299 	return err;
300 }
301 
302 /**
303  * xe_vm_fence_all_extobjs() - Add a fence to vm's external objects' resv
304  * @vm: The vm.
305  * @fence: The fence to add.
306  * @usage: The resv usage for the fence.
307  *
308  * Loops over all of the vm's external object bindings and adds a @fence
309  * with the given @usage to all of the external object's reservation
310  * with the given @usage to all of the external objects' reservation
311  */
312 void xe_vm_fence_all_extobjs(struct xe_vm *vm, struct dma_fence *fence,
313 			     enum dma_resv_usage usage)
314 {
315 	struct xe_vma *vma;
316 
317 	list_for_each_entry(vma, &vm->extobj.list, extobj.link)
318 		dma_resv_add_fence(xe_vma_bo(vma)->ttm.base.resv, fence, usage);
319 }
320 
321 static void resume_and_reinstall_preempt_fences(struct xe_vm *vm)
322 {
323 	struct xe_exec_queue *q;
324 
325 	lockdep_assert_held(&vm->lock);
326 	xe_vm_assert_held(vm);
327 
328 	list_for_each_entry(q, &vm->preempt.exec_queues, compute.link) {
329 		q->ops->resume(q);
330 
331 		dma_resv_add_fence(xe_vm_resv(vm), q->compute.pfence,
332 				   DMA_RESV_USAGE_BOOKKEEP);
333 		xe_vm_fence_all_extobjs(vm, q->compute.pfence,
334 					DMA_RESV_USAGE_BOOKKEEP);
335 	}
336 }
337 
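/**
 * xe_vm_add_compute_exec_queue() - Add a compute exec queue to the VM
 * @vm: The VM.
 * @q: The exec_queue.
 *
 * Create a preempt fence for @q, install it in the VM's and its external
 * objects' reservation objects and add @q to the VM's list of preempt fence
 * mode exec queues.
 *
 * Return: 0 on success, negative error code on error.
 */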
338 int xe_vm_add_compute_exec_queue(struct xe_vm *vm, struct xe_exec_queue *q)
339 {
340 	struct drm_exec exec;
341 	struct dma_fence *pfence;
342 	int err;
343 	bool wait;
344 
345 	xe_assert(vm->xe, xe_vm_in_preempt_fence_mode(vm));
346 
347 	down_write(&vm->lock);
348 	drm_exec_init(&exec, DRM_EXEC_INTERRUPTIBLE_WAIT);
349 	drm_exec_until_all_locked(&exec) {
350 		err = xe_vm_lock_dma_resv(vm, &exec, 1, true);
351 		drm_exec_retry_on_contention(&exec);
352 		if (err)
353 			goto out_unlock;
354 	}
355 
356 	pfence = xe_preempt_fence_create(q, q->compute.context,
357 					 ++q->compute.seqno);
358 	if (!pfence) {
359 		err = -ENOMEM;
360 		goto out_unlock;
361 	}
362 
363 	list_add(&q->compute.link, &vm->preempt.exec_queues);
364 	++vm->preempt.num_exec_queues;
365 	q->compute.pfence = pfence;
366 
367 	down_read(&vm->userptr.notifier_lock);
368 
369 	dma_resv_add_fence(xe_vm_resv(vm), pfence,
370 			   DMA_RESV_USAGE_BOOKKEEP);
371 
372 	xe_vm_fence_all_extobjs(vm, pfence, DMA_RESV_USAGE_BOOKKEEP);
373 
374 	/*
375 	 * Check whether a preemption on the VM or a userptr invalidation is
376 	 * in flight; if so, trigger this preempt fence to sync state with the
377 	 * other preempt fences on the VM.
378 	 */
379 	wait = __xe_vm_userptr_needs_repin(vm) || preempt_fences_waiting(vm);
380 	if (wait)
381 		dma_fence_enable_sw_signaling(pfence);
382 
383 	up_read(&vm->userptr.notifier_lock);
384 
385 out_unlock:
386 	drm_exec_fini(&exec);
387 	up_write(&vm->lock);
388 
389 	return err;
390 }
391 
392 /**
393  * xe_vm_remove_compute_exec_queue() - Remove compute exec queue from VM
394  * @vm: The VM.
395  * @q: The exec_queue
396  */
397 void xe_vm_remove_compute_exec_queue(struct xe_vm *vm, struct xe_exec_queue *q)
398 {
399 	if (!xe_vm_in_preempt_fence_mode(vm))
400 		return;
401 
402 	down_write(&vm->lock);
403 	list_del(&q->compute.link);
404 	--vm->preempt.num_exec_queues;
405 	if (q->compute.pfence) {
406 		dma_fence_enable_sw_signaling(q->compute.pfence);
407 		dma_fence_put(q->compute.pfence);
408 		q->compute.pfence = NULL;
409 	}
410 	up_write(&vm->lock);
411 }
412 
413 /**
414  * __xe_vm_userptr_needs_repin() - Check whether the VM does have userptrs
415  * that need repinning.
416  * @vm: The VM.
417  *
418  * This function checks whether the VM has userptrs that need repinning,
419  * and provides a release-type barrier on the userptr.notifier_lock after
420  * checking.
421  *
422  * Return: 0 if there are no userptrs needing repinning, -EAGAIN if there are.
423  */
424 int __xe_vm_userptr_needs_repin(struct xe_vm *vm)
425 {
426 	lockdep_assert_held_read(&vm->userptr.notifier_lock);
427 
428 	return (list_empty(&vm->userptr.repin_list) &&
429 		list_empty(&vm->userptr.invalidated)) ? 0 : -EAGAIN;
430 }
431 
432 /**
433  * xe_vm_lock_dma_resv() - Lock the vm dma_resv object and the dma_resv
434  * objects of the vm's external buffer objects.
435  * @vm: The vm.
436  * @exec: Pointer to a struct drm_exec locking context.
437  * @num_shared: Number of dma-fence slots to reserve in the locked objects.
438  * @lock_vm: Lock also the vm's dma_resv.
439  *
440  * Locks the vm dma-resv objects and all the dma-resv objects of the
441  * buffer objects on the vm external object list.
442  *
443  * Return: 0 on success, negative error code on error. In particular, with an
444  * interruptible drm_exec context, -EINTR or -ERESTARTSYS may be returned.
445  */
446 int xe_vm_lock_dma_resv(struct xe_vm *vm, struct drm_exec *exec,
447 			unsigned int num_shared, bool lock_vm)
448 {
449 	struct xe_vma *vma, *next;
450 	int err = 0;
451 
452 	lockdep_assert_held(&vm->lock);
453 
454 	if (lock_vm) {
455 		err = drm_exec_prepare_obj(exec, xe_vm_obj(vm), num_shared);
456 		if (err)
457 			return err;
458 	}
459 
460 	list_for_each_entry(vma, &vm->extobj.list, extobj.link) {
461 		err = drm_exec_prepare_obj(exec, &xe_vma_bo(vma)->ttm.base, num_shared);
462 		if (err)
463 			return err;
464 	}
465 
466 	spin_lock(&vm->notifier.list_lock);
467 	list_for_each_entry_safe(vma, next, &vm->notifier.rebind_list,
468 				 notifier.rebind_link) {
469 		xe_bo_assert_held(xe_vma_bo(vma));
470 
471 		list_del_init(&vma->notifier.rebind_link);
472 		if (vma->tile_present && !(vma->gpuva.flags & XE_VMA_DESTROYED))
473 			list_move_tail(&vma->combined_links.rebind,
474 				       &vm->rebind_list);
475 	}
476 	spin_unlock(&vm->notifier.list_lock);
477 
478 	return 0;
479 }
480 
481 #define XE_VM_REBIND_RETRY_TIMEOUT_MS 1000
482 
483 static void xe_vm_kill(struct xe_vm *vm)
484 {
485 	struct xe_exec_queue *q;
486 
487 	lockdep_assert_held(&vm->lock);
488 
489 	xe_vm_lock(vm, false);
490 	vm->flags |= XE_VM_FLAG_BANNED;
491 	trace_xe_vm_kill(vm);
492 
493 	list_for_each_entry(q, &vm->preempt.exec_queues, compute.link)
494 		q->ops->kill(q);
495 	xe_vm_unlock(vm);
496 
497 	/* TODO: Inform user the VM is banned */
498 }
499 
500 /**
501  * xe_vm_validate_should_retry() - Whether to retry after a validate error.
502  * @exec: The drm_exec object used for locking before validation.
503  * @err: The error returned from ttm_bo_validate().
504  * @end: A ktime_t cookie that should be set to 0 before first use and
505  * that should be reused on subsequent calls.
506  *
507  * With multiple active VMs, under memory pressure, it is possible that
508  * ttm_bo_validate() runs into -EDEADLK and in such a case returns -ENOMEM.
509  * Until ttm properly handles locking in such scenarios, the best thing the
510  * driver can do is retry with a timeout. Check if that is necessary, and
511  * if so unlock the drm_exec's objects while keeping the ticket to prepare
512  * for a rerun.
513  *
514  * Return: true if a retry after drm_exec_init() is recommended;
515  * false otherwise.
516  */
517 bool xe_vm_validate_should_retry(struct drm_exec *exec, int err, ktime_t *end)
518 {
519 	ktime_t cur;
520 
521 	if (err != -ENOMEM)
522 		return false;
523 
524 	cur = ktime_get();
525 	*end = *end ? : ktime_add_ms(cur, XE_VM_REBIND_RETRY_TIMEOUT_MS);
526 	if (!ktime_before(cur, *end))
527 		return false;
528 
529 	/*
530 	 * We would like to keep the ticket here with
531 	 * drm_exec_unlock_all(), but WW mutex asserts currently
532 	 * stop us from that. In any case this function could go away
533 	 * with proper TTM -EDEADLK handling.
534 	 */
535 	drm_exec_fini(exec);
536 
537 	msleep(20);
538 	return true;
539 }
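/*
 * Example (editor's sketch, not upstream code): intended calling pattern for
 * xe_vm_validate_should_retry(), with the ktime_t cookie starting at 0 and
 * reused across retries. Note that when a retry is recommended the helper
 * has already called drm_exec_fini():
 *
 *	ktime_t end = 0;
 *	int err;
 *
 * retry:
 *	drm_exec_init(&exec, DRM_EXEC_INTERRUPTIBLE_WAIT);
 *	drm_exec_until_all_locked(&exec) {
 *		err = xe_vm_prepare_vma(&exec, vma, 1);
 *		drm_exec_retry_on_contention(&exec);
 *		if (!err)
 *			err = xe_bo_validate(xe_vma_bo(vma), vm, false);
 *	}
 *	if (err && xe_vm_validate_should_retry(&exec, err, &end))
 *		goto retry;
 *	drm_exec_fini(&exec);
 */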
540 
541 static int xe_preempt_work_begin(struct drm_exec *exec, struct xe_vm *vm,
542 				 bool *done)
543 {
544 	struct xe_vma *vma;
545 	int err;
546 
547 	/*
548 	 * 1 fence for each preempt fence plus a fence for each tile from a
549 	 * possible rebind
550 	 */
551 	err = drm_exec_prepare_obj(exec, xe_vm_obj(vm),
552 				   vm->preempt.num_exec_queues +
553 				   vm->xe->info.tile_count);
554 	if (err)
555 		return err;
556 
557 	if (xe_vm_is_idle(vm)) {
558 		vm->preempt.rebind_deactivated = true;
559 		*done = true;
560 		return 0;
561 	}
562 
563 	if (!preempt_fences_waiting(vm)) {
564 		*done = true;
565 		return 0;
566 	}
567 
568 	err = xe_vm_lock_dma_resv(vm, exec, vm->preempt.num_exec_queues, false);
569 	if (err)
570 		return err;
571 
572 	err = wait_for_existing_preempt_fences(vm);
573 	if (err)
574 		return err;
575 
576 	list_for_each_entry(vma, &vm->rebind_list, combined_links.rebind) {
577 		if (xe_vma_has_no_bo(vma) ||
578 		    vma->gpuva.flags & XE_VMA_DESTROYED)
579 			continue;
580 
581 		err = xe_bo_validate(xe_vma_bo(vma), vm, false);
582 		if (err)
583 			break;
584 	}
585 
586 	return err;
587 }
588 
589 static void preempt_rebind_work_func(struct work_struct *w)
590 {
591 	struct xe_vm *vm = container_of(w, struct xe_vm, preempt.rebind_work);
592 	struct drm_exec exec;
593 	struct dma_fence *rebind_fence;
594 	unsigned int fence_count = 0;
595 	LIST_HEAD(preempt_fences);
596 	ktime_t end = 0;
597 	int err = 0;
598 	long wait;
599 	int __maybe_unused tries = 0;
600 
601 	xe_assert(vm->xe, xe_vm_in_preempt_fence_mode(vm));
602 	trace_xe_vm_rebind_worker_enter(vm);
603 
604 	down_write(&vm->lock);
605 
606 	if (xe_vm_is_closed_or_banned(vm)) {
607 		up_write(&vm->lock);
608 		trace_xe_vm_rebind_worker_exit(vm);
609 		return;
610 	}
611 
612 retry:
613 	if (xe_vm_userptr_check_repin(vm)) {
614 		err = xe_vm_userptr_pin(vm);
615 		if (err)
616 			goto out_unlock_outer;
617 	}
618 
619 	drm_exec_init(&exec, DRM_EXEC_INTERRUPTIBLE_WAIT);
620 
621 	drm_exec_until_all_locked(&exec) {
622 		bool done = false;
623 
624 		err = xe_preempt_work_begin(&exec, vm, &done);
625 		drm_exec_retry_on_contention(&exec);
626 		if (err && xe_vm_validate_should_retry(&exec, err, &end)) {
627 			err = -EAGAIN;
628 			goto out_unlock_outer;
629 		}
630 		if (err || done)
631 			goto out_unlock;
632 	}
633 
634 	err = alloc_preempt_fences(vm, &preempt_fences, &fence_count);
635 	if (err)
636 		goto out_unlock;
637 
638 	rebind_fence = xe_vm_rebind(vm, true);
639 	if (IS_ERR(rebind_fence)) {
640 		err = PTR_ERR(rebind_fence);
641 		goto out_unlock;
642 	}
643 
644 	if (rebind_fence) {
645 		dma_fence_wait(rebind_fence, false);
646 		dma_fence_put(rebind_fence);
647 	}
648 
649 	/* Wait on munmap style VM unbinds */
650 	wait = dma_resv_wait_timeout(xe_vm_resv(vm),
651 				     DMA_RESV_USAGE_KERNEL,
652 				     false, MAX_SCHEDULE_TIMEOUT);
653 	if (wait <= 0) {
654 		err = -ETIME;
655 		goto out_unlock;
656 	}
657 
658 #define retry_required(__tries, __vm) \
659 	(IS_ENABLED(CONFIG_DRM_XE_USERPTR_INVAL_INJECT) ? \
660 	(!(__tries)++ || __xe_vm_userptr_needs_repin(__vm)) : \
661 	__xe_vm_userptr_needs_repin(__vm))
662 
663 	down_read(&vm->userptr.notifier_lock);
664 	if (retry_required(tries, vm)) {
665 		up_read(&vm->userptr.notifier_lock);
666 		err = -EAGAIN;
667 		goto out_unlock;
668 	}
669 
670 #undef retry_required
671 
672 	spin_lock(&vm->xe->ttm.lru_lock);
673 	ttm_lru_bulk_move_tail(&vm->lru_bulk_move);
674 	spin_unlock(&vm->xe->ttm.lru_lock);
675 
676 	/* Point of no return. */
677 	arm_preempt_fences(vm, &preempt_fences);
678 	resume_and_reinstall_preempt_fences(vm);
679 	up_read(&vm->userptr.notifier_lock);
680 
681 out_unlock:
682 	drm_exec_fini(&exec);
683 out_unlock_outer:
684 	if (err == -EAGAIN) {
685 		trace_xe_vm_rebind_worker_retry(vm);
686 		goto retry;
687 	}
688 
689 	if (err) {
690 		drm_warn(&vm->xe->drm, "VM worker error: %d\n", err);
691 		xe_vm_kill(vm);
692 	}
693 	up_write(&vm->lock);
694 
695 	free_preempt_fences(&preempt_fences);
696 
697 	trace_xe_vm_rebind_worker_exit(vm);
698 }
699 
700 static bool vma_userptr_invalidate(struct mmu_interval_notifier *mni,
701 				   const struct mmu_notifier_range *range,
702 				   unsigned long cur_seq)
703 {
704 	struct xe_vma *vma = container_of(mni, struct xe_vma, userptr.notifier);
705 	struct xe_vm *vm = xe_vma_vm(vma);
706 	struct dma_resv_iter cursor;
707 	struct dma_fence *fence;
708 	long err;
709 
710 	xe_assert(vm->xe, xe_vma_is_userptr(vma));
711 	trace_xe_vma_userptr_invalidate(vma);
712 
713 	if (!mmu_notifier_range_blockable(range))
714 		return false;
715 
716 	down_write(&vm->userptr.notifier_lock);
717 	mmu_interval_set_seq(mni, cur_seq);
718 
719 	/* No need to stop gpu access if the userptr is not yet bound. */
720 	if (!vma->userptr.initial_bind) {
721 		up_write(&vm->userptr.notifier_lock);
722 		return true;
723 	}
724 
725 	/*
726 	 * Tell exec and rebind worker they need to repin and rebind this
727 	 * userptr.
728 	 */
729 	if (!xe_vm_in_fault_mode(vm) &&
730 	    !(vma->gpuva.flags & XE_VMA_DESTROYED) && vma->tile_present) {
731 		spin_lock(&vm->userptr.invalidated_lock);
732 		list_move_tail(&vma->userptr.invalidate_link,
733 			       &vm->userptr.invalidated);
734 		spin_unlock(&vm->userptr.invalidated_lock);
735 	}
736 
737 	up_write(&vm->userptr.notifier_lock);
738 
739 	/*
740 	 * Preempt fences turn into schedule disables, pipeline these.
741 	 * Note that even in fault mode, we need to wait for binds and
742 	 * unbinds to complete, and those are attached as BOOKKEEP fences
743 	 * to the vm.
744 	 */
745 	dma_resv_iter_begin(&cursor, xe_vm_resv(vm),
746 			    DMA_RESV_USAGE_BOOKKEEP);
747 	dma_resv_for_each_fence_unlocked(&cursor, fence)
748 		dma_fence_enable_sw_signaling(fence);
749 	dma_resv_iter_end(&cursor);
750 
751 	err = dma_resv_wait_timeout(xe_vm_resv(vm),
752 				    DMA_RESV_USAGE_BOOKKEEP,
753 				    false, MAX_SCHEDULE_TIMEOUT);
754 	XE_WARN_ON(err <= 0);
755 
756 	if (xe_vm_in_fault_mode(vm)) {
757 		err = xe_vm_invalidate_vma(vma);
758 		XE_WARN_ON(err);
759 	}
760 
761 	trace_xe_vma_userptr_invalidate_complete(vma);
762 
763 	return true;
764 }
765 
766 static const struct mmu_interval_notifier_ops vma_userptr_notifier_ops = {
767 	.invalidate = vma_userptr_invalidate,
768 };
769 
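/**
 * xe_vm_userptr_pin() - Pin all invalidated userptrs of a VM
 * @vm: The VM.
 *
 * Repin the pages of all userptr vmas that have been invalidated since their
 * last successful repin and move them to the VM's rebind list. Must be
 * called with the vm->lock held for write.
 *
 * Return: 0 on success, negative error code on error.
 */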
770 int xe_vm_userptr_pin(struct xe_vm *vm)
771 {
772 	struct xe_vma *vma, *next;
773 	int err = 0;
774 	LIST_HEAD(tmp_evict);
775 
776 	lockdep_assert_held_write(&vm->lock);
777 
778 	/* Collect invalidated userptrs */
779 	spin_lock(&vm->userptr.invalidated_lock);
780 	list_for_each_entry_safe(vma, next, &vm->userptr.invalidated,
781 				 userptr.invalidate_link) {
782 		list_del_init(&vma->userptr.invalidate_link);
783 		if (list_empty(&vma->combined_links.userptr))
784 			list_move_tail(&vma->combined_links.userptr,
785 				       &vm->userptr.repin_list);
786 	}
787 	spin_unlock(&vm->userptr.invalidated_lock);
788 
789 	/* Pin and move to temporary list */
790 	list_for_each_entry_safe(vma, next, &vm->userptr.repin_list,
791 				 combined_links.userptr) {
792 		err = xe_vma_userptr_pin_pages(vma);
793 		if (err < 0)
794 			goto out_err;
795 
796 		list_move_tail(&vma->combined_links.userptr, &tmp_evict);
797 	}
798 
799 	/* Take lock and move to rebind_list for rebinding. */
800 	err = dma_resv_lock_interruptible(xe_vm_resv(vm), NULL);
801 	if (err)
802 		goto out_err;
803 
804 	list_for_each_entry_safe(vma, next, &tmp_evict, combined_links.userptr)
805 		list_move_tail(&vma->combined_links.rebind, &vm->rebind_list);
806 
807 	dma_resv_unlock(xe_vm_resv(vm));
808 
809 	return 0;
810 
811 out_err:
812 	list_splice_tail(&tmp_evict, &vm->userptr.repin_list);
813 
814 	return err;
815 }
816 
817 /**
818  * xe_vm_userptr_check_repin() - Check whether the VM might have userptrs
819  * that need repinning.
820  * @vm: The VM.
821  *
822  * This function does an advisory check for whether the VM has userptrs that
823  * need repinning.
824  *
825  * Return: 0 if there are no indications of userptrs needing repinning,
826  * -EAGAIN if there are.
827  */
828 int xe_vm_userptr_check_repin(struct xe_vm *vm)
829 {
830 	return (list_empty_careful(&vm->userptr.repin_list) &&
831 		list_empty_careful(&vm->userptr.invalidated)) ? 0 : -EAGAIN;
832 }
833 
834 static struct dma_fence *
835 xe_vm_bind_vma(struct xe_vma *vma, struct xe_exec_queue *q,
836 	       struct xe_sync_entry *syncs, u32 num_syncs,
837 	       bool first_op, bool last_op);
838 
839 struct dma_fence *xe_vm_rebind(struct xe_vm *vm, bool rebind_worker)
840 {
841 	struct dma_fence *fence = NULL;
842 	struct xe_vma *vma, *next;
843 
844 	lockdep_assert_held(&vm->lock);
845 	if (xe_vm_in_lr_mode(vm) && !rebind_worker)
846 		return NULL;
847 
848 	xe_vm_assert_held(vm);
849 	list_for_each_entry_safe(vma, next, &vm->rebind_list,
850 				 combined_links.rebind) {
851 		xe_assert(vm->xe, vma->tile_present);
852 
853 		list_del_init(&vma->combined_links.rebind);
854 		dma_fence_put(fence);
855 		if (rebind_worker)
856 			trace_xe_vma_rebind_worker(vma);
857 		else
858 			trace_xe_vma_rebind_exec(vma);
859 		fence = xe_vm_bind_vma(vma, NULL, NULL, 0, false, false);
860 		if (IS_ERR(fence))
861 			return fence;
862 	}
863 
864 	return fence;
865 }
866 
867 static struct xe_vma *xe_vma_create(struct xe_vm *vm,
868 				    struct xe_bo *bo,
869 				    u64 bo_offset_or_userptr,
870 				    u64 start, u64 end,
871 				    bool read_only,
872 				    bool is_null,
873 				    u8 tile_mask,
874 				    u16 pat_index)
875 {
876 	struct xe_vma *vma;
877 	struct xe_tile *tile;
878 	u8 id;
879 
880 	xe_assert(vm->xe, start < end);
881 	xe_assert(vm->xe, end < vm->size);
882 
883 	if (!bo && !is_null)	/* userptr */
884 		vma = kzalloc(sizeof(*vma), GFP_KERNEL);
885 	else
886 		vma = kzalloc(sizeof(*vma) - sizeof(struct xe_userptr),
887 			      GFP_KERNEL);
888 	if (!vma) {
889 		vma = ERR_PTR(-ENOMEM);
890 		return vma;
891 	}
892 
893 	INIT_LIST_HEAD(&vma->combined_links.rebind);
894 	INIT_LIST_HEAD(&vma->notifier.rebind_link);
895 	INIT_LIST_HEAD(&vma->extobj.link);
896 
897 	INIT_LIST_HEAD(&vma->gpuva.gem.entry);
898 	vma->gpuva.vm = &vm->gpuvm;
899 	vma->gpuva.va.addr = start;
900 	vma->gpuva.va.range = end - start + 1;
901 	if (read_only)
902 		vma->gpuva.flags |= XE_VMA_READ_ONLY;
903 	if (is_null)
904 		vma->gpuva.flags |= DRM_GPUVA_SPARSE;
905 
906 	if (tile_mask) {
907 		vma->tile_mask = tile_mask;
908 	} else {
909 		for_each_tile(tile, vm->xe, id)
910 			vma->tile_mask |= 0x1 << id;
911 	}
912 
913 	if (GRAPHICS_VER(vm->xe) >= 20 || vm->xe->info.platform == XE_PVC)
914 		vma->gpuva.flags |= XE_VMA_ATOMIC_PTE_BIT;
915 
916 	vma->pat_index = pat_index;
917 
918 	if (bo) {
919 		struct drm_gpuvm_bo *vm_bo;
920 
921 		xe_bo_assert_held(bo);
922 
923 		vm_bo = drm_gpuvm_bo_obtain(vma->gpuva.vm, &bo->ttm.base);
924 		if (IS_ERR(vm_bo)) {
925 			kfree(vma);
926 			return ERR_CAST(vm_bo);
927 		}
928 
929 		drm_gem_object_get(&bo->ttm.base);
930 		vma->gpuva.gem.obj = &bo->ttm.base;
931 		vma->gpuva.gem.offset = bo_offset_or_userptr;
932 		drm_gpuva_link(&vma->gpuva, vm_bo);
933 		drm_gpuvm_bo_put(vm_bo);
934 	} else /* userptr or null */ {
935 		if (!is_null) {
936 			u64 size = end - start + 1;
937 			int err;
938 
939 			INIT_LIST_HEAD(&vma->userptr.invalidate_link);
940 			vma->gpuva.gem.offset = bo_offset_or_userptr;
941 
942 			err = mmu_interval_notifier_insert(&vma->userptr.notifier,
943 							   current->mm,
944 							   xe_vma_userptr(vma), size,
945 							   &vma_userptr_notifier_ops);
946 			if (err) {
947 				kfree(vma);
948 				vma = ERR_PTR(err);
949 				return vma;
950 			}
951 
952 			vma->userptr.notifier_seq = LONG_MAX;
953 		}
954 
955 		xe_vm_get(vm);
956 	}
957 
958 	return vma;
959 }
960 
961 static bool vm_remove_extobj(struct xe_vma *vma)
962 {
963 	if (!list_empty(&vma->extobj.link)) {
964 		xe_vma_vm(vma)->extobj.entries--;
965 		list_del_init(&vma->extobj.link);
966 		return true;
967 	}
968 	return false;
969 }
970 
971 static void xe_vma_destroy_late(struct xe_vma *vma)
972 {
973 	struct xe_vm *vm = xe_vma_vm(vma);
974 	struct xe_device *xe = vm->xe;
975 	bool read_only = xe_vma_read_only(vma);
976 
977 	if (xe_vma_is_userptr(vma)) {
978 		if (vma->userptr.sg) {
979 			dma_unmap_sgtable(xe->drm.dev,
980 					  vma->userptr.sg,
981 					  read_only ? DMA_TO_DEVICE :
982 					  DMA_BIDIRECTIONAL, 0);
983 			sg_free_table(vma->userptr.sg);
984 			vma->userptr.sg = NULL;
985 		}
986 
987 		/*
988 		 * Since userptr pages are not pinned, we can't remove
989 		 * the notifier until we're sure the GPU is not accessing
990 		 * them anymore.
991 		 */
992 		mmu_interval_notifier_remove(&vma->userptr.notifier);
993 		xe_vm_put(vm);
994 	} else if (xe_vma_is_null(vma)) {
995 		xe_vm_put(vm);
996 	} else {
997 		xe_bo_put(xe_vma_bo(vma));
998 	}
999 
1000 	kfree(vma);
1001 }
1002 
1003 static void vma_destroy_work_func(struct work_struct *w)
1004 {
1005 	struct xe_vma *vma =
1006 		container_of(w, struct xe_vma, destroy_work);
1007 
1008 	xe_vma_destroy_late(vma);
1009 }
1010 
1011 static struct xe_vma *
1012 bo_has_vm_references_locked(struct xe_bo *bo, struct xe_vm *vm,
1013 			    struct xe_vma *ignore)
1014 {
1015 	struct drm_gpuvm_bo *vm_bo;
1016 	struct drm_gpuva *va;
1017 	struct drm_gem_object *obj = &bo->ttm.base;
1018 
1019 	xe_bo_assert_held(bo);
1020 
1021 	drm_gem_for_each_gpuvm_bo(vm_bo, obj) {
1022 		drm_gpuvm_bo_for_each_va(va, vm_bo) {
1023 			struct xe_vma *vma = gpuva_to_vma(va);
1024 
1025 			if (vma != ignore && xe_vma_vm(vma) == vm)
1026 				return vma;
1027 		}
1028 	}
1029 
1030 	return NULL;
1031 }
1032 
1033 static bool bo_has_vm_references(struct xe_bo *bo, struct xe_vm *vm,
1034 				 struct xe_vma *ignore)
1035 {
1036 	bool ret;
1037 
1038 	xe_bo_lock(bo, false);
1039 	ret = !!bo_has_vm_references_locked(bo, vm, ignore);
1040 	xe_bo_unlock(bo);
1041 
1042 	return ret;
1043 }
1044 
1045 static void __vm_insert_extobj(struct xe_vm *vm, struct xe_vma *vma)
1046 {
1047 	lockdep_assert_held_write(&vm->lock);
1048 
1049 	list_add(&vma->extobj.link, &vm->extobj.list);
1050 	vm->extobj.entries++;
1051 }
1052 
1053 static void vm_insert_extobj(struct xe_vm *vm, struct xe_vma *vma)
1054 {
1055 	struct xe_bo *bo = xe_vma_bo(vma);
1056 
1057 	lockdep_assert_held_write(&vm->lock);
1058 
1059 	if (bo_has_vm_references(bo, vm, vma))
1060 		return;
1061 
1062 	__vm_insert_extobj(vm, vma);
1063 }
1064 
1065 static void vma_destroy_cb(struct dma_fence *fence,
1066 			   struct dma_fence_cb *cb)
1067 {
1068 	struct xe_vma *vma = container_of(cb, struct xe_vma, destroy_cb);
1069 
1070 	INIT_WORK(&vma->destroy_work, vma_destroy_work_func);
1071 	queue_work(system_unbound_wq, &vma->destroy_work);
1072 }
1073 
1074 static void xe_vma_destroy(struct xe_vma *vma, struct dma_fence *fence)
1075 {
1076 	struct xe_vm *vm = xe_vma_vm(vma);
1077 
1078 	lockdep_assert_held_write(&vm->lock);
1079 	xe_assert(vm->xe, list_empty(&vma->combined_links.destroy));
1080 
1081 	if (xe_vma_is_userptr(vma)) {
1082 		xe_assert(vm->xe, vma->gpuva.flags & XE_VMA_DESTROYED);
1083 
1084 		spin_lock(&vm->userptr.invalidated_lock);
1085 		list_del(&vma->userptr.invalidate_link);
1086 		spin_unlock(&vm->userptr.invalidated_lock);
1087 	} else if (!xe_vma_is_null(vma)) {
1088 		xe_bo_assert_held(xe_vma_bo(vma));
1089 
1090 		spin_lock(&vm->notifier.list_lock);
1091 		list_del(&vma->notifier.rebind_link);
1092 		spin_unlock(&vm->notifier.list_lock);
1093 
1094 		drm_gpuva_unlink(&vma->gpuva);
1095 
1096 		if (!xe_vma_bo(vma)->vm && vm_remove_extobj(vma)) {
1097 			struct xe_vma *other;
1098 
1099 			other = bo_has_vm_references_locked(xe_vma_bo(vma), vm, NULL);
1100 
1101 			if (other)
1102 				__vm_insert_extobj(vm, other);
1103 		}
1104 	}
1105 
1106 	xe_vm_assert_held(vm);
1107 	if (fence) {
1108 		int ret = dma_fence_add_callback(fence, &vma->destroy_cb,
1109 						 vma_destroy_cb);
1110 
1111 		if (ret) {
1112 			XE_WARN_ON(ret != -ENOENT);
1113 			xe_vma_destroy_late(vma);
1114 		}
1115 	} else {
1116 		xe_vma_destroy_late(vma);
1117 	}
1118 }
1119 
1120 /**
1121  * xe_vm_prepare_vma() - drm_exec utility to lock a vma
1122  * @exec: The drm_exec object we're currently locking for.
1123  * @vma: The vma for which we want to lock the vm resv and any attached
1124  * object's resv.
1125  * @num_shared: The number of dma-fence slots to pre-allocate in the
1126  * objects' reservation objects.
1127  *
1128  * Return: 0 on success, negative error code on error. In particular
1129  * may return -EDEADLK on WW transaction contention and -EINTR if
1130  * an interruptible wait is terminated by a signal.
1131  */
1132 int xe_vm_prepare_vma(struct drm_exec *exec, struct xe_vma *vma,
1133 		      unsigned int num_shared)
1134 {
1135 	struct xe_vm *vm = xe_vma_vm(vma);
1136 	struct xe_bo *bo = xe_vma_bo(vma);
1137 	int err;
1138 
1139 	XE_WARN_ON(!vm);
1140 	err = drm_exec_prepare_obj(exec, xe_vm_obj(vm), num_shared);
1141 	if (!err && bo && !bo->vm)
1142 		err = drm_exec_prepare_obj(exec, &bo->ttm.base, num_shared);
1143 
1144 	return err;
1145 }
1146 
1147 static void xe_vma_destroy_unlocked(struct xe_vma *vma)
1148 {
1149 	struct drm_exec exec;
1150 	int err;
1151 
1152 	drm_exec_init(&exec, 0);
1153 	drm_exec_until_all_locked(&exec) {
1154 		err = xe_vm_prepare_vma(&exec, vma, 0);
1155 		drm_exec_retry_on_contention(&exec);
1156 		if (XE_WARN_ON(err))
1157 			break;
1158 	}
1159 
1160 	xe_vma_destroy(vma, NULL);
1161 
1162 	drm_exec_fini(&exec);
1163 }
1164 
1165 struct xe_vma *
1166 xe_vm_find_overlapping_vma(struct xe_vm *vm, u64 start, u64 range)
1167 {
1168 	struct drm_gpuva *gpuva;
1169 
1170 	lockdep_assert_held(&vm->lock);
1171 
1172 	if (xe_vm_is_closed_or_banned(vm))
1173 		return NULL;
1174 
1175 	xe_assert(vm->xe, start + range <= vm->size);
1176 
1177 	gpuva = drm_gpuva_find_first(&vm->gpuvm, start, range);
1178 
1179 	return gpuva ? gpuva_to_vma(gpuva) : NULL;
1180 }
1181 
1182 static int xe_vm_insert_vma(struct xe_vm *vm, struct xe_vma *vma)
1183 {
1184 	int err;
1185 
1186 	xe_assert(vm->xe, xe_vma_vm(vma) == vm);
1187 	lockdep_assert_held(&vm->lock);
1188 
1189 	err = drm_gpuva_insert(&vm->gpuvm, &vma->gpuva);
1190 	XE_WARN_ON(err);	/* Shouldn't be possible */
1191 
1192 	return err;
1193 }
1194 
1195 static void xe_vm_remove_vma(struct xe_vm *vm, struct xe_vma *vma)
1196 {
1197 	xe_assert(vm->xe, xe_vma_vm(vma) == vm);
1198 	lockdep_assert_held(&vm->lock);
1199 
1200 	drm_gpuva_remove(&vma->gpuva);
1201 	if (vm->usm.last_fault_vma == vma)
1202 		vm->usm.last_fault_vma = NULL;
1203 }
1204 
1205 static struct drm_gpuva_op *xe_vm_op_alloc(void)
1206 {
1207 	struct xe_vma_op *op;
1208 
1209 	op = kzalloc(sizeof(*op), GFP_KERNEL);
1210 
1211 	if (unlikely(!op))
1212 		return NULL;
1213 
1214 	return &op->base;
1215 }
1216 
1217 static void xe_vm_free(struct drm_gpuvm *gpuvm);
1218 
1219 static struct drm_gpuvm_ops gpuvm_ops = {
1220 	.op_alloc = xe_vm_op_alloc,
1221 	.vm_free = xe_vm_free,
1222 };
1223 
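/*
 * The helpers below scatter the bits of a pat_index into the PAT bits of a
 * PDE/PTE: bit 0 -> PAT0, bit 1 -> PAT1, bit 2 -> PAT2 (at a different bit
 * position for PDE/PDPE level entries), bit 3 -> PAT3, bit 4 -> PAT4. For
 * example, a pat_index of 6 (0b00110) sets the PAT1 and PAT2 bits.
 */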
1224 static u64 pde_encode_pat_index(struct xe_device *xe, u16 pat_index)
1225 {
1226 	u64 pte = 0;
1227 
1228 	if (pat_index & BIT(0))
1229 		pte |= XE_PPGTT_PTE_PAT0;
1230 
1231 	if (pat_index & BIT(1))
1232 		pte |= XE_PPGTT_PTE_PAT1;
1233 
1234 	return pte;
1235 }
1236 
1237 static u64 pte_encode_pat_index(struct xe_device *xe, u16 pat_index,
1238 				u32 pt_level)
1239 {
1240 	u64 pte = 0;
1241 
1242 	if (pat_index & BIT(0))
1243 		pte |= XE_PPGTT_PTE_PAT0;
1244 
1245 	if (pat_index & BIT(1))
1246 		pte |= XE_PPGTT_PTE_PAT1;
1247 
1248 	if (pat_index & BIT(2)) {
1249 		if (pt_level)
1250 			pte |= XE_PPGTT_PDE_PDPE_PAT2;
1251 		else
1252 			pte |= XE_PPGTT_PTE_PAT2;
1253 	}
1254 
1255 	if (pat_index & BIT(3))
1256 		pte |= XELPG_PPGTT_PTE_PAT3;
1257 
1258 	if (pat_index & (BIT(4)))
1259 		pte |= XE2_PPGTT_PTE_PAT4;
1260 
1261 	return pte;
1262 }
1263 
1264 static u64 pte_encode_ps(u32 pt_level)
1265 {
1266 	XE_WARN_ON(pt_level > 2);
1267 
1268 	if (pt_level == 1)
1269 		return XE_PDE_PS_2M;
1270 	else if (pt_level == 2)
1271 		return XE_PDPE_PS_1G;
1272 
1273 	return 0;
1274 }
1275 
1276 static u64 xelp_pde_encode_bo(struct xe_bo *bo, u64 bo_offset,
1277 			      const u16 pat_index)
1278 {
1279 	struct xe_device *xe = xe_bo_device(bo);
1280 	u64 pde;
1281 
1282 	pde = xe_bo_addr(bo, bo_offset, XE_PAGE_SIZE);
1283 	pde |= XE_PAGE_PRESENT | XE_PAGE_RW;
1284 	pde |= pde_encode_pat_index(xe, pat_index);
1285 
1286 	return pde;
1287 }
1288 
1289 static u64 xelp_pte_encode_bo(struct xe_bo *bo, u64 bo_offset,
1290 			      u16 pat_index, u32 pt_level)
1291 {
1292 	struct xe_device *xe = xe_bo_device(bo);
1293 	u64 pte;
1294 
1295 	pte = xe_bo_addr(bo, bo_offset, XE_PAGE_SIZE);
1296 	pte |= XE_PAGE_PRESENT | XE_PAGE_RW;
1297 	pte |= pte_encode_pat_index(xe, pat_index, pt_level);
1298 	pte |= pte_encode_ps(pt_level);
1299 
1300 	if (xe_bo_is_vram(bo) || xe_bo_is_stolen_devmem(bo))
1301 		pte |= XE_PPGTT_PTE_DM;
1302 
1303 	return pte;
1304 }
1305 
1306 static u64 xelp_pte_encode_vma(u64 pte, struct xe_vma *vma,
1307 			       u16 pat_index, u32 pt_level)
1308 {
1309 	struct xe_device *xe = xe_vma_vm(vma)->xe;
1310 
1311 	pte |= XE_PAGE_PRESENT;
1312 
1313 	if (likely(!xe_vma_read_only(vma)))
1314 		pte |= XE_PAGE_RW;
1315 
1316 	pte |= pte_encode_pat_index(xe, pat_index, pt_level);
1317 	pte |= pte_encode_ps(pt_level);
1318 
1319 	if (unlikely(xe_vma_is_null(vma)))
1320 		pte |= XE_PTE_NULL;
1321 
1322 	return pte;
1323 }
1324 
1325 static u64 xelp_pte_encode_addr(struct xe_device *xe, u64 addr,
1326 				u16 pat_index,
1327 				u32 pt_level, bool devmem, u64 flags)
1328 {
1329 	u64 pte;
1330 
1331 	/* Avoid passing random bits directly as flags */
1332 	xe_assert(xe, !(flags & ~XE_PTE_PS64));
1333 
1334 	pte = addr;
1335 	pte |= XE_PAGE_PRESENT | XE_PAGE_RW;
1336 	pte |= pte_encode_pat_index(xe, pat_index, pt_level);
1337 	pte |= pte_encode_ps(pt_level);
1338 
1339 	if (devmem)
1340 		pte |= XE_PPGTT_PTE_DM;
1341 
1342 	pte |= flags;
1343 
1344 	return pte;
1345 }
1346 
1347 static const struct xe_pt_ops xelp_pt_ops = {
1348 	.pte_encode_bo = xelp_pte_encode_bo,
1349 	.pte_encode_vma = xelp_pte_encode_vma,
1350 	.pte_encode_addr = xelp_pte_encode_addr,
1351 	.pde_encode_bo = xelp_pde_encode_bo,
1352 };
1353 
1354 static void vm_destroy_work_func(struct work_struct *w);
1355 
1356 struct xe_vm *xe_vm_create(struct xe_device *xe, u32 flags)
1357 {
1358 	struct drm_gem_object *vm_resv_obj;
1359 	struct xe_vm *vm;
1360 	int err, number_tiles = 0;
1361 	struct xe_tile *tile;
1362 	u8 id;
1363 
1364 	vm = kzalloc(sizeof(*vm), GFP_KERNEL);
1365 	if (!vm)
1366 		return ERR_PTR(-ENOMEM);
1367 
1368 	vm->xe = xe;
1369 
1370 	vm->size = 1ull << xe->info.va_bits;
1371 
1372 	vm->flags = flags;
1373 
1374 	init_rwsem(&vm->lock);
1375 
1376 	INIT_LIST_HEAD(&vm->rebind_list);
1377 
1378 	INIT_LIST_HEAD(&vm->userptr.repin_list);
1379 	INIT_LIST_HEAD(&vm->userptr.invalidated);
1380 	init_rwsem(&vm->userptr.notifier_lock);
1381 	spin_lock_init(&vm->userptr.invalidated_lock);
1382 
1383 	INIT_LIST_HEAD(&vm->notifier.rebind_list);
1384 	spin_lock_init(&vm->notifier.list_lock);
1385 
1386 	INIT_WORK(&vm->destroy_work, vm_destroy_work_func);
1387 
1388 	INIT_LIST_HEAD(&vm->preempt.exec_queues);
1389 	vm->preempt.min_run_period_ms = 10;	/* FIXME: Wire up to uAPI */
1390 
1391 	for_each_tile(tile, xe, id)
1392 		xe_range_fence_tree_init(&vm->rftree[id]);
1393 
1394 	INIT_LIST_HEAD(&vm->extobj.list);
1395 
1396 	vm->pt_ops = &xelp_pt_ops;
1397 
1398 	if (!(flags & XE_VM_FLAG_MIGRATION))
1399 		xe_device_mem_access_get(xe);
1400 
1401 	vm_resv_obj = drm_gpuvm_resv_object_alloc(&xe->drm);
1402 	if (!vm_resv_obj) {
1403 		err = -ENOMEM;
1404 		goto err_no_resv;
1405 	}
1406 
1407 	drm_gpuvm_init(&vm->gpuvm, "Xe VM", 0, &xe->drm, vm_resv_obj,
1408 		       0, vm->size, 0, 0, &gpuvm_ops);
1409 
1410 	drm_gem_object_put(vm_resv_obj);
1411 
1412 	err = dma_resv_lock_interruptible(xe_vm_resv(vm), NULL);
1413 	if (err)
1414 		goto err_close;
1415 
1416 	if (IS_DGFX(xe) && xe->info.vram_flags & XE_VRAM_FLAGS_NEED64K)
1417 		vm->flags |= XE_VM_FLAG_64K;
1418 
1419 	for_each_tile(tile, xe, id) {
1420 		if (flags & XE_VM_FLAG_MIGRATION &&
1421 		    tile->id != XE_VM_FLAG_TILE_ID(flags))
1422 			continue;
1423 
1424 		vm->pt_root[id] = xe_pt_create(vm, tile, xe->info.vm_max_level);
1425 		if (IS_ERR(vm->pt_root[id])) {
1426 			err = PTR_ERR(vm->pt_root[id]);
1427 			vm->pt_root[id] = NULL;
1428 			goto err_unlock_close;
1429 		}
1430 	}
1431 
1432 	if (flags & XE_VM_FLAG_SCRATCH_PAGE) {
1433 		for_each_tile(tile, xe, id) {
1434 			if (!vm->pt_root[id])
1435 				continue;
1436 
1437 			err = xe_pt_create_scratch(xe, tile, vm);
1438 			if (err)
1439 				goto err_unlock_close;
1440 		}
1441 		vm->batch_invalidate_tlb = true;
1442 	}
1443 
1444 	if (flags & XE_VM_FLAG_LR_MODE) {
1445 		INIT_WORK(&vm->preempt.rebind_work, preempt_rebind_work_func);
1446 		vm->flags |= XE_VM_FLAG_LR_MODE;
1447 		vm->batch_invalidate_tlb = false;
1448 	}
1449 
1450 	/* Fill pt_root after allocating scratch tables */
1451 	for_each_tile(tile, xe, id) {
1452 		if (!vm->pt_root[id])
1453 			continue;
1454 
1455 		xe_pt_populate_empty(tile, vm, vm->pt_root[id]);
1456 	}
1457 	dma_resv_unlock(xe_vm_resv(vm));
1458 
1459 	/* Kernel migration VM shouldn't have a circular loop. */
1460 	if (!(flags & XE_VM_FLAG_MIGRATION)) {
1461 		for_each_tile(tile, xe, id) {
1462 			struct xe_gt *gt = tile->primary_gt;
1463 			struct xe_vm *migrate_vm;
1464 			struct xe_exec_queue *q;
1465 			u32 create_flags = EXEC_QUEUE_FLAG_VM |
1466 				((flags & XE_VM_FLAG_ASYNC_DEFAULT) ?
1467 				EXEC_QUEUE_FLAG_VM_ASYNC : 0);
1468 
1469 			if (!vm->pt_root[id])
1470 				continue;
1471 
1472 			migrate_vm = xe_migrate_get_vm(tile->migrate);
1473 			q = xe_exec_queue_create_class(xe, gt, migrate_vm,
1474 						       XE_ENGINE_CLASS_COPY,
1475 						       create_flags);
1476 			xe_vm_put(migrate_vm);
1477 			if (IS_ERR(q)) {
1478 				err = PTR_ERR(q);
1479 				goto err_close;
1480 			}
1481 			vm->q[id] = q;
1482 			number_tiles++;
1483 		}
1484 	}
1485 
1486 	if (number_tiles > 1)
1487 		vm->composite_fence_ctx = dma_fence_context_alloc(1);
1488 
1489 	mutex_lock(&xe->usm.lock);
1490 	if (flags & XE_VM_FLAG_FAULT_MODE)
1491 		xe->usm.num_vm_in_fault_mode++;
1492 	else if (!(flags & XE_VM_FLAG_MIGRATION))
1493 		xe->usm.num_vm_in_non_fault_mode++;
1494 	mutex_unlock(&xe->usm.lock);
1495 
1496 	trace_xe_vm_create(vm);
1497 
1498 	return vm;
1499 
1500 err_unlock_close:
1501 	dma_resv_unlock(xe_vm_resv(vm));
1502 err_close:
1503 	xe_vm_close_and_put(vm);
1504 	return ERR_PTR(err);
1505 
1506 err_no_resv:
1507 	for_each_tile(tile, xe, id)
1508 		xe_range_fence_tree_fini(&vm->rftree[id]);
1509 	kfree(vm);
1510 	if (!(flags & XE_VM_FLAG_MIGRATION))
1511 		xe_device_mem_access_put(xe);
1512 	return ERR_PTR(err);
1513 }
1514 
1515 static void xe_vm_close(struct xe_vm *vm)
1516 {
1517 	down_write(&vm->lock);
1518 	vm->size = 0;
1519 	up_write(&vm->lock);
1520 }
1521 
1522 void xe_vm_close_and_put(struct xe_vm *vm)
1523 {
1524 	LIST_HEAD(contested);
1525 	struct xe_device *xe = vm->xe;
1526 	struct xe_tile *tile;
1527 	struct xe_vma *vma, *next_vma;
1528 	struct drm_gpuva *gpuva, *next;
1529 	u8 id;
1530 
1531 	xe_assert(xe, !vm->preempt.num_exec_queues);
1532 
1533 	xe_vm_close(vm);
1534 	if (xe_vm_in_preempt_fence_mode(vm))
1535 		flush_work(&vm->preempt.rebind_work);
1536 
1537 	down_write(&vm->lock);
1538 	for_each_tile(tile, xe, id) {
1539 		if (vm->q[id])
1540 			xe_exec_queue_last_fence_put(vm->q[id], vm);
1541 	}
1542 	up_write(&vm->lock);
1543 
1544 	for_each_tile(tile, xe, id) {
1545 		if (vm->q[id]) {
1546 			xe_exec_queue_kill(vm->q[id]);
1547 			xe_exec_queue_put(vm->q[id]);
1548 			vm->q[id] = NULL;
1549 		}
1550 	}
1551 
1552 	down_write(&vm->lock);
1553 	xe_vm_lock(vm, false);
1554 	drm_gpuvm_for_each_va_safe(gpuva, next, &vm->gpuvm) {
1555 		vma = gpuva_to_vma(gpuva);
1556 
1557 		if (xe_vma_has_no_bo(vma)) {
1558 			down_read(&vm->userptr.notifier_lock);
1559 			vma->gpuva.flags |= XE_VMA_DESTROYED;
1560 			up_read(&vm->userptr.notifier_lock);
1561 		}
1562 
1563 		xe_vm_remove_vma(vm, vma);
1564 
1565 		/* easy case, remove from VMA? */
1566 		if (xe_vma_has_no_bo(vma) || xe_vma_bo(vma)->vm) {
1567 			list_del_init(&vma->combined_links.rebind);
1568 			xe_vma_destroy(vma, NULL);
1569 			continue;
1570 		}
1571 
1572 		list_move_tail(&vma->combined_links.destroy, &contested);
1573 		vma->gpuva.flags |= XE_VMA_DESTROYED;
1574 	}
1575 
1576 	/*
1577 	 * All vm operations will add shared fences to resv.
1578 	 * The only exception is eviction for a shared object,
1579 	 * but even so, the unbind when evicted would still
1580 	 * install a fence to resv. Hence it's safe to
1581 	 * destroy the pagetables immediately.
1582 	 */
1583 	for_each_tile(tile, xe, id) {
1584 		if (vm->scratch_bo[id]) {
1585 			u32 i;
1586 
1587 			xe_bo_unpin(vm->scratch_bo[id]);
1588 			xe_bo_put(vm->scratch_bo[id]);
1589 			for (i = 0; i < vm->pt_root[id]->level; i++)
1590 				xe_pt_destroy(vm->scratch_pt[id][i], vm->flags,
1591 					      NULL);
1592 		}
1593 		if (vm->pt_root[id]) {
1594 			xe_pt_destroy(vm->pt_root[id], vm->flags, NULL);
1595 			vm->pt_root[id] = NULL;
1596 		}
1597 	}
1598 	xe_vm_unlock(vm);
1599 
1600 	/*
1601 	 * VM is now dead, cannot re-add nodes to vm->vmas if it's NULL
1602 	 * Since we hold a refcount to the bo, we can remove and free
1603 	 * the members safely without locking.
1604 	 */
1605 	list_for_each_entry_safe(vma, next_vma, &contested,
1606 				 combined_links.destroy) {
1607 		list_del_init(&vma->combined_links.destroy);
1608 		xe_vma_destroy_unlocked(vma);
1609 	}
1610 
1611 	xe_assert(xe, list_empty(&vm->extobj.list));
1612 	up_write(&vm->lock);
1613 
1614 	mutex_lock(&xe->usm.lock);
1615 	if (vm->flags & XE_VM_FLAG_FAULT_MODE)
1616 		xe->usm.num_vm_in_fault_mode--;
1617 	else if (!(vm->flags & XE_VM_FLAG_MIGRATION))
1618 		xe->usm.num_vm_in_non_fault_mode--;
1619 	mutex_unlock(&xe->usm.lock);
1620 
1621 	for_each_tile(tile, xe, id)
1622 		xe_range_fence_tree_fini(&vm->rftree[id]);
1623 
1624 	xe_vm_put(vm);
1625 }
1626 
1627 static void vm_destroy_work_func(struct work_struct *w)
1628 {
1629 	struct xe_vm *vm =
1630 		container_of(w, struct xe_vm, destroy_work);
1631 	struct xe_device *xe = vm->xe;
1632 	struct xe_tile *tile;
1633 	u8 id;
1634 	void *lookup;
1635 
1636 	/* xe_vm_close_and_put was not called? */
1637 	xe_assert(xe, !vm->size);
1638 
1639 	if (!(vm->flags & XE_VM_FLAG_MIGRATION)) {
1640 		xe_device_mem_access_put(xe);
1641 
1642 		if (xe->info.has_asid) {
1643 			mutex_lock(&xe->usm.lock);
1644 			lookup = xa_erase(&xe->usm.asid_to_vm, vm->usm.asid);
1645 			xe_assert(xe, lookup == vm);
1646 			mutex_unlock(&xe->usm.lock);
1647 		}
1648 	}
1649 
1650 	for_each_tile(tile, xe, id)
1651 		XE_WARN_ON(vm->pt_root[id]);
1652 
1653 	trace_xe_vm_free(vm);
1654 	dma_fence_put(vm->rebind_fence);
1655 	kfree(vm);
1656 }
1657 
1658 static void xe_vm_free(struct drm_gpuvm *gpuvm)
1659 {
1660 	struct xe_vm *vm = container_of(gpuvm, struct xe_vm, gpuvm);
1661 
1662 	/* To destroy the VM we need to be able to sleep */
1663 	queue_work(system_unbound_wq, &vm->destroy_work);
1664 }
1665 
1666 struct xe_vm *xe_vm_lookup(struct xe_file *xef, u32 id)
1667 {
1668 	struct xe_vm *vm;
1669 
1670 	mutex_lock(&xef->vm.lock);
1671 	vm = xa_load(&xef->vm.xa, id);
1672 	if (vm)
1673 		xe_vm_get(vm);
1674 	mutex_unlock(&xef->vm.lock);
1675 
1676 	return vm;
1677 }
1678 
1679 u64 xe_vm_pdp4_descriptor(struct xe_vm *vm, struct xe_tile *tile)
1680 {
1681 	return vm->pt_ops->pde_encode_bo(vm->pt_root[tile->id]->bo, 0,
1682 					 tile_to_xe(tile)->pat.idx[XE_CACHE_WB]);
1683 }
1684 
1685 static struct xe_exec_queue *
1686 to_wait_exec_queue(struct xe_vm *vm, struct xe_exec_queue *q)
1687 {
1688 	return q ? q : vm->q[0];
1689 }
1690 
1691 static struct dma_fence *
1692 xe_vm_unbind_vma(struct xe_vma *vma, struct xe_exec_queue *q,
1693 		 struct xe_sync_entry *syncs, u32 num_syncs,
1694 		 bool first_op, bool last_op)
1695 {
1696 	struct xe_vm *vm = xe_vma_vm(vma);
1697 	struct xe_exec_queue *wait_exec_queue = to_wait_exec_queue(vm, q);
1698 	struct xe_tile *tile;
1699 	struct dma_fence *fence = NULL;
1700 	struct dma_fence **fences = NULL;
1701 	struct dma_fence_array *cf = NULL;
1702 	int cur_fence = 0, i;
1703 	int number_tiles = hweight8(vma->tile_present);
1704 	int err;
1705 	u8 id;
1706 
1707 	trace_xe_vma_unbind(vma);
1708 
1709 	if (number_tiles > 1) {
1710 		fences = kmalloc_array(number_tiles, sizeof(*fences),
1711 				       GFP_KERNEL);
1712 		if (!fences)
1713 			return ERR_PTR(-ENOMEM);
1714 	}
1715 
1716 	for_each_tile(tile, vm->xe, id) {
1717 		if (!(vma->tile_present & BIT(id)))
1718 			goto next;
1719 
1720 		fence = __xe_pt_unbind_vma(tile, vma, q ? q : vm->q[id],
1721 					   first_op ? syncs : NULL,
1722 					   first_op ? num_syncs : 0);
1723 		if (IS_ERR(fence)) {
1724 			err = PTR_ERR(fence);
1725 			goto err_fences;
1726 		}
1727 
1728 		if (fences)
1729 			fences[cur_fence++] = fence;
1730 
1731 next:
1732 		if (q && vm->pt_root[id] && !list_empty(&q->multi_gt_list))
1733 			q = list_next_entry(q, multi_gt_list);
1734 	}
1735 
1736 	if (fences) {
1737 		cf = dma_fence_array_create(number_tiles, fences,
1738 					    vm->composite_fence_ctx,
1739 					    vm->composite_fence_seqno++,
1740 					    false);
1741 		if (!cf) {
1742 			--vm->composite_fence_seqno;
1743 			err = -ENOMEM;
1744 			goto err_fences;
1745 		}
1746 	}
1747 
1748 	fence = cf ? &cf->base : !fence ?
1749 		xe_exec_queue_last_fence_get(wait_exec_queue, vm) : fence;
1750 	if (last_op) {
1751 		for (i = 0; i < num_syncs; i++)
1752 			xe_sync_entry_signal(&syncs[i], NULL, fence);
1753 	}
1754 
1755 	return fence;
1756 
1757 err_fences:
1758 	if (fences) {
1759 		while (cur_fence)
1760 			dma_fence_put(fences[--cur_fence]);
1761 		kfree(fences);
1762 	}
1763 
1764 	return ERR_PTR(err);
1765 }
1766 
1767 static struct dma_fence *
1768 xe_vm_bind_vma(struct xe_vma *vma, struct xe_exec_queue *q,
1769 	       struct xe_sync_entry *syncs, u32 num_syncs,
1770 	       bool first_op, bool last_op)
1771 {
1772 	struct xe_tile *tile;
1773 	struct dma_fence *fence;
1774 	struct dma_fence **fences = NULL;
1775 	struct dma_fence_array *cf = NULL;
1776 	struct xe_vm *vm = xe_vma_vm(vma);
1777 	int cur_fence = 0, i;
1778 	int number_tiles = hweight8(vma->tile_mask);
1779 	int err;
1780 	u8 id;
1781 
1782 	trace_xe_vma_bind(vma);
1783 
1784 	if (number_tiles > 1) {
1785 		fences = kmalloc_array(number_tiles, sizeof(*fences),
1786 				       GFP_KERNEL);
1787 		if (!fences)
1788 			return ERR_PTR(-ENOMEM);
1789 	}
1790 
1791 	for_each_tile(tile, vm->xe, id) {
1792 		if (!(vma->tile_mask & BIT(id)))
1793 			goto next;
1794 
1795 		fence = __xe_pt_bind_vma(tile, vma, q ? q : vm->q[id],
1796 					 first_op ? syncs : NULL,
1797 					 first_op ? num_syncs : 0,
1798 					 vma->tile_present & BIT(id));
1799 		if (IS_ERR(fence)) {
1800 			err = PTR_ERR(fence);
1801 			goto err_fences;
1802 		}
1803 
1804 		if (fences)
1805 			fences[cur_fence++] = fence;
1806 
1807 next:
1808 		if (q && vm->pt_root[id] && !list_empty(&q->multi_gt_list))
1809 			q = list_next_entry(q, multi_gt_list);
1810 	}
1811 
1812 	if (fences) {
1813 		cf = dma_fence_array_create(number_tiles, fences,
1814 					    vm->composite_fence_ctx,
1815 					    vm->composite_fence_seqno++,
1816 					    false);
1817 		if (!cf) {
1818 			--vm->composite_fence_seqno;
1819 			err = -ENOMEM;
1820 			goto err_fences;
1821 		}
1822 	}
1823 
1824 	if (last_op) {
1825 		for (i = 0; i < num_syncs; i++)
1826 			xe_sync_entry_signal(&syncs[i], NULL,
1827 					     cf ? &cf->base : fence);
1828 	}
1829 
1830 	return cf ? &cf->base : fence;
1831 
1832 err_fences:
1833 	if (fences) {
1834 		while (cur_fence)
1835 			dma_fence_put(fences[--cur_fence]);
1836 		kfree(fences);
1837 	}
1838 
1839 	return ERR_PTR(err);
1840 }
1841 
1842 static bool xe_vm_sync_mode(struct xe_vm *vm, struct xe_exec_queue *q)
1843 {
1844 	return q ? !(q->flags & EXEC_QUEUE_FLAG_VM_ASYNC) :
1845 		!(vm->flags & XE_VM_FLAG_ASYNC_DEFAULT);
1846 }
1847 
1848 static int __xe_vm_bind(struct xe_vm *vm, struct xe_vma *vma,
1849 			struct xe_exec_queue *q, struct xe_sync_entry *syncs,
1850 			u32 num_syncs, bool immediate, bool first_op,
1851 			bool last_op)
1852 {
1853 	struct dma_fence *fence;
1854 	struct xe_exec_queue *wait_exec_queue = to_wait_exec_queue(vm, q);
1855 
1856 	xe_vm_assert_held(vm);
1857 
1858 	if (immediate) {
1859 		fence = xe_vm_bind_vma(vma, q, syncs, num_syncs, first_op,
1860 				       last_op);
1861 		if (IS_ERR(fence))
1862 			return PTR_ERR(fence);
1863 	} else {
1864 		int i;
1865 
1866 		xe_assert(vm->xe, xe_vm_in_fault_mode(vm));
1867 
1868 		fence = xe_exec_queue_last_fence_get(wait_exec_queue, vm);
1869 		if (last_op) {
1870 			for (i = 0; i < num_syncs; i++)
1871 				xe_sync_entry_signal(&syncs[i], NULL, fence);
1872 		}
1873 	}
1874 
1875 	if (last_op)
1876 		xe_exec_queue_last_fence_set(wait_exec_queue, vm, fence);
1877 	if (last_op && xe_vm_sync_mode(vm, q))
1878 		dma_fence_wait(fence, true);
1879 	dma_fence_put(fence);
1880 
1881 	return 0;
1882 }
1883 
1884 static int xe_vm_bind(struct xe_vm *vm, struct xe_vma *vma, struct xe_exec_queue *q,
1885 		      struct xe_bo *bo, struct xe_sync_entry *syncs,
1886 		      u32 num_syncs, bool immediate, bool first_op,
1887 		      bool last_op)
1888 {
1889 	int err;
1890 
1891 	xe_vm_assert_held(vm);
1892 	xe_bo_assert_held(bo);
1893 
1894 	if (bo && immediate) {
1895 		err = xe_bo_validate(bo, vm, true);
1896 		if (err)
1897 			return err;
1898 	}
1899 
1900 	return __xe_vm_bind(vm, vma, q, syncs, num_syncs, immediate, first_op,
1901 			    last_op);
1902 }
1903 
1904 static int xe_vm_unbind(struct xe_vm *vm, struct xe_vma *vma,
1905 			struct xe_exec_queue *q, struct xe_sync_entry *syncs,
1906 			u32 num_syncs, bool first_op, bool last_op)
1907 {
1908 	struct dma_fence *fence;
1909 	struct xe_exec_queue *wait_exec_queue = to_wait_exec_queue(vm, q);
1910 
1911 	xe_vm_assert_held(vm);
1912 	xe_bo_assert_held(xe_vma_bo(vma));
1913 
1914 	fence = xe_vm_unbind_vma(vma, q, syncs, num_syncs, first_op, last_op);
1915 	if (IS_ERR(fence))
1916 		return PTR_ERR(fence);
1917 
1918 	xe_vma_destroy(vma, fence);
1919 	if (last_op)
1920 		xe_exec_queue_last_fence_set(wait_exec_queue, vm, fence);
1921 	if (last_op && xe_vm_sync_mode(vm, q))
1922 		dma_fence_wait(fence, true);
1923 	dma_fence_put(fence);
1924 
1925 	return 0;
1926 }
1927 
1928 #define ALL_DRM_XE_VM_CREATE_FLAGS (DRM_XE_VM_CREATE_FLAG_SCRATCH_PAGE | \
1929 				    DRM_XE_VM_CREATE_FLAG_COMPUTE_MODE | \
1930 				    DRM_XE_VM_CREATE_FLAG_ASYNC_DEFAULT | \
1931 				    DRM_XE_VM_CREATE_FLAG_FAULT_MODE)
1932 
1933 int xe_vm_create_ioctl(struct drm_device *dev, void *data,
1934 		       struct drm_file *file)
1935 {
1936 	struct xe_device *xe = to_xe_device(dev);
1937 	struct xe_file *xef = to_xe_file(file);
1938 	struct drm_xe_vm_create *args = data;
1939 	struct xe_tile *tile;
1940 	struct xe_vm *vm;
1941 	u32 id, asid;
1942 	int err;
1943 	u32 flags = 0;
1944 
1945 	if (XE_IOCTL_DBG(xe, args->extensions))
1946 		return -EINVAL;
1947 
1948 	if (XE_WA(xe_root_mmio_gt(xe), 14016763929))
1949 		args->flags |= DRM_XE_VM_CREATE_FLAG_SCRATCH_PAGE;
1950 
1951 	if (XE_IOCTL_DBG(xe, args->flags & DRM_XE_VM_CREATE_FLAG_FAULT_MODE &&
1952 			 !xe->info.supports_usm))
1953 		return -EINVAL;
1954 
1955 	if (XE_IOCTL_DBG(xe, args->reserved[0] || args->reserved[1]))
1956 		return -EINVAL;
1957 
1958 	if (XE_IOCTL_DBG(xe, args->flags & ~ALL_DRM_XE_VM_CREATE_FLAGS))
1959 		return -EINVAL;
1960 
1961 	if (XE_IOCTL_DBG(xe, args->flags & DRM_XE_VM_CREATE_FLAG_SCRATCH_PAGE &&
1962 			 args->flags & DRM_XE_VM_CREATE_FLAG_FAULT_MODE))
1963 		return -EINVAL;
1964 
1965 	if (XE_IOCTL_DBG(xe, args->flags & DRM_XE_VM_CREATE_FLAG_COMPUTE_MODE &&
1966 			 args->flags & DRM_XE_VM_CREATE_FLAG_FAULT_MODE))
1967 		return -EINVAL;
1968 
1969 	if (XE_IOCTL_DBG(xe, args->flags & DRM_XE_VM_CREATE_FLAG_FAULT_MODE &&
1970 			 xe_device_in_non_fault_mode(xe)))
1971 		return -EINVAL;
1972 
1973 	if (XE_IOCTL_DBG(xe, !(args->flags & DRM_XE_VM_CREATE_FLAG_FAULT_MODE) &&
1974 			 xe_device_in_fault_mode(xe)))
1975 		return -EINVAL;
1976 
1977 	if (XE_IOCTL_DBG(xe, args->extensions))
1978 		return -EINVAL;
1979 
1980 	if (args->flags & DRM_XE_VM_CREATE_FLAG_SCRATCH_PAGE)
1981 		flags |= XE_VM_FLAG_SCRATCH_PAGE;
1982 	if (args->flags & DRM_XE_VM_CREATE_FLAG_COMPUTE_MODE)
1983 		flags |= XE_VM_FLAG_LR_MODE;
1984 	if (args->flags & DRM_XE_VM_CREATE_FLAG_ASYNC_DEFAULT)
1985 		flags |= XE_VM_FLAG_ASYNC_DEFAULT;
1986 	if (args->flags & DRM_XE_VM_CREATE_FLAG_FAULT_MODE)
1987 		flags |= XE_VM_FLAG_LR_MODE | XE_VM_FLAG_FAULT_MODE;
1988 
1989 	vm = xe_vm_create(xe, flags);
1990 	if (IS_ERR(vm))
1991 		return PTR_ERR(vm);
1992 
1993 	mutex_lock(&xef->vm.lock);
1994 	err = xa_alloc(&xef->vm.xa, &id, vm, xa_limit_32b, GFP_KERNEL);
1995 	mutex_unlock(&xef->vm.lock);
1996 	if (err) {
1997 		xe_vm_close_and_put(vm);
1998 		return err;
1999 	}
2000 
2001 	if (xe->info.has_asid) {
2002 		mutex_lock(&xe->usm.lock);
2003 		err = xa_alloc_cyclic(&xe->usm.asid_to_vm, &asid, vm,
2004 				      XA_LIMIT(1, XE_MAX_ASID - 1),
2005 				      &xe->usm.next_asid, GFP_KERNEL);
2006 		mutex_unlock(&xe->usm.lock);
2007 		if (err < 0) {
2008 			xe_vm_close_and_put(vm);
2009 			return err;
2010 		}
2011 		err = 0;
2012 		vm->usm.asid = asid;
2013 	}
2014 
2015 	args->vm_id = id;
2016 	vm->xef = xef;
2017 
2018 	/* Record BO memory for the VM pagetables created against this client */
2019 	for_each_tile(tile, xe, id)
2020 		if (vm->pt_root[id])
2021 			xe_drm_client_add_bo(vm->xef->client, vm->pt_root[id]->bo);
2022 
2023 #if IS_ENABLED(CONFIG_DRM_XE_DEBUG_MEM)
2024 	/* Warning: Security issue - never enable by default */
2025 	args->reserved[0] = xe_bo_main_addr(vm->pt_root[0]->bo, XE_PAGE_SIZE);
2026 #endif
2027 
2028 	return 0;
2029 }
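/*
 * Illustrative userspace sketch (not part of the driver): creating a VM via
 * this IOCTL from libdrm might look roughly like the following. The fd, the
 * flag choice and the error policy are assumptions made only for the example.
 *
 *	struct drm_xe_vm_create create = {
 *		.flags = DRM_XE_VM_CREATE_FLAG_SCRATCH_PAGE,
 *	};
 *
 *	if (drmIoctl(fd, DRM_IOCTL_XE_VM_CREATE, &create))
 *		return -errno;
 *
 *	vm_id = create.vm_id;	(handle used by later VM_BIND/EXEC IOCTLs)
 */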
2030 
2031 int xe_vm_destroy_ioctl(struct drm_device *dev, void *data,
2032 			struct drm_file *file)
2033 {
2034 	struct xe_device *xe = to_xe_device(dev);
2035 	struct xe_file *xef = to_xe_file(file);
2036 	struct drm_xe_vm_destroy *args = data;
2037 	struct xe_vm *vm;
2038 	int err = 0;
2039 
2040 	if (XE_IOCTL_DBG(xe, args->pad) ||
2041 	    XE_IOCTL_DBG(xe, args->reserved[0] || args->reserved[1]))
2042 		return -EINVAL;
2043 
2044 	mutex_lock(&xef->vm.lock);
2045 	vm = xa_load(&xef->vm.xa, args->vm_id);
2046 	if (XE_IOCTL_DBG(xe, !vm))
2047 		err = -ENOENT;
2048 	else if (XE_IOCTL_DBG(xe, vm->preempt.num_exec_queues))
2049 		err = -EBUSY;
2050 	else
2051 		xa_erase(&xef->vm.xa, args->vm_id);
2052 	mutex_unlock(&xef->vm.lock);
2053 
2054 	if (!err)
2055 		xe_vm_close_and_put(vm);
2056 
2057 	return err;
2058 }
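/*
 * Illustrative userspace counterpart (sketch only, same assumptions as the
 * create example above): tear the VM down once no exec queues use it.
 *
 *	struct drm_xe_vm_destroy destroy = {
 *		.vm_id = vm_id,
 *	};
 *
 *	drmIoctl(fd, DRM_IOCTL_XE_VM_DESTROY, &destroy);
 */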
2059 
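/*
 * The index into this table is the bind op's prefetch_mem_region_instance
 * (validated against xe->info.mem_region_mask in vm_bind_ioctl_check_args()):
 * index 0 selects the TT (system) placement, 1 and 2 the two possible VRAM
 * placements.
 */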
2060 static const u32 region_to_mem_type[] = {
2061 	XE_PL_TT,
2062 	XE_PL_VRAM0,
2063 	XE_PL_VRAM1,
2064 };
2065 
2066 static int xe_vm_prefetch(struct xe_vm *vm, struct xe_vma *vma,
2067 			  struct xe_exec_queue *q, u32 region,
2068 			  struct xe_sync_entry *syncs, u32 num_syncs,
2069 			  bool first_op, bool last_op)
2070 {
2071 	struct xe_exec_queue *wait_exec_queue = to_wait_exec_queue(vm, q);
2072 	int err;
2073 
2074 	xe_assert(vm->xe, region < ARRAY_SIZE(region_to_mem_type));
2075 
2076 	if (!xe_vma_has_no_bo(vma)) {
2077 		err = xe_bo_migrate(xe_vma_bo(vma), region_to_mem_type[region]);
2078 		if (err)
2079 			return err;
2080 	}
2081 
2082 	if (vma->tile_mask != (vma->tile_present & ~vma->usm.tile_invalidated)) {
2083 		return xe_vm_bind(vm, vma, q, xe_vma_bo(vma), syncs, num_syncs,
2084 				  true, first_op, last_op);
2085 	} else {
2086 		int i;
2087 
2088 		/* Nothing to do, signal fences now */
2089 		if (last_op) {
2090 			for (i = 0; i < num_syncs; i++) {
2091 				struct dma_fence *fence =
2092 					xe_exec_queue_last_fence_get(wait_exec_queue, vm);
2093 
2094 				xe_sync_entry_signal(&syncs[i], NULL, fence);
2095 			}
2096 		}
2097 
2098 		return 0;
2099 	}
2100 }
2101 
2102 static void prep_vma_destroy(struct xe_vm *vm, struct xe_vma *vma,
2103 			     bool post_commit)
2104 {
2105 	down_read(&vm->userptr.notifier_lock);
2106 	vma->gpuva.flags |= XE_VMA_DESTROYED;
2107 	up_read(&vm->userptr.notifier_lock);
2108 	if (post_commit)
2109 		xe_vm_remove_vma(vm, vma);
2110 }
2111 
2112 #undef ULL
2113 #define ULL	unsigned long long
2114 
2115 #if IS_ENABLED(CONFIG_DRM_XE_DEBUG_VM)
2116 static void print_op(struct xe_device *xe, struct drm_gpuva_op *op)
2117 {
2118 	struct xe_vma *vma;
2119 
2120 	switch (op->op) {
2121 	case DRM_GPUVA_OP_MAP:
2122 		vm_dbg(&xe->drm, "MAP: addr=0x%016llx, range=0x%016llx",
2123 		       (ULL)op->map.va.addr, (ULL)op->map.va.range);
2124 		break;
2125 	case DRM_GPUVA_OP_REMAP:
2126 		vma = gpuva_to_vma(op->remap.unmap->va);
2127 		vm_dbg(&xe->drm, "REMAP:UNMAP: addr=0x%016llx, range=0x%016llx, keep=%d",
2128 		       (ULL)xe_vma_start(vma), (ULL)xe_vma_size(vma),
2129 		       op->remap.unmap->keep ? 1 : 0);
2130 		if (op->remap.prev)
2131 			vm_dbg(&xe->drm,
2132 			       "REMAP:PREV: addr=0x%016llx, range=0x%016llx",
2133 			       (ULL)op->remap.prev->va.addr,
2134 			       (ULL)op->remap.prev->va.range);
2135 		if (op->remap.next)
2136 			vm_dbg(&xe->drm,
2137 			       "REMAP:NEXT: addr=0x%016llx, range=0x%016llx",
2138 			       (ULL)op->remap.next->va.addr,
2139 			       (ULL)op->remap.next->va.range);
2140 		break;
2141 	case DRM_GPUVA_OP_UNMAP:
2142 		vma = gpuva_to_vma(op->unmap.va);
2143 		vm_dbg(&xe->drm, "UNMAP: addr=0x%016llx, range=0x%016llx, keep=%d",
2144 		       (ULL)xe_vma_start(vma), (ULL)xe_vma_size(vma),
2145 		       op->unmap.keep ? 1 : 0);
2146 		break;
2147 	case DRM_GPUVA_OP_PREFETCH:
2148 		vma = gpuva_to_vma(op->prefetch.va);
2149 		vm_dbg(&xe->drm, "PREFETCH: addr=0x%016llx, range=0x%016llx",
2150 		       (ULL)xe_vma_start(vma), (ULL)xe_vma_size(vma));
2151 		break;
2152 	default:
2153 		drm_warn(&xe->drm, "NOT POSSIBLE");
2154 	}
2155 }
2156 #else
2157 static void print_op(struct xe_device *xe, struct drm_gpuva_op *op)
2158 {
2159 }
2160 #endif
2161 
2162 /*
2163  * Create operations list from IOCTL arguments and set up operation fields so the
2164  * parse and commit steps are decoupled from the IOCTL arguments. This step can fail.
2165  */
2166 static struct drm_gpuva_ops *
2167 vm_bind_ioctl_ops_create(struct xe_vm *vm, struct xe_bo *bo,
2168 			 u64 bo_offset_or_userptr, u64 addr, u64 range,
2169 			 u32 operation, u32 flags, u8 tile_mask,
2170 			 u32 prefetch_region, u16 pat_index)
2171 {
2172 	struct drm_gem_object *obj = bo ? &bo->ttm.base : NULL;
2173 	struct drm_gpuva_ops *ops;
2174 	struct drm_gpuva_op *__op;
2175 	struct xe_vma_op *op;
2176 	struct drm_gpuvm_bo *vm_bo;
2177 	int err;
2178 
2179 	lockdep_assert_held_write(&vm->lock);
2180 
2181 	vm_dbg(&vm->xe->drm,
2182 	       "op=%d, addr=0x%016llx, range=0x%016llx, bo_offset_or_userptr=0x%016llx",
2183 	       operation, (ULL)addr, (ULL)range,
2184 	       (ULL)bo_offset_or_userptr);
2185 
2186 	switch (operation) {
2187 	case DRM_XE_VM_BIND_OP_MAP:
2188 	case DRM_XE_VM_BIND_OP_MAP_USERPTR:
2189 		ops = drm_gpuvm_sm_map_ops_create(&vm->gpuvm, addr, range,
2190 						  obj, bo_offset_or_userptr);
2191 		break;
2192 	case DRM_XE_VM_BIND_OP_UNMAP:
2193 		ops = drm_gpuvm_sm_unmap_ops_create(&vm->gpuvm, addr, range);
2194 		break;
2195 	case DRM_XE_VM_BIND_OP_PREFETCH:
2196 		ops = drm_gpuvm_prefetch_ops_create(&vm->gpuvm, addr, range);
2197 		break;
2198 	case DRM_XE_VM_BIND_OP_UNMAP_ALL:
2199 		xe_assert(vm->xe, bo);
2200 
2201 		err = xe_bo_lock(bo, true);
2202 		if (err)
2203 			return ERR_PTR(err);
2204 
2205 		vm_bo = drm_gpuvm_bo_find(&vm->gpuvm, obj);
2206 		if (!vm_bo) {
			xe_bo_unlock(bo);
			return ERR_PTR(-ENODATA);
2207 		}
2208 
2209 		ops = drm_gpuvm_bo_unmap_ops_create(vm_bo);
2210 		drm_gpuvm_bo_put(vm_bo);
2211 		xe_bo_unlock(bo);
2212 		break;
2213 	default:
2214 		drm_warn(&vm->xe->drm, "NOT POSSIBLE");
2215 		ops = ERR_PTR(-EINVAL);
2216 	}
2217 	if (IS_ERR(ops))
2218 		return ops;
2219 
2220 #ifdef TEST_VM_ASYNC_OPS_ERROR
2221 	if (operation & FORCE_ASYNC_OP_ERROR) {
2222 		op = list_first_entry_or_null(&ops->list, struct xe_vma_op,
2223 					      base.entry);
2224 		if (op)
2225 			op->inject_error = true;
2226 	}
2227 #endif
2228 
2229 	drm_gpuva_for_each_op(__op, ops) {
2230 		struct xe_vma_op *op = gpuva_op_to_vma_op(__op);
2231 
2232 		op->tile_mask = tile_mask;
2233 		if (__op->op == DRM_GPUVA_OP_MAP) {
2234 			op->map.immediate =
2235 				flags & DRM_XE_VM_BIND_FLAG_IMMEDIATE;
2236 			op->map.read_only =
2237 				flags & DRM_XE_VM_BIND_FLAG_READONLY;
2238 			op->map.is_null = flags & DRM_XE_VM_BIND_FLAG_NULL;
2239 			op->map.pat_index = pat_index;
2240 		} else if (__op->op == DRM_GPUVA_OP_PREFETCH) {
2241 			op->prefetch.region = prefetch_region;
2242 		}
2243 
2244 		print_op(vm->xe, __op);
2245 	}
2246 
2247 	return ops;
2248 }
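/*
 * Rough shape of how the stages fit together (an illustrative sketch
 * mirroring xe_vm_bind_ioctl() below; locking and error handling elided):
 *
 *	ops[i] = vm_bind_ioctl_ops_create(vm, bos[i], obj_offset, addr,
 *					  range, op, flags, tile_mask,
 *					  prefetch_region, pat_index);
 *	err = vm_bind_ioctl_ops_parse(vm, q, ops[i], syncs, num_syncs,
 *				      &ops_list, last, async);
 *	err = vm_bind_ioctl_ops_execute(vm, &ops_list);
 *
 * On failure before execute, vm_bind_ioctl_ops_unwind() walks the created
 * ops in reverse to undo the committed VMA changes.
 */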
2249 
2250 static struct xe_vma *new_vma(struct xe_vm *vm, struct drm_gpuva_op_map *op,
2251 			      u8 tile_mask, bool read_only, bool is_null,
2252 			      u16 pat_index)
2253 {
2254 	struct xe_bo *bo = op->gem.obj ? gem_to_xe_bo(op->gem.obj) : NULL;
2255 	struct xe_vma *vma;
2256 	int err;
2257 
2258 	lockdep_assert_held_write(&vm->lock);
2259 
2260 	if (bo) {
2261 		err = xe_bo_lock(bo, true);
2262 		if (err)
2263 			return ERR_PTR(err);
2264 	}
2265 	vma = xe_vma_create(vm, bo, op->gem.offset,
2266 			    op->va.addr, op->va.addr +
2267 			    op->va.range - 1, read_only, is_null,
2268 			    tile_mask, pat_index);
2269 	if (bo)
2270 		xe_bo_unlock(bo);
2271 
2272 	if (xe_vma_is_userptr(vma)) {
2273 		err = xe_vma_userptr_pin_pages(vma);
2274 		if (err) {
2275 			prep_vma_destroy(vm, vma, false);
2276 			xe_vma_destroy_unlocked(vma);
2277 			return ERR_PTR(err);
2278 		}
2279 	} else if (!xe_vma_has_no_bo(vma) && !bo->vm) {
2280 		vm_insert_extobj(vm, vma);
2281 		err = add_preempt_fences(vm, bo);
2282 		if (err) {
2283 			prep_vma_destroy(vm, vma, false);
2284 			xe_vma_destroy_unlocked(vma);
2285 			return ERR_PTR(err);
2286 		}
2287 	}
2288 
2289 	return vma;
2290 }
2291 
2292 static u64 xe_vma_max_pte_size(struct xe_vma *vma)
2293 {
2294 	if (vma->gpuva.flags & XE_VMA_PTE_1G)
2295 		return SZ_1G;
2296 	else if (vma->gpuva.flags & XE_VMA_PTE_2M)
2297 		return SZ_2M;
2298 
2299 	return SZ_4K;
2300 }
2301 
2302 static u64 xe_vma_set_pte_size(struct xe_vma *vma, u64 size)
2303 {
2304 	switch (size) {
2305 	case SZ_1G:
2306 		vma->gpuva.flags |= XE_VMA_PTE_1G;
2307 		break;
2308 	case SZ_2M:
2309 		vma->gpuva.flags |= XE_VMA_PTE_2M;
2310 		break;
2311 	}
2312 
2313 	return SZ_4K;
2314 }
2315 
2316 static int xe_vma_op_commit(struct xe_vm *vm, struct xe_vma_op *op)
2317 {
2318 	int err = 0;
2319 
2320 	lockdep_assert_held_write(&vm->lock);
2321 
2322 	switch (op->base.op) {
2323 	case DRM_GPUVA_OP_MAP:
2324 		err |= xe_vm_insert_vma(vm, op->map.vma);
2325 		if (!err)
2326 			op->flags |= XE_VMA_OP_COMMITTED;
2327 		break;
2328 	case DRM_GPUVA_OP_REMAP:
2329 	{
2330 		u8 tile_present =
2331 			gpuva_to_vma(op->base.remap.unmap->va)->tile_present;
2332 
2333 		prep_vma_destroy(vm, gpuva_to_vma(op->base.remap.unmap->va),
2334 				 true);
2335 		op->flags |= XE_VMA_OP_COMMITTED;
2336 
2337 		if (op->remap.prev) {
2338 			err |= xe_vm_insert_vma(vm, op->remap.prev);
2339 			if (!err)
2340 				op->flags |= XE_VMA_OP_PREV_COMMITTED;
2341 			if (!err && op->remap.skip_prev) {
2342 				op->remap.prev->tile_present =
2343 					tile_present;
2344 				op->remap.prev = NULL;
2345 			}
2346 		}
2347 		if (op->remap.next) {
2348 			err |= xe_vm_insert_vma(vm, op->remap.next);
2349 			if (!err)
2350 				op->flags |= XE_VMA_OP_NEXT_COMMITTED;
2351 			if (!err && op->remap.skip_next) {
2352 				op->remap.next->tile_present =
2353 					tile_present;
2354 				op->remap.next = NULL;
2355 			}
2356 		}
2357 
2358 		/* Adjust for partial unbind after removing VMA from VM */
2359 		if (!err) {
2360 			op->base.remap.unmap->va->va.addr = op->remap.start;
2361 			op->base.remap.unmap->va->va.range = op->remap.range;
2362 		}
2363 		break;
2364 	}
2365 	case DRM_GPUVA_OP_UNMAP:
2366 		prep_vma_destroy(vm, gpuva_to_vma(op->base.unmap.va), true);
2367 		op->flags |= XE_VMA_OP_COMMITTED;
2368 		break;
2369 	case DRM_GPUVA_OP_PREFETCH:
2370 		op->flags |= XE_VMA_OP_COMMITTED;
2371 		break;
2372 	default:
2373 		drm_warn(&vm->xe->drm, "NOT POSSIBLE");
2374 	}
2375 
2376 	return err;
2377 }
2378 
2380 static int vm_bind_ioctl_ops_parse(struct xe_vm *vm, struct xe_exec_queue *q,
2381 				   struct drm_gpuva_ops *ops,
2382 				   struct xe_sync_entry *syncs, u32 num_syncs,
2383 				   struct list_head *ops_list, bool last,
2384 				   bool async)
2385 {
2386 	struct xe_vma_op *last_op = NULL;
2387 	struct drm_gpuva_op *__op;
2388 	int err = 0;
2389 
2390 	lockdep_assert_held_write(&vm->lock);
2391 
2392 	drm_gpuva_for_each_op(__op, ops) {
2393 		struct xe_vma_op *op = gpuva_op_to_vma_op(__op);
2394 		bool first = list_empty(ops_list);
2395 
2396 		INIT_LIST_HEAD(&op->link);
2397 		list_add_tail(&op->link, ops_list);
2398 
2399 		if (first) {
2400 			op->flags |= XE_VMA_OP_FIRST;
2401 			op->num_syncs = num_syncs;
2402 			op->syncs = syncs;
2403 		}
2404 
2405 		op->q = q;
2406 
2407 		switch (op->base.op) {
2408 		case DRM_GPUVA_OP_MAP:
2409 		{
2410 			struct xe_vma *vma;
2411 
2412 			vma = new_vma(vm, &op->base.map,
2413 				      op->tile_mask, op->map.read_only,
2414 				      op->map.is_null, op->map.pat_index);
2415 			if (IS_ERR(vma))
2416 				return PTR_ERR(vma);
2417 
2418 			op->map.vma = vma;
2419 			break;
2420 		}
2421 		case DRM_GPUVA_OP_REMAP:
2422 		{
2423 			struct xe_vma *old =
2424 				gpuva_to_vma(op->base.remap.unmap->va);
2425 
2426 			op->remap.start = xe_vma_start(old);
2427 			op->remap.range = xe_vma_size(old);
2428 
2429 			if (op->base.remap.prev) {
2430 				struct xe_vma *vma;
2431 				bool read_only =
2432 					op->base.remap.unmap->va->flags &
2433 					XE_VMA_READ_ONLY;
2434 				bool is_null =
2435 					op->base.remap.unmap->va->flags &
2436 					DRM_GPUVA_SPARSE;
2437 
2438 				vma = new_vma(vm, op->base.remap.prev,
2439 					      op->tile_mask, read_only,
2440 					      is_null, old->pat_index);
2441 				if (IS_ERR(vma))
2442 					return PTR_ERR(vma);
2443 
2444 				op->remap.prev = vma;
2445 
2446 				/*
2447 				 * Userptr creates a new SG mapping so
2448 				 * we must also rebind.
2449 				 */
2450 				op->remap.skip_prev = !xe_vma_is_userptr(old) &&
2451 					IS_ALIGNED(xe_vma_end(vma),
2452 						   xe_vma_max_pte_size(old));
2453 				if (op->remap.skip_prev) {
2454 					xe_vma_set_pte_size(vma, xe_vma_max_pte_size(old));
2455 					op->remap.range -=
2456 						xe_vma_end(vma) -
2457 						xe_vma_start(old);
2458 					op->remap.start = xe_vma_end(vma);
2459 				}
2460 			}
2461 
2462 			if (op->base.remap.next) {
2463 				struct xe_vma *vma;
2464 				bool read_only =
2465 					op->base.remap.unmap->va->flags &
2466 					XE_VMA_READ_ONLY;
2467 
2468 				bool is_null =
2469 					op->base.remap.unmap->va->flags &
2470 					DRM_GPUVA_SPARSE;
2471 
2472 				vma = new_vma(vm, op->base.remap.next,
2473 					      op->tile_mask, read_only,
2474 					      is_null, old->pat_index);
2475 				if (IS_ERR(vma))
2476 					return PTR_ERR(vma);
2477 
2478 				op->remap.next = vma;
2479 
2480 				/*
2481 				 * Userptr creates a new SG mapping so
2482 				 * we must also rebind.
2483 				 */
2484 				op->remap.skip_next = !xe_vma_is_userptr(old) &&
2485 					IS_ALIGNED(xe_vma_start(vma),
2486 						   xe_vma_max_pte_size(old));
2487 				if (op->remap.skip_next) {
2488 					xe_vma_set_pte_size(vma, xe_vma_max_pte_size(old));
2489 					op->remap.range -=
2490 						xe_vma_end(old) -
2491 						xe_vma_start(vma);
2492 				}
2493 			}
2494 			break;
2495 		}
2496 		case DRM_GPUVA_OP_UNMAP:
2497 		case DRM_GPUVA_OP_PREFETCH:
2498 			/* Nothing to do */
2499 			break;
2500 		default:
2501 			drm_warn(&vm->xe->drm, "NOT POSSIBLE");
2502 		}
2503 
2504 		last_op = op;
2505 
2506 		err = xe_vma_op_commit(vm, op);
2507 		if (err)
2508 			return err;
2509 	}
2510 
2511 	/* FIXME: Unhandled corner case */
2512 	XE_WARN_ON(!last_op && last && !list_empty(ops_list));
2513 
2514 	if (!last_op)
2515 		return 0;
2516 
2517 	last_op->ops = ops;
2518 	if (last) {
2519 		last_op->flags |= XE_VMA_OP_LAST;
2520 		last_op->num_syncs = num_syncs;
2521 		last_op->syncs = syncs;
2522 	}
2523 
2524 	return 0;
2525 }
2526 
2527 static int op_execute(struct drm_exec *exec, struct xe_vm *vm,
2528 		      struct xe_vma *vma, struct xe_vma_op *op)
2529 {
2530 	int err;
2531 
2532 	lockdep_assert_held_write(&vm->lock);
2533 
2534 	err = xe_vm_prepare_vma(exec, vma, 1);
2535 	if (err)
2536 		return err;
2537 
2538 	xe_vm_assert_held(vm);
2539 	xe_bo_assert_held(xe_vma_bo(vma));
2540 
2541 	switch (op->base.op) {
2542 	case DRM_GPUVA_OP_MAP:
2543 		err = xe_vm_bind(vm, vma, op->q, xe_vma_bo(vma),
2544 				 op->syncs, op->num_syncs,
2545 				 op->map.immediate || !xe_vm_in_fault_mode(vm),
2546 				 op->flags & XE_VMA_OP_FIRST,
2547 				 op->flags & XE_VMA_OP_LAST);
2548 		break;
2549 	case DRM_GPUVA_OP_REMAP:
2550 	{
2551 		bool prev = !!op->remap.prev;
2552 		bool next = !!op->remap.next;
2553 
2554 		if (!op->remap.unmap_done) {
2555 			if (prev || next)
2556 				vma->gpuva.flags |= XE_VMA_FIRST_REBIND;
2557 			err = xe_vm_unbind(vm, vma, op->q, op->syncs,
2558 					   op->num_syncs,
2559 					   op->flags & XE_VMA_OP_FIRST,
2560 					   op->flags & XE_VMA_OP_LAST &&
2561 					   !prev && !next);
2562 			if (err)
2563 				break;
2564 			op->remap.unmap_done = true;
2565 		}
2566 
2567 		if (prev) {
2568 			op->remap.prev->gpuva.flags |= XE_VMA_LAST_REBIND;
2569 			err = xe_vm_bind(vm, op->remap.prev, op->q,
2570 					 xe_vma_bo(op->remap.prev), op->syncs,
2571 					 op->num_syncs, true, false,
2572 					 op->flags & XE_VMA_OP_LAST && !next);
2573 			op->remap.prev->gpuva.flags &= ~XE_VMA_LAST_REBIND;
2574 			if (err)
2575 				break;
2576 			op->remap.prev = NULL;
2577 		}
2578 
2579 		if (next) {
2580 			op->remap.next->gpuva.flags |= XE_VMA_LAST_REBIND;
2581 			err = xe_vm_bind(vm, op->remap.next, op->q,
2582 					 xe_vma_bo(op->remap.next),
2583 					 op->syncs, op->num_syncs,
2584 					 true, false,
2585 					 op->flags & XE_VMA_OP_LAST);
2586 			op->remap.next->gpuva.flags &= ~XE_VMA_LAST_REBIND;
2587 			if (err)
2588 				break;
2589 			op->remap.next = NULL;
2590 		}
2591 
2592 		break;
2593 	}
2594 	case DRM_GPUVA_OP_UNMAP:
2595 		err = xe_vm_unbind(vm, vma, op->q, op->syncs,
2596 				   op->num_syncs, op->flags & XE_VMA_OP_FIRST,
2597 				   op->flags & XE_VMA_OP_LAST);
2598 		break;
2599 	case DRM_GPUVA_OP_PREFETCH:
2600 		err = xe_vm_prefetch(vm, vma, op->q, op->prefetch.region,
2601 				     op->syncs, op->num_syncs,
2602 				     op->flags & XE_VMA_OP_FIRST,
2603 				     op->flags & XE_VMA_OP_LAST);
2604 		break;
2605 	default:
2606 		drm_warn(&vm->xe->drm, "NOT POSSIBLE");
2607 	}
2608 
2609 	if (err)
2610 		trace_xe_vma_fail(vma);
2611 
2612 	return err;
2613 }
2614 
2615 static int __xe_vma_op_execute(struct xe_vm *vm, struct xe_vma *vma,
2616 			       struct xe_vma_op *op)
2617 {
2618 	struct drm_exec exec;
2619 	int err;
2620 
2621 retry_userptr:
2622 	drm_exec_init(&exec, DRM_EXEC_INTERRUPTIBLE_WAIT);
2623 	drm_exec_until_all_locked(&exec) {
2624 		err = op_execute(&exec, vm, vma, op);
2625 		drm_exec_retry_on_contention(&exec);
2626 		if (err)
2627 			break;
2628 	}
2629 	drm_exec_fini(&exec);
2630 
2631 	if (err == -EAGAIN && xe_vma_is_userptr(vma)) {
2632 		lockdep_assert_held_write(&vm->lock);
2633 		err = xe_vma_userptr_pin_pages(vma);
2634 		if (!err)
2635 			goto retry_userptr;
2636 
2637 		trace_xe_vma_fail(vma);
2638 	}
2639 
2640 	return err;
2641 }
2642 
2643 static int xe_vma_op_execute(struct xe_vm *vm, struct xe_vma_op *op)
2644 {
2645 	int ret = 0;
2646 
2647 	lockdep_assert_held_write(&vm->lock);
2648 
2649 #ifdef TEST_VM_ASYNC_OPS_ERROR
2650 	if (op->inject_error) {
2651 		op->inject_error = false;
2652 		return -ENOMEM;
2653 	}
2654 #endif
2655 
2656 	switch (op->base.op) {
2657 	case DRM_GPUVA_OP_MAP:
2658 		ret = __xe_vma_op_execute(vm, op->map.vma, op);
2659 		break;
2660 	case DRM_GPUVA_OP_REMAP:
2661 	{
2662 		struct xe_vma *vma;
2663 
2664 		if (!op->remap.unmap_done)
2665 			vma = gpuva_to_vma(op->base.remap.unmap->va);
2666 		else if (op->remap.prev)
2667 			vma = op->remap.prev;
2668 		else
2669 			vma = op->remap.next;
2670 
2671 		ret = __xe_vma_op_execute(vm, vma, op);
2672 		break;
2673 	}
2674 	case DRM_GPUVA_OP_UNMAP:
2675 		ret = __xe_vma_op_execute(vm, gpuva_to_vma(op->base.unmap.va),
2676 					  op);
2677 		break;
2678 	case DRM_GPUVA_OP_PREFETCH:
2679 		ret = __xe_vma_op_execute(vm,
2680 					  gpuva_to_vma(op->base.prefetch.va),
2681 					  op);
2682 		break;
2683 	default:
2684 		drm_warn(&vm->xe->drm, "NOT POSSIBLE");
2685 	}
2686 
2687 	return ret;
2688 }
2689 
2690 static void xe_vma_op_cleanup(struct xe_vm *vm, struct xe_vma_op *op)
2691 {
2692 	bool last = op->flags & XE_VMA_OP_LAST;
2693 
2694 	if (last) {
2695 		while (op->num_syncs--)
2696 			xe_sync_entry_cleanup(&op->syncs[op->num_syncs]);
2697 		kfree(op->syncs);
2698 		if (op->q)
2699 			xe_exec_queue_put(op->q);
2700 	}
2701 	if (!list_empty(&op->link))
2702 		list_del(&op->link);
2703 	if (op->ops)
2704 		drm_gpuva_ops_free(&vm->gpuvm, op->ops);
2705 	if (last)
2706 		xe_vm_put(vm);
2707 }
2708 
2709 static void xe_vma_op_unwind(struct xe_vm *vm, struct xe_vma_op *op,
2710 			     bool post_commit, bool prev_post_commit,
2711 			     bool next_post_commit)
2712 {
2713 	lockdep_assert_held_write(&vm->lock);
2714 
2715 	switch (op->base.op) {
2716 	case DRM_GPUVA_OP_MAP:
2717 		if (op->map.vma) {
2718 			prep_vma_destroy(vm, op->map.vma, post_commit);
2719 			xe_vma_destroy_unlocked(op->map.vma);
2720 		}
2721 		break;
2722 	case DRM_GPUVA_OP_UNMAP:
2723 	{
2724 		struct xe_vma *vma = gpuva_to_vma(op->base.unmap.va);
2725 
2726 		if (vma) {
2727 			down_read(&vm->userptr.notifier_lock);
2728 			vma->gpuva.flags &= ~XE_VMA_DESTROYED;
2729 			up_read(&vm->userptr.notifier_lock);
2730 			if (post_commit)
2731 				xe_vm_insert_vma(vm, vma);
2732 		}
2733 		break;
2734 	}
2735 	case DRM_GPUVA_OP_REMAP:
2736 	{
2737 		struct xe_vma *vma = gpuva_to_vma(op->base.remap.unmap->va);
2738 
2739 		if (op->remap.prev) {
2740 			prep_vma_destroy(vm, op->remap.prev, prev_post_commit);
2741 			xe_vma_destroy_unlocked(op->remap.prev);
2742 		}
2743 		if (op->remap.next) {
2744 			prep_vma_destroy(vm, op->remap.next, next_post_commit);
2745 			xe_vma_destroy_unlocked(op->remap.next);
2746 		}
2747 		if (vma) {
2748 			down_read(&vm->userptr.notifier_lock);
2749 			vma->gpuva.flags &= ~XE_VMA_DESTROYED;
2750 			up_read(&vm->userptr.notifier_lock);
2751 			if (post_commit)
2752 				xe_vm_insert_vma(vm, vma);
2753 		}
2754 		break;
2755 	}
2756 	case DRM_GPUVA_OP_PREFETCH:
2757 		/* Nothing to do */
2758 		break;
2759 	default:
2760 		drm_warn(&vm->xe->drm, "NOT POSSIBLE");
2761 	}
2762 }
2763 
2764 static void vm_bind_ioctl_ops_unwind(struct xe_vm *vm,
2765 				     struct drm_gpuva_ops **ops,
2766 				     int num_ops_list)
2767 {
2768 	int i;
2769 
2770 	for (i = num_ops_list - 1; i >= 0; --i) {
2771 		struct drm_gpuva_ops *__ops = ops[i];
2772 		struct drm_gpuva_op *__op;
2773 
2774 		if (!__ops)
2775 			continue;
2776 
2777 		drm_gpuva_for_each_op_reverse(__op, __ops) {
2778 			struct xe_vma_op *op = gpuva_op_to_vma_op(__op);
2779 
2780 			xe_vma_op_unwind(vm, op,
2781 					 op->flags & XE_VMA_OP_COMMITTED,
2782 					 op->flags & XE_VMA_OP_PREV_COMMITTED,
2783 					 op->flags & XE_VMA_OP_NEXT_COMMITTED);
2784 		}
2785 
2786 		drm_gpuva_ops_free(&vm->gpuvm, __ops);
2787 	}
2788 }
2789 
2790 static int vm_bind_ioctl_ops_execute(struct xe_vm *vm,
2791 				     struct list_head *ops_list)
2792 {
2793 	struct xe_vma_op *op, *next;
2794 	int err;
2795 
2796 	lockdep_assert_held_write(&vm->lock);
2797 
2798 	list_for_each_entry_safe(op, next, ops_list, link) {
2799 		err = xe_vma_op_execute(vm, op);
2800 		if (err) {
2801 			drm_warn(&vm->xe->drm, "VM op(%d) failed with %d",
2802 				 op->base.op, err);
2803 			/*
2804 			 * FIXME: Killing VM rather than proper error handling
2805 			 */
2806 			xe_vm_kill(vm);
2807 			return -ENOSPC;
2808 		}
2809 		xe_vma_op_cleanup(vm, op);
2810 	}
2811 
2812 	return 0;
2813 }
2814 
2815 #ifdef TEST_VM_ASYNC_OPS_ERROR
2816 #define SUPPORTED_FLAGS	\
2817 	(FORCE_ASYNC_OP_ERROR | DRM_XE_VM_BIND_FLAG_ASYNC | \
2818 	 DRM_XE_VM_BIND_FLAG_READONLY | DRM_XE_VM_BIND_FLAG_IMMEDIATE | \
2819 	 DRM_XE_VM_BIND_FLAG_NULL | 0xffff)
2820 #else
2821 #define SUPPORTED_FLAGS	\
2822 	(DRM_XE_VM_BIND_FLAG_ASYNC | DRM_XE_VM_BIND_FLAG_READONLY | \
2823 	 DRM_XE_VM_BIND_FLAG_IMMEDIATE | DRM_XE_VM_BIND_FLAG_NULL | \
2824 	 0xffff)
2825 #endif
2826 #define XE_64K_PAGE_MASK 0xffffull
2827 
2828 #define MAX_BINDS	512	/* FIXME: Picking random upper limit */
2829 
2830 static int vm_bind_ioctl_check_args(struct xe_device *xe,
2831 				    struct drm_xe_vm_bind *args,
2832 				    struct drm_xe_vm_bind_op **bind_ops,
2833 				    bool *async)
2834 {
2835 	int err;
2836 	int i;
2837 
2838 	if (XE_IOCTL_DBG(xe, args->extensions) ||
2839 	    XE_IOCTL_DBG(xe, !args->num_binds) ||
2840 	    XE_IOCTL_DBG(xe, args->num_binds > MAX_BINDS))
2841 		return -EINVAL;
2842 
2843 	if (args->num_binds > 1) {
2844 		u64 __user *bind_user =
2845 			u64_to_user_ptr(args->vector_of_binds);
2846 
2847 		*bind_ops = kmalloc(sizeof(struct drm_xe_vm_bind_op) *
2848 				    args->num_binds, GFP_KERNEL);
2849 		if (!*bind_ops)
2850 			return -ENOMEM;
2851 
2852 		err = __copy_from_user(*bind_ops, bind_user,
2853 				       sizeof(struct drm_xe_vm_bind_op) *
2854 				       args->num_binds);
2855 		if (XE_IOCTL_DBG(xe, err)) {
2856 			err = -EFAULT;
2857 			goto free_bind_ops;
2858 		}
2859 	} else {
2860 		*bind_ops = &args->bind;
2861 	}
2862 
2863 	for (i = 0; i < args->num_binds; ++i) {
2864 		u64 range = (*bind_ops)[i].range;
2865 		u64 addr = (*bind_ops)[i].addr;
2866 		u32 op = (*bind_ops)[i].op;
2867 		u32 flags = (*bind_ops)[i].flags;
2868 		u32 obj = (*bind_ops)[i].obj;
2869 		u64 obj_offset = (*bind_ops)[i].obj_offset;
2870 		u32 prefetch_region = (*bind_ops)[i].prefetch_mem_region_instance;
2871 		bool is_null = flags & DRM_XE_VM_BIND_FLAG_NULL;
2872 		u16 pat_index = (*bind_ops)[i].pat_index;
2873 		u16 coh_mode;
2874 
2875 		if (XE_IOCTL_DBG(xe, pat_index >= xe->pat.n_entries)) {
2876 			err = -EINVAL;
2877 			goto free_bind_ops;
2878 		}
2879 
2880 		pat_index = array_index_nospec(pat_index, xe->pat.n_entries);
2881 		(*bind_ops)[i].pat_index = pat_index;
2882 		coh_mode = xe_pat_index_get_coh_mode(xe, pat_index);
2883 		if (XE_IOCTL_DBG(xe, !coh_mode)) { /* hw reserved */
2884 			err = -EINVAL;
2885 			goto free_bind_ops;
2886 		}
2887 
2888 		if (XE_WARN_ON(coh_mode > XE_COH_AT_LEAST_1WAY)) {
2889 			err = -EINVAL;
2890 			goto free_bind_ops;
2891 		}
2892 
2893 		if (i == 0) {
2894 			*async = !!(flags & DRM_XE_VM_BIND_FLAG_ASYNC);
2895 			if (XE_IOCTL_DBG(xe, !*async && args->num_syncs)) {
2896 				err = -EINVAL;
2897 				goto free_bind_ops;
2898 			}
2899 		} else if (XE_IOCTL_DBG(xe, *async !=
2900 					!!(flags & DRM_XE_VM_BIND_FLAG_ASYNC))) {
2901 			err = -EINVAL;
2902 			goto free_bind_ops;
2903 		}
2904 
2905 		if (XE_IOCTL_DBG(xe, op > DRM_XE_VM_BIND_OP_PREFETCH) ||
2906 		    XE_IOCTL_DBG(xe, flags & ~SUPPORTED_FLAGS) ||
2907 		    XE_IOCTL_DBG(xe, obj && is_null) ||
2908 		    XE_IOCTL_DBG(xe, obj_offset && is_null) ||
2909 		    XE_IOCTL_DBG(xe, op != DRM_XE_VM_BIND_OP_MAP &&
2910 				 is_null) ||
2911 		    XE_IOCTL_DBG(xe, !obj &&
2912 				 op == DRM_XE_VM_BIND_OP_MAP &&
2913 				 !is_null) ||
2914 		    XE_IOCTL_DBG(xe, !obj &&
2915 				 op == DRM_XE_VM_BIND_OP_UNMAP_ALL) ||
2916 		    XE_IOCTL_DBG(xe, addr &&
2917 				 op == DRM_XE_VM_BIND_OP_UNMAP_ALL) ||
2918 		    XE_IOCTL_DBG(xe, range &&
2919 				 op == DRM_XE_VM_BIND_OP_UNMAP_ALL) ||
2920 		    XE_IOCTL_DBG(xe, obj &&
2921 				 op == DRM_XE_VM_BIND_OP_MAP_USERPTR) ||
2922 		    XE_IOCTL_DBG(xe, coh_mode == XE_COH_NONE &&
2923 				 op == DRM_XE_VM_BIND_OP_MAP_USERPTR) ||
2924 		    XE_IOCTL_DBG(xe, obj &&
2925 				 op == DRM_XE_VM_BIND_OP_PREFETCH) ||
2926 		    XE_IOCTL_DBG(xe, prefetch_region &&
2927 				 op != DRM_XE_VM_BIND_OP_PREFETCH) ||
2928 		    XE_IOCTL_DBG(xe, !(BIT(prefetch_region) &
2929 				       xe->info.mem_region_mask)) ||
2930 		    XE_IOCTL_DBG(xe, obj &&
2931 				 op == DRM_XE_VM_BIND_OP_UNMAP)) {
2932 			err = -EINVAL;
2933 			goto free_bind_ops;
2934 		}
2935 
2936 		if (XE_IOCTL_DBG(xe, obj_offset & ~PAGE_MASK) ||
2937 		    XE_IOCTL_DBG(xe, addr & ~PAGE_MASK) ||
2938 		    XE_IOCTL_DBG(xe, range & ~PAGE_MASK) ||
2939 		    XE_IOCTL_DBG(xe, !range &&
2940 				 op != DRM_XE_VM_BIND_OP_UNMAP_ALL)) {
2941 			err = -EINVAL;
2942 			goto free_bind_ops;
2943 		}
2944 	}
2945 
2946 	return 0;
2947 
2948 free_bind_ops:
2949 	if (args->num_binds > 1)
2950 		kfree(*bind_ops);
2951 	return err;
2952 }
2953 
2954 int xe_vm_bind_ioctl(struct drm_device *dev, void *data, struct drm_file *file)
2955 {
2956 	struct xe_device *xe = to_xe_device(dev);
2957 	struct xe_file *xef = to_xe_file(file);
2958 	struct drm_xe_vm_bind *args = data;
2959 	struct drm_xe_sync __user *syncs_user;
2960 	struct xe_bo **bos = NULL;
2961 	struct drm_gpuva_ops **ops = NULL;
2962 	struct xe_vm *vm;
2963 	struct xe_exec_queue *q = NULL;
2964 	u32 num_syncs;
2965 	struct xe_sync_entry *syncs = NULL;
2966 	struct drm_xe_vm_bind_op *bind_ops;
2967 	LIST_HEAD(ops_list);
2968 	bool async;
2969 	int err;
2970 	int i;
2971 
2972 	err = vm_bind_ioctl_check_args(xe, args, &bind_ops, &async);
2973 	if (err)
2974 		return err;
2975 
2976 	if (args->exec_queue_id) {
2977 		q = xe_exec_queue_lookup(xef, args->exec_queue_id);
2978 		if (XE_IOCTL_DBG(xe, !q)) {
2979 			err = -ENOENT;
2980 			goto free_objs;
2981 		}
2982 
2983 		if (XE_IOCTL_DBG(xe, !(q->flags & EXEC_QUEUE_FLAG_VM))) {
2984 			err = -EINVAL;
2985 			goto put_exec_queue;
2986 		}
2987 
2988 		if (XE_IOCTL_DBG(xe, async !=
2989 				 !!(q->flags & EXEC_QUEUE_FLAG_VM_ASYNC))) {
2990 			err = -EINVAL;
2991 			goto put_exec_queue;
2992 		}
2993 	}
2994 
2995 	vm = xe_vm_lookup(xef, args->vm_id);
2996 	if (XE_IOCTL_DBG(xe, !vm)) {
2997 		err = -EINVAL;
2998 		goto put_exec_queue;
2999 	}
3000 
3001 	if (!args->exec_queue_id) {
3002 		if (XE_IOCTL_DBG(xe, async !=
3003 				 !!(vm->flags & XE_VM_FLAG_ASYNC_DEFAULT))) {
3004 			err = -EINVAL;
3005 			goto put_vm;
3006 		}
3007 	}
3008 
3009 	err = down_write_killable(&vm->lock);
3010 	if (err)
3011 		goto put_vm;
3012 
3013 	if (XE_IOCTL_DBG(xe, xe_vm_is_closed_or_banned(vm))) {
3014 		err = -ENOENT;
3015 		goto release_vm_lock;
3016 	}
3017 
3018 	for (i = 0; i < args->num_binds; ++i) {
3019 		u64 range = bind_ops[i].range;
3020 		u64 addr = bind_ops[i].addr;
3021 
3022 		if (XE_IOCTL_DBG(xe, range > vm->size) ||
3023 		    XE_IOCTL_DBG(xe, addr > vm->size - range)) {
3024 			err = -EINVAL;
3025 			goto release_vm_lock;
3026 		}
3027 
3028 		if (bind_ops[i].tile_mask) {
3029 			u64 valid_tiles = BIT(xe->info.tile_count) - 1;
3030 
3031 			if (XE_IOCTL_DBG(xe, bind_ops[i].tile_mask &
3032 					 ~valid_tiles)) {
3033 				err = -EINVAL;
3034 				goto release_vm_lock;
3035 			}
3036 		}
3037 	}
3038 
3039 	bos = kzalloc(sizeof(*bos) * args->num_binds, GFP_KERNEL);
3040 	if (!bos) {
3041 		err = -ENOMEM;
3042 		goto release_vm_lock;
3043 	}
3044 
3045 	ops = kzalloc(sizeof(*ops) * args->num_binds, GFP_KERNEL);
3046 	if (!ops) {
3047 		err = -ENOMEM;
3048 		goto release_vm_lock;
3049 	}
3050 
3051 	for (i = 0; i < args->num_binds; ++i) {
3052 		struct drm_gem_object *gem_obj;
3053 		u64 range = bind_ops[i].range;
3054 		u64 addr = bind_ops[i].addr;
3055 		u32 obj = bind_ops[i].obj;
3056 		u64 obj_offset = bind_ops[i].obj_offset;
3057 		u16 pat_index = bind_ops[i].pat_index;
3058 		u16 coh_mode;
3059 
3060 		if (!obj)
3061 			continue;
3062 
3063 		gem_obj = drm_gem_object_lookup(file, obj);
3064 		if (XE_IOCTL_DBG(xe, !gem_obj)) {
3065 			err = -ENOENT;
3066 			goto put_obj;
3067 		}
3068 		bos[i] = gem_to_xe_bo(gem_obj);
3069 
3070 		if (XE_IOCTL_DBG(xe, range > bos[i]->size) ||
3071 		    XE_IOCTL_DBG(xe, obj_offset >
3072 				 bos[i]->size - range)) {
3073 			err = -EINVAL;
3074 			goto put_obj;
3075 		}
3076 
3077 		if (bos[i]->flags & XE_BO_INTERNAL_64K) {
3078 			if (XE_IOCTL_DBG(xe, obj_offset &
3079 					 XE_64K_PAGE_MASK) ||
3080 			    XE_IOCTL_DBG(xe, addr & XE_64K_PAGE_MASK) ||
3081 			    XE_IOCTL_DBG(xe, range & XE_64K_PAGE_MASK)) {
3082 				err = -EINVAL;
3083 				goto put_obj;
3084 			}
3085 		}
3086 
3087 		coh_mode = xe_pat_index_get_coh_mode(xe, pat_index);
3088 		if (bos[i]->cpu_caching) {
3089 			if (XE_IOCTL_DBG(xe, coh_mode == XE_COH_NONE &&
3090 					 bos[i]->cpu_caching == DRM_XE_GEM_CPU_CACHING_WB)) {
3091 				err = -EINVAL;
3092 				goto put_obj;
3093 			}
3094 		} else if (XE_IOCTL_DBG(xe, coh_mode == XE_COH_NONE)) {
3095 			/*
3096 			 * Imported dma-buf from a different device should
3097 			 * require 1way or 2way coherency since we don't know
3098 			 * how it was mapped on the CPU. Just assume it is
3099 			 * potentially cached on the CPU side.
3100 			 */
3101 			err = -EINVAL;
3102 			goto put_obj;
3103 		}
3104 	}
3105 
3106 	if (args->num_syncs) {
3107 		syncs = kcalloc(args->num_syncs, sizeof(*syncs), GFP_KERNEL);
3108 		if (!syncs) {
3109 			err = -ENOMEM;
3110 			goto put_obj;
3111 		}
3112 	}
3113 
3114 	syncs_user = u64_to_user_ptr(args->syncs);
3115 	for (num_syncs = 0; num_syncs < args->num_syncs; num_syncs++) {
3116 		err = xe_sync_entry_parse(xe, xef, &syncs[num_syncs],
3117 					  &syncs_user[num_syncs], false,
3118 					  xe_vm_in_lr_mode(vm));
3119 		if (err)
3120 			goto free_syncs;
3121 	}
3122 
3123 	for (i = 0; i < args->num_binds; ++i) {
3124 		u64 range = bind_ops[i].range;
3125 		u64 addr = bind_ops[i].addr;
3126 		u32 op = bind_ops[i].op;
3127 		u32 flags = bind_ops[i].flags;
3128 		u64 obj_offset = bind_ops[i].obj_offset;
3129 		u8 tile_mask = bind_ops[i].tile_mask;
3130 		u32 prefetch_region = bind_ops[i].prefetch_mem_region_instance;
3131 		u16 pat_index = bind_ops[i].pat_index;
3132 
3133 		ops[i] = vm_bind_ioctl_ops_create(vm, bos[i], obj_offset,
3134 						  addr, range, op, flags,
3135 						  tile_mask, prefetch_region,
3136 						  pat_index);
3137 		if (IS_ERR(ops[i])) {
3138 			err = PTR_ERR(ops[i]);
3139 			ops[i] = NULL;
3140 			goto unwind_ops;
3141 		}
3142 
3143 		err = vm_bind_ioctl_ops_parse(vm, q, ops[i], syncs, num_syncs,
3144 					      &ops_list,
3145 					      i == args->num_binds - 1,
3146 					      async);
3147 		if (err)
3148 			goto unwind_ops;
3149 	}
3150 
3151 	/* Nothing to do */
3152 	if (list_empty(&ops_list)) {
3153 		err = -ENODATA;
3154 		goto unwind_ops;
3155 	}
3156 
3157 	xe_vm_get(vm);
3158 	if (q)
3159 		xe_exec_queue_get(q);
3160 
3161 	err = vm_bind_ioctl_ops_execute(vm, &ops_list);
3162 
3163 	up_write(&vm->lock);
3164 
3165 	if (q)
3166 		xe_exec_queue_put(q);
3167 	xe_vm_put(vm);
3168 
3169 	for (i = 0; bos && i < args->num_binds; ++i)
3170 		xe_bo_put(bos[i]);
3171 
3172 	kfree(bos);
3173 	kfree(ops);
3174 	if (args->num_binds > 1)
3175 		kfree(bind_ops);
3176 
3177 	return err;
3178 
3179 unwind_ops:
3180 	vm_bind_ioctl_ops_unwind(vm, ops, args->num_binds);
3181 free_syncs:
3182 	for (i = 0; err == -ENODATA && i < num_syncs; i++) {
3183 		struct dma_fence *fence =
3184 			xe_exec_queue_last_fence_get(to_wait_exec_queue(vm, q), vm);
3185 
3186 		xe_sync_entry_signal(&syncs[i], NULL, fence);
3187 	}
3188 	while (num_syncs--)
3189 		xe_sync_entry_cleanup(&syncs[num_syncs]);
3190 
3191 	kfree(syncs);
3192 put_obj:
3193 	for (i = 0; i < args->num_binds; ++i)
3194 		xe_bo_put(bos[i]);
3195 release_vm_lock:
3196 	up_write(&vm->lock);
3197 put_vm:
3198 	xe_vm_put(vm);
3199 put_exec_queue:
3200 	if (q)
3201 		xe_exec_queue_put(q);
3202 free_objs:
3203 	kfree(bos);
3204 	kfree(ops);
3205 	if (args->num_binds > 1)
3206 		kfree(bind_ops);
3207 	return err == -ENODATA ? 0 : err;
3208 }
3209 
3210 /**
3211  * xe_vm_lock() - Lock the vm's dma_resv object
3212  * @vm: The struct xe_vm whose lock is to be locked
3213  * @intr: Whether to perform any waits interruptibly
3214  *
3215  * Return: 0 on success, -EINTR if @intr is true and the wait for a
3216  * contended lock was interrupted. If @intr is false, the function
3217  * always returns 0.
3218  */
3219 int xe_vm_lock(struct xe_vm *vm, bool intr)
3220 {
3221 	if (intr)
3222 		return dma_resv_lock_interruptible(xe_vm_resv(vm), NULL);
3223 
3224 	return dma_resv_lock(xe_vm_resv(vm), NULL);
3225 }
3226 
3227 /**
3228  * xe_vm_unlock() - Unlock the vm's dma_resv object
3229  * @vm: The struct xe_vm whose lock is to be released.
3230  *
3231  * Unlock a buffer object lock that was locked by xe_vm_lock().
3232  */
3233 void xe_vm_unlock(struct xe_vm *vm)
3234 {
3235 	dma_resv_unlock(xe_vm_resv(vm));
3236 }
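/*
 * Illustrative usage sketch for the pair above (assumptions: the caller owns
 * a vm reference and tolerates -EINTR when requesting an interruptible wait):
 *
 *	err = xe_vm_lock(vm, true);
 *	if (err)
 *		return err;
 *
 *	... touch state protected by the VM's dma_resv ...
 *
 *	xe_vm_unlock(vm);
 */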
3237 
3238 /**
3239  * xe_vm_invalidate_vma - invalidate GPU mappings for VMA without a lock
3240  * @vma: VMA to invalidate
3241  *
3242  * Walks the list of page-table leaves, zeroing (memset) the entries owned by
3243  * this VMA, invalidates the TLBs, and blocks until the TLB invalidation is
3244  * complete.
3245  *
3246  * Returns 0 for success, negative error code otherwise.
3247  */
3248 int xe_vm_invalidate_vma(struct xe_vma *vma)
3249 {
3250 	struct xe_device *xe = xe_vma_vm(vma)->xe;
3251 	struct xe_tile *tile;
3252 	u32 tile_needs_invalidate = 0;
3253 	int seqno[XE_MAX_TILES_PER_DEVICE];
3254 	u8 id;
3255 	int ret;
3256 
3257 	xe_assert(xe, xe_vm_in_fault_mode(xe_vma_vm(vma)));
3258 	xe_assert(xe, !xe_vma_is_null(vma));
3259 	trace_xe_vma_usm_invalidate(vma);
3260 
3261 	/* Check that we don't race with page-table updates */
3262 	if (IS_ENABLED(CONFIG_PROVE_LOCKING)) {
3263 		if (xe_vma_is_userptr(vma)) {
3264 			WARN_ON_ONCE(!mmu_interval_check_retry
3265 				     (&vma->userptr.notifier,
3266 				      vma->userptr.notifier_seq));
3267 			WARN_ON_ONCE(!dma_resv_test_signaled(xe_vm_resv(xe_vma_vm(vma)),
3268 							     DMA_RESV_USAGE_BOOKKEEP));
3269 
3270 		} else {
3271 			xe_bo_assert_held(xe_vma_bo(vma));
3272 		}
3273 	}
3274 
3275 	for_each_tile(tile, xe, id) {
3276 		if (xe_pt_zap_ptes(tile, vma)) {
3277 			tile_needs_invalidate |= BIT(id);
3278 			xe_device_wmb(xe);
3279 			/*
3280 			 * FIXME: We potentially need to invalidate multiple
3281 			 * GTs within the tile
3282 			 */
3283 			seqno[id] = xe_gt_tlb_invalidation_vma(tile->primary_gt, NULL, vma);
3284 			if (seqno[id] < 0)
3285 				return seqno[id];
3286 		}
3287 	}
3288 
3289 	for_each_tile(tile, xe, id) {
3290 		if (tile_needs_invalidate & BIT(id)) {
3291 			ret = xe_gt_tlb_invalidation_wait(tile->primary_gt, seqno[id]);
3292 			if (ret < 0)
3293 				return ret;
3294 		}
3295 	}
3296 
3297 	vma->usm.tile_invalidated = vma->tile_mask;
3298 
3299 	return 0;
3300 }
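/*
 * Illustrative caller sketch (assumption for the example only: the calling
 * context, e.g. a userptr invalidation on a fault-mode VM, already satisfies
 * the locking asserts above):
 *
 *	ret = xe_vm_invalidate_vma(vma);
 *	XE_WARN_ON(ret);
 */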
3301 
3302 int xe_analyze_vm(struct drm_printer *p, struct xe_vm *vm, int gt_id)
3303 {
3304 	struct drm_gpuva *gpuva;
3305 	bool is_vram;
3306 	uint64_t addr;
3307 
3308 	if (!down_read_trylock(&vm->lock)) {
3309 		drm_printf(p, " Failed to acquire VM lock to dump capture\n");
3310 		return 0;
3311 	}
3312 	if (vm->pt_root[gt_id]) {
3313 		addr = xe_bo_addr(vm->pt_root[gt_id]->bo, 0, XE_PAGE_SIZE);
3314 		is_vram = xe_bo_is_vram(vm->pt_root[gt_id]->bo);
3315 		drm_printf(p, " VM root: A:0x%llx %s\n", addr,
3316 			   is_vram ? "VRAM" : "SYS");
3317 	}
3318 
3319 	drm_gpuvm_for_each_va(gpuva, &vm->gpuvm) {
3320 		struct xe_vma *vma = gpuva_to_vma(gpuva);
3321 		bool is_userptr = xe_vma_is_userptr(vma);
3322 		bool is_null = xe_vma_is_null(vma);
3323 
3324 		if (is_null) {
3325 			addr = 0;
3326 		} else if (is_userptr) {
3327 			struct xe_res_cursor cur;
3328 
3329 			if (vma->userptr.sg) {
3330 				xe_res_first_sg(vma->userptr.sg, 0, XE_PAGE_SIZE,
3331 						&cur);
3332 				addr = xe_res_dma(&cur);
3333 			} else {
3334 				addr = 0;
3335 			}
3336 		} else {
3337 			addr = __xe_bo_addr(xe_vma_bo(vma), 0, XE_PAGE_SIZE);
3338 			is_vram = xe_bo_is_vram(xe_vma_bo(vma));
3339 		}
3340 		drm_printf(p, " [%016llx-%016llx] S:0x%016llx A:%016llx %s\n",
3341 			   xe_vma_start(vma), xe_vma_end(vma) - 1,
3342 			   xe_vma_size(vma),
3343 			   addr, is_null ? "NULL" : is_userptr ? "USR" :
3344 			   is_vram ? "VRAM" : "SYS");
3345 	}
3346 	up_read(&vm->lock);
3347 
3348 	return 0;
3349 }
3350