xref: /linux/drivers/gpu/drm/xe/xe_vm.c (revision 44343e8b250abb2f6bfd615493ca07a7f11f3cc2)
1 // SPDX-License-Identifier: MIT
2 /*
3  * Copyright © 2021 Intel Corporation
4  */
5 
6 #include "xe_vm.h"
7 
8 #include <linux/dma-fence-array.h>
9 #include <linux/nospec.h>
10 
11 #include <drm/drm_drv.h>
12 #include <drm/drm_exec.h>
13 #include <drm/drm_print.h>
14 #include <drm/ttm/ttm_tt.h>
15 #include <uapi/drm/xe_drm.h>
16 #include <linux/ascii85.h>
17 #include <linux/delay.h>
18 #include <linux/kthread.h>
19 #include <linux/mm.h>
20 #include <linux/swap.h>
21 
22 #include <generated/xe_wa_oob.h>
23 
24 #include "regs/xe_gtt_defs.h"
25 #include "xe_assert.h"
26 #include "xe_bo.h"
27 #include "xe_device.h"
28 #include "xe_drm_client.h"
29 #include "xe_exec_queue.h"
30 #include "xe_gt_pagefault.h"
31 #include "xe_migrate.h"
32 #include "xe_pat.h"
33 #include "xe_pm.h"
34 #include "xe_preempt_fence.h"
35 #include "xe_pt.h"
36 #include "xe_pxp.h"
37 #include "xe_res_cursor.h"
38 #include "xe_svm.h"
39 #include "xe_sync.h"
40 #include "xe_tile.h"
41 #include "xe_tlb_inval.h"
42 #include "xe_trace_bo.h"
43 #include "xe_wa.h"
44 #include "xe_hmm.h"
45 
46 static struct drm_gem_object *xe_vm_obj(struct xe_vm *vm)
47 {
48 	return vm->gpuvm.r_obj;
49 }
50 
51 /**
52  * xe_vma_userptr_check_repin() - Advisory check for repin needed
53  * @uvma: The userptr vma
54  *
55  * Check if the userptr vma has been invalidated since last successful
56  * repin. The check is advisory only and the function can be called
57  * without the vm->userptr.notifier_lock held. There is no guarantee that the
58  * vma userptr will remain valid after a lockless check, so typically
59  * the call needs to be followed by a proper check under the notifier_lock.
60  *
61  * Return: 0 if userptr vma is valid, -EAGAIN otherwise; repin recommended.
62  */
63 int xe_vma_userptr_check_repin(struct xe_userptr_vma *uvma)
64 {
65 	return mmu_interval_check_retry(&uvma->userptr.notifier,
66 					uvma->userptr.notifier_seq) ?
67 		-EAGAIN : 0;
68 }
69 
70 int xe_vma_userptr_pin_pages(struct xe_userptr_vma *uvma)
71 {
72 	struct xe_vma *vma = &uvma->vma;
73 	struct xe_vm *vm = xe_vma_vm(vma);
74 	struct xe_device *xe = vm->xe;
75 
76 	lockdep_assert_held(&vm->lock);
77 	xe_assert(xe, xe_vma_is_userptr(vma));
78 
79 	return xe_hmm_userptr_populate_range(uvma, false);
80 }
81 
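/*
 * Returns true if any exec queue on the VM either has no preempt fence
 * installed or has a fence whose software signaling has been enabled,
 * i.e. a preemption is pending for that queue.
 */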
82 static bool preempt_fences_waiting(struct xe_vm *vm)
83 {
84 	struct xe_exec_queue *q;
85 
86 	lockdep_assert_held(&vm->lock);
87 	xe_vm_assert_held(vm);
88 
89 	list_for_each_entry(q, &vm->preempt.exec_queues, lr.link) {
90 		if (!q->lr.pfence ||
91 		    test_bit(DMA_FENCE_FLAG_ENABLE_SIGNAL_BIT,
92 			     &q->lr.pfence->flags)) {
93 			return true;
94 		}
95 	}
96 
97 	return false;
98 }
99 
100 static void free_preempt_fences(struct list_head *list)
101 {
102 	struct list_head *link, *next;
103 
104 	list_for_each_safe(link, next, list)
105 		xe_preempt_fence_free(to_preempt_fence_from_link(link));
106 }
107 
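/*
 * Allocate preempt fences onto @list until there is one per exec queue on
 * the VM; @count tracks how many have already been allocated across
 * retries of the rebind worker.
 */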
108 static int alloc_preempt_fences(struct xe_vm *vm, struct list_head *list,
109 				unsigned int *count)
110 {
111 	lockdep_assert_held(&vm->lock);
112 	xe_vm_assert_held(vm);
113 
114 	if (*count >= vm->preempt.num_exec_queues)
115 		return 0;
116 
117 	for (; *count < vm->preempt.num_exec_queues; ++(*count)) {
118 		struct xe_preempt_fence *pfence = xe_preempt_fence_alloc();
119 
120 		if (IS_ERR(pfence))
121 			return PTR_ERR(pfence);
122 
123 		list_move_tail(xe_preempt_fence_link(pfence), list);
124 	}
125 
126 	return 0;
127 }
128 
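/*
 * Wait for all currently installed preempt fences to signal and drop them.
 * A fence error of -ETIME means the preemption timed out and the VM needs
 * to be killed.
 */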
129 static int wait_for_existing_preempt_fences(struct xe_vm *vm)
130 {
131 	struct xe_exec_queue *q;
132 
133 	xe_vm_assert_held(vm);
134 
135 	list_for_each_entry(q, &vm->preempt.exec_queues, lr.link) {
136 		if (q->lr.pfence) {
137 			long timeout = dma_fence_wait(q->lr.pfence, false);
138 
139 			/* Only -ETIME on fence indicates VM needs to be killed */
140 			if (timeout < 0 || q->lr.pfence->error == -ETIME)
141 				return -ETIME;
142 
143 			dma_fence_put(q->lr.pfence);
144 			q->lr.pfence = NULL;
145 		}
146 	}
147 
148 	return 0;
149 }
150 
151 static bool xe_vm_is_idle(struct xe_vm *vm)
152 {
153 	struct xe_exec_queue *q;
154 
155 	xe_vm_assert_held(vm);
156 	list_for_each_entry(q, &vm->preempt.exec_queues, lr.link) {
157 		if (!xe_exec_queue_is_idle(q))
158 			return false;
159 	}
160 
161 	return true;
162 }
163 
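/*
 * Take one pre-allocated fence from @list per exec queue, arm it with the
 * queue's preempt-fence context and next seqno, and install it as the
 * queue's current preempt fence.
 */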
164 static void arm_preempt_fences(struct xe_vm *vm, struct list_head *list)
165 {
166 	struct list_head *link;
167 	struct xe_exec_queue *q;
168 
169 	list_for_each_entry(q, &vm->preempt.exec_queues, lr.link) {
170 		struct dma_fence *fence;
171 
172 		link = list->next;
173 		xe_assert(vm->xe, link != list);
174 
175 		fence = xe_preempt_fence_arm(to_preempt_fence_from_link(link),
176 					     q, q->lr.context,
177 					     ++q->lr.seqno);
178 		dma_fence_put(q->lr.pfence);
179 		q->lr.pfence = fence;
180 	}
181 }
182 
183 static int add_preempt_fences(struct xe_vm *vm, struct xe_bo *bo)
184 {
185 	struct xe_exec_queue *q;
186 	int err;
187 
188 	xe_bo_assert_held(bo);
189 
190 	if (!vm->preempt.num_exec_queues)
191 		return 0;
192 
193 	err = dma_resv_reserve_fences(bo->ttm.base.resv, vm->preempt.num_exec_queues);
194 	if (err)
195 		return err;
196 
197 	list_for_each_entry(q, &vm->preempt.exec_queues, lr.link)
198 		if (q->lr.pfence) {
199 			dma_resv_add_fence(bo->ttm.base.resv,
200 					   q->lr.pfence,
201 					   DMA_RESV_USAGE_BOOKKEEP);
202 		}
203 
204 	return 0;
205 }
206 
207 static void resume_and_reinstall_preempt_fences(struct xe_vm *vm,
208 						struct drm_exec *exec)
209 {
210 	struct xe_exec_queue *q;
211 
212 	lockdep_assert_held(&vm->lock);
213 	xe_vm_assert_held(vm);
214 
215 	list_for_each_entry(q, &vm->preempt.exec_queues, lr.link) {
216 		q->ops->resume(q);
217 
218 		drm_gpuvm_resv_add_fence(&vm->gpuvm, exec, q->lr.pfence,
219 					 DMA_RESV_USAGE_BOOKKEEP, DMA_RESV_USAGE_BOOKKEEP);
220 	}
221 }
222 
223 int xe_vm_add_compute_exec_queue(struct xe_vm *vm, struct xe_exec_queue *q)
224 {
225 	struct drm_gpuvm_exec vm_exec = {
226 		.vm = &vm->gpuvm,
227 		.flags = DRM_EXEC_INTERRUPTIBLE_WAIT,
228 		.num_fences = 1,
229 	};
230 	struct drm_exec *exec = &vm_exec.exec;
231 	struct dma_fence *pfence;
232 	int err;
233 	bool wait;
234 
235 	xe_assert(vm->xe, xe_vm_in_preempt_fence_mode(vm));
236 
237 	down_write(&vm->lock);
238 	err = drm_gpuvm_exec_lock(&vm_exec);
239 	if (err)
240 		goto out_up_write;
241 
242 	pfence = xe_preempt_fence_create(q, q->lr.context,
243 					 ++q->lr.seqno);
244 	if (!pfence) {
245 		err = -ENOMEM;
246 		goto out_fini;
247 	}
248 
249 	list_add(&q->lr.link, &vm->preempt.exec_queues);
250 	++vm->preempt.num_exec_queues;
251 	q->lr.pfence = pfence;
252 
253 	down_read(&vm->userptr.notifier_lock);
254 
255 	drm_gpuvm_resv_add_fence(&vm->gpuvm, exec, pfence,
256 				 DMA_RESV_USAGE_BOOKKEEP, DMA_RESV_USAGE_BOOKKEEP);
257 
258 	/*
259 	 * Check whether a preemption on the VM or a userptr invalidation is
260 	 * in flight; if so, trigger this preempt fence to sync state with the
261 	 * other preempt fences on the VM.
262 	 */
263 	wait = __xe_vm_userptr_needs_repin(vm) || preempt_fences_waiting(vm);
264 	if (wait)
265 		dma_fence_enable_sw_signaling(pfence);
266 
267 	up_read(&vm->userptr.notifier_lock);
268 
269 out_fini:
270 	drm_exec_fini(exec);
271 out_up_write:
272 	up_write(&vm->lock);
273 
274 	return err;
275 }
276 ALLOW_ERROR_INJECTION(xe_vm_add_compute_exec_queue, ERRNO);
277 
278 /**
279  * xe_vm_remove_compute_exec_queue() - Remove compute exec queue from VM
280  * @vm: The VM.
281  * @q: The exec_queue
282  *
283  * Note that this function might be called multiple times on the same queue.
284  */
285 void xe_vm_remove_compute_exec_queue(struct xe_vm *vm, struct xe_exec_queue *q)
286 {
287 	if (!xe_vm_in_preempt_fence_mode(vm))
288 		return;
289 
290 	down_write(&vm->lock);
291 	if (!list_empty(&q->lr.link)) {
292 		list_del_init(&q->lr.link);
293 		--vm->preempt.num_exec_queues;
294 	}
295 	if (q->lr.pfence) {
296 		dma_fence_enable_sw_signaling(q->lr.pfence);
297 		dma_fence_put(q->lr.pfence);
298 		q->lr.pfence = NULL;
299 	}
300 	up_write(&vm->lock);
301 }
302 
303 /**
304  * __xe_vm_userptr_needs_repin() - Check whether the VM does have userptrs
305  * that need repinning.
306  * @vm: The VM.
307  *
308  * This function checks for whether the VM has userptrs that need repinning,
309  * and provides a release-type barrier on the userptr.notifier_lock after
310  * checking.
311  *
312  * Return: 0 if there are no userptrs needing repinning, -EAGAIN if there are.
313  */
314 int __xe_vm_userptr_needs_repin(struct xe_vm *vm)
315 {
316 	lockdep_assert_held_read(&vm->userptr.notifier_lock);
317 
318 	return (list_empty(&vm->userptr.repin_list) &&
319 		list_empty(&vm->userptr.invalidated)) ? 0 : -EAGAIN;
320 }
321 
322 #define XE_VM_REBIND_RETRY_TIMEOUT_MS 1000
323 
324 /**
325  * xe_vm_kill() - VM Kill
326  * @vm: The VM.
327  * @unlocked: Flag indicating the VM's dma-resv is not held
328  *
329  * Kill the VM by setting the banned flag, indicating it is no longer usable.
330  * If in preempt fence mode, also kill all exec queues attached to the VM.
331  */
332 void xe_vm_kill(struct xe_vm *vm, bool unlocked)
333 {
334 	struct xe_exec_queue *q;
335 
336 	lockdep_assert_held(&vm->lock);
337 
338 	if (unlocked)
339 		xe_vm_lock(vm, false);
340 
341 	vm->flags |= XE_VM_FLAG_BANNED;
342 	trace_xe_vm_kill(vm);
343 
344 	list_for_each_entry(q, &vm->preempt.exec_queues, lr.link)
345 		q->ops->kill(q);
346 
347 	if (unlocked)
348 		xe_vm_unlock(vm);
349 
350 	/* TODO: Inform user the VM is banned */
351 }
352 
353 /**
354  * xe_vm_validate_should_retry() - Whether to retry after a validate error.
355  * @exec: The drm_exec object used for locking before validation.
356  * @err: The error returned from ttm_bo_validate().
357  * @end: A ktime_t cookie that should be set to 0 before first use and
358  * that should be reused on subsequent calls.
359  *
360  * With multiple active VMs, under memory pressure, it is possible that
361  * ttm_bo_validate() runs into -EDEADLK and in that case returns -ENOMEM.
362  * Until ttm properly handles locking in such scenarios, the best the
363  * driver can do is retry with a timeout. Check if that is necessary, and
364  * if so unlock the drm_exec's objects while keeping the ticket to prepare
365  * for a rerun.
366  *
367  * Return: true if a retry after drm_exec_init() is recommended;
368  * false otherwise.
369  */
370 bool xe_vm_validate_should_retry(struct drm_exec *exec, int err, ktime_t *end)
371 {
372 	ktime_t cur;
373 
374 	if (err != -ENOMEM)
375 		return false;
376 
377 	cur = ktime_get();
378 	*end = *end ? : ktime_add_ms(cur, XE_VM_REBIND_RETRY_TIMEOUT_MS);
379 	if (!ktime_before(cur, *end))
380 		return false;
381 
382 	msleep(20);
383 	return true;
384 }
385 
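/*
 * GPUVM validation callback: queue every VMA of an evicted BO for rebind
 * and revalidate the BO itself, clearing its evicted state on success.
 */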
386 static int xe_gpuvm_validate(struct drm_gpuvm_bo *vm_bo, struct drm_exec *exec)
387 {
388 	struct xe_vm *vm = gpuvm_to_vm(vm_bo->vm);
389 	struct drm_gpuva *gpuva;
390 	int ret;
391 
392 	lockdep_assert_held(&vm->lock);
393 	drm_gpuvm_bo_for_each_va(gpuva, vm_bo)
394 		list_move_tail(&gpuva_to_vma(gpuva)->combined_links.rebind,
395 			       &vm->rebind_list);
396 
397 	ret = xe_bo_validate(gem_to_xe_bo(vm_bo->obj), vm, false);
398 	if (ret)
399 		return ret;
400 
401 	vm_bo->evicted = false;
402 	return 0;
403 }
404 
405 /**
406  * xe_vm_validate_rebind() - Validate buffer objects and rebind vmas
407  * @vm: The vm for which we are rebinding.
408  * @exec: The struct drm_exec with the locked GEM objects.
409  * @num_fences: The number of fences to reserve for the operation, not
410  * including rebinds and validations.
411  *
412  * Validates all evicted gem objects and rebinds their vmas. Note that
413  * rebindings may cause evictions and hence the validation-rebind
414  * sequence is rerun until there are no more objects to validate.
415  *
416  * Return: 0 on success, negative error code on error. In particular,
417  * may return -EINTR or -ERESTARTSYS if interrupted, and -EDEADLK if
418  * the drm_exec transaction needs to be restarted.
419  */
420 int xe_vm_validate_rebind(struct xe_vm *vm, struct drm_exec *exec,
421 			  unsigned int num_fences)
422 {
423 	struct drm_gem_object *obj;
424 	unsigned long index;
425 	int ret;
426 
427 	do {
428 		ret = drm_gpuvm_validate(&vm->gpuvm, exec);
429 		if (ret)
430 			return ret;
431 
432 		ret = xe_vm_rebind(vm, false);
433 		if (ret)
434 			return ret;
435 	} while (!list_empty(&vm->gpuvm.evict.list));
436 
437 	drm_exec_for_each_locked_object(exec, index, obj) {
438 		ret = dma_resv_reserve_fences(obj->resv, num_fences);
439 		if (ret)
440 			return ret;
441 	}
442 
443 	return 0;
444 }
445 
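/*
 * Locking/preparation step of the rebind worker: lock the VM resv and all
 * external objects, bail out early (*done) if the VM is idle or no preempt
 * fences are waiting, otherwise wait for the existing preempt fences and
 * validate/rebind anything that was evicted, reserving fence slots for the
 * new preempt fences installed at the end of the worker.
 */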
446 static int xe_preempt_work_begin(struct drm_exec *exec, struct xe_vm *vm,
447 				 bool *done)
448 {
449 	int err;
450 
451 	err = drm_gpuvm_prepare_vm(&vm->gpuvm, exec, 0);
452 	if (err)
453 		return err;
454 
455 	if (xe_vm_is_idle(vm)) {
456 		vm->preempt.rebind_deactivated = true;
457 		*done = true;
458 		return 0;
459 	}
460 
461 	if (!preempt_fences_waiting(vm)) {
462 		*done = true;
463 		return 0;
464 	}
465 
466 	err = drm_gpuvm_prepare_objects(&vm->gpuvm, exec, 0);
467 	if (err)
468 		return err;
469 
470 	err = wait_for_existing_preempt_fences(vm);
471 	if (err)
472 		return err;
473 
474 	/*
475 	 * Add validation and rebinding to the locking loop since both can
476 	 * cause evictions which may require blocking dma_resv locks.
477 	 * The fence reservation here is intended for the new preempt fences
478 	 * we attach at the end of the rebind work.
479 	 */
480 	return xe_vm_validate_rebind(vm, exec, vm->preempt.num_exec_queues);
481 }
482 
483 static void preempt_rebind_work_func(struct work_struct *w)
484 {
485 	struct xe_vm *vm = container_of(w, struct xe_vm, preempt.rebind_work);
486 	struct drm_exec exec;
487 	unsigned int fence_count = 0;
488 	LIST_HEAD(preempt_fences);
489 	ktime_t end = 0;
490 	int err = 0;
491 	long wait;
492 	int __maybe_unused tries = 0;
493 
494 	xe_assert(vm->xe, xe_vm_in_preempt_fence_mode(vm));
495 	trace_xe_vm_rebind_worker_enter(vm);
496 
497 	down_write(&vm->lock);
498 
499 	if (xe_vm_is_closed_or_banned(vm)) {
500 		up_write(&vm->lock);
501 		trace_xe_vm_rebind_worker_exit(vm);
502 		return;
503 	}
504 
505 retry:
506 	if (xe_vm_userptr_check_repin(vm)) {
507 		err = xe_vm_userptr_pin(vm);
508 		if (err)
509 			goto out_unlock_outer;
510 	}
511 
512 	drm_exec_init(&exec, DRM_EXEC_INTERRUPTIBLE_WAIT, 0);
513 
514 	drm_exec_until_all_locked(&exec) {
515 		bool done = false;
516 
517 		err = xe_preempt_work_begin(&exec, vm, &done);
518 		drm_exec_retry_on_contention(&exec);
519 		if (err || done) {
520 			drm_exec_fini(&exec);
521 			if (err && xe_vm_validate_should_retry(&exec, err, &end))
522 				err = -EAGAIN;
523 
524 			goto out_unlock_outer;
525 		}
526 	}
527 
528 	err = alloc_preempt_fences(vm, &preempt_fences, &fence_count);
529 	if (err)
530 		goto out_unlock;
531 
532 	err = xe_vm_rebind(vm, true);
533 	if (err)
534 		goto out_unlock;
535 
536 	/* Wait on rebinds and munmap style VM unbinds */
537 	wait = dma_resv_wait_timeout(xe_vm_resv(vm),
538 				     DMA_RESV_USAGE_KERNEL,
539 				     false, MAX_SCHEDULE_TIMEOUT);
540 	if (wait <= 0) {
541 		err = -ETIME;
542 		goto out_unlock;
543 	}
544 
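/*
 * With userptr invalidation injection enabled, force a retry on the first
 * pass through the worker in addition to the normal repin check.
 */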
545 #define retry_required(__tries, __vm) \
546 	(IS_ENABLED(CONFIG_DRM_XE_USERPTR_INVAL_INJECT) ? \
547 	(!(__tries)++ || __xe_vm_userptr_needs_repin(__vm)) : \
548 	__xe_vm_userptr_needs_repin(__vm))
549 
550 	down_read(&vm->userptr.notifier_lock);
551 	if (retry_required(tries, vm)) {
552 		up_read(&vm->userptr.notifier_lock);
553 		err = -EAGAIN;
554 		goto out_unlock;
555 	}
556 
557 #undef retry_required
558 
559 	spin_lock(&vm->xe->ttm.lru_lock);
560 	ttm_lru_bulk_move_tail(&vm->lru_bulk_move);
561 	spin_unlock(&vm->xe->ttm.lru_lock);
562 
563 	/* Point of no return. */
564 	arm_preempt_fences(vm, &preempt_fences);
565 	resume_and_reinstall_preempt_fences(vm, &exec);
566 	up_read(&vm->userptr.notifier_lock);
567 
568 out_unlock:
569 	drm_exec_fini(&exec);
570 out_unlock_outer:
571 	if (err == -EAGAIN) {
572 		trace_xe_vm_rebind_worker_retry(vm);
573 		goto retry;
574 	}
575 
576 	if (err) {
577 		drm_warn(&vm->xe->drm, "VM worker error: %d\n", err);
578 		xe_vm_kill(vm, true);
579 	}
580 	up_write(&vm->lock);
581 
582 	free_preempt_fences(&preempt_fences);
583 
584 	trace_xe_vm_rebind_worker_exit(vm);
585 }
586 
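/*
 * Common userptr invalidation path: outside fault mode, queue the userptr
 * for repin; force the preempt fences to signal and wait for pending GPU
 * work on the VM; in fault mode, zap the GPU mappings if the userptr was
 * ever bound; finally unmap the HMM range.
 */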
587 static void __vma_userptr_invalidate(struct xe_vm *vm, struct xe_userptr_vma *uvma)
588 {
589 	struct xe_userptr *userptr = &uvma->userptr;
590 	struct xe_vma *vma = &uvma->vma;
591 	struct dma_resv_iter cursor;
592 	struct dma_fence *fence;
593 	long err;
594 
595 	/*
596 	 * Tell exec and rebind worker they need to repin and rebind this
597 	 * userptr.
598 	 */
599 	if (!xe_vm_in_fault_mode(vm) &&
600 	    !(vma->gpuva.flags & XE_VMA_DESTROYED)) {
601 		spin_lock(&vm->userptr.invalidated_lock);
602 		list_move_tail(&userptr->invalidate_link,
603 			       &vm->userptr.invalidated);
604 		spin_unlock(&vm->userptr.invalidated_lock);
605 	}
606 
607 	/*
608 	 * Preempt fences turn into schedule disables, pipeline these.
609 	 * Note that even in fault mode, we need to wait for binds and
610 	 * unbinds to complete, and those are attached as BOOKKEEP fences
611 	 * to the vm.
612 	 */
613 	dma_resv_iter_begin(&cursor, xe_vm_resv(vm),
614 			    DMA_RESV_USAGE_BOOKKEEP);
615 	dma_resv_for_each_fence_unlocked(&cursor, fence)
616 		dma_fence_enable_sw_signaling(fence);
617 	dma_resv_iter_end(&cursor);
618 
619 	err = dma_resv_wait_timeout(xe_vm_resv(vm),
620 				    DMA_RESV_USAGE_BOOKKEEP,
621 				    false, MAX_SCHEDULE_TIMEOUT);
622 	XE_WARN_ON(err <= 0);
623 
624 	if (xe_vm_in_fault_mode(vm) && userptr->initial_bind) {
625 		err = xe_vm_invalidate_vma(vma);
626 		XE_WARN_ON(err);
627 	}
628 
629 	xe_hmm_userptr_unmap(uvma);
630 }
631 
632 static bool vma_userptr_invalidate(struct mmu_interval_notifier *mni,
633 				   const struct mmu_notifier_range *range,
634 				   unsigned long cur_seq)
635 {
636 	struct xe_userptr_vma *uvma = container_of(mni, typeof(*uvma), userptr.notifier);
637 	struct xe_vma *vma = &uvma->vma;
638 	struct xe_vm *vm = xe_vma_vm(vma);
639 
640 	xe_assert(vm->xe, xe_vma_is_userptr(vma));
641 	trace_xe_vma_userptr_invalidate(vma);
642 
643 	if (!mmu_notifier_range_blockable(range))
644 		return false;
645 
646 	vm_dbg(&xe_vma_vm(vma)->xe->drm,
647 	       "NOTIFIER: addr=0x%016llx, range=0x%016llx",
648 	       xe_vma_start(vma), xe_vma_size(vma));
649 
650 	down_write(&vm->userptr.notifier_lock);
651 	mmu_interval_set_seq(mni, cur_seq);
652 
653 	__vma_userptr_invalidate(vm, uvma);
654 	up_write(&vm->userptr.notifier_lock);
655 	trace_xe_vma_userptr_invalidate_complete(vma);
656 
657 	return true;
658 }
659 
660 static const struct mmu_interval_notifier_ops vma_userptr_notifier_ops = {
661 	.invalidate = vma_userptr_invalidate,
662 };
663 
664 #if IS_ENABLED(CONFIG_DRM_XE_USERPTR_INVAL_INJECT)
665 /**
666  * xe_vma_userptr_force_invalidate() - force invalidate a userptr
667  * @uvma: The userptr vma to invalidate
668  *
669  * Perform a forced userptr invalidation for testing purposes.
670  */
671 void xe_vma_userptr_force_invalidate(struct xe_userptr_vma *uvma)
672 {
673 	struct xe_vm *vm = xe_vma_vm(&uvma->vma);
674 
675 	/* Protect against concurrent userptr pinning */
676 	lockdep_assert_held(&vm->lock);
677 	/* Protect against concurrent notifiers */
678 	lockdep_assert_held(&vm->userptr.notifier_lock);
679 	/*
680 	 * Protect against concurrent instances of this function and
681 	 * the critical exec sections
682 	 */
683 	xe_vm_assert_held(vm);
684 
685 	if (!mmu_interval_read_retry(&uvma->userptr.notifier,
686 				     uvma->userptr.notifier_seq))
687 		uvma->userptr.notifier_seq -= 2;
688 	__vma_userptr_invalidate(vm, uvma);
689 }
690 #endif
691 
692 int xe_vm_userptr_pin(struct xe_vm *vm)
693 {
694 	struct xe_userptr_vma *uvma, *next;
695 	int err = 0;
696 
697 	xe_assert(vm->xe, !xe_vm_in_fault_mode(vm));
698 	lockdep_assert_held_write(&vm->lock);
699 
700 	/* Collect invalidated userptrs */
701 	spin_lock(&vm->userptr.invalidated_lock);
702 	xe_assert(vm->xe, list_empty(&vm->userptr.repin_list));
703 	list_for_each_entry_safe(uvma, next, &vm->userptr.invalidated,
704 				 userptr.invalidate_link) {
705 		list_del_init(&uvma->userptr.invalidate_link);
706 		list_add_tail(&uvma->userptr.repin_link,
707 			      &vm->userptr.repin_list);
708 	}
709 	spin_unlock(&vm->userptr.invalidated_lock);
710 
711 	/* Pin and move to bind list */
712 	list_for_each_entry_safe(uvma, next, &vm->userptr.repin_list,
713 				 userptr.repin_link) {
714 		err = xe_vma_userptr_pin_pages(uvma);
715 		if (err == -EFAULT) {
716 			list_del_init(&uvma->userptr.repin_link);
717 			/*
718 			 * We might have already done the pin once, but then
719 			 * had to retry before the rebind happened, due to
720 			 * some other condition in the caller. In the meantime
721 			 * the userptr got invalidated by the notifier, so we
722 			 * need to revalidate here, but this time we hit
723 			 * -EFAULT. In such a case make sure we remove
724 			 * ourselves from the rebind list to avoid going down
725 			 * in flames.
726 			 */
727 			if (!list_empty(&uvma->vma.combined_links.rebind))
728 				list_del_init(&uvma->vma.combined_links.rebind);
729 
730 			/* Wait for pending binds */
731 			xe_vm_lock(vm, false);
732 			dma_resv_wait_timeout(xe_vm_resv(vm),
733 					      DMA_RESV_USAGE_BOOKKEEP,
734 					      false, MAX_SCHEDULE_TIMEOUT);
735 
736 			down_read(&vm->userptr.notifier_lock);
737 			err = xe_vm_invalidate_vma(&uvma->vma);
738 			up_read(&vm->userptr.notifier_lock);
739 			xe_vm_unlock(vm);
740 			if (err)
741 				break;
742 		} else {
743 			if (err)
744 				break;
745 
746 			list_del_init(&uvma->userptr.repin_link);
747 			list_move_tail(&uvma->vma.combined_links.rebind,
748 				       &vm->rebind_list);
749 		}
750 	}
751 
752 	if (err) {
753 		down_write(&vm->userptr.notifier_lock);
754 		spin_lock(&vm->userptr.invalidated_lock);
755 		list_for_each_entry_safe(uvma, next, &vm->userptr.repin_list,
756 					 userptr.repin_link) {
757 			list_del_init(&uvma->userptr.repin_link);
758 			list_move_tail(&uvma->userptr.invalidate_link,
759 				       &vm->userptr.invalidated);
760 		}
761 		spin_unlock(&vm->userptr.invalidated_lock);
762 		up_write(&vm->userptr.notifier_lock);
763 	}
764 	return err;
765 }
766 
767 /**
768  * xe_vm_userptr_check_repin() - Check whether the VM might have userptrs
769  * that need repinning.
770  * @vm: The VM.
771  *
772  * This function does an advisory check for whether the VM has userptrs that
773  * need repinning.
774  *
775  * Return: 0 if there are no indications of userptrs needing repinning,
776  * -EAGAIN if there are.
777  */
778 int xe_vm_userptr_check_repin(struct xe_vm *vm)
779 {
780 	return (list_empty_careful(&vm->userptr.repin_list) &&
781 		list_empty_careful(&vm->userptr.invalidated)) ? 0 : -EAGAIN;
782 }
783 
784 static int xe_vma_ops_alloc(struct xe_vma_ops *vops, bool array_of_binds)
785 {
786 	int i;
787 
788 	for (i = 0; i < XE_MAX_TILES_PER_DEVICE; ++i) {
789 		if (!vops->pt_update_ops[i].num_ops)
790 			continue;
791 
792 		vops->pt_update_ops[i].ops =
793 			kmalloc_array(vops->pt_update_ops[i].num_ops,
794 				      sizeof(*vops->pt_update_ops[i].ops),
795 				      GFP_KERNEL | __GFP_RETRY_MAYFAIL | __GFP_NOWARN);
796 		if (!vops->pt_update_ops[i].ops)
797 			return array_of_binds ? -ENOBUFS : -ENOMEM;
798 	}
799 
800 	return 0;
801 }
802 ALLOW_ERROR_INJECTION(xe_vma_ops_alloc, ERRNO);
803 
804 static void xe_vma_svm_prefetch_op_fini(struct xe_vma_op *op)
805 {
806 	struct xe_vma *vma;
807 
808 	vma = gpuva_to_vma(op->base.prefetch.va);
809 
810 	if (op->base.op == DRM_GPUVA_OP_PREFETCH && xe_vma_is_cpu_addr_mirror(vma))
811 		xa_destroy(&op->prefetch_range.range);
812 }
813 
814 static void xe_vma_svm_prefetch_ops_fini(struct xe_vma_ops *vops)
815 {
816 	struct xe_vma_op *op;
817 
818 	if (!(vops->flags & XE_VMA_OPS_FLAG_HAS_SVM_PREFETCH))
819 		return;
820 
821 	list_for_each_entry(op, &vops->list, link)
822 		xe_vma_svm_prefetch_op_fini(op);
823 }
824 
825 static void xe_vma_ops_fini(struct xe_vma_ops *vops)
826 {
827 	int i;
828 
829 	xe_vma_svm_prefetch_ops_fini(vops);
830 
831 	for (i = 0; i < XE_MAX_TILES_PER_DEVICE; ++i)
832 		kfree(vops->pt_update_ops[i].ops);
833 }
834 
835 static void xe_vma_ops_incr_pt_update_ops(struct xe_vma_ops *vops, u8 tile_mask, int inc_val)
836 {
837 	int i;
838 
839 	if (!inc_val)
840 		return;
841 
842 	for (i = 0; i < XE_MAX_TILES_PER_DEVICE; ++i)
843 		if (BIT(i) & tile_mask)
844 			vops->pt_update_ops[i].num_ops += inc_val;
845 }
846 
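/*
 * Fill in a GPUVA MAP op that re-creates an existing VMA's mapping so the
 * rebind paths can reuse the regular bind machinery.
 */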
847 static void xe_vm_populate_rebind(struct xe_vma_op *op, struct xe_vma *vma,
848 				  u8 tile_mask)
849 {
850 	INIT_LIST_HEAD(&op->link);
851 	op->tile_mask = tile_mask;
852 	op->base.op = DRM_GPUVA_OP_MAP;
853 	op->base.map.va.addr = vma->gpuva.va.addr;
854 	op->base.map.va.range = vma->gpuva.va.range;
855 	op->base.map.gem.obj = vma->gpuva.gem.obj;
856 	op->base.map.gem.offset = vma->gpuva.gem.offset;
857 	op->map.vma = vma;
858 	op->map.immediate = true;
859 	op->map.dumpable = vma->gpuva.flags & XE_VMA_DUMPABLE;
860 	op->map.is_null = xe_vma_is_null(vma);
861 }
862 
863 static int xe_vm_ops_add_rebind(struct xe_vma_ops *vops, struct xe_vma *vma,
864 				u8 tile_mask)
865 {
866 	struct xe_vma_op *op;
867 
868 	op = kzalloc(sizeof(*op), GFP_KERNEL);
869 	if (!op)
870 		return -ENOMEM;
871 
872 	xe_vm_populate_rebind(op, vma, tile_mask);
873 	list_add_tail(&op->link, &vops->list);
874 	xe_vma_ops_incr_pt_update_ops(vops, tile_mask, 1);
875 
876 	return 0;
877 }
878 
879 static struct dma_fence *ops_execute(struct xe_vm *vm,
880 				     struct xe_vma_ops *vops);
881 static void xe_vma_ops_init(struct xe_vma_ops *vops, struct xe_vm *vm,
882 			    struct xe_exec_queue *q,
883 			    struct xe_sync_entry *syncs, u32 num_syncs);
884 
885 int xe_vm_rebind(struct xe_vm *vm, bool rebind_worker)
886 {
887 	struct dma_fence *fence;
888 	struct xe_vma *vma, *next;
889 	struct xe_vma_ops vops;
890 	struct xe_vma_op *op, *next_op;
891 	int err, i;
892 
893 	lockdep_assert_held(&vm->lock);
894 	if ((xe_vm_in_lr_mode(vm) && !rebind_worker) ||
895 	    list_empty(&vm->rebind_list))
896 		return 0;
897 
898 	xe_vma_ops_init(&vops, vm, NULL, NULL, 0);
899 	for (i = 0; i < XE_MAX_TILES_PER_DEVICE; ++i)
900 		vops.pt_update_ops[i].wait_vm_bookkeep = true;
901 
902 	xe_vm_assert_held(vm);
903 	list_for_each_entry(vma, &vm->rebind_list, combined_links.rebind) {
904 		xe_assert(vm->xe, vma->tile_present);
905 
906 		if (rebind_worker)
907 			trace_xe_vma_rebind_worker(vma);
908 		else
909 			trace_xe_vma_rebind_exec(vma);
910 
911 		err = xe_vm_ops_add_rebind(&vops, vma,
912 					   vma->tile_present);
913 		if (err)
914 			goto free_ops;
915 	}
916 
917 	err = xe_vma_ops_alloc(&vops, false);
918 	if (err)
919 		goto free_ops;
920 
921 	fence = ops_execute(vm, &vops);
922 	if (IS_ERR(fence)) {
923 		err = PTR_ERR(fence);
924 	} else {
925 		dma_fence_put(fence);
926 		list_for_each_entry_safe(vma, next, &vm->rebind_list,
927 					 combined_links.rebind)
928 			list_del_init(&vma->combined_links.rebind);
929 	}
930 free_ops:
931 	list_for_each_entry_safe(op, next_op, &vops.list, link) {
932 		list_del(&op->link);
933 		kfree(op);
934 	}
935 	xe_vma_ops_fini(&vops);
936 
937 	return err;
938 }
939 
940 struct dma_fence *xe_vma_rebind(struct xe_vm *vm, struct xe_vma *vma, u8 tile_mask)
941 {
942 	struct dma_fence *fence = NULL;
943 	struct xe_vma_ops vops;
944 	struct xe_vma_op *op, *next_op;
945 	struct xe_tile *tile;
946 	u8 id;
947 	int err;
948 
949 	lockdep_assert_held(&vm->lock);
950 	xe_vm_assert_held(vm);
951 	xe_assert(vm->xe, xe_vm_in_fault_mode(vm));
952 
953 	xe_vma_ops_init(&vops, vm, NULL, NULL, 0);
954 	for_each_tile(tile, vm->xe, id) {
955 		vops.pt_update_ops[id].wait_vm_bookkeep = true;
956 		vops.pt_update_ops[tile->id].q =
957 			xe_migrate_exec_queue(tile->migrate);
958 	}
959 
960 	err = xe_vm_ops_add_rebind(&vops, vma, tile_mask);
961 	if (err)
962 		return ERR_PTR(err);
963 
964 	err = xe_vma_ops_alloc(&vops, false);
965 	if (err) {
966 		fence = ERR_PTR(err);
967 		goto free_ops;
968 	}
969 
970 	fence = ops_execute(vm, &vops);
971 
972 free_ops:
973 	list_for_each_entry_safe(op, next_op, &vops.list, link) {
974 		list_del(&op->link);
975 		kfree(op);
976 	}
977 	xe_vma_ops_fini(&vops);
978 
979 	return fence;
980 }
981 
982 static void xe_vm_populate_range_rebind(struct xe_vma_op *op,
983 					struct xe_vma *vma,
984 					struct xe_svm_range *range,
985 					u8 tile_mask)
986 {
987 	INIT_LIST_HEAD(&op->link);
988 	op->tile_mask = tile_mask;
989 	op->base.op = DRM_GPUVA_OP_DRIVER;
990 	op->subop = XE_VMA_SUBOP_MAP_RANGE;
991 	op->map_range.vma = vma;
992 	op->map_range.range = range;
993 }
994 
995 static int
996 xe_vm_ops_add_range_rebind(struct xe_vma_ops *vops,
997 			   struct xe_vma *vma,
998 			   struct xe_svm_range *range,
999 			   u8 tile_mask)
1000 {
1001 	struct xe_vma_op *op;
1002 
1003 	op = kzalloc(sizeof(*op), GFP_KERNEL);
1004 	if (!op)
1005 		return -ENOMEM;
1006 
1007 	xe_vm_populate_range_rebind(op, vma, range, tile_mask);
1008 	list_add_tail(&op->link, &vops->list);
1009 	xe_vma_ops_incr_pt_update_ops(vops, tile_mask, 1);
1010 
1011 	return 0;
1012 }
1013 
1014 /**
1015  * xe_vm_range_rebind() - VM range (re)bind
1016  * @vm: The VM which the range belongs to.
1017  * @vma: The VMA which the range belongs to.
1018  * @range: SVM range to rebind.
1019  * @tile_mask: Tile mask to bind the range to.
1020  *
1021  * (re)bind SVM range setting up GPU page tables for the range.
1022  *
1023  * Return: dma fence for rebind to signal completion on success, ERR_PTR on
1024  * failure
1025  */
1026 struct dma_fence *xe_vm_range_rebind(struct xe_vm *vm,
1027 				     struct xe_vma *vma,
1028 				     struct xe_svm_range *range,
1029 				     u8 tile_mask)
1030 {
1031 	struct dma_fence *fence = NULL;
1032 	struct xe_vma_ops vops;
1033 	struct xe_vma_op *op, *next_op;
1034 	struct xe_tile *tile;
1035 	u8 id;
1036 	int err;
1037 
1038 	lockdep_assert_held(&vm->lock);
1039 	xe_vm_assert_held(vm);
1040 	xe_assert(vm->xe, xe_vm_in_fault_mode(vm));
1041 	xe_assert(vm->xe, xe_vma_is_cpu_addr_mirror(vma));
1042 
1043 	xe_vma_ops_init(&vops, vm, NULL, NULL, 0);
1044 	for_each_tile(tile, vm->xe, id) {
1045 		vops.pt_update_ops[id].wait_vm_bookkeep = true;
1046 		vops.pt_update_ops[tile->id].q =
1047 			xe_migrate_exec_queue(tile->migrate);
1048 	}
1049 
1050 	err = xe_vm_ops_add_range_rebind(&vops, vma, range, tile_mask);
1051 	if (err)
1052 		return ERR_PTR(err);
1053 
1054 	err = xe_vma_ops_alloc(&vops, false);
1055 	if (err) {
1056 		fence = ERR_PTR(err);
1057 		goto free_ops;
1058 	}
1059 
1060 	fence = ops_execute(vm, &vops);
1061 
1062 free_ops:
1063 	list_for_each_entry_safe(op, next_op, &vops.list, link) {
1064 		list_del(&op->link);
1065 		kfree(op);
1066 	}
1067 	xe_vma_ops_fini(&vops);
1068 
1069 	return fence;
1070 }
1071 
1072 static void xe_vm_populate_range_unbind(struct xe_vma_op *op,
1073 					struct xe_svm_range *range)
1074 {
1075 	INIT_LIST_HEAD(&op->link);
1076 	op->tile_mask = range->tile_present;
1077 	op->base.op = DRM_GPUVA_OP_DRIVER;
1078 	op->subop = XE_VMA_SUBOP_UNMAP_RANGE;
1079 	op->unmap_range.range = range;
1080 }
1081 
1082 static int
1083 xe_vm_ops_add_range_unbind(struct xe_vma_ops *vops,
1084 			   struct xe_svm_range *range)
1085 {
1086 	struct xe_vma_op *op;
1087 
1088 	op = kzalloc(sizeof(*op), GFP_KERNEL);
1089 	if (!op)
1090 		return -ENOMEM;
1091 
1092 	xe_vm_populate_range_unbind(op, range);
1093 	list_add_tail(&op->link, &vops->list);
1094 	xe_vma_ops_incr_pt_update_ops(vops, range->tile_present, 1);
1095 
1096 	return 0;
1097 }
1098 
1099 /**
1100  * xe_vm_range_unbind() - VM range unbind
1101  * @vm: The VM which the range belongs to.
1102  * @range: SVM range to rebind.
1103  *
1104  * Unbind SVM range removing the GPU page tables for the range.
1105  *
1106  * Return: dma fence for unbind to signal completion on success, ERR_PTR on
1107  * failure
1108  */
1109 struct dma_fence *xe_vm_range_unbind(struct xe_vm *vm,
1110 				     struct xe_svm_range *range)
1111 {
1112 	struct dma_fence *fence = NULL;
1113 	struct xe_vma_ops vops;
1114 	struct xe_vma_op *op, *next_op;
1115 	struct xe_tile *tile;
1116 	u8 id;
1117 	int err;
1118 
1119 	lockdep_assert_held(&vm->lock);
1120 	xe_vm_assert_held(vm);
1121 	xe_assert(vm->xe, xe_vm_in_fault_mode(vm));
1122 
1123 	if (!range->tile_present)
1124 		return dma_fence_get_stub();
1125 
1126 	xe_vma_ops_init(&vops, vm, NULL, NULL, 0);
1127 	for_each_tile(tile, vm->xe, id) {
1128 		vops.pt_update_ops[id].wait_vm_bookkeep = true;
1129 		vops.pt_update_ops[tile->id].q =
1130 			xe_migrate_exec_queue(tile->migrate);
1131 	}
1132 
1133 	err = xe_vm_ops_add_range_unbind(&vops, range);
1134 	if (err)
1135 		return ERR_PTR(err);
1136 
1137 	err = xe_vma_ops_alloc(&vops, false);
1138 	if (err) {
1139 		fence = ERR_PTR(err);
1140 		goto free_ops;
1141 	}
1142 
1143 	fence = ops_execute(vm, &vops);
1144 
1145 free_ops:
1146 	list_for_each_entry_safe(op, next_op, &vops.list, link) {
1147 		list_del(&op->link);
1148 		kfree(op);
1149 	}
1150 	xe_vma_ops_fini(&vops);
1151 
1152 	return fence;
1153 }
1154 
1155 static void xe_vma_free(struct xe_vma *vma)
1156 {
1157 	if (xe_vma_is_userptr(vma))
1158 		kfree(to_userptr_vma(vma));
1159 	else
1160 		kfree(vma);
1161 }
1162 
1163 #define VMA_CREATE_FLAG_READ_ONLY		BIT(0)
1164 #define VMA_CREATE_FLAG_IS_NULL			BIT(1)
1165 #define VMA_CREATE_FLAG_DUMPABLE		BIT(2)
1166 #define VMA_CREATE_FLAG_IS_SYSTEM_ALLOCATOR	BIT(3)
1167 
1168 static struct xe_vma *xe_vma_create(struct xe_vm *vm,
1169 				    struct xe_bo *bo,
1170 				    u64 bo_offset_or_userptr,
1171 				    u64 start, u64 end,
1172 				    struct xe_vma_mem_attr *attr,
1173 				    unsigned int flags)
1174 {
1175 	struct xe_vma *vma;
1176 	struct xe_tile *tile;
1177 	u8 id;
1178 	bool read_only = (flags & VMA_CREATE_FLAG_READ_ONLY);
1179 	bool is_null = (flags & VMA_CREATE_FLAG_IS_NULL);
1180 	bool dumpable = (flags & VMA_CREATE_FLAG_DUMPABLE);
1181 	bool is_cpu_addr_mirror =
1182 		(flags & VMA_CREATE_FLAG_IS_SYSTEM_ALLOCATOR);
1183 
1184 	xe_assert(vm->xe, start < end);
1185 	xe_assert(vm->xe, end < vm->size);
1186 
1187 	/*
1188 	 * Allocate and ensure that the xe_vma_is_userptr() return
1189 	 * matches what was allocated.
1190 	 */
1191 	if (!bo && !is_null && !is_cpu_addr_mirror) {
1192 		struct xe_userptr_vma *uvma = kzalloc(sizeof(*uvma), GFP_KERNEL);
1193 
1194 		if (!uvma)
1195 			return ERR_PTR(-ENOMEM);
1196 
1197 		vma = &uvma->vma;
1198 	} else {
1199 		vma = kzalloc(sizeof(*vma), GFP_KERNEL);
1200 		if (!vma)
1201 			return ERR_PTR(-ENOMEM);
1202 
1203 		if (is_cpu_addr_mirror)
1204 			vma->gpuva.flags |= XE_VMA_SYSTEM_ALLOCATOR;
1205 		if (is_null)
1206 			vma->gpuva.flags |= DRM_GPUVA_SPARSE;
1207 		if (bo)
1208 			vma->gpuva.gem.obj = &bo->ttm.base;
1209 	}
1210 
1211 	INIT_LIST_HEAD(&vma->combined_links.rebind);
1212 
1213 	INIT_LIST_HEAD(&vma->gpuva.gem.entry);
1214 	vma->gpuva.vm = &vm->gpuvm;
1215 	vma->gpuva.va.addr = start;
1216 	vma->gpuva.va.range = end - start + 1;
1217 	if (read_only)
1218 		vma->gpuva.flags |= XE_VMA_READ_ONLY;
1219 	if (dumpable)
1220 		vma->gpuva.flags |= XE_VMA_DUMPABLE;
1221 
1222 	for_each_tile(tile, vm->xe, id)
1223 		vma->tile_mask |= 0x1 << id;
1224 
1225 	if (vm->xe->info.has_atomic_enable_pte_bit)
1226 		vma->gpuva.flags |= XE_VMA_ATOMIC_PTE_BIT;
1227 
1228 	vma->attr = *attr;
1229 
1230 	if (bo) {
1231 		struct drm_gpuvm_bo *vm_bo;
1232 
1233 		xe_bo_assert_held(bo);
1234 
1235 		vm_bo = drm_gpuvm_bo_obtain(vma->gpuva.vm, &bo->ttm.base);
1236 		if (IS_ERR(vm_bo)) {
1237 			xe_vma_free(vma);
1238 			return ERR_CAST(vm_bo);
1239 		}
1240 
1241 		drm_gpuvm_bo_extobj_add(vm_bo);
1242 		drm_gem_object_get(&bo->ttm.base);
1243 		vma->gpuva.gem.offset = bo_offset_or_userptr;
1244 		drm_gpuva_link(&vma->gpuva, vm_bo);
1245 		drm_gpuvm_bo_put(vm_bo);
1246 	} else /* userptr or null */ {
1247 		if (!is_null && !is_cpu_addr_mirror) {
1248 			struct xe_userptr *userptr = &to_userptr_vma(vma)->userptr;
1249 			u64 size = end - start + 1;
1250 			int err;
1251 
1252 			INIT_LIST_HEAD(&userptr->invalidate_link);
1253 			INIT_LIST_HEAD(&userptr->repin_link);
1254 			vma->gpuva.gem.offset = bo_offset_or_userptr;
1255 			mutex_init(&userptr->unmap_mutex);
1256 
1257 			err = mmu_interval_notifier_insert(&userptr->notifier,
1258 							   current->mm,
1259 							   xe_vma_userptr(vma), size,
1260 							   &vma_userptr_notifier_ops);
1261 			if (err) {
1262 				xe_vma_free(vma);
1263 				return ERR_PTR(err);
1264 			}
1265 
1266 			userptr->notifier_seq = LONG_MAX;
1267 		}
1268 
1269 		xe_vm_get(vm);
1270 	}
1271 
1272 	return vma;
1273 }
1274 
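/*
 * Final, sleepable stage of VMA destruction: drop the user fence, remove
 * the userptr notifier or drop the BO reference, release the VM reference
 * where one was taken, and free the VMA.
 */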
1275 static void xe_vma_destroy_late(struct xe_vma *vma)
1276 {
1277 	struct xe_vm *vm = xe_vma_vm(vma);
1278 
1279 	if (vma->ufence) {
1280 		xe_sync_ufence_put(vma->ufence);
1281 		vma->ufence = NULL;
1282 	}
1283 
1284 	if (xe_vma_is_userptr(vma)) {
1285 		struct xe_userptr_vma *uvma = to_userptr_vma(vma);
1286 		struct xe_userptr *userptr = &uvma->userptr;
1287 
1288 		if (userptr->sg)
1289 			xe_hmm_userptr_free_sg(uvma);
1290 
1291 		/*
1292 		 * Since userptr pages are not pinned, we can't remove
1293 		 * the notifier until we're sure the GPU is not accessing
1294 		 * them anymore
1295 		 */
1296 		mmu_interval_notifier_remove(&userptr->notifier);
1297 		mutex_destroy(&userptr->unmap_mutex);
1298 		xe_vm_put(vm);
1299 	} else if (xe_vma_is_null(vma) || xe_vma_is_cpu_addr_mirror(vma)) {
1300 		xe_vm_put(vm);
1301 	} else {
1302 		xe_bo_put(xe_vma_bo(vma));
1303 	}
1304 
1305 	xe_vma_free(vma);
1306 }
1307 
1308 static void vma_destroy_work_func(struct work_struct *w)
1309 {
1310 	struct xe_vma *vma =
1311 		container_of(w, struct xe_vma, destroy_work);
1312 
1313 	xe_vma_destroy_late(vma);
1314 }
1315 
1316 static void vma_destroy_cb(struct dma_fence *fence,
1317 			   struct dma_fence_cb *cb)
1318 {
1319 	struct xe_vma *vma = container_of(cb, struct xe_vma, destroy_cb);
1320 
1321 	INIT_WORK(&vma->destroy_work, vma_destroy_work_func);
1322 	queue_work(system_unbound_wq, &vma->destroy_work);
1323 }
1324 
1325 static void xe_vma_destroy(struct xe_vma *vma, struct dma_fence *fence)
1326 {
1327 	struct xe_vm *vm = xe_vma_vm(vma);
1328 
1329 	lockdep_assert_held_write(&vm->lock);
1330 	xe_assert(vm->xe, list_empty(&vma->combined_links.destroy));
1331 
1332 	if (xe_vma_is_userptr(vma)) {
1333 		xe_assert(vm->xe, vma->gpuva.flags & XE_VMA_DESTROYED);
1334 
1335 		spin_lock(&vm->userptr.invalidated_lock);
1336 		xe_assert(vm->xe, list_empty(&to_userptr_vma(vma)->userptr.repin_link));
1337 		list_del(&to_userptr_vma(vma)->userptr.invalidate_link);
1338 		spin_unlock(&vm->userptr.invalidated_lock);
1339 	} else if (!xe_vma_is_null(vma) && !xe_vma_is_cpu_addr_mirror(vma)) {
1340 		xe_bo_assert_held(xe_vma_bo(vma));
1341 
1342 		drm_gpuva_unlink(&vma->gpuva);
1343 	}
1344 
1345 	xe_vm_assert_held(vm);
1346 	if (fence) {
1347 		int ret = dma_fence_add_callback(fence, &vma->destroy_cb,
1348 						 vma_destroy_cb);
1349 
1350 		if (ret) {
1351 			XE_WARN_ON(ret != -ENOENT);
1352 			xe_vma_destroy_late(vma);
1353 		}
1354 	} else {
1355 		xe_vma_destroy_late(vma);
1356 	}
1357 }
1358 
1359 /**
1360  * xe_vm_lock_vma() - drm_exec utility to lock a vma
1361  * @exec: The drm_exec object we're currently locking for.
1362  * @vma: The vma for which we want to lock the vm resv and any attached
1363  * object's resv.
1364  *
1365  * Return: 0 on success, negative error code on error. In particular
1366  * may return -EDEADLK on WW transaction contention and -EINTR if
1367  * an interruptible wait is terminated by a signal.
1368  */
1369 int xe_vm_lock_vma(struct drm_exec *exec, struct xe_vma *vma)
1370 {
1371 	struct xe_vm *vm = xe_vma_vm(vma);
1372 	struct xe_bo *bo = xe_vma_bo(vma);
1373 	int err;
1374 
1375 	XE_WARN_ON(!vm);
1376 
1377 	err = drm_exec_lock_obj(exec, xe_vm_obj(vm));
1378 	if (!err && bo && !bo->vm)
1379 		err = drm_exec_lock_obj(exec, &bo->ttm.base);
1380 
1381 	return err;
1382 }
1383 
1384 static void xe_vma_destroy_unlocked(struct xe_vma *vma)
1385 {
1386 	struct drm_exec exec;
1387 	int err;
1388 
1389 	drm_exec_init(&exec, 0, 0);
1390 	drm_exec_until_all_locked(&exec) {
1391 		err = xe_vm_lock_vma(&exec, vma);
1392 		drm_exec_retry_on_contention(&exec);
1393 		if (XE_WARN_ON(err))
1394 			break;
1395 	}
1396 
1397 	xe_vma_destroy(vma, NULL);
1398 
1399 	drm_exec_fini(&exec);
1400 }
1401 
1402 struct xe_vma *
1403 xe_vm_find_overlapping_vma(struct xe_vm *vm, u64 start, u64 range)
1404 {
1405 	struct drm_gpuva *gpuva;
1406 
1407 	lockdep_assert_held(&vm->lock);
1408 
1409 	if (xe_vm_is_closed_or_banned(vm))
1410 		return NULL;
1411 
1412 	xe_assert(vm->xe, start + range <= vm->size);
1413 
1414 	gpuva = drm_gpuva_find_first(&vm->gpuvm, start, range);
1415 
1416 	return gpuva ? gpuva_to_vma(gpuva) : NULL;
1417 }
1418 
1419 static int xe_vm_insert_vma(struct xe_vm *vm, struct xe_vma *vma)
1420 {
1421 	int err;
1422 
1423 	xe_assert(vm->xe, xe_vma_vm(vma) == vm);
1424 	lockdep_assert_held(&vm->lock);
1425 
1426 	mutex_lock(&vm->snap_mutex);
1427 	err = drm_gpuva_insert(&vm->gpuvm, &vma->gpuva);
1428 	mutex_unlock(&vm->snap_mutex);
1429 	XE_WARN_ON(err);	/* Shouldn't be possible */
1430 
1431 	return err;
1432 }
1433 
1434 static void xe_vm_remove_vma(struct xe_vm *vm, struct xe_vma *vma)
1435 {
1436 	xe_assert(vm->xe, xe_vma_vm(vma) == vm);
1437 	lockdep_assert_held(&vm->lock);
1438 
1439 	mutex_lock(&vm->snap_mutex);
1440 	drm_gpuva_remove(&vma->gpuva);
1441 	mutex_unlock(&vm->snap_mutex);
1442 	if (vm->usm.last_fault_vma == vma)
1443 		vm->usm.last_fault_vma = NULL;
1444 }
1445 
1446 static struct drm_gpuva_op *xe_vm_op_alloc(void)
1447 {
1448 	struct xe_vma_op *op;
1449 
1450 	op = kzalloc(sizeof(*op), GFP_KERNEL);
1451 
1452 	if (unlikely(!op))
1453 		return NULL;
1454 
1455 	return &op->base;
1456 }
1457 
1458 static void xe_vm_free(struct drm_gpuvm *gpuvm);
1459 
1460 static const struct drm_gpuvm_ops gpuvm_ops = {
1461 	.op_alloc = xe_vm_op_alloc,
1462 	.vm_bo_validate = xe_gpuvm_validate,
1463 	.vm_free = xe_vm_free,
1464 };
1465 
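/*
 * PAT-index encoding helpers for the XeLP+ page-table formats: directory
 * entries only carry two PAT selection bits, while leaf PTEs can use up
 * to five.
 */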
1466 static u64 pde_encode_pat_index(u16 pat_index)
1467 {
1468 	u64 pte = 0;
1469 
1470 	if (pat_index & BIT(0))
1471 		pte |= XE_PPGTT_PTE_PAT0;
1472 
1473 	if (pat_index & BIT(1))
1474 		pte |= XE_PPGTT_PTE_PAT1;
1475 
1476 	return pte;
1477 }
1478 
1479 static u64 pte_encode_pat_index(u16 pat_index, u32 pt_level)
1480 {
1481 	u64 pte = 0;
1482 
1483 	if (pat_index & BIT(0))
1484 		pte |= XE_PPGTT_PTE_PAT0;
1485 
1486 	if (pat_index & BIT(1))
1487 		pte |= XE_PPGTT_PTE_PAT1;
1488 
1489 	if (pat_index & BIT(2)) {
1490 		if (pt_level)
1491 			pte |= XE_PPGTT_PDE_PDPE_PAT2;
1492 		else
1493 			pte |= XE_PPGTT_PTE_PAT2;
1494 	}
1495 
1496 	if (pat_index & BIT(3))
1497 		pte |= XELPG_PPGTT_PTE_PAT3;
1498 
1499 	if (pat_index & (BIT(4)))
1500 		pte |= XE2_PPGTT_PTE_PAT4;
1501 
1502 	return pte;
1503 }
1504 
1505 static u64 pte_encode_ps(u32 pt_level)
1506 {
1507 	XE_WARN_ON(pt_level > MAX_HUGEPTE_LEVEL);
1508 
1509 	if (pt_level == 1)
1510 		return XE_PDE_PS_2M;
1511 	else if (pt_level == 2)
1512 		return XE_PDPE_PS_1G;
1513 
1514 	return 0;
1515 }
1516 
1517 static u16 pde_pat_index(struct xe_bo *bo)
1518 {
1519 	struct xe_device *xe = xe_bo_device(bo);
1520 	u16 pat_index;
1521 
1522 	/*
1523 	 * We only have two bits to encode the PAT index in non-leaf nodes, but
1524 	 * these only point to other paging structures so we only need a minimal
1525 	 * selection of options. The user PAT index is only for encoding leaf
1526 	 * nodes, where we have use of more bits to do the encoding. The
1527 	 * non-leaf nodes are instead under driver control so the chosen index
1528  * here should be distinct from the user PAT index. Also the
1529 	 * corresponding coherency of the PAT index should be tied to the
1530 	 * allocation type of the page table (or at least we should pick
1531 	 * something which is always safe).
1532 	 */
1533 	if (!xe_bo_is_vram(bo) && bo->ttm.ttm->caching == ttm_cached)
1534 		pat_index = xe->pat.idx[XE_CACHE_WB];
1535 	else
1536 		pat_index = xe->pat.idx[XE_CACHE_NONE];
1537 
1538 	xe_assert(xe, pat_index <= 3);
1539 
1540 	return pat_index;
1541 }
1542 
1543 static u64 xelp_pde_encode_bo(struct xe_bo *bo, u64 bo_offset)
1544 {
1545 	u64 pde;
1546 
1547 	pde = xe_bo_addr(bo, bo_offset, XE_PAGE_SIZE);
1548 	pde |= XE_PAGE_PRESENT | XE_PAGE_RW;
1549 	pde |= pde_encode_pat_index(pde_pat_index(bo));
1550 
1551 	return pde;
1552 }
1553 
1554 static u64 xelp_pte_encode_bo(struct xe_bo *bo, u64 bo_offset,
1555 			      u16 pat_index, u32 pt_level)
1556 {
1557 	u64 pte;
1558 
1559 	pte = xe_bo_addr(bo, bo_offset, XE_PAGE_SIZE);
1560 	pte |= XE_PAGE_PRESENT | XE_PAGE_RW;
1561 	pte |= pte_encode_pat_index(pat_index, pt_level);
1562 	pte |= pte_encode_ps(pt_level);
1563 
1564 	if (xe_bo_is_vram(bo) || xe_bo_is_stolen_devmem(bo))
1565 		pte |= XE_PPGTT_PTE_DM;
1566 
1567 	return pte;
1568 }
1569 
1570 static u64 xelp_pte_encode_vma(u64 pte, struct xe_vma *vma,
1571 			       u16 pat_index, u32 pt_level)
1572 {
1573 	pte |= XE_PAGE_PRESENT;
1574 
1575 	if (likely(!xe_vma_read_only(vma)))
1576 		pte |= XE_PAGE_RW;
1577 
1578 	pte |= pte_encode_pat_index(pat_index, pt_level);
1579 	pte |= pte_encode_ps(pt_level);
1580 
1581 	if (unlikely(xe_vma_is_null(vma)))
1582 		pte |= XE_PTE_NULL;
1583 
1584 	return pte;
1585 }
1586 
1587 static u64 xelp_pte_encode_addr(struct xe_device *xe, u64 addr,
1588 				u16 pat_index,
1589 				u32 pt_level, bool devmem, u64 flags)
1590 {
1591 	u64 pte;
1592 
1593 	/* Avoid passing random bits directly as flags */
1594 	xe_assert(xe, !(flags & ~XE_PTE_PS64));
1595 
1596 	pte = addr;
1597 	pte |= XE_PAGE_PRESENT | XE_PAGE_RW;
1598 	pte |= pte_encode_pat_index(pat_index, pt_level);
1599 	pte |= pte_encode_ps(pt_level);
1600 
1601 	if (devmem)
1602 		pte |= XE_PPGTT_PTE_DM;
1603 
1604 	pte |= flags;
1605 
1606 	return pte;
1607 }
1608 
1609 static const struct xe_pt_ops xelp_pt_ops = {
1610 	.pte_encode_bo = xelp_pte_encode_bo,
1611 	.pte_encode_vma = xelp_pte_encode_vma,
1612 	.pte_encode_addr = xelp_pte_encode_addr,
1613 	.pde_encode_bo = xelp_pde_encode_bo,
1614 };
1615 
1616 static void vm_destroy_work_func(struct work_struct *w);
1617 
1618 /**
1619  * xe_vm_create_scratch() - Setup a scratch memory pagetable tree for the
1620  * given tile and vm.
1621  * @xe: xe device.
1622  * @tile: tile to set up for.
1623  * @vm: vm to set up for.
1624  *
1625  * Sets up a pagetable tree with one page-table per level and a single
1626  * leaf PTE. All pagetable entries point to the single page-table or,
1627  * for MAX_HUGEPTE_LEVEL, a NULL huge PTE returning 0 on read and
1628  * writes become NOPs.
1629  *
1630  * Return: 0 on success, negative error code on error.
1631  */
1632 static int xe_vm_create_scratch(struct xe_device *xe, struct xe_tile *tile,
1633 				struct xe_vm *vm)
1634 {
1635 	u8 id = tile->id;
1636 	int i;
1637 
1638 	for (i = MAX_HUGEPTE_LEVEL; i < vm->pt_root[id]->level; i++) {
1639 		vm->scratch_pt[id][i] = xe_pt_create(vm, tile, i);
1640 		if (IS_ERR(vm->scratch_pt[id][i])) {
1641 			int err = PTR_ERR(vm->scratch_pt[id][i]);
1642 
1643 			vm->scratch_pt[id][i] = NULL;
1644 			return err;
1645 		}
1646 
1647 		xe_pt_populate_empty(tile, vm, vm->scratch_pt[id][i]);
1648 	}
1649 
1650 	return 0;
1651 }
1652 ALLOW_ERROR_INJECTION(xe_vm_create_scratch, ERRNO);
1653 
1654 static void xe_vm_free_scratch(struct xe_vm *vm)
1655 {
1656 	struct xe_tile *tile;
1657 	u8 id;
1658 
1659 	if (!xe_vm_has_scratch(vm))
1660 		return;
1661 
1662 	for_each_tile(tile, vm->xe, id) {
1663 		u32 i;
1664 
1665 		if (!vm->pt_root[id])
1666 			continue;
1667 
1668 		for (i = MAX_HUGEPTE_LEVEL; i < vm->pt_root[id]->level; ++i)
1669 			if (vm->scratch_pt[id][i])
1670 				xe_pt_destroy(vm->scratch_pt[id][i], vm->flags, NULL);
1671 	}
1672 }
1673 
1674 struct xe_vm *xe_vm_create(struct xe_device *xe, u32 flags, struct xe_file *xef)
1675 {
1676 	struct drm_gem_object *vm_resv_obj;
1677 	struct xe_vm *vm;
1678 	int err, number_tiles = 0;
1679 	struct xe_tile *tile;
1680 	u8 id;
1681 
1682 	/*
1683 	 * Since the GSCCS is not user-accessible, we don't expect a GSC VM to
1684 	 * ever be in faulting mode.
1685 	 */
1686 	xe_assert(xe, !((flags & XE_VM_FLAG_GSC) && (flags & XE_VM_FLAG_FAULT_MODE)));
1687 
1688 	vm = kzalloc(sizeof(*vm), GFP_KERNEL);
1689 	if (!vm)
1690 		return ERR_PTR(-ENOMEM);
1691 
1692 	vm->xe = xe;
1693 
1694 	vm->size = 1ull << xe->info.va_bits;
1695 	vm->flags = flags;
1696 
1697 	if (xef)
1698 		vm->xef = xe_file_get(xef);
1699 	/*
1700 	 * GSC VMs are kernel-owned, only used for PXP ops and can sometimes be
1701 	 * manipulated under the PXP mutex. However, the PXP mutex can be taken
1702 	 * under a user-VM lock when the PXP session is started at exec_queue
1703 	 * creation time. Those are different VMs and therefore there is no risk
1704 	 * of deadlock, but we need to tell lockdep that this is the case or it
1705 	 * will print a warning.
1706 	 */
1707 	if (flags & XE_VM_FLAG_GSC) {
1708 		static struct lock_class_key gsc_vm_key;
1709 
1710 		__init_rwsem(&vm->lock, "gsc_vm", &gsc_vm_key);
1711 	} else {
1712 		init_rwsem(&vm->lock);
1713 	}
1714 	mutex_init(&vm->snap_mutex);
1715 
1716 	INIT_LIST_HEAD(&vm->rebind_list);
1717 
1718 	INIT_LIST_HEAD(&vm->userptr.repin_list);
1719 	INIT_LIST_HEAD(&vm->userptr.invalidated);
1720 	init_rwsem(&vm->userptr.notifier_lock);
1721 	spin_lock_init(&vm->userptr.invalidated_lock);
1722 
1723 	ttm_lru_bulk_move_init(&vm->lru_bulk_move);
1724 
1725 	INIT_WORK(&vm->destroy_work, vm_destroy_work_func);
1726 
1727 	INIT_LIST_HEAD(&vm->preempt.exec_queues);
1728 	vm->preempt.min_run_period_ms = 10;	/* FIXME: Wire up to uAPI */
1729 
1730 	for_each_tile(tile, xe, id)
1731 		xe_range_fence_tree_init(&vm->rftree[id]);
1732 
1733 	vm->pt_ops = &xelp_pt_ops;
1734 
1735 	/*
1736 	 * Long-running workloads are not protected by the scheduler references.
1737 	 * By design, run_job for long-running workloads returns NULL and the
1738 	 * scheduler drops all of its references, hence protecting the VM
1739 	 * for this case is necessary.
1740 	 */
1741 	if (flags & XE_VM_FLAG_LR_MODE) {
1742 		INIT_WORK(&vm->preempt.rebind_work, preempt_rebind_work_func);
1743 		xe_pm_runtime_get_noresume(xe);
1744 	}
1745 
1746 	if (flags & XE_VM_FLAG_FAULT_MODE) {
1747 		err = xe_svm_init(vm);
1748 		if (err)
1749 			goto err_no_resv;
1750 	}
1751 
1752 	vm_resv_obj = drm_gpuvm_resv_object_alloc(&xe->drm);
1753 	if (!vm_resv_obj) {
1754 		err = -ENOMEM;
1755 		goto err_svm_fini;
1756 	}
1757 
1758 	drm_gpuvm_init(&vm->gpuvm, "Xe VM", DRM_GPUVM_RESV_PROTECTED, &xe->drm,
1759 		       vm_resv_obj, 0, vm->size, 0, 0, &gpuvm_ops);
1760 
1761 	drm_gem_object_put(vm_resv_obj);
1762 
1763 	err = xe_vm_lock(vm, true);
1764 	if (err)
1765 		goto err_close;
1766 
1767 	if (IS_DGFX(xe) && xe->info.vram_flags & XE_VRAM_FLAGS_NEED64K)
1768 		vm->flags |= XE_VM_FLAG_64K;
1769 
1770 	for_each_tile(tile, xe, id) {
1771 		if (flags & XE_VM_FLAG_MIGRATION &&
1772 		    tile->id != XE_VM_FLAG_TILE_ID(flags))
1773 			continue;
1774 
1775 		vm->pt_root[id] = xe_pt_create(vm, tile, xe->info.vm_max_level);
1776 		if (IS_ERR(vm->pt_root[id])) {
1777 			err = PTR_ERR(vm->pt_root[id]);
1778 			vm->pt_root[id] = NULL;
1779 			goto err_unlock_close;
1780 		}
1781 	}
1782 
1783 	if (xe_vm_has_scratch(vm)) {
1784 		for_each_tile(tile, xe, id) {
1785 			if (!vm->pt_root[id])
1786 				continue;
1787 
1788 			err = xe_vm_create_scratch(xe, tile, vm);
1789 			if (err)
1790 				goto err_unlock_close;
1791 		}
1792 		vm->batch_invalidate_tlb = true;
1793 	}
1794 
1795 	if (vm->flags & XE_VM_FLAG_LR_MODE)
1796 		vm->batch_invalidate_tlb = false;
1797 
1798 	/* Fill pt_root after allocating scratch tables */
1799 	for_each_tile(tile, xe, id) {
1800 		if (!vm->pt_root[id])
1801 			continue;
1802 
1803 		xe_pt_populate_empty(tile, vm, vm->pt_root[id]);
1804 	}
1805 	xe_vm_unlock(vm);
1806 
1807 	/* Kernel migration VM shouldn't have a circular loop.. */
1808 	if (!(flags & XE_VM_FLAG_MIGRATION)) {
1809 		for_each_tile(tile, xe, id) {
1810 			struct xe_exec_queue *q;
1811 			u32 create_flags = EXEC_QUEUE_FLAG_VM;
1812 
1813 			if (!vm->pt_root[id])
1814 				continue;
1815 
1816 			q = xe_exec_queue_create_bind(xe, tile, create_flags, 0);
1817 			if (IS_ERR(q)) {
1818 				err = PTR_ERR(q);
1819 				goto err_close;
1820 			}
1821 			vm->q[id] = q;
1822 			number_tiles++;
1823 		}
1824 	}
1825 
1826 	if (number_tiles > 1)
1827 		vm->composite_fence_ctx = dma_fence_context_alloc(1);
1828 
1829 	if (xef && xe->info.has_asid) {
1830 		u32 asid;
1831 
1832 		down_write(&xe->usm.lock);
1833 		err = xa_alloc_cyclic(&xe->usm.asid_to_vm, &asid, vm,
1834 				      XA_LIMIT(1, XE_MAX_ASID - 1),
1835 				      &xe->usm.next_asid, GFP_KERNEL);
1836 		up_write(&xe->usm.lock);
1837 		if (err < 0)
1838 			goto err_unlock_close;
1839 
1840 		vm->usm.asid = asid;
1841 	}
1842 
1843 	trace_xe_vm_create(vm);
1844 
1845 	return vm;
1846 
1847 err_unlock_close:
1848 	xe_vm_unlock(vm);
1849 err_close:
1850 	xe_vm_close_and_put(vm);
1851 	return ERR_PTR(err);
1852 
1853 err_svm_fini:
1854 	if (flags & XE_VM_FLAG_FAULT_MODE) {
1855 		vm->size = 0; /* close the vm */
1856 		xe_svm_fini(vm);
1857 	}
1858 err_no_resv:
1859 	mutex_destroy(&vm->snap_mutex);
1860 	for_each_tile(tile, xe, id)
1861 		xe_range_fence_tree_fini(&vm->rftree[id]);
1862 	ttm_lru_bulk_move_fini(&xe->ttm, &vm->lru_bulk_move);
1863 	if (vm->xef)
1864 		xe_file_put(vm->xef);
1865 	kfree(vm);
1866 	if (flags & XE_VM_FLAG_LR_MODE)
1867 		xe_pm_runtime_put(xe);
1868 	return ERR_PTR(err);
1869 }
1870 
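/*
 * Mark the VM as closed (size == 0), wait for pending binds and, if the
 * device is still present, clear the page-table roots and invalidate the
 * TLBs so the hardware can no longer walk this VM's page tables.
 */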
1871 static void xe_vm_close(struct xe_vm *vm)
1872 {
1873 	struct xe_device *xe = vm->xe;
1874 	bool bound;
1875 	int idx;
1876 
1877 	bound = drm_dev_enter(&xe->drm, &idx);
1878 
1879 	down_write(&vm->lock);
1880 	if (xe_vm_in_fault_mode(vm))
1881 		xe_svm_notifier_lock(vm);
1882 
1883 	vm->size = 0;
1884 
1885 	if (!((vm->flags & XE_VM_FLAG_MIGRATION))) {
1886 		struct xe_tile *tile;
1887 		struct xe_gt *gt;
1888 		u8 id;
1889 
1890 		/* Wait for pending binds */
1891 		dma_resv_wait_timeout(xe_vm_resv(vm),
1892 				      DMA_RESV_USAGE_BOOKKEEP,
1893 				      false, MAX_SCHEDULE_TIMEOUT);
1894 
1895 		if (bound) {
1896 			for_each_tile(tile, xe, id)
1897 				if (vm->pt_root[id])
1898 					xe_pt_clear(xe, vm->pt_root[id]);
1899 
1900 			for_each_gt(gt, xe, id)
1901 				xe_tlb_inval_vm(&gt->tlb_inval, vm);
1902 		}
1903 	}
1904 
1905 	if (xe_vm_in_fault_mode(vm))
1906 		xe_svm_notifier_unlock(vm);
1907 	up_write(&vm->lock);
1908 
1909 	if (bound)
1910 		drm_dev_exit(idx);
1911 }
1912 
1913 void xe_vm_close_and_put(struct xe_vm *vm)
1914 {
1915 	LIST_HEAD(contested);
1916 	struct xe_device *xe = vm->xe;
1917 	struct xe_tile *tile;
1918 	struct xe_vma *vma, *next_vma;
1919 	struct drm_gpuva *gpuva, *next;
1920 	u8 id;
1921 
1922 	xe_assert(xe, !vm->preempt.num_exec_queues);
1923 
1924 	xe_vm_close(vm);
1925 	if (xe_vm_in_preempt_fence_mode(vm))
1926 		flush_work(&vm->preempt.rebind_work);
1927 	if (xe_vm_in_fault_mode(vm))
1928 		xe_svm_close(vm);
1929 
1930 	down_write(&vm->lock);
1931 	for_each_tile(tile, xe, id) {
1932 		if (vm->q[id])
1933 			xe_exec_queue_last_fence_put(vm->q[id], vm);
1934 	}
1935 	up_write(&vm->lock);
1936 
1937 	for_each_tile(tile, xe, id) {
1938 		if (vm->q[id]) {
1939 			xe_exec_queue_kill(vm->q[id]);
1940 			xe_exec_queue_put(vm->q[id]);
1941 			vm->q[id] = NULL;
1942 		}
1943 	}
1944 
1945 	down_write(&vm->lock);
1946 	xe_vm_lock(vm, false);
1947 	drm_gpuvm_for_each_va_safe(gpuva, next, &vm->gpuvm) {
1948 		vma = gpuva_to_vma(gpuva);
1949 
1950 		if (xe_vma_has_no_bo(vma)) {
1951 			down_read(&vm->userptr.notifier_lock);
1952 			vma->gpuva.flags |= XE_VMA_DESTROYED;
1953 			up_read(&vm->userptr.notifier_lock);
1954 		}
1955 
1956 		xe_vm_remove_vma(vm, vma);
1957 
1958 		/* easy case, remove from VMA? */
1959 		if (xe_vma_has_no_bo(vma) || xe_vma_bo(vma)->vm) {
1960 			list_del_init(&vma->combined_links.rebind);
1961 			xe_vma_destroy(vma, NULL);
1962 			continue;
1963 		}
1964 
1965 		list_move_tail(&vma->combined_links.destroy, &contested);
1966 		vma->gpuva.flags |= XE_VMA_DESTROYED;
1967 	}
1968 
1969 	/*
1970 	 * All vm operations will add shared fences to resv.
1971 	 * The only exception is eviction for a shared object,
1972 	 * but even so, the unbind when evicted would still
1973 	 * install a fence to resv. Hence it's safe to
1974 	 * destroy the pagetables immediately.
1975 	 */
1976 	xe_vm_free_scratch(vm);
1977 
1978 	for_each_tile(tile, xe, id) {
1979 		if (vm->pt_root[id]) {
1980 			xe_pt_destroy(vm->pt_root[id], vm->flags, NULL);
1981 			vm->pt_root[id] = NULL;
1982 		}
1983 	}
1984 	xe_vm_unlock(vm);
1985 
1986 	/*
1987 	 * The VM is now dead, so no new VMAs can be added to it.
1988 	 * Since we hold a refcount to each BO, we can remove and free
1989 	 * the contested VMAs safely without locking.
1990 	 */
1991 	list_for_each_entry_safe(vma, next_vma, &contested,
1992 				 combined_links.destroy) {
1993 		list_del_init(&vma->combined_links.destroy);
1994 		xe_vma_destroy_unlocked(vma);
1995 	}
1996 
1997 	if (xe_vm_in_fault_mode(vm))
1998 		xe_svm_fini(vm);
1999 
2000 	up_write(&vm->lock);
2001 
2002 	down_write(&xe->usm.lock);
2003 	if (vm->usm.asid) {
2004 		void *lookup;
2005 
2006 		xe_assert(xe, xe->info.has_asid);
2007 		xe_assert(xe, !(vm->flags & XE_VM_FLAG_MIGRATION));
2008 
2009 		lookup = xa_erase(&xe->usm.asid_to_vm, vm->usm.asid);
2010 		xe_assert(xe, lookup == vm);
2011 	}
2012 	up_write(&xe->usm.lock);
2013 
2014 	for_each_tile(tile, xe, id)
2015 		xe_range_fence_tree_fini(&vm->rftree[id]);
2016 
2017 	xe_vm_put(vm);
2018 }
2019 
2020 static void vm_destroy_work_func(struct work_struct *w)
2021 {
2022 	struct xe_vm *vm =
2023 		container_of(w, struct xe_vm, destroy_work);
2024 	struct xe_device *xe = vm->xe;
2025 	struct xe_tile *tile;
2026 	u8 id;
2027 
2028 	/* xe_vm_close_and_put was not called? */
2029 	xe_assert(xe, !vm->size);
2030 
2031 	if (xe_vm_in_preempt_fence_mode(vm))
2032 		flush_work(&vm->preempt.rebind_work);
2033 
2034 	mutex_destroy(&vm->snap_mutex);
2035 
2036 	if (vm->flags & XE_VM_FLAG_LR_MODE)
2037 		xe_pm_runtime_put(xe);
2038 
2039 	for_each_tile(tile, xe, id)
2040 		XE_WARN_ON(vm->pt_root[id]);
2041 
2042 	trace_xe_vm_free(vm);
2043 
2044 	ttm_lru_bulk_move_fini(&xe->ttm, &vm->lru_bulk_move);
2045 
2046 	if (vm->xef)
2047 		xe_file_put(vm->xef);
2048 
2049 	kfree(vm);
2050 }
2051 
2052 static void xe_vm_free(struct drm_gpuvm *gpuvm)
2053 {
2054 	struct xe_vm *vm = container_of(gpuvm, struct xe_vm, gpuvm);
2055 
2056 	/* To destroy the VM we need to be able to sleep */
2057 	queue_work(system_unbound_wq, &vm->destroy_work);
2058 }
2059 
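/*
 * Look up a VM by its user-visible id. On success a reference is taken
 * which the caller must release with xe_vm_put().
 */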
2060 struct xe_vm *xe_vm_lookup(struct xe_file *xef, u32 id)
2061 {
2062 	struct xe_vm *vm;
2063 
2064 	mutex_lock(&xef->vm.lock);
2065 	vm = xa_load(&xef->vm.xa, id);
2066 	if (vm)
2067 		xe_vm_get(vm);
2068 	mutex_unlock(&xef->vm.lock);
2069 
2070 	return vm;
2071 }
2072 
2073 u64 xe_vm_pdp4_descriptor(struct xe_vm *vm, struct xe_tile *tile)
2074 {
2075 	return vm->pt_ops->pde_encode_bo(vm->pt_root[tile->id]->bo, 0);
2076 }
2077 
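/* Fall back to the VM's default exec queue when no queue was supplied */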
2078 static struct xe_exec_queue *
2079 to_wait_exec_queue(struct xe_vm *vm, struct xe_exec_queue *q)
2080 {
2081 	return q ? q : vm->q[0];
2082 }
2083 
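/* Return the first user fence in @syncs with a reference taken, or NULL */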
2084 static struct xe_user_fence *
2085 find_ufence_get(struct xe_sync_entry *syncs, u32 num_syncs)
2086 {
2087 	unsigned int i;
2088 
2089 	for (i = 0; i < num_syncs; i++) {
2090 		struct xe_sync_entry *e = &syncs[i];
2091 
2092 		if (xe_sync_is_ufence(e))
2093 			return xe_sync_ufence_get(e);
2094 	}
2095 
2096 	return NULL;
2097 }
2098 
2099 #define ALL_DRM_XE_VM_CREATE_FLAGS (DRM_XE_VM_CREATE_FLAG_SCRATCH_PAGE | \
2100 				    DRM_XE_VM_CREATE_FLAG_LR_MODE | \
2101 				    DRM_XE_VM_CREATE_FLAG_FAULT_MODE)
2102 
2103 int xe_vm_create_ioctl(struct drm_device *dev, void *data,
2104 		       struct drm_file *file)
2105 {
2106 	struct xe_device *xe = to_xe_device(dev);
2107 	struct xe_file *xef = to_xe_file(file);
2108 	struct drm_xe_vm_create *args = data;
2109 	struct xe_vm *vm;
2110 	u32 id;
2111 	int err;
2112 	u32 flags = 0;
2113 
2114 	if (XE_IOCTL_DBG(xe, args->extensions))
2115 		return -EINVAL;
2116 
2117 	if (XE_GT_WA(xe_root_mmio_gt(xe), 14016763929))
2118 		args->flags |= DRM_XE_VM_CREATE_FLAG_SCRATCH_PAGE;
2119 
2120 	if (XE_IOCTL_DBG(xe, args->flags & DRM_XE_VM_CREATE_FLAG_FAULT_MODE &&
2121 			 !xe->info.has_usm))
2122 		return -EINVAL;
2123 
2124 	if (XE_IOCTL_DBG(xe, args->reserved[0] || args->reserved[1]))
2125 		return -EINVAL;
2126 
2127 	if (XE_IOCTL_DBG(xe, args->flags & ~ALL_DRM_XE_VM_CREATE_FLAGS))
2128 		return -EINVAL;
2129 
2130 	if (XE_IOCTL_DBG(xe, args->flags & DRM_XE_VM_CREATE_FLAG_SCRATCH_PAGE &&
2131 			 args->flags & DRM_XE_VM_CREATE_FLAG_FAULT_MODE &&
2132 			 !xe->info.needs_scratch))
2133 		return -EINVAL;
2134 
2135 	if (XE_IOCTL_DBG(xe, !(args->flags & DRM_XE_VM_CREATE_FLAG_LR_MODE) &&
2136 			 args->flags & DRM_XE_VM_CREATE_FLAG_FAULT_MODE))
2137 		return -EINVAL;
2138 
2139 	if (args->flags & DRM_XE_VM_CREATE_FLAG_SCRATCH_PAGE)
2140 		flags |= XE_VM_FLAG_SCRATCH_PAGE;
2141 	if (args->flags & DRM_XE_VM_CREATE_FLAG_LR_MODE)
2142 		flags |= XE_VM_FLAG_LR_MODE;
2143 	if (args->flags & DRM_XE_VM_CREATE_FLAG_FAULT_MODE)
2144 		flags |= XE_VM_FLAG_FAULT_MODE;
2145 
2146 	vm = xe_vm_create(xe, flags, xef);
2147 	if (IS_ERR(vm))
2148 		return PTR_ERR(vm);
2149 
2150 #if IS_ENABLED(CONFIG_DRM_XE_DEBUG_MEM)
2151 	/* Warning: Security issue - never enable by default */
2152 	args->reserved[0] = xe_bo_main_addr(vm->pt_root[0]->bo, XE_PAGE_SIZE);
2153 #endif
2154 
2155 	/* user id alloc must always be last in ioctl to prevent UAF */
2156 	err = xa_alloc(&xef->vm.xa, &id, vm, xa_limit_32b, GFP_KERNEL);
2157 	if (err)
2158 		goto err_close_and_put;
2159 
2160 	args->vm_id = id;
2161 
2162 	return 0;
2163 
2164 err_close_and_put:
2165 	xe_vm_close_and_put(vm);
2166 
2167 	return err;
2168 }
2169 
2170 int xe_vm_destroy_ioctl(struct drm_device *dev, void *data,
2171 			struct drm_file *file)
2172 {
2173 	struct xe_device *xe = to_xe_device(dev);
2174 	struct xe_file *xef = to_xe_file(file);
2175 	struct drm_xe_vm_destroy *args = data;
2176 	struct xe_vm *vm;
2177 	int err = 0;
2178 
2179 	if (XE_IOCTL_DBG(xe, args->pad) ||
2180 	    XE_IOCTL_DBG(xe, args->reserved[0] || args->reserved[1]))
2181 		return -EINVAL;
2182 
2183 	mutex_lock(&xef->vm.lock);
2184 	vm = xa_load(&xef->vm.xa, args->vm_id);
2185 	if (XE_IOCTL_DBG(xe, !vm))
2186 		err = -ENOENT;
2187 	else if (XE_IOCTL_DBG(xe, vm->preempt.num_exec_queues))
2188 		err = -EBUSY;
2189 	else
2190 		xa_erase(&xef->vm.xa, args->vm_id);
2191 	mutex_unlock(&xef->vm.lock);
2192 
2193 	if (!err)
2194 		xe_vm_close_and_put(vm);
2195 
2196 	return err;
2197 }
2198 
2199 static int xe_vm_query_vmas(struct xe_vm *vm, u64 start, u64 end)
2200 {
2201 	struct drm_gpuva *gpuva;
2202 	u32 num_vmas = 0;
2203 
2204 	lockdep_assert_held(&vm->lock);
2205 	drm_gpuvm_for_each_va_range(gpuva, &vm->gpuvm, start, end)
2206 		num_vmas++;
2207 
2208 	return num_vmas;
2209 }
2210 
2211 static int get_mem_attrs(struct xe_vm *vm, u32 *num_vmas, u64 start,
2212 			 u64 end, struct drm_xe_mem_range_attr *attrs)
2213 {
2214 	struct drm_gpuva *gpuva;
2215 	int i = 0;
2216 
2217 	lockdep_assert_held(&vm->lock);
2218 
2219 	drm_gpuvm_for_each_va_range(gpuva, &vm->gpuvm, start, end) {
2220 		struct xe_vma *vma = gpuva_to_vma(gpuva);
2221 
2222 		if (i == *num_vmas)
2223 			return -ENOSPC;
2224 
2225 		attrs[i].start = xe_vma_start(vma);
2226 		attrs[i].end = xe_vma_end(vma);
2227 		attrs[i].atomic.val = vma->attr.atomic_access;
2228 		attrs[i].pat_index.val = vma->attr.pat_index;
2229 		attrs[i].preferred_mem_loc.devmem_fd = vma->attr.preferred_loc.devmem_fd;
2230 		attrs[i].preferred_mem_loc.migration_policy =
2231 		vma->attr.preferred_loc.migration_policy;
2232 
2233 		i++;
2234 	}
2235 
2236 	*num_vmas = i;
2237 	return 0;
2238 }
2239 
2240 int xe_vm_query_vmas_attrs_ioctl(struct drm_device *dev, void *data, struct drm_file *file)
2241 {
2242 	struct xe_device *xe = to_xe_device(dev);
2243 	struct xe_file *xef = to_xe_file(file);
2244 	struct drm_xe_mem_range_attr *mem_attrs;
2245 	struct drm_xe_vm_query_mem_range_attr *args = data;
2246 	u64 __user *attrs_user = u64_to_user_ptr(args->vector_of_mem_attr);
2247 	struct xe_vm *vm;
2248 	int err = 0;
2249 
2250 	if (XE_IOCTL_DBG(xe,
2251 			 ((args->num_mem_ranges == 0 &&
2252 			  (attrs_user || args->sizeof_mem_range_attr != 0)) ||
2253 			 (args->num_mem_ranges > 0 &&
2254 			  (!attrs_user ||
2255 			   args->sizeof_mem_range_attr !=
2256 			   sizeof(struct drm_xe_mem_range_attr))))))
2257 		return -EINVAL;
2258 
2259 	vm = xe_vm_lookup(xef, args->vm_id);
2260 	if (XE_IOCTL_DBG(xe, !vm))
2261 		return -EINVAL;
2262 
2263 	err = down_read_interruptible(&vm->lock);
2264 	if (err)
2265 		goto put_vm;
2266 
2267 	attrs_user = u64_to_user_ptr(args->vector_of_mem_attr);
2268 
2269 	if (args->num_mem_ranges == 0 && !attrs_user) {
2270 		args->num_mem_ranges = xe_vm_query_vmas(vm, args->start, args->start + args->range);
2271 		args->sizeof_mem_range_attr = sizeof(struct drm_xe_mem_range_attr);
2272 		goto unlock_vm;
2273 	}
2274 
2275 	mem_attrs = kvmalloc_array(args->num_mem_ranges, args->sizeof_mem_range_attr,
2276 				   GFP_KERNEL | __GFP_ACCOUNT |
2277 				   __GFP_RETRY_MAYFAIL | __GFP_NOWARN);
2278 	if (!mem_attrs) {
2279 		err = args->num_mem_ranges > 1 ? -ENOBUFS : -ENOMEM;
2280 		goto unlock_vm;
2281 	}
2282 
2283 	memset(mem_attrs, 0, args->num_mem_ranges * args->sizeof_mem_range_attr);
2284 	err = get_mem_attrs(vm, &args->num_mem_ranges, args->start,
2285 			    args->start + args->range, mem_attrs);
2286 	if (err)
2287 		goto free_mem_attrs;
2288 
2289 	if (copy_to_user(attrs_user, mem_attrs,
2290 			 args->sizeof_mem_range_attr * args->num_mem_ranges))
		err = -EFAULT;
2291 
2292 free_mem_attrs:
2293 	kvfree(mem_attrs);
2294 unlock_vm:
2295 	up_read(&vm->lock);
2296 put_vm:
2297 	xe_vm_put(vm);
2298 	return err;
2299 }
2300 
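/* Does the 4K page at @page_addr overlap @vma? */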
2301 static bool vma_matches(struct xe_vma *vma, u64 page_addr)
2302 {
2303 	if (page_addr > xe_vma_end(vma) - 1 ||
2304 	    page_addr + SZ_4K - 1 < xe_vma_start(vma))
2305 		return false;
2306 
2307 	return true;
2308 }
2309 
2310 /**
2311  * xe_vm_find_vma_by_addr() - Find a VMA by its address
2312  *
2313  * @vm: the xe_vm the vma belongs to
2314  * @page_addr: address to look up
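 *
 * Return: the VMA covering @page_addr, or NULL if no VMA is found.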
2315  */
2316 struct xe_vma *xe_vm_find_vma_by_addr(struct xe_vm *vm, u64 page_addr)
2317 {
2318 	struct xe_vma *vma = NULL;
2319 
2320 	if (vm->usm.last_fault_vma) {   /* Fast lookup */
2321 		if (vma_matches(vm->usm.last_fault_vma, page_addr))
2322 			vma = vm->usm.last_fault_vma;
2323 	}
2324 	if (!vma)
2325 		vma = xe_vm_find_overlapping_vma(vm, page_addr, SZ_4K);
2326 
2327 	return vma;
2328 }
2329 
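/*
 * Maps the UAPI prefetch_mem_region_instance to a TTM placement:
 * 0 = TT (system memory), 1/2 = VRAM0/VRAM1.
 */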
2330 static const u32 region_to_mem_type[] = {
2331 	XE_PL_TT,
2332 	XE_PL_VRAM0,
2333 	XE_PL_VRAM1,
2334 };
2335 
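/*
 * Mark @vma as destroyed under the userptr notifier lock and, if it was
 * already committed, remove it from the VM.
 */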
2336 static void prep_vma_destroy(struct xe_vm *vm, struct xe_vma *vma,
2337 			     bool post_commit)
2338 {
2339 	down_read(&vm->userptr.notifier_lock);
2340 	vma->gpuva.flags |= XE_VMA_DESTROYED;
2341 	up_read(&vm->userptr.notifier_lock);
2342 	if (post_commit)
2343 		xe_vm_remove_vma(vm, vma);
2344 }
2345 
2346 #undef ULL
2347 #define ULL	unsigned long long
2348 
2349 #if IS_ENABLED(CONFIG_DRM_XE_DEBUG_VM)
2350 static void print_op(struct xe_device *xe, struct drm_gpuva_op *op)
2351 {
2352 	struct xe_vma *vma;
2353 
2354 	switch (op->op) {
2355 	case DRM_GPUVA_OP_MAP:
2356 		vm_dbg(&xe->drm, "MAP: addr=0x%016llx, range=0x%016llx",
2357 		       (ULL)op->map.va.addr, (ULL)op->map.va.range);
2358 		break;
2359 	case DRM_GPUVA_OP_REMAP:
2360 		vma = gpuva_to_vma(op->remap.unmap->va);
2361 		vm_dbg(&xe->drm, "REMAP:UNMAP: addr=0x%016llx, range=0x%016llx, keep=%d",
2362 		       (ULL)xe_vma_start(vma), (ULL)xe_vma_size(vma),
2363 		       op->remap.unmap->keep ? 1 : 0);
2364 		if (op->remap.prev)
2365 			vm_dbg(&xe->drm,
2366 			       "REMAP:PREV: addr=0x%016llx, range=0x%016llx",
2367 			       (ULL)op->remap.prev->va.addr,
2368 			       (ULL)op->remap.prev->va.range);
2369 		if (op->remap.next)
2370 			vm_dbg(&xe->drm,
2371 			       "REMAP:NEXT: addr=0x%016llx, range=0x%016llx",
2372 			       (ULL)op->remap.next->va.addr,
2373 			       (ULL)op->remap.next->va.range);
2374 		break;
2375 	case DRM_GPUVA_OP_UNMAP:
2376 		vma = gpuva_to_vma(op->unmap.va);
2377 		vm_dbg(&xe->drm, "UNMAP: addr=0x%016llx, range=0x%016llx, keep=%d",
2378 		       (ULL)xe_vma_start(vma), (ULL)xe_vma_size(vma),
2379 		       op->unmap.keep ? 1 : 0);
2380 		break;
2381 	case DRM_GPUVA_OP_PREFETCH:
2382 		vma = gpuva_to_vma(op->prefetch.va);
2383 		vm_dbg(&xe->drm, "PREFETCH: addr=0x%016llx, range=0x%016llx",
2384 		       (ULL)xe_vma_start(vma), (ULL)xe_vma_size(vma));
2385 		break;
2386 	default:
2387 		drm_warn(&xe->drm, "NOT POSSIBLE");
2388 	}
2389 }
2390 #else
2391 static void print_op(struct xe_device *xe, struct drm_gpuva_op *op)
2392 {
2393 }
2394 #endif
2395 
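/*
 * In fault mode with scratch pages, a deferred (non-immediate) bind must
 * clear any scratch PTEs covering the range so that accesses fault and
 * trigger the real bind instead of hitting the scratch page.
 */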
2396 static bool __xe_vm_needs_clear_scratch_pages(struct xe_vm *vm, u32 bind_flags)
2397 {
2398 	if (!xe_vm_in_fault_mode(vm))
2399 		return false;
2400 
2401 	if (!xe_vm_has_scratch(vm))
2402 		return false;
2403 
2404 	if (bind_flags & DRM_XE_VM_BIND_FLAG_IMMEDIATE)
2405 		return false;
2406 
2407 	return true;
2408 }
2409 
2410 static void xe_svm_prefetch_gpuva_ops_fini(struct drm_gpuva_ops *ops)
2411 {
2412 	struct drm_gpuva_op *__op;
2413 
2414 	drm_gpuva_for_each_op(__op, ops) {
2415 		struct xe_vma_op *op = gpuva_op_to_vma_op(__op);
2416 
2417 		xe_vma_svm_prefetch_op_fini(op);
2418 	}
2419 }
2420 
2421 /*
2422  * Create the operations list from the IOCTL arguments and set up operation
2423  * fields so the parse and commit steps are decoupled from the IOCTL. This step can fail.
2424  */
2425 static struct drm_gpuva_ops *
2426 vm_bind_ioctl_ops_create(struct xe_vm *vm, struct xe_vma_ops *vops,
2427 			 struct xe_bo *bo, u64 bo_offset_or_userptr,
2428 			 u64 addr, u64 range,
2429 			 u32 operation, u32 flags,
2430 			 u32 prefetch_region, u16 pat_index)
2431 {
2432 	struct drm_gem_object *obj = bo ? &bo->ttm.base : NULL;
2433 	struct drm_gpuva_ops *ops;
2434 	struct drm_gpuva_op *__op;
2435 	struct drm_gpuvm_bo *vm_bo;
2436 	u64 range_end = addr + range;
2437 	int err;
2438 
2439 	lockdep_assert_held_write(&vm->lock);
2440 
2441 	vm_dbg(&vm->xe->drm,
2442 	       "op=%d, addr=0x%016llx, range=0x%016llx, bo_offset_or_userptr=0x%016llx",
2443 	       operation, (ULL)addr, (ULL)range,
2444 	       (ULL)bo_offset_or_userptr);
2445 
2446 	switch (operation) {
2447 	case DRM_XE_VM_BIND_OP_MAP:
2448 	case DRM_XE_VM_BIND_OP_MAP_USERPTR: {
2449 		struct drm_gpuvm_map_req map_req = {
2450 			.map.va.addr = addr,
2451 			.map.va.range = range,
2452 			.map.gem.obj = obj,
2453 			.map.gem.offset = bo_offset_or_userptr,
2454 		};
2455 
2456 		ops = drm_gpuvm_sm_map_ops_create(&vm->gpuvm, &map_req);
2457 		break;
2458 	}
2459 	case DRM_XE_VM_BIND_OP_UNMAP:
2460 		ops = drm_gpuvm_sm_unmap_ops_create(&vm->gpuvm, addr, range);
2461 		break;
2462 	case DRM_XE_VM_BIND_OP_PREFETCH:
2463 		ops = drm_gpuvm_prefetch_ops_create(&vm->gpuvm, addr, range);
2464 		break;
2465 	case DRM_XE_VM_BIND_OP_UNMAP_ALL:
2466 		xe_assert(vm->xe, bo);
2467 
2468 		err = xe_bo_lock(bo, true);
2469 		if (err)
2470 			return ERR_PTR(err);
2471 
2472 		vm_bo = drm_gpuvm_bo_obtain(&vm->gpuvm, obj);
2473 		if (IS_ERR(vm_bo)) {
2474 			xe_bo_unlock(bo);
2475 			return ERR_CAST(vm_bo);
2476 		}
2477 
2478 		ops = drm_gpuvm_bo_unmap_ops_create(vm_bo);
2479 		drm_gpuvm_bo_put(vm_bo);
2480 		xe_bo_unlock(bo);
2481 		break;
2482 	default:
2483 		drm_warn(&vm->xe->drm, "NOT POSSIBLE");
2484 		ops = ERR_PTR(-EINVAL);
2485 	}
2486 	if (IS_ERR(ops))
2487 		return ops;
2488 
2489 	drm_gpuva_for_each_op(__op, ops) {
2490 		struct xe_vma_op *op = gpuva_op_to_vma_op(__op);
2491 
2492 		if (__op->op == DRM_GPUVA_OP_MAP) {
2493 			op->map.immediate =
2494 				flags & DRM_XE_VM_BIND_FLAG_IMMEDIATE;
2495 			op->map.read_only =
2496 				flags & DRM_XE_VM_BIND_FLAG_READONLY;
2497 			op->map.is_null = flags & DRM_XE_VM_BIND_FLAG_NULL;
2498 			op->map.is_cpu_addr_mirror = flags &
2499 				DRM_XE_VM_BIND_FLAG_CPU_ADDR_MIRROR;
2500 			op->map.dumpable = flags & DRM_XE_VM_BIND_FLAG_DUMPABLE;
2501 			op->map.pat_index = pat_index;
2502 			op->map.invalidate_on_bind =
2503 				__xe_vm_needs_clear_scratch_pages(vm, flags);
2504 		} else if (__op->op == DRM_GPUVA_OP_PREFETCH) {
2505 			struct xe_vma *vma = gpuva_to_vma(op->base.prefetch.va);
2506 			struct xe_tile *tile;
2507 			struct xe_svm_range *svm_range;
2508 			struct drm_gpusvm_ctx ctx = {};
2509 			struct drm_pagemap *dpagemap;
2510 			u8 id, tile_mask = 0;
2511 			u32 i;
2512 
2513 			if (!xe_vma_is_cpu_addr_mirror(vma)) {
2514 				op->prefetch.region = prefetch_region;
2515 				break;
2516 			}
2517 
2518 			ctx.read_only = xe_vma_read_only(vma);
2519 			ctx.devmem_possible = IS_DGFX(vm->xe) &&
2520 					      IS_ENABLED(CONFIG_DRM_XE_PAGEMAP);
2521 
2522 			for_each_tile(tile, vm->xe, id)
2523 				tile_mask |= 0x1 << id;
2524 
2525 			xa_init_flags(&op->prefetch_range.range, XA_FLAGS_ALLOC);
2526 			op->prefetch_range.ranges_count = 0;
2527 			tile = NULL;
2528 
2529 			if (prefetch_region == DRM_XE_CONSULT_MEM_ADVISE_PREF_LOC) {
2530 				dpagemap = xe_vma_resolve_pagemap(vma,
2531 								  xe_device_get_root_tile(vm->xe));
2532 				/*
2533 				 * TODO: Once multi-GPU support is enabled, we will need
2534 				 * a way to derive the tile from the dpagemap.
2535 				 */
2536 				if (dpagemap)
2537 					tile = xe_device_get_root_tile(vm->xe);
2538 			} else if (prefetch_region) {
2539 				tile = &vm->xe->tiles[region_to_mem_type[prefetch_region] -
2540 						      XE_PL_VRAM0];
2541 			}
2542 
2543 			op->prefetch_range.tile = tile;
2544 alloc_next_range:
2545 			svm_range = xe_svm_range_find_or_insert(vm, addr, vma, &ctx);
2546 
2547 			if (PTR_ERR(svm_range) == -ENOENT) {
2548 				u64 ret = xe_svm_find_vma_start(vm, addr, range_end, vma);
2549 
2550 				addr = ret == ULONG_MAX ? 0 : ret;
2551 				if (addr)
2552 					goto alloc_next_range;
2553 				else
2554 					goto print_op_label;
2555 			}
2556 
2557 			if (IS_ERR(svm_range)) {
2558 				err = PTR_ERR(svm_range);
2559 				goto unwind_prefetch_ops;
2560 			}
2561 
2562 			if (xe_svm_range_validate(vm, svm_range, tile_mask, !!tile)) {
2563 				xe_svm_range_debug(svm_range, "PREFETCH - RANGE IS VALID");
2564 				goto check_next_range;
2565 			}
2566 
2567 			err = xa_alloc(&op->prefetch_range.range,
2568 				       &i, svm_range, xa_limit_32b,
2569 				       GFP_KERNEL);
2570 
2571 			if (err)
2572 				goto unwind_prefetch_ops;
2573 
2574 			op->prefetch_range.ranges_count++;
2575 			vops->flags |= XE_VMA_OPS_FLAG_HAS_SVM_PREFETCH;
2576 			xe_svm_range_debug(svm_range, "PREFETCH - RANGE CREATED");
2577 check_next_range:
2578 			if (range_end > xe_svm_range_end(svm_range) &&
2579 			    xe_svm_range_end(svm_range) < xe_vma_end(vma)) {
2580 				addr = xe_svm_range_end(svm_range);
2581 				goto alloc_next_range;
2582 			}
2583 		}
2584 print_op_label:
2585 		print_op(vm->xe, __op);
2586 	}
2587 
2588 	return ops;
2589 
2590 unwind_prefetch_ops:
2591 	xe_svm_prefetch_gpuva_ops_fini(ops);
2592 	drm_gpuva_ops_free(&vm->gpuvm, ops);
2593 	return ERR_PTR(err);
2594 }
2595 
2596 ALLOW_ERROR_INJECTION(vm_bind_ioctl_ops_create, ERRNO);
2597 
2598 static struct xe_vma *new_vma(struct xe_vm *vm, struct drm_gpuva_op_map *op,
2599 			      struct xe_vma_mem_attr *attr, unsigned int flags)
2600 {
2601 	struct xe_bo *bo = op->gem.obj ? gem_to_xe_bo(op->gem.obj) : NULL;
2602 	struct drm_exec exec;
2603 	struct xe_vma *vma;
2604 	int err = 0;
2605 
2606 	lockdep_assert_held_write(&vm->lock);
2607 
2608 	if (bo) {
2609 		drm_exec_init(&exec, DRM_EXEC_INTERRUPTIBLE_WAIT, 0);
2610 		drm_exec_until_all_locked(&exec) {
2611 			err = 0;
2612 			if (!bo->vm) {
2613 				err = drm_exec_lock_obj(&exec, xe_vm_obj(vm));
2614 				drm_exec_retry_on_contention(&exec);
2615 			}
2616 			if (!err) {
2617 				err = drm_exec_lock_obj(&exec, &bo->ttm.base);
2618 				drm_exec_retry_on_contention(&exec);
2619 			}
2620 			if (err) {
2621 				drm_exec_fini(&exec);
2622 				return ERR_PTR(err);
2623 			}
2624 		}
2625 	}
2626 	vma = xe_vma_create(vm, bo, op->gem.offset,
2627 			    op->va.addr, op->va.addr +
2628 			    op->va.range - 1, attr, flags);
2629 	if (IS_ERR(vma))
2630 		goto err_unlock;
2631 
2632 	if (xe_vma_is_userptr(vma))
2633 		err = xe_vma_userptr_pin_pages(to_userptr_vma(vma));
2634 	else if (!xe_vma_has_no_bo(vma) && !bo->vm)
2635 		err = add_preempt_fences(vm, bo);
2636 
2637 err_unlock:
2638 	if (bo)
2639 		drm_exec_fini(&exec);
2640 
2641 	if (err) {
2642 		prep_vma_destroy(vm, vma, false);
2643 		xe_vma_destroy_unlocked(vma);
2644 		vma = ERR_PTR(err);
2645 	}
2646 
2647 	return vma;
2648 }
2649 
2650 static u64 xe_vma_max_pte_size(struct xe_vma *vma)
2651 {
2652 	if (vma->gpuva.flags & XE_VMA_PTE_1G)
2653 		return SZ_1G;
2654 	else if (vma->gpuva.flags & (XE_VMA_PTE_2M | XE_VMA_PTE_COMPACT))
2655 		return SZ_2M;
2656 	else if (vma->gpuva.flags & XE_VMA_PTE_64K)
2657 		return SZ_64K;
2658 	else if (vma->gpuva.flags & XE_VMA_PTE_4K)
2659 		return SZ_4K;
2660 
2661 	return SZ_1G;	/* Uninitialized, use max size */
2662 }
2663 
2664 static void xe_vma_set_pte_size(struct xe_vma *vma, u64 size)
2665 {
2666 	switch (size) {
2667 	case SZ_1G:
2668 		vma->gpuva.flags |= XE_VMA_PTE_1G;
2669 		break;
2670 	case SZ_2M:
2671 		vma->gpuva.flags |= XE_VMA_PTE_2M;
2672 		break;
2673 	case SZ_64K:
2674 		vma->gpuva.flags |= XE_VMA_PTE_64K;
2675 		break;
2676 	case SZ_4K:
2677 		vma->gpuva.flags |= XE_VMA_PTE_4K;
2678 		break;
2679 	}
2680 }
2681 
2682 static int xe_vma_op_commit(struct xe_vm *vm, struct xe_vma_op *op)
2683 {
2684 	int err = 0;
2685 
2686 	lockdep_assert_held_write(&vm->lock);
2687 
2688 	switch (op->base.op) {
2689 	case DRM_GPUVA_OP_MAP:
2690 		err |= xe_vm_insert_vma(vm, op->map.vma);
2691 		if (!err)
2692 			op->flags |= XE_VMA_OP_COMMITTED;
2693 		break;
2694 	case DRM_GPUVA_OP_REMAP:
2695 	{
2696 		u8 tile_present =
2697 			gpuva_to_vma(op->base.remap.unmap->va)->tile_present;
2698 
2699 		prep_vma_destroy(vm, gpuva_to_vma(op->base.remap.unmap->va),
2700 				 true);
2701 		op->flags |= XE_VMA_OP_COMMITTED;
2702 
2703 		if (op->remap.prev) {
2704 			err |= xe_vm_insert_vma(vm, op->remap.prev);
2705 			if (!err)
2706 				op->flags |= XE_VMA_OP_PREV_COMMITTED;
2707 			if (!err && op->remap.skip_prev) {
2708 				op->remap.prev->tile_present =
2709 					tile_present;
2710 				op->remap.prev = NULL;
2711 			}
2712 		}
2713 		if (op->remap.next) {
2714 			err |= xe_vm_insert_vma(vm, op->remap.next);
2715 			if (!err)
2716 				op->flags |= XE_VMA_OP_NEXT_COMMITTED;
2717 			if (!err && op->remap.skip_next) {
2718 				op->remap.next->tile_present =
2719 					tile_present;
2720 				op->remap.next = NULL;
2721 			}
2722 		}
2723 
2724 		/* Adjust for partial unbind after removing VMA from VM */
2725 		if (!err) {
2726 			op->base.remap.unmap->va->va.addr = op->remap.start;
2727 			op->base.remap.unmap->va->va.range = op->remap.range;
2728 		}
2729 		break;
2730 	}
2731 	case DRM_GPUVA_OP_UNMAP:
2732 		prep_vma_destroy(vm, gpuva_to_vma(op->base.unmap.va), true);
2733 		op->flags |= XE_VMA_OP_COMMITTED;
2734 		break;
2735 	case DRM_GPUVA_OP_PREFETCH:
2736 		op->flags |= XE_VMA_OP_COMMITTED;
2737 		break;
2738 	default:
2739 		drm_warn(&vm->xe->drm, "NOT POSSIBLE");
2740 	}
2741 
2742 	return err;
2743 }
2744 
2745 /**
2746  * xe_vma_has_default_mem_attrs() - Check if a VMA has default memory attributes
2747  * @vma: Pointer to the xe_vma structure to check
2748  *
2749  * This function determines whether the given VMA (Virtual Memory Area)
2750  * has its memory attributes set to their default values. Specifically,
2751  * it checks the following conditions:
2752  *
2753  * - `atomic_access` is `DRM_XE_VMA_ATOMIC_UNDEFINED`
2754  * - `pat_index` is equal to `default_pat_index`
2755  * - `preferred_loc.devmem_fd` is `DRM_XE_PREFERRED_LOC_DEFAULT_DEVICE`
2756  * - `preferred_loc.migration_policy` is `DRM_XE_MIGRATE_ALL_PAGES`
2757  *
2758  * Return: true if all attributes are at their default values, false otherwise.
2759  */
2760 bool xe_vma_has_default_mem_attrs(struct xe_vma *vma)
2761 {
2762 	return (vma->attr.atomic_access == DRM_XE_ATOMIC_UNDEFINED &&
2763 		vma->attr.pat_index ==  vma->attr.default_pat_index &&
2764 		vma->attr.preferred_loc.devmem_fd == DRM_XE_PREFERRED_LOC_DEFAULT_DEVICE &&
2765 		vma->attr.preferred_loc.migration_policy == DRM_XE_MIGRATE_ALL_PAGES);
2766 }
2767 
2768 static int vm_bind_ioctl_ops_parse(struct xe_vm *vm, struct drm_gpuva_ops *ops,
2769 				   struct xe_vma_ops *vops)
2770 {
2771 	struct xe_device *xe = vm->xe;
2772 	struct drm_gpuva_op *__op;
2773 	struct xe_tile *tile;
2774 	u8 id, tile_mask = 0;
2775 	int err = 0;
2776 
2777 	lockdep_assert_held_write(&vm->lock);
2778 
2779 	for_each_tile(tile, vm->xe, id)
2780 		tile_mask |= 0x1 << id;
2781 
2782 	drm_gpuva_for_each_op(__op, ops) {
2783 		struct xe_vma_op *op = gpuva_op_to_vma_op(__op);
2784 		struct xe_vma *vma;
2785 		unsigned int flags = 0;
2786 
2787 		INIT_LIST_HEAD(&op->link);
2788 		list_add_tail(&op->link, &vops->list);
2789 		op->tile_mask = tile_mask;
2790 
2791 		switch (op->base.op) {
2792 		case DRM_GPUVA_OP_MAP:
2793 		{
2794 			struct xe_vma_mem_attr default_attr = {
2795 				.preferred_loc = {
2796 					.devmem_fd = DRM_XE_PREFERRED_LOC_DEFAULT_DEVICE,
2797 					.migration_policy = DRM_XE_MIGRATE_ALL_PAGES,
2798 				},
2799 				.atomic_access = DRM_XE_ATOMIC_UNDEFINED,
2800 				.default_pat_index = op->map.pat_index,
2801 				.pat_index = op->map.pat_index,
2802 			};
2803 
2804 			flags |= op->map.read_only ?
2805 				VMA_CREATE_FLAG_READ_ONLY : 0;
2806 			flags |= op->map.is_null ?
2807 				VMA_CREATE_FLAG_IS_NULL : 0;
2808 			flags |= op->map.dumpable ?
2809 				VMA_CREATE_FLAG_DUMPABLE : 0;
2810 			flags |= op->map.is_cpu_addr_mirror ?
2811 				VMA_CREATE_FLAG_IS_SYSTEM_ALLOCATOR : 0;
2812 
2813 			vma = new_vma(vm, &op->base.map, &default_attr,
2814 				      flags);
2815 			if (IS_ERR(vma))
2816 				return PTR_ERR(vma);
2817 
2818 			op->map.vma = vma;
2819 			if (((op->map.immediate || !xe_vm_in_fault_mode(vm)) &&
2820 			     !op->map.is_cpu_addr_mirror) ||
2821 			    op->map.invalidate_on_bind)
2822 				xe_vma_ops_incr_pt_update_ops(vops,
2823 							      op->tile_mask, 1);
2824 			break;
2825 		}
2826 		case DRM_GPUVA_OP_REMAP:
2827 		{
2828 			struct xe_vma *old =
2829 				gpuva_to_vma(op->base.remap.unmap->va);
2830 			bool skip = xe_vma_is_cpu_addr_mirror(old);
2831 			u64 start = xe_vma_start(old), end = xe_vma_end(old);
2832 			int num_remap_ops = 0;
2833 
2834 			if (op->base.remap.prev)
2835 				start = op->base.remap.prev->va.addr +
2836 					op->base.remap.prev->va.range;
2837 			if (op->base.remap.next)
2838 				end = op->base.remap.next->va.addr;
2839 
2840 			if (xe_vma_is_cpu_addr_mirror(old) &&
2841 			    xe_svm_has_mapping(vm, start, end)) {
2842 				if (vops->flags & XE_VMA_OPS_FLAG_MADVISE)
2843 					xe_svm_unmap_address_range(vm, start, end);
2844 				else
2845 					return -EBUSY;
2846 			}
2847 
2848 			op->remap.start = xe_vma_start(old);
2849 			op->remap.range = xe_vma_size(old);
2850 
2851 			flags |= op->base.remap.unmap->va->flags &
2852 				XE_VMA_READ_ONLY ?
2853 				VMA_CREATE_FLAG_READ_ONLY : 0;
2854 			flags |= op->base.remap.unmap->va->flags &
2855 				DRM_GPUVA_SPARSE ?
2856 				VMA_CREATE_FLAG_IS_NULL : 0;
2857 			flags |= op->base.remap.unmap->va->flags &
2858 				XE_VMA_DUMPABLE ?
2859 				VMA_CREATE_FLAG_DUMPABLE : 0;
2860 			flags |= xe_vma_is_cpu_addr_mirror(old) ?
2861 				VMA_CREATE_FLAG_IS_SYSTEM_ALLOCATOR : 0;
2862 
2863 			if (op->base.remap.prev) {
2864 				vma = new_vma(vm, op->base.remap.prev,
2865 					      &old->attr, flags);
2866 				if (IS_ERR(vma))
2867 					return PTR_ERR(vma);
2868 
2869 				op->remap.prev = vma;
2870 
2871 				/*
2872 				 * Userptr creates a new SG mapping so
2873 				 * we must also rebind.
2874 				 */
2875 				op->remap.skip_prev = skip ||
2876 					(!xe_vma_is_userptr(old) &&
2877 					IS_ALIGNED(xe_vma_end(vma),
2878 						   xe_vma_max_pte_size(old)));
2879 				if (op->remap.skip_prev) {
2880 					xe_vma_set_pte_size(vma, xe_vma_max_pte_size(old));
2881 					op->remap.range -=
2882 						xe_vma_end(vma) -
2883 						xe_vma_start(old);
2884 					op->remap.start = xe_vma_end(vma);
2885 					vm_dbg(&xe->drm, "REMAP:SKIP_PREV: addr=0x%016llx, range=0x%016llx",
2886 					       (ULL)op->remap.start,
2887 					       (ULL)op->remap.range);
2888 				} else {
2889 					num_remap_ops++;
2890 				}
2891 			}
2892 
2893 			if (op->base.remap.next) {
2894 				vma = new_vma(vm, op->base.remap.next,
2895 					      &old->attr, flags);
2896 				if (IS_ERR(vma))
2897 					return PTR_ERR(vma);
2898 
2899 				op->remap.next = vma;
2900 
2901 				/*
2902 				 * Userptr creates a new SG mapping so
2903 				 * we must also rebind.
2904 				 */
2905 				op->remap.skip_next = skip ||
2906 					(!xe_vma_is_userptr(old) &&
2907 					IS_ALIGNED(xe_vma_start(vma),
2908 						   xe_vma_max_pte_size(old)));
2909 				if (op->remap.skip_next) {
2910 					xe_vma_set_pte_size(vma, xe_vma_max_pte_size(old));
2911 					op->remap.range -=
2912 						xe_vma_end(old) -
2913 						xe_vma_start(vma);
2914 					vm_dbg(&xe->drm, "REMAP:SKIP_NEXT: addr=0x%016llx, range=0x%016llx",
2915 					       (ULL)op->remap.start,
2916 					       (ULL)op->remap.range);
2917 				} else {
2918 					num_remap_ops++;
2919 				}
2920 			}
2921 			if (!skip)
2922 				num_remap_ops++;
2923 
2924 			xe_vma_ops_incr_pt_update_ops(vops, op->tile_mask, num_remap_ops);
2925 			break;
2926 		}
2927 		case DRM_GPUVA_OP_UNMAP:
2928 			vma = gpuva_to_vma(op->base.unmap.va);
2929 
2930 			if (xe_vma_is_cpu_addr_mirror(vma) &&
2931 			    xe_svm_has_mapping(vm, xe_vma_start(vma),
2932 					       xe_vma_end(vma)))
2933 				return -EBUSY;
2934 
2935 			if (!xe_vma_is_cpu_addr_mirror(vma))
2936 				xe_vma_ops_incr_pt_update_ops(vops, op->tile_mask, 1);
2937 			break;
2938 		case DRM_GPUVA_OP_PREFETCH:
2939 			vma = gpuva_to_vma(op->base.prefetch.va);
2940 
2941 			if (xe_vma_is_userptr(vma)) {
2942 				err = xe_vma_userptr_pin_pages(to_userptr_vma(vma));
2943 				if (err)
2944 					return err;
2945 			}
2946 
2947 			if (xe_vma_is_cpu_addr_mirror(vma))
2948 				xe_vma_ops_incr_pt_update_ops(vops, op->tile_mask,
2949 							      op->prefetch_range.ranges_count);
2950 			else
2951 				xe_vma_ops_incr_pt_update_ops(vops, op->tile_mask, 1);
2952 
2953 			break;
2954 		default:
2955 			drm_warn(&vm->xe->drm, "NOT POSSIBLE");
2956 		}
2957 
2958 		err = xe_vma_op_commit(vm, op);
2959 		if (err)
2960 			return err;
2961 	}
2962 
2963 	return 0;
2964 }
2965 
2966 static void xe_vma_op_unwind(struct xe_vm *vm, struct xe_vma_op *op,
2967 			     bool post_commit, bool prev_post_commit,
2968 			     bool next_post_commit)
2969 {
2970 	lockdep_assert_held_write(&vm->lock);
2971 
2972 	switch (op->base.op) {
2973 	case DRM_GPUVA_OP_MAP:
2974 		if (op->map.vma) {
2975 			prep_vma_destroy(vm, op->map.vma, post_commit);
2976 			xe_vma_destroy_unlocked(op->map.vma);
2977 		}
2978 		break;
2979 	case DRM_GPUVA_OP_UNMAP:
2980 	{
2981 		struct xe_vma *vma = gpuva_to_vma(op->base.unmap.va);
2982 
2983 		if (vma) {
2984 			down_read(&vm->userptr.notifier_lock);
2985 			vma->gpuva.flags &= ~XE_VMA_DESTROYED;
2986 			up_read(&vm->userptr.notifier_lock);
2987 			if (post_commit)
2988 				xe_vm_insert_vma(vm, vma);
2989 		}
2990 		break;
2991 	}
2992 	case DRM_GPUVA_OP_REMAP:
2993 	{
2994 		struct xe_vma *vma = gpuva_to_vma(op->base.remap.unmap->va);
2995 
2996 		if (op->remap.prev) {
2997 			prep_vma_destroy(vm, op->remap.prev, prev_post_commit);
2998 			xe_vma_destroy_unlocked(op->remap.prev);
2999 		}
3000 		if (op->remap.next) {
3001 			prep_vma_destroy(vm, op->remap.next, next_post_commit);
3002 			xe_vma_destroy_unlocked(op->remap.next);
3003 		}
3004 		if (vma) {
3005 			down_read(&vm->userptr.notifier_lock);
3006 			vma->gpuva.flags &= ~XE_VMA_DESTROYED;
3007 			up_read(&vm->userptr.notifier_lock);
3008 			if (post_commit)
3009 				xe_vm_insert_vma(vm, vma);
3010 		}
3011 		break;
3012 	}
3013 	case DRM_GPUVA_OP_PREFETCH:
3014 		/* Nothing to do */
3015 		break;
3016 	default:
3017 		drm_warn(&vm->xe->drm, "NOT POSSIBLE");
3018 	}
3019 }
3020 
3021 static void vm_bind_ioctl_ops_unwind(struct xe_vm *vm,
3022 				     struct drm_gpuva_ops **ops,
3023 				     int num_ops_list)
3024 {
3025 	int i;
3026 
3027 	for (i = num_ops_list - 1; i >= 0; --i) {
3028 		struct drm_gpuva_ops *__ops = ops[i];
3029 		struct drm_gpuva_op *__op;
3030 
3031 		if (!__ops)
3032 			continue;
3033 
3034 		drm_gpuva_for_each_op_reverse(__op, __ops) {
3035 			struct xe_vma_op *op = gpuva_op_to_vma_op(__op);
3036 
3037 			xe_vma_op_unwind(vm, op,
3038 					 op->flags & XE_VMA_OP_COMMITTED,
3039 					 op->flags & XE_VMA_OP_PREV_COMMITTED,
3040 					 op->flags & XE_VMA_OP_NEXT_COMMITTED);
3041 		}
3042 	}
3043 }
3044 
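/*
 * Lock the VMA's external BO, if any, and optionally validate it into a
 * suitable placement before the bind.
 */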
3045 static int vma_lock_and_validate(struct drm_exec *exec, struct xe_vma *vma,
3046 				 bool validate)
3047 {
3048 	struct xe_bo *bo = xe_vma_bo(vma);
3049 	struct xe_vm *vm = xe_vma_vm(vma);
3050 	int err = 0;
3051 
3052 	if (bo) {
3053 		if (!bo->vm)
3054 			err = drm_exec_lock_obj(exec, &bo->ttm.base);
3055 		if (!err && validate)
3056 			err = xe_bo_validate(bo, vm,
3057 					     !xe_vm_in_preempt_fence_mode(vm));
3058 	}
3059 
3060 	return err;
3061 }
3062 
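/*
 * A VMA whose user fence from a previous bind has not yet signalled cannot
 * be modified; report -EBUSY. Otherwise drop the fence reference.
 */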
3063 static int check_ufence(struct xe_vma *vma)
3064 {
3065 	if (vma->ufence) {
3066 		struct xe_user_fence * const f = vma->ufence;
3067 
3068 		if (!xe_sync_ufence_get_status(f))
3069 			return -EBUSY;
3070 
3071 		vma->ufence = NULL;
3072 		xe_sync_ufence_put(f);
3073 	}
3074 
3075 	return 0;
3076 }
3077 
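/*
 * For each SVM range attached to a prefetch op, migrate it to the target
 * tile's VRAM (or back to system memory when no tile is given) and
 * populate its pages before the bind is executed.
 */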
3078 static int prefetch_ranges(struct xe_vm *vm, struct xe_vma_op *op)
3079 {
3080 	bool devmem_possible = IS_DGFX(vm->xe) && IS_ENABLED(CONFIG_DRM_XE_PAGEMAP);
3081 	struct xe_vma *vma = gpuva_to_vma(op->base.prefetch.va);
3082 	struct xe_tile *tile = op->prefetch_range.tile;
3083 	int err = 0;
3084 
3085 	struct xe_svm_range *svm_range;
3086 	struct drm_gpusvm_ctx ctx = {};
3087 	unsigned long i;
3088 
3089 	if (!xe_vma_is_cpu_addr_mirror(vma))
3090 		return 0;
3091 
3092 	ctx.read_only = xe_vma_read_only(vma);
3093 	ctx.devmem_possible = devmem_possible;
3094 	ctx.check_pages_threshold = devmem_possible ? SZ_64K : 0;
3095 
3096 	/* TODO: Threading the migration */
3097 	xa_for_each(&op->prefetch_range.range, i, svm_range) {
3098 		if (!tile)
3099 			xe_svm_range_migrate_to_smem(vm, svm_range);
3100 
3101 		if (xe_svm_range_needs_migrate_to_vram(svm_range, vma, !!tile)) {
3102 			err = xe_svm_alloc_vram(tile, svm_range, &ctx);
3103 			if (err) {
3104 				drm_dbg(&vm->xe->drm, "VRAM allocation failed, retry from userspace, asid=%u, gpusvm=%p, errno=%pe\n",
3105 					vm->usm.asid, &vm->svm.gpusvm, ERR_PTR(err));
3106 				return -ENODATA;
3107 			}
3108 			xe_svm_range_debug(svm_range, "PREFETCH - RANGE MIGRATED TO VRAM");
3109 		}
3110 
3111 		err = xe_svm_range_get_pages(vm, svm_range, &ctx);
3112 		if (err) {
3113 			drm_dbg(&vm->xe->drm, "Get pages failed, asid=%u, gpusvm=%p, errno=%pe\n",
3114 				vm->usm.asid, &vm->svm.gpusvm, ERR_PTR(err));
3115 			if (err == -EOPNOTSUPP || err == -EFAULT || err == -EPERM)
3116 				err = -ENODATA;
3117 			return err;
3118 		}
3119 		xe_svm_range_debug(svm_range, "PREFETCH - RANGE GET PAGES DONE");
3120 	}
3121 
3122 	return err;
3123 }
3124 
3125 static int op_lock_and_prep(struct drm_exec *exec, struct xe_vm *vm,
3126 			    struct xe_vma_op *op)
3127 {
3128 	int err = 0;
3129 
3130 	switch (op->base.op) {
3131 	case DRM_GPUVA_OP_MAP:
3132 		if (!op->map.invalidate_on_bind)
3133 			err = vma_lock_and_validate(exec, op->map.vma,
3134 						    !xe_vm_in_fault_mode(vm) ||
3135 						    op->map.immediate);
3136 		break;
3137 	case DRM_GPUVA_OP_REMAP:
3138 		err = check_ufence(gpuva_to_vma(op->base.remap.unmap->va));
3139 		if (err)
3140 			break;
3141 
3142 		err = vma_lock_and_validate(exec,
3143 					    gpuva_to_vma(op->base.remap.unmap->va),
3144 					    false);
3145 		if (!err && op->remap.prev)
3146 			err = vma_lock_and_validate(exec, op->remap.prev, true);
3147 		if (!err && op->remap.next)
3148 			err = vma_lock_and_validate(exec, op->remap.next, true);
3149 		break;
3150 	case DRM_GPUVA_OP_UNMAP:
3151 		err = check_ufence(gpuva_to_vma(op->base.unmap.va));
3152 		if (err)
3153 			break;
3154 
3155 		err = vma_lock_and_validate(exec,
3156 					    gpuva_to_vma(op->base.unmap.va),
3157 					    false);
3158 		break;
3159 	case DRM_GPUVA_OP_PREFETCH:
3160 	{
3161 		struct xe_vma *vma = gpuva_to_vma(op->base.prefetch.va);
3162 		u32 region;
3163 
3164 		if (!xe_vma_is_cpu_addr_mirror(vma)) {
3165 			region = op->prefetch.region;
3166 			xe_assert(vm->xe, region == DRM_XE_CONSULT_MEM_ADVISE_PREF_LOC ||
3167 				  region <= ARRAY_SIZE(region_to_mem_type));
3168 		}
3169 
3170 		err = vma_lock_and_validate(exec,
3171 					    gpuva_to_vma(op->base.prefetch.va),
3172 					    false);
3173 		if (!err && !xe_vma_has_no_bo(vma))
3174 			err = xe_bo_migrate(xe_vma_bo(vma),
3175 					    region_to_mem_type[region]);
3176 		break;
3177 	}
3178 	default:
3179 		drm_warn(&vm->xe->drm, "NOT POSSIBLE");
3180 	}
3181 
3182 	return err;
3183 }
3184 
3185 static int vm_bind_ioctl_ops_prefetch_ranges(struct xe_vm *vm, struct xe_vma_ops *vops)
3186 {
3187 	struct xe_vma_op *op;
3188 	int err;
3189 
3190 	if (!(vops->flags & XE_VMA_OPS_FLAG_HAS_SVM_PREFETCH))
3191 		return 0;
3192 
3193 	list_for_each_entry(op, &vops->list, link) {
3194 		if (op->base.op  == DRM_GPUVA_OP_PREFETCH) {
3195 			err = prefetch_ranges(vm, op);
3196 			if (err)
3197 				return err;
3198 		}
3199 	}
3200 
3201 	return 0;
3202 }
3203 
3204 static int vm_bind_ioctl_ops_lock_and_prep(struct drm_exec *exec,
3205 					   struct xe_vm *vm,
3206 					   struct xe_vma_ops *vops)
3207 {
3208 	struct xe_vma_op *op;
3209 	int err;
3210 
3211 	err = drm_exec_lock_obj(exec, xe_vm_obj(vm));
3212 	if (err)
3213 		return err;
3214 
3215 	list_for_each_entry(op, &vops->list, link) {
3216 		err = op_lock_and_prep(exec, vm, op);
3217 		if (err)
3218 			return err;
3219 	}
3220 
3221 #ifdef TEST_VM_OPS_ERROR
3222 	if (vops->inject_error &&
3223 	    vm->xe->vm_inject_error_position == FORCE_OP_ERROR_LOCK)
3224 		return -ENOSPC;
3225 #endif
3226 
3227 	return 0;
3228 }
3229 
3230 static void op_trace(struct xe_vma_op *op)
3231 {
3232 	switch (op->base.op) {
3233 	case DRM_GPUVA_OP_MAP:
3234 		trace_xe_vma_bind(op->map.vma);
3235 		break;
3236 	case DRM_GPUVA_OP_REMAP:
3237 		trace_xe_vma_unbind(gpuva_to_vma(op->base.remap.unmap->va));
3238 		if (op->remap.prev)
3239 			trace_xe_vma_bind(op->remap.prev);
3240 		if (op->remap.next)
3241 			trace_xe_vma_bind(op->remap.next);
3242 		break;
3243 	case DRM_GPUVA_OP_UNMAP:
3244 		trace_xe_vma_unbind(gpuva_to_vma(op->base.unmap.va));
3245 		break;
3246 	case DRM_GPUVA_OP_PREFETCH:
3247 		trace_xe_vma_bind(gpuva_to_vma(op->base.prefetch.va));
3248 		break;
3249 	case DRM_GPUVA_OP_DRIVER:
3250 		break;
3251 	default:
3252 		XE_WARN_ON("NOT POSSIBLE");
3253 	}
3254 }
3255 
3256 static void trace_xe_vm_ops_execute(struct xe_vma_ops *vops)
3257 {
3258 	struct xe_vma_op *op;
3259 
3260 	list_for_each_entry(op, &vops->list, link)
3261 		op_trace(op);
3262 }
3263 
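/*
 * Pick an exec queue per tile for the PT updates and return the number of
 * tiles that actually have update work queued.
 */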
3264 static int vm_ops_setup_tile_args(struct xe_vm *vm, struct xe_vma_ops *vops)
3265 {
3266 	struct xe_exec_queue *q = vops->q;
3267 	struct xe_tile *tile;
3268 	int number_tiles = 0;
3269 	u8 id;
3270 
3271 	for_each_tile(tile, vm->xe, id) {
3272 		if (vops->pt_update_ops[id].num_ops)
3273 			++number_tiles;
3274 
3275 		if (vops->pt_update_ops[id].q)
3276 			continue;
3277 
3278 		if (q) {
3279 			vops->pt_update_ops[id].q = q;
3280 			if (vm->pt_root[id] && !list_empty(&q->multi_gt_list))
3281 				q = list_next_entry(q, multi_gt_list);
3282 		} else {
3283 			vops->pt_update_ops[id].q = vm->q[id];
3284 		}
3285 	}
3286 
3287 	return number_tiles;
3288 }
3289 
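/*
 * Prepare, run and fini the PT update ops on every tile that has work,
 * combining the per-tile fences into a dma_fence_array when more than one
 * tile is involved.
 */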
3290 static struct dma_fence *ops_execute(struct xe_vm *vm,
3291 				     struct xe_vma_ops *vops)
3292 {
3293 	struct xe_tile *tile;
3294 	struct dma_fence *fence = NULL;
3295 	struct dma_fence **fences = NULL;
3296 	struct dma_fence_array *cf = NULL;
3297 	int number_tiles = 0, current_fence = 0, err;
3298 	u8 id;
3299 
3300 	number_tiles = vm_ops_setup_tile_args(vm, vops);
3301 	if (number_tiles == 0)
3302 		return ERR_PTR(-ENODATA);
3303 
3304 	if (number_tiles > 1) {
3305 		fences = kmalloc_array(number_tiles, sizeof(*fences),
3306 				       GFP_KERNEL);
3307 		if (!fences) {
3308 			fence = ERR_PTR(-ENOMEM);
3309 			goto err_trace;
3310 		}
3311 	}
3312 
3313 	for_each_tile(tile, vm->xe, id) {
3314 		if (!vops->pt_update_ops[id].num_ops)
3315 			continue;
3316 
3317 		err = xe_pt_update_ops_prepare(tile, vops);
3318 		if (err) {
3319 			fence = ERR_PTR(err);
3320 			goto err_out;
3321 		}
3322 	}
3323 
3324 	trace_xe_vm_ops_execute(vops);
3325 
3326 	for_each_tile(tile, vm->xe, id) {
3327 		if (!vops->pt_update_ops[id].num_ops)
3328 			continue;
3329 
3330 		fence = xe_pt_update_ops_run(tile, vops);
3331 		if (IS_ERR(fence))
3332 			goto err_out;
3333 
3334 		if (fences)
3335 			fences[current_fence++] = fence;
3336 	}
3337 
3338 	if (fences) {
3339 		cf = dma_fence_array_create(number_tiles, fences,
3340 					    vm->composite_fence_ctx,
3341 					    vm->composite_fence_seqno++,
3342 					    false);
3343 		if (!cf) {
3344 			--vm->composite_fence_seqno;
3345 			fence = ERR_PTR(-ENOMEM);
3346 			goto err_out;
3347 		}
3348 		fence = &cf->base;
3349 	}
3350 
3351 	for_each_tile(tile, vm->xe, id) {
3352 		if (!vops->pt_update_ops[id].num_ops)
3353 			continue;
3354 
3355 		xe_pt_update_ops_fini(tile, vops);
3356 	}
3357 
3358 	return fence;
3359 
3360 err_out:
3361 	for_each_tile(tile, vm->xe, id) {
3362 		if (!vops->pt_update_ops[id].num_ops)
3363 			continue;
3364 
3365 		xe_pt_update_ops_abort(tile, vops);
3366 	}
3367 	while (current_fence)
3368 		dma_fence_put(fences[--current_fence]);
3369 	kfree(fences);
3370 	kfree(cf);
3371 
3372 err_trace:
3373 	trace_xe_vm_ops_fail(vm);
3374 	return fence;
3375 }
3376 
3377 static void vma_add_ufence(struct xe_vma *vma, struct xe_user_fence *ufence)
3378 {
3379 	if (vma->ufence)
3380 		xe_sync_ufence_put(vma->ufence);
3381 	vma->ufence = __xe_sync_ufence_get(ufence);
3382 }
3383 
3384 static void op_add_ufence(struct xe_vm *vm, struct xe_vma_op *op,
3385 			  struct xe_user_fence *ufence)
3386 {
3387 	switch (op->base.op) {
3388 	case DRM_GPUVA_OP_MAP:
3389 		vma_add_ufence(op->map.vma, ufence);
3390 		break;
3391 	case DRM_GPUVA_OP_REMAP:
3392 		if (op->remap.prev)
3393 			vma_add_ufence(op->remap.prev, ufence);
3394 		if (op->remap.next)
3395 			vma_add_ufence(op->remap.next, ufence);
3396 		break;
3397 	case DRM_GPUVA_OP_UNMAP:
3398 		break;
3399 	case DRM_GPUVA_OP_PREFETCH:
3400 		vma_add_ufence(gpuva_to_vma(op->base.prefetch.va), ufence);
3401 		break;
3402 	default:
3403 		drm_warn(&vm->xe->drm, "NOT POSSIBLE");
3404 	}
3405 }
3406 
3407 static void vm_bind_ioctl_ops_fini(struct xe_vm *vm, struct xe_vma_ops *vops,
3408 				   struct dma_fence *fence)
3409 {
3410 	struct xe_exec_queue *wait_exec_queue = to_wait_exec_queue(vm, vops->q);
3411 	struct xe_user_fence *ufence;
3412 	struct xe_vma_op *op;
3413 	int i;
3414 
3415 	ufence = find_ufence_get(vops->syncs, vops->num_syncs);
3416 	list_for_each_entry(op, &vops->list, link) {
3417 		if (ufence)
3418 			op_add_ufence(vm, op, ufence);
3419 
3420 		if (op->base.op == DRM_GPUVA_OP_UNMAP)
3421 			xe_vma_destroy(gpuva_to_vma(op->base.unmap.va), fence);
3422 		else if (op->base.op == DRM_GPUVA_OP_REMAP)
3423 			xe_vma_destroy(gpuva_to_vma(op->base.remap.unmap->va),
3424 				       fence);
3425 	}
3426 	if (ufence)
3427 		xe_sync_ufence_put(ufence);
3428 	if (fence) {
3429 		for (i = 0; i < vops->num_syncs; i++)
3430 			xe_sync_entry_signal(vops->syncs + i, fence);
3431 		xe_exec_queue_last_fence_set(wait_exec_queue, vm, fence);
3432 	}
3433 }
3434 
3435 static struct dma_fence *vm_bind_ioctl_ops_execute(struct xe_vm *vm,
3436 						   struct xe_vma_ops *vops)
3437 {
3438 	struct drm_exec exec;
3439 	struct dma_fence *fence;
3440 	int err;
3441 
3442 	lockdep_assert_held_write(&vm->lock);
3443 
3444 	drm_exec_init(&exec, DRM_EXEC_INTERRUPTIBLE_WAIT |
3445 		      DRM_EXEC_IGNORE_DUPLICATES, 0);
3446 	drm_exec_until_all_locked(&exec) {
3447 		err = vm_bind_ioctl_ops_lock_and_prep(&exec, vm, vops);
3448 		drm_exec_retry_on_contention(&exec);
3449 		if (err) {
3450 			fence = ERR_PTR(err);
3451 			goto unlock;
3452 		}
3453 
3454 		fence = ops_execute(vm, vops);
3455 		if (IS_ERR(fence)) {
3456 			if (PTR_ERR(fence) == -ENODATA)
3457 				vm_bind_ioctl_ops_fini(vm, vops, NULL);
3458 			goto unlock;
3459 		}
3460 
3461 		vm_bind_ioctl_ops_fini(vm, vops, fence);
3462 	}
3463 
3464 unlock:
3465 	drm_exec_fini(&exec);
3466 	return fence;
3467 }
3468 ALLOW_ERROR_INJECTION(vm_bind_ioctl_ops_execute, ERRNO);
3469 
3470 #define SUPPORTED_FLAGS_STUB  \
3471 	(DRM_XE_VM_BIND_FLAG_READONLY | \
3472 	 DRM_XE_VM_BIND_FLAG_IMMEDIATE | \
3473 	 DRM_XE_VM_BIND_FLAG_NULL | \
3474 	 DRM_XE_VM_BIND_FLAG_DUMPABLE | \
3475 	 DRM_XE_VM_BIND_FLAG_CHECK_PXP | \
3476 	 DRM_XE_VM_BIND_FLAG_CPU_ADDR_MIRROR)
3477 
3478 #ifdef TEST_VM_OPS_ERROR
3479 #define SUPPORTED_FLAGS	(SUPPORTED_FLAGS_STUB | FORCE_OP_ERROR)
3480 #else
3481 #define SUPPORTED_FLAGS	SUPPORTED_FLAGS_STUB
3482 #endif
3483 
3484 #define XE_64K_PAGE_MASK 0xffffull
3485 #define ALL_DRM_XE_SYNCS_FLAGS (DRM_XE_SYNCS_FLAG_WAIT_FOR_OP)
3486 
3487 static int vm_bind_ioctl_check_args(struct xe_device *xe, struct xe_vm *vm,
3488 				    struct drm_xe_vm_bind *args,
3489 				    struct drm_xe_vm_bind_op **bind_ops)
3490 {
3491 	int err;
3492 	int i;
3493 
3494 	if (XE_IOCTL_DBG(xe, args->pad || args->pad2) ||
3495 	    XE_IOCTL_DBG(xe, args->reserved[0] || args->reserved[1]))
3496 		return -EINVAL;
3497 
3498 	if (XE_IOCTL_DBG(xe, args->extensions))
3499 		return -EINVAL;
3500 
3501 	if (args->num_binds > 1) {
3502 		u64 __user *bind_user =
3503 			u64_to_user_ptr(args->vector_of_binds);
3504 
3505 		*bind_ops = kvmalloc_array(args->num_binds,
3506 					   sizeof(struct drm_xe_vm_bind_op),
3507 					   GFP_KERNEL | __GFP_ACCOUNT |
3508 					   __GFP_RETRY_MAYFAIL | __GFP_NOWARN);
3509 		if (!*bind_ops)
3510 			return args->num_binds > 1 ? -ENOBUFS : -ENOMEM;
3511 
3512 		err = copy_from_user(*bind_ops, bind_user,
3513 				     sizeof(struct drm_xe_vm_bind_op) *
3514 				     args->num_binds);
3515 		if (XE_IOCTL_DBG(xe, err)) {
3516 			err = -EFAULT;
3517 			goto free_bind_ops;
3518 		}
3519 	} else {
3520 		*bind_ops = &args->bind;
3521 	}
3522 
3523 	for (i = 0; i < args->num_binds; ++i) {
3524 		u64 range = (*bind_ops)[i].range;
3525 		u64 addr = (*bind_ops)[i].addr;
3526 		u32 op = (*bind_ops)[i].op;
3527 		u32 flags = (*bind_ops)[i].flags;
3528 		u32 obj = (*bind_ops)[i].obj;
3529 		u64 obj_offset = (*bind_ops)[i].obj_offset;
3530 		u32 prefetch_region = (*bind_ops)[i].prefetch_mem_region_instance;
3531 		bool is_null = flags & DRM_XE_VM_BIND_FLAG_NULL;
3532 		bool is_cpu_addr_mirror = flags &
3533 			DRM_XE_VM_BIND_FLAG_CPU_ADDR_MIRROR;
3534 		u16 pat_index = (*bind_ops)[i].pat_index;
3535 		u16 coh_mode;
3536 
3537 		if (XE_IOCTL_DBG(xe, is_cpu_addr_mirror &&
3538 				 (!xe_vm_in_fault_mode(vm) ||
3539 				 !IS_ENABLED(CONFIG_DRM_XE_GPUSVM)))) {
3540 			err = -EINVAL;
3541 			goto free_bind_ops;
3542 		}
3543 
3544 		if (XE_IOCTL_DBG(xe, pat_index >= xe->pat.n_entries)) {
3545 			err = -EINVAL;
3546 			goto free_bind_ops;
3547 		}
3548 
3549 		pat_index = array_index_nospec(pat_index, xe->pat.n_entries);
3550 		(*bind_ops)[i].pat_index = pat_index;
3551 		coh_mode = xe_pat_index_get_coh_mode(xe, pat_index);
3552 		if (XE_IOCTL_DBG(xe, !coh_mode)) { /* hw reserved */
3553 			err = -EINVAL;
3554 			goto free_bind_ops;
3555 		}
3556 
3557 		if (XE_WARN_ON(coh_mode > XE_COH_AT_LEAST_1WAY)) {
3558 			err = -EINVAL;
3559 			goto free_bind_ops;
3560 		}
3561 
3562 		if (XE_IOCTL_DBG(xe, op > DRM_XE_VM_BIND_OP_PREFETCH) ||
3563 		    XE_IOCTL_DBG(xe, flags & ~SUPPORTED_FLAGS) ||
3564 		    XE_IOCTL_DBG(xe, obj && (is_null || is_cpu_addr_mirror)) ||
3565 		    XE_IOCTL_DBG(xe, obj_offset && (is_null ||
3566 						    is_cpu_addr_mirror)) ||
3567 		    XE_IOCTL_DBG(xe, op != DRM_XE_VM_BIND_OP_MAP &&
3568 				 (is_null || is_cpu_addr_mirror)) ||
3569 		    XE_IOCTL_DBG(xe, !obj &&
3570 				 op == DRM_XE_VM_BIND_OP_MAP &&
3571 				 !is_null && !is_cpu_addr_mirror) ||
3572 		    XE_IOCTL_DBG(xe, !obj &&
3573 				 op == DRM_XE_VM_BIND_OP_UNMAP_ALL) ||
3574 		    XE_IOCTL_DBG(xe, addr &&
3575 				 op == DRM_XE_VM_BIND_OP_UNMAP_ALL) ||
3576 		    XE_IOCTL_DBG(xe, range &&
3577 				 op == DRM_XE_VM_BIND_OP_UNMAP_ALL) ||
3578 		    XE_IOCTL_DBG(xe, obj &&
3579 				 op == DRM_XE_VM_BIND_OP_MAP_USERPTR) ||
3580 		    XE_IOCTL_DBG(xe, coh_mode == XE_COH_NONE &&
3581 				 op == DRM_XE_VM_BIND_OP_MAP_USERPTR) ||
3582 		    XE_IOCTL_DBG(xe, obj &&
3583 				 op == DRM_XE_VM_BIND_OP_PREFETCH) ||
3584 		    XE_IOCTL_DBG(xe, prefetch_region &&
3585 				 op != DRM_XE_VM_BIND_OP_PREFETCH) ||
3586 		    XE_IOCTL_DBG(xe,  (prefetch_region != DRM_XE_CONSULT_MEM_ADVISE_PREF_LOC &&
3587 				       !(BIT(prefetch_region) & xe->info.mem_region_mask))) ||
3588 		    XE_IOCTL_DBG(xe, obj &&
3589 				 op == DRM_XE_VM_BIND_OP_UNMAP)) {
3590 			err = -EINVAL;
3591 			goto free_bind_ops;
3592 		}
3593 
3594 		if (XE_IOCTL_DBG(xe, obj_offset & ~PAGE_MASK) ||
3595 		    XE_IOCTL_DBG(xe, addr & ~PAGE_MASK) ||
3596 		    XE_IOCTL_DBG(xe, range & ~PAGE_MASK) ||
3597 		    XE_IOCTL_DBG(xe, !range &&
3598 				 op != DRM_XE_VM_BIND_OP_UNMAP_ALL)) {
3599 			err = -EINVAL;
3600 			goto free_bind_ops;
3601 		}
3602 	}
3603 
3604 	return 0;
3605 
3606 free_bind_ops:
3607 	if (args->num_binds > 1)
3608 		kvfree(*bind_ops);
3609 	*bind_ops = NULL;
3610 	return err;
3611 }
3612 
3613 static int vm_bind_ioctl_signal_fences(struct xe_vm *vm,
3614 				       struct xe_exec_queue *q,
3615 				       struct xe_sync_entry *syncs,
3616 				       int num_syncs)
3617 {
3618 	struct dma_fence *fence;
3619 	int i, err = 0;
3620 
3621 	fence = xe_sync_in_fence_get(syncs, num_syncs,
3622 				     to_wait_exec_queue(vm, q), vm);
3623 	if (IS_ERR(fence))
3624 		return PTR_ERR(fence);
3625 
3626 	for (i = 0; i < num_syncs; i++)
3627 		xe_sync_entry_signal(&syncs[i], fence);
3628 
3629 	xe_exec_queue_last_fence_set(to_wait_exec_queue(vm, q), vm,
3630 				     fence);
3631 	dma_fence_put(fence);
3632 
3633 	return err;
3634 }
3635 
3636 static void xe_vma_ops_init(struct xe_vma_ops *vops, struct xe_vm *vm,
3637 			    struct xe_exec_queue *q,
3638 			    struct xe_sync_entry *syncs, u32 num_syncs)
3639 {
3640 	memset(vops, 0, sizeof(*vops));
3641 	INIT_LIST_HEAD(&vops->list);
3642 	vops->vm = vm;
3643 	vops->q = q;
3644 	vops->syncs = syncs;
3645 	vops->num_syncs = num_syncs;
3646 	vops->flags = 0;
3647 }
3648 
3649 static int xe_vm_bind_ioctl_validate_bo(struct xe_device *xe, struct xe_bo *bo,
3650 					u64 addr, u64 range, u64 obj_offset,
3651 					u16 pat_index, u32 op, u32 bind_flags)
3652 {
3653 	u16 coh_mode;
3654 
3655 	if (XE_IOCTL_DBG(xe, range > xe_bo_size(bo)) ||
3656 	    XE_IOCTL_DBG(xe, obj_offset >
3657 			 xe_bo_size(bo) - range)) {
3658 		return -EINVAL;
3659 	}
3660 
3661 	/*
3662 	 * Some platforms require 64k VM_BIND alignment,
3663 	 * specifically those with XE_VRAM_FLAGS_NEED64K.
3664 	 *
3665 	 * Other platforms may have BOs set to 64k physical placement,
3666 	 * but can be mapped at 4k offsets anyway. This check is only
3667 	 * there for the former case.
3668 	 */
3669 	if ((bo->flags & XE_BO_FLAG_INTERNAL_64K) &&
3670 	    (xe->info.vram_flags & XE_VRAM_FLAGS_NEED64K)) {
3671 		if (XE_IOCTL_DBG(xe, obj_offset &
3672 				 XE_64K_PAGE_MASK) ||
3673 		    XE_IOCTL_DBG(xe, addr & XE_64K_PAGE_MASK) ||
3674 		    XE_IOCTL_DBG(xe, range & XE_64K_PAGE_MASK)) {
3675 			return -EINVAL;
3676 		}
3677 	}
3678 
3679 	coh_mode = xe_pat_index_get_coh_mode(xe, pat_index);
3680 	if (bo->cpu_caching) {
3681 		if (XE_IOCTL_DBG(xe, coh_mode == XE_COH_NONE &&
3682 				 bo->cpu_caching == DRM_XE_GEM_CPU_CACHING_WB)) {
3683 			return -EINVAL;
3684 		}
3685 	} else if (XE_IOCTL_DBG(xe, coh_mode == XE_COH_NONE)) {
3686 		/*
3687 		 * An imported dma-buf from a different device should
3688 		 * require 1-way or 2-way coherency since we don't know
3689 		 * how it was mapped on the CPU. Just assume it is
3690 		 * potentially cached on the CPU side.
3691 		 */
3692 		return -EINVAL;
3693 	}
3694 
3695 	/* If a BO is protected it can only be mapped if the key is still valid */
3696 	if ((bind_flags & DRM_XE_VM_BIND_FLAG_CHECK_PXP) && xe_bo_is_protected(bo) &&
3697 	    op != DRM_XE_VM_BIND_OP_UNMAP && op != DRM_XE_VM_BIND_OP_UNMAP_ALL)
3698 		if (XE_IOCTL_DBG(xe, xe_pxp_bo_key_check(xe->pxp, bo) != 0))
3699 			return -ENOEXEC;
3700 
3701 	return 0;
3702 }
3703 
3704 int xe_vm_bind_ioctl(struct drm_device *dev, void *data, struct drm_file *file)
3705 {
3706 	struct xe_device *xe = to_xe_device(dev);
3707 	struct xe_file *xef = to_xe_file(file);
3708 	struct drm_xe_vm_bind *args = data;
3709 	struct drm_xe_sync __user *syncs_user;
3710 	struct xe_bo **bos = NULL;
3711 	struct drm_gpuva_ops **ops = NULL;
3712 	struct xe_vm *vm;
3713 	struct xe_exec_queue *q = NULL;
3714 	u32 num_syncs, num_ufence = 0;
3715 	struct xe_sync_entry *syncs = NULL;
3716 	struct drm_xe_vm_bind_op *bind_ops = NULL;
3717 	struct xe_vma_ops vops;
3718 	struct dma_fence *fence;
3719 	int err;
3720 	int i;
3721 
3722 	vm = xe_vm_lookup(xef, args->vm_id);
3723 	if (XE_IOCTL_DBG(xe, !vm))
3724 		return -EINVAL;
3725 
3726 	err = vm_bind_ioctl_check_args(xe, vm, args, &bind_ops);
3727 	if (err)
3728 		goto put_vm;
3729 
3730 	if (args->exec_queue_id) {
3731 		q = xe_exec_queue_lookup(xef, args->exec_queue_id);
3732 		if (XE_IOCTL_DBG(xe, !q)) {
3733 			err = -ENOENT;
3734 			goto free_bind_ops;
3735 		}
3736 
3737 		if (XE_IOCTL_DBG(xe, !(q->flags & EXEC_QUEUE_FLAG_VM))) {
3738 			err = -EINVAL;
3739 			goto put_exec_queue;
3740 		}
3741 	}
3742 
3743 	/* Ensure all UNMAPs visible */
3744 	xe_svm_flush(vm);
3745 
3746 	err = down_write_killable(&vm->lock);
3747 	if (err)
3748 		goto put_exec_queue;
3749 
3750 	if (XE_IOCTL_DBG(xe, xe_vm_is_closed_or_banned(vm))) {
3751 		err = -ENOENT;
3752 		goto release_vm_lock;
3753 	}
3754 
3755 	for (i = 0; i < args->num_binds; ++i) {
3756 		u64 range = bind_ops[i].range;
3757 		u64 addr = bind_ops[i].addr;
3758 
3759 		if (XE_IOCTL_DBG(xe, range > vm->size) ||
3760 		    XE_IOCTL_DBG(xe, addr > vm->size - range)) {
3761 			err = -EINVAL;
3762 			goto release_vm_lock;
3763 		}
3764 	}
3765 
3766 	if (args->num_binds) {
3767 		bos = kvcalloc(args->num_binds, sizeof(*bos),
3768 			       GFP_KERNEL | __GFP_ACCOUNT |
3769 			       __GFP_RETRY_MAYFAIL | __GFP_NOWARN);
3770 		if (!bos) {
3771 			err = -ENOMEM;
3772 			goto release_vm_lock;
3773 		}
3774 
3775 		ops = kvcalloc(args->num_binds, sizeof(*ops),
3776 			       GFP_KERNEL | __GFP_ACCOUNT |
3777 			       __GFP_RETRY_MAYFAIL | __GFP_NOWARN);
3778 		if (!ops) {
3779 			err = -ENOMEM;
3780 			goto free_bos;
3781 		}
3782 	}
3783 
3784 	for (i = 0; i < args->num_binds; ++i) {
3785 		struct drm_gem_object *gem_obj;
3786 		u64 range = bind_ops[i].range;
3787 		u64 addr = bind_ops[i].addr;
3788 		u32 obj = bind_ops[i].obj;
3789 		u64 obj_offset = bind_ops[i].obj_offset;
3790 		u16 pat_index = bind_ops[i].pat_index;
3791 		u32 op = bind_ops[i].op;
3792 		u32 bind_flags = bind_ops[i].flags;
3793 
3794 		if (!obj)
3795 			continue;
3796 
3797 		gem_obj = drm_gem_object_lookup(file, obj);
3798 		if (XE_IOCTL_DBG(xe, !gem_obj)) {
3799 			err = -ENOENT;
3800 			goto put_obj;
3801 		}
3802 		bos[i] = gem_to_xe_bo(gem_obj);
3803 
3804 		err = xe_vm_bind_ioctl_validate_bo(xe, bos[i], addr, range,
3805 						   obj_offset, pat_index, op,
3806 						   bind_flags);
3807 		if (err)
3808 			goto put_obj;
3809 	}
3810 
3811 	if (args->num_syncs) {
3812 		syncs = kcalloc(args->num_syncs, sizeof(*syncs), GFP_KERNEL);
3813 		if (!syncs) {
3814 			err = -ENOMEM;
3815 			goto put_obj;
3816 		}
3817 	}
3818 
3819 	syncs_user = u64_to_user_ptr(args->syncs);
3820 	for (num_syncs = 0; num_syncs < args->num_syncs; num_syncs++) {
3821 		err = xe_sync_entry_parse(xe, xef, &syncs[num_syncs],
3822 					  &syncs_user[num_syncs],
3823 					  (xe_vm_in_lr_mode(vm) ?
3824 					   SYNC_PARSE_FLAG_LR_MODE : 0) |
3825 					  (!args->num_binds ?
3826 					   SYNC_PARSE_FLAG_DISALLOW_USER_FENCE : 0));
3827 		if (err)
3828 			goto free_syncs;
3829 
3830 		if (xe_sync_is_ufence(&syncs[num_syncs]))
3831 			num_ufence++;
3832 	}
3833 
3834 	if (XE_IOCTL_DBG(xe, num_ufence > 1)) {
3835 		err = -EINVAL;
3836 		goto free_syncs;
3837 	}
3838 
3839 	if (!args->num_binds) {
3840 		err = -ENODATA;
3841 		goto free_syncs;
3842 	}
3843 
3844 	xe_vma_ops_init(&vops, vm, q, syncs, num_syncs);
3845 	for (i = 0; i < args->num_binds; ++i) {
3846 		u64 range = bind_ops[i].range;
3847 		u64 addr = bind_ops[i].addr;
3848 		u32 op = bind_ops[i].op;
3849 		u32 flags = bind_ops[i].flags;
3850 		u64 obj_offset = bind_ops[i].obj_offset;
3851 		u32 prefetch_region = bind_ops[i].prefetch_mem_region_instance;
3852 		u16 pat_index = bind_ops[i].pat_index;
3853 
3854 		ops[i] = vm_bind_ioctl_ops_create(vm, &vops, bos[i], obj_offset,
3855 						  addr, range, op, flags,
3856 						  prefetch_region, pat_index);
3857 		if (IS_ERR(ops[i])) {
3858 			err = PTR_ERR(ops[i]);
3859 			ops[i] = NULL;
3860 			goto unwind_ops;
3861 		}
3862 
3863 		err = vm_bind_ioctl_ops_parse(vm, ops[i], &vops);
3864 		if (err)
3865 			goto unwind_ops;
3866 
3867 #ifdef TEST_VM_OPS_ERROR
3868 		if (flags & FORCE_OP_ERROR) {
3869 			vops.inject_error = true;
3870 			vm->xe->vm_inject_error_position =
3871 				(vm->xe->vm_inject_error_position + 1) %
3872 				FORCE_OP_ERROR_COUNT;
3873 		}
3874 #endif
3875 	}
3876 
3877 	/* Nothing to do */
3878 	if (list_empty(&vops.list)) {
3879 		err = -ENODATA;
3880 		goto unwind_ops;
3881 	}
3882 
3883 	err = xe_vma_ops_alloc(&vops, args->num_binds > 1);
3884 	if (err)
3885 		goto unwind_ops;
3886 
3887 	err = vm_bind_ioctl_ops_prefetch_ranges(vm, &vops);
3888 	if (err)
3889 		goto unwind_ops;
3890 
3891 	fence = vm_bind_ioctl_ops_execute(vm, &vops);
3892 	if (IS_ERR(fence))
3893 		err = PTR_ERR(fence);
3894 	else
3895 		dma_fence_put(fence);
3896 
3897 unwind_ops:
3898 	if (err && err != -ENODATA)
3899 		vm_bind_ioctl_ops_unwind(vm, ops, args->num_binds);
3900 	xe_vma_ops_fini(&vops);
3901 	for (i = args->num_binds - 1; i >= 0; --i)
3902 		if (ops[i])
3903 			drm_gpuva_ops_free(&vm->gpuvm, ops[i]);
3904 free_syncs:
3905 	if (err == -ENODATA)
3906 		err = vm_bind_ioctl_signal_fences(vm, q, syncs, num_syncs);
3907 	while (num_syncs--)
3908 		xe_sync_entry_cleanup(&syncs[num_syncs]);
3909 
3910 	kfree(syncs);
3911 put_obj:
3912 	for (i = 0; i < args->num_binds; ++i)
3913 		xe_bo_put(bos[i]);
3914 
3915 	kvfree(ops);
3916 free_bos:
3917 	kvfree(bos);
3918 release_vm_lock:
3919 	up_write(&vm->lock);
3920 put_exec_queue:
3921 	if (q)
3922 		xe_exec_queue_put(q);
3923 free_bind_ops:
3924 	if (args->num_binds > 1)
3925 		kvfree(bind_ops);
3926 put_vm:
3927 	xe_vm_put(vm);
3928 	return err;
3929 }
3930 
3931 /**
3932  * xe_vm_bind_kernel_bo - bind a kernel BO to a VM
3933  * @vm: VM to bind the BO to
3934  * @bo: BO to bind
3935  * @q: exec queue to use for the bind (optional)
3936  * @addr: address at which to bind the BO
3937  * @cache_lvl: PAT cache level to use
3938  *
3939  * Execute a VM bind map operation on a kernel-owned BO to bind it into a
3940  * kernel-owned VM.
3941  *
3942  * Returns a dma_fence tracking completion of the bind if the bind job was
3943  * successfully submitted, an error pointer otherwise.
3944  */
3945 struct dma_fence *xe_vm_bind_kernel_bo(struct xe_vm *vm, struct xe_bo *bo,
3946 				       struct xe_exec_queue *q, u64 addr,
3947 				       enum xe_cache_level cache_lvl)
3948 {
3949 	struct xe_vma_ops vops;
3950 	struct drm_gpuva_ops *ops = NULL;
3951 	struct dma_fence *fence;
3952 	int err;
3953 
3954 	xe_bo_get(bo);
3955 	xe_vm_get(vm);
3956 	if (q)
3957 		xe_exec_queue_get(q);
3958 
3959 	down_write(&vm->lock);
3960 
3961 	xe_vma_ops_init(&vops, vm, q, NULL, 0);
3962 
3963 	ops = vm_bind_ioctl_ops_create(vm, &vops, bo, 0, addr, xe_bo_size(bo),
3964 				       DRM_XE_VM_BIND_OP_MAP, 0, 0,
3965 				       vm->xe->pat.idx[cache_lvl]);
3966 	if (IS_ERR(ops)) {
3967 		err = PTR_ERR(ops);
3968 		goto release_vm_lock;
3969 	}
3970 
3971 	err = vm_bind_ioctl_ops_parse(vm, ops, &vops);
3972 	if (err)
3973 		goto release_vm_lock;
3974 
3975 	xe_assert(vm->xe, !list_empty(&vops.list));
3976 
3977 	err = xe_vma_ops_alloc(&vops, false);
3978 	if (err)
3979 		goto unwind_ops;
3980 
3981 	fence = vm_bind_ioctl_ops_execute(vm, &vops);
3982 	if (IS_ERR(fence))
3983 		err = PTR_ERR(fence);
3984 
3985 unwind_ops:
3986 	if (err && err != -ENODATA)
3987 		vm_bind_ioctl_ops_unwind(vm, &ops, 1);
3988 
3989 	xe_vma_ops_fini(&vops);
3990 	drm_gpuva_ops_free(&vm->gpuvm, ops);
3991 
3992 release_vm_lock:
3993 	up_write(&vm->lock);
3994 
3995 	if (q)
3996 		xe_exec_queue_put(q);
3997 	xe_vm_put(vm);
3998 	xe_bo_put(bo);
3999 
4000 	if (err)
4001 		fence = ERR_PTR(err);
4002 
4003 	return fence;
4004 }
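
/*
 * Illustrative sketch only, not part of the driver: a hypothetical in-kernel
 * caller could bind a BO and wait for the bind to complete roughly as below.
 * The function name is made up, and the NULL exec queue (allowed, since @q is
 * optional) and the XE_CACHE_WB cache level are example choices.
 */
static int __maybe_unused example_bind_kernel_bo_and_wait(struct xe_vm *vm,
							   struct xe_bo *bo,
							   u64 addr)
{
	struct dma_fence *fence;

	/* Submit the bind job; the returned fence tracks its completion. */
	fence = xe_vm_bind_kernel_bo(vm, bo, NULL, addr, XE_CACHE_WB);
	if (IS_ERR(fence))
		return PTR_ERR(fence);

	/* Block (non-interruptibly) until the bind has completed. */
	dma_fence_wait(fence, false);
	dma_fence_put(fence);

	return 0;
}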
4005 
4006 /**
4007  * xe_vm_lock() - Lock the vm's dma_resv object
4008  * @vm: The struct xe_vm whose lock is to be locked
4009  * @intr: Whether to perform any waits interruptibly
4010  *
4011  * Return: 0 on success, -EINTR if @intr is true and the wait for a
4012  * contended lock was interrupted. If @intr is false, the function
4013  * always returns 0.
4014  */
4015 int xe_vm_lock(struct xe_vm *vm, bool intr)
4016 {
4017 	if (intr)
4018 		return dma_resv_lock_interruptible(xe_vm_resv(vm), NULL);
4019 
4020 	return dma_resv_lock(xe_vm_resv(vm), NULL);
4021 }
4022 
4023 /**
4024  * xe_vm_unlock() - Unlock the vm's dma_resv object
4025  * @vm: The struct xe_vm whose lock is to be released.
4026  *
4027  * Unlock a buffer object lock that was locked by xe_vm_lock().
4028  */
4029 void xe_vm_unlock(struct xe_vm *vm)
4030 {
4031 	dma_resv_unlock(xe_vm_resv(vm));
4032 }
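
/*
 * Illustrative sketch only, not part of the driver: typical pairing of the
 * two helpers above from a sleepable context. The function name is made up;
 * an ioctl path would normally pass intr == true so a pending signal can
 * abort the wait for a contended lock.
 */
static int __maybe_unused example_with_vm_locked(struct xe_vm *vm, bool intr)
{
	int err;

	err = xe_vm_lock(vm, intr);	/* -EINTR is only possible if intr */
	if (err)
		return err;

	/* ... access state protected by the VM's dma_resv here ... */

	xe_vm_unlock(vm);

	return 0;
}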
4033 
4034 /**
4035  * xe_vm_range_tilemask_tlb_inval - Issue a TLB invalidation for an address
4036  * range on a mask of tiles
4037  * @vm: The VM
4038  * @start: start address
4039  * @end: end address
4040  * @tile_mask: mask of tiles whose GTs should receive the TLB invalidation
4041  *
4042  * Issue a range-based TLB invalidation on the GTs of every tile in @tile_mask.
4043  *
4044  * Returns 0 for success, negative error code otherwise.
4045  */
4046 int xe_vm_range_tilemask_tlb_inval(struct xe_vm *vm, u64 start,
4047 				   u64 end, u8 tile_mask)
4048 {
4049 	struct xe_tlb_inval_fence
4050 		fence[XE_MAX_TILES_PER_DEVICE * XE_MAX_GT_PER_TILE];
4051 	struct xe_tile *tile;
4052 	u32 fence_id = 0;
4053 	u8 id;
4054 	int err;
4055 
4056 	if (!tile_mask)
4057 		return 0;
4058 
4059 	for_each_tile(tile, vm->xe, id) {
4060 		if (!(tile_mask & BIT(id)))
4061 			continue;
4062 
4063 		xe_tlb_inval_fence_init(&tile->primary_gt->tlb_inval,
4064 					&fence[fence_id], true);
4065 
4066 		err = xe_tlb_inval_range(&tile->primary_gt->tlb_inval,
4067 					 &fence[fence_id], start, end,
4068 					 vm->usm.asid);
4069 		if (err)
4070 			goto wait;
4071 		++fence_id;
4072 
4073 		if (!tile->media_gt)
4074 			continue;
4075 
4076 		xe_tlb_inval_fence_init(&tile->media_gt->tlb_inval,
4077 					&fence[fence_id], true);
4078 
4079 		err = xe_tlb_inval_range(&tile->media_gt->tlb_inval,
4080 					 &fence[fence_id], start, end,
4081 					 vm->usm.asid);
4082 		if (err)
4083 			goto wait;
4084 		++fence_id;
4085 	}
4086 
4087 wait:
4088 	for (id = 0; id < fence_id; ++id)
4089 		xe_tlb_inval_fence_wait(&fence[id]);
4090 
4091 	return err;
4092 }
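
/*
 * Illustrative sketch only, not part of the driver: one way a caller could
 * build a tile mask covering every tile on the device and invalidate an
 * address range on all of them. The function name is made up; real callers
 * usually derive the mask from which tiles actually have the range mapped.
 */
static int __maybe_unused example_tlb_inval_all_tiles(struct xe_vm *vm,
						      u64 start, u64 end)
{
	struct xe_tile *tile;
	u8 tile_mask = 0;
	u8 id;

	/* Set a bit for every tile present on the device. */
	for_each_tile(tile, vm->xe, id)
		tile_mask |= BIT(id);

	/* Issues the invalidations and waits for all of them to complete. */
	return xe_vm_range_tilemask_tlb_inval(vm, start, end, tile_mask);
}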
4093 
4094 /**
4095  * xe_vm_invalidate_vma - invalidate GPU mappings for VMA without a lock
4096  * @vma: VMA to invalidate
4097  *
4098  * Walks the list of page-table leaves, zeroing the entries owned by this
4099  * VMA, then issues a TLB invalidation and blocks until the invalidation has
4100  * completed.
4101  *
4102  * Returns 0 for success, negative error code otherwise.
4103  */
4104 int xe_vm_invalidate_vma(struct xe_vma *vma)
4105 {
4106 	struct xe_device *xe = xe_vma_vm(vma)->xe;
4107 	struct xe_vm *vm = xe_vma_vm(vma);
4108 	struct xe_tile *tile;
4109 	u8 tile_mask = 0;
4110 	int ret = 0;
4111 	u8 id;
4112 
4113 	xe_assert(xe, !xe_vma_is_null(vma));
4114 	xe_assert(xe, !xe_vma_is_cpu_addr_mirror(vma));
4115 	trace_xe_vma_invalidate(vma);
4116 
4117 	vm_dbg(&vm->xe->drm,
4118 	       "INVALIDATE: addr=0x%016llx, range=0x%016llx",
4119 		xe_vma_start(vma), xe_vma_size(vma));
4120 
4121 	/*
4122 	 * Check that we don't race with page-table updates; the tile_invalidated
4123 	 * update itself is safe
4124 	 */
4125 	if (IS_ENABLED(CONFIG_PROVE_LOCKING)) {
4126 		if (xe_vma_is_userptr(vma)) {
4127 			lockdep_assert(lockdep_is_held_type(&vm->userptr.notifier_lock, 0) ||
4128 				       (lockdep_is_held_type(&vm->userptr.notifier_lock, 1) &&
4129 					lockdep_is_held(&xe_vm_resv(vm)->lock.base)));
4130 
4131 			WARN_ON_ONCE(!mmu_interval_check_retry
4132 				     (&to_userptr_vma(vma)->userptr.notifier,
4133 				      to_userptr_vma(vma)->userptr.notifier_seq));
4134 			WARN_ON_ONCE(!dma_resv_test_signaled(xe_vm_resv(vm),
4135 							     DMA_RESV_USAGE_BOOKKEEP));
4136 
4137 		} else {
4138 			xe_bo_assert_held(xe_vma_bo(vma));
4139 		}
4140 	}
4141 
4142 	for_each_tile(tile, xe, id)
4143 		if (xe_pt_zap_ptes(tile, vma))
4144 			tile_mask |= BIT(id);
4145 
4146 	xe_device_wmb(xe);
4147 
4148 	ret = xe_vm_range_tilemask_tlb_inval(xe_vma_vm(vma), xe_vma_start(vma),
4149 					     xe_vma_end(vma), tile_mask);
4150 
4151 	/* WRITE_ONCE pairs with READ_ONCE in xe_vm_has_valid_gpu_mapping() */
4152 	WRITE_ONCE(vma->tile_invalidated, vma->tile_mask);
4153 
4154 	return ret;
4155 }
4156 
4157 int xe_vm_validate_protected(struct xe_vm *vm)
4158 {
4159 	struct drm_gpuva *gpuva;
4160 	int err = 0;
4161 
4162 	if (!vm)
4163 		return -ENODEV;
4164 
4165 	mutex_lock(&vm->snap_mutex);
4166 
4167 	drm_gpuvm_for_each_va(gpuva, &vm->gpuvm) {
4168 		struct xe_vma *vma = gpuva_to_vma(gpuva);
4169 		struct xe_bo *bo = vma->gpuva.gem.obj ?
4170 			gem_to_xe_bo(vma->gpuva.gem.obj) : NULL;
4171 
4172 		if (!bo)
4173 			continue;
4174 
4175 		if (xe_bo_is_protected(bo)) {
4176 			err = xe_pxp_bo_key_check(vm->xe->pxp, bo);
4177 			if (err)
4178 				break;
4179 		}
4180 	}
4181 
4182 	mutex_unlock(&vm->snap_mutex);
4183 	return err;
4184 }
4185 
4186 struct xe_vm_snapshot {
4187 	unsigned long num_snaps;
4188 	struct {
4189 		u64 ofs, bo_ofs;
4190 		unsigned long len;
4191 		struct xe_bo *bo;
4192 		void *data;
4193 		struct mm_struct *mm;
4194 	} snap[];
4195 };
4196 
4197 struct xe_vm_snapshot *xe_vm_snapshot_capture(struct xe_vm *vm)
4198 {
4199 	unsigned long num_snaps = 0, i;
4200 	struct xe_vm_snapshot *snap = NULL;
4201 	struct drm_gpuva *gpuva;
4202 
4203 	if (!vm)
4204 		return NULL;
4205 
4206 	mutex_lock(&vm->snap_mutex);
4207 	drm_gpuvm_for_each_va(gpuva, &vm->gpuvm) {
4208 		if (gpuva->flags & XE_VMA_DUMPABLE)
4209 			num_snaps++;
4210 	}
4211 
4212 	if (num_snaps)
4213 		snap = kvzalloc(offsetof(struct xe_vm_snapshot, snap[num_snaps]), GFP_NOWAIT);
4214 	if (!snap) {
4215 		snap = num_snaps ? ERR_PTR(-ENOMEM) : ERR_PTR(-ENODEV);
4216 		goto out_unlock;
4217 	}
4218 
4219 	snap->num_snaps = num_snaps;
4220 	i = 0;
4221 	drm_gpuvm_for_each_va(gpuva, &vm->gpuvm) {
4222 		struct xe_vma *vma = gpuva_to_vma(gpuva);
4223 		struct xe_bo *bo = vma->gpuva.gem.obj ?
4224 			gem_to_xe_bo(vma->gpuva.gem.obj) : NULL;
4225 
4226 		if (!(gpuva->flags & XE_VMA_DUMPABLE))
4227 			continue;
4228 
4229 		snap->snap[i].ofs = xe_vma_start(vma);
4230 		snap->snap[i].len = xe_vma_size(vma);
4231 		if (bo) {
4232 			snap->snap[i].bo = xe_bo_get(bo);
4233 			snap->snap[i].bo_ofs = xe_vma_bo_offset(vma);
4234 		} else if (xe_vma_is_userptr(vma)) {
4235 			struct mm_struct *mm =
4236 				to_userptr_vma(vma)->userptr.notifier.mm;
4237 
4238 			if (mmget_not_zero(mm))
4239 				snap->snap[i].mm = mm;
4240 			else
4241 				snap->snap[i].data = ERR_PTR(-EFAULT);
4242 
4243 			snap->snap[i].bo_ofs = xe_vma_userptr(vma);
4244 		} else {
4245 			snap->snap[i].data = ERR_PTR(-ENOENT);
4246 		}
4247 		i++;
4248 	}
4249 
4250 out_unlock:
4251 	mutex_unlock(&vm->snap_mutex);
4252 	return snap;
4253 }
4254 
4255 void xe_vm_snapshot_capture_delayed(struct xe_vm_snapshot *snap)
4256 {
4257 	if (IS_ERR_OR_NULL(snap))
4258 		return;
4259 
4260 	for (int i = 0; i < snap->num_snaps; i++) {
4261 		struct xe_bo *bo = snap->snap[i].bo;
4262 		int err;
4263 
4264 		if (IS_ERR(snap->snap[i].data))
4265 			continue;
4266 
4267 		snap->snap[i].data = kvmalloc(snap->snap[i].len, GFP_USER);
4268 		if (!snap->snap[i].data) {
4269 			snap->snap[i].data = ERR_PTR(-ENOMEM);
4270 			goto cleanup_bo;
4271 		}
4272 
4273 		if (bo) {
4274 			err = xe_bo_read(bo, snap->snap[i].bo_ofs,
4275 					 snap->snap[i].data, snap->snap[i].len);
4276 		} else {
4277 			void __user *userptr = (void __user *)(size_t)snap->snap[i].bo_ofs;
4278 
4279 			kthread_use_mm(snap->snap[i].mm);
4280 			if (!copy_from_user(snap->snap[i].data, userptr, snap->snap[i].len))
4281 				err = 0;
4282 			else
4283 				err = -EFAULT;
4284 			kthread_unuse_mm(snap->snap[i].mm);
4285 
4286 			mmput(snap->snap[i].mm);
4287 			snap->snap[i].mm = NULL;
4288 		}
4289 
4290 		if (err) {
4291 			kvfree(snap->snap[i].data);
4292 			snap->snap[i].data = ERR_PTR(err);
4293 		}
4294 
4295 cleanup_bo:
4296 		xe_bo_put(bo);
4297 		snap->snap[i].bo = NULL;
4298 	}
4299 }
4300 
4301 void xe_vm_snapshot_print(struct xe_vm_snapshot *snap, struct drm_printer *p)
4302 {
4303 	unsigned long i, j;
4304 
4305 	if (IS_ERR_OR_NULL(snap)) {
4306 		drm_printf(p, "[0].error: %li\n", PTR_ERR(snap));
4307 		return;
4308 	}
4309 
4310 	for (i = 0; i < snap->num_snaps; i++) {
4311 		drm_printf(p, "[%llx].length: 0x%lx\n", snap->snap[i].ofs, snap->snap[i].len);
4312 
4313 		if (IS_ERR(snap->snap[i].data)) {
4314 			drm_printf(p, "[%llx].error: %li\n", snap->snap[i].ofs,
4315 				   PTR_ERR(snap->snap[i].data));
4316 			continue;
4317 		}
4318 
4319 		drm_printf(p, "[%llx].data: ", snap->snap[i].ofs);
4320 
4321 		for (j = 0; j < snap->snap[i].len; j += sizeof(u32)) {
4322 			u32 *val = snap->snap[i].data + j;
4323 			char dumped[ASCII85_BUFSZ];
4324 
4325 			drm_puts(p, ascii85_encode(*val, dumped));
4326 		}
4327 
4328 		drm_puts(p, "\n");
4329 
4330 		if (drm_coredump_printer_is_full(p))
4331 			return;
4332 	}
4333 }
4334 
4335 void xe_vm_snapshot_free(struct xe_vm_snapshot *snap)
4336 {
4337 	unsigned long i;
4338 
4339 	if (IS_ERR_OR_NULL(snap))
4340 		return;
4341 
4342 	for (i = 0; i < snap->num_snaps; i++) {
4343 		if (!IS_ERR(snap->snap[i].data))
4344 			kvfree(snap->snap[i].data);
4345 		xe_bo_put(snap->snap[i].bo);
4346 		if (snap->snap[i].mm)
4347 			mmput(snap->snap[i].mm);
4348 	}
4349 	kvfree(snap);
4350 }
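
/*
 * Illustrative sketch only, not part of the driver: the intended life cycle
 * of the snapshot API above. The function name is made up. Capture only
 * records the VMA layout (and allocates with GFP_NOWAIT), while the actual
 * contents are copied by xe_vm_snapshot_capture_delayed(), which allocates
 * with GFP_USER and uses kthread_use_mm() for userptr VMAs, so it is expected
 * to run later from a kernel thread/worker. All four helpers tolerate an
 * IS_ERR_OR_NULL() snapshot, so no extra error handling is needed here.
 */
static void __maybe_unused example_snapshot_vm(struct xe_vm *vm,
					       struct drm_printer *p)
{
	struct xe_vm_snapshot *snap;

	snap = xe_vm_snapshot_capture(vm);	/* record VMA layout */
	xe_vm_snapshot_capture_delayed(snap);	/* copy out the contents */
	xe_vm_snapshot_print(snap, p);		/* ascii85-encode into @p */
	xe_vm_snapshot_free(snap);
}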
4351 
4352 /**
4353  * xe_vma_need_vram_for_atomic - Check if VMA needs VRAM migration for atomic operations
4354  * @xe: Pointer to the XE device structure
4355  * @vma: Pointer to the virtual memory area (VMA) structure
4356  * @is_atomic: True if called from the pagefault path for an atomic access
4357  *
4358  * This function determines whether the given VMA needs to be migrated to
4359  * VRAM in order to do atomic GPU operation.
4360  * VRAM in order to perform an atomic GPU operation.
4361  * Return:
4362  *   1        - Migration to VRAM is required
4363  *   0        - Migration is not required
4364  *   -EACCES  - Atomic access is not allowed by the VMA's memory attributes
4365  *
4366  */
4367 int xe_vma_need_vram_for_atomic(struct xe_device *xe, struct xe_vma *vma, bool is_atomic)
4368 {
4369 	u32 atomic_access = xe_vma_bo(vma) ? xe_vma_bo(vma)->attr.atomic_access :
4370 					     vma->attr.atomic_access;
4371 
4372 	if (!IS_DGFX(xe) || !is_atomic)
4373 		return false;
4374 
4375 	/*
4376 	 * NOTE: The checks implemented here are platform-specific. For
4377 	 * instance, on a device supporting CXL atomics, these would ideally
4378 	 * work universally without additional handling.
4379 	 */
4380 	switch (atomic_access) {
4381 	case DRM_XE_ATOMIC_DEVICE:
4382 		return !xe->info.has_device_atomics_on_smem;
4383 
4384 	case DRM_XE_ATOMIC_CPU:
4385 		return -EACCES;
4386 
4387 	case DRM_XE_ATOMIC_UNDEFINED:
4388 	case DRM_XE_ATOMIC_GLOBAL:
4389 	default:
4390 		return 1;
4391 	}
4392 }
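
/*
 * Illustrative sketch only, not part of the driver: how a fault-path caller
 * might interpret the tri-state return value documented above. The function
 * name and the needs_vram out-parameter are made up; how any migration is
 * then performed is left to the caller.
 */
static int __maybe_unused example_check_atomic_placement(struct xe_device *xe,
							  struct xe_vma *vma,
							  bool is_atomic,
							  bool *needs_vram)
{
	int ret = xe_vma_need_vram_for_atomic(xe, vma, is_atomic);

	if (ret < 0)
		return ret;	/* -EACCES: reject the atomic access */

	*needs_vram = ret;	/* 1: migrate to VRAM first, 0: leave as is */

	return 0;
}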
4393 
4394 static int xe_vm_alloc_vma(struct xe_vm *vm,
4395 			   struct drm_gpuvm_map_req *map_req,
4396 			   bool is_madvise)
4397 {
4398 	struct xe_vma_ops vops;
4399 	struct drm_gpuva_ops *ops = NULL;
4400 	struct drm_gpuva_op *__op;
4401 	bool is_cpu_addr_mirror = false;
4402 	bool remap_op = false;
4403 	struct xe_vma_mem_attr tmp_attr;
4404 	u16 default_pat;
4405 	int err;
4406 
4407 	lockdep_assert_held_write(&vm->lock);
4408 
4409 	if (is_madvise)
4410 		ops = drm_gpuvm_madvise_ops_create(&vm->gpuvm, map_req);
4411 	else
4412 		ops = drm_gpuvm_sm_map_ops_create(&vm->gpuvm, map_req);
4413 
4414 	if (IS_ERR(ops))
4415 		return PTR_ERR(ops);
4416 
4417 	if (list_empty(&ops->list)) {
4418 		err = 0;
4419 		goto free_ops;
4420 	}
4421 
4422 	drm_gpuva_for_each_op(__op, ops) {
4423 		struct xe_vma_op *op = gpuva_op_to_vma_op(__op);
4424 		struct xe_vma *vma = NULL;
4425 
4426 		if (!is_madvise) {
4427 			if (__op->op == DRM_GPUVA_OP_UNMAP) {
4428 				vma = gpuva_to_vma(op->base.unmap.va);
4429 				XE_WARN_ON(!xe_vma_has_default_mem_attrs(vma));
4430 				default_pat = vma->attr.default_pat_index;
4431 			}
4432 
4433 			if (__op->op == DRM_GPUVA_OP_REMAP) {
4434 				vma = gpuva_to_vma(op->base.remap.unmap->va);
4435 				default_pat = vma->attr.default_pat_index;
4436 			}
4437 
4438 			if (__op->op == DRM_GPUVA_OP_MAP) {
4439 				op->map.is_cpu_addr_mirror = true;
4440 				op->map.pat_index = default_pat;
4441 			}
4442 		} else {
4443 			if (__op->op == DRM_GPUVA_OP_REMAP) {
4444 				vma = gpuva_to_vma(op->base.remap.unmap->va);
4445 				xe_assert(vm->xe, !remap_op);
4446 				xe_assert(vm->xe, xe_vma_has_no_bo(vma));
4447 				remap_op = true;
4448 
4449 				if (xe_vma_is_cpu_addr_mirror(vma))
4450 					is_cpu_addr_mirror = true;
4451 				else
4452 					is_cpu_addr_mirror = false;
4453 			}
4454 
4455 			if (__op->op == DRM_GPUVA_OP_MAP) {
4456 				xe_assert(vm->xe, remap_op);
4457 				remap_op = false;
4458 				/*
4459 				 * For madvise ops, DRM_GPUVA_OP_MAP always
4460 				 * follows DRM_GPUVA_OP_REMAP, so set
4461 				 * op->map.is_cpu_addr_mirror to true when the
4462 				 * REMAP was for a CPU address mirror VMA.
4463 				 */
4464 				op->map.is_cpu_addr_mirror = is_cpu_addr_mirror;
4465 			}
4466 		}
4467 		print_op(vm->xe, __op);
4468 	}
4469 
4470 	xe_vma_ops_init(&vops, vm, NULL, NULL, 0);
4471 
4472 	if (is_madvise)
4473 		vops.flags |= XE_VMA_OPS_FLAG_MADVISE;
4474 
4475 	err = vm_bind_ioctl_ops_parse(vm, ops, &vops);
4476 	if (err)
4477 		goto unwind_ops;
4478 
4479 	xe_vm_lock(vm, false);
4480 
4481 	drm_gpuva_for_each_op(__op, ops) {
4482 		struct xe_vma_op *op = gpuva_op_to_vma_op(__op);
4483 		struct xe_vma *vma;
4484 
4485 		if (__op->op == DRM_GPUVA_OP_UNMAP) {
4486 			vma = gpuva_to_vma(op->base.unmap.va);
4487 			/* There should be no unmap for madvise */
4488 			if (is_madvise)
4489 				XE_WARN_ON("UNEXPECTED UNMAP");
4490 
4491 			xe_vma_destroy(vma, NULL);
4492 		} else if (__op->op == DRM_GPUVA_OP_REMAP) {
4493 			vma = gpuva_to_vma(op->base.remap.unmap->va);
4494 			/* For madvise ops, store the attributes of the REMAP-unmapped
4495 			 * VMA so they can be assigned to the newly created MAP VMA.
4496 			 */
4497 			if (is_madvise)
4498 				tmp_attr = vma->attr;
4499 
4500 			xe_vma_destroy(gpuva_to_vma(op->base.remap.unmap->va), NULL);
4501 		} else if (__op->op == DRM_GPUVA_OP_MAP) {
4502 			vma = op->map.vma;
4503 			/* For a madvise call, MAP is always preceded by REMAP, so
4504 			 * tmp_attr always holds sane values by this point, making it
4505 			 * safe to copy them to the new VMA.
4506 			 */
4507 			if (is_madvise)
4508 				vma->attr = tmp_attr;
4509 		}
4510 	}
4511 
4512 	xe_vm_unlock(vm);
4513 	drm_gpuva_ops_free(&vm->gpuvm, ops);
4514 	return 0;
4515 
4516 unwind_ops:
4517 	vm_bind_ioctl_ops_unwind(vm, &ops, 1);
4518 free_ops:
4519 	drm_gpuva_ops_free(&vm->gpuvm, ops);
4520 	return err;
4521 }
4522 
4523 /**
4524  * xe_vm_alloc_madvise_vma - Allocate VMAs with madvise ops
4525  * @vm: Pointer to the xe_vm structure
4526  * @start: Starting input address
4527  * @range: Size of the input range
4528  *
4529  * This function splits existing VMAs to create new VMAs covering the user-provided input range.
4530  *
4531  * Return: 0 on success, negative error code on failure
4532  */
4533 int xe_vm_alloc_madvise_vma(struct xe_vm *vm, uint64_t start, uint64_t range)
4534 {
4535 	struct drm_gpuvm_map_req map_req = {
4536 		.map.va.addr = start,
4537 		.map.va.range = range,
4538 	};
4539 
4540 	lockdep_assert_held_write(&vm->lock);
4541 
4542 	vm_dbg(&vm->xe->drm, "MADVISE_OPS_CREATE: addr=0x%016llx, size=0x%016llx", start, range);
4543 
4544 	return xe_vm_alloc_vma(vm, &map_req, true);
4545 }
4546 
4547 /**
4548  * xe_vm_alloc_cpu_addr_mirror_vma - Allocate a CPU address mirror VMA
4549  * @vm: Pointer to the xe_vm structure
4550  * @start: Starting input address
4551  * @range: Size of the input range
4552  *
4553  * This function splits/merges existing VMAs to create new VMAs covering the user-provided input range.
4554  *
4555  * Return: 0 on success, negative error code on failure
4556  */
4557 int xe_vm_alloc_cpu_addr_mirror_vma(struct xe_vm *vm, uint64_t start, uint64_t range)
4558 {
4559 	struct drm_gpuvm_map_req map_req = {
4560 		.map.va.addr = start,
4561 		.map.va.range = range,
4562 	};
4563 
4564 	lockdep_assert_held_write(&vm->lock);
4565 
4566 	vm_dbg(&vm->xe->drm, "CPU_ADDR_MIRROR_VMA_OPS_CREATE: addr=0x%016llx, size=0x%016llx",
4567 	       start, range);
4568 
4569 	return xe_vm_alloc_vma(vm, &map_req, false);
4570 }
4571
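
/*
 * Illustrative sketch only, not part of the driver: both helpers above
 * assert that the VM lock is held for writing, so a caller outside the
 * existing bind/madvise paths would need to take it first. The function
 * name is made up.
 */
static int __maybe_unused example_alloc_mirror_range(struct xe_vm *vm,
						     u64 start, u64 range)
{
	int err;

	down_write(&vm->lock);
	err = xe_vm_alloc_cpu_addr_mirror_vma(vm, start, range);
	up_write(&vm->lock);

	return err;
}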